featcopilot 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  """Text feature engineering engine.
2
2
 
3
3
  Generates features from text data using embeddings and NLP techniques.
4
+ Supports local offline processing with transformers and spacy.
4
5
  """
5
6
 
6
7
  from typing import Any, Optional, Union
@@ -22,33 +23,62 @@ class TextEngineConfig(EngineConfig):
22
23
  name: str = "TextEngine"
23
24
  features: list[str] = Field(
24
25
  default_factory=lambda: ["length", "word_count", "char_stats"],
25
- description="Feature types to extract",
26
+ description="Feature types to extract: length, word_count, char_stats, tfidf, sentiment, ner, pos, embeddings",
26
27
  )
27
28
  max_vocab_size: int = Field(default=5000, description="Max vocabulary size for TF-IDF")
28
29
  n_components: int = Field(default=50, description="Components for dimensionality reduction")
30
+ embedding_model: str = Field(
31
+ default="sentence-transformers/all-MiniLM-L6-v2",
32
+ description="Sentence transformer model for embeddings",
33
+ )
34
+ embedding_dim: int = Field(default=32, description="Reduced embedding dimensions (PCA)")
35
+ spacy_model: str = Field(default="en_core_web_sm", description="Spacy model for NER/POS")
36
+ sentiment_model: str = Field(
37
+ default="cardiffnlp/twitter-roberta-base-sentiment-latest",
38
+ description="HuggingFace sentiment model",
39
+ )
29
40
 
30
41
 
31
42
  class TextEngine(BaseEngine):
32
43
  """
33
- Text feature engineering engine.
44
+ Text feature engineering engine with advanced NLP capabilities.
34
45
 
35
46
  Extracts features from text columns including:
36
47
  - Length and character statistics
37
48
  - Word count features
38
49
  - TF-IDF features (optional)
39
- - Sentiment features (optional)
40
- - Embedding features (with LLM integration)
50
+ - Sentiment analysis using transformers (local, offline)
51
+ - Named Entity Recognition (NER) using spacy
52
+ - Part-of-speech (POS) tag distributions
53
+ - Sentence embeddings using sentence-transformers
41
54
 
42
55
  Parameters
43
56
  ----------
44
57
  features : list
45
- Feature types to extract
58
+ Feature types to extract. Options:
59
+ - 'length': character and word counts
60
+ - 'word_count': word-level statistics
61
+ - 'char_stats': character-level statistics
62
+ - 'tfidf': TF-IDF with SVD reduction
63
+ - 'sentiment': transformer-based sentiment scores
64
+ - 'ner': named entity counts by type
65
+ - 'pos': part-of-speech tag distributions
66
+ - 'embeddings': sentence embeddings (reduced via PCA)
46
67
  max_vocab_size : int, default=5000
47
68
  Maximum vocabulary size for TF-IDF
69
+ embedding_model : str
70
+ Sentence transformer model name
71
+ spacy_model : str
72
+ Spacy model for NER/POS tagging
48
73
 
49
74
  Examples
50
75
  --------
51
- >>> engine = TextEngine(features=['length', 'word_count', 'tfidf'])
76
+ >>> # Basic features (fast, no dependencies)
77
+ >>> engine = TextEngine(features=['length', 'word_count', 'char_stats'])
78
+ >>> X_features = engine.fit_transform(text_df)
79
+
80
+ >>> # Advanced features with transformers/spacy
81
+ >>> engine = TextEngine(features=['sentiment', 'ner', 'pos', 'embeddings'])
52
82
  >>> X_features = engine.fit_transform(text_df)
53
83
  """
54
84
 
@@ -56,6 +86,11 @@ class TextEngine(BaseEngine):
56
86
  self,
57
87
  features: Optional[list[str]] = None,
58
88
  max_vocab_size: int = 5000,
89
+ n_components: int = 50,
90
+ embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
91
+ embedding_dim: int = 32,
92
+ spacy_model: str = "en_core_web_sm",
93
+ sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest",
59
94
  max_features: Optional[int] = None,
60
95
  verbose: bool = False,
61
96
  **kwargs,
@@ -63,6 +98,11 @@ class TextEngine(BaseEngine):
63
98
  config = TextEngineConfig(
64
99
  features=features or ["length", "word_count", "char_stats"],
65
100
  max_vocab_size=max_vocab_size,
101
+ n_components=n_components,
102
+ embedding_model=embedding_model,
103
+ embedding_dim=embedding_dim,
104
+ spacy_model=spacy_model,
105
+ sentiment_model=sentiment_model,
66
106
  max_features=max_features,
67
107
  verbose=verbose,
68
108
  **kwargs,
@@ -73,6 +113,12 @@ class TextEngine(BaseEngine):
73
113
  self._vectorizers: dict[str, Any] = {}
74
114
  self._feature_set = FeatureSet()
75
115
 
116
+ # Lazy-loaded models
117
+ self._nlp = None # spacy
118
+ self._sentiment_pipeline = None # transformers
119
+ self._embedding_model = None # sentence-transformers
120
+ self._pca_models: dict[str, Any] = {} # PCA for embeddings
121
+
76
122
  def fit(
77
123
  self,
78
124
  X: Union[pd.DataFrame, np.ndarray],
@@ -115,9 +161,86 @@ class TextEngine(BaseEngine):
115
161
  if "tfidf" in self.config.features:
116
162
  self._fit_tfidf(X)
117
163
 
164
+ # Fit embedding PCA if needed
165
+ if "embeddings" in self.config.features:
166
+ self._fit_embeddings(X)
167
+
168
+ # Load spacy model if needed
169
+ if "ner" in self.config.features or "pos" in self.config.features:
170
+ self._load_spacy()
171
+
172
+ # Load sentiment model if needed
173
+ if "sentiment" in self.config.features:
174
+ self._load_sentiment()
175
+
118
176
  self._is_fitted = True
119
177
  return self
120
178
 
179
+ def _load_spacy(self) -> None:
180
+ """Load spacy model for NER/POS tagging."""
181
+ if self._nlp is not None:
182
+ return
183
+
184
+ try:
185
+ import spacy
186
+
187
+ try:
188
+ self._nlp = spacy.load(self.config.spacy_model)
189
+ if self.config.verbose:
190
+ logger.info(f"TextEngine: Loaded spacy model '{self.config.spacy_model}'")
191
+ except OSError:
192
+ # Try to download the model
193
+ if self.config.verbose:
194
+ logger.info(f"TextEngine: Downloading spacy model '{self.config.spacy_model}'...")
195
+ spacy.cli.download(self.config.spacy_model)
196
+ self._nlp = spacy.load(self.config.spacy_model)
197
+
198
+ except ImportError:
199
+ logger.warning("TextEngine: spacy not installed. Install with: pip install spacy")
200
+ self._nlp = None
201
+
202
+ def _load_sentiment(self) -> None:
203
+ """Load sentiment analysis pipeline."""
204
+ if self._sentiment_pipeline is not None:
205
+ return
206
+
207
+ try:
208
+ from transformers import pipeline
209
+
210
+ self._sentiment_pipeline = pipeline(
211
+ "sentiment-analysis",
212
+ model=self.config.sentiment_model,
213
+ truncation=True,
214
+ max_length=512,
215
+ )
216
+ if self.config.verbose:
217
+ logger.info(f"TextEngine: Loaded sentiment model '{self.config.sentiment_model}'")
218
+
219
+ except ImportError:
220
+ logger.warning("TextEngine: transformers not installed. Install with: pip install transformers")
221
+ self._sentiment_pipeline = None
222
+ except Exception as e:
223
+ logger.warning(f"TextEngine: Could not load sentiment model: {e}")
224
+ self._sentiment_pipeline = None
225
+
226
+ def _load_embedding_model(self) -> None:
227
+ """Load sentence transformer model."""
228
+ if self._embedding_model is not None:
229
+ return
230
+
231
+ try:
232
+ from sentence_transformers import SentenceTransformer
233
+
234
+ self._embedding_model = SentenceTransformer(self.config.embedding_model)
235
+ if self.config.verbose:
236
+ logger.info(f"TextEngine: Loaded embedding model '{self.config.embedding_model}'")
237
+
238
+ except ImportError:
239
+ logger.warning(
240
+ "TextEngine: sentence-transformers not installed. Install with: pip install sentence-transformers"
241
+ )
242
+ self._embedding_model = None
243
+
121
244
  def _fit_tfidf(self, X: pd.DataFrame) -> None:
122
245
  """Fit TF-IDF vectorizers for text columns."""
123
246
  try:
@@ -140,6 +263,34 @@ class TextEngine(BaseEngine):
140
263
  if self.config.verbose:
141
264
  logger.warning("TextEngine: sklearn not available for TF-IDF, skipping")
142
265
 
266
+ def _fit_embeddings(self, X: pd.DataFrame) -> None:
267
+ """Fit PCA for embedding dimensionality reduction."""
268
+ self._load_embedding_model()
269
+ if self._embedding_model is None:
270
+ return
271
+
272
+ try:
273
+ from sklearn.decomposition import PCA
274
+
275
+ for col in self._text_columns:
276
+ texts = X[col].fillna("").astype(str).tolist()
277
+ # Sample for fitting PCA (limit to 1000 for speed)
278
+ sample_texts = texts[: min(1000, len(texts))]
279
+ embeddings = self._embedding_model.encode(sample_texts, show_progress_bar=False)
280
+
281
+ # Fit PCA
282
+ n_components = min(self.config.embedding_dim, embeddings.shape[1], len(sample_texts))
283
+ if n_components > 0:
284
+ pca = PCA(n_components=n_components)
285
+ pca.fit(embeddings)
286
+ self._pca_models[col] = pca
287
+
288
+ if self.config.verbose:
289
+ logger.info(f"TextEngine: Fitted embedding PCA for '{col}' ({n_components} components)")
290
+
291
+ except Exception as e:
292
+ logger.warning(f"TextEngine: Could not fit embeddings: {e}")
293
+
143
294
  def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
144
295
  """
145
296
  Extract text features.
@@ -152,7 +303,7 @@ class TextEngine(BaseEngine):
152
303
  Returns
153
304
  -------
154
305
  X_features : DataFrame
155
- Extracted features
306
+ Extracted features (numerical only, text columns dropped)
156
307
  """
157
308
  if not self._is_fitted:
158
309
  raise RuntimeError("Engine must be fitted before transform")
@@ -191,7 +342,35 @@ class TextEngine(BaseEngine):
191
342
  tfidf_features = self._transform_tfidf(texts, col)
192
343
  result = pd.concat([result, tfidf_features], axis=1)
193
344
 
194
- self._feature_names = [c for c in result.columns if c not in X.columns]
345
+ # Sentiment features (transformers)
346
+ if "sentiment" in self.config.features:
347
+ sentiment_features = self._extract_sentiment(texts, col)
348
+ for feat_name, feat_values in sentiment_features.items():
349
+ result[feat_name] = feat_values
350
+
351
+ # NER features (spacy)
352
+ if "ner" in self.config.features:
353
+ ner_features = self._extract_ner(texts, col)
354
+ for feat_name, feat_values in ner_features.items():
355
+ result[feat_name] = feat_values
356
+
357
+ # POS features (spacy)
358
+ if "pos" in self.config.features:
359
+ pos_features = self._extract_pos(texts, col)
360
+ for feat_name, feat_values in pos_features.items():
361
+ result[feat_name] = feat_values
362
+
363
+ # Embedding features (sentence-transformers)
364
+ if "embeddings" in self.config.features:
365
+ emb_features = self._extract_embeddings(texts, col)
366
+ if emb_features is not None:
367
+ result = pd.concat([result, emb_features], axis=1)
368
+
369
+ # Drop original text columns
370
+ cols_to_drop = [col for col in self._text_columns if col in result.columns]
371
+ result = result.drop(columns=cols_to_drop)
372
+
373
+ self._feature_names = [c for c in result.columns if c not in X.columns or c in cols_to_drop]
195
374
 
196
375
  if self.config.verbose:
197
376
  logger.info(f"TextEngine: Extracted {len(self._feature_names)} features")
@@ -209,6 +388,165 @@ class TextEngine(BaseEngine):
209
388
  feature_names = [f"{col}_tfidf_{i}" for i in range(reduced.shape[1])]
210
389
  return pd.DataFrame(reduced, columns=feature_names, index=texts.index)
211
390
 
391
+ def _extract_sentiment(self, texts: pd.Series, col: str) -> dict[str, list]:
392
+ """Extract sentiment scores using transformers."""
393
+ if self._sentiment_pipeline is None:
394
+ self._load_sentiment()
395
+ if self._sentiment_pipeline is None:
396
+ return {}
397
+
398
+ features = {
399
+ f"{col}_sentiment_positive": [],
400
+ f"{col}_sentiment_negative": [],
401
+ f"{col}_sentiment_neutral": [],
402
+ f"{col}_sentiment_score": [],
403
+ }
404
+
405
+ # Process in batches for efficiency
406
+ batch_size = 32
407
+ text_list = texts.tolist()
408
+
409
+ for i in range(0, len(text_list), batch_size):
410
+ batch = text_list[i : i + batch_size]
411
+ # Truncate very long texts
412
+ batch = [t[:512] if len(t) > 512 else t for t in batch]
413
+
414
+ try:
415
+ results = self._sentiment_pipeline(batch)
416
+ for res in results:
417
+ label = res["label"].lower()
418
+ score = res["score"]
419
+
420
+ # Map to standard sentiment scores
421
+ if "positive" in label or label == "pos":
422
+ features[f"{col}_sentiment_positive"].append(score)
423
+ features[f"{col}_sentiment_negative"].append(0)
424
+ features[f"{col}_sentiment_neutral"].append(0)
425
+ features[f"{col}_sentiment_score"].append(score)
426
+ elif "negative" in label or label == "neg":
427
+ features[f"{col}_sentiment_positive"].append(0)
428
+ features[f"{col}_sentiment_negative"].append(score)
429
+ features[f"{col}_sentiment_neutral"].append(0)
430
+ features[f"{col}_sentiment_score"].append(-score)
431
+ else: # neutral
432
+ features[f"{col}_sentiment_positive"].append(0)
433
+ features[f"{col}_sentiment_negative"].append(0)
434
+ features[f"{col}_sentiment_neutral"].append(score)
435
+ features[f"{col}_sentiment_score"].append(0)
436
+
437
+ except Exception as e:
438
+ # Fill with zeros on error
439
+ for _ in batch:
440
+ features[f"{col}_sentiment_positive"].append(0)
441
+ features[f"{col}_sentiment_negative"].append(0)
442
+ features[f"{col}_sentiment_neutral"].append(0)
443
+ features[f"{col}_sentiment_score"].append(0)
444
+ if self.config.verbose:
445
+ logger.warning(f"TextEngine: Sentiment error: {e}")
446
+
447
+ return features
448
+
449
+ def _extract_ner(self, texts: pd.Series, col: str) -> dict[str, list]:
450
+ """Extract NER counts using spacy."""
451
+ if self._nlp is None:
452
+ return {}
453
+
454
+ # Entity types to count
455
+ entity_types = ["PERSON", "ORG", "GPE", "DATE", "MONEY", "PRODUCT", "EVENT", "LOC"]
456
+ features = {f"{col}_ner_{ent.lower()}": [] for ent in entity_types}
457
+ features[f"{col}_ner_total"] = []
458
+
459
+ for text in texts:
460
+ try:
461
+ doc = self._nlp(text[:10000]) # Limit text length
462
+ ent_counts = {ent: 0 for ent in entity_types}
463
+
464
+ for ent in doc.ents:
465
+ if ent.label_ in ent_counts:
466
+ ent_counts[ent.label_] += 1
467
+
468
+ for ent_type in entity_types:
469
+ features[f"{col}_ner_{ent_type.lower()}"].append(ent_counts[ent_type])
470
+ features[f"{col}_ner_total"].append(len(doc.ents))
471
+
472
+ except Exception:
473
+ for ent_type in entity_types:
474
+ features[f"{col}_ner_{ent_type.lower()}"].append(0)
475
+ features[f"{col}_ner_total"].append(0)
476
+
477
+ return features
478
+
479
+ def _extract_pos(self, texts: pd.Series, col: str) -> dict[str, list]:
480
+ """Extract POS tag distributions using spacy."""
481
+ if self._nlp is None:
482
+ return {}
483
+
484
+ # POS tags to track (ratios)
485
+ pos_tags = ["NOUN", "VERB", "ADJ", "ADV", "PROPN", "PRON", "DET", "ADP", "PUNCT"]
486
+ features = {f"{col}_pos_{tag.lower()}_ratio": [] for tag in pos_tags}
487
+ features[f"{col}_pos_noun_verb_ratio"] = []
488
+ features[f"{col}_pos_content_ratio"] = [] # nouns + verbs + adj
489
+
490
+ for text in texts:
491
+ try:
492
+ doc = self._nlp(text[:10000])
493
+ total_tokens = len(doc)
494
+
495
+ if total_tokens == 0:
496
+ for tag in pos_tags:
497
+ features[f"{col}_pos_{tag.lower()}_ratio"].append(0)
498
+ features[f"{col}_pos_noun_verb_ratio"].append(0)
499
+ features[f"{col}_pos_content_ratio"].append(0)
500
+ continue
501
+
502
+ pos_counts = {tag: 0 for tag in pos_tags}
503
+ for token in doc:
504
+ if token.pos_ in pos_counts:
505
+ pos_counts[token.pos_] += 1
506
+
507
+ for tag in pos_tags:
508
+ features[f"{col}_pos_{tag.lower()}_ratio"].append(pos_counts[tag] / total_tokens)
509
+
510
+ # Noun to verb ratio
511
+ verb_count = pos_counts["VERB"]
512
+ noun_count = pos_counts["NOUN"]
513
+ features[f"{col}_pos_noun_verb_ratio"].append(noun_count / max(verb_count, 1))
514
+
515
+ # Content word ratio (nouns + verbs + adjectives)
516
+ content_count = noun_count + verb_count + pos_counts["ADJ"]
517
+ features[f"{col}_pos_content_ratio"].append(content_count / total_tokens)
518
+
519
+ except Exception:
520
+ for tag in pos_tags:
521
+ features[f"{col}_pos_{tag.lower()}_ratio"].append(0)
522
+ features[f"{col}_pos_noun_verb_ratio"].append(0)
523
+ features[f"{col}_pos_content_ratio"].append(0)
524
+
525
+ return features
526
+
527
+ def _extract_embeddings(self, texts: pd.Series, col: str) -> Optional[pd.DataFrame]:
528
+ """Extract sentence embeddings using sentence-transformers."""
529
+ if self._embedding_model is None:
530
+ self._load_embedding_model()
531
+ if self._embedding_model is None:
532
+ return None
533
+
534
+ try:
535
+ text_list = texts.tolist()
536
+ embeddings = self._embedding_model.encode(text_list, show_progress_bar=False)
537
+
538
+ # Apply PCA if fitted
539
+ if col in self._pca_models:
540
+ embeddings = self._pca_models[col].transform(embeddings)
541
+
542
+ feature_names = [f"{col}_emb_{i}" for i in range(embeddings.shape[1])]
543
+ return pd.DataFrame(embeddings, columns=feature_names, index=texts.index)
544
+
545
+ except Exception as e:
546
+ if self.config.verbose:
547
+ logger.warning(f"TextEngine: Embedding error: {e}")
548
+ return None
549
+
212
550
def get_feature_set(self) -> FeatureSet:
    """Return the ``FeatureSet`` object stored on this engine."""
    feature_set = self._feature_set
    return feature_set