featcopilot 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  """Text feature engineering engine.
2
2
 
3
3
  Generates features from text data using embeddings and NLP techniques.
4
+ Supports local offline processing with transformers and spacy.
4
5
  """
5
6
 
6
7
  from typing import Any, Optional, Union
@@ -11,6 +12,9 @@ from pydantic import Field
11
12
 
12
13
  from featcopilot.core.base import BaseEngine, EngineConfig
13
14
  from featcopilot.core.feature import FeatureSet
15
+ from featcopilot.utils.logger import get_logger
16
+
17
+ logger = get_logger(__name__)
14
18
 
15
19
 
16
20
  class TextEngineConfig(EngineConfig):
@@ -19,33 +23,62 @@ class TextEngineConfig(EngineConfig):
19
23
  name: str = "TextEngine"
20
24
  features: list[str] = Field(
21
25
  default_factory=lambda: ["length", "word_count", "char_stats"],
22
- description="Feature types to extract",
26
+ description="Feature types to extract: length, word_count, char_stats, tfidf, sentiment, ner, pos, embeddings",
23
27
  )
24
28
  max_vocab_size: int = Field(default=5000, description="Max vocabulary size for TF-IDF")
25
29
  n_components: int = Field(default=50, description="Components for dimensionality reduction")
30
+ embedding_model: str = Field(
31
+ default="sentence-transformers/all-MiniLM-L6-v2",
32
+ description="Sentence transformer model for embeddings",
33
+ )
34
+ embedding_dim: int = Field(default=32, description="Reduced embedding dimensions (PCA)")
35
+ spacy_model: str = Field(default="en_core_web_sm", description="Spacy model for NER/POS")
36
+ sentiment_model: str = Field(
37
+ default="cardiffnlp/twitter-roberta-base-sentiment-latest",
38
+ description="HuggingFace sentiment model",
39
+ )
26
40
 
27
41
 
28
42
  class TextEngine(BaseEngine):
29
43
  """
30
- Text feature engineering engine.
44
+ Text feature engineering engine with advanced NLP capabilities.
31
45
 
32
46
  Extracts features from text columns including:
33
47
  - Length and character statistics
34
48
  - Word count features
35
49
  - TF-IDF features (optional)
36
- - Sentiment features (optional)
37
- - Embedding features (with LLM integration)
50
+ - Sentiment analysis using transformers (local, offline)
51
+ - Named Entity Recognition (NER) using spacy
52
+ - Part-of-speech (POS) tag distributions
53
+ - Sentence embeddings using sentence-transformers
38
54
 
39
55
  Parameters
40
56
  ----------
41
57
  features : list
42
- Feature types to extract
58
+ Feature types to extract. Options:
59
+ - 'length': character and word counts
60
+ - 'word_count': word-level statistics
61
+ - 'char_stats': character-level statistics
62
+ - 'tfidf': TF-IDF with SVD reduction
63
+ - 'sentiment': transformer-based sentiment scores
64
+ - 'ner': named entity counts by type
65
+ - 'pos': part-of-speech tag distributions
66
+ - 'embeddings': sentence embeddings (reduced via PCA)
43
67
  max_vocab_size : int, default=5000
44
68
  Maximum vocabulary size for TF-IDF
69
+ embedding_model : str
70
+ Sentence transformer model name
71
+ spacy_model : str
72
+ Spacy model for NER/POS tagging
45
73
 
46
74
  Examples
47
75
  --------
48
- >>> engine = TextEngine(features=['length', 'word_count', 'tfidf'])
76
+ >>> # Basic features (fast, no dependencies)
77
+ >>> engine = TextEngine(features=['length', 'word_count', 'char_stats'])
78
+ >>> X_features = engine.fit_transform(text_df)
79
+
80
+ >>> # Advanced features with transformers/spacy
81
+ >>> engine = TextEngine(features=['sentiment', 'ner', 'pos', 'embeddings'])
49
82
  >>> X_features = engine.fit_transform(text_df)
50
83
  """
51
84
 
@@ -53,6 +86,11 @@ class TextEngine(BaseEngine):
53
86
  self,
54
87
  features: Optional[list[str]] = None,
55
88
  max_vocab_size: int = 5000,
89
+ n_components: int = 50,
90
+ embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
91
+ embedding_dim: int = 32,
92
+ spacy_model: str = "en_core_web_sm",
93
+ sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest",
56
94
  max_features: Optional[int] = None,
57
95
  verbose: bool = False,
58
96
  **kwargs,
@@ -60,6 +98,11 @@ class TextEngine(BaseEngine):
60
98
  config = TextEngineConfig(
61
99
  features=features or ["length", "word_count", "char_stats"],
62
100
  max_vocab_size=max_vocab_size,
101
+ n_components=n_components,
102
+ embedding_model=embedding_model,
103
+ embedding_dim=embedding_dim,
104
+ spacy_model=spacy_model,
105
+ sentiment_model=sentiment_model,
63
106
  max_features=max_features,
64
107
  verbose=verbose,
65
108
  **kwargs,
@@ -70,6 +113,12 @@ class TextEngine(BaseEngine):
70
113
  self._vectorizers: dict[str, Any] = {}
71
114
  self._feature_set = FeatureSet()
72
115
 
116
+ # Lazy-loaded models
117
+ self._nlp = None # spacy
118
+ self._sentiment_pipeline = None # transformers
119
+ self._embedding_model = None # sentence-transformers
120
+ self._pca_models: dict[str, Any] = {} # PCA for embeddings
121
+
73
122
  def fit(
74
123
  self,
75
124
  X: Union[pd.DataFrame, np.ndarray],
@@ -106,15 +155,92 @@ class TextEngine(BaseEngine):
106
155
  ]
107
156
 
108
157
  if self.config.verbose:
109
- print(f"TextEngine: Found {len(self._text_columns)} text columns")
158
+ logger.info(f"TextEngine: Found {len(self._text_columns)} text columns")
110
159
 
111
160
  # Fit TF-IDF vectorizers if needed
112
161
  if "tfidf" in self.config.features:
113
162
  self._fit_tfidf(X)
114
163
 
164
+ # Fit embedding PCA if needed
165
+ if "embeddings" in self.config.features:
166
+ self._fit_embeddings(X)
167
+
168
+ # Load spacy model if needed
169
+ if "ner" in self.config.features or "pos" in self.config.features:
170
+ self._load_spacy()
171
+
172
+ # Load sentiment model if needed
173
+ if "sentiment" in self.config.features:
174
+ self._load_sentiment()
175
+
115
176
  self._is_fitted = True
116
177
  return self
117
178
 
179
+ def _load_spacy(self) -> None:
180
+ """Load spacy model for NER/POS tagging."""
181
+ if self._nlp is not None:
182
+ return
183
+
184
+ try:
185
+ import spacy
186
+
187
+ try:
188
+ self._nlp = spacy.load(self.config.spacy_model)
189
+ if self.config.verbose:
190
+ logger.info(f"TextEngine: Loaded spacy model '{self.config.spacy_model}'")
191
+ except OSError:
192
+ # Try to download the model
193
+ if self.config.verbose:
194
+ logger.info(f"TextEngine: Downloading spacy model '{self.config.spacy_model}'...")
195
+ spacy.cli.download(self.config.spacy_model)
196
+ self._nlp = spacy.load(self.config.spacy_model)
197
+
198
+ except ImportError:
199
+ logger.warning("TextEngine: spacy not installed. Install with: pip install spacy")
200
+ self._nlp = None
201
+
202
+ def _load_sentiment(self) -> None:
203
+ """Load sentiment analysis pipeline."""
204
+ if self._sentiment_pipeline is not None:
205
+ return
206
+
207
+ try:
208
+ from transformers import pipeline
209
+
210
+ self._sentiment_pipeline = pipeline(
211
+ "sentiment-analysis",
212
+ model=self.config.sentiment_model,
213
+ truncation=True,
214
+ max_length=512,
215
+ )
216
+ if self.config.verbose:
217
+ logger.info(f"TextEngine: Loaded sentiment model '{self.config.sentiment_model}'")
218
+
219
+ except ImportError:
220
+ logger.warning("TextEngine: transformers not installed. Install with: pip install transformers")
221
+ self._sentiment_pipeline = None
222
+ except Exception as e:
223
+ logger.warning(f"TextEngine: Could not load sentiment model: {e}")
224
+ self._sentiment_pipeline = None
225
+
226
+ def _load_embedding_model(self) -> None:
227
+ """Load sentence transformer model."""
228
+ if self._embedding_model is not None:
229
+ return
230
+
231
+ try:
232
+ from sentence_transformers import SentenceTransformer
233
+
234
+ self._embedding_model = SentenceTransformer(self.config.embedding_model)
235
+ if self.config.verbose:
236
+ logger.info(f"TextEngine: Loaded embedding model '{self.config.embedding_model}'")
237
+
238
+ except ImportError:
239
+ logger.warning(
240
+ "TextEngine: sentence-transformers not installed. Install with: pip install sentence-transformers"
241
+ )
242
+ self._embedding_model = None
243
+
118
244
  def _fit_tfidf(self, X: pd.DataFrame) -> None:
119
245
  """Fit TF-IDF vectorizers for text columns."""
120
246
  try:
@@ -135,7 +261,35 @@ class TextEngine(BaseEngine):
135
261
 
136
262
  except ImportError:
137
263
  if self.config.verbose:
138
- print("TextEngine: sklearn not available for TF-IDF, skipping")
264
+ logger.warning("TextEngine: sklearn not available for TF-IDF, skipping")
265
+
266
+ def _fit_embeddings(self, X: pd.DataFrame) -> None:
267
+ """Fit PCA for embedding dimensionality reduction."""
268
+ self._load_embedding_model()
269
+ if self._embedding_model is None:
270
+ return
271
+
272
+ try:
273
+ from sklearn.decomposition import PCA
274
+
275
+ for col in self._text_columns:
276
+ texts = X[col].fillna("").astype(str).tolist()
277
+ # Sample for fitting PCA (limit to 1000 for speed)
278
+ sample_texts = texts[: min(1000, len(texts))]
279
+ embeddings = self._embedding_model.encode(sample_texts, show_progress_bar=False)
280
+
281
+ # Fit PCA
282
+ n_components = min(self.config.embedding_dim, embeddings.shape[1], len(sample_texts))
283
+ if n_components > 0:
284
+ pca = PCA(n_components=n_components)
285
+ pca.fit(embeddings)
286
+ self._pca_models[col] = pca
287
+
288
+ if self.config.verbose:
289
+ logger.info(f"TextEngine: Fitted embedding PCA for '{col}' ({n_components} components)")
290
+
291
+ except Exception as e:
292
+ logger.warning(f"TextEngine: Could not fit embeddings: {e}")
139
293
 
140
294
  def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
141
295
  """
@@ -149,7 +303,7 @@ class TextEngine(BaseEngine):
149
303
  Returns
150
304
  -------
151
305
  X_features : DataFrame
152
- Extracted features
306
+ Extracted features (numerical only, text columns dropped)
153
307
  """
154
308
  if not self._is_fitted:
155
309
  raise RuntimeError("Engine must be fitted before transform")
@@ -188,10 +342,38 @@ class TextEngine(BaseEngine):
188
342
  tfidf_features = self._transform_tfidf(texts, col)
189
343
  result = pd.concat([result, tfidf_features], axis=1)
190
344
 
191
- self._feature_names = [c for c in result.columns if c not in X.columns]
345
+ # Sentiment features (transformers)
346
+ if "sentiment" in self.config.features:
347
+ sentiment_features = self._extract_sentiment(texts, col)
348
+ for feat_name, feat_values in sentiment_features.items():
349
+ result[feat_name] = feat_values
350
+
351
+ # NER features (spacy)
352
+ if "ner" in self.config.features:
353
+ ner_features = self._extract_ner(texts, col)
354
+ for feat_name, feat_values in ner_features.items():
355
+ result[feat_name] = feat_values
356
+
357
+ # POS features (spacy)
358
+ if "pos" in self.config.features:
359
+ pos_features = self._extract_pos(texts, col)
360
+ for feat_name, feat_values in pos_features.items():
361
+ result[feat_name] = feat_values
362
+
363
+ # Embedding features (sentence-transformers)
364
+ if "embeddings" in self.config.features:
365
+ emb_features = self._extract_embeddings(texts, col)
366
+ if emb_features is not None:
367
+ result = pd.concat([result, emb_features], axis=1)
368
+
369
+ # Drop original text columns
370
+ cols_to_drop = [col for col in self._text_columns if col in result.columns]
371
+ result = result.drop(columns=cols_to_drop)
372
+
373
+ self._feature_names = [c for c in result.columns if c not in X.columns or c in cols_to_drop]
192
374
 
193
375
  if self.config.verbose:
194
- print(f"TextEngine: Extracted {len(self._feature_names)} features")
376
+ logger.info(f"TextEngine: Extracted {len(self._feature_names)} features")
195
377
 
196
378
  return result
197
379
 
@@ -206,6 +388,165 @@ class TextEngine(BaseEngine):
206
388
  feature_names = [f"{col}_tfidf_{i}" for i in range(reduced.shape[1])]
207
389
  return pd.DataFrame(reduced, columns=feature_names, index=texts.index)
208
390
 
391
+ def _extract_sentiment(self, texts: pd.Series, col: str) -> dict[str, list]:
392
+ """Extract sentiment scores using transformers."""
393
+ if self._sentiment_pipeline is None:
394
+ self._load_sentiment()
395
+ if self._sentiment_pipeline is None:
396
+ return {}
397
+
398
+ features = {
399
+ f"{col}_sentiment_positive": [],
400
+ f"{col}_sentiment_negative": [],
401
+ f"{col}_sentiment_neutral": [],
402
+ f"{col}_sentiment_score": [],
403
+ }
404
+
405
+ # Process in batches for efficiency
406
+ batch_size = 32
407
+ text_list = texts.tolist()
408
+
409
+ for i in range(0, len(text_list), batch_size):
410
+ batch = text_list[i : i + batch_size]
411
+ # Truncate very long texts
412
+ batch = [t[:512] if len(t) > 512 else t for t in batch]
413
+
414
+ try:
415
+ results = self._sentiment_pipeline(batch)
416
+ for res in results:
417
+ label = res["label"].lower()
418
+ score = res["score"]
419
+
420
+ # Map to standard sentiment scores
421
+ if "positive" in label or label == "pos":
422
+ features[f"{col}_sentiment_positive"].append(score)
423
+ features[f"{col}_sentiment_negative"].append(0)
424
+ features[f"{col}_sentiment_neutral"].append(0)
425
+ features[f"{col}_sentiment_score"].append(score)
426
+ elif "negative" in label or label == "neg":
427
+ features[f"{col}_sentiment_positive"].append(0)
428
+ features[f"{col}_sentiment_negative"].append(score)
429
+ features[f"{col}_sentiment_neutral"].append(0)
430
+ features[f"{col}_sentiment_score"].append(-score)
431
+ else: # neutral
432
+ features[f"{col}_sentiment_positive"].append(0)
433
+ features[f"{col}_sentiment_negative"].append(0)
434
+ features[f"{col}_sentiment_neutral"].append(score)
435
+ features[f"{col}_sentiment_score"].append(0)
436
+
437
+ except Exception as e:
438
+ # Fill with zeros on error
439
+ for _ in batch:
440
+ features[f"{col}_sentiment_positive"].append(0)
441
+ features[f"{col}_sentiment_negative"].append(0)
442
+ features[f"{col}_sentiment_neutral"].append(0)
443
+ features[f"{col}_sentiment_score"].append(0)
444
+ if self.config.verbose:
445
+ logger.warning(f"TextEngine: Sentiment error: {e}")
446
+
447
+ return features
448
+
449
+ def _extract_ner(self, texts: pd.Series, col: str) -> dict[str, list]:
450
+ """Extract NER counts using spacy."""
451
+ if self._nlp is None:
452
+ return {}
453
+
454
+ # Entity types to count
455
+ entity_types = ["PERSON", "ORG", "GPE", "DATE", "MONEY", "PRODUCT", "EVENT", "LOC"]
456
+ features = {f"{col}_ner_{ent.lower()}": [] for ent in entity_types}
457
+ features[f"{col}_ner_total"] = []
458
+
459
+ for text in texts:
460
+ try:
461
+ doc = self._nlp(text[:10000]) # Limit text length
462
+ ent_counts = {ent: 0 for ent in entity_types}
463
+
464
+ for ent in doc.ents:
465
+ if ent.label_ in ent_counts:
466
+ ent_counts[ent.label_] += 1
467
+
468
+ for ent_type in entity_types:
469
+ features[f"{col}_ner_{ent_type.lower()}"].append(ent_counts[ent_type])
470
+ features[f"{col}_ner_total"].append(len(doc.ents))
471
+
472
+ except Exception:
473
+ for ent_type in entity_types:
474
+ features[f"{col}_ner_{ent_type.lower()}"].append(0)
475
+ features[f"{col}_ner_total"].append(0)
476
+
477
+ return features
478
+
479
+ def _extract_pos(self, texts: pd.Series, col: str) -> dict[str, list]:
480
+ """Extract POS tag distributions using spacy."""
481
+ if self._nlp is None:
482
+ return {}
483
+
484
+ # POS tags to track (ratios)
485
+ pos_tags = ["NOUN", "VERB", "ADJ", "ADV", "PROPN", "PRON", "DET", "ADP", "PUNCT"]
486
+ features = {f"{col}_pos_{tag.lower()}_ratio": [] for tag in pos_tags}
487
+ features[f"{col}_pos_noun_verb_ratio"] = []
488
+ features[f"{col}_pos_content_ratio"] = [] # nouns + verbs + adj
489
+
490
+ for text in texts:
491
+ try:
492
+ doc = self._nlp(text[:10000])
493
+ total_tokens = len(doc)
494
+
495
+ if total_tokens == 0:
496
+ for tag in pos_tags:
497
+ features[f"{col}_pos_{tag.lower()}_ratio"].append(0)
498
+ features[f"{col}_pos_noun_verb_ratio"].append(0)
499
+ features[f"{col}_pos_content_ratio"].append(0)
500
+ continue
501
+
502
+ pos_counts = {tag: 0 for tag in pos_tags}
503
+ for token in doc:
504
+ if token.pos_ in pos_counts:
505
+ pos_counts[token.pos_] += 1
506
+
507
+ for tag in pos_tags:
508
+ features[f"{col}_pos_{tag.lower()}_ratio"].append(pos_counts[tag] / total_tokens)
509
+
510
+ # Noun to verb ratio
511
+ verb_count = pos_counts["VERB"]
512
+ noun_count = pos_counts["NOUN"]
513
+ features[f"{col}_pos_noun_verb_ratio"].append(noun_count / max(verb_count, 1))
514
+
515
+ # Content word ratio (nouns + verbs + adjectives)
516
+ content_count = noun_count + verb_count + pos_counts["ADJ"]
517
+ features[f"{col}_pos_content_ratio"].append(content_count / total_tokens)
518
+
519
+ except Exception:
520
+ for tag in pos_tags:
521
+ features[f"{col}_pos_{tag.lower()}_ratio"].append(0)
522
+ features[f"{col}_pos_noun_verb_ratio"].append(0)
523
+ features[f"{col}_pos_content_ratio"].append(0)
524
+
525
+ return features
526
+
527
+ def _extract_embeddings(self, texts: pd.Series, col: str) -> Optional[pd.DataFrame]:
528
+ """Extract sentence embeddings using sentence-transformers."""
529
+ if self._embedding_model is None:
530
+ self._load_embedding_model()
531
+ if self._embedding_model is None:
532
+ return None
533
+
534
+ try:
535
+ text_list = texts.tolist()
536
+ embeddings = self._embedding_model.encode(text_list, show_progress_bar=False)
537
+
538
+ # Apply PCA if fitted
539
+ if col in self._pca_models:
540
+ embeddings = self._pca_models[col].transform(embeddings)
541
+
542
+ feature_names = [f"{col}_emb_{i}" for i in range(embeddings.shape[1])]
543
+ return pd.DataFrame(embeddings, columns=feature_names, index=texts.index)
544
+
545
+ except Exception as e:
546
+ if self.config.verbose:
547
+ logger.warning(f"TextEngine: Embedding error: {e}")
548
+ return None
549
+
209
550
  def get_feature_set(self) -> FeatureSet:
210
551
  """Get the feature set with metadata."""
211
552
  return self._feature_set