featcopilot-0.2.0-py3-none-any.whl → featcopilot-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot/__init__.py +7 -0
- featcopilot/core/__init__.py +2 -0
- featcopilot/core/transform_rule.py +276 -0
- featcopilot/engines/tabular.py +145 -2
- featcopilot/engines/text.py +346 -8
- featcopilot/engines/timeseries.py +230 -1
- featcopilot/llm/__init__.py +2 -0
- featcopilot/llm/copilot_client.py +50 -17
- featcopilot/llm/semantic_engine.py +652 -10
- featcopilot/llm/transform_rule_generator.py +403 -0
- featcopilot/selection/importance.py +35 -7
- featcopilot/selection/redundancy.py +35 -9
- featcopilot/selection/statistical.py +103 -33
- featcopilot/selection/unified.py +54 -3
- featcopilot/stores/__init__.py +2 -0
- featcopilot/stores/rule_store.py +343 -0
- featcopilot/transformers/sklearn_compat.py +10 -1
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/METADATA +27 -19
- featcopilot-0.3.0.dist-info/RECORD +38 -0
- featcopilot-0.2.0.dist-info/RECORD +0 -35
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/WHEEL +0 -0
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/top_level.txt +0 -0
featcopilot/engines/text.py
CHANGED
```diff
@@ -1,6 +1,7 @@
 """Text feature engineering engine.
 
 Generates features from text data using embeddings and NLP techniques.
+Supports local offline processing with transformers and spacy.
 """
 
 from typing import Any, Optional, Union
@@ -22,33 +23,62 @@ class TextEngineConfig(EngineConfig):
     name: str = "TextEngine"
     features: list[str] = Field(
         default_factory=lambda: ["length", "word_count", "char_stats"],
-        description="Feature types to extract",
+        description="Feature types to extract: length, word_count, char_stats, tfidf, sentiment, ner, pos, embeddings",
     )
     max_vocab_size: int = Field(default=5000, description="Max vocabulary size for TF-IDF")
     n_components: int = Field(default=50, description="Components for dimensionality reduction")
+    embedding_model: str = Field(
+        default="sentence-transformers/all-MiniLM-L6-v2",
+        description="Sentence transformer model for embeddings",
+    )
+    embedding_dim: int = Field(default=32, description="Reduced embedding dimensions (PCA)")
+    spacy_model: str = Field(default="en_core_web_sm", description="Spacy model for NER/POS")
+    sentiment_model: str = Field(
+        default="cardiffnlp/twitter-roberta-base-sentiment-latest",
+        description="HuggingFace sentiment model",
+    )
 
 
 class TextEngine(BaseEngine):
     """
-    Text feature engineering engine.
+    Text feature engineering engine with advanced NLP capabilities.
 
     Extracts features from text columns including:
     - Length and character statistics
     - Word count features
    - TF-IDF features (optional)
-    - Sentiment
-    -
+    - Sentiment analysis using transformers (local, offline)
+    - Named Entity Recognition (NER) using spacy
+    - Part-of-speech (POS) tag distributions
+    - Sentence embeddings using sentence-transformers
 
     Parameters
     ----------
     features : list
-        Feature types to extract
+        Feature types to extract. Options:
+        - 'length': character and word counts
+        - 'word_count': word-level statistics
+        - 'char_stats': character-level statistics
+        - 'tfidf': TF-IDF with SVD reduction
+        - 'sentiment': transformer-based sentiment scores
+        - 'ner': named entity counts by type
+        - 'pos': part-of-speech tag distributions
+        - 'embeddings': sentence embeddings (reduced via PCA)
     max_vocab_size : int, default=5000
        Maximum vocabulary size for TF-IDF
+    embedding_model : str
+        Sentence transformer model name
+    spacy_model : str
+        Spacy model for NER/POS tagging
 
     Examples
     --------
-    >>>
+    >>> # Basic features (fast, no dependencies)
+    >>> engine = TextEngine(features=['length', 'word_count', 'char_stats'])
+    >>> X_features = engine.fit_transform(text_df)
+
+    >>> # Advanced features with transformers/spacy
+    >>> engine = TextEngine(features=['sentiment', 'ner', 'pos', 'embeddings'])
     >>> X_features = engine.fit_transform(text_df)
     """
 
```
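The new config fields mirror the constructor arguments added in the next hunk. As orientation, here is a minimal sketch of how 0.3.0's expanded surface might be driven; the import path is inferred from the file layout, the DataFrame and column name are illustrative, and the optional NLP extras are assumed installed:

```python
import pandas as pd
from featcopilot.engines.text import TextEngine  # path inferred from this diff

# Illustrative input: one free-text column.
text_df = pd.DataFrame(
    {"review": ["Great product, works perfectly!", "Terrible, broke after a day."]}
)

# Values shown are the defaults introduced in this release.
engine = TextEngine(
    features=["length", "sentiment", "embeddings"],
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    embedding_dim=32,
    sentiment_model="cardiffnlp/twitter-roberta-base-sentiment-latest",
)
X_features = engine.fit_transform(text_df)  # numeric features out, per the docstring
```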
```diff
@@ -56,6 +86,11 @@ class TextEngine(BaseEngine):
         self,
         features: Optional[list[str]] = None,
         max_vocab_size: int = 5000,
+        n_components: int = 50,
+        embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
+        embedding_dim: int = 32,
+        spacy_model: str = "en_core_web_sm",
+        sentiment_model: str = "cardiffnlp/twitter-roberta-base-sentiment-latest",
         max_features: Optional[int] = None,
         verbose: bool = False,
         **kwargs,
@@ -63,6 +98,11 @@ class TextEngine(BaseEngine):
         config = TextEngineConfig(
             features=features or ["length", "word_count", "char_stats"],
             max_vocab_size=max_vocab_size,
+            n_components=n_components,
+            embedding_model=embedding_model,
+            embedding_dim=embedding_dim,
+            spacy_model=spacy_model,
+            sentiment_model=sentiment_model,
             max_features=max_features,
             verbose=verbose,
             **kwargs,
@@ -73,6 +113,12 @@ class TextEngine(BaseEngine):
         self._vectorizers: dict[str, Any] = {}
         self._feature_set = FeatureSet()
 
+        # Lazy-loaded models
+        self._nlp = None  # spacy
+        self._sentiment_pipeline = None  # transformers
+        self._embedding_model = None  # sentence-transformers
+        self._pca_models: dict[str, Any] = {}  # PCA for embeddings
+
     def fit(
         self,
         X: Union[pd.DataFrame, np.ndarray],
```
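The four `None` slots make the heavy dependencies lazy: nothing is imported or downloaded until a feature type actually requires it. The idiom in isolation looks like this (an illustrative class, not featcopilot API):

```python
from typing import Any, Optional

class LazyNLP:
    """Sketch of the lazy-loading pattern the engine follows."""

    def __init__(self) -> None:
        self._nlp: Optional[Any] = None  # nothing imported or downloaded yet

    def nlp(self) -> Optional[Any]:
        # Import spacy and load the model only on first access.
        if self._nlp is None:
            try:
                import spacy
                self._nlp = spacy.load("en_core_web_sm")
            except ImportError:
                return None  # caller degrades gracefully, as TextEngine does
        return self._nlp
```

This keeps `TextEngine()` construction cheap for users who only ever request the basic length/word-count features.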
```diff
@@ -115,9 +161,86 @@ class TextEngine(BaseEngine):
         if "tfidf" in self.config.features:
             self._fit_tfidf(X)
 
+        # Fit embedding PCA if needed
+        if "embeddings" in self.config.features:
+            self._fit_embeddings(X)
+
+        # Load spacy model if needed
+        if "ner" in self.config.features or "pos" in self.config.features:
+            self._load_spacy()
+
+        # Load sentiment model if needed
+        if "sentiment" in self.config.features:
+            self._load_sentiment()
+
         self._is_fitted = True
         return self
 
+    def _load_spacy(self) -> None:
+        """Load spacy model for NER/POS tagging."""
+        if self._nlp is not None:
+            return
+
+        try:
+            import spacy
+
+            try:
+                self._nlp = spacy.load(self.config.spacy_model)
+                if self.config.verbose:
+                    logger.info(f"TextEngine: Loaded spacy model '{self.config.spacy_model}'")
+            except OSError:
+                # Try to download the model
+                if self.config.verbose:
+                    logger.info(f"TextEngine: Downloading spacy model '{self.config.spacy_model}'...")
+                spacy.cli.download(self.config.spacy_model)
+                self._nlp = spacy.load(self.config.spacy_model)
+
+        except ImportError:
+            logger.warning("TextEngine: spacy not installed. Install with: pip install spacy")
+            self._nlp = None
+
+    def _load_sentiment(self) -> None:
+        """Load sentiment analysis pipeline."""
+        if self._sentiment_pipeline is not None:
+            return
+
+        try:
+            from transformers import pipeline
+
+            self._sentiment_pipeline = pipeline(
+                "sentiment-analysis",
+                model=self.config.sentiment_model,
+                truncation=True,
+                max_length=512,
+            )
+            if self.config.verbose:
+                logger.info(f"TextEngine: Loaded sentiment model '{self.config.sentiment_model}'")
+
+        except ImportError:
+            logger.warning("TextEngine: transformers not installed. Install with: pip install transformers")
+            self._sentiment_pipeline = None
+        except Exception as e:
+            logger.warning(f"TextEngine: Could not load sentiment model: {e}")
+            self._sentiment_pipeline = None
+
+    def _load_embedding_model(self) -> None:
+        """Load sentence transformer model."""
+        if self._embedding_model is not None:
+            return
+
+        try:
+            from sentence_transformers import SentenceTransformer
+
+            self._embedding_model = SentenceTransformer(self.config.embedding_model)
+            if self.config.verbose:
+                logger.info(f"TextEngine: Loaded embedding model '{self.config.embedding_model}'")
+
+        except ImportError:
+            logger.warning(
+                "TextEngine: sentence-transformers not installed. Install with: pip install sentence-transformers"
+            )
+            self._embedding_model = None
+
     def _fit_tfidf(self, X: pd.DataFrame) -> None:
         """Fit TF-IDF vectorizers for text columns."""
         try:
```
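For readers unfamiliar with the three optional stacks, the loaders above boil down to these standalone calls. A sketch using the config defaults (assumes spacy, transformers, and sentence-transformers are installed, along with the `en_core_web_sm` model):

```python
import spacy
from transformers import pipeline
from sentence_transformers import SentenceTransformer

# spacy: NER/POS model, auto-downloaded by the engine on OSError.
nlp = spacy.load("en_core_web_sm")

# transformers: sentiment pipeline; long inputs are truncated to the model window.
sentiment = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment-latest",
    truncation=True,
    max_length=512,
)

# sentence-transformers: dense sentence embeddings.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

print(sentiment(["What a fantastic release!"]))  # e.g. [{'label': 'positive', 'score': 0.98}]
```

Each loader swallows `ImportError` and leaves the slot as `None`, so a missing extra disables that feature family rather than crashing the fit.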
```diff
@@ -140,6 +263,34 @@ class TextEngine(BaseEngine):
             if self.config.verbose:
                 logger.warning("TextEngine: sklearn not available for TF-IDF, skipping")
 
+    def _fit_embeddings(self, X: pd.DataFrame) -> None:
+        """Fit PCA for embedding dimensionality reduction."""
+        self._load_embedding_model()
+        if self._embedding_model is None:
+            return
+
+        try:
+            from sklearn.decomposition import PCA
+
+            for col in self._text_columns:
+                texts = X[col].fillna("").astype(str).tolist()
+                # Sample for fitting PCA (limit to 1000 for speed)
+                sample_texts = texts[: min(1000, len(texts))]
+                embeddings = self._embedding_model.encode(sample_texts, show_progress_bar=False)
+
+                # Fit PCA
+                n_components = min(self.config.embedding_dim, embeddings.shape[1], len(sample_texts))
+                if n_components > 0:
+                    pca = PCA(n_components=n_components)
+                    pca.fit(embeddings)
+                    self._pca_models[col] = pca
+
+                    if self.config.verbose:
+                        logger.info(f"TextEngine: Fitted embedding PCA for '{col}' ({n_components} components)")
+
+        except Exception as e:
+            logger.warning(f"TextEngine: Could not fit embeddings: {e}")
+
     def transform(self, X: Union[pd.DataFrame, np.ndarray], **kwargs) -> pd.DataFrame:
         """
         Extract text features.
```
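The component cap in `_fit_embeddings` is worth spelling out: PCA can use at most `min(requested dim, embedding width, sample count)` components. A sketch of that reduction step with random stand-in vectors (all-MiniLM-L6-v2 actually emits 384-dimensional embeddings):

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(1000, 384))  # stand-in for encoded sample texts

# Same cap as the engine: never ask PCA for more components than it can fit.
embedding_dim = 32
n_components = min(embedding_dim, embeddings.shape[1], len(embeddings))

pca = PCA(n_components=n_components).fit(embeddings)
reduced = pca.transform(embeddings)
print(reduced.shape)  # (1000, 32)
```

Fitting on at most 1000 sampled rows bounds the one-time cost; transform-time rows are then projected through the same fitted basis.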
```diff
@@ -152,7 +303,7 @@ class TextEngine(BaseEngine):
         Returns
         -------
         X_features : DataFrame
-            Extracted features
+            Extracted features (numerical only, text columns dropped)
         """
         if not self._is_fitted:
             raise RuntimeError("Engine must be fitted before transform")
@@ -191,7 +342,35 @@ class TextEngine(BaseEngine):
                 tfidf_features = self._transform_tfidf(texts, col)
                 result = pd.concat([result, tfidf_features], axis=1)
 
-
+            # Sentiment features (transformers)
+            if "sentiment" in self.config.features:
+                sentiment_features = self._extract_sentiment(texts, col)
+                for feat_name, feat_values in sentiment_features.items():
+                    result[feat_name] = feat_values
+
+            # NER features (spacy)
+            if "ner" in self.config.features:
+                ner_features = self._extract_ner(texts, col)
+                for feat_name, feat_values in ner_features.items():
+                    result[feat_name] = feat_values
+
+            # POS features (spacy)
+            if "pos" in self.config.features:
+                pos_features = self._extract_pos(texts, col)
+                for feat_name, feat_values in pos_features.items():
+                    result[feat_name] = feat_values
+
+            # Embedding features (sentence-transformers)
+            if "embeddings" in self.config.features:
+                emb_features = self._extract_embeddings(texts, col)
+                if emb_features is not None:
+                    result = pd.concat([result, emb_features], axis=1)
+
+        # Drop original text columns
+        cols_to_drop = [col for col in self._text_columns if col in result.columns]
+        result = result.drop(columns=cols_to_drop)
+
+        self._feature_names = [c for c in result.columns if c not in X.columns or c in cols_to_drop]
 
         if self.config.verbose:
             logger.info(f"TextEngine: Extracted {len(self._feature_names)} features")
```
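The net effect of the `transform` changes: the returned frame is purely numeric and the source text columns are gone. A self-contained sketch using only the dependency-free basic features (the derived column names in the comment are illustrative):

```python
import pandas as pd
from featcopilot.engines.text import TextEngine  # path inferred from this diff

text_df = pd.DataFrame({"review": ["Loved it!", "Not great."]})

# Basic features need no optional NLP packages.
engine = TextEngine(features=["length", "word_count", "char_stats"]).fit(text_df)

X_features = engine.transform(text_df)
assert "review" not in X_features.columns  # original text column is dropped
print(list(X_features.columns))            # e.g. ['review_length', ...] — names illustrative
```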
```diff
@@ -209,6 +388,165 @@ class TextEngine(BaseEngine):
         feature_names = [f"{col}_tfidf_{i}" for i in range(reduced.shape[1])]
         return pd.DataFrame(reduced, columns=feature_names, index=texts.index)
 
+    def _extract_sentiment(self, texts: pd.Series, col: str) -> dict[str, list]:
+        """Extract sentiment scores using transformers."""
+        if self._sentiment_pipeline is None:
+            self._load_sentiment()
+        if self._sentiment_pipeline is None:
+            return {}
+
+        features = {
+            f"{col}_sentiment_positive": [],
+            f"{col}_sentiment_negative": [],
+            f"{col}_sentiment_neutral": [],
+            f"{col}_sentiment_score": [],
+        }
+
+        # Process in batches for efficiency
+        batch_size = 32
+        text_list = texts.tolist()
+
+        for i in range(0, len(text_list), batch_size):
+            batch = text_list[i : i + batch_size]
+            # Truncate very long texts
+            batch = [t[:512] if len(t) > 512 else t for t in batch]
+
+            try:
+                results = self._sentiment_pipeline(batch)
+                for res in results:
+                    label = res["label"].lower()
+                    score = res["score"]
+
+                    # Map to standard sentiment scores
+                    if "positive" in label or label == "pos":
+                        features[f"{col}_sentiment_positive"].append(score)
+                        features[f"{col}_sentiment_negative"].append(0)
+                        features[f"{col}_sentiment_neutral"].append(0)
+                        features[f"{col}_sentiment_score"].append(score)
+                    elif "negative" in label or label == "neg":
+                        features[f"{col}_sentiment_positive"].append(0)
+                        features[f"{col}_sentiment_negative"].append(score)
+                        features[f"{col}_sentiment_neutral"].append(0)
+                        features[f"{col}_sentiment_score"].append(-score)
+                    else:  # neutral
+                        features[f"{col}_sentiment_positive"].append(0)
+                        features[f"{col}_sentiment_negative"].append(0)
+                        features[f"{col}_sentiment_neutral"].append(score)
+                        features[f"{col}_sentiment_score"].append(0)
+
+            except Exception as e:
+                # Fill with zeros on error
+                for _ in batch:
+                    features[f"{col}_sentiment_positive"].append(0)
+                    features[f"{col}_sentiment_negative"].append(0)
+                    features[f"{col}_sentiment_neutral"].append(0)
+                    features[f"{col}_sentiment_score"].append(0)
+                if self.config.verbose:
+                    logger.warning(f"TextEngine: Sentiment error: {e}")
+
+        return features
+
+    def _extract_ner(self, texts: pd.Series, col: str) -> dict[str, list]:
+        """Extract NER counts using spacy."""
+        if self._nlp is None:
+            return {}
+
+        # Entity types to count
+        entity_types = ["PERSON", "ORG", "GPE", "DATE", "MONEY", "PRODUCT", "EVENT", "LOC"]
+        features = {f"{col}_ner_{ent.lower()}": [] for ent in entity_types}
+        features[f"{col}_ner_total"] = []
+
+        for text in texts:
+            try:
+                doc = self._nlp(text[:10000])  # Limit text length
+                ent_counts = {ent: 0 for ent in entity_types}
+
+                for ent in doc.ents:
+                    if ent.label_ in ent_counts:
+                        ent_counts[ent.label_] += 1
+
+                for ent_type in entity_types:
+                    features[f"{col}_ner_{ent_type.lower()}"].append(ent_counts[ent_type])
+                features[f"{col}_ner_total"].append(len(doc.ents))
+
+            except Exception:
+                for ent_type in entity_types:
+                    features[f"{col}_ner_{ent_type.lower()}"].append(0)
+                features[f"{col}_ner_total"].append(0)
+
+        return features
+
+    def _extract_pos(self, texts: pd.Series, col: str) -> dict[str, list]:
+        """Extract POS tag distributions using spacy."""
+        if self._nlp is None:
+            return {}
+
+        # POS tags to track (ratios)
+        pos_tags = ["NOUN", "VERB", "ADJ", "ADV", "PROPN", "PRON", "DET", "ADP", "PUNCT"]
+        features = {f"{col}_pos_{tag.lower()}_ratio": [] for tag in pos_tags}
+        features[f"{col}_pos_noun_verb_ratio"] = []
+        features[f"{col}_pos_content_ratio"] = []  # nouns + verbs + adj
+
+        for text in texts:
+            try:
+                doc = self._nlp(text[:10000])
+                total_tokens = len(doc)
+
+                if total_tokens == 0:
+                    for tag in pos_tags:
+                        features[f"{col}_pos_{tag.lower()}_ratio"].append(0)
+                    features[f"{col}_pos_noun_verb_ratio"].append(0)
+                    features[f"{col}_pos_content_ratio"].append(0)
+                    continue
+
+                pos_counts = {tag: 0 for tag in pos_tags}
+                for token in doc:
+                    if token.pos_ in pos_counts:
+                        pos_counts[token.pos_] += 1
+
+                for tag in pos_tags:
+                    features[f"{col}_pos_{tag.lower()}_ratio"].append(pos_counts[tag] / total_tokens)
+
+                # Noun to verb ratio
+                verb_count = pos_counts["VERB"]
+                noun_count = pos_counts["NOUN"]
+                features[f"{col}_pos_noun_verb_ratio"].append(noun_count / max(verb_count, 1))
+
+                # Content word ratio (nouns + verbs + adjectives)
+                content_count = noun_count + verb_count + pos_counts["ADJ"]
+                features[f"{col}_pos_content_ratio"].append(content_count / total_tokens)
+
+            except Exception:
+                for tag in pos_tags:
+                    features[f"{col}_pos_{tag.lower()}_ratio"].append(0)
+                features[f"{col}_pos_noun_verb_ratio"].append(0)
+                features[f"{col}_pos_content_ratio"].append(0)
+
+        return features
+
+    def _extract_embeddings(self, texts: pd.Series, col: str) -> Optional[pd.DataFrame]:
+        """Extract sentence embeddings using sentence-transformers."""
+        if self._embedding_model is None:
+            self._load_embedding_model()
+        if self._embedding_model is None:
+            return None
+
+        try:
+            text_list = texts.tolist()
+            embeddings = self._embedding_model.encode(text_list, show_progress_bar=False)
+
+            # Apply PCA if fitted
+            if col in self._pca_models:
+                embeddings = self._pca_models[col].transform(embeddings)
+
+            feature_names = [f"{col}_emb_{i}" for i in range(embeddings.shape[1])]
+            return pd.DataFrame(embeddings, columns=feature_names, index=texts.index)
+
+        except Exception as e:
+            if self.config.verbose:
+                logger.warning(f"TextEngine: Embedding error: {e}")
+            return None
+
     def get_feature_set(self) -> FeatureSet:
         """Get the feature set with metadata."""
         return self._feature_set
```
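Two details of the extractors deserve a note. The sentiment mapping folds the winning class into one signed `{col}_sentiment_score` (positive → +score, negative → -score, neutral → 0) alongside the three per-class columns, so downstream selectors get both a compact signal and the full distribution. The spacy-based extractors simply count labels per document. A standalone sketch of that counting logic (requires `en_core_web_sm`; the sentence is illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple hired Jane Doe in London on Monday for $1 million.")

# Entity counts by type, as in _extract_ner (subset of the engine's label list).
ner_counts = {label: 0 for label in ["PERSON", "ORG", "GPE", "DATE", "MONEY"]}
for ent in doc.ents:
    if ent.label_ in ner_counts:
        ner_counts[ent.label_] += 1

# POS ratios, as in _extract_pos (subset of the engine's tag list).
pos_counts = {tag: 0 for tag in ["NOUN", "VERB", "ADJ"]}
for token in doc:
    if token.pos_ in pos_counts:
        pos_counts[token.pos_] += 1

total = len(doc)
content_ratio = (pos_counts["NOUN"] + pos_counts["VERB"] + pos_counts["ADJ"]) / total
print(ner_counts, round(content_ratio, 2))
```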