featcopilot 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot/__init__.py +7 -0
- featcopilot/core/__init__.py +2 -0
- featcopilot/core/transform_rule.py +276 -0
- featcopilot/engines/tabular.py +145 -2
- featcopilot/engines/text.py +346 -8
- featcopilot/engines/timeseries.py +230 -1
- featcopilot/llm/__init__.py +2 -0
- featcopilot/llm/copilot_client.py +50 -17
- featcopilot/llm/semantic_engine.py +652 -10
- featcopilot/llm/transform_rule_generator.py +403 -0
- featcopilot/selection/importance.py +35 -7
- featcopilot/selection/redundancy.py +35 -9
- featcopilot/selection/statistical.py +103 -33
- featcopilot/selection/unified.py +54 -3
- featcopilot/stores/__init__.py +2 -0
- featcopilot/stores/rule_store.py +343 -0
- featcopilot/transformers/sklearn_compat.py +10 -1
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/METADATA +27 -19
- featcopilot-0.3.0.dist-info/RECORD +38 -0
- featcopilot-0.2.0.dist-info/RECORD +0 -35
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/WHEEL +0 -0
- {featcopilot-0.2.0.dist-info → featcopilot-0.3.0.dist-info}/top_level.txt +0 -0
@@ -28,6 +28,14 @@ class SemanticEngineConfig(EngineConfig):
     backend: Literal["copilot", "litellm"] = Field(default="copilot", description="LLM backend to use")
     api_key: Optional[str] = Field(default=None, description="API key for litellm backend")
     api_base: Optional[str] = Field(default=None, description="Custom API base URL for litellm")
+    enable_text_features: bool = Field(default=True, description="Generate ML features from text columns")
+    keep_text_columns: bool = Field(
+        default=True, description="Keep original text columns (for models that handle them natively)"
+    )
+    text_feature_types: list[str] = Field(
+        default_factory=lambda: ["sentiment", "readability", "linguistic", "semantic"],
+        description="Types of text features to generate",
+    )
 
 
 class SemanticEngine(BaseEngine):
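
The four new config fields drive the 0.3.0 text-feature pipeline. A minimal usage sketch: the field names and defaults are taken from the hunk above, the import path is inferred from the file list, and any other required config fields are assumed to have usable defaults.

# Hypothetical usage sketch; only these four fields appear in the diff.
from featcopilot.llm.semantic_engine import SemanticEngineConfig

config = SemanticEngineConfig(
    enable_text_features=True,
    keep_text_columns=False,  # drop raw text once numeric features are extracted
    text_feature_types=["sentiment", "readability"],
)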
@@ -100,6 +108,8 @@ class SemanticEngine(BaseEngine):
         backend: Literal["copilot", "litellm"] = "copilot",
         api_key: Optional[str] = None,
         api_base: Optional[str] = None,
+        enable_text_features: bool = True,
+        text_feature_types: Optional[list[str]] = None,
         **kwargs,
     ):
         config = SemanticEngineConfig(
@@ -111,16 +121,20 @@
             backend=backend,
             api_key=api_key,
             api_base=api_base,
+            enable_text_features=enable_text_features,
+            text_feature_types=text_feature_types or ["sentiment", "readability", "linguistic", "semantic"],
             **kwargs,
         )
         super().__init__(config=config)
         self.config: SemanticEngineConfig = config
         self._client: Optional[Any] = None
         self._suggested_features: list[dict[str, Any]] = []
+        self._text_features: list[dict[str, Any]] = []
         self._feature_set = FeatureSet()
         self._column_info: dict[str, str] = {}
         self._column_descriptions: dict[str, str] = {}
         self._task_description: str = ""
+        self._text_columns: list[str] = []
 
     def _ensure_client(self) -> None:
         """Ensure LLM client is initialized."""
@@ -172,29 +186,53 @@
         self._column_descriptions = column_descriptions or {}
         self._task_description = task_description
 
-        # Build column info
+        # Build column info and detect text columns
         self._column_info = {}
+        self._text_columns = []
         for col in X.columns:
             dtype = str(X[col].dtype)
             if X[col].dtype == "object":
                 dtype = "string"
+                # Detect if it's a text column (long strings with high variance)
+                if X[col].str.len().mean() > 20 and X[col].nunique() > 10:
+                    self._text_columns.append(col)
             elif np.issubdtype(X[col].dtype, np.integer):
                 dtype = "integer"
             elif np.issubdtype(X[col].dtype, np.floating):
                 dtype = "float"
             self._column_info[col] = dtype
 
-
+        if self.config.verbose:
+            logger.info(f"SemanticEngine: Detected {len(self._text_columns)} text columns: {self._text_columns}")
+
+        # Generate text-specific features if enabled
+        if self.config.enable_text_features and self._text_columns:
+            self._text_features = self._generate_text_features(X)
+            if self.config.verbose:
+                logger.info(f"SemanticEngine: Generated {len(self._text_features)} text features")
+
+        # Get LLM suggestions for general features (excluding text columns)
         if self.config.verbose:
             logger.info("SemanticEngine: Requesting feature suggestions from LLM...")
 
-        self._suggested_features = self._client.suggest_features(
-            column_info=self._column_info,
-            task_description=task_description,
-            column_descriptions=column_descriptions,
-            domain=self.config.domain,
-            max_suggestions=self.config.max_suggestions,
-        )
+        # Filter out text columns for general feature suggestions
+        non_text_column_info = {k: v for k, v in self._column_info.items() if k not in self._text_columns}
+
+        if non_text_column_info:
+            try:
+                self._suggested_features = self._client.suggest_features(
+                    column_info=non_text_column_info,
+                    task_description=task_description,
+                    column_descriptions=column_descriptions,
+                    domain=self.config.domain,
+                    max_suggestions=self.config.max_suggestions,
+                )
+            except Exception as e:
+                if self.config.verbose:
+                    logger.warning(f"SemanticEngine: Could not get LLM suggestions: {e}")
+                self._suggested_features = []
+        else:
+            self._suggested_features = []
 
         if self.config.verbose:
             logger.info(f"SemanticEngine: Received {len(self._suggested_features)} suggestions")
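
The detection heuristic added in this hunk treats an object column as free text when its values average more than 20 characters and take more than 10 distinct values. A standalone sketch of that rule (the thresholds come from the diff; the sample data is illustrative):

import pandas as pd

def looks_like_text(s: pd.Series) -> bool:
    # Object columns with long, high-cardinality strings are treated as free text.
    return (
        s.dtype == "object"
        and s.str.len().mean() > 20
        and s.nunique() > 10
    )

df = pd.DataFrame({
    "review": [f"This product is surprisingly good, review number {i}" for i in range(15)],
    "country": ["US", "DE", "FR"] * 5,
})
print(looks_like_text(df["review"]))   # True: long, varied strings
print(looks_like_text(df["country"]))  # False: short codes, few unique values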
@@ -209,6 +247,198 @@
         self._is_fitted = True
         return self
 
+    def _generate_text_features(self, X: pd.DataFrame) -> list[dict[str, Any]]:
+        """
+        Generate ML-ready numerical features from text columns using LLM suggestions.
+
+        This is the key differentiator - LLM suggests Python code to transform text
+        into numerical features that can be used by ML models.
+        """
+        text_features = []
+
+        for col in self._text_columns:
+            # Always add fallback features first (don't require LLM)
+            fallback_features = self._get_fallback_text_features(col)
+            text_features.extend(fallback_features)
+
+            # Try to get LLM-suggested features (optional)
+            try:
+                col_desc = self._column_descriptions.get(col, f"Text column: {col}")
+
+                # Use suggest_features instead of send_prompt for better compatibility
+                response = self._client.suggest_features(
+                    column_info={col: "string"},
+                    task_description=f"Extract numerical features from text column '{col}' for {self._task_description}",
+                    column_descriptions={col: col_desc},
+                    domain=self.config.domain,
+                    max_suggestions=5,
+                )
+
+                # Response is already parsed as list of features
+                for f in response:
+                    f["source_columns"] = [col]
+                    f["is_text_feature"] = True
+                    text_features.append(f)
+
+            except Exception as e:
+                if self.config.verbose:
+                    logger.warning(f"SemanticEngine: Could not get LLM suggestions for '{col}': {e}")
+
+        return text_features
+
+    def _build_text_feature_prompt(self, col: str, samples: list[str], description: str) -> str:
+        """Build prompt for text feature generation."""
+        return f"""You are an expert data scientist. Generate Python code to extract NUMERICAL features from text data.
+
+## Text Column
+Name: {col}
+Description: {description}
+
+## Sample Values
+{chr(10).join([f'- "{s[:200]}..."' if len(str(s)) > 200 else f'- "{s}"' for s in samples[:5]])}
+
+## Task
+{self._task_description}
+
+## Requirements
+Generate features that transform text into NUMERICAL values suitable for ML models:
+1. Sentiment scores (positive/negative/neutral)
+2. Readability metrics (Flesch score, word complexity)
+3. Linguistic features (noun ratio, verb ratio, sentence count)
+4. Pattern detection (contains numbers, URLs, emails)
+5. Domain-specific indicators
+
+## Output Format
+Return JSON with "features" array:
+{{
+    "features": [
+        {{
+            "name": "{col}_sentiment_score",
+            "code": "result = df['{col}'].apply(lambda x: len([w for w in str(x).lower().split() if w in ['good','great','excellent','best']]) - len([w for w in str(x).lower().split() if w in ['bad','poor','worst','terrible']]))",
+            "explanation": "Simple sentiment score based on positive/negative word counts"
+        }}
+    ]
+}}
+
+Return ONLY the JSON object, no other text. Generate 5-10 useful features."""
+
+    def _parse_text_features(self, response: str, col: str) -> list[dict[str, Any]]:
+        """Parse text features from LLM response."""
+        import json
+        import re
+
+        try:
+            response = response.strip()
+            if response.startswith("```"):
+                lines = response.split("\n")
+                response = "\n".join(lines[1:-1])
+
+            data = json.loads(response)
+            features = data.get("features", [])
+
+            # Add source column info
+            for f in features:
+                f["source_columns"] = [col]
+                f["is_text_feature"] = True
+
+            return features
+
+        except json.JSONDecodeError:
+            json_match = re.search(r"\{.*\}", response, re.DOTALL)
+            if json_match:
+                try:
+                    data = json.loads(json_match.group())
+                    features = data.get("features", [])
+                    for f in features:
+                        f["source_columns"] = [col]
+                        f["is_text_feature"] = True
+                    return features
+                except json.JSONDecodeError:
+                    pass
+            return []
+
+    def _get_fallback_text_features(self, col: str) -> list[dict[str, Any]]:
+        """Generate fallback text features that don't require LLM."""
+        return [
+            {
+                "name": f"{col}_char_length",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.len()",
+                "explanation": "Character length of text",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_word_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.split().str.len()",
+                "explanation": "Word count in text",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_avg_word_length",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: np.mean([len(w) for w in x.split()] or [0]))",
+                "explanation": "Average word length",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_sentence_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'[.!?]+')",
+                "explanation": "Number of sentences (approximate)",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_uppercase_ratio",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: sum(1 for c in x if c.isupper()) / max(len(x), 1))",
+                "explanation": "Ratio of uppercase characters",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_digit_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'\\d')",
+                "explanation": "Count of digits in text",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_special_char_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'[^a-zA-Z0-9\\s]')",
+                "explanation": "Count of special characters",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_unique_word_ratio",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: len(set(x.lower().split())) / max(len(x.split()), 1))",
+                "explanation": "Ratio of unique words to total words",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_exclamation_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count('!')",
+                "explanation": "Count of exclamation marks (indicates emphasis/emotion)",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_question_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'\\?')",
+                "explanation": "Count of question marks",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_caps_word_ratio",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: sum(1 for w in x.split() if w.isupper()) / max(len(x.split()), 1))",
+                "explanation": "Ratio of all-caps words (indicates shouting/emphasis)",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+        ]
+
     def _validate_suggestions(self, X: pd.DataFrame) -> None:
         """Validate suggested feature code."""
         valid_features = []
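
Each fallback feature is a dict whose "code" string assigns a pandas Series to result. Evaluating two of those code strings by hand shows the intended output (toy data; featcopilot itself runs the strings through the restricted exec shown in the transform hunk below):

import pandas as pd

df = pd.DataFrame({"review": ["Great value!!", "Meh.", None]})

# review_char_length: NaN becomes "" so the length is 0 rather than NaN
char_length = df["review"].fillna("").astype(str).str.len()
# review_word_count: whitespace split; the empty string yields zero words
word_count = df["review"].fillna("").astype(str).str.split().str.len()

print(char_length.tolist())  # [13, 4, 0]
print(word_count.tolist())   # [2, 1, 0]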
@@ -237,6 +467,20 @@
         """Build FeatureSet from suggestions."""
         self._feature_set = FeatureSet()
 
+        # Add text features
+        for suggestion in self._text_features:
+            feature = Feature(
+                name=suggestion.get("name", f"text_feature_{len(self._feature_set)}"),
+                dtype=FeatureType.NUMERIC,
+                origin=FeatureOrigin.LLM_GENERATED,
+                source_columns=suggestion.get("source_columns", []),
+                transformation="text_to_numeric",
+                explanation=suggestion.get("explanation", ""),
+                code=suggestion.get("code", ""),
+            )
+            self._feature_set.add(feature)
+
+        # Add general features
         for suggestion in self._suggested_features:
             feature = Feature(
                 name=suggestion.get("name", f"llm_feature_{len(self._feature_set)}"),
@@ -261,7 +505,7 @@
         Returns
         -------
         X_features : DataFrame
-            Data with generated features
+            Data with generated features (numerical only, text columns dropped)
         """
         if not self._is_fitted:
             raise RuntimeError("Engine must be fitted before transform")
@@ -271,6 +515,52 @@
 
         successful_features = []
 
+        # Apply text features first
+        for suggestion in self._text_features:
+            name = suggestion.get("name", "")
+            code = suggestion.get("code", "")
+
+            if not code:
+                continue
+
+            try:
+                local_vars = {"df": result, "np": np, "pd": pd}
+                exec(
+                    code,
+                    {
+                        "__builtins__": {
+                            "len": len,
+                            "sum": sum,
+                            "max": max,
+                            "min": min,
+                            "abs": abs,
+                            "round": round,
+                            "int": int,
+                            "float": float,
+                            "str": str,
+                            "list": list,
+                            "dict": dict,
+                            "set": set,
+                        },
+                        "np": np,
+                        "pd": pd,
+                    },
+                    local_vars,
+                )
+
+                if "result" in local_vars:
+                    feature_values = local_vars["result"]
+                    if isinstance(feature_values, pd.Series):
+                        result[name] = feature_values.values
+                    else:
+                        result[name] = feature_values
+                    successful_features.append(name)
+
+            except Exception as e:
+                if self.config.verbose:
+                    logger.error(f"SemanticEngine: Error computing text feature '{name}': {e}")
+
+        # Apply general features
         for suggestion in self._suggested_features:
             name = suggestion.get("name", "")
             code = suggestion.get("code", "")
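
The transform hunk above executes each generated "code" string under a whitelisted set of builtins. A minimal standalone sketch of that exec pattern (toy data; note this whitelist limits accidental damage but is not a hardened sandbox against adversarial code):

import numpy as np
import pandas as pd

df = pd.DataFrame({"review": ["GOOD product", "bad, very bad"]})
code = "result = df['review'].fillna('').astype(str).str.len()"

# The generated code must assign to `result`; it only sees df/np/pd
# plus a few whitelisted builtins.
local_vars = {"df": df, "np": np, "pd": pd}
safe_globals = {
    "__builtins__": {"len": len, "sum": sum, "max": max, "min": min, "str": str},
    "np": np,
    "pd": pd,
}
exec(code, safe_globals, local_vars)
print(local_vars["result"].tolist())  # [12, 13]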
@@ -320,6 +610,14 @@
         # Handle infinities and NaNs
         result = result.replace([np.inf, -np.inf], np.nan)
 
+        # Optionally drop original text columns (only if not keeping them for downstream models)
+        if not self.config.keep_text_columns:
+            cols_to_drop = [col for col in self._text_columns if col in result.columns]
+            if cols_to_drop:
+                result = result.drop(columns=cols_to_drop)
+                if self.config.verbose:
+                    logger.info(f"SemanticEngine: Dropped {len(cols_to_drop)} text columns, keeping numerical features")
+
         self._feature_names = successful_features
 
         if self.config.verbose:
@@ -419,6 +717,350 @@
         """Get the feature set with metadata."""
         return self._feature_set
 
+    def standardize_categories(
+        self,
+        df: pd.DataFrame,
+        column: str,
+        target_categories: Optional[list[str]] = None,
+        similarity_threshold: float = 0.8,
+        max_categories: int = 50,
+        context: Optional[str] = None,
+    ) -> dict[str, str]:
+        """
+        Use LLM to standardize similar category values in a column.
+
+        Identifies semantically similar values (e.g., "software engineer", "Software Engineer",
+        "SDE") and maps them to a canonical form.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame containing the column to standardize
+        column : str
+            Name of the categorical column to standardize
+        target_categories : list[str], optional
+            If provided, map values to these specific categories.
+            If None, LLM will infer appropriate canonical forms.
+        similarity_threshold : float, default=0.8
+            Minimum similarity for grouping (hint for LLM, not strictly enforced)
+        max_categories : int, default=50
+            Maximum number of unique values to process (for efficiency)
+        context : str, optional
+            Additional context about the data domain (e.g., "job titles in tech industry")
+
+        Returns
+        -------
+        mapping : dict[str, str]
+            Dictionary mapping original values to standardized values.
+            Only includes values that need transformation.
+
+        Examples
+        --------
+        >>> engine = SemanticEngine()
+        >>> mapping = engine.standardize_categories(
+        ...     df,
+        ...     column="job_title",
+        ...     context="job titles in software industry"
+        ... )
+        >>> print(mapping)
+        {'software engineer': 'Software Engineer', 'SDE': 'Software Engineer',
+         'Sr. SWE': 'Senior Software Engineer', 'data scientist': 'Data Scientist'}
+
+        >>> # Apply the mapping
+        >>> df_clean = engine.apply_category_mapping(df, "job_title", mapping)
+        """
+        if column not in df.columns:
+            raise ValueError(f"Column '{column}' not found in DataFrame")
+
+        self._ensure_client()
+
+        # Get unique values (excluding NaN)
+        unique_values = df[column].dropna().unique().tolist()
+
+        # Convert to strings and filter
+        unique_values = [str(v) for v in unique_values if v is not None and str(v).strip()]
+        unique_values = list(set(unique_values))  # Remove duplicates after string conversion
+
+        if len(unique_values) == 0:
+            if self.config.verbose:
+                logger.info(f"SemanticEngine: No valid values found in column '{column}'")
+            return {}
+
+        if len(unique_values) > max_categories:
+            if self.config.verbose:
+                logger.warning(
+                    f"SemanticEngine: Column '{column}' has {len(unique_values)} unique values, "
+                    f"truncating to {max_categories} most frequent"
+                )
+            # Get most frequent values
+            value_counts = df[column].value_counts().head(max_categories)
+            unique_values = [str(v) for v in value_counts.index.tolist()]
+
+        # Build and send prompt
+        prompt = self._build_category_standardization_prompt(
+            column=column,
+            unique_values=unique_values,
+            target_categories=target_categories,
+            context=context,
+            similarity_threshold=similarity_threshold,
+        )
+
+        try:
+            # Use the client's send_prompt method if available, otherwise use suggest_features
+            if hasattr(self._client, "send_prompt"):
+                response = self._client.send_prompt(prompt)
+            else:
+                # Fallback: use suggest_features with a specialized task
+                response_list = self._client.suggest_features(
+                    column_info={column: "categorical"},
+                    task_description=prompt,
+                    column_descriptions={column: context or "Categorical column to standardize"},
+                    domain=self.config.domain,
+                    max_suggestions=1,
+                )
+                # Extract mapping from response if possible
+                if response_list and isinstance(response_list, list) and len(response_list) > 0:
+                    first = response_list[0]
+                    if isinstance(first, dict) and "mapping" in first:
+                        return first["mapping"]
+                    response = str(first)
+                else:
+                    response = str(response_list)
+
+            mapping = self._parse_category_mapping(response, unique_values)
+
+            if self.config.verbose:
+                logger.info(f"SemanticEngine: Created mapping for {len(mapping)} values in column '{column}'")
+
+            return mapping
+
+        except Exception as e:
+            if self.config.verbose:
+                logger.error(f"SemanticEngine: Error standardizing categories: {e}")
+            return {}
+
+    def _build_category_standardization_prompt(
+        self,
+        column: str,
+        unique_values: list[str],
+        target_categories: Optional[list[str]] = None,
+        context: Optional[str] = None,
+        similarity_threshold: float = 0.8,
+    ) -> str:
+        """Build prompt for category standardization."""
+        values_str = "\n".join([f'- "{v}"' for v in unique_values[:100]])
+
+        target_str = ""
+        if target_categories:
+            target_str = f"""
+## Target Categories (map values to these)
+{chr(10).join([f'- "{c}"' for c in target_categories])}
+"""
+
+        context_str = f"\n## Context\n{context}" if context else ""
+
+        return f"""You are an expert data scientist specializing in data cleaning and standardization.
+
+## Task
+Analyze the following categorical values from column "{column}" and identify semantically similar values that should be standardized to a common form.
+
+## Unique Values in Column
+{values_str}
+{target_str}{context_str}
+
+## Requirements
+1. Identify values that represent the same concept (case variations, abbreviations, typos, synonyms)
+2. Map similar values to a single canonical/standardized form
+3. Use proper capitalization for the standardized form (e.g., "Software Engineer" not "software engineer")
+4. Common patterns to look for:
+   - Case variations: "Software Engineer" vs "software engineer" vs "SOFTWARE ENGINEER"
+   - Abbreviations: "SDE" vs "Software Development Engineer", "Sr." vs "Senior"
+   - Typos: "Enginer" vs "Engineer"
+   - Synonyms: "Developer" vs "Programmer" vs "Software Engineer"
+   - Formatting: "Data-Scientist" vs "Data Scientist" vs "DataScientist"
+5. Only include values that need mapping (exclude already-standardized values)
+6. Preserve values that are already properly formatted or don't have similar alternatives
+
+## Output Format
+Return ONLY a valid JSON object with this structure:
+{{
+    "mapping": {{
+        "original_value_1": "Standardized Value",
+        "original_value_2": "Standardized Value",
+        "typo_value": "Corrected Value"
+    }},
+    "groups": [
+        {{
+            "canonical": "Software Engineer",
+            "members": ["software engineer", "SDE", "Software Dev", "SW Engineer"]
+        }}
+    ]
+}}
+
+Return ONLY the JSON object, no markdown formatting, no explanation text."""
+
+    def _parse_category_mapping(
+        self,
+        response: str,
+        original_values: list[str],
+    ) -> dict[str, str]:
+        """Parse category mapping from LLM response."""
+        import json
+        import re
+
+        try:
+            # Clean response
+            response = response.strip()
+
+            # Remove markdown code blocks if present
+            if response.startswith("```"):
+                lines = response.split("\n")
+                # Find the JSON content between ``` markers
+                start_idx = 1 if lines[0].startswith("```") else 0
+                end_idx = len(lines)
+                for i, line in enumerate(lines[1:], 1):
+                    if line.strip() == "```":
+                        end_idx = i
+                        break
+                response = "\n".join(lines[start_idx:end_idx])
+
+            # Try to parse as JSON
+            data = json.loads(response)
+
+            # Extract mapping from response
+            if isinstance(data, dict):
+                if "mapping" in data:
+                    mapping = data["mapping"]
+                elif "groups" in data:
+                    # Build mapping from groups
+                    mapping = {}
+                    for group in data["groups"]:
+                        canonical = group.get("canonical", "")
+                        members = group.get("members", [])
+                        for member in members:
+                            if member != canonical:
+                                mapping[member] = canonical
+                else:
+                    # Assume the entire dict is the mapping
+                    mapping = data
+            else:
+                mapping = {}
+
+            # Validate mapping - only keep mappings for values that exist
+            original_set = set(original_values)
+            original_lower = {v.lower(): v for v in original_values}
+
+            validated_mapping = {}
+            for orig, standardized in mapping.items():
+                # Check exact match or case-insensitive match
+                if orig in original_set:
+                    validated_mapping[orig] = standardized
+                elif orig.lower() in original_lower:
+                    actual_orig = original_lower[orig.lower()]
+                    validated_mapping[actual_orig] = standardized
+
+            return validated_mapping
+
+        except json.JSONDecodeError:
+            # Try to extract JSON from response
+            json_match = re.search(r"\{[\s\S]*\}", response)
+            if json_match:
+                try:
+                    return self._parse_category_mapping(json_match.group(), original_values)
+                except Exception:
+                    pass
+
+            if self.config.verbose:
+                logger.warning("SemanticEngine: Could not parse category mapping response")
+            return {}
+
+    def apply_category_mapping(
+        self,
+        df: pd.DataFrame,
+        column: str,
+        mapping: dict[str, str],
+        inplace: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Apply a category mapping to standardize values in a DataFrame column.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame
+        column : str
+            Column to transform
+        mapping : dict[str, str]
+            Mapping from original values to standardized values
+        inplace : bool, default=False
+            If True, modify DataFrame in place
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with standardized column values
+        """
+        if column not in df.columns:
+            raise ValueError(f"Column '{column}' not found in DataFrame")
+
+        if not inplace:
+            df = df.copy()
+
+        # Apply mapping, keeping original values for unmapped entries
+        df[column] = df[column].apply(lambda x: mapping.get(str(x), x) if pd.notna(x) else x)
+
+        if self.config.verbose:
+            logger.info(f"SemanticEngine: Applied mapping to column '{column}'")
+
+        return df
+
+    def standardize_multiple_columns(
+        self,
+        df: pd.DataFrame,
+        columns: list[str],
+        contexts: Optional[dict[str, str]] = None,
+        **kwargs,
+    ) -> tuple[pd.DataFrame, dict[str, dict[str, str]]]:
+        """
+        Standardize multiple categorical columns at once.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame
+        columns : list[str]
+            List of column names to standardize
+        contexts : dict[str, str], optional
+            Context descriptions for each column
+        **kwargs
+            Additional arguments passed to standardize_categories
+
+        Returns
+        -------
+        df_clean : DataFrame
+            DataFrame with standardized columns
+        all_mappings : dict[str, dict[str, str]]
+            Dictionary of mappings for each column
+        """
+        contexts = contexts or {}
+        all_mappings = {}
+        result_df = df.copy()
+
+        for col in columns:
+            if col not in df.columns:
+                if self.config.verbose:
+                    logger.warning(f"SemanticEngine: Column '{col}' not found, skipping")
+                continue
+
+            context = contexts.get(col)
+            mapping = self.standardize_categories(result_df, col, context=context, **kwargs)
+            all_mappings[col] = mapping
+
+            if mapping:
+                result_df = self.apply_category_mapping(result_df, col, mapping)
+
+        return result_df, all_mappings
+
     def __del__(self):
         """Clean up client on deletion."""
         if self._client:
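
apply_category_mapping leaves unmapped values and NaNs untouched; only keys present in the mapping are rewritten. A short sketch of that semantics using the same lambda as the method above (the mapping dict stands in for a parsed LLM response):

import pandas as pd

mapping = {"software engineer": "Software Engineer", "SDE": "Software Engineer"}
df = pd.DataFrame({"job_title": ["software engineer", "SDE", "Data Scientist", None]})

# Same rule as apply_category_mapping: map known strings, pass everything else through.
df["job_title"] = df["job_title"].apply(
    lambda x: mapping.get(str(x), x) if pd.notna(x) else x
)
print(df["job_title"].tolist())
# ['Software Engineer', 'Software Engineer', 'Data Scientist', None]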