featcopilot 0.1.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
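The headline changes in 0.3.0 are a pluggable LLM backend and automatic text-feature generation. Before the diff itself, a minimal sketch of the new surface; the import path is an assumption (the module path is not shown in this diff), and the parameter names are taken from the `SemanticEngine.__init__` signature below:

```python
# Sketch only: import path is assumed, and no LLM call is made here.
from featcopilot.semantic.engine import SemanticEngine  # hypothetical module path

# Approximate 0.1.0 behavior: Copilot backend, no automatic text features
legacy = SemanticEngine(backend="copilot", enable_text_features=False)

# New in 0.3.0: route requests through LiteLLM instead of the Copilot SDK
engine = SemanticEngine(
    model="gpt-4o",
    backend="litellm",
    api_key="your-api-key",  # or rely on the provider's environment variable
)
```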
@@ -3,7 +3,7 @@
 Uses contextual understanding of data to generate meaningful features.
 """
 
-from typing import Any, Optional, Union
+from typing import Any, Literal, Optional, Union
 
 import numpy as np
 import pandas as pd
@@ -11,25 +11,38 @@ from pydantic import Field
 
 from featcopilot.core.base import BaseEngine, EngineConfig
 from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
-from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
+from featcopilot.utils.logger import get_logger
+
+logger = get_logger(__name__)
 
 
 class SemanticEngineConfig(EngineConfig):
     """Configuration for semantic feature engine."""
 
     name: str = "SemanticEngine"
-    model: str = Field(default="gpt-5", description="LLM model to use")
+    model: str = Field(default="gpt-5.2", description="LLM model to use")
     max_suggestions: int = Field(default=20, description="Max features to suggest")
     validate_features: bool = Field(default=True, description="Validate generated code")
     domain: Optional[str] = Field(default=None, description="Domain context")
     temperature: float = Field(default=0.3, description="LLM temperature")
+    backend: Literal["copilot", "litellm"] = Field(default="copilot", description="LLM backend to use")
+    api_key: Optional[str] = Field(default=None, description="API key for litellm backend")
+    api_base: Optional[str] = Field(default=None, description="Custom API base URL for litellm")
+    enable_text_features: bool = Field(default=True, description="Generate ML features from text columns")
+    keep_text_columns: bool = Field(
+        default=True, description="Keep original text columns (for models that handle them natively)"
+    )
+    text_feature_types: list[str] = Field(
+        default_factory=lambda: ["sentiment", "readability", "linguistic", "semantic"],
+        description="Types of text features to generate",
+    )
 
 
 class SemanticEngine(BaseEngine):
     """
     LLM-powered semantic feature engineering engine.
 
-    Uses GitHub Copilot SDK to:
+    Uses GitHub Copilot SDK or LiteLLM to:
     - Understand column semantics from names and descriptions
     - Generate domain-aware features
     - Create interpretable features with explanations
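The new config fields in this hunk can also be set directly on `SemanticEngineConfig`; fields not exposed as explicit `__init__` parameters (such as `keep_text_columns`) flow through `SemanticEngine(**kwargs)` into the config, per the constructor below. A hedged sketch using only fields declared above; the import path is again an assumption:

```python
from featcopilot.semantic.engine import SemanticEngineConfig  # hypothetical module path

config = SemanticEngineConfig(
    backend="litellm",                  # default is "copilot"
    api_base="http://localhost:11434",  # e.g., a self-hosted endpoint
    keep_text_columns=False,            # drop raw text once numeric features exist
    text_feature_types=["sentiment", "readability"],
)
```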
@@ -39,7 +52,7 @@ class SemanticEngine(BaseEngine):
 
     Parameters
     ----------
-    model : str, default='gpt-5'
+    model : str, default='gpt-5.2'
         LLM model to use
     max_suggestions : int, default=20
         Maximum number of features to suggest
@@ -47,24 +60,56 @@ class SemanticEngine(BaseEngine):
         Whether to validate generated feature code
     domain : str, optional
         Domain context (e.g., 'healthcare', 'finance', 'retail')
+    backend : str, default='copilot'
+        LLM backend to use: 'copilot' or 'litellm'
+    api_key : str, optional
+        API key for litellm backend (uses environment variable if not provided)
+    api_base : str, optional
+        Custom API base URL for litellm backend (for self-hosted models)
 
     Examples
     --------
-    >>> engine = SemanticEngine(model='gpt-5', domain='healthcare')
+    Using GitHub Copilot SDK (default):
+    >>> engine = SemanticEngine(model='gpt-5.2', domain='healthcare')
     >>> X_features = engine.fit_transform(
     ...     X, y,
     ...     column_descriptions={'age': 'Patient age', 'bmi': 'Body mass index'},
     ...     task_description='Predict diabetes risk'
     ... )
+
+    Using LiteLLM with OpenAI:
+    >>> engine = SemanticEngine(
+    ...     model='gpt-4o',
+    ...     backend='litellm',
+    ...     api_key='your-api-key'  # or set OPENAI_API_KEY env var
+    ... )
+
+    Using LiteLLM with Anthropic:
+    >>> engine = SemanticEngine(
+    ...     model='claude-3-opus',
+    ...     backend='litellm'
+    ... )
+
+    Using LiteLLM with local Ollama:
+    >>> engine = SemanticEngine(
+    ...     model='ollama/llama2',
+    ...     backend='litellm',
+    ...     api_base='http://localhost:11434'
+    ... )
     """
 
     def __init__(
         self,
-        model: str = "gpt-5",
+        model: str = "gpt-5.2",
         max_suggestions: int = 20,
         validate_features: bool = True,
         domain: Optional[str] = None,
         verbose: bool = False,
+        backend: Literal["copilot", "litellm"] = "copilot",
+        api_key: Optional[str] = None,
+        api_base: Optional[str] = None,
+        enable_text_features: bool = True,
+        text_feature_types: Optional[list[str]] = None,
         **kwargs,
     ):
         config = SemanticEngineConfig(
@@ -73,21 +118,39 @@ class SemanticEngine(BaseEngine):
             validate_features=validate_features,
             domain=domain,
             verbose=verbose,
+            backend=backend,
+            api_key=api_key,
+            api_base=api_base,
+            enable_text_features=enable_text_features,
+            text_feature_types=text_feature_types or ["sentiment", "readability", "linguistic", "semantic"],
             **kwargs,
         )
         super().__init__(config=config)
         self.config: SemanticEngineConfig = config
-        self._client: Optional[SyncCopilotFeatureClient] = None
+        self._client: Optional[Any] = None
         self._suggested_features: list[dict[str, Any]] = []
+        self._text_features: list[dict[str, Any]] = []
         self._feature_set = FeatureSet()
         self._column_info: dict[str, str] = {}
         self._column_descriptions: dict[str, str] = {}
         self._task_description: str = ""
+        self._text_columns: list[str] = []
 
     def _ensure_client(self) -> None:
-        """Ensure Copilot client is initialized."""
+        """Ensure LLM client is initialized."""
         if self._client is None:
-            self._client = SyncCopilotFeatureClient(model=self.config.model)
+            if self.config.backend == "litellm":
+                from featcopilot.llm.litellm_client import SyncLiteLLMFeatureClient
+
+                self._client = SyncLiteLLMFeatureClient(
+                    model=self.config.model,
+                    api_key=self.config.api_key,
+                    api_base=self.config.api_base,
+                )
+            else:
+                from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
+
+                self._client = SyncCopilotFeatureClient(model=self.config.model)
         self._client.start()
 
     def fit(
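`_ensure_client` now imports each client lazily, so the litellm dependency is only touched when that backend is selected, and `_client` is typed `Optional[Any]` rather than the concrete Copilot class. One consequence, sketched below with a hypothetical stub that is not part of featcopilot: any object exposing `start()` and `suggest_features()` can be injected, for example in offline tests.

```python
from featcopilot.semantic.engine import SemanticEngine  # assumed path, as above

class StubFeatureClient:
    """Hypothetical stand-in matching the client surface the engine uses."""

    def start(self) -> None:
        pass  # real clients open a session here

    def suggest_features(self, column_info, task_description,
                         column_descriptions=None, domain=None, max_suggestions=20):
        # One canned suggestion in the dict shape the engine consumes
        return [{"name": "age_squared",
                 "code": "result = df['age'] ** 2",
                 "explanation": "Square of age"}]

engine = SemanticEngine()
engine._client = StubFeatureClient()  # _ensure_client() then skips backend dispatch
```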
@@ -123,32 +186,56 @@
         self._column_descriptions = column_descriptions or {}
         self._task_description = task_description
 
-        # Build column info
+        # Build column info and detect text columns
         self._column_info = {}
+        self._text_columns = []
         for col in X.columns:
             dtype = str(X[col].dtype)
             if X[col].dtype == "object":
                 dtype = "string"
+                # Detect if it's a text column (long strings with many distinct values)
+                if X[col].str.len().mean() > 20 and X[col].nunique() > 10:
+                    self._text_columns.append(col)
             elif np.issubdtype(X[col].dtype, np.integer):
                 dtype = "integer"
             elif np.issubdtype(X[col].dtype, np.floating):
                 dtype = "float"
             self._column_info[col] = dtype
 
-        # Get LLM suggestions
         if self.config.verbose:
-            print("SemanticEngine: Requesting feature suggestions from LLM...")
+            logger.info(f"SemanticEngine: Detected {len(self._text_columns)} text columns: {self._text_columns}")
 
-        self._suggested_features = self._client.suggest_features(
-            column_info=self._column_info,
-            task_description=task_description,
-            column_descriptions=column_descriptions,
-            domain=self.config.domain,
-            max_suggestions=self.config.max_suggestions,
-        )
+        # Generate text-specific features if enabled
+        if self.config.enable_text_features and self._text_columns:
+            self._text_features = self._generate_text_features(X)
+            if self.config.verbose:
+                logger.info(f"SemanticEngine: Generated {len(self._text_features)} text features")
+
+        # Get LLM suggestions for general features (excluding text columns)
+        if self.config.verbose:
+            logger.info("SemanticEngine: Requesting feature suggestions from LLM...")
+
+        # Filter out text columns for general feature suggestions
+        non_text_column_info = {k: v for k, v in self._column_info.items() if k not in self._text_columns}
+
+        if non_text_column_info:
+            try:
+                self._suggested_features = self._client.suggest_features(
+                    column_info=non_text_column_info,
+                    task_description=task_description,
+                    column_descriptions=column_descriptions,
+                    domain=self.config.domain,
+                    max_suggestions=self.config.max_suggestions,
+                )
+            except Exception as e:
+                if self.config.verbose:
+                    logger.warning(f"SemanticEngine: Could not get LLM suggestions: {e}")
+                self._suggested_features = []
+        else:
+            self._suggested_features = []
 
         if self.config.verbose:
-            print(f"SemanticEngine: Received {len(self._suggested_features)} suggestions")
+            logger.info(f"SemanticEngine: Received {len(self._suggested_features)} suggestions")
 
         # Validate features if enabled
         if self.config.validate_features:
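The text-column heuristic added to `fit()` is simple: an object column counts as free text when its mean string length exceeds 20 characters and it has more than 10 distinct values. Restated standalone with the same thresholds as the hunk above:

```python
import pandas as pd

def looks_like_text(s: pd.Series) -> bool:
    # Mirrors fit(): object dtype, mean length > 20, cardinality > 10
    if s.dtype != "object":
        return False
    return bool(s.str.len().mean() > 20 and s.nunique() > 10)

df = pd.DataFrame({
    "city": ["NY", "LA", "SF"] * 10,  # short, repetitive -> not text
    "review": [f"This product review number {i} is quite detailed." for i in range(30)],
})
print([c for c in df.columns if looks_like_text(df[c])])  # ['review']
```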
@@ -160,6 +247,198 @@
         self._is_fitted = True
         return self
 
+    def _generate_text_features(self, X: pd.DataFrame) -> list[dict[str, Any]]:
+        """
+        Generate ML-ready numerical features from text columns using LLM suggestions.
+
+        This is the key differentiator - LLM suggests Python code to transform text
+        into numerical features that can be used by ML models.
+        """
+        text_features = []
+
+        for col in self._text_columns:
+            # Always add fallback features first (don't require LLM)
+            fallback_features = self._get_fallback_text_features(col)
+            text_features.extend(fallback_features)
+
+            # Try to get LLM-suggested features (optional)
+            try:
+                col_desc = self._column_descriptions.get(col, f"Text column: {col}")
+
+                # Use suggest_features instead of send_prompt for better compatibility
+                response = self._client.suggest_features(
+                    column_info={col: "string"},
+                    task_description=f"Extract numerical features from text column '{col}' for {self._task_description}",
+                    column_descriptions={col: col_desc},
+                    domain=self.config.domain,
+                    max_suggestions=5,
+                )
+
+                # Response is already parsed as list of features
+                for f in response:
+                    f["source_columns"] = [col]
+                    f["is_text_feature"] = True
+                    text_features.append(f)
+
+            except Exception as e:
+                if self.config.verbose:
+                    logger.warning(f"SemanticEngine: Could not get LLM suggestions for '{col}': {e}")
+
+        return text_features
+
+    def _build_text_feature_prompt(self, col: str, samples: list[str], description: str) -> str:
+        """Build prompt for text feature generation."""
+        return f"""You are an expert data scientist. Generate Python code to extract NUMERICAL features from text data.
+
+## Text Column
+Name: {col}
+Description: {description}
+
+## Sample Values
+{chr(10).join([f'- "{str(s)[:200]}..."' if len(str(s)) > 200 else f'- "{s}"' for s in samples[:5]])}
+
+## Task
+{self._task_description}
+
+## Requirements
+Generate features that transform text into NUMERICAL values suitable for ML models:
+1. Sentiment scores (positive/negative/neutral)
+2. Readability metrics (Flesch score, word complexity)
+3. Linguistic features (noun ratio, verb ratio, sentence count)
+4. Pattern detection (contains numbers, URLs, emails)
+5. Domain-specific indicators
+
+## Output Format
+Return JSON with "features" array:
+{{
+    "features": [
+        {{
+            "name": "{col}_sentiment_score",
+            "code": "result = df['{col}'].apply(lambda x: len([w for w in str(x).lower().split() if w in ['good','great','excellent','best']]) - len([w for w in str(x).lower().split() if w in ['bad','poor','worst','terrible']]))",
+            "explanation": "Simple sentiment score based on positive/negative word counts"
+        }}
+    ]
+}}
+
+Return ONLY the JSON object, no other text. Generate 5-10 useful features."""
+
+    def _parse_text_features(self, response: str, col: str) -> list[dict[str, Any]]:
+        """Parse text features from LLM response."""
+        import json
+        import re
+
+        try:
+            response = response.strip()
+            if response.startswith("```"):
+                lines = response.split("\n")
+                response = "\n".join(lines[1:-1])
+
+            data = json.loads(response)
+            features = data.get("features", [])
+
+            # Add source column info
+            for f in features:
+                f["source_columns"] = [col]
+                f["is_text_feature"] = True
+
+            return features
+
+        except json.JSONDecodeError:
+            json_match = re.search(r"\{.*\}", response, re.DOTALL)
+            if json_match:
+                try:
+                    data = json.loads(json_match.group())
+                    features = data.get("features", [])
+                    for f in features:
+                        f["source_columns"] = [col]
+                        f["is_text_feature"] = True
+                    return features
+                except json.JSONDecodeError:
+                    pass
+            return []
+
+    def _get_fallback_text_features(self, col: str) -> list[dict[str, Any]]:
+        """Generate fallback text features that don't require LLM."""
+        return [
+            {
+                "name": f"{col}_char_length",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.len()",
+                "explanation": "Character length of text",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_word_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.split().str.len()",
+                "explanation": "Word count in text",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_avg_word_length",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: np.mean([len(w) for w in x.split()] or [0]))",
+                "explanation": "Average word length",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_sentence_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'[.!?]+')",
+                "explanation": "Number of sentences (approximate)",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_uppercase_ratio",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: sum(1 for c in x if c.isupper()) / max(len(x), 1))",
+                "explanation": "Ratio of uppercase characters",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_digit_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'\\d')",
+                "explanation": "Count of digits in text",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_special_char_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'[^a-zA-Z0-9\\s]')",
+                "explanation": "Count of special characters",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_unique_word_ratio",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: len(set(x.lower().split())) / max(len(x.split()), 1))",
+                "explanation": "Ratio of unique words to total words",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_exclamation_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count('!')",
+                "explanation": "Count of exclamation marks (indicates emphasis/emotion)",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_question_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'\\?')",
+                "explanation": "Count of question marks",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_caps_word_ratio",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: sum(1 for w in x.split() if w.isupper()) / max(len(x.split()), 1))",
+                "explanation": "Ratio of all-caps words (indicates shouting/emphasis)",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+        ]
+
     def _validate_suggestions(self, X: pd.DataFrame) -> None:
         """Validate suggested feature code."""
         valid_features = []
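Each fallback feature stores a pandas one-liner as a string, later exec'd with `df` in scope and the output read back from `result`. The same convention, demonstrated outside the engine on the `{col}_exclamation_count` snippet from the list above:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"comment": ["Great product!!", None, "ok"]})
# Snippet shape matches _get_fallback_text_features: assign to `result`, read `df`
code = "result = df['comment'].fillna('').astype(str).str.count('!')"

scope = {"df": df, "np": np, "pd": pd}
exec(code, scope)
df["comment_exclamation_count"] = scope["result"]
print(df["comment_exclamation_count"].tolist())  # [2, 0, 0]
```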
@@ -175,19 +454,33 @@
             if result["valid"]:
                 valid_features.append(feature)
             elif self.config.verbose:
-                print(
+                logger.warning(
                     f"SemanticEngine: Invalid feature '{feature.get('name', 'unknown')}': {result.get('error', 'unknown error')}"
                 )
 
         self._suggested_features = valid_features
 
         if self.config.verbose:
-            print(f"SemanticEngine: {len(valid_features)} valid features after validation")
+            logger.info(f"SemanticEngine: {len(valid_features)} valid features after validation")
 
     def _build_feature_set(self) -> None:
         """Build FeatureSet from suggestions."""
         self._feature_set = FeatureSet()
 
+        # Add text features
+        for suggestion in self._text_features:
+            feature = Feature(
+                name=suggestion.get("name", f"text_feature_{len(self._feature_set)}"),
+                dtype=FeatureType.NUMERIC,
+                origin=FeatureOrigin.LLM_GENERATED,
+                source_columns=suggestion.get("source_columns", []),
+                transformation="text_to_numeric",
+                explanation=suggestion.get("explanation", ""),
+                code=suggestion.get("code", ""),
+            )
+            self._feature_set.add(feature)
+
+        # Add general features
         for suggestion in self._suggested_features:
             feature = Feature(
                 name=suggestion.get("name", f"llm_feature_{len(self._feature_set)}"),
@@ -212,7 +505,7 @@
         Returns
         -------
         X_features : DataFrame
-            Data with generated features
+            Data with generated features (text columns are dropped when keep_text_columns=False)
         """
         if not self._is_fitted:
             raise RuntimeError("Engine must be fitted before transform")
@@ -222,6 +515,52 @@
 
         successful_features = []
 
+        # Apply text features first
+        for suggestion in self._text_features:
+            name = suggestion.get("name", "")
+            code = suggestion.get("code", "")
+
+            if not code:
+                continue
+
+            try:
+                local_vars = {"df": result, "np": np, "pd": pd}
+                exec(
+                    code,
+                    {
+                        "__builtins__": {
+                            "len": len,
+                            "sum": sum,
+                            "max": max,
+                            "min": min,
+                            "abs": abs,
+                            "round": round,
+                            "int": int,
+                            "float": float,
+                            "str": str,
+                            "list": list,
+                            "dict": dict,
+                            "set": set,
+                        },
+                        "np": np,
+                        "pd": pd,
+                    },
+                    local_vars,
+                )
+
+                if "result" in local_vars:
+                    feature_values = local_vars["result"]
+                    if isinstance(feature_values, pd.Series):
+                        result[name] = feature_values.values
+                    else:
+                        result[name] = feature_values
+                    successful_features.append(name)
+
+            except Exception as e:
+                if self.config.verbose:
+                    logger.error(f"SemanticEngine: Error computing text feature '{name}': {e}")
+
+        # Apply general features
         for suggestion in self._suggested_features:
             name = suggestion.get("name", "")
             code = suggestion.get("code", "")
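`transform()` runs each generated snippet under a whitelisted `__builtins__` mapping, so the code can call `len` or `str` but not `open` or `__import__`. The containment pattern reduced to its core below (a sketch, not the engine's exact whitelist). Note this guards against accidents rather than adversarial code: exec'd Python can still reach real builtins through attribute chains.

```python
import numpy as np
import pandas as pd

ALLOWED = {"len": len, "str": str, "max": max}  # subset of the engine's whitelist
df = pd.DataFrame({"text": ["abc", "defgh"]})

ok = "result = df['text'].apply(lambda x: len(str(x)))"
bad = "result = open('/etc/passwd').read()"  # `open` is not whitelisted

for code in (ok, bad):
    local_vars = {"df": df, "np": np, "pd": pd}
    try:
        exec(code, {"__builtins__": ALLOWED, "np": np, "pd": pd}, local_vars)
        print("ok:", local_vars["result"].tolist())   # ok: [3, 5]
    except Exception as e:
        print("blocked:", type(e).__name__)           # blocked: NameError
```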
@@ -266,15 +605,23 @@
 
             except Exception as e:
                 if self.config.verbose:
-                    print(f"SemanticEngine: Error computing '{name}': {e}")
+                    logger.error(f"SemanticEngine: Error computing '{name}': {e}")
 
         # Handle infinities and NaNs
         result = result.replace([np.inf, -np.inf], np.nan)
 
+        # Optionally drop original text columns (only if not keeping them for downstream models)
+        if not self.config.keep_text_columns:
+            cols_to_drop = [col for col in self._text_columns if col in result.columns]
+            if cols_to_drop:
+                result = result.drop(columns=cols_to_drop)
+                if self.config.verbose:
+                    logger.info(f"SemanticEngine: Dropped {len(cols_to_drop)} text columns, keeping numerical features")
+
        self._feature_names = successful_features
 
        if self.config.verbose:
-            print(f"SemanticEngine: Successfully generated {len(successful_features)} features")
+            logger.info(f"SemanticEngine: Successfully generated {len(successful_features)} features")
 
        return result
 
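With the default `keep_text_columns=True`, raw text survives next to the derived numerics; with `False` (reachable via the constructor's `**kwargs`, as noted earlier), `transform()` returns a frame without the detected text columns. A hedged call-pattern sketch, not run against a live backend:

```python
# X, y: your training DataFrame and target; a working LLM backend is assumed.
engine = SemanticEngine(enable_text_features=True, keep_text_columns=False)
X_num = engine.fit_transform(X, y, task_description="Predict churn")

# With keep_text_columns=False, none of the detected text columns remain:
assert not set(engine._text_columns) & set(X_num.columns)
```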
@@ -370,6 +717,350 @@
         """Get the feature set with metadata."""
         return self._feature_set
 
+    def standardize_categories(
+        self,
+        df: pd.DataFrame,
+        column: str,
+        target_categories: Optional[list[str]] = None,
+        similarity_threshold: float = 0.8,
+        max_categories: int = 50,
+        context: Optional[str] = None,
+    ) -> dict[str, str]:
+        """
+        Use LLM to standardize similar category values in a column.
+
+        Identifies semantically similar values (e.g., "software engineer", "Software Engineer",
+        "SDE") and maps them to a canonical form.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame containing the column to standardize
+        column : str
+            Name of the categorical column to standardize
+        target_categories : list[str], optional
+            If provided, map values to these specific categories.
+            If None, LLM will infer appropriate canonical forms.
+        similarity_threshold : float, default=0.8
+            Minimum similarity for grouping (hint for LLM, not strictly enforced)
+        max_categories : int, default=50
+            Maximum number of unique values to process (for efficiency)
+        context : str, optional
+            Additional context about the data domain (e.g., "job titles in tech industry")
+
+        Returns
+        -------
+        mapping : dict[str, str]
+            Dictionary mapping original values to standardized values.
+            Only includes values that need transformation.
+
+        Examples
+        --------
+        >>> engine = SemanticEngine()
+        >>> mapping = engine.standardize_categories(
+        ...     df,
+        ...     column="job_title",
+        ...     context="job titles in software industry"
+        ... )
+        >>> print(mapping)
+        {'software engineer': 'Software Engineer', 'SDE': 'Software Engineer',
+         'Sr. SWE': 'Senior Software Engineer', 'data scientist': 'Data Scientist'}
+
+        >>> # Apply the mapping
+        >>> df_clean = engine.apply_category_mapping(df, "job_title", mapping)
+        """
+        if column not in df.columns:
+            raise ValueError(f"Column '{column}' not found in DataFrame")
+
+        self._ensure_client()
+
+        # Get unique values (excluding NaN)
+        unique_values = df[column].dropna().unique().tolist()
+
+        # Convert to strings and filter
+        unique_values = [str(v) for v in unique_values if v is not None and str(v).strip()]
+        unique_values = list(set(unique_values))  # Remove duplicates after string conversion
+
+        if len(unique_values) == 0:
+            if self.config.verbose:
+                logger.info(f"SemanticEngine: No valid values found in column '{column}'")
+            return {}
+
+        if len(unique_values) > max_categories:
+            if self.config.verbose:
+                logger.warning(
+                    f"SemanticEngine: Column '{column}' has {len(unique_values)} unique values, "
+                    f"truncating to {max_categories} most frequent"
+                )
+            # Get most frequent values
+            value_counts = df[column].value_counts().head(max_categories)
+            unique_values = [str(v) for v in value_counts.index.tolist()]
+
+        # Build and send prompt
+        prompt = self._build_category_standardization_prompt(
+            column=column,
+            unique_values=unique_values,
+            target_categories=target_categories,
+            context=context,
+            similarity_threshold=similarity_threshold,
+        )
+
+        try:
+            # Use the client's send_prompt method if available, otherwise use suggest_features
+            if hasattr(self._client, "send_prompt"):
+                response = self._client.send_prompt(prompt)
+            else:
+                # Fallback: use suggest_features with a specialized task
+                response_list = self._client.suggest_features(
+                    column_info={column: "categorical"},
+                    task_description=prompt,
+                    column_descriptions={column: context or "Categorical column to standardize"},
+                    domain=self.config.domain,
+                    max_suggestions=1,
+                )
+                # Extract mapping from response if possible
+                if response_list and isinstance(response_list, list) and len(response_list) > 0:
+                    first = response_list[0]
+                    if isinstance(first, dict) and "mapping" in first:
+                        return first["mapping"]
+                    response = str(first)
+                else:
+                    response = str(response_list)
+
+            mapping = self._parse_category_mapping(response, unique_values)
+
+            if self.config.verbose:
+                logger.info(f"SemanticEngine: Created mapping for {len(mapping)} values in column '{column}'")
+
+            return mapping
+
+        except Exception as e:
+            if self.config.verbose:
+                logger.error(f"SemanticEngine: Error standardizing categories: {e}")
+            return {}
+
+    def _build_category_standardization_prompt(
+        self,
+        column: str,
+        unique_values: list[str],
+        target_categories: Optional[list[str]] = None,
+        context: Optional[str] = None,
+        similarity_threshold: float = 0.8,
+    ) -> str:
+        """Build prompt for category standardization."""
+        values_str = "\n".join([f'- "{v}"' for v in unique_values[:100]])
+
+        target_str = ""
+        if target_categories:
+            target_str = f"""
+## Target Categories (map values to these)
+{chr(10).join([f'- "{c}"' for c in target_categories])}
+"""
+
+        context_str = f"\n## Context\n{context}" if context else ""
+
+        return f"""You are an expert data scientist specializing in data cleaning and standardization.
+
+## Task
+Analyze the following categorical values from column "{column}" and identify semantically similar values that should be standardized to a common form.
+
+## Unique Values in Column
+{values_str}
+{target_str}{context_str}
+
+## Requirements
+1. Identify values that represent the same concept (case variations, abbreviations, typos, synonyms)
+2. Map similar values to a single canonical/standardized form
+3. Use proper capitalization for the standardized form (e.g., "Software Engineer" not "software engineer")
+4. Common patterns to look for:
+   - Case variations: "Software Engineer" vs "software engineer" vs "SOFTWARE ENGINEER"
+   - Abbreviations: "SDE" vs "Software Development Engineer", "Sr." vs "Senior"
+   - Typos: "Enginer" vs "Engineer"
+   - Synonyms: "Developer" vs "Programmer" vs "Software Engineer"
+   - Formatting: "Data-Scientist" vs "Data Scientist" vs "DataScientist"
+5. Only include values that need mapping (exclude already-standardized values)
+6. Preserve values that are already properly formatted or don't have similar alternatives
+
+## Output Format
+Return ONLY a valid JSON object with this structure:
+{{
+    "mapping": {{
+        "original_value_1": "Standardized Value",
+        "original_value_2": "Standardized Value",
+        "typo_value": "Corrected Value"
+    }},
+    "groups": [
+        {{
+            "canonical": "Software Engineer",
+            "members": ["software engineer", "SDE", "Software Dev", "SW Engineer"]
+        }}
+    ]
+}}
+
+Return ONLY the JSON object, no markdown formatting, no explanation text."""
+
+    def _parse_category_mapping(
+        self,
+        response: str,
+        original_values: list[str],
+    ) -> dict[str, str]:
+        """Parse category mapping from LLM response."""
+        import json
+        import re
+
+        try:
+            # Clean response
+            response = response.strip()
+
+            # Remove markdown code blocks if present
+            if response.startswith("```"):
+                lines = response.split("\n")
+                # Find the JSON content between ``` markers
+                start_idx = 1 if lines[0].startswith("```") else 0
+                end_idx = len(lines)
+                for i, line in enumerate(lines[1:], 1):
+                    if line.strip() == "```":
+                        end_idx = i
+                        break
+                response = "\n".join(lines[start_idx:end_idx])
+
+            # Try to parse as JSON
+            data = json.loads(response)
+
+            # Extract mapping from response
+            if isinstance(data, dict):
+                if "mapping" in data:
+                    mapping = data["mapping"]
+                elif "groups" in data:
+                    # Build mapping from groups
+                    mapping = {}
+                    for group in data["groups"]:
+                        canonical = group.get("canonical", "")
+                        members = group.get("members", [])
+                        for member in members:
+                            if member != canonical:
+                                mapping[member] = canonical
+                else:
+                    # Assume the entire dict is the mapping
+                    mapping = data
+            else:
+                mapping = {}
+
+            # Validate mapping - only keep mappings for values that exist
+            original_set = set(original_values)
+            original_lower = {v.lower(): v for v in original_values}
+
+            validated_mapping = {}
+            for orig, standardized in mapping.items():
+                # Check exact match or case-insensitive match
+                if orig in original_set:
+                    validated_mapping[orig] = standardized
+                elif orig.lower() in original_lower:
+                    actual_orig = original_lower[orig.lower()]
+                    validated_mapping[actual_orig] = standardized
+
+            return validated_mapping
+
+        except json.JSONDecodeError:
+            # Try to extract JSON from response
+            json_match = re.search(r"\{[\s\S]*\}", response)
+            if json_match:
+                try:
+                    return self._parse_category_mapping(json_match.group(), original_values)
+                except Exception:
+                    pass
+
+            if self.config.verbose:
+                logger.warning("SemanticEngine: Could not parse category mapping response")
+            return {}
+
+    def apply_category_mapping(
+        self,
+        df: pd.DataFrame,
+        column: str,
+        mapping: dict[str, str],
+        inplace: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Apply a category mapping to standardize values in a DataFrame column.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame
+        column : str
+            Column to transform
+        mapping : dict[str, str]
+            Mapping from original values to standardized values
+        inplace : bool, default=False
+            If True, modify DataFrame in place
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with standardized column values
+        """
+        if column not in df.columns:
+            raise ValueError(f"Column '{column}' not found in DataFrame")
+
+        if not inplace:
+            df = df.copy()
+
+        # Apply mapping, keeping original values for unmapped entries
+        df[column] = df[column].apply(lambda x: mapping.get(str(x), x) if pd.notna(x) else x)
+
+        if self.config.verbose:
+            logger.info(f"SemanticEngine: Applied mapping to column '{column}'")
+
+        return df
+
+    def standardize_multiple_columns(
+        self,
+        df: pd.DataFrame,
+        columns: list[str],
+        contexts: Optional[dict[str, str]] = None,
+        **kwargs,
+    ) -> tuple[pd.DataFrame, dict[str, dict[str, str]]]:
+        """
+        Standardize multiple categorical columns at once.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame
+        columns : list[str]
+            List of column names to standardize
+        contexts : dict[str, str], optional
+            Context descriptions for each column
+        **kwargs
+            Additional arguments passed to standardize_categories
+
+        Returns
+        -------
+        df_clean : DataFrame
+            DataFrame with standardized columns
+        all_mappings : dict[str, dict[str, str]]
+            Dictionary of mappings for each column
+        """
+        contexts = contexts or {}
+        all_mappings = {}
+        result_df = df.copy()
+
+        for col in columns:
+            if col not in df.columns:
+                if self.config.verbose:
+                    logger.warning(f"SemanticEngine: Column '{col}' not found, skipping")
+                continue
+
+            context = contexts.get(col)
+            mapping = self.standardize_categories(result_df, col, context=context, **kwargs)
+            all_mappings[col] = mapping
+
+            if mapping:
+                result_df = self.apply_category_mapping(result_df, col, mapping)
+
+        return result_df, all_mappings
+
     def __del__(self):
         """Clean up client on deletion."""
         if self._client:
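Taken together, the category-standardization additions compose as below. A sketch assembled from the docstrings in the final hunk; the returned mappings are illustrative, since they depend on the LLM response:

```python
import pandas as pd
from featcopilot.semantic.engine import SemanticEngine  # assumed path, as above

df = pd.DataFrame({
    "job_title": ["software engineer", "SDE", "Data Scientist", "data scientist"],
    "department": ["R&D", "r&d", "Sales", "sales"],
})

engine = SemanticEngine(backend="litellm", model="gpt-4o")
df_clean, mappings = engine.standardize_multiple_columns(
    df,
    columns=["job_title", "department"],
    contexts={"job_title": "job titles in the software industry"},
)
# mappings might look like (illustrative only):
# {"job_title": {"software engineer": "Software Engineer", "SDE": "Software Engineer",
#                "data scientist": "Data Scientist"},
#  "department": {"r&d": "R&D", "sales": "Sales"}}
```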