featcopilot 0.2.0-py3-none-any.whl → 0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,14 @@ class SemanticEngineConfig(EngineConfig):
     backend: Literal["copilot", "litellm"] = Field(default="copilot", description="LLM backend to use")
     api_key: Optional[str] = Field(default=None, description="API key for litellm backend")
     api_base: Optional[str] = Field(default=None, description="Custom API base URL for litellm")
+    enable_text_features: bool = Field(default=True, description="Generate ML features from text columns")
+    keep_text_columns: bool = Field(
+        default=True, description="Keep original text columns (for models that handle them natively)"
+    )
+    text_feature_types: list[str] = Field(
+        default_factory=lambda: ["sentiment", "readability", "linguistic", "semantic"],
+        description="Types of text features to generate",
+    )
 
 
 class SemanticEngine(BaseEngine):
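
The three new config fields above gate the 0.3.0 text-feature pipeline. A minimal sketch of setting them directly, assuming SemanticEngineConfig is importable (its module path is not shown in this diff) and that the remaining EngineConfig fields have defaults:

```python
# Sketch: the new 0.3.0 fields, set directly on the config.
# The import path for SemanticEngineConfig is an assumption.
config = SemanticEngineConfig(
    backend="litellm",
    api_key="sk-...",                                 # litellm credentials
    enable_text_features=True,                        # derive numeric features from text
    keep_text_columns=False,                          # drop raw text after extraction
    text_feature_types=["sentiment", "readability"],  # subset of the default four
)
```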
@@ -100,6 +108,8 @@ class SemanticEngine(BaseEngine):
         backend: Literal["copilot", "litellm"] = "copilot",
         api_key: Optional[str] = None,
         api_base: Optional[str] = None,
+        enable_text_features: bool = True,
+        text_feature_types: Optional[list[str]] = None,
         **kwargs,
     ):
         config = SemanticEngineConfig(
@@ -111,16 +121,20 @@ class SemanticEngine(BaseEngine):
             backend=backend,
             api_key=api_key,
             api_base=api_base,
+            enable_text_features=enable_text_features,
+            text_feature_types=text_feature_types or ["sentiment", "readability", "linguistic", "semantic"],
             **kwargs,
         )
         super().__init__(config=config)
         self.config: SemanticEngineConfig = config
         self._client: Optional[Any] = None
         self._suggested_features: list[dict[str, Any]] = []
+        self._text_features: list[dict[str, Any]] = []
         self._feature_set = FeatureSet()
         self._column_info: dict[str, str] = {}
         self._column_descriptions: dict[str, str] = {}
         self._task_description: str = ""
+        self._text_columns: list[str] = []
 
     def _ensure_client(self) -> None:
         """Ensure LLM client is initialized."""
@@ -172,29 +186,53 @@ class SemanticEngine(BaseEngine):
         self._column_descriptions = column_descriptions or {}
         self._task_description = task_description
 
-        # Build column info
+        # Build column info and detect text columns
         self._column_info = {}
+        self._text_columns = []
         for col in X.columns:
             dtype = str(X[col].dtype)
             if X[col].dtype == "object":
                 dtype = "string"
+                # Detect if it's a text column (long strings with high variance)
+                if X[col].str.len().mean() > 20 and X[col].nunique() > 10:
+                    self._text_columns.append(col)
             elif np.issubdtype(X[col].dtype, np.integer):
                 dtype = "integer"
             elif np.issubdtype(X[col].dtype, np.floating):
                 dtype = "float"
             self._column_info[col] = dtype
 
-        # Get LLM suggestions
+        if self.config.verbose:
+            logger.info(f"SemanticEngine: Detected {len(self._text_columns)} text columns: {self._text_columns}")
+
+        # Generate text-specific features if enabled
+        if self.config.enable_text_features and self._text_columns:
+            self._text_features = self._generate_text_features(X)
+            if self.config.verbose:
+                logger.info(f"SemanticEngine: Generated {len(self._text_features)} text features")
+
+        # Get LLM suggestions for general features (excluding text columns)
         if self.config.verbose:
             logger.info("SemanticEngine: Requesting feature suggestions from LLM...")
 
-        self._suggested_features = self._client.suggest_features(
-            column_info=self._column_info,
-            task_description=task_description,
-            column_descriptions=column_descriptions,
-            domain=self.config.domain,
-            max_suggestions=self.config.max_suggestions,
-        )
+        # Filter out text columns for general feature suggestions
+        non_text_column_info = {k: v for k, v in self._column_info.items() if k not in self._text_columns}
+
+        if non_text_column_info:
+            try:
+                self._suggested_features = self._client.suggest_features(
+                    column_info=non_text_column_info,
+                    task_description=task_description,
+                    column_descriptions=column_descriptions,
+                    domain=self.config.domain,
+                    max_suggestions=self.config.max_suggestions,
+                )
+            except Exception as e:
+                if self.config.verbose:
+                    logger.warning(f"SemanticEngine: Could not get LLM suggestions: {e}")
+                self._suggested_features = []
+        else:
+            self._suggested_features = []
 
         if self.config.verbose:
             logger.info(f"SemanticEngine: Received {len(self._suggested_features)} suggestions")
@@ -209,6 +247,198 @@ class SemanticEngine(BaseEngine):
         self._is_fitted = True
         return self
 
+    def _generate_text_features(self, X: pd.DataFrame) -> list[dict[str, Any]]:
+        """
+        Generate ML-ready numerical features from text columns using LLM suggestions.
+
+        This is the key differentiator - LLM suggests Python code to transform text
+        into numerical features that can be used by ML models.
+        """
+        text_features = []
+
+        for col in self._text_columns:
+            # Always add fallback features first (don't require LLM)
+            fallback_features = self._get_fallback_text_features(col)
+            text_features.extend(fallback_features)
+
+            # Try to get LLM-suggested features (optional)
+            try:
+                col_desc = self._column_descriptions.get(col, f"Text column: {col}")
+
+                # Use suggest_features instead of send_prompt for better compatibility
+                response = self._client.suggest_features(
+                    column_info={col: "string"},
+                    task_description=f"Extract numerical features from text column '{col}' for {self._task_description}",
+                    column_descriptions={col: col_desc},
+                    domain=self.config.domain,
+                    max_suggestions=5,
+                )
+
+                # Response is already parsed as list of features
+                for f in response:
+                    f["source_columns"] = [col]
+                    f["is_text_feature"] = True
+                    text_features.append(f)
+
+            except Exception as e:
+                if self.config.verbose:
+                    logger.warning(f"SemanticEngine: Could not get LLM suggestions for '{col}': {e}")
+
+        return text_features
+
+    def _build_text_feature_prompt(self, col: str, samples: list[str], description: str) -> str:
+        """Build prompt for text feature generation."""
+        return f"""You are an expert data scientist. Generate Python code to extract NUMERICAL features from text data.
+
+## Text Column
+Name: {col}
+Description: {description}
+
+## Sample Values
+{chr(10).join([f'- "{s[:200]}..."' if len(str(s)) > 200 else f'- "{s}"' for s in samples[:5]])}
+
+## Task
+{self._task_description}
+
+## Requirements
+Generate features that transform text into NUMERICAL values suitable for ML models:
+1. Sentiment scores (positive/negative/neutral)
+2. Readability metrics (Flesch score, word complexity)
+3. Linguistic features (noun ratio, verb ratio, sentence count)
+4. Pattern detection (contains numbers, URLs, emails)
+5. Domain-specific indicators
+
+## Output Format
+Return JSON with "features" array:
+{{
+  "features": [
+    {{
+      "name": "{col}_sentiment_score",
+      "code": "result = df['{col}'].apply(lambda x: len([w for w in str(x).lower().split() if w in ['good','great','excellent','best']]) - len([w for w in str(x).lower().split() if w in ['bad','poor','worst','terrible']]))",
+      "explanation": "Simple sentiment score based on positive/negative word counts"
+    }}
+  ]
+}}
+
+Return ONLY the JSON object, no other text. Generate 5-10 useful features."""
+
+    def _parse_text_features(self, response: str, col: str) -> list[dict[str, Any]]:
+        """Parse text features from LLM response."""
+        import json
+        import re
+
+        try:
+            response = response.strip()
+            if response.startswith("```"):
+                lines = response.split("\n")
+                response = "\n".join(lines[1:-1])
+
+            data = json.loads(response)
+            features = data.get("features", [])
+
+            # Add source column info
+            for f in features:
+                f["source_columns"] = [col]
+                f["is_text_feature"] = True
+
+            return features
+
+        except json.JSONDecodeError:
+            json_match = re.search(r"\{.*\}", response, re.DOTALL)
+            if json_match:
+                try:
+                    data = json.loads(json_match.group())
+                    features = data.get("features", [])
+                    for f in features:
+                        f["source_columns"] = [col]
+                        f["is_text_feature"] = True
+                    return features
+                except json.JSONDecodeError:
+                    pass
+            return []
+
+    def _get_fallback_text_features(self, col: str) -> list[dict[str, Any]]:
+        """Generate fallback text features that don't require LLM."""
+        return [
+            {
+                "name": f"{col}_char_length",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.len()",
+                "explanation": "Character length of text",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_word_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.split().str.len()",
+                "explanation": "Word count in text",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_avg_word_length",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: np.mean([len(w) for w in x.split()] or [0]))",
+                "explanation": "Average word length",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_sentence_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'[.!?]+')",
+                "explanation": "Number of sentences (approximate)",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_uppercase_ratio",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: sum(1 for c in x if c.isupper()) / max(len(x), 1))",
+                "explanation": "Ratio of uppercase characters",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_digit_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'\\d')",
+                "explanation": "Count of digits in text",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_special_char_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'[^a-zA-Z0-9\\s]')",
+                "explanation": "Count of special characters",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_unique_word_ratio",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: len(set(x.lower().split())) / max(len(x.split()), 1))",
+                "explanation": "Ratio of unique words to total words",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_exclamation_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count('!')",
+                "explanation": "Count of exclamation marks (indicates emphasis/emotion)",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_question_count",
+                "code": f"result = df['{col}'].fillna('').astype(str).str.count(r'\\?')",
+                "explanation": "Count of question marks",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+            {
+                "name": f"{col}_caps_word_ratio",
+                "code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: sum(1 for w in x.split() if w.isupper()) / max(len(x.split()), 1))",
+                "explanation": "Ratio of all-caps words (indicates shouting/emphasis)",
+                "source_columns": [col],
+                "is_text_feature": True,
+            },
+        ]
+
     def _validate_suggestions(self, X: pd.DataFrame) -> None:
         """Validate suggested feature code."""
         valid_features = []
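
Each entry above is a dict whose code string assigns a pandas Series to a variable named result, given df in scope. A self-contained sketch of evaluating one such spec (using plain exec; transform() below uses a restricted-builtins variant):

```python
import numpy as np
import pandas as pd

# Sketch: evaluating one fallback feature spec outside the engine.
# The dict mirrors the "_word_count" entry from _get_fallback_text_features above.
df = pd.DataFrame({"review": ["Great product, works well", None, "Bad"]})
spec = {
    "name": "review_word_count",
    "code": "result = df['review'].fillna('').astype(str).str.split().str.len()",
}
scope = {"df": df, "np": np, "pd": pd}
exec(spec["code"], scope)  # the engine sandboxes this more tightly
df[spec["name"]] = scope["result"]
print(df["review_word_count"].tolist())  # [4, 0, 1]
```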
@@ -237,6 +467,20 @@ class SemanticEngine(BaseEngine):
         """Build FeatureSet from suggestions."""
         self._feature_set = FeatureSet()
 
+        # Add text features
+        for suggestion in self._text_features:
+            feature = Feature(
+                name=suggestion.get("name", f"text_feature_{len(self._feature_set)}"),
+                dtype=FeatureType.NUMERIC,
+                origin=FeatureOrigin.LLM_GENERATED,
+                source_columns=suggestion.get("source_columns", []),
+                transformation="text_to_numeric",
+                explanation=suggestion.get("explanation", ""),
+                code=suggestion.get("code", ""),
+            )
+            self._feature_set.add(feature)
+
+        # Add general features
         for suggestion in self._suggested_features:
             feature = Feature(
                 name=suggestion.get("name", f"llm_feature_{len(self._feature_set)}"),
@@ -261,7 +505,7 @@ class SemanticEngine(BaseEngine):
         Returns
         -------
         X_features : DataFrame
-            Data with generated features
+            Data with generated features (numerical only, text columns dropped)
         """
         if not self._is_fitted:
             raise RuntimeError("Engine must be fitted before transform")
@@ -271,6 +515,52 @@ class SemanticEngine(BaseEngine):
 
         successful_features = []
 
+        # Apply text features first
+        for suggestion in self._text_features:
+            name = suggestion.get("name", "")
+            code = suggestion.get("code", "")
+
+            if not code:
+                continue
+
+            try:
+                local_vars = {"df": result, "np": np, "pd": pd}
+                exec(
+                    code,
+                    {
+                        "__builtins__": {
+                            "len": len,
+                            "sum": sum,
+                            "max": max,
+                            "min": min,
+                            "abs": abs,
+                            "round": round,
+                            "int": int,
+                            "float": float,
+                            "str": str,
+                            "list": list,
+                            "dict": dict,
+                            "set": set,
+                        },
+                        "np": np,
+                        "pd": pd,
+                    },
+                    local_vars,
+                )
+
+                if "result" in local_vars:
+                    feature_values = local_vars["result"]
+                    if isinstance(feature_values, pd.Series):
+                        result[name] = feature_values.values
+                    else:
+                        result[name] = feature_values
+                    successful_features.append(name)
+
+            except Exception as e:
+                if self.config.verbose:
+                    logger.error(f"SemanticEngine: Error computing text feature '{name}': {e}")
+
+        # Apply general features
         for suggestion in self._suggested_features:
             name = suggestion.get("name", "")
             code = suggestion.get("code", "")
@@ -320,6 +610,14 @@ class SemanticEngine(BaseEngine):
         # Handle infinities and NaNs
         result = result.replace([np.inf, -np.inf], np.nan)
 
+        # Optionally drop original text columns (only if not keeping them for downstream models)
+        if not self.config.keep_text_columns:
+            cols_to_drop = [col for col in self._text_columns if col in result.columns]
+            if cols_to_drop:
+                result = result.drop(columns=cols_to_drop)
+                if self.config.verbose:
+                    logger.info(f"SemanticEngine: Dropped {len(cols_to_drop)} text columns, keeping numerical features")
+
         self._feature_names = successful_features
 
         if self.config.verbose:
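
End to end, keep_text_columns decides whether raw text survives transform(). A hedged sketch (keep_text_columns is assumed to reach the config through the constructor's **kwargs per the earlier hunk, the fit() signature is inferred from its hunk, and df is illustrative):

```python
# Sketch: drop raw text columns after deriving numeric features.
engine = SemanticEngine(enable_text_features=True, keep_text_columns=False)
engine.fit(df, task_description="predict customer churn from support tickets")
X_features = engine.transform(df)
# Expected: derived columns like 'review_char_length', 'review_word_count', ...
# are present, while the original 'review' text column is dropped.
```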
@@ -419,6 +717,350 @@ class SemanticEngine(BaseEngine):
         """Get the feature set with metadata."""
         return self._feature_set
 
+    def standardize_categories(
+        self,
+        df: pd.DataFrame,
+        column: str,
+        target_categories: Optional[list[str]] = None,
+        similarity_threshold: float = 0.8,
+        max_categories: int = 50,
+        context: Optional[str] = None,
+    ) -> dict[str, str]:
+        """
+        Use LLM to standardize similar category values in a column.
+
+        Identifies semantically similar values (e.g., "software engineer", "Software Engineer",
+        "SDE") and maps them to a canonical form.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame containing the column to standardize
+        column : str
+            Name of the categorical column to standardize
+        target_categories : list[str], optional
+            If provided, map values to these specific categories.
+            If None, LLM will infer appropriate canonical forms.
+        similarity_threshold : float, default=0.8
+            Minimum similarity for grouping (hint for LLM, not strictly enforced)
+        max_categories : int, default=50
+            Maximum number of unique values to process (for efficiency)
+        context : str, optional
+            Additional context about the data domain (e.g., "job titles in tech industry")
+
+        Returns
+        -------
+        mapping : dict[str, str]
+            Dictionary mapping original values to standardized values.
+            Only includes values that need transformation.
+
+        Examples
+        --------
+        >>> engine = SemanticEngine()
+        >>> mapping = engine.standardize_categories(
+        ...     df,
+        ...     column="job_title",
+        ...     context="job titles in software industry"
+        ... )
+        >>> print(mapping)
+        {'software engineer': 'Software Engineer', 'SDE': 'Software Engineer',
+         'Sr. SWE': 'Senior Software Engineer', 'data scientist': 'Data Scientist'}
+
+        >>> # Apply the mapping
+        >>> df_clean = engine.apply_category_mapping(df, "job_title", mapping)
+        """
+        if column not in df.columns:
+            raise ValueError(f"Column '{column}' not found in DataFrame")
+
+        self._ensure_client()
+
+        # Get unique values (excluding NaN)
+        unique_values = df[column].dropna().unique().tolist()
+
+        # Convert to strings and filter
+        unique_values = [str(v) for v in unique_values if v is not None and str(v).strip()]
+        unique_values = list(set(unique_values))  # Remove duplicates after string conversion
+
+        if len(unique_values) == 0:
+            if self.config.verbose:
+                logger.info(f"SemanticEngine: No valid values found in column '{column}'")
+            return {}
+
+        if len(unique_values) > max_categories:
+            if self.config.verbose:
+                logger.warning(
+                    f"SemanticEngine: Column '{column}' has {len(unique_values)} unique values, "
+                    f"truncating to {max_categories} most frequent"
+                )
+            # Get most frequent values
+            value_counts = df[column].value_counts().head(max_categories)
+            unique_values = [str(v) for v in value_counts.index.tolist()]
+
+        # Build and send prompt
+        prompt = self._build_category_standardization_prompt(
+            column=column,
+            unique_values=unique_values,
+            target_categories=target_categories,
+            context=context,
+            similarity_threshold=similarity_threshold,
+        )
+
+        try:
+            # Use the client's send_prompt method if available, otherwise use suggest_features
+            if hasattr(self._client, "send_prompt"):
+                response = self._client.send_prompt(prompt)
+            else:
+                # Fallback: use suggest_features with a specialized task
+                response_list = self._client.suggest_features(
+                    column_info={column: "categorical"},
+                    task_description=prompt,
+                    column_descriptions={column: context or "Categorical column to standardize"},
+                    domain=self.config.domain,
+                    max_suggestions=1,
+                )
+                # Extract mapping from response if possible
+                if response_list and isinstance(response_list, list) and len(response_list) > 0:
+                    first = response_list[0]
+                    if isinstance(first, dict) and "mapping" in first:
+                        return first["mapping"]
+                    response = str(first)
+                else:
+                    response = str(response_list)
+
+            mapping = self._parse_category_mapping(response, unique_values)
+
+            if self.config.verbose:
+                logger.info(f"SemanticEngine: Created mapping for {len(mapping)} values in column '{column}'")
+
+            return mapping
+
+        except Exception as e:
+            if self.config.verbose:
+                logger.error(f"SemanticEngine: Error standardizing categories: {e}")
+            return {}
+
+    def _build_category_standardization_prompt(
+        self,
+        column: str,
+        unique_values: list[str],
+        target_categories: Optional[list[str]] = None,
+        context: Optional[str] = None,
+        similarity_threshold: float = 0.8,
+    ) -> str:
+        """Build prompt for category standardization."""
+        values_str = "\n".join([f'- "{v}"' for v in unique_values[:100]])
+
+        target_str = ""
+        if target_categories:
+            target_str = f"""
+## Target Categories (map values to these)
+{chr(10).join([f'- "{c}"' for c in target_categories])}
+"""
+
+        context_str = f"\n## Context\n{context}" if context else ""
+
+        return f"""You are an expert data scientist specializing in data cleaning and standardization.
+
+## Task
+Analyze the following categorical values from column "{column}" and identify semantically similar values that should be standardized to a common form.
+
+## Unique Values in Column
+{values_str}
+{target_str}{context_str}
+
+## Requirements
+1. Identify values that represent the same concept (case variations, abbreviations, typos, synonyms)
+2. Map similar values to a single canonical/standardized form
+3. Use proper capitalization for the standardized form (e.g., "Software Engineer" not "software engineer")
+4. Common patterns to look for:
+   - Case variations: "Software Engineer" vs "software engineer" vs "SOFTWARE ENGINEER"
+   - Abbreviations: "SDE" vs "Software Development Engineer", "Sr." vs "Senior"
+   - Typos: "Enginer" vs "Engineer"
+   - Synonyms: "Developer" vs "Programmer" vs "Software Engineer"
+   - Formatting: "Data-Scientist" vs "Data Scientist" vs "DataScientist"
+5. Only include values that need mapping (exclude already-standardized values)
+6. Preserve values that are already properly formatted or don't have similar alternatives
+
+## Output Format
+Return ONLY a valid JSON object with this structure:
+{{
+  "mapping": {{
+    "original_value_1": "Standardized Value",
+    "original_value_2": "Standardized Value",
+    "typo_value": "Corrected Value"
+  }},
+  "groups": [
+    {{
+      "canonical": "Software Engineer",
+      "members": ["software engineer", "SDE", "Software Dev", "SW Engineer"]
+    }}
+  ]
+}}
+
+Return ONLY the JSON object, no markdown formatting, no explanation text."""
+
+    def _parse_category_mapping(
+        self,
+        response: str,
+        original_values: list[str],
+    ) -> dict[str, str]:
+        """Parse category mapping from LLM response."""
+        import json
+        import re
+
+        try:
+            # Clean response
+            response = response.strip()
+
+            # Remove markdown code blocks if present
+            if response.startswith("```"):
+                lines = response.split("\n")
+                # Find the JSON content between ``` markers
+                start_idx = 1 if lines[0].startswith("```") else 0
+                end_idx = len(lines)
+                for i, line in enumerate(lines[1:], 1):
+                    if line.strip() == "```":
+                        end_idx = i
+                        break
+                response = "\n".join(lines[start_idx:end_idx])
+
+            # Try to parse as JSON
+            data = json.loads(response)
+
+            # Extract mapping from response
+            if isinstance(data, dict):
+                if "mapping" in data:
+                    mapping = data["mapping"]
+                elif "groups" in data:
+                    # Build mapping from groups
+                    mapping = {}
+                    for group in data["groups"]:
+                        canonical = group.get("canonical", "")
+                        members = group.get("members", [])
+                        for member in members:
+                            if member != canonical:
+                                mapping[member] = canonical
+                else:
+                    # Assume the entire dict is the mapping
+                    mapping = data
+            else:
+                mapping = {}
+
+            # Validate mapping - only keep mappings for values that exist
+            original_set = set(original_values)
+            original_lower = {v.lower(): v for v in original_values}
+
+            validated_mapping = {}
+            for orig, standardized in mapping.items():
+                # Check exact match or case-insensitive match
+                if orig in original_set:
+                    validated_mapping[orig] = standardized
+                elif orig.lower() in original_lower:
+                    actual_orig = original_lower[orig.lower()]
+                    validated_mapping[actual_orig] = standardized
+
+            return validated_mapping
+
+        except json.JSONDecodeError:
+            # Try to extract JSON from response
+            json_match = re.search(r"\{[\s\S]*\}", response)
+            if json_match:
+                try:
+                    return self._parse_category_mapping(json_match.group(), original_values)
+                except Exception:
+                    pass
+
+            if self.config.verbose:
+                logger.warning("SemanticEngine: Could not parse category mapping response")
+            return {}
+
+    def apply_category_mapping(
+        self,
+        df: pd.DataFrame,
+        column: str,
+        mapping: dict[str, str],
+        inplace: bool = False,
+    ) -> pd.DataFrame:
+        """
+        Apply a category mapping to standardize values in a DataFrame column.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame
+        column : str
+            Column to transform
+        mapping : dict[str, str]
+            Mapping from original values to standardized values
+        inplace : bool, default=False
+            If True, modify DataFrame in place
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with standardized column values
+        """
+        if column not in df.columns:
+            raise ValueError(f"Column '{column}' not found in DataFrame")
+
+        if not inplace:
+            df = df.copy()
+
+        # Apply mapping, keeping original values for unmapped entries
+        df[column] = df[column].apply(lambda x: mapping.get(str(x), x) if pd.notna(x) else x)
+
+        if self.config.verbose:
+            logger.info(f"SemanticEngine: Applied mapping to column '{column}'")
+
+        return df
+
+    def standardize_multiple_columns(
+        self,
+        df: pd.DataFrame,
+        columns: list[str],
+        contexts: Optional[dict[str, str]] = None,
+        **kwargs,
+    ) -> tuple[pd.DataFrame, dict[str, dict[str, str]]]:
+        """
+        Standardize multiple categorical columns at once.
+
+        Parameters
+        ----------
+        df : DataFrame
+            Input DataFrame
+        columns : list[str]
+            List of column names to standardize
+        contexts : dict[str, str], optional
+            Context descriptions for each column
+        **kwargs
+            Additional arguments passed to standardize_categories
+
+        Returns
+        -------
+        df_clean : DataFrame
+            DataFrame with standardized columns
+        all_mappings : dict[str, dict[str, str]]
+            Dictionary of mappings for each column
+        """
+        contexts = contexts or {}
+        all_mappings = {}
+        result_df = df.copy()
+
+        for col in columns:
+            if col not in df.columns:
+                if self.config.verbose:
+                    logger.warning(f"SemanticEngine: Column '{col}' not found, skipping")
+                continue
+
+            context = contexts.get(col)
+            mapping = self.standardize_categories(result_df, col, context=context, **kwargs)
+            all_mappings[col] = mapping
+
+            if mapping:
+                result_df = self.apply_category_mapping(result_df, col, mapping)
+
+        return result_df, all_mappings
+
     def __del__(self):
         """Clean up client on deletion."""
         if self._client: