featcopilot 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- featcopilot/__init__.py +10 -1
- featcopilot/core/__init__.py +2 -0
- featcopilot/core/feature.py +5 -1
- featcopilot/core/transform_rule.py +276 -0
- featcopilot/engines/relational.py +5 -2
- featcopilot/engines/tabular.py +151 -5
- featcopilot/engines/text.py +352 -11
- featcopilot/engines/timeseries.py +235 -3
- featcopilot/llm/__init__.py +6 -1
- featcopilot/llm/code_generator.py +7 -4
- featcopilot/llm/copilot_client.py +97 -20
- featcopilot/llm/explainer.py +6 -3
- featcopilot/llm/litellm_client.py +595 -0
- featcopilot/llm/semantic_engine.py +717 -26
- featcopilot/llm/transform_rule_generator.py +403 -0
- featcopilot/selection/importance.py +40 -9
- featcopilot/selection/redundancy.py +39 -10
- featcopilot/selection/statistical.py +107 -34
- featcopilot/selection/unified.py +57 -3
- featcopilot/stores/__init__.py +17 -0
- featcopilot/stores/base.py +166 -0
- featcopilot/stores/feast_store.py +541 -0
- featcopilot/stores/rule_store.py +343 -0
- featcopilot/transformers/sklearn_compat.py +18 -6
- featcopilot/utils/__init__.py +14 -0
- featcopilot/utils/logger.py +47 -0
- featcopilot/utils/models.py +287 -0
- featcopilot/utils/parallel.py +5 -1
- {featcopilot-0.1.0.dist-info → featcopilot-0.3.0.dist-info}/METADATA +56 -25
- featcopilot-0.3.0.dist-info/RECORD +38 -0
- featcopilot-0.1.0.dist-info/RECORD +0 -29
- {featcopilot-0.1.0.dist-info → featcopilot-0.3.0.dist-info}/WHEEL +0 -0
- {featcopilot-0.1.0.dist-info → featcopilot-0.3.0.dist-info}/top_level.txt +0 -0
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
Uses contextual understanding of data to generate meaningful features.
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
from typing import Any, Optional, Union
|
|
6
|
+
from typing import Any, Literal, Optional, Union
|
|
7
7
|
|
|
8
8
|
import numpy as np
|
|
9
9
|
import pandas as pd
|
|
@@ -11,25 +11,38 @@ from pydantic import Field
|
|
|
11
11
|
|
|
12
12
|
from featcopilot.core.base import BaseEngine, EngineConfig
|
|
13
13
|
from featcopilot.core.feature import Feature, FeatureOrigin, FeatureSet, FeatureType
|
|
14
|
-
from featcopilot.
|
|
14
|
+
from featcopilot.utils.logger import get_logger
|
|
15
|
+
|
|
16
|
+
logger = get_logger(__name__)
|
|
15
17
|
|
|
16
18
|
|
|
17
19
|
class SemanticEngineConfig(EngineConfig):
|
|
18
20
|
"""Configuration for semantic feature engine."""
|
|
19
21
|
|
|
20
22
|
name: str = "SemanticEngine"
|
|
21
|
-
model: str = Field(default="gpt-5", description="LLM model to use")
|
|
23
|
+
model: str = Field(default="gpt-5.2", description="LLM model to use")
|
|
22
24
|
max_suggestions: int = Field(default=20, description="Max features to suggest")
|
|
23
25
|
validate_features: bool = Field(default=True, description="Validate generated code")
|
|
24
26
|
domain: Optional[str] = Field(default=None, description="Domain context")
|
|
25
27
|
temperature: float = Field(default=0.3, description="LLM temperature")
|
|
28
|
+
backend: Literal["copilot", "litellm"] = Field(default="copilot", description="LLM backend to use")
|
|
29
|
+
api_key: Optional[str] = Field(default=None, description="API key for litellm backend")
|
|
30
|
+
api_base: Optional[str] = Field(default=None, description="Custom API base URL for litellm")
|
|
31
|
+
enable_text_features: bool = Field(default=True, description="Generate ML features from text columns")
|
|
32
|
+
keep_text_columns: bool = Field(
|
|
33
|
+
default=True, description="Keep original text columns (for models that handle them natively)"
|
|
34
|
+
)
|
|
35
|
+
text_feature_types: list[str] = Field(
|
|
36
|
+
default_factory=lambda: ["sentiment", "readability", "linguistic", "semantic"],
|
|
37
|
+
description="Types of text features to generate",
|
|
38
|
+
)
|
|
26
39
|
|
|
27
40
|
|
|
28
41
|
class SemanticEngine(BaseEngine):
|
|
29
42
|
"""
|
|
30
43
|
LLM-powered semantic feature engineering engine.
|
|
31
44
|
|
|
32
|
-
Uses GitHub Copilot SDK to:
|
|
45
|
+
Uses GitHub Copilot SDK or LiteLLM to:
|
|
33
46
|
- Understand column semantics from names and descriptions
|
|
34
47
|
- Generate domain-aware features
|
|
35
48
|
- Create interpretable features with explanations
|
|
@@ -39,7 +52,7 @@ class SemanticEngine(BaseEngine):
|
|
|
39
52
|
|
|
40
53
|
Parameters
|
|
41
54
|
----------
|
|
42
|
-
model : str, default='gpt-5'
|
|
55
|
+
model : str, default='gpt-5.2'
|
|
43
56
|
LLM model to use
|
|
44
57
|
max_suggestions : int, default=20
|
|
45
58
|
Maximum number of features to suggest
|
|
@@ -47,24 +60,56 @@ class SemanticEngine(BaseEngine):
|
|
|
47
60
|
Whether to validate generated feature code
|
|
48
61
|
domain : str, optional
|
|
49
62
|
Domain context (e.g., 'healthcare', 'finance', 'retail')
|
|
63
|
+
backend : str, default='copilot'
|
|
64
|
+
LLM backend to use: 'copilot' or 'litellm'
|
|
65
|
+
api_key : str, optional
|
|
66
|
+
API key for litellm backend (uses environment variable if not provided)
|
|
67
|
+
api_base : str, optional
|
|
68
|
+
Custom API base URL for litellm backend (for self-hosted models)
|
|
50
69
|
|
|
51
70
|
Examples
|
|
52
71
|
--------
|
|
53
|
-
|
|
72
|
+
Using GitHub Copilot SDK (default):
|
|
73
|
+
>>> engine = SemanticEngine(model='gpt-5.2', domain='healthcare')
|
|
54
74
|
>>> X_features = engine.fit_transform(
|
|
55
75
|
... X, y,
|
|
56
76
|
... column_descriptions={'age': 'Patient age', 'bmi': 'Body mass index'},
|
|
57
77
|
... task_description='Predict diabetes risk'
|
|
58
78
|
... )
|
|
79
|
+
|
|
80
|
+
Using LiteLLM with OpenAI:
|
|
81
|
+
>>> engine = SemanticEngine(
|
|
82
|
+
... model='gpt-4o',
|
|
83
|
+
... backend='litellm',
|
|
84
|
+
... api_key='your-api-key' # or set OPENAI_API_KEY env var
|
|
85
|
+
... )
|
|
86
|
+
|
|
87
|
+
Using LiteLLM with Anthropic:
|
|
88
|
+
>>> engine = SemanticEngine(
|
|
89
|
+
... model='claude-3-opus',
|
|
90
|
+
... backend='litellm'
|
|
91
|
+
... )
|
|
92
|
+
|
|
93
|
+
Using LiteLLM with local Ollama:
|
|
94
|
+
>>> engine = SemanticEngine(
|
|
95
|
+
... model='ollama/llama2',
|
|
96
|
+
... backend='litellm',
|
|
97
|
+
... api_base='http://localhost:11434'
|
|
98
|
+
... )
|
|
59
99
|
"""
|
|
60
100
|
|
|
61
101
|
def __init__(
|
|
62
102
|
self,
|
|
63
|
-
model: str = "gpt-5",
|
|
103
|
+
model: str = "gpt-5.2",
|
|
64
104
|
max_suggestions: int = 20,
|
|
65
105
|
validate_features: bool = True,
|
|
66
106
|
domain: Optional[str] = None,
|
|
67
107
|
verbose: bool = False,
|
|
108
|
+
backend: Literal["copilot", "litellm"] = "copilot",
|
|
109
|
+
api_key: Optional[str] = None,
|
|
110
|
+
api_base: Optional[str] = None,
|
|
111
|
+
enable_text_features: bool = True,
|
|
112
|
+
text_feature_types: Optional[list[str]] = None,
|
|
68
113
|
**kwargs,
|
|
69
114
|
):
|
|
70
115
|
config = SemanticEngineConfig(
|
|
@@ -73,21 +118,39 @@ class SemanticEngine(BaseEngine):
|
|
|
73
118
|
validate_features=validate_features,
|
|
74
119
|
domain=domain,
|
|
75
120
|
verbose=verbose,
|
|
121
|
+
backend=backend,
|
|
122
|
+
api_key=api_key,
|
|
123
|
+
api_base=api_base,
|
|
124
|
+
enable_text_features=enable_text_features,
|
|
125
|
+
text_feature_types=text_feature_types or ["sentiment", "readability", "linguistic", "semantic"],
|
|
76
126
|
**kwargs,
|
|
77
127
|
)
|
|
78
128
|
super().__init__(config=config)
|
|
79
129
|
self.config: SemanticEngineConfig = config
|
|
80
|
-
self._client: Optional[
|
|
130
|
+
self._client: Optional[Any] = None
|
|
81
131
|
self._suggested_features: list[dict[str, Any]] = []
|
|
132
|
+
self._text_features: list[dict[str, Any]] = []
|
|
82
133
|
self._feature_set = FeatureSet()
|
|
83
134
|
self._column_info: dict[str, str] = {}
|
|
84
135
|
self._column_descriptions: dict[str, str] = {}
|
|
85
136
|
self._task_description: str = ""
|
|
137
|
+
self._text_columns: list[str] = []
|
|
86
138
|
|
|
87
139
|
def _ensure_client(self) -> None:
|
|
88
|
-
"""Ensure
|
|
140
|
+
"""Ensure LLM client is initialized."""
|
|
89
141
|
if self._client is None:
|
|
90
|
-
|
|
142
|
+
if self.config.backend == "litellm":
|
|
143
|
+
from featcopilot.llm.litellm_client import SyncLiteLLMFeatureClient
|
|
144
|
+
|
|
145
|
+
self._client = SyncLiteLLMFeatureClient(
|
|
146
|
+
model=self.config.model,
|
|
147
|
+
api_key=self.config.api_key,
|
|
148
|
+
api_base=self.config.api_base,
|
|
149
|
+
)
|
|
150
|
+
else:
|
|
151
|
+
from featcopilot.llm.copilot_client import SyncCopilotFeatureClient
|
|
152
|
+
|
|
153
|
+
self._client = SyncCopilotFeatureClient(model=self.config.model)
|
|
91
154
|
self._client.start()
|
|
92
155
|
|
|
93
156
|
def fit(
|
|
@@ -123,32 +186,56 @@ class SemanticEngine(BaseEngine):
|
|
|
123
186
|
self._column_descriptions = column_descriptions or {}
|
|
124
187
|
self._task_description = task_description
|
|
125
188
|
|
|
126
|
-
# Build column info
|
|
189
|
+
# Build column info and detect text columns
|
|
127
190
|
self._column_info = {}
|
|
191
|
+
self._text_columns = []
|
|
128
192
|
for col in X.columns:
|
|
129
193
|
dtype = str(X[col].dtype)
|
|
130
194
|
if X[col].dtype == "object":
|
|
131
195
|
dtype = "string"
|
|
196
|
+
# Detect if it's a text column (long strings with high variance)
|
|
197
|
+
if X[col].str.len().mean() > 20 and X[col].nunique() > 10:
|
|
198
|
+
self._text_columns.append(col)
|
|
132
199
|
elif np.issubdtype(X[col].dtype, np.integer):
|
|
133
200
|
dtype = "integer"
|
|
134
201
|
elif np.issubdtype(X[col].dtype, np.floating):
|
|
135
202
|
dtype = "float"
|
|
136
203
|
self._column_info[col] = dtype
|
|
137
204
|
|
|
138
|
-
# Get LLM suggestions
|
|
139
205
|
if self.config.verbose:
|
|
140
|
-
|
|
206
|
+
logger.info(f"SemanticEngine: Detected {len(self._text_columns)} text columns: {self._text_columns}")
|
|
141
207
|
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
)
|
|
208
|
+
# Generate text-specific features if enabled
|
|
209
|
+
if self.config.enable_text_features and self._text_columns:
|
|
210
|
+
self._text_features = self._generate_text_features(X)
|
|
211
|
+
if self.config.verbose:
|
|
212
|
+
logger.info(f"SemanticEngine: Generated {len(self._text_features)} text features")
|
|
213
|
+
|
|
214
|
+
# Get LLM suggestions for general features (excluding text columns)
|
|
215
|
+
if self.config.verbose:
|
|
216
|
+
logger.info("SemanticEngine: Requesting feature suggestions from LLM...")
|
|
217
|
+
|
|
218
|
+
# Filter out text columns for general feature suggestions
|
|
219
|
+
non_text_column_info = {k: v for k, v in self._column_info.items() if k not in self._text_columns}
|
|
220
|
+
|
|
221
|
+
if non_text_column_info:
|
|
222
|
+
try:
|
|
223
|
+
self._suggested_features = self._client.suggest_features(
|
|
224
|
+
column_info=non_text_column_info,
|
|
225
|
+
task_description=task_description,
|
|
226
|
+
column_descriptions=column_descriptions,
|
|
227
|
+
domain=self.config.domain,
|
|
228
|
+
max_suggestions=self.config.max_suggestions,
|
|
229
|
+
)
|
|
230
|
+
except Exception as e:
|
|
231
|
+
if self.config.verbose:
|
|
232
|
+
logger.warning(f"SemanticEngine: Could not get LLM suggestions: {e}")
|
|
233
|
+
self._suggested_features = []
|
|
234
|
+
else:
|
|
235
|
+
self._suggested_features = []
|
|
149
236
|
|
|
150
237
|
if self.config.verbose:
|
|
151
|
-
|
|
238
|
+
logger.info(f"SemanticEngine: Received {len(self._suggested_features)} suggestions")
|
|
152
239
|
|
|
153
240
|
# Validate features if enabled
|
|
154
241
|
if self.config.validate_features:
|
|
@@ -160,6 +247,198 @@ class SemanticEngine(BaseEngine):
|
|
|
160
247
|
self._is_fitted = True
|
|
161
248
|
return self
|
|
162
249
|
|
|
250
|
+
def _generate_text_features(self, X: pd.DataFrame) -> list[dict[str, Any]]:
|
|
251
|
+
"""
|
|
252
|
+
Generate ML-ready numerical features from text columns using LLM suggestions.
|
|
253
|
+
|
|
254
|
+
This is the key differentiator - LLM suggests Python code to transform text
|
|
255
|
+
into numerical features that can be used by ML models.
|
|
256
|
+
"""
|
|
257
|
+
text_features = []
|
|
258
|
+
|
|
259
|
+
for col in self._text_columns:
|
|
260
|
+
# Always add fallback features first (don't require LLM)
|
|
261
|
+
fallback_features = self._get_fallback_text_features(col)
|
|
262
|
+
text_features.extend(fallback_features)
|
|
263
|
+
|
|
264
|
+
# Try to get LLM-suggested features (optional)
|
|
265
|
+
try:
|
|
266
|
+
col_desc = self._column_descriptions.get(col, f"Text column: {col}")
|
|
267
|
+
|
|
268
|
+
# Use suggest_features instead of send_prompt for better compatibility
|
|
269
|
+
response = self._client.suggest_features(
|
|
270
|
+
column_info={col: "string"},
|
|
271
|
+
task_description=f"Extract numerical features from text column '{col}' for {self._task_description}",
|
|
272
|
+
column_descriptions={col: col_desc},
|
|
273
|
+
domain=self.config.domain,
|
|
274
|
+
max_suggestions=5,
|
|
275
|
+
)
|
|
276
|
+
|
|
277
|
+
# Response is already parsed as list of features
|
|
278
|
+
for f in response:
|
|
279
|
+
f["source_columns"] = [col]
|
|
280
|
+
f["is_text_feature"] = True
|
|
281
|
+
text_features.append(f)
|
|
282
|
+
|
|
283
|
+
except Exception as e:
|
|
284
|
+
if self.config.verbose:
|
|
285
|
+
logger.warning(f"SemanticEngine: Could not get LLM suggestions for '{col}': {e}")
|
|
286
|
+
|
|
287
|
+
return text_features
|
|
288
|
+
|
|
289
|
+
def _build_text_feature_prompt(self, col: str, samples: list[str], description: str) -> str:
|
|
290
|
+
"""Build prompt for text feature generation."""
|
|
291
|
+
return f"""You are an expert data scientist. Generate Python code to extract NUMERICAL features from text data.
|
|
292
|
+
|
|
293
|
+
## Text Column
|
|
294
|
+
Name: {col}
|
|
295
|
+
Description: {description}
|
|
296
|
+
|
|
297
|
+
## Sample Values
|
|
298
|
+
{chr(10).join([f'- "{s[:200]}..."' if len(str(s)) > 200 else f'- "{s}"' for s in samples[:5]])}
|
|
299
|
+
|
|
300
|
+
## Task
|
|
301
|
+
{self._task_description}
|
|
302
|
+
|
|
303
|
+
## Requirements
|
|
304
|
+
Generate features that transform text into NUMERICAL values suitable for ML models:
|
|
305
|
+
1. Sentiment scores (positive/negative/neutral)
|
|
306
|
+
2. Readability metrics (Flesch score, word complexity)
|
|
307
|
+
3. Linguistic features (noun ratio, verb ratio, sentence count)
|
|
308
|
+
4. Pattern detection (contains numbers, URLs, emails)
|
|
309
|
+
5. Domain-specific indicators
|
|
310
|
+
|
|
311
|
+
## Output Format
|
|
312
|
+
Return JSON with "features" array:
|
|
313
|
+
{{
|
|
314
|
+
"features": [
|
|
315
|
+
{{
|
|
316
|
+
"name": "{col}_sentiment_score",
|
|
317
|
+
"code": "result = df['{col}'].apply(lambda x: len([w for w in str(x).lower().split() if w in ['good','great','excellent','best']]) - len([w for w in str(x).lower().split() if w in ['bad','poor','worst','terrible']]))",
|
|
318
|
+
"explanation": "Simple sentiment score based on positive/negative word counts"
|
|
319
|
+
}}
|
|
320
|
+
]
|
|
321
|
+
}}
|
|
322
|
+
|
|
323
|
+
Return ONLY the JSON object, no other text. Generate 5-10 useful features."""
|
|
324
|
+
|
|
325
|
+
def _parse_text_features(self, response: str, col: str) -> list[dict[str, Any]]:
|
|
326
|
+
"""Parse text features from LLM response."""
|
|
327
|
+
import json
|
|
328
|
+
import re
|
|
329
|
+
|
|
330
|
+
try:
|
|
331
|
+
response = response.strip()
|
|
332
|
+
if response.startswith("```"):
|
|
333
|
+
lines = response.split("\n")
|
|
334
|
+
response = "\n".join(lines[1:-1])
|
|
335
|
+
|
|
336
|
+
data = json.loads(response)
|
|
337
|
+
features = data.get("features", [])
|
|
338
|
+
|
|
339
|
+
# Add source column info
|
|
340
|
+
for f in features:
|
|
341
|
+
f["source_columns"] = [col]
|
|
342
|
+
f["is_text_feature"] = True
|
|
343
|
+
|
|
344
|
+
return features
|
|
345
|
+
|
|
346
|
+
except json.JSONDecodeError:
|
|
347
|
+
json_match = re.search(r"\{.*\}", response, re.DOTALL)
|
|
348
|
+
if json_match:
|
|
349
|
+
try:
|
|
350
|
+
data = json.loads(json_match.group())
|
|
351
|
+
features = data.get("features", [])
|
|
352
|
+
for f in features:
|
|
353
|
+
f["source_columns"] = [col]
|
|
354
|
+
f["is_text_feature"] = True
|
|
355
|
+
return features
|
|
356
|
+
except json.JSONDecodeError:
|
|
357
|
+
pass
|
|
358
|
+
return []
|
|
359
|
+
|
|
360
|
+
def _get_fallback_text_features(self, col: str) -> list[dict[str, Any]]:
|
|
361
|
+
"""Generate fallback text features that don't require LLM."""
|
|
362
|
+
return [
|
|
363
|
+
{
|
|
364
|
+
"name": f"{col}_char_length",
|
|
365
|
+
"code": f"result = df['{col}'].fillna('').astype(str).str.len()",
|
|
366
|
+
"explanation": "Character length of text",
|
|
367
|
+
"source_columns": [col],
|
|
368
|
+
"is_text_feature": True,
|
|
369
|
+
},
|
|
370
|
+
{
|
|
371
|
+
"name": f"{col}_word_count",
|
|
372
|
+
"code": f"result = df['{col}'].fillna('').astype(str).str.split().str.len()",
|
|
373
|
+
"explanation": "Word count in text",
|
|
374
|
+
"source_columns": [col],
|
|
375
|
+
"is_text_feature": True,
|
|
376
|
+
},
|
|
377
|
+
{
|
|
378
|
+
"name": f"{col}_avg_word_length",
|
|
379
|
+
"code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: np.mean([len(w) for w in x.split()] or [0]))",
|
|
380
|
+
"explanation": "Average word length",
|
|
381
|
+
"source_columns": [col],
|
|
382
|
+
"is_text_feature": True,
|
|
383
|
+
},
|
|
384
|
+
{
|
|
385
|
+
"name": f"{col}_sentence_count",
|
|
386
|
+
"code": f"result = df['{col}'].fillna('').astype(str).str.count(r'[.!?]+')",
|
|
387
|
+
"explanation": "Number of sentences (approximate)",
|
|
388
|
+
"source_columns": [col],
|
|
389
|
+
"is_text_feature": True,
|
|
390
|
+
},
|
|
391
|
+
{
|
|
392
|
+
"name": f"{col}_uppercase_ratio",
|
|
393
|
+
"code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: sum(1 for c in x if c.isupper()) / max(len(x), 1))",
|
|
394
|
+
"explanation": "Ratio of uppercase characters",
|
|
395
|
+
"source_columns": [col],
|
|
396
|
+
"is_text_feature": True,
|
|
397
|
+
},
|
|
398
|
+
{
|
|
399
|
+
"name": f"{col}_digit_count",
|
|
400
|
+
"code": f"result = df['{col}'].fillna('').astype(str).str.count(r'\\d')",
|
|
401
|
+
"explanation": "Count of digits in text",
|
|
402
|
+
"source_columns": [col],
|
|
403
|
+
"is_text_feature": True,
|
|
404
|
+
},
|
|
405
|
+
{
|
|
406
|
+
"name": f"{col}_special_char_count",
|
|
407
|
+
"code": f"result = df['{col}'].fillna('').astype(str).str.count(r'[^a-zA-Z0-9\\s]')",
|
|
408
|
+
"explanation": "Count of special characters",
|
|
409
|
+
"source_columns": [col],
|
|
410
|
+
"is_text_feature": True,
|
|
411
|
+
},
|
|
412
|
+
{
|
|
413
|
+
"name": f"{col}_unique_word_ratio",
|
|
414
|
+
"code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: len(set(x.lower().split())) / max(len(x.split()), 1))",
|
|
415
|
+
"explanation": "Ratio of unique words to total words",
|
|
416
|
+
"source_columns": [col],
|
|
417
|
+
"is_text_feature": True,
|
|
418
|
+
},
|
|
419
|
+
{
|
|
420
|
+
"name": f"{col}_exclamation_count",
|
|
421
|
+
"code": f"result = df['{col}'].fillna('').astype(str).str.count('!')",
|
|
422
|
+
"explanation": "Count of exclamation marks (indicates emphasis/emotion)",
|
|
423
|
+
"source_columns": [col],
|
|
424
|
+
"is_text_feature": True,
|
|
425
|
+
},
|
|
426
|
+
{
|
|
427
|
+
"name": f"{col}_question_count",
|
|
428
|
+
"code": f"result = df['{col}'].fillna('').astype(str).str.count(r'\\?')",
|
|
429
|
+
"explanation": "Count of question marks",
|
|
430
|
+
"source_columns": [col],
|
|
431
|
+
"is_text_feature": True,
|
|
432
|
+
},
|
|
433
|
+
{
|
|
434
|
+
"name": f"{col}_caps_word_ratio",
|
|
435
|
+
"code": f"result = df['{col}'].fillna('').astype(str).apply(lambda x: sum(1 for w in x.split() if w.isupper()) / max(len(x.split()), 1))",
|
|
436
|
+
"explanation": "Ratio of all-caps words (indicates shouting/emphasis)",
|
|
437
|
+
"source_columns": [col],
|
|
438
|
+
"is_text_feature": True,
|
|
439
|
+
},
|
|
440
|
+
]
|
|
441
|
+
|
|
163
442
|
def _validate_suggestions(self, X: pd.DataFrame) -> None:
|
|
164
443
|
"""Validate suggested feature code."""
|
|
165
444
|
valid_features = []
|
|
@@ -175,19 +454,33 @@ class SemanticEngine(BaseEngine):
|
|
|
175
454
|
if result["valid"]:
|
|
176
455
|
valid_features.append(feature)
|
|
177
456
|
elif self.config.verbose:
|
|
178
|
-
|
|
457
|
+
logger.warning(
|
|
179
458
|
f"SemanticEngine: Invalid feature '{feature.get('name', 'unknown')}': {result.get('error', 'unknown error')}"
|
|
180
459
|
)
|
|
181
460
|
|
|
182
461
|
self._suggested_features = valid_features
|
|
183
462
|
|
|
184
463
|
if self.config.verbose:
|
|
185
|
-
|
|
464
|
+
logger.info(f"SemanticEngine: {len(valid_features)} valid features after validation")
|
|
186
465
|
|
|
187
466
|
def _build_feature_set(self) -> None:
|
|
188
467
|
"""Build FeatureSet from suggestions."""
|
|
189
468
|
self._feature_set = FeatureSet()
|
|
190
469
|
|
|
470
|
+
# Add text features
|
|
471
|
+
for suggestion in self._text_features:
|
|
472
|
+
feature = Feature(
|
|
473
|
+
name=suggestion.get("name", f"text_feature_{len(self._feature_set)}"),
|
|
474
|
+
dtype=FeatureType.NUMERIC,
|
|
475
|
+
origin=FeatureOrigin.LLM_GENERATED,
|
|
476
|
+
source_columns=suggestion.get("source_columns", []),
|
|
477
|
+
transformation="text_to_numeric",
|
|
478
|
+
explanation=suggestion.get("explanation", ""),
|
|
479
|
+
code=suggestion.get("code", ""),
|
|
480
|
+
)
|
|
481
|
+
self._feature_set.add(feature)
|
|
482
|
+
|
|
483
|
+
# Add general features
|
|
191
484
|
for suggestion in self._suggested_features:
|
|
192
485
|
feature = Feature(
|
|
193
486
|
name=suggestion.get("name", f"llm_feature_{len(self._feature_set)}"),
|
|
@@ -212,7 +505,7 @@ class SemanticEngine(BaseEngine):
|
|
|
212
505
|
Returns
|
|
213
506
|
-------
|
|
214
507
|
X_features : DataFrame
|
|
215
|
-
Data with generated features
|
|
508
|
+
Data with generated features (numerical only, text columns dropped)
|
|
216
509
|
"""
|
|
217
510
|
if not self._is_fitted:
|
|
218
511
|
raise RuntimeError("Engine must be fitted before transform")
|
|
@@ -222,6 +515,52 @@ class SemanticEngine(BaseEngine):
|
|
|
222
515
|
|
|
223
516
|
successful_features = []
|
|
224
517
|
|
|
518
|
+
# Apply text features first
|
|
519
|
+
for suggestion in self._text_features:
|
|
520
|
+
name = suggestion.get("name", "")
|
|
521
|
+
code = suggestion.get("code", "")
|
|
522
|
+
|
|
523
|
+
if not code:
|
|
524
|
+
continue
|
|
525
|
+
|
|
526
|
+
try:
|
|
527
|
+
local_vars = {"df": result, "np": np, "pd": pd}
|
|
528
|
+
exec(
|
|
529
|
+
code,
|
|
530
|
+
{
|
|
531
|
+
"__builtins__": {
|
|
532
|
+
"len": len,
|
|
533
|
+
"sum": sum,
|
|
534
|
+
"max": max,
|
|
535
|
+
"min": min,
|
|
536
|
+
"abs": abs,
|
|
537
|
+
"round": round,
|
|
538
|
+
"int": int,
|
|
539
|
+
"float": float,
|
|
540
|
+
"str": str,
|
|
541
|
+
"list": list,
|
|
542
|
+
"dict": dict,
|
|
543
|
+
"set": set,
|
|
544
|
+
},
|
|
545
|
+
"np": np,
|
|
546
|
+
"pd": pd,
|
|
547
|
+
},
|
|
548
|
+
local_vars,
|
|
549
|
+
)
|
|
550
|
+
|
|
551
|
+
if "result" in local_vars:
|
|
552
|
+
feature_values = local_vars["result"]
|
|
553
|
+
if isinstance(feature_values, pd.Series):
|
|
554
|
+
result[name] = feature_values.values
|
|
555
|
+
else:
|
|
556
|
+
result[name] = feature_values
|
|
557
|
+
successful_features.append(name)
|
|
558
|
+
|
|
559
|
+
except Exception as e:
|
|
560
|
+
if self.config.verbose:
|
|
561
|
+
logger.error(f"SemanticEngine: Error computing text feature '{name}': {e}")
|
|
562
|
+
|
|
563
|
+
# Apply general features
|
|
225
564
|
for suggestion in self._suggested_features:
|
|
226
565
|
name = suggestion.get("name", "")
|
|
227
566
|
code = suggestion.get("code", "")
|
|
@@ -266,15 +605,23 @@ class SemanticEngine(BaseEngine):
|
|
|
266
605
|
|
|
267
606
|
except Exception as e:
|
|
268
607
|
if self.config.verbose:
|
|
269
|
-
|
|
608
|
+
logger.error(f"SemanticEngine: Error computing '{name}': {e}")
|
|
270
609
|
|
|
271
610
|
# Handle infinities and NaNs
|
|
272
611
|
result = result.replace([np.inf, -np.inf], np.nan)
|
|
273
612
|
|
|
613
|
+
# Optionally drop original text columns (only if not keeping them for downstream models)
|
|
614
|
+
if not self.config.keep_text_columns:
|
|
615
|
+
cols_to_drop = [col for col in self._text_columns if col in result.columns]
|
|
616
|
+
if cols_to_drop:
|
|
617
|
+
result = result.drop(columns=cols_to_drop)
|
|
618
|
+
if self.config.verbose:
|
|
619
|
+
logger.info(f"SemanticEngine: Dropped {len(cols_to_drop)} text columns, keeping numerical features")
|
|
620
|
+
|
|
274
621
|
self._feature_names = successful_features
|
|
275
622
|
|
|
276
623
|
if self.config.verbose:
|
|
277
|
-
|
|
624
|
+
logger.info(f"SemanticEngine: Successfully generated {len(successful_features)} features")
|
|
278
625
|
|
|
279
626
|
return result
|
|
280
627
|
|
|
@@ -370,6 +717,350 @@ class SemanticEngine(BaseEngine):
|
|
|
370
717
|
"""Get the feature set with metadata."""
|
|
371
718
|
return self._feature_set
|
|
372
719
|
|
|
720
|
+
def standardize_categories(
|
|
721
|
+
self,
|
|
722
|
+
df: pd.DataFrame,
|
|
723
|
+
column: str,
|
|
724
|
+
target_categories: Optional[list[str]] = None,
|
|
725
|
+
similarity_threshold: float = 0.8,
|
|
726
|
+
max_categories: int = 50,
|
|
727
|
+
context: Optional[str] = None,
|
|
728
|
+
) -> dict[str, str]:
|
|
729
|
+
"""
|
|
730
|
+
Use LLM to standardize similar category values in a column.
|
|
731
|
+
|
|
732
|
+
Identifies semantically similar values (e.g., "software engineer", "Software Engineer",
|
|
733
|
+
"SDE") and maps them to a canonical form.
|
|
734
|
+
|
|
735
|
+
Parameters
|
|
736
|
+
----------
|
|
737
|
+
df : DataFrame
|
|
738
|
+
Input DataFrame containing the column to standardize
|
|
739
|
+
column : str
|
|
740
|
+
Name of the categorical column to standardize
|
|
741
|
+
target_categories : list[str], optional
|
|
742
|
+
If provided, map values to these specific categories.
|
|
743
|
+
If None, LLM will infer appropriate canonical forms.
|
|
744
|
+
similarity_threshold : float, default=0.8
|
|
745
|
+
Minimum similarity for grouping (hint for LLM, not strictly enforced)
|
|
746
|
+
max_categories : int, default=50
|
|
747
|
+
Maximum number of unique values to process (for efficiency)
|
|
748
|
+
context : str, optional
|
|
749
|
+
Additional context about the data domain (e.g., "job titles in tech industry")
|
|
750
|
+
|
|
751
|
+
Returns
|
|
752
|
+
-------
|
|
753
|
+
mapping : dict[str, str]
|
|
754
|
+
Dictionary mapping original values to standardized values.
|
|
755
|
+
Only includes values that need transformation.
|
|
756
|
+
|
|
757
|
+
Examples
|
|
758
|
+
--------
|
|
759
|
+
>>> engine = SemanticEngine()
|
|
760
|
+
>>> mapping = engine.standardize_categories(
|
|
761
|
+
... df,
|
|
762
|
+
... column="job_title",
|
|
763
|
+
... context="job titles in software industry"
|
|
764
|
+
... )
|
|
765
|
+
>>> print(mapping)
|
|
766
|
+
{'software engineer': 'Software Engineer', 'SDE': 'Software Engineer',
|
|
767
|
+
'Sr. SWE': 'Senior Software Engineer', 'data scientist': 'Data Scientist'}
|
|
768
|
+
|
|
769
|
+
>>> # Apply the mapping
|
|
770
|
+
>>> df_clean = engine.apply_category_mapping(df, "job_title", mapping)
|
|
771
|
+
"""
|
|
772
|
+
if column not in df.columns:
|
|
773
|
+
raise ValueError(f"Column '{column}' not found in DataFrame")
|
|
774
|
+
|
|
775
|
+
self._ensure_client()
|
|
776
|
+
|
|
777
|
+
# Get unique values (excluding NaN)
|
|
778
|
+
unique_values = df[column].dropna().unique().tolist()
|
|
779
|
+
|
|
780
|
+
# Convert to strings and filter
|
|
781
|
+
unique_values = [str(v) for v in unique_values if v is not None and str(v).strip()]
|
|
782
|
+
unique_values = list(set(unique_values)) # Remove duplicates after string conversion
|
|
783
|
+
|
|
784
|
+
if len(unique_values) == 0:
|
|
785
|
+
if self.config.verbose:
|
|
786
|
+
logger.info(f"SemanticEngine: No valid values found in column '{column}'")
|
|
787
|
+
return {}
|
|
788
|
+
|
|
789
|
+
if len(unique_values) > max_categories:
|
|
790
|
+
if self.config.verbose:
|
|
791
|
+
logger.warning(
|
|
792
|
+
f"SemanticEngine: Column '{column}' has {len(unique_values)} unique values, "
|
|
793
|
+
f"truncating to {max_categories} most frequent"
|
|
794
|
+
)
|
|
795
|
+
# Get most frequent values
|
|
796
|
+
value_counts = df[column].value_counts().head(max_categories)
|
|
797
|
+
unique_values = [str(v) for v in value_counts.index.tolist()]
|
|
798
|
+
|
|
799
|
+
# Build and send prompt
|
|
800
|
+
prompt = self._build_category_standardization_prompt(
|
|
801
|
+
column=column,
|
|
802
|
+
unique_values=unique_values,
|
|
803
|
+
target_categories=target_categories,
|
|
804
|
+
context=context,
|
|
805
|
+
similarity_threshold=similarity_threshold,
|
|
806
|
+
)
|
|
807
|
+
|
|
808
|
+
try:
|
|
809
|
+
# Use the client's send_prompt method if available, otherwise use suggest_features
|
|
810
|
+
if hasattr(self._client, "send_prompt"):
|
|
811
|
+
response = self._client.send_prompt(prompt)
|
|
812
|
+
else:
|
|
813
|
+
# Fallback: use suggest_features with a specialized task
|
|
814
|
+
response_list = self._client.suggest_features(
|
|
815
|
+
column_info={column: "categorical"},
|
|
816
|
+
task_description=prompt,
|
|
817
|
+
column_descriptions={column: context or "Categorical column to standardize"},
|
|
818
|
+
domain=self.config.domain,
|
|
819
|
+
max_suggestions=1,
|
|
820
|
+
)
|
|
821
|
+
# Extract mapping from response if possible
|
|
822
|
+
if response_list and isinstance(response_list, list) and len(response_list) > 0:
|
|
823
|
+
first = response_list[0]
|
|
824
|
+
if isinstance(first, dict) and "mapping" in first:
|
|
825
|
+
return first["mapping"]
|
|
826
|
+
response = str(first)
|
|
827
|
+
else:
|
|
828
|
+
response = str(response_list)
|
|
829
|
+
|
|
830
|
+
mapping = self._parse_category_mapping(response, unique_values)
|
|
831
|
+
|
|
832
|
+
if self.config.verbose:
|
|
833
|
+
logger.info(f"SemanticEngine: Created mapping for {len(mapping)} values in column '{column}'")
|
|
834
|
+
|
|
835
|
+
return mapping
|
|
836
|
+
|
|
837
|
+
except Exception as e:
|
|
838
|
+
if self.config.verbose:
|
|
839
|
+
logger.error(f"SemanticEngine: Error standardizing categories: {e}")
|
|
840
|
+
return {}
|
|
841
|
+
|
|
842
|
+
def _build_category_standardization_prompt(
|
|
843
|
+
self,
|
|
844
|
+
column: str,
|
|
845
|
+
unique_values: list[str],
|
|
846
|
+
target_categories: Optional[list[str]] = None,
|
|
847
|
+
context: Optional[str] = None,
|
|
848
|
+
similarity_threshold: float = 0.8,
|
|
849
|
+
) -> str:
|
|
850
|
+
"""Build prompt for category standardization."""
|
|
851
|
+
values_str = "\n".join([f'- "{v}"' for v in unique_values[:100]])
|
|
852
|
+
|
|
853
|
+
target_str = ""
|
|
854
|
+
if target_categories:
|
|
855
|
+
target_str = f"""
|
|
856
|
+
## Target Categories (map values to these)
|
|
857
|
+
{chr(10).join([f'- "{c}"' for c in target_categories])}
|
|
858
|
+
"""
|
|
859
|
+
|
|
860
|
+
context_str = f"\n## Context\n{context}" if context else ""
|
|
861
|
+
|
|
862
|
+
return f"""You are an expert data scientist specializing in data cleaning and standardization.
|
|
863
|
+
|
|
864
|
+
## Task
|
|
865
|
+
Analyze the following categorical values from column "{column}" and identify semantically similar values that should be standardized to a common form.
|
|
866
|
+
|
|
867
|
+
## Unique Values in Column
|
|
868
|
+
{values_str}
|
|
869
|
+
{target_str}{context_str}
|
|
870
|
+
|
|
871
|
+
## Requirements
|
|
872
|
+
1. Identify values that represent the same concept (case variations, abbreviations, typos, synonyms)
|
|
873
|
+
2. Map similar values to a single canonical/standardized form
|
|
874
|
+
3. Use proper capitalization for the standardized form (e.g., "Software Engineer" not "software engineer")
|
|
875
|
+
4. Common patterns to look for:
|
|
876
|
+
- Case variations: "Software Engineer" vs "software engineer" vs "SOFTWARE ENGINEER"
|
|
877
|
+
- Abbreviations: "SDE" vs "Software Development Engineer", "Sr." vs "Senior"
|
|
878
|
+
- Typos: "Enginer" vs "Engineer"
|
|
879
|
+
- Synonyms: "Developer" vs "Programmer" vs "Software Engineer"
|
|
880
|
+
- Formatting: "Data-Scientist" vs "Data Scientist" vs "DataScientist"
|
|
881
|
+
5. Only include values that need mapping (exclude already-standardized values)
|
|
882
|
+
6. Preserve values that are already properly formatted or don't have similar alternatives
|
|
883
|
+
|
|
884
|
+
## Output Format
|
|
885
|
+
Return ONLY a valid JSON object with this structure:
|
|
886
|
+
{{
|
|
887
|
+
"mapping": {{
|
|
888
|
+
"original_value_1": "Standardized Value",
|
|
889
|
+
"original_value_2": "Standardized Value",
|
|
890
|
+
"typo_value": "Corrected Value"
|
|
891
|
+
}},
|
|
892
|
+
"groups": [
|
|
893
|
+
{{
|
|
894
|
+
"canonical": "Software Engineer",
|
|
895
|
+
"members": ["software engineer", "SDE", "Software Dev", "SW Engineer"]
|
|
896
|
+
}}
|
|
897
|
+
]
|
|
898
|
+
}}
|
|
899
|
+
|
|
900
|
+
Return ONLY the JSON object, no markdown formatting, no explanation text."""
|
|
901
|
+
|
|
902
|
+
def _parse_category_mapping(
|
|
903
|
+
self,
|
|
904
|
+
response: str,
|
|
905
|
+
original_values: list[str],
|
|
906
|
+
) -> dict[str, str]:
|
|
907
|
+
"""Parse category mapping from LLM response."""
|
|
908
|
+
import json
|
|
909
|
+
import re
|
|
910
|
+
|
|
911
|
+
try:
|
|
912
|
+
# Clean response
|
|
913
|
+
response = response.strip()
|
|
914
|
+
|
|
915
|
+
# Remove markdown code blocks if present
|
|
916
|
+
if response.startswith("```"):
|
|
917
|
+
lines = response.split("\n")
|
|
918
|
+
# Find the JSON content between ``` markers
|
|
919
|
+
start_idx = 1 if lines[0].startswith("```") else 0
|
|
920
|
+
end_idx = len(lines)
|
|
921
|
+
for i, line in enumerate(lines[1:], 1):
|
|
922
|
+
if line.strip() == "```":
|
|
923
|
+
end_idx = i
|
|
924
|
+
break
|
|
925
|
+
response = "\n".join(lines[start_idx:end_idx])
|
|
926
|
+
|
|
927
|
+
# Try to parse as JSON
|
|
928
|
+
data = json.loads(response)
|
|
929
|
+
|
|
930
|
+
# Extract mapping from response
|
|
931
|
+
if isinstance(data, dict):
|
|
932
|
+
if "mapping" in data:
|
|
933
|
+
mapping = data["mapping"]
|
|
934
|
+
elif "groups" in data:
|
|
935
|
+
# Build mapping from groups
|
|
936
|
+
mapping = {}
|
|
937
|
+
for group in data["groups"]:
|
|
938
|
+
canonical = group.get("canonical", "")
|
|
939
|
+
members = group.get("members", [])
|
|
940
|
+
for member in members:
|
|
941
|
+
if member != canonical:
|
|
942
|
+
mapping[member] = canonical
|
|
943
|
+
else:
|
|
944
|
+
# Assume the entire dict is the mapping
|
|
945
|
+
mapping = data
|
|
946
|
+
else:
|
|
947
|
+
mapping = {}
|
|
948
|
+
|
|
949
|
+
# Validate mapping - only keep mappings for values that exist
|
|
950
|
+
original_set = set(original_values)
|
|
951
|
+
original_lower = {v.lower(): v for v in original_values}
|
|
952
|
+
|
|
953
|
+
validated_mapping = {}
|
|
954
|
+
for orig, standardized in mapping.items():
|
|
955
|
+
# Check exact match or case-insensitive match
|
|
956
|
+
if orig in original_set:
|
|
957
|
+
validated_mapping[orig] = standardized
|
|
958
|
+
elif orig.lower() in original_lower:
|
|
959
|
+
actual_orig = original_lower[orig.lower()]
|
|
960
|
+
validated_mapping[actual_orig] = standardized
|
|
961
|
+
|
|
962
|
+
return validated_mapping
|
|
963
|
+
|
|
964
|
+
except json.JSONDecodeError:
|
|
965
|
+
# Try to extract JSON from response
|
|
966
|
+
json_match = re.search(r"\{[\s\S]*\}", response)
|
|
967
|
+
if json_match:
|
|
968
|
+
try:
|
|
969
|
+
return self._parse_category_mapping(json_match.group(), original_values)
|
|
970
|
+
except Exception:
|
|
971
|
+
pass
|
|
972
|
+
|
|
973
|
+
if self.config.verbose:
|
|
974
|
+
logger.warning("SemanticEngine: Could not parse category mapping response")
|
|
975
|
+
return {}
|
|
976
|
+
|
|
977
|
+
def apply_category_mapping(
    self,
    df: pd.DataFrame,
    column: str,
    mapping: dict[str, str],
    inplace: bool = False,
) -> pd.DataFrame:
    """
    Apply a category mapping to standardize values in a DataFrame column.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame
    column : str
        Column to transform
    mapping : dict[str, str]
        Mapping from original values to standardized values
    inplace : bool, default=False
        If True, modify DataFrame in place

    Returns
    -------
    DataFrame
        DataFrame with standardized column values
    """
    if column not in df.columns:
        raise ValueError(f"Column '{column}' not found in DataFrame")

    target = df if inplace else df.copy()

    def _standardize(value):
        # NaN/None entries pass through untouched; everything else is
        # looked up by its string form, falling back to the original.
        if pd.notna(value):
            return mapping.get(str(value), value)
        return value

    target[column] = target[column].apply(_standardize)

    if self.config.verbose:
        logger.info(f"SemanticEngine: Applied mapping to column '{column}'")

    return target
|
|
1016
|
+
|
|
1017
|
+
def standardize_multiple_columns(
    self,
    df: pd.DataFrame,
    columns: list[str],
    contexts: Optional[dict[str, str]] = None,
    **kwargs,
) -> tuple[pd.DataFrame, dict[str, dict[str, str]]]:
    """
    Standardize multiple categorical columns at once.

    Parameters
    ----------
    df : DataFrame
        Input DataFrame
    columns : list[str]
        List of column names to standardize
    contexts : dict[str, str], optional
        Context descriptions for each column
    **kwargs
        Additional arguments passed to standardize_categories

    Returns
    -------
    df_clean : DataFrame
        DataFrame with standardized columns
    all_mappings : dict[str, dict[str, str]]
        Dictionary of mappings for each column
    """
    ctx_by_col = contexts or {}
    mappings_by_col: dict[str, dict[str, str]] = {}
    cleaned = df.copy()

    for col in columns:
        # Columns absent from the frame are skipped (and left out of
        # the returned mappings) rather than failing outright.
        if col not in df.columns:
            if self.config.verbose:
                logger.warning(f"SemanticEngine: Column '{col}' not found, skipping")
            continue

        col_mapping = self.standardize_categories(
            cleaned, col, context=ctx_by_col.get(col), **kwargs
        )
        mappings_by_col[col] = col_mapping

        # Only rewrite the column when a non-empty mapping was produced.
        if col_mapping:
            cleaned = self.apply_category_mapping(cleaned, col, col_mapping)

    return cleaned, mappings_by_col
|
|
1063
|
+
|
|
373
1064
|
def __del__(self):
|
|
374
1065
|
"""Clean up client on deletion."""
|
|
375
1066
|
if self._client:
|