additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Result wrapper classes for Additory operations.
|
|
3
|
+
|
|
4
|
+
Provides DataFrameResult and AnalysisResult classes that wrap
|
|
5
|
+
operation results with metadata and helper methods.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any, Dict, List, Optional
|
|
9
|
+
import polars as pl
|
|
10
|
+
import json
|
|
11
|
+
import time
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DataFrameResult:
    """
    Wraps a Polars DataFrame produced by an Additory operation together
    with metadata describing what that operation did.

    Used by: to, transform, snapshot, synthetic, expressions functions

    Attributes:
        df: Polars DataFrame (the actual result)
        operation: Operation name ('to', 'transform', 'snapshot', 'synthetic')
        metadata: Dictionary of operation metadata
        input_shape: Original DataFrame shape
        output_shape: Result DataFrame shape
        columns_added: List of columns added
        columns_removed: List of columns removed
        execution_time: Time taken for operation
    """

    def __init__(self, df: pl.DataFrame, operation: str, metadata: Dict[str, Any]):
        """
        Store the result DataFrame and unpack frequently-used metadata.

        Args:
            df: Result DataFrame
            operation: Operation name
            metadata: Operation metadata
        """
        self.df = df
        self.operation = operation
        self.metadata = metadata

        # Promote the common metadata fields to attributes so callers can
        # use attribute access instead of dict lookups.
        self.input_shape = metadata.get('input_shape', (0, 0))
        self.output_shape = (df.height, df.width)
        self.columns_added = metadata.get('columns_added', [])
        self.columns_removed = metadata.get('columns_removed', [])
        self.execution_time = metadata.get('execution_time', 0.0)

    def info(self) -> Dict[str, Any]:
        """
        Return a dictionary summarising the operation and result shape.

        Returns:
            Dictionary with shape, columns, operation info

        Example:
            result = add.to(df, ...)
            print(result.info())
            # {'operation': 'to', 'rows': 1000, 'columns': 15, ...}
        """
        return {
            'operation': self.operation,
            'rows': self.output_shape[0],
            'columns': self.output_shape[1],
            'columns_added': self.columns_added,
            'columns_removed': self.columns_removed,
            'execution_time': self.execution_time,
            'input_shape': self.input_shape,
            'output_shape': self.output_shape,
            'metadata': self.metadata,
        }

    def summary(self) -> str:
        """
        Build a short human-readable description of the operation.

        Returns:
            Formatted string summary, one fact per line

        Example:
            result = add.to(df, ...)
            print(result.summary())
        """
        pieces = [f"Operation: add.{self.operation}()"]

        if self.columns_added:
            joined = ', '.join(self.columns_added)
            pieces.append(f"Added {len(self.columns_added)} column(s): {joined}")

        if self.columns_removed:
            joined = ', '.join(self.columns_removed)
            pieces.append(f"Removed {len(self.columns_removed)} column(s): {joined}")

        # Only mention the shape when the operation actually changed it.
        if self.input_shape != self.output_shape:
            pieces.append(
                f"Shape: {self.input_shape[0]}x{self.input_shape[1]} → "
                f"{self.output_shape[0]}x{self.output_shape[1]}"
            )

        pieces.append(f"Time: {self.execution_time:.3f}s")
        return '\n'.join(pieces)

    def explain(self) -> str:
        """
        Produce a detailed, line-per-fact explanation of the operation,
        including every metadata entry not already shown elsewhere.

        Returns:
            Detailed explanation string
        """
        # These metadata keys are reported via dedicated lines below.
        already_shown = {'input_shape', 'execution_time',
                         'columns_added', 'columns_removed'}

        out = [f"Operation: add.{self.operation}()"]
        out.extend(
            f"  - {key}: {value}"
            for key, value in self.metadata.items()
            if key not in already_shown
        )

        if self.columns_added:
            out.append(f"  - Added columns: {', '.join(self.columns_added)}")
        if self.columns_removed:
            out.append(f"  - Removed columns: {', '.join(self.columns_removed)}")
        out.append(f"  - Execution time: {self.execution_time:.3f}s")

        return '\n'.join(out)

    def to_polars(self) -> pl.DataFrame:
        """Return the underlying Polars DataFrame."""
        return self.df

    def to_pandas(self):
        """Return the result converted to a pandas DataFrame."""
        return self.df.to_pandas()

    def to_arrow(self):
        """Return the result converted to an Arrow Table."""
        return self.df.to_arrow()

    def __repr__(self) -> str:
        """Compact one-line representation for debugging."""
        return (
            f"DataFrameResult(operation='{self.operation}', "
            f"shape={self.output_shape}, "
            f"columns_added={len(self.columns_added)}, "
            f"columns_removed={len(self.columns_removed)})"
        )

    def __getattr__(self, name: str) -> Any:
        """
        Forward unknown attribute lookups to the wrapped DataFrame so the
        result can be used like a plain Polars DataFrame.

        Args:
            name: Attribute name

        Returns:
            Attribute from DataFrame

        Raises:
            AttributeError: For the wrapper's own attribute names, which
                prevents infinite recursion if they are missing.

        Example:
            result = add.to(df, ...)
            result.select(['name', 'age'])  # Delegates to df.select()
        """
        own = {'df', 'operation', 'metadata', 'input_shape', 'output_shape',
               'columns_added', 'columns_removed', 'execution_time'}
        if name in own:
            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
        return getattr(self.df, name)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
class AnalysisResult:
    """
    Result wrapper returned by the analyze() function.

    Used by: analyze function

    Each individual analysis is exposed as an attribute; analyses that
    were not run are None.

    Attributes:
        quality: Quality analysis results
        cardinality: Cardinality analysis results
        distributions: Distribution analysis results
        correlations: Correlation analysis results
        features: Feature analysis results
        types: Type analysis results
        patterns: Pattern analysis results
        outliers: Outlier analysis results
        duplicates: Duplicate analysis results
        timeseries: Time series analysis results (if applicable)
        imputation: Imputation recommendations
        metadata: Analysis metadata
    """

    def __init__(self, analyses: Dict[str, Any], metadata: Dict[str, Any]):
        """
        Initialize analysis result.

        Args:
            analyses: Dictionary of analysis results
            metadata: Analysis metadata
        """
        self.metadata = metadata

        # Expose each known analysis as its own attribute (None when absent).
        for section in ('quality', 'cardinality', 'distributions',
                        'correlations', 'features', 'types', 'patterns',
                        'outliers', 'duplicates', 'timeseries', 'imputation'):
            setattr(self, section, analyses.get(section))

        # Keep the raw mapping for summary()/to_dict().
        self._analyses = analyses

    def summary(self) -> str:
        """
        Render a text summary of every analysis that was performed.

        Returns:
            Formatted summary string
        """
        performed = [name for name, res in self._analyses.items() if res is not None]

        out = ["Analysis Summary", "=" * 50,
               f"Analyses performed: {len(performed)}", ""]

        for name in performed:
            res = self._analyses[name]
            out.append(f"{name.upper()}:")
            if isinstance(res, dict):
                for key, value in res.items():
                    if isinstance(value, (int, float, str, bool)):
                        out.append(f"  {key}: {value}")
                    elif isinstance(value, list):
                        out.append(f"  {key}: {len(value)} items")
                    elif isinstance(value, pl.DataFrame):
                        out.append(f"  {key}: DataFrame ({value.height}x{value.width})")
                    # Other value types (e.g. nested dicts) are not listed.
            else:
                out.append(f"  {res}")
            out.append("")

        if self.metadata:
            out.append("METADATA:")
            out.extend(f"  {key}: {value}" for key, value in self.metadata.items())

        return '\n'.join(out)

    def to_dict(self) -> Dict[str, Any]:
        """
        Collect all non-empty analysis results into a plain dictionary,
        converting Polars DataFrames to dictionaries along the way.

        Returns:
            Dictionary of all analysis results plus a 'metadata' entry
        """
        def plain(obj):
            # Single level of DataFrame conversion, matching the depth the
            # analyses actually nest to.
            return obj.to_dict() if isinstance(obj, pl.DataFrame) else obj

        out: Dict[str, Any] = {}
        for name, res in self._analyses.items():
            if res is None:
                continue
            if isinstance(res, dict):
                out[name] = {k: plain(v) for k, v in res.items()}
            else:
                out[name] = plain(res)

        out['metadata'] = self.metadata
        return out

    def to_json(self) -> str:
        """
        Serialise all results to a JSON string. Values json cannot encode
        fall back to their str() form.

        Returns:
            JSON string
        """
        return json.dumps(self.to_dict(), indent=2, default=str)

    def __repr__(self) -> str:
        """Compact representation showing how many analyses ran."""
        count = sum(1 for v in self._analyses.values() if v is not None)
        return f"AnalysisResult(analyses={count})"
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def wrap_result(df: pl.DataFrame, operation: str, metadata: Dict[str, Any]) -> DataFrameResult:
    """
    Wrap a DataFrame and its operation metadata in a DataFrameResult.

    Called by: to, transform, snapshot, synthetic, expressions functions

    Args:
        df: DataFrame to wrap
        operation: Operation name
        metadata: Operation metadata

    Returns:
        DataFrameResult instance

    Example:
        result = wrap_result(df, 'to', {'columns_added': ['price']})
    """
    return DataFrameResult(df=df, operation=operation, metadata=metadata)
|
|
359
|
+
|
|
360
|
+
|
|
361
|
+
def wrap_analysis(analyses: Dict[str, Any], metadata: Dict[str, Any]) -> AnalysisResult:
    """
    Wrap a collection of analysis results in an AnalysisResult.

    Called by: analyze function

    Args:
        analyses: Dictionary of analysis results
        metadata: Analysis metadata

    Returns:
        AnalysisResult instance

    Example:
        result = wrap_analysis(
            {'quality': {...}, 'cardinality': {...}},
            {'execution_time': 0.5}
        )
    """
    return AnalysisResult(analyses=analyses, metadata=metadata)
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Strategy parsing and validation utilities for Additory.
|
|
3
|
+
|
|
4
|
+
Provides functions to parse and validate strategy dictionaries
|
|
5
|
+
used across multiple functions (to, transform, synthetic).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import Any, Dict, List, Tuple
|
|
9
|
+
import polars as pl
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def parse_strategy(strategy: Dict, context: str) -> Dict:
    """
    Parse and validate a strategy dictionary for a given context.

    Args:
        strategy: Dictionary containing strategy configuration
        context: Context string ('to', 'transform', 'synthetic')

    Returns:
        Validated and normalized strategy dictionary

    Raises:
        ValueError: If the context is unknown, or a per-column strategy
            uses keys not allowed in that context.

    Example:
        strategy = {'price': {'mode': 'first', 'position': 'after:id'}}
        parsed = parse_strategy(strategy, context='to')
    """
    # Keys each context is allowed to use in a per-column strategy dict.
    allowed_keys_by_context = {
        'to': ['mode', 'position', 'default', 'deduce'],
        'transform': ['mode', 'from_unit', 'to_unit', 'features', 'deduce'],
        'synthetic': ['mode', 'distribution', 'min', 'max', 'mean', 'std',
                      'categories', 'deduce', 'correlation'],
    }

    try:
        allowed_keys = allowed_keys_by_context[context]
    except KeyError:
        raise ValueError(f"Invalid context: {context}. Must be one of: {list(allowed_keys_by_context.keys())}") from None

    parsed = {}
    for column, spec in strategy.items():
        if not isinstance(spec, dict):
            # Bare value such as 'deduce:expression' — pass through as-is.
            parsed[column] = normalize_strategy_value(spec, 'simple')
            continue

        validate_strategy_keys(spec, allowed_keys)
        parsed[column] = {
            key: normalize_strategy_value(value, key)
            for key, value in spec.items()
        }

    return parsed
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def validate_strategy_keys(strategy: Dict, allowed_keys: List[str]) -> bool:
    """
    Check that a strategy dictionary uses only keys from the allowed set.

    Args:
        strategy: Strategy dictionary to validate
        allowed_keys: List of allowed keys for this context

    Returns:
        True if valid

    Raises:
        ValueError: If strategy contains invalid keys
    """
    unexpected = set(strategy) - set(allowed_keys)
    if unexpected:
        raise ValueError(
            f"Invalid strategy keys: {unexpected}. "
            f"Allowed keys: {allowed_keys}"
        )
    return True
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def normalize_strategy_value(value: Any, value_type: str) -> Any:
    """
    Normalize a single strategy value to its expected type.

    Args:
        value: Value to normalize
        value_type: Expected type ('mode', 'position', 'deduce', etc.)

    Returns:
        Normalized value

    Raises:
        ValueError: If the value does not match the type expected for
            the given value_type.
    """
    # Strings that are validated and lower-cased; the dict value is the
    # label used in the error message.
    lowered_string_labels = {
        'mode': 'Mode',
        'position': 'Position',
        'distribution': 'Distribution',
        'from_unit': 'from_unit',
        'to_unit': 'to_unit',
    }
    if value_type in lowered_string_labels:
        if not isinstance(value, str):
            raise ValueError(f"{lowered_string_labels[value_type]} must be a string, got {type(value)}")
        return value.lower()

    # Deduce expressions keep their original casing.
    if value_type == 'deduce':
        if not isinstance(value, str):
            raise ValueError(f"Deduce must be a string, got {type(value)}")
        return value

    # Numeric parameters are coerced to float.
    if value_type in ('min', 'max', 'mean', 'std', 'correlation'):
        if not isinstance(value, (int, float)):
            raise ValueError(f"{value_type} must be numeric, got {type(value)}")
        return float(value)

    # List-valued parameters are validated but passed through unchanged.
    list_labels = {'categories': 'Categories', 'features': 'Features'}
    if value_type in list_labels:
        if not isinstance(value, list):
            raise ValueError(f"{list_labels[value_type]} must be a list, got {type(value)}")
        return value

    # 'default', 'simple', and any unrecognized type pass through untouched.
    return value
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def parse_deduce_strategy(strategy_value: str, df: pl.DataFrame) -> pl.Series:
    """
    Evaluate a 'deduce:' strategy string (inline expression or reference).

    Args:
        strategy_value: Strategy string starting with 'deduce:'
        df: DataFrame for expression evaluation

    Returns:
        Polars Series with computed values

    Raises:
        ValueError: If the 'deduce:' prefix is missing or the inline
            expression cannot be evaluated.
        NotImplementedError: For namespaced references, pending the
            expressions.engine implementation.

    Example:
        # Inline expression
        result = parse_deduce_strategy('deduce:weight / (height ** 2)', df)

        # Reference expression
        result = parse_deduce_strategy('deduce:inbuilt:bmi', df)
    """
    prefix = 'deduce:'
    if not strategy_value.startswith(prefix):
        raise ValueError(f"Strategy value must start with 'deduce:', got: {strategy_value}")

    expression_part = strategy_value[len(prefix):]

    if ':' in expression_part:
        # Namespaced reference such as 'inbuilt:bmi' or 'myfolder:roi'.
        # Placeholder until expressions.engine is available.
        namespace, expr_name = extract_namespace_from_reference(expression_part)
        raise NotImplementedError(
            f"Expression references not yet implemented. "
            f"Namespace: {namespace}, Expression: {expr_name}"
        )

    # Inline expression such as 'weight / (height ** 2)'.
    # NOTE(review): pl.lit() only produces a literal string column; this
    # does not actually evaluate the expression and is a stop-gap until
    # expressions.engine lands — confirm before relying on the output.
    try:
        return df.select(pl.lit(expression_part).alias('result'))['result']
    except Exception as e:
        raise ValueError(
            f"Failed to evaluate inline expression: {expression_part}. "
            f"Error: {str(e)}"
        )
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def extract_namespace_from_reference(reference: str) -> Tuple[str, str]:
    """
    Extract namespace and expression name from a reference string.

    Args:
        reference: Reference string like 'inbuilt:bmi' or 'myfolder:roi'

    Returns:
        Tuple of (namespace, expression_name), both stripped of
        surrounding whitespace

    Raises:
        ValueError: If the reference has no ':' separator or either part
            is empty after stripping.

    Example:
        namespace, expr_name = extract_namespace_from_reference('inbuilt:bmi')
        # Returns: ('inbuilt', 'bmi')
    """
    if ':' not in reference:
        raise ValueError(
            f"Invalid reference format: {reference}. "
            f"Expected format: 'namespace:expression_name'"
        )

    # Fix: the original re-checked len(split(':', 1)) != 2 here, which is
    # unreachable once ':' is known to be present; partition() makes the
    # two-part split explicit.
    namespace, _, expr_name = reference.partition(':')
    namespace = namespace.strip()
    expr_name = expr_name.strip()

    if not namespace or not expr_name:
        raise ValueError(
            f"Invalid reference format: {reference}. "
            f"Both namespace and expression name must be non-empty"
        )

    return namespace, expr_name
|