additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/synthetic/strategies.py
DELETED
|
@@ -1,926 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Strategy handlers for synthetic data generation
|
|
3
|
-
|
|
4
|
-
Provides different strategies for generating synthetic data:
|
|
5
|
-
- auto: Random sampling from existing values
|
|
6
|
-
- increment: Increment numeric or pattern-based values
|
|
7
|
-
- choice:[...]: Random selection from inline list
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
import re
|
|
11
|
-
import random
|
|
12
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
13
|
-
|
|
14
|
-
from additory.common.exceptions import ValidationError, AugmentError
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def parse_strategy_params(strategy_spec: str) -> Tuple[str, Dict[str, Any]]:
    """
    Parse a strategy specification with inline parameters.

    Supports two formats:
    1. key=value format: "increment:start=100:pattern=EMP_[001]"
    2. range format: "range:18-65"

    Args:
        strategy_spec: Strategy string with optional ":"-separated parameters.

    Returns:
        Tuple of (strategy_name, params_dict)
        - strategy_name: Base strategy name (e.g., "increment", "range")
        - params_dict: Parsed parameters; values that look like integers are
          converted to int, everything else is kept as a string.

    Raises:
        ValidationError: If the specification or the strategy name is empty,
            a range is malformed, or a parameter is not a non-empty
            key=value pair.

    Examples:
        >>> parse_strategy_params("increment")
        ("increment", {})

        >>> parse_strategy_params("increment:start=100:pattern=EMP_[001]")
        ("increment", {"start": 100, "pattern": "EMP_[001]"})

        >>> parse_strategy_params("range:18-65")
        ("range", {"min": 18, "max": 65})
    """
    if not strategy_spec or not strategy_spec.strip():
        raise ValidationError("Empty strategy specification")

    # Every segment is whitespace-trimmed once, up front.
    parts = [part.strip() for part in strategy_spec.split(":")]
    strategy_name = parts[0]

    if not strategy_name:
        raise ValidationError("Empty strategy name")

    if len(parts) == 1:
        # No parameters
        return strategy_name, {}

    # Special case: shorthand "range:18-65" (a single segment without "=").
    if strategy_name == "range" and len(parts) == 2 and "=" not in parts[1]:
        range_part = parts[1]
        # The regex keeps a leading "-" attached to each bound so negative
        # numbers such as "-10--5" split correctly.
        match = re.match(r'^(-?\d+)-(-?\d+)$', range_part)
        if match is None:
            raise ValidationError(
                f"Invalid range format: {range_part}. "
                "Expected format: range:min-max (e.g., range:18-65)"
            )
        # Both groups are digit-only, so int() cannot fail here.
        return strategy_name, {
            "min": int(match.group(1)),
            "max": int(match.group(2)),
        }

    # Parse key=value parameters
    params: Dict[str, Any] = {}

    for param_part in parts[1:]:
        if "=" not in param_part:
            raise ValidationError(
                f"Invalid parameter format: '{param_part}'. "
                "Expected format: key=value (e.g., start=100)"
            )

        key, value = (piece.strip() for piece in param_part.split("=", 1))

        if not key:
            raise ValidationError(
                f"Empty parameter key in: '{param_part}'"
            )

        if not value:
            raise ValidationError(
                f"Empty parameter value for key '{key}'"
            )

        # Integer-looking values become ints. Inputs such as "--5" pass the
        # digit check but fail int(); those deliberately stay strings.
        try:
            params[key] = int(value) if value.lstrip('-').isdigit() else value
        except ValueError:
            params[key] = value

    return strategy_name, params
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
def parse_increment_strategy(strategy_spec: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Parse an increment strategy specification.

    Args:
        strategy_spec: Strategy string like:
            - "increment"
            - "increment:EMP_[001]_ID"
            - r"increment:A(\\d+)"

    Returns:
        Tuple of (pattern, regex_pattern)
        - pattern: Original pattern string (bracket notation only, else None)
        - regex_pattern: Regex source string with one capture group for the
          numeric part (None when no pattern was given)

    Raises:
        ValidationError: If the pattern has neither a digit bracket such as
            [001] nor a regex capture group.

    Examples:
        >>> parse_increment_strategy("increment")
        (None, None)

        >>> parse_increment_strategy("increment:EMP_[001]_ID")
        ("EMP_[001]_ID", r"EMP_(\\d{3})_ID")

        >>> parse_increment_strategy(r"increment:A(\\d+)")
        (None, r"A(\\d+)")
    """
    _, separator, remainder = strategy_spec.partition(":")

    if not separator:
        # Simple "increment" with no pattern
        return None, None

    pattern_str = remainder.strip()

    # Bracket notation: EMP_[001]_ID -> EMP_(\d{3})_ID
    bracket_match = re.search(r'\[(\d+)\]', pattern_str)
    if bracket_match:
        # The number of digits inside the brackets fixes the padding width.
        padding = len(bracket_match.group(1))
        # Text around the bracket is literal, so escape it: otherwise "." or
        # "+" in an ID prefix would act as regex operators. Only the first
        # bracket becomes the capture group; later ones stay literal text.
        prefix = re.escape(pattern_str[:bracket_match.start()])
        suffix = re.escape(pattern_str[bracket_match.end():])
        regex_pattern = prefix + f"(\\d{{{padding}}})" + suffix
        return pattern_str, regex_pattern

    # Otherwise assume it is already a regex; it must have a capture group.
    if "(" in pattern_str and ")" in pattern_str:
        # Character classes such as [0-9] are legitimate in this form.
        return None, pattern_str

    if "[" in pattern_str and "]" in pattern_str:
        raise ValidationError(
            f"Invalid bracket pattern: {pattern_str}. "
            "Brackets must contain digits, e.g., [001] or [123]"
        )

    raise ValidationError(
        f"Invalid pattern: {pattern_str}. "
        "Pattern must either use bracket notation [001] or regex with capture group (\\d+)"
    )
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
def validate_increment_column(
    last_value: Any,
    pattern: Optional[str],
    regex_pattern: Optional[str]
) -> Tuple[int, Optional[str], Optional[int]]:
    """
    Validate that a column can be incremented and extract its current value.

    Args:
        last_value: Last value in the column
        pattern: Pattern string (if using bracket notation)
        regex_pattern: Regex pattern whose first capture group is the number

    Returns:
        Tuple of (current_number, prefix_suffix_template, padding)
        - current_number: The numeric value to increment from
        - prefix_suffix_template: ``str.format`` template for reconstruction
          (e.g., "EMP_{}_ID"); literal braces taken from the value are
          doubled so they survive formatting. None for pure numeric columns.
        - padding: Number of digits for zero-padding (None when the captured
          number has no leading zero)

    Raises:
        ValidationError: If the value is non-numeric and no pattern is given,
            the regex is invalid, the regex does not match, or no number can
            be extracted from the first capture group.
    """
    # Case 1: Pure numeric value
    if regex_pattern is None and pattern is None:
        try:
            return int(last_value), None, None
        except (ValueError, TypeError):
            raise ValidationError(
                f"Column has non-numeric last value '{last_value}'. "
                "For non-numeric columns, you must provide a pattern. "
                "Examples: 'increment:EMP_[001]_ID' or 'increment:A(\\d+)'"
            )

    # Case 2: Pattern-based value
    if regex_pattern is None:
        raise ValidationError("Pattern parsing failed - this should not happen")

    last_value_str = str(last_value)

    try:
        match = re.search(regex_pattern, last_value_str)
    except re.error as exc:
        # A malformed user regex is reported like any other bad input.
        raise ValidationError(
            f"Invalid regex pattern '{regex_pattern}': {exc}"
        ) from exc

    if not match:
        raise ValidationError(
            f"Pattern '{pattern or regex_pattern}' does not match last value '{last_value}'. "
            "Please verify the pattern matches your data."
        )

    # Extract the numeric part. IndexError: the regex has no group 1.
    # TypeError: group 1 is optional and did not take part in the match.
    try:
        number_str = match.group(1)
        current_number = int(number_str)
    except (ValueError, IndexError, TypeError):
        raise ValidationError(
            f"Could not extract numeric value from '{last_value}' using pattern '{pattern or regex_pattern}'"
        )

    padding = len(number_str) if number_str.startswith('0') else None

    # Build the template from the matched positions in the actual value,
    # which is more reliable than editing the regex source.
    start, end = match.span(1)

    def _literal(text: str) -> str:
        # Double braces so the result is safe to pass through str.format().
        return text.replace('{', '{{').replace('}', '}}')

    template = _literal(last_value_str[:start]) + '{}' + _literal(last_value_str[end:])

    return current_number, template, padding
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
def generate_increment_values(
    start_number: int,
    count: int,
    template: Optional[str],
    padding: Optional[int]
) -> List[Any]:
    """
    Produce ``count`` consecutive values beginning at ``start_number``.

    Args:
        start_number: First number of the sequence
        count: How many values to produce
        template: ``str.format`` template with one ``{}`` slot
            (e.g., "EMP_{}_ID"); None yields the bare numbers
        padding: Zero-pad width for the number inside the template;
            a falsy value disables padding

    Returns:
        List of numbers (no template) or formatted strings (with template)
    """
    sequence = (start_number + offset for offset in range(count))

    # Pure numeric: hand the numbers back untouched.
    if template is None:
        return list(sequence)

    # Pattern-based: zfill with a width of 0 leaves the text unchanged,
    # which is the "no padding" case.
    width = padding or 0
    return [template.format(str(number).zfill(width)) for number in sequence]
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
def apply_increment_strategy(
    df_polars: Any,
    column: str,
    strategy_spec: str,
    n_rows: int,
    params: Optional[Dict[str, Any]] = None
) -> List[Any]:
    """
    Apply increment strategy to a column (Polars-only).

    Supports two modes:
    1. Extend mode: Increment from the last value of ``df_polars[column]``
    2. Create mode: Start from a specified value (requires params['start'])

    Args:
        df_polars: Input Polars DataFrame (None selects create mode)
        column: Column name to increment
        strategy_spec: Strategy specification (e.g., "increment:EMP_[001]_ID");
            only consulted in extend mode
        n_rows: Number of new values to generate
        params: Optional parameters dict with 'start' and/or 'pattern' keys;
            only consulted in create mode

    Returns:
        List of new values for the column

    Raises:
        ValidationError: If 'start' is missing or non-numeric in create mode,
            the pattern is not valid bracket notation, the column is empty in
            extend mode, or the last value cannot be incremented.

    Examples:
        # Extend mode (with DataFrame)
        >>> apply_increment_strategy(df, "id", "increment", 5)
        [11, 12, 13, 14, 15]  # if last value was 10

        # Create mode (no DataFrame, with start parameter)
        >>> apply_increment_strategy(None, "id", "increment", 5, {"start": 100})
        [100, 101, 102, 103, 104]

        # Create mode with pattern
        >>> apply_increment_strategy(None, "emp_id", "increment", 3,
        ...                          {"start": 1, "pattern": "EMP_[001]"})
        ["EMP_001", "EMP_002", "EMP_003"]
    """
    if df_polars is None:
        # Create mode: the sequence starts at the 'start' parameter.
        if params is None or "start" not in params:
            raise ValidationError(
                "Increment strategy in create mode requires 'start' parameter. "
                "Use format: 'increment:start=N' or 'increment:start=N:pattern=P'"
            )

        start_number = params["start"]
        # A spec such as "increment:start=abc" parses 'start' as a string;
        # reject it here rather than failing with a bare TypeError later.
        try:
            start_number + 0
        except TypeError:
            raise ValidationError(
                f"Increment 'start' must be numeric, got {start_number!r}"
            ) from None

        template = None
        padding = None

        if "pattern" in params:
            # str(): an all-digit pattern may have been parsed into an int.
            pattern_str = str(params["pattern"])
            bracket_match = re.search(r'\[(\d+)\]', pattern_str)

            if bracket_match is None:
                if "[" in pattern_str and "]" in pattern_str:
                    raise ValidationError(
                        f"Invalid bracket pattern: {pattern_str}. "
                        "Brackets must contain digits, e.g., [001] or [123]"
                    )
                raise ValidationError(
                    f"Invalid pattern: {pattern_str}. "
                    "Pattern must use bracket notation [001]"
                )

            padding = len(bracket_match.group(1))

            # Only the first [NNN] becomes the slot, and literal braces are
            # doubled so the template stays valid for str.format().
            prefix = pattern_str[:bracket_match.start()]
            suffix = pattern_str[bracket_match.end():]
            template = (
                prefix.replace("{", "{{").replace("}", "}}")
                + "{}"
                + suffix.replace("{", "{{").replace("}", "}}")
            )

        return generate_increment_values(
            start_number=start_number,
            count=n_rows,
            template=template,
            padding=padding
        )

    # Extend mode: continue from the last existing value.
    pattern, regex_pattern = parse_increment_strategy(strategy_spec)

    series = df_polars[column]
    if len(series) == 0:
        raise ValidationError(
            f"Column '{column}' is empty; cannot determine a value to increment from"
        )
    last_value = series[-1]

    current_number, template, padding = validate_increment_column(
        last_value, pattern, regex_pattern
    )

    # New values start one past the current maximum.
    return generate_increment_values(
        start_number=current_number + 1,
        count=n_rows,
        template=template,
        padding=padding
    )
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
def parse_strategy_dict(strategy: Any) -> Dict[str, str]:
    """
    Normalise the ``strategy`` argument into a column -> spec mapping.

    Args:
        strategy: Either
            - str: one spec used for every column (stored under "__default__")
            - dict: {"col1": "increment", "col2": "auto", ...}

    Returns:
        Mapping from column name to strategy spec. A dict input is returned
        as-is (same object) once validated.

    Raises:
        ValidationError: If ``strategy`` is neither str nor dict, or a dict
            value is not a string.
    """
    if isinstance(strategy, str):
        # One spec for all columns.
        return {"__default__": strategy}

    if not isinstance(strategy, dict):
        raise ValidationError(
            f"Strategy must be str or dict, got {type(strategy)}"
        )

    # Report the first non-string spec, in dict order.
    offender = next(
        (item for item in strategy.items() if not isinstance(item[1], str)),
        None,
    )
    if offender is not None:
        col, strat = offender
        raise ValidationError(
            f"Strategy for column '{col}' must be a string, got {type(strat)}"
        )

    return strategy
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
def get_column_strategy(column: str, strategy_dict: Dict[str, str]) -> str:
    """
    Look up the strategy spec that applies to ``column``.

    Args:
        column: Column name
        strategy_dict: Parsed strategy dictionary

    Returns:
        The column's own spec if present, otherwise the "__default__" entry,
        otherwise "auto".
    """
    fallback = strategy_dict.get("__default__", "auto")
    return strategy_dict.get(column, fallback)
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]:
    """
    Parse a choice strategy specification.

    Args:
        strategy_spec: Strategy string like:
            - "choice:[value1,value2,value3]"

    Returns:
        Tuple of (strategy_type, values)
        - strategy_type: "choice"
        - values: List of whitespace-trimmed string values

    Raises:
        ValidationError: If the spec does not start with "choice:[", does not
            end with "]", or contains no actual values.

    Examples:
        >>> parse_choice_strategy("choice:[Active,Inactive,Pending]")
        ("choice", ["Active", "Inactive", "Pending"])
    """
    prefix = "choice:["

    if not strategy_spec.startswith(prefix):
        raise ValidationError(
            f"Invalid choice strategy: {strategy_spec}. "
            "Must start with 'choice:['"
        )

    if not strategy_spec.endswith("]"):
        raise ValidationError(
            f"Invalid choice strategy: {strategy_spec}. "
            "Must be in format: choice:[value1,value2,value3]"
        )

    # Extract values between [ and ]
    values_str = strategy_spec[len(prefix):-1]

    if not values_str.strip():
        raise ValidationError(
            f"Choice list cannot be empty: {strategy_spec}"
        )

    # Split by comma and strip whitespace
    values = [item.strip() for item in values_str.split(",")]

    # str.split always returns at least one element, so a length check can
    # never fire. Test for the case it was meant to catch: a list made only
    # of separators, e.g. "choice:[,]".
    if not any(values):
        raise ValidationError(
            f"Choice list must contain at least one value: {strategy_spec}"
        )

    return "choice", values
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
def apply_range_strategy(
    min_val: int,
    max_val: int,
    n_rows: int,
    seed: Optional[int]
) -> List[int]:
    """
    Apply range strategy to generate random integers within a range.

    Args:
        min_val: Minimum value (inclusive)
        max_val: Maximum value (inclusive)
        n_rows: Number of values to generate
        seed: Random seed for reproducibility; None draws from the
            module-level ``random`` generator

    Returns:
        List of random integers within the specified range

    Raises:
        ValidationError: If min_val >= max_val

    Examples:
        >>> values = apply_range_strategy(18, 65, 5, seed=42)
        >>> len(values)
        5
        >>> values == apply_range_strategy(18, 65, 5, seed=42)
        True
    """
    # Validate range
    if min_val >= max_val:
        raise ValidationError(
            f"Invalid range: min ({min_val}) must be less than max ({max_val})"
        )

    # A private generator gives the same sequence as random.seed(seed) would,
    # without reseeding the process-wide RNG as a side effect.
    rng = random.Random(seed) if seed is not None else random

    # randint is inclusive at both ends.
    return [rng.randint(min_val, max_val) for _ in range(n_rows)]
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
def apply_choice_strategy(
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int]
) -> List[Any]:
    """
    Apply choice strategy to generate values.

    Args:
        strategy_spec: Strategy specification (e.g., "choice:[A,B,C]")
        n_rows: Number of values to generate
        seed: Random seed for reproducibility; None draws from the
            module-level ``random`` generator

    Returns:
        List of values picked at random, with replacement, from the inline list

    Raises:
        ValidationError: If the strategy specification cannot be parsed
    """
    # Only the parsed values are needed; the strategy type is always "choice".
    _, values = parse_choice_strategy(strategy_spec)

    # A private generator gives the same sequence as random.seed(seed) would,
    # without reseeding the process-wide RNG as a side effect.
    rng = random.Random(seed) if seed is not None else random

    return rng.choices(values, k=n_rows)
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
def apply_forecast_strategy(
    df_polars: Any,
    column: str,
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int] = None
) -> List[Any]:
    """
    Apply forecast strategy to a column.

    Supports:
    - forecast:linear
    - forecast:polynomial
    - forecast:exponential
    - forecast:moving_average
    - forecast:seasonal
    - forecast:auto

    Args:
        df_polars: Input Polars DataFrame
        column: Column name to forecast
        strategy_spec: Strategy specification (e.g., "forecast:seasonal:period=12")
        n_rows: Number of values to forecast
        seed: Random seed; accepted but not forwarded to the forecaster

    Returns:
        List of forecasted values

    Raises:
        ValidationError: If the spec has no method segment, or the underlying
            forecast call fails (the original exception is chained as the
            cause).

    Examples:
        >>> apply_forecast_strategy(df, "sales", "forecast:linear", 10)
        [105.2, 110.4, 115.6, ...]

        >>> apply_forecast_strategy(df, "sales", "forecast:seasonal:period=12", 24)
        [98.5, 102.3, 95.8, ...]
    """
    from additory.synthetic.forecast import forecast_values

    # Parse strategy: forecast:method:param1=val1:param2=val2
    parts = [part.strip() for part in strategy_spec.split(":")]

    if len(parts) < 2:
        raise ValidationError(
            f"Invalid forecast strategy: {strategy_spec}. "
            "Expected format: forecast:method or forecast:method:param=value"
        )

    # parts[0] is "forecast", parts[1] is method
    method = parts[1]

    # Remaining segments are key=value options; segments without "=" are
    # ignored.
    params: Dict[str, Any] = {}
    for param_part in parts[2:]:
        key, separator, value = param_part.partition("=")
        if not separator:
            continue
        key = key.strip()
        value = value.strip()

        # Numeric-looking values become float (if they contain ".") or int;
        # anything that fails conversion stays a string.
        try:
            params[key] = float(value) if "." in value else int(value)
        except ValueError:
            params[key] = value

    # Any forecaster failure is reported as a ValidationError, keeping the
    # original exception as __cause__ so the real traceback is not lost.
    try:
        return forecast_values(
            df_polars,
            column,
            n_rows,
            method=method,
            **params
        )
    except Exception as e:
        raise ValidationError(f"Forecast strategy failed: {e}") from e
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
def apply_distribution_strategy(
    df_polars: Any,
    column: str,
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int] = None
) -> List[Any]:
    """
    Apply distribution strategy to a column.

    Supports:
    - normal (or normal:auto)
    - normal:mean=X:std=Y
    - uniform:min=X:max=Y
    - skewed_left:skewness=X
    - skewed_right:skewness=X

    Args:
        df_polars: Input Polars DataFrame (for parameter estimation)
        column: Column name to generate from
        strategy_spec: Strategy specification (e.g., "normal:auto")
        n_rows: Number of values to generate
        seed: Random seed for reproducibility

    Returns:
        List of generated values

    Raises:
        ValidationError: If the spec is blank, or the underlying distribution
            call fails (the original exception is chained as the cause).

    Examples:
        >>> apply_distribution_strategy(df, "age", "normal:auto", 100, seed=42)
        [34.5, 28.9, 41.2, ...]

        >>> apply_distribution_strategy(df, "score", "uniform:min=0:max=100", 50)
        [45.2, 78.9, 12.3, ...]
    """
    from additory.common.distributions import generate_distribution_values

    # str.split never returns an empty list, so a length check cannot detect
    # a missing spec; test for blank input directly.
    if not strategy_spec or not strategy_spec.strip():
        raise ValidationError(f"Invalid distribution strategy: {strategy_spec}")

    # Parse strategy: distribution:param1=val1:param2=val2
    parts = [part.strip() for part in strategy_spec.split(":")]
    distribution = parts[0]

    params: Dict[str, Any] = {}
    auto_mode = False

    for param_part in parts[1:]:
        if param_part == "auto":
            # Special case: normal:auto
            auto_mode = True
            continue

        key, separator, value = param_part.partition("=")
        if not separator:
            # Segments without "=" (other than "auto") are ignored.
            continue
        key = key.strip()
        value = value.strip()

        # Numeric-looking values become float (if they contain ".") or int;
        # anything that fails conversion stays a string.
        try:
            params[key] = float(value) if "." in value else int(value)
        except ValueError:
            params[key] = value

    # Get existing data for parameter estimation
    data = df_polars[column].to_numpy()

    # An "auto" segment overrides the named distribution.
    dist_type = 'auto' if auto_mode else distribution

    # Any generator failure is reported as a ValidationError, keeping the
    # original exception as __cause__ so the real traceback is not lost.
    try:
        return generate_distribution_values(
            n_rows,
            distribution=dist_type,
            data=data,
            seed=seed,
            **params
        )
    except Exception as e:
        raise ValidationError(f"Distribution strategy failed: {e}") from e
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
def apply_smote_strategy(
    df_polars: Any,
    columns: List[str],
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int] = None
) -> Dict[str, List[Any]]:
    """
    Apply SMOTE strategy to multiple columns.

    SMOTE generates synthetic samples using k-nearest neighbors.

    The specification has the form ``smote:key=value[:key=value...]``. The
    first segment is not inspected here; every later ``key=value`` segment
    is forwarded as a keyword argument, with ``k`` renamed to
    ``k_neighbors``. Values are converted to int when possible and passed
    as strings otherwise. Segments without ``=`` are ignored.

    Args:
        df_polars: Input Polars DataFrame
        columns: List of column names to use for SMOTE
        strategy_spec: Strategy specification (e.g., "smote:k=5")
        n_rows: Number of synthetic samples to generate
        seed: Random seed for reproducibility

    Returns:
        Dictionary mapping column names to generated values

    Raises:
        ValidationError: If strategy cannot be applied

    Examples:
        >>> apply_smote_strategy(df, ["feature1", "feature2"], "smote:k=5", 100)
        {"feature1": [1.2, 3.4, ...], "feature2": [5.6, 7.8, ...]}
    """
    from additory.synthetic.smote import generate_smote_values

    # Parse strategy: smote:k=5 (skip the leading strategy name)
    params = {}
    for param_part in strategy_spec.split(":")[1:]:
        key, separator, value = param_part.strip().partition("=")
        if not separator:
            continue

        key = key.strip()
        value = value.strip()

        # Convert k to k_neighbors
        if key == "k":
            key = "k_neighbors"

        # Try to convert to int; fall back to the raw string
        try:
            params[key] = int(value)
        except ValueError:
            params[key] = value

    # Call SMOTE function
    try:
        return generate_smote_values(
            df_polars,
            columns,
            n_rows,
            seed=seed,
            **params
        )
    except Exception as e:
        # Chain explicitly so the root cause stays attached to the error
        raise ValidationError(f"SMOTE strategy failed: {e}") from e
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
def parse_deduce_strategy(strategy_spec: str) -> Tuple[str, List[str]]:
    """
    Parse deduce strategy specification.

    Args:
        strategy_spec: Strategy string like:
            - "deduce:comment"
            - "deduce:[comment, notes]"

    Returns:
        Tuple of (strategy_type, source_columns)
        - strategy_type: "deduce"
        - source_columns: List of source column names. In the bracketed
          form, names are stripped of surrounding whitespace and blank
          entries (e.g. from a trailing comma) are dropped.

    Raises:
        ValidationError: If strategy format is invalid, i.e. the prefix is
            missing, nothing follows the prefix, or a bracketed list
            contains no column names.

    Examples:
        >>> parse_deduce_strategy("deduce:comment")
        ("deduce", ["comment"])

        >>> parse_deduce_strategy("deduce:[comment, notes]")
        ("deduce", ["comment", "notes"])
    """
    prefix = "deduce:"

    if not strategy_spec.startswith(prefix):
        raise ValidationError(
            f"Invalid deduce strategy: {strategy_spec}. "
            "Must start with 'deduce:'"
        )

    # Extract source specification after "deduce:"
    source_spec = strategy_spec[len(prefix):].strip()

    if not source_spec:
        raise ValidationError(
            f"Deduce strategy requires source column(s): {strategy_spec}. "
            "Format: 'deduce:column' or 'deduce:[col1, col2]'"
        )

    # Single column: anything that is not wrapped in brackets
    if not (source_spec.startswith("[") and source_spec.endswith("]")):
        return "deduce", [source_spec]

    # Multiple columns: deduce:[col1, col2]
    columns_str = source_spec[1:-1]  # Remove brackets

    if not columns_str.strip():
        raise ValidationError(
            f"Deduce column list cannot be empty: {strategy_spec}"
        )

    # str.split never returns an empty list, so blank names must be filtered
    # out for the emptiness check below to be meaningful.
    columns = [
        name
        for name in (raw.strip() for raw in columns_str.split(","))
        if name
    ]

    if not columns:
        raise ValidationError(
            f"Deduce strategy must specify at least one column: {strategy_spec}"
        )

    return "deduce", columns
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
def is_deduce_strategy(strategy_spec: str) -> bool:
    """
    Tell whether a strategy specification names the deduce strategy.

    Args:
        strategy_spec: Strategy value to inspect; non-string values are
            accepted and simply yield False.

    Returns:
        True when the value is a string beginning with "deduce:",
        False otherwise
    """
    # Guard first so non-string inputs never reach str.startswith
    if not isinstance(strategy_spec, str):
        return False
    return strategy_spec.startswith("deduce:")
|