additory 0.1.0a4-py3-none-any.whl → 0.1.1a1-py3-none-any.whl
This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those published versions.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/synthetic/synthesizer.py (removed; 713 lines)
@@ -1,713 +0,0 @@
-"""
-Data Augmentation Engine - Polars-Only Architecture
-
-Provides functionality to augment dataframes by adding synthetic rows
-based on existing data patterns.
-
-Architecture:
-1. Detect input format (pandas/polars/cuDF)
-2. Convert to Polars via Arrow bridge (if needed)
-3. Process augmentation in Polars
-4. Convert back to original format via Arrow bridge
-"""
-
-from typing import Union, Optional, Any, Dict, Literal
-import random
-
-from additory.common.backend import detect_backend, to_polars, from_polars
-from additory.common.exceptions import ValidationError, AugmentError
-from additory.common.validation import validate_dataframe
-from additory.common.sample_data import get_sample_dataset
-from additory.synthetic.strategies import (
-    parse_strategy_dict,
-    get_column_strategy,
-    apply_increment_strategy,
-    apply_choice_strategy,
-    apply_range_strategy,
-    parse_strategy_params
-)
-
-# Linked lists feature imports
-from additory.synthetic.namespace_lookup import lookup_linked_list
-from additory.synthetic.linked_list_parser import (
-    parse_linked_list,
-    generate_linked_list_data
-)
-from additory.synthetic.column_name_resolver import resolve_column_names
-
-
-def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
-    """
-    Validate that all strategies are generative (not augmentative).
-
-    Generative strategies can create data from scratch:
-    - increment (with start parameter)
-    - range
-    - choice
-    - lists (inline linked lists)
-
-    Augmentative strategies require existing data:
-    - auto (random sampling)
-    - forecast (time series)
-    - seasonal (time series)
-    - smote (synthetic minority oversampling)
-
-    Args:
-        strategy_dict: Dictionary mapping column names to strategy specs
-
-    Raises:
-        ValidationError: If any augmentative strategies are found
-    """
-    augmentative_strategies = ["auto", "forecast", "seasonal", "smote"]
-
-    invalid_columns = []
-
-    for col, strategy_spec in strategy_dict.items():
-        if col == "__default__":
-            continue
-
-        # Get the base strategy name (before any parameters)
-        strategy_name = strategy_spec.split(":")[0].strip()
-
-        # Handle lists@ pattern
-        if strategy_name.startswith("lists@"):
-            continue  # Valid generative strategy
-
-        if strategy_name in augmentative_strategies:
-            invalid_columns.append((col, strategy_name))
-
-    if invalid_columns:
-        error_lines = [
-            f"Create mode requires generative strategies. Found augmentative strategies:"
-        ]
-        for col, strat in invalid_columns:
-            error_lines.append(f"  - Column '{col}': '{strat}'")
-
-        error_lines.append("")
-        error_lines.append("Valid generative strategies:")
-        error_lines.append("  - increment (with start parameter)")
-        error_lines.append("  - range:min-max")
-        error_lines.append("  - choice:[value1,value2,...]")
-        error_lines.append("  - lists@variable_name (inline linked lists)")
-
-        raise ValidationError("\n".join(error_lines))
-
-
-def _detect_mode(df: Any) -> Literal["augment", "create", "sample"]:
-    """
-    Detect the augmentation mode based on the df parameter.
-
-    Three modes are supported:
-    1. "augment" - Augment an existing DataFrame (default)
-    2. "create" - Create data from scratch using "@new" sentinel
-    3. "sample" - Load and optionally augment sample dataset using "@sample" sentinel
-
-    Args:
-        df: Input parameter (DataFrame or sentinel string)
-
-    Returns:
-        Mode string: "augment", "create", or "sample"
-
-    Raises:
-        ValidationError: If df is an invalid string (not "@new" or "@sample")
-    """
-    # Check for sentinel values
-    if isinstance(df, str):
-        if df == "@new":
-            return "create"
-        elif df == "@sample":
-            return "sample"
-        else:
-            # Provide helpful error messages
-            if df.lower() in ["new", "create"]:
-                raise ValidationError(
-                    f"Invalid input: '{df}'. Did you mean '@new'? "
-                    "Use '@new' to create data from scratch."
-                )
-            elif df.lower() in ["sample", "samples"]:
-                raise ValidationError(
-                    f"Invalid input: '{df}'. Did you mean '@sample'? "
-                    "Use '@sample' to load sample dataset."
-                )
-            else:
-                raise ValidationError(
-                    f"Invalid string input: '{df}'. "
-                    "Expected a DataFrame, '@new' (create mode), or '@sample' (sample mode)."
-                )
-
-    # If not a string, assume it's a DataFrame (augment mode)
-    return "augment"
-
-
-def _parse_n_rows(n_rows: Union[int, str], df_length: int) -> int:
-    """
-    Parse n_rows parameter to get actual number of rows to generate.
-
-    Args:
-        n_rows: Number of rows (int), percentage ("50%"), or multiplier ("2x")
-        df_length: Length of the input dataframe
-
-    Returns:
-        Actual number of rows to generate
-
-    Raises:
-        ValidationError: If n_rows format is invalid
-    """
-    if isinstance(n_rows, int):
-        if n_rows <= 0:
-            raise ValidationError("n_rows must be positive")
-        return n_rows
-
-    if isinstance(n_rows, str):
-        n_rows = n_rows.strip()
-
-        # Handle percentage: "50%"
-        if n_rows.endswith("%"):
-            try:
-                percentage = float(n_rows[:-1])
-                if percentage <= 0:
-                    raise ValidationError("Percentage must be positive")
-                return max(1, int(df_length * percentage / 100))
-            except ValueError:
-                raise ValidationError(f"Invalid percentage format: {n_rows}")
-
-        # Handle multiplier: "2x"
-        if n_rows.endswith("x"):
-            try:
-                multiplier = float(n_rows[:-1])
-                if multiplier <= 0:
-                    raise ValidationError("Multiplier must be positive")
-                return max(1, int(df_length * multiplier))
-            except ValueError:
-                raise ValidationError(f"Invalid multiplier format: {n_rows}")
-
-        raise ValidationError(
-            f"Invalid n_rows format: {n_rows}. "
-            "Use int (5), percentage ('50%'), or multiplier ('2x')"
-        )
-
-    raise ValidationError(f"n_rows must be int or str, got {type(n_rows)}")
-
-
-def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str, str], seed: Optional[int]) -> Any:
-    """
-    Augment Polars DataFrame with strategy support.
-
-    This is the core augmentation engine that processes all dataframes
-    (pandas, polars, cuDF) after conversion to Polars format.
-
-    Args:
-        df_polars: Input Polars DataFrame
-        n_rows: Number of rows to generate
-        strategy_dict: Column-specific strategies
-        seed: Random seed for reproducibility
-
-    Returns:
-        Augmented Polars DataFrame
-    """
-    import polars as pl
-
-    # Get column names
-    columns = df_polars.columns
-
-    # Check if any column uses non-auto strategy
-    has_custom_strategy = any(
-        not get_column_strategy(col, strategy_dict).startswith("auto")
-        for col in columns
-    )
-
-    if not has_custom_strategy:
-        # Simple random sampling (original behavior)
-        if seed is not None:
-            sampled = df_polars.sample(n=n_rows, with_replacement=True, seed=seed)
-        else:
-            sampled = df_polars.sample(n=n_rows, with_replacement=True)
-    else:
-        # Build new rows column by column
-        new_data = {}
-
-        for col in columns:
-            col_strategy = get_column_strategy(col, strategy_dict)
-
-            if col_strategy.startswith("increment"):
-                # Generate incremented values
-                new_values = apply_increment_strategy(
-                    df_polars, col, col_strategy, n_rows
-                )
-                new_data[col] = new_values
-            elif col_strategy.startswith("range"):
-                # Parse range parameters
-                strategy_name, params = parse_strategy_params(col_strategy)
-
-                if "min" not in params or "max" not in params:
-                    raise ValidationError(
-                        f"Range strategy for column '{col}' requires min and max parameters. "
-                        f"Use format: 'range:min-max' (e.g., 'range:18-65')"
-                    )
-
-                # Generate range values
-                new_values = apply_range_strategy(
-                    min_val=params["min"],
-                    max_val=params["max"],
-                    n_rows=n_rows,
-                    seed=seed
-                )
-                new_data[col] = new_values
-            elif col_strategy.startswith("choice"):
-                # Generate choice values
-                new_values = apply_choice_strategy(
-                    col_strategy, n_rows, seed
-                )
-                new_data[col] = new_values
-            elif col_strategy.startswith("forecast"):
-                # Import here to avoid circular dependency
-                from additory.synthetic.strategies import apply_forecast_strategy
-
-                # Generate forecasted values
-                new_values = apply_forecast_strategy(
-                    df_polars, col, col_strategy, n_rows, seed
-                )
-
-                # Cast to match original column dtype if needed
-                original_dtype = df_polars[col].dtype
-                if original_dtype.is_integer():
-                    # Round and convert to int for integer columns
-                    new_values = [int(round(v)) for v in new_values]
-
-                new_data[col] = new_values
-            elif col_strategy.startswith(("normal", "uniform", "skewed_left", "skewed_right", "beta", "gamma", "exponential", "kde")):
-                # Import here to avoid circular dependency
-                from additory.synthetic.strategies import apply_distribution_strategy
-
-                # Generate distribution values
-                new_values = apply_distribution_strategy(
-                    df_polars, col, col_strategy, n_rows, seed
-                )
-
-                # Cast to match original column dtype if needed
-                original_dtype = df_polars[col].dtype
-                if original_dtype.is_integer():
-                    # Round and convert to int for integer columns
-                    new_values = [int(round(v)) for v in new_values]
-
-                new_data[col] = new_values
-            else:
-                # Random sampling for this column (auto)
-                if seed is not None:
-                    sampled_col = df_polars.select(col).sample(n=n_rows, with_replacement=True, seed=seed)
-                else:
-                    sampled_col = df_polars.select(col).sample(n=n_rows, with_replacement=True)
-                new_data[col] = sampled_col[col].to_list()
-
-        sampled = pl.DataFrame(new_data)
-
-    # Concatenate original and new rows
-    result = pl.concat([df_polars, sampled])
-
-    return result
-
-
-def _create_from_scratch_engine(
-    n_rows: int,
-    strategy_dict: Dict[str, str],
-    seed: Optional[int]
-) -> Any:
-    """
-    Create DataFrame from scratch using generative strategies.
-
-    This engine generates data column by column without requiring
-    an existing DataFrame. All strategies must be generative.
-
-    Generative strategies (supported):
-    - increment (with start parameter)
-    - range
-    - choice
-
-    Augmentative strategies (NOT supported):
-    - auto (requires existing data)
-    - forecast (requires time series)
-    - seasonal (requires time series)
-    - smote (requires existing data)
-
-    Args:
-        n_rows: Number of rows to generate
-        strategy_dict: Column-specific strategies (all must be generative)
-        seed: Random seed for reproducibility
-
-    Returns:
-        Polars DataFrame with generated data
-
-    Raises:
-        ValidationError: If any augmentative strategies are found
-
-    Examples:
-        >>> # Simple create with increment and range
-        >>> result = _create_from_scratch_engine(
-        ...     n_rows=10,
-        ...     strategy_dict={
-        ...         "id": "increment:start=1",
-        ...         "age": "range:18-65"
-        ...     },
-        ...     seed=42
-        ... )
-        >>> result.shape
-        (10, 2)
-
-        >>> # Create with mixed strategies
-        >>> result = _create_from_scratch_engine(
-        ...     n_rows=100,
-        ...     strategy_dict={
-        ...         "id": "increment:start=1",
-        ...         "emp_id": "increment:start=1:pattern=EMP_[001]",
-        ...         "age": "range:18-65",
-        ...         "status": "choice:[Active,Inactive,Pending]"
-        ...     },
-        ...     seed=42
-        ... )
-        >>> result.shape
-        (100, 4)
-    """
-    import polars as pl
-
-    # Validate all strategies are generative
-    _validate_generative_strategies(strategy_dict)
-
-    # Pre-process linked lists strategies
-    # Linked lists generate multiple columns, so we need to expand strategy_dict
-    expanded_strategy_dict = {}
-    lists_to_process = []  # Store (original_key, var_name, parsed_data, column_names)
-
-    for col, col_strategy in strategy_dict.items():
-        if col == "__default__":
-            continue
-
-        # Check for lists@ pattern
-        if col_strategy.startswith("lists@"):
-            # Extract variable name
-            var_name = col_strategy[6:].strip()  # Remove "lists@" prefix
-
-            try:
-                # Lookup variable in namespace
-                # Depth=5: user -> add.synthetic (API) -> synthetic() -> _create_from_scratch_engine -> here
-                linked_list_data = lookup_linked_list(var_name, depth=5)
-
-                # Parse linked list
-                parsed_data = parse_linked_list(linked_list_data)
-
-                # Resolve column names
-                column_names = resolve_column_names(
-                    list_name=var_name,
-                    strategy_key=col,
-                    num_columns=parsed_data['num_columns'],
-                    explicit_names=parsed_data['column_names']
-                )
-
-                # Store for later processing
-                lists_to_process.append((col, var_name, parsed_data, column_names))
-
-            except ValidationError as e:
-                raise ValidationError(f"Linked list error for column '{col}': {e}")
-        else:
-            # Regular strategy - keep as is
-            expanded_strategy_dict[col] = col_strategy
-
-    # Build data column by column
-    new_data = {}
-
-    # Process regular strategies first
-    for col, col_strategy in expanded_strategy_dict.items():
-        if col == "__default__":
-            continue
-
-        if col_strategy.startswith("increment"):
-            # Parse parameters for increment strategy
-            strategy_name, params = parse_strategy_params(col_strategy)
-
-            # Generate incremented values (create mode)
-            new_values = apply_increment_strategy(
-                df_polars=None,  # No DataFrame in create mode
-                column=col,
-                strategy_spec=col_strategy,
-                n_rows=n_rows,
-                params=params
-            )
-            new_data[col] = new_values
-
-        elif col_strategy.startswith("range"):
-            # Parse range parameters
-            strategy_name, params = parse_strategy_params(col_strategy)
-
-            if "min" not in params or "max" not in params:
-                raise ValidationError(
-                    f"Range strategy for column '{col}' requires min and max parameters. "
-                    f"Use format: 'range:min-max' (e.g., 'range:18-65')"
-                )
-
-            # Generate range values
-            new_values = apply_range_strategy(
-                min_val=params["min"],
-                max_val=params["max"],
-                n_rows=n_rows,
-                seed=seed
-            )
-            new_data[col] = new_values
-
-        elif col_strategy.startswith("choice"):
-            # Generate choice values
-            new_values = apply_choice_strategy(
-                col_strategy, n_rows, seed
-            )
-            new_data[col] = new_values
-
-        else:
-            raise ValidationError(
-                f"Unknown or unsupported strategy for column '{col}': '{col_strategy}'"
-            )
-
-    # Process linked lists strategies
-    for original_key, var_name, parsed_data, column_names in lists_to_process:
-        # Generate data rows
-        data_rows = generate_linked_list_data(parsed_data, n_rows, seed)
-
-        # Transpose: list of tuples -> dict of lists
-        # data_rows = [(val1_col1, val1_col2), (val2_col1, val2_col2), ...]
-        # -> {col1: [val1_col1, val2_col1, ...], col2: [val1_col2, val2_col2, ...]}
-        for col_idx, col_name in enumerate(column_names):
-            new_data[col_name] = [row[col_idx] for row in data_rows]
-
-    # Build Polars DataFrame from generated columns
-    result = pl.DataFrame(new_data)
-
-    return result
-
-
-def synthetic(
-    df: Any,
-    n_rows: Union[int, str] = 5,
-    strategy: Union[str, Dict[str, str]] = "auto",
-    seed: Optional[int] = None,
-    output_format: str = "pandas"
-) -> Any:
-    """
-    Generate synthetic data by extending a dataframe or creating from scratch.
-
-    Uses Polars-only architecture:
-    1. Detect input format (pandas/polars/cuDF)
-    2. Convert to Polars via Arrow bridge (if needed)
-    3. Process synthetic data generation in Polars
-    4. Convert back to original format via Arrow bridge
-
-    This function adds new rows to a dataframe using various strategies:
-    - "auto": Random sampling from existing values (default)
-    - "increment": Increment numeric or pattern-based values
-    - "range:min-max": Random integers within range
-    - "choice:[...]": Random selection from inline list
-    - "lists@variable_name": Inline linked lists (generates multiple columns)
-    - "forecast:method": Time series forecasting (linear, polynomial, exponential, seasonal)
-    - "normal": Normal distribution generation
-    - "uniform": Uniform distribution generation
-    - "skewed_left/skewed_right": Skewed distribution generation
-    - "smote": Synthetic Minority Over-sampling Technique
-
-    Args:
-        df: Input dataframe (pandas, polars, or cudf), or sentinel:
-            - DataFrame: Augment mode (add rows to existing data)
-            - "@new": Create mode (generate data from scratch)
-            - "@sample": Sample mode (load sample dataset)
-        n_rows: Number of rows to add. Can be:
-            - int: Exact number (e.g., 5)
-            - str percentage: Percentage of current size (e.g., "50%")
-            - str multiplier: Multiple of current size (e.g., "2x")
-        strategy: Augmentation strategy. Can be:
-            - str: "auto" (applies to all columns)
-            - dict: Column-specific strategies, e.g.:
-                {
-                    "id": "increment",
-                    "emp_id": "increment:EMP_[001]_ID",
-                    "age": "range:18-65",
-                    "status": "choice:[Active,Inactive,Pending]",
-                    "sales": "forecast:seasonal:period=12",
-                    "score": "normal:mean=75:std=10",
-                    "income": "skewed_right:skewness=1.5"
-                }
-            Unlisted columns default to "auto"
-        seed: Random seed for reproducibility. If None, results will vary.
-        output_format: Output format for create/sample modes. Options:
-            - "pandas": Return pandas DataFrame (default)
-            - "polars": Return Polars DataFrame
-            - "cudf": Return cuDF DataFrame
-            Note: In augment mode (with DataFrame input), output format
-            matches input format and this parameter is ignored.
-
-    Returns:
-        Augmented dataframe with original + new rows (same type as input)
-
-    Raises:
-        ValidationError: If input validation fails
-        AugmentError: If augmentation fails
-
-    Examples:
-        >>> # Add 5 rows with random sampling (default)
-        >>> df_aug = add.augment(df)
-
-        >>> # Increment numeric ID column
-        >>> df_aug = add.augment(df, strategy={"id": "increment"})
-
-        >>> # Forecast sales with seasonal pattern
-        >>> df_aug = add.augment(df, n_rows=24, strategy={
-        ...     "sales": "forecast:seasonal:period=12"
-        ... })
-
-        >>> # Generate from normal distribution
-        >>> df_aug = add.augment(df, n_rows=100, strategy={
-        ...     "age": "normal:mean=35:std=10",
-        ...     "score": "uniform:min=0:max=100"
-        ... })
-
-        >>> # Mixed strategies
-        >>> df_aug = add.augment(df, n_rows=100, strategy={
-        ...     "id": "increment",
-        ...     "age": "range:18-65",
-        ...     "status": "choice:[Active,Inactive]",
-        ...     "sales": "forecast:linear",
-        ...     "score": "normal:auto"
-        ... })
-
-        >>> # Create data from scratch (returns pandas by default)
-        >>> df_new = add.augment("@new", n_rows=50, strategy={
-        ...     "id": "increment:start=1",
-        ...     "age": "range:18-65",
-        ...     "status": "choice:[Active,Inactive]"
-        ... })
-    """
-    # Detect mode
-    mode = _detect_mode(df)
-
-    # Validate output_format parameter
-    valid_formats = ["pandas", "polars", "cudf"]
-    if output_format not in valid_formats:
-        raise ValidationError(
-            f"Invalid output_format: '{output_format}'. "
-            f"Must be one of: {', '.join(valid_formats)}"
-        )
-
-    # Parse and validate strategy
-    try:
-        strategy_dict = parse_strategy_dict(strategy)
-    except ValidationError as e:
-        raise ValidationError(f"Invalid strategy parameter: {e}")
-
-    # Handle create mode
-    if mode == "create":
-        # Validate create mode requirements
-        if not isinstance(strategy, dict):
-            raise ValidationError(
-                "Create mode requires a strategy dict with column definitions. "
-                "Example: strategy={'id': 'increment:start=1', 'age': 'range:18-65'}"
-            )
-
-        if not strategy or len(strategy) == 0:
-            raise ValidationError(
-                "Create mode requires at least one column in strategy dict"
-            )
-
-        if not isinstance(n_rows, int):
-            raise ValidationError(
-                f"Create mode requires n_rows to be an integer, got {type(n_rows).__name__}. "
-                "Percentage ('50%') and multiplier ('2x') formats are not supported in create mode."
-            )
-
-        if n_rows <= 0:
-            raise ValidationError("n_rows must be positive")
-
-        try:
-            # Generate data from scratch
-            result_polars = _create_from_scratch_engine(n_rows, strategy_dict, seed)
-
-            # Convert to requested output format
-            result_df = from_polars(result_polars, output_format)
-
-            # Memory cleanup
-            del result_polars
-            import gc
-            gc.collect()
-
-            return result_df
-
-        except Exception as e:
-            if isinstance(e, (ValidationError, AugmentError)):
-                raise
-            raise AugmentError(f"Create mode failed: {e}")
-
-    # Handle sample mode
-    if mode == "sample":
-        # Load sample dataset
-        try:
-            df = get_sample_dataset("augment", "sample", "clean")
-        except Exception as e:
-            raise ValidationError(f"Failed to load sample dataset: {e}")
-
-        # Continue to augment mode with loaded sample
-        # (will use output_format at the end)
-
-    # Augment mode (original behavior)
-    # Validate input dataframe
-    validate_dataframe(df, "df")
-
-    # Check minimum size - require at least 3 rows for meaningful augmentation
-    MIN_ROWS = 3
-    df_length = len(df)
-    if df_length < MIN_ROWS:
-        raise ValidationError(
-            f"Minimum {MIN_ROWS} rows required for augmentation. "
-            f"Current size: {df_length}"
-        )
-
-    # Parse n_rows
-    try:
-        actual_n_rows = _parse_n_rows(n_rows, df_length)
-    except ValidationError as e:
-        raise ValidationError(f"Invalid n_rows parameter: {e}")
-
-    # Detect input backend (for augment mode, use input format; for sample mode, use output_format)
-    if mode == "sample":
-        input_backend = output_format
-    else:
-        input_backend = detect_backend(df)
-
-    # Augment using Polars-only architecture
-    try:
-        # 1. Convert to Polars via Arrow bridge
-        df_polars = to_polars(df, input_backend if mode != "sample" else "polars")
-
-        # Memory cleanup: delete original if converted
-        if mode != "sample" and input_backend != 'polars':
-            del df
-            import gc
-            gc.collect()
-
-        # 2. Process augmentation in Polars
-        result_polars = _augment_polars_engine(df_polars, actual_n_rows, strategy_dict, seed)
-
-        # Memory cleanup: delete intermediate Polars DataFrame
-        del df_polars
-        import gc
-        gc.collect()
-
-        # 3. Convert back to target format
-        # In augment mode: match input format
-        # In sample mode: use output_format parameter
-        target_backend = output_format if mode == "sample" else input_backend
-        result_df = from_polars(result_polars, target_backend)
-
-        # Final memory cleanup
-        del result_polars
-        import gc
-        gc.collect()

-        return result_df
-
-    except Exception as e:
-        if isinstance(e, (ValidationError, AugmentError)):
-            raise
-        raise AugmentError(f"Augmentation failed: {e}")