additory-0.1.0a4-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/synthetic/synthesizer.py (deleted)
@@ -1,713 +0,0 @@
- """
- Data Augmentation Engine - Polars-Only Architecture
-
- Provides functionality to augment dataframes by adding synthetic rows
- based on existing data patterns.
-
- Architecture:
- 1. Detect input format (pandas/polars/cuDF)
- 2. Convert to Polars via Arrow bridge (if needed)
- 3. Process augmentation in Polars
- 4. Convert back to original format via Arrow bridge
- """
-
- from typing import Union, Optional, Any, Dict, Literal
- import random
-
- from additory.common.backend import detect_backend, to_polars, from_polars
- from additory.common.exceptions import ValidationError, AugmentError
- from additory.common.validation import validate_dataframe
- from additory.common.sample_data import get_sample_dataset
- from additory.synthetic.strategies import (
-     parse_strategy_dict,
-     get_column_strategy,
-     apply_increment_strategy,
-     apply_choice_strategy,
-     apply_range_strategy,
-     parse_strategy_params
- )
-
- # Linked lists feature imports
- from additory.synthetic.namespace_lookup import lookup_linked_list
- from additory.synthetic.linked_list_parser import (
-     parse_linked_list,
-     generate_linked_list_data
- )
- from additory.synthetic.column_name_resolver import resolve_column_names
-
-
- def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
-     """
-     Validate that all strategies are generative (not augmentative).
-
-     Generative strategies can create data from scratch:
-     - increment (with start parameter)
-     - range
-     - choice
-     - lists (inline linked lists)
-
-     Augmentative strategies require existing data:
-     - auto (random sampling)
-     - forecast (time series)
-     - seasonal (time series)
-     - smote (synthetic minority oversampling)
-
-     Args:
-         strategy_dict: Dictionary mapping column names to strategy specs
-
-     Raises:
-         ValidationError: If any augmentative strategies are found
-     """
-     augmentative_strategies = ["auto", "forecast", "seasonal", "smote"]
-
-     invalid_columns = []
-
-     for col, strategy_spec in strategy_dict.items():
-         if col == "__default__":
-             continue
-
-         # Get the base strategy name (before any parameters)
-         strategy_name = strategy_spec.split(":")[0].strip()
-
-         # Handle lists@ pattern
-         if strategy_name.startswith("lists@"):
-             continue  # Valid generative strategy
-
-         if strategy_name in augmentative_strategies:
-             invalid_columns.append((col, strategy_name))
-
-     if invalid_columns:
-         error_lines = [
-             f"Create mode requires generative strategies. Found augmentative strategies:"
-         ]
-         for col, strat in invalid_columns:
-             error_lines.append(f" - Column '{col}': '{strat}'")
-
-         error_lines.append("")
-         error_lines.append("Valid generative strategies:")
-         error_lines.append(" - increment (with start parameter)")
-         error_lines.append(" - range:min-max")
-         error_lines.append(" - choice:[value1,value2,...]")
-         error_lines.append(" - lists@variable_name (inline linked lists)")
-
-         raise ValidationError("\n".join(error_lines))
-
-
- def _detect_mode(df: Any) -> Literal["augment", "create", "sample"]:
-     """
-     Detect the augmentation mode based on the df parameter.
-
-     Three modes are supported:
-     1. "augment" - Augment an existing DataFrame (default)
-     2. "create" - Create data from scratch using "@new" sentinel
-     3. "sample" - Load and optionally augment sample dataset using "@sample" sentinel
-
-     Args:
-         df: Input parameter (DataFrame or sentinel string)
-
-     Returns:
-         Mode string: "augment", "create", or "sample"
-
-     Raises:
-         ValidationError: If df is an invalid string (not "@new" or "@sample")
-     """
-     # Check for sentinel values
-     if isinstance(df, str):
-         if df == "@new":
-             return "create"
-         elif df == "@sample":
-             return "sample"
-         else:
-             # Provide helpful error messages
-             if df.lower() in ["new", "create"]:
-                 raise ValidationError(
-                     f"Invalid input: '{df}'. Did you mean '@new'? "
-                     "Use '@new' to create data from scratch."
-                 )
-             elif df.lower() in ["sample", "samples"]:
-                 raise ValidationError(
-                     f"Invalid input: '{df}'. Did you mean '@sample'? "
-                     "Use '@sample' to load sample dataset."
-                 )
-             else:
-                 raise ValidationError(
-                     f"Invalid string input: '{df}'. "
-                     "Expected a DataFrame, '@new' (create mode), or '@sample' (sample mode)."
-                 )
-
-     # If not a string, assume it's a DataFrame (augment mode)
-     return "augment"
-
-
- def _parse_n_rows(n_rows: Union[int, str], df_length: int) -> int:
-     """
-     Parse n_rows parameter to get actual number of rows to generate.
-
-     Args:
-         n_rows: Number of rows (int), percentage ("50%"), or multiplier ("2x")
-         df_length: Length of the input dataframe
-
-     Returns:
-         Actual number of rows to generate
-
-     Raises:
-         ValidationError: If n_rows format is invalid
-     """
-     if isinstance(n_rows, int):
-         if n_rows <= 0:
-             raise ValidationError("n_rows must be positive")
-         return n_rows
-
-     if isinstance(n_rows, str):
-         n_rows = n_rows.strip()
-
-         # Handle percentage: "50%"
-         if n_rows.endswith("%"):
-             try:
-                 percentage = float(n_rows[:-1])
-                 if percentage <= 0:
-                     raise ValidationError("Percentage must be positive")
-                 return max(1, int(df_length * percentage / 100))
-             except ValueError:
-                 raise ValidationError(f"Invalid percentage format: {n_rows}")
-
-         # Handle multiplier: "2x"
-         if n_rows.endswith("x"):
-             try:
-                 multiplier = float(n_rows[:-1])
-                 if multiplier <= 0:
-                     raise ValidationError("Multiplier must be positive")
-                 return max(1, int(df_length * multiplier))
-             except ValueError:
-                 raise ValidationError(f"Invalid multiplier format: {n_rows}")
-
-         raise ValidationError(
-             f"Invalid n_rows format: {n_rows}. "
-             "Use int (5), percentage ('50%'), or multiplier ('2x')"
-         )
-
-     raise ValidationError(f"n_rows must be int or str, got {type(n_rows)}")
-
-
- def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str, str], seed: Optional[int]) -> Any:
-     """
-     Augment Polars DataFrame with strategy support.
-
-     This is the core augmentation engine that processes all dataframes
-     (pandas, polars, cuDF) after conversion to Polars format.
-
-     Args:
-         df_polars: Input Polars DataFrame
-         n_rows: Number of rows to generate
-         strategy_dict: Column-specific strategies
-         seed: Random seed for reproducibility
-
-     Returns:
-         Augmented Polars DataFrame
-     """
-     import polars as pl
-
-     # Get column names
-     columns = df_polars.columns
-
-     # Check if any column uses non-auto strategy
-     has_custom_strategy = any(
-         not get_column_strategy(col, strategy_dict).startswith("auto")
-         for col in columns
-     )
-
-     if not has_custom_strategy:
-         # Simple random sampling (original behavior)
-         if seed is not None:
-             sampled = df_polars.sample(n=n_rows, with_replacement=True, seed=seed)
-         else:
-             sampled = df_polars.sample(n=n_rows, with_replacement=True)
-     else:
-         # Build new rows column by column
-         new_data = {}
-
-         for col in columns:
-             col_strategy = get_column_strategy(col, strategy_dict)
-
-             if col_strategy.startswith("increment"):
-                 # Generate incremented values
-                 new_values = apply_increment_strategy(
-                     df_polars, col, col_strategy, n_rows
-                 )
-                 new_data[col] = new_values
-             elif col_strategy.startswith("range"):
-                 # Parse range parameters
-                 strategy_name, params = parse_strategy_params(col_strategy)
-
-                 if "min" not in params or "max" not in params:
-                     raise ValidationError(
-                         f"Range strategy for column '{col}' requires min and max parameters. "
-                         f"Use format: 'range:min-max' (e.g., 'range:18-65')"
-                     )
-
-                 # Generate range values
-                 new_values = apply_range_strategy(
-                     min_val=params["min"],
-                     max_val=params["max"],
-                     n_rows=n_rows,
-                     seed=seed
-                 )
-                 new_data[col] = new_values
-             elif col_strategy.startswith("choice"):
-                 # Generate choice values
-                 new_values = apply_choice_strategy(
-                     col_strategy, n_rows, seed
-                 )
-                 new_data[col] = new_values
-             elif col_strategy.startswith("forecast"):
-                 # Import here to avoid circular dependency
-                 from additory.synthetic.strategies import apply_forecast_strategy
-
-                 # Generate forecasted values
-                 new_values = apply_forecast_strategy(
-                     df_polars, col, col_strategy, n_rows, seed
-                 )
-
-                 # Cast to match original column dtype if needed
-                 original_dtype = df_polars[col].dtype
-                 if original_dtype.is_integer():
-                     # Round and convert to int for integer columns
-                     new_values = [int(round(v)) for v in new_values]
-
-                 new_data[col] = new_values
-             elif col_strategy.startswith(("normal", "uniform", "skewed_left", "skewed_right", "beta", "gamma", "exponential", "kde")):
-                 # Import here to avoid circular dependency
-                 from additory.synthetic.strategies import apply_distribution_strategy
-
-                 # Generate distribution values
-                 new_values = apply_distribution_strategy(
-                     df_polars, col, col_strategy, n_rows, seed
-                 )
-
-                 # Cast to match original column dtype if needed
-                 original_dtype = df_polars[col].dtype
-                 if original_dtype.is_integer():
-                     # Round and convert to int for integer columns
-                     new_values = [int(round(v)) for v in new_values]
-
-                 new_data[col] = new_values
-             else:
-                 # Random sampling for this column (auto)
-                 if seed is not None:
-                     sampled_col = df_polars.select(col).sample(n=n_rows, with_replacement=True, seed=seed)
-                 else:
-                     sampled_col = df_polars.select(col).sample(n=n_rows, with_replacement=True)
-                 new_data[col] = sampled_col[col].to_list()
-
-         sampled = pl.DataFrame(new_data)
-
-     # Concatenate original and new rows
-     result = pl.concat([df_polars, sampled])
-
-     return result
-
-
- def _create_from_scratch_engine(
-     n_rows: int,
-     strategy_dict: Dict[str, str],
-     seed: Optional[int]
- ) -> Any:
-     """
-     Create DataFrame from scratch using generative strategies.
-
-     This engine generates data column by column without requiring
-     an existing DataFrame. All strategies must be generative.
-
-     Generative strategies (supported):
-     - increment (with start parameter)
-     - range
-     - choice
-
-     Augmentative strategies (NOT supported):
-     - auto (requires existing data)
-     - forecast (requires time series)
-     - seasonal (requires time series)
-     - smote (requires existing data)
-
-     Args:
-         n_rows: Number of rows to generate
-         strategy_dict: Column-specific strategies (all must be generative)
-         seed: Random seed for reproducibility
-
-     Returns:
-         Polars DataFrame with generated data
-
-     Raises:
-         ValidationError: If any augmentative strategies are found
-
-     Examples:
-         >>> # Simple create with increment and range
-         >>> result = _create_from_scratch_engine(
-         ...     n_rows=10,
-         ...     strategy_dict={
-         ...         "id": "increment:start=1",
-         ...         "age": "range:18-65"
-         ...     },
-         ...     seed=42
-         ... )
-         >>> result.shape
-         (10, 2)
-
-         >>> # Create with mixed strategies
-         >>> result = _create_from_scratch_engine(
-         ...     n_rows=100,
-         ...     strategy_dict={
-         ...         "id": "increment:start=1",
-         ...         "emp_id": "increment:start=1:pattern=EMP_[001]",
-         ...         "age": "range:18-65",
-         ...         "status": "choice:[Active,Inactive,Pending]"
-         ...     },
-         ...     seed=42
-         ... )
-         >>> result.shape
-         (100, 4)
-     """
-     import polars as pl
-
-     # Validate all strategies are generative
-     _validate_generative_strategies(strategy_dict)
-
-     # Pre-process linked lists strategies
-     # Linked lists generate multiple columns, so we need to expand strategy_dict
-     expanded_strategy_dict = {}
-     lists_to_process = []  # Store (original_key, var_name, parsed_data, column_names)
-
-     for col, col_strategy in strategy_dict.items():
-         if col == "__default__":
-             continue
-
-         # Check for lists@ pattern
-         if col_strategy.startswith("lists@"):
-             # Extract variable name
-             var_name = col_strategy[6:].strip()  # Remove "lists@" prefix
-
-             try:
-                 # Lookup variable in namespace
-                 # Depth=5: user -> add.synthetic (API) -> synthetic() -> _create_from_scratch_engine -> here
-                 linked_list_data = lookup_linked_list(var_name, depth=5)
-
-                 # Parse linked list
-                 parsed_data = parse_linked_list(linked_list_data)
-
-                 # Resolve column names
-                 column_names = resolve_column_names(
-                     list_name=var_name,
-                     strategy_key=col,
-                     num_columns=parsed_data['num_columns'],
-                     explicit_names=parsed_data['column_names']
-                 )
-
-                 # Store for later processing
-                 lists_to_process.append((col, var_name, parsed_data, column_names))
-
-             except ValidationError as e:
-                 raise ValidationError(f"Linked list error for column '{col}': {e}")
-         else:
-             # Regular strategy - keep as is
-             expanded_strategy_dict[col] = col_strategy
-
-     # Build data column by column
-     new_data = {}
-
-     # Process regular strategies first
-     for col, col_strategy in expanded_strategy_dict.items():
-         if col == "__default__":
-             continue
-
-         if col_strategy.startswith("increment"):
-             # Parse parameters for increment strategy
-             strategy_name, params = parse_strategy_params(col_strategy)
-
-             # Generate incremented values (create mode)
-             new_values = apply_increment_strategy(
-                 df_polars=None,  # No DataFrame in create mode
-                 column=col,
-                 strategy_spec=col_strategy,
-                 n_rows=n_rows,
-                 params=params
-             )
-             new_data[col] = new_values
-
-         elif col_strategy.startswith("range"):
-             # Parse range parameters
-             strategy_name, params = parse_strategy_params(col_strategy)
-
-             if "min" not in params or "max" not in params:
-                 raise ValidationError(
-                     f"Range strategy for column '{col}' requires min and max parameters. "
-                     f"Use format: 'range:min-max' (e.g., 'range:18-65')"
-                 )
-
-             # Generate range values
-             new_values = apply_range_strategy(
-                 min_val=params["min"],
-                 max_val=params["max"],
-                 n_rows=n_rows,
-                 seed=seed
-             )
-             new_data[col] = new_values
-
-         elif col_strategy.startswith("choice"):
-             # Generate choice values
-             new_values = apply_choice_strategy(
-                 col_strategy, n_rows, seed
-             )
-             new_data[col] = new_values
-
-         else:
-             raise ValidationError(
-                 f"Unknown or unsupported strategy for column '{col}': '{col_strategy}'"
-             )
-
-     # Process linked lists strategies
-     for original_key, var_name, parsed_data, column_names in lists_to_process:
-         # Generate data rows
-         data_rows = generate_linked_list_data(parsed_data, n_rows, seed)
-
-         # Transpose: list of tuples -> dict of lists
-         # data_rows = [(val1_col1, val1_col2), (val2_col1, val2_col2), ...]
-         # -> {col1: [val1_col1, val2_col1, ...], col2: [val1_col2, val2_col2, ...]}
-         for col_idx, col_name in enumerate(column_names):
-             new_data[col_name] = [row[col_idx] for row in data_rows]
-
-     # Build Polars DataFrame from generated columns
-     result = pl.DataFrame(new_data)
-
-     return result
-
-
- def synthetic(
-     df: Any,
-     n_rows: Union[int, str] = 5,
-     strategy: Union[str, Dict[str, str]] = "auto",
-     seed: Optional[int] = None,
-     output_format: str = "pandas"
- ) -> Any:
-     """
-     Generate synthetic data by extending a dataframe or creating from scratch.
-
-     Uses Polars-only architecture:
-     1. Detect input format (pandas/polars/cuDF)
-     2. Convert to Polars via Arrow bridge (if needed)
-     3. Process synthetic data generation in Polars
-     4. Convert back to original format via Arrow bridge
-
-     This function adds new rows to a dataframe using various strategies:
-     - "auto": Random sampling from existing values (default)
-     - "increment": Increment numeric or pattern-based values
-     - "range:min-max": Random integers within range
-     - "choice:[...]": Random selection from inline list
-     - "lists@variable_name": Inline linked lists (generates multiple columns)
-     - "forecast:method": Time series forecasting (linear, polynomial, exponential, seasonal)
-     - "normal": Normal distribution generation
-     - "uniform": Uniform distribution generation
-     - "skewed_left/skewed_right": Skewed distribution generation
-     - "smote": Synthetic Minority Over-sampling Technique
-
-     Args:
-         df: Input dataframe (pandas, polars, or cudf), or sentinel:
-             - DataFrame: Augment mode (add rows to existing data)
-             - "@new": Create mode (generate data from scratch)
-             - "@sample": Sample mode (load sample dataset)
-         n_rows: Number of rows to add. Can be:
-             - int: Exact number (e.g., 5)
-             - str percentage: Percentage of current size (e.g., "50%")
-             - str multiplier: Multiple of current size (e.g., "2x")
-         strategy: Augmentation strategy. Can be:
-             - str: "auto" (applies to all columns)
-             - dict: Column-specific strategies, e.g.:
-                 {
-                     "id": "increment",
-                     "emp_id": "increment:EMP_[001]_ID",
-                     "age": "range:18-65",
-                     "status": "choice:[Active,Inactive,Pending]",
-                     "sales": "forecast:seasonal:period=12",
-                     "score": "normal:mean=75:std=10",
-                     "income": "skewed_right:skewness=1.5"
-                 }
-               Unlisted columns default to "auto"
-         seed: Random seed for reproducibility. If None, results will vary.
-         output_format: Output format for create/sample modes. Options:
-             - "pandas": Return pandas DataFrame (default)
-             - "polars": Return Polars DataFrame
-             - "cudf": Return cuDF DataFrame
-             Note: In augment mode (with DataFrame input), output format
-             matches input format and this parameter is ignored.
-
-     Returns:
-         Augmented dataframe with original + new rows (same type as input)
-
-     Raises:
-         ValidationError: If input validation fails
-         AugmentError: If augmentation fails
-
-     Examples:
-         >>> # Add 5 rows with random sampling (default)
-         >>> df_aug = add.augment(df)
-
-         >>> # Increment numeric ID column
-         >>> df_aug = add.augment(df, strategy={"id": "increment"})
-
-         >>> # Forecast sales with seasonal pattern
-         >>> df_aug = add.augment(df, n_rows=24, strategy={
-         ...     "sales": "forecast:seasonal:period=12"
-         ... })
-
-         >>> # Generate from normal distribution
-         >>> df_aug = add.augment(df, n_rows=100, strategy={
-         ...     "age": "normal:mean=35:std=10",
-         ...     "score": "uniform:min=0:max=100"
-         ... })
-
-         >>> # Mixed strategies
-         >>> df_aug = add.augment(df, n_rows=100, strategy={
-         ...     "id": "increment",
-         ...     "age": "range:18-65",
-         ...     "status": "choice:[Active,Inactive]",
-         ...     "sales": "forecast:linear",
-         ...     "score": "normal:auto"
-         ... })
-
-         >>> # Create data from scratch (returns pandas by default)
-         >>> df_new = add.augment("@new", n_rows=50, strategy={
-         ...     "id": "increment:start=1",
-         ...     "age": "range:18-65",
-         ...     "status": "choice:[Active,Inactive]"
-         ... })
-     """
-     # Detect mode
-     mode = _detect_mode(df)
-
-     # Validate output_format parameter
-     valid_formats = ["pandas", "polars", "cudf"]
-     if output_format not in valid_formats:
-         raise ValidationError(
-             f"Invalid output_format: '{output_format}'. "
-             f"Must be one of: {', '.join(valid_formats)}"
-         )
-
-     # Parse and validate strategy
-     try:
-         strategy_dict = parse_strategy_dict(strategy)
-     except ValidationError as e:
-         raise ValidationError(f"Invalid strategy parameter: {e}")
-
-     # Handle create mode
-     if mode == "create":
-         # Validate create mode requirements
-         if not isinstance(strategy, dict):
-             raise ValidationError(
-                 "Create mode requires a strategy dict with column definitions. "
-                 "Example: strategy={'id': 'increment:start=1', 'age': 'range:18-65'}"
-             )
-
-         if not strategy or len(strategy) == 0:
-             raise ValidationError(
-                 "Create mode requires at least one column in strategy dict"
-             )
-
-         if not isinstance(n_rows, int):
-             raise ValidationError(
-                 f"Create mode requires n_rows to be an integer, got {type(n_rows).__name__}. "
-                 "Percentage ('50%') and multiplier ('2x') formats are not supported in create mode."
-             )
-
-         if n_rows <= 0:
-             raise ValidationError("n_rows must be positive")
-
-         try:
-             # Generate data from scratch
-             result_polars = _create_from_scratch_engine(n_rows, strategy_dict, seed)
-
-             # Convert to requested output format
-             result_df = from_polars(result_polars, output_format)
-
-             # Memory cleanup
-             del result_polars
-             import gc
-             gc.collect()
-
-             return result_df
-
-         except Exception as e:
-             if isinstance(e, (ValidationError, AugmentError)):
-                 raise
-             raise AugmentError(f"Create mode failed: {e}")
-
-     # Handle sample mode
-     if mode == "sample":
-         # Load sample dataset
-         try:
-             df = get_sample_dataset("augment", "sample", "clean")
-         except Exception as e:
-             raise ValidationError(f"Failed to load sample dataset: {e}")
-
-         # Continue to augment mode with loaded sample
-         # (will use output_format at the end)
-
-     # Augment mode (original behavior)
-     # Validate input dataframe
-     validate_dataframe(df, "df")
-
-     # Check minimum size - require at least 3 rows for meaningful augmentation
-     MIN_ROWS = 3
-     df_length = len(df)
-     if df_length < MIN_ROWS:
-         raise ValidationError(
-             f"Minimum {MIN_ROWS} rows required for augmentation. "
-             f"Current size: {df_length}"
-         )
-
-     # Parse n_rows
-     try:
-         actual_n_rows = _parse_n_rows(n_rows, df_length)
-     except ValidationError as e:
-         raise ValidationError(f"Invalid n_rows parameter: {e}")
-
-     # Detect input backend (for augment mode, use input format; for sample mode, use output_format)
-     if mode == "sample":
-         input_backend = output_format
-     else:
-         input_backend = detect_backend(df)
-
-     # Augment using Polars-only architecture
-     try:
-         # 1. Convert to Polars via Arrow bridge
-         df_polars = to_polars(df, input_backend if mode != "sample" else "polars")
-
-         # Memory cleanup: delete original if converted
-         if mode != "sample" and input_backend != 'polars':
-             del df
-             import gc
-             gc.collect()
-
-         # 2. Process augmentation in Polars
-         result_polars = _augment_polars_engine(df_polars, actual_n_rows, strategy_dict, seed)
-
-         # Memory cleanup: delete intermediate Polars DataFrame
-         del df_polars
-         import gc
-         gc.collect()
-
-         # 3. Convert back to target format
-         # In augment mode: match input format
-         # In sample mode: use output_format parameter
-         target_backend = output_format if mode == "sample" else input_backend
-         result_df = from_polars(result_polars, target_backend)
-
-         # Final memory cleanup
-         del result_polars
-         import gc
-         gc.collect()
-
-         return result_df
-
-     except Exception as e:
-         if isinstance(e, (ValidationError, AugmentError)):
-             raise
-         raise AugmentError(f"Augmentation failed: {e}")