additory 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. additory/__init__.py +15 -0
  2. additory/analysis/__init__.py +48 -0
  3. additory/analysis/cardinality.py +126 -0
  4. additory/analysis/correlations.py +124 -0
  5. additory/analysis/distributions.py +376 -0
  6. additory/analysis/quality.py +158 -0
  7. additory/analysis/scan.py +400 -0
  8. additory/augment/__init__.py +24 -0
  9. additory/augment/augmentor.py +653 -0
  10. additory/augment/builtin_lists.py +430 -0
  11. additory/augment/distributions.py +22 -0
  12. additory/augment/forecast.py +1132 -0
  13. additory/augment/list_registry.py +177 -0
  14. additory/augment/smote.py +320 -0
  15. additory/augment/strategies.py +883 -0
  16. additory/common/__init__.py +157 -0
  17. additory/common/backend.py +355 -0
  18. additory/common/column_utils.py +191 -0
  19. additory/common/distributions.py +737 -0
  20. additory/common/exceptions.py +62 -0
  21. additory/common/lists.py +229 -0
  22. additory/common/patterns.py +240 -0
  23. additory/common/resolver.py +567 -0
  24. additory/common/sample_data.py +182 -0
  25. additory/common/validation.py +197 -0
  26. additory/core/__init__.py +27 -0
  27. additory/core/ast_builder.py +165 -0
  28. additory/core/backends/__init__.py +23 -0
  29. additory/core/backends/arrow_bridge.py +476 -0
  30. additory/core/backends/cudf_bridge.py +355 -0
  31. additory/core/column_positioning.py +358 -0
  32. additory/core/compiler_polars.py +166 -0
  33. additory/core/config.py +342 -0
  34. additory/core/enhanced_cache_manager.py +1119 -0
  35. additory/core/enhanced_matchers.py +473 -0
  36. additory/core/enhanced_version_manager.py +325 -0
  37. additory/core/executor.py +59 -0
  38. additory/core/integrity_manager.py +477 -0
  39. additory/core/loader.py +190 -0
  40. additory/core/logging.py +24 -0
  41. additory/core/memory_manager.py +547 -0
  42. additory/core/namespace_manager.py +657 -0
  43. additory/core/parser.py +176 -0
  44. additory/core/polars_expression_engine.py +551 -0
  45. additory/core/registry.py +176 -0
  46. additory/core/sample_data_manager.py +492 -0
  47. additory/core/user_namespace.py +751 -0
  48. additory/core/validator.py +27 -0
  49. additory/dynamic_api.py +308 -0
  50. additory/expressions/__init__.py +26 -0
  51. additory/expressions/engine.py +551 -0
  52. additory/expressions/parser.py +176 -0
  53. additory/expressions/proxy.py +546 -0
  54. additory/expressions/registry.py +313 -0
  55. additory/expressions/samples.py +492 -0
  56. additory/synthetic/__init__.py +101 -0
  57. additory/synthetic/api.py +220 -0
  58. additory/synthetic/common_integration.py +314 -0
  59. additory/synthetic/config.py +262 -0
  60. additory/synthetic/engines.py +529 -0
  61. additory/synthetic/exceptions.py +180 -0
  62. additory/synthetic/file_managers.py +518 -0
  63. additory/synthetic/generator.py +702 -0
  64. additory/synthetic/generator_parser.py +68 -0
  65. additory/synthetic/integration.py +319 -0
  66. additory/synthetic/models.py +241 -0
  67. additory/synthetic/pattern_resolver.py +573 -0
  68. additory/synthetic/performance.py +469 -0
  69. additory/synthetic/polars_integration.py +464 -0
  70. additory/synthetic/proxy.py +60 -0
  71. additory/synthetic/schema_parser.py +685 -0
  72. additory/synthetic/validator.py +553 -0
  73. additory/utilities/__init__.py +53 -0
  74. additory/utilities/encoding.py +600 -0
  75. additory/utilities/games.py +300 -0
  76. additory/utilities/keys.py +8 -0
  77. additory/utilities/lookup.py +103 -0
  78. additory/utilities/matchers.py +216 -0
  79. additory/utilities/resolvers.py +286 -0
  80. additory/utilities/settings.py +167 -0
  81. additory/utilities/units.py +746 -0
  82. additory/utilities/validators.py +153 -0
  83. additory-0.1.0a1.dist-info/METADATA +293 -0
  84. additory-0.1.0a1.dist-info/RECORD +87 -0
  85. additory-0.1.0a1.dist-info/WHEEL +5 -0
  86. additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
  87. additory-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,653 @@
1
+ """
2
+ Data Augmentation Engine - Polars-Only Architecture
3
+
4
+ Provides functionality to augment dataframes by adding synthetic rows
5
+ based on existing data patterns.
6
+
7
+ Architecture:
8
+ 1. Detect input format (pandas/polars/cuDF)
9
+ 2. Convert to Polars via Arrow bridge (if needed)
10
+ 3. Process augmentation in Polars
11
+ 4. Convert back to original format via Arrow bridge
12
+ """
13
+
14
+ from typing import Union, Optional, Any, Dict, Literal
15
+ import random
16
+
17
+ from additory.common.backend import detect_backend, to_polars, from_polars
18
+ from additory.common.exceptions import ValidationError, AugmentError
19
+ from additory.common.validation import validate_dataframe
20
+ from additory.common.sample_data import get_sample_dataset
21
+ from additory.augment.strategies import (
22
+ parse_strategy_dict,
23
+ get_column_strategy,
24
+ apply_increment_strategy,
25
+ apply_choice_strategy,
26
+ apply_range_strategy,
27
+ parse_strategy_params
28
+ )
29
+
30
+
31
+ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
32
+ """
33
+ Validate that all strategies are generative (not augmentative).
34
+
35
+ Generative strategies can create data from scratch:
36
+ - increment (with start parameter)
37
+ - range
38
+ - choice
39
+ - choice_list
40
+
41
+ Augmentative strategies require existing data:
42
+ - auto (random sampling)
43
+ - forecast (time series)
44
+ - seasonal (time series)
45
+ - smote (synthetic minority oversampling)
46
+
47
+ Args:
48
+ strategy_dict: Dictionary mapping column names to strategy specs
49
+
50
+ Raises:
51
+ ValidationError: If any augmentative strategies are found
52
+ """
53
+ augmentative_strategies = ["auto", "forecast", "seasonal", "smote"]
54
+
55
+ invalid_columns = []
56
+
57
+ for col, strategy_spec in strategy_dict.items():
58
+ if col == "__default__":
59
+ continue
60
+
61
+ # Get the base strategy name (before any parameters)
62
+ strategy_name = strategy_spec.split(":")[0].strip()
63
+
64
+ if strategy_name in augmentative_strategies:
65
+ invalid_columns.append((col, strategy_name))
66
+
67
+ if invalid_columns:
68
+ error_lines = [
69
+ f"Create mode requires generative strategies. Found augmentative strategies:"
70
+ ]
71
+ for col, strat in invalid_columns:
72
+ error_lines.append(f" - Column '{col}': '{strat}'")
73
+
74
+ error_lines.append("")
75
+ error_lines.append("Valid generative strategies:")
76
+ error_lines.append(" - increment (with start parameter)")
77
+ error_lines.append(" - range:min-max")
78
+ error_lines.append(" - choice:[value1,value2,...]")
79
+ error_lines.append(" - choice_list:list_name")
80
+
81
+ raise ValidationError("\n".join(error_lines))
82
+
83
+
84
+ def _detect_mode(df: Any) -> Literal["augment", "create", "sample"]:
85
+ """
86
+ Detect the augmentation mode based on the df parameter.
87
+
88
+ Three modes are supported:
89
+ 1. "augment" - Augment an existing DataFrame (default)
90
+ 2. "create" - Create data from scratch using "@new" sentinel
91
+ 3. "sample" - Load and optionally augment sample dataset using "@sample" sentinel
92
+
93
+ Args:
94
+ df: Input parameter (DataFrame or sentinel string)
95
+
96
+ Returns:
97
+ Mode string: "augment", "create", or "sample"
98
+
99
+ Raises:
100
+ ValidationError: If df is an invalid string (not "@new" or "@sample")
101
+ """
102
+ # Check for sentinel values
103
+ if isinstance(df, str):
104
+ if df == "@new":
105
+ return "create"
106
+ elif df == "@sample":
107
+ return "sample"
108
+ else:
109
+ # Provide helpful error messages
110
+ if df.lower() in ["new", "create"]:
111
+ raise ValidationError(
112
+ f"Invalid input: '{df}'. Did you mean '@new'? "
113
+ "Use '@new' to create data from scratch."
114
+ )
115
+ elif df.lower() in ["sample", "samples"]:
116
+ raise ValidationError(
117
+ f"Invalid input: '{df}'. Did you mean '@sample'? "
118
+ "Use '@sample' to load sample dataset."
119
+ )
120
+ else:
121
+ raise ValidationError(
122
+ f"Invalid string input: '{df}'. "
123
+ "Expected a DataFrame, '@new' (create mode), or '@sample' (sample mode)."
124
+ )
125
+
126
+ # If not a string, assume it's a DataFrame (augment mode)
127
+ return "augment"
128
+
129
+
130
+ def _parse_n_rows(n_rows: Union[int, str], df_length: int) -> int:
131
+ """
132
+ Parse n_rows parameter to get actual number of rows to generate.
133
+
134
+ Args:
135
+ n_rows: Number of rows (int), percentage ("50%"), or multiplier ("2x")
136
+ df_length: Length of the input dataframe
137
+
138
+ Returns:
139
+ Actual number of rows to generate
140
+
141
+ Raises:
142
+ ValidationError: If n_rows format is invalid
143
+ """
144
+ if isinstance(n_rows, int):
145
+ if n_rows <= 0:
146
+ raise ValidationError("n_rows must be positive")
147
+ return n_rows
148
+
149
+ if isinstance(n_rows, str):
150
+ n_rows = n_rows.strip()
151
+
152
+ # Handle percentage: "50%"
153
+ if n_rows.endswith("%"):
154
+ try:
155
+ percentage = float(n_rows[:-1])
156
+ if percentage <= 0:
157
+ raise ValidationError("Percentage must be positive")
158
+ return max(1, int(df_length * percentage / 100))
159
+ except ValueError:
160
+ raise ValidationError(f"Invalid percentage format: {n_rows}")
161
+
162
+ # Handle multiplier: "2x"
163
+ if n_rows.endswith("x"):
164
+ try:
165
+ multiplier = float(n_rows[:-1])
166
+ if multiplier <= 0:
167
+ raise ValidationError("Multiplier must be positive")
168
+ return max(1, int(df_length * multiplier))
169
+ except ValueError:
170
+ raise ValidationError(f"Invalid multiplier format: {n_rows}")
171
+
172
+ raise ValidationError(
173
+ f"Invalid n_rows format: {n_rows}. "
174
+ "Use int (5), percentage ('50%'), or multiplier ('2x')"
175
+ )
176
+
177
+ raise ValidationError(f"n_rows must be int or str, got {type(n_rows)}")
178
+
179
+
180
+ def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str, str], seed: Optional[int]) -> Any:
181
+ """
182
+ Augment Polars DataFrame with strategy support.
183
+
184
+ This is the core augmentation engine that processes all dataframes
185
+ (pandas, polars, cuDF) after conversion to Polars format.
186
+
187
+ Args:
188
+ df_polars: Input Polars DataFrame
189
+ n_rows: Number of rows to generate
190
+ strategy_dict: Column-specific strategies
191
+ seed: Random seed for reproducibility
192
+
193
+ Returns:
194
+ Augmented Polars DataFrame
195
+ """
196
+ import polars as pl
197
+
198
+ # Get column names
199
+ columns = df_polars.columns
200
+
201
+ # Check if any column uses non-auto strategy
202
+ has_custom_strategy = any(
203
+ not get_column_strategy(col, strategy_dict).startswith("auto")
204
+ for col in columns
205
+ )
206
+
207
+ if not has_custom_strategy:
208
+ # Simple random sampling (original behavior)
209
+ if seed is not None:
210
+ sampled = df_polars.sample(n=n_rows, with_replacement=True, seed=seed)
211
+ else:
212
+ sampled = df_polars.sample(n=n_rows, with_replacement=True)
213
+ else:
214
+ # Build new rows column by column
215
+ new_data = {}
216
+
217
+ for col in columns:
218
+ col_strategy = get_column_strategy(col, strategy_dict)
219
+
220
+ if col_strategy.startswith("increment"):
221
+ # Generate incremented values
222
+ new_values = apply_increment_strategy(
223
+ df_polars, col, col_strategy, n_rows
224
+ )
225
+ new_data[col] = new_values
226
+ elif col_strategy.startswith("range"):
227
+ # Parse range parameters
228
+ strategy_name, params = parse_strategy_params(col_strategy)
229
+
230
+ if "min" not in params or "max" not in params:
231
+ raise ValidationError(
232
+ f"Range strategy for column '{col}' requires min and max parameters. "
233
+ f"Use format: 'range:min-max' (e.g., 'range:18-65')"
234
+ )
235
+
236
+ # Generate range values
237
+ new_values = apply_range_strategy(
238
+ min_val=params["min"],
239
+ max_val=params["max"],
240
+ n_rows=n_rows,
241
+ seed=seed
242
+ )
243
+ new_data[col] = new_values
244
+ elif col_strategy.startswith("choice"):
245
+ # Generate choice values
246
+ new_values = apply_choice_strategy(
247
+ col_strategy, n_rows, seed
248
+ )
249
+ new_data[col] = new_values
250
+ elif col_strategy.startswith("forecast"):
251
+ # Import here to avoid circular dependency
252
+ from additory.augment.strategies import apply_forecast_strategy
253
+
254
+ # Generate forecasted values
255
+ new_values = apply_forecast_strategy(
256
+ df_polars, col, col_strategy, n_rows, seed
257
+ )
258
+
259
+ # Cast to match original column dtype if needed
260
+ original_dtype = df_polars[col].dtype
261
+ if original_dtype.is_integer():
262
+ # Round and convert to int for integer columns
263
+ new_values = [int(round(v)) for v in new_values]
264
+
265
+ new_data[col] = new_values
266
+ elif col_strategy.startswith(("normal", "uniform", "skewed_left", "skewed_right", "beta", "gamma", "exponential", "kde")):
267
+ # Import here to avoid circular dependency
268
+ from additory.augment.strategies import apply_distribution_strategy
269
+
270
+ # Generate distribution values
271
+ new_values = apply_distribution_strategy(
272
+ df_polars, col, col_strategy, n_rows, seed
273
+ )
274
+
275
+ # Cast to match original column dtype if needed
276
+ original_dtype = df_polars[col].dtype
277
+ if original_dtype.is_integer():
278
+ # Round and convert to int for integer columns
279
+ new_values = [int(round(v)) for v in new_values]
280
+
281
+ new_data[col] = new_values
282
+ else:
283
+ # Random sampling for this column (auto)
284
+ if seed is not None:
285
+ sampled_col = df_polars.select(col).sample(n=n_rows, with_replacement=True, seed=seed)
286
+ else:
287
+ sampled_col = df_polars.select(col).sample(n=n_rows, with_replacement=True)
288
+ new_data[col] = sampled_col[col].to_list()
289
+
290
+ sampled = pl.DataFrame(new_data)
291
+
292
+ # Concatenate original and new rows
293
+ result = pl.concat([df_polars, sampled])
294
+
295
+ return result
296
+
297
+
298
+ def _create_from_scratch_engine(
299
+ n_rows: int,
300
+ strategy_dict: Dict[str, str],
301
+ seed: Optional[int]
302
+ ) -> Any:
303
+ """
304
+ Create DataFrame from scratch using generative strategies.
305
+
306
+ This engine generates data column by column without requiring
307
+ an existing DataFrame. All strategies must be generative.
308
+
309
+ Generative strategies (supported):
310
+ - increment (with start parameter)
311
+ - range
312
+ - choice
313
+ - choice_list
314
+
315
+ Augmentative strategies (NOT supported):
316
+ - auto (requires existing data)
317
+ - forecast (requires time series)
318
+ - seasonal (requires time series)
319
+ - smote (requires existing data)
320
+
321
+ Args:
322
+ n_rows: Number of rows to generate
323
+ strategy_dict: Column-specific strategies (all must be generative)
324
+ seed: Random seed for reproducibility
325
+
326
+ Returns:
327
+ Polars DataFrame with generated data
328
+
329
+ Raises:
330
+ ValidationError: If any augmentative strategies are found
331
+
332
+ Examples:
333
+ >>> # Simple create with increment and range
334
+ >>> result = _create_from_scratch_engine(
335
+ ... n_rows=10,
336
+ ... strategy_dict={
337
+ ... "id": "increment:start=1",
338
+ ... "age": "range:18-65"
339
+ ... },
340
+ ... seed=42
341
+ ... )
342
+ >>> result.shape
343
+ (10, 2)
344
+
345
+ >>> # Create with mixed strategies
346
+ >>> result = _create_from_scratch_engine(
347
+ ... n_rows=100,
348
+ ... strategy_dict={
349
+ ... "id": "increment:start=1",
350
+ ... "emp_id": "increment:start=1:pattern=EMP_[001]",
351
+ ... "age": "range:18-65",
352
+ ... "status": "choice:[Active,Inactive]",
353
+ ... "department": "choice_list:departments"
354
+ ... },
355
+ ... seed=42
356
+ ... )
357
+ >>> result.shape
358
+ (100, 5)
359
+ """
360
+ import polars as pl
361
+
362
+ # Validate all strategies are generative
363
+ _validate_generative_strategies(strategy_dict)
364
+
365
+ # Build data column by column
366
+ new_data = {}
367
+
368
+ for col, col_strategy in strategy_dict.items():
369
+ if col == "__default__":
370
+ continue
371
+
372
+ if col_strategy.startswith("increment"):
373
+ # Parse parameters for increment strategy
374
+ strategy_name, params = parse_strategy_params(col_strategy)
375
+
376
+ # Generate incremented values (create mode)
377
+ new_values = apply_increment_strategy(
378
+ df_polars=None, # No DataFrame in create mode
379
+ column=col,
380
+ strategy_spec=col_strategy,
381
+ n_rows=n_rows,
382
+ params=params
383
+ )
384
+ new_data[col] = new_values
385
+
386
+ elif col_strategy.startswith("range"):
387
+ # Parse range parameters
388
+ strategy_name, params = parse_strategy_params(col_strategy)
389
+
390
+ if "min" not in params or "max" not in params:
391
+ raise ValidationError(
392
+ f"Range strategy for column '{col}' requires min and max parameters. "
393
+ f"Use format: 'range:min-max' (e.g., 'range:18-65')"
394
+ )
395
+
396
+ # Generate range values
397
+ new_values = apply_range_strategy(
398
+ min_val=params["min"],
399
+ max_val=params["max"],
400
+ n_rows=n_rows,
401
+ seed=seed
402
+ )
403
+ new_data[col] = new_values
404
+
405
+ elif col_strategy.startswith("choice"):
406
+ # Generate choice values
407
+ new_values = apply_choice_strategy(
408
+ col_strategy, n_rows, seed
409
+ )
410
+ new_data[col] = new_values
411
+
412
+ else:
413
+ raise ValidationError(
414
+ f"Unknown or unsupported strategy for column '{col}': '{col_strategy}'"
415
+ )
416
+
417
+ # Build Polars DataFrame from generated columns
418
+ result = pl.DataFrame(new_data)
419
+
420
+ return result
421
+
422
+
423
+ def augment(
424
+ df: Any,
425
+ n_rows: Union[int, str] = 5,
426
+ strategy: Union[str, Dict[str, str]] = "auto",
427
+ seed: Optional[int] = None,
428
+ output_format: str = "pandas"
429
+ ) -> Any:
430
+ """
431
+ Augment a dataframe by adding synthetic rows based on existing data.
432
+
433
+ Uses Polars-only architecture:
434
+ 1. Detect input format (pandas/polars/cuDF)
435
+ 2. Convert to Polars via Arrow bridge (if needed)
436
+ 3. Process augmentation in Polars
437
+ 4. Convert back to original format via Arrow bridge
438
+
439
+ This function adds new rows to a dataframe using various strategies:
440
+ - "auto": Random sampling from existing values (default)
441
+ - "increment": Increment numeric or pattern-based values
442
+ - "range:min-max": Random integers within range
443
+ - "choice:[...]": Random selection from inline list
444
+ - "choice_list:name": Random selection from registered/built-in list
445
+ - "forecast:method": Time series forecasting (linear, polynomial, exponential, seasonal)
446
+ - "normal": Normal distribution generation
447
+ - "uniform": Uniform distribution generation
448
+ - "skewed_left/skewed_right": Skewed distribution generation
449
+ - "smote": Synthetic Minority Over-sampling Technique
450
+
451
+ Args:
452
+ df: Input dataframe (pandas, polars, or cudf), or sentinel:
453
+ - DataFrame: Augment mode (add rows to existing data)
454
+ - "@new": Create mode (generate data from scratch)
455
+ - "@sample": Sample mode (load sample dataset)
456
+ n_rows: Number of rows to add. Can be:
457
+ - int: Exact number (e.g., 5)
458
+ - str percentage: Percentage of current size (e.g., "50%")
459
+ - str multiplier: Multiple of current size (e.g., "2x")
460
+ strategy: Augmentation strategy. Can be:
461
+ - str: "auto" (applies to all columns)
462
+ - dict: Column-specific strategies, e.g.:
463
+ {
464
+ "id": "increment",
465
+ "emp_id": "increment:EMP_[001]_ID",
466
+ "age": "range:18-65",
467
+ "status": "choice:[Active,Inactive,Pending]",
468
+ "bank": "choice_list:banks",
469
+ "sales": "forecast:seasonal:period=12",
470
+ "score": "normal:mean=75:std=10",
471
+ "income": "skewed_right:skewness=1.5"
472
+ }
473
+ Unlisted columns default to "auto"
474
+ seed: Random seed for reproducibility. If None, results will vary.
475
+ output_format: Output format for create/sample modes. Options:
476
+ - "pandas": Return pandas DataFrame (default)
477
+ - "polars": Return Polars DataFrame
478
+ - "cudf": Return cuDF DataFrame
479
+ Note: In augment mode (with DataFrame input), output format
480
+ matches input format and this parameter is ignored.
481
+
482
+ Returns:
483
+ Augmented dataframe with original + new rows (same type as input)
484
+
485
+ Raises:
486
+ ValidationError: If input validation fails
487
+ AugmentError: If augmentation fails
488
+
489
+ Examples:
490
+ >>> # Add 5 rows with random sampling (default)
491
+ >>> df_aug = add.augment(df)
492
+
493
+ >>> # Increment numeric ID column
494
+ >>> df_aug = add.augment(df, strategy={"id": "increment"})
495
+
496
+ >>> # Forecast sales with seasonal pattern
497
+ >>> df_aug = add.augment(df, n_rows=24, strategy={
498
+ ... "sales": "forecast:seasonal:period=12"
499
+ ... })
500
+
501
+ >>> # Generate from normal distribution
502
+ >>> df_aug = add.augment(df, n_rows=100, strategy={
503
+ ... "age": "normal:mean=35:std=10",
504
+ ... "score": "uniform:min=0:max=100"
505
+ ... })
506
+
507
+ >>> # Mixed strategies
508
+ >>> df_aug = add.augment(df, n_rows=100, strategy={
509
+ ... "id": "increment",
510
+ ... "age": "range:18-65",
511
+ ... "status": "choice:[Active,Inactive]",
512
+ ... "sales": "forecast:linear",
513
+ ... "score": "normal:auto"
514
+ ... })
515
+
516
+ >>> # Create data from scratch (returns pandas by default)
517
+ >>> df_new = add.augment("@new", n_rows=50, strategy={
518
+ ... "id": "increment:start=1",
519
+ ... "age": "range:18-65",
520
+ ... "status": "choice:[Active,Inactive]"
521
+ ... })
522
+ """
523
+ # Detect mode
524
+ mode = _detect_mode(df)
525
+
526
+ # Validate output_format parameter
527
+ valid_formats = ["pandas", "polars", "cudf"]
528
+ if output_format not in valid_formats:
529
+ raise ValidationError(
530
+ f"Invalid output_format: '{output_format}'. "
531
+ f"Must be one of: {', '.join(valid_formats)}"
532
+ )
533
+
534
+ # Parse and validate strategy
535
+ try:
536
+ strategy_dict = parse_strategy_dict(strategy)
537
+ except ValidationError as e:
538
+ raise ValidationError(f"Invalid strategy parameter: {e}")
539
+
540
+ # Handle create mode
541
+ if mode == "create":
542
+ # Validate create mode requirements
543
+ if not isinstance(strategy, dict):
544
+ raise ValidationError(
545
+ "Create mode requires a strategy dict with column definitions. "
546
+ "Example: strategy={'id': 'increment:start=1', 'age': 'range:18-65'}"
547
+ )
548
+
549
+ if not strategy or len(strategy) == 0:
550
+ raise ValidationError(
551
+ "Create mode requires at least one column in strategy dict"
552
+ )
553
+
554
+ if not isinstance(n_rows, int):
555
+ raise ValidationError(
556
+ f"Create mode requires n_rows to be an integer, got {type(n_rows).__name__}. "
557
+ "Percentage ('50%') and multiplier ('2x') formats are not supported in create mode."
558
+ )
559
+
560
+ if n_rows <= 0:
561
+ raise ValidationError("n_rows must be positive")
562
+
563
+ try:
564
+ # Generate data from scratch
565
+ result_polars = _create_from_scratch_engine(n_rows, strategy_dict, seed)
566
+
567
+ # Convert to requested output format
568
+ result_df = from_polars(result_polars, output_format)
569
+
570
+ # Memory cleanup
571
+ del result_polars
572
+ import gc
573
+ gc.collect()
574
+
575
+ return result_df
576
+
577
+ except Exception as e:
578
+ if isinstance(e, (ValidationError, AugmentError)):
579
+ raise
580
+ raise AugmentError(f"Create mode failed: {e}")
581
+
582
+ # Handle sample mode
583
+ if mode == "sample":
584
+ # Load sample dataset
585
+ try:
586
+ df = get_sample_dataset("augment", "sample", "clean")
587
+ except Exception as e:
588
+ raise ValidationError(f"Failed to load sample dataset: {e}")
589
+
590
+ # Continue to augment mode with loaded sample
591
+ # (will use output_format at the end)
592
+
593
+ # Augment mode (original behavior)
594
+ # Validate input dataframe
595
+ validate_dataframe(df, "df")
596
+
597
+ # Check minimum size - require at least 3 rows for meaningful augmentation
598
+ MIN_ROWS = 3
599
+ df_length = len(df)
600
+ if df_length < MIN_ROWS:
601
+ raise ValidationError(
602
+ f"Minimum {MIN_ROWS} rows required for augmentation. "
603
+ f"Current size: {df_length}"
604
+ )
605
+
606
+ # Parse n_rows
607
+ try:
608
+ actual_n_rows = _parse_n_rows(n_rows, df_length)
609
+ except ValidationError as e:
610
+ raise ValidationError(f"Invalid n_rows parameter: {e}")
611
+
612
+ # Detect input backend (for augment mode, use input format; for sample mode, use output_format)
613
+ if mode == "sample":
614
+ input_backend = output_format
615
+ else:
616
+ input_backend = detect_backend(df)
617
+
618
+ # Augment using Polars-only architecture
619
+ try:
620
+ # 1. Convert to Polars via Arrow bridge
621
+ df_polars = to_polars(df, input_backend if mode != "sample" else "polars")
622
+
623
+ # Memory cleanup: delete original if converted
624
+ if mode != "sample" and input_backend != 'polars':
625
+ del df
626
+ import gc
627
+ gc.collect()
628
+
629
+ # 2. Process augmentation in Polars
630
+ result_polars = _augment_polars_engine(df_polars, actual_n_rows, strategy_dict, seed)
631
+
632
+ # Memory cleanup: delete intermediate Polars DataFrame
633
+ del df_polars
634
+ import gc
635
+ gc.collect()
636
+
637
+ # 3. Convert back to target format
638
+ # In augment mode: match input format
639
+ # In sample mode: use output_format parameter
640
+ target_backend = output_format if mode == "sample" else input_backend
641
+ result_df = from_polars(result_polars, target_backend)
642
+
643
+ # Final memory cleanup
644
+ del result_polars
645
+ import gc
646
+ gc.collect()
647
+
648
+ return result_df
649
+
650
+ except Exception as e:
651
+ if isinstance(e, (ValidationError, AugmentError)):
652
+ raise
653
+ raise AugmentError(f"Augmentation failed: {e}")