additory 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. additory/__init__.py +15 -0
  2. additory/analysis/__init__.py +48 -0
  3. additory/analysis/cardinality.py +126 -0
  4. additory/analysis/correlations.py +124 -0
  5. additory/analysis/distributions.py +376 -0
  6. additory/analysis/quality.py +158 -0
  7. additory/analysis/scan.py +400 -0
  8. additory/augment/__init__.py +24 -0
  9. additory/augment/augmentor.py +653 -0
  10. additory/augment/builtin_lists.py +430 -0
  11. additory/augment/distributions.py +22 -0
  12. additory/augment/forecast.py +1132 -0
  13. additory/augment/list_registry.py +177 -0
  14. additory/augment/smote.py +320 -0
  15. additory/augment/strategies.py +883 -0
  16. additory/common/__init__.py +157 -0
  17. additory/common/backend.py +355 -0
  18. additory/common/column_utils.py +191 -0
  19. additory/common/distributions.py +737 -0
  20. additory/common/exceptions.py +62 -0
  21. additory/common/lists.py +229 -0
  22. additory/common/patterns.py +240 -0
  23. additory/common/resolver.py +567 -0
  24. additory/common/sample_data.py +182 -0
  25. additory/common/validation.py +197 -0
  26. additory/core/__init__.py +27 -0
  27. additory/core/ast_builder.py +165 -0
  28. additory/core/backends/__init__.py +23 -0
  29. additory/core/backends/arrow_bridge.py +476 -0
  30. additory/core/backends/cudf_bridge.py +355 -0
  31. additory/core/column_positioning.py +358 -0
  32. additory/core/compiler_polars.py +166 -0
  33. additory/core/config.py +342 -0
  34. additory/core/enhanced_cache_manager.py +1119 -0
  35. additory/core/enhanced_matchers.py +473 -0
  36. additory/core/enhanced_version_manager.py +325 -0
  37. additory/core/executor.py +59 -0
  38. additory/core/integrity_manager.py +477 -0
  39. additory/core/loader.py +190 -0
  40. additory/core/logging.py +24 -0
  41. additory/core/memory_manager.py +547 -0
  42. additory/core/namespace_manager.py +657 -0
  43. additory/core/parser.py +176 -0
  44. additory/core/polars_expression_engine.py +551 -0
  45. additory/core/registry.py +176 -0
  46. additory/core/sample_data_manager.py +492 -0
  47. additory/core/user_namespace.py +751 -0
  48. additory/core/validator.py +27 -0
  49. additory/dynamic_api.py +308 -0
  50. additory/expressions/__init__.py +26 -0
  51. additory/expressions/engine.py +551 -0
  52. additory/expressions/parser.py +176 -0
  53. additory/expressions/proxy.py +546 -0
  54. additory/expressions/registry.py +313 -0
  55. additory/expressions/samples.py +492 -0
  56. additory/synthetic/__init__.py +101 -0
  57. additory/synthetic/api.py +220 -0
  58. additory/synthetic/common_integration.py +314 -0
  59. additory/synthetic/config.py +262 -0
  60. additory/synthetic/engines.py +529 -0
  61. additory/synthetic/exceptions.py +180 -0
  62. additory/synthetic/file_managers.py +518 -0
  63. additory/synthetic/generator.py +702 -0
  64. additory/synthetic/generator_parser.py +68 -0
  65. additory/synthetic/integration.py +319 -0
  66. additory/synthetic/models.py +241 -0
  67. additory/synthetic/pattern_resolver.py +573 -0
  68. additory/synthetic/performance.py +469 -0
  69. additory/synthetic/polars_integration.py +464 -0
  70. additory/synthetic/proxy.py +60 -0
  71. additory/synthetic/schema_parser.py +685 -0
  72. additory/synthetic/validator.py +553 -0
  73. additory/utilities/__init__.py +53 -0
  74. additory/utilities/encoding.py +600 -0
  75. additory/utilities/games.py +300 -0
  76. additory/utilities/keys.py +8 -0
  77. additory/utilities/lookup.py +103 -0
  78. additory/utilities/matchers.py +216 -0
  79. additory/utilities/resolvers.py +286 -0
  80. additory/utilities/settings.py +167 -0
  81. additory/utilities/units.py +746 -0
  82. additory/utilities/validators.py +153 -0
  83. additory-0.1.0a1.dist-info/METADATA +293 -0
  84. additory-0.1.0a1.dist-info/RECORD +87 -0
  85. additory-0.1.0a1.dist-info/WHEEL +5 -0
  86. additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
  87. additory-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,883 @@
1
+ """
2
+ Strategy handlers for data augmentation
3
+
4
+ Provides different strategies for generating synthetic data:
5
+ - auto: Random sampling from existing values
6
+ - increment: Increment numeric or pattern-based values
7
+ - choice:[...]: Random selection from inline list
8
+ - choice_list:name: Random selection from registered/built-in list
9
+ """
10
+
11
+ import re
12
+ import random
13
+ from typing import Any, Dict, List, Optional, Tuple
14
+
15
+ from additory.common.exceptions import ValidationError, AugmentError
16
+ from additory.augment.list_registry import get_list
17
+
18
+
19
def parse_strategy_params(strategy_spec: str) -> Tuple[str, Dict[str, Any]]:
    """
    Parse a strategy specification with inline parameters.

    Supports two formats:
    1. key=value format: "increment:start=100:pattern=EMP_[001]"
    2. range shorthand: "range:18-65" (whitespace around the dash is
       tolerated, e.g. "range:18 - 65", matching the whitespace tolerance
       of the key=value format)

    Args:
        strategy_spec: Strategy string with optional parameters

    Returns:
        Tuple of (strategy_name, params_dict)
        - strategy_name: Base strategy name (e.g., "increment", "range")
        - params_dict: Dictionary of parsed parameters; values that look
          like (optionally negative) integers are converted to int, all
          other values stay strings

    Raises:
        ValidationError: If the specification is empty or a parameter is
            malformed

    Examples:
        >>> parse_strategy_params("increment")
        ("increment", {})

        >>> parse_strategy_params("increment:start=100:pattern=EMP_[001]")
        ("increment", {"start": 100, "pattern": "EMP_[001]"})

        >>> parse_strategy_params("range:18-65")
        ("range", {"min": 18, "max": 65})
    """
    if not strategy_spec or not strategy_spec.strip():
        raise ValidationError("Empty strategy specification")

    parts = strategy_spec.split(":")
    strategy_name = parts[0].strip()

    if not strategy_name:
        raise ValidationError("Empty strategy name")

    if len(parts) == 1:
        # No parameters
        return strategy_name, {}

    # Special case: range shorthand "range:18-65" (single part, no "=")
    if strategy_name == "range" and len(parts) == 2:
        range_part = parts[1].strip()

        if "=" not in range_part:
            invalid_range = (
                f"Invalid range format: {range_part}. "
                "Expected format: range:min-max (e.g., range:18-65)"
            )
            # The separating dash is the one between the two numbers; each
            # number may carry its own leading minus sign.
            match = re.fullmatch(r'(-?\d+)\s*-\s*(-?\d+)', range_part)
            if match is None:
                raise ValidationError(invalid_range)
            try:
                bounds = {
                    "min": int(match.group(1)),
                    "max": int(match.group(2)),
                }
            except ValueError as exc:
                # int() can still refuse extremely long digit strings.
                raise ValidationError(invalid_range) from exc
            return strategy_name, bounds

    # Parse key=value parameters
    params: Dict[str, Any] = {}

    for raw_part in parts[1:]:
        param_part = raw_part.strip()

        if "=" not in param_part:
            raise ValidationError(
                f"Invalid parameter format: '{param_part}'. "
                "Expected format: key=value (e.g., start=100)"
            )

        key, value = param_part.split("=", 1)
        key = key.strip()
        value = value.strip()

        if not key:
            raise ValidationError(
                f"Empty parameter key in: '{param_part}'"
            )

        if not value:
            raise ValidationError(
                f"Empty parameter value for key '{key}'"
            )

        # Convert integer-looking values; anything else stays a string.
        # The try/except covers digit-like text int() rejects (e.g. "--5").
        params[key] = value
        if value.lstrip('-').isdigit():
            try:
                params[key] = int(value)
            except ValueError:
                pass  # deliberately kept as the original string

    return strategy_name, params
135
+
136
+
137
def parse_increment_strategy(strategy_spec: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Parse increment strategy specification.

    Args:
        strategy_spec: Strategy string like:
            - "increment"
            - "increment:EMP_[001]_ID"
            - r"increment:A(\\d+)"

    Returns:
        Tuple of (pattern, regex_pattern)
        - pattern: Original pattern string (only for bracket notation)
        - regex_pattern: Regex source string with one capture group for the
          numeric counter

    Raises:
        ValidationError: If the pattern uses brackets without digits, or is
            neither bracket notation nor a regex with a capture group

    Notes:
        In bracket notation only the first ``[digits]`` group is the
        counter. Everything around it is treated as literal text and is
        regex-escaped, so characters such as ``.`` or ``+`` in the pattern
        match themselves.

    Examples:
        >>> parse_increment_strategy("increment")
        (None, None)

        >>> parse_increment_strategy("increment:EMP_[001]_ID")
        ("EMP_[001]_ID", r"EMP_(\\d{3})_ID")

        >>> parse_increment_strategy(r"increment:A(\\d+)")
        (None, r"A(\\d+)")
    """
    parts = strategy_spec.split(":", 1)

    if len(parts) == 1:
        # Simple "increment" with no pattern
        return None, None

    pattern_str = parts[1].strip()

    # Bracket notation: EMP_[001]_ID
    if "[" in pattern_str and "]" in pattern_str:
        bracket_match = re.search(r'\[(\d+)\]', pattern_str)
        if not bracket_match:
            raise ValidationError(
                f"Invalid bracket pattern: {pattern_str}. "
                "Brackets must contain digits, e.g., [001] or [123]"
            )

        # The number of digits inside the brackets fixes the counter width.
        padding = len(bracket_match.group(1))

        # Escape the literal text on both sides so that regex
        # metacharacters in the user's pattern are matched verbatim:
        # EMP_[001]_ID -> EMP_(\d{3})_ID
        prefix = re.escape(pattern_str[:bracket_match.start()])
        suffix = re.escape(pattern_str[bracket_match.end():])
        regex_pattern = f"{prefix}(\\d{{{padding}}}){suffix}"

        return pattern_str, regex_pattern

    # Otherwise assume it is already a regex; it needs a capture group.
    if "(" not in pattern_str or ")" not in pattern_str:
        raise ValidationError(
            f"Invalid pattern: {pattern_str}. "
            "Pattern must either use bracket notation [001] or regex with capture group (\\d+)"
        )

    return None, pattern_str
199
+
200
+
201
def validate_increment_column(
    last_value: Any,
    pattern: Optional[str],
    regex_pattern: Optional[str]
) -> Tuple[int, Optional[str], Optional[int]]:
    """
    Validate that a column can be incremented and extract current value.

    Args:
        last_value: Last value in the column
        pattern: Pattern string (if using bracket notation)
        regex_pattern: Regex pattern whose first capture group is the number

    Returns:
        Tuple of (current_number, prefix_suffix_template, padding)
        - current_number: The numeric value to increment from
        - prefix_suffix_template: ``str.format`` template for reconstruction
          (e.g., "EMP_{}_ID"), or None for a purely numeric column. Literal
          braces from the source value are doubled so that ``.format``
          reproduces them unchanged.
        - padding: Number of digits for zero-padding, set only when the
          extracted number has a leading zero (otherwise None)

    Raises:
        ValidationError: If the column cannot be incremented (non-numeric
            value without a pattern, invalid regex, no match, or the first
            capture group is not an integer)
    """
    last_value_str = str(last_value)

    # Case 1: Pure numeric value
    if regex_pattern is None and pattern is None:
        try:
            return int(last_value), None, None
        except (ValueError, TypeError) as exc:
            raise ValidationError(
                f"Column has non-numeric last value '{last_value}'. "
                "For non-numeric columns, you must provide a pattern. "
                "Examples: 'increment:EMP_[001]_ID' or 'increment:A(\\d+)'"
            ) from exc

    # Case 2: Pattern-based value
    if regex_pattern is None:
        raise ValidationError("Pattern parsing failed - this should not happen")

    try:
        match = re.search(regex_pattern, last_value_str)
    except re.error as exc:
        raise ValidationError(
            f"Invalid regex pattern '{regex_pattern}': {exc}"
        ) from exc

    if not match:
        raise ValidationError(
            f"Pattern '{pattern or regex_pattern}' does not match last value '{last_value}'. "
            "Please verify the pattern matches your data."
        )

    # Extract the numeric part. TypeError covers an optional capture group
    # that did not participate in the match (group(1) is None).
    try:
        number_str = match.group(1)
        current_number = int(number_str)
    except (ValueError, IndexError, TypeError) as exc:
        raise ValidationError(
            f"Could not extract numeric value from '{last_value}' using pattern '{pattern or regex_pattern}'"
        ) from exc

    padding = len(number_str) if number_str.startswith('0') else None

    # Build the template from the matched positions in the actual value,
    # doubling braces so the surrounding text survives str.format().
    start, end = match.span(1)
    prefix = last_value_str[:start].replace('{', '{{').replace('}', '}}')
    suffix = last_value_str[end:].replace('{', '{{').replace('}', '}}')
    template = prefix + '{}' + suffix

    return current_number, template, padding
270
+
271
+
272
def generate_increment_values(
    start_number: int,
    count: int,
    template: Optional[str],
    padding: Optional[int]
) -> List[Any]:
    """
    Build a run of consecutive values beginning at ``start_number``.

    Args:
        start_number: First number of the run
        count: How many values to produce
        template: ``str.format`` template with one ``{}`` slot
            (e.g., "EMP_{}_ID"); None yields plain numbers
        padding: Zero-fill width for the number inside the template;
            a falsy value disables padding

    Returns:
        List of ``count`` values: numbers when ``template`` is None,
        otherwise formatted strings
    """
    if template is None:
        # Pure numeric sequence
        return [start_number + offset for offset in range(count)]

    def render(number: int) -> str:
        # Zero-pad only when a width was requested, then fill the slot.
        digits = str(number)
        if padding:
            digits = digits.zfill(padding)
        return template.format(digits)

    return [render(start_number + offset) for offset in range(count)]
309
+
310
+
311
def apply_increment_strategy(
    df_polars: Any,
    column: str,
    strategy_spec: str,
    n_rows: int,
    params: Optional[Dict[str, Any]] = None
) -> List[Any]:
    """
    Apply increment strategy to a column (Polars-only).

    Supports two modes:
    1. Augment mode: Increment from last value in df_polars
    2. Create mode: Start from specified value (requires params with 'start')

    Args:
        df_polars: Input Polars DataFrame (None in create mode)
        column: Column name to increment
        strategy_spec: Strategy specification (e.g., "increment:EMP_[001]_ID");
            only consulted in augment mode
        n_rows: Number of new values to generate
        params: Optional parameters dict with 'start' and/or 'pattern' keys;
            only consulted in create mode

    Returns:
        List of new values for the column

    Raises:
        ValidationError: If strategy cannot be applied (missing or
            non-integer 'start', invalid pattern, empty column, or a last
            value that cannot be incremented)

    Examples:
        # Augment mode (with DataFrame)
        >>> apply_increment_strategy(df, "id", "increment", 5)
        [11, 12, 13, 14, 15]  # if last value was 10

        # Create mode (no DataFrame, with start parameter)
        >>> apply_increment_strategy(None, "id", "increment", 5, {"start": 100})
        [100, 101, 102, 103, 104]

        # Create mode with pattern
        >>> apply_increment_strategy(None, "emp_id", "increment", 3,
        ...                          {"start": 1, "pattern": "EMP_[001]"})
        ["EMP_001", "EMP_002", "EMP_003"]
    """
    if df_polars is None:
        # Create mode: use start parameter
        if params is None or "start" not in params:
            raise ValidationError(
                f"Increment strategy in create mode requires 'start' parameter. "
                f"Use format: 'increment:start=N' or 'increment:start=N:pattern=P'"
            )

        start_number = params["start"]
        if isinstance(start_number, str):
            # A textual start (e.g. from "start=abc") would otherwise fail
            # with a TypeError deep inside value generation.
            raw_start = start_number
            try:
                start_number = int(raw_start)
            except ValueError as exc:
                raise ValidationError(
                    f"Increment 'start' must be an integer, got '{raw_start}'"
                ) from exc

        template = None
        padding = None

        if "pattern" in params:
            # The spec parser turns all-digit values into int, so normalize.
            pattern_str = str(params["pattern"])

            if "[" in pattern_str and "]" in pattern_str:
                # Bracket notation: EMP_[001]
                bracket_match = re.search(r'\[(\d+)\]', pattern_str)
                if not bracket_match:
                    raise ValidationError(
                        f"Invalid bracket pattern: {pattern_str}. "
                        "Brackets must contain digits, e.g., [001] or [123]"
                    )

                padding = len(bracket_match.group(1))

                # Replace the first [NNN] with the format slot; double any
                # literal braces so str.format() leaves them intact.
                prefix = pattern_str[:bracket_match.start()]
                suffix = pattern_str[bracket_match.end():]
                template = (
                    prefix.replace("{", "{{").replace("}", "}}")
                    + "{}"
                    + suffix.replace("{", "{{").replace("}", "}}")
                )
            else:
                raise ValidationError(
                    f"Invalid pattern: {pattern_str}. "
                    "Pattern must use bracket notation [001]"
                )

        # Generate values starting from start_number itself
        return generate_increment_values(
            start_number=start_number,
            count=n_rows,
            template=template,
            padding=padding
        )

    # Augment mode: continue from the last existing value
    pattern, regex_pattern = parse_increment_strategy(strategy_spec)

    series = df_polars[column]
    if len(series) == 0:
        raise ValidationError(
            f"Cannot increment column '{column}': it has no existing values"
        )
    last_value = series[-1]

    current_number, template, padding = validate_increment_column(
        last_value, pattern, regex_pattern
    )

    # Generate new values starting from current + 1
    return generate_increment_values(
        start_number=current_number + 1,
        count=n_rows,
        template=template,
        padding=padding
    )
426
+
427
+
428
def parse_strategy_dict(strategy: Any) -> Dict[str, str]:
    """
    Normalize the ``strategy`` argument into a column -> spec mapping.

    Args:
        strategy: Either
            - str: one spec applied to every column (e.g. "auto")
            - dict: per-column specs, {"col1": "increment", "col2": "auto"}

    Returns:
        For a string, a new dict holding it under the "__default__" key.
        For a dict, the same dict object once every value is confirmed to
        be a string.

    Raises:
        ValidationError: If ``strategy`` is neither str nor dict, or a dict
            value is not a string
    """
    if isinstance(strategy, dict):
        # Report the first entry whose spec is not a string, if any.
        offender = next(
            (
                (column_name, spec)
                for column_name, spec in strategy.items()
                if not isinstance(spec, str)
            ),
            None,
        )
        if offender is not None:
            column_name, spec = offender
            raise ValidationError(
                f"Strategy for column '{column_name}' must be a string, got {type(spec)}"
            )
        return strategy

    if isinstance(strategy, str):
        # A bare string is the default for all columns.
        return {"__default__": strategy}

    raise ValidationError(
        f"Strategy must be str or dict, got {type(strategy)}"
    )
459
+
460
+
461
def get_column_strategy(column: str, strategy_dict: Dict[str, str]) -> str:
    """
    Look up the strategy spec that applies to one column.

    Args:
        column: Column name
        strategy_dict: Parsed strategy dictionary

    Returns:
        The column's own entry when present; otherwise the "__default__"
        entry; otherwise "auto"
    """
    fallback = strategy_dict.get("__default__", "auto")
    return strategy_dict.get(column, fallback)
477
+
478
+
479
def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Any]:
    """
    Parse choice strategy specification.

    Args:
        strategy_spec: Strategy string like:
            - "choice:[value1,value2,value3]"
            - "choice_list:banks"

    Returns:
        Tuple of (strategy_type, payload)
        - strategy_type: "choice" or "choice_list"
        - payload: for "choice", the list of inline values (each stripped
          of surrounding whitespace); for "choice_list", the list name as a
          string, to be resolved by the caller

    Raises:
        ValidationError: If strategy format is invalid

    Examples:
        >>> parse_choice_strategy("choice:[Active,Inactive,Pending]")
        ("choice", ["Active", "Inactive", "Pending"])

        >>> parse_choice_strategy("choice_list:banks")
        ("choice_list", "banks")
    """
    if strategy_spec.startswith("choice:["):
        # Inline list: choice:[value1,value2,value3]
        if not strategy_spec.endswith("]"):
            raise ValidationError(
                f"Invalid choice strategy: {strategy_spec}. "
                "Must be in format: choice:[value1,value2,value3]"
            )

        # Extract values between [ and ]
        values_str = strategy_spec[len("choice:["):-1]

        if not values_str.strip():
            raise ValidationError(
                f"Choice list cannot be empty: {strategy_spec}"
            )

        # str.split always yields at least one item, so no further
        # emptiness check is needed after the guard above.
        return "choice", [item.strip() for item in values_str.split(",")]

    if strategy_spec.startswith("choice_list:"):
        # Named list: choice_list:banks
        list_name = strategy_spec[len("choice_list:"):].strip()

        if not list_name:
            raise ValidationError(
                f"Invalid choice_list strategy: {strategy_spec}. "
                "Must be in format: choice_list:list_name"
            )

        return "choice_list", list_name

    raise ValidationError(
        f"Invalid choice strategy: {strategy_spec}. "
        "Must start with 'choice:[' or 'choice_list:'"
    )
546
+
547
+
548
def apply_range_strategy(
    min_val: int,
    max_val: int,
    n_rows: int,
    seed: Optional[int]
) -> List[int]:
    """
    Draw random integers from a closed interval.

    Args:
        min_val: Lower bound (inclusive)
        max_val: Upper bound (inclusive)
        n_rows: Number of values to draw
        seed: When not None, the module-level ``random`` generator is
            re-seeded with it before drawing

    Returns:
        List of ``n_rows`` integers, each within [min_val, max_val]

    Raises:
        ValidationError: If min_val >= max_val
    """
    if max_val <= min_val:
        raise ValidationError(
            f"Invalid range: min ({min_val}) must be less than max ({max_val})"
        )

    # Seeding the shared generator makes the draw sequence reproducible.
    if seed is not None:
        random.seed(seed)

    draws = []
    for _ in range(n_rows):
        draws.append(random.randint(min_val, max_val))
    return draws
590
+
591
+
592
def apply_choice_strategy(
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int]
) -> List[Any]:
    """
    Produce values by sampling, with replacement, from a choice list.

    Args:
        strategy_spec: Strategy specification, either an inline list
            (e.g., "choice:[A,B,C]") or a named list ("choice_list:name")
        n_rows: Number of values to generate
        seed: When not None, the module-level ``random`` generator is
            re-seeded with it before sampling

    Returns:
        List of ``n_rows`` randomly selected values

    Raises:
        ValidationError: If the spec is invalid or the named list cannot
            be resolved
    """
    kind, payload = parse_choice_strategy(strategy_spec)

    if kind not in ("choice", "choice_list"):
        raise ValidationError(f"Unknown choice strategy type: {kind}")

    if kind == "choice":
        # Inline values come straight from the spec.
        pool = payload
    else:
        # Named list: the payload is the list name to look up.
        try:
            pool = get_list(payload)
        except ValidationError as lookup_error:
            raise ValidationError(
                f"Cannot apply choice_list strategy: {lookup_error}"
            )

    # Python's own generator keeps results consistent across backends.
    if seed is not None:
        random.seed(seed)

    return random.choices(pool, k=n_rows)
637
+
638
+
639
+
640
def apply_forecast_strategy(
    df_polars: Any,
    column: str,
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int] = None
) -> List[Any]:
    """
    Forecast new values for a column.

    The spec has the form ``forecast:method[:key=value...]``, for example
    "forecast:linear" or "forecast:seasonal:period=12". The method name and
    the parsed options are handed to
    ``additory.augment.forecast.forecast_values``.

    Args:
        df_polars: Input Polars DataFrame
        column: Column name to forecast
        strategy_spec: Strategy specification
        n_rows: Number of values to forecast
        seed: Accepted for signature parity with the other strategies;
            not used by this function

    Returns:
        Whatever ``forecast_values`` returns for the request

    Raises:
        ValidationError: If the spec has no method segment, or the forecast
            call raises any exception
    """
    from additory.augment.forecast import forecast_values, ForecastMethod

    segments = strategy_spec.split(":")

    if len(segments) < 2:
        raise ValidationError(
            f"Invalid forecast strategy: {strategy_spec}. "
            "Expected format: forecast:method or forecast:method:param=value"
        )

    # segments[0] is the "forecast" keyword, segments[1] the method name.
    method = segments[1].strip()

    # Remaining segments are key=value options; others are skipped.
    options = {}
    for segment in segments[2:]:
        name, separator, raw = segment.strip().partition("=")
        if not separator:
            continue
        name = name.strip()
        raw = raw.strip()

        # A dot selects float parsing, otherwise int; unparsable text is
        # passed through as a string.
        convert = float if "." in raw else int
        try:
            options[name] = convert(raw)
        except ValueError:
            options[name] = raw

    try:
        return forecast_values(
            df_polars,
            column,
            n_rows,
            method=method,
            **options
        )
    except Exception as e:
        raise ValidationError(f"Forecast strategy failed: {e}")
722
+
723
+
724
def apply_distribution_strategy(
    df_polars: Any,
    column: str,
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int] = None
) -> List[Any]:
    """
    Generate values for a column from a statistical distribution.

    The spec has the form ``distribution[:auto][:key=value...]``, for
    example "normal", "normal:auto", "normal:mean=X:std=Y" or
    "uniform:min=X:max=Y". The existing column data, the seed and the
    parsed options are handed to
    ``additory.common.distributions.generate_distribution_values``.

    Args:
        df_polars: Input Polars DataFrame; the column is read via
            ``to_numpy()`` and passed on as ``data``
        column: Column name to generate from
        strategy_spec: Strategy specification (e.g., "normal:auto")
        n_rows: Number of values to generate
        seed: Random seed forwarded to the generator

    Returns:
        Whatever ``generate_distribution_values`` returns for the request

    Raises:
        ValidationError: If the spec is invalid or the generator raises
            any exception
    """
    from additory.common.distributions import generate_distribution_values

    segments = strategy_spec.split(":")

    if not segments:
        raise ValidationError(f"Invalid distribution strategy: {strategy_spec}")

    distribution = segments[0].strip()

    options = {}
    use_auto = False

    for segment in segments[1:]:
        token = segment.strip()

        # A bare "auto" segment (as in "normal:auto") switches to auto mode.
        if token == "auto":
            use_auto = True
            continue

        name, separator, raw = token.partition("=")
        if not separator:
            continue
        name = name.strip()
        raw = raw.strip()

        # A dot selects float parsing, otherwise int; unparsable text is
        # passed through as a string.
        convert = float if "." in raw else int
        try:
            options[name] = convert(raw)
        except ValueError:
            options[name] = raw

    # Existing column values, forwarded to the generator as ``data``.
    data = df_polars[column].to_numpy()

    # Auto mode replaces the named distribution with 'auto'.
    dist_type = 'auto' if use_auto else distribution

    try:
        return generate_distribution_values(
            n_rows,
            distribution=dist_type,
            data=data,
            seed=seed,
            **options
        )
    except Exception as e:
        raise ValidationError(f"Distribution strategy failed: {e}")
817
+
818
+
819
def apply_smote_strategy(
    df_polars: Any,
    columns: List[str],
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int] = None
) -> Dict[str, List[Any]]:
    """
    Generate synthetic rows for several columns with SMOTE.

    The spec has the form ``smote[:key=value...]``, for example
    "smote:k=5". The option key "k" is renamed to "k_neighbors" before the
    options are handed to ``additory.augment.smote.generate_smote_values``.

    Args:
        df_polars: Input Polars DataFrame
        columns: List of column names to use for SMOTE
        strategy_spec: Strategy specification
        n_rows: Number of synthetic samples to generate
        seed: Random seed forwarded to the generator

    Returns:
        Whatever ``generate_smote_values`` returns for the request

    Raises:
        ValidationError: If the generator raises any exception
    """
    from additory.augment.smote import generate_smote_values

    # Everything after the leading "smote" keyword is an option segment;
    # segments without "=" are skipped.
    options = {}
    for segment in strategy_spec.split(":")[1:]:
        name, separator, raw = segment.strip().partition("=")
        if not separator:
            continue
        name = name.strip()
        raw = raw.strip()

        # Short alias used in specs -> keyword expected by the generator.
        if name == "k":
            name = "k_neighbors"

        # Integers are converted; anything else is passed as a string.
        try:
            options[name] = int(raw)
        except ValueError:
            options[name] = raw

    try:
        return generate_smote_values(
            df_polars,
            columns,
            n_rows,
            seed=seed,
            **options
        )
    except Exception as e:
        raise ValidationError(f"SMOTE strategy failed: {e}")