additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,850 +0,0 @@
1
- """
2
- Strategy handlers for synthetic data generation
3
-
4
- Provides different strategies for generating synthetic data:
5
- - auto: Random sampling from existing values
6
- - increment: Increment numeric or pattern-based values
7
- - choice:[...]: Random selection from inline list
8
- """
9
-
10
- import re
11
- import random
12
- from typing import Any, Dict, List, Optional, Tuple
13
-
14
- from additory.common.exceptions import ValidationError, AugmentError
15
-
16
-
17
def parse_strategy_params(strategy_spec: str) -> Tuple[str, Dict[str, Any]]:
    """
    Split a strategy specification into its name and inline parameters.

    Two parameter syntaxes are understood:
    1. key=value segments: "increment:start=100:pattern=EMP_[001]"
    2. range shorthand:    "range:18-65"

    Args:
        strategy_spec: Colon-separated strategy string.

    Returns:
        Tuple of (strategy_name, params_dict)
        - strategy_name: First segment, stripped (e.g., "increment", "range")
        - params_dict: Parsed parameters. Values consisting only of digits
          (optionally preceded by '-') are converted to int; everything
          else is kept as a string.

    Raises:
        ValidationError: If the spec or its name is empty, the range
            shorthand is malformed, or a segment is not key=value.

    Examples:
        >>> parse_strategy_params("increment")
        ("increment", {})

        >>> parse_strategy_params("increment:start=100:pattern=EMP_[001]")
        ("increment", {"start": 100, "pattern": "EMP_[001]"})

        >>> parse_strategy_params("range:18-65")
        ("range", {"min": 18, "max": 65})
    """
    if not strategy_spec or not strategy_spec.strip():
        raise ValidationError("Empty strategy specification")

    segments = strategy_spec.split(":")
    strategy_name = segments[0].strip()
    if not strategy_name:
        raise ValidationError("Empty strategy name")

    raw_params = segments[1:]
    if not raw_params:
        # Bare strategy name, nothing to parse
        return strategy_name, {}

    # Shorthand "range:min-max" applies only when the single segment has no '='
    if strategy_name == "range" and len(raw_params) == 1:
        range_part = raw_params[0].strip()
        if "=" not in range_part:
            invalid_range = (
                f"Invalid range format: {range_part}. "
                "Expected format: range:min-max (e.g., range:18-65)"
            )
            # The regex keeps a leading '-' attached to each bound so that
            # negative limits such as "-5--1" split correctly.
            bounds = re.match(r'^(-?\d+)-(-?\d+)$', range_part)
            if bounds is None:
                raise ValidationError(invalid_range)
            try:
                low = int(bounds.group(1))
                high = int(bounds.group(2))
            except ValueError:
                raise ValidationError(invalid_range)
            return strategy_name, {"min": low, "max": high}

    params = {}
    for segment in raw_params:
        param_part = segment.strip()
        if "=" not in param_part:
            raise ValidationError(
                f"Invalid parameter format: '{param_part}'. "
                "Expected format: key=value (e.g., start=100)"
            )

        key, _, value = param_part.partition("=")
        key = key.strip()
        value = value.strip()

        if not key:
            raise ValidationError(
                f"Empty parameter key in: '{param_part}'"
            )
        if not value:
            raise ValidationError(
                f"Empty parameter value for key '{key}'"
            )

        # Integer-looking values become ints; int() can still reject
        # oddities such as "--5", in which case the raw string is kept.
        try:
            converted = int(value) if value.lstrip('-').isdigit() else value
        except ValueError:
            converted = value
        params[key] = converted

    return strategy_name, params
133
-
134
-
135
def parse_increment_strategy(strategy_spec: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Parse an increment strategy specification.

    Args:
        strategy_spec: Strategy string like:
            - "increment"
            - "increment:EMP_[001]_ID"   (bracket notation)
            - r"increment:A(\\d+)"        (regex with a capture group)

    Returns:
        Tuple of (pattern, regex_pattern)
        - pattern: Original pattern string (bracket notation only, else None)
        - regex_pattern: Regex source whose group 1 captures the counter

    Raises:
        ValidationError: If the text after "increment:" is neither bracket
            notation nor a regex containing a capture group.

    Examples:
        >>> parse_increment_strategy("increment")
        (None, None)

        >>> parse_increment_strategy("increment:EMP_[001]_ID")
        ("EMP_[001]_ID", r"EMP_(\\d{3})_ID")

        >>> parse_increment_strategy(r"increment:A(\\d+)")
        (None, r"A(\\d+)")
    """
    _, separator, pattern_str = strategy_spec.partition(":")
    if not separator:
        # Simple "increment" with no pattern
        return None, None

    pattern_str = pattern_str.strip()

    # Bracket notation: the first "[digits]" group marks the counter.
    bracket_match = re.search(r'\[(\d+)\]', pattern_str)
    if bracket_match:
        # Width of the sample number fixes the number of digits to match.
        padding = len(bracket_match.group(1))
        # Everything around the counter is literal text, so escape it;
        # otherwise "." or "+" in an ID template would act as regex
        # metacharacters. Only the matched group is substituted, so the
        # result always has exactly one capture group.
        prefix = re.escape(pattern_str[:bracket_match.start()])
        suffix = re.escape(pattern_str[bracket_match.end():])
        regex_pattern = prefix + f"(\\d{{{padding}}})" + suffix
        return pattern_str, regex_pattern

    # Otherwise treat it as a user-supplied regex, which needs a capture
    # group. Character classes such as [A-Z] are allowed here.
    if "(" in pattern_str and ")" in pattern_str:
        return None, pattern_str

    if "[" in pattern_str and "]" in pattern_str:
        raise ValidationError(
            f"Invalid bracket pattern: {pattern_str}. "
            "Brackets must contain digits, e.g., [001] or [123]"
        )

    raise ValidationError(
        f"Invalid pattern: {pattern_str}. "
        "Pattern must either use bracket notation [001] or regex with capture group (\\d+)"
    )
197
-
198
-
199
def validate_increment_column(
    last_value: Any,
    pattern: Optional[str],
    regex_pattern: Optional[str]
) -> Tuple[int, Optional[str], Optional[int]]:
    """
    Validate that a column can be incremented and extract current value.

    Args:
        last_value: Last value in the column
        pattern: Pattern string (if using bracket notation)
        regex_pattern: Regex pattern whose group 1 captures the number

    Returns:
        Tuple of (current_number, prefix_suffix_template, padding)
        - current_number: The numeric value to increment from
        - prefix_suffix_template: last_value with the captured number
          replaced by "{}" (e.g., "EMP_{}_ID"); None for pure numerics
        - padding: Width of the captured number when it starts with '0',
          otherwise None

    Raises:
        ValidationError: If the value is not numeric and no pattern was
            given, the regex is invalid or does not match, or no number
            can be extracted from group 1.
    """
    last_value_str = str(last_value)

    # Case 1: Pure numeric value
    if regex_pattern is None and pattern is None:
        try:
            return int(last_value), None, None
        except (ValueError, TypeError):
            raise ValidationError(
                f"Column has non-numeric last value '{last_value}'. "
                "For non-numeric columns, you must provide a pattern. "
                "Examples: 'increment:EMP_[001]_ID' or 'increment:A(\\d+)'"
            )

    # Case 2: Pattern-based value
    if regex_pattern is None:
        raise ValidationError("Pattern parsing failed - this should not happen")

    # The regex may come straight from the user, so a syntax error in it
    # is reported as a validation problem rather than a raw re.error.
    try:
        match = re.search(regex_pattern, last_value_str)
    except re.error as exc:
        raise ValidationError(
            f"Invalid regex pattern '{regex_pattern}': {exc}"
        )
    if not match:
        raise ValidationError(
            f"Pattern '{pattern or regex_pattern}' does not match last value '{last_value}'. "
            "Please verify the pattern matches your data."
        )

    # Extract the numeric part. IndexError: the regex has no group 1.
    # TypeError: group 1 is optional and did not participate (None).
    # ValueError: the captured text is not an integer.
    try:
        number_str = match.group(1)
        current_number = int(number_str)
    except (ValueError, TypeError, IndexError):
        raise ValidationError(
            f"Could not extract numeric value from '{last_value}' using pattern '{pattern or regex_pattern}'"
        )

    # Leading zero means the value is zero-padded to a fixed width.
    padding = len(number_str) if number_str.startswith('0') else None

    # Build the template from the matched positions in the real value,
    # so the surrounding text is reproduced exactly.
    start, end = match.span(1)
    template = last_value_str[:start] + '{}' + last_value_str[end:]

    return current_number, template, padding
268
-
269
-
270
def generate_increment_values(
    start_number: int,
    count: int,
    template: Optional[str],
    padding: Optional[int]
) -> List[Any]:
    """
    Produce a run of consecutive values beginning at start_number.

    Args:
        start_number: First number of the sequence
        count: How many values to produce
        template: Format string with one "{}" slot (e.g., "EMP_{}_ID"),
            or None to return the bare numbers
        padding: Zero-fill width applied to the number before it is placed
            into the template; a falsy value disables padding

    Returns:
        List of numbers when template is None, otherwise list of strings
    """
    sequence = (start_number + offset for offset in range(count))

    if template is None:
        # Pure numeric
        return list(sequence)

    def render(number: Any) -> str:
        # Pattern-based: pad if requested, then drop into the template
        digits = str(number)
        if padding:
            digits = digits.zfill(padding)
        return template.format(digits)

    return [render(number) for number in sequence]
307
-
308
-
309
def apply_increment_strategy(
    df_polars: Any,
    column: str,
    strategy_spec: str,
    n_rows: int,
    params: Optional[Dict[str, Any]] = None
) -> List[Any]:
    """
    Apply increment strategy to a column (Polars-only).

    Supports two modes:
    1. Extend mode: Increment from last value in df_polars
    2. Create mode: Start from specified value (requires params with 'start')

    Args:
        df_polars: Input Polars DataFrame (None in create mode)
        column: Column name to increment
        strategy_spec: Strategy specification (e.g., "increment:EMP_[001]_ID");
            only used in extend mode
        n_rows: Number of new values to generate
        params: Optional parameters dict with 'start' and/or 'pattern' keys;
            only used in create mode

    Returns:
        List of new values for the column

    Raises:
        ValidationError: If 'start' is missing or not a number in create
            mode, the pattern is not valid bracket notation, the column is
            empty in extend mode, or the last value cannot be incremented.

    Examples:
        # Extend mode (with DataFrame)
        >>> apply_increment_strategy(df, "id", "increment", 5)
        [11, 12, 13, 14, 15]  # if last value was 10

        # Create mode (no DataFrame, with start parameter)
        >>> apply_increment_strategy(None, "id", "increment", 5, {"start": 100})
        [100, 101, 102, 103, 104]

        # Create mode with pattern
        >>> apply_increment_strategy(None, "emp_id", "increment", 3,
        ...                          {"start": 1, "pattern": "EMP_[001]"})
        ["EMP_001", "EMP_002", "EMP_003"]
    """
    import numbers

    def _literal(text: str) -> str:
        # Escape braces so literal text survives str.format() later on.
        return text.replace("{", "{{").replace("}", "}}")

    # Determine mode: extend (has df) or create (no df)
    if df_polars is None:
        # Create mode: use start parameter
        if params is None or "start" not in params:
            raise ValidationError(
                "Increment strategy in create mode requires 'start' parameter. "
                "Use format: 'increment:start=N' or 'increment:start=N:pattern=P'"
            )

        start_number = params["start"]
        # The spec parser keeps non-numeric values (e.g. "start=abc") as
        # strings; reject them here instead of failing later on `str + int`.
        if not isinstance(start_number, numbers.Number):
            raise ValidationError(
                f"Increment 'start' must be a number, got {start_number!r}"
            )

        template = None
        padding = None

        if "pattern" in params:
            pattern_str = params["pattern"]

            # A purely numeric pattern arrives as int from the spec parser,
            # so check the type before looking for brackets.
            if (
                not isinstance(pattern_str, str)
                or "[" not in pattern_str
                or "]" not in pattern_str
            ):
                raise ValidationError(
                    f"Invalid pattern: {pattern_str}. "
                    "Pattern must use bracket notation [001]"
                )

            # Bracket notation: EMP_[001]
            bracket_match = re.search(r'\[(\d+)\]', pattern_str)
            if not bracket_match:
                raise ValidationError(
                    f"Invalid bracket pattern: {pattern_str}. "
                    "Brackets must contain digits, e.g., [001] or [123]"
                )

            padding = len(bracket_match.group(1))

            # Replace only the matched [NNN] with the single "{}" slot;
            # a repeated group elsewhere stays literal, so format() always
            # receives exactly one placeholder.
            template = (
                _literal(pattern_str[:bracket_match.start()])
                + "{}"
                + _literal(pattern_str[bracket_match.end():])
            )

        # Generate values starting from start_number
        return generate_increment_values(
            start_number=start_number,
            count=n_rows,
            template=template,
            padding=padding
        )

    # Extend mode: continue from the last existing value
    pattern, regex_pattern = parse_increment_strategy(strategy_spec)

    existing = df_polars[column]
    if len(existing) == 0:
        # Nothing to continue from; report it as a validation problem
        # rather than letting the [-1] lookup fail.
        raise ValidationError(
            f"Cannot increment column '{column}': it has no existing values. "
            "Use create mode with 'increment:start=N' instead."
        )
    last_value = existing[-1]

    # Validate and extract current value
    current_number, template, padding = validate_increment_column(
        last_value, pattern, regex_pattern
    )

    # Generate new values starting from current + 1
    return generate_increment_values(
        start_number=current_number + 1,
        count=n_rows,
        template=template,
        padding=padding
    )
424
-
425
-
426
def parse_strategy_dict(strategy: Any) -> Dict[str, str]:
    """
    Normalise the strategy argument into a column -> spec mapping.

    Args:
        strategy: Either
            - str: one spec applied to every column (e.g., "auto")
            - dict: {"col1": "increment", "col2": "auto", ...}

    Returns:
        For a string, {"__default__": strategy}; for a dict, the same dict
        object after its values have been checked.

    Raises:
        ValidationError: If strategy is neither str nor dict, or a dict
            value is not a string
    """
    if isinstance(strategy, dict):
        # Locate the first entry whose spec is not a string, if any
        offender = next(
            ((col, strat) for col, strat in strategy.items()
             if not isinstance(strat, str)),
            None,
        )
        if offender is not None:
            col, strat = offender
            raise ValidationError(
                f"Strategy for column '{col}' must be a string, got {type(strat)}"
            )
        return strategy

    if isinstance(strategy, str):
        # A single spec becomes the default for all columns
        return {"__default__": strategy}

    raise ValidationError(
        f"Strategy must be str or dict, got {type(strategy)}"
    )
457
-
458
-
459
def get_column_strategy(column: str, strategy_dict: Dict[str, str]) -> str:
    """
    Look up the strategy spec that applies to a column.

    Args:
        column: Column name
        strategy_dict: Parsed strategy dictionary

    Returns:
        The column's own entry if present; otherwise the "__default__"
        entry; otherwise "auto"
    """
    fallback = strategy_dict.get("__default__", "auto")
    return strategy_dict.get(column, fallback)
475
-
476
-
477
def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]:
    """
    Parse choice strategy specification.

    Args:
        strategy_spec: Strategy string like:
            - "choice:[value1,value2,value3]"

    Returns:
        Tuple of (strategy_type, values)
        - strategy_type: Always "choice"
        - values: Comma-separated items with surrounding whitespace
          stripped. Items are kept as strings; an empty item between two
          commas is preserved as "".

    Raises:
        ValidationError: If the spec does not start with "choice:[", does
            not end with "]", or has nothing between the brackets

    Examples:
        >>> parse_choice_strategy("choice:[Active,Inactive,Pending]")
        ("choice", ["Active", "Inactive", "Pending"])
    """
    prefix = "choice:["

    if not strategy_spec.startswith(prefix):
        raise ValidationError(
            f"Invalid choice strategy: {strategy_spec}. "
            "Must start with 'choice:['"
        )

    if not strategy_spec.endswith("]"):
        raise ValidationError(
            f"Invalid choice strategy: {strategy_spec}. "
            "Must be in format: choice:[value1,value2,value3]"
        )

    # Extract values between [ and ]
    values_str = strategy_spec[len(prefix):-1]

    if not values_str.strip():
        raise ValidationError(
            f"Choice list cannot be empty: {strategy_spec}"
        )

    # str.split always yields at least one item for a non-empty string,
    # so no further emptiness check is needed after this point.
    values = [item.strip() for item in values_str.split(",")]

    return "choice", values
528
-
529
-
530
def apply_range_strategy(
    min_val: int,
    max_val: int,
    n_rows: int,
    seed: Optional[int]
) -> List[int]:
    """
    Draw random integers from the closed interval [min_val, max_val].

    Args:
        min_val: Lower bound (inclusive)
        max_val: Upper bound (inclusive)
        n_rows: How many integers to draw
        seed: When not None, the module-level `random` generator is
            re-seeded with this value before drawing

    Returns:
        List of n_rows integers, each within the bounds

    Raises:
        ValidationError: If min_val >= max_val

    Examples:
        >>> values = apply_range_strategy(18, 65, 5, seed=42)
        >>> len(values), all(18 <= v <= 65 for v in values)
        (5, True)
    """
    if min_val >= max_val:
        raise ValidationError(
            f"Invalid range: min ({min_val}) must be less than max ({max_val})"
        )

    # Seeding goes through the shared generator, so the same seed yields
    # the same sequence on every call.
    if seed is not None:
        random.seed(seed)

    drawn = []
    for _ in range(n_rows):
        drawn.append(random.randint(min_val, max_val))
    return drawn
572
-
573
-
574
def apply_choice_strategy(
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int]
) -> List[Any]:
    """
    Pick values at random (with replacement) from an inline choice list.

    Args:
        strategy_spec: Strategy specification (e.g., "choice:[A,B,C]")
        n_rows: How many values to pick
        seed: When not None, the module-level `random` generator is
            re-seeded with this value before picking

    Returns:
        List of n_rows values taken from the parsed choice list

    Raises:
        ValidationError: If the spec is rejected by parse_choice_strategy
    """
    # Only the option list matters; the returned type tag is not used here.
    _, options = parse_choice_strategy(strategy_spec)

    # Python's own generator is used so results match across backends.
    if seed is not None:
        random.seed(seed)

    return random.choices(options, k=n_rows)
604
-
605
-
606
-
607
def apply_forecast_strategy(
    df_polars: Any,
    column: str,
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int] = None
) -> List[Any]:
    """
    Forecast new values for a column by delegating to forecast_values.

    The spec has the shape "forecast:<method>[:key=value...]". The method
    names listed below come from the original documentation; this function
    itself passes the method string through without checking it.
    - forecast:linear
    - forecast:polynomial
    - forecast:exponential
    - forecast:moving_average
    - forecast:seasonal
    - forecast:auto

    Args:
        df_polars: Input Polars DataFrame
        column: Column name to forecast
        strategy_spec: Strategy specification (e.g., "forecast:seasonal:period=12")
        n_rows: Number of values to forecast
        seed: Accepted for signature parity; not used by this function

    Returns:
        Whatever forecast_values returns (documented as a list of values)

    Raises:
        ValidationError: If the spec has no method segment, or
            forecast_values raises any exception

    Examples:
        >>> apply_forecast_strategy(df, "sales", "forecast:linear", 10)
        [105.2, 110.4, 115.6, ...]

        >>> apply_forecast_strategy(df, "sales", "forecast:seasonal:period=12", 24)
        [98.5, 102.3, 95.8, ...]
    """
    from additory.synthetic.forecast import forecast_values, ForecastMethod

    # Layout: segments[0] == "forecast", segments[1] == method, rest == params
    segments = strategy_spec.split(":")
    if len(segments) < 2:
        raise ValidationError(
            f"Invalid forecast strategy: {strategy_spec}. "
            "Expected format: forecast:method or forecast:method:param=value"
        )

    method = segments[1].strip()

    params = {}
    for segment in segments[2:]:
        key, equals, raw = segment.strip().partition("=")
        if not equals:
            # Segments without '=' are ignored
            continue

        key = key.strip()
        raw = raw.strip()

        # A dot selects float parsing, otherwise int; anything that fails
        # to parse is passed on as text.
        try:
            params[key] = float(raw) if "." in raw else int(raw)
        except ValueError:
            params[key] = raw

    try:
        return forecast_values(
            df_polars,
            column,
            n_rows,
            method=method,
            **params
        )
    except Exception as e:
        raise ValidationError(f"Forecast strategy failed: {e}")
689
-
690
-
691
def apply_distribution_strategy(
    df_polars: Any,
    column: str,
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int] = None
) -> List[Any]:
    """
    Apply distribution strategy to a column.

    The spec has the shape "<distribution>[:auto][:key=value...]". Forms
    listed in the original documentation:
    - normal (or normal:auto)
    - normal:mean=X:std=Y
    - uniform:min=X:max=Y
    - skewed_left:skewness=X
    - skewed_right:skewness=X

    Args:
        df_polars: Input Polars DataFrame (for parameter estimation)
        column: Column name to generate from
        strategy_spec: Strategy specification (e.g., "normal:auto")
        n_rows: Number of values to generate
        seed: Random seed, forwarded to generate_distribution_values

    Returns:
        Whatever generate_distribution_values returns (documented as a
        list of generated values)

    Raises:
        ValidationError: If the column data cannot be read or
            generate_distribution_values raises any exception

    Examples:
        >>> apply_distribution_strategy(df, "age", "normal:auto", 100, seed=42)
        [34.5, 28.9, 41.2, ...]

        >>> apply_distribution_strategy(df, "score", "uniform:min=0:max=100", 50)
        [45.2, 78.9, 12.3, ...]
    """
    from additory.common.distributions import generate_distribution_values

    # Layout: distribution[:auto][:param1=val1:param2=val2]
    # str.split always returns at least one element, so parts[0] exists.
    parts = strategy_spec.split(":")
    distribution = parts[0].strip()

    params = {}
    auto_mode = False

    for segment in parts[1:]:
        param_part = segment.strip()

        if param_part == "auto":
            # Special case: normal:auto
            auto_mode = True
            continue

        if "=" in param_part:
            key, value = param_part.split("=", 1)
            key = key.strip()
            value = value.strip()

            # A dot selects float parsing, otherwise int; anything that
            # fails to parse is passed on as text.
            try:
                params[key] = float(value) if "." in value else int(value)
            except ValueError:
                params[key] = value

    # "auto" replaces the named distribution in the downstream call.
    dist_type = 'auto' if auto_mode else distribution

    # The data lookup sits inside the try so that a missing frame or column
    # surfaces as the documented ValidationError, like generator failures.
    try:
        data = df_polars[column].to_numpy()
        return generate_distribution_values(
            n_rows,
            distribution=dist_type,
            data=data,
            seed=seed,
            **params
        )
    except Exception as e:
        raise ValidationError(f"Distribution strategy failed: {e}")
784
-
785
-
786
def apply_smote_strategy(
    df_polars: Any,
    columns: List[str],
    strategy_spec: str,
    n_rows: int,
    seed: Optional[int] = None
) -> Dict[str, List[Any]]:
    """
    Generate synthetic rows for several columns by delegating to
    generate_smote_values.

    The spec has the shape "smote[:key=value...]"; the key "k" is passed
    on under the name "k_neighbors".

    Args:
        df_polars: Input Polars DataFrame
        columns: List of column names to use for SMOTE
        strategy_spec: Strategy specification (e.g., "smote:k=5")
        n_rows: Number of synthetic samples to generate
        seed: Random seed, forwarded to generate_smote_values

    Returns:
        Whatever generate_smote_values returns (documented as a dictionary
        mapping column names to generated values)

    Raises:
        ValidationError: If generate_smote_values raises any exception

    Examples:
        >>> apply_smote_strategy(df, ["feature1", "feature2"], "smote:k=5", 100)
        {"feature1": [1.2, 3.4, ...], "feature2": [5.6, 7.8, ...]}
    """
    from additory.synthetic.smote import generate_smote_values

    # Everything after the first segment is a candidate parameter
    segments = strategy_spec.split(":")

    params = {}
    for segment in segments[1:]:
        key, equals, raw = segment.strip().partition("=")
        if not equals:
            # Segments without '=' are ignored
            continue

        key = key.strip()
        raw = raw.strip()

        # Short alias used in specs -> keyword expected downstream
        if key == "k":
            key = "k_neighbors"

        # Integers are converted; anything else is passed on as text
        try:
            params[key] = int(raw)
        except ValueError:
            params[key] = raw

    try:
        return generate_smote_values(
            df_polars,
            columns,
            n_rows,
            seed=seed,
            **params
        )
    except Exception as e:
        raise ValidationError(f"SMOTE strategy failed: {e}")