additory 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. additory/__init__.py +15 -0
  2. additory/analysis/__init__.py +48 -0
  3. additory/analysis/cardinality.py +126 -0
  4. additory/analysis/correlations.py +124 -0
  5. additory/analysis/distributions.py +376 -0
  6. additory/analysis/quality.py +158 -0
  7. additory/analysis/scan.py +400 -0
  8. additory/augment/__init__.py +24 -0
  9. additory/augment/augmentor.py +653 -0
  10. additory/augment/builtin_lists.py +430 -0
  11. additory/augment/distributions.py +22 -0
  12. additory/augment/forecast.py +1132 -0
  13. additory/augment/list_registry.py +177 -0
  14. additory/augment/smote.py +320 -0
  15. additory/augment/strategies.py +883 -0
  16. additory/common/__init__.py +157 -0
  17. additory/common/backend.py +355 -0
  18. additory/common/column_utils.py +191 -0
  19. additory/common/distributions.py +737 -0
  20. additory/common/exceptions.py +62 -0
  21. additory/common/lists.py +229 -0
  22. additory/common/patterns.py +240 -0
  23. additory/common/resolver.py +567 -0
  24. additory/common/sample_data.py +182 -0
  25. additory/common/validation.py +197 -0
  26. additory/core/__init__.py +27 -0
  27. additory/core/ast_builder.py +165 -0
  28. additory/core/backends/__init__.py +23 -0
  29. additory/core/backends/arrow_bridge.py +476 -0
  30. additory/core/backends/cudf_bridge.py +355 -0
  31. additory/core/column_positioning.py +358 -0
  32. additory/core/compiler_polars.py +166 -0
  33. additory/core/config.py +342 -0
  34. additory/core/enhanced_cache_manager.py +1119 -0
  35. additory/core/enhanced_matchers.py +473 -0
  36. additory/core/enhanced_version_manager.py +325 -0
  37. additory/core/executor.py +59 -0
  38. additory/core/integrity_manager.py +477 -0
  39. additory/core/loader.py +190 -0
  40. additory/core/logging.py +24 -0
  41. additory/core/memory_manager.py +547 -0
  42. additory/core/namespace_manager.py +657 -0
  43. additory/core/parser.py +176 -0
  44. additory/core/polars_expression_engine.py +551 -0
  45. additory/core/registry.py +176 -0
  46. additory/core/sample_data_manager.py +492 -0
  47. additory/core/user_namespace.py +751 -0
  48. additory/core/validator.py +27 -0
  49. additory/dynamic_api.py +308 -0
  50. additory/expressions/__init__.py +26 -0
  51. additory/expressions/engine.py +551 -0
  52. additory/expressions/parser.py +176 -0
  53. additory/expressions/proxy.py +546 -0
  54. additory/expressions/registry.py +313 -0
  55. additory/expressions/samples.py +492 -0
  56. additory/synthetic/__init__.py +101 -0
  57. additory/synthetic/api.py +220 -0
  58. additory/synthetic/common_integration.py +314 -0
  59. additory/synthetic/config.py +262 -0
  60. additory/synthetic/engines.py +529 -0
  61. additory/synthetic/exceptions.py +180 -0
  62. additory/synthetic/file_managers.py +518 -0
  63. additory/synthetic/generator.py +702 -0
  64. additory/synthetic/generator_parser.py +68 -0
  65. additory/synthetic/integration.py +319 -0
  66. additory/synthetic/models.py +241 -0
  67. additory/synthetic/pattern_resolver.py +573 -0
  68. additory/synthetic/performance.py +469 -0
  69. additory/synthetic/polars_integration.py +464 -0
  70. additory/synthetic/proxy.py +60 -0
  71. additory/synthetic/schema_parser.py +685 -0
  72. additory/synthetic/validator.py +553 -0
  73. additory/utilities/__init__.py +53 -0
  74. additory/utilities/encoding.py +600 -0
  75. additory/utilities/games.py +300 -0
  76. additory/utilities/keys.py +8 -0
  77. additory/utilities/lookup.py +103 -0
  78. additory/utilities/matchers.py +216 -0
  79. additory/utilities/resolvers.py +286 -0
  80. additory/utilities/settings.py +167 -0
  81. additory/utilities/units.py +746 -0
  82. additory/utilities/validators.py +153 -0
  83. additory-0.1.0a1.dist-info/METADATA +293 -0
  84. additory-0.1.0a1.dist-info/RECORD +87 -0
  85. additory-0.1.0a1.dist-info/WHEEL +5 -0
  86. additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
  87. additory-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,737 @@
1
+ """
2
+ Distribution Strategies for Data Augmentation
3
+
4
+ Provides statistical distribution-based data generation:
5
+ - Normal (Gaussian) distribution
6
+ - Uniform distribution
7
+ - Skewed distributions (left/right)
8
+ - Custom distributions based on existing data
9
+ """
10
+
11
+ from typing import List, Optional, Tuple
12
+ import warnings
13
+
14
+ import numpy as np
15
+
16
+ from additory.common.exceptions import ValidationError, AugmentError
17
+
18
+
19
class DistributionType:
    """String identifiers for the distributions this module can generate.

    These are plain ``str`` class attributes (not an ``Enum``), so callers
    may pass either the attribute or the equivalent literal string.
    """

    # Parametric families
    NORMAL = "normal"
    UNIFORM = "uniform"
    SKEWED_LEFT = "skewed_left"
    SKEWED_RIGHT = "skewed_right"
    BETA = "beta"
    GAMMA = "gamma"
    EXPONENTIAL = "exponential"

    # Non-parametric: resample observed data with added noise
    KDE = "kde"

    # Sentinel: pick a distribution by inspecting the supplied data
    AUTO = "auto"
30
+
31
+
32
def estimate_distribution_params(y: np.ndarray) -> Tuple[float, float, float, float]:
    """
    Summarise data with the four statistics the generators rely on.

    Args:
        y: Data values

    Returns:
        Tuple of (mean, std, min, max), each converted to a Python float.
        The standard deviation is whatever ``np.std`` returns with its
        default arguments.
    """
    center = float(np.mean(y))
    spread = float(np.std(y))
    lowest = float(np.min(y))
    highest = float(np.max(y))
    return center, spread, lowest, highest
43
+
44
+
45
def calculate_skewness(y: np.ndarray) -> float:
    """
    Calculate skewness of data.

    Skewness measures asymmetry of distribution:
    - 0: Symmetric (normal)
    - > 0: Right-skewed (tail on right)
    - < 0: Left-skewed (tail on left)

    This is the population form: the mean of the cubed z-scores, using the
    ``np.std`` default (no sample-size correction).

    Args:
        y: Data values. Any array-like of numbers is accepted; it is
            converted to a float array so plain lists work too.

    Returns:
        Skewness value. Returns 0.0 when there are fewer than three values
        or when the data has zero standard deviation.
    """
    # Coerce up front: the arithmetic below needs element-wise subtraction,
    # which a plain Python list does not support.
    values = np.asarray(y, dtype=float)

    n = len(values)
    if n < 3:
        return 0.0

    mean_y = np.mean(values)
    std_y = np.std(values)

    # Constant data: z-scores are undefined, treat as symmetric.
    if std_y == 0:
        return 0.0

    # Third standardized moment
    skew = np.sum(((values - mean_y) / std_y) ** 3) / n

    return float(skew)
74
+
75
+
76
def detect_distribution_type(y: np.ndarray) -> str:
    """
    Detect distribution type from data.

    Heuristic based on two statistics: the skewness reported by
    ``calculate_skewness`` and the ratio of standard deviation to range.

    Args:
        y: Data values

    Returns:
        Distribution type: normal, skewed_left, skewed_right, or uniform
    """
    skewness = calculate_skewness(y)

    # Uniform check: for a uniform distribution std / range is
    # 1 / sqrt(12) ~= 0.289, so accept a band around that value provided
    # the data is also close to symmetric.
    std_y = np.std(y)
    range_y = np.max(y) - np.min(y)

    if range_y > 0:
        spread_ratio = std_y / range_y
        if 0.25 < spread_ratio < 0.35 and abs(skewness) < 0.3:
            return DistributionType.UNIFORM

    # Near-symmetric data is treated as normal.
    if abs(skewness) < 0.5:
        return DistributionType.NORMAL

    # From here |skewness| >= 0.5, so the sign alone decides the direction.
    # Testing `> 0` (instead of `> 0.5`) keeps a skewness of exactly +0.5
    # from being misclassified as left-skewed.
    if skewness > 0:
        return DistributionType.SKEWED_RIGHT
    return DistributionType.SKEWED_LEFT
105
+
106
+
107
def generate_normal(
    n_rows: int,
    mean: Optional[float] = None,
    std: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None,
    clip: bool = True
) -> List[float]:
    """
    Generate values from normal (Gaussian) distribution.

    Args:
        n_rows: Number of values to generate
        mean: Mean of distribution (estimated from data if None)
        std: Standard deviation (estimated from data if None)
        data: Existing data to estimate parameters from
        seed: Random seed for reproducibility. When given, NumPy's global
            random state is reseeded via ``np.random.seed``.
        clip: Whether to clip values to data range (only applied when
            ``data`` is provided)

    Returns:
        List of generated values

    Raises:
        ValidationError: If neither parameters nor data provided, or if the
            standard deviation is not a positive number
    """
    # Fill in whichever of mean/std the caller left out.
    if mean is None or std is None:
        if data is None:
            raise ValidationError(
                "Must provide either (mean, std) or data for normal distribution"
            )

        est_mean, est_std, _, _ = estimate_distribution_params(data)

        if mean is None:
            mean = est_mean
        if std is None:
            std = est_std

    # `not std > 0` rather than `std <= 0`: the former also rejects NaN,
    # which would otherwise slip through and yield an all-NaN sample.
    if not std > 0:
        raise ValidationError(f"Standard deviation must be positive, got {std}")

    if seed is not None:
        np.random.seed(seed)

    values = np.random.normal(mean, std, n_rows)

    # Keep generated values inside the observed range when asked to.
    if clip and data is not None:
        values = np.clip(values, np.min(data), np.max(data))

    return values.tolist()
163
+
164
+
165
def generate_uniform(
    n_rows: int,
    min_val: Optional[float] = None,
    max_val: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None
) -> List[float]:
    """
    Draw values from a uniform distribution between two bounds.

    Args:
        n_rows: Number of values to generate
        min_val: Lower bound (taken from the data minimum if None)
        max_val: Upper bound (taken from the data maximum if None)
        data: Existing data used to fill in any missing bound
        seed: Random seed for reproducibility. When given, NumPy's global
            random state is reseeded via ``np.random.seed``.

    Returns:
        List of generated values

    Raises:
        ValidationError: If a bound is missing and no data is provided, or
            if ``min_val`` is not strictly below ``max_val``
    """
    bounds_incomplete = min_val is None or max_val is None
    if bounds_incomplete:
        if data is None:
            raise ValidationError(
                "Must provide either (min_val, max_val) or data for uniform distribution"
            )

        # Only the min/max entries of the summary are needed here.
        observed = estimate_distribution_params(data)
        min_val = observed[2] if min_val is None else min_val
        max_val = observed[3] if max_val is None else max_val

    if max_val <= min_val:
        raise ValidationError(
            f"min_val must be less than max_val, got min={min_val}, max={max_val}"
        )

    if seed is not None:
        np.random.seed(seed)

    return np.random.uniform(min_val, max_val, n_rows).tolist()
215
+
216
+
217
def generate_skewed(
    n_rows: int,
    direction: str,
    mean: Optional[float] = None,
    std: Optional[float] = None,
    skewness: float = 1.0,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None,
    clip: bool = True
) -> List[float]:
    """
    Generate values from skewed distribution.

    Uses log-normal distribution for right skew and reflected log-normal
    (mirrored around ``mean``) for left skew.

    Args:
        n_rows: Number of values to generate
        direction: 'left' or 'right'
        mean: Target mean (estimated from data if None). Must be positive,
            because the log-normal parameters are derived from ``log(mean)``.
        std: Target standard deviation (estimated from data if None)
        skewness: Multiplier applied to the log-normal sigma (default: 1.0).
            Only its absolute value is used.
        data: Existing data to estimate parameters from
        seed: Random seed for reproducibility. When given, NumPy's global
            random state is reseeded via ``np.random.seed``.
        clip: Whether to clip values to data range (only applied when
            ``data`` is provided)

    Returns:
        List of generated values

    Raises:
        ValidationError: If direction is unknown, if parameters cannot be
            determined, or if mean/std are not positive numbers
    """
    if direction not in ['left', 'right']:
        raise ValidationError(f"Direction must be 'left' or 'right', got '{direction}'")

    # Fill in whichever of mean/std the caller left out.
    if mean is None or std is None:
        if data is None:
            raise ValidationError(
                "Must provide either (mean, std) or data for skewed distribution"
            )

        est_mean, est_std, _, _ = estimate_distribution_params(data)

        if mean is None:
            mean = est_mean
        if std is None:
            std = est_std

    # `not x > 0` also rejects NaN, unlike `x <= 0`.
    if not std > 0:
        raise ValidationError(f"Standard deviation must be positive, got {std}")

    # The fit below evaluates log(mean) and std / mean; a zero or negative
    # mean would give a division error or an all-NaN sample.
    if not mean > 0:
        raise ValidationError(
            f"Skewed (log-normal) generation requires a positive mean, got {mean}"
        )

    if seed is not None:
        np.random.seed(seed)

    # Log-normal parameters whose distribution has the requested mean/std.
    sigma = np.sqrt(np.log(1 + (std / mean) ** 2))
    mu = np.log(mean) - 0.5 * sigma ** 2

    # NOTE: sigma is rescaled after mu was derived, so for |skewness| != 1
    # the sample no longer matches the requested mean/std exactly.
    sigma *= abs(skewness)

    values = np.random.lognormal(mu, sigma, n_rows)
    if direction == 'left':
        # Mirror around the mean so the long tail points left.
        values = 2 * mean - values

    # Keep generated values inside the observed range when asked to.
    if clip and data is not None:
        values = np.clip(values, np.min(data), np.max(data))

    return values.tolist()
298
+
299
+
300
def generate_beta(
    n_rows: int,
    alpha: Optional[float] = None,
    beta_param: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None,
    scale_min: float = 0.0,
    scale_max: float = 1.0
) -> List[float]:
    """
    Generate values from beta distribution.

    Beta distribution is bounded between 0 and 1 (or scaled range).
    Useful for percentages, probabilities, proportions.

    Args:
        n_rows: Number of values to generate
        alpha: Shape parameter (> 0). Estimated from data if None.
        beta_param: Shape parameter (> 0). Estimated from data if None.
        data: Existing data to estimate parameters from. A shape parameter
            the caller did supply is kept as given; only the missing one
            is estimated.
        seed: Random seed for reproducibility. When given, NumPy's global
            random state is reseeded via ``np.random.seed``.
        scale_min: Minimum value for scaling (default: 0)
        scale_max: Maximum value for scaling (default: 1)
            Both scale bounds are replaced by the data range whenever
            parameters are estimated from data.

    Returns:
        List of generated values

    Raises:
        ValidationError: If parameters are missing or not positive, or if
            the data has no spread
    """
    if alpha is None or beta_param is None:
        if data is None:
            raise ValidationError(
                "Must provide either (alpha, beta) or data for beta distribution"
            )

        sample = np.asarray(data, dtype=float)

        # Normalize data to [0, 1]
        data_min = np.min(sample)
        data_max = np.max(sample)

        if data_max == data_min:
            raise ValidationError("Data has no variance, cannot fit beta distribution")

        normalized = (sample - data_min) / (data_max - data_min)

        # Method of moments, with mean/variance nudged away from the
        # degenerate edges so both estimates stay finite and positive.
        mean = np.clip(np.mean(normalized), 0.01, 0.99)
        var = np.clip(np.var(normalized), 0.001, mean * (1 - mean) * 0.99)
        concentration = (mean * (1 - mean) / var) - 1

        # Respect a parameter the caller passed explicitly; estimate only
        # the one that is missing.
        if alpha is None:
            alpha = mean * concentration
        if beta_param is None:
            beta_param = (1 - mean) * concentration

        # Use data range for scaling
        scale_min = data_min
        scale_max = data_max

    # Written as `not (... > 0)` so NaN is rejected as well.
    if not (alpha > 0 and beta_param > 0):
        raise ValidationError(
            f"Alpha and beta must be positive, got alpha={alpha}, beta={beta_param}"
        )

    if seed is not None:
        np.random.seed(seed)

    values = np.random.beta(alpha, beta_param, n_rows)

    # Stretch the unit interval onto [scale_min, scale_max].
    values = values * (scale_max - scale_min) + scale_min

    return values.tolist()
378
+
379
+
380
def generate_gamma(
    n_rows: int,
    shape: Optional[float] = None,
    scale: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None
) -> List[float]:
    """
    Generate values from gamma distribution.

    Gamma distribution is for positive values, often right-skewed.
    Useful for waiting times, sizes, amounts.

    Args:
        n_rows: Number of values to generate
        shape: Shape parameter (k, > 0). Estimated from data if None.
        scale: Scale parameter (theta, > 0). Estimated from data if None.
        data: Existing data to estimate parameters from. A parameter the
            caller did supply is kept as given; only the missing one is
            estimated.
        seed: Random seed for reproducibility. When given, NumPy's global
            random state is reseeded via ``np.random.seed``.

    Returns:
        List of generated values

    Raises:
        ValidationError: If parameters are missing or not positive, or if
            the data contains non-positive values or has no variance
    """
    if shape is None or scale is None:
        if data is None:
            raise ValidationError(
                "Must provide either (shape, scale) or data for gamma distribution"
            )

        # Coerce so the element-wise comparison below also works for lists.
        sample = np.asarray(data, dtype=float)

        if np.any(sample <= 0):
            raise ValidationError(
                "Gamma distribution requires all positive values"
            )

        # Method of moments estimation
        mean = np.mean(sample)
        var = np.var(sample)

        if var == 0:
            raise ValidationError("Data has no variance, cannot fit gamma distribution")

        # shape = mean^2 / var, scale = var / mean
        # Respect a parameter the caller passed explicitly.
        if shape is None:
            shape = (mean ** 2) / var
        if scale is None:
            scale = var / mean

    # Written as `not (... > 0)` so NaN is rejected as well.
    if not (shape > 0 and scale > 0):
        raise ValidationError(
            f"Shape and scale must be positive, got shape={shape}, scale={scale}"
        )

    if seed is not None:
        np.random.seed(seed)

    values = np.random.gamma(shape, scale, n_rows)

    return values.tolist()
443
+
444
+
445
def generate_exponential_dist(
    n_rows: int,
    rate: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None
) -> List[float]:
    """
    Generate values from exponential distribution.

    Exponential distribution models time between events.
    Memoryless property. Always positive.

    Args:
        n_rows: Number of values to generate
        rate: Rate parameter (lambda, > 0). Mean = 1/rate
        data: Existing data to estimate the rate from when ``rate`` is
            None. Any array-like of positive numbers is accepted.
        seed: Random seed for reproducibility. When given, NumPy's global
            random state is reseeded via ``np.random.seed``.

    Returns:
        List of generated values

    Raises:
        ValidationError: If neither rate nor data is provided, if the data
            is empty or contains non-positive values, or if the rate is
            not a positive number
    """
    if rate is None:
        if data is None:
            raise ValidationError(
                "Must provide either rate or data for exponential distribution"
            )

        # Coerce so the element-wise comparison below also works for lists.
        sample = np.asarray(data, dtype=float)

        # An empty sample has no mean to estimate the rate from.
        if sample.size == 0:
            raise ValidationError(
                "Exponential distribution requires non-empty data"
            )

        if np.any(sample <= 0):
            raise ValidationError(
                "Exponential distribution requires all positive values"
            )

        # Maximum likelihood estimation: rate = 1 / mean
        rate = 1.0 / np.mean(sample)

    # `not rate > 0` also rejects NaN, unlike `rate <= 0`.
    if not rate > 0:
        raise ValidationError(f"Rate must be positive, got {rate}")

    if seed is not None:
        np.random.seed(seed)

    # numpy parameterizes by scale = 1/rate
    values = np.random.exponential(1.0 / rate, n_rows)

    return values.tolist()
499
+
500
+
501
def generate_kde(
    n_rows: int,
    data: np.ndarray,
    bandwidth: Optional[float] = None,
    seed: Optional[int] = None
) -> List[float]:
    """
    Generate values using Kernel Density Estimation.

    Implemented as a smoothed bootstrap: observed values are resampled
    with replacement and Gaussian noise with standard deviation
    ``bandwidth`` is added to each draw.

    Args:
        n_rows: Number of values to generate
        data: Existing data to learn from (required). Any array-like of
            numbers is accepted; it is converted to a float array.
        bandwidth: KDE bandwidth (auto-selected with Silverman's rule of
            thumb if None)
        seed: Random seed for reproducibility. When given, NumPy's global
            random state is reseeded via ``np.random.seed``.

    Returns:
        List of generated values

    Raises:
        ValidationError: If data is missing, has fewer than 3 points, or
            the resulting bandwidth is not positive
    """
    if data is None or len(data) == 0:
        raise ValidationError("KDE requires existing data")

    if len(data) < 3:
        raise ValidationError(f"KDE requires at least 3 data points, got {len(data)}")

    # Fancy indexing below needs an ndarray; a plain list would fail.
    sample = np.asarray(data, dtype=float)

    # Auto-select bandwidth using Silverman's rule of thumb
    if bandwidth is None:
        bandwidth = 1.06 * np.std(sample) * (len(sample) ** (-1 / 5))

    # A zero bandwidth falls back to a tenth of the data range. For
    # constant data that range is zero too, and the check below raises.
    if bandwidth == 0:
        bandwidth = 0.1 * (np.max(sample) - np.min(sample))

    if bandwidth <= 0:
        raise ValidationError(f"Bandwidth must be positive, got {bandwidth}")

    if seed is not None:
        np.random.seed(seed)

    # Resample observed points, then jitter each by N(0, bandwidth).
    # The order of the two random draws is kept so seeded output is stable.
    sampled_indices = np.random.choice(len(sample), size=n_rows, replace=True)
    noise = np.random.normal(0, bandwidth, n_rows)

    return (sample[sampled_indices] + noise).tolist()
557
+
558
+
559
def generate_multivariate_normal(
    n_rows: int,
    columns: List[str],
    data: np.ndarray,
    seed: Optional[int] = None
) -> np.ndarray:
    """
    Generate correlated values using multivariate normal distribution.

    The mean vector and covariance matrix are estimated from ``data`` and
    new rows are drawn from the corresponding multivariate normal.

    Args:
        n_rows: Number of rows to generate
        columns: List of column names (used only to check that its length
            matches the number of data columns)
        data: Existing data (2D array, shape: [n_samples, n_features])
        seed: Random seed for reproducibility. When given, NumPy's global
            random state is reseeded via ``np.random.seed``.

    Returns:
        2D array of generated values (shape: [n_rows, n_features])

    Raises:
        ValidationError: If data is missing, not 2D, or its column count
            does not match ``columns``
    """
    if data is None or len(data) == 0:
        raise ValidationError("Multivariate normal requires existing data")

    data = np.asarray(data)

    if data.ndim != 2:
        raise ValidationError(f"Data must be 2D array, got shape {data.shape}")

    if data.shape[1] != len(columns):
        raise ValidationError(
            f"Number of columns ({len(columns)}) doesn't match data dimensions ({data.shape[1]})"
        )

    # Estimate mean and covariance. For a single feature np.cov returns a
    # 0-d array, so promote it to a 1x1 matrix for the linear algebra below.
    mean = np.mean(data, axis=0)
    cov = np.atleast_2d(np.cov(data, rowvar=False))

    # Covariance matrices are symmetric, so use the symmetric eigen-solver:
    # it always returns real eigenvalues, whereas the general solver can
    # return complex values that break the `< 0` comparison.
    min_eig = np.min(np.linalg.eigvalsh(cov))
    if min_eig < 0:
        # Shift the spectrum just above zero so sampling is well-defined.
        cov = cov + np.eye(cov.shape[0]) * (abs(min_eig) + 1e-6)

    if seed is not None:
        np.random.seed(seed)

    return np.random.multivariate_normal(mean, cov, n_rows)
610
+
611
+
612
def generate_distribution_values(
    n_rows: int,
    distribution: str = DistributionType.AUTO,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None,
    **params
) -> List[float]:
    """
    Main distribution generation function.

    Resolves ``distribution`` (auto-detecting it from ``data`` if asked)
    and delegates to the matching ``generate_*`` function in this module.

    Args:
        n_rows: Number of values to generate
        distribution: Distribution type (normal, uniform, skewed_left, skewed_right,
            beta, gamma, exponential, kde, auto)
        data: Existing data to estimate parameters from (required for auto and kde)
        seed: Random seed for reproducibility
        **params: Distribution-specific parameters:
            - mean, std: For normal
            - min_val, max_val: For uniform
            - mean, std, skewness: For skewed (skewness default: 1.0)
            - alpha, beta, scale_min, scale_max: For beta
            - shape, scale: For gamma
            - rate: For exponential
            - bandwidth: For kde
            - clip: Whether to clip to data range (default: True;
              used by normal and skewed)

    Returns:
        List of generated values

    Raises:
        ValidationError: If parameters invalid or the distribution name is
            unknown
        AugmentError: If generation fails for any other reason; the
            original exception is attached as ``__cause__``
    """
    # Auto-detect distribution if requested
    if distribution == DistributionType.AUTO:
        if data is None:
            raise ValidationError(
                "Auto distribution detection requires existing data"
            )

        distribution = detect_distribution_type(data)
        # Kept as a stdout message to preserve the existing user-visible output.
        print(f"Auto-detected distribution: {distribution}")

    try:
        if distribution == DistributionType.NORMAL:
            return generate_normal(
                n_rows,
                mean=params.get('mean'),
                std=params.get('std'),
                data=data,
                seed=seed,
                clip=params.get('clip', True)
            )

        if distribution == DistributionType.UNIFORM:
            return generate_uniform(
                n_rows,
                min_val=params.get('min_val'),
                max_val=params.get('max_val'),
                data=data,
                seed=seed
            )

        if distribution in (DistributionType.SKEWED_LEFT, DistributionType.SKEWED_RIGHT):
            direction = 'left' if distribution == DistributionType.SKEWED_LEFT else 'right'
            return generate_skewed(
                n_rows,
                direction=direction,
                mean=params.get('mean'),
                std=params.get('std'),
                skewness=params.get('skewness', 1.0),
                data=data,
                seed=seed,
                clip=params.get('clip', True)
            )

        if distribution == DistributionType.BETA:
            # The public keyword is 'beta'; the generator names it
            # beta_param.
            return generate_beta(
                n_rows,
                alpha=params.get('alpha'),
                beta_param=params.get('beta'),
                data=data,
                seed=seed,
                scale_min=params.get('scale_min', 0.0),
                scale_max=params.get('scale_max', 1.0)
            )

        if distribution == DistributionType.GAMMA:
            return generate_gamma(
                n_rows,
                shape=params.get('shape'),
                scale=params.get('scale'),
                data=data,
                seed=seed
            )

        if distribution == DistributionType.EXPONENTIAL:
            return generate_exponential_dist(
                n_rows,
                rate=params.get('rate'),
                data=data,
                seed=seed
            )

        if distribution == DistributionType.KDE:
            if data is None:
                raise ValidationError("KDE requires existing data")
            return generate_kde(
                n_rows,
                data=data,
                bandwidth=params.get('bandwidth'),
                seed=seed
            )

        raise ValidationError(
            f"Unknown distribution type: '{distribution}'. "
            f"Supported: normal, uniform, skewed_left, skewed_right, "
            f"beta, gamma, exponential, kde, auto"
        )

    except (ValidationError, AugmentError):
        # Domain errors already carry a meaningful message; pass them through.
        raise
    except Exception as e:
        # Wrap anything unexpected, keeping the original as the cause so
        # the underlying traceback is not lost.
        raise AugmentError(f"Distribution generation failed: {e}") from e