additory 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +15 -0
- additory/analysis/__init__.py +48 -0
- additory/analysis/cardinality.py +126 -0
- additory/analysis/correlations.py +124 -0
- additory/analysis/distributions.py +376 -0
- additory/analysis/quality.py +158 -0
- additory/analysis/scan.py +400 -0
- additory/augment/__init__.py +24 -0
- additory/augment/augmentor.py +653 -0
- additory/augment/builtin_lists.py +430 -0
- additory/augment/distributions.py +22 -0
- additory/augment/forecast.py +1132 -0
- additory/augment/list_registry.py +177 -0
- additory/augment/smote.py +320 -0
- additory/augment/strategies.py +883 -0
- additory/common/__init__.py +157 -0
- additory/common/backend.py +355 -0
- additory/common/column_utils.py +191 -0
- additory/common/distributions.py +737 -0
- additory/common/exceptions.py +62 -0
- additory/common/lists.py +229 -0
- additory/common/patterns.py +240 -0
- additory/common/resolver.py +567 -0
- additory/common/sample_data.py +182 -0
- additory/common/validation.py +197 -0
- additory/core/__init__.py +27 -0
- additory/core/ast_builder.py +165 -0
- additory/core/backends/__init__.py +23 -0
- additory/core/backends/arrow_bridge.py +476 -0
- additory/core/backends/cudf_bridge.py +355 -0
- additory/core/column_positioning.py +358 -0
- additory/core/compiler_polars.py +166 -0
- additory/core/config.py +342 -0
- additory/core/enhanced_cache_manager.py +1119 -0
- additory/core/enhanced_matchers.py +473 -0
- additory/core/enhanced_version_manager.py +325 -0
- additory/core/executor.py +59 -0
- additory/core/integrity_manager.py +477 -0
- additory/core/loader.py +190 -0
- additory/core/logging.py +24 -0
- additory/core/memory_manager.py +547 -0
- additory/core/namespace_manager.py +657 -0
- additory/core/parser.py +176 -0
- additory/core/polars_expression_engine.py +551 -0
- additory/core/registry.py +176 -0
- additory/core/sample_data_manager.py +492 -0
- additory/core/user_namespace.py +751 -0
- additory/core/validator.py +27 -0
- additory/dynamic_api.py +308 -0
- additory/expressions/__init__.py +26 -0
- additory/expressions/engine.py +551 -0
- additory/expressions/parser.py +176 -0
- additory/expressions/proxy.py +546 -0
- additory/expressions/registry.py +313 -0
- additory/expressions/samples.py +492 -0
- additory/synthetic/__init__.py +101 -0
- additory/synthetic/api.py +220 -0
- additory/synthetic/common_integration.py +314 -0
- additory/synthetic/config.py +262 -0
- additory/synthetic/engines.py +529 -0
- additory/synthetic/exceptions.py +180 -0
- additory/synthetic/file_managers.py +518 -0
- additory/synthetic/generator.py +702 -0
- additory/synthetic/generator_parser.py +68 -0
- additory/synthetic/integration.py +319 -0
- additory/synthetic/models.py +241 -0
- additory/synthetic/pattern_resolver.py +573 -0
- additory/synthetic/performance.py +469 -0
- additory/synthetic/polars_integration.py +464 -0
- additory/synthetic/proxy.py +60 -0
- additory/synthetic/schema_parser.py +685 -0
- additory/synthetic/validator.py +553 -0
- additory/utilities/__init__.py +53 -0
- additory/utilities/encoding.py +600 -0
- additory/utilities/games.py +300 -0
- additory/utilities/keys.py +8 -0
- additory/utilities/lookup.py +103 -0
- additory/utilities/matchers.py +216 -0
- additory/utilities/resolvers.py +286 -0
- additory/utilities/settings.py +167 -0
- additory/utilities/units.py +746 -0
- additory/utilities/validators.py +153 -0
- additory-0.1.0a1.dist-info/METADATA +293 -0
- additory-0.1.0a1.dist-info/RECORD +87 -0
- additory-0.1.0a1.dist-info/WHEEL +5 -0
- additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
- additory-0.1.0a1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,737 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Distribution Strategies for Data Augmentation
|
|
3
|
+
|
|
4
|
+
Provides statistical distribution-based data generation:
|
|
5
|
+
- Normal (Gaussian) distribution
|
|
6
|
+
- Uniform distribution
|
|
7
|
+
- Skewed distributions (left/right)
|
|
8
|
+
- Custom distributions based on existing data
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from typing import List, Optional, Tuple
|
|
12
|
+
import warnings
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
from additory.common.exceptions import ValidationError, AugmentError
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DistributionType:
    """Supported distribution types.

    String constants used throughout this module to select a generation
    strategy; `generate_distribution_values` dispatches on these values.
    """

    # Parametric families with explicit parameters (mean/std, bounds, etc.).
    NORMAL = "normal"
    UNIFORM = "uniform"
    SKEWED_LEFT = "skewed_left"
    SKEWED_RIGHT = "skewed_right"
    BETA = "beta"
    GAMMA = "gamma"
    EXPONENTIAL = "exponential"
    # Non-parametric: kernel density estimation learned from existing data.
    KDE = "kde"
    # Sentinel: detect a suitable distribution from the data automatically.
    AUTO = "auto"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def estimate_distribution_params(y: np.ndarray) -> Tuple[float, float, float, float]:
    """
    Summarize a sample with the basic statistics used for parameter fitting.

    Args:
        y: Data values

    Returns:
        Tuple of (mean, std, min, max) as plain Python floats
    """
    summary = (np.mean(y), np.std(y), np.min(y), np.max(y))
    return tuple(float(value) for value in summary)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def calculate_skewness(y: np.ndarray) -> float:
    """
    Compute the (biased) sample skewness of the data.

    Skewness measures the asymmetry of a distribution:
    - 0: symmetric (normal-like)
    - > 0: right-skewed (long tail on the right)
    - < 0: left-skewed (long tail on the left)

    Args:
        y: Data values

    Returns:
        Skewness value (0.0 for degenerate inputs)
    """
    count = len(y)
    # Skewness is not meaningful for fewer than three observations.
    if count < 3:
        return 0.0

    center = np.mean(y)
    spread = np.std(y)
    # A constant series has no asymmetry to measure (and would divide by zero).
    if spread == 0:
        return 0.0

    # Third standardized moment: mean of the cubed z-scores.
    z_scores = (y - center) / spread
    return float(np.mean(z_scores ** 3))
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def detect_distribution_type(y: np.ndarray) -> str:
    """
    Heuristically classify the distribution shape of the data.

    Args:
        y: Data values

    Returns:
        Distribution type: normal, skewed_left, skewed_right, or uniform
    """
    skew = calculate_skewness(y)

    # A uniform distribution over [a, b] has std/range == 1/sqrt(12) ~= 0.289,
    # so a ratio near that value with negligible skew suggests uniformity.
    spread = np.std(y)
    span = np.max(y) - np.min(y)
    if span > 0:
        ratio = spread / span
        if 0.25 < ratio < 0.35 and abs(skew) < 0.3:
            return DistributionType.UNIFORM

    # Otherwise classify purely by the sign and magnitude of the skewness.
    if abs(skew) < 0.5:
        return DistributionType.NORMAL
    if skew > 0.5:
        return DistributionType.SKEWED_RIGHT
    return DistributionType.SKEWED_LEFT
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def generate_normal(
    n_rows: int,
    mean: Optional[float] = None,
    std: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None,
    clip: bool = True
) -> List[float]:
    """
    Generate values from a normal (Gaussian) distribution.

    Args:
        n_rows: Number of values to generate
        mean: Mean of distribution (estimated from data if None)
        std: Standard deviation (estimated from data if None)
        data: Existing data to estimate parameters from
        seed: Random seed for reproducibility
        clip: Whether to clip values to the observed data range

    Returns:
        List of generated values

    Raises:
        ValidationError: If neither parameters nor data provided, or std <= 0
    """
    # Fill in any missing parameter from the reference data.
    if mean is None or std is None:
        if data is None:
            raise ValidationError(
                "Must provide either (mean, std) or data for normal distribution"
            )

        est_mean, est_std, data_min, data_max = estimate_distribution_params(data)
        mean = est_mean if mean is None else mean
        std = est_std if std is None else std

    if std <= 0:
        raise ValidationError(f"Standard deviation must be positive, got {std}")

    # Legacy global-state seeding is kept for reproducibility with callers
    # that rely on np.random.seed semantics.
    if seed is not None:
        np.random.seed(seed)

    samples = np.random.normal(mean, std, n_rows)

    # Optionally bound the synthetic values by the observed data range.
    if clip and data is not None:
        samples = np.clip(samples, np.min(data), np.max(data))

    return samples.tolist()
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def generate_uniform(
    n_rows: int,
    min_val: Optional[float] = None,
    max_val: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None
) -> List[float]:
    """
    Generate values from a uniform distribution.

    Args:
        n_rows: Number of values to generate
        min_val: Minimum value (estimated from data if None)
        max_val: Maximum value (estimated from data if None)
        data: Existing data to estimate parameters from
        seed: Random seed for reproducibility

    Returns:
        List of generated values

    Raises:
        ValidationError: If neither parameters nor data provided,
            or min_val >= max_val
    """
    # Derive missing bounds from the reference data.
    if min_val is None or max_val is None:
        if data is None:
            raise ValidationError(
                "Must provide either (min_val, max_val) or data for uniform distribution"
            )

        _, _, data_min, data_max = estimate_distribution_params(data)
        min_val = data_min if min_val is None else min_val
        max_val = data_max if max_val is None else max_val

    if min_val >= max_val:
        raise ValidationError(
            f"min_val must be less than max_val, got min={min_val}, max={max_val}"
        )

    if seed is not None:
        np.random.seed(seed)

    return np.random.uniform(min_val, max_val, n_rows).tolist()
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def generate_skewed(
    n_rows: int,
    direction: str,
    mean: Optional[float] = None,
    std: Optional[float] = None,
    skewness: float = 1.0,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None,
    clip: bool = True
) -> List[float]:
    """
    Generate values from a skewed distribution.

    Uses log-normal distribution for right skew and reflected log-normal for left skew.

    Args:
        n_rows: Number of values to generate
        direction: 'left' or 'right'
        mean: Target mean (estimated from data if None)
        std: Target standard deviation (estimated from data if None)
        skewness: Degree of skewness (default: 1.0)
        data: Existing data to estimate parameters from
        seed: Random seed for reproducibility
        clip: Whether to clip values to data range

    Returns:
        List of generated values

    Raises:
        ValidationError: If direction is unknown, std <= 0, or mean <= 0
            (the log-normal parameterization requires a positive mean)
    """
    if direction not in ['left', 'right']:
        raise ValidationError(f"Direction must be 'left' or 'right', got '{direction}'")

    # Estimate parameters from data if not provided
    if mean is None or std is None:
        if data is None:
            raise ValidationError(
                "Must provide either (mean, std) or data for skewed distribution"
            )

        est_mean, est_std, data_min, data_max = estimate_distribution_params(data)

        if mean is None:
            mean = est_mean
        if std is None:
            std = est_std

    if std <= 0:
        raise ValidationError(f"Standard deviation must be positive, got {std}")
    # BUG FIX: the log-normal moment matching below divides by `mean` and
    # takes log(mean); mean == 0 raised ZeroDivisionError and mean < 0
    # silently produced NaN output. Fail fast with a clear message instead.
    if mean <= 0:
        raise ValidationError(f"Mean must be positive for skewed distribution, got {mean}")

    if seed is not None:
        np.random.seed(seed)

    # Log-normal parameters matched to the target mean/std (method of moments).
    sigma = np.sqrt(np.log(1 + (std / mean) ** 2))
    mu = np.log(mean) - 0.5 * sigma ** 2

    # Scale sigma by the requested degree of skewness.
    sigma *= abs(skewness)

    values = np.random.lognormal(mu, sigma, n_rows)
    if direction == 'left':
        # Reflect around the mean to flip the long tail to the left.
        values = 2 * mean - values

    # Clip to the observed data range if requested.
    if clip and data is not None:
        values = np.clip(values, np.min(data), np.max(data))

    return values.tolist()
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def generate_beta(
    n_rows: int,
    alpha: Optional[float] = None,
    beta_param: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None,
    scale_min: float = 0.0,
    scale_max: float = 1.0
) -> List[float]:
    """
    Generate values from a beta distribution.

    The beta distribution is bounded on [0, 1] (or a scaled range);
    useful for percentages, probabilities, and proportions.

    Args:
        n_rows: Number of values to generate
        alpha: Shape parameter (> 0)
        beta_param: Shape parameter (> 0)
        data: Existing data to estimate parameters from
        seed: Random seed for reproducibility
        scale_min: Minimum value for scaling (default: 0); replaced by the
            data minimum when parameters are fitted from data
        scale_max: Maximum value for scaling (default: 1); replaced by the
            data maximum when parameters are fitted from data

    Returns:
        List of generated values

    Raises:
        ValidationError: If parameters invalid or data is constant
    """
    if alpha is None or beta_param is None:
        if data is None:
            raise ValidationError(
                "Must provide either (alpha, beta) or data for beta distribution"
            )

        lo = np.min(data)
        hi = np.max(data)
        if hi == lo:
            raise ValidationError("Data has no variance, cannot fit beta distribution")

        # Rescale onto [0, 1], the beta distribution's support.
        unit = (data - lo) / (hi - lo)

        # Method-of-moments fit, clamped away from degenerate edges.
        m = np.clip(np.mean(unit), 0.01, 0.99)
        v = np.clip(np.var(unit), 0.001, m * (1 - m) * 0.99)
        common = (m * (1 - m) / v) - 1
        alpha = m * common
        beta_param = (1 - m) * common

        # Generated values are mapped back onto the original data range.
        scale_min = lo
        scale_max = hi

    if alpha <= 0 or beta_param <= 0:
        raise ValidationError(
            f"Alpha and beta must be positive, got alpha={alpha}, beta={beta_param}"
        )

    if seed is not None:
        np.random.seed(seed)

    samples = np.random.beta(alpha, beta_param, n_rows)
    # Linear map from [0, 1] to the requested range.
    samples = samples * (scale_max - scale_min) + scale_min
    return samples.tolist()
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def generate_gamma(
    n_rows: int,
    shape: Optional[float] = None,
    scale: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None
) -> List[float]:
    """
    Generate values from a gamma distribution.

    The gamma distribution models positive, often right-skewed quantities
    such as waiting times, sizes, and amounts.

    Args:
        n_rows: Number of values to generate
        shape: Shape parameter (k, > 0)
        scale: Scale parameter (theta, > 0)
        data: Existing data to estimate parameters from
        seed: Random seed for reproducibility

    Returns:
        List of generated values

    Raises:
        ValidationError: If parameters invalid or data unsuitable
    """
    if shape is None or scale is None:
        if data is None:
            raise ValidationError(
                "Must provide either (shape, scale) or data for gamma distribution"
            )

        # The gamma distribution is only defined for positive support.
        if np.any(data <= 0):
            raise ValidationError(
                "Gamma distribution requires all positive values"
            )

        m = np.mean(data)
        v = np.var(data)
        if v == 0:
            raise ValidationError("Data has no variance, cannot fit gamma distribution")

        # Method of moments: k = mean^2 / var, theta = var / mean.
        shape = (m ** 2) / v
        scale = v / m

    if shape <= 0 or scale <= 0:
        raise ValidationError(
            f"Shape and scale must be positive, got shape={shape}, scale={scale}"
        )

    if seed is not None:
        np.random.seed(seed)

    return np.random.gamma(shape, scale, n_rows).tolist()
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def generate_exponential_dist(
    n_rows: int,
    rate: Optional[float] = None,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None
) -> List[float]:
    """
    Generate values from an exponential distribution.

    Models time between events (memoryless); values are always positive.

    Args:
        n_rows: Number of values to generate
        rate: Rate parameter (lambda, > 0). Mean = 1/rate
        data: Existing data to estimate parameters from
        seed: Random seed for reproducibility

    Returns:
        List of generated values

    Raises:
        ValidationError: If parameters invalid or data unsuitable
    """
    if rate is None:
        if data is None:
            raise ValidationError(
                "Must provide either rate or data for exponential distribution"
            )

        # Exponential support is strictly positive.
        if np.any(data <= 0):
            raise ValidationError(
                "Exponential distribution requires all positive values"
            )

        # Maximum-likelihood estimate: rate is the reciprocal of the mean.
        rate = 1.0 / np.mean(data)

    if rate <= 0:
        raise ValidationError(f"Rate must be positive, got {rate}")

    if seed is not None:
        np.random.seed(seed)

    # numpy parameterizes by scale = 1 / rate.
    return np.random.exponential(1.0 / rate, n_rows).tolist()
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def generate_kde(
    n_rows: int,
    data: np.ndarray,
    bandwidth: Optional[float] = None,
    seed: Optional[int] = None
) -> List[float]:
    """
    Generate values using Kernel Density Estimation.

    Non-parametric: learns the exact distribution shape from the data,
    preserving complex patterns a parametric family would miss.

    Args:
        n_rows: Number of values to generate
        data: Existing data to learn from (required)
        bandwidth: KDE bandwidth (auto-selected if None)
        seed: Random seed for reproducibility

    Returns:
        List of generated values

    Raises:
        ValidationError: If data invalid or bandwidth not positive
    """
    if data is None or len(data) == 0:
        raise ValidationError("KDE requires existing data")

    if len(data) < 3:
        raise ValidationError(f"KDE requires at least 3 data points, got {len(data)}")

    # Silverman's rule of thumb when no bandwidth is supplied.
    if bandwidth is None:
        bandwidth = 1.06 * np.std(data) * (len(data) ** (-1 / 5))

    # A zero bandwidth (e.g. constant data) degenerates to plain resampling;
    # widen it to a fraction of the data range instead.
    if bandwidth == 0:
        bandwidth = 0.1 * (np.max(data) - np.min(data))

    if bandwidth <= 0:
        raise ValidationError(f"Bandwidth must be positive, got {bandwidth}")

    if seed is not None:
        np.random.seed(seed)

    # Smoothed bootstrap: resample the data with replacement, then jitter
    # each pick with a Gaussian kernel of width `bandwidth`.
    picks = np.random.choice(len(data), size=n_rows, replace=True)
    jitter = np.random.normal(0, bandwidth, n_rows)
    return (data[picks] + jitter).tolist()
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def generate_multivariate_normal(
    n_rows: int,
    columns: List[str],
    data: np.ndarray,
    seed: Optional[int] = None
) -> np.ndarray:
    """
    Generate correlated values using a multivariate normal distribution.

    Preserves correlations between multiple columns by fitting the sample
    mean vector and covariance matrix of the existing data.

    Args:
        n_rows: Number of rows to generate
        columns: List of column names
        data: Existing data (2D array, shape: [n_samples, n_features])
        seed: Random seed for reproducibility

    Returns:
        2D array of generated values (shape: [n_rows, n_features])

    Raises:
        ValidationError: If data invalid
    """
    if data is None or len(data) == 0:
        raise ValidationError("Multivariate normal requires existing data")

    if data.ndim != 2:
        raise ValidationError(f"Data must be 2D array, got shape {data.shape}")

    if data.shape[1] != len(columns):
        raise ValidationError(
            f"Number of columns ({len(columns)}) doesn't match data dimensions ({data.shape[1]})"
        )

    # Estimate mean and covariance from the sample.
    mean = np.mean(data, axis=0)
    # BUG FIX: np.cov returns a 0-d array for a single feature; normalize to
    # 2D so the eigenvalue check and multivariate_normal both work.
    cov = np.atleast_2d(np.cov(data, rowvar=False))

    # Ensure the covariance matrix is positive semi-definite by nudging the
    # diagonal when numerical noise produces a negative eigenvalue.
    # BUG FIX: use eigvalsh (symmetric solver) rather than eigvals — eigvals
    # can return a complex-dtype array for a real matrix, making the `< 0`
    # comparison unreliable; covariance matrices are symmetric by construction.
    min_eig = np.min(np.linalg.eigvalsh(cov))
    if min_eig < 0:
        cov += np.eye(cov.shape[0]) * (abs(min_eig) + 1e-6)

    if seed is not None:
        np.random.seed(seed)

    return np.random.multivariate_normal(mean, cov, n_rows)
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def generate_distribution_values(
    n_rows: int,
    distribution: str = DistributionType.AUTO,
    data: Optional[np.ndarray] = None,
    seed: Optional[int] = None,
    **params
) -> List[float]:
    """
    Main distribution generation function.

    Args:
        n_rows: Number of values to generate
        distribution: Distribution type (normal, uniform, skewed_left, skewed_right,
                      beta, gamma, exponential, kde, auto)
        data: Existing data to estimate parameters from (required for auto and kde)
        seed: Random seed for reproducibility
        **params: Distribution-specific parameters:
            - mean, std: For normal
            - min_val, max_val: For uniform
            - skewness: For skewed (default: 1.0)
            - alpha, beta: For beta
            - shape, scale: For gamma
            - rate: For exponential
            - bandwidth: For kde
            - clip: Whether to clip to data range (default: True)

    Returns:
        List of generated values

    Raises:
        ValidationError: If parameters invalid
        AugmentError: If generation fails
    """
    # Auto-detect distribution if requested.
    # (DistributionType.AUTO is the string "auto", so one comparison suffices.)
    if distribution == DistributionType.AUTO:
        if data is None:
            raise ValidationError(
                "Auto distribution detection requires existing data"
            )

        distribution = detect_distribution_type(data)
        # BUG FIX: report the choice via the warnings machinery instead of
        # print() — library code should not write to stdout, and the module
        # already imports `warnings` for this purpose.
        warnings.warn(f"Auto-detected distribution: {distribution}")

    # Dispatch to the concrete generator for the selected distribution type.
    try:
        if distribution == DistributionType.NORMAL:
            return generate_normal(
                n_rows,
                mean=params.get('mean'),
                std=params.get('std'),
                data=data,
                seed=seed,
                clip=params.get('clip', True)
            )

        elif distribution == DistributionType.UNIFORM:
            return generate_uniform(
                n_rows,
                min_val=params.get('min_val'),
                max_val=params.get('max_val'),
                data=data,
                seed=seed
            )

        elif distribution in [DistributionType.SKEWED_LEFT, DistributionType.SKEWED_RIGHT]:
            direction = 'left' if distribution == DistributionType.SKEWED_LEFT else 'right'
            return generate_skewed(
                n_rows,
                direction=direction,
                mean=params.get('mean'),
                std=params.get('std'),
                skewness=params.get('skewness', 1.0),
                data=data,
                seed=seed,
                clip=params.get('clip', True)
            )

        elif distribution == DistributionType.BETA:
            return generate_beta(
                n_rows,
                alpha=params.get('alpha'),
                beta_param=params.get('beta'),
                data=data,
                seed=seed,
                scale_min=params.get('scale_min', 0.0),
                scale_max=params.get('scale_max', 1.0)
            )

        elif distribution == DistributionType.GAMMA:
            return generate_gamma(
                n_rows,
                shape=params.get('shape'),
                scale=params.get('scale'),
                data=data,
                seed=seed
            )

        elif distribution == DistributionType.EXPONENTIAL:
            return generate_exponential_dist(
                n_rows,
                rate=params.get('rate'),
                data=data,
                seed=seed
            )

        elif distribution == DistributionType.KDE:
            if data is None:
                raise ValidationError("KDE requires existing data")
            return generate_kde(
                n_rows,
                data=data,
                bandwidth=params.get('bandwidth'),
                seed=seed
            )

        else:
            raise ValidationError(
                f"Unknown distribution type: '{distribution}'. "
                f"Supported: normal, uniform, skewed_left, skewed_right, "
                f"beta, gamma, exponential, kde, auto"
            )

    # Domain errors already carry a useful message; propagate them unchanged.
    except (ValidationError, AugmentError):
        raise
    except Exception as e:
        # BUG FIX: chain the original exception so its traceback is preserved.
        raise AugmentError(f"Distribution generation failed: {e}") from e
|