additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/common/distributions.py
CHANGED
|
@@ -1,737 +1,410 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Statistical distribution utilities for Additory.
|
|
3
3
|
|
|
4
|
-
Provides statistical
|
|
5
|
-
- Normal (Gaussian) distribution
|
|
6
|
-
- Uniform distribution
|
|
7
|
-
- Skewed distributions (left/right)
|
|
8
|
-
- Custom distributions based on existing data
|
|
4
|
+
Provides generation and analysis of statistical distributions for synthetic data.
|
|
9
5
|
"""
|
|
10
6
|
|
|
11
|
-
|
|
12
|
-
import
|
|
13
|
-
|
|
7
|
+
import math
|
|
8
|
+
from typing import Dict, Optional
|
|
9
|
+
import polars as pl
|
|
14
10
|
import numpy as np
|
|
15
11
|
|
|
16
|
-
from additory.common.exceptions import ValidationError, AugmentError
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
class DistributionType:
|
|
20
|
-
"""Supported distribution types."""
|
|
21
|
-
NORMAL = "normal"
|
|
22
|
-
UNIFORM = "uniform"
|
|
23
|
-
SKEWED_LEFT = "skewed_left"
|
|
24
|
-
SKEWED_RIGHT = "skewed_right"
|
|
25
|
-
BETA = "beta"
|
|
26
|
-
GAMMA = "gamma"
|
|
27
|
-
EXPONENTIAL = "exponential"
|
|
28
|
-
KDE = "kde"
|
|
29
|
-
AUTO = "auto"
|
|
30
|
-
|
|
31
12
|
|
|
32
|
-
def
|
|
13
|
+
def generate_normal(n: int, mean: float = 0, std: float = 1, seed: Optional[int] = None) -> pl.Series:
|
|
33
14
|
"""
|
|
34
|
-
|
|
15
|
+
Generate values from normal distribution.
|
|
35
16
|
|
|
36
17
|
Args:
|
|
37
|
-
|
|
18
|
+
n: Number of values to generate
|
|
19
|
+
mean: Mean of distribution
|
|
20
|
+
std: Standard deviation
|
|
21
|
+
seed: Random seed for reproducibility
|
|
38
22
|
|
|
39
23
|
Returns:
|
|
40
|
-
|
|
24
|
+
Polars Series with generated values
|
|
25
|
+
|
|
26
|
+
Example:
|
|
27
|
+
values = generate_normal(n=1000, mean=50, std=10)
|
|
41
28
|
"""
|
|
42
|
-
|
|
29
|
+
if seed is not None:
|
|
30
|
+
np.random.seed(seed)
|
|
31
|
+
|
|
32
|
+
values = np.random.normal(mean, std, n)
|
|
33
|
+
return pl.Series(values)
|
|
43
34
|
|
|
44
35
|
|
|
45
|
-
def
|
|
36
|
+
def generate_uniform(n: int, low: float, high: float, seed: Optional[int] = None) -> pl.Series:
|
|
46
37
|
"""
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
Skewness measures asymmetry of distribution:
|
|
50
|
-
- 0: Symmetric (normal)
|
|
51
|
-
- > 0: Right-skewed (tail on right)
|
|
52
|
-
- < 0: Left-skewed (tail on left)
|
|
38
|
+
Generate values from uniform distribution.
|
|
53
39
|
|
|
54
40
|
Args:
|
|
55
|
-
|
|
41
|
+
n: Number of values to generate
|
|
42
|
+
low: Lower bound
|
|
43
|
+
high: Upper bound
|
|
44
|
+
seed: Random seed
|
|
56
45
|
|
|
57
46
|
Returns:
|
|
58
|
-
|
|
47
|
+
Polars Series with generated values
|
|
59
48
|
"""
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
return 0.0
|
|
63
|
-
|
|
64
|
-
mean_y = np.mean(y)
|
|
65
|
-
std_y = np.std(y)
|
|
66
|
-
|
|
67
|
-
if std_y == 0:
|
|
68
|
-
return 0.0
|
|
69
|
-
|
|
70
|
-
# Calculate third moment
|
|
71
|
-
skew = np.sum(((y - mean_y) / std_y) ** 3) / n
|
|
49
|
+
if seed is not None:
|
|
50
|
+
np.random.seed(seed)
|
|
72
51
|
|
|
73
|
-
|
|
52
|
+
values = np.random.uniform(low, high, n)
|
|
53
|
+
return pl.Series(values)
|
|
74
54
|
|
|
75
55
|
|
|
76
|
-
def
|
|
56
|
+
def generate_exponential(n: int, rate: float = 1.0, seed: Optional[int] = None) -> pl.Series:
|
|
77
57
|
"""
|
|
78
|
-
|
|
58
|
+
Generate values from exponential distribution.
|
|
79
59
|
|
|
80
60
|
Args:
|
|
81
|
-
|
|
61
|
+
n: Number of values to generate
|
|
62
|
+
rate: Rate parameter (lambda)
|
|
63
|
+
seed: Random seed
|
|
82
64
|
|
|
83
65
|
Returns:
|
|
84
|
-
|
|
66
|
+
Polars Series with generated values
|
|
85
67
|
"""
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
cv = std_y / range_y # Coefficient of variation relative to range
|
|
94
|
-
# Uniform distribution has CV ≈ 0.289
|
|
95
|
-
if 0.25 < cv < 0.35 and abs(skewness) < 0.3:
|
|
96
|
-
return DistributionType.UNIFORM
|
|
97
|
-
|
|
98
|
-
# Check skewness
|
|
99
|
-
if abs(skewness) < 0.5:
|
|
100
|
-
return DistributionType.NORMAL
|
|
101
|
-
elif skewness > 0.5:
|
|
102
|
-
return DistributionType.SKEWED_RIGHT
|
|
103
|
-
else:
|
|
104
|
-
return DistributionType.SKEWED_LEFT
|
|
68
|
+
if seed is not None:
|
|
69
|
+
np.random.seed(seed)
|
|
70
|
+
|
|
71
|
+
# numpy uses scale = 1/rate
|
|
72
|
+
scale = 1.0 / rate
|
|
73
|
+
values = np.random.exponential(scale, n)
|
|
74
|
+
return pl.Series(values)
|
|
105
75
|
|
|
106
76
|
|
|
107
|
-
def
|
|
108
|
-
n_rows: int,
|
|
109
|
-
mean: Optional[float] = None,
|
|
110
|
-
std: Optional[float] = None,
|
|
111
|
-
data: Optional[np.ndarray] = None,
|
|
112
|
-
seed: Optional[int] = None,
|
|
113
|
-
clip: bool = True
|
|
114
|
-
) -> List[float]:
|
|
77
|
+
def generate_poisson(n: int, lambda_: float, seed: Optional[int] = None) -> pl.Series:
|
|
115
78
|
"""
|
|
116
|
-
Generate values from
|
|
79
|
+
Generate values from Poisson distribution.
|
|
117
80
|
|
|
118
81
|
Args:
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
data: Existing data to estimate parameters from
|
|
123
|
-
seed: Random seed for reproducibility
|
|
124
|
-
clip: Whether to clip values to data range
|
|
82
|
+
n: Number of values to generate
|
|
83
|
+
lambda_: Lambda parameter (mean)
|
|
84
|
+
seed: Random seed
|
|
125
85
|
|
|
126
86
|
Returns:
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
Raises:
|
|
130
|
-
ValidationError: If neither parameters nor data provided
|
|
87
|
+
Polars Series with generated values
|
|
131
88
|
"""
|
|
132
|
-
# Estimate parameters from data if not provided
|
|
133
|
-
if mean is None or std is None:
|
|
134
|
-
if data is None:
|
|
135
|
-
raise ValidationError(
|
|
136
|
-
"Must provide either (mean, std) or data for normal distribution"
|
|
137
|
-
)
|
|
138
|
-
|
|
139
|
-
est_mean, est_std, data_min, data_max = estimate_distribution_params(data)
|
|
140
|
-
|
|
141
|
-
if mean is None:
|
|
142
|
-
mean = est_mean
|
|
143
|
-
if std is None:
|
|
144
|
-
std = est_std
|
|
145
|
-
|
|
146
|
-
# Validate parameters
|
|
147
|
-
if std <= 0:
|
|
148
|
-
raise ValidationError(f"Standard deviation must be positive, got {std}")
|
|
149
|
-
|
|
150
|
-
# Generate values
|
|
151
89
|
if seed is not None:
|
|
152
90
|
np.random.seed(seed)
|
|
153
91
|
|
|
154
|
-
values = np.random.
|
|
155
|
-
|
|
156
|
-
# Clip to data range if requested
|
|
157
|
-
if clip and data is not None:
|
|
158
|
-
data_min = np.min(data)
|
|
159
|
-
data_max = np.max(data)
|
|
160
|
-
values = np.clip(values, data_min, data_max)
|
|
161
|
-
|
|
162
|
-
return values.tolist()
|
|
92
|
+
values = np.random.poisson(lambda_, n)
|
|
93
|
+
return pl.Series(values)
|
|
163
94
|
|
|
164
95
|
|
|
165
|
-
def
|
|
166
|
-
n_rows: int,
|
|
167
|
-
min_val: Optional[float] = None,
|
|
168
|
-
max_val: Optional[float] = None,
|
|
169
|
-
data: Optional[np.ndarray] = None,
|
|
170
|
-
seed: Optional[int] = None
|
|
171
|
-
) -> List[float]:
|
|
96
|
+
def generate_binomial(n: int, trials: int, prob: float, seed: Optional[int] = None) -> pl.Series:
|
|
172
97
|
"""
|
|
173
|
-
Generate values from
|
|
98
|
+
Generate values from binomial distribution.
|
|
174
99
|
|
|
175
100
|
Args:
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
seed: Random seed for reproducibility
|
|
101
|
+
n: Number of values to generate
|
|
102
|
+
trials: Number of trials
|
|
103
|
+
prob: Probability of success
|
|
104
|
+
seed: Random seed
|
|
181
105
|
|
|
182
106
|
Returns:
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
Raises:
|
|
186
|
-
ValidationError: If neither parameters nor data provided
|
|
107
|
+
Polars Series with generated values
|
|
187
108
|
"""
|
|
188
|
-
# Estimate parameters from data if not provided
|
|
189
|
-
if min_val is None or max_val is None:
|
|
190
|
-
if data is None:
|
|
191
|
-
raise ValidationError(
|
|
192
|
-
"Must provide either (min_val, max_val) or data for uniform distribution"
|
|
193
|
-
)
|
|
194
|
-
|
|
195
|
-
_, _, data_min, data_max = estimate_distribution_params(data)
|
|
196
|
-
|
|
197
|
-
if min_val is None:
|
|
198
|
-
min_val = data_min
|
|
199
|
-
if max_val is None:
|
|
200
|
-
max_val = data_max
|
|
201
|
-
|
|
202
|
-
# Validate parameters
|
|
203
|
-
if min_val >= max_val:
|
|
204
|
-
raise ValidationError(
|
|
205
|
-
f"min_val must be less than max_val, got min={min_val}, max={max_val}"
|
|
206
|
-
)
|
|
207
|
-
|
|
208
|
-
# Generate values
|
|
209
109
|
if seed is not None:
|
|
210
110
|
np.random.seed(seed)
|
|
211
111
|
|
|
212
|
-
values = np.random.
|
|
213
|
-
|
|
214
|
-
return values.tolist()
|
|
112
|
+
values = np.random.binomial(trials, prob, n)
|
|
113
|
+
return pl.Series(values)
|
|
215
114
|
|
|
216
115
|
|
|
217
|
-
def
|
|
218
|
-
n_rows: int,
|
|
219
|
-
direction: str,
|
|
220
|
-
mean: Optional[float] = None,
|
|
221
|
-
std: Optional[float] = None,
|
|
222
|
-
skewness: float = 1.0,
|
|
223
|
-
data: Optional[np.ndarray] = None,
|
|
224
|
-
seed: Optional[int] = None,
|
|
225
|
-
clip: bool = True
|
|
226
|
-
) -> List[float]:
|
|
116
|
+
def fit_distribution(series: pl.Series, dist_type: str) -> Dict:
|
|
227
117
|
"""
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
Uses log-normal distribution for right skew and reflected log-normal for left skew.
|
|
118
|
+
Fit distribution to data and return parameters.
|
|
231
119
|
|
|
232
120
|
Args:
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
mean: Target mean (estimated from data if None)
|
|
236
|
-
std: Target standard deviation (estimated from data if None)
|
|
237
|
-
skewness: Degree of skewness (default: 1.0)
|
|
238
|
-
data: Existing data to estimate parameters from
|
|
239
|
-
seed: Random seed for reproducibility
|
|
240
|
-
clip: Whether to clip values to data range
|
|
121
|
+
series: Data to fit
|
|
122
|
+
dist_type: Distribution type ('normal', 'uniform', 'exponential', etc.)
|
|
241
123
|
|
|
242
124
|
Returns:
|
|
243
|
-
|
|
125
|
+
Dictionary with fitted parameters
|
|
244
126
|
|
|
245
|
-
|
|
246
|
-
|
|
127
|
+
Example:
|
|
128
|
+
params = fit_distribution(df['age'], 'normal')
|
|
129
|
+
# Returns: {'mean': 35.5, 'std': 12.3, 'fit_quality': 0.95}
|
|
247
130
|
"""
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
131
|
+
data = series.to_numpy()
|
|
132
|
+
|
|
133
|
+
if dist_type == 'normal':
|
|
134
|
+
mean = float(np.mean(data))
|
|
135
|
+
std = float(np.std(data, ddof=1))
|
|
136
|
+
|
|
137
|
+
# Simple fit quality based on how well data matches normal
|
|
138
|
+
# Using coefficient of variation as a rough measure
|
|
139
|
+
cv = std / abs(mean) if mean != 0 else float('inf')
|
|
140
|
+
fit_quality = max(0.0, min(1.0, 1.0 - cv / 2.0)) # Rough approximation
|
|
141
|
+
|
|
142
|
+
return {
|
|
143
|
+
'mean': mean,
|
|
144
|
+
'std': std,
|
|
145
|
+
'fit_quality': fit_quality
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
elif dist_type == 'uniform':
|
|
149
|
+
low = float(np.min(data))
|
|
150
|
+
high = float(np.max(data))
|
|
151
|
+
|
|
152
|
+
# Check if data is roughly uniform
|
|
153
|
+
expected_mean = (low + high) / 2
|
|
154
|
+
actual_mean = float(np.mean(data))
|
|
155
|
+
fit_quality = max(0.0, 1.0 - abs(actual_mean - expected_mean) / (high - low))
|
|
156
|
+
|
|
157
|
+
return {
|
|
158
|
+
'low': low,
|
|
159
|
+
'high': high,
|
|
160
|
+
'fit_quality': fit_quality
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
elif dist_type == 'exponential':
|
|
164
|
+
rate = 1.0 / float(np.mean(data))
|
|
165
|
+
|
|
166
|
+
# Simple fit quality check
|
|
167
|
+
theoretical_std = 1.0 / rate
|
|
168
|
+
actual_std = float(np.std(data, ddof=1))
|
|
169
|
+
fit_quality = max(0.0, 1.0 - abs(actual_std - theoretical_std) / theoretical_std)
|
|
170
|
+
|
|
171
|
+
return {
|
|
172
|
+
'rate': rate,
|
|
173
|
+
'fit_quality': fit_quality
|
|
174
|
+
}
|
|
278
175
|
|
|
279
|
-
# Scale sigma by skewness parameter
|
|
280
|
-
sigma *= abs(skewness)
|
|
281
|
-
|
|
282
|
-
if direction == 'right':
|
|
283
|
-
# Right-skewed: log-normal
|
|
284
|
-
values = np.random.lognormal(mu, sigma, n_rows)
|
|
285
176
|
else:
|
|
286
|
-
|
|
287
|
-
values = np.random.lognormal(mu, sigma, n_rows)
|
|
288
|
-
# Reflect around mean
|
|
289
|
-
values = 2 * mean - values
|
|
290
|
-
|
|
291
|
-
# Clip to data range if requested
|
|
292
|
-
if clip and data is not None:
|
|
293
|
-
data_min = np.min(data)
|
|
294
|
-
data_max = np.max(data)
|
|
295
|
-
values = np.clip(values, data_min, data_max)
|
|
296
|
-
|
|
297
|
-
return values.tolist()
|
|
177
|
+
raise ValueError(f"Unsupported distribution type: {dist_type}")
|
|
298
178
|
|
|
299
179
|
|
|
300
|
-
def
|
|
301
|
-
n_rows: int,
|
|
302
|
-
alpha: Optional[float] = None,
|
|
303
|
-
beta_param: Optional[float] = None,
|
|
304
|
-
data: Optional[np.ndarray] = None,
|
|
305
|
-
seed: Optional[int] = None,
|
|
306
|
-
scale_min: float = 0.0,
|
|
307
|
-
scale_max: float = 1.0
|
|
308
|
-
) -> List[float]:
|
|
180
|
+
def calculate_distribution_stats(series: pl.Series) -> Dict:
|
|
309
181
|
"""
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
Beta distribution is bounded between 0 and 1 (or scaled range).
|
|
313
|
-
Useful for percentages, probabilities, proportions.
|
|
182
|
+
Calculate distribution statistics.
|
|
314
183
|
|
|
315
184
|
Args:
|
|
316
|
-
|
|
317
|
-
alpha: Shape parameter (> 0)
|
|
318
|
-
beta_param: Shape parameter (> 0)
|
|
319
|
-
data: Existing data to estimate parameters from
|
|
320
|
-
seed: Random seed for reproducibility
|
|
321
|
-
scale_min: Minimum value for scaling (default: 0)
|
|
322
|
-
scale_max: Maximum value for scaling (default: 1)
|
|
185
|
+
series: Data to analyze
|
|
323
186
|
|
|
324
187
|
Returns:
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
Raises:
|
|
328
|
-
ValidationError: If parameters invalid
|
|
188
|
+
Dictionary with statistics
|
|
329
189
|
"""
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
190
|
+
data = series.to_numpy()
|
|
191
|
+
|
|
192
|
+
# Basic statistics
|
|
193
|
+
mean = float(np.mean(data))
|
|
194
|
+
median = float(np.median(data))
|
|
195
|
+
std = float(np.std(data, ddof=1))
|
|
196
|
+
variance = float(np.var(data, ddof=1))
|
|
197
|
+
|
|
198
|
+
# Min/max/range
|
|
199
|
+
min_val = float(np.min(data))
|
|
200
|
+
max_val = float(np.max(data))
|
|
201
|
+
range_val = max_val - min_val
|
|
202
|
+
|
|
203
|
+
# Quantiles
|
|
204
|
+
q25 = float(np.percentile(data, 25))
|
|
205
|
+
q75 = float(np.percentile(data, 75))
|
|
206
|
+
iqr = q75 - q25
|
|
207
|
+
|
|
208
|
+
# Mode (most frequent value)
|
|
209
|
+
unique, counts = np.unique(data, return_counts=True)
|
|
210
|
+
mode_idx = np.argmax(counts)
|
|
211
|
+
mode = float(unique[mode_idx])
|
|
212
|
+
|
|
213
|
+
# Skewness and kurtosis (simplified calculations)
|
|
214
|
+
n = len(data)
|
|
215
|
+
if n > 2 and std > 0:
|
|
216
|
+
# Skewness
|
|
217
|
+
skewness = float(np.sum(((data - mean) / std) ** 3) / n)
|
|
218
|
+
|
|
219
|
+
# Kurtosis (excess kurtosis)
|
|
220
|
+
kurtosis = float(np.sum(((data - mean) / std) ** 4) / n - 3)
|
|
221
|
+
else:
|
|
222
|
+
skewness = 0.0
|
|
223
|
+
kurtosis = 0.0
|
|
224
|
+
|
|
225
|
+
return {
|
|
226
|
+
'mean': mean,
|
|
227
|
+
'median': median,
|
|
228
|
+
'mode': mode,
|
|
229
|
+
'std': std,
|
|
230
|
+
'variance': variance,
|
|
231
|
+
'skewness': skewness,
|
|
232
|
+
'kurtosis': kurtosis,
|
|
233
|
+
'min': min_val,
|
|
234
|
+
'max': max_val,
|
|
235
|
+
'range': range_val,
|
|
236
|
+
'q25': q25,
|
|
237
|
+
'q75': q75,
|
|
238
|
+
'iqr': iqr
|
|
239
|
+
}
|
|
378
240
|
|
|
379
241
|
|
|
380
|
-
def
|
|
381
|
-
n_rows: int,
|
|
382
|
-
shape: Optional[float] = None,
|
|
383
|
-
scale: Optional[float] = None,
|
|
384
|
-
data: Optional[np.ndarray] = None,
|
|
385
|
-
seed: Optional[int] = None
|
|
386
|
-
) -> List[float]:
|
|
242
|
+
def check_normality(series: pl.Series) -> Dict:
|
|
387
243
|
"""
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
Gamma distribution is for positive values, often right-skewed.
|
|
391
|
-
Useful for waiting times, sizes, amounts.
|
|
244
|
+
Test if data follows normal distribution.
|
|
392
245
|
|
|
393
246
|
Args:
|
|
394
|
-
|
|
395
|
-
shape: Shape parameter (k, > 0)
|
|
396
|
-
scale: Scale parameter (theta, > 0)
|
|
397
|
-
data: Existing data to estimate parameters from
|
|
398
|
-
seed: Random seed for reproducibility
|
|
247
|
+
series: Data to test
|
|
399
248
|
|
|
400
249
|
Returns:
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
Raises:
|
|
404
|
-
ValidationError: If parameters invalid
|
|
250
|
+
Dictionary with test results
|
|
405
251
|
"""
|
|
406
|
-
|
|
407
|
-
if shape is None or scale is None:
|
|
408
|
-
if data is None:
|
|
409
|
-
raise ValidationError(
|
|
410
|
-
"Must provide either (shape, scale) or data for gamma distribution"
|
|
411
|
-
)
|
|
412
|
-
|
|
413
|
-
# Check for non-positive values
|
|
414
|
-
if np.any(data <= 0):
|
|
415
|
-
raise ValidationError(
|
|
416
|
-
"Gamma distribution requires all positive values"
|
|
417
|
-
)
|
|
418
|
-
|
|
419
|
-
# Method of moments estimation
|
|
420
|
-
mean = np.mean(data)
|
|
421
|
-
var = np.var(data)
|
|
422
|
-
|
|
423
|
-
if var == 0:
|
|
424
|
-
raise ValidationError("Data has no variance, cannot fit gamma distribution")
|
|
425
|
-
|
|
426
|
-
# shape = mean^2 / var, scale = var / mean
|
|
427
|
-
shape = (mean ** 2) / var
|
|
428
|
-
scale = var / mean
|
|
252
|
+
data = series.to_numpy()
|
|
429
253
|
|
|
430
|
-
#
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
f"Shape and scale must be positive, got shape={shape}, scale={scale}"
|
|
434
|
-
)
|
|
254
|
+
# Simple normality test based on skewness and kurtosis
|
|
255
|
+
# This is a simplified version - in production, you'd use scipy.stats
|
|
256
|
+
stats = calculate_distribution_stats(series)
|
|
435
257
|
|
|
436
|
-
#
|
|
437
|
-
|
|
438
|
-
|
|
258
|
+
# Normal distribution has skewness ≈ 0 and kurtosis ≈ 0
|
|
259
|
+
skew_test = abs(stats['skewness']) < 0.5
|
|
260
|
+
kurt_test = abs(stats['kurtosis']) < 0.5
|
|
261
|
+
|
|
262
|
+
is_normal = skew_test and kurt_test
|
|
439
263
|
|
|
440
|
-
|
|
264
|
+
# Rough p-value approximation
|
|
265
|
+
skew_p = max(0.001, 1.0 - abs(stats['skewness']))
|
|
266
|
+
kurt_p = max(0.001, 1.0 - abs(stats['kurtosis']))
|
|
267
|
+
p_value = min(skew_p, kurt_p)
|
|
441
268
|
|
|
442
|
-
|
|
269
|
+
# Test statistic (combined skewness and kurtosis)
|
|
270
|
+
test_statistic = abs(stats['skewness']) + abs(stats['kurtosis'])
|
|
271
|
+
|
|
272
|
+
return {
|
|
273
|
+
'is_normal': is_normal,
|
|
274
|
+
'p_value': p_value,
|
|
275
|
+
'test_statistic': test_statistic,
|
|
276
|
+
'test_name': 'Simplified Normality Test'
|
|
277
|
+
}
|
|
443
278
|
|
|
444
279
|
|
|
445
|
-
def
|
|
446
|
-
|
|
447
|
-
rate: Optional[float] = None,
|
|
448
|
-
data: Optional[np.ndarray] = None,
|
|
449
|
-
seed: Optional[int] = None
|
|
450
|
-
) -> List[float]:
|
|
280
|
+
def generate_correlated(series: pl.Series, n: int, correlation: float,
|
|
281
|
+
seed: Optional[int] = None) -> pl.Series:
|
|
451
282
|
"""
|
|
452
|
-
Generate values
|
|
453
|
-
|
|
454
|
-
Exponential distribution models time between events.
|
|
455
|
-
Memoryless property. Always positive.
|
|
283
|
+
Generate values correlated with existing series.
|
|
456
284
|
|
|
457
285
|
Args:
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
seed: Random seed
|
|
286
|
+
series: Series to correlate with
|
|
287
|
+
n: Number of values to generate
|
|
288
|
+
correlation: Desired correlation (-1 to 1)
|
|
289
|
+
seed: Random seed
|
|
462
290
|
|
|
463
291
|
Returns:
|
|
464
|
-
|
|
292
|
+
Polars Series with correlated values
|
|
465
293
|
|
|
466
|
-
|
|
467
|
-
|
|
294
|
+
Example:
|
|
295
|
+
# Generate income correlated with age (correlation = 0.75)
|
|
296
|
+
income = generate_correlated(df['age'], n=1000, correlation=0.75)
|
|
468
297
|
"""
|
|
469
|
-
# Estimate parameters from data if not provided
|
|
470
|
-
if rate is None:
|
|
471
|
-
if data is None:
|
|
472
|
-
raise ValidationError(
|
|
473
|
-
"Must provide either rate or data for exponential distribution"
|
|
474
|
-
)
|
|
475
|
-
|
|
476
|
-
# Check for non-positive values
|
|
477
|
-
if np.any(data <= 0):
|
|
478
|
-
raise ValidationError(
|
|
479
|
-
"Exponential distribution requires all positive values"
|
|
480
|
-
)
|
|
481
|
-
|
|
482
|
-
# Maximum likelihood estimation: rate = 1 / mean
|
|
483
|
-
mean = np.mean(data)
|
|
484
|
-
rate = 1.0 / mean
|
|
485
|
-
|
|
486
|
-
# Validate parameters
|
|
487
|
-
if rate <= 0:
|
|
488
|
-
raise ValidationError(f"Rate must be positive, got {rate}")
|
|
489
|
-
|
|
490
|
-
# Generate values
|
|
491
298
|
if seed is not None:
|
|
492
299
|
np.random.seed(seed)
|
|
493
300
|
|
|
494
|
-
#
|
|
495
|
-
|
|
496
|
-
|
|
301
|
+
# Get original data
|
|
302
|
+
x = series.to_numpy()
|
|
303
|
+
|
|
304
|
+
# If we need more values than available, repeat the series
|
|
305
|
+
if n > len(x):
|
|
306
|
+
repeats = (n // len(x)) + 1
|
|
307
|
+
x = np.tile(x, repeats)[:n]
|
|
308
|
+
else:
|
|
309
|
+
x = x[:n]
|
|
497
310
|
|
|
498
|
-
|
|
311
|
+
# Standardize x
|
|
312
|
+
x_mean = np.mean(x)
|
|
313
|
+
x_std = np.std(x)
|
|
314
|
+
if x_std == 0:
|
|
315
|
+
x_std = 1.0 # Avoid division by zero
|
|
316
|
+
x_standardized = (x - x_mean) / x_std
|
|
317
|
+
|
|
318
|
+
# Generate independent random variable
|
|
319
|
+
z = np.random.normal(0, 1, n)
|
|
320
|
+
|
|
321
|
+
# Create correlated variable using Cholesky-like approach
|
|
322
|
+
# y = correlation * x + sqrt(1 - correlation^2) * z
|
|
323
|
+
y = correlation * x_standardized + math.sqrt(1 - correlation**2) * z
|
|
324
|
+
|
|
325
|
+
# Scale y to have similar range as x
|
|
326
|
+
y = y * x_std + x_mean
|
|
327
|
+
|
|
328
|
+
return pl.Series(y)
|
|
499
329
|
|
|
500
330
|
|
|
501
|
-
def
|
|
502
|
-
n_rows: int,
|
|
503
|
-
data: np.ndarray,
|
|
504
|
-
bandwidth: Optional[float] = None,
|
|
505
|
-
seed: Optional[int] = None
|
|
506
|
-
) -> List[float]:
|
|
331
|
+
def add_noise(series: pl.Series, noise_level: float, seed: Optional[int] = None) -> pl.Series:
|
|
507
332
|
"""
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
KDE learns the exact distribution shape from data.
|
|
511
|
-
Non-parametric approach that preserves complex patterns.
|
|
333
|
+
Add random noise to series.
|
|
512
334
|
|
|
513
335
|
Args:
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
seed: Random seed for reproducibility
|
|
336
|
+
series: Series to add noise to
|
|
337
|
+
noise_level: Noise level (0 to 1, as fraction of std)
|
|
338
|
+
seed: Random seed
|
|
518
339
|
|
|
519
340
|
Returns:
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
Raises:
|
|
523
|
-
ValidationError: If data invalid
|
|
341
|
+
Series with added noise
|
|
524
342
|
"""
|
|
525
|
-
if data is None or len(data) == 0:
|
|
526
|
-
raise ValidationError("KDE requires existing data")
|
|
527
|
-
|
|
528
|
-
if len(data) < 3:
|
|
529
|
-
raise ValidationError(f"KDE requires at least 3 data points, got {len(data)}")
|
|
530
|
-
|
|
531
|
-
# Auto-select bandwidth using Silverman's rule of thumb
|
|
532
|
-
if bandwidth is None:
|
|
533
|
-
std = np.std(data)
|
|
534
|
-
n = len(data)
|
|
535
|
-
bandwidth = 1.06 * std * (n ** (-1/5))
|
|
536
|
-
|
|
537
|
-
# Ensure reasonable bandwidth
|
|
538
|
-
if bandwidth == 0:
|
|
539
|
-
bandwidth = 0.1 * (np.max(data) - np.min(data))
|
|
540
|
-
|
|
541
|
-
if bandwidth <= 0:
|
|
542
|
-
raise ValidationError(f"Bandwidth must be positive, got {bandwidth}")
|
|
543
|
-
|
|
544
|
-
# Generate values by sampling from data and adding noise
|
|
545
343
|
if seed is not None:
|
|
546
344
|
np.random.seed(seed)
|
|
547
345
|
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
sampled_values = data[sampled_indices]
|
|
346
|
+
data = series.to_numpy()
|
|
347
|
+
std = np.std(data)
|
|
551
348
|
|
|
552
|
-
#
|
|
553
|
-
|
|
554
|
-
|
|
349
|
+
# If std is zero (constant data), use a small default noise level
|
|
350
|
+
if std == 0:
|
|
351
|
+
std = 1.0 # Use unit noise for constant data
|
|
555
352
|
|
|
556
|
-
|
|
353
|
+
# Generate noise
|
|
354
|
+
noise = np.random.normal(0, std * noise_level, len(data))
|
|
355
|
+
|
|
356
|
+
# Add noise to original data
|
|
357
|
+
noisy_data = data + noise
|
|
358
|
+
|
|
359
|
+
return pl.Series(noisy_data)
|
|
557
360
|
|
|
558
361
|
|
|
559
|
-
def
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
data: np.ndarray,
|
|
563
|
-
seed: Optional[int] = None
|
|
564
|
-
) -> np.ndarray:
|
|
362
|
+
def generate_seasonal(n: int, period: int, amplitude: float = 1.0,
                      trend: str = 'none', noise: float = 0.0,
                      seed: Optional[int] = None) -> pl.Series:
    """
    Generate seasonal time series data.

    The result is the sum of a sinusoidal seasonal component, an optional
    linear trend, and optional Gaussian noise.

    Args:
        n: Number of values to generate
        period: Seasonal period in samples (e.g., 7 for weekly, 365 for
            yearly); must be positive
        amplitude: Amplitude of the seasonal component; also scales the
            trend slope and the noise standard deviation
        trend: Trend type ('increasing', 'decreasing'; anything else means
            no trend)
        noise: Noise level as a fraction of amplitude
            (noise std = amplitude * noise)
        seed: Random seed for reproducibility

    Returns:
        Series of length n containing the seasonal pattern

    Raises:
        ValueError: If period is not positive

    Example:
        # Generate weekly seasonal sales data
        sales = generate_seasonal(n=365, period=7, amplitude=100,
                                  trend='increasing', noise=0.1)
    """
    # A non-positive period would divide by zero in the sine term and
    # silently produce NaN/inf output — fail loudly instead.
    if period <= 0:
        raise ValueError(f"period must be positive, got {period}")

    if seed is not None:
        # Legacy global-state seeding kept deliberately so previously
        # seeded results remain reproducible.
        np.random.seed(seed)

    # Time index
    t = np.arange(n)

    # Seasonal component (sine wave with the requested period)
    seasonal = amplitude * np.sin(2 * np.pi * t / period)

    # Linear trend component; slope is amplitude / n so the trend spans
    # +/- amplitude over the whole series. Guard n == 0 to avoid a
    # ZeroDivisionError that the 'none' branch never had.
    if trend == 'increasing' and n > 0:
        trend_component = t * (amplitude / n)
    elif trend == 'decreasing' and n > 0:
        trend_component = -t * (amplitude / n)
    else:  # 'none' (or an empty series)
        trend_component = np.zeros(n)

    # Gaussian noise component scaled by amplitude
    if noise > 0:
        noise_component = np.random.normal(0, amplitude * noise, n)
    else:
        noise_component = np.zeros(n)

    # Combine components
    values = seasonal + trend_component + noise_component

    return pl.Series(values)
|