additory 0.1.0a3-py3-none-any.whl → 0.1.1a1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only.
Files changed (120)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
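
The largest single rewrite below is additory/common/distributions.py (entry 4): the 0.1.0a3 DistributionType constants, the generate_distribution_values dispatcher, and the list-returning generators are replaced by flat helpers that take and return Polars Series. As a rough usage sketch based only on the signatures visible in this diff (the import path mirrors the file path; whether additory re-exports these names at a shorter path is not shown here):

    import polars as pl
    from additory.common.distributions import (
        generate_normal, fit_distribution, check_normality
    )

    # Reproducible draw of 1,000 values ~ N(50, 10), returned as pl.Series
    ages = generate_normal(n=1000, mean=50, std=10, seed=42)

    # Both helpers return plain dicts, per the docstrings in the diff
    params = fit_distribution(ages, 'normal')   # {'mean': ..., 'std': ..., 'fit_quality': ...}
    report = check_normality(ages)              # {'is_normal': ..., 'p_value': ...}
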
additory/common/distributions.py
@@ -1,737 +1,410 @@
  """
- Distribution Strategies for Synthetic Data Generation
+ Statistical distribution utilities for Additory.

- Provides statistical distribution-based data generation:
- - Normal (Gaussian) distribution
- - Uniform distribution
- - Skewed distributions (left/right)
- - Custom distributions based on existing data
+ Provides generation and analysis of statistical distributions for synthetic data.
  """

- from typing import List, Optional, Tuple
- import warnings
-
+ import math
+ from typing import Dict, Optional
+ import polars as pl
  import numpy as np

- from additory.common.exceptions import ValidationError, AugmentError
-
-
- class DistributionType:
-     """Supported distribution types."""
-     NORMAL = "normal"
-     UNIFORM = "uniform"
-     SKEWED_LEFT = "skewed_left"
-     SKEWED_RIGHT = "skewed_right"
-     BETA = "beta"
-     GAMMA = "gamma"
-     EXPONENTIAL = "exponential"
-     KDE = "kde"
-     AUTO = "auto"
-

- def estimate_distribution_params(y: np.ndarray) -> Tuple[float, float, float, float]:
+ def generate_normal(n: int, mean: float = 0, std: float = 1, seed: Optional[int] = None) -> pl.Series:
      """
-     Estimate distribution parameters from data.
+     Generate values from normal distribution.

      Args:
-         y: Data values
+         n: Number of values to generate
+         mean: Mean of distribution
+         std: Standard deviation
+         seed: Random seed for reproducibility

      Returns:
-         Tuple of (mean, std, min, max)
+         Polars Series with generated values
+
+     Example:
+         values = generate_normal(n=1000, mean=50, std=10)
      """
-     return float(np.mean(y)), float(np.std(y)), float(np.min(y)), float(np.max(y))
+     if seed is not None:
+         np.random.seed(seed)
+
+     values = np.random.normal(mean, std, n)
+     return pl.Series(values)


- def calculate_skewness(y: np.ndarray) -> float:
+ def generate_uniform(n: int, low: float, high: float, seed: Optional[int] = None) -> pl.Series:
      """
-     Calculate skewness of data.
-
-     Skewness measures asymmetry of distribution:
-     - 0: Symmetric (normal)
-     - > 0: Right-skewed (tail on right)
-     - < 0: Left-skewed (tail on left)
+     Generate values from uniform distribution.

      Args:
-         y: Data values
+         n: Number of values to generate
+         low: Lower bound
+         high: Upper bound
+         seed: Random seed

      Returns:
-         Skewness value
+         Polars Series with generated values
      """
-     n = len(y)
-     if n < 3:
-         return 0.0
-
-     mean_y = np.mean(y)
-     std_y = np.std(y)
-
-     if std_y == 0:
-         return 0.0
-
-     # Calculate third moment
-     skew = np.sum(((y - mean_y) / std_y) ** 3) / n
+     if seed is not None:
+         np.random.seed(seed)

-     return float(skew)
+     values = np.random.uniform(low, high, n)
+     return pl.Series(values)


- def detect_distribution_type(y: np.ndarray) -> str:
+ def generate_exponential(n: int, rate: float = 1.0, seed: Optional[int] = None) -> pl.Series:
      """
-     Detect distribution type from data.
+     Generate values from exponential distribution.

      Args:
-         y: Data values
+         n: Number of values to generate
+         rate: Rate parameter (lambda)
+         seed: Random seed

      Returns:
-         Distribution type: normal, skewed_left, skewed_right, or uniform
+         Polars Series with generated values
      """
-     skewness = calculate_skewness(y)
-
-     # Check for uniform distribution (low variance relative to range)
-     std_y = np.std(y)
-     range_y = np.max(y) - np.min(y)
-
-     if range_y > 0:
-         cv = std_y / range_y  # Coefficient of variation relative to range
-         # Uniform distribution has CV ≈ 0.289
-         if 0.25 < cv < 0.35 and abs(skewness) < 0.3:
-             return DistributionType.UNIFORM
-
-     # Check skewness
-     if abs(skewness) < 0.5:
-         return DistributionType.NORMAL
-     elif skewness > 0.5:
-         return DistributionType.SKEWED_RIGHT
-     else:
-         return DistributionType.SKEWED_LEFT
+     if seed is not None:
+         np.random.seed(seed)
+
+     # numpy uses scale = 1/rate
+     scale = 1.0 / rate
+     values = np.random.exponential(scale, n)
+     return pl.Series(values)


- def generate_normal(
-     n_rows: int,
-     mean: Optional[float] = None,
-     std: Optional[float] = None,
-     data: Optional[np.ndarray] = None,
-     seed: Optional[int] = None,
-     clip: bool = True
- ) -> List[float]:
+ def generate_poisson(n: int, lambda_: float, seed: Optional[int] = None) -> pl.Series:
      """
-     Generate values from normal (Gaussian) distribution.
+     Generate values from Poisson distribution.

      Args:
-         n_rows: Number of values to generate
-         mean: Mean of distribution (estimated from data if None)
-         std: Standard deviation (estimated from data if None)
-         data: Existing data to estimate parameters from
-         seed: Random seed for reproducibility
-         clip: Whether to clip values to data range
+         n: Number of values to generate
+         lambda_: Lambda parameter (mean)
+         seed: Random seed

      Returns:
-         List of generated values
-
-     Raises:
-         ValidationError: If neither parameters nor data provided
+         Polars Series with generated values
      """
-     # Estimate parameters from data if not provided
-     if mean is None or std is None:
-         if data is None:
-             raise ValidationError(
-                 "Must provide either (mean, std) or data for normal distribution"
-             )
-
-         est_mean, est_std, data_min, data_max = estimate_distribution_params(data)
-
-         if mean is None:
-             mean = est_mean
-         if std is None:
-             std = est_std
-
-     # Validate parameters
-     if std <= 0:
-         raise ValidationError(f"Standard deviation must be positive, got {std}")
-
-     # Generate values
      if seed is not None:
          np.random.seed(seed)

-     values = np.random.normal(mean, std, n_rows)
-
-     # Clip to data range if requested
-     if clip and data is not None:
-         data_min = np.min(data)
-         data_max = np.max(data)
-         values = np.clip(values, data_min, data_max)
-
-     return values.tolist()
+     values = np.random.poisson(lambda_, n)
+     return pl.Series(values)


- def generate_uniform(
-     n_rows: int,
-     min_val: Optional[float] = None,
-     max_val: Optional[float] = None,
-     data: Optional[np.ndarray] = None,
-     seed: Optional[int] = None
- ) -> List[float]:
+ def generate_binomial(n: int, trials: int, prob: float, seed: Optional[int] = None) -> pl.Series:
      """
-     Generate values from uniform distribution.
+     Generate values from binomial distribution.

      Args:
-         n_rows: Number of values to generate
-         min_val: Minimum value (estimated from data if None)
-         max_val: Maximum value (estimated from data if None)
-         data: Existing data to estimate parameters from
-         seed: Random seed for reproducibility
+         n: Number of values to generate
+         trials: Number of trials
+         prob: Probability of success
+         seed: Random seed

      Returns:
-         List of generated values
-
-     Raises:
-         ValidationError: If neither parameters nor data provided
+         Polars Series with generated values
      """
-     # Estimate parameters from data if not provided
-     if min_val is None or max_val is None:
-         if data is None:
-             raise ValidationError(
-                 "Must provide either (min_val, max_val) or data for uniform distribution"
-             )
-
-         _, _, data_min, data_max = estimate_distribution_params(data)
-
-         if min_val is None:
-             min_val = data_min
-         if max_val is None:
-             max_val = data_max
-
-     # Validate parameters
-     if min_val >= max_val:
-         raise ValidationError(
-             f"min_val must be less than max_val, got min={min_val}, max={max_val}"
-         )
-
-     # Generate values
      if seed is not None:
          np.random.seed(seed)

-     values = np.random.uniform(min_val, max_val, n_rows)
-
-     return values.tolist()
+     values = np.random.binomial(trials, prob, n)
+     return pl.Series(values)


- def generate_skewed(
-     n_rows: int,
-     direction: str,
-     mean: Optional[float] = None,
-     std: Optional[float] = None,
-     skewness: float = 1.0,
-     data: Optional[np.ndarray] = None,
-     seed: Optional[int] = None,
-     clip: bool = True
- ) -> List[float]:
+ def fit_distribution(series: pl.Series, dist_type: str) -> Dict:
      """
-     Generate values from skewed distribution.
-
-     Uses log-normal distribution for right skew and reflected log-normal for left skew.
+     Fit distribution to data and return parameters.

      Args:
-         n_rows: Number of values to generate
-         direction: 'left' or 'right'
-         mean: Target mean (estimated from data if None)
-         std: Target standard deviation (estimated from data if None)
-         skewness: Degree of skewness (default: 1.0)
-         data: Existing data to estimate parameters from
-         seed: Random seed for reproducibility
-         clip: Whether to clip values to data range
+         series: Data to fit
+         dist_type: Distribution type ('normal', 'uniform', 'exponential', etc.)

      Returns:
-         List of generated values
+         Dictionary with fitted parameters

-     Raises:
-         ValidationError: If parameters invalid
+     Example:
+         params = fit_distribution(df['age'], 'normal')
+         # Returns: {'mean': 35.5, 'std': 12.3, 'fit_quality': 0.95}
      """
-     # Validate direction
-     if direction not in ['left', 'right']:
-         raise ValidationError(f"Direction must be 'left' or 'right', got '{direction}'")
-
-     # Estimate parameters from data if not provided
-     if mean is None or std is None:
-         if data is None:
-             raise ValidationError(
-                 "Must provide either (mean, std) or data for skewed distribution"
-             )
-
-         est_mean, est_std, data_min, data_max = estimate_distribution_params(data)
-
-         if mean is None:
-             mean = est_mean
-         if std is None:
-             std = est_std
-
-     # Validate parameters
-     if std <= 0:
-         raise ValidationError(f"Standard deviation must be positive, got {std}")
-
-     # Generate values
-     if seed is not None:
-         np.random.seed(seed)
-
-     # Use log-normal distribution for skewness
-     # Adjust parameters to match target mean and std
-     sigma = np.sqrt(np.log(1 + (std / mean) ** 2))
-     mu = np.log(mean) - 0.5 * sigma ** 2
+     data = series.to_numpy()
+
+     if dist_type == 'normal':
+         mean = float(np.mean(data))
+         std = float(np.std(data, ddof=1))
+
+         # Simple fit quality based on how well data matches normal
+         # Using coefficient of variation as a rough measure
+         cv = std / abs(mean) if mean != 0 else float('inf')
+         fit_quality = max(0.0, min(1.0, 1.0 - cv / 2.0))  # Rough approximation
+
+         return {
+             'mean': mean,
+             'std': std,
+             'fit_quality': fit_quality
+         }
+
+     elif dist_type == 'uniform':
+         low = float(np.min(data))
+         high = float(np.max(data))
+
+         # Check if data is roughly uniform
+         expected_mean = (low + high) / 2
+         actual_mean = float(np.mean(data))
+         fit_quality = max(0.0, 1.0 - abs(actual_mean - expected_mean) / (high - low))
+
+         return {
+             'low': low,
+             'high': high,
+             'fit_quality': fit_quality
+         }
+
+     elif dist_type == 'exponential':
+         rate = 1.0 / float(np.mean(data))
+
+         # Simple fit quality check
+         theoretical_std = 1.0 / rate
+         actual_std = float(np.std(data, ddof=1))
+         fit_quality = max(0.0, 1.0 - abs(actual_std - theoretical_std) / theoretical_std)
+
+         return {
+             'rate': rate,
+             'fit_quality': fit_quality
+         }

-     # Scale sigma by skewness parameter
-     sigma *= abs(skewness)
-
-     if direction == 'right':
-         # Right-skewed: log-normal
-         values = np.random.lognormal(mu, sigma, n_rows)
      else:
-         # Left-skewed: reflected log-normal
-         values = np.random.lognormal(mu, sigma, n_rows)
-         # Reflect around mean
-         values = 2 * mean - values
-
-     # Clip to data range if requested
-     if clip and data is not None:
-         data_min = np.min(data)
-         data_max = np.max(data)
-         values = np.clip(values, data_min, data_max)
-
-     return values.tolist()
+         raise ValueError(f"Unsupported distribution type: {dist_type}")


- def generate_beta(
-     n_rows: int,
-     alpha: Optional[float] = None,
-     beta_param: Optional[float] = None,
-     data: Optional[np.ndarray] = None,
-     seed: Optional[int] = None,
-     scale_min: float = 0.0,
-     scale_max: float = 1.0
- ) -> List[float]:
+ def calculate_distribution_stats(series: pl.Series) -> Dict:
      """
-     Generate values from beta distribution.
-
-     Beta distribution is bounded between 0 and 1 (or scaled range).
-     Useful for percentages, probabilities, proportions.
+     Calculate distribution statistics.

      Args:
-         n_rows: Number of values to generate
-         alpha: Shape parameter (> 0)
-         beta_param: Shape parameter (> 0)
-         data: Existing data to estimate parameters from
-         seed: Random seed for reproducibility
-         scale_min: Minimum value for scaling (default: 0)
-         scale_max: Maximum value for scaling (default: 1)
+         series: Data to analyze

      Returns:
-         List of generated values
-
-     Raises:
-         ValidationError: If parameters invalid
+         Dictionary with statistics
      """
-     # Estimate parameters from data if not provided
-     if alpha is None or beta_param is None:
-         if data is None:
-             raise ValidationError(
-                 "Must provide either (alpha, beta) or data for beta distribution"
-             )
-
-         # Normalize data to [0, 1]
-         data_min = np.min(data)
-         data_max = np.max(data)
-
-         if data_max == data_min:
-             raise ValidationError("Data has no variance, cannot fit beta distribution")
-
-         normalized = (data - data_min) / (data_max - data_min)
-
-         # Method of moments estimation
-         mean = np.mean(normalized)
-         var = np.var(normalized)
-
-         # Avoid edge cases
-         mean = np.clip(mean, 0.01, 0.99)
-         var = np.clip(var, 0.001, mean * (1 - mean) * 0.99)
-
-         # Estimate alpha and beta
-         alpha = mean * ((mean * (1 - mean) / var) - 1)
-         beta_param = (1 - mean) * ((mean * (1 - mean) / var) - 1)
-
-         # Use data range for scaling
-         scale_min = data_min
-         scale_max = data_max
-
-     # Validate parameters
-     if alpha <= 0 or beta_param <= 0:
-         raise ValidationError(
-             f"Alpha and beta must be positive, got alpha={alpha}, beta={beta_param}"
-         )
-
-     # Generate values
-     if seed is not None:
-         np.random.seed(seed)
-
-     values = np.random.beta(alpha, beta_param, n_rows)
-
-     # Scale to desired range
-     values = values * (scale_max - scale_min) + scale_min
-
-     return values.tolist()
+     data = series.to_numpy()
+
+     # Basic statistics
+     mean = float(np.mean(data))
+     median = float(np.median(data))
+     std = float(np.std(data, ddof=1))
+     variance = float(np.var(data, ddof=1))
+
+     # Min/max/range
+     min_val = float(np.min(data))
+     max_val = float(np.max(data))
+     range_val = max_val - min_val
+
+     # Quantiles
+     q25 = float(np.percentile(data, 25))
+     q75 = float(np.percentile(data, 75))
+     iqr = q75 - q25
+
+     # Mode (most frequent value)
+     unique, counts = np.unique(data, return_counts=True)
+     mode_idx = np.argmax(counts)
+     mode = float(unique[mode_idx])
+
+     # Skewness and kurtosis (simplified calculations)
+     n = len(data)
+     if n > 2 and std > 0:
+         # Skewness
+         skewness = float(np.sum(((data - mean) / std) ** 3) / n)
+
+         # Kurtosis (excess kurtosis)
+         kurtosis = float(np.sum(((data - mean) / std) ** 4) / n - 3)
+     else:
+         skewness = 0.0
+         kurtosis = 0.0
+
+     return {
+         'mean': mean,
+         'median': median,
+         'mode': mode,
+         'std': std,
+         'variance': variance,
+         'skewness': skewness,
+         'kurtosis': kurtosis,
+         'min': min_val,
+         'max': max_val,
+         'range': range_val,
+         'q25': q25,
+         'q75': q75,
+         'iqr': iqr
+     }


- def generate_gamma(
-     n_rows: int,
-     shape: Optional[float] = None,
-     scale: Optional[float] = None,
-     data: Optional[np.ndarray] = None,
-     seed: Optional[int] = None
- ) -> List[float]:
+ def check_normality(series: pl.Series) -> Dict:
      """
-     Generate values from gamma distribution.
-
-     Gamma distribution is for positive values, often right-skewed.
-     Useful for waiting times, sizes, amounts.
+     Test if data follows normal distribution.

      Args:
-         n_rows: Number of values to generate
-         shape: Shape parameter (k, > 0)
-         scale: Scale parameter (theta, > 0)
-         data: Existing data to estimate parameters from
-         seed: Random seed for reproducibility
+         series: Data to test

      Returns:
-         List of generated values
-
-     Raises:
-         ValidationError: If parameters invalid
+         Dictionary with test results
      """
-     # Estimate parameters from data if not provided
-     if shape is None or scale is None:
-         if data is None:
-             raise ValidationError(
-                 "Must provide either (shape, scale) or data for gamma distribution"
-             )
-
-         # Check for non-positive values
-         if np.any(data <= 0):
-             raise ValidationError(
-                 "Gamma distribution requires all positive values"
-             )
-
-         # Method of moments estimation
-         mean = np.mean(data)
-         var = np.var(data)
-
-         if var == 0:
-             raise ValidationError("Data has no variance, cannot fit gamma distribution")
-
-         # shape = mean^2 / var, scale = var / mean
-         shape = (mean ** 2) / var
-         scale = var / mean
+     data = series.to_numpy()

-     # Validate parameters
-     if shape <= 0 or scale <= 0:
-         raise ValidationError(
-             f"Shape and scale must be positive, got shape={shape}, scale={scale}"
-         )
+     # Simple normality test based on skewness and kurtosis
+     # This is a simplified version - in production, you'd use scipy.stats
+     stats = calculate_distribution_stats(series)

-     # Generate values
-     if seed is not None:
-         np.random.seed(seed)
+     # Normal distribution has skewness ≈ 0 and kurtosis ≈ 0
+     skew_test = abs(stats['skewness']) < 0.5
+     kurt_test = abs(stats['kurtosis']) < 0.5
+
+     is_normal = skew_test and kurt_test

-     values = np.random.gamma(shape, scale, n_rows)
+     # Rough p-value approximation
+     skew_p = max(0.001, 1.0 - abs(stats['skewness']))
+     kurt_p = max(0.001, 1.0 - abs(stats['kurtosis']))
+     p_value = min(skew_p, kurt_p)

-     return values.tolist()
+     # Test statistic (combined skewness and kurtosis)
+     test_statistic = abs(stats['skewness']) + abs(stats['kurtosis'])
+
+     return {
+         'is_normal': is_normal,
+         'p_value': p_value,
+         'test_statistic': test_statistic,
+         'test_name': 'Simplified Normality Test'
+     }


- def generate_exponential_dist(
-     n_rows: int,
-     rate: Optional[float] = None,
-     data: Optional[np.ndarray] = None,
-     seed: Optional[int] = None
- ) -> List[float]:
+ def generate_correlated(series: pl.Series, n: int, correlation: float,
+                         seed: Optional[int] = None) -> pl.Series:
      """
-     Generate values from exponential distribution.
-
-     Exponential distribution models time between events.
-     Memoryless property. Always positive.
+     Generate values correlated with existing series.

      Args:
-         n_rows: Number of values to generate
-         rate: Rate parameter (lambda, > 0). Mean = 1/rate
-         data: Existing data to estimate parameters from
-         seed: Random seed for reproducibility
+         series: Series to correlate with
+         n: Number of values to generate
+         correlation: Desired correlation (-1 to 1)
+         seed: Random seed

      Returns:
-         List of generated values
+         Polars Series with correlated values

-     Raises:
-         ValidationError: If parameters invalid
+     Example:
+         # Generate income correlated with age (correlation = 0.75)
+         income = generate_correlated(df['age'], n=1000, correlation=0.75)
      """
-     # Estimate parameters from data if not provided
-     if rate is None:
-         if data is None:
-             raise ValidationError(
-                 "Must provide either rate or data for exponential distribution"
-             )
-
-         # Check for non-positive values
-         if np.any(data <= 0):
-             raise ValidationError(
-                 "Exponential distribution requires all positive values"
-             )
-
-         # Maximum likelihood estimation: rate = 1 / mean
-         mean = np.mean(data)
-         rate = 1.0 / mean
-
-     # Validate parameters
-     if rate <= 0:
-         raise ValidationError(f"Rate must be positive, got {rate}")
-
-     # Generate values
      if seed is not None:
          np.random.seed(seed)

-     # numpy uses scale = 1/rate
-     scale = 1.0 / rate
-     values = np.random.exponential(scale, n_rows)
+     # Get original data
+     x = series.to_numpy()
+
+     # If we need more values than available, repeat the series
+     if n > len(x):
+         repeats = (n // len(x)) + 1
+         x = np.tile(x, repeats)[:n]
+     else:
+         x = x[:n]

-     return values.tolist()
+     # Standardize x
+     x_mean = np.mean(x)
+     x_std = np.std(x)
+     if x_std == 0:
+         x_std = 1.0  # Avoid division by zero
+     x_standardized = (x - x_mean) / x_std
+
+     # Generate independent random variable
+     z = np.random.normal(0, 1, n)
+
+     # Create correlated variable using Cholesky-like approach
+     # y = correlation * x + sqrt(1 - correlation^2) * z
+     y = correlation * x_standardized + math.sqrt(1 - correlation**2) * z
+
+     # Scale y to have similar range as x
+     y = y * x_std + x_mean
+
+     return pl.Series(y)


- def generate_kde(
-     n_rows: int,
-     data: np.ndarray,
-     bandwidth: Optional[float] = None,
-     seed: Optional[int] = None
- ) -> List[float]:
+ def add_noise(series: pl.Series, noise_level: float, seed: Optional[int] = None) -> pl.Series:
      """
-     Generate values using Kernel Density Estimation.
-
-     KDE learns the exact distribution shape from data.
-     Non-parametric approach that preserves complex patterns.
+     Add random noise to series.

      Args:
-         n_rows: Number of values to generate
-         data: Existing data to learn from (required)
-         bandwidth: KDE bandwidth (auto-selected if None)
-         seed: Random seed for reproducibility
+         series: Series to add noise to
+         noise_level: Noise level (0 to 1, as fraction of std)
+         seed: Random seed

      Returns:
-         List of generated values
-
-     Raises:
-         ValidationError: If data invalid
+         Series with added noise
      """
-     if data is None or len(data) == 0:
-         raise ValidationError("KDE requires existing data")
-
-     if len(data) < 3:
-         raise ValidationError(f"KDE requires at least 3 data points, got {len(data)}")
-
-     # Auto-select bandwidth using Silverman's rule of thumb
-     if bandwidth is None:
-         std = np.std(data)
-         n = len(data)
-         bandwidth = 1.06 * std * (n ** (-1/5))
-
-         # Ensure reasonable bandwidth
-         if bandwidth == 0:
-             bandwidth = 0.1 * (np.max(data) - np.min(data))
-
-     if bandwidth <= 0:
-         raise ValidationError(f"Bandwidth must be positive, got {bandwidth}")
-
-     # Generate values by sampling from data and adding noise
      if seed is not None:
          np.random.seed(seed)

-     # Sample from data with replacement
-     sampled_indices = np.random.choice(len(data), size=n_rows, replace=True)
-     sampled_values = data[sampled_indices]
+     data = series.to_numpy()
+     std = np.std(data)

-     # Add Gaussian noise with bandwidth as std
-     noise = np.random.normal(0, bandwidth, n_rows)
-     values = sampled_values + noise
+     # If std is zero (constant data), use a small default noise level
+     if std == 0:
+         std = 1.0  # Use unit noise for constant data

-     return values.tolist()
+     # Generate noise
+     noise = np.random.normal(0, std * noise_level, len(data))
+
+     # Add noise to original data
+     noisy_data = data + noise
+
+     return pl.Series(noisy_data)


- def generate_multivariate_normal(
-     n_rows: int,
-     columns: List[str],
-     data: np.ndarray,
-     seed: Optional[int] = None
- ) -> np.ndarray:
+ def generate_seasonal(n: int, period: int, amplitude: float = 1.0,
+                       trend: str = 'none', noise: float = 0.0,
+                       seed: Optional[int] = None) -> pl.Series:
      """
-     Generate correlated values using multivariate normal distribution.
-
-     Preserves correlations between multiple columns.
+     Generate seasonal time series data.

      Args:
-         n_rows: Number of rows to generate
-         columns: List of column names
-         data: Existing data (2D array, shape: [n_samples, n_features])
-         seed: Random seed for reproducibility
+         n: Number of values to generate
+         period: Seasonal period (e.g., 7 for weekly, 365 for yearly)
+         amplitude: Amplitude of seasonal component
+         trend: Trend type ('none', 'increasing', 'decreasing')
+         noise: Noise level
+         seed: Random seed

      Returns:
-         2D array of generated values (shape: [n_rows, n_features])
+         Series with seasonal pattern

-     Raises:
-         ValidationError: If data invalid
+     Example:
+         # Generate weekly seasonal sales data
+         sales = generate_seasonal(n=365, period=7, amplitude=100,
+                                   trend='increasing', noise=0.1)
      """
-     if data is None or len(data) == 0:
-         raise ValidationError("Multivariate normal requires existing data")
-
-     if data.ndim != 2:
-         raise ValidationError(f"Data must be 2D array, got shape {data.shape}")
+     if seed is not None:
+         np.random.seed(seed)

-     if data.shape[1] != len(columns):
-         raise ValidationError(
-             f"Number of columns ({len(columns)}) doesn't match data dimensions ({data.shape[1]})"
-         )
+     # Time index
+     t = np.arange(n)

-     # Estimate mean and covariance
-     mean = np.mean(data, axis=0)
-     cov = np.cov(data, rowvar=False)
+     # Seasonal component (sine wave)
+     seasonal = amplitude * np.sin(2 * np.pi * t / period)

-     # Ensure covariance matrix is positive definite
-     # Add small value to diagonal if needed
-     min_eig = np.min(np.linalg.eigvals(cov))
-     if min_eig < 0:
-         cov += np.eye(cov.shape[0]) * (abs(min_eig) + 1e-6)
+     # Trend component
+     if trend == 'increasing':
+         trend_component = t * (amplitude / n)
+     elif trend == 'decreasing':
+         trend_component = -t * (amplitude / n)
+     else:  # 'none'
+         trend_component = np.zeros(n)

-     # Generate values
-     if seed is not None:
-         np.random.seed(seed)
+     # Noise component
+     if noise > 0:
+         noise_component = np.random.normal(0, amplitude * noise, n)
+     else:
+         noise_component = np.zeros(n)

-     values = np.random.multivariate_normal(mean, cov, n_rows)
+     # Combine components
+     values = seasonal + trend_component + noise_component

-     return values
-
-
- def generate_distribution_values(
-     n_rows: int,
-     distribution: str = DistributionType.AUTO,
-     data: Optional[np.ndarray] = None,
-     seed: Optional[int] = None,
-     **params
- ) -> List[float]:
-     """
-     Main distribution generation function.
-
-     Args:
-         n_rows: Number of values to generate
-         distribution: Distribution type (normal, uniform, skewed_left, skewed_right,
-             beta, gamma, exponential, kde, auto)
-         data: Existing data to estimate parameters from (required for auto and kde)
-         seed: Random seed for reproducibility
-         **params: Distribution-specific parameters:
-             - mean, std: For normal
-             - min_val, max_val: For uniform
-             - skewness: For skewed (default: 1.0)
-             - alpha, beta: For beta
-             - shape, scale: For gamma
-             - rate: For exponential
-             - bandwidth: For kde
-             - clip: Whether to clip to data range (default: True)
-
-     Returns:
-         List of generated values
-
-     Raises:
-         ValidationError: If parameters invalid
-         AugmentError: If generation fails
-     """
-     # Auto-detect distribution if requested
-     if distribution == DistributionType.AUTO or distribution == "auto":
-         if data is None:
-             raise ValidationError(
-                 "Auto distribution detection requires existing data"
-             )
-
-         distribution = detect_distribution_type(data)
-         print(f"Auto-detected distribution: {distribution}")
-
-     # Generate based on distribution type
-     try:
-         if distribution == DistributionType.NORMAL:
-             return generate_normal(
-                 n_rows,
-                 mean=params.get('mean'),
-                 std=params.get('std'),
-                 data=data,
-                 seed=seed,
-                 clip=params.get('clip', True)
-             )
-
-         elif distribution == DistributionType.UNIFORM:
-             return generate_uniform(
-                 n_rows,
-                 min_val=params.get('min_val'),
-                 max_val=params.get('max_val'),
-                 data=data,
-                 seed=seed
-             )
-
-         elif distribution in [DistributionType.SKEWED_LEFT, DistributionType.SKEWED_RIGHT]:
-             direction = 'left' if distribution == DistributionType.SKEWED_LEFT else 'right'
-             return generate_skewed(
-                 n_rows,
-                 direction=direction,
-                 mean=params.get('mean'),
-                 std=params.get('std'),
-                 skewness=params.get('skewness', 1.0),
-                 data=data,
-                 seed=seed,
-                 clip=params.get('clip', True)
-             )
-
-         elif distribution == DistributionType.BETA:
-             return generate_beta(
-                 n_rows,
-                 alpha=params.get('alpha'),
-                 beta_param=params.get('beta'),
-                 data=data,
-                 seed=seed,
-                 scale_min=params.get('scale_min', 0.0),
-                 scale_max=params.get('scale_max', 1.0)
-             )
-
-         elif distribution == DistributionType.GAMMA:
-             return generate_gamma(
-                 n_rows,
-                 shape=params.get('shape'),
-                 scale=params.get('scale'),
-                 data=data,
-                 seed=seed
-             )
-
-         elif distribution == DistributionType.EXPONENTIAL:
-             return generate_exponential_dist(
-                 n_rows,
-                 rate=params.get('rate'),
-                 data=data,
-                 seed=seed
-             )
-
-         elif distribution == DistributionType.KDE:
-             if data is None:
-                 raise ValidationError("KDE requires existing data")
-             return generate_kde(
-                 n_rows,
-                 data=data,
-                 bandwidth=params.get('bandwidth'),
-                 seed=seed
-             )
-
-         else:
-             raise ValidationError(
-                 f"Unknown distribution type: '{distribution}'. "
-                 f"Supported: normal, uniform, skewed_left, skewed_right, "
-                 f"beta, gamma, exponential, kde, auto"
-             )
-
-     except Exception as e:
-         if isinstance(e, (ValidationError, AugmentError)):
-             raise
-         raise AugmentError(f"Distribution generation failed: {e}")
+     return pl.Series(values)
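
For reference, a minimal sketch of the new correlation and seasonality helpers, again assuming the additory.common.distributions import path implied by the file list. Note that generate_correlated treats correlation as a target: it mixes the standardized input with independent noise as y = r*x + sqrt(1 - r^2)*z, so the sample correlation only approximates r.

    import polars as pl
    from additory.common.distributions import generate_correlated, generate_seasonal

    age = pl.Series([23, 35, 41, 29, 52, 47, 33, 38])

    # Values with target correlation 0.75 to age, rescaled to age's mean/std
    income = generate_correlated(age, n=8, correlation=0.75, seed=7)

    # One year of daily data: weekly sine seasonality + linear trend + noise
    sales = generate_seasonal(n=365, period=7, amplitude=100.0,
                              trend='increasing', noise=0.1, seed=7)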