additory-0.1.0a3-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/analysis/distributions.py (deleted)
@@ -1,376 +0,0 @@
- """
- Distribution Detection and Fitting
-
- Detects and fits statistical distributions to numeric data.
- """
-
- from dataclasses import dataclass
- from typing import List, Dict, Any, Optional
- import numpy as np
- from scipy import stats
-
-
- @dataclass
- class DistributionFit:
-     """Result of fitting a distribution to data."""
-     name: str
-     params: Dict[str, float]
-     goodness_of_fit: float  # KS test statistic (lower is better)
-     p_value: float  # KS test p-value (higher is better)
-
-     def __repr__(self) -> str:
-         return f"DistributionFit(name='{self.name}', fit={self.goodness_of_fit:.4f}, p={self.p_value:.4f})"
-
-
- def fit_normal(data: np.ndarray) -> DistributionFit:
-     """Fit normal distribution."""
-     mean, std = stats.norm.fit(data)
-     ks_stat, p_value = stats.kstest(data, 'norm', args=(mean, std))
-
-     return DistributionFit(
-         name='normal',
-         params={'mean': float(mean), 'std': float(std)},
-         goodness_of_fit=float(ks_stat),
-         p_value=float(p_value)
-     )
-
-
- def fit_uniform(data: np.ndarray) -> DistributionFit:
-     """Fit uniform distribution."""
-     loc, scale = stats.uniform.fit(data)
-     ks_stat, p_value = stats.kstest(data, 'uniform', args=(loc, scale))
-
-     return DistributionFit(
-         name='uniform',
-         params={'min': float(loc), 'max': float(loc + scale)},
-         goodness_of_fit=float(ks_stat),
-         p_value=float(p_value)
-     )
-
-
- def fit_exponential(data: np.ndarray) -> Optional[DistributionFit]:
-     """Fit exponential distribution (requires positive values)."""
-     if np.any(data <= 0):
-         return None
-
-     loc, scale = stats.expon.fit(data)
-     ks_stat, p_value = stats.kstest(data, 'expon', args=(loc, scale))
-
-     return DistributionFit(
-         name='exponential',
-         params={'loc': float(loc), 'scale': float(scale), 'rate': float(1/scale)},
-         goodness_of_fit=float(ks_stat),
-         p_value=float(p_value)
-     )
-
-
- def fit_lognormal(data: np.ndarray) -> Optional[DistributionFit]:
-     """Fit log-normal distribution (requires positive values)."""
-     if np.any(data <= 0):
-         return None
-
-     shape, loc, scale = stats.lognorm.fit(data, floc=0)
-     ks_stat, p_value = stats.kstest(data, 'lognorm', args=(shape, loc, scale))
-
-     return DistributionFit(
-         name='lognormal',
-         params={'shape': float(shape), 'loc': float(loc), 'scale': float(scale)},
-         goodness_of_fit=float(ks_stat),
-         p_value=float(p_value)
-     )
-
-
- def fit_gamma(data: np.ndarray) -> Optional[DistributionFit]:
-     """Fit gamma distribution (requires positive values)."""
-     if np.any(data <= 0):
-         return None
-
-     shape, loc, scale = stats.gamma.fit(data, floc=0)
-     ks_stat, p_value = stats.kstest(data, 'gamma', args=(shape, loc, scale))
-
-     return DistributionFit(
-         name='gamma',
-         params={'shape': float(shape), 'loc': float(loc), 'scale': float(scale)},
-         goodness_of_fit=float(ks_stat),
-         p_value=float(p_value)
-     )
-
-
- def fit_beta(data: np.ndarray) -> Optional[DistributionFit]:
-     """Fit beta distribution (requires values in [0, 1] or will be normalized)."""
-     # Normalize to [0, 1]
-     data_min, data_max = np.min(data), np.max(data)
-
-     if data_max == data_min:
-         return None
-
-     normalized = (data - data_min) / (data_max - data_min)
-
-     # Avoid exact 0 and 1 for beta fitting
-     normalized = np.clip(normalized, 1e-6, 1 - 1e-6)
-
-     a, b, loc, scale = stats.beta.fit(normalized, floc=0, fscale=1)
-     ks_stat, p_value = stats.kstest(normalized, 'beta', args=(a, b, loc, scale))
-
-     return DistributionFit(
-         name='beta',
-         params={
-             'alpha': float(a),
-             'beta': float(b),
-             'data_min': float(data_min),
-             'data_max': float(data_max)
-         },
-         goodness_of_fit=float(ks_stat),
-         p_value=float(p_value)
-     )
-
-
- def fit_poisson(data: np.ndarray) -> Optional[DistributionFit]:
-     """Fit Poisson distribution (requires non-negative integers)."""
-     # Check if data looks like integers
-     if not np.allclose(data, np.round(data)):
-         return None
-
-     if np.any(data < 0):
-         return None
-
-     mu = np.mean(data)
-
-     # For Poisson, use chi-square test instead of KS
-     # KS test doesn't work well for discrete distributions
-     # We'll use a simplified goodness-of-fit measure
-     expected_var = mu
-     actual_var = np.var(data)
-
-     # Goodness of fit: how close variance is to mean (Poisson property)
-     if mu > 0:
-         fit_score = abs(actual_var - expected_var) / mu
-     else:
-         fit_score = 1.0
-
-     return DistributionFit(
-         name='poisson',
-         params={'lambda': float(mu)},
-         goodness_of_fit=float(fit_score),
-         p_value=0.0  # Not applicable for this simplified test
-     )
-
-
- def fit_chisquare(data: np.ndarray) -> Optional[DistributionFit]:
-     """Fit chi-squared distribution (requires positive values)."""
-     if np.any(data <= 0):
-         return None
-
-     df, loc, scale = stats.chi2.fit(data, floc=0)
-     ks_stat, p_value = stats.kstest(data, 'chi2', args=(df, loc, scale))
-
-     return DistributionFit(
-         name='chisquare',
-         params={'df': float(df), 'loc': float(loc), 'scale': float(scale)},
-         goodness_of_fit=float(ks_stat),
-         p_value=float(p_value)
-     )
-
-
- def fit_distribution(data: np.ndarray, dist_name: str) -> Optional[DistributionFit]:
-     """
-     Fit a specific distribution to data.
-
-     Args:
-         data: Numeric data array
-         dist_name: Distribution name (normal, uniform, exponential, etc.)
-
-     Returns:
-         DistributionFit object or None if fitting failed
-     """
-     if len(data) < 3:
-         return None
-
-     # Remove NaN values
-     data = data[~np.isnan(data)]
-
-     if len(data) < 3:
-         return None
-
-     try:
-         if dist_name == 'normal':
-             return fit_normal(data)
-         elif dist_name == 'uniform':
-             return fit_uniform(data)
-         elif dist_name == 'exponential':
-             return fit_exponential(data)
-         elif dist_name == 'lognormal':
-             return fit_lognormal(data)
-         elif dist_name == 'gamma':
-             return fit_gamma(data)
-         elif dist_name == 'beta':
-             return fit_beta(data)
-         elif dist_name == 'poisson':
-             return fit_poisson(data)
-         elif dist_name == 'chisquare':
-             return fit_chisquare(data)
-         else:
-             return None
-     except Exception:
-         return None
-
-
- def detect_distributions(
-     data: np.ndarray,
-     top_n: int = 3
- ) -> List[DistributionFit]:
-     """
-     Detect best-fitting distributions for data.
-
-     Args:
-         data: Numeric data array
-         top_n: Number of top distributions to return
-
-     Returns:
-         List of DistributionFit objects, sorted by goodness of fit
-     """
-     if len(data) < 3:
-         return []
-
-     # Remove NaN values
-     data = data[~np.isnan(data)]
-
-     if len(data) < 3:
-         return []
-
-     # Try all distributions
-     distributions = [
-         'normal',
-         'uniform',
-         'exponential',
-         'lognormal',
-         'gamma',
-         'beta',
-         'poisson',
-         'chisquare'
-     ]
-
-     fits = []
-     for dist_name in distributions:
-         fit = fit_distribution(data, dist_name)
-         if fit is not None:
-             fits.append(fit)
-
-     # Sort by goodness of fit (lower is better)
-     fits.sort(key=lambda x: x.goodness_of_fit)
-
-     return fits[:top_n]
-
-
- def detect_distributions(
-     df,
-     columns: List[str] = None,
-     top_n: int = 3
- ) -> Dict[str, List[DistributionFit]]:
-     """
-     Detect best-fitting distributions for multiple columns in a DataFrame.
-
-     Args:
-         df: Polars DataFrame
-         columns: List of column names to analyze (None = all numeric columns)
-         top_n: Number of top distributions to return per column
-
-     Returns:
-         Dictionary mapping column names to lists of DistributionFit objects
-     """
-     import polars as pl
-     from concurrent.futures import ThreadPoolExecutor, as_completed
-     import numpy as np
-
-     if columns is None:
-         # Auto-detect numeric columns
-         columns = [col for col in df.columns
-                    if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
-                                         pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
-                                         pl.Float32, pl.Float64]]
-
-     results = {}
-
-     def process_column(col_name):
-         """Process a single column for distribution detection"""
-         try:
-             # Extract column data as numpy array
-             col_data = df[col_name].to_numpy()
-
-             # Remove null values
-             col_data = col_data[~np.isnan(col_data)]
-
-             if len(col_data) < 3:
-                 return col_name, []
-
-             # Detect distributions for this column
-             fits = detect_distributions_array(col_data, top_n)
-             return col_name, fits
-
-         except Exception as e:
-             # Log error but continue with other columns
-             return col_name, []
-
-     # Use ThreadPoolExecutor for parallel processing
-     with ThreadPoolExecutor(max_workers=min(4, len(columns))) as executor:
-         # Submit all column processing tasks
-         future_to_column = {
-             executor.submit(process_column, col): col
-             for col in columns
-         }
-
-         # Collect results as they complete
-         for future in as_completed(future_to_column):
-             col_name, fits = future.result()
-             results[col_name] = fits
-
-     return results
-
-
- def detect_distributions_array(
-     data: np.ndarray,
-     top_n: int = 3
- ) -> List[DistributionFit]:
-     """
-     Detect best-fitting distributions for data array.
-
-     This is the original function renamed to avoid conflicts.
-
-     Args:
-         data: Numeric data array
-         top_n: Number of top distributions to return
-
-     Returns:
-         List of DistributionFit objects, sorted by goodness of fit
-     """
-     if len(data) < 3:
-         return []
-
-     # Remove NaN values
-     data = data[~np.isnan(data)]
-
-     if len(data) < 3:
-         return []
-
-     # Try all distributions
-     distributions = [
-         'normal',
-         'uniform',
-         'exponential',
-         'lognormal',
-         'gamma',
-         'beta',
-         'poisson',
-         'chisquare'
-     ]
-
-     fits = []
-     for dist_name in distributions:
-         fit = fit_distribution(data, dist_name)
-         if fit is not None:
-             fits.append(fit)
-
-     # Sort by goodness of fit (lower is better)
-     fits.sort(key=lambda x: x.goodness_of_fit)
-
-     return fits[:top_n]
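
A quirk of this removed module: `detect_distributions` was defined twice, so the later DataFrame version shadowed the earlier array version at import time, and the explicitly renamed `detect_distributions_array` existed to keep the array path reachable (its docstring says as much). Below is a minimal sketch of how this deleted 0.1.0a3 API could have been exercised, assuming numpy, scipy, and an installed 0.1.0a3 wheel; the sample data is an illustrative assumption, and none of this exists in 0.1.1a1:

# Hypothetical usage of the removed 0.1.0a3 module (deleted in 0.1.1a1).
import numpy as np
from additory.analysis.distributions import fit_distribution, detect_distributions_array

rng = np.random.default_rng(42)
sample = rng.lognormal(mean=0.0, sigma=0.5, size=1_000)

# Fit one named distribution; returns None when the data violates its support
# (e.g. non-positive values for 'lognormal') or when fitting fails.
fit = fit_distribution(sample, 'lognormal')
print(fit)  # e.g. DistributionFit(name='lognormal', fit=0.0123, p=0.8765)

# Rank all eight candidate distributions by KS statistic (lower is better).
for candidate in detect_distributions_array(sample, top_n=3):
    print(candidate.name, candidate.goodness_of_fit, candidate.p_value)

Note that the ranking mixes scores of different kinds: most fits report a KS statistic, but the Poisson fit substitutes a variance-to-mean ratio with `p_value` pinned at 0.0, so a Poisson score is not strictly comparable to the others in the sort.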
additory/analysis/quality.py (deleted)
@@ -1,158 +0,0 @@
- """
- Data Quality Metrics
-
- Analyzes data quality including missing values, types, and statistics.
- """
-
- from dataclasses import dataclass
- from typing import Optional, Any, Dict
- import polars as pl
- import numpy as np
-
-
- @dataclass
- class QualityMetrics:
-     """Data quality metrics for a column."""
-     column: str
-     dtype: str
-     missing_count: int
-     missing_ratio: float
-     total_count: int
-
-     # Numeric statistics
-     min_value: Optional[float] = None
-     max_value: Optional[float] = None
-     mean: Optional[float] = None
-     median: Optional[float] = None
-     std: Optional[float] = None
-     q25: Optional[float] = None
-     q75: Optional[float] = None
-
-     # Categorical statistics
-     mode: Optional[Any] = None
-     mode_count: Optional[int] = None
-     mode_ratio: Optional[float] = None
-
-     def __repr__(self) -> str:
-         return (
-             f"QualityMetrics(column='{self.column}', "
-             f"dtype='{self.dtype}', missing={self.missing_ratio:.1%})"
-         )
-
-     def to_dict(self) -> Dict[str, Any]:
-         """Convert to dictionary."""
-         return {
-             'column': self.column,
-             'dtype': self.dtype,
-             'missing_count': self.missing_count,
-             'missing_ratio': self.missing_ratio,
-             'total_count': self.total_count,
-             'min': self.min_value,
-             'max': self.max_value,
-             'mean': self.mean,
-             'median': self.median,
-             'std': self.std,
-             'q25': self.q25,
-             'q75': self.q75,
-             'mode': self.mode,
-             'mode_count': self.mode_count,
-             'mode_ratio': self.mode_ratio
-         }
-
-
- def is_numeric_dtype(dtype: pl.DataType) -> bool:
-     """Check if dtype is numeric."""
-     return dtype in [
-         pl.Int8, pl.Int16, pl.Int32, pl.Int64,
-         pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
-         pl.Float32, pl.Float64
-     ]
-
-
- def analyze_quality(
-     df: pl.DataFrame,
-     column: str
- ) -> QualityMetrics:
-     """
-     Analyze data quality for a column.
-
-     Args:
-         df: Polars DataFrame
-         column: Column name
-
-     Returns:
-         QualityMetrics object
-     """
-     col_series = df[column]
-     dtype = col_series.dtype
-
-     # Basic counts
-     total_count = len(df)
-     missing_count = col_series.null_count()
-     missing_ratio = missing_count / total_count if total_count > 0 else 0.0
-
-     # Initialize metrics
-     metrics = QualityMetrics(
-         column=column,
-         dtype=str(dtype),
-         missing_count=missing_count,
-         missing_ratio=missing_ratio,
-         total_count=total_count
-     )
-
-     # Numeric statistics
-     if is_numeric_dtype(dtype):
-         try:
-             metrics.min_value = float(col_series.min())
-             metrics.max_value = float(col_series.max())
-             metrics.mean = float(col_series.mean())
-             metrics.median = float(col_series.median())
-             metrics.std = float(col_series.std())
-
-             # Quantiles
-             q25 = col_series.quantile(0.25, interpolation='linear')
-             q75 = col_series.quantile(0.75, interpolation='linear')
-             if q25 is not None:
-                 metrics.q25 = float(q25)
-             if q75 is not None:
-                 metrics.q75 = float(q75)
-         except Exception:
-             pass
-
-     # Mode (for all types)
-     try:
-         mode_result = (
-             df
-             .group_by(column)
-             .agg(pl.len().alias('count'))
-             .sort('count', descending=True)
-             .head(1)
-         )
-
-         if len(mode_result) > 0:
-             row = mode_result.row(0, named=True)
-             metrics.mode = row[column]
-             metrics.mode_count = row['count']
-             metrics.mode_ratio = metrics.mode_count / total_count if total_count > 0 else 0.0
-     except Exception:
-         pass
-
-     return metrics
-
-
- def analyze_all_quality(
-     df: pl.DataFrame
- ) -> Dict[str, QualityMetrics]:
-     """
-     Analyze data quality for all columns.
-
-     Args:
-         df: Polars DataFrame
-
-     Returns:
-         Dictionary mapping column names to QualityMetrics
-     """
-     return {
-         col: analyze_quality(df, col)
-         for col in df.columns
-     }
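
The removed quality module's entry point was `analyze_quality` for a single column, with `analyze_all_quality` sweeping every column via a dict comprehension. A small sketch against the old 0.1.0a3 layout, assuming polars is installed (the import works only against a 0.1.0a3 install, and the sample frame is an illustrative assumption):

# Hypothetical usage of the removed 0.1.0a3 quality API (deleted in 0.1.1a1).
import polars as pl
from additory.analysis.quality import analyze_quality, analyze_all_quality

df = pl.DataFrame({
    "age": [34, 41, None, 29, 41],
    "city": ["Berlin", "Berlin", "Paris", None, "Berlin"],
})

# Per-column metrics: null counts always, numeric stats for numeric dtypes,
# and the most frequent value (mode) for every dtype.
metrics = analyze_quality(df, "age")
print(metrics.missing_ratio, metrics.mean)  # 0.2 and 36.25 for this frame

# Sweep all columns at once; each value is a QualityMetrics instance.
for name, m in analyze_all_quality(df).items():
    print(m)  # e.g. QualityMetrics(column='age', dtype='Int64', missing=20.0%)

Because every statistic is wrapped in a broad try/except, a column that cannot be summarized simply leaves those fields at None rather than raising.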