additory-0.1.0a4-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/analysis/distributions.py
DELETED

@@ -1,376 +0,0 @@

"""
Distribution Detection and Fitting

Detects and fits statistical distributions to numeric data.
"""

from dataclasses import dataclass
from typing import List, Dict, Any, Optional
import numpy as np
from scipy import stats


@dataclass
class DistributionFit:
    """Result of fitting a distribution to data."""
    name: str
    params: Dict[str, float]
    goodness_of_fit: float  # KS test statistic (lower is better)
    p_value: float  # KS test p-value (higher is better)

    def __repr__(self) -> str:
        return f"DistributionFit(name='{self.name}', fit={self.goodness_of_fit:.4f}, p={self.p_value:.4f})"


def fit_normal(data: np.ndarray) -> DistributionFit:
    """Fit normal distribution."""
    mean, std = stats.norm.fit(data)
    ks_stat, p_value = stats.kstest(data, 'norm', args=(mean, std))

    return DistributionFit(
        name='normal',
        params={'mean': float(mean), 'std': float(std)},
        goodness_of_fit=float(ks_stat),
        p_value=float(p_value)
    )


def fit_uniform(data: np.ndarray) -> DistributionFit:
    """Fit uniform distribution."""
    loc, scale = stats.uniform.fit(data)
    ks_stat, p_value = stats.kstest(data, 'uniform', args=(loc, scale))

    return DistributionFit(
        name='uniform',
        params={'min': float(loc), 'max': float(loc + scale)},
        goodness_of_fit=float(ks_stat),
        p_value=float(p_value)
    )


def fit_exponential(data: np.ndarray) -> Optional[DistributionFit]:
    """Fit exponential distribution (requires positive values)."""
    if np.any(data <= 0):
        return None

    loc, scale = stats.expon.fit(data)
    ks_stat, p_value = stats.kstest(data, 'expon', args=(loc, scale))

    return DistributionFit(
        name='exponential',
        params={'loc': float(loc), 'scale': float(scale), 'rate': float(1/scale)},
        goodness_of_fit=float(ks_stat),
        p_value=float(p_value)
    )


def fit_lognormal(data: np.ndarray) -> Optional[DistributionFit]:
    """Fit log-normal distribution (requires positive values)."""
    if np.any(data <= 0):
        return None

    shape, loc, scale = stats.lognorm.fit(data, floc=0)
    ks_stat, p_value = stats.kstest(data, 'lognorm', args=(shape, loc, scale))

    return DistributionFit(
        name='lognormal',
        params={'shape': float(shape), 'loc': float(loc), 'scale': float(scale)},
        goodness_of_fit=float(ks_stat),
        p_value=float(p_value)
    )


def fit_gamma(data: np.ndarray) -> Optional[DistributionFit]:
    """Fit gamma distribution (requires positive values)."""
    if np.any(data <= 0):
        return None

    shape, loc, scale = stats.gamma.fit(data, floc=0)
    ks_stat, p_value = stats.kstest(data, 'gamma', args=(shape, loc, scale))

    return DistributionFit(
        name='gamma',
        params={'shape': float(shape), 'loc': float(loc), 'scale': float(scale)},
        goodness_of_fit=float(ks_stat),
        p_value=float(p_value)
    )


def fit_beta(data: np.ndarray) -> Optional[DistributionFit]:
    """Fit beta distribution (requires values in [0, 1] or will be normalized)."""
    # Normalize to [0, 1]
    data_min, data_max = np.min(data), np.max(data)

    if data_max == data_min:
        return None

    normalized = (data - data_min) / (data_max - data_min)

    # Avoid exact 0 and 1 for beta fitting
    normalized = np.clip(normalized, 1e-6, 1 - 1e-6)

    a, b, loc, scale = stats.beta.fit(normalized, floc=0, fscale=1)
    ks_stat, p_value = stats.kstest(normalized, 'beta', args=(a, b, loc, scale))

    return DistributionFit(
        name='beta',
        params={
            'alpha': float(a),
            'beta': float(b),
            'data_min': float(data_min),
            'data_max': float(data_max)
        },
        goodness_of_fit=float(ks_stat),
        p_value=float(p_value)
    )


def fit_poisson(data: np.ndarray) -> Optional[DistributionFit]:
    """Fit Poisson distribution (requires non-negative integers)."""
    # Check if data looks like integers
    if not np.allclose(data, np.round(data)):
        return None

    if np.any(data < 0):
        return None

    mu = np.mean(data)

    # For Poisson, use chi-square test instead of KS
    # KS test doesn't work well for discrete distributions
    # We'll use a simplified goodness-of-fit measure
    expected_var = mu
    actual_var = np.var(data)

    # Goodness of fit: how close variance is to mean (Poisson property)
    if mu > 0:
        fit_score = abs(actual_var - expected_var) / mu
    else:
        fit_score = 1.0

    return DistributionFit(
        name='poisson',
        params={'lambda': float(mu)},
        goodness_of_fit=float(fit_score),
        p_value=0.0  # Not applicable for this simplified test
    )


def fit_chisquare(data: np.ndarray) -> Optional[DistributionFit]:
    """Fit chi-squared distribution (requires positive values)."""
    if np.any(data <= 0):
        return None

    df, loc, scale = stats.chi2.fit(data, floc=0)
    ks_stat, p_value = stats.kstest(data, 'chi2', args=(df, loc, scale))

    return DistributionFit(
        name='chisquare',
        params={'df': float(df), 'loc': float(loc), 'scale': float(scale)},
        goodness_of_fit=float(ks_stat),
        p_value=float(p_value)
    )


def fit_distribution(data: np.ndarray, dist_name: str) -> Optional[DistributionFit]:
    """
    Fit a specific distribution to data.

    Args:
        data: Numeric data array
        dist_name: Distribution name (normal, uniform, exponential, etc.)

    Returns:
        DistributionFit object or None if fitting failed
    """
    if len(data) < 3:
        return None

    # Remove NaN values
    data = data[~np.isnan(data)]

    if len(data) < 3:
        return None

    try:
        if dist_name == 'normal':
            return fit_normal(data)
        elif dist_name == 'uniform':
            return fit_uniform(data)
        elif dist_name == 'exponential':
            return fit_exponential(data)
        elif dist_name == 'lognormal':
            return fit_lognormal(data)
        elif dist_name == 'gamma':
            return fit_gamma(data)
        elif dist_name == 'beta':
            return fit_beta(data)
        elif dist_name == 'poisson':
            return fit_poisson(data)
        elif dist_name == 'chisquare':
            return fit_chisquare(data)
        else:
            return None
    except Exception:
        return None


def detect_distributions(
    data: np.ndarray,
    top_n: int = 3
) -> List[DistributionFit]:
    """
    Detect best-fitting distributions for data.

    Args:
        data: Numeric data array
        top_n: Number of top distributions to return

    Returns:
        List of DistributionFit objects, sorted by goodness of fit
    """
    if len(data) < 3:
        return []

    # Remove NaN values
    data = data[~np.isnan(data)]

    if len(data) < 3:
        return []

    # Try all distributions
    distributions = [
        'normal',
        'uniform',
        'exponential',
        'lognormal',
        'gamma',
        'beta',
        'poisson',
        'chisquare'
    ]

    fits = []
    for dist_name in distributions:
        fit = fit_distribution(data, dist_name)
        if fit is not None:
            fits.append(fit)

    # Sort by goodness of fit (lower is better)
    fits.sort(key=lambda x: x.goodness_of_fit)

    return fits[:top_n]


def detect_distributions(
    df,
    columns: List[str] = None,
    top_n: int = 3
) -> Dict[str, List[DistributionFit]]:
    """
    Detect best-fitting distributions for multiple columns in a DataFrame.

    Args:
        df: Polars DataFrame
        columns: List of column names to analyze (None = all numeric columns)
        top_n: Number of top distributions to return per column

    Returns:
        Dictionary mapping column names to lists of DistributionFit objects
    """
    import polars as pl
    from concurrent.futures import ThreadPoolExecutor, as_completed
    import numpy as np

    if columns is None:
        # Auto-detect numeric columns
        columns = [col for col in df.columns
                   if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
                                        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
                                        pl.Float32, pl.Float64]]

    results = {}

    def process_column(col_name):
        """Process a single column for distribution detection"""
        try:
            # Extract column data as numpy array
            col_data = df[col_name].to_numpy()

            # Remove null values
            col_data = col_data[~np.isnan(col_data)]

            if len(col_data) < 3:
                return col_name, []

            # Detect distributions for this column
            fits = detect_distributions_array(col_data, top_n)
            return col_name, fits

        except Exception as e:
            # Log error but continue with other columns
            return col_name, []

    # Use ThreadPoolExecutor for parallel processing
    with ThreadPoolExecutor(max_workers=min(4, len(columns))) as executor:
        # Submit all column processing tasks
        future_to_column = {
            executor.submit(process_column, col): col
            for col in columns
        }

        # Collect results as they complete
        for future in as_completed(future_to_column):
            col_name, fits = future.result()
            results[col_name] = fits

    return results


def detect_distributions_array(
    data: np.ndarray,
    top_n: int = 3
) -> List[DistributionFit]:
    """
    Detect best-fitting distributions for data array.

    This is the original function renamed to avoid conflicts.

    Args:
        data: Numeric data array
        top_n: Number of top distributions to return

    Returns:
        List of DistributionFit objects, sorted by goodness of fit
    """
    if len(data) < 3:
        return []

    # Remove NaN values
    data = data[~np.isnan(data)]

    if len(data) < 3:
        return []

    # Try all distributions
    distributions = [
        'normal',
        'uniform',
        'exponential',
        'lognormal',
        'gamma',
        'beta',
        'poisson',
        'chisquare'
    ]

    fits = []
    for dist_name in distributions:
        fit = fit_distribution(data, dist_name)
        if fit is not None:
            fits.append(fit)

    # Sort by goodness of fit (lower is better)
    fits.sort(key=lambda x: x.goodness_of_fit)

    return fits[:top_n]
additory/analysis/quality.py
DELETED
@@ -1,158 +0,0 @@

"""
Data Quality Metrics

Analyzes data quality including missing values, types, and statistics.
"""

from dataclasses import dataclass
from typing import Optional, Any, Dict
import polars as pl
import numpy as np


@dataclass
class QualityMetrics:
    """Data quality metrics for a column."""
    column: str
    dtype: str
    missing_count: int
    missing_ratio: float
    total_count: int

    # Numeric statistics
    min_value: Optional[float] = None
    max_value: Optional[float] = None
    mean: Optional[float] = None
    median: Optional[float] = None
    std: Optional[float] = None
    q25: Optional[float] = None
    q75: Optional[float] = None

    # Categorical statistics
    mode: Optional[Any] = None
    mode_count: Optional[int] = None
    mode_ratio: Optional[float] = None

    def __repr__(self) -> str:
        return (
            f"QualityMetrics(column='{self.column}', "
            f"dtype='{self.dtype}', missing={self.missing_ratio:.1%})"
        )

    def to_dict(self) -> Dict[str, Any]:
        """Convert to dictionary."""
        return {
            'column': self.column,
            'dtype': self.dtype,
            'missing_count': self.missing_count,
            'missing_ratio': self.missing_ratio,
            'total_count': self.total_count,
            'min': self.min_value,
            'max': self.max_value,
            'mean': self.mean,
            'median': self.median,
            'std': self.std,
            'q25': self.q25,
            'q75': self.q75,
            'mode': self.mode,
            'mode_count': self.mode_count,
            'mode_ratio': self.mode_ratio
        }


def is_numeric_dtype(dtype: pl.DataType) -> bool:
    """Check if dtype is numeric."""
    return dtype in [
        pl.Int8, pl.Int16, pl.Int32, pl.Int64,
        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
        pl.Float32, pl.Float64
    ]


def analyze_quality(
    df: pl.DataFrame,
    column: str
) -> QualityMetrics:
    """
    Analyze data quality for a column.

    Args:
        df: Polars DataFrame
        column: Column name

    Returns:
        QualityMetrics object
    """
    col_series = df[column]
    dtype = col_series.dtype

    # Basic counts
    total_count = len(df)
    missing_count = col_series.null_count()
    missing_ratio = missing_count / total_count if total_count > 0 else 0.0

    # Initialize metrics
    metrics = QualityMetrics(
        column=column,
        dtype=str(dtype),
        missing_count=missing_count,
        missing_ratio=missing_ratio,
        total_count=total_count
    )

    # Numeric statistics
    if is_numeric_dtype(dtype):
        try:
            metrics.min_value = float(col_series.min())
            metrics.max_value = float(col_series.max())
            metrics.mean = float(col_series.mean())
            metrics.median = float(col_series.median())
            metrics.std = float(col_series.std())

            # Quantiles
            q25 = col_series.quantile(0.25, interpolation='linear')
            q75 = col_series.quantile(0.75, interpolation='linear')
            if q25 is not None:
                metrics.q25 = float(q25)
            if q75 is not None:
                metrics.q75 = float(q75)
        except Exception:
            pass

    # Mode (for all types)
    try:
        mode_result = (
            df
            .group_by(column)
            .agg(pl.len().alias('count'))
            .sort('count', descending=True)
            .head(1)
        )

        if len(mode_result) > 0:
            row = mode_result.row(0, named=True)
            metrics.mode = row[column]
            metrics.mode_count = row['count']
            metrics.mode_ratio = metrics.mode_count / total_count if total_count > 0 else 0.0
    except Exception:
        pass

    return metrics


def analyze_all_quality(
    df: pl.DataFrame
) -> Dict[str, QualityMetrics]:
    """
    Analyze data quality for all columns.

    Args:
        df: Polars DataFrame

    Returns:
        Dictionary mapping column names to QualityMetrics
    """
    return {
        col: analyze_quality(df, col)
        for col in df.columns
    }