additory 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. additory/__init__.py +15 -0
  2. additory/analysis/__init__.py +48 -0
  3. additory/analysis/cardinality.py +126 -0
  4. additory/analysis/correlations.py +124 -0
  5. additory/analysis/distributions.py +376 -0
  6. additory/analysis/quality.py +158 -0
  7. additory/analysis/scan.py +400 -0
  8. additory/augment/__init__.py +24 -0
  9. additory/augment/augmentor.py +653 -0
  10. additory/augment/builtin_lists.py +430 -0
  11. additory/augment/distributions.py +22 -0
  12. additory/augment/forecast.py +1132 -0
  13. additory/augment/list_registry.py +177 -0
  14. additory/augment/smote.py +320 -0
  15. additory/augment/strategies.py +883 -0
  16. additory/common/__init__.py +157 -0
  17. additory/common/backend.py +355 -0
  18. additory/common/column_utils.py +191 -0
  19. additory/common/distributions.py +737 -0
  20. additory/common/exceptions.py +62 -0
  21. additory/common/lists.py +229 -0
  22. additory/common/patterns.py +240 -0
  23. additory/common/resolver.py +567 -0
  24. additory/common/sample_data.py +182 -0
  25. additory/common/validation.py +197 -0
  26. additory/core/__init__.py +27 -0
  27. additory/core/ast_builder.py +165 -0
  28. additory/core/backends/__init__.py +23 -0
  29. additory/core/backends/arrow_bridge.py +476 -0
  30. additory/core/backends/cudf_bridge.py +355 -0
  31. additory/core/column_positioning.py +358 -0
  32. additory/core/compiler_polars.py +166 -0
  33. additory/core/config.py +342 -0
  34. additory/core/enhanced_cache_manager.py +1119 -0
  35. additory/core/enhanced_matchers.py +473 -0
  36. additory/core/enhanced_version_manager.py +325 -0
  37. additory/core/executor.py +59 -0
  38. additory/core/integrity_manager.py +477 -0
  39. additory/core/loader.py +190 -0
  40. additory/core/logging.py +24 -0
  41. additory/core/memory_manager.py +547 -0
  42. additory/core/namespace_manager.py +657 -0
  43. additory/core/parser.py +176 -0
  44. additory/core/polars_expression_engine.py +551 -0
  45. additory/core/registry.py +176 -0
  46. additory/core/sample_data_manager.py +492 -0
  47. additory/core/user_namespace.py +751 -0
  48. additory/core/validator.py +27 -0
  49. additory/dynamic_api.py +308 -0
  50. additory/expressions/__init__.py +26 -0
  51. additory/expressions/engine.py +551 -0
  52. additory/expressions/parser.py +176 -0
  53. additory/expressions/proxy.py +546 -0
  54. additory/expressions/registry.py +313 -0
  55. additory/expressions/samples.py +492 -0
  56. additory/synthetic/__init__.py +101 -0
  57. additory/synthetic/api.py +220 -0
  58. additory/synthetic/common_integration.py +314 -0
  59. additory/synthetic/config.py +262 -0
  60. additory/synthetic/engines.py +529 -0
  61. additory/synthetic/exceptions.py +180 -0
  62. additory/synthetic/file_managers.py +518 -0
  63. additory/synthetic/generator.py +702 -0
  64. additory/synthetic/generator_parser.py +68 -0
  65. additory/synthetic/integration.py +319 -0
  66. additory/synthetic/models.py +241 -0
  67. additory/synthetic/pattern_resolver.py +573 -0
  68. additory/synthetic/performance.py +469 -0
  69. additory/synthetic/polars_integration.py +464 -0
  70. additory/synthetic/proxy.py +60 -0
  71. additory/synthetic/schema_parser.py +685 -0
  72. additory/synthetic/validator.py +553 -0
  73. additory/utilities/__init__.py +53 -0
  74. additory/utilities/encoding.py +600 -0
  75. additory/utilities/games.py +300 -0
  76. additory/utilities/keys.py +8 -0
  77. additory/utilities/lookup.py +103 -0
  78. additory/utilities/matchers.py +216 -0
  79. additory/utilities/resolvers.py +286 -0
  80. additory/utilities/settings.py +167 -0
  81. additory/utilities/units.py +746 -0
  82. additory/utilities/validators.py +153 -0
  83. additory-0.1.0a1.dist-info/METADATA +293 -0
  84. additory-0.1.0a1.dist-info/RECORD +87 -0
  85. additory-0.1.0a1.dist-info/WHEEL +5 -0
  86. additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
  87. additory-0.1.0a1.dist-info/top_level.txt +1 -0
additory/__init__.py ADDED
@@ -0,0 +1,15 @@
+ # additory/__init__.py
+
+ from .dynamic_api import add as _api_instance
+
+ # Expose the API instance normally
+ add = _api_instance
+
+ # Module-level __getattr__ to forward dynamic attributes
+ def __getattr__(name):
+     # Delegate all unknown attributes to the API instance
+     return getattr(_api_instance, name)
+
+ __all__ = [
+     "add",
+ ]
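Note: the module-level __getattr__ above (PEP 562) forwards any attribute that additory/__init__.py does not define to the dynamic API instance. A minimal sketch of what that means for callers; the attribute name used here is purely hypothetical and not taken from this diff:

import additory

api = additory.add  # the re-exported dynamic API instance
# Any other attribute lookup is delegated to that same instance, so
# additory.some_feature (hypothetical name) resolves via
# getattr(additory.add, "some_feature") and raises AttributeError only
# if the API instance itself lacks it.
value = getattr(additory, "some_feature", None)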
additory/analysis/__init__.py ADDED
@@ -0,0 +1,48 @@
+ """
+ Analysis Module for Data Profiling
+
+ Provides comprehensive data analysis capabilities:
+ - Distribution detection and fitting
+ - Correlation analysis
+ - Cardinality analysis
+ - Data quality metrics
+ - Data profiling and scanning
+ """
+
+ from additory.analysis.distributions import (
+     detect_distributions,
+     fit_distribution,
+     DistributionFit
+ )
+ from additory.analysis.correlations import (
+     calculate_correlations,
+     CorrelationResult
+ )
+ from additory.analysis.cardinality import (
+     analyze_cardinality,
+     CardinalityInfo
+ )
+ from additory.analysis.quality import (
+     analyze_quality,
+     QualityMetrics
+ )
+ from additory.analysis.scan import (
+     scan,
+     ScanResult,
+     ColumnInfo
+ )
+
+ __all__ = [
+     'detect_distributions',
+     'fit_distribution',
+     'DistributionFit',
+     'calculate_correlations',
+     'CorrelationResult',
+     'analyze_cardinality',
+     'CardinalityInfo',
+     'analyze_quality',
+     'QualityMetrics',
+     'scan',
+     'ScanResult',
+     'ColumnInfo',
+ ]
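The __all__ list above is the public surface of additory.analysis. A minimal import sketch, assuming the subpackage is installed exactly as laid out in this diff:

from additory.analysis import (
    analyze_cardinality,     # per-column cardinality analysis
    calculate_correlations,  # pairwise correlations between numeric columns
    detect_distributions,    # distribution detection and fitting
    scan,                    # data profiling and scanning entry point
)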
additory/analysis/cardinality.py ADDED
@@ -0,0 +1,126 @@
+ """
+ Cardinality Analysis
+
+ Analyzes unique values and cardinality of columns.
+ """
+
+ from dataclasses import dataclass
+ from typing import List, Any, Dict
+ import polars as pl
+
+
+ @dataclass
+ class CardinalityInfo:
+     """Cardinality information for a column."""
+     unique_count: int
+     total_count: int
+     ratio: float
+     top_values: List[tuple]  # [(value, count), ...]
+     classification: str  # 'constant', 'low', 'medium', 'high'
+
+     def __repr__(self) -> str:
+         return (
+             f"CardinalityInfo(unique={self.unique_count}, "
+             f"ratio={self.ratio:.2%}, class='{self.classification}')"
+         )
+
+
+ def classify_cardinality(ratio: float, unique_count: int) -> str:
+     """
+     Classify cardinality based on ratio and unique count.
+
+     Args:
+         ratio: Unique count / total count
+         unique_count: Number of unique values
+
+     Returns:
+         Classification: 'constant', 'low', 'medium', 'high'
+     """
+     if unique_count == 1:
+         return 'constant'
+     elif ratio >= 0.5:
+         return 'high'
+     elif ratio >= 0.1:
+         return 'medium'
+     else:
+         return 'low'
+
+
+ def analyze_cardinality(
+     df: pl.DataFrame,
+     column: str,
+     top_n: int = 10
+ ) -> CardinalityInfo:
+     """
+     Analyze cardinality of a column.
+
+     Args:
+         df: Polars DataFrame
+         column: Column name
+         top_n: Number of top values to return
+
+     Returns:
+         CardinalityInfo object
+     """
+     # Get total count (excluding nulls)
+     total_count = df[column].count()
+
+     if total_count == 0:
+         return CardinalityInfo(
+             unique_count=0,
+             total_count=0,
+             ratio=0.0,
+             top_values=[],
+             classification='constant'
+         )
+
+     # Get unique count (excluding nulls)
+     unique_count = df[column].drop_nulls().n_unique()
+
+     # Calculate ratio
+     ratio = unique_count / total_count if total_count > 0 else 0.0
+
+     # Get top values
+     value_counts = (
+         df
+         .group_by(column)
+         .agg(pl.len().alias('count'))
+         .sort('count', descending=True)
+         .head(top_n)
+     )
+
+     top_values = [
+         (row[column], row['count'])
+         for row in value_counts.iter_rows(named=True)
+     ]
+
+     # Classify
+     classification = classify_cardinality(ratio, unique_count)
+
+     return CardinalityInfo(
+         unique_count=unique_count,
+         total_count=total_count,
+         ratio=ratio,
+         top_values=top_values,
+         classification=classification
+     )
+
+
+ def analyze_all_cardinality(
+     df: pl.DataFrame,
+     top_n: int = 10
+ ) -> Dict[str, CardinalityInfo]:
+     """
+     Analyze cardinality for all columns.
+
+     Args:
+         df: Polars DataFrame
+         top_n: Number of top values to return per column
+
+     Returns:
+         Dictionary mapping column names to CardinalityInfo
+     """
+     return {
+         col: analyze_cardinality(df, col, top_n)
+         for col in df.columns
+     }
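A minimal usage sketch for the cardinality helpers above, assuming a recent Polars release; the sample values are made up:

import polars as pl
from additory.analysis.cardinality import analyze_cardinality, analyze_all_cardinality

df = pl.DataFrame({
    "city": ["Berlin", "Berlin", "Paris", "Oslo", "Oslo", "Oslo"],
    "user_id": [1, 2, 3, 4, 5, 6],
})

info = analyze_cardinality(df, "city", top_n=3)
# 3 unique values over 6 non-null rows gives ratio 0.5, which classify_cardinality maps to 'high':
# CardinalityInfo(unique=3, ratio=50.00%, class='high')

per_column = analyze_all_cardinality(df)  # {'city': CardinalityInfo(...), 'user_id': CardinalityInfo(...)}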
additory/analysis/correlations.py ADDED
@@ -0,0 +1,124 @@
+ """
+ Correlation Analysis
+
+ Calculates correlations between numeric columns.
+ """
+
+ from dataclasses import dataclass
+ from typing import Dict, List, Tuple
+ import numpy as np
+ import polars as pl
+ from scipy import stats
+
+
+ @dataclass
+ class CorrelationResult:
+     """Result of correlation analysis between two columns."""
+     column1: str
+     column2: str
+     correlation: float
+     method: str
+     p_value: float = 0.0
+
+
+ def calculate_correlations(
+     df: pl.DataFrame,
+     columns: List[str],
+     methods: List[str] = ['pearson', 'spearman'],
+     threshold: float = 0.0
+ ) -> List[CorrelationResult]:
+     """
+     Calculate correlations between numeric columns with optimized batch processing.
+
+     Args:
+         df: Polars DataFrame
+         columns: List of numeric column names
+         methods: Correlation methods to calculate
+         threshold: Minimum correlation threshold to report
+
+     Returns:
+         List of CorrelationResult objects (changed from single object for scan.py compatibility)
+     """
+     from concurrent.futures import ThreadPoolExecutor, as_completed
+     import itertools
+
+     if len(columns) < 2:
+         return []
+
+     # Pre-extract all data as numpy arrays for efficiency
+     data_arrays = {}
+     for col in columns:
+         arr = df[col].to_numpy()
+         data_arrays[col] = arr
+
+     # Generate all column pairs
+     column_pairs = list(itertools.combinations(columns, 2))
+
+     results = []
+
+     def calculate_pair_correlations(pair):
+         """Calculate correlations for a single pair of columns."""
+         col1, col2 = pair
+         arr1 = data_arrays[col1]
+         arr2 = data_arrays[col2]
+
+         # Get common non-NaN indices
+         mask = ~(np.isnan(arr1) | np.isnan(arr2))
+         arr1_clean = arr1[mask]
+         arr2_clean = arr2[mask]
+
+         if len(arr1_clean) < 3:
+             return None
+
+         pair_results = {}
+
+         # Calculate all requested methods for this pair
+         for method in methods:
+             try:
+                 if method == 'pearson':
+                     corr, p_value = stats.pearsonr(arr1_clean, arr2_clean)
+                 elif method == 'spearman':
+                     corr, p_value = stats.spearmanr(arr1_clean, arr2_clean)
+                 elif method == 'kendall':
+                     corr, p_value = stats.kendalltau(arr1_clean, arr2_clean)
+                 else:
+                     continue
+
+                 # Only include if above threshold
+                 if abs(corr) >= threshold:
+                     pair_results[method] = {
+                         'correlation': float(corr),
+                         'p_value': float(p_value)
+                     }
+             except Exception:
+                 continue
+
+         if pair_results:
+             return (col1, col2, pair_results)
+         return None
+
+     # Use ThreadPoolExecutor for parallel processing of correlation pairs
+     with ThreadPoolExecutor(max_workers=min(4, len(column_pairs))) as executor:
+         # Submit all pair processing tasks
+         future_to_pair = {
+             executor.submit(calculate_pair_correlations, pair): pair
+             for pair in column_pairs
+         }
+
+         # Collect results as they complete
+         for future in as_completed(future_to_pair):
+             result = future.result()
+             if result is not None:
+                 col1, col2, pair_results = result
+
+                 # Create CorrelationResult objects for each method
+                 for method, corr_data in pair_results.items():
+                     results.append(CorrelationResult(
+                         column1=col1,
+                         column2=col2,
+                         correlation=corr_data['correlation'],
+                         method=method,
+                         p_value=corr_data['p_value']
+                     ))
+
+     return results
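A minimal usage sketch for calculate_correlations, assuming NumPy, SciPy, and Polars are available; the synthetic data is illustrative only:

import numpy as np
import polars as pl
from additory.analysis.correlations import calculate_correlations

rng = np.random.default_rng(0)
x = rng.normal(size=200)
df = pl.DataFrame({
    "x": x,
    "y": 2 * x + rng.normal(scale=0.1, size=200),  # strongly correlated with x
    "z": rng.normal(size=200),                     # roughly independent noise
})

results = calculate_correlations(df, columns=["x", "y", "z"], methods=["pearson"], threshold=0.3)
for r in results:
    print(r.column1, r.column2, r.method, round(r.correlation, 3), r.p_value)
# With threshold=0.3, normally only the (x, y) pair is reported.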
additory/analysis/distributions.py ADDED
@@ -0,0 +1,376 @@
+ """
+ Distribution Detection and Fitting
+
+ Detects and fits statistical distributions to numeric data.
+ """
+
+ from dataclasses import dataclass
+ from typing import List, Dict, Any, Optional
+ import numpy as np
+ from scipy import stats
+
+
+ @dataclass
+ class DistributionFit:
+     """Result of fitting a distribution to data."""
+     name: str
+     params: Dict[str, float]
+     goodness_of_fit: float  # KS test statistic (lower is better)
+     p_value: float  # KS test p-value (higher is better)
+
+     def __repr__(self) -> str:
+         return f"DistributionFit(name='{self.name}', fit={self.goodness_of_fit:.4f}, p={self.p_value:.4f})"
+
+
+ def fit_normal(data: np.ndarray) -> DistributionFit:
+     """Fit normal distribution."""
+     mean, std = stats.norm.fit(data)
+     ks_stat, p_value = stats.kstest(data, 'norm', args=(mean, std))
+
+     return DistributionFit(
+         name='normal',
+         params={'mean': float(mean), 'std': float(std)},
+         goodness_of_fit=float(ks_stat),
+         p_value=float(p_value)
+     )
+
+
+ def fit_uniform(data: np.ndarray) -> DistributionFit:
+     """Fit uniform distribution."""
+     loc, scale = stats.uniform.fit(data)
+     ks_stat, p_value = stats.kstest(data, 'uniform', args=(loc, scale))
+
+     return DistributionFit(
+         name='uniform',
+         params={'min': float(loc), 'max': float(loc + scale)},
+         goodness_of_fit=float(ks_stat),
+         p_value=float(p_value)
+     )
+
+
+ def fit_exponential(data: np.ndarray) -> Optional[DistributionFit]:
+     """Fit exponential distribution (requires positive values)."""
+     if np.any(data <= 0):
+         return None
+
+     loc, scale = stats.expon.fit(data)
+     ks_stat, p_value = stats.kstest(data, 'expon', args=(loc, scale))
+
+     return DistributionFit(
+         name='exponential',
+         params={'loc': float(loc), 'scale': float(scale), 'rate': float(1/scale)},
+         goodness_of_fit=float(ks_stat),
+         p_value=float(p_value)
+     )
+
+
+ def fit_lognormal(data: np.ndarray) -> Optional[DistributionFit]:
+     """Fit log-normal distribution (requires positive values)."""
+     if np.any(data <= 0):
+         return None
+
+     shape, loc, scale = stats.lognorm.fit(data, floc=0)
+     ks_stat, p_value = stats.kstest(data, 'lognorm', args=(shape, loc, scale))
+
+     return DistributionFit(
+         name='lognormal',
+         params={'shape': float(shape), 'loc': float(loc), 'scale': float(scale)},
+         goodness_of_fit=float(ks_stat),
+         p_value=float(p_value)
+     )
+
+
+ def fit_gamma(data: np.ndarray) -> Optional[DistributionFit]:
+     """Fit gamma distribution (requires positive values)."""
+     if np.any(data <= 0):
+         return None
+
+     shape, loc, scale = stats.gamma.fit(data, floc=0)
+     ks_stat, p_value = stats.kstest(data, 'gamma', args=(shape, loc, scale))
+
+     return DistributionFit(
+         name='gamma',
+         params={'shape': float(shape), 'loc': float(loc), 'scale': float(scale)},
+         goodness_of_fit=float(ks_stat),
+         p_value=float(p_value)
+     )
+
+
+ def fit_beta(data: np.ndarray) -> Optional[DistributionFit]:
+     """Fit beta distribution (requires values in [0, 1] or will be normalized)."""
+     # Normalize to [0, 1]
+     data_min, data_max = np.min(data), np.max(data)
+
+     if data_max == data_min:
+         return None
+
+     normalized = (data - data_min) / (data_max - data_min)
+
+     # Avoid exact 0 and 1 for beta fitting
+     normalized = np.clip(normalized, 1e-6, 1 - 1e-6)
+
+     a, b, loc, scale = stats.beta.fit(normalized, floc=0, fscale=1)
+     ks_stat, p_value = stats.kstest(normalized, 'beta', args=(a, b, loc, scale))
+
+     return DistributionFit(
+         name='beta',
+         params={
+             'alpha': float(a),
+             'beta': float(b),
+             'data_min': float(data_min),
+             'data_max': float(data_max)
+         },
+         goodness_of_fit=float(ks_stat),
+         p_value=float(p_value)
+     )
+
+
+ def fit_poisson(data: np.ndarray) -> Optional[DistributionFit]:
+     """Fit Poisson distribution (requires non-negative integers)."""
+     # Check if data looks like integers
+     if not np.allclose(data, np.round(data)):
+         return None
+
+     if np.any(data < 0):
+         return None
+
+     mu = np.mean(data)
+
+     # For Poisson, use chi-square test instead of KS
+     # KS test doesn't work well for discrete distributions
+     # We'll use a simplified goodness-of-fit measure
+     expected_var = mu
+     actual_var = np.var(data)
+
+     # Goodness of fit: how close variance is to mean (Poisson property)
+     if mu > 0:
+         fit_score = abs(actual_var - expected_var) / mu
+     else:
+         fit_score = 1.0
+
+     return DistributionFit(
+         name='poisson',
+         params={'lambda': float(mu)},
+         goodness_of_fit=float(fit_score),
+         p_value=0.0  # Not applicable for this simplified test
+     )
+
+
+ def fit_chisquare(data: np.ndarray) -> Optional[DistributionFit]:
+     """Fit chi-squared distribution (requires positive values)."""
+     if np.any(data <= 0):
+         return None
+
+     df, loc, scale = stats.chi2.fit(data, floc=0)
+     ks_stat, p_value = stats.kstest(data, 'chi2', args=(df, loc, scale))
+
+     return DistributionFit(
+         name='chisquare',
+         params={'df': float(df), 'loc': float(loc), 'scale': float(scale)},
+         goodness_of_fit=float(ks_stat),
+         p_value=float(p_value)
+     )
+
+
+ def fit_distribution(data: np.ndarray, dist_name: str) -> Optional[DistributionFit]:
+     """
+     Fit a specific distribution to data.
+
+     Args:
+         data: Numeric data array
+         dist_name: Distribution name (normal, uniform, exponential, etc.)
+
+     Returns:
+         DistributionFit object or None if fitting failed
+     """
+     if len(data) < 3:
+         return None
+
+     # Remove NaN values
+     data = data[~np.isnan(data)]
+
+     if len(data) < 3:
+         return None
+
+     try:
+         if dist_name == 'normal':
+             return fit_normal(data)
+         elif dist_name == 'uniform':
+             return fit_uniform(data)
+         elif dist_name == 'exponential':
+             return fit_exponential(data)
+         elif dist_name == 'lognormal':
+             return fit_lognormal(data)
+         elif dist_name == 'gamma':
+             return fit_gamma(data)
+         elif dist_name == 'beta':
+             return fit_beta(data)
+         elif dist_name == 'poisson':
+             return fit_poisson(data)
+         elif dist_name == 'chisquare':
+             return fit_chisquare(data)
+         else:
+             return None
+     except Exception:
+         return None
+
+
+ def detect_distributions(
+     data: np.ndarray,
+     top_n: int = 3
+ ) -> List[DistributionFit]:
+     """
+     Detect best-fitting distributions for data.
+
+     Args:
+         data: Numeric data array
+         top_n: Number of top distributions to return
+
+     Returns:
+         List of DistributionFit objects, sorted by goodness of fit
+     """
+     if len(data) < 3:
+         return []
+
+     # Remove NaN values
+     data = data[~np.isnan(data)]
+
+     if len(data) < 3:
+         return []
+
+     # Try all distributions
+     distributions = [
+         'normal',
+         'uniform',
+         'exponential',
+         'lognormal',
+         'gamma',
+         'beta',
+         'poisson',
+         'chisquare'
+     ]
+
+     fits = []
+     for dist_name in distributions:
+         fit = fit_distribution(data, dist_name)
+         if fit is not None:
+             fits.append(fit)
+
+     # Sort by goodness of fit (lower is better)
+     fits.sort(key=lambda x: x.goodness_of_fit)
+
+     return fits[:top_n]
+
+
+ def detect_distributions(
+     df,
+     columns: List[str] = None,
+     top_n: int = 3
+ ) -> Dict[str, List[DistributionFit]]:
+     """
+     Detect best-fitting distributions for multiple columns in a DataFrame.
+
+     Args:
+         df: Polars DataFrame
+         columns: List of column names to analyze (None = all numeric columns)
+         top_n: Number of top distributions to return per column
+
+     Returns:
+         Dictionary mapping column names to lists of DistributionFit objects
+     """
+     import polars as pl
+     from concurrent.futures import ThreadPoolExecutor, as_completed
+     import numpy as np
+
+     if columns is None:
+         # Auto-detect numeric columns
+         columns = [col for col in df.columns
+                    if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                                         pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
+                                         pl.Float32, pl.Float64]]
+
+     results = {}
+
+     def process_column(col_name):
+         """Process a single column for distribution detection"""
+         try:
+             # Extract column data as numpy array
+             col_data = df[col_name].to_numpy()
+
+             # Remove null values
+             col_data = col_data[~np.isnan(col_data)]
+
+             if len(col_data) < 3:
+                 return col_name, []
+
+             # Detect distributions for this column
+             fits = detect_distributions_array(col_data, top_n)
+             return col_name, fits
+
+         except Exception as e:
+             # Log error but continue with other columns
+             return col_name, []
+
+     # Use ThreadPoolExecutor for parallel processing
+     with ThreadPoolExecutor(max_workers=min(4, len(columns))) as executor:
+         # Submit all column processing tasks
+         future_to_column = {
+             executor.submit(process_column, col): col
+             for col in columns
+         }
+
+         # Collect results as they complete
+         for future in as_completed(future_to_column):
+             col_name, fits = future.result()
+             results[col_name] = fits
+
+     return results
+
+
+ def detect_distributions_array(
+     data: np.ndarray,
+     top_n: int = 3
+ ) -> List[DistributionFit]:
+     """
+     Detect best-fitting distributions for data array.
+
+     This is the original function renamed to avoid conflicts.
+
+     Args:
+         data: Numeric data array
+         top_n: Number of top distributions to return
+
+     Returns:
+         List of DistributionFit objects, sorted by goodness of fit
+     """
+     if len(data) < 3:
+         return []
+
+     # Remove NaN values
+     data = data[~np.isnan(data)]
+
+     if len(data) < 3:
+         return []
+
+     # Try all distributions
+     distributions = [
+         'normal',
+         'uniform',
+         'exponential',
+         'lognormal',
+         'gamma',
+         'beta',
+         'poisson',
+         'chisquare'
+     ]
+
+     fits = []
+     for dist_name in distributions:
+         fit = fit_distribution(data, dist_name)
+         if fit is not None:
+             fits.append(fit)
+
+     # Sort by goodness of fit (lower is better)
+     fits.sort(key=lambda x: x.goodness_of_fit)
+
+     return fits[:top_n]
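Because Python keeps only the last binding of a name, the DataFrame-oriented detect_distributions defined second is the one importers of this module actually get; the earlier array-based definition of the same name is shadowed, and detect_distributions_array preserves that behaviour. A minimal usage sketch, assuming NumPy, SciPy, and Polars are installed; the synthetic columns are illustrative only:

import numpy as np
import polars as pl
from additory.analysis.distributions import detect_distributions, detect_distributions_array

rng = np.random.default_rng(42)
df = pl.DataFrame({
    "gaussian": rng.normal(loc=10.0, scale=2.0, size=500),
    "waiting": rng.exponential(scale=3.0, size=500),
})

# DataFrame-level API: best candidate fits per numeric column
fits_by_column = detect_distributions(df, top_n=2)
for col, fits in fits_by_column.items():
    print(col, [(f.name, round(f.goodness_of_fit, 4)) for f in fits])

# Array-level API for a single series
best = detect_distributions_array(df["gaussian"].to_numpy(), top_n=3)
print(best[0])  # likely the normal fit, though the ranking depends on the sample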