additory 0.1.0a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +15 -0
- additory/analysis/__init__.py +48 -0
- additory/analysis/cardinality.py +126 -0
- additory/analysis/correlations.py +124 -0
- additory/analysis/distributions.py +376 -0
- additory/analysis/quality.py +158 -0
- additory/analysis/scan.py +400 -0
- additory/augment/__init__.py +24 -0
- additory/augment/augmentor.py +653 -0
- additory/augment/builtin_lists.py +430 -0
- additory/augment/distributions.py +22 -0
- additory/augment/forecast.py +1132 -0
- additory/augment/list_registry.py +177 -0
- additory/augment/smote.py +320 -0
- additory/augment/strategies.py +883 -0
- additory/common/__init__.py +157 -0
- additory/common/backend.py +355 -0
- additory/common/column_utils.py +191 -0
- additory/common/distributions.py +737 -0
- additory/common/exceptions.py +62 -0
- additory/common/lists.py +229 -0
- additory/common/patterns.py +240 -0
- additory/common/resolver.py +567 -0
- additory/common/sample_data.py +182 -0
- additory/common/validation.py +197 -0
- additory/core/__init__.py +27 -0
- additory/core/ast_builder.py +165 -0
- additory/core/backends/__init__.py +23 -0
- additory/core/backends/arrow_bridge.py +476 -0
- additory/core/backends/cudf_bridge.py +355 -0
- additory/core/column_positioning.py +358 -0
- additory/core/compiler_polars.py +166 -0
- additory/core/config.py +342 -0
- additory/core/enhanced_cache_manager.py +1119 -0
- additory/core/enhanced_matchers.py +473 -0
- additory/core/enhanced_version_manager.py +325 -0
- additory/core/executor.py +59 -0
- additory/core/integrity_manager.py +477 -0
- additory/core/loader.py +190 -0
- additory/core/logging.py +24 -0
- additory/core/memory_manager.py +547 -0
- additory/core/namespace_manager.py +657 -0
- additory/core/parser.py +176 -0
- additory/core/polars_expression_engine.py +551 -0
- additory/core/registry.py +176 -0
- additory/core/sample_data_manager.py +492 -0
- additory/core/user_namespace.py +751 -0
- additory/core/validator.py +27 -0
- additory/dynamic_api.py +308 -0
- additory/expressions/__init__.py +26 -0
- additory/expressions/engine.py +551 -0
- additory/expressions/parser.py +176 -0
- additory/expressions/proxy.py +546 -0
- additory/expressions/registry.py +313 -0
- additory/expressions/samples.py +492 -0
- additory/synthetic/__init__.py +101 -0
- additory/synthetic/api.py +220 -0
- additory/synthetic/common_integration.py +314 -0
- additory/synthetic/config.py +262 -0
- additory/synthetic/engines.py +529 -0
- additory/synthetic/exceptions.py +180 -0
- additory/synthetic/file_managers.py +518 -0
- additory/synthetic/generator.py +702 -0
- additory/synthetic/generator_parser.py +68 -0
- additory/synthetic/integration.py +319 -0
- additory/synthetic/models.py +241 -0
- additory/synthetic/pattern_resolver.py +573 -0
- additory/synthetic/performance.py +469 -0
- additory/synthetic/polars_integration.py +464 -0
- additory/synthetic/proxy.py +60 -0
- additory/synthetic/schema_parser.py +685 -0
- additory/synthetic/validator.py +553 -0
- additory/utilities/__init__.py +53 -0
- additory/utilities/encoding.py +600 -0
- additory/utilities/games.py +300 -0
- additory/utilities/keys.py +8 -0
- additory/utilities/lookup.py +103 -0
- additory/utilities/matchers.py +216 -0
- additory/utilities/resolvers.py +286 -0
- additory/utilities/settings.py +167 -0
- additory/utilities/units.py +746 -0
- additory/utilities/validators.py +153 -0
- additory-0.1.0a1.dist-info/METADATA +293 -0
- additory-0.1.0a1.dist-info/RECORD +87 -0
- additory-0.1.0a1.dist-info/WHEEL +5 -0
- additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
- additory-0.1.0a1.dist-info/top_level.txt +1 -0
additory/__init__.py
ADDED
@@ -0,0 +1,15 @@
+# additory/__init__.py
+
+from .dynamic_api import add as _api_instance
+
+# Expose the API instance normally
+add = _api_instance
+
+# Module-level __getattr__ to forward dynamic attributes
+def __getattr__(name):
+    # Delegate all unknown attributes to the API instance
+    return getattr(_api_instance, name)
+
+__all__ = [
+    "add",
+]
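The package root relies on PEP 562's module-level __getattr__, so any attribute not defined on the module itself is forwarded to the `add` API instance. A minimal sketch of the effect; the attribute name `forecast` below is hypothetical, chosen only to illustrate the forwarding:

    import additory

    additory.add       # the exposed API instance itself
    additory.forecast  # not defined on the module, so __getattr__ forwards it:
                       # equivalent to getattr(_api_instance, "forecast")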
additory/analysis/__init__.py
ADDED
@@ -0,0 +1,48 @@
+"""
+Analysis Module for Data Profiling
+
+Provides comprehensive data analysis capabilities:
+- Distribution detection and fitting
+- Correlation analysis
+- Cardinality analysis
+- Data quality metrics
+- Data profiling and scanning
+"""
+
+from additory.analysis.distributions import (
+    detect_distributions,
+    fit_distribution,
+    DistributionFit
+)
+from additory.analysis.correlations import (
+    calculate_correlations,
+    CorrelationResult
+)
+from additory.analysis.cardinality import (
+    analyze_cardinality,
+    CardinalityInfo
+)
+from additory.analysis.quality import (
+    analyze_quality,
+    QualityMetrics
+)
+from additory.analysis.scan import (
+    scan,
+    ScanResult,
+    ColumnInfo
+)
+
+__all__ = [
+    'detect_distributions',
+    'fit_distribution',
+    'DistributionFit',
+    'calculate_correlations',
+    'CorrelationResult',
+    'analyze_cardinality',
+    'CardinalityInfo',
+    'analyze_quality',
+    'QualityMetrics',
+    'scan',
+    'ScanResult',
+    'ColumnInfo',
+]
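The subpackage is a thin re-export layer, so the profiling API can be imported in one line. A usage sketch; the call signature of `scan` is an assumption here, since additory/analysis/scan.py is not reproduced in this section:

    import polars as pl
    from additory.analysis import scan, analyze_cardinality

    df = pl.DataFrame({"a": [1, 2, 3], "b": ["x", "x", "y"]})
    result = scan(df)  # assumed to accept a DataFrame and return a ScanResult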
additory/analysis/cardinality.py
ADDED
@@ -0,0 +1,126 @@
+"""
+Cardinality Analysis
+
+Analyzes unique values and cardinality of columns.
+"""
+
+from dataclasses import dataclass
+from typing import List, Any, Dict
+import polars as pl
+
+
+@dataclass
+class CardinalityInfo:
+    """Cardinality information for a column."""
+    unique_count: int
+    total_count: int
+    ratio: float
+    top_values: List[tuple]  # [(value, count), ...]
+    classification: str  # 'constant', 'low', 'medium', 'high'
+
+    def __repr__(self) -> str:
+        return (
+            f"CardinalityInfo(unique={self.unique_count}, "
+            f"ratio={self.ratio:.2%}, class='{self.classification}')"
+        )
+
+
+def classify_cardinality(ratio: float, unique_count: int) -> str:
+    """
+    Classify cardinality based on ratio and unique count.
+
+    Args:
+        ratio: Unique count / total count
+        unique_count: Number of unique values
+
+    Returns:
+        Classification: 'constant', 'low', 'medium', 'high'
+    """
+    if unique_count == 1:
+        return 'constant'
+    elif ratio >= 0.5:
+        return 'high'
+    elif ratio >= 0.1:
+        return 'medium'
+    else:
+        return 'low'
+
+
+def analyze_cardinality(
+    df: pl.DataFrame,
+    column: str,
+    top_n: int = 10
+) -> CardinalityInfo:
+    """
+    Analyze cardinality of a column.
+
+    Args:
+        df: Polars DataFrame
+        column: Column name
+        top_n: Number of top values to return
+
+    Returns:
+        CardinalityInfo object
+    """
+    # Get total count (excluding nulls)
+    total_count = df[column].count()
+
+    if total_count == 0:
+        return CardinalityInfo(
+            unique_count=0,
+            total_count=0,
+            ratio=0.0,
+            top_values=[],
+            classification='constant'
+        )
+
+    # Get unique count (excluding nulls)
+    unique_count = df[column].drop_nulls().n_unique()
+
+    # Calculate ratio
+    ratio = unique_count / total_count if total_count > 0 else 0.0
+
+    # Get top values
+    value_counts = (
+        df
+        .group_by(column)
+        .agg(pl.len().alias('count'))
+        .sort('count', descending=True)
+        .head(top_n)
+    )
+
+    top_values = [
+        (row[column], row['count'])
+        for row in value_counts.iter_rows(named=True)
+    ]
+
+    # Classify
+    classification = classify_cardinality(ratio, unique_count)
+
+    return CardinalityInfo(
+        unique_count=unique_count,
+        total_count=total_count,
+        ratio=ratio,
+        top_values=top_values,
+        classification=classification
+    )
+
+
+def analyze_all_cardinality(
+    df: pl.DataFrame,
+    top_n: int = 10
+) -> Dict[str, CardinalityInfo]:
+    """
+    Analyze cardinality for all columns.
+
+    Args:
+        df: Polars DataFrame
+        top_n: Number of top values to return per column
+
+    Returns:
+        Dictionary mapping column names to CardinalityInfo
+    """
+    return {
+        col: analyze_cardinality(df, col, top_n)
+        for col in df.columns
+    }
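A short usage sketch for the module above (data chosen for illustration):

    import polars as pl
    from additory.analysis.cardinality import analyze_cardinality

    df = pl.DataFrame({"color": ["red", "red", "blue", "green", "red"]})
    info = analyze_cardinality(df, "color", top_n=2)
    # 3 unique values over 5 non-null rows -> ratio 0.6 -> classified 'high'
    print(info)  # CardinalityInfo(unique=3, ratio=60.00%, class='high')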
additory/analysis/correlations.py
ADDED
@@ -0,0 +1,124 @@
+"""
+Correlation Analysis
+
+Calculates correlations between numeric columns.
+"""
+
+from dataclasses import dataclass
+from typing import Dict, List, Tuple
+import numpy as np
+import polars as pl
+from scipy import stats
+
+
+@dataclass
+class CorrelationResult:
+    """Result of correlation analysis between two columns."""
+    column1: str
+    column2: str
+    correlation: float
+    method: str
+    p_value: float = 0.0
+
+
+def calculate_correlations(
+    df: pl.DataFrame,
+    columns: List[str],
+    methods: List[str] = ['pearson', 'spearman'],
+    threshold: float = 0.0
+) -> List[CorrelationResult]:
+    """
+    Calculate correlations between numeric columns with optimized batch processing.
+
+    Args:
+        df: Polars DataFrame
+        columns: List of numeric column names
+        methods: Correlation methods to calculate
+        threshold: Minimum correlation threshold to report
+
+    Returns:
+        List of CorrelationResult objects (changed from single object for scan.py compatibility)
+    """
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    import itertools
+
+    if len(columns) < 2:
+        return []
+
+    # Pre-extract all data as numpy arrays for efficiency
+    data_arrays = {}
+    for col in columns:
+        arr = df[col].to_numpy()
+        data_arrays[col] = arr
+
+    # Generate all column pairs
+    column_pairs = list(itertools.combinations(columns, 2))
+
+    results = []
+
+    def calculate_pair_correlations(pair):
+        """Calculate correlations for a single pair of columns."""
+        col1, col2 = pair
+        arr1 = data_arrays[col1]
+        arr2 = data_arrays[col2]
+
+        # Get common non-NaN indices
+        mask = ~(np.isnan(arr1) | np.isnan(arr2))
+        arr1_clean = arr1[mask]
+        arr2_clean = arr2[mask]
+
+        if len(arr1_clean) < 3:
+            return None
+
+        pair_results = {}
+
+        # Calculate all requested methods for this pair
+        for method in methods:
+            try:
+                if method == 'pearson':
+                    corr, p_value = stats.pearsonr(arr1_clean, arr2_clean)
+                elif method == 'spearman':
+                    corr, p_value = stats.spearmanr(arr1_clean, arr2_clean)
+                elif method == 'kendall':
+                    corr, p_value = stats.kendalltau(arr1_clean, arr2_clean)
+                else:
+                    continue
+
+                # Only include if above threshold
+                if abs(corr) >= threshold:
+                    pair_results[method] = {
+                        'correlation': float(corr),
+                        'p_value': float(p_value)
+                    }
+            except Exception:
+                continue
+
+        if pair_results:
+            return (col1, col2, pair_results)
+        return None
+
+    # Use ThreadPoolExecutor for parallel processing of correlation pairs
+    with ThreadPoolExecutor(max_workers=min(4, len(column_pairs))) as executor:
+        # Submit all pair processing tasks
+        future_to_pair = {
+            executor.submit(calculate_pair_correlations, pair): pair
+            for pair in column_pairs
+        }
+
+        # Collect results as they complete
+        for future in as_completed(future_to_pair):
+            result = future.result()
+            if result is not None:
+                col1, col2, pair_results = result
+
+                # Create CorrelationResult objects for each method
+                for method, corr_data in pair_results.items():
+                    results.append(CorrelationResult(
+                        column1=col1,
+                        column2=col2,
+                        correlation=corr_data['correlation'],
+                        method=method,
+                        p_value=corr_data['p_value']
+                    ))
+
+    return results
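A brief usage sketch for calculate_correlations (illustrative data; the function returns one CorrelationResult per column pair and method that clears the threshold, and result order may vary because pairs are processed in a thread pool):

    import polars as pl
    from additory.analysis.correlations import calculate_correlations

    df = pl.DataFrame({
        "x": [1.0, 2.0, 3.0, 4.0, 5.0],
        "y": [2.1, 3.9, 6.2, 8.0, 9.8],   # roughly 2*x
        "z": [5.0, 1.0, 4.0, 2.0, 3.0],   # unrelated
    })
    for r in calculate_correlations(df, ["x", "y", "z"], methods=["pearson"], threshold=0.8):
        print(f"{r.column1}~{r.column2} ({r.method}): {r.correlation:.3f}, p={r.p_value:.3f}")
    # Only the x~y pair should clear the 0.8 threshold here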
additory/analysis/distributions.py
ADDED
@@ -0,0 +1,376 @@
+"""
+Distribution Detection and Fitting
+
+Detects and fits statistical distributions to numeric data.
+"""
+
+from dataclasses import dataclass
+from typing import List, Dict, Any, Optional
+import numpy as np
+from scipy import stats
+
+
+@dataclass
+class DistributionFit:
+    """Result of fitting a distribution to data."""
+    name: str
+    params: Dict[str, float]
+    goodness_of_fit: float  # KS test statistic (lower is better)
+    p_value: float  # KS test p-value (higher is better)
+
+    def __repr__(self) -> str:
+        return f"DistributionFit(name='{self.name}', fit={self.goodness_of_fit:.4f}, p={self.p_value:.4f})"
+
+
+def fit_normal(data: np.ndarray) -> DistributionFit:
+    """Fit normal distribution."""
+    mean, std = stats.norm.fit(data)
+    ks_stat, p_value = stats.kstest(data, 'norm', args=(mean, std))
+
+    return DistributionFit(
+        name='normal',
+        params={'mean': float(mean), 'std': float(std)},
+        goodness_of_fit=float(ks_stat),
+        p_value=float(p_value)
+    )
+
+
+def fit_uniform(data: np.ndarray) -> DistributionFit:
+    """Fit uniform distribution."""
+    loc, scale = stats.uniform.fit(data)
+    ks_stat, p_value = stats.kstest(data, 'uniform', args=(loc, scale))
+
+    return DistributionFit(
+        name='uniform',
+        params={'min': float(loc), 'max': float(loc + scale)},
+        goodness_of_fit=float(ks_stat),
+        p_value=float(p_value)
+    )
+
+
+def fit_exponential(data: np.ndarray) -> Optional[DistributionFit]:
+    """Fit exponential distribution (requires positive values)."""
+    if np.any(data <= 0):
+        return None
+
+    loc, scale = stats.expon.fit(data)
+    ks_stat, p_value = stats.kstest(data, 'expon', args=(loc, scale))
+
+    return DistributionFit(
+        name='exponential',
+        params={'loc': float(loc), 'scale': float(scale), 'rate': float(1/scale)},
+        goodness_of_fit=float(ks_stat),
+        p_value=float(p_value)
+    )
+
+
+def fit_lognormal(data: np.ndarray) -> Optional[DistributionFit]:
+    """Fit log-normal distribution (requires positive values)."""
+    if np.any(data <= 0):
+        return None
+
+    shape, loc, scale = stats.lognorm.fit(data, floc=0)
+    ks_stat, p_value = stats.kstest(data, 'lognorm', args=(shape, loc, scale))
+
+    return DistributionFit(
+        name='lognormal',
+        params={'shape': float(shape), 'loc': float(loc), 'scale': float(scale)},
+        goodness_of_fit=float(ks_stat),
+        p_value=float(p_value)
+    )
+
+
+def fit_gamma(data: np.ndarray) -> Optional[DistributionFit]:
+    """Fit gamma distribution (requires positive values)."""
+    if np.any(data <= 0):
+        return None
+
+    shape, loc, scale = stats.gamma.fit(data, floc=0)
+    ks_stat, p_value = stats.kstest(data, 'gamma', args=(shape, loc, scale))
+
+    return DistributionFit(
+        name='gamma',
+        params={'shape': float(shape), 'loc': float(loc), 'scale': float(scale)},
+        goodness_of_fit=float(ks_stat),
+        p_value=float(p_value)
+    )
+
+
+def fit_beta(data: np.ndarray) -> Optional[DistributionFit]:
+    """Fit beta distribution (requires values in [0, 1] or will be normalized)."""
+    # Normalize to [0, 1]
+    data_min, data_max = np.min(data), np.max(data)
+
+    if data_max == data_min:
+        return None
+
+    normalized = (data - data_min) / (data_max - data_min)
+
+    # Avoid exact 0 and 1 for beta fitting
+    normalized = np.clip(normalized, 1e-6, 1 - 1e-6)
+
+    a, b, loc, scale = stats.beta.fit(normalized, floc=0, fscale=1)
+    ks_stat, p_value = stats.kstest(normalized, 'beta', args=(a, b, loc, scale))
+
+    return DistributionFit(
+        name='beta',
+        params={
+            'alpha': float(a),
+            'beta': float(b),
+            'data_min': float(data_min),
+            'data_max': float(data_max)
+        },
+        goodness_of_fit=float(ks_stat),
+        p_value=float(p_value)
+    )
+
+
+def fit_poisson(data: np.ndarray) -> Optional[DistributionFit]:
+    """Fit Poisson distribution (requires non-negative integers)."""
+    # Check if data looks like integers
+    if not np.allclose(data, np.round(data)):
+        return None
+
+    if np.any(data < 0):
+        return None
+
+    mu = np.mean(data)
+
+    # For Poisson, use chi-square test instead of KS
+    # KS test doesn't work well for discrete distributions
+    # We'll use a simplified goodness-of-fit measure
+    expected_var = mu
+    actual_var = np.var(data)
+
+    # Goodness of fit: how close variance is to mean (Poisson property)
+    if mu > 0:
+        fit_score = abs(actual_var - expected_var) / mu
+    else:
+        fit_score = 1.0
+
+    return DistributionFit(
+        name='poisson',
+        params={'lambda': float(mu)},
+        goodness_of_fit=float(fit_score),
+        p_value=0.0  # Not applicable for this simplified test
+    )
+
+
+def fit_chisquare(data: np.ndarray) -> Optional[DistributionFit]:
+    """Fit chi-squared distribution (requires positive values)."""
+    if np.any(data <= 0):
+        return None
+
+    df, loc, scale = stats.chi2.fit(data, floc=0)
+    ks_stat, p_value = stats.kstest(data, 'chi2', args=(df, loc, scale))
+
+    return DistributionFit(
+        name='chisquare',
+        params={'df': float(df), 'loc': float(loc), 'scale': float(scale)},
+        goodness_of_fit=float(ks_stat),
+        p_value=float(p_value)
+    )
+
+
+def fit_distribution(data: np.ndarray, dist_name: str) -> Optional[DistributionFit]:
+    """
+    Fit a specific distribution to data.
+
+    Args:
+        data: Numeric data array
+        dist_name: Distribution name (normal, uniform, exponential, etc.)
+
+    Returns:
+        DistributionFit object or None if fitting failed
+    """
+    if len(data) < 3:
+        return None
+
+    # Remove NaN values
+    data = data[~np.isnan(data)]
+
+    if len(data) < 3:
+        return None
+
+    try:
+        if dist_name == 'normal':
+            return fit_normal(data)
+        elif dist_name == 'uniform':
+            return fit_uniform(data)
+        elif dist_name == 'exponential':
+            return fit_exponential(data)
+        elif dist_name == 'lognormal':
+            return fit_lognormal(data)
+        elif dist_name == 'gamma':
+            return fit_gamma(data)
+        elif dist_name == 'beta':
+            return fit_beta(data)
+        elif dist_name == 'poisson':
+            return fit_poisson(data)
+        elif dist_name == 'chisquare':
+            return fit_chisquare(data)
+        else:
+            return None
+    except Exception:
+        return None
+
+
+def detect_distributions(
+    data: np.ndarray,
+    top_n: int = 3
+) -> List[DistributionFit]:
+    """
+    Detect best-fitting distributions for data.
+
+    Args:
+        data: Numeric data array
+        top_n: Number of top distributions to return
+
+    Returns:
+        List of DistributionFit objects, sorted by goodness of fit
+    """
+    if len(data) < 3:
+        return []
+
+    # Remove NaN values
+    data = data[~np.isnan(data)]
+
+    if len(data) < 3:
+        return []
+
+    # Try all distributions
+    distributions = [
+        'normal',
+        'uniform',
+        'exponential',
+        'lognormal',
+        'gamma',
+        'beta',
+        'poisson',
+        'chisquare'
+    ]
+
+    fits = []
+    for dist_name in distributions:
+        fit = fit_distribution(data, dist_name)
+        if fit is not None:
+            fits.append(fit)
+
+    # Sort by goodness of fit (lower is better)
+    fits.sort(key=lambda x: x.goodness_of_fit)
+
+    return fits[:top_n]
+
+
+def detect_distributions(
+    df,
+    columns: List[str] = None,
+    top_n: int = 3
+) -> Dict[str, List[DistributionFit]]:
+    """
+    Detect best-fitting distributions for multiple columns in a DataFrame.
+
+    Args:
+        df: Polars DataFrame
+        columns: List of column names to analyze (None = all numeric columns)
+        top_n: Number of top distributions to return per column
+
+    Returns:
+        Dictionary mapping column names to lists of DistributionFit objects
+    """
+    import polars as pl
+    from concurrent.futures import ThreadPoolExecutor, as_completed
+    import numpy as np
+
+    if columns is None:
+        # Auto-detect numeric columns
+        columns = [col for col in df.columns
+                   if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64,
+                                        pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64,
+                                        pl.Float32, pl.Float64]]
+
+    results = {}
+
+    def process_column(col_name):
+        """Process a single column for distribution detection"""
+        try:
+            # Extract column data as numpy array
+            col_data = df[col_name].to_numpy()
+
+            # Remove null values
+            col_data = col_data[~np.isnan(col_data)]
+
+            if len(col_data) < 3:
+                return col_name, []
+
+            # Detect distributions for this column
+            fits = detect_distributions_array(col_data, top_n)
+            return col_name, fits
+
+        except Exception as e:
+            # Log error but continue with other columns
+            return col_name, []
+
+    # Use ThreadPoolExecutor for parallel processing
+    with ThreadPoolExecutor(max_workers=min(4, len(columns))) as executor:
+        # Submit all column processing tasks
+        future_to_column = {
+            executor.submit(process_column, col): col
+            for col in columns
+        }
+
+        # Collect results as they complete
+        for future in as_completed(future_to_column):
+            col_name, fits = future.result()
+            results[col_name] = fits
+
+    return results
+
+
+def detect_distributions_array(
+    data: np.ndarray,
+    top_n: int = 3
+) -> List[DistributionFit]:
+    """
+    Detect best-fitting distributions for data array.
+
+    This is the original function renamed to avoid conflicts.
+
+    Args:
+        data: Numeric data array
+        top_n: Number of top distributions to return
+
+    Returns:
+        List of DistributionFit objects, sorted by goodness of fit
+    """
+    if len(data) < 3:
+        return []
+
+    # Remove NaN values
+    data = data[~np.isnan(data)]
+
+    if len(data) < 3:
+        return []
+
+    # Try all distributions
+    distributions = [
+        'normal',
+        'uniform',
+        'exponential',
+        'lognormal',
+        'gamma',
+        'beta',
+        'poisson',
+        'chisquare'
+    ]
+
+    fits = []
+    for dist_name in distributions:
+        fit = fit_distribution(data, dist_name)
+        if fit is not None:
+            fits.append(fit)
+
+    # Sort by goodness of fit (lower is better)
+    fits.sort(key=lambda x: x.goodness_of_fit)
+
+    return fits[:top_n]
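A usage sketch for the DataFrame-level detect_distributions (the second definition above; since it is defined later in the module it shadows the earlier array-based version at import time, and the array path survives only as detect_distributions_array). Data here is synthetic and purely illustrative:

    import numpy as np
    import polars as pl
    from additory.analysis.distributions import detect_distributions

    rng = np.random.default_rng(42)
    df = pl.DataFrame({
        "height_cm": rng.normal(170.0, 10.0, 1000),  # 'normal' should rank highly
        "wait_min": rng.exponential(2.0, 1000),      # 'exponential' should rank highly
    })
    for col, fits in detect_distributions(df, top_n=2).items():
        print(col, fits)  # each fit reprs as DistributionFit(name=..., fit=KS stat, p=...)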