additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/analysis/scan.py
DELETED
|
@@ -1,400 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Data Profiling and Analysis
|
|
3
|
-
|
|
4
|
-
Provides comprehensive data profiling through the scan() function.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from dataclasses import dataclass, field
|
|
8
|
-
from typing import Dict, List, Optional, Union, Any
|
|
9
|
-
import json
|
|
10
|
-
import polars as pl
|
|
11
|
-
import pandas as pd
|
|
12
|
-
import numpy as np
|
|
13
|
-
|
|
14
|
-
from additory.common.backend import detect_backend, to_polars
|
|
15
|
-
from additory.analysis.distributions import (
|
|
16
|
-
detect_distributions,
|
|
17
|
-
DistributionFit
|
|
18
|
-
)
|
|
19
|
-
from additory.analysis.correlations import (
|
|
20
|
-
calculate_correlations,
|
|
21
|
-
CorrelationResult
|
|
22
|
-
)
|
|
23
|
-
from additory.analysis.cardinality import (
|
|
24
|
-
analyze_all_cardinality,
|
|
25
|
-
CardinalityInfo
|
|
26
|
-
)
|
|
27
|
-
from additory.analysis.quality import (
|
|
28
|
-
analyze_all_quality,
|
|
29
|
-
QualityMetrics,
|
|
30
|
-
is_numeric_dtype
|
|
31
|
-
)
|
|
32
|
-
|
|
33
|
-
# Scan presets for common use cases.
# Each preset bundles the three analysis toggles accepted by scan();
# 'description' is surfaced in ScanResult.summary() when a preset is used.
SCAN_PRESETS = {
    # Fast pass: skip the expensive distribution/correlation fitting.
    'quick': {
        'detect_distributions_flag': False,
        'detect_correlations_flag': False,
        'detect_cardinality_flag': True,
        'description': 'Quick scan: quality + cardinality only'
    },
    # Distribution fitting only.
    'distributions': {
        'detect_distributions_flag': True,
        'detect_correlations_flag': False,
        'detect_cardinality_flag': False,
        'description': 'Distribution-focused: detect distributions only'
    },
    # Correlation analysis only.
    'correlations': {
        'detect_distributions_flag': False,
        'detect_correlations_flag': True,
        'detect_cardinality_flag': False,
        'description': 'Correlation-focused: correlation analysis only'
    },
    # Everything on.
    'full': {
        'detect_distributions_flag': True,
        'detect_correlations_flag': True,
        'detect_cardinality_flag': True,
        'description': 'Full analysis: all features enabled'
    },
    # Everything off; scan() still always computes quality metrics.
    'minimal': {
        'detect_distributions_flag': False,
        'detect_correlations_flag': False,
        'detect_cardinality_flag': False,
        'description': 'Minimal scan: quality metrics only'
    }
}
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
@dataclass
class ColumnInfo:
    """Per-column metadata collected by scan().

    Holds the column name, its dtype rendered as a string, null-value
    statistics, and the number of distinct values.
    """
    name: str
    dtype: str
    null_count: int
    null_percentage: float
    unique_count: int

    def __repr__(self):
        # Compact single-line form; null percentage shown with one decimal.
        template = "ColumnInfo(name='{}', dtype='{}', nulls={:.1f}%)"
        return template.format(self.name, self.dtype, self.null_percentage)
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
@dataclass
class ScanResult:
    """
    Comprehensive scan results for a DataFrame.

    Contains all analysis results including distributions, correlations,
    cardinality, and quality metrics. Sections that were disabled for the
    scan are left at their empty defaults.
    """
    # Basic info
    shape: tuple  # (n_rows, n_cols) of the scanned frame
    columns: List[ColumnInfo]

    # Analysis results (empty when the corresponding analysis was disabled)
    distributions: Dict[str, List[DistributionFit]] = field(default_factory=dict)
    correlations: List[CorrelationResult] = field(default_factory=list)
    cardinality: Dict[str, CardinalityInfo] = field(default_factory=dict)
    quality: Dict[str, QualityMetrics] = field(default_factory=dict)

    # Metadata
    preset_used: Optional[str] = None
    analysis_enabled: Dict[str, bool] = field(default_factory=dict)

    def summary(self) -> str:
        """Generate a human-readable, multi-line summary of the scan results."""
        lines = []
        # Fixed: this was an f-string with no placeholders (ruff F541).
        lines.append("DataFrame Scan Results")
        lines.append(f"Shape: {self.shape[0]:,} rows × {self.shape[1]} columns")
        lines.append("")

        if self.preset_used:
            preset_desc = SCAN_PRESETS.get(self.preset_used, {}).get('description', 'Custom preset')
            lines.append(f"Preset: {self.preset_used} ({preset_desc})")
            lines.append("")

        # Column overview
        lines.append("Columns:")
        for col in self.columns:
            lines.append(f"  {col.name}: {col.dtype} ({col.null_percentage:.1f}% null, {col.unique_count:,} unique)")
        lines.append("")

        # Distributions: report only the best fit (fits are pre-sorted by score)
        if self.distributions:
            lines.append("Top Distributions:")
            for col_name, fits in self.distributions.items():
                if fits:
                    best_fit = fits[0]
                    lines.append(f"  {col_name}: {best_fit.distribution} (score: {best_fit.score:.3f})")
            lines.append("")

        # Correlations: only |r| >= 0.7 counts as "strong"; show at most 5
        if self.correlations:
            lines.append("Strong Correlations:")
            strong_corrs = [c for c in self.correlations if abs(c.correlation) >= 0.7]
            for corr in strong_corrs[:5]:  # Top 5
                lines.append(f"  {corr.column1} ↔ {corr.column2}: {corr.correlation:.3f} ({corr.method})")
            if len(strong_corrs) > 5:
                lines.append(f"  ... and {len(strong_corrs) - 5} more")
            lines.append("")

        # Cardinality insights: call out high-cardinality columns only
        if self.cardinality:
            high_card = [name for name, info in self.cardinality.items()
                         if info.classification == 'high']
            if high_card:
                lines.append(f"High Cardinality Columns: {', '.join(high_card)}")
                lines.append("")

        return "\n".join(lines)

    def to_dict(self) -> Dict[str, Any]:
        """Convert scan results to a plain-dict format suitable for serialization."""
        return {
            'shape': self.shape,
            'columns': [
                {
                    'name': col.name,
                    'dtype': col.dtype,
                    'null_count': col.null_count,
                    'null_percentage': col.null_percentage,
                    'unique_count': col.unique_count
                }
                for col in self.columns
            ],
            'distributions': {
                col_name: [
                    {
                        'distribution': fit.distribution,
                        'score': fit.score,
                        'parameters': fit.parameters
                    }
                    for fit in fits
                ]
                for col_name, fits in self.distributions.items()
            },
            'correlations': [
                {
                    'column1': corr.column1,
                    'column2': corr.column2,
                    'correlation': corr.correlation,
                    'method': corr.method,
                    'p_value': corr.p_value
                }
                for corr in self.correlations
            ],
            'cardinality': {
                col_name: {
                    'unique_count': info.unique_count,
                    'total_count': info.total_count,
                    'unique_ratio': info.unique_ratio,
                    'classification': info.classification,
                    'top_values': info.top_values
                }
                for col_name, info in self.cardinality.items()
            },
            'quality': {
                col_name: {
                    'missing_count': metrics.missing_count,
                    'missing_percentage': metrics.missing_percentage,
                    'data_type': metrics.data_type,
                    'summary_stats': metrics.summary_stats
                }
                for col_name, metrics in self.quality.items()
            },
            'metadata': {
                'preset_used': self.preset_used,
                'analysis_enabled': self.analysis_enabled
            }
        }

    def to_json(self, indent: int = 2) -> str:
        """Convert scan results to a JSON string.

        ``default=str`` stringifies any non-JSON-serializable values
        (e.g. numpy scalars in summary stats).
        """
        return json.dumps(self.to_dict(), indent=indent, default=str)
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
def scan(
    df: Union[pl.DataFrame, pd.DataFrame, Any],
    preset: Optional[str] = None,
    detect_distributions_flag: Optional[bool] = None,
    detect_correlations_flag: Optional[bool] = None,
    detect_cardinality_flag: Optional[bool] = None,
    top_n_distributions: int = 3,
    correlation_methods: Optional[List[str]] = None,
    correlation_threshold: float = 0.3,
    cardinality_top_n: int = 10,
    verbose: bool = True
) -> ScanResult:
    """
    Scan a DataFrame to detect distributions, correlations, and cardinality.

    Accepts pandas, polars, or cuDF DataFrames. Automatically converts to Polars
    for processing. Returns ScanResult with analysis results.

    This function provides comprehensive data profiling including:
    - Distribution detection for numeric columns
    - Correlation analysis between columns
    - Cardinality analysis (unique values)
    - Data quality metrics (always computed)

    Args:
        df: DataFrame to analyze (pandas, polars, or cuDF)
        preset: Optional preset ('quick', 'distributions', 'correlations', 'full', 'minimal')
        detect_distributions_flag: Whether to detect distributions. None (the
            default) means "auto": taken from the preset if one is given,
            otherwise True. An explicit True/False always wins over the preset.
        detect_correlations_flag: Whether to calculate correlations (same
            auto/override semantics as above)
        detect_cardinality_flag: Whether to analyze cardinality (same
            auto/override semantics as above)
        top_n_distributions: Number of top distributions to return per column (default: 3)
        correlation_methods: Correlation methods to use (default: ['pearson', 'spearman'])
        correlation_threshold: Minimum correlation to report (default: 0.3)
        cardinality_top_n: Number of top values to return per column (default: 10)
        verbose: Whether to print progress messages (default: True)

    Returns:
        ScanResult object containing all analysis results

    Raises:
        ValueError: If ``preset`` is not one of the known preset names.

    Presets:
        - 'quick': Quality + cardinality only (fast)
        - 'distributions': Distribution detection only
        - 'correlations': Correlation analysis only
        - 'full': All analyses enabled
        - 'minimal': Quality metrics only (fastest)

    Example:
        >>> import pandas as pd
        >>> from additory.analysis.scan import scan
        >>>
        >>> # Works with pandas
        >>> df = pd.DataFrame({
        ...     'age': [25, 30, 35, 40, 45],
        ...     'income': [50000, 60000, 70000, 80000, 90000],
        ...     'category': ['A', 'B', 'A', 'B', 'A']
        ... })
        >>>
        >>> result = scan(df)
        >>> print(result.summary())
        >>>
        >>> # Use presets
        >>> result = scan(df, preset='quick')
        >>> result = scan(df, preset='distributions', top_n_distributions=5)
    """
    # Handle preset configuration.
    if preset:
        if preset not in SCAN_PRESETS:
            available = ', '.join(SCAN_PRESETS.keys())
            raise ValueError(f"Unknown preset '{preset}'. Available presets: {available}")

        preset_config = SCAN_PRESETS[preset]
        # Bug fix: the old code tested "'<flag>' not in locals()", which is
        # always False for a parameter, so an explicit True from the caller was
        # silently overridden by the preset. A None sentinel distinguishes
        # "caller left it unset" from "caller passed an explicit value".
        if detect_distributions_flag is None:
            detect_distributions_flag = preset_config['detect_distributions_flag']
        if detect_correlations_flag is None:
            detect_correlations_flag = preset_config['detect_correlations_flag']
        if detect_cardinality_flag is None:
            detect_cardinality_flag = preset_config['detect_cardinality_flag']

    # Any flag still unset (no preset given) defaults to True, matching the
    # previous public defaults.
    if detect_distributions_flag is None:
        detect_distributions_flag = True
    if detect_correlations_flag is None:
        detect_correlations_flag = True
    if detect_cardinality_flag is None:
        detect_cardinality_flag = True

    # Set default correlation methods (avoids a mutable default argument).
    if correlation_methods is None:
        correlation_methods = ['pearson', 'spearman']

    # Convert to Polars for processing.
    original_backend = detect_backend(df)
    if verbose:
        print(f"Scanning {original_backend} DataFrame with shape {df.shape}")

    df_polars = to_polars(df)

    # Memory cleanup: drop our reference to the original if we converted.
    # NOTE: this only frees memory once the caller's own reference is gone.
    if original_backend != 'polars':
        del df
        import gc
        gc.collect()

    # Get basic info.
    shape = df_polars.shape
    column_names = df_polars.columns
    total_rows = shape[0]

    # Analyze per-column info.
    columns = []
    for col_name in column_names:
        col_series = df_polars[col_name]
        null_count = col_series.null_count()
        # Guard against ZeroDivisionError on an empty (0-row) frame.
        null_percentage = (null_count / total_rows) * 100 if total_rows else 0.0
        columns.append(ColumnInfo(
            name=col_name,
            dtype=str(col_series.dtype),
            null_count=null_count,
            null_percentage=null_percentage,
            unique_count=col_series.n_unique()
        ))

    # Initialize result container.
    result = ScanResult(
        shape=shape,
        columns=columns,
        preset_used=preset,
        analysis_enabled={
            'distributions': detect_distributions_flag,
            'correlations': detect_correlations_flag,
            'cardinality': detect_cardinality_flag
        }
    )

    # Quality analysis (always performed).
    if verbose:
        print("Analyzing data quality...")
    result.quality = analyze_all_quality(df_polars)

    # Numeric columns are needed by both distribution and correlation analysis.
    numeric_columns = [col.name for col in columns
                       if is_numeric_dtype(df_polars[col.name].dtype)]

    # Distribution analysis.
    if detect_distributions_flag:
        if verbose:
            print("Detecting distributions...")

        if numeric_columns:
            result.distributions = detect_distributions(
                df_polars,
                columns=numeric_columns,
                top_n=top_n_distributions
            )
        elif verbose:
            print("No numeric columns found for distribution analysis")

    # Correlation analysis.
    if detect_correlations_flag:
        if verbose:
            print("Calculating correlations...")

        if len(numeric_columns) >= 2:
            result.correlations = calculate_correlations(
                df_polars,
                columns=numeric_columns,
                methods=correlation_methods,
                threshold=correlation_threshold
            )
        elif verbose:
            print(f"Need at least 2 numeric columns for correlation analysis (found {len(numeric_columns)})")

    # Cardinality analysis.
    if detect_cardinality_flag:
        if verbose:
            print("Analyzing cardinality...")
        result.cardinality = analyze_all_cardinality(df_polars, top_n=cardinality_top_n)

    if verbose:
        print("Scan complete!")

    # Final memory cleanup of the working copy.
    del df_polars
    import gc
    gc.collect()

    return result
|