additory-0.1.0a3-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/expressions/resolver.py
@@ -0,0 +1,274 @@
+"""
+Expression dependency resolver for Additory.
+
+Resolves column references and dependencies in expressions.
+"""
+
+import re
+from typing import List, Dict, Set
+
+
+# Known function names
+FUNCTION_NAMES = {
+    # Math functions
+    'sqrt', 'abs', 'log', 'log10', 'exp', 'pow', 'round', 'floor', 'ceil',
+    # String functions
+    'lower', 'upper', 'trim', 'length', 'substring', 'replace', 'contains', 'matches',
+    # DateTime functions
+    'year', 'month', 'day', 'hour', 'minute', 'second', 'day_of_week', 'time_of_day',
+    # Aggregation functions
+    'sum', 'mean', 'median', 'min', 'max', 'count', 'std',
+    # Conditional functions
+    'if_else', 'coalesce', 'is_null', 'is_not_null'
+}
+
+
+def resolve_dependencies(expression: str, available_columns: List[str]) -> List[str]:
+    """
+    Find all column dependencies in an expression.
+
+    Args:
+        expression: Expression string
+        available_columns: List of available column names in DataFrame
+
+    Returns:
+        List of column names used in expression
+
+    Example:
+        deps = resolve_dependencies('weight / (height ** 2)', ['weight', 'height', 'age'])
+        # Returns: ['weight', 'height']
+    """
+    # Extract all identifiers
+    identifiers = extract_identifiers(expression)
+
+    # Filter to only columns that exist in DataFrame
+    dependencies = [id for id in identifiers if id in available_columns]
+
+    # Return unique list
+    return list(set(dependencies))
+
+
+def validate_dependencies(expression: str, available_columns: List[str]) -> bool:
+    """
+    Validate that all column dependencies exist.
+
+    Args:
+        expression: Expression string
+        available_columns: List of available column names
+
+    Returns:
+        True if all dependencies exist
+
+    Raises:
+        ValueError: If any dependencies are missing
+
+    Example:
+        validate_dependencies('weight / height', ['weight', 'height'])  # Returns True
+        validate_dependencies('weight / height', ['age'])  # Raises ValueError
+    """
+    # Extract all identifiers
+    identifiers = extract_identifiers(expression)
+
+    # Find missing columns
+    missing = [id for id in identifiers if id not in available_columns]
+
+    if missing:
+        raise ValueError(
+            f"Expression requires columns {missing} but DataFrame only has {available_columns}"
+        )
+
+    return True
+
+
+def extract_identifiers(expression: str) -> List[str]:
+    """
+    Extract all identifiers from expression string.
+
+    Args:
+        expression: Expression string
+
+    Returns:
+        List of identifier names (column names, not function names)
+
+    Example:
+        identifiers = extract_identifiers('sqrt((height * weight) / 3600)')
+        # Returns: ['height', 'weight']
+    """
+    # Remove string literals first (both single and double quotes)
+    # This prevents extracting identifiers from within strings
+    expr_without_strings = re.sub(r'"[^"]*"', '', expression)
+    expr_without_strings = re.sub(r"'[^']*'", '', expr_without_strings)
+
+    # Pattern to match identifiers (letters, numbers, underscores)
+    pattern = r'\b[a-zA-Z_][a-zA-Z0-9_]*\b'
+
+    # Find all matches
+    matches = re.findall(pattern, expr_without_strings)
+
+    # Filter out keywords and function names
+    keywords = {'AND', 'OR', 'NOT'}
+    identifiers = [
+        match for match in matches
+        if match not in keywords and not is_function_name(match)
+    ]
+
+    # Return unique list
+    return list(set(identifiers))
+
+
+def is_function_name(identifier: str) -> bool:
+    """
+    Check if identifier is a function name.
+
+    Args:
+        identifier: Identifier string
+
+    Returns:
+        True if function name, False if column name
+
+    Example:
+        is_function_name('sqrt')  # Returns True
+        is_function_name('weight')  # Returns False
+    """
+    return identifier.lower() in FUNCTION_NAMES
+
+
+def check_circular_dependencies(expressions: Dict[str, str]) -> bool:
+    """
+    Check for circular dependencies between expressions.
+
+    Args:
+        expressions: Dictionary mapping expression name to expression string
+
+    Returns:
+        True if no circular dependencies
+
+    Raises:
+        ValueError: If circular dependency detected
+
+    Example:
+        expressions = {
+            'a': 'b + 1',
+            'b': 'c + 1',
+            'c': 'a + 1'  # Circular!
+        }
+        check_circular_dependencies(expressions)
+        # Raises: "Circular dependency detected: a -> b -> c -> a"
+    """
+    # Build dependency graph
+    graph = build_dependency_graph(expressions)
+
+    # Track visited nodes and recursion stack
+    visited: Set[str] = set()
+    rec_stack: Set[str] = set()
+    path: List[str] = []
+
+    def has_cycle(node: str) -> bool:
+        """DFS to detect cycles."""
+        visited.add(node)
+        rec_stack.add(node)
+        path.append(node)
+
+        # Check all dependencies
+        for dep in graph.get(node, []):
+            if dep not in visited:
+                if has_cycle(dep):
+                    return True
+            elif dep in rec_stack:
+                # Found cycle
+                cycle_start = path.index(dep)
+                cycle = path[cycle_start:] + [dep]
+                raise ValueError(
+                    f"Circular dependency detected: {' -> '.join(cycle)}"
+                )
+
+        rec_stack.remove(node)
+        path.pop()
+        return False
+
+    # Check each expression
+    for expr_name in expressions:
+        if expr_name not in visited:
+            has_cycle(expr_name)
+
+    return True
+
+
+def topological_sort(expressions: Dict[str, str]) -> List[str]:
+    """
+    Sort expressions by dependency order.
+
+    Args:
+        expressions: Dictionary mapping expression name to expression string
+
+    Returns:
+        List of expression names in dependency order
+
+    Example:
+        expressions = {
+            'bmi': 'weight / (height ** 2)',
+            'bmi_category': 'if_else(bmi < 18.5, "underweight", "normal")',
+            'weight': 'weight_lb * 0.453592'
+        }
+        order = topological_sort(expressions)
+        # Returns: ['weight', 'bmi', 'bmi_category']
+    """
+    # Build dependency graph
+    graph = build_dependency_graph(expressions)
+
+    # Track visited nodes and result
+    visited: Set[str] = set()
+    result: List[str] = []
+
+    def visit(node: str):
+        """DFS to build topological order."""
+        if node in visited:
+            return
+
+        visited.add(node)
+
+        # Visit all dependencies first
+        for dep in graph.get(node, []):
+            if dep in expressions:  # Only visit if it's an expression
+                visit(dep)
+
+        # Add node after all dependencies
+        result.append(node)
+
+    # Visit each expression
+    for expr_name in expressions:
+        visit(expr_name)
+
+    return result
+
+
+def build_dependency_graph(expressions: Dict[str, str]) -> Dict[str, List[str]]:
+    """
+    Build dependency graph for expressions.
+
+    Args:
+        expressions: Dictionary mapping expression name to expression string
+
+    Returns:
+        Dictionary mapping expression name to list of dependencies
+
+    Example:
+        expressions = {
+            'bmi': 'weight / (height ** 2)',
+            'bmi_category': 'if_else(bmi < 18.5, "underweight", "normal")'
+        }
+        graph = build_dependency_graph(expressions)
+        # Returns: {'bmi': [], 'bmi_category': ['bmi']}
+    """
+    graph: Dict[str, List[str]] = {}
+
+    for expr_name, expr_string in expressions.items():
+        # Extract identifiers from expression
+        identifiers = extract_identifiers(expr_string)
+
+        # Filter to only other expressions (not columns)
+        dependencies = [id for id in identifiers if id in expressions]
+
+        graph[expr_name] = dependencies
+
+    return graph
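
The resolver above depends only on the standard library, so its behaviour is easy to sketch. The snippet below is an illustrative usage example added for this review, not part of the released wheel; it assumes the module is importable as additory.expressions.resolver once 0.1.1a1 is installed.

# Illustrative sketch, not part of the diff.
from additory.expressions.resolver import (
    resolve_dependencies,
    topological_sort,
    check_circular_dependencies,
)

expressions = {
    'bmi': 'weight / (height ** 2)',
    'bmi_category': 'if_else(bmi < 18.5, "underweight", "normal")',
}

# Column dependencies: function names such as if_else are filtered out, and the
# result order is not guaranteed because a set is used internally.
print(resolve_dependencies(expressions['bmi'], ['weight', 'height', 'age']))
# e.g. ['weight', 'height']

# Expression-to-expression dependencies drive evaluation order.
print(topological_sort(expressions))             # ['bmi', 'bmi_category']
print(check_circular_dependencies(expressions))  # True — no cycle present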
additory/functions/__init__.py
@@ -0,0 +1 @@
+"""Functions package."""
additory/functions/analyze/__init__.py
@@ -0,0 +1,144 @@
+"""
+Analyze function - Data analysis with multiple modes.
+
+Provides comprehensive data analysis including quality, cardinality,
+distributions, correlations, and more.
+"""
+
+import polars as pl
+import time
+from typing import Optional
+
+from additory.core.backend import detect_backend, to_polars
+from additory.core.logging import get_logger
+from additory.common.validation import validate_dataframe
+from additory.common.result import wrap_analysis
+
+from . import quality
+from . import cardinality
+from . import distributions
+from . import correlations
+from . import types
+from . import features
+from . import patterns
+from . import outliers
+from . import duplicates
+from . import timeseries
+from . import imputation
+from . import presets
+
+
+logger = get_logger()
+
+
+def analyze(
+    df,
+    preset: Optional[str] = None,
+    quality_analysis: bool = False,
+    cardinality_analysis: bool = False,
+    distributions_analysis: bool = False,
+    correlations_analysis: bool = False,
+    features_analysis: bool = False,
+    types_analysis: bool = False,
+    patterns_analysis: bool = False,
+    outliers_analysis: bool = False,
+    duplicates_analysis: bool = False,
+    timeseries_analysis: bool = False,
+    imputation_analysis: bool = False,
+    date_column: Optional[str] = None
+):
+    """
+    Analyze DataFrame with multiple analysis types.
+
+    Args:
+        df: Input DataFrame
+        preset: Preset name ('quick', 'full')
+        quality_analysis: Run quality analysis
+        cardinality_analysis: Run cardinality analysis
+        distributions_analysis: Run distribution analysis
+        correlations_analysis: Run correlation analysis
+        features_analysis: Run feature analysis
+        types_analysis: Run type analysis
+        patterns_analysis: Run pattern analysis
+        outliers_analysis: Run outlier analysis
+        duplicates_analysis: Run duplicate analysis
+        timeseries_analysis: Run timeseries analysis
+        imputation_analysis: Run imputation analysis
+        date_column: Column for timeseries analysis
+
+    Returns:
+        AnalysisResult with analysis results
+    """
+    start_time = time.time()
+
+    # Validate and convert
+    validate_dataframe(df, 'df')
+    backend = detect_backend(df)
+    polars_df = to_polars(df)
+
+    # Determine analyses to run
+    if preset:
+        analyses_to_run = presets.get_preset_analyses(preset)
+    else:
+        analyses_to_run = {
+            'quality': quality_analysis,
+            'cardinality': cardinality_analysis,
+            'distributions': distributions_analysis,
+            'correlations': correlations_analysis,
+            'features': features_analysis,
+            'types': types_analysis,
+            'patterns': patterns_analysis,
+            'outliers': outliers_analysis,
+            'duplicates': duplicates_analysis,
+            'timeseries': timeseries_analysis,
+            'imputation': imputation_analysis
+        }
+
+    # Run analyses
+    results = {}
+
+    if analyses_to_run.get('quality'):
+        results['quality'] = quality.analyze_quality(polars_df)
+
+    if analyses_to_run.get('cardinality'):
+        results['cardinality'] = cardinality.analyze_cardinality(polars_df)
+
+    if analyses_to_run.get('distributions'):
+        results['distributions'] = distributions.analyze_distributions(polars_df)
+
+    if analyses_to_run.get('correlations'):
+        results['correlations'] = correlations.analyze_correlations(polars_df)
+
+    if analyses_to_run.get('features'):
+        results['features'] = features.analyze_features(polars_df)
+
+    if analyses_to_run.get('types'):
+        results['types'] = types.analyze_types(polars_df)
+
+    if analyses_to_run.get('patterns'):
+        results['patterns'] = patterns.analyze_patterns(polars_df)
+
+    if analyses_to_run.get('outliers'):
+        results['outliers'] = outliers.analyze_outliers(polars_df)
+
+    if analyses_to_run.get('duplicates'):
+        results['duplicates'] = duplicates.analyze_duplicates(polars_df)
+
+    if analyses_to_run.get('timeseries') and date_column:
+        results['timeseries'] = timeseries.analyze_timeseries(polars_df, date_column)
+
+    if analyses_to_run.get('imputation'):
+        results['imputation'] = imputation.analyze_imputation(polars_df)
+
+    # Calculate execution time
+    execution_time = time.time() - start_time
+
+    # Wrap and return
+    metadata = {
+        'preset': preset,
+        'analyses_run': list(results.keys()),
+        'execution_time': execution_time,
+        'input_shape': (polars_df.height, polars_df.width)
+    }
+
+    return wrap_analysis(results, metadata)
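
A minimal usage sketch for the new analyze() entry point follows; it is illustrative only and not part of the diff. The AnalysisResult returned by wrap_analysis() is defined in additory/common/result.py, which is not shown in this excerpt, so how its payload is accessed is left out here.

# Illustrative sketch, not part of the diff.
import polars as pl
from additory.functions.analyze import analyze

df = pl.DataFrame({
    'order_id': [1, 2, 2, 4],
    'amount': [10.0, 12.5, 12.5, None],
    'region': ['north', 'north', 'south', 'south'],
})

# Explicit flags; per the docstring, preset='quick' or preset='full' would
# instead select a bundle of analyses via presets.get_preset_analyses().
result = analyze(
    df,
    quality_analysis=True,
    cardinality_analysis=True,
    duplicates_analysis=True,
)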
additory/functions/analyze/cardinality.py
@@ -0,0 +1,58 @@
+"""
+Cardinality analysis module.
+
+Analyzes unique value counts and cardinality ratios to identify
+potential keys and categorical columns.
+"""
+
+import polars as pl
+from typing import Dict, Any, List
+
+
+def analyze_cardinality(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Analyze cardinality metrics.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with cardinality metrics:
+        - unique_counts: Dict of column -> unique count
+        - cardinality_ratios: Dict of column -> ratio (unique/total)
+        - high_cardinality: List of high cardinality columns (ratio > 0.95)
+        - low_cardinality: List of low cardinality columns (ratio < 0.05)
+        - potential_keys: List of columns that could be keys (ratio == 1.0)
+        - potential_categories: List of columns that could be categories (ratio < 0.05)
+
+    Example:
+        >>> df = pl.DataFrame({'id': [1, 2, 3], 'category': ['A', 'A', 'B']})
+        >>> result = analyze_cardinality(df)
+        >>> result['potential_keys']
+        ['id']
+    """
+    total_rows = df.height
+
+    # Calculate unique counts and ratios
+    unique_counts = {}
+    cardinality_ratios = {}
+
+    for col in df.columns:
+        unique_count = df[col].n_unique()
+        unique_counts[col] = unique_count
+        cardinality_ratios[col] = (unique_count / total_rows) if total_rows > 0 else 0.0
+
+    # Classify columns by cardinality
+    high_cardinality = [col for col, ratio in cardinality_ratios.items() if ratio > 0.95]
+    low_cardinality = [col for col, ratio in cardinality_ratios.items() if ratio < 0.05]
+    potential_keys = [col for col, ratio in cardinality_ratios.items() if ratio == 1.0]
+    potential_categories = [col for col, ratio in cardinality_ratios.items() if ratio < 0.05 and ratio > 0]
+
+    return {
+        'unique_counts': unique_counts,
+        'cardinality_ratios': {k: round(v, 4) for k, v in cardinality_ratios.items()},
+        'high_cardinality': high_cardinality,
+        'low_cardinality': low_cardinality,
+        'potential_keys': potential_keys,
+        'potential_categories': potential_categories
+    }
additory/functions/analyze/correlations.py
@@ -0,0 +1,66 @@
+"""
+Correlation analysis module.
+
+Analyzes correlations between numeric columns using Pearson correlation.
+"""
+
+import polars as pl
+from typing import Dict, Any, List, Tuple
+
+
+def analyze_correlations(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Analyze correlations between numeric columns.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with correlation metrics:
+        - correlation_matrix: Dict of column pairs -> correlation value
+        - highly_correlated: List of highly correlated pairs (|r| > 0.7)
+        - numeric_columns: List of columns analyzed
+
+    Example:
+        >>> df = pl.DataFrame({'a': [1, 2, 3], 'b': [2, 4, 6], 'c': [1, 1, 1]})
+        >>> result = analyze_correlations(df)
+        >>> len(result['highly_correlated']) > 0
+        True
+    """
+    # Get numeric columns
+    numeric_cols = [col for col in df.columns if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]]
+
+    if len(numeric_cols) < 2:
+        return {
+            'correlation_matrix': {},
+            'highly_correlated': [],
+            'numeric_columns': numeric_cols
+        }
+
+    # Calculate correlation matrix
+    correlation_matrix = {}
+    highly_correlated = []
+
+    for i, col1 in enumerate(numeric_cols):
+        for col2 in numeric_cols[i+1:]:
+            # Calculate Pearson correlation
+            corr = df.select([
+                pl.corr(col1, col2).alias('correlation')
+            ])['correlation'][0]
+
+            if corr is not None:
+                correlation_matrix[f"{col1}_{col2}"] = round(float(corr), 4)
+
+                # Check if highly correlated
+                if abs(corr) > 0.7:
+                    highly_correlated.append({
+                        'column1': col1,
+                        'column2': col2,
+                        'correlation': round(float(corr), 4)
+                    })
+
+    return {
+        'correlation_matrix': correlation_matrix,
+        'highly_correlated': highly_correlated,
+        'numeric_columns': numeric_cols
+    }
additory/functions/analyze/distributions.py
@@ -0,0 +1,53 @@
+"""
+Distribution analysis module.
+
+Analyzes statistical distributions of numeric columns including
+mean, median, std, skewness, and kurtosis.
+"""
+
+import polars as pl
+from typing import Dict, Any
+
+
+def analyze_distributions(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Analyze distribution metrics for numeric columns.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with distribution metrics per numeric column:
+        - mean: Mean value
+        - median: Median value
+        - std: Standard deviation
+        - min: Minimum value
+        - max: Maximum value
+        - q25: 25th percentile
+        - q75: 75th percentile
+
+    Example:
+        >>> df = pl.DataFrame({'a': [1, 2, 3, 4, 5], 'b': ['x', 'y', 'z', 'x', 'y']})
+        >>> result = analyze_distributions(df)
+        >>> 'a' in result
+        True
+    """
+    numeric_cols = [col for col in df.columns if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]]
+
+    distributions = {}
+
+    for col in numeric_cols:
+        col_data = df[col]
+
+        # Calculate basic statistics
+        distributions[col] = {
+            'mean': float(col_data.mean()) if col_data.mean() is not None else None,
+            'median': float(col_data.median()) if col_data.median() is not None else None,
+            'std': float(col_data.std()) if col_data.std() is not None else None,
+            'min': float(col_data.min()) if col_data.min() is not None else None,
+            'max': float(col_data.max()) if col_data.max() is not None else None,
+            'q25': float(col_data.quantile(0.25)) if col_data.quantile(0.25) is not None else None,
+            'q75': float(col_data.quantile(0.75)) if col_data.quantile(0.75) is not None else None
+        }
+
+    return distributions
additory/functions/analyze/duplicates.py
@@ -0,0 +1,49 @@
+"""
+Duplicate detection module.
+
+Detects duplicate rows and values in the DataFrame.
+"""
+
+import polars as pl
+from typing import Dict, Any
+
+
+def analyze_duplicates(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Detect duplicate rows and values.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with duplicate detection results:
+        - duplicate_rows: Number of duplicate rows
+        - duplicate_percentage: Percentage of duplicate rows
+        - columns_with_duplicates: List of columns with duplicate values
+        - unique_rows: Number of unique rows
+
+    Example:
+        >>> df = pl.DataFrame({'a': [1, 1, 2], 'b': [1, 1, 2]})
+        >>> result = analyze_duplicates(df)
+        >>> result['duplicate_rows']
+        1
+    """
+    total_rows = df.height
+    unique_rows = df.unique().height
+    duplicate_rows = total_rows - unique_rows
+    duplicate_percentage = (duplicate_rows / total_rows * 100) if total_rows > 0 else 0.0
+
+    # Check for duplicates in each column
+    columns_with_duplicates = []
+    for col in df.columns:
+        unique_count = df[col].n_unique()
+        if unique_count < total_rows:
+            columns_with_duplicates.append(col)
+
+    return {
+        'duplicate_rows': duplicate_rows,
+        'duplicate_percentage': round(duplicate_percentage, 2),
+        'unique_rows': unique_rows,
+        'total_rows': total_rows,
+        'columns_with_duplicates': columns_with_duplicates
+    }
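
The analyze submodules shown above all take a plain Polars DataFrame and return dictionaries, so they can also be sketched standalone. The example below is illustrative only and not part of the diff; the expected values in the comments follow from the code shown above.

# Illustrative sketch, not part of the diff.
import polars as pl
from additory.functions.analyze import cardinality, correlations, distributions, duplicates

df = pl.DataFrame({
    'id': [1, 2, 3, 4],
    'group': ['A', 'A', 'B', 'B'],
    'x': [3.0, 1.0, 3.0, 1.5],
    'y': [6.0, 2.0, 6.0, 3.1],  # roughly 2 * x, so x and y correlate strongly
})

print(cardinality.analyze_cardinality(df)['potential_keys'])       # ['id'] — the only fully unique column
print(correlations.analyze_correlations(df)['highly_correlated'])  # contains the x/y pair (|r| > 0.7)
print(distributions.analyze_distributions(df)['x'])                # mean, median, std, min, max, q25, q75
print(duplicates.analyze_duplicates(df)['duplicate_rows'])         # 0 — all four rows are distinct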