additory 0.1.0a4__py3-none-any.whl → 0.1.1a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +390 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a2.dist-info/METADATA +84 -0
  60. additory-0.1.1a2.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a2.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a2.dist-info}/top_level.txt +0 -0
additory/expressions/resolver.py
@@ -0,0 +1,274 @@
+ """
+ Expression dependency resolver for Additory.
+
+ Resolves column references and dependencies in expressions.
+ """
+
+ import re
+ from typing import List, Dict, Set
+
+
+ # Known function names
+ FUNCTION_NAMES = {
+     # Math functions
+     'sqrt', 'abs', 'log', 'log10', 'exp', 'pow', 'round', 'floor', 'ceil',
+     # String functions
+     'lower', 'upper', 'trim', 'length', 'substring', 'replace', 'contains', 'matches',
+     # DateTime functions
+     'year', 'month', 'day', 'hour', 'minute', 'second', 'day_of_week', 'time_of_day',
+     # Aggregation functions
+     'sum', 'mean', 'median', 'min', 'max', 'count', 'std',
+     # Conditional functions
+     'if_else', 'coalesce', 'is_null', 'is_not_null'
+ }
+
+
+ def resolve_dependencies(expression: str, available_columns: List[str]) -> List[str]:
+     """
+     Find all column dependencies in an expression.
+
+     Args:
+         expression: Expression string
+         available_columns: List of available column names in DataFrame
+
+     Returns:
+         List of column names used in expression
+
+     Example:
+         deps = resolve_dependencies('weight / (height ** 2)', ['weight', 'height', 'age'])
+         # Returns: ['weight', 'height']
+     """
+     # Extract all identifiers
+     identifiers = extract_identifiers(expression)
+
+     # Filter to only columns that exist in DataFrame
+     dependencies = [id for id in identifiers if id in available_columns]
+
+     # Return unique list
+     return list(set(dependencies))
+
+
+ def validate_dependencies(expression: str, available_columns: List[str]) -> bool:
+     """
+     Validate that all column dependencies exist.
+
+     Args:
+         expression: Expression string
+         available_columns: List of available column names
+
+     Returns:
+         True if all dependencies exist
+
+     Raises:
+         ValueError: If any dependencies are missing
+
+     Example:
+         validate_dependencies('weight / height', ['weight', 'height']) # Returns True
+         validate_dependencies('weight / height', ['age']) # Raises ValueError
+     """
+     # Extract all identifiers
+     identifiers = extract_identifiers(expression)
+
+     # Find missing columns
+     missing = [id for id in identifiers if id not in available_columns]
+
+     if missing:
+         raise ValueError(
+             f"Expression requires columns {missing} but DataFrame only has {available_columns}"
+         )
+
+     return True
+
+
+ def extract_identifiers(expression: str) -> List[str]:
+     """
+     Extract all identifiers from expression string.
+
+     Args:
+         expression: Expression string
+
+     Returns:
+         List of identifier names (column names, not function names)
+
+     Example:
+         identifiers = extract_identifiers('sqrt((height * weight) / 3600)')
+         # Returns: ['height', 'weight']
+     """
+     # Remove string literals first (both single and double quotes)
+     # This prevents extracting identifiers from within strings
+     expr_without_strings = re.sub(r'"[^"]*"', '', expression)
+     expr_without_strings = re.sub(r"'[^']*'", '', expr_without_strings)
+
+     # Pattern to match identifiers (letters, numbers, underscores)
+     pattern = r'\b[a-zA-Z_][a-zA-Z0-9_]*\b'
+
+     # Find all matches
+     matches = re.findall(pattern, expr_without_strings)
+
+     # Filter out keywords and function names
+     keywords = {'AND', 'OR', 'NOT'}
+     identifiers = [
+         match for match in matches
+         if match not in keywords and not is_function_name(match)
+     ]
+
+     # Return unique list
+     return list(set(identifiers))
+
+
+ def is_function_name(identifier: str) -> bool:
+     """
+     Check if identifier is a function name.
+
+     Args:
+         identifier: Identifier string
+
+     Returns:
+         True if function name, False if column name
+
+     Example:
+         is_function_name('sqrt') # Returns True
+         is_function_name('weight') # Returns False
+     """
+     return identifier.lower() in FUNCTION_NAMES
+
+
+ def check_circular_dependencies(expressions: Dict[str, str]) -> bool:
+     """
+     Check for circular dependencies between expressions.
+
+     Args:
+         expressions: Dictionary mapping expression name to expression string
+
+     Returns:
+         True if no circular dependencies
+
+     Raises:
+         ValueError: If circular dependency detected
+
+     Example:
+         expressions = {
+             'a': 'b + 1',
+             'b': 'c + 1',
+             'c': 'a + 1' # Circular!
+         }
+         check_circular_dependencies(expressions)
+         # Raises: "Circular dependency detected: a -> b -> c -> a"
+     """
+     # Build dependency graph
+     graph = build_dependency_graph(expressions)
+
+     # Track visited nodes and recursion stack
+     visited: Set[str] = set()
+     rec_stack: Set[str] = set()
+     path: List[str] = []
+
+     def has_cycle(node: str) -> bool:
+         """DFS to detect cycles."""
+         visited.add(node)
+         rec_stack.add(node)
+         path.append(node)
+
+         # Check all dependencies
+         for dep in graph.get(node, []):
+             if dep not in visited:
+                 if has_cycle(dep):
+                     return True
+             elif dep in rec_stack:
+                 # Found cycle
+                 cycle_start = path.index(dep)
+                 cycle = path[cycle_start:] + [dep]
+                 raise ValueError(
+                     f"Circular dependency detected: {' -> '.join(cycle)}"
+                 )
+
+         rec_stack.remove(node)
+         path.pop()
+         return False
+
+     # Check each expression
+     for expr_name in expressions:
+         if expr_name not in visited:
+             has_cycle(expr_name)
+
+     return True
+
+
+ def topological_sort(expressions: Dict[str, str]) -> List[str]:
+     """
+     Sort expressions by dependency order.
+
+     Args:
+         expressions: Dictionary mapping expression name to expression string
+
+     Returns:
+         List of expression names in dependency order
+
+     Example:
+         expressions = {
+             'bmi': 'weight / (height ** 2)',
+             'bmi_category': 'if_else(bmi < 18.5, "underweight", "normal")',
+             'weight': 'weight_lb * 0.453592'
+         }
+         order = topological_sort(expressions)
+         # Returns: ['weight', 'bmi', 'bmi_category']
+     """
+     # Build dependency graph
+     graph = build_dependency_graph(expressions)
+
+     # Track visited nodes and result
+     visited: Set[str] = set()
+     result: List[str] = []
+
+     def visit(node: str):
+         """DFS to build topological order."""
+         if node in visited:
+             return
+
+         visited.add(node)
+
+         # Visit all dependencies first
+         for dep in graph.get(node, []):
+             if dep in expressions: # Only visit if it's an expression
+                 visit(dep)
+
+         # Add node after all dependencies
+         result.append(node)
+
+     # Visit each expression
+     for expr_name in expressions:
+         visit(expr_name)
+
+     return result
+
+
+ def build_dependency_graph(expressions: Dict[str, str]) -> Dict[str, List[str]]:
+     """
+     Build dependency graph for expressions.
+
+     Args:
+         expressions: Dictionary mapping expression name to expression string
+
+     Returns:
+         Dictionary mapping expression name to list of dependencies
+
+     Example:
+         expressions = {
+             'bmi': 'weight / (height ** 2)',
+             'bmi_category': 'if_else(bmi < 18.5, "underweight", "normal")'
+         }
+         graph = build_dependency_graph(expressions)
+         # Returns: {'bmi': [], 'bmi_category': ['bmi']}
+     """
+     graph: Dict[str, List[str]] = {}
+
+     for expr_name, expr_string in expressions.items():
+         # Extract identifiers from expression
+         identifiers = extract_identifiers(expr_string)
+
+         # Filter to only other expressions (not columns)
+         dependencies = [id for id in identifiers if id in expressions]
+
+         graph[expr_name] = dependencies
+
+     return graph
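For orientation, a minimal usage sketch of the resolver functions introduced above (assuming the hunk corresponds to additory/expressions/resolver.py, as the +274-line entry in the file list suggests; the expression names and columns are illustrative):

    from additory.expressions.resolver import (
        check_circular_dependencies,
        resolve_dependencies,
        topological_sort,
    )

    exprs = {
        'bmi': 'weight / (height ** 2)',
        'bmi_flag': 'if_else(bmi < 18.5, "low", "ok")',
    }

    check_circular_dependencies(exprs)   # True: no cycle in this graph
    topological_sort(exprs)              # ['bmi', 'bmi_flag']
    resolve_dependencies('weight / (height ** 2)', ['weight', 'height', 'age'])
    # ['weight', 'height'] in some order (the result is built from a set)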
additory/functions/__init__.py
@@ -0,0 +1 @@
+ """Functions package."""
additory/functions/analyze/__init__.py
@@ -0,0 +1,144 @@
+ """
+ Analyze function - Data analysis with multiple modes.
+
+ Provides comprehensive data analysis including quality, cardinality,
+ distributions, correlations, and more.
+ """
+
+ import polars as pl
+ import time
+ from typing import Optional
+
+ from additory.core.backend import detect_backend, to_polars
+ from additory.core.logging import get_logger
+ from additory.common.validation import validate_dataframe
+ from additory.common.result import wrap_analysis
+
+ from . import quality
+ from . import cardinality
+ from . import distributions
+ from . import correlations
+ from . import types
+ from . import features
+ from . import patterns
+ from . import outliers
+ from . import duplicates
+ from . import timeseries
+ from . import imputation
+ from . import presets
+
+
+ logger = get_logger()
+
+
+ def analyze(
+     df,
+     preset: Optional[str] = None,
+     quality_analysis: bool = False,
+     cardinality_analysis: bool = False,
+     distributions_analysis: bool = False,
+     correlations_analysis: bool = False,
+     features_analysis: bool = False,
+     types_analysis: bool = False,
+     patterns_analysis: bool = False,
+     outliers_analysis: bool = False,
+     duplicates_analysis: bool = False,
+     timeseries_analysis: bool = False,
+     imputation_analysis: bool = False,
+     date_column: Optional[str] = None
+ ):
+     """
+     Analyze DataFrame with multiple analysis types.
+
+     Args:
+         df: Input DataFrame
+         preset: Preset name ('quick', 'full')
+         quality_analysis: Run quality analysis
+         cardinality_analysis: Run cardinality analysis
+         distributions_analysis: Run distribution analysis
+         correlations_analysis: Run correlation analysis
+         features_analysis: Run feature analysis
+         types_analysis: Run type analysis
+         patterns_analysis: Run pattern analysis
+         outliers_analysis: Run outlier analysis
+         duplicates_analysis: Run duplicate analysis
+         timeseries_analysis: Run timeseries analysis
+         imputation_analysis: Run imputation analysis
+         date_column: Column for timeseries analysis
+
+     Returns:
+         AnalysisResult with analysis results
+     """
+     start_time = time.time()
+
+     # Validate and convert
+     validate_dataframe(df, 'df')
+     backend = detect_backend(df)
+     polars_df = to_polars(df)
+
+     # Determine analyses to run
+     if preset:
+         analyses_to_run = presets.get_preset_analyses(preset)
+     else:
+         analyses_to_run = {
+             'quality': quality_analysis,
+             'cardinality': cardinality_analysis,
+             'distributions': distributions_analysis,
+             'correlations': correlations_analysis,
+             'features': features_analysis,
+             'types': types_analysis,
+             'patterns': patterns_analysis,
+             'outliers': outliers_analysis,
+             'duplicates': duplicates_analysis,
+             'timeseries': timeseries_analysis,
+             'imputation': imputation_analysis
+         }
+
+     # Run analyses
+     results = {}
+
+     if analyses_to_run.get('quality'):
+         results['quality'] = quality.analyze_quality(polars_df)
+
+     if analyses_to_run.get('cardinality'):
+         results['cardinality'] = cardinality.analyze_cardinality(polars_df)
+
+     if analyses_to_run.get('distributions'):
+         results['distributions'] = distributions.analyze_distributions(polars_df)
+
+     if analyses_to_run.get('correlations'):
+         results['correlations'] = correlations.analyze_correlations(polars_df)
+
+     if analyses_to_run.get('features'):
+         results['features'] = features.analyze_features(polars_df)
+
+     if analyses_to_run.get('types'):
+         results['types'] = types.analyze_types(polars_df)
+
+     if analyses_to_run.get('patterns'):
+         results['patterns'] = patterns.analyze_patterns(polars_df)
+
+     if analyses_to_run.get('outliers'):
+         results['outliers'] = outliers.analyze_outliers(polars_df)
+
+     if analyses_to_run.get('duplicates'):
+         results['duplicates'] = duplicates.analyze_duplicates(polars_df)
+
+     if analyses_to_run.get('timeseries') and date_column:
+         results['timeseries'] = timeseries.analyze_timeseries(polars_df, date_column)
+
+     if analyses_to_run.get('imputation'):
+         results['imputation'] = imputation.analyze_imputation(polars_df)
+
+     # Calculate execution time
+     execution_time = time.time() - start_time
+
+     # Wrap and return
+     metadata = {
+         'preset': preset,
+         'analyses_run': list(results.keys()),
+         'execution_time': execution_time,
+         'input_shape': (polars_df.height, polars_df.width)
+     }
+
+     return wrap_analysis(results, metadata)
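A minimal calling sketch for the analyze() entry point above (an illustration only: it assumes analyze is imported directly from additory.functions.analyze, and the sample DataFrame is invented):

    import polars as pl
    from additory.functions.analyze import analyze

    df = pl.DataFrame({'id': [1, 2, 3, 3], 'score': [10.0, 12.5, 11.0, 11.0]})

    # Only the analyses switched on explicitly (or via a preset) are run.
    report = analyze(df, quality_analysis=True, duplicates_analysis=True)
    # The wrapped result carries metadata such as 'analyses_run',
    # 'execution_time', and 'input_shape', as assembled above.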
additory/functions/analyze/cardinality.py
@@ -0,0 +1,58 @@
+ """
+ Cardinality analysis module.
+
+ Analyzes unique value counts and cardinality ratios to identify
+ potential keys and categorical columns.
+ """
+
+ import polars as pl
+ from typing import Dict, Any, List
+
+
+ def analyze_cardinality(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Analyze cardinality metrics.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with cardinality metrics:
+         - unique_counts: Dict of column -> unique count
+         - cardinality_ratios: Dict of column -> ratio (unique/total)
+         - high_cardinality: List of high cardinality columns (ratio > 0.95)
+         - low_cardinality: List of low cardinality columns (ratio < 0.05)
+         - potential_keys: List of columns that could be keys (ratio == 1.0)
+         - potential_categories: List of columns that could be categories (ratio < 0.05)
+
+     Example:
+         >>> df = pl.DataFrame({'id': [1, 2, 3], 'category': ['A', 'A', 'B']})
+         >>> result = analyze_cardinality(df)
+         >>> result['potential_keys']
+         ['id']
+     """
+     total_rows = df.height
+
+     # Calculate unique counts and ratios
+     unique_counts = {}
+     cardinality_ratios = {}
+
+     for col in df.columns:
+         unique_count = df[col].n_unique()
+         unique_counts[col] = unique_count
+         cardinality_ratios[col] = (unique_count / total_rows) if total_rows > 0 else 0.0
+
+     # Classify columns by cardinality
+     high_cardinality = [col for col, ratio in cardinality_ratios.items() if ratio > 0.95]
+     low_cardinality = [col for col, ratio in cardinality_ratios.items() if ratio < 0.05]
+     potential_keys = [col for col, ratio in cardinality_ratios.items() if ratio == 1.0]
+     potential_categories = [col for col, ratio in cardinality_ratios.items() if ratio < 0.05 and ratio > 0]
+
+     return {
+         'unique_counts': unique_counts,
+         'cardinality_ratios': {k: round(v, 4) for k, v in cardinality_ratios.items()},
+         'high_cardinality': high_cardinality,
+         'low_cardinality': low_cardinality,
+         'potential_keys': potential_keys,
+         'potential_categories': potential_categories
+     }
additory/functions/analyze/correlations.py
@@ -0,0 +1,66 @@
+ """
+ Correlation analysis module.
+
+ Analyzes correlations between numeric columns using Pearson correlation.
+ """
+
+ import polars as pl
+ from typing import Dict, Any, List, Tuple
+
+
+ def analyze_correlations(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Analyze correlations between numeric columns.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with correlation metrics:
+         - correlation_matrix: Dict of column pairs -> correlation value
+         - highly_correlated: List of highly correlated pairs (|r| > 0.7)
+         - numeric_columns: List of columns analyzed
+
+     Example:
+         >>> df = pl.DataFrame({'a': [1, 2, 3], 'b': [2, 4, 6], 'c': [1, 1, 1]})
+         >>> result = analyze_correlations(df)
+         >>> len(result['highly_correlated']) > 0
+         True
+     """
+     # Get numeric columns
+     numeric_cols = [col for col in df.columns if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]]
+
+     if len(numeric_cols) < 2:
+         return {
+             'correlation_matrix': {},
+             'highly_correlated': [],
+             'numeric_columns': numeric_cols
+         }
+
+     # Calculate correlation matrix
+     correlation_matrix = {}
+     highly_correlated = []
+
+     for i, col1 in enumerate(numeric_cols):
+         for col2 in numeric_cols[i+1:]:
+             # Calculate Pearson correlation
+             corr = df.select([
+                 pl.corr(col1, col2).alias('correlation')
+             ])['correlation'][0]
+
+             if corr is not None:
+                 correlation_matrix[f"{col1}_{col2}"] = round(float(corr), 4)
+
+                 # Check if highly correlated
+                 if abs(corr) > 0.7:
+                     highly_correlated.append({
+                         'column1': col1,
+                         'column2': col2,
+                         'correlation': round(float(corr), 4)
+                     })
+
+     return {
+         'correlation_matrix': correlation_matrix,
+         'highly_correlated': highly_correlated,
+         'numeric_columns': numeric_cols
+     }
additory/functions/analyze/distributions.py
@@ -0,0 +1,53 @@
+ """
+ Distribution analysis module.
+
+ Analyzes statistical distributions of numeric columns including
+ mean, median, std, skewness, and kurtosis.
+ """
+
+ import polars as pl
+ from typing import Dict, Any
+
+
+ def analyze_distributions(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Analyze distribution metrics for numeric columns.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with distribution metrics per numeric column:
+         - mean: Mean value
+         - median: Median value
+         - std: Standard deviation
+         - min: Minimum value
+         - max: Maximum value
+         - q25: 25th percentile
+         - q75: 75th percentile
+
+     Example:
+         >>> df = pl.DataFrame({'a': [1, 2, 3, 4, 5], 'b': ['x', 'y', 'z', 'x', 'y']})
+         >>> result = analyze_distributions(df)
+         >>> 'a' in result
+         True
+     """
+     numeric_cols = [col for col in df.columns if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]]
+
+     distributions = {}
+
+     for col in numeric_cols:
+         col_data = df[col]
+
+         # Calculate basic statistics
+         distributions[col] = {
+             'mean': float(col_data.mean()) if col_data.mean() is not None else None,
+             'median': float(col_data.median()) if col_data.median() is not None else None,
+             'std': float(col_data.std()) if col_data.std() is not None else None,
+             'min': float(col_data.min()) if col_data.min() is not None else None,
+             'max': float(col_data.max()) if col_data.max() is not None else None,
+             'q25': float(col_data.quantile(0.25)) if col_data.quantile(0.25) is not None else None,
+             'q75': float(col_data.quantile(0.75)) if col_data.quantile(0.75) is not None else None
+         }
+
+     return distributions
additory/functions/analyze/duplicates.py
@@ -0,0 +1,49 @@
+ """
+ Duplicate detection module.
+
+ Detects duplicate rows and values in the DataFrame.
+ """
+
+ import polars as pl
+ from typing import Dict, Any
+
+
+ def analyze_duplicates(df: pl.DataFrame) -> Dict[str, Any]:
+     """
+     Detect duplicate rows and values.
+
+     Args:
+         df: Polars DataFrame to analyze
+
+     Returns:
+         Dictionary with duplicate detection results:
+         - duplicate_rows: Number of duplicate rows
+         - duplicate_percentage: Percentage of duplicate rows
+         - columns_with_duplicates: List of columns with duplicate values
+         - unique_rows: Number of unique rows
+
+     Example:
+         >>> df = pl.DataFrame({'a': [1, 1, 2], 'b': [1, 1, 2]})
+         >>> result = analyze_duplicates(df)
+         >>> result['duplicate_rows']
+         1
+     """
+     total_rows = df.height
+     unique_rows = df.unique().height
+     duplicate_rows = total_rows - unique_rows
+     duplicate_percentage = (duplicate_rows / total_rows * 100) if total_rows > 0 else 0.0
+
+     # Check for duplicates in each column
+     columns_with_duplicates = []
+     for col in df.columns:
+         unique_count = df[col].n_unique()
+         if unique_count < total_rows:
+             columns_with_duplicates.append(col)
+
+     return {
+         'duplicate_rows': duplicate_rows,
+         'duplicate_percentage': round(duplicate_percentage, 2),
+         'unique_rows': unique_rows,
+         'total_rows': total_rows,
+         'columns_with_duplicates': columns_with_duplicates
+     }