additory 0.1.0a4-py3-none-any.whl → 0.1.1a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/functions/analyze/features.py
@@ -0,0 +1,61 @@
+"""
+Feature analysis module.
+
+Analyzes feature types and provides transformation recommendations.
+"""
+
+import polars as pl
+from typing import Dict, Any
+
+
+def analyze_features(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Analyze features and provide recommendations.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with feature analysis:
+        - feature_types: Dict of column -> feature type
+        - numeric_features: List of numeric features
+        - categorical_features: List of categorical features
+        - datetime_features: List of datetime features
+
+    Example:
+        >>> df = pl.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
+        >>> result = analyze_features(df)
+        >>> result['numeric_features']
+        ['a']
+    """
+    feature_types = {}
+    numeric_features = []
+    categorical_features = []
+    datetime_features = []
+
+    for col in df.columns:
+        dtype = df[col].dtype
+
+        if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]:
+            feature_types[col] = 'numeric'
+            numeric_features.append(col)
+        elif dtype == pl.Utf8:
+            # Check if it's categorical (low cardinality)
+            unique_ratio = df[col].n_unique() / df.height if df.height > 0 else 0
+            if unique_ratio < 0.05:
+                feature_types[col] = 'categorical'
+                categorical_features.append(col)
+            else:
+                feature_types[col] = 'text'
+        elif dtype in [pl.Date, pl.Datetime]:
+            feature_types[col] = 'datetime'
+            datetime_features.append(col)
+        else:
+            feature_types[col] = str(dtype)
+
+    return {
+        'feature_types': feature_types,
+        'numeric_features': numeric_features,
+        'categorical_features': categorical_features,
+        'datetime_features': datetime_features
+    }

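For orientation, a minimal usage sketch of the new analyzer (import path inferred from the file layout above; the toy column names are not from the package):

import polars as pl
from additory.functions.analyze.features import analyze_features

# A Utf8 column only counts as 'categorical' when fewer than 5% of its values
# are unique, so this tiny frame classifies 'city' as 'text'.
df = pl.DataFrame({
    "age": [34, 29, 41],
    "city": ["Oslo", "Lima", "Pune"],
})
print(analyze_features(df)["feature_types"])  # {'age': 'numeric', 'city': 'text'}
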
additory/functions/analyze/imputation.py
@@ -0,0 +1,66 @@
+"""
+Imputation recommendation module.
+
+Recommends imputation strategies for missing values.
+"""
+
+import polars as pl
+from typing import Dict, Any
+
+
+def analyze_imputation(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Recommend imputation strategies for missing values.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with imputation recommendations:
+        - recommendations: Dict of column -> recommended strategy
+        - columns_needing_imputation: List of columns with missing values
+        - imputation_complexity: Overall complexity (low, medium, high)
+
+    Example:
+        >>> df = pl.DataFrame({'a': [1, None, 3], 'b': ['x', 'y', None]})
+        >>> result = analyze_imputation(df)
+        >>> 'a' in result['columns_needing_imputation']
+        True
+    """
+    recommendations = {}
+    columns_needing_imputation = []
+
+    for col in df.columns:
+        null_count = df[col].null_count()
+
+        if null_count == 0:
+            continue
+
+        columns_needing_imputation.append(col)
+        dtype = df[col].dtype
+
+        # Recommend strategy based on type
+        if dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]:
+            recommendations[col] = 'mean or median'
+        elif dtype == pl.Utf8:
+            recommendations[col] = 'mode or constant'
+        elif dtype == pl.Boolean:
+            recommendations[col] = 'mode'
+        else:
+            recommendations[col] = 'forward fill or constant'
+
+    # Determine complexity
+    if len(columns_needing_imputation) == 0:
+        complexity = 'none'
+    elif len(columns_needing_imputation) <= 2:
+        complexity = 'low'
+    elif len(columns_needing_imputation) <= 5:
+        complexity = 'medium'
+    else:
+        complexity = 'high'
+
+    return {
+        'recommendations': recommendations,
+        'columns_needing_imputation': columns_needing_imputation,
+        'imputation_complexity': complexity
+    }

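A hedged sketch of how the recommendations and the complexity bucket come out for a small frame (import path inferred from the file layout; the data is made up):

import polars as pl
from additory.functions.analyze.imputation import analyze_imputation

df = pl.DataFrame({
    "height": [1.7, None, 1.8],     # Float64 with a null -> 'mean or median'
    "smoker": ["yes", None, "no"],  # Utf8 with a null -> 'mode or constant'
    "id": [1, 2, 3],                # complete, so it is skipped
})
result = analyze_imputation(df)
print(result["recommendations"])        # {'height': 'mean or median', 'smoker': 'mode or constant'}
print(result["imputation_complexity"])  # 'low' (two affected columns falls in the <= 2 bucket)
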
additory/functions/analyze/outliers.py
@@ -0,0 +1,65 @@
+"""
+Outlier detection module.
+
+Detects outliers in numeric columns using IQR method.
+"""
+
+import polars as pl
+from typing import Dict, Any
+
+
+def analyze_outliers(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Detect outliers in numeric columns using IQR method.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with outlier detection results:
+        - outlier_counts: Dict of column -> outlier count
+        - outlier_percentages: Dict of column -> outlier percentage
+        - columns_with_outliers: List of columns with outliers
+
+    Example:
+        >>> df = pl.DataFrame({'a': [1, 2, 3, 4, 100]})
+        >>> result = analyze_outliers(df)
+        >>> result['outlier_counts']['a'] > 0
+        True
+    """
+    numeric_cols = [col for col in df.columns if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]]
+
+    outlier_counts = {}
+    outlier_percentages = {}
+
+    for col in numeric_cols:
+        col_data = df[col].drop_nulls()
+
+        if len(col_data) == 0:
+            outlier_counts[col] = 0
+            outlier_percentages[col] = 0.0
+            continue
+
+        # Calculate IQR
+        q25 = col_data.quantile(0.25)
+        q75 = col_data.quantile(0.75)
+        iqr = q75 - q25
+
+        # Calculate bounds
+        lower_bound = q25 - 1.5 * iqr
+        upper_bound = q75 + 1.5 * iqr
+
+        # Count outliers
+        outliers = col_data.filter((col_data < lower_bound) | (col_data > upper_bound))
+        outlier_count = len(outliers)
+
+        outlier_counts[col] = outlier_count
+        outlier_percentages[col] = (outlier_count / len(col_data) * 100) if len(col_data) > 0 else 0.0
+
+    columns_with_outliers = [col for col, count in outlier_counts.items() if count > 0]
+
+    return {
+        'outlier_counts': outlier_counts,
+        'outlier_percentages': {k: round(v, 2) for k, v in outlier_percentages.items()},
+        'columns_with_outliers': columns_with_outliers
+    }

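The IQR arithmetic behind the docstring example, as a quick check (a sketch; the exact quartile values depend on Polars' default quantile interpolation, so the bounds are indicative):

import polars as pl
from additory.functions.analyze.outliers import analyze_outliers

# For [1, 2, 3, 4, 100]: q25 ~ 2 and q75 ~ 4, so iqr = 2 and the bounds are
# [2 - 1.5*2, 4 + 1.5*2] = [-1, 7]. Only 100 falls outside that range.
result = analyze_outliers(pl.DataFrame({"a": [1, 2, 3, 4, 100]}))
print(result["outlier_counts"])       # {'a': 1}
print(result["outlier_percentages"])  # {'a': 20.0}
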
additory/functions/analyze/patterns.py
@@ -0,0 +1,65 @@
+"""
+Pattern detection module.
+
+Detects common patterns in string columns like emails, phone numbers, etc.
+"""
+
+import polars as pl
+import re
+from typing import Dict, Any
+
+
+def analyze_patterns(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Detect patterns in string columns.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with pattern detection results:
+        - email_columns: List of columns containing emails
+        - phone_columns: List of columns containing phone numbers
+        - date_string_columns: List of columns containing date strings
+        - id_columns: List of columns containing IDs
+
+    Example:
+        >>> df = pl.DataFrame({'email': ['a@b.com', 'c@d.com']})
+        >>> result = analyze_patterns(df)
+        >>> 'email' in result['email_columns']
+        True
+    """
+    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
+    phone_pattern = r'^\+?[\d\s\-\(\)]{10,}$'
+
+    email_columns = []
+    phone_columns = []
+    date_string_columns = []
+    id_columns = []
+
+    for col in df.columns:
+        if df[col].dtype != pl.Utf8:
+            continue
+
+        # Sample first 100 non-null values
+        sample = df[col].drop_nulls().head(100).to_list()
+
+        if not sample:
+            continue
+
+        # Check for email pattern
+        email_matches = sum(1 for val in sample if re.match(email_pattern, str(val)))
+        if email_matches / len(sample) > 0.8:
+            email_columns.append(col)
+
+        # Check for phone pattern
+        phone_matches = sum(1 for val in sample if re.match(phone_pattern, str(val)))
+        if phone_matches / len(sample) > 0.8:
+            phone_columns.append(col)
+
+    return {
+        'email_columns': email_columns,
+        'phone_columns': phone_columns,
+        'date_string_columns': date_string_columns,
+        'id_columns': id_columns
+    }

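A small illustration of the > 0.8 match-rate threshold (column names invented for the example; import path inferred from the file layout):

import polars as pl
from additory.functions.analyze.patterns import analyze_patterns

df = pl.DataFrame({
    "contact": ["a@b.com", "c@d.com", "e@f.com", "not-an-email"],  # 3/4 = 75% match
    "email": ["a@b.com", "c@d.com", "e@f.com", "g@h.com"],         # 4/4 = 100% match
})
# Only 'email' clears the 80% threshold; 'contact' stays out.
print(analyze_patterns(df)["email_columns"])  # ['email']
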
additory/functions/analyze/presets.py
@@ -0,0 +1,72 @@
+"""
+Preset analysis configurations.
+
+Provides preset analysis configurations for quick and full analysis.
+"""
+
+from typing import Dict, List
+
+
+def get_preset_analyses(preset_name: str) -> Dict[str, bool]:
+    """
+    Get analyses for a preset.
+
+    Args:
+        preset_name: Name of preset ('quick' or 'full')
+
+    Returns:
+        Dictionary of analysis_name -> enabled
+
+    Example:
+        >>> analyses = get_preset_analyses('quick')
+        >>> analyses['quality']
+        True
+    """
+    presets = {
+        'quick': {
+            'quality': True,
+            'cardinality': True,
+            'types': True,
+            'distributions': False,
+            'correlations': False,
+            'features': False,
+            'patterns': False,
+            'outliers': False,
+            'duplicates': False,
+            'timeseries': False,
+            'imputation': False
+        },
+        'full': {
+            'quality': True,
+            'cardinality': True,
+            'types': True,
+            'distributions': True,
+            'correlations': True,
+            'features': True,
+            'patterns': True,
+            'outliers': True,
+            'duplicates': True,
+            'timeseries': False,  # Requires date_column
+            'imputation': True
+        }
+    }
+
+    if preset_name not in presets:
+        raise ValueError(f"Unknown preset '{preset_name}'. Available: {list(presets.keys())}")
+
+    return presets[preset_name]
+
+
+def list_presets() -> List[str]:
+    """
+    List available presets.
+
+    Returns:
+        List of preset names
+
+    Example:
+        >>> presets = list_presets()
+        >>> 'quick' in presets
+        True
+    """
+    return ['quick', 'full']

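How the analyze package wires these flags to the individual analyzers lives in analyze/__init__.py, which is not reproduced here; a hypothetical sketch of one way the preset dict could drive a run, using only functions shown in this diff:

import polars as pl
from additory.functions.analyze.presets import get_preset_analyses
from additory.functions.analyze.quality import analyze_quality
from additory.functions.analyze.types import analyze_types

# Hypothetical dispatch table -- the real registry is not shown in this diff view.
analyzers = {"quality": analyze_quality, "types": analyze_types}

df = pl.DataFrame({"a": [1, 2, None]})
report = {
    name: analyzers[name](df)
    for name, enabled in get_preset_analyses("quick").items()
    if enabled and name in analyzers
}
print(list(report))  # ['quality', 'types']
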
additory/functions/analyze/quality.py
@@ -0,0 +1,59 @@
+"""
+Data quality analysis module.
+
+Analyzes data quality metrics including missing values, type consistency,
+and format violations.
+"""
+
+import polars as pl
+from typing import Dict, Any
+
+
+def analyze_quality(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Analyze data quality metrics.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with quality metrics:
+        - missing_values: Dict of column -> count
+        - missing_percentages: Dict of column -> percentage
+        - total_rows: Total number of rows
+        - columns_with_nulls: List of columns with missing values
+        - quality_score: Overall quality score (0-100)
+
+    Example:
+        >>> df = pl.DataFrame({'a': [1, 2, None], 'b': [1, 2, 3]})
+        >>> result = analyze_quality(df)
+        >>> result['missing_values']
+        {'a': 1, 'b': 0}
+    """
+    total_rows = df.height
+    total_cells = total_rows * df.width
+
+    # Calculate missing values per column
+    missing_values = {}
+    missing_percentages = {}
+
+    for col in df.columns:
+        null_count = df[col].null_count()
+        missing_values[col] = null_count
+        missing_percentages[col] = (null_count / total_rows * 100) if total_rows > 0 else 0.0
+
+    # Identify columns with nulls
+    columns_with_nulls = [col for col, count in missing_values.items() if count > 0]
+
+    # Calculate overall quality score (percentage of non-null cells)
+    total_nulls = sum(missing_values.values())
+    quality_score = ((total_cells - total_nulls) / total_cells * 100) if total_cells > 0 else 100.0
+
+    return {
+        'missing_values': missing_values,
+        'missing_percentages': missing_percentages,
+        'total_rows': total_rows,
+        'total_columns': df.width,
+        'columns_with_nulls': columns_with_nulls,
+        'quality_score': round(quality_score, 2)
+    }

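The quality score is simply the share of non-null cells; for the docstring example, 3 rows x 2 columns = 6 cells with 1 null gives (6 - 1) / 6 * 100 = 83.33 after rounding. A one-line check (import path inferred from the file layout):

import polars as pl
from additory.functions.analyze.quality import analyze_quality

print(analyze_quality(pl.DataFrame({"a": [1, 2, None], "b": [1, 2, 3]}))["quality_score"])  # 83.33
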
additory/functions/analyze/timeseries.py
@@ -0,0 +1,53 @@
+"""
+Time series analysis module.
+
+Analyzes time series data for trends and patterns.
+"""
+
+import polars as pl
+from typing import Dict, Any, Optional
+
+
+def analyze_timeseries(df: pl.DataFrame, date_column: str) -> Dict[str, Any]:
+    """
+    Analyze time series data.
+
+    Args:
+        df: Polars DataFrame to analyze
+        date_column: Column containing dates
+
+    Returns:
+        Dictionary with timeseries analysis:
+        - date_column: Name of date column
+        - date_range: Tuple of (min_date, max_date)
+        - total_days: Number of days in range
+        - row_count: Number of rows
+        - frequency: Estimated frequency (daily, weekly, etc.)
+
+    Example:
+        >>> df = pl.DataFrame({'date': ['2024-01-01', '2024-01-02'], 'value': [1, 2]})
+        >>> df = df.with_columns(pl.col('date').str.strptime(pl.Date, '%Y-%m-%d'))
+        >>> result = analyze_timeseries(df, 'date')
+        >>> result['row_count']
+        2
+    """
+    if date_column not in df.columns:
+        raise ValueError(f"Column '{date_column}' not found in DataFrame")
+
+    # Get date range
+    min_date = df[date_column].min()
+    max_date = df[date_column].max()
+
+    # Calculate total days
+    if min_date is not None and max_date is not None:
+        total_days = (max_date - min_date).days if hasattr(max_date - min_date, 'days') else 0
+    else:
+        total_days = 0
+
+    return {
+        'date_column': date_column,
+        'date_range': (str(min_date), str(max_date)),
+        'total_days': total_days,
+        'row_count': df.height,
+        'frequency': 'unknown'
+    }

additory/functions/analyze/types.py
@@ -0,0 +1,45 @@
+"""
+Data type analysis module.
+
+Analyzes data types and provides type recommendations.
+"""
+
+import polars as pl
+from typing import Dict, Any
+
+
+def analyze_types(df: pl.DataFrame) -> Dict[str, Any]:
+    """
+    Analyze data types of columns.
+
+    Args:
+        df: Polars DataFrame to analyze
+
+    Returns:
+        Dictionary with type information:
+        - column_types: Dict of column -> type name
+        - numeric_columns: List of numeric columns
+        - string_columns: List of string columns
+        - boolean_columns: List of boolean columns
+        - date_columns: List of date/datetime columns
+
+    Example:
+        >>> df = pl.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
+        >>> result = analyze_types(df)
+        >>> result['numeric_columns']
+        ['a']
+    """
+    column_types = {col: str(df[col].dtype) for col in df.columns}
+
+    numeric_columns = [col for col in df.columns if df[col].dtype in [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt8, pl.UInt16, pl.UInt32, pl.UInt64, pl.Float32, pl.Float64]]
+    string_columns = [col for col in df.columns if df[col].dtype == pl.Utf8]
+    boolean_columns = [col for col in df.columns if df[col].dtype == pl.Boolean]
+    date_columns = [col for col in df.columns if df[col].dtype in [pl.Date, pl.Datetime]]
+
+    return {
+        'column_types': column_types,
+        'numeric_columns': numeric_columns,
+        'string_columns': string_columns,
+        'boolean_columns': boolean_columns,
+        'date_columns': date_columns
+    }

additory/functions/expressions/__init__.py
@@ -0,0 +1,161 @@
+"""
+Expressions function - Evaluate expressions and add result columns.
+
+Supports both inline expressions and references to named expressions.
+"""
+
+import polars as pl
+import time
+import re
+from typing import List
+
+from additory.core.backend import detect_backend, to_polars, from_polars
+from additory.core.logging import get_logger
+from additory.common.validation import validate_dataframe, validate_not_empty
+from additory.common.result import wrap_result
+from additory.expressions.engine import get_engine
+
+
+logger = get_logger()
+
+
+def expressions(df, *expressions_list):
+    """
+    Evaluate one or more expressions and add result columns.
+
+    Args:
+        df: Input DataFrame
+        *expressions_list: Variable number of expression strings
+
+    Returns:
+        DataFrameResult with computed columns
+
+    Examples:
+        >>> # Single expression
+        >>> result = expressions(df, 'inbuilt:bmi')
+
+        >>> # Multiple expressions
+        >>> result = expressions(df, 'inbuilt:bmi', 'inbuilt:bsa')
+
+        >>> # Inline expression
+        >>> result = expressions(df, 'weight / (height ** 2)')
+
+        >>> # Mix of inline and references
+        >>> result = expressions(df, 'inbuilt:bmi', 'age * 12')
+    """
+    start_time = time.time()
+
+    # Validate inputs
+    validate_dataframe(df, 'df')
+    validate_not_empty(df, 'df')
+
+    if not expressions_list:
+        raise ValueError("At least one expression required")
+
+    # Detect backend and convert to Polars
+    backend = detect_backend(df)
+    polars_df = to_polars(df)
+
+    # Get expression engine
+    engine = get_engine()
+
+    # Track columns added
+    columns_added = []
+
+    # Evaluate each expression
+    result_df = polars_df
+    for expr_str in expressions_list:
+        # Evaluate expression
+        result_series = engine.evaluate(result_df, expr_str)
+
+        # Determine column name
+        col_name = determine_column_name(expr_str)
+
+        # Add to DataFrame
+        result_df = result_df.with_columns(result_series.alias(col_name))
+        columns_added.append(col_name)
+
+        logger.info(f"Added column '{col_name}' from expression '{expr_str}'")
+
+    # Convert back to original backend
+    result_df = from_polars(result_df, backend)
+
+    # Calculate execution time
+    execution_time = time.time() - start_time
+
+    # Wrap result
+    metadata = {
+        'expressions_evaluated': len(expressions_list),
+        'columns_added': columns_added,
+        'input_shape': (polars_df.height, polars_df.width),
+        'execution_time': execution_time
+    }
+
+    return wrap_result(result_df, 'expressions', metadata)
+
+
+def determine_column_name(expression: str) -> str:
+    """
+    Determine column name for expression result.
+
+    Args:
+        expression: Expression string
+
+    Returns:
+        Column name
+
+    Examples:
+        >>> determine_column_name('inbuilt:bmi')
+        'bmi'
+        >>> determine_column_name('weight / height')
+        'weight_div_height'
+    """
+    # If reference (e.g., 'inbuilt:bmi'), use expression name
+    if ':' in expression:
+        parts = expression.split(':', 1)
+        if len(parts) == 2 and parts[0] in ['inbuilt', 'user', 'company']:
+            return parts[1]
+
+    # If inline, infer from expression
+    return infer_column_name(expression)
+
+
+def infer_column_name(expression: str) -> str:
+    """
+    Infer column name from inline expression.
+
+    Args:
+        expression: Inline expression string
+
+    Returns:
+        Inferred column name
+
+    Examples:
+        >>> infer_column_name('weight / height')
+        'weight_div_height'
+        >>> infer_column_name('sqrt(age)')
+        'sqrt_age'
+        >>> infer_column_name('age * 12')
+        'age_mul_12'
+    """
+    # Replace operators with words
+    name = expression
+    name = name.replace(' / ', '_div_')
+    name = name.replace(' * ', '_mul_')
+    name = name.replace(' + ', '_add_')
+    name = name.replace(' - ', '_sub_')
+    name = name.replace('**', '_pow_')
+
+    # Remove parentheses and spaces
+    name = name.replace('(', '_')
+    name = name.replace(')', '')
+    name = name.replace(' ', '')
+
+    # Remove trailing underscores
+    name = name.strip('_')
+
+    # If name is too long or complex, use generic name
+    if len(name) > 50 or not re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name):
+        return 'expr_result'
+
+    return name
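The column-naming rules can be exercised directly; a sketch assuming the module is importable as additory.functions.expressions:

from additory.functions.expressions import determine_column_name, infer_column_name

print(determine_column_name("inbuilt:bmi"))  # 'bmi' -- namespace references keep the expression name
print(infer_column_name("height ** 2"))      # 'height_pow_2'
print(infer_column_name("max(a, b)"))        # 'expr_result' -- the comma fails the identifier
                                             # check, so the generic fallback name is used
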