additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Group and aggregate data.
|
|
3
|
+
|
|
4
|
+
This module provides summarization functionality for the to function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import Union, List, Dict
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.core.logging import Logger
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
# Mapping of aggregation names to Polars methods
# Keys are the user-facing names accepted in the `aggregations` argument of
# perform_summarize(); values are the pl.Expr method names invoked via getattr
# in parse_aggregation_spec(). Do not reorder: the "Valid functions" error
# message lists the keys in this insertion order.
AGGREGATION_MAP = {
    'sum': 'sum',
    'mean': 'mean',
    'median': 'median',
    'min': 'min',
    'max': 'max',
    'count': 'count',
    'count_unique': 'n_unique',  # only entry whose Polars method name differs
    'std': 'std',
    'var': 'var',
    'first': 'first',
    'last': 'last',
}
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def perform_summarize(
    df: pl.DataFrame,
    group_by: Union[str, List[str]],
    aggregations: Dict[str, str],
    **kwargs
) -> pl.DataFrame:
    """
    Group a DataFrame by one or more key columns and aggregate.

    Args:
        df: Input DataFrame
        group_by: Single column name or list of column names to group by
        aggregations: Mapping of column name -> aggregation function name
        **kwargs: Additional parameters (reserved for future use)

    Returns:
        One row per group, with the aggregated columns.

    Raises:
        ValueError: If the DataFrame, group keys, or aggregation spec
            fail validation (see validate_summarize_parameters).

    Example:
        >>> result = perform_summarize(
        ...     df,
        ...     group_by='category',
        ...     aggregations={'sales': 'sum', 'orders': 'count', 'price': 'mean'}
        ... )

    Supported Aggregations:
        - 'sum', 'mean', 'median', 'min', 'max'
        - 'count', 'count_unique', 'std', 'var'
        - 'first', 'last'
    """
    logger = Logger()

    # Reject bad input before any work is done.
    validate_summarize_parameters(df, group_by, aggregations)

    # A bare column name is treated as a one-element key list.
    if isinstance(group_by, str):
        group_by_list = [group_by]
    else:
        group_by_list = group_by

    logger.info(f"Summarizing by {group_by_list}, {len(aggregations)} aggregations")

    # Build the per-column expressions, then group and aggregate in one pass.
    grouped = df.group_by(group_by_list)
    result = grouped.agg(parse_aggregation_spec(aggregations))

    logger.info(f"Summarize complete: {len(result)} groups")

    return result
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def validate_summarize_parameters(
    df: pl.DataFrame,
    group_by: Union[str, List[str]],
    aggregations: Dict[str, str]
) -> None:
    """
    Validate the inputs of perform_summarize.

    Checks, in order: the DataFrame itself, the existence of every group
    key, that at least one aggregation was requested, the existence of
    every aggregated column, and that every aggregation name is supported.

    Args:
        df: Input DataFrame
        group_by: Single column name or list of column names to group by
        aggregations: Mapping of column name -> aggregation function name

    Raises:
        ValueError: On the first failed check.
    """
    # DataFrame-level checks: correct type and at least one row.
    validate_dataframe(df)
    validate_not_empty(df)

    keys = [group_by] if isinstance(group_by, str) else group_by

    # Every grouping key must be an existing column.
    missing_cols = [name for name in keys if name not in df.columns]
    if missing_cols:
        raise ValueError(
            f"Group by columns not found in DataFrame: {missing_cols}. "
            f"Available columns: {df.columns}"
        )

    # At least one aggregation must be requested.
    if not aggregations:
        raise ValueError("Aggregations dictionary cannot be empty")

    # Every aggregated column must exist too.
    missing_agg_cols = [name for name in aggregations if name not in df.columns]
    if missing_agg_cols:
        raise ValueError(
            f"Aggregation columns not found in DataFrame: {missing_agg_cols}. "
            f"Available columns: {df.columns}"
        )

    # Every requested function must be one we know how to translate.
    invalid_funcs = [fn for fn in aggregations.values() if fn not in AGGREGATION_MAP]
    if invalid_funcs:
        raise ValueError(
            f"Invalid aggregation functions: {invalid_funcs}. "
            f"Valid functions: {list(AGGREGATION_MAP.keys())}"
        )
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def parse_aggregation_spec(aggregations: Dict[str, str]) -> List[pl.Expr]:
    """
    Translate an aggregation spec into Polars expressions.

    Each entry becomes ``pl.col(column).<method>()`` where the method name
    is looked up in AGGREGATION_MAP (e.g. 'count_unique' -> 'n_unique').

    Args:
        aggregations: Mapping of column name -> aggregation function name;
            names must already be validated against AGGREGATION_MAP.

    Returns:
        List of Polars expressions, one per spec entry, in dict order.

    Example:
        >>> # Input: {'sales': 'sum', 'orders': 'count'}
        >>> # Output: [pl.col('sales').sum(), pl.col('orders').count()]
    """
    # Every supported aggregation is a zero-argument pl.Expr method, so a
    # single getattr call covers all of them uniformly.
    return [
        getattr(pl.col(column), AGGREGATION_MAP[func])()
        for column, func in aggregations.items()
    ]
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Main transform function - transform DataFrame structure.
|
|
3
|
+
|
|
4
|
+
This module provides the main user-facing transform() function with 6 modes:
|
|
5
|
+
- transpose: Transpose DataFrame (rows ↔ columns)
|
|
6
|
+
- onehotencoding: One-hot encode categorical columns
|
|
7
|
+
- extract: Extract features from columns
|
|
8
|
+
- datetime: Normalize datetime columns
|
|
9
|
+
- harmonize: Harmonize units across columns
|
|
10
|
+
- knn: Impute missing values using KNN
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import polars as pl
|
|
14
|
+
from typing import Any, Optional, Dict, List, Union
|
|
15
|
+
|
|
16
|
+
from additory.core.backend import detect_backend, to_polars, from_polars
|
|
17
|
+
from additory.core.logging import Logger
|
|
18
|
+
from additory.core.memory_manager import MemoryManager
|
|
19
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
20
|
+
from additory.common.result import wrap_result
|
|
21
|
+
|
|
22
|
+
from additory.functions.transform.transpose import perform_transpose
|
|
23
|
+
from additory.functions.transform.onehotencoding import perform_onehotencoding
|
|
24
|
+
from additory.functions.transform.extract import perform_extract
|
|
25
|
+
from additory.functions.transform.datetime import perform_datetime_normalization
|
|
26
|
+
from additory.functions.transform.harmonize import perform_harmonize
|
|
27
|
+
from additory.functions.transform.knn import perform_knn_imputation
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def transform(
    df: Any,
    mode: str,
    columns: Optional[Union[str, List[str], Dict]] = None,
    strategy: Optional[Dict] = None,
    **kwargs
) -> Any:
    """
    Transform DataFrame structure using various modes.

    Args:
        df: Input DataFrame (any supported backend; converted via Polars)
        mode: Transformation mode
        columns: Columns to transform (mode-specific)
        strategy: Strategy dictionary for advanced control
        **kwargs: Mode-specific parameters

    Returns:
        Transformed DataFrame (wrapped in Result)

    Raises:
        ValueError: Unknown mode, or a required ``columns`` argument missing.
        TypeError: ``columns`` is not a dict in 'extract' mode.

    Modes:
        1. transpose: Transpose DataFrame (rows ↔ columns)
           - Example: transform(df, mode='transpose')

        2. onehotencoding: One-hot encode categorical columns
           - Example: transform(df, mode='onehotencoding', columns=['category'])

        3. extract: Extract features from columns
           - Example: transform(df, mode='extract', columns={'date': ['hour', 'day']})

        4. datetime: Normalize datetime columns
           - Example: transform(df, mode='datetime', columns=['birth_date'], strategy={...})

        5. harmonize: Harmonize units across columns
           - Example: transform(df, mode='harmonize', columns=['height'], strategy={...})

        6. knn: Impute missing values using KNN
           - Example: transform(df, mode='knn', columns=['age'], strategy={'k': 5})
    """
    logger = Logger()
    memory_manager = MemoryManager()

    try:
        # Validate input
        validate_dataframe(df)
        validate_not_empty(df)

        # Detect backend and convert to Polars for the actual work.
        backend = detect_backend(df)
        polars_df = to_polars(df)

        # Set logging context
        logger.set_context('transform', {'mode': mode})
        logger.info(f"Starting transform() function in '{mode}' mode")

        # Mode dispatch lives in a helper so each mode's argument checks
        # are stated once, not six times.
        result = _dispatch_transform(polars_df, mode, columns, strategy)

        # Convert back to original backend
        result = from_polars(result, backend)

        # Cleanup intermediate buffers before returning.
        memory_manager.cleanup()

        logger.info(f"transform() function complete: {len(result)} rows, {len(result.columns)} columns")
        return wrap_result(result, 'transform', metadata={'mode': mode})

    except Exception as e:
        # Fix: cleanup previously ran only on success, leaking intermediate
        # buffers whenever a mode raised. Release them on failure too.
        memory_manager.cleanup()
        logger.error(f"Error in transform() function: {str(e)}", error_location="transform")
        raise


def _dispatch_transform(
    polars_df: pl.DataFrame,
    mode: str,
    columns: Optional[Union[str, List[str], Dict]],
    strategy: Optional[Dict]
) -> pl.DataFrame:
    """Route a Polars DataFrame to the mode-specific implementation.

    Error messages match the original per-mode checks exactly.
    """
    # 'transpose' is the only mode that needs no columns argument.
    if mode == 'transpose':
        return perform_transpose(polars_df)

    # Validate the mode before the columns check so an unknown mode is
    # reported as such, never as a missing-columns error.
    if mode not in ('onehotencoding', 'extract', 'datetime', 'harmonize', 'knn'):
        raise ValueError(
            f"Unknown mode: {mode}. "
            f"Valid modes: transpose, onehotencoding, extract, datetime, harmonize, knn"
        )

    # Every remaining mode requires an explicit columns argument.
    if columns is None:
        raise ValueError(f"columns parameter required for {mode} mode")

    if mode == 'onehotencoding':
        return perform_onehotencoding(polars_df, columns)

    if mode == 'extract':
        # extract is the only mode that takes a {column: features} mapping.
        if not isinstance(columns, dict):
            raise TypeError("columns must be a dictionary for extract mode")
        return perform_extract(polars_df, columns, strategy)

    # datetime / harmonize / knn all accept a single name or a list.
    columns_list = [columns] if isinstance(columns, str) else columns

    if mode == 'datetime':
        return perform_datetime_normalization(polars_df, columns_list, strategy)
    if mode == 'harmonize':
        return perform_harmonize(polars_df, columns_list, strategy)
    return perform_knn_imputation(polars_df, columns_list, strategy)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
__all__ = ['transform']
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Normalize datetime columns.
|
|
3
|
+
|
|
4
|
+
This module provides datetime normalization functionality for the transform function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import List, Dict, Optional
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.core.logging import Logger
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def perform_datetime_normalization(
    df: pl.DataFrame,
    columns: List[str],
    strategy: Optional[Dict] = None
) -> pl.DataFrame:
    """
    Normalize datetime columns.

    String columns are parsed into datetimes first (using the per-column
    'format', default '%Y-%m-%d'); if an 'output_format' is given, the
    column is then rendered back to strings in that format.

    Args:
        df: Input DataFrame
        columns: Columns to normalize
        strategy: Optional per-column dict with 'format' / 'output_format'

    Returns:
        DataFrame with normalized datetime columns

    Raises:
        ValueError: Empty column list, or a listed column is missing.

    Example:
        >>> result = perform_datetime_normalization(df,
        ...     columns=['birth_date'],
        ...     strategy={
        ...         'birth_date': {
        ...             'format': '%Y-%m-%d',
        ...             'output_format': '%d-%b-%Y'
        ...         }
        ...     })
    """
    logger = Logger()

    # Input checks: valid, non-empty DataFrame and existing columns.
    validate_dataframe(df)
    validate_not_empty(df)

    if not columns:
        raise ValueError("columns list cannot be empty")

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise ValueError(f"Columns not found: {missing}")

    logger.info(f"Normalizing {len(columns)} datetime columns")

    per_column = strategy or {}
    out = df

    for name in columns:
        spec = per_column.get(name, {})

        # Step 1: parse string columns into Datetime. strict=False turns
        # unparseable values into nulls instead of raising.
        if out[name].dtype == pl.Utf8:
            parse_fmt = spec.get('format', '%Y-%m-%d')
            out = out.with_columns(
                pl.col(name).str.strptime(pl.Datetime, parse_fmt, strict=False).alias(name)
            )

        # Step 2: optionally render back to a string representation.
        render_fmt = spec.get('output_format')
        if render_fmt:
            out = out.with_columns(
                pl.col(name).dt.strftime(render_fmt).alias(name)
            )

    logger.info(f"Datetime normalization complete")

    return out
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Extract features from columns.
|
|
3
|
+
|
|
4
|
+
This module provides feature extraction functionality for the transform function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import Dict, Optional
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.common.extractors import (
|
|
12
|
+
extract_datetime_features,
|
|
13
|
+
extract_email_features,
|
|
14
|
+
extract_text_features
|
|
15
|
+
)
|
|
16
|
+
from additory.core.logging import Logger
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def perform_extract(
    df: pl.DataFrame,
    columns: Dict[str, list],
    strategy: Optional[Dict] = None
) -> pl.DataFrame:
    """
    Extract features from columns.

    The extractor is chosen by column dtype: Datetime/Date columns use the
    datetime extractor; string columns use the email extractor when email
    feature names ('domain'/'username') are requested, otherwise the text
    extractor. Each extracted feature is appended as a new column named
    ``<column>_<feature>``.

    Args:
        df: Input DataFrame
        columns: Dictionary mapping column to features to extract
        strategy: Optional strategy for extraction (currently unused here)

    Returns:
        DataFrame with extracted feature columns appended

    Raises:
        ValueError: Empty columns mapping, or a listed column is missing.

    Example:
        >>> result = perform_extract(df, columns={
        ...     'timestamp': ['hour', 'day_of_week', 'quarter'],
        ...     'email': ['domain'],
        ...     'tags': ['split']
        ... })
    """
    logger = Logger()

    # Input checks: valid, non-empty DataFrame and non-empty spec.
    validate_dataframe(df)
    validate_not_empty(df)

    if not columns:
        raise ValueError("columns dictionary cannot be empty")

    logger.info(f"Extracting features from {len(columns)} columns")

    result = df

    for col, features in columns.items():
        if col not in df.columns:
            raise ValueError(f"Column not found: {col}")

        dtype = df[col].dtype

        if dtype in (pl.Datetime, pl.Date):
            extracted = extract_datetime_features(df[col], features)
        elif dtype == pl.Utf8:
            # Email-specific feature names select the email extractor;
            # any other string column is treated as plain text.
            wants_email = 'domain' in features or 'username' in features
            extractor = extract_email_features if wants_email else extract_text_features
            extracted = extractor(df[col], features)
        else:
            # Unsupported dtypes are skipped rather than failing the run.
            logger.warning(f"Unsupported column type for extraction: {dtype}")
            continue

        # Append each extracted feature as <source-column>_<feature>.
        for feature_name, feature_series in extracted.items():
            result = result.with_columns(feature_series.alias(f"{col}_{feature_name}"))

    logger.info(f"Feature extraction complete: {len(result.columns)} columns")

    return result
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Harmonize units across columns.
|
|
3
|
+
|
|
4
|
+
This module provides unit harmonization functionality for the transform function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import List, Dict, Optional
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.common.unit_conversions import (
|
|
12
|
+
convert_weight, convert_temperature, convert_distance,
|
|
13
|
+
convert_currency, convert_volume, convert_time, detect_unit_type
|
|
14
|
+
)
|
|
15
|
+
from additory.core.logging import Logger
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def perform_harmonize(
    df: pl.DataFrame,
    columns: List[str],
    strategy: Optional[Dict] = None
) -> pl.DataFrame:
    """
    Harmonize units across columns.

    For each column, the unit family (weight, temperature, distance,
    currency, volume, time) is detected from the 'from' unit and the
    matching converter is applied element-wise; nulls pass through.

    Args:
        df: Input DataFrame
        columns: Columns to harmonize
        strategy: Required per-column dict with 'from' and 'to' units

    Returns:
        DataFrame with harmonized units (converted columns become Float64)

    Raises:
        ValueError: Empty/missing columns, missing strategy, incomplete
            'from'/'to' pair, or an unrecognized unit family.

    Example:
        >>> result = perform_harmonize(df,
        ...     columns=['height', 'weight'],
        ...     strategy={
        ...         'height': {'from': 'cm', 'to': 'm'},
        ...         'weight': {'from': 'lb', 'to': 'kg'}
        ...     })
    """
    logger = Logger()

    # Input checks: valid, non-empty DataFrame, existing columns, strategy given.
    validate_dataframe(df)
    validate_not_empty(df)

    if not columns:
        raise ValueError("columns list cannot be empty")

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise ValueError(f"Columns not found: {missing}")

    if not strategy:
        raise ValueError("strategy is required for harmonization")

    logger.info(f"Harmonizing {len(columns)} columns")

    # One converter per unit family, keyed by detect_unit_type's result.
    converters = {
        'weight': convert_weight,
        'temperature': convert_temperature,
        'distance': convert_distance,
        'currency': convert_currency,
        'volume': convert_volume,
        'time': convert_time,
    }

    result = df

    for col in columns:
        # Columns with no strategy entry are skipped with a warning.
        if col not in strategy:
            logger.warning(f"No strategy provided for column: {col}")
            continue

        col_strategy = strategy[col]
        from_unit = col_strategy.get('from')
        to_unit = col_strategy.get('to')

        if not from_unit or not to_unit:
            raise ValueError(f"Both 'from' and 'to' units required for column: {col}")

        # Pick the converter for this unit family.
        unit_type = detect_unit_type(from_unit)
        converter = converters.get(unit_type)
        if converter is None:
            raise ValueError(f"Unsupported unit type: {unit_type}")

        # Element-wise conversion; nulls pass through untouched. The loop
        # variables are bound as lambda defaults so each lambda carries its
        # own converter/units regardless of when it is invoked.
        result = result.with_columns(
            pl.col(col).map_elements(
                lambda x, _conv=converter, _src=from_unit, _dst=to_unit: (
                    _conv(x, _src, _dst) if x is not None else None
                ),
                return_dtype=pl.Float64
            ).alias(col)
        )

    logger.info(f"Unit harmonization complete")

    return result
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""
|
|
2
|
+
KNN imputation for missing values.
|
|
3
|
+
|
|
4
|
+
This module provides KNN imputation functionality for the transform function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import List, Optional, Dict
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.common.knn_imputation import knn_impute
|
|
12
|
+
from additory.core.logging import Logger
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def perform_knn_imputation(
    df: pl.DataFrame,
    columns: List[str],
    strategy: Optional[Dict] = None
) -> pl.DataFrame:
    """
    Impute missing values using KNN.

    Thin wrapper around the shared ``knn_impute`` helper: validates the
    inputs, unpacks the strategy (defaults: k=5, weights='uniform',
    metric='euclidean') and delegates.

    Args:
        df: Input DataFrame
        columns: Columns to impute
        strategy: Imputation strategy ('k', 'weights', 'metric')

    Returns:
        DataFrame with imputed values

    Raises:
        ValueError: Empty column list, or a listed column is missing.

    Example:
        >>> result = perform_knn_imputation(df,
        ...     columns=['age', 'income'],
        ...     strategy={'k': 5, 'weights': 'distance'})
    """
    logger = Logger()

    # Input checks: valid, non-empty DataFrame and existing columns.
    validate_dataframe(df)
    validate_not_empty(df)

    if not columns:
        raise ValueError("columns list cannot be empty")

    missing = [name for name in columns if name not in df.columns]
    if missing:
        raise ValueError(f"Columns not found: {missing}")

    logger.info(f"Performing KNN imputation on {len(columns)} columns")

    # Unpack strategy with the documented defaults.
    opts = strategy or {}

    # Neighbour search and filling are handled by the shared helper.
    result = knn_impute(
        df,
        columns,
        k=opts.get('k', 5),
        weights=opts.get('weights', 'uniform'),
        metric=opts.get('metric', 'euclidean'),
    )

    logger.info(f"KNN imputation complete")

    return result
|