additory 0.1.0a3-py3-none-any.whl → 0.1.1a1-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (120)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/functions/to/summarize.py
@@ -0,0 +1,170 @@
+ """
+ Group and aggregate data.
+
+ This module provides summarization functionality for the to() function.
+ """
+
+ import polars as pl
+ from typing import Union, List, Dict
+
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.core.logging import Logger
+
+
+ # Mapping of aggregation names to Polars methods
+ AGGREGATION_MAP = {
+     'sum': 'sum',
+     'mean': 'mean',
+     'median': 'median',
+     'min': 'min',
+     'max': 'max',
+     'count': 'count',
+     'count_unique': 'n_unique',
+     'std': 'std',
+     'var': 'var',
+     'first': 'first',
+     'last': 'last',
+ }
+
+
+ def perform_summarize(
+     df: pl.DataFrame,
+     group_by: Union[str, List[str]],
+     aggregations: Dict[str, str],
+     **kwargs
+ ) -> pl.DataFrame:
+     """
+     Group and aggregate a DataFrame.
+
+     Args:
+         df: Input DataFrame
+         group_by: Column(s) to group by
+         aggregations: Dictionary mapping column name to aggregation function
+         **kwargs: Additional parameters (reserved for future use)
+
+     Returns:
+         Summarized DataFrame
+
+     Example:
+         >>> result = perform_summarize(
+         ...     df,
+         ...     group_by='category',
+         ...     aggregations={'sales': 'sum', 'orders': 'count', 'price': 'mean'}
+         ... )
+
+     Supported Aggregations:
+         - 'sum', 'mean', 'median', 'min', 'max'
+         - 'count', 'count_unique', 'std', 'var'
+         - 'first', 'last'
+     """
+     logger = Logger()
+
+     # Validate parameters
+     validate_summarize_parameters(df, group_by, aggregations)
+
+     # Normalize group_by to a list
+     group_by_list = [group_by] if isinstance(group_by, str) else group_by
+
+     # Log operation
+     logger.info(f"Summarizing by {group_by_list}, {len(aggregations)} aggregations")
+
+     # Parse aggregation specifications
+     agg_exprs = parse_aggregation_spec(aggregations)
+
+     # Perform group-by and aggregation
+     result = df.group_by(group_by_list).agg(agg_exprs)
+
+     logger.info(f"Summarize complete: {len(result)} groups")
+
+     return result
+
+
+ def validate_summarize_parameters(
+     df: pl.DataFrame,
+     group_by: Union[str, List[str]],
+     aggregations: Dict[str, str]
+ ) -> None:
+     """
+     Validate summarize parameters.
+
+     Args:
+         df: Input DataFrame
+         group_by: Column(s) to group by
+         aggregations: Dictionary mapping column name to aggregation function
+
+     Raises:
+         ValueError: If validation fails
+     """
+     # Validate DataFrame
+     validate_dataframe(df)
+     validate_not_empty(df)
+
+     # Normalize group_by to a list
+     group_by_list = [group_by] if isinstance(group_by, str) else group_by
+
+     # Check that group_by columns exist
+     missing_cols = [col for col in group_by_list if col not in df.columns]
+     if missing_cols:
+         raise ValueError(
+             f"Group by columns not found in DataFrame: {missing_cols}. "
+             f"Available columns: {df.columns}"
+         )
+
+     # Check that aggregations is not empty
+     if not aggregations:
+         raise ValueError("Aggregations dictionary cannot be empty")
+
+     # Check that aggregation columns exist
+     missing_agg_cols = [col for col in aggregations.keys() if col not in df.columns]
+     if missing_agg_cols:
+         raise ValueError(
+             f"Aggregation columns not found in DataFrame: {missing_agg_cols}. "
+             f"Available columns: {df.columns}"
+         )
+
+     # Check that aggregation functions are valid
+     invalid_funcs = [
+         func for func in aggregations.values()
+         if func not in AGGREGATION_MAP
+     ]
+     if invalid_funcs:
+         raise ValueError(
+             f"Invalid aggregation functions: {invalid_funcs}. "
+             f"Valid functions: {list(AGGREGATION_MAP.keys())}"
+         )
+
+
+ def parse_aggregation_spec(aggregations: Dict[str, str]) -> List[pl.Expr]:
+     """
+     Parse aggregation specifications into Polars expressions.
+
+     Args:
+         aggregations: Dictionary mapping column name to aggregation function
+
+     Returns:
+         List of Polars expressions
+
+     Example:
+         >>> # Input: {'sales': 'sum', 'orders': 'count'}
+         >>> # Output: [pl.col('sales').sum(), pl.col('orders').count()]
+     """
+     exprs = []
+
+     for col, func in aggregations.items():
+         # Get the Polars method name
+         polars_method = AGGREGATION_MAP[func]
+
+         # Create the expression
+         if polars_method == 'n_unique':
+             # Special case for count_unique
+             expr = pl.col(col).n_unique()
+         elif polars_method == 'count':
+             # count counts non-null values
+             expr = pl.col(col).count()
+         else:
+             # Standard aggregation
+             expr = getattr(pl.col(col), polars_method)()
+
+         exprs.append(expr)
+
+     return exprs
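
For reference, here is a minimal usage sketch of perform_summarize as added above. The sample DataFrame is illustrative, not part of the package:

    import polars as pl
    from additory.functions.to.summarize import perform_summarize

    df = pl.DataFrame({
        "category": ["a", "a", "b"],
        "sales": [10.0, 20.0, 5.0],
        "orders": [1, 2, 1],
    })

    # One row per category: sales summed, orders counted (count skips nulls)
    result = perform_summarize(
        df,
        group_by="category",
        aggregations={"sales": "sum", "orders": "count"},
    )
    print(result)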
additory/functions/transform/__init__.py
@@ -0,0 +1,140 @@
+ """
+ Main transform function - transform DataFrame structure.
+
+ This module provides the main user-facing transform() function with 6 modes:
+ - transpose: Transpose DataFrame (rows ↔ columns)
+ - onehotencoding: One-hot encode categorical columns
+ - extract: Extract features from columns
+ - datetime: Normalize datetime columns
+ - harmonize: Harmonize units across columns
+ - knn: Impute missing values using KNN
+ """
+
+ import polars as pl
+ from typing import Any, Optional, Dict, List, Union
+
+ from additory.core.backend import detect_backend, to_polars, from_polars
+ from additory.core.logging import Logger
+ from additory.core.memory_manager import MemoryManager
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.common.result import wrap_result
+
+ from additory.functions.transform.transpose import perform_transpose
+ from additory.functions.transform.onehotencoding import perform_onehotencoding
+ from additory.functions.transform.extract import perform_extract
+ from additory.functions.transform.datetime import perform_datetime_normalization
+ from additory.functions.transform.harmonize import perform_harmonize
+ from additory.functions.transform.knn import perform_knn_imputation
+
+
+ def transform(
+     df: Any,
+     mode: str,
+     columns: Optional[Union[str, List[str], Dict]] = None,
+     strategy: Optional[Dict] = None,
+     **kwargs
+ ) -> Any:
+     """
+     Transform DataFrame structure using various modes.
+
+     Args:
+         df: Input DataFrame
+         mode: Transformation mode
+         columns: Columns to transform (mode-specific)
+         strategy: Strategy dictionary for advanced control
+         **kwargs: Mode-specific parameters
+
+     Returns:
+         Transformed DataFrame (wrapped in a Result)
+
+     Modes:
+         1. transpose: Transpose DataFrame (rows ↔ columns)
+            - Example: transform(df, mode='transpose')
+
+         2. onehotencoding: One-hot encode categorical columns
+            - Example: transform(df, mode='onehotencoding', columns=['category'])
+
+         3. extract: Extract features from columns
+            - Example: transform(df, mode='extract', columns={'date': ['hour', 'day']})
+
+         4. datetime: Normalize datetime columns
+            - Example: transform(df, mode='datetime', columns=['birth_date'], strategy={...})
+
+         5. harmonize: Harmonize units across columns
+            - Example: transform(df, mode='harmonize', columns=['height'], strategy={...})
+
+         6. knn: Impute missing values using KNN
+            - Example: transform(df, mode='knn', columns=['age'], strategy={'k': 5})
+     """
+     logger = Logger()
+     memory_manager = MemoryManager()
+
+     try:
+         # Validate input
+         validate_dataframe(df)
+         validate_not_empty(df)
+
+         # Detect backend and convert to Polars
+         backend = detect_backend(df)
+         polars_df = to_polars(df)
+
+         # Set logging context
+         logger.set_context('transform', {'mode': mode})
+         logger.info(f"Starting transform() function in '{mode}' mode")
+
+         # Dispatch to the appropriate mode
+         if mode == 'transpose':
+             result = perform_transpose(polars_df)
+
+         elif mode == 'onehotencoding':
+             if columns is None:
+                 raise ValueError("columns parameter required for onehotencoding mode")
+             result = perform_onehotencoding(polars_df, columns)
+
+         elif mode == 'extract':
+             if columns is None:
+                 raise ValueError("columns parameter required for extract mode")
+             if not isinstance(columns, dict):
+                 raise TypeError("columns must be a dictionary for extract mode")
+             result = perform_extract(polars_df, columns, strategy)
+
+         elif mode == 'datetime':
+             if columns is None:
+                 raise ValueError("columns parameter required for datetime mode")
+             columns_list = [columns] if isinstance(columns, str) else columns
+             result = perform_datetime_normalization(polars_df, columns_list, strategy)
+
+         elif mode == 'harmonize':
+             if columns is None:
+                 raise ValueError("columns parameter required for harmonize mode")
+             columns_list = [columns] if isinstance(columns, str) else columns
+             result = perform_harmonize(polars_df, columns_list, strategy)
+
+         elif mode == 'knn':
+             if columns is None:
+                 raise ValueError("columns parameter required for knn mode")
+             columns_list = [columns] if isinstance(columns, str) else columns
+             result = perform_knn_imputation(polars_df, columns_list, strategy)
+
+         else:
+             raise ValueError(
+                 f"Unknown mode: {mode}. "
+                 f"Valid modes: transpose, onehotencoding, extract, datetime, harmonize, knn"
+             )
+
+         # Convert back to the original backend
+         result = from_polars(result, backend)
+
+         # Cleanup
+         memory_manager.cleanup()
+
+         # Wrap the result
+         logger.info(f"transform() function complete: {len(result)} rows, {len(result.columns)} columns")
+         return wrap_result(result, 'transform', metadata={'mode': mode})
+
+     except Exception as e:
+         logger.error(f"Error in transform() function: {str(e)}", error_location="transform")
+         raise
+
+
+ __all__ = ['transform']
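
A usage sketch of the new transform() entry point in 'knn' mode. The toy data is illustrative, and note that the return value is the Result wrapper produced by wrap_result, not a bare DataFrame:

    import polars as pl
    from additory.functions.transform import transform

    df = pl.DataFrame({
        "age": [25.0, None, 40.0, 33.0],
        "income": [50000.0, 62000.0, None, 71000.0],
    })

    # Dispatches to perform_knn_imputation, then wraps the output via wrap_result
    out = transform(df, mode="knn", columns=["age", "income"], strategy={"k": 2})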
additory/functions/transform/datetime.py
@@ -0,0 +1,79 @@
+ """
+ Normalize datetime columns.
+
+ This module provides datetime normalization functionality for the transform() function.
+ """
+
+ import polars as pl
+ from typing import List, Dict, Optional
+
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.core.logging import Logger
+
+
+ def perform_datetime_normalization(
+     df: pl.DataFrame,
+     columns: List[str],
+     strategy: Optional[Dict] = None
+ ) -> pl.DataFrame:
+     """
+     Normalize datetime columns.
+
+     Args:
+         df: Input DataFrame
+         columns: Columns to normalize
+         strategy: Normalization strategy per column
+
+     Returns:
+         DataFrame with normalized datetime columns
+
+     Example:
+         >>> result = perform_datetime_normalization(df,
+         ...     columns=['birth_date'],
+         ...     strategy={
+         ...         'birth_date': {
+         ...             'format': '%Y-%m-%d',
+         ...             'output_format': '%d-%b-%Y'
+         ...         }
+         ...     })
+     """
+     logger = Logger()
+
+     # Validate
+     validate_dataframe(df)
+     validate_not_empty(df)
+
+     if not columns:
+         raise ValueError("columns list cannot be empty")
+
+     # Validate that the columns exist
+     missing = [col for col in columns if col not in df.columns]
+     if missing:
+         raise ValueError(f"Columns not found: {missing}")
+
+     logger.info(f"Normalizing {len(columns)} datetime columns")
+
+     result = df
+     strategy = strategy or {}
+
+     # Normalize each column
+     for col in columns:
+         col_strategy = strategy.get(col, {})
+
+         # Parse as datetime if the column holds strings
+         if result[col].dtype == pl.Utf8:
+             fmt = col_strategy.get('format', '%Y-%m-%d')
+             result = result.with_columns(
+                 pl.col(col).str.strptime(pl.Datetime, fmt, strict=False).alias(col)
+             )
+
+         # Render to the desired output format if specified
+         output_fmt = col_strategy.get('output_format')
+         if output_fmt:
+             result = result.with_columns(
+                 pl.col(col).dt.strftime(output_fmt).alias(col)
+             )
+
+     logger.info("Datetime normalization complete")
+
+     return result
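
A sketch of the two-step behavior above (parse string dates with 'format', then re-render when 'output_format' is given); the sample dates are illustrative:

    import polars as pl
    from additory.functions.transform.datetime import perform_datetime_normalization

    df = pl.DataFrame({"birth_date": ["1990-01-31", "1985-12-24"]})

    out = perform_datetime_normalization(
        df,
        columns=["birth_date"],
        strategy={"birth_date": {"format": "%Y-%m-%d", "output_format": "%d-%b-%Y"}},
    )
    # birth_date is parsed via strptime, then rendered back as strings like "31-Jan-1990"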
additory/functions/transform/extract.py
@@ -0,0 +1,85 @@
+ """
+ Extract features from columns.
+
+ This module provides feature extraction functionality for the transform() function.
+ """
+
+ import polars as pl
+ from typing import Dict, Optional
+
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.common.extractors import (
+     extract_datetime_features,
+     extract_email_features,
+     extract_text_features
+ )
+ from additory.core.logging import Logger
+
+
+ def perform_extract(
+     df: pl.DataFrame,
+     columns: Dict[str, list],
+     strategy: Optional[Dict] = None
+ ) -> pl.DataFrame:
+     """
+     Extract features from columns.
+
+     Args:
+         df: Input DataFrame
+         columns: Dictionary mapping column name to the features to extract
+         strategy: Optional strategy for extraction
+
+     Returns:
+         DataFrame with extracted features
+
+     Example:
+         >>> result = perform_extract(df, columns={
+         ...     'timestamp': ['hour', 'day_of_week', 'quarter'],
+         ...     'email': ['domain'],
+         ...     'tags': ['split']
+         ... })
+     """
+     logger = Logger()
+
+     # Validate
+     validate_dataframe(df)
+     validate_not_empty(df)
+
+     if not columns:
+         raise ValueError("columns dictionary cannot be empty")
+
+     logger.info(f"Extracting features from {len(columns)} columns")
+
+     result = df
+
+     # Extract features from each column
+     for col, features in columns.items():
+         if col not in df.columns:
+             raise ValueError(f"Column not found: {col}")
+
+         # Determine the column type and extract accordingly
+         col_dtype = df[col].dtype
+
+         extracted_features = {}
+
+         if col_dtype in [pl.Datetime, pl.Date]:
+             # Datetime extraction
+             extracted_features = extract_datetime_features(df[col], features)
+         elif col_dtype == pl.Utf8:
+             # Decide between email and generic text extraction
+             if 'domain' in features or 'username' in features:
+                 extracted_features = extract_email_features(df[col], features)
+             else:
+                 extracted_features = extract_text_features(df[col], features)
+         else:
+             logger.warning(f"Unsupported column type for extraction: {col_dtype}")
+             continue
+
+         # Add the extracted features as new columns
+         for feature_name, feature_series in extracted_features.items():
+             new_col_name = f"{col}_{feature_name}"
+             result = result.with_columns(feature_series.alias(new_col_name))
+
+     logger.info(f"Feature extraction complete: {len(result.columns)} columns")
+
+     return result
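
A sketch of perform_extract on a datetime column and an email column. The feature names ('hour', 'domain') come from the docstring above; the full set of supported names lives in additory/common/extractors.py, which is not shown in this diff hunk:

    import polars as pl
    from datetime import datetime
    from additory.functions.transform.extract import perform_extract

    df = pl.DataFrame({
        "timestamp": [datetime(2024, 1, 1, 9, 30), datetime(2024, 6, 2, 17, 0)],
        "email": ["alice@example.com", "bob@example.org"],
    })

    # Adds timestamp_hour and email_domain alongside the original columns
    out = perform_extract(df, columns={"timestamp": ["hour"], "email": ["domain"]})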
additory/functions/transform/harmonize.py
@@ -0,0 +1,105 @@
+ """
+ Harmonize units across columns.
+
+ This module provides unit harmonization functionality for the transform() function.
+ """
+
+ import polars as pl
+ from typing import List, Dict, Optional
+
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.common.unit_conversions import (
+     convert_weight, convert_temperature, convert_distance,
+     convert_currency, convert_volume, convert_time, detect_unit_type
+ )
+ from additory.core.logging import Logger
+
+
+ def perform_harmonize(
+     df: pl.DataFrame,
+     columns: List[str],
+     strategy: Optional[Dict] = None
+ ) -> pl.DataFrame:
+     """
+     Harmonize units across columns.
+
+     Args:
+         df: Input DataFrame
+         columns: Columns to harmonize
+         strategy: Harmonization strategy per column
+
+     Returns:
+         DataFrame with harmonized units
+
+     Example:
+         >>> result = perform_harmonize(df,
+         ...     columns=['height', 'weight'],
+         ...     strategy={
+         ...         'height': {'from': 'cm', 'to': 'm'},
+         ...         'weight': {'from': 'lb', 'to': 'kg'}
+         ...     })
+     """
+     logger = Logger()
+
+     # Validate
+     validate_dataframe(df)
+     validate_not_empty(df)
+
+     if not columns:
+         raise ValueError("columns list cannot be empty")
+
+     # Validate that the columns exist
+     missing = [col for col in columns if col not in df.columns]
+     if missing:
+         raise ValueError(f"Columns not found: {missing}")
+
+     if not strategy:
+         raise ValueError("strategy is required for harmonization")
+
+     logger.info(f"Harmonizing {len(columns)} columns")
+
+     result = df
+
+     # Harmonize each column
+     for col in columns:
+         if col not in strategy:
+             logger.warning(f"No strategy provided for column: {col}")
+             continue
+
+         col_strategy = strategy[col]
+         from_unit = col_strategy.get('from')
+         to_unit = col_strategy.get('to')
+
+         if not from_unit or not to_unit:
+             raise ValueError(f"Both 'from' and 'to' units required for column: {col}")
+
+         # Detect the unit type from the source unit
+         unit_type = detect_unit_type(from_unit)
+
+         # Select the appropriate conversion function
+         if unit_type == 'weight':
+             converter = convert_weight
+         elif unit_type == 'temperature':
+             converter = convert_temperature
+         elif unit_type == 'distance':
+             converter = convert_distance
+         elif unit_type == 'currency':
+             converter = convert_currency
+         elif unit_type == 'volume':
+             converter = convert_volume
+         elif unit_type == 'time':
+             converter = convert_time
+         else:
+             raise ValueError(f"Unsupported unit type: {unit_type}")
+
+         # Convert element-wise; with_columns runs eagerly, so the lambda sees this iteration's converter/units
+         result = result.with_columns(
+             pl.col(col).map_elements(
+                 lambda x: converter(x, from_unit, to_unit) if x is not None else None,
+                 return_dtype=pl.Float64
+             ).alias(col)
+         )
+
+     logger.info("Unit harmonization complete")
+
+     return result
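
A sketch of perform_harmonize using the unit names from the docstring ('cm' to 'm', 'lb' to 'kg'); the set of recognized units is defined in additory/common/unit_conversions.py, not shown here:

    import polars as pl
    from additory.functions.transform.harmonize import perform_harmonize

    df = pl.DataFrame({"height": [170.0, 182.5], "weight": [150.0, 200.0]})

    out = perform_harmonize(
        df,
        columns=["height", "weight"],
        strategy={
            "height": {"from": "cm", "to": "m"},
            "weight": {"from": "lb", "to": "kg"},
        },
    )
    # Values are converted element-wise via map_elements; nulls pass through unchanged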
additory/functions/transform/knn.py
@@ -0,0 +1,62 @@
+ """
+ KNN imputation for missing values.
+
+ This module provides KNN imputation functionality for the transform() function.
+ """
+
+ import polars as pl
+ from typing import List, Optional, Dict
+
+ from additory.common.validation import validate_dataframe, validate_not_empty
+ from additory.common.knn_imputation import knn_impute
+ from additory.core.logging import Logger
+
+
+ def perform_knn_imputation(
+     df: pl.DataFrame,
+     columns: List[str],
+     strategy: Optional[Dict] = None
+ ) -> pl.DataFrame:
+     """
+     Impute missing values using KNN.
+
+     Args:
+         df: Input DataFrame
+         columns: Columns to impute
+         strategy: Imputation strategy (k, weights, etc.)
+
+     Returns:
+         DataFrame with imputed values
+
+     Example:
+         >>> result = perform_knn_imputation(df,
+         ...     columns=['age', 'income'],
+         ...     strategy={'k': 5, 'weights': 'distance'})
+     """
+     logger = Logger()
+
+     # Validate
+     validate_dataframe(df)
+     validate_not_empty(df)
+
+     if not columns:
+         raise ValueError("columns list cannot be empty")
+
+     # Validate that the columns exist
+     missing = [col for col in columns if col not in df.columns]
+     if missing:
+         raise ValueError(f"Columns not found: {missing}")
+
+     logger.info(f"Performing KNN imputation on {len(columns)} columns")
+
+     strategy = strategy or {}
+     k = strategy.get('k', 5)
+     weights = strategy.get('weights', 'uniform')
+     metric = strategy.get('metric', 'euclidean')
+
+     # Perform KNN imputation
+     result = knn_impute(df, columns, k=k, weights=weights, metric=metric)
+
+     logger.info("KNN imputation complete")
+
+     return result
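
Finally, a sketch of perform_knn_imputation. The wrapper defaults to k=5, weights='uniform', and metric='euclidean' when strategy omits them; the underlying knn_impute lives in additory/common/knn_imputation.py, which is not shown in this hunk:

    import polars as pl
    from additory.functions.transform.knn import perform_knn_imputation

    df = pl.DataFrame({
        "age": [25.0, None, 40.0, 33.0],
        "income": [50000.0, 62000.0, None, 71000.0],
    })

    # Nulls in each listed column are filled from the k nearest rows (here k=2)
    out = perform_knn_imputation(df, columns=["age", "income"], strategy={"k": 2})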