additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/utilities/__init__.py
DELETED
|
@@ -1,53 +0,0 @@
|
|
|
1
|
-
# additory/utilities/__init__.py
|
|
2
|
-
# Utilities system - Direct operations (non-.add driven)
|
|
3
|
-
|
|
4
|
-
"""
|
|
5
|
-
Utilities System Module
|
|
6
|
-
|
|
7
|
-
This module handles direct operations that don't require .add files:
|
|
8
|
-
- Lookup operations (add.to)
|
|
9
|
-
- Unit conversion (add.harmonize_units)
|
|
10
|
-
- Global settings management
|
|
11
|
-
- Enhanced matchers and resolvers
|
|
12
|
-
- Input validation
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
# Utility functionality
|
|
16
|
-
from .lookup import to, fuzzy_lookup, aggregate_lookup
|
|
17
|
-
from .units import harmonize_units, get_supported_units, get_conversion_stats
|
|
18
|
-
from .encoding import onehotencoding
|
|
19
|
-
from .games import play, tictactoe, sudoku
|
|
20
|
-
from .settings import (
|
|
21
|
-
set_global_settings, get_global_settings, get_setting,
|
|
22
|
-
set_my_expressions_path, set_my_schemas_path,
|
|
23
|
-
get_my_expressions_path, get_my_schemas_path,
|
|
24
|
-
set_backend, set_precision, enable_cache, disable_cache
|
|
25
|
-
)
|
|
26
|
-
from .validators import (
|
|
27
|
-
validate_dataframe, validate_columns_exist, validate_numeric_column,
|
|
28
|
-
validate_string_column, is_dataframe
|
|
29
|
-
)
|
|
30
|
-
|
|
31
|
-
# Names re-exported by the utilities package, grouped by feature area.
__all__ = [
    # Lookup functionality
    'to',
    'fuzzy_lookup',
    'aggregate_lookup',

    # Unit conversion
    'harmonize_units',
    'get_supported_units',
    'get_conversion_stats',

    # Encoding
    'onehotencoding',

    # Games (Easter egg)
    'play',
    'tictactoe',
    'sudoku',

    # Settings management
    'set_global_settings',
    'get_global_settings',
    'get_setting',
    'set_my_expressions_path',
    'set_my_schemas_path',
    'get_my_expressions_path',
    'get_my_schemas_path',
    'set_backend',
    'set_precision',
    'enable_cache',
    'disable_cache',

    # Validation
    'validate_dataframe',
    'validate_columns_exist',
    'validate_numeric_column',
    'validate_string_column',
    'is_dataframe',
]
|
additory/utilities/encoding.py
DELETED
|
@@ -1,600 +0,0 @@
|
|
|
1
|
-
# additory/utilities/encoding.py
|
|
2
|
-
# One-hot encoding and other encoding utilities
|
|
3
|
-
|
|
4
|
-
"""
|
|
5
|
-
Encoding Utilities Module
|
|
6
|
-
|
|
7
|
-
Provides encoding operations for categorical data:
|
|
8
|
-
- One-hot encoding with native backend support
|
|
9
|
-
- Smart cardinality handling
|
|
10
|
-
- Column name conflict resolution
|
|
11
|
-
"""
|
|
12
|
-
|
|
13
|
-
from typing import Union, Optional, List, Dict, Any, Tuple
|
|
14
|
-
from collections import Counter
|
|
15
|
-
import warnings
|
|
16
|
-
|
|
17
|
-
# Import from harmonized common module
|
|
18
|
-
from additory.common import (
|
|
19
|
-
detect_backend,
|
|
20
|
-
validate_dataframe,
|
|
21
|
-
validate_columns_exist,
|
|
22
|
-
validate_positive_number,
|
|
23
|
-
validate_ratio,
|
|
24
|
-
validate_integer_in_range,
|
|
25
|
-
ValidationError,
|
|
26
|
-
EncodingError,
|
|
27
|
-
sanitize_column_name,
|
|
28
|
-
generate_safe_column_name,
|
|
29
|
-
BackendType
|
|
30
|
-
)
|
|
31
|
-
|
|
32
|
-
# Import column positioning
|
|
33
|
-
from additory.core.column_positioning import position_columns
|
|
34
|
-
|
|
35
|
-
# Backend imports
|
|
36
|
-
import pandas as pd
|
|
37
|
-
import numpy as np
|
|
38
|
-
|
|
39
|
-
try:
|
|
40
|
-
import polars as pl
|
|
41
|
-
HAS_POLARS = True
|
|
42
|
-
except ImportError:
|
|
43
|
-
HAS_POLARS = False
|
|
44
|
-
pl = None
|
|
45
|
-
|
|
46
|
-
try:
|
|
47
|
-
import cudf
|
|
48
|
-
HAS_CUDF = True
|
|
49
|
-
except ImportError:
|
|
50
|
-
HAS_CUDF = False
|
|
51
|
-
cudf = None
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
DataFrame = Union[pd.DataFrame, 'pl.DataFrame', 'cudf.DataFrame']
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
# Constants
|
|
58
|
-
_MAX_CATEGORIES_ABSOLUTE = 200
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def onehotencoding(
    df: DataFrame,
    column: str,
    *,
    max_categories: int = 50,
    max_cardinality_ratio: float = 0.5,
    handle_overflow: str = "error",
    position: str = "after",
    drop_original: bool = True,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    check_id_column: bool = True,
    max_column_name_length: int = 63
) -> DataFrame:
    """
    Expand one categorical column into 0/1 indicator columns (uint8).

    The function runs in two phases.  Phase 1 validates the arguments,
    inspects the column and settles the full list of output column names;
    phase 2 builds the indicator columns, attaches them to the frame,
    reorders them and optionally removes the source column.  Any
    validation failure is raised before phase 2 starts.

    Parameters
    ----------
    df : DataFrame
        Input dataframe (pandas, polars, or cudf).
    column : str
        Name of the column to encode.
    max_categories : int, default 50
        Upper bound on the number of encoded categories.  Validated to lie
        between 1 and the module hard limit (200).
    max_cardinality_ratio : float, default 0.5
        Highest accepted ratio of distinct values to rows.  The check is
        skipped when the value is 1.0; otherwise exceeding it raises
        ``EncodingError`` whatever ``handle_overflow`` is.
    handle_overflow : str, default "error"
        What to do when the column has more distinct values than
        ``max_categories``:

        - ``"error"``: raise ``EncodingError``
        - ``"top_n"``: keep the ``max_categories - 1`` most frequent values
          and add an ``"other"`` column for the rest
        - ``"top_n:N"`` / ``"top_n:auto"``: same, with an explicit N
        - ``"warn"``: emit a warning and encode every value
    position : str, default "after"
        Placement of the new columns: ``"after"`` or ``"before"`` the
        source column, or at the ``"start"`` / ``"end"`` of the frame.
    drop_original : bool, default True
        Remove ``column`` from the result.
    prefix : str, optional
        Text put in front of every generated name (at most 20 characters).
    suffix : str, optional
        Text appended to every generated name (at most 20 characters).
    check_id_column : bool, default True
        Run the ID-column heuristic and raise if the column looks like an
        identifier.
    max_column_name_length : int, default 63
        Longest allowed generated name; longer names are truncated.  Must
        be at least 10.

    Returns
    -------
    DataFrame
        A frame of the same backend with the indicator columns added.

    Raises
    ------
    ValidationError
        If an argument is malformed or ``column`` is missing.
    EncodingError
        If the column fails the ID or cardinality checks, has too many
        distinct values under ``handle_overflow="error"``, or the generated
        names collide with each other or with existing columns.

    Examples
    --------
    >>> df = pl.DataFrame({"color": ["red", "blue", "red", "green"]})
    >>> result = add.onehotencoding(df, "color")
    # Creates: color_blue, color_green, color_red

    >>> result = add.onehotencoding(
    ...     df, "category", max_categories=10, handle_overflow="top_n:9"
    ... )
    # Creates: the 9 most frequent categories + category_other

    Notes
    -----
    - When every value is encoded the categories are sorted; in ``top_n``
      mode they follow frequency order with ``"other"`` last.
    - pandas frames are reordered through ``position_columns``; other
      backends use the module's own positioning helper.
    """
    # ---------------- Phase 1: validation only ----------------
    engine = detect_backend(df)

    _validate_parameters(
        max_categories, max_cardinality_ratio, handle_overflow,
        position, prefix, suffix, max_column_name_length
    )
    overflow_mode, top_n = _parse_handle_overflow(handle_overflow, max_categories)

    validate_dataframe(df, "input dataframe")
    validate_columns_exist(df, column, "input dataframe")

    stats = _get_column_stats_native(df, column, engine)

    if check_id_column:
        _check_id_column(column, stats)

    # A ratio of exactly 1.0 disables the cardinality check.
    if max_cardinality_ratio < 1.0:
        _check_cardinality_ratio(column, stats, max_cardinality_ratio)

    categories = _determine_categories(
        df, column, engine, stats['n_unique'],
        max_categories, overflow_mode, top_n
    )
    indicator_names = _generate_column_names(
        column, categories, prefix, suffix, max_column_name_length
    )
    _validate_column_names(indicator_names, df, column, drop_original)

    # ---------------- Phase 2: build the result ----------------
    encoded = _create_encoded_columns_native(
        df, column, categories, indicator_names, engine
    )
    result = _add_columns_native(df, encoded, engine)

    # Freshly added columns already sit at the end, so "end" needs no work.
    if position != "end":
        if engine == 'pandas':
            # Translate the keyword into the spec understood by
            # position_columns, falling back when the anchor is missing.
            if position == "after":
                spec = f"after:{column}" if column in result.columns else "end"
            elif position == "before":
                spec = f"before:{column}" if column in result.columns else "start"
            else:
                spec = position
            result = position_columns(result, indicator_names, spec)
        else:
            result = _position_columns_native(
                result, indicator_names, position, column, engine
            )

    if drop_original:
        result = _drop_column_native(result, column, engine)

    return result
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
def _validate_parameters(max_categories, max_cardinality_ratio, handle_overflow,
                         position, prefix, suffix, max_column_name_length):
    """
    Check the scalar arguments of ``onehotencoding``.

    Raises ``ValidationError`` (directly, or presumably through the shared
    validation helpers) on the first argument that is out of range or of the
    wrong kind.  Returns nothing.
    """
    # Numeric limits are delegated to the shared validators.
    validate_integer_in_range(
        max_categories, "max_categories",
        min_val=1, max_val=_MAX_CATEGORIES_ABSOLUTE
    )
    validate_ratio(max_cardinality_ratio, "max_cardinality_ratio")

    # Only the type is checked here; the syntax is parsed separately.
    if not isinstance(handle_overflow, str):
        raise ValidationError("handle_overflow must be a string")

    valid_positions = ["after", "before", "end", "start"]
    if position not in valid_positions:
        raise ValidationError(
            f"Invalid position: '{position}'. Must be one of: {valid_positions}"
        )

    # Prefix and suffix share the same 20-character cap.
    for label, affix in (("prefix", prefix), ("suffix", suffix)):
        if affix and len(affix) > 20:
            raise ValidationError(f"{label} must be <= 20 characters")

    if max_column_name_length < 10:
        raise ValidationError("max_column_name_length must be >= 10")
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
def _parse_handle_overflow(handle_overflow: str, max_categories: int) -> Tuple[str, Optional[int]]:
|
|
290
|
-
"""
|
|
291
|
-
Parse handle_overflow parameter.
|
|
292
|
-
|
|
293
|
-
Returns:
|
|
294
|
-
Tuple of (mode, top_n_count)
|
|
295
|
-
"""
|
|
296
|
-
if ":" in handle_overflow:
|
|
297
|
-
mode, value = handle_overflow.split(":", 1)
|
|
298
|
-
mode = mode.strip()
|
|
299
|
-
value = value.strip()
|
|
300
|
-
|
|
301
|
-
if mode != "top_n":
|
|
302
|
-
raise ValidationError(
|
|
303
|
-
f"Invalid handle_overflow format: '{handle_overflow}'. "
|
|
304
|
-
f"Only 'top_n' supports ':' syntax."
|
|
305
|
-
)
|
|
306
|
-
|
|
307
|
-
if value == "auto":
|
|
308
|
-
top_n = max_categories - 1
|
|
309
|
-
else:
|
|
310
|
-
try:
|
|
311
|
-
top_n = int(value)
|
|
312
|
-
except ValueError:
|
|
313
|
-
raise ValidationError(
|
|
314
|
-
f"Invalid top_n value: '{value}'. Must be integer or 'auto'."
|
|
315
|
-
)
|
|
316
|
-
|
|
317
|
-
if top_n >= max_categories:
|
|
318
|
-
raise ValidationError(
|
|
319
|
-
f"top_n ({top_n}) must be less than max_categories ({max_categories})"
|
|
320
|
-
)
|
|
321
|
-
if top_n < 1:
|
|
322
|
-
raise ValidationError(f"top_n must be at least 1, got {top_n}")
|
|
323
|
-
|
|
324
|
-
return mode, top_n
|
|
325
|
-
|
|
326
|
-
else:
|
|
327
|
-
mode = handle_overflow.strip()
|
|
328
|
-
if mode == "top_n":
|
|
329
|
-
# Default: leave room for "other"
|
|
330
|
-
top_n = max_categories - 1
|
|
331
|
-
return mode, top_n
|
|
332
|
-
elif mode in ["error", "warn"]:
|
|
333
|
-
return mode, None
|
|
334
|
-
else:
|
|
335
|
-
raise ValidationError(
|
|
336
|
-
f"Invalid handle_overflow: '{handle_overflow}'. "
|
|
337
|
-
f"Must be 'error', 'warn', 'top_n', or 'top_n:N'"
|
|
338
|
-
)
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
def _get_column_stats_native(df, column: str, backend: BackendType) -> Dict[str, Any]:
|
|
342
|
-
"""Get column statistics using native backend operations."""
|
|
343
|
-
if backend == 'polars':
|
|
344
|
-
n_rows = df.height
|
|
345
|
-
n_unique = df[column].n_unique()
|
|
346
|
-
unique_values = df[column].unique().to_list()
|
|
347
|
-
|
|
348
|
-
elif backend == 'cudf':
|
|
349
|
-
n_rows = len(df)
|
|
350
|
-
n_unique = df[column].nunique()
|
|
351
|
-
unique_values = df[column].unique().to_arrow().to_pylist()
|
|
352
|
-
|
|
353
|
-
elif backend == 'pandas':
|
|
354
|
-
n_rows = len(df)
|
|
355
|
-
n_unique = df[column].nunique()
|
|
356
|
-
unique_values = df[column].unique().tolist()
|
|
357
|
-
|
|
358
|
-
else:
|
|
359
|
-
raise EncodingError(f"Unsupported backend: {backend}")
|
|
360
|
-
|
|
361
|
-
return {
|
|
362
|
-
'n_rows': n_rows,
|
|
363
|
-
'n_unique': n_unique,
|
|
364
|
-
'unique_values': unique_values,
|
|
365
|
-
'cardinality_ratio': n_unique / n_rows if n_rows > 0 else 0
|
|
366
|
-
}
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
def _check_id_column(column: str, stats: Dict[str, Any]):
|
|
370
|
-
"""Check if column appears to be an ID column."""
|
|
371
|
-
ratio = stats['cardinality_ratio']
|
|
372
|
-
|
|
373
|
-
# Check cardinality ratio
|
|
374
|
-
if ratio > 0.95:
|
|
375
|
-
raise EncodingError(
|
|
376
|
-
f"Column '{column}' appears to be an ID column "
|
|
377
|
-
f"(cardinality ratio: {ratio:.2%}). "
|
|
378
|
-
f"Set check_id_column=False to override."
|
|
379
|
-
)
|
|
380
|
-
|
|
381
|
-
# Check column name patterns
|
|
382
|
-
col_lower = column.lower()
|
|
383
|
-
id_patterns = ['_id', 'id_', 'uuid', '_key', 'key_']
|
|
384
|
-
if any(pattern in col_lower for pattern in id_patterns):
|
|
385
|
-
if ratio > 0.8:
|
|
386
|
-
raise EncodingError(
|
|
387
|
-
f"Column '{column}' appears to be an ID column "
|
|
388
|
-
f"(name pattern + {ratio:.2%} cardinality). "
|
|
389
|
-
f"Set check_id_column=False to override."
|
|
390
|
-
)
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
def _check_cardinality_ratio(column: str, stats: Dict[str, Any], max_ratio: float):
|
|
394
|
-
"""Check cardinality ratio."""
|
|
395
|
-
ratio = stats['cardinality_ratio']
|
|
396
|
-
if ratio > max_ratio:
|
|
397
|
-
raise EncodingError(
|
|
398
|
-
f"Column '{column}' has high cardinality ratio: {ratio:.2%} "
|
|
399
|
-
f"(threshold: {max_ratio:.2%}). "
|
|
400
|
-
f"Set max_cardinality_ratio=1.0 to override."
|
|
401
|
-
)
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
def _determine_categories(df, column: str, backend: BackendType, n_unique: int,
                          max_categories: int, overflow_mode: str,
                          top_n: Optional[int]) -> List[str]:
    """
    Decide which category values get their own indicator column.

    When the column fits within ``max_categories`` every distinct value is
    returned in sorted order.  Otherwise the outcome depends on
    ``overflow_mode``:

    - "error": raise ``EncodingError``
    - "top_n": the ``top_n`` most frequent values followed by "other"
    - "warn":  emit a warning and return every distinct value, sorted

    Args:
        df: Input dataframe matching ``backend``.
        column: Column being encoded.
        backend: Backend name forwarded to the native helpers.
        n_unique: Number of distinct values in the column.
        max_categories: Category budget.
        overflow_mode: Parsed overflow mode.
        top_n: Number of values to keep in "top_n" mode (ignored otherwise).

    Returns:
        List of category values, in output-column order.

    Raises:
        EncodingError: On overflow in "error" mode.
        ValidationError: If the column overflows and ``overflow_mode`` is not
            a recognised mode (this used to return None silently).
    """
    if n_unique <= max_categories:
        return _get_all_categories_sorted_native(df, column, backend)

    if overflow_mode == "error":
        raise EncodingError(
            f"Column '{column}' has {n_unique} unique values, "
            f"exceeds max_categories={max_categories}. "
            f"Use handle_overflow='top_n' or increase max_categories."
        )

    if overflow_mode == "top_n":
        # Get top N most frequent categories
        categories = _get_top_n_categories_native(df, column, backend, top_n)
        categories.append("other")  # Add "other" category
        return categories

    if overflow_mode == "warn":
        warnings.warn(
            f"Encoding {n_unique} categories (exceeds max_categories={max_categories})"
        )
        return _get_all_categories_sorted_native(df, column, backend)

    # Fail loudly instead of handing None to the caller.
    raise ValidationError(
        f"Invalid overflow mode: '{overflow_mode}'. "
        f"Must be 'error', 'warn' or 'top_n'."
    )
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
def _get_top_n_categories_native(df, column: str, backend: BackendType,
                                 top_n: int) -> List[str]:
    """
    Return the ``top_n`` most frequent values of ``column``.

    Each backend's own ``value_counts`` is used and the result is returned
    as a plain Python list.  Raises ``EncodingError`` for a backend other
    than 'polars', 'cudf' or 'pandas'.
    """
    if backend == 'polars':
        # value_counts yields a frame; the values live in the column
        # named after the source column.
        counts = df[column].value_counts(sort=True)
        return counts.head(top_n)[column].to_list()

    if backend == 'cudf':
        counts = df[column].value_counts().sort_values(ascending=False)
        return counts.head(top_n).index.to_arrow().to_pylist()

    if backend == 'pandas':
        return df[column].value_counts().head(top_n).index.tolist()

    raise EncodingError(f"Unsupported backend: {backend}")
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
def _get_all_categories_sorted_native(df, column: str, backend: BackendType) -> List[str]:
|
|
449
|
-
"""Get all unique categories sorted."""
|
|
450
|
-
if backend == 'polars':
|
|
451
|
-
return sorted(df[column].unique().to_list())
|
|
452
|
-
elif backend == 'cudf':
|
|
453
|
-
return sorted(df[column].unique().to_arrow().to_pylist())
|
|
454
|
-
elif backend == 'pandas':
|
|
455
|
-
return sorted(df[column].unique().tolist())
|
|
456
|
-
else:
|
|
457
|
-
raise EncodingError(f"Unsupported backend: {backend}")
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
def _generate_column_names(column: str, categories: List[str],
|
|
461
|
-
prefix: Optional[str], suffix: Optional[str],
|
|
462
|
-
max_length: int) -> List[str]:
|
|
463
|
-
"""Generate column names for encoded categories."""
|
|
464
|
-
generated_names = []
|
|
465
|
-
|
|
466
|
-
for category in categories:
|
|
467
|
-
# Build parts
|
|
468
|
-
parts = []
|
|
469
|
-
if prefix:
|
|
470
|
-
parts.append(prefix[:20]) # Limit prefix
|
|
471
|
-
parts.append(column)
|
|
472
|
-
parts.append(str(category))
|
|
473
|
-
if suffix:
|
|
474
|
-
parts.append(suffix[:20]) # Limit suffix
|
|
475
|
-
|
|
476
|
-
# Join with underscores
|
|
477
|
-
full_name = "_".join(parts)
|
|
478
|
-
|
|
479
|
-
# Truncate if needed (preserve end for uniqueness)
|
|
480
|
-
if len(full_name) > max_length:
|
|
481
|
-
# Keep start and end
|
|
482
|
-
keep_start = max_length // 2
|
|
483
|
-
keep_end = max_length - keep_start
|
|
484
|
-
full_name = full_name[:keep_start] + full_name[-keep_end:]
|
|
485
|
-
|
|
486
|
-
generated_names.append(full_name)
|
|
487
|
-
|
|
488
|
-
return generated_names
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
def _validate_column_names(generated_names: List[str], df, column: str,
|
|
492
|
-
drop_original: bool):
|
|
493
|
-
"""Validate generated column names for duplicates and conflicts."""
|
|
494
|
-
# Check for duplicates in generated names
|
|
495
|
-
duplicates = [name for name, count in Counter(generated_names).items() if count > 1]
|
|
496
|
-
if duplicates:
|
|
497
|
-
raise EncodingError(
|
|
498
|
-
f"Column name generation resulted in {len(duplicates)} duplicate names. "
|
|
499
|
-
f"Examples: {duplicates[:3]}. "
|
|
500
|
-
f"Try using a shorter prefix/suffix or cleaning category values."
|
|
501
|
-
)
|
|
502
|
-
|
|
503
|
-
# Check for conflicts with existing columns
|
|
504
|
-
existing_cols = set(df.columns)
|
|
505
|
-
if drop_original:
|
|
506
|
-
existing_cols.discard(column) # Will be dropped, so not a conflict
|
|
507
|
-
|
|
508
|
-
conflicts = set(generated_names) & existing_cols
|
|
509
|
-
if conflicts:
|
|
510
|
-
raise EncodingError(
|
|
511
|
-
f"Generated column names conflict with existing columns: {list(conflicts)[:5]}. "
|
|
512
|
-
f"Rename existing columns or use prefix parameter."
|
|
513
|
-
)
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
def _create_encoded_columns_native(df, column: str, categories: List[str],
|
|
517
|
-
col_names: List[str], backend: BackendType) -> Dict[str, Any]:
|
|
518
|
-
"""Create one-hot encoded columns using native backend operations."""
|
|
519
|
-
if backend == 'polars':
|
|
520
|
-
new_cols = {}
|
|
521
|
-
for category, col_name in zip(categories, col_names):
|
|
522
|
-
if category == "other":
|
|
523
|
-
mask = ~df[column].is_in(categories[:-1])
|
|
524
|
-
else:
|
|
525
|
-
mask = df[column] == category
|
|
526
|
-
new_cols[col_name] = mask.cast(pl.UInt8)
|
|
527
|
-
return new_cols
|
|
528
|
-
|
|
529
|
-
elif backend == 'cudf':
|
|
530
|
-
new_cols = {}
|
|
531
|
-
for category, col_name in zip(categories, col_names):
|
|
532
|
-
if category == "other":
|
|
533
|
-
mask = ~df[column].isin(categories[:-1])
|
|
534
|
-
else:
|
|
535
|
-
mask = df[column] == category
|
|
536
|
-
new_cols[col_name] = mask.astype('uint8')
|
|
537
|
-
return new_cols
|
|
538
|
-
|
|
539
|
-
elif backend == 'pandas':
|
|
540
|
-
new_cols = {}
|
|
541
|
-
for category, col_name in zip(categories, col_names):
|
|
542
|
-
if category == "other":
|
|
543
|
-
mask = ~df[column].isin(categories[:-1])
|
|
544
|
-
else:
|
|
545
|
-
mask = df[column] == category
|
|
546
|
-
new_cols[col_name] = mask.astype(np.uint8)
|
|
547
|
-
return new_cols
|
|
548
|
-
|
|
549
|
-
else:
|
|
550
|
-
raise EncodingError(f"Unsupported backend: {backend}")
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
def _add_columns_native(df, new_columns: Dict[str, Any], backend: BackendType):
    """
    Return a frame consisting of ``df`` plus the given columns.

    polars frames are extended through ``with_columns``; pandas and cudf
    frames are copied first and the columns assigned one by one.  Raises
    ``EncodingError`` for any other backend.
    """
    if backend == 'polars':
        expressions = []
        for name, data in new_columns.items():
            expressions.append(pl.lit(data).alias(name))
        return df.with_columns(expressions)

    if backend in ('cudf', 'pandas'):
        augmented = df.copy()
        for name, data in new_columns.items():
            augmented[name] = data
        return augmented

    raise EncodingError(f"Unsupported backend: {backend}")
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
def _position_columns_native(df, new_columns: List[str], position: str,
|
|
568
|
-
reference_col: str, backend: BackendType):
|
|
569
|
-
"""Position columns using native operations (for polars/cudf)."""
|
|
570
|
-
all_cols = list(df.columns)
|
|
571
|
-
existing_cols = [c for c in all_cols if c not in new_columns]
|
|
572
|
-
|
|
573
|
-
if position == "start":
|
|
574
|
-
new_order = new_columns + existing_cols
|
|
575
|
-
elif position == "after":
|
|
576
|
-
if reference_col in existing_cols:
|
|
577
|
-
idx = existing_cols.index(reference_col) + 1
|
|
578
|
-
new_order = existing_cols[:idx] + new_columns + existing_cols[idx:]
|
|
579
|
-
else:
|
|
580
|
-
new_order = existing_cols + new_columns
|
|
581
|
-
elif position == "before":
|
|
582
|
-
if reference_col in existing_cols:
|
|
583
|
-
idx = existing_cols.index(reference_col)
|
|
584
|
-
new_order = existing_cols[:idx] + new_columns + existing_cols[idx:]
|
|
585
|
-
else:
|
|
586
|
-
new_order = new_columns + existing_cols
|
|
587
|
-
else: # "end"
|
|
588
|
-
new_order = existing_cols + new_columns
|
|
589
|
-
|
|
590
|
-
return df[new_order]
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
def _drop_column_native(df, column: str, backend: BackendType):
    """
    Return ``df`` without ``column``, using the backend's own ``drop``.

    polars takes the name positionally; pandas and cudf take it through the
    ``columns=`` keyword.  Raises ``EncodingError`` for any other backend.
    """
    if backend == 'polars':
        return df.drop(column)

    if backend in ('cudf', 'pandas'):
        return df.drop(columns=[column])

    raise EncodingError(f"Unsupported backend: {backend}")
|