additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff compares the contents of two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (120)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,53 +0,0 @@
1
- # additory/utilities/__init__.py
2
- # Utilities system - Direct operations (non-.add driven)
3
-
4
- """
5
- Utilities System Module
6
-
7
- This module handles direct operations that don't require .add files:
8
- - Lookup operations (add.to)
9
- - Unit conversion (add.harmonize_units)
10
- - Global settings management
11
- - Enhanced matchers and resolvers
12
- - Input validation
13
- """
14
-
15
- # Utility functionality
16
- from .lookup import to, fuzzy_lookup, aggregate_lookup
17
- from .units import harmonize_units, get_supported_units, get_conversion_stats
18
- from .encoding import onehotencoding
19
- from .games import play, tictactoe, sudoku
20
- from .settings import (
21
- set_global_settings, get_global_settings, get_setting,
22
- set_my_expressions_path, set_my_schemas_path,
23
- get_my_expressions_path, get_my_schemas_path,
24
- set_backend, set_precision, enable_cache, disable_cache
25
- )
26
- from .validators import (
27
- validate_dataframe, validate_columns_exist, validate_numeric_column,
28
- validate_string_column, is_dataframe
29
- )
30
-
31
- __all__ = [
32
- # Lookup functionality
33
- 'to', 'fuzzy_lookup', 'aggregate_lookup',
34
-
35
- # Unit conversion
36
- 'harmonize_units', 'get_supported_units', 'get_conversion_stats',
37
-
38
- # Encoding
39
- 'onehotencoding',
40
-
41
- # Games (Easter egg)
42
- 'play', 'tictactoe', 'sudoku',
43
-
44
- # Settings management
45
- 'set_global_settings', 'get_global_settings', 'get_setting',
46
- 'set_my_expressions_path', 'set_my_schemas_path',
47
- 'get_my_expressions_path', 'get_my_schemas_path',
48
- 'set_backend', 'set_precision', 'enable_cache', 'disable_cache',
49
-
50
- # Validation
51
- 'validate_dataframe', 'validate_columns_exist', 'validate_numeric_column',
52
- 'validate_string_column', 'is_dataframe'
53
- ]
@@ -1,600 +0,0 @@
1
- # additory/utilities/encoding.py
2
- # One-hot encoding and other encoding utilities
3
-
4
- """
5
- Encoding Utilities Module
6
-
7
- Provides encoding operations for categorical data:
8
- - One-hot encoding with native backend support
9
- - Smart cardinality handling
10
- - Column name conflict resolution
11
- """
12
-
13
- from typing import Union, Optional, List, Dict, Any, Tuple
14
- from collections import Counter
15
- import warnings
16
-
17
- # Import from harmonized common module
18
- from additory.common import (
19
- detect_backend,
20
- validate_dataframe,
21
- validate_columns_exist,
22
- validate_positive_number,
23
- validate_ratio,
24
- validate_integer_in_range,
25
- ValidationError,
26
- EncodingError,
27
- sanitize_column_name,
28
- generate_safe_column_name,
29
- BackendType
30
- )
31
-
32
- # Import column positioning
33
- from additory.core.column_positioning import position_columns
34
-
35
- # Backend imports
36
- import pandas as pd
37
- import numpy as np
38
-
39
- try:
40
- import polars as pl
41
- HAS_POLARS = True
42
- except ImportError:
43
- HAS_POLARS = False
44
- pl = None
45
-
46
- try:
47
- import cudf
48
- HAS_CUDF = True
49
- except ImportError:
50
- HAS_CUDF = False
51
- cudf = None
52
-
53
-
54
- DataFrame = Union[pd.DataFrame, 'pl.DataFrame', 'cudf.DataFrame']
55
-
56
-
57
- # Constants
58
- _MAX_CATEGORIES_ABSOLUTE = 200
59
-
60
-
61
- def onehotencoding(
62
- df: DataFrame,
63
- column: str,
64
- *,
65
- max_categories: int = 50,
66
- max_cardinality_ratio: float = 0.5,
67
- handle_overflow: str = "error",
68
- position: str = "after",
69
- drop_original: bool = True,
70
- prefix: Optional[str] = None,
71
- suffix: Optional[str] = None,
72
- check_id_column: bool = True,
73
- max_column_name_length: int = 63
74
- ) -> DataFrame:
75
- """
76
- One-hot encode a categorical column into binary uint8 columns.
77
-
78
- Validates all column names before any data processing to fail fast.
79
- Uses native backend operations (no conversions) for optimal performance.
80
-
81
- Parameters
82
- ----------
83
- df : DataFrame
84
- Input dataframe (pandas, polars, or cudf)
85
- column : str
86
- Column name to encode
87
- max_categories : int, default 50
88
- Maximum number of categories to encode (includes "other" if used).
89
- Cannot exceed 200 (hard limit).
90
- max_cardinality_ratio : float, default 0.5
91
- Maximum ratio of unique values to total rows (0.0-1.0).
92
- If exceeded, raises error unless handle_overflow="warn".
93
- Set to 1.0 to disable check.
94
- handle_overflow : str, default "error"
95
- How to handle when unique values exceed max_categories:
96
- - "error": Raise EncodingError (default)
97
- - "top_n": Keep top N most frequent, group rest as "other"
98
- - "top_n:N": Keep top N most frequent + "other" (e.g., "top_n:20")
99
- - "warn": Proceed with warning (if cardinality ratio allows)
100
- position : str, default "after"
101
- Where to insert new columns: "after", "before", "end", "start"
102
- drop_original : bool, default True
103
- Whether to drop the original column after encoding
104
- prefix : str, optional
105
- Prefix for new column names (max 20 chars)
106
- suffix : str, optional
107
- Suffix for new column names (max 20 chars)
108
- check_id_column : bool, default True
109
- Check if column appears to be an ID column and raise error
110
- max_column_name_length : int, default 63
111
- Maximum length for generated column names (SQL compatibility)
112
-
113
- Returns
114
- -------
115
- DataFrame
116
- DataFrame with one-hot encoded uint8 columns (0 or 1)
117
-
118
- Raises
119
- ------
120
- ValidationError
121
- - If column does not exist
122
- - If parameters are invalid
123
- EncodingError
124
- - If cardinality exceeds max_categories and handle_overflow="error"
125
- - If unique ratio exceeds max_cardinality_ratio
126
- - If generated column names have duplicates (after truncation)
127
- - If generated column names conflict with existing columns
128
- - If max_categories exceeds hard limit (200)
129
- - If column appears to be an ID column and check_id_column=True
130
-
131
- Examples
132
- --------
133
- Basic usage:
134
- >>> df = pl.DataFrame({"color": ["red", "blue", "red", "green"]})
135
- >>> result = add.onehotencoding(df, "color")
136
- # Creates: color_blue, color_green, color_red (uint8, sorted)
137
-
138
- With prefix to avoid conflicts:
139
- >>> result = add.onehotencoding(df, "color", prefix="ohe", drop_original=False)
140
- # Creates: ohe_color_blue, ohe_color_green, ohe_color_red
141
- # Keeps: color
142
-
143
- Handle high cardinality with top_n:
144
- >>> result = add.onehotencoding(
145
- ... df, "category",
146
- ... max_categories=10,
147
- ... handle_overflow="top_n:9"
148
- ... )
149
- # Creates: top 9 categories + category_other
150
-
151
- Allow high cardinality:
152
- >>> result = add.onehotencoding(
153
- ... df, "user_id",
154
- ... max_cardinality_ratio=1.0,
155
- ... max_categories=100,
156
- ... check_id_column=False
157
- ... )
158
- # Proceeds even if user_id looks like an ID column
159
-
160
- Notes
161
- -----
162
- - All validation happens before data processing (fail-fast)
163
- - Generated column names are checked for duplicates and conflicts
164
- - Category values are sorted alphabetically in output columns
165
- - "other" category (if used) always appears last
166
- - Uses uint8 dtype (0 or 1) for memory efficiency
167
- - Native backend operations (no conversions) for performance
168
- """
169
-
170
- # ============================================
171
- # PHASE 1: VALIDATION (No data processing)
172
- # ============================================
173
-
174
- # Detect backend
175
- backend = detect_backend(df)
176
-
177
- # Validate parameters
178
- _validate_parameters(
179
- max_categories, max_cardinality_ratio, handle_overflow,
180
- position, prefix, suffix, max_column_name_length
181
- )
182
-
183
- # Parse handle_overflow
184
- overflow_mode, top_n = _parse_handle_overflow(handle_overflow, max_categories)
185
-
186
- # Validate dataframe and column
187
- validate_dataframe(df, "input dataframe")
188
- validate_columns_exist(df, column, "input dataframe")
189
-
190
- # Get column statistics (NATIVE operations)
191
- stats = _get_column_stats_native(df, column, backend)
192
-
193
- # Check if ID column
194
- if check_id_column:
195
- _check_id_column(column, stats)
196
-
197
- # Check cardinality ratio
198
- if max_cardinality_ratio < 1.0:
199
- _check_cardinality_ratio(column, stats, max_cardinality_ratio)
200
-
201
- # Determine categories to encode
202
- categories = _determine_categories(
203
- df, column, backend, stats['n_unique'],
204
- max_categories, overflow_mode, top_n
205
- )
206
-
207
- # Generate column names
208
- generated_names = _generate_column_names(
209
- column, categories, prefix, suffix, max_column_name_length
210
- )
211
-
212
- # Validate column names (duplicates and conflicts)
213
- _validate_column_names(generated_names, df, column, drop_original)
214
-
215
- # ✅ All validations passed!
216
-
217
- # ============================================
218
- # PHASE 2: EXECUTION (Data processing)
219
- # ============================================
220
-
221
- # Create encoded columns (NATIVE operations)
222
- new_columns_dict = _create_encoded_columns_native(
223
- df, column, categories, generated_names, backend
224
- )
225
-
226
- # Add new columns to dataframe (NATIVE operations)
227
- result = _add_columns_native(df, new_columns_dict, backend)
228
-
229
- # Position columns
230
- if position != "end":
231
- # Convert position to column_positioning format
232
- if position == "after":
233
- position_spec = f"after:{column}" if column in result.columns else "end"
234
- elif position == "before":
235
- position_spec = f"before:{column}" if column in result.columns else "start"
236
- else:
237
- position_spec = position
238
-
239
- # Column positioning works with pandas
240
- if backend == 'pandas':
241
- result = position_columns(result, generated_names, position_spec)
242
- else:
243
- # For polars/cudf, do manual positioning
244
- result = _position_columns_native(
245
- result, generated_names, position, column, backend
246
- )
247
-
248
- # Drop original if requested
249
- if drop_original:
250
- result = _drop_column_native(result, column, backend)
251
-
252
- return result
253
-
254
-
255
- def _validate_parameters(max_categories, max_cardinality_ratio, handle_overflow,
256
- position, prefix, suffix, max_column_name_length):
257
- """Validate all parameters."""
258
- # Validate max_categories
259
- validate_integer_in_range(
260
- max_categories, "max_categories",
261
- min_val=1, max_val=_MAX_CATEGORIES_ABSOLUTE
262
- )
263
-
264
- # Validate max_cardinality_ratio
265
- validate_ratio(max_cardinality_ratio, "max_cardinality_ratio")
266
-
267
- # Validate handle_overflow format
268
- if not isinstance(handle_overflow, str):
269
- raise ValidationError("handle_overflow must be a string")
270
-
271
- # Validate position
272
- valid_positions = ["after", "before", "end", "start"]
273
- if position not in valid_positions:
274
- raise ValidationError(
275
- f"Invalid position: '{position}'. Must be one of: {valid_positions}"
276
- )
277
-
278
- # Validate prefix/suffix length
279
- if prefix and len(prefix) > 20:
280
- raise ValidationError("prefix must be <= 20 characters")
281
- if suffix and len(suffix) > 20:
282
- raise ValidationError("suffix must be <= 20 characters")
283
-
284
- # Validate max_column_name_length
285
- if max_column_name_length < 10:
286
- raise ValidationError("max_column_name_length must be >= 10")
287
-
288
-
289
- def _parse_handle_overflow(handle_overflow: str, max_categories: int) -> Tuple[str, Optional[int]]:
290
- """
291
- Parse handle_overflow parameter.
292
-
293
- Returns:
294
- Tuple of (mode, top_n_count)
295
- """
296
- if ":" in handle_overflow:
297
- mode, value = handle_overflow.split(":", 1)
298
- mode = mode.strip()
299
- value = value.strip()
300
-
301
- if mode != "top_n":
302
- raise ValidationError(
303
- f"Invalid handle_overflow format: '{handle_overflow}'. "
304
- f"Only 'top_n' supports ':' syntax."
305
- )
306
-
307
- if value == "auto":
308
- top_n = max_categories - 1
309
- else:
310
- try:
311
- top_n = int(value)
312
- except ValueError:
313
- raise ValidationError(
314
- f"Invalid top_n value: '{value}'. Must be integer or 'auto'."
315
- )
316
-
317
- if top_n >= max_categories:
318
- raise ValidationError(
319
- f"top_n ({top_n}) must be less than max_categories ({max_categories})"
320
- )
321
- if top_n < 1:
322
- raise ValidationError(f"top_n must be at least 1, got {top_n}")
323
-
324
- return mode, top_n
325
-
326
- else:
327
- mode = handle_overflow.strip()
328
- if mode == "top_n":
329
- # Default: leave room for "other"
330
- top_n = max_categories - 1
331
- return mode, top_n
332
- elif mode in ["error", "warn"]:
333
- return mode, None
334
- else:
335
- raise ValidationError(
336
- f"Invalid handle_overflow: '{handle_overflow}'. "
337
- f"Must be 'error', 'warn', 'top_n', or 'top_n:N'"
338
- )
339
-
340
-
341
- def _get_column_stats_native(df, column: str, backend: BackendType) -> Dict[str, Any]:
342
- """Get column statistics using native backend operations."""
343
- if backend == 'polars':
344
- n_rows = df.height
345
- n_unique = df[column].n_unique()
346
- unique_values = df[column].unique().to_list()
347
-
348
- elif backend == 'cudf':
349
- n_rows = len(df)
350
- n_unique = df[column].nunique()
351
- unique_values = df[column].unique().to_arrow().to_pylist()
352
-
353
- elif backend == 'pandas':
354
- n_rows = len(df)
355
- n_unique = df[column].nunique()
356
- unique_values = df[column].unique().tolist()
357
-
358
- else:
359
- raise EncodingError(f"Unsupported backend: {backend}")
360
-
361
- return {
362
- 'n_rows': n_rows,
363
- 'n_unique': n_unique,
364
- 'unique_values': unique_values,
365
- 'cardinality_ratio': n_unique / n_rows if n_rows > 0 else 0
366
- }
367
-
368
-
369
- def _check_id_column(column: str, stats: Dict[str, Any]):
370
- """Check if column appears to be an ID column."""
371
- ratio = stats['cardinality_ratio']
372
-
373
- # Check cardinality ratio
374
- if ratio > 0.95:
375
- raise EncodingError(
376
- f"Column '{column}' appears to be an ID column "
377
- f"(cardinality ratio: {ratio:.2%}). "
378
- f"Set check_id_column=False to override."
379
- )
380
-
381
- # Check column name patterns
382
- col_lower = column.lower()
383
- id_patterns = ['_id', 'id_', 'uuid', '_key', 'key_']
384
- if any(pattern in col_lower for pattern in id_patterns):
385
- if ratio > 0.8:
386
- raise EncodingError(
387
- f"Column '{column}' appears to be an ID column "
388
- f"(name pattern + {ratio:.2%} cardinality). "
389
- f"Set check_id_column=False to override."
390
- )
391
-
392
-
393
- def _check_cardinality_ratio(column: str, stats: Dict[str, Any], max_ratio: float):
394
- """Check cardinality ratio."""
395
- ratio = stats['cardinality_ratio']
396
- if ratio > max_ratio:
397
- raise EncodingError(
398
- f"Column '{column}' has high cardinality ratio: {ratio:.2%} "
399
- f"(threshold: {max_ratio:.2%}). "
400
- f"Set max_cardinality_ratio=1.0 to override."
401
- )
402
-
403
-
404
- def _determine_categories(df, column: str, backend: BackendType, n_unique: int,
405
- max_categories: int, overflow_mode: str,
406
- top_n: Optional[int]) -> List[str]:
407
- """Determine which categories to encode."""
408
- if n_unique > max_categories:
409
- if overflow_mode == "error":
410
- raise EncodingError(
411
- f"Column '{column}' has {n_unique} unique values, "
412
- f"exceeds max_categories={max_categories}. "
413
- f"Use handle_overflow='top_n' or increase max_categories."
414
- )
415
- elif overflow_mode == "top_n":
416
- # Get top N most frequent categories
417
- categories = _get_top_n_categories_native(df, column, backend, top_n)
418
- categories.append("other") # Add "other" category
419
- return categories
420
- elif overflow_mode == "warn":
421
- warnings.warn(
422
- f"Encoding {n_unique} categories (exceeds max_categories={max_categories})"
423
- )
424
- return _get_all_categories_sorted_native(df, column, backend)
425
- else:
426
- return _get_all_categories_sorted_native(df, column, backend)
427
-
428
-
429
- def _get_top_n_categories_native(df, column: str, backend: BackendType,
430
- top_n: int) -> List[str]:
431
- """Get top N most frequent categories using native operations."""
432
- if backend == 'polars':
433
- vc = df[column].value_counts(sort=True).head(top_n)
434
- return vc[column].to_list()
435
-
436
- elif backend == 'cudf':
437
- vc = df[column].value_counts().sort_values(ascending=False).head(top_n)
438
- return vc.index.to_arrow().to_pylist()
439
-
440
- elif backend == 'pandas':
441
- vc = df[column].value_counts().head(top_n)
442
- return vc.index.tolist()
443
-
444
- else:
445
- raise EncodingError(f"Unsupported backend: {backend}")
446
-
447
-
448
- def _get_all_categories_sorted_native(df, column: str, backend: BackendType) -> List[str]:
449
- """Get all unique categories sorted."""
450
- if backend == 'polars':
451
- return sorted(df[column].unique().to_list())
452
- elif backend == 'cudf':
453
- return sorted(df[column].unique().to_arrow().to_pylist())
454
- elif backend == 'pandas':
455
- return sorted(df[column].unique().tolist())
456
- else:
457
- raise EncodingError(f"Unsupported backend: {backend}")
458
-
459
-
460
- def _generate_column_names(column: str, categories: List[str],
461
- prefix: Optional[str], suffix: Optional[str],
462
- max_length: int) -> List[str]:
463
- """Generate column names for encoded categories."""
464
- generated_names = []
465
-
466
- for category in categories:
467
- # Build parts
468
- parts = []
469
- if prefix:
470
- parts.append(prefix[:20]) # Limit prefix
471
- parts.append(column)
472
- parts.append(str(category))
473
- if suffix:
474
- parts.append(suffix[:20]) # Limit suffix
475
-
476
- # Join with underscores
477
- full_name = "_".join(parts)
478
-
479
- # Truncate if needed (preserve end for uniqueness)
480
- if len(full_name) > max_length:
481
- # Keep start and end
482
- keep_start = max_length // 2
483
- keep_end = max_length - keep_start
484
- full_name = full_name[:keep_start] + full_name[-keep_end:]
485
-
486
- generated_names.append(full_name)
487
-
488
- return generated_names
489
-
490
-
491
- def _validate_column_names(generated_names: List[str], df, column: str,
492
- drop_original: bool):
493
- """Validate generated column names for duplicates and conflicts."""
494
- # Check for duplicates in generated names
495
- duplicates = [name for name, count in Counter(generated_names).items() if count > 1]
496
- if duplicates:
497
- raise EncodingError(
498
- f"Column name generation resulted in {len(duplicates)} duplicate names. "
499
- f"Examples: {duplicates[:3]}. "
500
- f"Try using a shorter prefix/suffix or cleaning category values."
501
- )
502
-
503
- # Check for conflicts with existing columns
504
- existing_cols = set(df.columns)
505
- if drop_original:
506
- existing_cols.discard(column) # Will be dropped, so not a conflict
507
-
508
- conflicts = set(generated_names) & existing_cols
509
- if conflicts:
510
- raise EncodingError(
511
- f"Generated column names conflict with existing columns: {list(conflicts)[:5]}. "
512
- f"Rename existing columns or use prefix parameter."
513
- )
514
-
515
-
516
- def _create_encoded_columns_native(df, column: str, categories: List[str],
517
- col_names: List[str], backend: BackendType) -> Dict[str, Any]:
518
- """Create one-hot encoded columns using native backend operations."""
519
- if backend == 'polars':
520
- new_cols = {}
521
- for category, col_name in zip(categories, col_names):
522
- if category == "other":
523
- mask = ~df[column].is_in(categories[:-1])
524
- else:
525
- mask = df[column] == category
526
- new_cols[col_name] = mask.cast(pl.UInt8)
527
- return new_cols
528
-
529
- elif backend == 'cudf':
530
- new_cols = {}
531
- for category, col_name in zip(categories, col_names):
532
- if category == "other":
533
- mask = ~df[column].isin(categories[:-1])
534
- else:
535
- mask = df[column] == category
536
- new_cols[col_name] = mask.astype('uint8')
537
- return new_cols
538
-
539
- elif backend == 'pandas':
540
- new_cols = {}
541
- for category, col_name in zip(categories, col_names):
542
- if category == "other":
543
- mask = ~df[column].isin(categories[:-1])
544
- else:
545
- mask = df[column] == category
546
- new_cols[col_name] = mask.astype(np.uint8)
547
- return new_cols
548
-
549
- else:
550
- raise EncodingError(f"Unsupported backend: {backend}")
551
-
552
-
553
- def _add_columns_native(df, new_columns: Dict[str, Any], backend: BackendType):
554
- """Add new columns to dataframe using native operations."""
555
- if backend == 'polars':
556
- return df.with_columns([pl.lit(col_data).alias(col_name)
557
- for col_name, col_data in new_columns.items()])
558
- elif backend == 'cudf' or backend == 'pandas':
559
- result = df.copy()
560
- for col_name, col_data in new_columns.items():
561
- result[col_name] = col_data
562
- return result
563
- else:
564
- raise EncodingError(f"Unsupported backend: {backend}")
565
-
566
-
567
- def _position_columns_native(df, new_columns: List[str], position: str,
568
- reference_col: str, backend: BackendType):
569
- """Position columns using native operations (for polars/cudf)."""
570
- all_cols = list(df.columns)
571
- existing_cols = [c for c in all_cols if c not in new_columns]
572
-
573
- if position == "start":
574
- new_order = new_columns + existing_cols
575
- elif position == "after":
576
- if reference_col in existing_cols:
577
- idx = existing_cols.index(reference_col) + 1
578
- new_order = existing_cols[:idx] + new_columns + existing_cols[idx:]
579
- else:
580
- new_order = existing_cols + new_columns
581
- elif position == "before":
582
- if reference_col in existing_cols:
583
- idx = existing_cols.index(reference_col)
584
- new_order = existing_cols[:idx] + new_columns + existing_cols[idx:]
585
- else:
586
- new_order = new_columns + existing_cols
587
- else: # "end"
588
- new_order = existing_cols + new_columns
589
-
590
- return df[new_order]
591
-
592
-
593
- def _drop_column_native(df, column: str, backend: BackendType):
594
- """Drop column using native operations."""
595
- if backend == 'polars':
596
- return df.drop(column)
597
- elif backend == 'cudf' or backend == 'pandas':
598
- return df.drop(columns=[column])
599
- else:
600
- raise EncodingError(f"Unsupported backend: {backend}")