additory 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. additory/__init__.py +15 -0
  2. additory/analysis/__init__.py +48 -0
  3. additory/analysis/cardinality.py +126 -0
  4. additory/analysis/correlations.py +124 -0
  5. additory/analysis/distributions.py +376 -0
  6. additory/analysis/quality.py +158 -0
  7. additory/analysis/scan.py +400 -0
  8. additory/augment/__init__.py +24 -0
  9. additory/augment/augmentor.py +653 -0
  10. additory/augment/builtin_lists.py +430 -0
  11. additory/augment/distributions.py +22 -0
  12. additory/augment/forecast.py +1132 -0
  13. additory/augment/list_registry.py +177 -0
  14. additory/augment/smote.py +320 -0
  15. additory/augment/strategies.py +883 -0
  16. additory/common/__init__.py +157 -0
  17. additory/common/backend.py +355 -0
  18. additory/common/column_utils.py +191 -0
  19. additory/common/distributions.py +737 -0
  20. additory/common/exceptions.py +62 -0
  21. additory/common/lists.py +229 -0
  22. additory/common/patterns.py +240 -0
  23. additory/common/resolver.py +567 -0
  24. additory/common/sample_data.py +182 -0
  25. additory/common/validation.py +197 -0
  26. additory/core/__init__.py +27 -0
  27. additory/core/ast_builder.py +165 -0
  28. additory/core/backends/__init__.py +23 -0
  29. additory/core/backends/arrow_bridge.py +476 -0
  30. additory/core/backends/cudf_bridge.py +355 -0
  31. additory/core/column_positioning.py +358 -0
  32. additory/core/compiler_polars.py +166 -0
  33. additory/core/config.py +342 -0
  34. additory/core/enhanced_cache_manager.py +1119 -0
  35. additory/core/enhanced_matchers.py +473 -0
  36. additory/core/enhanced_version_manager.py +325 -0
  37. additory/core/executor.py +59 -0
  38. additory/core/integrity_manager.py +477 -0
  39. additory/core/loader.py +190 -0
  40. additory/core/logging.py +24 -0
  41. additory/core/memory_manager.py +547 -0
  42. additory/core/namespace_manager.py +657 -0
  43. additory/core/parser.py +176 -0
  44. additory/core/polars_expression_engine.py +551 -0
  45. additory/core/registry.py +176 -0
  46. additory/core/sample_data_manager.py +492 -0
  47. additory/core/user_namespace.py +751 -0
  48. additory/core/validator.py +27 -0
  49. additory/dynamic_api.py +308 -0
  50. additory/expressions/__init__.py +26 -0
  51. additory/expressions/engine.py +551 -0
  52. additory/expressions/parser.py +176 -0
  53. additory/expressions/proxy.py +546 -0
  54. additory/expressions/registry.py +313 -0
  55. additory/expressions/samples.py +492 -0
  56. additory/synthetic/__init__.py +101 -0
  57. additory/synthetic/api.py +220 -0
  58. additory/synthetic/common_integration.py +314 -0
  59. additory/synthetic/config.py +262 -0
  60. additory/synthetic/engines.py +529 -0
  61. additory/synthetic/exceptions.py +180 -0
  62. additory/synthetic/file_managers.py +518 -0
  63. additory/synthetic/generator.py +702 -0
  64. additory/synthetic/generator_parser.py +68 -0
  65. additory/synthetic/integration.py +319 -0
  66. additory/synthetic/models.py +241 -0
  67. additory/synthetic/pattern_resolver.py +573 -0
  68. additory/synthetic/performance.py +469 -0
  69. additory/synthetic/polars_integration.py +464 -0
  70. additory/synthetic/proxy.py +60 -0
  71. additory/synthetic/schema_parser.py +685 -0
  72. additory/synthetic/validator.py +553 -0
  73. additory/utilities/__init__.py +53 -0
  74. additory/utilities/encoding.py +600 -0
  75. additory/utilities/games.py +300 -0
  76. additory/utilities/keys.py +8 -0
  77. additory/utilities/lookup.py +103 -0
  78. additory/utilities/matchers.py +216 -0
  79. additory/utilities/resolvers.py +286 -0
  80. additory/utilities/settings.py +167 -0
  81. additory/utilities/units.py +746 -0
  82. additory/utilities/validators.py +153 -0
  83. additory-0.1.0a1.dist-info/METADATA +293 -0
  84. additory-0.1.0a1.dist-info/RECORD +87 -0
  85. additory-0.1.0a1.dist-info/WHEEL +5 -0
  86. additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
  87. additory-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,157 @@
"""
Common Utilities Module

Shared functionality used by both augment and synthetic modules:
- Distribution functions (normal, uniform, skewed, etc.)
- List file management (.list format)
- Pattern file management (.properties format)
- Fallback resolution logic

This module eliminates code duplication and provides consistent behavior
across augment and synthetic data generation.
"""

from .distributions import (
    generate_normal,
    generate_uniform,
    generate_skewed,
    generate_beta,
    generate_gamma,
    generate_exponential_dist,
    generate_kde,
    generate_multivariate_normal,
    generate_distribution_values,
    estimate_distribution_params,
    calculate_skewness,
    detect_distribution_type,
    DistributionType,
)

from .lists import (
    load_list_file,
    parse_list_file,
    get_list_values,
    list_all_lists,
)

from .patterns import (
    load_properties_file,
    parse_properties_file,
    get_pattern,
    list_all_patterns,
)

from .resolver import (
    resolve_pattern,
    resolve_with_logging,
    PatternResolutionResult,
    PreferMode,
)

from .backend import (
    detect_backend,
    is_dataframe,
    to_polars,
    from_polars,
    BackendType,
)

from .validation import (
    validate_dataframe,
    validate_columns_exist,
    validate_positive_number,
    validate_non_negative_number,
    validate_parameter_choice,
    validate_ratio,
    validate_string_not_empty,
    validate_integer_in_range,
)

# NOTE: ValidationError was previously imported from BOTH .validation and
# .exceptions; because the .exceptions import executed last, it silently
# shadowed the .validation binding. It is now imported exactly once, from
# .exceptions, which preserves the effective binding while making it explicit.
# NOTE: the re-exported LookupError shadows the builtin of the same name
# within this namespace; import it qualified if the builtin is needed.
from .exceptions import (
    AdditoryError,
    ValidationError,
    BackendError,
    ConversionError,
    ExpressionError,
    ConfigurationError,
    UnitConversionError,
    EncodingError,
    LookupError,
    SyntheticDataError,
    AugmentError,
)

from .column_utils import (
    sanitize_column_name,
    generate_safe_column_name,
    validate_column_name,
    truncate_column_name,
    generate_column_names_with_prefix_suffix,
)

__all__ = [
    # Distribution functions
    "generate_normal",
    "generate_uniform",
    "generate_skewed",
    "generate_beta",
    "generate_gamma",
    "generate_exponential_dist",
    "generate_kde",
    "generate_multivariate_normal",
    "generate_distribution_values",
    "estimate_distribution_params",
    "calculate_skewness",
    "detect_distribution_type",
    "DistributionType",
    # List management
    "load_list_file",
    "parse_list_file",
    "get_list_values",
    "list_all_lists",
    # Pattern management
    "load_properties_file",
    "parse_properties_file",
    "get_pattern",
    "list_all_patterns",
    # Resolution
    "resolve_pattern",
    "resolve_with_logging",
    "PatternResolutionResult",
    "PreferMode",
    # Backend detection
    "detect_backend",
    "is_dataframe",
    "to_polars",
    "from_polars",
    "BackendType",
    # Validation
    "validate_dataframe",
    "validate_columns_exist",
    "validate_positive_number",
    "validate_non_negative_number",
    "validate_parameter_choice",
    "validate_ratio",
    "validate_string_not_empty",
    "validate_integer_in_range",
    # Exceptions (ValidationError listed once; it was previously duplicated)
    "AdditoryError",
    "ValidationError",
    "BackendError",
    "ConversionError",
    "ExpressionError",
    "ConfigurationError",
    "UnitConversionError",
    "EncodingError",
    "LookupError",
    "SyntheticDataError",
    "AugmentError",
    # Column utilities
    "sanitize_column_name",
    "generate_safe_column_name",
    "validate_column_name",
    "truncate_column_name",
    "generate_column_names_with_prefix_suffix",
]
@@ -0,0 +1,355 @@
"""
Unified Backend Detection System

Provides consistent backend detection across all additory modules.
"""

import pandas as pd
from typing import Any, Literal, Dict

# Optional imports: polars and cuDF are soft dependencies; the HAS_* flags
# let the rest of this module degrade gracefully when they are missing.
try:
    import polars as pl
    HAS_POLARS = True
except ImportError:
    HAS_POLARS = False
    pl = None

try:
    import cudf
    HAS_CUDF = True
# cuDF can fail at import time with non-ImportError exceptions (e.g. CUDA
# driver/runtime initialization errors), so catch broadly. The previous
# `(ImportError, Exception)` tuple was redundant: Exception already
# subsumes ImportError.
except Exception:
    HAS_CUDF = False
    cudf = None


# Backend identifiers shared by the detection and conversion helpers below.
BackendType = Literal['pandas', 'polars', 'cudf']
ExecutionMode = Literal['cpu', 'gpu']
+
29
+
30
def detect_backend(df: Any) -> BackendType:
    """
    Identify which dataframe library *df* belongs to.

    Args:
        df: Dataframe to classify.

    Returns:
        One of 'pandas', 'polars', or 'cudf'.

    Raises:
        TypeError: If df is not a supported dataframe type.

    Use this when code needs the SPECIFIC backend — e.g. for native
    operations or type-specific conversions:

        >>> backend = detect_backend(df)
        >>> if backend == 'polars':
        ...     result = df.select(...)
        >>> elif backend == 'pandas':
        ...     result = df[...]
    """
    if isinstance(df, pd.DataFrame):
        return 'pandas'
    if HAS_POLARS and isinstance(df, pl.DataFrame):
        return 'polars'
    if HAS_CUDF and isinstance(df, cudf.DataFrame):
        return 'cudf'

    # Build the "supported types" list from whatever is actually installed.
    supported = "pandas.DataFrame"
    if HAS_POLARS:
        supported += ", polars.DataFrame"
    if HAS_CUDF:
        supported += ", cudf.DataFrame"
    raise TypeError(
        f"Unsupported dataframe type: {type(df)}. "
        f"Supported types: {supported}"
    )
68
+
69
+
70
def detect_execution_mode(df: Any, preference: str = None) -> ExecutionMode:
    """
    Choose the execution mode (CPU vs GPU) for expression processing.

    Args:
        df: Dataframe whose backend informs the default choice.
        preference: User preference ('cpu', 'gpu', or None for auto).

    Returns:
        'cpu' or 'gpu'.

    A 'gpu' preference is honored only when cuDF is installed; otherwise
    the dataframe's own backend decides (cuDF data is already on the GPU).

    Examples:
        >>> mode = detect_execution_mode(df, preference='gpu')
        >>> if mode == 'gpu':
        ...     # Use GPU-accelerated execution
    """
    # Also validates df: raises TypeError for unsupported types.
    backend = detect_backend(df)

    # Explicit user preference takes priority.
    if preference == 'gpu' and HAS_CUDF:
        return 'gpu'
    if preference == 'cpu':
        return 'cpu'

    # Auto mode: follow where the data already lives.
    return 'gpu' if backend == 'cudf' else 'cpu'
104
+
105
+
106
def is_dataframe(obj: Any) -> bool:
    """
    Report whether *obj* is any supported dataframe type.

    Args:
        obj: Object to check.

    Returns:
        True for pandas, polars (if installed), or cuDF (if installed)
        DataFrames; False otherwise.

    A fast boolean check — never raises, unlike detect_backend().

    Examples:
        >>> if is_dataframe(obj):
        ...     process(obj)
    """
    if isinstance(obj, pd.DataFrame):
        return True
    if HAS_POLARS and isinstance(obj, pl.DataFrame):
        return True
    return bool(HAS_CUDF and isinstance(obj, cudf.DataFrame))
129
+
130
+
131
def get_available_backends() -> Dict[str, bool]:
    """
    Map each known backend name to its availability in this environment.

    Returns:
        Dict with keys 'pandas', 'polars', 'cudf'. pandas is a hard
        dependency of this module, so it is always True.

    Examples:
        >>> backends = get_available_backends()
        >>> if backends['polars']:
        ...     # Use polars-specific features
    """
    availability = {'pandas': True}
    availability['polars'] = HAS_POLARS
    availability['cudf'] = HAS_CUDF
    return availability
148
+
149
+
150
def check_backend_available(backend: BackendType) -> bool:
    """
    Report whether one specific backend can be used.

    Args:
        backend: Backend to check ('pandas', 'polars', 'cudf').

    Returns:
        True if that backend is importable; unknown names report False.

    Examples:
        >>> if check_backend_available('polars'):
        ...     # Safe to use polars
    """
    return get_available_backends().get(backend, False)
166
+
167
+
168
+ # ============================================================================
169
+ # Arrow Bridge Helpers - Polars-Only Architecture
170
+ # ============================================================================
171
+
172
def get_arrow_bridge():
    """
    Return the process-wide singleton Arrow bridge.

    Returns:
        EnhancedArrowBridge instance, created lazily on first call and
        cached as an attribute on this function.

    Use this for all cross-backend conversions; the bridge handles
    pandas/polars/cuDF via Arrow.
    """
    # Imported locally so merely importing this module does not pull in
    # the bridge machinery.
    from additory.core.backends.arrow_bridge import EnhancedArrowBridge

    instance = getattr(get_arrow_bridge, '_instance', None)
    if instance is None:
        instance = EnhancedArrowBridge()
        get_arrow_bridge._instance = instance
    return instance
190
+
191
+
192
def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
    """
    Convert any supported dataframe to Polars via the Arrow bridge.

    Primary conversion entry point for the Polars-only architecture: all
    operations (expressions, augment, etc.) normalize their inputs through
    this function before processing.

    Args:
        df: Input dataframe (pandas, polars, or cuDF).
        backend_type: Source backend type; auto-detected when None.

    Returns:
        Polars DataFrame.

    Raises:
        TypeError: If df is not a supported dataframe type.
        RuntimeError: If Polars is missing or the conversion fails.

    Examples:
        >>> pl_df = to_polars(pandas_df)   # pandas -> polars
        >>> pl_df = to_polars(cudf_df)     # cuDF -> polars
        >>> pl_df = to_polars(polars_df)   # already polars: no-op
    """
    if not HAS_POLARS:
        raise RuntimeError(
            "Polars is not available. Install with: pip install polars"
        )

    # No conversion needed when the input is already Polars.
    if isinstance(df, pl.DataFrame):
        return df

    if not is_dataframe(df):
        raise TypeError(
            f"Expected pandas, polars, or cuDF DataFrame, got {type(df)}"
        )

    if backend_type is None:
        backend_type = detect_backend(df)

    # Round-trip through Arrow: source backend -> Arrow table -> Polars.
    try:
        bridge = get_arrow_bridge()
        table = bridge.to_arrow(df, backend_type)
        return bridge.from_arrow(table, "polars")
    except Exception as e:
        raise RuntimeError(
            f"Failed to convert {backend_type} DataFrame to Polars: {e}"
        ) from e
250
+
251
+
252
def from_polars(pl_df: 'pl.DataFrame', target_backend: BackendType) -> Any:
    """
    Convert a Polars dataframe back to the caller's original backend.

    Used after processing in Polars to hand results back in the user's
    original format.

    Args:
        pl_df: Polars DataFrame.
        target_backend: One of 'pandas', 'polars', 'cudf'.

    Returns:
        DataFrame in the target format.

    Raises:
        TypeError: If pl_df is not a Polars DataFrame.
        ValueError: If target_backend is not a supported name.
        RuntimeError: If Polars/cuDF are unavailable or conversion fails.

    Examples:
        >>> pandas_df = from_polars(pl_df, 'pandas')
        >>> cudf_df = from_polars(pl_df, 'cudf')
        >>> pl_df = from_polars(pl_df, 'polars')   # no-op
    """
    if not HAS_POLARS:
        raise RuntimeError(
            "Polars is not available. Install with: pip install polars"
        )

    if not isinstance(pl_df, pl.DataFrame):
        raise TypeError(
            f"Expected Polars DataFrame, got {type(pl_df)}"
        )

    if target_backend not in ('pandas', 'polars', 'cudf'):
        raise ValueError(
            f"Invalid target_backend: {target_backend}. "
            f"Must be 'pandas', 'polars', or 'cudf'"
        )

    # Already in the requested format.
    if target_backend == 'polars':
        return pl_df

    if target_backend == 'cudf' and not HAS_CUDF:
        raise RuntimeError(
            "cuDF is not available. Install with: pip install cudf"
        )

    # Polars -> Arrow table -> target backend.
    try:
        bridge = get_arrow_bridge()
        table = bridge.to_arrow(pl_df, "polars")
        return bridge.from_arrow(table, target_backend)
    except Exception as e:
        raise RuntimeError(
            f"Failed to convert Polars DataFrame to {target_backend}: {e}"
        ) from e
319
+
320
+
321
def convert_via_polars(df: Any, target_backend: BackendType = None) -> Any:
    """
    Convert a dataframe to a target backend by round-tripping through Polars.

    Convenience wrapper combining to_polars() and from_polars() — useful
    for pure format conversions without any processing in between.

    Args:
        df: Input dataframe (pandas, polars, or cuDF).
        target_backend: Target backend; defaults to the input's backend.

    Returns:
        DataFrame in the target format (the original object when source
        and target backends match).

    Examples:
        >>> cudf_df = convert_via_polars(pandas_df, 'cudf')
        >>> normalized_df = convert_via_polars(df)   # same-backend no-op
    """
    source = detect_backend(df)
    target = source if target_backend is None else target_backend

    # Same format in and out: nothing to do.
    if source == target:
        return df

    return from_polars(to_polars(df, source), target)
@@ -0,0 +1,191 @@
1
+ """
2
+ Common Column Utilities
3
+
4
+ Provides column name handling utilities shared across modules.
5
+ """
6
+
7
+ import re
8
+ from typing import List
9
+ from .exceptions import ValidationError
10
+
11
+
12
def sanitize_column_name(col_name: str) -> str:
    """
    Convert a column name into a Python-friendly identifier.

    The transformation:
    - replaces spaces and special characters with underscores
    - collapses runs of underscores into one
    - strips leading/trailing underscores
    - prefixes "col_" when the result would start with a digit
    - lowercases the result for consistency

    Args:
        col_name: Original column name (None/empty yields a placeholder).

    Returns:
        Sanitized name safe for use as a Python identifier.

    Examples:
        >>> sanitize_column_name("height collected on site")
        'height_collected_on_site'
        >>> sanitize_column_name("Weight (kg)")
        'weight_kg'
        >>> sanitize_column_name("temp@location#1")
        'temp_location_1'
    """
    # None / empty input gets a stable placeholder name.
    if not col_name:
        return "unnamed_column"

    text = str(col_name)
    text = re.sub(r'[^a-zA-Z0-9_]', '_', text)   # illegal chars -> underscore
    text = re.sub(r'_+', '_', text).strip('_')   # collapse runs, trim ends

    # Python identifiers cannot start with a digit.
    if text and text[0].isdigit():
        text = f"col_{text}"

    text = text.lower()
    # Everything may have been stripped away (e.g. input was "___").
    return text or "unnamed_column"
62
+
63
+
64
def generate_safe_column_name(base_name: str, existing_columns: List[str]) -> str:
    """
    Return a column name based on *base_name* that is not already taken.

    Args:
        base_name: Desired column name.
        existing_columns: Column names already present.

    Returns:
        base_name itself when free, otherwise base_name with the first
        unused numeric suffix (_1, _2, ...).

    Examples:
        >>> generate_safe_column_name("value", ["value", "value_1"])
        'value_2'
        >>> generate_safe_column_name("new_col", ["col1", "col2"])
        'new_col'
    """
    if base_name not in existing_columns:
        return base_name

    # Probe increasing suffixes until one is free.
    n = 1
    candidate = f"{base_name}_{n}"
    while candidate in existing_columns:
        n += 1
        candidate = f"{base_name}_{n}"
    return candidate
89
+
90
+
91
def validate_column_name(name: str) -> None:
    """
    Check that *name* is a usable column name.

    Args:
        name: Column name to validate.

    Raises:
        ValidationError: If name is not a string, or is empty/whitespace.

    Examples:
        >>> validate_column_name("valid_column")
        >>> validate_column_name("")  # Raises ValidationError
    """
    if not isinstance(name, str):
        raise ValidationError(f"Column name must be a string, got {type(name)}")
    if not name.strip():
        raise ValidationError("Column name cannot be empty")
110
+
111
+
112
def truncate_column_name(name: str, max_length: int = 63,
                         preserve_end: bool = True) -> str:
    """
    Truncate a column name to at most *max_length* characters.

    When preserve_end is True, the middle of the name is dropped and both
    the start and the end are kept, since distinguishing parts (numeric
    suffixes, etc.) often live at the end. Note that truncation does NOT
    guarantee uniqueness: two long names differing only in the removed
    middle section will collide.

    Args:
        name: Column name to truncate.
        max_length: Maximum length (default 63 for SQL identifier limits).
        preserve_end: If True, keep the tail of the name instead of plain
            front truncation.

    Returns:
        The name unchanged if short enough, otherwise truncated to exactly
        max_length characters.

    Examples:
        >>> truncate_column_name("very_long_column_name_with_suffix_01", max_length=20)
        'very_long__suffix_01'
        >>> truncate_column_name("short", max_length=20)
        'short'
    """
    if len(name) <= max_length:
        return name

    if preserve_end:
        # Keep start and end, drop the middle.
        keep_start = max_length // 2
        keep_end = max_length - keep_start
        return name[:keep_start] + name[-keep_end:]

    # Simple truncation from the front.
    return name[:max_length]
142
+
143
+
144
def generate_column_names_with_prefix_suffix(
    base_name: str,
    values: List[str],
    prefix: str = None,
    suffix: str = None,
    max_length: int = 63
) -> List[str]:
    """
    Build one column name per value, optionally wrapped in prefix/suffix.

    Each name is "<prefix>_<base_name>_<value>_<suffix>" with empty
    prefix/suffix parts omitted, truncated to max_length when necessary.

    Args:
        base_name: Base column name.
        values: Values to create column names for.
        prefix: Optional prefix.
        suffix: Optional suffix.
        max_length: Maximum column name length.

    Returns:
        List of generated column names, in the same order as values.

    Examples:
        >>> generate_column_names_with_prefix_suffix(
        ...     "color", ["red", "blue"], prefix="ohe"
        ... )
        ['ohe_color_red', 'ohe_color_blue']
    """
    names = []
    for value in values:
        # Assemble the parts; base name and value are always included.
        pieces = []
        if prefix:
            pieces.append(prefix)
        pieces.extend([base_name, str(value)])
        if suffix:
            pieces.append(suffix)

        candidate = "_".join(pieces)
        # Only truncate when the assembled name is too long.
        if len(candidate) > max_length:
            candidate = truncate_column_name(candidate, max_length)
        names.append(candidate)

    return names