additory 0.1.0a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87) hide show
  1. additory/__init__.py +15 -0
  2. additory/analysis/__init__.py +48 -0
  3. additory/analysis/cardinality.py +126 -0
  4. additory/analysis/correlations.py +124 -0
  5. additory/analysis/distributions.py +376 -0
  6. additory/analysis/quality.py +158 -0
  7. additory/analysis/scan.py +400 -0
  8. additory/augment/__init__.py +24 -0
  9. additory/augment/augmentor.py +653 -0
  10. additory/augment/builtin_lists.py +430 -0
  11. additory/augment/distributions.py +22 -0
  12. additory/augment/forecast.py +1132 -0
  13. additory/augment/list_registry.py +177 -0
  14. additory/augment/smote.py +320 -0
  15. additory/augment/strategies.py +883 -0
  16. additory/common/__init__.py +157 -0
  17. additory/common/backend.py +355 -0
  18. additory/common/column_utils.py +191 -0
  19. additory/common/distributions.py +737 -0
  20. additory/common/exceptions.py +62 -0
  21. additory/common/lists.py +229 -0
  22. additory/common/patterns.py +240 -0
  23. additory/common/resolver.py +567 -0
  24. additory/common/sample_data.py +182 -0
  25. additory/common/validation.py +197 -0
  26. additory/core/__init__.py +27 -0
  27. additory/core/ast_builder.py +165 -0
  28. additory/core/backends/__init__.py +23 -0
  29. additory/core/backends/arrow_bridge.py +476 -0
  30. additory/core/backends/cudf_bridge.py +355 -0
  31. additory/core/column_positioning.py +358 -0
  32. additory/core/compiler_polars.py +166 -0
  33. additory/core/config.py +342 -0
  34. additory/core/enhanced_cache_manager.py +1119 -0
  35. additory/core/enhanced_matchers.py +473 -0
  36. additory/core/enhanced_version_manager.py +325 -0
  37. additory/core/executor.py +59 -0
  38. additory/core/integrity_manager.py +477 -0
  39. additory/core/loader.py +190 -0
  40. additory/core/logging.py +24 -0
  41. additory/core/memory_manager.py +547 -0
  42. additory/core/namespace_manager.py +657 -0
  43. additory/core/parser.py +176 -0
  44. additory/core/polars_expression_engine.py +551 -0
  45. additory/core/registry.py +176 -0
  46. additory/core/sample_data_manager.py +492 -0
  47. additory/core/user_namespace.py +751 -0
  48. additory/core/validator.py +27 -0
  49. additory/dynamic_api.py +308 -0
  50. additory/expressions/__init__.py +26 -0
  51. additory/expressions/engine.py +551 -0
  52. additory/expressions/parser.py +176 -0
  53. additory/expressions/proxy.py +546 -0
  54. additory/expressions/registry.py +313 -0
  55. additory/expressions/samples.py +492 -0
  56. additory/synthetic/__init__.py +101 -0
  57. additory/synthetic/api.py +220 -0
  58. additory/synthetic/common_integration.py +314 -0
  59. additory/synthetic/config.py +262 -0
  60. additory/synthetic/engines.py +529 -0
  61. additory/synthetic/exceptions.py +180 -0
  62. additory/synthetic/file_managers.py +518 -0
  63. additory/synthetic/generator.py +702 -0
  64. additory/synthetic/generator_parser.py +68 -0
  65. additory/synthetic/integration.py +319 -0
  66. additory/synthetic/models.py +241 -0
  67. additory/synthetic/pattern_resolver.py +573 -0
  68. additory/synthetic/performance.py +469 -0
  69. additory/synthetic/polars_integration.py +464 -0
  70. additory/synthetic/proxy.py +60 -0
  71. additory/synthetic/schema_parser.py +685 -0
  72. additory/synthetic/validator.py +553 -0
  73. additory/utilities/__init__.py +53 -0
  74. additory/utilities/encoding.py +600 -0
  75. additory/utilities/games.py +300 -0
  76. additory/utilities/keys.py +8 -0
  77. additory/utilities/lookup.py +103 -0
  78. additory/utilities/matchers.py +216 -0
  79. additory/utilities/resolvers.py +286 -0
  80. additory/utilities/settings.py +167 -0
  81. additory/utilities/units.py +746 -0
  82. additory/utilities/validators.py +153 -0
  83. additory-0.1.0a1.dist-info/METADATA +293 -0
  84. additory-0.1.0a1.dist-info/RECORD +87 -0
  85. additory-0.1.0a1.dist-info/WHEEL +5 -0
  86. additory-0.1.0a1.dist-info/licenses/LICENSE +21 -0
  87. additory-0.1.0a1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,746 @@
1
+ # additory/utilities/units.py
2
+ # Unit conversion system with hardcoded conversion factors
3
+
4
+ import polars as pl
5
+ import pandas as pd
6
+ import re
7
+ from typing import Any, Dict, List, Optional, Union, Tuple
8
+ from datetime import datetime
9
+
10
+ from ..core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
11
+ from ..core.logging import log_info, log_warning
12
+
13
+
14
# Hardcoded unit conversion definitions.
#
# Every factor converts ONE of the named unit into the category's base unit
# (base unit factor = 1.0).  Imperial / US customary factors use the exact
# international definitions (1 lb = 0.45359237 kg, 1 US gal = 231 in^3 =
# 3.785411784 L, 1 ft = 0.3048 m) instead of rounded approximations, so
# derived units stay mutually consistent (16 oz == 1 lb, 4 qt == 1 gal, ...).
#
# Lookups performed by the converter are done on lower-cased unit strings;
# mixed-case keys such as "L" or "C" exist so that the configured
# ``base_unit`` itself is a valid key.
UNIT_CONVERSIONS = {
    # LENGTH - base unit: meter (m)
    "length": {
        "base_unit": "m",
        "units": {
            "m": 1.0,                 # meter (base)
            "meter": 1.0,
            "meters": 1.0,
            "metre": 1.0,
            "metres": 1.0,
            "cm": 0.01,               # centimeter
            "centimeter": 0.01,
            "centimeters": 0.01,
            "centimetre": 0.01,
            "centimetres": 0.01,
            "mm": 0.001,              # millimeter
            "millimeter": 0.001,
            "millimeters": 0.001,
            "millimetre": 0.001,
            "millimetres": 0.001,
            "km": 1000.0,             # kilometer
            "kilometer": 1000.0,
            "kilometers": 1000.0,
            "kilometre": 1000.0,
            "kilometres": 1000.0,
            "in": 0.0254,             # inch (exact)
            "inch": 0.0254,
            "inches": 0.0254,
            "ft": 0.3048,             # foot (exact)
            "foot": 0.3048,
            "feet": 0.3048,
            "yd": 0.9144,             # yard (exact)
            "yard": 0.9144,
            "yards": 0.9144,
            "mi": 1609.344,           # mile (exact)
            "mile": 1609.344,
            "miles": 1609.344,
        },
    },

    # WEIGHT/MASS - base unit: kilogram (kg)
    "weight": {
        "base_unit": "kg",
        "units": {
            "kg": 1.0,                # kilogram (base)
            "kilogram": 1.0,
            "kilograms": 1.0,
            "g": 0.001,               # gram
            "gram": 0.001,
            "grams": 0.001,
            "mg": 0.000001,           # milligram
            "milligram": 0.000001,
            "milligrams": 0.000001,
            "lb": 0.45359237,         # avoirdupois pound (exact)
            "lbs": 0.45359237,
            "pound": 0.45359237,
            "pounds": 0.45359237,
            "oz": 0.028349523125,     # ounce = 1/16 lb (exact)
            "ounce": 0.028349523125,
            "ounces": 0.028349523125,
            "ton": 1000.0,            # metric ton
            "tonne": 1000.0,
            "tonnes": 1000.0,
            "stone": 6.35029318,      # stone = 14 lb (exact)
            "stones": 6.35029318,
        },
    },

    # TEMPERATURE - base unit: Celsius (°C)
    # Temperature needs an offset as well as a factor:
    #     celsius = (value + offset) * factor
    "temperature": {
        "base_unit": "C",
        "units": {
            "C": {"factor": 1.0, "offset": 0.0},            # Celsius (base)
            "c": {"factor": 1.0, "offset": 0.0},
            "celsius": {"factor": 1.0, "offset": 0.0},
            "F": {"factor": 5/9, "offset": -32},            # (F - 32) * 5/9
            "f": {"factor": 5/9, "offset": -32},
            "fahrenheit": {"factor": 5/9, "offset": -32},
            "K": {"factor": 1.0, "offset": -273.15},        # K - 273.15
            "k": {"factor": 1.0, "offset": -273.15},
            "kelvin": {"factor": 1.0, "offset": -273.15},
        },
    },

    # VOLUME - base unit: liter (L)
    "volume": {
        "base_unit": "L",
        "units": {
            "L": 1.0,                 # liter (base)
            "l": 1.0,
            "liter": 1.0,
            "liters": 1.0,
            "litre": 1.0,
            "litres": 1.0,
            "mL": 0.001,              # milliliter
            "ml": 0.001,
            "milliliter": 0.001,
            "milliliters": 0.001,
            "millilitre": 0.001,
            "millilitres": 0.001,
            "gal": 3.785411784,       # US gallon = 231 in^3 (exact)
            "gallon": 3.785411784,
            "gallons": 3.785411784,
            "qt": 0.946352946,        # US quart = 1/4 gal
            "quart": 0.946352946,
            "quarts": 0.946352946,
            "pt": 0.473176473,        # US pint = 1/8 gal
            "pint": 0.473176473,
            "pints": 0.473176473,
            "cup": 0.2365882365,      # US cup = 1/16 gal
            "cups": 0.2365882365,
            "fl_oz": 0.0295735295625,         # US fluid ounce = 1/128 gal
            "floz": 0.0295735295625,
            "fluid_ounce": 0.0295735295625,
            "fluid_ounces": 0.0295735295625,
        },
    },

    # TIME - base unit: second (s)
    "time": {
        "base_unit": "s",
        "units": {
            "s": 1.0,                 # second (base)
            "sec": 1.0,
            "second": 1.0,
            "seconds": 1.0,
            "min": 60.0,              # minute
            "minute": 60.0,
            "minutes": 60.0,
            "hr": 3600.0,             # hour
            "hour": 3600.0,
            "hours": 3600.0,
            "day": 86400.0,           # day
            "days": 86400.0,
            "week": 604800.0,         # week
            "weeks": 604800.0,
            "month": 2629746.0,       # average month (1/12 of the average year)
            "months": 2629746.0,
            "year": 31556952.0,       # average year (365.2425 days)
            "years": 31556952.0,
        },
    },

    # AREA - base unit: square meter (m²)
    "area": {
        "base_unit": "m2",
        "units": {
            "m2": 1.0,                # square meter (base)
            "m²": 1.0,
            "sq_m": 1.0,
            "square_meter": 1.0,
            "square_meters": 1.0,
            "square_metre": 1.0,
            "square_metres": 1.0,
            "cm2": 0.0001,            # square centimeter
            "cm²": 0.0001,
            "sq_cm": 0.0001,
            "km2": 1000000.0,         # square kilometer
            "km²": 1000000.0,
            "sq_km": 1000000.0,
            "in2": 0.00064516,        # square inch (exact)
            "in²": 0.00064516,
            "sq_in": 0.00064516,
            "ft2": 0.09290304,        # square foot (exact)
            "ft²": 0.09290304,
            "sq_ft": 0.09290304,
            "acre": 4046.8564224,     # acre = 43560 ft^2 (exact)
            "acres": 4046.8564224,
        },
    },
}
178
+
179
+
180
class UnitConversionError(Exception):
    """Error type raised by this module when a unit conversion operation fails."""
183
+
184
+
185
def sanitize_column_name(col_name: str) -> str:
    """
    Convert a column name to a Python-friendly identifier.

    Rules:
    - Replace every character outside ``[a-zA-Z0-9_]`` with an underscore
    - Collapse consecutive underscores
    - Remove leading/trailing underscores
    - Prefix with ``col_`` if the result starts with a digit
    - Convert to lowercase for consistency

    Args:
        col_name: Original column name. Non-string names (for example the
            integer labels pandas assigns by default) are converted with
            ``str()`` first.

    Returns:
        Sanitized column name, or ``"unnamed_column"`` when the name is
        ``None`` or nothing usable remains after sanitizing.

    Examples:
        "height collected on site" → "height_collected_on_site"
        "Patient Height - Site A" → "patient_height_site_a"
        "Weight (kg)" → "weight_kg"
        "temp@location#1" → "temp_location_1"
        0 → "col_0"
    """
    # Only None means "no name".  A plain truthiness test would also discard
    # legitimate falsy labels such as the integer column 0.
    if col_name is None:
        return "unnamed_column"

    # Replace anything that is not an ASCII letter, digit or underscore,
    # then collapse the runs of underscores that this produces.
    sanitized = re.sub(r'[^a-zA-Z0-9_]', '_', str(col_name))
    sanitized = re.sub(r'_+', '_', sanitized).strip('_')

    # Empty input or input made up solely of special characters.
    if not sanitized:
        return "unnamed_column"

    # Identifiers may not start with a digit.
    if sanitized[0].isdigit():
        sanitized = f"col_{sanitized}"

    return sanitized.lower()
231
+
232
+
233
def generate_safe_column_name(base_name: str, existing_columns: List[str]) -> str:
    """
    Pick a column name that is not already present in ``existing_columns``.

    Args:
        base_name: Preferred column name
        existing_columns: Column names that are already taken

    Returns:
        ``base_name`` when it is free, otherwise ``base_name`` followed by the
        first free numeric suffix (``_1``, ``_2``, ...)
    """
    candidate = base_name
    suffix = 0

    # Keep bumping the numeric suffix until the candidate is unused.
    while candidate in existing_columns:
        suffix += 1
        candidate = f"{base_name}_{suffix}"

    return candidate
252
+
253
+
254
class UnitConverter:
    """
    Unit conversion system with Polars processing.

    ``harmonize_units`` converts the input dataframe to Arrow and then Polars
    through ``EnhancedArrowBridge``, adds a harmonized value column and a
    harmonized unit column, and converts the result back to the original
    backend.  Running counters about the performed conversions are kept in
    ``conversion_stats``.
    """

    def __init__(self):
        self.arrow_bridge = EnhancedArrowBridge()
        self.conversion_stats = self._new_stats()

        log_info("[units] Unit Converter initialized")

    @staticmethod
    def _new_stats() -> Dict[str, Any]:
        """Return a fresh, zeroed statistics dictionary."""
        return {
            "total_conversions": 0,
            "successful_conversions": 0,
            "failed_conversions": 0,
            "categories_detected": set(),
            "units_processed": set()
        }

    @staticmethod
    def _normalized_unit_expr(unit_column: str) -> "pl.Expr":
        """
        Build the expression used to match unit strings during conversion.

        The normalization mirrors what category detection applies to the
        unique units (``str(unit).strip().lower()``); without the strip, a
        unit such as ``" cm "`` would be detected but never converted.
        """
        return pl.col(unit_column).cast(pl.Utf8).str.strip_chars().str.to_lowercase()

    def harmonize_units(self, df: Any, value_column: str, unit_column: str,
                        target_unit: Optional[str] = None,
                        position: str = "end") -> Any:
        """
        Harmonize units in a dataframe

        Args:
            df: Input dataframe (pandas, polars, cudf)
            value_column: Column containing numeric values
            unit_column: Column containing unit strings
            target_unit: Target unit (the detected category's base unit if None)
            position: Where to place new columns ("end", "start", int, "after:col", "before:col")

        Returns:
            Dataframe with harmonized columns added

        Raises:
            UnitConversionError: If validation or conversion fails; the
                original exception is attached as ``__cause__``.
        """
        start_time = datetime.now()

        try:
            # Validate inputs
            self._validate_inputs(df, value_column, unit_column)

            # Detect backend
            backend_type = self.arrow_bridge.detect_backend(df)
            log_info(f"[units] Processing {backend_type} dataframe")

            # Convert to Arrow then Polars for processing
            arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
            polars_df = pl.from_arrow(arrow_table)

            # Perform unit conversion in Polars
            result_polars = self._harmonize_units_polars(
                polars_df, value_column, unit_column, target_unit
            )

            # Convert back to original backend
            result_df = self.arrow_bridge.from_arrow(result_polars.to_arrow(), backend_type)

            # Position whichever columns the conversion actually added
            original_columns = set(df.columns)
            actual_new_columns = [col for col in result_df.columns if col not in original_columns]

            result_df = self._apply_positioning(
                result_df,
                actual_new_columns,
                position,
                backend_type
            )

            # Update statistics
            execution_time = (datetime.now() - start_time).total_seconds() * 1000
            self.conversion_stats["total_conversions"] += 1
            self.conversion_stats["successful_conversions"] += 1

            log_info(f"[units] Unit harmonization completed in {execution_time:.1f}ms")

            return result_df

        except Exception as e:
            self.conversion_stats["total_conversions"] += 1
            self.conversion_stats["failed_conversions"] += 1
            # Chain the cause so the original traceback is not lost.
            raise UnitConversionError(f"Unit harmonization failed: {e}") from e

    def _harmonize_units_polars(self, df: "pl.DataFrame", value_column: str,
                                unit_column: str, target_unit: Optional[str] = None) -> "pl.DataFrame":
        """
        Perform unit conversion using Polars

        Args:
            df: Polars DataFrame
            value_column: Column containing numeric values
            unit_column: Column containing unit strings
            target_unit: Target unit (the detected category's base unit if None)

        Returns:
            Polars DataFrame with the harmonized value and unit columns added

        Raises:
            UnitConversionError: If no unit category can be detected or the
                requested target unit is not part of the detected category
        """
        # Get unique units in the data
        unique_units = [
            str(unit).strip()
            for unit in df[unit_column].unique().to_list()
            if unit is not None
        ]

        # Convert to lowercase for matching but preserve original case for logging
        unique_units_lower = [unit.lower() for unit in unique_units]

        log_info(f"[units] Found units: {unique_units}")

        # Detect unit category using lowercase units
        category = self._detect_unit_category(unique_units_lower)
        if not category:
            raise UnitConversionError(f"Could not detect unit category for units: {unique_units}")

        log_info(f"[units] Detected category: {category}")
        self.conversion_stats["categories_detected"].add(category)

        # Determine target unit
        if target_unit is None:
            target_unit = UNIT_CONVERSIONS[category]["base_unit"]
        else:
            # Validate target unit
            target_unit_lower = target_unit.lower()
            if target_unit_lower not in UNIT_CONVERSIONS[category]["units"]:
                available_units = list(UNIT_CONVERSIONS[category]["units"].keys())
                raise UnitConversionError(
                    f"Target unit '{target_unit}' not supported for category '{category}'. "
                    f"Available units: {available_units}"
                )
            target_unit = target_unit_lower

        log_info(f"[units] Target unit: {target_unit}")

        # Create conversion mapping
        conversion_map = self._create_conversion_map(category, target_unit)

        # Generate clean, descriptive column names
        harmonized_value_col = f"{sanitize_column_name(value_column)}_{target_unit}"
        harmonized_unit_col = f"{sanitize_column_name(unit_column)}_{target_unit}"

        # Never overwrite a column the caller already has: ``with_columns``
        # would silently replace a same-named column otherwise.
        _, harmonized_value_col, harmonized_unit_col = self._handle_column_conflicts(
            df, harmonized_value_col, harmonized_unit_col, "polars"
        )

        log_info(f"[units] Creating harmonized columns: {harmonized_value_col}, {harmonized_unit_col}")

        # Temperature needs offset handling; everything else is factor-based
        if category == "temperature":
            convert = self._convert_temperature_polars
        else:
            convert = self._convert_standard_polars

        result_df = convert(df, value_column, unit_column,
                            conversion_map, target_unit,
                            harmonized_value_col, harmonized_unit_col)

        # Update statistics
        self.conversion_stats["units_processed"].update(unique_units_lower)

        return result_df

    def _detect_unit_category(self, units: List[str]) -> Optional[str]:
        """
        Detect the category of units based on the units present

        Args:
            units: List of unit strings (lowercase)

        Returns:
            Name of the category matching the most entries of ``units``
            (the first such category in ``UNIT_CONVERSIONS`` order on a tie),
            or None if no unit matches any category
        """
        category_scores = {}

        for category, config in UNIT_CONVERSIONS.items():
            known_units = config["units"]
            score = sum(1 for unit in units if unit in known_units)
            if score > 0:
                category_scores[category] = score

        if not category_scores:
            return None

        # Every stored score is > 0, so the best one is a genuine match.
        return max(category_scores, key=category_scores.get)

    def _create_conversion_map(self, category: str, target_unit: str) -> Dict[str, Any]:
        """
        Create the mapping used to convert each unit of a category

        Args:
            category: Unit category
            target_unit: Target unit (must be a key of the category's units)

        Returns:
            For "temperature": unit -> ``{"factor", "offset"}`` config that
            converts to the base unit.  For every other category:
            unit -> multiplicative factor converting to ``target_unit``.

        Raises:
            KeyError: If ``target_unit`` is not a unit of ``category``
        """
        config = UNIT_CONVERSIONS[category]
        target_factor = config["units"][target_unit]

        if category == "temperature":
            # Temperature requires special handling; hand out the raw configs
            return dict(config["units"])

        # unit -> base is ``unit_factor``; base -> target is ``1 / target_factor``
        return {
            unit: unit_factor / target_factor
            for unit, unit_factor in config["units"].items()
        }

    def _convert_standard_polars(self, df: "pl.DataFrame", value_column: str,
                                 unit_column: str, conversion_map: Dict[str, float],
                                 target_unit: str, harmonized_value_col: str,
                                 harmonized_unit_col: str) -> "pl.DataFrame":
        """
        Convert units using standard factor-based conversion in Polars

        Rows whose unit is not a key of ``conversion_map`` keep their value
        unchanged (factor 1.0).
        """
        unit_mapping_expr = self._normalized_unit_expr(unit_column)

        # Build when-then chain for conversion factors
        conversion_expr = None
        for unit, factor in conversion_map.items():
            # The unit column is lower-cased before comparison, so mixed-case
            # keys ("L", "mL") can never match; skip the dead branches.
            if unit != unit.lower():
                continue
            condition = unit_mapping_expr == unit
            branch = pl.when(condition) if conversion_expr is None else conversion_expr.when(condition)
            conversion_expr = branch.then(pl.lit(factor))

        # Default to 1.0 for unknown units (no conversion)
        conversion_expr = conversion_expr.otherwise(pl.lit(1.0))

        # Apply conversion with clear column names
        return df.with_columns([
            (pl.col(value_column) * conversion_expr).alias(harmonized_value_col),
            pl.lit(target_unit).alias(harmonized_unit_col)
        ])

    def _convert_temperature_polars(self, df: "pl.DataFrame", value_column: str,
                                    unit_column: str, conversion_map: Dict[str, Dict],
                                    target_unit: str, harmonized_value_col: str,
                                    harmonized_unit_col: str) -> "pl.DataFrame":
        """
        Convert temperature units with offset handling in Polars

        Works in two steps: every value is first converted to Celsius (the
        base unit) and the Celsius value is then converted to ``target_unit``.
        Rows with an unknown unit are assumed to already be in Celsius.
        """
        unit_mapping_expr = self._normalized_unit_expr(unit_column)

        # Step 1: Convert all units to Celsius first
        celsius_conversion_expr = None

        for unit, config in conversion_map.items():
            # Mixed-case keys ("C", "F", "K") can never equal the lower-cased column
            if unit != unit.lower():
                continue
            condition = unit_mapping_expr == unit

            # Convert to Celsius: (value + offset) * factor
            unit_to_celsius = (pl.col(value_column) + pl.lit(config["offset"])) * pl.lit(config["factor"])

            branch = (
                pl.when(condition) if celsius_conversion_expr is None
                else celsius_conversion_expr.when(condition)
            )
            celsius_conversion_expr = branch.then(unit_to_celsius)

        # Default to original value for unknown units (assume already Celsius)
        celsius_conversion_expr = celsius_conversion_expr.otherwise(pl.col(value_column))

        # Step 2: Convert from Celsius to target unit
        target = target_unit.lower()
        if target in ('f', 'fahrenheit'):
            # Convert Celsius to Fahrenheit: F = C * 9/5 + 32
            final_conversion_expr = celsius_conversion_expr * pl.lit(9/5) + pl.lit(32)
        elif target in ('k', 'kelvin'):
            # Convert Celsius to Kelvin: K = C + 273.15
            final_conversion_expr = celsius_conversion_expr + pl.lit(273.15)
        else:
            # Celsius target (or unknown target): keep the Celsius value
            final_conversion_expr = celsius_conversion_expr

        # Apply conversion with clear column names
        return df.with_columns([
            final_conversion_expr.alias(harmonized_value_col),
            pl.lit(target_unit).alias(harmonized_unit_col)
        ])

    def _validate_inputs(self, df: Any, value_column: str, unit_column: str):
        """
        Validate input parameters

        Raises:
            UnitConversionError: If ``df`` is not a supported dataframe, is
                empty, or lacks ``value_column`` / ``unit_column``
        """
        # Check if dataframe is supported
        if not self._is_dataframe(df):
            raise UnitConversionError(
                f"Input must be a DataFrame (pandas, polars, or cudf). Got: {type(df)}"
            )

        # Check if dataframe is empty (DataFrame truthiness is not usable here)
        if len(df) == 0:
            raise UnitConversionError("Input dataframe is empty")

        # Check if columns exist
        df_columns = list(df.columns)

        if value_column not in df_columns:
            raise UnitConversionError(
                f"Value column '{value_column}' not found. Available columns: {df_columns}"
            )

        if unit_column not in df_columns:
            raise UnitConversionError(
                f"Unit column '{unit_column}' not found. Available columns: {df_columns}"
            )

        # Name conflicts cannot be checked here because the target unit may be
        # auto-detected later; they are resolved once the final names are known.
        sanitized_value_col = sanitize_column_name(value_column)
        sanitized_unit_col = sanitize_column_name(unit_column)

        log_info(f"[units] Sanitized column names: {value_column} → {sanitized_value_col}, {unit_column} → {sanitized_unit_col}")

    def _is_dataframe(self, obj: Any) -> bool:
        """
        Check if object is a supported dataframe type

        Accepts pandas DataFrames and any object whose class (or one of its
        base classes) is named ``DataFrame`` and lives in the ``polars`` or
        ``cudf`` package.  Matching on the class name rather than on a
        substring of ``str(type(obj))`` keeps Series / LazyFrame objects out.
        """
        if isinstance(obj, pd.DataFrame):
            return True

        for klass in type(obj).__mro__:
            root_package = (getattr(klass, "__module__", "") or "").split(".")[0]
            if klass.__name__ == "DataFrame" and root_package in ("polars", "cudf"):
                return True

        return False

    def _apply_positioning(self, df: Any, new_columns: List[str], position: str,
                           backend_type: str) -> Any:
        """
        Apply column positioning using existing positioning system

        Positioning is best-effort: on any failure a warning is logged and
        ``df`` is returned unchanged.
        """
        try:
            from ..core.column_positioning import position_columns
            return position_columns(df, new_columns, position)
        except Exception as e:
            log_warning(f"[units] Column positioning failed: {e}. Using default 'end'.")
            return df

    def _handle_column_conflicts(self, df: Any, harmonized_value_col: str,
                                 harmonized_unit_col: str, backend_type: str) -> Tuple[Any, str, str]:
        """
        Handle column name conflicts by generating safe alternatives

        Args:
            df: Input dataframe
            harmonized_value_col: Desired harmonized value column name
            harmonized_unit_col: Desired harmonized unit column name
            backend_type: Backend type (column access is identical for all
                supported backends, so it is not consulted)

        Returns:
            Tuple of (dataframe, final_value_col_name, final_unit_col_name)
        """
        existing_columns = list(df.columns)

        # Generate safe column names; the unit name must also avoid the value
        # name just chosen, in case both sanitize to the same identifier.
        final_value_col = generate_safe_column_name(harmonized_value_col, existing_columns)
        final_unit_col = generate_safe_column_name(
            harmonized_unit_col, existing_columns + [final_value_col]
        )

        # Log if conflicts were resolved
        if final_value_col != harmonized_value_col:
            log_warning(f"[units] Column conflict resolved: {harmonized_value_col} → {final_value_col}")
        if final_unit_col != harmonized_unit_col:
            log_warning(f"[units] Column conflict resolved: {harmonized_unit_col} → {final_unit_col}")

        return df, final_value_col, final_unit_col

    def get_supported_units(self, category: Optional[str] = None) -> Dict[str, List[str]]:
        """
        Get list of supported units

        Args:
            category: Specific category to get units for (optional)

        Returns:
            Dictionary of category -> list of units

        Raises:
            UnitConversionError: If ``category`` is given but unknown
        """
        if category:
            if category not in UNIT_CONVERSIONS:
                raise UnitConversionError(f"Unknown category: {category}")
            return {category: list(UNIT_CONVERSIONS[category]["units"].keys())}

        return {
            cat: list(config["units"].keys())
            for cat, config in UNIT_CONVERSIONS.items()
        }

    def get_conversion_stats(self) -> Dict[str, Any]:
        """
        Get conversion statistics

        Returns:
            A copy of the counters with the two sets turned into sorted lists
            (deterministic output) plus ``success_rate`` in percent
            (0.0 when nothing has been converted yet)
        """
        stats = self.conversion_stats.copy()
        stats["categories_detected"] = sorted(stats["categories_detected"])
        stats["units_processed"] = sorted(stats["units_processed"])

        if stats["total_conversions"] > 0:
            stats["success_rate"] = (stats["successful_conversions"] / stats["total_conversions"]) * 100
        else:
            stats["success_rate"] = 0.0

        return stats

    def reset_stats(self):
        """Reset conversion statistics"""
        self.conversion_stats = self._new_stats()
        log_info("[units] Conversion statistics reset")
704
+
705
+
706
# Module-wide converter, created on first use
_unit_converter = None


def get_unit_converter() -> UnitConverter:
    """Return the shared ``UnitConverter``, constructing it on the first call."""
    global _unit_converter
    if _unit_converter is not None:
        return _unit_converter

    _unit_converter = UnitConverter()
    return _unit_converter
715
+
716
+
717
def harmonize_units(df: Any, value_column: str, unit_column: str,
                    target_unit: Optional[str] = None,
                    position: str = "end") -> Any:
    """
    Module-level shortcut for ``UnitConverter.harmonize_units`` on the shared converter.

    Args:
        df: Input dataframe (pandas, polars, cudf)
        value_column: Name of the column holding the numeric values
        unit_column: Name of the column holding the unit strings
        target_unit: Unit to convert to (auto-detected if None)
        position: Placement of the newly added columns

    Returns:
        Whatever the shared converter returns: the dataframe with the
        harmonized columns added
    """
    return get_unit_converter().harmonize_units(
        df, value_column, unit_column, target_unit, position
    )
735
+
736
+
737
def get_supported_units(category: Optional[str] = None) -> Dict[str, List[str]]:
    """Return the supported units from the shared converter, optionally for one category."""
    return get_unit_converter().get_supported_units(category)
741
+
742
+
743
def get_conversion_stats() -> Dict[str, Any]:
    """Return the conversion statistics reported by the shared converter."""
    return get_unit_converter().get_conversion_stats()