additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,749 +0,0 @@
1
- # additory/utilities/units.py
2
- # Unit conversion system with hardcoded conversion factors
3
-
4
- import polars as pl
5
- import pandas as pd
6
- import re
7
- from typing import Any, Dict, List, Optional, Union, Tuple
8
- from datetime import datetime
9
-
10
- from ..core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
11
- from ..core.logging import log_info, log_warning
12
-
13
-
14
# Hardcoded unit conversion definitions.
#
# Layout: category -> {"base_unit": <key>, "units": {<unit string>: <spec>}}.
# For every category except temperature, <spec> is a float factor such that
#     value_in_base_unit = value * factor
# (the base unit itself therefore has factor 1.0).
# Temperature entries are {"factor", "offset"} dicts instead, applied as
#     value_in_celsius = (value + offset) * factor
#
# Several spellings map to the same factor so that free-text unit columns
# match more often. Both US and British spellings, singular and plural, are
# listed for the base length/area units.
UNIT_CONVERSIONS = {
    # LENGTH - base unit: meter (m)
    "length": {
        "base_unit": "m",
        "units": {
            "m": 1.0,               # meter (base)
            "meter": 1.0,           # meter (alternative)
            "meters": 1.0,          # meter (US plural)
            "metre": 1.0,           # meter (British singular)
            "metres": 1.0,          # meter (alternative)
            "cm": 0.01,             # centimeter
            "centimeter": 0.01,     # centimeter (alternative)
            "centimetre": 0.01,     # centimeter (alternative)
            "mm": 0.001,            # millimeter
            "millimeter": 0.001,    # millimeter (alternative)
            "millimetre": 0.001,    # millimeter (alternative)
            "km": 1000.0,           # kilometer
            "kilometer": 1000.0,    # kilometer (alternative)
            "kilometre": 1000.0,    # kilometer (alternative)
            "in": 0.0254,           # inch
            "inch": 0.0254,         # inch (alternative)
            "inches": 0.0254,       # inch (alternative)
            "ft": 0.3048,           # foot
            "foot": 0.3048,         # foot (alternative)
            "feet": 0.3048,         # foot (alternative)
            "yd": 0.9144,           # yard
            "yard": 0.9144,         # yard (alternative)
            "yards": 0.9144,        # yard (alternative)
            "mi": 1609.344,         # mile
            "mile": 1609.344,       # mile (alternative)
            "miles": 1609.344,      # mile (alternative)
        },
    },

    # WEIGHT/MASS - base unit: kilogram (kg)
    "weight": {
        "base_unit": "kg",
        "units": {
            "kg": 1.0,              # kilogram (base)
            "kilogram": 1.0,        # kilogram (alternative)
            "kilograms": 1.0,       # kilogram (alternative)
            "g": 0.001,             # gram
            "gram": 0.001,          # gram (alternative)
            "grams": 0.001,         # gram (alternative)
            "mg": 0.000001,         # milligram
            "milligram": 0.000001,  # milligram (alternative)
            "milligrams": 0.000001, # milligram (alternative)
            "lb": 0.453592,         # pound
            "lbs": 0.453592,        # pound (alternative)
            "pound": 0.453592,      # pound (alternative)
            "pounds": 0.453592,     # pound (alternative)
            "oz": 0.0283495,        # ounce
            "ounce": 0.0283495,     # ounce (alternative)
            "ounces": 0.0283495,    # ounce (alternative)
            "ton": 1000.0,          # metric ton
            "tonne": 1000.0,        # metric ton (alternative)
            "tonnes": 1000.0,       # metric ton (alternative)
            "stone": 6.35029,       # stone (14 pounds)
            "stones": 6.35029,      # stone (alternative)
        },
    },

    # TEMPERATURE - base unit: Celsius (°C)
    # Note: Temperature requires special handling due to offset conversions
    "temperature": {
        "base_unit": "C",
        "units": {
            "C": {"factor": 1.0, "offset": 0.0},            # Celsius (base)
            "c": {"factor": 1.0, "offset": 0.0},            # Celsius (lowercase)
            "celsius": {"factor": 1.0, "offset": 0.0},      # Celsius (alternative)
            "F": {"factor": 5/9, "offset": -32},            # Fahrenheit: (F-32)*5/9 = (F+(-32))*5/9
            "f": {"factor": 5/9, "offset": -32},            # Fahrenheit (lowercase)
            "fahrenheit": {"factor": 5/9, "offset": -32},   # Fahrenheit (alternative)
            "K": {"factor": 1.0, "offset": -273.15},        # Kelvin: K-273.15 = (K+(-273.15))*1.0
            "k": {"factor": 1.0, "offset": -273.15},        # Kelvin (lowercase)
            "kelvin": {"factor": 1.0, "offset": -273.15},   # Kelvin (alternative)
        },
    },

    # VOLUME - base unit: liter (L)
    "volume": {
        "base_unit": "L",
        "units": {
            "L": 1.0,                   # liter (base)
            "l": 1.0,                   # liter (lowercase)
            "liter": 1.0,               # liter (alternative)
            "liters": 1.0,              # liter (alternative)
            "litre": 1.0,               # liter (alternative)
            "litres": 1.0,              # liter (alternative)
            "mL": 0.001,                # milliliter
            "ml": 0.001,                # milliliter (lowercase)
            "milliliter": 0.001,        # milliliter (alternative)
            "milliliters": 0.001,       # milliliter (alternative)
            "millilitre": 0.001,        # milliliter (alternative)
            "millilitres": 0.001,       # milliliter (alternative)
            "gal": 3.78541,             # US gallon
            "gallon": 3.78541,          # US gallon (alternative)
            "gallons": 3.78541,         # US gallon (alternative)
            "qt": 0.946353,             # US quart
            "quart": 0.946353,          # US quart (alternative)
            "quarts": 0.946353,         # US quart (alternative)
            "pt": 0.473176,             # US pint
            "pint": 0.473176,           # US pint (alternative)
            "pints": 0.473176,          # US pint (alternative)
            "cup": 0.236588,            # US cup
            "cups": 0.236588,           # US cup (alternative)
            "fl_oz": 0.0295735,         # US fluid ounce
            "floz": 0.0295735,          # US fluid ounce (alternative)
            "fluid_ounce": 0.0295735,   # US fluid ounce (alternative)
            "fluid_ounces": 0.0295735,  # US fluid ounce (alternative)
        },
    },

    # TIME - base unit: second (s)
    "time": {
        "base_unit": "s",
        "units": {
            "s": 1.0,               # second (base)
            "sec": 1.0,             # second (alternative)
            "second": 1.0,          # second (alternative)
            "seconds": 1.0,         # second (alternative)
            "min": 60.0,            # minute
            "minute": 60.0,         # minute (alternative)
            "minutes": 60.0,        # minute (alternative)
            "hr": 3600.0,           # hour
            "hour": 3600.0,         # hour (alternative)
            "hours": 3600.0,        # hour (alternative)
            "day": 86400.0,         # day
            "days": 86400.0,        # day (alternative)
            "week": 604800.0,       # week
            "weeks": 604800.0,      # week (alternative)
            "month": 2629746.0,     # average month (30.44 days)
            "months": 2629746.0,    # average month (alternative)
            "year": 31556952.0,     # average year (365.24 days)
            "years": 31556952.0,    # average year (alternative)
        },
    },

    # AREA - base unit: square meter (m²)
    "area": {
        "base_unit": "m2",
        "units": {
            "m2": 1.0,              # square meter (base)
            "m²": 1.0,              # square meter (alternative)
            "sq_m": 1.0,            # square meter (alternative)
            "square_meter": 1.0,    # square meter (alternative)
            "square_meters": 1.0,   # square meter (US plural)
            "square_metre": 1.0,    # square meter (British singular)
            "square_metres": 1.0,   # square meter (alternative)
            "cm2": 0.0001,          # square centimeter
            "cm²": 0.0001,          # square centimeter (alternative)
            "sq_cm": 0.0001,        # square centimeter (alternative)
            "km2": 1000000.0,       # square kilometer
            "km²": 1000000.0,       # square kilometer (alternative)
            "sq_km": 1000000.0,     # square kilometer (alternative)
            "in2": 0.00064516,      # square inch
            "in²": 0.00064516,      # square inch (alternative)
            "sq_in": 0.00064516,    # square inch (alternative)
            "ft2": 0.092903,        # square foot
            "ft²": 0.092903,        # square foot (alternative)
            "sq_ft": 0.092903,      # square foot (alternative)
            "acre": 4046.86,        # acre
            "acres": 4046.86,       # acre (alternative)
        },
    },
}
178
-
179
-
180
class UnitConversionError(Exception):
    """Error type signalling that a unit conversion operation could not be completed."""
183
-
184
-
185
def sanitize_column_name(col_name: str) -> str:
    """
    Turn an arbitrary column label into a lowercase, identifier-like name.

    Steps, in order:
    - every character outside ``[a-zA-Z0-9_]`` becomes an underscore
    - runs of underscores collapse to one
    - leading/trailing underscores are dropped
    - a name starting with a digit gets a ``col_`` prefix
    - the result is lowercased

    Args:
        col_name: Original column name (falsy values are accepted)

    Returns:
        The sanitized name, or ``"unnamed_column"`` when the input is falsy
        or nothing survives sanitizing.

    Examples:
        "height collected on site" → "height_collected_on_site"
        "Patient Height - Site A" → "patient_height_site_a"
        "Weight (kg)" → "weight_kg"
        "temp@location#1" → "temp_location_1"
    """
    fallback = "unnamed_column"

    # None, "" and other falsy labels have nothing to sanitize
    if not col_name:
        return fallback

    underscored = re.sub(r'[^a-zA-Z0-9_]', '_', str(col_name))
    cleaned = re.sub(r'_+', '_', underscored).strip('_')

    # The label consisted solely of separators/special characters
    if not cleaned:
        return fallback

    if cleaned[0].isdigit():
        cleaned = f"col_{cleaned}"

    return cleaned.lower()
231
-
232
-
233
def generate_safe_column_name(base_name: str, existing_columns: List[str]) -> str:
    """
    Pick a column name that is not already present in ``existing_columns``.

    Args:
        base_name: Desired column name
        existing_columns: Column names that are already taken

    Returns:
        ``base_name`` itself when it is free, otherwise ``base_name`` with the
        lowest free numeric suffix (``_1``, ``_2``, ...).
    """
    candidate = base_name
    suffix = 0

    # Probe base_name, base_name_1, base_name_2, ... until one is unused
    while candidate in existing_columns:
        suffix += 1
        candidate = f"{base_name}_{suffix}"

    return candidate
252
-
253
-
254
class UnitConverter:
    """
    Unit conversion system with Polars processing.

    Input dataframes are converted to Arrow and then to Polars through the
    Arrow bridge, harmonized with Polars expressions, and converted back to
    the backend they came from. Running counters about the conversions
    performed are kept in ``self.conversion_stats``.
    """

    def __init__(self):
        try:
            self.arrow_bridge = EnhancedArrowBridge()
        except ArrowBridgeError as exc:
            # Construction stays best-effort; harmonize_units() reports the
            # missing bridge explicitly instead of failing on a None attribute.
            log_warning(f"[units] Arrow bridge unavailable: {exc}")
            self.arrow_bridge = None
        self.conversion_stats = self._fresh_stats()

        log_info("[units] Unit Converter initialized")

    @staticmethod
    def _fresh_stats() -> Dict[str, Any]:
        """Return a new, zeroed statistics dictionary."""
        return {
            "total_conversions": 0,
            "successful_conversions": 0,
            "failed_conversions": 0,
            "categories_detected": set(),
            "units_processed": set()
        }

    def harmonize_units(self, df: Any, value_column: str, unit_column: str,
                        target_unit: Optional[str] = None,
                        position: str = "end") -> Any:
        """
        Harmonize units in a dataframe

        Args:
            df: Input dataframe (pandas, polars, cudf)
            value_column: Column containing numeric values
            unit_column: Column containing unit strings
            target_unit: Target unit (auto-detected if None)
            position: Where to place new columns ("end", "start", int, "after:col", "before:col")

        Returns:
            Dataframe with harmonized columns added

        Raises:
            UnitConversionError: If conversion fails for any reason; the
                original exception is attached as ``__cause__``.
        """
        start_time = datetime.now()

        try:
            # Validate inputs
            self._validate_inputs(df, value_column, unit_column)

            if self.arrow_bridge is None:
                raise UnitConversionError(
                    "Arrow bridge is not available; cannot convert the dataframe for processing"
                )

            # Detect backend
            backend_type = self.arrow_bridge.detect_backend(df)
            log_info(f"[units] Processing {backend_type} dataframe")

            # Convert to Arrow then Polars for processing
            arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
            polars_df = pl.from_arrow(arrow_table)

            # Perform unit conversion in Polars
            result_polars = self._harmonize_units_polars(
                polars_df, value_column, unit_column, target_unit
            )

            # Convert back to original backend
            result_df = self.arrow_bridge.from_arrow(result_polars.to_arrow(), backend_type)

            # Position whichever columns the processing step actually added
            original_columns = set(df.columns)
            actual_new_columns = [col for col in result_df.columns if col not in original_columns]

            result_df = self._apply_positioning(
                result_df,
                actual_new_columns,
                position,
                backend_type
            )

            # Update statistics
            execution_time = (datetime.now() - start_time).total_seconds() * 1000
            self.conversion_stats["total_conversions"] += 1
            self.conversion_stats["successful_conversions"] += 1

            log_info(f"[units] Unit harmonization completed in {execution_time:.1f}ms")

            return result_df

        except Exception as e:
            self.conversion_stats["total_conversions"] += 1
            self.conversion_stats["failed_conversions"] += 1
            # Chain the cause so the original traceback is not lost
            raise UnitConversionError(f"Unit harmonization failed: {e}") from e

    def _harmonize_units_polars(self, df: "pl.DataFrame", value_column: str,
                                unit_column: str, target_unit: Optional[str] = None) -> "pl.DataFrame":
        """
        Perform unit conversion using Polars

        Args:
            df: Polars DataFrame
            value_column: Column containing numeric values
            unit_column: Column containing unit strings
            target_unit: Target unit (auto-detected if None)

        Returns:
            Polars DataFrame with harmonized columns

        Raises:
            UnitConversionError: If no unit category matches the data, or the
                requested target unit does not belong to the detected category.
        """
        # Get unique units in the data
        raw_units = df[unit_column].unique().to_list()
        unique_units = [str(unit).strip() for unit in raw_units if unit is not None]

        # Convert to lowercase for matching but preserve original case for logging
        unique_units_lower = [unit.lower() for unit in unique_units]

        log_info(f"[units] Found units: {unique_units}")

        # Detect unit category using lowercase units
        category = self._detect_unit_category(unique_units_lower)
        if not category:
            raise UnitConversionError(f"Could not detect unit category for units: {unique_units}")

        log_info(f"[units] Detected category: {category}")
        self.conversion_stats["categories_detected"].add(category)

        known_units = UNIT_CONVERSIONS[category]["units"]

        # Determine target unit
        if target_unit is None:
            target_unit = UNIT_CONVERSIONS[category]["base_unit"]
        else:
            # Validate target unit
            target_unit_lower = target_unit.lower()
            if target_unit_lower not in known_units:
                available_units = list(known_units.keys())
                raise UnitConversionError(
                    f"Target unit '{target_unit}' not supported for category '{category}'. "
                    f"Available units: {available_units}"
                )
            target_unit = target_unit_lower

        log_info(f"[units] Target unit: {target_unit}")

        # Rows with a unit outside the detected category fall through to the
        # "otherwise" branch of the conversion expression (value unchanged);
        # make that visible instead of silently relabelling them.
        unrecognized = [
            original for original, lowered in zip(unique_units, unique_units_lower)
            if lowered not in known_units
        ]
        if unrecognized:
            log_warning(
                f"[units] Units not recognized in category '{category}' "
                f"are left unconverted: {unrecognized}"
            )

        # Create conversion mapping
        conversion_map = self._create_conversion_map(category, target_unit)

        # Generate clean, descriptive column names that do not overwrite
        # existing columns (with_columns replaces same-named columns)
        sanitized_value_col = sanitize_column_name(value_column)
        sanitized_unit_col = sanitize_column_name(unit_column)

        df, harmonized_value_col, harmonized_unit_col = self._handle_column_conflicts(
            df,
            f"{sanitized_value_col}_{target_unit}",
            f"{sanitized_unit_col}_{target_unit}",
            "polars"
        )

        log_info(f"[units] Creating harmonized columns: {harmonized_value_col}, {harmonized_unit_col}")

        # Apply conversions using Polars
        if category == "temperature":
            # Special handling for temperature (offset conversions)
            result_df = self._convert_temperature_polars(df, value_column, unit_column,
                                                         conversion_map, target_unit,
                                                         harmonized_value_col, harmonized_unit_col)
        else:
            # Standard factor-based conversions
            result_df = self._convert_standard_polars(df, value_column, unit_column,
                                                      conversion_map, target_unit,
                                                      harmonized_value_col, harmonized_unit_col)

        # Update statistics
        self.conversion_stats["units_processed"].update(unique_units_lower)

        return result_df

    @staticmethod
    def _normalized_unit_expr(unit_column: str) -> "pl.Expr":
        """
        Build the expression used to match unit strings row by row.

        Mirrors the ``str(unit).strip().lower()`` normalization applied during
        category detection, so a unit that was detected is also converted.
        """
        return pl.col(unit_column).cast(pl.Utf8).str.strip_chars().str.to_lowercase()

    def _detect_unit_category(self, units: List[str]) -> Optional[str]:
        """
        Detect the category of units based on the units present

        Args:
            units: List of unit strings (lowercase)

        Returns:
            The category whose unit table contains the most of the given
            units (first such category in table order on ties), or None when
            no unit matches any category.
        """
        category_scores = {}

        for category, config in UNIT_CONVERSIONS.items():
            score = sum(1 for unit in units if unit in config["units"])
            if score > 0:
                category_scores[category] = score

        if not category_scores:
            return None

        # Only categories with at least one match are scored
        return max(category_scores, key=category_scores.get)

    def _create_conversion_map(self, category: str, target_unit: str) -> Dict[str, Any]:
        """
        Create conversion factors mapping from each unit to target unit

        Args:
            category: Unit category
            target_unit: Target unit (must be a key of the category's table)

        Returns:
            For temperature: unit -> {"factor", "offset"} spec (to Celsius).
            For every other category: unit -> multiplicative factor that
            converts a value in that unit directly to ``target_unit``.
        """
        config = UNIT_CONVERSIONS[category]
        target_factor = config["units"][target_unit]

        if category == "temperature":
            # Temperature requires special handling; pass the specs through
            return dict(config["units"])

        # unit -> base is `* unit_factor`, base -> target is `/ target_factor`
        return {
            unit: unit_factor / target_factor
            for unit, unit_factor in config["units"].items()
        }

    def _convert_standard_polars(self, df: "pl.DataFrame", value_column: str,
                                 unit_column: str, conversion_map: Dict[str, float],
                                 target_unit: str, harmonized_value_col: str,
                                 harmonized_unit_col: str) -> "pl.DataFrame":
        """
        Convert units using standard factor-based conversion in Polars

        Adds ``harmonized_value_col`` (value multiplied by the row's factor)
        and ``harmonized_unit_col`` (the literal ``target_unit``).
        """
        unit_mapping_expr = self._normalized_unit_expr(unit_column)

        # Build when-then chain for conversion factors
        conversion_expr = None
        for unit, factor in conversion_map.items():
            condition = unit_mapping_expr == unit
            if conversion_expr is None:
                conversion_expr = pl.when(condition).then(pl.lit(factor))
            else:
                conversion_expr = conversion_expr.when(condition).then(pl.lit(factor))

        if conversion_expr is None:
            # Empty map: nothing to match, every row keeps its value
            factor_expr = pl.lit(1.0)
        else:
            # Default to 1.0 for unknown units (no conversion)
            factor_expr = conversion_expr.otherwise(pl.lit(1.0))

        # Apply conversion with clear column names
        return df.with_columns([
            (pl.col(value_column) * factor_expr).alias(harmonized_value_col),
            pl.lit(target_unit).alias(harmonized_unit_col)
        ])

    def _convert_temperature_polars(self, df: "pl.DataFrame", value_column: str,
                                    unit_column: str, conversion_map: Dict[str, Dict],
                                    target_unit: str, harmonized_value_col: str,
                                    harmonized_unit_col: str) -> "pl.DataFrame":
        """
        Convert temperature units with offset handling in Polars

        Temperature conversion requires two steps:
        1. Convert source unit to Celsius (base unit)
        2. Convert Celsius to target unit
        """
        unit_mapping_expr = self._normalized_unit_expr(unit_column)

        # Step 1: Convert all units to Celsius first
        celsius_conversion_expr = None

        for unit, config in conversion_map.items():
            condition = unit_mapping_expr == unit

            # Convert to Celsius: (value + offset) * factor
            unit_to_celsius = (
                (pl.col(value_column) + pl.lit(config["offset"])) * pl.lit(config["factor"])
            )

            if celsius_conversion_expr is None:
                celsius_conversion_expr = pl.when(condition).then(unit_to_celsius)
            else:
                celsius_conversion_expr = celsius_conversion_expr.when(condition).then(unit_to_celsius)

        if celsius_conversion_expr is None:
            celsius_expr = pl.col(value_column)
        else:
            # Default to original value for unknown units (assume already Celsius)
            celsius_expr = celsius_conversion_expr.otherwise(pl.col(value_column))

        # Step 2: Convert from Celsius to target unit
        target_key = target_unit.lower()
        if target_key in ("f", "fahrenheit"):
            # Convert Celsius to Fahrenheit: F = C * 9/5 + 32
            final_conversion_expr = celsius_expr * pl.lit(9/5) + pl.lit(32)
        elif target_key in ("k", "kelvin"):
            # Convert Celsius to Kelvin: K = C + 273.15
            final_conversion_expr = celsius_expr + pl.lit(273.15)
        else:
            # Target is Celsius (or unknown): keep the Celsius value
            final_conversion_expr = celsius_expr

        # Apply conversion with clear column names
        return df.with_columns([
            final_conversion_expr.alias(harmonized_value_col),
            pl.lit(target_unit).alias(harmonized_unit_col)
        ])

    def _validate_inputs(self, df: Any, value_column: str, unit_column: str):
        """
        Validate input parameters

        Raises:
            UnitConversionError: If ``df`` is not a supported dataframe, is
                empty, or lacks ``value_column`` / ``unit_column``.
        """
        # Check if dataframe is supported
        if not self._is_dataframe(df):
            raise UnitConversionError(
                f"Input must be a DataFrame (pandas, polars, or cudf). Got: {type(df)}"
            )

        # Check if dataframe is empty
        if len(df) == 0:
            raise UnitConversionError("Input dataframe is empty")

        # Check if columns exist
        df_columns = list(df.columns)

        if value_column not in df_columns:
            raise UnitConversionError(
                f"Value column '{value_column}' not found. Available columns: {df_columns}"
            )

        if unit_column not in df_columns:
            raise UnitConversionError(
                f"Unit column '{unit_column}' not found. Available columns: {df_columns}"
            )

        # Name conflicts cannot be checked here because the target unit may be
        # auto-detected later; they are resolved in _harmonize_units_polars.
        sanitized_value_col = sanitize_column_name(value_column)
        sanitized_unit_col = sanitize_column_name(unit_column)

        log_info(f"[units] Sanitized column names: {value_column} → {sanitized_value_col}, {unit_column} → {sanitized_unit_col}")

    def _is_dataframe(self, obj: Any) -> bool:
        """
        Check if object is a supported dataframe type

        pandas is checked by type; polars and cudf are recognized by their
        module name appearing in ``str(type(obj))``, so neither has to be
        imported for the check.
        """
        if isinstance(obj, pd.DataFrame):
            return True
        type_name = str(type(obj))
        return 'polars' in type_name or 'cudf' in type_name

    def _apply_positioning(self, df: Any, new_columns: List[str], position: str,
                           backend_type: str) -> Any:
        """
        Apply column positioning using existing positioning system

        Best-effort: on any failure a warning is logged and ``df`` is returned
        with its columns where they are.
        """
        try:
            from ..core.column_positioning import position_columns
            return position_columns(df, new_columns, position)
        except Exception as e:
            log_warning(f"[units] Column positioning failed: {e}. Using default 'end'.")
            return df

    def _handle_column_conflicts(self, df: Any, harmonized_value_col: str,
                                 harmonized_unit_col: str, backend_type: str) -> Tuple[Any, str, str]:
        """
        Handle column name conflicts by generating safe alternatives

        Args:
            df: Input dataframe
            harmonized_value_col: Desired harmonized value column name
            harmonized_unit_col: Desired harmonized unit column name
            backend_type: Backend type (column access is the same for all
                supported backends, so it is not consulted)

        Returns:
            Tuple of (dataframe, final_value_col_name, final_unit_col_name).
            The two names differ from every existing column and from each other.
        """
        existing_columns = list(df.columns)

        # Generate safe column names; the unit name must also avoid the value
        # name that was just chosen
        final_value_col = generate_safe_column_name(harmonized_value_col, existing_columns)
        final_unit_col = generate_safe_column_name(
            harmonized_unit_col, existing_columns + [final_value_col]
        )

        # Log if conflicts were resolved
        if final_value_col != harmonized_value_col:
            log_warning(f"[units] Column conflict resolved: {harmonized_value_col} → {final_value_col}")
        if final_unit_col != harmonized_unit_col:
            log_warning(f"[units] Column conflict resolved: {harmonized_unit_col} → {final_unit_col}")

        return df, final_value_col, final_unit_col

    def get_supported_units(self, category: Optional[str] = None) -> Dict[str, List[str]]:
        """
        Get list of supported units

        Args:
            category: Specific category to get units for (optional)

        Returns:
            Dictionary of category -> list of units

        Raises:
            UnitConversionError: If ``category`` is given but unknown.
        """
        if category:
            if category not in UNIT_CONVERSIONS:
                raise UnitConversionError(f"Unknown category: {category}")
            return {category: list(UNIT_CONVERSIONS[category]["units"].keys())}

        return {
            cat: list(config["units"].keys())
            for cat, config in UNIT_CONVERSIONS.items()
        }

    def get_conversion_stats(self) -> Dict[str, Any]:
        """
        Get conversion statistics

        Returns:
            A copy of the counters with the two sets converted to lists and a
            ``success_rate`` percentage added (0.0 when nothing was converted).
        """
        stats = self.conversion_stats.copy()
        stats["categories_detected"] = list(stats["categories_detected"])
        stats["units_processed"] = list(stats["units_processed"])

        total = stats["total_conversions"]
        if total > 0:
            stats["success_rate"] = (stats["successful_conversions"] / total) * 100
        else:
            stats["success_rate"] = 0.0

        return stats

    def reset_stats(self):
        """Reset conversion statistics"""
        self.conversion_stats = self._fresh_stats()
        log_info("[units] Conversion statistics reset")
707
-
708
-
709
# Module-level converter shared by the convenience functions; created on first use
_unit_converter = None


def get_unit_converter() -> UnitConverter:
    """Return the shared UnitConverter, constructing it on the first call."""
    global _unit_converter
    if _unit_converter is not None:
        return _unit_converter

    _unit_converter = UnitConverter()
    return _unit_converter
718
-
719
-
720
def harmonize_units(df: Any, value_column: str, unit_column: str,
                    target_unit: Optional[str] = None,
                    position: str = "end") -> Any:
    """
    Module-level shortcut that harmonizes units via the shared converter.

    Args:
        df: Input dataframe (pandas, polars, cudf)
        value_column: Column containing numeric values
        unit_column: Column containing unit strings
        target_unit: Target unit (auto-detected if None)
        position: Where to place new columns

    Returns:
        Dataframe with harmonized columns added
    """
    return get_unit_converter().harmonize_units(
        df, value_column, unit_column, target_unit, position
    )
738
-
739
-
740
def get_supported_units(category: Optional[str] = None) -> Dict[str, List[str]]:
    """Return the supported units per category, as reported by the shared converter."""
    return get_unit_converter().get_supported_units(category)
744
-
745
-
746
def get_conversion_stats() -> Dict[str, Any]:
    """Return the conversion statistics collected by the shared converter."""
    return get_unit_converter().get_conversion_stats()