additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/utilities/units.py
DELETED
|
@@ -1,749 +0,0 @@
|
|
|
1
|
-
# additory/utilities/units.py
|
|
2
|
-
# Unit conversion system with hardcoded conversion factors
|
|
3
|
-
|
|
4
|
-
import polars as pl
|
|
5
|
-
import pandas as pd
|
|
6
|
-
import re
|
|
7
|
-
from typing import Any, Dict, List, Optional, Union, Tuple
|
|
8
|
-
from datetime import datetime
|
|
9
|
-
|
|
10
|
-
from ..core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
|
|
11
|
-
from ..core.logging import log_info, log_warning
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
# Hardcoded unit conversion definitions
|
|
15
|
-
# All conversions relative to base unit (factor = 1.0)
|
|
16
|
-
# Conversion table keyed by category. Every category has:
#   "base_unit": the unit whose factor is 1.0 (used when no target is given)
#   "units":     unit spelling -> factor to the base unit (temperature instead
#                stores {"factor", "offset"} for base = (value + offset) * factor)
# Unit strings coming from data are lowercased before lookup, so the
# mixed-case keys ("C", "L", "mL", ...) are only reachable through
# "base_unit"; each has a lowercase twin for data lookups.
# Spellings missing from a category are NOT rejected by the standard
# converter (it falls back to factor 1.0), so singular/plural and US/UK
# variants are listed explicitly.
UNIT_CONVERSIONS = {
    # LENGTH - base unit: meter (m)
    "length": {
        "base_unit": "m",
        "units": {
            "m": 1.0,                 # meter (base)
            "meter": 1.0,
            "meters": 1.0,
            "metre": 1.0,
            "metres": 1.0,
            "cm": 0.01,               # centimeter
            "centimeter": 0.01,
            "centimeters": 0.01,
            "centimetre": 0.01,
            "centimetres": 0.01,
            "mm": 0.001,              # millimeter
            "millimeter": 0.001,
            "millimeters": 0.001,
            "millimetre": 0.001,
            "millimetres": 0.001,
            "km": 1000.0,             # kilometer
            "kilometer": 1000.0,
            "kilometers": 1000.0,
            "kilometre": 1000.0,
            "kilometres": 1000.0,
            "in": 0.0254,             # inch
            "inch": 0.0254,
            "inches": 0.0254,
            "ft": 0.3048,             # foot
            "foot": 0.3048,
            "feet": 0.3048,
            "yd": 0.9144,             # yard
            "yard": 0.9144,
            "yards": 0.9144,
            "mi": 1609.344,           # mile
            "mile": 1609.344,
            "miles": 1609.344,
        },
    },

    # WEIGHT/MASS - base unit: kilogram (kg)
    "weight": {
        "base_unit": "kg",
        "units": {
            "kg": 1.0,                # kilogram (base)
            "kilogram": 1.0,
            "kilograms": 1.0,
            "g": 0.001,               # gram
            "gram": 0.001,
            "grams": 0.001,
            "mg": 0.000001,           # milligram
            "milligram": 0.000001,
            "milligrams": 0.000001,
            "lb": 0.453592,           # pound
            "lbs": 0.453592,
            "pound": 0.453592,
            "pounds": 0.453592,
            "oz": 0.0283495,          # ounce
            "ounce": 0.0283495,
            "ounces": 0.0283495,
            "ton": 1000.0,            # metric ton
            "tonne": 1000.0,
            "tonnes": 1000.0,
            "stone": 6.35029,         # stone (14 pounds)
            "stones": 6.35029,
        },
    },

    # TEMPERATURE - base unit: Celsius (°C)
    # Offset-based, so each entry is applied as (value + offset) * factor
    # to reach Celsius instead of a plain multiplication.
    "temperature": {
        "base_unit": "C",
        "units": {
            "C": {"factor": 1.0, "offset": 0.0},            # Celsius (base)
            "c": {"factor": 1.0, "offset": 0.0},
            "celsius": {"factor": 1.0, "offset": 0.0},
            "F": {"factor": 5/9, "offset": -32},            # (F - 32) * 5/9
            "f": {"factor": 5/9, "offset": -32},
            "fahrenheit": {"factor": 5/9, "offset": -32},
            "K": {"factor": 1.0, "offset": -273.15},        # K - 273.15
            "k": {"factor": 1.0, "offset": -273.15},
            "kelvin": {"factor": 1.0, "offset": -273.15},
        },
    },

    # VOLUME - base unit: liter (L)
    "volume": {
        "base_unit": "L",
        "units": {
            "L": 1.0,                 # liter (base)
            "l": 1.0,
            "liter": 1.0,
            "liters": 1.0,
            "litre": 1.0,
            "litres": 1.0,
            "mL": 0.001,              # milliliter
            "ml": 0.001,
            "milliliter": 0.001,
            "milliliters": 0.001,
            "millilitre": 0.001,
            "millilitres": 0.001,
            "gal": 3.78541,           # US gallon
            "gallon": 3.78541,
            "gallons": 3.78541,
            "qt": 0.946353,           # US quart
            "quart": 0.946353,
            "quarts": 0.946353,
            "pt": 0.473176,           # US pint
            "pint": 0.473176,
            "pints": 0.473176,
            "cup": 0.236588,          # US cup
            "cups": 0.236588,
            "fl_oz": 0.0295735,       # US fluid ounce
            "floz": 0.0295735,
            "fluid_ounce": 0.0295735,
            "fluid_ounces": 0.0295735,
        },
    },

    # TIME - base unit: second (s)
    "time": {
        "base_unit": "s",
        "units": {
            "s": 1.0,                 # second (base)
            "sec": 1.0,
            "second": 1.0,
            "seconds": 1.0,
            "min": 60.0,              # minute
            "minute": 60.0,
            "minutes": 60.0,
            "hr": 3600.0,             # hour
            "hour": 3600.0,
            "hours": 3600.0,
            "day": 86400.0,           # day
            "days": 86400.0,
            "week": 604800.0,         # week
            "weeks": 604800.0,
            "month": 2629746.0,       # average month: year / 12 = 30.436875 days
            "months": 2629746.0,
            "year": 31556952.0,       # average year: 365.2425 days
            "years": 31556952.0,
        },
    },

    # AREA - base unit: square meter (m²)
    "area": {
        "base_unit": "m2",
        "units": {
            "m2": 1.0,                # square meter (base)
            "m²": 1.0,
            "sq_m": 1.0,
            "square_meter": 1.0,
            "square_meters": 1.0,
            "square_metre": 1.0,
            "square_metres": 1.0,
            "cm2": 0.0001,            # square centimeter
            "cm²": 0.0001,
            "sq_cm": 0.0001,
            "km2": 1000000.0,         # square kilometer
            "km²": 1000000.0,
            "sq_km": 1000000.0,
            "in2": 0.00064516,        # square inch
            "in²": 0.00064516,
            "sq_in": 0.00064516,
            "ft2": 0.092903,          # square foot
            "ft²": 0.092903,
            "sq_ft": 0.092903,
            "acre": 4046.86,          # acre
            "acres": 4046.86,
        },
    },
}
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
class UnitConversionError(Exception):
    """Error type signalling that a unit conversion operation failed."""
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
def sanitize_column_name(col_name: str) -> str:
    """
    Normalize a column label into a lowercase, identifier-style name.

    Steps, in order:
    - every character outside ``[a-zA-Z0-9_]`` becomes an underscore
    - runs of underscores collapse to one
    - leading/trailing underscores are dropped
    - the result is lowercased
    - a ``col_`` prefix is added when the first character is a digit

    Args:
        col_name: Original column name. Falsy input (None, "") is accepted.

    Returns:
        The sanitized name, or "unnamed_column" when the input is falsy or
        nothing survives sanitization.

    Examples:
        "height collected on site" → "height_collected_on_site"
        "Patient Height - Site A" → "patient_height_site_a"
        "Weight (kg)" → "weight_kg"
        "temp@location#1" → "temp_location_1"
    """
    fallback = "unnamed_column"
    if not col_name:
        return fallback

    # Substitute first, then squeeze: the second pass also merges underscores
    # that were already present in the input with the freshly inserted ones.
    text = re.sub(r'_+', '_', re.sub(r'[^a-zA-Z0-9_]', '_', str(col_name)))
    text = text.strip('_').lower()

    if not text:
        return fallback

    # Identifiers cannot begin with a digit.
    return f"col_{text}" if text[0].isdigit() else text
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
def generate_safe_column_name(base_name: str, existing_columns: List[str]) -> str:
    """
    Pick a column name that is not already present in ``existing_columns``.

    Args:
        base_name: Preferred column name.
        existing_columns: Names that are already taken.

    Returns:
        ``base_name`` unchanged when it is free; otherwise ``base_name`` with
        the lowest free numeric suffix (``_1``, ``_2``, ...).
    """
    candidate = base_name
    suffix = 0
    # Probe base_name first, then base_name_1, base_name_2, ... until one is free.
    while candidate in existing_columns:
        suffix += 1
        candidate = f"{base_name}_{suffix}"
    return candidate
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
class UnitConverter:
|
|
255
|
-
"""Unit conversion system with Polars processing"""
|
|
256
|
-
|
|
257
|
-
def __init__(self):
    """
    Set up the Arrow bridge and the per-instance conversion counters.

    If the bridge cannot be constructed (``ArrowBridgeError``) the converter
    is still created with ``arrow_bridge`` set to ``None``; the failure is
    logged as a warning instead of being dropped silently.
    """
    try:
        self.arrow_bridge = EnhancedArrowBridge()
    except ArrowBridgeError as e:
        # Best effort: keep the object constructible, but leave a trace so a
        # later failure on the missing bridge can be explained.
        log_warning(f"[units] Arrow bridge unavailable: {e}")
        self.arrow_bridge = None

    # Running totals; the two sets collect every category / unit seen so far.
    self.conversion_stats = {
        "total_conversions": 0,
        "successful_conversions": 0,
        "failed_conversions": 0,
        "categories_detected": set(),
        "units_processed": set(),
    }

    log_info("[units] Unit Converter initialized")
|
|
271
|
-
|
|
272
|
-
def harmonize_units(self, df: Any, value_column: str, unit_column: str,
                    target_unit: Optional[str] = None,
                    position: str = "end") -> Any:
    """
    Harmonize units in a dataframe.

    The input is validated, converted to Arrow and then Polars, converted
    to the target unit, converted back to the original backend, and finally
    the newly added columns are positioned.

    Args:
        df: Input dataframe (pandas, polars, cudf)
        value_column: Column containing numeric values
        unit_column: Column containing unit strings
        target_unit: Target unit; when None the detected category's base
            unit is used
        position: Placement of the new columns, forwarded to the column
            positioning helper (documented values: "end", "start", int,
            "after:col", "before:col")

    Returns:
        Dataframe of the same backend with the harmonized columns added

    Raises:
        UnitConversionError: On any failure, including validation errors.
            The original exception is attached as ``__cause__``.
    """
    start_time = datetime.now()

    try:
        self._validate_inputs(df, value_column, unit_column)

        # __init__ leaves the bridge as None when it could not be built;
        # fail with an explicit message rather than an AttributeError.
        if self.arrow_bridge is None:
            raise UnitConversionError(
                "Arrow bridge is not available; cannot convert the dataframe for processing"
            )

        backend_type = self.arrow_bridge.detect_backend(df)
        log_info(f"[units] Processing {backend_type} dataframe")

        # Round-trip through Arrow so the conversion itself runs in Polars
        # regardless of the caller's backend.
        arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
        polars_df = pl.from_arrow(arrow_table)

        result_polars = self._harmonize_units_polars(
            polars_df, value_column, unit_column, target_unit
        )

        result_arrow = result_polars.to_arrow()
        result_df = self.arrow_bridge.from_arrow(result_arrow, backend_type)

        # Whatever is present now but was not in the input is what the
        # conversion added; those are the columns to position.
        original_columns = set(df.columns)
        actual_new_columns = [col for col in result_df.columns if col not in original_columns]

        result_df = self._apply_positioning(
            result_df,
            actual_new_columns,
            position,
            backend_type
        )

        execution_time = (datetime.now() - start_time).total_seconds() * 1000
        self.conversion_stats["total_conversions"] += 1
        self.conversion_stats["successful_conversions"] += 1

        log_info(f"[units] Unit harmonization completed in {execution_time:.1f}ms")

        return result_df

    except Exception as e:
        self.conversion_stats["total_conversions"] += 1
        self.conversion_stats["failed_conversions"] += 1
        # Chain explicitly so the root cause stays visible in tracebacks.
        raise UnitConversionError(f"Unit harmonization failed: {e}") from e
|
|
355
|
-
|
|
356
|
-
def _harmonize_units_polars(self, df: pl.DataFrame, value_column: str,
                            unit_column: str, target_unit: Optional[str] = None) -> pl.DataFrame:
    """
    Run the unit conversion on a Polars DataFrame.

    Args:
        df: Polars DataFrame
        value_column: Column containing numeric values
        unit_column: Column containing unit strings
        target_unit: Target unit; None selects the detected category's
            base unit, otherwise the value is lowercased and must be a known
            unit of that category

    Returns:
        ``df`` plus two columns named ``<sanitized value column>_<target>``
        and ``<sanitized unit column>_<target>``

    Raises:
        UnitConversionError: When no category matches the units found, or
            the requested target unit is not part of the detected category.
    """
    # Distinct non-null units, trimmed; the original casing is kept for log
    # and error messages, a lowercased copy is used for table lookups.
    unique_units = [
        str(unit).strip()
        for unit in df[unit_column].unique().to_list()
        if unit is not None
    ]
    lookup_units = [unit.lower() for unit in unique_units]

    log_info(f"[units] Found units: {unique_units}")

    category = self._detect_unit_category(lookup_units)
    if not category:
        raise UnitConversionError(f"Could not detect unit category for units: {unique_units}")

    log_info(f"[units] Detected category: {category}")
    self.conversion_stats["categories_detected"].add(category)

    category_config = UNIT_CONVERSIONS[category]
    if target_unit is None:
        target_unit = category_config["base_unit"]
    else:
        requested = target_unit.lower()
        if requested not in category_config["units"]:
            available_units = list(category_config["units"].keys())
            raise UnitConversionError(
                f"Target unit '{target_unit}' not supported for category '{category}'. "
                f"Available units: {available_units}"
            )
        target_unit = requested

    log_info(f"[units] Target unit: {target_unit}")

    conversion_map = self._create_conversion_map(category, target_unit)

    # Output column names: sanitized source name suffixed with the target unit.
    harmonized_value_col = f"{sanitize_column_name(value_column)}_{target_unit}"
    harmonized_unit_col = f"{sanitize_column_name(unit_column)}_{target_unit}"

    log_info(f"[units] Creating harmonized columns: {harmonized_value_col}, {harmonized_unit_col}")

    # Temperature needs offset handling; everything else is a plain factor.
    if category == "temperature":
        convert = self._convert_temperature_polars
    else:
        convert = self._convert_standard_polars
    result_df = convert(df, value_column, unit_column,
                        conversion_map, target_unit,
                        harmonized_value_col, harmonized_unit_col)

    self.conversion_stats["units_processed"].update(lookup_units)

    return result_df
|
|
431
|
-
|
|
432
|
-
def _detect_unit_category(self, units: List[str]) -> Optional[str]:
    """
    Detect the category of units based on the units present.

    Each category in UNIT_CONVERSIONS is scored by how many entries of
    ``units`` appear among its unit keys (duplicates in ``units`` count
    each time). The highest-scoring category wins; on a tie the category
    defined first in UNIT_CONVERSIONS is returned.

    Args:
        units: List of unit strings (lowercase)

    Returns:
        Category name, or None when no unit matches any category
    """
    category_scores = {}

    for category, config in UNIT_CONVERSIONS.items():
        known_units = config["units"]
        score = sum(1 for unit in units if unit in known_units)
        # Only categories with at least one hit are candidates.
        if score > 0:
            category_scores[category] = score

    if not category_scores:
        return None

    # Every stored score is positive, so the best one needs no re-check.
    return max(category_scores, key=category_scores.get)
|
|
465
|
-
|
|
466
|
-
def _create_conversion_map(self, category: str, target_unit: str) -> Dict[str, float]:
    """
    Build the per-unit conversion table for a category and target unit.

    Args:
        category: Unit category (key of UNIT_CONVERSIONS)
        target_unit: Target unit; must be a key of the category's units

    Returns:
        For "temperature": unit -> its ``{"factor", "offset"}`` entry,
        unchanged. For every other category: unit -> multiplier that
        converts a value in that unit directly to ``target_unit``.

    Raises:
        KeyError: If ``category`` or ``target_unit`` is not in the table.
    """
    units_table = UNIT_CONVERSIONS[category]["units"]
    # Looked up before branching so an unknown target fails for every category.
    target_factor = units_table[target_unit]

    if category == "temperature":
        # Offsets cannot be folded into one multiplier; pass entries through.
        return dict(units_table)

    # unit -> base is `factor`; base -> target is `1 / target_factor`.
    return {
        unit: unit_factor / target_factor
        for unit, unit_factor in units_table.items()
    }
|
|
493
|
-
|
|
494
|
-
def _convert_standard_polars(self, df: pl.DataFrame, value_column: str,
                             unit_column: str, conversion_map: Dict[str, float],
                             target_unit: str, harmonized_value_col: str,
                             harmonized_unit_col: str) -> pl.DataFrame:
    """
    Apply factor-based unit conversion as Polars expressions.

    Adds ``harmonized_value_col`` (value multiplied by the factor selected
    from ``conversion_map`` via the lowercased unit) and
    ``harmonized_unit_col`` (the constant ``target_unit``).

    NOTE(review): rows whose unit is not a key of ``conversion_map`` get a
    factor of 1.0, i.e. the value is carried over unconverted but still
    labelled with ``target_unit``.
    """
    normalized_unit = pl.col(unit_column).str.to_lowercase()

    # One when/then branch per known unit, chained in map order.
    factor_expr = None
    for unit_name, unit_factor in conversion_map.items():
        start_branch = pl.when if factor_expr is None else factor_expr.when
        factor_expr = start_branch(normalized_unit == unit_name).then(pl.lit(unit_factor))

    # Unknown units fall back to a neutral factor (no conversion).
    factor_expr = factor_expr.otherwise(pl.lit(1.0))

    return df.with_columns([
        (pl.col(value_column) * factor_expr).alias(harmonized_value_col),
        pl.lit(target_unit).alias(harmonized_unit_col)
    ])
|
|
523
|
-
|
|
524
|
-
def _convert_temperature_polars(self, df: pl.DataFrame, value_column: str,
                                unit_column: str, conversion_map: Dict[str, Dict],
                                target_unit: str, harmonized_value_col: str,
                                harmonized_unit_col: str) -> pl.DataFrame:
    """
    Convert temperatures, which need an offset as well as a factor.

    The conversion is two-stage: every row is first brought to Celsius
    using its unit's ``{"factor", "offset"}`` entry, then the Celsius value
    is mapped to ``target_unit``. Adds ``harmonized_value_col`` and
    ``harmonized_unit_col`` (the constant ``target_unit``).

    NOTE(review): rows with a unit missing from ``conversion_map`` keep
    their raw value (treated as already Celsius), and a target other than
    Celsius/Fahrenheit/Kelvin leaves values in Celsius.
    """
    source_unit = pl.col(unit_column).str.to_lowercase()
    raw_value = pl.col(value_column)

    # Stage 1: source unit -> Celsius, one when/then branch per known unit.
    to_celsius = None
    for unit_name, unit_config in conversion_map.items():
        factor = unit_config["factor"]
        offset = unit_config["offset"]
        # Celsius = (value + offset) * factor
        as_celsius = (raw_value + pl.lit(offset)) * pl.lit(factor)
        start_branch = pl.when if to_celsius is None else to_celsius.when
        to_celsius = start_branch(source_unit == unit_name).then(as_celsius)

    to_celsius = to_celsius.otherwise(raw_value)

    # Stage 2: Celsius -> target unit.
    target = target_unit.lower()
    if target in ('f', 'fahrenheit'):
        # F = C * 9/5 + 32
        converted = to_celsius * pl.lit(9/5) + pl.lit(32)
    elif target in ('k', 'kelvin'):
        # K = C + 273.15
        converted = to_celsius + pl.lit(273.15)
    else:
        # Celsius target, or an unrecognized one: stay in Celsius.
        converted = to_celsius

    return df.with_columns([
        converted.alias(harmonized_value_col),
        pl.lit(target_unit).alias(harmonized_unit_col)
    ])
|
|
577
|
-
|
|
578
|
-
def _validate_inputs(self, df: Any, value_column: str, unit_column: str):
    """
    Check that ``df`` is a supported, non-empty dataframe holding both columns.

    Raises:
        UnitConversionError: If ``df`` is not a supported dataframe, has no
            rows, or lacks ``value_column`` / ``unit_column``.
    """
    if not self._is_dataframe(df):
        raise UnitConversionError(
            f"Input must be a DataFrame (pandas, polars, or cudf). Got: {type(df)}"
        )

    if not len(df):
        raise UnitConversionError("Input dataframe is empty")

    df_columns = list(df.columns)

    # The value column is checked first, so its error wins when both are missing.
    if value_column not in df_columns:
        raise UnitConversionError(
            f"Value column '{value_column}' not found. Available columns: {df_columns}"
        )
    if unit_column not in df_columns:
        raise UnitConversionError(
            f"Unit column '{unit_column}' not found. Available columns: {df_columns}"
        )

    # Output-name conflicts are not checked here: the target unit (part of
    # the output names) may only be known after auto-detection.
    sanitized_value_col = sanitize_column_name(value_column)
    sanitized_unit_col = sanitize_column_name(unit_column)

    log_info(f"[units] Sanitized column names: {value_column} → {sanitized_value_col}, {unit_column} → {sanitized_unit_col}")
|
|
611
|
-
|
|
612
|
-
def _is_dataframe(self, obj: Any) -> bool:
|
|
613
|
-
"""Check if object is a supported dataframe type"""
|
|
614
|
-
return (
|
|
615
|
-
isinstance(obj, pd.DataFrame) or
|
|
616
|
-
(hasattr(obj, '__class__') and 'polars' in str(type(obj))) or
|
|
617
|
-
(hasattr(obj, '__class__') and 'cudf' in str(type(obj)))
|
|
618
|
-
)
|
|
619
|
-
|
|
620
|
-
def _apply_positioning(self, df: Any, new_columns: List[str], position: str,
                       backend_type: str) -> Any:
    """
    Reorder the new columns through the shared column positioning helper

    This is best-effort: if importing or running ``position_columns`` raises,
    a warning is logged and the dataframe is returned as it was passed in.

    Args:
        df: Dataframe that already contains ``new_columns``
        new_columns: Names of the columns to place
        position: Placement spec, passed through to ``position_columns``
        backend_type: Not used by this method

    Returns:
        Result of ``position_columns``, or ``df`` unchanged on failure
    """
    try:
        # Imported lazily; an import failure is handled like any other error
        from ..core.column_positioning import position_columns
        repositioned = position_columns(df, new_columns, position)
    except Exception as e:
        log_warning(f"[units] Column positioning failed: {e}. Using default 'end'.")
        return df
    return repositioned
|
|
629
|
-
|
|
630
|
-
def _handle_column_conflicts(self, df: Any, harmonized_value_col: str,
                             harmonized_unit_col: str, backend_type: str) -> Tuple[Any, str, str]:
    """
    Pick output column names that do not clash with columns already in df

    Args:
        df: Input dataframe (returned untouched)
        harmonized_value_col: Preferred name for the harmonized value column
        harmonized_unit_col: Preferred name for the harmonized unit column
        backend_type: Backend label ("pandas", "polars", otherwise cudf)

    Returns:
        Tuple of (dataframe, final_value_col_name, final_unit_col_name)
    """
    # Every backend exposes its column names via ``df.columns``; the names
    # are materialized as a plain list regardless of backend_type.
    existing_columns = list(df.columns)

    # Each preferred name is resolved against the existing columns
    final_value_col = generate_safe_column_name(harmonized_value_col, existing_columns)
    final_unit_col = generate_safe_column_name(harmonized_unit_col, existing_columns)

    # Warn whenever a preferred name had to be replaced
    value_renamed = final_value_col != harmonized_value_col
    unit_renamed = final_unit_col != harmonized_unit_col
    if value_renamed:
        log_warning(f"[units] Column conflict resolved: {harmonized_value_col} → {final_value_col}")
    if unit_renamed:
        log_warning(f"[units] Column conflict resolved: {harmonized_unit_col} → {final_unit_col}")

    return df, final_value_col, final_unit_col
|
|
663
|
-
|
|
664
|
-
def get_supported_units(self, category: Optional[str] = None) -> Dict[str, List[str]]:
    """
    List the unit names known to the conversion table

    Args:
        category: Restrict the result to this category; a falsy value
            (the default) returns every category

    Returns:
        Dictionary of category -> list of units

    Raises:
        UnitConversionError: If ``category`` is given but not in the table
    """
    # No category requested: report the whole table
    if not category:
        every_category = {}
        for cat, config in UNIT_CONVERSIONS.items():
            every_category[cat] = list(config["units"].keys())
        return every_category

    if category not in UNIT_CONVERSIONS:
        raise UnitConversionError(f"Unknown category: {category}")

    unit_names = list(UNIT_CONVERSIONS[category]["units"].keys())
    return {category: unit_names}
|
|
683
|
-
|
|
684
|
-
def get_conversion_stats(self) -> Dict[str, Any]:
    """
    Return a snapshot of the conversion counters

    The snapshot is a shallow copy of ``self.conversion_stats`` in which the
    ``categories_detected`` and ``units_processed`` sets are replaced by
    lists, plus a ``success_rate`` percentage (0.0 when nothing has been
    converted yet). The stored statistics are not modified.
    """
    snapshot = self.conversion_stats.copy()

    # Replace the sets with lists in the copy only
    for set_key in ("categories_detected", "units_processed"):
        snapshot[set_key] = list(snapshot[set_key])

    attempted = snapshot["total_conversions"]
    if attempted > 0:
        rate = (snapshot["successful_conversions"] / attempted) * 100
    else:
        rate = 0.0
    snapshot["success_rate"] = rate

    return snapshot
|
|
696
|
-
|
|
697
|
-
def reset_stats(self):
    """
    Replace the conversion statistics with a fresh, zeroed record

    All counters restart at 0 and both tracking sets start empty; an info
    message is logged afterwards.
    """
    zeroed = dict(
        total_conversions=0,
        successful_conversions=0,
        failed_conversions=0,
        categories_detected=set(),
        units_processed=set(),
    )
    self.conversion_stats = zeroed
    log_info("[units] Conversion statistics reset")
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
# Module-wide converter, created on first request by get_unit_converter()
_unit_converter = None


def get_unit_converter() -> UnitConverter:
    """
    Return the shared UnitConverter, creating it on the first call

    Later calls return the same instance that the first call created.
    """
    global _unit_converter
    if _unit_converter is not None:
        return _unit_converter
    _unit_converter = UnitConverter()
    return _unit_converter
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
def harmonize_units(df: Any, value_column: str, unit_column: str,
                    target_unit: Optional[str] = None,
                    position: str = "end") -> Any:
    """
    Module-level shortcut for ``UnitConverter.harmonize_units``

    All arguments are forwarded unchanged to the shared converter returned
    by ``get_unit_converter()``.

    Args:
        df: Input dataframe (pandas, polars, cudf)
        value_column: Column containing numeric values
        unit_column: Column containing unit strings
        target_unit: Target unit (auto-detected if None)
        position: Where to place new columns

    Returns:
        Dataframe with harmonized columns added
    """
    return get_unit_converter().harmonize_units(
        df, value_column, unit_column, target_unit, position
    )
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
def get_supported_units(category: Optional[str] = None) -> Dict[str, List[str]]:
    """
    Module-level shortcut for ``UnitConverter.get_supported_units``

    Forwards ``category`` to the shared converter and returns its result.
    """
    return get_unit_converter().get_supported_units(category)
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
def get_conversion_stats() -> Dict[str, Any]:
    """
    Module-level shortcut for ``UnitConverter.get_conversion_stats``

    Returns whatever the shared converter reports for its statistics.
    """
    return get_unit_converter().get_conversion_stats()
|