additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lookup and add columns from reference DataFrame.
|
|
3
|
+
|
|
4
|
+
This module provides lookup functionality for the to function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import Union, List, Optional, Dict
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.common.column_selector import select_columns
|
|
12
|
+
from additory.core.logging import Logger
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def perform_lookup(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    bring: Union[str, List[str]],
    against: Union[str, List[str]],
    bring_at: Optional[str] = None,
    strategy: Optional[Dict] = None
) -> pl.DataFrame:
    """
    Look up rows in a reference DataFrame and attach the requested columns.

    Orchestrates the full pipeline: validation, cardinality-based mode
    detection, the left join itself, optional per-column strategy handling
    (e.g. null filling), and optional column repositioning.

    Args:
        df: Input DataFrame.
        from_df: Reference DataFrame to look values up in.
        bring: Column(s) to copy over from the reference.
        against: Key column(s) used to match rows.
        bring_at: Optional placement of the new columns
            ('start', 'end', 'after:col', 'before:col').
        strategy: Optional per-column strategy dictionary (see apply_strategy).

    Returns:
        DataFrame with the brought columns attached.

    Example:
        >>> result = perform_lookup(
        ...     df=orders,
        ...     from_df=products,
        ...     bring='price',
        ...     against='product_id'
        ... )
    """
    logger = Logger()

    # Fail fast on missing columns / empty frames before any work is done.
    validate_lookup_parameters(df, from_df, bring, against)

    # Accept either a single column name or a list for both parameters.
    bring_list = [bring] if isinstance(bring, str) else bring
    against_list = [against] if isinstance(against, str) else against

    logger.info(f"Lookup: bringing {bring_list} against {against_list}")

    # Cardinality decides whether we can fetch directly or must deduplicate.
    mode = detect_lookup_mode(df, from_df, against_list)
    logger.info(f"Detected lookup mode: {mode}")

    result = perform_join(df, from_df, bring_list, against_list, mode)

    # Post-processing steps are both optional and independent of each other.
    if strategy:
        result = apply_strategy(result, bring_list, strategy)
    if bring_at:
        result = position_columns(result, bring_list, bring_at)

    logger.info(f"Lookup complete: {len(result)} rows, {len(result.columns)} columns")

    return result
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def validate_lookup_parameters(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    bring: Union[str, List[str]],
    against: Union[str, List[str]]
) -> None:
    """
    Validate the inputs to perform_lookup.

    Checks that both DataFrames are valid and non-empty, that every 'bring'
    column exists in the reference frame, and that every key column exists
    in both frames.

    Args:
        df: Input DataFrame.
        from_df: Reference DataFrame.
        bring: Column(s) to bring from the reference.
        against: Key column(s) used for matching.

    Raises:
        ValueError: If any required column is missing.
    """
    # Both frames must be valid, non-empty DataFrames (input first, then reference).
    for frame in (df, from_df):
        validate_dataframe(frame)
        validate_not_empty(frame)

    # Accept scalar or list for both parameters.
    bring_list = [bring] if isinstance(bring, str) else bring
    against_list = [against] if isinstance(against, str) else against

    # Every brought column must exist in the reference frame.
    missing_bring = [c for c in bring_list if c not in from_df.columns]
    if missing_bring:
        raise ValueError(
            f"Bring columns not found in reference DataFrame: {missing_bring}. "
            f"Available columns: {from_df.columns}"
        )

    # Key columns must exist on the input side...
    missing_in_df = [c for c in against_list if c not in df.columns]
    if missing_in_df:
        raise ValueError(
            f"Key columns not found in input DataFrame: {missing_in_df}. "
            f"Available columns: {df.columns}"
        )

    # ...and on the reference side.
    missing_in_from = [c for c in against_list if c not in from_df.columns]
    if missing_in_from:
        raise ValueError(
            f"Key columns not found in reference DataFrame: {missing_in_from}. "
            f"Available columns: {from_df.columns}"
        )
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def detect_lookup_mode(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    against: List[str]
) -> str:
    """
    Choose a lookup mode from the key cardinality between the two frames.

    Args:
        df: Input DataFrame.
        from_df: Reference DataFrame.
        against: Key columns.

    Returns:
        'fetch_if_unique' when each input key matches at most one reference
        row, or 'first' when a key can match several reference rows (the
        first match is taken, with a warning).

    Raises:
        ValueError: For many-to-many relationships, which cannot be resolved
            by a lookup.
    """
    logger = Logger()

    cardinality = check_cardinality(df, from_df, against)
    relation = cardinality['type']

    # Unique reference keys -> every input row resolves to at most one match.
    if relation in ('one_to_one', 'many_to_one'):
        return 'fetch_if_unique'

    if relation == 'one_to_many':
        # Duplicated reference keys: fall back to taking the first match.
        logger.warning(
            f"One-to-many relationship detected. "
            f"Using 'first' mode (taking first match). "
            f"Max matches per key: {cardinality['max_matches']}"
        )
        return 'first'

    # Remaining case: many_to_many, which a lookup cannot disambiguate.
    raise ValueError(
        "Many-to-many relationship detected. "
        "Cannot perform lookup with many-to-many relationships. "
        "Consider using a different key or aggregating the reference data first."
    )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def check_cardinality(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    against: List[str]
) -> Dict:
    """
    Classify the key relationship between the input and reference frames.

    Args:
        df: Input DataFrame.
        from_df: Reference DataFrame.
        against: Key columns.

    Returns:
        Dict with keys 'type' ('one_to_one' | 'many_to_one' | 'one_to_many'
        | 'many_to_many'), 'left_unique', 'right_unique', and 'max_matches'
        (maximum number of reference rows sharing one key).
    """
    # A side is "unique" when the key columns contain no duplicate rows.
    left_unique = df.select(against).n_unique() == len(df)
    right_unique = from_df.select(against).n_unique() == len(from_df)

    # With unique reference keys each key matches exactly once; otherwise
    # count the largest group of duplicate keys in the reference frame.
    if right_unique:
        max_matches = 1
    else:
        counts = from_df.group_by(against).agg(pl.len().alias('count'))
        max_matches = counts['count'].max()

    # Map the (left, right) uniqueness pair to a relationship label.
    if left_unique:
        rel_type = 'one_to_one' if right_unique else 'one_to_many'
    else:
        rel_type = 'many_to_one' if right_unique else 'many_to_many'

    return {
        'type': rel_type,
        'left_unique': left_unique,
        'right_unique': right_unique,
        'max_matches': max_matches
    }
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
def perform_join(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    bring: List[str],
    against: List[str],
    mode: str
) -> pl.DataFrame:
    """
    Perform the actual left join against the reference DataFrame.

    Args:
        df: Input DataFrame.
        from_df: Reference DataFrame.
        bring: Columns to bring over from the reference.
        against: Key columns to join on.
        mode: Lookup mode; 'first' deduplicates the reference on the keys,
            keeping the first occurrence.

    Returns:
        DataFrame with the brought columns appended (left join, so input
        rows without a match get nulls).
    """
    # Select only the needed columns from the reference. Use dict.fromkeys
    # for deduplication instead of set(): it preserves insertion order
    # (keys first, then brought columns), so the output column order is
    # deterministic across runs — set() iteration order is not.
    ref_cols = list(dict.fromkeys(against + bring))
    ref_df = from_df.select(ref_cols)

    # For 'first' mode, collapse duplicate keys to their first occurrence
    # so the join cannot multiply input rows.
    if mode == 'first':
        ref_df = ref_df.unique(subset=against, keep='first')

    # Left join keeps every input row, filling non-matches with null.
    result = df.join(ref_df, on=against, how='left')

    return result
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def apply_strategy(
    df: pl.DataFrame,
    bring: List[str],
    strategy: Dict
) -> pl.DataFrame:
    """
    Apply per-column strategies to the lookup result.

    Currently supports 'fill_null': replaces nulls in a brought column with
    the given value. Columns without an entry in the strategy dict (or with
    a non-dict entry) are left untouched.

    Args:
        df: DataFrame produced by the join.
        bring: Columns that were brought over.
        strategy: Mapping of column name -> strategy dict.

    Returns:
        DataFrame with strategies applied.
    """
    result = df

    for column in bring:
        # Missing entries yield None, which fails the isinstance check below.
        column_strategy = strategy.get(column)
        if isinstance(column_strategy, dict) and 'fill_null' in column_strategy:
            result = result.with_columns(
                pl.col(column).fill_null(column_strategy['fill_null'])
            )

    return result
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def position_columns(
    df: pl.DataFrame,
    bring: List[str],
    bring_at: str
) -> pl.DataFrame:
    """
    Reposition the brought columns within the DataFrame.

    Args:
        df: DataFrame containing the brought columns.
        bring: Columns to reposition.
        bring_at: Placement specification:
            - 'start' — at the beginning
            - 'end' — at the end (no-op; the join already puts them there)
            - 'after:column_name' — directly after the named column
            - 'before:column_name' — directly before the named column

    Returns:
        DataFrame with columns reordered.

    Raises:
        ValueError: If the specification is invalid or the anchor column
            does not exist.
    """
    if bring_at == 'start':
        # Brought columns first, then everything else in original order.
        remaining = [c for c in df.columns if c not in bring]
        return df.select(bring + remaining)

    if bring_at == 'end':
        # Default join behavior already appends the columns at the end.
        return df

    # 'after:' and 'before:' share the anchor lookup and the reorder walk;
    # they differ only in whether the group is inserted after or before it.
    for prefix in ('after:', 'before:'):
        if not bring_at.startswith(prefix):
            continue

        target_col = bring_at.split(':', 1)[1]
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found in DataFrame")

        new_order = []
        for column in df.columns:
            if column in bring:
                continue  # brought columns are re-inserted at the anchor
            if prefix == 'before:' and column == target_col:
                new_order.extend(bring)
            new_order.append(column)
            if prefix == 'after:' and column == target_col:
                new_order.extend(bring)
        return df.select(new_order)

    raise ValueError(
        f"Invalid bring_at specification: {bring_at}. "
        f"Valid options: 'start', 'end', 'after:column', 'before:column'"
    )
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Merge multiple DataFrames.
|
|
3
|
+
|
|
4
|
+
This module provides merging functionality for the to function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import List, Union, Optional
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.core.logging import Logger
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def perform_merge(
    dfs: List[pl.DataFrame],
    how: str = 'vertical',
    on: Optional[Union[str, List[str]]] = None,
    **kwargs
) -> pl.DataFrame:
    """
    Merge multiple DataFrames.

    Args:
        dfs: List of DataFrames to merge.
        how: Merge type — 'vertical' (stack/concat), 'horizontal' (sequential
            left joins on a key), or 'diagonal' (concat filling missing
            columns with nulls).
        on: Key column(s); required for 'horizontal'.
        **kwargs: Reserved for future use.

    Returns:
        The merged DataFrame.

    Example:
        >>> result = perform_merge([df1, df2, df3], how='vertical')
        >>> result = perform_merge([df1, df2], how='horizontal', on='id')
    """
    logger = Logger()

    # Validation covers list-ness, count, per-frame checks, and key presence.
    validate_merge_parameters(dfs, how, on)

    logger.info(f"Merging {len(dfs)} DataFrames using '{how}' method")

    # Dispatch table keeps the merge variants in one place; the fallback
    # branch is defensive (validation has already rejected unknown values).
    dispatch = {
        'vertical': lambda: merge_vertical(dfs),
        'horizontal': lambda: merge_horizontal(dfs, on),
        'diagonal': lambda: merge_diagonal(dfs),
    }
    if how not in dispatch:
        raise ValueError(f"Invalid merge type: {how}")
    result = dispatch[how]()

    logger.info(f"Merge complete: {len(result)} rows, {len(result.columns)} columns")

    return result
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def validate_merge_parameters(
    dfs: List[pl.DataFrame],
    how: str,
    on: Optional[Union[str, List[str]]] = None
) -> None:
    """
    Validate the inputs to perform_merge.

    Args:
        dfs: List of DataFrames to merge.
        how: Merge type ('vertical', 'horizontal', 'diagonal').
        on: Key column(s); required for horizontal merges.

    Raises:
        TypeError: If dfs is not a list.
        ValueError: If fewer than 2 DataFrames are given, the merge type is
            unknown, 'on' is missing for a horizontal merge, or a key column
            is absent from any DataFrame.
    """
    if not isinstance(dfs, list):
        raise TypeError("dfs must be a list of DataFrames")

    # Merging needs at least two frames to combine.
    if len(dfs) < 2:
        raise ValueError("Must provide at least 2 DataFrames to merge")

    # Each frame must be a valid, non-empty DataFrame.
    # (No index needed here — the project validators raise their own errors.)
    for df in dfs:
        validate_dataframe(df)
        validate_not_empty(df)

    valid_how = ['vertical', 'horizontal', 'diagonal']
    if how not in valid_how:
        raise ValueError(
            f"Invalid merge type: {how}. "
            f"Valid types: {valid_how}"
        )

    # Horizontal merges join on a key, so 'on' must be supplied and every
    # key column must exist in every frame. Single check instead of two
    # separate `how == 'horizontal'` tests.
    if how == 'horizontal':
        if on is None:
            raise ValueError("Parameter 'on' is required for horizontal merge")
        on_list = [on] if isinstance(on, str) else on
        for i, df in enumerate(dfs):
            missing_cols = [col for col in on_list if col not in df.columns]
            if missing_cols:
                raise ValueError(
                    f"Key columns not found in DataFrame {i}: {missing_cols}. "
                    f"Available columns: {df.columns}"
                )
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def merge_vertical(dfs: List[pl.DataFrame]) -> pl.DataFrame:
    """
    Stack DataFrames vertically (union of rows).

    Args:
        dfs: DataFrames to stack; all must share the same column set.

    Returns:
        The vertically concatenated DataFrame.

    Raises:
        ValueError: If any DataFrame's columns differ from the first's.
    """
    # Vertical concat requires identical column sets; compare each frame
    # against the first and point the caller at 'diagonal' otherwise.
    expected_columns = set(dfs[0].columns)
    for i, frame in enumerate(dfs[1:], 1):
        if set(frame.columns) != expected_columns:
            raise ValueError(
                f"DataFrame {i} has different columns than DataFrame 0. "
                f"For vertical merge, all DataFrames must have the same columns. "
                f"Use 'diagonal' merge to handle different columns."
            )

    return pl.concat(dfs, how='vertical')
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def merge_horizontal(
    dfs: List[pl.DataFrame],
    on: Union[str, List[str]]
) -> pl.DataFrame:
    """
    Join DataFrames horizontally via sequential left joins.

    Args:
        dfs: DataFrames to join; the first is the base, each subsequent one
            is left-joined onto the accumulated result.
        on: Key column(s) for joining.

    Returns:
        The horizontally joined DataFrame.

    Note:
        Rows of later frames that do not match the base on the key are
        dropped (left-join semantics).
    """
    result = dfs[0]

    # Fold the remaining frames in one by one. The previous enumerate()
    # index was unused, so iterate the slice directly.
    for other in dfs[1:]:
        result = result.join(other, on=on, how='left')

    return result
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def merge_diagonal(dfs: List[pl.DataFrame]) -> pl.DataFrame:
    """
    Concatenate DataFrames diagonally.

    Args:
        dfs: DataFrames to concatenate; column sets may differ.

    Returns:
        The concatenated DataFrame, with columns absent from a given frame
        filled with nulls (handled by polars' diagonal concat).
    """
    return pl.concat(dfs, how='diagonal')
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Sort DataFrame.
|
|
3
|
+
|
|
4
|
+
This module provides sorting functionality for the to function.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import polars as pl
|
|
8
|
+
from typing import Union, List
|
|
9
|
+
|
|
10
|
+
from additory.common.validation import validate_dataframe, validate_not_empty
|
|
11
|
+
from additory.core.logging import Logger
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def perform_sort(
    df: pl.DataFrame,
    by: Union[str, List[str]],
    descending: Union[bool, List[bool]] = False,
    nulls_last: bool = True
) -> pl.DataFrame:
    """
    Sort a DataFrame by one or more columns.

    Args:
        df: Input DataFrame.
        by: Column(s) to sort by.
        descending: Single flag applied to every column, or one flag per
            sort column.
        nulls_last: Whether nulls sort to the end.

    Returns:
        The sorted DataFrame.

    Raises:
        ValueError: If a sort column is missing, or the length of a
            per-column descending list does not match the number of columns.

    Example:
        >>> result = perform_sort(df, by='date', descending=True)
        >>> result = perform_sort(df, by=['category', 'price'],
        ...                       descending=[False, True])
    """
    logger = Logger()

    validate_sort_parameters(df, by)

    # Accept a single column name or a list.
    by_list = [by] if isinstance(by, str) else by

    # A scalar flag fans out to every column; a list must line up 1:1.
    if isinstance(descending, bool):
        descending_list = [descending] * len(by_list)
    else:
        descending_list = descending
        if len(descending_list) != len(by_list):
            raise ValueError(
                f"Length of descending ({len(descending_list)}) must match "
                f"length of by ({len(by_list)})"
            )

    logger.info(f"Sorting by {by_list}, descending={descending_list}")

    result = df.sort(by=by_list, descending=descending_list, nulls_last=nulls_last)

    logger.info(f"Sort complete: {len(result)} rows")

    return result
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def validate_sort_parameters(df: pl.DataFrame, by: Union[str, List[str]]) -> None:
    """
    Validate the inputs to perform_sort.

    Args:
        df: Input DataFrame.
        by: Column(s) to sort by.

    Raises:
        ValueError: If any sort column is missing from the DataFrame.
    """
    # The frame itself must be valid and non-empty.
    validate_dataframe(df)
    validate_not_empty(df)

    # Accept scalar or list, then check every requested column exists.
    columns = [by] if isinstance(by, str) else by
    missing_cols = [c for c in columns if c not in df.columns]
    if missing_cols:
        raise ValueError(
            f"Sort columns not found in DataFrame: {missing_cols}. "
            f"Available columns: {df.columns}"
        )
|