additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -176
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -304
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/distributions.py +0 -22
  100. additory/synthetic/forecast.py +0 -1132
  101. additory/synthetic/linked_list_parser.py +0 -415
  102. additory/synthetic/namespace_lookup.py +0 -129
  103. additory/synthetic/smote.py +0 -320
  104. additory/synthetic/strategies.py +0 -850
  105. additory/synthetic/synthesizer.py +0 -713
  106. additory/utilities/__init__.py +0 -53
  107. additory/utilities/encoding.py +0 -600
  108. additory/utilities/games.py +0 -300
  109. additory/utilities/keys.py +0 -8
  110. additory/utilities/lookup.py +0 -103
  111. additory/utilities/matchers.py +0 -216
  112. additory/utilities/resolvers.py +0 -286
  113. additory/utilities/settings.py +0 -167
  114. additory/utilities/units.py +0 -749
  115. additory/utilities/validators.py +0 -153
  116. additory-0.1.0a3.dist-info/METADATA +0 -288
  117. additory-0.1.0a3.dist-info/RECORD +0 -71
  118. additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
  119. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  120. {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,351 @@
1
+ """
2
+ Lookup and add columns from reference DataFrame.
3
+
4
+ This module provides lookup functionality for the to function.
5
+ """
6
+
7
+ import polars as pl
8
+ from typing import Union, List, Optional, Dict
9
+
10
+ from additory.common.validation import validate_dataframe, validate_not_empty
11
+ from additory.common.column_selector import select_columns
12
+ from additory.core.logging import Logger
13
+
14
+
15
def perform_lookup(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    bring: Union[str, List[str]],
    against: Union[str, List[str]],
    bring_at: Optional[str] = None,
    strategy: Optional[Dict] = None
) -> pl.DataFrame:
    """
    Look up rows in a reference DataFrame and append the matched columns.

    Args:
        df: Input DataFrame.
        from_df: Reference DataFrame to pull values from.
        bring: Column name(s) to copy over from ``from_df``.
        against: Key column(s) used to match rows between the frames.
        bring_at: Optional placement of the new columns
            ('start', 'end', 'after:col', 'before:col').
        strategy: Optional per-column strategy dict (e.g. fill_null rules).

    Returns:
        DataFrame with the requested columns added.

    Example:
        >>> result = perform_lookup(
        ...     df=orders,
        ...     from_df=products,
        ...     bring='price',
        ...     against='product_id'
        ... )
    """
    logger = Logger()

    validate_lookup_parameters(df, from_df, bring, against)

    # A bare string means a single column; normalize both arguments to lists.
    bring_list = [bring] if isinstance(bring, str) else bring
    against_list = [against] if isinstance(against, str) else against

    logger.info(f"Lookup: bringing {bring_list} against {against_list}")

    # The key cardinality decides whether first-match semantics are needed.
    mode = detect_lookup_mode(df, from_df, against_list)
    logger.info(f"Detected lookup mode: {mode}")

    result = perform_join(df, from_df, bring_list, against_list, mode)

    # Optional post-processing, applied only when requested.
    if strategy:
        result = apply_strategy(result, bring_list, strategy)
    if bring_at:
        result = position_columns(result, bring_list, bring_at)

    logger.info(f"Lookup complete: {len(result)} rows, {len(result.columns)} columns")

    return result
75
+
76
+
77
def validate_lookup_parameters(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    bring: Union[str, List[str]],
    against: Union[str, List[str]]
) -> None:
    """
    Validate the inputs for a lookup operation.

    Args:
        df: Input DataFrame.
        from_df: Reference DataFrame.
        bring: Column(s) to bring over.
        against: Key column(s) for matching.

    Raises:
        ValueError: If a frame is invalid/empty or a column is missing.
    """
    # Both frames must be valid, non-empty DataFrames.
    for frame in (df, from_df):
        validate_dataframe(frame)
        validate_not_empty(frame)

    bring_list = [bring] if isinstance(bring, str) else bring
    against_list = [against] if isinstance(against, str) else against

    def _absent(columns, frame):
        # Columns requested but not present in the given frame.
        return [c for c in columns if c not in frame.columns]

    missing = _absent(bring_list, from_df)
    if missing:
        raise ValueError(
            f"Bring columns not found in reference DataFrame: {missing}. "
            f"Available columns: {from_df.columns}"
        )

    missing = _absent(against_list, df)
    if missing:
        raise ValueError(
            f"Key columns not found in input DataFrame: {missing}. "
            f"Available columns: {df.columns}"
        )

    missing = _absent(against_list, from_df)
    if missing:
        raise ValueError(
            f"Key columns not found in reference DataFrame: {missing}. "
            f"Available columns: {from_df.columns}"
        )
127
+
128
+
129
def detect_lookup_mode(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    against: List[str]
) -> str:
    """
    Choose a lookup mode from the key cardinality between the two frames.

    Args:
        df: Input DataFrame.
        from_df: Reference DataFrame.
        against: Key columns.

    Returns:
        Lookup mode string ('fetch_if_unique' or 'first').

    Raises:
        ValueError: If the relationship is many-to-many.
    """
    logger = Logger()

    cardinality = check_cardinality(df, from_df, against)
    rel = cardinality['type']

    if rel in ('one_to_one', 'many_to_one'):
        # Every input key maps to at most one reference row.
        return 'fetch_if_unique'

    if rel == 'one_to_many':
        logger.warning(
            f"One-to-many relationship detected. "
            f"Using 'first' mode (taking first match). "
            f"Max matches per key: {cardinality['max_matches']}"
        )
        return 'first'

    # many_to_many: matches are ambiguous and cannot be resolved safely.
    raise ValueError(
        "Many-to-many relationship detected. "
        "Cannot perform lookup with many-to-many relationships. "
        "Consider using a different key or aggregating the reference data first."
    )
167
+
168
+
169
def check_cardinality(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    against: List[str]
) -> Dict:
    """
    Describe the key relationship between the input and reference frames.

    Args:
        df: Input DataFrame.
        from_df: Reference DataFrame.
        against: Key columns.

    Returns:
        Dict with 'type', 'left_unique', 'right_unique', 'max_matches'.
    """
    # A side is "unique" when no key combination appears more than once.
    left_unique = df.select(against).n_unique() == len(df)
    right_unique = from_df.select(against).n_unique() == len(from_df)

    if right_unique:
        max_matches = 1
    else:
        # Count reference rows per key to find the worst-case fan-out.
        per_key = from_df.group_by(against).agg(pl.len().alias('count'))
        max_matches = per_key['count'].max()

    if left_unique:
        rel_type = 'one_to_one' if right_unique else 'one_to_many'
    else:
        rel_type = 'many_to_one' if right_unique else 'many_to_many'

    return {
        'type': rel_type,
        'left_unique': left_unique,
        'right_unique': right_unique,
        'max_matches': max_matches
    }
213
+
214
+
215
def perform_join(
    df: pl.DataFrame,
    from_df: pl.DataFrame,
    bring: List[str],
    against: List[str],
    mode: str
) -> pl.DataFrame:
    """
    Left-join the requested reference columns onto the input frame.

    Args:
        df: Input DataFrame
        from_df: Reference DataFrame
        bring: Columns to bring
        against: Key columns
        mode: Lookup mode ('first' deduplicates the reference keys)

    Returns:
        DataFrame with joined columns
    """
    # Deduplicate while preserving order: the previous list(set(...)) made
    # the brought-column order depend on hash randomization across runs.
    ref_cols = list(dict.fromkeys(against + bring))
    ref_df = from_df.select(ref_cols)

    # For 'first' mode, keep only the first occurrence of each key.
    # maintain_order=True is required: without it polars does not guarantee
    # which duplicate row survives, so "first" would be nondeterministic.
    if mode == 'first':
        ref_df = ref_df.unique(subset=against, keep='first', maintain_order=True)

    # Left join keeps every input row; unmatched keys yield nulls.
    result = df.join(ref_df, on=against, how='left')

    return result
247
+
248
+
249
def apply_strategy(
    df: pl.DataFrame,
    bring: List[str],
    strategy: Dict
) -> pl.DataFrame:
    """
    Apply per-column post-processing rules to the lookup result.

    Args:
        df: DataFrame with the joined columns.
        bring: Columns that were brought over.
        strategy: Mapping of column name -> strategy spec.

    Returns:
        DataFrame with the strategies applied.

    Note:
        Currently only a 'fill_null' entry per column is handled.
    """
    result = df

    for column in bring:
        spec = strategy.get(column)
        # Only dict specs that carry an explicit fill_null rule are applied.
        if isinstance(spec, dict) and 'fill_null' in spec:
            result = result.with_columns(
                pl.col(column).fill_null(spec['fill_null'])
            )

    return result
282
+
283
+
284
def position_columns(
    df: pl.DataFrame,
    bring: List[str],
    bring_at: str
) -> pl.DataFrame:
    """
    Reorder the frame so the brought columns sit at the requested spot.

    Args:
        df: DataFrame containing the new columns.
        bring: Columns to position.
        bring_at: Position specification.

    Returns:
        DataFrame with repositioned columns.

    Position Specifications:
        - 'start' - At beginning
        - 'end' - At end (default)
        - 'after:column_name' - After specified column
        - 'before:column_name' - Before specified column
    """
    if bring_at == 'start':
        # Brought columns first, everything else in original order.
        remainder = [c for c in df.columns if c not in bring]
        return df.select(bring + remainder)

    if bring_at == 'end':
        # The join already appended them on the right; nothing to do.
        return df

    if bring_at.startswith('after:'):
        target = bring_at.split(':', 1)[1]
        if target not in df.columns:
            raise ValueError(f"Target column '{target}' not found in DataFrame")

        # Walk the existing order, splicing the brought set in after the target.
        order = []
        for c in df.columns:
            if c not in bring:
                order.append(c)
            if c == target:
                order.extend(bring)
        return df.select(order)

    if bring_at.startswith('before:'):
        target = bring_at.split(':', 1)[1]
        if target not in df.columns:
            raise ValueError(f"Target column '{target}' not found in DataFrame")

        # Same walk, but the brought set goes in just ahead of the target.
        order = []
        for c in df.columns:
            if c not in bring:
                if c == target:
                    order.extend(bring)
                order.append(c)
        return df.select(order)

    raise ValueError(
        f"Invalid bring_at specification: {bring_at}. "
        f"Valid options: 'start', 'end', 'after:column', 'before:column'"
    )
@@ -0,0 +1,189 @@
1
+ """
2
+ Merge multiple DataFrames.
3
+
4
+ This module provides merging functionality for the to function.
5
+ """
6
+
7
+ import polars as pl
8
+ from typing import List, Union, Optional
9
+
10
+ from additory.common.validation import validate_dataframe, validate_not_empty
11
+ from additory.core.logging import Logger
12
+
13
+
14
def perform_merge(
    dfs: List[pl.DataFrame],
    how: str = 'vertical',
    on: Optional[Union[str, List[str]]] = None,
    **kwargs
) -> pl.DataFrame:
    """
    Merge multiple DataFrames.

    Args:
        dfs: List of DataFrames to merge.
        how: Merge type ('vertical', 'horizontal', 'diagonal').
        on: Key column(s), required for horizontal merges.
        **kwargs: Additional parameters (reserved for future use).

    Returns:
        Merged DataFrame.

    Merge Types:
        - 'vertical' - Stack vertically (union, concat)
        - 'horizontal' - Join horizontally (requires key)
        - 'diagonal' - Diagonal concat (fill missing columns)

    Example:
        >>> result = perform_merge([df1, df2, df3], how='vertical')
        >>> result = perform_merge([df1, df2], how='horizontal', on='id')
    """
    logger = Logger()

    validate_merge_parameters(dfs, how, on)

    logger.info(f"Merging {len(dfs)} DataFrames using '{how}' method")

    # Dispatch table keeps the type -> implementation mapping flat.
    dispatch = {
        'vertical': lambda: merge_vertical(dfs),
        'horizontal': lambda: merge_horizontal(dfs, on),
        'diagonal': lambda: merge_diagonal(dfs),
    }
    if how not in dispatch:
        raise ValueError(f"Invalid merge type: {how}")
    result = dispatch[how]()

    logger.info(f"Merge complete: {len(result)} rows, {len(result.columns)} columns")

    return result
62
+
63
+
64
def validate_merge_parameters(
    dfs: List[pl.DataFrame],
    how: str,
    on: Optional[Union[str, List[str]]] = None
) -> None:
    """
    Validate merge parameters.

    Args:
        dfs: List of DataFrames to merge
        how: Merge type ('vertical', 'horizontal', 'diagonal')
        on: Key column(s), required for horizontal merge

    Raises:
        TypeError: If dfs is not a list
        ValueError: If any other validation fails
    """
    # Check that dfs is a list
    if not isinstance(dfs, list):
        raise TypeError("dfs must be a list of DataFrames")

    # Check that we have at least 2 DataFrames
    if len(dfs) < 2:
        raise ValueError("Must provide at least 2 DataFrames to merge")

    # Each frame must be a valid, non-empty DataFrame.
    # (The index from the original enumerate() was unused here.)
    for df in dfs:
        validate_dataframe(df)
        validate_not_empty(df)

    # Check that how is valid
    valid_how = ['vertical', 'horizontal', 'diagonal']
    if how not in valid_how:
        raise ValueError(
            f"Invalid merge type: {how}. "
            f"Valid types: {valid_how}"
        )

    if how == 'horizontal':
        # Horizontal merge is a keyed join, so the key is mandatory...
        if on is None:
            raise ValueError("Parameter 'on' is required for horizontal merge")

        # ...and every key column must exist in every frame.
        on_list = [on] if isinstance(on, str) else on
        for i, df in enumerate(dfs):
            missing_cols = [col for col in on_list if col not in df.columns]
            if missing_cols:
                raise ValueError(
                    f"Key columns not found in DataFrame {i}: {missing_cols}. "
                    f"Available columns: {df.columns}"
                )
115
+
116
+
117
def merge_vertical(dfs: List[pl.DataFrame]) -> pl.DataFrame:
    """
    Stack DataFrames vertically (rows appended in order).

    Args:
        dfs: DataFrames to stack; all must share the same column set.

    Returns:
        Vertically stacked DataFrame.

    Raises:
        ValueError: If any frame's columns differ from the first frame's.
    """
    # Column sets must match exactly; order within the set is irrelevant.
    reference = set(dfs[0].columns)
    for i, frame in enumerate(dfs[1:], 1):
        if set(frame.columns) != reference:
            raise ValueError(
                f"DataFrame {i} has different columns than DataFrame 0. "
                f"For vertical merge, all DataFrames must have the same columns. "
                f"Use 'diagonal' merge to handle different columns."
            )

    return pl.concat(dfs, how='vertical')
144
+
145
+
146
def merge_horizontal(
    dfs: List[pl.DataFrame],
    on: Union[str, List[str]]
) -> pl.DataFrame:
    """
    Join DataFrames horizontally on shared key column(s).

    Args:
        dfs: DataFrames to join; the first frame is the base.
        on: Key column(s) for joining.

    Returns:
        Horizontally joined DataFrame.

    Note:
        Performs left joins sequentially, so rows missing from later
        frames yield nulls in the columns they contribute.
    """
    # Start with the first frame as the base.
    result = dfs[0]

    # Fold the remaining frames in with left joins. The index from the
    # original enumerate() was never used, so iterate the frames directly.
    for frame in dfs[1:]:
        result = result.join(frame, on=on, how='left')

    return result
171
+
172
+
173
def merge_diagonal(dfs: List[pl.DataFrame]) -> pl.DataFrame:
    """
    Concatenate DataFrames diagonally.

    Args:
        dfs: DataFrames to concatenate.

    Returns:
        Diagonally concatenated DataFrame.

    Note:
        polars aligns columns by name and fills the gaps with nulls,
        so the frames may have different column sets.
    """
    return pl.concat(dfs, how='diagonal')
@@ -0,0 +1,91 @@
1
+ """
2
+ Sort DataFrame.
3
+
4
+ This module provides sorting functionality for the to function.
5
+ """
6
+
7
+ import polars as pl
8
+ from typing import Union, List
9
+
10
+ from additory.common.validation import validate_dataframe, validate_not_empty
11
+ from additory.core.logging import Logger
12
+
13
+
14
def perform_sort(
    df: pl.DataFrame,
    by: Union[str, List[str]],
    descending: Union[bool, List[bool]] = False,
    nulls_last: bool = True
) -> pl.DataFrame:
    """
    Sort a DataFrame by one or more columns.

    Args:
        df: Input DataFrame.
        by: Column(s) to sort by.
        descending: Sort direction, either one bool for all columns or
            one bool per column.
        nulls_last: Place nulls at the end.

    Returns:
        Sorted DataFrame.

    Example:
        >>> result = perform_sort(df, by='date', descending=True)
        >>> result = perform_sort(df, by=['category', 'price'],
        ...                       descending=[False, True])
    """
    logger = Logger()

    validate_sort_parameters(df, by)

    sort_cols = [by] if isinstance(by, str) else by

    # A single bool applies to every sort column; a list must line up 1:1.
    if isinstance(descending, bool):
        flags = [descending] * len(sort_cols)
    else:
        flags = descending
        if len(flags) != len(sort_cols):
            raise ValueError(
                f"Length of descending ({len(flags)}) must match "
                f"length of by ({len(sort_cols)})"
            )

    logger.info(f"Sorting by {sort_cols}, descending={flags}")

    result = df.sort(by=sort_cols, descending=flags, nulls_last=nulls_last)

    logger.info(f"Sort complete: {len(result)} rows")

    return result
65
+
66
+
67
def validate_sort_parameters(df: pl.DataFrame, by: Union[str, List[str]]) -> None:
    """
    Validate inputs for a sort operation.

    Args:
        df: Input DataFrame.
        by: Column(s) to sort by.

    Raises:
        ValueError: If the frame is invalid/empty or a sort column is missing.
    """
    validate_dataframe(df)
    validate_not_empty(df)

    requested = [by] if isinstance(by, str) else by

    # Every requested sort column must actually exist in the frame.
    missing_cols = [name for name in requested if name not in df.columns]
    if missing_cols:
        raise ValueError(
            f"Sort columns not found in DataFrame: {missing_cols}. "
            f"Available columns: {df.columns}"
        )