additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,358 +0,0 @@
1
- # additory/core/column_positioning.py
2
-
3
- """
4
- Column Positioning System for Smart Column Insertion
5
-
6
- This module provides intelligent column positioning capabilities for the add.to() function.
7
- Users can specify where new columns should be inserted in the target dataframe.
8
-
9
- Supported positioning options:
10
- - "end": Append at end (default)
11
- - "start": Insert at beginning
12
- - int: 0-based index position
13
- - "after:col_name": After specific column
14
- - "before:col_name": Before specific column
15
-
16
- Design Philosophy:
17
- - Intuitive positioning syntax
18
- - Robust error handling with helpful suggestions
19
- - Preserve dataframe structure and types
20
- - Support for multiple new columns with smart placement
21
- """
22
-
23
- import logging
24
- from typing import Union, List, Optional, Dict, Any
25
- import pandas as pd
26
-
27
- logger = logging.getLogger(__name__)
28
-
29
-
30
class ColumnPositioner:
    """
    Handles intelligent column positioning for dataframes

    The new columns are expected to already be present in the dataframe; the
    positioner only returns a re-ordered selection of the same columns.
    Supported ``position`` values are ``"end"``, ``"start"``, an integer
    index, ``"after:<column>"`` and ``"before:<column>"``.

    Every failure path in ``position_columns`` logs and returns the dataframe
    unchanged rather than raising, and per-instance counters describing the
    operations performed are available through ``get_stats()``.
    """

    # Counter names tracked per instance. Single source of truth shared by
    # __init__ and reset_stats so the two can never drift apart.
    _STAT_KEYS = (
        'total_operations',
        'end_insertions',
        'start_insertions',
        'index_insertions',
        'relative_insertions',
        'errors_handled',
    )

    def __init__(self):
        self._positioning_stats = self._fresh_stats()

    @classmethod
    def _fresh_stats(cls) -> Dict[str, int]:
        """Return a statistics dictionary with every counter set to zero."""
        return dict.fromkeys(cls._STAT_KEYS, 0)

    @staticmethod
    def _existing_columns(df: pd.DataFrame, new_columns: List[str]) -> list:
        """Return the dataframe's columns, in order, excluding ``new_columns``."""
        return [col for col in df.columns if col not in new_columns]

    def position_columns(self,
                         df: pd.DataFrame,
                         new_columns: List[str],
                         position: Union[str, int] = "end") -> pd.DataFrame:
        """
        Insert new columns at specified position in dataframe

        Args:
            df: Target dataframe with new columns already added at the end
            new_columns: List of column names that were just added
            position: Where to position the new columns

        Returns:
            DataFrame with columns repositioned as requested. The input
            dataframe is returned unchanged when ``new_columns`` is empty,
            when any of them is missing from ``df``, when ``position`` is not
            recognised, or when repositioning raises.
        """
        self._positioning_stats['total_operations'] += 1

        # Validate inputs
        if not new_columns:
            logger.warning("No new columns specified for positioning")
            return df

        # Drop repeated names (keeping first occurrence) so a name listed
        # twice cannot produce duplicated columns in the re-ordered result.
        new_columns = list(dict.fromkeys(new_columns))

        # Check that new columns exist in dataframe
        missing_cols = [col for col in new_columns if col not in df.columns]
        if missing_cols:
            logger.error(f"New columns not found in dataframe: {missing_cols}")
            self._positioning_stats['errors_handled'] += 1
            return df

        # Positioning is deliberately best-effort: any failure falls back to
        # leaving the columns where they already are (at the end).
        try:
            if position == "end":
                # Already at end, no change needed
                self._positioning_stats['end_insertions'] += 1
                logger.debug("Columns positioned at end (default)")
                return df

            if position == "start":
                return self._position_at_start(df, new_columns)

            if isinstance(position, int):
                return self._position_at_index(df, new_columns, position)

            if isinstance(position, str) and position.startswith("after:"):
                # Everything after the "after:" prefix is the column name
                return self._position_after_column(df, new_columns, position[6:])

            if isinstance(position, str) and position.startswith("before:"):
                # Everything after the "before:" prefix is the column name
                return self._position_before_column(df, new_columns, position[7:])

            logger.warning(f"Unknown position specification: '{position}'. Using default 'end'.")
            self._positioning_stats['errors_handled'] += 1
            return df

        except Exception as e:
            logger.error(f"Column positioning failed: {e}. Using default 'end'.")
            self._positioning_stats['errors_handled'] += 1
            return df

    def _position_at_start(self, df: pd.DataFrame, new_columns: List[str]) -> pd.DataFrame:
        """Position new columns at the start of the dataframe"""
        self._positioning_stats['start_insertions'] += 1

        # New columns first, then every pre-existing column in original order
        new_order = list(new_columns) + self._existing_columns(df, new_columns)

        logger.debug(f"Positioning {len(new_columns)} columns at start")
        return df[new_order]

    def _position_at_index(self, df: pd.DataFrame, new_columns: List[str], index: int) -> pd.DataFrame:
        """
        Position new columns at specific 0-based index

        The index is interpreted relative to the pre-existing columns only.
        Negative values count back from the end (-1 inserts before the last
        existing column) and are clamped to 0; values past the end are
        clamped to the end with a warning.
        """
        self._positioning_stats['index_insertions'] += 1

        existing_columns = self._existing_columns(df, new_columns)

        # Validate index
        max_index = len(existing_columns)
        if index < 0:
            index = max(0, max_index + index)
        elif index > max_index:
            logger.warning(f"Index {index} exceeds column count {max_index}. Using end position.")
            index = max_index

        # Insert new columns at specified index
        new_order = existing_columns[:index] + list(new_columns) + existing_columns[index:]

        logger.debug(f"Positioning {len(new_columns)} columns at index {index}")
        return df[new_order]

    def _position_relative(self, df: pd.DataFrame, new_columns: List[str],
                           reference_col: str, after: bool) -> pd.DataFrame:
        """
        Position new columns directly after or before ``reference_col``.

        Returns ``df`` unchanged (and counts a handled error) when the
        reference column is not among the pre-existing columns.
        """
        self._positioning_stats['relative_insertions'] += 1

        existing_columns = self._existing_columns(df, new_columns)

        # Check if reference column exists
        if reference_col not in existing_columns:
            available_cols = existing_columns[:5]  # Show first 5 for brevity
            logger.warning(f"Reference column '{reference_col}' not found. "
                           f"Available columns: {available_cols}{'...' if len(existing_columns) > 5 else ''}. "
                           f"Using end position.")
            # Count the fallback like every other recovered failure.
            self._positioning_stats['errors_handled'] += 1
            return df

        insert_index = existing_columns.index(reference_col)
        if after:
            insert_index += 1

        new_order = (existing_columns[:insert_index] +
                     list(new_columns) +
                     existing_columns[insert_index:])

        side = 'after' if after else 'before'
        logger.debug(f"Positioning {len(new_columns)} columns {side} '{reference_col}'")
        return df[new_order]

    def _position_after_column(self, df: pd.DataFrame, new_columns: List[str],
                               reference_col: str) -> pd.DataFrame:
        """Position new columns after a specific reference column"""
        return self._position_relative(df, new_columns, reference_col, after=True)

    def _position_before_column(self, df: pd.DataFrame, new_columns: List[str],
                                reference_col: str) -> pd.DataFrame:
        """Position new columns before a specific reference column"""
        return self._position_relative(df, new_columns, reference_col, after=False)

    def validate_position_syntax(self, position: Union[str, int]) -> Dict[str, Any]:
        """
        Validate position syntax and provide helpful feedback

        Only the syntax is checked; no dataframe is consulted, so a
        syntactically valid ``after:``/``before:`` reference may still name a
        column that does not exist.

        Returns:
            Dict with keys ``valid``, ``position_type``, ``parsed_value``,
            ``warnings`` and ``suggestions``.
        """
        result = {
            'valid': True,
            'position_type': None,
            'parsed_value': None,
            'warnings': [],
            'suggestions': []
        }

        if position == "end":
            result['position_type'] = 'end'

        elif position == "start":
            result['position_type'] = 'start'

        elif isinstance(position, int):
            result['position_type'] = 'index'
            result['parsed_value'] = position
            if position < 0:
                result['warnings'].append("Negative index will be converted to positive")

        elif isinstance(position, str) and position.startswith("after:"):
            reference_col = position[6:]
            if not reference_col:
                result['valid'] = False
                result['suggestions'].append("Specify column name after 'after:' (e.g., 'after:product_id')")
            else:
                result['position_type'] = 'after'
                result['parsed_value'] = reference_col

        elif isinstance(position, str) and position.startswith("before:"):
            reference_col = position[7:]
            if not reference_col:
                result['valid'] = False
                result['suggestions'].append("Specify column name after 'before:' (e.g., 'before:total')")
            else:
                result['position_type'] = 'before'
                result['parsed_value'] = reference_col

        else:
            result['valid'] = False
            result['suggestions'].extend([
                "Valid position options:",
                "  - 'end' (default)",
                "  - 'start'",
                "  - integer index (0-based)",
                "  - 'after:column_name'",
                "  - 'before:column_name'"
            ])

        return result

    def get_column_suggestions(self, df: pd.DataFrame, partial_name: str = "") -> List[str]:
        """
        Get column name suggestions for positioning

        Args:
            df: Target dataframe
            partial_name: Partial column name for filtering suggestions

        Returns:
            Up to 10 column labels. With an empty ``partial_name`` these are
            the first 10 columns; otherwise the first 10 whose label contains
            ``partial_name`` (case-insensitive).
        """
        columns = list(df.columns)

        if not partial_name:
            return columns[:10]  # Return first 10 columns

        # Column labels are not guaranteed to be strings (e.g. integer
        # labels), so compare against their string form instead of calling
        # .lower() on the label itself.
        partial_lower = partial_name.lower()
        matches = [col for col in columns if partial_lower in str(col).lower()]

        return matches[:10]  # Return up to 10 matches

    def get_stats(self) -> Dict[str, Any]:
        """Get column positioning statistics (a copy; safe to mutate)"""
        return self._positioning_stats.copy()

    def reset_stats(self):
        """Reset positioning statistics"""
        self._positioning_stats = self._fresh_stats()
297
-
298
-
299
# Global positioner instance shared by the module-level convenience functions;
# its statistics counters accumulate across every call routed through it.
_positioner = ColumnPositioner()
301
-
302
-
303
- # Convenience functions
304
def position_columns(df: pd.DataFrame,
                     new_columns: List[str],
                     position: Union[str, int] = "end") -> pd.DataFrame:
    """
    Reposition ``new_columns`` within ``df`` via the module-level positioner.

    Args:
        df: Dataframe that already contains the new columns
        new_columns: Names of the columns to move
        position: Target position specification (defaults to "end")

    Returns:
        Whatever the shared positioner's ``position_columns`` returns.
    """
    reordered = _positioner.position_columns(df, new_columns, position)
    return reordered
309
-
310
-
311
def validate_position_syntax(position: Union[str, int]) -> Dict[str, Any]:
    """
    Check a position specification using the module-level positioner.

    Args:
        position: Position specification to check

    Returns:
        The validation dictionary produced by the shared positioner.
    """
    report = _positioner.validate_position_syntax(position)
    return report
314
-
315
-
316
def get_column_suggestions(df: pd.DataFrame, partial_name: str = "") -> List[str]:
    """
    Suggest column names from ``df`` using the module-level positioner.

    Args:
        df: Dataframe whose columns are offered as suggestions
        partial_name: Optional fragment used to filter the suggestions

    Returns:
        The suggestion list produced by the shared positioner.
    """
    suggestions = _positioner.get_column_suggestions(df, partial_name)
    return suggestions
319
-
320
-
321
def get_positioning_stats() -> Dict[str, Any]:
    """
    Report the statistics gathered by the module-level positioner.

    Returns:
        The dictionary returned by the shared positioner's ``get_stats``.
    """
    stats = _positioner.get_stats()
    return stats
324
-
325
-
326
- # Example usage and validation
327
def demonstrate_positioning():
    """
    Print the column order produced by each supported position option.

    Builds a three-row sample dataframe whose last two columns stand in for
    freshly added ones, then repositions them with every option in turn.
    """
    sample_data = {
        'id': [1, 2, 3],
        'name': ['A', 'B', 'C'],
        'category': ['X', 'Y', 'Z'],
        # The two trailing columns play the role of newly added columns
        'new_col1': [10, 20, 30],
        'new_col2': [100, 200, 300],
    }
    df = pd.DataFrame(sample_data)
    new_columns = ['new_col1', 'new_col2']

    print("Original column order:", list(df.columns))

    # One entry per supported kind of position specification
    for pos in ("start", "end", 1, "after:name", "before:category"):
        reordered = position_columns(df, new_columns, pos)
        print(f"Position '{pos}': {list(reordered.columns)}")


if __name__ == "__main__":
    demonstrate_positioning()
@@ -1,166 +0,0 @@
1
- # compiler_polars.py
2
-
3
- import polars as pl
4
-
5
-
6
# Arithmetic operators accepted by "binary" nodes, keyed by the AST op token.
_BINARY_OPS = {
    "+": lambda lhs, rhs: lhs + rhs,
    "-": lambda lhs, rhs: lhs - rhs,
    "*": lambda lhs, rhs: lhs * rhs,
    "/": lambda lhs, rhs: lhs / rhs,
    "**": lambda lhs, rhs: lhs ** rhs,
    "%": lambda lhs, rhs: lhs % rhs,
    "//": lambda lhs, rhs: lhs // rhs,
}

# Comparison operators accepted by "cmp" nodes.
_COMPARE_OPS = {
    "==": lambda lhs, rhs: lhs == rhs,
    "!=": lambda lhs, rhs: lhs != rhs,
    ">": lambda lhs, rhs: lhs > rhs,
    "<": lambda lhs, rhs: lhs < rhs,
    ">=": lambda lhs, rhs: lhs >= rhs,
    "<=": lambda lhs, rhs: lhs <= rhs,
}

# Boolean combinators accepted by "bool_op" nodes.
_BOOL_OPS = {
    "and": lambda lhs, rhs: lhs & rhs,
    "or": lambda lhs, rhs: lhs | rhs,
}

# Functions that map to a same-named, zero-argument method on the first operand.
_UNARY_METHODS = frozenset(
    {"abs", "log", "exp", "sqrt", "ceil", "floor", "sin", "cos", "tan"}
)

# Functions handled individually in compile_polars.
_SPECIAL_FUNCTIONS = frozenset({"round", "min", "max"})


def _lookup_op(table, op):
    """Return the callable registered for ``op`` in ``table``, or None."""
    # Guard on str so an unhashable/odd op token reports "unknown" instead of
    # failing inside the dict lookup.
    return table.get(op) if isinstance(op, str) else None


def _round_decimals(arg_nodes):
    """
    Return the number of decimals requested by a round() call.

    The value is read from the raw AST node rather than from the compiled
    argument because the rounding precision must be a plain Python integer,
    not a Polars expression. Defaults to 0 when no second argument is given.
    """
    if len(arg_nodes) < 2:
        return 0

    node = arg_nodes[1]
    value = None
    if isinstance(node, dict) and node.get("type") == "literal":
        value = node.get("value")

    # Accept integral floats such as 2.0, which a parser may emit for "2.0"
    if isinstance(value, float) and value.is_integer():
        value = int(value)

    if isinstance(value, bool) or not isinstance(value, int):
        raise NotImplementedError(
            "round() requires a literal integer number of decimals"
        )
    return value


def compile_polars(ast):
    """
    Convert AST → Polars expression.

    Supports:
    - column
    - literal
    - binary arithmetic (+ - * / ** % //)
    - comparisons (== != > < >= <=)
    - boolean logic (and / or)
    - unary boolean (not)
    - ternary (if_expr)
    - function calls (abs, log, exp, sqrt, round, ceil, floor,
      sin, cos, tan, min, max)

    Args:
        ast: Dict describing one node; ``ast["type"]`` selects the node kind
            and the remaining keys hold its children.

    Returns:
        The Polars expression equivalent to the node.

    Raises:
        NotImplementedError: for an unknown node type, operator or function,
            or a round() precision that is not a literal integer.
        ValueError: for a boolean operation without operands or a known
            function called without arguments.
    """
    node_type = ast["type"]

    # Column reference
    if node_type == "column":
        return pl.col(ast["name"])

    # Literal
    if node_type == "literal":
        return pl.lit(ast["value"])

    # Binary arithmetic: + - * / ** % //
    if node_type == "binary":
        left = compile_polars(ast["left"])
        right = compile_polars(ast["right"])
        op = ast["op"]
        apply_op = _lookup_op(_BINARY_OPS, op)
        if apply_op is None:
            raise NotImplementedError(f"Unknown binary op: {op}")
        return apply_op(left, right)

    # Comparison: == != > < >= <=
    if node_type == "cmp":
        left = compile_polars(ast["left"])
        right = compile_polars(ast["right"])
        op = ast["op"]
        apply_op = _lookup_op(_COMPARE_OPS, op)
        if apply_op is None:
            raise NotImplementedError(f"Unknown comparison op: {op}")
        return apply_op(left, right)

    # Boolean operations: and/or
    if node_type == "bool_op":
        op = ast["op"]
        values = [compile_polars(v) for v in ast["values"]]
        combine = _lookup_op(_BOOL_OPS, op)
        if combine is None:
            raise NotImplementedError(f"Unknown boolean op: {op}")
        if not values:
            raise ValueError(f"Boolean '{op}' requires at least one operand")

        # Left fold: ((v0 op v1) op v2) ...
        expr = values[0]
        for operand in values[1:]:
            expr = combine(expr, operand)
        return expr

    # Unary boolean: not x
    if node_type == "unary_bool":
        return ~compile_polars(ast["value"])

    # Ternary: a if cond else b
    if node_type == "if_expr":
        cond = compile_polars(ast["cond"])
        then = compile_polars(ast["then"])
        els = compile_polars(ast["else"])
        return pl.when(cond).then(then).otherwise(els)

    # Function calls
    if node_type == "call":
        name = ast["name"]
        arg_nodes = list(ast["args"])
        args = [compile_polars(a) for a in arg_nodes]

        is_unary = isinstance(name, str) and name in _UNARY_METHODS
        is_special = isinstance(name, str) and name in _SPECIAL_FUNCTIONS
        if not (is_unary or is_special):
            raise NotImplementedError(f"Unknown function: {name}")
        if not args:
            raise ValueError(f"{name}() requires at least one argument")

        # abs/log/exp/sqrt/ceil/floor/sin/cos/tan: same-named Expr method
        if is_unary:
            return getattr(args[0], name)()

        if name == "round":
            return args[0].round(_round_decimals(arg_nodes))

        # Aggregation functions (horizontal, i.e. across the given operands)
        if name == "min":
            return pl.min_horizontal(*args)
        return pl.max_horizontal(*args)

    # Fallback
    raise NotImplementedError(f"Unsupported AST node: {ast}")