additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
@@ -1,286 +0,0 @@
1
- # additory/utilities/resolvers.py
2
-
3
- import pandas as pd
4
- from typing import List, Dict, Any
5
- from collections import Counter
6
-
7
def resolve_strict(matches, ref_df, cols):
    """Resolve a lookup only when it is unambiguous.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to pull from the matched row.

    Returns:
        A dict keyed by column name. Holds the matched row's values when
        there is exactly one match; otherwise every column maps to ``None``.
    """
    if len(matches) == 1:
        hit = ref_df.iloc[matches[0]]
        return {name: hit[name] for name in cols}

    # Zero matches or several matches: refuse to pick.
    return dict.fromkeys(cols)
17
-
18
-
19
def resolve_first(matches, ref_df, cols):
    """Resolve a lookup by taking the first matching row.

    This is the first-match rule the original author likens to Excel's
    VLOOKUP.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to pull from the chosen row.

    Returns:
        A dict keyed by column name with the first match's values, or with
        ``None`` for every column when there are no matches.
    """
    if matches:
        hit = ref_df.iloc[matches[0]]
        return {name: hit[name] for name in cols}

    return dict.fromkeys(cols)
30
-
31
-
32
def resolve_last(matches, ref_df, cols):
    """Resolve a lookup by taking the last matching row.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to pull from the chosen row.

    Returns:
        A dict keyed by column name with the last match's values, or with
        ``None`` for every column when there are no matches.
    """
    if matches:
        hit = ref_df.iloc[matches[-1]]
        return {name: hit[name] for name in cols}

    return dict.fromkeys(cols)
43
-
44
-
45
def resolve_majority(matches, ref_df, cols):
    """Resolve each column to its most frequent non-null value.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to resolve.

    Returns:
        A dict keyed by column name. Each value is the most common non-null
        entry among the matched rows (the earliest-seen value wins a tie),
        or ``None`` when there are no matches or every entry is null.
    """
    resolved = {}

    for name in cols:
        # Nulls are skipped so they can never win the vote.
        tally = Counter(
            cell
            for cell in (ref_df.iloc[pos][name] for pos in matches)
            if pd.notna(cell)
        )

        # An empty tally covers both "no matches" and "all values null".
        resolved[name] = tally.most_common(1)[0][0] if tally else None

    return resolved
74
-
75
-
76
def resolve_max(matches, ref_df, cols):
    """Resolve each column to the largest non-null matched value.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to resolve.

    Returns:
        A dict keyed by column name. Each value is the maximum of the
        non-null matched entries, or ``None`` when there are none or the
        entries cannot be compared with each other.
    """
    resolved = {}

    for name in cols:
        cells = [ref_df.iloc[pos][name] for pos in matches]
        try:
            present = [cell for cell in cells if pd.notna(cell)]
            resolved[name] = max(present, default=None)
        except (TypeError, ValueError):
            # Values that cannot be ordered (e.g. mixed types) resolve to None.
            resolved[name] = None

    return resolved
100
-
101
-
102
def resolve_min(matches, ref_df, cols):
    """Resolve each column to the smallest non-null matched value.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to resolve.

    Returns:
        A dict keyed by column name. Each value is the minimum of the
        non-null matched entries, or ``None`` when there are none or the
        entries cannot be compared with each other.
    """
    resolved = {}

    for name in cols:
        cells = [ref_df.iloc[pos][name] for pos in matches]
        try:
            present = [cell for cell in cells if pd.notna(cell)]
            resolved[name] = min(present, default=None)
        except (TypeError, ValueError):
            # Values that cannot be ordered (e.g. mixed types) resolve to None.
            resolved[name] = None

    return resolved
126
-
127
-
128
def resolve_longest(matches, ref_df, cols):
    """Resolve each column to the matched value with the longest text form.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to resolve.

    Returns:
        A dict keyed by column name. Each value is the non-null matched
        entry whose ``str()`` form is longest (the earliest wins a tie); the
        original value is returned, not its string form. ``None`` when
        there are no non-null entries.
    """

    def text_length(cell):
        return len(str(cell))

    resolved = {}

    for name in cols:
        cells = [ref_df.iloc[pos][name] for pos in matches]
        present = [cell for cell in cells if pd.notna(cell)]
        resolved[name] = max(present, key=text_length, default=None)

    return resolved
149
-
150
-
151
def resolve_shortest(matches, ref_df, cols):
    """Resolve each column to the matched value with the shortest text form.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to resolve.

    Returns:
        A dict keyed by column name. Each value is the non-null matched
        entry whose ``str()`` form is shortest (the earliest wins a tie);
        the original value is returned, not its string form. ``None`` when
        there are no non-null entries.
    """

    def text_length(cell):
        return len(str(cell))

    resolved = {}

    for name in cols:
        cells = [ref_df.iloc[pos][name] for pos in matches]
        present = [cell for cell in cells if pd.notna(cell)]
        resolved[name] = min(present, key=text_length, default=None)

    return resolved
172
-
173
-
174
def resolve_sum(matches, ref_df, cols):
    """Resolve each column to the sum of its non-null matched values.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to resolve.

    Returns:
        A dict keyed by column name. Each value is the float sum of the
        non-null matched entries, or ``None`` when there are none or any
        entry cannot be converted with ``float()``.
    """
    resolved = {}

    for name in cols:
        cells = [ref_df.iloc[pos][name] for pos in matches]
        try:
            numbers = [float(cell) for cell in cells if pd.notna(cell)]
        except (TypeError, ValueError):
            # A single non-numeric entry voids the whole column.
            resolved[name] = None
            continue

        resolved[name] = sum(numbers) if numbers else None

    return resolved
200
-
201
-
202
def resolve_count(matches, ref_df, cols):
    """Count, per column, how many matched rows hold a non-null value.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to count.

    Returns:
        A dict keyed by column name whose values are integer counts of
        non-null entries among the matched rows (``0`` when nothing matched).
    """
    resolved = {}

    for name in cols:
        present = [
            pos for pos in matches if pd.notna(ref_df.iloc[pos][name])
        ]
        resolved[name] = len(present)

    return resolved
215
-
216
-
217
def resolve_avg(matches, ref_df, cols):
    """Resolve each column to the mean of its non-null matched values.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to resolve.

    Returns:
        A dict keyed by column name. Each value is the arithmetic mean of
        the non-null matched entries (as a float), or ``None`` when there
        are none or any entry cannot be converted with ``float()``.
    """
    resolved = {}

    for name in cols:
        cells = [ref_df.iloc[pos][name] for pos in matches]
        try:
            numbers = [float(cell) for cell in cells if pd.notna(cell)]
        except (TypeError, ValueError):
            # A single non-numeric entry voids the whole column.
            resolved[name] = None
            continue

        # Nulls are excluded from both the numerator and the denominator.
        resolved[name] = sum(numbers) / len(numbers) if numbers else None

    return resolved
243
-
244
-
245
def resolve_concat(matches, ref_df, cols):
    """Resolve each column by joining its non-null matched values.

    Args:
        matches: Positional row indices into ``ref_df`` that matched.
        ref_df: Reference DataFrame, addressed positionally via ``iloc``.
        cols: Names of the columns to resolve.

    Returns:
        A dict keyed by column name. Each value is the ``str()`` forms of
        the non-null matched entries joined with ``"; "`` in match order,
        or ``None`` when there are no non-null entries.
    """
    resolved = {}

    for name in cols:
        cells = [ref_df.iloc[pos][name] for pos in matches]
        pieces = [str(cell) for cell in cells if pd.notna(cell)]
        # Test the list, not the joined text: a lone "" must stay "".
        resolved[name] = "; ".join(pieces) if pieces else None

    return resolved
266
-
267
-
268
# Strategy name -> resolver callable. Every resolver takes
# (matches, ref_df, cols) and returns a dict keyed by column name.
RESOLVERS = dict(
    # Single value selection
    strict=resolve_strict,
    first=resolve_first,
    last=resolve_last,
    majority=resolve_majority,
    # Numeric aggregation
    max=resolve_max,
    min=resolve_min,
    sum=resolve_sum,
    avg=resolve_avg,
    count=resolve_count,
    # Text aggregation
    longest=resolve_longest,
    shortest=resolve_shortest,
    concat=resolve_concat,
)
@@ -1,167 +0,0 @@
1
- # additory/utilities/settings.py
2
- # Global settings management
3
-
4
- """
5
- Settings Utilities Module
6
-
7
- This module provides global settings management for the additory library:
8
- - Backend preferences
9
- - Path configurations
10
- - Performance settings
11
- - User preferences
12
- """
13
-
14
- from typing import Optional, Dict, Any
15
- import os
16
-
17
- # Global settings storage
18
# Module-wide settings store. The setters in this module mutate it in place;
# the getters hand out copies.
_global_settings = {
    "backend": "auto",  # auto, pandas, polars, cudf
    "precision": "auto",  # auto, float32, float64
    "my_expressions_path": None,
    "my_schemas_path": None,
    "cache_enabled": True,
    "memory_threshold_mb": 100,
    "performance_mode": "balanced",  # fast, balanced, memory_optimized
}


def set_global_settings(**kwargs) -> Dict[str, Any]:
    """Update one or more global settings atomically.

    All keyword arguments are validated before anything is stored, so a
    rejected call leaves the settings exactly as they were.

    Args:
        backend: Preferred backend ("auto", "pandas", "polars", "cudf").
        precision: Numeric precision ("auto", "float32", "float64").
        my_expressions_path: Existing path to user expressions, or None.
        my_schemas_path: Existing path to user schemas, or None.
        cache_enabled: Enable/disable caching (stored as given).
        memory_threshold_mb: Memory cleanup threshold (stored as given).
        performance_mode: "fast", "balanced" or "memory_optimized".

    Returns:
        A copy of the settings after the update.

    Raises:
        ValueError: If a key is not a known setting, a choice-type setting
            gets a value outside its allowed list, or a non-None path
            setting points at a location that does not exist.
    """
    # Lists (not tuples/sets) so the error messages render as before.
    allowed_values = {
        "backend": ["auto", "pandas", "polars", "cudf"],
        "precision": ["auto", "float32", "float64"],
        "performance_mode": ["fast", "balanced", "memory_optimized"],
    }
    path_settings = ("my_expressions_path", "my_schemas_path")

    # Pass 1: validate everything. Nothing is written yet, so a failure on
    # the last argument cannot leave the earlier ones applied.
    for key, value in kwargs.items():
        if key not in _global_settings:
            raise ValueError(f"Unknown setting: {key}")

        choices = allowed_values.get(key)
        if choices is not None and value not in choices:
            raise ValueError(f"Invalid {key}: {value}. Must be one of {choices}")

        if key in path_settings and value is not None:
            if not os.path.exists(value):
                raise ValueError(f"Path does not exist: {value}")

    # Pass 2: every argument is valid; apply them in one step.
    _global_settings.update(kwargs)

    return _global_settings.copy()
68
-
69
-
70
def get_global_settings() -> Dict[str, Any]:
    """Return a snapshot of the current global settings.

    Returns:
        A shallow copy of the settings store; mutating it does not change
        the live settings.
    """
    snapshot = dict(_global_settings)
    return snapshot
78
-
79
-
80
def get_setting(key: str, default: Any = None) -> Any:
    """Look up a single global setting.

    Args:
        key: Name of the setting to read.
        default: Value to return when ``key`` is not in the settings store.

    Returns:
        The stored value for ``key``, or ``default`` if there is none.
    """
    try:
        return _global_settings[key]
    except KeyError:
        return default
92
-
93
-
94
def reset_settings():
    """Restore every global setting to its default value.

    The module-level store is rebound to a brand-new dict instead of being
    cleared in place.
    """
    global _global_settings
    _global_settings = dict(
        backend="auto",
        precision="auto",
        my_expressions_path=None,
        my_schemas_path=None,
        cache_enabled=True,
        memory_threshold_mb=100,
        performance_mode="balanced",
    )
106
-
107
-
108
def set_my_expressions_path(path: str):
    """Store the location of the user's expressions.

    Args:
        path: Path to the user expressions directory; must already exist.

    Raises:
        ValueError: If ``path`` does not exist on disk.
    """
    if os.path.exists(path):
        _global_settings["my_expressions_path"] = path
        return

    raise ValueError(f"Path does not exist: {path}")
119
-
120
-
121
def set_my_schemas_path(path: str):
    """Store the location of the user's schemas.

    Args:
        path: Path to the user schemas directory; must already exist.

    Raises:
        ValueError: If ``path`` does not exist on disk.
    """
    if os.path.exists(path):
        _global_settings["my_schemas_path"] = path
        return

    raise ValueError(f"Path does not exist: {path}")
132
-
133
-
134
def get_my_expressions_path() -> Optional[str]:
    """Return the configured user expressions path.

    Returns:
        The stored value, or ``None`` when the key is absent from the store.
    """
    configured = _global_settings.get("my_expressions_path")
    return configured
137
-
138
-
139
def get_my_schemas_path() -> Optional[str]:
    """Return the configured user schemas path.

    Returns:
        The stored value, or ``None`` when the key is absent from the store.
    """
    configured = _global_settings.get("my_schemas_path")
    return configured
142
-
143
-
144
- # Convenience functions for common settings
145
def set_backend(backend: str) -> None:
    """Set the preferred backend.

    Shorthand for ``set_global_settings(backend=...)``; validation and any
    resulting error come from that function.

    Args:
        backend: Backend name to store.
    """
    set_global_settings(**{"backend": backend})
148
-
149
-
150
def set_precision(precision: str) -> None:
    """Set the numeric precision.

    Shorthand for ``set_global_settings(precision=...)``; validation and any
    resulting error come from that function.

    Args:
        precision: Precision name to store.
    """
    set_global_settings(**{"precision": precision})
153
-
154
-
155
def enable_cache() -> None:
    """Turn caching on by storing ``cache_enabled=True``."""
    set_global_settings(**{"cache_enabled": True})
158
-
159
-
160
def disable_cache() -> None:
    """Turn caching off by storing ``cache_enabled=False``."""
    set_global_settings(**{"cache_enabled": False})
163
-
164
-
165
def set_performance_mode(mode: str) -> None:
    """Set the performance mode.

    Shorthand for ``set_global_settings(performance_mode=...)``; validation
    and any resulting error come from that function.

    Args:
        mode: Performance mode name to store.
    """
    set_global_settings(**{"performance_mode": mode})