additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/core/validator.py
DELETED
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Expression Validator
|
|
3
|
-
|
|
4
|
-
Validates expression syntax and structure.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from typing import Dict, Any, List, Tuple
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
def validate_expression(expression: str) -> Tuple[bool, List[str]]:
    """
    Validate an expression.

    Args:
        expression: Expression string to validate

    Returns:
        Tuple of (is_valid, error_messages). ``error_messages`` is empty
        when the expression is valid.
    """
    # Falsy input (None, "") is reported as empty, as before.
    if not expression:
        return (False, ["Expression cannot be empty"])

    # A validator should report bad input instead of crashing on it:
    # a truthy non-string (number, list, ...) has no ``.strip()``.
    if not isinstance(expression, str):
        return (False, ["Expression must be a string"])

    # Whitespace-only strings count as empty too.
    if not expression.strip():
        return (False, ["Expression cannot be empty"])

    # Basic validation - can be expanded later
    return (True, [])
|
additory/dynamic_api.py
DELETED
|
@@ -1,304 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Dynamic API for Additory
|
|
3
|
-
|
|
4
|
-
This module provides the main API interface with dynamic attribute access.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from types import SimpleNamespace
|
|
8
|
-
from typing import Union, Optional, List, Any
|
|
9
|
-
import pandas as pd
|
|
10
|
-
import polars as pl
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
class AdditoryAPI(SimpleNamespace):
    """
    Main API class for Additory functionality.

    Provides access to:
    - add.synthetic() - Synthetic data generation
    - add.to() - Lookup/join operations
    - add.scan() - Data profiling and analysis
    - add.my - User expressions
    - add.play() - Hidden games (for the curious 😉)
    - Expression evaluation
    """

    def __init__(self):
        super().__init__()
        # Initialize expression proxies (imported lazily, at construction time)
        from additory.expressions.proxy import ExpressionProxy
        self.my = ExpressionProxy(namespace="user")
        self._builtin_proxy = ExpressionProxy(namespace="builtin")

        # Explicitly set the synthetic method to prevent namespace conflicts
        self.synthetic = self._synthetic_method

    def __getattr__(self, name):
        """
        Dynamic attribute access for expressions.

        Only called when normal attribute lookup fails. Checks built-in
        expressions first, then user expressions, so built-in expressions
        take precedence.

        Raises:
            AttributeError: If ``name`` is not a known expression.
        """
        # Dunder lookups (protocol probes such as those made by copy/pickle)
        # are never expression names; answer them without touching the proxies.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

        # Read the proxies straight from the instance dict. Going through
        # ``self._builtin_proxy`` / ``self.my`` would re-enter __getattr__ and
        # recurse without bound whenever the proxies are not set yet
        # (e.g. an instance created without running __init__).
        state = self.__dict__
        for proxy in (state.get("_builtin_proxy"), state.get("my")):
            if proxy is not None and self._expression_exists(proxy, name):
                return getattr(proxy, name)

        # If not found, raise AttributeError
        raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")

    def _expression_exists(self, proxy, name):
        """Check if an expression exists in a proxy's namespace"""
        try:
            # List expressions in the proxy's namespace
            expr_list = proxy.list_expressions()
            return name in expr_list.get("expressions", {})
        except Exception:
            # Best effort: any failure while listing is treated as "not found"
            return False

    def _synthetic_method(self, df, n_rows=5, strategy="auto", seed=None, output_format="pandas", **kwargs):
        """
        Generate synthetic data by extending a dataframe or creating from scratch.

        Three modes:
        1. Extend mode: Pass a DataFrame to add synthetic rows
        2. Create mode: Pass "@new" to create data from scratch
        3. Sample mode: Pass "@sample" to load sample data

        Args:
            df: DataFrame to extend, "@new" to create, or "@sample" for sample data
            n_rows: Number of rows (int for create/sample, int/float/str for extend)
            strategy: Strategy specification (dict for create, str/dict for extend)
            seed: Random seed for reproducibility
            output_format: Output format ("pandas", "polars", "cudf")
            **kwargs: Additional parameters

        Returns:
            Extended or generated DataFrame

        Examples:
            # Extend existing data
            result = add.synthetic(df, n_rows=100, strategy='auto')

            # Create from scratch
            result = add.synthetic("@new", n_rows=100, strategy={'id': 'increment', 'age': 'range:18-65'})

            # Load sample data
            result = add.synthetic("@sample", n_rows=50)
        """
        # Store reference to restore after import (in the correct namespace)
        import additory
        original_synthetic = getattr(additory, 'synthetic', None)

        try:
            # Import and call the implementation
            from additory.synthetic.synthesizer import synthetic as synthetic_impl
            return synthetic_impl(df, n_rows=n_rows, strategy=strategy, seed=seed,
                                  output_format=output_format, **kwargs)
        finally:
            # The import above will have overridden additory.synthetic with the
            # module, so point it back at the callable. Doing this in
            # ``finally`` covers both the success path and every failure path
            # with a single copy of the restore logic.
            if original_synthetic is not None:
                additory.synthetic = original_synthetic
            else:
                # If there was no original synthetic, set it to this method
                additory.synthetic = self._synthetic_method

    def to(self, target_df, from_df=None, bring=None, against=None, **kwargs):
        """
        Add columns from reference dataframe to target dataframe.

        Args:
            target_df: Target dataframe to add columns to
            from_df: Reference dataframe to get columns from
            bring: Column(s) to bring from reference dataframe (str or list)
            against: Column(s) to match on (str or list)
            **kwargs: Additional parameters

        Returns:
            Target dataframe with new columns added

        Example:
            result = add.to(orders_df, from_df=products_df, bring='price', against='product_id')
            result = add.to(orders_df, from_df=products_df, bring=['price', 'name'], against='product_id')
        """
        from additory.utilities.lookup import to
        return to(target_df, from_df, bring=bring, against=against, **kwargs)

    def onehotencoding(self, df, columns=None, **kwargs):
        """
        One-hot encode categorical columns.

        Args:
            df: Input dataframe
            columns: Column to encode (single column name as string)
            **kwargs: Additional parameters

        Returns:
            DataFrame with one-hot encoded columns
        """
        from additory.utilities.encoding import onehotencoding
        # The implementation takes the value under the keyword ``column``.
        return onehotencoding(df, column=columns, **kwargs)

    def harmonize_units(self, df, value_column, unit_column, target_unit=None, position="end", **kwargs):
        """
        Harmonize units in a dataframe.

        Args:
            df: Input dataframe
            value_column: Column containing numeric values
            unit_column: Column containing unit strings
            target_unit: Target unit to convert to (auto-detected if None)
            position: Where to place new columns ("end", "start", etc.)
            **kwargs: Additional parameters

        Returns:
            DataFrame with harmonized units

        Example:
            result = add.harmonize_units(df, value_column='weight', unit_column='unit')
            result = add.harmonize_units(df, value_column='temp', unit_column='unit', target_unit='F')
        """
        from additory.utilities.units import harmonize_units
        return harmonize_units(df, value_column, unit_column, target_unit, position, **kwargs)

    def scan(
        self,
        # Quoted so the dataframe types are not evaluated at definition time.
        df: "Union[pl.DataFrame, pd.DataFrame, Any]",
        preset: Optional[str] = None,
        detect_distributions: bool = True,
        detect_correlations: bool = True,
        detect_cardinality: bool = True,
        top_n_distributions: int = 3,
        correlation_methods: Optional[List[str]] = None,
        correlation_threshold: float = 0.3,
        cardinality_top_n: int = 10,
        verbose: bool = True
    ):
        """
        Scan a DataFrame to detect distributions, correlations, and cardinality.

        Accepts pandas, polars, or cuDF DataFrames. Automatically converts to Polars
        for processing. Returns ScanResult with analysis results.

        This function provides comprehensive data profiling including:
        - Distribution detection for numeric columns
        - Correlation analysis between columns
        - Cardinality analysis (unique values)
        - Data quality metrics

        Args:
            df: DataFrame to analyze (pandas, polars, or cuDF)
            preset: Optional preset ('quick', 'distributions', 'correlations', 'full', 'minimal')
            detect_distributions: Whether to detect distributions (default: True)
            detect_correlations: Whether to calculate correlations (default: True)
            detect_cardinality: Whether to analyze cardinality (default: True)
            top_n_distributions: Number of top distributions to return per column (default: 3)
            correlation_methods: Correlation methods to use (default: ['pearson', 'spearman'])
            correlation_threshold: Minimum correlation to report (default: 0.3)
            cardinality_top_n: Number of top values to return per column (default: 10)
            verbose: Whether to print progress messages (default: True)

        Returns:
            ScanResult object containing all analysis results

        Presets:
            - 'quick': Quality + cardinality only (fast)
            - 'distributions': Distribution detection only
            - 'correlations': Correlation analysis only
            - 'full': All analyses enabled
            - 'minimal': Quality metrics only (fastest)

        Example:
            >>> import pandas as pd
            >>> import additory as add
            >>>
            >>> # Works with pandas
            >>> df = pd.DataFrame({
            ...     'age': [25, 30, 35, 40, 45],
            ...     'income': [50000, 60000, 70000, 80000, 90000],
            ...     'category': ['A', 'B', 'A', 'B', 'A']
            ... })
            >>>
            >>> result = add.scan(df)
            >>> print(result.summary())
            >>>
            >>> # Use presets
            >>> result = add.scan(df, preset='quick')
            >>> result = add.scan(df, preset='distributions', top_n_distributions=5)
        """
        from additory.analysis.scan import scan as scan_impl

        # None stands in for the default list so no mutable default is shared
        if correlation_methods is None:
            correlation_methods = ['pearson', 'spearman']

        return scan_impl(
            df,
            preset=preset,
            detect_distributions_flag=detect_distributions,
            detect_correlations_flag=detect_correlations,
            detect_cardinality_flag=detect_cardinality,
            top_n_distributions=top_n_distributions,
            correlation_methods=correlation_methods,
            correlation_threshold=correlation_threshold,
            cardinality_top_n=cardinality_top_n,
            verbose=verbose
        )

    def games(self):
        """
        List available games! 🎮

        Returns a list of games you can play with add.play().

        Returns:
            List of available game names

        Example:
            >>> import additory
            >>> additory.add.games()
            ['tictactoe', 'sudoku']
        """
        return ['tictactoe', 'sudoku']

    def play(self, game: str = "tictactoe"):
        """
        Play a game! 🎮

        Hidden feature for the curious. Reinforces row-column thinking.

        Available games:
        - 'tictactoe' or 'ttt': Play Tic-Tac-Toe
        - 'sudoku': Play Sudoku

        Args:
            game: Name of the game to play (default: 'tictactoe')

        Example:
            >>> import additory
            >>> additory.add.play('tictactoe')
            >>> additory.add.play('sudoku')
        """
        from additory.utilities.games import play as play_impl
        return play_impl(game)
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
# Create the singleton API instance.
# Instantiation runs AdditoryAPI.__init__ at import time of this module.
add = AdditoryAPI()

# Export the instance: `add` is the only public name of this module.
__all__ = ['add']
|