additory-0.1.0a4-py3-none-any.whl → additory-0.1.1a1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121)
  1. additory/__init__.py +58 -14
  2. additory/common/__init__.py +31 -147
  3. additory/common/column_selector.py +255 -0
  4. additory/common/distributions.py +286 -613
  5. additory/common/extractors.py +313 -0
  6. additory/common/knn_imputation.py +332 -0
  7. additory/common/result.py +380 -0
  8. additory/common/strategy_parser.py +243 -0
  9. additory/common/unit_conversions.py +338 -0
  10. additory/common/validation.py +283 -103
  11. additory/core/__init__.py +34 -22
  12. additory/core/backend.py +258 -0
  13. additory/core/config.py +177 -305
  14. additory/core/logging.py +230 -24
  15. additory/core/memory_manager.py +157 -495
  16. additory/expressions/__init__.py +2 -23
  17. additory/expressions/compiler.py +457 -0
  18. additory/expressions/engine.py +264 -487
  19. additory/expressions/integrity.py +179 -0
  20. additory/expressions/loader.py +263 -0
  21. additory/expressions/parser.py +363 -167
  22. additory/expressions/resolver.py +274 -0
  23. additory/functions/__init__.py +1 -0
  24. additory/functions/analyze/__init__.py +144 -0
  25. additory/functions/analyze/cardinality.py +58 -0
  26. additory/functions/analyze/correlations.py +66 -0
  27. additory/functions/analyze/distributions.py +53 -0
  28. additory/functions/analyze/duplicates.py +49 -0
  29. additory/functions/analyze/features.py +61 -0
  30. additory/functions/analyze/imputation.py +66 -0
  31. additory/functions/analyze/outliers.py +65 -0
  32. additory/functions/analyze/patterns.py +65 -0
  33. additory/functions/analyze/presets.py +72 -0
  34. additory/functions/analyze/quality.py +59 -0
  35. additory/functions/analyze/timeseries.py +53 -0
  36. additory/functions/analyze/types.py +45 -0
  37. additory/functions/expressions/__init__.py +161 -0
  38. additory/functions/snapshot/__init__.py +82 -0
  39. additory/functions/snapshot/filter.py +119 -0
  40. additory/functions/synthetic/__init__.py +113 -0
  41. additory/functions/synthetic/mode_detector.py +47 -0
  42. additory/functions/synthetic/strategies/__init__.py +1 -0
  43. additory/functions/synthetic/strategies/advanced.py +35 -0
  44. additory/functions/synthetic/strategies/augmentative.py +160 -0
  45. additory/functions/synthetic/strategies/generative.py +168 -0
  46. additory/functions/synthetic/strategies/presets.py +116 -0
  47. additory/functions/to/__init__.py +188 -0
  48. additory/functions/to/lookup.py +351 -0
  49. additory/functions/to/merge.py +189 -0
  50. additory/functions/to/sort.py +91 -0
  51. additory/functions/to/summarize.py +170 -0
  52. additory/functions/transform/__init__.py +140 -0
  53. additory/functions/transform/datetime.py +79 -0
  54. additory/functions/transform/extract.py +85 -0
  55. additory/functions/transform/harmonize.py +105 -0
  56. additory/functions/transform/knn.py +62 -0
  57. additory/functions/transform/onehotencoding.py +68 -0
  58. additory/functions/transform/transpose.py +42 -0
  59. additory-0.1.1a1.dist-info/METADATA +83 -0
  60. additory-0.1.1a1.dist-info/RECORD +62 -0
  61. additory/analysis/__init__.py +0 -48
  62. additory/analysis/cardinality.py +0 -126
  63. additory/analysis/correlations.py +0 -124
  64. additory/analysis/distributions.py +0 -376
  65. additory/analysis/quality.py +0 -158
  66. additory/analysis/scan.py +0 -400
  67. additory/common/backend.py +0 -371
  68. additory/common/column_utils.py +0 -191
  69. additory/common/exceptions.py +0 -62
  70. additory/common/lists.py +0 -229
  71. additory/common/patterns.py +0 -240
  72. additory/common/resolver.py +0 -567
  73. additory/common/sample_data.py +0 -182
  74. additory/core/ast_builder.py +0 -165
  75. additory/core/backends/__init__.py +0 -23
  76. additory/core/backends/arrow_bridge.py +0 -483
  77. additory/core/backends/cudf_bridge.py +0 -355
  78. additory/core/column_positioning.py +0 -358
  79. additory/core/compiler_polars.py +0 -166
  80. additory/core/enhanced_cache_manager.py +0 -1119
  81. additory/core/enhanced_matchers.py +0 -473
  82. additory/core/enhanced_version_manager.py +0 -325
  83. additory/core/executor.py +0 -59
  84. additory/core/integrity_manager.py +0 -477
  85. additory/core/loader.py +0 -190
  86. additory/core/namespace_manager.py +0 -657
  87. additory/core/parser.py +0 -176
  88. additory/core/polars_expression_engine.py +0 -601
  89. additory/core/registry.py +0 -177
  90. additory/core/sample_data_manager.py +0 -492
  91. additory/core/user_namespace.py +0 -751
  92. additory/core/validator.py +0 -27
  93. additory/dynamic_api.py +0 -352
  94. additory/expressions/proxy.py +0 -549
  95. additory/expressions/registry.py +0 -313
  96. additory/expressions/samples.py +0 -492
  97. additory/synthetic/__init__.py +0 -13
  98. additory/synthetic/column_name_resolver.py +0 -149
  99. additory/synthetic/deduce.py +0 -259
  100. additory/synthetic/distributions.py +0 -22
  101. additory/synthetic/forecast.py +0 -1132
  102. additory/synthetic/linked_list_parser.py +0 -415
  103. additory/synthetic/namespace_lookup.py +0 -129
  104. additory/synthetic/smote.py +0 -320
  105. additory/synthetic/strategies.py +0 -926
  106. additory/synthetic/synthesizer.py +0 -713
  107. additory/utilities/__init__.py +0 -53
  108. additory/utilities/encoding.py +0 -600
  109. additory/utilities/games.py +0 -300
  110. additory/utilities/keys.py +0 -8
  111. additory/utilities/lookup.py +0 -103
  112. additory/utilities/matchers.py +0 -216
  113. additory/utilities/resolvers.py +0 -286
  114. additory/utilities/settings.py +0 -167
  115. additory/utilities/units.py +0 -749
  116. additory/utilities/validators.py +0 -153
  117. additory-0.1.0a4.dist-info/METADATA +0 -311
  118. additory-0.1.0a4.dist-info/RECORD +0 -72
  119. additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
  120. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
  121. {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/core/validator.py DELETED
@@ -1,27 +0,0 @@
1
- """
2
- Expression Validator
3
-
4
- Validates expression syntax and structure.
5
- """
6
-
7
- from typing import Dict, Any, List, Tuple
8
-
9
-
10
- def validate_expression(expression: str) -> Tuple[bool, List[str]]:
11
- """
12
- Validate an expression.
13
-
14
- Args:
15
- expression: Expression string to validate
16
-
17
- Returns:
18
- Tuple of (is_valid, error_messages)
19
- """
20
- errors = []
21
-
22
- if not expression or not expression.strip():
23
- errors.append("Expression cannot be empty")
24
- return (False, errors)
25
-
26
- # Basic validation - can be expanded later
27
- return (True, [])
additory/dynamic_api.py DELETED
@@ -1,352 +0,0 @@
1
- """
2
- Dynamic API for Additory
3
-
4
- This module provides the main API interface with dynamic attribute access.
5
- """
6
-
7
- from types import SimpleNamespace
8
- from typing import Union, Optional, List, Any
9
- import pandas as pd
10
- import polars as pl
11
-
12
-
13
- class AdditoryAPI(SimpleNamespace):
14
- """
15
- Main API class for Additory functionality.
16
-
17
- Provides access to:
18
- - add.synthetic() - Synthetic data generation
19
- - add.to() - Lookup/join operations
20
- - add.scan() - Data profiling and analysis
21
- - add.my - User expressions
22
- - add.play() - Hidden games (for the curious 😉)
23
- - Expression evaluation
24
- """
25
-
26
- def __init__(self):
27
- super().__init__()
28
- # Initialize expression proxies
29
- from additory.expressions.proxy import ExpressionProxy
30
- self.my = ExpressionProxy(namespace="user")
31
- self._builtin_proxy = ExpressionProxy(namespace="builtin")
32
-
33
- # Explicitly set methods to prevent namespace conflicts
34
- self.synthetic = self._synthetic_method
35
- self.deduce = self._deduce_method
36
- self.to = self._to_method
37
- self.onehotencoding = self._onehotencoding_method
38
- self.harmonize_units = self._harmonize_units_method
39
- self.scan = self._scan_method
40
- self.games = self._games_method
41
- self.play = self._play_method
42
-
43
- def __getattr__(self, name):
44
- """
45
- Dynamic attribute access for expressions.
46
-
47
- Checks built-in expressions first, then user expressions.
48
- This ensures built-in expressions take precedence.
49
- """
50
- # Check if it's a built-in expression first
51
- if self._expression_exists(self._builtin_proxy, name):
52
- return getattr(self._builtin_proxy, name)
53
-
54
- # Check if it's a user expression
55
- if self._expression_exists(self.my, name):
56
- return getattr(self.my, name)
57
-
58
- # If not found, raise AttributeError
59
- raise AttributeError(f"'{type(self).__name__}' object has no attribute '{name}'")
60
-
61
- def _expression_exists(self, proxy, name):
62
- """Check if an expression exists in a proxy's namespace"""
63
- try:
64
- # List expressions in the proxy's namespace
65
- expr_list = proxy.list_expressions()
66
- return name in expr_list.get("expressions", {})
67
- except Exception:
68
- return False
69
-
70
- def _synthetic_method(self, df, n_rows=5, strategy="auto", seed=None, output_format="pandas", **kwargs):
71
- """
72
- Generate synthetic data by extending a dataframe or creating from scratch.
73
-
74
- Three modes:
75
- 1. Extend mode: Pass a DataFrame to add synthetic rows
76
- 2. Create mode: Pass "@new" to create data from scratch
77
- 3. Sample mode: Pass "@sample" to load sample data
78
-
79
- Args:
80
- df: DataFrame to extend, "@new" to create, or "@sample" for sample data
81
- n_rows: Number of rows (int for create/sample, int/float/str for extend)
82
- strategy: Strategy specification (dict for create, str/dict for extend)
83
- seed: Random seed for reproducibility
84
- output_format: Output format ("pandas", "polars", "cudf")
85
- **kwargs: Additional parameters
86
-
87
- Returns:
88
- Extended or generated DataFrame
89
-
90
- Examples:
91
- # Extend existing data
92
- result = add.synthetic(df, n_rows=100, strategy='auto')
93
-
94
- # Create from scratch
95
- result = add.synthetic("@new", n_rows=100, strategy={'id': 'increment', 'age': 'range:18-65'})
96
-
97
- # Load sample data
98
- result = add.synthetic("@sample", n_rows=50)
99
- """
100
- # Store reference to restore after import (in the correct namespace)
101
- import additory
102
- original_synthetic = getattr(additory, 'synthetic', None)
103
-
104
- try:
105
- # Import and call the implementation
106
- from additory.synthetic.synthesizer import synthetic as synthetic_impl
107
- result = synthetic_impl(df, n_rows=n_rows, strategy=strategy, seed=seed,
108
- output_format=output_format, **kwargs)
109
-
110
- # Restore the method reference in the additory module namespace
111
- # The import above will have overridden additory.synthetic with the module
112
- # We need to restore it to point to this method
113
- if original_synthetic is not None:
114
- additory.synthetic = original_synthetic
115
- else:
116
- # If there was no original synthetic, set it to this method
117
- additory.synthetic = self._synthetic_method
118
-
119
- return result
120
- except Exception as e:
121
- # Restore the method reference even if there's an error
122
- if original_synthetic is not None:
123
- additory.synthetic = original_synthetic
124
- else:
125
- additory.synthetic = self._synthetic_method
126
- raise
127
-
128
- def _to_method(self, target_df, from_df=None, bring=None, against=None, **kwargs):
129
- """
130
- Add columns from reference dataframe to target dataframe.
131
-
132
- Args:
133
- target_df: Target dataframe to add columns to
134
- from_df: Reference dataframe to get columns from
135
- bring: Column(s) to bring from reference dataframe (str or list)
136
- against: Column(s) to match on (str or list)
137
- **kwargs: Additional parameters
138
-
139
- Returns:
140
- Target dataframe with new columns added
141
-
142
- Example:
143
- result = add.to(orders_df, from_df=products_df, bring='price', against='product_id')
144
- result = add.to(orders_df, from_df=products_df, bring=['price', 'name'], against='product_id')
145
- """
146
- from additory.utilities.lookup import to
147
- return to(target_df, from_df, bring=bring, against=against, **kwargs)
148
-
149
- def _onehotencoding_method(self, df, columns=None, **kwargs):
150
- """
151
- One-hot encode categorical columns.
152
-
153
- Args:
154
- df: Input dataframe
155
- columns: Column to encode (single column name as string)
156
- **kwargs: Additional parameters
157
-
158
- Returns:
159
- DataFrame with one-hot encoded columns
160
- """
161
- from additory.utilities.encoding import onehotencoding
162
- return onehotencoding(df, column=columns, **kwargs)
163
-
164
- def _harmonize_units_method(self, df, value_column, unit_column, target_unit=None, position="end", **kwargs):
165
- """
166
- Harmonize units in a dataframe.
167
-
168
- Args:
169
- df: Input dataframe
170
- value_column: Column containing numeric values
171
- unit_column: Column containing unit strings
172
- target_unit: Target unit to convert to (auto-detected if None)
173
- position: Where to place new columns ("end", "start", etc.)
174
- **kwargs: Additional parameters
175
-
176
- Returns:
177
- DataFrame with harmonized units
178
-
179
- Example:
180
- result = add.harmonize_units(df, value_column='weight', unit_column='unit')
181
- result = add.harmonize_units(df, value_column='temp', unit_column='unit', target_unit='F')
182
- """
183
- from additory.utilities.units import harmonize_units
184
- return harmonize_units(df, value_column, unit_column, target_unit, position, **kwargs)
185
-
186
- def _scan_method(
187
- self,
188
- df: Union[pl.DataFrame, pd.DataFrame, Any],
189
- preset: Optional[str] = None,
190
- detect_distributions: bool = True,
191
- detect_correlations: bool = True,
192
- detect_cardinality: bool = True,
193
- top_n_distributions: int = 3,
194
- correlation_methods: List[str] = None,
195
- correlation_threshold: float = 0.3,
196
- cardinality_top_n: int = 10,
197
- verbose: bool = True
198
- ):
199
- """
200
- Scan a DataFrame to detect distributions, correlations, and cardinality.
201
-
202
- Accepts pandas, polars, or cuDF DataFrames. Automatically converts to Polars
203
- for processing. Returns ScanResult with analysis results.
204
-
205
- This function provides comprehensive data profiling including:
206
- - Distribution detection for numeric columns
207
- - Correlation analysis between columns
208
- - Cardinality analysis (unique values)
209
- - Data quality metrics
210
-
211
- Args:
212
- df: DataFrame to analyze (pandas, polars, or cuDF)
213
- preset: Optional preset ('quick', 'distributions', 'correlations', 'full', 'minimal')
214
- detect_distributions: Whether to detect distributions (default: True)
215
- detect_correlations: Whether to calculate correlations (default: True)
216
- detect_cardinality: Whether to analyze cardinality (default: True)
217
- top_n_distributions: Number of top distributions to return per column (default: 3)
218
- correlation_methods: Correlation methods to use (default: ['pearson', 'spearman'])
219
- correlation_threshold: Minimum correlation to report (default: 0.3)
220
- cardinality_top_n: Number of top values to return per column (default: 10)
221
- verbose: Whether to print progress messages (default: True)
222
-
223
- Returns:
224
- ScanResult object containing all analysis results
225
-
226
- Presets:
227
- - 'quick': Quality + cardinality only (fast)
228
- - 'distributions': Distribution detection only
229
- - 'correlations': Correlation analysis only
230
- - 'full': All analyses enabled
231
- - 'minimal': Quality metrics only (fastest)
232
-
233
- Example:
234
- >>> import pandas as pd
235
- >>> import additory as add
236
- >>>
237
- >>> # Works with pandas
238
- >>> df = pd.DataFrame({
239
- ... 'age': [25, 30, 35, 40, 45],
240
- ... 'income': [50000, 60000, 70000, 80000, 90000],
241
- ... 'category': ['A', 'B', 'A', 'B', 'A']
242
- ... })
243
- >>>
244
- >>> result = add.scan(df)
245
- >>> print(result.summary())
246
- >>>
247
- >>> # Use presets
248
- >>> result = add.scan(df, preset='quick')
249
- >>> result = add.scan(df, preset='distributions', top_n_distributions=5)
250
- """
251
- from additory.analysis.scan import scan as scan_impl
252
-
253
- if correlation_methods is None:
254
- correlation_methods = ['pearson', 'spearman']
255
-
256
- return scan_impl(
257
- df,
258
- preset=preset,
259
- detect_distributions_flag=detect_distributions,
260
- detect_correlations_flag=detect_correlations,
261
- detect_cardinality_flag=detect_cardinality,
262
- top_n_distributions=top_n_distributions,
263
- correlation_methods=correlation_methods,
264
- correlation_threshold=correlation_threshold,
265
- cardinality_top_n=cardinality_top_n,
266
- verbose=verbose
267
- )
268
-
269
- def _deduce_method(
270
- self,
271
- df: Union[pd.DataFrame, pl.DataFrame, Any],
272
- from_column: Union[str, List[str]],
273
- to_column: str
274
- ) -> Union[pd.DataFrame, pl.DataFrame, Any]:
275
- """
276
- Deduce missing labels based on text similarity to labeled examples.
277
-
278
- Uses cosine similarity on TF-IDF vectors. Pure Python, no LLMs, offline-first.
279
- Requires at least 3 labeled examples to work.
280
-
281
- When multiple source columns are provided, they are concatenated with
282
- spaces before computing similarity.
283
-
284
- Args:
285
- df: DataFrame with some labeled and some unlabeled rows
286
- from_column: Text column(s) to analyze
287
- - str: Single column (e.g., "comment")
288
- - List[str]: Multiple columns (e.g., ["comment", "notes"])
289
- to_column: Label column to fill (e.g., "status")
290
-
291
- Returns:
292
- DataFrame with deduced labels filled in
293
-
294
- Examples:
295
- # Single column
296
- >>> result = add.deduce(df, from_column="comment", to_column="status")
297
-
298
- # Multiple columns (better accuracy)
299
- >>> result = add.deduce(
300
- ... df,
301
- ... from_column=["comment", "notes", "description"],
302
- ... to_column="status"
303
- ... )
304
-
305
- Privacy: Your data never leaves your machine. No external connections.
306
- """
307
- from additory.synthetic.deduce import deduce as deduce_impl
308
- return deduce_impl(df, from_column, to_column)
309
-
310
- def _games_method(self):
311
- """
312
- List available games! 🎮
313
-
314
- Returns a list of games you can play with add.play().
315
-
316
- Returns:
317
- List of available game names
318
-
319
- Example:
320
- >>> import additory
321
- >>> additory.add.games()
322
- ['tictactoe', 'sudoku']
323
- """
324
- return ['tictactoe', 'sudoku']
325
-
326
- def _play_method(self, game: str = "tictactoe"):
327
- """
328
- Play a game! 🎮
329
-
330
- Hidden feature for the curious. Reinforces row-column thinking.
331
-
332
- Available games:
333
- - 'tictactoe' or 'ttt': Play Tic-Tac-Toe
334
- - 'sudoku': Play Sudoku
335
-
336
- Args:
337
- game: Name of the game to play (default: 'tictactoe')
338
-
339
- Example:
340
- >>> import additory
341
- >>> additory.add.play('tictactoe')
342
- >>> additory.add.play('sudoku')
343
- """
344
- from additory.utilities.games import play as play_impl
345
- return play_impl(game)
346
-
347
-
348
- # Create the singleton API instance
349
- add = AdditoryAPI()
350
-
351
- # Export the instance
352
- __all__ = ['add']