additory 0.1.0a2__py3-none-any.whl → 0.1.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/config.py +3 -3
- additory/core/polars_expression_engine.py +66 -16
- additory/core/registry.py +4 -3
- additory/dynamic_api.py +95 -51
- additory/expressions/proxy.py +4 -1
- additory/expressions/registry.py +3 -3
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/synthetic/deduce.py +259 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +87 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
additory/__init__.py
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
from .dynamic_api import add as _api_instance
|
|
4
4
|
|
|
5
|
+
# Version information
|
|
6
|
+
__version__ = "0.1.0a4"
|
|
7
|
+
|
|
5
8
|
# Expose the API instance normally
|
|
6
9
|
add = _api_instance
|
|
7
10
|
|
|
@@ -12,4 +15,5 @@ def __getattr__(name):
|
|
|
12
15
|
|
|
13
16
|
__all__ = [
|
|
14
17
|
"add",
|
|
18
|
+
"__version__",
|
|
15
19
|
]
|
additory/common/__init__.py
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Common Utilities Module
|
|
3
3
|
|
|
4
|
-
Shared functionality used by both
|
|
4
|
+
Shared functionality used by both synthetic and expressions modules:
|
|
5
5
|
- Distribution functions (normal, uniform, skewed, etc.)
|
|
6
6
|
- List file management (.list format)
|
|
7
7
|
- Pattern file management (.properties format)
|
|
8
8
|
- Fallback resolution logic
|
|
9
9
|
|
|
10
10
|
This module eliminates code duplication and provides consistent behavior
|
|
11
|
-
across
|
|
11
|
+
across synthetic and expression data generation.
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
from .distributions import (
|
additory/common/backend.py
CHANGED
|
@@ -180,11 +180,14 @@ def get_arrow_bridge():
|
|
|
180
180
|
- Use for all cross-backend conversions
|
|
181
181
|
- Handles pandas/polars/cuDF via Arrow
|
|
182
182
|
"""
|
|
183
|
-
from additory.core.backends.arrow_bridge import EnhancedArrowBridge
|
|
183
|
+
from additory.core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
|
|
184
184
|
|
|
185
185
|
# Singleton pattern
|
|
186
186
|
if not hasattr(get_arrow_bridge, '_instance'):
|
|
187
|
-
|
|
187
|
+
try:
|
|
188
|
+
get_arrow_bridge._instance = EnhancedArrowBridge()
|
|
189
|
+
except ArrowBridgeError:
|
|
190
|
+
get_arrow_bridge._instance = None
|
|
188
191
|
|
|
189
192
|
return get_arrow_bridge._instance
|
|
190
193
|
|
|
@@ -194,7 +197,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
194
197
|
Convert any dataframe to Polars via Arrow bridge.
|
|
195
198
|
|
|
196
199
|
This is the primary conversion function for the Polars-only architecture.
|
|
197
|
-
All operations (expressions,
|
|
200
|
+
All operations (expressions, synthetic, etc.) use this to convert input
|
|
198
201
|
dataframes to Polars for processing.
|
|
199
202
|
|
|
200
203
|
Args:
|
|
@@ -224,7 +227,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
224
227
|
)
|
|
225
228
|
|
|
226
229
|
# Fast path: already Polars
|
|
227
|
-
if isinstance(df, pl.DataFrame):
|
|
230
|
+
if HAS_POLARS and isinstance(df, pl.DataFrame):
|
|
228
231
|
return df
|
|
229
232
|
|
|
230
233
|
# Validate input
|
|
@@ -240,6 +243,13 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
240
243
|
# Convert via Arrow bridge
|
|
241
244
|
try:
|
|
242
245
|
bridge = get_arrow_bridge()
|
|
246
|
+
if bridge is None:
|
|
247
|
+
# Fallback: direct conversion for pandas
|
|
248
|
+
if backend_type == "pandas":
|
|
249
|
+
if isinstance(df, pd.DataFrame):
|
|
250
|
+
return pl.from_pandas(df)
|
|
251
|
+
raise RuntimeError("Arrow bridge not available and cannot convert non-pandas DataFrame")
|
|
252
|
+
|
|
243
253
|
arrow_table = bridge.to_arrow(df, backend_type)
|
|
244
254
|
pl_df = bridge.from_arrow(arrow_table, "polars")
|
|
245
255
|
return pl_df
|
|
@@ -309,6 +319,12 @@ def from_polars(pl_df: 'pl.DataFrame', target_backend: BackendType) -> Any:
|
|
|
309
319
|
# Convert via Arrow bridge
|
|
310
320
|
try:
|
|
311
321
|
bridge = get_arrow_bridge()
|
|
322
|
+
if bridge is None:
|
|
323
|
+
# Fallback: direct conversion for pandas
|
|
324
|
+
if target_backend == "pandas":
|
|
325
|
+
return pl_df.to_pandas()
|
|
326
|
+
raise RuntimeError("Arrow bridge not available and cannot convert to non-pandas DataFrame")
|
|
327
|
+
|
|
312
328
|
arrow_table = bridge.to_arrow(pl_df, "polars")
|
|
313
329
|
result_df = bridge.from_arrow(arrow_table, target_backend)
|
|
314
330
|
return result_df
|
additory/common/distributions.py
CHANGED
additory/common/sample_data.py
CHANGED
|
@@ -8,8 +8,8 @@ loaded on-demand using the existing .add file parser.
|
|
|
8
8
|
Usage:
|
|
9
9
|
from additory.common.sample_data import get_sample_dataset
|
|
10
10
|
|
|
11
|
-
# For
|
|
12
|
-
df = get_sample_dataset("
|
|
11
|
+
# For synthetic
|
|
12
|
+
df = get_sample_dataset("synthetic", "sample")
|
|
13
13
|
|
|
14
14
|
# For expressions (future)
|
|
15
15
|
df = get_sample_dataset("expressions", "sample")
|
|
@@ -25,7 +25,7 @@ from additory.common.exceptions import ValidationError
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def get_sample_dataset(
|
|
28
|
-
module: str = "
|
|
28
|
+
module: str = "synthetic",
|
|
29
29
|
block: str = "sample",
|
|
30
30
|
dataset_type: str = "clean"
|
|
31
31
|
) -> pl.DataFrame:
|
|
@@ -33,12 +33,12 @@ def get_sample_dataset(
|
|
|
33
33
|
Load a sample dataset from .add files.
|
|
34
34
|
|
|
35
35
|
This function provides centralized access to sample datasets across
|
|
36
|
-
all additory modules (
|
|
36
|
+
all additory modules (synthetic, expressions, utilities). Sample datasets
|
|
37
37
|
are stored as .add files in the reference/ directory structure.
|
|
38
38
|
|
|
39
39
|
Args:
|
|
40
|
-
module: Module name ("
|
|
41
|
-
block: Block name within the .add file ("sample" for
|
|
40
|
+
module: Module name ("synthetic", "expressions", "utilities")
|
|
41
|
+
block: Block name within the .add file ("sample" for synthetic)
|
|
42
42
|
dataset_type: Type of sample data ("clean" or "unclean")
|
|
43
43
|
|
|
44
44
|
Returns:
|
|
@@ -48,8 +48,8 @@ def get_sample_dataset(
|
|
|
48
48
|
ValidationError: If module, block, or dataset_type not found
|
|
49
49
|
|
|
50
50
|
Examples:
|
|
51
|
-
>>> # Load
|
|
52
|
-
>>> df = get_sample_dataset("
|
|
51
|
+
>>> # Load synthetic sample dataset
|
|
52
|
+
>>> df = get_sample_dataset("synthetic", "sample")
|
|
53
53
|
>>> print(df.shape)
|
|
54
54
|
(50, 10)
|
|
55
55
|
|
|
@@ -57,7 +57,7 @@ def get_sample_dataset(
|
|
|
57
57
|
>>> df = get_sample_dataset("expressions", "sample", "clean")
|
|
58
58
|
>>> df_unclean = get_sample_dataset("expressions", "sample", "unclean")
|
|
59
59
|
|
|
60
|
-
Sample Dataset Structure (
|
|
60
|
+
Sample Dataset Structure (synthetic):
|
|
61
61
|
- id: Sequential numeric IDs (1-50)
|
|
62
62
|
- emp_id: Employee IDs with pattern (EMP_001 - EMP_050)
|
|
63
63
|
- order_id: Order IDs with different padding (ORD_0001 - ORD_0050)
|
|
@@ -72,8 +72,8 @@ def get_sample_dataset(
|
|
|
72
72
|
# Construct path to .add file
|
|
73
73
|
base_path = Path(__file__).parent.parent.parent / "reference"
|
|
74
74
|
|
|
75
|
-
if module == "
|
|
76
|
-
add_file_path = base_path / "
|
|
75
|
+
if module == "synthetic":
|
|
76
|
+
add_file_path = base_path / "synthetic_definitions" / f"{block}_0.1.add"
|
|
77
77
|
elif module == "expressions":
|
|
78
78
|
add_file_path = base_path / "expressions_definitions" / f"{block}_0.1.add"
|
|
79
79
|
elif module == "utilities":
|
|
@@ -81,7 +81,7 @@ def get_sample_dataset(
|
|
|
81
81
|
else:
|
|
82
82
|
raise ValidationError(
|
|
83
83
|
f"Unknown module '{module}'. "
|
|
84
|
-
f"Valid modules:
|
|
84
|
+
f"Valid modules: synthetic, expressions, utilities"
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
# Check if file exists
|
|
@@ -141,7 +141,7 @@ def list_available_samples() -> dict:
|
|
|
141
141
|
>>> samples = list_available_samples()
|
|
142
142
|
>>> print(samples)
|
|
143
143
|
{
|
|
144
|
-
'
|
|
144
|
+
'synthetic': ['sample'],
|
|
145
145
|
'expressions': ['sample'],
|
|
146
146
|
'utilities': []
|
|
147
147
|
}
|
|
@@ -149,15 +149,15 @@ def list_available_samples() -> dict:
|
|
|
149
149
|
base_path = Path(__file__).parent.parent.parent / "reference"
|
|
150
150
|
available = {}
|
|
151
151
|
|
|
152
|
-
# Check
|
|
153
|
-
|
|
154
|
-
if
|
|
155
|
-
available['
|
|
152
|
+
# Check synthetic
|
|
153
|
+
synthetic_path = base_path / "synthetic_definitions"
|
|
154
|
+
if synthetic_path.exists():
|
|
155
|
+
available['synthetic'] = [
|
|
156
156
|
f.stem.rsplit('_', 1)[0] # Remove version suffix
|
|
157
|
-
for f in
|
|
157
|
+
for f in synthetic_path.glob("*.add")
|
|
158
158
|
]
|
|
159
159
|
else:
|
|
160
|
-
available['
|
|
160
|
+
available['synthetic'] = []
|
|
161
161
|
|
|
162
162
|
# Check expressions
|
|
163
163
|
expressions_path = base_path / "expressions_definitions"
|
|
additory/core/backends/arrow_bridge.py
CHANGED
|
@@ -16,6 +16,13 @@ try:
|
|
|
16
16
|
except ImportError as e:
|
|
17
17
|
ARROW_AVAILABLE = False
|
|
18
18
|
IMPORT_ERROR = str(e)
|
|
19
|
+
# Create dummy classes for type annotations
|
|
20
|
+
class pa:
|
|
21
|
+
Table = Any
|
|
22
|
+
class pl:
|
|
23
|
+
DataFrame = Any
|
|
24
|
+
class pd:
|
|
25
|
+
DataFrame = Any
|
|
19
26
|
|
|
20
27
|
from ..logging import log_info, log_warning
|
|
21
28
|
from .cudf_bridge import get_cudf_bridge
|
additory/core/config.py
CHANGED
|
@@ -329,14 +329,14 @@ def set_custom_formula_path(path):
|
|
|
329
329
|
|
|
330
330
|
# backend preference setting
|
|
331
331
|
|
|
332
|
-
_backend_preference: str
|
|
332
|
+
_backend_preference: Optional[str] = None # "cpu", "gpu", or None
|
|
333
333
|
|
|
334
|
-
def set_backend_preference(mode: str
|
|
334
|
+
def set_backend_preference(mode: Optional[str]):
|
|
335
335
|
global _backend_preference
|
|
336
336
|
if mode not in (None, "cpu", "gpu"):
|
|
337
337
|
raise ValueError("backend must be 'cpu', 'gpu', or None")
|
|
338
338
|
_backend_preference = mode
|
|
339
339
|
|
|
340
|
-
def get_backend_preference() -> str
|
|
340
|
+
def get_backend_preference() -> Optional[str]:
|
|
341
341
|
return _backend_preference
|
|
342
342
|
|
|
additory/core/polars_expression_engine.py
CHANGED
|
@@ -32,7 +32,10 @@ class PolarsExpressionEngine:
|
|
|
32
32
|
"""Exclusive Polars-based expression processing engine"""
|
|
33
33
|
|
|
34
34
|
def __init__(self):
|
|
35
|
-
|
|
35
|
+
try:
|
|
36
|
+
self.arrow_bridge = EnhancedArrowBridge()
|
|
37
|
+
except ArrowBridgeError:
|
|
38
|
+
self.arrow_bridge = None
|
|
36
39
|
self.execution_stats = {
|
|
37
40
|
"total_executions": 0,
|
|
38
41
|
"total_time_ms": 0.0,
|
|
@@ -68,14 +71,28 @@ class PolarsExpressionEngine:
|
|
|
68
71
|
try:
|
|
69
72
|
# Auto-detect backend if not specified
|
|
70
73
|
if backend_type is None:
|
|
71
|
-
|
|
74
|
+
if self.arrow_bridge:
|
|
75
|
+
backend_type = self.arrow_bridge.detect_backend(df)
|
|
76
|
+
else:
|
|
77
|
+
backend_type = "pandas" # fallback
|
|
72
78
|
|
|
73
79
|
# Get memory usage before processing
|
|
74
|
-
|
|
80
|
+
if self.arrow_bridge:
|
|
81
|
+
memory_before = self.arrow_bridge._get_memory_usage_mb()
|
|
82
|
+
else:
|
|
83
|
+
memory_before = 0
|
|
75
84
|
|
|
76
85
|
# 1. Convert input to Arrow
|
|
77
86
|
log_info(f"[polars_engine] Converting {backend_type} to Arrow")
|
|
78
|
-
|
|
87
|
+
if self.arrow_bridge:
|
|
88
|
+
arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
|
|
89
|
+
else:
|
|
90
|
+
# Fallback: assume pandas and convert directly
|
|
91
|
+
import pandas as pd
|
|
92
|
+
if isinstance(df, pd.DataFrame):
|
|
93
|
+
arrow_table = pl.from_pandas(df).to_arrow()
|
|
94
|
+
else:
|
|
95
|
+
raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
|
|
79
96
|
|
|
80
97
|
# 2. Convert Arrow to Polars
|
|
81
98
|
log_info("[polars_engine] Converting Arrow to Polars")
|
|
@@ -93,11 +110,18 @@ class PolarsExpressionEngine:
|
|
|
93
110
|
|
|
94
111
|
# 5. Convert to original backend format
|
|
95
112
|
log_info(f"[polars_engine] Converting Arrow to {backend_type}")
|
|
96
|
-
|
|
113
|
+
if self.arrow_bridge:
|
|
114
|
+
final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
|
|
115
|
+
else:
|
|
116
|
+
# Fallback: convert back to pandas
|
|
117
|
+
final_result = pl.from_arrow(result_arrow).to_pandas()
|
|
97
118
|
|
|
98
119
|
# Calculate execution statistics
|
|
99
120
|
execution_time = (datetime.now() - start_time).total_seconds() * 1000
|
|
100
|
-
|
|
121
|
+
if self.arrow_bridge:
|
|
122
|
+
memory_after = self.arrow_bridge._get_memory_usage_mb()
|
|
123
|
+
else:
|
|
124
|
+
memory_after = 0
|
|
101
125
|
memory_used = max(0, memory_after - memory_before)
|
|
102
126
|
|
|
103
127
|
# Update global statistics
|
|
@@ -122,7 +146,8 @@ class PolarsExpressionEngine:
|
|
|
122
146
|
|
|
123
147
|
finally:
|
|
124
148
|
# 6. Always cleanup Arrow memory
|
|
125
|
-
self.arrow_bridge
|
|
149
|
+
if self.arrow_bridge:
|
|
150
|
+
self.arrow_bridge.cleanup_arrow_memory()
|
|
126
151
|
|
|
127
152
|
def _execute_polars_expression(self, polars_df: pl.DataFrame,
|
|
128
153
|
expression: str, output_column: str) -> pl.DataFrame:
|
|
@@ -381,14 +406,28 @@ class PolarsExpressionEngine:
|
|
|
381
406
|
try:
|
|
382
407
|
# Auto-detect backend if not specified
|
|
383
408
|
if backend_type is None:
|
|
384
|
-
|
|
409
|
+
if self.arrow_bridge:
|
|
410
|
+
backend_type = self.arrow_bridge.detect_backend(df)
|
|
411
|
+
else:
|
|
412
|
+
backend_type = "pandas"
|
|
385
413
|
|
|
386
414
|
# Get memory usage before processing
|
|
387
|
-
|
|
415
|
+
if self.arrow_bridge:
|
|
416
|
+
memory_before = self.arrow_bridge._get_memory_usage_mb()
|
|
417
|
+
else:
|
|
418
|
+
memory_before = 0
|
|
388
419
|
|
|
389
420
|
# Convert to Polars via Arrow
|
|
390
|
-
|
|
391
|
-
|
|
421
|
+
if self.arrow_bridge:
|
|
422
|
+
arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
|
|
423
|
+
polars_df = pl.from_arrow(arrow_table)
|
|
424
|
+
else:
|
|
425
|
+
# Fallback: assume pandas
|
|
426
|
+
import pandas as pd
|
|
427
|
+
if isinstance(df, pd.DataFrame):
|
|
428
|
+
polars_df = pl.from_pandas(df)
|
|
429
|
+
else:
|
|
430
|
+
raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
|
|
392
431
|
|
|
393
432
|
# Execute using AST
|
|
394
433
|
polars_expr = self._ast_to_polars_expr(ast_tree)
|
|
@@ -396,11 +435,17 @@ class PolarsExpressionEngine:
|
|
|
396
435
|
|
|
397
436
|
# Convert back to original format
|
|
398
437
|
result_arrow = result_df.to_arrow()
|
|
399
|
-
|
|
438
|
+
if self.arrow_bridge:
|
|
439
|
+
final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
|
|
440
|
+
else:
|
|
441
|
+
final_result = pl.from_arrow(result_arrow).to_pandas()
|
|
400
442
|
|
|
401
443
|
# Calculate statistics
|
|
402
444
|
execution_time = (datetime.now() - start_time).total_seconds() * 1000
|
|
403
|
-
|
|
445
|
+
if self.arrow_bridge:
|
|
446
|
+
memory_after = self.arrow_bridge._get_memory_usage_mb()
|
|
447
|
+
else:
|
|
448
|
+
memory_after = 0
|
|
404
449
|
memory_used = max(0, memory_after - memory_before)
|
|
405
450
|
|
|
406
451
|
# Update statistics
|
|
@@ -422,7 +467,8 @@ class PolarsExpressionEngine:
|
|
|
422
467
|
raise PolarsExpressionError(f"AST execution failed: {e}")
|
|
423
468
|
|
|
424
469
|
finally:
|
|
425
|
-
self.arrow_bridge
|
|
470
|
+
if self.arrow_bridge:
|
|
471
|
+
self.arrow_bridge.cleanup_arrow_memory()
|
|
426
472
|
|
|
427
473
|
def validate_expression(self, expression: str) -> bool:
|
|
428
474
|
"""
|
|
@@ -489,7 +535,10 @@ class PolarsExpressionEngine:
|
|
|
489
535
|
Benchmark results
|
|
490
536
|
"""
|
|
491
537
|
times = []
|
|
492
|
-
|
|
538
|
+
if self.arrow_bridge:
|
|
539
|
+
backend_type = self.arrow_bridge.detect_backend(df)
|
|
540
|
+
else:
|
|
541
|
+
backend_type = "pandas"
|
|
493
542
|
|
|
494
543
|
for i in range(iterations):
|
|
495
544
|
try:
|
|
@@ -532,7 +581,8 @@ class PolarsExpressionEngine:
|
|
|
532
581
|
"""Cleanup callback for memory manager"""
|
|
533
582
|
try:
|
|
534
583
|
# Cleanup Arrow bridge memory
|
|
535
|
-
self.arrow_bridge
|
|
584
|
+
if self.arrow_bridge:
|
|
585
|
+
self.arrow_bridge.cleanup_arrow_memory()
|
|
536
586
|
|
|
537
587
|
# Reset statistics if they get too large
|
|
538
588
|
if self.execution_stats["total_executions"] > 10000:
|
additory/core/registry.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# Versioned registry for additory
|
|
3
3
|
|
|
4
4
|
from dataclasses import dataclass
|
|
5
|
+
from typing import Optional
|
|
5
6
|
import os
|
|
6
7
|
import json
|
|
7
8
|
|
|
@@ -26,9 +27,9 @@ class ResolvedFormula:
|
|
|
26
27
|
source: str
|
|
27
28
|
version: str
|
|
28
29
|
mode: str = "local"
|
|
29
|
-
ast: dict
|
|
30
|
-
sample_clean: dict
|
|
31
|
-
sample_unclean: dict
|
|
30
|
+
ast: Optional[dict] = None
|
|
31
|
+
sample_clean: Optional[dict] = None
|
|
32
|
+
sample_unclean: Optional[dict] = None
|
|
32
33
|
|
|
33
34
|
|
|
34
35
|
# ------------------------------------------------------------
|
additory/dynamic_api.py
CHANGED
|
@@ -15,9 +15,8 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
15
15
|
Main API class for Additory functionality.
|
|
16
16
|
|
|
17
17
|
Provides access to:
|
|
18
|
-
- add.
|
|
18
|
+
- add.synthetic() - Synthetic data generation
|
|
19
19
|
- add.to() - Lookup/join operations
|
|
20
|
-
- add.synth() - Synthetic data generation
|
|
21
20
|
- add.scan() - Data profiling and analysis
|
|
22
21
|
- add.my - User expressions
|
|
23
22
|
- add.play() - Hidden games (for the curious 😉)
|
|
@@ -31,8 +30,15 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
31
30
|
self.my = ExpressionProxy(namespace="user")
|
|
32
31
|
self._builtin_proxy = ExpressionProxy(namespace="builtin")
|
|
33
32
|
|
|
34
|
-
# Explicitly set
|
|
35
|
-
self.
|
|
33
|
+
# Explicitly set methods to prevent namespace conflicts
|
|
34
|
+
self.synthetic = self._synthetic_method
|
|
35
|
+
self.deduce = self._deduce_method
|
|
36
|
+
self.to = self._to_method
|
|
37
|
+
self.onehotencoding = self._onehotencoding_method
|
|
38
|
+
self.harmonize_units = self._harmonize_units_method
|
|
39
|
+
self.scan = self._scan_method
|
|
40
|
+
self.games = self._games_method
|
|
41
|
+
self.play = self._play_method
|
|
36
42
|
|
|
37
43
|
def __getattr__(self, name):
|
|
38
44
|
"""
|
|
@@ -61,65 +67,65 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
61
67
|
except Exception:
|
|
62
68
|
return False
|
|
63
69
|
|
|
64
|
-
def
|
|
70
|
+
def _synthetic_method(self, df, n_rows=5, strategy="auto", seed=None, output_format="pandas", **kwargs):
|
|
65
71
|
"""
|
|
66
|
-
|
|
72
|
+
Generate synthetic data by extending a dataframe or creating from scratch.
|
|
67
73
|
|
|
68
74
|
Three modes:
|
|
69
|
-
1.
|
|
75
|
+
1. Extend mode: Pass a DataFrame to add synthetic rows
|
|
70
76
|
2. Create mode: Pass "@new" to create data from scratch
|
|
71
77
|
3. Sample mode: Pass "@sample" to load sample data
|
|
72
78
|
|
|
73
79
|
Args:
|
|
74
|
-
df: DataFrame to
|
|
75
|
-
n_rows: Number of rows (int for create/sample, int/float/str for
|
|
76
|
-
strategy: Strategy specification (dict for create, str/dict for
|
|
80
|
+
df: DataFrame to extend, "@new" to create, or "@sample" for sample data
|
|
81
|
+
n_rows: Number of rows (int for create/sample, int/float/str for extend)
|
|
82
|
+
strategy: Strategy specification (dict for create, str/dict for extend)
|
|
77
83
|
seed: Random seed for reproducibility
|
|
78
84
|
output_format: Output format ("pandas", "polars", "cudf")
|
|
79
85
|
**kwargs: Additional parameters
|
|
80
86
|
|
|
81
87
|
Returns:
|
|
82
|
-
|
|
88
|
+
Extended or generated DataFrame
|
|
83
89
|
|
|
84
90
|
Examples:
|
|
85
|
-
#
|
|
86
|
-
result = add.
|
|
91
|
+
# Extend existing data
|
|
92
|
+
result = add.synthetic(df, n_rows=100, strategy='auto')
|
|
87
93
|
|
|
88
94
|
# Create from scratch
|
|
89
|
-
result = add.
|
|
95
|
+
result = add.synthetic("@new", n_rows=100, strategy={'id': 'increment', 'age': 'range:18-65'})
|
|
90
96
|
|
|
91
97
|
# Load sample data
|
|
92
|
-
result = add.
|
|
98
|
+
result = add.synthetic("@sample", n_rows=50)
|
|
93
99
|
"""
|
|
94
100
|
# Store reference to restore after import (in the correct namespace)
|
|
95
101
|
import additory
|
|
96
|
-
|
|
102
|
+
original_synthetic = getattr(additory, 'synthetic', None)
|
|
97
103
|
|
|
98
104
|
try:
|
|
99
105
|
# Import and call the implementation
|
|
100
|
-
from additory.
|
|
101
|
-
result =
|
|
102
|
-
|
|
106
|
+
from additory.synthetic.synthesizer import synthetic as synthetic_impl
|
|
107
|
+
result = synthetic_impl(df, n_rows=n_rows, strategy=strategy, seed=seed,
|
|
108
|
+
output_format=output_format, **kwargs)
|
|
103
109
|
|
|
104
110
|
# Restore the method reference in the additory module namespace
|
|
105
|
-
# The import above will have overridden additory.
|
|
111
|
+
# The import above will have overridden additory.synthetic with the module
|
|
106
112
|
# We need to restore it to point to this method
|
|
107
|
-
if
|
|
108
|
-
additory.
|
|
113
|
+
if original_synthetic is not None:
|
|
114
|
+
additory.synthetic = original_synthetic
|
|
109
115
|
else:
|
|
110
|
-
# If there was no original
|
|
111
|
-
additory.
|
|
116
|
+
# If there was no original synthetic, set it to this method
|
|
117
|
+
additory.synthetic = self._synthetic_method
|
|
112
118
|
|
|
113
119
|
return result
|
|
114
120
|
except Exception as e:
|
|
115
121
|
# Restore the method reference even if there's an error
|
|
116
|
-
if
|
|
117
|
-
additory.
|
|
122
|
+
if original_synthetic is not None:
|
|
123
|
+
additory.synthetic = original_synthetic
|
|
118
124
|
else:
|
|
119
|
-
additory.
|
|
125
|
+
additory.synthetic = self._synthetic_method
|
|
120
126
|
raise
|
|
121
127
|
|
|
122
|
-
def
|
|
128
|
+
def _to_method(self, target_df, from_df=None, bring=None, against=None, **kwargs):
|
|
123
129
|
"""
|
|
124
130
|
Add columns from reference dataframe to target dataframe.
|
|
125
131
|
|
|
@@ -140,26 +146,7 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
140
146
|
from additory.utilities.lookup import to
|
|
141
147
|
return to(target_df, from_df, bring=bring, against=against, **kwargs)
|
|
142
148
|
|
|
143
|
-
def
|
|
144
|
-
"""
|
|
145
|
-
Generate synthetic data from a schema file.
|
|
146
|
-
|
|
147
|
-
Args:
|
|
148
|
-
schema_path: Path to the .toml schema file
|
|
149
|
-
rows: Number of rows to generate (default: 1000)
|
|
150
|
-
engine: Output engine ("pandas" or "polars"). If None, uses default from config
|
|
151
|
-
|
|
152
|
-
Returns:
|
|
153
|
-
Generated DataFrame in the specified format
|
|
154
|
-
|
|
155
|
-
Example:
|
|
156
|
-
df = add.synth("customer.toml", rows=5000)
|
|
157
|
-
df = add.synth("customer.toml", rows=5000, engine="polars")
|
|
158
|
-
"""
|
|
159
|
-
from additory.synthetic.api import synth as synth_impl
|
|
160
|
-
return synth_impl(schema_path, rows, engine)
|
|
161
|
-
|
|
162
|
-
def onehotencoding(self, df, columns=None, **kwargs):
|
|
149
|
+
def _onehotencoding_method(self, df, columns=None, **kwargs):
|
|
163
150
|
"""
|
|
164
151
|
One-hot encode categorical columns.
|
|
165
152
|
|
|
@@ -174,7 +161,7 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
174
161
|
from additory.utilities.encoding import onehotencoding
|
|
175
162
|
return onehotencoding(df, column=columns, **kwargs)
|
|
176
163
|
|
|
177
|
-
def
|
|
164
|
+
def _harmonize_units_method(self, df, value_column, unit_column, target_unit=None, position="end", **kwargs):
|
|
178
165
|
"""
|
|
179
166
|
Harmonize units in a dataframe.
|
|
180
167
|
|
|
@@ -196,7 +183,7 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
196
183
|
from additory.utilities.units import harmonize_units
|
|
197
184
|
return harmonize_units(df, value_column, unit_column, target_unit, position, **kwargs)
|
|
198
185
|
|
|
199
|
-
def
|
|
186
|
+
def _scan_method(
|
|
200
187
|
self,
|
|
201
188
|
df: Union[pl.DataFrame, pd.DataFrame, Any],
|
|
202
189
|
preset: Optional[str] = None,
|
|
@@ -279,7 +266,64 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
279
266
|
verbose=verbose
|
|
280
267
|
)
|
|
281
268
|
|
|
282
|
-
def
|
|
269
|
+
def _deduce_method(
|
|
270
|
+
self,
|
|
271
|
+
df: Union[pd.DataFrame, pl.DataFrame, Any],
|
|
272
|
+
from_column: Union[str, List[str]],
|
|
273
|
+
to_column: str
|
|
274
|
+
) -> Union[pd.DataFrame, pl.DataFrame, Any]:
|
|
275
|
+
"""
|
|
276
|
+
Deduce missing labels based on text similarity to labeled examples.
|
|
277
|
+
|
|
278
|
+
Uses cosine similarity on TF-IDF vectors. Pure Python, no LLMs, offline-first.
|
|
279
|
+
Requires at least 3 labeled examples to work.
|
|
280
|
+
|
|
281
|
+
When multiple source columns are provided, they are concatenated with
|
|
282
|
+
spaces before computing similarity.
|
|
283
|
+
|
|
284
|
+
Args:
|
|
285
|
+
df: DataFrame with some labeled and some unlabeled rows
|
|
286
|
+
from_column: Text column(s) to analyze
|
|
287
|
+
- str: Single column (e.g., "comment")
|
|
288
|
+
- List[str]: Multiple columns (e.g., ["comment", "notes"])
|
|
289
|
+
to_column: Label column to fill (e.g., "status")
|
|
290
|
+
|
|
291
|
+
Returns:
|
|
292
|
+
DataFrame with deduced labels filled in
|
|
293
|
+
|
|
294
|
+
Examples:
|
|
295
|
+
# Single column
|
|
296
|
+
>>> result = add.deduce(df, from_column="comment", to_column="status")
|
|
297
|
+
|
|
298
|
+
# Multiple columns (better accuracy)
|
|
299
|
+
>>> result = add.deduce(
|
|
300
|
+
... df,
|
|
301
|
+
... from_column=["comment", "notes", "description"],
|
|
302
|
+
... to_column="status"
|
|
303
|
+
... )
|
|
304
|
+
|
|
305
|
+
Privacy: Your data never leaves your machine. No external connections.
|
|
306
|
+
"""
|
|
307
|
+
from additory.synthetic.deduce import deduce as deduce_impl
|
|
308
|
+
return deduce_impl(df, from_column, to_column)
|
|
309
|
+
|
|
310
|
+
def _games_method(self):
|
|
311
|
+
"""
|
|
312
|
+
List available games! 🎮
|
|
313
|
+
|
|
314
|
+
Returns a list of games you can play with add.play().
|
|
315
|
+
|
|
316
|
+
Returns:
|
|
317
|
+
List of available game names
|
|
318
|
+
|
|
319
|
+
Example:
|
|
320
|
+
>>> import additory
|
|
321
|
+
>>> additory.add.games()
|
|
322
|
+
['tictactoe', 'sudoku']
|
|
323
|
+
"""
|
|
324
|
+
return ['tictactoe', 'sudoku']
|
|
325
|
+
|
|
326
|
+
def _play_method(self, game: str = "tictactoe"):
|
|
283
327
|
"""
|
|
284
328
|
Play a game! 🎮
|
|
285
329
|
|
additory/expressions/proxy.py
CHANGED
|
@@ -287,7 +287,10 @@ class EnhancedExpressionProxy:
|
|
|
287
287
|
backend_type = "polars"
|
|
288
288
|
else:
|
|
289
289
|
# Try to detect other types
|
|
290
|
-
|
|
290
|
+
if self.polars_engine.arrow_bridge:
|
|
291
|
+
backend_type = self.polars_engine.arrow_bridge.detect_backend(df)
|
|
292
|
+
else:
|
|
293
|
+
backend_type = "pandas" # fallback
|
|
291
294
|
|
|
292
295
|
# Execute using Polars engine
|
|
293
296
|
result = self.polars_engine.execute_expression(
|