additory 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/polars_expression_engine.py +66 -16
- additory/dynamic_api.py +42 -46
- additory/expressions/proxy.py +4 -1
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +11 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/METADATA +12 -17
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
additory/__init__.py
CHANGED
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
from .dynamic_api import add as _api_instance
|
|
4
4
|
|
|
5
|
+
# Version information
|
|
6
|
+
__version__ = "0.1.0a3"
|
|
7
|
+
|
|
5
8
|
# Expose the API instance normally
|
|
6
9
|
add = _api_instance
|
|
7
10
|
|
|
@@ -12,4 +15,5 @@ def __getattr__(name):
|
|
|
12
15
|
|
|
13
16
|
__all__ = [
|
|
14
17
|
"add",
|
|
18
|
+
"__version__",
|
|
15
19
|
]
|
additory/common/__init__.py
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Common Utilities Module
|
|
3
3
|
|
|
4
|
-
Shared functionality used by both
|
|
4
|
+
Shared functionality used by both synthetic and expressions modules:
|
|
5
5
|
- Distribution functions (normal, uniform, skewed, etc.)
|
|
6
6
|
- List file management (.list format)
|
|
7
7
|
- Pattern file management (.properties format)
|
|
8
8
|
- Fallback resolution logic
|
|
9
9
|
|
|
10
10
|
This module eliminates code duplication and provides consistent behavior
|
|
11
|
-
across
|
|
11
|
+
across synthetic and expression data generation.
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
from .distributions import (
|
additory/common/backend.py
CHANGED
|
@@ -180,11 +180,14 @@ def get_arrow_bridge():
|
|
|
180
180
|
- Use for all cross-backend conversions
|
|
181
181
|
- Handles pandas/polars/cuDF via Arrow
|
|
182
182
|
"""
|
|
183
|
-
from additory.core.backends.arrow_bridge import EnhancedArrowBridge
|
|
183
|
+
from additory.core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
|
|
184
184
|
|
|
185
185
|
# Singleton pattern
|
|
186
186
|
if not hasattr(get_arrow_bridge, '_instance'):
|
|
187
|
-
|
|
187
|
+
try:
|
|
188
|
+
get_arrow_bridge._instance = EnhancedArrowBridge()
|
|
189
|
+
except ArrowBridgeError:
|
|
190
|
+
get_arrow_bridge._instance = None
|
|
188
191
|
|
|
189
192
|
return get_arrow_bridge._instance
|
|
190
193
|
|
|
@@ -194,7 +197,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
194
197
|
Convert any dataframe to Polars via Arrow bridge.
|
|
195
198
|
|
|
196
199
|
This is the primary conversion function for the Polars-only architecture.
|
|
197
|
-
All operations (expressions,
|
|
200
|
+
All operations (expressions, synthetic, etc.) use this to convert input
|
|
198
201
|
dataframes to Polars for processing.
|
|
199
202
|
|
|
200
203
|
Args:
|
|
@@ -224,7 +227,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
224
227
|
)
|
|
225
228
|
|
|
226
229
|
# Fast path: already Polars
|
|
227
|
-
if isinstance(df, pl.DataFrame):
|
|
230
|
+
if HAS_POLARS and isinstance(df, pl.DataFrame):
|
|
228
231
|
return df
|
|
229
232
|
|
|
230
233
|
# Validate input
|
|
@@ -240,6 +243,13 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
240
243
|
# Convert via Arrow bridge
|
|
241
244
|
try:
|
|
242
245
|
bridge = get_arrow_bridge()
|
|
246
|
+
if bridge is None:
|
|
247
|
+
# Fallback: direct conversion for pandas
|
|
248
|
+
if backend_type == "pandas":
|
|
249
|
+
if isinstance(df, pd.DataFrame):
|
|
250
|
+
return pl.from_pandas(df)
|
|
251
|
+
raise RuntimeError("Arrow bridge not available and cannot convert non-pandas DataFrame")
|
|
252
|
+
|
|
243
253
|
arrow_table = bridge.to_arrow(df, backend_type)
|
|
244
254
|
pl_df = bridge.from_arrow(arrow_table, "polars")
|
|
245
255
|
return pl_df
|
|
@@ -309,6 +319,12 @@ def from_polars(pl_df: 'pl.DataFrame', target_backend: BackendType) -> Any:
|
|
|
309
319
|
# Convert via Arrow bridge
|
|
310
320
|
try:
|
|
311
321
|
bridge = get_arrow_bridge()
|
|
322
|
+
if bridge is None:
|
|
323
|
+
# Fallback: direct conversion for pandas
|
|
324
|
+
if target_backend == "pandas":
|
|
325
|
+
return pl_df.to_pandas()
|
|
326
|
+
raise RuntimeError("Arrow bridge not available and cannot convert to non-pandas DataFrame")
|
|
327
|
+
|
|
312
328
|
arrow_table = bridge.to_arrow(pl_df, "polars")
|
|
313
329
|
result_df = bridge.from_arrow(arrow_table, target_backend)
|
|
314
330
|
return result_df
|
additory/common/distributions.py
CHANGED
additory/common/sample_data.py
CHANGED
|
@@ -8,8 +8,8 @@ loaded on-demand using the existing .add file parser.
|
|
|
8
8
|
Usage:
|
|
9
9
|
from additory.common.sample_data import get_sample_dataset
|
|
10
10
|
|
|
11
|
-
# For
|
|
12
|
-
df = get_sample_dataset("
|
|
11
|
+
# For synthetic
|
|
12
|
+
df = get_sample_dataset("synthetic", "sample")
|
|
13
13
|
|
|
14
14
|
# For expressions (future)
|
|
15
15
|
df = get_sample_dataset("expressions", "sample")
|
|
@@ -25,7 +25,7 @@ from additory.common.exceptions import ValidationError
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def get_sample_dataset(
|
|
28
|
-
module: str = "
|
|
28
|
+
module: str = "synthetic",
|
|
29
29
|
block: str = "sample",
|
|
30
30
|
dataset_type: str = "clean"
|
|
31
31
|
) -> pl.DataFrame:
|
|
@@ -33,12 +33,12 @@ def get_sample_dataset(
|
|
|
33
33
|
Load a sample dataset from .add files.
|
|
34
34
|
|
|
35
35
|
This function provides centralized access to sample datasets across
|
|
36
|
-
all additory modules (
|
|
36
|
+
all additory modules (synthetic, expressions, utilities). Sample datasets
|
|
37
37
|
are stored as .add files in the reference/ directory structure.
|
|
38
38
|
|
|
39
39
|
Args:
|
|
40
|
-
module: Module name ("
|
|
41
|
-
block: Block name within the .add file ("sample" for
|
|
40
|
+
module: Module name ("synthetic", "expressions", "utilities")
|
|
41
|
+
block: Block name within the .add file ("sample" for synthetic)
|
|
42
42
|
dataset_type: Type of sample data ("clean" or "unclean")
|
|
43
43
|
|
|
44
44
|
Returns:
|
|
@@ -48,8 +48,8 @@ def get_sample_dataset(
|
|
|
48
48
|
ValidationError: If module, block, or dataset_type not found
|
|
49
49
|
|
|
50
50
|
Examples:
|
|
51
|
-
>>> # Load
|
|
52
|
-
>>> df = get_sample_dataset("
|
|
51
|
+
>>> # Load synthetic sample dataset
|
|
52
|
+
>>> df = get_sample_dataset("synthetic", "sample")
|
|
53
53
|
>>> print(df.shape)
|
|
54
54
|
(50, 10)
|
|
55
55
|
|
|
@@ -57,7 +57,7 @@ def get_sample_dataset(
|
|
|
57
57
|
>>> df = get_sample_dataset("expressions", "sample", "clean")
|
|
58
58
|
>>> df_unclean = get_sample_dataset("expressions", "sample", "unclean")
|
|
59
59
|
|
|
60
|
-
Sample Dataset Structure (
|
|
60
|
+
Sample Dataset Structure (synthetic):
|
|
61
61
|
- id: Sequential numeric IDs (1-50)
|
|
62
62
|
- emp_id: Employee IDs with pattern (EMP_001 - EMP_050)
|
|
63
63
|
- order_id: Order IDs with different padding (ORD_0001 - ORD_0050)
|
|
@@ -72,8 +72,8 @@ def get_sample_dataset(
|
|
|
72
72
|
# Construct path to .add file
|
|
73
73
|
base_path = Path(__file__).parent.parent.parent / "reference"
|
|
74
74
|
|
|
75
|
-
if module == "
|
|
76
|
-
add_file_path = base_path / "
|
|
75
|
+
if module == "synthetic":
|
|
76
|
+
add_file_path = base_path / "synthetic_definitions" / f"{block}_0.1.add"
|
|
77
77
|
elif module == "expressions":
|
|
78
78
|
add_file_path = base_path / "expressions_definitions" / f"{block}_0.1.add"
|
|
79
79
|
elif module == "utilities":
|
|
@@ -81,7 +81,7 @@ def get_sample_dataset(
|
|
|
81
81
|
else:
|
|
82
82
|
raise ValidationError(
|
|
83
83
|
f"Unknown module '{module}'. "
|
|
84
|
-
f"Valid modules:
|
|
84
|
+
f"Valid modules: synthetic, expressions, utilities"
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
# Check if file exists
|
|
@@ -141,7 +141,7 @@ def list_available_samples() -> dict:
|
|
|
141
141
|
>>> samples = list_available_samples()
|
|
142
142
|
>>> print(samples)
|
|
143
143
|
{
|
|
144
|
-
'
|
|
144
|
+
'synthetic': ['sample'],
|
|
145
145
|
'expressions': ['sample'],
|
|
146
146
|
'utilities': []
|
|
147
147
|
}
|
|
@@ -149,15 +149,15 @@ def list_available_samples() -> dict:
|
|
|
149
149
|
base_path = Path(__file__).parent.parent.parent / "reference"
|
|
150
150
|
available = {}
|
|
151
151
|
|
|
152
|
-
# Check
|
|
153
|
-
|
|
154
|
-
if
|
|
155
|
-
available['
|
|
152
|
+
# Check synthetic
|
|
153
|
+
synthetic_path = base_path / "synthetic_definitions"
|
|
154
|
+
if synthetic_path.exists():
|
|
155
|
+
available['synthetic'] = [
|
|
156
156
|
f.stem.rsplit('_', 1)[0] # Remove version suffix
|
|
157
|
-
for f in
|
|
157
|
+
for f in synthetic_path.glob("*.add")
|
|
158
158
|
]
|
|
159
159
|
else:
|
|
160
|
-
available['
|
|
160
|
+
available['synthetic'] = []
|
|
161
161
|
|
|
162
162
|
# Check expressions
|
|
163
163
|
expressions_path = base_path / "expressions_definitions"
|
additory/core/backends/arrow_bridge.py
CHANGED
|
@@ -16,6 +16,13 @@ try:
|
|
|
16
16
|
except ImportError as e:
|
|
17
17
|
ARROW_AVAILABLE = False
|
|
18
18
|
IMPORT_ERROR = str(e)
|
|
19
|
+
# Create dummy classes for type annotations
|
|
20
|
+
class pa:
|
|
21
|
+
Table = Any
|
|
22
|
+
class pl:
|
|
23
|
+
DataFrame = Any
|
|
24
|
+
class pd:
|
|
25
|
+
DataFrame = Any
|
|
19
26
|
|
|
20
27
|
from ..logging import log_info, log_warning
|
|
21
28
|
from .cudf_bridge import get_cudf_bridge
|
additory/core/polars_expression_engine.py
CHANGED
|
@@ -32,7 +32,10 @@ class PolarsExpressionEngine:
|
|
|
32
32
|
"""Exclusive Polars-based expression processing engine"""
|
|
33
33
|
|
|
34
34
|
def __init__(self):
|
|
35
|
-
|
|
35
|
+
try:
|
|
36
|
+
self.arrow_bridge = EnhancedArrowBridge()
|
|
37
|
+
except ArrowBridgeError:
|
|
38
|
+
self.arrow_bridge = None
|
|
36
39
|
self.execution_stats = {
|
|
37
40
|
"total_executions": 0,
|
|
38
41
|
"total_time_ms": 0.0,
|
|
@@ -68,14 +71,28 @@ class PolarsExpressionEngine:
|
|
|
68
71
|
try:
|
|
69
72
|
# Auto-detect backend if not specified
|
|
70
73
|
if backend_type is None:
|
|
71
|
-
|
|
74
|
+
if self.arrow_bridge:
|
|
75
|
+
backend_type = self.arrow_bridge.detect_backend(df)
|
|
76
|
+
else:
|
|
77
|
+
backend_type = "pandas" # fallback
|
|
72
78
|
|
|
73
79
|
# Get memory usage before processing
|
|
74
|
-
|
|
80
|
+
if self.arrow_bridge:
|
|
81
|
+
memory_before = self.arrow_bridge._get_memory_usage_mb()
|
|
82
|
+
else:
|
|
83
|
+
memory_before = 0
|
|
75
84
|
|
|
76
85
|
# 1. Convert input to Arrow
|
|
77
86
|
log_info(f"[polars_engine] Converting {backend_type} to Arrow")
|
|
78
|
-
|
|
87
|
+
if self.arrow_bridge:
|
|
88
|
+
arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
|
|
89
|
+
else:
|
|
90
|
+
# Fallback: assume pandas and convert directly
|
|
91
|
+
import pandas as pd
|
|
92
|
+
if isinstance(df, pd.DataFrame):
|
|
93
|
+
arrow_table = pl.from_pandas(df).to_arrow()
|
|
94
|
+
else:
|
|
95
|
+
raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
|
|
79
96
|
|
|
80
97
|
# 2. Convert Arrow to Polars
|
|
81
98
|
log_info("[polars_engine] Converting Arrow to Polars")
|
|
@@ -93,11 +110,18 @@ class PolarsExpressionEngine:
|
|
|
93
110
|
|
|
94
111
|
# 5. Convert to original backend format
|
|
95
112
|
log_info(f"[polars_engine] Converting Arrow to {backend_type}")
|
|
96
|
-
|
|
113
|
+
if self.arrow_bridge:
|
|
114
|
+
final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
|
|
115
|
+
else:
|
|
116
|
+
# Fallback: convert back to pandas
|
|
117
|
+
final_result = pl.from_arrow(result_arrow).to_pandas()
|
|
97
118
|
|
|
98
119
|
# Calculate execution statistics
|
|
99
120
|
execution_time = (datetime.now() - start_time).total_seconds() * 1000
|
|
100
|
-
|
|
121
|
+
if self.arrow_bridge:
|
|
122
|
+
memory_after = self.arrow_bridge._get_memory_usage_mb()
|
|
123
|
+
else:
|
|
124
|
+
memory_after = 0
|
|
101
125
|
memory_used = max(0, memory_after - memory_before)
|
|
102
126
|
|
|
103
127
|
# Update global statistics
|
|
@@ -122,7 +146,8 @@ class PolarsExpressionEngine:
|
|
|
122
146
|
|
|
123
147
|
finally:
|
|
124
148
|
# 6. Always cleanup Arrow memory
|
|
125
|
-
self.arrow_bridge
|
|
149
|
+
if self.arrow_bridge:
|
|
150
|
+
self.arrow_bridge.cleanup_arrow_memory()
|
|
126
151
|
|
|
127
152
|
def _execute_polars_expression(self, polars_df: pl.DataFrame,
|
|
128
153
|
expression: str, output_column: str) -> pl.DataFrame:
|
|
@@ -381,14 +406,28 @@ class PolarsExpressionEngine:
|
|
|
381
406
|
try:
|
|
382
407
|
# Auto-detect backend if not specified
|
|
383
408
|
if backend_type is None:
|
|
384
|
-
|
|
409
|
+
if self.arrow_bridge:
|
|
410
|
+
backend_type = self.arrow_bridge.detect_backend(df)
|
|
411
|
+
else:
|
|
412
|
+
backend_type = "pandas"
|
|
385
413
|
|
|
386
414
|
# Get memory usage before processing
|
|
387
|
-
|
|
415
|
+
if self.arrow_bridge:
|
|
416
|
+
memory_before = self.arrow_bridge._get_memory_usage_mb()
|
|
417
|
+
else:
|
|
418
|
+
memory_before = 0
|
|
388
419
|
|
|
389
420
|
# Convert to Polars via Arrow
|
|
390
|
-
|
|
391
|
-
|
|
421
|
+
if self.arrow_bridge:
|
|
422
|
+
arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
|
|
423
|
+
polars_df = pl.from_arrow(arrow_table)
|
|
424
|
+
else:
|
|
425
|
+
# Fallback: assume pandas
|
|
426
|
+
import pandas as pd
|
|
427
|
+
if isinstance(df, pd.DataFrame):
|
|
428
|
+
polars_df = pl.from_pandas(df)
|
|
429
|
+
else:
|
|
430
|
+
raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
|
|
392
431
|
|
|
393
432
|
# Execute using AST
|
|
394
433
|
polars_expr = self._ast_to_polars_expr(ast_tree)
|
|
@@ -396,11 +435,17 @@ class PolarsExpressionEngine:
|
|
|
396
435
|
|
|
397
436
|
# Convert back to original format
|
|
398
437
|
result_arrow = result_df.to_arrow()
|
|
399
|
-
|
|
438
|
+
if self.arrow_bridge:
|
|
439
|
+
final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
|
|
440
|
+
else:
|
|
441
|
+
final_result = pl.from_arrow(result_arrow).to_pandas()
|
|
400
442
|
|
|
401
443
|
# Calculate statistics
|
|
402
444
|
execution_time = (datetime.now() - start_time).total_seconds() * 1000
|
|
403
|
-
|
|
445
|
+
if self.arrow_bridge:
|
|
446
|
+
memory_after = self.arrow_bridge._get_memory_usage_mb()
|
|
447
|
+
else:
|
|
448
|
+
memory_after = 0
|
|
404
449
|
memory_used = max(0, memory_after - memory_before)
|
|
405
450
|
|
|
406
451
|
# Update statistics
|
|
@@ -422,7 +467,8 @@ class PolarsExpressionEngine:
|
|
|
422
467
|
raise PolarsExpressionError(f"AST execution failed: {e}")
|
|
423
468
|
|
|
424
469
|
finally:
|
|
425
|
-
self.arrow_bridge
|
|
470
|
+
if self.arrow_bridge:
|
|
471
|
+
self.arrow_bridge.cleanup_arrow_memory()
|
|
426
472
|
|
|
427
473
|
def validate_expression(self, expression: str) -> bool:
|
|
428
474
|
"""
|
|
@@ -489,7 +535,10 @@ class PolarsExpressionEngine:
|
|
|
489
535
|
Benchmark results
|
|
490
536
|
"""
|
|
491
537
|
times = []
|
|
492
|
-
|
|
538
|
+
if self.arrow_bridge:
|
|
539
|
+
backend_type = self.arrow_bridge.detect_backend(df)
|
|
540
|
+
else:
|
|
541
|
+
backend_type = "pandas"
|
|
493
542
|
|
|
494
543
|
for i in range(iterations):
|
|
495
544
|
try:
|
|
@@ -532,7 +581,8 @@ class PolarsExpressionEngine:
|
|
|
532
581
|
"""Cleanup callback for memory manager"""
|
|
533
582
|
try:
|
|
534
583
|
# Cleanup Arrow bridge memory
|
|
535
|
-
self.arrow_bridge
|
|
584
|
+
if self.arrow_bridge:
|
|
585
|
+
self.arrow_bridge.cleanup_arrow_memory()
|
|
536
586
|
|
|
537
587
|
# Reset statistics if they get too large
|
|
538
588
|
if self.execution_stats["total_executions"] > 10000:
|
additory/dynamic_api.py
CHANGED
|
@@ -15,9 +15,8 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
15
15
|
Main API class for Additory functionality.
|
|
16
16
|
|
|
17
17
|
Provides access to:
|
|
18
|
-
- add.
|
|
18
|
+
- add.synthetic() - Synthetic data generation
|
|
19
19
|
- add.to() - Lookup/join operations
|
|
20
|
-
- add.synth() - Synthetic data generation
|
|
21
20
|
- add.scan() - Data profiling and analysis
|
|
22
21
|
- add.my - User expressions
|
|
23
22
|
- add.play() - Hidden games (for the curious 😉)
|
|
@@ -31,8 +30,8 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
31
30
|
self.my = ExpressionProxy(namespace="user")
|
|
32
31
|
self._builtin_proxy = ExpressionProxy(namespace="builtin")
|
|
33
32
|
|
|
34
|
-
# Explicitly set the
|
|
35
|
-
self.
|
|
33
|
+
# Explicitly set the synthetic method to prevent namespace conflicts
|
|
34
|
+
self.synthetic = self._synthetic_method
|
|
36
35
|
|
|
37
36
|
def __getattr__(self, name):
|
|
38
37
|
"""
|
|
@@ -61,62 +60,62 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
61
60
|
except Exception:
|
|
62
61
|
return False
|
|
63
62
|
|
|
64
|
-
def
|
|
63
|
+
def _synthetic_method(self, df, n_rows=5, strategy="auto", seed=None, output_format="pandas", **kwargs):
|
|
65
64
|
"""
|
|
66
|
-
|
|
65
|
+
Generate synthetic data by extending a dataframe or creating from scratch.
|
|
67
66
|
|
|
68
67
|
Three modes:
|
|
69
|
-
1.
|
|
68
|
+
1. Extend mode: Pass a DataFrame to add synthetic rows
|
|
70
69
|
2. Create mode: Pass "@new" to create data from scratch
|
|
71
70
|
3. Sample mode: Pass "@sample" to load sample data
|
|
72
71
|
|
|
73
72
|
Args:
|
|
74
|
-
df: DataFrame to
|
|
75
|
-
n_rows: Number of rows (int for create/sample, int/float/str for
|
|
76
|
-
strategy: Strategy specification (dict for create, str/dict for
|
|
73
|
+
df: DataFrame to extend, "@new" to create, or "@sample" for sample data
|
|
74
|
+
n_rows: Number of rows (int for create/sample, int/float/str for extend)
|
|
75
|
+
strategy: Strategy specification (dict for create, str/dict for extend)
|
|
77
76
|
seed: Random seed for reproducibility
|
|
78
77
|
output_format: Output format ("pandas", "polars", "cudf")
|
|
79
78
|
**kwargs: Additional parameters
|
|
80
79
|
|
|
81
80
|
Returns:
|
|
82
|
-
|
|
81
|
+
Extended or generated DataFrame
|
|
83
82
|
|
|
84
83
|
Examples:
|
|
85
|
-
#
|
|
86
|
-
result = add.
|
|
84
|
+
# Extend existing data
|
|
85
|
+
result = add.synthetic(df, n_rows=100, strategy='auto')
|
|
87
86
|
|
|
88
87
|
# Create from scratch
|
|
89
|
-
result = add.
|
|
88
|
+
result = add.synthetic("@new", n_rows=100, strategy={'id': 'increment', 'age': 'range:18-65'})
|
|
90
89
|
|
|
91
90
|
# Load sample data
|
|
92
|
-
result = add.
|
|
91
|
+
result = add.synthetic("@sample", n_rows=50)
|
|
93
92
|
"""
|
|
94
93
|
# Store reference to restore after import (in the correct namespace)
|
|
95
94
|
import additory
|
|
96
|
-
|
|
95
|
+
original_synthetic = getattr(additory, 'synthetic', None)
|
|
97
96
|
|
|
98
97
|
try:
|
|
99
98
|
# Import and call the implementation
|
|
100
|
-
from additory.
|
|
101
|
-
result =
|
|
102
|
-
|
|
99
|
+
from additory.synthetic.synthesizer import synthetic as synthetic_impl
|
|
100
|
+
result = synthetic_impl(df, n_rows=n_rows, strategy=strategy, seed=seed,
|
|
101
|
+
output_format=output_format, **kwargs)
|
|
103
102
|
|
|
104
103
|
# Restore the method reference in the additory module namespace
|
|
105
|
-
# The import above will have overridden additory.
|
|
104
|
+
# The import above will have overridden additory.synthetic with the module
|
|
106
105
|
# We need to restore it to point to this method
|
|
107
|
-
if
|
|
108
|
-
additory.
|
|
106
|
+
if original_synthetic is not None:
|
|
107
|
+
additory.synthetic = original_synthetic
|
|
109
108
|
else:
|
|
110
|
-
# If there was no original
|
|
111
|
-
additory.
|
|
109
|
+
# If there was no original synthetic, set it to this method
|
|
110
|
+
additory.synthetic = self._synthetic_method
|
|
112
111
|
|
|
113
112
|
return result
|
|
114
113
|
except Exception as e:
|
|
115
114
|
# Restore the method reference even if there's an error
|
|
116
|
-
if
|
|
117
|
-
additory.
|
|
115
|
+
if original_synthetic is not None:
|
|
116
|
+
additory.synthetic = original_synthetic
|
|
118
117
|
else:
|
|
119
|
-
additory.
|
|
118
|
+
additory.synthetic = self._synthetic_method
|
|
120
119
|
raise
|
|
121
120
|
|
|
122
121
|
def to(self, target_df, from_df=None, bring=None, against=None, **kwargs):
|
|
@@ -140,25 +139,6 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
140
139
|
from additory.utilities.lookup import to
|
|
141
140
|
return to(target_df, from_df, bring=bring, against=against, **kwargs)
|
|
142
141
|
|
|
143
|
-
def synth(self, schema_path: str, rows: int = 1000, engine: Optional[str] = None):
|
|
144
|
-
"""
|
|
145
|
-
Generate synthetic data from a schema file.
|
|
146
|
-
|
|
147
|
-
Args:
|
|
148
|
-
schema_path: Path to the .toml schema file
|
|
149
|
-
rows: Number of rows to generate (default: 1000)
|
|
150
|
-
engine: Output engine ("pandas" or "polars"). If None, uses default from config
|
|
151
|
-
|
|
152
|
-
Returns:
|
|
153
|
-
Generated DataFrame in the specified format
|
|
154
|
-
|
|
155
|
-
Example:
|
|
156
|
-
df = add.synth("customer.toml", rows=5000)
|
|
157
|
-
df = add.synth("customer.toml", rows=5000, engine="polars")
|
|
158
|
-
"""
|
|
159
|
-
from additory.synthetic.api import synth as synth_impl
|
|
160
|
-
return synth_impl(schema_path, rows, engine)
|
|
161
|
-
|
|
162
142
|
def onehotencoding(self, df, columns=None, **kwargs):
|
|
163
143
|
"""
|
|
164
144
|
One-hot encode categorical columns.
|
|
@@ -279,6 +259,22 @@ class AdditoryAPI(SimpleNamespace):
|
|
|
279
259
|
verbose=verbose
|
|
280
260
|
)
|
|
281
261
|
|
|
262
|
+
def games(self):
|
|
263
|
+
"""
|
|
264
|
+
List available games! 🎮
|
|
265
|
+
|
|
266
|
+
Returns a list of games you can play with add.play().
|
|
267
|
+
|
|
268
|
+
Returns:
|
|
269
|
+
List of available game names
|
|
270
|
+
|
|
271
|
+
Example:
|
|
272
|
+
>>> import additory
|
|
273
|
+
>>> additory.add.games()
|
|
274
|
+
['tictactoe', 'sudoku']
|
|
275
|
+
"""
|
|
276
|
+
return ['tictactoe', 'sudoku']
|
|
277
|
+
|
|
282
278
|
def play(self, game: str = "tictactoe"):
|
|
283
279
|
"""
|
|
284
280
|
Play a game! 🎮
|
additory/expressions/proxy.py
CHANGED
|
@@ -287,7 +287,10 @@ class EnhancedExpressionProxy:
|
|
|
287
287
|
backend_type = "polars"
|
|
288
288
|
else:
|
|
289
289
|
# Try to detect other types
|
|
290
|
-
|
|
290
|
+
if self.polars_engine.arrow_bridge:
|
|
291
|
+
backend_type = self.polars_engine.arrow_bridge.detect_backend(df)
|
|
292
|
+
else:
|
|
293
|
+
backend_type = "pandas" # fallback
|
|
291
294
|
|
|
292
295
|
# Execute using Polars engine
|
|
293
296
|
result = self.polars_engine.execute_expression(
|
additory/synthetic/__init__.py
CHANGED
|
@@ -1,101 +1,13 @@
|
|
|
1
1
|
"""
|
|
2
|
-
|
|
2
|
+
Synthetic Module - Synthetic Data Generation Functionality
|
|
3
3
|
|
|
4
|
-
This module provides
|
|
5
|
-
|
|
6
|
-
|
|
4
|
+
This module provides synthetic data generation capabilities to add synthetic rows
|
|
5
|
+
to existing dataframes or create data from scratch by intelligently sampling
|
|
6
|
+
from existing data patterns.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
from .
|
|
10
|
-
synth,
|
|
11
|
-
config,
|
|
12
|
-
register_distribution_engine,
|
|
13
|
-
unregister_distribution_engine,
|
|
14
|
-
list_custom_distribution_engines
|
|
15
|
-
)
|
|
16
|
-
from .exceptions import (
|
|
17
|
-
SyntheticDataError,
|
|
18
|
-
PatternResolutionError,
|
|
19
|
-
ValidationError,
|
|
20
|
-
DistributionError,
|
|
21
|
-
FileFormatError,
|
|
22
|
-
PatternImportError,
|
|
23
|
-
SchemaParsingError
|
|
24
|
-
)
|
|
25
|
-
from .pattern_resolver import PatternHierarchyResolver, ResolutionTrace, PatternResolutionResult
|
|
26
|
-
from .engines import (
|
|
27
|
-
DistributionEngine,
|
|
28
|
-
DistributionEngineFactory,
|
|
29
|
-
DistributionManager,
|
|
30
|
-
DistributionConfig,
|
|
31
|
-
)
|
|
32
|
-
from .generator import (
|
|
33
|
-
RegexGenerator,
|
|
34
|
-
PolarsGeneratorCore,
|
|
35
|
-
OutputConverter,
|
|
36
|
-
SyntheticDataGenerator,
|
|
37
|
-
GenerationConfig,
|
|
38
|
-
)
|
|
39
|
-
from .performance import (
|
|
40
|
-
PerformanceMonitor,
|
|
41
|
-
PerformanceOptimizer,
|
|
42
|
-
PerformanceMetrics,
|
|
43
|
-
PerformanceComparison,
|
|
44
|
-
performance_monitor,
|
|
45
|
-
performance_optimizer
|
|
46
|
-
)
|
|
47
|
-
from .polars_integration import (
|
|
48
|
-
PolarsIntegrationLayer,
|
|
49
|
-
optimize_conversion,
|
|
50
|
-
enhance_result,
|
|
51
|
-
optimize_context,
|
|
52
|
-
apply_expression,
|
|
53
|
-
optimize_memory,
|
|
54
|
-
validate_compatibility,
|
|
55
|
-
get_integration_stats,
|
|
56
|
-
cleanup_integration,
|
|
57
|
-
benchmark_integration
|
|
58
|
-
)
|
|
9
|
+
from additory.synthetic.synthesizer import synthetic
|
|
59
10
|
|
|
60
11
|
__all__ = [
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
'register_distribution_engine',
|
|
64
|
-
'unregister_distribution_engine',
|
|
65
|
-
'list_custom_distribution_engines',
|
|
66
|
-
'SyntheticDataError',
|
|
67
|
-
'PatternResolutionError',
|
|
68
|
-
'ValidationError',
|
|
69
|
-
'DistributionError',
|
|
70
|
-
'FileFormatError',
|
|
71
|
-
'PatternImportError',
|
|
72
|
-
'SchemaParsingError',
|
|
73
|
-
'PatternHierarchyResolver',
|
|
74
|
-
'ResolutionTrace',
|
|
75
|
-
'PatternResolutionResult',
|
|
76
|
-
'DistributionEngine',
|
|
77
|
-
'DistributionEngineFactory',
|
|
78
|
-
'DistributionManager',
|
|
79
|
-
'DistributionConfig',
|
|
80
|
-
'RegexGenerator',
|
|
81
|
-
'PolarsGeneratorCore',
|
|
82
|
-
'OutputConverter',
|
|
83
|
-
'SyntheticDataGenerator',
|
|
84
|
-
'GenerationConfig',
|
|
85
|
-
'PerformanceMonitor',
|
|
86
|
-
'PerformanceOptimizer',
|
|
87
|
-
'PerformanceMetrics',
|
|
88
|
-
'PerformanceComparison',
|
|
89
|
-
'performance_monitor',
|
|
90
|
-
'performance_optimizer',
|
|
91
|
-
'PolarsIntegrationLayer',
|
|
92
|
-
'optimize_conversion',
|
|
93
|
-
'enhance_result',
|
|
94
|
-
'optimize_context',
|
|
95
|
-
'apply_expression',
|
|
96
|
-
'optimize_memory',
|
|
97
|
-
'validate_compatibility',
|
|
98
|
-
'get_integration_stats',
|
|
99
|
-
'cleanup_integration',
|
|
100
|
-
'benchmark_integration'
|
|
101
|
-
]
|
|
12
|
+
"synthetic"
|
|
13
|
+
]
|