additory-0.1.0a2-py3-none-any.whl → additory-0.1.0a4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/config.py +3 -3
  8. additory/core/polars_expression_engine.py +66 -16
  9. additory/core/registry.py +4 -3
  10. additory/dynamic_api.py +95 -51
  11. additory/expressions/proxy.py +4 -1
  12. additory/expressions/registry.py +3 -3
  13. additory/synthetic/__init__.py +7 -95
  14. additory/synthetic/column_name_resolver.py +149 -0
  15. additory/synthetic/deduce.py +259 -0
  16. additory/{augment → synthetic}/distributions.py +2 -2
  17. additory/{augment → synthetic}/forecast.py +1 -1
  18. additory/synthetic/linked_list_parser.py +415 -0
  19. additory/synthetic/namespace_lookup.py +129 -0
  20. additory/{augment → synthetic}/smote.py +1 -1
  21. additory/{augment → synthetic}/strategies.py +87 -44
  22. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  23. additory/utilities/units.py +4 -1
  24. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
  25. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
  26. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
  27. additory/augment/__init__.py +0 -24
  28. additory/augment/builtin_lists.py +0 -430
  29. additory/augment/list_registry.py +0 -177
  30. additory/synthetic/api.py +0 -220
  31. additory/synthetic/common_integration.py +0 -314
  32. additory/synthetic/config.py +0 -262
  33. additory/synthetic/engines.py +0 -529
  34. additory/synthetic/exceptions.py +0 -180
  35. additory/synthetic/file_managers.py +0 -518
  36. additory/synthetic/generator.py +0 -702
  37. additory/synthetic/generator_parser.py +0 -68
  38. additory/synthetic/integration.py +0 -319
  39. additory/synthetic/models.py +0 -241
  40. additory/synthetic/pattern_resolver.py +0 -573
  41. additory/synthetic/performance.py +0 -469
  42. additory/synthetic/polars_integration.py +0 -464
  43. additory/synthetic/proxy.py +0 -60
  44. additory/synthetic/schema_parser.py +0 -685
  45. additory/synthetic/validator.py +0 -553
  46. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
  47. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
additory/__init__.py CHANGED
@@ -2,6 +2,9 @@
2
2
 
3
3
  from .dynamic_api import add as _api_instance
4
4
 
5
+ # Version information
6
+ __version__ = "0.1.0a4"
7
+
5
8
  # Expose the API instance normally
6
9
  add = _api_instance
7
10
 
@@ -12,4 +15,5 @@ def __getattr__(name):
12
15
 
13
16
  __all__ = [
14
17
  "add",
18
+ "__version__",
15
19
  ]
@@ -1,14 +1,14 @@
1
1
  """
2
2
  Common Utilities Module
3
3
 
4
- Shared functionality used by both augment and synthetic modules:
4
+ Shared functionality used by both synthetic and expressions modules:
5
5
  - Distribution functions (normal, uniform, skewed, etc.)
6
6
  - List file management (.list format)
7
7
  - Pattern file management (.properties format)
8
8
  - Fallback resolution logic
9
9
 
10
10
  This module eliminates code duplication and provides consistent behavior
11
- across augment and synthetic data generation.
11
+ across synthetic and expression data generation.
12
12
  """
13
13
 
14
14
  from .distributions import (
@@ -180,11 +180,14 @@ def get_arrow_bridge():
180
180
  - Use for all cross-backend conversions
181
181
  - Handles pandas/polars/cuDF via Arrow
182
182
  """
183
- from additory.core.backends.arrow_bridge import EnhancedArrowBridge
183
+ from additory.core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
184
184
 
185
185
  # Singleton pattern
186
186
  if not hasattr(get_arrow_bridge, '_instance'):
187
- get_arrow_bridge._instance = EnhancedArrowBridge()
187
+ try:
188
+ get_arrow_bridge._instance = EnhancedArrowBridge()
189
+ except ArrowBridgeError:
190
+ get_arrow_bridge._instance = None
188
191
 
189
192
  return get_arrow_bridge._instance
190
193
 
@@ -194,7 +197,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
194
197
  Convert any dataframe to Polars via Arrow bridge.
195
198
 
196
199
  This is the primary conversion function for the Polars-only architecture.
197
- All operations (expressions, augment, etc.) use this to convert input
200
+ All operations (expressions, synthetic, etc.) use this to convert input
198
201
  dataframes to Polars for processing.
199
202
 
200
203
  Args:
@@ -224,7 +227,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
224
227
  )
225
228
 
226
229
  # Fast path: already Polars
227
- if isinstance(df, pl.DataFrame):
230
+ if HAS_POLARS and isinstance(df, pl.DataFrame):
228
231
  return df
229
232
 
230
233
  # Validate input
@@ -240,6 +243,13 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
240
243
  # Convert via Arrow bridge
241
244
  try:
242
245
  bridge = get_arrow_bridge()
246
+ if bridge is None:
247
+ # Fallback: direct conversion for pandas
248
+ if backend_type == "pandas":
249
+ if isinstance(df, pd.DataFrame):
250
+ return pl.from_pandas(df)
251
+ raise RuntimeError("Arrow bridge not available and cannot convert non-pandas DataFrame")
252
+
243
253
  arrow_table = bridge.to_arrow(df, backend_type)
244
254
  pl_df = bridge.from_arrow(arrow_table, "polars")
245
255
  return pl_df
@@ -309,6 +319,12 @@ def from_polars(pl_df: 'pl.DataFrame', target_backend: BackendType) -> Any:
309
319
  # Convert via Arrow bridge
310
320
  try:
311
321
  bridge = get_arrow_bridge()
322
+ if bridge is None:
323
+ # Fallback: direct conversion for pandas
324
+ if target_backend == "pandas":
325
+ return pl_df.to_pandas()
326
+ raise RuntimeError("Arrow bridge not available and cannot convert to non-pandas DataFrame")
327
+
312
328
  arrow_table = bridge.to_arrow(pl_df, "polars")
313
329
  result_df = bridge.from_arrow(arrow_table, target_backend)
314
330
  return result_df
@@ -1,5 +1,5 @@
1
1
  """
2
- Distribution Strategies for Data Augmentation
2
+ Distribution Strategies for Synthetic Data Generation
3
3
 
4
4
  Provides statistical distribution-based data generation:
5
5
  - Normal (Gaussian) distribution
@@ -8,8 +8,8 @@ loaded on-demand using the existing .add file parser.
8
8
  Usage:
9
9
  from additory.common.sample_data import get_sample_dataset
10
10
 
11
- # For augment
12
- df = get_sample_dataset("augment", "sample")
11
+ # For synthetic
12
+ df = get_sample_dataset("synthetic", "sample")
13
13
 
14
14
  # For expressions (future)
15
15
  df = get_sample_dataset("expressions", "sample")
@@ -25,7 +25,7 @@ from additory.common.exceptions import ValidationError
25
25
 
26
26
 
27
27
  def get_sample_dataset(
28
- module: str = "augment",
28
+ module: str = "synthetic",
29
29
  block: str = "sample",
30
30
  dataset_type: str = "clean"
31
31
  ) -> pl.DataFrame:
@@ -33,12 +33,12 @@ def get_sample_dataset(
33
33
  Load a sample dataset from .add files.
34
34
 
35
35
  This function provides centralized access to sample datasets across
36
- all additory modules (augment, expressions, utilities). Sample datasets
36
+ all additory modules (synthetic, expressions, utilities). Sample datasets
37
37
  are stored as .add files in the reference/ directory structure.
38
38
 
39
39
  Args:
40
- module: Module name ("augment", "expressions", "utilities")
41
- block: Block name within the .add file ("sample" for augment)
40
+ module: Module name ("synthetic", "expressions", "utilities")
41
+ block: Block name within the .add file ("sample" for synthetic)
42
42
  dataset_type: Type of sample data ("clean" or "unclean")
43
43
 
44
44
  Returns:
@@ -48,8 +48,8 @@ def get_sample_dataset(
48
48
  ValidationError: If module, block, or dataset_type not found
49
49
 
50
50
  Examples:
51
- >>> # Load augment sample dataset
52
- >>> df = get_sample_dataset("augment", "sample")
51
+ >>> # Load synthetic sample dataset
52
+ >>> df = get_sample_dataset("synthetic", "sample")
53
53
  >>> print(df.shape)
54
54
  (50, 10)
55
55
 
@@ -57,7 +57,7 @@ def get_sample_dataset(
57
57
  >>> df = get_sample_dataset("expressions", "sample", "clean")
58
58
  >>> df_unclean = get_sample_dataset("expressions", "sample", "unclean")
59
59
 
60
- Sample Dataset Structure (augment):
60
+ Sample Dataset Structure (synthetic):
61
61
  - id: Sequential numeric IDs (1-50)
62
62
  - emp_id: Employee IDs with pattern (EMP_001 - EMP_050)
63
63
  - order_id: Order IDs with different padding (ORD_0001 - ORD_0050)
@@ -72,8 +72,8 @@ def get_sample_dataset(
72
72
  # Construct path to .add file
73
73
  base_path = Path(__file__).parent.parent.parent / "reference"
74
74
 
75
- if module == "augment":
76
- add_file_path = base_path / "augment_definitions" / f"{block}_0.1.add"
75
+ if module == "synthetic":
76
+ add_file_path = base_path / "synthetic_definitions" / f"{block}_0.1.add"
77
77
  elif module == "expressions":
78
78
  add_file_path = base_path / "expressions_definitions" / f"{block}_0.1.add"
79
79
  elif module == "utilities":
@@ -81,7 +81,7 @@ def get_sample_dataset(
81
81
  else:
82
82
  raise ValidationError(
83
83
  f"Unknown module '{module}'. "
84
- f"Valid modules: augment, expressions, utilities"
84
+ f"Valid modules: synthetic, expressions, utilities"
85
85
  )
86
86
 
87
87
  # Check if file exists
@@ -141,7 +141,7 @@ def list_available_samples() -> dict:
141
141
  >>> samples = list_available_samples()
142
142
  >>> print(samples)
143
143
  {
144
- 'augment': ['sample'],
144
+ 'synthetic': ['sample'],
145
145
  'expressions': ['sample'],
146
146
  'utilities': []
147
147
  }
@@ -149,15 +149,15 @@ def list_available_samples() -> dict:
149
149
  base_path = Path(__file__).parent.parent.parent / "reference"
150
150
  available = {}
151
151
 
152
- # Check augment
153
- augment_path = base_path / "augment_definitions"
154
- if augment_path.exists():
155
- available['augment'] = [
152
+ # Check synthetic
153
+ synthetic_path = base_path / "synthetic_definitions"
154
+ if synthetic_path.exists():
155
+ available['synthetic'] = [
156
156
  f.stem.rsplit('_', 1)[0] # Remove version suffix
157
- for f in augment_path.glob("*.add")
157
+ for f in synthetic_path.glob("*.add")
158
158
  ]
159
159
  else:
160
- available['augment'] = []
160
+ available['synthetic'] = []
161
161
 
162
162
  # Check expressions
163
163
  expressions_path = base_path / "expressions_definitions"
@@ -16,6 +16,13 @@ try:
16
16
  except ImportError as e:
17
17
  ARROW_AVAILABLE = False
18
18
  IMPORT_ERROR = str(e)
19
+ # Create dummy classes for type annotations
20
+ class pa:
21
+ Table = Any
22
+ class pl:
23
+ DataFrame = Any
24
+ class pd:
25
+ DataFrame = Any
19
26
 
20
27
  from ..logging import log_info, log_warning
21
28
  from .cudf_bridge import get_cudf_bridge
additory/core/config.py CHANGED
@@ -329,14 +329,14 @@ def set_custom_formula_path(path):
329
329
 
330
330
  # backend preference setting
331
331
 
332
- _backend_preference: str | None = None # "cpu", "gpu", or None
332
+ _backend_preference: Optional[str] = None # "cpu", "gpu", or None
333
333
 
334
- def set_backend_preference(mode: str | None):
334
+ def set_backend_preference(mode: Optional[str]):
335
335
  global _backend_preference
336
336
  if mode not in (None, "cpu", "gpu"):
337
337
  raise ValueError("backend must be 'cpu', 'gpu', or None")
338
338
  _backend_preference = mode
339
339
 
340
- def get_backend_preference() -> str | None:
340
+ def get_backend_preference() -> Optional[str]:
341
341
  return _backend_preference
342
342
 
@@ -32,7 +32,10 @@ class PolarsExpressionEngine:
32
32
  """Exclusive Polars-based expression processing engine"""
33
33
 
34
34
  def __init__(self):
35
- self.arrow_bridge = EnhancedArrowBridge()
35
+ try:
36
+ self.arrow_bridge = EnhancedArrowBridge()
37
+ except ArrowBridgeError:
38
+ self.arrow_bridge = None
36
39
  self.execution_stats = {
37
40
  "total_executions": 0,
38
41
  "total_time_ms": 0.0,
@@ -68,14 +71,28 @@ class PolarsExpressionEngine:
68
71
  try:
69
72
  # Auto-detect backend if not specified
70
73
  if backend_type is None:
71
- backend_type = self.arrow_bridge.detect_backend(df)
74
+ if self.arrow_bridge:
75
+ backend_type = self.arrow_bridge.detect_backend(df)
76
+ else:
77
+ backend_type = "pandas" # fallback
72
78
 
73
79
  # Get memory usage before processing
74
- memory_before = self.arrow_bridge._get_memory_usage_mb()
80
+ if self.arrow_bridge:
81
+ memory_before = self.arrow_bridge._get_memory_usage_mb()
82
+ else:
83
+ memory_before = 0
75
84
 
76
85
  # 1. Convert input to Arrow
77
86
  log_info(f"[polars_engine] Converting {backend_type} to Arrow")
78
- arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
87
+ if self.arrow_bridge:
88
+ arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
89
+ else:
90
+ # Fallback: assume pandas and convert directly
91
+ import pandas as pd
92
+ if isinstance(df, pd.DataFrame):
93
+ arrow_table = pl.from_pandas(df).to_arrow()
94
+ else:
95
+ raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
79
96
 
80
97
  # 2. Convert Arrow to Polars
81
98
  log_info("[polars_engine] Converting Arrow to Polars")
@@ -93,11 +110,18 @@ class PolarsExpressionEngine:
93
110
 
94
111
  # 5. Convert to original backend format
95
112
  log_info(f"[polars_engine] Converting Arrow to {backend_type}")
96
- final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
113
+ if self.arrow_bridge:
114
+ final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
115
+ else:
116
+ # Fallback: convert back to pandas
117
+ final_result = pl.from_arrow(result_arrow).to_pandas()
97
118
 
98
119
  # Calculate execution statistics
99
120
  execution_time = (datetime.now() - start_time).total_seconds() * 1000
100
- memory_after = self.arrow_bridge._get_memory_usage_mb()
121
+ if self.arrow_bridge:
122
+ memory_after = self.arrow_bridge._get_memory_usage_mb()
123
+ else:
124
+ memory_after = 0
101
125
  memory_used = max(0, memory_after - memory_before)
102
126
 
103
127
  # Update global statistics
@@ -122,7 +146,8 @@ class PolarsExpressionEngine:
122
146
 
123
147
  finally:
124
148
  # 6. Always cleanup Arrow memory
125
- self.arrow_bridge.cleanup_arrow_memory()
149
+ if self.arrow_bridge:
150
+ self.arrow_bridge.cleanup_arrow_memory()
126
151
 
127
152
  def _execute_polars_expression(self, polars_df: pl.DataFrame,
128
153
  expression: str, output_column: str) -> pl.DataFrame:
@@ -381,14 +406,28 @@ class PolarsExpressionEngine:
381
406
  try:
382
407
  # Auto-detect backend if not specified
383
408
  if backend_type is None:
384
- backend_type = self.arrow_bridge.detect_backend(df)
409
+ if self.arrow_bridge:
410
+ backend_type = self.arrow_bridge.detect_backend(df)
411
+ else:
412
+ backend_type = "pandas"
385
413
 
386
414
  # Get memory usage before processing
387
- memory_before = self.arrow_bridge._get_memory_usage_mb()
415
+ if self.arrow_bridge:
416
+ memory_before = self.arrow_bridge._get_memory_usage_mb()
417
+ else:
418
+ memory_before = 0
388
419
 
389
420
  # Convert to Polars via Arrow
390
- arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
391
- polars_df = pl.from_arrow(arrow_table)
421
+ if self.arrow_bridge:
422
+ arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
423
+ polars_df = pl.from_arrow(arrow_table)
424
+ else:
425
+ # Fallback: assume pandas
426
+ import pandas as pd
427
+ if isinstance(df, pd.DataFrame):
428
+ polars_df = pl.from_pandas(df)
429
+ else:
430
+ raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
392
431
 
393
432
  # Execute using AST
394
433
  polars_expr = self._ast_to_polars_expr(ast_tree)
@@ -396,11 +435,17 @@ class PolarsExpressionEngine:
396
435
 
397
436
  # Convert back to original format
398
437
  result_arrow = result_df.to_arrow()
399
- final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
438
+ if self.arrow_bridge:
439
+ final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
440
+ else:
441
+ final_result = pl.from_arrow(result_arrow).to_pandas()
400
442
 
401
443
  # Calculate statistics
402
444
  execution_time = (datetime.now() - start_time).total_seconds() * 1000
403
- memory_after = self.arrow_bridge._get_memory_usage_mb()
445
+ if self.arrow_bridge:
446
+ memory_after = self.arrow_bridge._get_memory_usage_mb()
447
+ else:
448
+ memory_after = 0
404
449
  memory_used = max(0, memory_after - memory_before)
405
450
 
406
451
  # Update statistics
@@ -422,7 +467,8 @@ class PolarsExpressionEngine:
422
467
  raise PolarsExpressionError(f"AST execution failed: {e}")
423
468
 
424
469
  finally:
425
- self.arrow_bridge.cleanup_arrow_memory()
470
+ if self.arrow_bridge:
471
+ self.arrow_bridge.cleanup_arrow_memory()
426
472
 
427
473
  def validate_expression(self, expression: str) -> bool:
428
474
  """
@@ -489,7 +535,10 @@ class PolarsExpressionEngine:
489
535
  Benchmark results
490
536
  """
491
537
  times = []
492
- backend_type = self.arrow_bridge.detect_backend(df)
538
+ if self.arrow_bridge:
539
+ backend_type = self.arrow_bridge.detect_backend(df)
540
+ else:
541
+ backend_type = "pandas"
493
542
 
494
543
  for i in range(iterations):
495
544
  try:
@@ -532,7 +581,8 @@ class PolarsExpressionEngine:
532
581
  """Cleanup callback for memory manager"""
533
582
  try:
534
583
  # Cleanup Arrow bridge memory
535
- self.arrow_bridge.cleanup_arrow_memory()
584
+ if self.arrow_bridge:
585
+ self.arrow_bridge.cleanup_arrow_memory()
536
586
 
537
587
  # Reset statistics if they get too large
538
588
  if self.execution_stats["total_executions"] > 10000:
additory/core/registry.py CHANGED
@@ -2,6 +2,7 @@
2
2
  # Versioned registry for additory
3
3
 
4
4
  from dataclasses import dataclass
5
+ from typing import Optional
5
6
  import os
6
7
  import json
7
8
 
@@ -26,9 +27,9 @@ class ResolvedFormula:
26
27
  source: str
27
28
  version: str
28
29
  mode: str = "local"
29
- ast: dict | None = None
30
- sample_clean: dict | None = None
31
- sample_unclean: dict | None = None
30
+ ast: Optional[dict] = None
31
+ sample_clean: Optional[dict] = None
32
+ sample_unclean: Optional[dict] = None
32
33
 
33
34
 
34
35
  # ------------------------------------------------------------
additory/dynamic_api.py CHANGED
@@ -15,9 +15,8 @@ class AdditoryAPI(SimpleNamespace):
15
15
  Main API class for Additory functionality.
16
16
 
17
17
  Provides access to:
18
- - add.augment() - Data augmentation
18
+ - add.synthetic() - Synthetic data generation
19
19
  - add.to() - Lookup/join operations
20
- - add.synth() - Synthetic data generation
21
20
  - add.scan() - Data profiling and analysis
22
21
  - add.my - User expressions
23
22
  - add.play() - Hidden games (for the curious 😉)
@@ -31,8 +30,15 @@ class AdditoryAPI(SimpleNamespace):
31
30
  self.my = ExpressionProxy(namespace="user")
32
31
  self._builtin_proxy = ExpressionProxy(namespace="builtin")
33
32
 
34
- # Explicitly set the augment method to prevent namespace conflicts
35
- self.augment = self._augment_method
33
+ # Explicitly set methods to prevent namespace conflicts
34
+ self.synthetic = self._synthetic_method
35
+ self.deduce = self._deduce_method
36
+ self.to = self._to_method
37
+ self.onehotencoding = self._onehotencoding_method
38
+ self.harmonize_units = self._harmonize_units_method
39
+ self.scan = self._scan_method
40
+ self.games = self._games_method
41
+ self.play = self._play_method
36
42
 
37
43
  def __getattr__(self, name):
38
44
  """
@@ -61,65 +67,65 @@ class AdditoryAPI(SimpleNamespace):
61
67
  except Exception:
62
68
  return False
63
69
 
64
- def _augment_method(self, df, n_rows=5, strategy="auto", seed=None, output_format="pandas", **kwargs):
70
+ def _synthetic_method(self, df, n_rows=5, strategy="auto", seed=None, output_format="pandas", **kwargs):
65
71
  """
66
- Augment a dataframe with additional rows or create data from scratch.
72
+ Generate synthetic data by extending a dataframe or creating from scratch.
67
73
 
68
74
  Three modes:
69
- 1. Augment mode: Pass a DataFrame to add rows
75
+ 1. Extend mode: Pass a DataFrame to add synthetic rows
70
76
  2. Create mode: Pass "@new" to create data from scratch
71
77
  3. Sample mode: Pass "@sample" to load sample data
72
78
 
73
79
  Args:
74
- df: DataFrame to augment, "@new" to create, or "@sample" for sample data
75
- n_rows: Number of rows (int for create/sample, int/float/str for augment)
76
- strategy: Strategy specification (dict for create, str/dict for augment)
80
+ df: DataFrame to extend, "@new" to create, or "@sample" for sample data
81
+ n_rows: Number of rows (int for create/sample, int/float/str for extend)
82
+ strategy: Strategy specification (dict for create, str/dict for extend)
77
83
  seed: Random seed for reproducibility
78
84
  output_format: Output format ("pandas", "polars", "cudf")
79
85
  **kwargs: Additional parameters
80
86
 
81
87
  Returns:
82
- Augmented or generated DataFrame
88
+ Extended or generated DataFrame
83
89
 
84
90
  Examples:
85
- # Augment existing data
86
- result = add.augment(df, n_rows=100, strategy='auto')
91
+ # Extend existing data
92
+ result = add.synthetic(df, n_rows=100, strategy='auto')
87
93
 
88
94
  # Create from scratch
89
- result = add.augment("@new", n_rows=100, strategy={'id': 'increment', 'age': 'range:18-65'})
95
+ result = add.synthetic("@new", n_rows=100, strategy={'id': 'increment', 'age': 'range:18-65'})
90
96
 
91
97
  # Load sample data
92
- result = add.augment("@sample", n_rows=50)
98
+ result = add.synthetic("@sample", n_rows=50)
93
99
  """
94
100
  # Store reference to restore after import (in the correct namespace)
95
101
  import additory
96
- original_augment = getattr(additory, 'augment', None)
102
+ original_synthetic = getattr(additory, 'synthetic', None)
97
103
 
98
104
  try:
99
105
  # Import and call the implementation
100
- from additory.augment.augmentor import augment as augment_impl
101
- result = augment_impl(df, n_rows=n_rows, strategy=strategy, seed=seed,
102
- output_format=output_format, **kwargs)
106
+ from additory.synthetic.synthesizer import synthetic as synthetic_impl
107
+ result = synthetic_impl(df, n_rows=n_rows, strategy=strategy, seed=seed,
108
+ output_format=output_format, **kwargs)
103
109
 
104
110
  # Restore the method reference in the additory module namespace
105
- # The import above will have overridden additory.augment with the module
111
+ # The import above will have overridden additory.synthetic with the module
106
112
  # We need to restore it to point to this method
107
- if original_augment is not None:
108
- additory.augment = original_augment
113
+ if original_synthetic is not None:
114
+ additory.synthetic = original_synthetic
109
115
  else:
110
- # If there was no original augment, set it to this method
111
- additory.augment = self._augment_method
116
+ # If there was no original synthetic, set it to this method
117
+ additory.synthetic = self._synthetic_method
112
118
 
113
119
  return result
114
120
  except Exception as e:
115
121
  # Restore the method reference even if there's an error
116
- if original_augment is not None:
117
- additory.augment = original_augment
122
+ if original_synthetic is not None:
123
+ additory.synthetic = original_synthetic
118
124
  else:
119
- additory.augment = self._augment_method
125
+ additory.synthetic = self._synthetic_method
120
126
  raise
121
127
 
122
- def to(self, target_df, from_df=None, bring=None, against=None, **kwargs):
128
+ def _to_method(self, target_df, from_df=None, bring=None, against=None, **kwargs):
123
129
  """
124
130
  Add columns from reference dataframe to target dataframe.
125
131
 
@@ -140,26 +146,7 @@ class AdditoryAPI(SimpleNamespace):
140
146
  from additory.utilities.lookup import to
141
147
  return to(target_df, from_df, bring=bring, against=against, **kwargs)
142
148
 
143
- def synth(self, schema_path: str, rows: int = 1000, engine: Optional[str] = None):
144
- """
145
- Generate synthetic data from a schema file.
146
-
147
- Args:
148
- schema_path: Path to the .toml schema file
149
- rows: Number of rows to generate (default: 1000)
150
- engine: Output engine ("pandas" or "polars"). If None, uses default from config
151
-
152
- Returns:
153
- Generated DataFrame in the specified format
154
-
155
- Example:
156
- df = add.synth("customer.toml", rows=5000)
157
- df = add.synth("customer.toml", rows=5000, engine="polars")
158
- """
159
- from additory.synthetic.api import synth as synth_impl
160
- return synth_impl(schema_path, rows, engine)
161
-
162
- def onehotencoding(self, df, columns=None, **kwargs):
149
+ def _onehotencoding_method(self, df, columns=None, **kwargs):
163
150
  """
164
151
  One-hot encode categorical columns.
165
152
 
@@ -174,7 +161,7 @@ class AdditoryAPI(SimpleNamespace):
174
161
  from additory.utilities.encoding import onehotencoding
175
162
  return onehotencoding(df, column=columns, **kwargs)
176
163
 
177
- def harmonize_units(self, df, value_column, unit_column, target_unit=None, position="end", **kwargs):
164
+ def _harmonize_units_method(self, df, value_column, unit_column, target_unit=None, position="end", **kwargs):
178
165
  """
179
166
  Harmonize units in a dataframe.
180
167
 
@@ -196,7 +183,7 @@ class AdditoryAPI(SimpleNamespace):
196
183
  from additory.utilities.units import harmonize_units
197
184
  return harmonize_units(df, value_column, unit_column, target_unit, position, **kwargs)
198
185
 
199
- def scan(
186
+ def _scan_method(
200
187
  self,
201
188
  df: Union[pl.DataFrame, pd.DataFrame, Any],
202
189
  preset: Optional[str] = None,
@@ -279,7 +266,64 @@ class AdditoryAPI(SimpleNamespace):
279
266
  verbose=verbose
280
267
  )
281
268
 
282
- def play(self, game: str = "tictactoe"):
269
+ def _deduce_method(
270
+ self,
271
+ df: Union[pd.DataFrame, pl.DataFrame, Any],
272
+ from_column: Union[str, List[str]],
273
+ to_column: str
274
+ ) -> Union[pd.DataFrame, pl.DataFrame, Any]:
275
+ """
276
+ Deduce missing labels based on text similarity to labeled examples.
277
+
278
+ Uses cosine similarity on TF-IDF vectors. Pure Python, no LLMs, offline-first.
279
+ Requires at least 3 labeled examples to work.
280
+
281
+ When multiple source columns are provided, they are concatenated with
282
+ spaces before computing similarity.
283
+
284
+ Args:
285
+ df: DataFrame with some labeled and some unlabeled rows
286
+ from_column: Text column(s) to analyze
287
+ - str: Single column (e.g., "comment")
288
+ - List[str]: Multiple columns (e.g., ["comment", "notes"])
289
+ to_column: Label column to fill (e.g., "status")
290
+
291
+ Returns:
292
+ DataFrame with deduced labels filled in
293
+
294
+ Examples:
295
+ # Single column
296
+ >>> result = add.deduce(df, from_column="comment", to_column="status")
297
+
298
+ # Multiple columns (better accuracy)
299
+ >>> result = add.deduce(
300
+ ... df,
301
+ ... from_column=["comment", "notes", "description"],
302
+ ... to_column="status"
303
+ ... )
304
+
305
+ Privacy: Your data never leaves your machine. No external connections.
306
+ """
307
+ from additory.synthetic.deduce import deduce as deduce_impl
308
+ return deduce_impl(df, from_column, to_column)
309
+
310
+ def _games_method(self):
311
+ """
312
+ List available games! 🎮
313
+
314
+ Returns a list of games you can play with add.play().
315
+
316
+ Returns:
317
+ List of available game names
318
+
319
+ Example:
320
+ >>> import additory
321
+ >>> additory.add.games()
322
+ ['tictactoe', 'sudoku']
323
+ """
324
+ return ['tictactoe', 'sudoku']
325
+
326
+ def _play_method(self, game: str = "tictactoe"):
283
327
  """
284
328
  Play a game! 🎮
285
329
 
@@ -287,7 +287,10 @@ class EnhancedExpressionProxy:
287
287
  backend_type = "polars"
288
288
  else:
289
289
  # Try to detect other types
290
- backend_type = self.polars_engine.arrow_bridge.detect_backend(df)
290
+ if self.polars_engine.arrow_bridge:
291
+ backend_type = self.polars_engine.arrow_bridge.detect_backend(df)
292
+ else:
293
+ backend_type = "pandas" # fallback
291
294
 
292
295
  # Execute using Polars engine
293
296
  result = self.polars_engine.execute_expression(