additory 0.1.0a2__py3-none-any.whl → 0.1.0a3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/polars_expression_engine.py +66 -16
  8. additory/dynamic_api.py +42 -46
  9. additory/expressions/proxy.py +4 -1
  10. additory/synthetic/__init__.py +7 -95
  11. additory/synthetic/column_name_resolver.py +149 -0
  12. additory/{augment → synthetic}/distributions.py +2 -2
  13. additory/{augment → synthetic}/forecast.py +1 -1
  14. additory/synthetic/linked_list_parser.py +415 -0
  15. additory/synthetic/namespace_lookup.py +129 -0
  16. additory/{augment → synthetic}/smote.py +1 -1
  17. additory/{augment → synthetic}/strategies.py +11 -44
  18. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  19. additory/utilities/units.py +4 -1
  20. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/METADATA +10 -17
  21. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
  22. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
  23. additory/augment/__init__.py +0 -24
  24. additory/augment/builtin_lists.py +0 -430
  25. additory/augment/list_registry.py +0 -177
  26. additory/synthetic/api.py +0 -220
  27. additory/synthetic/common_integration.py +0 -314
  28. additory/synthetic/config.py +0 -262
  29. additory/synthetic/engines.py +0 -529
  30. additory/synthetic/exceptions.py +0 -180
  31. additory/synthetic/file_managers.py +0 -518
  32. additory/synthetic/generator.py +0 -702
  33. additory/synthetic/generator_parser.py +0 -68
  34. additory/synthetic/integration.py +0 -319
  35. additory/synthetic/models.py +0 -241
  36. additory/synthetic/pattern_resolver.py +0 -573
  37. additory/synthetic/performance.py +0 -469
  38. additory/synthetic/polars_integration.py +0 -464
  39. additory/synthetic/proxy.py +0 -60
  40. additory/synthetic/schema_parser.py +0 -685
  41. additory/synthetic/validator.py +0 -553
  42. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
  43. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,129 @@
1
+ """
2
+ Namespace Lookup for Linked Lists
3
+
4
+ Finds Python variables in the caller's namespace using frame inspection.
5
+ """
6
+
7
+ import inspect
8
+ from typing import Any
9
+
10
+ from additory.common.exceptions import ValidationError
11
+
12
+
13
def lookup_variable_in_namespace(var_name: str, depth: int = 2) -> Any:
    """
    Resolve a variable by name from a calling frame's namespace.

    Starting at this function's own frame, steps ``depth`` frames up the
    call stack and looks the name up in that frame's locals first and
    its globals second.

    Args:
        var_name: Name of the variable to find
        depth: Number of frames to step back from this function's frame
            (default: 2). 1 inspects the direct caller, 2 the caller's
            caller, and so on.

    Returns:
        The object bound to ``var_name`` in the target frame

    Raises:
        ValidationError: If the call stack is shallower than ``depth``,
            or the name is bound in neither the locals nor the globals
            of the target frame

    Examples:
        >>> # In user code:
        >>> AE_CM = [["Headache", ["Aspirin"]]]
        >>> df = add.synthetic('@new', strategy={'col1': 'lists@AE_CM'})
        >>> # a lookup of 'AE_CM' at the matching depth returns the list
    """
    try:
        # currentframe() must be called right here: frames are counted
        # from this function, so wrapping the call would shift every depth.
        frame = inspect.currentframe()

        for _hop in range(depth):
            if frame is None:
                raise ValidationError(
                    f"Cannot access caller's namespace (frame depth {depth})"
                )
            frame = frame.f_back

        if frame is None:
            raise ValidationError(
                "Cannot access caller's namespace (frame is None)"
            )

        # Locals shadow globals, so they are consulted first.
        for scope in (frame.f_locals, frame.f_globals):
            if var_name in scope:
                return scope[var_name]

        # Nothing matched: explain how the variable is expected to be set up.
        not_found_message = (
            f"Variable '{var_name}' not found in namespace.\n"
            f"Make sure '{var_name}' is defined before calling synthetic().\n"
            f"\n"
            f"Example:\n"
            f" {var_name} = [['Headache', ['Aspirin'], ['mild']]]\n"
            f" df = add.synthetic('@new', strategy={{'col1': 'lists@{var_name}'}})\n"
            f"\n"
            f"Note: Linked lists must be defined in the same scope (cell/function) "
            f"as the synthetic() call."
        )
        raise ValidationError(not_found_message)

    finally:
        # Drop the frame reference so no reference cycle is left behind.
        del frame
79
+
80
+
81
def validate_linked_list_variable(var_value: Any, var_name: str) -> None:
    """
    Check that a looked-up value has the outer shape of a linked list.

    Only two things are verified here: the value is a ``list`` and it is
    not empty. The contents of the individual rows are not inspected.

    Args:
        var_value: Value of the variable
        var_name: Name of the variable (used only in error messages)

    Raises:
        ValidationError: If the value is not a list, or is an empty list
    """
    if not isinstance(var_value, list):
        wrong_type_message = (
            f"Variable '{var_name}' must be a list. "
            f"Got: {type(var_value).__name__}\n"
            f"\n"
            f"Expected format:\n"
            f" {var_name} = [\n"
            f" ['Headache', ['Aspirin', 'Ibuprofen'], ['mild', 'moderate']],\n"
            f" ['Nausea', ['Ondansetron'], ['severe']]\n"
            f" ]"
        )
        raise ValidationError(wrong_type_message)

    if not var_value:
        empty_message = (
            f"Variable '{var_name}' is an empty list. "
            f"Linked list must contain at least one row."
        )
        raise ValidationError(empty_message)
109
+
110
+
111
def lookup_linked_list(var_name: str, depth: int = 2) -> Any:
    """
    Fetch a linked list variable from a calling frame and validate it.

    Thin convenience wrapper: resolves the name with
    ``lookup_variable_in_namespace`` and then runs
    ``validate_linked_list_variable`` on the result.

    Args:
        var_name: Name of the variable to find
        depth: Number of frames to go back. Passed through unchanged, so
            frames are counted from inside ``lookup_variable_in_namespace``
            and this wrapper's own frame counts as one of them.

    Returns:
        Linked list data

    Raises:
        ValidationError: If variable not found or invalid
    """
    # Keep this call directly in the function body: an extra frame in
    # between would change which namespace `depth` points at.
    linked_list = lookup_variable_in_namespace(var_name, depth)
    validate_linked_list_variable(linked_list, var_name)
    return linked_list
@@ -1,5 +1,5 @@
1
1
  """
2
- SMOTE (Synthetic Minority Over-sampling Technique) for Data Augmentation
2
+ SMOTE (Synthetic Minority Over-sampling Technique) for Synthetic Data Generation
3
3
 
4
4
  Provides imbalanced data handling strategies:
5
5
  - SMOTE: Generate synthetic samples for minority class
@@ -1,11 +1,10 @@
1
1
  """
2
- Strategy handlers for data augmentation
2
+ Strategy handlers for synthetic data generation
3
3
 
4
4
  Provides different strategies for generating synthetic data:
5
5
  - auto: Random sampling from existing values
6
6
  - increment: Increment numeric or pattern-based values
7
7
  - choice:[...]: Random selection from inline list
8
- - choice_list:name: Random selection from registered/built-in list
9
8
  """
10
9
 
11
10
  import re
@@ -13,7 +12,6 @@ import random
13
12
  from typing import Any, Dict, List, Optional, Tuple
14
13
 
15
14
  from additory.common.exceptions import ValidationError, AugmentError
16
- from additory.augment.list_registry import get_list
17
15
 
18
16
 
19
17
  def parse_strategy_params(strategy_spec: str) -> Tuple[str, Dict[str, Any]]:
@@ -319,7 +317,7 @@ def apply_increment_strategy(
319
317
  Apply increment strategy to a column (Polars-only).
320
318
 
321
319
  Supports two modes:
322
- 1. Augment mode: Increment from last value in df_polars
320
+ 1. Extend mode: Increment from last value in df_polars
323
321
  2. Create mode: Start from specified value (requires params with 'start')
324
322
 
325
323
  Args:
@@ -336,7 +334,7 @@ def apply_increment_strategy(
336
334
  ValidationError: If strategy cannot be applied
337
335
 
338
336
  Examples:
339
- # Augment mode (with DataFrame)
337
+ # Extend mode (with DataFrame)
340
338
  >>> apply_increment_strategy(df, "id", "increment", 5)
341
339
  [11, 12, 13, 14, 15] # if last value was 10
342
340
 
@@ -349,7 +347,7 @@ def apply_increment_strategy(
349
347
  ... {"start": 1, "pattern": "EMP_[001]"})
350
348
  ["EMP_001", "EMP_002", "EMP_003"]
351
349
  """
352
- # Determine mode: augment (has df) or create (no df)
350
+ # Determine mode: extend (has df) or create (no df)
353
351
  is_create_mode = df_polars is None
354
352
 
355
353
  if is_create_mode:
@@ -402,7 +400,7 @@ def apply_increment_strategy(
402
400
  return new_values
403
401
 
404
402
  else:
405
- # Augment mode: use existing logic
403
+ # Extend mode: use existing logic
406
404
  # Parse the strategy
407
405
  pattern, regex_pattern = parse_increment_strategy(strategy_spec)
408
406
 
@@ -483,12 +481,11 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
483
481
  Args:
484
482
  strategy_spec: Strategy string like:
485
483
  - "choice:[value1,value2,value3]"
486
- - "choice_list:banks"
487
484
 
488
485
  Returns:
489
486
  Tuple of (strategy_type, values)
490
- - strategy_type: "choice" or "choice_list"
491
- - values: List of values (for choice) or None (for choice_list)
487
+ - strategy_type: "choice"
488
+ - values: List of values
492
489
 
493
490
  Raises:
494
491
  ValidationError: If strategy format is invalid
@@ -496,9 +493,6 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
496
493
  Examples:
497
494
  >>> parse_choice_strategy("choice:[Active,Inactive,Pending]")
498
495
  ("choice", ["Active", "Inactive", "Pending"])
499
-
500
- >>> parse_choice_strategy("choice_list:banks")
501
- ("choice_list", None)
502
496
  """
503
497
  if strategy_spec.startswith("choice:["):
504
498
  # Inline list: choice:[value1,value2,value3]
@@ -526,22 +520,10 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
526
520
 
527
521
  return "choice", values
528
522
 
529
- elif strategy_spec.startswith("choice_list:"):
530
- # Named list: choice_list:banks
531
- list_name = strategy_spec[len("choice_list:"):].strip()
532
-
533
- if not list_name:
534
- raise ValidationError(
535
- f"Invalid choice_list strategy: {strategy_spec}. "
536
- "Must be in format: choice_list:list_name"
537
- )
538
-
539
- return "choice_list", list_name
540
-
541
523
  else:
542
524
  raise ValidationError(
543
525
  f"Invalid choice strategy: {strategy_spec}. "
544
- "Must start with 'choice:[' or 'choice_list:'"
526
+ "Must start with 'choice:['"
545
527
  )
546
528
 
547
529
 
@@ -609,22 +591,7 @@ def apply_choice_strategy(
609
591
  ValidationError: If strategy cannot be applied
610
592
  """
611
593
  # Parse the strategy
612
- strategy_type, values_or_name = parse_choice_strategy(strategy_spec)
613
-
614
- # Get the actual values list
615
- if strategy_type == "choice":
616
- values = values_or_name
617
- elif strategy_type == "choice_list":
618
- # Resolve list name to actual list
619
- list_name = values_or_name
620
- try:
621
- values = get_list(list_name)
622
- except ValidationError as e:
623
- raise ValidationError(
624
- f"Cannot apply choice_list strategy: {e}"
625
- )
626
- else:
627
- raise ValidationError(f"Unknown choice strategy type: {strategy_type}")
594
+ strategy_type, values = parse_choice_strategy(strategy_spec)
628
595
 
629
596
  # Generate random selections
630
597
  if seed is not None:
@@ -675,7 +642,7 @@ def apply_forecast_strategy(
675
642
  >>> apply_forecast_strategy(df, "sales", "forecast:seasonal:period=12", 24)
676
643
  [98.5, 102.3, 95.8, ...]
677
644
  """
678
- from additory.augment.forecast import forecast_values, ForecastMethod
645
+ from additory.synthetic.forecast import forecast_values, ForecastMethod
679
646
 
680
647
  # Parse strategy: forecast:method:param1=val1:param2=val2
681
648
  parts = strategy_spec.split(":")
@@ -845,7 +812,7 @@ def apply_smote_strategy(
845
812
  >>> apply_smote_strategy(df, ["feature1", "feature2"], "smote:k=5", 100)
846
813
  {"feature1": [1.2, 3.4, ...], "feature2": [5.6, 7.8, ...]}
847
814
  """
848
- from additory.augment.smote import generate_smote_values
815
+ from additory.synthetic.smote import generate_smote_values
849
816
 
850
817
  # Parse strategy: smote:k=5
851
818
  parts = strategy_spec.split(":")
@@ -18,7 +18,7 @@ from additory.common.backend import detect_backend, to_polars, from_polars
18
18
  from additory.common.exceptions import ValidationError, AugmentError
19
19
  from additory.common.validation import validate_dataframe
20
20
  from additory.common.sample_data import get_sample_dataset
21
- from additory.augment.strategies import (
21
+ from additory.synthetic.strategies import (
22
22
  parse_strategy_dict,
23
23
  get_column_strategy,
24
24
  apply_increment_strategy,
@@ -27,6 +27,14 @@ from additory.augment.strategies import (
27
27
  parse_strategy_params
28
28
  )
29
29
 
30
+ # Linked lists feature imports
31
+ from additory.synthetic.namespace_lookup import lookup_linked_list
32
+ from additory.synthetic.linked_list_parser import (
33
+ parse_linked_list,
34
+ generate_linked_list_data
35
+ )
36
+ from additory.synthetic.column_name_resolver import resolve_column_names
37
+
30
38
 
31
39
  def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
32
40
  """
@@ -36,7 +44,7 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
36
44
  - increment (with start parameter)
37
45
  - range
38
46
  - choice
39
- - choice_list
47
+ - lists (inline linked lists)
40
48
 
41
49
  Augmentative strategies require existing data:
42
50
  - auto (random sampling)
@@ -61,6 +69,10 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
61
69
  # Get the base strategy name (before any parameters)
62
70
  strategy_name = strategy_spec.split(":")[0].strip()
63
71
 
72
+ # Handle lists@ pattern
73
+ if strategy_name.startswith("lists@"):
74
+ continue # Valid generative strategy
75
+
64
76
  if strategy_name in augmentative_strategies:
65
77
  invalid_columns.append((col, strategy_name))
66
78
 
@@ -76,7 +88,7 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
76
88
  error_lines.append(" - increment (with start parameter)")
77
89
  error_lines.append(" - range:min-max")
78
90
  error_lines.append(" - choice:[value1,value2,...]")
79
- error_lines.append(" - choice_list:list_name")
91
+ error_lines.append(" - lists@variable_name (inline linked lists)")
80
92
 
81
93
  raise ValidationError("\n".join(error_lines))
82
94
 
@@ -249,7 +261,7 @@ def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str,
249
261
  new_data[col] = new_values
250
262
  elif col_strategy.startswith("forecast"):
251
263
  # Import here to avoid circular dependency
252
- from additory.augment.strategies import apply_forecast_strategy
264
+ from additory.synthetic.strategies import apply_forecast_strategy
253
265
 
254
266
  # Generate forecasted values
255
267
  new_values = apply_forecast_strategy(
@@ -265,7 +277,7 @@ def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str,
265
277
  new_data[col] = new_values
266
278
  elif col_strategy.startswith(("normal", "uniform", "skewed_left", "skewed_right", "beta", "gamma", "exponential", "kde")):
267
279
  # Import here to avoid circular dependency
268
- from additory.augment.strategies import apply_distribution_strategy
280
+ from additory.synthetic.strategies import apply_distribution_strategy
269
281
 
270
282
  # Generate distribution values
271
283
  new_values = apply_distribution_strategy(
@@ -310,7 +322,6 @@ def _create_from_scratch_engine(
310
322
  - increment (with start parameter)
311
323
  - range
312
324
  - choice
313
- - choice_list
314
325
 
315
326
  Augmentative strategies (NOT supported):
316
327
  - auto (requires existing data)
@@ -349,23 +360,62 @@ def _create_from_scratch_engine(
349
360
  ... "id": "increment:start=1",
350
361
  ... "emp_id": "increment:start=1:pattern=EMP_[001]",
351
362
  ... "age": "range:18-65",
352
- ... "status": "choice:[Active,Inactive]",
353
- ... "department": "choice_list:departments"
363
+ ... "status": "choice:[Active,Inactive,Pending]"
354
364
  ... },
355
365
  ... seed=42
356
366
  ... )
357
367
  >>> result.shape
358
- (100, 5)
368
+ (100, 4)
359
369
  """
360
370
  import polars as pl
361
371
 
362
372
  # Validate all strategies are generative
363
373
  _validate_generative_strategies(strategy_dict)
364
374
 
375
+ # Pre-process linked lists strategies
376
+ # Linked lists generate multiple columns, so we need to expand strategy_dict
377
+ expanded_strategy_dict = {}
378
+ lists_to_process = [] # Store (original_key, var_name, parsed_data, column_names)
379
+
380
+ for col, col_strategy in strategy_dict.items():
381
+ if col == "__default__":
382
+ continue
383
+
384
+ # Check for lists@ pattern
385
+ if col_strategy.startswith("lists@"):
386
+ # Extract variable name
387
+ var_name = col_strategy[6:].strip() # Remove "lists@" prefix
388
+
389
+ try:
390
+ # Lookup variable in namespace
391
+ # Depth=5: user -> add.synthetic (API) -> synthetic() -> _create_from_scratch_engine -> here
392
+ linked_list_data = lookup_linked_list(var_name, depth=5)
393
+
394
+ # Parse linked list
395
+ parsed_data = parse_linked_list(linked_list_data)
396
+
397
+ # Resolve column names
398
+ column_names = resolve_column_names(
399
+ list_name=var_name,
400
+ strategy_key=col,
401
+ num_columns=parsed_data['num_columns'],
402
+ explicit_names=parsed_data['column_names']
403
+ )
404
+
405
+ # Store for later processing
406
+ lists_to_process.append((col, var_name, parsed_data, column_names))
407
+
408
+ except ValidationError as e:
409
+ raise ValidationError(f"Linked list error for column '{col}': {e}")
410
+ else:
411
+ # Regular strategy - keep as is
412
+ expanded_strategy_dict[col] = col_strategy
413
+
365
414
  # Build data column by column
366
415
  new_data = {}
367
416
 
368
- for col, col_strategy in strategy_dict.items():
417
+ # Process regular strategies first
418
+ for col, col_strategy in expanded_strategy_dict.items():
369
419
  if col == "__default__":
370
420
  continue
371
421
 
@@ -414,13 +464,24 @@ def _create_from_scratch_engine(
414
464
  f"Unknown or unsupported strategy for column '{col}': '{col_strategy}'"
415
465
  )
416
466
 
467
+ # Process linked lists strategies
468
+ for original_key, var_name, parsed_data, column_names in lists_to_process:
469
+ # Generate data rows
470
+ data_rows = generate_linked_list_data(parsed_data, n_rows, seed)
471
+
472
+ # Transpose: list of tuples -> dict of lists
473
+ # data_rows = [(val1_col1, val1_col2), (val2_col1, val2_col2), ...]
474
+ # -> {col1: [val1_col1, val2_col1, ...], col2: [val1_col2, val2_col2, ...]}
475
+ for col_idx, col_name in enumerate(column_names):
476
+ new_data[col_name] = [row[col_idx] for row in data_rows]
477
+
417
478
  # Build Polars DataFrame from generated columns
418
479
  result = pl.DataFrame(new_data)
419
480
 
420
481
  return result
421
482
 
422
483
 
423
- def augment(
484
+ def synthetic(
424
485
  df: Any,
425
486
  n_rows: Union[int, str] = 5,
426
487
  strategy: Union[str, Dict[str, str]] = "auto",
@@ -428,12 +489,12 @@ def augment(
428
489
  output_format: str = "pandas"
429
490
  ) -> Any:
430
491
  """
431
- Augment a dataframe by adding synthetic rows based on existing data.
492
+ Generate synthetic data by extending a dataframe or creating from scratch.
432
493
 
433
494
  Uses Polars-only architecture:
434
495
  1. Detect input format (pandas/polars/cuDF)
435
496
  2. Convert to Polars via Arrow bridge (if needed)
436
- 3. Process augmentation in Polars
497
+ 3. Process synthetic data generation in Polars
437
498
  4. Convert back to original format via Arrow bridge
438
499
 
439
500
  This function adds new rows to a dataframe using various strategies:
@@ -441,7 +502,7 @@ def augment(
441
502
  - "increment": Increment numeric or pattern-based values
442
503
  - "range:min-max": Random integers within range
443
504
  - "choice:[...]": Random selection from inline list
444
- - "choice_list:name": Random selection from registered/built-in list
505
+ - "lists@variable_name": Inline linked lists (generates multiple columns)
445
506
  - "forecast:method": Time series forecasting (linear, polynomial, exponential, seasonal)
446
507
  - "normal": Normal distribution generation
447
508
  - "uniform": Uniform distribution generation
@@ -465,7 +526,6 @@ def augment(
465
526
  "emp_id": "increment:EMP_[001]_ID",
466
527
  "age": "range:18-65",
467
528
  "status": "choice:[Active,Inactive,Pending]",
468
- "bank": "choice_list:banks",
469
529
  "sales": "forecast:seasonal:period=12",
470
530
  "score": "normal:mean=75:std=10",
471
531
  "income": "skewed_right:skewness=1.5"
@@ -255,7 +255,10 @@ class UnitConverter:
255
255
  """Unit conversion system with Polars processing"""
256
256
 
257
257
  def __init__(self):
258
- self.arrow_bridge = EnhancedArrowBridge()
258
+ try:
259
+ self.arrow_bridge = EnhancedArrowBridge()
260
+ except ArrowBridgeError:
261
+ self.arrow_bridge = None
259
262
  self.conversion_stats = {
260
263
  "total_conversions": 0,
261
264
  "successful_conversions": 0,
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: additory
3
- Version: 0.1.0a2
4
- Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.
3
+ Version: 0.1.0a3
4
+ Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.
5
5
  Author: Krishnamoorthy Sankaran
6
6
  License: MIT
7
7
  Project-URL: homepage, https://github.com/sekarkrishna/additory
@@ -13,6 +13,7 @@ Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: pandas>=1.5
15
15
  Requires-Dist: polars>=0.20
16
+ Requires-Dist: pyarrow>=10.0
16
17
  Requires-Dist: pyyaml>=6.0
17
18
  Requires-Dist: requests>=2.31
18
19
  Requires-Dist: toml>=0.10
@@ -34,11 +35,11 @@ Dynamic: license-file
34
35
 
35
36
  # Additory
36
37
 
37
- **A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.**
38
+ **A semantic, extensible dataframe transformation engine with expressions, lookup, and augmentation support.**
38
39
 
39
40
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
40
41
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
41
- [![Version](https://img.shields.io/badge/version-0.1.0a1-orange.svg)](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
42
+ [![Version](https://img.shields.io/badge/version-0.1.0a2-orange.svg)](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
42
43
 
43
44
  **Author:** Krishnamoorthy Sankaran
44
45
 
@@ -51,17 +52,17 @@ Dynamic: license-file
51
52
  ## 📦 Installation
52
53
 
53
54
  ```bash
54
- pip install additory==0.1.0a1
55
+ pip install additory==0.1.0a2
55
56
  ```
56
57
 
57
58
  **Optional GPU support:**
58
59
  ```bash
59
- pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
60
+ pip install additory[gpu]==0.1.0a2 # Includes cuDF for GPU acceleration
60
61
  ```
61
62
 
62
63
  **Development installation:**
63
64
  ```bash
64
- pip install additory[dev]==0.1.0a1 # Includes testing and development tools
65
+ pip install additory[dev]==0.1.0a2 # Includes testing and development tools
65
66
  ```
66
67
 
67
68
  ## 🎯 Core Functions
@@ -70,7 +71,6 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
70
71
  |----------|---------|---------|
71
72
  | `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
72
73
  | `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
73
- | `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
74
74
  | `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
75
75
 
76
76
  ## 🧬 Available Expressions
@@ -193,13 +193,9 @@ patients_with_bsa = add.bsa(patients)
193
193
  result = add.fitness_score(add.bmr(add.bmi(patients)))
194
194
  ```
195
195
 
196
- ### 🔄 Augment and Synthetic Data
196
+ ### 🔄 Augment Data Generation
197
197
 
198
- **Augment** generates more data similar to your existing dataset, while **Synthetic** creates entirely new datasets from schema definitions.
199
-
200
- **Key Differences:**
201
- - **Augment**: Learns patterns from existing data to create similar rows
202
- - **Synthetic**: Uses predefined schemas to generate structured data
198
+ **Augment** generates additional data similar to your existing dataset using inline strategies.
203
199
 
204
200
  ```python
205
201
  # Augment existing data (learns from patterns)
@@ -211,9 +207,6 @@ new_data = add.augment("@new", n_rows=500, strategy={
211
207
  'name': 'choice:[John,Jane,Bob]',
212
208
  'age': 'range:18-65'
213
209
  })
214
-
215
- # Generate from schema file (structured approach)
216
- customers = add.synth("customer_schema.toml", rows=10000)
217
210
  ```
218
211
 
219
212
  ## 🧪 Examples