additory 0.1.0a2__py3-none-any.whl → 0.1.0a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/config.py +3 -3
  8. additory/core/polars_expression_engine.py +66 -16
  9. additory/core/registry.py +4 -3
  10. additory/dynamic_api.py +95 -51
  11. additory/expressions/proxy.py +4 -1
  12. additory/expressions/registry.py +3 -3
  13. additory/synthetic/__init__.py +7 -95
  14. additory/synthetic/column_name_resolver.py +149 -0
  15. additory/synthetic/deduce.py +259 -0
  16. additory/{augment → synthetic}/distributions.py +2 -2
  17. additory/{augment → synthetic}/forecast.py +1 -1
  18. additory/synthetic/linked_list_parser.py +415 -0
  19. additory/synthetic/namespace_lookup.py +129 -0
  20. additory/{augment → synthetic}/smote.py +1 -1
  21. additory/{augment → synthetic}/strategies.py +87 -44
  22. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  23. additory/utilities/units.py +4 -1
  24. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
  25. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
  26. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
  27. additory/augment/__init__.py +0 -24
  28. additory/augment/builtin_lists.py +0 -430
  29. additory/augment/list_registry.py +0 -177
  30. additory/synthetic/api.py +0 -220
  31. additory/synthetic/common_integration.py +0 -314
  32. additory/synthetic/config.py +0 -262
  33. additory/synthetic/engines.py +0 -529
  34. additory/synthetic/exceptions.py +0 -180
  35. additory/synthetic/file_managers.py +0 -518
  36. additory/synthetic/generator.py +0 -702
  37. additory/synthetic/generator_parser.py +0 -68
  38. additory/synthetic/integration.py +0 -319
  39. additory/synthetic/models.py +0 -241
  40. additory/synthetic/pattern_resolver.py +0 -573
  41. additory/synthetic/performance.py +0 -469
  42. additory/synthetic/polars_integration.py +0 -464
  43. additory/synthetic/proxy.py +0 -60
  44. additory/synthetic/schema_parser.py +0 -685
  45. additory/synthetic/validator.py +0 -553
  46. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
  47. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
@@ -1,11 +1,10 @@
1
1
  """
2
- Strategy handlers for data augmentation
2
+ Strategy handlers for synthetic data generation
3
3
 
4
4
  Provides different strategies for generating synthetic data:
5
5
  - auto: Random sampling from existing values
6
6
  - increment: Increment numeric or pattern-based values
7
7
  - choice:[...]: Random selection from inline list
8
- - choice_list:name: Random selection from registered/built-in list
9
8
  """
10
9
 
11
10
  import re
@@ -13,7 +12,6 @@ import random
13
12
  from typing import Any, Dict, List, Optional, Tuple
14
13
 
15
14
  from additory.common.exceptions import ValidationError, AugmentError
16
- from additory.augment.list_registry import get_list
17
15
 
18
16
 
19
17
  def parse_strategy_params(strategy_spec: str) -> Tuple[str, Dict[str, Any]]:
@@ -319,7 +317,7 @@ def apply_increment_strategy(
319
317
  Apply increment strategy to a column (Polars-only).
320
318
 
321
319
  Supports two modes:
322
- 1. Augment mode: Increment from last value in df_polars
320
+ 1. Extend mode: Increment from last value in df_polars
323
321
  2. Create mode: Start from specified value (requires params with 'start')
324
322
 
325
323
  Args:
@@ -336,7 +334,7 @@ def apply_increment_strategy(
336
334
  ValidationError: If strategy cannot be applied
337
335
 
338
336
  Examples:
339
- # Augment mode (with DataFrame)
337
+ # Extend mode (with DataFrame)
340
338
  >>> apply_increment_strategy(df, "id", "increment", 5)
341
339
  [11, 12, 13, 14, 15] # if last value was 10
342
340
 
@@ -349,7 +347,7 @@ def apply_increment_strategy(
349
347
  ... {"start": 1, "pattern": "EMP_[001]"})
350
348
  ["EMP_001", "EMP_002", "EMP_003"]
351
349
  """
352
- # Determine mode: augment (has df) or create (no df)
350
+ # Determine mode: extend (has df) or create (no df)
353
351
  is_create_mode = df_polars is None
354
352
 
355
353
  if is_create_mode:
@@ -402,7 +400,7 @@ def apply_increment_strategy(
402
400
  return new_values
403
401
 
404
402
  else:
405
- # Augment mode: use existing logic
403
+ # Extend mode: use existing logic
406
404
  # Parse the strategy
407
405
  pattern, regex_pattern = parse_increment_strategy(strategy_spec)
408
406
 
@@ -483,12 +481,11 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
483
481
  Args:
484
482
  strategy_spec: Strategy string like:
485
483
  - "choice:[value1,value2,value3]"
486
- - "choice_list:banks"
487
484
 
488
485
  Returns:
489
486
  Tuple of (strategy_type, values)
490
- - strategy_type: "choice" or "choice_list"
491
- - values: List of values (for choice) or None (for choice_list)
487
+ - strategy_type: "choice"
488
+ - values: List of values
492
489
 
493
490
  Raises:
494
491
  ValidationError: If strategy format is invalid
@@ -496,9 +493,6 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
496
493
  Examples:
497
494
  >>> parse_choice_strategy("choice:[Active,Inactive,Pending]")
498
495
  ("choice", ["Active", "Inactive", "Pending"])
499
-
500
- >>> parse_choice_strategy("choice_list:banks")
501
- ("choice_list", None)
502
496
  """
503
497
  if strategy_spec.startswith("choice:["):
504
498
  # Inline list: choice:[value1,value2,value3]
@@ -526,22 +520,10 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
526
520
 
527
521
  return "choice", values
528
522
 
529
- elif strategy_spec.startswith("choice_list:"):
530
- # Named list: choice_list:banks
531
- list_name = strategy_spec[len("choice_list:"):].strip()
532
-
533
- if not list_name:
534
- raise ValidationError(
535
- f"Invalid choice_list strategy: {strategy_spec}. "
536
- "Must be in format: choice_list:list_name"
537
- )
538
-
539
- return "choice_list", list_name
540
-
541
523
  else:
542
524
  raise ValidationError(
543
525
  f"Invalid choice strategy: {strategy_spec}. "
544
- "Must start with 'choice:[' or 'choice_list:'"
526
+ "Must start with 'choice:['"
545
527
  )
546
528
 
547
529
 
@@ -609,22 +591,7 @@ def apply_choice_strategy(
609
591
  ValidationError: If strategy cannot be applied
610
592
  """
611
593
  # Parse the strategy
612
- strategy_type, values_or_name = parse_choice_strategy(strategy_spec)
613
-
614
- # Get the actual values list
615
- if strategy_type == "choice":
616
- values = values_or_name
617
- elif strategy_type == "choice_list":
618
- # Resolve list name to actual list
619
- list_name = values_or_name
620
- try:
621
- values = get_list(list_name)
622
- except ValidationError as e:
623
- raise ValidationError(
624
- f"Cannot apply choice_list strategy: {e}"
625
- )
626
- else:
627
- raise ValidationError(f"Unknown choice strategy type: {strategy_type}")
594
+ strategy_type, values = parse_choice_strategy(strategy_spec)
628
595
 
629
596
  # Generate random selections
630
597
  if seed is not None:
@@ -675,7 +642,7 @@ def apply_forecast_strategy(
675
642
  >>> apply_forecast_strategy(df, "sales", "forecast:seasonal:period=12", 24)
676
643
  [98.5, 102.3, 95.8, ...]
677
644
  """
678
- from additory.augment.forecast import forecast_values, ForecastMethod
645
+ from additory.synthetic.forecast import forecast_values, ForecastMethod
679
646
 
680
647
  # Parse strategy: forecast:method:param1=val1:param2=val2
681
648
  parts = strategy_spec.split(":")
@@ -845,7 +812,7 @@ def apply_smote_strategy(
845
812
  >>> apply_smote_strategy(df, ["feature1", "feature2"], "smote:k=5", 100)
846
813
  {"feature1": [1.2, 3.4, ...], "feature2": [5.6, 7.8, ...]}
847
814
  """
848
- from additory.augment.smote import generate_smote_values
815
+ from additory.synthetic.smote import generate_smote_values
849
816
 
850
817
  # Parse strategy: smote:k=5
851
818
  parts = strategy_spec.split(":")
@@ -881,3 +848,79 @@ def apply_smote_strategy(
881
848
  )
882
849
  except Exception as e:
883
850
  raise ValidationError(f"SMOTE strategy failed: {e}")
851
+
852
+
853
def parse_deduce_strategy(strategy_spec: str) -> Tuple[str, List[str]]:
    """
    Parse deduce strategy specification.

    Blank entries in a bracketed list (for example from a trailing or
    doubled comma) are ignored, so an empty column name is never returned.

    Args:
        strategy_spec: Strategy string like:
            - "deduce:comment"
            - "deduce:[comment, notes]"

    Returns:
        Tuple of (strategy_type, source_columns)
        - strategy_type: "deduce"
        - source_columns: List of source column names (never contains "")

    Raises:
        ValidationError: If strategy format is invalid, or if no usable
            column name is left after parsing

    Examples:
        >>> parse_deduce_strategy("deduce:comment")
        ("deduce", ["comment"])

        >>> parse_deduce_strategy("deduce:[comment, notes]")
        ("deduce", ["comment", "notes"])

        >>> parse_deduce_strategy("deduce:[comment, notes,]")
        ("deduce", ["comment", "notes"])
    """
    prefix = "deduce:"

    if not strategy_spec.startswith(prefix):
        raise ValidationError(
            f"Invalid deduce strategy: {strategy_spec}. "
            "Must start with 'deduce:'"
        )

    # Everything after the "deduce:" prefix names the source column(s)
    source_spec = strategy_spec[len(prefix):].strip()

    if not source_spec:
        raise ValidationError(
            f"Deduce strategy requires source column(s): {strategy_spec}. "
            "Format: 'deduce:column' or 'deduce:[col1, col2]'"
        )

    # Single column: anything that is not wrapped in [...]
    if not (source_spec.startswith("[") and source_spec.endswith("]")):
        return "deduce", [source_spec]

    # Multiple columns: deduce:[col1, col2]
    columns_str = source_spec[1:-1]  # Remove brackets

    if not columns_str.strip():
        raise ValidationError(
            f"Deduce column list cannot be empty: {strategy_spec}"
        )

    # str.split(",") always yields at least one element, so blank pieces
    # must be filtered out explicitly; otherwise "[a,,b]" or "[a, ]" would
    # produce empty column names and the check below could never fire.
    columns = [
        name
        for name in (piece.strip() for piece in columns_str.split(","))
        if name
    ]

    if not columns:
        raise ValidationError(
            f"Deduce strategy must specify at least one column: {strategy_spec}"
        )

    return "deduce", columns
914
+
915
+
916
def is_deduce_strategy(strategy_spec: str) -> bool:
    """
    Tell whether a strategy specification selects the deduce strategy.

    Args:
        strategy_spec: Strategy value to inspect; non-string values are
            accepted and simply reported as not being a deduce strategy

    Returns:
        True when the value is a string beginning with "deduce:",
        False otherwise
    """
    # Guard first so that non-string values never reach .startswith()
    if not isinstance(strategy_spec, str):
        return False
    return strategy_spec.startswith("deduce:")
@@ -18,7 +18,7 @@ from additory.common.backend import detect_backend, to_polars, from_polars
18
18
  from additory.common.exceptions import ValidationError, AugmentError
19
19
  from additory.common.validation import validate_dataframe
20
20
  from additory.common.sample_data import get_sample_dataset
21
- from additory.augment.strategies import (
21
+ from additory.synthetic.strategies import (
22
22
  parse_strategy_dict,
23
23
  get_column_strategy,
24
24
  apply_increment_strategy,
@@ -27,6 +27,14 @@ from additory.augment.strategies import (
27
27
  parse_strategy_params
28
28
  )
29
29
 
30
+ # Linked lists feature imports
31
+ from additory.synthetic.namespace_lookup import lookup_linked_list
32
+ from additory.synthetic.linked_list_parser import (
33
+ parse_linked_list,
34
+ generate_linked_list_data
35
+ )
36
+ from additory.synthetic.column_name_resolver import resolve_column_names
37
+
30
38
 
31
39
  def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
32
40
  """
@@ -36,7 +44,7 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
36
44
  - increment (with start parameter)
37
45
  - range
38
46
  - choice
39
- - choice_list
47
+ - lists (inline linked lists)
40
48
 
41
49
  Augmentative strategies require existing data:
42
50
  - auto (random sampling)
@@ -61,6 +69,10 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
61
69
  # Get the base strategy name (before any parameters)
62
70
  strategy_name = strategy_spec.split(":")[0].strip()
63
71
 
72
+ # Handle lists@ pattern
73
+ if strategy_name.startswith("lists@"):
74
+ continue # Valid generative strategy
75
+
64
76
  if strategy_name in augmentative_strategies:
65
77
  invalid_columns.append((col, strategy_name))
66
78
 
@@ -76,7 +88,7 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
76
88
  error_lines.append(" - increment (with start parameter)")
77
89
  error_lines.append(" - range:min-max")
78
90
  error_lines.append(" - choice:[value1,value2,...]")
79
- error_lines.append(" - choice_list:list_name")
91
+ error_lines.append(" - lists@variable_name (inline linked lists)")
80
92
 
81
93
  raise ValidationError("\n".join(error_lines))
82
94
 
@@ -249,7 +261,7 @@ def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str,
249
261
  new_data[col] = new_values
250
262
  elif col_strategy.startswith("forecast"):
251
263
  # Import here to avoid circular dependency
252
- from additory.augment.strategies import apply_forecast_strategy
264
+ from additory.synthetic.strategies import apply_forecast_strategy
253
265
 
254
266
  # Generate forecasted values
255
267
  new_values = apply_forecast_strategy(
@@ -265,7 +277,7 @@ def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str,
265
277
  new_data[col] = new_values
266
278
  elif col_strategy.startswith(("normal", "uniform", "skewed_left", "skewed_right", "beta", "gamma", "exponential", "kde")):
267
279
  # Import here to avoid circular dependency
268
- from additory.augment.strategies import apply_distribution_strategy
280
+ from additory.synthetic.strategies import apply_distribution_strategy
269
281
 
270
282
  # Generate distribution values
271
283
  new_values = apply_distribution_strategy(
@@ -310,7 +322,6 @@ def _create_from_scratch_engine(
310
322
  - increment (with start parameter)
311
323
  - range
312
324
  - choice
313
- - choice_list
314
325
 
315
326
  Augmentative strategies (NOT supported):
316
327
  - auto (requires existing data)
@@ -349,23 +360,62 @@ def _create_from_scratch_engine(
349
360
  ... "id": "increment:start=1",
350
361
  ... "emp_id": "increment:start=1:pattern=EMP_[001]",
351
362
  ... "age": "range:18-65",
352
- ... "status": "choice:[Active,Inactive]",
353
- ... "department": "choice_list:departments"
363
+ ... "status": "choice:[Active,Inactive,Pending]"
354
364
  ... },
355
365
  ... seed=42
356
366
  ... )
357
367
  >>> result.shape
358
- (100, 5)
368
+ (100, 4)
359
369
  """
360
370
  import polars as pl
361
371
 
362
372
  # Validate all strategies are generative
363
373
  _validate_generative_strategies(strategy_dict)
364
374
 
375
+ # Pre-process linked lists strategies
376
+ # Linked lists generate multiple columns, so we need to expand strategy_dict
377
+ expanded_strategy_dict = {}
378
+ lists_to_process = [] # Store (original_key, var_name, parsed_data, column_names)
379
+
380
+ for col, col_strategy in strategy_dict.items():
381
+ if col == "__default__":
382
+ continue
383
+
384
+ # Check for lists@ pattern
385
+ if col_strategy.startswith("lists@"):
386
+ # Extract variable name
387
+ var_name = col_strategy[6:].strip() # Remove "lists@" prefix
388
+
389
+ try:
390
+ # Lookup variable in namespace
391
+ # Depth=5: user -> add.synthetic (API) -> synthetic() -> _create_from_scratch_engine -> here
392
+ linked_list_data = lookup_linked_list(var_name, depth=5)
393
+
394
+ # Parse linked list
395
+ parsed_data = parse_linked_list(linked_list_data)
396
+
397
+ # Resolve column names
398
+ column_names = resolve_column_names(
399
+ list_name=var_name,
400
+ strategy_key=col,
401
+ num_columns=parsed_data['num_columns'],
402
+ explicit_names=parsed_data['column_names']
403
+ )
404
+
405
+ # Store for later processing
406
+ lists_to_process.append((col, var_name, parsed_data, column_names))
407
+
408
+ except ValidationError as e:
409
+ raise ValidationError(f"Linked list error for column '{col}': {e}")
410
+ else:
411
+ # Regular strategy - keep as is
412
+ expanded_strategy_dict[col] = col_strategy
413
+
365
414
  # Build data column by column
366
415
  new_data = {}
367
416
 
368
- for col, col_strategy in strategy_dict.items():
417
+ # Process regular strategies first
418
+ for col, col_strategy in expanded_strategy_dict.items():
369
419
  if col == "__default__":
370
420
  continue
371
421
 
@@ -414,13 +464,24 @@ def _create_from_scratch_engine(
414
464
  f"Unknown or unsupported strategy for column '{col}': '{col_strategy}'"
415
465
  )
416
466
 
467
+ # Process linked lists strategies
468
+ for original_key, var_name, parsed_data, column_names in lists_to_process:
469
+ # Generate data rows
470
+ data_rows = generate_linked_list_data(parsed_data, n_rows, seed)
471
+
472
+ # Transpose: list of tuples -> dict of lists
473
+ # data_rows = [(val1_col1, val1_col2), (val2_col1, val2_col2), ...]
474
+ # -> {col1: [val1_col1, val2_col1, ...], col2: [val1_col2, val2_col2, ...]}
475
+ for col_idx, col_name in enumerate(column_names):
476
+ new_data[col_name] = [row[col_idx] for row in data_rows]
477
+
417
478
  # Build Polars DataFrame from generated columns
418
479
  result = pl.DataFrame(new_data)
419
480
 
420
481
  return result
421
482
 
422
483
 
423
- def augment(
484
+ def synthetic(
424
485
  df: Any,
425
486
  n_rows: Union[int, str] = 5,
426
487
  strategy: Union[str, Dict[str, str]] = "auto",
@@ -428,12 +489,12 @@ def augment(
428
489
  output_format: str = "pandas"
429
490
  ) -> Any:
430
491
  """
431
- Augment a dataframe by adding synthetic rows based on existing data.
492
+ Generate synthetic data by extending a dataframe or creating from scratch.
432
493
 
433
494
  Uses Polars-only architecture:
434
495
  1. Detect input format (pandas/polars/cuDF)
435
496
  2. Convert to Polars via Arrow bridge (if needed)
436
- 3. Process augmentation in Polars
497
+ 3. Process synthetic data generation in Polars
437
498
  4. Convert back to original format via Arrow bridge
438
499
 
439
500
  This function adds new rows to a dataframe using various strategies:
@@ -441,7 +502,7 @@ def augment(
441
502
  - "increment": Increment numeric or pattern-based values
442
503
  - "range:min-max": Random integers within range
443
504
  - "choice:[...]": Random selection from inline list
444
- - "choice_list:name": Random selection from registered/built-in list
505
+ - "lists@variable_name": Inline linked lists (generates multiple columns)
445
506
  - "forecast:method": Time series forecasting (linear, polynomial, exponential, seasonal)
446
507
  - "normal": Normal distribution generation
447
508
  - "uniform": Uniform distribution generation
@@ -465,7 +526,6 @@ def augment(
465
526
  "emp_id": "increment:EMP_[001]_ID",
466
527
  "age": "range:18-65",
467
528
  "status": "choice:[Active,Inactive,Pending]",
468
- "bank": "choice_list:banks",
469
529
  "sales": "forecast:seasonal:period=12",
470
530
  "score": "normal:mean=75:std=10",
471
531
  "income": "skewed_right:skewness=1.5"
@@ -255,7 +255,10 @@ class UnitConverter:
255
255
  """Unit conversion system with Polars processing"""
256
256
 
257
257
  def __init__(self):
258
- self.arrow_bridge = EnhancedArrowBridge()
258
+ try:
259
+ self.arrow_bridge = EnhancedArrowBridge()
260
+ except ArrowBridgeError:
261
+ self.arrow_bridge = None
259
262
  self.conversion_stats = {
260
263
  "total_conversions": 0,
261
264
  "successful_conversions": 0,
@@ -1,7 +1,7 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: additory
3
- Version: 0.1.0a2
4
- Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.
3
+ Version: 0.1.0a4
4
+ Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.
5
5
  Author: Krishnamoorthy Sankaran
6
6
  License: MIT
7
7
  Project-URL: homepage, https://github.com/sekarkrishna/additory
@@ -13,6 +13,7 @@ Description-Content-Type: text/markdown
13
13
  License-File: LICENSE
14
14
  Requires-Dist: pandas>=1.5
15
15
  Requires-Dist: polars>=0.20
16
+ Requires-Dist: pyarrow>=10.0
16
17
  Requires-Dist: pyyaml>=6.0
17
18
  Requires-Dist: requests>=2.31
18
19
  Requires-Dist: toml>=0.10
@@ -34,11 +35,11 @@ Dynamic: license-file
34
35
 
35
36
  # Additory
36
37
 
37
- **A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.**
38
+ **A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.**
38
39
 
39
40
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
40
41
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
41
- [![Version](https://img.shields.io/badge/version-0.1.0a1-orange.svg)](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
42
+ [![Version](https://img.shields.io/badge/version-0.1.0a4-orange.svg)](https://github.com/sekarkrishna/additory)
42
43
 
43
44
  **Author:** Krishnamoorthy Sankaran
44
45
 
@@ -51,17 +52,17 @@ Dynamic: license-file
51
52
  ## 📦 Installation
52
53
 
53
54
  ```bash
54
- pip install additory==0.1.0a1
55
+ pip install additory==0.1.0a4
55
56
  ```
56
57
 
57
58
  **Optional GPU support:**
58
59
  ```bash
59
- pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
60
+ pip install additory[gpu]==0.1.0a4 # Includes cuDF for GPU acceleration
60
61
  ```
61
62
 
62
63
  **Development installation:**
63
64
  ```bash
64
- pip install additory[dev]==0.1.0a1 # Includes testing and development tools
65
+ pip install additory[dev]==0.1.0a4 # Includes testing and development tools
65
66
  ```
66
67
 
67
68
  ## 🎯 Core Functions
@@ -69,8 +70,8 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
69
70
  | Function | Purpose | Example |
70
71
  |----------|---------|---------|
71
72
  | `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
72
- | `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
73
- | `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
73
+ | `add.synthetic()` | Generate additional data | `add.synthetic(df, n_rows=1000)` |
74
+ | `add.deduce()` | Text-based label deduction | `add.deduce(df, from_column='text', to_column='label')` |
74
75
  | `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
75
76
 
76
77
  ## 🧬 Available Expressions
@@ -119,7 +120,7 @@ import additory as add
119
120
 
120
121
  # Works with polars
121
122
  df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
122
- result = add.augment(df_polars, n_rows=100)
123
+ result = add.synthetic(df_polars, n_rows=100)
123
124
 
124
125
  # Automatic type detection and conversion
125
126
  ```
@@ -193,27 +194,42 @@ patients_with_bsa = add.bsa(patients)
193
194
  result = add.fitness_score(add.bmr(add.bmi(patients)))
194
195
  ```
195
196
 
196
- ### 🔄 Augment and Synthetic Data
197
+ ### 🔄 Synthetic Data Generation
197
198
 
198
- **Augment** generates more data similar to your existing dataset, while **Synthetic** creates entirely new datasets from schema definitions.
199
-
200
- **Key Differences:**
201
- - **Augment**: Learns patterns from existing data to create similar rows
202
- - **Synthetic**: Uses predefined schemas to generate structured data
199
+ **Synthetic** generates additional data similar to your existing dataset using inline strategies.
203
200
 
204
201
  ```python
205
- # Augment existing data (learns from patterns)
206
- more_customers = add.augment(customers, n_rows=1000)
202
+ # Extend existing data (learns from patterns)
203
+ more_customers = add.synthetic(customers, n_rows=1000)
207
204
 
208
205
  # Create data from scratch with strategies
209
- new_data = add.augment("@new", n_rows=500, strategy={
206
+ new_data = add.synthetic("@new", n_rows=500, strategy={
210
207
  'id': 'increment:start=1',
211
208
  'name': 'choice:[John,Jane,Bob]',
212
209
  'age': 'range:18-65'
213
210
  })
211
+ ```
212
+
213
+ ### 🤖 Text-Based Label Deduction
214
214
 
215
- # Generate from schema file (structured approach)
216
- customers = add.synth("customer_schema.toml", rows=10000)
215
+ **Deduce** automatically fills in missing labels by learning from your existing labeled examples. Pure Python, no LLMs, offline-first.
216
+
217
+ ```python
218
+ # Deduce missing labels from text
219
+ tickets = pd.DataFrame({
220
+ "ticket_text": ["Cannot log in", "Billing question", "App crashes", "Need invoice"],
221
+ "category": ["Technical", "Billing", None, None]
222
+ })
223
+
224
+ # Automatically fill in missing categories
225
+ result = add.deduce(tickets, from_column="ticket_text", to_column="category")
226
+
227
+ # Use multiple columns for better accuracy
228
+ result = add.deduce(
229
+ df,
230
+ from_column=["title", "description"],
231
+ to_column="category"
232
+ )
217
233
  ```
218
234
 
219
235
  ## 🧪 Examples
@@ -231,7 +247,7 @@ customers = pd.DataFrame({
231
247
  })
232
248
 
233
249
  # Generate more customers
234
- customers = add.augment(customers, n_rows=10000)
250
+ customers = add.synthetic(customers, n_rows=10000)
235
251
 
236
252
  # Add customer tiers
237
253
  tiers = pd.DataFrame({
@@ -257,7 +273,7 @@ strategy = {
257
273
  'height_cm': 'range:150-200' # Height in cm
258
274
  }
259
275
 
260
- patients = add.augment("@new", n_rows=1000, strategy=strategy)
276
+ patients = add.synthetic("@new", n_rows=1000, strategy=strategy)
261
277
 
262
278
  # Convert height to meters for expressions
263
279
  patients['height_m'] = patients['height_cm'] / 100
@@ -272,19 +288,19 @@ print(result.correlations)
272
288
 
273
289
  ## 📚 Documentation
274
290
 
275
- - **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/)** - Detailed guides for each function
276
- - **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/expressions.html)** - Complete expressions reference
291
+ - **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Detailed guides for each function
292
+ - **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Complete expressions reference
277
293
 
278
294
  ## 📄 License
279
295
 
280
- MIT License - see [LICENSE](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/LICENSE) file for details.
296
+ MIT License - see [LICENSE](LICENSE) file for details.
281
297
 
282
298
  ## 📞 Support
283
299
 
284
300
  - **Issues**: [GitHub Issues](https://github.com/sekarkrishna/additory/issues)
285
- - **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0)
301
+ - **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)
286
302
 
287
- ## 🗺️ v0.1.1 (February 2025)
303
+ ## 🗺️ v0.1.1 (January 2026)
288
304
  - Enhanced documentation and tutorials
289
305
  - Performance optimizations
290
306
  - Additional expressions