additory 0.1.0a2__py3-none-any.whl → 0.1.0a4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/config.py +3 -3
- additory/core/polars_expression_engine.py +66 -16
- additory/core/registry.py +4 -3
- additory/dynamic_api.py +95 -51
- additory/expressions/proxy.py +4 -1
- additory/expressions/registry.py +3 -3
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/synthetic/deduce.py +259 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +87 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Strategy handlers for data
|
|
2
|
+
Strategy handlers for synthetic data generation
|
|
3
3
|
|
|
4
4
|
Provides different strategies for generating synthetic data:
|
|
5
5
|
- auto: Random sampling from existing values
|
|
6
6
|
- increment: Increment numeric or pattern-based values
|
|
7
7
|
- choice:[...]: Random selection from inline list
|
|
8
|
-
- choice_list:name: Random selection from registered/built-in list
|
|
9
8
|
"""
|
|
10
9
|
|
|
11
10
|
import re
|
|
@@ -13,7 +12,6 @@ import random
|
|
|
13
12
|
from typing import Any, Dict, List, Optional, Tuple
|
|
14
13
|
|
|
15
14
|
from additory.common.exceptions import ValidationError, AugmentError
|
|
16
|
-
from additory.augment.list_registry import get_list
|
|
17
15
|
|
|
18
16
|
|
|
19
17
|
def parse_strategy_params(strategy_spec: str) -> Tuple[str, Dict[str, Any]]:
|
|
@@ -319,7 +317,7 @@ def apply_increment_strategy(
|
|
|
319
317
|
Apply increment strategy to a column (Polars-only).
|
|
320
318
|
|
|
321
319
|
Supports two modes:
|
|
322
|
-
1.
|
|
320
|
+
1. Extend mode: Increment from last value in df_polars
|
|
323
321
|
2. Create mode: Start from specified value (requires params with 'start')
|
|
324
322
|
|
|
325
323
|
Args:
|
|
@@ -336,7 +334,7 @@ def apply_increment_strategy(
|
|
|
336
334
|
ValidationError: If strategy cannot be applied
|
|
337
335
|
|
|
338
336
|
Examples:
|
|
339
|
-
#
|
|
337
|
+
# Extend mode (with DataFrame)
|
|
340
338
|
>>> apply_increment_strategy(df, "id", "increment", 5)
|
|
341
339
|
[11, 12, 13, 14, 15] # if last value was 10
|
|
342
340
|
|
|
@@ -349,7 +347,7 @@ def apply_increment_strategy(
|
|
|
349
347
|
... {"start": 1, "pattern": "EMP_[001]"})
|
|
350
348
|
["EMP_001", "EMP_002", "EMP_003"]
|
|
351
349
|
"""
|
|
352
|
-
# Determine mode:
|
|
350
|
+
# Determine mode: extend (has df) or create (no df)
|
|
353
351
|
is_create_mode = df_polars is None
|
|
354
352
|
|
|
355
353
|
if is_create_mode:
|
|
@@ -402,7 +400,7 @@ def apply_increment_strategy(
|
|
|
402
400
|
return new_values
|
|
403
401
|
|
|
404
402
|
else:
|
|
405
|
-
#
|
|
403
|
+
# Extend mode: use existing logic
|
|
406
404
|
# Parse the strategy
|
|
407
405
|
pattern, regex_pattern = parse_increment_strategy(strategy_spec)
|
|
408
406
|
|
|
@@ -483,12 +481,11 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
|
|
|
483
481
|
Args:
|
|
484
482
|
strategy_spec: Strategy string like:
|
|
485
483
|
- "choice:[value1,value2,value3]"
|
|
486
|
-
- "choice_list:banks"
|
|
487
484
|
|
|
488
485
|
Returns:
|
|
489
486
|
Tuple of (strategy_type, values)
|
|
490
|
-
- strategy_type: "choice"
|
|
491
|
-
- values: List of values
|
|
487
|
+
- strategy_type: "choice"
|
|
488
|
+
- values: List of values
|
|
492
489
|
|
|
493
490
|
Raises:
|
|
494
491
|
ValidationError: If strategy format is invalid
|
|
@@ -496,9 +493,6 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
|
|
|
496
493
|
Examples:
|
|
497
494
|
>>> parse_choice_strategy("choice:[Active,Inactive,Pending]")
|
|
498
495
|
("choice", ["Active", "Inactive", "Pending"])
|
|
499
|
-
|
|
500
|
-
>>> parse_choice_strategy("choice_list:banks")
|
|
501
|
-
("choice_list", None)
|
|
502
496
|
"""
|
|
503
497
|
if strategy_spec.startswith("choice:["):
|
|
504
498
|
# Inline list: choice:[value1,value2,value3]
|
|
@@ -526,22 +520,10 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
|
|
|
526
520
|
|
|
527
521
|
return "choice", values
|
|
528
522
|
|
|
529
|
-
elif strategy_spec.startswith("choice_list:"):
|
|
530
|
-
# Named list: choice_list:banks
|
|
531
|
-
list_name = strategy_spec[len("choice_list:"):].strip()
|
|
532
|
-
|
|
533
|
-
if not list_name:
|
|
534
|
-
raise ValidationError(
|
|
535
|
-
f"Invalid choice_list strategy: {strategy_spec}. "
|
|
536
|
-
"Must be in format: choice_list:list_name"
|
|
537
|
-
)
|
|
538
|
-
|
|
539
|
-
return "choice_list", list_name
|
|
540
|
-
|
|
541
523
|
else:
|
|
542
524
|
raise ValidationError(
|
|
543
525
|
f"Invalid choice strategy: {strategy_spec}. "
|
|
544
|
-
"Must start with 'choice:['
|
|
526
|
+
"Must start with 'choice:['"
|
|
545
527
|
)
|
|
546
528
|
|
|
547
529
|
|
|
@@ -609,22 +591,7 @@ def apply_choice_strategy(
|
|
|
609
591
|
ValidationError: If strategy cannot be applied
|
|
610
592
|
"""
|
|
611
593
|
# Parse the strategy
|
|
612
|
-
strategy_type,
|
|
613
|
-
|
|
614
|
-
# Get the actual values list
|
|
615
|
-
if strategy_type == "choice":
|
|
616
|
-
values = values_or_name
|
|
617
|
-
elif strategy_type == "choice_list":
|
|
618
|
-
# Resolve list name to actual list
|
|
619
|
-
list_name = values_or_name
|
|
620
|
-
try:
|
|
621
|
-
values = get_list(list_name)
|
|
622
|
-
except ValidationError as e:
|
|
623
|
-
raise ValidationError(
|
|
624
|
-
f"Cannot apply choice_list strategy: {e}"
|
|
625
|
-
)
|
|
626
|
-
else:
|
|
627
|
-
raise ValidationError(f"Unknown choice strategy type: {strategy_type}")
|
|
594
|
+
strategy_type, values = parse_choice_strategy(strategy_spec)
|
|
628
595
|
|
|
629
596
|
# Generate random selections
|
|
630
597
|
if seed is not None:
|
|
@@ -675,7 +642,7 @@ def apply_forecast_strategy(
|
|
|
675
642
|
>>> apply_forecast_strategy(df, "sales", "forecast:seasonal:period=12", 24)
|
|
676
643
|
[98.5, 102.3, 95.8, ...]
|
|
677
644
|
"""
|
|
678
|
-
from additory.
|
|
645
|
+
from additory.synthetic.forecast import forecast_values, ForecastMethod
|
|
679
646
|
|
|
680
647
|
# Parse strategy: forecast:method:param1=val1:param2=val2
|
|
681
648
|
parts = strategy_spec.split(":")
|
|
@@ -845,7 +812,7 @@ def apply_smote_strategy(
|
|
|
845
812
|
>>> apply_smote_strategy(df, ["feature1", "feature2"], "smote:k=5", 100)
|
|
846
813
|
{"feature1": [1.2, 3.4, ...], "feature2": [5.6, 7.8, ...]}
|
|
847
814
|
"""
|
|
848
|
-
from additory.
|
|
815
|
+
from additory.synthetic.smote import generate_smote_values
|
|
849
816
|
|
|
850
817
|
# Parse strategy: smote:k=5
|
|
851
818
|
parts = strategy_spec.split(":")
|
|
@@ -881,3 +848,79 @@ def apply_smote_strategy(
|
|
|
881
848
|
)
|
|
882
849
|
except Exception as e:
|
|
883
850
|
raise ValidationError(f"SMOTE strategy failed: {e}")
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
def parse_deduce_strategy(strategy_spec: str) -> Tuple[str, List[str]]:
    """
    Parse deduce strategy specification.

    Args:
        strategy_spec: Strategy string like:
            - "deduce:comment"
            - "deduce:[comment, notes]"

    Returns:
        Tuple of (strategy_type, source_columns)
        - strategy_type: "deduce"
        - source_columns: List of non-empty source column names

    Raises:
        ValidationError: If strategy format is invalid, or if no usable
            column name is given

    Examples:
        >>> parse_deduce_strategy("deduce:comment")
        ("deduce", ["comment"])

        >>> parse_deduce_strategy("deduce:[comment, notes]")
        ("deduce", ["comment", "notes"])
    """
    prefix = "deduce:"

    # Reject non-strings up front so callers always get a ValidationError
    # rather than an AttributeError from .startswith().
    if not isinstance(strategy_spec, str) or not strategy_spec.startswith(prefix):
        raise ValidationError(
            f"Invalid deduce strategy: {strategy_spec}. "
            "Must start with 'deduce:'"
        )

    # Extract source specification after "deduce:"
    source_spec = strategy_spec[len(prefix):].strip()

    if not source_spec:
        raise ValidationError(
            f"Deduce strategy requires source column(s): {strategy_spec}. "
            "Format: 'deduce:column' or 'deduce:[col1, col2]'"
        )

    # Single column: deduce:column
    if not (source_spec.startswith("[") and source_spec.endswith("]")):
        return "deduce", [source_spec]

    # Multiple columns: deduce:[col1, col2]
    columns_str = source_spec[1:-1]  # Remove brackets

    if not columns_str.strip():
        raise ValidationError(
            f"Deduce column list cannot be empty: {strategy_spec}"
        )

    # str.split(",") never returns an empty list, so blank entries (from a
    # trailing or doubled comma) must be dropped explicitly; otherwise ""
    # would be returned as a column name and the check below could never fire.
    columns = [name for name in (part.strip() for part in columns_str.split(",")) if name]

    if not columns:
        raise ValidationError(
            f"Deduce strategy must specify at least one column: {strategy_spec}"
        )

    return "deduce", columns
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
def is_deduce_strategy(strategy_spec: str) -> bool:
    """
    Tell whether a strategy specification selects the deduce strategy.

    Args:
        strategy_spec: Strategy specification to inspect

    Returns:
        True when strategy_spec is a string beginning with "deduce:",
        False for any other string or any non-string value
    """
    # Non-string values can never be a deduce spec.
    if not isinstance(strategy_spec, str):
        return False

    return strategy_spec.startswith("deduce:")
|
|
@@ -18,7 +18,7 @@ from additory.common.backend import detect_backend, to_polars, from_polars
|
|
|
18
18
|
from additory.common.exceptions import ValidationError, AugmentError
|
|
19
19
|
from additory.common.validation import validate_dataframe
|
|
20
20
|
from additory.common.sample_data import get_sample_dataset
|
|
21
|
-
from additory.
|
|
21
|
+
from additory.synthetic.strategies import (
|
|
22
22
|
parse_strategy_dict,
|
|
23
23
|
get_column_strategy,
|
|
24
24
|
apply_increment_strategy,
|
|
@@ -27,6 +27,14 @@ from additory.augment.strategies import (
|
|
|
27
27
|
parse_strategy_params
|
|
28
28
|
)
|
|
29
29
|
|
|
30
|
+
# Linked lists feature imports
|
|
31
|
+
from additory.synthetic.namespace_lookup import lookup_linked_list
|
|
32
|
+
from additory.synthetic.linked_list_parser import (
|
|
33
|
+
parse_linked_list,
|
|
34
|
+
generate_linked_list_data
|
|
35
|
+
)
|
|
36
|
+
from additory.synthetic.column_name_resolver import resolve_column_names
|
|
37
|
+
|
|
30
38
|
|
|
31
39
|
def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
|
|
32
40
|
"""
|
|
@@ -36,7 +44,7 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
|
|
|
36
44
|
- increment (with start parameter)
|
|
37
45
|
- range
|
|
38
46
|
- choice
|
|
39
|
-
-
|
|
47
|
+
- lists (inline linked lists)
|
|
40
48
|
|
|
41
49
|
Augmentative strategies require existing data:
|
|
42
50
|
- auto (random sampling)
|
|
@@ -61,6 +69,10 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
|
|
|
61
69
|
# Get the base strategy name (before any parameters)
|
|
62
70
|
strategy_name = strategy_spec.split(":")[0].strip()
|
|
63
71
|
|
|
72
|
+
# Handle lists@ pattern
|
|
73
|
+
if strategy_name.startswith("lists@"):
|
|
74
|
+
continue # Valid generative strategy
|
|
75
|
+
|
|
64
76
|
if strategy_name in augmentative_strategies:
|
|
65
77
|
invalid_columns.append((col, strategy_name))
|
|
66
78
|
|
|
@@ -76,7 +88,7 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
|
|
|
76
88
|
error_lines.append(" - increment (with start parameter)")
|
|
77
89
|
error_lines.append(" - range:min-max")
|
|
78
90
|
error_lines.append(" - choice:[value1,value2,...]")
|
|
79
|
-
error_lines.append(" -
|
|
91
|
+
error_lines.append(" - lists@variable_name (inline linked lists)")
|
|
80
92
|
|
|
81
93
|
raise ValidationError("\n".join(error_lines))
|
|
82
94
|
|
|
@@ -249,7 +261,7 @@ def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str,
|
|
|
249
261
|
new_data[col] = new_values
|
|
250
262
|
elif col_strategy.startswith("forecast"):
|
|
251
263
|
# Import here to avoid circular dependency
|
|
252
|
-
from additory.
|
|
264
|
+
from additory.synthetic.strategies import apply_forecast_strategy
|
|
253
265
|
|
|
254
266
|
# Generate forecasted values
|
|
255
267
|
new_values = apply_forecast_strategy(
|
|
@@ -265,7 +277,7 @@ def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str,
|
|
|
265
277
|
new_data[col] = new_values
|
|
266
278
|
elif col_strategy.startswith(("normal", "uniform", "skewed_left", "skewed_right", "beta", "gamma", "exponential", "kde")):
|
|
267
279
|
# Import here to avoid circular dependency
|
|
268
|
-
from additory.
|
|
280
|
+
from additory.synthetic.strategies import apply_distribution_strategy
|
|
269
281
|
|
|
270
282
|
# Generate distribution values
|
|
271
283
|
new_values = apply_distribution_strategy(
|
|
@@ -310,7 +322,6 @@ def _create_from_scratch_engine(
|
|
|
310
322
|
- increment (with start parameter)
|
|
311
323
|
- range
|
|
312
324
|
- choice
|
|
313
|
-
- choice_list
|
|
314
325
|
|
|
315
326
|
Augmentative strategies (NOT supported):
|
|
316
327
|
- auto (requires existing data)
|
|
@@ -349,23 +360,62 @@ def _create_from_scratch_engine(
|
|
|
349
360
|
... "id": "increment:start=1",
|
|
350
361
|
... "emp_id": "increment:start=1:pattern=EMP_[001]",
|
|
351
362
|
... "age": "range:18-65",
|
|
352
|
-
... "status": "choice:[Active,Inactive]"
|
|
353
|
-
... "department": "choice_list:departments"
|
|
363
|
+
... "status": "choice:[Active,Inactive,Pending]"
|
|
354
364
|
... },
|
|
355
365
|
... seed=42
|
|
356
366
|
... )
|
|
357
367
|
>>> result.shape
|
|
358
|
-
(100,
|
|
368
|
+
(100, 4)
|
|
359
369
|
"""
|
|
360
370
|
import polars as pl
|
|
361
371
|
|
|
362
372
|
# Validate all strategies are generative
|
|
363
373
|
_validate_generative_strategies(strategy_dict)
|
|
364
374
|
|
|
375
|
+
# Pre-process linked lists strategies
|
|
376
|
+
# Linked lists generate multiple columns, so we need to expand strategy_dict
|
|
377
|
+
expanded_strategy_dict = {}
|
|
378
|
+
lists_to_process = [] # Store (original_key, var_name, parsed_data, column_names)
|
|
379
|
+
|
|
380
|
+
for col, col_strategy in strategy_dict.items():
|
|
381
|
+
if col == "__default__":
|
|
382
|
+
continue
|
|
383
|
+
|
|
384
|
+
# Check for lists@ pattern
|
|
385
|
+
if col_strategy.startswith("lists@"):
|
|
386
|
+
# Extract variable name
|
|
387
|
+
var_name = col_strategy[6:].strip() # Remove "lists@" prefix
|
|
388
|
+
|
|
389
|
+
try:
|
|
390
|
+
# Lookup variable in namespace
|
|
391
|
+
# Depth=5: user -> add.synthetic (API) -> synthetic() -> _create_from_scratch_engine -> here
|
|
392
|
+
linked_list_data = lookup_linked_list(var_name, depth=5)
|
|
393
|
+
|
|
394
|
+
# Parse linked list
|
|
395
|
+
parsed_data = parse_linked_list(linked_list_data)
|
|
396
|
+
|
|
397
|
+
# Resolve column names
|
|
398
|
+
column_names = resolve_column_names(
|
|
399
|
+
list_name=var_name,
|
|
400
|
+
strategy_key=col,
|
|
401
|
+
num_columns=parsed_data['num_columns'],
|
|
402
|
+
explicit_names=parsed_data['column_names']
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
# Store for later processing
|
|
406
|
+
lists_to_process.append((col, var_name, parsed_data, column_names))
|
|
407
|
+
|
|
408
|
+
except ValidationError as e:
|
|
409
|
+
raise ValidationError(f"Linked list error for column '{col}': {e}")
|
|
410
|
+
else:
|
|
411
|
+
# Regular strategy - keep as is
|
|
412
|
+
expanded_strategy_dict[col] = col_strategy
|
|
413
|
+
|
|
365
414
|
# Build data column by column
|
|
366
415
|
new_data = {}
|
|
367
416
|
|
|
368
|
-
|
|
417
|
+
# Process regular strategies first
|
|
418
|
+
for col, col_strategy in expanded_strategy_dict.items():
|
|
369
419
|
if col == "__default__":
|
|
370
420
|
continue
|
|
371
421
|
|
|
@@ -414,13 +464,24 @@ def _create_from_scratch_engine(
|
|
|
414
464
|
f"Unknown or unsupported strategy for column '{col}': '{col_strategy}'"
|
|
415
465
|
)
|
|
416
466
|
|
|
467
|
+
# Process linked lists strategies
|
|
468
|
+
for original_key, var_name, parsed_data, column_names in lists_to_process:
|
|
469
|
+
# Generate data rows
|
|
470
|
+
data_rows = generate_linked_list_data(parsed_data, n_rows, seed)
|
|
471
|
+
|
|
472
|
+
# Transpose: list of tuples -> dict of lists
|
|
473
|
+
# data_rows = [(val1_col1, val1_col2), (val2_col1, val2_col2), ...]
|
|
474
|
+
# -> {col1: [val1_col1, val2_col1, ...], col2: [val1_col2, val2_col2, ...]}
|
|
475
|
+
for col_idx, col_name in enumerate(column_names):
|
|
476
|
+
new_data[col_name] = [row[col_idx] for row in data_rows]
|
|
477
|
+
|
|
417
478
|
# Build Polars DataFrame from generated columns
|
|
418
479
|
result = pl.DataFrame(new_data)
|
|
419
480
|
|
|
420
481
|
return result
|
|
421
482
|
|
|
422
483
|
|
|
423
|
-
def
|
|
484
|
+
def synthetic(
|
|
424
485
|
df: Any,
|
|
425
486
|
n_rows: Union[int, str] = 5,
|
|
426
487
|
strategy: Union[str, Dict[str, str]] = "auto",
|
|
@@ -428,12 +489,12 @@ def augment(
|
|
|
428
489
|
output_format: str = "pandas"
|
|
429
490
|
) -> Any:
|
|
430
491
|
"""
|
|
431
|
-
|
|
492
|
+
Generate synthetic data by extending a dataframe or creating from scratch.
|
|
432
493
|
|
|
433
494
|
Uses Polars-only architecture:
|
|
434
495
|
1. Detect input format (pandas/polars/cuDF)
|
|
435
496
|
2. Convert to Polars via Arrow bridge (if needed)
|
|
436
|
-
3. Process
|
|
497
|
+
3. Process synthetic data generation in Polars
|
|
437
498
|
4. Convert back to original format via Arrow bridge
|
|
438
499
|
|
|
439
500
|
This function adds new rows to a dataframe using various strategies:
|
|
@@ -441,7 +502,7 @@ def augment(
|
|
|
441
502
|
- "increment": Increment numeric or pattern-based values
|
|
442
503
|
- "range:min-max": Random integers within range
|
|
443
504
|
- "choice:[...]": Random selection from inline list
|
|
444
|
-
- "
|
|
505
|
+
- "lists@variable_name": Inline linked lists (generates multiple columns)
|
|
445
506
|
- "forecast:method": Time series forecasting (linear, polynomial, exponential, seasonal)
|
|
446
507
|
- "normal": Normal distribution generation
|
|
447
508
|
- "uniform": Uniform distribution generation
|
|
@@ -465,7 +526,6 @@ def augment(
|
|
|
465
526
|
"emp_id": "increment:EMP_[001]_ID",
|
|
466
527
|
"age": "range:18-65",
|
|
467
528
|
"status": "choice:[Active,Inactive,Pending]",
|
|
468
|
-
"bank": "choice_list:banks",
|
|
469
529
|
"sales": "forecast:seasonal:period=12",
|
|
470
530
|
"score": "normal:mean=75:std=10",
|
|
471
531
|
"income": "skewed_right:skewness=1.5"
|
additory/utilities/units.py
CHANGED
|
@@ -255,7 +255,10 @@ class UnitConverter:
|
|
|
255
255
|
"""Unit conversion system with Polars processing"""
|
|
256
256
|
|
|
257
257
|
def __init__(self):
|
|
258
|
-
|
|
258
|
+
try:
|
|
259
|
+
self.arrow_bridge = EnhancedArrowBridge()
|
|
260
|
+
except ArrowBridgeError:
|
|
261
|
+
self.arrow_bridge = None
|
|
259
262
|
self.conversion_stats = {
|
|
260
263
|
"total_conversions": 0,
|
|
261
264
|
"successful_conversions": 0,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: additory
|
|
3
|
-
Version: 0.1.
|
|
4
|
-
Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data
|
|
3
|
+
Version: 0.1.0a4
|
|
4
|
+
Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.
|
|
5
5
|
Author: Krishnamoorthy Sankaran
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: homepage, https://github.com/sekarkrishna/additory
|
|
@@ -13,6 +13,7 @@ Description-Content-Type: text/markdown
|
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
Requires-Dist: pandas>=1.5
|
|
15
15
|
Requires-Dist: polars>=0.20
|
|
16
|
+
Requires-Dist: pyarrow>=10.0
|
|
16
17
|
Requires-Dist: pyyaml>=6.0
|
|
17
18
|
Requires-Dist: requests>=2.31
|
|
18
19
|
Requires-Dist: toml>=0.10
|
|
@@ -34,11 +35,11 @@ Dynamic: license-file
|
|
|
34
35
|
|
|
35
36
|
# Additory
|
|
36
37
|
|
|
37
|
-
**A semantic, extensible dataframe transformation engine with expressions, lookup,
|
|
38
|
+
**A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.**
|
|
38
39
|
|
|
39
40
|
[](https://www.python.org/downloads/)
|
|
40
41
|
[](https://opensource.org/licenses/MIT)
|
|
41
|
-
[](https://github.com/sekarkrishna/additory)
|
|
42
43
|
|
|
43
44
|
**Author:** Krishnamoorthy Sankaran
|
|
44
45
|
|
|
@@ -51,17 +52,17 @@ Dynamic: license-file
|
|
|
51
52
|
## 📦 Installation
|
|
52
53
|
|
|
53
54
|
```bash
|
|
54
|
-
pip install additory==0.1.
|
|
55
|
+
pip install additory==0.1.0a4
|
|
55
56
|
```
|
|
56
57
|
|
|
57
58
|
**Optional GPU support:**
|
|
58
59
|
```bash
|
|
59
|
-
pip install additory[gpu]==0.1.
|
|
60
|
+
pip install additory[gpu]==0.1.0a4 # Includes cuDF for GPU acceleration
|
|
60
61
|
```
|
|
61
62
|
|
|
62
63
|
**Development installation:**
|
|
63
64
|
```bash
|
|
64
|
-
pip install additory[dev]==0.1.
|
|
65
|
+
pip install additory[dev]==0.1.0a4 # Includes testing and development tools
|
|
65
66
|
```
|
|
66
67
|
|
|
67
68
|
## 🎯 Core Functions
|
|
@@ -69,8 +70,8 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
|
69
70
|
| Function | Purpose | Example |
|
|
70
71
|
|----------|---------|---------|
|
|
71
72
|
| `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
|
|
72
|
-
| `add.
|
|
73
|
-
| `add.
|
|
73
|
+
| `add.synthetic()` | Generate additional data | `add.synthetic(df, n_rows=1000)` |
|
|
74
|
+
| `add.deduce()` | Text-based label deduction | `add.deduce(df, from_column='text', to_column='label')` |
|
|
74
75
|
| `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
|
|
75
76
|
|
|
76
77
|
## 🧬 Available Expressions
|
|
@@ -119,7 +120,7 @@ import additory as add
|
|
|
119
120
|
|
|
120
121
|
# Works with polars
|
|
121
122
|
df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
|
122
|
-
result = add.
|
|
123
|
+
result = add.synthetic(df_polars, n_rows=100)
|
|
123
124
|
|
|
124
125
|
# Automatic type detection and conversion
|
|
125
126
|
```
|
|
@@ -193,27 +194,42 @@ patients_with_bsa = add.bsa(patients)
|
|
|
193
194
|
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
194
195
|
```
|
|
195
196
|
|
|
196
|
-
### 🔄
|
|
197
|
+
### 🔄 Synthetic Data Generation
|
|
197
198
|
|
|
198
|
-
**
|
|
199
|
-
|
|
200
|
-
**Key Differences:**
|
|
201
|
-
- **Augment**: Learns patterns from existing data to create similar rows
|
|
202
|
-
- **Synthetic**: Uses predefined schemas to generate structured data
|
|
199
|
+
**Synthetic** generates additional data similar to your existing dataset using inline strategies.
|
|
203
200
|
|
|
204
201
|
```python
|
|
205
|
-
#
|
|
206
|
-
more_customers = add.
|
|
202
|
+
# Extend existing data (learns from patterns)
|
|
203
|
+
more_customers = add.synthetic(customers, n_rows=1000)
|
|
207
204
|
|
|
208
205
|
# Create data from scratch with strategies
|
|
209
|
-
new_data = add.
|
|
206
|
+
new_data = add.synthetic("@new", n_rows=500, strategy={
|
|
210
207
|
'id': 'increment:start=1',
|
|
211
208
|
'name': 'choice:[John,Jane,Bob]',
|
|
212
209
|
'age': 'range:18-65'
|
|
213
210
|
})
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### 🤖 Text-Based Label Deduction
|
|
214
214
|
|
|
215
|
-
|
|
216
|
-
|
|
215
|
+
**Deduce** automatically fills in missing labels by learning from your existing labeled examples. Pure Python, no LLMs, offline-first.
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
# Deduce missing labels from text
|
|
219
|
+
tickets = pd.DataFrame({
|
|
220
|
+
"ticket_text": ["Cannot log in", "Billing question", "App crashes", "Need invoice"],
|
|
221
|
+
"category": ["Technical", "Billing", None, None]
|
|
222
|
+
})
|
|
223
|
+
|
|
224
|
+
# Automatically fill in missing categories
|
|
225
|
+
result = add.deduce(tickets, from_column="ticket_text", to_column="category")
|
|
226
|
+
|
|
227
|
+
# Use multiple columns for better accuracy
|
|
228
|
+
result = add.deduce(
|
|
229
|
+
df,
|
|
230
|
+
from_column=["title", "description"],
|
|
231
|
+
to_column="category"
|
|
232
|
+
)
|
|
217
233
|
```
|
|
218
234
|
|
|
219
235
|
## 🧪 Examples
|
|
@@ -231,7 +247,7 @@ customers = pd.DataFrame({
|
|
|
231
247
|
})
|
|
232
248
|
|
|
233
249
|
# Generate more customers
|
|
234
|
-
customers = add.
|
|
250
|
+
customers = add.synthetic(customers, n_rows=10000)
|
|
235
251
|
|
|
236
252
|
# Add customer tiers
|
|
237
253
|
tiers = pd.DataFrame({
|
|
@@ -257,7 +273,7 @@ strategy = {
|
|
|
257
273
|
'height_cm': 'range:150-200' # Height in cm
|
|
258
274
|
}
|
|
259
275
|
|
|
260
|
-
patients = add.
|
|
276
|
+
patients = add.synthetic("@new", n_rows=1000, strategy=strategy)
|
|
261
277
|
|
|
262
278
|
# Convert height to meters for expressions
|
|
263
279
|
patients['height_m'] = patients['height_cm'] / 100
|
|
@@ -272,19 +288,19 @@ print(result.correlations)
|
|
|
272
288
|
|
|
273
289
|
## 📚 Documentation
|
|
274
290
|
|
|
275
|
-
- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/
|
|
276
|
-
- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/
|
|
291
|
+
- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Detailed guides for each function
|
|
292
|
+
- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Complete expressions reference
|
|
277
293
|
|
|
278
294
|
## 📄 License
|
|
279
295
|
|
|
280
|
-
MIT License - see [LICENSE](
|
|
296
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
281
297
|
|
|
282
298
|
## 📞 Support
|
|
283
299
|
|
|
284
300
|
- **Issues**: [GitHub Issues](https://github.com/sekarkrishna/additory/issues)
|
|
285
|
-
- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/
|
|
301
|
+
- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)
|
|
286
302
|
|
|
287
|
-
## 🗺️ v0.1.1 (
|
|
303
|
+
## 🗺️ v0.1.1 (January 2026)
|
|
288
304
|
- Enhanced documentation and tutorials
|
|
289
305
|
- Performance optimizations
|
|
290
306
|
- Additional expressions
|