additory 0.1.0a2__py3-none-any.whl → 0.1.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/polars_expression_engine.py +66 -16
- additory/dynamic_api.py +42 -46
- additory/expressions/proxy.py +4 -1
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +11 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/METADATA +10 -17
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Namespace Lookup for Linked Lists
|
|
3
|
+
|
|
4
|
+
Finds Python variables in the caller's namespace using frame inspection.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import inspect
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from additory.common.exceptions import ValidationError
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def lookup_variable_in_namespace(var_name: str, depth: int = 2) -> Any:
    """
    Look up a variable in a calling frame's namespace.

    Uses frame inspection to find variables defined in the same scope
    as the synthetic() call.

    Args:
        var_name: Name of the variable to find
        depth: Number of frames to go back, counted from this function's
            own frame (default: 2).
            1 = the direct caller of this function
            2 = the caller's caller

    Returns:
        Value of the variable. The frame's locals are searched first,
        then its globals.

    Raises:
        ValidationError: If the requested frame does not exist or the
            variable is found in neither its locals nor its globals

    Examples:
        >>> # In user code:
        >>> AE_CM = [["Headache", ["Aspirin"]]]
        >>> df = add.synthetic('@new', strategy={'col1': 'lists@AE_CM'})
        >>> # lookup_variable_in_namespace('AE_CM') finds the list
    """
    # Bind the name before entering the try block so the `del frame` in the
    # finally clause can never hit an unbound local.
    frame = inspect.currentframe()
    try:
        # Go back 'depth' frames
        for _ in range(depth):
            if frame is None:
                raise ValidationError(
                    f"Cannot access caller's namespace (frame depth {depth})"
                )
            frame = frame.f_back

        # The last step may have walked past the outermost frame.
        if frame is None:
            raise ValidationError(
                "Cannot access caller's namespace (frame is None)"
            )

        # Search in locals first, then globals
        for namespace in (frame.f_locals, frame.f_globals):
            if var_name in namespace:
                return namespace[var_name]

        # Variable not found - provide helpful error
        raise ValidationError(
            f"Variable '{var_name}' not found in namespace.\n"
            f"Make sure '{var_name}' is defined before calling synthetic().\n"
            f"\n"
            f"Example:\n"
            f" {var_name} = [['Headache', ['Aspirin'], ['mild']]]\n"
            f" df = add.synthetic('@new', strategy={{'col1': 'lists@{var_name}'}})\n"
            f"\n"
            f"Note: Linked lists must be defined in the same scope (cell/function) "
            f"as the synthetic() call."
        )
    finally:
        # Clean up frame reference to avoid reference cycles
        del frame
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def validate_linked_list_variable(var_value: Any, var_name: str) -> None:
    """
    Validate that the variable is a valid linked list.

    Only the outer container is checked here: it must be a ``list`` and it
    must not be empty. The contents of the rows are not inspected.

    Args:
        var_value: Value of the variable
        var_name: Name of the variable (for error messages)

    Raises:
        ValidationError: If the value is not a list, or is an empty list
    """
    if not isinstance(var_value, list):
        raise ValidationError(
            f"Variable '{var_name}' must be a list. "
            f"Got: {type(var_value).__name__}\n"
            f"\n"
            f"Expected format:\n"
            f" {var_name} = [\n"
            f" ['Headache', ['Aspirin', 'Ibuprofen'], ['mild', 'moderate']],\n"
            f" ['Nausea', ['Ondansetron'], ['severe']]\n"
            f" ]"
        )

    # var_value is known to be a list here, so falsy means "no rows".
    if not var_value:
        raise ValidationError(
            f"Variable '{var_name}' is an empty list. "
            f"Linked list must contain at least one row."
        )
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def lookup_linked_list(var_name: str, depth: int = 2) -> Any:
    """
    Look up and validate a linked list variable.

    Convenience function that combines lookup and validation.

    Args:
        var_name: Name of the variable to find
        depth: Number of frames to go back. The value is forwarded
            unchanged to lookup_variable_in_namespace(), which counts
            from its own frame, so this wrapper's frame uses up one
            step: depth=2 resolves names in the direct caller of
            lookup_linked_list().

    Returns:
        Linked list data

    Raises:
        ValidationError: If variable not found or invalid
    """
    var_value = lookup_variable_in_namespace(var_name, depth)
    validate_linked_list_variable(var_value, var_name)
    return var_value
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Strategy handlers for data
|
|
2
|
+
Strategy handlers for synthetic data generation
|
|
3
3
|
|
|
4
4
|
Provides different strategies for generating synthetic data:
|
|
5
5
|
- auto: Random sampling from existing values
|
|
6
6
|
- increment: Increment numeric or pattern-based values
|
|
7
7
|
- choice:[...]: Random selection from inline list
|
|
8
|
-
- choice_list:name: Random selection from registered/built-in list
|
|
9
8
|
"""
|
|
10
9
|
|
|
11
10
|
import re
|
|
@@ -13,7 +12,6 @@ import random
|
|
|
13
12
|
from typing import Any, Dict, List, Optional, Tuple
|
|
14
13
|
|
|
15
14
|
from additory.common.exceptions import ValidationError, AugmentError
|
|
16
|
-
from additory.augment.list_registry import get_list
|
|
17
15
|
|
|
18
16
|
|
|
19
17
|
def parse_strategy_params(strategy_spec: str) -> Tuple[str, Dict[str, Any]]:
|
|
@@ -319,7 +317,7 @@ def apply_increment_strategy(
|
|
|
319
317
|
Apply increment strategy to a column (Polars-only).
|
|
320
318
|
|
|
321
319
|
Supports two modes:
|
|
322
|
-
1.
|
|
320
|
+
1. Extend mode: Increment from last value in df_polars
|
|
323
321
|
2. Create mode: Start from specified value (requires params with 'start')
|
|
324
322
|
|
|
325
323
|
Args:
|
|
@@ -336,7 +334,7 @@ def apply_increment_strategy(
|
|
|
336
334
|
ValidationError: If strategy cannot be applied
|
|
337
335
|
|
|
338
336
|
Examples:
|
|
339
|
-
#
|
|
337
|
+
# Extend mode (with DataFrame)
|
|
340
338
|
>>> apply_increment_strategy(df, "id", "increment", 5)
|
|
341
339
|
[11, 12, 13, 14, 15] # if last value was 10
|
|
342
340
|
|
|
@@ -349,7 +347,7 @@ def apply_increment_strategy(
|
|
|
349
347
|
... {"start": 1, "pattern": "EMP_[001]"})
|
|
350
348
|
["EMP_001", "EMP_002", "EMP_003"]
|
|
351
349
|
"""
|
|
352
|
-
# Determine mode:
|
|
350
|
+
# Determine mode: extend (has df) or create (no df)
|
|
353
351
|
is_create_mode = df_polars is None
|
|
354
352
|
|
|
355
353
|
if is_create_mode:
|
|
@@ -402,7 +400,7 @@ def apply_increment_strategy(
|
|
|
402
400
|
return new_values
|
|
403
401
|
|
|
404
402
|
else:
|
|
405
|
-
#
|
|
403
|
+
# Extend mode: use existing logic
|
|
406
404
|
# Parse the strategy
|
|
407
405
|
pattern, regex_pattern = parse_increment_strategy(strategy_spec)
|
|
408
406
|
|
|
@@ -483,12 +481,11 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
|
|
|
483
481
|
Args:
|
|
484
482
|
strategy_spec: Strategy string like:
|
|
485
483
|
- "choice:[value1,value2,value3]"
|
|
486
|
-
- "choice_list:banks"
|
|
487
484
|
|
|
488
485
|
Returns:
|
|
489
486
|
Tuple of (strategy_type, values)
|
|
490
|
-
- strategy_type: "choice"
|
|
491
|
-
- values: List of values
|
|
487
|
+
- strategy_type: "choice"
|
|
488
|
+
- values: List of values
|
|
492
489
|
|
|
493
490
|
Raises:
|
|
494
491
|
ValidationError: If strategy format is invalid
|
|
@@ -496,9 +493,6 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
|
|
|
496
493
|
Examples:
|
|
497
494
|
>>> parse_choice_strategy("choice:[Active,Inactive,Pending]")
|
|
498
495
|
("choice", ["Active", "Inactive", "Pending"])
|
|
499
|
-
|
|
500
|
-
>>> parse_choice_strategy("choice_list:banks")
|
|
501
|
-
("choice_list", None)
|
|
502
496
|
"""
|
|
503
497
|
if strategy_spec.startswith("choice:["):
|
|
504
498
|
# Inline list: choice:[value1,value2,value3]
|
|
@@ -526,22 +520,10 @@ def parse_choice_strategy(strategy_spec: str) -> Tuple[str, Optional[List[Any]]]
|
|
|
526
520
|
|
|
527
521
|
return "choice", values
|
|
528
522
|
|
|
529
|
-
elif strategy_spec.startswith("choice_list:"):
|
|
530
|
-
# Named list: choice_list:banks
|
|
531
|
-
list_name = strategy_spec[len("choice_list:"):].strip()
|
|
532
|
-
|
|
533
|
-
if not list_name:
|
|
534
|
-
raise ValidationError(
|
|
535
|
-
f"Invalid choice_list strategy: {strategy_spec}. "
|
|
536
|
-
"Must be in format: choice_list:list_name"
|
|
537
|
-
)
|
|
538
|
-
|
|
539
|
-
return "choice_list", list_name
|
|
540
|
-
|
|
541
523
|
else:
|
|
542
524
|
raise ValidationError(
|
|
543
525
|
f"Invalid choice strategy: {strategy_spec}. "
|
|
544
|
-
"Must start with 'choice:['
|
|
526
|
+
"Must start with 'choice:['"
|
|
545
527
|
)
|
|
546
528
|
|
|
547
529
|
|
|
@@ -609,22 +591,7 @@ def apply_choice_strategy(
|
|
|
609
591
|
ValidationError: If strategy cannot be applied
|
|
610
592
|
"""
|
|
611
593
|
# Parse the strategy
|
|
612
|
-
strategy_type,
|
|
613
|
-
|
|
614
|
-
# Get the actual values list
|
|
615
|
-
if strategy_type == "choice":
|
|
616
|
-
values = values_or_name
|
|
617
|
-
elif strategy_type == "choice_list":
|
|
618
|
-
# Resolve list name to actual list
|
|
619
|
-
list_name = values_or_name
|
|
620
|
-
try:
|
|
621
|
-
values = get_list(list_name)
|
|
622
|
-
except ValidationError as e:
|
|
623
|
-
raise ValidationError(
|
|
624
|
-
f"Cannot apply choice_list strategy: {e}"
|
|
625
|
-
)
|
|
626
|
-
else:
|
|
627
|
-
raise ValidationError(f"Unknown choice strategy type: {strategy_type}")
|
|
594
|
+
strategy_type, values = parse_choice_strategy(strategy_spec)
|
|
628
595
|
|
|
629
596
|
# Generate random selections
|
|
630
597
|
if seed is not None:
|
|
@@ -675,7 +642,7 @@ def apply_forecast_strategy(
|
|
|
675
642
|
>>> apply_forecast_strategy(df, "sales", "forecast:seasonal:period=12", 24)
|
|
676
643
|
[98.5, 102.3, 95.8, ...]
|
|
677
644
|
"""
|
|
678
|
-
from additory.
|
|
645
|
+
from additory.synthetic.forecast import forecast_values, ForecastMethod
|
|
679
646
|
|
|
680
647
|
# Parse strategy: forecast:method:param1=val1:param2=val2
|
|
681
648
|
parts = strategy_spec.split(":")
|
|
@@ -845,7 +812,7 @@ def apply_smote_strategy(
|
|
|
845
812
|
>>> apply_smote_strategy(df, ["feature1", "feature2"], "smote:k=5", 100)
|
|
846
813
|
{"feature1": [1.2, 3.4, ...], "feature2": [5.6, 7.8, ...]}
|
|
847
814
|
"""
|
|
848
|
-
from additory.
|
|
815
|
+
from additory.synthetic.smote import generate_smote_values
|
|
849
816
|
|
|
850
817
|
# Parse strategy: smote:k=5
|
|
851
818
|
parts = strategy_spec.split(":")
|
|
@@ -18,7 +18,7 @@ from additory.common.backend import detect_backend, to_polars, from_polars
|
|
|
18
18
|
from additory.common.exceptions import ValidationError, AugmentError
|
|
19
19
|
from additory.common.validation import validate_dataframe
|
|
20
20
|
from additory.common.sample_data import get_sample_dataset
|
|
21
|
-
from additory.
|
|
21
|
+
from additory.synthetic.strategies import (
|
|
22
22
|
parse_strategy_dict,
|
|
23
23
|
get_column_strategy,
|
|
24
24
|
apply_increment_strategy,
|
|
@@ -27,6 +27,14 @@ from additory.augment.strategies import (
|
|
|
27
27
|
parse_strategy_params
|
|
28
28
|
)
|
|
29
29
|
|
|
30
|
+
# Linked lists feature imports
|
|
31
|
+
from additory.synthetic.namespace_lookup import lookup_linked_list
|
|
32
|
+
from additory.synthetic.linked_list_parser import (
|
|
33
|
+
parse_linked_list,
|
|
34
|
+
generate_linked_list_data
|
|
35
|
+
)
|
|
36
|
+
from additory.synthetic.column_name_resolver import resolve_column_names
|
|
37
|
+
|
|
30
38
|
|
|
31
39
|
def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
|
|
32
40
|
"""
|
|
@@ -36,7 +44,7 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
|
|
|
36
44
|
- increment (with start parameter)
|
|
37
45
|
- range
|
|
38
46
|
- choice
|
|
39
|
-
-
|
|
47
|
+
- lists (inline linked lists)
|
|
40
48
|
|
|
41
49
|
Augmentative strategies require existing data:
|
|
42
50
|
- auto (random sampling)
|
|
@@ -61,6 +69,10 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
|
|
|
61
69
|
# Get the base strategy name (before any parameters)
|
|
62
70
|
strategy_name = strategy_spec.split(":")[0].strip()
|
|
63
71
|
|
|
72
|
+
# Handle lists@ pattern
|
|
73
|
+
if strategy_name.startswith("lists@"):
|
|
74
|
+
continue # Valid generative strategy
|
|
75
|
+
|
|
64
76
|
if strategy_name in augmentative_strategies:
|
|
65
77
|
invalid_columns.append((col, strategy_name))
|
|
66
78
|
|
|
@@ -76,7 +88,7 @@ def _validate_generative_strategies(strategy_dict: Dict[str, str]) -> None:
|
|
|
76
88
|
error_lines.append(" - increment (with start parameter)")
|
|
77
89
|
error_lines.append(" - range:min-max")
|
|
78
90
|
error_lines.append(" - choice:[value1,value2,...]")
|
|
79
|
-
error_lines.append(" -
|
|
91
|
+
error_lines.append(" - lists@variable_name (inline linked lists)")
|
|
80
92
|
|
|
81
93
|
raise ValidationError("\n".join(error_lines))
|
|
82
94
|
|
|
@@ -249,7 +261,7 @@ def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str,
|
|
|
249
261
|
new_data[col] = new_values
|
|
250
262
|
elif col_strategy.startswith("forecast"):
|
|
251
263
|
# Import here to avoid circular dependency
|
|
252
|
-
from additory.
|
|
264
|
+
from additory.synthetic.strategies import apply_forecast_strategy
|
|
253
265
|
|
|
254
266
|
# Generate forecasted values
|
|
255
267
|
new_values = apply_forecast_strategy(
|
|
@@ -265,7 +277,7 @@ def _augment_polars_engine(df_polars: Any, n_rows: int, strategy_dict: Dict[str,
|
|
|
265
277
|
new_data[col] = new_values
|
|
266
278
|
elif col_strategy.startswith(("normal", "uniform", "skewed_left", "skewed_right", "beta", "gamma", "exponential", "kde")):
|
|
267
279
|
# Import here to avoid circular dependency
|
|
268
|
-
from additory.
|
|
280
|
+
from additory.synthetic.strategies import apply_distribution_strategy
|
|
269
281
|
|
|
270
282
|
# Generate distribution values
|
|
271
283
|
new_values = apply_distribution_strategy(
|
|
@@ -310,7 +322,6 @@ def _create_from_scratch_engine(
|
|
|
310
322
|
- increment (with start parameter)
|
|
311
323
|
- range
|
|
312
324
|
- choice
|
|
313
|
-
- choice_list
|
|
314
325
|
|
|
315
326
|
Augmentative strategies (NOT supported):
|
|
316
327
|
- auto (requires existing data)
|
|
@@ -349,23 +360,62 @@ def _create_from_scratch_engine(
|
|
|
349
360
|
... "id": "increment:start=1",
|
|
350
361
|
... "emp_id": "increment:start=1:pattern=EMP_[001]",
|
|
351
362
|
... "age": "range:18-65",
|
|
352
|
-
... "status": "choice:[Active,Inactive]"
|
|
353
|
-
... "department": "choice_list:departments"
|
|
363
|
+
... "status": "choice:[Active,Inactive,Pending]"
|
|
354
364
|
... },
|
|
355
365
|
... seed=42
|
|
356
366
|
... )
|
|
357
367
|
>>> result.shape
|
|
358
|
-
(100,
|
|
368
|
+
(100, 4)
|
|
359
369
|
"""
|
|
360
370
|
import polars as pl
|
|
361
371
|
|
|
362
372
|
# Validate all strategies are generative
|
|
363
373
|
_validate_generative_strategies(strategy_dict)
|
|
364
374
|
|
|
375
|
+
# Pre-process linked lists strategies
|
|
376
|
+
# Linked lists generate multiple columns, so we need to expand strategy_dict
|
|
377
|
+
expanded_strategy_dict = {}
|
|
378
|
+
lists_to_process = [] # Store (original_key, var_name, parsed_data, column_names)
|
|
379
|
+
|
|
380
|
+
for col, col_strategy in strategy_dict.items():
|
|
381
|
+
if col == "__default__":
|
|
382
|
+
continue
|
|
383
|
+
|
|
384
|
+
# Check for lists@ pattern
|
|
385
|
+
if col_strategy.startswith("lists@"):
|
|
386
|
+
# Extract variable name
|
|
387
|
+
var_name = col_strategy[6:].strip() # Remove "lists@" prefix
|
|
388
|
+
|
|
389
|
+
try:
|
|
390
|
+
# Lookup variable in namespace
|
|
391
|
+
# Depth=5: user -> add.synthetic (API) -> synthetic() -> _create_from_scratch_engine -> here
|
|
392
|
+
linked_list_data = lookup_linked_list(var_name, depth=5)
|
|
393
|
+
|
|
394
|
+
# Parse linked list
|
|
395
|
+
parsed_data = parse_linked_list(linked_list_data)
|
|
396
|
+
|
|
397
|
+
# Resolve column names
|
|
398
|
+
column_names = resolve_column_names(
|
|
399
|
+
list_name=var_name,
|
|
400
|
+
strategy_key=col,
|
|
401
|
+
num_columns=parsed_data['num_columns'],
|
|
402
|
+
explicit_names=parsed_data['column_names']
|
|
403
|
+
)
|
|
404
|
+
|
|
405
|
+
# Store for later processing
|
|
406
|
+
lists_to_process.append((col, var_name, parsed_data, column_names))
|
|
407
|
+
|
|
408
|
+
except ValidationError as e:
|
|
409
|
+
raise ValidationError(f"Linked list error for column '{col}': {e}")
|
|
410
|
+
else:
|
|
411
|
+
# Regular strategy - keep as is
|
|
412
|
+
expanded_strategy_dict[col] = col_strategy
|
|
413
|
+
|
|
365
414
|
# Build data column by column
|
|
366
415
|
new_data = {}
|
|
367
416
|
|
|
368
|
-
|
|
417
|
+
# Process regular strategies first
|
|
418
|
+
for col, col_strategy in expanded_strategy_dict.items():
|
|
369
419
|
if col == "__default__":
|
|
370
420
|
continue
|
|
371
421
|
|
|
@@ -414,13 +464,24 @@ def _create_from_scratch_engine(
|
|
|
414
464
|
f"Unknown or unsupported strategy for column '{col}': '{col_strategy}'"
|
|
415
465
|
)
|
|
416
466
|
|
|
467
|
+
# Process linked lists strategies
|
|
468
|
+
for original_key, var_name, parsed_data, column_names in lists_to_process:
|
|
469
|
+
# Generate data rows
|
|
470
|
+
data_rows = generate_linked_list_data(parsed_data, n_rows, seed)
|
|
471
|
+
|
|
472
|
+
# Transpose: list of tuples -> dict of lists
|
|
473
|
+
# data_rows = [(val1_col1, val1_col2), (val2_col1, val2_col2), ...]
|
|
474
|
+
# -> {col1: [val1_col1, val2_col1, ...], col2: [val1_col2, val2_col2, ...]}
|
|
475
|
+
for col_idx, col_name in enumerate(column_names):
|
|
476
|
+
new_data[col_name] = [row[col_idx] for row in data_rows]
|
|
477
|
+
|
|
417
478
|
# Build Polars DataFrame from generated columns
|
|
418
479
|
result = pl.DataFrame(new_data)
|
|
419
480
|
|
|
420
481
|
return result
|
|
421
482
|
|
|
422
483
|
|
|
423
|
-
def
|
|
484
|
+
def synthetic(
|
|
424
485
|
df: Any,
|
|
425
486
|
n_rows: Union[int, str] = 5,
|
|
426
487
|
strategy: Union[str, Dict[str, str]] = "auto",
|
|
@@ -428,12 +489,12 @@ def augment(
|
|
|
428
489
|
output_format: str = "pandas"
|
|
429
490
|
) -> Any:
|
|
430
491
|
"""
|
|
431
|
-
|
|
492
|
+
Generate synthetic data by extending a dataframe or creating from scratch.
|
|
432
493
|
|
|
433
494
|
Uses Polars-only architecture:
|
|
434
495
|
1. Detect input format (pandas/polars/cuDF)
|
|
435
496
|
2. Convert to Polars via Arrow bridge (if needed)
|
|
436
|
-
3. Process
|
|
497
|
+
3. Process synthetic data generation in Polars
|
|
437
498
|
4. Convert back to original format via Arrow bridge
|
|
438
499
|
|
|
439
500
|
This function adds new rows to a dataframe using various strategies:
|
|
@@ -441,7 +502,7 @@ def augment(
|
|
|
441
502
|
- "increment": Increment numeric or pattern-based values
|
|
442
503
|
- "range:min-max": Random integers within range
|
|
443
504
|
- "choice:[...]": Random selection from inline list
|
|
444
|
-
- "
|
|
505
|
+
- "lists@variable_name": Inline linked lists (generates multiple columns)
|
|
445
506
|
- "forecast:method": Time series forecasting (linear, polynomial, exponential, seasonal)
|
|
446
507
|
- "normal": Normal distribution generation
|
|
447
508
|
- "uniform": Uniform distribution generation
|
|
@@ -465,7 +526,6 @@ def augment(
|
|
|
465
526
|
"emp_id": "increment:EMP_[001]_ID",
|
|
466
527
|
"age": "range:18-65",
|
|
467
528
|
"status": "choice:[Active,Inactive,Pending]",
|
|
468
|
-
"bank": "choice_list:banks",
|
|
469
529
|
"sales": "forecast:seasonal:period=12",
|
|
470
530
|
"score": "normal:mean=75:std=10",
|
|
471
531
|
"income": "skewed_right:skewness=1.5"
|
additory/utilities/units.py
CHANGED
|
@@ -255,7 +255,10 @@ class UnitConverter:
|
|
|
255
255
|
"""Unit conversion system with Polars processing"""
|
|
256
256
|
|
|
257
257
|
def __init__(self):
|
|
258
|
-
|
|
258
|
+
try:
|
|
259
|
+
self.arrow_bridge = EnhancedArrowBridge()
|
|
260
|
+
except ArrowBridgeError:
|
|
261
|
+
self.arrow_bridge = None
|
|
259
262
|
self.conversion_stats = {
|
|
260
263
|
"total_conversions": 0,
|
|
261
264
|
"successful_conversions": 0,
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: additory
|
|
3
|
-
Version: 0.1.0a2
|
|
4
|
-
Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data
|
|
3
|
+
Version: 0.1.0a3
|
|
4
|
+
Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.
|
|
5
5
|
Author: Krishnamoorthy Sankaran
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: homepage, https://github.com/sekarkrishna/additory
|
|
@@ -13,6 +13,7 @@ Description-Content-Type: text/markdown
|
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
Requires-Dist: pandas>=1.5
|
|
15
15
|
Requires-Dist: polars>=0.20
|
|
16
|
+
Requires-Dist: pyarrow>=10.0
|
|
16
17
|
Requires-Dist: pyyaml>=6.0
|
|
17
18
|
Requires-Dist: requests>=2.31
|
|
18
19
|
Requires-Dist: toml>=0.10
|
|
@@ -34,11 +35,11 @@ Dynamic: license-file
|
|
|
34
35
|
|
|
35
36
|
# Additory
|
|
36
37
|
|
|
37
|
-
**A semantic, extensible dataframe transformation engine with expressions, lookup,
|
|
38
|
+
**A semantic, extensible dataframe transformation engine with expressions, lookup, and augmentation support.**
|
|
38
39
|
|
|
39
40
|
[](https://www.python.org/downloads/)
|
|
40
41
|
[](https://opensource.org/licenses/MIT)
|
|
41
|
-
[](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
|
|
42
43
|
|
|
43
44
|
**Author:** Krishnamoorthy Sankaran
|
|
44
45
|
|
|
@@ -51,17 +52,17 @@ Dynamic: license-file
|
|
|
51
52
|
## 📦 Installation
|
|
52
53
|
|
|
53
54
|
```bash
|
|
54
|
-
pip install additory==0.1.0a1
|
|
55
|
+
pip install additory==0.1.0a2
|
|
55
56
|
```
|
|
56
57
|
|
|
57
58
|
**Optional GPU support:**
|
|
58
59
|
```bash
|
|
59
|
-
pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
|
|
60
|
+
pip install additory[gpu]==0.1.0a2 # Includes cuDF for GPU acceleration
|
|
60
61
|
```
|
|
61
62
|
|
|
62
63
|
**Development installation:**
|
|
63
64
|
```bash
|
|
64
|
-
pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
65
|
+
pip install additory[dev]==0.1.0a2 # Includes testing and development tools
|
|
65
66
|
```
|
|
66
67
|
|
|
67
68
|
## 🎯 Core Functions
|
|
@@ -70,7 +71,6 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
|
70
71
|
|----------|---------|---------|
|
|
71
72
|
| `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
|
|
72
73
|
| `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
|
|
73
|
-
| `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
|
|
74
74
|
| `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
|
|
75
75
|
|
|
76
76
|
## 🧬 Available Expressions
|
|
@@ -193,13 +193,9 @@ patients_with_bsa = add.bsa(patients)
|
|
|
193
193
|
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
194
194
|
```
|
|
195
195
|
|
|
196
|
-
### 🔄 Augment
|
|
196
|
+
### 🔄 Augment Data Generation
|
|
197
197
|
|
|
198
|
-
**Augment** generates
|
|
199
|
-
|
|
200
|
-
**Key Differences:**
|
|
201
|
-
- **Augment**: Learns patterns from existing data to create similar rows
|
|
202
|
-
- **Synthetic**: Uses predefined schemas to generate structured data
|
|
198
|
+
**Augment** generates additional data similar to your existing dataset using inline strategies.
|
|
203
199
|
|
|
204
200
|
```python
|
|
205
201
|
# Augment existing data (learns from patterns)
|
|
@@ -211,9 +207,6 @@ new_data = add.augment("@new", n_rows=500, strategy={
|
|
|
211
207
|
'name': 'choice:[John,Jane,Bob]',
|
|
212
208
|
'age': 'range:18-65'
|
|
213
209
|
})
|
|
214
|
-
|
|
215
|
-
# Generate from schema file (structured approach)
|
|
216
|
-
customers = add.synth("customer_schema.toml", rows=10000)
|
|
217
210
|
```
|
|
218
211
|
|
|
219
212
|
## 🧪 Examples
|