additory 0.1.0a1__py3-none-any.whl → 0.1.0a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/polars_expression_engine.py +66 -16
- additory/dynamic_api.py +42 -46
- additory/expressions/proxy.py +4 -1
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +11 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/METADATA +12 -17
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a1.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Column Name Resolver for Linked Lists
|
|
3
|
+
|
|
4
|
+
Resolves column names for linked lists using priority order:
|
|
5
|
+
1. Column_Names row (explicit names)
|
|
6
|
+
2. Underscore parsing from list name
|
|
7
|
+
3. Fallback to {strategy_key}_1, {strategy_key}_2, etc.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from typing import List, Optional
|
|
11
|
+
import warnings
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def parse_column_names_from_underscores(list_name: str) -> Optional[List[str]]:
    """
    Derive column names by splitting a list name on underscores.

    Empty segments (produced by leading, trailing, or doubled underscores)
    are discarded.

    Args:
        list_name: Name of the list variable (e.g., "AE_CM_SEV")

    Returns:
        The non-empty underscore-separated segments, or None when the name
        contains no underscore or consists only of underscores

    Examples:
        >>> parse_column_names_from_underscores("AE_CM_SEV")
        ['AE', 'CM', 'SEV']

        >>> parse_column_names_from_underscores("adverse_event_medication")
        ['adverse', 'event', 'medication']

        >>> parse_column_names_from_underscores("adverseconmed") is None
        True
    """
    if '_' in list_name:
        # filter(None, ...) drops the empty strings left by adjacent underscores
        segments = list(filter(None, list_name.split('_')))
        if segments:
            return segments
    return None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def generate_fallback_column_names(strategy_key: str, num_columns: int) -> List[str]:
    """
    Build positional column names of the form ``{strategy_key}_{n}``.

    Numbering starts at 1, so the result is
    ``{strategy_key}_1 ... {strategy_key}_{num_columns}``.

    Args:
        strategy_key: Key from strategy dict (e.g., "col1")
        num_columns: Number of column names to produce

    Returns:
        List of ``num_columns`` names (empty when ``num_columns`` is 0)

    Examples:
        >>> generate_fallback_column_names("col1", 3)
        ['col1_1', 'col1_2', 'col1_3']

        >>> generate_fallback_column_names("adverse_events", 2)
        ['adverse_events_1', 'adverse_events_2']
    """
    positions = range(1, num_columns + 1)
    return [f"{strategy_key}_{position}" for position in positions]
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def resolve_column_names(
    list_name: str,
    strategy_key: str,
    num_columns: int,
    explicit_names: Optional[List[str]] = None
) -> List[str]:
    """
    Pick column names for a linked list, trying three sources in order.

    Sources, highest priority first:
    1. ``explicit_names`` (taken from a Column_Names row)
    2. The underscore-separated parts of ``list_name``, used only when
       their count equals ``num_columns``
    3. Positional fallback names ``{strategy_key}_1``, ``{strategy_key}_2``, ...

    A ``UserWarning`` is emitted whenever the fallback (source 3) is used.

    Args:
        list_name: Name of the list variable
        strategy_key: Key from strategy dict
        num_columns: Number of columns to generate
        explicit_names: Explicit column names from Column_Names row (optional)

    Returns:
        List of column names; when ``explicit_names`` is used, the very same
        list object is returned

    Raises:
        ValueError: If explicit_names count doesn't match num_columns

    Examples:
        >>> resolve_column_names("AE_CM", "col1", 2, ["adverse_event", "medication"])
        ['adverse_event', 'medication']

        >>> resolve_column_names("AE_CM_SEV", "col1", 3)
        ['AE', 'CM', 'SEV']
    """
    # Source 1: explicit names win outright, but their count must be exact.
    if explicit_names is not None:
        if len(explicit_names) == num_columns:
            return explicit_names
        raise ValueError(
            f"Column_Names row has {len(explicit_names)} names but "
            f"linked list generates {num_columns} columns. They must match."
        )

    # Source 2: names embedded in the list name itself.
    underscore_names = parse_column_names_from_underscores(list_name)
    if underscore_names is not None and len(underscore_names) == num_columns:
        return underscore_names

    # Source 3: fallback. Explain why the earlier sources were not usable.
    if underscore_names is None:
        message = (
            f"List name '{list_name}' has no underscores. "
            f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
            f"Suggestion: Use underscore-delimited naming (e.g., 'AE_CM_SEV') "
            f"or add a Column_Names row:\n"
            f" {list_name} = [\n"
            f" ['Column_Names:[col1,col2,col3]'],\n"
            f" ['primary', ['attr1'], ['attr2']]\n"
            f" ]"
        )
    else:
        message = (
            f"List name '{list_name}' has {len(underscore_names)} underscore-separated "
            f"parts but generates {num_columns} columns. "
            f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
            f"Suggestion: Use a list name with {num_columns-1} underscores, "
            f"or add a Column_Names row for explicit naming."
        )
    warnings.warn(message, UserWarning)

    return generate_fallback_column_names(strategy_key, num_columns)
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
"""
|
|
2
|
-
Distribution Strategies for Data
|
|
2
|
+
Distribution Strategies for Synthetic Data Generation
|
|
3
3
|
|
|
4
4
|
DEPRECATED: This module has been moved to additory.common.distributions
|
|
5
5
|
Please update your imports to use additory.common.distributions instead.
|
|
@@ -11,7 +11,7 @@ import warnings
|
|
|
11
11
|
|
|
12
12
|
# Issue deprecation warning
|
|
13
13
|
warnings.warn(
|
|
14
|
-
"additory.
|
|
14
|
+
"additory.synthetic.distributions is deprecated. "
|
|
15
15
|
"Please use additory.common.distributions instead. "
|
|
16
16
|
"This module will be removed in a future version.",
|
|
17
17
|
DeprecationWarning,
|
|
@@ -0,0 +1,415 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Linked List Parser for Synthetic Data Generation
|
|
3
|
+
|
|
4
|
+
Parses linked lists with optional special rows:
|
|
5
|
+
- Column_Names:[name1,name2,name3] - Explicit column names
|
|
6
|
+
- Regex:[pattern1], Regex:[pattern2], ... - Regex patterns for testing
|
|
7
|
+
|
|
8
|
+
Generates cartesian product of primary key pattern:
|
|
9
|
+
[primary_key, [attr1_values], [attr2_values], ...]
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from itertools import product
|
|
13
|
+
from typing import List, Tuple, Optional, Dict, Any
|
|
14
|
+
import re
|
|
15
|
+
import random
|
|
16
|
+
import string
|
|
17
|
+
|
|
18
|
+
from additory.common.exceptions import ValidationError
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def parse_column_names_row(row: List[str]) -> Optional[List[str]]:
    """
    Extract explicit column names from a Column_Names row.

    Expected row shape: ["Column_Names:[name1,name2,name3]"]. Only the first
    element of the row is inspected.

    Args:
        row: Candidate row; it is treated as a Column_Names row when its
            first element is a string starting with "Column_Names:"
            (after stripping surrounding whitespace)

    Returns:
        The comma-separated names with whitespace trimmed, or None when the
        row is empty or is not a Column_Names row

    Raises:
        ValidationError: If the row starts with "Column_Names:" but has no
            bracketed name list, or if any name is empty

    Examples:
        >>> parse_column_names_row(["Column_Names:[AE,CM,SEV]"])
        ['AE', 'CM', 'SEV']

        >>> parse_column_names_row(["Column_Names:[adverse_event,medication]"])
        ['adverse_event', 'medication']
    """
    if not row:
        return None

    head = row[0]
    if not isinstance(head, str):
        return None

    first_elem = head.strip()
    if not first_elem.startswith("Column_Names:"):
        return None

    # Capture the text between the brackets: "Column_Names:[AE,CM]" -> "AE,CM"
    bracketed = re.search(r'Column_Names:\[([^\]]+)\]', first_elem)
    if bracketed is None:
        raise ValidationError(
            f"Invalid Column_Names format: '{first_elem}'. "
            "Expected format: 'Column_Names:[name1,name2,name3]'"
        )

    column_names = [piece.strip() for piece in bracketed.group(1).split(',')]

    if not column_names or not all(column_names):
        raise ValidationError(
            f"Column names cannot be empty. Got: {column_names}"
        )

    return column_names
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def parse_regex_row(row: List[str]) -> Optional[List[str]]:
    """
    Extract regex patterns from a Regex row.

    Expected row shape: ["Regex:[A-Z]{3,10}", "Regex:[A-Za-z0-9]{8,50}", ...].
    A row counts as a Regex row when its first element is a string that
    starts with "Regex:" after stripping whitespace.

    Args:
        row: Candidate row whose elements may start with "Regex:"

    Returns:
        One entry per row element: the text after "Regex:" for prefixed
        strings, and ``str(element)`` for anything else (mixed rows).
        None when the row is empty or is not a Regex row

    Examples:
        >>> parse_regex_row(["Regex:[A-Z]{3,10}", "Regex:[A-Za-z0-9]{8,50}"])
        ['[A-Z]{3,10}', '[A-Za-z0-9]{8,50}']

        >>> parse_regex_row(["Headache", ["Aspirin"]]) is None
        True
    """
    marker = "Regex:"

    if not row or not isinstance(row[0], str):
        return None
    if not row[0].strip().startswith(marker):
        return None

    # Prefixed strings lose the marker; every other element is stringified
    # unchanged so the result keeps one entry per row element.
    return [
        cell.strip()[len(marker):]
        if isinstance(cell, str) and cell.strip().startswith(marker)
        else str(cell)
        for cell in row
    ]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def generate_from_regex(pattern: str, seed: Optional[int] = None) -> str:
    """
    Generate a string for a simplified ``<charset>{min,max}`` regex pattern.

    Only a single character class followed by a ``{min,max}`` quantifier at
    the start of the pattern is understood; anything after it is ignored.
    The generated string always has the maximum length (``max``), which is
    intended for edge-case testing; ``min`` is parsed but not used.

    Supported character classes:
    - [A-Z]{n,m} - Uppercase letters
    - [a-z]{n,m} - Lowercase letters
    - [0-9]{n,m}, [\\d]{n,m} or \\d{n,m} - Digits
    - [A-Za-z]{n,m} - Letters
    - [A-Za-z0-9]{n,m} - Alphanumeric
    Any other bracketed class falls back to alphanumeric characters.

    Args:
        pattern: Regex pattern
        seed: Random seed for reproducibility. When given, the module-level
            ``random`` generator is re-seeded before generating, so repeated
            calls with the same seed and pattern return the same string

    Returns:
        A random string of length ``max`` drawn from the character class, or
        ``"REGEX_"`` followed by the first 10 characters of ``pattern`` when
        the pattern is not recognised

    Examples:
        >>> s = generate_from_regex('[A-Z]{3,10}', seed=42)
        >>> len(s), s.isupper()
        (10, True)

        >>> d = generate_from_regex('\\\\d{1,3}', seed=42)
        >>> len(d), d.isdigit()
        (3, True)

        >>> generate_from_regex('not-a-pattern')
        'REGEX_not-a-patt'
    """
    if seed is not None:
        random.seed(seed)

    # Accept either a bracketed class "[...]" or a bare "\d", followed by
    # "{min,max}". The bare "\d" form is documented above and would
    # otherwise fall through to the marker fallback.
    match = re.match(r'(?:\[([^\]]+)\]|(\\d))\{(\d+),(\d+)\}', pattern)
    if not match:
        # Fallback: return pattern as-is with marker
        return f"REGEX_{pattern[:10]}"

    charset_def = match.group(1) or match.group(2)
    max_len = int(match.group(4))

    alphanumeric = string.ascii_letters + string.digits
    known_charsets = {
        'A-Z': string.ascii_uppercase,
        'a-z': string.ascii_lowercase,
        '0-9': string.digits,
        '\\d': string.digits,
        'A-Za-z': string.ascii_letters,
        'A-Za-z0-9': alphanumeric,
    }
    # Unrecognised classes degrade to alphanumeric rather than failing.
    charset = known_charsets.get(charset_def, alphanumeric)

    # Always emit the maximum length (edge-case testing).
    return ''.join(random.choice(charset) for _ in range(max_len))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def parse_data_rows(rows: List[List]) -> List[Tuple]:
    """
    Expand primary-key rows into the cartesian product of their attributes.

    Each row has the shape [primary_key, [attr1_values], [attr2_values], ...].
    For every row, one tuple is produced per combination of one value from
    each attribute list, prefixed with that row's primary key. A row holding
    only a primary key contributes the single tuple ``(primary_key,)``.

    Args:
        rows: List of data rows

    Returns:
        List of tuples, in row order and then in ``itertools.product`` order

    Raises:
        ValidationError: If ``rows`` is empty, a row is not a list or is
            empty, rows differ in length, or an attribute entry is not a
            non-empty list

    Examples:
        >>> rows = [
        ...     ["Headache", ["Aspirin", "Ibuprofen"], ["mild", "moderate"]],
        ...     ["Nausea", ["Ondansetron"], ["severe"]]
        ... ]
        >>> combinations = parse_data_rows(rows)
        >>> len(combinations)
        5
        >>> combinations[0]
        ('Headache', 'Aspirin', 'mild')
    """
    if not rows:
        raise ValidationError("No data rows found in linked list")

    expanded = []
    width = None  # element count of the first row; all later rows must match

    for row_no, row in enumerate(rows, start=1):
        if not isinstance(row, list):
            raise ValidationError(
                f"Row {row_no} is not a list. Expected format: "
                "[primary_key, [attr1_values], [attr2_values], ...]"
            )

        if not row:
            raise ValidationError(f"Row {row_no} is empty")

        # Strict structural check: every row must be as wide as the first.
        if width is None:
            width = len(row)
        elif len(row) != width:
            raise ValidationError(
                f"Row {row_no} has {len(row)} elements, expected {width}. "
                "All data rows must have the same structure.\n"
                f"Row {row_no}: {row}\n"
                "Expected format: [primary_key, [attr1_values], [attr2_values], ...]"
            )

        primary_key, attribute_lists = row[0], row[1:]

        for attr_no, values in enumerate(attribute_lists, start=1):
            if not isinstance(values, list):
                raise ValidationError(
                    f"Row {row_no}, attribute {attr_no} is not a list. "
                    f"Got: {type(values).__name__}. "
                    "Expected format: [primary_key, [attr1_values], [attr2_values], ...]"
                )
            if not values:
                raise ValidationError(
                    f"Row {row_no}, attribute {attr_no} is an empty list"
                )

        # product() with no iterables yields a single empty tuple, so a
        # primary-key-only row naturally becomes (primary_key,).
        expanded.extend(
            (primary_key, *combo) for combo in product(*attribute_lists)
        )

    return expanded
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def parse_linked_list(data: List) -> Dict[str, Any]:
    """
    Split a linked list into its special rows and expanded data rows.

    Special rows may appear anywhere in ``data`` (at most one of each):
    - Column_Names:[name1,name2,name3]
    - Regex:[pattern1], Regex:[pattern2], ...
    Every other row is treated as a data row and expanded through
    ``parse_data_rows``.

    NOTE(review): the number of regex patterns is not checked against the
    width of the data rows; only the Column_Names count is validated.

    Args:
        data: Linked list data

    Returns:
        Dictionary with:
        - column_names: List of column names (or None)
        - has_regex: Boolean indicating if regex row present
        - regex_patterns: List of regex patterns (or None)
        - combinations: List of tuples (cartesian product); empty when
          there are no data rows
        - num_columns: Width of the first combination, or the number of
          regex patterns when there are no data rows

    Raises:
        ValidationError: If ``data`` is empty or not a list, a row is not a
            list, a special row is duplicated, there is neither a data row
            nor a Regex row, or the Column_Names count differs from
            ``num_columns``

    Examples:
        >>> data = [
        ...     ["Column_Names:[AE,CM,SEV]"],
        ...     ["Regex:[A-Z]{3,10}", "Regex:[A-Za-z0-9]{8,50}", "Regex:[0-9]{1,3}"],
        ...     ["Headache", ["Aspirin"], ["mild"]]
        ... ]
        >>> result = parse_linked_list(data)
        >>> result['column_names']
        ['AE', 'CM', 'SEV']
        >>> result['has_regex']
        True
        >>> len(result['combinations'])
        1
    """
    if not data or not isinstance(data, list):
        raise ValidationError(
            "Linked list must be a non-empty list. "
            "Expected format: [[primary_key, [attr1_values], ...], ...]"
        )

    column_names = None
    regex_patterns = None
    has_regex = False
    data_rows = []

    # Classify each row: Column_Names row, Regex row, or plain data row.
    for row in data:
        if not isinstance(row, list):
            raise ValidationError(
                f"Each row must be a list. Got: {type(row).__name__}"
            )

        names_in_row = parse_column_names_row(row)
        if names_in_row is not None:
            if column_names is not None:
                raise ValidationError(
                    "Multiple Column_Names rows found. Only one is allowed."
                )
            column_names = names_in_row
            continue

        patterns_in_row = parse_regex_row(row)
        if patterns_in_row is not None:
            if has_regex:
                raise ValidationError(
                    "Multiple Regex rows found. Only one is allowed."
                )
            has_regex, regex_patterns = True, patterns_in_row
            continue

        data_rows.append(row)

    if not (data_rows or has_regex):
        raise ValidationError(
            "No data rows found. Linked list must contain at least one data row "
            "or a Regex row."
        )

    combinations = parse_data_rows(data_rows) if data_rows else []

    # Column count comes from the data when present, else from the regex row.
    if combinations:
        num_columns = len(combinations[0])
    elif regex_patterns:
        num_columns = len(regex_patterns)
    else:
        raise ValidationError("Cannot determine number of columns")

    if column_names and len(column_names) != num_columns:
        raise ValidationError(
            f"Column_Names row has {len(column_names)} names but data has "
            f"{num_columns} columns. They must match."
        )

    return {
        'column_names': column_names,
        'has_regex': has_regex,
        'regex_patterns': regex_patterns,
        'combinations': combinations,
        'num_columns': num_columns
    }
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def generate_linked_list_data(
    parsed_data: Dict[str, Any],
    n_rows: int,
    seed: Optional[int] = None
) -> List[Tuple]:
    """
    Generate data rows from parsed linked list.

    Handles three scenarios:
    1. Both regex + data: First row regex, rest sampled from combinations
    2. Regex only: All rows regex
    3. Data only: All rows sampled (with replacement) from combinations

    The module-level ``random`` generator is seeded once, up front, when
    ``seed`` is given. Regex rows are then drawn from that single stream, so
    a seeded run is reproducible while successive regex rows still differ
    from one another (re-seeding per pattern would make every regex row
    identical).

    Args:
        parsed_data: Output from parse_linked_list(); the keys 'has_regex',
            'regex_patterns' and 'combinations' are read
        n_rows: Number of rows to generate; values <= 0 yield an empty list
        seed: Random seed for reproducibility

    Returns:
        List of tuples (one per row), at most ``n_rows`` long. Empty when
        there is neither a regex row nor any combination

    Examples:
        >>> parsed = parse_linked_list([["Headache", ["Aspirin"], ["mild"]]])
        >>> rows = generate_linked_list_data(parsed, n_rows=3, seed=42)
        >>> len(rows)
        3
        >>> rows[0]
        ('Headache', 'Aspirin', 'mild')
    """
    if seed is not None:
        random.seed(seed)

    # Nothing requested: never emit the leading regex row in that case.
    if n_rows <= 0:
        return []

    has_regex = parsed_data['has_regex']
    regex_patterns = parsed_data['regex_patterns']
    combinations = parsed_data['combinations']

    def _regex_row() -> Tuple:
        # No per-call seed: keep consuming the stream seeded above.
        return tuple(generate_from_regex(pattern) for pattern in regex_patterns)

    results = []
    remaining = n_rows

    if has_regex:
        # Scenario 2: regex only, every row comes from the patterns.
        if not combinations:
            return [_regex_row() for _ in range(n_rows)]

        # Scenario 1: a single leading regex row, then sampled data rows.
        results.append(_regex_row())
        remaining -= 1

    # Scenario 1 & 3: sample from combinations
    if combinations:
        results.extend(random.choice(combinations) for _ in range(remaining))

    return results
|