additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
|
@@ -1,415 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Linked List Parser for Synthetic Data Generation
|
|
3
|
-
|
|
4
|
-
Parses linked lists with optional special rows:
|
|
5
|
-
- Column_Names:[name1,name2,name3] - Explicit column names
|
|
6
|
-
- Regex:[pattern1], Regex:[pattern2], ... - Regex patterns for testing
|
|
7
|
-
|
|
8
|
-
Generates cartesian product of primary key pattern:
|
|
9
|
-
[primary_key, [attr1_values], [attr2_values], ...]
|
|
10
|
-
"""
|
|
11
|
-
|
|
12
|
-
from itertools import product
|
|
13
|
-
from typing import List, Tuple, Optional, Dict, Any
|
|
14
|
-
import re
|
|
15
|
-
import random
|
|
16
|
-
import string
|
|
17
|
-
|
|
18
|
-
from additory.common.exceptions import ValidationError
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
def parse_column_names_row(row: List[str]) -> Optional[List[str]]:
    """
    Extract explicit column names from a ``Column_Names`` special row.

    Expected shape: ``["Column_Names:[name1,name2,name3]"]``. Only the first
    element of the row is inspected.

    Args:
        row: Candidate row taken from a linked list.

    Returns:
        The whitespace-stripped column names, or None when the row is empty,
        its first element is not a string, or that element does not start
        with "Column_Names:".

    Raises:
        ValidationError: If the prefix is present but no non-empty bracketed
            list follows it, or if any individual name is empty.

    Examples:
        >>> parse_column_names_row(["Column_Names:[AE,CM,SEV]"])
        ['AE', 'CM', 'SEV']

        >>> parse_column_names_row(["Column_Names:[adverse_event,medication]"])
        ['adverse_event', 'medication']
    """
    if not row:
        return None

    head = row[0]
    if not isinstance(head, str):
        return None

    spec = head.strip()
    if not spec.startswith("Column_Names:"):
        return None

    # Capture the text between the brackets: "Column_Names:[AE,CM]" -> "AE,CM"
    found = re.search(r'Column_Names:\[([^\]]+)\]', spec)
    if found is None:
        raise ValidationError(
            f"Invalid Column_Names format: '{spec}'. "
            "Expected format: 'Column_Names:[name1,name2,name3]'"
        )

    column_names = []
    for piece in found.group(1).split(','):
        column_names.append(piece.strip())

    # A blank entry (e.g. "a,,b") strips down to an empty string.
    if not column_names or not all(column_names):
        raise ValidationError(
            f"Column names cannot be empty. Got: {column_names}"
        )

    return column_names
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def parse_regex_row(row: List[str]) -> Optional[List[str]]:
    """
    Extract regex patterns from a ``Regex`` special row.

    Expected shape: ``["Regex:[A-Z]{3,10}", "Regex:[A-Za-z0-9]{8,50}", ...]``.
    The row is recognised by its first element only.

    Args:
        row: Candidate row taken from a linked list.

    Returns:
        One entry per row element, or None when the row is empty or its first
        element is not a string starting with "Regex:". Elements carrying the
        "Regex:" prefix are stripped and returned without it; any other
        element is passed through as ``str(element)``.

    Examples:
        >>> parse_regex_row(["Regex:[A-Z]{3,10}", "Regex:[A-Za-z0-9]{8,50}"])
        ['[A-Z]{3,10}', '[A-Za-z0-9]{8,50}']

        >>> parse_regex_row(["Headache", ["Aspirin"]]) is None
        True
    """
    marker = "Regex:"

    if not row or not isinstance(row[0], str):
        return None
    if not row[0].strip().startswith(marker):
        return None

    # Mixed rows are tolerated: entries without the marker are kept verbatim
    # (stringified) so the result still has one entry per column.
    return [
        item.strip()[len(marker):]
        if isinstance(item, str) and item.strip().startswith(marker)
        else str(item)
        for item in row
    ]
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
def generate_from_regex(pattern: str, seed: Optional[int] = None) -> str:
    """
    Generate a string for a simplified regex pattern.

    Supported form is a single character class followed by a ``{min,max}``
    quantifier, matched at the start of ``pattern``:
    - [A-Z]{n,m} - Uppercase letters
    - [a-z]{n,m} - Lowercase letters
    - [0-9]{n,m}, [\\d]{n,m} or \\d{n,m} - Digits
    - [A-Za-z]{n,m} - Letters
    - [A-Za-z0-9]{n,m} - Alphanumeric
    - Other simple classes built from literals and ranges, e.g. [A-F0-9]{n,m}

    The generated string always has the maximum length ``m`` (intended for
    edge-case testing); ``n`` is parsed but not otherwise used.

    Args:
        pattern: Regex pattern.
        seed: Random seed for reproducibility. When given, a private
            generator is used so the module-level ``random`` state is not
            reset; when None, characters are drawn from the module-level
            generator.

    Returns:
        Generated string, or ``"REGEX_" + pattern[:10]`` when the pattern is
        not in the supported form.

    Examples:
        >>> len(generate_from_regex('[A-Z]{3,10}', seed=42))
        10

        >>> generate_from_regex('[0-9]{1,3}', seed=42).isdigit()
        True
    """
    # random.Random(seed) yields the same sequence as random.seed(seed) would,
    # without clobbering global state other callers may be relying on.
    rng = random.Random(seed) if seed is not None else random

    # Either "[charset]{min,max}" or the bare shorthand "\d{min,max}".
    match = re.match(r'(?:\[([^\]]+)\]|(\\d))\{(\d+),(\d+)\}', pattern)
    if not match:
        # Fallback: return pattern as-is with marker
        return f"REGEX_{pattern[:10]}"

    charset_def = match.group(1) or match.group(2)
    max_len = int(match.group(4))

    # Well-known classes keep their historical character order so seeded
    # output stays stable.
    known_charsets = {
        'A-Z': string.ascii_uppercase,
        'a-z': string.ascii_lowercase,
        '0-9': string.digits,
        '\\d': string.digits,
        'A-Za-z': string.ascii_letters,
        'A-Za-z0-9': string.ascii_letters + string.digits,
    }
    charset = known_charsets.get(charset_def)
    if charset is None:
        # Try to honour the class as written; if it is not understood, fall
        # back to alphanumerics.
        charset = _expand_charset(charset_def) or (
            string.ascii_letters + string.digits
        )

    return ''.join(rng.choice(charset) for _ in range(max_len))


def _expand_charset(charset_def: str) -> Optional[str]:
    """
    Expand the body of a simple character class into its member characters.

    Handles literal characters, ``x-y`` ranges, ``\\d``, ``\\w`` and escaped
    punctuation. Returns None for anything else (negated classes, reversed
    ranges, other alphanumeric escapes) so the caller can fall back.
    """
    if charset_def.startswith('^'):
        return None

    members = []
    pos = 0
    size = len(charset_def)
    while pos < size:
        current = charset_def[pos]
        if current == '\\':
            if pos + 1 >= size:
                return None
            escaped = charset_def[pos + 1]
            if escaped == 'd':
                members.extend(string.digits)
            elif escaped == 'w':
                members.extend(string.ascii_letters + string.digits + '_')
            elif escaped.isalnum():
                # Class escapes such as \s or \b are not supported here.
                return None
            else:
                members.append(escaped)
            pos += 2
        elif pos + 2 < size and charset_def[pos + 1] == '-':
            low, high = ord(current), ord(charset_def[pos + 2])
            if low > high:
                return None
            members.extend(chr(code) for code in range(low, high + 1))
            pos += 3
        else:
            members.append(current)
            pos += 1

    # Drop duplicates (overlapping ranges) while keeping first-seen order.
    return ''.join(dict.fromkeys(members))
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
def parse_data_rows(rows: List[List]) -> List[Tuple]:
    """
    Expand primary-key rows into the cartesian product of their attributes.

    Each row has the shape ``[primary_key, [attr1_values], [attr2_values], ...]``
    and contributes one tuple per combination of its attribute values, always
    led by the primary key. Results keep the input row order.

    Args:
        rows: Data rows to expand.

    Returns:
        List of tuples covering every combination of every row.

    Raises:
        ValidationError: If ``rows`` is empty, a row is not a list or is
            empty, rows differ in length, or an attribute entry is not a
            non-empty list.

    Examples:
        >>> rows = [
        ...     ["Headache", ["Aspirin", "Ibuprofen"], ["mild", "moderate"]],
        ...     ["Nausea", ["Ondansetron"], ["severe"]]
        ... ]
        >>> combinations = parse_data_rows(rows)
        >>> len(combinations)
        5
        >>> combinations[0]
        ('Headache', 'Aspirin', 'mild')
    """
    if not rows:
        raise ValidationError("No data rows found in linked list")

    expanded = []
    width = None  # element count of the first row; every later row must match

    for row_no, row in enumerate(rows, start=1):
        if not isinstance(row, list):
            raise ValidationError(
                f"Row {row_no} is not a list. Expected format: "
                "[primary_key, [attr1_values], [attr2_values], ...]"
            )

        if not row:
            raise ValidationError(f"Row {row_no} is empty")

        # Strict structural check: all rows share the first row's length.
        if width is None:
            width = len(row)
        elif len(row) != width:
            raise ValidationError(
                f"Row {row_no} has {len(row)} elements, expected {width}. "
                "All data rows must have the same structure.\n"
                f"Row {row_no}: {row}\n"
                "Expected format: [primary_key, [attr1_values], [attr2_values], ...]"
            )

        key, *value_lists = row

        for attr_no, values in enumerate(value_lists, start=1):
            if not isinstance(values, list):
                raise ValidationError(
                    f"Row {row_no}, attribute {attr_no} is not a list. "
                    f"Got: {type(values).__name__}. "
                    "Expected format: [primary_key, [attr1_values], [attr2_values], ...]"
                )
            if not values:
                raise ValidationError(
                    f"Row {row_no}, attribute {attr_no} is an empty list"
                )

        # product() over zero iterables yields one empty tuple, so a row with
        # only a primary key contributes exactly (key,).
        expanded.extend((key, *combo) for combo in product(*value_lists))

    return expanded
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
def parse_linked_list(data: List) -> Dict[str, Any]:
    """
    Split a linked list into its special rows and data rows, then expand it.

    Special rows may appear anywhere in the list, at most once each:
    - Column_Names:[name1,name2,name3]
    - Regex:[pattern1], Regex:[pattern2], ...

    Every other row is treated as a data row and expanded with
    parse_data_rows().

    Args:
        data: Linked list data.

    Returns:
        Dictionary with:
        - column_names: List of column names (or None)
        - has_regex: Boolean indicating if a Regex row is present
        - regex_patterns: List of regex patterns (or None)
        - combinations: List of tuples (cartesian product; empty when there
          are no data rows)
        - num_columns: Width of the first combination, or the number of regex
          patterns when there are no combinations

    Raises:
        ValidationError: If ``data`` is empty or not a list, a row is not a
            list, a special row appears more than once, there are neither
            data rows nor a Regex row, or the Column_Names count does not
            match ``num_columns``.

    Examples:
        >>> data = [
        ...     ["Column_Names:[AE,CM,SEV]"],
        ...     ["Regex:[A-Z]{3,10}", "Regex:[A-Za-z0-9]{8,50}", "Regex:[0-9]{1,3}"],
        ...     ["Headache", ["Aspirin"], ["mild"]]
        ... ]
        >>> result = parse_linked_list(data)
        >>> result['column_names']
        ['AE', 'CM', 'SEV']
        >>> result['has_regex']
        True
        >>> len(result['combinations'])
        1
    """
    if not (data and isinstance(data, list)):
        raise ValidationError(
            "Linked list must be a non-empty list. "
            "Expected format: [[primary_key, [attr1_values], ...], ...]"
        )

    column_names = None
    regex_patterns = None
    has_regex = False
    plain_rows = []

    # Classify each row: Column_Names first, then Regex, otherwise data.
    for entry in data:
        if not isinstance(entry, list):
            raise ValidationError(
                f"Each row must be a list. Got: {type(entry).__name__}"
            )

        names = parse_column_names_row(entry)
        if names is not None:
            if column_names is not None:
                raise ValidationError(
                    "Multiple Column_Names rows found. Only one is allowed."
                )
            column_names = names
            continue

        found_patterns = parse_regex_row(entry)
        if found_patterns is None:
            plain_rows.append(entry)
            continue

        if has_regex:
            raise ValidationError(
                "Multiple Regex rows found. Only one is allowed."
            )
        has_regex = True
        regex_patterns = found_patterns

    if not (plain_rows or has_regex):
        raise ValidationError(
            "No data rows found. Linked list must contain at least one data row "
            "or a Regex row."
        )

    combinations = parse_data_rows(plain_rows) if plain_rows else []

    # Column count comes from the data when there is any, else from the
    # regex row.
    if combinations:
        num_columns = len(combinations[0])
    elif regex_patterns:
        num_columns = len(regex_patterns)
    else:
        raise ValidationError("Cannot determine number of columns")

    if column_names and len(column_names) != num_columns:
        raise ValidationError(
            f"Column_Names row has {len(column_names)} names but data has "
            f"{num_columns} columns. They must match."
        )

    return {
        'column_names': column_names,
        'has_regex': has_regex,
        'regex_patterns': regex_patterns,
        'combinations': combinations,
        'num_columns': num_columns
    }
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
def generate_linked_list_data(
    parsed_data: Dict[str, Any],
    n_rows: int,
    seed: Optional[int] = None
) -> List[Tuple]:
    """
    Generate data rows from a parsed linked list.

    Handles three scenarios:
    1. Both regex + data: First row regex, rest from combinations
    2. Regex only: All rows regex
    3. Data only: All rows from combinations

    Args:
        parsed_data: Output from parse_linked_list(); the keys 'has_regex',
            'regex_patterns' and 'combinations' are read.
        n_rows: Number of rows to generate. Zero or a negative value yields
            an empty list.
        seed: Random seed for reproducibility. The module-level generator is
            seeded once per call; regex strings and sampled combinations are
            then drawn from that single stream, so regex rows differ from
            each other while the whole result stays reproducible.

    Returns:
        List of tuples (one per row). Empty when ``n_rows`` is not positive,
        or when there is neither a regex row nor any combination.

    Examples:
        >>> parsed = parse_linked_list([["Headache", ["Aspirin"], ["mild"]]])
        >>> rows = generate_linked_list_data(parsed, n_rows=3, seed=42)
        >>> len(rows)
        3
        >>> rows[0]
        ('Headache', 'Aspirin', 'mild')
    """
    if seed is not None:
        random.seed(seed)

    has_regex = parsed_data['has_regex']
    regex_patterns = parsed_data['regex_patterns']
    combinations = parsed_data['combinations']

    # Without this guard a regex row would be emitted even for n_rows == 0.
    if n_rows <= 0:
        return []

    results = []
    remaining = n_rows

    # Scenario 1 & 2: Regex present
    if has_regex:
        # The seed is deliberately not forwarded: reseeding inside every
        # generate_from_regex() call would make each regex row identical.
        results.append(
            tuple(generate_from_regex(pattern) for pattern in regex_patterns)
        )
        remaining -= 1

        # Scenario 2: Regex only (no combinations)
        if not combinations:
            for _ in range(remaining):
                results.append(
                    tuple(generate_from_regex(pattern) for pattern in regex_patterns)
                )
            return results

    # Scenario 1 & 3: Sample (with replacement) from combinations
    if combinations:
        results.extend(random.choice(combinations) for _ in range(remaining))

    return results
|
|
@@ -1,129 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Namespace Lookup for Linked Lists
|
|
3
|
-
|
|
4
|
-
Finds Python variables in the caller's namespace using frame inspection.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
import inspect
|
|
8
|
-
from typing import Any
|
|
9
|
-
|
|
10
|
-
from additory.common.exceptions import ValidationError
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
def lookup_variable_in_namespace(var_name: str, depth: int = 2) -> Any:
    """
    Resolve a variable by name from a calling frame.

    Starting at this function's own frame, the stack is walked back ``depth``
    frames; the name is then looked up in that frame's locals and, failing
    that, its globals.

    Args:
        var_name: Name of the variable to find.
        depth: Number of frames to go back from this function's frame
            (default: 2, i.e. the caller of this function's caller).

    Returns:
        Value bound to ``var_name`` in the selected frame.

    Raises:
        ValidationError: If the stack is shallower than ``depth`` or the
            name is in neither the frame's locals nor its globals.

    Examples:
        >>> # In user code:
        >>> AE_CM = [["Headache", ["Aspirin"]]]
        >>> df = add.synthetic('@new', strategy={'col1': 'lists@AE_CM'})
        >>> # lookup_variable_in_namespace('AE_CM') finds the list
    """
    try:
        # Must be taken here, in this function's body: the depth count is
        # relative to this frame.
        frame = inspect.currentframe()

        for _ in range(depth):
            if frame is None:
                raise ValidationError(
                    f"Cannot access caller's namespace (frame depth {depth})"
                )
            frame = frame.f_back

        if frame is None:
            raise ValidationError(
                f"Cannot access caller's namespace (frame is None)"
            )

        # Locals take precedence over globals.
        for scope in (frame.f_locals, frame.f_globals):
            if var_name in scope:
                return scope[var_name]

        # Not found in either scope - explain how to fix it.
        raise ValidationError(
            f"Variable '{var_name}' not found in namespace.\n"
            f"Make sure '{var_name}' is defined before calling synthetic().\n"
            f"\n"
            f"Example:\n"
            f" {var_name} = [['Headache', ['Aspirin'], ['mild']]]\n"
            f" df = add.synthetic('@new', strategy={{'col1': 'lists@{var_name}'}})\n"
            f"\n"
            f"Note: Linked lists must be defined in the same scope (cell/function) "
            f"as the synthetic() call."
        )

    finally:
        # Drop the frame reference so no reference cycle is left behind.
        del frame
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
def validate_linked_list_variable(var_value: Any, var_name: str) -> None:
    """
    Check that a looked-up variable is a non-empty list.

    Only the outer container is checked here; row contents are not inspected.

    Args:
        var_value: Value of the variable.
        var_name: Name of the variable (used in error messages).

    Raises:
        ValidationError: If ``var_value`` is not a list or is an empty list.
    """
    if isinstance(var_value, list):
        if var_value:
            return
        raise ValidationError(
            f"Variable '{var_name}' is an empty list. "
            f"Linked list must contain at least one row."
        )

    raise ValidationError(
        f"Variable '{var_name}' must be a list. "
        f"Got: {type(var_value).__name__}\n"
        f"\n"
        f"Expected format:\n"
        f" {var_name} = [\n"
        f" ['Headache', ['Aspirin', 'Ibuprofen'], ['mild', 'moderate']],\n"
        f" ['Nausea', ['Ondansetron'], ['severe']]\n"
        f" ]"
    )
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
def lookup_linked_list(var_name: str, depth: int = 2) -> Any:
    """
    Find a linked list variable in a calling frame and validate it.

    Thin wrapper: lookup_variable_in_namespace() followed by
    validate_linked_list_variable().

    Args:
        var_name: Name of the variable to find.
        depth: Number of frames to go back, passed through unchanged to
            lookup_variable_in_namespace().

    Returns:
        Linked list data.

    Raises:
        ValidationError: If the variable is not found or is not a non-empty
            list.
    """
    # Keep this a direct call from this function's body: the lookup counts
    # frames from its own frame, so an extra wrapper would shift the target.
    linked_list = lookup_variable_in_namespace(var_name, depth)
    validate_linked_list_variable(linked_list, var_name)
    return linked_list
|