additory-0.1.0a2-py3-none-any.whl → additory-0.1.0a3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/polars_expression_engine.py +66 -16
  8. additory/dynamic_api.py +42 -46
  9. additory/expressions/proxy.py +4 -1
  10. additory/synthetic/__init__.py +7 -95
  11. additory/synthetic/column_name_resolver.py +149 -0
  12. additory/{augment → synthetic}/distributions.py +2 -2
  13. additory/{augment → synthetic}/forecast.py +1 -1
  14. additory/synthetic/linked_list_parser.py +415 -0
  15. additory/synthetic/namespace_lookup.py +129 -0
  16. additory/{augment → synthetic}/smote.py +1 -1
  17. additory/{augment → synthetic}/strategies.py +11 -44
  18. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  19. additory/utilities/units.py +4 -1
  20. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/METADATA +10 -17
  21. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/RECORD +24 -40
  22. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/WHEEL +1 -1
  23. additory/augment/__init__.py +0 -24
  24. additory/augment/builtin_lists.py +0 -430
  25. additory/augment/list_registry.py +0 -177
  26. additory/synthetic/api.py +0 -220
  27. additory/synthetic/common_integration.py +0 -314
  28. additory/synthetic/config.py +0 -262
  29. additory/synthetic/engines.py +0 -529
  30. additory/synthetic/exceptions.py +0 -180
  31. additory/synthetic/file_managers.py +0 -518
  32. additory/synthetic/generator.py +0 -702
  33. additory/synthetic/generator_parser.py +0 -68
  34. additory/synthetic/integration.py +0 -319
  35. additory/synthetic/models.py +0 -241
  36. additory/synthetic/pattern_resolver.py +0 -573
  37. additory/synthetic/performance.py +0 -469
  38. additory/synthetic/polars_integration.py +0 -464
  39. additory/synthetic/proxy.py +0 -60
  40. additory/synthetic/schema_parser.py +0 -685
  41. additory/synthetic/validator.py +0 -553
  42. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/licenses/LICENSE +0 -0
  43. {additory-0.1.0a2.dist-info → additory-0.1.0a3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,149 @@
1
+ """
2
+ Column Name Resolver for Linked Lists
3
+
4
+ Resolves column names for linked lists using priority order:
5
+ 1. Column_Names row (explicit names)
6
+ 2. Underscore parsing from list name
7
+ 3. Fallback to {strategy_key}_1, {strategy_key}_2, etc.
8
+ """
9
+
10
+ from typing import List, Optional
11
+ import warnings
12
+
13
+
14
def parse_column_names_from_underscores(list_name: str) -> Optional[List[str]]:
    """
    Derive column names by splitting a list name on underscores.

    Args:
        list_name: Name of the list variable (e.g., "AE_CM_SEV")

    Returns:
        List of column names, or None if the name contains no underscores
        (or consists only of underscores).

    Examples:
        >>> parse_column_names_from_underscores("AE_CM_SEV")
        ['AE', 'CM', 'SEV']

        >>> parse_column_names_from_underscores("adverse_event_medication")
        ['adverse', 'event', 'medication']

        >>> parse_column_names_from_underscores("adverseconmed") is None
        True
    """
    if '_' not in list_name:
        return None

    # Drop empty fragments caused by leading/trailing/double underscores.
    fragments = [piece for piece in list_name.split('_') if piece]

    # An all-underscore name yields no fragments; treat it as unparseable.
    return fragments or None
46
+
47
+
48
def generate_fallback_column_names(strategy_key: str, num_columns: int) -> List[str]:
    """
    Build default column names when no other naming strategy applies.

    Names take the form {strategy_key}_1, {strategy_key}_2, ...

    Args:
        strategy_key: Key from the strategy dict (e.g., "col1")
        num_columns: How many column names to produce

    Returns:
        List of generated column names.

    Examples:
        >>> generate_fallback_column_names("col1", 3)
        ['col1_1', 'col1_2', 'col1_3']

        >>> generate_fallback_column_names("adverse_events", 2)
        ['adverse_events_1', 'adverse_events_2']
    """
    return [f"{strategy_key}_{index}" for index in range(1, num_columns + 1)]
69
+
70
+
71
def resolve_column_names(
    list_name: str,
    strategy_key: str,
    num_columns: int,
    explicit_names: Optional[List[str]] = None
) -> List[str]:
    """
    Resolve column names for a linked list using a fixed priority order.

    Priority:
        1. explicit_names (from a Column_Names row)
        2. Underscore parsing of list_name
        3. Fallback: {strategy_key}_1, {strategy_key}_2, ...

    Args:
        list_name: Name of the list variable
        strategy_key: Key from the strategy dict
        num_columns: Number of columns to generate
        explicit_names: Explicit column names from a Column_Names row (optional)

    Returns:
        List of resolved column names.

    Raises:
        ValueError: If explicit_names count doesn't match num_columns.

    Examples:
        >>> resolve_column_names("AE_CM", "col1", 2, ["adverse_event", "medication"])
        ['adverse_event', 'medication']

        >>> resolve_column_names("AE_CM_SEV", "col1", 3)
        ['AE', 'CM', 'SEV']

        >>> resolve_column_names("adverseconmed", "col1", 2)
        ['col1_1', 'col1_2']
    """
    # Priority 1: explicit names always win, but the count must line up.
    if explicit_names is not None:
        if len(explicit_names) != num_columns:
            raise ValueError(
                f"Column_Names row has {len(explicit_names)} names but "
                f"linked list generates {num_columns} columns. They must match."
            )
        return explicit_names

    # Priority 2: derive names from underscores in the list name.
    parsed = parse_column_names_from_underscores(list_name)
    if parsed is not None and len(parsed) == num_columns:
        return parsed

    # Either no underscores or a count mismatch: warn, then fall back.
    if parsed is not None:
        warnings.warn(
            f"List name '{list_name}' has {len(parsed)} underscore-separated "
            f"parts but generates {num_columns} columns. "
            f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
            f"Suggestion: Use a list name with {num_columns-1} underscores, "
            f"or add a Column_Names row for explicit naming.",
            UserWarning
        )
    else:
        warnings.warn(
            f"List name '{list_name}' has no underscores. "
            f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
            f"Suggestion: Use underscore-delimited naming (e.g., 'AE_CM_SEV') "
            f"or add a Column_Names row:\n"
            f"  {list_name} = [\n"
            f"      ['Column_Names:[col1,col2,col3]'],\n"
            f"      ['primary', ['attr1'], ['attr2']]\n"
            f"  ]",
            UserWarning
        )

    # Priority 3: synthetic fallback names.
    return generate_fallback_column_names(strategy_key, num_columns)
@@ -1,5 +1,5 @@
1
1
  """
2
- Distribution Strategies for Data Augmentation
2
+ Distribution Strategies for Synthetic Data Generation
3
3
 
4
4
  DEPRECATED: This module has been moved to additory.common.distributions
5
5
  Please update your imports to use additory.common.distributions instead.
@@ -11,7 +11,7 @@ import warnings
11
11
 
12
12
  # Issue deprecation warning
13
13
  warnings.warn(
14
- "additory.augment.distributions is deprecated. "
14
+ "additory.synthetic.distributions is deprecated. "
15
15
  "Please use additory.common.distributions instead. "
16
16
  "This module will be removed in a future version.",
17
17
  DeprecationWarning,
@@ -1,5 +1,5 @@
1
1
  """
2
- Forecast Strategies for Data Augmentation
2
+ Forecast Strategies for Synthetic Data Generation
3
3
 
4
4
  Provides time series forecasting capabilities:
5
5
  - Linear trend forecasting
@@ -0,0 +1,415 @@
1
+ """
2
+ Linked List Parser for Synthetic Data Generation
3
+
4
+ Parses linked lists with optional special rows:
5
+ - Column_Names:[name1,name2,name3] - Explicit column names
6
+ - Regex:[pattern1], Regex:[pattern2], ... - Regex patterns for testing
7
+
8
+ Generates cartesian product of primary key pattern:
9
+ [primary_key, [attr1_values], [attr2_values], ...]
10
+ """
11
+
12
+ from itertools import product
13
+ from typing import List, Tuple, Optional, Dict, Any
14
+ import re
15
+ import random
16
+ import string
17
+
18
+ from additory.common.exceptions import ValidationError
19
+
20
+
21
def parse_column_names_row(row: List[str]) -> Optional[List[str]]:
    """
    Extract explicit column names from a Column_Names row.

    Expected format: ["Column_Names:[name1,name2,name3]"]

    Args:
        row: Candidate row; its first element must start with "Column_Names:"
            for the row to be treated as a Column_Names row.

    Returns:
        List of column names, or None if this is not a Column_Names row.

    Raises:
        ValidationError: If the Column_Names marker is present but the
            bracketed name list is malformed or contains empty names.

    Examples:
        >>> parse_column_names_row(["Column_Names:[AE,CM,SEV]"])
        ['AE', 'CM', 'SEV']

        >>> parse_column_names_row(["Column_Names:[adverse_event,medication]"])
        ['adverse_event', 'medication']
    """
    if not row or not isinstance(row[0], str):
        return None

    header = row[0].strip()
    if not header.startswith("Column_Names:"):
        return None

    # Pull the comma-separated names out of the brackets:
    # "Column_Names:[AE,CM,SEV]" -> "AE,CM,SEV"
    bracketed = re.search(r'Column_Names:\[([^\]]+)\]', header)
    if bracketed is None:
        raise ValidationError(
            f"Invalid Column_Names format: '{header}'. "
            "Expected format: 'Column_Names:[name1,name2,name3]'"
        )

    names = [token.strip() for token in bracketed.group(1).split(',')]

    if not names or not all(names):
        raise ValidationError(
            f"Column names cannot be empty. Got: {names}"
        )

    return names
64
+
65
+
66
def parse_regex_row(row: List[str]) -> Optional[List[str]]:
    """
    Extract regex patterns from a Regex row.

    Expected format: ["Regex:[A-Z]{3,10}", "Regex:[A-Za-z0-9]{8,50}", ...]
    The row counts as a Regex row only when its FIRST element carries the
    "Regex:" prefix; later elements without the prefix are passed through
    stringified.

    Args:
        row: Candidate row whose elements may start with "Regex:"

    Returns:
        List of patterns with the "Regex:" prefix removed, or None if this
        is not a Regex row.

    Examples:
        >>> parse_regex_row(["Regex:[A-Z]{3,10}", "Regex:[A-Za-z0-9]{8,50}"])
        ['[A-Z]{3,10}', '[A-Za-z0-9]{8,50}']

        >>> parse_regex_row(["Headache", ["Aspirin"]]) is None
        True
    """
    if not row or not isinstance(row[0], str):
        return None

    if not row[0].strip().startswith("Regex:"):
        return None

    prefix = "Regex:"
    extracted = []
    for entry in row:
        stripped = entry.strip() if isinstance(entry, str) else None
        if stripped is not None and stripped.startswith(prefix):
            # Keep only the pattern portion after the "Regex:" marker.
            extracted.append(stripped[len(prefix):])
        else:
            # Mixed row: non-regex elements are carried along as strings.
            extracted.append(str(entry))

    return extracted
103
+
104
+
105
def generate_from_regex(pattern: str, seed: Optional[int] = None) -> str:
    """
    Generate a string matching a simplified regex pattern.

    Supported pattern shape: [charset]{min,max} with one of these charsets:
    - A-Z          uppercase letters
    - a-z          lowercase letters
    - 0-9 or \\d   digits
    - A-Za-z       letters
    - A-Za-z0-9    alphanumeric
    Any other charset falls back to alphanumeric; any pattern not of this
    shape returns a "REGEX_<prefix>" marker string instead of failing.

    The generated string always has the MAXIMUM length, for edge-case
    testing; min is parsed for validity but intentionally unused.

    Args:
        pattern: Simplified regex pattern, e.g. '[A-Z]{3,10}'.
        seed: Random seed for reproducibility. When given, a private RNG
            instance is used so the module-level random state is not
            clobbered (the original re-seeded the global RNG on every call,
            which made repeated calls with the same seed return identical
            strings and perturbed unrelated callers).

    Returns:
        Generated string matching the pattern.

    Examples:
        >>> len(generate_from_regex('[A-Z]{3,10}', seed=42))
        10
        >>> generate_from_regex('not-a-charset-pattern')
        'REGEX_not-a-char'
    """
    # Dedicated RNG when seeded; otherwise share the module RNG so callers
    # that seed the global state themselves stay deterministic.
    rng = random.Random(seed) if seed is not None else random

    spec = re.match(r'\[([^\]]+)\]\{(\d+),(\d+)\}', pattern)
    if not spec:
        # Unsupported pattern shape: return a recognizable marker so the
        # generated data stays inspectable rather than raising.
        return f"REGEX_{pattern[:10]}"

    charset_def = spec.group(1)
    max_len = int(spec.group(3))

    charsets = {
        'A-Z': string.ascii_uppercase,
        'a-z': string.ascii_lowercase,
        '0-9': string.digits,
        '\\d': string.digits,
        'A-Za-z': string.ascii_letters,
        'A-Za-z0-9': string.ascii_letters + string.digits,
    }
    # Unknown charsets fall back to alphanumeric.
    charset = charsets.get(charset_def, string.ascii_letters + string.digits)

    # Always emit the maximum length (edge-case testing).
    return ''.join(rng.choice(charset) for _ in range(max_len))
160
+
161
+
162
def parse_data_rows(rows: List[List]) -> List[Tuple]:
    """
    Expand primary-key formatted rows into their cartesian products.

    Each row has the shape [primary_key, [attr1_values], [attr2_values], ...].
    Every combination of attribute values is paired with its row's primary
    key; all rows must share the same element count (strict MVP validation).

    Args:
        rows: List of data rows.

    Returns:
        List of tuples representing all valid combinations.

    Raises:
        ValidationError: If a row (or an attribute entry) is malformed.

    Examples:
        >>> rows = [
        ...     ["Headache", ["Aspirin", "Ibuprofen"], ["mild", "moderate"]],
        ...     ["Nausea", ["Ondansetron"], ["severe"]]
        ... ]
        >>> combinations = parse_data_rows(rows)
        >>> len(combinations)
        5
        >>> combinations[0]
        ('Headache', 'Aspirin', 'mild')
    """
    if not rows:
        raise ValidationError("No data rows found in linked list")

    expected_length = None
    expanded: List[Tuple] = []

    for row_number, row in enumerate(rows, start=1):
        if not isinstance(row, list):
            raise ValidationError(
                f"Row {row_number} is not a list. Expected format: "
                "[primary_key, [attr1_values], [attr2_values], ...]"
            )

        if not row:
            raise ValidationError(f"Row {row_number} is empty")

        # Enforce a uniform arity across all rows (strict MVP validation).
        if expected_length is None:
            expected_length = len(row)
        elif len(row) != expected_length:
            raise ValidationError(
                f"Row {row_number} has {len(row)} elements, expected {expected_length}. "
                "All data rows must have the same structure.\n"
                f"Row {row_number}: {row}\n"
                "Expected format: [primary_key, [attr1_values], [attr2_values], ...]"
            )

        primary_key, *attribute_lists = row

        # Every attribute slot must be a non-empty list.
        for attr_number, attr_values in enumerate(attribute_lists, start=1):
            if not isinstance(attr_values, list):
                raise ValidationError(
                    f"Row {row_number}, attribute {attr_number} is not a list. "
                    f"Got: {type(attr_values).__name__}. "
                    "Expected format: [primary_key, [attr1_values], [attr2_values], ...]"
                )
            if not attr_values:
                raise ValidationError(
                    f"Row {row_number}, attribute {attr_number} is an empty list"
                )

        if attribute_lists:
            # Cartesian product of the attribute lists, prefixed by the key.
            expanded.extend(
                (primary_key, *combo) for combo in product(*attribute_lists)
            )
        else:
            # Bare primary key with no attributes.
            expanded.append((primary_key,))

    return expanded
241
+
242
+
243
def parse_linked_list(data: List) -> Dict[str, Any]:
    """
    Parse a linked list that may contain optional special rows.

    Special rows (order-independent, at most one of each):
    - Column_Names:[name1,name2,name3]
    - Regex:[pattern1], Regex:[pattern2], ...
    Every other row is a data row in primary-key format.

    Args:
        data: Linked list data.

    Returns:
        Dict with keys:
        - column_names: explicit column names, or None
        - has_regex: whether a Regex row was present
        - regex_patterns: regex patterns, or None
        - combinations: cartesian-product tuples from the data rows
        - num_columns: number of columns to generate

    Raises:
        ValidationError: If the structure is invalid.

    Examples:
        >>> data = [
        ...     ["Column_Names:[AE,CM,SEV]"],
        ...     ["Regex:[A-Z]{3,10}", "Regex:[A-Za-z0-9]{8,50}", "Regex:[0-9]{1,3}"],
        ...     ["Headache", ["Aspirin"], ["mild"]]
        ... ]
        >>> result = parse_linked_list(data)
        >>> result['column_names']
        ['AE', 'CM', 'SEV']
        >>> result['has_regex']
        True
        >>> len(result['combinations'])
        1
    """
    if not data or not isinstance(data, list):
        raise ValidationError(
            "Linked list must be a non-empty list. "
            "Expected format: [[primary_key, [attr1_values], ...], ...]"
        )

    column_names: Optional[List[str]] = None
    regex_patterns: Optional[List[str]] = None
    has_regex = False
    data_rows: List[List] = []

    # Classify each row as Column_Names, Regex, or plain data.
    for row in data:
        if not isinstance(row, list):
            raise ValidationError(
                f"Each row must be a list. Got: {type(row).__name__}"
            )

        names = parse_column_names_row(row)
        if names is not None:
            if column_names is not None:
                raise ValidationError(
                    "Multiple Column_Names rows found. Only one is allowed."
                )
            column_names = names
            continue

        patterns = parse_regex_row(row)
        if patterns is not None:
            if has_regex:
                raise ValidationError(
                    "Multiple Regex rows found. Only one is allowed."
                )
            has_regex = True
            regex_patterns = patterns
            continue

        data_rows.append(row)

    # At least one source of values is required.
    if not data_rows and not has_regex:
        raise ValidationError(
            "No data rows found. Linked list must contain at least one data row "
            "or a Regex row."
        )

    combinations = parse_data_rows(data_rows) if data_rows else []

    # Column count comes from data rows when present, else from the regex row.
    if combinations:
        num_columns = len(combinations[0])
    elif regex_patterns:
        num_columns = len(regex_patterns)
    else:
        raise ValidationError("Cannot determine number of columns")

    if column_names and len(column_names) != num_columns:
        raise ValidationError(
            f"Column_Names row has {len(column_names)} names but data has "
            f"{num_columns} columns. They must match."
        )

    return {
        'column_names': column_names,
        'has_regex': has_regex,
        'regex_patterns': regex_patterns,
        'combinations': combinations,
        'num_columns': num_columns,
    }
355
+
356
+
357
def generate_linked_list_data(
    parsed_data: Dict[str, Any],
    n_rows: int,
    seed: Optional[int] = None
) -> List[Tuple]:
    """
    Generate data rows from a parsed linked list.

    Handles three scenarios:
    1. Regex + data rows: first output row is regex-generated, the rest are
       sampled from the combinations.
    2. Regex only: every output row is regex-generated.
    3. Data only: every output row is sampled from the combinations.

    Args:
        parsed_data: Output from parse_linked_list().
        n_rows: Number of rows to generate.
        seed: Random seed for reproducibility. The RNG is seeded exactly
            once here; the seed is NOT forwarded to generate_from_regex,
            because re-seeding on every call would make all regex-generated
            values for a pattern identical.

    Returns:
        List of tuples (one per row).

    Examples:
        >>> parsed = parse_linked_list([["Headache", ["Aspirin"], ["mild"]]])
        >>> rows = generate_linked_list_data(parsed, n_rows=3, seed=42)
        >>> len(rows)
        3
        >>> rows[0]
        ('Headache', 'Aspirin', 'mild')
    """
    if seed is not None:
        random.seed(seed)

    regex_patterns = parsed_data['regex_patterns']
    combinations = parsed_data['combinations']
    results: List[Tuple] = []

    # Scenarios 1 & 2: a Regex row is present.
    if parsed_data['has_regex']:
        # First output row always comes from the regex patterns. No seed is
        # passed down — the global RNG was already seeded above (bug fix:
        # forwarding the seed reseeded the RNG per call, producing identical
        # values everywhere).
        results.append(tuple(generate_from_regex(p) for p in regex_patterns))
        n_rows -= 1

        # Scenario 2: no combinations, so every remaining row is regex too.
        if not combinations:
            for _ in range(n_rows):
                results.append(
                    tuple(generate_from_regex(p) for p in regex_patterns)
                )
            return results

    # Scenarios 1 & 3: remaining rows are uniform samples of the combinations.
    if combinations:
        for _ in range(n_rows):
            results.append(random.choice(combinations))

    return results