additory 0.1.0a2__py3-none-any.whl → 0.1.0a4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. additory/__init__.py +4 -0
  2. additory/common/__init__.py +2 -2
  3. additory/common/backend.py +20 -4
  4. additory/common/distributions.py +1 -1
  5. additory/common/sample_data.py +19 -19
  6. additory/core/backends/arrow_bridge.py +7 -0
  7. additory/core/config.py +3 -3
  8. additory/core/polars_expression_engine.py +66 -16
  9. additory/core/registry.py +4 -3
  10. additory/dynamic_api.py +95 -51
  11. additory/expressions/proxy.py +4 -1
  12. additory/expressions/registry.py +3 -3
  13. additory/synthetic/__init__.py +7 -95
  14. additory/synthetic/column_name_resolver.py +149 -0
  15. additory/synthetic/deduce.py +259 -0
  16. additory/{augment → synthetic}/distributions.py +2 -2
  17. additory/{augment → synthetic}/forecast.py +1 -1
  18. additory/synthetic/linked_list_parser.py +415 -0
  19. additory/synthetic/namespace_lookup.py +129 -0
  20. additory/{augment → synthetic}/smote.py +1 -1
  21. additory/{augment → synthetic}/strategies.py +87 -44
  22. additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
  23. additory/utilities/units.py +4 -1
  24. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
  25. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
  26. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
  27. additory/augment/__init__.py +0 -24
  28. additory/augment/builtin_lists.py +0 -430
  29. additory/augment/list_registry.py +0 -177
  30. additory/synthetic/api.py +0 -220
  31. additory/synthetic/common_integration.py +0 -314
  32. additory/synthetic/config.py +0 -262
  33. additory/synthetic/engines.py +0 -529
  34. additory/synthetic/exceptions.py +0 -180
  35. additory/synthetic/file_managers.py +0 -518
  36. additory/synthetic/generator.py +0 -702
  37. additory/synthetic/generator_parser.py +0 -68
  38. additory/synthetic/integration.py +0 -319
  39. additory/synthetic/models.py +0 -241
  40. additory/synthetic/pattern_resolver.py +0 -573
  41. additory/synthetic/performance.py +0 -469
  42. additory/synthetic/polars_integration.py +0 -464
  43. additory/synthetic/proxy.py +0 -60
  44. additory/synthetic/schema_parser.py +0 -685
  45. additory/synthetic/validator.py +0 -553
  46. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
  47. {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
@@ -28,9 +28,9 @@ class ResolvedFormula:
28
28
  version: str
29
29
  mode: str = "local"
30
30
  namespace: str = "builtin" # NEW: "builtin" or "user"
31
- ast: dict | None = None
32
- sample_clean: dict | None = None
33
- sample_unclean: dict | None = None
31
+ ast: Optional[dict] = None
32
+ sample_clean: Optional[dict] = None
33
+ sample_unclean: Optional[dict] = None
34
34
 
35
35
 
36
36
  # ------------------------------------------------------------
@@ -1,101 +1,13 @@
1
1
  """
2
- Additory Synthetic Data Generation Module
2
+ Synthetic Module - Synthetic Data Generation Functionality
3
3
 
4
- This module provides polars-native synthetic data generation using regex patterns
5
- and distribution strategies. It supports hierarchical pattern resolution and
6
- industry-standard file formats (.properties and .toml).
4
+ This module provides synthetic data generation capabilities to add synthetic rows
5
+ to existing dataframes or create data from scratch by intelligently sampling
6
+ from existing data patterns.
7
7
  """
8
8
 
9
- from .api import (
10
- synth,
11
- config,
12
- register_distribution_engine,
13
- unregister_distribution_engine,
14
- list_custom_distribution_engines
15
- )
16
- from .exceptions import (
17
- SyntheticDataError,
18
- PatternResolutionError,
19
- ValidationError,
20
- DistributionError,
21
- FileFormatError,
22
- PatternImportError,
23
- SchemaParsingError
24
- )
25
- from .pattern_resolver import PatternHierarchyResolver, ResolutionTrace, PatternResolutionResult
26
- from .engines import (
27
- DistributionEngine,
28
- DistributionEngineFactory,
29
- DistributionManager,
30
- DistributionConfig,
31
- )
32
- from .generator import (
33
- RegexGenerator,
34
- PolarsGeneratorCore,
35
- OutputConverter,
36
- SyntheticDataGenerator,
37
- GenerationConfig,
38
- )
39
- from .performance import (
40
- PerformanceMonitor,
41
- PerformanceOptimizer,
42
- PerformanceMetrics,
43
- PerformanceComparison,
44
- performance_monitor,
45
- performance_optimizer
46
- )
47
- from .polars_integration import (
48
- PolarsIntegrationLayer,
49
- optimize_conversion,
50
- enhance_result,
51
- optimize_context,
52
- apply_expression,
53
- optimize_memory,
54
- validate_compatibility,
55
- get_integration_stats,
56
- cleanup_integration,
57
- benchmark_integration
58
- )
9
+ from additory.synthetic.synthesizer import synthetic
59
10
 
60
11
  __all__ = [
61
- 'synth',
62
- 'config',
63
- 'register_distribution_engine',
64
- 'unregister_distribution_engine',
65
- 'list_custom_distribution_engines',
66
- 'SyntheticDataError',
67
- 'PatternResolutionError',
68
- 'ValidationError',
69
- 'DistributionError',
70
- 'FileFormatError',
71
- 'PatternImportError',
72
- 'SchemaParsingError',
73
- 'PatternHierarchyResolver',
74
- 'ResolutionTrace',
75
- 'PatternResolutionResult',
76
- 'DistributionEngine',
77
- 'DistributionEngineFactory',
78
- 'DistributionManager',
79
- 'DistributionConfig',
80
- 'RegexGenerator',
81
- 'PolarsGeneratorCore',
82
- 'OutputConverter',
83
- 'SyntheticDataGenerator',
84
- 'GenerationConfig',
85
- 'PerformanceMonitor',
86
- 'PerformanceOptimizer',
87
- 'PerformanceMetrics',
88
- 'PerformanceComparison',
89
- 'performance_monitor',
90
- 'performance_optimizer',
91
- 'PolarsIntegrationLayer',
92
- 'optimize_conversion',
93
- 'enhance_result',
94
- 'optimize_context',
95
- 'apply_expression',
96
- 'optimize_memory',
97
- 'validate_compatibility',
98
- 'get_integration_stats',
99
- 'cleanup_integration',
100
- 'benchmark_integration'
101
- ]
12
+ "synthetic"
13
+ ]
@@ -0,0 +1,149 @@
1
+ """
2
+ Column Name Resolver for Linked Lists
3
+
4
+ Resolves column names for linked lists using priority order:
5
+ 1. Column_Names row (explicit names)
6
+ 2. Underscore parsing from list name
7
+ 3. Fallback to {strategy_key}_1, {strategy_key}_2, etc.
8
+ """
9
+
10
+ from typing import List, Optional
11
+ import warnings
12
+
13
+
14
+ def parse_column_names_from_underscores(list_name: str) -> Optional[List[str]]:
15
+ """
16
+ Parse column names from list name using underscore delimiters.
17
+
18
+ Args:
19
+ list_name: Name of the list variable (e.g., "AE_CM_SEV")
20
+
21
+ Returns:
22
+ List of column names, or None if no underscores found
23
+
24
+ Examples:
25
+ >>> parse_column_names_from_underscores("AE_CM_SEV")
26
+ ['AE', 'CM', 'SEV']
27
+
28
+ >>> parse_column_names_from_underscores("adverse_event_medication")
29
+ ['adverse', 'event', 'medication']
30
+
31
+ >>> parse_column_names_from_underscores("adverseconmed")
32
+ None
33
+ """
34
+ if '_' not in list_name:
35
+ return None
36
+
37
+ parts = list_name.split('_')
38
+
39
+ # Filter out empty parts
40
+ column_names = [part for part in parts if part]
41
+
42
+ if not column_names:
43
+ return None
44
+
45
+ return column_names
46
+
47
+
48
+ def generate_fallback_column_names(strategy_key: str, num_columns: int) -> List[str]:
49
+ """
50
+ Generate fallback column names when no other naming strategy works.
51
+
52
+ Format: {strategy_key}_1, {strategy_key}_2, etc.
53
+
54
+ Args:
55
+ strategy_key: Key from strategy dict (e.g., "col1")
56
+ num_columns: Number of columns to generate names for
57
+
58
+ Returns:
59
+ List of column names
60
+
61
+ Examples:
62
+ >>> generate_fallback_column_names("col1", 3)
63
+ ['col1_1', 'col1_2', 'col1_3']
64
+
65
+ >>> generate_fallback_column_names("adverse_events", 2)
66
+ ['adverse_events_1', 'adverse_events_2']
67
+ """
68
+ return [f"{strategy_key}_{i+1}" for i in range(num_columns)]
69
+
70
+
71
+ def resolve_column_names(
72
+ list_name: str,
73
+ strategy_key: str,
74
+ num_columns: int,
75
+ explicit_names: Optional[List[str]] = None
76
+ ) -> List[str]:
77
+ """
78
+ Resolve column names using priority order.
79
+
80
+ Priority:
81
+ 1. explicit_names (from Column_Names row)
82
+ 2. Underscore parsing from list_name
83
+ 3. Fallback to {strategy_key}_1, {strategy_key}_2, etc.
84
+
85
+ Args:
86
+ list_name: Name of the list variable
87
+ strategy_key: Key from strategy dict
88
+ num_columns: Number of columns to generate
89
+ explicit_names: Explicit column names from Column_Names row (optional)
90
+
91
+ Returns:
92
+ List of column names
93
+
94
+ Raises:
95
+ ValueError: If explicit_names count doesn't match num_columns
96
+
97
+ Examples:
98
+ >>> # Priority 1: Explicit names
99
+ >>> resolve_column_names("AE_CM", "col1", 2, ["adverse_event", "medication"])
100
+ ['adverse_event', 'medication']
101
+
102
+ >>> # Priority 2: Underscore parsing
103
+ >>> resolve_column_names("AE_CM_SEV", "col1", 3)
104
+ ['AE', 'CM', 'SEV']
105
+
106
+ >>> # Priority 3: Fallback
107
+ >>> resolve_column_names("adverseconmed", "col1", 2)
108
+ ['col1_1', 'col1_2']
109
+ """
110
+ # Priority 1: Explicit names from Column_Names row
111
+ if explicit_names is not None:
112
+ if len(explicit_names) != num_columns:
113
+ raise ValueError(
114
+ f"Column_Names row has {len(explicit_names)} names but "
115
+ f"linked list generates {num_columns} columns. They must match."
116
+ )
117
+ return explicit_names
118
+
119
+ # Priority 2: Underscore parsing
120
+ parsed_names = parse_column_names_from_underscores(list_name)
121
+ if parsed_names is not None:
122
+ if len(parsed_names) == num_columns:
123
+ return parsed_names
124
+ else:
125
+ # Underscore count doesn't match - fall through to fallback
126
+ warnings.warn(
127
+ f"List name '{list_name}' has {len(parsed_names)} underscore-separated "
128
+ f"parts but generates {num_columns} columns. "
129
+ f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
130
+ f"Suggestion: Use a list name with {num_columns-1} underscores, "
131
+ f"or add a Column_Names row for explicit naming.",
132
+ UserWarning
133
+ )
134
+ else:
135
+ # No underscores - emit warning
136
+ warnings.warn(
137
+ f"List name '{list_name}' has no underscores. "
138
+ f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
139
+ f"Suggestion: Use underscore-delimited naming (e.g., 'AE_CM_SEV') "
140
+ f"or add a Column_Names row:\n"
141
+ f" {list_name} = [\n"
142
+ f" ['Column_Names:[col1,col2,col3]'],\n"
143
+ f" ['primary', ['attr1'], ['attr2']]\n"
144
+ f" ]",
145
+ UserWarning
146
+ )
147
+
148
+ # Priority 3: Fallback
149
+ return generate_fallback_column_names(strategy_key, num_columns)
@@ -0,0 +1,259 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Text-based label deduction for additory.
4
+
5
+ Uses TF-IDF + cosine similarity to deduce labels from text.
6
+ Pure Python, no LLMs, offline-first.
7
+ """
8
+
9
+ import math
10
+ import re
11
+ from collections import Counter
12
+ from typing import Union, List, Optional
13
+ import pandas as pd
14
+ import polars as pl
15
+
16
+
17
+ def tokenize(text: str) -> List[str]:
18
+ """
19
+ Tokenize text into words.
20
+
21
+ Args:
22
+ text: Input text
23
+
24
+ Returns:
25
+ List of lowercase tokens
26
+ """
27
+ if text is None or not isinstance(text, str):
28
+ return []
29
+
30
+ text = text.lower()
31
+ text = re.sub(r"[^a-z0-9\s]", " ", text)
32
+ return [w for w in text.split() if w]
33
+
34
+
35
+ def vectorize(tokens: List[str]) -> Counter:
36
+ """
37
+ Convert tokens to TF vector (term frequency).
38
+
39
+ Args:
40
+ tokens: List of tokens
41
+
42
+ Returns:
43
+ Counter with term frequencies
44
+ """
45
+ return Counter(tokens)
46
+
47
+
48
+ def cosine_similarity(v1: Counter, v2: Counter) -> float:
49
+ """
50
+ Compute cosine similarity between two vectors.
51
+
52
+ Args:
53
+ v1: First vector (Counter)
54
+ v2: Second vector (Counter)
55
+
56
+ Returns:
57
+ Similarity score (0-1)
58
+ """
59
+ # Dot product
60
+ dot = sum(v1[t] * v2[t] for t in v1 if t in v2)
61
+
62
+ # Magnitudes
63
+ mag1 = math.sqrt(sum(v * v for v in v1.values()))
64
+ mag2 = math.sqrt(sum(v * v for v in v2.values()))
65
+
66
+ if mag1 == 0 or mag2 == 0:
67
+ return 0.0
68
+
69
+ return dot / (mag1 * mag2)
70
+
71
+
72
+ def _deduce_polars(
73
+ df: pl.DataFrame,
74
+ from_column: Union[str, List[str]],
75
+ to_column: str,
76
+ min_examples: int = 3
77
+ ) -> pl.DataFrame:
78
+ """
79
+ Deduce missing labels using text similarity (Polars-native).
80
+
81
+ Args:
82
+ df: Polars DataFrame
83
+ from_column: Text column(s) to analyze
84
+ to_column: Label column to fill
85
+ min_examples: Minimum labeled examples required
86
+
87
+ Returns:
88
+ DataFrame with deduced labels
89
+
90
+ Raises:
91
+ ValueError: If insufficient labeled examples
92
+ """
93
+ # Normalize from_column to list
94
+ if isinstance(from_column, str):
95
+ source_cols = [from_column]
96
+ else:
97
+ source_cols = from_column
98
+
99
+ # Validate columns exist
100
+ for col in source_cols:
101
+ if col not in df.columns:
102
+ raise ValueError(f"Column '{col}' not found in DataFrame")
103
+
104
+ if to_column not in df.columns:
105
+ raise ValueError(f"Column '{to_column}' not found in DataFrame")
106
+
107
+ # Create combined text column if multiple sources
108
+ if len(source_cols) == 1:
109
+ text_col = source_cols[0]
110
+ df_work = df.clone()
111
+ else:
112
+ # Concatenate multiple columns with spaces
113
+ df_work = df.with_columns([
114
+ pl.concat_str(
115
+ [pl.col(c).fill_null("") for c in source_cols],
116
+ separator=" "
117
+ ).alias("__deduce_text__")
118
+ ])
119
+ text_col = "__deduce_text__"
120
+
121
+ # Split into labeled and unlabeled
122
+ labeled_df = df_work.filter(pl.col(to_column).is_not_null())
123
+ unlabeled_df = df_work.filter(pl.col(to_column).is_null())
124
+
125
+ # Check if we have enough labeled examples
126
+ n_labeled = len(labeled_df)
127
+ if n_labeled == 0:
128
+ raise ValueError(
129
+ f"⚠️ Cannot deduce labels: No labeled examples found in '{to_column}' column.\n"
130
+ f"Please manually label at least {min_examples} examples per category, then run again.\n\n"
131
+ f"Note: additory uses pure Python text similarity (no LLMs, no external calls).\n"
132
+ f"Your data never leaves your machine."
133
+ )
134
+
135
+ if n_labeled < min_examples:
136
+ print(
137
+ f"⚠️ Only {n_labeled} labeled examples found. "
138
+ f"For better accuracy, label at least {min_examples} examples.\n"
139
+ f"Proceeding with available data..."
140
+ )
141
+
142
+ # If no unlabeled rows, return original
143
+ if len(unlabeled_df) == 0:
144
+ if len(source_cols) > 1:
145
+ # Remove temporary column
146
+ return df_work.drop("__deduce_text__")
147
+ return df_work
148
+
149
+ # Precompute vectors for labeled rows
150
+ labeled_vectors = []
151
+ for row in labeled_df.iter_rows(named=True):
152
+ text = row[text_col]
153
+ label = row[to_column]
154
+ tokens = tokenize(text)
155
+ vec = vectorize(tokens)
156
+ labeled_vectors.append((vec, label))
157
+
158
+ # Deduce labels for unlabeled rows
159
+ deduced_labels = []
160
+ for row in unlabeled_df.iter_rows(named=True):
161
+ text = row[text_col]
162
+ tokens = tokenize(text)
163
+ vec = vectorize(tokens)
164
+
165
+ # Find most similar labeled example
166
+ best_label = None
167
+ best_score = -1.0
168
+
169
+ for labeled_vec, label in labeled_vectors:
170
+ score = cosine_similarity(vec, labeled_vec)
171
+ if score > best_score:
172
+ best_score = score
173
+ best_label = label
174
+
175
+ deduced_labels.append(best_label)
176
+
177
+ # Create deduced labels series
178
+ deduced_series = pl.Series(to_column, deduced_labels)
179
+
180
+ # Update unlabeled rows with deduced labels
181
+ unlabeled_df = unlabeled_df.with_columns([deduced_series])
182
+
183
+ # Combine labeled and unlabeled back together
184
+ result_df = pl.concat([labeled_df, unlabeled_df])
185
+
186
+ # Remove temporary column if created
187
+ if len(source_cols) > 1:
188
+ result_df = result_df.drop("__deduce_text__")
189
+
190
+ # Print success message
191
+ n_deduced = len(deduced_labels)
192
+ print(f"✓ Deduced {n_deduced} label{'s' if n_deduced != 1 else ''} from {n_labeled} examples (offline, no LLMs)")
193
+
194
+ return result_df
195
+
196
+
197
+ def deduce(
198
+ df: Union[pd.DataFrame, pl.DataFrame],
199
+ from_column: Union[str, List[str]],
200
+ to_column: str
201
+ ) -> Union[pd.DataFrame, pl.DataFrame]:
202
+ """
203
+ Deduce missing labels based on text similarity to labeled examples.
204
+
205
+ Uses cosine similarity on TF-IDF vectors. Pure Python, no LLMs, offline-first.
206
+ Requires at least 3 labeled examples to work.
207
+
208
+ When multiple source columns are provided, they are concatenated with
209
+ spaces before computing similarity.
210
+
211
+ Args:
212
+ df: DataFrame with some labeled and some unlabeled rows
213
+ from_column: Text column(s) to analyze
214
+ - str: Single column (e.g., "comment")
215
+ - List[str]: Multiple columns (e.g., ["comment", "notes"])
216
+ to_column: Label column to fill (e.g., "status")
217
+
218
+ Returns:
219
+ DataFrame with deduced labels filled in
220
+
221
+ Examples:
222
+ # Single column
223
+ >>> result = add.deduce(df, from_column="comment", to_column="status")
224
+
225
+ # Multiple columns (better accuracy)
226
+ >>> result = add.deduce(
227
+ ... df,
228
+ ... from_column=["comment", "notes", "description"],
229
+ ... to_column="status"
230
+ ... )
231
+
232
+ Privacy: Your data never leaves your machine. No external connections.
233
+ """
234
+ # Detect input backend
235
+ if isinstance(df, pd.DataFrame):
236
+ backend = "pandas"
237
+ # Convert to Polars
238
+ df_polars = pl.from_pandas(df)
239
+ elif isinstance(df, pl.DataFrame):
240
+ backend = "polars"
241
+ df_polars = df
242
+ else:
243
+ # Try arrow bridge (for cudf, etc.)
244
+ try:
245
+ df_polars = pl.from_arrow(df)
246
+ backend = "arrow"
247
+ except Exception:
248
+ raise TypeError(f"Unsupported DataFrame type: {type(df)}")
249
+
250
+ # Process in Polars
251
+ result_polars = _deduce_polars(df_polars, from_column, to_column)
252
+
253
+ # Convert back to original format
254
+ if backend == "pandas":
255
+ return result_polars.to_pandas()
256
+ elif backend == "polars":
257
+ return result_polars
258
+ else: # arrow
259
+ return result_polars.to_arrow()
@@ -1,5 +1,5 @@
1
1
  """
2
- Distribution Strategies for Data Augmentation
2
+ Distribution Strategies for Synthetic Data Generation
3
3
 
4
4
  DEPRECATED: This module has been moved to additory.common.distributions
5
5
  Please update your imports to use additory.common.distributions instead.
@@ -11,7 +11,7 @@ import warnings
11
11
 
12
12
  # Issue deprecation warning
13
13
  warnings.warn(
14
- "additory.augment.distributions is deprecated. "
14
+ "additory.synthetic.distributions is deprecated. "
15
15
  "Please use additory.common.distributions instead. "
16
16
  "This module will be removed in a future version.",
17
17
  DeprecationWarning,
@@ -1,5 +1,5 @@
1
1
  """
2
- Forecast Strategies for Data Augmentation
2
+ Forecast Strategies for Synthetic Data Generation
3
3
 
4
4
  Provides time series forecasting capabilities:
5
5
  - Linear trend forecasting