additory-0.1.0a2-py3-none-any.whl → additory-0.1.0a4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +4 -0
- additory/common/__init__.py +2 -2
- additory/common/backend.py +20 -4
- additory/common/distributions.py +1 -1
- additory/common/sample_data.py +19 -19
- additory/core/backends/arrow_bridge.py +7 -0
- additory/core/config.py +3 -3
- additory/core/polars_expression_engine.py +66 -16
- additory/core/registry.py +4 -3
- additory/dynamic_api.py +95 -51
- additory/expressions/proxy.py +4 -1
- additory/expressions/registry.py +3 -3
- additory/synthetic/__init__.py +7 -95
- additory/synthetic/column_name_resolver.py +149 -0
- additory/synthetic/deduce.py +259 -0
- additory/{augment → synthetic}/distributions.py +2 -2
- additory/{augment → synthetic}/forecast.py +1 -1
- additory/synthetic/linked_list_parser.py +415 -0
- additory/synthetic/namespace_lookup.py +129 -0
- additory/{augment → synthetic}/smote.py +1 -1
- additory/{augment → synthetic}/strategies.py +87 -44
- additory/{augment/augmentor.py → synthetic/synthesizer.py} +75 -15
- additory/utilities/units.py +4 -1
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/METADATA +44 -28
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/RECORD +28 -43
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/WHEEL +1 -1
- additory/augment/__init__.py +0 -24
- additory/augment/builtin_lists.py +0 -430
- additory/augment/list_registry.py +0 -177
- additory/synthetic/api.py +0 -220
- additory/synthetic/common_integration.py +0 -314
- additory/synthetic/config.py +0 -262
- additory/synthetic/engines.py +0 -529
- additory/synthetic/exceptions.py +0 -180
- additory/synthetic/file_managers.py +0 -518
- additory/synthetic/generator.py +0 -702
- additory/synthetic/generator_parser.py +0 -68
- additory/synthetic/integration.py +0 -319
- additory/synthetic/models.py +0 -241
- additory/synthetic/pattern_resolver.py +0 -573
- additory/synthetic/performance.py +0 -469
- additory/synthetic/polars_integration.py +0 -464
- additory/synthetic/proxy.py +0 -60
- additory/synthetic/schema_parser.py +0 -685
- additory/synthetic/validator.py +0 -553
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a2.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
additory/expressions/registry.py
CHANGED
@@ -28,9 +28,9 @@ class ResolvedFormula:
     version: str
     mode: str = "local"
     namespace: str = "builtin"  # NEW: "builtin" or "user"
-    ast: dict
-    sample_clean: dict
-    sample_unclean: dict
+    ast: Optional[dict] = None
+    sample_clean: Optional[dict] = None
+    sample_unclean: Optional[dict] = None
 
 
     # ------------------------------------------------------------
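Note: a4 makes the three payload fields optional with None defaults. If ResolvedFormula is a dataclass (the bare annotated fields suggest it, though the decorator is outside this hunk), the a2 field order was actually broken: a required field (ast: dict) following defaulted fields (mode, namespace) raises TypeError when the class is defined. A minimal sketch of the a4 shape, assuming a plain dataclass:

    from dataclasses import dataclass
    from typing import Optional

    @dataclass
    class ResolvedFormula:
        version: str
        mode: str = "local"
        namespace: str = "builtin"  # "builtin" or "user"
        ast: Optional[dict] = None            # payload fields may now be
        sample_clean: Optional[dict] = None   # omitted at construction time
        sample_unclean: Optional[dict] = None

    # Metadata-only construction is now valid:
    formula = ResolvedFormula(version="1.0", namespace="user")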
additory/synthetic/__init__.py
CHANGED
@@ -1,101 +1,13 @@
 """
-
+Synthetic Module - Synthetic Data Generation Functionality
 
-This module provides
-
-
+This module provides synthetic data generation capabilities to add synthetic rows
+to existing dataframes or create data from scratch by intelligently sampling
+from existing data patterns.
 """
 
-from .
-    synth,
-    config,
-    register_distribution_engine,
-    unregister_distribution_engine,
-    list_custom_distribution_engines
-)
-from .exceptions import (
-    SyntheticDataError,
-    PatternResolutionError,
-    ValidationError,
-    DistributionError,
-    FileFormatError,
-    PatternImportError,
-    SchemaParsingError
-)
-from .pattern_resolver import PatternHierarchyResolver, ResolutionTrace, PatternResolutionResult
-from .engines import (
-    DistributionEngine,
-    DistributionEngineFactory,
-    DistributionManager,
-    DistributionConfig,
-)
-from .generator import (
-    RegexGenerator,
-    PolarsGeneratorCore,
-    OutputConverter,
-    SyntheticDataGenerator,
-    GenerationConfig,
-)
-from .performance import (
-    PerformanceMonitor,
-    PerformanceOptimizer,
-    PerformanceMetrics,
-    PerformanceComparison,
-    performance_monitor,
-    performance_optimizer
-)
-from .polars_integration import (
-    PolarsIntegrationLayer,
-    optimize_conversion,
-    enhance_result,
-    optimize_context,
-    apply_expression,
-    optimize_memory,
-    validate_compatibility,
-    get_integration_stats,
-    cleanup_integration,
-    benchmark_integration
-)
+from additory.synthetic.synthesizer import synthetic
 
 __all__ = [
-
-
-    'register_distribution_engine',
-    'unregister_distribution_engine',
-    'list_custom_distribution_engines',
-    'SyntheticDataError',
-    'PatternResolutionError',
-    'ValidationError',
-    'DistributionError',
-    'FileFormatError',
-    'PatternImportError',
-    'SchemaParsingError',
-    'PatternHierarchyResolver',
-    'ResolutionTrace',
-    'PatternResolutionResult',
-    'DistributionEngine',
-    'DistributionEngineFactory',
-    'DistributionManager',
-    'DistributionConfig',
-    'RegexGenerator',
-    'PolarsGeneratorCore',
-    'OutputConverter',
-    'SyntheticDataGenerator',
-    'GenerationConfig',
-    'PerformanceMonitor',
-    'PerformanceOptimizer',
-    'PerformanceMetrics',
-    'PerformanceComparison',
-    'performance_monitor',
-    'performance_optimizer',
-    'PolarsIntegrationLayer',
-    'optimize_conversion',
-    'enhance_result',
-    'optimize_context',
-    'apply_expression',
-    'optimize_memory',
-    'validate_compatibility',
-    'get_integration_stats',
-    'cleanup_integration',
-    'benchmark_integration'
-]
+    "synthetic"
+]
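In practice this collapses the module's public surface from roughly forty names to one. A migration sketch (the a2 names below are taken from the removed import block above; none of them survive in a4):

    # 0.1.0a2 -- these imports no longer resolve in 0.1.0a4:
    # from additory.synthetic import SyntheticDataGenerator, GenerationConfig

    # 0.1.0a4 -- the single exported entry point:
    from additory.synthetic import synthetic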
additory/synthetic/column_name_resolver.py
ADDED
@@ -0,0 +1,149 @@
+"""
+Column Name Resolver for Linked Lists
+
+Resolves column names for linked lists using priority order:
+1. Column_Names row (explicit names)
+2. Underscore parsing from list name
+3. Fallback to {strategy_key}_1, {strategy_key}_2, etc.
+"""
+
+from typing import List, Optional
+import warnings
+
+
+def parse_column_names_from_underscores(list_name: str) -> Optional[List[str]]:
+    """
+    Parse column names from list name using underscore delimiters.
+
+    Args:
+        list_name: Name of the list variable (e.g., "AE_CM_SEV")
+
+    Returns:
+        List of column names, or None if no underscores found
+
+    Examples:
+        >>> parse_column_names_from_underscores("AE_CM_SEV")
+        ['AE', 'CM', 'SEV']
+
+        >>> parse_column_names_from_underscores("adverse_event_medication")
+        ['adverse', 'event', 'medication']
+
+        >>> parse_column_names_from_underscores("adverseconmed")
+        None
+    """
+    if '_' not in list_name:
+        return None
+
+    parts = list_name.split('_')
+
+    # Filter out empty parts
+    column_names = [part for part in parts if part]
+
+    if not column_names:
+        return None
+
+    return column_names
+
+
+def generate_fallback_column_names(strategy_key: str, num_columns: int) -> List[str]:
+    """
+    Generate fallback column names when no other naming strategy works.
+
+    Format: {strategy_key}_1, {strategy_key}_2, etc.
+
+    Args:
+        strategy_key: Key from strategy dict (e.g., "col1")
+        num_columns: Number of columns to generate names for
+
+    Returns:
+        List of column names
+
+    Examples:
+        >>> generate_fallback_column_names("col1", 3)
+        ['col1_1', 'col1_2', 'col1_3']
+
+        >>> generate_fallback_column_names("adverse_events", 2)
+        ['adverse_events_1', 'adverse_events_2']
+    """
+    return [f"{strategy_key}_{i+1}" for i in range(num_columns)]
+
+
+def resolve_column_names(
+    list_name: str,
+    strategy_key: str,
+    num_columns: int,
+    explicit_names: Optional[List[str]] = None
+) -> List[str]:
+    """
+    Resolve column names using priority order.
+
+    Priority:
+    1. explicit_names (from Column_Names row)
+    2. Underscore parsing from list_name
+    3. Fallback to {strategy_key}_1, {strategy_key}_2, etc.
+
+    Args:
+        list_name: Name of the list variable
+        strategy_key: Key from strategy dict
+        num_columns: Number of columns to generate
+        explicit_names: Explicit column names from Column_Names row (optional)
+
+    Returns:
+        List of column names
+
+    Raises:
+        ValueError: If explicit_names count doesn't match num_columns
+
+    Examples:
+        >>> # Priority 1: Explicit names
+        >>> resolve_column_names("AE_CM", "col1", 2, ["adverse_event", "medication"])
+        ['adverse_event', 'medication']
+
+        >>> # Priority 2: Underscore parsing
+        >>> resolve_column_names("AE_CM_SEV", "col1", 3)
+        ['AE', 'CM', 'SEV']
+
+        >>> # Priority 3: Fallback
+        >>> resolve_column_names("adverseconmed", "col1", 2)
+        ['col1_1', 'col1_2']
+    """
+    # Priority 1: Explicit names from Column_Names row
+    if explicit_names is not None:
+        if len(explicit_names) != num_columns:
+            raise ValueError(
+                f"Column_Names row has {len(explicit_names)} names but "
+                f"linked list generates {num_columns} columns. They must match."
+            )
+        return explicit_names
+
+    # Priority 2: Underscore parsing
+    parsed_names = parse_column_names_from_underscores(list_name)
+    if parsed_names is not None:
+        if len(parsed_names) == num_columns:
+            return parsed_names
+        else:
+            # Underscore count doesn't match - fall through to fallback
+            warnings.warn(
+                f"List name '{list_name}' has {len(parsed_names)} underscore-separated "
+                f"parts but generates {num_columns} columns. "
+                f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
+                f"Suggestion: Use a list name with {num_columns-1} underscores, "
+                f"or add a Column_Names row for explicit naming.",
+                UserWarning
+            )
+    else:
+        # No underscores - emit warning
+        warnings.warn(
+            f"List name '{list_name}' has no underscores. "
+            f"Using fallback naming: {strategy_key}_1, {strategy_key}_2, etc.\n"
+            f"Suggestion: Use underscore-delimited naming (e.g., 'AE_CM_SEV') "
+            f"or add a Column_Names row:\n"
+            f"  {list_name} = [\n"
+            f"    ['Column_Names:[col1,col2,col3]'],\n"
+            f"    ['primary', ['attr1'], ['attr2']]\n"
+            f"  ]",
+            UserWarning
+        )
+
+    # Priority 3: Fallback
+    return generate_fallback_column_names(strategy_key, num_columns)
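The doctests above cover the happy paths; the case worth calling out is a length mismatch, where the resolver warns and falls back rather than raising. A small check against the functions as written (import path assumed from this wheel's layout):

    import warnings
    from additory.synthetic.column_name_resolver import resolve_column_names

    # Three underscore-separated parts cannot name two columns, so priority 2
    # is skipped with a UserWarning and priority 3 (fallback) applies:
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        names = resolve_column_names("AE_CM_SEV", "col1", num_columns=2)

    print(names)               # ['col1_1', 'col1_2']
    print(caught[0].category)  # <class 'UserWarning'>

    # Explicit names always win, but a count mismatch there raises
    # ValueError instead of falling back.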
additory/synthetic/deduce.py
ADDED
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+Text-based label deduction for additory.
+
+Uses TF-IDF + cosine similarity to deduce labels from text.
+Pure Python, no LLMs, offline-first.
+"""
+
+import math
+import re
+from collections import Counter
+from typing import Union, List, Optional
+import pandas as pd
+import polars as pl
+
+
+def tokenize(text: str) -> List[str]:
+    """
+    Tokenize text into words.
+
+    Args:
+        text: Input text
+
+    Returns:
+        List of lowercase tokens
+    """
+    if text is None or not isinstance(text, str):
+        return []
+
+    text = text.lower()
+    text = re.sub(r"[^a-z0-9\s]", " ", text)
+    return [w for w in text.split() if w]
+
+
+def vectorize(tokens: List[str]) -> Counter:
+    """
+    Convert tokens to TF vector (term frequency).
+
+    Args:
+        tokens: List of tokens
+
+    Returns:
+        Counter with term frequencies
+    """
+    return Counter(tokens)
+
+
+def cosine_similarity(v1: Counter, v2: Counter) -> float:
+    """
+    Compute cosine similarity between two vectors.
+
+    Args:
+        v1: First vector (Counter)
+        v2: Second vector (Counter)
+
+    Returns:
+        Similarity score (0-1)
+    """
+    # Dot product
+    dot = sum(v1[t] * v2[t] for t in v1 if t in v2)
+
+    # Magnitudes
+    mag1 = math.sqrt(sum(v * v for v in v1.values()))
+    mag2 = math.sqrt(sum(v * v for v in v2.values()))
+
+    if mag1 == 0 or mag2 == 0:
+        return 0.0
+
+    return dot / (mag1 * mag2)
+
+
+def _deduce_polars(
+    df: pl.DataFrame,
+    from_column: Union[str, List[str]],
+    to_column: str,
+    min_examples: int = 3
+) -> pl.DataFrame:
+    """
+    Deduce missing labels using text similarity (Polars-native).
+
+    Args:
+        df: Polars DataFrame
+        from_column: Text column(s) to analyze
+        to_column: Label column to fill
+        min_examples: Minimum labeled examples required
+
+    Returns:
+        DataFrame with deduced labels
+
+    Raises:
+        ValueError: If insufficient labeled examples
+    """
+    # Normalize from_column to list
+    if isinstance(from_column, str):
+        source_cols = [from_column]
+    else:
+        source_cols = from_column
+
+    # Validate columns exist
+    for col in source_cols:
+        if col not in df.columns:
+            raise ValueError(f"Column '{col}' not found in DataFrame")
+
+    if to_column not in df.columns:
+        raise ValueError(f"Column '{to_column}' not found in DataFrame")
+
+    # Create combined text column if multiple sources
+    if len(source_cols) == 1:
+        text_col = source_cols[0]
+        df_work = df.clone()
+    else:
+        # Concatenate multiple columns with spaces
+        df_work = df.with_columns([
+            pl.concat_str(
+                [pl.col(c).fill_null("") for c in source_cols],
+                separator=" "
+            ).alias("__deduce_text__")
+        ])
+        text_col = "__deduce_text__"
+
+    # Split into labeled and unlabeled
+    labeled_df = df_work.filter(pl.col(to_column).is_not_null())
+    unlabeled_df = df_work.filter(pl.col(to_column).is_null())
+
+    # Check if we have enough labeled examples
+    n_labeled = len(labeled_df)
+    if n_labeled == 0:
+        raise ValueError(
+            f"⚠️ Cannot deduce labels: No labeled examples found in '{to_column}' column.\n"
+            f"Please manually label at least {min_examples} examples per category, then run again.\n\n"
+            f"Note: additory uses pure Python text similarity (no LLMs, no external calls).\n"
+            f"Your data never leaves your machine."
+        )
+
+    if n_labeled < min_examples:
+        print(
+            f"⚠️ Only {n_labeled} labeled examples found. "
+            f"For better accuracy, label at least {min_examples} examples.\n"
+            f"Proceeding with available data..."
+        )
+
+    # If no unlabeled rows, return original
+    if len(unlabeled_df) == 0:
+        if len(source_cols) > 1:
+            # Remove temporary column
+            return df_work.drop("__deduce_text__")
+        return df_work
+
+    # Precompute vectors for labeled rows
+    labeled_vectors = []
+    for row in labeled_df.iter_rows(named=True):
+        text = row[text_col]
+        label = row[to_column]
+        tokens = tokenize(text)
+        vec = vectorize(tokens)
+        labeled_vectors.append((vec, label))
+
+    # Deduce labels for unlabeled rows
+    deduced_labels = []
+    for row in unlabeled_df.iter_rows(named=True):
+        text = row[text_col]
+        tokens = tokenize(text)
+        vec = vectorize(tokens)
+
+        # Find most similar labeled example
+        best_label = None
+        best_score = -1.0
+
+        for labeled_vec, label in labeled_vectors:
+            score = cosine_similarity(vec, labeled_vec)
+            if score > best_score:
+                best_score = score
+                best_label = label
+
+        deduced_labels.append(best_label)
+
+    # Create deduced labels series
+    deduced_series = pl.Series(to_column, deduced_labels)
+
+    # Update unlabeled rows with deduced labels
+    unlabeled_df = unlabeled_df.with_columns([deduced_series])
+
+    # Combine labeled and unlabeled back together
+    result_df = pl.concat([labeled_df, unlabeled_df])
+
+    # Remove temporary column if created
+    if len(source_cols) > 1:
+        result_df = result_df.drop("__deduce_text__")
+
+    # Print success message
+    n_deduced = len(deduced_labels)
+    print(f"✓ Deduced {n_deduced} label{'s' if n_deduced != 1 else ''} from {n_labeled} examples (offline, no LLMs)")
+
+    return result_df
+
+
+def deduce(
+    df: Union[pd.DataFrame, pl.DataFrame],
+    from_column: Union[str, List[str]],
+    to_column: str
+) -> Union[pd.DataFrame, pl.DataFrame]:
+    """
+    Deduce missing labels based on text similarity to labeled examples.
+
+    Uses cosine similarity on TF-IDF vectors. Pure Python, no LLMs, offline-first.
+    Requires at least 3 labeled examples to work.
+
+    When multiple source columns are provided, they are concatenated with
+    spaces before computing similarity.
+
+    Args:
+        df: DataFrame with some labeled and some unlabeled rows
+        from_column: Text column(s) to analyze
+            - str: Single column (e.g., "comment")
+            - List[str]: Multiple columns (e.g., ["comment", "notes"])
+        to_column: Label column to fill (e.g., "status")
+
+    Returns:
+        DataFrame with deduced labels filled in
+
+    Examples:
+        # Single column
+        >>> result = add.deduce(df, from_column="comment", to_column="status")
+
+        # Multiple columns (better accuracy)
+        >>> result = add.deduce(
+        ...     df,
+        ...     from_column=["comment", "notes", "description"],
+        ...     to_column="status"
+        ... )
+
+    Privacy: Your data never leaves your machine. No external connections.
+    """
+    # Detect input backend
+    if isinstance(df, pd.DataFrame):
+        backend = "pandas"
+        # Convert to Polars
+        df_polars = pl.from_pandas(df)
+    elif isinstance(df, pl.DataFrame):
+        backend = "polars"
+        df_polars = df
+    else:
+        # Try arrow bridge (for cudf, etc.)
+        try:
+            df_polars = pl.from_arrow(df)
+            backend = "arrow"
+        except Exception:
+            raise TypeError(f"Unsupported DataFrame type: {type(df)}")
+
+    # Process in Polars
+    result_polars = _deduce_polars(df_polars, from_column, to_column)
+
+    # Convert back to original format
+    if backend == "pandas":
+        return result_polars.to_pandas()
+    elif backend == "polars":
+        return result_polars
+    else:  # arrow
+        return result_polars.to_arrow()
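A minimal end-to-end run of the new deduce() on a pandas frame, following the docstring above (importing from the module path this hunk adds; the add.deduce spelling in the docstring suggests a top-level alias as well). Note that despite the "TF-IDF" wording, the vectors built here are plain term-frequency counts, and labeled rows are re-ordered ahead of unlabeled rows in the result:

    import pandas as pd
    from additory.synthetic.deduce import deduce

    df = pd.DataFrame({
        "comment": [
            "delivery was late",              # labeled: bad
            "late and damaged box",           # labeled: bad
            "fast delivery, great service",   # labeled: good
            "great service overall",          # labeled: good
            "box arrived late",               # unlabeled
            "really great service",           # unlabeled
        ],
        "status": ["bad", "bad", "good", "good", None, None],
    })

    result = deduce(df, from_column="comment", to_column="status")
    # Each unlabeled comment takes the label of its most similar labeled
    # comment by cosine similarity: "box arrived late" -> "bad",
    # "really great service" -> "good".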
additory/{augment → synthetic}/distributions.py
RENAMED
@@ -1,5 +1,5 @@
 """
-Distribution Strategies for Data
+Distribution Strategies for Synthetic Data Generation
 
 DEPRECATED: This module has been moved to additory.common.distributions
 Please update your imports to use additory.common.distributions instead.
@@ -11,7 +11,7 @@ import warnings
 
 # Issue deprecation warning
 warnings.warn(
-    "additory.
+    "additory.synthetic.distributions is deprecated. "
     "Please use additory.common.distributions instead. "
     "This module will be removed in a future version.",
     DeprecationWarning,
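For downstream code the rename is transparent but noisy: importing the moved module fires the DeprecationWarning shown above on first import. A sketch of what that looks like (DeprecationWarning is silenced by default outside __main__, hence the filter):

    import warnings

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        import additory.synthetic.distributions  # deprecated shim

    print(caught[0].category)  # <class 'DeprecationWarning'>

    # The supported import, per the warning text:
    from additory.common import distributions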
|