additory 0.1.0a4__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -177
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -352
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/deduce.py +0 -259
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -926
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a4.dist-info/METADATA +0 -311
- additory-0.1.0a4.dist-info/RECORD +0 -72
- additory-0.1.0a4.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a4.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/synthetic/deduce.py
DELETED
|
@@ -1,259 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
|
-
"""
|
|
3
|
-
Text-based label deduction for additory.
|
|
4
|
-
|
|
5
|
-
Uses term-frequency (bag-of-words) vectors + cosine similarity to deduce labels from text.
|
|
6
|
-
Pure Python, no LLMs, offline-first.
|
|
7
|
-
"""
|
|
8
|
-
|
|
9
|
-
import math
|
|
10
|
-
import re
|
|
11
|
-
from collections import Counter
|
|
12
|
-
from typing import Union, List, Optional
|
|
13
|
-
import pandas as pd
|
|
14
|
-
import polars as pl
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
def tokenize(text: str) -> List[str]:
    """
    Tokenize text into lowercase alphanumeric words.

    Any non-string input (including None) yields an empty list, so callers
    can pass raw cell values without pre-checking their type.

    Args:
        text: Input text

    Returns:
        List of lowercase tokens
    """
    # isinstance(None, str) is False, so the former explicit None check
    # was redundant.
    if not isinstance(text, str):
        return []

    # Equivalent to the original lowercase -> replace non-[a-z0-9] with
    # space -> split pipeline: extract maximal alphanumeric runs directly.
    return re.findall(r"[a-z0-9]+", text.lower())
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
def vectorize(tokens: List[str]) -> Counter:
    """
    Build a term-frequency (TF) vector from a token list.

    Args:
        tokens: List of tokens

    Returns:
        Counter mapping each distinct term to its occurrence count
    """
    tf: Counter = Counter()
    tf.update(tokens)
    return tf
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
def cosine_similarity(v1: Counter, v2: Counter) -> float:
    """
    Compute the cosine similarity between two term-frequency vectors.

    Args:
        v1: First vector (Counter)
        v2: Second vector (Counter)

    Returns:
        Similarity score in [0, 1]; 0.0 when either vector is empty
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(v1) & set(v2)
    dot = sum(v1[term] * v2[term] for term in shared_terms)

    # Euclidean magnitudes of each vector.
    norm1 = math.sqrt(sum(count * count for count in v1.values()))
    norm2 = math.sqrt(sum(count * count for count in v2.values()))

    # A zero-magnitude vector has no direction; define similarity as 0.
    if norm1 == 0 or norm2 == 0:
        return 0.0

    return dot / (norm1 * norm2)
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
def _deduce_polars(
    df: pl.DataFrame,
    from_column: Union[str, List[str]],
    to_column: str,
    min_examples: int = 3
) -> pl.DataFrame:
    """
    Deduce missing labels using text similarity (Polars-native).

    Implements 1-nearest-neighbor classification: each unlabeled row is
    assigned the label of the labeled row whose term-frequency vector has
    the highest cosine similarity. Plain TF vectors are used (no IDF
    weighting, despite the module-level mention of TF-IDF).

    NOTE(review): row order is NOT preserved — the result is labeled rows
    followed by (newly labeled) unlabeled rows, because the two partitions
    are filtered apart and re-concatenated.

    Args:
        df: Polars DataFrame
        from_column: Text column(s) to analyze; multiple columns are
            concatenated with spaces before vectorizing
        to_column: Label column to fill (nulls are treated as unlabeled)
        min_examples: Minimum labeled examples recommended; fewer than
            this only triggers a warning print, processing still proceeds

    Returns:
        DataFrame with deduced labels

    Raises:
        ValueError: If a referenced column is missing, or if there are
            zero labeled examples to learn from
    """
    # Normalize from_column to list
    if isinstance(from_column, str):
        source_cols = [from_column]
    else:
        source_cols = from_column

    # Validate columns exist
    for col in source_cols:
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")

    if to_column not in df.columns:
        raise ValueError(f"Column '{to_column}' not found in DataFrame")

    # Create combined text column if multiple sources. fill_null("") keeps
    # concat_str from nulling out the whole combined string.
    if len(source_cols) == 1:
        text_col = source_cols[0]
        df_work = df.clone()
    else:
        # Concatenate multiple columns with spaces
        df_work = df.with_columns([
            pl.concat_str(
                [pl.col(c).fill_null("") for c in source_cols],
                separator=" "
            ).alias("__deduce_text__")
        ])
        text_col = "__deduce_text__"

    # Split into labeled (non-null target) and unlabeled (null target) rows
    labeled_df = df_work.filter(pl.col(to_column).is_not_null())
    unlabeled_df = df_work.filter(pl.col(to_column).is_null())

    # Check if we have enough labeled examples
    n_labeled = len(labeled_df)
    if n_labeled == 0:
        raise ValueError(
            f"⚠️ Cannot deduce labels: No labeled examples found in '{to_column}' column.\n"
            f"Please manually label at least {min_examples} examples per category, then run again.\n\n"
            f"Note: additory uses pure Python text similarity (no LLMs, no external calls).\n"
            f"Your data never leaves your machine."
        )

    # Below the recommended threshold: warn but continue (total count, not
    # per-category as the error message above suggests).
    if n_labeled < min_examples:
        print(
            f"⚠️ Only {n_labeled} labeled examples found. "
            f"For better accuracy, label at least {min_examples} examples.\n"
            f"Proceeding with available data..."
        )

    # If no unlabeled rows, return original (minus any temp column)
    if len(unlabeled_df) == 0:
        if len(source_cols) > 1:
            # Remove temporary column
            return df_work.drop("__deduce_text__")
        return df_work

    # Precompute TF vectors for labeled rows once; reused for every
    # unlabeled row below.
    labeled_vectors = []
    for row in labeled_df.iter_rows(named=True):
        text = row[text_col]
        label = row[to_column]
        tokens = tokenize(text)
        vec = vectorize(tokens)
        labeled_vectors.append((vec, label))

    # Deduce labels for unlabeled rows (1-NN over cosine similarity)
    deduced_labels = []
    for row in unlabeled_df.iter_rows(named=True):
        text = row[text_col]
        tokens = tokenize(text)
        vec = vectorize(tokens)

        # Find most similar labeled example. best_score starts at -1.0 so
        # even an all-zero similarity still picks the first label rather
        # than leaving None.
        best_label = None
        best_score = -1.0

        for labeled_vec, label in labeled_vectors:
            score = cosine_similarity(vec, labeled_vec)
            if score > best_score:
                best_score = score
                best_label = label

        deduced_labels.append(best_label)

    # Create deduced labels series (replaces the all-null target column
    # of the unlabeled partition by name).
    deduced_series = pl.Series(to_column, deduced_labels)

    # Update unlabeled rows with deduced labels
    unlabeled_df = unlabeled_df.with_columns([deduced_series])

    # Combine labeled and unlabeled back together (original row order is
    # not restored — see docstring note).
    result_df = pl.concat([labeled_df, unlabeled_df])

    # Remove temporary column if created
    if len(source_cols) > 1:
        result_df = result_df.drop("__deduce_text__")

    # Print success message
    n_deduced = len(deduced_labels)
    print(f"✓ Deduced {n_deduced} label{'s' if n_deduced != 1 else ''} from {n_labeled} examples (offline, no LLMs)")

    return result_df
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
def deduce(
    df: Union[pd.DataFrame, pl.DataFrame],
    from_column: Union[str, List[str]],
    to_column: str
) -> Union[pd.DataFrame, pl.DataFrame]:
    """
    Deduce missing labels based on text similarity to labeled examples.

    Uses cosine similarity on term-frequency vectors (no IDF weighting).
    Pure Python, no LLMs, offline-first. At least one labeled example is
    required; fewer than 3 triggers an accuracy warning but still runs.

    When multiple source columns are provided, they are concatenated with
    spaces before computing similarity.

    Args:
        df: DataFrame with some labeled and some unlabeled rows
        from_column: Text column(s) to analyze
            - str: Single column (e.g., "comment")
            - List[str]: Multiple columns (e.g., ["comment", "notes"])
        to_column: Label column to fill (e.g., "status")

    Returns:
        DataFrame with deduced labels filled in, in the same backend as
        the input (pandas in → pandas out, Polars in → Polars out,
        Arrow-compatible in → Arrow table out)

    Raises:
        TypeError: If the input is not pandas, Polars, or Arrow-convertible

    Examples:
        # Single column
        >>> result = add.deduce(df, from_column="comment", to_column="status")

        # Multiple columns (better accuracy)
        >>> result = add.deduce(
        ...     df,
        ...     from_column=["comment", "notes", "description"],
        ...     to_column="status"
        ... )

    Privacy: Your data never leaves your machine. No external connections.
    """
    # Detect input backend and normalize to Polars for processing.
    if isinstance(df, pd.DataFrame):
        backend = "pandas"
        df_polars = pl.from_pandas(df)
    elif isinstance(df, pl.DataFrame):
        backend = "polars"
        df_polars = df
    else:
        # Fall back to the Arrow bridge (covers cudf and other
        # Arrow-compatible frames).
        try:
            df_polars = pl.from_arrow(df)
            backend = "arrow"
        except Exception as exc:
            # Chain the conversion failure so callers can see WHY the
            # Arrow bridge rejected the object, not just the type name.
            raise TypeError(f"Unsupported DataFrame type: {type(df)}") from exc

    # Process in Polars
    result_polars = _deduce_polars(df_polars, from_column, to_column)

    # Convert back to the caller's original format
    if backend == "pandas":
        return result_polars.to_pandas()
    if backend == "polars":
        return result_polars
    return result_polars.to_arrow()  # arrow
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Distribution Strategies for Synthetic Data Generation
|
|
3
|
-
|
|
4
|
-
DEPRECATED: This module has been moved to additory.common.distributions
|
|
5
|
-
Please update your imports to use additory.common.distributions instead.
|
|
6
|
-
|
|
7
|
-
This file is kept for backward compatibility and will be removed in a future version.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
import warnings
|
|
11
|
-
|
|
12
|
-
# Issue deprecation warning
|
|
13
|
-
warnings.warn(
|
|
14
|
-
"additory.synthetic.distributions is deprecated. "
|
|
15
|
-
"Please use additory.common.distributions instead. "
|
|
16
|
-
"This module will be removed in a future version.",
|
|
17
|
-
DeprecationWarning,
|
|
18
|
-
stacklevel=2
|
|
19
|
-
)
|
|
20
|
-
|
|
21
|
-
# Import everything from common.distributions for backward compatibility
|
|
22
|
-
from additory.common.distributions import * # noqa: F401, F403
|