additory 0.1.0a3-py3-none-any.whl → 0.1.0a4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +1 -1
- additory/core/config.py +3 -3
- additory/core/registry.py +4 -3
- additory/dynamic_api.py +55 -7
- additory/expressions/registry.py +3 -3
- additory/synthetic/deduce.py +259 -0
- additory/synthetic/strategies.py +76 -0
- {additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/METADATA +42 -19
- {additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/RECORD +12 -11
- {additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/top_level.txt +0 -0
additory/__init__.py
CHANGED
additory/core/config.py
CHANGED
@@ -329,14 +329,14 @@ def set_custom_formula_path(path):
 
 # backend preference setting
 
-_backend_preference: str
+_backend_preference: Optional[str] = None  # "cpu", "gpu", or None
 
-def set_backend_preference(mode: str
+def set_backend_preference(mode: Optional[str]):
     global _backend_preference
     if mode not in (None, "cpu", "gpu"):
         raise ValueError("backend must be 'cpu', 'gpu', or None")
     _backend_preference = mode
 
-def get_backend_preference() -> str
+def get_backend_preference() -> Optional[str]:
     return _backend_preference
 
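The change above relaxes the backend preference from a bare `str` annotation to an explicit `Optional[str]` defaulting to `None` (no preference). A minimal usage sketch, assuming the helpers are imported directly from `additory.core.config` as laid out in this file; this is not an official example from the project docs:

```python
# Sketch only: import path taken from the file shown above (additory/core/config.py).
from additory.core.config import set_backend_preference, get_backend_preference

print(get_backend_preference())   # None until a preference is set

set_backend_preference("gpu")     # accepted values: "cpu", "gpu", or None
print(get_backend_preference())   # "gpu"

set_backend_preference(None)      # clear the preference again

try:
    set_backend_preference("tpu")
except ValueError as exc:
    print(exc)                    # backend must be 'cpu', 'gpu', or None
```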
additory/core/registry.py
CHANGED
@@ -2,6 +2,7 @@
 # Versioned registry for additory
 
 from dataclasses import dataclass
+from typing import Optional
 import os
 import json
 
@@ -26,9 +27,9 @@ class ResolvedFormula:
     source: str
     version: str
     mode: str = "local"
-    ast: dict
-    sample_clean: dict
-    sample_unclean: dict
+    ast: Optional[dict] = None
+    sample_clean: Optional[dict] = None
+    sample_unclean: Optional[dict] = None
 
 
 # ------------------------------------------------------------
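Giving `ast`, `sample_clean`, and `sample_unclean` defaults of `Optional[...] = None` matters for dataclass semantics: a field without a default may not follow one that has a default (here `mode: str = "local"`), and the `None` defaults let a formula be registered before its AST or sample data exists. A self-contained sketch of the same pattern, using a stand-in class rather than the real `ResolvedFormula`:

```python
from dataclasses import dataclass
from typing import Optional

@dataclass
class Formula:                              # stand-in for ResolvedFormula, illustration only
    source: str
    version: str
    mode: str = "local"
    ast: Optional[dict] = None              # optional metadata can be filled in later
    sample_clean: Optional[dict] = None
    sample_unclean: Optional[dict] = None

f = Formula(source="bmi", version="1.0.0")  # constructs without AST or samples
print(f.mode, f.ast)                        # local None
```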
additory/dynamic_api.py
CHANGED
@@ -30,8 +30,15 @@ class AdditoryAPI(SimpleNamespace):
         self.my = ExpressionProxy(namespace="user")
         self._builtin_proxy = ExpressionProxy(namespace="builtin")
 
-        # Explicitly set
+        # Explicitly set methods to prevent namespace conflicts
         self.synthetic = self._synthetic_method
+        self.deduce = self._deduce_method
+        self.to = self._to_method
+        self.onehotencoding = self._onehotencoding_method
+        self.harmonize_units = self._harmonize_units_method
+        self.scan = self._scan_method
+        self.games = self._games_method
+        self.play = self._play_method
 
     def __getattr__(self, name):
         """
@@ -118,7 +125,7 @@ class AdditoryAPI(SimpleNamespace):
             additory.synthetic = self._synthetic_method
             raise
 
-    def
+    def _to_method(self, target_df, from_df=None, bring=None, against=None, **kwargs):
         """
         Add columns from reference dataframe to target dataframe.
 
@@ -139,7 +146,7 @@ class AdditoryAPI(SimpleNamespace):
         from additory.utilities.lookup import to
         return to(target_df, from_df, bring=bring, against=against, **kwargs)
 
-    def
+    def _onehotencoding_method(self, df, columns=None, **kwargs):
         """
         One-hot encode categorical columns.
 
@@ -154,7 +161,7 @@ class AdditoryAPI(SimpleNamespace):
         from additory.utilities.encoding import onehotencoding
         return onehotencoding(df, column=columns, **kwargs)
 
-    def
+    def _harmonize_units_method(self, df, value_column, unit_column, target_unit=None, position="end", **kwargs):
         """
         Harmonize units in a dataframe.
 
@@ -176,7 +183,7 @@ class AdditoryAPI(SimpleNamespace):
         from additory.utilities.units import harmonize_units
         return harmonize_units(df, value_column, unit_column, target_unit, position, **kwargs)
 
-    def
+    def _scan_method(
         self,
         df: Union[pl.DataFrame, pd.DataFrame, Any],
         preset: Optional[str] = None,
@@ -259,7 +266,48 @@ class AdditoryAPI(SimpleNamespace):
             verbose=verbose
         )
 
-    def
+    def _deduce_method(
+        self,
+        df: Union[pd.DataFrame, pl.DataFrame, Any],
+        from_column: Union[str, List[str]],
+        to_column: str
+    ) -> Union[pd.DataFrame, pl.DataFrame, Any]:
+        """
+        Deduce missing labels based on text similarity to labeled examples.
+
+        Uses cosine similarity on TF-IDF vectors. Pure Python, no LLMs, offline-first.
+        Requires at least 3 labeled examples to work.
+
+        When multiple source columns are provided, they are concatenated with
+        spaces before computing similarity.
+
+        Args:
+            df: DataFrame with some labeled and some unlabeled rows
+            from_column: Text column(s) to analyze
+                - str: Single column (e.g., "comment")
+                - List[str]: Multiple columns (e.g., ["comment", "notes"])
+            to_column: Label column to fill (e.g., "status")
+
+        Returns:
+            DataFrame with deduced labels filled in
+
+        Examples:
+            # Single column
+            >>> result = add.deduce(df, from_column="comment", to_column="status")
+
+            # Multiple columns (better accuracy)
+            >>> result = add.deduce(
+            ...     df,
+            ...     from_column=["comment", "notes", "description"],
+            ...     to_column="status"
+            ... )
+
+        Privacy: Your data never leaves your machine. No external connections.
+        """
+        from additory.synthetic.deduce import deduce as deduce_impl
+        return deduce_impl(df, from_column, to_column)
+
+    def _games_method(self):
         """
         List available games! 🎮
 
@@ -275,7 +323,7 @@ class AdditoryAPI(SimpleNamespace):
         """
         return ['tictactoe', 'sudoku']
 
-    def
+    def _play_method(self, game: str = "tictactoe"):
         """
         Play a game! 🎮
 
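The explicit `self.deduce = self._deduce_method` style bindings work because Python only falls back to `__getattr__` when normal attribute lookup fails; once these names are set on the instance in `__init__`, the dynamic expression lookup can no longer intercept them. A stripped-down sketch of the mechanism with a stand-in class, not the real `AdditoryAPI`:

```python
from types import SimpleNamespace

class API(SimpleNamespace):               # stand-in for AdditoryAPI, illustration only
    def __init__(self):
        super().__init__()
        # Explicit binding: normal lookup finds this instance attribute first,
        # so __getattr__ below is never consulted for "deduce".
        self.deduce = self._deduce_method

    def __getattr__(self, name):
        # Reached only for names not set on the instance,
        # e.g. dynamically resolved expression names like api.bmi.
        raise AttributeError(f"no expression named {name!r}")

    def _deduce_method(self, df, from_column, to_column):
        return f"would deduce {to_column!r} from {from_column!r}"

api = API()
print(api.deduce(None, "comment", "status"))  # bound method, no __getattr__ involved
```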
additory/expressions/registry.py
CHANGED
@@ -28,9 +28,9 @@ class ResolvedFormula:
     version: str
     mode: str = "local"
     namespace: str = "builtin"  # NEW: "builtin" or "user"
-    ast: dict
-    sample_clean: dict
-    sample_unclean: dict
+    ast: Optional[dict] = None
+    sample_clean: Optional[dict] = None
+    sample_unclean: Optional[dict] = None
 
 
 # ------------------------------------------------------------
additory/synthetic/deduce.py
ADDED
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+Text-based label deduction for additory.
+
+Uses TF-IDF + cosine similarity to deduce labels from text.
+Pure Python, no LLMs, offline-first.
+"""
+
+import math
+import re
+from collections import Counter
+from typing import Union, List, Optional
+import pandas as pd
+import polars as pl
+
+
+def tokenize(text: str) -> List[str]:
+    """
+    Tokenize text into words.
+
+    Args:
+        text: Input text
+
+    Returns:
+        List of lowercase tokens
+    """
+    if text is None or not isinstance(text, str):
+        return []
+
+    text = text.lower()
+    text = re.sub(r"[^a-z0-9\s]", " ", text)
+    return [w for w in text.split() if w]
+
+
+def vectorize(tokens: List[str]) -> Counter:
+    """
+    Convert tokens to TF vector (term frequency).
+
+    Args:
+        tokens: List of tokens
+
+    Returns:
+        Counter with term frequencies
+    """
+    return Counter(tokens)
+
+
+def cosine_similarity(v1: Counter, v2: Counter) -> float:
+    """
+    Compute cosine similarity between two vectors.
+
+    Args:
+        v1: First vector (Counter)
+        v2: Second vector (Counter)
+
+    Returns:
+        Similarity score (0-1)
+    """
+    # Dot product
+    dot = sum(v1[t] * v2[t] for t in v1 if t in v2)
+
+    # Magnitudes
+    mag1 = math.sqrt(sum(v * v for v in v1.values()))
+    mag2 = math.sqrt(sum(v * v for v in v2.values()))
+
+    if mag1 == 0 or mag2 == 0:
+        return 0.0
+
+    return dot / (mag1 * mag2)
+
+
+def _deduce_polars(
+    df: pl.DataFrame,
+    from_column: Union[str, List[str]],
+    to_column: str,
+    min_examples: int = 3
+) -> pl.DataFrame:
+    """
+    Deduce missing labels using text similarity (Polars-native).
+
+    Args:
+        df: Polars DataFrame
+        from_column: Text column(s) to analyze
+        to_column: Label column to fill
+        min_examples: Minimum labeled examples required
+
+    Returns:
+        DataFrame with deduced labels
+
+    Raises:
+        ValueError: If insufficient labeled examples
+    """
+    # Normalize from_column to list
+    if isinstance(from_column, str):
+        source_cols = [from_column]
+    else:
+        source_cols = from_column
+
+    # Validate columns exist
+    for col in source_cols:
+        if col not in df.columns:
+            raise ValueError(f"Column '{col}' not found in DataFrame")
+
+    if to_column not in df.columns:
+        raise ValueError(f"Column '{to_column}' not found in DataFrame")
+
+    # Create combined text column if multiple sources
+    if len(source_cols) == 1:
+        text_col = source_cols[0]
+        df_work = df.clone()
+    else:
+        # Concatenate multiple columns with spaces
+        df_work = df.with_columns([
+            pl.concat_str(
+                [pl.col(c).fill_null("") for c in source_cols],
+                separator=" "
+            ).alias("__deduce_text__")
+        ])
+        text_col = "__deduce_text__"
+
+    # Split into labeled and unlabeled
+    labeled_df = df_work.filter(pl.col(to_column).is_not_null())
+    unlabeled_df = df_work.filter(pl.col(to_column).is_null())
+
+    # Check if we have enough labeled examples
+    n_labeled = len(labeled_df)
+    if n_labeled == 0:
+        raise ValueError(
+            f"⚠️ Cannot deduce labels: No labeled examples found in '{to_column}' column.\n"
+            f"Please manually label at least {min_examples} examples per category, then run again.\n\n"
+            f"Note: additory uses pure Python text similarity (no LLMs, no external calls).\n"
+            f"Your data never leaves your machine."
+        )
+
+    if n_labeled < min_examples:
+        print(
+            f"⚠️ Only {n_labeled} labeled examples found. "
+            f"For better accuracy, label at least {min_examples} examples.\n"
+            f"Proceeding with available data..."
+        )
+
+    # If no unlabeled rows, return original
+    if len(unlabeled_df) == 0:
+        if len(source_cols) > 1:
+            # Remove temporary column
+            return df_work.drop("__deduce_text__")
+        return df_work
+
+    # Precompute vectors for labeled rows
+    labeled_vectors = []
+    for row in labeled_df.iter_rows(named=True):
+        text = row[text_col]
+        label = row[to_column]
+        tokens = tokenize(text)
+        vec = vectorize(tokens)
+        labeled_vectors.append((vec, label))
+
+    # Deduce labels for unlabeled rows
+    deduced_labels = []
+    for row in unlabeled_df.iter_rows(named=True):
+        text = row[text_col]
+        tokens = tokenize(text)
+        vec = vectorize(tokens)
+
+        # Find most similar labeled example
+        best_label = None
+        best_score = -1.0
+
+        for labeled_vec, label in labeled_vectors:
+            score = cosine_similarity(vec, labeled_vec)
+            if score > best_score:
+                best_score = score
+                best_label = label
+
+        deduced_labels.append(best_label)
+
+    # Create deduced labels series
+    deduced_series = pl.Series(to_column, deduced_labels)
+
+    # Update unlabeled rows with deduced labels
+    unlabeled_df = unlabeled_df.with_columns([deduced_series])
+
+    # Combine labeled and unlabeled back together
+    result_df = pl.concat([labeled_df, unlabeled_df])
+
+    # Remove temporary column if created
+    if len(source_cols) > 1:
+        result_df = result_df.drop("__deduce_text__")
+
+    # Print success message
+    n_deduced = len(deduced_labels)
+    print(f"✓ Deduced {n_deduced} label{'s' if n_deduced != 1 else ''} from {n_labeled} examples (offline, no LLMs)")
+
+    return result_df
+
+
+def deduce(
+    df: Union[pd.DataFrame, pl.DataFrame],
+    from_column: Union[str, List[str]],
+    to_column: str
+) -> Union[pd.DataFrame, pl.DataFrame]:
+    """
+    Deduce missing labels based on text similarity to labeled examples.
+
+    Uses cosine similarity on TF-IDF vectors. Pure Python, no LLMs, offline-first.
+    Requires at least 3 labeled examples to work.
+
+    When multiple source columns are provided, they are concatenated with
+    spaces before computing similarity.
+
+    Args:
+        df: DataFrame with some labeled and some unlabeled rows
+        from_column: Text column(s) to analyze
+            - str: Single column (e.g., "comment")
+            - List[str]: Multiple columns (e.g., ["comment", "notes"])
+        to_column: Label column to fill (e.g., "status")
+
+    Returns:
+        DataFrame with deduced labels filled in
+
+    Examples:
+        # Single column
+        >>> result = add.deduce(df, from_column="comment", to_column="status")
+
+        # Multiple columns (better accuracy)
+        >>> result = add.deduce(
+        ...     df,
+        ...     from_column=["comment", "notes", "description"],
+        ...     to_column="status"
+        ... )
+
+    Privacy: Your data never leaves your machine. No external connections.
+    """
+    # Detect input backend
+    if isinstance(df, pd.DataFrame):
+        backend = "pandas"
+        # Convert to Polars
+        df_polars = pl.from_pandas(df)
+    elif isinstance(df, pl.DataFrame):
+        backend = "polars"
+        df_polars = df
+    else:
+        # Try arrow bridge (for cudf, etc.)
+        try:
+            df_polars = pl.from_arrow(df)
+            backend = "arrow"
+        except Exception:
+            raise TypeError(f"Unsupported DataFrame type: {type(df)}")
+
+    # Process in Polars
+    result_polars = _deduce_polars(df_polars, from_column, to_column)
+
+    # Convert back to original format
+    if backend == "pandas":
+        return result_polars.to_pandas()
+    elif backend == "polars":
+        return result_polars
+    else:  # arrow
+        return result_polars.to_arrow()
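Two details of the module above are worth calling out: despite the "TF-IDF" wording in the docstrings, `vectorize` builds plain term-frequency `Counter`s (there is no IDF weighting step), and each unlabeled row simply takes the label of its single most similar labeled row, however low that best score is. A short sketch that drives the three helpers directly, assuming the module imports as shipped in 0.1.0a4:

```python
# Assumes additory 0.1.0a4 is installed; helper names come from the diff above.
from additory.synthetic.deduce import tokenize, vectorize, cosine_similarity

labeled = {
    "Technical": vectorize(tokenize("Cannot log in to the app")),
    "Billing":   vectorize(tokenize("Question about my invoice and billing")),
}

query = vectorize(tokenize("App crashes when I log in"))

# Rank labeled examples by cosine similarity to the unlabeled text.
scores = {label: cosine_similarity(query, vec) for label, vec in labeled.items()}
print(scores)                        # {'Technical': 0.5, 'Billing': 0.0}
print(max(scores, key=scores.get))   # 'Technical', the label deduce() would assign
```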
additory/synthetic/strategies.py
CHANGED
@@ -848,3 +848,79 @@ def apply_smote_strategy(
         )
     except Exception as e:
         raise ValidationError(f"SMOTE strategy failed: {e}")
+
+
+def parse_deduce_strategy(strategy_spec: str) -> Tuple[str, List[str]]:
+    """
+    Parse deduce strategy specification.
+
+    Args:
+        strategy_spec: Strategy string like:
+            - "deduce:comment"
+            - "deduce:[comment, notes]"
+
+    Returns:
+        Tuple of (strategy_type, source_columns)
+        - strategy_type: "deduce"
+        - source_columns: List of source column names
+
+    Raises:
+        ValidationError: If strategy format is invalid
+
+    Examples:
+        >>> parse_deduce_strategy("deduce:comment")
+        ("deduce", ["comment"])
+
+        >>> parse_deduce_strategy("deduce:[comment, notes]")
+        ("deduce", ["comment", "notes"])
+    """
+    if not strategy_spec.startswith("deduce:"):
+        raise ValidationError(
+            f"Invalid deduce strategy: {strategy_spec}. "
+            "Must start with 'deduce:'"
+        )
+
+    # Extract source specification after "deduce:"
+    source_spec = strategy_spec[7:].strip()  # Remove "deduce:" prefix
+
+    if not source_spec:
+        raise ValidationError(
+            f"Deduce strategy requires source column(s): {strategy_spec}. "
+            "Format: 'deduce:column' or 'deduce:[col1, col2]'"
+        )
+
+    # Check if it's multiple columns: deduce:[col1, col2]
+    if source_spec.startswith("[") and source_spec.endswith("]"):
+        # Multiple columns
+        columns_str = source_spec[1:-1]  # Remove brackets
+
+        if not columns_str.strip():
+            raise ValidationError(
+                f"Deduce column list cannot be empty: {strategy_spec}"
+            )
+
+        # Split by comma and strip whitespace
+        columns = [c.strip() for c in columns_str.split(",")]
+
+        if len(columns) == 0:
+            raise ValidationError(
+                f"Deduce strategy must specify at least one column: {strategy_spec}"
+            )
+
+        return "deduce", columns
+    else:
+        # Single column
+        return "deduce", [source_spec]
+
+
+def is_deduce_strategy(strategy_spec: str) -> bool:
+    """
+    Check if a strategy specification is a deduce strategy.
+
+    Args:
+        strategy_spec: Strategy string
+
+    Returns:
+        True if it's a deduce strategy, False otherwise
+    """
+    return isinstance(strategy_spec, str) and strategy_spec.startswith("deduce:")
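`parse_deduce_strategy` and `is_deduce_strategy` are self-contained string helpers, so they can be exercised on their own. Below is a hypothetical dispatch loop under the assumption that a strategy dict mixes deduce specs with the other inline strategies; only the two helpers come from this diff, the surrounding loop is illustrative:

```python
# Only is_deduce_strategy / parse_deduce_strategy come from the diff above;
# the dispatch loop around them is a hypothetical sketch.
from additory.synthetic.strategies import is_deduce_strategy, parse_deduce_strategy

strategy = {
    "id": "increment:start=1",
    "status": "deduce:[comment, notes]",   # fill 'status' from two text columns
}

for column, spec in strategy.items():
    if is_deduce_strategy(spec):
        kind, source_columns = parse_deduce_strategy(spec)
        print(column, kind, source_columns)   # status deduce ['comment', 'notes']
    else:
        pass  # other inline strategies (increment, choice, range, ...) handled elsewhere
```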
{additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: additory
-Version: 0.1.0a3
+Version: 0.1.0a4
 Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.
 Author: Krishnamoorthy Sankaran
 License: MIT
@@ -39,7 +39,7 @@ Dynamic: license-file
 
 [![Python](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
-[![Version](https://img.shields.io/badge/version-0.1.0a3-green.svg)](https://github.com/sekarkrishna/additory)
 
 **Author:** Krishnamoorthy Sankaran
 
@@ -52,17 +52,17 @@ Dynamic: license-file
 ## 📦 Installation
 
 ```bash
-pip install additory==0.1.
+pip install additory==0.1.0a4
 ```
 
 **Optional GPU support:**
 ```bash
-pip install additory[gpu]==0.1.
+pip install additory[gpu]==0.1.0a4  # Includes cuDF for GPU acceleration
 ```
 
 **Development installation:**
 ```bash
-pip install additory[dev]==0.1.0a2 # Includes testing and development tools
+pip install additory[dev]==0.1.0a4  # Includes testing and development tools
 ```
 
 ## 🎯 Core Functions
@@ -70,7 +70,8 @@ pip install additory[dev]==0.1.0a2 # Includes testing and development tools
 | Function | Purpose | Example |
 |----------|---------|---------|
 | `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
-| `add.
+| `add.synthetic()` | Generate additional data | `add.synthetic(df, n_rows=1000)` |
+| `add.deduce()` | Text-based label deduction | `add.deduce(df, from_column='text', to_column='label')` |
 | `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
 
 ## 🧬 Available Expressions
@@ -119,7 +120,7 @@ import additory as add
 
 # Works with polars
 df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
-result = add.
+result = add.synthetic(df_polars, n_rows=100)
 
 # Automatic type detection and conversion
 ```
@@ -193,22 +194,44 @@ patients_with_bsa = add.bsa(patients)
 result = add.fitness_score(add.bmr(add.bmi(patients)))
 ```
 
-### 🔄
+### 🔄 Synthetic Data Generation
 
-**
+**Synthetic** generates additional data similar to your existing dataset using inline strategies.
 
 ```python
-#
-more_customers = add.
+# Extend existing data (learns from patterns)
+more_customers = add.synthetic(customers, n_rows=1000)
 
 # Create data from scratch with strategies
-new_data = add.
+new_data = add.synthetic("@new", n_rows=500, strategy={
     'id': 'increment:start=1',
     'name': 'choice:[John,Jane,Bob]',
     'age': 'range:18-65'
 })
 ```
 
+### 🤖 Text-Based Label Deduction
+
+**Deduce** automatically fills in missing labels by learning from your existing labeled examples. Pure Python, no LLMs, offline-first.
+
+```python
+# Deduce missing labels from text
+tickets = pd.DataFrame({
+    "ticket_text": ["Cannot log in", "Billing question", "App crashes", "Need invoice"],
+    "category": ["Technical", "Billing", None, None]
+})
+
+# Automatically fill in missing categories
+result = add.deduce(tickets, from_column="ticket_text", to_column="category")
+
+# Use multiple columns for better accuracy
+result = add.deduce(
+    df,
+    from_column=["title", "description"],
+    to_column="category"
+)
+```
+
 ## 🧪 Examples
 
 ### E-commerce Data Pipeline
@@ -224,7 +247,7 @@ customers = pd.DataFrame({
 })
 
 # Generate more customers
-customers = add.
+customers = add.synthetic(customers, n_rows=10000)
 
 # Add customer tiers
 tiers = pd.DataFrame({
@@ -250,7 +273,7 @@ strategy = {
     'height_cm': 'range:150-200'  # Height in cm
 }
 
-patients = add.
+patients = add.synthetic("@new", n_rows=1000, strategy=strategy)
 
 # Convert height to meters for expressions
 patients['height_m'] = patients['height_cm'] / 100
@@ -265,19 +288,19 @@ print(result.correlations)
 
 ## 📚 Documentation
 
-- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/
-- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/
+- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Detailed guides for each function
+- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Complete expressions reference
 
 ## 📄 License
 
-MIT License - see [LICENSE](
+MIT License - see [LICENSE](LICENSE) file for details.
 
 ## 📞 Support
 
 - **Issues**: [GitHub Issues](https://github.com/sekarkrishna/additory/issues)
-- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/
+- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)
 
-## 🗺️ v0.1.1 (
+## 🗺️ v0.1.1 (January 2026)
 - Enhanced documentation and tutorials
 - Performance optimizations
 - Additional expressions
{additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
-additory/__init__.py,sha256=
-additory/dynamic_api.py,sha256=
+additory/__init__.py,sha256=6LWidwOfdQInwiTCaS9OlJG97WDnXCEMncZF9f9Uheo,397
+additory/dynamic_api.py,sha256=XcmXMS-k2u5RoH5uM6BoQfTLGgF4UcZvXs9f9-BlOM4,13814
 additory/analysis/__init__.py,sha256=F_yhD_hcIWbwO1wrRe8Js1RI-vkozaKyWNIIEb-jSMk,986
 additory/analysis/cardinality.py,sha256=y4ttjk3VFNm3mEfNZaTegVQxH7btnmXgnDUSkctNuTo,2976
 additory/analysis/correlations.py,sha256=n0vIPW9lTTSPsPlr40YOIohTX3mUgGmSLdlBrkJZa1c,3909
@@ -20,7 +20,7 @@ additory/core/__init__.py,sha256=dhEBneupBndNBlsQI8niFZgQjUJDLORzRcFtvXGXg-E,630
 additory/core/ast_builder.py,sha256=cW65w-utVGjUJos1ffmfEPgPbxVwN6WU-vcDKrBPy8o,5303
 additory/core/column_positioning.py,sha256=1frwieAvdHXvlZzlUhL1BXP1P_iOZ7yzCNDlvw4L9kI,13241
 additory/core/compiler_polars.py,sha256=wN_785yk7N3tYGPCP2IsOpCeWxqJNOMq35TX-xoSCS8,5161
-additory/core/config.py,sha256=
+additory/core/config.py,sha256=3qqM_JIahzf4ZscjU0OzlAAYQ7kEdfjG7ztKk8993nQ,9587
 additory/core/enhanced_cache_manager.py,sha256=7hpoMucAWkP_-sUzst_JigPKK04S6TsYLpI_m-s9FrY,47230
 additory/core/enhanced_matchers.py,sha256=lZO-PPfiAiriX4SjTenaulWqijogq9NnhUATHfwMqak,20353
 additory/core/enhanced_version_manager.py,sha256=wIk5pg0Pn5KahgsGMYtmHxxxX3sltnwHqJ_QT7mosNw,12957
@@ -32,7 +32,7 @@ additory/core/memory_manager.py,sha256=b1H1juAg2CXioSI4N65XldPdKxHTXRI3MSTSAtKV3
 additory/core/namespace_manager.py,sha256=RWbMZBcoXvpdcz5km2CJlXcrDwWE4DES-lGET4r98Pk,23325
 additory/core/parser.py,sha256=yVh87CiE4fmrg4jFisNMKTHlz4OpAMNVFF4thq6I0JE,4748
 additory/core/polars_expression_engine.py,sha256=O4s-ZtHgP2SQd_LsdGgCPVOACJgJsL7W48wj6CbutFw,23158
-additory/core/registry.py,sha256=
+additory/core/registry.py,sha256=bhm__bPuLvGvjbEUm8xbb0lUXQ1ZMd1Pcn2oUsIg4Rc,5658
 additory/core/sample_data_manager.py,sha256=urBT2T5NZZM0KXriuW5xfCwC1SA3WHwraVMtz5qyw7Y,19800
 additory/core/user_namespace.py,sha256=qgPhuHuhiePa9Qr2CtBCuflpUfxD8wTakWFcp5Ve2xU,22522
 additory/core/validator.py,sha256=em71_1TAdk44B2yyNwzmxkh4pMpqAq1JN_oHoDH7fCk,588
@@ -43,16 +43,17 @@ additory/expressions/__init__.py,sha256=FYZjHA7zJie1HRAQjMo6MdQxwYW2owrHulKXjfBF
 additory/expressions/engine.py,sha256=yOzZDNKjltP-HLVKBL4BXke63ALqgRFXgHK4YeeXLQ8,21138
 additory/expressions/parser.py,sha256=yVh87CiE4fmrg4jFisNMKTHlz4OpAMNVFF4thq6I0JE,4748
 additory/expressions/proxy.py,sha256=kohaZTtU5f_r3O_WidnNKXzN3IAmAnt6M0L5F3mpb7I,22044
-additory/expressions/registry.py,sha256=
+additory/expressions/registry.py,sha256=DPkjg8YfQxjz0Tf6nif_e_8uTNAMfHd7LmUNL2GqZ-4,11024
 additory/expressions/samples.py,sha256=urBT2T5NZZM0KXriuW5xfCwC1SA3WHwraVMtz5qyw7Y,19800
 additory/synthetic/__init__.py,sha256=Zw0GqXXh5v6_1S6SxPcEYL7CzNmaRuVk1aC3qBOQ2RE,342
 additory/synthetic/column_name_resolver.py,sha256=-kh6bxitaSUwk28TZ5yPzbLUe6nxU7oYsazKEwumtRA,4913
+additory/synthetic/deduce.py,sha256=CjEw-mCHGO1GjQtb0i-YX8QlmHiJORwMqjnfp8Oxm68,7736
 additory/synthetic/distributions.py,sha256=jrwDGVy_Vcm5XXoGKy-V0LrpnxdGM5p84GklKq-0b_A,705
 additory/synthetic/forecast.py,sha256=F2XoKEDFDJ47W6bSzy2jXYWU3PN5X0l16YvtfxXc4GU,34820
 additory/synthetic/linked_list_parser.py,sha256=YysP1ODyABJzUe82QLEfbuxGknTCyWb81tf8Pueg-oE,13002
 additory/synthetic/namespace_lookup.py,sha256=4ILe1MWubGvRsF_xbQLybBbr3hG0iMTseypigB_66TI,4096
 additory/synthetic/smote.py,sha256=ub8pTA5Ez3WjXP15GtyUqCRztiPr7XfHbNGTucUFErA,9092
-additory/synthetic/strategies.py,sha256=
+additory/synthetic/strategies.py,sha256=2Cn6wy-tRTj9CuBkhYizB6oQGev4EzxrprbQlKyxce8,28620
 additory/synthetic/synthesizer.py,sha256=9YHXyA9wfUyMZLse7nBMJ1hQ0F9SJmF4j01y4Oyebzg,26405
 additory/utilities/__init__.py,sha256=I28c5ZqqZ2VsMIG40fUBJhnc930cFXHJX22xQWARXq8,1679
 additory/utilities/encoding.py,sha256=DhTaTeUlJOSixQ3-hgUwSy1jMJAYadV2bQHuONVzzEY,20995
@@ -64,8 +65,8 @@ additory/utilities/resolvers.py,sha256=ykMfce2f9in9wqHgmljCFIil8xGcalT0FBwFIwHOl
 additory/utilities/settings.py,sha256=5XB2S3L7Ht486LZMDacYTuyB_ta7sVohUFEKzMo1nDU,4698
 additory/utilities/units.py,sha256=75VFSLCVhX3dcFokh-jbZepDRaFRuO2QpGZNQbG8fag,30526
 additory/utilities/validators.py,sha256=K1ZYsPL3W7XkIUECVWov4HZxTlzqs9Rbc61Vidh2F8o,4213
-additory-0.1.
-additory-0.1.
-additory-0.1.
-additory-0.1.
-additory-0.1.
+additory-0.1.0a4.dist-info/licenses/LICENSE,sha256=ztobegtjJRyvQntGjQ1w80MGuTOeMmWkh5Be-pFyq3I,1079
+additory-0.1.0a4.dist-info/METADATA,sha256=uYviUo6_AjdbVtV91hFZL7fyEwh4Pd_HsDqjgzxqDts,8729
+additory-0.1.0a4.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+additory-0.1.0a4.dist-info/top_level.txt,sha256=4zphwXiI6HEl40fdjMXoUp9JNIqQ-tgYWeo3zqKqvEk,9
+additory-0.1.0a4.dist-info/RECORD,,
{additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/WHEEL
File without changes

{additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/licenses/LICENSE
File without changes

{additory-0.1.0a3.dist-info → additory-0.1.0a4.dist-info}/top_level.txt
File without changes