additory 0.1.0a2__tar.gz → 0.1.0a4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {additory-0.1.0a2 → additory-0.1.0a4}/PKG-INFO +44 -28
- {additory-0.1.0a2 → additory-0.1.0a4}/README.md +41 -26
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/__init__.py +4 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/__init__.py +2 -2
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/backend.py +20 -4
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/distributions.py +1 -1
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/sample_data.py +19 -19
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/backends/arrow_bridge.py +7 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/config.py +3 -3
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/polars_expression_engine.py +66 -16
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/registry.py +4 -3
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/dynamic_api.py +95 -51
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/proxy.py +4 -1
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/registry.py +3 -3
- additory-0.1.0a4/additory/synthetic/__init__.py +13 -0
- additory-0.1.0a4/additory/synthetic/column_name_resolver.py +149 -0
- additory-0.1.0a4/additory/synthetic/deduce.py +259 -0
- {additory-0.1.0a2/additory/augment → additory-0.1.0a4/additory/synthetic}/distributions.py +2 -2
- {additory-0.1.0a2/additory/augment → additory-0.1.0a4/additory/synthetic}/forecast.py +1 -1
- additory-0.1.0a4/additory/synthetic/linked_list_parser.py +415 -0
- additory-0.1.0a4/additory/synthetic/namespace_lookup.py +129 -0
- {additory-0.1.0a2/additory/augment → additory-0.1.0a4/additory/synthetic}/smote.py +1 -1
- {additory-0.1.0a2/additory/augment → additory-0.1.0a4/additory/synthetic}/strategies.py +87 -44
- additory-0.1.0a2/additory/augment/augmentor.py → additory-0.1.0a4/additory/synthetic/synthesizer.py +75 -15
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/units.py +4 -1
- {additory-0.1.0a2 → additory-0.1.0a4}/additory.egg-info/SOURCES.txt +11 -37
- additory-0.1.0a4/documentation/V0.1.0/add_deduce_function.html +759 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/add_onehotencoding_function.html +43 -40
- additory-0.1.0a2/documentation/V0.1.0/add_augment_function.html → additory-0.1.0a4/documentation/V0.1.0/add_synthetic_function.html +140 -147
- {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/expressions.html +55 -79
- {additory-0.1.0a2 → additory-0.1.0a4}/pyproject.toml +3 -2
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/age_category_0.1.add +23 -5
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/blood_pressure_category_0.1.add +24 -5
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/bmi2_0.1.add +3 -3
- additory-0.1.0a4/reference/expressions_definitions/bmi3_0.1.add +41 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/bmi_0.1.add +3 -3
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/bmr_0.1.add +13 -5
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/body_fat_percentage_0.1.add +13 -5
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/bsa_0.1.add +11 -5
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/cholesterol_ratio_0.1.add +11 -5
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/fitness_score_0.1.add +24 -5
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/ideal_body_weight_0.1.add +14 -5
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/waist_hip_ratio_0.1.add +11 -5
- additory-0.1.0a2/additory/augment/__init__.py +0 -24
- additory-0.1.0a2/additory/augment/builtin_lists.py +0 -430
- additory-0.1.0a2/additory/augment/list_registry.py +0 -177
- additory-0.1.0a2/additory/synthetic/__init__.py +0 -101
- additory-0.1.0a2/additory/synthetic/api.py +0 -220
- additory-0.1.0a2/additory/synthetic/common_integration.py +0 -314
- additory-0.1.0a2/additory/synthetic/config.py +0 -262
- additory-0.1.0a2/additory/synthetic/engines.py +0 -529
- additory-0.1.0a2/additory/synthetic/exceptions.py +0 -180
- additory-0.1.0a2/additory/synthetic/file_managers.py +0 -518
- additory-0.1.0a2/additory/synthetic/generator.py +0 -702
- additory-0.1.0a2/additory/synthetic/generator_parser.py +0 -68
- additory-0.1.0a2/additory/synthetic/integration.py +0 -319
- additory-0.1.0a2/additory/synthetic/models.py +0 -241
- additory-0.1.0a2/additory/synthetic/pattern_resolver.py +0 -573
- additory-0.1.0a2/additory/synthetic/performance.py +0 -469
- additory-0.1.0a2/additory/synthetic/polars_integration.py +0 -464
- additory-0.1.0a2/additory/synthetic/proxy.py +0 -60
- additory-0.1.0a2/additory/synthetic/schema_parser.py +0 -685
- additory-0.1.0a2/additory/synthetic/validator.py +0 -553
- additory-0.1.0a2/documentation/V0.1.0/add_synth_function.html +0 -664
- additory-0.1.0a2/reference/expressions_definitions/bmi3_0.1.add +0 -26
- additory-0.1.0a2/reference/schema_definitions/ca.list +0 -41
- additory-0.1.0a2/reference/schema_definitions/ca.properties +0 -14
- additory-0.1.0a2/reference/schema_definitions/eu.list +0 -41
- additory-0.1.0a2/reference/schema_definitions/eu.properties +0 -13
- additory-0.1.0a2/reference/schema_definitions/finance.list +0 -31
- additory-0.1.0a2/reference/schema_definitions/finance.properties +0 -18
- additory-0.1.0a2/reference/schema_definitions/global.list +0 -57
- additory-0.1.0a2/reference/schema_definitions/global.properties +0 -11
- additory-0.1.0a2/reference/schema_definitions/healthcare.list +0 -28
- additory-0.1.0a2/reference/schema_definitions/us.list +0 -41
- additory-0.1.0a2/reference/schema_definitions/us.properties +0 -14
- {additory-0.1.0a2 → additory-0.1.0a4}/LICENSE +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/__init__.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/cardinality.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/correlations.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/distributions.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/quality.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/analysis/scan.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/column_utils.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/exceptions.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/lists.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/patterns.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/resolver.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/common/validation.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/__init__.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/ast_builder.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/backends/__init__.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/backends/cudf_bridge.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/column_positioning.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/compiler_polars.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/enhanced_cache_manager.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/enhanced_matchers.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/enhanced_version_manager.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/executor.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/integrity_manager.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/loader.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/logging.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/memory_manager.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/namespace_manager.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/parser.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/sample_data_manager.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/user_namespace.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/core/validator.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/__init__.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/engine.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/parser.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/expressions/samples.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/__init__.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/encoding.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/games.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/keys.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/lookup.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/matchers.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/resolvers.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/settings.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/additory/utilities/validators.py +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/add_harmonize_units_function.html +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/add_scan_function.html +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/documentation/V0.1.0/add_to_function.html +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/reference/expressions_definitions/manifest.json +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/setup.cfg +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/bmi1_0.1.add +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/bmi2_0.1.add +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/bmi3_0.1.add +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/bmi_0.1.add +0 -0
- {additory-0.1.0a2 → additory-0.1.0a4}/user_expressions/manifest.json +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: additory
|
|
3
|
-
Version: 0.1.0a2
|
|
4
|
-
Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data
|
|
3
|
+
Version: 0.1.0a4
|
|
4
|
+
Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.
|
|
5
5
|
Author: Krishnamoorthy Sankaran
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: homepage, https://github.com/sekarkrishna/additory
|
|
@@ -13,6 +13,7 @@ Description-Content-Type: text/markdown
|
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
Requires-Dist: pandas>=1.5
|
|
15
15
|
Requires-Dist: polars>=0.20
|
|
16
|
+
Requires-Dist: pyarrow>=10.0
|
|
16
17
|
Requires-Dist: pyyaml>=6.0
|
|
17
18
|
Requires-Dist: requests>=2.31
|
|
18
19
|
Requires-Dist: toml>=0.10
|
|
@@ -34,11 +35,11 @@ Dynamic: license-file
|
|
|
34
35
|
|
|
35
36
|
# Additory
|
|
36
37
|
|
|
37
|
-
**A semantic, extensible dataframe transformation engine with expressions, lookup,
|
|
38
|
+
**A semantic, extensible dataframe transformation engine with expressions, lookup, and augmentation support.**
|
|
38
39
|
|
|
39
40
|
[](https://www.python.org/downloads/)
|
|
40
41
|
[](https://opensource.org/licenses/MIT)
|
|
41
|
-
[](https://github.com/sekarkrishna/additory)
|
|
42
43
|
|
|
43
44
|
**Author:** Krishnamoorthy Sankaran
|
|
44
45
|
|
|
@@ -51,17 +52,17 @@ Dynamic: license-file
|
|
|
51
52
|
## 📦 Installation
|
|
52
53
|
|
|
53
54
|
```bash
|
|
54
|
-
pip install additory==0.1.
|
|
55
|
+
pip install additory==0.1.0a4
|
|
55
56
|
```
|
|
56
57
|
|
|
57
58
|
**Optional GPU support:**
|
|
58
59
|
```bash
|
|
59
|
-
pip install additory[gpu]==0.1.
|
|
60
|
+
pip install additory[gpu]==0.1.0a4 # Includes cuDF for GPU acceleration
|
|
60
61
|
```
|
|
61
62
|
|
|
62
63
|
**Development installation:**
|
|
63
64
|
```bash
|
|
64
|
-
pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
65
|
+
pip install additory[dev]==0.1.0a4 # Includes testing and development tools
|
|
65
66
|
```
|
|
66
67
|
|
|
67
68
|
## 🎯 Core Functions
|
|
@@ -69,8 +70,8 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
|
69
70
|
| Function | Purpose | Example |
|
|
70
71
|
|----------|---------|---------|
|
|
71
72
|
| `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
|
|
72
|
-
| `add.
|
|
73
|
-
| `add.
|
|
73
|
+
| `add.synthetic()` | Generate additional data | `add.synthetic(df, n_rows=1000)` |
|
|
74
|
+
| `add.deduce()` | Text-based label deduction | `add.deduce(df, from_column='text', to_column='label')` |
|
|
74
75
|
| `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
|
|
75
76
|
|
|
76
77
|
## 🧬 Available Expressions
|
|
@@ -119,7 +120,7 @@ import additory as add
|
|
|
119
120
|
|
|
120
121
|
# Works with polars
|
|
121
122
|
df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
|
122
|
-
result = add.
|
|
123
|
+
result = add.synthetic(df_polars, n_rows=100)
|
|
123
124
|
|
|
124
125
|
# Automatic type detection and conversion
|
|
125
126
|
```
|
|
@@ -193,27 +194,42 @@ patients_with_bsa = add.bsa(patients)
|
|
|
193
194
|
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
194
195
|
```
|
|
195
196
|
|
|
196
|
-
### 🔄
|
|
197
|
+
### 🔄 Synthetic Data Generation
|
|
197
198
|
|
|
198
|
-
**
|
|
199
|
-
|
|
200
|
-
**Key Differences:**
|
|
201
|
-
- **Augment**: Learns patterns from existing data to create similar rows
|
|
202
|
-
- **Synthetic**: Uses predefined schemas to generate structured data
|
|
199
|
+
**Synthetic** generates additional data similar to your existing dataset using inline strategies.
|
|
203
200
|
|
|
204
201
|
```python
|
|
205
|
-
#
|
|
206
|
-
more_customers = add.
|
|
202
|
+
# Extend existing data (learns from patterns)
|
|
203
|
+
more_customers = add.synthetic(customers, n_rows=1000)
|
|
207
204
|
|
|
208
205
|
# Create data from scratch with strategies
|
|
209
|
-
new_data = add.
|
|
206
|
+
new_data = add.synthetic("@new", n_rows=500, strategy={
|
|
210
207
|
'id': 'increment:start=1',
|
|
211
208
|
'name': 'choice:[John,Jane,Bob]',
|
|
212
209
|
'age': 'range:18-65'
|
|
213
210
|
})
|
|
211
|
+
```
|
|
212
|
+
|
|
213
|
+
### 🤖 Text-Based Label Deduction
|
|
214
214
|
|
|
215
|
-
|
|
216
|
-
|
|
215
|
+
**Deduce** automatically fills in missing labels by learning from your existing labeled examples. Pure Python, no LLMs, offline-first.
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
# Deduce missing labels from text
|
|
219
|
+
tickets = pd.DataFrame({
|
|
220
|
+
"ticket_text": ["Cannot log in", "Billing question", "App crashes", "Need invoice"],
|
|
221
|
+
"category": ["Technical", "Billing", None, None]
|
|
222
|
+
})
|
|
223
|
+
|
|
224
|
+
# Automatically fill in missing categories
|
|
225
|
+
result = add.deduce(tickets, from_column="ticket_text", to_column="category")
|
|
226
|
+
|
|
227
|
+
# Use multiple columns for better accuracy
|
|
228
|
+
result = add.deduce(
|
|
229
|
+
df,
|
|
230
|
+
from_column=["title", "description"],
|
|
231
|
+
to_column="category"
|
|
232
|
+
)
|
|
217
233
|
```
|
|
218
234
|
|
|
219
235
|
## 🧪 Examples
|
|
@@ -231,7 +247,7 @@ customers = pd.DataFrame({
|
|
|
231
247
|
})
|
|
232
248
|
|
|
233
249
|
# Generate more customers
|
|
234
|
-
customers = add.
|
|
250
|
+
customers = add.synthetic(customers, n_rows=10000)
|
|
235
251
|
|
|
236
252
|
# Add customer tiers
|
|
237
253
|
tiers = pd.DataFrame({
|
|
@@ -257,7 +273,7 @@ strategy = {
|
|
|
257
273
|
'height_cm': 'range:150-200' # Height in cm
|
|
258
274
|
}
|
|
259
275
|
|
|
260
|
-
patients = add.
|
|
276
|
+
patients = add.synthetic("@new", n_rows=1000, strategy=strategy)
|
|
261
277
|
|
|
262
278
|
# Convert height to meters for expressions
|
|
263
279
|
patients['height_m'] = patients['height_cm'] / 100
|
|
@@ -272,19 +288,19 @@ print(result.correlations)
|
|
|
272
288
|
|
|
273
289
|
## 📚 Documentation
|
|
274
290
|
|
|
275
|
-
- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/
|
|
276
|
-
- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/
|
|
291
|
+
- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Detailed guides for each function
|
|
292
|
+
- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Complete expressions reference
|
|
277
293
|
|
|
278
294
|
## 📄 License
|
|
279
295
|
|
|
280
|
-
MIT License - see [LICENSE](
|
|
296
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
281
297
|
|
|
282
298
|
## 📞 Support
|
|
283
299
|
|
|
284
300
|
- **Issues**: [GitHub Issues](https://github.com/sekarkrishna/additory/issues)
|
|
285
|
-
- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/
|
|
301
|
+
- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)
|
|
286
302
|
|
|
287
|
-
## 🗺️ v0.1.1 (
|
|
303
|
+
## 🗺️ v0.1.1 (January 2026)
|
|
288
304
|
- Enhanced documentation and tutorials
|
|
289
305
|
- Performance optimizations
|
|
290
306
|
- Additional expressions
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# Additory
|
|
2
2
|
|
|
3
|
-
**A semantic, extensible dataframe transformation engine with expressions, lookup,
|
|
3
|
+
**A semantic, extensible dataframe transformation engine with expressions, lookup, and augmentation support.**
|
|
4
4
|
|
|
5
5
|
[](https://www.python.org/downloads/)
|
|
6
6
|
[](https://opensource.org/licenses/MIT)
|
|
7
|
-
[](https://github.com/sekarkrishna/additory)
|
|
8
8
|
|
|
9
9
|
**Author:** Krishnamoorthy Sankaran
|
|
10
10
|
|
|
@@ -17,17 +17,17 @@
|
|
|
17
17
|
## 📦 Installation
|
|
18
18
|
|
|
19
19
|
```bash
|
|
20
|
-
pip install additory==0.1.
|
|
20
|
+
pip install additory==0.1.0a4
|
|
21
21
|
```
|
|
22
22
|
|
|
23
23
|
**Optional GPU support:**
|
|
24
24
|
```bash
|
|
25
|
-
pip install additory[gpu]==0.1.
|
|
25
|
+
pip install additory[gpu]==0.1.0a4 # Includes cuDF for GPU acceleration
|
|
26
26
|
```
|
|
27
27
|
|
|
28
28
|
**Development installation:**
|
|
29
29
|
```bash
|
|
30
|
-
pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
30
|
+
pip install additory[dev]==0.1.0a4 # Includes testing and development tools
|
|
31
31
|
```
|
|
32
32
|
|
|
33
33
|
## 🎯 Core Functions
|
|
@@ -35,8 +35,8 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
|
35
35
|
| Function | Purpose | Example |
|
|
36
36
|
|----------|---------|---------|
|
|
37
37
|
| `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
|
|
38
|
-
| `add.
|
|
39
|
-
| `add.
|
|
38
|
+
| `add.synthetic()` | Generate additional data | `add.synthetic(df, n_rows=1000)` |
|
|
39
|
+
| `add.deduce()` | Text-based label deduction | `add.deduce(df, from_column='text', to_column='label')` |
|
|
40
40
|
| `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
|
|
41
41
|
|
|
42
42
|
## 🧬 Available Expressions
|
|
@@ -85,7 +85,7 @@ import additory as add
|
|
|
85
85
|
|
|
86
86
|
# Works with polars
|
|
87
87
|
df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
|
88
|
-
result = add.
|
|
88
|
+
result = add.synthetic(df_polars, n_rows=100)
|
|
89
89
|
|
|
90
90
|
# Automatic type detection and conversion
|
|
91
91
|
```
|
|
@@ -159,27 +159,42 @@ patients_with_bsa = add.bsa(patients)
|
|
|
159
159
|
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
160
160
|
```
|
|
161
161
|
|
|
162
|
-
### 🔄
|
|
162
|
+
### 🔄 Synthetic Data Generation
|
|
163
163
|
|
|
164
|
-
**
|
|
165
|
-
|
|
166
|
-
**Key Differences:**
|
|
167
|
-
- **Augment**: Learns patterns from existing data to create similar rows
|
|
168
|
-
- **Synthetic**: Uses predefined schemas to generate structured data
|
|
164
|
+
**Synthetic** generates additional data similar to your existing dataset using inline strategies.
|
|
169
165
|
|
|
170
166
|
```python
|
|
171
|
-
#
|
|
172
|
-
more_customers = add.
|
|
167
|
+
# Extend existing data (learns from patterns)
|
|
168
|
+
more_customers = add.synthetic(customers, n_rows=1000)
|
|
173
169
|
|
|
174
170
|
# Create data from scratch with strategies
|
|
175
|
-
new_data = add.
|
|
171
|
+
new_data = add.synthetic("@new", n_rows=500, strategy={
|
|
176
172
|
'id': 'increment:start=1',
|
|
177
173
|
'name': 'choice:[John,Jane,Bob]',
|
|
178
174
|
'age': 'range:18-65'
|
|
179
175
|
})
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### 🤖 Text-Based Label Deduction
|
|
180
179
|
|
|
181
|
-
|
|
182
|
-
|
|
180
|
+
**Deduce** automatically fills in missing labels by learning from your existing labeled examples. Pure Python, no LLMs, offline-first.
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
# Deduce missing labels from text
|
|
184
|
+
tickets = pd.DataFrame({
|
|
185
|
+
"ticket_text": ["Cannot log in", "Billing question", "App crashes", "Need invoice"],
|
|
186
|
+
"category": ["Technical", "Billing", None, None]
|
|
187
|
+
})
|
|
188
|
+
|
|
189
|
+
# Automatically fill in missing categories
|
|
190
|
+
result = add.deduce(tickets, from_column="ticket_text", to_column="category")
|
|
191
|
+
|
|
192
|
+
# Use multiple columns for better accuracy
|
|
193
|
+
result = add.deduce(
|
|
194
|
+
df,
|
|
195
|
+
from_column=["title", "description"],
|
|
196
|
+
to_column="category"
|
|
197
|
+
)
|
|
183
198
|
```
|
|
184
199
|
|
|
185
200
|
## 🧪 Examples
|
|
@@ -197,7 +212,7 @@ customers = pd.DataFrame({
|
|
|
197
212
|
})
|
|
198
213
|
|
|
199
214
|
# Generate more customers
|
|
200
|
-
customers = add.
|
|
215
|
+
customers = add.synthetic(customers, n_rows=10000)
|
|
201
216
|
|
|
202
217
|
# Add customer tiers
|
|
203
218
|
tiers = pd.DataFrame({
|
|
@@ -223,7 +238,7 @@ strategy = {
|
|
|
223
238
|
'height_cm': 'range:150-200' # Height in cm
|
|
224
239
|
}
|
|
225
240
|
|
|
226
|
-
patients = add.
|
|
241
|
+
patients = add.synthetic("@new", n_rows=1000, strategy=strategy)
|
|
227
242
|
|
|
228
243
|
# Convert height to meters for expressions
|
|
229
244
|
patients['height_m'] = patients['height_cm'] / 100
|
|
@@ -238,19 +253,19 @@ print(result.correlations)
|
|
|
238
253
|
|
|
239
254
|
## 📚 Documentation
|
|
240
255
|
|
|
241
|
-
- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/
|
|
242
|
-
- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/
|
|
256
|
+
- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Detailed guides for each function
|
|
257
|
+
- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/documentation/)** - Complete expressions reference
|
|
243
258
|
|
|
244
259
|
## 📄 License
|
|
245
260
|
|
|
246
|
-
MIT License - see [LICENSE](
|
|
261
|
+
MIT License - see [LICENSE](LICENSE) file for details.
|
|
247
262
|
|
|
248
263
|
## 📞 Support
|
|
249
264
|
|
|
250
265
|
- **Issues**: [GitHub Issues](https://github.com/sekarkrishna/additory/issues)
|
|
251
|
-
- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/
|
|
266
|
+
- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/documentation/)
|
|
252
267
|
|
|
253
|
-
## 🗺️ v0.1.1 (
|
|
268
|
+
## 🗺️ v0.1.1 (January 2026)
|
|
254
269
|
- Enhanced documentation and tutorials
|
|
255
270
|
- Performance optimizations
|
|
256
271
|
- Additional expressions
|
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
from .dynamic_api import add as _api_instance
|
|
4
4
|
|
|
5
|
+
# Version information
|
|
6
|
+
__version__ = "0.1.0a4"
|
|
7
|
+
|
|
5
8
|
# Expose the API instance normally
|
|
6
9
|
add = _api_instance
|
|
7
10
|
|
|
@@ -12,4 +15,5 @@ def __getattr__(name):
|
|
|
12
15
|
|
|
13
16
|
__all__ = [
|
|
14
17
|
"add",
|
|
18
|
+
"__version__",
|
|
15
19
|
]
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Common Utilities Module
|
|
3
3
|
|
|
4
|
-
Shared functionality used by both
|
|
4
|
+
Shared functionality used by both synthetic and expressions modules:
|
|
5
5
|
- Distribution functions (normal, uniform, skewed, etc.)
|
|
6
6
|
- List file management (.list format)
|
|
7
7
|
- Pattern file management (.properties format)
|
|
8
8
|
- Fallback resolution logic
|
|
9
9
|
|
|
10
10
|
This module eliminates code duplication and provides consistent behavior
|
|
11
|
-
across
|
|
11
|
+
across synthetic and expression data generation.
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
from .distributions import (
|
|
@@ -180,11 +180,14 @@ def get_arrow_bridge():
|
|
|
180
180
|
- Use for all cross-backend conversions
|
|
181
181
|
- Handles pandas/polars/cuDF via Arrow
|
|
182
182
|
"""
|
|
183
|
-
from additory.core.backends.arrow_bridge import EnhancedArrowBridge
|
|
183
|
+
from additory.core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
|
|
184
184
|
|
|
185
185
|
# Singleton pattern
|
|
186
186
|
if not hasattr(get_arrow_bridge, '_instance'):
|
|
187
|
-
|
|
187
|
+
try:
|
|
188
|
+
get_arrow_bridge._instance = EnhancedArrowBridge()
|
|
189
|
+
except ArrowBridgeError:
|
|
190
|
+
get_arrow_bridge._instance = None
|
|
188
191
|
|
|
189
192
|
return get_arrow_bridge._instance
|
|
190
193
|
|
|
@@ -194,7 +197,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
194
197
|
Convert any dataframe to Polars via Arrow bridge.
|
|
195
198
|
|
|
196
199
|
This is the primary conversion function for the Polars-only architecture.
|
|
197
|
-
All operations (expressions,
|
|
200
|
+
All operations (expressions, synthetic, etc.) use this to convert input
|
|
198
201
|
dataframes to Polars for processing.
|
|
199
202
|
|
|
200
203
|
Args:
|
|
@@ -224,7 +227,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
224
227
|
)
|
|
225
228
|
|
|
226
229
|
# Fast path: already Polars
|
|
227
|
-
if isinstance(df, pl.DataFrame):
|
|
230
|
+
if HAS_POLARS and isinstance(df, pl.DataFrame):
|
|
228
231
|
return df
|
|
229
232
|
|
|
230
233
|
# Validate input
|
|
@@ -240,6 +243,13 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
240
243
|
# Convert via Arrow bridge
|
|
241
244
|
try:
|
|
242
245
|
bridge = get_arrow_bridge()
|
|
246
|
+
if bridge is None:
|
|
247
|
+
# Fallback: direct conversion for pandas
|
|
248
|
+
if backend_type == "pandas":
|
|
249
|
+
if isinstance(df, pd.DataFrame):
|
|
250
|
+
return pl.from_pandas(df)
|
|
251
|
+
raise RuntimeError("Arrow bridge not available and cannot convert non-pandas DataFrame")
|
|
252
|
+
|
|
243
253
|
arrow_table = bridge.to_arrow(df, backend_type)
|
|
244
254
|
pl_df = bridge.from_arrow(arrow_table, "polars")
|
|
245
255
|
return pl_df
|
|
@@ -309,6 +319,12 @@ def from_polars(pl_df: 'pl.DataFrame', target_backend: BackendType) -> Any:
|
|
|
309
319
|
# Convert via Arrow bridge
|
|
310
320
|
try:
|
|
311
321
|
bridge = get_arrow_bridge()
|
|
322
|
+
if bridge is None:
|
|
323
|
+
# Fallback: direct conversion for pandas
|
|
324
|
+
if target_backend == "pandas":
|
|
325
|
+
return pl_df.to_pandas()
|
|
326
|
+
raise RuntimeError("Arrow bridge not available and cannot convert to non-pandas DataFrame")
|
|
327
|
+
|
|
312
328
|
arrow_table = bridge.to_arrow(pl_df, "polars")
|
|
313
329
|
result_df = bridge.from_arrow(arrow_table, target_backend)
|
|
314
330
|
return result_df
|
|
@@ -8,8 +8,8 @@ loaded on-demand using the existing .add file parser.
|
|
|
8
8
|
Usage:
|
|
9
9
|
from additory.common.sample_data import get_sample_dataset
|
|
10
10
|
|
|
11
|
-
# For
|
|
12
|
-
df = get_sample_dataset("
|
|
11
|
+
# For synthetic
|
|
12
|
+
df = get_sample_dataset("synthetic", "sample")
|
|
13
13
|
|
|
14
14
|
# For expressions (future)
|
|
15
15
|
df = get_sample_dataset("expressions", "sample")
|
|
@@ -25,7 +25,7 @@ from additory.common.exceptions import ValidationError
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def get_sample_dataset(
|
|
28
|
-
module: str = "
|
|
28
|
+
module: str = "synthetic",
|
|
29
29
|
block: str = "sample",
|
|
30
30
|
dataset_type: str = "clean"
|
|
31
31
|
) -> pl.DataFrame:
|
|
@@ -33,12 +33,12 @@ def get_sample_dataset(
|
|
|
33
33
|
Load a sample dataset from .add files.
|
|
34
34
|
|
|
35
35
|
This function provides centralized access to sample datasets across
|
|
36
|
-
all additory modules (
|
|
36
|
+
all additory modules (synthetic, expressions, utilities). Sample datasets
|
|
37
37
|
are stored as .add files in the reference/ directory structure.
|
|
38
38
|
|
|
39
39
|
Args:
|
|
40
|
-
module: Module name ("
|
|
41
|
-
block: Block name within the .add file ("sample" for
|
|
40
|
+
module: Module name ("synthetic", "expressions", "utilities")
|
|
41
|
+
block: Block name within the .add file ("sample" for synthetic)
|
|
42
42
|
dataset_type: Type of sample data ("clean" or "unclean")
|
|
43
43
|
|
|
44
44
|
Returns:
|
|
@@ -48,8 +48,8 @@ def get_sample_dataset(
|
|
|
48
48
|
ValidationError: If module, block, or dataset_type not found
|
|
49
49
|
|
|
50
50
|
Examples:
|
|
51
|
-
>>> # Load
|
|
52
|
-
>>> df = get_sample_dataset("
|
|
51
|
+
>>> # Load synthetic sample dataset
|
|
52
|
+
>>> df = get_sample_dataset("synthetic", "sample")
|
|
53
53
|
>>> print(df.shape)
|
|
54
54
|
(50, 10)
|
|
55
55
|
|
|
@@ -57,7 +57,7 @@ def get_sample_dataset(
|
|
|
57
57
|
>>> df = get_sample_dataset("expressions", "sample", "clean")
|
|
58
58
|
>>> df_unclean = get_sample_dataset("expressions", "sample", "unclean")
|
|
59
59
|
|
|
60
|
-
Sample Dataset Structure (
|
|
60
|
+
Sample Dataset Structure (synthetic):
|
|
61
61
|
- id: Sequential numeric IDs (1-50)
|
|
62
62
|
- emp_id: Employee IDs with pattern (EMP_001 - EMP_050)
|
|
63
63
|
- order_id: Order IDs with different padding (ORD_0001 - ORD_0050)
|
|
@@ -72,8 +72,8 @@ def get_sample_dataset(
|
|
|
72
72
|
# Construct path to .add file
|
|
73
73
|
base_path = Path(__file__).parent.parent.parent / "reference"
|
|
74
74
|
|
|
75
|
-
if module == "
|
|
76
|
-
add_file_path = base_path / "
|
|
75
|
+
if module == "synthetic":
|
|
76
|
+
add_file_path = base_path / "synthetic_definitions" / f"{block}_0.1.add"
|
|
77
77
|
elif module == "expressions":
|
|
78
78
|
add_file_path = base_path / "expressions_definitions" / f"{block}_0.1.add"
|
|
79
79
|
elif module == "utilities":
|
|
@@ -81,7 +81,7 @@ def get_sample_dataset(
|
|
|
81
81
|
else:
|
|
82
82
|
raise ValidationError(
|
|
83
83
|
f"Unknown module '{module}'. "
|
|
84
|
-
f"Valid modules:
|
|
84
|
+
f"Valid modules: synthetic, expressions, utilities"
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
# Check if file exists
|
|
@@ -141,7 +141,7 @@ def list_available_samples() -> dict:
|
|
|
141
141
|
>>> samples = list_available_samples()
|
|
142
142
|
>>> print(samples)
|
|
143
143
|
{
|
|
144
|
-
'
|
|
144
|
+
'synthetic': ['sample'],
|
|
145
145
|
'expressions': ['sample'],
|
|
146
146
|
'utilities': []
|
|
147
147
|
}
|
|
@@ -149,15 +149,15 @@ def list_available_samples() -> dict:
|
|
|
149
149
|
base_path = Path(__file__).parent.parent.parent / "reference"
|
|
150
150
|
available = {}
|
|
151
151
|
|
|
152
|
-
# Check
|
|
153
|
-
|
|
154
|
-
if
|
|
155
|
-
available['
|
|
152
|
+
# Check synthetic
|
|
153
|
+
synthetic_path = base_path / "synthetic_definitions"
|
|
154
|
+
if synthetic_path.exists():
|
|
155
|
+
available['synthetic'] = [
|
|
156
156
|
f.stem.rsplit('_', 1)[0] # Remove version suffix
|
|
157
|
-
for f in
|
|
157
|
+
for f in synthetic_path.glob("*.add")
|
|
158
158
|
]
|
|
159
159
|
else:
|
|
160
|
-
available['
|
|
160
|
+
available['synthetic'] = []
|
|
161
161
|
|
|
162
162
|
# Check expressions
|
|
163
163
|
expressions_path = base_path / "expressions_definitions"
|
|
@@ -16,6 +16,13 @@ try:
|
|
|
16
16
|
except ImportError as e:
|
|
17
17
|
ARROW_AVAILABLE = False
|
|
18
18
|
IMPORT_ERROR = str(e)
|
|
19
|
+
# Create dummy classes for type annotations
|
|
20
|
+
class pa:
|
|
21
|
+
Table = Any
|
|
22
|
+
class pl:
|
|
23
|
+
DataFrame = Any
|
|
24
|
+
class pd:
|
|
25
|
+
DataFrame = Any
|
|
19
26
|
|
|
20
27
|
from ..logging import log_info, log_warning
|
|
21
28
|
from .cudf_bridge import get_cudf_bridge
|
|
@@ -329,14 +329,14 @@ def set_custom_formula_path(path):
|
|
|
329
329
|
|
|
330
330
|
# backend preference setting
|
|
331
331
|
|
|
332
|
-
_backend_preference: str
|
|
332
|
+
_backend_preference: Optional[str] = None # "cpu", "gpu", or None
|
|
333
333
|
|
|
334
|
-
def set_backend_preference(mode: str
|
|
334
|
+
def set_backend_preference(mode: Optional[str]):
|
|
335
335
|
global _backend_preference
|
|
336
336
|
if mode not in (None, "cpu", "gpu"):
|
|
337
337
|
raise ValueError("backend must be 'cpu', 'gpu', or None")
|
|
338
338
|
_backend_preference = mode
|
|
339
339
|
|
|
340
|
-
def get_backend_preference() -> str
|
|
340
|
+
def get_backend_preference() -> Optional[str]:
|
|
341
341
|
return _backend_preference
|
|
342
342
|
|