additory 0.1.0a1__tar.gz → 0.1.0a3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {additory-0.1.0a1 → additory-0.1.0a3}/PKG-INFO +12 -17
- {additory-0.1.0a1 → additory-0.1.0a3}/README.md +7 -15
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/__init__.py +4 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/__init__.py +2 -2
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/backend.py +20 -4
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/distributions.py +1 -1
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/sample_data.py +19 -19
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/backends/arrow_bridge.py +7 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/polars_expression_engine.py +66 -16
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/dynamic_api.py +42 -46
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/expressions/proxy.py +4 -1
- additory-0.1.0a3/additory/synthetic/__init__.py +13 -0
- additory-0.1.0a3/additory/synthetic/column_name_resolver.py +149 -0
- {additory-0.1.0a1/additory/augment → additory-0.1.0a3/additory/synthetic}/distributions.py +2 -2
- {additory-0.1.0a1/additory/augment → additory-0.1.0a3/additory/synthetic}/forecast.py +1 -1
- additory-0.1.0a3/additory/synthetic/linked_list_parser.py +415 -0
- additory-0.1.0a3/additory/synthetic/namespace_lookup.py +129 -0
- {additory-0.1.0a1/additory/augment → additory-0.1.0a3/additory/synthetic}/smote.py +1 -1
- {additory-0.1.0a1/additory/augment → additory-0.1.0a3/additory/synthetic}/strategies.py +11 -44
- additory-0.1.0a1/additory/augment/augmentor.py → additory-0.1.0a3/additory/synthetic/synthesizer.py +75 -15
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/units.py +4 -1
- {additory-0.1.0a1 → additory-0.1.0a3}/additory.egg-info/SOURCES.txt +9 -37
- {additory-0.1.0a1 → additory-0.1.0a3}/documentation/V0.1.0/add_onehotencoding_function.html +43 -40
- additory-0.1.0a1/documentation/V0.1.0/add_augment_function.html → additory-0.1.0a3/documentation/V0.1.0/add_synthetic_function.html +140 -147
- {additory-0.1.0a1 → additory-0.1.0a3}/documentation/V0.1.0/expressions.html +55 -79
- {additory-0.1.0a1 → additory-0.1.0a3}/pyproject.toml +6 -3
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/age_category_0.1.add +23 -5
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/blood_pressure_category_0.1.add +24 -5
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/bmi2_0.1.add +3 -3
- additory-0.1.0a3/reference/expressions_definitions/bmi3_0.1.add +41 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/bmi_0.1.add +3 -3
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/bmr_0.1.add +13 -5
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/body_fat_percentage_0.1.add +13 -5
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/bsa_0.1.add +11 -5
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/cholesterol_ratio_0.1.add +11 -5
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/fitness_score_0.1.add +24 -5
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/ideal_body_weight_0.1.add +14 -5
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/waist_hip_ratio_0.1.add +11 -5
- additory-0.1.0a1/additory/augment/__init__.py +0 -24
- additory-0.1.0a1/additory/augment/builtin_lists.py +0 -430
- additory-0.1.0a1/additory/augment/list_registry.py +0 -177
- additory-0.1.0a1/additory/synthetic/__init__.py +0 -101
- additory-0.1.0a1/additory/synthetic/api.py +0 -220
- additory-0.1.0a1/additory/synthetic/common_integration.py +0 -314
- additory-0.1.0a1/additory/synthetic/config.py +0 -262
- additory-0.1.0a1/additory/synthetic/engines.py +0 -529
- additory-0.1.0a1/additory/synthetic/exceptions.py +0 -180
- additory-0.1.0a1/additory/synthetic/file_managers.py +0 -518
- additory-0.1.0a1/additory/synthetic/generator.py +0 -702
- additory-0.1.0a1/additory/synthetic/generator_parser.py +0 -68
- additory-0.1.0a1/additory/synthetic/integration.py +0 -319
- additory-0.1.0a1/additory/synthetic/models.py +0 -241
- additory-0.1.0a1/additory/synthetic/pattern_resolver.py +0 -573
- additory-0.1.0a1/additory/synthetic/performance.py +0 -469
- additory-0.1.0a1/additory/synthetic/polars_integration.py +0 -464
- additory-0.1.0a1/additory/synthetic/proxy.py +0 -60
- additory-0.1.0a1/additory/synthetic/schema_parser.py +0 -685
- additory-0.1.0a1/additory/synthetic/validator.py +0 -553
- additory-0.1.0a1/documentation/V0.1.0/add_synth_function.html +0 -664
- additory-0.1.0a1/reference/expressions_definitions/bmi3_0.1.add +0 -26
- additory-0.1.0a1/reference/schema_definitions/ca.list +0 -41
- additory-0.1.0a1/reference/schema_definitions/ca.properties +0 -14
- additory-0.1.0a1/reference/schema_definitions/eu.list +0 -41
- additory-0.1.0a1/reference/schema_definitions/eu.properties +0 -13
- additory-0.1.0a1/reference/schema_definitions/finance.list +0 -31
- additory-0.1.0a1/reference/schema_definitions/finance.properties +0 -18
- additory-0.1.0a1/reference/schema_definitions/global.list +0 -57
- additory-0.1.0a1/reference/schema_definitions/global.properties +0 -11
- additory-0.1.0a1/reference/schema_definitions/healthcare.list +0 -28
- additory-0.1.0a1/reference/schema_definitions/us.list +0 -41
- additory-0.1.0a1/reference/schema_definitions/us.properties +0 -14
- {additory-0.1.0a1 → additory-0.1.0a3}/LICENSE +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/analysis/__init__.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/analysis/cardinality.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/analysis/correlations.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/analysis/distributions.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/analysis/quality.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/analysis/scan.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/column_utils.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/exceptions.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/lists.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/patterns.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/resolver.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/common/validation.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/__init__.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/ast_builder.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/backends/__init__.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/backends/cudf_bridge.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/column_positioning.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/compiler_polars.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/config.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/enhanced_cache_manager.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/enhanced_matchers.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/enhanced_version_manager.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/executor.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/integrity_manager.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/loader.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/logging.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/memory_manager.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/namespace_manager.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/parser.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/registry.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/sample_data_manager.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/user_namespace.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/core/validator.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/expressions/__init__.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/expressions/engine.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/expressions/parser.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/expressions/registry.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/expressions/samples.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/__init__.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/encoding.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/games.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/keys.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/lookup.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/matchers.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/resolvers.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/settings.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/additory/utilities/validators.py +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/documentation/V0.1.0/add_harmonize_units_function.html +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/documentation/V0.1.0/add_scan_function.html +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/documentation/V0.1.0/add_to_function.html +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/reference/expressions_definitions/manifest.json +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/setup.cfg +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/user_expressions/bmi1_0.1.add +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/user_expressions/bmi2_0.1.add +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/user_expressions/bmi3_0.1.add +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/user_expressions/bmi_0.1.add +0 -0
- {additory-0.1.0a1 → additory-0.1.0a3}/user_expressions/manifest.json +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: additory
|
|
3
|
-
Version: 0.1.0a1
|
|
4
|
-
Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data
|
|
3
|
+
Version: 0.1.0a3
|
|
4
|
+
Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, and synthetic data generation support.
|
|
5
5
|
Author: Krishnamoorthy Sankaran
|
|
6
6
|
License: MIT
|
|
7
7
|
Project-URL: homepage, https://github.com/sekarkrishna/additory
|
|
@@ -13,11 +13,14 @@ Description-Content-Type: text/markdown
|
|
|
13
13
|
License-File: LICENSE
|
|
14
14
|
Requires-Dist: pandas>=1.5
|
|
15
15
|
Requires-Dist: polars>=0.20
|
|
16
|
+
Requires-Dist: pyarrow>=10.0
|
|
16
17
|
Requires-Dist: pyyaml>=6.0
|
|
17
18
|
Requires-Dist: requests>=2.31
|
|
18
19
|
Requires-Dist: toml>=0.10
|
|
19
20
|
Requires-Dist: scipy>=1.9
|
|
20
21
|
Requires-Dist: numpy>=1.21
|
|
22
|
+
Requires-Dist: packaging>=21.0
|
|
23
|
+
Requires-Dist: psutil>=5.8
|
|
21
24
|
Provides-Extra: gpu
|
|
22
25
|
Requires-Dist: cudf>=24.0; extra == "gpu"
|
|
23
26
|
Provides-Extra: dev
|
|
@@ -32,11 +35,11 @@ Dynamic: license-file
|
|
|
32
35
|
|
|
33
36
|
# Additory
|
|
34
37
|
|
|
35
|
-
**A semantic, extensible dataframe transformation engine with expressions, lookup,
|
|
38
|
+
**A semantic, extensible dataframe transformation engine with expressions, lookup, and augmentation support.**
|
|
36
39
|
|
|
37
40
|
[](https://www.python.org/downloads/)
|
|
38
41
|
[](https://opensource.org/licenses/MIT)
|
|
39
|
-
[](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
|
|
40
43
|
|
|
41
44
|
**Author:** Krishnamoorthy Sankaran
|
|
42
45
|
|
|
@@ -49,17 +52,17 @@ Dynamic: license-file
|
|
|
49
52
|
## 📦 Installation
|
|
50
53
|
|
|
51
54
|
```bash
|
|
52
|
-
pip install additory==0.1.0a1
|
|
55
|
+
pip install additory==0.1.0a2
|
|
53
56
|
```
|
|
54
57
|
|
|
55
58
|
**Optional GPU support:**
|
|
56
59
|
```bash
|
|
57
|
-
pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
|
|
60
|
+
pip install additory[gpu]==0.1.0a2 # Includes cuDF for GPU acceleration
|
|
58
61
|
```
|
|
59
62
|
|
|
60
63
|
**Development installation:**
|
|
61
64
|
```bash
|
|
62
|
-
pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
65
|
+
pip install additory[dev]==0.1.0a2 # Includes testing and development tools
|
|
63
66
|
```
|
|
64
67
|
|
|
65
68
|
## 🎯 Core Functions
|
|
@@ -68,7 +71,6 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
|
68
71
|
|----------|---------|---------|
|
|
69
72
|
| `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
|
|
70
73
|
| `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
|
|
71
|
-
| `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
|
|
72
74
|
| `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
|
|
73
75
|
|
|
74
76
|
## 🧬 Available Expressions
|
|
@@ -191,13 +193,9 @@ patients_with_bsa = add.bsa(patients)
|
|
|
191
193
|
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
192
194
|
```
|
|
193
195
|
|
|
194
|
-
### 🔄 Augment
|
|
196
|
+
### 🔄 Augment Data Generation
|
|
195
197
|
|
|
196
|
-
**Augment** generates
|
|
197
|
-
|
|
198
|
-
**Key Differences:**
|
|
199
|
-
- **Augment**: Learns patterns from existing data to create similar rows
|
|
200
|
-
- **Synthetic**: Uses predefined schemas to generate structured data
|
|
198
|
+
**Augment** generates additional data similar to your existing dataset using inline strategies.
|
|
201
199
|
|
|
202
200
|
```python
|
|
203
201
|
# Augment existing data (learns from patterns)
|
|
@@ -209,9 +207,6 @@ new_data = add.augment("@new", n_rows=500, strategy={
|
|
|
209
207
|
'name': 'choice:[John,Jane,Bob]',
|
|
210
208
|
'age': 'range:18-65'
|
|
211
209
|
})
|
|
212
|
-
|
|
213
|
-
# Generate from schema file (structured approach)
|
|
214
|
-
customers = add.synth("customer_schema.toml", rows=10000)
|
|
215
210
|
```
|
|
216
211
|
|
|
217
212
|
## 🧪 Examples
|
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
# Additory
|
|
2
2
|
|
|
3
|
-
**A semantic, extensible dataframe transformation engine with expressions, lookup,
|
|
3
|
+
**A semantic, extensible dataframe transformation engine with expressions, lookup, and augmentation support.**
|
|
4
4
|
|
|
5
5
|
[](https://www.python.org/downloads/)
|
|
6
6
|
[](https://opensource.org/licenses/MIT)
|
|
7
|
-
[](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
|
|
8
8
|
|
|
9
9
|
**Author:** Krishnamoorthy Sankaran
|
|
10
10
|
|
|
@@ -17,17 +17,17 @@
|
|
|
17
17
|
## 📦 Installation
|
|
18
18
|
|
|
19
19
|
```bash
|
|
20
|
-
pip install additory==0.1.0a1
|
|
20
|
+
pip install additory==0.1.0a2
|
|
21
21
|
```
|
|
22
22
|
|
|
23
23
|
**Optional GPU support:**
|
|
24
24
|
```bash
|
|
25
|
-
pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
|
|
25
|
+
pip install additory[gpu]==0.1.0a2 # Includes cuDF for GPU acceleration
|
|
26
26
|
```
|
|
27
27
|
|
|
28
28
|
**Development installation:**
|
|
29
29
|
```bash
|
|
30
|
-
pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
30
|
+
pip install additory[dev]==0.1.0a2 # Includes testing and development tools
|
|
31
31
|
```
|
|
32
32
|
|
|
33
33
|
## 🎯 Core Functions
|
|
@@ -36,7 +36,6 @@ pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
|
36
36
|
|----------|---------|---------|
|
|
37
37
|
| `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
|
|
38
38
|
| `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
|
|
39
|
-
| `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
|
|
40
39
|
| `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
|
|
41
40
|
|
|
42
41
|
## 🧬 Available Expressions
|
|
@@ -159,13 +158,9 @@ patients_with_bsa = add.bsa(patients)
|
|
|
159
158
|
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
160
159
|
```
|
|
161
160
|
|
|
162
|
-
### 🔄 Augment
|
|
161
|
+
### 🔄 Augment Data Generation
|
|
163
162
|
|
|
164
|
-
**Augment** generates
|
|
165
|
-
|
|
166
|
-
**Key Differences:**
|
|
167
|
-
- **Augment**: Learns patterns from existing data to create similar rows
|
|
168
|
-
- **Synthetic**: Uses predefined schemas to generate structured data
|
|
163
|
+
**Augment** generates additional data similar to your existing dataset using inline strategies.
|
|
169
164
|
|
|
170
165
|
```python
|
|
171
166
|
# Augment existing data (learns from patterns)
|
|
@@ -177,9 +172,6 @@ new_data = add.augment("@new", n_rows=500, strategy={
|
|
|
177
172
|
'name': 'choice:[John,Jane,Bob]',
|
|
178
173
|
'age': 'range:18-65'
|
|
179
174
|
})
|
|
180
|
-
|
|
181
|
-
# Generate from schema file (structured approach)
|
|
182
|
-
customers = add.synth("customer_schema.toml", rows=10000)
|
|
183
175
|
```
|
|
184
176
|
|
|
185
177
|
## 🧪 Examples
|
|
@@ -2,6 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
from .dynamic_api import add as _api_instance
|
|
4
4
|
|
|
5
|
+
# Version information
|
|
6
|
+
__version__ = "0.1.0a3"
|
|
7
|
+
|
|
5
8
|
# Expose the API instance normally
|
|
6
9
|
add = _api_instance
|
|
7
10
|
|
|
@@ -12,4 +15,5 @@ def __getattr__(name):
|
|
|
12
15
|
|
|
13
16
|
__all__ = [
|
|
14
17
|
"add",
|
|
18
|
+
"__version__",
|
|
15
19
|
]
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Common Utilities Module
|
|
3
3
|
|
|
4
|
-
Shared functionality used by both
|
|
4
|
+
Shared functionality used by both synthetic and expressions modules:
|
|
5
5
|
- Distribution functions (normal, uniform, skewed, etc.)
|
|
6
6
|
- List file management (.list format)
|
|
7
7
|
- Pattern file management (.properties format)
|
|
8
8
|
- Fallback resolution logic
|
|
9
9
|
|
|
10
10
|
This module eliminates code duplication and provides consistent behavior
|
|
11
|
-
across
|
|
11
|
+
across synthetic and expression data generation.
|
|
12
12
|
"""
|
|
13
13
|
|
|
14
14
|
from .distributions import (
|
|
@@ -180,11 +180,14 @@ def get_arrow_bridge():
|
|
|
180
180
|
- Use for all cross-backend conversions
|
|
181
181
|
- Handles pandas/polars/cuDF via Arrow
|
|
182
182
|
"""
|
|
183
|
-
from additory.core.backends.arrow_bridge import EnhancedArrowBridge
|
|
183
|
+
from additory.core.backends.arrow_bridge import EnhancedArrowBridge, ArrowBridgeError
|
|
184
184
|
|
|
185
185
|
# Singleton pattern
|
|
186
186
|
if not hasattr(get_arrow_bridge, '_instance'):
|
|
187
|
-
|
|
187
|
+
try:
|
|
188
|
+
get_arrow_bridge._instance = EnhancedArrowBridge()
|
|
189
|
+
except ArrowBridgeError:
|
|
190
|
+
get_arrow_bridge._instance = None
|
|
188
191
|
|
|
189
192
|
return get_arrow_bridge._instance
|
|
190
193
|
|
|
@@ -194,7 +197,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
194
197
|
Convert any dataframe to Polars via Arrow bridge.
|
|
195
198
|
|
|
196
199
|
This is the primary conversion function for the Polars-only architecture.
|
|
197
|
-
All operations (expressions,
|
|
200
|
+
All operations (expressions, synthetic, etc.) use this to convert input
|
|
198
201
|
dataframes to Polars for processing.
|
|
199
202
|
|
|
200
203
|
Args:
|
|
@@ -224,7 +227,7 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
224
227
|
)
|
|
225
228
|
|
|
226
229
|
# Fast path: already Polars
|
|
227
|
-
if isinstance(df, pl.DataFrame):
|
|
230
|
+
if HAS_POLARS and isinstance(df, pl.DataFrame):
|
|
228
231
|
return df
|
|
229
232
|
|
|
230
233
|
# Validate input
|
|
@@ -240,6 +243,13 @@ def to_polars(df: Any, backend_type: BackendType = None) -> 'pl.DataFrame':
|
|
|
240
243
|
# Convert via Arrow bridge
|
|
241
244
|
try:
|
|
242
245
|
bridge = get_arrow_bridge()
|
|
246
|
+
if bridge is None:
|
|
247
|
+
# Fallback: direct conversion for pandas
|
|
248
|
+
if backend_type == "pandas":
|
|
249
|
+
if isinstance(df, pd.DataFrame):
|
|
250
|
+
return pl.from_pandas(df)
|
|
251
|
+
raise RuntimeError("Arrow bridge not available and cannot convert non-pandas DataFrame")
|
|
252
|
+
|
|
243
253
|
arrow_table = bridge.to_arrow(df, backend_type)
|
|
244
254
|
pl_df = bridge.from_arrow(arrow_table, "polars")
|
|
245
255
|
return pl_df
|
|
@@ -309,6 +319,12 @@ def from_polars(pl_df: 'pl.DataFrame', target_backend: BackendType) -> Any:
|
|
|
309
319
|
# Convert via Arrow bridge
|
|
310
320
|
try:
|
|
311
321
|
bridge = get_arrow_bridge()
|
|
322
|
+
if bridge is None:
|
|
323
|
+
# Fallback: direct conversion for pandas
|
|
324
|
+
if target_backend == "pandas":
|
|
325
|
+
return pl_df.to_pandas()
|
|
326
|
+
raise RuntimeError("Arrow bridge not available and cannot convert to non-pandas DataFrame")
|
|
327
|
+
|
|
312
328
|
arrow_table = bridge.to_arrow(pl_df, "polars")
|
|
313
329
|
result_df = bridge.from_arrow(arrow_table, target_backend)
|
|
314
330
|
return result_df
|
|
@@ -8,8 +8,8 @@ loaded on-demand using the existing .add file parser.
|
|
|
8
8
|
Usage:
|
|
9
9
|
from additory.common.sample_data import get_sample_dataset
|
|
10
10
|
|
|
11
|
-
# For
|
|
12
|
-
df = get_sample_dataset("
|
|
11
|
+
# For synthetic
|
|
12
|
+
df = get_sample_dataset("synthetic", "sample")
|
|
13
13
|
|
|
14
14
|
# For expressions (future)
|
|
15
15
|
df = get_sample_dataset("expressions", "sample")
|
|
@@ -25,7 +25,7 @@ from additory.common.exceptions import ValidationError
|
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
def get_sample_dataset(
|
|
28
|
-
module: str = "
|
|
28
|
+
module: str = "synthetic",
|
|
29
29
|
block: str = "sample",
|
|
30
30
|
dataset_type: str = "clean"
|
|
31
31
|
) -> pl.DataFrame:
|
|
@@ -33,12 +33,12 @@ def get_sample_dataset(
|
|
|
33
33
|
Load a sample dataset from .add files.
|
|
34
34
|
|
|
35
35
|
This function provides centralized access to sample datasets across
|
|
36
|
-
all additory modules (
|
|
36
|
+
all additory modules (synthetic, expressions, utilities). Sample datasets
|
|
37
37
|
are stored as .add files in the reference/ directory structure.
|
|
38
38
|
|
|
39
39
|
Args:
|
|
40
|
-
module: Module name ("
|
|
41
|
-
block: Block name within the .add file ("sample" for
|
|
40
|
+
module: Module name ("synthetic", "expressions", "utilities")
|
|
41
|
+
block: Block name within the .add file ("sample" for synthetic)
|
|
42
42
|
dataset_type: Type of sample data ("clean" or "unclean")
|
|
43
43
|
|
|
44
44
|
Returns:
|
|
@@ -48,8 +48,8 @@ def get_sample_dataset(
|
|
|
48
48
|
ValidationError: If module, block, or dataset_type not found
|
|
49
49
|
|
|
50
50
|
Examples:
|
|
51
|
-
>>> # Load
|
|
52
|
-
>>> df = get_sample_dataset("
|
|
51
|
+
>>> # Load synthetic sample dataset
|
|
52
|
+
>>> df = get_sample_dataset("synthetic", "sample")
|
|
53
53
|
>>> print(df.shape)
|
|
54
54
|
(50, 10)
|
|
55
55
|
|
|
@@ -57,7 +57,7 @@ def get_sample_dataset(
|
|
|
57
57
|
>>> df = get_sample_dataset("expressions", "sample", "clean")
|
|
58
58
|
>>> df_unclean = get_sample_dataset("expressions", "sample", "unclean")
|
|
59
59
|
|
|
60
|
-
Sample Dataset Structure (
|
|
60
|
+
Sample Dataset Structure (synthetic):
|
|
61
61
|
- id: Sequential numeric IDs (1-50)
|
|
62
62
|
- emp_id: Employee IDs with pattern (EMP_001 - EMP_050)
|
|
63
63
|
- order_id: Order IDs with different padding (ORD_0001 - ORD_0050)
|
|
@@ -72,8 +72,8 @@ def get_sample_dataset(
|
|
|
72
72
|
# Construct path to .add file
|
|
73
73
|
base_path = Path(__file__).parent.parent.parent / "reference"
|
|
74
74
|
|
|
75
|
-
if module == "
|
|
76
|
-
add_file_path = base_path / "
|
|
75
|
+
if module == "synthetic":
|
|
76
|
+
add_file_path = base_path / "synthetic_definitions" / f"{block}_0.1.add"
|
|
77
77
|
elif module == "expressions":
|
|
78
78
|
add_file_path = base_path / "expressions_definitions" / f"{block}_0.1.add"
|
|
79
79
|
elif module == "utilities":
|
|
@@ -81,7 +81,7 @@ def get_sample_dataset(
|
|
|
81
81
|
else:
|
|
82
82
|
raise ValidationError(
|
|
83
83
|
f"Unknown module '{module}'. "
|
|
84
|
-
f"Valid modules:
|
|
84
|
+
f"Valid modules: synthetic, expressions, utilities"
|
|
85
85
|
)
|
|
86
86
|
|
|
87
87
|
# Check if file exists
|
|
@@ -141,7 +141,7 @@ def list_available_samples() -> dict:
|
|
|
141
141
|
>>> samples = list_available_samples()
|
|
142
142
|
>>> print(samples)
|
|
143
143
|
{
|
|
144
|
-
'
|
|
144
|
+
'synthetic': ['sample'],
|
|
145
145
|
'expressions': ['sample'],
|
|
146
146
|
'utilities': []
|
|
147
147
|
}
|
|
@@ -149,15 +149,15 @@ def list_available_samples() -> dict:
|
|
|
149
149
|
base_path = Path(__file__).parent.parent.parent / "reference"
|
|
150
150
|
available = {}
|
|
151
151
|
|
|
152
|
-
# Check
|
|
153
|
-
|
|
154
|
-
if
|
|
155
|
-
available['
|
|
152
|
+
# Check synthetic
|
|
153
|
+
synthetic_path = base_path / "synthetic_definitions"
|
|
154
|
+
if synthetic_path.exists():
|
|
155
|
+
available['synthetic'] = [
|
|
156
156
|
f.stem.rsplit('_', 1)[0] # Remove version suffix
|
|
157
|
-
for f in
|
|
157
|
+
for f in synthetic_path.glob("*.add")
|
|
158
158
|
]
|
|
159
159
|
else:
|
|
160
|
-
available['
|
|
160
|
+
available['synthetic'] = []
|
|
161
161
|
|
|
162
162
|
# Check expressions
|
|
163
163
|
expressions_path = base_path / "expressions_definitions"
|
|
@@ -16,6 +16,13 @@ try:
|
|
|
16
16
|
except ImportError as e:
|
|
17
17
|
ARROW_AVAILABLE = False
|
|
18
18
|
IMPORT_ERROR = str(e)
|
|
19
|
+
# Create dummy classes for type annotations
|
|
20
|
+
class pa:
|
|
21
|
+
Table = Any
|
|
22
|
+
class pl:
|
|
23
|
+
DataFrame = Any
|
|
24
|
+
class pd:
|
|
25
|
+
DataFrame = Any
|
|
19
26
|
|
|
20
27
|
from ..logging import log_info, log_warning
|
|
21
28
|
from .cudf_bridge import get_cudf_bridge
|
|
@@ -32,7 +32,10 @@ class PolarsExpressionEngine:
|
|
|
32
32
|
"""Exclusive Polars-based expression processing engine"""
|
|
33
33
|
|
|
34
34
|
def __init__(self):
|
|
35
|
-
|
|
35
|
+
try:
|
|
36
|
+
self.arrow_bridge = EnhancedArrowBridge()
|
|
37
|
+
except ArrowBridgeError:
|
|
38
|
+
self.arrow_bridge = None
|
|
36
39
|
self.execution_stats = {
|
|
37
40
|
"total_executions": 0,
|
|
38
41
|
"total_time_ms": 0.0,
|
|
@@ -68,14 +71,28 @@ class PolarsExpressionEngine:
|
|
|
68
71
|
try:
|
|
69
72
|
# Auto-detect backend if not specified
|
|
70
73
|
if backend_type is None:
|
|
71
|
-
|
|
74
|
+
if self.arrow_bridge:
|
|
75
|
+
backend_type = self.arrow_bridge.detect_backend(df)
|
|
76
|
+
else:
|
|
77
|
+
backend_type = "pandas" # fallback
|
|
72
78
|
|
|
73
79
|
# Get memory usage before processing
|
|
74
|
-
|
|
80
|
+
if self.arrow_bridge:
|
|
81
|
+
memory_before = self.arrow_bridge._get_memory_usage_mb()
|
|
82
|
+
else:
|
|
83
|
+
memory_before = 0
|
|
75
84
|
|
|
76
85
|
# 1. Convert input to Arrow
|
|
77
86
|
log_info(f"[polars_engine] Converting {backend_type} to Arrow")
|
|
78
|
-
|
|
87
|
+
if self.arrow_bridge:
|
|
88
|
+
arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
|
|
89
|
+
else:
|
|
90
|
+
# Fallback: assume pandas and convert directly
|
|
91
|
+
import pandas as pd
|
|
92
|
+
if isinstance(df, pd.DataFrame):
|
|
93
|
+
arrow_table = pl.from_pandas(df).to_arrow()
|
|
94
|
+
else:
|
|
95
|
+
raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
|
|
79
96
|
|
|
80
97
|
# 2. Convert Arrow to Polars
|
|
81
98
|
log_info("[polars_engine] Converting Arrow to Polars")
|
|
@@ -93,11 +110,18 @@ class PolarsExpressionEngine:
|
|
|
93
110
|
|
|
94
111
|
# 5. Convert to original backend format
|
|
95
112
|
log_info(f"[polars_engine] Converting Arrow to {backend_type}")
|
|
96
|
-
|
|
113
|
+
if self.arrow_bridge:
|
|
114
|
+
final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
|
|
115
|
+
else:
|
|
116
|
+
# Fallback: convert back to pandas
|
|
117
|
+
final_result = pl.from_arrow(result_arrow).to_pandas()
|
|
97
118
|
|
|
98
119
|
# Calculate execution statistics
|
|
99
120
|
execution_time = (datetime.now() - start_time).total_seconds() * 1000
|
|
100
|
-
|
|
121
|
+
if self.arrow_bridge:
|
|
122
|
+
memory_after = self.arrow_bridge._get_memory_usage_mb()
|
|
123
|
+
else:
|
|
124
|
+
memory_after = 0
|
|
101
125
|
memory_used = max(0, memory_after - memory_before)
|
|
102
126
|
|
|
103
127
|
# Update global statistics
|
|
@@ -122,7 +146,8 @@ class PolarsExpressionEngine:
|
|
|
122
146
|
|
|
123
147
|
finally:
|
|
124
148
|
# 6. Always cleanup Arrow memory
|
|
125
|
-
self.arrow_bridge
|
|
149
|
+
if self.arrow_bridge:
|
|
150
|
+
self.arrow_bridge.cleanup_arrow_memory()
|
|
126
151
|
|
|
127
152
|
def _execute_polars_expression(self, polars_df: pl.DataFrame,
|
|
128
153
|
expression: str, output_column: str) -> pl.DataFrame:
|
|
@@ -381,14 +406,28 @@ class PolarsExpressionEngine:
|
|
|
381
406
|
try:
|
|
382
407
|
# Auto-detect backend if not specified
|
|
383
408
|
if backend_type is None:
|
|
384
|
-
|
|
409
|
+
if self.arrow_bridge:
|
|
410
|
+
backend_type = self.arrow_bridge.detect_backend(df)
|
|
411
|
+
else:
|
|
412
|
+
backend_type = "pandas"
|
|
385
413
|
|
|
386
414
|
# Get memory usage before processing
|
|
387
|
-
|
|
415
|
+
if self.arrow_bridge:
|
|
416
|
+
memory_before = self.arrow_bridge._get_memory_usage_mb()
|
|
417
|
+
else:
|
|
418
|
+
memory_before = 0
|
|
388
419
|
|
|
389
420
|
# Convert to Polars via Arrow
|
|
390
|
-
|
|
391
|
-
|
|
421
|
+
if self.arrow_bridge:
|
|
422
|
+
arrow_table = self.arrow_bridge.to_arrow(df, backend_type)
|
|
423
|
+
polars_df = pl.from_arrow(arrow_table)
|
|
424
|
+
else:
|
|
425
|
+
# Fallback: assume pandas
|
|
426
|
+
import pandas as pd
|
|
427
|
+
if isinstance(df, pd.DataFrame):
|
|
428
|
+
polars_df = pl.from_pandas(df)
|
|
429
|
+
else:
|
|
430
|
+
raise RuntimeError("Arrow bridge not available and input is not pandas DataFrame")
|
|
392
431
|
|
|
393
432
|
# Execute using AST
|
|
394
433
|
polars_expr = self._ast_to_polars_expr(ast_tree)
|
|
@@ -396,11 +435,17 @@ class PolarsExpressionEngine:
|
|
|
396
435
|
|
|
397
436
|
# Convert back to original format
|
|
398
437
|
result_arrow = result_df.to_arrow()
|
|
399
|
-
|
|
438
|
+
if self.arrow_bridge:
|
|
439
|
+
final_result = self.arrow_bridge.from_arrow(result_arrow, backend_type)
|
|
440
|
+
else:
|
|
441
|
+
final_result = pl.from_arrow(result_arrow).to_pandas()
|
|
400
442
|
|
|
401
443
|
# Calculate statistics
|
|
402
444
|
execution_time = (datetime.now() - start_time).total_seconds() * 1000
|
|
403
|
-
|
|
445
|
+
if self.arrow_bridge:
|
|
446
|
+
memory_after = self.arrow_bridge._get_memory_usage_mb()
|
|
447
|
+
else:
|
|
448
|
+
memory_after = 0
|
|
404
449
|
memory_used = max(0, memory_after - memory_before)
|
|
405
450
|
|
|
406
451
|
# Update statistics
|
|
@@ -422,7 +467,8 @@ class PolarsExpressionEngine:
|
|
|
422
467
|
raise PolarsExpressionError(f"AST execution failed: {e}")
|
|
423
468
|
|
|
424
469
|
finally:
|
|
425
|
-
self.arrow_bridge
|
|
470
|
+
if self.arrow_bridge:
|
|
471
|
+
self.arrow_bridge.cleanup_arrow_memory()
|
|
426
472
|
|
|
427
473
|
def validate_expression(self, expression: str) -> bool:
|
|
428
474
|
"""
|
|
@@ -489,7 +535,10 @@ class PolarsExpressionEngine:
|
|
|
489
535
|
Benchmark results
|
|
490
536
|
"""
|
|
491
537
|
times = []
|
|
492
|
-
|
|
538
|
+
if self.arrow_bridge:
|
|
539
|
+
backend_type = self.arrow_bridge.detect_backend(df)
|
|
540
|
+
else:
|
|
541
|
+
backend_type = "pandas"
|
|
493
542
|
|
|
494
543
|
for i in range(iterations):
|
|
495
544
|
try:
|
|
@@ -532,7 +581,8 @@ class PolarsExpressionEngine:
|
|
|
532
581
|
"""Cleanup callback for memory manager"""
|
|
533
582
|
try:
|
|
534
583
|
# Cleanup Arrow bridge memory
|
|
535
|
-
self.arrow_bridge
|
|
584
|
+
if self.arrow_bridge:
|
|
585
|
+
self.arrow_bridge.cleanup_arrow_memory()
|
|
536
586
|
|
|
537
587
|
# Reset statistics if they get too large
|
|
538
588
|
if self.execution_stats["total_executions"] > 10000:
|