additory 0.1.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory-0.1.0a1/LICENSE +21 -0
- additory-0.1.0a1/PKG-INFO +293 -0
- additory-0.1.0a1/README.md +261 -0
- additory-0.1.0a1/additory/__init__.py +15 -0
- additory-0.1.0a1/additory/analysis/__init__.py +48 -0
- additory-0.1.0a1/additory/analysis/cardinality.py +126 -0
- additory-0.1.0a1/additory/analysis/correlations.py +124 -0
- additory-0.1.0a1/additory/analysis/distributions.py +376 -0
- additory-0.1.0a1/additory/analysis/quality.py +158 -0
- additory-0.1.0a1/additory/analysis/scan.py +400 -0
- additory-0.1.0a1/additory/augment/__init__.py +24 -0
- additory-0.1.0a1/additory/augment/augmentor.py +653 -0
- additory-0.1.0a1/additory/augment/builtin_lists.py +430 -0
- additory-0.1.0a1/additory/augment/distributions.py +22 -0
- additory-0.1.0a1/additory/augment/forecast.py +1132 -0
- additory-0.1.0a1/additory/augment/list_registry.py +177 -0
- additory-0.1.0a1/additory/augment/smote.py +320 -0
- additory-0.1.0a1/additory/augment/strategies.py +883 -0
- additory-0.1.0a1/additory/common/__init__.py +157 -0
- additory-0.1.0a1/additory/common/backend.py +355 -0
- additory-0.1.0a1/additory/common/column_utils.py +191 -0
- additory-0.1.0a1/additory/common/distributions.py +737 -0
- additory-0.1.0a1/additory/common/exceptions.py +62 -0
- additory-0.1.0a1/additory/common/lists.py +229 -0
- additory-0.1.0a1/additory/common/patterns.py +240 -0
- additory-0.1.0a1/additory/common/resolver.py +567 -0
- additory-0.1.0a1/additory/common/sample_data.py +182 -0
- additory-0.1.0a1/additory/common/validation.py +197 -0
- additory-0.1.0a1/additory/core/__init__.py +27 -0
- additory-0.1.0a1/additory/core/ast_builder.py +165 -0
- additory-0.1.0a1/additory/core/backends/__init__.py +23 -0
- additory-0.1.0a1/additory/core/backends/arrow_bridge.py +476 -0
- additory-0.1.0a1/additory/core/backends/cudf_bridge.py +355 -0
- additory-0.1.0a1/additory/core/column_positioning.py +358 -0
- additory-0.1.0a1/additory/core/compiler_polars.py +166 -0
- additory-0.1.0a1/additory/core/config.py +342 -0
- additory-0.1.0a1/additory/core/enhanced_cache_manager.py +1119 -0
- additory-0.1.0a1/additory/core/enhanced_matchers.py +473 -0
- additory-0.1.0a1/additory/core/enhanced_version_manager.py +325 -0
- additory-0.1.0a1/additory/core/executor.py +59 -0
- additory-0.1.0a1/additory/core/integrity_manager.py +477 -0
- additory-0.1.0a1/additory/core/loader.py +190 -0
- additory-0.1.0a1/additory/core/logging.py +24 -0
- additory-0.1.0a1/additory/core/memory_manager.py +547 -0
- additory-0.1.0a1/additory/core/namespace_manager.py +657 -0
- additory-0.1.0a1/additory/core/parser.py +176 -0
- additory-0.1.0a1/additory/core/polars_expression_engine.py +551 -0
- additory-0.1.0a1/additory/core/registry.py +176 -0
- additory-0.1.0a1/additory/core/sample_data_manager.py +492 -0
- additory-0.1.0a1/additory/core/user_namespace.py +751 -0
- additory-0.1.0a1/additory/core/validator.py +27 -0
- additory-0.1.0a1/additory/dynamic_api.py +308 -0
- additory-0.1.0a1/additory/expressions/__init__.py +26 -0
- additory-0.1.0a1/additory/expressions/engine.py +551 -0
- additory-0.1.0a1/additory/expressions/parser.py +176 -0
- additory-0.1.0a1/additory/expressions/proxy.py +546 -0
- additory-0.1.0a1/additory/expressions/registry.py +313 -0
- additory-0.1.0a1/additory/expressions/samples.py +492 -0
- additory-0.1.0a1/additory/synthetic/__init__.py +101 -0
- additory-0.1.0a1/additory/synthetic/api.py +220 -0
- additory-0.1.0a1/additory/synthetic/common_integration.py +314 -0
- additory-0.1.0a1/additory/synthetic/config.py +262 -0
- additory-0.1.0a1/additory/synthetic/engines.py +529 -0
- additory-0.1.0a1/additory/synthetic/exceptions.py +180 -0
- additory-0.1.0a1/additory/synthetic/file_managers.py +518 -0
- additory-0.1.0a1/additory/synthetic/generator.py +702 -0
- additory-0.1.0a1/additory/synthetic/generator_parser.py +68 -0
- additory-0.1.0a1/additory/synthetic/integration.py +319 -0
- additory-0.1.0a1/additory/synthetic/models.py +241 -0
- additory-0.1.0a1/additory/synthetic/pattern_resolver.py +573 -0
- additory-0.1.0a1/additory/synthetic/performance.py +469 -0
- additory-0.1.0a1/additory/synthetic/polars_integration.py +464 -0
- additory-0.1.0a1/additory/synthetic/proxy.py +60 -0
- additory-0.1.0a1/additory/synthetic/schema_parser.py +685 -0
- additory-0.1.0a1/additory/synthetic/validator.py +553 -0
- additory-0.1.0a1/additory/utilities/__init__.py +53 -0
- additory-0.1.0a1/additory/utilities/encoding.py +600 -0
- additory-0.1.0a1/additory/utilities/games.py +300 -0
- additory-0.1.0a1/additory/utilities/keys.py +8 -0
- additory-0.1.0a1/additory/utilities/lookup.py +103 -0
- additory-0.1.0a1/additory/utilities/matchers.py +216 -0
- additory-0.1.0a1/additory/utilities/resolvers.py +286 -0
- additory-0.1.0a1/additory/utilities/settings.py +167 -0
- additory-0.1.0a1/additory/utilities/units.py +746 -0
- additory-0.1.0a1/additory/utilities/validators.py +153 -0
- additory-0.1.0a1/additory.egg-info/SOURCES.txt +121 -0
- additory-0.1.0a1/documentation/V0.1.0/add_augment_function.html +603 -0
- additory-0.1.0a1/documentation/V0.1.0/add_harmonize_units_function.html +564 -0
- additory-0.1.0a1/documentation/V0.1.0/add_onehotencoding_function.html +530 -0
- additory-0.1.0a1/documentation/V0.1.0/add_scan_function.html +701 -0
- additory-0.1.0a1/documentation/V0.1.0/add_synth_function.html +664 -0
- additory-0.1.0a1/documentation/V0.1.0/add_to_function.html +707 -0
- additory-0.1.0a1/documentation/V0.1.0/expressions.html +621 -0
- additory-0.1.0a1/pyproject.toml +45 -0
- additory-0.1.0a1/reference/expressions_definitions/age_category_0.1.add +66 -0
- additory-0.1.0a1/reference/expressions_definitions/blood_pressure_category_0.1.add +79 -0
- additory-0.1.0a1/reference/expressions_definitions/bmi2_0.1.add +41 -0
- additory-0.1.0a1/reference/expressions_definitions/bmi3_0.1.add +26 -0
- additory-0.1.0a1/reference/expressions_definitions/bmi_0.1.add +77 -0
- additory-0.1.0a1/reference/expressions_definitions/bmr_0.1.add +109 -0
- additory-0.1.0a1/reference/expressions_definitions/body_fat_percentage_0.1.add +124 -0
- additory-0.1.0a1/reference/expressions_definitions/bsa_0.1.add +79 -0
- additory-0.1.0a1/reference/expressions_definitions/cholesterol_ratio_0.1.add +79 -0
- additory-0.1.0a1/reference/expressions_definitions/fitness_score_0.1.add +110 -0
- additory-0.1.0a1/reference/expressions_definitions/ideal_body_weight_0.1.add +79 -0
- additory-0.1.0a1/reference/expressions_definitions/manifest.json +35 -0
- additory-0.1.0a1/reference/expressions_definitions/waist_hip_ratio_0.1.add +79 -0
- additory-0.1.0a1/reference/schema_definitions/ca.list +41 -0
- additory-0.1.0a1/reference/schema_definitions/ca.properties +14 -0
- additory-0.1.0a1/reference/schema_definitions/eu.list +41 -0
- additory-0.1.0a1/reference/schema_definitions/eu.properties +13 -0
- additory-0.1.0a1/reference/schema_definitions/finance.list +31 -0
- additory-0.1.0a1/reference/schema_definitions/finance.properties +18 -0
- additory-0.1.0a1/reference/schema_definitions/global.list +57 -0
- additory-0.1.0a1/reference/schema_definitions/global.properties +11 -0
- additory-0.1.0a1/reference/schema_definitions/healthcare.list +28 -0
- additory-0.1.0a1/reference/schema_definitions/us.list +41 -0
- additory-0.1.0a1/reference/schema_definitions/us.properties +14 -0
- additory-0.1.0a1/setup.cfg +4 -0
- additory-0.1.0a1/user_expressions/bmi1_0.1.add +41 -0
- additory-0.1.0a1/user_expressions/bmi2_0.1.add +41 -0
- additory-0.1.0a1/user_expressions/bmi3_0.1.add +26 -0
- additory-0.1.0a1/user_expressions/bmi_0.1.add +26 -0
- additory-0.1.0a1/user_expressions/manifest.json +22 -0
additory-0.1.0a1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Krishnamoorthy Sankaran
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: additory
|
|
3
|
+
Version: 0.1.0a1
|
|
4
|
+
Summary: A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.
|
|
5
|
+
Author: Krishnamoorthy Sankaran
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: homepage, https://github.com/sekarkrishna/additory
|
|
8
|
+
Project-URL: documentation, https://github.com/sekarkrishna/additory/tree/main/documentation/V0.1.0
|
|
9
|
+
Project-URL: source, https://github.com/sekarkrishna/additory
|
|
10
|
+
Project-URL: issues, https://github.com/sekarkrishna/additory/issues
|
|
11
|
+
Requires-Python: >=3.9
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: pandas>=1.5
|
|
15
|
+
Requires-Dist: polars>=0.20
|
|
16
|
+
Requires-Dist: pyyaml>=6.0
|
|
17
|
+
Requires-Dist: requests>=2.31
|
|
18
|
+
Requires-Dist: toml>=0.10
|
|
19
|
+
Requires-Dist: scipy>=1.9
|
|
20
|
+
Requires-Dist: numpy>=1.21
|
|
21
|
+
Provides-Extra: gpu
|
|
22
|
+
Requires-Dist: cudf>=24.0; extra == "gpu"
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
25
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest-xdist>=3.0; extra == "dev"
|
|
27
|
+
Requires-Dist: hypothesis>=6.0; extra == "dev"
|
|
28
|
+
Requires-Dist: black>=23.0; extra == "dev"
|
|
29
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
30
|
+
Requires-Dist: coverage>=7.0; extra == "dev"
|
|
31
|
+
Dynamic: license-file
|
|
32
|
+
|
|
33
|
+
# Additory
|
|
34
|
+
|
|
35
|
+
**A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.**
|
|
36
|
+
|
|
37
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
38
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
39
|
+
[Version 0.1.0a1](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
|
|
40
|
+
|
|
41
|
+
**Author:** Krishnamoorthy Sankaran
|
|
42
|
+
|
|
43
|
+
## 🛠️ Requirements
|
|
44
|
+
|
|
45
|
+
- **Python**: 3.9+
|
|
46
|
+
- **Core dependencies**: pandas, polars, numpy, scipy, pyyaml, requests, toml
|
|
47
|
+
- **Optional**: cuDF (for GPU support)
|
|
48
|
+
|
|
49
|
+
## 📦 Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install additory==0.1.0a1
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
**Optional GPU support:**
|
|
56
|
+
```bash
|
|
57
|
+
pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
**Development installation:**
|
|
61
|
+
```bash
|
|
62
|
+
pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
## 🎯 Core Functions
|
|
66
|
+
|
|
67
|
+
| Function | Purpose | Example |
|
|
68
|
+
|----------|---------|---------|
|
|
69
|
+
| `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
|
|
70
|
+
| `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
|
|
71
|
+
| `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
|
|
72
|
+
| `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
|
|
73
|
+
|
|
74
|
+
## 🧬 Available Expressions
|
|
75
|
+
|
|
76
|
+
Additory includes 12 built-in health and fitness expressions, including:
|
|
77
|
+
|
|
78
|
+
- **`add.bmi()`** - Body Mass Index
|
|
79
|
+
- **`add.bsa()`** - Body Surface Area
|
|
80
|
+
- **`add.bmr()`** - Basal Metabolic Rate
|
|
81
|
+
- **`add.waist_hip_ratio()`** - Waist-to-Hip Ratio
|
|
82
|
+
- **`add.body_fat_percentage()`** - Body Fat Percentage
|
|
83
|
+
- **`add.ideal_body_weight()`** - Ideal Body Weight
|
|
84
|
+
- **`add.blood_pressure_category()`** - BP Classification
|
|
85
|
+
- **`add.cholesterol_ratio()`** - Cholesterol Ratio
|
|
86
|
+
- **`add.age_category()`** - Age Classification
|
|
87
|
+
- **`add.fitness_score()`** - Overall Fitness Score
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
# Health calculations
|
|
91
|
+
patients = pd.DataFrame({
|
|
92
|
+
'weight_kg': [70, 80, 65], # Weight in kilograms
|
|
93
|
+
'height_m': [1.75, 1.80, 1.60], # Height in meters
|
|
94
|
+
'age': [25, 35, 45],
|
|
95
|
+
'gender': ['M', 'F', 'M']
|
|
96
|
+
})
|
|
97
|
+
|
|
98
|
+
patients_bmi = add.bmi(patients)
|
|
99
|
+
patients_bsa = add.bsa(patients)
|
|
100
|
+
fitness_scores = add.fitness_score(patients)
|
|
101
|
+
|
|
102
|
+
# Chain multiple expressions
|
|
103
|
+
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## 🔧 DataFrame Support
|
|
107
|
+
|
|
108
|
+
Additory works seamlessly with multiple DataFrame libraries:
|
|
109
|
+
|
|
110
|
+
- **pandas** - Full support
|
|
111
|
+
- **polars** - Full support
|
|
112
|
+
- **cuDF** - GPU acceleration support
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
import polars as pl
|
|
116
|
+
import additory as add
|
|
117
|
+
|
|
118
|
+
# Works with polars
|
|
119
|
+
df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
|
120
|
+
result = add.augment(df_polars, n_rows=100)
|
|
121
|
+
|
|
122
|
+
# Automatic type detection and conversion
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
## ✨ Key Features
|
|
126
|
+
|
|
127
|
+
### 🔧 Utilities
|
|
128
|
+
|
|
129
|
+
**add.to() - Data Lookup & Joins**
|
|
130
|
+
Simplified syntax for bringing columns from one dataframe to another.
|
|
131
|
+
|
|
132
|
+
```python
|
|
133
|
+
# Simple lookup
|
|
134
|
+
orders_with_prices = add.to(
|
|
135
|
+
orders,
|
|
136
|
+
from_df=products,
|
|
137
|
+
bring='price',
|
|
138
|
+
against='product_id'
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# Multiple columns and keys
|
|
142
|
+
enriched = add.to(
|
|
143
|
+
orders,
|
|
144
|
+
from_df=products,
|
|
145
|
+
bring=['price', 'category'],
|
|
146
|
+
against=['product_id', 'region']
|
|
147
|
+
)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
**add.onehotencoding() - Categorical Encoding**
|
|
151
|
+
Convert categorical columns to one-hot encoded format.
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
# One-hot encoding (single column)
|
|
155
|
+
encoded = add.onehotencoding(df, 'category')
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
**add.harmonize_units() - Unit Standardization**
|
|
159
|
+
Standardize units across your dataset.
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
# Unit harmonization
|
|
163
|
+
standardized = add.harmonize_units(
|
|
164
|
+
df,
|
|
165
|
+
value_column='temperature',
|
|
166
|
+
unit_column='unit',
|
|
167
|
+
target_unit='C'
|
|
168
|
+
)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
### 🧮 Expressions
|
|
172
|
+
|
|
173
|
+
Pre-built calculations for health, fitness, and common metrics. Simple examples:
|
|
174
|
+
|
|
175
|
+
```python
|
|
176
|
+
# Create patient data with correct column names
|
|
177
|
+
patients = pd.DataFrame({
|
|
178
|
+
'weight_kg': [70, 80, 65], # Weight in kilograms
|
|
179
|
+
'height_m': [1.75, 1.80, 1.60], # Height in meters
|
|
180
|
+
'age': [25, 35, 45],
|
|
181
|
+
'gender': ['M', 'F', 'M']
|
|
182
|
+
})
|
|
183
|
+
|
|
184
|
+
# Calculate BMI
|
|
185
|
+
patients_with_bmi = add.bmi(patients)
|
|
186
|
+
|
|
187
|
+
# Calculate Body Surface Area
|
|
188
|
+
patients_with_bsa = add.bsa(patients)
|
|
189
|
+
|
|
190
|
+
# Chain multiple expressions
|
|
191
|
+
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
### 🔄 Augment and Synthetic Data
|
|
195
|
+
|
|
196
|
+
**Augment** generates more data similar to your existing dataset, while **Synthetic** creates entirely new datasets from schema definitions.
|
|
197
|
+
|
|
198
|
+
**Key Differences:**
|
|
199
|
+
- **Augment**: Learns patterns from existing data to create similar rows
|
|
200
|
+
- **Synthetic**: Uses predefined schemas to generate structured data
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
# Augment existing data (learns from patterns)
|
|
204
|
+
more_customers = add.augment(customers, n_rows=1000)
|
|
205
|
+
|
|
206
|
+
# Create data from scratch with strategies
|
|
207
|
+
new_data = add.augment("@new", n_rows=500, strategy={
|
|
208
|
+
'id': 'increment:start=1',
|
|
209
|
+
'name': 'choice:[John,Jane,Bob]',
|
|
210
|
+
'age': 'range:18-65'
|
|
211
|
+
})
|
|
212
|
+
|
|
213
|
+
# Generate from schema file (structured approach)
|
|
214
|
+
customers = add.synth("customer_schema.toml", rows=10000)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## 🧪 Examples
|
|
218
|
+
|
|
219
|
+
### E-commerce Data Pipeline
|
|
220
|
+
```python
|
|
221
|
+
import pandas as pd
|
|
222
|
+
import additory as add
|
|
223
|
+
|
|
224
|
+
# Start with small customer sample
|
|
225
|
+
customers = pd.DataFrame({
|
|
226
|
+
'customer_id': [1, 2, 3],
|
|
227
|
+
'age': [25, 35, 45],
|
|
228
|
+
'region': ['North', 'South', 'East']
|
|
229
|
+
})
|
|
230
|
+
|
|
231
|
+
# Generate more customers
|
|
232
|
+
customers = add.augment(customers, n_rows=10000)
|
|
233
|
+
|
|
234
|
+
# Add customer tiers
|
|
235
|
+
tiers = pd.DataFrame({
|
|
236
|
+
'customer_id': range(1, 4), # Match original IDs
|
|
237
|
+
'tier': ['Gold', 'Silver', 'Bronze']
|
|
238
|
+
})
|
|
239
|
+
|
|
240
|
+
# Use pipeline approach
|
|
241
|
+
result = (customers
|
|
242
|
+
.pipe(add.to, from_df=tiers, bring='tier', against='customer_id')
|
|
243
|
+
.pipe(add.scan, preset="quick"))
|
|
244
|
+
|
|
245
|
+
print(result.summary())
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
### Healthcare Data Analysis
|
|
249
|
+
```python
|
|
250
|
+
# Create patient data from scratch
|
|
251
|
+
strategy = {
|
|
252
|
+
'patient_id': 'increment:start=1',
|
|
253
|
+
'age': 'range:18-80',
|
|
254
|
+
'weight_kg': 'range:50-120', # Weight in kg
|
|
255
|
+
'height_cm': 'range:150-200' # Height in cm
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
patients = add.augment("@new", n_rows=1000, strategy=strategy)
|
|
259
|
+
|
|
260
|
+
# Convert height to meters for expressions
|
|
261
|
+
patients['height_m'] = patients['height_cm'] / 100
|
|
262
|
+
|
|
263
|
+
# Calculate health metrics using pipeline
|
|
264
|
+
result = (patients
|
|
265
|
+
.pipe(add.bmi)
|
|
266
|
+
.pipe(add.scan, preset="correlations"))
|
|
267
|
+
|
|
268
|
+
print(result.correlations)
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
## 📚 Documentation
|
|
272
|
+
|
|
273
|
+
- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/)** - Detailed guides for each function
|
|
274
|
+
- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/expressions.html)** - Complete expressions reference
|
|
275
|
+
|
|
276
|
+
## 📄 License
|
|
277
|
+
|
|
278
|
+
MIT License - see [LICENSE](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/LICENSE) file for details.
|
|
279
|
+
|
|
280
|
+
## 📞 Support
|
|
281
|
+
|
|
282
|
+
- **Issues**: [GitHub Issues](https://github.com/sekarkrishna/additory/issues)
|
|
283
|
+
- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0)
|
|
284
|
+
|
|
285
|
+
## 🗺️ Roadmap: v0.1.1 (February 2025)
|
|
286
|
+
- Enhanced documentation and tutorials
|
|
287
|
+
- Performance optimizations
|
|
288
|
+
- Additional expressions
|
|
289
|
+
- Advanced synthetic data patterns
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
**Made with ❤️ for data scientists, analysts, and developers who love working with data.**
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# Additory
|
|
2
|
+
|
|
3
|
+
**A semantic, extensible dataframe transformation engine with expressions, lookup, synthetic data, and sample-data support.**
|
|
4
|
+
|
|
5
|
+
[Python 3.9+](https://www.python.org/downloads/)
|
|
6
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
7
|
+
[Version 0.1.0a1](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/)
|
|
8
|
+
|
|
9
|
+
**Author:** Krishnamoorthy Sankaran
|
|
10
|
+
|
|
11
|
+
## 🛠️ Requirements
|
|
12
|
+
|
|
13
|
+
- **Python**: 3.9+
|
|
14
|
+
- **Core dependencies**: pandas, polars, numpy, scipy, pyyaml, requests, toml
|
|
15
|
+
- **Optional**: cuDF (for GPU support)
|
|
16
|
+
|
|
17
|
+
## 📦 Installation
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install additory==0.1.0a1
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
**Optional GPU support:**
|
|
24
|
+
```bash
|
|
25
|
+
pip install additory[gpu]==0.1.0a1 # Includes cuDF for GPU acceleration
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
**Development installation:**
|
|
29
|
+
```bash
|
|
30
|
+
pip install additory[dev]==0.1.0a1 # Includes testing and development tools
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
## 🎯 Core Functions
|
|
34
|
+
|
|
35
|
+
| Function | Purpose | Example |
|
|
36
|
+
|----------|---------|---------|
|
|
37
|
+
| `add.to()` | Lookup/join operations | `add.to(df1, from_df=df2, bring='col', against='key')` |
|
|
38
|
+
| `add.augment()` | Generate additional data | `add.augment(df, n_rows=1000)` |
|
|
39
|
+
| `add.synth()` | Synthetic data from schemas | `add.synth("schema.toml", rows=5000)` |
|
|
40
|
+
| `add.scan()` | Data profiling & analysis | `add.scan(df, preset="full")` |
|
|
41
|
+
|
|
42
|
+
## 🧬 Available Expressions
|
|
43
|
+
|
|
44
|
+
Additory includes 12 built-in health and fitness expressions, including:
|
|
45
|
+
|
|
46
|
+
- **`add.bmi()`** - Body Mass Index
|
|
47
|
+
- **`add.bsa()`** - Body Surface Area
|
|
48
|
+
- **`add.bmr()`** - Basal Metabolic Rate
|
|
49
|
+
- **`add.waist_hip_ratio()`** - Waist-to-Hip Ratio
|
|
50
|
+
- **`add.body_fat_percentage()`** - Body Fat Percentage
|
|
51
|
+
- **`add.ideal_body_weight()`** - Ideal Body Weight
|
|
52
|
+
- **`add.blood_pressure_category()`** - BP Classification
|
|
53
|
+
- **`add.cholesterol_ratio()`** - Cholesterol Ratio
|
|
54
|
+
- **`add.age_category()`** - Age Classification
|
|
55
|
+
- **`add.fitness_score()`** - Overall Fitness Score
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
# Health calculations
|
|
59
|
+
patients = pd.DataFrame({
|
|
60
|
+
'weight_kg': [70, 80, 65], # Weight in kilograms
|
|
61
|
+
'height_m': [1.75, 1.80, 1.60], # Height in meters
|
|
62
|
+
'age': [25, 35, 45],
|
|
63
|
+
'gender': ['M', 'F', 'M']
|
|
64
|
+
})
|
|
65
|
+
|
|
66
|
+
patients_bmi = add.bmi(patients)
|
|
67
|
+
patients_bsa = add.bsa(patients)
|
|
68
|
+
fitness_scores = add.fitness_score(patients)
|
|
69
|
+
|
|
70
|
+
# Chain multiple expressions
|
|
71
|
+
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## 🔧 DataFrame Support
|
|
75
|
+
|
|
76
|
+
Additory works seamlessly with multiple DataFrame libraries:
|
|
77
|
+
|
|
78
|
+
- **pandas** - Full support
|
|
79
|
+
- **polars** - Full support
|
|
80
|
+
- **cuDF** - GPU acceleration support
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
import polars as pl
|
|
84
|
+
import additory as add
|
|
85
|
+
|
|
86
|
+
# Works with polars
|
|
87
|
+
df_polars = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
|
|
88
|
+
result = add.augment(df_polars, n_rows=100)
|
|
89
|
+
|
|
90
|
+
# Automatic type detection and conversion
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## ✨ Key Features
|
|
94
|
+
|
|
95
|
+
### 🔧 Utilities
|
|
96
|
+
|
|
97
|
+
**add.to() - Data Lookup & Joins**
|
|
98
|
+
Simplified syntax for bringing columns from one dataframe to another.
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
# Simple lookup
|
|
102
|
+
orders_with_prices = add.to(
|
|
103
|
+
orders,
|
|
104
|
+
from_df=products,
|
|
105
|
+
bring='price',
|
|
106
|
+
against='product_id'
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# Multiple columns and keys
|
|
110
|
+
enriched = add.to(
|
|
111
|
+
orders,
|
|
112
|
+
from_df=products,
|
|
113
|
+
bring=['price', 'category'],
|
|
114
|
+
against=['product_id', 'region']
|
|
115
|
+
)
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
**add.onehotencoding() - Categorical Encoding**
|
|
119
|
+
Convert categorical columns to one-hot encoded format.
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
# One-hot encoding (single column)
|
|
123
|
+
encoded = add.onehotencoding(df, 'category')
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
**add.harmonize_units() - Unit Standardization**
|
|
127
|
+
Standardize units across your dataset.
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
# Unit harmonization
|
|
131
|
+
standardized = add.harmonize_units(
|
|
132
|
+
df,
|
|
133
|
+
value_column='temperature',
|
|
134
|
+
unit_column='unit',
|
|
135
|
+
target_unit='C'
|
|
136
|
+
)
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
### 🧮 Expressions
|
|
140
|
+
|
|
141
|
+
Pre-built calculations for health, fitness, and common metrics. Simple examples:
|
|
142
|
+
|
|
143
|
+
```python
|
|
144
|
+
# Create patient data with correct column names
|
|
145
|
+
patients = pd.DataFrame({
|
|
146
|
+
'weight_kg': [70, 80, 65], # Weight in kilograms
|
|
147
|
+
'height_m': [1.75, 1.80, 1.60], # Height in meters
|
|
148
|
+
'age': [25, 35, 45],
|
|
149
|
+
'gender': ['M', 'F', 'M']
|
|
150
|
+
})
|
|
151
|
+
|
|
152
|
+
# Calculate BMI
|
|
153
|
+
patients_with_bmi = add.bmi(patients)
|
|
154
|
+
|
|
155
|
+
# Calculate Body Surface Area
|
|
156
|
+
patients_with_bsa = add.bsa(patients)
|
|
157
|
+
|
|
158
|
+
# Chain multiple expressions
|
|
159
|
+
result = add.fitness_score(add.bmr(add.bmi(patients)))
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### 🔄 Augment and Synthetic Data
|
|
163
|
+
|
|
164
|
+
**Augment** generates more data similar to your existing dataset, while **Synthetic** creates entirely new datasets from schema definitions.
|
|
165
|
+
|
|
166
|
+
**Key Differences:**
|
|
167
|
+
- **Augment**: Learns patterns from existing data to create similar rows
|
|
168
|
+
- **Synthetic**: Uses predefined schemas to generate structured data
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
# Augment existing data (learns from patterns)
|
|
172
|
+
more_customers = add.augment(customers, n_rows=1000)
|
|
173
|
+
|
|
174
|
+
# Create data from scratch with strategies
|
|
175
|
+
new_data = add.augment("@new", n_rows=500, strategy={
|
|
176
|
+
'id': 'increment:start=1',
|
|
177
|
+
'name': 'choice:[John,Jane,Bob]',
|
|
178
|
+
'age': 'range:18-65'
|
|
179
|
+
})
|
|
180
|
+
|
|
181
|
+
# Generate from schema file (structured approach)
|
|
182
|
+
customers = add.synth("customer_schema.toml", rows=10000)
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
## 🧪 Examples
|
|
186
|
+
|
|
187
|
+
### E-commerce Data Pipeline
|
|
188
|
+
```python
|
|
189
|
+
import pandas as pd
|
|
190
|
+
import additory as add
|
|
191
|
+
|
|
192
|
+
# Start with small customer sample
|
|
193
|
+
customers = pd.DataFrame({
|
|
194
|
+
'customer_id': [1, 2, 3],
|
|
195
|
+
'age': [25, 35, 45],
|
|
196
|
+
'region': ['North', 'South', 'East']
|
|
197
|
+
})
|
|
198
|
+
|
|
199
|
+
# Generate more customers
|
|
200
|
+
customers = add.augment(customers, n_rows=10000)
|
|
201
|
+
|
|
202
|
+
# Add customer tiers
|
|
203
|
+
tiers = pd.DataFrame({
|
|
204
|
+
'customer_id': range(1, 4), # Match original IDs
|
|
205
|
+
'tier': ['Gold', 'Silver', 'Bronze']
|
|
206
|
+
})
|
|
207
|
+
|
|
208
|
+
# Use pipeline approach
|
|
209
|
+
result = (customers
|
|
210
|
+
.pipe(add.to, from_df=tiers, bring='tier', against='customer_id')
|
|
211
|
+
.pipe(add.scan, preset="quick"))
|
|
212
|
+
|
|
213
|
+
print(result.summary())
|
|
214
|
+
```
|
|
215
|
+
|
|
216
|
+
### Healthcare Data Analysis
|
|
217
|
+
```python
|
|
218
|
+
# Create patient data from scratch
|
|
219
|
+
strategy = {
|
|
220
|
+
'patient_id': 'increment:start=1',
|
|
221
|
+
'age': 'range:18-80',
|
|
222
|
+
'weight_kg': 'range:50-120', # Weight in kg
|
|
223
|
+
'height_cm': 'range:150-200' # Height in cm
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
patients = add.augment("@new", n_rows=1000, strategy=strategy)
|
|
227
|
+
|
|
228
|
+
# Convert height to meters for expressions
|
|
229
|
+
patients['height_m'] = patients['height_cm'] / 100
|
|
230
|
+
|
|
231
|
+
# Calculate health metrics using pipeline
|
|
232
|
+
result = (patients
|
|
233
|
+
.pipe(add.bmi)
|
|
234
|
+
.pipe(add.scan, preset="correlations"))
|
|
235
|
+
|
|
236
|
+
print(result.correlations)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
## 📚 Documentation
|
|
240
|
+
|
|
241
|
+
- **[Function Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/)** - Detailed guides for each function
|
|
242
|
+
- **[Expressions Guide](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0/expressions.html)** - Complete expressions reference
|
|
243
|
+
|
|
244
|
+
## 📄 License
|
|
245
|
+
|
|
246
|
+
MIT License - see [LICENSE](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/LICENSE) file for details.
|
|
247
|
+
|
|
248
|
+
## 📞 Support
|
|
249
|
+
|
|
250
|
+
- **Issues**: [GitHub Issues](https://github.com/sekarkrishna/additory/issues)
|
|
251
|
+
- **Documentation**: [Full Documentation](https://github.com/sekarkrishna/additory/tree/main/V0.1.0a1/documentation/V0.1.0)
|
|
252
|
+
|
|
253
|
+
## 🗺️ Roadmap: v0.1.1 (February 2025)
|
|
254
|
+
- Enhanced documentation and tutorials
|
|
255
|
+
- Performance optimizations
|
|
256
|
+
- Additional expressions
|
|
257
|
+
- Advanced synthetic data patterns
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
**Made with ❤️ for data scientists, analysts, and developers who love working with data.**
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# additory/__init__.py
|
|
2
|
+
|
|
3
|
+
from .dynamic_api import add as _api_instance
|
|
4
|
+
|
|
5
|
+
# Expose the API instance normally
|
|
6
|
+
add = _api_instance
|
|
7
|
+
|
|
8
|
+
# Module-level __getattr__ to forward dynamic attributes
|
|
9
|
+
def __getattr__(name):
|
|
10
|
+
# Delegate all unknown attributes to the API instance
|
|
11
|
+
return getattr(_api_instance, name)
|
|
12
|
+
|
|
13
|
+
__all__ = [
|
|
14
|
+
"add",
|
|
15
|
+
]
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Analysis Module for Data Profiling
|
|
3
|
+
|
|
4
|
+
Provides comprehensive data analysis capabilities:
|
|
5
|
+
- Distribution detection and fitting
|
|
6
|
+
- Correlation analysis
|
|
7
|
+
- Cardinality analysis
|
|
8
|
+
- Data quality metrics
|
|
9
|
+
- Data profiling and scanning
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from additory.analysis.distributions import (
|
|
13
|
+
detect_distributions,
|
|
14
|
+
fit_distribution,
|
|
15
|
+
DistributionFit
|
|
16
|
+
)
|
|
17
|
+
from additory.analysis.correlations import (
|
|
18
|
+
calculate_correlations,
|
|
19
|
+
CorrelationResult
|
|
20
|
+
)
|
|
21
|
+
from additory.analysis.cardinality import (
|
|
22
|
+
analyze_cardinality,
|
|
23
|
+
CardinalityInfo
|
|
24
|
+
)
|
|
25
|
+
from additory.analysis.quality import (
|
|
26
|
+
analyze_quality,
|
|
27
|
+
QualityMetrics
|
|
28
|
+
)
|
|
29
|
+
from additory.analysis.scan import (
|
|
30
|
+
scan,
|
|
31
|
+
ScanResult,
|
|
32
|
+
ColumnInfo
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
__all__ = [
|
|
36
|
+
'detect_distributions',
|
|
37
|
+
'fit_distribution',
|
|
38
|
+
'DistributionFit',
|
|
39
|
+
'calculate_correlations',
|
|
40
|
+
'CorrelationResult',
|
|
41
|
+
'analyze_cardinality',
|
|
42
|
+
'CardinalityInfo',
|
|
43
|
+
'analyze_quality',
|
|
44
|
+
'QualityMetrics',
|
|
45
|
+
'scan',
|
|
46
|
+
'ScanResult',
|
|
47
|
+
'ColumnInfo',
|
|
48
|
+
]
|