deepvariance-sdk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepvariance_sdk-0.1.0/LICENSE +18 -0
- deepvariance_sdk-0.1.0/PKG-INFO +306 -0
- deepvariance_sdk-0.1.0/README.md +268 -0
- deepvariance_sdk-0.1.0/pyproject.toml +81 -0
- deepvariance_sdk-0.1.0/setup.cfg +4 -0
- deepvariance_sdk-0.1.0/setup.py +70 -0
- deepvariance_sdk-0.1.0/src/deepvariance/__init__.c +5409 -0
- deepvariance_sdk-0.1.0/src/deepvariance/__init__.py +12 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/__init__.c +4956 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/__init__.py +4 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/base.c +8989 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/base.py +48 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/code_generation.c +11098 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/code_generation.py +134 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/__init__.c +4954 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/__init__.py +4 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/architecture_agent.c +12272 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/architecture_agent.py +232 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/hyperparams.c +14124 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/hyperparams.py +281 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/__init__.c +5008 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/__init__.py +11 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/auto_cast_agent.c +9661 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/auto_cast_agent.py +134 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/model_recommendation_agent.c +12365 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/model_recommendation_agent.py +103 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/preprocessing_agent.c +9228 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/preprocessing_agent.py +60 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/sampling_agent.c +13200 -0
- deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/sampling_agent.py +143 -0
- deepvariance_sdk-0.1.0/src/deepvariance/analytics.c +9795 -0
- deepvariance_sdk-0.1.0/src/deepvariance/analytics.py +85 -0
- deepvariance_sdk-0.1.0/src/deepvariance/core.c +8621 -0
- deepvariance_sdk-0.1.0/src/deepvariance/core.py +25 -0
- deepvariance_sdk-0.1.0/src/deepvariance/license.c +10884 -0
- deepvariance_sdk-0.1.0/src/deepvariance/license.py +113 -0
- deepvariance_sdk-0.1.0/src/deepvariance/llm/__init__.c +4980 -0
- deepvariance_sdk-0.1.0/src/deepvariance/llm/__init__.py +5 -0
- deepvariance_sdk-0.1.0/src/deepvariance/llm/base.c +8300 -0
- deepvariance_sdk-0.1.0/src/deepvariance/llm/base.py +20 -0
- deepvariance_sdk-0.1.0/src/deepvariance/llm/groq.c +10343 -0
- deepvariance_sdk-0.1.0/src/deepvariance/llm/groq.py +40 -0
- deepvariance_sdk-0.1.0/src/deepvariance/llm/openai.c +10327 -0
- deepvariance_sdk-0.1.0/src/deepvariance/llm/openai.py +38 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/base.c +8786 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/base.py +17 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/__init__.c +4929 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/__init__.py +3 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/__init__.c +4980 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/__init__.py +5 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/data_loading.c +17756 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/data_loading.py +284 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/evaluation.c +13098 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/evaluation.py +101 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/training.c +11956 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/training.py +168 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/metrics.c +8955 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/metrics.py +49 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/pipeline.c +14471 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/pipeline.py +310 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/__init__.c +4929 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/__init__.py +3 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/__init__.c +5112 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/__init__.py +19 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/auto_cast.c +10979 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/auto_cast.py +65 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/base.c +11238 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/base.py +219 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/correlation.c +9382 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/correlation.py +39 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/data_profiling.c +9339 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/data_profiling.py +39 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/model_recommendation.c +8840 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/model_recommendation.py +33 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/model_training.c +12983 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/model_training.py +164 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/preprocessing.c +9611 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/preprocessing.py +30 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/sampling.c +9736 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/sampling.py +40 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/pipeline.c +12084 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/pipeline.py +142 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/profiling_utils.c +18962 -0
- deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/profiling_utils.py +375 -0
- deepvariance_sdk-0.1.0/src/deepvariance/typings/__init__.c +4957 -0
- deepvariance_sdk-0.1.0/src/deepvariance/typings/__init__.py +4 -0
- deepvariance_sdk-0.1.0/src/deepvariance/typings/config.c +7954 -0
- deepvariance_sdk-0.1.0/src/deepvariance/typings/config.py +37 -0
- deepvariance_sdk-0.1.0/src/deepvariance/typings/dl.c +9445 -0
- deepvariance_sdk-0.1.0/src/deepvariance/typings/dl.py +164 -0
- deepvariance_sdk-0.1.0/src/deepvariance/utils/__init__.c +4931 -0
- deepvariance_sdk-0.1.0/src/deepvariance/utils/__init__.py +5 -0
- deepvariance_sdk-0.1.0/src/deepvariance/utils/run_stats.c +9371 -0
- deepvariance_sdk-0.1.0/src/deepvariance/utils/run_stats.py +111 -0
- deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/PKG-INFO +306 -0
- deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/SOURCES.txt +97 -0
- deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/dependency_links.txt +1 -0
- deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/requires.txt +20 -0
- deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
Copyright (c) 2026 Deep Variance Dev Team. All rights reserved.
|
|
2
|
+
|
|
3
|
+
This software and its source code are proprietary and confidential.
|
|
4
|
+
Unauthorized copying, distribution, modification, or use of this software,
|
|
5
|
+
in whole or in part, via any medium, is strictly prohibited without the
|
|
6
|
+
express prior written permission of Deep Variance Dev Team.
|
|
7
|
+
|
|
8
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
9
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
10
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
11
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
12
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
13
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
14
|
+
SOFTWARE.
|
|
15
|
+
|
|
16
|
+
Use of this SDK requires a valid DeepVariance API key issued from
|
|
17
|
+
https://deepvariance.com/dashboard. Access is governed by the DeepVariance
|
|
18
|
+
Terms of Service at https://deepvariance.com/terms.
|
|
@@ -0,0 +1,306 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deepvariance-sdk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: DeepVariance Python AutoML SDK — LLM-driven pipelines for tabular ML and image classification
|
|
5
|
+
Author: Deep Variance Dev Team
|
|
6
|
+
License-Expression: LicenseRef-proprietary
|
|
7
|
+
Project-URL: Homepage, https://deepvariance.com
|
|
8
|
+
Keywords: automl,llm,machine-learning,deep-learning,autogluon,pytorch
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Intended Audience :: Science/Research
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
15
|
+
Classifier: Typing :: Typed
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pandas>=2.0
|
|
20
|
+
Requires-Dist: numpy>=1.24
|
|
21
|
+
Requires-Dist: scipy>=1.10
|
|
22
|
+
Requires-Dist: scikit-learn>=1.3
|
|
23
|
+
Requires-Dist: psutil>=5.9
|
|
24
|
+
Requires-Dist: openai>=1.0
|
|
25
|
+
Requires-Dist: groq>=0.9
|
|
26
|
+
Requires-Dist: autogluon.tabular>=1.0
|
|
27
|
+
Requires-Dist: torch>=2.0
|
|
28
|
+
Requires-Dist: torchvision>=0.15
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
31
|
+
Requires-Dist: pytest-cov; extra == "dev"
|
|
32
|
+
Requires-Dist: ruff>=0.4; extra == "dev"
|
|
33
|
+
Requires-Dist: cython>=3.0; extra == "dev"
|
|
34
|
+
Provides-Extra: docs
|
|
35
|
+
Requires-Dist: sphinx>=6.2; extra == "docs"
|
|
36
|
+
Requires-Dist: sphinx-autodoc-typehints>=1.23; extra == "docs"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# DeepVariance SDK
|
|
40
|
+
|
|
41
|
+
**DeepVariance** is a Python AutoML SDK that combines LLM-driven code generation with [AutoGluon](https://auto.gluon.ai/) to automatically cast, clean, sample, preprocess, and train ML models on any tabular dataset — with a single `pipeline.run()` call.
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Table of Contents
|
|
46
|
+
|
|
47
|
+
- [How it works](#how-it-works)
|
|
48
|
+
- [Requirements](#requirements)
|
|
49
|
+
- [Installation](#installation)
|
|
50
|
+
- [Configuration](#configuration)
|
|
51
|
+
- [Quickstart](#quickstart)
|
|
52
|
+
- [Pipeline output](#pipeline-output)
|
|
53
|
+
- [PipelineConfig reference](#pipelineconfig-reference)
|
|
54
|
+
- [Progress callbacks](#progress-callbacks)
|
|
55
|
+
- [Build](#build)
|
|
56
|
+
- [Documentation](#documentation)
|
|
57
|
+
- [Development](#development)
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## How it works
|
|
62
|
+
|
|
63
|
+
The `MLPipeline` executes 7 sequential layers against your DataFrame:
|
|
64
|
+
|
|
65
|
+
| # | Layer | Type | What it does |
|
|
66
|
+
| --- | -------------------------- | -------------------- | ---------------------------------------------------------------- |
|
|
67
|
+
| 1 | `AutoCastLayer` | LLM → code | Infers and applies column types, encodes categoricals |
|
|
68
|
+
| 2 | `DataProfilingLayer` | Deterministic | Computes feature + target statistics |
|
|
69
|
+
| 3 | `CorrelationLayer` | Deterministic | Pearson correlation matrix + mutual information scores |
|
|
70
|
+
| 4 | `SamplingLayer` | LLM → code | Produces a stratified, representative sample |
|
|
71
|
+
| 5 | `PreprocessingLayer` | LLM → code | Generates and applies pandas transforms (imputation, scaling, …) |
|
|
72
|
+
| 6 | `ModelRecommendationLayer` | LLM → recommendation | Selects the best AutoGluon model codes for your task |
|
|
73
|
+
| 7 | `ModelTrainingLayer` | Deterministic | Trains and evaluates a `TabularPredictor`, returns metrics |
|
|
74
|
+
|
|
75
|
+
LLM-driven layers use a **retry loop** — if the generated code raises an exception, the error is fed back to the LLM for self-correction.
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Requirements
|
|
80
|
+
|
|
81
|
+
- Python ≥ 3.12
|
|
82
|
+
- A **DeepVariance API key** — email [founders@deepvariance.com](mailto:founders@deepvariance.com) or fill the contact form at [deepvariance.com](https://deepvariance.com)
|
|
83
|
+
- An **OpenAI** or **Groq** API key
|
|
84
|
+
|
|
85
|
+
---
|
|
86
|
+
|
|
87
|
+
## Installation
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
pip install deepvariance-sdk
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
Dependencies installed automatically: `pandas`, `numpy`, `scipy`, `scikit-learn`, `psutil`, `openai`, `groq`, `autogluon.tabular`, `torch`, `torchvision`
|
|
94
|
+
|
|
95
|
+
### Dev install (from source)
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
git clone <repo-url> deepvariance-sdk
|
|
99
|
+
cd deepvariance-sdk
|
|
100
|
+
uv venv && source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
101
|
+
uv pip install -e ".[dev]" # installs all deps + pytest, ruff, cython
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## Configuration
|
|
107
|
+
|
|
108
|
+
The SDK reads credentials from environment variables. Set them in your shell
|
|
109
|
+
before running:
|
|
110
|
+
|
|
111
|
+
```bash
|
|
112
|
+
export DV_API_KEY=dv_...
|
|
113
|
+
export OPENAI_API_KEY=sk-...
|
|
114
|
+
export GROQ_API_KEY=gsk_... # fallback if OpenAI key is absent
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
The SDK resolves LLM providers in order: **OpenAI → Groq**. You only need one.
|
|
118
|
+
|
|
119
|
+
### Optional: load from a `.env` file (local dev)
|
|
120
|
+
|
|
121
|
+
`python-dotenv` is not required by the SDK, but it is a convenient way to
|
|
122
|
+
manage keys during local development.
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
pip install python-dotenv
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Create a `.env` file at the project root (see `.env.example`):
|
|
129
|
+
|
|
130
|
+
```dotenv
|
|
131
|
+
# .env
|
|
132
|
+
DV_API_KEY=dv_...
|
|
133
|
+
OPENAI_API_KEY=sk-...
|
|
134
|
+
GROQ_API_KEY=gsk_...
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Then load it at the top of your script, **before** constructing `PipelineConfig`:
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from dotenv import load_dotenv
|
|
141
|
+
load_dotenv() # reads .env into os.environ
|
|
142
|
+
|
|
143
|
+
import os
|
|
144
|
+
from deepvariance.pipelines.ml import MLPipeline
|
|
145
|
+
from deepvariance.typings import PipelineConfig
|
|
146
|
+
|
|
147
|
+
config = PipelineConfig(
|
|
148
|
+
dv_api_key=os.getenv("DV_API_KEY"),
|
|
149
|
+
openai_api_key=os.getenv("OPENAI_API_KEY"),
|
|
150
|
+
)
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
> **Never commit your `.env` file.** Add it to `.gitignore`:
|
|
154
|
+
> ```
|
|
155
|
+
> .env
|
|
156
|
+
> ```
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## Quickstart
|
|
161
|
+
|
|
162
|
+
```python
|
|
163
|
+
import os
|
|
164
|
+
import pandas as pd
|
|
165
|
+
|
|
166
|
+
from deepvariance.pipelines.ml import MLPipeline
|
|
167
|
+
from deepvariance.typings import PipelineConfig
|
|
168
|
+
|
|
169
|
+
# 1. Load your data
|
|
170
|
+
data = pd.read_csv("your_dataset.csv")
|
|
171
|
+
|
|
172
|
+
# 2. Configure
|
|
173
|
+
config = PipelineConfig(
|
|
174
|
+
dv_api_key=os.getenv("DV_API_KEY"),
|
|
175
|
+
openai_api_key=os.getenv("OPENAI_API_KEY"),
|
|
176
|
+
groq_api_key=os.getenv("GROQ_API_KEY"),
|
|
177
|
+
sample_percentage=0.1, # train on a 10% stratified sample
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
# 3. Run
|
|
181
|
+
pipeline = MLPipeline(config=config)
|
|
182
|
+
result = pipeline.run(data, target="your_target_column")
|
|
183
|
+
|
|
184
|
+
# 4. Inspect results
|
|
185
|
+
print(result["metrics"])
|
|
186
|
+
print(result["leaderboard"])
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
Run the bundled examples directly:
|
|
190
|
+
|
|
191
|
+
```bash
|
|
192
|
+
# Binary classification — Australia weather dataset
|
|
193
|
+
.venv/bin/python examples/ml_quickstart.py
|
|
194
|
+
|
|
195
|
+
# Regression — medical insurance dataset
|
|
196
|
+
.venv/bin/python examples/insurance_regression.py
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Pipeline output
|
|
202
|
+
|
|
203
|
+
`pipeline.run()` returns a dict:
|
|
204
|
+
|
|
205
|
+
| Key | Type | Description |
|
|
206
|
+
| -------------------- | ---------------------- | --------------------------------------------------- |
|
|
207
|
+
| `metrics` | `dict[str, float]` | Accuracy, F1, ROC-AUC, RMSE, R², … (task-dependent) |
|
|
208
|
+
| `model` | `TabularPredictor` | Trained AutoGluon predictor |
|
|
209
|
+
| `leaderboard` | `pd.DataFrame` | All candidate models ranked by validation score |
|
|
210
|
+
| `feature_importance` | `pd.DataFrame \| None` | Feature importance scores from the best model |
|
|
211
|
+
| `run_stats` | `dict` | Wall-clock duration and peak memory per layer |
|
|
212
|
+
|
|
213
|
+
### Classification metrics
|
|
214
|
+
|
|
215
|
+
`accuracy`, `f1_macro`, `f1_weighted`, `precision_macro`, `precision_weighted`, `recall_macro`, `recall_weighted`, `cohen_kappa`, `mcc`, `roc_auc` (binary) / `roc_auc_ovr` (multiclass), `log_loss`
|
|
216
|
+
|
|
217
|
+
### Regression metrics
|
|
218
|
+
|
|
219
|
+
`rmse`, `mae`, `r2`, `median_ae`, `max_error`, `explained_var`, `mape`
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## PipelineConfig reference
|
|
224
|
+
|
|
225
|
+
```python
|
|
226
|
+
@dataclass
|
|
227
|
+
class PipelineConfig:
|
|
228
|
+
dv_api_key: str | None = None # DeepVariance API key (or set DV_API_KEY env var)
|
|
229
|
+
openai_api_key: str | None = None # OpenAI API key
|
|
230
|
+
groq_api_key: str | None = None # Groq API key (fallback)
|
|
231
|
+
sample_percentage: float | None = None # e.g. 0.1 → 10% sample fed to AutoGluon
|
|
232
|
+
extra: dict[str, Any] = field(default_factory=dict) # pipeline-specific overrides
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
`sample_percentage` controls the fraction of rows passed to AutoGluon after the LLM sampling stage. For large datasets (> 100k rows) a value of `0.1`–`0.2` keeps training fast while preserving distribution.
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## Progress callbacks
|
|
240
|
+
|
|
241
|
+
Pass an `on_progress` callable to get real-time stage updates:
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
def on_progress(stage: str, status: str) -> None:
|
|
245
|
+
# stage — e.g. "AutoCastLayer", "ModelTrainingLayer"
|
|
246
|
+
# status — "start" | "complete" | "error"
|
|
247
|
+
icon = {"start": "▶", "complete": "✓", "error": "✗"}.get(status, "·")
|
|
248
|
+
print(f" {icon} {stage}: {status}")
|
|
249
|
+
|
|
250
|
+
result = pipeline.run(data, target="label", on_progress=on_progress)
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
---
|
|
254
|
+
|
|
255
|
+
## Build
|
|
256
|
+
|
|
257
|
+
The release wheel compiles all source to native C extensions via Cython —
|
|
258
|
+
no Python source is included in the distributed package.
|
|
259
|
+
|
|
260
|
+
```bash
|
|
261
|
+
# Install build dependencies (one-time)
|
|
262
|
+
uv pip install -e ".[dev]"
|
|
263
|
+
|
|
264
|
+
# Compile extensions in-place (for local dev / running tests against .so)
|
|
265
|
+
just build-ext
|
|
266
|
+
|
|
267
|
+
# Build a release wheel (compiled .so only, no .py source)
|
|
268
|
+
just build-wheel
|
|
269
|
+
# → e.g. dist/deepvariance_sdk-0.1.0-cp312-cp312-macosx_11_0_arm64.whl
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
For CI, build on each target platform (macOS arm64, Linux x86_64) and upload
|
|
273
|
+
all wheels to PyPI so users get the right binary for their machine.
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## Documentation
|
|
278
|
+
|
|
279
|
+
The project includes **Sphinx-based documentation** under the `docs/` directory. To build the HTML locally:
|
|
280
|
+
|
|
281
|
+
```bash
|
|
282
|
+
# install docs dependencies (optional group)
|
|
283
|
+
uv pip install -e ".[docs]" # or use pip/poetry/uv manually
|
|
284
|
+
cd docs
|
|
285
|
+
make html # requires make; or run `sphinx-build -b html . _build/html`
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
The generated site will appear in `docs/_build/html/index.html`.
|
|
289
|
+
|
|
290
|
+
See `docs/quickstart.rst` for a getting-started guide and `docs/api.rst` for
|
|
291
|
+
an auto-generated API reference.
|
|
292
|
+
|
|
293
|
+
## Development
|
|
294
|
+
|
|
295
|
+
```bash
|
|
296
|
+
# Run tests
|
|
297
|
+
.venv/bin/python -m pytest tests/ -q
|
|
298
|
+
|
|
299
|
+
# Lint
|
|
300
|
+
.venv/bin/ruff check src/ tests/
|
|
301
|
+
|
|
302
|
+
# Format
|
|
303
|
+
.venv/bin/ruff format src/ tests/
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
All lint rules are configured in `pyproject.toml` under `[tool.ruff]`.
|
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# DeepVariance SDK
|
|
2
|
+
|
|
3
|
+
**DeepVariance** is a Python AutoML SDK that combines LLM-driven code generation with [AutoGluon](https://auto.gluon.ai/) to automatically cast, clean, sample, preprocess, and train ML models on any tabular dataset — with a single `pipeline.run()` call.
|
|
4
|
+
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
## Table of Contents
|
|
8
|
+
|
|
9
|
+
- [How it works](#how-it-works)
|
|
10
|
+
- [Requirements](#requirements)
|
|
11
|
+
- [Installation](#installation)
|
|
12
|
+
- [Configuration](#configuration)
|
|
13
|
+
- [Quickstart](#quickstart)
|
|
14
|
+
- [Pipeline output](#pipeline-output)
|
|
15
|
+
- [PipelineConfig reference](#pipelineconfig-reference)
|
|
16
|
+
- [Progress callbacks](#progress-callbacks)
|
|
17
|
+
- [Build](#build)
|
|
18
|
+
- [Documentation](#documentation)
|
|
19
|
+
- [Development](#development)
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## How it works
|
|
24
|
+
|
|
25
|
+
The `MLPipeline` executes 7 sequential layers against your DataFrame:
|
|
26
|
+
|
|
27
|
+
| # | Layer | Type | What it does |
|
|
28
|
+
| --- | -------------------------- | -------------------- | ---------------------------------------------------------------- |
|
|
29
|
+
| 1 | `AutoCastLayer` | LLM → code | Infers and applies column types, encodes categoricals |
|
|
30
|
+
| 2 | `DataProfilingLayer` | Deterministic | Computes feature + target statistics |
|
|
31
|
+
| 3 | `CorrelationLayer` | Deterministic | Pearson correlation matrix + mutual information scores |
|
|
32
|
+
| 4 | `SamplingLayer` | LLM → code | Produces a stratified, representative sample |
|
|
33
|
+
| 5 | `PreprocessingLayer` | LLM → code | Generates and applies pandas transforms (imputation, scaling, …) |
|
|
34
|
+
| 6 | `ModelRecommendationLayer` | LLM → recommendation | Selects the best AutoGluon model codes for your task |
|
|
35
|
+
| 7 | `ModelTrainingLayer` | Deterministic | Trains and evaluates a `TabularPredictor`, returns metrics |
|
|
36
|
+
|
|
37
|
+
LLM-driven layers use a **retry loop** — if the generated code raises an exception, the error is fed back to the LLM for self-correction.
|
|
38
|
+
|
|
39
|
+
---
|
|
40
|
+
|
|
41
|
+
## Requirements
|
|
42
|
+
|
|
43
|
+
- Python ≥ 3.12
|
|
44
|
+
- A **DeepVariance API key** — email [founders@deepvariance.com](mailto:founders@deepvariance.com) or fill the contact form at [deepvariance.com](https://deepvariance.com)
|
|
45
|
+
- An **OpenAI** or **Groq** API key
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install deepvariance-sdk
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
Dependencies installed automatically: `pandas`, `numpy`, `scipy`, `scikit-learn`, `psutil`, `openai`, `groq`, `autogluon.tabular`, `torch`, `torchvision`
|
|
56
|
+
|
|
57
|
+
### Dev install (from source)
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone <repo-url> deepvariance-sdk
|
|
61
|
+
cd deepvariance-sdk
|
|
62
|
+
uv venv && source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
63
|
+
uv pip install -e ".[dev]" # installs all deps + pytest, ruff, cython
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
## Configuration
|
|
69
|
+
|
|
70
|
+
The SDK reads credentials from environment variables. Set them in your shell
|
|
71
|
+
before running:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
export DV_API_KEY=dv_...
|
|
75
|
+
export OPENAI_API_KEY=sk-...
|
|
76
|
+
export GROQ_API_KEY=gsk_... # fallback if OpenAI key is absent
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
The SDK resolves LLM providers in order: **OpenAI → Groq**. You only need one.
|
|
80
|
+
|
|
81
|
+
### Optional: load from a `.env` file (local dev)
|
|
82
|
+
|
|
83
|
+
`python-dotenv` is not required by the SDK, but it is a convenient way to
|
|
84
|
+
manage keys during local development.
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
pip install python-dotenv
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Create a `.env` file at the project root (see `.env.example`):
|
|
91
|
+
|
|
92
|
+
```dotenv
|
|
93
|
+
# .env
|
|
94
|
+
DV_API_KEY=dv_...
|
|
95
|
+
OPENAI_API_KEY=sk-...
|
|
96
|
+
GROQ_API_KEY=gsk_...
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
Then load it at the top of your script, **before** constructing `PipelineConfig`:
|
|
100
|
+
|
|
101
|
+
```python
|
|
102
|
+
from dotenv import load_dotenv
|
|
103
|
+
load_dotenv() # reads .env into os.environ
|
|
104
|
+
|
|
105
|
+
import os
|
|
106
|
+
from deepvariance.pipelines.ml import MLPipeline
|
|
107
|
+
from deepvariance.typings import PipelineConfig
|
|
108
|
+
|
|
109
|
+
config = PipelineConfig(
|
|
110
|
+
dv_api_key=os.getenv("DV_API_KEY"),
|
|
111
|
+
openai_api_key=os.getenv("OPENAI_API_KEY"),
|
|
112
|
+
)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
> **Never commit your `.env` file.** Add it to `.gitignore`:
|
|
116
|
+
> ```
|
|
117
|
+
> .env
|
|
118
|
+
> ```
|
|
119
|
+
|
|
120
|
+
---
|
|
121
|
+
|
|
122
|
+
## Quickstart
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
import os
|
|
126
|
+
import pandas as pd
|
|
127
|
+
|
|
128
|
+
from deepvariance.pipelines.ml import MLPipeline
|
|
129
|
+
from deepvariance.typings import PipelineConfig
|
|
130
|
+
|
|
131
|
+
# 1. Load your data
|
|
132
|
+
data = pd.read_csv("your_dataset.csv")
|
|
133
|
+
|
|
134
|
+
# 2. Configure
|
|
135
|
+
config = PipelineConfig(
|
|
136
|
+
dv_api_key=os.getenv("DV_API_KEY"),
|
|
137
|
+
openai_api_key=os.getenv("OPENAI_API_KEY"),
|
|
138
|
+
groq_api_key=os.getenv("GROQ_API_KEY"),
|
|
139
|
+
sample_percentage=0.1, # train on a 10% stratified sample
|
|
140
|
+
)
|
|
141
|
+
|
|
142
|
+
# 3. Run
|
|
143
|
+
pipeline = MLPipeline(config=config)
|
|
144
|
+
result = pipeline.run(data, target="your_target_column")
|
|
145
|
+
|
|
146
|
+
# 4. Inspect results
|
|
147
|
+
print(result["metrics"])
|
|
148
|
+
print(result["leaderboard"])
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
Run the bundled examples directly:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
# Binary classification — Australia weather dataset
|
|
155
|
+
.venv/bin/python examples/ml_quickstart.py
|
|
156
|
+
|
|
157
|
+
# Regression — medical insurance dataset
|
|
158
|
+
.venv/bin/python examples/insurance_regression.py
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## Pipeline output
|
|
164
|
+
|
|
165
|
+
`pipeline.run()` returns a dict:
|
|
166
|
+
|
|
167
|
+
| Key | Type | Description |
|
|
168
|
+
| -------------------- | ---------------------- | --------------------------------------------------- |
|
|
169
|
+
| `metrics` | `dict[str, float]` | Accuracy, F1, ROC-AUC, RMSE, R², … (task-dependent) |
|
|
170
|
+
| `model` | `TabularPredictor` | Trained AutoGluon predictor |
|
|
171
|
+
| `leaderboard` | `pd.DataFrame` | All candidate models ranked by validation score |
|
|
172
|
+
| `feature_importance` | `pd.DataFrame \| None` | Feature importance scores from the best model |
|
|
173
|
+
| `run_stats` | `dict` | Wall-clock duration and peak memory per layer |
|
|
174
|
+
|
|
175
|
+
### Classification metrics
|
|
176
|
+
|
|
177
|
+
`accuracy`, `f1_macro`, `f1_weighted`, `precision_macro`, `precision_weighted`, `recall_macro`, `recall_weighted`, `cohen_kappa`, `mcc`, `roc_auc` (binary) / `roc_auc_ovr` (multiclass), `log_loss`
|
|
178
|
+
|
|
179
|
+
### Regression metrics
|
|
180
|
+
|
|
181
|
+
`rmse`, `mae`, `r2`, `median_ae`, `max_error`, `explained_var`, `mape`
|
|
182
|
+
|
|
183
|
+
---
|
|
184
|
+
|
|
185
|
+
## PipelineConfig reference
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
@dataclass
|
|
189
|
+
class PipelineConfig:
|
|
190
|
+
dv_api_key: str | None = None # DeepVariance API key (or set DV_API_KEY env var)
|
|
191
|
+
openai_api_key: str | None = None # OpenAI API key
|
|
192
|
+
groq_api_key: str | None = None # Groq API key (fallback)
|
|
193
|
+
sample_percentage: float | None = None # e.g. 0.1 → 10% sample fed to AutoGluon
|
|
194
|
+
extra: dict[str, Any] = field(default_factory=dict) # pipeline-specific overrides
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
`sample_percentage` controls the fraction of rows passed to AutoGluon after the LLM sampling stage. For large datasets (> 100k rows) a value of `0.1`–`0.2` keeps training fast while preserving distribution.
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## Progress callbacks
|
|
202
|
+
|
|
203
|
+
Pass an `on_progress` callable to get real-time stage updates:
|
|
204
|
+
|
|
205
|
+
```python
|
|
206
|
+
def on_progress(stage: str, status: str) -> None:
|
|
207
|
+
# stage — e.g. "AutoCastLayer", "ModelTrainingLayer"
|
|
208
|
+
# status — "start" | "complete" | "error"
|
|
209
|
+
icon = {"start": "▶", "complete": "✓", "error": "✗"}.get(status, "·")
|
|
210
|
+
print(f" {icon} {stage}: {status}")
|
|
211
|
+
|
|
212
|
+
result = pipeline.run(data, target="label", on_progress=on_progress)
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
---
|
|
216
|
+
|
|
217
|
+
## Build
|
|
218
|
+
|
|
219
|
+
The release wheel compiles all source to native C extensions via Cython —
|
|
220
|
+
no Python source is included in the distributed package.
|
|
221
|
+
|
|
222
|
+
```bash
|
|
223
|
+
# Install build dependencies (one-time)
|
|
224
|
+
uv pip install -e ".[dev]"
|
|
225
|
+
|
|
226
|
+
# Compile extensions in-place (for local dev / running tests against .so)
|
|
227
|
+
just build-ext
|
|
228
|
+
|
|
229
|
+
# Build a release wheel (compiled .so only, no .py source)
|
|
230
|
+
just build-wheel
|
|
231
|
+
# → e.g. dist/deepvariance_sdk-0.1.0-cp312-cp312-macosx_11_0_arm64.whl
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
For CI, build on each target platform (macOS arm64, Linux x86_64) and upload
|
|
235
|
+
all wheels to PyPI so users get the right binary for their machine.
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## Documentation
|
|
240
|
+
|
|
241
|
+
The project includes **Sphinx-based documentation** under the `docs/` directory. To build the HTML locally:
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
# install docs dependencies (optional group)
|
|
245
|
+
uv pip install -e ".[docs]" # or use pip/poetry/uv manually
|
|
246
|
+
cd docs
|
|
247
|
+
make html # requires make; or run `sphinx-build -b html . _build/html`
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
The generated site will appear in `docs/_build/html/index.html`.
|
|
251
|
+
|
|
252
|
+
See `docs/quickstart.rst` for a getting-started guide and `docs/api.rst` for
|
|
253
|
+
an auto-generated API reference.
|
|
254
|
+
|
|
255
|
+
## Development
|
|
256
|
+
|
|
257
|
+
```bash
|
|
258
|
+
# Run tests
|
|
259
|
+
.venv/bin/python -m pytest tests/ -q
|
|
260
|
+
|
|
261
|
+
# Lint
|
|
262
|
+
.venv/bin/ruff check src/ tests/
|
|
263
|
+
|
|
264
|
+
# Format
|
|
265
|
+
.venv/bin/ruff format src/ tests/
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
All lint rules are configured in `pyproject.toml` under `[tool.ruff]`.
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "cython>=3.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[tool.setuptools.packages.find]
|
|
6
|
+
where = ["src"]
|
|
7
|
+
|
|
8
|
+
[tool.setuptools.package-data]
|
|
9
|
+
"*" = []
|
|
10
|
+
|
|
11
|
+
[project]
|
|
12
|
+
name = "deepvariance-sdk"
|
|
13
|
+
version = "0.1.0"
|
|
14
|
+
description = "DeepVariance Python AutoML SDK — LLM-driven pipelines for tabular ML and image classification"
|
|
15
|
+
readme = "README.md"
|
|
16
|
+
requires-python = ">=3.12"
|
|
17
|
+
license = "LicenseRef-proprietary"
|
|
18
|
+
license-files = ["LICENSE"]
|
|
19
|
+
authors = [
|
|
20
|
+
{ name = "Deep Variance Dev Team" },
|
|
21
|
+
]
|
|
22
|
+
keywords = ["automl", "llm", "machine-learning", "deep-learning", "autogluon", "pytorch"]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Development Status :: 4 - Beta",
|
|
25
|
+
"Intended Audience :: Developers",
|
|
26
|
+
"Intended Audience :: Science/Research",
|
|
27
|
+
"Programming Language :: Python :: 3",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
30
|
+
"Typing :: Typed",
|
|
31
|
+
]
|
|
32
|
+
dependencies = [
|
|
33
|
+
"pandas>=2.0",
|
|
34
|
+
"numpy>=1.24",
|
|
35
|
+
"scipy>=1.10",
|
|
36
|
+
"scikit-learn>=1.3",
|
|
37
|
+
"psutil>=5.9",
|
|
38
|
+
"openai>=1.0",
|
|
39
|
+
"groq>=0.9",
|
|
40
|
+
"autogluon.tabular>=1.0",
|
|
41
|
+
"torch>=2.0",
|
|
42
|
+
"torchvision>=0.15",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
"Homepage" = "https://deepvariance.com"
|
|
47
|
+
|
|
48
|
+
[project.optional-dependencies]
|
|
49
|
+
dev = ["pytest>=7", "pytest-cov", "ruff>=0.4", "cython>=3.0"]
|
|
50
|
+
docs = ["sphinx>=6.2", "sphinx-autodoc-typehints>=1.23"]
|
|
51
|
+
|
|
52
|
+
# Ruff — lint + format
|
|
53
|
+
[tool.ruff]
|
|
54
|
+
target-version = "py312"
|
|
55
|
+
line-length = 100
|
|
56
|
+
|
|
57
|
+
[tool.ruff.lint]
|
|
58
|
+
select = [
|
|
59
|
+
"E", # pycodestyle errors
|
|
60
|
+
"W", # pycodestyle warnings
|
|
61
|
+
"F", # pyflakes (unused imports, undefined names, …)
|
|
62
|
+
"I", # isort
|
|
63
|
+
"B", # flake8-bugbear
|
|
64
|
+
"C4", # flake8-comprehensions
|
|
65
|
+
"SIM", # flake8-simplify
|
|
66
|
+
"RUF", # ruff-specific rules
|
|
67
|
+
]
|
|
68
|
+
ignore = [
|
|
69
|
+
"E501", # line too long — formatter handles wrapping; long strings are intentional
|
|
70
|
+
"SIM108", # ternary operator — explicit if/else is clearer in agent prompts
|
|
71
|
+
"B904", # raise ... from exc — already done; don't force on every bare raise
|
|
72
|
+
"RUF001", # ambiguous Unicode — EN DASH in system-prompt strings is intentional
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
[tool.ruff.lint.per-file-ignores]
|
|
76
|
+
"tests/**" = ["S101"] # allow bare assert in tests
|
|
77
|
+
|
|
78
|
+
[tool.ruff.format]
|
|
79
|
+
quote-style = "double"
|
|
80
|
+
indent-style = "space"
|
|
81
|
+
skip-magic-trailing-comma = false
|