deepvariance-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99) hide show
  1. deepvariance_sdk-0.1.0/LICENSE +18 -0
  2. deepvariance_sdk-0.1.0/PKG-INFO +306 -0
  3. deepvariance_sdk-0.1.0/README.md +268 -0
  4. deepvariance_sdk-0.1.0/pyproject.toml +81 -0
  5. deepvariance_sdk-0.1.0/setup.cfg +4 -0
  6. deepvariance_sdk-0.1.0/setup.py +70 -0
  7. deepvariance_sdk-0.1.0/src/deepvariance/__init__.c +5409 -0
  8. deepvariance_sdk-0.1.0/src/deepvariance/__init__.py +12 -0
  9. deepvariance_sdk-0.1.0/src/deepvariance/agents/__init__.c +4956 -0
  10. deepvariance_sdk-0.1.0/src/deepvariance/agents/__init__.py +4 -0
  11. deepvariance_sdk-0.1.0/src/deepvariance/agents/base.c +8989 -0
  12. deepvariance_sdk-0.1.0/src/deepvariance/agents/base.py +48 -0
  13. deepvariance_sdk-0.1.0/src/deepvariance/agents/code_generation.c +11098 -0
  14. deepvariance_sdk-0.1.0/src/deepvariance/agents/code_generation.py +134 -0
  15. deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/__init__.c +4954 -0
  16. deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/__init__.py +4 -0
  17. deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/architecture_agent.c +12272 -0
  18. deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/architecture_agent.py +232 -0
  19. deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/hyperparams.c +14124 -0
  20. deepvariance_sdk-0.1.0/src/deepvariance/agents/dl/hyperparams.py +281 -0
  21. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/__init__.c +5008 -0
  22. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/__init__.py +11 -0
  23. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/auto_cast_agent.c +9661 -0
  24. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/auto_cast_agent.py +134 -0
  25. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/model_recommendation_agent.c +12365 -0
  26. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/model_recommendation_agent.py +103 -0
  27. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/preprocessing_agent.c +9228 -0
  28. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/preprocessing_agent.py +60 -0
  29. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/sampling_agent.c +13200 -0
  30. deepvariance_sdk-0.1.0/src/deepvariance/agents/ml/sampling_agent.py +143 -0
  31. deepvariance_sdk-0.1.0/src/deepvariance/analytics.c +9795 -0
  32. deepvariance_sdk-0.1.0/src/deepvariance/analytics.py +85 -0
  33. deepvariance_sdk-0.1.0/src/deepvariance/core.c +8621 -0
  34. deepvariance_sdk-0.1.0/src/deepvariance/core.py +25 -0
  35. deepvariance_sdk-0.1.0/src/deepvariance/license.c +10884 -0
  36. deepvariance_sdk-0.1.0/src/deepvariance/license.py +113 -0
  37. deepvariance_sdk-0.1.0/src/deepvariance/llm/__init__.c +4980 -0
  38. deepvariance_sdk-0.1.0/src/deepvariance/llm/__init__.py +5 -0
  39. deepvariance_sdk-0.1.0/src/deepvariance/llm/base.c +8300 -0
  40. deepvariance_sdk-0.1.0/src/deepvariance/llm/base.py +20 -0
  41. deepvariance_sdk-0.1.0/src/deepvariance/llm/groq.c +10343 -0
  42. deepvariance_sdk-0.1.0/src/deepvariance/llm/groq.py +40 -0
  43. deepvariance_sdk-0.1.0/src/deepvariance/llm/openai.c +10327 -0
  44. deepvariance_sdk-0.1.0/src/deepvariance/llm/openai.py +38 -0
  45. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/base.c +8786 -0
  46. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/base.py +17 -0
  47. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/__init__.c +4929 -0
  48. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/__init__.py +3 -0
  49. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/__init__.c +4980 -0
  50. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/__init__.py +5 -0
  51. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/data_loading.c +17756 -0
  52. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/data_loading.py +284 -0
  53. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/evaluation.c +13098 -0
  54. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/evaluation.py +101 -0
  55. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/training.c +11956 -0
  56. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/layers/training.py +168 -0
  57. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/metrics.c +8955 -0
  58. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/metrics.py +49 -0
  59. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/pipeline.c +14471 -0
  60. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/dl/pipeline.py +310 -0
  61. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/__init__.c +4929 -0
  62. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/__init__.py +3 -0
  63. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/__init__.c +5112 -0
  64. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/__init__.py +19 -0
  65. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/auto_cast.c +10979 -0
  66. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/auto_cast.py +65 -0
  67. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/base.c +11238 -0
  68. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/base.py +219 -0
  69. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/correlation.c +9382 -0
  70. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/correlation.py +39 -0
  71. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/data_profiling.c +9339 -0
  72. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/data_profiling.py +39 -0
  73. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/model_recommendation.c +8840 -0
  74. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/model_recommendation.py +33 -0
  75. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/model_training.c +12983 -0
  76. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/model_training.py +164 -0
  77. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/preprocessing.c +9611 -0
  78. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/preprocessing.py +30 -0
  79. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/sampling.c +9736 -0
  80. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/layers/sampling.py +40 -0
  81. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/pipeline.c +12084 -0
  82. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/pipeline.py +142 -0
  83. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/profiling_utils.c +18962 -0
  84. deepvariance_sdk-0.1.0/src/deepvariance/pipelines/ml/profiling_utils.py +375 -0
  85. deepvariance_sdk-0.1.0/src/deepvariance/typings/__init__.c +4957 -0
  86. deepvariance_sdk-0.1.0/src/deepvariance/typings/__init__.py +4 -0
  87. deepvariance_sdk-0.1.0/src/deepvariance/typings/config.c +7954 -0
  88. deepvariance_sdk-0.1.0/src/deepvariance/typings/config.py +37 -0
  89. deepvariance_sdk-0.1.0/src/deepvariance/typings/dl.c +9445 -0
  90. deepvariance_sdk-0.1.0/src/deepvariance/typings/dl.py +164 -0
  91. deepvariance_sdk-0.1.0/src/deepvariance/utils/__init__.c +4931 -0
  92. deepvariance_sdk-0.1.0/src/deepvariance/utils/__init__.py +5 -0
  93. deepvariance_sdk-0.1.0/src/deepvariance/utils/run_stats.c +9371 -0
  94. deepvariance_sdk-0.1.0/src/deepvariance/utils/run_stats.py +111 -0
  95. deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/PKG-INFO +306 -0
  96. deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/SOURCES.txt +97 -0
  97. deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/dependency_links.txt +1 -0
  98. deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/requires.txt +20 -0
  99. deepvariance_sdk-0.1.0/src/deepvariance_sdk.egg-info/top_level.txt +1 -0
@@ -0,0 +1,18 @@
1
+ Copyright (c) 2026 Deep Variance Dev Team. All rights reserved.
2
+
3
+ This software and its source code are proprietary and confidential.
4
+ Unauthorized copying, distribution, modification, or use of this software,
5
+ in whole or in part, via any medium, is strictly prohibited without the
6
+ express prior written permission of Deep Variance Dev Team.
7
+
8
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
9
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
10
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
11
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
12
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
13
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
14
+ SOFTWARE.
15
+
16
+ Use of this SDK requires a valid DeepVariance API key issued from
17
+ https://deepvariance.com/dashboard. Access is governed by the DeepVariance
18
+ Terms of Service at https://deepvariance.com/terms.
@@ -0,0 +1,306 @@
1
+ Metadata-Version: 2.4
2
+ Name: deepvariance-sdk
3
+ Version: 0.1.0
4
+ Summary: DeepVariance Python AutoML SDK — LLM-driven pipelines for tabular ML and image classification
5
+ Author: Deep Variance Dev Team
6
+ License-Expression: LicenseRef-proprietary
7
+ Project-URL: Homepage, https://deepvariance.com
8
+ Keywords: automl,llm,machine-learning,deep-learning,autogluon,pytorch
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Classifier: Typing :: Typed
16
+ Requires-Python: >=3.12
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: pandas>=2.0
20
+ Requires-Dist: numpy>=1.24
21
+ Requires-Dist: scipy>=1.10
22
+ Requires-Dist: scikit-learn>=1.3
23
+ Requires-Dist: psutil>=5.9
24
+ Requires-Dist: openai>=1.0
25
+ Requires-Dist: groq>=0.9
26
+ Requires-Dist: autogluon.tabular>=1.0
27
+ Requires-Dist: torch>=2.0
28
+ Requires-Dist: torchvision>=0.15
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7; extra == "dev"
31
+ Requires-Dist: pytest-cov; extra == "dev"
32
+ Requires-Dist: ruff>=0.4; extra == "dev"
33
+ Requires-Dist: cython>=3.0; extra == "dev"
34
+ Provides-Extra: docs
35
+ Requires-Dist: sphinx>=6.2; extra == "docs"
36
+ Requires-Dist: sphinx-autodoc-typehints>=1.23; extra == "docs"
37
+ Dynamic: license-file
38
+
39
+ # DeepVariance SDK
40
+
41
+ **DeepVariance** is a Python AutoML SDK that combines LLM-driven code generation with [AutoGluon](https://auto.gluon.ai/) to automatically cast, clean, sample, preprocess, and train ML models on any tabular dataset — with a single `pipeline.run()` call.
42
+
43
+ ---
44
+
45
+ ## Table of Contents
46
+
47
+ - [How it works](#how-it-works)
48
+ - [Requirements](#requirements)
49
+ - [Installation](#installation)
50
+ - [Configuration](#configuration)
51
+ - [Quickstart](#quickstart)
52
+ - [Pipeline output](#pipeline-output)
53
+ - [PipelineConfig reference](#pipelineconfig-reference)
54
+ - [Progress callbacks](#progress-callbacks)
55
+ - [Build](#build)
56
+ - [Development](#development)
57
+ - [Documentation](#documentation)
58
+
59
+ ---
60
+
61
+ ## How it works
62
+
63
+ The `MLPipeline` executes 7 sequential layers against your DataFrame:
64
+
65
+ | # | Layer | Type | What it does |
66
+ | --- | -------------------------- | -------------------- | ---------------------------------------------------------------- |
67
+ | 1 | `AutoCastLayer` | LLM → code | Infers and applies column types, encodes categoricals |
68
+ | 2 | `DataProfilingLayer` | Deterministic | Computes feature + target statistics |
69
+ | 3 | `CorrelationLayer` | Deterministic | Pearson correlation matrix + mutual information scores |
70
+ | 4 | `SamplingLayer` | LLM → code | Produces a stratified, representative sample |
71
+ | 5 | `PreprocessingLayer` | LLM → code | Generates and applies pandas transforms (imputation, scaling, …) |
72
+ | 6 | `ModelRecommendationLayer` | LLM → recommendation | Selects the best AutoGluon model codes for your task |
73
+ | 7 | `ModelTrainingLayer` | Deterministic | Trains and evaluates a `TabularPredictor`, returns metrics |
74
+
75
+ LLM-driven layers use a **retry loop** — if the generated code raises an exception, the error is fed back to the LLM for self-correction.
76
+
77
+ ---
78
+
79
+ ## Requirements
80
+
81
+ - Python ≥ 3.12
82
+ - A **DeepVariance API key** — email [founders@deepvariance.com](mailto:founders@deepvariance.com) or fill the contact form at [deepvariance.com](https://deepvariance.com)
83
+ - An **OpenAI** or **Groq** API key
84
+
85
+ ---
86
+
87
+ ## Installation
88
+
89
+ ```bash
90
+ pip install deepvariance-sdk
91
+ ```
92
+
93
+ Dependencies installed automatically: `pandas`, `numpy`, `scipy`, `scikit-learn`, `psutil`, `openai`, `groq`, `autogluon.tabular`, `torch`, `torchvision`
94
+
95
+ ### Dev install (from source)
96
+
97
+ ```bash
98
+ git clone <repo-url> deepvariance-sdk
99
+ cd deepvariance-sdk
100
+ uv venv && source .venv/bin/activate # Windows: .venv\Scripts\activate
101
+ uv pip install -e ".[dev]" # installs all deps + pytest, ruff, cython
102
+ ```
103
+
104
+ ---
105
+
106
+ ## Configuration
107
+
108
+ The SDK reads credentials from environment variables. Set them in your shell
109
+ before running:
110
+
111
+ ```bash
112
+ export DV_API_KEY=dv_...
113
+ export OPENAI_API_KEY=sk-...
114
+ export GROQ_API_KEY=gsk_... # fallback if OpenAI key is absent
115
+ ```
116
+
117
+ The SDK resolves LLM providers in order: **OpenAI → Groq**. You only need one.
118
+
119
+ ### Optional: load from a `.env` file (local dev)
120
+
121
+ `python-dotenv` is not required by the SDK, but it is a convenient way to
122
+ manage keys during local development.
123
+
124
+ ```bash
125
+ pip install python-dotenv
126
+ ```
127
+
128
+ Create a `.env` file at the project root (see `.env.example`):
129
+
130
+ ```dotenv
131
+ # .env
132
+ DV_API_KEY=dv_...
133
+ OPENAI_API_KEY=sk-...
134
+ GROQ_API_KEY=gsk_...
135
+ ```
136
+
137
+ Then load it at the top of your script, **before** constructing `PipelineConfig`:
138
+
139
+ ```python
140
+ from dotenv import load_dotenv
141
+ load_dotenv() # reads .env into os.environ
142
+
143
+ import os
144
+ from deepvariance.pipelines.ml import MLPipeline
145
+ from deepvariance.typings import PipelineConfig
146
+
147
+ config = PipelineConfig(
148
+ dv_api_key=os.getenv("DV_API_KEY"),
149
+ openai_api_key=os.getenv("OPENAI_API_KEY"),
150
+ )
151
+ ```
152
+
153
+ > **Never commit your `.env` file.** Add it to `.gitignore`:
154
+ > ```
155
+ > .env
156
+ > ```
157
+
158
+ ---
159
+
160
+ ## Quickstart
161
+
162
+ ```python
163
+ import os
164
+ import pandas as pd
165
+
166
+ from deepvariance.pipelines.ml import MLPipeline
167
+ from deepvariance.typings import PipelineConfig
168
+
169
+ # 1. Load your data
170
+ data = pd.read_csv("your_dataset.csv")
171
+
172
+ # 2. Configure
173
+ config = PipelineConfig(
174
+ dv_api_key=os.getenv("DV_API_KEY"),
175
+ openai_api_key=os.getenv("OPENAI_API_KEY"),
176
+ groq_api_key=os.getenv("GROQ_API_KEY"),
177
+ sample_percentage=0.1, # train on a 10% stratified sample
178
+ )
179
+
180
+ # 3. Run
181
+ pipeline = MLPipeline(config=config)
182
+ result = pipeline.run(data, target="your_target_column")
183
+
184
+ # 4. Inspect results
185
+ print(result["metrics"])
186
+ print(result["leaderboard"])
187
+ ```
188
+
189
+ Run the bundled examples directly:
190
+
191
+ ```bash
192
+ # Binary classification — Australia weather dataset
193
+ .venv/bin/python examples/ml_quickstart.py
194
+
195
+ # Regression — medical insurance dataset
196
+ .venv/bin/python examples/insurance_regression.py
197
+ ```
198
+
199
+ ---
200
+
201
+ ## Pipeline output
202
+
203
+ `pipeline.run()` returns a dict:
204
+
205
+ | Key | Type | Description |
206
+ | -------------------- | ---------------------- | --------------------------------------------------- |
207
+ | `metrics` | `dict[str, float]` | Accuracy, F1, ROC-AUC, RMSE, R², … (task-dependent) |
208
+ | `model` | `TabularPredictor` | Trained AutoGluon predictor |
209
+ | `leaderboard` | `pd.DataFrame` | All candidate models ranked by validation score |
210
+ | `feature_importance` | `pd.DataFrame \| None` | Feature importance scores from the best model |
211
+ | `run_stats` | `dict` | Wall-clock duration and peak memory per layer |
212
+
213
+ ### Classification metrics
214
+
215
+ `accuracy`, `f1_macro`, `f1_weighted`, `precision_macro`, `precision_weighted`, `recall_macro`, `recall_weighted`, `cohen_kappa`, `mcc`, `roc_auc` (binary) / `roc_auc_ovr` (multiclass), `log_loss`
216
+
217
+ ### Regression metrics
218
+
219
+ `rmse`, `mae`, `r2`, `median_ae`, `max_error`, `explained_var`, `mape`
220
+
221
+ ---
222
+
223
+ ## PipelineConfig reference
224
+
225
+ ```python
226
+ @dataclass
227
+ class PipelineConfig:
228
+ dv_api_key: str | None = None # DeepVariance API key (or set DV_API_KEY env var)
229
+ openai_api_key: str | None = None # OpenAI API key
230
+ groq_api_key: str | None = None # Groq API key (fallback)
231
+ sample_percentage: float | None = None # e.g. 0.1 → 10% sample fed to AutoGluon
232
+ extra: dict[str, Any] = field(default_factory=dict) # pipeline-specific overrides
233
+ ```
234
+
235
+ `sample_percentage` controls the fraction of rows passed to AutoGluon after the LLM sampling stage. For large datasets (> 100k rows) a value of `0.1`–`0.2` keeps training fast while preserving distribution.
236
+
237
+ ---
238
+
239
+ ## Progress callbacks
240
+
241
+ Pass an `on_progress` callable to get real-time stage updates:
242
+
243
+ ```python
244
+ def on_progress(stage: str, status: str) -> None:
245
+ # stage — e.g. "AutoCastLayer", "ModelTrainingLayer"
246
+ # status — "start" | "complete" | "error"
247
+ icon = {"start": "▶", "complete": "✓", "error": "✗"}.get(status, "·")
248
+ print(f" {icon} {stage}: {status}")
249
+
250
+ result = pipeline.run(data, target="label", on_progress=on_progress)
251
+ ```
252
+
253
+ ---
254
+
255
+ ## Build
256
+
257
+ The release wheel compiles all source to native C extensions via Cython —
258
+ no Python source is included in the distributed package.
259
+
260
+ ```bash
261
+ # Install build dependencies (one-time)
262
+ uv pip install -e ".[dev]"
263
+
264
+ # Compile extensions in-place (for local dev / running tests against .so)
265
+ just build-ext
266
+
267
+ # Build a release wheel (compiled .so only, no .py source)
268
+ just build-wheel
269
+ # → dist/deepvariance_sdk-0.1.0-cp312-cp312-macosx_11_0_arm64.whl
270
+ ```
271
+
272
+ For CI, build on each target platform (macOS arm64, Linux x86_64) and upload
273
+ all wheels to PyPI so users get the right binary for their machine.
274
+
275
+ ---
276
+
277
+ ## Documentation
278
+
279
+ The project now includes **Sphinx-based documentation** under the `docs/` directory. To build the HTML locally:
280
+
281
+ ```bash
282
+ # install docs dependencies (optional group)
283
+ uv pip install -e ".[docs]" # or use pip/poetry/uv manually
284
+ cd docs
285
+ make html # requires make; or run `sphinx-build -b html . _build/html`
286
+ ```
287
+
288
+ The generated site will appear in `docs/_build/html/index.html`.
289
+
290
+ See `docs/quickstart.rst` for a getting‑started guide and `docs/api.rst` for
291
+ an auto‑generated API reference.
292
+
293
+ ## Development
294
+
295
+ ```bash
296
+ # Run tests
297
+ .venv/bin/python -m pytest tests/ -q
298
+
299
+ # Lint
300
+ .venv/bin/ruff check src/ tests/
301
+
302
+ # Format
303
+ .venv/bin/ruff format src/ tests/
304
+ ```
305
+
306
+ All lint rules are configured in `pyproject.toml` under `[tool.ruff]`.
@@ -0,0 +1,268 @@
1
+ # DeepVariance SDK
2
+
3
+ **DeepVariance** is a Python AutoML SDK that combines LLM-driven code generation with [AutoGluon](https://auto.gluon.ai/) to automatically cast, clean, sample, preprocess, and train ML models on any tabular dataset — with a single `pipeline.run()` call.
4
+
5
+ ---
6
+
7
+ ## Table of Contents
8
+
9
+ - [How it works](#how-it-works)
10
+ - [Requirements](#requirements)
11
+ - [Installation](#installation)
12
+ - [Configuration](#configuration)
13
+ - [Quickstart](#quickstart)
14
+ - [Pipeline output](#pipeline-output)
15
+ - [PipelineConfig reference](#pipelineconfig-reference)
16
+ - [Progress callbacks](#progress-callbacks)
17
+ - [Build](#build)
18
+ - [Development](#development)
19
+ - [Documentation](#documentation)
20
+
21
+ ---
22
+
23
+ ## How it works
24
+
25
+ The `MLPipeline` executes 7 sequential layers against your DataFrame:
26
+
27
+ | # | Layer | Type | What it does |
28
+ | --- | -------------------------- | -------------------- | ---------------------------------------------------------------- |
29
+ | 1 | `AutoCastLayer` | LLM → code | Infers and applies column types, encodes categoricals |
30
+ | 2 | `DataProfilingLayer` | Deterministic | Computes feature + target statistics |
31
+ | 3 | `CorrelationLayer` | Deterministic | Pearson correlation matrix + mutual information scores |
32
+ | 4 | `SamplingLayer` | LLM → code | Produces a stratified, representative sample |
33
+ | 5 | `PreprocessingLayer` | LLM → code | Generates and applies pandas transforms (imputation, scaling, …) |
34
+ | 6 | `ModelRecommendationLayer` | LLM → recommendation | Selects the best AutoGluon model codes for your task |
35
+ | 7 | `ModelTrainingLayer` | Deterministic | Trains and evaluates a `TabularPredictor`, returns metrics |
36
+
37
+ LLM-driven layers use a **retry loop** — if the generated code raises an exception, the error is fed back to the LLM for self-correction.
38
+
39
+ ---
40
+
41
+ ## Requirements
42
+
43
+ - Python ≥ 3.12
44
+ - A **DeepVariance API key** — email [founders@deepvariance.com](mailto:founders@deepvariance.com) or fill the contact form at [deepvariance.com](https://deepvariance.com)
45
+ - An **OpenAI** or **Groq** API key
46
+
47
+ ---
48
+
49
+ ## Installation
50
+
51
+ ```bash
52
+ pip install deepvariance
53
+ ```
54
+
55
+ Dependencies installed automatically: `pandas`, `numpy`, `scipy`, `scikit-learn`, `psutil`, `openai`, `groq`, `autogluon.tabular`, `torch`, `torchvision`
56
+
57
+ ### Dev install (from source)
58
+
59
+ ```bash
60
+ git clone <repo-url> deepvariance-sdk
61
+ cd deepvariance-sdk
62
+ uv venv && source .venv/bin/activate # Windows: .venv\Scripts\activate
63
+ uv pip install -e ".[dev]" # installs all deps + pytest, ruff, cython
64
+ ```
65
+
66
+ ---
67
+
68
+ ## Configuration
69
+
70
+ The SDK reads credentials from environment variables. Set them in your shell
71
+ before running:
72
+
73
+ ```bash
74
+ export DV_API_KEY=dv_...
75
+ export OPENAI_API_KEY=sk-...
76
+ export GROQ_API_KEY=gsk_... # fallback if OpenAI key is absent
77
+ ```
78
+
79
+ The SDK resolves LLM providers in order: **OpenAI → Groq**. You only need one.
80
+
81
+ ### Optional: load from a `.env` file (local dev)
82
+
83
+ `python-dotenv` is not required by the SDK, but it is a convenient way to
84
+ manage keys during local development.
85
+
86
+ ```bash
87
+ pip install python-dotenv
88
+ ```
89
+
90
+ Create a `.env` file at the project root (see `.env.example`):
91
+
92
+ ```dotenv
93
+ # .env
94
+ DV_API_KEY=dv_...
95
+ OPENAI_API_KEY=sk-...
96
+ GROQ_API_KEY=gsk_...
97
+ ```
98
+
99
+ Then load it at the top of your script, **before** constructing `PipelineConfig`:
100
+
101
+ ```python
102
+ from dotenv import load_dotenv
103
+ load_dotenv() # reads .env into os.environ
104
+
105
+ import os
106
+ from deepvariance.pipelines.ml import MLPipeline
107
+ from deepvariance.typings import PipelineConfig
108
+
109
+ config = PipelineConfig(
110
+ dv_api_key=os.getenv("DV_API_KEY"),
111
+ openai_api_key=os.getenv("OPENAI_API_KEY"),
112
+ )
113
+ ```
114
+
115
+ > **Never commit your `.env` file.** Add it to `.gitignore`:
116
+ > ```
117
+ > .env
118
+ > ```
119
+
120
+ ---
121
+
122
+ ## Quickstart
123
+
124
+ ```python
125
+ import os
126
+ import pandas as pd
127
+
128
+ from deepvariance.pipelines.ml import MLPipeline
129
+ from deepvariance.typings import PipelineConfig
130
+
131
+ # 1. Load your data
132
+ data = pd.read_csv("your_dataset.csv")
133
+
134
+ # 2. Configure
135
+ config = PipelineConfig(
136
+ dv_api_key=os.getenv("DV_API_KEY"),
137
+ openai_api_key=os.getenv("OPENAI_API_KEY"),
138
+ groq_api_key=os.getenv("GROQ_API_KEY"),
139
+ sample_percentage=0.1, # train on a 10% stratified sample
140
+ )
141
+
142
+ # 3. Run
143
+ pipeline = MLPipeline(config=config)
144
+ result = pipeline.run(data, target="your_target_column")
145
+
146
+ # 4. Inspect results
147
+ print(result["metrics"])
148
+ print(result["leaderboard"])
149
+ ```
150
+
151
+ Run the bundled examples directly:
152
+
153
+ ```bash
154
+ # Binary classification — Australia weather dataset
155
+ .venv/bin/python examples/ml_quickstart.py
156
+
157
+ # Regression — medical insurance dataset
158
+ .venv/bin/python examples/insurance_regression.py
159
+ ```
160
+
161
+ ---
162
+
163
+ ## Pipeline output
164
+
165
+ `pipeline.run()` returns a dict:
166
+
167
+ | Key | Type | Description |
168
+ | -------------------- | ---------------------- | --------------------------------------------------- |
169
+ | `metrics` | `dict[str, float]` | Accuracy, F1, ROC-AUC, RMSE, R², … (task-dependent) |
170
+ | `model` | `TabularPredictor` | Trained AutoGluon predictor |
171
+ | `leaderboard` | `pd.DataFrame` | All candidate models ranked by validation score |
172
+ | `feature_importance` | `pd.DataFrame \| None` | Feature importance scores from the best model |
173
+ | `run_stats` | `dict` | Wall-clock duration and peak memory per layer |
174
+
175
+ ### Classification metrics
176
+
177
+ `accuracy`, `f1_macro`, `f1_weighted`, `precision_macro`, `precision_weighted`, `recall_macro`, `recall_weighted`, `cohen_kappa`, `mcc`, `roc_auc` (binary) / `roc_auc_ovr` (multiclass), `log_loss`
178
+
179
+ ### Regression metrics
180
+
181
+ `rmse`, `mae`, `r2`, `median_ae`, `max_error`, `explained_var`, `mape`
182
+
183
+ ---
184
+
185
+ ## PipelineConfig reference
186
+
187
+ ```python
188
+ @dataclass
189
+ class PipelineConfig:
190
+ dv_api_key: str | None = None # DeepVariance API key (or set DV_API_KEY env var)
191
+ openai_api_key: str | None = None # OpenAI API key
192
+ groq_api_key: str | None = None # Groq API key (fallback)
193
+ sample_percentage: float | None = None # e.g. 0.1 → 10% sample fed to AutoGluon
194
+ extra: dict[str, Any] = field(default_factory=dict) # pipeline-specific overrides
195
+ ```
196
+
197
+ `sample_percentage` controls the fraction of rows passed to AutoGluon after the LLM sampling stage. For large datasets (> 100k rows) a value of `0.1`–`0.2` keeps training fast while preserving distribution.
198
+
199
+ ---
200
+
201
+ ## Progress callbacks
202
+
203
+ Pass an `on_progress` callable to get real-time stage updates:
204
+
205
+ ```python
206
+ def on_progress(stage: str, status: str) -> None:
207
+ # stage — e.g. "AutoCastLayer", "ModelTrainingLayer"
208
+ # status — "start" | "complete" | "error"
209
+ icon = {"start": "▶", "complete": "✓", "error": "✗"}.get(status, "·")
210
+ print(f" {icon} {stage}: {status}")
211
+
212
+ result = pipeline.run(data, target="label", on_progress=on_progress)
213
+ ```
214
+
215
+ ---
216
+
217
+ ## Build
218
+
219
+ The release wheel compiles all source to native C extensions via Cython —
220
+ no Python source is included in the distributed package.
221
+
222
+ ```bash
223
+ # Install build dependencies (one-time)
224
+ uv pip install -e ".[dev]"
225
+
226
+ # Compile extensions in-place (for local dev / running tests against .so)
227
+ just build-ext
228
+
229
+ # Build a release wheel (compiled .so only, no .py source)
230
+ just build-wheel
231
+ # → dist/deepvariance-0.1.0-cp312-cp312-macosx_arm64.whl
232
+ ```
233
+
234
+ For CI, build on each target platform (macOS arm64, Linux x86_64) and upload
235
+ all wheels to PyPI so users get the right binary for their machine.
236
+
237
+ ---
238
+
239
+ ## Documentation
240
+
241
+ The project now includes **Sphinx-based documentation** under the `docs/` directory. To build the HTML locally:
242
+
243
+ ```bash
244
+ # install docs dependencies (optional group)
245
+ uv pip install -e ".[docs]" # or use pip/poetry/uv manually
246
+ cd docs
247
+ make html # requires make; or run `sphinx-build -b html . _build/html`
248
+ ```
249
+
250
+ The generated site will appear in `docs/_build/html/index.html`.
251
+
252
+ See `docs/quickstart.rst` for a getting‑started guide and `docs/api.rst` for
253
+ an auto‑generated API reference.
254
+
255
+ ## Development
256
+
257
+ ```bash
258
+ # Run tests
259
+ .venv/bin/python -m pytest tests/ -q
260
+
261
+ # Lint
262
+ .venv/bin/ruff check src/ tests/
263
+
264
+ # Format
265
+ .venv/bin/ruff format src/ tests/
266
+ ```
267
+
268
+ All lint rules are configured in `pyproject.toml` under `[tool.ruff]`.
@@ -0,0 +1,81 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "cython>=3.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [tool.setuptools.packages.find]
6
+ where = ["src"]
7
+
8
+ [tool.setuptools.package-data]
9
+ "*" = []
10
+
11
+ [project]
12
+ name = "deepvariance-sdk"
13
+ version = "0.1.0"
14
+ description = "DeepVariance Python AutoML SDK — LLM-driven pipelines for tabular ML and image classification"
15
+ readme = "README.md"
16
+ requires-python = ">=3.12"
17
+ license = "LicenseRef-proprietary"
18
+ license-files = ["LICENSE"]
19
+ authors = [
20
+ { name = "Deep Variance Dev Team" },
21
+ ]
22
+ keywords = ["automl", "llm", "machine-learning", "deep-learning", "autogluon", "pytorch"]
23
+ classifiers = [
24
+ "Development Status :: 4 - Beta",
25
+ "Intended Audience :: Developers",
26
+ "Intended Audience :: Science/Research",
27
+ "Programming Language :: Python :: 3",
28
+ "Programming Language :: Python :: 3.12",
29
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
30
+ "Typing :: Typed",
31
+ ]
32
+ dependencies = [
33
+ "pandas>=2.0",
34
+ "numpy>=1.24",
35
+ "scipy>=1.10",
36
+ "scikit-learn>=1.3",
37
+ "psutil>=5.9",
38
+ "openai>=1.0",
39
+ "groq>=0.9",
40
+ "autogluon.tabular>=1.0",
41
+ "torch>=2.0",
42
+ "torchvision>=0.15",
43
+ ]
44
+
45
+ [project.urls]
46
+ "Homepage" = "https://deepvariance.com"
47
+
48
+ [project.optional-dependencies]
49
+ dev = ["pytest>=7", "pytest-cov", "ruff>=0.4", "cython>=3.0"]
50
+ docs = ["sphinx>=6.2", "sphinx-autodoc-typehints>=1.23"]
51
+
52
+ # Ruff — lint + format
53
+ [tool.ruff]
54
+ target-version = "py312"
55
+ line-length = 100
56
+
57
+ [tool.ruff.lint]
58
+ select = [
59
+ "E", # pycodestyle errors
60
+ "W", # pycodestyle warnings
61
+ "F", # pyflakes (unused imports, undefined names, …)
62
+ "I", # isort
63
+ "B", # flake8-bugbear
64
+ "C4", # flake8-comprehensions
65
+ "SIM", # flake8-simplify
66
+ "RUF", # ruff-specific rules
67
+ ]
68
+ ignore = [
69
+ "E501", # line too long — formatter handles wrapping; long strings are intentional
70
+ "SIM108", # ternary operator — explicit if/else is clearer in agent prompts
71
+ "B904", # raise ... from exc — already done; don't force on every bare raise
72
+ "RUF001", # ambiguous Unicode — EN DASH in system-prompt strings is intentional
73
+ ]
74
+
75
+ [tool.ruff.lint.per-file-ignores]
76
+ "tests/**" = ["S101"] # allow bare assert in tests
77
+
78
+ [tool.ruff.format]
79
+ quote-style = "double"
80
+ indent-style = "space"
81
+ skip-magic-trailing-comma = false
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+