orbital 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- orbital-0.2.2/LICENSE.md +21 -0
- orbital-0.2.2/PKG-INFO +243 -0
- orbital-0.2.2/README.md +197 -0
- orbital-0.2.2/pyproject.toml +116 -0
- orbital-0.2.2/setup.cfg +4 -0
- orbital-0.2.2/src/orbital/__init__.py +14 -0
- orbital-0.2.2/src/orbital/_utils/__init__.py +0 -0
- orbital-0.2.2/src/orbital/_utils/onnx.py +39 -0
- orbital-0.2.2/src/orbital/_utils/repr_pipeline.py +79 -0
- orbital-0.2.2/src/orbital/ast.py +135 -0
- orbital-0.2.2/src/orbital/sql.py +81 -0
- orbital-0.2.2/src/orbital/translate.py +223 -0
- orbital-0.2.2/src/orbital/translation/__init__.py +1 -0
- orbital-0.2.2/src/orbital/translation/optimizer.py +313 -0
- orbital-0.2.2/src/orbital/translation/steps/__init__.py +1 -0
- orbital-0.2.2/src/orbital/translation/steps/add.py +67 -0
- orbital-0.2.2/src/orbital/translation/steps/argmax.py +79 -0
- orbital-0.2.2/src/orbital/translation/steps/arrayfeatureextractor.py +82 -0
- orbital-0.2.2/src/orbital/translation/steps/cast.py +100 -0
- orbital-0.2.2/src/orbital/translation/steps/concat.py +105 -0
- orbital-0.2.2/src/orbital/translation/steps/div.py +90 -0
- orbital-0.2.2/src/orbital/translation/steps/gather.py +57 -0
- orbital-0.2.2/src/orbital/translation/steps/identity.py +19 -0
- orbital-0.2.2/src/orbital/translation/steps/imputer.py +37 -0
- orbital-0.2.2/src/orbital/translation/steps/labelencoder.py +75 -0
- orbital-0.2.2/src/orbital/translation/steps/linearclass.py +107 -0
- orbital-0.2.2/src/orbital/translation/steps/linearreg.py +86 -0
- orbital-0.2.2/src/orbital/translation/steps/matmul.py +151 -0
- orbital-0.2.2/src/orbital/translation/steps/mul.py +67 -0
- orbital-0.2.2/src/orbital/translation/steps/onehotencoder.py +48 -0
- orbital-0.2.2/src/orbital/translation/steps/reshape.py +48 -0
- orbital-0.2.2/src/orbital/translation/steps/scaler.py +69 -0
- orbital-0.2.2/src/orbital/translation/steps/softmax.py +74 -0
- orbital-0.2.2/src/orbital/translation/steps/sub.py +70 -0
- orbital-0.2.2/src/orbital/translation/steps/trees/__init__.py +6 -0
- orbital-0.2.2/src/orbital/translation/steps/trees/classifier.py +212 -0
- orbital-0.2.2/src/orbital/translation/steps/trees/regressor.py +93 -0
- orbital-0.2.2/src/orbital/translation/steps/trees/tree.py +175 -0
- orbital-0.2.2/src/orbital/translation/steps/where.py +76 -0
- orbital-0.2.2/src/orbital/translation/steps/zipmap.py +54 -0
- orbital-0.2.2/src/orbital/translation/translator.py +128 -0
- orbital-0.2.2/src/orbital/translation/variables.py +204 -0
- orbital-0.2.2/src/orbital/types.py +222 -0
- orbital-0.2.2/src/orbital.egg-info/PKG-INFO +243 -0
- orbital-0.2.2/src/orbital.egg-info/SOURCES.txt +52 -0
- orbital-0.2.2/src/orbital.egg-info/dependency_links.txt +1 -0
- orbital-0.2.2/src/orbital.egg-info/requires.txt +17 -0
- orbital-0.2.2/src/orbital.egg-info/top_level.txt +1 -0
- orbital-0.2.2/tests/test_ast.py +90 -0
- orbital-0.2.2/tests/test_pipeline_e2e.py +560 -0
- orbital-0.2.2/tests/test_pipeline_str.py +53 -0
- orbital-0.2.2/tests/test_sql.py +133 -0
- orbital-0.2.2/tests/test_translation.py +42 -0
- orbital-0.2.2/tests/test_types.py +78 -0
orbital-0.2.2/LICENSE.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 orbital authors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
orbital-0.2.2/PKG-INFO
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: orbital
|
|
3
|
+
Version: 0.2.2
|
|
4
|
+
Summary: Allow SKLearn predictions to run on database systems in pure SQL.
|
|
5
|
+
Author: Posit Software PBC
|
|
6
|
+
Author-email: Alessandro Molina <alessandro.molina@posit.co>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Keywords: database,machine learning,sql
|
|
9
|
+
Classifier: Operating System :: Microsoft :: Windows
|
|
10
|
+
Classifier: Operating System :: POSIX
|
|
11
|
+
Classifier: Operating System :: Unix
|
|
12
|
+
Classifier: Operating System :: MacOS
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Intended Audience :: Science/Research
|
|
20
|
+
Classifier: Intended Audience :: Developers
|
|
21
|
+
Classifier: Topic :: Software Development
|
|
22
|
+
Classifier: Topic :: Scientific/Engineering
|
|
23
|
+
Classifier: Topic :: Database
|
|
24
|
+
Classifier: Programming Language :: Python
|
|
25
|
+
Classifier: Programming Language :: SQL
|
|
26
|
+
Requires-Python: >=3.9
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE.md
|
|
29
|
+
Requires-Dist: packaging
|
|
30
|
+
Requires-Dist: scikit-learn
|
|
31
|
+
Requires-Dist: skl2onnx~=1.19.1
|
|
32
|
+
Requires-Dist: onnx~=1.18.0
|
|
33
|
+
Requires-Dist: ibis-framework<11.0.0
|
|
34
|
+
Provides-Extra: test
|
|
35
|
+
Requires-Dist: pandas; extra == "test"
|
|
36
|
+
Requires-Dist: polars-lts-cpu; extra == "test"
|
|
37
|
+
Requires-Dist: pyarrow>=19.0.1; extra == "test"
|
|
38
|
+
Requires-Dist: pyarrow-hotfix; extra == "test"
|
|
39
|
+
Requires-Dist: ibis-framework[duckdb]>=5.1.0; extra == "test"
|
|
40
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == "test"
|
|
41
|
+
Requires-Dist: pytest>=8.3.2; extra == "test"
|
|
42
|
+
Requires-Dist: sqlalchemy; extra == "test"
|
|
43
|
+
Requires-Dist: psycopg2-binary; extra == "test"
|
|
44
|
+
Requires-Dist: duckdb; extra == "test"
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
|
|
47
|
+
# orbital
|
|
48
|
+
|
|
49
|
+
Convert SKLearn pipelines into SQL queries for execution in a database
|
|
50
|
+
without the need for a Python environment.
|
|
51
|
+
|
|
52
|
+
See `examples` directory for [example pipelines](https://github.com/posit-dev/orbital/tree/main/examples) and [Documentation](https://posit-dev.github.io/orbital/)
|
|
53
|
+
|
|
54
|
+
**Warning**:
|
|
55
|
+
|
|
56
|
+
This is a work in progress.
|
|
57
|
+
You might encounter bugs or missing features.
|
|
58
|
+
|
|
59
|
+
**Note**:
|
|
60
|
+
|
|
61
|
+
Not all transformations and models can be represented as SQL queries,
|
|
62
|
+
so orbital might not be able to implement the specific pipeline you are using.
|
|
63
|
+
|
|
64
|
+
## Getting Started
|
|
65
|
+
|
|
66
|
+
Install orbital:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
$ pip install orbital
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Prepare some data:
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from sklearn.datasets import load_iris
|
|
76
|
+
from sklearn.model_selection import train_test_split
|
|
77
|
+
|
|
78
|
+
COLUMNS = ["sepal.length", "sepal.width", "petal.length", "petal.width"]
|
|
79
|
+
|
|
80
|
+
iris = load_iris(as_frame=True)
|
|
81
|
+
iris_x = iris.data.set_axis(COLUMNS, axis=1)
|
|
82
|
+
|
|
83
|
+
# SQL and orbital don't like dots in column names, replace them with underscores
|
|
84
|
+
iris_x.columns = COLUMNS = [cname.replace(".", "_") for cname in COLUMNS]
|
|
85
|
+
|
|
86
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
87
|
+
iris_x, iris.target, test_size=0.2, random_state=42
|
|
88
|
+
)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Define a Scikit-Learn pipeline and train it:
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from sklearn.compose import ColumnTransformer
|
|
95
|
+
from sklearn.linear_model import LinearRegression
|
|
96
|
+
from sklearn.pipeline import Pipeline
|
|
97
|
+
from sklearn.preprocessing import StandardScaler
|
|
98
|
+
|
|
99
|
+
pipeline = Pipeline(
|
|
100
|
+
[
|
|
101
|
+
("preprocess", ColumnTransformer([("scaler", StandardScaler(with_std=False), COLUMNS)],
|
|
102
|
+
remainder="passthrough")),
|
|
103
|
+
("linear_regression", LinearRegression()),
|
|
104
|
+
]
|
|
105
|
+
)
|
|
106
|
+
pipeline.fit(X_train, y_train)
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Convert the pipeline to orbital:
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
import orbital
|
|
113
|
+
import orbital.types
|
|
114
|
+
|
|
115
|
+
orbital_pipeline = orbital.parse_pipeline(pipeline, features={
|
|
116
|
+
"sepal_length": orbital.types.DoubleColumnType(),
|
|
117
|
+
"sepal_width": orbital.types.DoubleColumnType(),
|
|
118
|
+
"petal_length": orbital.types.DoubleColumnType(),
|
|
119
|
+
"petal_width": orbital.types.DoubleColumnType(),
|
|
120
|
+
})
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
You can print the pipeline to see the result:
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
>>> print(orbital_pipeline)
|
|
127
|
+
|
|
128
|
+
ParsedPipeline(
|
|
129
|
+
features={
|
|
130
|
+
sepal_length: DoubleColumnType()
|
|
131
|
+
sepal_width: DoubleColumnType()
|
|
132
|
+
petal_length: DoubleColumnType()
|
|
133
|
+
petal_width: DoubleColumnType()
|
|
134
|
+
},
|
|
135
|
+
steps=[
|
|
136
|
+
merged_columns=Concat(
|
|
137
|
+
inputs: sepal_length, sepal_width, petal_length, petal_width,
|
|
138
|
+
attributes:
|
|
139
|
+
axis=1
|
|
140
|
+
)
|
|
141
|
+
variable1=Sub(
|
|
142
|
+
inputs: merged_columns, Su_Subcst=[5.809166666666666, 3.0616666666666665, 3.7266666666666666, 1.18333333...,
|
|
143
|
+
attributes:
|
|
144
|
+
)
|
|
145
|
+
multiplied=MatMul(
|
|
146
|
+
inputs: variable1, coef=[-0.11633479416518255, -0.05977785171980231, 0.25491374699772246, 0.5475959...,
|
|
147
|
+
attributes:
|
|
148
|
+
)
|
|
149
|
+
resh=Add(
|
|
150
|
+
inputs: multiplied, intercept=[0.9916666666666668],
|
|
151
|
+
attributes:
|
|
152
|
+
)
|
|
153
|
+
variable=Reshape(
|
|
154
|
+
inputs: resh, shape_tensor=[-1, 1],
|
|
155
|
+
attributes:
|
|
156
|
+
)
|
|
157
|
+
],
|
|
158
|
+
)
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Now we can generate the SQL from the pipeline:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
sql = orbital.export_sql("DATA_TABLE", orbital_pipeline, dialect="duckdb")
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
And check the resulting query:
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
>>> print(sql)
|
|
171
|
+
|
|
172
|
+
SELECT ("t0"."sepal_length" - 5.809166666666666) * -0.11633479416518255 + 0.9916666666666668 +
|
|
173
|
+
("t0"."sepal_width" - 3.0616666666666665) * -0.05977785171980231 +
|
|
174
|
+
("t0"."petal_length" - 3.7266666666666666) * 0.25491374699772246 +
|
|
175
|
+
("t0"."petal_width" - 1.1833333333333333) * 0.5475959809777828
|
|
176
|
+
AS "variable" FROM "DATA_TABLE" AS "t0"
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Once the SQL is generated, you can use it to run the pipeline on a
|
|
180
|
+
database. From here on the SQL can be exported and reused in other
|
|
181
|
+
places:
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
>>> print("\nPrediction with SQL")
|
|
185
|
+
>>> duckdb.register("DATA_TABLE", X_test)
|
|
186
|
+
>>> print(duckdb.sql(sql).df()["variable"][:5].to_numpy())
|
|
187
|
+
|
|
188
|
+
Prediction with SQL
|
|
189
|
+
[ 1.23071715 -0.04010441 2.21970287 1.34966889 1.28429336]
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
We can verify that the prediction matches the one done by Scikit-Learn
|
|
193
|
+
by running the scikit-learn pipeline on the same set of data:
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
>>> print("\nPrediction with SciKit-Learn")
|
|
197
|
+
>>> print(pipeline.predict(X_test)[:5])
|
|
198
|
+
|
|
199
|
+
Prediction with SciKit-Learn
|
|
200
|
+
[ 1.23071715 -0.04010441 2.21970287 1.34966889 1.28429336 ]
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
## Supported Models
|
|
204
|
+
|
|
205
|
+
orbital currently supports the following models:
|
|
206
|
+
|
|
207
|
+
- Linear Regression
|
|
208
|
+
- Logistic Regression
|
|
209
|
+
- Lasso Regression
|
|
210
|
+
- Elastic Net
|
|
211
|
+
- Decision Tree Regressor
|
|
212
|
+
- Decision Tree Classifier
|
|
213
|
+
- Random Forest Classifier
|
|
214
|
+
- Gradient Boosting Regressor
|
|
215
|
+
- Gradient Boosting Classifier
|
|
216
|
+
|
|
217
|
+
# Testing
|
|
218
|
+
|
|
219
|
+
Setup testing environment:
|
|
220
|
+
|
|
221
|
+
```bash
|
|
222
|
+
$ uv sync --no-dev --extra test
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
Run Tests:
|
|
226
|
+
|
|
227
|
+
```bash
|
|
228
|
+
$ uv run pytest -v
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
Try Examples:
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
$ uv run examples/pipeline_lineareg.py
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
# Development
|
|
238
|
+
|
|
239
|
+
Setup a development environment:
|
|
240
|
+
|
|
241
|
+
```bash
|
|
242
|
+
$ uv sync
|
|
243
|
+
```
|
orbital-0.2.2/README.md
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
# orbital
|
|
2
|
+
|
|
3
|
+
Convert SKLearn pipelines into SQL queries for execution in a database
|
|
4
|
+
without the need for a Python environment.
|
|
5
|
+
|
|
6
|
+
See `examples` directory for [example pipelines](https://github.com/posit-dev/orbital/tree/main/examples) and [Documentation](https://posit-dev.github.io/orbital/)
|
|
7
|
+
|
|
8
|
+
**Warning**:
|
|
9
|
+
|
|
10
|
+
This is a work in progress.
|
|
11
|
+
You might encounter bugs or missing features.
|
|
12
|
+
|
|
13
|
+
**Note**:
|
|
14
|
+
|
|
15
|
+
Not all transformations and models can be represented as SQL queries,
|
|
16
|
+
so orbital might not be able to implement the specific pipeline you are using.
|
|
17
|
+
|
|
18
|
+
## Getting Started
|
|
19
|
+
|
|
20
|
+
Install orbital:
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
$ pip install orbital
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Prepare some data:
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from sklearn.datasets import load_iris
|
|
30
|
+
from sklearn.model_selection import train_test_split
|
|
31
|
+
|
|
32
|
+
COLUMNS = ["sepal.length", "sepal.width", "petal.length", "petal.width"]
|
|
33
|
+
|
|
34
|
+
iris = load_iris(as_frame=True)
|
|
35
|
+
iris_x = iris.data.set_axis(COLUMNS, axis=1)
|
|
36
|
+
|
|
37
|
+
# SQL and orbital don't like dots in column names, replace them with underscores
|
|
38
|
+
iris_x.columns = COLUMNS = [cname.replace(".", "_") for cname in COLUMNS]
|
|
39
|
+
|
|
40
|
+
X_train, X_test, y_train, y_test = train_test_split(
|
|
41
|
+
iris_x, iris.target, test_size=0.2, random_state=42
|
|
42
|
+
)
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
Define a Scikit-Learn pipeline and train it:
|
|
46
|
+
|
|
47
|
+
```python
|
|
48
|
+
from sklearn.compose import ColumnTransformer
|
|
49
|
+
from sklearn.linear_model import LinearRegression
|
|
50
|
+
from sklearn.pipeline import Pipeline
|
|
51
|
+
from sklearn.preprocessing import StandardScaler
|
|
52
|
+
|
|
53
|
+
pipeline = Pipeline(
|
|
54
|
+
[
|
|
55
|
+
("preprocess", ColumnTransformer([("scaler", StandardScaler(with_std=False), COLUMNS)],
|
|
56
|
+
remainder="passthrough")),
|
|
57
|
+
("linear_regression", LinearRegression()),
|
|
58
|
+
]
|
|
59
|
+
)
|
|
60
|
+
pipeline.fit(X_train, y_train)
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
Convert the pipeline to orbital:
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
import orbital
|
|
67
|
+
import orbital.types
|
|
68
|
+
|
|
69
|
+
orbital_pipeline = orbital.parse_pipeline(pipeline, features={
|
|
70
|
+
"sepal_length": orbital.types.DoubleColumnType(),
|
|
71
|
+
"sepal_width": orbital.types.DoubleColumnType(),
|
|
72
|
+
"petal_length": orbital.types.DoubleColumnType(),
|
|
73
|
+
"petal_width": orbital.types.DoubleColumnType(),
|
|
74
|
+
})
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
You can print the pipeline to see the result:
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
>>> print(orbital_pipeline)
|
|
81
|
+
|
|
82
|
+
ParsedPipeline(
|
|
83
|
+
features={
|
|
84
|
+
sepal_length: DoubleColumnType()
|
|
85
|
+
sepal_width: DoubleColumnType()
|
|
86
|
+
petal_length: DoubleColumnType()
|
|
87
|
+
petal_width: DoubleColumnType()
|
|
88
|
+
},
|
|
89
|
+
steps=[
|
|
90
|
+
merged_columns=Concat(
|
|
91
|
+
inputs: sepal_length, sepal_width, petal_length, petal_width,
|
|
92
|
+
attributes:
|
|
93
|
+
axis=1
|
|
94
|
+
)
|
|
95
|
+
variable1=Sub(
|
|
96
|
+
inputs: merged_columns, Su_Subcst=[5.809166666666666, 3.0616666666666665, 3.7266666666666666, 1.18333333...,
|
|
97
|
+
attributes:
|
|
98
|
+
)
|
|
99
|
+
multiplied=MatMul(
|
|
100
|
+
inputs: variable1, coef=[-0.11633479416518255, -0.05977785171980231, 0.25491374699772246, 0.5475959...,
|
|
101
|
+
attributes:
|
|
102
|
+
)
|
|
103
|
+
resh=Add(
|
|
104
|
+
inputs: multiplied, intercept=[0.9916666666666668],
|
|
105
|
+
attributes:
|
|
106
|
+
)
|
|
107
|
+
variable=Reshape(
|
|
108
|
+
inputs: resh, shape_tensor=[-1, 1],
|
|
109
|
+
attributes:
|
|
110
|
+
)
|
|
111
|
+
],
|
|
112
|
+
)
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
Now we can generate the SQL from the pipeline:
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
sql = orbital.export_sql("DATA_TABLE", orbital_pipeline, dialect="duckdb")
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
And check the resulting query:
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
>>> print(sql)
|
|
125
|
+
|
|
126
|
+
SELECT ("t0"."sepal_length" - 5.809166666666666) * -0.11633479416518255 + 0.9916666666666668 +
|
|
127
|
+
("t0"."sepal_width" - 3.0616666666666665) * -0.05977785171980231 +
|
|
128
|
+
("t0"."petal_length" - 3.7266666666666666) * 0.25491374699772246 +
|
|
129
|
+
("t0"."petal_width" - 1.1833333333333333) * 0.5475959809777828
|
|
130
|
+
AS "variable" FROM "DATA_TABLE" AS "t0"
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
Once the SQL is generated, you can use it to run the pipeline on a
|
|
134
|
+
database. From here on the SQL can be exported and reused in other
|
|
135
|
+
places:
|
|
136
|
+
|
|
137
|
+
```python
|
|
138
|
+
>>> print("\nPrediction with SQL")
|
|
139
|
+
>>> duckdb.register("DATA_TABLE", X_test)
|
|
140
|
+
>>> print(duckdb.sql(sql).df()["variable"][:5].to_numpy())
|
|
141
|
+
|
|
142
|
+
Prediction with SQL
|
|
143
|
+
[ 1.23071715 -0.04010441 2.21970287 1.34966889 1.28429336]
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
We can verify that the prediction matches the one done by Scikit-Learn
|
|
147
|
+
by running the scikit-learn pipeline on the same set of data:
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
>>> print("\nPrediction with SciKit-Learn")
|
|
151
|
+
>>> print(pipeline.predict(X_test)[:5])
|
|
152
|
+
|
|
153
|
+
Prediction with SciKit-Learn
|
|
154
|
+
[ 1.23071715 -0.04010441 2.21970287 1.34966889 1.28429336 ]
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## Supported Models
|
|
158
|
+
|
|
159
|
+
orbital currently supports the following models:
|
|
160
|
+
|
|
161
|
+
- Linear Regression
|
|
162
|
+
- Logistic Regression
|
|
163
|
+
- Lasso Regression
|
|
164
|
+
- Elastic Net
|
|
165
|
+
- Decision Tree Regressor
|
|
166
|
+
- Decision Tree Classifier
|
|
167
|
+
- Random Forest Classifier
|
|
168
|
+
- Gradient Boosting Regressor
|
|
169
|
+
- Gradient Boosting Classifier
|
|
170
|
+
|
|
171
|
+
# Testing
|
|
172
|
+
|
|
173
|
+
Setup testing environment:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
$ uv sync --no-dev --extra test
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Run Tests:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
$ uv run pytest -v
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Try Examples:
|
|
186
|
+
|
|
187
|
+
```bash
|
|
188
|
+
$ uv run examples/pipeline_lineareg.py
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
# Development
|
|
192
|
+
|
|
193
|
+
Setup a development environment:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
$ uv sync
|
|
197
|
+
```
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = [
|
|
3
|
+
"setuptools>=77.0.0",
|
|
4
|
+
"wheel"
|
|
5
|
+
]
|
|
6
|
+
build-backend = "setuptools.build_meta"
|
|
7
|
+
|
|
8
|
+
[project]
|
|
9
|
+
name = "orbital"
|
|
10
|
+
version = "0.2.2"
|
|
11
|
+
description = "Allow SKLearn predictions to run on database systems in pure SQL."
|
|
12
|
+
keywords = ["database", "machine learning", "sql"]
|
|
13
|
+
readme = { file = "README.md", content-type = "text/markdown" }
|
|
14
|
+
license = "MIT"
|
|
15
|
+
authors = [
|
|
16
|
+
{ name = "Alessandro Molina", email = "alessandro.molina@posit.co" },
|
|
17
|
+
{ name = "Posit Software PBC" }
|
|
18
|
+
]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Operating System :: Microsoft :: Windows",
|
|
21
|
+
"Operating System :: POSIX",
|
|
22
|
+
"Operating System :: Unix",
|
|
23
|
+
"Operating System :: MacOS",
|
|
24
|
+
"Programming Language :: Python :: 3",
|
|
25
|
+
"Programming Language :: Python :: 3.9",
|
|
26
|
+
"Programming Language :: Python :: 3.10",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Programming Language :: Python :: 3.13",
|
|
30
|
+
"Intended Audience :: Science/Research",
|
|
31
|
+
"Intended Audience :: Developers",
|
|
32
|
+
"Topic :: Software Development",
|
|
33
|
+
"Topic :: Scientific/Engineering",
|
|
34
|
+
"Topic :: Database",
|
|
35
|
+
"Programming Language :: Python",
|
|
36
|
+
"Programming Language :: SQL",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
requires-python = ">=3.9"
|
|
40
|
+
dependencies = [
|
|
41
|
+
"packaging", # somehow required by skl2onnx/operator_converters
|
|
42
|
+
"scikit-learn",
|
|
43
|
+
"skl2onnx~=1.19.1",
|
|
44
|
+
"onnx~=1.18.0",
|
|
45
|
+
"ibis-framework<11.0.0",
|
|
46
|
+
]
|
|
47
|
+
|
|
48
|
+
[project.optional-dependencies]
|
|
49
|
+
test = [
|
|
50
|
+
"pandas",
|
|
51
|
+
"polars-lts-cpu", # For testing we prefer compatibility over performance.
|
|
52
|
+
"pyarrow>=19.0.1",
|
|
53
|
+
"pyarrow-hotfix", # Ibis seems to demand this even on versions without hotfixes
|
|
54
|
+
"ibis-framework[duckdb]>=5.1.0",
|
|
55
|
+
"pytest-cov>=5.0.0",
|
|
56
|
+
"pytest>=8.3.2",
|
|
57
|
+
"sqlalchemy",
|
|
58
|
+
"psycopg2-binary",
|
|
59
|
+
"duckdb",
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
[tool.uv]
|
|
63
|
+
dev-dependencies = [
|
|
64
|
+
# Need to repeat test dependencies
|
|
65
|
+
# uv doesn't allow to merge them with extras
|
|
66
|
+
# and we don't want users to know that they need to run
|
|
67
|
+
# uv sync with extras.
|
|
68
|
+
"pandas",
|
|
69
|
+
"polars-lts-cpu",
|
|
70
|
+
"pyarrow",
|
|
71
|
+
"pyarrow-hotfix",
|
|
72
|
+
"pytest-cov>=5.0.0",
|
|
73
|
+
"pytest>=8.3.2",
|
|
74
|
+
"ibis-framework[duckdb]>=5.1.0",
|
|
75
|
+
# Necessary for development workflow
|
|
76
|
+
"mypy>=1.11.2",
|
|
77
|
+
"pre-commit",
|
|
78
|
+
"ruff>=0.6.3",
|
|
79
|
+
"mkdocs-material",
|
|
80
|
+
"mkdocstrings[python]",
|
|
81
|
+
"pydot",
|
|
82
|
+
"onnxruntime",
|
|
83
|
+
"onnxscript",
|
|
84
|
+
"sqlalchemy",
|
|
85
|
+
"psycopg2-binary",
|
|
86
|
+
"duckdb",
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
[tool.setuptools.packages.find]
|
|
91
|
+
where = ["src"]
|
|
92
|
+
exclude = ["tests"]
|
|
93
|
+
|
|
94
|
+
[tool.pytest.ini_options]
|
|
95
|
+
addopts = "--doctest-modules --cov=src --cov-report term-missing"
|
|
96
|
+
testpaths = ["tests"]
|
|
97
|
+
|
|
98
|
+
[tool.mypy]
|
|
99
|
+
ignore_missing_imports = true
|
|
100
|
+
mypy_path = "$MYPY_CONFIG_FILE_DIR/src"
|
|
101
|
+
packages = ["orbital"]
|
|
102
|
+
|
|
103
|
+
[tool.ruff]
|
|
104
|
+
target-version = "py39"
|
|
105
|
+
src = ["src"]
|
|
106
|
+
extend-exclude = ["docs", "tests", "examples", "proxypackage"]
|
|
107
|
+
output-format = "grouped"
|
|
108
|
+
lint.select = [
|
|
109
|
+
"E4", "E7", "E9",
|
|
110
|
+
"F",
|
|
111
|
+
"I001",
|
|
112
|
+
"D100", "D101", "D102", "D103",
|
|
113
|
+
"D104", "D107", "D417",
|
|
114
|
+
"ANN001", "ANN201", "ANN202",
|
|
115
|
+
"ANN204", "ANN205", "ANN206"
|
|
116
|
+
]
|
orbital-0.2.2/setup.cfg
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""orbital, translate scikit-learn pipelines into SQL queries
|
|
2
|
+
|
|
3
|
+
orbital is a library for translating **scikit-learn** pipelines
|
|
4
|
+
into **SQL queries** and **Ibis expressions**.
|
|
5
|
+
|
|
6
|
+
It provides a way to execute machine learning models on databases without
|
|
7
|
+
the need for a python runtime environment.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .ast import parse_pipeline
|
|
11
|
+
from .sql import export_sql
|
|
12
|
+
from .translate import ResultsProjection, translate
|
|
13
|
+
|
|
14
|
+
__all__ = ["parse_pipeline", "translate", "export_sql", "ResultsProjection"]
|
|
File without changes
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import typing
|
|
2
|
+
|
|
3
|
+
import onnx
|
|
4
|
+
import onnx.helper
|
|
5
|
+
|
|
6
|
+
ListVariableTypes = typing.Union[list[int], list[float], list[str]]
|
|
7
|
+
VariableTypes = typing.Union[float, int, str, ListVariableTypes]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def get_initializer_data(var: onnx.TensorProto) -> VariableTypes:
    """Return the value held by a constant initializer tensor.

    The tensor's entries are read from the typed repeated field that
    matches its dtype. A dimensionless single-entry tensor is treated
    as a scalar and unwrapped; anything else comes back as a list.
    """
    # Map the dtype to the matching repeated field on the proto
    # (e.g. float_data, int64_data) and copy its entries out.
    field_name = onnx.helper.tensor_dtype_to_field(var.data_type)
    entries = list(getattr(var, field_name))

    # No dims and exactly one entry means the tensor is a scalar,
    # so return the bare value rather than a one-element list.
    if len(entries) == 1 and not getattr(var, "dims", None):
        return entries[0]
    return entries
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def get_attr_value(attr: onnx.AttributeProto) -> VariableTypes:
    """Return the Python value stored in an ONNX attribute proto.

    Scalar attributes come back as int/float/str, list attributes as
    plain lists; string payloads are decoded from bytes to str.

    Raises:
        ValueError: if the attribute type is not one of the supported kinds.
    """
    # TODO: Check if it can be replaced with onnx.helper.get_attribute_value
    def _decode(raw):
        # ONNX stores string attributes as bytes; normalize to str.
        return raw.decode("utf-8") if isinstance(raw, bytes) else raw

    kind = attr.type
    if kind == attr.INT:
        return attr.i
    if kind == attr.FLOAT:
        return attr.f
    if kind == attr.STRING:
        return _decode(attr.s)
    if kind == attr.INTS:
        return list(attr.ints)
    if kind == attr.FLOATS:
        return list(attr.floats)
    if kind == attr.STRINGS:
        return [_decode(entry) for entry in attr.strings]
    raise ValueError(f"Unsupported attribute type: {attr.type}")
|