ml-analytics-tools 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/PKG-INFO +76 -2
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/README.md +74 -1
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/data_connector.py +439 -14
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/gsheet_connector.py +79 -17
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/PKG-INFO +76 -2
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/requires.txt +1 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/pyproject.toml +2 -1
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_db_s3.py +148 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_gsheet_connector.py +141 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/LICENSE +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/__init__.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/aws_auth.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/cli.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/model_manager.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/model_tools.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/s3_connector.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/slack_connector.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/tunnel_manager.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/utils.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/SOURCES.txt +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/dependency_links.txt +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/entry_points.txt +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/top_level.txt +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/setup.cfg +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_aws_auth.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_identity_column.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_model_manager.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_model_tools.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_s3_redshift_validation.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_tunnel_manager.py +0 -0
- {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ml-analytics-tools
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Tools for ML projects and data management
|
|
5
5
|
Requires-Python: >=3.11
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -30,10 +30,17 @@ Requires-Dist: seaborn>=0.13.2
|
|
|
30
30
|
Requires-Dist: setuptools>=42.0.0
|
|
31
31
|
Requires-Dist: shap>=0.47.2
|
|
32
32
|
Requires-Dist: slack-sdk>=3.27.0
|
|
33
|
+
Requires-Dist: snowflake-connector-python[pandas,secure-local-storage]>=4.6.0
|
|
33
34
|
Dynamic: license-file
|
|
34
35
|
|
|
35
36
|
# ML Analytics Tools
|
|
36
37
|
|
|
38
|
+
[CI](https://github.com/sdaza/ml-analytics-tools/actions/workflows/ci.yml)
|
|
39
|
+
[Releases](https://github.com/sdaza/ml-analytics-tools/releases)
|
|
40
|
+
[PyPI](https://pypi.org/project/ml-analytics-tools/)
|
|
41
|
+
[Python](https://www.python.org/downloads/)
|
|
42
|
+
[License](LICENSE)
|
|
43
|
+
|
|
37
44
|
Utilities for common analytics and machine learning workflows: Redshift, S3,
|
|
38
45
|
Google Sheets, Slack, MLflow, model evaluation, and SQL pipelines.
|
|
39
46
|
|
|
@@ -43,7 +50,7 @@ arguments.
|
|
|
43
50
|
|
|
44
51
|
## What Is Included
|
|
45
52
|
|
|
46
|
-
- `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
53
|
+
- `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
47
54
|
- `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
|
|
48
55
|
- `GSheet`: read, write, share, and export Google Sheets data.
|
|
49
56
|
- `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
|
|
@@ -84,6 +91,18 @@ BI_REDSHIFT_USER=analytics_user
|
|
|
84
91
|
BI_REDSHIFT_PASSWORD=secret
|
|
85
92
|
BI_REDSHIFT_PORT=5439
|
|
86
93
|
|
|
94
|
+
# Snowflake
|
|
95
|
+
SNOWFLAKE_USER=your.name@example.com
|
|
96
|
+
SNOWFLAKE_ACCOUNT=your-account
|
|
97
|
+
SNOWFLAKE_WAREHOUSE=ANALYTICS_S_WH
|
|
98
|
+
SNOWFLAKE_DATABASE=ANALYTICS_DB
|
|
99
|
+
SNOWFLAKE_SCHEMA=PUBLIC
|
|
100
|
+
SNOWFLAKE_AUTHENTICATOR=externalbrowser
|
|
101
|
+
|
|
102
|
+
# Browser-free Snowflake auth for local or Databricks jobs
|
|
103
|
+
SNOWFLAKE_PRIVATE_KEY_PATH=~/.snowflake/rsa_key.p8
|
|
104
|
+
SNOWFLAKE_PRIVATE_KEY_PASSPHRASE=secret
|
|
105
|
+
|
|
87
106
|
# S3
|
|
88
107
|
ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
|
|
89
108
|
|
|
@@ -135,6 +154,43 @@ df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
|
|
|
135
154
|
df_polars = dc.sql("queries/features.sql", format="polars", country="es")
|
|
136
155
|
```
|
|
137
156
|
|
|
157
|
+
### Query Snowflake
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from ml_analytics import DataConnector
|
|
161
|
+
|
|
162
|
+
dc = DataConnector(engine="snowflake")
|
|
163
|
+
|
|
164
|
+
df = dc.sql("SELECT 1 AS col_1")
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
|
|
168
|
+
SSO tokens are cached in the OS keychain, so the browser login only happens once
|
|
169
|
+
per token lifetime.
|
|
170
|
+
For Databricks and Spark jobs, use key-pair auth instead. The connector reads
|
|
171
|
+
default Databricks personal-scope secrets automatically:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
databricks secrets put-secret user-your.name@example.com snowflake_key --bytes-value """$(cat rsa_key.p8)"""
|
|
175
|
+
databricks secrets put-secret user-your.name@example.com snowflake_key_pass --string-value """<password>"""
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Then build Spark connector options without opening a browser:
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
from ml_analytics import DataConnector
|
|
182
|
+
|
|
183
|
+
dc = DataConnector(engine="snowflake", secret_scope="user-your.name@example.com")
|
|
184
|
+
options = dc.snowflake_spark_options()
|
|
185
|
+
|
|
186
|
+
df = (
|
|
187
|
+
spark.read.format("net.snowflake.spark.snowflake")
|
|
188
|
+
.options(**options)
|
|
189
|
+
.option("query", "SELECT 1 AS col_1")
|
|
190
|
+
.load()
|
|
191
|
+
)
|
|
192
|
+
```
|
|
193
|
+
|
|
138
194
|
### Create A Redshift Table From A DataFrame
|
|
139
195
|
|
|
140
196
|
```python
|
|
@@ -175,6 +231,24 @@ df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
|
|
|
175
231
|
gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
|
|
176
232
|
```
|
|
177
233
|
|
|
234
|
+
#### OAuth authentication (alternative to a service account)
|
|
235
|
+
|
|
236
|
+
`GSheet` can authenticate as your own Google account using OAuth installed-app
|
|
237
|
+
credentials (e.g. Preply's Google Workspace CLI credentials). Set these env vars
|
|
238
|
+
and the connector uses OAuth automatically when no service-account credentials
|
|
239
|
+
are found:
|
|
240
|
+
|
|
241
|
+
| Variable | Required | Description |
|
|
242
|
+
|----------|----------|-------------|
|
|
243
|
+
| `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
|
|
244
|
+
| `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
|
|
245
|
+
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `preply-gworkspace-cli`) |
|
|
246
|
+
| `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
|
|
247
|
+
|
|
248
|
+
The first run opens a browser for one-time consent; the cached refresh token
|
|
249
|
+
makes later runs non-interactive. Under OAuth, `get_service_account_email()`
|
|
250
|
+
returns `None`.
|
|
251
|
+
|
|
178
252
|
### Log To MLflow
|
|
179
253
|
|
|
180
254
|
```python
|
|
@@ -1,5 +1,11 @@
|
|
|
1
1
|
# ML Analytics Tools
|
|
2
2
|
|
|
3
|
+
[CI](https://github.com/sdaza/ml-analytics-tools/actions/workflows/ci.yml)
|
|
4
|
+
[Releases](https://github.com/sdaza/ml-analytics-tools/releases)
|
|
5
|
+
[PyPI](https://pypi.org/project/ml-analytics-tools/)
|
|
6
|
+
[Python](https://www.python.org/downloads/)
|
|
7
|
+
[License](LICENSE)
|
|
8
|
+
|
|
3
9
|
Utilities for common analytics and machine learning workflows: Redshift, S3,
|
|
4
10
|
Google Sheets, Slack, MLflow, model evaluation, and SQL pipelines.
|
|
5
11
|
|
|
@@ -9,7 +15,7 @@ arguments.
|
|
|
9
15
|
|
|
10
16
|
## What Is Included
|
|
11
17
|
|
|
12
|
-
- `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
18
|
+
- `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
13
19
|
- `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
|
|
14
20
|
- `GSheet`: read, write, share, and export Google Sheets data.
|
|
15
21
|
- `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
|
|
@@ -50,6 +56,18 @@ BI_REDSHIFT_USER=analytics_user
|
|
|
50
56
|
BI_REDSHIFT_PASSWORD=secret
|
|
51
57
|
BI_REDSHIFT_PORT=5439
|
|
52
58
|
|
|
59
|
+
# Snowflake
|
|
60
|
+
SNOWFLAKE_USER=your.name@example.com
|
|
61
|
+
SNOWFLAKE_ACCOUNT=your-account
|
|
62
|
+
SNOWFLAKE_WAREHOUSE=ANALYTICS_S_WH
|
|
63
|
+
SNOWFLAKE_DATABASE=ANALYTICS_DB
|
|
64
|
+
SNOWFLAKE_SCHEMA=PUBLIC
|
|
65
|
+
SNOWFLAKE_AUTHENTICATOR=externalbrowser
|
|
66
|
+
|
|
67
|
+
# Browser-free Snowflake auth for local or Databricks jobs
|
|
68
|
+
SNOWFLAKE_PRIVATE_KEY_PATH=~/.snowflake/rsa_key.p8
|
|
69
|
+
SNOWFLAKE_PRIVATE_KEY_PASSPHRASE=secret
|
|
70
|
+
|
|
53
71
|
# S3
|
|
54
72
|
ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
|
|
55
73
|
|
|
@@ -101,6 +119,43 @@ df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
|
|
|
101
119
|
df_polars = dc.sql("queries/features.sql", format="polars", country="es")
|
|
102
120
|
```
|
|
103
121
|
|
|
122
|
+
### Query Snowflake
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from ml_analytics import DataConnector
|
|
126
|
+
|
|
127
|
+
dc = DataConnector(engine="snowflake")
|
|
128
|
+
|
|
129
|
+
df = dc.sql("SELECT 1 AS col_1")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
|
|
133
|
+
SSO tokens are cached in the OS keychain, so the browser login only happens once
|
|
134
|
+
per token lifetime.
|
|
135
|
+
For Databricks and Spark jobs, use key-pair auth instead. The connector reads
|
|
136
|
+
default Databricks personal-scope secrets automatically:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
databricks secrets put-secret user-your.name@example.com snowflake_key --bytes-value """$(cat rsa_key.p8)"""
|
|
140
|
+
databricks secrets put-secret user-your.name@example.com snowflake_key_pass --string-value """<password>"""
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Then build Spark connector options without opening a browser:
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from ml_analytics import DataConnector
|
|
147
|
+
|
|
148
|
+
dc = DataConnector(engine="snowflake", secret_scope="user-your.name@example.com")
|
|
149
|
+
options = dc.snowflake_spark_options()
|
|
150
|
+
|
|
151
|
+
df = (
|
|
152
|
+
spark.read.format("net.snowflake.spark.snowflake")
|
|
153
|
+
.options(**options)
|
|
154
|
+
.option("query", "SELECT 1 AS col_1")
|
|
155
|
+
.load()
|
|
156
|
+
)
|
|
157
|
+
```
|
|
158
|
+
|
|
104
159
|
### Create A Redshift Table From A DataFrame
|
|
105
160
|
|
|
106
161
|
```python
|
|
@@ -141,6 +196,24 @@ df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
|
|
|
141
196
|
gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
|
|
142
197
|
```
|
|
143
198
|
|
|
199
|
+
#### OAuth authentication (alternative to a service account)
|
|
200
|
+
|
|
201
|
+
`GSheet` can authenticate as your own Google account using OAuth installed-app
|
|
202
|
+
credentials (e.g. Preply's Google Workspace CLI credentials). Set these env vars
|
|
203
|
+
and the connector uses OAuth automatically when no service-account credentials
|
|
204
|
+
are found:
|
|
205
|
+
|
|
206
|
+
| Variable | Required | Description |
|
|
207
|
+
|----------|----------|-------------|
|
|
208
|
+
| `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
|
|
209
|
+
| `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
|
|
210
|
+
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `preply-gworkspace-cli`) |
|
|
211
|
+
| `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
|
|
212
|
+
|
|
213
|
+
The first run opens a browser for one-time consent; the cached refresh token
|
|
214
|
+
makes later runs non-interactive. Under OAuth, `get_service_account_email()`
|
|
215
|
+
returns `None`.
|
|
216
|
+
|
|
144
217
|
### Log To MLflow
|
|
145
218
|
|
|
146
219
|
```python
|