ml-analytics-tools 0.2.1__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/PKG-INFO +70 -2
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/README.md +68 -1
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/data_connector.py +439 -14
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/gsheet_connector.py +79 -17
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/PKG-INFO +70 -2
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/requires.txt +1 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/pyproject.toml +2 -1
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_db_s3.py +148 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_gsheet_connector.py +141 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/LICENSE +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/__init__.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/aws_auth.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/cli.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/model_manager.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/model_tools.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/s3_connector.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/slack_connector.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/tunnel_manager.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/utils.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/SOURCES.txt +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/dependency_links.txt +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/entry_points.txt +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/top_level.txt +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/setup.cfg +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_aws_auth.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_identity_column.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_model_manager.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_model_tools.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_s3_redshift_validation.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_tunnel_manager.py +0 -0
- {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: ml-analytics-tools
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Tools for ML projects and data management
|
|
5
5
|
Requires-Python: >=3.11
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -30,6 +30,7 @@ Requires-Dist: seaborn>=0.13.2
|
|
|
30
30
|
Requires-Dist: setuptools>=42.0.0
|
|
31
31
|
Requires-Dist: shap>=0.47.2
|
|
32
32
|
Requires-Dist: slack-sdk>=3.27.0
|
|
33
|
+
Requires-Dist: snowflake-connector-python[pandas,secure-local-storage]>=4.6.0
|
|
33
34
|
Dynamic: license-file
|
|
34
35
|
|
|
35
36
|
# ML Analytics Tools
|
|
@@ -49,7 +50,7 @@ arguments.
|
|
|
49
50
|
|
|
50
51
|
## What Is Included
|
|
51
52
|
|
|
52
|
-
- `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
53
|
+
- `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
53
54
|
- `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
|
|
54
55
|
- `GSheet`: read, write, share, and export Google Sheets data.
|
|
55
56
|
- `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
|
|
@@ -90,6 +91,18 @@ BI_REDSHIFT_USER=analytics_user
|
|
|
90
91
|
BI_REDSHIFT_PASSWORD=secret
|
|
91
92
|
BI_REDSHIFT_PORT=5439
|
|
92
93
|
|
|
94
|
+
# Snowflake
|
|
95
|
+
SNOWFLAKE_USER=your.name@example.com
|
|
96
|
+
SNOWFLAKE_ACCOUNT=your-account
|
|
97
|
+
SNOWFLAKE_WAREHOUSE=ANALYTICS_S_WH
|
|
98
|
+
SNOWFLAKE_DATABASE=ANALYTICS_DB
|
|
99
|
+
SNOWFLAKE_SCHEMA=PUBLIC
|
|
100
|
+
SNOWFLAKE_AUTHENTICATOR=externalbrowser
|
|
101
|
+
|
|
102
|
+
# Browser-free Snowflake auth for local or Databricks jobs
|
|
103
|
+
SNOWFLAKE_PRIVATE_KEY_PATH=~/.snowflake/rsa_key.p8
|
|
104
|
+
SNOWFLAKE_PRIVATE_KEY_PASSPHRASE=secret
|
|
105
|
+
|
|
93
106
|
# S3
|
|
94
107
|
ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
|
|
95
108
|
|
|
@@ -141,6 +154,43 @@ df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
|
|
|
141
154
|
df_polars = dc.sql("queries/features.sql", format="polars", country="es")
|
|
142
155
|
```
|
|
143
156
|
|
|
157
|
+
### Query Snowflake
|
|
158
|
+
|
|
159
|
+
```python
|
|
160
|
+
from ml_analytics import DataConnector
|
|
161
|
+
|
|
162
|
+
dc = DataConnector(engine="snowflake")
|
|
163
|
+
|
|
164
|
+
df = dc.sql("SELECT 1 AS col_1")
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
|
|
168
|
+
SSO tokens are cached in the OS keychain, so the browser login only happens once
|
|
169
|
+
per token lifetime.
|
|
170
|
+
For Databricks and Spark jobs, use key-pair auth instead. The connector reads
|
|
171
|
+
default Databricks personal-scope secrets automatically:
|
|
172
|
+
|
|
173
|
+
```bash
|
|
174
|
+
databricks secrets put-secret user-your.name@example.com snowflake_key --bytes-value """$(cat rsa_key.p8)"""
|
|
175
|
+
databricks secrets put-secret user-your.name@example.com snowflake_key_pass --string-value """<password>"""
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Then build Spark connector options without opening a browser:
|
|
179
|
+
|
|
180
|
+
```python
|
|
181
|
+
from ml_analytics import DataConnector
|
|
182
|
+
|
|
183
|
+
dc = DataConnector(engine="snowflake", secret_scope="user-your.name@example.com")
|
|
184
|
+
options = dc.snowflake_spark_options()
|
|
185
|
+
|
|
186
|
+
df = (
|
|
187
|
+
spark.read.format("net.snowflake.spark.snowflake")
|
|
188
|
+
.options(**options)
|
|
189
|
+
.option("query", "SELECT 1 AS col_1")
|
|
190
|
+
.load()
|
|
191
|
+
)
|
|
192
|
+
```
|
|
193
|
+
|
|
144
194
|
### Create A Redshift Table From A DataFrame
|
|
145
195
|
|
|
146
196
|
```python
|
|
@@ -181,6 +231,24 @@ df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
|
|
|
181
231
|
gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
|
|
182
232
|
```
|
|
183
233
|
|
|
234
|
+
#### OAuth authentication (alternative to a service account)
|
|
235
|
+
|
|
236
|
+
`GSheet` can authenticate as your own Google account using OAuth installed-app
|
|
237
|
+
credentials (e.g. Preply's Google Workspace CLI credentials). Set these env vars
|
|
238
|
+
and the connector uses OAuth automatically when no service-account credentials
|
|
239
|
+
are found:
|
|
240
|
+
|
|
241
|
+
| Variable | Required | Description |
|
|
242
|
+
|----------|----------|-------------|
|
|
243
|
+
| `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
|
|
244
|
+
| `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
|
|
245
|
+
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `preply-gworkspace-cli`) |
|
|
246
|
+
| `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
|
|
247
|
+
|
|
248
|
+
The first run opens a browser for one-time consent; the cached refresh token
|
|
249
|
+
makes later runs non-interactive. Under OAuth, `get_service_account_email()`
|
|
250
|
+
returns `None`.
|
|
251
|
+
|
|
184
252
|
### Log To MLflow
|
|
185
253
|
|
|
186
254
|
```python
|
|
@@ -15,7 +15,7 @@ arguments.
|
|
|
15
15
|
|
|
16
16
|
## What Is Included
|
|
17
17
|
|
|
18
|
-
- `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
18
|
+
- `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
|
|
19
19
|
- `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
|
|
20
20
|
- `GSheet`: read, write, share, and export Google Sheets data.
|
|
21
21
|
- `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
|
|
@@ -56,6 +56,18 @@ BI_REDSHIFT_USER=analytics_user
|
|
|
56
56
|
BI_REDSHIFT_PASSWORD=secret
|
|
57
57
|
BI_REDSHIFT_PORT=5439
|
|
58
58
|
|
|
59
|
+
# Snowflake
|
|
60
|
+
SNOWFLAKE_USER=your.name@example.com
|
|
61
|
+
SNOWFLAKE_ACCOUNT=your-account
|
|
62
|
+
SNOWFLAKE_WAREHOUSE=ANALYTICS_S_WH
|
|
63
|
+
SNOWFLAKE_DATABASE=ANALYTICS_DB
|
|
64
|
+
SNOWFLAKE_SCHEMA=PUBLIC
|
|
65
|
+
SNOWFLAKE_AUTHENTICATOR=externalbrowser
|
|
66
|
+
|
|
67
|
+
# Browser-free Snowflake auth for local or Databricks jobs
|
|
68
|
+
SNOWFLAKE_PRIVATE_KEY_PATH=~/.snowflake/rsa_key.p8
|
|
69
|
+
SNOWFLAKE_PRIVATE_KEY_PASSPHRASE=secret
|
|
70
|
+
|
|
59
71
|
# S3
|
|
60
72
|
ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
|
|
61
73
|
|
|
@@ -107,6 +119,43 @@ df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
|
|
|
107
119
|
df_polars = dc.sql("queries/features.sql", format="polars", country="es")
|
|
108
120
|
```
|
|
109
121
|
|
|
122
|
+
### Query Snowflake
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from ml_analytics import DataConnector
|
|
126
|
+
|
|
127
|
+
dc = DataConnector(engine="snowflake")
|
|
128
|
+
|
|
129
|
+
df = dc.sql("SELECT 1 AS col_1")
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
|
|
133
|
+
SSO tokens are cached in the OS keychain, so the browser login only happens once
|
|
134
|
+
per token lifetime.
|
|
135
|
+
For Databricks and Spark jobs, use key-pair auth instead. The connector reads
|
|
136
|
+
default Databricks personal-scope secrets automatically:
|
|
137
|
+
|
|
138
|
+
```bash
|
|
139
|
+
databricks secrets put-secret user-your.name@example.com snowflake_key --bytes-value """$(cat rsa_key.p8)"""
|
|
140
|
+
databricks secrets put-secret user-your.name@example.com snowflake_key_pass --string-value """<password>"""
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
Then build Spark connector options without opening a browser:
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
from ml_analytics import DataConnector
|
|
147
|
+
|
|
148
|
+
dc = DataConnector(engine="snowflake", secret_scope="user-your.name@example.com")
|
|
149
|
+
options = dc.snowflake_spark_options()
|
|
150
|
+
|
|
151
|
+
df = (
|
|
152
|
+
spark.read.format("net.snowflake.spark.snowflake")
|
|
153
|
+
.options(**options)
|
|
154
|
+
.option("query", "SELECT 1 AS col_1")
|
|
155
|
+
.load()
|
|
156
|
+
)
|
|
157
|
+
```
|
|
158
|
+
|
|
110
159
|
### Create A Redshift Table From A DataFrame
|
|
111
160
|
|
|
112
161
|
```python
|
|
@@ -147,6 +196,24 @@ df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
|
|
|
147
196
|
gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
|
|
148
197
|
```
|
|
149
198
|
|
|
199
|
+
#### OAuth authentication (alternative to a service account)
|
|
200
|
+
|
|
201
|
+
`GSheet` can authenticate as your own Google account using OAuth installed-app
|
|
202
|
+
credentials (e.g. Preply's Google Workspace CLI credentials). Set these env vars
|
|
203
|
+
and the connector uses OAuth automatically when no service-account credentials
|
|
204
|
+
are found:
|
|
205
|
+
|
|
206
|
+
| Variable | Required | Description |
|
|
207
|
+
|----------|----------|-------------|
|
|
208
|
+
| `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client id (`...apps.googleusercontent.com`) |
|
|
209
|
+
| `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
|
|
210
|
+
| `GOOGLE_CLOUD_PROJECT` | optional | GCP project id (e.g. `preply-gworkspace-cli`) |
|
|
211
|
+
| `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
|
|
212
|
+
|
|
213
|
+
The first run opens a browser for one-time consent; the cached refresh token
|
|
214
|
+
makes later runs non-interactive. Under OAuth, `get_service_account_email()`
|
|
215
|
+
returns `None`.
|
|
216
|
+
|
|
150
217
|
### Log To MLflow
|
|
151
218
|
|
|
152
219
|
```python
|