ml-analytics-tools 0.2.0__tar.gz → 0.3.0__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/PKG-INFO +76 -2
  2. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/README.md +74 -1
  3. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/data_connector.py +439 -14
  4. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/gsheet_connector.py +79 -17
  5. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/PKG-INFO +76 -2
  6. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/requires.txt +1 -0
  7. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/pyproject.toml +2 -1
  8. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_db_s3.py +148 -0
  9. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_gsheet_connector.py +141 -0
  10. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/LICENSE +0 -0
  11. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/__init__.py +0 -0
  12. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/aws_auth.py +0 -0
  13. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/cli.py +0 -0
  14. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/model_manager.py +0 -0
  15. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/model_tools.py +0 -0
  16. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/s3_connector.py +0 -0
  17. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/slack_connector.py +0 -0
  18. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/tunnel_manager.py +0 -0
  19. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics/utils.py +0 -0
  20. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/SOURCES.txt +0 -0
  21. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/dependency_links.txt +0 -0
  22. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/entry_points.txt +0 -0
  23. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/top_level.txt +0 -0
  24. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/setup.cfg +0 -0
  25. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_aws_auth.py +0 -0
  26. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_identity_column.py +0 -0
  27. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_model_manager.py +0 -0
  28. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_model_tools.py +0 -0
  29. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_s3_redshift_validation.py +0 -0
  30. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_tunnel_manager.py +0 -0
  31. {ml_analytics_tools-0.2.0 → ml_analytics_tools-0.3.0}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ml-analytics-tools
3
- Version: 0.2.0
3
+ Version: 0.3.0
4
4
  Summary: Tools for ML projects and data management
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -30,10 +30,17 @@ Requires-Dist: seaborn>=0.13.2
30
30
  Requires-Dist: setuptools>=42.0.0
31
31
  Requires-Dist: shap>=0.47.2
32
32
  Requires-Dist: slack-sdk>=3.27.0
33
+ Requires-Dist: snowflake-connector-python[pandas,secure-local-storage]>=4.6.0
33
34
  Dynamic: license-file
34
35
 
35
36
  # ML Analytics Tools
36
37
 
38
+ [![CI](https://github.com/sdaza/ml-analytics-tools/actions/workflows/ci.yml/badge.svg)](https://github.com/sdaza/ml-analytics-tools/actions/workflows/ci.yml)
39
+ [![GitHub release](https://img.shields.io/github/v/release/sdaza/ml-analytics-tools)](https://github.com/sdaza/ml-analytics-tools/releases)
40
+ [![PyPI](https://img.shields.io/pypi/v/ml-analytics-tools.svg)](https://pypi.org/project/ml-analytics-tools/)
41
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue.svg)](https://www.python.org/downloads/)
42
+ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
43
+
37
44
  Utilities for common analytics and machine learning workflows: Redshift, S3,
38
45
  Google Sheets, Slack, MLflow, model evaluation, and SQL pipelines.
39
46
 
@@ -43,7 +50,7 @@ arguments.
43
50
 
44
51
  ## What Is Included
45
52
 
46
- - `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
53
+ - `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
47
54
  - `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
48
55
  - `GSheet`: read, write, share, and export Google Sheets data.
49
56
  - `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
@@ -84,6 +91,18 @@ BI_REDSHIFT_USER=analytics_user
84
91
  BI_REDSHIFT_PASSWORD=secret
85
92
  BI_REDSHIFT_PORT=5439
86
93
 
94
+ # Snowflake
95
+ SNOWFLAKE_USER=your.name@example.com
96
+ SNOWFLAKE_ACCOUNT=your-account
97
+ SNOWFLAKE_WAREHOUSE=ANALYTICS_S_WH
98
+ SNOWFLAKE_DATABASE=ANALYTICS_DB
99
+ SNOWFLAKE_SCHEMA=PUBLIC
100
+ SNOWFLAKE_AUTHENTICATOR=externalbrowser
101
+
102
+ # Browser-free Snowflake auth for local or Databricks jobs
103
+ SNOWFLAKE_PRIVATE_KEY_PATH=~/.snowflake/rsa_key.p8
104
+ SNOWFLAKE_PRIVATE_KEY_PASSPHRASE=secret
105
+
87
106
  # S3
88
107
  ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
89
108
 
@@ -135,6 +154,43 @@ df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
135
154
  df_polars = dc.sql("queries/features.sql", format="polars", country="es")
136
155
  ```
137
156
 
157
+ ### Query Snowflake
158
+
159
+ ```python
160
+ from ml_analytics import DataConnector
161
+
162
+ dc = DataConnector(engine="snowflake")
163
+
164
+ df = dc.sql("SELECT 1 AS col_1")
165
+ ```
166
+
167
+ For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
168
+ SSO tokens are cached in the OS keychain, so the browser login only happens once
169
+ per token lifetime.
170
+ For Databricks and Spark jobs, use key-pair auth instead. The connector reads
171
+ default Databricks personal-scope secrets automatically:
172
+
173
+ ```bash
174
+ databricks secrets put-secret user-your.name@example.com snowflake_key --bytes-value """$(cat rsa_key.p8)"""
175
+ databricks secrets put-secret user-your.name@example.com snowflake_key_pass --string-value """<password>"""
176
+ ```
177
+
178
+ Then build Spark connector options without opening a browser:
179
+
180
+ ```python
181
+ from ml_analytics import DataConnector
182
+
183
+ dc = DataConnector(engine="snowflake", secret_scope="user-your.name@example.com")
184
+ options = dc.snowflake_spark_options()
185
+
186
+ df = (
187
+ spark.read.format("net.snowflake.spark.snowflake")
188
+ .options(**options)
189
+ .option("query", "SELECT 1 AS col_1")
190
+ .load()
191
+ )
192
+ ```
193
+
138
194
  ### Create A Redshift Table From A DataFrame
139
195
 
140
196
  ```python
@@ -175,6 +231,24 @@ df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
175
231
  gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
176
232
  ```
177
233
 
234
+ #### OAuth authentication (alternative to a service account)
235
+
236
+ `GSheet` can authenticate as your own Google account using OAuth installed-app
237
+ credentials (e.g. Preply's Google Workspace CLI credentials). Set these environment variables
238
+ and the connector uses OAuth automatically when no service-account credentials
239
+ are found:
240
+
241
+ | Variable | Required | Description |
242
+ |----------|----------|-------------|
243
+ | `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client ID (`...apps.googleusercontent.com`) |
244
+ | `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
245
+ | `GOOGLE_CLOUD_PROJECT` | optional | GCP project ID (e.g. `preply-gworkspace-cli`) |
246
+ | `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
247
+
248
+ The first run opens a browser for one-time consent; the cached refresh token
249
+ makes later runs non-interactive. Under OAuth, `get_service_account_email()`
250
+ returns `None`.
251
+
178
252
  ### Log To MLflow
179
253
 
180
254
  ```python
@@ -1,5 +1,11 @@
1
1
  # ML Analytics Tools
2
2
 
3
+ [![CI](https://github.com/sdaza/ml-analytics-tools/actions/workflows/ci.yml/badge.svg)](https://github.com/sdaza/ml-analytics-tools/actions/workflows/ci.yml)
4
+ [![GitHub release](https://img.shields.io/github/v/release/sdaza/ml-analytics-tools)](https://github.com/sdaza/ml-analytics-tools/releases)
5
+ [![PyPI](https://img.shields.io/pypi/v/ml-analytics-tools.svg)](https://pypi.org/project/ml-analytics-tools/)
6
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue.svg)](https://www.python.org/downloads/)
7
+ [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE)
8
+
3
9
  Utilities for common analytics and machine learning workflows: Redshift, S3,
4
10
  Google Sheets, Slack, MLflow, model evaluation, and SQL pipelines.
5
11
 
@@ -9,7 +15,7 @@ arguments.
9
15
 
10
16
  ## What Is Included
11
17
 
12
- - `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
18
+ - `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
13
19
  - `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
14
20
  - `GSheet`: read, write, share, and export Google Sheets data.
15
21
  - `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
@@ -50,6 +56,18 @@ BI_REDSHIFT_USER=analytics_user
50
56
  BI_REDSHIFT_PASSWORD=secret
51
57
  BI_REDSHIFT_PORT=5439
52
58
 
59
+ # Snowflake
60
+ SNOWFLAKE_USER=your.name@example.com
61
+ SNOWFLAKE_ACCOUNT=your-account
62
+ SNOWFLAKE_WAREHOUSE=ANALYTICS_S_WH
63
+ SNOWFLAKE_DATABASE=ANALYTICS_DB
64
+ SNOWFLAKE_SCHEMA=PUBLIC
65
+ SNOWFLAKE_AUTHENTICATOR=externalbrowser
66
+
67
+ # Browser-free Snowflake auth for local or Databricks jobs
68
+ SNOWFLAKE_PRIVATE_KEY_PATH=~/.snowflake/rsa_key.p8
69
+ SNOWFLAKE_PRIVATE_KEY_PASSPHRASE=secret
70
+
53
71
  # S3
54
72
  ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
55
73
 
@@ -101,6 +119,43 @@ df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
101
119
  df_polars = dc.sql("queries/features.sql", format="polars", country="es")
102
120
  ```
103
121
 
122
+ ### Query Snowflake
123
+
124
+ ```python
125
+ from ml_analytics import DataConnector
126
+
127
+ dc = DataConnector(engine="snowflake")
128
+
129
+ df = dc.sql("SELECT 1 AS col_1")
130
+ ```
131
+
132
+ For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
133
+ SSO tokens are cached in the OS keychain, so the browser login only happens once
134
+ per token lifetime.
135
+ For Databricks and Spark jobs, use key-pair auth instead. The connector reads
136
+ default Databricks personal-scope secrets automatically:
137
+
138
+ ```bash
139
+ databricks secrets put-secret user-your.name@example.com snowflake_key --bytes-value """$(cat rsa_key.p8)"""
140
+ databricks secrets put-secret user-your.name@example.com snowflake_key_pass --string-value """<password>"""
141
+ ```
142
+
143
+ Then build Spark connector options without opening a browser:
144
+
145
+ ```python
146
+ from ml_analytics import DataConnector
147
+
148
+ dc = DataConnector(engine="snowflake", secret_scope="user-your.name@example.com")
149
+ options = dc.snowflake_spark_options()
150
+
151
+ df = (
152
+ spark.read.format("net.snowflake.spark.snowflake")
153
+ .options(**options)
154
+ .option("query", "SELECT 1 AS col_1")
155
+ .load()
156
+ )
157
+ ```
158
+
104
159
  ### Create A Redshift Table From A DataFrame
105
160
 
106
161
  ```python
@@ -141,6 +196,24 @@ df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
141
196
  gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
142
197
  ```
143
198
 
199
+ #### OAuth authentication (alternative to a service account)
200
+
201
+ `GSheet` can authenticate as your own Google account using OAuth installed-app
202
+ credentials (e.g. Preply's Google Workspace CLI credentials). Set these environment variables
203
+ and the connector uses OAuth automatically when no service-account credentials
204
+ are found:
205
+
206
+ | Variable | Required | Description |
207
+ |----------|----------|-------------|
208
+ | `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client ID (`...apps.googleusercontent.com`) |
209
+ | `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
210
+ | `GOOGLE_CLOUD_PROJECT` | optional | GCP project ID (e.g. `preply-gworkspace-cli`) |
211
+ | `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
212
+
213
+ The first run opens a browser for one-time consent; the cached refresh token
214
+ makes later runs non-interactive. Under OAuth, `get_service_account_email()`
215
+ returns `None`.
216
+
144
217
  ### Log To MLflow
145
218
 
146
219
  ```python