ml-analytics-tools 0.2.1__tar.gz → 0.3.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/PKG-INFO +70 -2
  2. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/README.md +68 -1
  3. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/data_connector.py +439 -14
  4. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/gsheet_connector.py +79 -17
  5. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/PKG-INFO +70 -2
  6. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/requires.txt +1 -0
  7. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/pyproject.toml +2 -1
  8. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_db_s3.py +148 -0
  9. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_gsheet_connector.py +141 -0
  10. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/LICENSE +0 -0
  11. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/__init__.py +0 -0
  12. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/aws_auth.py +0 -0
  13. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/cli.py +0 -0
  14. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/model_manager.py +0 -0
  15. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/model_tools.py +0 -0
  16. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/s3_connector.py +0 -0
  17. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/slack_connector.py +0 -0
  18. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/tunnel_manager.py +0 -0
  19. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics/utils.py +0 -0
  20. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/SOURCES.txt +0 -0
  21. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/dependency_links.txt +0 -0
  22. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/entry_points.txt +0 -0
  23. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/ml_analytics_tools.egg-info/top_level.txt +0 -0
  24. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/setup.cfg +0 -0
  25. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_aws_auth.py +0 -0
  26. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_identity_column.py +0 -0
  27. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_model_manager.py +0 -0
  28. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_model_tools.py +0 -0
  29. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_s3_redshift_validation.py +0 -0
  30. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_tunnel_manager.py +0 -0
  31. {ml_analytics_tools-0.2.1 → ml_analytics_tools-0.3.0}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: ml-analytics-tools
3
- Version: 0.2.1
3
+ Version: 0.3.0
4
4
  Summary: Tools for ML projects and data management
5
5
  Requires-Python: >=3.11
6
6
  Description-Content-Type: text/markdown
@@ -30,6 +30,7 @@ Requires-Dist: seaborn>=0.13.2
30
30
  Requires-Dist: setuptools>=42.0.0
31
31
  Requires-Dist: shap>=0.47.2
32
32
  Requires-Dist: slack-sdk>=3.27.0
33
+ Requires-Dist: snowflake-connector-python[pandas,secure-local-storage]>=4.6.0
33
34
  Dynamic: license-file
34
35
 
35
36
  # ML Analytics Tools
@@ -49,7 +50,7 @@ arguments.
49
50
 
50
51
  ## What Is Included
51
52
 
52
- - `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
53
+ - `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
53
54
  - `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
54
55
  - `GSheet`: read, write, share, and export Google Sheets data.
55
56
  - `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
@@ -90,6 +91,18 @@ BI_REDSHIFT_USER=analytics_user
90
91
  BI_REDSHIFT_PASSWORD=secret
91
92
  BI_REDSHIFT_PORT=5439
92
93
 
94
+ # Snowflake
95
+ SNOWFLAKE_USER=your.name@example.com
96
+ SNOWFLAKE_ACCOUNT=your-account
97
+ SNOWFLAKE_WAREHOUSE=ANALYTICS_S_WH
98
+ SNOWFLAKE_DATABASE=ANALYTICS_DB
99
+ SNOWFLAKE_SCHEMA=PUBLIC
100
+ SNOWFLAKE_AUTHENTICATOR=externalbrowser
101
+
102
+ # Browser-free Snowflake auth for local or Databricks jobs
103
+ SNOWFLAKE_PRIVATE_KEY_PATH=~/.snowflake/rsa_key.p8
104
+ SNOWFLAKE_PRIVATE_KEY_PASSPHRASE=secret
105
+
93
106
  # S3
94
107
  ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
95
108
 
@@ -141,6 +154,43 @@ df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
141
154
  df_polars = dc.sql("queries/features.sql", format="polars", country="es")
142
155
  ```
143
156
 
157
+ ### Query Snowflake
158
+
159
+ ```python
160
+ from ml_analytics import DataConnector
161
+
162
+ dc = DataConnector(engine="snowflake")
163
+
164
+ df = dc.sql("SELECT 1 AS col_1")
165
+ ```
166
+
167
+ For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
168
+ SSO tokens are cached in the OS keychain, so the browser login only happens once
169
+ per token lifetime.
170
+ For Databricks and Spark jobs, use key-pair auth instead. The connector reads
171
+ default Databricks personal-scope secrets automatically:
172
+
173
+ ```bash
174
+ databricks secrets put-secret user-your.name@example.com snowflake_key --bytes-value """$(cat rsa_key.p8)"""
175
+ databricks secrets put-secret user-your.name@example.com snowflake_key_pass --string-value """<password>"""
176
+ ```
177
+
178
+ Then build Spark connector options without opening a browser:
179
+
180
+ ```python
181
+ from ml_analytics import DataConnector
182
+
183
+ dc = DataConnector(engine="snowflake", secret_scope="user-your.name@example.com")
184
+ options = dc.snowflake_spark_options()
185
+
186
+ df = (
187
+ spark.read.format("net.snowflake.spark.snowflake")
188
+ .options(**options)
189
+ .option("query", "SELECT 1 AS col_1")
190
+ .load()
191
+ )
192
+ ```
193
+
144
194
  ### Create A Redshift Table From A DataFrame
145
195
 
146
196
  ```python
@@ -181,6 +231,24 @@ df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
181
231
  gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
182
232
  ```
183
233
 
234
+ #### OAuth authentication (alternative to a service account)
235
+
236
+ `GSheet` can authenticate as your own Google account using OAuth installed-app
237
+ credentials (e.g. Preply's Google Workspace CLI credentials). Set these environment
238
+ variables, and the connector falls back to OAuth automatically when no service-account credentials
239
+ are found:
240
+
241
+ | Variable | Required | Description |
242
+ |----------|----------|-------------|
243
+ | `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client ID (`...apps.googleusercontent.com`) |
244
+ | `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
245
+ | `GOOGLE_CLOUD_PROJECT` | optional | GCP project ID (e.g. `preply-gworkspace-cli`) |
246
+ | `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
247
+
248
+ The first run opens a browser for one-time consent; the cached refresh token
249
+ makes later runs non-interactive. Under OAuth, `get_service_account_email()`
250
+ returns `None`.
251
+
184
252
  ### Log To MLflow
185
253
 
186
254
  ```python
@@ -15,7 +15,7 @@ arguments.
15
15
 
16
16
  ## What Is Included
17
17
 
18
- - `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
18
+ - `DataConnector`: run Redshift or Snowflake SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
19
19
  - `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
20
20
  - `GSheet`: read, write, share, and export Google Sheets data.
21
21
  - `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
@@ -56,6 +56,18 @@ BI_REDSHIFT_USER=analytics_user
56
56
  BI_REDSHIFT_PASSWORD=secret
57
57
  BI_REDSHIFT_PORT=5439
58
58
 
59
+ # Snowflake
60
+ SNOWFLAKE_USER=your.name@example.com
61
+ SNOWFLAKE_ACCOUNT=your-account
62
+ SNOWFLAKE_WAREHOUSE=ANALYTICS_S_WH
63
+ SNOWFLAKE_DATABASE=ANALYTICS_DB
64
+ SNOWFLAKE_SCHEMA=PUBLIC
65
+ SNOWFLAKE_AUTHENTICATOR=externalbrowser
66
+
67
+ # Browser-free Snowflake auth for local or Databricks jobs
68
+ SNOWFLAKE_PRIVATE_KEY_PATH=~/.snowflake/rsa_key.p8
69
+ SNOWFLAKE_PRIVATE_KEY_PASSPHRASE=secret
70
+
59
71
  # S3
60
72
  ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
61
73
 
@@ -107,6 +119,43 @@ df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
107
119
  df_polars = dc.sql("queries/features.sql", format="polars", country="es")
108
120
  ```
109
121
 
122
+ ### Query Snowflake
123
+
124
+ ```python
125
+ from ml_analytics import DataConnector
126
+
127
+ dc = DataConnector(engine="snowflake")
128
+
129
+ df = dc.sql("SELECT 1 AS col_1")
130
+ ```
131
+
132
+ For local interactive work, `SNOWFLAKE_AUTHENTICATOR=externalbrowser` is supported.
133
+ SSO tokens are cached in the OS keychain, so the browser login only happens once
134
+ per token lifetime.
135
+ For Databricks and Spark jobs, use key-pair auth instead. The connector reads
136
+ default Databricks personal-scope secrets automatically:
137
+
138
+ ```bash
139
+ databricks secrets put-secret user-your.name@example.com snowflake_key --bytes-value """$(cat rsa_key.p8)"""
140
+ databricks secrets put-secret user-your.name@example.com snowflake_key_pass --string-value """<password>"""
141
+ ```
142
+
143
+ Then build Spark connector options without opening a browser:
144
+
145
+ ```python
146
+ from ml_analytics import DataConnector
147
+
148
+ dc = DataConnector(engine="snowflake", secret_scope="user-your.name@example.com")
149
+ options = dc.snowflake_spark_options()
150
+
151
+ df = (
152
+ spark.read.format("net.snowflake.spark.snowflake")
153
+ .options(**options)
154
+ .option("query", "SELECT 1 AS col_1")
155
+ .load()
156
+ )
157
+ ```
158
+
110
159
  ### Create A Redshift Table From A DataFrame
111
160
 
112
161
  ```python
@@ -147,6 +196,24 @@ df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
147
196
  gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
148
197
  ```
149
198
 
199
+ #### OAuth authentication (alternative to a service account)
200
+
201
+ `GSheet` can authenticate as your own Google account using OAuth installed-app
202
+ credentials (e.g. Preply's Google Workspace CLI credentials). Set these environment
203
+ variables, and the connector falls back to OAuth automatically when no service-account credentials
204
+ are found:
205
+
206
+ | Variable | Required | Description |
207
+ |----------|----------|-------------|
208
+ | `GOOGLE_OAUTH_CLIENT_ID` | yes | OAuth client ID (`...apps.googleusercontent.com`) |
209
+ | `GOOGLE_OAUTH_CLIENT_SECRET` | yes | OAuth client secret (`GOCSPX-...`) |
210
+ | `GOOGLE_CLOUD_PROJECT` | optional | GCP project ID (e.g. `preply-gworkspace-cli`) |
211
+ | `GSHEET_TOKEN_PATH` | optional | Token cache path (default `~/.config/ml-analytics/gsheet_token.json`) |
212
+
213
+ The first run opens a browser for one-time consent; the cached refresh token
214
+ makes later runs non-interactive. Under OAuth, `get_service_account_email()`
215
+ returns `None`.
216
+
150
217
  ### Log To MLflow
151
218
 
152
219
  ```python