PyPI - ml-analytics-tools - Versions diffs - 0.2.0__tar.gz - Mend

ml-analytics-tools 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

ml_analytics_tools-0.2.0/LICENSE +21 -0
ml_analytics_tools-0.2.0/PKG-INFO +231 -0
ml_analytics_tools-0.2.0/README.md +197 -0
ml_analytics_tools-0.2.0/ml_analytics/__init__.py +53 -0
ml_analytics_tools-0.2.0/ml_analytics/aws_auth.py +169 -0
ml_analytics_tools-0.2.0/ml_analytics/cli.py +58 -0
ml_analytics_tools-0.2.0/ml_analytics/data_connector.py +2615 -0
ml_analytics_tools-0.2.0/ml_analytics/gsheet_connector.py +1646 -0
ml_analytics_tools-0.2.0/ml_analytics/model_manager.py +1208 -0
ml_analytics_tools-0.2.0/ml_analytics/model_tools.py +990 -0
ml_analytics_tools-0.2.0/ml_analytics/s3_connector.py +1381 -0
ml_analytics_tools-0.2.0/ml_analytics/slack_connector.py +637 -0
ml_analytics_tools-0.2.0/ml_analytics/tunnel_manager.py +277 -0
ml_analytics_tools-0.2.0/ml_analytics/utils.py +673 -0
ml_analytics_tools-0.2.0/ml_analytics_tools.egg-info/PKG-INFO +231 -0
ml_analytics_tools-0.2.0/ml_analytics_tools.egg-info/SOURCES.txt +29 -0
ml_analytics_tools-0.2.0/ml_analytics_tools.egg-info/dependency_links.txt +1 -0
ml_analytics_tools-0.2.0/ml_analytics_tools.egg-info/entry_points.txt +4 -0
ml_analytics_tools-0.2.0/ml_analytics_tools.egg-info/requires.txt +25 -0
ml_analytics_tools-0.2.0/ml_analytics_tools.egg-info/top_level.txt +1 -0
ml_analytics_tools-0.2.0/pyproject.toml +147 -0
ml_analytics_tools-0.2.0/setup.cfg +4 -0
ml_analytics_tools-0.2.0/tests/test_aws_auth.py +133 -0
ml_analytics_tools-0.2.0/tests/test_db_s3.py +554 -0
ml_analytics_tools-0.2.0/tests/test_gsheet_connector.py +982 -0
ml_analytics_tools-0.2.0/tests/test_identity_column.py +246 -0
ml_analytics_tools-0.2.0/tests/test_model_manager.py +64 -0
ml_analytics_tools-0.2.0/tests/test_model_tools.py +304 -0
ml_analytics_tools-0.2.0/tests/test_s3_redshift_validation.py +297 -0
ml_analytics_tools-0.2.0/tests/test_tunnel_manager.py +289 -0
ml_analytics_tools-0.2.0/tests/test_utils.py +414 -0

ml_analytics_tools-0.2.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2025 Sebastian Daza
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

ml_analytics_tools-0.2.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,231 @@
+Metadata-Version: 2.4
+Name: ml-analytics-tools
+Version: 0.2.0
+Summary: Tools for ML projects and data management
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: boto3>=1.37.24
+Requires-Dist: catboost>=1.2.8
+Requires-Dist: ddtrace>=3.4.1
+Requires-Dist: dotenv>=0.9.9
+Requires-Dist: duckdb>=1.4.1
+Requires-Dist: google-api-python-client>=2.150.0
+Requires-Dist: google-auth>=2.35.0
+Requires-Dist: google-auth-httplib2>=0.2.0
+Requires-Dist: google-auth-oauthlib>=1.2.0
+Requires-Dist: ipykernel>=6.29.5
+Requires-Dist: lifelines>=0.30.3
+Requires-Dist: mlflow==3.10.1
+Requires-Dist: mlflow[auth]==3.10.1
+Requires-Dist: pip>=25.3
+Requires-Dist: polars==1.30.0
+Requires-Dist: pytest>=8.3.5
+Requires-Dist: pyyaml>=6.0.2
+Requires-Dist: redshift-connector>=2.1.9
+Requires-Dist: ruff>=0.11.4
+Requires-Dist: schedule>=1.2.2
+Requires-Dist: scikit-learn==1.5.2
+Requires-Dist: seaborn>=0.13.2
+Requires-Dist: setuptools>=42.0.0
+Requires-Dist: shap>=0.47.2
+Requires-Dist: slack-sdk>=3.27.0
+Dynamic: license-file
+# ML Analytics Tools
+Utilities for common analytics and machine learning workflows: Redshift, S3,
+Google Sheets, Slack, MLflow, model evaluation, and SQL pipelines.
+The package is intentionally infrastructure-neutral. Buckets, credentials,
+MLflow hosts, and tokens are provided by your environment or by explicit
+arguments.
+## What Is Included
+- `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
+- `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
+- `GSheet`: read, write, share, and export Google Sheets data.
+- `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
+- `ModelManager`: create MLflow experiments, log models, register versions, manage aliases, and handle permissions.
+- `model_tools`: classification, regression, survival analysis, CatBoost helpers, plotting, and reporting utilities.
+- `utils`: project-root discovery, SQL file loading, logging, credentials, and YAML SQL pipelines.
+## Install
+From PyPI, after a release is available:
+```bash
+uv add ml-analytics-tools
+```
+Directly from GitHub:
+```bash
+uv add git+https://github.com/sdaza/ml-analytics-tools
+```
+For local development:
+```bash
+uv sync --all-groups
+```
+## Configuration
+The package loads a `.env` file from the project root when it is imported.
+Only configure the services you use.
+```bash
+# Redshift
+BI_REDSHIFT_HOST=redshift-cluster.example.com
+BI_REDSHIFT_DB=analytics
+BI_REDSHIFT_USER=analytics_user
+BI_REDSHIFT_PASSWORD=secret
+BI_REDSHIFT_PORT=5439
+# S3
+ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
+# MLflow
+MLFLOW_TRACKING_URI=https://mlflow.example.com
+MLFLOW_TRACKING_USERNAME=user@example.com
+MLFLOW_TRACKING_PASSWORD=secret
+# Google Sheets
+GSHEET_SPREADSHEET_ID=optional-default-sheet-id
+GOOGLE_CREDENTIALS='{"type":"service_account", ...}'
+# Slack
+SLACK_BOT_TOKEN=xoxb-your-token
+```
+S3 buckets are never hard-coded. Pass `bucket=...` or `s3_bucket=...`, or set
+`ML_ANALYTICS_S3_BUCKET`.
+## AWS Authentication
+Use the CLI helper for AWS SSO:
+```bash
+ml-analytics-auth
+```
+You can also call it from Python:
+```python
+from ml_analytics import ensure_aws_authenticated
+ensure_aws_authenticated()
+```
+See [AWS Authentication](docs/AWS_AUTHENTICATION.md) and
+[CLI Commands](docs/CLI_COMMANDS.md) for details.
+## Quick Examples
+### Query Redshift
+```python
+from ml_analytics import DataConnector
+dc = DataConnector()
+df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
+df_polars = dc.sql("queries/features.sql", format="polars", country="es")
+```
+### Create A Redshift Table From A DataFrame
+```python
+dc.create_table_from_dataframe(
+    df,
+    table="model_scores",
+    schema="analytics",
+    drop_existing_table=True,
+)
+```
+### Work With S3
+```python
+from ml_analytics import S3Connector
+s3 = S3Connector(bucket="my-analytics-bucket", s3_root="projects/churn")
+s3.save_dataframe(df, directory="outputs", file_name="scores")
+summary = s3.query(
+    """
+    SELECT segment, count(*) AS rows
+    FROM read_parquet('s3://my-analytics-bucket/projects/churn/outputs/*.parquet')
+    GROUP BY segment
+    """
+)
+```
+### Read And Write Google Sheets
+```python
+from ml_analytics import GSheet
+gsheet = GSheet(credentials_path="gsheet_credentials.json")
+df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
+gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
+```
+### Log To MLflow
+```python
+from ml_analytics import ModelManager
+manager = ModelManager(model_name="churn-model", user="user@example.com")
+manager.start_run("training")
+manager.log_metric("auc", 0.91)
+manager.end_run()
+```
+### Send A Slack Message
+```python
+from ml_analytics import SlackConnector
+slack = SlackConnector()
+slack.send_message(channel="#ml-alerts", text="Training finished")
+```
+## Detailed Guides
+| Guide | Use It For |
+| --- | --- |
+| [AWS Authentication](docs/AWS_AUTHENTICATION.md) | AWS SSO setup and Python helpers |
+| [CLI Commands](docs/CLI_COMMANDS.md) | Available console commands |
+| [Google Sheets](docs/GSHEET_CONNECTOR_USAGE.md) | Sheets setup, sharing, exports, and examples |
+| [Slack](docs/SLACK_CONNECTOR_USAGE.md) | Slack token setup and message/file examples |
+| [Tunnel Manager](docs/TUNNEL_MANAGER.md) | SSH tunnel configuration and CLI usage |
+## Development
+Run the standard checks before opening a PR:
+```bash
+uv run ruff check
+uv run pytest
+```
+CI runs Ruff and pytest on Python 3.11 and 3.12.
+## Releases
+This repository uses Release Please. Conventional commits on `main` create or
+update a release PR with the next version and changelog. When that PR is merged,
+the release workflow builds the package and publishes it to PyPI through Trusted
+Publishing using the `pypi` GitHub environment.
+## Contributing
+Keep changes small, covered by tests when behavior changes, and free of
+environment-specific defaults. Prefer explicit configuration over hidden
+infrastructure assumptions.

ml_analytics_tools-0.2.0/README.md ADDED Viewed

@@ -0,0 +1,197 @@
+# ML Analytics Tools
+Utilities for common analytics and machine learning workflows: Redshift, S3,
+Google Sheets, Slack, MLflow, model evaluation, and SQL pipelines.
+The package is intentionally infrastructure-neutral. Buckets, credentials,
+MLflow hosts, and tokens are provided by your environment or by explicit
+arguments.
+## What Is Included
+- `DataConnector`: run Redshift SQL, load SQL files, unload/load data through S3, and create Redshift tables from DataFrames.
+- `S3Connector`: read, write, list, delete, and query S3 data with DuckDB.
+- `GSheet`: read, write, share, and export Google Sheets data.
+- `SlackConnector`: send messages, upload files, and manage simple Slack interactions.
+- `ModelManager`: create MLflow experiments, log models, register versions, manage aliases, and handle permissions.
+- `model_tools`: classification, regression, survival analysis, CatBoost helpers, plotting, and reporting utilities.
+- `utils`: project-root discovery, SQL file loading, logging, credentials, and YAML SQL pipelines.
+## Install
+From PyPI, after a release is available:
+```bash
+uv add ml-analytics-tools
+```
+Directly from GitHub:
+```bash
+uv add git+https://github.com/sdaza/ml-analytics-tools
+```
+For local development:
+```bash
+uv sync --all-groups
+```
+## Configuration
+The package loads a `.env` file from the project root when it is imported.
+Only configure the services you use.
+```bash
+# Redshift
+BI_REDSHIFT_HOST=redshift-cluster.example.com
+BI_REDSHIFT_DB=analytics
+BI_REDSHIFT_USER=analytics_user
+BI_REDSHIFT_PASSWORD=secret
+BI_REDSHIFT_PORT=5439
+# S3
+ML_ANALYTICS_S3_BUCKET=my-analytics-bucket
+# MLflow
+MLFLOW_TRACKING_URI=https://mlflow.example.com
+MLFLOW_TRACKING_USERNAME=user@example.com
+MLFLOW_TRACKING_PASSWORD=secret
+# Google Sheets
+GSHEET_SPREADSHEET_ID=optional-default-sheet-id
+GOOGLE_CREDENTIALS='{"type":"service_account", ...}'
+# Slack
+SLACK_BOT_TOKEN=xoxb-your-token
+```
+S3 buckets are never hard-coded. Pass `bucket=...` or `s3_bucket=...`, or set
+`ML_ANALYTICS_S3_BUCKET`.
+## AWS Authentication
+Use the CLI helper for AWS SSO:
+```bash
+ml-analytics-auth
+```
+You can also call it from Python:
+```python
+from ml_analytics import ensure_aws_authenticated
+ensure_aws_authenticated()
+```
+See [AWS Authentication](docs/AWS_AUTHENTICATION.md) and
+[CLI Commands](docs/CLI_COMMANDS.md) for details.
+## Quick Examples
+### Query Redshift
+```python
+from ml_analytics import DataConnector
+dc = DataConnector()
+df = dc.sql("SELECT * FROM analytics.customer_features LIMIT 100")
+df_polars = dc.sql("queries/features.sql", format="polars", country="es")
+```
+### Create A Redshift Table From A DataFrame
+```python
+dc.create_table_from_dataframe(
+    df,
+    table="model_scores",
+    schema="analytics",
+    drop_existing_table=True,
+)
+```
+### Work With S3
+```python
+from ml_analytics import S3Connector
+s3 = S3Connector(bucket="my-analytics-bucket", s3_root="projects/churn")
+s3.save_dataframe(df, directory="outputs", file_name="scores")
+summary = s3.query(
+    """
+    SELECT segment, count(*) AS rows
+    FROM read_parquet('s3://my-analytics-bucket/projects/churn/outputs/*.parquet')
+    GROUP BY segment
+    """
+)
+```
+### Read And Write Google Sheets
+```python
+from ml_analytics import GSheet
+gsheet = GSheet(credentials_path="gsheet_credentials.json")
+df = gsheet.read_sheet(spreadsheet_id="...", sheet_name="Input")
+gsheet.write_sheet(df, spreadsheet_id="...", sheet_name="Results")
+```
+### Log To MLflow
+```python
+from ml_analytics import ModelManager
+manager = ModelManager(model_name="churn-model", user="user@example.com")
+manager.start_run("training")
+manager.log_metric("auc", 0.91)
+manager.end_run()
+```
+### Send A Slack Message
+```python
+from ml_analytics import SlackConnector
+slack = SlackConnector()
+slack.send_message(channel="#ml-alerts", text="Training finished")
+```
+## Detailed Guides
+| Guide | Use It For |
+| --- | --- |
+| [AWS Authentication](docs/AWS_AUTHENTICATION.md) | AWS SSO setup and Python helpers |
+| [CLI Commands](docs/CLI_COMMANDS.md) | Available console commands |
+| [Google Sheets](docs/GSHEET_CONNECTOR_USAGE.md) | Sheets setup, sharing, exports, and examples |
+| [Slack](docs/SLACK_CONNECTOR_USAGE.md) | Slack token setup and message/file examples |
+| [Tunnel Manager](docs/TUNNEL_MANAGER.md) | SSH tunnel configuration and CLI usage |
+## Development
+Run the standard checks before opening a PR:
+```bash
+uv run ruff check
+uv run pytest
+```
+CI runs Ruff and pytest on Python 3.11 and 3.12.
+## Releases
+This repository uses Release Please. Conventional commits on `main` create or
+update a release PR with the next version and changelog. When that PR is merged,
+the release workflow builds the package and publishes it to PyPI through Trusted
+Publishing using the `pypi` GitHub environment.
+## Contributing
+Keep changes small, covered by tests when behavior changes, and free of
+environment-specific defaults. Prefer explicit configuration over hidden
+infrastructure assumptions.

ml_analytics_tools-0.2.0/ml_analytics/__init__.py ADDED Viewed

@@ -0,0 +1,53 @@
+"""
+ML Analytics Tools Package
+"""
+from dotenv import load_dotenv
+from .aws_auth import ensure_aws_authenticated, ensure_aws_sso_login
+from .data_connector import DataConnector
+from .gsheet_connector import GSheet
+from .model_manager import ModelManager
+from .s3_connector import S3Connector
+from .slack_connector import SlackConnector
+from .utils import (
+    execute_sql_scripts,
+    find_project_root,
+    get_credential_value,
+    get_logger,
+    get_sql_files,
+    load_sql_query,
+    log_and_raise_error,
+)
+# Automatically load .env file when the package is imported
+logger = get_logger("ml_analytics")
+try:
+    project_root = find_project_root()
+    env_file = project_root / ".env"
+    if env_file.exists():
+        if load_dotenv(env_file, override=True):
+            logger.info(".env file loaded successfully.")
+        else:
+            logger.info("Failed to load .env file.")
+    else:
+        logger.info("No .env file present in project root.")
+except Exception:
+    logger.info("No .env file loaded.")
+# Public API: the names exported by `from ml_analytics import *`. Every entry
+# is imported at the top of this module.
+__all__ = [
+    "DataConnector",
+    "ensure_aws_authenticated",
+    "ensure_aws_sso_login",
+    "execute_sql_scripts",
+    "find_project_root",
+    "get_credential_value",
+    "get_logger",
+    "get_sql_files",
+    "GSheet",
+    "load_sql_query",
+    "log_and_raise_error",
+    "ModelManager",
+    "S3Connector",
+    "SlackConnector",
+]

ml_analytics_tools-0.2.0/ml_analytics/aws_auth.py ADDED Viewed

@@ -0,0 +1,169 @@
+"""
+AWS authentication utilities.
+"""
+import subprocess
+import sys
+from .utils import get_logger
+logger = get_logger("aws_auth")
+def _do_sso_login(profile: str = None) -> bool:
+    """
+    Performs the interactive SSO login flow.
+    Parameters
+    ----------
+    profile : str, optional
+        AWS profile name to use.
+    Returns
+    -------
+    bool
+        True if login successful, False otherwise.
+    """
+    try:
+        logger.info("AWS SSO login required - starting authentication...")
+        login_cmd = ["aws", "sso", "login"]
+        if profile:
+            login_cmd.extend(["--profile", profile])
+        # Redirect stdout to stderr so user sees prompts but eval doesn't execute them
+        login_result = subprocess.run(login_cmd, stdout=sys.stderr, timeout=300)
+        if login_result.returncode == 0:
+            logger.info("✓ AWS SSO login successful")
+            return True
+        else:
+            logger.error("✗ AWS SSO login failed")
+            return False
+    except subprocess.TimeoutExpired:
+        logger.error("AWS SSO login timed out")
+        return False
+    except FileNotFoundError:
+        logger.error("AWS CLI not found. Please install AWS CLI.")
+        return False
+    except Exception as e:
+        logger.error(f"Error during AWS SSO login: {e}")
+        return False
+def ensure_aws_sso_login(profile: str = None, force: bool = False) -> bool:
+    """
+    Ensures AWS SSO is authenticated. If not, prompts user to login.
+    Parameters
+    ----------
+    profile : str, optional
+        AWS profile name to use. If None, uses default profile.
+    force : bool, optional
+        If True, skip the cached credential check and force a fresh SSO login.
+    Returns
+    -------
+    bool
+        True if authenticated successfully, False otherwise.
+    """
+    if force:
+        return _do_sso_login(profile)
+    try:
+        # Check if already logged in by attempting to get caller identity
+        cmd = ["aws", "sts", "get-caller-identity"]
+        if profile:
+            cmd.extend(["--profile", profile])
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+        if result.returncode == 0:
+            # Already authenticated - don't log to reduce noise
+            return True
+        # Not logged in, attempt SSO login
+        return _do_sso_login(profile)
+    except subprocess.TimeoutExpired:
+        logger.error("AWS SSO login timed out")
+        return False
+    except FileNotFoundError:
+        logger.error("AWS CLI not found. Please install AWS CLI.")
+        return False
+    except Exception as e:
+        logger.error(f"Error during AWS SSO login: {e}")
+        return False
+def ensure_aws_authenticated(sso_profile: str = None, print_exports: bool = False) -> bool:
+    """
+    Convenience function that ensures AWS SSO is authenticated.
+    Parameters
+    ----------
+    sso_profile : str, optional
+        AWS SSO profile to use
+    print_exports : bool, optional
+        Kept for backward-compatible CLI calls. No shell exports are required.
+    Returns
+    -------
+    bool
+        True if AWS SSO authentication succeeded, False otherwise.
+    Example
+    -------
+    >>> from ml_analytics.aws_auth import ensure_aws_authenticated
+    >>> ensure_aws_authenticated()
+    """
+    del print_exports
+    logger.info("Ensuring AWS authentication...")
+    if not ensure_aws_sso_login(sso_profile):
+        return False
+    logger.info("✓ AWS authentication complete")
+    return True
+def run_uv_command(command: str) -> bool:
+    """
+    Runs a UV command and returns whether it succeeded.
+    Parameters
+    ----------
+    command : str
+        The UV command to run (e.g., "uv sync", "uv add package")
+    Returns
+    -------
+    bool
+        True if the command executed successfully, False otherwise.
+    Example
+    -------
+    >>> from ml_analytics.aws_auth import run_uv_command
+    >>> run_uv_command("uv sync")
+    """
+    try:
+        logger.info(f"Running UV command: {command}")
+        result = subprocess.run(command, shell=True, capture_output=True, text=True, timeout=300)
+        if result.returncode == 0:
+            logger.info("✓ UV command completed successfully")
+            if result.stdout:
+                print(result.stdout)
+            return True
+        else:
+            logger.error(f"✗ UV command failed: {result.stderr}")
+            if result.stderr:
+                print(result.stderr)
+            return False
+    except subprocess.TimeoutExpired:
+        logger.error(f"UV command timed out: {command}")
+        return False
+    except Exception as e:
+        logger.error(f"Error running UV command '{command}': {e}")
+        return False