datatoolpack 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
autodata/__init__.py
ADDED
autodata/client.py
ADDED
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
"""
|
|
2
|
+
AutoData Python Client
|
|
3
|
+
======================
|
|
4
|
+
Official Python SDK for the AutoData ML data preparation pipeline API.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from autodata import AutoDataClient
|
|
8
|
+
|
|
9
|
+
client = AutoDataClient(api_key="dtpk_...", base_url="https://autodata.datatoolpack.com")
|
|
10
|
+
|
|
11
|
+
result = client.process(
|
|
12
|
+
file_path="data.csv",
|
|
13
|
+
target_columns=["price"],
|
|
14
|
+
output_rows=20000,
|
|
15
|
+
)
|
|
16
|
+
print(result["files"])
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
import io
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import time
|
|
23
|
+
import zipfile
|
|
24
|
+
from typing import Dict, List, Optional, Union
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
try:
|
|
28
|
+
import requests
|
|
29
|
+
except ImportError as exc: # pragma: no cover
|
|
30
|
+
raise ImportError("The 'requests' package is required. Install with: pip install requests") from exc
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class AutoDataError(Exception):
    """Error raised when the AutoData server reports a failure.

    The exception message is the human-readable error text returned by
    (or derived from) the server response.

    Attributes:
        status_code: HTTP status code of the failing response, or ``None``
            for errors that did not originate from an HTTP response.
    """

    def __init__(self, message: str, status_code: Optional[int] = None):
        super().__init__(message)
        self.status_code = status_code
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class AutoDataClient:
    """
    Client for the AutoData REST API v1.

    All endpoints authenticate with an API key issued from the AutoData dashboard.
    Pass the key as the ``api_key`` argument — it is sent as a Bearer token.

    A single ``requests.Session`` is reused for every call (connection pooling).
    Call :meth:`close` — or use the client as a context manager — to release
    the pooled connections when you are done.

    Args:
        api_key: API key string starting with ``dtpk_``.
        base_url: Base URL of the AutoData server (no trailing slash).
        timeout: HTTP request timeout in seconds (default 120).
    """

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://autodata.datatoolpack.com",
        timeout: int = 120,
    ):
        if not api_key or not api_key.startswith("dtpk_"):
            raise ValueError("api_key must start with 'dtpk_'. Get yours from the AutoData dashboard.")
        self.api_key = api_key
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self._session = requests.Session()
        self._session.headers.update({"Authorization": f"Bearer {api_key}"})

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------

    def close(self) -> None:
        """Release the pooled HTTP connections held by the underlying session."""
        self._session.close()

    def __enter__(self) -> "AutoDataClient":
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
        self.close()

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _url(self, path: str) -> str:
        """Return the absolute URL for an API path under ``/api/v1``."""
        return f"{self.base_url}/api/v1{path}"

    def _raise_for_error(self, response: requests.Response) -> None:
        """Raise :class:`AutoDataError` for any non-2xx response.

        Prefers the server's JSON ``error`` field; falls back to the raw
        body, then to a generic ``HTTP <code>`` message.
        """
        if not response.ok:
            try:
                msg = response.json().get("error", response.text)
            except Exception:
                # Body was not JSON (e.g. an HTML error page or an empty body).
                msg = response.text or f"HTTP {response.status_code}"
            raise AutoDataError(msg, status_code=response.status_code)

    def _stream_to_file(self, url: str, output_path: str) -> None:
        """Stream ``url`` (absolute or server-relative) into ``output_path``.

        Shared by :meth:`download_results` and :meth:`download_file` so the
        URL normalization and chunked-write logic live in one place.
        """
        if not url.startswith("http"):
            url = f"{self.base_url}{url}"
        # File downloads can be large; allow at least 5 minutes regardless
        # of the configured request timeout.
        with self._session.get(url, stream=True, timeout=max(self.timeout, 300)) as resp:
            resp.raise_for_status()
            with open(output_path, "wb") as fh:
                for chunk in resp.iter_content(chunk_size=65536):
                    fh.write(chunk)

    # ------------------------------------------------------------------
    # Core pipeline methods
    # ------------------------------------------------------------------

    def process(
        self,
        file_path: str,
        target_columns: Union[str, List[str]],
        output_rows: int = 10000,
        tools: Optional[Dict[str, bool]] = None,
        advanced_params: Optional[Dict] = None,
        wait: bool = True,
        poll_interval: int = 2,
        download_path: Optional[str] = None,
        output_preferences: Optional[List[str]] = None,
        compressed: bool = True,
    ) -> Dict:
        """
        Upload a CSV file and start the AutoData pipeline.

        Args:
            file_path: Path to the input CSV file.
            target_columns: Target column name(s) for ML (y-columns).
            output_rows: Desired number of rows in the output dataset.
            tools: Dict of tool toggles. Keys: anomaly, dtc, mdh,
                   dor, cds, dsm, dsg. Default is all standard
                   tools enabled (anomaly/dor off by default).
            advanced_params: Fine-grained parameters:
                excluded_columns (list[str])
                text_mode (int: 0=none, 1=neural, 2=tfidf)
                text_cleaning (bool)
                zscore_limit (float, default 3.0)
                dsg_mode (str: 'copula'|'gan')
                similarity_p (float, default 95)
            wait: If True (default), block until processing
                  completes and download results automatically.
            poll_interval: Seconds between status polls (default 2).
            download_path: Directory to save results. Defaults to
                           ``./auto_data_outputs/<session_id>/``. Pass the
                           sentinel ``False`` to skip the automatic download
                           while still waiting for completion.
            output_preferences: Subset of filenames to download. Downloads
                                all files when None.
            compressed: Download as a ZIP archive (default True).

        Returns:
            Dict with session_id, status, and files list.

        Raises:
            FileNotFoundError: If file_path does not exist.
            AutoDataError: On API errors.
        """
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"File not found: {file_path}")

        config = {
            # The API always expects a list of target columns.
            "target_columns": [target_columns] if isinstance(target_columns, str) else target_columns,
            "output_rows": output_rows,
            "tools": tools or {},
            "advanced_params": advanced_params or {},
        }

        with open(file_path, "rb") as f:
            response = self._session.post(
                self._url("/process"),
                files={"file": (os.path.basename(file_path), f, "text/csv")},
                data={"config": json.dumps(config)},
                timeout=self.timeout,
            )
        self._raise_for_error(response)
        result = response.json()
        session_id = result["session_id"]

        if not wait:
            return result

        try:
            final = self.wait_for_completion(session_id, poll_interval=poll_interval)
        except KeyboardInterrupt:
            # Best effort: don't leave an orphaned job running on the server.
            print("\nInterrupted — cancelling job on server…")
            self.cancel(session_id)
            raise

        # ``download_path is not False`` (not a truthiness check): None means
        # "use the default directory", while the explicit sentinel False means
        # "skip the automatic download entirely".
        if download_path is not False:
            self.download_results(
                session_id,
                download_path=download_path,
                output_preferences=output_preferences,
                compressed=compressed,
            )
        return final

    def get_status(self, session_id: str) -> Dict:
        """
        Poll the processing status of a session.

        Returns a dict with keys: status, message, current_step, total_steps,
        progress_percent, duration_seconds.
        """
        r = self._session.get(self._url(f"/status/{session_id}"), timeout=self.timeout)
        self._raise_for_error(r)
        return r.json()

    def get_result(self, session_id: str) -> Dict:
        """
        Retrieve the final results of a completed session.

        Returns a dict with keys: status, files (list of {name, url, size, description}),
        row_count, duration_seconds.
        """
        r = self._session.get(self._url(f"/result/{session_id}"), timeout=self.timeout)
        self._raise_for_error(r)
        return r.json()

    def cancel(self, session_id: str) -> bool:
        """
        Cancel a running or queued session.

        Returns True if cancellation was acknowledged. Never raises on API
        failure — a warning is printed instead, since cancel is typically
        called from cleanup paths (e.g. after KeyboardInterrupt).
        """
        try:
            r = self._session.post(self._url(f"/cancel/{session_id}"), timeout=self.timeout)
            self._raise_for_error(r)
            return r.json().get("cancelled", False)
        except AutoDataError as e:
            print(f"Warning: cancel failed — {e}")
            return False

    def wait_for_completion(self, session_id: str, poll_interval: int = 2) -> Dict:
        """
        Block until a session reaches *completed*, *error*, or *cancelled*.

        Prints progress updates to stdout. Raises ``AutoDataError`` on failure.
        """
        print(f"Waiting for session {session_id}…")
        while True:
            data = self.get_status(session_id)
            status = data.get("status", "unknown")
            pct = data.get("progress_percent", 0)
            msg = data.get("message", "")

            if status == "completed":
                print(f"\r✓ Completed ({pct}%): {msg}                    ")
                return self.get_result(session_id)
            elif status == "error":
                raise AutoDataError(f"Processing failed: {msg}")
            elif status == "cancelled":
                raise AutoDataError("Processing was cancelled")

            # int() guards against a float progress_percent, which would make
            # the ``:3d`` format spec raise ValueError.
            print(f"\r  {int(pct):3d}% — {msg[:60]:<60}", end="", flush=True)
            time.sleep(poll_interval)

    # ------------------------------------------------------------------
    # File download helpers
    # ------------------------------------------------------------------

    def download_results(
        self,
        session_id: str,
        download_path: Optional[str] = None,
        output_preferences: Optional[List[str]] = None,
        compressed: bool = True,
    ) -> str:
        """
        Download result files for a completed session.

        Args:
            session_id: Session to download.
            download_path: Directory to save files (created if missing).
            output_preferences: Subset of filenames to include. None = all.
            compressed: If True (default), downloads a single ZIP archive
                        and extracts it. If False, downloads each file
                        individually.

        Returns:
            Absolute path to the download directory.

        Raises:
            AutoDataError: If the server returns an error or an invalid archive.
        """
        if not download_path:
            download_path = os.path.join(os.getcwd(), "auto_data_outputs", session_id)
        os.makedirs(download_path, exist_ok=True)

        if compressed:
            body = {}
            if output_preferences:
                body["files"] = output_preferences
            r = self._session.post(
                self._url(f"/download-archive/{session_id}"),
                json=body,
                # Archives can be large; allow at least 5 minutes.
                timeout=max(self.timeout, 300),
                stream=True,
            )
            self._raise_for_error(r)
            try:
                # NOTE(review): extractall trusts archive member names; a
                # malicious server could attempt path traversal ("zip slip").
                # The server is first-party here, but worth hardening.
                with zipfile.ZipFile(io.BytesIO(r.content)) as archive:
                    archive.extractall(download_path)
                print(f"Extracted results to {download_path}")
            except zipfile.BadZipFile as exc:
                raise AutoDataError("Server returned an invalid ZIP archive") from exc
        else:
            result = self.get_result(session_id)
            for f in result.get("files", []):
                fname = f["name"]
                if output_preferences and fname not in output_preferences:
                    continue
                self._stream_to_file(f["url"], os.path.join(download_path, fname))
                print(f"  Downloaded {fname}")

        return os.path.abspath(download_path)

    def download_file(self, url: str, output_path: str) -> None:
        """Download a single file by URL (absolute or server-relative)."""
        self._stream_to_file(url, output_path)
        print(f"Downloaded to {output_path}")

    # ------------------------------------------------------------------
    # Account / API key management
    # ------------------------------------------------------------------

    def list_keys(self) -> List[Dict]:
        """
        List all active API keys for the authenticated account.

        Returns a list of key metadata dicts (no secret values exposed).
        """
        r = self._session.get(self._url("/keys"), timeout=self.timeout)
        self._raise_for_error(r)
        return r.json().get("api_keys", [])

    def get_usage(self) -> Dict:
        """
        Get credit usage statistics for the current API key.

        Returns a dict with:
            daily_credits_used, daily_credit_limit, daily_remaining,
            lifetime_credits_used, lifetime_credit_limit, lifetime_remaining,
            daily_request_count, last_used_at.
        """
        r = self._session.get(self._url("/usage"), timeout=self.timeout)
        self._raise_for_error(r)
        return r.json()

    # ------------------------------------------------------------------
    # Deprecated aliases (backward compat)
    # ------------------------------------------------------------------

    def get_status_deprecated(self, session_id: str) -> Dict:  # noqa: D401
        """Deprecated alias for get_status()."""
        return self.get_status(session_id)

    def cancel_session(self, session_id: str) -> bool:
        """Deprecated alias for cancel(). Use cancel() instead."""
        return self.cancel(session_id)
|
|
@@ -0,0 +1,299 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datatoolpack
|
|
3
|
+
Version: 0.2.0
|
|
4
|
+
Summary: Official Python SDK for the AutoData ML data preparation pipeline API
|
|
5
|
+
Home-page: https://autodata.datatoolpack.com
|
|
6
|
+
Author: AutoData Team
|
|
7
|
+
Author-email: support@datatoolpack.com
|
|
8
|
+
Project-URL: Documentation, https://autodata.datatoolpack.com/docs
|
|
9
|
+
Project-URL: Bug Tracker, https://github.com/datatoolpack/autodata-client/issues
|
|
10
|
+
Keywords: autodata machine-learning data-preparation synthetic-data ml-pipeline
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Requires-Python: >=3.8
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
Requires-Dist: requests>=2.25.0
|
|
26
|
+
Dynamic: author
|
|
27
|
+
Dynamic: author-email
|
|
28
|
+
Dynamic: classifier
|
|
29
|
+
Dynamic: description
|
|
30
|
+
Dynamic: description-content-type
|
|
31
|
+
Dynamic: home-page
|
|
32
|
+
Dynamic: keywords
|
|
33
|
+
Dynamic: project-url
|
|
34
|
+
Dynamic: requires-dist
|
|
35
|
+
Dynamic: requires-python
|
|
36
|
+
Dynamic: summary
|
|
37
|
+
|
|
38
|
+
# AutoData Python Client
|
|
39
|
+
|
|
40
|
+
Official Python SDK for the [AutoData](https://autodata.datatoolpack.com) ML data preparation pipeline API.
|
|
41
|
+
|
|
42
|
+
## Installation
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install datatoolpack
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
Or install from source:
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
git clone https://github.com/datatoolpack/autodata-client
|
|
52
|
+
cd autodata-client
|
|
53
|
+
pip install .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Quick Start
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from autodata import AutoDataClient
|
|
60
|
+
|
|
61
|
+
client = AutoDataClient(
|
|
62
|
+
api_key="dtpk_YOUR_API_KEY",
|
|
63
|
+
base_url="https://autodata.datatoolpack.com",
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
result = client.process(
|
|
67
|
+
file_path="data.csv",
|
|
68
|
+
target_columns=["price"],
|
|
69
|
+
output_rows=20000,
|
|
70
|
+
)
|
|
71
|
+
print(result["files"])
|
|
72
|
+
# [{'name': 'dsg.csv', 'url': '/download/...', 'size': 1048576, 'description': '...'}]
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Get your API key from the [AutoData dashboard](https://autodata.datatoolpack.com/dashboard) → API Keys tab.
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
## Reference
|
|
80
|
+
|
|
81
|
+
### `AutoDataClient(api_key, base_url, timeout)`
|
|
82
|
+
|
|
83
|
+
| Parameter | Type | Default | Description |
|
|
84
|
+
|------------|-------|--------------------------------------|------------------------------------|
|
|
85
|
+
| `api_key` | `str` | required | API key starting with `dtpk_` |
|
|
86
|
+
| `base_url` | `str` | `"https://autodata.datatoolpack.com"` | Server URL (no trailing slash) |
|
|
87
|
+
| `timeout` | `int` | `120` | Request timeout in seconds |
|
|
88
|
+
|
|
89
|
+
---
|
|
90
|
+
|
|
91
|
+
### `client.process(...)` — Upload & run pipeline
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
result = client.process(
|
|
95
|
+
file_path="data.csv", # Path to input CSV
|
|
96
|
+
target_columns=["price"], # y-column(s) for ML
|
|
97
|
+
output_rows=20000, # Target row count in output
|
|
98
|
+
tools={ # Toggle pipeline steps (all optional)
|
|
99
|
+
"anomaly": False, # Anomaly detection (off by default)
|
|
100
|
+
"dtc": True, # Data Type Conversion
|
|
101
|
+
"mdh": True, # Missing Data Handler
|
|
102
|
+
"cds": True, # Column Scaling
|
|
103
|
+
"dsm": True, # Data Split Manager
|
|
104
|
+
"dsg": True, # Synthetic Data Generator
|
|
105
|
+
},
|
|
106
|
+
advanced_params={ # Fine-grained parameters (all optional)
|
|
107
|
+
"excluded_columns": ["id"], # Columns to drop before processing
|
|
108
|
+
"text_mode": 0, # 0=none, 1=neural, 2=tfidf
|
|
109
|
+
"text_cleaning": True, # Clean text before encoding
|
|
110
|
+
"zscore_limit": 3.0, # Z-score outlier threshold
|
|
111
|
+
"dsg_mode": "copula", # "copula" or "gan"
|
|
112
|
+
"similarity_p": 95, # Similarity percentile for DSG
|
|
113
|
+
},
|
|
114
|
+
wait=True, # Block until complete (default True)
|
|
115
|
+
poll_interval=2, # Status poll interval in seconds
|
|
116
|
+
download_path="./outputs/", # Where to save files (default auto)
|
|
117
|
+
output_preferences=["dsg.csv"], # Which files to download (default all)
|
|
118
|
+
compressed=True, # Download as ZIP (default True)
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
**Returns** a dict:
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
{
|
|
126
|
+
"session_id": "abc123...",
|
|
127
|
+
"status": "completed",
|
|
128
|
+
"files": [
|
|
129
|
+
{"name": "dsg.csv", "url": "/download/.../dsg.csv", "size": 2097152, "description": "Synthetic data"},
|
|
130
|
+
{"name": "dsm_train.csv", ...},
|
|
131
|
+
...
|
|
132
|
+
],
|
|
133
|
+
"row_count": 20000,
|
|
134
|
+
"duration_seconds": 42.1,
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
Set `wait=False` to get back immediately with just `session_id` and `status`:
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
result = client.process(file_path="data.csv", target_columns="price", wait=False)
|
|
142
|
+
session_id = result["session_id"]
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
### `client.get_status(session_id)` — Poll progress
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
status = client.get_status(session_id)
|
|
151
|
+
# {
|
|
152
|
+
# "status": "running", # queued | running | completed | error | cancelled
|
|
153
|
+
# "message": "Running MDH...",
|
|
154
|
+
# "current_step": 3,
|
|
155
|
+
# "total_steps": 6,
|
|
156
|
+
# "progress_percent": 50,
|
|
157
|
+
# "duration_seconds": 15.3,
|
|
158
|
+
# }
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
### `client.get_result(session_id)` — Fetch completed results
|
|
164
|
+
|
|
165
|
+
```python
|
|
166
|
+
result = client.get_result(session_id)
|
|
167
|
+
# {"status": "completed", "files": [...], "row_count": ..., "duration_seconds": ...}
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
### `client.wait_for_completion(session_id, poll_interval)` — Block until done
|
|
173
|
+
|
|
174
|
+
```python
|
|
175
|
+
result = client.wait_for_completion(session_id, poll_interval=3)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Prints live progress to stdout. Raises `AutoDataError` if processing fails.
|
|
179
|
+
|
|
180
|
+
---
|
|
181
|
+
|
|
182
|
+
### `client.cancel(session_id)` — Cancel a running job
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
cancelled = client.cancel(session_id) # True if acknowledged
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
---
|
|
189
|
+
|
|
190
|
+
### `client.download_results(session_id, ...)` — Download output files
|
|
191
|
+
|
|
192
|
+
```python
|
|
193
|
+
path = client.download_results(
|
|
194
|
+
session_id,
|
|
195
|
+
download_path="./my_outputs/", # Directory to save into
|
|
196
|
+
output_preferences=["dsg.csv"], # Specific files only (None = all)
|
|
197
|
+
compressed=True, # ZIP download (default) or individual files
|
|
198
|
+
)
|
|
199
|
+
print(f"Saved to {path}")
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
---
|
|
203
|
+
|
|
204
|
+
### `client.download_file(url, output_path)` — Download a single file
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
client.download_file("/download/abc123.../dsg.csv", "dsg.csv")
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
---
|
|
211
|
+
|
|
212
|
+
### `client.list_keys()` — List API keys
|
|
213
|
+
|
|
214
|
+
```python
|
|
215
|
+
keys = client.list_keys()
|
|
216
|
+
# [{"id": "...", "name": "My Key", "prefix": "dtpk_abc123", "created_at": "..."}]
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
---
|
|
220
|
+
|
|
221
|
+
### `client.get_usage()` — Usage statistics
|
|
222
|
+
|
|
223
|
+
```python
|
|
224
|
+
usage = client.get_usage()
|
|
225
|
+
# {
|
|
226
|
+
# "daily_credits_used": 500,
|
|
227
|
+
# "daily_credit_limit": 10000,
|
|
228
|
+
# "daily_remaining": 9500,
|
|
229
|
+
# "lifetime_credits_used": 12340,
|
|
230
|
+
# "lifetime_credit_limit": 1000000,
|
|
231
|
+
# "lifetime_remaining": 987660,
|
|
232
|
+
# "daily_request_count": 3,
|
|
233
|
+
# "last_used_at": "2026-04-12T10:30:00Z",
|
|
234
|
+
# }
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
---
|
|
238
|
+
|
|
239
|
+
## Error Handling
|
|
240
|
+
|
|
241
|
+
All API errors raise `AutoDataError`:
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
from autodata import AutoDataClient, AutoDataError
|
|
245
|
+
|
|
246
|
+
client = AutoDataClient(api_key="dtpk_...")
|
|
247
|
+
|
|
248
|
+
try:
|
|
249
|
+
result = client.process("data.csv", target_columns="price")
|
|
250
|
+
except AutoDataError as e:
|
|
251
|
+
print(f"API error {e.status_code}: {e}")
|
|
252
|
+
except FileNotFoundError as e:
|
|
253
|
+
print(f"File not found: {e}")
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
`AutoDataError` attributes:
|
|
257
|
+
- `str(e)` — human-readable error message from the server
|
|
258
|
+
- `e.status_code` — HTTP status code (e.g. `401`, `429`, `500`), or `None` for non-HTTP errors
|
|
259
|
+
|
|
260
|
+
---
|
|
261
|
+
|
|
262
|
+
## Advanced Example: Non-blocking with manual polling
|
|
263
|
+
|
|
264
|
+
```python
|
|
265
|
+
import time
|
|
266
|
+
from autodata import AutoDataClient, AutoDataError
|
|
267
|
+
|
|
268
|
+
client = AutoDataClient(api_key="dtpk_...")
|
|
269
|
+
|
|
270
|
+
# Start job without blocking
|
|
271
|
+
job = client.process("large_dataset.csv", target_columns=["churn"], wait=False)
|
|
272
|
+
session_id = job["session_id"]
|
|
273
|
+
print(f"Job started: {session_id}")
|
|
274
|
+
|
|
275
|
+
# Poll manually
|
|
276
|
+
while True:
|
|
277
|
+
status = client.get_status(session_id)
|
|
278
|
+
print(f" {status['progress_percent']}% — {status['message']}")
|
|
279
|
+
if status["status"] == "completed":
|
|
280
|
+
break
|
|
281
|
+
elif status["status"] in ("error", "cancelled"):
|
|
282
|
+
raise AutoDataError(f"Job {status['status']}: {status['message']}")
|
|
283
|
+
time.sleep(5)
|
|
284
|
+
|
|
285
|
+
# Download results
|
|
286
|
+
path = client.download_results(session_id, download_path="./outputs/")
|
|
287
|
+
print(f"Results saved to {path}")
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
---
|
|
291
|
+
|
|
292
|
+
## Requirements
|
|
293
|
+
|
|
294
|
+
- Python ≥ 3.8
|
|
295
|
+
- `requests` ≥ 2.25.0
|
|
296
|
+
|
|
297
|
+
## License
|
|
298
|
+
|
|
299
|
+
MIT
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
autodata/__init__.py,sha256=Zm0UiJk5vs65JRtDPtqoGfqvW9ns2hZF1FFkSHM7rRk,97
|
|
2
|
+
autodata/client.py,sha256=nVOrHzfNxB8weggxfQ4j9gz6caC5der_wgstw6dcM1M,13302
|
|
3
|
+
datatoolpack-0.2.0.dist-info/METADATA,sha256=Zqj2b3zCHB9QdiYa5cjp16ZrVV9Od6Ol6R674cDGMK0,8446
|
|
4
|
+
datatoolpack-0.2.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
5
|
+
datatoolpack-0.2.0.dist-info/top_level.txt,sha256=sr-rI7IXX_zHkBW6llyF3dgIU9Bg-99IY_QQXfBo_YU,9
|
|
6
|
+
datatoolpack-0.2.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
autodata
|