dataflare-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Environments
7
+ .env
8
+ .venv
9
+ env/
10
+ venv/
11
+ ENV/
12
+ env.bak/
13
+ venv.bak/
14
+
15
+ # Testing
16
+ .pytest_cache/
17
+ htmlcov/
18
+ .coverage
19
+ .tox/
20
+ coverage.xml
21
+
22
+ # Distribution / packaging
23
+ dist/
24
+ build/
25
+ *.egg-info/
26
+ .eggs/
27
+
28
+ # IDEs
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
@@ -0,0 +1,61 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataflare-sdk
3
+ Version: 0.1.0
4
+ Summary: Official Python SDK for the DataFlare API
5
+ Author-email: DataFlare Lab <oss@dataflarelab.com>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.9
8
+ Requires-Dist: httpx>=0.27.0
9
+ Requires-Dist: pydantic>=2.0.0
10
+ Requires-Dist: tenacity>=8.0.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
13
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
14
+ Requires-Dist: respx>=0.21.1; extra == 'dev'
15
+ Description-Content-Type: text/markdown
16
+
17
+ # DF API Python SDK `<a href="https://pypi.org/project/dataflare-sdk/"><img alt="PyPI" src="https://img.shields.io/pypi/v/dataflare-sdk?color=blue"></a>`
18
+
19
+ The official Python SDK for the **DataFlare API**.
20
+
21
+ ## Features
22
+ - **Typed Models:** Full Pydantic schemas for the Datasets API, enabling rich IDE autocompletion.
23
+ - **Connection Pools:** Built on `httpx`, reusing TCP connections across requests for lower latency.
24
+ - **Resilient Requests:** Automatic retries (via `tenacity`) with exponential backoff for rate limits and transient network faults.
25
+ - **Idiomatic Paginators:** `client.datasets.stream(...)` automatically handles cursor injection iteratively returning stream chunks cleanly.
26
+ - **Memory-safe Source Retrieval:** For pipelines feeding Large Language Models directly from data archives, effortlessly invoke `download_file(...)` natively chunking raw bytes down to the file system avoiding memory leaks.
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install dataflare-sdk
32
+ ```
33
+
34
+ ## Quick Start
35
+
36
+ You will need a DataFlare API Key. The easiest way is setting an environment variable:
37
+ ```bash
38
+ export DF_API_KEY="dfk_abc123"
39
+ ```
40
+
41
+ ```python
42
+ from df import DFClient, AuthenticationError
43
+
44
+ # Automatically discovers DF_API_KEY from the environment
45
+ try:
46
+ with DFClient() as client:
47
+
48
+ # Generator handles pagination constraints completely
49
+ for doc in client.datasets.stream("legal", search_term="التأمين", limit=100):
50
+ print(f"Doc ID: {doc.id} | Metadata: {doc.metadata}")
51
+
52
+ # Helper to download the raw PDF to disk natively
53
+ if doc.source_url:
54
+ client.datasets.download_file(
55
+ doc.source_url,
56
+ destination=f"./archives/{doc.id}.pdf"
57
+ )
58
+
59
+ except AuthenticationError:
60
+ print("Invalid API Key.")
61
+ ```
@@ -0,0 +1,45 @@
1
+ # DF API Python SDK `<a href="https://pypi.org/project/dataflare-sdk/"><img alt="PyPI" src="https://img.shields.io/pypi/v/dataflare-sdk?color=blue"></a>`
2
+
3
+ The official Python SDK for the **DataFlare API**.
4
+
5
+ ## Features
6
+ - **Typed Models:** Full Pydantic schemas for the Datasets API, enabling rich IDE autocompletion.
7
+ - **Connection Pools:** Built on `httpx`, reusing TCP connections across requests for lower latency.
8
+ - **Resilient Requests:** Automatic retries (via `tenacity`) with exponential backoff for rate limits and transient network faults.
9
+ - **Idiomatic Paginators:** `client.datasets.stream(...)` automatically handles cursor injection iteratively returning stream chunks cleanly.
10
+ - **Memory-safe Source Retrieval:** For pipelines feeding Large Language Models directly from data archives, effortlessly invoke `download_file(...)` natively chunking raw bytes down to the file system avoiding memory leaks.
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ pip install dataflare-sdk
16
+ ```
17
+
18
+ ## Quick Start
19
+
20
+ You will need a DataFlare API Key. The easiest way is setting an environment variable:
21
+ ```bash
22
+ export DF_API_KEY="dfk_abc123"
23
+ ```
24
+
25
+ ```python
26
+ from df import DFClient, AuthenticationError
27
+
28
+ # Automatically discovers DF_API_KEY from the environment
29
+ try:
30
+ with DFClient() as client:
31
+
32
+ # Generator handles pagination constraints completely
33
+ for doc in client.datasets.stream("legal", search_term="التأمين", limit=100):
34
+ print(f"Doc ID: {doc.id} | Metadata: {doc.metadata}")
35
+
36
+ # Helper to download the raw PDF to disk natively
37
+ if doc.source_url:
38
+ client.datasets.download_file(
39
+ doc.source_url,
40
+ destination=f"./archives/{doc.id}.pdf"
41
+ )
42
+
43
+ except AuthenticationError:
44
+ print("Invalid API Key.")
45
+ ```
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "dataflare-sdk"
7
+ version = "0.1.0"
8
+ description = "Official Python SDK for the DataFlare API"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "DataFlare Lab", email = "oss@dataflarelab.com" }
14
+ ]
15
+ dependencies = [
16
+ "httpx>=0.27.0",
17
+ "pydantic>=2.0.0",
18
+ "tenacity>=8.0.0"
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ dev = [
23
+ "pytest>=8.0.0",
24
+ "respx>=0.21.1",
25
+ "pytest-asyncio>=0.23.0"
26
+ ]
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = ["src/df"]
@@ -0,0 +1,16 @@
1
"""
DF SDK
Official Python SDK for the DataFlare API.
"""

from .client import DFClient
# DFError is the root of the SDK exception hierarchy; exporting it lets
# callers catch every SDK error with a single `except DFError`.
from .exceptions import APIError, AuthenticationError, DFError, RateLimitError

__version__ = "0.1.0"
__all__ = [
    "DFClient",
    "DFError",
    "APIError",
    "AuthenticationError",
    "RateLimitError",
    "__version__",
]
@@ -0,0 +1,59 @@
1
+ import os
2
+ import httpx
3
+ from typing import Optional
4
+ from .services.datasets import DatasetService
5
+
6
+
7
+ class DFClient:
8
+ """
9
+ The official DF (DataFlare) API Client.
10
+ Manages connection pooling, authentication, and HTTP retries globally.
11
+ """
12
+
13
+ def __init__(
14
+ self,
15
+ api_key: Optional[str] = None,
16
+ base_url: str = "https://api.example.com",
17
+ timeout: float = 30.0,
18
+ max_retries: int = 3,
19
+ ):
20
+ """
21
+ Initializes the DF API Client.
22
+ :param api_key: Your API Key. Defaults to the DF_API_KEY env var.
23
+ :param base_url: The base URL for the DF API.
24
+ :param timeout: HTTP request timeout in seconds.
25
+ :param max_retries: The maximum number of background retries for transient errors.
26
+ """
27
+ self.api_key = api_key or os.environ.get("DF_API_KEY")
28
+ if not self.api_key:
29
+ raise ValueError(
30
+ "DF API key must be provided or set as DF_API_KEY environment variable."
31
+ )
32
+
33
+ self.base_url = base_url.rstrip("/")
34
+ self.max_retries = max_retries
35
+
36
+ # Configure connection pooled httpx client
37
+ self._http_client = httpx.Client(
38
+ base_url=self.base_url,
39
+ timeout=timeout,
40
+ headers={
41
+ "X-API-Key": self.api_key,
42
+ "Accept": "application/json",
43
+ "User-Agent": "df-python/0.1.0",
44
+ "Accept-Encoding": "gzip, deflate, br",
45
+ },
46
+ )
47
+
48
+ # Initialize services
49
+ self.datasets = DatasetService(self._http_client, max_retries=self.max_retries)
50
+
51
+ def close(self):
52
+ """Close the underlying HTTP connection pool."""
53
+ self._http_client.close()
54
+
55
+ def __enter__(self):
56
+ return self
57
+
58
+ def __exit__(self, exc_type, exc_val, exc_tb):
59
+ self.close()
@@ -0,0 +1,19 @@
1
from typing import Any, Dict, Optional


class DFError(Exception):
    """Base exception for all DF SDK errors."""


class AuthenticationError(DFError):
    """Raised when the API returns a 401 Unauthorized."""


class RateLimitError(DFError):
    """Raised when the API returns a 429 Too Many Requests, and retries are exhausted."""


class APIError(DFError):
    """Raised when the API returns a 4xx or 5xx status code.

    :param message: Human-readable error description.
    :param status_code: HTTP status code, when known.
    :param details: Optional structured error payload from the API.
    """

    def __init__(
        self,
        message: str,
        # PEP 484: defaults of None require Optional[...] annotations.
        status_code: Optional[int] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(message)
        self.status_code = status_code
        self.details = details
@@ -0,0 +1,3 @@
1
"""Public model exports for the DF SDK."""

from .dataset import DatasetDocument, DatasetQueryRequest, DatasetQueryResponse

__all__ = [
    "DatasetQueryRequest",
    "DatasetQueryResponse",
    "DatasetDocument",
]
@@ -0,0 +1,28 @@
1
from datetime import datetime
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field


class DatasetQueryRequest(BaseModel):
    """Request body for a dataset query (POST /v1/datasets)."""

    dataset: str
    # le=1000 mirrors the server-side page-size ceiling.
    limit: Optional[int] = Field(default=None, le=1000)
    cursor: Optional[str] = None
    search_term: Optional[str] = None
    filters: Optional[Dict[str, Any]] = None


class DatasetDocument(BaseModel):
    """A single document returned by the Datasets API."""

    # Wire payloads use Mongo-style "_id"; exposed to callers as plain "id".
    id: str = Field(alias="_id")
    text: str
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
    source_url: Optional[str] = None
    created_at: Optional[datetime] = None

    # populate_by_name: accept both "id" and "_id" on input.
    # extra="allow": keep unknown server fields instead of dropping them.
    model_config = ConfigDict(populate_by_name=True, extra="allow")


class DatasetQueryResponse(BaseModel):
    """Envelope for one page of dataset query results."""

    data: List[DatasetDocument]
    count: int
    # None on the final page; otherwise pass back as the next request cursor.
    next_cursor: Optional[str] = None
    latency: Optional[str] = None
@@ -0,0 +1,3 @@
1
"""Service-layer exports for the DF SDK."""

from .datasets import DatasetService

__all__ = ["DatasetService"]
@@ -0,0 +1,131 @@
1
import logging
import os
from typing import Any, Dict, Iterator, Optional

import httpx
from tenacity import (
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_exponential,
)

from ..exceptions import APIError, AuthenticationError, RateLimitError
from ..models.dataset import DatasetDocument, DatasetQueryResponse

logger = logging.getLogger(__name__)


def _is_transient(exc: BaseException) -> bool:
    """Return True for errors worth retrying.

    Retryable: rate limits (429 -> RateLimitError), network-level failures
    (httpx.RequestError, which includes timeouts), and server-side 5xx
    APIErrors. Auth failures and other 4xx errors are never retried.
    """
    if isinstance(exc, (RateLimitError, httpx.RequestError)):
        return True
    return (
        isinstance(exc, APIError)
        and exc.status_code is not None
        and exc.status_code >= 500
    )


class DatasetService:
    """Operations on the /v1/datasets endpoints: queries, pagination, downloads."""

    def __init__(self, http_client: httpx.Client, max_retries: int = 3):
        self._client = http_client
        self.max_retries = max_retries

    def _execute_request_with_retries(
        self, method: str, path: str, **kwargs
    ) -> httpx.Response:
        """Perform one HTTP request, retrying transient failures with backoff.

        Error mapping: 401 -> AuthenticationError, 403 -> APIError,
        429 -> RateLimitError, any other non-2xx -> APIError.

        reraise=True makes tenacity surface the last real exception after
        the attempts are exhausted, instead of wrapping it in
        tenacity.RetryError — so callers always see SDK exception types,
        and a 500 is reported as an APIError rather than being disguised
        as a RateLimitError.
        """

        @retry(
            stop=stop_after_attempt(self.max_retries),
            wait=wait_exponential(multiplier=1, min=1, max=10),
            retry=retry_if_exception(_is_transient),
            reraise=True,
        )
        def _do_request() -> httpx.Response:
            response = self._client.request(method, path, **kwargs)
            if response.status_code == 401:
                raise AuthenticationError("Invalid API Key.")
            if response.status_code == 403:
                raise APIError(
                    "Access denied. You do not have permission for this dataset.",
                    status_code=403,
                )
            if response.status_code == 429:
                raise RateLimitError("Rate limit exceeded.")
            if not response.is_success:
                raise APIError(
                    f"API Error: {response.text}", status_code=response.status_code
                )
            return response

        return _do_request()

    def stream(
        self,
        dataset: str,
        search_term: Optional[str] = None,
        filters: Optional[Dict[str, Any]] = None,
        limit: int = 100,
    ) -> Iterator[DatasetDocument]:
        """
        Stream documents from a dataset. Handles pagination automatically.

        :param dataset: The dataset name (e.g. 'legal')
        :param search_term: Optional Arabic search text
        :param filters: Optional strict matching keys
        :param limit: Page chunk limit
        """
        cursor: Optional[str] = None

        while True:
            payload: Dict[str, Any] = {"dataset": dataset, "limit": limit}
            if search_term:
                payload["search_term"] = search_term
            if filters:
                payload["filters"] = filters
            if cursor:
                payload["cursor"] = cursor

            response = self._execute_request_with_retries(
                "POST", "/v1/datasets", json=payload
            )

            # Validate and instantiate typed models via Pydantic.
            page = DatasetQueryResponse(**response.json())

            yield from page.data

            # A missing cursor marks the final page.
            if not page.next_cursor:
                break
            cursor = page.next_cursor

    def download_file(self, url: str, destination: str, chunk_size: int = 8192):
        """
        Memory-safe helper to download a source PDF/file directly to disk.
        Streams in chunks; does not buffer the whole file in RAM.

        Uses a standalone (unauthenticated) httpx stream so the client's
        X-API-Key header is not leaked to third-party hosts (e.g. a
        presigned S3/CDN URL).

        :param url: The presigned URL or CDN URL to download
        :param destination: The local file path to write to (e.g., './doc.pdf')
        :param chunk_size: Buffer chunk size in bytes
        """
        with httpx.stream("GET", url, follow_redirects=True) as response:
            response.raise_for_status()

            # Ensure the destination directory exists before writing.
            os.makedirs(os.path.dirname(os.path.abspath(destination)), exist_ok=True)

            with open(destination, "wb") as f:
                for chunk in response.iter_bytes(chunk_size=chunk_size):
                    f.write(chunk)
@@ -0,0 +1,90 @@
1
import json
import os
from unittest.mock import mock_open, patch

import httpx
import pytest
import respx

from df.client import DFClient
from df.exceptions import AuthenticationError


@pytest.fixture
def client():
    """Yield a DFClient pointed at a fake base URL, key injected via env var."""
    with patch.dict(os.environ, {"DF_API_KEY": "test_key_123"}):
        with DFClient(base_url="https://api.test.com", max_retries=1) as c:
            yield c


@respx.mock
def test_dataset_stream_pagination(client: DFClient):
    """stream() must follow next_cursor until the server returns None."""

    def handle_request(request):
        body = json.loads(request.content)
        # Second page: requested with the cursor from page one; ends pagination.
        if body.get("cursor") == "cur1":
            return httpx.Response(
                200,
                json={
                    "data": [{"_id": "doc2", "text": "World"}],
                    "count": 1,
                    "next_cursor": None,
                },
            )
        # First page: no cursor yet; returns one doc plus a cursor.
        return httpx.Response(
            200,
            json={
                "data": [{"_id": "doc1", "text": "Hello"}],
                "count": 1,
                "next_cursor": "cur1",
            },
        )

    # Single side_effect mock handles both pages based on the request body.
    respx.post("https://api.test.com/v1/datasets").mock(side_effect=handle_request)

    # Act
    items = list(client.datasets.stream("legal"))

    # Assert
    assert len(items) == 2
    assert items[0].id == "doc1"
    assert items[1].id == "doc2"


@respx.mock
def test_authentication_error(client: DFClient):
    """A 401 response must surface as AuthenticationError."""
    respx.post("https://api.test.com/v1/datasets").mock(
        return_value=httpx.Response(401, json={"error": "unauthorized"})
    )

    with pytest.raises(AuthenticationError):
        list(client.datasets.stream("legal"))


@respx.mock
def test_download_file(client: DFClient):
    """download_file must stream response bytes into the destination file."""
    url = "https://cdn.example.com/doc.pdf"
    respx.get(url).mock(return_value=httpx.Response(200, content=b"fake-pdf-content"))

    with patch("builtins.open", mock_open()) as mocked_file:
        with patch("os.makedirs"):
            client.datasets.download_file(url, "fake.pdf")

    # Ensures the file was opened for binary write and written to.
    mocked_file.assert_called_with("fake.pdf", "wb")
    mocked_file().write.assert_called()