dataflare-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,32 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # Environments
7
+ .env
8
+ .venv
9
+ env/
10
+ venv/
11
+ ENV/
12
+ env.bak/
13
+ venv.bak/
14
+
15
+ # Testing
16
+ .pytest_cache/
17
+ htmlcov/
18
+ .coverage
19
+ .tox/
20
+ coverage.xml
21
+
22
+ # Distribution / packaging
23
+ dist/
24
+ build/
25
+ *.egg-info/
26
+ .eggs/
27
+
28
+ # IDEs
29
+ .idea/
30
+ .vscode/
31
+ *.swp
32
+ *.swo
@@ -0,0 +1,61 @@
1
+ Metadata-Version: 2.4
2
+ Name: dataflare-sdk
3
+ Version: 0.1.0
4
+ Summary: Official Python SDK for the DataFlare API
5
+ Author-email: DataFlare Lab <oss@dataflarelab.com>
6
+ License-Expression: MIT
7
+ Requires-Python: >=3.9
8
+ Requires-Dist: httpx>=0.27.0
9
+ Requires-Dist: pydantic>=2.0.0
10
+ Requires-Dist: tenacity>=8.0.0
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
13
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
14
+ Requires-Dist: respx>=0.21.1; extra == 'dev'
15
+ Description-Content-Type: text/markdown
16
+
17
+ # DF API Python SDK `<a href="https://pypi.org/project/dataflare-sdk/"><img alt="PyPI" src="https://img.shields.io/pypi/v/dataflare-sdk?color=blue"></a>`
18
+
19
+ The official Python SDK for the **DataFlare API**.
20
+
21
+ ## Features
22
+ - **Typed Models:** Full Pydantic schemas for the Datasets API, enabling rich IDE autocompletion.
23
+ - **Connection Pools:** Built on `httpx`, reusing TCP connections across requests for lower latency.
24
+ - **Resilient Requests:** Automatic retries (via `tenacity`) with exponential backoff for rate limits and transient network faults.
25
+ - **Idiomatic Paginators:** `client.datasets.stream(...)` automatically handles cursor injection iteratively returning stream chunks cleanly.
26
+ - **Memory-safe Source Retrieval:** For pipelines feeding Large Language Models directly from data archives, effortlessly invoke `download_file(...)` natively chunking raw bytes down to the file system avoiding memory leaks.
27
+
28
+ ## Installation
29
+
30
+ ```bash
31
+ pip install dataflare-sdk
32
+ ```
33
+
34
+ ## Quick Start
35
+
36
+ You will need a DataFlare API Key. The easiest way is setting an environment variable:
37
+ ```bash
38
+ export DF_API_KEY="dfk_abc123"
39
+ ```
40
+
41
+ ```python
42
+ from df import DFClient, AuthenticationError
43
+
44
+ # Automatically discovers DF_API_KEY from the environment
45
+ try:
46
+ with DFClient() as client:
47
+
48
+ # Generator handles pagination constraints completely
49
+ for doc in client.datasets.stream("legal", search_term="التأمين", limit=100):
50
+ print(f"Doc ID: {doc.id} | Metadata: {doc.metadata}")
51
+
52
+ # Helper to download the raw PDF to disk natively
53
+ if doc.source_url:
54
+ client.datasets.download_file(
55
+ doc.source_url,
56
+ destination=f"./archives/{doc.id}.pdf"
57
+ )
58
+
59
+ except AuthenticationError:
60
+ print("Invalid API Key.")
61
+ ```
@@ -0,0 +1,45 @@
1
+ # DF API Python SDK `<a href="https://pypi.org/project/dataflare-sdk/"><img alt="PyPI" src="https://img.shields.io/pypi/v/dataflare-sdk?color=blue"></a>`
2
+
3
+ The official Python SDK for the **DataFlare API**.
4
+
5
+ ## Features
6
+ - **Typed Models:** Full Pydantic schemas for the Datasets API, enabling rich IDE autocompletion.
7
+ - **Connection Pools:** Built on `httpx`, reusing TCP connections across requests for lower latency.
8
+ - **Resilient Requests:** Automatic retries (via `tenacity`) with exponential backoff for rate limits and transient network faults.
9
+ - **Idiomatic Paginators:** `client.datasets.stream(...)` automatically handles cursor injection iteratively returning stream chunks cleanly.
10
+ - **Memory-safe Source Retrieval:** For pipelines feeding Large Language Models directly from data archives, effortlessly invoke `download_file(...)` natively chunking raw bytes down to the file system avoiding memory leaks.
11
+
12
+ ## Installation
13
+
14
+ ```bash
15
+ pip install dataflare-sdk
16
+ ```
17
+
18
+ ## Quick Start
19
+
20
+ You will need a DataFlare API Key. The easiest way is setting an environment variable:
21
+ ```bash
22
+ export DF_API_KEY="dfk_abc123"
23
+ ```
24
+
25
+ ```python
26
+ from df import DFClient, AuthenticationError
27
+
28
+ # Automatically discovers DF_API_KEY from the environment
29
+ try:
30
+ with DFClient() as client:
31
+
32
+ # Generator handles pagination constraints completely
33
+ for doc in client.datasets.stream("legal", search_term="التأمين", limit=100):
34
+ print(f"Doc ID: {doc.id} | Metadata: {doc.metadata}")
35
+
36
+ # Helper to download the raw PDF to disk natively
37
+ if doc.source_url:
38
+ client.datasets.download_file(
39
+ doc.source_url,
40
+ destination=f"./archives/{doc.id}.pdf"
41
+ )
42
+
43
+ except AuthenticationError:
44
+ print("Invalid API Key.")
45
+ ```
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "dataflare-sdk"
7
+ version = "0.1.0"
8
+ description = "Official Python SDK for the DataFlare API"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "MIT"
12
+ authors = [
13
+ { name = "DataFlare Lab", email = "oss@dataflarelab.com" }
14
+ ]
15
+ dependencies = [
16
+ "httpx>=0.27.0",
17
+ "pydantic>=2.0.0",
18
+ "tenacity>=8.0.0"
19
+ ]
20
+
21
+ [project.optional-dependencies]
22
+ dev = [
23
+ "pytest>=8.0.0",
24
+ "respx>=0.21.1",
25
+ "pytest-asyncio>=0.23.0"
26
+ ]
27
+
28
+ [tool.hatch.build.targets.wheel]
29
+ packages = ["src/df"]
@@ -0,0 +1,16 @@
1
"""
DF SDK
Official Python SDK for the DataFlare API.
"""

from .client import DFClient
# DFError is the root of the SDK exception hierarchy; exporting it lets
# callers catch every SDK error with a single `except DFError`.
from .exceptions import APIError, AuthenticationError, DFError, RateLimitError

__version__ = "0.1.0"
__all__ = [
    "DFClient",
    "DFError",
    "APIError",
    "AuthenticationError",
    "RateLimitError",
    "__version__",
]
@@ -0,0 +1,59 @@
1
+ import os
2
+ import httpx
3
+ from typing import Optional
4
+ from .services.datasets import DatasetService
5
+
6
+
7
+ class DFClient:
8
+ """
9
+ The official DF (DataFlare) API Client.
10
+ Manages connection pooling, authentication, and HTTP retries globally.
11
+ """
12
+
13
+ def __init__(
14
+ self,
15
+ api_key: Optional[str] = None,
16
+ base_url: str = "https://api.example.com",
17
+ timeout: float = 30.0,
18
+ max_retries: int = 3,
19
+ ):
20
+ """
21
+ Initializes the DF API Client.
22
+ :param api_key: Your API Key. Defaults to the DF_API_KEY env var.
23
+ :param base_url: The base URL for the DF API.
24
+ :param timeout: HTTP request timeout in seconds.
25
+ :param max_retries: The maximum number of background retries for transient errors.
26
+ """
27
+ self.api_key = api_key or os.environ.get("DF_API_KEY")
28
+ if not self.api_key:
29
+ raise ValueError(
30
+ "DF API key must be provided or set as DF_API_KEY environment variable."
31
+ )
32
+
33
+ self.base_url = base_url.rstrip("/")
34
+ self.max_retries = max_retries
35
+
36
+ # Configure connection pooled httpx client
37
+ self._http_client = httpx.Client(
38
+ base_url=self.base_url,
39
+ timeout=timeout,
40
+ headers={
41
+ "X-API-Key": self.api_key,
42
+ "Accept": "application/json",
43
+ "User-Agent": "df-python/0.1.0",
44
+ "Accept-Encoding": "gzip, deflate, br",
45
+ },
46
+ )
47
+
48
+ # Initialize services
49
+ self.datasets = DatasetService(self._http_client, max_retries=self.max_retries)
50
+
51
+ def close(self):
52
+ """Close the underlying HTTP connection pool."""
53
+ self._http_client.close()
54
+
55
+ def __enter__(self):
56
+ return self
57
+
58
+ def __exit__(self, exc_type, exc_val, exc_tb):
59
+ self.close()
@@ -0,0 +1,19 @@
1
from typing import Any, Dict, Optional


class DFError(Exception):
    """Base exception for all DF SDK errors."""


class AuthenticationError(DFError):
    """Raised when the API returns a 401 Unauthorized."""


class RateLimitError(DFError):
    """Raised when the API returns a 429 Too Many Requests, and retries are exhausted."""


class APIError(DFError):
    """Raised when the API returns a 4xx or 5xx status code.

    :param message: Human-readable error description.
    :param status_code: HTTP status code, when known.
    :param details: Optional structured error payload from the API.
    """

    def __init__(
        self,
        message: str,
        # PEP 484: defaults of None require Optional[...] annotations.
        status_code: Optional[int] = None,
        details: Optional[Dict[str, Any]] = None,
    ):
        super().__init__(message)
        self.status_code = status_code
        self.details = details
@@ -0,0 +1,3 @@
1
"""Public model exports for the DF SDK."""

from .dataset import DatasetDocument, DatasetQueryRequest, DatasetQueryResponse

__all__ = [
    "DatasetQueryRequest",
    "DatasetQueryResponse",
    "DatasetDocument",
]
@@ -0,0 +1,28 @@
1
from datetime import datetime
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field


class DatasetQueryRequest(BaseModel):
    """Request body for a dataset query (POST /v1/datasets)."""

    dataset: str
    # le=1000 mirrors the server-side page-size ceiling.
    limit: Optional[int] = Field(default=None, le=1000)
    cursor: Optional[str] = None
    search_term: Optional[str] = None
    filters: Optional[Dict[str, Any]] = None


class DatasetDocument(BaseModel):
    """A single document returned by the Datasets API."""

    # Wire payloads use Mongo-style "_id"; exposed to callers as plain "id".
    id: str = Field(alias="_id")
    text: str
    metadata: Optional[Dict[str, Any]] = Field(default_factory=dict)
    source_url: Optional[str] = None
    created_at: Optional[datetime] = None

    # populate_by_name: accept both "id" and "_id" on input.
    # extra="allow": keep unknown server fields instead of dropping them.
    model_config = ConfigDict(populate_by_name=True, extra="allow")


class DatasetQueryResponse(BaseModel):
    """Envelope for one page of dataset query results."""

    data: List[DatasetDocument]
    count: int
    # None on the final page; otherwise pass back as the next request cursor.
    next_cursor: Optional[str] = None
    latency: Optional[str] = None
@@ -0,0 +1,3 @@
1
"""Service-layer exports for the DF SDK."""

from .datasets import DatasetService

__all__ = ["DatasetService"]
@@ -0,0 +1,131 @@
1
import logging
import os
from typing import Any, Dict, Iterator, Optional

import httpx
from tenacity import (
    retry,
    retry_if_exception,
    stop_after_attempt,
    wait_exponential,
)

from ..exceptions import APIError, AuthenticationError, RateLimitError
from ..models.dataset import DatasetDocument, DatasetQueryResponse

logger = logging.getLogger(__name__)


def _is_transient(exc: BaseException) -> bool:
    """Return True for errors worth retrying.

    Retryable: rate limits (429 -> RateLimitError), network-level failures
    (httpx.RequestError, which includes timeouts), and server-side 5xx
    APIErrors. Auth failures and other 4xx errors are never retried.
    """
    if isinstance(exc, (RateLimitError, httpx.RequestError)):
        return True
    return (
        isinstance(exc, APIError)
        and exc.status_code is not None
        and exc.status_code >= 500
    )


class DatasetService:
    """Operations on the /v1/datasets endpoints: queries, pagination, downloads."""

    def __init__(self, http_client: httpx.Client, max_retries: int = 3):
        self._client = http_client
        self.max_retries = max_retries

    def _execute_request_with_retries(
        self, method: str, path: str, **kwargs
    ) -> httpx.Response:
        """Perform one HTTP request, retrying transient failures with backoff.

        Error mapping: 401 -> AuthenticationError, 403 -> APIError,
        429 -> RateLimitError, any other non-2xx -> APIError.

        reraise=True makes tenacity surface the last real exception after
        the attempts are exhausted, instead of wrapping it in
        tenacity.RetryError — so callers always see SDK exception types,
        and a 500 is reported as an APIError rather than being disguised
        as a RateLimitError.
        """

        @retry(
            stop=stop_after_attempt(self.max_retries),
            wait=wait_exponential(multiplier=1, min=1, max=10),
            retry=retry_if_exception(_is_transient),
            reraise=True,
        )
        def _do_request() -> httpx.Response:
            response = self._client.request(method, path, **kwargs)
            if response.status_code == 401:
                raise AuthenticationError("Invalid API Key.")
            if response.status_code == 403:
                raise APIError(
                    "Access denied. You do not have permission for this dataset.",
                    status_code=403,
                )
            if response.status_code == 429:
                raise RateLimitError("Rate limit exceeded.")
            if not response.is_success:
                raise APIError(
                    f"API Error: {response.text}", status_code=response.status_code
                )
            return response

        return _do_request()

    def stream(
        self,
        dataset: str,
        search_term: Optional[str] = None,
        filters: Optional[Dict[str, Any]] = None,
        limit: int = 100,
    ) -> Iterator[DatasetDocument]:
        """
        Stream documents from a dataset. Handles pagination automatically.

        :param dataset: The dataset name (e.g. 'legal')
        :param search_term: Optional Arabic search text
        :param filters: Optional strict matching keys
        :param limit: Page chunk limit
        """
        cursor: Optional[str] = None

        while True:
            payload: Dict[str, Any] = {"dataset": dataset, "limit": limit}
            if search_term:
                payload["search_term"] = search_term
            if filters:
                payload["filters"] = filters
            if cursor:
                payload["cursor"] = cursor

            response = self._execute_request_with_retries(
                "POST", "/v1/datasets", json=payload
            )

            # Validate and instantiate typed models via Pydantic.
            page = DatasetQueryResponse(**response.json())

            yield from page.data

            # A missing cursor marks the final page.
            if not page.next_cursor:
                break
            cursor = page.next_cursor

    def download_file(self, url: str, destination: str, chunk_size: int = 8192):
        """
        Memory-safe helper to download a source PDF/file directly to disk.
        Streams in chunks; does not buffer the whole file in RAM.

        Uses a standalone (unauthenticated) httpx stream so the client's
        X-API-Key header is not leaked to third-party hosts (e.g. a
        presigned S3/CDN URL).

        :param url: The presigned URL or CDN URL to download
        :param destination: The local file path to write to (e.g., './doc.pdf')
        :param chunk_size: Buffer chunk size in bytes
        """
        with httpx.stream("GET", url, follow_redirects=True) as response:
            response.raise_for_status()

            # Ensure the destination directory exists before writing.
            os.makedirs(os.path.dirname(os.path.abspath(destination)), exist_ok=True)

            with open(destination, "wb") as f:
                for chunk in response.iter_bytes(chunk_size=chunk_size):
                    f.write(chunk)
@@ -0,0 +1,90 @@
1
import json
import os
from unittest.mock import mock_open, patch

import httpx
import pytest
import respx

from df.client import DFClient
from df.exceptions import AuthenticationError


@pytest.fixture
def client():
    """Yield a DFClient pointed at a fake base URL, key injected via env var."""
    with patch.dict(os.environ, {"DF_API_KEY": "test_key_123"}):
        with DFClient(base_url="https://api.test.com", max_retries=1) as c:
            yield c


@respx.mock
def test_dataset_stream_pagination(client: DFClient):
    """stream() must follow next_cursor until the server returns None."""

    def handle_request(request):
        body = json.loads(request.content)
        # Second page: requested with the cursor from page one; ends pagination.
        if body.get("cursor") == "cur1":
            return httpx.Response(
                200,
                json={
                    "data": [{"_id": "doc2", "text": "World"}],
                    "count": 1,
                    "next_cursor": None,
                },
            )
        # First page: no cursor yet; returns one doc plus a cursor.
        return httpx.Response(
            200,
            json={
                "data": [{"_id": "doc1", "text": "Hello"}],
                "count": 1,
                "next_cursor": "cur1",
            },
        )

    # Single side_effect mock handles both pages based on the request body.
    respx.post("https://api.test.com/v1/datasets").mock(side_effect=handle_request)

    # Act
    items = list(client.datasets.stream("legal"))

    # Assert
    assert len(items) == 2
    assert items[0].id == "doc1"
    assert items[1].id == "doc2"


@respx.mock
def test_authentication_error(client: DFClient):
    """A 401 response must surface as AuthenticationError."""
    respx.post("https://api.test.com/v1/datasets").mock(
        return_value=httpx.Response(401, json={"error": "unauthorized"})
    )

    with pytest.raises(AuthenticationError):
        list(client.datasets.stream("legal"))


@respx.mock
def test_download_file(client: DFClient):
    """download_file must stream response bytes into the destination file."""
    url = "https://cdn.example.com/doc.pdf"
    respx.get(url).mock(return_value=httpx.Response(200, content=b"fake-pdf-content"))

    with patch("builtins.open", mock_open()) as mocked_file:
        with patch("os.makedirs"):
            client.datasets.download_file(url, "fake.pdf")

    # Ensures the file was opened for binary write and written to.
    mocked_file.assert_called_with("fake.pdf", "wb")
    mocked_file().write.assert_called()