PyPI - prismadata - Versions diffs - 0.3.2__tar.gz → 0.4.0__tar.gz - Mend

prismadata-0.3.2.tar.gz → prismadata-0.4.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

{prismadata-0.3.2 → prismadata-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: prismadata
-Version: 0.3.2
+Version: 0.4.0
 Summary: Python client for the PrismaData location intelligence API
 License: MIT
 License-File: LICENSE

{prismadata-0.3.2 → prismadata-0.4.0}/prismadata/__init__.py RENAMED Viewed

@@ -37,6 +37,7 @@ from .client import Client
 from .exceptions import (
     AuthenticationError,
     BatchError,
+    BatchPrepareError,
     PrismaDataError,
     QuotaExhaustedError,
     RateLimitError,
@@ -62,6 +63,7 @@ __all__ = [
     "__version__",
     "AuthenticationError",
     "BatchError",
+    "BatchPrepareError",
     "PrismaDataError",
     "QuotaExhaustedError",
     "RateLimitError",

{prismadata-0.3.2 → prismadata-0.4.0}/prismadata/_batch.py RENAMED Viewed

@@ -11,6 +11,26 @@ from .exceptions import BatchError
 logger = logging.getLogger("prismadata.batch")
+def split_into_groups(
+    items: dict[str, Any], num_groups: int,
+) -> list[dict[str, Any]]:
+    """Split a dict into num_groups roughly-equal sub-dicts.
+    Args:
+        items: Dictionary to split.
+        num_groups: Number of groups (must be >= 1).
+    Returns:
+        List of dicts, each containing a subset of the items.
+    """
+    keys = list(items.keys())
+    group_size = math.ceil(len(keys) / num_groups)
+    return [
+        {k: items[k] for k in keys[i : i + group_size]}
+        for i in range(0, len(keys), group_size)
+    ]
 def _raise_if_partial(
     results: dict[str, Any],
     failed_keys: list[str],

{prismadata-0.3.2 → prismadata-0.4.0}/prismadata/_constants.py RENAMED Viewed

@@ -22,6 +22,11 @@ USER_AGENT_PREFIX = "prismadata-python"
 DIRECTION_MAP = {"outgoing": "SAINDO", "incoming": "INDO"}
+PREPARE_POLL_INTERVAL = 10
+PREPARE_TIMEOUT = 300
+DEFAULT_CHUNK_THRESHOLD = 10_000
+DEFAULT_MAX_WORKERS = 2
 ENV_API_KEY = "PRISMADATA_APIKEY"
 ENV_USERNAME = "PRISMADATA_USERNAME"
 ENV_PASSWORD = "PRISMADATA_PASSWORD"

prismadata-0.4.0/prismadata/_prepare.py ADDED Viewed

@@ -0,0 +1,119 @@
+"""Batch prepare lifecycle for auto-scaling large batch operations."""
+from __future__ import annotations
+import asyncio
+import logging
+import time
+from typing import Any, Awaitable, Callable
+from ._constants import PREPARE_POLL_INTERVAL, PREPARE_TIMEOUT
+from .exceptions import BatchPrepareError
+logger = logging.getLogger("prismadata.prepare")
+def batch_prepare(
+    post_fn: Callable[..., Any],
+    total_items: int,
+) -> dict[str, Any]:
+    """Signal the API that a large batch is starting.
+    Args:
+        post_fn: Callable that POSTs to an API path (client._post).
+        total_items: Total number of items in the batch.
+    Returns:
+        Response dict with ``session_id`` and ``max_workers``.
+    """
+    logger.info("Preparing batch for %d items", total_items)
+    return post_fn("/v1/batch/prepare", {"total_items": total_items})
+def wait_until_ready(
+    get_fn: Callable[..., Any],
+    session_id: str,
+    timeout: int = PREPARE_TIMEOUT,
+    interval: int = PREPARE_POLL_INTERVAL,
+) -> None:
+    """Poll until the infrastructure is ready.
+    Args:
+        get_fn: Callable that GETs from an API path (client._get).
+        session_id: Session ID from batch_prepare response.
+        timeout: Max seconds to wait.
+        interval: Seconds between polls.
+    Raises:
+        BatchPrepareError: If not ready within timeout.
+    """
+    deadline = time.monotonic() + timeout
+    logger.info("Waiting for batch session %s to be ready (timeout=%ds)", session_id, timeout)
+    while True:
+        status = get_fn(f"/v1/batch/prepare/{session_id}/status")
+        if status.get("ready"):
+            logger.info("Batch session %s is ready", session_id)
+            return
+        if time.monotonic() >= deadline:
+            raise BatchPrepareError(
+                f"Batch prepare timed out after {timeout}s for session {session_id}"
+            )
+        time.sleep(interval)
+def batch_complete(
+    post_fn: Callable[..., Any],
+    session_id: str,
+) -> None:
+    """Signal the API that the batch is finished (scale down).
+    Always called in a finally block — swallows errors to avoid
+    masking the original exception.
+    """
+    try:
+        post_fn(f"/v1/batch/complete/{session_id}", {})
+        logger.info("Batch session %s completed", session_id)
+    except Exception:
+        logger.warning("Failed to complete batch session %s", session_id, exc_info=True)
+async def async_batch_prepare(
+    post_fn: Callable[..., Awaitable[Any]],
+    total_items: int,
+) -> dict[str, Any]:
+    """Async version of batch_prepare."""
+    logger.info("Preparing batch for %d items", total_items)
+    return await post_fn("/v1/batch/prepare", {"total_items": total_items})
+async def async_wait_until_ready(
+    get_fn: Callable[..., Awaitable[Any]],
+    session_id: str,
+    timeout: int = PREPARE_TIMEOUT,
+    interval: int = PREPARE_POLL_INTERVAL,
+) -> None:
+    """Async version of wait_until_ready."""
+    deadline = time.monotonic() + timeout
+    logger.info("Waiting for batch session %s to be ready (timeout=%ds)", session_id, timeout)
+    while True:
+        status = await get_fn(f"/v1/batch/prepare/{session_id}/status")
+        if status.get("ready"):
+            logger.info("Batch session %s is ready", session_id)
+            return
+        if time.monotonic() >= deadline:
+            raise BatchPrepareError(
+                f"Batch prepare timed out after {timeout}s for session {session_id}"
+            )
+        await asyncio.sleep(interval)
+async def async_batch_complete(
+    post_fn: Callable[..., Awaitable[Any]],
+    session_id: str,
+) -> None:
+    """Async version of batch_complete."""
+    try:
+        await post_fn(f"/v1/batch/complete/{session_id}", {})
+        logger.info("Batch session %s completed", session_id)
+    except Exception:
+        logger.warning("Failed to complete batch session %s", session_id, exc_info=True)

{prismadata-0.3.2 → prismadata-0.4.0}/prismadata/async_client.py RENAMED Viewed

@@ -8,17 +8,20 @@ from typing import Any, Literal, TYPE_CHECKING
 from ._async_auth import AsyncAuthManager
 from ._async_http import AsyncHttpClient
 from ._types import QuotaInfo
-from ._batch import async_process_batch, async_process_routing_batch
+from ._batch import async_process_batch, async_process_routing_batch, split_into_groups
 from ._cache import CacheManager
 from ._columns import clean_columns
 from ._constants import (
     BASE_URL,
     DEFAULT_CACHE_TTL,
+    DEFAULT_CHUNK_THRESHOLD,
+    DEFAULT_MAX_WORKERS,
     DEFAULT_TIMEOUT,
     DIRECTION_MAP,
     MAX_BATCH_SIZE,
     MAX_ROUTING_BATCH,
 )
+from ._prepare import async_batch_complete, async_batch_prepare, async_wait_until_ready
 from ._progress import progress_bar
 from ._validation import validate_lat_lng, validate_profile, validate_route_points
 from .client import _resolve_credentials
@@ -657,9 +660,17 @@ class AsyncClient:
         on_error: Literal["raise", "skip"] = "raise",
         timeout: int | None = None,
         show_progress: bool | None = None,
+        auto_scale: bool = True,
+        max_workers: int = DEFAULT_MAX_WORKERS,
+        chunk_threshold: int = DEFAULT_CHUNK_THRESHOLD,
         **kwargs: Any,
     ) -> dict[str, Any]:
-        """Batch geocode addresses and aggregate location APIs."""
+        """Batch geocode addresses and aggregate location APIs.
+        For large batches (above ``chunk_threshold``), the SDK signals the API
+        to scale up infrastructure, splits the work across parallel workers,
+        and signals scale-down when finished.
+        """
         params: dict[str, Any] = {}
         for svc in services:
             params[svc] = True
@@ -667,16 +678,44 @@ class AsyncClient:
         chunk_size = batch_size if batch_size is not None else MAX_BATCH_SIZE
         use_progress = show_progress if show_progress is not None else self._show_progress
+        total = len(addresses)
+        should_scale = auto_scale and total >= chunk_threshold
         async def _request(chunk: dict) -> dict[str, Any]:
             return await self._post("/location/batch/geocoder/aggregator", chunk, params=params, timeout=timeout)
-        with progress_bar(len(addresses), desc="Geocode+Aggregating", enabled=use_progress) as bar:
+        if not should_scale:
+            with progress_bar(total, desc="Geocode+Aggregating", enabled=use_progress) as bar:
-            def _on_progress(n: int) -> None:
-                bar.update(n)
+                def _on_progress(n: int) -> None:
+                    bar.update(n)
-            result = await async_process_batch(addresses, _request, chunk_size, on_progress=_on_progress, on_error=on_error)
+                result = await async_process_batch(addresses, _request, chunk_size, on_progress=_on_progress, on_error=on_error)
+        else:
+            import asyncio
+            resp = await async_batch_prepare(self._post, total)
+            session_id = resp["session_id"]
+            num_workers = min(max_workers, resp.get("max_workers", max_workers))
+            try:
+                await async_wait_until_ready(self._get, session_id)
+                groups = split_into_groups(addresses, num_workers)
+                async def _process_group(group: dict) -> dict[str, Any]:
+                    return await async_process_batch(group, _request, chunk_size, on_error=on_error)
+                group_results = await asyncio.gather(
+                    *[_process_group(g) for g in groups],
+                    return_exceptions=(on_error == "skip"),
+                )
+                result = {}
+                for gr in group_results:
+                    if isinstance(gr, Exception):
+                        continue
+                    result.update(gr)
+            finally:
+                await async_batch_complete(self._post, session_id)
         if self._clean:
             return {k: clean_columns(v) if isinstance(v, dict) else v for k, v in result.items()}

{prismadata-0.3.2 → prismadata-0.4.0}/prismadata/client.py RENAMED Viewed

@@ -7,13 +7,15 @@ import warnings
 from typing import Any, Literal, TYPE_CHECKING
 from ._auth import AuthManager
-from ._batch import process_batch, process_routing_batch
+from ._batch import process_batch, process_routing_batch, split_into_groups
 from ._types import QuotaInfo
 from ._cache import CacheManager
 from ._columns import clean_columns
 from ._constants import (
     BASE_URL,
     DEFAULT_CACHE_TTL,
+    DEFAULT_CHUNK_THRESHOLD,
+    DEFAULT_MAX_WORKERS,
     DEFAULT_TIMEOUT,
     DIRECTION_MAP,
     ENV_API_KEY,
@@ -23,6 +25,7 @@ from ._constants import (
     MAX_ROUTING_BATCH,
 )
 from ._http import HttpClient
+from ._prepare import batch_complete, batch_prepare, wait_until_ready
 from ._progress import progress_bar
 from ._validation import validate_lat_lng, validate_profile, validate_route_points
 from .exceptions import AuthenticationError
@@ -884,10 +887,17 @@ class Client:
         on_error: Literal["raise", "skip"] = "raise",
         timeout: int | None = None,
         show_progress: bool | None = None,
+        auto_scale: bool = True,
+        max_workers: int = DEFAULT_MAX_WORKERS,
+        chunk_threshold: int = DEFAULT_CHUNK_THRESHOLD,
         **kwargs: Any,
     ) -> dict[str, Any]:
         """Batch geocode addresses and aggregate location APIs.
+        For large batches (above ``chunk_threshold``), the SDK signals the API
+        to scale up infrastructure, splits the work across parallel workers,
+        and signals scale-down when finished.
         Args:
             addresses: Mapping of id to address dict.
             services: List of service names to enable.
@@ -895,6 +905,9 @@ class Client:
             on_error: ``"raise"`` (default) or ``"skip"`` to return partial results.
             timeout: Override default request timeout (seconds).
             show_progress: Override progress bar setting.
+            auto_scale: If True, call prepare/complete for large batches.
+            max_workers: Max parallel workers (server may return fewer).
+            chunk_threshold: Minimum items to trigger auto-scaling.
             **kwargs: Additional parameters for specific services.
         """
         params: dict[str, Any] = {}
@@ -904,16 +917,41 @@ class Client:
         chunk_size = batch_size if batch_size is not None else MAX_BATCH_SIZE
         use_progress = show_progress if show_progress is not None else self._show_progress
+        total = len(addresses)
+        should_scale = auto_scale and total >= chunk_threshold
         def _request(chunk: dict) -> dict[str, Any]:
             return self._post("/location/batch/geocoder/aggregator", chunk, params=params, timeout=timeout)
-        with progress_bar(len(addresses), desc="Geocode+Aggregating", enabled=use_progress) as bar:
+        if not should_scale:
+            with progress_bar(total, desc="Geocode+Aggregating", enabled=use_progress) as bar:
-            def _on_progress(n: int) -> None:
-                bar.update(n)
+                def _on_progress(n: int) -> None:
+                    bar.update(n)
+                result = process_batch(addresses, _request, chunk_size, on_progress=_on_progress, on_error=on_error)
+        else:
+            resp = batch_prepare(self._post, total)
+            session_id = resp["session_id"]
+            num_workers = min(max_workers, resp.get("max_workers", max_workers))
+            try:
+                wait_until_ready(self._get, session_id)
+                groups = split_into_groups(addresses, num_workers)
+                from concurrent.futures import ThreadPoolExecutor, as_completed
+                def _process_group(group: dict) -> dict[str, Any]:
+                    return process_batch(group, _request, chunk_size, on_error=on_error)
-            result = process_batch(addresses, _request, chunk_size, on_progress=_on_progress, on_error=on_error)
+                with ThreadPoolExecutor(max_workers=num_workers) as executor:
+                    futures = [executor.submit(_process_group, g) for g in groups]
+                    result = {}
+                    for f in as_completed(futures):
+                        chunk_result = f.result()
+                        result.update(chunk_result)
+            finally:
+                batch_complete(self._post, session_id)
         if self._clean:
             return {k: clean_columns(v) if isinstance(v, dict) else v for k, v in result.items()}

{prismadata-0.3.2 → prismadata-0.4.0}/prismadata/exceptions.py RENAMED Viewed

@@ -38,6 +38,10 @@ class ValidationError(PrismaDataError):
     """Raised when the API returns a validation error (422)."""
+class BatchPrepareError(PrismaDataError):
+    """Raised when batch prepare polling times out or fails."""
 class BatchError(PrismaDataError):
     """Raised when a batch operation has partial failures.

{prismadata-0.3.2 → prismadata-0.4.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "prismadata"
-version = "0.3.2"
+version = "0.4.0"
 description = "Python client for the PrismaData location intelligence API"
 authors = ["PrismaData <contato@prismadata.io>"]
 license = "MIT"