PyPI - geonode-scraper-tools-core - Versions diffs - 0.1.0__py3-none-any.whl - Mend

geonode-scraper-tools-core 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

geonode_scraper_tools_core/__init__.py +26 -0
geonode_scraper_tools_core/py.typed +0 -0
geonode_scraper_tools_core/registry.py +99 -0
geonode_scraper_tools_core/schemas.py +99 -0
geonode_scraper_tools_core/service.py +326 -0
geonode_scraper_tools_core-0.1.0.dist-info/METADATA +56 -0
geonode_scraper_tools_core-0.1.0.dist-info/RECORD +9 -0
geonode_scraper_tools_core-0.1.0.dist-info/WHEEL +5 -0
geonode_scraper_tools_core-0.1.0.dist-info/top_level.txt +1 -0

geonode_scraper_tools_core/__init__.py ADDED Viewed

@@ -0,0 +1,26 @@
+from .registry import OPERATIONS, OperationSpec, get_operations
+from .schemas import (
+    ExtractInput,
+    GetJobResultInput,
+    GetStatisticsInput,
+    HealthCheckInput,
+    ListJobsInput,
+    WaitForJobInput,
+)
+from .service import ScraperToolService, ScraperToolSettings
+# Public API of the package: every name listed here is re-exported from the
+# .registry, .schemas, and .service submodules imported above.
+__all__ = [
+    "OPERATIONS",
+    "ExtractInput",
+    "GetJobResultInput",
+    "GetStatisticsInput",
+    "HealthCheckInput",
+    "ListJobsInput",
+    "OperationSpec",
+    "ScraperToolService",
+    "ScraperToolSettings",
+    "WaitForJobInput",
+    "get_operations",
+]
+# Package version string exposed at import time.
+__version__ = "0.1.0"

geonode_scraper_tools_core/py.typed ADDED Viewed

File without changes

geonode_scraper_tools_core/registry.py ADDED Viewed

@@ -0,0 +1,99 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Any, Sequence
+from pydantic import BaseModel
+from .schemas import (
+    ExtractInput,
+    GetJobResultInput,
+    GetStatisticsInput,
+    HealthCheckInput,
+    ListJobsInput,
+    WaitForJobInput,
+)
+if TYPE_CHECKING:
+    from .service import ScraperToolService
+@dataclass(frozen=True)
+class OperationSpec:
+    key: str
+    tool_name: str
+    description: str
+    args_schema: type[BaseModel]
+    service_method: str
+    def invoke(self, service: ScraperToolService, **kwargs: Any) -> dict[str, Any]:
+        return getattr(service, self.service_method)(**kwargs)
+# Registry of every operation the package exposes. Each entry pairs a tool name,
+# a description, and an input schema with the name of the service method that
+# OperationSpec.invoke() looks up on the service object.
+OPERATIONS: tuple[OperationSpec, ...] = (
+    OperationSpec(
+        key="extract",
+        tool_name="scraper_extract_content",
+        description="Extract structured content from a URL.",
+        args_schema=ExtractInput,
+        service_method="extract",
+    ),
+    OperationSpec(
+        key="get_job_result",
+        tool_name="scraper_get_job_result",
+        description="Fetch the current state or final result for an async extraction job.",
+        args_schema=GetJobResultInput,
+        service_method="get_job_result",
+    ),
+    OperationSpec(
+        key="wait_for_job",
+        tool_name="scraper_wait_for_job",
+        description="Poll an async extraction job until it reaches a terminal state or a timeout expires.",
+        args_schema=WaitForJobInput,
+        service_method="wait_for_job",
+    ),
+    OperationSpec(
+        key="list_jobs",
+        tool_name="scraper_list_jobs",
+        description="List previously submitted extraction jobs with optional filters.",
+        args_schema=ListJobsInput,
+        service_method="list_jobs",
+    ),
+    OperationSpec(
+        key="get_statistics",
+        tool_name="scraper_get_statistics",
+        description="Retrieve aggregated extraction statistics for an optional date range.",
+        args_schema=GetStatisticsInput,
+        service_method="get_statistics",
+    ),
+    OperationSpec(
+        key="health_check",
+        tool_name="scraper_check_health",
+        description="Check the scraper service health and version metadata.",
+        args_schema=HealthCheckInput,
+        service_method="health_check",
+    ),
+)
+# Lookup table from operation key to its spec, built once at import time and
+# consulted by get_operations().
+_OPERATIONS_BY_KEY = {operation.key: operation for operation in OPERATIONS}
+def get_operations(keys: Sequence[str] | None = None) -> tuple[OperationSpec, ...]:
+    if keys is None:
+        return OPERATIONS
+    selected: list[OperationSpec] = []
+    missing: list[str] = []
+    for key in keys:
+        operation = _OPERATIONS_BY_KEY.get(key)
+        if operation is None:
+            missing.append(key)
+            continue
+        selected.append(operation)
+    if missing:
+        available = ", ".join(sorted(_OPERATIONS_BY_KEY))
+        missing_text = ", ".join(sorted(missing))
+        raise ValueError(f"Unknown operations: {missing_text}. Available operations: {available}")
+    return tuple(selected)

geonode_scraper_tools_core/schemas.py ADDED Viewed

@@ -0,0 +1,99 @@
+from __future__ import annotations
+from datetime import datetime
+from typing import Literal
+from pydantic import BaseModel, ConfigDict, Field
+# Shared base class for every tool input schema in this module.
+class ToolInputModel(BaseModel):
+    # extra="forbid": unknown fields are rejected at validation time instead of
+    # being silently ignored.
+    model_config = ConfigDict(extra="forbid")
+# Input schema for the health check operation: it declares no fields, and the
+# inherited extra="forbid" setting rejects any argument that is passed.
+class HealthCheckInput(ToolInputModel):
+    pass
+# Input schema for the extract operation.
+class ExtractInput(ToolInputModel):
+    # Required, non-empty target URL. Only the length is validated here; the
+    # value is not parsed as a URL.
+    # NOTE(review): 2083 looks like the conventional browser URL length cap — confirm.
+    url: str = Field(
+        min_length=1,
+        max_length=2083,
+        description="URL to extract content from.",
+    )
+    # A default_factory is used so each instance gets its own list object.
+    formats: list[Literal["markdown", "html"]] = Field(
+        default_factory=lambda: ["html"],
+        description="Output formats to return.",
+    )
+    render_js: bool = Field(
+        default=False,
+        description="If true, uses a headless browser to render JavaScript before extraction.",
+    )
+    processing_mode: Literal["sync", "async"] = Field(
+        default="sync",
+        description="sync returns the extraction inline; async starts a job and returns a job ID.",
+    )
+    # The pattern accepts exactly two uppercase ASCII letters; lowercase codes
+    # are rejected rather than normalised.
+    proxy_country: str | None = Field(
+        default=None,
+        pattern="^[A-Z]{2}$",
+        description="Optional ISO 3166-1 alpha-2 country code for proxy geo-targeting.",
+    )
+    proxy_type: Literal["datacenter", "residential", "mix"] | None = Field(
+        default=None,
+        description="Optional proxy type.",
+    )
+    headers: dict[str, str] | None = Field(
+        default=None,
+        description="Optional HTTP headers forwarded to the target URL.",
+    )
+class GetJobResultInput(ToolInputModel):
+    job_id: str = Field(description="Extraction job ID returned by an async extract request.")
+class WaitForJobInput(ToolInputModel):
+    job_id: str = Field(description="Extraction job ID returned by an async extract request.")
+    timeout_seconds: float | None = Field(
+        default=None,
+        gt=0,
+        description="Override the polling timeout in seconds.",
+    )
+    poll_interval_seconds: float | None = Field(
+        default=None,
+        gt=0,
+        description="Override the polling interval in seconds.",
+    )
+# Input schema for the list_jobs operation. Every filter is optional and
+# defaults to None; only the paging fields have non-None defaults.
+class ListJobsInput(ToolInputModel):
+    job_id: str | None = Field(default=None, description="Filter by job ID.")
+    url: str | None = Field(default=None, description="Filter by target URL.")
+    status: Literal["queued", "processing", "completed", "failed", "cancelled"] | None = Field(
+        default=None,
+        description="Filter by job status.",
+    )
+    output: Literal["markdown", "html"] | None = Field(
+        default=None,
+        description="Filter by requested output format.",
+    )
+    # NOTE(review): nothing here checks that start_date <= end_date; presumably
+    # the API validates the range — confirm.
+    start_date: datetime | None = Field(
+        default=None,
+        description="Filter jobs created on or after this ISO 8601 datetime.",
+    )
+    end_date: datetime | None = Field(
+        default=None,
+        description="Filter jobs created on or before this ISO 8601 datetime.",
+    )
+    # Pages are 1-based (ge=1); page_size must be between 1 and 100 inclusive.
+    page: int = Field(default=1, ge=1, description="Page number.")
+    page_size: int = Field(default=100, ge=1, le=100, description="Number of results per page.")
+# Input schema for the get_statistics operation. Both bounds are optional and
+# default to None.
+class GetStatisticsInput(ToolInputModel):
+    # NOTE(review): nothing here checks that start_date <= end_date; presumably
+    # the API validates the range — confirm.
+    start_date: datetime | None = Field(
+        default=None,
+        description="Filter by start date as an ISO 8601 datetime.",
+    )
+    end_date: datetime | None = Field(
+        default=None,
+        description="Filter by end date as an ISO 8601 datetime.",
+    )

geonode_scraper_tools_core/service.py ADDED Viewed

@@ -0,0 +1,326 @@
+from __future__ import annotations
+from contextlib import contextmanager
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+from time import monotonic, sleep
+from typing import Any, Callable, ContextManager, Iterator, Mapping, Sequence
+from uuid import UUID
+from geonode_scraper_sdk import (
+    ApiClient,
+    Configuration,
+    ExtractionApi,
+    ExtractRequest,
+    OutputFormat,
+    ProcessingMode,
+    ProxySettings,
+    ProxyType,
+    StatisticsApi,
+    SystemApi,
+)
+from geonode_scraper_sdk.api_client import ApiClient as GeneratedApiClient
+from geonode_scraper_sdk.exceptions import ApiException
+from pydantic import BaseModel
+# Timeout value forwarded to SDK calls as `_request_timeout`: a single number,
+# a pair of numbers, or None.
+# NOTE(review): the pair is presumably (connect, read) as in generated OpenAPI
+# clients — confirm against the SDK.
+RequestTimeout = float | tuple[float, float] | None
+# Zero-argument callable returning a context manager that yields an SDK API
+# client; lets callers inject their own client construction.
+ApiClientFactory = Callable[[], ContextManager[GeneratedApiClient]]
+@dataclass(frozen=True)
+class ScraperToolSettings:
+    host: str
+    api_key: str
+    verify_ssl: bool = True
+    request_timeout: RequestTimeout = None
+    max_retries: int = 0
+    retry_backoff_seconds: float = 1.0
+    poll_interval_seconds: float = 3.0
+    poll_timeout_seconds: float = 60.0
+class ScraperToolService:
+    def __init__(
+        self,
+        settings: ScraperToolSettings,
+        *,
+        api_client_factory: ApiClientFactory | None = None,
+        sleep_fn: Callable[[float], None] = sleep,
+    ) -> None:
+        self.settings = settings
+        self._api_client_factory = api_client_factory
+        self._sleep = sleep_fn
+    def extract(
+        self,
+        *,
+        url: str,
+        formats: Sequence[str] | None = None,
+        render_js: bool = False,
+        processing_mode: str = "sync",
+        proxy_country: str | None = None,
+        proxy_type: str | None = None,
+        headers: Mapping[str, str] | None = None,
+    ) -> dict[str, Any]:
+        request = ExtractRequest(
+            url=url,
+            formats=self._build_output_formats(formats),
+            render_js=render_js,
+            processing_mode=ProcessingMode(processing_mode),
+            proxy=self._build_proxy(proxy_country=proxy_country, proxy_type=proxy_type),
+            headers=dict(headers) if headers is not None else None,
+        )
+        return self._execute_api_call(
+            operation="extract",
+            api_cls=ExtractionApi,
+            method_name="extract_v1_extract_post",
+            extract_request=request,
+            _request_timeout=self.settings.request_timeout,
+        )
+    def get_job_result(self, *, job_id: str) -> dict[str, Any]:
+        return self._execute_api_call(
+            operation="get_job_result",
+            api_cls=ExtractionApi,
+            method_name="get_job_result_v1_extract_job_id_get",
+            job_id=job_id,
+            _request_timeout=self.settings.request_timeout,
+        )
+    def wait_for_job(
+        self,
+        *,
+        job_id: str,
+        timeout_seconds: float | None = None,
+        poll_interval_seconds: float | None = None,
+    ) -> dict[str, Any]:
+        timeout = timeout_seconds if timeout_seconds is not None else self.settings.poll_timeout_seconds
+        interval = poll_interval_seconds if poll_interval_seconds is not None else self.settings.poll_interval_seconds
+        deadline = monotonic() + timeout
+        attempts = 0
+        while True:
+            attempts += 1
+            result = self.get_job_result(job_id=job_id)
+            result = {
+                **result,
+                "operation": "wait_for_job",
+                "poll_attempts": attempts,
+            }
+            if not result["ok"]:
+                return result
+            status = result["result"]["status"]
+            if status in {"completed", "failed", "cancelled"}:
+                return result
+            if monotonic() >= deadline:
+                return {
+                    "ok": False,
+                    "operation": "wait_for_job",
+                    "poll_attempts": attempts,
+                    "error": {
+                        "status": None,
+                        "code": "POLL_TIMEOUT",
+                        "message": f"Job {job_id} did not reach a terminal state before the polling timeout expired.",
+                        "retryable": True,
+                        "data": {"job_id": job_id, "last_status": status},
+                    },
+                }
+            self._sleep(interval)
+    def list_jobs(
+        self,
+        *,
+        job_id: str | None = None,
+        url: str | None = None,
+        status: str | None = None,
+        output: str | None = None,
+        start_date: datetime | None = None,
+        end_date: datetime | None = None,
+        page: int = 1,
+        page_size: int = 100,
+    ) -> dict[str, Any]:
+        return self._execute_api_call(
+            operation="list_jobs",
+            api_cls=ExtractionApi,
+            method_name="list_jobs_v1_extract_jobs_get",
+            job_id=job_id,
+            url=url,
+            status=self._enum_or_none(status, enum_type=self._job_status_enum()),
+            output=self._enum_or_none(output, enum_type=OutputFormat),
+            start_date=start_date,
+            end_date=end_date,
+            page=page,
+            page_size=page_size,
+            _request_timeout=self.settings.request_timeout,
+        )
+    def get_statistics(
+        self,
+        *,
+        start_date: datetime | None = None,
+        end_date: datetime | None = None,
+    ) -> dict[str, Any]:
+        return self._execute_api_call(
+            operation="get_statistics",
+            api_cls=StatisticsApi,
+            method_name="get_statistics_v1_statistics_get",
+            start_date=start_date,
+            end_date=end_date,
+            _request_timeout=self.settings.request_timeout,
+        )
+    def health_check(self) -> dict[str, Any]:
+        return self._execute_api_call(
+            operation="health_check",
+            api_cls=SystemApi,
+            method_name="health_check_health_get",
+            _request_timeout=self.settings.request_timeout,
+        )
+    def _execute_api_call(
+        self,
+        *,
+        operation: str,
+        api_cls: type[Any],
+        method_name: str,
+        **call_kwargs: Any,
+    ) -> dict[str, Any]:
+        return self._execute(
+            operation,
+            self._invoke_api_method,
+            api_cls,
+            method_name,
+            call_kwargs,
+        )
+    def _execute(self, operation: str, callback: Callable[..., Any], *args: Any) -> dict[str, Any]:
+        attempts = 0
+        while True:
+            try:
+                result = callback(*args)
+                payload = {
+                    "ok": True,
+                    "operation": operation,
+                    "attempts": attempts + 1,
+                    "result": self._normalize_value(result),
+                }
+                response_type = self._extract_response_type(operation, result)
+                if response_type is not None:
+                    payload["response_type"] = response_type
+                return payload
+            except ApiException as exc:
+                if self._should_retry(exc=exc, attempts=attempts):
+                    attempts += 1
+                    self._sleep(self.settings.retry_backoff_seconds * attempts)
+                    continue
+                return {
+                    "ok": False,
+                    "operation": operation,
+                    "attempts": attempts + 1,
+                    "error": self._normalize_exception(exc),
+                }
+    def _should_retry(self, *, exc: ApiException, attempts: int) -> bool:
+        if attempts >= self.settings.max_retries:
+            return False
+        payload = exc.data.error if getattr(exc.data, "error", None) is not None else exc.data
+        retryable = getattr(payload, "retryable", None)
+        if retryable is not None:
+            return bool(retryable)
+        return exc.status in {429, 500, 503}
+    def _invoke_api_method(self, api_cls: type[Any], method_name: str, call_kwargs: Mapping[str, Any]) -> Any:
+        with self._api_client_context() as api_client:
+            api = api_cls(api_client)
+            api_method = getattr(api, method_name)
+            return api_method(**call_kwargs)
+    @contextmanager
+    def _api_client_context(self) -> Iterator[GeneratedApiClient]:
+        if self._api_client_factory is not None:
+            with self._api_client_factory() as api_client:
+                yield api_client
+            return
+        configuration = Configuration(host=self.settings.host)
+        configuration.api_key["APIKeyHeader"] = self.settings.api_key
+        configuration.verify_ssl = self.settings.verify_ssl
+        with ApiClient(configuration) as api_client:
+            yield api_client
+    @staticmethod
+    def _build_output_formats(formats: Sequence[str] | None) -> list[OutputFormat] | None:
+        if formats is None:
+            return None
+        return [OutputFormat(format_name) for format_name in formats]
+    @staticmethod
+    def _build_proxy(*, proxy_country: str | None, proxy_type: str | None) -> ProxySettings | None:
+        if proxy_country is None and proxy_type is None:
+            return None
+        proxy_kwargs: dict[str, Any] = {}
+        if proxy_country is not None:
+            proxy_kwargs["country"] = proxy_country
+        if proxy_type is not None:
+            proxy_kwargs["type"] = ProxyType(proxy_type)
+        return ProxySettings(**proxy_kwargs)
+    @staticmethod
+    def _job_status_enum() -> type[Enum]:
+        from geonode_scraper_sdk import JobStatus
+        return JobStatus
+    @staticmethod
+    def _enum_or_none(value: str | None, *, enum_type: type[Enum]) -> Enum | None:
+        if value is None:
+            return None
+        return enum_type(value)
+    def _extract_response_type(self, operation: str, result: Any) -> str | None:
+        if operation != "extract":
+            return None
+        if getattr(result, "job_id", None) is not None and getattr(result, "status_url", None) is not None:
+            return "async"
+        return "sync"
+    def _normalize_exception(self, exc: ApiException) -> dict[str, Any]:
+        payload = exc.data.error if getattr(exc.data, "error", None) is not None else exc.data
+        return {
+            "status": exc.status,
+            "reason": exc.reason,
+            "code": self._normalize_value(getattr(payload, "code", None)),
+            "message": getattr(payload, "message", None) or exc.reason,
+            "retryable": getattr(payload, "retryable", None),
+            "body": exc.body,
+            "data": self._normalize_value(exc.data),
+        }
+    def _normalize_value(self, value: Any) -> Any:
+        if value is None:
+            return None
+        if isinstance(value, BaseModel):
+            return value.model_dump(mode="json", exclude_none=True)
+        if isinstance(value, Enum):
+            return value.value
+        if isinstance(value, datetime):
+            return value.isoformat()
+        if isinstance(value, UUID):
+            return str(value)
+        if isinstance(value, Mapping):
+            return {str(key): self._normalize_value(item) for key, item in value.items()}
+        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
+            return [self._normalize_value(item) for item in value]
+        return value

geonode_scraper_tools_core-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,56 @@
+Metadata-Version: 2.4
+Name: geonode-scraper-tools-core
+Version: 0.1.0
+Summary: Shared runtime and schemas for Geonode Scraper framework tools
+Author: Geonode Team
+License-Expression: MIT
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Typing :: Typed
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+Requires-Dist: geonode-scraper-sdk>=0.1.0
+Requires-Dist: pydantic>=2.11
+Requires-Dist: typing-extensions>=4.7.1
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == "dev"
+Requires-Dist: pytest-cov>=5.0; extra == "dev"
+Requires-Dist: ruff>=0.12.11; extra == "dev"
+# Geonode Scraper Tools Core
+Shared runtime, schemas, and operation registry for Geonode Scraper tool
+integrations.
+Most users should install one of the framework packages instead:
+- `geonode-scraper-langchain`
+- `geonode-scraper-crewai`
+Install the core package directly only if you are building your own wrapper layer
+on top of the shared service.
+## Installation
+```sh
+pip install geonode-scraper-tools-core
+```
+## Public API
+- `ScraperToolSettings`
+- `ScraperToolService`
+- `OperationSpec`
+- `OPERATIONS`
+- `get_operations()`
+The shared service normalizes SDK responses into JSON-friendly dictionaries and
+exposes the following operations:
+- `extract`
+- `get_job_result`
+- `wait_for_job`
+- `list_jobs`
+- `get_statistics`
+- `health_check`

geonode_scraper_tools_core-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+geonode_scraper_tools_core/__init__.py,sha256=3YWSYca3N5Gkm3FUquKcJdLitg5O6wpOV2dAVpp3j5s,568
+geonode_scraper_tools_core/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+geonode_scraper_tools_core/registry.py,sha256=RbF_vL4XYjy7m7CBTUzW3-1a-ODMD3JRkS220rJ-iLg,2984
+geonode_scraper_tools_core/schemas.py,sha256=kY5UxJIQL7YfI_5OXRy1sBxTF4wmeuHCAAjBRdnwzzQ,3274
+geonode_scraper_tools_core/service.py,sha256=zCS4RWFWs1-pNw-0T0Si_Wjh36dwEl_y8DFrJ2GxTt4,11304
+geonode_scraper_tools_core-0.1.0.dist-info/METADATA,sha256=mynM6L5sNYjryca5t3JlnfhOKInFg7YUq_Ghm5Stx6Q,1452
+geonode_scraper_tools_core-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+geonode_scraper_tools_core-0.1.0.dist-info/top_level.txt,sha256=Cgcs-WmPRNMfxerlNeumLzr1ykxkXe2t1K85xR71r3Q,27
+geonode_scraper_tools_core-0.1.0.dist-info/RECORD,,

geonode_scraper_tools_core-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,5 @@
+Wheel-Version: 1.0
+Generator: setuptools (82.0.1)
+Root-Is-Purelib: true
+Tag: py3-none-any

geonode_scraper_tools_core-0.1.0.dist-info/top_level.txt ADDED Viewed

@@ -0,0 +1 @@
+geonode_scraper_tools_core