geonode-scraper-tools-core 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ include README.md
2
+ recursive-include geonode_scraper_tools_core py.typed
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.4
2
+ Name: geonode-scraper-tools-core
3
+ Version: 0.1.0
4
+ Summary: Shared runtime and schemas for Geonode Scraper framework tools
5
+ Author: Geonode Team
6
+ License-Expression: MIT
7
+ Classifier: Programming Language :: Python :: 3.10
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Typing :: Typed
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: geonode-scraper-sdk>=0.1.0
14
+ Requires-Dist: pydantic>=2.11
15
+ Requires-Dist: typing-extensions>=4.7.1
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=8.0; extra == "dev"
18
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
19
+ Requires-Dist: ruff>=0.12.11; extra == "dev"
20
+
21
+ # Geonode Scraper Tools Core
22
+
23
+ Shared runtime, schemas, and operation registry for Geonode Scraper tool
24
+ integrations.
25
+
26
+ Most users should install one of the framework packages instead:
27
+
28
+ - `geonode-scraper-langchain`
29
+ - `geonode-scraper-crewai`
30
+
31
+ Install the core package directly only if you are building your own wrapper layer
32
+ on top of the shared service.
33
+
34
+ ## Installation
35
+
36
+ ```sh
37
+ pip install geonode-scraper-tools-core
38
+ ```
39
+
40
+ ## Public API
41
+
42
+ - `ScraperToolSettings`
43
+ - `ScraperToolService`
44
+ - `OperationSpec`
45
+ - `OPERATIONS`
46
+ - `get_operations()`
47
+
48
+ The shared service normalizes SDK responses into JSON-friendly dictionaries and
49
+ exposes the following operations:
50
+
51
+ - `extract`
52
+ - `get_job_result`
53
+ - `wait_for_job`
54
+ - `list_jobs`
55
+ - `get_statistics`
56
+ - `health_check`
@@ -0,0 +1,36 @@
1
+ # Geonode Scraper Tools Core
2
+
3
+ Shared runtime, schemas, and operation registry for Geonode Scraper tool
4
+ integrations.
5
+
6
+ Most users should install one of the framework packages instead:
7
+
8
+ - `geonode-scraper-langchain`
9
+ - `geonode-scraper-crewai`
10
+
11
+ Install the core package directly only if you are building your own wrapper layer
12
+ on top of the shared service.
13
+
14
+ ## Installation
15
+
16
+ ```sh
17
+ pip install geonode-scraper-tools-core
18
+ ```
19
+
20
+ ## Public API
21
+
22
+ - `ScraperToolSettings`
23
+ - `ScraperToolService`
24
+ - `OperationSpec`
25
+ - `OPERATIONS`
26
+ - `get_operations()`
27
+
28
+ The shared service normalizes SDK responses into JSON-friendly dictionaries and
29
+ exposes the following operations:
30
+
31
+ - `extract`
32
+ - `get_job_result`
33
+ - `wait_for_job`
34
+ - `list_jobs`
35
+ - `get_statistics`
36
+ - `health_check`
@@ -0,0 +1,26 @@
1
+ from .registry import OPERATIONS, OperationSpec, get_operations
2
+ from .schemas import (
3
+ ExtractInput,
4
+ GetJobResultInput,
5
+ GetStatisticsInput,
6
+ HealthCheckInput,
7
+ ListJobsInput,
8
+ WaitForJobInput,
9
+ )
10
+ from .service import ScraperToolService, ScraperToolSettings
11
+
12
+ __all__ = [
13
+ "OPERATIONS",
14
+ "ExtractInput",
15
+ "GetJobResultInput",
16
+ "GetStatisticsInput",
17
+ "HealthCheckInput",
18
+ "ListJobsInput",
19
+ "OperationSpec",
20
+ "ScraperToolService",
21
+ "ScraperToolSettings",
22
+ "WaitForJobInput",
23
+ "get_operations",
24
+ ]
25
+
26
+ __version__ = "0.1.0"
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import TYPE_CHECKING, Any, Sequence
5
+
6
+ from pydantic import BaseModel
7
+
8
+ from .schemas import (
9
+ ExtractInput,
10
+ GetJobResultInput,
11
+ GetStatisticsInput,
12
+ HealthCheckInput,
13
+ ListJobsInput,
14
+ WaitForJobInput,
15
+ )
16
+
17
+ if TYPE_CHECKING:
18
+ from .service import ScraperToolService
19
+
20
+
21
@dataclass(frozen=True)
class OperationSpec:
    """Immutable description of one tool operation.

    ``service_method`` is the attribute name that ``invoke`` looks up on the
    service object; the remaining fields are metadata carried alongside it.
    """

    key: str
    tool_name: str
    description: str
    args_schema: type[BaseModel]
    service_method: str

    def invoke(self, service: ScraperToolService, **kwargs: Any) -> dict[str, Any]:
        """Call the method named by ``service_method`` on *service* with *kwargs*."""
        bound_method = getattr(service, self.service_method)
        return bound_method(**kwargs)
31
+
32
+
33
# Registry of every operation, in declaration order.  Positional arguments
# follow the OperationSpec field order:
#   key, tool_name, description, args_schema, service_method
OPERATIONS: tuple[OperationSpec, ...] = (
    OperationSpec(
        "extract",
        "scraper_extract_content",
        "Extract structured content from a URL.",
        ExtractInput,
        "extract",
    ),
    OperationSpec(
        "get_job_result",
        "scraper_get_job_result",
        "Fetch the current state or final result for an async extraction job.",
        GetJobResultInput,
        "get_job_result",
    ),
    OperationSpec(
        "wait_for_job",
        "scraper_wait_for_job",
        "Poll an async extraction job until it reaches a terminal state or a timeout expires.",
        WaitForJobInput,
        "wait_for_job",
    ),
    OperationSpec(
        "list_jobs",
        "scraper_list_jobs",
        "List previously submitted extraction jobs with optional filters.",
        ListJobsInput,
        "list_jobs",
    ),
    OperationSpec(
        "get_statistics",
        "scraper_get_statistics",
        "Retrieve aggregated extraction statistics for an optional date range.",
        GetStatisticsInput,
        "get_statistics",
    ),
    OperationSpec(
        "health_check",
        "scraper_check_health",
        "Check the scraper service health and version metadata.",
        HealthCheckInput,
        "health_check",
    ),
)

# Lookup table from operation key to its spec, used by get_operations().
_OPERATIONS_BY_KEY = {spec.key: spec for spec in OPERATIONS}
79
+
80
+
81
def get_operations(keys: Sequence[str] | None = None) -> tuple[OperationSpec, ...]:
    """Return the registered operations, optionally restricted to *keys*.

    Args:
        keys: Operation keys to select, in the order they should be returned.
            ``None`` returns the full ``OPERATIONS`` tuple.  A single bare
            string is accepted and treated as one key.

    Returns:
        The matching ``OperationSpec`` objects, in the order requested.

    Raises:
        ValueError: If any requested key is not registered.  The message lists
            the unknown keys and every available key.
    """
    if keys is None:
        return OPERATIONS

    # A ``str`` is itself a ``Sequence[str]``; iterating it would look up each
    # character and report a confusing list of one-letter "unknown operations".
    requested = (keys,) if isinstance(keys, str) else keys

    selected: list[OperationSpec] = []
    missing: list[str] = []
    for key in requested:
        operation = _OPERATIONS_BY_KEY.get(key)
        if operation is None:
            missing.append(key)
        else:
            selected.append(operation)

    if missing:
        available = ", ".join(sorted(_OPERATIONS_BY_KEY))
        missing_text = ", ".join(sorted(missing))
        raise ValueError(f"Unknown operations: {missing_text}. Available operations: {available}")

    return tuple(selected)
@@ -0,0 +1,99 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime
4
+ from typing import Literal
5
+
6
+ from pydantic import BaseModel, ConfigDict, Field
7
+
8
+
9
# Shared base class for the tool input schemas below: ``extra="forbid"`` makes
# validation reject fields that the schema does not declare.
class ToolInputModel(BaseModel):
    model_config = ConfigDict(extra="forbid")
11
+
12
+
13
# Input schema for the health check operation: it declares no fields.
class HealthCheckInput(ToolInputModel):
    pass
15
+
16
+
17
# Input schema for the ``extract`` operation.
# (Documented with comments rather than a docstring so the generated JSON
# schema stays exactly as before.)
class ExtractInput(ToolInputModel):
    # Required target; between 1 and 2083 characters long.
    url: str = Field(
        description="URL to extract content from.",
        min_length=1,
        max_length=2083,
    )
    # A fresh ``["html"]`` list is created for each instance.
    formats: list[Literal["markdown", "html"]] = Field(
        description="Output formats to return.",
        default_factory=lambda: ["html"],
    )
    render_js: bool = Field(
        description="If true, uses a headless browser to render JavaScript before extraction.",
        default=False,
    )
    processing_mode: Literal["sync", "async"] = Field(
        description="sync returns the extraction inline; async starts a job and returns a job ID.",
        default="sync",
    )
    # Exactly two uppercase ASCII letters when provided.
    proxy_country: str | None = Field(
        description="Optional ISO 3166-1 alpha-2 country code for proxy geo-targeting.",
        default=None,
        pattern="^[A-Z]{2}$",
    )
    proxy_type: Literal["datacenter", "residential", "mix"] | None = Field(
        description="Optional proxy type.",
        default=None,
    )
    headers: dict[str, str] | None = Field(
        description="Optional HTTP headers forwarded to the target URL.",
        default=None,
    )
48
+
49
+
50
# Input schema for fetching a single job: only the job ID is required.
class GetJobResultInput(ToolInputModel):
    job_id: str = Field(
        description="Extraction job ID returned by an async extract request.",
    )
52
+
53
+
54
# Input schema for polling a job; both overrides must be strictly positive
# when given, and ``None`` leaves the choice to the service.
class WaitForJobInput(ToolInputModel):
    job_id: str = Field(
        description="Extraction job ID returned by an async extract request.",
    )
    timeout_seconds: float | None = Field(
        description="Override the polling timeout in seconds.",
        default=None,
        gt=0,
    )
    poll_interval_seconds: float | None = Field(
        description="Override the polling interval in seconds.",
        default=None,
        gt=0,
    )
66
+
67
+
68
# Input schema for listing jobs: every filter is optional, paging is bounded.
class ListJobsInput(ToolInputModel):
    job_id: str | None = Field(
        description="Filter by job ID.",
        default=None,
    )
    url: str | None = Field(
        description="Filter by target URL.",
        default=None,
    )
    status: Literal["queued", "processing", "completed", "failed", "cancelled"] | None = Field(
        description="Filter by job status.",
        default=None,
    )
    output: Literal["markdown", "html"] | None = Field(
        description="Filter by requested output format.",
        default=None,
    )
    start_date: datetime | None = Field(
        description="Filter jobs created on or after this ISO 8601 datetime.",
        default=None,
    )
    end_date: datetime | None = Field(
        description="Filter jobs created on or before this ISO 8601 datetime.",
        default=None,
    )
    # Pages are 1-based; a page holds between 1 and 100 results.
    page: int = Field(
        description="Page number.",
        default=1,
        ge=1,
    )
    page_size: int = Field(
        description="Number of results per page.",
        default=100,
        ge=1,
        le=100,
    )
89
+
90
+
91
# Input schema for the statistics operation: an optional date range.
class GetStatisticsInput(ToolInputModel):
    start_date: datetime | None = Field(
        description="Filter by start date as an ISO 8601 datetime.",
        default=None,
    )
    end_date: datetime | None = Field(
        description="Filter by end date as an ISO 8601 datetime.",
        default=None,
    )
@@ -0,0 +1,326 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass
5
+ from datetime import datetime
6
+ from enum import Enum
7
+ from time import monotonic, sleep
8
+ from typing import Any, Callable, ContextManager, Iterator, Mapping, Sequence
9
+ from uuid import UUID
10
+
11
+ from geonode_scraper_sdk import (
12
+ ApiClient,
13
+ Configuration,
14
+ ExtractionApi,
15
+ ExtractRequest,
16
+ OutputFormat,
17
+ ProcessingMode,
18
+ ProxySettings,
19
+ ProxyType,
20
+ StatisticsApi,
21
+ SystemApi,
22
+ )
23
+ from geonode_scraper_sdk.api_client import ApiClient as GeneratedApiClient
24
+ from geonode_scraper_sdk.exceptions import ApiException
25
+ from pydantic import BaseModel
26
+
27
# Value forwarded as ``_request_timeout`` on SDK calls: one number, a pair of
# numbers, or ``None``.
# NOTE(review): the pair is presumably (connect, read) seconds - confirm against the SDK.
RequestTimeout = float | tuple[float, float] | None
# Zero-argument callable returning a context manager that yields an API client.
ApiClientFactory = Callable[[], ContextManager[GeneratedApiClient]]
29
+
30
+
31
@dataclass(frozen=True)
class ScraperToolSettings:
    """Connection, retry and polling configuration for ``ScraperToolService``.

    The ``__repr__`` that ``@dataclass`` would generate prints every field,
    including ``api_key``.  A custom ``__repr__`` masks that one field so the
    credential does not end up in logs or tracebacks; all other generated
    behaviour (``__init__``, ``__eq__``, immutability) is unchanged.
    """

    host: str  # passed to the SDK ``Configuration(host=...)``
    api_key: str  # stored in the SDK configuration under "APIKeyHeader"
    verify_ssl: bool = True
    request_timeout: RequestTimeout = None  # forwarded as ``_request_timeout`` on each call
    max_retries: int = 0  # 0 means a failed call is never retried
    retry_backoff_seconds: float = 1.0  # multiplied by the retry number before each retry
    poll_interval_seconds: float = 3.0  # default pause between ``wait_for_job`` polls
    poll_timeout_seconds: float = 60.0  # default overall ``wait_for_job`` budget

    def __repr__(self) -> str:
        """Return the usual dataclass-style repr with ``api_key`` masked."""
        rendered = []
        for name in self.__dataclass_fields__:
            shown = "'***'" if name == "api_key" else repr(getattr(self, name))
            rendered.append(f"{name}={shown}")
        return f"{type(self).__name__}({', '.join(rendered)})"
41
+
42
+
43
class ScraperToolService:
    """Facade that calls the Geonode Scraper SDK and returns plain dictionaries.

    Each API-backed operation returns a dict with ``ok``, ``operation`` and
    ``attempts`` keys, plus ``result`` on success or ``error`` when the SDK
    raised ``ApiException``.  Only ``ApiException`` is converted into an error
    payload; any other exception propagates to the caller.
    """

    def __init__(
        self,
        settings: ScraperToolSettings,
        *,
        api_client_factory: ApiClientFactory | None = None,
        sleep_fn: Callable[[float], None] = sleep,
    ) -> None:
        """Store the configuration and the injectable collaborators.

        Args:
            settings: Connection, retry and polling configuration.
            api_client_factory: Optional callable returning a context manager
                that yields an API client.  When omitted, a client is built
                from ``settings`` for every call.
            sleep_fn: Function used for retry back-off and polling pauses.
        """
        self.settings = settings
        self._api_client_factory = api_client_factory
        self._sleep = sleep_fn

    def extract(
        self,
        *,
        url: str,
        formats: Sequence[str] | None = None,
        render_js: bool = False,
        processing_mode: str = "sync",
        proxy_country: str | None = None,
        proxy_type: str | None = None,
        headers: Mapping[str, str] | None = None,
    ) -> dict[str, Any]:
        """Submit an extraction request and return the normalized outcome.

        The string arguments are converted to SDK enum values before the
        request is sent; that conversion happens outside the retry/error
        wrapper, so a conversion failure is raised to the caller rather than
        reported as an ``error`` payload.  On success the payload also carries
        ``response_type`` (``"sync"`` or ``"async"``).
        """
        request = ExtractRequest(
            url=url,
            formats=self._build_output_formats(formats),
            render_js=render_js,
            processing_mode=ProcessingMode(processing_mode),
            proxy=self._build_proxy(proxy_country=proxy_country, proxy_type=proxy_type),
            # Copy so the request never aliases the caller's mapping.
            headers=dict(headers) if headers is not None else None,
        )

        return self._execute_api_call(
            operation="extract",
            api_cls=ExtractionApi,
            method_name="extract_v1_extract_post",
            extract_request=request,
            _request_timeout=self.settings.request_timeout,
        )

    def get_job_result(self, *, job_id: str) -> dict[str, Any]:
        """Fetch the current state or result of the job identified by *job_id*."""
        return self._execute_api_call(
            operation="get_job_result",
            api_cls=ExtractionApi,
            method_name="get_job_result_v1_extract_job_id_get",
            job_id=job_id,
            _request_timeout=self.settings.request_timeout,
        )

    def wait_for_job(
        self,
        *,
        job_id: str,
        timeout_seconds: float | None = None,
        poll_interval_seconds: float | None = None,
    ) -> dict[str, Any]:
        """Poll ``get_job_result`` until the job is terminal or time runs out.

        Args:
            job_id: Job to poll.
            timeout_seconds: Overall budget; defaults to
                ``settings.poll_timeout_seconds``.
            poll_interval_seconds: Pause between polls; defaults to
                ``settings.poll_interval_seconds``.

        Returns:
            The last ``get_job_result`` payload with ``operation`` rewritten to
            ``"wait_for_job"`` and ``poll_attempts`` added, when the poll failed
            or the status is ``completed``, ``failed`` or ``cancelled``.
            Otherwise a ``POLL_TIMEOUT`` error payload once the budget is spent.
        """
        timeout = timeout_seconds if timeout_seconds is not None else self.settings.poll_timeout_seconds
        interval = poll_interval_seconds if poll_interval_seconds is not None else self.settings.poll_interval_seconds
        deadline = monotonic() + timeout
        attempts = 0

        while True:
            attempts += 1
            result = {
                **self.get_job_result(job_id=job_id),
                "operation": "wait_for_job",
                "poll_attempts": attempts,
            }

            if not result["ok"]:
                return result

            # Normalization drops ``None`` fields, so ``status`` may be absent;
            # treat a missing status as "not terminal yet" instead of raising.
            job_payload = result.get("result")
            status = job_payload.get("status") if isinstance(job_payload, Mapping) else None
            if status in {"completed", "failed", "cancelled"}:
                return result

            remaining = deadline - monotonic()
            if remaining <= 0:
                return {
                    "ok": False,
                    "operation": "wait_for_job",
                    "poll_attempts": attempts,
                    "error": {
                        "status": None,
                        "code": "POLL_TIMEOUT",
                        "message": f"Job {job_id} did not reach a terminal state before the polling timeout expired.",
                        "retryable": True,
                        "data": {"job_id": job_id, "last_status": status},
                    },
                }

            # Never sleep past the deadline: the final poll then happens at the
            # deadline instead of up to one full interval after it.
            self._sleep(min(interval, remaining))

    def list_jobs(
        self,
        *,
        job_id: str | None = None,
        url: str | None = None,
        status: str | None = None,
        output: str | None = None,
        start_date: datetime | None = None,
        end_date: datetime | None = None,
        page: int = 1,
        page_size: int = 100,
    ) -> dict[str, Any]:
        """List jobs, forwarding every filter and paging argument to the SDK.

        ``status`` and ``output`` are converted to SDK enum values first;
        ``None`` filters are passed through unchanged.
        """
        return self._execute_api_call(
            operation="list_jobs",
            api_cls=ExtractionApi,
            method_name="list_jobs_v1_extract_jobs_get",
            job_id=job_id,
            url=url,
            status=self._enum_or_none(status, enum_type=self._job_status_enum()),
            output=self._enum_or_none(output, enum_type=OutputFormat),
            start_date=start_date,
            end_date=end_date,
            page=page,
            page_size=page_size,
            _request_timeout=self.settings.request_timeout,
        )

    def get_statistics(
        self,
        *,
        start_date: datetime | None = None,
        end_date: datetime | None = None,
    ) -> dict[str, Any]:
        """Retrieve statistics for the optional ``start_date``/``end_date`` range."""
        return self._execute_api_call(
            operation="get_statistics",
            api_cls=StatisticsApi,
            method_name="get_statistics_v1_statistics_get",
            start_date=start_date,
            end_date=end_date,
            _request_timeout=self.settings.request_timeout,
        )

    def health_check(self) -> dict[str, Any]:
        """Call the SDK health endpoint and return the normalized outcome."""
        return self._execute_api_call(
            operation="health_check",
            api_cls=SystemApi,
            method_name="health_check_health_get",
            _request_timeout=self.settings.request_timeout,
        )

    def _execute_api_call(
        self,
        *,
        operation: str,
        api_cls: type[Any],
        method_name: str,
        **call_kwargs: Any,
    ) -> dict[str, Any]:
        """Run ``api_cls(client).<method_name>(**call_kwargs)`` through ``_execute``."""
        return self._execute(
            operation,
            self._invoke_api_method,
            api_cls,
            method_name,
            call_kwargs,
        )

    def _execute(self, operation: str, callback: Callable[..., Any], *args: Any) -> dict[str, Any]:
        """Call ``callback(*args)``, retrying on retryable ``ApiException``s.

        ``attempts`` counts the retries already made, so the reported
        ``attempts`` value is that count plus one.  The pause before retry
        number *k* is ``retry_backoff_seconds * k``.
        """
        attempts = 0
        while True:
            try:
                result = callback(*args)
                payload = {
                    "ok": True,
                    "operation": operation,
                    "attempts": attempts + 1,
                    "result": self._normalize_value(result),
                }
                response_type = self._extract_response_type(operation, result)
                if response_type is not None:
                    payload["response_type"] = response_type
                return payload
            except ApiException as exc:
                if self._should_retry(exc=exc, attempts=attempts):
                    attempts += 1
                    self._sleep(self.settings.retry_backoff_seconds * attempts)
                    continue

                return {
                    "ok": False,
                    "operation": operation,
                    "attempts": attempts + 1,
                    "error": self._normalize_exception(exc),
                }

    @staticmethod
    def _error_payload(exc: ApiException) -> Any:
        """Return ``exc.data.error`` when it is set, otherwise ``exc.data``."""
        nested = getattr(exc.data, "error", None)
        return nested if nested is not None else exc.data

    def _should_retry(self, *, exc: ApiException, attempts: int) -> bool:
        """Decide whether a failed call should be retried.

        No retry once ``attempts`` reaches ``settings.max_retries``.  Otherwise
        an explicit ``retryable`` flag on the error payload wins; without one,
        only HTTP status 429, 500 and 503 are retried.
        """
        if attempts >= self.settings.max_retries:
            return False

        retryable = getattr(self._error_payload(exc), "retryable", None)
        if retryable is not None:
            return bool(retryable)

        return exc.status in {429, 500, 503}

    def _invoke_api_method(self, api_cls: type[Any], method_name: str, call_kwargs: Mapping[str, Any]) -> Any:
        """Open an API client, build ``api_cls`` on it and call ``method_name``."""
        with self._api_client_context() as api_client:
            api = api_cls(api_client)
            api_method = getattr(api, method_name)
            return api_method(**call_kwargs)

    @contextmanager
    def _api_client_context(self) -> Iterator[GeneratedApiClient]:
        """Yield an API client from the injected factory or from ``settings``."""
        if self._api_client_factory is not None:
            with self._api_client_factory() as api_client:
                yield api_client
            return

        configuration = Configuration(host=self.settings.host)
        configuration.api_key["APIKeyHeader"] = self.settings.api_key
        configuration.verify_ssl = self.settings.verify_ssl
        with ApiClient(configuration) as api_client:
            yield api_client

    @staticmethod
    def _build_output_formats(formats: Sequence[str] | None) -> list[OutputFormat] | None:
        """Convert format names to ``OutputFormat`` values; ``None`` stays ``None``."""
        if formats is None:
            return None
        return [OutputFormat(format_name) for format_name in formats]

    @staticmethod
    def _build_proxy(*, proxy_country: str | None, proxy_type: str | None) -> ProxySettings | None:
        """Build ``ProxySettings`` from whichever proxy options were given.

        Returns ``None`` when neither option is set; otherwise only the
        provided options are passed to ``ProxySettings``.
        """
        if proxy_country is None and proxy_type is None:
            return None

        proxy_kwargs: dict[str, Any] = {}
        if proxy_country is not None:
            proxy_kwargs["country"] = proxy_country
        if proxy_type is not None:
            proxy_kwargs["type"] = ProxyType(proxy_type)

        return ProxySettings(**proxy_kwargs)

    @staticmethod
    def _job_status_enum() -> type[Enum]:
        """Return the SDK ``JobStatus`` enum, imported on first use."""
        from geonode_scraper_sdk import JobStatus

        return JobStatus

    @staticmethod
    def _enum_or_none(value: str | None, *, enum_type: type[Enum]) -> Enum | None:
        """Convert *value* with ``enum_type``; ``None`` stays ``None``."""
        if value is None:
            return None
        return enum_type(value)

    def _extract_response_type(self, operation: str, result: Any) -> str | None:
        """Classify an ``extract`` result as ``"async"`` or ``"sync"``.

        A result carrying both ``job_id`` and ``status_url`` is ``"async"``.
        Operations other than ``extract`` have no response type (``None``).
        """
        if operation != "extract":
            return None

        if getattr(result, "job_id", None) is not None and getattr(result, "status_url", None) is not None:
            return "async"
        return "sync"

    def _normalize_exception(self, exc: ApiException) -> dict[str, Any]:
        """Flatten an ``ApiException`` into the ``error`` dictionary."""
        payload = self._error_payload(exc)
        return {
            "status": exc.status,
            "reason": exc.reason,
            "code": self._normalize_value(getattr(payload, "code", None)),
            # Fall back to the HTTP reason when the payload has no message.
            "message": getattr(payload, "message", None) or exc.reason,
            "retryable": getattr(payload, "retryable", None),
            "body": exc.body,
            "data": self._normalize_value(exc.data),
        }

    def _normalize_value(self, value: Any) -> Any:
        """Recursively convert *value* into JSON-friendly builtins.

        Pydantic models are dumped in JSON mode without ``None`` fields; enums
        become their value, datetimes ISO strings, UUIDs strings; mappings get
        string keys; non-string sequences and sets become lists.  Anything else
        is returned unchanged.
        """
        if value is None:
            return None
        if isinstance(value, BaseModel):
            return value.model_dump(mode="json", exclude_none=True)
        if isinstance(value, Enum):
            return value.value
        if isinstance(value, datetime):
            return value.isoformat()
        if isinstance(value, UUID):
            return str(value)
        if isinstance(value, Mapping):
            return {str(key): self._normalize_value(item) for key, item in value.items()}
        if isinstance(value, (set, frozenset)):
            # Sets are not ``Sequence`` instances and are not JSON-serializable.
            return [self._normalize_value(item) for item in value]
        if isinstance(value, Sequence) and not isinstance(value, (str, bytes, bytearray)):
            return [self._normalize_value(item) for item in value]
        return value
@@ -0,0 +1,56 @@
1
+ Metadata-Version: 2.4
2
+ Name: geonode-scraper-tools-core
3
+ Version: 0.1.0
4
+ Summary: Shared runtime and schemas for Geonode Scraper framework tools
5
+ Author: Geonode Team
6
+ License-Expression: MIT
7
+ Classifier: Programming Language :: Python :: 3.10
8
+ Classifier: Programming Language :: Python :: 3.11
9
+ Classifier: Programming Language :: Python :: 3.12
10
+ Classifier: Typing :: Typed
11
+ Requires-Python: >=3.10
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: geonode-scraper-sdk>=0.1.0
14
+ Requires-Dist: pydantic>=2.11
15
+ Requires-Dist: typing-extensions>=4.7.1
16
+ Provides-Extra: dev
17
+ Requires-Dist: pytest>=8.0; extra == "dev"
18
+ Requires-Dist: pytest-cov>=5.0; extra == "dev"
19
+ Requires-Dist: ruff>=0.12.11; extra == "dev"
20
+
21
+ # Geonode Scraper Tools Core
22
+
23
+ Shared runtime, schemas, and operation registry for Geonode Scraper tool
24
+ integrations.
25
+
26
+ Most users should install one of the framework packages instead:
27
+
28
+ - `geonode-scraper-langchain`
29
+ - `geonode-scraper-crewai`
30
+
31
+ Install the core package directly only if you are building your own wrapper layer
32
+ on top of the shared service.
33
+
34
+ ## Installation
35
+
36
+ ```sh
37
+ pip install geonode-scraper-tools-core
38
+ ```
39
+
40
+ ## Public API
41
+
42
+ - `ScraperToolSettings`
43
+ - `ScraperToolService`
44
+ - `OperationSpec`
45
+ - `OPERATIONS`
46
+ - `get_operations()`
47
+
48
+ The shared service normalizes SDK responses into JSON-friendly dictionaries and
49
+ exposes the following operations:
50
+
51
+ - `extract`
52
+ - `get_job_result`
53
+ - `wait_for_job`
54
+ - `list_jobs`
55
+ - `get_statistics`
56
+ - `health_check`
@@ -0,0 +1,14 @@
1
+ MANIFEST.in
2
+ README.md
3
+ pyproject.toml
4
+ geonode_scraper_tools_core/__init__.py
5
+ geonode_scraper_tools_core/py.typed
6
+ geonode_scraper_tools_core/registry.py
7
+ geonode_scraper_tools_core/schemas.py
8
+ geonode_scraper_tools_core/service.py
9
+ geonode_scraper_tools_core.egg-info/PKG-INFO
10
+ geonode_scraper_tools_core.egg-info/SOURCES.txt
11
+ geonode_scraper_tools_core.egg-info/dependency_links.txt
12
+ geonode_scraper_tools_core.egg-info/requires.txt
13
+ geonode_scraper_tools_core.egg-info/top_level.txt
14
+ tests/test_service.py
@@ -0,0 +1,8 @@
1
+ geonode-scraper-sdk>=0.1.0
2
+ pydantic>=2.11
3
+ typing-extensions>=4.7.1
4
+
5
+ [dev]
6
+ pytest>=8.0
7
+ pytest-cov>=5.0
8
+ ruff>=0.12.11
@@ -0,0 +1 @@
1
+ geonode_scraper_tools_core
@@ -0,0 +1,48 @@
1
+ [project]
2
+ name = "geonode-scraper-tools-core"
3
+ version = "0.1.0"
4
+ description = "Shared runtime and schemas for Geonode Scraper framework tools"
5
+ authors = [
6
+ {name = "Geonode Team"},
7
+ ]
8
+ readme = "README.md"
9
+ license = "MIT"
10
+ requires-python = ">=3.10"
11
+ classifiers = [
12
+ "Programming Language :: Python :: 3.10",
13
+ "Programming Language :: Python :: 3.11",
14
+ "Programming Language :: Python :: 3.12",
15
+ "Typing :: Typed",
16
+ ]
17
+ dependencies = [
18
+ "geonode-scraper-sdk>=0.1.0",
19
+ "pydantic>=2.11",
20
+ "typing-extensions>=4.7.1",
21
+ ]
22
+
23
+ [project.optional-dependencies]
24
+ dev = [
25
+ "pytest>=8.0",
26
+ "pytest-cov>=5.0",
27
+ "ruff>=0.12.11",
28
+ ]
29
+
30
[build-system]
# `license = "MIT"` in [project] is a PEP 639 SPDX license expression.
# setuptools only accepts that form from the 77 series on, so the build
# requirement is pinned; an unpinned "setuptools" lets an older build
# environment fail while validating pyproject.toml.
requires = ["setuptools>=77.0.3"]
build-backend = "setuptools.build_meta"
33
+
34
+ [tool.setuptools.packages.find]
35
+ include = ["geonode_scraper_tools_core*"]
36
+
37
+ [tool.setuptools.package-data]
38
+ geonode_scraper_tools_core = ["py.typed"]
39
+
40
+ [tool.pytest.ini_options]
41
+ testpaths = ["tests"]
42
+
43
+ [tool.ruff]
44
+ target-version = "py310"
45
+ line-length = 120
46
+
47
+ [tool.ruff.lint]
48
+ select = ["E", "W", "F", "I", "B"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,147 @@
1
+ from __future__ import annotations
2
+
3
+ from datetime import datetime, timezone
4
+ from unittest.mock import patch
5
+
6
+ from geonode_scraper_tools_core.service import ScraperToolService, ScraperToolSettings
7
+
8
+
9
def test_extract_sync_normalizes_response(service, mock_api_client, make_mock_http_response, sync_response_payload):
    """A 200 reply to ``extract`` is reported as a successful sync result."""
    http_response = make_mock_http_response(200, sync_response_payload)
    pool_manager = mock_api_client.rest_client.pool_manager

    with patch.object(pool_manager, "request", return_value=http_response):
        outcome = service.extract(url="https://example.com")

    assert outcome["ok"] is True
    assert outcome["operation"] == "extract"
    assert outcome["response_type"] == "sync"

    body = outcome["result"]
    assert body["tokens_charged"] == 10
    assert body["data"]["markdown"] == "# Example"
19
+
20
+
21
def test_extract_async_normalizes_union_response(
    service,
    mock_api_client,
    make_mock_http_response,
    async_response_payload,
):
    """A 202 reply to an async ``extract`` is reported with the job ID."""
    http_response = make_mock_http_response(202, async_response_payload)
    pool_manager = mock_api_client.rest_client.pool_manager

    with patch.object(pool_manager, "request", return_value=http_response):
        outcome = service.extract(url="https://example.com", processing_mode="async")

    assert outcome["ok"] is True
    assert outcome["response_type"] == "async"
    assert outcome["result"]["job_id"] == "test-job-uuid"
34
+
35
+
36
def test_wait_for_job_is_exposed_as_its_own_operation(service):
    """``wait_for_job`` polls until completion and relabels the operation."""
    still_running = {
        "ok": True,
        "operation": "get_job_result",
        "attempts": 1,
        "result": {"status": "processing", "job_id": "test-job-uuid"},
    }
    finished = {
        "ok": True,
        "operation": "get_job_result",
        "attempts": 1,
        "result": {"status": "completed", "job_id": "test-job-uuid", "data": {"markdown": "# Done"}},
    }
    poll_sequence = [still_running, finished]

    with patch.object(service, "get_job_result", side_effect=poll_sequence) as mock_get_job_result:
        outcome = service.wait_for_job(job_id="test-job-uuid")

    assert outcome["ok"] is True
    assert outcome["operation"] == "wait_for_job"
    assert outcome["poll_attempts"] == 2
    assert outcome["result"]["status"] == "completed"
    assert mock_get_job_result.call_count == 2
58
+
59
+
60
def test_extract_retries_retryable_errors(mock_api_client, make_mock_http_response):
    """A retryable 429 followed by a 200 yields success after two attempts."""
    from contextlib import contextmanager

    @contextmanager
    def yield_mock_client():
        yield mock_api_client

    with patch.object(mock_api_client.rest_client.pool_manager, "request") as mock_request:
        rate_limited = make_mock_http_response(
            429,
            {
                "code": "RATE_LIMITED",
                "message": "Too many requests",
                "retryable": True,
            },
        )
        succeeded = make_mock_http_response(
            200,
            {
                "data": {"html": "<html/>", "markdown": "# Example"},
                "metadata": {
                    "url": "https://example.com",
                    "render_js": False,
                    "formats": ["html"],
                    "processing_mode": "sync",
                },
                "tokens_charged": 10,
            },
        )
        mock_request.side_effect = [rate_limited, succeeded]

        # One retry allowed, with no back-off delay and a no-op sleep.
        retry_settings = ScraperToolSettings(
            host="http://api.mock.local",
            api_key="test-mock-api-key",
            max_retries=1,
            retry_backoff_seconds=0,
        )
        retrying_service = ScraperToolService(
            retry_settings,
            api_client_factory=yield_mock_client,
            sleep_fn=lambda _seconds: None,
        )
        outcome = retrying_service.extract(url="https://example.com")

        assert outcome["ok"] is True
        assert outcome["attempts"] == 2
        assert mock_request.call_count == 2
107
+
108
+
109
def test_list_jobs_serializes_filters(service, mock_api_client, make_mock_http_response, get_request_details):
    """Filters passed to ``list_jobs`` appear in the outgoing query string."""
    listed_job = {
        "job_id": "00000000-0000-0000-0000-000000000001",
        "status": "completed",
        "url": "https://example.com",
        "created_at": datetime(2026, 1, 1, tzinfo=timezone.utc).isoformat(),
    }
    http_response = make_mock_http_response(
        200,
        {
            "jobs": [listed_job],
            "page": 1,
            "page_size": 100,
            "page_count": 1,
        },
    )

    start_date = datetime(2026, 1, 1, tzinfo=timezone.utc)
    end_date = datetime(2026, 1, 7, tzinfo=timezone.utc)
    pool_manager = mock_api_client.rest_client.pool_manager
    with patch.object(pool_manager, "request", return_value=http_response) as mock_request:
        outcome = service.list_jobs(status="completed", output="markdown", start_date=start_date, end_date=end_date)

    query = get_request_details(mock_request)["query"]
    assert outcome["ok"] is True
    assert query["status"] == "completed"
    assert query["output"] == "markdown"
    assert query["start_date"] == start_date.strftime(mock_api_client.configuration.datetime_format)
    assert query["end_date"] == end_date.strftime(mock_api_client.configuration.datetime_format)
138
+
139
+
140
def test_health_check_preserves_error_payload(service, mock_api_client, make_mock_http_response, health_payload):
    """A 503 health reply is reported as an error that keeps the body's data."""
    degraded_payload = {**health_payload, "status": "degraded"}
    http_response = make_mock_http_response(503, degraded_payload)
    pool_manager = mock_api_client.rest_client.pool_manager

    with patch.object(pool_manager, "request", return_value=http_response):
        outcome = service.health_check()

    assert outcome["ok"] is False

    error = outcome["error"]
    assert error["status"] == 503
    assert error["data"]["status"] == "degraded"