ghostcrawl-langchain 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,6 @@
1
+ __pycache__/
2
+ *.pyc
3
+ dist/
4
+ build/
5
+ *.egg-info/
6
+ .pytest_cache/
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 GhostCrawl
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.4
2
+ Name: ghostcrawl-langchain
3
+ Version: 2.1.0
4
+ Summary: LangChain tool wrappers for the ghostcrawl SaaS stealth-scraping API.
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.9
8
+ Requires-Dist: httpx>=0.27
9
+ Requires-Dist: langchain-core>=0.3
10
+ Requires-Dist: pydantic>=2.7
11
+ Provides-Extra: dev
12
+ Requires-Dist: pytest>=8; extra == 'dev'
13
+ Requires-Dist: respx>=0.21; extra == 'dev'
14
+ Description-Content-Type: text/markdown
15
+
16
+ # ghostcrawl-langchain
17
+
18
+ LangChain tool integration for the [ghostcrawl](https://pypi.org/project/ghostcrawl/)
19
+ stealth-scraping API.
20
+
21
+ Exposes the ghostcrawl scrape / search / extract / crawl / Google-vertical
22
+ surfaces as LangChain `BaseTool` subclasses with shared Pydantic params
23
+ validation.
24
+
25
+ ## Install
26
+
27
+ ```bash
28
+ pip install ghostcrawl-langchain
29
+ ```
30
+
31
+ ## License
32
+
33
+ MIT
@@ -0,0 +1,18 @@
1
+ # ghostcrawl-langchain
2
+
3
+ LangChain tool integration for the [ghostcrawl](https://pypi.org/project/ghostcrawl/)
4
+ stealth-scraping API.
5
+
6
+ Exposes the ghostcrawl scrape / search / extract / crawl / Google-vertical
7
+ surfaces as LangChain `BaseTool` subclasses with shared Pydantic params
8
+ validation.
9
+
10
+ ## Install
11
+
12
+ ```bash
13
+ pip install ghostcrawl-langchain
14
+ ```
15
+
16
+ ## License
17
+
18
+ MIT
@@ -0,0 +1,34 @@
1
+ """ghostcrawl-langchain — LangChain tool wrappers for the GhostCrawl SaaS API."""
2
+ from .errors import (
3
+ GhostcrawlAuthError,
4
+ GhostcrawlError,
5
+ GhostcrawlQuotaError,
6
+ GhostcrawlRateLimitError,
7
+ GhostcrawlServerError,
8
+ )
9
+ from .tools import (
10
+ GhostcrawlCrawlTool,
11
+ GhostcrawlExtractTool,
12
+ GhostcrawlGoogleHotelsTool,
13
+ GhostcrawlGoogleSearchTool,
14
+ GhostcrawlGoogleSportsTool,
15
+ GhostcrawlScrapeTool,
16
+ GhostcrawlSearchTool,
17
+ GhostcrawlUsageTool,
18
+ )
19
+
20
+ __all__ = [
21
+ "GhostcrawlScrapeTool",
22
+ "GhostcrawlSearchTool",
23
+ "GhostcrawlGoogleSearchTool",
24
+ "GhostcrawlGoogleHotelsTool",
25
+ "GhostcrawlGoogleSportsTool",
26
+ "GhostcrawlExtractTool",
27
+ "GhostcrawlCrawlTool",
28
+ "GhostcrawlUsageTool",
29
+ "GhostcrawlError",
30
+ "GhostcrawlAuthError",
31
+ "GhostcrawlQuotaError",
32
+ "GhostcrawlRateLimitError",
33
+ "GhostcrawlServerError",
34
+ ]
@@ -0,0 +1,84 @@
1
+ """Shared httpx client factory for ghostcrawl-langchain tools."""
2
+ from __future__ import annotations
3
+ import os
4
+ import platform
5
+ import sys
6
+ from importlib.metadata import PackageNotFoundError, version
7
+
8
+ import httpx
9
+
10
+
11
def _ua() -> str:
    """Build the User-Agent header value used by the shared httpx clients.

    The result has the shape
    ``ghostcrawl-langchain/<version> Python/<major>.<minor> <platform>``.
    When the installed distribution metadata cannot be found, the version
    component falls back to ``0.0.0``.
    """
    try:
        pkg_version = version("ghostcrawl-langchain")
    except PackageNotFoundError:
        pkg_version = "0.0.0"
    py = sys.version_info
    components = [
        f"ghostcrawl-langchain/{pkg_version}",
        f"Python/{py.major}.{py.minor}",
        platform.system(),
    ]
    return " ".join(components)
21
+
22
+
23
def get_client(
    api_key: str | None = None,
    base_url: str | None = None,
) -> httpx.Client:
    """Return a configured ``httpx.Client`` with Bearer auth and branded User-Agent.

    Args:
        api_key: API key to send as the Bearer token. When it is ``None``,
            empty, or whitespace-only, the ``GHOSTCRAWL_API_KEY`` environment
            variable is used instead.
        base_url: API base URL. When it is ``None`` or blank, the
            ``GHOSTCRAWL_BASE_URL`` environment variable is used; when that
            is unset or blank too, ``https://api.ghostcrawl.io`` is used.
            Trailing slashes are removed.

    Returns:
        A synchronous client with a 60 second timeout and the
        ``Authorization``, ``User-Agent`` and ``Content-Type`` headers set.

    Raises:
        ValueError: If no non-blank API key is available from either source.
    """
    # Strip each candidate *before* falling back, so a blank argument does
    # not hide a valid environment variable.
    key = (api_key or "").strip() or os.environ.get("GHOSTCRAWL_API_KEY", "").strip()
    if not key:
        raise ValueError(
            "ghostcrawl-langchain: GHOSTCRAWL_API_KEY is not set. "
            "Set the GHOSTCRAWL_API_KEY environment variable or pass api_key to get_client()."
        )
    # An environment variable that is set but empty must not produce an
    # empty base URL, so the default is the final fallback in the chain.
    resolved_base = (
        (base_url or "").strip()
        or os.environ.get("GHOSTCRAWL_BASE_URL", "").strip()
        or "https://api.ghostcrawl.io"
    ).rstrip("/")
    return httpx.Client(
        base_url=resolved_base,
        headers={
            "Authorization": f"Bearer {key}",
            "User-Agent": _ua(),
            "Content-Type": "application/json",
        },
        timeout=60.0,
    )
55
+
56
+
57
def get_async_client(
    api_key: str | None = None,
    base_url: str | None = None,
) -> httpx.AsyncClient:
    """Return a configured ``httpx.AsyncClient`` for the async tool paths.

    Args:
        api_key: API key to send as the Bearer token. When it is ``None``,
            empty, or whitespace-only, the ``GHOSTCRAWL_API_KEY`` environment
            variable is used instead.
        base_url: API base URL. When it is ``None`` or blank, the
            ``GHOSTCRAWL_BASE_URL`` environment variable is used; when that
            is unset or blank too, ``https://api.ghostcrawl.io`` is used.
            Trailing slashes are removed.

    Returns:
        An asynchronous client with a 60 second timeout and the
        ``Authorization``, ``User-Agent`` and ``Content-Type`` headers set.

    Raises:
        ValueError: If no non-blank API key is available from either source.
    """
    # Strip each candidate *before* falling back, so a blank argument does
    # not hide a valid environment variable.
    key = (api_key or "").strip() or os.environ.get("GHOSTCRAWL_API_KEY", "").strip()
    if not key:
        raise ValueError(
            "ghostcrawl-langchain: GHOSTCRAWL_API_KEY is not set. "
            "Set the GHOSTCRAWL_API_KEY environment variable or pass api_key to get_async_client()."
        )
    # An environment variable that is set but empty must not produce an
    # empty base URL, so the default is the final fallback in the chain.
    resolved_base = (
        (base_url or "").strip()
        or os.environ.get("GHOSTCRAWL_BASE_URL", "").strip()
        or "https://api.ghostcrawl.io"
    ).rstrip("/")
    return httpx.AsyncClient(
        base_url=resolved_base,
        headers={
            "Authorization": f"Bearer {key}",
            "User-Agent": _ua(),
            "Content-Type": "application/json",
        },
        timeout=60.0,
    )
@@ -0,0 +1,33 @@
1
+ """ghostcrawl-langchain error classes.
2
+
3
+ This is the canonical Python status_code → exception class mapping; the Node SDK
4
+ (sdks/node/src/errors.ts) and the other official SDK surfaces are kept in lockstep
5
+ with it.
6
+
7
+ Third-party attribution: see sdks/python-langchain/LICENSE-NOTICE.md.
8
+ """
9
+ from __future__ import annotations
10
+
11
+
12
class GhostcrawlError(Exception):
    """Root of the ghostcrawl-langchain exception hierarchy.

    Every other ``Ghostcrawl*`` exception in this module derives from it.
    """
14
+
15
+
16
class GhostcrawlAuthError(GhostcrawlError):
    """HTTP 401: the API key is invalid or the Authorization header is missing."""
18
+
19
+
20
class GhostcrawlQuotaError(GhostcrawlError):
    """HTTP 402: quota exceeded. Top up at https://ghostcrawl.io/billing."""
22
+
23
+
24
class GhostcrawlRateLimitError(GhostcrawlError):
    """HTTP 429: the request was rate limited.

    Attributes:
        retry_after: Number of seconds supplied by the raiser, or ``None``
            when no value was provided.
    """

    def __init__(self, message: str, retry_after: int | None = None):
        # Record the retry hint on the instance, then let the base exception
        # store the message as usual.
        self.retry_after = retry_after
        super().__init__(message)
30
+
31
+
32
class GhostcrawlServerError(GhostcrawlError):
    """HTTP 5xx: a server-side failure that is safe to retry."""
@@ -0,0 +1,778 @@
1
+ """LangChain BaseTool subclasses for GhostCrawl REST endpoints.
2
+
3
+ Auth: Authorization: Bearer <token>.
4
+
5
+ - _handle_error maps httpx.HTTPStatusError to Ghostcrawl* domain subclasses.
6
+ - All _run() bodies wrap r.raise_for_status() in try/except -> _handle_error.
7
+ - All _arun() bodies use httpx.AsyncClient.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import ast
12
+ import json
13
+ from typing import Any, Dict, NoReturn, Optional, Type
14
+ from urllib.parse import parse_qs
15
+
16
+ import httpx
17
+ from langchain_core.tools import BaseTool
18
+ from pydantic import BaseModel, Field, field_validator
19
+
20
+ from ._client import get_async_client, get_client
21
+ from .errors import (
22
+ GhostcrawlAuthError,
23
+ GhostcrawlError,
24
+ GhostcrawlQuotaError,
25
+ GhostcrawlRateLimitError,
26
+ GhostcrawlServerError,
27
+ )
28
+
29
+
30
+ # ---------------------------------------------------------------------------
31
+ # Error mapping
32
+ # ---------------------------------------------------------------------------
33
+
34
def _handle_error(exc: httpx.HTTPStatusError) -> NoReturn:
    """Re-raise an ``httpx.HTTPStatusError`` as the matching ``Ghostcrawl*`` error.

    Status mapping:
        401 -> ``GhostcrawlAuthError``
        402 -> ``GhostcrawlQuotaError``
        429 -> ``GhostcrawlRateLimitError`` (``retry_after`` taken from the
               ``Retry-After`` header when it parses as an integer, else ``None``)
        5xx -> ``GhostcrawlServerError``
        anything else -> ``GhostcrawlError``

    The response body text is appended to every message, and the original
    exception is chained as ``__cause__``. This function never returns.
    """
    response = exc.response
    code = response.status_code
    text = response.text

    if code == 429:
        header_value = response.headers.get("Retry-After")
        seconds = None
        if header_value is not None:
            try:
                seconds = int(header_value)
            except (TypeError, ValueError):
                # Non-integer values (for example an HTTP date) are ignored.
                seconds = None
        raise GhostcrawlRateLimitError(
            f"429 Too Many Requests: {text}", retry_after=seconds
        ) from exc

    # Statuses that map one-to-one onto a dedicated exception class.
    fixed_statuses = {
        401: (GhostcrawlAuthError, "401 Unauthorized"),
        402: (GhostcrawlQuotaError, "402 Payment Required"),
    }
    if code in fixed_statuses:
        error_cls, label = fixed_statuses[code]
        raise error_cls(f"{label}: {text}") from exc

    if code // 100 == 5:
        raise GhostcrawlServerError(f"{code} Server Error: {text}") from exc

    raise GhostcrawlError(f"{code} HTTP error: {text}") from exc
60
+
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Input schemas
64
+ # ---------------------------------------------------------------------------
65
+
66
def _coerce_scalar(raw: Any) -> Any:
    """Narrow a URL ``k=v`` value to ``bool`` or ``int`` when it matches exactly.

    Pure value-narrowing: no ``eval`` and no attribute access on the input.

    Args:
        raw: A single query-string value, or a list of values for a repeated
            key.

    Returns:
        ``True`` / ``False`` for a case-insensitive ``"true"`` / ``"false"``,
        an ``int`` for a digit-only string that ``int()`` can convert, and
        the input unchanged in every other case (lists, other strings,
        non-string values).
    """
    if not isinstance(raw, str):
        # Lists (repeated keys) and any other non-string value stay inert.
        return raw
    lowered = raw.lower()
    if lowered == "true":
        return True
    if lowered == "false":
        return False
    if raw.isdigit():
        # str.isdigit() is broader than what int() accepts: characters such
        # as superscript digits ("²") pass isdigit() but make int() raise,
        # and very long digit runs can exceed the interpreter's int/str
        # conversion limit. Treat those as plain strings instead of failing.
        try:
            return int(raw)
        except ValueError:
            return raw
    return raw
84
+
85
+
86
class _GhostcrawlBaseInput(BaseModel):
    """Shared base carrying the single 3-stage ``params`` coercion validator.

    Technique adapted from the Bee ``str_to_dict_validator`` reference and
    merged into one base class so every params-bearing tool input inherits
    the same coercion.

    A ``params`` value that is already a dict is returned unchanged. A string
    is tried, in order, as:

    1. a JSON object (``{"render_js": true}``),
    2. a Python-literal dict (``{'render_js': True}``) via
       ``ast.literal_eval``, which only evaluates literals and never executes
       code, calls functions, or imports,
    3. a URL query string (``screenshot=true&wait=3000``).

    A string that matches none of the stages (for example a call expression
    without ``=``) yields ``{}``.

    The validator is declared with ``check_fields=False`` so that this base,
    which has no ``params`` field itself, can be defined without a
    ``PydanticUserError``.
    """

    @field_validator("params", mode="before", check_fields=False)
    @classmethod
    def _coerce_params(cls, v: Any) -> dict:
        """Coerce the raw ``params`` input into a dict before field validation.

        Args:
            v: The raw value supplied for ``params``.

        Returns:
            ``v`` itself when it is a dict; ``v or {}`` for other non-string
            values; otherwise the dict parsed from the string, or ``{}`` when
            the string is blank or unparseable.
        """
        if isinstance(v, dict):
            return v
        if not isinstance(v, str):
            return v or {}
        s = v.strip()
        if not s:
            return {}

        # Stage 1 — JSON object string ({"render_js": true}).
        # RecursionError is caught as well: pathologically nested input must
        # fall through to the next stage rather than escape the validator.
        try:
            parsed = json.loads(s)
        except (json.JSONDecodeError, ValueError, RecursionError):
            parsed = None
        if isinstance(parsed, dict):
            return parsed

        # Stage 2 — Python-literal dict string ({'render_js': True}).
        # literal-only via ast.literal_eval: no code execution / no imports.
        # MemoryError and RecursionError are documented failure modes of
        # literal_eval on complex input.
        try:
            literal = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            literal = None
        if isinstance(literal, dict):
            return literal

        # Stage 3 — URL k=v string (screenshot=true&wait=3000).
        if "=" in s:
            parsed_qs = parse_qs(s, keep_blank_values=True)
            # Single-valued keys are unwrapped; repeated keys stay as lists.
            return {
                key: _coerce_scalar(vals[0] if len(vals) == 1 else vals)
                for key, vals in parsed_qs.items()
            }
        return {}
141
+
142
+
143
class _ScrapeInput(_GhostcrawlBaseInput):
    """Argument schema for ``GhostcrawlScrapeTool``."""

    # Required target URL.
    url: str = Field(..., description="URL to scrape (http/https).")
    # Whether the page's JavaScript is executed before extraction.
    render_js: bool = Field(
        default=False,
        description="Execute JavaScript before extraction. Required for SPAs and lazy-loaded content.",
    )
    # Format of the returned content.
    output_format: str = Field(
        default="markdown",
        description="Content format to return: 'html' returns raw HTML; 'markdown' returns cleaned Markdown.",
    )
    # Additional request parameters; coerced by the inherited validator.
    params: dict = Field(
        default_factory=dict,
        description=(
            "Extra /v1/scrape parameters as a dict or JSON string. "
            "Supported keys: wait_for (CSS selector), extract_schema (JSON schema object), "
            "country (ISO 3166-1 alpha-2), screenshot (bool), full_page (bool), "
            "screenshot_selector (CSS selector for element-scoped capture)."
        ),
    )
162
+
163
+
164
class _SearchInput(_GhostcrawlBaseInput):
    """Argument schema for ``GhostcrawlSearchTool``."""

    # Required search text.
    query: str = Field(..., description="Search query string.")
    # Search backend name.
    engine: str = Field(
        default="brave",
        description="Search engine: 'brave' or 'tavily'. Requires your own provider API key via X-Provider-Authorization.",
    )
    # Result cap, validated to the inclusive range 1..20.
    limit: int = Field(
        default=10,
        ge=1,
        le=20,
        description="Maximum number of results to return (1–20).",
    )
    # Additional request parameters; coerced by the inherited validator.
    params: dict = Field(
        default_factory=dict,
        description="Extra /v1/search parameters as a dict or JSON string.",
    )
180
+
181
+
182
class _ExtractInput(_GhostcrawlBaseInput):
    """Argument schema for ``GhostcrawlExtractTool``."""

    # Required target URL.
    url: str = Field(..., description="URL to extract structured data from (http/https).")
    # JSON Schema for the extraction; an empty dict means "no schema".
    schema_: dict = Field(
        default_factory=dict,
        description=(
            "JSON Schema object describing the fields to extract. "
            "Example: {\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}}}. "
            "When omitted, ghostcrawl returns the full page content."
        ),
    )
    # Additional request parameters; coerced by the inherited validator.
    params: dict = Field(
        default_factory=dict,
        description="Extra /v1/extract parameters as a dict or JSON string.",
    )

    @field_validator("schema_", mode="before")
    @classmethod
    def _coerce_schema(cls, v: Any) -> dict:
        """Accept ``schema_`` as a dict or as a JSON string.

        Args:
            v: The raw value supplied for ``schema_``.

        Returns:
            The decoded JSON for a non-blank string, ``{}`` for a blank
            string or any other falsy value, and ``v`` unchanged otherwise.

        Raises:
            ValueError: If ``v`` is a non-blank string that is not valid JSON.
        """
        if isinstance(v, str):
            # A blank string carries no schema; treat it like an omitted
            # value, matching how blank ``params`` strings are handled.
            if not v.strip():
                return {}
            try:
                return json.loads(v)
            except (json.JSONDecodeError, ValueError, RecursionError) as e:
                raise ValueError(f"schema must be valid JSON: {e}") from e
        return v or {}
206
+
207
+
208
class _CrawlInput(_GhostcrawlBaseInput):
    """Argument schema for ``GhostcrawlCrawlTool``."""

    # Required seed URL.
    url: str = Field(..., description="Seed URL to start crawling from (http/https).")
    # Page cap, validated to the inclusive range 1..500.
    max_pages: int = Field(
        default=10,
        ge=1,
        le=500,
        description="Maximum number of pages to crawl from the seed URL (1–500).",
    )
    # Whether the crawl is restricted to the seed URL's origin.
    same_origin: bool = Field(
        default=True,
        description="Restrict crawl to the same origin (scheme + host + port). Disable to follow cross-origin links.",
    )
    # Additional request parameters; coerced by the inherited validator.
    params: dict = Field(
        default_factory=dict,
        description="Extra /v1/crawl parameters as a dict or JSON string.",
    )
224
+
225
+
226
class _GoogleSearchInput(_GhostcrawlBaseInput):
    """Argument schema for ``GhostcrawlGoogleSearchTool``."""

    # Required search text.
    query: str = Field(..., description="Google search query string.")
    # Region code for the results.
    country: str = Field(
        default="us",
        description="ISO 3166-1 alpha-2 country code for regional Google SERP results.",
    )
    # Additional request parameters; coerced by the inherited validator.
    params: dict = Field(
        default_factory=dict,
        description="Extra /v1/google/search parameters as a dict or JSON string.",
    )
236
+
237
+
238
class _GoogleHotelsInput(_GhostcrawlBaseInput):
    """Argument schema for the Google Hotels SERP tool (Plan 140.5-18 / D-07a / D-01 parity).

    Subclasses ``_GhostcrawlBaseInput``, so ``params`` goes through the
    shared 3-stage coercion validator and no per-class validator is needed.
    """

    # Required search text and stay dates (dates are plain strings here).
    query: str = Field(..., description="Hotel search query, e.g. 'hotels in san francisco'.")
    check_in: str = Field(..., description="Check-in date, ISO 8601 (YYYY-MM-DD).")
    check_out: str = Field(..., description="Check-out date, ISO 8601 (YYYY-MM-DD).")
    # Occupancy.
    adults: int = Field(default=2, description="Number of adult guests (default 2).")
    rooms: int = Field(default=1, description="Number of rooms (default 1).")
    # Localisation.
    currency: str = Field(default="USD", description="ISO 4217 currency code (default USD).")
    country: str = Field(default="us", description="ISO 3166-1 alpha-2 region code (default us).")
    # Additional request parameters; coerced by the inherited validator.
    params: dict = Field(
        default_factory=dict,
        description="Extra /v1/google/hotels parameters as a dict or JSON string.",
    )
256
+
257
+
258
class _GoogleSportsInput(_GhostcrawlBaseInput):
    """Argument schema for the Google Sports SERP tool (Plan 140.5-19 / D-07b / D-01 parity).

    Subclasses ``_GhostcrawlBaseInput``, so ``params`` goes through the
    shared 3-stage coercion validator and no per-class validator is needed.
    """

    # Required search text.
    query: str = Field(..., description="Sports query, e.g. 'lakers score' or 'premier league'.")
    # Region code for the results.
    country: str = Field(default="us", description="ISO 3166-1 alpha-2 region code (default us).")
    # Additional request parameters; coerced by the inherited validator.
    params: dict = Field(
        default_factory=dict,
        description="Extra /v1/google/sports parameters as a dict or JSON string.",
    )
271
+
272
+
273
class _EmptyInput(BaseModel):
    """Args schema with no fields, for tools that accept no parameters.

    Deliberately derives from plain ``BaseModel`` rather than
    ``_GhostcrawlBaseInput``: there is no ``params`` field here, so the
    shared ``params`` validator would have nothing to act on.
    """
280
+
281
+
282
+ # ---------------------------------------------------------------------------
283
+ # BaseTool subclasses
284
+ # ---------------------------------------------------------------------------
285
+
286
class GhostcrawlScrapeTool(BaseTool):
    """LangChain tool that scrapes a single URL through ``POST /v1/scrape``.

    The sync and async entry points send the same JSON payload and return
    the response body text. HTTP error statuses are re-raised as
    ``Ghostcrawl*`` exceptions by ``_handle_error``.
    """

    name: str = "ghostcrawl_scrape"
    description: str = (
        "Scrape a webpage using ghostcrawl's stealth browser fleet and return the content "
        "as HTML or Markdown. Suitable for news articles, landing pages, product pages, "
        "and any publicly accessible URL. "
        "SUPPORTED: render_js (bool — execute JavaScript before extraction), "
        "output_format ('html' | 'markdown'), wait_for (CSS selector to await), "
        "extract_schema (JSON schema for structured extraction), country (ISO 3166-1 alpha-2 "
        "for geo-targeted content). "
        "UNSUPPORTED: CAPTCHA solving, multi-step login sessions (use ghostcrawl_crawl with "
        "agent-mode for session-based flows). "
        "Returns: page content as HTML or Markdown plus metadata including title and HTTP status_code. "
        "Uses POST /v1/scrape with Authorization: Bearer token auth."
    )
    args_schema: Type[BaseModel] = _ScrapeInput

    def _run(
        self,
        url: str,
        render_js: bool = False,
        output_format: str = "markdown",
        params: Optional[dict] = None,
        **_: Any,
    ) -> str:
        """Scrape ``url`` synchronously and return the response body text."""
        core_fields: Dict[str, Any] = {
            "url": url,
            "render_js": render_js,
            "output_format": output_format,
        }
        # Extra params are merged last, so they win on key collisions.
        payload: Dict[str, Any] = {**core_fields, **(params or {})}
        with get_client() as client:
            response = client.post("/v1/scrape", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text

    async def _arun(
        self,
        url: str,
        render_js: bool = False,
        output_format: str = "markdown",
        params: Optional[dict] = None,
        **kwargs: Any,
    ) -> str:
        """Scrape ``url`` asynchronously and return the response body text."""
        core_fields: Dict[str, Any] = {
            "url": url,
            "render_js": render_js,
            "output_format": output_format,
        }
        # Extra params are merged last, so they win on key collisions.
        payload: Dict[str, Any] = {**core_fields, **(params or {})}
        async with get_async_client() as client:
            response = await client.post("/v1/scrape", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text
348
+
349
+
350
class GhostcrawlSearchTool(BaseTool):
    """LangChain tool that runs a web search through ``POST /v1/search``.

    The sync and async entry points send the same JSON payload and return
    the response body text. HTTP error statuses are re-raised as
    ``Ghostcrawl*`` exceptions by ``_handle_error``.
    """

    name: str = "ghostcrawl_search"
    description: str = (
        "Run a web search through ghostcrawl and return normalized results. "
        "Supports multiple search backends: 'brave' (default) or 'tavily'. "
        "SUPPORTED: query (the search string), engine ('brave' | 'tavily'), "
        "limit (1–20 results), country (ISO 3166-1 alpha-2 for regional results). "
        "UNSUPPORTED: image search, news-only verticals (use ghostcrawl_scrape on Google News). "
        "Returns: list of results with url, title, snippet, published_at, and relevance score. "
        "ghostcrawl charges no markup — you pay Brave or Tavily directly via your provider key "
        "passed as X-Provider-Authorization: Bearer <YOUR_KEY>. "
        "Uses POST /v1/search with Authorization: Bearer token auth."
    )
    args_schema: Type[BaseModel] = _SearchInput

    def _run(
        self,
        query: str,
        engine: str = "brave",
        limit: int = 10,
        params: Optional[dict] = None,
        **_: Any,
    ) -> str:
        """Run the search synchronously and return the response body text."""
        core_fields: Dict[str, Any] = {
            "query": query,
            "engine": engine,
            "limit": limit,
        }
        # Extra params are merged last, so they win on key collisions.
        payload: Dict[str, Any] = {**core_fields, **(params or {})}
        with get_client() as client:
            response = client.post("/v1/search", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text

    async def _arun(
        self,
        query: str,
        engine: str = "brave",
        limit: int = 10,
        params: Optional[dict] = None,
        **kwargs: Any,
    ) -> str:
        """Run the search asynchronously and return the response body text."""
        core_fields: Dict[str, Any] = {
            "query": query,
            "engine": engine,
            "limit": limit,
        }
        # Extra params are merged last, so they win on key collisions.
        payload: Dict[str, Any] = {**core_fields, **(params or {})}
        async with get_async_client() as client:
            response = await client.post("/v1/search", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text
410
+
411
+
412
class GhostcrawlGoogleSearchTool(BaseTool):
    """LangChain tool that runs a Google SERP query through ``POST /v1/google/search``.

    The tool arguments ``query`` and ``country`` are sent as the JSON keys
    ``q`` and ``country_code``. The response body text is returned as-is;
    HTTP error statuses are re-raised as ``Ghostcrawl*`` exceptions by
    ``_handle_error``.
    """

    name: str = "ghostcrawl_google_search"
    description: str = (
        "Run a Google SERP query through ghostcrawl and return normalized results. "
        "SUPPORTED: query (search string), country (ISO 3166-1 alpha-2 code for "
        "regional results, e.g. 'us', 'gb', 'de'). "
        "UNSUPPORTED: image search, news verticals, knowledge-panel deep extraction "
        "(use ghostcrawl_scrape on the relevant Google result page). "
        "Returns: list of organic results with url, title, snippet, position; plus "
        "any featured snippet and related-searches metadata. "
        "Differs from ghostcrawl_search (Brave/Tavily) — this hits Google directly "
        "via /v1/google/search and does NOT require a third-party provider key. "
        "Uses POST /v1/google/search with Authorization: Bearer token auth."
    )
    args_schema: Type[BaseModel] = _GoogleSearchInput

    def _run(
        self,
        query: str,
        country: str = "us",
        params: Optional[dict] = None,
        **_: Any,
    ) -> str:
        """Run the query synchronously and return the response body text."""
        with get_client() as client:
            core_fields: Dict[str, Any] = {"q": query, "country_code": country}
            # Extra params are merged last, so they win on key collisions.
            payload: Dict[str, Any] = {**core_fields, **(params or {})}
            response = client.post("/v1/google/search", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text

    async def _arun(
        self,
        query: str,
        country: str = "us",
        params: Optional[dict] = None,
        **kwargs: Any,
    ) -> str:
        """Run the query asynchronously and return the response body text."""
        async with get_async_client() as client:
            core_fields: Dict[str, Any] = {"q": query, "country_code": country}
            # Extra params are merged last, so they win on key collisions.
            payload: Dict[str, Any] = {**core_fields, **(params or {})}
            response = await client.post("/v1/google/search", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text
473
+
474
+
475
class GhostcrawlGoogleHotelsTool(BaseTool):
    """LangChain tool that fetches Google Hotels listings through ``POST /v1/google/hotels``.

    Parity peer of ``GhostcrawlGoogleSearchTool`` (D-01 / D-07a, Plan 140.5-18).
    The tool arguments ``query`` and ``country`` are sent as the JSON keys
    ``q`` and ``country_code``; the remaining arguments keep their names.
    The response body text is returned as-is; HTTP error statuses are
    re-raised as ``Ghostcrawl*`` exceptions by ``_handle_error``.
    """

    name: str = "ghostcrawl_google_hotels"
    description: str = (
        "Fetch Google Hotels (Travel) listings through ghostcrawl. "
        "SUPPORTED: query (hotel search string), check_in / check_out (ISO 8601 "
        "YYYY-MM-DD; check_out must be after check_in), adults (default 2), rooms "
        "(default 1), currency (ISO 4217, default USD), country (ISO 3166-1 alpha-2). "
        "Returns: hotels_results with name, price, total_price, rating, amenities, "
        "booking_providers. HIGH BRITTLENESS (obfuscated Google Travel SPA classes). "
        "Uses POST /v1/google/hotels with Authorization: Bearer token auth."
    )
    args_schema: Type[BaseModel] = _GoogleHotelsInput

    def _run(
        self,
        query: str,
        check_in: str,
        check_out: str,
        adults: int = 2,
        rooms: int = 1,
        currency: str = "USD",
        country: str = "us",
        params: Optional[dict] = None,
        **_: Any,
    ) -> str:
        """Fetch hotel listings synchronously and return the response body text."""
        with get_client() as client:
            core_fields: Dict[str, Any] = {
                "q": query,
                "check_in": check_in,
                "check_out": check_out,
                "adults": adults,
                "rooms": rooms,
                "currency": currency,
                "country_code": country,
            }
            # Extra params are merged last, so they win on key collisions.
            payload: Dict[str, Any] = {**core_fields, **(params or {})}
            response = client.post("/v1/google/hotels", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text

    async def _arun(
        self,
        query: str,
        check_in: str,
        check_out: str,
        adults: int = 2,
        rooms: int = 1,
        currency: str = "USD",
        country: str = "us",
        params: Optional[dict] = None,
        **_: Any,
    ) -> str:
        """Fetch hotel listings asynchronously and return the response body text."""
        async with get_async_client() as client:
            core_fields: Dict[str, Any] = {
                "q": query,
                "check_in": check_in,
                "check_out": check_out,
                "adults": adults,
                "rooms": rooms,
                "currency": currency,
                "country_code": country,
            }
            # Extra params are merged last, so they win on key collisions.
            payload: Dict[str, Any] = {**core_fields, **(params or {})}
            response = await client.post("/v1/google/hotels", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text
553
+
554
+
555
class GhostcrawlGoogleSportsTool(BaseTool):
    """LangChain tool that fetches the Google sports knowledge panel via ``POST /v1/google/sports``.

    Parity peer of ``GhostcrawlGoogleHotelsTool`` (D-01 / D-07b, Plan 140.5-19).
    The tool arguments ``query`` and ``country`` are sent as the JSON keys
    ``q`` and ``country_code``. The response body text is returned as-is;
    HTTP error statuses are re-raised as ``Ghostcrawl*`` exceptions by
    ``_handle_error``.
    """

    name: str = "ghostcrawl_google_sports"
    description: str = (
        "Fetch the Google sports knowledge panel (match summary + standings) through ghostcrawl. "
        "SUPPORTED: query (sports search string, e.g. 'lakers score'), country (ISO 3166-1 "
        "alpha-2, default us). "
        "Returns: SearchResult with extras.sports_results = {match: {home_team, away_team, "
        "scores, status}, standings: [...]}. HIGH BRITTLENESS (Google knowledge-panel classes "
        "drift on deploy cadence). "
        "Uses POST /v1/google/sports with Authorization: Bearer token auth."
    )
    args_schema: Type[BaseModel] = _GoogleSportsInput

    def _run(
        self,
        query: str,
        country: str = "us",
        params: Optional[dict] = None,
        **_: Any,
    ) -> str:
        """Fetch the sports panel synchronously and return the response body text."""
        with get_client() as client:
            core_fields: Dict[str, Any] = {"q": query, "country_code": country}
            # Extra params are merged last, so they win on key collisions.
            payload: Dict[str, Any] = {**core_fields, **(params or {})}
            response = client.post("/v1/google/sports", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text

    async def _arun(
        self,
        query: str,
        country: str = "us",
        params: Optional[dict] = None,
        **_: Any,
    ) -> str:
        """Fetch the sports panel asynchronously and return the response body text."""
        async with get_async_client() as client:
            core_fields: Dict[str, Any] = {"q": query, "country_code": country}
            # Extra params are merged last, so they win on key collisions.
            payload: Dict[str, Any] = {**core_fields, **(params or {})}
            response = await client.post("/v1/google/sports", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text
613
+
614
+
615
class GhostcrawlExtractTool(BaseTool):
    """LangChain tool that extracts structured data through ``POST /v1/extract``.

    The schema may arrive as either ``schema_`` or ``schema``; the first
    non-empty one is sent under the JSON key ``schema``, and the key is left
    out entirely when both are empty. The response body text is returned
    as-is; HTTP error statuses are re-raised as ``Ghostcrawl*`` exceptions by
    ``_handle_error``.
    """

    name: str = "ghostcrawl_extract"
    description: str = (
        "Extract structured data from a URL by providing a JSON Schema. "
        "ghostcrawl fetches the page through its stealth fleet, then uses the schema "
        "to identify and return matching fields as a structured JSON object. "
        "SUPPORTED: url (the target page), schema (JSON Schema object describing the fields "
        "to extract — e.g. product title, price, reviews), render_js (bool), "
        "session_id (existing session for stateful extraction). "
        "UNSUPPORTED: extracting from binary files (PDF/DOCX) — convert to HTML first. "
        "Returns: data dict keyed by the schema's property names, plus metadata. "
        "Use this tool instead of ghostcrawl_scrape when you need structured output "
        "rather than raw page content. "
        "Uses POST /v1/extract with Authorization: Bearer token auth."
    )
    args_schema: Type[BaseModel] = _ExtractInput

    def _run(
        self,
        url: str,
        schema_: Optional[dict] = None,
        schema: Optional[dict] = None,
        params: Optional[dict] = None,
        **_: Any,
    ) -> str:
        """Run the extraction synchronously and return the response body text."""
        chosen_schema = schema_ or schema or {}
        core_fields: Dict[str, Any] = {"url": url}
        if chosen_schema:
            # Only send a schema when one was actually provided.
            core_fields["schema"] = chosen_schema
        # Extra params are merged last, so they win on key collisions.
        payload: Dict[str, Any] = {**core_fields, **(params or {})}
        with get_client() as client:
            response = client.post("/v1/extract", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text

    async def _arun(
        self,
        url: str,
        schema_: Optional[dict] = None,
        schema: Optional[dict] = None,
        params: Optional[dict] = None,
        **kwargs: Any,
    ) -> str:
        """Run the extraction asynchronously and return the response body text."""
        chosen_schema = schema_ or schema or {}
        core_fields: Dict[str, Any] = {"url": url}
        if chosen_schema:
            # Only send a schema when one was actually provided.
            core_fields["schema"] = chosen_schema
        # Extra params are merged last, so they win on key collisions.
        payload: Dict[str, Any] = {**core_fields, **(params or {})}
        async with get_async_client() as client:
            response = await client.post("/v1/extract", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text
677
+
678
+
679
class GhostcrawlCrawlTool(BaseTool):
    """LangChain tool that crawls from a seed URL through ``POST /v1/crawl``.

    The sync and async entry points send the same JSON payload and return
    the response body text. HTTP error statuses are re-raised as
    ``Ghostcrawl*`` exceptions by ``_handle_error``.
    """

    name: str = "ghostcrawl_crawl"
    description: str = (
        "Crawl a website starting from a seed URL and return content from multiple pages. "
        "ghostcrawl follows links using its stealth fleet, respecting same-origin and "
        "max_pages bounds to avoid unbounded crawls. "
        "SUPPORTED: url (seed URL), max_pages (1–500, default 10), same_origin (bool — "
        "restrict to same scheme+host+port, default true), render_js (bool for JS-heavy sites). "
        "UNSUPPORTED: authenticated crawls requiring session cookies (use ghostcrawl_scrape "
        "in a loop with session_id for authenticated pages). "
        "Returns: list of crawled pages with url, content, title, and status_code per page. "
        "QUOTA NOTE: each page counts as one ghostcrawl credit. Check ghostcrawl_usage "
        "before launching large crawl jobs. "
        "Uses POST /v1/crawl with Authorization: Bearer token auth."
    )
    args_schema: Type[BaseModel] = _CrawlInput

    def _run(
        self,
        url: str,
        max_pages: int = 10,
        same_origin: bool = True,
        params: Optional[dict] = None,
        **_: Any,
    ) -> str:
        """Start the crawl synchronously and return the response body text."""
        core_fields: Dict[str, Any] = {
            "url": url,
            "max_pages": max_pages,
            "same_origin": same_origin,
        }
        # Extra params are merged last, so they win on key collisions.
        payload: Dict[str, Any] = {**core_fields, **(params or {})}
        with get_client() as client:
            response = client.post("/v1/crawl", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text

    async def _arun(
        self,
        url: str,
        max_pages: int = 10,
        same_origin: bool = True,
        params: Optional[dict] = None,
        **kwargs: Any,
    ) -> str:
        """Start the crawl asynchronously and return the response body text."""
        core_fields: Dict[str, Any] = {
            "url": url,
            "max_pages": max_pages,
            "same_origin": same_origin,
        }
        # Extra params are merged last, so they win on key collisions.
        payload: Dict[str, Any] = {**core_fields, **(params or {})}
        async with get_async_client() as client:
            response = await client.post("/v1/crawl", json=payload)
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text
741
+
742
+
743
class GhostcrawlUsageTool(BaseTool):
    """LangChain tool that reads quota and usage through ``GET /v1/usage``.

    Takes no arguments. The response body text is returned as-is; HTTP error
    statuses are re-raised as ``Ghostcrawl*`` exceptions by ``_handle_error``.
    """

    name: str = "ghostcrawl_usage"
    description: str = (
        "Check your ghostcrawl API quota and current usage statistics. "
        "Use this tool BEFORE launching large crawl or scrape jobs to confirm "
        "you have sufficient credit headroom. Also useful for cost-control loops "
        "that should pause when credits drop below a threshold. "
        "SUPPORTED: no input parameters required — returns current period stats. "
        "Returns: JSON object with requests_used, requests_quota, credits_remaining, "
        "and period_end_iso (ISO-8601 timestamp of current billing period end). "
        "Uses GET /v1/usage with Authorization: Bearer token auth. "
        "For a metered-dim-only breakdown, the billing endpoint "
        "GET /v1/billing/usage?metered_only=true (D-05e item 10) narrows to "
        "billable dims (request, identity_served, proxy_bytes)."
    )
    args_schema: Type[BaseModel] = _EmptyInput

    def _run(self, **_: Any) -> str:
        """Fetch usage synchronously and return the response body text."""
        with get_client() as client:
            response = client.get("/v1/usage")
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text

    async def _arun(self, **_: Any) -> str:
        """Fetch usage asynchronously and return the response body text."""
        async with get_async_client() as client:
            response = await client.get("/v1/usage")
            try:
                response.raise_for_status()
            except httpx.HTTPStatusError as err:
                _handle_error(err)
            return response.text
@@ -0,0 +1,42 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "ghostcrawl-langchain"
7
+ version = "2.1.0"
8
+ description = "LangChain tool wrappers for the ghostcrawl SaaS stealth-scraping API."
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ license-files = ["LICENSE"]
12
+ requires-python = ">=3.9"
13
+ dependencies = [
14
+ "langchain-core>=0.3",
15
+ "pydantic>=2.7",
16
+ "httpx>=0.27",
17
+ ]
18
+
19
+ [project.optional-dependencies]
20
+ dev = [
21
+ "pytest>=8",
22
+ "respx>=0.21",
23
+ ]
24
+
25
+ [tool.hatch.build.targets.wheel]
26
+ packages = ["ghostcrawl_langchain"]
27
+
28
+ [tool.hatch.build.targets.sdist]
29
+ include = [
30
+ "ghostcrawl_langchain",
31
+ "README.md",
32
+ "LICENSE",
33
+ "pyproject.toml",
34
+ ]
35
+ exclude = [
36
+ ".gitignore",
37
+ "tests",
38
+ ]
39
+ ignore-vcs = true
40
+
41
+ [tool.pytest.ini_options]
42
+ testpaths = ["tests"]