PyPI - ghostcrawl-langchain - Versions diffs - 2.1.0__tar.gz - Mend

ghostcrawl-langchain 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

ghostcrawl_langchain-2.1.0/.gitignore +6 -0
ghostcrawl_langchain-2.1.0/LICENSE +21 -0
ghostcrawl_langchain-2.1.0/PKG-INFO +33 -0
ghostcrawl_langchain-2.1.0/README.md +18 -0
ghostcrawl_langchain-2.1.0/ghostcrawl_langchain/__init__.py +34 -0
ghostcrawl_langchain-2.1.0/ghostcrawl_langchain/_client.py +84 -0
ghostcrawl_langchain-2.1.0/ghostcrawl_langchain/errors.py +33 -0
ghostcrawl_langchain-2.1.0/ghostcrawl_langchain/tools.py +778 -0
ghostcrawl_langchain-2.1.0/pyproject.toml +42 -0

ghostcrawl_langchain-2.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,6 @@
+__pycache__/
+*.pyc
+dist/
+build/
+*.egg-info/
+.pytest_cache/

ghostcrawl_langchain-2.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 GhostCrawl
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

ghostcrawl_langchain-2.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,33 @@
+Metadata-Version: 2.4
+Name: ghostcrawl-langchain
+Version: 2.1.0
+Summary: LangChain tool wrappers for the ghostcrawl SaaS stealth-scraping API.
+License-Expression: MIT
+License-File: LICENSE
+Requires-Python: >=3.9
+Requires-Dist: httpx>=0.27
+Requires-Dist: langchain-core>=0.3
+Requires-Dist: pydantic>=2.7
+Provides-Extra: dev
+Requires-Dist: pytest>=8; extra == 'dev'
+Requires-Dist: respx>=0.21; extra == 'dev'
+Description-Content-Type: text/markdown
+# ghostcrawl-langchain
+LangChain tool integration for the [ghostcrawl](https://pypi.org/project/ghostcrawl/)
+stealth-scraping API.
+Exposes the ghostcrawl scrape / search / extract / crawl / Google-vertical
+surfaces as LangChain `BaseTool` subclasses with shared Pydantic params
+validation.
+## Install
+```bash
+pip install ghostcrawl-langchain
+```
+## License
+MIT

ghostcrawl_langchain-2.1.0/README.md ADDED Viewed

@@ -0,0 +1,18 @@
+# ghostcrawl-langchain
+LangChain tool integration for the [ghostcrawl](https://pypi.org/project/ghostcrawl/)
+stealth-scraping API.
+Exposes the ghostcrawl scrape / search / extract / crawl / Google-vertical
+surfaces as LangChain `BaseTool` subclasses with shared Pydantic params
+validation.
+## Install
+```bash
+pip install ghostcrawl-langchain
+```
+## License
+MIT

ghostcrawl_langchain-2.1.0/ghostcrawl_langchain/__init__.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""ghostcrawl-langchain — LangChain tool wrappers for the GhostCrawl SaaS API."""
+from .errors import (
+    GhostcrawlAuthError,
+    GhostcrawlError,
+    GhostcrawlQuotaError,
+    GhostcrawlRateLimitError,
+    GhostcrawlServerError,
+)
+from .tools import (
+    GhostcrawlCrawlTool,
+    GhostcrawlExtractTool,
+    GhostcrawlGoogleHotelsTool,
+    GhostcrawlGoogleSearchTool,
+    GhostcrawlGoogleSportsTool,
+    GhostcrawlScrapeTool,
+    GhostcrawlSearchTool,
+    GhostcrawlUsageTool,
+)
+__all__ = [
+    "GhostcrawlScrapeTool",
+    "GhostcrawlSearchTool",
+    "GhostcrawlGoogleSearchTool",
+    "GhostcrawlGoogleHotelsTool",
+    "GhostcrawlGoogleSportsTool",
+    "GhostcrawlExtractTool",
+    "GhostcrawlCrawlTool",
+    "GhostcrawlUsageTool",
+    "GhostcrawlError",
+    "GhostcrawlAuthError",
+    "GhostcrawlQuotaError",
+    "GhostcrawlRateLimitError",
+    "GhostcrawlServerError",
+]

ghostcrawl_langchain-2.1.0/ghostcrawl_langchain/_client.py ADDED Viewed

@@ -0,0 +1,84 @@
+"""Shared httpx client factory for ghostcrawl-langchain tools."""
+from __future__ import annotations
+import os
+import platform
+import sys
+from importlib.metadata import PackageNotFoundError, version
+import httpx
+def _ua() -> str:
+    try:
+        v = version("ghostcrawl-langchain")
+    except PackageNotFoundError:
+        v = "0.0.0"
+    return (
+        f"ghostcrawl-langchain/{v} "
+        f"Python/{sys.version_info.major}.{sys.version_info.minor} "
+        f"{platform.system()}"
+    )
+def get_client(
+    api_key: str | None = None,
+    base_url: str | None = None,
+) -> httpx.Client:
+    """Return a configured httpx.Client with Bearer auth and branded User-Agent.
+    Auth is read from the ``api_key`` argument first, then the
+    ``GHOSTCRAWL_API_KEY`` environment variable.  Raises ``ValueError`` if
+    neither is set.
+    The ``base_url`` argument overrides ``GHOSTCRAWL_BASE_URL`` (defaults to
+    ``https://api.ghostcrawl.io``).
+    """
+    key = (api_key or os.environ.get("GHOSTCRAWL_API_KEY", "")).strip()
+    if not key:
+        raise ValueError(
+            "ghostcrawl-langchain: GHOSTCRAWL_API_KEY is not set. "
+            "Set the GHOSTCRAWL_API_KEY environment variable or pass api_key to get_client()."
+        )
+    resolved_base = (
+        base_url
+        or os.environ.get("GHOSTCRAWL_BASE_URL", "https://api.ghostcrawl.io")
+    ).rstrip("/")
+    return httpx.Client(
+        base_url=resolved_base,
+        headers={
+            "Authorization": f"Bearer {key}",
+            "User-Agent": _ua(),
+            "Content-Type": "application/json",
+        },
+        timeout=60.0,
+    )
+def get_async_client(
+    api_key: str | None = None,
+    base_url: str | None = None,
+) -> httpx.AsyncClient:
+    """Return a configured ``httpx.AsyncClient`` mirroring :func:`get_client`.
+    Used by LangChain BaseTool ``_arun`` overrides so the async path does not
+    block the event loop on synchronous I/O.
+    """
+    key = (api_key or os.environ.get("GHOSTCRAWL_API_KEY", "")).strip()
+    if not key:
+        raise ValueError(
+            "ghostcrawl-langchain: GHOSTCRAWL_API_KEY is not set. "
+            "Set the GHOSTCRAWL_API_KEY environment variable or pass api_key to get_async_client()."
+        )
+    resolved_base = (
+        base_url
+        or os.environ.get("GHOSTCRAWL_BASE_URL", "https://api.ghostcrawl.io")
+    ).rstrip("/")
+    return httpx.AsyncClient(
+        base_url=resolved_base,
+        headers={
+            "Authorization": f"Bearer {key}",
+            "User-Agent": _ua(),
+            "Content-Type": "application/json",
+        },
+        timeout=60.0,
+    )

ghostcrawl_langchain-2.1.0/ghostcrawl_langchain/errors.py ADDED Viewed

@@ -0,0 +1,33 @@
+"""ghostcrawl-langchain error classes.
+This is the canonical Python status_code → exception class mapping; the Node SDK
+(sdks/node/src/errors.ts) and the other official SDK surfaces are kept in lockstep
+with it.
+Third-party attribution: see sdks/python-langchain/LICENSE-NOTICE.md.
+"""
+from __future__ import annotations
+class GhostcrawlError(Exception):
+    """Base class for all ghostcrawl-langchain errors.
+    The status-specific error classes in this module all derive from it, so
+    catching this type also catches each of them.
+    """
+class GhostcrawlAuthError(GhostcrawlError):
+    """401 — Invalid API key or missing Authorization header.
+    The message passed by the tools' error mapping includes the response body.
+    """
+class GhostcrawlQuotaError(GhostcrawlError):
+    """402 — Quota exceeded; top up at https://ghostcrawl.io/billing.
+    The message passed by the tools' error mapping includes the response body.
+    """
+class GhostcrawlRateLimitError(GhostcrawlError):
+    """429 — Rate limited; check retry_after attribute (seconds)."""
+    def __init__(self, message: str, retry_after: int | None = None):
+        super().__init__(message)
+        self.retry_after = retry_after
+class GhostcrawlServerError(GhostcrawlError):
+    """5xx — Retryable server-side error.
+    The tools' error mapping raises it for any status from 500 through 599.
+    """

ghostcrawl_langchain-2.1.0/ghostcrawl_langchain/tools.py ADDED Viewed

@@ -0,0 +1,778 @@
+"""LangChain BaseTool subclasses for GhostCrawl REST endpoints.
+Auth: Authorization: Bearer <token>.
+- _handle_error maps httpx.HTTPStatusError to Ghostcrawl* domain subclasses.
+- All _run() bodies wrap the httpx call in try/except -> _handle_error.
+- All _arun() bodies use httpx.AsyncClient.
+"""
+from __future__ import annotations
+import ast
+import json
+from typing import Any, Dict, NoReturn, Optional, Type
+from urllib.parse import parse_qs
+import httpx
+from langchain_core.tools import BaseTool
+from pydantic import BaseModel, Field, field_validator
+from ._client import get_async_client, get_client
+from .errors import (
+    GhostcrawlAuthError,
+    GhostcrawlError,
+    GhostcrawlQuotaError,
+    GhostcrawlRateLimitError,
+    GhostcrawlServerError,
+)
+# ---------------------------------------------------------------------------
+# Error mapping
+# ---------------------------------------------------------------------------
+def _handle_error(exc: httpx.HTTPStatusError) -> NoReturn:
+    """Map an ``httpx.HTTPStatusError`` to the right ``Ghostcrawl*`` subclass.
+    Mapping is kept lock-step across the official SDKs (see ``errors.py`` — the
+    canonical Python mapping, and ``sdks/node/src/errors.ts``):
+    401 → auth, 402 → quota, 429 → rate-limit (with ``retry_after``),
+    5xx → retryable server error, anything else → generic ``GhostcrawlError``.
+    """
+    status = exc.response.status_code
+    body = exc.response.text
+    if status == 401:
+        raise GhostcrawlAuthError(f"401 Unauthorized: {body}") from exc
+    if status == 402:
+        raise GhostcrawlQuotaError(f"402 Payment Required: {body}") from exc
+    if status == 429:
+        retry_after_raw = exc.response.headers.get("Retry-After")
+        try:
+            retry_after = int(retry_after_raw) if retry_after_raw is not None else None
+        except (TypeError, ValueError):
+            retry_after = None
+        raise GhostcrawlRateLimitError(
+            f"429 Too Many Requests: {body}", retry_after=retry_after
+        ) from exc
+    if 500 <= status < 600:
+        raise GhostcrawlServerError(f"{status} Server Error: {body}") from exc
+    raise GhostcrawlError(f"{status} HTTP error: {body}") from exc
+# ---------------------------------------------------------------------------
+# Input schemas
+# ---------------------------------------------------------------------------
+def _coerce_scalar(raw: Any) -> Any:
+    """Narrow a URL ``k=v`` value to bool/int when it matches exactly.
+    Pure value-narrowing — no ``eval``, no attribute access. Lists (repeated
+    keys) and anything that is not an exact true/false/digit-string pass
+    through unchanged as inert values.
+    """
+    if isinstance(raw, list):
+        return raw
+    if isinstance(raw, str):
+        low = raw.lower()
+        if low == "true":
+            return True
+        if low == "false":
+            return False
+        if raw.isdigit():
+            return int(raw)
+    return raw
+class _GhostcrawlBaseInput(BaseModel):
+    """Shared base carrying the single 3-stage ``params`` coercion validator.
+    Technique adapted from the Bee ``str_to_dict_validator`` reference
+    (cited by path in 140.5-13-SUMMARY.md), rewritten ghostcrawl-style and
+    forward-merged into ONE base class so every params-bearing tool input
+    inherits the same battle-tested coercion — replacing the five independent
+    JSON-only ``_coerce_params`` validators (D-05a / D-09 / D-10).
+    Stage 2 uses ``ast.literal_eval`` which is literal-only (dict/list/str/
+    num/bool/None): it never executes code, calls functions, or imports. A
+    call-expression string is rejected by the literal stage and (lacking
+    ``=``) falls through stage 3 to ``{}`` — never executing attacker input.
+    The validator is declared ``check_fields=False`` so this base (and any
+    subclass without a ``params`` field, e.g. ``_EmptyInput``) imports without
+    a PydanticUserError.
+    """
+    @field_validator("params", mode="before", check_fields=False)
+    @classmethod
+    def _coerce_params(cls, v: Any) -> dict:
+        if isinstance(v, dict):
+            return v
+        if not isinstance(v, str):
+            return v or {}
+        s = v.strip()
+        if not s:
+            return {}
+        # Stage 1 — JSON object string ({"render_js": true}).
+        try:
+            parsed = json.loads(s)
+            if isinstance(parsed, dict):
+                return parsed
+        except (json.JSONDecodeError, ValueError):
+            pass
+        # Stage 2 — Python-literal dict string ({'render_js': True}).
+        # literal-only via ast.literal_eval: no code execution / no imports.
+        try:
+            literal = ast.literal_eval(s)
+            if isinstance(literal, dict):
+                return literal
+        except (ValueError, SyntaxError, TypeError):
+            pass
+        # Stage 3 — URL k=v string (screenshot=true&wait=3000).
+        if "=" in s:
+            parsed_qs = parse_qs(s, keep_blank_values=True)
+            return {
+                key: _coerce_scalar(vals[0] if len(vals) == 1 else vals)
+                for key, vals in parsed_qs.items()
+            }
+        return {}
+class _ScrapeInput(_GhostcrawlBaseInput):
+    url: str = Field(..., description="URL to scrape (http/https).")
+    render_js: bool = Field(
+        False,
+        description="Execute JavaScript before extraction. Required for SPAs and lazy-loaded content.",
+    )
+    output_format: str = Field(
+        "markdown",
+        description="Content format to return: 'html' returns raw HTML; 'markdown' returns cleaned Markdown.",
+    )
+    params: dict = Field(
+        default_factory=dict,
+        description=(
+            "Extra /v1/scrape parameters as a dict or JSON string. "
+            "Supported keys: wait_for (CSS selector), extract_schema (JSON schema object), "
+            "country (ISO 3166-1 alpha-2), screenshot (bool), full_page (bool), "
+            "screenshot_selector (CSS selector for element-scoped capture)."
+        ),
+    )
+class _SearchInput(_GhostcrawlBaseInput):
+    query: str = Field(..., description="Search query string.")
+    engine: str = Field(
+        "brave",
+        description="Search engine: 'brave' or 'tavily'. Requires your own provider API key via X-Provider-Authorization.",
+    )
+    limit: int = Field(
+        10,
+        ge=1,
+        le=20,
+        description="Maximum number of results to return (1–20).",
+    )
+    params: dict = Field(
+        default_factory=dict,
+        description="Extra /v1/search parameters as a dict or JSON string.",
+    )
+class _ExtractInput(_GhostcrawlBaseInput):
+    url: str = Field(..., description="URL to extract structured data from (http/https).")
+    schema_: dict = Field(
+        default_factory=dict,
+        description=(
+            "JSON Schema object describing the fields to extract. "
+            "Example: {\"type\": \"object\", \"properties\": {\"title\": {\"type\": \"string\"}}}. "
+            "When omitted, ghostcrawl returns the full page content."
+        ),
+    )
+    params: dict = Field(
+        default_factory=dict,
+        description="Extra /v1/extract parameters as a dict or JSON string.",
+    )
+    @field_validator("schema_", mode="before")
+    @classmethod
+    def _coerce_schema(cls, v: Any) -> dict:
+        if isinstance(v, str):
+            try:
+                return json.loads(v)
+            except (json.JSONDecodeError, ValueError) as e:
+                raise ValueError(f"schema must be valid JSON: {e}") from e
+        return v or {}
+class _CrawlInput(_GhostcrawlBaseInput):
+    url: str = Field(..., description="Seed URL to start crawling from (http/https).")
+    max_pages: int = Field(
+        10,
+        ge=1,
+        le=500,
+        description="Maximum number of pages to crawl from the seed URL (1–500).",
+    )
+    same_origin: bool = Field(
+        True,
+        description="Restrict crawl to the same origin (scheme + host + port). Disable to follow cross-origin links.",
+    )
+    params: dict = Field(
+        default_factory=dict,
+        description="Extra /v1/crawl parameters as a dict or JSON string.",
+    )
+class _GoogleSearchInput(_GhostcrawlBaseInput):
+    query: str = Field(..., description="Google search query string.")
+    country: str = Field(
+        "us",
+        description="ISO 3166-1 alpha-2 country code for regional Google SERP results.",
+    )
+    params: dict = Field(
+        default_factory=dict,
+        description="Extra /v1/google/search parameters as a dict or JSON string.",
+    )
+class _GoogleHotelsInput(_GhostcrawlBaseInput):
+    """Args for the Google Hotels SERP tool (Plan 140.5-18 / D-07a / D-01 parity).
+    Inherits ``_GhostcrawlBaseInput`` (D-05a / plan 140.5-13) so the shared
+    3-stage ``params`` coercion validator applies — no per-class validator.
+    """
+    query: str = Field(..., description="Hotel search query, e.g. 'hotels in san francisco'.")
+    check_in: str = Field(..., description="Check-in date, ISO 8601 (YYYY-MM-DD).")
+    check_out: str = Field(..., description="Check-out date, ISO 8601 (YYYY-MM-DD).")
+    adults: int = Field(2, description="Number of adult guests (default 2).")
+    rooms: int = Field(1, description="Number of rooms (default 1).")
+    currency: str = Field("USD", description="ISO 4217 currency code (default USD).")
+    country: str = Field("us", description="ISO 3166-1 alpha-2 region code (default us).")
+    params: dict = Field(
+        default_factory=dict,
+        description="Extra /v1/google/hotels parameters as a dict or JSON string.",
+    )
+class _GoogleSportsInput(_GhostcrawlBaseInput):
+    """Args for the Google Sports SERP tool (Plan 140.5-19 / D-07b / D-01 parity).
+    Inherits ``_GhostcrawlBaseInput`` (D-05a / plan 140.5-13) so the shared
+    3-stage ``params`` coercion validator applies — no per-class validator.
+    """
+    query: str = Field(..., description="Sports query, e.g. 'lakers score' or 'premier league'.")
+    country: str = Field("us", description="ISO 3166-1 alpha-2 region code (default us).")
+    params: dict = Field(
+        default_factory=dict,
+        description="Extra /v1/google/sports parameters as a dict or JSON string.",
+    )
+class _EmptyInput(BaseModel):
+    """Empty args schema for tools that take no parameters.
+    Stays on bare ``BaseModel`` (NOT ``_GhostcrawlBaseInput``): it carries no
+    ``params`` field, so inheriting the validator buys nothing. Leaving it on
+    ``BaseModel`` keeps the empty-tool args schema minimal and decoupled.
+    """
+    # NOTE(review): pydantic surfaces a model's docstring as the description in
+    # its generated JSON schema — confirm before rewording the text above.
+# ---------------------------------------------------------------------------
+# BaseTool subclasses
+# ---------------------------------------------------------------------------
+class GhostcrawlScrapeTool(BaseTool):
+    """LangChain tool: scrape a single URL via ghostcrawl's stealth browser fleet."""
+    name: str = "ghostcrawl_scrape"
+    description: str = (
+        "Scrape a webpage using ghostcrawl's stealth browser fleet and return the content "
+        "as HTML or Markdown. Suitable for news articles, landing pages, product pages, "
+        "and any publicly accessible URL. "
+        "SUPPORTED: render_js (bool — execute JavaScript before extraction), "
+        "output_format ('html' | 'markdown'), wait_for (CSS selector to await), "
+        "extract_schema (JSON schema for structured extraction), country (ISO 3166-1 alpha-2 "
+        "for geo-targeted content). "
+        "UNSUPPORTED: CAPTCHA solving, multi-step login sessions (use ghostcrawl_crawl with "
+        "agent-mode for session-based flows). "
+        "Returns: page content as HTML or Markdown plus metadata including title and HTTP status_code. "
+        "Uses POST /v1/scrape with Authorization: Bearer token auth."
+    )
+    args_schema: Type[BaseModel] = _ScrapeInput
+    def _run(
+        self,
+        url: str,
+        render_js: bool = False,
+        output_format: str = "markdown",
+        params: Optional[dict] = None,
+        **_: Any,
+    ) -> str:
+        body: Dict[str, Any] = {
+            "url": url,
+            "render_js": render_js,
+            "output_format": output_format,
+            **(params or {}),
+        }
+        with get_client() as c:
+            r = c.post("/v1/scrape", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+    async def _arun(
+        self,
+        url: str,
+        render_js: bool = False,
+        output_format: str = "markdown",
+        params: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> str:
+        body: Dict[str, Any] = {
+            "url": url,
+            "render_js": render_js,
+            "output_format": output_format,
+            **(params or {}),
+        }
+        async with get_async_client() as c:
+            r = await c.post("/v1/scrape", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+class GhostcrawlSearchTool(BaseTool):
+    """LangChain tool: run a web search via Brave or Tavily through ghostcrawl."""
+    name: str = "ghostcrawl_search"
+    description: str = (
+        "Run a web search through ghostcrawl and return normalized results. "
+        "Supports multiple search backends: 'brave' (default) or 'tavily'. "
+        "SUPPORTED: query (the search string), engine ('brave' | 'tavily'), "
+        "limit (1–20 results), country (ISO 3166-1 alpha-2 for regional results). "
+        "UNSUPPORTED: image search, news-only verticals (use ghostcrawl_scrape on Google News). "
+        "Returns: list of results with url, title, snippet, published_at, and relevance score. "
+        "ghostcrawl charges no markup — you pay Brave or Tavily directly via your provider key "
+        "passed as X-Provider-Authorization: Bearer <YOUR_KEY>. "
+        "Uses POST /v1/search with Authorization: Bearer token auth."
+    )
+    args_schema: Type[BaseModel] = _SearchInput
+    def _run(
+        self,
+        query: str,
+        engine: str = "brave",
+        limit: int = 10,
+        params: Optional[dict] = None,
+        **_: Any,
+    ) -> str:
+        body: Dict[str, Any] = {
+            "query": query,
+            "engine": engine,
+            "limit": limit,
+            **(params or {}),
+        }
+        with get_client() as c:
+            r = c.post("/v1/search", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+    async def _arun(
+        self,
+        query: str,
+        engine: str = "brave",
+        limit: int = 10,
+        params: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> str:
+        body: Dict[str, Any] = {
+            "query": query,
+            "engine": engine,
+            "limit": limit,
+            **(params or {}),
+        }
+        async with get_async_client() as c:
+            r = await c.post("/v1/search", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+class GhostcrawlGoogleSearchTool(BaseTool):
+    """LangChain tool: run a Google SERP query via ghostcrawl's /v1/google/search.
+    Lets LangChain users reach the Google SERP surface without dropping to
+    raw HTTP.
+    """
+    name: str = "ghostcrawl_google_search"
+    description: str = (
+        "Run a Google SERP query through ghostcrawl and return normalized results. "
+        "SUPPORTED: query (search string), country (ISO 3166-1 alpha-2 code for "
+        "regional results, e.g. 'us', 'gb', 'de'). "
+        "UNSUPPORTED: image search, news verticals, knowledge-panel deep extraction "
+        "(use ghostcrawl_scrape on the relevant Google result page). "
+        "Returns: list of organic results with url, title, snippet, position; plus "
+        "any featured snippet and related-searches metadata. "
+        "Differs from ghostcrawl_search (Brave/Tavily) — this hits Google directly "
+        "via /v1/google/search and does NOT require a third-party provider key. "
+        "Uses POST /v1/google/search with Authorization: Bearer token auth."
+    )
+    args_schema: Type[BaseModel] = _GoogleSearchInput
+    def _run(
+        self,
+        query: str,
+        country: str = "us",
+        params: Optional[dict] = None,
+        **_: Any,
+    ) -> str:
+        with get_client() as c:
+            body: Dict[str, Any] = {
+                "q": query,
+                "country_code": country,
+                **(params or {}),
+            }
+            r = c.post("/v1/google/search", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+    async def _arun(
+        self,
+        query: str,
+        country: str = "us",
+        params: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> str:
+        async with get_async_client() as c:
+            body: Dict[str, Any] = {
+                "q": query,
+                "country_code": country,
+                **(params or {}),
+            }
+            r = await c.post("/v1/google/search", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+class GhostcrawlGoogleHotelsTool(BaseTool):
+    """LangChain tool: fetch Google Hotels listings via ghostcrawl's /v1/google/hotels.
+    Parity peer of ``GhostcrawlGoogleSearchTool`` (D-01 / D-07a, Plan 140.5-18).
+    Reaches the Google Travel (hotels) SERP without dropping to raw HTTP.
+    """
+    name: str = "ghostcrawl_google_hotels"
+    description: str = (
+        "Fetch Google Hotels (Travel) listings through ghostcrawl. "
+        "SUPPORTED: query (hotel search string), check_in / check_out (ISO 8601 "
+        "YYYY-MM-DD; check_out must be after check_in), adults (default 2), rooms "
+        "(default 1), currency (ISO 4217, default USD), country (ISO 3166-1 alpha-2). "
+        "Returns: hotels_results with name, price, total_price, rating, amenities, "
+        "booking_providers. HIGH BRITTLENESS (obfuscated Google Travel SPA classes). "
+        "Uses POST /v1/google/hotels with Authorization: Bearer token auth."
+    )
+    args_schema: Type[BaseModel] = _GoogleHotelsInput
+    def _run(
+        self,
+        query: str,
+        check_in: str,
+        check_out: str,
+        adults: int = 2,
+        rooms: int = 1,
+        currency: str = "USD",
+        country: str = "us",
+        params: Optional[dict] = None,
+        **_: Any,
+    ) -> str:
+        with get_client() as c:
+            body: Dict[str, Any] = {
+                "q": query,
+                "check_in": check_in,
+                "check_out": check_out,
+                "adults": adults,
+                "rooms": rooms,
+                "currency": currency,
+                "country_code": country,
+                **(params or {}),
+            }
+            r = c.post("/v1/google/hotels", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+    async def _arun(
+        self,
+        query: str,
+        check_in: str,
+        check_out: str,
+        adults: int = 2,
+        rooms: int = 1,
+        currency: str = "USD",
+        country: str = "us",
+        params: Optional[dict] = None,
+        **_: Any,
+    ) -> str:
+        async with get_async_client() as c:
+            body: Dict[str, Any] = {
+                "q": query,
+                "check_in": check_in,
+                "check_out": check_out,
+                "adults": adults,
+                "rooms": rooms,
+                "currency": currency,
+                "country_code": country,
+                **(params or {}),
+            }
+            r = await c.post("/v1/google/hotels", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+class GhostcrawlGoogleSportsTool(BaseTool):
+    """LangChain tool: fetch the Google sports knowledge panel via /v1/google/sports.
+    Parity peer of ``GhostcrawlGoogleHotelsTool`` (D-01 / D-07b, Plan 140.5-19).
+    Reaches the Google sports SERP knowledge panel without dropping to raw HTTP.
+    """
+    name: str = "ghostcrawl_google_sports"
+    description: str = (
+        "Fetch the Google sports knowledge panel (match summary + standings) through ghostcrawl. "
+        "SUPPORTED: query (sports search string, e.g. 'lakers score'), country (ISO 3166-1 "
+        "alpha-2, default us). "
+        "Returns: SearchResult with extras.sports_results = {match: {home_team, away_team, "
+        "scores, status}, standings: [...]}. HIGH BRITTLENESS (Google knowledge-panel classes "
+        "drift on deploy cadence). "
+        "Uses POST /v1/google/sports with Authorization: Bearer token auth."
+    )
+    args_schema: Type[BaseModel] = _GoogleSportsInput
+    def _run(
+        self,
+        query: str,
+        country: str = "us",
+        params: Optional[dict] = None,
+        **_: Any,
+    ) -> str:
+        with get_client() as c:
+            body: Dict[str, Any] = {
+                "q": query,
+                "country_code": country,
+                **(params or {}),
+            }
+            r = c.post("/v1/google/sports", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+    async def _arun(
+        self,
+        query: str,
+        country: str = "us",
+        params: Optional[dict] = None,
+        **_: Any,
+    ) -> str:
+        async with get_async_client() as c:
+            body: Dict[str, Any] = {
+                "q": query,
+                "country_code": country,
+                **(params or {}),
+            }
+            r = await c.post("/v1/google/sports", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+class GhostcrawlExtractTool(BaseTool):
+    """LangChain tool: extract structured data from a URL via ghostcrawl."""
+    name: str = "ghostcrawl_extract"
+    description: str = (
+        "Extract structured data from a URL by providing a JSON Schema. "
+        "ghostcrawl fetches the page through its stealth fleet, then uses the schema "
+        "to identify and return matching fields as a structured JSON object. "
+        "SUPPORTED: url (the target page), schema (JSON Schema object describing the fields "
+        "to extract — e.g. product title, price, reviews), render_js (bool), "
+        "session_id (existing session for stateful extraction). "
+        "UNSUPPORTED: extracting from binary files (PDF/DOCX) — convert to HTML first. "
+        "Returns: data dict keyed by the schema's property names, plus metadata. "
+        "Use this tool instead of ghostcrawl_scrape when you need structured output "
+        "rather than raw page content. "
+        "Uses POST /v1/extract with Authorization: Bearer token auth."
+    )
+    args_schema: Type[BaseModel] = _ExtractInput
+    def _run(
+        self,
+        url: str,
+        schema_: Optional[dict] = None,
+        schema: Optional[dict] = None,
+        params: Optional[dict] = None,
+        **_: Any,
+    ) -> str:
+        resolved_schema = schema_ or schema or {}
+        body: Dict[str, Any] = {
+            "url": url,
+            **({"schema": resolved_schema} if resolved_schema else {}),
+            **(params or {}),
+        }
+        with get_client() as c:
+            r = c.post("/v1/extract", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+    async def _arun(
+        self,
+        url: str,
+        schema_: Optional[dict] = None,
+        schema: Optional[dict] = None,
+        params: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> str:
+        resolved_schema = schema_ or schema or {}
+        body: Dict[str, Any] = {
+            "url": url,
+            **({"schema": resolved_schema} if resolved_schema else {}),
+            **(params or {}),
+        }
+        async with get_async_client() as c:
+            r = await c.post("/v1/extract", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+class GhostcrawlCrawlTool(BaseTool):
+    """LangChain tool: crawl a site starting from a seed URL via ghostcrawl."""
+    name: str = "ghostcrawl_crawl"
+    description: str = (
+        "Crawl a website starting from a seed URL and return content from multiple pages. "
+        "ghostcrawl follows links using its stealth fleet, respecting same-origin and "
+        "max_pages bounds to avoid unbounded crawls. "
+        "SUPPORTED: url (seed URL), max_pages (1–500, default 10), same_origin (bool — "
+        "restrict to same scheme+host+port, default true), render_js (bool for JS-heavy sites). "
+        "UNSUPPORTED: authenticated crawls requiring session cookies (use ghostcrawl_scrape "
+        "in a loop with session_id for authenticated pages). "
+        "Returns: list of crawled pages with url, content, title, and status_code per page. "
+        "QUOTA NOTE: each page counts as one ghostcrawl credit. Check ghostcrawl_usage "
+        "before launching large crawl jobs. "
+        "Uses POST /v1/crawl with Authorization: Bearer token auth."
+    )
+    args_schema: Type[BaseModel] = _CrawlInput
+    def _run(
+        self,
+        url: str,
+        max_pages: int = 10,
+        same_origin: bool = True,
+        params: Optional[dict] = None,
+        **_: Any,
+    ) -> str:
+        body: Dict[str, Any] = {
+            "url": url,
+            "max_pages": max_pages,
+            "same_origin": same_origin,
+            **(params or {}),
+        }
+        with get_client() as c:
+            r = c.post("/v1/crawl", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+    async def _arun(
+        self,
+        url: str,
+        max_pages: int = 10,
+        same_origin: bool = True,
+        params: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> str:
+        body: Dict[str, Any] = {
+            "url": url,
+            "max_pages": max_pages,
+            "same_origin": same_origin,
+            **(params or {}),
+        }
+        async with get_async_client() as c:
+            r = await c.post("/v1/crawl", json=body)
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+class GhostcrawlUsageTool(BaseTool):
+    """LangChain tool: check ghostcrawl API quota and current usage."""
+    name: str = "ghostcrawl_usage"
+    description: str = (
+        "Check your ghostcrawl API quota and current usage statistics. "
+        "Use this tool BEFORE launching large crawl or scrape jobs to confirm "
+        "you have sufficient credit headroom. Also useful for cost-control loops "
+        "that should pause when credits drop below a threshold. "
+        "SUPPORTED: no input parameters required — returns current period stats. "
+        "Returns: JSON object with requests_used, requests_quota, credits_remaining, "
+        "and period_end_iso (ISO-8601 timestamp of current billing period end). "
+        "Uses GET /v1/usage with Authorization: Bearer token auth. "
+        "For a metered-dim-only breakdown, the billing endpoint "
+        "GET /v1/billing/usage?metered_only=true (D-05e item 10) narrows to "
+        "billable dims (request, identity_served, proxy_bytes)."
+    )
+    args_schema: Type[BaseModel] = _EmptyInput
+    def _run(self, **_: Any) -> str:
+        with get_client() as c:
+            r = c.get("/v1/usage")
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text
+    async def _arun(self, **_: Any) -> str:
+        async with get_async_client() as c:
+            r = await c.get("/v1/usage")
+            try:
+                r.raise_for_status()
+            except httpx.HTTPStatusError as e:
+                _handle_error(e)
+            return r.text

ghostcrawl_langchain-2.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,42 @@
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+[project]
+name = "ghostcrawl-langchain"
+version = "2.1.0"
+description = "LangChain tool wrappers for the ghostcrawl SaaS stealth-scraping API."
+readme = "README.md"
+license = "MIT"
+license-files = ["LICENSE"]
+requires-python = ">=3.9"
+dependencies = [
+    "langchain-core>=0.3",
+    "pydantic>=2.7",
+    "httpx>=0.27",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8",
+    "respx>=0.21",
+]
+[tool.hatch.build.targets.wheel]
+packages = ["ghostcrawl_langchain"]
+[tool.hatch.build.targets.sdist]
+include = [
+    "ghostcrawl_langchain",
+    "README.md",
+    "LICENSE",
+    "pyproject.toml",
+]
+exclude = [
+    ".gitignore",
+    "tests",
+]
+ignore-vcs = true
+[tool.pytest.ini_options]
+testpaths = ["tests"]