sourceweave-web-search 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ from sourceweave_web_search.cli import main
2
+ from sourceweave_web_search.tool import Tools
3
+
4
+ __all__ = ["Tools", "main"]
@@ -0,0 +1,64 @@
1
+ import argparse
2
+ from pathlib import Path
3
+ from typing import Sequence
4
+
5
+
6
def canonical_tool_path() -> Path:
    """Return the absolute path of the sibling ``tool.py`` module."""
    here = Path(__file__).resolve()
    return here.parent / "tool.py"
8
+
9
+
10
def default_output_path() -> Path:
    """Default artifact location: ``<repo root>/artifacts/sourceweave_web_search.py``."""
    repo_root = canonical_tool_path().parents[2]
    return repo_root / "artifacts" / "sourceweave_web_search.py"
12
+
13
+
14
def build_openwebui_artifact(
    output_path: Path | None = None, check: bool = False
) -> bool:
    """Copy the canonical tool module to the OpenWebUI artifact location.

    When *check* is true, nothing is written; instead report whether the
    artifact already matches the canonical source byte-for-byte. Otherwise
    write (or overwrite) the artifact and return True.
    """
    source = canonical_tool_path().read_text(encoding="utf-8")
    target = output_path if output_path is not None else default_output_path()

    if check:
        if not target.exists():
            return False
        return target.read_text(encoding="utf-8") == source

    # Create parent directories lazily so a fresh checkout works.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(source, encoding="utf-8")
    return True
26
+
27
+
28
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments for the artifact builder."""
    cli = argparse.ArgumentParser(
        description="Build or validate the standalone OpenWebUI tool artifact."
    )
    cli.add_argument(
        "--check",
        action="store_true",
        help="Exit non-zero if artifacts/sourceweave_web_search.py is out of sync with src/sourceweave_web_search/tool.py.",
    )
    cli.add_argument(
        "--output",
        default=str(default_output_path()),
        help="Output path for the generated OpenWebUI tool file.",
    )
    return cli.parse_args(argv)
43
+
44
+
45
def main(argv: Sequence[str] | None = None) -> int:
    """Entry point for the artifact builder; returns a process exit code."""
    args = parse_args(argv)
    target = Path(args.output).resolve()
    in_sync = build_openwebui_artifact(output_path=target, check=args.check)

    if not args.check:
        print(f"Wrote OpenWebUI artifact to {target}")
        return 0

    if in_sync:
        print(f"OpenWebUI artifact is in sync: {target}")
        return 0

    # Out-of-sync is the only failing case.
    print(
        f"OpenWebUI artifact is out of sync: {target} != {canonical_tool_path()}"
    )
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,189 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ from typing import Any, Sequence
5
+
6
+ from sourceweave_web_search.config import build_tools
7
+
8
+
9
+ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
10
+ parser = argparse.ArgumentParser(
11
+ description="Call the SourceWeave Web Search tool directly from the package CLI."
12
+ )
13
+ parser.add_argument("--query", default="", help="Query for search_web")
14
+ parser.add_argument(
15
+ "--url",
16
+ dest="urls",
17
+ action="append",
18
+ default=[],
19
+ help=(
20
+ "Optional URL to crawl alongside search results. Repeatable. "
21
+ "May be a plain URL string or a JSON object like "
22
+ '\'{"url": "https://example.com/file.pdf", "convert_document": true}\'.'
23
+ ),
24
+ )
25
+ parser.add_argument(
26
+ "--related-links-limit",
27
+ type=int,
28
+ default=3,
29
+ help="Maximum number of stored related links to include per read_pages result. Use 0 to omit them.",
30
+ )
31
+ parser.add_argument(
32
+ "--depth",
33
+ choices=["quick", "normal", "deep"],
34
+ default="normal",
35
+ help="search_web depth",
36
+ )
37
+ parser.add_argument("--max-results", type=int, default=None)
38
+ parser.add_argument("--fresh", action="store_true")
39
+ parser.add_argument("--read-first-page", action="store_true")
40
+ parser.add_argument(
41
+ "--read-first-pages",
42
+ type=int,
43
+ default=0,
44
+ help="After search, batch-read the first N returned page_ids in a single read_pages call.",
45
+ )
46
+ parser.add_argument(
47
+ "--read-page-id",
48
+ dest="read_page_ids",
49
+ action="append",
50
+ default=[],
51
+ help="Read one or more page_ids. Repeat this flag to batch them into a single read_pages call.",
52
+ )
53
+ parser.add_argument("--focus", default="")
54
+ parser.add_argument("--max-chars", type=int, default=1200)
55
+ parser.add_argument("--pretty", action="store_true")
56
+ parser.add_argument(
57
+ "--include-metadata",
58
+ action="store_true",
59
+ help="Include per-query debug metadata in CLI output.",
60
+ )
61
+ parser.add_argument(
62
+ "--searxng-base-url",
63
+ default=None,
64
+ help="Optional override for SEARXNG_BASE_URL. The SOURCEWEAVE_SEARCH_SEARXNG_BASE_URL env var works too.",
65
+ )
66
+ parser.add_argument(
67
+ "--crawl4ai-base-url",
68
+ default=None,
69
+ help="Optional override for CRAWL4AI_BASE_URL. The SOURCEWEAVE_SEARCH_CRAWL4AI_BASE_URL env var works too.",
70
+ )
71
+ parser.add_argument(
72
+ "--cache-redis-url",
73
+ default=None,
74
+ help="Optional override for CACHE_REDIS_URL. The SOURCEWEAVE_SEARCH_CACHE_REDIS_URL env var works too.",
75
+ )
76
+ return parser.parse_args(argv)
77
+
78
+
79
+ def _page_ids_from_results(results: Any, count: int) -> list[str]:
80
+ if not isinstance(results, list) or count <= 0:
81
+ return []
82
+
83
+ return [
84
+ result.get("page_id", "")
85
+ for result in results[:count]
86
+ if result.get("page_id", "")
87
+ ]
88
+
89
+
90
+ def _urls_from_args(args: argparse.Namespace) -> list[Any] | None:
91
+ normalized_urls: list[Any] = []
92
+ for raw_value in args.urls:
93
+ value = str(raw_value or "").strip()
94
+ if not value:
95
+ continue
96
+ if value.startswith("{"):
97
+ try:
98
+ parsed = json.loads(value)
99
+ except json.JSONDecodeError as exc:
100
+ raise SystemExit(f"Invalid JSON passed to --url: {exc}") from exc
101
+ if not isinstance(parsed, dict) or not parsed.get("url"):
102
+ raise SystemExit(
103
+ "JSON passed to --url must be an object with at least a 'url' field"
104
+ )
105
+ normalized_urls.append(parsed)
106
+ continue
107
+ normalized_urls.append(value)
108
+ return normalized_urls or None
109
+
110
+
111
+ def _valve_overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:
112
+ return {
113
+ "SEARXNG_BASE_URL": args.searxng_base_url,
114
+ "CRAWL4AI_BASE_URL": args.crawl4ai_base_url,
115
+ "CACHE_REDIS_URL": args.cache_redis_url,
116
+ }
117
+
118
+
119
+ async def _read_pages(
120
+ tool: Any,
121
+ page_ids: list[str],
122
+ focus: str,
123
+ related_links_limit: int,
124
+ max_chars: int,
125
+ ) -> Any:
126
+ if not page_ids:
127
+ return None
128
+
129
+ return await tool.read_pages(
130
+ page_ids[0] if len(page_ids) == 1 else page_ids,
131
+ focus=focus,
132
+ related_links_limit=related_links_limit,
133
+ max_chars=max_chars,
134
+ )
135
+
136
+
137
async def run_cli(args: argparse.Namespace) -> dict[str, Any]:
    """Execute the requested search and/or page reads and collect the payload.

    Raises SystemExit when neither a query nor any page ids were supplied.
    Explicit --read-page-id reads overwrite any auto-read result under the
    same "read_pages" key, matching the original CLI contract.
    """
    if not (args.query or args.read_page_ids):
        raise SystemExit("Provide --query or --read-page-id")

    tool = build_tools(valve_overrides=_valve_overrides_from_args(args))
    payload: dict[str, Any] = {}

    if args.query:
        search_results = await tool.search_web(
            query=args.query,
            urls=_urls_from_args(args),
            depth=args.depth,
            max_results=args.max_results,
            fresh=args.fresh,
        )
        payload["search_web"] = search_results
        if args.include_metadata:
            payload["search_metadata"] = tool.last_query_metadata

        # --read-first-page is shorthand for --read-first-pages 1.
        auto_count = max(args.read_first_pages, 1 if args.read_first_page else 0)
        auto_ids = _page_ids_from_results(search_results, auto_count)
        auto_payload = await _read_pages(
            tool,
            auto_ids,
            args.focus,
            args.related_links_limit,
            args.max_chars,
        )
        if auto_payload is not None:
            payload["read_pages"] = auto_payload

    if args.read_page_ids:
        explicit_ids = [page_id for page_id in args.read_page_ids if page_id]
        payload["read_pages"] = await _read_pages(
            tool,
            explicit_ids,
            args.focus,
            args.related_links_limit,
            args.max_chars,
        )

    return payload
179
+
180
+
181
def main(argv: Sequence[str] | None = None) -> int:
    """Parse arguments, run the async CLI flow, and print JSON results."""
    args = parse_args(argv)
    payload = asyncio.run(run_cli(args))
    indent = 2 if args.pretty else None
    print(json.dumps(payload, indent=indent, ensure_ascii=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,68 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+ from typing import Any, Mapping
4
+
5
+ from sourceweave_web_search.tool import Tools
6
+
7
+
8
+ def _parse_bool(value: str) -> bool:
9
+ normalized = value.strip().lower()
10
+ if normalized in {"1", "true", "yes", "on"}:
11
+ return True
12
+ if normalized in {"0", "false", "no", "off"}:
13
+ return False
14
+ raise ValueError(f"Invalid boolean value: {value}")
15
+
16
+
17
+ def _coerce_env_value(raw_value: str, current_value: Any) -> Any:
18
+ if isinstance(current_value, bool):
19
+ return _parse_bool(raw_value)
20
+ if isinstance(current_value, int) and not isinstance(current_value, bool):
21
+ return int(raw_value)
22
+ if isinstance(current_value, float):
23
+ return float(raw_value)
24
+ return raw_value
25
+
26
+
27
@dataclass(slots=True)
class RuntimeOverrides:
    """Valve overrides harvested from SOURCEWEAVE_SEARCH_* environment variables."""

    # Mapping of valve field name -> coerced override value.
    valve_overrides: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_env(cls) -> "RuntimeOverrides":
        """Collect an override for every Valves field with a matching env var.

        Values are coerced to the type of the field's default so e.g. an
        int-valued valve gets an int, not a raw string.
        """
        defaults = Tools.Valves()
        collected: dict[str, Any] = {}
        for name in Tools.Valves.model_fields:
            raw = os.getenv(f"SOURCEWEAVE_SEARCH_{name}")
            if raw is None:
                continue
            collected[name] = _coerce_env_value(raw, getattr(defaults, name))
        return cls(valve_overrides=collected)

    def apply(self, tool: Tools) -> Tools:
        """Apply the collected overrides to *tool* and return the result."""
        return tool.apply_valve_overrides(self.valve_overrides)
48
+
49
+
50
# Thin module-level shim that forwards to the tool's private sync hook.
# NOTE(review): this reaches into Tools' private API (_sync_runtime_state);
# presumably kept so other modules can trigger a resync without touching a
# private member directly — confirm it is still referenced anywhere.
def _sync_runtime_state(tool: Tools) -> None:
    tool._sync_runtime_state()
52
+
53
+
54
def build_tools(
    *,
    runtime_overrides: RuntimeOverrides | None = None,
    valve_overrides: Mapping[str, Any] | None = None,
) -> Tools:
    """Construct a Tools instance with env and explicit valve overrides applied.

    Env-derived overrides (RuntimeOverrides.from_env() unless supplied) are
    applied first; explicit *valve_overrides* win on conflict. Entries whose
    value is None, or whose name is not a valve field, are ignored.
    """
    tool = Tools()
    env_source = (
        runtime_overrides if runtime_overrides is not None else RuntimeOverrides.from_env()
    )
    merged: dict[str, Any] = dict(env_source.valve_overrides)

    for name, value in (valve_overrides or {}).items():
        if value is None:
            continue
        if not hasattr(tool.valves, name):
            continue
        merged[name] = value

    return tool.apply_valve_overrides(merged)
@@ -0,0 +1,206 @@
1
+ import argparse
2
+ import os
3
+ from typing import Annotated
4
+ from typing import Literal
5
+ from typing import Sequence
6
+
7
+ from mcp.server.fastmcp import FastMCP
8
+ from pydantic import BaseModel
9
+ from pydantic import Field
10
+
11
+ from sourceweave_web_search.config import build_tools
12
+ from sourceweave_web_search.tool import Tools
13
+
14
+
15
class UrlTarget(BaseModel):
    """Per-URL crawl target accepted by search_web's ``urls`` parameter."""

    # Field descriptions are surfaced to MCP clients as parameter docs.
    url: str = Field(description="Absolute URL to crawl or convert.")
    convert_document: bool = Field(
        default=False,
        description="Force document conversion for this URL when it points to a document such as a PDF.",
    )
+
22
+
23
# Reusable Annotated parameter aliases for the MCP tool signatures below.
# Each pairs the runtime type with a pydantic Field whose description is
# exposed to MCP clients as parameter-level documentation.

SearchQuery = Annotated[
    str,
    Field(
        description=(
            "Search query. Prefer concise retrieval-style queries, quote exact errors, "
            "and use site: when a specific domain matters."
        )
    ),
]

# Optional extra crawl targets: plain strings or UrlTarget-style objects.
SearchUrls = Annotated[
    list[str | UrlTarget] | None,
    Field(
        description=(
            "Optional specific URLs to crawl in addition to search results. Each item may be "
            "either a plain URL string or an object with per-URL options like convert_document."
        )
    ),
]

SearchDepth = Annotated[
    Literal["quick", "normal", "deep"],
    Field(
        description=(
            "How much search and crawl effort to spend. quick is fastest, normal is balanced, "
            "and deep explores more candidates."
        )
    ),
]

SearchMaxResults = Annotated[
    int | None,
    Field(description="Optional cap on how many summarized results to return."),
]

SearchFresh = Annotated[
    bool,
    Field(description="If true, bypass cached search and page results for this call."),
]

ReadPageIds = Annotated[
    list[str],
    Field(
        description=(
            "One or more page_ids returned by search_web. Batch related pages into one "
            "call when comparing or synthesizing multiple sources."
        )
    ),
]

ReadFocus = Annotated[
    str,
    Field(
        description=(
            "Optional focus phrase used to extract the most relevant sections from stored page "
            "content. Use short topic phrases, exact errors, function names, or concepts."
        )
    ),
]

# ge=0 lets 0 mean "omit the links array entirely".
ReadRelatedLinksLimit = Annotated[
    int,
    Field(
        description=(
            "Maximum number of stored related links to return per page. Use 0 to omit the links "
            "array while still returning related_links_total and related_links_more_available."
        ),
        ge=0,
    ),
]

ReadMaxChars = Annotated[
    int,
    Field(description="Maximum number of characters to return per page."),
]
+ ]
98
+
99
+
100
+ def _mcp_host() -> str:
101
+ return os.getenv("FASTMCP_HOST", "127.0.0.1")
102
+
103
+
104
+ def _mcp_port() -> int:
105
+ return int(os.getenv("FASTMCP_PORT", "8000"))
106
+
107
+
108
def build_mcp_server(
    tool: Tools | None = None,
    *,
    host: str | None = None,
    port: int | None = None,
) -> FastMCP:
    """Create a FastMCP server exposing search_web and read_pages.

    Uses *tool* when provided, otherwise constructs one via build_tools().
    Host and port fall back to FASTMCP_HOST / FASTMCP_PORT (127.0.0.1:8000
    by default); they only matter for HTTP transports.
    """
    tool_instance = tool or build_tools()
    # NOTE(review): `host or _mcp_host()` treats an empty-string host as
    # unset and falls back to the env/default — confirm that is intended.
    server = FastMCP(
        "sourceweave-web-search",
        host=host or _mcp_host(),
        port=port if port is not None else _mcp_port(),
    )

    # Thin async pass-through: all real work happens in the shared Tools
    # instance; these wrappers exist to attach MCP names and descriptions.
    @server.tool(
        name="search_web",
        description=(
            "Search the web for relevant sources and crawl the selected pages. "
            "Use concise retrieval-style queries, quote exact errors, and add site: filters when domain preference matters. "
            "Returns compact summaries plus page_ids. Use read_pages next when you need full content. "
            "If you already know an important URL, pass it in urls; use convert_document for explicit document URLs like PDFs. "
            "Low-utility crawled pages may include page_quality such as challenge or blocked."
        ),
    )
    async def search_web(
        query: SearchQuery,
        urls: SearchUrls = None,
        depth: SearchDepth = "normal",
        max_results: SearchMaxResults = None,
        fresh: SearchFresh = False,
    ):
        return await tool_instance.search_web(
            query=query,
            urls=urls,
            depth=depth,
            max_results=max_results,
            fresh=fresh,
        )

    # Same pass-through pattern for batched page reads.
    @server.tool(
        name="read_pages",
        description=(
            "Retrieve the full cleaned content for one or more previously returned page_ids. "
            "Prefer batching related page_ids in a single call. Use focus to extract the most relevant sections. "
            "Use related_links_limit=0 when you only want page content without page-adjacent links. "
            "Returned pages may include page_quality when a page looks challenge-like or blocked."
        ),
    )
    async def read_pages(
        page_ids: ReadPageIds,
        focus: ReadFocus = "",
        related_links_limit: ReadRelatedLinksLimit = 3,
        max_chars: ReadMaxChars = 8000,
    ):
        return await tool_instance.read_pages(
            page_ids,
            focus=focus,
            related_links_limit=related_links_limit,
            max_chars=max_chars,
        )

    return server
169
+
170
+
171
+ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
172
+ parser = argparse.ArgumentParser(
173
+ description="Run the SourceWeave Web Search MCP server."
174
+ )
175
+ parser.add_argument(
176
+ "--transport",
177
+ choices=["stdio", "sse", "streamable-http"],
178
+ default="stdio",
179
+ help="MCP transport to run. stdio is the default for uvx-based MCP clients.",
180
+ )
181
+ parser.add_argument(
182
+ "--host",
183
+ help=(
184
+ "Host to bind for sse or streamable-http transport. "
185
+ "Ignored for stdio. Defaults to FASTMCP_HOST or 127.0.0.1."
186
+ ),
187
+ )
188
+ parser.add_argument(
189
+ "--port",
190
+ type=int,
191
+ help=(
192
+ "Port to bind for sse or streamable-http transport. "
193
+ "Ignored for stdio. Defaults to FASTMCP_PORT or 8000."
194
+ ),
195
+ )
196
+ return parser.parse_args(argv)
197
+
198
+
199
def main(argv: Sequence[str] | None = None) -> int:
    """Entry point: build the MCP server and run the chosen transport."""
    args = parse_args(argv)
    server = build_mcp_server(host=args.host, port=args.port)
    server.run(transport=args.transport)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())