sourceweave-web-search 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ from sourceweave_web_search.cli import main
2
+ from sourceweave_web_search.tool import Tools
3
+
4
+ __all__ = ["Tools", "main"]
@@ -0,0 +1,64 @@
1
+ import argparse
2
+ from pathlib import Path
3
+ from typing import Sequence
4
+
5
+
6
def canonical_tool_path() -> Path:
    """Return the absolute path of the sibling ``tool.py`` module."""
    here = Path(__file__).resolve()
    return here.parent / "tool.py"
8
+
9
+
10
def default_output_path() -> Path:
    """Default artifact location: ``<repo root>/artifacts/sourceweave_web_search.py``."""
    repo_root = canonical_tool_path().parents[2]
    return repo_root / "artifacts" / "sourceweave_web_search.py"
12
+
13
+
14
def build_openwebui_artifact(
    output_path: Path | None = None, check: bool = False
) -> bool:
    """Copy the canonical tool module to the OpenWebUI artifact location.

    When *check* is true, nothing is written; instead report whether the
    artifact already matches the canonical source byte-for-byte. Otherwise
    write (or overwrite) the artifact and return True.
    """
    source = canonical_tool_path().read_text(encoding="utf-8")
    target = output_path if output_path is not None else default_output_path()

    if check:
        if not target.exists():
            return False
        return target.read_text(encoding="utf-8") == source

    # Create parent directories lazily so a fresh checkout works.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(source, encoding="utf-8")
    return True
26
+
27
+
28
def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments for the artifact builder."""
    cli = argparse.ArgumentParser(
        description="Build or validate the standalone OpenWebUI tool artifact."
    )
    cli.add_argument(
        "--check",
        action="store_true",
        help="Exit non-zero if artifacts/sourceweave_web_search.py is out of sync with src/sourceweave_web_search/tool.py.",
    )
    cli.add_argument(
        "--output",
        default=str(default_output_path()),
        help="Output path for the generated OpenWebUI tool file.",
    )
    return cli.parse_args(argv)
43
+
44
+
45
def main(argv: Sequence[str] | None = None) -> int:
    """Entry point for the artifact builder; returns a process exit code."""
    args = parse_args(argv)
    target = Path(args.output).resolve()
    in_sync = build_openwebui_artifact(output_path=target, check=args.check)

    if not args.check:
        print(f"Wrote OpenWebUI artifact to {target}")
        return 0

    if in_sync:
        print(f"OpenWebUI artifact is in sync: {target}")
        return 0

    # Out-of-sync is the only failing case.
    print(
        f"OpenWebUI artifact is out of sync: {target} != {canonical_tool_path()}"
    )
    return 1


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,189 @@
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ from typing import Any, Sequence
5
+
6
+ from sourceweave_web_search.config import build_tools
7
+
8
+
9
+ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
10
+ parser = argparse.ArgumentParser(
11
+ description="Call the SourceWeave Web Search tool directly from the package CLI."
12
+ )
13
+ parser.add_argument("--query", default="", help="Query for search_web")
14
+ parser.add_argument(
15
+ "--url",
16
+ dest="urls",
17
+ action="append",
18
+ default=[],
19
+ help=(
20
+ "Optional URL to crawl alongside search results. Repeatable. "
21
+ "May be a plain URL string or a JSON object like "
22
+ '\'{"url": "https://example.com/file.pdf", "convert_document": true}\'.'
23
+ ),
24
+ )
25
+ parser.add_argument(
26
+ "--related-links-limit",
27
+ type=int,
28
+ default=3,
29
+ help="Maximum number of stored related links to include per read_pages result. Use 0 to omit them.",
30
+ )
31
+ parser.add_argument(
32
+ "--depth",
33
+ choices=["quick", "normal", "deep"],
34
+ default="normal",
35
+ help="search_web depth",
36
+ )
37
+ parser.add_argument("--max-results", type=int, default=None)
38
+ parser.add_argument("--fresh", action="store_true")
39
+ parser.add_argument("--read-first-page", action="store_true")
40
+ parser.add_argument(
41
+ "--read-first-pages",
42
+ type=int,
43
+ default=0,
44
+ help="After search, batch-read the first N returned page_ids in a single read_pages call.",
45
+ )
46
+ parser.add_argument(
47
+ "--read-page-id",
48
+ dest="read_page_ids",
49
+ action="append",
50
+ default=[],
51
+ help="Read one or more page_ids. Repeat this flag to batch them into a single read_pages call.",
52
+ )
53
+ parser.add_argument("--focus", default="")
54
+ parser.add_argument("--max-chars", type=int, default=1200)
55
+ parser.add_argument("--pretty", action="store_true")
56
+ parser.add_argument(
57
+ "--include-metadata",
58
+ action="store_true",
59
+ help="Include per-query debug metadata in CLI output.",
60
+ )
61
+ parser.add_argument(
62
+ "--searxng-base-url",
63
+ default=None,
64
+ help="Optional override for SEARXNG_BASE_URL. The SOURCEWEAVE_SEARCH_SEARXNG_BASE_URL env var works too.",
65
+ )
66
+ parser.add_argument(
67
+ "--crawl4ai-base-url",
68
+ default=None,
69
+ help="Optional override for CRAWL4AI_BASE_URL. The SOURCEWEAVE_SEARCH_CRAWL4AI_BASE_URL env var works too.",
70
+ )
71
+ parser.add_argument(
72
+ "--cache-redis-url",
73
+ default=None,
74
+ help="Optional override for CACHE_REDIS_URL. The SOURCEWEAVE_SEARCH_CACHE_REDIS_URL env var works too.",
75
+ )
76
+ return parser.parse_args(argv)
77
+
78
+
79
+ def _page_ids_from_results(results: Any, count: int) -> list[str]:
80
+ if not isinstance(results, list) or count <= 0:
81
+ return []
82
+
83
+ return [
84
+ result.get("page_id", "")
85
+ for result in results[:count]
86
+ if result.get("page_id", "")
87
+ ]
88
+
89
+
90
+ def _urls_from_args(args: argparse.Namespace) -> list[Any] | None:
91
+ normalized_urls: list[Any] = []
92
+ for raw_value in args.urls:
93
+ value = str(raw_value or "").strip()
94
+ if not value:
95
+ continue
96
+ if value.startswith("{"):
97
+ try:
98
+ parsed = json.loads(value)
99
+ except json.JSONDecodeError as exc:
100
+ raise SystemExit(f"Invalid JSON passed to --url: {exc}") from exc
101
+ if not isinstance(parsed, dict) or not parsed.get("url"):
102
+ raise SystemExit(
103
+ "JSON passed to --url must be an object with at least a 'url' field"
104
+ )
105
+ normalized_urls.append(parsed)
106
+ continue
107
+ normalized_urls.append(value)
108
+ return normalized_urls or None
109
+
110
+
111
+ def _valve_overrides_from_args(args: argparse.Namespace) -> dict[str, Any]:
112
+ return {
113
+ "SEARXNG_BASE_URL": args.searxng_base_url,
114
+ "CRAWL4AI_BASE_URL": args.crawl4ai_base_url,
115
+ "CACHE_REDIS_URL": args.cache_redis_url,
116
+ }
117
+
118
+
119
+ async def _read_pages(
120
+ tool: Any,
121
+ page_ids: list[str],
122
+ focus: str,
123
+ related_links_limit: int,
124
+ max_chars: int,
125
+ ) -> Any:
126
+ if not page_ids:
127
+ return None
128
+
129
+ return await tool.read_pages(
130
+ page_ids[0] if len(page_ids) == 1 else page_ids,
131
+ focus=focus,
132
+ related_links_limit=related_links_limit,
133
+ max_chars=max_chars,
134
+ )
135
+
136
+
137
async def run_cli(args: argparse.Namespace) -> dict[str, Any]:
    """Execute the requested search and/or page reads and collect the payload.

    Raises SystemExit when neither a query nor any page ids were supplied.
    Explicit --read-page-id reads overwrite any auto-read result under the
    same "read_pages" key, matching the original CLI contract.
    """
    if not (args.query or args.read_page_ids):
        raise SystemExit("Provide --query or --read-page-id")

    tool = build_tools(valve_overrides=_valve_overrides_from_args(args))
    payload: dict[str, Any] = {}

    if args.query:
        search_results = await tool.search_web(
            query=args.query,
            urls=_urls_from_args(args),
            depth=args.depth,
            max_results=args.max_results,
            fresh=args.fresh,
        )
        payload["search_web"] = search_results
        if args.include_metadata:
            payload["search_metadata"] = tool.last_query_metadata

        # --read-first-page is shorthand for --read-first-pages 1.
        auto_count = max(args.read_first_pages, 1 if args.read_first_page else 0)
        auto_ids = _page_ids_from_results(search_results, auto_count)
        auto_payload = await _read_pages(
            tool,
            auto_ids,
            args.focus,
            args.related_links_limit,
            args.max_chars,
        )
        if auto_payload is not None:
            payload["read_pages"] = auto_payload

    if args.read_page_ids:
        explicit_ids = [page_id for page_id in args.read_page_ids if page_id]
        payload["read_pages"] = await _read_pages(
            tool,
            explicit_ids,
            args.focus,
            args.related_links_limit,
            args.max_chars,
        )

    return payload
179
+
180
+
181
def main(argv: Sequence[str] | None = None) -> int:
    """Parse arguments, run the async CLI flow, and print JSON results."""
    args = parse_args(argv)
    payload = asyncio.run(run_cli(args))
    indent = 2 if args.pretty else None
    print(json.dumps(payload, indent=indent, ensure_ascii=True))
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
@@ -0,0 +1,68 @@
1
+ import os
2
+ from dataclasses import dataclass, field
3
+ from typing import Any, Mapping
4
+
5
+ from sourceweave_web_search.tool import Tools
6
+
7
+
8
+ def _parse_bool(value: str) -> bool:
9
+ normalized = value.strip().lower()
10
+ if normalized in {"1", "true", "yes", "on"}:
11
+ return True
12
+ if normalized in {"0", "false", "no", "off"}:
13
+ return False
14
+ raise ValueError(f"Invalid boolean value: {value}")
15
+
16
+
17
+ def _coerce_env_value(raw_value: str, current_value: Any) -> Any:
18
+ if isinstance(current_value, bool):
19
+ return _parse_bool(raw_value)
20
+ if isinstance(current_value, int) and not isinstance(current_value, bool):
21
+ return int(raw_value)
22
+ if isinstance(current_value, float):
23
+ return float(raw_value)
24
+ return raw_value
25
+
26
+
27
@dataclass(slots=True)
class RuntimeOverrides:
    """Valve overrides harvested from SOURCEWEAVE_SEARCH_* environment variables."""

    # Mapping of valve field name -> coerced override value.
    valve_overrides: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_env(cls) -> "RuntimeOverrides":
        """Collect an override for every Valves field with a matching env var.

        Values are coerced to the type of the field's default so e.g. an
        int-valued valve gets an int, not a raw string.
        """
        defaults = Tools.Valves()
        collected: dict[str, Any] = {}
        for name in Tools.Valves.model_fields:
            raw = os.getenv(f"SOURCEWEAVE_SEARCH_{name}")
            if raw is None:
                continue
            collected[name] = _coerce_env_value(raw, getattr(defaults, name))
        return cls(valve_overrides=collected)

    def apply(self, tool: Tools) -> Tools:
        """Apply the collected overrides to *tool* and return the result."""
        return tool.apply_valve_overrides(self.valve_overrides)
48
+
49
+
50
# Thin module-level shim that forwards to the tool's private sync hook.
# NOTE(review): this reaches into Tools' private API (_sync_runtime_state);
# presumably kept so other modules can trigger a resync without touching a
# private member directly — confirm it is still referenced anywhere.
def _sync_runtime_state(tool: Tools) -> None:
    tool._sync_runtime_state()
52
+
53
+
54
def build_tools(
    *,
    runtime_overrides: RuntimeOverrides | None = None,
    valve_overrides: Mapping[str, Any] | None = None,
) -> Tools:
    """Construct a Tools instance with env and explicit valve overrides applied.

    Env-derived overrides (RuntimeOverrides.from_env() unless supplied) are
    applied first; explicit *valve_overrides* win on conflict. Entries whose
    value is None, or whose name is not a valve field, are ignored.
    """
    tool = Tools()
    env_source = (
        runtime_overrides if runtime_overrides is not None else RuntimeOverrides.from_env()
    )
    merged: dict[str, Any] = dict(env_source.valve_overrides)

    for name, value in (valve_overrides or {}).items():
        if value is None:
            continue
        if not hasattr(tool.valves, name):
            continue
        merged[name] = value

    return tool.apply_valve_overrides(merged)
@@ -0,0 +1,206 @@
1
+ import argparse
2
+ import os
3
+ from typing import Annotated
4
+ from typing import Literal
5
+ from typing import Sequence
6
+
7
+ from mcp.server.fastmcp import FastMCP
8
+ from pydantic import BaseModel
9
+ from pydantic import Field
10
+
11
+ from sourceweave_web_search.config import build_tools
12
+ from sourceweave_web_search.tool import Tools
13
+
14
+
15
class UrlTarget(BaseModel):
    """Per-URL crawl target accepted by search_web's ``urls`` parameter."""

    # Field descriptions are surfaced to MCP clients as parameter docs.
    url: str = Field(description="Absolute URL to crawl or convert.")
    convert_document: bool = Field(
        default=False,
        description="Force document conversion for this URL when it points to a document such as a PDF.",
    )
+
22
+
23
# Reusable Annotated parameter aliases for the MCP tool signatures below.
# Each pairs the runtime type with a pydantic Field whose description is
# exposed to MCP clients as parameter-level documentation.

SearchQuery = Annotated[
    str,
    Field(
        description=(
            "Search query. Prefer concise retrieval-style queries, quote exact errors, "
            "and use site: when a specific domain matters."
        )
    ),
]

# Optional extra crawl targets: plain strings or UrlTarget-style objects.
SearchUrls = Annotated[
    list[str | UrlTarget] | None,
    Field(
        description=(
            "Optional specific URLs to crawl in addition to search results. Each item may be "
            "either a plain URL string or an object with per-URL options like convert_document."
        )
    ),
]

SearchDepth = Annotated[
    Literal["quick", "normal", "deep"],
    Field(
        description=(
            "How much search and crawl effort to spend. quick is fastest, normal is balanced, "
            "and deep explores more candidates."
        )
    ),
]

SearchMaxResults = Annotated[
    int | None,
    Field(description="Optional cap on how many summarized results to return."),
]

SearchFresh = Annotated[
    bool,
    Field(description="If true, bypass cached search and page results for this call."),
]

ReadPageIds = Annotated[
    list[str],
    Field(
        description=(
            "One or more page_ids returned by search_web. Batch related pages into one "
            "call when comparing or synthesizing multiple sources."
        )
    ),
]

ReadFocus = Annotated[
    str,
    Field(
        description=(
            "Optional focus phrase used to extract the most relevant sections from stored page "
            "content. Use short topic phrases, exact errors, function names, or concepts."
        )
    ),
]

# ge=0 lets 0 mean "omit the links array entirely".
ReadRelatedLinksLimit = Annotated[
    int,
    Field(
        description=(
            "Maximum number of stored related links to return per page. Use 0 to omit the links "
            "array while still returning related_links_total and related_links_more_available."
        ),
        ge=0,
    ),
]

ReadMaxChars = Annotated[
    int,
    Field(description="Maximum number of characters to return per page."),
]
+ ]
98
+
99
+
100
+ def _mcp_host() -> str:
101
+ return os.getenv("FASTMCP_HOST", "127.0.0.1")
102
+
103
+
104
+ def _mcp_port() -> int:
105
+ return int(os.getenv("FASTMCP_PORT", "8000"))
106
+
107
+
108
def build_mcp_server(
    tool: Tools | None = None,
    *,
    host: str | None = None,
    port: int | None = None,
) -> FastMCP:
    """Create a FastMCP server exposing search_web and read_pages.

    Uses *tool* when provided, otherwise constructs one via build_tools().
    Host and port fall back to FASTMCP_HOST / FASTMCP_PORT (127.0.0.1:8000
    by default); they only matter for HTTP transports.
    """
    tool_instance = tool or build_tools()
    # NOTE(review): `host or _mcp_host()` treats an empty-string host as
    # unset and falls back to the env/default — confirm that is intended.
    server = FastMCP(
        "sourceweave-web-search",
        host=host or _mcp_host(),
        port=port if port is not None else _mcp_port(),
    )

    # Thin async pass-through: all real work happens in the shared Tools
    # instance; these wrappers exist to attach MCP names and descriptions.
    @server.tool(
        name="search_web",
        description=(
            "Search the web for relevant sources and crawl the selected pages. "
            "Use concise retrieval-style queries, quote exact errors, and add site: filters when domain preference matters. "
            "Returns compact summaries plus page_ids. Use read_pages next when you need full content. "
            "If you already know an important URL, pass it in urls; use convert_document for explicit document URLs like PDFs. "
            "Low-utility crawled pages may include page_quality such as challenge or blocked."
        ),
    )
    async def search_web(
        query: SearchQuery,
        urls: SearchUrls = None,
        depth: SearchDepth = "normal",
        max_results: SearchMaxResults = None,
        fresh: SearchFresh = False,
    ):
        return await tool_instance.search_web(
            query=query,
            urls=urls,
            depth=depth,
            max_results=max_results,
            fresh=fresh,
        )

    # Same pass-through pattern for batched page reads.
    @server.tool(
        name="read_pages",
        description=(
            "Retrieve the full cleaned content for one or more previously returned page_ids. "
            "Prefer batching related page_ids in a single call. Use focus to extract the most relevant sections. "
            "Use related_links_limit=0 when you only want page content without page-adjacent links. "
            "Returned pages may include page_quality when a page looks challenge-like or blocked."
        ),
    )
    async def read_pages(
        page_ids: ReadPageIds,
        focus: ReadFocus = "",
        related_links_limit: ReadRelatedLinksLimit = 3,
        max_chars: ReadMaxChars = 8000,
    ):
        return await tool_instance.read_pages(
            page_ids,
            focus=focus,
            related_links_limit=related_links_limit,
            max_chars=max_chars,
        )

    return server
169
+
170
+
171
+ def parse_args(argv: Sequence[str] | None = None) -> argparse.Namespace:
172
+ parser = argparse.ArgumentParser(
173
+ description="Run the SourceWeave Web Search MCP server."
174
+ )
175
+ parser.add_argument(
176
+ "--transport",
177
+ choices=["stdio", "sse", "streamable-http"],
178
+ default="stdio",
179
+ help="MCP transport to run. stdio is the default for uvx-based MCP clients.",
180
+ )
181
+ parser.add_argument(
182
+ "--host",
183
+ help=(
184
+ "Host to bind for sse or streamable-http transport. "
185
+ "Ignored for stdio. Defaults to FASTMCP_HOST or 127.0.0.1."
186
+ ),
187
+ )
188
+ parser.add_argument(
189
+ "--port",
190
+ type=int,
191
+ help=(
192
+ "Port to bind for sse or streamable-http transport. "
193
+ "Ignored for stdio. Defaults to FASTMCP_PORT or 8000."
194
+ ),
195
+ )
196
+ return parser.parse_args(argv)
197
+
198
+
199
def main(argv: Sequence[str] | None = None) -> int:
    """Entry point: build the MCP server and run the chosen transport."""
    args = parse_args(argv)
    server = build_mcp_server(host=args.host, port=args.port)
    server.run(transport=args.transport)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())