browser-goat 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browser_goat/__init__.py +15 -0
- browser_goat/cli.py +277 -0
- browser_goat/extraction/__init__.py +9 -0
- browser_goat/extraction/content_extractor.py +415 -0
- browser_goat/extraction/goal_oriented.py +175 -0
- browser_goat/extraction/scrapling_fetcher.py +237 -0
- browser_goat/mcp_server.py +132 -0
- browser_goat/models.py +320 -0
- browser_goat/post_search/__init__.py +17 -0
- browser_goat/post_search/ranking.py +395 -0
- browser_goat/post_search/url_pipeline.py +185 -0
- browser_goat/pre_search/__init__.py +16 -0
- browser_goat/pre_search/browser_profiles.py +224 -0
- browser_goat/pre_search/language_detect.py +76 -0
- browser_goat/pre_search/query_intel.py +387 -0
- browser_goat/reliability/__init__.py +15 -0
- browser_goat/reliability/force_answer.py +77 -0
- browser_goat/reliability/give_up_detector.py +110 -0
- browser_goat/reliability/quality_gate.py +64 -0
- browser_goat/router.py +457 -0
- browser_goat/searxng_client.py +200 -0
- browser_goat/strategy/__init__.py +7 -0
- browser_goat/strategy/adaptive_explorer.py +466 -0
- browser_goat/strategy/query_classifier.py +383 -0
- browser_goat/strategy/recursive_decomposer.py +380 -0
- browser_goat/verification/__init__.py +7 -0
- browser_goat/verification/answer_voter.py +146 -0
- browser_goat/verification/llm_verifier.py +263 -0
- browser_goat/verification/multi_rollout.py +206 -0
- browser_goat-0.1.0.dist-info/METADATA +24 -0
- browser_goat-0.1.0.dist-info/RECORD +35 -0
- browser_goat-0.1.0.dist-info/WHEEL +5 -0
- browser_goat-0.1.0.dist-info/entry_points.txt +3 -0
- browser_goat-0.1.0.dist-info/licenses/LICENSE +21 -0
- browser_goat-0.1.0.dist-info/top_level.txt +1 -0
browser_goat/__init__.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""browser-goat — Meta-layer search intelligence wrapping SearXNG.
|
|
2
|
+
|
|
3
|
+
Layers:
|
|
4
|
+
pre_search — Query intelligence, language detection, browser profiles
|
|
5
|
+
post_search — URL pipeline, RRF+BM25+MMR ranking, dedup
|
|
6
|
+
extraction — Content extraction, goal-oriented, Scrapling anti-bot
|
|
7
|
+
reliability — Give-up detection, quality gating, force answer
|
|
8
|
+
strategy — Query classification, adaptive exploration (Phase 2)
|
|
9
|
+
verification — Multi-rollout voting (Phase 3)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from browser_goat.router import BrowserGoat
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0"
|
|
15
|
+
__all__ = ["BrowserGoat"]
|
browser_goat/cli.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""CLI entry point for browser-goat.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
browser-goat search "What is Python?" --searxng-url http://localhost:8080
|
|
5
|
+
browser-goat search "latest AI news" --time-range week --strategy explore
|
|
6
|
+
browser-goat search "Python vs Rust" --reliability high
|
|
7
|
+
uvx browser-goat search "quantum computing research"
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import asyncio
|
|
14
|
+
import json
|
|
15
|
+
import sys
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from browser_goat.router import BrowserGoat
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def build_parser() -> argparse.ArgumentParser:
    """Build the ``browser-goat`` command-line parser.

    The parser requires one of four sub-commands, stored on the parsed
    namespace as ``command``: ``search``, ``extract``, ``verify`` or ``serve``.

    Returns:
        The fully configured top-level ``argparse.ArgumentParser``.
    """

    def add_searxng_url(cmd: argparse.ArgumentParser, help_text: str) -> None:
        # Every sub-command accepts the same option; only the help text varies.
        cmd.add_argument(
            "--searxng-url",
            default="http://localhost:8080",
            help=help_text,
        )

    def add_format(cmd: argparse.ArgumentParser) -> None:
        cmd.add_argument(
            "--format",
            choices=["json", "pretty"],
            default="json",
            help="Output format (default: json)",
        )

    root = argparse.ArgumentParser(
        prog="browser-goat",
        description="Meta-layer search intelligence wrapping SearXNG",
    )
    commands = root.add_subparsers(dest="command", required=True)

    # search: full pipeline
    search_cmd = commands.add_parser("search", help="Run a full search pipeline")
    search_cmd.add_argument("query", help="Search query string")
    add_searxng_url(search_cmd, "SearXNG instance URL (default: http://localhost:8080)")
    search_cmd.add_argument(
        "--engines",
        nargs="*",
        default=None,
        help="SearXNG engines to use (e.g. google bing scholar)",
    )
    search_cmd.add_argument(
        "--time-range",
        choices=["day", "week", "month", "year"],
        default=None,
        help="Time filter for results",
    )
    search_cmd.add_argument(
        "--language",
        default="en",
        help="Language code for results (default: en)",
    )
    search_cmd.add_argument(
        "--max-sources",
        type=int,
        default=15,
        help="Maximum sources to extract (default: 15)",
    )
    search_cmd.add_argument(
        "--strategy",
        choices=["default", "auto", "explore", "decompose"],
        default="default",
        help="Search strategy (default: default)",
    )
    search_cmd.add_argument(
        "--reliability",
        choices=["standard", "high", "maximum"],
        default="standard",
        help="Reliability mode (default: standard)",
    )
    add_format(search_cmd)

    # extract: single-URL content extraction
    extract_cmd = commands.add_parser("extract", help="Extract content from a URL")
    extract_cmd.add_argument("url", help="URL to extract content from")
    add_searxng_url(extract_cmd, "SearXNG instance URL")
    add_format(extract_cmd)

    # verify: answer verification
    verify_cmd = commands.add_parser(
        "verify", help="Verify an answer via multi-rollout voting"
    )
    verify_cmd.add_argument("query", help="The query to verify")
    add_searxng_url(verify_cmd, "SearXNG instance URL")
    verify_cmd.add_argument(
        "--rollouts",
        type=int,
        default=5,
        help="Number of parallel rollouts (default: 5)",
    )
    add_format(verify_cmd)

    # serve: HTTP JSON API
    serve_cmd = commands.add_parser(
        "serve", help="Run browser-goat as an HTTP JSON API server"
    )
    serve_cmd.add_argument(
        "--host",
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)",
    )
    serve_cmd.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to listen on (default: 8000)",
    )
    add_searxng_url(serve_cmd, "SearXNG instance URL (default: http://localhost:8080)")

    return root
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def format_output(data: Any, fmt: str) -> str:
    """Serialize ``data`` to a JSON string.

    Args:
        data: Either a JSON-serializable value or an object exposing
            ``model_dump`` (and ``model_dump_json``) methods.
        fmt: ``"pretty"`` produces 2-space indented JSON; any other value
            produces compact single-line JSON.

    Returns:
        The JSON text. Non-ASCII characters are emitted as-is when the
        standard ``json`` encoder is used.
    """
    is_model = hasattr(data, "model_dump")

    if fmt != "pretty":
        if is_model:
            # Let the model serialize itself for the compact form.
            return str(data.model_dump_json())
        return json.dumps(data, ensure_ascii=False)

    payload = data.model_dump() if is_model else data
    return json.dumps(payload, indent=2, ensure_ascii=False)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
async def cmd_search(args: argparse.Namespace) -> None:
    """Run the ``search`` sub-command and print the formatted result.

    Args:
        args: Parsed namespace of the ``search`` sub-command; the query and
            every search option are forwarded to ``BrowserGoat.search``.
    """
    goat = BrowserGoat(searxng_url=args.searxng_url)
    search_options = {
        "query": args.query,
        "engines": args.engines,
        "time_range": args.time_range,
        "language": args.language,
        "max_sources": args.max_sources,
        "strategy": args.strategy,
        "reliability_mode": args.reliability,
    }
    outcome = await goat.search(**search_options)
    print(format_output(outcome, args.format))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
async def cmd_extract(args: argparse.Namespace) -> None:
    """Run the ``extract`` sub-command: fetch one URL and print its content.

    On a failed fetch a JSON ``{"error": ...}`` object is written to stderr
    and the process exits with status 1. Otherwise a summary (URL, title,
    text preview, extraction tier, full text length) is printed.

    Args:
        args: Parsed namespace of the ``extract`` sub-command.
    """
    goat = BrowserGoat(searxng_url=args.searxng_url)
    page_fetcher = goat.scrapling
    page_extractor = goat.content_extractor
    browser_profile = goat.browser_profiles.get_random_profile()

    fetched = await page_fetcher.fetch(args.url, browser_profile)
    if not fetched.success:
        failure = {"error": fetched.error or "fetch failed"}
        print(json.dumps(failure), file=sys.stderr)
        sys.exit(1)

    content = page_extractor.extract(fetched.html, args.url)

    # Only the first 1000 characters are shown; longer text gets an ellipsis.
    if len(content.text) > 1000:
        preview = content.text[:1000] + "..."
    else:
        preview = content.text

    summary = {
        "url": args.url,
        "title": content.title,
        "text": preview,
        "extraction_tier": content.extraction_tier,
        "text_length": len(content.text),
    }
    print(format_output(summary, args.format))
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
async def cmd_verify(args: argparse.Namespace) -> None:
    """Run the ``verify`` sub-command and print the formatted result.

    In this function ``args.rollouts`` is used only to pick the reliability
    mode passed to ``BrowserGoat.search``: more than 5 selects ``"maximum"``,
    anything else selects ``"high"``.

    Args:
        args: Parsed namespace of the ``verify`` sub-command.
    """
    goat = BrowserGoat(searxng_url=args.searxng_url)
    mode = "maximum" if args.rollouts > 5 else "high"
    outcome = await goat.search(query=args.query, reliability_mode=mode)
    print(format_output(outcome, args.format))
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
async def cmd_serve(args: argparse.Namespace) -> None:
    """Run browser-goat as a minimal HTTP JSON API server (zero extra deps).

    Routes:
        ``GET /health`` and ``GET /`` respond with ``{"status":"ok"}``.
        ``POST /search`` expects a JSON object body with ``query`` and the
        optional keys ``time_range``, ``max_sources``, ``strategy`` and
        ``reliability``; it responds with the search result serialized via
        ``model_dump_json()``.
        Anything else responds with 404.

    Each connection serves exactly one request and is then closed.
    Malformed requests get a 400/413 JSON error and unexpected failures get
    a 500 (details are logged to stderr, not sent to the client). If the
    client stalls or disconnects before a full request arrives, the
    connection is closed without a response.

    Args:
        args: Parsed namespace of the ``serve`` sub-command (``host``,
            ``port``, ``searxng_url``).
    """
    meta = BrowserGoat(searxng_url=args.searxng_url)

    # Upper bound on the accepted POST body so a bogus Content-Length cannot
    # make the server buffer arbitrary amounts of data.
    max_body_bytes = 1024 * 1024

    # Failures that mean the client never delivered a complete request.
    read_errors = (
        asyncio.TimeoutError,
        asyncio.IncompleteReadError,
        asyncio.LimitOverrunError,
        OSError,
    )

    def error_body(message: str) -> bytes:
        """Encode a ``{"error": message}`` JSON payload."""
        return json.dumps({"error": message}).encode()

    async def route(reader: asyncio.StreamReader) -> "tuple[str, bytes] | None":
        """Read one request and return ``(status line, body)``.

        Returns ``None`` when the client failed to send a complete request.
        """
        try:
            raw = await asyncio.wait_for(reader.readuntil(b"\r\n\r\n"), timeout=30)
        except read_errors:
            return None

        head_lines = raw.decode("utf-8", errors="replace").split("\r\n")
        # Padding guarantees the unpacking works for a short request line.
        method, path, *_ = head_lines[0].split(" ") + ["", ""]

        if method == "GET" and path in ("/health", "/"):
            return "200 OK", b'{"status":"ok"}'

        if method == "POST" and path == "/search":
            content_length = 0
            for line in head_lines[1:]:
                name, _, value = line.partition(":")
                if name.strip().lower() != "content-length":
                    continue
                try:
                    content_length = int(value.strip())
                except ValueError:
                    return "400 Bad Request", error_body("invalid Content-Length")
            if content_length < 0:
                return "400 Bad Request", error_body("invalid Content-Length")
            if content_length > max_body_bytes:
                return "413 Payload Too Large", error_body("request body too large")

            try:
                body_raw = await asyncio.wait_for(
                    reader.readexactly(content_length), timeout=5
                )
            except read_errors:
                return None

            try:
                params = json.loads(body_raw)
            except ValueError:  # covers JSONDecodeError and bad UTF-8
                return "400 Bad Request", error_body("request body is not valid JSON")
            if not isinstance(params, dict):
                return "400 Bad Request", error_body("request body must be a JSON object")

            result = await meta.search(
                query=params.get("query", ""),
                time_range=params.get("time_range"),
                max_sources=params.get("max_sources", 15),
                strategy=params.get("strategy", "default"),
                reliability_mode=params.get("reliability", "standard"),
            )
            return "200 OK", result.model_dump_json().encode()

        return "404 Not Found", b'{"error":"not found"}'

    async def handle(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
        """Serve a single request on the connection, then close it."""
        try:
            try:
                response = await route(reader)
            except Exception as exc:
                # Report the failure instead of silently dropping the request.
                print(
                    json.dumps({"error": f"request failed: {exc}"}),
                    file=sys.stderr,
                )
                response = (
                    "500 Internal Server Error",
                    error_body("internal server error"),
                )

            if response is None:
                return

            status, body = response
            writer.write(
                b"HTTP/1.1 " + status.encode("ascii") + b"\r\n"
                b"Content-Type: application/json\r\n"
                b"Content-Length: " + str(len(body)).encode() + b"\r\n\r\n" + body
            )
            await writer.drain()
        except OSError:
            # The client went away while we were responding; nothing to do.
            pass
        finally:
            writer.close()
            try:
                await writer.wait_closed()
            except OSError:
                # A reset connection must not surface as a task exception.
                pass

    server = await asyncio.start_server(handle, host=args.host, port=args.port)
    print(f"browser-goat API listening on http://{args.host}:{args.port}", file=sys.stderr)
    async with server:
        await server.serve_forever()
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def main() -> None:
    """Parse the command line and run the selected sub-command.

    Any ``Exception`` raised while running the command is reported on stderr
    as a JSON ``{"error": ...}`` object and the process exits with status 1.
    """
    args = build_parser().parse_args()

    try:
        command_handlers = {
            "search": cmd_search,
            "extract": cmd_extract,
            "verify": cmd_verify,
            "serve": cmd_serve,
        }
        run_command = command_handlers.get(args.command)
        if run_command is not None:
            asyncio.run(run_command(args))
    except Exception as exc:
        print(json.dumps({"error": str(exc)}), file=sys.stderr)
        sys.exit(1)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Extraction layer: 7-tier content extraction, goal-oriented extraction, Scrapling anti-bot."""
|
|
2
|
+
|
|
3
|
+
from browser_goat.extraction.content_extractor import ContentExtractor
|
|
4
|
+
from browser_goat.extraction.goal_oriented import GoalOrientedExtractor
|
|
5
|
+
from browser_goat.extraction.scrapling_fetcher import ScraplingFetcher
|
|
6
|
+
|
|
7
|
+
# Public API of the extraction layer (declared once).
__all__ = ["ContentExtractor", "GoalOrientedExtractor", "ScraplingFetcher"]
|