browser-goat 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. browser_goat/__init__.py +15 -0
  2. browser_goat/cli.py +277 -0
  3. browser_goat/extraction/__init__.py +9 -0
  4. browser_goat/extraction/content_extractor.py +415 -0
  5. browser_goat/extraction/goal_oriented.py +175 -0
  6. browser_goat/extraction/scrapling_fetcher.py +237 -0
  7. browser_goat/mcp_server.py +132 -0
  8. browser_goat/models.py +320 -0
  9. browser_goat/post_search/__init__.py +17 -0
  10. browser_goat/post_search/ranking.py +395 -0
  11. browser_goat/post_search/url_pipeline.py +185 -0
  12. browser_goat/pre_search/__init__.py +16 -0
  13. browser_goat/pre_search/browser_profiles.py +224 -0
  14. browser_goat/pre_search/language_detect.py +76 -0
  15. browser_goat/pre_search/query_intel.py +387 -0
  16. browser_goat/reliability/__init__.py +15 -0
  17. browser_goat/reliability/force_answer.py +77 -0
  18. browser_goat/reliability/give_up_detector.py +110 -0
  19. browser_goat/reliability/quality_gate.py +64 -0
  20. browser_goat/router.py +457 -0
  21. browser_goat/searxng_client.py +200 -0
  22. browser_goat/strategy/__init__.py +7 -0
  23. browser_goat/strategy/adaptive_explorer.py +466 -0
  24. browser_goat/strategy/query_classifier.py +383 -0
  25. browser_goat/strategy/recursive_decomposer.py +380 -0
  26. browser_goat/verification/__init__.py +7 -0
  27. browser_goat/verification/answer_voter.py +146 -0
  28. browser_goat/verification/llm_verifier.py +263 -0
  29. browser_goat/verification/multi_rollout.py +206 -0
  30. browser_goat-0.1.0.dist-info/METADATA +24 -0
  31. browser_goat-0.1.0.dist-info/RECORD +35 -0
  32. browser_goat-0.1.0.dist-info/WHEEL +5 -0
  33. browser_goat-0.1.0.dist-info/entry_points.txt +3 -0
  34. browser_goat-0.1.0.dist-info/licenses/LICENSE +21 -0
  35. browser_goat-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,15 @@
1
+ """browser-goat — Meta-layer search intelligence wrapping SearXNG.
2
+
3
+ Layers:
4
+ pre_search — Query intelligence, language detection, browser profiles
5
+ post_search — URL pipeline, RRF+BM25+MMR ranking, dedup
6
+ extraction — Content extraction, goal-oriented, Scrapling anti-bot
7
+ reliability — Give-up detection, quality gating, force answer
8
+ strategy — Query classification, adaptive exploration (Phase 2)
9
+ verification — Multi-rollout voting (Phase 3)
10
+ """
11
+
12
+ from browser_goat.router import BrowserGoat
13
+
14
+ __version__ = "0.1.0"
15
+ __all__ = ["BrowserGoat"]
browser_goat/cli.py ADDED
@@ -0,0 +1,277 @@
1
+ """CLI entry point for browser-goat.
2
+
3
+ Usage:
4
+ browser-goat search "What is Python?" --searxng-url http://localhost:8080
5
+ browser-goat search "latest AI news" --time-range week --strategy explore
6
+ browser-goat search "Python vs Rust" --reliability high
7
+ uvx browser-goat search "quantum computing research"
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import argparse
13
+ import asyncio
14
+ import json
15
+ import sys
16
+ from typing import Any
17
+
18
+ from browser_goat.router import BrowserGoat
19
+
20
+
21
def _add_searxng_url_option(command: argparse.ArgumentParser) -> None:
    """Attach the ``--searxng-url`` option shared by every subcommand."""
    command.add_argument(
        "--searxng-url",
        default="http://localhost:8080",
        help="SearXNG instance URL (default: http://localhost:8080)",
    )


def _add_format_option(command: argparse.ArgumentParser) -> None:
    """Attach the ``--format`` option shared by the result-printing subcommands."""
    command.add_argument(
        "--format",
        choices=["json", "pretty"],
        default="json",
        help="Output format (default: json)",
    )


def build_parser() -> argparse.ArgumentParser:
    """Build the ``browser-goat`` command-line parser.

    The chosen subcommand is stored in ``args.command`` and is required.
    Four subcommands are registered:

    * ``search``  -- positional ``query`` plus ``--searxng-url``, ``--engines``,
      ``--time-range``, ``--language``, ``--max-sources``, ``--strategy``,
      ``--reliability`` and ``--format``.
    * ``extract`` -- positional ``url`` plus ``--searxng-url`` and ``--format``.
    * ``verify``  -- positional ``query`` plus ``--searxng-url``, ``--rollouts``
      and ``--format``.
    * ``serve``   -- ``--host``, ``--port`` and ``--searxng-url``.

    Returns:
        The fully configured :class:`argparse.ArgumentParser`.
    """
    parser = argparse.ArgumentParser(
        prog="browser-goat",
        description="Meta-layer search intelligence wrapping SearXNG",
    )
    sub = parser.add_subparsers(dest="command", required=True)

    # ── search ──
    search = sub.add_parser("search", help="Run a full search pipeline")
    search.add_argument("query", help="Search query string")
    _add_searxng_url_option(search)
    search.add_argument(
        "--engines",
        nargs="*",
        default=None,
        help="SearXNG engines to use (e.g. google bing scholar)",
    )
    search.add_argument(
        "--time-range",
        choices=["day", "week", "month", "year"],
        default=None,
        help="Time filter for results",
    )
    search.add_argument(
        "--language",
        default="en",
        help="Language code for results (default: en)",
    )
    search.add_argument(
        "--max-sources",
        type=int,
        default=15,
        help="Maximum sources to extract (default: 15)",
    )
    search.add_argument(
        "--strategy",
        choices=["default", "auto", "explore", "decompose"],
        default="default",
        help="Search strategy (default: default)",
    )
    search.add_argument(
        "--reliability",
        choices=["standard", "high", "maximum"],
        default="standard",
        help="Reliability mode (default: standard)",
    )
    _add_format_option(search)

    # ── extract ──
    extract = sub.add_parser("extract", help="Extract content from a URL")
    extract.add_argument("url", help="URL to extract content from")
    _add_searxng_url_option(extract)
    _add_format_option(extract)

    # ── verify ──
    verify = sub.add_parser("verify", help="Verify an answer via multi-rollout voting")
    verify.add_argument("query", help="The query to verify")
    _add_searxng_url_option(verify)
    verify.add_argument(
        "--rollouts",
        type=int,
        default=5,
        help="Number of parallel rollouts (default: 5)",
    )
    _add_format_option(verify)

    # ── serve ──
    serve = sub.add_parser("serve", help="Run browser-goat as an HTTP JSON API server")
    serve.add_argument(
        "--host",
        default="0.0.0.0",
        help="Host to bind to (default: 0.0.0.0)",
    )
    serve.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port to listen on (default: 8000)",
    )
    _add_searxng_url_option(serve)

    return parser
134
+
135
+
136
def format_output(data: Any, fmt: str) -> str:
    """Serialize *data* to a JSON string for printing.

    Objects exposing ``model_dump`` (presumably the pydantic models from
    ``browser_goat.models`` -- confirm) are serialized through their own
    ``model_dump_json`` so that both formats agree on how non-JSON-native
    field values (dates, sets, enums, ...) are rendered. Anything else is
    passed straight to :func:`json.dumps`.

    Args:
        data: A model-like object or any JSON-serializable value.
        fmt: ``"pretty"`` for 2-space indented output; any other value
            yields the compact form.

    Returns:
        The JSON text, with non-ASCII characters left unescaped.
    """
    is_model = hasattr(data, "model_dump")

    if fmt != "pretty":
        if is_model:
            return str(data.model_dump_json())
        return json.dumps(data, ensure_ascii=False)

    if is_model:
        if hasattr(data, "model_dump_json"):
            # Round-trip through the model's JSON form: a plain model_dump()
            # may contain Python-only values that json.dumps() rejects.
            data = json.loads(data.model_dump_json())
        else:
            data = data.model_dump()
    return json.dumps(data, indent=2, ensure_ascii=False)
145
+
146
+
147
async def cmd_search(args: argparse.Namespace) -> None:
    """Handle the ``search`` subcommand.

    Creates a ``BrowserGoat`` bound to ``args.searxng_url``, forwards the
    parsed search options to its ``search`` coroutine, and prints the result
    rendered in ``args.format``.
    """
    goat = BrowserGoat(searxng_url=args.searxng_url)
    search_options = {
        "query": args.query,
        "engines": args.engines,
        "time_range": args.time_range,
        "language": args.language,
        "max_sources": args.max_sources,
        "strategy": args.strategy,
        # The CLI flag is --reliability; the router keyword is reliability_mode.
        "reliability_mode": args.reliability,
    }
    result = await goat.search(**search_options)
    print(format_output(result, args.format))
160
+
161
+
162
async def cmd_extract(args: argparse.Namespace) -> None:
    """Handle the ``extract`` subcommand.

    Fetches ``args.url`` with the Scrapling fetcher using a randomly chosen
    browser profile, runs the content extractor over the returned HTML, and
    prints a summary (URL, title, text preview, extraction tier, full text
    length) in ``args.format``.

    On a failed fetch a JSON error object is written to stderr and the
    process exits with status 1.
    """
    goat = BrowserGoat(searxng_url=args.searxng_url)
    fetcher = goat.scrapling
    extractor = goat.content_extractor
    profile = goat.browser_profiles.get_random_profile()

    fetched = await fetcher.fetch(args.url, profile)
    if not fetched.success:
        failure = {"error": fetched.error or "fetch failed"}
        print(json.dumps(failure), file=sys.stderr)
        sys.exit(1)

    content = extractor.extract(fetched.html, args.url)

    # Only the first 1000 characters are shown; text_length reports the real size.
    if len(content.text) > 1000:
        preview = content.text[:1000] + "..."
    else:
        preview = content.text

    summary = {
        "url": args.url,
        "title": content.title,
        "text": preview,
        "extraction_tier": content.extraction_tier,
        "text_length": len(content.text),
    }
    print(format_output(summary, args.format))
186
+
187
+
188
async def cmd_verify(args: argparse.Namespace) -> None:
    """Handle the ``verify`` subcommand.

    Runs a search for ``args.query`` with an elevated reliability mode and
    prints the result in ``args.format``. ``args.rollouts`` is not forwarded
    as a count; it only picks the mode: ``"high"`` for 5 or fewer,
    ``"maximum"`` otherwise.
    """
    goat = BrowserGoat(searxng_url=args.searxng_url)

    if args.rollouts <= 5:
        mode = "high"
    else:
        mode = "maximum"

    result = await goat.search(query=args.query, reliability_mode=mode)
    print(format_output(result, args.format))
196
+
197
+
198
# Upper bound on an accepted POST body; Content-Length is client-controlled.
_MAX_REQUEST_BODY_BYTES = 1024 * 1024


def _json_response(status: str, body: bytes) -> bytes:
    """Return a complete HTTP/1.1 response with *body* as its JSON payload."""
    head = (
        f"HTTP/1.1 {status}\r\n"
        "Content-Type: application/json\r\n"
        f"Content-Length: {len(body)}\r\n"
        # Every connection is closed after one response, so say so.
        "Connection: close\r\n"
        "\r\n"
    )
    return head.encode("ascii") + body


def _error_response(status: str, message: str) -> bytes:
    """Return a JSON ``{"error": message}`` response with the given status."""
    payload = json.dumps({"error": message}, separators=(",", ":")).encode()
    return _json_response(status, payload)


def _parse_content_length(header_lines: list[str]) -> int:
    """Return the Content-Length header value, or 0 if the header is absent.

    Raises:
        ValueError: If the header value is not an integer.
    """
    for line in header_lines:
        name, sep, value = line.partition(":")
        if sep and name.strip().lower() == "content-length":
            return int(value.strip())
    return 0


async def cmd_serve(args: argparse.Namespace) -> None:
    """Run browser-goat as a minimal HTTP JSON API server (zero extra deps).

    Endpoints (one request per connection):

    * ``GET /`` or ``GET /health`` -- ``{"status":"ok"}``.
    * ``POST /search`` -- JSON object body with ``query`` and optional
      ``time_range``, ``max_sources``, ``strategy``, ``reliability``; responds
      with the search result serialized via ``model_dump_json``.
    * anything else -- 404.

    Malformed requests get a 4xx JSON error, and unexpected failures are
    logged to stderr and answered with a 500, instead of the connection
    being closed without a response.

    Args:
        args: Parsed CLI namespace providing ``host``, ``port`` and
            ``searxng_url``.
    """
    meta = BrowserGoat(searxng_url=args.searxng_url)

    async def respond(reader: asyncio.StreamReader) -> bytes:
        """Read one request from *reader* and return the response bytes."""
        raw = await asyncio.wait_for(reader.readuntil(b"\r\n\r\n"), timeout=30)
        request_line, *header_lines = raw.decode("utf-8", errors="replace").split("\r\n")
        # Padding guarantees the unpacking works for a malformed request line.
        method, path, *_ = request_line.split(" ") + ["", ""]

        if method == "GET" and path in ("/health", "/"):
            return _json_response("200 OK", b'{"status":"ok"}')

        if method == "POST" and path == "/search":
            try:
                content_length = _parse_content_length(header_lines)
            except ValueError:
                return _error_response("400 Bad Request", "invalid Content-Length")
            if content_length < 0:
                return _error_response("400 Bad Request", "invalid Content-Length")
            if content_length > _MAX_REQUEST_BODY_BYTES:
                return _error_response("413 Payload Too Large", "request body too large")

            body_raw = await asyncio.wait_for(reader.readexactly(content_length), timeout=5)
            try:
                params = json.loads(body_raw)
            except ValueError:  # JSONDecodeError and UnicodeDecodeError both derive from it
                return _error_response("400 Bad Request", "invalid JSON body")
            if not isinstance(params, dict):
                return _error_response("400 Bad Request", "request body must be a JSON object")

            result = await meta.search(
                query=params.get("query", ""),
                time_range=params.get("time_range"),
                max_sources=params.get("max_sources", 15),
                strategy=params.get("strategy", "default"),
                reliability_mode=params.get("reliability", "standard"),
            )
            return _json_response("200 OK", result.model_dump_json().encode())

        return _error_response("404 Not Found", "not found")

    async def handle(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
        """Serve a single connection and always close it afterwards."""
        try:
            try:
                response = await respond(reader)
            except (asyncio.IncompleteReadError, ConnectionError):
                # The peer went away mid-request; there is nobody to answer.
                return
            except asyncio.TimeoutError:
                response = _error_response("408 Request Timeout", "request timeout")
            except asyncio.LimitOverrunError:
                response = _error_response(
                    "431 Request Header Fields Too Large", "request headers too large"
                )
            except Exception as exc:
                # Per-connection boundary: log the cause, keep the server alive.
                print(f"browser-goat API error: {exc!r}", file=sys.stderr)
                response = _error_response("500 Internal Server Error", "internal server error")
            writer.write(response)
            await writer.drain()
        except ConnectionError:
            # The client disconnected while the response was being sent.
            pass
        finally:
            writer.close()
            try:
                await writer.wait_closed()
            except OSError:
                # The transport is already broken; nothing left to clean up.
                pass

    server = await asyncio.start_server(handle, host=args.host, port=args.port)
    print(f"browser-goat API listening on http://{args.host}:{args.port}", file=sys.stderr)
    async with server:
        await server.serve_forever()
256
+
257
+
258
def main() -> None:
    """Parse the command line and run the selected subcommand.

    Any exception raised by a subcommand is reported on stderr as a JSON
    ``{"error": ...}`` object and the process exits with status 1.
    Interrupting with Ctrl-C (the normal way to stop ``serve``) exits with
    status 130 instead of printing a traceback.
    """
    parser = build_parser()
    args = parser.parse_args()

    commands = {
        "search": cmd_search,
        "extract": cmd_extract,
        "verify": cmd_verify,
        "serve": cmd_serve,
    }
    command = commands.get(args.command)
    if command is None:
        # Unreachable while the subparser is required; mirrors the old
        # if/elif chain, which did nothing for an unknown command.
        return

    try:
        asyncio.run(command(args))
    except KeyboardInterrupt:
        sys.exit(130)  # conventional 128 + SIGINT
    except Exception as e:
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
@@ -0,0 +1,9 @@
"""Extraction layer: 7-tier content extraction, goal-oriented extraction, Scrapling anti-bot."""

from browser_goat.extraction.content_extractor import ContentExtractor
from browser_goat.extraction.goal_oriented import GoalOrientedExtractor
from browser_goat.extraction.scrapling_fetcher import ScraplingFetcher

# Public API of the extraction layer (previously assigned twice with the same value).
__all__ = ["ContentExtractor", "GoalOrientedExtractor", "ScraplingFetcher"]