browser-goat 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- browser_goat-0.1.0/LICENSE +21 -0
- browser_goat-0.1.0/PKG-INFO +24 -0
- browser_goat-0.1.0/README.md +177 -0
- browser_goat-0.1.0/pyproject.toml +65 -0
- browser_goat-0.1.0/setup.cfg +4 -0
- browser_goat-0.1.0/src/browser_goat/__init__.py +15 -0
- browser_goat-0.1.0/src/browser_goat/cli.py +277 -0
- browser_goat-0.1.0/src/browser_goat/extraction/__init__.py +9 -0
- browser_goat-0.1.0/src/browser_goat/extraction/content_extractor.py +415 -0
- browser_goat-0.1.0/src/browser_goat/extraction/goal_oriented.py +175 -0
- browser_goat-0.1.0/src/browser_goat/extraction/scrapling_fetcher.py +237 -0
- browser_goat-0.1.0/src/browser_goat/mcp_server.py +132 -0
- browser_goat-0.1.0/src/browser_goat/models.py +320 -0
- browser_goat-0.1.0/src/browser_goat/post_search/__init__.py +17 -0
- browser_goat-0.1.0/src/browser_goat/post_search/ranking.py +395 -0
- browser_goat-0.1.0/src/browser_goat/post_search/url_pipeline.py +185 -0
- browser_goat-0.1.0/src/browser_goat/pre_search/__init__.py +16 -0
- browser_goat-0.1.0/src/browser_goat/pre_search/browser_profiles.py +224 -0
- browser_goat-0.1.0/src/browser_goat/pre_search/language_detect.py +76 -0
- browser_goat-0.1.0/src/browser_goat/pre_search/query_intel.py +387 -0
- browser_goat-0.1.0/src/browser_goat/reliability/__init__.py +15 -0
- browser_goat-0.1.0/src/browser_goat/reliability/force_answer.py +77 -0
- browser_goat-0.1.0/src/browser_goat/reliability/give_up_detector.py +110 -0
- browser_goat-0.1.0/src/browser_goat/reliability/quality_gate.py +64 -0
- browser_goat-0.1.0/src/browser_goat/router.py +457 -0
- browser_goat-0.1.0/src/browser_goat/searxng_client.py +200 -0
- browser_goat-0.1.0/src/browser_goat/strategy/__init__.py +7 -0
- browser_goat-0.1.0/src/browser_goat/strategy/adaptive_explorer.py +466 -0
- browser_goat-0.1.0/src/browser_goat/strategy/query_classifier.py +383 -0
- browser_goat-0.1.0/src/browser_goat/strategy/recursive_decomposer.py +380 -0
- browser_goat-0.1.0/src/browser_goat/verification/__init__.py +7 -0
- browser_goat-0.1.0/src/browser_goat/verification/answer_voter.py +146 -0
- browser_goat-0.1.0/src/browser_goat/verification/llm_verifier.py +263 -0
- browser_goat-0.1.0/src/browser_goat/verification/multi_rollout.py +206 -0
- browser_goat-0.1.0/src/browser_goat.egg-info/PKG-INFO +24 -0
- browser_goat-0.1.0/src/browser_goat.egg-info/SOURCES.txt +41 -0
- browser_goat-0.1.0/src/browser_goat.egg-info/dependency_links.txt +1 -0
- browser_goat-0.1.0/src/browser_goat.egg-info/entry_points.txt +3 -0
- browser_goat-0.1.0/src/browser_goat.egg-info/requires.txt +17 -0
- browser_goat-0.1.0/src/browser_goat.egg-info/top_level.txt +1 -0
- browser_goat-0.1.0/tests/test_models.py +278 -0
- browser_goat-0.1.0/tests/test_router.py +377 -0
- browser_goat-0.1.0/tests/test_searxng_client.py +132 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 browser-goat
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: browser-goat
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Meta-layer search intelligence wrapping SearXNG with pre/post processing, extraction, strategy, and verification from SearchWala, local-deep-research, Marco-DeepResearch, Tongyi-DeepResearch, and Scrapling.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.13
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Requires-Dist: httpx[http2]>=0.28
|
|
9
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
10
|
+
Requires-Dist: lxml>=5.3
|
|
11
|
+
Requires-Dist: pydantic>=2.10
|
|
12
|
+
Requires-Dist: tenacity>=9.0
|
|
13
|
+
Requires-Dist: tiktoken>=0.8
|
|
14
|
+
Requires-Dist: playwright>=1.49
|
|
15
|
+
Requires-Dist: scrapling>=0.2
|
|
16
|
+
Requires-Dist: mcp>=1.0
|
|
17
|
+
Provides-Extra: dev
|
|
18
|
+
Requires-Dist: pytest>=8.3; extra == "dev"
|
|
19
|
+
Requires-Dist: pytest-asyncio>=0.25; extra == "dev"
|
|
20
|
+
Requires-Dist: pytest-cov>=6.0; extra == "dev"
|
|
21
|
+
Requires-Dist: ruff>=0.8; extra == "dev"
|
|
22
|
+
Requires-Dist: mypy>=1.13; extra == "dev"
|
|
23
|
+
Requires-Dist: bandit>=1.8; extra == "dev"
|
|
24
|
+
Dynamic: license-file
|
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
# browser-goat
|
|
2
|
+
|
|
3
|
+
[](https://github.com/Im-Busy/browser-goat)
|
|
4
|
+
[](https://python.org)
|
|
5
|
+
[](LICENSE)
|
|
6
|
+
|
|
7
|
+
> Meta-layer search intelligence wrapping SearXNG — Tavily-quality results on your own infrastructure.
|
|
8
|
+
|
|
9
|
+
browser-goat adds six processing layers to [SearXNG](https://docs.searxng.org/): intent detection, hybrid ranking, anti-bot content extraction, reliability gating, adaptive strategy, and multi-rollout verification. The result is agent-ready search output competitive with commercial APIs — running entirely on your own infrastructure.
|
|
10
|
+
|
|
11
|
+
```mermaid
|
|
12
|
+
flowchart TD
|
|
13
|
+
Q["Query"] --> L1
|
|
14
|
+
|
|
15
|
+
subgraph L1["1. Pre-Search"]
|
|
16
|
+
A["Intent detection<br/>Browser profiles<br/>Language detection"]
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
L1 --> SX["SearXNG Engine"]
|
|
20
|
+
|
|
21
|
+
SX --> L2
|
|
22
|
+
subgraph L2["2. Post-Search"]
|
|
23
|
+
B["URL normalization<br/>RRF + BM25 + MMR"]
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
L2 --> L3
|
|
27
|
+
subgraph L3["3. Extraction"]
|
|
28
|
+
C["7-tier cascading<br/>Anti-bot bypass<br/>Goal-oriented summary"]
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
L3 --> L4
|
|
32
|
+
subgraph L4["4. Reliability"]
|
|
33
|
+
D["Give-up detection<br/>Quality-gated retry<br/>Force synthesis"]
|
|
34
|
+
end
|
|
35
|
+
|
|
36
|
+
L4 --> L5
|
|
37
|
+
subgraph L5["5. Strategy"]
|
|
38
|
+
E["Query classification<br/>Adaptive exploration<br/>Recursive decomposition"]
|
|
39
|
+
end
|
|
40
|
+
|
|
41
|
+
L5 --> L6
|
|
42
|
+
subgraph L6["6. Verification"]
|
|
43
|
+
F["Multi-rollout voting<br/>Consensus verification<br/>LLM tie-breaking"]
|
|
44
|
+
end
|
|
45
|
+
|
|
46
|
+
L6 --> ANS["Answer"]
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Quick Start
|
|
52
|
+
|
|
53
|
+
### MCP (AI Agents)
|
|
54
|
+
|
|
55
|
+
```json
|
|
56
|
+
{
|
|
57
|
+
"mcpServers": {
|
|
58
|
+
"browser-goat": {
|
|
59
|
+
"command": "npx",
|
|
60
|
+
"args": ["browser-goat"],
|
|
61
|
+
"env": { "SEARXNG_URL": "http://localhost:8080" }
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Requires Python 3.13+ and a running SearXNG instance.
|
|
68
|
+
|
|
69
|
+
### CLI
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
uvx browser-goat search "latest AI research"
|
|
73
|
+
uvx browser-goat search "Python vs Rust" --strategy explore
|
|
74
|
+
uvx browser-goat extract "https://example.com/article"
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### Library
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install browser-goat
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from browser_goat import BrowserGoat
|
|
85
|
+
|
|
86
|
+
meta = BrowserGoat(searxng_url="http://localhost:8080")
|
|
87
|
+
result = await meta.search("quantum computing")
|
|
88
|
+
print(result.answer)
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
---
|
|
92
|
+
|
|
93
|
+
## MCP Tools
|
|
94
|
+
|
|
95
|
+
| Tool | Description |
|
|
96
|
+
|------|-------------|
|
|
97
|
+
| `search` | Full pipeline: intent analysis → SearXNG → ranking → extraction → reliability. Supports `time_range` (day/week/month/year), `max_sources`, and `strategy` (default/auto/explore/decompose). |
|
|
98
|
+
| `extract` | Fetch and extract a single URL with anti-bot bypass (Cloudflare Turnstile). Returns title, clean text, and extraction tier. |
|
|
99
|
+
|
|
100
|
+
---
|
|
101
|
+
|
|
102
|
+
## Client Configuration
|
|
103
|
+
|
|
104
|
+
### Claude Desktop
|
|
105
|
+
|
|
106
|
+
```json
|
|
107
|
+
{
|
|
108
|
+
"mcpServers": {
|
|
109
|
+
"browser-goat": {
|
|
110
|
+
"command": "uvx",
|
|
111
|
+
"args": ["browser-goat-mcp", "--searxng-url", "http://localhost:8080"]
|
|
112
|
+
}
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Cursor / VS Code
|
|
118
|
+
|
|
119
|
+
```json
|
|
120
|
+
{
|
|
121
|
+
"mcpServers": {
|
|
122
|
+
"browser-goat": {
|
|
123
|
+
"command": "npx",
|
|
124
|
+
"args": ["browser-goat"],
|
|
125
|
+
"env": { "SEARXNG_URL": "http://localhost:8080" }
|
|
126
|
+
}
|
|
127
|
+
}
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
---
|
|
132
|
+
|
|
133
|
+
## Docker
|
|
134
|
+
|
|
135
|
+
Bundled SearXNG + Redis sidecar deployment:
|
|
136
|
+
|
|
137
|
+
```bash
|
|
138
|
+
docker compose up
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
SearXNG starts at `localhost:8080`, browser-goat API at `localhost:8000`.
|
|
142
|
+
|
|
143
|
+
```bash
|
|
144
|
+
docker exec browser-goat uv run browser-goat search "your query"
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
---
|
|
148
|
+
|
|
149
|
+
## How It Works
|
|
150
|
+
|
|
151
|
+
Each search passes through six layers before returning an answer. The diagram above shows the full pipeline. Layers 1-4 run on every query; Layers 5-6 activate when `--strategy` or `--reliability` are set.
|
|
152
|
+
|
|
153
|
+
---
|
|
154
|
+
|
|
155
|
+
## Development
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
git clone https://github.com/Im-Busy/browser-goat.git
|
|
159
|
+
cd browser-goat
|
|
160
|
+
uv sync
|
|
161
|
+
|
|
162
|
+
uv run pytest # 304 tests (287 unit + 17 integration)
|
|
163
|
+
uv run ruff check src/ tests/ # zero violations
|
|
164
|
+
uv run mypy src/ # zero errors
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
Integration tests require a running SearXNG instance at `localhost:8080`. To skip them:
|
|
168
|
+
|
|
169
|
+
```bash
|
|
170
|
+
uv run pytest -m "not integration"
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## License
|
|
176
|
+
|
|
177
|
+
MIT
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "browser-goat"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Meta-layer search intelligence wrapping SearXNG with pre/post processing, extraction, strategy, and verification from SearchWala, local-deep-research, Marco-DeepResearch, Tongyi-DeepResearch, and Scrapling."
|
|
5
|
+
requires-python = ">=3.13"
|
|
6
|
+
license = "MIT"
|
|
7
|
+
license-files = ["LICENSE"]
|
|
8
|
+
dependencies = [
|
|
9
|
+
"httpx[http2]>=0.28",
|
|
10
|
+
"beautifulsoup4>=4.12",
|
|
11
|
+
"lxml>=5.3",
|
|
12
|
+
"pydantic>=2.10",
|
|
13
|
+
"tenacity>=9.0",
|
|
14
|
+
"tiktoken>=0.8",
|
|
15
|
+
"playwright>=1.49",
|
|
16
|
+
"scrapling>=0.2",
|
|
17
|
+
"mcp>=1.0",
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.scripts]
|
|
21
|
+
browser-goat = "browser_goat.cli:main"
|
|
22
|
+
browser-goat-mcp = "browser_goat.mcp_server:main"
|
|
23
|
+
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
dev = [
|
|
26
|
+
"pytest>=8.3",
|
|
27
|
+
"pytest-asyncio>=0.25",
|
|
28
|
+
"pytest-cov>=6.0",
|
|
29
|
+
"ruff>=0.8",
|
|
30
|
+
"mypy>=1.13",
|
|
31
|
+
"bandit>=1.8",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[tool.ruff]
|
|
35
|
+
line-length = 100
|
|
36
|
+
target-version = "py313"
|
|
37
|
+
|
|
38
|
+
[tool.ruff.lint]
|
|
39
|
+
select = ["E", "F", "I", "N", "W", "UP", "B", "C4", "SIM"]
|
|
40
|
+
ignore = ["E501"]
|
|
41
|
+
|
|
42
|
+
[tool.mypy]
|
|
43
|
+
python_version = "3.13"
|
|
44
|
+
strict = true
|
|
45
|
+
warn_unreachable = true
|
|
46
|
+
warn_unused_ignores = true
|
|
47
|
+
|
|
48
|
+
[tool.pytest.ini_options]
|
|
49
|
+
asyncio_mode = "auto"
|
|
50
|
+
testpaths = ["tests"]
|
|
51
|
+
markers = [
|
|
52
|
+
"integration: end-to-end tests requiring a running SearXNG instance",
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
[tool.uv]
|
|
56
|
+
package = true
|
|
57
|
+
|
|
58
|
+
[dependency-groups]
|
|
59
|
+
dev = [
|
|
60
|
+
"mypy>=2.1.0",
|
|
61
|
+
"pytest>=9.1.0",
|
|
62
|
+
"pytest-asyncio>=1.4.0",
|
|
63
|
+
"pytest-cov>=7.1.0",
|
|
64
|
+
"ruff>=0.15.17",
|
|
65
|
+
]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""browser-goat — Meta-layer search intelligence wrapping SearXNG.
|
|
2
|
+
|
|
3
|
+
Layers:
|
|
4
|
+
pre_search — Query intelligence, language detection, browser profiles
|
|
5
|
+
post_search — URL pipeline, RRF+BM25+MMR ranking, dedup
|
|
6
|
+
extraction — Content extraction, goal-oriented, Scrapling anti-bot
|
|
7
|
+
reliability — Give-up detection, quality gating, force answer
|
|
8
|
+
strategy — Query classification, adaptive exploration (Phase 2)
|
|
9
|
+
verification — Multi-rollout voting (Phase 3)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from browser_goat.router import BrowserGoat
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0"
|
|
15
|
+
__all__ = ["BrowserGoat"]
|
|
@@ -0,0 +1,277 @@
|
|
|
1
|
+
"""CLI entry point for browser-goat.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
browser-goat search "What is Python?" --searxng-url http://localhost:8080
|
|
5
|
+
browser-goat search "latest AI news" --time-range week --strategy explore
|
|
6
|
+
browser-goat search "Python vs Rust" --reliability high
|
|
7
|
+
uvx browser-goat search "quantum computing research"
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import argparse
|
|
13
|
+
import asyncio
|
|
14
|
+
import json
|
|
15
|
+
import sys
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from browser_goat.router import BrowserGoat
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``browser-goat`` command-line parser.

    A subcommand is mandatory and its name is stored in ``args.command``.
    Four subcommands are registered: ``search``, ``extract``, ``verify`` and
    ``serve``. Each subcommand's arguments are declared as an ordered table of
    ``(flag, add_argument keyword arguments)`` pairs so that the order shown
    in ``--help`` follows the table order.

    Returns:
        The fully configured ``argparse.ArgumentParser``.
    """
    searxng_default = "http://localhost:8080"

    def format_option() -> tuple[str, dict[str, Any]]:
        # Fresh dict/list per subcommand so no parser shares mutable state.
        return (
            "--format",
            {
                "choices": ["json", "pretty"],
                "default": "json",
                "help": "Output format (default: json)",
            },
        )

    command_table = [
        # ── search ──
        (
            "search",
            "Run a full search pipeline",
            [
                ("query", {"help": "Search query string"}),
                (
                    "--searxng-url",
                    {
                        "default": searxng_default,
                        "help": "SearXNG instance URL (default: http://localhost:8080)",
                    },
                ),
                (
                    "--engines",
                    {
                        "nargs": "*",
                        "default": None,
                        "help": "SearXNG engines to use (e.g. google bing scholar)",
                    },
                ),
                (
                    "--time-range",
                    {
                        "choices": ["day", "week", "month", "year"],
                        "default": None,
                        "help": "Time filter for results",
                    },
                ),
                (
                    "--language",
                    {
                        "default": "en",
                        "help": "Language code for results (default: en)",
                    },
                ),
                (
                    "--max-sources",
                    {
                        "type": int,
                        "default": 15,
                        "help": "Maximum sources to extract (default: 15)",
                    },
                ),
                (
                    "--strategy",
                    {
                        "choices": ["default", "auto", "explore", "decompose"],
                        "default": "default",
                        "help": "Search strategy (default: default)",
                    },
                ),
                (
                    "--reliability",
                    {
                        "choices": ["standard", "high", "maximum"],
                        "default": "standard",
                        "help": "Reliability mode (default: standard)",
                    },
                ),
                format_option(),
            ],
        ),
        # ── extract ──
        (
            "extract",
            "Extract content from a URL",
            [
                ("url", {"help": "URL to extract content from"}),
                (
                    "--searxng-url",
                    {"default": searxng_default, "help": "SearXNG instance URL"},
                ),
                format_option(),
            ],
        ),
        # ── verify ──
        (
            "verify",
            "Verify an answer via multi-rollout voting",
            [
                ("query", {"help": "The query to verify"}),
                (
                    "--searxng-url",
                    {"default": searxng_default, "help": "SearXNG instance URL"},
                ),
                (
                    "--rollouts",
                    {
                        "type": int,
                        "default": 5,
                        "help": "Number of parallel rollouts (default: 5)",
                    },
                ),
                format_option(),
            ],
        ),
        # ── serve ──
        (
            "serve",
            "Run browser-goat as an HTTP JSON API server",
            [
                (
                    "--host",
                    {
                        "default": "0.0.0.0",
                        "help": "Host to bind to (default: 0.0.0.0)",
                    },
                ),
                (
                    "--port",
                    {
                        "type": int,
                        "default": 8000,
                        "help": "Port to listen on (default: 8000)",
                    },
                ),
                (
                    "--searxng-url",
                    {
                        "default": searxng_default,
                        "help": "SearXNG instance URL (default: http://localhost:8080)",
                    },
                ),
            ],
        ),
    ]

    root = argparse.ArgumentParser(
        prog="browser-goat",
        description="Meta-layer search intelligence wrapping SearXNG",
    )
    commands = root.add_subparsers(dest="command", required=True)

    for command_name, command_help, arguments in command_table:
        command_parser = commands.add_parser(command_name, help=command_help)
        for flag, options in arguments:
            command_parser.add_argument(flag, **options)

    return root
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def format_output(data: Any, fmt: str) -> str:
    """Serialize *data* to a JSON string for printing.

    Args:
        data: Either an object exposing ``model_dump`` / ``model_dump_json``
            (treated as a pydantic-style model) or any value ``json.dumps``
            accepts.
        fmt: ``"pretty"`` for 2-space indented output; any other value
            produces compact single-line JSON.

    Returns:
        The JSON text. Non-ASCII characters are emitted as-is, not escaped.
    """
    pretty = fmt == "pretty"
    if hasattr(data, "model_dump"):
        if pretty:
            # mode="json" makes the model convert non-JSON-native values
            # (datetimes, enums, ...) itself, so the pretty path accepts the
            # same models as model_dump_json() below instead of raising
            # TypeError inside json.dumps.
            return json.dumps(data.model_dump(mode="json"), indent=2, ensure_ascii=False)
        return str(data.model_dump_json())
    return json.dumps(data, indent=2 if pretty else None, ensure_ascii=False)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
async def cmd_search(args: argparse.Namespace) -> None:
    """Handle the ``search`` subcommand.

    Builds a ``BrowserGoat`` for ``args.searxng_url``, forwards the parsed
    search options to ``BrowserGoat.search`` and prints the result to stdout
    in the format selected by ``args.format``.
    """
    goat = BrowserGoat(searxng_url=args.searxng_url)
    search_options = {
        "query": args.query,
        "engines": args.engines,
        "time_range": args.time_range,
        "language": args.language,
        "max_sources": args.max_sources,
        "strategy": args.strategy,
        # The CLI flag is --reliability; the pipeline keyword is reliability_mode.
        "reliability_mode": args.reliability,
    }
    outcome = await goat.search(**search_options)
    print(format_output(outcome, args.format))
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
async def cmd_extract(args: argparse.Namespace) -> None:
    """Handle the ``extract`` subcommand.

    Fetches ``args.url`` with the Scrapling fetcher using a random browser
    profile, runs the content extractor on the returned HTML and prints a
    summary (url, title, text preview, extraction tier, full text length).

    On fetch failure a JSON ``{"error": ...}`` object is written to stderr
    and the process exits with status 1.
    """
    goat = BrowserGoat(searxng_url=args.searxng_url)
    fetcher = goat.scrapling
    extractor = goat.content_extractor
    profile = goat.browser_profiles.get_random_profile()

    fetched = await fetcher.fetch(args.url, profile)
    if not fetched.success:
        reason = fetched.error or "fetch failed"
        print(json.dumps({"error": reason}), file=sys.stderr)
        sys.exit(1)

    page = extractor.extract(fetched.html, args.url)

    # Only the first 1000 characters are printed; text_length still reports
    # the size of the complete extracted text.
    if len(page.text) > 1000:
        preview = page.text[:1000] + "..."
    else:
        preview = page.text

    summary = {
        "url": args.url,
        "title": page.title,
        "text": preview,
        "extraction_tier": page.extraction_tier,
        "text_length": len(page.text),
    }
    print(format_output(summary, args.format))
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
async def cmd_verify(args: argparse.Namespace) -> None:
    """Handle the ``verify`` subcommand.

    Runs ``BrowserGoat.search`` for ``args.query`` with an elevated
    reliability mode and prints the result in the format selected by
    ``args.format``.
    """
    goat = BrowserGoat(searxng_url=args.searxng_url)

    # NOTE(review): --rollouts is only used to choose the reliability mode
    # (more than 5 -> "maximum", otherwise "high"); the number itself is not
    # passed to the pipeline here — confirm this is intended.
    if args.rollouts > 5:
        mode = "maximum"
    else:
        mode = "high"

    outcome = await goat.search(query=args.query, reliability_mode=mode)
    print(format_output(outcome, args.format))
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
async def cmd_serve(args: argparse.Namespace) -> None:
    """Run browser-goat as a minimal HTTP JSON API server (zero extra deps).

    Routes:
        GET /health, GET /  -> 200 ``{"status":"ok"}``
        POST /search        -> 200 with the search result serialized as JSON.
            The request body is a JSON object with ``query`` and optional
            ``time_range``, ``max_sources``, ``strategy`` and ``reliability``.
        anything else       -> 404 ``{"error":"not found"}``

    Malformed requests get a 400 (or 413 for an oversized body) and
    unexpected failures get a 500, each with a JSON ``{"error": ...}`` body,
    instead of the connection being dropped without a response. Every
    connection serves exactly one request and is then closed.

    Args:
        args: Parsed CLI arguments; reads ``host``, ``port`` and
            ``searxng_url``.
    """
    meta = BrowserGoat(searxng_url=args.searxng_url)
    # Upper bound on the POST body so a client cannot make the server buffer
    # an arbitrarily large payload.
    max_body_bytes = 1_048_576

    async def send(writer: asyncio.StreamWriter, status: str, body: bytes) -> None:
        """Write one complete JSON response and flush it."""
        writer.write(
            b"HTTP/1.1 " + status.encode() + b"\r\n"
            b"Content-Type: application/json\r\n"
            b"Connection: close\r\n"
            b"Content-Length: " + str(len(body)).encode() + b"\r\n\r\n" + body
        )
        await writer.drain()

    async def send_error(writer: asyncio.StreamWriter, status: str, message: str) -> None:
        """Write a JSON ``{"error": message}`` response with the given status."""
        await send(writer, status, json.dumps({"error": message}).encode())

    async def handle(reader: asyncio.StreamReader, writer: asyncio.StreamWriter) -> None:
        try:
            try:
                raw = await asyncio.wait_for(reader.readuntil(b"\r\n\r\n"), timeout=30)
            except (TimeoutError, asyncio.IncompleteReadError, asyncio.LimitOverrunError):
                # No complete request head arrived; nothing sensible to answer.
                return

            header_lines = raw.decode("utf-8", errors="replace").split("\r\n")
            # Pad so a short or garbled request line still unpacks.
            method, path, *_ = header_lines[0].split(" ") + ["", ""]

            if method == "GET" and path in ("/health", "/"):
                await send(writer, "200 OK", b'{"status":"ok"}')
                return

            if method == "POST" and path == "/search":
                content_length = 0
                for line in header_lines[1:]:
                    name, sep, value = line.partition(":")
                    if sep and name.lower() == "content-length":
                        try:
                            content_length = int(value.strip())
                        except ValueError:
                            content_length = -1
                if content_length < 0:
                    await send_error(writer, "400 Bad Request", "invalid Content-Length")
                    return
                if content_length > max_body_bytes:
                    await send_error(writer, "413 Payload Too Large", "request body too large")
                    return

                try:
                    body_raw = await asyncio.wait_for(
                        reader.readexactly(content_length), timeout=5
                    )
                except (TimeoutError, asyncio.IncompleteReadError):
                    await send_error(writer, "400 Bad Request", "incomplete request body")
                    return

                try:
                    params = json.loads(body_raw)
                except ValueError:
                    # Covers json.JSONDecodeError and undecodable bytes.
                    await send_error(writer, "400 Bad Request", "request body is not valid JSON")
                    return
                if not isinstance(params, dict):
                    await send_error(
                        writer, "400 Bad Request", "request body must be a JSON object"
                    )
                    return

                result = await meta.search(
                    query=params.get("query", ""),
                    time_range=params.get("time_range"),
                    max_sources=params.get("max_sources", 15),
                    strategy=params.get("strategy", "default"),
                    reliability_mode=params.get("reliability", "standard"),
                )
                await send(writer, "200 OK", result.model_dump_json().encode())
                return

            await send(writer, "404 Not Found", b'{"error":"not found"}')
        except ConnectionError:
            # Client went away mid-exchange; there is nobody left to answer.
            pass
        except Exception as exc:
            # Keep the server alive, but record the failure and tell the
            # client instead of silently closing the socket.
            print(f"browser-goat serve: request failed: {exc!r}", file=sys.stderr)
            try:
                await send_error(writer, "500 Internal Server Error", "internal server error")
            except OSError:
                pass
        finally:
            writer.close()
            try:
                await writer.wait_closed()
            except OSError:
                # A reset connection must not surface as an unhandled task error.
                pass

    server = await asyncio.start_server(handle, host=args.host, port=args.port)
    print(f"browser-goat API listening on http://{args.host}:{args.port}", file=sys.stderr)
    async with server:
        await server.serve_forever()
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
def main() -> None:
    """CLI entry point: parse arguments and run the selected subcommand.

    Any exception raised by a subcommand is reported as a JSON
    ``{"error": ...}`` object on stderr and the process exits with status 1.
    Ctrl-C (the normal way to stop ``serve``) exits with status 130 instead
    of printing a traceback.
    """
    parser = build_parser()
    args = parser.parse_args()

    handlers = {
        "search": cmd_search,
        "extract": cmd_extract,
        "verify": cmd_verify,
        "serve": cmd_serve,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # argparse restricts the command to the registered subcommands, so
        # this is a safeguard only; unknown commands do nothing.
        return

    try:
        asyncio.run(handler(args))
    except KeyboardInterrupt:
        # 130 = 128 + SIGINT, the conventional "interrupted" exit status.
        sys.exit(130)
    except Exception as e:
        print(json.dumps({"error": str(e)}), file=sys.stderr)
        sys.exit(1)
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
if __name__ == "__main__":
|
|
277
|
+
main()
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Extraction layer: 7-tier content extraction, goal-oriented extraction, Scrapling anti-bot."""
|
|
2
|
+
|
|
3
|
+
from browser_goat.extraction.content_extractor import ContentExtractor
|
|
4
|
+
from browser_goat.extraction.goal_oriented import GoalOrientedExtractor
|
|
5
|
+
from browser_goat.extraction.scrapling_fetcher import ScraplingFetcher
|
|
6
|
+
|
|
7
|
+
# Public API of the extraction layer (declared once).
__all__ = ["ContentExtractor", "GoalOrientedExtractor", "ScraplingFetcher"]
|