chrome-scraper 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chrome_scraper/__init__.py +2 -0
- chrome_scraper/browser_api/__init__.py +0 -0
- chrome_scraper/browser_api/cli.py +160 -0
- chrome_scraper/browser_api/client.py +170 -0
- chrome_scraper/browser_api/server.py +446 -0
- chrome_scraper/cli_output.py +23 -0
- chrome_scraper/html_to_md/__init__.py +6 -0
- chrome_scraper/html_to_md/cli.py +146 -0
- chrome_scraper/html_to_md/extract.js +121 -0
- chrome_scraper/html_to_md/extract.py +117 -0
- chrome_scraper/html_to_md/render.py +504 -0
- chrome_scraper/py.typed +0 -0
- chrome_scraper/web_scrapers/__init__.py +0 -0
- chrome_scraper/web_scrapers/_fetch_common.py +75 -0
- chrome_scraper/web_scrapers/base.py +254 -0
- chrome_scraper/web_scrapers/google_fetch.py +247 -0
- chrome_scraper/web_scrapers/google_fetch_cli.py +126 -0
- chrome_scraper/web_scrapers/google_search.py +253 -0
- chrome_scraper/web_scrapers/scripts/__init__.py +0 -0
- chrome_scraper/web_scrapers/scripts/google_search_results.js +50 -0
- chrome_scraper/web_scrapers/scripts/xcom_search_results.js +31 -0
- chrome_scraper/web_scrapers/xcom_fetch.py +298 -0
- chrome_scraper/web_scrapers/xcom_fetch_cli.py +108 -0
- chrome_scraper-0.1.0.dist-info/METADATA +268 -0
- chrome_scraper-0.1.0.dist-info/RECORD +27 -0
- chrome_scraper-0.1.0.dist-info/WHEEL +4 -0
- chrome_scraper-0.1.0.dist-info/entry_points.txt +6 -0
|
File without changes
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
"""`browser-api` CLI: start/stop/status for the shared browser server.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
uv run browser-api # start with defaults (port 9333)
|
|
5
|
+
uv run browser-api --port 8080 # custom port
|
|
6
|
+
uv run browser-api --headless # headless mode
|
|
7
|
+
uv run browser-api --hide # hide Chrome window (macOS)
|
|
8
|
+
uv run browser-api --browser-args="--disable-gpu --no-sandbox"
|
|
9
|
+
uv run browser-api status # check if running
|
|
10
|
+
uv run browser-api stop # shut down browser + server
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import argparse
|
|
16
|
+
import sys
|
|
17
|
+
|
|
18
|
+
import httpx
|
|
19
|
+
|
|
20
|
+
from chrome_scraper.cli_output import emit_error, emit_ok
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _ensure_background_patch(site_dirs: list[str] | None = None) -> None:
    """Apply the Patchright crBrowser.js patch so new tabs stay in background.

    The patch is a plain text substitution that adds ``background: true`` to
    the object literal Patchright builds for a new ``about:blank`` target.

    The routine is best-effort: a missing Patchright install, an unreadable or
    read-only file, or an unrecognised file layout never raises. Problems are
    reported on stderr and the caller carries on unpatched.

    Args:
        site_dirs: Directories to search for the ``patchright`` package.
            Defaults to ``site.getsitepackages()``.
    """
    import site
    from pathlib import Path

    needle = '{ url: "about:blank", browserContextId: this._browserContextId }'
    replacement = (
        '{ url: "about:blank", browserContextId: this._browserContextId, '
        "background: true }"
    )
    relative_target = Path(
        "patchright", "driver", "package", "lib", "server", "chromium", "crBrowser.js"
    )

    candidates = site.getsitepackages() if site_dirs is None else site_dirs
    for base in candidates:
        if not base:
            continue
        target = Path(base) / relative_target
        if not target.is_file():
            continue

        # Only the first installation found is considered.
        try:
            original = target.read_text(encoding="utf-8")
            if "background: true" in original:
                # Treated as "already patched"; makes repeated starts idempotent.
                return
            if needle not in original:
                print(
                    "[browser-api] Patchright patch point not found in "
                    f"{target}; new tabs may steal focus",
                    file=sys.stderr,
                    flush=True,
                )
                return
            target.write_text(original.replace(needle, replacement), encoding="utf-8")
        except OSError as exc:
            # e.g. read-only site-packages: do not abort server start-up.
            print(
                f"[browser-api] could not patch Patchright ({target}): {exc}",
                file=sys.stderr,
                flush=True,
            )
            return

        print(
            "[browser-api] patched Patchright for background tabs",
            file=sys.stderr,
            flush=True,
        )
        return
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def build_parser() -> argparse.ArgumentParser:
    """Construct the ``browser-api`` argument parser.

    Top-level options configure the server for the default "start" action.
    The optional ``status`` / ``stop`` sub-commands are recorded in
    ``args.command``, which is ``None`` when no sub-command is given.
    """
    parser = argparse.ArgumentParser(
        description="Shared browser server for chrome_scraper scrapers.",
        prog="browser-api",
    )

    # Registration order is kept because it determines the --help listing.
    parser.add_argument(
        "--port", default=9333, type=int, help="Server port (default: 9333)."
    )
    parser.add_argument("--channel", default="chrome", help="Chrome channel.")
    parser.add_argument("--headless", action="store_true", help="Run Chrome headless.")
    parser.add_argument("--chrome-path", help="Path to Chrome binary.")
    parser.add_argument("--profile-dir", help="Chrome profile directory.")
    parser.add_argument("--proxy", help="Proxy server URL.")
    parser.add_argument(
        "--timeout", default=30.0, type=float, help="Default operation timeout."
    )
    parser.add_argument(
        "--browser-args", default="", help="Extra Chrome CLI flags (space-separated)."
    )
    parser.add_argument(
        "--hide", action="store_true", help="Hide Chrome after launch (macOS only)."
    )

    commands = parser.add_subparsers(dest="command")
    for name, summary in (
        ("status", "Check if browser-api is running."),
        ("stop", "Shut down browser and server."),
    ):
        commands.add_parser(name, help=summary)

    return parser
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected action, returning its exit code.

    ``status`` and ``stop`` map to their handlers; anything else (including
    no sub-command at all) starts the server.
    """
    args = build_parser().parse_args(argv)
    handlers = {"status": cmd_status, "stop": cmd_stop}
    # Default: start
    handler = handlers.get(args.command, cmd_start)
    return handler(args)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def cmd_start(args: argparse.Namespace) -> int:
    """Start the browser-api server on 127.0.0.1 with the parsed options.

    Returns 0 once ``uvicorn.run`` returns.
    """
    import uvicorn

    from chrome_scraper.browser_api.server import ServerConfig, create_app

    # Patch Patchright before the server is built so that new tabs do not
    # bring Chrome to the foreground.
    _ensure_background_patch()

    settings = {
        "port": args.port,
        "channel": args.channel,
        "headless": args.headless,
        "chrome_path": args.chrome_path,
        "profile_dir": args.profile_dir,
        "proxy": args.proxy,
        "timeout": args.timeout,
        # Whitespace-separated flags; an empty value yields no extra flags.
        "browser_args": (args.browser_args or "").split(),
        "hide": args.hide,
    }
    config = ServerConfig(**settings)
    app = create_app(config)

    print(f"[browser-api] starting on :{config.port} ...", file=sys.stderr, flush=True)
    uvicorn.run(app, host="127.0.0.1", port=config.port, log_level="warning")
    return 0
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def cmd_status(args: argparse.Namespace) -> int:
    """Query ``/status`` on the local server and emit the result.

    Returns:
        0 when the server answered with a JSON status payload, 1 otherwise.
        Every failure is reported through ``emit_error`` rather than a
        traceback.
    """
    url = f"http://127.0.0.1:{args.port}/status"
    try:
        response = httpx.get(url, timeout=3.0)
        response.raise_for_status()
        payload = response.json()
    except httpx.ConnectError:
        emit_error(f"browser-api not running on port {args.port}.")
        return 1
    except httpx.HTTPStatusError as exc:
        emit_error(f"Server error: {exc.response.text}")
        return 1
    except httpx.RequestError as exc:
        # Timeouts and other transport failures (ConnectError is handled above).
        emit_error(f"Request to browser-api on port {args.port} failed: {exc!r}")
        return 1
    except ValueError:
        # The response body was not valid JSON.
        emit_error(f"Unexpected non-JSON response from port {args.port}.")
        return 1

    emit_ok(payload)
    return 0
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def cmd_stop(args: argparse.Namespace) -> int:
    """POST ``/shutdown`` to the local server and report the outcome.

    Returns:
        0 when the server acknowledged the shutdown request, 1 otherwise.
        Every failure is reported through ``emit_error`` rather than a
        traceback.
    """
    url = f"http://127.0.0.1:{args.port}/shutdown"
    try:
        response = httpx.post(url, timeout=5.0)
        response.raise_for_status()
    except httpx.ConnectError:
        emit_error(f"browser-api not running on port {args.port}.")
        return 1
    except httpx.HTTPStatusError as exc:
        emit_error(f"Server error: {exc.response.text}")
        return 1
    except httpx.RequestError as exc:
        # Timeouts and other transport failures (ConnectError is handled above).
        emit_error(f"Request to browser-api on port {args.port} failed: {exc!r}")
        return 1

    emit_ok({"stopped": True})
    return 0
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
# Script entry point: exit the process with the status code returned by main().
if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
"""BrowserAPIClient: BrowserTool implementation backed by the browser-api HTTP service.
|
|
2
|
+
|
|
3
|
+
Scraping code (google_fetch, xcom_fetch) is unchanged — it calls browser.eval_js()
|
|
4
|
+
etc. which route over HTTP.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from collections.abc import Generator
|
|
11
|
+
from contextlib import contextmanager
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import httpx
|
|
15
|
+
|
|
16
|
+
from chrome_scraper.web_scrapers.base import BrowserTab, WebScraperError
|
|
17
|
+
|
|
18
|
+
# Server address used when neither the ``base_url`` constructor argument nor
# the BROWSER_API_URL environment variable provides one.
_DEFAULT_BASE_URL = "http://localhost:9333"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BrowserAPIClient:
    """BrowserTool backed by a running browser-api server.

    Launch / stop are no-ops — the server owns the browser lifecycle.
    All page operations are HTTP calls keyed by tab id or label.

    The client owns an ``httpx.Client``; call :meth:`close` (or use the
    instance as a context manager) to release its connections.
    """

    def __init__(self, base_url: str | None = None, *, timeout: float = 60.0):
        """Create a client.

        Args:
            base_url: Server root URL. Falls back to the ``BROWSER_API_URL``
                environment variable and then to the module default.
            timeout: Default timeout, in seconds, for each HTTP request.
        """
        self.base_url = (
            base_url or os.environ.get("BROWSER_API_URL") or _DEFAULT_BASE_URL
        ).rstrip("/")
        self._http = httpx.Client(base_url=self.base_url, timeout=timeout)

    # -- resource management -------------------------------------------------

    def close(self) -> None:
        """Close the underlying HTTP client. The server keeps running."""
        self._http.close()

    def __enter__(self) -> BrowserAPIClient:
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    # -- lifecycle (no-ops) --------------------------------------------------

    def launch(self, **kwargs: Any) -> dict[str, Any]:
        """No-op — browser is managed by the server. Returns status."""
        return self.status()

    def attach(self, **kwargs: Any) -> dict[str, Any]:
        """No-op — always attached to the running server. Returns status."""
        return self.status()

    def stop(self, *, timeout: float = 5.0) -> dict[str, Any]:
        """No-op — server stays alive. Does not shut down the browser."""
        return {"running": True, "stopped": False}

    def status(self) -> dict[str, Any]:
        """Return the server's ``/status`` payload."""
        return self._get("/status")

    # -- tabs ----------------------------------------------------------------

    def list_tabs(self) -> list[BrowserTab]:
        """Return every tab reported by the server."""
        return [self._as_browser_tab(raw) for raw in self._get("/tabs")]

    def open_tab(
        self, url: str = "about:blank", *, label: str | None = None
    ) -> BrowserTab:
        """Open a tab at *url*, optionally tagged with *label*."""
        return self._as_browser_tab(
            self._post("/tabs", json={"url": url, "label": label})
        )

    def close_tab(self, tab_ref: str) -> dict[str, Any]:
        """Close the tab identified by *tab_ref* (tab id or label)."""
        self._delete(self._tab_path(tab_ref))
        return {"closed": True, "tab": {"target_id": tab_ref}}

    def activate_tab(self, tab_ref: str) -> dict[str, Any]:
        """Report the tab as activated without contacting the server."""
        # No server-side activation needed — tabs are independent.
        return {"activated": True, "tab": {"target_id": tab_ref}}

    # -- page operations -----------------------------------------------------

    def navigate(
        self,
        *,
        tab_ref: str,
        url: str,
        timeout: float,
        wait_until: str = "load",
    ) -> dict[str, Any]:
        """Navigate a tab to *url*; *timeout* bounds the HTTP request."""
        r = self._post(
            self._tab_path(tab_ref, "goto"),
            json={"url": url, "wait_until": wait_until},
            timeout=timeout,
        )
        return {"navigated": True, "wait_until": wait_until, "url": r["url"]}

    def eval_js(self, *, tab_ref: str, expression: str, timeout: float) -> Any:
        """Evaluate *expression* in the tab and return the ``result`` field."""
        r = self._post(
            self._tab_path(tab_ref, "eval"),
            json={"expression": expression},
            timeout=timeout,
        )
        return r["result"]

    def eval_js_file(self, *, tab_ref: str, script_path: str, timeout: float) -> Any:
        """Read a local UTF-8 script file and evaluate its text in the tab."""
        from pathlib import Path

        script = Path(script_path).expanduser().read_text(encoding="utf-8")
        return self.eval_js(tab_ref=tab_ref, expression=script, timeout=timeout)

    def keyboard_type(self, *, tab_ref: str, text: str, delay_ms: int = 30) -> None:
        """Send *text* to the tab's ``type`` endpoint with *delay_ms*."""
        self._post(
            self._tab_path(tab_ref, "type"), json={"text": text, "delay_ms": delay_ms}
        )

    def keyboard_press(self, *, tab_ref: str, key: str) -> None:
        """Send a single *key* to the tab's ``press`` endpoint."""
        self._post(self._tab_path(tab_ref, "press"), json={"key": key})

    def focus(self, *, tab_ref: str, selector: str, timeout: float) -> None:
        """Ask the server to focus the element matching *selector*."""
        self._post(
            self._tab_path(tab_ref, "focus"),
            json={"selector": selector},
            timeout=timeout,
        )

    @contextmanager
    def tab(
        self,
        label: str,
        *,
        url: str = "about:blank",
        stop_on_exit: bool = True,
    ) -> Generator[str, None, None]:
        """Open a tab, yield its label, close tab on exit.

        stop_on_exit is ignored — the server owns the browser lifecycle.
        """
        self.open_tab(url, label=label)
        try:
            yield label
        finally:
            try:
                self.close_tab(label)
            except Exception:
                # Deliberately best-effort: a failed close must not mask an
                # exception raised inside the ``with`` body.
                pass

    # -- HTTP plumbing -------------------------------------------------------

    @staticmethod
    def _tab_path(tab_ref: str, action: str = "") -> str:
        """Build ``/tabs/<ref>[/<action>]`` with the reference percent-encoded.

        Encoding keeps labels that contain ``/``, ``?``, ``#`` or spaces from
        being read as extra path segments, a query string or a fragment.
        """
        from urllib.parse import quote

        path = f"/tabs/{quote(str(tab_ref), safe='')}"
        return f"{path}/{action}" if action else path

    @staticmethod
    def _as_browser_tab(raw: dict[str, Any]) -> BrowserTab:
        """Convert a server tab record into the BrowserTab dict shape."""
        return {
            "target_id": raw["tab_id"],
            "title": raw.get("title", ""),
            "url": raw.get("url", ""),
            "label": raw.get("label"),
        }

    def _request(self, method: str, path: str, **kwargs: Any) -> Any:
        """Dispatch an HTTP request and handle common errors.

        Returns the decoded JSON body.

        Raises:
            WebScraperError: if the server is unreachable or responds with an
                error status. The httpx exception is chained as the cause.
        """
        try:
            r = getattr(self._http, method)(path, **kwargs)
            r.raise_for_status()
            return r.json()
        except httpx.ConnectError as exc:
            raise WebScraperError(
                f"Cannot connect to browser-api at {self.base_url}. "
                "Start it with: uv run browser-api"
            ) from exc
        except httpx.HTTPStatusError as exc:
            raise WebScraperError(f"browser-api error: {exc.response.text}") from exc

    def _get(self, path: str) -> Any:
        return self._request("get", path)

    def _post(
        self, path: str, *, json: dict | None = None, timeout: float | None = None
    ) -> Any:
        # Only forward what was supplied so the client's defaults still apply.
        kw: dict[str, Any] = {}
        if json is not None:
            kw["json"] = json
        if timeout is not None:
            kw["timeout"] = timeout
        return self._request("post", path, **kw)

    def _delete(self, path: str) -> Any:
        return self._request("delete", path)
|