chrome-scraper 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
def hello() -> str:
    """Return the package's fixed greeting string."""
    greeting = "Hello from chrome-scraper!"
    return greeting
File without changes
@@ -0,0 +1,160 @@
1
+ """`browser-api` CLI: start/stop/status for the shared browser server.
2
+
3
+ Usage:
4
+ uv run browser-api # start with defaults (port 9333)
5
+ uv run browser-api --port 8080 # custom port
6
+ uv run browser-api --headless # headless mode
7
+ uv run browser-api --hide # hide Chrome window (macOS)
8
+ uv run browser-api --browser-args="--disable-gpu --no-sandbox"
9
+ uv run browser-api status # check if running
10
+ uv run browser-api stop # shut down browser + server
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import argparse
16
+ import sys
17
+
18
+ import httpx
19
+
20
+ from chrome_scraper.cli_output import emit_error, emit_ok
21
+
22
+
23
def _ensure_background_patch(search_roots: list[str] | None = None) -> None:
    """Patch Patchright's ``crBrowser.js`` so new tabs are created in the background.

    Looks for ``patchright/driver/package/lib/server/chromium/crBrowser.js``
    under each search root. In the first copy found, ``background: true`` is
    added to the ``about:blank`` target-creation options. A file that already
    contains ``background: true`` is left untouched, so repeated calls are
    harmless.

    The patch is best-effort: if the file cannot be read or written (for
    example a read-only site-packages directory), a warning is printed to
    stderr and the function returns instead of raising.

    Args:
        search_roots: Directories to search. Defaults to
            ``site.getsitepackages()``. Empty entries are skipped.
    """
    import site
    from pathlib import Path

    needle = '{ url: "about:blank", browserContextId: this._browserContextId }'
    patched_options = (
        '{ url: "about:blank", browserContextId: this._browserContextId, background: true }'
    )
    relative = Path(
        "patchright", "driver", "package", "lib", "server", "chromium", "crBrowser.js"
    )

    roots = site.getsitepackages() if search_roots is None else search_roots
    for base in roots:
        if not base:
            continue
        target = Path(base) / relative
        if not target.exists():
            continue
        try:
            original = target.read_text(encoding="utf-8")
            if "background: true" in original:
                # Already patched (or upstream now sets the flag itself).
                return
            if needle in original:
                target.write_text(
                    original.replace(needle, patched_options), encoding="utf-8"
                )
                print(
                    "[browser-api] patched Patchright for background tabs",
                    file=sys.stderr,
                    flush=True,
                )
        except OSError as exc:
            # A failed cosmetic patch must not prevent the server from starting.
            print(
                f"[browser-api] could not patch {target}: {exc}",
                file=sys.stderr,
                flush=True,
            )
        # Only the first installed copy is considered.
        return
58
+
59
+
60
def build_parser() -> argparse.ArgumentParser:
    """Build the ``browser-api`` argument parser.

    Top-level options configure the server start. The optional ``status`` and
    ``stop`` subcommands are recorded in ``command`` (``None`` when no
    subcommand is given).

    ``--port`` is accepted both before and after a subcommand, so
    ``browser-api --port 8080 status`` and ``browser-api status --port 8080``
    are equivalent.

    Returns:
        The configured parser.
    """
    port_help = "Server port (default: 9333)."

    parser = argparse.ArgumentParser(
        prog="browser-api",
        description="Shared browser server for chrome_scraper scrapers.",
    )
    parser.add_argument("--port", type=int, default=9333, help=port_help)
    parser.add_argument("--channel", default="chrome", help="Chrome channel.")
    parser.add_argument(
        "--headless", action="store_true", help="Run Chrome headless."
    )
    parser.add_argument("--chrome-path", help="Path to Chrome binary.")
    parser.add_argument("--profile-dir", help="Chrome profile directory.")
    parser.add_argument("--proxy", help="Proxy server URL.")
    parser.add_argument(
        "--timeout", type=float, default=30.0, help="Default operation timeout."
    )
    parser.add_argument(
        "--browser-args", default="", help="Extra Chrome CLI flags (space-separated)."
    )
    parser.add_argument(
        "--hide", action="store_true", help="Hide Chrome after launch (macOS only)."
    )

    subcommands = parser.add_subparsers(dest="command")
    for name, summary in (
        ("status", "Check if browser-api is running."),
        ("stop", "Shut down browser and server."),
    ):
        sub = subcommands.add_parser(name, help=summary)
        # SUPPRESS keeps the subparser from overwriting a --port value that
        # was given before the subcommand (or the top-level default).
        sub.add_argument(
            "--port", type=int, default=argparse.SUPPRESS, help=port_help
        )

    return parser
88
+
89
+
90
def main(argv: list[str] | None = None) -> int:
    """Parse *argv* and run the selected command, returning its exit code.

    ``status`` and ``stop`` go to their handlers; any other ``command`` value
    (including no subcommand at all) starts the server.
    """
    args = build_parser().parse_args(argv)
    handlers = {"status": cmd_status, "stop": cmd_stop}
    # Starting the server is the default action.
    handler = handlers.get(args.command, cmd_start)
    return handler(args)
99
+
100
+
101
def cmd_start(args: argparse.Namespace) -> int:
    """Start the browser-api server on 127.0.0.1 and return 0 once uvicorn returns.

    Applies the Patchright background-tab patch, builds a ``ServerConfig``
    from the parsed CLI options, creates the app and hands it to
    ``uvicorn.run``.
    """
    import uvicorn

    from chrome_scraper.browser_api.server import ServerConfig, create_app

    # Patch Patchright first so new tabs don't bring Chrome to the foreground.
    _ensure_background_patch()

    # --browser-args is a single whitespace-separated string of Chrome flags.
    extra_flags = args.browser_args.split() if args.browser_args else []
    settings = {
        "port": args.port,
        "channel": args.channel,
        "headless": args.headless,
        "chrome_path": args.chrome_path,
        "profile_dir": args.profile_dir,
        "proxy": args.proxy,
        "timeout": args.timeout,
        "browser_args": extra_flags,
        "hide": args.hide,
    }
    config = ServerConfig(**settings)
    app = create_app(config)

    banner = f"[browser-api] starting on :{config.port} ..."
    print(banner, file=sys.stderr, flush=True)

    uvicorn.run(app, host="127.0.0.1", port=config.port, log_level="warning")
    return 0
129
+
130
+
131
def cmd_status(args: argparse.Namespace) -> int:
    """Query ``GET /status`` on the local server and report the result.

    The decoded JSON body is passed to ``emit_ok`` on success. Every failure
    (nothing listening, an error status, a timeout or other transport
    problem, or a body that is not JSON) is reported through ``emit_error``
    rather than escaping as a traceback.

    Args:
        args: Parsed CLI namespace; only ``args.port`` is read.

    Returns:
        0 when the server answered with a JSON status, 1 otherwise.
    """
    url = f"http://127.0.0.1:{args.port}/status"
    try:
        response = httpx.get(url, timeout=3.0)
        response.raise_for_status()
        payload = response.json()
    except httpx.ConnectError:
        emit_error(f"browser-api not running on port {args.port}.")
        return 1
    except httpx.HTTPStatusError as exc:
        emit_error(f"Server error: {exc.response.text}")
        return 1
    except httpx.HTTPError as exc:
        # Timeouts and any other transport-level failure.
        emit_error(f"browser-api request failed on port {args.port}: {exc}")
        return 1
    except ValueError:
        # The port answered, but not with JSON - probably a different service.
        emit_error(f"Unexpected non-JSON response on port {args.port}.")
        return 1

    emit_ok(payload)
    return 0
143
+
144
+
145
def cmd_stop(args: argparse.Namespace) -> int:
    """Ask the local server to shut down via ``POST /shutdown``.

    Reports ``{"stopped": True}`` through ``emit_ok`` when the server accepts
    the request. Every failure (nothing listening, an error status, a timeout
    or other transport problem) is reported through ``emit_error`` rather
    than escaping as a traceback.

    Args:
        args: Parsed CLI namespace; only ``args.port`` is read.

    Returns:
        0 when the shutdown request succeeded, 1 otherwise.
    """
    url = f"http://127.0.0.1:{args.port}/shutdown"
    try:
        response = httpx.post(url, timeout=5.0)
        response.raise_for_status()
    except httpx.ConnectError:
        emit_error(f"browser-api not running on port {args.port}.")
        return 1
    except httpx.HTTPStatusError as exc:
        emit_error(f"Server error: {exc.response.text}")
        return 1
    except httpx.HTTPError as exc:
        # Timeouts and any other transport-level failure.
        emit_error(f"browser-api request failed on port {args.port}: {exc}")
        return 1

    emit_ok({"stopped": True})
    return 0
157
+
158
+
159
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
@@ -0,0 +1,170 @@
1
+ """BrowserAPIClient: BrowserTool implementation backed by the browser-api HTTP service.
2
+
3
+ Scraping code (google_fetch, xcom_fetch) is unchanged — it calls browser.eval_js()
4
+ etc. which route over HTTP.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import os
10
+ from collections.abc import Generator
11
+ from contextlib import contextmanager
12
+ from typing import Any
13
+
14
+ import httpx
15
+
16
+ from chrome_scraper.web_scrapers.base import BrowserTab, WebScraperError
17
+
18
# Default endpoint of the local browser-api server. The IPv4 loopback literal
# is used instead of "localhost" because the browser-api CLI binds and probes
# 127.0.0.1 only, while "localhost" may resolve to ::1 first.
_DEFAULT_BASE_URL = "http://127.0.0.1:9333"
19
+
20
+
21
class BrowserAPIClient:
    """BrowserTool backed by a running browser-api server.

    Launch / attach / stop never start or stop a browser — the server owns
    the browser lifecycle. All page operations are HTTP calls keyed by tab id
    or label.

    The instance owns an ``httpx.Client``. Call :meth:`close`, or use the
    instance as a context manager, to release its connections.
    """

    def __init__(self, base_url: str | None = None, *, timeout: float = 60.0):
        """Create a client for the server at *base_url*.

        Args:
            base_url: Server URL. Falls back to the ``BROWSER_API_URL``
                environment variable, then to the module default. A trailing
                ``/`` is stripped.
            timeout: Default timeout handed to the underlying ``httpx.Client``.
        """
        self.base_url = (
            base_url or os.environ.get("BROWSER_API_URL") or _DEFAULT_BASE_URL
        ).rstrip("/")
        self._http = httpx.Client(base_url=self.base_url, timeout=timeout)

    def close(self) -> None:
        """Close the underlying HTTP client; the server is not contacted."""
        self._http.close()

    def __enter__(self) -> BrowserAPIClient:
        return self

    def __exit__(self, *exc_info: object) -> None:
        self.close()

    def launch(self, **kwargs: Any) -> dict[str, Any]:
        """Launch nothing — browser is managed by the server. Returns status.

        Keyword arguments are accepted and ignored.
        """
        return self.status()

    def attach(self, **kwargs: Any) -> dict[str, Any]:
        """Attach nothing — always attached to the running server. Returns status.

        Keyword arguments are accepted and ignored.
        """
        return self.status()

    def stop(self, *, timeout: float = 5.0) -> dict[str, Any]:
        """No-op — server stays alive. Does not shut down the browser.

        *timeout* is accepted and ignored; no request is sent.
        """
        return {"running": True, "stopped": False}

    def status(self) -> dict[str, Any]:
        """Return the decoded body of ``GET /status``."""
        return self._get("/status")

    def list_tabs(self) -> list[BrowserTab]:
        """Return the tabs reported by ``GET /tabs`` as ``BrowserTab`` dicts."""
        return [self._as_browser_tab(raw) for raw in self._get("/tabs")]

    def open_tab(
        self, url: str = "about:blank", *, label: str | None = None
    ) -> BrowserTab:
        """Open a tab via ``POST /tabs`` and return it as a ``BrowserTab`` dict."""
        created = self._post("/tabs", json={"url": url, "label": label})
        return self._as_browser_tab(created)

    def close_tab(self, tab_ref: str) -> dict[str, Any]:
        """Close the tab *tab_ref* via ``DELETE /tabs/{tab_ref}``."""
        self._delete(self._tab_path(tab_ref))
        return {"closed": True, "tab": {"target_id": tab_ref}}

    def activate_tab(self, tab_ref: str) -> dict[str, Any]:
        """Report *tab_ref* as activated without contacting the server."""
        # No server-side activation needed — tabs are independent.
        return {"activated": True, "tab": {"target_id": tab_ref}}

    def navigate(
        self,
        *,
        tab_ref: str,
        url: str,
        timeout: float,
        wait_until: str = "load",
    ) -> dict[str, Any]:
        """Navigate a tab via ``POST /tabs/{tab_ref}/goto``.

        *timeout* is used as the HTTP request timeout. The returned ``url``
        is the one reported by the server.
        """
        r = self._post(
            self._tab_path(tab_ref, "goto"),
            json={"url": url, "wait_until": wait_until},
            timeout=timeout,
        )
        return {"navigated": True, "wait_until": wait_until, "url": r["url"]}

    def eval_js(self, *, tab_ref: str, expression: str, timeout: float) -> Any:
        """Evaluate *expression* via ``POST /tabs/{tab_ref}/eval``.

        *timeout* is used as the HTTP request timeout. Returns the ``result``
        field of the server response.
        """
        r = self._post(
            self._tab_path(tab_ref, "eval"),
            json={"expression": expression},
            timeout=timeout,
        )
        return r["result"]

    def eval_js_file(self, *, tab_ref: str, script_path: str, timeout: float) -> Any:
        """Read the UTF-8 script at *script_path* (``~`` expanded) and evaluate it."""
        from pathlib import Path

        script = Path(script_path).expanduser().read_text(encoding="utf-8")
        return self.eval_js(tab_ref=tab_ref, expression=script, timeout=timeout)

    def keyboard_type(self, *, tab_ref: str, text: str, delay_ms: int = 30) -> None:
        """Send *text* and *delay_ms* to ``POST /tabs/{tab_ref}/type``."""
        self._post(
            self._tab_path(tab_ref, "type"),
            json={"text": text, "delay_ms": delay_ms},
        )

    def keyboard_press(self, *, tab_ref: str, key: str) -> None:
        """Send *key* to ``POST /tabs/{tab_ref}/press``."""
        self._post(self._tab_path(tab_ref, "press"), json={"key": key})

    def focus(self, *, tab_ref: str, selector: str, timeout: float) -> None:
        """Send *selector* to ``POST /tabs/{tab_ref}/focus``.

        *timeout* is used as the HTTP request timeout.
        """
        self._post(
            self._tab_path(tab_ref, "focus"),
            json={"selector": selector},
            timeout=timeout,
        )

    @contextmanager
    def tab(
        self,
        label: str,
        *,
        url: str = "about:blank",
        stop_on_exit: bool = True,
    ) -> Generator[str, None, None]:
        """Open a tab, yield its label, close tab on exit.

        stop_on_exit is ignored — the server owns the browser lifecycle.
        """
        self.open_tab(url, label=label)
        try:
            yield label
        finally:
            try:
                self.close_tab(label)
            except Exception:
                # Deliberately best-effort: a failed close must not replace
                # an exception raised inside the ``with`` body.
                pass

    @staticmethod
    def _as_browser_tab(raw: dict[str, Any]) -> BrowserTab:
        """Map a server tab record (keyed by ``tab_id``) to a ``BrowserTab`` dict."""
        return {
            "target_id": raw["tab_id"],
            "title": raw.get("title", ""),
            "url": raw.get("url", ""),
            "label": raw.get("label"),
        }

    @staticmethod
    def _tab_path(tab_ref: str, action: str = "") -> str:
        """Build ``/tabs/{tab_ref}[/{action}]`` with *tab_ref* percent-encoded.

        Encoding keeps characters such as ``?``, ``#`` or ``%`` in a label
        from being interpreted as URL syntax.
        """
        from urllib.parse import quote

        encoded = quote(str(tab_ref), safe="")
        path = f"/tabs/{encoded}"
        return f"{path}/{action}" if action else path

    def _request(self, method: str, path: str, **kwargs: Any) -> Any:
        """Dispatch an HTTP request and handle common errors.

        Returns:
            The decoded JSON body of the response.

        Raises:
            WebScraperError: If the server cannot be reached or answers with
                an error status. Other httpx errors (for example timeouts)
                are not caught here.
        """
        try:
            r = getattr(self._http, method)(path, **kwargs)
            r.raise_for_status()
            return r.json()
        except httpx.ConnectError as exc:
            raise WebScraperError(
                f"Cannot connect to browser-api at {self.base_url}. "
                "Start it with: uv run browser-api"
            ) from exc
        except httpx.HTTPStatusError as exc:
            raise WebScraperError(f"browser-api error: {exc.response.text}") from exc

    def _get(self, path: str) -> Any:
        """Issue a GET request for *path*."""
        return self._request("get", path)

    def _post(
        self, path: str, *, json: dict | None = None, timeout: float | None = None
    ) -> Any:
        """Issue a POST request, forwarding *json* and *timeout* only when given."""
        kw: dict[str, Any] = {}
        if json is not None:
            kw["json"] = json
        if timeout is not None:
            kw["timeout"] = timeout
        return self._request("post", path, **kw)

    def _delete(self, path: str) -> Any:
        """Issue a DELETE request for *path*."""
        return self._request("delete", path)