ember-browser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
emb/__init__.py ADDED
@@ -0,0 +1,38 @@
1
+ """ember — open source, lightweight headless browser for AI agents."""
2
+
3
+ from __future__ import annotations
4
+
5
__version__ = "0.1.0"

# The most-used public functions are re-exported lazily, so that
# `from emb import scrape_url` works while `import emb` itself loads none of
# the heavy dependencies.
#
# A name that collides with a same-named submodule (search, crawl) cannot be
# re-exported like this: Python hands back the submodule before __getattr__
# ever runs. Import those from the submodule instead:
# `from emb.search import search`, `from emb.crawl import crawl`.

__all__ = [
    "__version__",
    "scrape_url",
    "scrape_url_async",
    "scrape_markdown",
    "scrape_markdown_async",
    "map_url",
]

# Lazily exported name -> dotted path of the module that defines it.
_LAZY: dict[str, str] = dict(
    scrape_url="emb.scrape",
    scrape_url_async="emb.scrape",
    scrape_markdown="emb.scrape",
    scrape_markdown_async="emb.scrape",
    map_url="emb.map",
)
30
+
31
+
32
def __getattr__(name: str):
    """Resolve a lazily exported attribute of the package.

    Called by Python only when normal module attribute lookup fails. Names
    listed in ``_LAZY`` are fetched from the module recorded there, which is
    imported at that moment; any other name raises ``AttributeError``.
    """
    if name not in _LAZY:
        raise AttributeError(f"module 'emb' has no attribute {name!r}")

    # Imported here rather than at module top so `import emb` stays minimal.
    from importlib import import_module

    provider = import_module(_LAZY[name])
    return getattr(provider, name)
emb/_browser.py ADDED
@@ -0,0 +1,157 @@
1
+ """Auto-download and cache Lightpanda browser binary.
2
+
3
+ The browser is downloaded on first use — no manual install required.
4
+ Cached at ~/.cache/ember/lightpanda for subsequent use.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import hashlib
10
+ import os
11
+ import platform
12
+ import secrets
13
+ import stat
14
+ import subprocess
15
+ import sys
16
+ from pathlib import Path
17
+
18
+ import httpx
19
+
20
# Where the downloaded browser binary is cached.
BINARY_NAME = "lightpanda"
CACHE_DIR = Path.home() / ".cache" / "ember"
BINARY_PATH = CACHE_DIR / BINARY_NAME

LIGHTPANDA_VERSION = "0.3.3"

# SHA-256 digests for each platform binary at LIGHTPANDA_VERSION, keyed by
# (system, machine). Update both this dict and LIGHTPANDA_VERSION together on
# every release bump.
_BINARY_HASHES: dict[tuple[str, str], str] = {
    ("Linux", "x86_64"): "b6ab613846f5291cc6bafd7f44ffb9718df51bf00eb83954e1fc5d7f52c7b886",
    ("Linux", "aarch64"): "db35c06ee074a79c2e039965c404e578748c1d22cb296e853461970ea0c2945f",
    ("Darwin", "x86_64"): "631cec32766d2f98f1005e3af6af74794fb55cc28d193d97176cfa21f1d26d0c",
    ("Darwin", "aarch64"): "1ef236a72e63975cf8acc7430e52dd31af5fff27a0b62ba81876eb8dde3e18e2",
}

# Common prefix of every release asset URL for the pinned version.
_RELEASE_BASE = (
    f"https://github.com/lightpanda-io/browser/releases/download/{LIGHTPANDA_VERSION}/"
)

# Release asset URL for each supported (system, machine) pair.
_DOWNLOAD_URLS = {
    ("Linux", "x86_64"): _RELEASE_BASE + "lightpanda-x86_64-linux",
    ("Linux", "aarch64"): _RELEASE_BASE + "lightpanda-aarch64-linux",
    ("Darwin", "x86_64"): _RELEASE_BASE + "lightpanda-x86_64-macos",
    ("Darwin", "aarch64"): _RELEASE_BASE + "lightpanda-aarch64-macos",
}
53
+
54
+
55
def is_available() -> bool:
    """Report whether a Lightpanda binary is already present on this machine.

    Returns:
        True when the cached binary file exists, or when a ``lightpanda``
        executable found on PATH answers ``lightpanda version`` with exit
        status 0 within five seconds; False otherwise. Nothing is downloaded.
    """
    if BINARY_PATH.exists():
        return True
    try:
        probe = subprocess.run(["lightpanda", "version"], capture_output=True, timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing executable (FileNotFoundError) as well as
        # one that exists but cannot be executed (PermissionError, bad format).
        return False
    # Agree with ensure(): a binary that fails its own version check is unusable.
    return probe.returncode == 0
63
+
64
+
65
def _platform_key() -> tuple[str, str]:
    """Return the (system, machine) key used by _DOWNLOAD_URLS and _BINARY_HASHES."""
    machine = platform.machine()
    # macOS on Apple Silicon reports "arm64", while the lookup tables (and the
    # release asset names) spell the same architecture "aarch64".
    if machine == "arm64":
        machine = "aarch64"
    return platform.system(), machine


def _platform_url() -> str | None:
    """Return the download URL for the current platform, or None if unsupported."""
    return _DOWNLOAD_URLS.get(_platform_key())


def _answers_version(executable: str) -> bool:
    """Return True when ``<executable> version`` exits with status 0 within 5 seconds."""
    try:
        probe = subprocess.run([executable, "version"], capture_output=True, timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers "not found" as well as "found but not executable" and
        # "wrong binary format"; none of them is a usable browser.
        return False
    return probe.returncode == 0


def ensure() -> str:
    """Return a usable Lightpanda executable, downloading it if necessary.

    Candidates are tried in this order:

    1. ``EMBER_LIGHTPANDA_PATH`` — an existing file is returned as-is; a bare
       name (no path separator) is accepted if it passes the version check.
       When the variable is set but yields nothing, no fallback is attempted.
    2. ``lightpanda`` on PATH, if it passes the version check.
    3. The cached binary at ``BINARY_PATH``; a cached file that fails the
       version check is deleted.
    4. A fresh download for the current platform, verified against the
       SHA-256 digest in ``_BINARY_HASHES`` before being marked executable
       and moved into the cache.

    Returns:
        The path (or bare command name) to invoke.

    Raises:
        RuntimeError: if the override is unusable, the platform is not
            supported, or the download or its verification fails.
    """
    # Check env override
    env_path = os.environ.get("EMBER_LIGHTPANDA_PATH")
    if env_path:
        p = Path(env_path)
        if p.is_file():
            return str(p)
        # Bare name (no path separator) — allow PATH resolution
        is_bare_name = os.sep not in env_path and "/" not in env_path
        if is_bare_name and _answers_version(env_path):
            return env_path
        raise RuntimeError(
            f"EMBER_LIGHTPANDA_PATH={env_path!r} — binary not found or not executable. "
            f"Check the path and try again."
        )

    # Check PATH
    if _answers_version("lightpanda"):
        return "lightpanda"

    # Check cached binary
    if BINARY_PATH.exists():
        if _answers_version(str(BINARY_PATH)):
            return str(BINARY_PATH)
        # Broken or partial cache entry: discard it and download again.
        BINARY_PATH.unlink(missing_ok=True)

    # Download
    url = _platform_url()
    if not url:
        if platform.system() == "Windows":
            raise RuntimeError(
                f"Lightpanda does not support Windows natively ({platform.machine()}).\n"
                "Agents using ember should run on Linux. For local development, use WSL2:\n"
                " wsl --install\n"
                " then run ember inside WSL.\n"
                "Browser-free features (scrape, search, crawl, map) work on Windows as-is."
            )
        raise RuntimeError(
            f"Lightpanda not available for {platform.system()} {platform.machine()}. "
            f"See: https://lightpanda.io/docs/open-source/installation"
        )
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    tmp = BINARY_PATH.with_suffix(".tmp")
    try:
        # Look the digest up first so an unverifiable platform fails before
        # any bytes are fetched.
        expected = _BINARY_HASHES.get(_platform_key())
        if expected is None:
            raise RuntimeError(
                f"No SHA-256 hash registered for {platform.system()} {platform.machine()}. "
                "Add the expected hash to _BINARY_HASHES before enabling downloads for this platform."
            )

        print(" Downloading Lightpanda...", file=sys.stderr)
        digest = hashlib.sha256()
        with httpx.stream("GET", url, follow_redirects=True, timeout=120) as resp:
            resp.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in resp.iter_bytes(65536):
                    f.write(chunk)
                    # Hash while streaming instead of re-reading the whole file.
                    digest.update(chunk)

        actual = digest.hexdigest()
        if not secrets.compare_digest(actual, expected):
            raise RuntimeError(
                f"Lightpanda binary digest mismatch — download may be corrupted or tampered.\n"
                f" expected: {expected}\n"
                f" got: {actual}"
            )

        # Only a verified file is made executable and moved into the cache.
        tmp.chmod(tmp.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
        tmp.rename(BINARY_PATH)
        print(f" ✓ Lightpanda cached at {BINARY_PATH}", file=sys.stderr)
    except Exception as e:
        # Never leave a partial or unverified download behind.
        tmp.unlink(missing_ok=True)
        raise RuntimeError(f"Failed to download Lightpanda: {e}") from e

    return str(BINARY_PATH)
emb/_url_validator.py ADDED
@@ -0,0 +1,59 @@
1
+ """URL validation to block SSRF and unsafe schemes."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import ipaddress
6
+ import socket
7
+ from urllib.parse import urlparse
8
+
9
# Address ranges a fetched URL must never resolve to.
_BLOCKED_NETWORKS = [
    ipaddress.ip_network("10.0.0.0/8"),       # RFC 1918 private
    ipaddress.ip_network("172.16.0.0/12"),    # RFC 1918 private
    ipaddress.ip_network("192.168.0.0/16"),   # RFC 1918 private
    ipaddress.ip_network("127.0.0.0/8"),      # IPv4 loopback
    ipaddress.ip_network("169.254.0.0/16"),   # IPv4 link-local
    ipaddress.ip_network("0.0.0.0/8"),        # "this network" / unspecified
    ipaddress.ip_network("100.64.0.0/10"),    # RFC 6598 shared address space
    ipaddress.ip_network("::/128"),           # IPv6 unspecified (counterpart of 0.0.0.0)
    ipaddress.ip_network("::1/128"),          # IPv6 loopback
    ipaddress.ip_network("fc00::/7"),         # IPv6 unique local
    ipaddress.ip_network("fe80::/10"),        # IPv6 link-local
]


def _is_blocked(addr_str: str) -> bool:
    """Return True when *addr_str* is an IP address inside a blocked range.

    Args:
        addr_str: Textual IPv4 or IPv6 address, optionally carrying an IPv6
            zone ID suffix such as ``%eth0``.

    Returns:
        True if the address falls inside any network of ``_BLOCKED_NETWORKS``.
        False for addresses outside those ranges and for strings that are not
        IP addresses at all.
    """
    # A zone ID does not change which range the address is in, and not every
    # Python version accepts it in ip_address(); strip it before parsing so a
    # scoped link-local address cannot slip through as "unparseable".
    bare = addr_str.partition("%")[0]
    try:
        addr = ipaddress.ip_address(bare)
    except ValueError:
        return False
    # IPv6-mapped IPv4 (e.g. ::ffff:10.0.0.1) — unwrap to check against IPv4 ranges
    mapped = getattr(addr, "ipv4_mapped", None)
    if mapped is not None:
        addr = mapped
    # Only compare within the same address family.
    return any(
        network.version == addr.version and addr in network
        for network in _BLOCKED_NETWORKS
    )
37
+
38
+
39
# DNS rebinding can bypass this check — pair with network-level egress controls
# in high-security deployments.
def validate_url(url: str) -> None:
    """Raise ``ValueError`` unless *url* is an http(s) URL with a safe target.

    The URL must use the ``http`` or ``https`` scheme and contain a hostname.
    The hostname is resolved with ``socket.getaddrinfo`` and every returned
    address is passed to ``_is_blocked``; the first blocked address aborts
    validation.

    Raises:
        ValueError: wrong scheme, missing hostname, resolution failure, or a
            resolved address that is blocked.
    """
    parts = urlparse(url)

    scheme = parts.scheme
    if scheme != "http" and scheme != "https":
        raise ValueError(f"URL scheme must be http or https, got {parts.scheme!r}")

    host = parts.hostname
    if not host:
        raise ValueError("URL has no hostname")

    try:
        resolved = socket.getaddrinfo(host, None)
    except socket.gaierror as exc:
        raise ValueError(f"Cannot resolve hostname {host!r}: {exc}") from exc

    # Each entry ends with the socket address, whose first item is the IP text.
    for *_, sockaddr in resolved:
        addr_str = sockaddr[0]
        if _is_blocked(addr_str):
            raise ValueError(f"URL resolves to a blocked address ({addr_str})")
emb/agent.py ADDED
@@ -0,0 +1,70 @@
1
+ """LLM-powered structured extraction from scraped pages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import logging
7
+ import os
8
+ from typing import Any
9
+
10
+ import httpx
11
+
12
+ from emb.scrape import scrape_url
13
+
14
_log = logging.getLogger(__name__)

# LLM connection defaults, read from the environment once at import time.
_DEFAULT_API_KEY = os.getenv("EMBER_LLM_API_KEY", "")
_DEFAULT_BASE_URL = os.getenv("EMBER_LLM_BASE_URL", "https://api.openai.com/v1")
_DEFAULT_MODEL = os.getenv("EMBER_LLM_MODEL", "gpt-4o-mini")

# Maximum number of characters kept when page content is truncated.
_MAX_CONTENT_CHARS = 15_000
20
+
21
+
22
# Requires EMBER_LLM_API_KEY. Falls back to returning raw markdown when no key is set.
def extract(
    url: str,
    *,
    prompt: str = "",
    model: str = _DEFAULT_MODEL,
    api_key: str = _DEFAULT_API_KEY,
    base_url: str = _DEFAULT_BASE_URL,
    timeout: int = 60,
    use_browser: bool | None = None,
) -> dict[str, Any]:
    """Scrape *url* and ask a chat-completions endpoint to extract data from it.

    Args:
        url: Page to scrape.
        prompt: Extraction task; a generic instruction is used when empty.
        model: Model name sent in the request body.
        api_key: Bearer token; when empty no LLM call is made.
        base_url: API base URL; ``/chat/completions`` is appended.
        timeout: Seconds, used for both the scrape and the LLM request.
        use_browser: Passed through to ``scrape_url``.

    Returns:
        Always a dict, one of:
        - ``{"error": ...}`` when scraping fails or the LLM request raises;
        - ``{"markdown": ..., "title": ...}`` when no API key is configured;
        - the decoded reply when the model answers with a JSON object;
        - ``{"content": <raw reply>, "sources": [url]}`` for any other reply.
    """
    scraped = scrape_url(url, use_browser=use_browser, timeout=timeout)
    if not scraped.success:
        return {"error": scraped.error or "Failed to scrape URL"}

    if not api_key:
        return {"markdown": scraped.markdown, "title": scraped.title}

    # Only the first _MAX_CONTENT_CHARS characters of the page are sent.
    user_prompt = f"Page: {url}\nTitle: {scraped.title}\n\n{scraped.markdown[:_MAX_CONTENT_CHARS]}\n\n"
    if prompt:
        user_prompt += f"Task: {prompt}"
    else:
        user_prompt += "Extract the main structured information."

    try:
        resp = httpx.post(
            f"{base_url.rstrip('/')}/chat/completions",
            headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
            json={
                "model": model,
                "messages": [
                    {"role": "system", "content": "Extract the requested information from the page content."},
                    {"role": "user", "content": user_prompt},
                ],
                "temperature": 0.1,
                "max_tokens": 4096,
            },
            timeout=timeout,
        )
        resp.raise_for_status()
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        try:
            parsed = json.loads(content)
        except (json.JSONDecodeError, TypeError):
            parsed = None
        # Valid JSON is not necessarily an object: a bare list, number or
        # string would break the dict return contract, so such replies are
        # wrapped exactly like non-JSON text.
        if isinstance(parsed, dict):
            return parsed
        return {"content": content, "sources": [url]}
    except Exception as e:
        # Boundary catch: transport errors, HTTP errors and malformed response
        # shapes all collapse into one generic error for the caller.
        _log.debug("LLM request error: %s", e)
        return {"error": "LLM request failed"}
emb/api.py ADDED
@@ -0,0 +1,193 @@
1
+ """FastAPI server. Exposes every ember feature as an HTTP endpoint."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import secrets
7
+ import sys
8
+ from collections.abc import Awaitable, Callable
9
+ from typing import Any, Literal
10
+
11
+ from fastapi import FastAPI, HTTPException, Request, Response
12
+ from fastapi.responses import JSONResponse
13
+ from pydantic import BaseModel, Field
14
+
15
+ from emb import __version__
16
+ from emb._url_validator import validate_url
17
+ from emb.agent import extract as agent_extract
18
+ from emb.crawl import crawl as do_crawl
19
+ from emb.interact import interact as do_interact
20
+ from emb.map import map_url
21
+ from emb.scrape import scrape_url
22
+ from emb.search import search
23
+
24
app = FastAPI(title="ember", version=__version__)

# Shared secret expected in the X-API-Key header; empty means no authentication.
_API_KEY = os.getenv("EMBER_API_KEY", "")

if _API_KEY == "":
    # Warn once at import time that the server accepts unauthenticated calls.
    _open_server_warning = (
        "WARNING: EMBER_API_KEY is not set. The API server is open to any caller. "
        "Set EMBER_API_KEY to require authentication."
    )
    print(_open_server_warning, file=sys.stderr)
34
+
35
+
36
@app.middleware("http")
async def _auth_middleware(
    request: Request,
    call_next: Callable[[Request], Awaitable[Response]],
) -> Response:
    """Require a matching ``X-API-Key`` header when an API key is configured.

    ``/`` and ``/health`` are always let through. For every other path, if
    ``_API_KEY`` is non-empty the request must carry the same value in
    ``X-API-Key``; otherwise a 401 JSON response is returned without calling
    the downstream handler.
    """
    if request.url.path in ("/", "/health"):
        return await call_next(request)
    if _API_KEY:
        key = request.headers.get("X-API-Key", "")
        # compare_digest() raises TypeError for str arguments that contain
        # non-ASCII characters, which would turn a bad header into a 500.
        # Comparing the encoded bytes keeps the check constant-time and makes
        # such requests fail with a plain 401.
        if not secrets.compare_digest(key.encode("utf-8"), _API_KEY.encode("utf-8")):
            return JSONResponse({"detail": "Unauthorized"}, status_code=401)
    return await call_next(request)
48
+
49
+
50
def _safe_url(url: str) -> str:
    """Return *url* unchanged after it passes ``validate_url``.

    Raises:
        HTTPException: status 400 carrying the validator's message when
            ``validate_url`` rejects the URL with ``ValueError``.
    """
    try:
        validate_url(url)
    except ValueError as exc:
        # Chain explicitly so logs show the validation failure as the cause
        # rather than as an unrelated error raised during handling.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return url
56
+
57
+
58
# Request body for POST /scrape.
class ScrapeRequest(BaseModel):
    # Page to fetch; at most 2048 characters.
    url: str = Field(default=..., max_length=2048, description="URL to scrape")
    # None leaves the choice of rendering mode to the scraper.
    use_browser: bool | None = Field(default=None, description="Force browser rendering")
    # Seconds, 1 to 120.
    timeout: int = Field(default=30, ge=1, le=120)
62
+
63
+
64
# Request body for POST /search.
class SearchRequest(BaseModel):
    # Free-text query; at most 500 characters.
    query: str = Field(default=..., max_length=500, description="Search query")
    # Number of results requested, 1 to 50.
    limit: int = Field(default=5, ge=1, le=50)
67
+
68
+
69
# Request body for POST /crawl.
class CrawlRequest(BaseModel):
    # Starting point of the crawl; at most 2048 characters.
    url: str = Field(default=..., max_length=2048, description="URL to start from")
    # Page budget, 1 to 500.
    max_pages: int = Field(default=50, ge=1, le=500)
    # Depth limit, 1 to 10.
    max_depth: int = Field(default=3, ge=1, le=10)
    # Seconds, 1 to 300.
    timeout: int = Field(default=30, ge=1, le=300)
    # Pause between requests, 0 to 10 seconds.
    delay: float = Field(default=0.0, ge=0.0, le=10.0, description="Seconds between requests")
75
+
76
+
77
# Request body for POST /map.
class MapRequest(BaseModel):
    # Site to map; at most 2048 characters.
    url: str = Field(default=..., max_length=2048, description="Website URL")
    # Upper bound on returned links, 1 to 5000.
    max_links: int = Field(default=500, ge=1, le=5000)
80
+
81
+
82
# Request body for POST /interact.
class InteractRequest(BaseModel):
    # Page to open; at most 2048 characters.
    url: str = Field(default=..., max_length=2048, description="URL to open")
    # Instruction for the interaction; may be empty.
    prompt: str = Field(default="", description="Natural language action")
    # Closed set of accepted provider identifiers.
    provider: Literal[
        "openai",
        "anthropic",
        "gemini",
        "mistral",
        "huggingface",
        "vercel",
        "ollama",
        "llama_cpp",
    ] = Field(default="openai", description="LLM provider")
    # Empty string means no model override was given.
    model: str = Field(default="", max_length=128, description="Model name override")
    # Seconds, 1 to 300.
    timeout: int = Field(default=60, ge=1, le=300)
91
+
92
+
93
# Request body for POST /extract and its alias POST /agent.
class ExtractRequest(BaseModel):
    # Page to extract from; at most 2048 characters.
    url: str = Field(default=..., max_length=2048, description="URL to extract from")
    # Extraction instruction; may be empty.
    prompt: str = Field(default="", description="What to extract")
    # Model name forwarded to the extractor; at most 128 characters.
    model: str = Field(default="gpt-4o-mini", max_length=128, description="LLM model")
97
+
98
+
99
# Index route: service name, version and a summary of every endpoint.
@app.get("/")
def root() -> dict[str, Any]:
    endpoint_summaries = {
        "POST /scrape": "Extract markdown from a URL",
        "POST /crawl": "Crawl a website",
        "POST /search": "Search the web",
        "POST /map": "Discover URLs on a site",
        "POST /interact": "Browser interaction",
        "POST /extract": "LLM-powered structured extraction",
        "POST /agent": "Alias for /extract",
        "GET /health": "Health check",
    }
    return {"name": "ember", "version": __version__, "endpoints": endpoint_summaries}
115
+
116
+
117
# Scrape one validated URL; a failed scrape is reported as 502.
@app.post("/scrape")
def api_scrape(req: ScrapeRequest) -> dict[str, Any]:
    _safe_url(req.url)
    outcome = scrape_url(req.url, use_browser=req.use_browser, timeout=req.timeout)
    if outcome.success:
        return {"url": outcome.url, "title": outcome.title, "markdown": outcome.markdown}
    raise HTTPException(status_code=502, detail=outcome.error)
124
+
125
+
126
# Crawl from a validated start URL; a failed crawl is reported as 502.
@app.post("/crawl")
def api_crawl(req: CrawlRequest) -> dict[str, Any]:
    _safe_url(req.url)
    crawl_options = {
        "max_pages": req.max_pages,
        "max_depth": req.max_depth,
        "timeout": req.timeout,
        "delay": req.delay,
    }
    outcome = do_crawl(req.url, **crawl_options)
    if not outcome.success:
        raise HTTPException(status_code=502, detail=outcome.error)

    # Flatten each crawled page into a plain JSON-friendly dict.
    pages = []
    for page in outcome.pages:
        pages.append(
            {"url": page.url, "title": page.title, "markdown": page.markdown, "depth": page.depth}
        )
    return {"url": outcome.url, "total": outcome.total, "pages": pages}
144
+
145
+
146
# Run a web search; a RuntimeError from the search backend is reported as 502.
@app.post("/search")
def api_search(req: SearchRequest) -> dict[str, Any]:
    try:
        results = search(req.query, limit=req.limit)
    except RuntimeError as e:
        # Chain explicitly so logs show the backend failure as the cause.
        raise HTTPException(status_code=502, detail=str(e)) from e
    return {
        "query": req.query,
        "results": [{"url": r.url, "title": r.title, "description": r.description} for r in results],
    }
156
+
157
+
158
# Discover links on a validated site URL; a mapper error is reported as 502.
@app.post("/map")
def api_map(req: MapRequest) -> dict[str, Any]:
    _safe_url(req.url)
    site_map = map_url(req.url, max_links=req.max_links)
    if not site_map.error:
        return {"url": site_map.url, "total": site_map.total, "links": site_map.links}
    raise HTTPException(status_code=502, detail=site_map.error)
165
+
166
+
167
# Run a browser interaction on a validated URL; failure is reported as 502.
@app.post("/interact")
def api_interact(req: InteractRequest) -> dict[str, Any]:
    _safe_url(req.url)
    outcome = do_interact(
        req.url,
        prompt=req.prompt,
        provider=req.provider,
        model=req.model,
        timeout=req.timeout,
    )
    if outcome.success:
        return {"url": outcome.url, "content": outcome.content}
    raise HTTPException(status_code=502, detail=outcome.error)
174
+
175
+
176
# LLM-backed extraction, served under both /extract and /agent. A result
# containing an "error" key is reported as 502 with that message.
@app.post("/extract")
@app.post("/agent")
def api_extract(req: ExtractRequest) -> dict[str, Any]:
    _safe_url(req.url)
    extracted = agent_extract(req.url, prompt=req.prompt, model=req.model)
    if "error" not in extracted:
        return extracted
    raise HTTPException(status_code=502, detail=extracted["error"])
184
+
185
+
186
# Liveness probe: fixed status plus the package version.
@app.get("/health")
def health() -> dict[str, Any]:
    payload: dict[str, Any] = dict(status="ok", version=__version__)
    return payload
189
+
190
+
191
def start_server(host: str = "127.0.0.1", port: int = 51251) -> None:
    """Serve ``app`` with uvicorn on *host*:*port*.

    uvicorn is imported only when the server is actually started.
    """
    from uvicorn import run as serve

    serve(app, host=host, port=port)