ember-browser 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- emb/__init__.py +38 -0
- emb/_browser.py +157 -0
- emb/_url_validator.py +59 -0
- emb/agent.py +70 -0
- emb/api.py +193 -0
- emb/cli.py +1041 -0
- emb/crawl.py +174 -0
- emb/interact.py +156 -0
- emb/map.py +109 -0
- emb/mcp.py +126 -0
- emb/scrape.py +207 -0
- emb/search.py +27 -0
- emb/types.py +60 -0
- ember_browser-0.1.0.dist-info/METADATA +338 -0
- ember_browser-0.1.0.dist-info/RECORD +19 -0
- ember_browser-0.1.0.dist-info/WHEEL +5 -0
- ember_browser-0.1.0.dist-info/entry_points.txt +2 -0
- ember_browser-0.1.0.dist-info/licenses/LICENSE +11 -0
- ember_browser-0.1.0.dist-info/top_level.txt +1 -0
emb/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""ember — open source, lightweight headless browser for AI agents."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
__version__ = "0.1.0"
|
|
6
|
+
|
|
7
|
+
# Lazily re-export the most-used public functions so `from emb import scrape_url`
|
|
8
|
+
# works without loading heavy dependencies at `import emb` time.
|
|
9
|
+
#
|
|
10
|
+
# Names that clash with a same-named submodule (search, crawl) can't be re-exported
|
|
11
|
+
# this way — Python returns the submodule before __getattr__ fires. Use the submodule
|
|
12
|
+
# form for those: `from emb.search import search`, `from emb.crawl import crawl`.
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"__version__",
|
|
16
|
+
"scrape_url",
|
|
17
|
+
"scrape_url_async",
|
|
18
|
+
"scrape_markdown",
|
|
19
|
+
"scrape_markdown_async",
|
|
20
|
+
"map_url",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
_LAZY: dict[str, str] = {
|
|
24
|
+
"scrape_url": "emb.scrape",
|
|
25
|
+
"scrape_url_async": "emb.scrape",
|
|
26
|
+
"scrape_markdown": "emb.scrape",
|
|
27
|
+
"scrape_markdown_async": "emb.scrape",
|
|
28
|
+
"map_url": "emb.map",
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def __getattr__(name: str):
    """Resolve a lazily re-exported public name on first access (PEP 562).

    Looks ``name`` up in ``_LAZY``, imports the submodule registered for it and
    returns that submodule's attribute of the same name. The resolved object is
    stored in this module's globals so subsequent lookups bypass this hook.

    Raises:
        AttributeError: if ``name`` is not one of the lazily exported names.
    """
    module_path = _LAZY.get(name)
    if module_path is None:
        raise AttributeError(f"module 'emb' has no attribute {name!r}")

    import importlib  # only needed once a lazy name is actually requested

    value = getattr(importlib.import_module(module_path), name)
    # Cache the result: module __getattr__ is only consulted when normal
    # attribute lookup fails, so this makes every later access a plain lookup.
    globals()[name] = value
    return value
|
emb/_browser.py
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Auto-download and cache Lightpanda browser binary.
|
|
2
|
+
|
|
3
|
+
The browser is downloaded on first use — no manual install required.
|
|
4
|
+
Cached at ~/.cache/ember/lightpanda for subsequent use.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import os
|
|
11
|
+
import platform
|
|
12
|
+
import secrets
|
|
13
|
+
import stat
|
|
14
|
+
import subprocess
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import httpx
|
|
19
|
+
|
|
20
|
+
CACHE_DIR = Path.home() / ".cache" / "ember"
|
|
21
|
+
BINARY_NAME = "lightpanda"
|
|
22
|
+
BINARY_PATH = CACHE_DIR / BINARY_NAME
|
|
23
|
+
|
|
24
|
+
LIGHTPANDA_VERSION = "0.3.3"
|
|
25
|
+
|
|
26
|
+
# SHA-256 digests for each platform binary at LIGHTPANDA_VERSION.
|
|
27
|
+
# Update both this dict and LIGHTPANDA_VERSION together on every release bump.
|
|
28
|
+
_BINARY_HASHES: dict[tuple[str, str], str] = {
|
|
29
|
+
("Linux", "x86_64"): "b6ab613846f5291cc6bafd7f44ffb9718df51bf00eb83954e1fc5d7f52c7b886",
|
|
30
|
+
("Linux", "aarch64"): "db35c06ee074a79c2e039965c404e578748c1d22cb296e853461970ea0c2945f",
|
|
31
|
+
("Darwin", "x86_64"): "631cec32766d2f98f1005e3af6af74794fb55cc28d193d97176cfa21f1d26d0c",
|
|
32
|
+
("Darwin", "aarch64"): "1ef236a72e63975cf8acc7430e52dd31af5fff27a0b62ba81876eb8dde3e18e2",
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
_DOWNLOAD_URLS = {
|
|
36
|
+
("Linux", "x86_64"): (
|
|
37
|
+
f"https://github.com/lightpanda-io/browser/releases/download/{LIGHTPANDA_VERSION}/"
|
|
38
|
+
"lightpanda-x86_64-linux"
|
|
39
|
+
),
|
|
40
|
+
("Linux", "aarch64"): (
|
|
41
|
+
f"https://github.com/lightpanda-io/browser/releases/download/{LIGHTPANDA_VERSION}/"
|
|
42
|
+
"lightpanda-aarch64-linux"
|
|
43
|
+
),
|
|
44
|
+
("Darwin", "x86_64"): (
|
|
45
|
+
f"https://github.com/lightpanda-io/browser/releases/download/{LIGHTPANDA_VERSION}/"
|
|
46
|
+
"lightpanda-x86_64-macos"
|
|
47
|
+
),
|
|
48
|
+
("Darwin", "aarch64"): (
|
|
49
|
+
f"https://github.com/lightpanda-io/browser/releases/download/{LIGHTPANDA_VERSION}/"
|
|
50
|
+
"lightpanda-aarch64-macos"
|
|
51
|
+
),
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def is_available() -> bool:
    """Report whether a Lightpanda binary appears to be usable.

    Returns True when the cached binary file exists at ``BINARY_PATH``, or when
    ``lightpanda version`` can be launched from PATH and exits with status 0.
    Never downloads anything.
    """
    if BINARY_PATH.exists():
        return True
    try:
        probe = subprocess.run(["lightpanda", "version"], capture_output=True, timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers "not found" as well as "found but cannot be executed"
        # (PermissionError, exec format error).
        return False
    # A binary that starts but reports failure is not usable; this matches the
    # return-code check ensure() applies to the same command.
    return probe.returncode == 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# platform.machine() spellings that differ from the keys used in
# _DOWNLOAD_URLS / _BINARY_HASHES (macOS on Apple Silicon reports "arm64",
# some systems report "AMD64"/"amd64" for x86_64).
_MACHINE_ALIASES = {"arm64": "aarch64", "amd64": "x86_64"}


def _platform_key() -> tuple[str, str]:
    """Return the ``(system, machine)`` lookup key for the running platform."""
    machine = platform.machine()
    return platform.system(), _MACHINE_ALIASES.get(machine.lower(), machine)


def _platform_url() -> str | None:
    """Return the release download URL for this platform, or None if there is none."""
    return _DOWNLOAD_URLS.get(_platform_key())


def _responds_to_version(executable: str) -> bool:
    """Return True if ``<executable> version`` can be run and exits with status 0."""
    try:
        r = subprocess.run([executable, "version"], capture_output=True, text=True, timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # OSError (not just FileNotFoundError): a truncated or non-executable
        # file fails with "exec format error" / PermissionError.
        return False
    return r.returncode == 0


def ensure() -> str:
    """Return a path (or bare command name) of a working Lightpanda binary.

    Resolution order:
      1. ``EMBER_LIGHTPANDA_PATH`` — an existing file is returned as-is; a bare
         command name is accepted if ``<name> version`` succeeds.
      2. ``lightpanda`` on PATH.
      3. The cached binary at ``BINARY_PATH`` (deleted if it fails the probe).
      4. Download the release for this platform, verify its SHA-256 against
         ``_BINARY_HASHES``, mark it executable and move it into the cache.

    Raises:
        RuntimeError: if the env override is unusable, the platform has no
            download URL or registered hash, or the download/verification fails.
    """
    # Check env override
    env_path = os.environ.get("EMBER_LIGHTPANDA_PATH")
    if env_path:
        if Path(env_path).is_file():
            return str(Path(env_path))
        # Bare name (no path separator) — allow PATH resolution
        if os.sep not in env_path and "/" not in env_path:
            if _responds_to_version(env_path):
                return env_path
        raise RuntimeError(
            f"EMBER_LIGHTPANDA_PATH={env_path!r} — binary not found or not executable. "
            f"Check the path and try again."
        )

    # Check PATH
    if _responds_to_version("lightpanda"):
        return "lightpanda"

    # Check cached binary
    if BINARY_PATH.exists():
        if _responds_to_version(str(BINARY_PATH)):
            return str(BINARY_PATH)
        # Stale or corrupt cache entry: drop it and fall through to a download.
        BINARY_PATH.unlink(missing_ok=True)

    # Download
    url = _platform_url()
    if not url:
        if platform.system() == "Windows":
            raise RuntimeError(
                f"Lightpanda does not support Windows natively ({platform.machine()}).\n"
                "Agents using ember should run on Linux. For local development, use WSL2:\n"
                " wsl --install\n"
                " then run ember inside WSL.\n"
                "Browser-free features (scrape, search, crawl, map) work on Windows as-is."
            )
        raise RuntimeError(
            f"Lightpanda not available for {platform.system()} {platform.machine()}. "
            f"See: https://lightpanda.io/docs/open-source/installation"
        )

    # Refuse before touching the network: an unverifiable binary is never used,
    # so there is no point downloading it first.
    expected = _BINARY_HASHES.get(_platform_key())
    if expected is None:
        raise RuntimeError(
            f"No SHA-256 hash registered for {platform.system()} {platform.machine()}. "
            "Add the expected hash to _BINARY_HASHES before enabling downloads for this platform."
        )

    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    tmp = BINARY_PATH.with_suffix(".tmp")
    try:
        print(" Downloading Lightpanda...", file=sys.stderr)
        with httpx.stream("GET", url, follow_redirects=True, timeout=120) as resp:
            resp.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in resp.iter_bytes(65536):
                    f.write(chunk)

        # Hash the bytes that actually landed on disk, in fixed-size blocks so
        # the whole binary is never held in memory at once.
        digest = hashlib.sha256()
        with open(tmp, "rb") as f:
            for block in iter(lambda: f.read(1 << 20), b""):
                digest.update(block)
        actual = digest.hexdigest()
        if not secrets.compare_digest(actual, expected):
            raise RuntimeError(
                f"Lightpanda binary digest mismatch — download may be corrupted or tampered.\n"
                f" expected: {expected}\n"
                f" got: {actual}"
            )

        # Only a verified file is made executable and moved to its final name.
        tmp.chmod(tmp.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
        tmp.rename(BINARY_PATH)
        print(f" ✓ Lightpanda cached at {BINARY_PATH}", file=sys.stderr)
    except Exception as e:
        # Never leave a partial or unverified download behind.
        tmp.unlink(missing_ok=True)
        raise RuntimeError(f"Failed to download Lightpanda: {e}") from e

    return str(BINARY_PATH)
|
emb/_url_validator.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""URL validation to block SSRF and unsafe schemes."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import ipaddress
|
|
6
|
+
import socket
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
|
|
9
|
+
# Explicit deny-list of private, loopback, link-local and "this network" ranges.
_BLOCKED_NETWORKS = [
    ipaddress.ip_network("10.0.0.0/8"),
    ipaddress.ip_network("172.16.0.0/12"),
    ipaddress.ip_network("192.168.0.0/16"),
    ipaddress.ip_network("127.0.0.0/8"),
    ipaddress.ip_network("169.254.0.0/16"),
    ipaddress.ip_network("0.0.0.0/8"),
    ipaddress.ip_network("::1/128"),
    ipaddress.ip_network("fc00::/7"),
    ipaddress.ip_network("fe80::/10"),
]


def _is_blocked(addr_str: str) -> bool:
    """Return True if ``addr_str`` must not be contacted.

    An address is blocked when it is not globally routable, is multicast, or
    falls inside one of ``_BLOCKED_NETWORKS``. Strings that cannot be parsed as
    an IP address are blocked as well, so the check fails closed.
    """
    # Resolver output can carry an IPv6 zone ("fe80::1%eth0"); the zone is
    # irrelevant to the range checks, and dropping it keeps parsing reliable.
    literal = addr_str.split("%", 1)[0]
    try:
        addr = ipaddress.ip_address(literal)
    except ValueError:
        return True  # unknown format: refuse rather than let it through
    # IPv6-mapped IPv4 (e.g. ::ffff:10.0.0.1) — unwrap to check against IPv4 ranges
    if isinstance(addr, ipaddress.IPv6Address) and addr.ipv4_mapped is not None:
        addr = addr.ipv4_mapped
    # Catches ranges the explicit list does not name: the unspecified address
    # "::", shared address space 100.64.0.0/10, reserved and multicast blocks.
    if not addr.is_global or addr.is_multicast:
        return True
    return any(
        addr.version == network.version and addr in network
        for network in _BLOCKED_NETWORKS
    )


# DNS rebinding can bypass this check — pair with network-level egress controls
# in high-security deployments.
def validate_url(url: str) -> None:
    """Reject URLs that use an unsafe scheme or resolve to a blocked address.

    Returns None when the URL is acceptable.

    Raises:
        ValueError: if the scheme is not http/https, the URL has no hostname,
            the hostname cannot be resolved, or any resolved address is blocked.
    """
    parsed = urlparse(url)

    if parsed.scheme not in ("http", "https"):
        raise ValueError(f"URL scheme must be http or https, got {parsed.scheme!r}")

    hostname = parsed.hostname
    if not hostname:
        raise ValueError("URL has no hostname")

    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror as exc:
        raise ValueError(f"Cannot resolve hostname {hostname!r}: {exc}") from exc

    # Every resolved address must be acceptable, not just the first one.
    for info in infos:
        addr_str = info[4][0]
        if _is_blocked(addr_str):
            raise ValueError(f"URL resolves to a blocked address ({addr_str})")
|
emb/agent.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""LLM-powered structured extraction from scraped pages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import httpx
|
|
11
|
+
|
|
12
|
+
from emb.scrape import scrape_url
|
|
13
|
+
|
|
14
|
+
_log = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
_DEFAULT_API_KEY = os.environ.get("EMBER_LLM_API_KEY", "")
|
|
17
|
+
_DEFAULT_BASE_URL = os.environ.get("EMBER_LLM_BASE_URL", "https://api.openai.com/v1")
|
|
18
|
+
_DEFAULT_MODEL = os.environ.get("EMBER_LLM_MODEL", "gpt-4o-mini")
|
|
19
|
+
_MAX_CONTENT_CHARS = 15_000
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# Requires EMBER_LLM_API_KEY. Falls back to returning raw markdown when no key is set.
|
|
23
|
+
def extract(
    url: str,
    *,
    prompt: str = "",
    model: str = _DEFAULT_MODEL,
    api_key: str = _DEFAULT_API_KEY,
    base_url: str = _DEFAULT_BASE_URL,
    timeout: int = 60,
    use_browser: bool | None = None,
) -> dict[str, Any]:
    """Scrape ``url`` and ask a chat-completions endpoint to extract information.

    Args:
        url: Page to scrape.
        prompt: Extraction task; a generic instruction is used when empty.
        model: Model name sent in the request body.
        api_key: Bearer token; when empty no LLM call is made.
        base_url: Base URL to which ``/chat/completions`` is appended.
        timeout: Passed to both the scrape and the LLM HTTP request.
        use_browser: Forwarded to ``scrape_url``.

    Returns:
        Always a dict, one of:
        - ``{"error": ...}`` if scraping or the LLM request failed;
        - ``{"markdown": ..., "title": ...}`` when no API key is configured;
        - the model's reply parsed as a JSON object;
        - ``{"content": <raw reply>, "sources": [url]}`` when the reply is not
          a JSON object.
    """
    scraped = scrape_url(url, use_browser=use_browser, timeout=timeout)
    if not scraped.success:
        return {"error": scraped.error or "Failed to scrape URL"}

    if not api_key:
        return {"markdown": scraped.markdown, "title": scraped.title}

    # Page content is truncated to _MAX_CONTENT_CHARS before being sent.
    user_prompt = f"Page: {url}\nTitle: {scraped.title}\n\n{scraped.markdown[:_MAX_CONTENT_CHARS]}\n\n"
    if prompt:
        user_prompt += f"Task: {prompt}"
    else:
        user_prompt += "Extract the main structured information."

    try:
        resp = httpx.post(
            f"{base_url.rstrip('/')}/chat/completions",
            headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
            json={
                "model": model,
                "messages": [
                    {"role": "system", "content": "Extract the requested information from the page content."},
                    {"role": "user", "content": user_prompt},
                ],
                "temperature": 0.1,
                "max_tokens": 4096,
            },
            timeout=timeout,
        )
        resp.raise_for_status()
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        try:
            parsed = json.loads(content)
        except (json.JSONDecodeError, TypeError):
            parsed = None
        # json.loads also accepts arrays, numbers and strings. Only a JSON
        # object satisfies the dict return contract (callers test
        # `"error" in result`), so anything else is returned as raw content.
        if isinstance(parsed, dict):
            return parsed
        return {"content": content, "sources": [url]}
    except Exception as e:
        # Deliberate best-effort boundary: details go to the debug log only and
        # the caller receives a generic error.
        _log.debug("LLM request error: %s", e)
        return {"error": "LLM request failed"}
|
emb/api.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
"""FastAPI server. Exposes every ember feature as an HTTP endpoint."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import secrets
|
|
7
|
+
import sys
|
|
8
|
+
from collections.abc import Awaitable, Callable
|
|
9
|
+
from typing import Any, Literal
|
|
10
|
+
|
|
11
|
+
from fastapi import FastAPI, HTTPException, Request, Response
|
|
12
|
+
from fastapi.responses import JSONResponse
|
|
13
|
+
from pydantic import BaseModel, Field
|
|
14
|
+
|
|
15
|
+
from emb import __version__
|
|
16
|
+
from emb._url_validator import validate_url
|
|
17
|
+
from emb.agent import extract as agent_extract
|
|
18
|
+
from emb.crawl import crawl as do_crawl
|
|
19
|
+
from emb.interact import interact as do_interact
|
|
20
|
+
from emb.map import map_url
|
|
21
|
+
from emb.scrape import scrape_url
|
|
22
|
+
from emb.search import search
|
|
23
|
+
|
|
24
|
+
app = FastAPI(title="ember", version=__version__)
|
|
25
|
+
|
|
26
|
+
_API_KEY = os.environ.get("EMBER_API_KEY", "")
|
|
27
|
+
|
|
28
|
+
if not _API_KEY:
|
|
29
|
+
print(
|
|
30
|
+
"WARNING: EMBER_API_KEY is not set. The API server is open to any caller. "
|
|
31
|
+
"Set EMBER_API_KEY to require authentication.",
|
|
32
|
+
file=sys.stderr,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@app.middleware("http")
async def _auth_middleware(
    request: Request,
    call_next: Callable[[Request], Awaitable[Response]],
) -> Response:
    """Require a matching ``X-API-Key`` header when ``EMBER_API_KEY`` is set.

    ``/`` and ``/health`` are always served without a key. When no API key is
    configured every request is passed through. A missing or wrong key yields
    a 401 JSON response.
    """
    if request.url.path in ("/", "/health"):
        return await call_next(request)
    if _API_KEY:
        supplied = request.headers.get("X-API-Key", "")
        # compare_digest() only accepts ASCII-only str and raises TypeError
        # otherwise, so a header with a non-ASCII character would surface as a
        # server error instead of a 401. Comparing bytes avoids that while
        # keeping the constant-time comparison.
        if not secrets.compare_digest(
            supplied.encode("utf-8", "surrogateescape"),
            _API_KEY.encode("utf-8", "surrogateescape"),
        ):
            return JSONResponse({"detail": "Unauthorized"}, status_code=401)
    return await call_next(request)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _safe_url(url: str) -> str:
    """Validate ``url`` with ``validate_url`` and return it unchanged.

    Raises:
        HTTPException: status 400 carrying the validator's message when the
            URL is rejected.
    """
    try:
        validate_url(url)
    except ValueError as exc:
        # Chain explicitly so the validator failure is recorded as the cause
        # rather than as an error raised while handling another error.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    return url
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class ScrapeRequest(BaseModel):
    """Request body for ``POST /scrape``."""

    # Target page; length is capped at 2048 characters.
    url: str = Field(default=..., max_length=2048, description="URL to scrape")
    # None leaves the rendering choice to scrape_url.
    use_browser: bool | None = Field(default=None, description="Force browser rendering")
    # Bounded to 1..120.
    timeout: int = Field(default=30, ge=1, le=120)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class SearchRequest(BaseModel):
    """Request body for ``POST /search``."""

    # Query text; length is capped at 500 characters.
    query: str = Field(default=..., max_length=500, description="Search query")
    # Number of results requested, bounded to 1..50.
    limit: int = Field(default=5, ge=1, le=50)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class CrawlRequest(BaseModel):
    """Request body for ``POST /crawl``."""

    # Starting page; length is capped at 2048 characters.
    url: str = Field(default=..., max_length=2048, description="URL to start from")
    # Crawl limits, each bounded by the ge/le constraints below.
    max_pages: int = Field(default=50, ge=1, le=500)
    max_depth: int = Field(default=3, ge=1, le=10)
    timeout: int = Field(default=30, ge=1, le=300)
    delay: float = Field(default=0.0, ge=0.0, le=10.0, description="Seconds between requests")
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
class MapRequest(BaseModel):
    """Request body for ``POST /map``."""

    # Site to map; length is capped at 2048 characters.
    url: str = Field(default=..., max_length=2048, description="Website URL")
    # Upper bound on returned links, constrained to 1..5000.
    max_links: int = Field(default=500, ge=1, le=5000)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class InteractRequest(BaseModel):
    """Request body for ``POST /interact``."""

    # Page to open; length is capped at 2048 characters.
    url: str = Field(default=..., max_length=2048, description="URL to open")
    prompt: str = Field(default="", description="Natural language action")
    # Closed set of accepted provider names.
    provider: Literal[
        "openai",
        "anthropic",
        "gemini",
        "mistral",
        "huggingface",
        "vercel",
        "ollama",
        "llama_cpp",
    ] = Field(default="openai", description="LLM provider")
    model: str = Field(default="", max_length=128, description="Model name override")
    # Bounded to 1..300.
    timeout: int = Field(default=60, ge=1, le=300)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ExtractRequest(BaseModel):
    """Request body for ``POST /extract`` and ``POST /agent``."""

    # Page to extract from; length is capped at 2048 characters.
    url: str = Field(default=..., max_length=2048, description="URL to extract from")
    prompt: str = Field(default="", description="What to extract")
    # NOTE(review): this default is a literal; it does not read EMBER_LLM_MODEL.
    model: str = Field(default="gpt-4o-mini", max_length=128, description="LLM model")
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@app.get("/")
def root() -> dict[str, Any]:
    """Describe the service: name, version and a summary of each endpoint."""
    endpoints = {
        "POST /scrape": "Extract markdown from a URL",
        "POST /crawl": "Crawl a website",
        "POST /search": "Search the web",
        "POST /map": "Discover URLs on a site",
        "POST /interact": "Browser interaction",
        "POST /extract": "LLM-powered structured extraction",
        "POST /agent": "Alias for /extract",
        "GET /health": "Health check",
    }
    return {"name": "ember", "version": __version__, "endpoints": endpoints}
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
@app.post("/scrape")
def api_scrape(req: ScrapeRequest) -> dict[str, Any]:
    """Scrape one URL and return its url, title and markdown.

    Responds with 400 if the URL fails validation and 502 if scraping fails.
    """
    _safe_url(req.url)
    page = scrape_url(req.url, use_browser=req.use_browser, timeout=req.timeout)
    if page.success:
        return {"url": page.url, "title": page.title, "markdown": page.markdown}
    raise HTTPException(status_code=502, detail=page.error)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@app.post("/crawl")
def api_crawl(req: CrawlRequest) -> dict[str, Any]:
    """Crawl from a starting URL and return every page that was collected.

    Responds with 400 if the URL fails validation and 502 if the crawl fails.
    """
    _safe_url(req.url)
    crawl_options = {
        "max_pages": req.max_pages,
        "max_depth": req.max_depth,
        "timeout": req.timeout,
        "delay": req.delay,
    }
    outcome = do_crawl(req.url, **crawl_options)
    if not outcome.success:
        raise HTTPException(status_code=502, detail=outcome.error)

    body: dict[str, Any] = {"url": outcome.url, "total": outcome.total}
    # Flatten each crawled page into a plain JSON-serialisable dict.
    pages = []
    for page in outcome.pages:
        pages.append(
            {"url": page.url, "title": page.title, "markdown": page.markdown, "depth": page.depth}
        )
    body["pages"] = pages
    return body
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
@app.post("/search")
def api_search(req: SearchRequest) -> dict[str, Any]:
    """Run a web search and return the query with its results.

    Each result is reduced to its url, title and description. A RuntimeError
    from the search backend is reported as a 502 carrying its message.
    """
    try:
        results = search(req.query, limit=req.limit)
    except RuntimeError as e:
        # Chain explicitly so the backend failure is kept as the cause.
        raise HTTPException(status_code=502, detail=str(e)) from e
    return {
        "query": req.query,
        "results": [{"url": r.url, "title": r.title, "description": r.description} for r in results],
    }
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
@app.post("/map")
def api_map(req: MapRequest) -> dict[str, Any]:
    """Discover links on a site and return them with their count.

    Responds with 400 if the URL fails validation and 502 if mapping reports
    an error.
    """
    _safe_url(req.url)
    site_map = map_url(req.url, max_links=req.max_links)
    if not site_map.error:
        return {"url": site_map.url, "total": site_map.total, "links": site_map.links}
    raise HTTPException(status_code=502, detail=site_map.error)
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
@app.post("/interact")
def api_interact(req: InteractRequest) -> dict[str, Any]:
    """Open a URL, perform the prompted interaction and return the content.

    Responds with 400 if the URL fails validation and 502 if the interaction
    fails.
    """
    _safe_url(req.url)
    outcome = do_interact(
        req.url,
        prompt=req.prompt,
        provider=req.provider,
        model=req.model,
        timeout=req.timeout,
    )
    if outcome.success:
        return {"url": outcome.url, "content": outcome.content}
    raise HTTPException(status_code=502, detail=outcome.error)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
@app.post("/extract")
@app.post("/agent")
def api_extract(req: ExtractRequest) -> dict[str, Any]:
    """Run LLM-backed extraction on a URL; ``/agent`` is an alias of ``/extract``.

    Responds with 400 if the URL fails validation and 502 when the extraction
    result carries an ``"error"`` key.
    """
    _safe_url(req.url)
    result = agent_extract(req.url, prompt=req.prompt, model=req.model)
    if not isinstance(result, dict):
        # agent_extract returns whatever json.loads() produced from the model
        # reply, which can be a list, number or string. Wrap it so the
        # membership test below and the declared dict return type both hold.
        return {"content": result, "sources": [req.url]}
    # NOTE(review): a successful extraction whose JSON legitimately contains an
    # "error" key is indistinguishable from a failure here.
    if "error" in result:
        raise HTTPException(status_code=502, detail=result["error"])
    return result
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
@app.get("/health")
def health() -> dict[str, Any]:
    """Liveness probe: report an ``ok`` status and the package version."""
    payload: dict[str, Any] = {"status": "ok"}
    payload["version"] = __version__
    return payload
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def start_server(host: str = "127.0.0.1", port: int = 51251) -> None:
    """Serve ``app`` with uvicorn on ``host``:``port``.

    uvicorn is imported inside the function, so it is only required when the
    server is actually started.
    """
    from uvicorn import run as serve

    serve(app, host=host, port=port)
|