botbrowser 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- botbrowser-0.1.0/.gitignore +13 -0
- botbrowser-0.1.0/PKG-INFO +77 -0
- botbrowser-0.1.0/README.md +48 -0
- botbrowser-0.1.0/botbrowser/__init__.py +15 -0
- botbrowser-0.1.0/botbrowser/cleaner.py +99 -0
- botbrowser-0.1.0/botbrowser/client.py +79 -0
- botbrowser-0.1.0/botbrowser/converter.py +49 -0
- botbrowser-0.1.0/botbrowser/core.py +186 -0
- botbrowser-0.1.0/botbrowser/fetcher.py +63 -0
- botbrowser-0.1.0/botbrowser/models.py +45 -0
- botbrowser-0.1.0/pyproject.toml +47 -0
- botbrowser-0.1.0/tests/__init__.py +0 -0
- botbrowser-0.1.0/tests/test_core.py +249 -0
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: botbrowser
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Token-efficient web content extraction for LLM agents
|
|
5
|
+
Project-URL: Homepage, https://github.com/AmplifyCo/botbrowser
|
|
6
|
+
Project-URL: Repository, https://github.com/AmplifyCo/botbrowser
|
|
7
|
+
Project-URL: Issues, https://github.com/AmplifyCo/botbrowser/issues
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: ai-agents,content-extraction,llm,token-optimization,web-scraping
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
20
|
+
Requires-Python: >=3.10
|
|
21
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
22
|
+
Requires-Dist: httpx>=0.27.0
|
|
23
|
+
Requires-Dist: markdownify>=0.13.0
|
|
24
|
+
Requires-Dist: pydantic>=2.0.0
|
|
25
|
+
Requires-Dist: trafilatura>=1.12.0
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# BotBrowser (Python)
|
|
31
|
+
|
|
32
|
+
Token-efficient web content extraction for LLM agents.
|
|
33
|
+
|
|
34
|
+
## Install
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
pip install botbrowser
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from botbrowser import extract
|
|
44
|
+
|
|
45
|
+
result = extract("https://example.com")
|
|
46
|
+
|
|
47
|
+
print(result.content) # clean markdown
|
|
48
|
+
print(result.metadata.token_savings_percent) # e.g. 94
|
|
49
|
+
print(result.title) # page title
|
|
50
|
+
print(result.links) # extracted links
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Options
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
result = extract(
|
|
57
|
+
"https://example.com",
|
|
58
|
+
format="text", # "markdown" (default) or "text"
|
|
59
|
+
timeout=10000, # request timeout in ms (default: 15000)
|
|
60
|
+
include_links=False, # extract links (default: True)
|
|
61
|
+
)
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Client Mode
|
|
65
|
+
|
|
66
|
+
If you're running the BotBrowser REST API server, you can use the client:
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from botbrowser import BotBrowserClient
|
|
70
|
+
|
|
71
|
+
client = BotBrowserClient("http://localhost:3000")
|
|
72
|
+
result = client.extract("https://example.com")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## License
|
|
76
|
+
|
|
77
|
+
MIT
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
# BotBrowser (Python)
|
|
2
|
+
|
|
3
|
+
Token-efficient web content extraction for LLM agents.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install botbrowser
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from botbrowser import extract
|
|
15
|
+
|
|
16
|
+
result = extract("https://example.com")
|
|
17
|
+
|
|
18
|
+
print(result.content) # clean markdown
|
|
19
|
+
print(result.metadata.token_savings_percent) # e.g. 94
|
|
20
|
+
print(result.title) # page title
|
|
21
|
+
print(result.links) # extracted links
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Options
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
result = extract(
|
|
28
|
+
"https://example.com",
|
|
29
|
+
format="text", # "markdown" (default) or "text"
|
|
30
|
+
timeout=10000, # request timeout in ms (default: 15000)
|
|
31
|
+
include_links=False, # extract links (default: True)
|
|
32
|
+
)
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
## Client Mode
|
|
36
|
+
|
|
37
|
+
If you're running the BotBrowser REST API server, you can use the client:
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from botbrowser import BotBrowserClient
|
|
41
|
+
|
|
42
|
+
client = BotBrowserClient("http://localhost:3000")
|
|
43
|
+
result = client.extract("https://example.com")
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
## License
|
|
47
|
+
|
|
48
|
+
MIT
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""BotBrowser — Token-efficient web content extraction for LLM agents."""
|
|
2
|
+
|
|
3
|
+
from botbrowser.core import extract
|
|
4
|
+
from botbrowser.client import BotBrowserClient
|
|
5
|
+
from botbrowser.models import BotBrowserResult, ExtractOptions, ExtractedLink, ExtractionMetadata
|
|
6
|
+
|
|
7
|
+
__version__ = "0.1.0"
|
|
8
|
+
__all__ = [
|
|
9
|
+
"extract",
|
|
10
|
+
"BotBrowserClient",
|
|
11
|
+
"BotBrowserResult",
|
|
12
|
+
"ExtractOptions",
|
|
13
|
+
"ExtractedLink",
|
|
14
|
+
"ExtractionMetadata",
|
|
15
|
+
]
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""HTML cleaning pipeline — strips bloat from web pages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup, Comment, Tag
|
|
6
|
+
|
|
7
|
+
# Tags whose content is never useful to an LLM (executable code, media,
# embeds); removed entirely along with their children.
REMOVE_TAGS = {
    "script", "style", "noscript", "iframe", "object", "embed",
    "applet", "svg", "canvas", "video", "audio", "source", "track",
    "map", "area",
}

# CSS selectors for common boilerplate/chrome (navigation, ads, cookie
# banners, social widgets, ...) that adds tokens without content value.
REMOVE_SELECTORS = [
    "nav",
    "[role='navigation']",
    "[role='banner']",
    "[role='complementary']",
    "[aria-hidden='true']",
    ".ad", ".ads", ".advertisement", ".adsbygoogle",
    ".sidebar", ".side-bar",
    ".cookie-banner", ".cookie-notice", ".cookie-consent",
    ".popup", ".modal", ".overlay",
    ".social-share", ".share-buttons", ".social-links",
    ".comments", ".comment-section", "#comments",
    ".newsletter", ".subscribe",
    ".breadcrumb", ".breadcrumbs",
    ".pagination",
    ".related-posts", ".related-articles",
    ".widget",
    "[data-ad]",
    "[data-tracking]",
]

# Attribute-name prefixes stripped from every element: data-* payloads,
# aria-* accessibility hints, and on* inline event handlers.
STRIP_ATTR_PREFIXES = ("data-", "aria-", "on")

# Void elements that legitimately have no text content; exempt from the
# empty-element removal pass in clean_html().
SELF_CLOSING = {"img", "br", "hr", "input", "meta", "link"}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def clean_html(html: str) -> str:
    """Remove non-content elements and unnecessary attributes from HTML.

    The pipeline runs several passes over the parsed document:
      1. strip HTML comments;
      2. drop tags that never carry readable content (REMOVE_TAGS);
      3. drop boilerplate elements matched by REMOVE_SELECTORS;
      4. drop hidden elements (``hidden`` attribute or inline
         display:none / visibility:hidden styles);
      5. strip presentation/tracking attributes (style, class, id, role,
         data-*, aria-*, on* handlers, ...);
      6. drop elements left with no text and no images.

    Args:
        html: Raw HTML markup (a fragment or a full document).

    Returns:
        The cleaned HTML serialized back to a string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Remove HTML comments.
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()

    # Remove unwanted tags entirely (children included).
    for tag_name in REMOVE_TAGS:
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # Remove non-content elements by selector. Some selectors may not be
    # supported by the underlying CSS engine; skip those silently.
    for selector in REMOVE_SELECTORS:
        try:
            for el in soup.select(selector):
                el.decompose()
        except Exception:
            pass

    # Remove hidden elements. find_all() snapshots the tree, so a node in
    # the list may already have been removed via a decomposed ancestor
    # earlier in this loop — guard against touching such nodes again.
    for el in soup.find_all(True):
        if not isinstance(el, Tag) or getattr(el, "decomposed", False):
            continue
        if el.has_attr("hidden"):
            el.decompose()
            continue
        style = el.get("style", "")
        if isinstance(style, str):
            # Normalize away spaces so "display: none" is caught too.
            compact = style.replace(" ", "")
            if "display:none" in compact or "visibility:hidden" in compact:
                el.decompose()

    # Strip unnecessary attributes (presentation, tracking, handlers).
    for el in soup.find_all(True):
        if not isinstance(el, Tag):
            continue
        attrs_to_remove = []
        for attr_name in list(el.attrs.keys()):
            lower = attr_name.lower()
            if lower in ("style", "class", "id", "role", "tabindex", "draggable", "contenteditable"):
                attrs_to_remove.append(attr_name)
            elif any(lower.startswith(p) for p in STRIP_ATTR_PREFIXES):
                attrs_to_remove.append(attr_name)
        for attr_name in attrs_to_remove:
            del el[attr_name]

    # Remove elements that carry no text and no images. Void elements
    # (SELF_CLOSING) are kept — they are legitimately empty. Same
    # decomposed-ancestor guard as in the hidden-element pass.
    for el in soup.find_all(True):
        if not isinstance(el, Tag) or getattr(el, "decomposed", False):
            continue
        if (
            el.name not in SELF_CLOSING
            and not el.get_text(strip=True)
            and not el.find("img")
        ):
            el.decompose()

    return str(soup)
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""HTTP client for BotBrowser REST API server."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
from botbrowser.models import BotBrowserResult
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BotBrowserClient:
    """
    Client for the BotBrowser REST API server.

    Usage:
        client = BotBrowserClient("http://localhost:3000")
        result = client.extract("https://example.com")
        print(result.content)
    """

    def __init__(self, server_url: str = "http://localhost:3000") -> None:
        # Normalize the base URL so path joining never doubles slashes.
        self.server_url = server_url.rstrip("/")
        self._client = httpx.Client(timeout=30)

    def extract(
        self,
        url: str,
        *,
        format: str = "markdown",
        timeout: int = 15000,
        include_links: bool = True,
    ) -> BotBrowserResult:
        """Extract content via the BotBrowser REST API server."""
        payload = {
            "url": url,
            "format": format,
            "timeout": timeout,
            "includeLinks": include_links,
        }
        response = self._client.post(f"{self.server_url}/extract", json=payload)
        response.raise_for_status()
        data = response.json()

        meta = data["metadata"]

        def pick(camel: str, snake: str, default):
            # The server replies in camelCase; tolerate snake_case too.
            return meta.get(camel, meta.get(snake, default))

        extracted_links = [
            {"text": link["text"], "href": link["href"]}
            for link in data.get("links", [])
        ]

        # Map the camelCase API response onto the snake_case Python model.
        return BotBrowserResult(
            url=data["url"],
            title=data["title"],
            description=data["description"],
            content=data["content"],
            text_content=data.get("textContent", data.get("text_content", "")),
            links=extracted_links,
            metadata={
                "raw_token_estimate": pick("rawTokenEstimate", "raw_token_estimate", 0),
                "clean_token_estimate": pick("cleanTokenEstimate", "clean_token_estimate", 0),
                "token_savings_percent": pick("tokenSavingsPercent", "token_savings_percent", 0),
                "word_count": pick("wordCount", "word_count", 0),
                "fetched_at": pick("fetchedAt", "fetched_at", ""),
            },
        )

    def health(self) -> dict:
        """Check server health."""
        resp = self._client.get(f"{self.server_url}/health")
        resp.raise_for_status()
        return resp.json()

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> BotBrowserClient:
        return self

    def __exit__(self, *args: object) -> None:
        self.close()
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""HTML to Markdown/text conversion."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from markdownify import markdownify
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def html_to_markdown(html: str) -> str:
    """Convert HTML to clean markdown.

    Args:
        html: HTML markup (fragment or document).

    Returns:
        Markdown with ATX headings, "-" bullets, and at most one blank
        line between blocks.
    """
    # Pre-process: drop <img> tags whose src is a data: URI (bloated
    # base64 payloads) but keep normal images. Matches both single- and
    # double-quoted attribute values (the previous pattern only handled
    # double quotes, so src='data:...' slipped through).
    html = re.sub(r"""<img[^>]+src=["']data:[^"']*["'][^>]*/?>""", "", html)

    markdown = markdownify(
        html,
        heading_style="ATX",
        bullets="-",
        # Whitelist of tags to convert; everything else is dropped.
        convert=["a", "p", "h1", "h2", "h3", "h4", "h5", "h6",
                 "ul", "ol", "li", "table", "thead", "tbody", "tr", "th", "td",
                 "blockquote", "pre", "code", "em", "strong", "br", "hr", "img"],
    )

    # Clean up excessive whitespace.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)  # max 2 consecutive newlines
    markdown = re.sub(r"[ \t]+$", "", markdown, flags=re.MULTILINE)  # trailing whitespace
    markdown = markdown.strip()

    return markdown
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def html_to_text(html: str) -> str:
    """Convert HTML to plain text.

    Converts to markdown first, then strips markdown syntax so only the
    readable text remains.
    """
    text = html_to_markdown(html)

    text = re.sub(r"#{1,6}\s+", "", text)  # heading markers
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)  # bold
    text = re.sub(r"\*(.+?)\*", r"\1", text)  # italic
    # Images must be stripped BEFORE links: the link pattern matches the
    # [alt](url) part of ![alt](url) and would leave a stray "!alt",
    # making the image rule below dead code.
    text = re.sub(r"!\[.*?\]\(.+?\)", "", text)  # images
    text = re.sub(r"\[(.+?)\]\(.+?\)", r"\1", text)  # links, keep text
    text = re.sub(r"`{1,3}[^`]*`{1,3}", lambda m: m.group().strip("`"), text)  # code markers
    text = re.sub(r"^[-*+]\s+", "", text, flags=re.MULTILINE)  # list markers
    text = re.sub(r"^\d+\.\s+", "", text, flags=re.MULTILINE)  # numbered list markers
    text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)  # blockquote markers
    # Remove horizontal rules only when they occupy a whole line; the old
    # blanket replace("---", "") also deleted literal dash runs inside
    # body text.
    text = re.sub(r"^-{3,}\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
"""Core extraction engine — native Python implementation."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import math
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
from urllib.parse import urljoin, urlparse
|
|
8
|
+
|
|
9
|
+
from bs4 import BeautifulSoup
|
|
10
|
+
|
|
11
|
+
from botbrowser.cleaner import clean_html
|
|
12
|
+
from botbrowser.converter import html_to_markdown, html_to_text
|
|
13
|
+
from botbrowser.fetcher import fetch_page
|
|
14
|
+
from botbrowser.models import (
|
|
15
|
+
BotBrowserResult,
|
|
16
|
+
ExtractedLink,
|
|
17
|
+
ExtractionMetadata,
|
|
18
|
+
ExtractOptions,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
import trafilatura
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _estimate_tokens(text: str) -> int:
|
|
25
|
+
"""Rough token estimation: ~4 chars per token for English text."""
|
|
26
|
+
return math.ceil(len(text) / 4)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _extract_description(html: str) -> str:
    """Extract the meta description from HTML.

    Prefers <meta name="description">; falls back to the Open Graph
    og:description tag; returns "" when neither is present.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Try the standard meta tag first, then the Open Graph fallback.
    for attrs in ({"name": "description"}, {"property": "og:description"}):
        tag = soup.find("meta", attrs=attrs)
        if tag and tag.get("content"):
            return str(tag["content"])

    return ""
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _extract_title(html: str) -> str:
    """Extract the page title from HTML.

    Prefers the <title> element; falls back to the Open Graph og:title
    meta tag; returns "" when neither yields text.
    """
    soup = BeautifulSoup(html, "html.parser")

    title_tag = soup.find("title")
    if title_tag:
        # get_text() rather than .string: .string is None when the title
        # contains nested markup, which silently dropped valid titles.
        title = title_tag.get_text(strip=True)
        if title:
            return title

    og_title = soup.find("meta", attrs={"property": "og:title"})
    if og_title and og_title.get("content"):
        return str(og_title["content"])

    return ""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _extract_links(html: str, base_url: str) -> list[ExtractedLink]:
    """Extract unique HTTP(S) links from HTML content.

    Args:
        html: The HTML to scan for <a href> anchors.
        base_url: Base URL used to resolve relative hrefs.

    Returns:
        De-duplicated links in document order. Anchors with no visible
        text and same-page fragment links are skipped.
    """
    soup = BeautifulSoup(html, "html.parser")
    links: list[ExtractedLink] = []
    seen: set[str] = set()
    # Hoisted out of the loop — the base path never changes per anchor
    # (previously re-parsed base_url for every link).
    base_path = urlparse(base_url).path

    for a in soup.find_all("a", href=True):
        href = a["href"]

        # Resolve relative URLs against the page URL.
        try:
            absolute_url = urljoin(base_url, href)
        except Exception:
            continue

        # Keep only http(s); this drops mailto:, tel:, javascript:, etc.
        parsed = urlparse(absolute_url)
        if parsed.scheme not in ("http", "https"):
            continue
        # Skip in-page fragment links (same path, only a #fragment).
        if parsed.fragment and parsed.path == base_path:
            continue

        if absolute_url in seen:
            continue
        seen.add(absolute_url)

        text = a.get_text(strip=True)
        if text:
            links.append(ExtractedLink(text=text, href=absolute_url))

    return links
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def extract(
    url_or_options: str | ExtractOptions | None = None,
    *,
    url: str | None = None,
    format: str = "markdown",
    timeout: int = 15000,
    include_links: bool = True,
    headers: dict[str, str] | None = None,
) -> BotBrowserResult:
    """
    Extract clean, token-efficient content from a web page.

    Usage:
        result = extract("https://example.com")
        result = extract("https://example.com", format="text")
        result = extract(ExtractOptions(url="https://example.com"))

    Args:
        url_or_options: Target URL as a string, or a fully-populated
            ExtractOptions (which takes precedence over keyword args).
        url: Keyword-only alternative way to pass the target URL.
        format: "markdown" (default) or "text".
        timeout: Request timeout in milliseconds.
        include_links: Whether to extract links from the raw page.
        headers: Extra HTTP headers merged over the fetch defaults.

    Returns:
        BotBrowserResult with content, metadata and (optionally) links.

    Raises:
        ValueError: If no URL is supplied.
    """
    # Normalize arguments into a single ExtractOptions instance
    # (previously duplicated the constructor call in two branches).
    if isinstance(url_or_options, ExtractOptions):
        opts = url_or_options
    else:
        target = url_or_options if isinstance(url_or_options, str) else url
        if target is None:
            raise ValueError("url is required")
        opts = ExtractOptions(
            url=target,
            format=format,  # type: ignore[arg-type]
            timeout=timeout,
            include_links=include_links,
            headers=headers,
        )

    # Step 1: Fetch the page.
    fetched = fetch_page(opts.url, timeout=opts.timeout, headers=opts.headers)
    raw_token_estimate = _estimate_tokens(fetched.html)

    # Step 2: Extract metadata from the raw HTML (cleaning may strip it).
    title = _extract_title(fetched.html)
    description = _extract_description(fetched.html)

    # Step 3: Extract the main content region using trafilatura.
    main_content_html = trafilatura.extract(
        fetched.html,
        output_format="html",
        include_links=True,
        include_tables=True,
        include_formatting=True,
    )

    # Step 4: Clean the HTML; fall back to the full page HTML when
    # trafilatura cannot identify a main content region.
    cleaned_html = clean_html(main_content_html or fetched.html)

    # Step 5: Convert to the requested output format. The plain-text
    # rendering is always needed (word count / text_content), so reuse
    # it instead of converting twice when format == "text".
    text_content = html_to_text(cleaned_html)
    if opts.format == "markdown":
        content = html_to_markdown(cleaned_html)
    else:
        content = text_content

    # Step 6: Extract links from the raw HTML (not the cleaned HTML —
    # cleaning strips nav links).
    links = _extract_links(fetched.html, fetched.final_url) if opts.include_links else []

    clean_token_estimate = _estimate_tokens(content)
    savings = (
        round((1 - clean_token_estimate / raw_token_estimate) * 100)
        if raw_token_estimate > 0
        else 0
    )

    return BotBrowserResult(
        url=fetched.final_url,
        title=title,
        description=description,
        content=content,
        text_content=text_content,
        links=links,
        metadata=ExtractionMetadata(
            raw_token_estimate=raw_token_estimate,
            clean_token_estimate=clean_token_estimate,
            token_savings_percent=savings,
            word_count=len(text_content.split()),
            fetched_at=datetime.now(timezone.utc).isoformat(),
        ),
    )
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
"""HTTP fetching with smart defaults."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import random
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
import httpx
|
|
9
|
+
|
|
10
|
+
# Pool of User-Agent strings; one is picked at random per request. Includes
# an honest bot UA plus common browser UAs for sites that vary responses.
USER_AGENTS = [
    "Mozilla/5.0 (compatible; BotBrowser/0.1; +https://github.com/AmplifyCo/botbrowser)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
]


@dataclass
class FetchResult:
    """Result of fetching a single web page."""

    html: str          # response body decoded as text
    final_url: str     # URL after following any redirects
    status_code: int   # HTTP status of the final response
    content_type: str  # Content-Type header of the final response
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def fetch_page(
    url: str,
    *,
    timeout: int = 15000,
    headers: dict[str, str] | None = None,
) -> FetchResult:
    """Fetch a web page with smart defaults.

    Args:
        url: The page URL to fetch.
        timeout: Request timeout in milliseconds.
        headers: Optional extra headers; they override the defaults on
            conflict.

    Returns:
        FetchResult with the body, final URL, status and content type.

    Raises:
        httpx.HTTPStatusError: On non-2xx responses.
        ValueError: When the response is not an HTML page.
    """
    # Rotate user agents and advertise an HTML-preferring Accept header;
    # caller-supplied headers win over these defaults.
    request_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        **(headers or {}),
    }

    response = httpx.get(
        url,
        headers=request_headers,
        follow_redirects=True,
        timeout=timeout / 1000,  # httpx expects seconds
    )
    response.raise_for_status()

    content_type = response.headers.get("content-type", "")
    is_html = "text/html" in content_type or "application/xhtml" in content_type
    if not is_html:
        raise ValueError(
            f"Unsupported content type: {content_type}. Only HTML pages are supported."
        )

    return FetchResult(
        html=response.text,
        final_url=str(response.url),
        status_code=response.status_code,
        content_type=content_type,
    )
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Data models for BotBrowser."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
from pydantic import BaseModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ExtractOptions(BaseModel):
    """Options for content extraction."""

    url: str  # target page URL (required)
    format: Literal["markdown", "text"] = "markdown"  # output format
    timeout: int = 15000  # request timeout in milliseconds
    include_links: bool = True  # whether to collect links from the page
    headers: dict[str, str] | None = None  # extra HTTP request headers
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ExtractedLink(BaseModel):
    """A link extracted from page content."""

    text: str  # visible anchor text
    href: str  # absolute URL the anchor points to
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class ExtractionMetadata(BaseModel):
    """Metadata about the extraction including token savings."""

    raw_token_estimate: int  # estimated tokens in the raw fetched HTML
    clean_token_estimate: int  # estimated tokens in the cleaned output
    token_savings_percent: int  # rounded percentage saved vs. raw
    word_count: int  # words in the plain-text rendering
    fetched_at: str  # ISO-8601 timestamp of the fetch
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class BotBrowserResult(BaseModel):
    """Result of content extraction from a web page."""

    url: str  # final URL after redirects
    title: str  # page title ("" if none found)
    description: str  # meta description ("" if none found)
    content: str  # markdown or plain text, per ExtractOptions.format
    text_content: str  # plain-text rendering of the content
    links: list[ExtractedLink]  # extracted links ([] when disabled)
    metadata: ExtractionMetadata  # token/word statistics
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "botbrowser"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Token-efficient web content extraction for LLM agents"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
keywords = ["web-scraping", "content-extraction", "llm", "ai-agents", "token-optimization"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3.10",
|
|
19
|
+
"Programming Language :: Python :: 3.11",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Programming Language :: Python :: 3.13",
|
|
22
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
23
|
+
"Topic :: Software Development :: Libraries",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"trafilatura>=1.12.0",
|
|
27
|
+
"httpx>=0.27.0",
|
|
28
|
+
"markdownify>=0.13.0",
|
|
29
|
+
"pydantic>=2.0.0",
|
|
30
|
+
"beautifulsoup4>=4.12.0",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/AmplifyCo/botbrowser"
|
|
35
|
+
Repository = "https://github.com/AmplifyCo/botbrowser"
|
|
36
|
+
Issues = "https://github.com/AmplifyCo/botbrowser/issues"
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
dev = [
|
|
40
|
+
"pytest>=8.0.0",
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
[tool.hatch.build.targets.wheel]
|
|
44
|
+
packages = ["botbrowser"]
|
|
45
|
+
|
|
46
|
+
[tool.pytest.ini_options]
|
|
47
|
+
testpaths = ["tests"]
|
|
File without changes
|
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
"""Tests for BotBrowser core extraction."""
|
|
2
|
+
|
|
3
|
+
from botbrowser.cleaner import clean_html
|
|
4
|
+
from botbrowser.converter import html_to_markdown, html_to_text
|
|
5
|
+
from botbrowser.models import BotBrowserResult, ExtractOptions, ExtractedLink, ExtractionMetadata
|
|
6
|
+
from botbrowser.core import _extract_title, _extract_description, _extract_links, _estimate_tokens
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
SAMPLE_HTML = """
|
|
10
|
+
<html>
|
|
11
|
+
<head>
|
|
12
|
+
<title>Test Page</title>
|
|
13
|
+
<meta name="description" content="A test page for BotBrowser">
|
|
14
|
+
<script>console.log('tracking');</script>
|
|
15
|
+
<style>body { color: red; }</style>
|
|
16
|
+
</head>
|
|
17
|
+
<body>
|
|
18
|
+
<nav><a href="/">Home</a> <a href="/about">About</a></nav>
|
|
19
|
+
<main>
|
|
20
|
+
<h1>Hello World</h1>
|
|
21
|
+
<p>This is the <strong>main content</strong> of the page.</p>
|
|
22
|
+
<p>It has <a href="https://example.com">a link</a> and some text.</p>
|
|
23
|
+
<ul>
|
|
24
|
+
<li>Item one</li>
|
|
25
|
+
<li>Item two</li>
|
|
26
|
+
<li>Item three</li>
|
|
27
|
+
</ul>
|
|
28
|
+
</main>
|
|
29
|
+
<footer>Copyright 2026</footer>
|
|
30
|
+
<div class="cookie-banner">We use cookies</div>
|
|
31
|
+
<div style="display:none">Hidden content</div>
|
|
32
|
+
</body>
|
|
33
|
+
</html>
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# --- Cleaner tests ---
|
|
38
|
+
|
|
39
|
+
def test_clean_html_removes_scripts():
|
|
40
|
+
result = clean_html(SAMPLE_HTML)
|
|
41
|
+
assert "<script>" not in result
|
|
42
|
+
assert "tracking" not in result
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def test_clean_html_removes_styles():
|
|
46
|
+
result = clean_html(SAMPLE_HTML)
|
|
47
|
+
assert "<style>" not in result
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_clean_html_removes_nav():
|
|
51
|
+
result = clean_html(SAMPLE_HTML)
|
|
52
|
+
assert "<nav>" not in result
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def test_clean_html_removes_cookie_banner():
|
|
56
|
+
result = clean_html(SAMPLE_HTML)
|
|
57
|
+
assert "cookie" not in result.lower()
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_clean_html_removes_hidden():
|
|
61
|
+
result = clean_html(SAMPLE_HTML)
|
|
62
|
+
assert "Hidden content" not in result
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def test_clean_html_preserves_main_content():
|
|
66
|
+
result = clean_html(SAMPLE_HTML)
|
|
67
|
+
assert "Hello World" in result
|
|
68
|
+
assert "main content" in result
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def test_clean_html_strips_attributes():
|
|
72
|
+
html = '<div class="foo" id="bar" data-track="yes" style="color:red"><p>Text</p></div>'
|
|
73
|
+
result = clean_html(html)
|
|
74
|
+
assert 'class=' not in result
|
|
75
|
+
assert 'id=' not in result
|
|
76
|
+
assert 'data-track' not in result
|
|
77
|
+
assert 'style=' not in result
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def test_clean_html_removes_ads():
|
|
81
|
+
html = '<div class="ad">Buy stuff</div><p>Real content</p>'
|
|
82
|
+
result = clean_html(html)
|
|
83
|
+
assert "Buy stuff" not in result
|
|
84
|
+
assert "Real content" in result
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def test_clean_html_handles_empty_input():
|
|
88
|
+
result = clean_html("")
|
|
89
|
+
assert isinstance(result, str)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# --- Converter tests ---
|
|
93
|
+
|
|
94
|
+
def test_html_to_markdown():
|
|
95
|
+
html = "<h1>Title</h1><p>Hello <strong>world</strong></p>"
|
|
96
|
+
md = html_to_markdown(html)
|
|
97
|
+
assert "# Title" in md
|
|
98
|
+
assert "**world**" in md
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def test_html_to_markdown_links():
|
|
102
|
+
html = '<p><a href="https://example.com">click</a></p>'
|
|
103
|
+
md = html_to_markdown(html)
|
|
104
|
+
assert "[click](https://example.com)" in md
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def test_html_to_markdown_lists():
|
|
108
|
+
html = "<ul><li>One</li><li>Two</li></ul>"
|
|
109
|
+
md = html_to_markdown(html)
|
|
110
|
+
assert "- One" in md
|
|
111
|
+
assert "- Two" in md
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def test_html_to_markdown_strips_data_uri_images():
|
|
115
|
+
html = '<img src="data:image/png;base64,abc123" alt="bloat"><p>Text</p>'
|
|
116
|
+
md = html_to_markdown(html)
|
|
117
|
+
assert "data:" not in md
|
|
118
|
+
assert "Text" in md
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def test_html_to_markdown_keeps_normal_images():
|
|
122
|
+
html = '<img src="https://example.com/img.png" alt="photo"><p>Text</p>'
|
|
123
|
+
md = html_to_markdown(html)
|
|
124
|
+
assert "" in md
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def test_html_to_markdown_collapses_whitespace():
|
|
128
|
+
html = "<p>Hello</p>\n\n\n\n\n<p>World</p>"
|
|
129
|
+
md = html_to_markdown(html)
|
|
130
|
+
assert "\n\n\n" not in md
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def test_html_to_text():
|
|
134
|
+
html = "<h1>Title</h1><p>Hello <strong>world</strong></p>"
|
|
135
|
+
text = html_to_text(html)
|
|
136
|
+
assert "Title" in text
|
|
137
|
+
assert "world" in text
|
|
138
|
+
assert "#" not in text
|
|
139
|
+
assert "**" not in text
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_html_to_text_strips_links():
    """Link text survives, but the href URL is dropped from plain text."""
    plain = html_to_text('<p><a href="https://example.com">click here</a></p>')
    assert "click here" in plain
    assert "https://example.com" not in plain
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# --- Extractor helper tests ---
|
|
150
|
+
|
|
151
|
+
def test_extract_title():
    """The <title> element's content is returned verbatim."""
    page = "<html><head><title>My Page</title></head><body></body></html>"
    assert _extract_title(page) == "My Page"
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def test_extract_title_og_fallback():
    """With no <title>, the og:title meta tag is used instead."""
    page = '<html><head><meta property="og:title" content="OG Title"></head><body></body></html>'
    assert _extract_title(page) == "OG Title"
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def test_extract_title_empty():
    """A page with neither <title> nor og:title yields an empty string."""
    assert _extract_title("<html><head></head><body></body></html>") == ""
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def test_extract_description():
    """The description meta tag's content is returned verbatim."""
    page = '<html><head><meta name="description" content="Test desc"></head><body></body></html>'
    assert _extract_description(page) == "Test desc"
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def test_extract_description_og_fallback():
    """With no description meta tag, og:description is used instead."""
    page = '<html><head><meta property="og:description" content="OG desc"></head><body></body></html>'
    assert _extract_description(page) == "OG desc"
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def test_extract_description_empty():
    """No description metadata at all yields an empty string."""
    assert _extract_description("<html><head></head><body></body></html>") == ""
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def test_extract_links():
    """Links are extracted in document order and deduplicated by href."""
    page = """
    <a href="https://example.com/a">Link A</a>
    <a href="https://example.com/b">Link B</a>
    <a href="https://example.com/a">Link A again</a>
    """
    found = _extract_links(page, "https://example.com")
    assert len(found) == 2
    # The first occurrence of a duplicated href wins.
    assert found[0].text == "Link A"
    assert found[0].href == "https://example.com/a"
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def test_extract_links_resolves_relative():
    """Relative hrefs are resolved against the page's base URL."""
    found = _extract_links('<a href="/about">About</a>', "https://example.com")
    assert found[0].href == "https://example.com/about"
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def test_extract_links_skips_non_http():
    """javascript: and mailto: schemes are ignored; only http(s) links survive."""
    page = """
    <a href="javascript:void(0)">JS</a>
    <a href="mailto:test@example.com">Email</a>
    <a href="https://example.com/real">Real</a>
    """
    found = _extract_links(page, "https://example.com")
    assert len(found) == 1
    assert found[0].text == "Real"
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def test_extract_links_skips_same_page_anchors():
    """Fragment-only hrefs point within the page and are not real links."""
    found = _extract_links('<a href="#section">Jump</a>', "https://example.com")
    assert len(found) == 0
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
# --- Token estimation ---
|
|
215
|
+
|
|
216
|
+
def test_estimate_tokens():
    """Rough heuristic: one token per four characters, zero for empty text."""
    cases = [("", 0), ("abcd", 1), ("abcdefgh", 2)]
    for text, expected in cases:
        assert _estimate_tokens(text) == expected
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# --- Models ---
|
|
223
|
+
|
|
224
|
+
def test_extract_options_defaults():
    """Only the URL is required; every other option has a sensible default."""
    options = ExtractOptions(url="https://example.com")
    assert options.format == "markdown"
    assert options.timeout == 15000
    assert options.include_links is True
    assert options.headers is None
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def test_botbrowser_result_construction():
    """A fully-populated result can be built and exposes its fields."""
    # Build the nested metadata first so the result construction stays readable.
    meta = ExtractionMetadata(
        raw_token_estimate=1000,
        clean_token_estimate=100,
        token_savings_percent=90,
        word_count=50,
        fetched_at="2026-01-01T00:00:00Z",
    )
    outcome = BotBrowserResult(
        url="https://example.com",
        title="Test",
        description="Desc",
        content="# Test",
        text_content="Test",
        links=[ExtractedLink(text="link", href="https://example.com")],
        metadata=meta,
    )
    assert outcome.url == "https://example.com"
    assert outcome.metadata.token_savings_percent == 90
|