botbrowser 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ node_modules/
2
+ dist/
3
+ build/
4
+ *.tsbuildinfo
5
+ .env
6
+ *.egg-info/
7
+ __pycache__/
8
+ .pytest_cache/
9
+ *.pyc
10
+ .venv/
11
+ venv/
12
+ .DS_Store
13
+ coverage/
@@ -0,0 +1,77 @@
1
+ Metadata-Version: 2.4
2
+ Name: botbrowser
3
+ Version: 0.1.0
4
+ Summary: Token-efficient web content extraction for LLM agents
5
+ Project-URL: Homepage, https://github.com/AmplifyCo/botbrowser
6
+ Project-URL: Repository, https://github.com/AmplifyCo/botbrowser
7
+ Project-URL: Issues, https://github.com/AmplifyCo/botbrowser/issues
8
+ License-Expression: MIT
9
+ Keywords: ai-agents,content-extraction,llm,token-optimization,web-scraping
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
18
+ Classifier: Topic :: Internet :: WWW/HTTP
19
+ Classifier: Topic :: Software Development :: Libraries
20
+ Requires-Python: >=3.10
21
+ Requires-Dist: beautifulsoup4>=4.12.0
22
+ Requires-Dist: httpx>=0.27.0
23
+ Requires-Dist: markdownify>=0.13.0
24
+ Requires-Dist: pydantic>=2.0.0
25
+ Requires-Dist: trafilatura>=1.12.0
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
28
+ Description-Content-Type: text/markdown
29
+
30
+ # BotBrowser (Python)
31
+
32
+ Token-efficient web content extraction for LLM agents.
33
+
34
+ ## Install
35
+
36
+ ```bash
37
+ pip install botbrowser
38
+ ```
39
+
40
+ ## Usage
41
+
42
+ ```python
43
+ from botbrowser import extract
44
+
45
+ result = extract("https://example.com")
46
+
47
+ print(result.content) # clean markdown
48
+ print(result.metadata.token_savings_percent) # e.g. 94
49
+ print(result.title) # page title
50
+ print(result.links) # extracted links
51
+ ```
52
+
53
+ ## Options
54
+
55
+ ```python
56
+ result = extract(
57
+ "https://example.com",
58
+ format="text", # "markdown" (default) or "text"
59
+ timeout=10000, # request timeout in ms (default: 15000)
60
+ include_links=False, # extract links (default: True)
61
+ )
62
+ ```
63
+
64
+ ## Client Mode
65
+
66
+ If you're running the BotBrowser REST API server, you can use the client:
67
+
68
+ ```python
69
+ from botbrowser import BotBrowserClient
70
+
71
+ client = BotBrowserClient("http://localhost:3000")
72
+ result = client.extract("https://example.com")
73
+ ```
74
+
75
+ ## License
76
+
77
+ MIT
@@ -0,0 +1,48 @@
1
+ # BotBrowser (Python)
2
+
3
+ Token-efficient web content extraction for LLM agents.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install botbrowser
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from botbrowser import extract
15
+
16
+ result = extract("https://example.com")
17
+
18
+ print(result.content) # clean markdown
19
+ print(result.metadata.token_savings_percent) # e.g. 94
20
+ print(result.title) # page title
21
+ print(result.links) # extracted links
22
+ ```
23
+
24
+ ## Options
25
+
26
+ ```python
27
+ result = extract(
28
+ "https://example.com",
29
+ format="text", # "markdown" (default) or "text"
30
+ timeout=10000, # request timeout in ms (default: 15000)
31
+ include_links=False, # extract links (default: True)
32
+ )
33
+ ```
34
+
35
+ ## Client Mode
36
+
37
+ If you're running the BotBrowser REST API server, you can use the client:
38
+
39
+ ```python
40
+ from botbrowser import BotBrowserClient
41
+
42
+ client = BotBrowserClient("http://localhost:3000")
43
+ result = client.extract("https://example.com")
44
+ ```
45
+
46
+ ## License
47
+
48
+ MIT
@@ -0,0 +1,15 @@
1
+ """BotBrowser — Token-efficient web content extraction for LLM agents."""
2
+
3
+ from botbrowser.core import extract
4
+ from botbrowser.client import BotBrowserClient
5
+ from botbrowser.models import BotBrowserResult, ExtractOptions, ExtractedLink, ExtractionMetadata
6
+
7
+ __version__ = "0.1.0"
8
+ __all__ = [
9
+ "extract",
10
+ "BotBrowserClient",
11
+ "BotBrowserResult",
12
+ "ExtractOptions",
13
+ "ExtractedLink",
14
+ "ExtractionMetadata",
15
+ ]
@@ -0,0 +1,99 @@
1
+ """HTML cleaning pipeline — strips bloat from web pages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from bs4 import BeautifulSoup, Comment, Tag
6
+
7
# Tags whose entire subtree never carries readable content.
REMOVE_TAGS = {
    "script", "style", "noscript", "iframe", "object", "embed",
    "applet", "svg", "canvas", "video", "audio", "source", "track",
    "map", "area",
}

# CSS selectors for common page boilerplate (navigation, ads, consent UI, …).
REMOVE_SELECTORS = [
    "nav",
    "[role='navigation']",
    "[role='banner']",
    "[role='complementary']",
    "[aria-hidden='true']",
    ".ad", ".ads", ".advertisement", ".adsbygoogle",
    ".sidebar", ".side-bar",
    ".cookie-banner", ".cookie-notice", ".cookie-consent",
    ".popup", ".modal", ".overlay",
    ".social-share", ".share-buttons", ".social-links",
    ".comments", ".comment-section", "#comments",
    ".newsletter", ".subscribe",
    ".breadcrumb", ".breadcrumbs",
    ".pagination",
    ".related-posts", ".related-articles",
    ".widget",
    "[data-ad]",
    "[data-tracking]",
]

# Attribute prefixes that never carry content ("on" also covers onclick etc.).
STRIP_ATTR_PREFIXES = ("data-", "aria-", "on")

# Void elements that are legitimately empty and must not be pruned.
SELF_CLOSING = {"img", "br", "hr", "input", "meta", "link"}


def clean_html(html: str) -> str:
    """Remove non-content elements and unnecessary attributes from HTML.

    Pipeline: comments -> blocked tags -> boilerplate selectors -> hidden
    elements -> attribute stripping -> empty-element pruning.

    Args:
        html: Raw HTML markup (may be empty).

    Returns:
        The cleaned markup as a string.
    """
    soup = BeautifulSoup(html, "html.parser")

    # Remove HTML comments.
    for comment in soup.find_all(string=lambda t: isinstance(t, Comment)):
        comment.extract()

    # Remove unwanted tags entirely.
    for tag_name in REMOVE_TAGS:
        for tag in soup.find_all(tag_name):
            tag.decompose()

    # Remove non-content elements by selector. Best-effort: if soupsieve
    # rejects a selector we simply skip it rather than abort the clean.
    for selector in REMOVE_SELECTORS:
        try:
            for el in soup.select(selector):
                el.decompose()
        except Exception:
            pass

    # Remove hidden elements. find_all() snapshots the tree, so decomposing
    # a parent leaves its (now decomposed) descendants in the list; touching
    # a decomposed tag's attributes is unsafe, hence the `decomposed` guard
    # (bugfix — the original iterated without checking).
    for el in soup.find_all(True):
        if not isinstance(el, Tag) or el.decomposed:
            continue
        if el.has_attr("hidden"):
            el.decompose()
            continue
        style = el.get("style", "")
        if isinstance(style, str):
            compact = style.replace(" ", "")  # normalize once, not per check
            if "display:none" in compact or "visibility:hidden" in compact:
                el.decompose()

    # Strip presentational / tracking / accessibility-only attributes.
    for el in soup.find_all(True):
        if not isinstance(el, Tag) or el.decomposed:
            continue
        attrs_to_remove = []
        for attr_name in list(el.attrs.keys()):
            lower = attr_name.lower()
            if lower in ("style", "class", "id", "role", "tabindex", "draggable", "contenteditable"):
                attrs_to_remove.append(attr_name)
            elif any(lower.startswith(p) for p in STRIP_ATTR_PREFIXES):
                attrs_to_remove.append(attr_name)
        for attr_name in attrs_to_remove:
            del el[attr_name]

    # Remove empty elements (single pass: a parent emptied by this pass is
    # intentionally left for a later call, matching the original behavior).
    for el in soup.find_all(True):
        if not isinstance(el, Tag) or el.decomposed:
            continue
        if (
            el.name not in SELF_CLOSING
            and not el.get_text(strip=True)
            and not el.find("img")
        ):
            el.decompose()

    return str(soup)
@@ -0,0 +1,79 @@
1
+ """HTTP client for BotBrowser REST API server."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import httpx
6
+
7
+ from botbrowser.models import BotBrowserResult
8
+
9
+
10
class BotBrowserClient:
    """
    Client for the BotBrowser REST API server.

    Usage:
        client = BotBrowserClient("http://localhost:3000")
        result = client.extract("https://example.com")
        print(result.content)
    """

    def __init__(self, server_url: str = "http://localhost:3000") -> None:
        # Normalize so path joins below never produce a double slash.
        self.server_url = server_url.rstrip("/")
        self._client = httpx.Client(timeout=30)

    @staticmethod
    def _field(data: dict, camel: str, snake: str, default):
        """Read *camel* from *data*, falling back to *snake*, then *default*.

        The server speaks camelCase JSON while the Python models are
        snake_case; this helper replaces five copies of the same fallback.
        """
        if camel in data:
            return data[camel]
        return data.get(snake, default)

    def extract(
        self,
        url: str,
        *,
        format: str = "markdown",
        timeout: int = 15000,
        include_links: bool = True,
    ) -> BotBrowserResult:
        """Extract content via the BotBrowser REST API server.

        Args:
            url: Page to extract.
            format: "markdown" or "text".
            timeout: Server-side fetch timeout in ms.
            include_links: Whether the server should return links.

        Raises:
            httpx.HTTPStatusError: on a non-2xx server response.
        """
        response = self._client.post(
            f"{self.server_url}/extract",
            json={
                "url": url,
                "format": format,
                "timeout": timeout,
                "includeLinks": include_links,
            },
        )
        response.raise_for_status()
        data = response.json()
        meta = data.get("metadata", {})
        pick = self._field

        # Map the camelCase API response to the snake_case Python model.
        # Optional fields default instead of raising KeyError on sparse
        # responses (robustness fix); url/content remain required.
        return BotBrowserResult(
            url=data["url"],
            title=data.get("title", ""),
            description=data.get("description", ""),
            content=data["content"],
            text_content=pick(data, "textContent", "text_content", ""),
            links=[
                {"text": link["text"], "href": link["href"]}
                for link in data.get("links", [])
            ],
            metadata={
                "raw_token_estimate": pick(meta, "rawTokenEstimate", "raw_token_estimate", 0),
                "clean_token_estimate": pick(meta, "cleanTokenEstimate", "clean_token_estimate", 0),
                "token_savings_percent": pick(meta, "tokenSavingsPercent", "token_savings_percent", 0),
                "word_count": pick(meta, "wordCount", "word_count", 0),
                "fetched_at": pick(meta, "fetchedAt", "fetched_at", ""),
            },
        )

    def health(self) -> dict:
        """Check server health (GET /health)."""
        response = self._client.get(f"{self.server_url}/health")
        response.raise_for_status()
        return response.json()

    def close(self) -> None:
        """Close the underlying HTTP client."""
        self._client.close()

    def __enter__(self) -> BotBrowserClient:
        return self

    def __exit__(self, *args: object) -> None:
        self.close()
@@ -0,0 +1,49 @@
1
+ """HTML to Markdown/text conversion."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+
7
+ from markdownify import markdownify
8
+
9
+
10
def html_to_markdown(html: str) -> str:
    """Convert HTML to clean markdown.

    Data-URI images (inline base64 blobs) are dropped before conversion;
    normal images survive. Output is capped at two consecutive newlines
    and has no trailing whitespace.
    """
    # Pre-process: remove images with data URIs (bloated base64) but keep
    # normal images. Quote-agnostic and case-insensitive — the original
    # pattern missed single-quoted src attributes and uppercase tags.
    html = re.sub(
        r'<img[^>]+src=["\']data:[^"\']*["\'][^>]*/?>',
        "",
        html,
        flags=re.IGNORECASE,
    )

    markdown = markdownify(
        html,
        heading_style="ATX",
        bullets="-",
        convert=["a", "p", "h1", "h2", "h3", "h4", "h5", "h6",
                 "ul", "ol", "li", "table", "thead", "tbody", "tr", "th", "td",
                 "blockquote", "pre", "code", "em", "strong", "br", "hr", "img"],
    )

    # Clean up excessive whitespace.
    markdown = re.sub(r"\n{3,}", "\n\n", markdown)  # max 2 consecutive newlines
    markdown = re.sub(r"[ \t]+$", "", markdown, flags=re.MULTILINE)  # trailing whitespace
    markdown = markdown.strip()

    return markdown
30
+
31
+
32
def html_to_text(html: str) -> str:
    """Convert HTML to plain text.

    Renders to markdown first, then strips markdown syntax while keeping
    the readable text (link labels, list items, quoted lines).
    """
    text = html_to_markdown(html)

    # Images must be removed BEFORE links: "![alt](url)" also matches the
    # link pattern, which previously left a stray "!alt" behind (bugfix).
    text = re.sub(r"!\[.*?\]\(.+?\)", "", text)  # Remove images
    text = re.sub(r"#{1,6}\s+", "", text)  # Remove heading markers
    text = re.sub(r"\*\*(.+?)\*\*", r"\1", text)  # Remove bold
    text = re.sub(r"\*(.+?)\*", r"\1", text)  # Remove italic
    text = re.sub(r"\[(.+?)\]\(.+?\)", r"\1", text)  # Remove links, keep text
    text = re.sub(r"`{1,3}[^`]*`{1,3}", lambda m: m.group().strip("`"), text)  # Remove code markers
    text = re.sub(r"^[-*+]\s+", "", text, flags=re.MULTILINE)  # Remove list markers
    text = re.sub(r"^\d+\.\s+", "", text, flags=re.MULTILINE)  # Remove numbered list markers
    text = re.sub(r"^>\s+", "", text, flags=re.MULTILINE)  # Remove blockquote markers
    # Only strip standalone horizontal-rule lines; the old global replace of
    # "---" also mangled occurrences inside normal prose (bugfix).
    text = re.sub(r"^-{3,}\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"\n{3,}", "\n\n", text)

    return text.strip()
@@ -0,0 +1,186 @@
1
+ """Core extraction engine — native Python implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import math
6
+ from datetime import datetime, timezone
7
+ from urllib.parse import urljoin, urlparse
8
+
9
+ from bs4 import BeautifulSoup
10
+
11
+ from botbrowser.cleaner import clean_html
12
+ from botbrowser.converter import html_to_markdown, html_to_text
13
+ from botbrowser.fetcher import fetch_page
14
+ from botbrowser.models import (
15
+ BotBrowserResult,
16
+ ExtractedLink,
17
+ ExtractionMetadata,
18
+ ExtractOptions,
19
+ )
20
+
21
+ import trafilatura
22
+
23
+
24
+ def _estimate_tokens(text: str) -> int:
25
+ """Rough token estimation: ~4 chars per token for English text."""
26
+ return math.ceil(len(text) / 4)
27
+
28
+
29
def _extract_description(html: str) -> str:
    """Return the page description: <meta name="description"> first,
    og:description as fallback, empty string when neither is present."""
    soup = BeautifulSoup(html, "html.parser")

    # Check candidates in priority order; first non-empty content wins.
    for candidate_attrs in ({"name": "description"}, {"property": "og:description"}):
        tag = soup.find("meta", attrs=candidate_attrs)
        if tag and tag.get("content"):
            return str(tag["content"])

    return ""
42
+
43
+
44
def _extract_title(html: str) -> str:
    """Return the page title: <title> text first, og:title as fallback,
    empty string when neither is present."""
    soup = BeautifulSoup(html, "html.parser")

    head_title = soup.find("title")
    if head_title is not None and head_title.string:
        return head_title.string.strip()

    og = soup.find("meta", attrs={"property": "og:title"})
    if og is not None and og.get("content"):
        return str(og["content"])

    return ""
56
+
57
+
58
def _extract_links(html: str, base_url: str) -> list[ExtractedLink]:
    """Extract unique, absolute http(s) links from HTML.

    Relative hrefs are resolved against *base_url*; non-http(s) schemes
    (mailto:, javascript:, tel:) and same-page fragment anchors are
    skipped, as are anchors with no visible text.
    """
    soup = BeautifulSoup(html, "html.parser")
    links: list[ExtractedLink] = []
    seen: set[str] = set()
    # Hoisted: the base path is invariant, no need to re-parse per anchor.
    base_path = urlparse(base_url).path

    for a in soup.find_all("a", href=True):
        href = a["href"]

        # Resolve relative URLs; skip hrefs urljoin cannot handle.
        try:
            absolute_url = urljoin(base_url, href)
        except Exception:
            continue

        # Skip non-HTTP schemes (anchors, mailto, tel, javascript).
        parsed = urlparse(absolute_url)
        if parsed.scheme not in ("http", "https"):
            continue
        # Skip fragment-only anchors pointing back at the current page.
        if parsed.fragment and parsed.path == base_path:
            continue

        if absolute_url in seen:
            continue
        seen.add(absolute_url)

        text = a.get_text(strip=True)
        if text:
            links.append(ExtractedLink(text=text, href=absolute_url))

    return links
89
+
90
+
91
def _resolve_options(
    url_or_options: str | ExtractOptions | None,
    url: str | None,
    format: str,
    timeout: int,
    include_links: bool,
    headers: dict[str, str] | None,
) -> ExtractOptions:
    """Normalize extract()'s flexible arguments into one ExtractOptions.

    Accepts a ready-made ExtractOptions, a positional URL string, or the
    ``url=`` keyword; raises ValueError when no URL was supplied.
    (Replaces two duplicated ExtractOptions constructions.)
    """
    if isinstance(url_or_options, ExtractOptions):
        return url_or_options
    target = url_or_options if isinstance(url_or_options, str) else url
    if target is None:
        raise ValueError("url is required")
    return ExtractOptions(
        url=target,
        format=format,  # type: ignore[arg-type]
        timeout=timeout,
        include_links=include_links,
        headers=headers,
    )


def extract(
    url_or_options: str | ExtractOptions | None = None,
    *,
    url: str | None = None,
    format: str = "markdown",
    timeout: int = 15000,
    include_links: bool = True,
    headers: dict[str, str] | None = None,
) -> BotBrowserResult:
    """
    Extract clean, token-efficient content from a web page.

    Usage:
        result = extract("https://example.com")
        result = extract("https://example.com", format="text")
        result = extract(ExtractOptions(url="https://example.com"))

    Raises:
        ValueError: if no URL is given, or the fetched page is not HTML.
    """
    opts = _resolve_options(url_or_options, url, format, timeout, include_links, headers)

    # Step 1: Fetch the page and size the raw payload.
    fetched = fetch_page(opts.url, timeout=opts.timeout, headers=opts.headers)
    raw_token_estimate = _estimate_tokens(fetched.html)

    # Step 2: Extract metadata from the raw HTML (cleaning would drop it).
    title = _extract_title(fetched.html)
    description = _extract_description(fetched.html)

    # Step 3: Extract the main content region using trafilatura.
    main_content_html = trafilatura.extract(
        fetched.html,
        output_format="html",
        include_links=True,
        include_tables=True,
        include_formatting=True,
    )

    # Step 4: Clean the HTML; fall back to the full page when trafilatura
    # cannot locate a main content region.
    cleaned_html = clean_html(main_content_html if main_content_html else fetched.html)

    # Step 5: Convert to the requested output format.
    if opts.format == "markdown":
        content = html_to_markdown(cleaned_html)
    else:
        content = html_to_text(cleaned_html)

    text_content = html_to_text(cleaned_html)

    # Step 6: Extract links from raw HTML (not cleaned — cleaning strips nav links).
    links = _extract_links(fetched.html, fetched.final_url) if opts.include_links else []

    clean_token_estimate = _estimate_tokens(content)
    # Guard against division by zero on an empty response body.
    savings = (
        round((1 - clean_token_estimate / raw_token_estimate) * 100)
        if raw_token_estimate > 0
        else 0
    )

    return BotBrowserResult(
        url=fetched.final_url,
        title=title,
        description=description,
        content=content,
        text_content=text_content,
        links=links,
        metadata=ExtractionMetadata(
            raw_token_estimate=raw_token_estimate,
            clean_token_estimate=clean_token_estimate,
            token_savings_percent=savings,
            word_count=len(text_content.split()),
            fetched_at=datetime.now(timezone.utc).isoformat(),
        ),
    )
@@ -0,0 +1,63 @@
1
+ """HTTP fetching with smart defaults."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ from dataclasses import dataclass
7
+
8
+ import httpx
9
+
10
+ USER_AGENTS = [
11
+ "Mozilla/5.0 (compatible; BotBrowser/0.1; +https://github.com/AmplifyCo/botbrowser)",
12
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
13
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
14
+ ]
15
+
16
+
17
@dataclass
class FetchResult:
    """Outcome of fetching a single page."""

    # Response body decoded as text (the page's HTML).
    html: str
    # URL of the final response after following redirects.
    final_url: str
    # HTTP status code of the final response.
    status_code: int
    # Value of the final response's Content-Type header ("" when absent).
    content_type: str
23
+
24
+
25
def fetch_page(
    url: str,
    *,
    timeout: int = 15000,
    headers: dict[str, str] | None = None,
) -> FetchResult:
    """Fetch *url* and return its HTML, following redirects.

    *timeout* is in milliseconds. Entries in *headers* override the
    defaults. Raises httpx.HTTPStatusError on a 4xx/5xx response and
    ValueError when the response is not an HTML page.
    """
    # Browser-like defaults with a randomly picked User-Agent; caller
    # headers win on key collisions.
    request_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
    }
    request_headers.update(headers or {})

    response = httpx.get(
        url,
        headers=request_headers,
        follow_redirects=True,
        timeout=timeout / 1000,  # httpx expects seconds
    )
    response.raise_for_status()

    content_type = response.headers.get("content-type", "")
    looks_like_html = "text/html" in content_type or "application/xhtml" in content_type
    if not looks_like_html:
        raise ValueError(
            f"Unsupported content type: {content_type}. Only HTML pages are supported."
        )

    return FetchResult(
        html=response.text,
        final_url=str(response.url),
        status_code=response.status_code,
        content_type=content_type,
    )
@@ -0,0 +1,45 @@
1
+ """Data models for BotBrowser."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+ from pydantic import BaseModel
7
+
8
+
9
class ExtractOptions(BaseModel):
    """Options for content extraction."""

    # Page to fetch.
    url: str
    # Output format for the extracted content.
    format: Literal["markdown", "text"] = "markdown"
    # Request timeout in milliseconds.
    timeout: int = 15000
    # Whether to collect links from the page.
    include_links: bool = True
    # Extra HTTP headers merged over the fetcher's defaults.
    headers: dict[str, str] | None = None
17
+
18
+
19
class ExtractedLink(BaseModel):
    """A link extracted from page content."""

    # Visible anchor text (whitespace-stripped).
    text: str
    # Absolute URL the anchor points to.
    href: str
24
+
25
+
26
class ExtractionMetadata(BaseModel):
    """Metadata about the extraction including token savings."""

    # Estimated tokens in the raw fetched HTML (~4 chars per token).
    raw_token_estimate: int
    # Estimated tokens in the cleaned, converted output.
    clean_token_estimate: int
    # Percentage saved versus the raw page, rounded to the nearest integer.
    token_savings_percent: int
    # Whitespace-separated word count of the plain-text content.
    word_count: int
    # ISO-8601 UTC timestamp of when the page was fetched.
    fetched_at: str
34
+
35
+
36
class BotBrowserResult(BaseModel):
    """Result of content extraction from a web page."""

    # Final URL after any redirects.
    url: str
    # Page title (<title> or og:title; "" when absent).
    title: str
    # Meta description (name=description or og:description; "" when absent).
    description: str
    # Extracted content in the requested format (markdown or text).
    content: str
    # Plain-text rendering of the same content.
    text_content: str
    # Unique absolute links found on the page (empty when links disabled).
    links: list[ExtractedLink]
    # Token accounting and fetch information.
    metadata: ExtractionMetadata
@@ -0,0 +1,47 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "botbrowser"
7
+ version = "0.1.0"
8
+ description = "Token-efficient web content extraction for LLM agents"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ keywords = ["web-scraping", "content-extraction", "llm", "ai-agents", "token-optimization"]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Programming Language :: Python :: 3.10",
19
+ "Programming Language :: Python :: 3.11",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Programming Language :: Python :: 3.13",
22
+ "Topic :: Internet :: WWW/HTTP",
23
+ "Topic :: Software Development :: Libraries",
24
+ ]
25
+ dependencies = [
26
+ "trafilatura>=1.12.0",
27
+ "httpx>=0.27.0",
28
+ "markdownify>=0.13.0",
29
+ "pydantic>=2.0.0",
30
+ "beautifulsoup4>=4.12.0",
31
+ ]
32
+
33
+ [project.urls]
34
+ Homepage = "https://github.com/AmplifyCo/botbrowser"
35
+ Repository = "https://github.com/AmplifyCo/botbrowser"
36
+ Issues = "https://github.com/AmplifyCo/botbrowser/issues"
37
+
38
+ [project.optional-dependencies]
39
+ dev = [
40
+ "pytest>=8.0.0",
41
+ ]
42
+
43
+ [tool.hatch.build.targets.wheel]
44
+ packages = ["botbrowser"]
45
+
46
+ [tool.pytest.ini_options]
47
+ testpaths = ["tests"]
File without changes
@@ -0,0 +1,249 @@
1
+ """Tests for BotBrowser core extraction."""
2
+
3
+ from botbrowser.cleaner import clean_html
4
+ from botbrowser.converter import html_to_markdown, html_to_text
5
+ from botbrowser.models import BotBrowserResult, ExtractOptions, ExtractedLink, ExtractionMetadata
6
+ from botbrowser.core import _extract_title, _extract_description, _extract_links, _estimate_tokens
7
+
8
+
9
+ SAMPLE_HTML = """
10
+ <html>
11
+ <head>
12
+ <title>Test Page</title>
13
+ <meta name="description" content="A test page for BotBrowser">
14
+ <script>console.log('tracking');</script>
15
+ <style>body { color: red; }</style>
16
+ </head>
17
+ <body>
18
+ <nav><a href="/">Home</a> <a href="/about">About</a></nav>
19
+ <main>
20
+ <h1>Hello World</h1>
21
+ <p>This is the <strong>main content</strong> of the page.</p>
22
+ <p>It has <a href="https://example.com">a link</a> and some text.</p>
23
+ <ul>
24
+ <li>Item one</li>
25
+ <li>Item two</li>
26
+ <li>Item three</li>
27
+ </ul>
28
+ </main>
29
+ <footer>Copyright 2026</footer>
30
+ <div class="cookie-banner">We use cookies</div>
31
+ <div style="display:none">Hidden content</div>
32
+ </body>
33
+ </html>
34
+ """
35
+
36
+
37
# --- Cleaner tests ---
# These exercise clean_html() against SAMPLE_HTML (defined above) or a
# bespoke snippet where the fixture lacks the construct under test.

def test_clean_html_removes_scripts():
    result = clean_html(SAMPLE_HTML)
    assert "<script>" not in result
    assert "tracking" not in result


def test_clean_html_removes_styles():
    result = clean_html(SAMPLE_HTML)
    assert "<style>" not in result


def test_clean_html_removes_nav():
    result = clean_html(SAMPLE_HTML)
    assert "<nav>" not in result


def test_clean_html_removes_cookie_banner():
    # .cookie-banner is on the boilerplate selector list.
    result = clean_html(SAMPLE_HTML)
    assert "cookie" not in result.lower()


def test_clean_html_removes_hidden():
    # display:none elements are dropped by the hidden-element pass.
    result = clean_html(SAMPLE_HTML)
    assert "Hidden content" not in result


def test_clean_html_preserves_main_content():
    result = clean_html(SAMPLE_HTML)
    assert "Hello World" in result
    assert "main content" in result


def test_clean_html_strips_attributes():
    # class/id/style and data-* are all on the attribute strip list.
    html = '<div class="foo" id="bar" data-track="yes" style="color:red"><p>Text</p></div>'
    result = clean_html(html)
    assert 'class=' not in result
    assert 'id=' not in result
    assert 'data-track' not in result
    assert 'style=' not in result


def test_clean_html_removes_ads():
    html = '<div class="ad">Buy stuff</div><p>Real content</p>'
    result = clean_html(html)
    assert "Buy stuff" not in result
    assert "Real content" in result


def test_clean_html_handles_empty_input():
    result = clean_html("")
    assert isinstance(result, str)
90
+
91
+
92
# --- Converter tests ---

def test_html_to_markdown():
    html = "<h1>Title</h1><p>Hello <strong>world</strong></p>"
    md = html_to_markdown(html)
    assert "# Title" in md
    assert "**world**" in md


def test_html_to_markdown_links():
    html = '<p><a href="https://example.com">click</a></p>'
    md = html_to_markdown(html)
    assert "[click](https://example.com)" in md


def test_html_to_markdown_lists():
    html = "<ul><li>One</li><li>Two</li></ul>"
    md = html_to_markdown(html)
    assert "- One" in md
    assert "- Two" in md


def test_html_to_markdown_strips_data_uri_images():
    # Inline base64 images are removed before conversion.
    html = '<img src="data:image/png;base64,abc123" alt="bloat"><p>Text</p>'
    md = html_to_markdown(html)
    assert "data:" not in md
    assert "Text" in md


def test_html_to_markdown_keeps_normal_images():
    html = '<img src="https://example.com/img.png" alt="photo"><p>Text</p>'
    md = html_to_markdown(html)
    assert "![photo](https://example.com/img.png)" in md


def test_html_to_markdown_collapses_whitespace():
    # Runs of blank lines are capped at two consecutive newlines.
    html = "<p>Hello</p>\n\n\n\n\n<p>World</p>"
    md = html_to_markdown(html)
    assert "\n\n\n" not in md


def test_html_to_text():
    # Plain-text output keeps the words but drops markdown syntax.
    html = "<h1>Title</h1><p>Hello <strong>world</strong></p>"
    text = html_to_text(html)
    assert "Title" in text
    assert "world" in text
    assert "#" not in text
    assert "**" not in text


def test_html_to_text_strips_links():
    html = '<p><a href="https://example.com">click here</a></p>'
    text = html_to_text(html)
    assert "click here" in text
    assert "https://example.com" not in text
147
+
148
+
149
# --- Extractor helper tests ---

def test_extract_title():
    html = "<html><head><title>My Page</title></head><body></body></html>"
    assert _extract_title(html) == "My Page"


def test_extract_title_og_fallback():
    # og:title is used only when <title> is absent.
    html = '<html><head><meta property="og:title" content="OG Title"></head><body></body></html>'
    assert _extract_title(html) == "OG Title"


def test_extract_title_empty():
    assert _extract_title("<html><head></head><body></body></html>") == ""


def test_extract_description():
    html = '<html><head><meta name="description" content="Test desc"></head><body></body></html>'
    assert _extract_description(html) == "Test desc"


def test_extract_description_og_fallback():
    # og:description is used only when meta name="description" is absent.
    html = '<html><head><meta property="og:description" content="OG desc"></head><body></body></html>'
    assert _extract_description(html) == "OG desc"


def test_extract_description_empty():
    assert _extract_description("<html><head></head><body></body></html>") == ""


def test_extract_links():
    # Duplicate hrefs are deduplicated; the first occurrence wins.
    html = """
    <a href="https://example.com/a">Link A</a>
    <a href="https://example.com/b">Link B</a>
    <a href="https://example.com/a">Link A again</a>
    """
    links = _extract_links(html, "https://example.com")
    assert len(links) == 2
    assert links[0].text == "Link A"
    assert links[0].href == "https://example.com/a"


def test_extract_links_resolves_relative():
    html = '<a href="/about">About</a>'
    links = _extract_links(html, "https://example.com")
    assert links[0].href == "https://example.com/about"


def test_extract_links_skips_non_http():
    # javascript: and mailto: schemes are filtered out.
    html = """
    <a href="javascript:void(0)">JS</a>
    <a href="mailto:test@example.com">Email</a>
    <a href="https://example.com/real">Real</a>
    """
    links = _extract_links(html, "https://example.com")
    assert len(links) == 1
    assert links[0].text == "Real"


def test_extract_links_skips_same_page_anchors():
    html = '<a href="#section">Jump</a>'
    links = _extract_links(html, "https://example.com")
    assert len(links) == 0
212
+
213
+
214
# --- Token estimation ---

def test_estimate_tokens():
    # ~4 characters per token, rounded up.
    assert _estimate_tokens("") == 0
    assert _estimate_tokens("abcd") == 1
    assert _estimate_tokens("abcdefgh") == 2


# --- Models ---

def test_extract_options_defaults():
    opts = ExtractOptions(url="https://example.com")
    assert opts.format == "markdown"
    assert opts.timeout == 15000
    assert opts.include_links is True
    assert opts.headers is None


def test_botbrowser_result_construction():
    # A fully populated result validates and round-trips its fields.
    result = BotBrowserResult(
        url="https://example.com",
        title="Test",
        description="Desc",
        content="# Test",
        text_content="Test",
        links=[ExtractedLink(text="link", href="https://example.com")],
        metadata=ExtractionMetadata(
            raw_token_estimate=1000,
            clean_token_estimate=100,
            token_savings_percent=90,
            word_count=50,
            fetched_at="2026-01-01T00:00:00Z",
        ),
    )
    assert result.url == "https://example.com"
    assert result.metadata.token_savings_percent == 90