llmsbrieftxt 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of llmsbrieftxt might be problematic.
- llmsbrieftxt/__init__.py +1 -0
- llmsbrieftxt/cli.py +276 -0
- llmsbrieftxt/constants.py +62 -0
- llmsbrieftxt/crawler.py +358 -0
- llmsbrieftxt/doc_loader.py +150 -0
- llmsbrieftxt/extractor.py +69 -0
- llmsbrieftxt/main.py +379 -0
- llmsbrieftxt/schema.py +42 -0
- llmsbrieftxt/summarizer.py +303 -0
- llmsbrieftxt/url_filters.py +75 -0
- llmsbrieftxt/url_utils.py +73 -0
- llmsbrieftxt-1.6.0.dist-info/METADATA +420 -0
- llmsbrieftxt-1.6.0.dist-info/RECORD +16 -0
- llmsbrieftxt-1.6.0.dist-info/WHEEL +4 -0
- llmsbrieftxt-1.6.0.dist-info/entry_points.txt +2 -0
- llmsbrieftxt-1.6.0.dist-info/licenses/LICENSE +21 -0
llmsbrieftxt/__init__.py
ADDED
@@ -0,0 +1 @@
__version__ = "0.6.1"
llmsbrieftxt/cli.py
ADDED
@@ -0,0 +1,276 @@
"""Command-line interface for llmsbrieftxt."""

import argparse
import asyncio
import os
import sys
from pathlib import Path
from urllib.parse import urlparse

from llmsbrieftxt.constants import (
    DEFAULT_CACHE_DIR,
    DEFAULT_CONCURRENT_SUMMARIES,
    DEFAULT_CRAWL_DEPTH,
    DEFAULT_OPENAI_MODEL,
    DOCS_DIR,
    ESTIMATED_TOKENS_PER_PAGE_INPUT,
    ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
    OPENAI_PRICING,
)
from llmsbrieftxt.main import generate_llms_txt


def parse_args(test_args: list[str] | None = None) -> argparse.Namespace:
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Generate llms-brief.txt files from documentation websites",
        epilog="Example: llmsbrieftxt https://docs.python.org/3/",
    )

    # Positional argument for URL
    parser.add_argument("url", help="URL of the documentation site to process")

    parser.add_argument(
        "--model",
        default=DEFAULT_OPENAI_MODEL,
        help=f"OpenAI model to use (default: {DEFAULT_OPENAI_MODEL})",
    )

    parser.add_argument(
        "--max-concurrent-summaries",
        type=int,
        default=DEFAULT_CONCURRENT_SUMMARIES,
        help=f"Maximum number of concurrent LLM requests (default: {DEFAULT_CONCURRENT_SUMMARIES})",
    )

    parser.add_argument(
        "--output",
        type=str,
        default=None,
        help=f"Output file path (default: {DOCS_DIR}/<domain>.txt)",
    )

    parser.add_argument(
        "--show-urls",
        action="store_true",
        help="Preview discovered URLs with cost estimate (no processing or API calls)",
    )

    parser.add_argument(
        "--max-urls", type=int, help="Maximum number of URLs to discover and process"
    )

    parser.add_argument(
        "--depth",
        type=int,
        default=DEFAULT_CRAWL_DEPTH,
        help=f"Maximum crawl depth (default: {DEFAULT_CRAWL_DEPTH})",
    )

    parser.add_argument(
        "--cache-dir",
        type=str,
        default=DEFAULT_CACHE_DIR,
        help=f"Cache directory path (default: {DEFAULT_CACHE_DIR})",
    )

    parser.add_argument(
        "--use-cache-only",
        action="store_true",
        help="Use only cached summaries, skip API calls for new pages",
    )

    parser.add_argument(
        "--force-refresh",
        action="store_true",
        help="Ignore cache and regenerate all summaries",
    )

    parser.add_argument(
        "--yes",
        "-y",
        action="store_true",
        help="Skip confirmation prompts (useful for automation)",
    )

    return parser.parse_args(test_args)


def validate_url(url: str) -> bool:
    """Validate that the URL is well-formed and uses http/https scheme."""
    try:
        parsed = urlparse(url)
        return bool(parsed.scheme in ("http", "https") and parsed.netloc)
    except Exception:
        return False


def check_openai_api_key() -> bool:
    """Check if OPENAI_API_KEY is set in environment."""
    return bool(os.environ.get("OPENAI_API_KEY"))


def estimate_cost(num_pages: int, model: str) -> str:
    """
    Estimate the API cost for processing a given number of pages.

    Args:
        num_pages: Number of pages to process
        model: OpenAI model name

    Returns:
        Formatted cost estimate string
    """
    if model not in OPENAI_PRICING:
        return "Cost estimation not available for this model"

    input_price, output_price = OPENAI_PRICING[model]

    # Calculate total tokens
    total_input_tokens = num_pages * ESTIMATED_TOKENS_PER_PAGE_INPUT
    total_output_tokens = num_pages * ESTIMATED_TOKENS_PER_PAGE_OUTPUT

    # Calculate cost (prices are per 1M tokens)
    input_cost = (total_input_tokens / 1_000_000) * input_price
    output_cost = (total_output_tokens / 1_000_000) * output_price
    total_cost = input_cost + output_cost

    if total_cost < 0.01:
        return f"~${total_cost:.4f}"
    elif total_cost < 1.00:
        return f"~${total_cost:.3f}"
    else:
        return f"~${total_cost:.2f}"


def get_output_path(url: str, custom_output: str | None = None) -> Path:
    """
    Get the output file path for a given URL.

    Args:
        url: The URL being processed
        custom_output: Optional custom output path

    Returns:
        Path object for the output file
    """
    if custom_output:
        # Expand environment variables and user home directory
        expanded = os.path.expandvars(custom_output)
        return Path(expanded).expanduser()

    # Extract domain from URL
    parsed = urlparse(url)
    domain = parsed.netloc or parsed.path.split("/")[0]

    # Remove www. prefix if present
    if domain.startswith("www."):
        domain = domain[4:]

    # Ensure ~/.claude/docs/ exists
    docs_dir = Path(DOCS_DIR).expanduser()
    docs_dir.mkdir(parents=True, exist_ok=True)

    return docs_dir / f"{domain}.txt"


def main() -> None:
    """Main entry point for the CLI."""
    args = parse_args()

    try:
        # Validate URL
        if not validate_url(args.url):
            print("Error: Invalid URL", file=sys.stderr)
            print(
                f"Please provide a valid HTTP or HTTPS URL. Got: {args.url}",
                file=sys.stderr,
            )
            print("Example: https://docs.python.org/3/", file=sys.stderr)
            sys.exit(1)

        # Validate depth parameter
        if args.depth < 1:
            print("Error: --depth must be at least 1", file=sys.stderr)
            sys.exit(1)

        # Check for conflicting cache flags
        if args.use_cache_only and args.force_refresh:
            print(
                "Error: Cannot use --use-cache-only and --force-refresh together",
                file=sys.stderr,
            )
            sys.exit(1)

        # Check for API key (unless just showing URLs or using cache only)
        if (
            not args.show_urls
            and not args.use_cache_only
            and not check_openai_api_key()
        ):
            print("Error: OPENAI_API_KEY not found", file=sys.stderr)
            print("Please set your OpenAI API key:", file=sys.stderr)
            print("  export OPENAI_API_KEY='sk-your-api-key-here'", file=sys.stderr)
            print("", file=sys.stderr)
            print(
                "Get your API key from: https://platform.openai.com/api-keys",
                file=sys.stderr,
            )
            sys.exit(1)

        # Determine output path
        output_path = get_output_path(args.url, args.output)

        # Expand cache directory path
        cache_dir = Path(os.path.expandvars(args.cache_dir)).expanduser()

        # Print configuration
        print(f"Processing URL: {args.url}")
        if not args.show_urls:
            print(f"Using model: {args.model}")
            print(f"Crawl depth: {args.depth}")
            print(f"Output: {output_path}")
            if args.max_urls:
                print(f"Max URLs: {args.max_urls}")
            if args.use_cache_only:
                print("Mode: Cache-only (no API calls)")
            elif args.force_refresh:
                print("Mode: Force refresh (ignoring cache)")

        # Run generation
        result = asyncio.run(
            generate_llms_txt(
                url=args.url,
                llm_name=args.model,
                max_concurrent_summaries=args.max_concurrent_summaries,
                output_path=str(output_path),
                show_urls=args.show_urls,
                max_urls=args.max_urls,
                max_depth=args.depth,
                cache_dir=str(cache_dir),
                use_cache_only=args.use_cache_only,
                force_refresh=args.force_refresh,
                skip_confirmation=args.yes,
            )
        )

        # Show cost estimate and failed URLs if available
        if args.show_urls and result:
            num_urls_value = result.get("num_urls", 0)
            # Type guard to ensure we have an int
            if isinstance(num_urls_value, int):
                print(
                    f"\nEstimated cost for {num_urls_value} pages: {estimate_cost(num_urls_value, args.model)}"
                )
            print("Note: Actual cost may vary based on page content size and caching")

    except KeyboardInterrupt:
        print("\nOperation cancelled by user.", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Error: {str(e)}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()
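
A minimal usage sketch for the CLI helpers above (not part of the packaged files; it assumes the wheel and its dependencies are installed, and the example URL and page count are arbitrary):

from llmsbrieftxt.cli import estimate_cost, get_output_path, parse_args, validate_url

# Parse a hypothetical invocation without touching sys.argv.
args = parse_args(["https://docs.python.org/3/", "--depth", "2", "--show-urls"])
assert validate_url(args.url) and args.depth == 2

# Default output lands under ~/.claude/docs/<domain>.txt; the call also creates that directory.
print(get_output_path(args.url))       # ~/.claude/docs/docs.python.org.txt
print(estimate_cost(100, args.model))  # default gpt-5-mini at the table rates -> ~$0.069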
llmsbrieftxt/constants.py
ADDED
@@ -0,0 +1,62 @@
"""Configuration constants for llmsbrieftxt package."""

# Concurrency
DEFAULT_CONCURRENT_SUMMARIES = 10

# Default Models
DEFAULT_OPENAI_MODEL = "gpt-5-mini"

# Docs Directory
DOCS_DIR = "~/.claude/docs"  # Will be expanded to full path at runtime

# Default Cache Directory
DEFAULT_CACHE_DIR = ".llmsbrieftxt_cache"

# Default Crawl Depth
DEFAULT_CRAWL_DEPTH = 3

# OpenAI Pricing (per 1M tokens) - prices subject to change
# Format: {model: (input_price, output_price)}
# Note: Verify current pricing at https://openai.com/api/pricing/
OPENAI_PRICING = {
    "gpt-5-mini": (0.15, 0.60),  # $0.15 input, $0.60 output per 1M tokens
    "gpt-4o-mini": (0.15, 0.60),
    "gpt-4o": (2.50, 10.00),
    "gpt-4-turbo": (10.00, 30.00),
    "gpt-4": (30.00, 60.00),
}

# Estimated tokens per page for cost calculation
# These estimates are based on typical documentation page sizes:
# - Input: ~2000-4000 words per doc page → ~3000 tokens (conservative estimate)
# - Output: ~300 tokens for structured PageSummary with all fields
# Accuracy: Estimates typically within ±30% of actual cost
# Pages with code examples or very long content may exceed these estimates
ESTIMATED_TOKENS_PER_PAGE_INPUT = 3000
ESTIMATED_TOKENS_PER_PAGE_OUTPUT = 400


# Prompt Templates
DEFAULT_SUMMARY_PROMPT = """You are a specialized content analyzer creating structured summaries for llms-brief.txt files. Your role is to help LLMs understand web content by providing comprehensive yet concise summaries.

Focus on:
- What information and resources are available
- When and why an LLM should reference this content
- Key insights and practical applications

Guidelines:
1. Be specific and actionable - avoid vague descriptions
2. Focus on practical utility - what can someone DO with this information?
3. Identify unique value - what makes this page worth referencing?
4. Target 500-800 tokens total across all fields (roughly 2-4 sentences per field)
5. Write from the perspective of helping an LLM know when to use this resource

Provide structured summaries with:
- Core information and resources available (2-3 detailed sentences)
- Specific scenarios when this page should be referenced (3-5 concrete use cases)
- The most valuable insights or capabilities offered (2-3 key points)
- Related domains and topics for context (brief but comprehensive list)
- Searchable keywords for discovery (5-10 specific terms)
- A single-sentence executive summary (15-25 words)

Aim for depth over brevity - each field should contain substantive, actionable information while remaining concise."""
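
The pricing table and per-page token estimates above feed the CLI's estimate_cost helper; a small worked sketch of the same arithmetic (the page count and model choice are arbitrary):

from llmsbrieftxt.constants import (
    ESTIMATED_TOKENS_PER_PAGE_INPUT,
    ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
    OPENAI_PRICING,
)

pages = 200
input_price, output_price = OPENAI_PRICING["gpt-4o"]  # ($2.50, $10.00) per 1M tokens
input_cost = pages * ESTIMATED_TOKENS_PER_PAGE_INPUT / 1_000_000 * input_price     # 600k tokens -> $1.50
output_cost = pages * ESTIMATED_TOKENS_PER_PAGE_OUTPUT / 1_000_000 * output_price  # 80k tokens -> $0.80
print(f"~${input_cost + output_cost:.2f}")  # ~$2.30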
llmsbrieftxt/crawler.py
ADDED
@@ -0,0 +1,358 @@
"""Simple web crawler with sitemap support and BFS fallback."""

import asyncio
import contextlib
import io
import logging
import sys
from collections.abc import Generator
from urllib.parse import urljoin, urlparse

import httpx
from bs4 import BeautifulSoup, Tag
from tenacity import retry, stop_after_attempt, wait_exponential
from usp.tree import sitemap_tree_for_homepage  # type: ignore[import-untyped]

from llmsbrieftxt.url_filters import URLFilter
from llmsbrieftxt.url_utils import URLCanonicalizer

logger = logging.getLogger(__name__)


@contextlib.contextmanager
def suppress_output() -> Generator[None, None, None]:
    """Suppress stdout and stderr temporarily.

    USP library prints noisy errors to console.
    Suppress them when sitemap parsing fails (expected for SPA sites).
    """
    old_stdout = sys.stdout
    old_stderr = sys.stderr
    try:
        sys.stdout = io.StringIO()
        sys.stderr = io.StringIO()
        yield
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr


class RobustDocCrawler:
    """Production-ready documentation crawler with sitemap and BFS fallback."""

    def __init__(
        self,
        max_urls: int | None = None,
        max_depth: int = 3,
        max_concurrent: int = 10,
    ):
        """Initialize the crawler.

        Args:
            max_urls: Maximum number of URLs to crawl (None = unlimited)
            max_depth: Maximum crawl depth
            max_concurrent: Maximum concurrent requests
        """
        self.max_urls = max_urls or 500
        self.max_depth = max_depth
        self.max_concurrent = max_concurrent
        self.discovered_urls: set[str] = set()
        self.user_agent = (
            "llmsbrieftxt-bot/1.0 (+https://github.com/stevennevins/llmsbrief)"
        )
        self.url_canonicalizer = URLCanonicalizer(keep_fragments=False)
        self.url_filter = URLFilter()
        self.semaphore = asyncio.Semaphore(max_concurrent)

    async def discover_urls(self, base_url: str) -> set[str]:
        """Discover URLs using sitemap or BFS crawling.

        Priority:
        1. Sitemap (fastest, most complete)
        2. BFS crawling (fallback)

        Args:
            base_url: The starting URL

        Returns:
            Set of discovered URLs
        """
        self.discovered_urls = set()

        # Strategy 1: Try sitemap first
        logger.info("Strategy 1: Checking for sitemap...")
        sitemap_urls = await self._discover_from_sitemap(base_url)
        if sitemap_urls:
            logger.info(f"Sitemap discovery successful: {len(sitemap_urls)} URLs")
            self.discovered_urls = sitemap_urls
            return self._apply_limits(sitemap_urls)

        # Strategy 2: Fall back to BFS crawling
        logger.info("Strategy 2: Using BFS crawler...")
        crawled_urls = await self._crawl_bfs(base_url)
        logger.info(f"BFS discovery complete: {len(crawled_urls)} URLs")
        self.discovered_urls = crawled_urls
        return self._apply_limits(crawled_urls)

    async def _discover_from_sitemap(self, base_url: str) -> set[str]:
        """Discover URLs from sitemap.xml files.

        Args:
            base_url: The base URL to discover sitemap for

        Returns:
            Set of URLs found in sitemaps
        """
        urls: set[str] = set()

        try:
            # Try standard sitemap location
            parsed = urlparse(base_url)
            base_domain = f"{parsed.scheme}://{parsed.netloc}"
            sitemap_url = f"{base_domain}/sitemap.xml"
            logger.info("Trying standard sitemap location")

            # Parse sitemap (with timeout)
            # Suppress noisy errors from USP library (expected for SPA sites)
            try:
                logger.info(f"Parsing sitemap: {sitemap_url}")
                with suppress_output():
                    tree = await asyncio.wait_for(
                        asyncio.to_thread(sitemap_tree_for_homepage, sitemap_url),  # type: ignore[reportUnknownArgumentType]
                        timeout=30.0,
                    )
                sitemap_pages = [page.url for page in tree.all_pages()]

                if sitemap_pages:
                    logger.info(
                        f"Found {len(sitemap_pages)} URLs in sitemap {sitemap_url}"
                    )

                    # Filter URLs to only those under base path
                    base_path = self._get_base_path(base_url)
                    for url in sitemap_pages:
                        if self._is_under_base_path(url, base_path):
                            urls.add(url)
            except asyncio.TimeoutError:
                logger.debug(f"Timeout parsing sitemap {sitemap_url}")

        except Exception as e:
            logger.debug(f"Sitemap discovery failed: {e}")

        return urls

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=1, min=1, max=10),
        reraise=True,
    )
    async def _fetch_with_retry(self, url: str, client: httpx.AsyncClient) -> str:
        """Fetch URL with retry logic and concurrency limiting."""
        async with self.semaphore:
            response = await client.get(url, timeout=30.0, follow_redirects=True)
            response.raise_for_status()
            return response.text

    async def _crawl_bfs(self, base_url: str) -> set[str]:
        """Crawl using breadth-first search.

        Args:
            base_url: The starting URL

        Returns:
            Set of discovered URLs
        """
        discovered: set[str] = set()
        to_visit: set[str] = {base_url}
        visited: set[str] = set()
        base_path = self._get_base_path(base_url)

        # Print initial discovery status
        print("Discovering URLs: 0 found", end="", flush=True)

        async with httpx.AsyncClient(
            follow_redirects=True, timeout=httpx.Timeout(30.0)
        ) as client:
            for depth in range(self.max_depth):
                if not to_visit or len(discovered) >= self.max_urls:
                    break

                logger.info(f"Depth {depth}: {len(to_visit)} URLs to visit")
                current_level = list(to_visit)
                to_visit: set[str] = set()

                # Process in batches
                for i in range(0, len(current_level), self.max_concurrent):
                    batch = current_level[i : i + self.max_concurrent]
                    tasks = [
                        self._extract_links(url, client, base_path) for url in batch
                    ]
                    results = await asyncio.gather(*tasks, return_exceptions=True)

                    for url, result in zip(batch, results, strict=False):
                        visited.add(url)
                        discovered.add(url)

                        # Update live counter
                        print(
                            f"\rDiscovering URLs: {len(discovered)} found",
                            end="",
                            flush=True,
                        )

                        if not isinstance(result, Exception) and isinstance(
                            result, set
                        ):
                            # Add new URLs to visit
                            for link in result:
                                if (
                                    link not in visited
                                    and link not in discovered
                                    and link not in to_visit
                                    and len(discovered) < self.max_urls
                                ):
                                    to_visit.add(link)

                logger.info(
                    f"Depth {depth} complete: {len(discovered)} URLs discovered"
                )

        # Final newline after live counter
        print()  # Move to next line
        return discovered

    async def _extract_links(
        self, url: str, client: httpx.AsyncClient, base_path: str
    ) -> set[str]:
        """Extract links from a page."""
        links: set[str] = set()

        try:
            html = await self._fetch_with_retry(url, client)
            soup = BeautifulSoup(html, "html.parser")

            for anchor in soup.find_all("a", href=True):
                if not isinstance(anchor, Tag):
                    continue
                href_value = anchor.get("href")
                if (
                    href_value
                    and isinstance(href_value, str)
                    and self._is_valid_doc_link(href_value)
                ):
                    href = href_value
                    absolute_url = urljoin(url, href)
                    if self._is_under_base_path(absolute_url, base_path):
                        links.add(absolute_url)

        except Exception as e:
            logger.debug(f"Failed to extract links from {url}: {e}")

        return links

    def _get_base_path(self, url: str) -> str:
        """Extract the base path from a URL for scope filtering.

        Args:
            url: The URL to extract base path from

        Returns:
            Base path string
        """
        parsed = urlparse(url)

        # For GitHub repos, extract user/repo
        if "github.com" in parsed.netloc:
            parts = parsed.path.strip("/").split("/")
            if len(parts) >= 2:
                # Handle special GitHub paths
                if parts[2:3] == ["wiki"]:
                    return f"/{parts[0]}/{parts[1]}/wiki/"
                elif parts[2:4] == ["tree"]:
                    # Include branch in path
                    if len(parts) >= 4:
                        return f"/{parts[0]}/{parts[1]}/tree/{parts[3]}/"
                    return f"/{parts[0]}/{parts[1]}/"
                elif parts[2:3] == ["blob"]:
                    # Single file path
                    return f"/{parts[0]}/{parts[1]}/"
                else:
                    # Standard repo root
                    return f"/{parts[0]}/{parts[1]}/"

        # For regular docs, use the full path up to the last segment
        path = parsed.path
        if not path or path == "/":
            return "/"

        # If path looks like a file, remove the filename
        if "." in path.split("/")[-1]:
            path = "/".join(path.split("/")[:-1]) + "/"
        elif not path.endswith("/"):
            path = path + "/"

        return path

    def _is_under_base_path(self, url: str, base_path: str) -> bool:
        """Check if URL is under the base path.

        Args:
            url: The URL to check
            base_path: The base path to compare against

        Returns:
            True if URL is under base path
        """
        parsed = urlparse(url)
        url_path = parsed.path if parsed.path else "/"

        # Ensure consistent trailing slashes
        if not url_path.endswith("/") and "." not in url_path.split("/")[-1]:
            url_path = url_path + "/"

        return url_path.startswith(base_path)

    def _is_valid_doc_link(self, link: str) -> bool:
        """Check if a link is likely to be documentation using filtering.

        Args:
            link: The link to check

        Returns:
            True if link appears to be documentation
        """
        # Skip invalid links
        if not link or link.startswith("#") or link.startswith("javascript:"):
            return False

        # Use URLFilter for extension-based filtering
        return self.url_filter.should_include(link)

    def _apply_limits(self, urls: set[str]) -> set[str]:
        """Apply canonicalization, deduplication, and max_urls limit.

        Args:
            urls: Set of URLs to process

        Returns:
            Canonicalized, deduplicated, and limited set of URLs
        """
        # Convert set to list for deduplication (preserves order)
        url_list = list(urls)

        # Deduplicate using URL canonicalizer
        unique_urls = self.url_canonicalizer.deduplicate(url_list)

        duplicates_removed = len(url_list) - len(unique_urls)
        if duplicates_removed > 0:
            logger.info(
                f"Removed {duplicates_removed} duplicate URLs "
                f"({len(url_list)} -> {len(unique_urls)})"
            )

        # Apply max_urls limit
        if self.max_urls and len(unique_urls) > self.max_urls:
            logger.info(f"Limiting {len(unique_urls)} URLs to {self.max_urls}")
            unique_urls = unique_urls[: self.max_urls]

        return set(unique_urls)
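
A minimal sketch of driving the crawler above directly (not part of the packaged files; the URL and limits are arbitrary, and the httpx, bs4, tenacity, and usp dependencies imported by the module must be installed):

import asyncio

from llmsbrieftxt.crawler import RobustDocCrawler


async def preview(url: str) -> None:
    crawler = RobustDocCrawler(max_urls=50, max_depth=2, max_concurrent=5)
    urls = await crawler.discover_urls(url)  # tries sitemap.xml first, falls back to BFS
    for discovered in sorted(urls):
        print(discovered)


asyncio.run(preview("https://docs.python.org/3/"))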