llmsbrieftxt-1.6.0-py3-none-any.whl

@@ -0,0 +1 @@
+ __version__ = "0.6.1"
llmsbrieftxt/cli.py ADDED
@@ -0,0 +1,276 @@
+ """Command-line interface for llmsbrieftxt."""
+
+ import argparse
+ import asyncio
+ import os
+ import sys
+ from pathlib import Path
+ from urllib.parse import urlparse
+
+ from llmsbrieftxt.constants import (
+     DEFAULT_CACHE_DIR,
+     DEFAULT_CONCURRENT_SUMMARIES,
+     DEFAULT_CRAWL_DEPTH,
+     DEFAULT_OPENAI_MODEL,
+     DOCS_DIR,
+     ESTIMATED_TOKENS_PER_PAGE_INPUT,
+     ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+     OPENAI_PRICING,
+ )
+ from llmsbrieftxt.main import generate_llms_txt
+
+
+ def parse_args(test_args: list[str] | None = None) -> argparse.Namespace:
+     """Parse command-line arguments."""
+     parser = argparse.ArgumentParser(
+         description="Generate llms-brief.txt files from documentation websites",
+         epilog="Example: llmsbrieftxt https://docs.python.org/3/",
+     )
+
+     # Positional argument for URL
+     parser.add_argument("url", help="URL of the documentation site to process")
+
+     parser.add_argument(
+         "--model",
+         default=DEFAULT_OPENAI_MODEL,
+         help=f"OpenAI model to use (default: {DEFAULT_OPENAI_MODEL})",
+     )
+
+     parser.add_argument(
+         "--max-concurrent-summaries",
+         type=int,
+         default=DEFAULT_CONCURRENT_SUMMARIES,
+         help=f"Maximum number of concurrent LLM requests (default: {DEFAULT_CONCURRENT_SUMMARIES})",
+     )
+
+     parser.add_argument(
+         "--output",
+         type=str,
+         default=None,
+         help=f"Output file path (default: {DOCS_DIR}/<domain>.txt)",
+     )
+
+     parser.add_argument(
+         "--show-urls",
+         action="store_true",
+         help="Preview discovered URLs with cost estimate (no processing or API calls)",
+     )
+
+     parser.add_argument(
+         "--max-urls", type=int, help="Maximum number of URLs to discover and process"
+     )
+
+     parser.add_argument(
+         "--depth",
+         type=int,
+         default=DEFAULT_CRAWL_DEPTH,
+         help=f"Maximum crawl depth (default: {DEFAULT_CRAWL_DEPTH})",
+     )
+
+     parser.add_argument(
+         "--cache-dir",
+         type=str,
+         default=DEFAULT_CACHE_DIR,
+         help=f"Cache directory path (default: {DEFAULT_CACHE_DIR})",
+     )
+
+     parser.add_argument(
+         "--use-cache-only",
+         action="store_true",
+         help="Use only cached summaries, skip API calls for new pages",
+     )
+
+     parser.add_argument(
+         "--force-refresh",
+         action="store_true",
+         help="Ignore cache and regenerate all summaries",
+     )
+
+     parser.add_argument(
+         "--yes",
+         "-y",
+         action="store_true",
+         help="Skip confirmation prompts (useful for automation)",
+     )
+
+     return parser.parse_args(test_args)
+
+
+ def validate_url(url: str) -> bool:
+     """Validate that the URL is well-formed and uses http/https scheme."""
+     try:
+         parsed = urlparse(url)
+         return bool(parsed.scheme in ("http", "https") and parsed.netloc)
+     except Exception:
+         return False
+
+
+ def check_openai_api_key() -> bool:
+     """Check if OPENAI_API_KEY is set in environment."""
+     return bool(os.environ.get("OPENAI_API_KEY"))
+
+
+ def estimate_cost(num_pages: int, model: str) -> str:
+     """
+     Estimate the API cost for processing a given number of pages.
+
+     Args:
+         num_pages: Number of pages to process
+         model: OpenAI model name
+
+     Returns:
+         Formatted cost estimate string
+     """
+     if model not in OPENAI_PRICING:
+         return "Cost estimation not available for this model"
+
+     input_price, output_price = OPENAI_PRICING[model]
+
+     # Calculate total tokens
+     total_input_tokens = num_pages * ESTIMATED_TOKENS_PER_PAGE_INPUT
+     total_output_tokens = num_pages * ESTIMATED_TOKENS_PER_PAGE_OUTPUT
+
+     # Calculate cost (prices are per 1M tokens)
+     input_cost = (total_input_tokens / 1_000_000) * input_price
+     output_cost = (total_output_tokens / 1_000_000) * output_price
+     total_cost = input_cost + output_cost
+
+     if total_cost < 0.01:
+         return f"~${total_cost:.4f}"
+     elif total_cost < 1.00:
+         return f"~${total_cost:.3f}"
+     else:
+         return f"~${total_cost:.2f}"
+
+
+ def get_output_path(url: str, custom_output: str | None = None) -> Path:
+     """
+     Get the output file path for a given URL.
+
+     Args:
+         url: The URL being processed
+         custom_output: Optional custom output path
+
+     Returns:
+         Path object for the output file
+     """
+     if custom_output:
+         # Expand environment variables and user home directory
+         expanded = os.path.expandvars(custom_output)
+         return Path(expanded).expanduser()
+
+     # Extract domain from URL
+     parsed = urlparse(url)
+     domain = parsed.netloc or parsed.path.split("/")[0]
+
+     # Remove www. prefix if present
+     if domain.startswith("www."):
+         domain = domain[4:]
+
+     # Ensure ~/.claude/docs/ exists
+     docs_dir = Path(DOCS_DIR).expanduser()
+     docs_dir.mkdir(parents=True, exist_ok=True)
+
+     return docs_dir / f"{domain}.txt"
+
+
+ def main() -> None:
+     """Main entry point for the CLI."""
+     args = parse_args()
+
+     try:
+         # Validate URL
+         if not validate_url(args.url):
+             print("Error: Invalid URL", file=sys.stderr)
+             print(
+                 f"Please provide a valid HTTP or HTTPS URL. Got: {args.url}",
+                 file=sys.stderr,
+             )
+             print("Example: https://docs.python.org/3/", file=sys.stderr)
+             sys.exit(1)
+
+         # Validate depth parameter
+         if args.depth < 1:
+             print("Error: --depth must be at least 1", file=sys.stderr)
+             sys.exit(1)
+
+         # Check for conflicting cache flags
+         if args.use_cache_only and args.force_refresh:
+             print(
+                 "Error: Cannot use --use-cache-only and --force-refresh together",
+                 file=sys.stderr,
+             )
+             sys.exit(1)
+
+         # Check for API key (unless just showing URLs or using cache only)
+         if (
+             not args.show_urls
+             and not args.use_cache_only
+             and not check_openai_api_key()
+         ):
+             print("Error: OPENAI_API_KEY not found", file=sys.stderr)
+             print("Please set your OpenAI API key:", file=sys.stderr)
+             print(" export OPENAI_API_KEY='sk-your-api-key-here'", file=sys.stderr)
+             print("", file=sys.stderr)
+             print(
+                 "Get your API key from: https://platform.openai.com/api-keys",
+                 file=sys.stderr,
+             )
+             sys.exit(1)
+
+         # Determine output path
+         output_path = get_output_path(args.url, args.output)
+
+         # Expand cache directory path
+         cache_dir = Path(os.path.expandvars(args.cache_dir)).expanduser()
+
+         # Print configuration
+         print(f"Processing URL: {args.url}")
+         if not args.show_urls:
+             print(f"Using model: {args.model}")
+             print(f"Crawl depth: {args.depth}")
+             print(f"Output: {output_path}")
+             if args.max_urls:
+                 print(f"Max URLs: {args.max_urls}")
+             if args.use_cache_only:
+                 print("Mode: Cache-only (no API calls)")
+             elif args.force_refresh:
+                 print("Mode: Force refresh (ignoring cache)")
+
+         # Run generation
+         result = asyncio.run(
+             generate_llms_txt(
+                 url=args.url,
+                 llm_name=args.model,
+                 max_concurrent_summaries=args.max_concurrent_summaries,
+                 output_path=str(output_path),
+                 show_urls=args.show_urls,
+                 max_urls=args.max_urls,
+                 max_depth=args.depth,
+                 cache_dir=str(cache_dir),
+                 use_cache_only=args.use_cache_only,
+                 force_refresh=args.force_refresh,
+                 skip_confirmation=args.yes,
+             )
+         )
+
+         # Show cost estimate and failed URLs if available
+         if args.show_urls and result:
+             num_urls_value = result.get("num_urls", 0)
+             # Type guard to ensure we have an int
+             if isinstance(num_urls_value, int):
+                 print(
+                     f"\nEstimated cost for {num_urls_value} pages: {estimate_cost(num_urls_value, args.model)}"
+                 )
+                 print("Note: Actual cost may vary based on page content size and caching")
+
+     except KeyboardInterrupt:
+         print("\nOperation cancelled by user.", file=sys.stderr)
+         sys.exit(1)
+     except Exception as e:
+         print(f"Error: {str(e)}", file=sys.stderr)
+         sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
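
Note: the default output naming in get_output_path maps a documentation URL to a per-domain file under DOCS_DIR (~/.claude/docs). A minimal standalone sketch of that mapping, for illustration only (it does not import the package, and the helper name sketch_output_path is hypothetical):

from pathlib import Path
from urllib.parse import urlparse

def sketch_output_path(url: str, docs_dir: str = "~/.claude/docs") -> Path:
    # Mirrors the default branch of cli.get_output_path: <domain>.txt under DOCS_DIR
    parsed = urlparse(url)
    domain = parsed.netloc or parsed.path.split("/")[0]
    if domain.startswith("www."):
        domain = domain[4:]  # strip the www. prefix, as the CLI does
    return Path(docs_dir).expanduser() / f"{domain}.txt"

# https://www.example.com/docs/ -> ~/.claude/docs/example.com.txt
print(sketch_output_path("https://www.example.com/docs/"))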
@@ -0,0 +1,62 @@
+ """Configuration constants for llmsbrieftxt package."""
+
+ # Concurrency
+ DEFAULT_CONCURRENT_SUMMARIES = 10
+
+ # Default Models
+ DEFAULT_OPENAI_MODEL = "gpt-5-mini"
+
+ # Docs Directory
+ DOCS_DIR = "~/.claude/docs"  # Will be expanded to full path at runtime
+
+ # Default Cache Directory
+ DEFAULT_CACHE_DIR = ".llmsbrieftxt_cache"
+
+ # Default Crawl Depth
+ DEFAULT_CRAWL_DEPTH = 3
+
+ # OpenAI Pricing (per 1M tokens) - prices subject to change
+ # Format: {model: (input_price, output_price)}
+ # Note: Verify current pricing at https://openai.com/api/pricing/
+ OPENAI_PRICING = {
+     "gpt-5-mini": (0.15, 0.60),  # $0.15 input, $0.60 output per 1M tokens
+     "gpt-4o-mini": (0.15, 0.60),
+     "gpt-4o": (2.50, 10.00),
+     "gpt-4-turbo": (10.00, 30.00),
+     "gpt-4": (30.00, 60.00),
+ }
+
+ # Estimated tokens per page for cost calculation
+ # These estimates are based on typical documentation page sizes:
+ # - Input: ~2000-4000 words per doc page → ~3000 tokens (conservative estimate)
+ # - Output: ~300 tokens for structured PageSummary with all fields
+ # Accuracy: Estimates typically within ±30% of actual cost
+ # Pages with code examples or very long content may exceed these estimates
+ ESTIMATED_TOKENS_PER_PAGE_INPUT = 3000
+ ESTIMATED_TOKENS_PER_PAGE_OUTPUT = 400
+
+
+ # Prompt Templates
+ DEFAULT_SUMMARY_PROMPT = """You are a specialized content analyzer creating structured summaries for llms-brief.txt files. Your role is to help LLMs understand web content by providing comprehensive yet concise summaries.
+
+ Focus on:
+ - What information and resources are available
+ - When and why an LLM should reference this content
+ - Key insights and practical applications
+
+ Guidelines:
+ 1. Be specific and actionable - avoid vague descriptions
+ 2. Focus on practical utility - what can someone DO with this information?
+ 3. Identify unique value - what makes this page worth referencing?
+ 4. Target 500-800 tokens total across all fields (roughly 2-4 sentences per field)
+ 5. Write from the perspective of helping an LLM know when to use this resource
+
+ Provide structured summaries with:
+ - Core information and resources available (2-3 detailed sentences)
+ - Specific scenarios when this page should be referenced (3-5 concrete use cases)
+ - The most valuable insights or capabilities offered (2-3 key points)
+ - Related domains and topics for context (brief but comprehensive list)
+ - Searchable keywords for discovery (5-10 specific terms)
+ - A single-sentence executive summary (15-25 words)
+
+ Aim for depth over brevity - each field should contain substantive, actionable information while remaining concise."""
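
Note: as a quick sanity check on these defaults, the cost formula in cli.estimate_cost works out as follows for 100 pages at the gpt-4o-mini rates listed above (a worked sketch using only the constants defined in this file):

num_pages = 100
input_price, output_price = 0.15, 0.60          # OPENAI_PRICING["gpt-4o-mini"], dollars per 1M tokens
total_input_tokens = num_pages * 3000           # ESTIMATED_TOKENS_PER_PAGE_INPUT
total_output_tokens = num_pages * 400           # ESTIMATED_TOKENS_PER_PAGE_OUTPUT
input_cost = (total_input_tokens / 1_000_000) * input_price     # 0.30M tokens * $0.15 = $0.045
output_cost = (total_output_tokens / 1_000_000) * output_price  # 0.04M tokens * $0.60 = $0.024
print(f"~${input_cost + output_cost:.3f}")      # ~$0.069 (three decimals, since the total is under $1.00)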
@@ -0,0 +1,358 @@
+ """Simple web crawler with sitemap support and BFS fallback."""
+
+ import asyncio
+ import contextlib
+ import io
+ import logging
+ import sys
+ from collections.abc import Generator
+ from urllib.parse import urljoin, urlparse
+
+ import httpx
+ from bs4 import BeautifulSoup, Tag
+ from tenacity import retry, stop_after_attempt, wait_exponential
+ from usp.tree import sitemap_tree_for_homepage  # type: ignore[import-untyped]
+
+ from llmsbrieftxt.url_filters import URLFilter
+ from llmsbrieftxt.url_utils import URLCanonicalizer
+
+ logger = logging.getLogger(__name__)
+
+
+ @contextlib.contextmanager
+ def suppress_output() -> Generator[None, None, None]:
+     """Suppress stdout and stderr temporarily.
+
+     The USP library prints noisy errors to the console.
+     Suppress them when sitemap parsing fails (expected for SPA sites).
+     """
+     old_stdout = sys.stdout
+     old_stderr = sys.stderr
+     try:
+         sys.stdout = io.StringIO()
+         sys.stderr = io.StringIO()
+         yield
+     finally:
+         sys.stdout = old_stdout
+         sys.stderr = old_stderr
+
+
+ class RobustDocCrawler:
+     """Production-ready documentation crawler with sitemap and BFS fallback."""
+
+     def __init__(
+         self,
+         max_urls: int | None = None,
+         max_depth: int = 3,
+         max_concurrent: int = 10,
+     ):
+         """Initialize the crawler.
+
+         Args:
+             max_urls: Maximum number of URLs to crawl (None = unlimited)
+             max_depth: Maximum crawl depth
+             max_concurrent: Maximum concurrent requests
+         """
+         self.max_urls = max_urls or 500
+         self.max_depth = max_depth
+         self.max_concurrent = max_concurrent
+         self.discovered_urls: set[str] = set()
+         self.user_agent = (
+             "llmsbrieftxt-bot/1.0 (+https://github.com/stevennevins/llmsbrief)"
+         )
+         self.url_canonicalizer = URLCanonicalizer(keep_fragments=False)
+         self.url_filter = URLFilter()
+         self.semaphore = asyncio.Semaphore(max_concurrent)
+
+     async def discover_urls(self, base_url: str) -> set[str]:
+         """Discover URLs using sitemap or BFS crawling.
+
+         Priority:
+         1. Sitemap (fastest, most complete)
+         2. BFS crawling (fallback)
+
+         Args:
+             base_url: The starting URL
+
+         Returns:
+             Set of discovered URLs
+         """
+         self.discovered_urls = set()
+
+         # Strategy 1: Try sitemap first
+         logger.info("Strategy 1: Checking for sitemap...")
+         sitemap_urls = await self._discover_from_sitemap(base_url)
+         if sitemap_urls:
+             logger.info(f"Sitemap discovery successful: {len(sitemap_urls)} URLs")
+             self.discovered_urls = sitemap_urls
+             return self._apply_limits(sitemap_urls)
+
+         # Strategy 2: Fall back to BFS crawling
+         logger.info("Strategy 2: Using BFS crawler...")
+         crawled_urls = await self._crawl_bfs(base_url)
+         logger.info(f"BFS discovery complete: {len(crawled_urls)} URLs")
+         self.discovered_urls = crawled_urls
+         return self._apply_limits(crawled_urls)
+
+     async def _discover_from_sitemap(self, base_url: str) -> set[str]:
+         """Discover URLs from sitemap.xml files.
+
+         Args:
+             base_url: The base URL to discover sitemap for
+
+         Returns:
+             Set of URLs found in sitemaps
+         """
+         urls: set[str] = set()
+
+         try:
+             # Try standard sitemap location
+             parsed = urlparse(base_url)
+             base_domain = f"{parsed.scheme}://{parsed.netloc}"
+             sitemap_url = f"{base_domain}/sitemap.xml"
+             logger.info("Trying standard sitemap location")
+
+             # Parse sitemap (with timeout)
+             # Suppress noisy errors from USP library (expected for SPA sites)
+             try:
+                 logger.info(f"Parsing sitemap: {sitemap_url}")
+                 with suppress_output():
+                     tree = await asyncio.wait_for(
+                         asyncio.to_thread(sitemap_tree_for_homepage, sitemap_url),  # type: ignore[reportUnknownArgumentType]
+                         timeout=30.0,
+                     )
+                 sitemap_pages = [page.url for page in tree.all_pages()]
+
+                 if sitemap_pages:
+                     logger.info(
+                         f"Found {len(sitemap_pages)} URLs in sitemap {sitemap_url}"
+                     )
+
+                     # Filter URLs to only those under base path
+                     base_path = self._get_base_path(base_url)
+                     for url in sitemap_pages:
+                         if self._is_under_base_path(url, base_path):
+                             urls.add(url)
+             except asyncio.TimeoutError:
+                 logger.debug(f"Timeout parsing sitemap {sitemap_url}")
+
+         except Exception as e:
+             logger.debug(f"Sitemap discovery failed: {e}")
+
+         return urls
+
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=1, min=1, max=10),
+         reraise=True,
+     )
+     async def _fetch_with_retry(self, url: str, client: httpx.AsyncClient) -> str:
+         """Fetch URL with retry logic and concurrency limiting."""
+         async with self.semaphore:
+             response = await client.get(url, timeout=30.0, follow_redirects=True)
+             response.raise_for_status()
+             return response.text
+
+     async def _crawl_bfs(self, base_url: str) -> set[str]:
+         """Crawl using breadth-first search.
+
+         Args:
+             base_url: The starting URL
+
+         Returns:
+             Set of discovered URLs
+         """
+         discovered: set[str] = set()
+         to_visit: set[str] = {base_url}
+         visited: set[str] = set()
+         base_path = self._get_base_path(base_url)
+
+         # Print initial discovery status
+         print("Discovering URLs: 0 found", end="", flush=True)
+
+         async with httpx.AsyncClient(
+             follow_redirects=True, timeout=httpx.Timeout(30.0)
+         ) as client:
+             for depth in range(self.max_depth):
+                 if not to_visit or len(discovered) >= self.max_urls:
+                     break
+
+                 logger.info(f"Depth {depth}: {len(to_visit)} URLs to visit")
+                 current_level = list(to_visit)
+                 to_visit = set()
+
+                 # Process in batches
+                 for i in range(0, len(current_level), self.max_concurrent):
+                     batch = current_level[i : i + self.max_concurrent]
+                     tasks = [
+                         self._extract_links(url, client, base_path) for url in batch
+                     ]
+                     results = await asyncio.gather(*tasks, return_exceptions=True)
+
+                     for url, result in zip(batch, results, strict=False):
+                         visited.add(url)
+                         discovered.add(url)
+
+                         # Update live counter
+                         print(
+                             f"\rDiscovering URLs: {len(discovered)} found",
+                             end="",
+                             flush=True,
+                         )
+
+                         if not isinstance(result, Exception) and isinstance(
+                             result, set
+                         ):
+                             # Add new URLs to visit
+                             for link in result:
+                                 if (
+                                     link not in visited
+                                     and link not in discovered
+                                     and link not in to_visit
+                                     and len(discovered) < self.max_urls
+                                 ):
+                                     to_visit.add(link)
+
+                 logger.info(
+                     f"Depth {depth} complete: {len(discovered)} URLs discovered"
+                 )
+
+         # Final newline after live counter
+         print()  # Move to next line
+         return discovered
+
+     async def _extract_links(
+         self, url: str, client: httpx.AsyncClient, base_path: str
+     ) -> set[str]:
+         """Extract links from a page."""
+         links: set[str] = set()
+
+         try:
+             html = await self._fetch_with_retry(url, client)
+             soup = BeautifulSoup(html, "html.parser")
+
+             for anchor in soup.find_all("a", href=True):
+                 if not isinstance(anchor, Tag):
+                     continue
+                 href_value = anchor.get("href")
+                 if (
+                     href_value
+                     and isinstance(href_value, str)
+                     and self._is_valid_doc_link(href_value)
+                 ):
+                     href = href_value
+                     absolute_url = urljoin(url, href)
+                     if self._is_under_base_path(absolute_url, base_path):
+                         links.add(absolute_url)
+
+         except Exception as e:
+             logger.debug(f"Failed to extract links from {url}: {e}")
+
+         return links
+
+     def _get_base_path(self, url: str) -> str:
+         """Extract the base path from a URL for scope filtering.
+
+         Args:
+             url: The URL to extract base path from
+
+         Returns:
+             Base path string
+         """
+         parsed = urlparse(url)
+
+         # For GitHub repos, extract user/repo
+         if "github.com" in parsed.netloc:
+             parts = parsed.path.strip("/").split("/")
+             if len(parts) >= 2:
+                 # Handle special GitHub paths
+                 if parts[2:3] == ["wiki"]:
+                     return f"/{parts[0]}/{parts[1]}/wiki/"
+                 elif parts[2:3] == ["tree"]:
+                     # Include branch in path
+                     if len(parts) >= 4:
+                         return f"/{parts[0]}/{parts[1]}/tree/{parts[3]}/"
+                     return f"/{parts[0]}/{parts[1]}/"
+                 elif parts[2:3] == ["blob"]:
+                     # Single file path
+                     return f"/{parts[0]}/{parts[1]}/"
+                 else:
+                     # Standard repo root
+                     return f"/{parts[0]}/{parts[1]}/"
+
+         # For regular docs, use the full path up to the last segment
+         path = parsed.path
+         if not path or path == "/":
+             return "/"
+
+         # If path looks like a file, remove the filename
+         if "." in path.split("/")[-1]:
+             path = "/".join(path.split("/")[:-1]) + "/"
+         elif not path.endswith("/"):
+             path = path + "/"
+
+         return path
+
+     def _is_under_base_path(self, url: str, base_path: str) -> bool:
+         """Check if URL is under the base path.
+
+         Args:
+             url: The URL to check
+             base_path: The base path to compare against
+
+         Returns:
+             True if URL is under base path
+         """
+         parsed = urlparse(url)
+         url_path = parsed.path if parsed.path else "/"
+
+         # Ensure consistent trailing slashes
+         if not url_path.endswith("/") and "." not in url_path.split("/")[-1]:
+             url_path = url_path + "/"
+
+         return url_path.startswith(base_path)
+
+     def _is_valid_doc_link(self, link: str) -> bool:
+         """Check if a link is likely to be documentation using filtering.
+
+         Args:
+             link: The link to check
+
+         Returns:
+             True if link appears to be documentation
+         """
+         # Skip invalid links
+         if not link or link.startswith("#") or link.startswith("javascript:"):
+             return False
+
+         # Use URLFilter for extension-based filtering
+         return self.url_filter.should_include(link)
+
+     def _apply_limits(self, urls: set[str]) -> set[str]:
+         """Apply canonicalization, deduplication, and max_urls limit.
+
+         Args:
+             urls: Set of URLs to process
+
+         Returns:
+             Canonicalized, deduplicated, and limited set of URLs
+         """
+         # Convert set to list for deduplication (preserves order)
+         url_list = list(urls)
+
+         # Deduplicate using URL canonicalizer
+         unique_urls = self.url_canonicalizer.deduplicate(url_list)
+
+         duplicates_removed = len(url_list) - len(unique_urls)
+         if duplicates_removed > 0:
+             logger.info(
+                 f"Removed {duplicates_removed} duplicate URLs "
+                 f"({len(url_list)} -> {len(unique_urls)})"
+             )
+
+         # Apply max_urls limit
+         if self.max_urls and len(unique_urls) > self.max_urls:
+             logger.info(f"Limiting {len(unique_urls)} URLs to {self.max_urls}")
+             unique_urls = unique_urls[: self.max_urls]
+
+         return set(unique_urls)
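
Note: a minimal usage sketch of the crawler, assuming the package is installed and that this module is importable as llmsbrieftxt.crawler (the file name for this hunk is not shown in the diff, so the import path is an assumption):

import asyncio

from llmsbrieftxt.crawler import RobustDocCrawler  # import path assumed, see note above

async def preview(url: str) -> None:
    # Sitemap discovery is tried first; BFS crawling is the fallback.
    crawler = RobustDocCrawler(max_urls=50, max_depth=2, max_concurrent=5)
    urls = await crawler.discover_urls(url)
    for discovered in sorted(urls):
        print(discovered)

asyncio.run(preview("https://docs.python.org/3/"))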