llmsbrieftxt-1.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
llmsbrieftxt/doc_loader.py ADDED
@@ -0,0 +1,150 @@
+ """Documentation-aware loader for intelligently discovering and crawling documentation sites."""
+
+ import asyncio
+ import logging
+ from collections.abc import Callable
+
+ import httpx
+ from tqdm import tqdm
+
+ from llmsbrieftxt.crawler import RobustDocCrawler
+ from llmsbrieftxt.extractor import default_extractor
+ from llmsbrieftxt.schema import Document
+
+ logger = logging.getLogger(__name__)
+
+
+ class DocLoader:
+     """Main documentation loader using robust crawling strategies."""
+
+     def __init__(
+         self,
+         max_urls: int | None = None,
+         max_concurrent: int = 10,
+         max_depth: int = 3,
+     ):
+         """Initialize the documentation loader.
+
+         Args:
+             max_urls: Optional maximum number of URLs to discover
+             max_concurrent: Maximum concurrent requests (default 10)
+             max_depth: Maximum crawl depth (default 3)
+         """
+         self.max_urls = max_urls
+         self.max_concurrent = max_concurrent
+         self.max_depth = max_depth
+         self.crawler = RobustDocCrawler(
+             max_urls=max_urls,
+             max_depth=max_depth,
+             max_concurrent=max_concurrent,
+         )
+
+     async def load_docs(
+         self,
+         url: str,
+         extractor: Callable[[str], str] | None = None,
+         show_urls: bool = False,
+     ) -> tuple[list[Document], list[str]]:
+         """Load documentation pages using robust discovery strategies.
+
+         Args:
+             url: The base URL to start from
+             extractor: Optional content extractor function
+             show_urls: Whether to return discovered URLs without loading
+
+         Returns:
+             Tuple of (documents, discovered_urls)
+         """
+         if extractor is None:
+             extractor = default_extractor
+
+         logger.info(f"Starting documentation discovery for {url}")
+         print(f"Discovering documentation from {url}...")
+
+         # Use RobustDocCrawler to discover URLs
+         discovered_urls = await self.crawler.discover_urls(url)
+
+         print(f"\nFound {len(discovered_urls)} pages")
+
+         if show_urls:
+             # Return empty documents but include URLs for preview
+             return [], sorted(discovered_urls)
+
+         # Load content from discovered URLs
+         documents = await self._load_documents(list(discovered_urls), extractor)
+
+         return documents, sorted(discovered_urls)
+
+     async def _load_documents(
+         self, urls: list[str], extractor: Callable[[str], str]
+     ) -> list[Document]:
+         """Load content from URLs and create documents.
+
+         Args:
+             urls: List of URLs to load
+             extractor: Function to extract content from HTML
+
+         Returns:
+             List of Document objects
+         """
+         documents: list[Document] = []
+         url_list: list[str] = urls
+
+         async with httpx.AsyncClient(
+             follow_redirects=True, timeout=httpx.Timeout(30.0)
+         ) as client:
+             # Create semaphore for concurrency control
+             semaphore = asyncio.Semaphore(self.max_concurrent)
+
+             async def load_with_limit(url: str) -> Document | None:
+                 """Load document with semaphore-controlled concurrency."""
+                 async with semaphore:
+                     return await self._load_single_document(url, client, extractor)
+
+             # Process all URLs concurrently with semaphore limiting parallelism
+             with tqdm(
+                 total=len(url_list), desc="Loading documents", unit="doc"
+             ) as pbar:
+                 tasks = [load_with_limit(url) for url in url_list]
+
+                 # Use as_completed to update progress bar as tasks finish
+                 for coro in asyncio.as_completed(tasks):
+                     result = await coro
+                     if isinstance(result, Exception):
+                         logger.warning(f"Failed to load: {result}")
+                     elif result is not None:
+                         documents.append(result)
+                     pbar.update(1)
+
+         return documents
+
+     async def _load_single_document(
+         self, url: str, client: httpx.AsyncClient, extractor: Callable[[str], str]
+     ) -> Document | None:
+         """Load a single document from a URL.
+
+         Args:
+             url: The URL to load
+             client: HTTP client
+             extractor: Content extraction function
+
+         Returns:
+             Document object or None if failed
+         """
+         try:
+             response = await client.get(url, timeout=30.0, follow_redirects=True)
+             if response.status_code == 200:
+                 content = extractor(response.text)
+                 if content and len(content.strip()) > 100:
+                     return Document(
+                         page_content=content, metadata={"source": url, "url": url}
+                     )
+                 else:
+                     logger.debug(f"No meaningful content extracted from {url}")
+                     return None
+             else:
+                 logger.debug(f"HTTP {response.status_code} for {url}")
+                 return None
+         except Exception as e:
+             logger.debug(f"Failed to load {url}: {e}")
+             return None
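For orientation, a minimal sketch (not part of the package) of how this loader is driven; the documentation URL is a placeholder:

import asyncio
from llmsbrieftxt.doc_loader import DocLoader

async def preview() -> None:
    loader = DocLoader(max_urls=50, max_concurrent=5, max_depth=2)
    # show_urls=True returns no documents, only the sorted list of discovered URLs
    docs, urls = await loader.load_docs("https://docs.example.com", show_urls=True)
    print(f"Discovered {len(urls)} pages")

asyncio.run(preview())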
llmsbrieftxt/extractor.py ADDED
@@ -0,0 +1,69 @@
+ import logging
+ from typing import Any
+
+ from trafilatura import extract
+ from trafilatura.settings import use_config
+
+ logger = logging.getLogger(__name__)
+
+
+ # Cache Trafilatura config to avoid recreating on every extraction
+ _trafilatura_config: Any = None
+
+
+ def _get_trafilatura_config() -> Any:
+     """Get or create cached Trafilatura config."""
+     global _trafilatura_config
+     if _trafilatura_config is None:
+         _trafilatura_config = use_config()
+         _trafilatura_config.set("DEFAULT", "MIN_EXTRACTED_SIZE", "200")
+         _trafilatura_config.set("DEFAULT", "MIN_FILE_SIZE", "100")
+     return _trafilatura_config
+
+
+ def default_extractor(html: str) -> str:
+     """
+     Extract main content from HTML using Trafilatura.
+
+     Trafilatura intelligently extracts the main content while filtering out:
+     - Navigation menus
+     - Sidebars
+     - Footers
+     - Cookie banners
+     - Advertisements
+
+     Args:
+         html: Raw HTML content
+
+     Returns:
+         Extracted content as markdown string (empty string if extraction fails)
+     """
+     # Get cached Trafilatura config
+     config = _get_trafilatura_config()
+
+     # Extract with Trafilatura
+     try:
+         result = extract(
+             html,
+             config=config,
+             output_format="markdown",
+             include_links=True,
+             include_images=False,  # Images not needed for text summaries
+             include_tables=True,
+             include_comments=False,
+             favor_recall=True,  # Prefer extracting more rather than less
+         )
+
+         if result:
+             logger.debug(
+                 f"Trafilatura extracted {len(result)} chars "
+                 f"(reduced from {len(html)} chars HTML)"
+             )
+             return str(result)
+         else:
+             logger.debug("Trafilatura extraction returned no content")
+             return ""
+
+     except Exception as e:
+         logger.warning(f"Trafilatura extraction failed: {e}")
+         return ""
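A minimal sketch of calling the extractor directly (not part of the package; the URL is a placeholder, and httpx is already a dependency of the loader):

import httpx
from llmsbrieftxt.extractor import default_extractor

html = httpx.get("https://docs.example.com/guide", follow_redirects=True).text
markdown = default_extractor(html)  # returns "" if Trafilatura finds no main content
print(markdown[:200])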
llmsbrieftxt/main.py ADDED
@@ -0,0 +1,379 @@
+ """Main generation pipeline for llmsbrieftxt."""
+
+ import json
+ import logging
+ import re
+ from pathlib import Path
+
+ from llmsbrieftxt.constants import (
+     ESTIMATED_TOKENS_PER_PAGE_INPUT,
+     ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+     OPENAI_PRICING,
+ )
+ from llmsbrieftxt.doc_loader import DocLoader
+ from llmsbrieftxt.extractor import default_extractor
+ from llmsbrieftxt.summarizer import Summarizer
+
+ logger = logging.getLogger(__name__)
+
+
+ def calculate_actual_cost(input_tokens: int, output_tokens: int, model: str) -> float:
+     """
+     Calculate actual API cost from token usage.
+
+     Args:
+         input_tokens: Number of input tokens used
+         output_tokens: Number of output tokens generated
+         model: OpenAI model name
+
+     Returns:
+         Total cost in dollars
+     """
+     if model not in OPENAI_PRICING:
+         return 0.0
+
+     input_price, output_price = OPENAI_PRICING[model]
+     input_cost = (input_tokens / 1_000_000) * input_price
+     output_cost = (output_tokens / 1_000_000) * output_price
+     return input_cost + output_cost
+
+
+ def format_cost(cost: float) -> str:
+     """Format cost as a dollar string."""
+     if cost < 0.01:
+         return f"${cost:.4f}"
+     elif cost < 1.00:
+         return f"${cost:.3f}"
+     else:
+         return f"${cost:.2f}"
+
+
+ def get_cache_stats(cache_file: Path, model: str) -> dict[str, int | float | str]:
+     """
+     Get cache statistics including size and estimated savings.
+
+     Args:
+         cache_file: Path to cache file
+         model: OpenAI model name for cost calculation
+
+     Returns:
+         Dictionary with cache statistics
+     """
+     if not cache_file.exists():
+         return {
+             "num_entries": 0,
+             "size_mb": 0.0,
+             "estimated_savings": "$0.00",
+         }
+
+     try:
+         # Get file size
+         size_bytes = cache_file.stat().st_size
+         size_mb = size_bytes / (1024 * 1024)
+
+         # Count entries
+         with open(cache_file) as f:
+             cache_data = json.load(f)
+         num_entries = len(cache_data)
+
+         # Estimate savings (num_entries * avg cost per page)
+         avg_input_tokens = ESTIMATED_TOKENS_PER_PAGE_INPUT
+         avg_output_tokens = ESTIMATED_TOKENS_PER_PAGE_OUTPUT
+         savings_per_page = calculate_actual_cost(
+             avg_input_tokens, avg_output_tokens, model
+         )
+         total_savings = num_entries * savings_per_page
+
+         return {
+             "num_entries": num_entries,
+             "size_mb": size_mb,
+             "estimated_savings": format_cost(total_savings),
+         }
+     except Exception as e:
+         logger.warning(f"Could not read cache stats from {cache_file}: {str(e)}")
+         return {
+             "num_entries": 0,
+             "size_mb": 0.0,
+             "estimated_savings": "$0.00",
+         }
+
+
+ def extract_url_from_summary(summary: str) -> str | None:
+     """
+     Extract URL from a summary in the format: Title: [title](URL).
+
+     Args:
+         summary: Formatted summary string
+
+     Returns:
+         Extracted URL or None if not found
+     """
+     # Match markdown link format: [text](url)
+     match = re.search(r"\[([^\]]+)\]\(([^)]+)\)", summary)
+     if match:
+         return match.group(2)
+     return None
+
+
+ def ensure_directory_exists(file_path: str) -> None:
+     """Ensure the parent directory of the given file path exists.
+
+     Args:
+         file_path: Path to the file whose parent directory should be created
+
+     Raises:
+         RuntimeError: If directory creation fails due to permissions or other issues
+     """
+     dir_path = Path(file_path).parent
+     if dir_path == Path("."):
+         return  # Current directory, no need to create
+
+     try:
+         dir_path.mkdir(parents=True, exist_ok=True)
+         if not dir_path.exists():
+             print(f"Created directory: {dir_path}")
+     except OSError as e:
+         raise RuntimeError(f"Failed to create directory {dir_path}: {e}") from e
+
+
+ async def generate_llms_txt(
+     url: str,
+     llm_name: str = "o4-mini",
+     max_concurrent_summaries: int = 10,
+     output_path: str = "llms.txt",
+     show_urls: bool = False,
+     max_urls: int | None = None,
+     max_depth: int = 3,
+     cache_dir: str = ".llmsbrieftxt_cache",
+     use_cache_only: bool = False,
+     force_refresh: bool = False,
+     skip_confirmation: bool = False,
+ ) -> dict[str, int | list[str]] | None:
+     """
+     Generate llms-brief.txt file from a documentation website.
+
+     Args:
+         url: URL of the documentation site to crawl
+         llm_name: OpenAI model to use for summarization
+         max_concurrent_summaries: Maximum concurrent LLM requests
+         output_path: Path to write the output file
+         show_urls: If True, only show discovered URLs without processing
+         max_urls: Maximum number of URLs to discover/process
+         max_depth: Maximum crawl depth for URL discovery
+         cache_dir: Directory to store cached summaries
+         use_cache_only: If True, only use cached summaries (no API calls)
+         force_refresh: If True, ignore cache and regenerate all summaries
+         skip_confirmation: If True, skip confirmation prompt for high costs
+
+     Returns:
+         Dictionary with metadata (for show_urls mode) or None
+     """
+     urls_processed = 0
+     summaries_generated = 0
+     failed_urls: set[str] = set()  # Use set to avoid duplicates
+
+     # Set up cache directory
+     cache_path = Path(cache_dir)
+     cache_path.mkdir(parents=True, exist_ok=True)
+     cache_file = cache_path / "summaries.json"
+
+     # Load existing summaries from cache if available (unless force refresh)
+     existing_summaries: dict[str, str] = {}
+     if not force_refresh and cache_file.exists():
+         try:
+             with open(cache_file) as f:
+                 existing_summaries = json.load(f)
+             # Show cache stats
+             cache_stats = get_cache_stats(cache_file, llm_name)
+             print(
+                 f"\nCache: {cache_stats['num_entries']} entries ({cache_stats['size_mb']:.1f}MB on disk)"
+             )
+             print(
+                 f"Approximate value from cache: ~{cache_stats['estimated_savings']} in saved API calls"
+             )
+         except Exception as e:
+             print(f"Warning: Could not load cache: {str(e)}")
+     elif force_refresh and cache_file.exists():
+         print("\nForce refresh enabled - ignoring existing cache")
+
+     extractor = default_extractor
+     output_file = output_path
+
+     # If show_urls is True, just show discovered URLs and exit
+     if show_urls:
+         print("Discovering documentation URLs...")
+         doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
+         _, discovered_urls = await doc_loader.load_docs(
+             url, extractor=extractor, show_urls=True
+         )
+         print("\nDiscovered URLs:")
+         for discovered_url in discovered_urls:
+             print(f" - {discovered_url}")
+         print(f"\nTotal: {len(discovered_urls)} unique URLs")
+
+         # Calculate how many would be cached vs new
+         num_cached = sum(1 for u in discovered_urls if u in existing_summaries)
+         num_new = len(discovered_urls) - num_cached
+         if existing_summaries:
+             print(f"Cached: {num_cached} | New: {num_new}")
+
+         return {"num_urls": len(discovered_urls), "failed_urls": []}
+
+     # Load and process documents
+     doc_loader = DocLoader(max_urls=max_urls, max_depth=max_depth)
+     docs, discovered_urls = await doc_loader.load_docs(url, extractor=extractor)
+     urls_processed = len(docs)
+
+     # Track which URLs failed to load
+     loaded_urls = {doc.metadata.get("source") for doc in docs}
+     failed_urls.update(u for u in discovered_urls if u not in loaded_urls)
+
+     # Show cost estimate and get confirmation (unless using cache-only or skip_confirmation)
+     if not use_cache_only and not skip_confirmation:
+         num_cached = sum(1 for u in discovered_urls if u in existing_summaries)
+         num_new = len(discovered_urls) - num_cached
+         estimated_cost_new = calculate_actual_cost(
+             num_new * ESTIMATED_TOKENS_PER_PAGE_INPUT,
+             num_new * ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+             llm_name,
+         )
+
+         print(f"\nThis run: {num_new} new pages, {num_cached} cached")
+         if num_cached > 0:
+             saved_cost = calculate_actual_cost(
+                 num_cached * ESTIMATED_TOKENS_PER_PAGE_INPUT,
+                 num_cached * ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+                 llm_name,
+             )
+             print(
+                 f"Estimated cost: {format_cost(estimated_cost_new)} (saving {format_cost(saved_cost)} via cache)"
+             )
+         else:
+             print(f"Estimated cost: {format_cost(estimated_cost_new)}")
+
+         # Prompt for confirmation if cost is significant (> $1.00)
+         if estimated_cost_new > 1.00:
+             print(
+                 f"\nWARNING: This will cost approximately {format_cost(estimated_cost_new)}"
+             )
+             response = input("Continue? [y/N]: ").strip().lower()
+             if response not in ["y", "yes"]:
+                 print("Cancelled by user")
+                 return None
+
+     # Handle cache-only mode
+     usage_stats: dict[str, int] = {"input_tokens": 0, "output_tokens": 0}
+     if use_cache_only:
+         print("\nCache-only mode: Using only cached summaries")
+         summaries: list[str] = []
+         for doc in docs:
+             doc_url = doc.metadata.get("source", "")
+             if doc_url in existing_summaries:
+                 summaries.append(existing_summaries[doc_url])
+             else:
+                 print(f" Warning: No cache for {doc_url}")
+                 failed_urls.add(doc_url)
+         summaries_generated = len(summaries)
+     else:
+         # Initialize summarizer
+         print(f"\nGenerating summaries with {llm_name}...")
+         summarizer = Summarizer(
+             llm_name=llm_name,
+             max_concurrent=max_concurrent_summaries,
+         )
+
+         summaries: list[str] = []
+         try:
+             summaries, usage_stats = await summarizer.summarize_all(
+                 docs, existing_summaries=existing_summaries, cache_file=cache_file
+             )
+             summaries_generated = len(summaries)
+
+             # Track URLs that failed summarization by extracting URLs from summaries
+             summarized_urls: set[str] = set()
+             for summary in summaries:
+                 if summary:
+                     extracted_url: str | None = extract_url_from_summary(summary)
+                     if extracted_url:
+                         summarized_urls.add(extracted_url)
+
+             # Add docs that weren't successfully summarized to failed_urls
+             for doc in docs:
+                 doc_url = doc.metadata.get("source", "")
+                 if doc_url and doc_url not in summarized_urls:
+                     failed_urls.add(doc_url)
+         except KeyboardInterrupt:
+             print("Process interrupted by user. Saving partial results...")
+             if cache_file.exists():
+                 try:
+                     with open(cache_file) as f:
+                         partial_summaries = json.load(f)
+                     summaries = list(partial_summaries.values())
+                     summaries_generated = len(summaries)
+                     print(f"Recovered {len(summaries)} summaries from cache")
+                 except Exception:
+                     # Silently ignore cache read errors during interrupt recovery
+                     # If we can't recover from cache, we'll continue with empty results
+                     pass
+         except Exception as e:
+             print(f"Summarization process error: {str(e)}")
+             if cache_file.exists():
+                 try:
+                     with open(cache_file) as f:
+                         partial_summaries = json.load(f)
+                     summaries = list(partial_summaries.values())
+                     summaries_generated = len(summaries)
+                     print(
+                         f"Recovered {len(summaries)} partial summaries from cache"
+                     )
+                 except Exception:
+                     # If cache recovery fails during error handling, continue with empty results
+                     summaries = []
+         finally:
+             # Write results to file
+             if summaries:
+                 ensure_directory_exists(output_file)
+                 output_content = "".join(summaries)
+                 Path(output_file).write_text(output_content, encoding="utf-8")
+             else:
+                 ensure_directory_exists(output_file)
+                 Path(output_file).write_text("", encoding="utf-8")
+
+     # Print summary
+     print(f"\n{'=' * 50}")
+     print(f"Processed: {summaries_generated}/{urls_processed} pages")
+     if urls_processed > 0:
+         success_rate = summaries_generated / urls_processed * 100
+         print(f"Success rate: {success_rate:.1f}%")
+
+     # Show actual API cost if tokens were used
+     if usage_stats["input_tokens"] > 0 or usage_stats["output_tokens"] > 0:
+         actual_cost = calculate_actual_cost(
+             usage_stats["input_tokens"], usage_stats["output_tokens"], llm_name
+         )
+         num_cached = len(existing_summaries)
+         if num_cached > 0:
+             # Calculate how much we saved via cache
+             saved_cost = calculate_actual_cost(
+                 num_cached * ESTIMATED_TOKENS_PER_PAGE_INPUT,
+                 num_cached * ESTIMATED_TOKENS_PER_PAGE_OUTPUT,
+                 llm_name,
+             )
+             print(
+                 f"Actual cost: {format_cost(actual_cost)} (saved {format_cost(saved_cost)} via cache)"
+             )
+         else:
+             print(f"Actual cost: {format_cost(actual_cost)}")
+
+     print(f"Output: {output_file}")
+
+     # Report failed URLs
+     if failed_urls:
+         print(f"Failed URLs: {len(failed_urls)}")
+         failed_file = Path(output_file).parent / "failed_urls.txt"
+         # Sort URLs for consistent output
+         failed_file.write_text("\n".join(sorted(failed_urls)), encoding="utf-8")
+         print(f"Failed URLs written to: {failed_file}")
+     print(f"{'=' * 50}")
+
+     return None
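A minimal sketch of driving the pipeline end to end (not part of the package; the URL is a placeholder, and it assumes OpenAI credentials are configured for the summarizer):

import asyncio
from llmsbrieftxt.main import generate_llms_txt

# Dry run: list the pages that would be summarized, without calling the API.
asyncio.run(generate_llms_txt("https://docs.example.com", show_urls=True))

# Full run: crawl up to 100 pages and write llms.txt, skipping the cost prompt.
asyncio.run(
    generate_llms_txt(
        "https://docs.example.com",
        llm_name="o4-mini",
        max_urls=100,
        output_path="llms.txt",
        skip_confirmation=True,
    )
)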
llmsbrieftxt/schema.py ADDED
@@ -0,0 +1,42 @@
+ from typing import Any
+
+ from pydantic import BaseModel, Field
+
+
+ class Document(BaseModel):
+     """Simple document class to replace langchain.schema.Document."""
+
+     page_content: str
+     metadata: dict[str, Any] = Field(default_factory=dict)
+
+
+ class PageSummary(BaseModel):
+     content_analysis: str = Field(
+         description="Comprehensive analysis of the page content (2-3 sentences, ~50-80 words). "
+         "Include main topics, key concepts, important features, and unique value propositions. "
+         "Be specific about what makes this content valuable and distinctive."
+     )
+     primary_use_cases: str = Field(
+         description="3-5 specific, actionable scenarios when an LLM should reference this page (2-3 sentences total, ~40-60 words). "
+         'Format as concrete use cases like: "When implementing X feature", "To understand Y concept", '
+         '"For troubleshooting Z issue". Focus on practical applications.'
+     )
+     key_takeaways: str = Field(
+         description="2-3 most valuable insights, capabilities, or pieces of information (2-3 sentences, ~40-60 words). "
+         "Highlight unique knowledge, practical tips, or critical information that makes this page worth consulting. "
+         "Format as distinct points separated by semicolons."
+     )
+     related_topics: str = Field(
+         description="Relevant domains, technologies, and concepts this page relates to (1-2 sentences, ~20-30 words). "
+         'List connected topics that provide context, like: "API design, REST principles, microservices architecture". '
+         "Help establish the knowledge domain."
+     )
+     keywords: str = Field(
+         description="5-10 specific, searchable terms for discovery and indexing (comma-separated list, ~15-25 words). "
+         "Include technical terms, product names, methodologies, and key concepts. "
+         'Example: "GraphQL, API Gateway, schema stitching, federation, Apollo Server, type safety".'
+     )
+     concise_summary: str = Field(
+         description="Single comprehensive sentence capturing the essence of the page (15-25 words). "
+         "Summarize what the page offers and its primary value in one clear, informative statement."
+     )