llmsbrieftxt-1.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,300 @@
+import asyncio
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Any
+
+import openai
+from openai import AsyncOpenAI
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+)
+from tqdm import tqdm
+
+from .constants import (
+    DEFAULT_CONCURRENT_SUMMARIES,
+    DEFAULT_OPENAI_MODEL,
+    DEFAULT_SUMMARY_PROMPT,
+)
+from .schema import Document, PageSummary
+
+logger = logging.getLogger(__name__)
+
+
+# Fallback summary used when LLM summarization fails
+FALLBACK_SUMMARY = PageSummary(
+    content_analysis="This page contains web content relevant to the topic.",
+    primary_use_cases="When accessing general web content",
+    key_takeaways="Contains general information",
+    related_topics="Web content",
+    keywords="web, content, information",
+    concise_summary="This page contains web content relevant to the topic.",
+)
+
+
+class Summarizer:
+    def __init__(
+        self,
+        llm_name: str = DEFAULT_OPENAI_MODEL,
+        summary_prompt: str | None = None,
+        max_concurrent: int = DEFAULT_CONCURRENT_SUMMARIES,
+    ) -> None:
+        self.llm_name = llm_name
+        self.max_concurrent = max_concurrent
+        self.summary_prompt = summary_prompt or DEFAULT_SUMMARY_PROMPT
+        self.client = self._init_client()
+        self.semaphore = asyncio.Semaphore(max_concurrent)
+        # Cache JSON schema to avoid regenerating on every request
+        self._schema_cache = PageSummary.model_json_schema()
+        self._schema_cache["additionalProperties"] = False
+        # Track token usage for cost reporting (protected by lock for thread safety)
+        self.total_input_tokens = 0
+        self.total_output_tokens = 0
+        self._usage_lock = asyncio.Lock()
+
+    def _init_client(self) -> AsyncOpenAI:
+        api_key = os.getenv("OPENAI_API_KEY")
+        if not api_key:
+            raise ValueError(
+                "OPENAI_API_KEY environment variable is required. Please set your OpenAI API key in your environment variables."
+            )
+        return AsyncOpenAI(api_key=api_key)
+
+    @retry(
+        stop=stop_after_attempt(3),
+        wait=wait_exponential(multiplier=2, min=4, max=60),
+        retry=retry_if_exception_type(
+            (
+                openai.RateLimitError,
+                openai.APITimeoutError,
+                openai.APIConnectionError,
+            )
+        ),
+        reraise=True,
+    )
+    async def _summarize_with_retry(
+        self,
+        doc: Any,
+        loop: Any,
+        messages: list[dict[str, str]],
+        schema: dict[str, Any],
+    ) -> Any:
+        """Make LLM API call with retry logic for transient failures."""
+        return await self.client.chat.completions.create(  # type: ignore[call-overload]
+            model=self.llm_name,
+            messages=messages,  # type: ignore[arg-type]
+            response_format={  # type: ignore[typeddict-item]
+                "type": "json_schema",
+                "json_schema": {
+                    "name": "page_summary",
+                    "schema": schema,
+                    "strict": True,
+                },
+            },
+        )
+
+    async def _summarize(self, doc: Any, loop: Any) -> PageSummary:
+        """Summarize document using OpenAI API."""
+        url = doc.metadata.get("source", "unknown")
+        try:
+            # Truncate content if it's too long (keep first 10000 chars for now)
+            content = doc.page_content[:10000]
+
+            # Build messages with combined system prompt
+            messages = [
+                {"role": "system", "content": self.summary_prompt},
+                {
+                    "role": "user",
+                    "content": f"Analyze and summarize the following webpage content:\n\n{content}",
+                },
+            ]
+
+            # Use cached schema
+            schema = self._schema_cache
+
+            response = await self._summarize_with_retry(doc, loop, messages, schema)
+
+            # Track token usage (protected by lock to prevent race conditions)
+            if response and hasattr(response, "usage") and response.usage:
+                async with self._usage_lock:
+                    self.total_input_tokens += response.usage.prompt_tokens
+                    self.total_output_tokens += response.usage.completion_tokens
+
+            # Validate response
+            if not response:
+                raise ValueError("No response object from API")
+
+            if not response.choices:
+                raise ValueError(f"No choices in response: {response}")
+
+            if not response.choices[0].message:
+                raise ValueError(
+                    f"No message in response choice: {response.choices[0]}"
+                )
+
+            if not response.choices[0].message.content:
+                # Check if there's a finish reason that explains why
+                finish_reason = (
+                    response.choices[0].finish_reason
+                    if hasattr(response.choices[0], "finish_reason")
+                    else "unknown"
+                )
+                raise ValueError(
+                    f"Empty content in response. Finish reason: {finish_reason}"
+                )
+
+            content = response.choices[0].message.content.strip()
+            if not content:
+                raise ValueError("Empty response content after stripping")
+
+            # Parse JSON response
+            try:
+                parsed_response = PageSummary(**json.loads(content))
+            except json.JSONDecodeError as je:
+                raise ValueError(
+                    f"Invalid JSON response: {je}. Content: {content[:200]}..."
+                ) from je
+
+            # Return structured response for formatting
+            return parsed_response
+
+        except Exception as e:
+            # Log with full traceback for debugging
+            logger.exception(
+                f"Failed to summarize {url}: {str(e)}",
+                exc_info=e,
+                extra={
+                    "url": url,
+                    "model": self.llm_name,
+                },
+            )
+            # Return cached fallback PageSummary object
+            return FALLBACK_SUMMARY
+
+    async def summarize_document(
+        self, doc: Any, cache_file: Path | None = None
+    ) -> str | None:
+        async with self.semaphore:
+            url = doc.metadata.get("source", "")
+            try:
+                loop = asyncio.get_event_loop()
+                page_summary = await self._summarize(doc, loop)
+
+                # Format the summary with new structure
+                title = doc.metadata.get("title", url.split("/")[-1])
+                formatted_summary = f"Title: [{title}]({url})\nKeywords: {page_summary.keywords}\nSummary: {page_summary.concise_summary}\n\n"
+
+                # Update cache if provided
+                if cache_file:
+                    self._update_cache(cache_file, url, formatted_summary)
+
+                return formatted_summary
+            except Exception as e:
+                logger.exception(
+                    f"Error summarizing {url}: {str(e)}",
+                    exc_info=e,
+                    extra={"url": url},
+                )
+                return None
+
+    def _update_cache(self, cache_file: Path, url: str, summary: str) -> None:
+        """Update the cache file with a new summary (simple version for single-user CLI)."""
+        try:
+            # Read existing cache
+            cache_data = {}
+            if cache_file.exists():
+                cache_data = json.loads(cache_file.read_text())
+
+            # Update and write
+            cache_data[url] = summary
+            cache_file.write_text(json.dumps(cache_data, indent=2))
+        except Exception as e:
+            logger.exception(
+                f"Could not update cache: {str(e)}",
+                exc_info=e,
+                extra={"cache_file": str(cache_file), "url": url},
+            )
+
+    async def summarize_all(
+        self,
+        docs: list[Document],
+        existing_summaries: dict[str, str] | None = None,
+        cache_file: Path | None = None,
+    ) -> tuple[list[str], dict[str, int]]:
+        # Reset token counters at start of each run
+        self.total_input_tokens = 0
+        self.total_output_tokens = 0
+
+        existing_summaries = existing_summaries or {}
+        summaries: list[str] = []
+        docs_to_process: list[Document] = []
+
+        # Separate cached from new documents
+        for doc in docs:
+            url = doc.metadata.get("source", "")
+            if url in existing_summaries:
+                summaries.append(existing_summaries[url])
+            else:
+                docs_to_process.append(doc)
+
+        if len(existing_summaries) > 0:
+            print(f"Using {len(existing_summaries)} cached summaries")
+
+        if not docs_to_process:
+            usage_stats = {
+                "input_tokens": self.total_input_tokens,
+                "output_tokens": self.total_output_tokens,
+            }
+            return summaries, usage_stats
+
+        # Process new documents with progress bar
+        print(
+            f"Summarizing {len(docs_to_process)} documents (max {self.max_concurrent} concurrent)..."
+        )
+
+        tasks = [self.summarize_document(doc, cache_file) for doc in docs_to_process]
+
+        # Use tqdm to track completion
+        failed_count = 0
+        with tqdm(
+            total=len(docs_to_process), desc="Generating summaries", unit="doc"
+        ) as pbar:
+            results: list[str | None | Exception] = []
+            for coro in asyncio.as_completed(tasks):
+                result = await coro
+                results.append(result)
+                if result is None or isinstance(result, Exception):
+                    failed_count += 1
+                    pbar.set_postfix({"failed": failed_count})  # type: ignore[reportUnknownMemberType]
+                pbar.update(1)
+
+        # Collect successful summaries
+        for result in results:
+            if isinstance(result, str):
+                summaries.append(result)
+
+        # Log any failures with full context
+        for result, doc in zip(results, docs_to_process, strict=False):
+            if isinstance(result, Exception):
+                url = doc.metadata.get("source", "unknown")
+                logger.exception(
+                    f"Failed to summarize document {url}: {str(result)}",
+                    exc_info=result,
+                    extra={"url": url},
+                )
+
+        success_count = len(results) - failed_count
+        print(
+            f"Summarization complete: {success_count} successful, {failed_count} failed"
+        )
+
+        # Return summaries and usage statistics
+        usage_stats = {
+            "input_tokens": self.total_input_tokens,
+            "output_tokens": self.total_output_tokens,
+        }
+        return summaries, usage_stats
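
For orientation, here is a minimal usage sketch of the Summarizer class added above. The import paths (llmsbrieftxt.summarizer, llmsbrieftxt.schema) and the Document constructor arguments are assumptions inferred from the relative imports and attribute access in the diff; the sketch also assumes OPENAI_API_KEY is set in the environment.

import asyncio

# Assumed module paths; the diff only shows relative imports (.schema, .constants).
from llmsbrieftxt.schema import Document
from llmsbrieftxt.summarizer import Summarizer


async def main() -> None:
    # Document fields inferred from usage above (.page_content, .metadata["source"/"title"]).
    docs = [
        Document(
            page_content="Example documentation page text...",
            metadata={"source": "https://example.com/docs/intro", "title": "Intro"},
        )
    ]
    summarizer = Summarizer(max_concurrent=5)  # requires OPENAI_API_KEY to be set
    summaries, usage = await summarizer.summarize_all(docs)
    print(summaries)
    print(f"Tokens used: {usage['input_tokens']} in / {usage['output_tokens']} out")


asyncio.run(main())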
@@ -0,0 +1,75 @@
+"""Simple URL filtering for documentation crawling."""
+
+import logging
+import re
+from urllib.parse import urlparse
+
+logger = logging.getLogger(__name__)
+
+
+class URLFilter:
+    """Simple URL filtering based on file extensions."""
+
+    # File extensions to skip (assets, downloads, media)
+    FILE_EXTENSION_PATTERNS: list[str] = [
+        r"\.(pdf|zip|tar|gz|exe|dmg|iso)$",  # Downloads
+        r"\.(css|js|map)$",  # Web assets
+        r"\.(woff2?|ttf|eot)$",  # Fonts
+        r"\.(png|jpe?g|gif|svg|webp|ico|bmp)$",  # Images
+        r"\.(mp4|webm|avi|mov|mp3|wav|ogg)$",  # Media
+    ]
+
+    def __init__(self) -> None:
+        """Initialize URL filter with compiled patterns."""
+        self.file_extension_patterns = [
+            re.compile(p, re.IGNORECASE) for p in self.FILE_EXTENSION_PATTERNS
+        ]
+        logger.debug(
+            f"URLFilter initialized with {len(self.file_extension_patterns)} patterns"
+        )
+
+    def should_include(self, url: str) -> bool:
+        """
+        Determine if URL should be included in crawl.
+
+        Logic:
+        - Skip URLs with file extensions (downloads, assets, media)
+        - Include everything else (documentation pages)
+
+        Args:
+            url: URL to check
+
+        Returns:
+            True if URL should be crawled, False otherwise
+        """
+        parsed = urlparse(url)
+        path = parsed.path.lower()
+
+        # Check file extensions
+        for pattern in self.file_extension_patterns:
+            if pattern.search(path):
+                logger.debug(f"URL skipped by file extension: {url}")
+                return False
+
+        # Include by default
+        return True
+
+    def filter_urls(self, urls: list[str]) -> list[str]:
+        """
+        Filter a list of URLs.
+
+        Args:
+            urls: List of URLs to filter
+
+        Returns:
+            Filtered list of URLs
+        """
+        filtered = [url for url in urls if self.should_include(url)]
+        skipped_count = len(urls) - len(filtered)
+
+        if skipped_count > 0:
+            logger.info(
+                f"Filtered {skipped_count} URLs ({len(filtered)}/{len(urls)} remaining)"
+            )
+
+        return filtered
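
A quick sketch of how the filter above would be used (the llmsbrieftxt.url_filter module path is an assumption; only the class itself appears in the diff):

from llmsbrieftxt.url_filter import URLFilter  # assumed module path

url_filter = URLFilter()
urls = [
    "https://example.com/docs/getting-started/",
    "https://example.com/assets/logo.png",    # skipped: image extension
    "https://example.com/downloads/cli.zip",  # skipped: download extension
]
print(url_filter.filter_urls(urls))
# ['https://example.com/docs/getting-started/']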
@@ -0,0 +1,73 @@
+"""Simple URL deduplication utilities."""
+
+from urllib.parse import urlparse, urlunparse
+
+
+class URLCanonicalizer:
+    """Simple URL canonicalization for documentation sites."""
+
+    def __init__(self, keep_fragments: bool = False):
+        """
+        Initialize URL canonicalizer.
+
+        Args:
+            keep_fragments: Keep URL fragments (for sites using hash routing).
+                Default is False to treat #section1 and #section2 as the same URL.
+        """
+        self.keep_fragments = keep_fragments
+
+    def canonicalize(self, url: str) -> str:
+        """
+        Normalize URL for deduplication.
+
+        Simple normalization:
+        - Lowercase scheme and domain
+        - Remove fragments (unless keep_fragments=True)
+        - Normalize trailing slashes for directory paths
+
+        Args:
+            url: URL to canonicalize
+
+        Returns:
+            Canonicalized URL string
+        """
+        parsed = urlparse(url)
+
+        # Normalize scheme and domain to lowercase
+        scheme = parsed.scheme.lower()
+        netloc = parsed.netloc.lower()
+        path = parsed.path
+
+        # Normalize trailing slashes: add to directory-like paths
+        if path and not path.endswith("/"):
+            # If no file extension, treat as directory
+            last_segment = path.split("/")[-1]
+            if "." not in last_segment:
+                path = path + "/"
+
+        # Remove fragment unless keeping them
+        fragment = parsed.fragment if self.keep_fragments else ""
+
+        # Reconstruct URL
+        return urlunparse((scheme, netloc, path, parsed.params, parsed.query, fragment))
+
+    def deduplicate(self, urls: list[str]) -> list[str]:
+        """
+        Remove duplicate URLs from list while preserving order.
+
+        Args:
+            urls: List of URLs (may contain duplicates)
+
+        Returns:
+            List of unique URLs (first occurrence preserved)
+        """
+        seen: set[str] = set()
+        unique: list[str] = []
+
+        for url in urls:
+            canonical = self.canonicalize(url)
+            if canonical not in seen:
+                seen.add(canonical)
+                unique.append(url)  # Keep original URL
+
+        return unique
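
And a sketch of the canonicalizer above (module path again an assumption), showing how scheme/host lowercasing, fragment removal, and trailing-slash normalization collapse near-duplicate URLs:

from llmsbrieftxt.url_dedup import URLCanonicalizer  # assumed module path

canon = URLCanonicalizer()
print(canon.canonicalize("HTTPS://Example.com/Docs/intro#install"))
# 'https://example.com/Docs/intro/'  (scheme/host lowercased, fragment dropped,
# trailing slash added because the last path segment has no file extension)

urls = [
    "https://example.com/docs/intro",
    "https://example.com/docs/intro/#usage",
    "https://example.com/docs/intro/",
]
print(canon.deduplicate(urls))
# ['https://example.com/docs/intro']  (first occurrence kept, original form preserved)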