llmsbrieftxt-1.6.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of llmsbrieftxt might be problematic.

@@ -0,0 +1,303 @@
+ import asyncio
+ import json
+ import logging
+ import os
+ from pathlib import Path
+ from typing import Any
+
+ import openai
+ from openai import AsyncOpenAI
+ from tenacity import (
+     retry,
+     retry_if_exception_type,
+     stop_after_attempt,
+     wait_exponential,
+ )
+ from tqdm import tqdm
+
+ from .constants import (
+     DEFAULT_CONCURRENT_SUMMARIES,
+     DEFAULT_OPENAI_MODEL,
+     DEFAULT_SUMMARY_PROMPT,
+ )
+ from .schema import Document, PageSummary
+
+ logger = logging.getLogger(__name__)
+
+
+ # Fallback summary used when LLM summarization fails
+ FALLBACK_SUMMARY = PageSummary(
+     content_analysis="This page contains web content relevant to the topic.",
+     primary_use_cases="When accessing general web content",
+     key_takeaways="Contains general information",
+     related_topics="Web content",
+     keywords="web, content, information",
+     concise_summary="This page contains web content relevant to the topic.",
+ )
+
+
+ class Summarizer:
+     def __init__(
+         self,
+         llm_name: str = DEFAULT_OPENAI_MODEL,
+         summary_prompt: str | None = None,
+         max_concurrent: int = DEFAULT_CONCURRENT_SUMMARIES,
+     ) -> None:
+         self.llm_name = llm_name
+         self.max_concurrent = max_concurrent
+         self.summary_prompt = summary_prompt or DEFAULT_SUMMARY_PROMPT
+         self.client = self._init_client()
+         self.semaphore = asyncio.Semaphore(max_concurrent)
+         # Cache JSON schema to avoid regenerating on every request
+         self._schema_cache = PageSummary.model_json_schema()
+         self._schema_cache["additionalProperties"] = False
+         # Track token usage for cost reporting (protected by lock for thread safety)
+         self.total_input_tokens = 0
+         self.total_output_tokens = 0
+         self._usage_lock = asyncio.Lock()
+
+     def _init_client(self) -> AsyncOpenAI:
+         api_key = os.getenv("OPENAI_API_KEY")
+         if not api_key:
+             raise ValueError(
+                 "OPENAI_API_KEY environment variable is required. Please set your OpenAI API key in your environment variables."
+             )
+         base_url = os.getenv("OPENAI_BASE_URL")
+         if base_url:
+             return AsyncOpenAI(api_key=api_key, base_url=base_url)
+         return AsyncOpenAI(api_key=api_key)
+
+     @retry(
+         stop=stop_after_attempt(3),
+         wait=wait_exponential(multiplier=2, min=4, max=60),
+         retry=retry_if_exception_type(
+             (
+                 openai.RateLimitError,
+                 openai.APITimeoutError,
+                 openai.APIConnectionError,
+             )
+         ),
+         reraise=True,
+     )
+     async def _summarize_with_retry(
+         self,
+         doc: Any,
+         loop: Any,
+         messages: list[dict[str, str]],
+         schema: dict[str, Any],
+     ) -> Any:
+         """Make LLM API call with retry logic for transient failures."""
+         return await self.client.chat.completions.create(  # type: ignore[call-overload]
+             model=self.llm_name,
+             messages=messages,  # type: ignore[arg-type]
+             response_format={  # type: ignore[typeddict-item]
+                 "type": "json_schema",
+                 "json_schema": {
+                     "name": "page_summary",
+                     "schema": schema,
+                     "strict": True,
+                 },
+             },
+         )
+
+     async def _summarize(self, doc: Any, loop: Any) -> PageSummary:
+         """Summarize document using OpenAI API."""
+         url = doc.metadata.get("source", "unknown")
+         try:
+             # Truncate content if it's too long (keep first 10000 chars for now)
+             content = doc.page_content[:10000]
+
+             # Build messages with combined system prompt
+             messages = [
+                 {"role": "system", "content": self.summary_prompt},
+                 {
+                     "role": "user",
+                     "content": f"Analyze and summarize the following webpage content:\n\n{content}",
+                 },
+             ]
+
+             # Use cached schema
+             schema = self._schema_cache
+
+             response = await self._summarize_with_retry(doc, loop, messages, schema)
+
+             # Track token usage (protected by lock to prevent race conditions)
+             if response and hasattr(response, "usage") and response.usage:
+                 async with self._usage_lock:
+                     self.total_input_tokens += response.usage.prompt_tokens
+                     self.total_output_tokens += response.usage.completion_tokens
+
+             # Validate response
+             if not response:
+                 raise ValueError("No response object from API")
+
+             if not response.choices:
+                 raise ValueError(f"No choices in response: {response}")
+
+             if not response.choices[0].message:
+                 raise ValueError(
+                     f"No message in response choice: {response.choices[0]}"
+                 )
+
+             if not response.choices[0].message.content:
+                 # Check if there's a finish reason that explains why
+                 finish_reason = (
+                     response.choices[0].finish_reason
+                     if hasattr(response.choices[0], "finish_reason")
+                     else "unknown"
+                 )
+                 raise ValueError(
+                     f"Empty content in response. Finish reason: {finish_reason}"
+                 )
+
+             content = response.choices[0].message.content.strip()
+             if not content:
+                 raise ValueError("Empty response content after stripping")
+
+             # Parse JSON response
+             try:
+                 parsed_response = PageSummary(**json.loads(content))
+             except json.JSONDecodeError as je:
+                 raise ValueError(
+                     f"Invalid JSON response: {je}. Content: {content[:200]}..."
+                 ) from je
+
+             # Return structured response for formatting
+             return parsed_response
+
+         except Exception as e:
+             # Log with full traceback for debugging
+             logger.exception(
+                 f"Failed to summarize {url}: {str(e)}",
+                 exc_info=e,
+                 extra={
+                     "url": url,
+                     "model": self.llm_name,
+                 },
+             )
+             # Return cached fallback PageSummary object
+             return FALLBACK_SUMMARY
+
+     async def summarize_document(
+         self, doc: Any, cache_file: Path | None = None
+     ) -> str | None:
+         async with self.semaphore:
+             url = doc.metadata.get("source", "")
+             try:
+                 loop = asyncio.get_event_loop()
+                 page_summary = await self._summarize(doc, loop)
+
+                 # Format the summary with new structure
+                 title = doc.metadata.get("title", url.split("/")[-1])
+                 formatted_summary = f"Title: [{title}]({url})\nKeywords: {page_summary.keywords}\nSummary: {page_summary.concise_summary}\n\n"
+
+                 # Update cache if provided
+                 if cache_file:
+                     self._update_cache(cache_file, url, formatted_summary)
+
+                 return formatted_summary
+             except Exception as e:
+                 logger.exception(
+                     f"Error summarizing {url}: {str(e)}",
+                     exc_info=e,
+                     extra={"url": url},
+                 )
+                 return None
+
+     def _update_cache(self, cache_file: Path, url: str, summary: str) -> None:
+         """Update the cache file with a new summary (simple version for single-user CLI)."""
+         try:
+             # Read existing cache
+             cache_data = {}
+             if cache_file.exists():
+                 cache_data = json.loads(cache_file.read_text())
+
+             # Update and write
+             cache_data[url] = summary
+             cache_file.write_text(json.dumps(cache_data, indent=2))
+         except Exception as e:
+             logger.exception(
+                 f"Could not update cache: {str(e)}",
+                 exc_info=e,
+                 extra={"cache_file": str(cache_file), "url": url},
+             )
+
+     async def summarize_all(
+         self,
+         docs: list[Document],
+         existing_summaries: dict[str, str] | None = None,
+         cache_file: Path | None = None,
+     ) -> tuple[list[str], dict[str, int]]:
+         # Reset token counters at start of each run
+         self.total_input_tokens = 0
+         self.total_output_tokens = 0
+
+         existing_summaries = existing_summaries or {}
+         summaries: list[str] = []
+         docs_to_process: list[Document] = []
+
+         # Separate cached from new documents
+         for doc in docs:
+             url = doc.metadata.get("source", "")
+             if url in existing_summaries:
+                 summaries.append(existing_summaries[url])
+             else:
+                 docs_to_process.append(doc)
+
+         if len(existing_summaries) > 0:
+             print(f"Using {len(existing_summaries)} cached summaries")
+
+         if not docs_to_process:
+             usage_stats = {
+                 "input_tokens": self.total_input_tokens,
+                 "output_tokens": self.total_output_tokens,
+             }
+             return summaries, usage_stats
+
+         # Process new documents with progress bar
+         print(
+             f"Summarizing {len(docs_to_process)} documents (max {self.max_concurrent} concurrent)..."
+         )
+
+         tasks = [self.summarize_document(doc, cache_file) for doc in docs_to_process]
+
+         # Use tqdm to track completion
+         failed_count = 0
+         with tqdm(
+             total=len(docs_to_process), desc="Generating summaries", unit="doc"
+         ) as pbar:
+             results: list[str | None | Exception] = []
+             for coro in asyncio.as_completed(tasks):
+                 result = await coro
+                 results.append(result)
+                 if result is None or isinstance(result, Exception):
+                     failed_count += 1
+                     pbar.set_postfix({"failed": failed_count})  # type: ignore[reportUnknownMemberType]
+                 pbar.update(1)
+
+         # Collect successful summaries
+         for result in results:
+             if isinstance(result, str):
+                 summaries.append(result)
+
+         # Log any failures with full context
+         for result, doc in zip(results, docs_to_process, strict=False):
+             if isinstance(result, Exception):
+                 url = doc.metadata.get("source", "unknown")
+                 logger.exception(
+                     f"Failed to summarize document {url}: {str(result)}",
+                     exc_info=result,
+                     extra={"url": url},
+                 )
+
+         success_count = len(results) - failed_count
+         print(
+             f"Summarization complete: {success_count} successful, {failed_count} failed"
+         )
+
+         # Return summaries and usage statistics
+         usage_stats = {
+             "input_tokens": self.total_input_tokens,
+             "output_tokens": self.total_output_tokens,
+         }
+         return summaries, usage_stats
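
As a rough usage sketch (not part of the package contents above): driving Summarizer end to end might look like the snippet below. The import paths and the Document constructor are assumptions made for illustration; the diff only shows that a document exposes page_content and a metadata dict with source/title keys, and that OPENAI_API_KEY must be set in the environment.

import asyncio

from llmsbrieftxt.schema import Document       # hypothetical import path
from llmsbrieftxt.summarize import Summarizer  # hypothetical import path


async def main() -> None:
    # Document construction is assumed; the summarizer only reads
    # doc.page_content and doc.metadata["source"] / ["title"].
    docs = [
        Document(
            page_content="Example documentation page text...",
            metadata={"source": "https://example.com/docs/intro", "title": "Intro"},
        )
    ]
    summarizer = Summarizer(max_concurrent=4)  # requires OPENAI_API_KEY
    summaries, usage = await summarizer.summarize_all(docs)
    for entry in summaries:
        print(entry, end="")
    print(f"Tokens: {usage['input_tokens']} in / {usage['output_tokens']} out")


if __name__ == "__main__":
    asyncio.run(main())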
@@ -0,0 +1,75 @@
+ """Simple URL filtering for documentation crawling."""
+
+ import logging
+ import re
+ from urllib.parse import urlparse
+
+ logger = logging.getLogger(__name__)
+
+
+ class URLFilter:
+     """Simple URL filtering based on file extensions."""
+
+     # File extensions to skip (assets, downloads, media)
+     FILE_EXTENSION_PATTERNS: list[str] = [
+         r"\.(pdf|zip|tar|gz|exe|dmg|iso)$",  # Downloads
+         r"\.(css|js|map)$",  # Web assets
+         r"\.(woff2?|ttf|eot)$",  # Fonts
+         r"\.(png|jpe?g|gif|svg|webp|ico|bmp)$",  # Images
+         r"\.(mp4|webm|avi|mov|mp3|wav|ogg)$",  # Media
+     ]
+
+     def __init__(self) -> None:
+         """Initialize URL filter with compiled patterns."""
+         self.file_extension_patterns = [
+             re.compile(p, re.IGNORECASE) for p in self.FILE_EXTENSION_PATTERNS
+         ]
+         logger.debug(
+             f"URLFilter initialized with {len(self.file_extension_patterns)} patterns"
+         )
+
+     def should_include(self, url: str) -> bool:
+         """
+         Determine if URL should be included in crawl.
+
+         Logic:
+         - Skip URLs with file extensions (downloads, assets, media)
+         - Include everything else (documentation pages)
+
+         Args:
+             url: URL to check
+
+         Returns:
+             True if URL should be crawled, False otherwise
+         """
+         parsed = urlparse(url)
+         path = parsed.path.lower()
+
+         # Check file extensions
+         for pattern in self.file_extension_patterns:
+             if pattern.search(path):
+                 logger.debug(f"URL skipped by file extension: {url}")
+                 return False
+
+         # Include by default
+         return True
+
+     def filter_urls(self, urls: list[str]) -> list[str]:
+         """
+         Filter a list of URLs.
+
+         Args:
+             urls: List of URLs to filter
+
+         Returns:
+             Filtered list of URLs
+         """
+         filtered = [url for url in urls if self.should_include(url)]
+         skipped_count = len(urls) - len(filtered)
+
+         if skipped_count > 0:
+             logger.info(
+                 f"Filtered {skipped_count} URLs ({len(filtered)}/{len(urls)} remaining)"
+             )
+
+         return filtered
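
For orientation, a small illustration of how URLFilter behaves (the import path is a guess and the URLs are invented): asset and download links are skipped by extension, while extension-less documentation paths pass through.

from llmsbrieftxt.url_filter import URLFilter  # hypothetical import path

url_filter = URLFilter()
candidates = [
    "https://docs.example.com/guide/install/",    # no file extension -> kept
    "https://docs.example.com/assets/logo.png",   # image -> skipped
    "https://docs.example.com/downloads/cli.zip",  # download -> skipped
]
print(url_filter.filter_urls(candidates))
# ['https://docs.example.com/guide/install/']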
@@ -0,0 +1,73 @@
+ """Simple URL deduplication utilities."""
+
+ from urllib.parse import urlparse, urlunparse
+
+
+ class URLCanonicalizer:
+     """Simple URL canonicalization for documentation sites."""
+
+     def __init__(self, keep_fragments: bool = False):
+         """
+         Initialize URL canonicalizer.
+
+         Args:
+             keep_fragments: Keep URL fragments (for sites using hash routing).
+                 Default is False to treat #section1 and #section2 as same URL.
+         """
+         self.keep_fragments = keep_fragments
+
+     def canonicalize(self, url: str) -> str:
+         """
+         Normalize URL for deduplication.
+
+         Simple normalization:
+         - Lowercase scheme and domain
+         - Remove fragments (unless keep_fragments=True)
+         - Normalize trailing slashes for directory paths
+
+         Args:
+             url: URL to canonicalize
+
+         Returns:
+             Canonicalized URL string
+         """
+         parsed = urlparse(url)
+
+         # Normalize scheme and domain to lowercase
+         scheme = parsed.scheme.lower()
+         netloc = parsed.netloc.lower()
+         path = parsed.path
+
+         # Normalize trailing slashes: add to directory-like paths
+         if path and not path.endswith("/"):
+             # If no file extension, treat as directory
+             last_segment = path.split("/")[-1]
+             if "." not in last_segment:
+                 path = path + "/"
+
+         # Remove fragment unless keeping them
+         fragment = parsed.fragment if self.keep_fragments else ""
+
+         # Reconstruct URL
+         return urlunparse((scheme, netloc, path, parsed.params, parsed.query, fragment))
+
+     def deduplicate(self, urls: list[str]) -> list[str]:
+         """
+         Remove duplicate URLs from list while preserving order.
+
+         Args:
+             urls: List of URLs (may contain duplicates)
+
+         Returns:
+             List of unique URLs (first occurrence preserved)
+         """
+         seen: set[str] = set()
+         unique: list[str] = []
+
+         for url in urls:
+             canonical = self.canonicalize(url)
+             if canonical not in seen:
+                 seen.add(canonical)
+                 unique.append(url)  # Keep original URL
+
+         return unique
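
Finally, a short sketch of the deduplication behaviour (import path again a guess): host-case differences, missing trailing slashes on directory-like paths, and fragments all collapse to one canonical form, so only the first original URL is kept.

from llmsbrieftxt.url_dedup import URLCanonicalizer  # hypothetical import path

canon = URLCanonicalizer()
urls = [
    "https://Docs.Example.com/guide",           # canonicalizes to https://docs.example.com/guide/
    "https://docs.example.com/guide/#install",  # fragment dropped -> same canonical form
    "https://docs.example.com/guide/",          # already canonical
]
print(canon.deduplicate(urls))
# ['https://Docs.Example.com/guide']  (first occurrence preserved in its original form)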