llmsbrieftxt 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of llmsbrieftxt might be problematic.
- llmsbrieftxt/__init__.py +1 -0
- llmsbrieftxt/cli.py +276 -0
- llmsbrieftxt/constants.py +62 -0
- llmsbrieftxt/crawler.py +358 -0
- llmsbrieftxt/doc_loader.py +150 -0
- llmsbrieftxt/extractor.py +69 -0
- llmsbrieftxt/main.py +379 -0
- llmsbrieftxt/schema.py +42 -0
- llmsbrieftxt/summarizer.py +303 -0
- llmsbrieftxt/url_filters.py +75 -0
- llmsbrieftxt/url_utils.py +73 -0
- llmsbrieftxt-1.6.0.dist-info/METADATA +420 -0
- llmsbrieftxt-1.6.0.dist-info/RECORD +16 -0
- llmsbrieftxt-1.6.0.dist-info/WHEEL +4 -0
- llmsbrieftxt-1.6.0.dist-info/entry_points.txt +2 -0
- llmsbrieftxt-1.6.0.dist-info/licenses/LICENSE +21 -0
llmsbrieftxt/summarizer.py
@@ -0,0 +1,303 @@
import asyncio
import json
import logging
import os
from pathlib import Path
from typing import Any

import openai
from openai import AsyncOpenAI
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
from tqdm import tqdm

from .constants import (
    DEFAULT_CONCURRENT_SUMMARIES,
    DEFAULT_OPENAI_MODEL,
    DEFAULT_SUMMARY_PROMPT,
)
from .schema import Document, PageSummary

logger = logging.getLogger(__name__)


# Fallback summary used when LLM summarization fails
FALLBACK_SUMMARY = PageSummary(
    content_analysis="This page contains web content relevant to the topic.",
    primary_use_cases="When accessing general web content",
    key_takeaways="Contains general information",
    related_topics="Web content",
    keywords="web, content, information",
    concise_summary="This page contains web content relevant to the topic.",
)


class Summarizer:
    def __init__(
        self,
        llm_name: str = DEFAULT_OPENAI_MODEL,
        summary_prompt: str | None = None,
        max_concurrent: int = DEFAULT_CONCURRENT_SUMMARIES,
    ) -> None:
        self.llm_name = llm_name
        self.max_concurrent = max_concurrent
        self.summary_prompt = summary_prompt or DEFAULT_SUMMARY_PROMPT
        self.client = self._init_client()
        self.semaphore = asyncio.Semaphore(max_concurrent)
        # Cache JSON schema to avoid regenerating on every request
        self._schema_cache = PageSummary.model_json_schema()
        self._schema_cache["additionalProperties"] = False
        # Track token usage for cost reporting (protected by lock for thread safety)
        self.total_input_tokens = 0
        self.total_output_tokens = 0
        self._usage_lock = asyncio.Lock()

    def _init_client(self) -> AsyncOpenAI:
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError(
                "OPENAI_API_KEY environment variable is required. Please set your OpenAI API key in your environment variables."
            )
        base_url = os.getenv("OPENAI_BASE_URL")
        if base_url:
            return AsyncOpenAI(api_key=api_key, base_url=base_url)
        return AsyncOpenAI(api_key=api_key)

    @retry(
        stop=stop_after_attempt(3),
        wait=wait_exponential(multiplier=2, min=4, max=60),
        retry=retry_if_exception_type(
            (
                openai.RateLimitError,
                openai.APITimeoutError,
                openai.APIConnectionError,
            )
        ),
        reraise=True,
    )
    async def _summarize_with_retry(
        self,
        doc: Any,
        loop: Any,
        messages: list[dict[str, str]],
        schema: dict[str, Any],
    ) -> Any:
        """Make LLM API call with retry logic for transient failures."""
        return await self.client.chat.completions.create(  # type: ignore[call-overload]
            model=self.llm_name,
            messages=messages,  # type: ignore[arg-type]
            response_format={  # type: ignore[typeddict-item]
                "type": "json_schema",
                "json_schema": {
                    "name": "page_summary",
                    "schema": schema,
                    "strict": True,
                },
            },
        )

    async def _summarize(self, doc: Any, loop: Any) -> PageSummary:
        """Summarize document using OpenAI API."""
        url = doc.metadata.get("source", "unknown")
        try:
            # Truncate content if it's too long (keep first 10000 chars for now)
            content = doc.page_content[:10000]

            # Build messages with combined system prompt
            messages = [
                {"role": "system", "content": self.summary_prompt},
                {
                    "role": "user",
                    "content": f"Analyze and summarize the following webpage content:\n\n{content}",
                },
            ]

            # Use cached schema
            schema = self._schema_cache

            response = await self._summarize_with_retry(doc, loop, messages, schema)

            # Track token usage (protected by lock to prevent race conditions)
            if response and hasattr(response, "usage") and response.usage:
                async with self._usage_lock:
                    self.total_input_tokens += response.usage.prompt_tokens
                    self.total_output_tokens += response.usage.completion_tokens

            # Validate response
            if not response:
                raise ValueError("No response object from API")

            if not response.choices:
                raise ValueError(f"No choices in response: {response}")

            if not response.choices[0].message:
                raise ValueError(
                    f"No message in response choice: {response.choices[0]}"
                )

            if not response.choices[0].message.content:
                # Check if there's a finish reason that explains why
                finish_reason = (
                    response.choices[0].finish_reason
                    if hasattr(response.choices[0], "finish_reason")
                    else "unknown"
                )
                raise ValueError(
                    f"Empty content in response. Finish reason: {finish_reason}"
                )

            content = response.choices[0].message.content.strip()
            if not content:
                raise ValueError("Empty response content after stripping")

            # Parse JSON response
            try:
                parsed_response = PageSummary(**json.loads(content))
            except json.JSONDecodeError as je:
                raise ValueError(
                    f"Invalid JSON response: {je}. Content: {content[:200]}..."
                ) from je

            # Return structured response for formatting
            return parsed_response

        except Exception as e:
            # Log with full traceback for debugging
            logger.exception(
                f"Failed to summarize {url}: {str(e)}",
                exc_info=e,
                extra={
                    "url": url,
                    "model": self.llm_name,
                },
            )
            # Return cached fallback PageSummary object
            return FALLBACK_SUMMARY

    async def summarize_document(
        self, doc: Any, cache_file: Path | None = None
    ) -> str | None:
        async with self.semaphore:
            url = doc.metadata.get("source", "")
            try:
                loop = asyncio.get_event_loop()
                page_summary = await self._summarize(doc, loop)

                # Format the summary with new structure
                title = doc.metadata.get("title", url.split("/")[-1])
                formatted_summary = f"Title: [{title}]({url})\nKeywords: {page_summary.keywords}\nSummary: {page_summary.concise_summary}\n\n"

                # Update cache if provided
                if cache_file:
                    self._update_cache(cache_file, url, formatted_summary)

                return formatted_summary
            except Exception as e:
                logger.exception(
                    f"Error summarizing {url}: {str(e)}",
                    exc_info=e,
                    extra={"url": url},
                )
                return None

    def _update_cache(self, cache_file: Path, url: str, summary: str) -> None:
        """Update the cache file with a new summary (simple version for single-user CLI)."""
        try:
            # Read existing cache
            cache_data = {}
            if cache_file.exists():
                cache_data = json.loads(cache_file.read_text())

            # Update and write
            cache_data[url] = summary
            cache_file.write_text(json.dumps(cache_data, indent=2))
        except Exception as e:
            logger.exception(
                f"Could not update cache: {str(e)}",
                exc_info=e,
                extra={"cache_file": str(cache_file), "url": url},
            )

    async def summarize_all(
        self,
        docs: list[Document],
        existing_summaries: dict[str, str] | None = None,
        cache_file: Path | None = None,
    ) -> tuple[list[str], dict[str, int]]:
        # Reset token counters at start of each run
        self.total_input_tokens = 0
        self.total_output_tokens = 0

        existing_summaries = existing_summaries or {}
        summaries: list[str] = []
        docs_to_process: list[Document] = []

        # Separate cached from new documents
        for doc in docs:
            url = doc.metadata.get("source", "")
            if url in existing_summaries:
                summaries.append(existing_summaries[url])
            else:
                docs_to_process.append(doc)

        if len(existing_summaries) > 0:
            print(f"Using {len(existing_summaries)} cached summaries")

        if not docs_to_process:
            usage_stats = {
                "input_tokens": self.total_input_tokens,
                "output_tokens": self.total_output_tokens,
            }
            return summaries, usage_stats

        # Process new documents with progress bar
        print(
            f"Summarizing {len(docs_to_process)} documents (max {self.max_concurrent} concurrent)..."
        )

        tasks = [self.summarize_document(doc, cache_file) for doc in docs_to_process]

        # Use tqdm to track completion
        failed_count = 0
        with tqdm(
            total=len(docs_to_process), desc="Generating summaries", unit="doc"
        ) as pbar:
            results: list[str | None | Exception] = []
            for coro in asyncio.as_completed(tasks):
                result = await coro
                results.append(result)
                if result is None or isinstance(result, Exception):
                    failed_count += 1
                    pbar.set_postfix({"failed": failed_count})  # type: ignore[reportUnknownMemberType]
                pbar.update(1)

        # Collect successful summaries
        for result in results:
            if isinstance(result, str):
                summaries.append(result)

        # Log any failures with full context
        for result, doc in zip(results, docs_to_process, strict=False):
            if isinstance(result, Exception):
                url = doc.metadata.get("source", "unknown")
                logger.exception(
                    f"Failed to summarize document {url}: {str(result)}",
                    exc_info=result,
                    extra={"url": url},
                )

        success_count = len(results) - failed_count
        print(
            f"Summarization complete: {success_count} successful, {failed_count} failed"
        )

        # Return summaries and usage statistics
        usage_stats = {
            "input_tokens": self.total_input_tokens,
            "output_tokens": self.total_output_tokens,
        }
        return summaries, usage_stats
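
For orientation, a minimal driver sketch for the Summarizer class above (not part of the package). It assumes OPENAI_API_KEY is set in the environment; because schema.Document's constructor is not shown in this diff, a hypothetical stand-in dataclass exposing the two attributes the summarizer actually reads (page_content and metadata) is used instead.

# Illustrative driver only, not shipped with llmsbrieftxt.
# FakeDoc is a hypothetical stand-in for llmsbrieftxt.schema.Document,
# whose constructor is not shown in this diff.
import asyncio
from dataclasses import dataclass, field
from typing import Any

from llmsbrieftxt.summarizer import Summarizer


@dataclass
class FakeDoc:
    page_content: str
    metadata: dict[str, Any] = field(default_factory=dict)


async def main() -> None:
    docs = [
        FakeDoc(
            page_content="Example page text to summarize...",
            metadata={"source": "https://example.com/docs/intro", "title": "Intro"},
        )
    ]
    summarizer = Summarizer(max_concurrent=2)  # requires OPENAI_API_KEY in the environment
    summaries, usage = await summarizer.summarize_all(docs)  # type: ignore[arg-type]
    print(summaries)
    print(f"Tokens in/out: {usage['input_tokens']}/{usage['output_tokens']}")


asyncio.run(main())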
llmsbrieftxt/url_filters.py
@@ -0,0 +1,75 @@
"""Simple URL filtering for documentation crawling."""

import logging
import re
from urllib.parse import urlparse

logger = logging.getLogger(__name__)


class URLFilter:
    """Simple URL filtering based on file extensions."""

    # File extensions to skip (assets, downloads, media)
    FILE_EXTENSION_PATTERNS: list[str] = [
        r"\.(pdf|zip|tar|gz|exe|dmg|iso)$",  # Downloads
        r"\.(css|js|map)$",  # Web assets
        r"\.(woff2?|ttf|eot)$",  # Fonts
        r"\.(png|jpe?g|gif|svg|webp|ico|bmp)$",  # Images
        r"\.(mp4|webm|avi|mov|mp3|wav|ogg)$",  # Media
    ]

    def __init__(self) -> None:
        """Initialize URL filter with compiled patterns."""
        self.file_extension_patterns = [
            re.compile(p, re.IGNORECASE) for p in self.FILE_EXTENSION_PATTERNS
        ]
        logger.debug(
            f"URLFilter initialized with {len(self.file_extension_patterns)} patterns"
        )

    def should_include(self, url: str) -> bool:
        """
        Determine if URL should be included in crawl.

        Logic:
        - Skip URLs with file extensions (downloads, assets, media)
        - Include everything else (documentation pages)

        Args:
            url: URL to check

        Returns:
            True if URL should be crawled, False otherwise
        """
        parsed = urlparse(url)
        path = parsed.path.lower()

        # Check file extensions
        for pattern in self.file_extension_patterns:
            if pattern.search(path):
                logger.debug(f"URL skipped by file extension: {url}")
                return False

        # Include by default
        return True

    def filter_urls(self, urls: list[str]) -> list[str]:
        """
        Filter a list of URLs.

        Args:
            urls: List of URLs to filter

        Returns:
            Filtered list of URLs
        """
        filtered = [url for url in urls if self.should_include(url)]
        skipped_count = len(urls) - len(filtered)

        if skipped_count > 0:
            logger.info(
                f"Filtered {skipped_count} URLs ({len(filtered)}/{len(urls)} remaining)"
            )

        return filtered
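
A short illustrative usage sketch for URLFilter (not shipped with the package); the example URLs are placeholders.

from llmsbrieftxt.url_filters import URLFilter

urls = [
    "https://example.com/docs/getting-started",  # kept: no matching extension
    "https://example.com/assets/logo.png",       # skipped: image asset
    "https://example.com/downloads/cli.zip",     # skipped: download
    "https://example.com/api/reference.html",    # kept: .html is not on the skip list
]

url_filter = URLFilter()
print(url_filter.filter_urls(urls))
# -> the getting-started and reference.html URLs; the .png and .zip URLs are dropped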
llmsbrieftxt/url_utils.py
@@ -0,0 +1,73 @@
"""Simple URL deduplication utilities."""

from urllib.parse import urlparse, urlunparse


class URLCanonicalizer:
    """Simple URL canonicalization for documentation sites."""

    def __init__(self, keep_fragments: bool = False):
        """
        Initialize URL canonicalizer.

        Args:
            keep_fragments: Keep URL fragments (for sites using hash routing).
                Default is False to treat #section1 and #section2 as same URL.
        """
        self.keep_fragments = keep_fragments

    def canonicalize(self, url: str) -> str:
        """
        Normalize URL for deduplication.

        Simple normalization:
        - Lowercase scheme and domain
        - Remove fragments (unless keep_fragments=True)
        - Normalize trailing slashes for directory paths

        Args:
            url: URL to canonicalize

        Returns:
            Canonicalized URL string
        """
        parsed = urlparse(url)

        # Normalize scheme and domain to lowercase
        scheme = parsed.scheme.lower()
        netloc = parsed.netloc.lower()
        path = parsed.path

        # Normalize trailing slashes: add to directory-like paths
        if path and not path.endswith("/"):
            # If no file extension, treat as directory
            last_segment = path.split("/")[-1]
            if "." not in last_segment:
                path = path + "/"

        # Remove fragment unless keeping them
        fragment = parsed.fragment if self.keep_fragments else ""

        # Reconstruct URL
        return urlunparse((scheme, netloc, path, parsed.params, parsed.query, fragment))

    def deduplicate(self, urls: list[str]) -> list[str]:
        """
        Remove duplicate URLs from list while preserving order.

        Args:
            urls: List of URLs (may contain duplicates)

        Returns:
            List of unique URLs (first occurrence preserved)
        """
        seen: set[str] = set()
        unique: list[str] = []

        for url in urls:
            canonical = self.canonicalize(url)
            if canonical not in seen:
                seen.add(canonical)
                unique.append(url)  # Keep original URL

        return unique