doculift-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doculift/__init__.py +0 -0
- doculift/__main__.py +4 -0
- doculift/app.py +97 -0
- doculift/cli.py +177 -0
- doculift/scraper.py +442 -0
- doculift/static/css/style.css +481 -0
- doculift/static/doculift_logo.png +0 -0
- doculift/static/js/main.js +82 -0
- doculift/templates/index.html +91 -0
- doculift_cli-0.1.0.dist-info/METADATA +240 -0
- doculift_cli-0.1.0.dist-info/RECORD +14 -0
- doculift_cli-0.1.0.dist-info/WHEEL +5 -0
- doculift_cli-0.1.0.dist-info/entry_points.txt +2 -0
- doculift_cli-0.1.0.dist-info/top_level.txt +1 -0
doculift/scraper.py
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
from bs4 import BeautifulSoup, NavigableString
|
|
2
|
+
from urllib.parse import urljoin, urlparse
|
|
3
|
+
from collections import deque
|
|
4
|
+
import time
|
|
5
|
+
import os
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from playwright.sync_api import sync_playwright
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DocuLiftScraper:
    """Breadth-first documentation crawler that exports pages as Markdown.

    Pages are rendered with a headless Chromium browser (Playwright), parsed
    with BeautifulSoup and converted to Markdown by ``html_to_markdown``.
    Each start URL is crawled independently within a URL-prefix "scope" and
    its output is written to its own numbered files.

    Progress is exposed through plain attributes that are updated while the
    crawl runs: ``status``, ``progress`` (0-100), ``pages_scraped``,
    ``urls_extracted``, ``per_url_stats`` and ``visited_urls``.
    """

    # Elements removed from a page before its content is converted.
    _GARBAGE_TAGS = (
        "script",
        "style",
        "nav",
        "footer",
        "header",
        "aside",
        "form",
        "iframe",
        "noscript",
        "button",
    )
    _HEADING_TAGS = ("h1", "h2", "h3", "h4", "h5", "h6")
    # Path-derived filenames too generic to identify a site on their own.
    _GENERIC_FILENAMES = ("docs", "api", "en", "v1")
    _MAX_FILENAME_LENGTH = 50

    def __init__(
        self,
        start_urls,
        output_format="md",
        max_pages=500,
        max_chars_per_file=500000,
        scope_type="section",
        extract_mode="content",
    ):
        """Configure a crawl.

        Args:
            start_urls: A single URL string or an iterable of URL strings.
                Entries are stripped; blank entries are dropped.
            output_format: File extension for content files (lower-cased).
            max_pages: Page limit applied to each start URL separately.
            max_chars_per_file: Soft character limit per output file.
            scope_type: ``"domain"`` crawls the whole host; any other value
                restricts the crawl to the start URL's directory.
            extract_mode: ``"urls"`` makes ``run`` only collect URLs; any
                other value scrapes page content.
        """
        # Allow passing a single string or list
        if isinstance(start_urls, str):
            start_urls = [start_urls]
        self.start_urls = [url.strip() for url in start_urls if url.strip()]
        self.output_format = output_format.lower()
        self.max_pages = max_pages
        # 500KB approx char limit for NotebookLM
        self.max_chars_per_file = max_chars_per_file
        self.scope_type = scope_type
        self.extract_mode = extract_mode

        # Mutable crawl state.
        self.visited_urls = set()
        self.pages_scraped = 0
        self.urls_extracted = 0
        self.per_url_stats = {}  # start_url -> count of pages/urls collected
        self.status = "Initializing..."
        self.progress = 0

    def clean_text(self, text):
        """Collapse whitespace in *text* and strip both ends.

        Runs of spaces/tabs become one space and any whitespace-only gap
        between two newlines becomes exactly one blank line. Falsy input
        (``None`` or ``""``) yields ``""``.
        """
        if not text:
            return ""
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\n\s*\n", "\n\n", text)
        return text.strip()

    def html_to_markdown(self, element, base_url=""):
        """Recursively convert a BeautifulSoup node to Markdown text.

        Args:
            element: A BeautifulSoup tag, string node, or ``None``.
            base_url: When non-empty, relative ``href``/``src`` values are
                resolved against it.

        Returns:
            The Markdown rendering of *element* (``""`` for ``None`` and for
            HTML comments).
        """
        # Imported here so the module's import block stays untouched.
        from bs4 import Comment

        if element is None:
            return ""
        # Comment subclasses NavigableString; without this check markup
        # comments would be emitted as page text.
        if isinstance(element, Comment):
            return ""
        if isinstance(element, NavigableString):
            return self.clean_text(str(element))

        tag_name = element.name

        if tag_name in ("ul", "ol"):
            lines = []
            items = element.find_all("li", recursive=False)
            for position, item in enumerate(items, start=1):
                item_text = self.clean_text(
                    " ".join(
                        self.html_to_markdown(sub, base_url)
                        for sub in item.children
                    )
                )
                marker = "-" if tag_name == "ul" else f"{position}."
                lines.append(f"{marker} {item_text}\n")
            return "\n" + "".join(lines) + "\n"

        if tag_name == "pre":
            # Preformatted blocks use the raw text, so their children are
            # never converted. Try to find code language class.
            lang = ""
            code_elem = element.find("code")
            if code_elem and code_elem.get("class"):
                for cls in code_elem.get("class"):
                    if cls.startswith("language-"):
                        lang = cls.replace("language-", "")
                        break
            return f"\n\n```{lang}\n{element.get_text()}\n```\n\n"

        rendered_children = (
            self.html_to_markdown(child, base_url) for child in element.children
        )
        content = self.clean_text(" ".join(t for t in rendered_children if t))

        if tag_name in self._HEADING_TAGS:
            level = int(tag_name[1])
            return f"\n\n{'#' * level} {content}\n\n"
        if tag_name == "p":
            return f"\n\n{content}\n\n"
        if tag_name == "code":
            parent = element.parent
            if parent is not None and parent.name == "pre":
                return element.get_text()
            return f"`{content}`"
        if tag_name == "a":
            href = element.get("href", "")
            # Resolve relative URLs if base_url is present (simplified)
            if base_url and href and not href.startswith(("http", "mailto", "#")):
                href = urljoin(base_url, href)
            return f"[{content}]({href})"
        if tag_name == "img":
            src = element.get("src", "")
            if not src:
                return ""
            if base_url and not src.startswith(("http", "data:")):
                src = urljoin(base_url, src)
            alt = element.get("alt", "")
            return f"![{alt}]({src})"
        if tag_name == "blockquote":
            return f"\n> {content}\n"
        if tag_name == "tr":
            return content + "\n"
        if tag_name in ("td", "th"):
            return content + " | "

        return content

    def _rendered_soup(self, page):
        """Wait for *page* to settle and return its HTML parsed as soup."""
        # Wait for meaningful content or network idle; best effort only.
        try:
            page.wait_for_load_state("networkidle", timeout=5000)
        except Exception:  # nosec B110
            pass

        # Additional small wait for JS rendering
        time.sleep(1)

        return BeautifulSoup(page.content(), "html.parser")

    def _iter_in_scope_urls(self, soup, page_url, original_domain, scope_url):
        """Yield normalized in-scope link targets of *soup* in document order.

        Links are resolved against *page_url* and reduced to
        ``scheme://netloc/path`` (query and fragment dropped). Only URLs on
        *original_domain* that start with *scope_url* are yielded; duplicates
        are not filtered.
        """
        for anchor in soup.find_all("a", href=True):
            href = anchor.get("href").strip()
            # Skip anchor links on same page to avoid loops
            if href.startswith("#"):
                continue

            # Remove fragment for deduplication
            parsed = urlparse(urljoin(page_url, href).split("#")[0])
            clean_url = f"{parsed.scheme}://{parsed.netloc}{parsed.path}"

            # Check Domain & Scope
            if parsed.netloc == original_domain and clean_url.startswith(scope_url):
                yield clean_url

    def _set_progress(self, url_idx, pages_done, total_start_urls):
        """Store overall progress (0-100) across all start URLs.

        Only called from inside the crawl loops, where ``max_pages`` and
        *total_start_urls* are both greater than zero.
        """
        fraction = (url_idx + pages_done / self.max_pages) / total_start_urls
        self.progress = int(fraction * 100)

    def fetch_and_process(self, page, url, original_domain, scope_url):
        """Load *url* in *page* and convert its main content to Markdown.

        Args:
            page: A Playwright page used for navigation.
            url: The URL to load.
            original_domain: Netloc that discovered links must match.
            scope_url: URL prefix that discovered links must start with.

        Returns:
            ``(markdown_text, extracted_links, page_title)`` where
            *extracted_links* is a set of normalized URLs. On any processing
            error the result is ``(None, set(), "")``.
        """
        try:
            print(f"Navigating to {url}...")
            # A failed/slow navigation is not fatal: whatever the browser
            # managed to load is still read below.
            try:
                page.goto(url, timeout=45000)
            except Exception as e:
                print(f"Timeout on {url}, attempting to read anyway: {e}")

            soup = self._rendered_soup(page)

            # Extract links *before* cleaning garbage to ensure navigation works
            extracted_links = set(
                self._iter_in_scope_urls(soup, url, original_domain, scope_url)
            )

            for garbage in soup(list(self._GARBAGE_TAGS)):
                garbage.decompose()

            # Find main content area - heuristic
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", class_="content")
                or soup.find("div", id="content")
                or soup.body
                or soup
            )

            markdown_text = self.html_to_markdown(main_content, base_url=url)

            # get_text() is used because Tag.string is None for an empty
            # <title> or one with several children.
            title_tag = soup.title
            page_title = ""
            if title_tag is not None:
                page_title = title_tag.get_text(strip=True)
            page_title = page_title or "Untitled"

            return markdown_text, extracted_links, page_title
        except Exception as e:
            print(f"Error processing {url}: {e}")
            return None, set(), ""

    def generate_base_filename(self, url):
        """Derive a short filesystem-friendly base name from *url*.

        The URL path (non-word runs replaced by ``_``) is used when present,
        otherwise the host without ``www.``. The result is cut to 50
        characters and generic names such as ``docs`` are prefixed with the
        first host label.
        """
        parsed = urlparse(url)
        path = parsed.path.strip("/")
        if not path:
            host = parsed.netloc.replace("www.", "")
            # Keep dots and hyphens but replace anything else (e.g. the ":"
            # of an explicit port), which is not a valid filename character
            # on every platform.
            filename = re.sub(r"[^\w.\-]+", "_", host)
        else:
            # Create readable filename from path
            filename = re.sub(r"\W+", "_", path)

        # Ensure it's not too long
        filename = filename[: self._MAX_FILENAME_LENGTH]

        # Prepend part of domain if generic
        if filename in self._GENERIC_FILENAMES:
            domain_part = parsed.netloc.split(".")[0]
            filename = f"{domain_part}_{filename}"

        return filename

    def determine_scope(self, start_url):
        """Return the URL prefix that crawled links must start with.

        With ``scope_type == "domain"`` the scope is the host root. Otherwise
        it is the directory of *start_url*: a path ending in ``/`` is kept,
        any other path loses its last segment (``/docs/en/overview`` and
        ``/docs/agents/tool-use`` become ``/docs/en/`` and ``/docs/agents/``).
        Query string and fragment never become part of the scope, because
        discovered links are compared without them.
        """
        parsed = urlparse(start_url)
        origin = f"{parsed.scheme}://{parsed.netloc}"

        if self.scope_type == "domain":
            # Broadest scope: Entire domain
            return f"{origin}/"

        path = parsed.path
        if path.endswith("/"):
            # Already a directory
            return f"{origin}{path}"

        # Page URL (including overview/index pages): go up one level.
        parent_path = path.rsplit("/", 1)[0]
        return f"{origin}{parent_path}/"

    def _collect_urls(self, page, start_url, url_idx, total_start_urls):
        """Crawl from *start_url* and return every in-scope URL discovered.

        The returned list starts with *start_url* and is in discovery order.
        Updates ``status``, ``progress``, ``urls_extracted``,
        ``pages_scraped`` and ``visited_urls`` as a side effect.
        """
        original_domain = urlparse(start_url).netloc
        scope = self.determine_scope(start_url)
        print(f"Collecting URLs from {start_url} with scope: {scope}")

        # Per-URL visited set — isolates crawl per start URL
        local_visited = {start_url}
        queue = deque([start_url])
        collected = [start_url]
        self.urls_extracted += 1
        pages_scanned = 0

        while queue and pages_scanned < self.max_pages:
            url = queue.popleft()
            self.status = f"Scanning: {url}"

            try:
                try:
                    page.goto(url, timeout=45000)
                except Exception as e:
                    print(f"Timeout on {url}: {e}")

                soup = self._rendered_soup(page)

                for clean_url in self._iter_in_scope_urls(
                    soup, url, original_domain, scope
                ):
                    if clean_url in local_visited or clean_url in self.visited_urls:
                        continue
                    local_visited.add(clean_url)
                    queue.append(clean_url)
                    collected.append(clean_url)
                    self.urls_extracted += 1

                pages_scanned += 1
                self.pages_scraped += 1
                self._set_progress(url_idx, pages_scanned, total_start_urls)
            except Exception as e:
                print(f"Error scanning {url}: {e}")

        # Merge this URL's visited pages into the global set
        self.visited_urls.update(local_visited)
        return collected

    def run_url_extraction(self, output_dir):
        """Collect in-scope URLs for every start URL into ``*_urls.txt`` files.

        One file per start URL is written to *output_dir* (created when
        missing), one URL per line.
        """
        os.makedirs(output_dir, exist_ok=True)

        self.urls_extracted = 0

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                total_start_urls = len(self.start_urls)
                for url_idx, start_url in enumerate(self.start_urls):
                    collected = self._collect_urls(
                        page, start_url, url_idx, total_start_urls
                    )
                    url_count = len(collected)
                    self.per_url_stats[start_url] = url_count

                    # Write collected URLs for THIS start_url to its own file
                    base_filename = self.generate_base_filename(start_url)
                    filepath = os.path.join(output_dir, f"{base_filename}_urls.txt")
                    with open(filepath, "w", encoding="utf-8") as f:
                        f.writelines(f"{u}\n" for u in collected)
                    print(f"Saved {filepath} ({url_count} URLs)")
            finally:
                # Close the browser even when a crawl step raises.
                browser.close()

        self.status = "Completed"
        self.progress = 100

    def _scrape_start_url(
        self, page, start_url, url_idx, total_start_urls, output_dir
    ):
        """Crawl from *start_url* and write its pages as numbered files.

        Pages are appended to a buffer that is flushed to
        ``<base>_part_<n>.<ext>`` whenever adding the next page would exceed
        ``max_chars_per_file``.
        """
        # Per-Start-URL isolation
        original_domain = urlparse(start_url).netloc
        scope = self.determine_scope(start_url)
        print(f"Starting {start_url} with scope: {scope}")

        # Per-URL visited set — isolates crawl per start URL
        local_visited = {start_url}
        queue = deque([start_url])

        # Per-URL counters
        pages_scraped_this_url = 0
        current_file_idx = 1
        buffer_parts = []
        buffer_len = 0
        base_filename = self.generate_base_filename(start_url)

        # BFS — max_pages applies per start URL
        while queue and pages_scraped_this_url < self.max_pages:
            url = queue.popleft()
            self.status = f"Scraping: {url}"

            md_content, links, page_title = self.fetch_and_process(
                page, url, original_domain, scope
            )

            if md_content:
                header = f"\n\n---\n# Page: {page_title}\n# URL: {url}\n---\n\n"
                chunk = header + md_content

                # Rotate only when something is buffered; otherwise a single
                # oversized page would skip a file number without writing.
                if (
                    buffer_parts
                    and buffer_len + len(chunk) > self.max_chars_per_file
                ):
                    self.save_buffer(
                        output_dir,
                        base_filename,
                        current_file_idx,
                        "".join(buffer_parts),
                    )
                    current_file_idx += 1
                    buffer_parts = []
                    buffer_len = 0

                buffer_parts.append(chunk)
                buffer_len += len(chunk)

                pages_scraped_this_url += 1
                self.pages_scraped += 1
                self._set_progress(
                    url_idx, pages_scraped_this_url, total_start_urls
                )

            # Sorted so the crawl order does not depend on set iteration
            # order, which matters once max_pages cuts the crawl short.
            for link in sorted(links):
                if link not in local_visited and link not in self.visited_urls:
                    local_visited.add(link)
                    queue.append(link)

        # Merge into global visited to prevent cross-URL duplication
        self.visited_urls.update(local_visited)
        self.per_url_stats[start_url] = pages_scraped_this_url

        # Flush remaining buffer for this URL
        if buffer_parts:
            self.save_buffer(
                output_dir, base_filename, current_file_idx, "".join(buffer_parts)
            )

    def run(self, output_dir):
        """Run the crawl configured in ``__init__`` and write to *output_dir*.

        Delegates to ``run_url_extraction`` when ``extract_mode`` is
        ``"urls"``; otherwise scrapes page content for every start URL.
        """
        if self.extract_mode == "urls":
            self.run_url_extraction(output_dir)
            return

        os.makedirs(output_dir, exist_ok=True)

        # Launch Playwright once
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Loop strictly through provided Start URLs
                total_start_urls = len(self.start_urls)
                for url_idx, start_url in enumerate(self.start_urls):
                    self._scrape_start_url(
                        page, start_url, url_idx, total_start_urls, output_dir
                    )
            finally:
                # Close the browser even when a crawl step raises.
                browser.close()

        self.status = "Completed"
        self.progress = 100

    def save_buffer(self, output_dir, base_name, idx, content):
        """Write *content* to ``<base_name>_part_<idx>.<output_format>``.

        Nothing is written when *content* is empty or whitespace-only.
        """
        if not content.strip():
            return

        filename = f"{base_name}_part_{idx}.{self.output_format}"
        filepath = os.path.join(output_dir, filename)

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Saved {filepath} ({len(content)} chars)")
|