doculift-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
doculift/scraper.py ADDED
@@ -0,0 +1,442 @@
1
+ from bs4 import BeautifulSoup, NavigableString
2
+ from urllib.parse import urljoin, urlparse
3
+ from collections import deque
4
+ import time
5
+ import os
6
+ import re
7
+
8
+ from playwright.sync_api import sync_playwright
9
+
10
+
11
class DocuLiftScraper:
    """Breadth-first site crawler that renders pages with Playwright.

    Depending on ``extract_mode`` the crawler either converts every in-scope
    page to Markdown and writes it to size-limited part files, or only
    collects the in-scope URLs into one text file per start URL.

    Progress is exposed through the ``status``, ``progress``,
    ``pages_scraped``, ``urls_extracted`` and ``per_url_stats`` attributes.
    """

    # Playwright waits (milliseconds) and the extra pause (seconds) that gives
    # client-side JavaScript a chance to finish rendering.
    NAVIGATION_TIMEOUT_MS = 45000
    NETWORK_IDLE_TIMEOUT_MS = 5000
    RENDER_SETTLE_SECONDS = 1

    # Tags removed from the page before the main content is converted.
    GARBAGE_TAGS = (
        "script",
        "style",
        "nav",
        "footer",
        "header",
        "aside",
        "form",
        "iframe",
        "noscript",
        "button",
    )

    def __init__(
        self,
        start_urls,
        output_format="md",
        max_pages=500,
        max_chars_per_file=500000,
        scope_type="section",
        extract_mode="content",
    ):
        """Store the crawl configuration and reset all counters.

        Args:
            start_urls: A single URL string or an iterable of URL strings.
                Blank entries are dropped and the rest are stripped.
            output_format: File extension used for content files
                (lower-cased).
            max_pages: Maximum number of pages processed per start URL.
            max_chars_per_file: Character budget of one output file.
            scope_type: ``"domain"`` crawls the whole host; any other value
                restricts the crawl to the directory of the start URL.
            extract_mode: ``"urls"`` makes ``run`` collect URLs only; any
                other value scrapes page content.
        """
        # Allow passing a single string or list
        if isinstance(start_urls, str):
            start_urls = [start_urls]
        self.start_urls = [url.strip() for url in start_urls if url.strip()]
        self.output_format = output_format.lower()
        self.max_pages = max_pages
        # 500KB approx char limit for NotebookLM
        self.max_chars_per_file = max_chars_per_file
        self.scope_type = scope_type
        self.extract_mode = extract_mode
        self.visited_urls = set()
        self.pages_scraped = 0
        self.urls_extracted = 0
        self.per_url_stats = {}  # start_url -> count of pages/urls collected
        self.status = "Initializing..."
        self.progress = 0

    def clean_text(self, text):
        """Collapse runs of spaces/tabs and blank lines, then strip the text.

        Returns an empty string for ``None`` or empty input.
        """
        if not text:
            return ""
        text = re.sub(r"[ \t]+", " ", text)
        text = re.sub(r"\n\s*\n", "\n\n", text)
        return text.strip()

    def html_to_markdown(self, element, base_url=""):
        """Recursively convert a BeautifulSoup node to Markdown text.

        Args:
            element: A BeautifulSoup tag, a text node, or ``None``.
            base_url: When given, relative link and image targets are
                resolved against it.

        Returns:
            The Markdown rendering of ``element`` (``""`` for ``None``).
        """
        if element is None:
            return ""
        if isinstance(element, NavigableString):
            # Function-scope import keeps this block self-contained; bs4 is
            # already a dependency of this module.
            from bs4 import Comment

            # HTML comments are NavigableString subclasses; without this
            # check their text would leak into the generated Markdown.
            if isinstance(element, Comment):
                return ""
            return self.clean_text(str(element))

        tag_name = element.name

        if tag_name in ("ul", "ol"):
            list_md = "\n"
            items = element.find_all("li", recursive=False)
            for number, item in enumerate(items, start=1):
                li_content = self.clean_text(
                    " ".join(
                        self.html_to_markdown(sub, base_url)
                        for sub in item.children
                    )
                )
                marker = "-" if tag_name == "ul" else f"{number}."
                list_md += f"{marker} {li_content}\n"
            return list_md + "\n"

        if tag_name == "pre":
            # Preformatted blocks keep their raw text, so they are handled
            # before the (whitespace-collapsing) recursion into children.
            lang = ""
            code_elem = element.find("code")
            if code_elem and code_elem.get("class"):
                # Try to find code language class
                for cls in code_elem.get("class"):
                    if cls.startswith("language-"):
                        lang = cls.replace("language-", "")
                        break
            return f"\n\n```{lang}\n{element.get_text()}\n```\n\n"

        rendered_children = (
            self.html_to_markdown(child, base_url) for child in element.children
        )
        content = self.clean_text(" ".join(filter(None, rendered_children)))

        if tag_name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            level = int(tag_name[1])
            return f"\n\n{'#' * level} {content}\n\n"
        if tag_name == "p":
            return f"\n\n{content}\n\n"
        if tag_name == "code":
            # A <code> nested in <pre> is emitted verbatim; a detached
            # element (no parent) is treated as inline code.
            if element.parent is not None and element.parent.name == "pre":
                return element.get_text()
            return f"`{content}`"
        if tag_name == "a":
            href = element.get("href", "")
            # Resolve relative URLs if base_url is present (simplified)
            if base_url and href and not href.startswith(("http", "mailto", "#")):
                href = urljoin(base_url, href)
            return f"[{content}]({href})"
        if tag_name == "img":
            src = element.get("src", "")
            if base_url and src and not src.startswith(("http", "data:")):
                src = urljoin(base_url, src)
            return f"![{element.get('alt', '')}]({src})"
        if tag_name == "blockquote":
            return f"\n> {content}\n"
        if tag_name == "tr":
            return content + "\n"
        if tag_name in ("td", "th"):
            return content + " | "

        return content

    def _render_html(self, page, url, failure_note=""):
        """Navigate ``page`` to ``url`` and return the rendered HTML.

        Navigation and network-idle failures are tolerated on purpose: the
        HTML currently held by the page is still returned.
        ``failure_note`` is inserted into the navigation failure message.
        """
        try:
            page.goto(url, timeout=self.NAVIGATION_TIMEOUT_MS)
        except Exception as e:
            print(f"Timeout on {url}{failure_note}: {e}")

        # Wait for meaningful content or network idle
        try:
            page.wait_for_load_state(
                "networkidle", timeout=self.NETWORK_IDLE_TIMEOUT_MS
            )
        except Exception:  # nosec B110
            pass

        # Additional small wait for JS rendering
        time.sleep(self.RENDER_SETTLE_SECONDS)
        return page.content()

    @staticmethod
    def _extract_scoped_links(soup, page_url, original_domain, scope_url):
        """Return the unique in-scope link targets of ``soup`` in page order.

        Each link is resolved against ``page_url`` and reduced to
        scheme + host + path (query string and fragment removed).
        """
        links = {}
        for anchor in soup.find_all("a", href=True):
            href = anchor.get("href").strip()
            # Skip anchor links on same page to avoid loops
            if href.startswith("#"):
                continue
            # Remove fragment for deduplication
            parsed = urlparse(urljoin(page_url, href).split("#")[0])
            clean_url = parsed.scheme + "://" + parsed.netloc + parsed.path
            # Check Domain & Scope
            if parsed.netloc == original_domain and clean_url.startswith(scope_url):
                links[clean_url] = None
        return list(links)

    def _progress_percent(self, url_idx, pages_done, total_start_urls):
        """Return overall progress (0-100) across all start URLs."""
        return int(
            ((url_idx + pages_done / self.max_pages) / total_start_urls) * 100
        )

    def fetch_and_process(self, page, url, original_domain, scope_url):
        """Render ``url`` and convert its main content to Markdown.

        Args:
            page: Playwright page used for navigation.
            url: Address to load.
            original_domain: Host that followed links must match exactly.
            scope_url: URL prefix that followed links must start with.

        Returns:
            ``(markdown_text, links, page_title)`` where ``links`` is a set
            of in-scope URLs, or ``(None, [], "")`` if processing failed.
        """
        try:
            print(f"Navigating to {url}...")
            html_content = self._render_html(
                page, url, ", attempting to read anyway"
            )
            soup = BeautifulSoup(html_content, "html.parser")

            # Extract links *before* cleaning garbage to ensure navigation works
            extracted_links = set(
                self._extract_scoped_links(soup, url, original_domain, scope_url)
            )

            for garbage in soup(list(self.GARBAGE_TAGS)):
                garbage.decompose()

            # Find main content area - heuristic
            main_content = (
                soup.find("main")
                or soup.find("article")
                or soup.find("div", class_="content")
                or soup.find("div", id="content")
                or soup.body
                or soup
            )

            markdown_text = self.html_to_markdown(main_content, base_url=url)

            # An empty <title>, or one containing nested markup, has
            # ``.string`` set to None; fall back instead of failing the page.
            title_tag = soup.title
            title_text = title_tag.get_text(strip=True) if title_tag else ""
            page_title = title_text or "Untitled"

            return markdown_text, extracted_links, page_title
        except Exception as e:
            print(f"Error processing {url}: {e}")
            return None, [], ""

    def generate_base_filename(self, url):
        """Derive a short, filesystem-friendly base name from ``url``.

        The name comes from the URL path (non-word runs become ``_``), or
        from the host when the path is empty. It is capped at 50 characters,
        and generic names such as ``docs`` are prefixed with the first host
        label.
        """
        parsed = urlparse(url)
        host = parsed.netloc
        # Strip only a leading "www."; a plain replace() would also mangle
        # hosts that merely contain that substring.
        if host.startswith("www."):
            host = host[len("www."):]
        # Ports and userinfo would otherwise put ":" or "@" into the name.
        host = re.sub(r"[^\w.-]+", "_", host)

        path = parsed.path.strip("/")
        if not path:
            filename = host
        else:
            # Create readable filename from path
            filename = re.sub(r"\W+", "_", path)

        # Ensure it's not too long
        if len(filename) > 50:
            filename = filename[:50]

        # Prepend part of domain if generic
        if filename in ["docs", "api", "en", "v1"]:
            domain_part = host.split(".")[0]
            filename = f"{domain_part}_{filename}"

        return filename

    def determine_scope(self, start_url):
        """Return the URL prefix that crawled links must start with.

        With ``scope_type == "domain"`` the whole host is in scope.
        Otherwise the scope is the directory of ``start_url``: a path ending
        in ``/`` is used as is, any other path loses its last segment
        (e.g. ``/docs/en/overview`` -> ``/docs/en/``). Query string and
        fragment are never part of the scope, because crawled links are
        compared without them.
        """
        parsed = urlparse(start_url)
        origin = f"{parsed.scheme}://{parsed.netloc}"

        if self.scope_type == "domain":
            # Broadest scope: Entire domain
            return f"{origin}/"

        path = parsed.path
        if path.endswith("/"):
            # Already a directory
            return f"{origin}{path}"

        # Page URL (including .../overview, .../index, .../index.html):
        # trim last segment to get parent folder
        # e.g. /docs/agents/tool-use -> /docs/agents/
        parent_path = path.rsplit("/", 1)[0]
        return f"{origin}{parent_path}/"

    def run_url_extraction(self, output_dir):
        """Crawl every start URL and write its in-scope URLs to a text file.

        One ``<base>_urls.txt`` file is written to ``output_dir`` per start
        URL; ``max_pages`` limits the pages scanned per start URL.
        """
        os.makedirs(output_dir, exist_ok=True)

        self.urls_extracted = 0

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                total_start_urls = len(self.start_urls)
                for url_idx, start_url in enumerate(self.start_urls):
                    original_domain = urlparse(start_url).netloc
                    scope = self.determine_scope(start_url)
                    print(f"Collecting URLs from {start_url} with scope: {scope}")

                    # Per-URL visited set — isolates crawl per start URL
                    local_visited = {start_url}
                    queue = deque([start_url])
                    local_all_urls = [start_url]
                    url_count = 1
                    self.urls_extracted += 1
                    pages_scanned = 0

                    while queue and pages_scanned < self.max_pages:
                        url = queue.popleft()
                        self.status = f"Scanning: {url}"

                        try:
                            html_content = self._render_html(page, url)
                            soup = BeautifulSoup(html_content, "html.parser")

                            for clean_url in self._extract_scoped_links(
                                soup, url, original_domain, scope
                            ):
                                if (
                                    clean_url in local_visited
                                    or clean_url in self.visited_urls
                                ):
                                    continue
                                local_visited.add(clean_url)
                                queue.append(clean_url)
                                local_all_urls.append(clean_url)
                                url_count += 1
                                self.urls_extracted += 1

                            pages_scanned += 1
                            self.pages_scraped += 1
                            self.progress = self._progress_percent(
                                url_idx, pages_scanned, total_start_urls
                            )
                        except Exception as e:
                            print(f"Error scanning {url}: {e}")

                    # Merge this URL's visited pages into the global set
                    self.visited_urls.update(local_visited)
                    self.per_url_stats[start_url] = url_count

                    # Write collected URLs for THIS start_url to its own file
                    base_filename = self.generate_base_filename(start_url)
                    filepath = os.path.join(output_dir, f"{base_filename}_urls.txt")
                    with open(filepath, "w", encoding="utf-8") as f:
                        f.writelines(u + "\n" for u in local_all_urls)
                    print(f"Saved {filepath} ({url_count} URLs)")
            finally:
                # Release the browser even if the crawl aborts.
                browser.close()

        self.status = "Completed"
        self.progress = 100

    def run(self, output_dir):
        """Run the crawl and write the results into ``output_dir``.

        In ``"urls"`` mode this delegates to ``run_url_extraction``.
        Otherwise each start URL is crawled breadth-first and its pages are
        appended to ``<base>_part_<n>.<format>`` files, starting a new part
        whenever adding a page would exceed ``max_chars_per_file``. A single
        page larger than the limit is written unsplit.
        """
        if self.extract_mode == "urls":
            self.run_url_extraction(output_dir)
            return

        os.makedirs(output_dir, exist_ok=True)

        # Launch Playwright once
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Loop strictly through provided Start URLs
                total_start_urls = len(self.start_urls)
                for url_idx, start_url in enumerate(self.start_urls):
                    # Per-Start-URL isolation
                    original_domain = urlparse(start_url).netloc
                    scope = self.determine_scope(start_url)
                    print(f"Starting {start_url} with scope: {scope}")

                    # Per-URL visited set — isolates crawl per start URL
                    local_visited = {start_url}
                    queue = deque([start_url])

                    # Per-URL counters
                    pages_scraped_this_url = 0
                    current_file_idx = 1
                    current_buffer = ""
                    base_filename = self.generate_base_filename(start_url)

                    # BFS — max_pages applies per start URL
                    while queue and pages_scraped_this_url < self.max_pages:
                        url = queue.popleft()
                        self.status = f"Scraping: {url}"

                        md_content, links, page_title = self.fetch_and_process(
                            page, url, original_domain, scope
                        )

                        if md_content:
                            header = (
                                f"\n\n---\n# Page: {page_title}\n# URL: {url}\n---\n\n"
                            )
                            chunk = header + md_content

                            # Flush only a non-empty buffer: an oversized
                            # first page must not consume a part index
                            # without writing a file.
                            if (
                                current_buffer
                                and len(current_buffer) + len(chunk)
                                > self.max_chars_per_file
                            ):
                                self.save_buffer(
                                    output_dir,
                                    base_filename,
                                    current_file_idx,
                                    current_buffer,
                                )
                                current_file_idx += 1
                                current_buffer = chunk
                            else:
                                current_buffer += chunk

                            pages_scraped_this_url += 1
                            self.pages_scraped += 1
                            self.progress = self._progress_percent(
                                url_idx, pages_scraped_this_url, total_start_urls
                            )

                        for link in links:
                            if (
                                link not in local_visited
                                and link not in self.visited_urls
                            ):
                                local_visited.add(link)
                                queue.append(link)

                    # Merge into global visited to prevent cross-URL duplication
                    self.visited_urls.update(local_visited)
                    self.per_url_stats[start_url] = pages_scraped_this_url

                    # Flush remaining buffer for this URL
                    if current_buffer:
                        self.save_buffer(
                            output_dir, base_filename, current_file_idx, current_buffer
                        )
            finally:
                # Release the browser even if the crawl aborts.
                browser.close()

        self.status = "Completed"
        self.progress = 100

    def save_buffer(self, output_dir, base_name, idx, content):
        """Write ``content`` to ``<base_name>_part_<idx>.<output_format>``.

        Whitespace-only content is skipped and no file is created.
        """
        if not content.strip():
            return

        filename = f"{base_name}_part_{idx}.{self.output_format}"
        filepath = os.path.join(output_dir, filename)

        with open(filepath, "w", encoding="utf-8") as f:
            f.write(content)
        print(f"Saved {filepath} ({len(content)} chars)")