docs_crawler-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docs_crawler/__init__.py ADDED
@@ -0,0 +1,7 @@
+ """A documentation crawler that converts web documentation to Markdown format."""
+
+ __version__ = "0.1.0"
+
+ from docs_crawler.crawler import Crawler
+
+ __all__ = ["Crawler"]
docs_crawler/cli.py ADDED
@@ -0,0 +1,241 @@
+ import os
+ import sys
+ import argparse
+ import logging
+ from urllib.parse import urlparse
+ from docs_crawler.crawler import Crawler
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+ logger = logging.getLogger(__name__)
+
+
+ def extract_subdomain(url):
+     """Extract subdomain from URL for file naming."""
+     parsed = urlparse(url)
+     hostname = parsed.hostname
+     if hostname:
+         parts = hostname.split('.')
+         if len(parts) >= 2:
+             return parts[-2]
+         elif len(parts) == 1:
+             return parts[0]
+     return 'default'
+
+
+ def main():
+     """Main CLI entry point for docs-crawler."""
+     parser = argparse.ArgumentParser(
+         description="Crawl and convert documentation to Markdown.",
+         formatter_class=argparse.RawDescriptionHelpFormatter,
+         epilog="""
+ Examples:
+   # Crawl from sitemap (tries sitemap first, falls back to recursive discovery)
+   docs-crawler --base-url https://example.com
+
+   # Discover links only and save to file
+   docs-crawler --mode discover --base-url https://example.com
+
+   # Crawl from a list of URLs in a file
+   docs-crawler --mode list --file urls.txt
+
+   # Specify custom output folder
+   docs-crawler --base-url https://example.com --folder my-docs
+         """
+     )
+
+     parser.add_argument(
+         '--mode',
+         choices=['sitemap', 'discover', 'list'],
+         default='sitemap',
+         help="Mode: 'sitemap' (crawl, tries sitemap then recursive), 'discover' (find and save URLs), or 'list' (crawl from file)."
+     )
+
+     parser.add_argument(
+         '--base-url',
+         help="Base URL of the documentation site (e.g., https://example.com)"
+     )
+
+     parser.add_argument(
+         '--start-url',
+         help="Starting URL for recursive discovery (e.g., https://example.com/docs/)"
+     )
+
+     parser.add_argument(
+         '--sitemap-url',
+         help="URL of the sitemap (overrides auto-detected sitemap URL)"
+     )
+
+     parser.add_argument(
+         '--file',
+         help="Path to the text file containing URLs (required if mode is 'list')."
+     )
+
+     parser.add_argument(
+         '--output-file',
+         help="Output file for discovered URLs (used in discover mode, auto-generated if not specified)"
+     )
+
+     parser.add_argument(
+         '--folder',
+         help="Custom folder name under output directory (overrides auto-detection from domain)."
+     )
+
+     parser.add_argument(
+         '--output-dir',
+         default='output',
+         help="Output directory for markdown files (default: output)"
+     )
+
+     parser.add_argument(
+         '--path-filter',
+         default='/docs/',
+         help="Path pattern to filter links (default: /docs/)"
+     )
+
+     parser.add_argument(
+         '--max-depth',
+         type=int,
+         default=100,
+         help="Maximum number of URLs to discover in recursive mode (default: 100)"
+     )
+
+     args = parser.parse_args()
+
+     # Validate arguments
+     urls = None
+
+     if args.mode == 'discover':
+         # Discover mode: find links and save to file
+         if not args.base_url and not args.start_url:
+             parser.error("--base-url or --start-url is required when mode is 'discover'")
+
+         crawler = Crawler(
+             base_url=args.base_url,
+             sitemap_url=args.sitemap_url,
+             output_dir=args.output_dir,
+             custom_folder=args.folder
+         )
+
+         try:
+             # Discover links
+             discovered_urls = crawler.discover_links(
+                 start_url=args.start_url,
+                 path_filter=args.path_filter,
+                 max_depth=args.max_depth
+             )
+
+             if not discovered_urls:
+                 logger.warning("No URLs discovered.")
+                 sys.exit(0)
+
+             # Generate output filename
+             if args.output_file:
+                 output_file = args.output_file
+             else:
+                 # Use subdomain-based naming
+                 base = args.base_url or args.start_url
+                 subdomain = extract_subdomain(base)
+                 output_file = f"{subdomain}_urls.txt"
+
+             # Show discovered URLs and ask for confirmation
+             logger.info(f"\nDiscovered {len(discovered_urls)} URLs:")
+             print("\nFirst 10 URLs:")
+             for url in discovered_urls[:10]:
+                 print(f"  - {url}")
+             if len(discovered_urls) > 10:
+                 print(f"  ... and {len(discovered_urls) - 10} more")
+
+             # Ask for confirmation
+             print(f"\nSave URLs to '{output_file}'? [Y/n]: ", end='', flush=True)
+             response = input().strip().lower()
+
+             if response in ['', 'y', 'yes']:
+                 with open(output_file, 'w', encoding='utf-8') as f:
+                     for url in discovered_urls:
+                         f.write(f"{url}\n")
+                 logger.info(f"Saved {len(discovered_urls)} URLs to {output_file}")
+                 logger.info(f"You can now run: docs-crawler --mode list --file {output_file}")
+             else:
+                 logger.info("Cancelled. URLs not saved.")
+
+         except KeyboardInterrupt:
+             logger.info("\nDiscovery interrupted by user.")
+             sys.exit(0)
+         except Exception as e:
+             logger.error(f"Error during discovery: {e}")
+             import traceback
+             traceback.print_exc()
+             sys.exit(1)
+
+     elif args.mode == 'sitemap':
+         # Sitemap mode (with fallback to recursive discovery)
+         if not args.base_url and not args.sitemap_url:
+             parser.error("--base-url or --sitemap-url is required when mode is 'sitemap'")
+
+         crawler = Crawler(
+             base_url=args.base_url,
+             sitemap_url=args.sitemap_url,
+             output_dir=args.output_dir,
+             custom_folder=args.folder
+         )
+
+         try:
+             crawler.run(
+                 urls=None,
+                 start_url=args.start_url,
+                 path_filter=args.path_filter,
+                 max_depth=args.max_depth
+             )
+         except KeyboardInterrupt:
+             logger.info("\nCrawling interrupted by user.")
+             sys.exit(0)
+         except Exception as e:
+             logger.error(f"Error during crawling: {e}")
+             import traceback
+             traceback.print_exc()
+             sys.exit(1)
+
+     elif args.mode == 'list':
+         # List mode: crawl from file
+         if not args.file:
+             parser.error("--file is required when mode is 'list'")
+
+         if not os.path.exists(args.file):
+             logger.error(f"File not found: {args.file}")
+             sys.exit(1)
+
+         try:
+             with open(args.file, 'r', encoding='utf-8') as f:
+                 urls = [line.strip() for line in f if line.strip()]
+             logger.info(f"Loaded {len(urls)} URLs from {args.file}")
+         except Exception as e:
+             logger.error(f"Failed to read file {args.file}: {e}")
+             sys.exit(1)
+
+         # Determine base_url from first URL if not provided
+         if not args.base_url and urls:
+             parsed = urlparse(urls[0])
+             args.base_url = f"{parsed.scheme}://{parsed.netloc}"
+
+         crawler = Crawler(
+             base_url=args.base_url,
+             sitemap_url=args.sitemap_url,
+             output_dir=args.output_dir,
+             custom_folder=args.folder
+         )
+
+         try:
+             crawler.run(urls=urls)
+         except KeyboardInterrupt:
+             logger.info("\nCrawling interrupted by user.")
+             sys.exit(0)
+         except Exception as e:
+             logger.error(f"Error during crawling: {e}")
+             import traceback
+             traceback.print_exc()
+             sys.exit(1)
+
+
+ if __name__ == "__main__":
+     main()
docs_crawler/crawler.py ADDED
@@ -0,0 +1,401 @@
+ import os
+ import logging
+ from urllib.parse import urlparse, urljoin
+ from bs4 import BeautifulSoup
+ from markdownify import markdownify as md
+ from playwright.sync_api import sync_playwright
+ from tqdm import tqdm
+ import requests
+
+ # Configuration
+ MAX_RETRIES = 3
+ PAGE_LOAD_TIMEOUT = 30000  # 30-second page-load timeout (milliseconds)
+ MAX_DISCOVERY_DEPTH = 10  # Maximum number of URLs to discover recursively
+
+ # Setup logging
+ logger = logging.getLogger(__name__)
+
+
+ class Crawler:
+     def __init__(self, base_url=None, sitemap_url=None, output_dir="output", custom_folder=None):
+         """
+         Initialize the crawler.
+
+         Args:
+             base_url: Base URL of the documentation site
+             sitemap_url: URL of the sitemap
+             output_dir: Output directory for markdown files
+             custom_folder: Custom folder name under output_dir
+         """
+         self.base_url = base_url
+         self.sitemap_url = sitemap_url or (f"{base_url}/sitemap.xml" if base_url else None)
+         self.output_dir = output_dir
+         self.session = requests.Session()
+         self.session.headers.update({
+             'User-Agent': 'Mozilla/5.0 (compatible; Bot/1.0; +http://example.com)'
+         })
+         self.results = []
+         self.subdomain = None
+         self.custom_folder = custom_folder
+
+         # Ensure output directory exists
+         os.makedirs(output_dir, exist_ok=True)
+
+     def extract_subdomain(self, url):
+         """Extract the second-level (main) domain from a URL to use as the folder name."""
+         parsed = urlparse(url)
+         hostname = parsed.hostname
+         if hostname:
+             parts = hostname.split('.')
+
+             # Logic for extracting the second-level (main) domain:
+             #   code.claude.com -> parts[-2] = 'claude'
+             #   antigravity.google -> parts[-2] = 'antigravity'
+             #   example.com -> parts[-2] = 'example'
+             #   localhost -> parts[-1] = 'localhost'
+
+             if len(parts) >= 2:
+                 # Take the second-to-last part as the second-level domain
+                 return parts[-2]
+             elif len(parts) == 1:
+                 # Only one part, e.g. localhost
+                 return parts[0]
+
+         return 'default'
+
+     def fetch_sitemap(self):
+         """Fetches and parses the sitemap to extract /docs/ URLs."""
+         if not self.sitemap_url:
+             logger.error("No sitemap URL configured")
+             return []
+
+         try:
+             logger.info(f"Fetching sitemap from {self.sitemap_url}")
+             response = self.session.get(self.sitemap_url)
+             response.raise_for_status()
+
+             # XML parsing (using lxml if available, else html.parser)
+             # sitemap files are often just text/xml
+             soup = BeautifulSoup(response.content, 'xml')
+             urls = [loc.text for loc in soup.find_all('loc')]
+
+             # Filter for /docs/
+             doc_urls = [url for url in urls if '/docs/' in urlparse(url).path]
+             logger.info(f"Found {len(doc_urls)} pages under /docs/")
+             return doc_urls
+         except Exception as e:
+             logger.error(f"Failed to fetch sitemap: {e}")
+             return []
+
+     def extract_links_from_page(self, page, current_url, path_filter='/docs/'):
+         """
+         Extract all links from a page that match the path filter.
+
+         Args:
+             page: Playwright page object
+             current_url: Current page URL
+             path_filter: Path pattern to filter links (default: '/docs/')
+
+         Returns:
+             Set of discovered URLs
+         """
+         links = set()
+
+         try:
+             # Get all <a> tags
+             link_elements = page.query_selector_all('a[href]')
+
+             parsed_base = urlparse(current_url)
+             base_domain = parsed_base.netloc
+
+             for element in link_elements:
+                 href = element.get_attribute('href')
+                 if not href:
+                     continue
+
+                 # Convert relative URLs to absolute
+                 absolute_url = urljoin(current_url, href)
+                 parsed_url = urlparse(absolute_url)
+
+                 # Filter: same domain and contains path_filter
+                 if (parsed_url.netloc == base_domain and
+                         path_filter in parsed_url.path):
+                     # Remove fragment and normalize
+                     clean_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
+                     if parsed_url.query:
+                         clean_url += f"?{parsed_url.query}"
+                     links.add(clean_url)
+
+         except Exception as e:
+             logger.warning(f"Error extracting links from {current_url}: {e}")
+
+         return links
+
+     def discover_links_recursive(self, start_url, path_filter='/docs/', max_depth=MAX_DISCOVERY_DEPTH):
+         """
+         Recursively discover documentation links starting from a URL.
+
+         Args:
+             start_url: Starting URL for discovery
+             path_filter: Path pattern to filter links (default: '/docs/')
+             max_depth: Maximum number of URLs to discover
+
+         Returns:
+             List of discovered URLs
+         """
+         discovered = set()
+         to_visit = {start_url}
+         visited = set()
+
+         logger.info(f"Starting recursive link discovery from {start_url}")
+
+         with sync_playwright() as p:
+             browser = p.chromium.launch(headless=True)
+             context = browser.new_context(
+                 user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+             )
+             page = context.new_page()
+
+             pbar = tqdm(desc="Discovering links", unit="page")
+
+             while to_visit and len(discovered) < max_depth:
+                 current_url = to_visit.pop()
+
+                 if current_url in visited:
+                     continue
+
+                 visited.add(current_url)
+                 discovered.add(current_url)
+                 pbar.update(1)
+                 pbar.set_postfix({"found": len(discovered), "queue": len(to_visit)})
+
+                 try:
+                     # Load the page
+                     page.goto(current_url, timeout=PAGE_LOAD_TIMEOUT)
+                     page.wait_for_load_state('networkidle', timeout=15000)
+
+                     # Extract links from this page
+                     new_links = self.extract_links_from_page(page, current_url, path_filter)
+
+                     # Add new unvisited links to the queue
+                     for link in new_links:
+                         if link not in visited and link not in discovered:
+                             to_visit.add(link)
+
+                 except Exception as e:
+                     logger.warning(f"Failed to process {current_url}: {e}")
+
+             pbar.close()
+             browser.close()
+
+         logger.info(f"Discovery complete. Found {len(discovered)} URLs")
+         return sorted(list(discovered))
+
+     def discover_links(self, start_url=None, path_filter='/docs/', max_depth=MAX_DISCOVERY_DEPTH):
+         """
+         Discover documentation links. Try the sitemap first, then fall back to recursive discovery.
+
+         Args:
+             start_url: Starting URL for recursive discovery (if sitemap fails)
+             path_filter: Path pattern to filter links (default: '/docs/')
+             max_depth: Maximum number of URLs to discover in recursive mode
+
+         Returns:
+             List of discovered URLs
+         """
+         # Try sitemap first
+         urls = self.fetch_sitemap()
+
+         if urls:
+             logger.info(f"Successfully found {len(urls)} URLs from sitemap")
+             return urls
+
+         # Fallback to recursive discovery
+         logger.info("Sitemap not available, using recursive link discovery")
+
+         if not start_url:
+             # Try to construct a starting URL
+             if self.base_url:
+                 start_url = f"{self.base_url}/docs/" if not self.base_url.endswith('/') else f"{self.base_url}docs/"
+             else:
+                 logger.error("No start URL provided and no base_url configured")
+                 return []
+
+         return self.discover_links_recursive(start_url, path_filter, max_depth)
+
+     def process_url_with_playwright(self, page, url):
+         """Downloads and converts a single URL using Playwright."""
+         # If the subdomain has not been set yet, derive it from the current URL or use custom_folder
+         if self.subdomain is None:
+             if self.custom_folder:
+                 self.subdomain = self.custom_folder
+                 logger.info(f"Using custom folder: {self.subdomain}")
+             else:
+                 self.subdomain = self.extract_subdomain(url)
+                 logger.info(f"Using auto-detected folder (domain): {self.subdomain}")
+             # Create the subfolder
+             self.output_subdir = os.path.join(self.output_dir, self.subdomain)
+             os.makedirs(self.output_subdir, exist_ok=True)
+
+         slug = urlparse(url).path.strip('/').replace('/', '_')
+         if not slug:
+             slug = "index"
+         filename = f"{slug}.md"
+         filepath = os.path.join(self.output_subdir, filename)
+
+         content = None
+         title = None
+
+         for attempt in range(MAX_RETRIES):
+             try:
+                 # Navigate to the page
+                 page.goto(url, timeout=PAGE_LOAD_TIMEOUT)
+
+                 # Wait for the main content to finish loading:
+                 # try waiting for an article element or the main region
+                 try:
+                     page.wait_for_selector('article, main, [role="main"]', timeout=10000)
+                 except Exception:
+                     pass
+
+                 # Wait a little longer to make sure JS has fully rendered
+                 page.wait_for_load_state('networkidle', timeout=15000)
+
+                 # Grab the rendered HTML
+                 content = page.content()
+                 break
+             except Exception as e:
+                 logger.warning(f"Attempt {attempt + 1} failed for {url}: {e}")
+                 if attempt == MAX_RETRIES - 1:
+                     logger.error(f"Failed to download {url} after {MAX_RETRIES} attempts")
+                     return None
+
+         if content:
+             try:
+                 markdown_content, page_title = self.convert_to_markdown(content)
+                 title = page_title
+
+                 with open(filepath, 'w', encoding='utf-8') as f:
+                     f.write(markdown_content)
+
+                 return {'title': title, 'url': url, 'file': filename}
+             except Exception as e:
+                 logger.error(f"Error converting {url}: {e}")
+
+         return None
+
+     def convert_to_markdown(self, html_content):
+         """Extracts content and converts to Markdown."""
+         soup = BeautifulSoup(html_content, 'html.parser')
+
+         # Extract title
+         title_tag = soup.find('title')
+         title = title_tag.text.strip() if title_tag else "No Title"
+
+         # Remove unwanted elements
+         for tag in soup.find_all(['nav', 'footer', 'script', 'style', 'noscript', 'iframe', 'header']):
+             tag.decompose()
+
+         # Common classes/IDs for unwanted elements
+         unwanted_selectors = [
+             '.sidebar', '#sidebar',
+             '.toc', '#toc',
+             '.breadcrumbs', '.breadcrumb',
+             '.footer', '.header', '.nav',
+             '[role="navigation"]',
+             '.navigation',
+             '.menu'
+         ]
+         for selector in unwanted_selectors:
+             for element in soup.select(selector):
+                 element.decompose()
+
+         # Prioritize content extraction - try more specific selectors first
+         content_element = None
+
+         # Try to locate the documentation content area
+         content_selectors = [
+             'article',
+             '[role="main"]',
+             '.docs-content',
+             '.content',
+             '.markdown-body',
+             'main',
+             '.main-content'
+         ]
+
+         for selector in content_selectors:
+             content_element = soup.select_one(selector)
+             if content_element and len(content_element.get_text(strip=True)) > 100:
+                 break
+
+         if not content_element:
+             content_element = soup.find('body')
+
+         if not content_element:
+             return "", title
+
+         # Convert to Markdown
+         markdown = md(str(content_element), heading_style="ATX", strip=['img'])
+
+         # Collapse redundant blank lines
+         lines = markdown.split('\n')
+         cleaned_lines = []
+         prev_empty = False
+         for line in lines:
+             is_empty = not line.strip()
+             if is_empty and prev_empty:
+                 continue
+             cleaned_lines.append(line)
+             prev_empty = is_empty
+
+         return '\n'.join(cleaned_lines).strip(), title
+
+     def generate_index(self):
+         """Generates the index.md file."""
+         index_path = os.path.join(self.output_subdir, "index.md")
+         with open(index_path, 'w', encoding='utf-8') as f:
+             f.write("# Documentation Index\n\n")
+             f.write("| Title | Original URL | Local File |\n")
+             f.write("|-------|--------------|------------|\n")
+             for item in sorted(self.results, key=lambda x: x['title']):
+                 f.write(f"| {item['title']} | [{item['url']}]({item['url']}) | [{item['file']}]({item['file']}) |\n")
+         logger.info(f"Generated index at {index_path}")
+
+     def run(self, urls=None, start_url=None, path_filter='/docs/', max_depth=MAX_DISCOVERY_DEPTH):
+         """
+         Run the crawler.
+
+         Args:
+             urls: List of URLs to crawl. If None, uses the discover_links method.
+             start_url: Starting URL for recursive discovery (if needed)
+             path_filter: Path pattern to filter links (default: '/docs/')
+             max_depth: Maximum number of URLs to discover in recursive mode
+         """
+         if urls is None:
+             urls = self.discover_links(start_url, path_filter, max_depth)
+
+         if not urls:
+             logger.warning("No URLs found to process.")
+             return
+
+         logger.info(f"Starting download of {len(urls)} pages using Playwright...")
+
+         with sync_playwright() as p:
+             # Launch the browser
+             browser = p.chromium.launch(headless=True)
+             context = browser.new_context(
+                 user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
+             )
+             page = context.new_page()
+
+             # Show progress with tqdm
+             for url in tqdm(urls, unit="page"):
+                 result = self.process_url_with_playwright(page, url)
+                 if result:
+                     self.results.append(result)
+
+             browser.close()
+
+         self.generate_index()
+         logger.info("Done.")
docs_crawler-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,262 @@
+ Metadata-Version: 2.4
+ Name: docs-crawler
+ Version: 0.1.0
+ Summary: A documentation crawler that converts web documentation to Markdown format
+ License: MIT
+ License-File: LICENSE
+ Keywords: crawler,documentation,markdown,scraper
+ Author: nev4rb14su
+ Requires-Python: >=3.8.1,<4.0.0
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Classifier: Programming Language :: Python :: 3.11
+ Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
+ Classifier: Programming Language :: Python :: 3.14
+ Classifier: Programming Language :: Python :: 3.8
+ Requires-Dist: beautifulsoup4 (>=4.12.0,<5.0.0)
+ Requires-Dist: lxml (>=5.0.0,<6.0.0)
+ Requires-Dist: markdownify (>=0.11.0,<0.12.0)
+ Requires-Dist: playwright (>=1.40.0,<2.0.0)
+ Requires-Dist: requests (>=2.31.0,<3.0.0)
+ Requires-Dist: tqdm (>=4.66.0,<5.0.0)
+ Project-URL: Homepage, https://github.com/neverbiasu/docs-crawler
+ Description-Content-Type: text/markdown
+
+ # Docs Crawler
+
+ A documentation crawler that converts web documentation to Markdown format, using Playwright to handle JavaScript-rendered content.
+
+ ## Features
+
+ - **Smart Link Discovery**: Tries the sitemap first, then automatically falls back to recursive link discovery
+ - **Discover Mode**: Find and save documentation URLs before crawling
+ - Crawls documentation from sitemaps or URL lists
+ - Uses Playwright to handle JavaScript-rendered Single Page Applications (SPAs)
+ - Converts HTML to clean Markdown format
+ - Auto-detects a domain-based folder structure
+ - Generates an index of all crawled pages
+ - Progress tracking with tqdm
+ - Retry logic for failed requests
+
+ ## Requirements
+
+ - Python 3.8.1+
+ - Poetry (for dependency management)
+
+ ## Installation
+
+ ### Using Poetry (Recommended)
+
+ ```bash
+ # Install Poetry if you haven't already
+ curl -sSL https://install.python-poetry.org | python3 -
+
+ # Clone the repository
+ git clone https://github.com/neverbiasu/docs-crawler.git
+ cd docs-crawler
+
+ # Install dependencies
+ poetry install
+
+ # Install Playwright browsers
+ poetry run playwright install chromium
+ ```
+
+ ### Using pip
+
+ ```bash
+ pip install docs-crawler
+ playwright install chromium
+ ```
+
+ ## Usage
+
+ ### Command Line Interface
+
+ The package provides a `docs-crawler` command with three modes:
+
+ #### 1. Sitemap Mode (Default)
+ Tries to fetch URLs from the sitemap first and automatically falls back to recursive link discovery if no sitemap is available.
+
+ ```bash
+ # Crawl from sitemap (with automatic fallback)
+ poetry run docs-crawler --base-url https://example.com
+
+ # Specify custom sitemap URL
+ poetry run docs-crawler --sitemap-url https://example.com/custom-sitemap.xml
+
+ # Customize path filter and max URLs to discover
+ poetry run docs-crawler --base-url https://example.com --path-filter /docs/ --max-depth 200
+ ```
+
+ #### 2. Discover Mode
+ Discover all documentation URLs and save them to a file for review before crawling.
+
+ ```bash
+ # Discover links and save to auto-generated file (e.g., example_urls.txt)
+ poetry run docs-crawler --mode discover --base-url https://example.com
+
+ # Specify custom output file
+ poetry run docs-crawler --mode discover --base-url https://example.com --output-file my-urls.txt
+
+ # Start from a specific URL
+ poetry run docs-crawler --mode discover --start-url https://example.com/docs/intro
+
+ # Customize discovery settings
+ poetry run docs-crawler --mode discover --base-url https://example.com --path-filter /api/ --max-depth 50
+ ```
+
+ Discover mode will:
+ 1. Find all documentation links (using sitemap or recursive discovery)
+ 2. Display the first 10 URLs as a preview
+ 3. Ask for your confirmation before saving
+ 4. Save the URLs to a file named `{subdomain}_urls.txt` (e.g., `example_urls.txt`), as shown below
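+
+ The saved file is plain text with one URL per line, which is exactly the format that `--mode list --file` reads back in (blank lines are skipped). A hypothetical `example_urls.txt` might look like this:
+
+ ```text
+ https://example.com/docs/getting-started
+ https://example.com/docs/installation
+ https://example.com/docs/api/reference
+ ```
+
+ You can freely reorder, group, or prune entries in this file before crawling.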
+
+ #### 3. List Mode
+ Crawl from a list of URLs in a text file.
+
+ ```bash
+ # Crawl from URL list
+ poetry run docs-crawler --mode list --file urls.txt
+
+ # Specify custom output folder
+ poetry run docs-crawler --mode list --file urls.txt --folder my-docs
+ ```
+
+ #### Common Options
+
+ ```bash
+ # Custom output directory
+ --output-dir custom-output
+
+ # Custom folder name
+ --folder my-docs
+
+ # Path filter for link discovery (default: /docs/)
+ --path-filter /documentation/
+
+ # Maximum URLs to discover (default: 100)
+ --max-depth 500
+
+ # Starting URL for recursive discovery
+ --start-url https://example.com/docs/
+ ```
+
+ ### Python API
+
+ ```python
+ from docs_crawler import Crawler
+
+ # Create crawler instance
+ crawler = Crawler(
+     base_url="https://antigravity.google",
+     output_dir="output",
+     custom_folder="antigravity"
+ )
+
+ # Run with automatic link discovery (sitemap first, then recursive)
+ crawler.run()
+
+ # Discover links only
+ urls = crawler.discover_links(
+     start_url="https://example.com/docs/",
+     path_filter="/docs/",
+     max_depth=100
+ )
+ print(f"Found {len(urls)} URLs")
+
+ # Run with custom URLs
+ crawler.run(urls=[
+     "https://example.com/docs/page1",
+     "https://example.com/docs/page2"
+ ])
+
+ # Run with custom discovery settings
+ crawler.run(
+     start_url="https://example.com/docs/intro",
+     path_filter="/documentation/",
+     max_depth=200
+ )
+ ```
+
+ ## Output
+
+ - The downloaded Markdown files will be saved in the `output/` directory (or a custom directory).
+ - An index of all downloaded pages is available at `output/{folder}/index.md`.
+ - Files are organized by domain or custom folder name; the sketch below shows how URLs map to filenames.
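+
+ As a rough sketch of the naming scheme (mirroring the crawler's internal slug logic; the URL below is hypothetical), the folder comes from the second-level domain and the filename from the URL path with `/` replaced by `_`:
+
+ ```python
+ from urllib.parse import urlparse
+
+ url = "https://example.com/docs/getting-started/install"
+ parsed = urlparse(url)
+
+ folder = parsed.hostname.split('.')[-2]                       # 'example'
+ slug = parsed.path.strip('/').replace('/', '_') or "index"    # 'docs_getting-started_install'
+ print(f"output/{folder}/{slug}.md")                           # output/example/docs_getting-started_install.md
+ ```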
+
+ ## Development
+
+ ```bash
+ # Install development dependencies
+ poetry install --with dev
+
+ # Run tests
+ poetry run pytest
+
+ # Format code
+ poetry run black .
+
+ # Lint code
+ poetry run flake8
+
+ # Type checking
+ poetry run mypy docs_crawler
+ ```
+
+ ## Configuration
+
+ The crawler can be configured through:
+ - Command-line arguments
+ - Python API parameters
+ - Environment variables (coming soon)
+
+ ## How It Works
+
+ ### Link Discovery
+
+ The crawler uses a smart two-step approach (see the sketch after this list):
+
+ 1. **Sitemap First**: Attempts to fetch URLs from the sitemap.xml file
+ 2. **Recursive Discovery Fallback**: If the sitemap is unavailable or empty, automatically discovers links by:
+    - Starting from a base URL (e.g., `/docs/`)
+    - Extracting all internal links matching the path filter
+    - Recursively crawling pages to find more documentation links
+    - Respecting the max-depth limit to avoid excessive crawling
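+
+ In Python terms, calling `discover_links()` is roughly equivalent to the following sketch (the URLs are placeholders):
+
+ ```python
+ from docs_crawler import Crawler
+
+ crawler = Crawler(base_url="https://example.com")
+
+ # Step 1: try the sitemap; this returns an empty list if it is missing or unreachable.
+ urls = crawler.fetch_sitemap()
+
+ # Step 2: fall back to Playwright-based recursive discovery.
+ if not urls:
+     urls = crawler.discover_links_recursive(
+         start_url="https://example.com/docs/",
+         path_filter="/docs/",
+         max_depth=100,
+     )
+ ```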
+
+ ### Workflow Example
+
+ ```bash
+ # Step 1: Discover links and save them for review
+ poetry run docs-crawler --mode discover --base-url https://example.com
+ # Output: example_urls.txt
+
+ # Step 2: Review and edit example_urls.txt if needed
+ # (Remove unwanted URLs, add missing ones, etc.)
+
+ # Step 3: Crawl the URLs
+ poetry run docs-crawler --mode list --file example_urls.txt
+ ```
+
+ ## Notes
+
+ - The crawler uses Playwright to handle JavaScript-rendered content, making it suitable for modern SPAs.
+ - The default path filter is `/docs/` and can be customized with `--path-filter`.
+ - Respects retry limits and timeouts to be polite to servers.
+ - Auto-detects a domain-based folder structure or uses custom folder names.
+ - Recursive discovery avoids infinite loops by tracking visited URLs.
+ - URL files are named using the subdomain for easy identification (e.g., `github_urls.txt`, `example_urls.txt`).
+
+ ## License
+
+ MIT License - see the LICENSE file for details.
+
+ ## Contributing
+
+ Contributions are welcome! Please feel free to submit a Pull Request.
+
docs_crawler-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,8 @@
+ docs_crawler/__init__.py,sha256=9IfLYZnZk_SS5HeOlZ0REqviVItH2JJcsbwq1Sd2Yyc,170
+ docs_crawler/cli.py,sha256=3jdQZyZdsN9pNVoi0J1zRTmKttuB4ctEvbjNJqw4ghA,7682
+ docs_crawler/crawler.py,sha256=1Q1oX8gmWiorR0dUngezTNmV8jvTibfTc5aZQ7cBe88,14647
+ docs_crawler-0.1.0.dist-info/METADATA,sha256=cQ0d7tpUrOALw6RJfF1p9teGKAFIjzydG8hOF6BvFf0,7345
+ docs_crawler-0.1.0.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ docs_crawler-0.1.0.dist-info/entry_points.txt,sha256=S5Z3NqFSSjBLd7yO1hF8pHYH7rYOlYHflvauHc7nRpM,54
+ docs_crawler-0.1.0.dist-info/licenses/LICENSE,sha256=igDghpYK4aLmOc5L2tYBYrHZuw1A3sFbZAv2nd3lSsg,1067
+ docs_crawler-0.1.0.dist-info/RECORD,,
docs_crawler-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: poetry-core 2.2.1
+ Root-Is-Purelib: true
+ Tag: py3-none-any
docs_crawler-0.1.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,3 @@
+ [console_scripts]
+ docs-crawler=docs_crawler.cli:main
+
docs_crawler-0.1.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 nev4rb14su
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.