html_docs_crawler 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,98 @@
1
+ Metadata-Version: 2.4
2
+ Name: html_docs_crawler
3
+ Version: 0.1.0
4
+ Summary: Universal documentation crawler that converts HTML pages to Markdown with internal link correction
5
+ Project-URL: Homepage, https://github.com/zwidny/doc_crawler
6
+ Project-URL: Source, https://github.com/zwidny/doc_crawler
7
+ Project-URL: BugTracker, https://github.com/zwidny/doc_crawler/issues
8
+ Keywords: scrapy,crawler,markdown,documentation,web-scraping
9
+ Requires-Python: >=3.12
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: fake-useragent>=2.2.0
12
+ Requires-Dist: html2text>=2025.4.15
13
+ Requires-Dist: markitdown[docx,pdf,pptx]>=0.1.5
14
+ Requires-Dist: scrapy>=2.14.1
15
+
16
+ # scrapy-mth
17
+
18
+ A Scrapy-based universal documentation crawler that converts HTML documentation sites to Markdown format, with automatic internal link rewriting to local `.md` relative paths. Supports multiple converter engines (markitdown / html2text), path whitelist filtering, and automatic media file download.
19
+
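+ For example, a page at `https://example.com/docs/guide/` is saved as `docs/guide/index.md`, `https://example.com/docs/api.html` becomes `docs/api.md`, and internal links in the generated Markdown are rewritten to the matching relative `.md` paths (the `example.com` URLs here are placeholders).
+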
20
+ ## Parameters
21
+
22
+ - `start_urls`: Starting URLs (comma-separated)
23
+ - `allowed_domains`: Allowed domains (comma-separated)
24
+ - `deny_patterns`: Regex deny patterns (comma-separated)
25
+ - `allow_paths`: Allowed path prefixes (comma-separated); only URLs starting with these prefixes will be processed
26
+ - `body_selector`: CSS selector for main HTML content (default: `"main, article, .content, .document, .body, body"`)
27
+ - `output_dir`: Output directory (default: `"~/.config/doc_crawler/_docs/{domain_name}"`, where `{domain_name}` is extracted from `start_urls`)
28
+ - `converter_engine`: Converter engine (default: `"markitdown"`; alternative: `"html2text"`)
29
+ - `single_page`: Single-page mode (default: `"false"`, set to `"true"` to crawl a single page without following links)
30
+
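+ These spider parameters are forwarded by the `doc_crawler` CLI as Scrapy `-a` arguments (see `cli.py`). As a rough sketch, an equivalent direct invocation from a checkout containing `scrapy.cfg` would look like this (the URL and paths below are placeholders):
+
+ ```bash
+ scrapy crawl doc_crawler \
+     -a start_urls="https://example.com/docs/" \
+     -a allow_paths="/docs/" \
+     -a body_selector="main, article" \
+     -a output_dir="_docs/example" \
+     --loglevel INFO
+ ```
+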
31
+ ## Install via UV
32
+
33
+ ```bash
34
+ uv tool install git+https://github.com/zwidny/doc_crawler.git
35
+ ```
36
+
37
+ After installation, you can use the `doc_crawler` command from any directory.
38
+
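+ A quick way to verify the installation (both flags are provided by the CLI's argument parser):
+
+ ```bash
+ doc_crawler --version
+ doc_crawler --help
+ ```
+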
39
+ ## Usage Examples
40
+
41
+ ```bash
42
+ # Crawl AKShare documentation
43
+ doc_crawler --start-urls "https://akshare.akfamily.xyz" \
44
+ --allowed-domains "akshare.akfamily.xyz" \
45
+ --deny-patterns "/_sources/" \
46
+ --body-selector "main, article, .content, .document, .body" \
47
+ --output-dir "_docs/akshare_markdown"
48
+ ```
49
+
50
+ ### Single-page mode
51
+
52
+ ```bash
53
+ doc_crawler --start-urls "https://build123d.readthedocs.io/en/stable/examples_1.html" \
54
+ --single-page true \
55
+ --body-selector ".wy-nav-content" \
56
+ --output-dir "single_page_output"
57
+ ```
58
+
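+ Note that single-page mode processes only the first start URL; any additional URLs passed via `--start-urls` are ignored.
+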
59
+ ### Path whitelist filtering
60
+
61
+ ```bash
62
+ doc_crawler --start-urls "https://opencode.ai/docs/zh-cn/" \
63
+ --allow-paths "/docs/zh-cn/" \
64
+ --body-selector "main, article, .content" \
65
+ --output-dir "_docs/opencode_docs_zh_cn"
66
+ ```
67
+
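+ With `--allow-paths` set, start URLs whose path does not begin with one of the given prefixes are skipped, and only links under those prefixes are followed.
+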
68
+ ### Crawl with html2text engine
69
+
70
+ ```bash
71
+ doc_crawler --start-urls "https://akshare.akfamily.xyz/" \
72
+ --allowed-domains "akshare.akfamily.xyz" \
73
+ --deny-patterns "/_sources/" \
74
+ --body-selector "main, article, .content, .document, .body" \
75
+ --converter-engine "html2text" \
76
+ --output-dir "_docs/akshare_markdown_html2text"
77
+ ```
78
+
79
+ ### More examples
80
+
81
+ ```bash
82
+ # Crawl build123d docs
83
+ doc_crawler --start-urls "https://build123d.readthedocs.io/en/stable/" \
84
+ --deny-patterns "/_sources/,/latest/" \
85
+ --body-selector ".wy-nav-content" \
86
+ --output-dir "_docs/build123d"
87
+
88
+ # Crawl Docusaurus docs
89
+ doc_crawler --start-urls "https://docusaurus.io/docs" \
90
+ --allow-paths "/docs" \
91
+ --body-selector ".col.docItemCol_n6xZ" \
92
+ --output-dir "_docs/docusaurus"
93
+
94
+ # Crawl uv documentation
95
+ doc_crawler --start-urls "https://docs.astral.sh/uv/" \
96
+ --body-selector ".md-content" \
97
+ --output-dir "_docs/uv"
98
+ ```
@@ -0,0 +1,83 @@
1
+ # scrapy-mth
2
+
3
+ A Scrapy-based universal documentation crawler that converts HTML documentation sites to Markdown format, with automatic internal link rewriting to local `.md` relative paths. Supports multiple converter engines (markitdown / html2text), path whitelist filtering, and automatic media file download.
4
+
5
+ ## Parameters
6
+
7
+ - `start_urls`: Starting URLs (comma-separated)
8
+ - `allowed_domains`: Allowed domains (comma-separated)
9
+ - `deny_patterns`: Regex deny patterns (comma-separated)
10
+ - `allow_paths`: Allowed path prefixes (comma-separated); only URLs starting with these prefixes will be processed
11
+ - `body_selector`: CSS selector for main HTML content (default: `"main, article, .content, .document, .body, body"`)
12
+ - `output_dir`: Output directory (default: `"~/.config/doc_crawler/_docs/{domain_name}"`, where `{domain_name}` is extracted from `start_urls`)
13
+ - `converter_engine`: Converter engine (default: `"markitdown"`; alternative: `"html2text"`)
14
+ - `single_page`: Single-page mode (default: `"false"`, set to `"true"` to crawl a single page without following links)
15
+
16
+ ## Install via UV
17
+
18
+ ```bash
19
+ uv tool install git+https://github.com/zwidny/doc_crawler.git
20
+ ```
21
+
22
+ After installation, you can use the `doc_crawler` command from any directory.
23
+
24
+ ## Usage Examples
25
+
26
+ ```bash
27
+ # Crawl AKShare documentation
28
+ doc_crawler --start-urls "https://akshare.akfamily.xyz" \
29
+ --allowed-domains "akshare.akfamily.xyz" \
30
+ --deny-patterns "/_sources/" \
31
+ --body-selector "main, article, .content, .document, .body" \
32
+ --output-dir "_docs/akshare_markdown"
33
+ ```
34
+
35
+ ### Single-page mode
36
+
37
+ ```bash
38
+ doc_crawler --start-urls "https://build123d.readthedocs.io/en/stable/examples_1.html" \
39
+ --single-page true \
40
+ --body-selector ".wy-nav-content" \
41
+ --output-dir "single_page_output"
42
+ ```
43
+
44
+ ### Path whitelist filtering
45
+
46
+ ```bash
47
+ doc_crawler --start-urls "https://opencode.ai/docs/zh-cn/" \
48
+ --allow-paths "/docs/zh-cn/" \
49
+ --body-selector "main, article, .content" \
50
+ --output-dir "_docs/opencode_docs_zh_cn"
51
+ ```
52
+
53
+ ### Crawl with html2text engine
54
+
55
+ ```bash
56
+ doc_crawler --start-urls "https://akshare.akfamily.xyz/" \
57
+ --allowed-domains "akshare.akfamily.xyz" \
58
+ --deny-patterns "/_sources/" \
59
+ --body-selector "main, article, .content, .document, .body" \
60
+ --converter-engine "html2text" \
61
+ --output-dir "_docs/akshare_markdown_html2text"
62
+ ```
63
+
64
+ ### More examples
65
+
66
+ ```bash
67
+ # Crawl build123d docs
68
+ doc_crawler --start-urls "https://build123d.readthedocs.io/en/stable/" \
69
+ --deny-patterns "/_sources/,/latest/" \
70
+ --body-selector ".wy-nav-content" \
71
+ --output-dir "_docs/build123d"
72
+
73
+ # Crawl Docusaurus docs
74
+ doc_crawler --start-urls "https://docusaurus.io/docs" \
75
+ --allow-paths "/docs" \
76
+ --body-selector ".col.docItemCol_n6xZ" \
77
+ --output-dir "_docs/docusaurus"
78
+
79
+ # Crawl uv documentation
80
+ doc_crawler --start-urls "https://docs.astral.sh/uv/" \
81
+ --body-selector ".md-content" \
82
+ --output-dir "_docs/uv"
83
+ ```
File without changes
@@ -0,0 +1,195 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Command-line interface for the universal documentation crawler.
4
+ This provides a more convenient way to run the doc_crawler spider.
5
+ """
6
+
7
+ import argparse
8
+ import os
9
+ import sys
10
+ from urllib.parse import urlparse
11
+ from scrapy.cmdline import execute
12
+
13
+ try:
14
+ from importlib.metadata import version, PackageNotFoundError
15
+ except ImportError:
16
+ # Python < 3.8 compatibility
17
+ import pkg_resources
18
+
19
+ def version(package_name):
20
+ try:
21
+ return pkg_resources.get_distribution(package_name).version
22
+ except pkg_resources.DistributionNotFound:
23
+ return None
24
+
25
+ PackageNotFoundError = pkg_resources.DistributionNotFound
26
+
27
+
28
+ def get_version():
29
+ try:
30
+ return version("html_docs_crawler")
31
+ except PackageNotFoundError:
32
+ return "unknown"
33
+
34
+
35
+ def get_domain_from_url(url):
36
+ """Extract domain from a URL string."""
37
+ parsed = urlparse(url)
38
+ domain = parsed.netloc
39
+ if not domain:
40
+ return "unknown_domain"
41
+ # Remove port if present
42
+ if ":" in domain:
43
+ domain = domain.split(":")[0]
44
+ return domain
45
+
46
+
47
+ def get_default_output_dir(start_urls):
48
+ """Generate default output directory based on the first start URL."""
49
+ if not start_urls:
50
+ return "markdown_output"
51
+
52
+ # Use the first URL to determine domain
53
+ first_url = start_urls.split(",")[0].strip()
54
+ domain = get_domain_from_url(first_url)
55
+
56
+ # Build path: ~/.config/doc_crawler/_docs/{domain}
57
+ home = os.path.expanduser("~")
58
+ return os.path.join(home, ".config", "doc_crawler", "_docs", domain)
59
+
60
+
61
+ def main():
62
+ parser = argparse.ArgumentParser(
63
+ description="Universal documentation crawler: Convert HTML pages to Markdown with internal link correction.",
64
+ formatter_class=argparse.RawDescriptionHelpFormatter,
65
+ epilog="""
66
+ Examples:
67
+ doc_crawler --start-urls https://akshare.akfamily.xyz --allowed-domains akshare.akfamily.xyz
68
+ doc_crawler --start-urls https://opencode.ai/docs/zh-cn/ --allow-paths /docs/zh-cn/
69
+ doc_crawler --start-urls https://build123d.readthedocs.io/en/stable/ --single-page true
70
+ """,
71
+ )
72
+
73
+ parser.add_argument(
74
+ "--version",
75
+ action="version",
76
+ version=f"%(prog)s {get_version()}",
77
+ help="Show version information and exit",
78
+ )
79
+
80
+ # Required arguments
81
+ parser.add_argument(
82
+ "--start-urls", required=True, help="Starting URLs (comma-separated)"
83
+ )
84
+
85
+ # Optional arguments matching the spider's parameters
86
+ parser.add_argument(
87
+ "--allowed-domains", default="", help="Allowed domains (comma-separated)"
88
+ )
89
+
90
+ parser.add_argument(
91
+ "--deny-patterns", default="", help="Regex patterns to deny (comma-separated)"
92
+ )
93
+
94
+ parser.add_argument(
95
+ "--allow-paths", default="", help="Path prefixes to allow (comma-separated)"
96
+ )
97
+
98
+ parser.add_argument(
99
+ "--body-selector",
100
+ default="main, article, .content, .document, .body, body",
101
+ help="CSS selector for HTML body content",
102
+ )
103
+
104
+ parser.add_argument(
105
+ "--output-dir",
106
+ default="",
107
+ help="Output directory for Markdown files (default: ~/.config/doc_crawler/_docs/{domain})",
108
+ )
109
+
110
+ parser.add_argument(
111
+ "--converter-engine",
112
+ default="markitdown",
113
+ choices=["markitdown", "html2text"],
114
+ help="Converter engine: 'markitdown' (default) or 'html2text'",
115
+ )
116
+
117
+ parser.add_argument(
118
+ "--single-page",
119
+ default="false",
120
+ choices=["true", "false"],
121
+ help="Single page mode (don't follow links)",
122
+ )
123
+
124
+ parser.add_argument(
125
+ "--loglevel",
126
+ default="INFO",
127
+ choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
128
+ help="Log level",
129
+ )
130
+
131
+ args = parser.parse_args()
132
+
133
+ # Determine output directory
134
+ if not args.output_dir:
135
+ args.output_dir = get_default_output_dir(args.start_urls)
136
+ # Expand user directory (~)
137
+ args.output_dir = os.path.expanduser(args.output_dir)
138
+ # Convert to absolute path before chdir
139
+ args.output_dir = os.path.abspath(args.output_dir)
140
+ # Ensure the directory exists
141
+ os.makedirs(args.output_dir, exist_ok=True)
142
+
143
+ # Change to the directory containing scrapy.cfg
144
+ # This is necessary for scrapy to find the project settings
145
+ # The cli.py file is in doc_crawler/, so go up one level to find scrapy.cfg
146
+ script_dir = os.path.dirname(os.path.abspath(__file__))
147
+ project_root = os.path.dirname(script_dir) # doc_crawler/ -> project root
148
+
149
+ # Check if scrapy.cfg exists in the project root
150
+ scrapy_cfg_path = os.path.join(project_root, "scrapy.cfg")
151
+ if os.path.exists(scrapy_cfg_path):
152
+ os.chdir(project_root)
153
+ else:
154
+ # If not found, try the parent directory
155
+ parent_dir = os.path.dirname(project_root)
156
+ scrapy_cfg_path = os.path.join(parent_dir, "scrapy.cfg")
157
+ if os.path.exists(scrapy_cfg_path):
158
+ os.chdir(parent_dir)
159
+ else:
160
+ print(
161
+ f"警告: 未找到 scrapy.cfg 文件,当前目录: {os.getcwd()}",
162
+ file=sys.stderr,
163
+ )
164
+
165
+ # Build scrapy command arguments
166
+ scrapy_args = [
167
+ "scrapy",
168
+ "crawl",
169
+ "doc_crawler",
170
+ "-a",
171
+ f"start_urls={args.start_urls}",
172
+ "-a",
173
+ f"allowed_domains={args.allowed_domains}",
174
+ "-a",
175
+ f"deny_patterns={args.deny_patterns}",
176
+ "-a",
177
+ f"allow_paths={args.allow_paths}",
178
+ "-a",
179
+ f"body_selector={args.body_selector}",
180
+ "-a",
181
+ f"output_dir={args.output_dir}",
182
+ "-a",
183
+ f"converter_engine={args.converter_engine}",
184
+ "-a",
185
+ f"single_page={args.single_page}",
186
+ "--loglevel",
187
+ args.loglevel,
188
+ ]
189
+
190
+ # Execute scrapy command
191
+ sys.exit(execute(scrapy_args))
192
+
193
+
194
+ if __name__ == "__main__":
195
+ main()
@@ -0,0 +1,9 @@
1
+ # items.py
2
+ import scrapy
3
+
4
+
5
+ class DocCrawlerItem(scrapy.Item):
6
+ url = scrapy.Field()
7
+ markdown_content = scrapy.Field()
8
+ file_path = scrapy.Field()
9
+ media_urls = scrapy.Field()  # media links (images, files, etc.) found on the page
@@ -0,0 +1,39 @@
1
+ # middlewares.py
2
+ import logging
3
+ from fake_useragent import UserAgent
4
+
5
+ class RandomUserAgentMiddleware:
6
+ """为每个请求随机设置 User-Agent 的下载中间件"""
7
+
8
+ def __init__(self):
9
+ # Initialize the UserAgent provider
10
+ self.ua = UserAgent()
11
+ # Fallback User-Agent list (used when fake-useragent fails)
12
+ self.fallback_ua_list = [
13
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
14
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
15
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
16
+ 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
17
+ 'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/121.0',
18
+ ]
19
+ self.logger = logging.getLogger(__name__)
20
+
21
+ def process_request(self, request, spider):
22
+ """在请求发送前设置 User-Agent"""
23
+ try:
24
+ # Try to get a random UA
25
+ ua_string = self.ua.random
26
+ except Exception as e:
27
+ # On failure (e.g. a network issue), pick one from the fallback list
28
+ self.logger.warning(f"获取随机 UA 失败,使用备用列表: {e}")
29
+ import random
30
+ ua_string = random.choice(self.fallback_ua_list)
31
+
32
+ # Set the request header
33
+ request.headers['User-Agent'] = ua_string
34
+ self.logger.debug(f'Using User-Agent: {ua_string[:50]}...')  # log the first 50 chars for debugging
35
+ return None  # must return None so the request continues through the middleware chain
36
+
37
+ def process_response(self, request, response, spider):
38
+ """处理响应(此处无需特殊操作)"""
39
+ return response
@@ -0,0 +1,160 @@
1
+ # pipelines.py
2
+ import os
3
+ import random
4
+ import asyncio
5
+ import urllib.request
6
+ import urllib.error
7
+ from urllib.parse import urlparse
8
+ from itemadapter import ItemAdapter
9
+
10
+
11
+ class SaveMarkdownPipeline:
12
+ def open_spider(self, spider):
13
+ self.output_dir = getattr(spider, "output_dir", "markdown_output")
14
+ if not os.path.exists(self.output_dir):
15
+ os.makedirs(self.output_dir)
16
+ spider.logger.info(f"Markdown 文件将保存到: {os.path.abspath(self.output_dir)}")
17
+
18
+ def process_item(self, item, spider):
19
+ adapter = ItemAdapter(item)
20
+ file_relative_path = adapter.get("file_path")
21
+ markdown_content = adapter.get("markdown_content", "")
22
+
23
+ if not file_relative_path or not markdown_content:
24
+ spider.logger.warning(
25
+ f"跳过 item,缺少文件路径或内容: {adapter.get('url')}"
26
+ )
27
+ return item
28
+
29
+ full_path = os.path.join(self.output_dir, file_relative_path)
30
+ dir_name = os.path.dirname(full_path)
31
+ if dir_name and not os.path.exists(dir_name):
32
+ os.makedirs(dir_name, exist_ok=True)
33
+
34
+ try:
35
+ with open(full_path, "w", encoding="utf-8") as f:
36
+ f.write(markdown_content)
37
+ spider.logger.info(f"成功保存: {full_path}")
38
+ except Exception as e:
39
+ spider.logger.error(f"保存文件失败 {full_path}: {e}")
40
+
41
+ return item
42
+
43
+
44
+ class MediaDownloadPipeline:
45
+ """下载页面中的媒体文件(图片、STL等)"""
46
+
47
+ def __init__(self):
48
+ try:
49
+ from fake_useragent import UserAgent
50
+
51
+ self.ua = UserAgent()
52
+ self.use_fake_ua = True
53
+ except ImportError:
54
+ self.use_fake_ua = False
55
+
56
+ self.fallback_ua_list = [
57
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
58
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
59
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
60
+ "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
61
+ "Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/121.0",
62
+ ]
63
+
64
+ self.proxy = None
65
+
66
+ def _setup_proxy(self, spider):
67
+ proxy = getattr(spider, "proxy", None)
68
+
69
+ if not proxy:
70
+ proxy = (
71
+ os.environ.get("HTTP_PROXY")
72
+ or os.environ.get("HTTPS_PROXY")
73
+ or os.environ.get("http_proxy")
74
+ or os.environ.get("https_proxy")
75
+ )
76
+
77
+ self.proxy = proxy
78
+ if self.proxy:
79
+ spider.logger.info(f"使用代理: {self.proxy}")
80
+
81
+ def open_spider(self, spider):
82
+ self.output_dir = getattr(spider, "output_dir", "markdown_output")
83
+ spider.logger.info(f"媒体文件将保存到: {os.path.abspath(self.output_dir)}")
84
+ self._setup_proxy(spider)
85
+
86
+ def _get_random_user_agent(self):
87
+ if self.use_fake_ua:
88
+ try:
89
+ return self.ua.random
90
+ except Exception:
91
+ pass
92
+ return random.choice(self.fallback_ua_list)
93
+
94
+ async def process_item(self, item, spider):
95
+ adapter = ItemAdapter(item)
96
+ media_urls = adapter.get("media_urls", [])
97
+
98
+ if not media_urls:
99
+ return item
100
+
101
+ tasks = [self._download_media(url, spider) for url in media_urls]
102
+ results = await asyncio.gather(*tasks, return_exceptions=True)
103
+ for url, result in zip(media_urls, results):
104
+ if isinstance(result, Exception):
105
+ spider.logger.error(f"下载媒体文件失败 {url}: {result}")
106
+
107
+ return item
108
+
109
+ async def _download_media(self, url, spider):
110
+ parsed = urlparse(url)
111
+ path = parsed.path.lstrip("/")
112
+
113
+ if not path:
114
+ spider.logger.warning(f"跳过无效路径的URL: {url}")
115
+ return
116
+
117
+ local_path = os.path.join(self.output_dir, path)
118
+ local_dir = os.path.dirname(local_path)
119
+
120
+ if local_dir and not os.path.exists(local_dir):
121
+ os.makedirs(local_dir, exist_ok=True)
122
+
123
+ if os.path.exists(local_path):
124
+ spider.logger.debug(f"文件已存在,跳过: {local_path}")
125
+ return
126
+
127
+ def _sync_download():
128
+ spider.logger.info(f"下载媒体文件: {url} -> {local_path}")
129
+ user_agent = self._get_random_user_agent()
130
+ headers = {
131
+ "User-Agent": user_agent,
132
+ "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
133
+ "Accept-Language": "en-US,en;q=0.9",
134
+ }
135
+ req = urllib.request.Request(url, headers=headers)
136
+
137
+ if self.proxy:
138
+ proxy_handler = urllib.request.ProxyHandler(
139
+ {"http": self.proxy, "https": self.proxy}
140
+ )
141
+ opener = urllib.request.build_opener(proxy_handler)
142
+ response = opener.open(req, timeout=30)
143
+ else:
144
+ response = urllib.request.urlopen(req, timeout=30)
145
+
146
+ with response:
147
+ content = response.read()
148
+ with open(local_path, "wb") as f:
149
+ f.write(content)
150
+ spider.logger.info(f"成功下载: {local_path}")
151
+
152
+ try:
153
+ await asyncio.to_thread(_sync_download)
154
+ except urllib.error.HTTPError as e:
155
+ spider.logger.warning(f"HTTP错误 {e.code} 下载 {url}: {e.reason}")
156
+ except urllib.error.URLError as e:
157
+ spider.logger.warning(f"URL错误下载 {url}: {e.reason}")
158
+ except Exception as e:
159
+ spider.logger.error(f"下载失败 {url}: {e}")
160
+ raise
@@ -0,0 +1,58 @@
1
+ # Scrapy settings for the doc_crawler project
2
+ #
3
+ # For simplicity, this file contains only settings considered important or
4
+ # commonly used. You can find more settings consulting the documentation:
5
+ #
6
+ # https://docs.scrapy.org/en/latest/topics/settings.html
7
+ # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9
+
10
+ BOT_NAME = "doc_crawler"
11
+ SPIDER_MODULES = ["doc_crawler.spiders"]
12
+ NEWSPIDER_MODULE = "doc_crawler.spiders"
13
+
14
+
15
+ ADDONS = {}
16
+
17
+
18
+ # Crawl responsibly by identifying yourself (and your website) on the user-agent
19
+ # USER_AGENT = "akshare_docs (+http://www.yourdomain.com)"
20
+
21
+
22
+ # Enable item pipelines
23
+ ITEM_PIPELINES = {
24
+ "doc_crawler.pipelines.MediaDownloadPipeline": 200, # 先下载媒体文件
25
+ "doc_crawler.pipelines.SaveMarkdownPipeline": 300, # 再保存Markdown文件
26
+ }
27
+
28
+ # Obey robots.txt rules (default True; recommended to keep)
29
+ ROBOTSTXT_OBEY = True
30
+
31
+ # Download delay in seconds, to avoid putting pressure on the server
32
+ DOWNLOAD_DELAY = 0.5  # adjust to what the site can handle; set to 1 or more if needed
33
+
34
+ # Concurrent requests (kept modest for polite crawling)
35
+ CONCURRENT_REQUESTS = 8
36
+ CONCURRENT_REQUESTS_PER_DOMAIN = 4
37
+
38
+ # Enable the cookies middleware (if a session needs to be maintained)
39
+ COOKIES_ENABLED = True
40
+
41
+ # Default request headers that mimic a browser
42
+ DEFAULT_REQUEST_HEADERS = {
43
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
44
+ "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
45
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
46
+ }
47
+
48
+
49
+ DOWNLOADER_MIDDLEWARES = {
50
+ "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
51
+ "doc_crawler.middlewares.RandomUserAgentMiddleware": 400,
52
+ }
53
+
54
+
55
+ # Enable the AutoThrottle extension (automatic rate limiting, recommended)
56
+ AUTOTHROTTLE_ENABLED = True
57
+ AUTOTHROTTLE_START_DELAY = 1.0
58
+ AUTOTHROTTLE_MAX_DELAY = 5.0
@@ -0,0 +1,4 @@
1
+ # This package will contain the spiders of your Scrapy project
2
+ #
3
+ # Please refer to the documentation for information on how to create and manage
4
+ # your spiders.
@@ -0,0 +1,494 @@
1
+ # spiders/doc_spider.py
2
+ import scrapy
3
+ from scrapy.http import HtmlResponse
4
+ from scrapy.linkextractors import LinkExtractor
5
+ from scrapy.spiders import CrawlSpider, Rule
6
+ from urllib.parse import urlparse, urljoin, urldefrag
7
+ import os
8
+ import re
9
+ import base64
10
+ import hashlib
11
+
12
+ from doc_crawler.items import DocCrawlerItem  # note: the item class name may need adjusting
13
+
14
+
15
+ class UniversalDocSpider(CrawlSpider):
16
+ """
17
+ Universal documentation spider: converts every HTML page of a target site to Markdown
18
+ and rewrites internal links to relative paths of the local .md files.
19
+ """
20
+
21
+ name = "doc_crawler"
22
+
23
+ # These attributes are set dynamically from command-line arguments
24
+ def __init__(self, *args, **kwargs):
25
+ # Copy kwargs for the parent class and pop the spider-specific arguments
26
+ spider_kwargs = kwargs.copy()
27
+
28
+ # Parse start_urls: split and drop empty strings
29
+ start_urls_raw = spider_kwargs.pop("start_urls", "")
30
+ self.start_urls = [
31
+ url.strip() for url in start_urls_raw.split(",") if url.strip()
32
+ ]
33
+ # Handle the single_page parameter
34
+ single_page = spider_kwargs.pop("single_page", "false")
35
+ self.single_page = single_page.lower() in ("true", "1", "yes")
36
+ # In single-page mode, use only the first URL if several were given
37
+ if self.single_page and len(self.start_urls) > 1:
38
+ print(f"单页面模式下,仅处理第一个 URL: {self.start_urls[0]}")
39
+ self.start_urls = self.start_urls[:1]
40
+ # Parse allowed_domains: split and drop empty strings
41
+ allowed_domains_raw = spider_kwargs.pop("allowed_domains", "")
42
+ self.allowed_domains = [
43
+ domain.strip()
44
+ for domain in allowed_domains_raw.split(",")
45
+ if domain.strip()
46
+ ]
47
+ # Parse deny_patterns
48
+ deny_patterns_raw = spider_kwargs.pop("deny_patterns", "")
49
+ self.deny_patterns = deny_patterns_raw.split(",") if deny_patterns_raw else []
50
+
51
+ self.body_selector = spider_kwargs.pop(
52
+ "body_selector", "main, article, .content, .document, .body, body"
53
+ )
54
+ print(f"body_selector 设置为: {self.body_selector}")
55
+ self.output_dir = spider_kwargs.pop("output_dir", "markdown_output")
56
+ # Pop allow_paths and converter_engine so they are not passed to the parent class
57
+ allow_paths_raw = spider_kwargs.pop("allow_paths", "")
58
+ converter_engine = spider_kwargs.pop("converter_engine", "markitdown")
59
+
60
+ # ---------- Path whitelist ----------
61
+ # allow_paths: comma-separated path prefixes, e.g. "/docs/zh-cn/, /help/"
62
+ # Only links whose path starts with one of these prefixes are extracted and followed
63
+
64
+ self.allow_paths = (
65
+ [p.strip() for p in allow_paths_raw.split(",") if p.strip()]
66
+ if allow_paths_raw
67
+ else []
68
+ )
69
+
70
+ # Filter start_urls, keeping only URLs whose path is allowed
71
+ if self.allow_paths:
72
+ filtered_start_urls = []
73
+ for url in self.start_urls:
74
+ if self._url_path_allowed(url):
75
+ filtered_start_urls.append(url)
76
+ else:
77
+ print(f"警告:跳过起始 URL(路径不匹配): {url}")
78
+ self.start_urls = filtered_start_urls
79
+
80
+ # ---------- Converter engine configuration ----------
81
+ self.converter_engine = converter_engine.lower()
82
+ self._init_converter()  # initialize the converter for the selected engine
83
+
84
+ # If allowed_domains was not provided, derive the default from the start_urls domains
85
+ if not self.allowed_domains and self.start_urls:
86
+ domains = set()
87
+ for url in self.start_urls:
88
+ parsed = urlparse(url)
89
+ domain = parsed.netloc
90
+ if domain:
91
+ domains.add(domain)
92
+ self.allowed_domains = list(domains)
93
+ print(f"从 start_urls 解析的 allowed_domains: {self.allowed_domains}")
94
+
95
+ # ---------- Build the allow and deny regexes ----------
96
+ # allow_re: if allow_paths is given, build a regex that matches only those path prefixes
97
+ if self.allow_paths:
98
+ # Escape each path for regex safety and anchor the pattern with ^
99
+ escaped_paths = [re.escape(p) for p in self.allow_paths]
100
+ # Combine into one regex: ^(path1|path2|...)
101
+ self.allow_re = "^(" + "|".join(escaped_paths) + ")"
102
+ else:
103
+ self.allow_re = None
104
+
105
+ # Build the deny regex (multiple patterns joined with |)
106
+ deny_re = "|".join(self.deny_patterns) if self.deny_patterns else None
107
+
108
+ # Decide whether link following is enabled based on single_page
109
+ if self.single_page:
110
+ self.rules = ()
111
+ else:
112
+ self.rules = (
113
+ Rule(
114
+ LinkExtractor(
115
+ allow_domains=self.allowed_domains,
116
+ # allow=self.allow_re if self.allow_re else (),
117
+ deny=deny_re if deny_re else (),
118
+ ),
119
+ callback="parse_item",
120
+ follow=True,
121
+ process_links="filter_links_by_path", # 新增:在处理链接时调用自定义过滤函数
122
+ ),
123
+ )
124
+ super().__init__(*args, **spider_kwargs)
125
+ # Recompile the rules so CrawlSpider picks them up correctly
126
+ if hasattr(self, "_compile_rules"):
127
+ self._compile_rules()
128
+ print(">>> 初始化完成,self.start_urls =", self.start_urls)
129
+ if self.single_page:
130
+ print(">>> 单页面模式已启用,将不跟随任何链接")
131
+
132
+ # Filter links before requests are generated
133
+ def filter_links_by_path(self, links):
134
+ """
135
+ Filter links by their resolved absolute path, keeping only those whose path starts with one of the allow_paths prefixes.
136
+ """
137
+ if not self.allow_paths:
138
+ return links  # no path whitelist configured, keep all links
139
+
140
+ filtered_links = []
141
+ for link in links:
142
+ if self._url_path_allowed(link.url):
143
+ filtered_links.append(link)
144
+ else:
145
+ self.logger.debug(f"过滤掉链接(路径不匹配): {link.url}")
146
+
147
+ return filtered_links
148
+
149
+ def _url_path_allowed(self, url):
150
+ """检查 URL 的路径是否以 allow_paths 中的任一前缀开头"""
151
+ if not self.allow_paths:
152
+ return True
153
+ parsed = urlparse(url)
154
+ path = parsed.path
155
+ return any(path.startswith(prefix) for prefix in self.allow_paths)
156
+
157
+ def _init_converter(self):
158
+ """根据 converter_engine 初始化转换器及转换函数"""
159
+ if self.converter_engine == "markitdown":
160
+ try:
161
+ from markitdown import MarkItDown, StreamInfo
162
+ import io
163
+
164
+ self.converter = MarkItDown(enable_plugins=True)
165
+
166
+ # markitdown's convert methods can handle HTML content directly
167
+ # Define the conversion function: explicitly declare the stream as HTML
168
+ def convert_with_streaminfo(html):
169
+ # Encode the string into a byte stream
170
+ byte_stream = io.BytesIO(html.encode("utf-8"))
171
+ # Create a StreamInfo object with the mimetype set to 'text/html'
173
+ # Note: the StreamInfo constructor can differ between versions; kwargs or a plain dict are both seen
174
+ # Here the information is passed as a keyword argument for compatibility
175
+ # Per the current documentation the recommended form is: stream_info = StreamInfo(mimetype='text/html')
176
+ # If that does not work, a plain dict can be tried: stream_info = {'mimetype': 'text/html'}
176
+ try:
177
+ # Try the StreamInfo class (if the library supports it)
178
+ stream_info = StreamInfo(mimetype="text/html")
179
+ except TypeError:
181
+ # Fallback: pass a plain dict instead (supported by some older versions)
181
+ stream_info = {"mimetype": "text/html"}
182
+
183
+ result = self.converter.convert_stream(
184
+ byte_stream, stream_info=stream_info
185
+ )
186
+ return result.text_content
187
+
188
+ self.convert_func = convert_with_streaminfo
189
+ except ImportError:
190
+ raise ImportError("markitdown 未安装,请运行: pip install markitdown")
191
+ elif self.converter_engine == "html2text":
192
+ try:
193
+ import html2text
194
+
195
+ self.converter = html2text.HTML2Text()
196
+ # Common options (extend here as needed)
197
+ self.converter.ignore_links = False
198
+ self.converter.body_width = 0
199
+ self.converter.protect_links = True
200
+ self.converter.mark_code = True
201
+ self.converter.ignore_images = False
202
+ # Do not use the image src as the alt text
203
+ self.converter.images_to_alt = False
204
+ # Keep the image width/height attributes
205
+ self.converter.images_with_size = True
206
+ self.convert_func = self.converter.handle
207
+ except ImportError:
208
+ raise ImportError("html2text 未安装,请运行: pip install html2text")
209
+ else:
210
+ raise ValueError(
211
+ f"不支持的转换引擎: {self.converter_engine},可选: markitdown, html2text"
212
+ )
213
+
214
+ def start_requests(self):
215
+ if self.single_page:
216
+ # In single-page mode, create a request per start URL and call parse_item directly
217
+ for url in self.start_urls:
218
+ yield scrapy.Request(url, callback=self.parse_item)
219
+ else:
220
+ # In normal mode, use the parent class's default behaviour
221
+ yield from super().start_requests()
222
+
223
+ def parse_item(self, response):
224
+ # Check whether the URL path is allowed
225
+ if not self._url_path_allowed(response.url):
226
+ self.logger.debug(f"跳过页面(路径不匹配): {response.url}")
227
+ return
228
+ self.logger.info(f"Parsing item from {response.url}")
229
+ item = DocCrawlerItem()
230
+ item["url"] = response.url
231
+
232
+ # Check the response type: treat non-HTML responses as media files
233
+ if not isinstance(response, HtmlResponse):
234
+ # Download the current URL as a media file
235
+ media_urls = [response.url]
236
+ item["media_urls"] = media_urls
237
+ # Generate the file path
238
+ item["file_path"] = self._url_to_file_path(response.url)
239
+ # Leave the Markdown content empty
240
+ item["markdown_content"] = ""
241
+ yield item
242
+ return
243
+
244
+ # Extract media file links (images, STL, etc.)
245
+ media_urls = self._extract_media_urls(response)
246
+ item["media_urls"] = media_urls
247
+
248
+ # Extract the main content using the user-provided CSS selector
249
+ self.logger.debug(f"使用 body_selector: {self.body_selector}")
250
+ main_content = response.css(self.body_selector).get()
251
+ self.logger.debug(f"main_content 找到: {bool(main_content)}")
252
+ raw_html = main_content if main_content else response.text
253
+ if not main_content:
254
+ self.logger.warning(
255
+ f"body_selector '{self.body_selector}' 未匹配到内容,使用完整页面"
256
+ )
257
+ self.logger.debug(f"raw_html:\n{raw_html}")
258
+ # Clean the HTML, keeping only the <body> part
259
+ if not main_content:
260
+ response.selector.remove_namespaces()
261
+ cleaned_html = response.selector.xpath("//body").get() or raw_html
262
+ else:
263
+ cleaned_html = raw_html
264
+
265
+ # Generate the local save path for the current page
266
+ current_file_path = self._url_to_file_path(response.url)
267
+ item["file_path"] = current_file_path
268
+
269
+ # Handle base64 images in the HTML: save them as local files and rewrite src
270
+ cleaned_html = self._save_base64_images(cleaned_html, current_file_path)
271
+
272
+ # Convert to Markdown
273
+ try:
274
+ markdown_text = self.convert_func(cleaned_html)
275
+
276
+ except Exception as e:
277
+ self.logger.error(f"转换失败 {response.url}: {e}", exc_info=True)
278
+ markdown_text = ""
279
+
280
+ # Rewrite internal links
281
+ if markdown_text:
282
+ markdown_text = self._convert_internal_links(
283
+ markdown_text, current_file_path, response.url
284
+ )
285
+
286
+ item["markdown_content"] = markdown_text
287
+
288
+ yield item
289
+
290
+ # ---------- Helper methods ----------
291
+ def _extract_media_urls(self, response):
292
+ """从响应中提取图片和文件链接"""
293
+ media_extensions = {
294
+ # Image formats
295
+ ".png",
296
+ ".jpg",
297
+ ".jpeg",
298
+ ".gif",
299
+ ".svg",
300
+ ".webp",
301
+ ".bmp",
302
+ ".ico",
303
+ ".tiff",
304
+ # Document/file formats
305
+ ".stl",
306
+ ".pdf",
307
+ ".zip",
308
+ ".gz",
309
+ ".tar",
310
+ ".doc",
311
+ ".docx",
312
+ ".ppt",
313
+ ".pptx",
314
+ ".xls",
315
+ ".xlsx",
316
+ ".txt",
317
+ ".csv",
318
+ ".json",
319
+ ".xml",
320
+ ".ipynb",
321
+ }
322
+
323
+ media_urls = set()
324
+
325
+ # Extract image tags
326
+ for img in response.css("img"):
327
+ src = img.attrib.get("src")
328
+ if src:
329
+ # Skip data: URLs (base64-embedded images)
330
+ if src.startswith("data:"):
331
+ continue
332
+ absolute_url = urljoin(response.url, src)
333
+ # Only download media files from internal links
334
+ if self._is_internal_link(absolute_url, response.url):
335
+ media_urls.add(absolute_url)
336
+ else:
337
+ self.logger.debug(f"跳过外部媒体链接: {absolute_url}")
338
+
339
+ # Extract files from anchor tags
340
+ for link in response.css("a"):
341
+ href = link.attrib.get("href")
342
+ if href:
343
+ # Check whether this is a media file
344
+ if any(href.lower().endswith(ext) for ext in media_extensions):
345
+ absolute_url = urljoin(response.url, href)
346
+ # Only download media files from internal links
347
+ if self._is_internal_link(absolute_url, response.url):
348
+ media_urls.add(absolute_url)
349
+ else:
350
+ self.logger.debug(f"跳过外部媒体链接: {absolute_url}")
351
+
352
+ return list(media_urls)
353
+
354
+ def _save_base64_images(self, html_content, current_file_path):
355
+ """将 HTML 中的 base64 内嵌图片保存为本地文件,并替换 src 为相对路径"""
356
+ pattern = r'(<img[^>]*?src=)["\'](data:[^"\']+)["\']([^>]*?>)'
357
+
358
+ def replace_data_url(match):
359
+ prefix = match.group(1)
360
+ data_url = match.group(2)
361
+ suffix = match.group(3)
362
+
363
+ header, _, data = data_url.partition(",")
364
+ if not data:
365
+ return match.group(0)
366
+
367
+ if ";base64" not in header:
368
+ return match.group(0)
369
+
370
+ # Determine the file extension
371
+ ext = ".png"
372
+ if "image/jpeg" in header or "image/jpg" in header:
373
+ ext = ".jpg"
374
+ elif "image/png" in header:
375
+ ext = ".png"
376
+ elif "image/gif" in header:
377
+ ext = ".gif"
378
+ elif "image/svg+xml" in header:
379
+ ext = ".svg"
380
+ elif "image/webp" in header:
381
+ ext = ".webp"
382
+
383
+ try:
384
+ image_data = base64.b64decode(data)
385
+ except Exception:
386
+ return match.group(0)
387
+
388
+ file_hash = hashlib.md5(image_data).hexdigest()[:8]
389
+ base_name = os.path.splitext(os.path.basename(current_file_path))[0]
390
+ image_filename = f"{base_name}_{file_hash}{ext}"
391
+
392
+ image_dir = os.path.join(
393
+ self.output_dir, os.path.dirname(current_file_path)
394
+ )
395
+ image_full_path = os.path.join(image_dir, image_filename)
396
+ os.makedirs(image_dir, exist_ok=True)
397
+ with open(image_full_path, "wb") as f:
398
+ f.write(image_data)
399
+
400
+ self.logger.info(f"保存 base64 图片: {image_filename}")
401
+ return f'{prefix}"{image_filename}"{suffix}'
402
+
403
+ return re.sub(pattern, replace_data_url, html_content, flags=re.IGNORECASE)
404
+
405
+ def _url_to_file_path(self, url):
406
+ parsed = urlparse(url)
407
+ path = parsed.path
408
+ if path.endswith("/"):
409
+ path = path + "index.md"
410
+ elif path.endswith(".html"):
411
+ path = path[:-5] + ".md"
412
+ else:
413
+ if not os.path.splitext(path)[1]:
414
+ path = (
415
+ os.path.join(path, "index.md")
416
+ if path.endswith("/")
417
+ else path + ".md"
418
+ )
419
+ return path.lstrip("/")
420
+
421
+ def _is_internal_link(self, url, base_url):
422
+ absolute = urljoin(base_url, url)
423
+ parsed = urlparse(absolute)
424
+ return any(parsed.netloc.endswith(domain) for domain in self.allowed_domains)
425
+
426
+ def _convert_url(self, url, base_url, current_file_path):
427
+ if not self._is_internal_link(url, base_url):
428
+ return url
429
+ target_abs = urljoin(base_url, url)
430
+ target_path = self._url_to_file_path(target_abs)
431
+ rel_path = os.path.relpath(
432
+ target_path, os.path.dirname(current_file_path)
433
+ ).replace("\\", "/")
434
+ if rel_path == ".":
435
+ rel_path = os.path.basename(target_path)
436
+ return rel_path
437
+
438
+ def _convert_internal_links(self, markdown_text, current_file_path, base_url):
439
+ # Match images and links: ![alt](url) or [text](url)
440
+ pattern = r"(!?)\[([^\]]*)\]\(([^)]+)\)"
441
+
442
+ def replace_link(match):
443
+ is_image = match.group(1) == "!"
444
+ text = match.group(2)
445
+ inner = match.group(3).strip()
446
+
447
+ # Clean image alt text: clear it if it looks like a URL (path separators or an image extension)
448
+ if is_image:
449
+ # Common image extensions
450
+ image_extensions = {
451
+ ".png",
452
+ ".jpg",
453
+ ".jpeg",
454
+ ".gif",
455
+ ".svg",
456
+ ".webp",
457
+ ".bmp",
458
+ ".ico",
459
+ ".tiff",
460
+ }
461
+ # If the alt text looks like a URL (contains '/' or '\' or ends with an image extension), clear it,
462
+ # so that normal alt text such as "Logo" or "Diagram" is left intact
463
+ text_lower = text.lower()
464
+ if (
465
+ "/" in text
466
+ or "\\" in text
467
+ or any(text_lower.endswith(ext) for ext in image_extensions)
468
+ ):
469
+ text = ""
470
+ self.logger.debug(
471
+ f"清空图片alt文本(看起来像URL): {match.group(2)}"
472
+ )
473
+
474
+ if inner.startswith("<"):
475
+ angle_match = re.match(r"<([^>]+)>", inner)
476
+ if not angle_match:
477
+ return match.group(0)
478
+ old_url = angle_match.group(1)
479
+ new_url = self._convert_url(old_url, base_url, current_file_path)
480
+ new_inner = inner.replace(f"<{old_url}>", f"<{new_url}>", 1)
481
+ else:
482
+ parts = inner.split(None, 1)
483
+ old_url = parts[0]
484
+ new_url = self._convert_url(old_url, base_url, current_file_path)
485
+ if len(parts) == 1:
486
+ new_inner = new_url
487
+ else:
488
+ new_inner = new_url + " " + parts[1]
489
+
490
+ # Rebuild the link, keeping the image marker
491
+ prefix = "!" if is_image else ""
492
+ return f"{prefix}[{text}]({new_inner})"
493
+
494
+ return re.sub(pattern, replace_link, markdown_text)
@@ -0,0 +1,98 @@
1
+ Metadata-Version: 2.4
2
+ Name: html_docs_crawler
3
+ Version: 0.1.0
4
+ Summary: Universal documentation crawler that converts HTML pages to Markdown with internal link correction
5
+ Project-URL: Homepage, https://github.com/zwidny/doc_crawler
6
+ Project-URL: Source, https://github.com/zwidny/doc_crawler
7
+ Project-URL: BugTracker, https://github.com/zwidny/doc_crawler/issues
8
+ Keywords: scrapy,crawler,markdown,documentation,web-scraping
9
+ Requires-Python: >=3.12
10
+ Description-Content-Type: text/markdown
11
+ Requires-Dist: fake-useragent>=2.2.0
12
+ Requires-Dist: html2text>=2025.4.15
13
+ Requires-Dist: markitdown[docx,pdf,pptx]>=0.1.5
14
+ Requires-Dist: scrapy>=2.14.1
15
+
16
+ # scrapy-mth
17
+
18
+ A Scrapy-based universal documentation crawler that converts HTML documentation sites to Markdown format, with automatic internal link rewriting to local `.md` relative paths. Supports multiple converter engines (markitdown / html2text), path whitelist filtering, and automatic media file download.
19
+
20
+ ## Parameters
21
+
22
+ - `start_urls`: Starting URLs (comma-separated)
23
+ - `allowed_domains`: Allowed domains (comma-separated)
24
+ - `deny_patterns`: Regex deny patterns (comma-separated)
25
+ - `allow_paths`: Allowed path prefixes (comma-separated); only URLs starting with these prefixes will be processed
26
+ - `body_selector`: CSS selector for main HTML content (default: `"main, article, .content, .document, .body, body"`)
27
+ - `output_dir`: Output directory (default: `"~/.config/doc_crawler/_docs/{domain_name}"`, where `{domain_name}` is extracted from `start_urls`)
28
+ - `converter_engine`: Converter engine (default: `"markitdown"`; alternative: `"html2text"`)
29
+ - `single_page`: Single-page mode (default: `"false"`, set to `"true"` to crawl a single page without following links)
30
+
31
+ ## Install via UV
32
+
33
+ ```bash
34
+ uv tool install git+https://github.com/zwidny/doc_crawler.git
35
+ ```
36
+
37
+ After installation, you can use the `doc_crawler` command from any directory.
38
+
39
+ ## Usage Examples
40
+
41
+ ```bash
42
+ # Crawl AKShare documentation
43
+ doc_crawler --start-urls "https://akshare.akfamily.xyz" \
44
+ --allowed-domains "akshare.akfamily.xyz" \
45
+ --deny-patterns "/_sources/" \
46
+ --body-selector "main, article, .content, .document, .body" \
47
+ --output-dir "_docs/akshare_markdown"
48
+ ```
49
+
50
+ ### Single-page mode
51
+
52
+ ```bash
53
+ doc_crawler --start-urls "https://build123d.readthedocs.io/en/stable/examples_1.html" \
54
+ --single-page true \
55
+ --body-selector ".wy-nav-content" \
56
+ --output-dir "single_page_output"
57
+ ```
58
+
59
+ ### Path whitelist filtering
60
+
61
+ ```bash
62
+ doc_crawler --start-urls "https://opencode.ai/docs/zh-cn/" \
63
+ --allow-paths "/docs/zh-cn/" \
64
+ --body-selector "main, article, .content" \
65
+ --output-dir "_docs/opencode_docs_zh_cn"
66
+ ```
67
+
68
+ ### Crawl with html2text engine
69
+
70
+ ```bash
71
+ doc_crawler --start-urls "https://akshare.akfamily.xyz/" \
72
+ --allowed-domains "akshare.akfamily.xyz" \
73
+ --deny-patterns "/_sources/" \
74
+ --body-selector "main, article, .content, .document, .body" \
75
+ --converter-engine "html2text" \
76
+ --output-dir "_docs/akshare_markdown_html2text"
77
+ ```
78
+
79
+ ### More examples
80
+
81
+ ```bash
82
+ # Crawl build123d docs
83
+ doc_crawler --start-urls "https://build123d.readthedocs.io/en/stable/" \
84
+ --deny-patterns "/_sources/,/latest/" \
85
+ --body-selector ".wy-nav-content" \
86
+ --output-dir "_docs/build123d"
87
+
88
+ # Crawl Docusaurus docs
89
+ doc_crawler --start-urls "https://docusaurus.io/docs" \
90
+ --allow-paths "/docs" \
91
+ --body-selector ".col.docItemCol_n6xZ" \
92
+ --output-dir "_docs/docusaurus"
93
+
94
+ # Crawl uv documentation
95
+ doc_crawler --start-urls "https://docs.astral.sh/uv/" \
96
+ --body-selector ".md-content" \
97
+ --output-dir "_docs/uv"
98
+ ```
@@ -0,0 +1,17 @@
1
+ README.md
2
+ pyproject.toml
3
+ doc_crawler/__init__.py
4
+ doc_crawler/cli.py
5
+ doc_crawler/items.py
6
+ doc_crawler/middlewares.py
7
+ doc_crawler/pipelines.py
8
+ doc_crawler/settings.py
9
+ doc_crawler/../scrapy.cfg
10
+ doc_crawler/spiders/__init__.py
11
+ doc_crawler/spiders/doc_spider.py
12
+ html_docs_crawler.egg-info/PKG-INFO
13
+ html_docs_crawler.egg-info/SOURCES.txt
14
+ html_docs_crawler.egg-info/dependency_links.txt
15
+ html_docs_crawler.egg-info/entry_points.txt
16
+ html_docs_crawler.egg-info/requires.txt
17
+ html_docs_crawler.egg-info/top_level.txt
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ doc_crawler = doc_crawler.cli:main
@@ -0,0 +1,4 @@
1
+ fake-useragent>=2.2.0
2
+ html2text>=2025.4.15
3
+ markitdown[docx,pdf,pptx]>=0.1.5
4
+ scrapy>=2.14.1
@@ -0,0 +1,30 @@
1
+ [project]
2
+ name = "html_docs_crawler"
3
+ version = "0.1.0"
4
+ description = "Universal documentation crawler that converts HTML pages to Markdown with internal link correction"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ keywords = ["scrapy", "crawler", "markdown", "documentation", "web-scraping"]
8
+ dependencies = [
9
+ "fake-useragent>=2.2.0",
10
+ "html2text>=2025.4.15",
11
+ "markitdown[docx,pdf,pptx]>=0.1.5",
12
+ "scrapy>=2.14.1",
13
+ ]
14
+
15
+ [project.urls]
16
+ Homepage = "https://github.com/zwidny/doc_crawler"
17
+ Source = "https://github.com/zwidny/doc_crawler"
18
+ BugTracker = "https://github.com/zwidny/doc_crawler/issues"
19
+
20
+ [tool.setuptools.package-data]
21
+ doc_crawler = ["../scrapy.cfg"]
22
+
23
+ [build-system]
24
+ requires = ["setuptools>=64.0"]
25
+ build-backend = "setuptools.build_meta"
26
+
27
+ [project.scripts]
28
+ doc_crawler = "doc_crawler.cli:main"
29
+
30
+
@@ -0,0 +1,11 @@
1
+ # Automatically created by: scrapy startproject
2
+ #
3
+ # For more information about the [deploy] section see:
4
+ # https://scrapyd.readthedocs.io/en/latest/deploy.html
5
+
6
+ [settings]
7
+ default = doc_crawler.settings
8
+
9
+ [deploy]
10
+ #url = http://localhost:6800/
11
+ project = doc_crawler
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+