html_docs_crawler 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doc_crawler/__init__.py +0 -0
- doc_crawler/cli.py +195 -0
- doc_crawler/items.py +9 -0
- doc_crawler/middlewares.py +39 -0
- doc_crawler/pipelines.py +160 -0
- doc_crawler/settings.py +58 -0
- doc_crawler/spiders/__init__.py +4 -0
- doc_crawler/spiders/doc_spider.py +494 -0
- html_docs_crawler-0.1.0.dist-info/METADATA +98 -0
- html_docs_crawler-0.1.0.dist-info/RECORD +14 -0
- html_docs_crawler-0.1.0.dist-info/WHEEL +5 -0
- html_docs_crawler-0.1.0.dist-info/entry_points.txt +2 -0
- html_docs_crawler-0.1.0.dist-info/top_level.txt +1 -0
- scrapy.cfg +11 -0
doc_crawler/__init__.py
ADDED
|
File without changes
|
doc_crawler/cli.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Command-line interface for the universal documentation crawler.
|
|
4
|
+
This provides a more convenient way to run the doc_crawler spider.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import os
|
|
9
|
+
import sys
|
|
10
|
+
from urllib.parse import urlparse
|
|
11
|
+
from scrapy.cmdline import execute
|
|
12
|
+
|
|
13
|
+
try:
    from importlib.metadata import version, PackageNotFoundError
except ImportError:
    # Python < 3.8 compatibility: emulate importlib.metadata with pkg_resources.
    import pkg_resources

    # Alias first so the fallback version() below can raise the same
    # exception type that get_version() catches.
    PackageNotFoundError = pkg_resources.DistributionNotFound

    def version(package_name):
        """Return the installed version of *package_name*.

        Mirrors importlib.metadata.version(): raises PackageNotFoundError
        (pkg_resources.DistributionNotFound) when the package is missing.
        The previous fallback returned None instead of raising, which made
        get_version() return None rather than "unknown" on Python < 3.8.
        """
        return pkg_resources.get_distribution(package_name).version
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_version():
    """Return this package's installed version, or "unknown" when not installed."""
    try:
        pkg_version = version("html_docs_crawler")
    except PackageNotFoundError:
        return "unknown"
    return pkg_version
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_domain_from_url(url):
    """Extract the host name from a URL string.

    Returns "unknown_domain" when the URL has no network location
    (e.g. a bare path or malformed input).
    """
    # urlparse().hostname strips the port and IPv6 brackets (and lowercases
    # the host, which is safe: host names are case-insensitive). The previous
    # domain.split(":")[0] approach broke on IPv6 literals such as
    # http://[::1]:8080/ (it returned "[").
    host = urlparse(url).hostname
    return host if host else "unknown_domain"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def get_default_output_dir(start_urls):
    """Generate default output directory based on the first start URL."""
    if not start_urls:
        return "markdown_output"

    # Only the first comma-separated URL determines the directory name.
    first_url = start_urls.split(",", 1)[0].strip()
    domain = get_domain_from_url(first_url)

    # Default layout: ~/.config/doc_crawler/_docs/{domain}
    return os.path.join(
        os.path.expanduser("~"), ".config", "doc_crawler", "_docs", domain
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def main():
    """CLI entry point: parse arguments and launch the `doc_crawler` spider.

    Builds an `scrapy crawl doc_crawler -a key=value ...` command line from
    the parsed arguments and hands it to scrapy's own command runner.
    Side effects: creates the output directory and chdir()s to the directory
    containing scrapy.cfg so scrapy can locate the project settings.
    """
    parser = argparse.ArgumentParser(
        description="Universal documentation crawler: Convert HTML pages to Markdown with internal link correction.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  doc_crawler --start-urls https://akshare.akfamily.xyz --allowed-domains akshare.akfamily.xyz
  doc_crawler --start-urls https://opencode.ai/docs/zh-cn/ --allow-paths /docs/zh-cn/
  doc_crawler --start-urls https://build123d.readthedocs.io/en/stable/ --single-page
""",
    )

    parser.add_argument(
        "--version",
        action="version",
        version=f"%(prog)s {get_version()}",
        help="Show version information and exit",
    )

    # Required arguments
    parser.add_argument(
        "--start-urls", required=True, help="Starting URLs (comma-separated)"
    )

    # Optional arguments matching the spider's parameters
    parser.add_argument(
        "--allowed-domains", default="", help="Allowed domains (comma-separated)"
    )

    parser.add_argument(
        "--deny-patterns", default="", help="Regex patterns to deny (comma-separated)"
    )

    parser.add_argument(
        "--allow-paths", default="", help="Path prefixes to allow (comma-separated)"
    )

    parser.add_argument(
        "--body-selector",
        default="main, article, .content, .document, .body, body",
        help="CSS selector for HTML body content",
    )

    parser.add_argument(
        "--output-dir",
        default="",
        help="Output directory for Markdown files (default: ~/.config/doc_crawler/_docs/{domain})",
    )

    parser.add_argument(
        "--converter-engine",
        default="markitdown",
        choices=["markitdown", "html2text"],
        help="Converter engine: 'markitdown' (default) or 'html2text'",
    )

    parser.add_argument(
        "--single-page",
        default="false",
        choices=["true", "false"],
        help="Single page mode (don't follow links)",
    )

    parser.add_argument(
        "--loglevel",
        default="INFO",
        choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
        help="Log level",
    )

    args = parser.parse_args()

    # Determine output directory: derive a default from the first start URL
    # when the user didn't supply one.
    if not args.output_dir:
        args.output_dir = get_default_output_dir(args.start_urls)
    # Expand user directory (~)
    args.output_dir = os.path.expanduser(args.output_dir)
    # Convert to absolute path before chdir (a relative path would otherwise
    # resolve against the scrapy project root, not the invocation directory).
    args.output_dir = os.path.abspath(args.output_dir)
    # Ensure the directory exists
    os.makedirs(args.output_dir, exist_ok=True)

    # Change to the directory containing scrapy.cfg.
    # This is necessary for scrapy to find the project settings.
    # cli.py lives in doc_crawler/, so go up one level to find scrapy.cfg.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    project_root = os.path.dirname(script_dir)  # doc_crawler/ -> project root

    # Check if scrapy.cfg exists in the project root
    scrapy_cfg_path = os.path.join(project_root, "scrapy.cfg")
    if os.path.exists(scrapy_cfg_path):
        os.chdir(project_root)
    else:
        # If not found, try the parent directory (e.g. editable installs).
        parent_dir = os.path.dirname(project_root)
        scrapy_cfg_path = os.path.join(parent_dir, "scrapy.cfg")
        if os.path.exists(scrapy_cfg_path):
            os.chdir(parent_dir)
        else:
            # Warn (in Chinese, matching the tool's locale) and continue;
            # scrapy may still find settings via SCRAPY_SETTINGS_MODULE.
            print(
                f"警告: 未找到 scrapy.cfg 文件,当前目录: {os.getcwd()}",
                file=sys.stderr,
            )

    # Build scrapy command arguments: every CLI flag is forwarded as a
    # spider argument (-a key=value) except --loglevel, which is a scrapy
    # command option.
    scrapy_args = [
        "scrapy",
        "crawl",
        "doc_crawler",
        "-a",
        f"start_urls={args.start_urls}",
        "-a",
        f"allowed_domains={args.allowed_domains}",
        "-a",
        f"deny_patterns={args.deny_patterns}",
        "-a",
        f"allow_paths={args.allow_paths}",
        "-a",
        f"body_selector={args.body_selector}",
        "-a",
        f"output_dir={args.output_dir}",
        "-a",
        f"converter_engine={args.converter_engine}",
        "-a",
        f"single_page={args.single_page}",
        "--loglevel",
        args.loglevel,
    ]

    # Execute scrapy command (execute() typically exits the process itself).
    sys.exit(execute(scrapy_args))


if __name__ == "__main__":
    main()
|
doc_crawler/items.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# middlewares.py
|
|
2
|
+
import logging
|
|
3
|
+
from fake_useragent import UserAgent
|
|
4
|
+
|
|
5
|
+
class RandomUserAgentMiddleware:
    """Downloader middleware that assigns a random User-Agent to every request."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        # fake-useragent supplies fresh, realistic browser UA strings.
        self.ua = UserAgent()
        # Static fallbacks for when fake-useragent cannot deliver one
        # (e.g. its data source is unreachable).
        self.fallback_ua_list = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
            'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1',
            'Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/121.0',
        ]

    def process_request(self, request, spider):
        """Set a random User-Agent header before the request goes out."""
        try:
            ua_string = self.ua.random
        except Exception as e:
            # fake-useragent failed (network issue etc.): fall back to the
            # static list instead of letting the request go out unmodified.
            self.logger.warning(f"获取随机 UA 失败,使用备用列表: {e}")
            import random
            ua_string = random.choice(self.fallback_ua_list)

        request.headers['User-Agent'] = ua_string
        # Log a 50-char prefix only, to keep debug output readable.
        self.logger.debug(f'使用 User-Agent: {ua_string[:50]}...')
        # Returning None tells Scrapy to continue processing the request.
        return None

    def process_response(self, request, response, spider):
        """Pass responses through unchanged."""
        return response
|
doc_crawler/pipelines.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# pipelines.py
|
|
2
|
+
import os
|
|
3
|
+
import random
|
|
4
|
+
import asyncio
|
|
5
|
+
import urllib.request
|
|
6
|
+
import urllib.error
|
|
7
|
+
from urllib.parse import urlparse
|
|
8
|
+
from itemadapter import ItemAdapter
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class SaveMarkdownPipeline:
    """Persist each item's Markdown content under the spider's output_dir."""

    def open_spider(self, spider):
        """Resolve the output directory from the spider and ensure it exists."""
        self.output_dir = getattr(spider, "output_dir", "markdown_output")
        # exist_ok=True replaces the race-prone exists()+makedirs() pair and
        # matches the directory handling already used in process_item().
        os.makedirs(self.output_dir, exist_ok=True)
        spider.logger.info(f"Markdown 文件将保存到: {os.path.abspath(self.output_dir)}")

    def process_item(self, item, spider):
        """Write the item's markdown_content to file_path (relative to output_dir).

        Items lacking a path or content (e.g. media-only items) pass through
        untouched; write failures are logged but never abort the crawl.
        """
        adapter = ItemAdapter(item)
        file_relative_path = adapter.get("file_path")
        markdown_content = adapter.get("markdown_content", "")

        if not file_relative_path or not markdown_content:
            spider.logger.warning(
                f"跳过 item,缺少文件路径或内容: {adapter.get('url')}"
            )
            return item

        full_path = os.path.join(self.output_dir, file_relative_path)
        dir_name = os.path.dirname(full_path)
        if dir_name:
            os.makedirs(dir_name, exist_ok=True)

        try:
            with open(full_path, "w", encoding="utf-8") as f:
                f.write(markdown_content)
            spider.logger.info(f"成功保存: {full_path}")
        except Exception as e:
            # Best-effort: one failed write should not stop the pipeline.
            spider.logger.error(f"保存文件失败 {full_path}: {e}")

        return item
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class MediaDownloadPipeline:
    """Download media files (images, STL, etc.) referenced by crawled pages."""

    def __init__(self):
        # fake-useragent is optional for this pipeline: when it is missing
        # we rely solely on the static fallback list below.
        try:
            from fake_useragent import UserAgent

            self.ua = UserAgent()
            self.use_fake_ua = True
        except ImportError:
            self.use_fake_ua = False

        # Static User-Agent strings used when fake-useragent is unavailable
        # or fails at runtime.
        self.fallback_ua_list = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1",
            "Mozilla/5.0 (Windows NT 10.0; rv:109.0) Gecko/20100101 Firefox/121.0",
        ]

        # Proxy URL; resolved in _setup_proxy() once the spider is known.
        self.proxy = None

    def _setup_proxy(self, spider):
        """Resolve the proxy: spider attribute first, then environment variables."""
        proxy = getattr(spider, "proxy", None)

        if not proxy:
            proxy = (
                os.environ.get("HTTP_PROXY")
                or os.environ.get("HTTPS_PROXY")
                or os.environ.get("http_proxy")
                or os.environ.get("https_proxy")
            )

        self.proxy = proxy
        if self.proxy:
            spider.logger.info(f"使用代理: {self.proxy}")

    def open_spider(self, spider):
        # Media files live under the same output directory as the Markdown
        # files so relative links in the converted pages resolve.
        self.output_dir = getattr(spider, "output_dir", "markdown_output")
        spider.logger.info(f"媒体文件将保存到: {os.path.abspath(self.output_dir)}")
        self._setup_proxy(spider)

    def _get_random_user_agent(self):
        """Return a random UA string; any fake-useragent failure falls back to the static list."""
        if self.use_fake_ua:
            try:
                return self.ua.random
            except Exception:
                pass
        return random.choice(self.fallback_ua_list)

    async def process_item(self, item, spider):
        """Download all of the item's media URLs concurrently.

        Failures are logged but never block the item from reaching later
        pipelines (gather(return_exceptions=True) converts them to results).
        """
        adapter = ItemAdapter(item)
        media_urls = adapter.get("media_urls", [])

        if not media_urls:
            return item

        tasks = [self._download_media(url, spider) for url in media_urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for url, result in zip(media_urls, results):
            if isinstance(result, Exception):
                spider.logger.error(f"下载媒体文件失败 {url}: {result}")

        return item

    async def _download_media(self, url, spider):
        """Download one URL, mirroring its path under output_dir.

        Skips URLs with an empty path and files that already exist locally.
        HTTP/URL errors are logged as warnings; any other failure is
        re-raised so process_item() can report it.
        """
        parsed = urlparse(url)
        path = parsed.path.lstrip("/")

        if not path:
            spider.logger.warning(f"跳过无效路径的URL: {url}")
            return

        local_path = os.path.join(self.output_dir, path)
        local_dir = os.path.dirname(local_path)

        if local_dir and not os.path.exists(local_dir):
            os.makedirs(local_dir, exist_ok=True)

        # Existing file == already downloaded on a previous run/page.
        if os.path.exists(local_path):
            spider.logger.debug(f"文件已存在,跳过: {local_path}")
            return

        def _sync_download():
            # Blocking urllib download; executed in a worker thread via
            # asyncio.to_thread() below so it doesn't stall the event loop.
            spider.logger.info(f"下载媒体文件: {url} -> {local_path}")
            user_agent = self._get_random_user_agent()
            headers = {
                "User-Agent": user_agent,
                "Accept": "image/webp,image/apng,image/*,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            }
            req = urllib.request.Request(url, headers=headers)

            if self.proxy:
                proxy_handler = urllib.request.ProxyHandler(
                    {"http": self.proxy, "https": self.proxy}
                )
                opener = urllib.request.build_opener(proxy_handler)
                response = opener.open(req, timeout=30)
            else:
                response = urllib.request.urlopen(req, timeout=30)

            with response:
                content = response.read()
            with open(local_path, "wb") as f:
                f.write(content)
            spider.logger.info(f"成功下载: {local_path}")

        try:
            await asyncio.to_thread(_sync_download)
        except urllib.error.HTTPError as e:
            spider.logger.warning(f"HTTP错误 {e.code} 下载 {url}: {e.reason}")
        except urllib.error.URLError as e:
            spider.logger.warning(f"URL错误下载 {url}: {e.reason}")
        except Exception as e:
            # Re-raise so gather() surfaces this as an exception result.
            spider.logger.error(f"下载失败 {url}: {e}")
            raise
|
doc_crawler/settings.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Scrapy settings for the doc_crawler project.
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "doc_crawler"
SPIDER_MODULES = ["doc_crawler.spiders"]
NEWSPIDER_MODULE = "doc_crawler.spiders"


ADDONS = {}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = "akshare_docs (+http://www.yourdomain.com)"


# Item pipelines, run in ascending priority order.
ITEM_PIPELINES = {
    "doc_crawler.pipelines.MediaDownloadPipeline": 200,  # download media files first
    "doc_crawler.pipelines.SaveMarkdownPipeline": 300,  # then persist Markdown files
}

# Obey robots.txt rules (default True; recommended to keep enabled).
ROBOTSTXT_OBEY = True

# Download delay in seconds, to avoid overloading the target server.
DOWNLOAD_DELAY = 0.5  # tune per site tolerance; use 1s or more for fragile hosts

# Concurrency limits (kept modest for polite crawling).
CONCURRENT_REQUESTS = 8
CONCURRENT_REQUESTS_PER_DOMAIN = 4

# Keep the cookies middleware enabled (needed to maintain sessions).
COOKIES_ENABLED = True

# Default request headers that mimic a real browser.
DEFAULT_REQUEST_HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
}


DOWNLOADER_MIDDLEWARES = {
    # Disable the built-in UA middleware in favour of the random-UA one.
    "scrapy.downloadermiddlewares.useragent.UserAgentMiddleware": None,
    "doc_crawler.middlewares.RandomUserAgentMiddleware": 400,
}


# Enable the AutoThrottle extension (adaptive rate limiting; recommended).
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1.0
AUTOTHROTTLE_MAX_DELAY = 5.0
|
|
@@ -0,0 +1,494 @@
|
|
|
1
|
+
# spiders/doc_spider.py
|
|
2
|
+
import scrapy
|
|
3
|
+
from scrapy.http import HtmlResponse
|
|
4
|
+
from scrapy.linkextractors import LinkExtractor
|
|
5
|
+
from scrapy.spiders import CrawlSpider, Rule
|
|
6
|
+
from urllib.parse import urlparse, urljoin, urldefrag
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import base64
|
|
10
|
+
import hashlib
|
|
11
|
+
|
|
12
|
+
from doc_crawler.items import DocCrawlerItem # 注意:item 类名可能需要调整
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class UniversalDocSpider(CrawlSpider):
|
|
16
|
+
"""
|
|
17
|
+
通用文档爬虫:将指定网站的所有 HTML 页面转换为 Markdown,
|
|
18
|
+
并自动修正内部链接为本地 .md 文件的相对路径。
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
name = "doc_crawler"
|
|
22
|
+
|
|
23
|
+
# 这些属性将通过命令行参数动态设置
|
|
24
|
+
    def __init__(self, *args, **kwargs):
        """Configure the spider from comma-separated string kwargs.

        All custom arguments arrive as strings from `scrapy crawl -a key=value`.
        Each is pop()ed from a copy of kwargs so only unknown arguments are
        forwarded to CrawlSpider.__init__. NOTE: the rules tuple must be built
        BEFORE calling super().__init__(), which compiles self.rules.
        """
        # Copy kwargs so we can strip our own parameters before calling super().
        spider_kwargs = kwargs.copy()

        # start_urls: comma-separated list; empty entries are dropped.
        start_urls_raw = spider_kwargs.pop("start_urls", "")
        self.start_urls = [
            url.strip() for url in start_urls_raw.split(",") if url.strip()
        ]
        # single_page: string flag; any of "true"/"1"/"yes" (case-insensitive) enables it.
        single_page = spider_kwargs.pop("single_page", "false")
        self.single_page = single_page.lower() in ("true", "1", "yes")
        # Single-page mode processes exactly one URL: keep only the first.
        if self.single_page and len(self.start_urls) > 1:
            print(f"单页面模式下,仅处理第一个 URL: {self.start_urls[0]}")
            self.start_urls = self.start_urls[:1]
        # allowed_domains: comma-separated; empty entries dropped.
        allowed_domains_raw = spider_kwargs.pop("allowed_domains", "")
        self.allowed_domains = [
            domain.strip()
            for domain in allowed_domains_raw.split(",")
            if domain.strip()
        ]
        # deny_patterns: comma-separated regex fragments (NOTE: not stripped,
        # unlike the other list arguments).
        deny_patterns_raw = spider_kwargs.pop("deny_patterns", "")
        self.deny_patterns = deny_patterns_raw.split(",") if deny_patterns_raw else []

        self.body_selector = spider_kwargs.pop(
            "body_selector", "main, article, .content, .document, .body, body"
        )
        print(f"body_selector 设置为: {self.body_selector}")
        self.output_dir = spider_kwargs.pop("output_dir", "markdown_output")
        # Pop allow_paths and converter_engine so they don't reach the parent class.
        allow_paths_raw = spider_kwargs.pop("allow_paths", "")
        converter_engine = spider_kwargs.pop("converter_engine", "markitdown")

        # ---------- Path whitelist ----------
        # allow_paths: comma-separated path prefixes, e.g. "/docs/zh-cn/, /help/".
        # Only links whose path starts with one of these prefixes are
        # extracted and followed.
        self.allow_paths = (
            [p.strip() for p in allow_paths_raw.split(",") if p.strip()]
            if allow_paths_raw
            else []
        )

        # Filter start_urls down to those whose path is whitelisted.
        if self.allow_paths:
            filtered_start_urls = []
            for url in self.start_urls:
                if self._url_path_allowed(url):
                    filtered_start_urls.append(url)
                else:
                    print(f"警告:跳过起始 URL(路径不匹配): {url}")
            self.start_urls = filtered_start_urls

        # ---------- Converter engine setup ----------
        self.converter_engine = converter_engine.lower()
        self._init_converter()  # sets self.converter / self.convert_func

        # When allowed_domains wasn't given, derive it from the start URLs'
        # netlocs so the LinkExtractor stays on-site by default.
        if not self.allowed_domains and self.start_urls:
            domains = set()
            for url in self.start_urls:
                parsed = urlparse(url)
                domain = parsed.netloc
                if domain:
                    domains.add(domain)
            self.allowed_domains = list(domains)
            print(f"从 start_urls 解析的 allowed_domains: {self.allowed_domains}")

        # ---------- Build allow/deny regexes ----------
        # allow_re: "^(prefix1|prefix2|...)" over re.escape()d prefixes.
        # Currently only kept for reference — filtering is done in
        # filter_links_by_path() against the absolute URL path instead.
        if self.allow_paths:
            escaped_paths = [re.escape(p) for p in self.allow_paths]
            self.allow_re = "^(" + "|".join(escaped_paths) + ")"
        else:
            self.allow_re = None

        # deny regex: multiple user patterns OR-ed together.
        deny_re = "|".join(self.deny_patterns) if self.deny_patterns else None

        # Single-page mode disables link following entirely (empty rules).
        if self.single_page:
            self.rules = ()
        else:
            self.rules = (
                Rule(
                    LinkExtractor(
                        allow_domains=self.allowed_domains,
                        # allow=self.allow_re if self.allow_re else (),
                        deny=deny_re if deny_re else (),
                    ),
                    callback="parse_item",
                    follow=True,
                    # Custom path filter applied to extracted links (handles
                    # relative links resolved to absolute URLs correctly).
                    process_links="filter_links_by_path",
                ),
            )
        super().__init__(*args, **spider_kwargs)
        # Recompile rules in case the parent compiled before ours were set.
        if hasattr(self, "_compile_rules"):
            self._compile_rules()
        print(">>> 初始化完成,self.start_urls =", self.start_urls)
        if self.single_page:
            print(">>> 单页面模式已启用,将不跟随任何链接")
|
|
131
|
+
|
|
132
|
+
# 新增一个方法,用于在生成请求前过滤链接
|
|
133
|
+
def filter_links_by_path(self, links):
|
|
134
|
+
"""
|
|
135
|
+
根据解析后的绝对路径过滤链接,只保留路径以 allow_paths 中任一前缀开头的链接。
|
|
136
|
+
"""
|
|
137
|
+
if not self.allow_paths:
|
|
138
|
+
return links # 如果没有设置路径白名单,返回所有链接
|
|
139
|
+
|
|
140
|
+
filtered_links = []
|
|
141
|
+
for link in links:
|
|
142
|
+
if self._url_path_allowed(link.url):
|
|
143
|
+
filtered_links.append(link)
|
|
144
|
+
else:
|
|
145
|
+
self.logger.debug(f"过滤掉链接(路径不匹配): {link.url}")
|
|
146
|
+
|
|
147
|
+
return filtered_links
|
|
148
|
+
|
|
149
|
+
def _url_path_allowed(self, url):
|
|
150
|
+
"""检查 URL 的路径是否以 allow_paths 中的任一前缀开头"""
|
|
151
|
+
if not self.allow_paths:
|
|
152
|
+
return True
|
|
153
|
+
parsed = urlparse(url)
|
|
154
|
+
path = parsed.path
|
|
155
|
+
return any(path.startswith(prefix) for prefix in self.allow_paths)
|
|
156
|
+
|
|
157
|
+
    def _init_converter(self):
        """Initialise self.converter and self.convert_func for the chosen engine.

        Raises ImportError when the selected engine's package is missing and
        ValueError for an unknown engine name.
        """
        if self.converter_engine == "markitdown":
            try:
                from markitdown import MarkItDown, StreamInfo
                import io

                self.converter = MarkItDown(enable_plugins=True)

                # markitdown's convert_stream() needs a byte stream; wrap the
                # HTML string and declare its MIME type explicitly so it isn't
                # mis-detected.
                def convert_with_streaminfo(html):
                    # Encode the string into an in-memory byte stream.
                    byte_stream = io.BytesIO(html.encode("utf-8"))
                    # StreamInfo's constructor varies between markitdown
                    # versions; fall back to a plain dict for older releases.
                    # NOTE(review): a bad kwarg would raise TypeError, not
                    # ImportError — this fallback likely never triggers;
                    # confirm against the installed markitdown version.
                    try:
                        stream_info = StreamInfo(mimetype="text/html")
                    except ImportError:
                        stream_info = {"mimetype": "text/html"}

                    result = self.converter.convert_stream(
                        byte_stream, stream_info=stream_info
                    )
                    return result.text_content

                self.convert_func = convert_with_streaminfo
            except ImportError:
                raise ImportError("markitdown 未安装,请运行: pip install markitdown")
        elif self.converter_engine == "html2text":
            try:
                import html2text

                self.converter = html2text.HTML2Text()
                # Commonly useful options (extend here as needed).
                self.converter.ignore_links = False
                self.converter.body_width = 0  # no hard line wrapping
                self.converter.protect_links = True
                self.converter.mark_code = True
                self.converter.ignore_images = False
                # Don't substitute the image src for missing alt text.
                self.converter.images_to_alt = False
                # Keep width/height attributes on images.
                self.converter.images_with_size = True
                self.convert_func = self.converter.handle
            except ImportError:
                raise ImportError("html2text 未安装,请运行: pip install html2text")
        else:
            raise ValueError(
                f"不支持的转换引擎: {self.converter_engine},可选: markitdown, html2text"
            )
|
|
213
|
+
|
|
214
|
+
def start_requests(self):
|
|
215
|
+
if self.single_page:
|
|
216
|
+
# 单页面模式下,为每个起始URL创建请求,直接调用parse_item
|
|
217
|
+
for url in self.start_urls:
|
|
218
|
+
yield scrapy.Request(url, callback=self.parse_item)
|
|
219
|
+
else:
|
|
220
|
+
# 普通模式下,使用父类的默认行为
|
|
221
|
+
yield from super().start_requests()
|
|
222
|
+
|
|
223
|
+
    def parse_item(self, response):
        """Convert one fetched page to Markdown and yield a DocCrawlerItem.

        Non-HTML responses are yielded as media-only items (empty Markdown)
        so MediaDownloadPipeline saves the file itself. Conversion failures
        are logged and produce an item with empty markdown_content.
        """
        # Respect the allow_paths whitelist even for pages reached directly.
        if not self._url_path_allowed(response.url):
            self.logger.debug(f"跳过页面(路径不匹配): {response.url}")
            return
        self.logger.info(f"Parsing item from {response.url}")
        item = DocCrawlerItem()
        item["url"] = response.url

        # Non-HTML response (PDF, image, ...): treat the URL itself as the
        # media file and short-circuit — no Markdown conversion possible.
        if not isinstance(response, HtmlResponse):
            media_urls = [response.url]
            item["media_urls"] = media_urls
            item["file_path"] = self._url_to_file_path(response.url)
            # Empty content makes SaveMarkdownPipeline skip the write.
            item["markdown_content"] = ""
            yield item
            return

        # Collect internal media links (images, STL, ...) for the pipeline.
        media_urls = self._extract_media_urls(response)
        item["media_urls"] = media_urls

        # Extract the main content using the user-configurable CSS selector.
        self.logger.debug(f"使用 body_selector: {self.body_selector}")
        main_content = response.css(self.body_selector).get()
        self.logger.debug(f"main_content 找到: {bool(main_content)}")
        raw_html = main_content if main_content else response.text
        if not main_content:
            self.logger.warning(
                f"body_selector '{self.body_selector}' 未匹配到内容,使用完整页面"
            )
        self.logger.debug(f"raw_html:\n{raw_html}")
        # When the selector missed, fall back to just the <body> element to
        # avoid converting <head> scripts/styles into noise.
        if not main_content:
            response.selector.remove_namespaces()
            cleaned_html = response.selector.xpath("//body").get() or raw_html
        else:
            cleaned_html = raw_html

        # Local save path for this page's Markdown file.
        current_file_path = self._url_to_file_path(response.url)
        item["file_path"] = current_file_path

        # Extract base64 data-URI images to files and rewrite their src.
        cleaned_html = self._save_base64_images(cleaned_html, current_file_path)

        # HTML -> Markdown via the configured engine (markitdown/html2text).
        try:
            markdown_text = self.convert_func(cleaned_html)

        except Exception as e:
            self.logger.error(f"转换失败 {response.url}: {e}", exc_info=True)
            markdown_text = ""

        # Rewrite internal links to point at the local .md files.
        if markdown_text:
            markdown_text = self._convert_internal_links(
                markdown_text, current_file_path, response.url
            )

        item["markdown_content"] = markdown_text

        yield item
|
|
289
|
+
|
|
290
|
+
# ---------- 辅助方法(与之前相同)----------
|
|
291
|
+
def _extract_media_urls(self, response):
|
|
292
|
+
"""从响应中提取图片和文件链接"""
|
|
293
|
+
media_extensions = {
|
|
294
|
+
# 图片格式
|
|
295
|
+
".png",
|
|
296
|
+
".jpg",
|
|
297
|
+
".jpeg",
|
|
298
|
+
".gif",
|
|
299
|
+
".svg",
|
|
300
|
+
".webp",
|
|
301
|
+
".bmp",
|
|
302
|
+
".ico",
|
|
303
|
+
".tiff",
|
|
304
|
+
# 文档/文件格式
|
|
305
|
+
".stl",
|
|
306
|
+
".pdf",
|
|
307
|
+
".zip",
|
|
308
|
+
".gz",
|
|
309
|
+
".tar",
|
|
310
|
+
".doc",
|
|
311
|
+
".docx",
|
|
312
|
+
".ppt",
|
|
313
|
+
".pptx",
|
|
314
|
+
".xls",
|
|
315
|
+
".xlsx",
|
|
316
|
+
".txt",
|
|
317
|
+
".csv",
|
|
318
|
+
".json",
|
|
319
|
+
".xml",
|
|
320
|
+
".ipynb",
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
media_urls = set()
|
|
324
|
+
|
|
325
|
+
# 提取图片标签
|
|
326
|
+
for img in response.css("img"):
|
|
327
|
+
src = img.attrib.get("src")
|
|
328
|
+
if src:
|
|
329
|
+
# 跳过 data: URL (base64 嵌入图片)
|
|
330
|
+
if src.startswith("data:"):
|
|
331
|
+
continue
|
|
332
|
+
absolute_url = urljoin(response.url, src)
|
|
333
|
+
# 只下载内部链接的媒体文件
|
|
334
|
+
if self._is_internal_link(absolute_url, response.url):
|
|
335
|
+
media_urls.add(absolute_url)
|
|
336
|
+
else:
|
|
337
|
+
self.logger.debug(f"跳过外部媒体链接: {absolute_url}")
|
|
338
|
+
|
|
339
|
+
# 提取链接标签中的文件
|
|
340
|
+
for link in response.css("a"):
|
|
341
|
+
href = link.attrib.get("href")
|
|
342
|
+
if href:
|
|
343
|
+
# 检查是否是媒体文件
|
|
344
|
+
if any(href.lower().endswith(ext) for ext in media_extensions):
|
|
345
|
+
absolute_url = urljoin(response.url, href)
|
|
346
|
+
# 只下载内部链接的媒体文件
|
|
347
|
+
if self._is_internal_link(absolute_url, response.url):
|
|
348
|
+
media_urls.add(absolute_url)
|
|
349
|
+
else:
|
|
350
|
+
self.logger.debug(f"跳过外部媒体链接: {absolute_url}")
|
|
351
|
+
|
|
352
|
+
return list(media_urls)
|
|
353
|
+
|
|
354
|
+
    def _save_base64_images(self, html_content, current_file_path):
        """Persist base64 data-URI <img> payloads as files and rewrite their src.

        Each image is written next to the page's Markdown file as
        "<page-stem>_<md5-prefix><ext>", so the bare-filename src left in the
        HTML resolves as a relative link from the converted Markdown.
        Malformed or non-base64 data URIs are left untouched.
        """
        pattern = r'(<img[^>]*?src=)["\'](data:[^"\']+)["\']([^>]*?>)'

        def replace_data_url(match):
            prefix = match.group(1)
            data_url = match.group(2)
            suffix = match.group(3)

            # data URI layout: "data:<mediatype>[;base64],<payload>"
            header, _, data = data_url.partition(",")
            if not data:
                # No payload: keep the original tag untouched.
                return match.group(0)

            if ";base64" not in header:
                # Only base64 payloads are handled; leave URL-encoded data alone.
                return match.group(0)

            # Pick a file extension from the declared MIME type (default .png).
            ext = ".png"
            if "image/jpeg" in header or "image/jpg" in header:
                ext = ".jpg"
            elif "image/png" in header:
                ext = ".png"
            elif "image/gif" in header:
                ext = ".gif"
            elif "image/svg+xml" in header:
                ext = ".svg"
            elif "image/webp" in header:
                ext = ".webp"

            try:
                image_data = base64.b64decode(data)
            except Exception:
                # Undecodable payload: keep the original tag untouched.
                return match.group(0)

            # Content-hash suffix keeps names deterministic and avoids
            # collisions between different images on the same page.
            file_hash = hashlib.md5(image_data).hexdigest()[:8]
            base_name = os.path.splitext(os.path.basename(current_file_path))[0]
            image_filename = f"{base_name}_{file_hash}{ext}"

            # Save alongside the page's Markdown file under output_dir.
            image_dir = os.path.join(
                self.output_dir, os.path.dirname(current_file_path)
            )
            image_full_path = os.path.join(image_dir, image_filename)
            os.makedirs(image_dir, exist_ok=True)
            with open(image_full_path, "wb") as f:
                f.write(image_data)

            self.logger.info(f"保存 base64 图片: {image_filename}")
            # Reference the saved file by bare filename (same directory as the page).
            return f'{prefix}"{image_filename}"{suffix}'

        return re.sub(pattern, replace_data_url, html_content, flags=re.IGNORECASE)
|
|
404
|
+
|
|
405
|
+
def _url_to_file_path(self, url):
|
|
406
|
+
parsed = urlparse(url)
|
|
407
|
+
path = parsed.path
|
|
408
|
+
if path.endswith("/"):
|
|
409
|
+
path = path + "index.md"
|
|
410
|
+
elif path.endswith(".html"):
|
|
411
|
+
path = path[:-5] + ".md"
|
|
412
|
+
else:
|
|
413
|
+
if not os.path.splitext(path)[1]:
|
|
414
|
+
path = (
|
|
415
|
+
os.path.join(path, "index.md")
|
|
416
|
+
if path.endswith("/")
|
|
417
|
+
else path + ".md"
|
|
418
|
+
)
|
|
419
|
+
return path.lstrip("/")
|
|
420
|
+
|
|
421
|
+
def _is_internal_link(self, url, base_url):
|
|
422
|
+
absolute = urljoin(base_url, url)
|
|
423
|
+
parsed = urlparse(absolute)
|
|
424
|
+
return any(parsed.netloc.endswith(domain) for domain in self.allowed_domains)
|
|
425
|
+
|
|
426
|
+
def _convert_url(self, url, base_url, current_file_path):
|
|
427
|
+
if not self._is_internal_link(url, base_url):
|
|
428
|
+
return url
|
|
429
|
+
target_abs = urljoin(base_url, url)
|
|
430
|
+
target_path = self._url_to_file_path(target_abs)
|
|
431
|
+
rel_path = os.path.relpath(
|
|
432
|
+
target_path, os.path.dirname(current_file_path)
|
|
433
|
+
).replace("\\", "/")
|
|
434
|
+
if rel_path == ".":
|
|
435
|
+
rel_path = os.path.basename(target_path)
|
|
436
|
+
return rel_path
|
|
437
|
+
|
|
438
|
+
def _convert_internal_links(self, markdown_text, current_file_path, base_url):
|
|
439
|
+
# 匹配图片和链接: 或 [text](url)
|
|
440
|
+
pattern = r"(!?)\[([^\]]*)\]\(([^)]+)\)"
|
|
441
|
+
|
|
442
|
+
def replace_link(match):
|
|
443
|
+
is_image = match.group(1) == "!"
|
|
444
|
+
text = match.group(2)
|
|
445
|
+
inner = match.group(3).strip()
|
|
446
|
+
|
|
447
|
+
# 清理图片的alt文本:如果看起来像URL(包含路径分隔符或图片扩展名),则清空
|
|
448
|
+
if is_image:
|
|
449
|
+
# 常见的图片扩展名
|
|
450
|
+
image_extensions = {
|
|
451
|
+
".png",
|
|
452
|
+
".jpg",
|
|
453
|
+
".jpeg",
|
|
454
|
+
".gif",
|
|
455
|
+
".svg",
|
|
456
|
+
".webp",
|
|
457
|
+
".bmp",
|
|
458
|
+
".ico",
|
|
459
|
+
".tiff",
|
|
460
|
+
}
|
|
461
|
+
# 如果alt文本看起来像URL(包含'/'或'\'或以图片扩展名结尾),清空它
|
|
462
|
+
# 这样可以避免清空正常的alt文本如"Logo"或"Diagram"
|
|
463
|
+
text_lower = text.lower()
|
|
464
|
+
if (
|
|
465
|
+
"/" in text
|
|
466
|
+
or "\\" in text
|
|
467
|
+
or any(text_lower.endswith(ext) for ext in image_extensions)
|
|
468
|
+
):
|
|
469
|
+
text = ""
|
|
470
|
+
self.logger.debug(
|
|
471
|
+
f"清空图片alt文本(看起来像URL): {match.group(2)}"
|
|
472
|
+
)
|
|
473
|
+
|
|
474
|
+
if inner.startswith("<"):
|
|
475
|
+
angle_match = re.match(r"<([^>]+)>", inner)
|
|
476
|
+
if not angle_match:
|
|
477
|
+
return match.group(0)
|
|
478
|
+
old_url = angle_match.group(1)
|
|
479
|
+
new_url = self._convert_url(old_url, base_url, current_file_path)
|
|
480
|
+
new_inner = inner.replace(f"<{old_url}>", f"<{new_url}>", 1)
|
|
481
|
+
else:
|
|
482
|
+
parts = inner.split(None, 1)
|
|
483
|
+
old_url = parts[0]
|
|
484
|
+
new_url = self._convert_url(old_url, base_url, current_file_path)
|
|
485
|
+
if len(parts) == 1:
|
|
486
|
+
new_inner = new_url
|
|
487
|
+
else:
|
|
488
|
+
new_inner = new_url + " " + parts[1]
|
|
489
|
+
|
|
490
|
+
# 重新构建链接,保留图片标记
|
|
491
|
+
prefix = "!" if is_image else ""
|
|
492
|
+
return f"{prefix}[{text}]({new_inner})"
|
|
493
|
+
|
|
494
|
+
return re.sub(pattern, replace_link, markdown_text)
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: html_docs_crawler
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Universal documentation crawler that converts HTML pages to Markdown with internal link correction
|
|
5
|
+
Project-URL: Homepage, https://github.com/zwidny/doc_crawler
|
|
6
|
+
Project-URL: Source, https://github.com/zwidny/doc_crawler
|
|
7
|
+
Project-URL: BugTracker, https://github.com/zwidny/doc_crawler/issues
|
|
8
|
+
Keywords: scrapy,crawler,markdown,documentation,web-scraping
|
|
9
|
+
Requires-Python: >=3.12
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: fake-useragent>=2.2.0
|
|
12
|
+
Requires-Dist: html2text>=2025.4.15
|
|
13
|
+
Requires-Dist: markitdown[docx,pdf,pptx]>=0.1.5
|
|
14
|
+
Requires-Dist: scrapy>=2.14.1
|
|
15
|
+
|
|
16
|
+
# html_docs_crawler
|
|
17
|
+
|
|
18
|
+
A Scrapy-based universal documentation crawler that converts HTML documentation sites to Markdown format, with automatic internal link rewriting to local `.md` relative paths. Supports multiple converter engines (markitdown / html2text), path whitelist filtering, and automatic media file download.
|
|
19
|
+
|
|
20
|
+
## Parameters
|
|
21
|
+
|
|
22
|
+
- `start_urls`: Starting URLs (comma-separated)
|
|
23
|
+
- `allowed_domains`: Allowed domains (comma-separated)
|
|
24
|
+
- `deny_patterns`: Regex deny patterns (comma-separated)
|
|
25
|
+
- `allow_paths`: Allowed path prefixes (comma-separated); only URLs starting with these prefixes will be processed
|
|
26
|
+
- `body_selector`: CSS selector for main HTML content (default: `"main, article, .content, .document, .body, body"`)
|
|
27
|
+
- `output_dir`: Output directory (default: `"~/.config/doc_crawler/_docs/{domain_name}"`, where `{domain_name}` is extracted from `start_urls`)
|
|
28
|
+
- `converter_engine`: Converter engine (default: `"markitdown"`, optional: `"html2text"`)
|
|
29
|
+
- `single_page`: Single-page mode (default: `"false"`, set to `"true"` to crawl a single page without following links)
|
|
30
|
+
|
|
31
|
+
## Install via UV
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
uv tool install git+https://github.com/zwidny/doc_crawler.git
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
After installation, you can use the `doc_crawler` command from any directory.
|
|
38
|
+
|
|
39
|
+
## Usage Examples
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
# Crawl AKShare documentation
|
|
43
|
+
doc_crawler --start-urls "https://akshare.akfamily.xyz" \
|
|
44
|
+
--allowed-domains "akshare.akfamily.xyz" \
|
|
45
|
+
--deny-patterns "/_sources/" \
|
|
46
|
+
--body-selector "main, article, .content, .document, .body" \
|
|
47
|
+
--output-dir "_docs/akshare_markdown"
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### Single-page mode
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
doc_crawler --start-urls "https://build123d.readthedocs.io/en/stable/examples_1.html" \
|
|
54
|
+
--single-page true \
|
|
55
|
+
--body-selector ".wy-nav-content" \
|
|
56
|
+
--output-dir "single_page_output"
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### Path whitelist filtering
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
doc_crawler --start-urls "https://opencode.ai/docs/zh-cn/" \
|
|
63
|
+
--allow-paths "/docs/zh-cn/" \
|
|
64
|
+
--body-selector "main, article, .content" \
|
|
65
|
+
--output-dir "_docs/opencode_docs_zh_cn"
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Crawl with html2text engine
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
doc_crawler --start-urls "https://akshare.akfamily.xyz/" \
|
|
72
|
+
--allowed-domains "akshare.akfamily.xyz" \
|
|
73
|
+
--deny-patterns "/_sources/" \
|
|
74
|
+
--body-selector "main, article, .content, .document, .body" \
|
|
75
|
+
--converter-engine "html2text" \
|
|
76
|
+
--output-dir "_docs/akshare_markdown_html2text"
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### More examples
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
# Crawl build123d docs
|
|
83
|
+
doc_crawler --start-urls "https://build123d.readthedocs.io/en/stable/" \
|
|
84
|
+
--deny-patterns "/_sources/,/latest/" \
|
|
85
|
+
--body-selector ".wy-nav-content" \
|
|
86
|
+
--output-dir "_docs/build123d"
|
|
87
|
+
|
|
88
|
+
# Crawl Docusaurus docs
|
|
89
|
+
doc_crawler --start-urls "https://docusaurus.io/docs" \
|
|
90
|
+
--allow-paths "/docs" \
|
|
91
|
+
--body-selector ".col.docItemCol_n6xZ" \
|
|
92
|
+
--output-dir "_docs/docusaurus"
|
|
93
|
+
|
|
94
|
+
# Crawl uv documentation
|
|
95
|
+
doc_crawler --start-urls "https://docs.astral.sh/uv/" \
|
|
96
|
+
--body-selector ".md-content" \
|
|
97
|
+
--output-dir "_docs/uv"
|
|
98
|
+
```
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
scrapy.cfg,sha256=TKK9gKdRaYNWn06Irgd74zC6AQ9DsI8Ksr6JaExpkEw,265
|
|
2
|
+
doc_crawler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
3
|
+
doc_crawler/cli.py,sha256=Qejxu7V83yia-kfJZb5yEbAOLLoRLEV73WXztJK-k-A,5857
|
|
4
|
+
doc_crawler/items.py,sha256=QUfYvMG6dMiHcTXZlXDqYcxhSMR5EgvDMGAnw2fywwY,240
|
|
5
|
+
doc_crawler/middlewares.py,sha256=1UsJy_j5DMNvK_Ys30iczyyiJ9sI7TZR1TOX_6jOqPE,1927
|
|
6
|
+
doc_crawler/pipelines.py,sha256=CzMXo1i0UyiKpXdv8MqYJRRxFYIwz-T-ptba90fi6eo,5891
|
|
7
|
+
doc_crawler/settings.py,sha256=ce4obL5ALRWMHaRNA_I7VBU_Yryhzg2-piXdFZuNV8A,1953
|
|
8
|
+
doc_crawler/spiders/__init__.py,sha256=ULwecZkx3_NTphkz7y_qiazBeUoHFnCCWnKSjoDCZj0,161
|
|
9
|
+
doc_crawler/spiders/doc_spider.py,sha256=sUNXg6boEBwowLIbTgw-6wUksEix1qSilV50LqTcfj4,20121
|
|
10
|
+
html_docs_crawler-0.1.0.dist-info/METADATA,sha256=BNjRApqgU2GPObKC_CTPXyvnHef4It77o_B50zNtDNM,3530
|
|
11
|
+
html_docs_crawler-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
html_docs_crawler-0.1.0.dist-info/entry_points.txt,sha256=rM3npJjcF02jm5w_D5ZajDck2KbhwmCLzU6Ic0JyiHQ,53
|
|
13
|
+
html_docs_crawler-0.1.0.dist-info/top_level.txt,sha256=lMBLRQiCumDc-fNb8HcBPLFO_aC_fiMyLr5zGdCVAV0,12
|
|
14
|
+
html_docs_crawler-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
doc_crawler
|
scrapy.cfg
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# Automatically created by: scrapy startproject
|
|
2
|
+
#
|
|
3
|
+
# For more information about the [deploy] section see:
|
|
4
|
+
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
|
5
|
+
|
|
6
|
+
[settings]
|
|
7
|
+
default = doc_crawler.settings
|
|
8
|
+
|
|
9
|
+
[deploy]
|
|
10
|
+
#url = http://localhost:6800/
|
|
11
|
+
project = doc_crawler
|