web2md 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- web2md/__init__.py +10 -0
- web2md/__pycache__/__init__.cpython-313.pyc +0 -0
- web2md/__pycache__/cli.cpython-313.pyc +0 -0
- web2md/__pycache__/version.cpython-313.pyc +0 -0
- web2md/cli.py +612 -0
- web2md/version.py +33 -0
- web2md-0.1.0.dist-info/METADATA +360 -0
- web2md-0.1.0.dist-info/RECORD +12 -0
- web2md-0.1.0.dist-info/WHEEL +5 -0
- web2md-0.1.0.dist-info/entry_points.txt +2 -0
- web2md-0.1.0.dist-info/licenses/LICENSE +21 -0
- web2md-0.1.0.dist-info/top_level.txt +1 -0
web2md/__init__.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# Package metadata; keep in sync with setup.py
|
|
2
|
+
|
|
3
|
+
from .version import __version__
|
|
4
|
+
|
|
5
|
+
__author__ = "Liming Xie"
|
|
6
|
+
__author_email__ = "liming.xie@gmail.com"
|
|
7
|
+
|
|
8
|
+
__description__ = "A CLI tool to crawl dynamic/static websites and convert content to clean Markdown"
|
|
9
|
+
__url__ = "https://github.com/floatinghotpot/web2md"
|
|
10
|
+
__license__ = "MIT"
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
web2md/cli.py
ADDED
|
@@ -0,0 +1,612 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
from urllib.parse import urlparse, urljoin, unquote, urlunparse
|
|
3
|
+
from urllib.request import urlretrieve, build_opener, HTTPCookieProcessor, HTTPSHandler
|
|
4
|
+
from urllib.error import URLError
|
|
5
|
+
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
|
|
6
|
+
from bs4 import BeautifulSoup
|
|
7
|
+
import markdownify
|
|
8
|
+
import os
|
|
9
|
+
import time
|
|
10
|
+
import re
|
|
11
|
+
import sys
|
|
12
|
+
import hashlib
|
|
13
|
+
import socket
|
|
14
|
+
import ssl
|
|
15
|
+
|
|
16
|
+
# ===================== Configurable Params (Adjust as needed) =====================
# Playwright page-loading behaviour.
PLAYWRIGHT_CONFIG = {
    "headless": False,  # Set to True for background crawling (no browser window)
    "timeout": 60000,  # Page load timeout (ms)
    "wait_for_load": "networkidle",  # Wait for page full dynamic render
    "sleep_after_load": 2,  # Sleep after load (s) for JS render completion
    "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36",
}
# Media download behaviour.
MEDIA_CONFIG = {
    # Media download timeout in SECONDS. This value is handed to
    # socket.setdefaulttimeout() and opener.open(timeout=...), both of which
    # take seconds, so the former 30000 meant more than eight hours.
    "timeout": 30,
    "image_dir": "images",  # Image save subdirectory (same level as MD)
    "video_dir": "videos",  # Video save subdirectory (same level as MD)
    "allowed_img_ext": [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp"],
    "allowed_vid_ext": [".mp4", ".avi", ".mov", ".webm", ".flv", ".mkv", ".mpeg", ".mpg"],
}
# Tags to remove (keep only core content)
REMOVE_TAGS = ["nav", "header", "footer", "aside", "script", "style", "iframe", "sidebar"]
# Core content selectors (match by priority, stop on first match).
# Each entry is (tag name, attribute filter). The filter is passed to
# BeautifulSoup through ``attrs=``, where keys are literal attribute names,
# so the key must be "class" (``class_`` is only valid as a keyword argument).
CORE_CONTENT_SELECTORS = [
    ("main", {}),
    ("div", {"class": "article-content"}),
    ("div", {"class": "article_content"}),
    ("div", {"id": "main-content"}),
    ("div", {"class": "content"}),
    ("article", {}),
]
# Default crawl config
DEFAULT_CRAWL_CONFIG = {
    "max_depth": 5,  # Default max relative crawl depth
    "max_count": 999,  # Default max file count (0 = unlimited)
    "allowed_schemes": ["http", "https"],
    "exclude_patterns": [r"\.pdf$", r"\.zip$", r"\.rar$", r"\.7z$", r"\.tar$", r"\.gz$", r"\.exe$"],
}
# ==================================================================================

# Global variables (initialized once, shared across all functions)
crawled_urls = set()  # Stored crawled URLs to avoid duplication
base_url = None  # Dynamic base URL (parent dir of target URL, core benchmark)
base_parsed = None  # Parsed result of base_url
root_save_dir = None  # Local root save directory (absolute path, no effect on filename)
max_crawl_depth = None  # Max relative crawl depth based on base_url
max_crawl_count = None  # Max crawl file count (0 = unlimited)
crawl_picture = False  # Whether to crawl pictures (--picture)
crawl_video = False  # Whether to crawl videos (--video)
crawled_count = 0  # Current crawled file count (real-time statistics)
|
|
61
|
+
|
|
62
|
+
# Opener used for media downloads; it deliberately skips TLS certificate checks.
def create_ssl_unverified_opener():
    """Build a urllib opener whose HTTPS handler performs no certificate validation.

    Hostname checking and certificate verification are both switched off so
    that downloads do not fail with CERTIFICATE_VERIFY_FAILED. A cookie
    processor is installed as well.
    """
    insecure_ctx = ssl.create_default_context()
    # check_hostname must be cleared before verify_mode can be set to CERT_NONE.
    insecure_ctx.check_hostname = False
    insecure_ctx.verify_mode = ssl.CERT_NONE
    return build_opener(HTTPSHandler(context=insecure_ctx), HTTPCookieProcessor())


# Shared opener instance, created once at import time.
ssl_unverified_opener = create_ssl_unverified_opener()
|
|
73
|
+
|
|
74
|
+
def validate_url(url):
    """Validate a CLI URL argument (argparse ``type=`` callback).

    The URL must use the http or https scheme (case-insensitive) and must
    contain a host, so inputs such as ``http://`` are rejected up front
    instead of failing later inside the crawler.

    :param url: URL string supplied on the command line
    :return: The URL unchanged
    :raises argparse.ArgumentTypeError: If the URL is not a usable http(s) URL
    """
    try:
        parsed = urlparse(url)
        is_valid = parsed.scheme.lower() in ("http", "https") and bool(parsed.netloc)
    except (AttributeError, TypeError, ValueError):
        # Non-string input or a malformed authority (e.g. broken IPv6 literal).
        is_valid = False
    if not is_valid:
        raise argparse.ArgumentTypeError(f"Invalid URL: {url} | Must start with http/https")
    return url
|
|
79
|
+
|
|
80
|
+
def validate_depth(depth):
    """Validate the --depth argument (argparse ``type=`` callback).

    :param depth: Raw value from the command line
    :return: The depth as a non-negative int
    :raises argparse.ArgumentTypeError: If the value is not an integer >= 0
    """
    try:
        depth_int = int(depth)
    except (TypeError, ValueError):
        # Treat anything unparseable like a negative value: one error path.
        depth_int = -1
    if depth_int < 0:
        raise argparse.ArgumentTypeError(f"Invalid depth: {depth} | Must be non-negative integer (0,1,2...)")
    return depth_int
|
|
89
|
+
|
|
90
|
+
def validate_count(count):
    """Validate the --count argument (argparse ``type=`` callback).

    :param count: Raw value from the command line
    :return: The count as a non-negative int (0 means unlimited)
    :raises argparse.ArgumentTypeError: If the value is not an integer >= 0
    """
    try:
        count_int = int(count)
    except (TypeError, ValueError):
        # Treat anything unparseable like a negative value: one error path.
        count_int = -1
    if count_int < 0:
        raise argparse.ArgumentTypeError(f"Invalid count: {count} | Must be non-negative integer (0 = unlimited)")
    return count_int
|
|
99
|
+
|
|
100
|
+
def get_url_parent_dir(url):
    """Return the parent directory URL of *url* (used to derive base_url).

    Trailing slashes are ignored before the last path segment is dropped, and
    the result always ends with '/' so it can be used for prefix matching.
    Query string, params and fragment are discarded because they would
    otherwise end up inside the base URL and defeat prefix matching.

    Example: https://company.com/docs/home     → https://company.com/docs/
    Example: https://company.com/docs/home?a=1 → https://company.com/docs/
    Example: https://company.com/docs/         → https://company.com/
    Example: https://company.com               → https://company.com/

    NOTE(review): the previous docstring claimed ``/docs/`` maps to itself,
    but the code has always stripped the trailing slash first; the runtime
    behaviour is kept here. Confirm which one is intended.
    """
    # URL paths always use '/', so use posixpath instead of the OS-specific os.path.
    import posixpath

    parsed = urlparse(url)
    path = parsed.path.rstrip('/')
    # Empty path (site root): the root is its own parent.
    parent_path = posixpath.dirname(path) if path else '/'
    if not parent_path.startswith('/'):
        parent_path = f"/{parent_path}"
    parent_parsed = parsed._replace(
        path=parent_path.rstrip('/') + '/',
        params='',
        query='',
        fragment='',
    )
    return urlunparse(parent_parsed)
|
|
117
|
+
|
|
118
|
+
def init_global_config(target_url, save_dir, depth, count, pic, vid):
    """Populate the module-level crawl settings and print a summary.

    :param target_url: URL the crawl starts from; its parent directory becomes base_url
    :param save_dir: Output directory (stored as an absolute path)
    :param depth: Max relative crawl depth
    :param count: Max number of saved pages (0 = unlimited)
    :param pic: Whether pictures are downloaded
    :param vid: Whether videos are downloaded
    """
    global base_url, base_parsed, root_save_dir, max_crawl_depth, max_crawl_count, crawl_picture, crawl_video
    base_url = get_url_parent_dir(target_url)
    base_parsed = urlparse(base_url)
    root_save_dir = os.path.abspath(save_dir)
    max_crawl_depth, max_crawl_count = depth, count
    crawl_picture, crawl_video = pic, vid
    # Process-wide default socket timeout, taken from the media config.
    socket.setdefaulttimeout(MEDIA_CONFIG["timeout"])

    picture_state = '✅ Enabled' if crawl_picture else '❌ Disabled'
    video_state = '✅ Enabled' if crawl_video else '❌ Disabled'
    summary = [
        "🔧 Global Config Initialized",
        f" ├─ Target URL: {target_url}",
        f" ├─ Base URL (Benchmark): {base_url} (All operations based on this)",
        f" ├─ Local Save Dir: {root_save_dir}",
        f" ├─ Max Crawl Depth: {max_crawl_depth}",
        f" ├─ Max Crawl Count: {max_crawl_count} (0 = unlimited)",
        f" ├─ Crawl Pictures: {picture_state} (--picture)",
        f" └─ Crawl Videos: {video_state} (--video)",
    ]
    print("\n".join(summary))
|
|
139
|
+
|
|
140
|
+
def generate_auto_save_dir():
    """Derive a default output directory name from base_url's host and path.

    Characters other than word characters and '-' become '_', and runs of
    '_' are collapsed.

    :return: Directory name, or "web2md_docs" when no usable name can be
             derived (including when the base URL has not been set yet)
    """
    # base_parsed is only set once the base URL is known; do not crash on None.
    if base_parsed is None:
        return "web2md_docs"
    dir_name = f"{base_parsed.netloc}_{base_parsed.path.strip('/').replace('/', '_')}"
    dir_name = re.sub(r'[^\w\-]', '_', dir_name)  # Filter illegal chars
    dir_name = re.sub(r'_+', '_', dir_name).strip('_')
    return dir_name if dir_name else "web2md_docs"
|
|
146
|
+
|
|
147
|
+
def get_file_hash(url, length=8):
    """Return the first *length* hex characters of the MD5 digest of *url*.

    The digest is only used to make media filenames unique, not for security.

    :param url: Text to hash (encoded as UTF-8)
    :param length: Number of leading hex characters to keep (default 8)
    """
    data = url.encode('utf-8')
    try:
        # Declares the non-security use so FIPS-restricted builds allow MD5.
        digest = hashlib.md5(data, usedforsecurity=False)
    except TypeError:
        # Python < 3.9 does not know the usedforsecurity keyword.
        digest = hashlib.md5(data)
    return digest.hexdigest()[:length]
|
|
150
|
+
|
|
151
|
+
def get_valid_media_filename(url, default_ext=".file"):
    """Build a filesystem-safe media filename of the form ``<name>_<hash><ext>``.

    The URL is parsed BEFORE percent-decoding so that an encoded '?' or '#'
    inside the file name is not mistaken for the start of the query/fragment.

    :param url: Media URL
    :param default_ext: Extension used when the URL path has none
    :return: Sanitised filename; ``fallback_<hash><default_ext>`` if anything fails
    """
    try:
        path = unquote(urlparse(url).path)
        filename = os.path.basename(path) or f"media_{get_file_hash(url)}"
        name, ext = os.path.splitext(filename)
        ext = ext.lower() if ext else default_ext
        # Filter cross-platform illegal chars
        safe_name = re.sub(r'[<>:"/\\|?*]', '_', name)
        # A name made only of illegal characters collapses to ''; use a stable stem.
        safe_name = re.sub(r'_+', '_', safe_name).strip('_') or "media"
        return f"{safe_name}_{get_file_hash(url)}{ext}"
    except Exception:
        return f"fallback_{get_file_hash(url)}{default_ext}"
|
|
164
|
+
|
|
165
|
+
def download_media_file(media_url, md_file_path, allowed_exts, media_type):
    """Download one media file next to the Markdown file and return its relative path.

    Media files are not restricted to the base_url directory. The download
    goes through the shared opener that has certificate verification disabled.
    Data is streamed into a temporary file and only renamed to the final name
    once complete, so an interrupted download never leaves a truncated file
    that a later call would mistake for an already-downloaded one.

    :param media_url: Absolute URL of media file
    :param md_file_path: Local path of corresponding MD file
    :param allowed_exts: Allowed media extensions
    :param media_type: Media type ("image" / "video"); selects the ``<type>_dir`` config key
    :return: Local relative path ('/'-separated) / original URL (if skipped or download failed)
    """
    import shutil
    import tempfile

    if not (crawl_picture or crawl_video) or not media_url or not md_file_path:
        return media_url
    if not media_url.startswith(('http://', 'https://')):
        return media_url
    # Validate media extension (parse first, then decode the path only)
    ext = os.path.splitext(unquote(urlparse(media_url).path))[1].lower()
    if ext not in allowed_exts:
        return media_url
    # Media save dir: same level as MD → images/ / videos/
    md_dir = os.path.dirname(md_file_path)
    media_dir = os.path.join(md_dir, MEDIA_CONFIG[f"{media_type}_dir"])
    os.makedirs(media_dir, exist_ok=True)
    filename = get_valid_media_filename(media_url, ext)
    save_path = os.path.join(media_dir, filename)
    rel_path = os.path.relpath(save_path, md_dir).replace(os.sep, '/')
    # Reuse a file that was downloaded earlier
    if os.path.exists(save_path):
        return rel_path

    tmp_path = None
    try:
        print(f"📥 Download {media_type}: {filename} (from: {media_url})")
        with ssl_unverified_opener.open(media_url, timeout=MEDIA_CONFIG["timeout"]) as response:
            with tempfile.NamedTemporaryFile(dir=media_dir, suffix=".part", delete=False) as tmp:
                tmp_path = tmp.name
                shutil.copyfileobj(response, tmp)
        # Publish the finished file in one step.
        os.replace(tmp_path, save_path)
        tmp_path = None
        return rel_path
    except socket.timeout:
        print(f"⚠️ {media_type.capitalize()} download failed: Timeout ({MEDIA_CONFIG['timeout']}s) - {media_url}")
        return media_url
    except ssl.SSLError:
        print(f"⚠️ {media_type.capitalize()} download failed: SSL Certificate Verify Failed - {media_url}")
        return media_url
    except Exception as e:
        # Best effort: a failed media download must not abort the page crawl.
        print(f"⚠️ {media_type.capitalize()} download failed: {str(e)[:50]} - {media_url}")
        return media_url
    finally:
        # Remove the partial download, if any.
        if tmp_path and os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
|
|
211
|
+
|
|
212
|
+
def _extract_best_url(tag, attrs):
    """Return the first non-empty URL among *attrs* of *tag*, or None.

    For ``srcset`` ("url1 size1, url2 size2") the last candidate is used,
    on the assumption that it is the highest-quality one.
    """
    for attr in attrs:
        val = tag.get(attr, "").strip()
        if not val:
            continue
        if attr == "srcset":
            parts = [p.strip() for p in val.split(",") if p.strip()]
            if parts:
                # split() tolerates tabs / repeated spaces between URL and descriptor.
                return parts[-1].split()[0]
        return val
    return None


def crawl_media(soup, md_file_path, current_url):
    """Download pictures/videos on demand and point the tags at the local copies.

    ``<img>`` tags are handled when picture crawling is on, ``<video>`` tags
    when video crawling is on. ``<source>`` tags are handled when either is
    on and routed by file extension, so ``<picture><source>`` images are also
    processed when only pictures are requested.

    :param soup: Parsed document (modified in place)
    :param md_file_path: Local path of the Markdown file the media belongs to
    :param current_url: URL used to resolve relative media references
    :return: The same soup object
    """
    if not soup or not md_file_path:
        return soup
    img_exts = MEDIA_CONFIG["allowed_img_ext"]
    vid_exts = MEDIA_CONFIG["allowed_vid_ext"]

    # Crawl pictures
    if crawl_picture:
        for img in soup.find_all("img"):
            # Priority: Common lazy-load attrs > srcset > src
            src = _extract_best_url(img, ["data-src", "data-original", "data-original-src", "file-src", "srcset", "src"])
            if src:
                abs_src = urljoin(current_url, src)
                img["src"] = download_media_file(abs_src, md_file_path, img_exts, "image")

    # Crawl videos
    if crawl_video:
        for video in soup.find_all("video"):
            src = _extract_best_url(video, ["src", "data-src"])
            if src:
                abs_src = urljoin(current_url, src)
                video["src"] = download_media_file(abs_src, md_file_path, vid_exts, "video")

    # <source> can belong to <video> or <picture>; decide per extension.
    if crawl_picture or crawl_video:
        for source in soup.find_all("source"):
            src = _extract_best_url(source, ["src", "srcset"])
            if not src:
                continue
            abs_src = urljoin(current_url, src)
            ext = os.path.splitext(urlparse(unquote(abs_src)).path)[1].lower()
            if crawl_video and ext in vid_exts:
                source["src"] = download_media_file(abs_src, md_file_path, vid_exts, "video")
            elif crawl_picture and ext in img_exts:
                source["src"] = download_media_file(abs_src, md_file_path, img_exts, "image")
    return soup
|
|
261
|
+
|
|
262
|
+
def get_dynamic_html(url):
    """Load *url* in Chromium via Playwright and return the rendered HTML.

    A fresh browser is launched per call and is always closed again, also
    when navigation fails. Responses with an HTTP status >= 400 are treated
    as failures so that error pages are not saved as content.

    :param url: Page URL to load
    :return: (html, final_url, base_uri) or (None, None, None) on any failure
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=PLAYWRIGHT_CONFIG["headless"])
            try:
                context = browser.new_context(
                    user_agent=PLAYWRIGHT_CONFIG["user_agent"],
                    viewport={"width": 1920, "height": 1080},
                    extra_http_headers={"Referer": base_url},
                    ignore_https_errors=True,  # Playwright ignores HTTPS errors
                )
                page = context.new_page()
                response = page.goto(
                    url,
                    timeout=PLAYWRIGHT_CONFIG["timeout"],
                    wait_until=PLAYWRIGHT_CONFIG["wait_for_load"],
                )
                # goto() may return None (e.g. same-document navigation).
                if response is not None and response.status >= 400:
                    print(f"❌ Page request failed: HTTP {response.status} - {url}")
                    return None, None, None
                time.sleep(PLAYWRIGHT_CONFIG["sleep_after_load"])
                html = page.content()
                final_url = page.url
                # Get the actual base URI used by the browser (handles <base> tags and redirects)
                base_uri = page.evaluate("document.baseURI") or final_url
            finally:
                # Closing the browser also disposes of its contexts and pages.
                browser.close()
        print(f"✅ Page loaded successfully: {url}")
        return html, final_url, base_uri
    except PlaywrightTimeoutError:
        print(f"❌ Page load timeout: Exceed {PLAYWRIGHT_CONFIG['timeout']/1000}s - {url}")
        return None, None, None
    except Exception as e:
        # Best effort: one unreachable page must not abort the whole crawl.
        print(f"❌ Page request failed: {str(e)[:80]} - {url}")
        return None, None, None
|
|
296
|
+
|
|
297
|
+
def calculate_relative_depth(url):
    """Calculate how many path segments *url* lies below base_url.

    Host names are compared case-insensitively, and both paths are
    percent-decoded before comparison so that an encoded base path (for
    example one containing ``%20``) still matches its sub-pages.

    :param url: Absolute URL to classify
    :return: Relative depth (0 = base_url itself, -1 = other host, outside
             the base directory, or base URL not initialised)
    """
    if not url or not base_parsed:
        return -1
    parsed = urlparse(url)
    # Filter different domain names
    if parsed.netloc.lower() != base_parsed.netloc.lower():
        return -1
    # Normalise both paths the same way: decoded, exactly one trailing '/'
    base_path = unquote(base_parsed.path).rstrip('/') + '/'
    target_path = unquote(parsed.path).rstrip('/') + '/'
    if not target_path.startswith(base_path):
        return -1
    relative_path = target_path[len(base_path):].rstrip('/')
    # An empty remainder yields no segments, i.e. depth 0.
    return len([seg for seg in relative_path.split('/') if seg.strip()])
|
|
318
|
+
|
|
319
|
+
def is_allowed_url(url):
    """Decide whether *url* may still be crawled as a page.

    Pages are restricted to the base_url directory; media resources are not
    checked here. Rules, in order:
      1. The max crawl count has not been reached (0 = unlimited)
      2. Scheme is one of the allowed schemes
      3. Relative depth is valid and <= max depth
      4. Neither the URL nor its path matches an excluded file pattern
      5. The URL has not been crawled yet

    :return: True (allowed) / False (forbidden)
    """
    if max_crawl_count > 0 and crawled_count >= max_crawl_count:
        return False
    if not url:
        return False
    parsed = urlparse(url)
    if parsed.scheme not in DEFAULT_CRAWL_CONFIG["allowed_schemes"]:
        return False
    depth = calculate_relative_depth(url)
    if depth < 0 or depth > max_crawl_depth:
        return False
    # The patterns are anchored with '$'; also test the bare path so that
    # "file.pdf?download=1" is excluded just like "file.pdf".
    for pattern in DEFAULT_CRAWL_CONFIG["exclude_patterns"]:
        if re.search(pattern, url, re.IGNORECASE) or re.search(pattern, parsed.path, re.IGNORECASE):
            return False
    return url not in crawled_urls
|
|
348
|
+
|
|
349
|
+
# href prefixes that never point at a crawlable page.
_NON_PAGE_HREF_PREFIXES = ('mailto:', 'tel:', 'javascript:', '#')


def _split_fragment(url):
    """Split *url* into (url_without_fragment, fragment)."""
    page_url, _, fragment = url.partition('#')
    return page_url, fragment


def _is_in_crawl_scope(url):
    """Return True if *url* belongs to the crawl scope, crawled already or not.

    Unlike is_allowed_url() this ignores the crawled set and the count limit,
    because link rewriting must also cover pages that were saved earlier.
    """
    if urlparse(url).scheme not in DEFAULT_CRAWL_CONFIG["allowed_schemes"]:
        return False
    depth = calculate_relative_depth(url)
    if depth < 0 or depth > max_crawl_depth:
        return False
    return not any(
        re.search(pattern, url, re.IGNORECASE)
        for pattern in DEFAULT_CRAWL_CONFIG["exclude_patterns"]
    )


def extract_allowed_links(html, base_uri):
    """Extract all crawlable sub-links from a page.

    Fragments are stripped so that ``page#a`` and ``page#b`` count as the
    same page instead of being crawled twice.

    :param html: Page HTML
    :param base_uri: Base URI used to resolve relative hrefs
    :return: Set of absolute, fragment-free URLs accepted by is_allowed_url()
    """
    if not html or not base_uri:
        return set()
    allowed_links = set()
    soup = BeautifulSoup(html, "lxml")
    for a in soup.find_all("a", href=True):
        href = a.get("href", "").strip()
        # Filter mail/tel/JS/anchor links
        if not href or href.lower().startswith(_NON_PAGE_HREF_PREFIXES):
            continue
        abs_url, _ = _split_fragment(urljoin(base_uri, href))
        if is_allowed_url(abs_url):
            allowed_links.add(abs_url)
    return allowed_links


def url_to_md_filename(url):
    """Generate the Markdown filename for *url* relative to base_url.

    Rules: 1. Drop the fragment 2. Remove base_url prefix (case-insensitive,
    the result is lower-cased) 3. Replace / with _ 4. Filter illegal chars
    5. Suffix with .md

    Example: https://company.com/docs/home       → home.md
    Example: https://company.com/docs/home#intro → home.md
    Example: https://company.com/docs/home/sub   → home_sub.md
    Example: https://company.com/docs/           → index.md
    """
    page_url, _ = _split_fragment(url)
    url_lower = page_url.lower()
    base_url_lower = base_url.lower()
    if url_lower.startswith(base_url_lower):
        name_part = url_lower[len(base_url_lower):].rstrip('/')
    else:
        # Fallback: last segment of the URL path
        name_part = os.path.basename(urlparse(page_url).path).rstrip('/') or "unknown"
    # URL == base_url
    if not name_part:
        return "index.md"
    name_part = name_part.replace('/', '_')
    safe_name = re.sub(r'[<>:"/\\|?*]', '_', name_part)
    safe_name = re.sub(r'_+', '_', safe_name).strip('_')
    return f"{safe_name}.md" if safe_name else "index.md"


def get_md_file_path(url):
    """Return the absolute local path of the Markdown file for *url*.

    All files live directly inside the root save directory. When the root
    directory or the URL is missing, a deterministic ``unknown_<hash>.md``
    path is returned (inside the root directory if known, otherwise inside
    the current working directory).
    """
    if not root_save_dir or not url:
        digest = hashlib.md5(str(url).encode('utf-8')).hexdigest()[:8]
        fallback = os.path.join(root_save_dir or os.getcwd(), f"unknown_{digest}.md")
        print(f"⚠️ Config missing, use fallback MD path: {os.path.basename(fallback)}")
        return fallback
    root = os.path.abspath(root_save_dir)
    md_filename = url_to_md_filename(url)
    md_file_path = os.path.abspath(os.path.join(root, md_filename))
    # Ensure path is in root save dir (prevent path traversal)
    try:
        inside_root = os.path.commonpath([root, md_file_path]) == root
    except ValueError:
        # e.g. paths on different drives
        inside_root = False
    if not inside_root:
        md_file_path = os.path.join(root, os.path.basename(md_filename))
    return md_file_path


def fix_local_links(html, current_url, base_uri):
    """Rewrite in-scope <a> links to the relative paths of their local MD files.

    Every link inside the crawl scope is rewritten, including links to pages
    that were crawled earlier. A fragment on the original link is kept on the
    rewritten one (``guide#intro`` → ``guide.md#intro``). Other links are left
    untouched.

    :param html: Page HTML
    :param current_url: URL of the page being converted (locates its MD file)
    :param base_uri: Base URI used to resolve relative hrefs
    :return: HTML with rewritten links (input returned as-is if config is missing)
    """
    if not html or not current_url or not root_save_dir:
        return html
    soup = BeautifulSoup(html, "lxml")
    current_md_dir = os.path.dirname(get_md_file_path(current_url))

    for a in soup.find_all("a", href=True):
        href = a.get("href", "").strip()
        if not href or href.lower().startswith(_NON_PAGE_HREF_PREFIXES):
            continue
        abs_url, fragment = _split_fragment(urljoin(base_uri, href))
        if not _is_in_crawl_scope(abs_url):
            continue
        target_md_path = get_md_file_path(abs_url)
        rel_link = os.path.relpath(target_md_path, current_md_dir).replace(os.sep, '/')
        a["href"] = f"{rel_link}#{fragment}" if fragment else rel_link
    return str(soup)
|
|
425
|
+
|
|
426
|
+
def extract_core_content(html, md_file_path, base_uri):
    """Parse HTML, strip boilerplate tags, fetch media on demand, return the core content.

    The selectors in CORE_CONTENT_SELECTORS are tried in order and the first
    match wins; if none matches, the whole <body> is used.

    :param html: Page HTML
    :param md_file_path: Local path of the Markdown file (media is stored next to it)
    :param base_uri: Base URI used to resolve relative media URLs
    :return: HTML string of the core content, or None if nothing is extractable
    """
    if not html:
        return None
    soup = BeautifulSoup(html, "lxml")
    # Remove useless tags to simplify content
    for tag in REMOVE_TAGS:
        for elem in soup.find_all(tag):
            elem.decompose()
    # Crawl media and replace local links if enabled
    if crawl_picture or crawl_video:
        soup = crawl_media(soup, md_file_path, base_uri)
    # Match core content selectors by priority
    core_content = None
    for tag, attrs in CORE_CONTENT_SELECTORS:
        # Inside an ``attrs`` dict BeautifulSoup expects the literal attribute
        # name; ``class_`` is only understood as a keyword argument, so map it.
        lookup = {("class" if key == "class_" else key): value for key, value in attrs.items()}
        core_content = soup.find(tag, attrs=lookup)
        if core_content:
            print(f"✅ Core content matched: <{tag} {attrs}>")
            break
    # Fallback: Extract entire body if no selector matched
    if not core_content:
        core_content = soup.find("body")
        if not core_content:
            print(f"❌ No extractable content found")
            return None
        print(f"⚠️ No precise selector matched, extract entire <body> content")
    return str(core_content)
|
|
453
|
+
|
|
454
|
+
def html2md(html_content):
    """Convert HTML to Markdown and tidy up whitespace.

    Trailing spaces are removed and runs of blank lines are collapsed to a
    single blank line. Single blank lines are kept because Markdown relies on
    them to separate paragraphs, lists, tables and code blocks.

    :param html_content: HTML string
    :return: Markdown string, or None for empty input / conversion failure
    """
    if not html_content:
        return None
    try:
        # NOTE(review): several of these keyword arguments (convert_ol,
        # convert_ul, convert_table, convert_image, convert_video, link_style,
        # convert_br) do not appear to be documented markdownify options and
        # are presumably ignored — confirm against the installed version.
        md_content = markdownify.markdownify(
            html_content,
            heading_style="ATX",  # MD heading style: # H1, ## H2
            bullets="-*+",  # Unordered list symbols
            code_language="python",  # Default code block language
            convert_ol=True,  # Convert ordered lists
            convert_ul=True,  # Convert unordered lists
            convert_table=True,  # Convert tables
            convert_image=True,  # Convert images
            convert_video=True,  # Convert videos
            link_style="inlined",  # Link style: [text](url)
            convert_br=True,  # Convert <br> to line break
        )
        # Strip trailing spaces; whitespace-only lines become empty lines here.
        tidy = "\n".join(line.rstrip() for line in md_content.splitlines())
        # Collapse 2+ consecutive blank lines into exactly one.
        return re.sub(r"\n{3,}", "\n\n", tidy).strip()
    except Exception as e:
        print(f"❌ HTML to Markdown conversion failed: {str(e)[:80]}")
        return None
|
|
478
|
+
|
|
479
|
+
def save_md_file(md_content, url):
    """Write the Markdown for *url* to disk and bump the crawl counter.

    The target directory is created when missing, so the function also works
    if the output directory was never created or was removed mid-crawl.

    :param md_content: Markdown text
    :param url: Source URL (determines the file name)
    :return: MD file path (success) / False (skipped or failed)
    """
    global crawled_count
    if not md_content or not url:
        print(f"❌ Skip MD save: Empty content or URL - {url}")
        return False
    # Check max crawl count before save
    if max_crawl_count > 0 and crawled_count >= max_crawl_count:
        print(f"❌ Skip MD save: Reach max crawl count ({max_crawl_count}) - {url}")
        return False
    md_file_path = get_md_file_path(url)
    md_filename = os.path.basename(md_file_path)
    try:
        target_dir = os.path.dirname(md_file_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        # Write file with utf-8 encoding (support all characters)
        with open(md_file_path, "w", encoding="utf-8") as f:
            f.write(md_content)
    except OSError as e:
        print(f"❌ MD file save failed: {str(e)[:80]} - {md_filename}")
        return False
    crawled_count += 1  # Only count pages that were actually written
    print(f"✅ MD file saved successfully: {md_filename} (Target: {url}) [Count: {crawled_count}]")
    return md_file_path
|
|
503
|
+
|
|
504
|
+
def crawl_page_recursive(url):
    """Crawl *url* and, depth-first, every allowed page reachable from it.

    The traversal uses an explicit stack instead of Python recursion, so long
    link chains cannot hit the interpreter's recursion limit. The visiting
    order is the same pre-order depth-first walk over the sorted sub-links.

    A page is skipped when: 1. URL not allowed 2. URL already crawled
    3. Loading/conversion/saving failed. The whole crawl stops once the max
    count is reached.

    :param url: Start URL
    """
    pending = [url]
    while pending:
        current = pending.pop()
        # Global termination: Max crawl count reached
        if max_crawl_count > 0 and crawled_count >= max_crawl_count:
            print(f"🔴 Crawl stopped: Reach max crawl count ({max_crawl_count})")
            return
        # Local termination: URL not allowed or already crawled
        if not current or not is_allowed_url(current) or current in crawled_urls:
            continue
        crawled_urls.add(current)

        # 1. Get dynamic HTML content (plus final URL and the browser's base URI)
        html, final_url, page_base_url = get_dynamic_html(current)
        if not html:
            continue

        # 2. Extract legal sublinks (resolved against page_base_url)
        sub_links = extract_allowed_links(html, page_base_url)

        # 3. Fix page internal links to local MD relative paths
        html_fixed = fix_local_links(html, final_url, page_base_url)

        # 4. Extract core content and convert to Markdown
        md_file_path_temp = get_md_file_path(current)
        core_html = extract_core_content(html_fixed, md_file_path_temp, page_base_url)
        md_content = html2md(core_html)
        if not md_content:
            continue

        # 5. Save MD file to local; children of unsaved pages are not followed
        if not save_md_file(md_content, current):
            continue

        # 6. Queue sublinks (depth-first)
        if sub_links and (max_crawl_count == 0 or crawled_count < max_crawl_count):
            current_depth = calculate_relative_depth(current)
            print(f"\n🔍 Found {len(sub_links)} legal subpages, start recursive crawling (Current Depth: {current_depth})")
            # Reverse order so the alphabetically first link is popped first.
            pending.extend(sorted(sub_links, reverse=True))
|
|
549
|
+
|
|
550
|
+
def main():
    """Main function: Parse CLI args → Init config → Start crawling.

    When no save folder is given, the directory name is derived from the base
    URL. The base URL is therefore resolved BEFORE generate_auto_save_dir()
    is called, which reads the module-level base_parsed.

    Exits with status 1 if the crawl aborts with an unexpected error.
    """
    global base_url, base_parsed

    parser = argparse.ArgumentParser(
        prog="web2md",
        description="📄 Dynamic Webpage to Markdown Crawler | Dynamic Base URL | Exact Filename | Media Crawl On Demand",
        formatter_class=argparse.RawTextHelpFormatter,
        epilog="===== Core Rules =====\n"
               "1. Auto set parent dir of target URL as base_url (all operations based on this)\n"
               "2. MD Filename: Remove base_url prefix → replace / with _ → filter illegal chars → suffix .md\n"
               "3. Crawl Scope: Same domain as base_url + relative depth ≤ --depth + count ≤ --count\n"
               "4. Media Scope: No parent dir restriction (images/videos can be from any domain)\n"
               "===== Usage Examples =====\n"
               " 1. Unlimited crawl: web2md https://company.com/docs/home company-docs --depth 2 --count 0\n"
               " 2. Limit 5 files: web2md https://company.com/docs/home company-docs --depth 2 --count 5\n"
               " 3. Crawl MD + pictures (limit 3 files): web2md https://company.com/docs/home --picture --count 3\n"
               " 4. Auto save dir: web2md https://company.com/docs/home --depth 1 --count 10"
    )
    # Mandatory arg: Target URL
    parser.add_argument("web_url", type=validate_url, help="Target webpage URL (must start with http/https)")
    # Optional arg: Local save directory (auto generate if omitted)
    parser.add_argument("save_folder", nargs='?', help="Local root save directory for MD files (optional)")
    # Optional args: Crawl depth, count, picture, video
    parser.add_argument("--depth", type=validate_depth, default=DEFAULT_CRAWL_CONFIG["max_depth"],
                        help=f"Max relative crawl depth based on base_url (default: {DEFAULT_CRAWL_CONFIG['max_depth']})")
    parser.add_argument("--count", type=validate_count, default=DEFAULT_CRAWL_CONFIG["max_count"],
                        help=f"Max crawl file count (0 = unlimited, default: {DEFAULT_CRAWL_CONFIG['max_count']})")
    parser.add_argument("--picture", action="store_true", help="Crawl page pictures, save to MD same-level 'images/' dir")
    parser.add_argument("--video", action="store_true", help="Crawl page videos, save to MD same-level 'videos/' dir")

    args = parser.parse_args()

    # Determine local save directory (auto generate if omitted)
    if args.save_folder:
        save_dir = args.save_folder
    else:
        # generate_auto_save_dir() needs the parsed base URL, which would
        # otherwise only be set later by init_global_config().
        base_url = get_url_parent_dir(args.web_url)
        base_parsed = urlparse(base_url)
        save_dir = generate_auto_save_dir()
    os.makedirs(save_dir, exist_ok=True)
    print(f"📁 Local save directory created: {os.path.abspath(save_dir)}\n")

    # Initialize global config
    init_global_config(args.web_url, save_dir, args.depth, args.count, args.picture, args.video)

    # Start crawling
    print(f"\n🚀 Start Crawling (Base URL: {base_url} | Max Depth: {args.depth} | Max Count: {args.count})")
    print("-" * 80)
    try:
        crawl_page_recursive(args.web_url)
    except Exception as e:
        # Top-level boundary: report and signal failure through the exit code.
        print(f"\n❌ Crawl aborted unexpectedly: {str(e)}")
        sys.exit(1)

    # Crawl completion statistics
    print("-" * 80)
    print(f"\n🎉 Crawl Task Completed!")
    print(f"📊 Statistics: Total crawled {crawled_count} valid pages")
    print(f"📂 All files saved to: {root_save_dir}")
    if crawl_picture or crawl_video:
        media_tips = []
        if crawl_picture:
            media_tips.append("Pictures (images/)")
        if crawl_video:
            media_tips.append("Videos (videos/)")
        print(f"📌 Crawled {'+'.join(media_tips)}, saved to MD same-level directories (no parent dir restriction)")
    print(f"\n💡 Tip: Open {root_save_dir} to view generated MD files and media resources")
|
|
611
|
+
if __name__ == "__main__":
|
|
612
|
+
main()
|
web2md/version.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# -*- coding: utf-8; py-indent-offset:4 -*-
|
|
2
|
+
from __future__ import (absolute_import, division, print_function, unicode_literals)
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
__version__ = '0.1.0'
|
|
8
|
+
|
|
9
|
+
# Tuple form of __version__: each dot-separated component converted to int, e.g. '0.1.0' -> (0, 1, 0).
__hqversion__ = tuple(int(x) for x in __version__.split('.'))
|
|
10
|
+
|
|
11
|
+
def bump_version_text(lines):
    """Return the source text with the last component of the version bumped.

    Every line that starts with ``__version__`` is expected to hold the
    version as a single-quoted string (``__version__ = 'X.Y.Z'``); the last
    dot-separated component of that string is incremented by one. All other
    lines are passed through unchanged.

    Args:
        lines: Iterable of source lines, each including its line terminator.

    Returns:
        A ``(text, current_version, new_version)`` tuple, where ``text`` is
        the rewritten source and the two versions are the strings before and
        after the bump.

    Raises:
        ValueError: If no line starts with ``__version__`` (or if the last
            version component is not an integer, via ``int()``).
    """
    current_version = None
    new_version = None
    rewritten = []
    for line in lines:
        if line.startswith('__version__'):
            # Splitting on the quote isolates the version string at index 1.
            items = line.split("'")
            current_version = items[1]
            vers = current_version.split('.')
            vers[-1] = str(int(vers[-1]) + 1)
            new_version = items[1] = '.'.join(vers)
            line = "'".join(items)
        rewritten.append(line)

    if current_version is None:
        # Fail before anything is written instead of crashing after the
        # file has already been rewritten.
        raise ValueError("no line starting with '__version__' was found")

    return ''.join(rewritten), current_version, new_version


if __name__ == "__main__":
    # `python version.py bump` rewrites this very file with the patch
    # component of the version incremented.
    if len(sys.argv) > 1 and sys.argv[1] == 'bump':
        # Context managers guarantee the handles are closed even on error;
        # the explicit encoding matches the coding header of this file.
        with open(__file__, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()

        # Compute the new text first so a failure leaves the file untouched.
        text, current_version, new_version = bump_version_text(lines)

        with open(__file__, 'w', encoding='utf-8') as fp:
            fp.write(text)

        print('Current version:', current_version)
        print('Bumped to:', new_version)
        print('File updated:', __file__, '\n')
|
|
@@ -0,0 +1,360 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: web2md
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A CLI tool to crawl dynamic/static websites and convert content to clean Markdown
|
|
5
|
+
Home-page: https://github.com/floatinghotpot/web2md
|
|
6
|
+
Author: Liming Xie
|
|
7
|
+
Author-email: liming.xie@gmail.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: crawler,markdown,web2md,scraper,dynamic website,html2md
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Intended Audience :: End Users/Desktop
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
23
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
25
|
+
Requires-Python: >=3.8
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Requires-Dist: playwright>=1.40.0
|
|
29
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
30
|
+
Requires-Dist: markdownify>=0.11.6
|
|
31
|
+
Requires-Dist: lxml>=4.9.0
|
|
32
|
+
Requires-Dist: requests>=2.31.0
|
|
33
|
+
Dynamic: author
|
|
34
|
+
Dynamic: author-email
|
|
35
|
+
Dynamic: classifier
|
|
36
|
+
Dynamic: description
|
|
37
|
+
Dynamic: description-content-type
|
|
38
|
+
Dynamic: home-page
|
|
39
|
+
Dynamic: keywords
|
|
40
|
+
Dynamic: license
|
|
41
|
+
Dynamic: license-file
|
|
42
|
+
Dynamic: requires-dist
|
|
43
|
+
Dynamic: requires-python
|
|
44
|
+
Dynamic: summary
|
|
45
|
+
|
|
46
|
+
# web2md
|
|
47
|
+
|
|
48
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
49
|
+
[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
|
|
50
|
+
|
|
51
|
+
A powerful, intelligent CLI tool to crawl **dynamic and static websites** with full JavaScript rendering support and convert them to clean, well-formatted Markdown files. Perfect for archiving documentation, creating offline knowledge bases, and preserving web content.
|
|
52
|
+
|
|
53
|
+
## ✨ Key Features
|
|
54
|
+
|
|
55
|
+
- 🚀 **Dynamic Site Support**: Full JavaScript rendering via Playwright (Vue/React/Angular/Next.js)
|
|
56
|
+
- 🎯 **Smart Content Extraction**: Automatically identifies and extracts core content, removing navigation, ads, and sidebars
|
|
57
|
+
- 🔗 **Recursive Crawling**: Intelligently crawls subpages with configurable depth and count limits
|
|
58
|
+
- 🖼️ **Media Downloads**: Optional image and video downloading with lazy-loading support
|
|
59
|
+
- 📐 **Base URL Intelligence**: Uses browser's `document.baseURI` for accurate relative path resolution
|
|
60
|
+
- 🔄 **Local Link Conversion**: Automatically converts HTML links to local Markdown relative paths
|
|
61
|
+
- 🧹 **Clean Output**: Preserves tables, code blocks, images, links, and heading hierarchies
|
|
62
|
+
- 🔒 **SSL Flexibility**: Handles sites with certificate issues gracefully
|
|
63
|
+
- 🌍 **Cross-Platform**: Works on Windows, macOS, and Linux (Python 3.8+)
|
|
64
|
+
- 📋 **Universal Compatibility**: Generated Markdown works with Typora, Obsidian, VS Code, and more
|
|
65
|
+
|
|
66
|
+
## 📦 Installation
|
|
67
|
+
|
|
68
|
+
### Option 1: Install from PyPI (Recommended)
|
|
69
|
+
```bash
|
|
70
|
+
pip3 install web2md
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### Option 2: Install from Source (For Development)
|
|
74
|
+
```bash
|
|
75
|
+
git clone https://github.com/floatinghotpot/web2md.git
|
|
76
|
+
cd web2md
|
|
77
|
+
python3 -m pip install -e .
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
### Required: Install Playwright Browser
|
|
81
|
+
```bash
|
|
82
|
+
# Install Chromium driver (required for JavaScript rendering)
|
|
83
|
+
python3 -m playwright install chromium
|
|
84
|
+
|
|
85
|
+
# Linux only: Install system dependencies
|
|
86
|
+
python3 -m playwright install-deps chromium
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## 🚀 Quick Start
|
|
90
|
+
|
|
91
|
+
### Basic Usage
|
|
92
|
+
```bash
|
|
93
|
+
# Crawl a single page (auto-generated save directory)
|
|
94
|
+
web2md https://docs.python.org/3/tutorial/
|
|
95
|
+
|
|
96
|
+
# Specify custom save directory
|
|
97
|
+
web2md https://docs.python.org/3/tutorial/ ./python-docs
|
|
98
|
+
|
|
99
|
+
# Crawl with images
|
|
100
|
+
web2md https://example.com/docs --picture
|
|
101
|
+
|
|
102
|
+
# Limit crawl depth and count
|
|
103
|
+
web2md https://example.com/docs --depth 2 --count 10
|
|
104
|
+
|
|
105
|
+
# Crawl with images and videos
|
|
106
|
+
web2md https://example.com/docs --picture --video --depth 3
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Show Help
|
|
110
|
+
```bash
|
|
111
|
+
web2md -h
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## 📖 Usage
|
|
115
|
+
|
|
116
|
+
### Command Syntax
|
|
117
|
+
```
|
|
118
|
+
web2md URL [SAVE_DIR] [OPTIONS]
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Arguments
|
|
122
|
+
|
|
123
|
+
| Argument | Required | Description |
|
|
124
|
+
|----------|----------|-------------|
|
|
125
|
+
| `web_url` | ✅ Yes | Target webpage URL (must start with http/https) |
|
|
126
|
+
| `save_folder` | ❌ No | Local save directory (auto-generated from URL if omitted) |
|
|
127
|
+
|
|
128
|
+
### Options
|
|
129
|
+
|
|
130
|
+
| Option | Default | Description |
|
|
131
|
+
|--------|---------|-------------|
|
|
132
|
+
| `--depth N` | `5` | Maximum relative crawl depth from base URL |
|
|
133
|
+
| `--count N` | `999` | Maximum number of pages to crawl (0 = unlimited) |
|
|
134
|
+
| `--picture` | `False` | Download and save images to local `images/` directory |
|
|
135
|
+
| `--video` | `False` | Download and save videos to local `videos/` directory |
|
|
136
|
+
| `-h, --help` | - | Show help message and exit |
|
|
137
|
+
|
|
138
|
+
### Examples
|
|
139
|
+
|
|
140
|
+
#### 1. Depth-Limited Crawl (Default Page Count)
|
|
141
|
+
```bash
|
|
142
|
+
web2md https://company.com/docs/home company-docs --depth 2
|
|
143
|
+
```
|
|
144
|
+
- Crawls pages within 2 levels of `/docs/` (up to the default `--count` of 999; pass `--count 0` for no limit)
|
|
145
|
+
- Saves to `./company-docs/`
|
|
146
|
+
|
|
147
|
+
#### 2. Limited Page Count
|
|
148
|
+
```bash
|
|
149
|
+
web2md https://company.com/docs/home company-docs --depth 2 --count 5
|
|
150
|
+
```
|
|
151
|
+
- Stops after crawling 5 pages
|
|
152
|
+
- Useful for testing or sampling large sites
|
|
153
|
+
|
|
154
|
+
#### 3. Crawl with Images
|
|
155
|
+
```bash
|
|
156
|
+
web2md https://company.com/docs/home --picture --count 3
|
|
157
|
+
```
|
|
158
|
+
- Downloads images to `images/` subdirectory
|
|
159
|
+
- Converts image URLs to local relative paths in Markdown
|
|
160
|
+
|
|
161
|
+
#### 4. Auto-Generated Save Directory
|
|
162
|
+
```bash
|
|
163
|
+
web2md https://company.com/docs/home --depth 1 --count 10
|
|
164
|
+
```
|
|
165
|
+
- Auto-creates directory: `company_com_docs/`
|
|
166
|
+
|
|
167
|
+
## 🎯 How It Works
|
|
168
|
+
|
|
169
|
+
### 1. Base URL Calculation
|
|
170
|
+
The tool automatically determines a **base URL** from your target URL:
|
|
171
|
+
- Target: `https://company.com/docs/home` → Base: `https://company.com/docs/`
|
|
172
|
+
- All crawling is scoped to pages under this base URL
|
|
173
|
+
|
|
174
|
+
### 2. Intelligent Path Resolution
|
|
175
|
+
Uses the browser's `document.baseURI` to correctly resolve relative URLs:
|
|
176
|
+
- Handles `<base>` tags in HTML
|
|
177
|
+
- Respects redirects and trailing slashes
|
|
178
|
+
- Resolves lazy-loaded images with `data-src`, `srcset`, etc.
|
|
179
|
+
|
|
180
|
+
### 3. Smart Content Extraction
|
|
181
|
+
Automatically identifies core content using priority selectors:
|
|
182
|
+
1. `<main>` tag
|
|
183
|
+
2. `.article-content` or `.article_content`
|
|
184
|
+
3. `#main-content`
|
|
185
|
+
4. `.content`
|
|
186
|
+
5. `<article>` tag
|
|
187
|
+
6. Fallback to `<body>` (with cleanup)
|
|
188
|
+
|
|
189
|
+
### 4. Media Handling
|
|
190
|
+
When `--picture` or `--video` is enabled:
|
|
191
|
+
- Downloads media files to `images/` or `videos/` subdirectories
|
|
192
|
+
- Generates unique filenames with MD5 hash to prevent duplicates
|
|
193
|
+
- Converts URLs to local relative paths in Markdown
|
|
194
|
+
- Supports lazy-loading attributes: `data-src`, `data-original`, `srcset`
|
|
195
|
+
|
|
196
|
+
### 5. Filename Generation
|
|
197
|
+
MD filenames are generated from URLs:
|
|
198
|
+
- Remove base URL prefix
|
|
199
|
+
- Replace `/` with `_`
|
|
200
|
+
- Filter illegal characters
|
|
201
|
+
- Example: `https://company.com/docs/api/auth` → `api_auth.md`
|
|
202
|
+
|
|
203
|
+
## ⚙️ Configuration
|
|
204
|
+
|
|
205
|
+
### Built-in Settings (in `web2md/cli.py`)
|
|
206
|
+
|
|
207
|
+
#### Playwright Configuration
|
|
208
|
+
```python
|
|
209
|
+
PLAYWRIGHT_CONFIG = {
|
|
210
|
+
"headless": False, # Set to True for background crawling
|
|
211
|
+
"timeout": 60000, # Page load timeout (ms)
|
|
212
|
+
"wait_for_load": "networkidle", # Wait strategy
|
|
213
|
+
"sleep_after_load": 2, # Additional wait time (seconds)
|
|
214
|
+
"user_agent": "Mozilla/5.0..." # Custom user agent
|
|
215
|
+
}
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
#### Media Configuration
|
|
219
|
+
```python
|
|
220
|
+
MEDIA_CONFIG = {
|
|
221
|
+
"timeout": 30000, # Media download timeout (ms)
|
|
222
|
+
"image_dir": "images", # Image save subdirectory
|
|
223
|
+
"video_dir": "videos", # Video save subdirectory
|
|
224
|
+
"allowed_img_ext": [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp"],
|
|
225
|
+
"allowed_vid_ext": [".mp4", ".avi", ".mov", ".webm", ".flv", ".mkv"]
|
|
226
|
+
}
|
|
227
|
+
```
|
|
228
|
+
|
|
229
|
+
#### Content Filtering
|
|
230
|
+
```python
|
|
231
|
+
REMOVE_TAGS = ["nav", "header", "footer", "aside", "script", "style", "iframe", "sidebar"]
|
|
232
|
+
|
|
233
|
+
CORE_CONTENT_SELECTORS = [
|
|
234
|
+
("main", {}),
|
|
235
|
+
("div", {"class_": "article-content"}),
|
|
236
|
+
("article", {})
|
|
237
|
+
]
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
#### Crawl Defaults
|
|
241
|
+
```python
|
|
242
|
+
DEFAULT_CRAWL_CONFIG = {
|
|
243
|
+
"max_depth": 5, # Default max depth
|
|
244
|
+
"max_count": 999, # Default max pages
|
|
245
|
+
"allowed_schemes": ["http", "https"],
|
|
246
|
+
"exclude_patterns": [r"\.pdf$", r"\.zip$", r"\.exe$"]
|
|
247
|
+
}
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## 🔧 Advanced Usage
|
|
251
|
+
|
|
252
|
+
### Debug Mode (Show Browser)
|
|
253
|
+
Edit `web2md/cli.py` and make sure `headless` is `False` (set it to `True` to crawl in the background without a browser window):
|
|
254
|
+
```python
|
|
255
|
+
PLAYWRIGHT_CONFIG = {
|
|
256
|
+
"headless": False, # Shows browser window
|
|
257
|
+
...
|
|
258
|
+
}
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
### Custom Content Selectors
|
|
262
|
+
Add site-specific selectors to `CORE_CONTENT_SELECTORS`:
|
|
263
|
+
```python
|
|
264
|
+
CORE_CONTENT_SELECTORS = [
|
|
265
|
+
("main", {}),
|
|
266
|
+
("div", {"class_": "documentation-content"}), # Custom selector
|
|
267
|
+
("article", {})
|
|
268
|
+
]
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
### Anti-Bot Detection
|
|
272
|
+
Install and use `playwright-stealth`:
|
|
273
|
+
```bash
|
|
274
|
+
pip3 install playwright-stealth
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
Add to `get_dynamic_html()` in `web2md/cli.py`:
|
|
278
|
+
```python
|
|
279
|
+
from playwright_stealth import stealth_sync
|
|
280
|
+
|
|
281
|
+
page = context.new_page()
|
|
282
|
+
stealth_sync(page) # Add this line
|
|
283
|
+
page.goto(url, ...)
|
|
284
|
+
```
|
|
285
|
+
|
|
286
|
+
### Authentication
|
|
287
|
+
Add login logic in `get_dynamic_html()` before `page.goto()`:
|
|
288
|
+
```python
|
|
289
|
+
page.goto("https://example.com/login")
|
|
290
|
+
page.fill("#username", "your-username")
|
|
291
|
+
page.fill("#password", "your-password")
|
|
292
|
+
page.click("#login-button")
|
|
293
|
+
time.sleep(2)
|
|
294
|
+
```
|
|
295
|
+
|
|
296
|
+
## 🐛 Troubleshooting
|
|
297
|
+
|
|
298
|
+
### SSL Certificate Errors
|
|
299
|
+
The tool automatically disables SSL verification for downloads. If you encounter issues, check your network/firewall settings.
|
|
300
|
+
|
|
301
|
+
### Timeout Errors
|
|
302
|
+
Increase timeout in `PLAYWRIGHT_CONFIG`:
|
|
303
|
+
```python
|
|
304
|
+
"timeout": 120000, # 2 minutes
|
|
305
|
+
```
|
|
306
|
+
|
|
307
|
+
### Missing Content
|
|
308
|
+
1. Check if content is in `<main>` or common content tags
|
|
309
|
+
2. Add custom selectors to `CORE_CONTENT_SELECTORS`
|
|
310
|
+
3. Run with `headless: False` to debug visually
|
|
311
|
+
|
|
312
|
+
### Image Download Failures
|
|
313
|
+
- Verify image URLs are accessible
|
|
314
|
+
- Check if images require authentication
|
|
315
|
+
- Some CDNs may block automated downloads
|
|
316
|
+
|
|
317
|
+
## 📋 Dependencies
|
|
318
|
+
|
|
319
|
+
Automatically installed via `pip`:
|
|
320
|
+
- **playwright** - Browser automation and JS rendering
|
|
321
|
+
- **beautifulsoup4** - HTML parsing and manipulation
|
|
322
|
+
- **lxml** - Fast XML/HTML parser
|
|
323
|
+
- **markdownify** - HTML to Markdown conversion
|
|
324
|
+
- **requests** - HTTP client utilities
|
|
325
|
+
|
|
326
|
+
## 🤝 Contributing
|
|
327
|
+
|
|
328
|
+
Contributions are welcome! Please follow these steps:
|
|
329
|
+
|
|
330
|
+
1. Fork the repository
|
|
331
|
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
|
332
|
+
3. Make your changes
|
|
333
|
+
4. Run tests (if available)
|
|
334
|
+
5. Commit your changes (`git commit -m 'Add amazing feature'`)
|
|
335
|
+
6. Push to the branch (`git push origin feature/amazing-feature`)
|
|
336
|
+
7. Open a Pull Request
|
|
337
|
+
|
|
338
|
+
### Development Setup
|
|
339
|
+
```bash
|
|
340
|
+
git clone https://github.com/floatinghotpot/web2md.git
|
|
341
|
+
cd web2md
|
|
342
|
+
python3 -m pip install -e .
|
|
343
|
+
python3 -m playwright install chromium
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
## 📝 License
|
|
347
|
+
|
|
348
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
349
|
+
|
|
350
|
+
## 🙏 Acknowledgments
|
|
351
|
+
|
|
352
|
+
- [Playwright](https://playwright.dev/) for powerful browser automation
|
|
353
|
+
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for HTML parsing
|
|
354
|
+
- [markdownify](https://github.com/matthewwithanm/markdownify) for clean Markdown conversion
|
|
355
|
+
|
|
356
|
+
---
|
|
357
|
+
|
|
358
|
+
**Made with ❤️ for developers, researchers, and documentation enthusiasts.**
|
|
359
|
+
|
|
360
|
+
If you find this tool useful, please consider giving it a ⭐ on GitHub!
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
web2md/__init__.py,sha256=ZPvZpSyiMs3XKhMyFZK1FmjNpNpXnVskD3gkG0YD2aA,312
|
|
2
|
+
web2md/cli.py,sha256=VFgtGYKl_88VNgIWlSxepVJmWimnT0YM0XuTFQiZh9A,28660
|
|
3
|
+
web2md/version.py,sha256=OJykkdBYqX35QsHimnWgTkieEFMR6SFj81IYy4JmViY,1003
|
|
4
|
+
web2md/__pycache__/__init__.cpython-313.pyc,sha256=eQ8Pdd6gAr_PU5khXNRROfe8gPuYOQ7vmh-703GUEFs,486
|
|
5
|
+
web2md/__pycache__/cli.cpython-313.pyc,sha256=PZVHKkwZZr0wtfx67RT1LllsPL5RrphU0BmfYC7j4w8,32231
|
|
6
|
+
web2md/__pycache__/version.cpython-313.pyc,sha256=Re55Ql5lxw9dIsIbhTJRDYTNIGWxjanCxT4L2jJ1UsM,1790
|
|
7
|
+
web2md-0.1.0.dist-info/licenses/LICENSE,sha256=KcrJSzNA8cSHdhobkxKSComvqHZzxTBuBFyet9-jfGI,1071
|
|
8
|
+
web2md-0.1.0.dist-info/METADATA,sha256=F6y_Ky2ncFNywgqCfAz035MjK7d1xU2jJ4-jZ8-Ba5o,11187
|
|
9
|
+
web2md-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
|
|
10
|
+
web2md-0.1.0.dist-info/entry_points.txt,sha256=sO2WMqUhjpYskI6jO9HoO2l3ILG9JY-VBTEE-7_nY4k,43
|
|
11
|
+
web2md-0.1.0.dist-info/top_level.txt,sha256=tMptSjS7zehcx3WbQPakKhNZZ2t6hcDA8iqG5q_Dsbk,7
|
|
12
|
+
web2md-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Liming Xie
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
web2md
|