web2md 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
web2md/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ # meta data, align with setup.py
2
+
3
+ from .version import __version__
4
+
5
+ __author__ = "Liming Xie"
6
+ __author_email__ = "liming.xie@gmail.com"
7
+
8
+ __description__ = "A CLI tool to crawl dynamic/static websites and convert content to clean Markdown"
9
+ __url__ = "https://github.com/floatinghotpot/web2md"
10
+ __license__ = "MIT"
Binary file
web2md/cli.py ADDED
@@ -0,0 +1,612 @@
1
+ import argparse
2
+ from urllib.parse import urlparse, urljoin, unquote, urlunparse
3
+ from urllib.request import urlretrieve, build_opener, HTTPCookieProcessor, HTTPSHandler
4
+ from urllib.error import URLError
5
+ from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
6
+ from bs4 import BeautifulSoup
7
+ import markdownify
8
+ import os
9
+ import time
10
+ import re
11
+ import sys
12
+ import hashlib
13
+ import socket
14
+ import ssl
15
+
16
+ # ===================== Configurable Params (Adjust as needed) =====================
17
+ PLAYWRIGHT_CONFIG = {
18
+ "headless": False, # Set to True for background crawling (no browser window)
19
+ "timeout": 60000, # Page load timeout (ms)
20
+ "wait_for_load": "networkidle", # Wait for page full dynamic render
21
+ "sleep_after_load": 2, # Sleep after load (s) for JS render completion
22
+ "user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36"
23
+ }
24
MEDIA_CONFIG = {
    # Media download timeout in SECONDS. This value is passed unchanged to
    # socket.setdefaulttimeout() and to opener.open(timeout=...), both of which
    # take seconds (unlike the Playwright timeout above, which is milliseconds).
    "timeout": 30,
    "image_dir": "images",  # Image save subdirectory (same level as MD)
    "video_dir": "videos",  # Video save subdirectory (same level as MD)
    "allowed_img_ext": [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp"],
    "allowed_vid_ext": [".mp4", ".avi", ".mov", ".webm", ".flv", ".mkv", ".mpeg", ".mpg"]
}
31
+ # Tags to remove (keep only core content)
32
+ REMOVE_TAGS = ["nav", "header", "footer", "aside", "script", "style", "iframe", "sidebar"]
33
+ # Core content selectors (match by priority, stop on first match)
34
+ CORE_CONTENT_SELECTORS = [
35
+ ("main", {}),
36
+ ("div", {"class_": "article-content"}),
37
+ ("div", {"class_": "article_content"}),
38
+ ("div", {"id": "main-content"}),
39
+ ("div", {"class_": "content"}),
40
+ ("article", {})
41
+ ]
42
+ # Default crawl config
43
+ DEFAULT_CRAWL_CONFIG = {
44
+ "max_depth": 5, # Default max relative crawl depth
45
+ "max_count": 999, # Default max file count (0 = unlimited)
46
+ "allowed_schemes": ["http", "https"],
47
+ "exclude_patterns": [r"\.pdf$", r"\.zip$", r"\.rar$", r"\.7z$", r"\.tar$", r"\.gz$", r"\.exe$"]
48
+ }
49
+ # ==================================================================================
50
+
51
+ # Global variables (initialized once, shared across all functions)
52
+ crawled_urls = set() # Stored crawled URLs to avoid duplication
53
+ base_url = None # Dynamic base URL (parent dir of target URL, core benchmark)
54
+ base_parsed = None # Parsed result of base_url
55
+ root_save_dir = None # Local root save directory (absolute path, no affect on filename)
56
+ max_crawl_depth = None # Max relative crawl depth based on base_url
57
+ max_crawl_count = None # Max crawl file count (0 = unlimited)
58
+ crawl_picture = False # Whether to crawl pictures (--picture)
59
+ crawl_video = False # Whether to crawl videos (--video)
60
+ crawled_count = 0 # Current crawled file count (real-time statistics)
61
+
62
+ # New: Opener that disables SSL certificate verification
63
def create_ssl_unverified_opener():
    """Build a urllib opener whose HTTPS handler skips certificate checks.

    The TLS context has hostname checking switched off and its verify mode set
    to ``ssl.CERT_NONE``; a cookie processor is installed alongside the HTTPS
    handler.

    :return: The configured ``OpenerDirector``.
    """
    tls_context = ssl.create_default_context()
    # check_hostname must be cleared before verify_mode can become CERT_NONE.
    tls_context.check_hostname = False
    tls_context.verify_mode = ssl.CERT_NONE
    https_handler = HTTPSHandler(context=tls_context)
    cookie_handler = HTTPCookieProcessor()
    return build_opener(https_handler, cookie_handler)


# Module-wide opener, built once at import time and reused for media downloads.
ssl_unverified_opener = create_ssl_unverified_opener()
73
+
74
def validate_url(url):
    """Argparse type checker for the target URL.

    The URL must use the http or https scheme (case-insensitive) and must name
    a host; a bare ``http://`` used to slip through the old prefix-only check.

    :param url: Raw URL string from the command line.
    :return: The URL unchanged when it is acceptable.
    :raises argparse.ArgumentTypeError: If the scheme is not http/https or the
        host part is missing.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed authorities (e.g. a broken IPv6 literal).
        parsed = None
    has_web_scheme = parsed is not None and parsed.scheme.lower() in ("http", "https")
    if not has_web_scheme or not parsed.netloc:
        raise argparse.ArgumentTypeError(f"Invalid URL: {url} | Must start with http/https")
    return url
79
+
80
def validate_depth(depth):
    """Argparse type checker for ``--depth``.

    :param depth: Raw value from the command line.
    :return: The depth as a non-negative ``int``.
    :raises argparse.ArgumentTypeError: If the value is not an integer or is
        negative.
    """
    try:
        depth_int = int(depth)
    except (TypeError, ValueError):
        # Not convertible at all (e.g. "abc", "1.5", None): report it through
        # the same error path as a negative number.
        depth_int = -1
    if depth_int < 0:
        raise argparse.ArgumentTypeError(f"Invalid depth: {depth} | Must be non-negative integer (0,1,2...)")
    return depth_int
89
+
90
def validate_count(count):
    """Argparse type checker for ``--count``.

    :param count: Raw value from the command line.
    :return: The count as a non-negative ``int`` (0 means unlimited).
    :raises argparse.ArgumentTypeError: If the value is not an integer or is
        negative.
    """
    try:
        count_int = int(count)
    except (TypeError, ValueError):
        # Not convertible at all: report it through the same error path as a
        # negative number.
        count_int = -1
    if count_int < 0:
        raise argparse.ArgumentTypeError(f"Invalid count: {count} | Must be non-negative integer (0 = unlimited)")
    return count_int
99
+
100
def get_url_parent_dir(url):
    """Return the directory URL that serves as the crawl base for *url*.

    A path ending in ``/`` already names a directory and is its own base;
    otherwise the last path segment is dropped. Query string, params and
    fragment are removed so the result can be used as a plain prefix.

    Example: https://company.com/docs/home → https://company.com/docs/
    Example: https://company.com/docs/ → https://company.com/docs/
    Example: https://company.com → https://company.com/

    :param url: Absolute URL.
    :return: Directory URL, always ending with ``/``.
    """
    # URL paths always use '/', so use posixpath rather than the OS-specific os.path.
    import posixpath

    parsed = urlparse(url)
    path = parsed.path
    if not path or path.endswith('/'):
        # Empty path means the site root; a trailing slash means "this directory".
        parent_path = path or '/'
    else:
        parent_path = posixpath.dirname(path)
    if not parent_path.startswith('/'):
        parent_path = f"/{parent_path}"
    # Force exactly one trailing '/' for easy prefix matching.
    parent_path = parent_path.rstrip('/') + '/'
    parent_parsed = parsed._replace(path=parent_path, params='', query='', fragment='')
    return urlunparse(parent_parsed)
117
+
118
def init_global_config(target_url, save_dir, depth, count, pic, vid):
    """Populate the module-level crawl settings and print a summary of them.

    :param target_url: URL given on the command line; its parent directory
        becomes ``base_url``.
    :param save_dir: Output directory, stored as an absolute path.
    :param depth: Value stored in ``max_crawl_depth``.
    :param count: Value stored in ``max_crawl_count``.
    :param pic: Value stored in ``crawl_picture``.
    :param vid: Value stored in ``crawl_video``.
    """
    global base_url, base_parsed, root_save_dir, max_crawl_depth, max_crawl_count, crawl_picture, crawl_video
    base_url = get_url_parent_dir(target_url)
    base_parsed = urlparse(base_url)
    root_save_dir = os.path.abspath(save_dir)
    max_crawl_depth, max_crawl_count = depth, count
    crawl_picture, crawl_video = pic, vid
    # Process-wide default timeout for sockets created from here on.
    socket.setdefaulttimeout(MEDIA_CONFIG["timeout"])

    picture_state = '✅ Enabled' if crawl_picture else '❌ Disabled'
    video_state = '✅ Enabled' if crawl_video else '❌ Disabled'
    summary_lines = [
        f"🔧 Global Config Initialized",
        f" ├─ Target URL: {target_url}",
        f" ├─ Base URL (Benchmark): {base_url} (All operations based on this)",
        f" ├─ Local Save Dir: {root_save_dir}",
        f" ├─ Max Crawl Depth: {max_crawl_depth}",
        f" ├─ Max Crawl Count: {max_crawl_count} (0 = unlimited)",
        f" ├─ Crawl Pictures: {picture_state} (--picture)",
        f" └─ Crawl Videos: {video_state} (--video)",
    ]
    print("\n".join(summary_lines))
139
+
140
def generate_auto_save_dir():
    """Derive a default output directory name from the parsed base URL.

    Joins the base URL's host and path with underscores, replaces every
    character that is not a word character or ``-`` with ``_``, collapses
    underscore runs and trims them from both ends.

    :return: The sanitised name, or ``"web2md_docs"`` when nothing is left.
    """
    path_part = base_parsed.path.strip('/').replace('/', '_')
    candidate = f"{base_parsed.netloc}_{path_part}"
    candidate = re.sub(r'[^\w\-]', '_', candidate)
    candidate = re.sub(r'_+', '_', candidate).strip('_')
    return candidate or "web2md_docs"
146
+
147
def get_file_hash(url, length=8):
    """Return the first *length* hex characters of the MD5 digest of *url*.

    Used as a short suffix that keeps media filenames from different URLs apart.

    :param url: Text to hash (encoded as UTF-8).
    :param length: Number of leading hex characters to keep (default 8).
    :return: Truncated hexadecimal digest.
    """
    digest = hashlib.md5(url.encode('utf-8')).hexdigest()
    return digest[:length]
150
+
151
def get_valid_media_filename(url, default_ext=".file"):
    """Build a filesystem-safe filename for a media URL.

    The name is the URL's last path segment with forbidden characters replaced
    by ``_``, followed by a short hash of the URL and the lower-cased extension.

    :param url: Media URL.
    :param default_ext: Extension used when the URL path has none.
    :return: ``<name>_<hash><ext>``, or ``fallback_<hash><default_ext>`` if
        anything goes wrong while parsing.
    """
    try:
        url_path = urlparse(unquote(url)).path
        base_name = os.path.basename(url_path)
        if not base_name:
            base_name = f"media_{get_file_hash(url)}"
        stem, suffix = os.path.splitext(base_name)
        suffix = suffix.lower() if suffix else default_ext
        # Characters that are not allowed in filenames on common platforms.
        stem = re.sub(r'[<>:"/\\|?*]', '_', stem)
        stem = re.sub(r'_+', '_', stem).strip('_')
        return f"{stem}_{get_file_hash(url)}{suffix}"
    except Exception:
        return f"fallback_{get_file_hash(url)}{default_ext}"
164
+
165
def download_media_file(media_url, md_file_path, allowed_exts, media_type):
    """Download one image/video next to the Markdown file and return its local link.

    The file is written to ``<md dir>/<images|videos>/`` under a sanitised,
    hash-suffixed name. Media may come from any host; certificate verification
    is disabled by the shared opener. The payload is streamed to a temporary
    ``.part`` file and only renamed into place once complete, so a failed or
    interrupted download can never be mistaken for a cached file on a later run.

    :param media_url: Absolute URL of media file
    :param md_file_path: Local path of corresponding MD file
    :param allowed_exts: Allowed media extensions
    :param media_type: Media type (image/video)
    :return: Local relative path / original URL (if skipped or download failed)
    """
    import shutil

    if not (crawl_picture or crawl_video) or not media_url or not md_file_path:
        return media_url
    if not media_url.startswith(('http://', 'https://')):
        return media_url
    # Validate media extension
    ext = os.path.splitext(urlparse(unquote(media_url)).path)[1].lower()
    if ext not in allowed_exts:
        return media_url
    # Media save dir: same level as MD → images/ / videos/
    md_dir = os.path.dirname(md_file_path)
    media_dir = os.path.join(md_dir, MEDIA_CONFIG[f"{media_type}_dir"])
    os.makedirs(media_dir, exist_ok=True)
    filename = get_valid_media_filename(media_url, ext)
    save_path = os.path.join(media_dir, filename)
    rel_path = os.path.relpath(save_path, md_dir).replace(os.sep, '/')
    # Reuse a file that an earlier page (or run) already downloaded completely.
    if os.path.exists(save_path):
        return rel_path

    part_path = save_path + ".part"
    try:
        print(f"📥 Download {media_type}: {filename} (from: {media_url})")
        with ssl_unverified_opener.open(media_url, timeout=MEDIA_CONFIG["timeout"]) as response, open(part_path, 'wb') as f:
            # Stream in chunks instead of holding a whole video in memory.
            shutil.copyfileobj(response, f)
        # Atomic publish: save_path only ever exists with complete content.
        os.replace(part_path, save_path)
        return rel_path
    except socket.timeout:
        print(f"⚠️ {media_type.capitalize()} download failed: Timeout ({MEDIA_CONFIG['timeout']}s) - {media_url}")
        return media_url
    except ssl.SSLError:
        print(f"⚠️ {media_type.capitalize()} download failed: SSL Certificate Verify Failed - {media_url}")
        return media_url
    except Exception as e:
        print(f"⚠️ {media_type.capitalize()} download failed: {str(e)[:50]} - {media_url}")
        return media_url
    finally:
        # On success the .part file has been renamed away; on failure drop the leftovers.
        if os.path.exists(part_path):
            try:
                os.remove(part_path)
            except OSError:
                pass
211
+
212
def crawl_media(soup, md_file_path, current_url):
    """Download enabled media referenced by *soup* and point the tags at local copies.

    ``<img>`` tags are handled when picture crawling is on; ``<video>`` and
    ``<source>`` tags are handled when video crawling is on. Each tag's ``src``
    is replaced by whatever ``download_media_file`` returns.

    :param soup: Parsed document (modified in place).
    :param md_file_path: Path of the Markdown file the media belongs to.
    :param current_url: URL used to resolve relative media links.
    :return: The same *soup* object.
    """
    if not soup or not md_file_path:
        return soup

    def extract_best_url(tag, attrs):
        """Return the first non-empty attribute value; for srcset, the last candidate's URL."""
        for attr_name in attrs:
            raw = tag.get(attr_name, "").strip()
            if not raw:
                continue
            if attr_name == "srcset":
                # srcset looks like "url1 size1, url2 size2"; take the last entry.
                candidates = [c.strip() for c in raw.split(",") if c.strip()]
                if candidates:
                    return candidates[-1].split(" ")[0].strip()
            return raw
        return None

    image_exts = MEDIA_CONFIG["allowed_img_ext"]
    video_exts = MEDIA_CONFIG["allowed_vid_ext"]

    if crawl_picture:
        # Lazy-load attributes are tried before srcset and plain src.
        image_attrs = ["data-src", "data-original", "data-original-src", "file-src", "srcset", "src"]
        for img in soup.find_all("img"):
            found = extract_best_url(img, image_attrs)
            if not found:
                continue
            img["src"] = download_media_file(urljoin(current_url, found), md_file_path, image_exts, "image")

    if not crawl_video:
        return soup

    for video in soup.find_all("video"):
        found = extract_best_url(video, ["src", "data-src"])
        if not found:
            continue
        video["src"] = download_media_file(urljoin(current_url, found), md_file_path, video_exts, "video")

    for source in soup.find_all("source"):
        found = extract_best_url(source, ["src", "srcset"])
        if not found:
            continue
        absolute = urljoin(current_url, found)
        suffix = os.path.splitext(urlparse(unquote(absolute)).path)[1].lower()
        if suffix in video_exts:
            source["src"] = download_media_file(absolute, md_file_path, video_exts, "video")
        elif suffix in image_exts and crawl_picture:
            source["src"] = download_media_file(absolute, md_file_path, image_exts, "image")
    return soup
261
+
262
def get_dynamic_html(url):
    """Load *url* in Chromium via Playwright and return the rendered HTML.

    A fresh browser is launched per call with the configured user agent, a
    1920x1080 viewport, a Referer header set to ``base_url`` and HTTPS errors
    ignored. After navigation the function sleeps for ``sleep_after_load``
    seconds before reading the page.

    :param url: Page URL to load.
    :return: ``(html, final_url, base_uri)`` on success, where ``base_uri`` is
        the browser's ``document.baseURI`` (falling back to ``final_url``);
        ``(None, None, None)`` on timeout or any other failure.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=PLAYWRIGHT_CONFIG["headless"])
            try:
                context = browser.new_context(
                    user_agent=PLAYWRIGHT_CONFIG["user_agent"],
                    viewport={"width": 1920, "height": 1080},
                    extra_http_headers={"Referer": base_url},
                    ignore_https_errors=True
                )
                page = context.new_page()
                page.goto(
                    url,
                    timeout=PLAYWRIGHT_CONFIG["timeout"],
                    wait_until=PLAYWRIGHT_CONFIG["wait_for_load"]
                )
                time.sleep(PLAYWRIGHT_CONFIG["sleep_after_load"])
                html = page.content()
                final_url = page.url
                # Get the actual base URI used by the browser (handles <base> tags and redirects)
                base_uri = page.evaluate("document.baseURI") or final_url
                context.close()
            finally:
                # Close the browser even when navigation or evaluation raised.
                browser.close()
        print(f"✅ Page loaded successfully: {url}")
        return html, final_url, base_uri
    except PlaywrightTimeoutError:
        print(f"❌ Page load timeout: Exceed {PLAYWRIGHT_CONFIG['timeout']/1000}s - {url}")
        return None, None, None
    except Exception as e:
        print(f"❌ Page request failed: {str(e)[:80]} - {url}")
        return None, None, None
296
+
297
def calculate_relative_depth(url):
    """Return how many path segments *url* lies below ``base_url``.

    Both paths are percent-decoded before comparison so that a base such as
    ``/my%20docs/`` matches targets under it, and host names are compared
    case-insensitively.

    :param url: Absolute URL to measure.
    :return: Relative depth (0 = base_url itself); -1 when the URL is empty,
        the base is not initialised, the host differs, or the path is not
        under the base path.
    """
    if not url or not base_parsed:
        return -1
    parsed = urlparse(url)
    # Filter different domain names (host names are case-insensitive)
    if parsed.netloc.lower() != base_parsed.netloc.lower():
        return -1
    # Normalise both sides the same way: decoded, exactly one trailing '/'
    base_path = unquote(base_parsed.path).rstrip('/') + '/'
    target_path = unquote(parsed.path).rstrip('/') + '/'
    if not target_path.startswith(base_path):
        return -1
    relative_path = target_path[len(base_path):]
    # An empty remainder yields no segments, i.e. depth 0 for the base itself.
    return len([seg for seg in relative_path.split('/') if seg.strip()])
318
+
319
def is_allowed_url(url):
    """Decide whether *url* may still be crawled.

    A URL is rejected when the crawl-count limit has been reached (0 means no
    limit), when it is empty, when its scheme is not allowed, when its depth
    relative to ``base_url`` is invalid or exceeds ``max_crawl_depth``, when it
    matches an excluded pattern, or when it has already been crawled.

    :param url: Absolute URL to check.
    :return: True (allowed) / False (forbidden)
    """
    limit_reached = max_crawl_count > 0 and crawled_count >= max_crawl_count
    if limit_reached or not url:
        return False

    if urlparse(url).scheme not in DEFAULT_CRAWL_CONFIG["allowed_schemes"]:
        return False

    # Pages must live under base_url and within the configured depth.
    relative_depth = calculate_relative_depth(url)
    if not 0 <= relative_depth <= max_crawl_depth:
        return False

    excluded = any(
        re.search(pattern, url, re.IGNORECASE)
        for pattern in DEFAULT_CRAWL_CONFIG["exclude_patterns"]
    )
    if excluded:
        return False

    return url not in crawled_urls
348
+
349
def extract_allowed_links(html, base_uri):
    """Collect the crawlable links found in *html*.

    Each ``<a href>`` is resolved against *base_uri* and stripped of its
    ``#fragment`` (a fragment addresses a position inside a page, not another
    page), then kept only if ``is_allowed_url`` accepts it.

    :param html: Page HTML.
    :param base_uri: Base used to resolve relative links.
    :return: Set of absolute, fragment-free URLs; empty when input is missing.
    """
    from urllib.parse import urldefrag

    if not html or not base_uri:
        return set()
    allowed_links = set()
    soup = BeautifulSoup(html, "lxml")
    for a in soup.find_all("a", href=True):
        href = a.get("href", "").strip()
        # Filter mail/tel/JS/anchor links
        if not href or href.startswith(('mailto:', 'tel:', 'javascript:', '#')):
            continue
        # Without defragmenting, page#a and page#b would be crawled as two pages.
        abs_url = urldefrag(urljoin(base_uri, href))[0]
        if is_allowed_url(abs_url):
            allowed_links.add(abs_url)
    return allowed_links
365
+
366
def url_to_md_filename(url):
    """Map a page URL to its Markdown filename relative to ``base_url``.

    The lower-cased URL has the lower-cased ``base_url`` prefix removed; if the
    prefix does not match, the last path segment of the URL is used (or
    ``"unknown"``). Slashes and filename-unsafe characters become ``_``,
    underscore runs are collapsed and trimmed, and ``.md`` is appended.

    Example: https://company.com/docs/home → home.md
    Example: https://company.com/docs/home/sub → home_sub.md
    Example: https://company.com/docs/ → index.md

    :param url: Page URL.
    :return: Filename ending in ``.md``; ``index.md`` when nothing remains.
    """
    lowered_url = url.lower()
    prefix = base_url.lower()
    if lowered_url.startswith(prefix):
        stem = lowered_url[len(prefix):].rstrip('/')
    else:
        stem = os.path.basename(urlparse(url).path).rstrip('/') or "unknown"

    # URL equals base_url: nothing left to name the file after.
    if not stem:
        return "index.md"

    stem = re.sub(r'[<>:"/\\|?*]', '_', stem.replace('/', '_'))
    stem = re.sub(r'_+', '_', stem).strip('_')
    return f"{stem}.md" if stem else "index.md"
390
+
391
def get_md_file_path(url):
    """Return the absolute path of the Markdown file for *url*.

    Normally this is ``root_save_dir`` joined with ``url_to_md_filename(url)``.
    When the save directory is not configured or *url* is empty, a fallback
    name is produced instead; it lives in ``root_save_dir`` if set, otherwise
    in the current working directory, and uses a digest of the URL that is
    stable across runs (the built-in ``hash()`` of a string is not).

    :param url: Page URL.
    :return: Absolute file path, always directly inside the save directory.
    """
    if not root_save_dir or not url:
        fallback_dir = root_save_dir or os.getcwd()
        digest = hashlib.md5(str(url).encode('utf-8')).hexdigest()[:8]
        fallback = os.path.join(fallback_dir, f"unknown_{digest}.md")
        print(f"⚠️ Config missing, use fallback MD path: {os.path.basename(fallback)}")
        return fallback
    # Core: Only root save dir + legal filename (no other splicing)
    md_filename = url_to_md_filename(url)
    save_root = os.path.abspath(root_save_dir)
    md_file_path = os.path.abspath(os.path.join(save_root, md_filename))
    # Prevent path traversal: the file must sit directly in the save dir.
    # Comparing parent directories avoids the "/out" vs "/out2" false positive
    # of a plain string-prefix test.
    if os.path.normcase(os.path.dirname(md_file_path)) != os.path.normcase(save_root):
        md_file_path = os.path.join(save_root, os.path.basename(md_file_path) or "index.md")
    return md_file_path
405
+
406
def fix_local_links(html, current_url, base_uri):
    """Rewrite in-scope ``<a>`` links so they point at the local Markdown files.

    A link is rewritten when its target is inside the crawl scope (allowed
    scheme, under ``base_url`` within ``max_crawl_depth``, not an excluded file
    type). Unlike ``is_allowed_url``, this check deliberately ignores whether
    the target was already crawled or whether the count limit was hit:
    already-saved pages are exactly the ones a local link should reach. Any
    ``#fragment`` is kept on the rewritten link instead of leaking into the
    filename.

    :param html: Page HTML.
    :param current_url: URL identifying the page being converted.
    :param base_uri: Base used to resolve relative links.
    :return: HTML string with rewritten links (input returned as-is when any
        argument or the save directory is missing).
    """
    from urllib.parse import urldefrag

    if not html or not current_url or not root_save_dir:
        return html

    def in_crawl_scope(candidate):
        """True when *candidate* belongs to the set of pages this crawl can save."""
        if urlparse(candidate).scheme not in DEFAULT_CRAWL_CONFIG["allowed_schemes"]:
            return False
        depth = calculate_relative_depth(candidate)
        if depth < 0 or depth > max_crawl_depth:
            return False
        return not any(
            re.search(pattern, candidate, re.IGNORECASE)
            for pattern in DEFAULT_CRAWL_CONFIG["exclude_patterns"]
        )

    soup = BeautifulSoup(html, "lxml")
    current_md_path = get_md_file_path(current_url)
    current_md_dir = os.path.dirname(current_md_path)

    for a in soup.find_all("a", href=True):
        href = a.get("href", "").strip()
        if not href or href.startswith(('mailto:', 'tel:', 'javascript:', '#')):
            continue
        # Resolve against base_uri; split off the fragment so it cannot end up in the filename
        abs_url, fragment = urldefrag(urljoin(base_uri, href))
        if in_crawl_scope(abs_url):
            target_md_path = get_md_file_path(abs_url)
            rel_link = os.path.relpath(target_md_path, current_md_dir).replace(os.sep, '/')
            a["href"] = f"{rel_link}#{fragment}" if fragment else rel_link
    return str(soup)
425
+
426
def extract_core_content(html, md_file_path, base_uri):
    """Strip boilerplate tags, localise media if enabled, and return the main content HTML.

    Selectors in ``CORE_CONTENT_SELECTORS`` are tried in order and the first
    match wins; if none matches, the whole ``<body>`` is used.

    :param html: Page HTML.
    :param md_file_path: Path of the Markdown file (used to place media files).
    :param base_uri: Base used to resolve relative media links.
    :return: HTML string of the selected element, or None when *html* is empty
        or no ``<body>`` exists.
    """
    if not html:
        return None
    soup = BeautifulSoup(html, "lxml")
    # Remove useless tags to simplify content
    for tag in REMOVE_TAGS:
        for elem in soup.find_all(tag):
            elem.decompose()
    # Crawl media and replace local links if enabled
    if crawl_picture or crawl_video:
        soup = crawl_media(soup, md_file_path, base_uri)
    # Match core content selectors by priority
    core_content = None
    for tag, attrs in CORE_CONTENT_SELECTORS:
        # The selector dicts use BeautifulSoup's keyword spelling ("class_"), so
        # they must be expanded as keyword arguments. Passing them through
        # attrs= would search for an attribute literally named "class_" and
        # never match.
        core_content = soup.find(tag, **attrs)
        if core_content:
            print(f"✅ Core content matched: <{tag} {attrs}>")
            break
    # Fallback: Extract entire body if no selector matched
    if not core_content:
        core_content = soup.find("body")
        if not core_content:
            print(f"❌ No extractable content found")
            return None
        print(f"⚠️ No precise selector matched, extract entire <body> content")
    return str(core_content)
453
+
454
def html2md(html_content):
    """Convert an HTML fragment to Markdown.

    Trailing whitespace is removed from every line and runs of blank lines are
    collapsed to a single blank line. Blank lines are NOT removed entirely:
    Markdown relies on them to separate paragraphs, lists, tables and fenced
    code blocks.

    :param html_content: HTML string to convert.
    :return: Markdown text, or None when the input is empty or conversion fails.
    """
    if not html_content:
        return None
    try:
        # Only options markdownify defines are passed; the previously supplied
        # convert_ol/convert_ul/convert_table/convert_image/convert_video/
        # link_style/convert_br keys are not markdownify options and had no effect.
        md_content = markdownify.markdownify(
            html_content,
            heading_style="ATX",       # MD heading style: # H1, ## H2
            bullets="-*+",             # Unordered list symbols per nesting level
            code_language="python",    # Default code block language
        )
        stripped_lines = [line.rstrip() for line in md_content.splitlines()]
        tidy = re.sub(r"\n{3,}", "\n\n", "\n".join(stripped_lines))
        return tidy.strip()
    except Exception as e:
        print(f"❌ HTML to Markdown conversion failed: {str(e)[:80]}")
        return None
478
+
479
def save_md_file(md_content, url):
    """Write *md_content* to the Markdown file belonging to *url*.

    Nothing is written when content or URL is empty, or when the crawl-count
    limit has already been reached. The global crawl counter is incremented
    only after a successful write.

    :param md_content: Markdown text to store.
    :param url: Page URL the content came from.
    :return: MD file path (success) / False (failed or skipped)
    """
    global crawled_count

    if not (md_content and url):
        print(f"❌ Skip MD save: Empty content or URL - {url}")
        return False

    quota_exhausted = max_crawl_count > 0 and crawled_count >= max_crawl_count
    if quota_exhausted:
        print(f"❌ Skip MD save: Reach max crawl count ({max_crawl_count}) - {url}")
        return False

    target_path = get_md_file_path(url)
    target_name = os.path.basename(target_path)
    try:
        with open(target_path, "w", encoding="utf-8") as handle:
            handle.write(md_content)
        crawled_count += 1
        print(f"✅ MD file saved successfully: {target_name} (Target: {url}) [Count: {crawled_count}]")
        return target_path
    except IOError as e:
        print(f"❌ MD file save failed: {str(e)[:80]} - {target_name}")
        return False
503
+
504
def crawl_page_recursive(url):
    """Crawl *url*, save it as Markdown, then recurse depth-first into its sublinks.

    The call returns early when the crawl-count limit is reached, when the URL
    is empty, not allowed or already crawled, or when loading, conversion or
    saving of the page fails.

    :param url: Absolute page URL.
    """
    def quota_exhausted():
        """True once a positive crawl-count limit has been met."""
        return max_crawl_count > 0 and crawled_count >= max_crawl_count

    if quota_exhausted():
        print(f"🔴 Crawl stopped: Reach max crawl count ({max_crawl_count})")
        return
    if not url or not is_allowed_url(url) or url in crawled_urls:
        return
    crawled_urls.add(url)

    # Render the page; also yields the post-redirect URL and the browser's base URI.
    html, final_url, page_base_url = get_dynamic_html(url)
    if not html:
        return

    # Collect sublinks before the anchors are rewritten to local paths.
    sub_links = extract_allowed_links(html, page_base_url)
    localized_html = fix_local_links(html, final_url, page_base_url)

    # Reduce to core content and convert.
    planned_md_path = get_md_file_path(url)
    core_html = extract_core_content(localized_html, planned_md_path, page_base_url)
    md_content = html2md(core_html)
    if not md_content:
        return

    if not save_md_file(md_content, url):
        return

    if not sub_links or quota_exhausted():
        return
    current_depth = calculate_relative_depth(url)
    print(f"\n🔍 Found {len(sub_links)} legal subpages, start recursive crawling (Current Depth: {current_depth})")
    for sub_url in sorted(sub_links):
        if quota_exhausted():
            break
        crawl_page_recursive(sub_url)
549
+
550
def main():
    """Main function: Parse CLI args → Init config → Start crawling.

    When no save folder is given, the directory name is derived from the base
    URL. ``generate_auto_save_dir()`` reads the module-level ``base_parsed``,
    so that global is populated from the target URL *before* the name is
    generated (previously it was still None at that point and the call crashed).

    Exits with status 1 if the crawl aborts with an unexpected exception.
    """
    global base_url, base_parsed

    parser = argparse.ArgumentParser(
        prog="web2md",
        description="📄 Dynamic Webpage to Markdown Crawler | Dynamic Base URL | Exact Filename | Media Crawl On Demand",
        formatter_class=argparse.RawTextHelpFormatter,
        epilog="===== Core Rules =====\n"
               "1. Auto set parent dir of target URL as base_url (all operations based on this)\n"
               "2. MD Filename: Remove base_url prefix → replace / with _ → filter illegal chars → suffix .md\n"
               "3. Crawl Scope: Same domain as base_url + relative depth ≤ --depth + count ≤ --count\n"
               "4. Media Scope: No parent dir restriction (images/videos can be from any domain)\n"
               "===== Usage Examples =====\n"
               " 1. Unlimited crawl: web2md https://company.com/docs/home company-docs --depth 2\n"
               " 2. Limit 5 files: web2md https://company.com/docs/home company-docs --depth 2 --count 5\n"
               " 3. Crawl MD + pictures (limit 3 files): web2md https://company.com/docs/home --picture --count 3\n"
               " 4. Auto save dir: web2md https://company.com/docs/home --depth 1 --count 10"
    )
    # Mandatory arg: Target URL
    parser.add_argument("web_url", type=validate_url, help="Target webpage URL (must start with http/https)")
    # Optional arg: Local save directory (auto generate if omitted)
    parser.add_argument("save_folder", nargs='?', help="Local root save directory for MD files (optional)")
    # Optional args: Crawl depth, count, picture, video
    parser.add_argument("--depth", type=validate_depth, default=DEFAULT_CRAWL_CONFIG["max_depth"],
                        help=f"Max relative crawl depth based on base_url (default: {DEFAULT_CRAWL_CONFIG['max_depth']})")
    parser.add_argument("--count", type=validate_count, default=DEFAULT_CRAWL_CONFIG["max_count"],
                        help=f"Max crawl file count (0 = unlimited, default: {DEFAULT_CRAWL_CONFIG['max_count']})")
    parser.add_argument("--picture", action="store_true", help="Crawl page pictures, save to MD same-level 'images/' dir")
    parser.add_argument("--video", action="store_true", help="Crawl page videos, save to MD same-level 'videos/' dir")

    args = parser.parse_args()

    # Determine local save directory (auto generate if omitted)
    if args.save_folder:
        save_dir = args.save_folder
    else:
        # The auto-generated name depends on base_parsed, which
        # init_global_config() only sets further down; seed it here first.
        base_url = get_url_parent_dir(args.web_url)
        base_parsed = urlparse(base_url)
        save_dir = generate_auto_save_dir()
    os.makedirs(save_dir, exist_ok=True)
    print(f"📁 Local save directory created: {os.path.abspath(save_dir)}\n")

    # Initialize global config
    init_global_config(args.web_url, save_dir, args.depth, args.count, args.picture, args.video)

    # Start crawling
    print(f"\n🚀 Start Crawling (Base URL: {base_url} | Max Depth: {args.depth} | Max Count: {args.count})")
    print("-" * 80)
    try:
        crawl_page_recursive(args.web_url)
    except Exception as e:
        print(f"\n❌ Crawl aborted unexpectedly: {str(e)}")
        sys.exit(1)

    # Crawl completion statistics
    print("-" * 80)
    print(f"\n🎉 Crawl Task Completed!")
    print(f"📊 Statistics: Total crawled {crawled_count} valid pages")
    print(f"📂 All files saved to: {root_save_dir}")
    if crawl_picture or crawl_video:
        media_tips = []
        if crawl_picture:
            media_tips.append("Pictures (images/)")
        if crawl_video:
            media_tips.append("Videos (videos/)")
        print(f"📌 Crawled {'+'.join(media_tips)}, saved to MD same-level directories (no parent dir restriction)")
    print(f"\n💡 Tip: Open {root_save_dir} to view generated MD files and media resources")


if __name__ == "__main__":
    main()
web2md/version.py ADDED
@@ -0,0 +1,33 @@
1
+ # -*- coding: utf-8; py-indent-offset:4 -*-
2
+ from __future__ import (absolute_import, division, print_function, unicode_literals)
3
+
4
+ import os
5
+ import sys
6
+
7
+ __version__ = '0.1.0'
8
+
9
+ __hqversion__ = tuple(int(x) for x in __version__.split('.'))
10
+
11
# Running this file as a script with the argument "bump" increments the last
# component of __version__ and rewrites this file in place.
if __name__ == "__main__":
    if len(sys.argv) > 1 and sys.argv[1] == 'bump':
        with open(__file__, 'r', encoding='utf-8') as fp:
            lines = fp.readlines()

        current_version = new_version = None
        updated_lines = []
        for line in lines:
            # Only the assignment at column 0 matches; it must stay single-quoted
            # because the version is extracted by splitting on "'".
            if line.startswith('__version__'):
                items = line.split("'")
                current_version = items[1]
                vers = current_version.split('.')
                vers[-1] = str(int(vers[-1]) + 1)
                new_version = items[1] = '.'.join(vers)
                line = "'".join(items)
            updated_lines.append(line)

        if new_version is None:
            # Refuse to rewrite the file when there was nothing to bump.
            sys.exit('No __version__ assignment found in ' + __file__)

        with open(__file__, 'w', encoding='utf-8') as fp:
            fp.writelines(updated_lines)
        print('Current version:', current_version)
        print('Bumped to:', new_version)
        print('File updated:', __file__, '\n')
@@ -0,0 +1,360 @@
1
+ Metadata-Version: 2.4
2
+ Name: web2md
3
+ Version: 0.1.0
4
+ Summary: A CLI tool to crawl dynamic/static websites and convert content to clean Markdown
5
+ Home-page: https://github.com/floatinghotpot/web2md
6
+ Author: Liming Xie
7
+ Author-email: liming.xie@gmail.com
8
+ License: MIT
9
+ Keywords: crawler,markdown,web2md,scraper,dynamic website,html2md
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: End Users/Desktop
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: OS Independent
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Internet :: WWW/HTTP
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Text Processing :: Markup :: HTML
24
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: playwright>=1.40.0
29
+ Requires-Dist: beautifulsoup4>=4.12.0
30
+ Requires-Dist: markdownify>=0.11.6
31
+ Requires-Dist: lxml>=4.9.0
32
+ Requires-Dist: requests>=2.31.0
33
+ Dynamic: author
34
+ Dynamic: author-email
35
+ Dynamic: classifier
36
+ Dynamic: description
37
+ Dynamic: description-content-type
38
+ Dynamic: home-page
39
+ Dynamic: keywords
40
+ Dynamic: license
41
+ Dynamic: license-file
42
+ Dynamic: requires-dist
43
+ Dynamic: requires-python
44
+ Dynamic: summary
45
+
46
+ # web2md
47
+
48
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
49
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
50
+
51
+ A powerful, intelligent CLI tool to crawl **dynamic and static websites** with full JavaScript rendering support and convert them to clean, well-formatted Markdown files. Perfect for archiving documentation, creating offline knowledge bases, and preserving web content.
52
+
53
+ ## ✨ Key Features
54
+
55
+ - 🚀 **Dynamic Site Support**: Full JavaScript rendering via Playwright (Vue/React/Angular/Next.js)
56
+ - 🎯 **Smart Content Extraction**: Automatically identifies and extracts core content, removing navigation, ads, and sidebars
57
+ - 🔗 **Recursive Crawling**: Intelligently crawls subpages with configurable depth and count limits
58
+ - 🖼️ **Media Downloads**: Optional image and video downloading with lazy-loading support
59
+ - 📐 **Base URL Intelligence**: Uses browser's `document.baseURI` for accurate relative path resolution
60
+ - 🔄 **Local Link Conversion**: Automatically converts HTML links to local Markdown relative paths
61
+ - 🧹 **Clean Output**: Preserves tables, code blocks, images, links, and heading hierarchies
62
+ - 🔒 **SSL Flexibility**: Handles sites with certificate issues gracefully
63
+ - 🌍 **Cross-Platform**: Works on Windows, macOS, and Linux (Python 3.8+)
64
+ - 📋 **Universal Compatibility**: Generated Markdown works with Typora, Obsidian, VS Code, and more
65
+
66
+ ## 📦 Installation
67
+
68
+ ### Option 1: Install from PyPI (Recommended)
69
+ ```bash
70
+ pip3 install web2md
71
+ ```
72
+
73
+ ### Option 2: Install from Source (For Development)
74
+ ```bash
75
+ git clone https://github.com/floatinghotpot/web2md.git
76
+ cd web2md
77
+ python3 -m pip install -e .
78
+ ```
79
+
80
+ ### Required: Install Playwright Browser
81
+ ```bash
82
+ # Install Chromium driver (required for JavaScript rendering)
83
+ python3 -m playwright install chromium
84
+
85
+ # Linux only: Install system dependencies
86
+ python3 -m playwright install-deps chromium
87
+ ```
88
+
89
+ ## 🚀 Quick Start
90
+
91
+ ### Basic Usage
92
+ ```bash
93
+ # Crawl a single page (auto-generated save directory)
94
+ web2md https://docs.python.org/3/tutorial/
95
+
96
+ # Specify custom save directory
97
+ web2md https://docs.python.org/3/tutorial/ ./python-docs
98
+
99
+ # Crawl with images
100
+ web2md https://example.com/docs --picture
101
+
102
+ # Limit crawl depth and count
103
+ web2md https://example.com/docs --depth 2 --count 10
104
+
105
+ # Crawl with images and videos
106
+ web2md https://example.com/docs --picture --video --depth 3
107
+ ```
108
+
109
+ ### Show Help
110
+ ```bash
111
+ web2md -h
112
+ ```
113
+
114
+ ## 📖 Usage
115
+
116
+ ### Command Syntax
117
+ ```
118
+ web2md [URL] [SAVE_DIR] [OPTIONS]
119
+ ```
120
+
121
+ ### Arguments
122
+
123
+ | Argument | Required | Description |
124
+ |----------|----------|-------------|
125
+ | `web_url` | ✅ Yes | Target webpage URL (must start with http/https) |
126
+ | `save_folder` | ❌ No | Local save directory (auto-generated from URL if omitted) |
127
+
128
+ ### Options
129
+
130
+ | Option | Default | Description |
131
+ |--------|---------|-------------|
132
+ | `--depth N` | `5` | Maximum relative crawl depth from base URL |
133
+ | `--count N` | `999` | Maximum number of pages to crawl (0 = unlimited) |
134
+ | `--picture` | `False` | Download and save images to local `images/` directory |
135
+ | `--video` | `False` | Download and save videos to local `videos/` directory |
136
+ | `-h, --help` | - | Show help message and exit |
137
+
138
+ ### Examples
139
+
140
+ #### 1. Unlimited Crawl with Depth Limit
141
+ ```bash
142
+ web2md https://company.com/docs/home company-docs --depth 2
143
+ ```
144
+ - Crawls all pages within 2 levels of `/docs/`
145
+ - Saves to `./company-docs/`
146
+
147
+ #### 2. Limited Page Count
148
+ ```bash
149
+ web2md https://company.com/docs/home company-docs --depth 2 --count 5
150
+ ```
151
+ - Stops after crawling 5 pages
152
+ - Useful for testing or sampling large sites
153
+
154
+ #### 3. Crawl with Images
155
+ ```bash
156
+ web2md https://company.com/docs/home --picture --count 3
157
+ ```
158
+ - Downloads images to `images/` subdirectory
159
+ - Converts image URLs to local relative paths in Markdown
160
+
161
+ #### 4. Auto-Generated Save Directory
162
+ ```bash
163
+ web2md https://company.com/docs/home --depth 1 --count 10
164
+ ```
165
+ - Auto-creates directory: `company_com_docs/`
166
+
167
+ ## 🎯 How It Works
168
+
169
+ ### 1. Base URL Calculation
170
+ The tool automatically determines a **base URL** from your target URL:
171
+ - Target: `https://company.com/docs/home` → Base: `https://company.com/docs/`
172
+ - All crawling is scoped to pages under this base URL
173
+
174
+ ### 2. Intelligent Path Resolution
175
+ Uses the browser's `document.baseURI` to correctly resolve relative URLs:
176
+ - Handles `<base>` tags in HTML
177
+ - Respects redirects and trailing slashes
178
+ - Resolves lazy-loaded images with `data-src`, `srcset`, etc.
179
+
180
+ ### 3. Smart Content Extraction
181
+ Automatically identifies core content using priority selectors:
182
+ 1. `<main>` tag
183
+ 2. `.article-content` or `.article_content`
184
+ 3. `#main-content`
185
+ 4. `.content`
186
+ 5. `<article>` tag
187
+ 6. Fallback to `<body>` (with cleanup)
188
+
189
+ ### 4. Media Handling
190
+ When `--picture` or `--video` is enabled:
191
+ - Downloads media files to `images/` or `videos/` subdirectories
192
+ - Generates unique filenames with MD5 hash to prevent duplicates
193
+ - Converts URLs to local relative paths in Markdown
194
+ - Supports lazy-loading attributes: `data-src`, `data-original`, `srcset`
195
+
196
+ ### 5. Filename Generation
197
+ MD filenames are generated from URLs:
198
+ - Remove base URL prefix
199
+ - Replace `/` with `_`
200
+ - Filter illegal characters
201
+ - Example: `https://company.com/docs/api/auth` → `api_auth.md`
202
+
203
+ ## ⚙️ Configuration
204
+
205
+ ### Built-in Settings (in `web2md/cli.py`)
206
+
207
+ #### Playwright Configuration
208
+ ```python
209
+ PLAYWRIGHT_CONFIG = {
210
+ "headless": False, # Set to True for background crawling
211
+ "timeout": 60000, # Page load timeout (ms)
212
+ "wait_for_load": "networkidle", # Wait strategy
213
+ "sleep_after_load": 2, # Additional wait time (seconds)
214
+ "user_agent": "Mozilla/5.0..." # Custom user agent
215
+ }
216
+ ```
217
+
218
+ #### Media Configuration
219
+ ```python
220
+ MEDIA_CONFIG = {
221
+ "timeout": 30000, # Media download timeout (ms)
222
+ "image_dir": "images", # Image save subdirectory
223
+ "video_dir": "videos", # Video save subdirectory
224
+ "allowed_img_ext": [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".svg", ".webp"],
225
+ "allowed_vid_ext": [".mp4", ".avi", ".mov", ".webm", ".flv", ".mkv"]
226
+ }
227
+ ```
228
+
229
+ #### Content Filtering
230
+ ```python
231
+ REMOVE_TAGS = ["nav", "header", "footer", "aside", "script", "style", "iframe", "sidebar"]
232
+
233
+ CORE_CONTENT_SELECTORS = [
234
+ ("main", {}),
235
+ ("div", {"class_": "article-content"}),
236
+ ("article", {})
237
+ ]
238
+ ```
239
+
240
+ #### Crawl Defaults
241
+ ```python
242
+ DEFAULT_CRAWL_CONFIG = {
243
+ "max_depth": 5, # Default max depth
244
+ "max_count": 999, # Default max pages
245
+ "allowed_schemes": ["http", "https"],
246
+ "exclude_patterns": [r"\.pdf$", r"\.zip$", r"\.exe$"]
247
+ }
248
+ ```
249
+
250
+ ## 🔧 Advanced Usage
251
+
252
+ ### Debug Mode (Show Browser)
253
+ Edit `web2md/cli.py` and set:
254
+ ```python
255
+ PLAYWRIGHT_CONFIG = {
256
+ "headless": False, # Shows browser window
257
+ ...
258
+ }
259
+ ```
260
+
261
+ ### Custom Content Selectors
262
+ Add site-specific selectors to `CORE_CONTENT_SELECTORS`:
263
+ ```python
264
+ CORE_CONTENT_SELECTORS = [
265
+ ("main", {}),
266
+ ("div", {"class_": "documentation-content"}), # Custom selector
267
+ ("article", {})
268
+ ]
269
+ ```
270
+
271
+ ### Anti-Bot Detection
272
+ Install and use `playwright-stealth`:
273
+ ```bash
274
+ pip3 install playwright-stealth
275
+ ```
276
+
277
+ Add to `get_dynamic_html()` in `web2md/cli.py`:
278
+ ```python
279
+ from playwright_stealth import stealth_sync
280
+
281
+ page = context.new_page()
282
+ stealth_sync(page) # Add this line
283
+ page.goto(url, ...)
284
+ ```
285
+
286
+ ### Authentication
287
+ Add login logic in `get_dynamic_html()` before `page.goto()`:
288
+ ```python
289
+ page.goto("https://example.com/login")
290
+ page.fill("#username", "your-username")
291
+ page.fill("#password", "your-password")
292
+ page.click("#login-button")
293
+ time.sleep(2)
294
+ ```
295
+
296
+ ## 🐛 Troubleshooting
297
+
298
+ ### SSL Certificate Errors
299
+ The tool automatically disables SSL verification for downloads. If you encounter issues, check your network/firewall settings.
300
+
301
+ ### Timeout Errors
302
+ Increase timeout in `PLAYWRIGHT_CONFIG`:
303
+ ```python
304
+ "timeout": 120000, # 2 minutes
305
+ ```
306
+
307
+ ### Missing Content
308
+ 1. Check if content is in `<main>` or common content tags
309
+ 2. Add custom selectors to `CORE_CONTENT_SELECTORS`
310
+ 3. Run with `headless: False` to debug visually
311
+
312
+ ### Image Download Failures
313
+ - Verify image URLs are accessible
314
+ - Check if images require authentication
315
+ - Some CDNs may block automated downloads
316
+
317
+ ## 📋 Dependencies
318
+
319
+ Automatically installed via `pip`:
320
+ - **playwright** - Browser automation and JS rendering
321
+ - **beautifulsoup4** - HTML parsing and manipulation
322
+ - **lxml** - Fast XML/HTML parser
323
+ - **markdownify** - HTML to Markdown conversion
324
+ - **requests** - HTTP client library
325
+
326
+ ## 🤝 Contributing
327
+
328
+ Contributions are welcome! Please follow these steps:
329
+
330
+ 1. Fork the repository
331
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
332
+ 3. Make your changes
333
+ 4. Run tests (if available)
334
+ 5. Commit your changes (`git commit -m 'Add amazing feature'`)
335
+ 6. Push to the branch (`git push origin feature/amazing-feature`)
336
+ 7. Open a Pull Request
337
+
338
+ ### Development Setup
339
+ ```bash
340
+ git clone https://github.com/floatinghotpot/web2md.git
341
+ cd web2md
342
+ python3 -m pip install -e .
343
+ python3 -m playwright install chromium
344
+ ```
345
+
346
+ ## 📝 License
347
+
348
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
349
+
350
+ ## 🙏 Acknowledgments
351
+
352
+ - [Playwright](https://playwright.dev/) for powerful browser automation
353
+ - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) for HTML parsing
354
+ - [markdownify](https://github.com/matthewwithanm/markdownify) for clean Markdown conversion
355
+
356
+ ---
357
+
358
+ **Made with ❤️ for developers, researchers, and documentation enthusiasts.**
359
+
360
+ If you find this tool useful, please consider giving it a ⭐ on GitHub!
@@ -0,0 +1,12 @@
1
+ web2md/__init__.py,sha256=ZPvZpSyiMs3XKhMyFZK1FmjNpNpXnVskD3gkG0YD2aA,312
2
+ web2md/cli.py,sha256=VFgtGYKl_88VNgIWlSxepVJmWimnT0YM0XuTFQiZh9A,28660
3
+ web2md/version.py,sha256=OJykkdBYqX35QsHimnWgTkieEFMR6SFj81IYy4JmViY,1003
4
+ web2md/__pycache__/__init__.cpython-313.pyc,sha256=eQ8Pdd6gAr_PU5khXNRROfe8gPuYOQ7vmh-703GUEFs,486
5
+ web2md/__pycache__/cli.cpython-313.pyc,sha256=PZVHKkwZZr0wtfx67RT1LllsPL5RrphU0BmfYC7j4w8,32231
6
+ web2md/__pycache__/version.cpython-313.pyc,sha256=Re55Ql5lxw9dIsIbhTJRDYTNIGWxjanCxT4L2jJ1UsM,1790
7
+ web2md-0.1.0.dist-info/licenses/LICENSE,sha256=KcrJSzNA8cSHdhobkxKSComvqHZzxTBuBFyet9-jfGI,1071
8
+ web2md-0.1.0.dist-info/METADATA,sha256=F6y_Ky2ncFNywgqCfAz035MjK7d1xU2jJ4-jZ8-Ba5o,11187
9
+ web2md-0.1.0.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
10
+ web2md-0.1.0.dist-info/entry_points.txt,sha256=sO2WMqUhjpYskI6jO9HoO2l3ILG9JY-VBTEE-7_nY4k,43
11
+ web2md-0.1.0.dist-info/top_level.txt,sha256=tMptSjS7zehcx3WbQPakKhNZZ2t6hcDA8iqG5q_Dsbk,7
12
+ web2md-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.10.2)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ web2md = web2md.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Liming Xie
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ web2md