pg2md 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pg2md/__init__.py +38 -0
- pg2md/parser.py +417 -0
- pg2md-1.0.0.dist-info/METADATA +265 -0
- pg2md-1.0.0.dist-info/RECORD +6 -0
- pg2md-1.0.0.dist-info/WHEEL +4 -0
- pg2md-1.0.0.dist-info/licenses/LICENSE +21 -0
pg2md/__init__.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""
pg2md — Page to Markdown converter with JS rendering support.

A fast, clean HTML-to-Markdown converter that uses Playwright for
JavaScript rendering and html-to-markdown (Rust-based) for conversion.

Note:
    A default ``BrowserConfig`` attaches to a CDP endpoint at
    ``ws://127.0.0.1:9222``. Pass ``BrowserConfig(cdp_url=None)`` to make
    the parser launch a local headless Chromium instead.

Example:
    from pg2md import PageParser, BrowserConfig, ProxyConfig, UserAgents

    parser = PageParser(with_image=False, with_link=True)
    markdown = parser.parse("https://example.com")
    print(markdown)

    # With proxy
    proxy = ProxyConfig(server="socks5://1.2.3.4:1080")
    markdown = parser.parse("https://example.com", proxy=proxy)
"""

# Re-export the public classes so they can be imported from the package root.
from pg2md.parser import (
    PageParser,
    BrowserConfig,
    ProxyConfig,
    UserAgents,
    HtmlCleaner,
    MarkdownCleaner,
)

__version__ = "1.0.0"
# NOTE(review): "Your Name" looks like an unfilled template placeholder — confirm.
__author__ = "Your Name"
# Names exported by ``from pg2md import *``.
__all__ = [
    "PageParser",
    "BrowserConfig",
    "ProxyConfig",
    "UserAgents",
    "HtmlCleaner",
    "MarkdownCleaner",
    "__version__",
]
|
pg2md/parser.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PageParser — Parser for web pages with JS rendering via Lightpanda/Playwright
|
|
3
|
+
and conversion to clean Markdown.
|
|
4
|
+
|
|
5
|
+
Dependencies:
|
|
6
|
+
pip install playwright html-to-markdown beautifulsoup4
|
|
7
|
+
playwright install chromium # if using regular Chrome, not Lightpanda
|
|
8
|
+
|
|
9
|
+
Running Lightpanda (optional, instead of Chrome):
|
|
10
|
+
./lightpanda serve --host 127.0.0.1 --port 9222
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
parser = PageParser(with_image=False, with_link=False)
|
|
14
|
+
|
|
15
|
+
# Without proxy
|
|
16
|
+
result = parser.parse("https://example.com")
|
|
17
|
+
|
|
18
|
+
# With proxy for a specific request
|
|
19
|
+
proxy = ProxyConfig(server="http://1.2.3.4:8080", username="user", password="pass")
|
|
20
|
+
result = parser.parse("https://example.com", proxy=proxy)
|
|
21
|
+
|
|
22
|
+
print(result)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import re
|
|
26
|
+
import subprocess
|
|
27
|
+
import time
|
|
28
|
+
import asyncio
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from typing import Optional, Literal
|
|
31
|
+
|
|
32
|
+
from bs4 import BeautifulSoup
|
|
33
|
+
from html_to_markdown import convert, ConversionOptions, PreprocessingOptions
|
|
34
|
+
from playwright.async_api import async_playwright, ProxySettings
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass
class ProxyConfig:
    """
    Describes the proxy to use for a single request.

    The ``server`` URL may use any of these forms:
        http://host:port
        https://host:port
        socks5://host:port

    Args:
        server   : proxy address (required)
        username : login (optional; left out of the settings when empty)
        password : password (optional; left out of the settings when empty)
        bypass   : comma-separated list of hosts to bypass proxy
                   (e.g. "localhost,127.0.0.1"; left out when empty)
    """

    server: str
    username: Optional[str] = None
    password: Optional[str] = None
    bypass: Optional[str] = None

    def to_playwright(self) -> ProxySettings:
        """Return this proxy as a Playwright ``ProxySettings`` mapping."""
        optional_fields = {
            "username": self.username,
            "password": self.password,
            "bypass": self.bypass,
        }
        settings: ProxySettings = {"server": self.server}
        # Forward only the optional values that are actually set (truthy),
        # keeping the username / password / bypass order.
        for key, value in optional_fields.items():
            if value:
                settings[key] = value
        return settings
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class UserAgents:
    """
    Preset User-Agent strings for common desktop browsers and web crawlers.

    Every attribute is a plain ``str`` that can be assigned to
    ``BrowserConfig.user_agent``.
    """

    CHROME_DESKTOP = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    CHROME_MAC = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    FIREFOX_DESKTOP = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"

    SAFARI_MAC = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.2 Safari/605.1.15"
    )

    EDGE = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    )

    GOOGLEBOT_DESKTOP = (
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    )

    # "Mobile Safari/537.36" is two tokens separated by a space; the previous
    # value fused them into "MobileSafari/537.36", which is not a
    # well-formed mobile Chrome User-Agent.
    GOOGLEBOT_MOBILE = (
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Mobile Safari/537.36 "
        "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    )

    # NOTE(review): Google documents its video crawler token as
    # "Googlebot-Video/1.0"; this preset keeps the original
    # "Googlebot/2.1" suffix — confirm which one is intended.
    GOOGLEBOT_VIDEO = (
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Mobile Safari/537.36 "
        "Googlebot/2.1"
    )

    BINGBOT = "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"

    BINGBOT_MOBILE = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/16.6 Mobile/15E148 BingWeb/7.15.13.7055 (advisor; +http://www.bing.com/bingbot.htm)"
    )

    YANDEXBOT = "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"

    DUCKBOT = "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)"

    APPLEBOT = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.0 Safari/605.1.15 Applebot/0.1"
    )

    CHROME_LINUX = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass
class BrowserConfig:
    """
    Browser connection settings consumed by ``PageParser``.

    All fields have defaults, so ``BrowserConfig()`` is a valid configuration.
    """

    # CDP endpoint PageParser attaches to via ``connect_over_cdp``. When this
    # is falsy (e.g. None) PageParser launches a headless Chromium instead.
    cdp_url: Optional[str] = "ws://127.0.0.1:9222"
    # Path to a Lightpanda binary. When set, PageParser spawns
    # ``<bin> serve --host 127.0.0.1 --port 9222`` before fetching.
    lightpanda_bin: Optional[str] = None
    # Passed as ``timeout`` to ``page.goto`` (the README documents it as ms).
    navigation_timeout: int = 30_000
    # Passed as ``wait_until`` to ``page.goto``.
    wait_until: Literal["load", "domcontentloaded", "networkidle"] = "networkidle"
    # Proxy used when a request does not supply its own.
    default_proxy: Optional[ProxyConfig] = None
    # Passed as ``user_agent`` when the browser context is created.
    user_agent: Optional[str] = UserAgents.CHROME_DESKTOP
    # Passed as ``extra_http_headers`` to the browser context when non-empty.
    extra_headers: Optional[dict[str, str]] = None
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
class HtmlCleaner:
    """
    Cleans HTML before converting to Markdown:
    - removes every tag in ``ALWAYS_STRIP_TAGS`` (<script>, <style>,
      <noscript>, <svg>, <canvas>, <video>, <audio>, <iframe>, <object>,
      <embed>, <head>)
    - removes <img> (optional); inline data:/blob: images are always removed
    - removes href/src/srcset/poster/background values that start with
      data: or blob: (base64 junk)
    - strips links, keeping only text (optional)

    Args:
        with_image : keep <img> elements that point at a regular URL.
        with_link  : keep <a> elements; when False each link is replaced
                     by its text content.
    """

    ALWAYS_STRIP_TAGS = [
        "script",
        "style",
        "noscript",
        "svg",
        "canvas",
        "video",
        "audio",
        "iframe",
        "object",
        "embed",
        "head",
    ]

    # URL-carrying attributes that are dropped when they hold an inline payload.
    _URL_ATTRS = ("src", "href", "srcset", "poster", "background")
    # Schemes that embed content in the document instead of linking to it.
    _INLINE_SCHEMES = ("data:", "blob:")

    def __init__(self, with_image: bool = False, with_link: bool = True):
        self.with_image = with_image
        self.with_link = with_link

    @classmethod
    def _is_inline_payload(cls, value) -> bool:
        """Return True if *value* is a string URL using a data:/blob: scheme.

        URL schemes are case-insensitive and markup often carries leading
        whitespace, so the value is normalised before the prefix test.
        Non-string values (e.g. multi-valued attributes) never match.
        """
        if not isinstance(value, str):
            return False
        return value.lstrip().lower().startswith(cls._INLINE_SCHEMES)

    def clean(self, html: str) -> str:
        """Return *html* with noise elements and inline payloads removed.

        Args:
            html: raw HTML markup.

        Returns:
            The cleaned markup serialised back to a string.
        """
        soup = BeautifulSoup(html, "html.parser")

        # Re-query per tag name so elements already destroyed together with
        # an earlier ancestor of a different tag are not visited again.
        for tag_name in self.ALWAYS_STRIP_TAGS:
            for el in soup.find_all(tag_name):
                el.decompose()

        # Images: drop them all, or only those embedding their own data.
        for img in soup.find_all("img"):
            if not self.with_image or self._is_inline_payload(img.get("src", "")):
                img.decompose()

        # Links: flatten to text, or blank out inline-payload targets.
        for anchor in soup.find_all("a"):
            if not self.with_link:
                anchor.replace_with(anchor.get_text())
            elif self._is_inline_payload(anchor.get("href", "")):
                anchor["href"] = ""

        # Finally remove any remaining inline payload from every element.
        for el in soup.find_all(True):
            for attr in self._URL_ATTRS:
                if self._is_inline_payload(el.get(attr, "")):
                    del el[attr]

        return str(soup)
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class MarkdownCleaner:
    """Final cleanup of ready Markdown text."""

    # A whole line made only of base64 alphabet characters (40+ of them).
    _BASE64_LINE = re.compile(r"^[A-Za-z0-9+/=]{40,}\s*$", re.MULTILINE)
    # C0/C1 control characters except tab, line feed and carriage return.
    _BINARY_GARBAGE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
    _EXCESS_NEWLINES = re.compile(r"\n{3,}")
    _MD_IMAGE = re.compile(r"!\[.*?\]\(.*?\)")
    # A link whose text is empty, e.g. what remains of "[![badge](img)](url)"
    # once the inner image has been removed.
    _EMPTY_LINK = re.compile(r"\[\s*\]\([^()\n]*\)")

    def clean(self, text: str, strip_images: bool = False) -> str:
        """Return *text* without binary residue, base64 lines and extra blank lines.

        Args:
            text: Markdown produced by the converter.
            strip_images: when True, remove ``![alt](url)`` images and any
                empty ``[](url)`` links (such links are what a linked image
                leaves behind once the image itself is gone).

        Returns:
            The cleaned Markdown with surrounding whitespace stripped.
        """
        text = self._BINARY_GARBAGE.sub("", text)
        text = self._BASE64_LINE.sub("", text)

        if strip_images:
            text = self._MD_IMAGE.sub("", text)
            # Must run after image removal so linked images collapse fully.
            text = self._EMPTY_LINK.sub("", text)

        # Collapse the blank runs the removals above may have created.
        text = self._EXCESS_NEWLINES.sub("\n\n", text)

        return text.strip()
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class PageParser:
|
|
236
|
+
"""
|
|
237
|
+
Parses web pages with JS rendering and returns clean Markdown.
|
|
238
|
+
|
|
239
|
+
Args:
|
|
240
|
+
with_image (bool) : Include images in output. Default False.
|
|
241
|
+
with_link (bool) : Include links (href). Default True.
|
|
242
|
+
False — links are replaced with their text.
|
|
243
|
+
browser_config (BrowserConfig): Browser connection settings.
|
|
244
|
+
Can set default_proxy for all requests.
|
|
245
|
+
|
|
246
|
+
Proxy is passed to parse() / async_parse() per-request:
|
|
247
|
+
proxy = ProxyConfig(server="socks5://1.2.3.4:1080")
|
|
248
|
+
result = parser.parse("https://example.com", proxy=proxy)
|
|
249
|
+
"""
|
|
250
|
+
|
|
251
|
+
def __init__(
|
|
252
|
+
self,
|
|
253
|
+
with_image: bool = False,
|
|
254
|
+
with_link: bool = True,
|
|
255
|
+
browser_config: Optional[BrowserConfig] = None,
|
|
256
|
+
):
|
|
257
|
+
self.with_image = with_image
|
|
258
|
+
self.with_link = with_link
|
|
259
|
+
self.config = browser_config or BrowserConfig()
|
|
260
|
+
|
|
261
|
+
self._html_cleaner = HtmlCleaner(with_image=with_image, with_link=with_link)
|
|
262
|
+
self._md_cleaner = MarkdownCleaner()
|
|
263
|
+
|
|
264
|
+
self._lightpanda_proc: Optional[subprocess.Popen] = None
|
|
265
|
+
|
|
266
|
+
def parse(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
|
|
267
|
+
"""
|
|
268
|
+
Synchronous wrapper over async_parse.
|
|
269
|
+
|
|
270
|
+
Args:
|
|
271
|
+
url : page to parse
|
|
272
|
+
proxy : proxy for this specific request (overrides default_proxy)
|
|
273
|
+
"""
|
|
274
|
+
return asyncio.run(self.async_parse(url, proxy=proxy))
|
|
275
|
+
|
|
276
|
+
async def async_parse(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
|
|
277
|
+
"""
|
|
278
|
+
Loads page, renders JS, returns clean Markdown.
|
|
279
|
+
|
|
280
|
+
Args:
|
|
281
|
+
url : page to parse
|
|
282
|
+
proxy : proxy for this specific request (overrides default_proxy)
|
|
283
|
+
"""
|
|
284
|
+
html = await self._fetch_html(url, proxy=proxy)
|
|
285
|
+
return self._html_to_markdown(html)
|
|
286
|
+
|
|
287
|
+
async def async_parse_many(
|
|
288
|
+
self,
|
|
289
|
+
urls: list[str],
|
|
290
|
+
proxy: Optional[ProxyConfig] = None,
|
|
291
|
+
) -> dict[str, str | BaseException]:
|
|
292
|
+
"""
|
|
293
|
+
Parses multiple URLs in parallel.
|
|
294
|
+
|
|
295
|
+
Args:
|
|
296
|
+
urls : list of pages
|
|
297
|
+
proxy : one proxy for all requests (or None)
|
|
298
|
+
"""
|
|
299
|
+
tasks = [self.async_parse(url, proxy=proxy) for url in urls]
|
|
300
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
|
301
|
+
return {url: res for url, res in zip(urls, results)}
|
|
302
|
+
|
|
303
|
+
async def _fetch_html(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
|
|
304
|
+
"""Opens browser, loads page, returns HTML."""
|
|
305
|
+
self._maybe_start_lightpanda()
|
|
306
|
+
|
|
307
|
+
effective_proxy = proxy or self.config.default_proxy
|
|
308
|
+
proxy_settings = effective_proxy.to_playwright() if effective_proxy else None
|
|
309
|
+
|
|
310
|
+
headers = {}
|
|
311
|
+
if self.config.extra_headers:
|
|
312
|
+
headers.update(self.config.extra_headers)
|
|
313
|
+
|
|
314
|
+
async with async_playwright() as pw:
|
|
315
|
+
if self.config.cdp_url:
|
|
316
|
+
browser = await pw.chromium.connect_over_cdp(self.config.cdp_url)
|
|
317
|
+
context = await browser.new_context(
|
|
318
|
+
proxy=proxy_settings,
|
|
319
|
+
user_agent=self.config.user_agent,
|
|
320
|
+
extra_http_headers=headers if headers else None,
|
|
321
|
+
)
|
|
322
|
+
else:
|
|
323
|
+
browser = await pw.chromium.launch(headless=True)
|
|
324
|
+
context = await browser.new_context(
|
|
325
|
+
proxy=proxy_settings,
|
|
326
|
+
user_agent=self.config.user_agent,
|
|
327
|
+
extra_http_headers=headers if headers else None,
|
|
328
|
+
)
|
|
329
|
+
|
|
330
|
+
page = await context.new_page()
|
|
331
|
+
|
|
332
|
+
try:
|
|
333
|
+
await page.goto(
|
|
334
|
+
url,
|
|
335
|
+
timeout=self.config.navigation_timeout,
|
|
336
|
+
wait_until=self.config.wait_until,
|
|
337
|
+
)
|
|
338
|
+
html = await page.content()
|
|
339
|
+
finally:
|
|
340
|
+
await page.close()
|
|
341
|
+
await context.close()
|
|
342
|
+
await browser.close()
|
|
343
|
+
|
|
344
|
+
return html
|
|
345
|
+
|
|
346
|
+
def _html_to_markdown(self, html: str) -> str:
|
|
347
|
+
"""Cleans HTML and converts to Markdown."""
|
|
348
|
+
|
|
349
|
+
clean_html = self._html_cleaner.clean(html)
|
|
350
|
+
|
|
351
|
+
options = ConversionOptions(
|
|
352
|
+
heading_style="atx",
|
|
353
|
+
strong_em_symbol="*",
|
|
354
|
+
bullets="*",
|
|
355
|
+
escape_asterisks=False,
|
|
356
|
+
)
|
|
357
|
+
preprocessing = PreprocessingOptions(
|
|
358
|
+
enabled=True,
|
|
359
|
+
preset="aggressive",
|
|
360
|
+
remove_navigation=True,
|
|
361
|
+
remove_forms=True,
|
|
362
|
+
)
|
|
363
|
+
markdown = convert(clean_html, options, preprocessing)
|
|
364
|
+
|
|
365
|
+
markdown = self._md_cleaner.clean(markdown, strip_images=not self.with_image)
|
|
366
|
+
|
|
367
|
+
return markdown
|
|
368
|
+
|
|
369
|
+
def _maybe_start_lightpanda(self) -> None:
|
|
370
|
+
"""If binary path is set and process not running — start it."""
|
|
371
|
+
if not self.config.lightpanda_bin:
|
|
372
|
+
return
|
|
373
|
+
if self._lightpanda_proc and self._lightpanda_proc.poll() is None:
|
|
374
|
+
return
|
|
375
|
+
|
|
376
|
+
self._lightpanda_proc = subprocess.Popen(
|
|
377
|
+
[
|
|
378
|
+
self.config.lightpanda_bin,
|
|
379
|
+
"serve",
|
|
380
|
+
"--host",
|
|
381
|
+
"127.0.0.1",
|
|
382
|
+
"--port",
|
|
383
|
+
"9222",
|
|
384
|
+
],
|
|
385
|
+
stdout=subprocess.DEVNULL,
|
|
386
|
+
stderr=subprocess.DEVNULL,
|
|
387
|
+
)
|
|
388
|
+
time.sleep(1.5)
|
|
389
|
+
|
|
390
|
+
def stop_lightpanda(self) -> None:
|
|
391
|
+
"""Explicitly stops Lightpanda if started by us."""
|
|
392
|
+
if self._lightpanda_proc:
|
|
393
|
+
self._lightpanda_proc.terminate()
|
|
394
|
+
self._lightpanda_proc = None
|
|
395
|
+
|
|
396
|
+
def __enter__(self):
|
|
397
|
+
return self
|
|
398
|
+
|
|
399
|
+
def __exit__(self, *_):
|
|
400
|
+
self.stop_lightpanda()
|
|
401
|
+
|
|
402
|
+
|
|
403
|
+
if __name__ == "__main__":
    import sys

    # The first command-line argument is the page to convert; without one a
    # demo URL is used.
    cli_args = sys.argv[1:]
    url = cli_args[0] if cli_args else "https://example.com"

    # cdp_url=None makes the parser launch its own headless Chromium.
    local_browser = BrowserConfig(cdp_url=None)
    parser = PageParser(
        with_image=False,
        with_link=False,
        browser_config=local_browser,
    )

    # The context manager stops Lightpanda on exit if the parser started it.
    with parser:
        result = parser.parse(url)

    print(result)
|
|
@@ -0,0 +1,265 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pg2md
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Page to Markdown converter with JS rendering support via Playwright
|
|
5
|
+
Project-URL: Homepage, https://github.com/yourusername/pg2md
|
|
6
|
+
Project-URL: Documentation, https://github.com/yourusername/pg2md#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/yourusername/pg2md
|
|
8
|
+
Project-URL: Issues, https://github.com/yourusername/pg2md/issues
|
|
9
|
+
Author-email: Your Name <your@email.com>
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: converter,html,js-rendering,markdown,parser,playwright,scraper,web-scraping
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
26
|
+
Requires-Dist: html-to-markdown>=2.0.0
|
|
27
|
+
Requires-Dist: playwright>=1.40.0
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# pg2md
|
|
35
|
+
|
|
36
|
+
[PyPI version](https://badge.fury.io/py/pg2md)
|
|
37
|
+
[PyPI project](https://pypi.org/project/pg2md/)
|
|
38
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
39
|
+
[Downloads](https://pepy.tech/project/pg2md)
|
|
40
|
+
[GitHub stars](https://github.com/yourusername/pg2md/stargazers)
|
|
41
|
+
[GitHub issues](https://github.com/yourusername/pg2md/issues)
|
|
42
|
+
[GitHub forks](https://github.com/yourusername/pg2md/network/members)
|
|
43
|
+
|
|
44
|
+
**P**a**g**e to **M**ark**d**own — fast HTML-to-Markdown converter with JavaScript rendering support.
|
|
45
|
+
|
|
46
|
+
Converts any web page to clean Markdown using Playwright for JS rendering and Rust-based `html-to-markdown` for conversion.
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
- **JavaScript Rendering** — handles SPA, React, Vue, dynamic content
|
|
51
|
+
- **Fast Conversion** — Rust-based `html-to-markdown` core
|
|
52
|
+
- **Clean Output** — strips scripts, styles, navigation, forms
|
|
53
|
+
- **Proxy Support** — HTTP/HTTPS/SOCKS5 with auth
|
|
54
|
+
- **Custom User-Agents** — includes Googlebot, Bingbot, etc.
|
|
55
|
+
- **Async & Sync API** — `parse()` and `async_parse()`
|
|
56
|
+
- **Batch Processing** — `async_parse_many()` for parallel requests
|
|
57
|
+
- **Configurable** — images, links, headers, timeouts
|
|
58
|
+
|
|
59
|
+
## Installation
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
pip install pg2md
|
|
63
|
+
playwright install chromium
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Quick Start
|
|
67
|
+
|
|
68
|
+
```python
|
|
69
|
+
from pg2md import PageParser
|
|
70
|
+
|
|
71
|
+
parser = PageParser(with_image=False, with_link=True)
|
|
72
|
+
markdown = parser.parse("https://example.com")
|
|
73
|
+
print(markdown)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Usage Examples
|
|
77
|
+
|
|
78
|
+
### Basic Usage
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from pg2md import PageParser
|
|
82
|
+
|
|
83
|
+
parser = PageParser()
|
|
84
|
+
result = parser.parse("https://example.com")
|
|
85
|
+
print(result)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Without Images and Links
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from pg2md import PageParser
|
|
92
|
+
|
|
93
|
+
parser = PageParser(with_image=False, with_link=False)
|
|
94
|
+
result = parser.parse("https://example.com")
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
### With Proxy
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from pg2md import PageParser, ProxyConfig
|
|
101
|
+
|
|
102
|
+
proxy = ProxyConfig(
|
|
103
|
+
server="http://proxy.example.com:8080",
|
|
104
|
+
username="user",
|
|
105
|
+
password="pass"
|
|
106
|
+
)
|
|
107
|
+
|
|
108
|
+
parser = PageParser()
|
|
109
|
+
result = parser.parse("https://example.com", proxy=proxy)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### SOCKS5 Proxy
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
from pg2md import PageParser, ProxyConfig
|
|
116
|
+
|
|
117
|
+
proxy = ProxyConfig(server="socks5://127.0.0.1:1080")
|
|
118
|
+
parser = PageParser()
|
|
119
|
+
result = parser.parse("https://example.com", proxy=proxy)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Custom User-Agent
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from pg2md import PageParser, BrowserConfig, UserAgents
|
|
126
|
+
|
|
127
|
+
config = BrowserConfig(
|
|
128
|
+
cdp_url=None,
|
|
129
|
+
user_agent=UserAgents.GOOGLEBOT_DESKTOP,
|
|
130
|
+
extra_headers={"Accept-Language": "en-US,en;q=0.9"}
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
parser = PageParser(browser_config=config)
|
|
134
|
+
result = parser.parse("https://example.com")
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Async API
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
import asyncio
|
|
141
|
+
from pg2md import PageParser
|
|
142
|
+
|
|
143
|
+
async def main():
|
|
144
|
+
parser = PageParser()
|
|
145
|
+
result = await parser.async_parse("https://example.com")
|
|
146
|
+
print(result)
|
|
147
|
+
|
|
148
|
+
asyncio.run(main())
|
|
149
|
+
```
|
|
150
|
+
|
|
151
|
+
### Batch Processing
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
import asyncio
|
|
155
|
+
from pg2md import PageParser
|
|
156
|
+
|
|
157
|
+
async def main():
|
|
158
|
+
parser = PageParser()
|
|
159
|
+
urls = [
|
|
160
|
+
"https://example.com",
|
|
161
|
+
"https://example.org",
|
|
162
|
+
"https://example.net",
|
|
163
|
+
]
|
|
164
|
+
results = await parser.async_parse_many(urls)
|
|
165
|
+
|
|
166
|
+
for url, result in results.items():
|
|
167
|
+
if isinstance(result, Exception):
|
|
168
|
+
print(f"Error {url}: {result}")
|
|
169
|
+
else:
|
|
170
|
+
print(f"{url}: {len(result)} chars")
|
|
171
|
+
|
|
172
|
+
asyncio.run(main())
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### Using Lightpanda
|
|
176
|
+
|
|
177
|
+
```python
|
|
178
|
+
from pg2md import PageParser, BrowserConfig
|
|
179
|
+
|
|
180
|
+
# Start Lightpanda manually:
|
|
181
|
+
# ./lightpanda serve --host 127.0.0.1 --port 9222
|
|
182
|
+
|
|
183
|
+
config = BrowserConfig(cdp_url="ws://127.0.0.1:9222")
|
|
184
|
+
parser = PageParser(browser_config=config)
|
|
185
|
+
result = parser.parse("https://example.com")
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
## Configuration
|
|
189
|
+
|
|
190
|
+
### BrowserConfig
|
|
191
|
+
|
|
192
|
+
| Parameter | Type | Default | Description |
|
|
193
|
+
|-----------|------|---------|-------------|
|
|
194
|
+
| `cdp_url` | `str \| None` | `"ws://127.0.0.1:9222"` | CDP endpoint (Lightpanda/Chrome) to connect to; set to `None` to launch a local headless Chromium instead |
|
|
195
|
+
| `lightpanda_bin` | `str \| None` | `None` | Path to Lightpanda binary |
|
|
196
|
+
| `navigation_timeout` | `int` | `30000` | Navigation timeout (ms) |
|
|
197
|
+
| `wait_until` | `str` | `"networkidle"` | Page event that ends navigation: `"load"`, `"domcontentloaded"` or `"networkidle"` |
|
|
198
|
+
| `default_proxy` | `ProxyConfig \| None` | `None` | Default proxy for all requests |
|
|
199
|
+
| `user_agent` | `str \| None` | Chrome Desktop | User-Agent string |
|
|
200
|
+
| `extra_headers` | `dict \| None` | `None` | Additional HTTP headers |
|
|
201
|
+
|
|
202
|
+
### ProxyConfig
|
|
203
|
+
|
|
204
|
+
| Parameter | Type | Default | Description |
|
|
205
|
+
|-----------|------|---------|-------------|
|
|
206
|
+
| `server` | `str` | required | Proxy URL |
|
|
207
|
+
| `username` | `str \| None` | `None` | Username |
|
|
208
|
+
| `password` | `str \| None` | `None` | Password |
|
|
209
|
+
| `bypass` | `str \| None` | `None` | Comma-separated hosts that bypass the proxy (e.g. `"localhost,127.0.0.1"`) |
|
|
210
|
+
|
|
211
|
+
### UserAgents
|
|
212
|
+
|
|
213
|
+
Available presets:
|
|
214
|
+
|
|
215
|
+
- `CHROME_DESKTOP`, `CHROME_MAC`, `CHROME_LINUX`
|
|
216
|
+
- `FIREFOX_DESKTOP`
|
|
217
|
+
- `SAFARI_MAC`
|
|
218
|
+
- `EDGE`
|
|
219
|
+
- `GOOGLEBOT_DESKTOP`, `GOOGLEBOT_MOBILE`, `GOOGLEBOT_VIDEO`
|
|
220
|
+
- `BINGBOT`, `BINGBOT_MOBILE`
|
|
221
|
+
- `YANDEXBOT`
|
|
222
|
+
- `DUCKBOT`
|
|
223
|
+
- `APPLEBOT`
|
|
224
|
+
|
|
225
|
+
## API Reference
|
|
226
|
+
|
|
227
|
+
### PageParser
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
PageParser(
|
|
231
|
+
with_image: bool = False,
|
|
232
|
+
with_link: bool = True,
|
|
233
|
+
browser_config: BrowserConfig | None = None
|
|
234
|
+
)
|
|
235
|
+
```
|
|
236
|
+
|
|
237
|
+
#### Methods
|
|
238
|
+
|
|
239
|
+
| Method | Description |
|
|
240
|
+
|--------|-------------|
|
|
241
|
+
| `parse(url, proxy=None)` | Sync parse, returns Markdown string |
|
|
242
|
+
| `async_parse(url, proxy=None)` | Async parse, returns Markdown string |
|
|
243
|
+
| `async_parse_many(urls, proxy=None)` | Batch async parse, returns a dict mapping each URL to its Markdown string or to the exception it raised |
|
|
244
|
+
| `stop_lightpanda()` | Stop the Lightpanda process if it was started by this parser |
|
|
245
|
+
|
|
246
|
+
## Development
|
|
247
|
+
|
|
248
|
+
```bash
|
|
249
|
+
git clone https://github.com/yourusername/pg2md.git
|
|
250
|
+
cd pg2md
|
|
251
|
+
python -m venv venv
|
|
252
|
+
source venv/bin/activate
|
|
253
|
+
pip install -e ".[dev]"
|
|
254
|
+
playwright install chromium
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## License
|
|
258
|
+
|
|
259
|
+
[MIT](LICENSE)
|
|
260
|
+
|
|
261
|
+
## Credits
|
|
262
|
+
|
|
263
|
+
- [Playwright](https://playwright.dev/python/) — browser automation
|
|
264
|
+
- [html-to-markdown](https://pypi.org/project/html-to-markdown/) — Rust-based HTML to Markdown
|
|
265
|
+
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) — HTML parsing
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
pg2md/__init__.py,sha256=gnJA-EpCZh0rqggk_avwsi3IBGHbc0bddJmB8rrAvBw,896
|
|
2
|
+
pg2md/parser.py,sha256=bnepF5302PsV2SXmHgEla2126SihPE2FuBYNDI5Gp8k,13371
|
|
3
|
+
pg2md-1.0.0.dist-info/METADATA,sha256=5tTGIaPbjSr3qu05zp8hiUQFt94F49JbsRSkFUefPJk,7560
|
|
4
|
+
pg2md-1.0.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
5
|
+
pg2md-1.0.0.dist-info/licenses/LICENSE,sha256=OphKV48tcMv6ep-7j-8T6nycykPT0g8ZlMJ9zbGvdPs,1066
|
|
6
|
+
pg2md-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Your Name
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|