pg2md 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pg2md/__init__.py ADDED
@@ -0,0 +1,38 @@
1
+ """
2
+ pg2md — Page to Markdown converter with JS rendering support.
3
+
4
+ A fast, clean HTML-to-Markdown converter that uses Playwright for
5
+ JavaScript rendering and html-to-markdown (Rust-based) for conversion.
6
+
7
+ Example:
8
+ from pg2md import PageParser, BrowserConfig, ProxyConfig, UserAgents
9
+
10
+ parser = PageParser(with_image=False, with_link=True)
11
+ markdown = parser.parse("https://example.com")
12
+ print(markdown)
13
+
14
+ # With proxy
15
+ proxy = ProxyConfig(server="socks5://1.2.3.4:1080")
16
+ markdown = parser.parse("https://example.com", proxy=proxy)
17
+ """
18
+
19
+ from pg2md.parser import (
20
+ PageParser,
21
+ BrowserConfig,
22
+ ProxyConfig,
23
+ UserAgents,
24
+ HtmlCleaner,
25
+ MarkdownCleaner,
26
+ )
27
+
28
# Package metadata.
__version__ = "1.0.0"
__author__ = "Your Name"

# Public names exported by the package (what ``from pg2md import *`` binds).
__all__ = [
    "PageParser", "BrowserConfig", "ProxyConfig",
    "UserAgents", "HtmlCleaner", "MarkdownCleaner",
    "__version__",
]
pg2md/parser.py ADDED
@@ -0,0 +1,417 @@
1
+ """
2
+ PageParser — Parser for web pages with JS rendering via Lightpanda/Playwright
3
+ and conversion to clean Markdown.
4
+
5
+ Dependencies:
6
+ pip install playwright html-to-markdown beautifulsoup4
7
+ playwright install chromium # if using regular Chrome, not Lightpanda
8
+
9
+ Running Lightpanda (optional, instead of Chrome):
10
+ ./lightpanda serve --host 127.0.0.1 --port 9222
11
+
12
+ Usage:
13
+ parser = PageParser(with_image=False, with_link=False)
14
+
15
+ # Without proxy
16
+ result = parser.parse("https://example.com")
17
+
18
+ # With proxy for a specific request
19
+ proxy = ProxyConfig(server="http://1.2.3.4:8080", username="user", password="pass")
20
+ result = parser.parse("https://example.com", proxy=proxy)
21
+
22
+ print(result)
23
+ """
24
+
25
+ import re
26
+ import subprocess
27
+ import time
28
+ import asyncio
29
+ from dataclasses import dataclass
30
+ from typing import Optional, Literal
31
+
32
+ from bs4 import BeautifulSoup
33
+ from html_to_markdown import convert, ConversionOptions, PreprocessingOptions
34
+ from playwright.async_api import async_playwright, ProxySettings
35
+
36
+
37
@dataclass
class ProxyConfig:
    """
    Describes the proxy a single request is routed through.

    ``server`` accepts addresses of the form:
        http://host:port
        https://host:port
        socks5://host:port

    Args:
        server   : proxy address (required)
        username : login (optional)
        password : password (optional)
        bypass   : comma-separated list of hosts that skip the proxy
                   (e.g. "localhost,127.0.0.1")
    """

    server: str
    username: Optional[str] = None
    password: Optional[str] = None
    bypass: Optional[str] = None

    def to_playwright(self) -> ProxySettings:
        """Build the Playwright ProxySettings mapping for this proxy.

        ``server`` is always present; every optional field is included only
        when it holds a truthy value.
        """
        optional_fields = {
            "username": self.username,
            "password": self.password,
            "bypass": self.bypass,
        }
        settings: ProxySettings = {"server": self.server}
        settings.update(
            {key: value for key, value in optional_fields.items() if value}
        )
        return settings
70
+
71
+
72
class UserAgents:
    """Popular User-Agent strings for bypassing blocks.

    Each attribute is a plain ``str`` preset that can be passed as
    ``BrowserConfig(user_agent=...)``.
    """

    # --- Desktop browsers -------------------------------------------------

    CHROME_DESKTOP = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    CHROME_MAC = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    FIREFOX_DESKTOP = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"

    SAFARI_MAC = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.2 Safari/605.1.15"
    )

    EDGE = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    )

    # --- Search-engine crawlers -------------------------------------------

    GOOGLEBOT_DESKTOP = (
        "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    )

    # The Chrome-mobile product token is "Mobile Safari/537.36" (with a
    # space); the fused "MobileSafari" spelling does not match the real
    # crawler string.
    GOOGLEBOT_MOBILE = (
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Mobile Safari/537.36 "
        "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    )

    # NOTE(review): Google's documented video crawler identifies itself as
    # "Googlebot-Video/1.0" — confirm whether this mobile-style preset is
    # intentional.
    GOOGLEBOT_VIDEO = (
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Mobile Safari/537.36 "
        "Googlebot/2.1"
    )

    BINGBOT = "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"

    BINGBOT_MOBILE = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/16.6 Mobile/15E148 BingWeb/7.15.13.7055 (advisor; +http://www.bing.com/bingbot.htm)"
    )

    YANDEXBOT = "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"

    DUCKBOT = "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)"

    APPLEBOT = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.0 Safari/605.1.15 Applebot/0.1"
    )

    CHROME_LINUX = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
133
+
134
+
135
@dataclass
class BrowserConfig:
    """Settings that control how PageParser obtains and drives a browser."""

    # CDP endpoint to attach to; a falsy value makes the parser launch
    # headless Chromium instead of connecting.
    cdp_url: str | None = "ws://127.0.0.1:9222"
    # Path to a Lightpanda executable that the parser starts on demand.
    lightpanda_bin: str | None = None
    # Forwarded as the ``timeout`` of page.goto() (milliseconds).
    navigation_timeout: int = 30_000
    # Forwarded as the ``wait_until`` of page.goto().
    wait_until: Literal["load", "domcontentloaded", "networkidle"] = "networkidle"
    # Used whenever a request does not supply its own proxy.
    default_proxy: ProxyConfig | None = None
    # User-Agent applied to the browser context.
    user_agent: str | None = UserAgents.CHROME_DESKTOP
    # Extra HTTP headers applied to the browser context.
    extra_headers: dict[str, str] | None = None
146
+
147
+
148
class HtmlCleaner:
    """
    Cleans HTML before converting to Markdown:
    - removes every tag listed in ALWAYS_STRIP_TAGS (scripts, styles,
      embedded media, iframes, <head>, ...)
    - removes <img> (optional); when images are kept, only those whose
      src is a data:/blob: payload are removed
    - strips links, keeping only text (optional); when links are kept,
      a data:/blob: href is blanked
    - drops src/href/srcset/poster/background attributes that carry
      data: or blob: payloads (base64 junk)

    Args:
        with_image : keep <img> elements that point at regular URLs
        with_link  : keep <a> elements; False replaces each with its text
    """

    ALWAYS_STRIP_TAGS = [
        "script",
        "style",
        "noscript",
        "svg",
        "canvas",
        "video",
        "audio",
        "iframe",
        "object",
        "embed",
        "head",
    ]

    # Attributes that may hold an inline payload instead of a real URL.
    _URL_ATTRS = ("src", "href", "srcset", "poster", "background")
    # Schemes treated as inline payloads; both are exactly five characters,
    # which _is_embedded_uri relies on when slicing the prefix.
    _EMBEDDED_SCHEMES = ("data:", "blob:")

    def __init__(self, with_image: bool = False, with_link: bool = True):
        self.with_image = with_image
        self.with_link = with_link

    @staticmethod
    def _is_embedded_uri(value: object) -> bool:
        """Return True when *value* is a string using a data: or blob: scheme.

        The scheme is matched case-insensitively and leading whitespace is
        ignored, so ``"DATA:..."`` and ``"  data:..."`` are recognised too.
        Only a five-character prefix is lower-cased to avoid copying a
        potentially huge base64 payload.
        """
        if not isinstance(value, str):
            return False
        return value.lstrip()[:5].lower() in HtmlCleaner._EMBEDDED_SCHEMES

    def clean(self, html: str) -> str:
        """Return *html* with unwanted tags and inline payloads removed.

        Args:
            html : raw HTML markup

        Returns:
            The cleaned document serialised back to a string.
        """
        soup = BeautifulSoup(html, "html.parser")

        for tag in self.ALWAYS_STRIP_TAGS:
            for el in soup.find_all(tag):
                el.decompose()

        # Images: drop all of them, or only the ones with inline payloads.
        for el in soup.find_all("img"):
            if not self.with_image or self._is_embedded_uri(el.get("src", "")):
                el.decompose()

        # Links: unwrap to plain text, or blank out inline-payload targets.
        for el in soup.find_all("a"):
            if not self.with_link:
                el.replace_with(el.get_text())
            elif self._is_embedded_uri(el.get("href", "")):
                el["href"] = ""

        # Any remaining element: strip URL-like attributes with payloads.
        for el in soup.find_all(True):
            for attr in self._URL_ATTRS:
                if self._is_embedded_uri(el.get(attr, "")):
                    del el[attr]

        return str(soup)
213
+
214
+
215
class MarkdownCleaner:
    """Post-processes converted Markdown to remove leftover noise."""

    # A whole line made of 40+ base64-alphabet characters.
    _BASE64_LINE = re.compile(r"^[A-Za-z0-9+/=]{40,}\s*$", re.MULTILINE)
    # Control characters other than tab, newline and carriage return.
    _BINARY_GARBAGE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
    # Runs of three or more newlines.
    _EXCESS_NEWLINES = re.compile(r"\n{3,}")
    # Inline Markdown image syntax: ![alt](target).
    _MD_IMAGE = re.compile(r"!\[.*?\]\(.*?\)")

    def clean(self, text: str, strip_images: bool = False) -> str:
        """Return *text* with junk removed and blank-line runs collapsed.

        Args:
            text         : Markdown to tidy up
            strip_images : also delete inline ``![alt](target)`` images

        Returns:
            The cleaned text with surrounding whitespace stripped.
        """
        removal_passes = [self._BINARY_GARBAGE, self._BASE64_LINE]
        if strip_images:
            removal_passes.append(self._MD_IMAGE)

        for pattern in removal_passes:
            text = pattern.sub("", text)

        collapsed = self._EXCESS_NEWLINES.sub("\n\n", text)
        return collapsed.strip()
233
+
234
+
235
class PageParser:
    """
    Parses web pages with JS rendering and returns clean Markdown.

    Args:
        with_image (bool)             : Include images in output. Default False.
        with_link (bool)              : Include links (href). Default True.
                                        False — links are replaced with their text.
        browser_config (BrowserConfig): Browser connection settings.
                                        Can set default_proxy for all requests.

    Proxy is passed to parse() / async_parse() per-request:
        proxy = ProxyConfig(server="socks5://1.2.3.4:1080")
        result = parser.parse("https://example.com", proxy=proxy)

    The parser can be used as a context manager; leaving the ``with`` block
    stops a Lightpanda process that this instance started.
    """

    # Seconds to wait for Lightpanda to exit after terminate() before it
    # is killed outright.
    _LIGHTPANDA_STOP_TIMEOUT = 5.0

    def __init__(
        self,
        with_image: bool = False,
        with_link: bool = True,
        browser_config: Optional[BrowserConfig] = None,
    ):
        self.with_image = with_image
        self.with_link = with_link
        self.config = browser_config or BrowserConfig()

        self._html_cleaner = HtmlCleaner(with_image=with_image, with_link=with_link)
        self._md_cleaner = MarkdownCleaner()

        # Handle of the Lightpanda child process, if this instance started one.
        self._lightpanda_proc: Optional[subprocess.Popen] = None

    def parse(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
        """
        Synchronous wrapper over async_parse.

        Runs the coroutine with asyncio.run(), so it cannot be called from
        code that is already running inside an event loop; use
        async_parse() there instead.

        Args:
            url   : page to parse
            proxy : proxy for this specific request (overrides default_proxy)
        """
        return asyncio.run(self.async_parse(url, proxy=proxy))

    async def async_parse(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
        """
        Loads page, renders JS, returns clean Markdown.

        Args:
            url   : page to parse
            proxy : proxy for this specific request (overrides default_proxy)
        """
        html = await self._fetch_html(url, proxy=proxy)
        return self._html_to_markdown(html)

    async def async_parse_many(
        self,
        urls: list[str],
        proxy: Optional[ProxyConfig] = None,
    ) -> dict[str, str | BaseException]:
        """
        Parses multiple URLs in parallel.

        Args:
            urls  : list of pages
            proxy : one proxy for all requests (or None)

        Returns:
            Mapping of each URL to its Markdown, or to the exception raised
            while parsing it (failures do not abort the other requests).
            A URL listed more than once appears once in the result.
        """
        tasks = [self.async_parse(url, proxy=proxy) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return dict(zip(urls, results))

    async def _fetch_html(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
        """Opens browser, loads page, returns HTML.

        Every resource is registered for cleanup as soon as it exists, so
        the page, context and browser are closed (in that order) even when
        a later step — creating the context, opening the page, navigating —
        raises.
        """
        from contextlib import AsyncExitStack

        self._maybe_start_lightpanda()

        effective_proxy = proxy or self.config.default_proxy
        proxy_settings = effective_proxy.to_playwright() if effective_proxy else None

        # An empty/missing header mapping is passed to Playwright as None.
        extra_headers = self.config.extra_headers
        headers = dict(extra_headers) if extra_headers else None

        async with AsyncExitStack() as stack:
            pw = await stack.enter_async_context(async_playwright())

            if self.config.cdp_url:
                browser = await pw.chromium.connect_over_cdp(self.config.cdp_url)
            else:
                browser = await pw.chromium.launch(headless=True)
            stack.push_async_callback(browser.close)

            context = await browser.new_context(
                proxy=proxy_settings,
                user_agent=self.config.user_agent,
                extra_http_headers=headers,
            )
            stack.push_async_callback(context.close)

            page = await context.new_page()
            stack.push_async_callback(page.close)

            await page.goto(
                url,
                timeout=self.config.navigation_timeout,
                wait_until=self.config.wait_until,
            )
            return await page.content()

    def _html_to_markdown(self, html: str) -> str:
        """Cleans HTML and converts to Markdown."""
        clean_html = self._html_cleaner.clean(html)

        options = ConversionOptions(
            heading_style="atx",
            strong_em_symbol="*",
            bullets="*",
            escape_asterisks=False,
        )
        preprocessing = PreprocessingOptions(
            enabled=True,
            preset="aggressive",
            remove_navigation=True,
            remove_forms=True,
        )
        markdown = convert(clean_html, options, preprocessing)

        return self._md_cleaner.clean(markdown, strip_images=not self.with_image)

    def _maybe_start_lightpanda(self) -> None:
        """If binary path is set and process not running — start it.

        The listen address is fixed at 127.0.0.1:9222, which matches the
        default ``BrowserConfig.cdp_url``.
        """
        if not self.config.lightpanda_bin:
            return
        if self._lightpanda_proc and self._lightpanda_proc.poll() is None:
            return

        self._lightpanda_proc = subprocess.Popen(
            [
                self.config.lightpanda_bin,
                "serve",
                "--host",
                "127.0.0.1",
                "--port",
                "9222",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # Fixed grace period before the caller connects. This is a blocking
        # sleep: when reached from a coroutine it stalls the event loop.
        time.sleep(1.5)

    def stop_lightpanda(self) -> None:
        """Explicitly stops Lightpanda if started by us.

        Terminates the child and waits for it to exit so the process is
        reaped; if it ignores the request for _LIGHTPANDA_STOP_TIMEOUT
        seconds it is killed. Safe to call repeatedly.
        """
        proc = self._lightpanda_proc
        if proc is None:
            return
        self._lightpanda_proc = None

        if proc.poll() is not None:
            return  # already exited and reaped

        proc.terminate()
        try:
            proc.wait(timeout=self._LIGHTPANDA_STOP_TIMEOUT)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.stop_lightpanda()
401
+
402
+
403
if __name__ == "__main__":
    import sys

    # The first command-line argument is the page to convert; without one
    # a demo page is used.
    cli_args = sys.argv[1:]
    url = cli_args[0] if cli_args else "https://example.com"

    # cdp_url=None: launch headless Chromium rather than attach over CDP.
    standalone_config = BrowserConfig(cdp_url=None)
    parser = PageParser(
        with_image=False,
        with_link=False,
        browser_config=standalone_config,
    )

    with parser:
        result = parser.parse(url)

    print(result)
@@ -0,0 +1,265 @@
1
+ Metadata-Version: 2.4
2
+ Name: pg2md
3
+ Version: 1.0.0
4
+ Summary: Page to Markdown converter with JS rendering support via Playwright
5
+ Project-URL: Homepage, https://github.com/yourusername/pg2md
6
+ Project-URL: Documentation, https://github.com/yourusername/pg2md#readme
7
+ Project-URL: Repository, https://github.com/yourusername/pg2md
8
+ Project-URL: Issues, https://github.com/yourusername/pg2md/issues
9
+ Author-email: Your Name <your@email.com>
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: converter,html,js-rendering,markdown,parser,playwright,scraper,web-scraping
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Internet :: WWW/HTTP
22
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
23
+ Classifier: Topic :: Utilities
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: beautifulsoup4>=4.12.0
26
+ Requires-Dist: html-to-markdown>=2.0.0
27
+ Requires-Dist: playwright>=1.40.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
30
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
31
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # pg2md
35
+
36
+ [![PyPI version](https://badge.fury.io/py/pg2md.svg)](https://badge.fury.io/py/pg2md)
37
+ [![Python](https://img.shields.io/pypi/pyversions/pg2md.svg)](https://pypi.org/project/pg2md/)
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
39
+ [![Downloads](https://static.pepy.tech/badge/pg2md)](https://pepy.tech/project/pg2md)
40
+ [![GitHub stars](https://img.shields.io/github/stars/yourusername/pg2md.svg?style=social)](https://github.com/yourusername/pg2md/stargazers)
41
+ [![GitHub issues](https://img.shields.io/github/issues/yourusername/pg2md.svg)](https://github.com/yourusername/pg2md/issues)
42
+ [![GitHub forks](https://img.shields.io/github/forks/yourusername/pg2md.svg?style=social)](https://github.com/yourusername/pg2md/network/members)
43
+
44
+ **P**a**g**e to **M**ark**d**own — fast HTML-to-Markdown converter with JavaScript rendering support.
45
+
46
+ Converts any web page to clean Markdown using Playwright for JS rendering and Rust-based `html-to-markdown` for conversion.
47
+
48
+ ## Features
49
+
50
+ - **JavaScript Rendering** — handles SPA, React, Vue, dynamic content
51
+ - **Fast Conversion** — Rust-based `html-to-markdown` core
52
+ - **Clean Output** — strips scripts, styles, navigation, forms
53
+ - **Proxy Support** — HTTP/HTTPS/SOCKS5 with auth
54
+ - **Custom User-Agents** — includes Googlebot, Bingbot, etc.
55
+ - **Async & Sync API** — `parse()` and `async_parse()`
56
+ - **Batch Processing** — `async_parse_many()` for parallel requests
57
+ - **Configurable** — images, links, headers, timeouts
58
+
59
+ ## Installation
60
+
61
+ ```bash
62
+ pip install pg2md
63
+ playwright install chromium  # only needed when launching Chromium locally (BrowserConfig(cdp_url=None))
64
+ ```
65
+
66
+ ## Quick Start
67
+
68
+ ```python
69
+ from pg2md import PageParser
70
+
71
+ parser = PageParser(with_image=False, with_link=True)  # default config connects to a CDP endpoint at ws://127.0.0.1:9222
72
+ markdown = parser.parse("https://example.com")
73
+ print(markdown)
74
+ ```
75
+
76
+ ## Usage Examples
77
+
78
+ ### Basic Usage
79
+
80
+ ```python
81
+ from pg2md import PageParser
82
+
83
+ parser = PageParser()
84
+ result = parser.parse("https://example.com")
85
+ print(result)
86
+ ```
87
+
88
+ ### Without Images and Links
89
+
90
+ ```python
91
+ from pg2md import PageParser
92
+
93
+ parser = PageParser(with_image=False, with_link=False)
94
+ result = parser.parse("https://example.com")
95
+ ```
96
+
97
+ ### With Proxy
98
+
99
+ ```python
100
+ from pg2md import PageParser, ProxyConfig
101
+
102
+ proxy = ProxyConfig(
103
+ server="http://proxy.example.com:8080",
104
+ username="user",
105
+ password="pass"
106
+ )
107
+
108
+ parser = PageParser()
109
+ result = parser.parse("https://example.com", proxy=proxy)
110
+ ```
111
+
112
+ ### SOCKS5 Proxy
113
+
114
+ ```python
115
+ from pg2md import PageParser, ProxyConfig
116
+
117
+ proxy = ProxyConfig(server="socks5://127.0.0.1:1080")
118
+ parser = PageParser()
119
+ result = parser.parse("https://example.com", proxy=proxy)
120
+ ```
121
+
122
+ ### Custom User-Agent
123
+
124
+ ```python
125
+ from pg2md import PageParser, BrowserConfig, UserAgents
126
+
127
+ config = BrowserConfig(
128
+ cdp_url=None,
129
+ user_agent=UserAgents.GOOGLEBOT_DESKTOP,
130
+ extra_headers={"Accept-Language": "en-US,en;q=0.9"}
131
+ )
132
+
133
+ parser = PageParser(browser_config=config)
134
+ result = parser.parse("https://example.com")
135
+ ```
136
+
137
+ ### Async API
138
+
139
+ ```python
140
+ import asyncio
141
+ from pg2md import PageParser
142
+
143
+ async def main():
144
+ parser = PageParser()
145
+ result = await parser.async_parse("https://example.com")
146
+ print(result)
147
+
148
+ asyncio.run(main())
149
+ ```
150
+
151
+ ### Batch Processing
152
+
153
+ ```python
154
+ import asyncio
155
+ from pg2md import PageParser
156
+
157
+ async def main():
158
+ parser = PageParser()
159
+ urls = [
160
+ "https://example.com",
161
+ "https://example.org",
162
+ "https://example.net",
163
+ ]
164
+ results = await parser.async_parse_many(urls)
165
+
166
+ for url, result in results.items():
167
+ if isinstance(result, Exception):
168
+ print(f"Error {url}: {result}")
169
+ else:
170
+ print(f"{url}: {len(result)} chars")
171
+
172
+ asyncio.run(main())
173
+ ```
174
+
175
+ ### Using Lightpanda
176
+
177
+ ```python
178
+ from pg2md import PageParser, BrowserConfig
179
+
180
+ # Start Lightpanda manually:
181
+ # ./lightpanda serve --host 127.0.0.1 --port 9222
182
+
183
+ config = BrowserConfig(cdp_url="ws://127.0.0.1:9222")
184
+ parser = PageParser(browser_config=config)
185
+ result = parser.parse("https://example.com")
186
+ ```
187
+
188
+ ## Configuration
189
+
190
+ ### BrowserConfig
191
+
192
+ | Parameter | Type | Default | Description |
193
+ |-----------|------|---------|-------------|
194
+ | `cdp_url` | `str \| None` | `"ws://127.0.0.1:9222"` | CDP endpoint to connect to (Lightpanda/Chrome); `None` launches headless Chromium via Playwright instead |
195
+ | `lightpanda_bin` | `str \| None` | `None` | Path to Lightpanda binary; when set, it is started automatically with `serve --host 127.0.0.1 --port 9222` |
196
+ | `navigation_timeout` | `int` | `30000` | Navigation timeout (ms) |
197
+ | `wait_until` | `str` | `"networkidle"` | Navigation wait event: `"load"`, `"domcontentloaded"` or `"networkidle"` |
198
+ | `default_proxy` | `ProxyConfig \| None` | `None` | Default proxy for all requests |
199
+ | `user_agent` | `str \| None` | Chrome Desktop | User-Agent string |
200
+ | `extra_headers` | `dict \| None` | `None` | Additional HTTP headers |
201
+
202
+ ### ProxyConfig
203
+
204
+ | Parameter | Type | Default | Description |
205
+ |-----------|------|---------|-------------|
206
+ | `server` | `str` | required | Proxy URL |
207
+ | `username` | `str \| None` | `None` | Username |
208
+ | `password` | `str \| None` | `None` | Password |
209
+ | `bypass` | `str \| None` | `None` | Hosts to bypass |
210
+
211
+ ### UserAgents
212
+
213
+ Available presets:
214
+
215
+ - `CHROME_DESKTOP`, `CHROME_MAC`, `CHROME_LINUX`
216
+ - `FIREFOX_DESKTOP`
217
+ - `SAFARI_MAC`
218
+ - `EDGE`
219
+ - `GOOGLEBOT_DESKTOP`, `GOOGLEBOT_MOBILE`, `GOOGLEBOT_VIDEO`
220
+ - `BINGBOT`, `BINGBOT_MOBILE`
221
+ - `YANDEXBOT`
222
+ - `DUCKBOT`
223
+ - `APPLEBOT`
224
+
225
+ ## API Reference
226
+
227
+ ### PageParser
228
+
229
+ ```python
230
+ PageParser(
231
+ with_image: bool = False,
232
+ with_link: bool = True,
233
+ browser_config: BrowserConfig | None = None
234
+ )
235
+ ```
236
+
237
+ #### Methods
238
+
239
+ | Method | Description |
240
+ |--------|-------------|
241
+ | `parse(url, proxy=None)` | Sync parse, returns Markdown string |
242
+ | `async_parse(url, proxy=None)` | Async parse, returns Markdown string |
243
+ | `async_parse_many(urls, proxy=None)` | Batch async parse; returns a dict mapping each URL to its Markdown string or to the exception it raised |
244
+ | `stop_lightpanda()` | Stop Lightpanda if started |
245
+
246
+ ## Development
247
+
248
+ ```bash
249
+ git clone https://github.com/yourusername/pg2md.git
250
+ cd pg2md
251
+ python -m venv venv
252
+ source venv/bin/activate
253
+ pip install -e ".[dev]"
254
+ playwright install chromium
255
+ ```
256
+
257
+ ## License
258
+
259
+ [MIT](LICENSE)
260
+
261
+ ## Credits
262
+
263
+ - [Playwright](https://playwright.dev/python/) — browser automation
264
+ - [html-to-markdown](https://pypi.org/project/html-to-markdown/) — Rust-based HTML to Markdown
265
+ - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) — HTML parsing
@@ -0,0 +1,6 @@
1
+ pg2md/__init__.py,sha256=gnJA-EpCZh0rqggk_avwsi3IBGHbc0bddJmB8rrAvBw,896
2
+ pg2md/parser.py,sha256=bnepF5302PsV2SXmHgEla2126SihPE2FuBYNDI5Gp8k,13371
3
+ pg2md-1.0.0.dist-info/METADATA,sha256=5tTGIaPbjSr3qu05zp8hiUQFt94F49JbsRSkFUefPJk,7560
4
+ pg2md-1.0.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
5
+ pg2md-1.0.0.dist-info/licenses/LICENSE,sha256=OphKV48tcMv6ep-7j-8T6nycykPT0g8ZlMJ9zbGvdPs,1066
6
+ pg2md-1.0.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Your Name
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.