pg2md 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pg2md-1.2.0/.gitignore ADDED
@@ -0,0 +1,82 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ *.manifest
30
+ *.spec
31
+
32
+ # Installer logs
33
+ pip-log.txt
34
+ pip-delete-this-directory.txt
35
+
36
+ # Unit test / coverage reports
37
+ htmlcov/
38
+ .tox/
39
+ .nox/
40
+ .coverage
41
+ .coverage.*
42
+ .cache
43
+ nosetests.xml
44
+ coverage.xml
45
+ *.cover
46
+ .hypothesis/
47
+ .pytest_cache/
48
+
49
+ # Translations
50
+ *.mo
51
+ *.pot
52
+
53
+ # Environments
54
+ .env
55
+ .venv
56
+ env/
57
+ venv/
58
+ ENV/
59
+ env.bak/
60
+ venv.bak/
61
+
62
+ # IDEs
63
+ .idea/
64
+ .vscode/
65
+ *.swp
66
+ *.swo
67
+ *~
68
+
69
+ # Linters / formatters
70
+ .ruff_cache/
71
+ .mypy_cache/
72
+ .dmypy.json
73
+ dmypy.json
74
+
75
+ # Jupyter
76
+ .ipynb_checkpoints
77
+
78
+ # Project specific
79
+ *.md
80
+ !README.md
81
+ data.json
82
+ proxies.txt
pg2md-1.2.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 lemantorus
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pg2md-1.2.0/PKG-INFO ADDED
@@ -0,0 +1,273 @@
1
+ Metadata-Version: 2.4
2
+ Name: pg2md
3
+ Version: 1.2.0
4
+ Summary: Page to Markdown converter with JS rendering support via Playwright
5
+ Project-URL: Homepage, https://github.com/lemantorus/pg2md
6
+ Project-URL: Documentation, https://github.com/lemantorus/pg2md#readme
7
+ Project-URL: Repository, https://github.com/lemantorus/pg2md
8
+ Project-URL: Issues, https://github.com/lemantorus/pg2md/issues
9
+ Author: lemantorus
10
+ License-Expression: MIT
11
+ License-File: LICENSE
12
+ Keywords: converter,html,js-rendering,markdown,parser,playwright,scraper,web-scraping
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Topic :: Internet :: WWW/HTTP
22
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
23
+ Classifier: Topic :: Utilities
24
+ Requires-Python: >=3.10
25
+ Requires-Dist: beautifulsoup4>=4.12.0
26
+ Requires-Dist: html-to-markdown>=2.0.0
27
+ Requires-Dist: playwright>=1.40.0
28
+ Provides-Extra: dev
29
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
30
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
31
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
32
+ Description-Content-Type: text/markdown
33
+
34
+ # pg2md
35
+
36
+ [![PyPI version](https://badge.fury.io/py/pg2md.svg)](https://badge.fury.io/py/pg2md)
37
+ [![Python](https://img.shields.io/pypi/pyversions/pg2md.svg)](https://pypi.org/project/pg2md/)
38
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
39
+ [![Downloads](https://static.pepy.tech/badge/pg2md)](https://pepy.tech/project/pg2md)
40
+ [![GitHub stars](https://img.shields.io/github/stars/lemantorus/pg2md.svg?style=social)](https://github.com/lemantorus/pg2md/stargazers)
41
+ [![GitHub issues](https://img.shields.io/github/issues/lemantorus/pg2md.svg)](https://github.com/lemantorus/pg2md/issues)
42
+ [![GitHub forks](https://img.shields.io/github/forks/lemantorus/pg2md.svg?style=social)](https://github.com/lemantorus/pg2md/network/members)
43
+
44
+ **P**a**g**e to **M**ark**d**own — fast HTML-to-Markdown converter with JavaScript rendering support.
45
+
46
+ Converts any web page to clean Markdown using Playwright for JS rendering and Rust-based `html-to-markdown` for conversion.
47
+
48
+ ## Features
49
+
50
+ - **Auto Lightpanda** — automatically downloads and starts Lightpanda browser
51
+ - **JavaScript Rendering** — handles SPA, React, Vue, dynamic content
52
+ - **Fast Conversion** — Rust-based `html-to-markdown` core
53
+ - **Clean Output** — strips scripts, styles, navigation, forms
54
+ - **Proxy Support** — HTTP/HTTPS/SOCKS5 with auth
55
+ - **Custom User-Agents** — includes Googlebot, Bingbot, etc.
56
+ - **Async & Sync API** — `parse()` and `async_parse()`
57
+ - **Batch Processing** — `async_parse_many()` for parallel requests
58
+ - **Configurable** — images, links, headers, timeouts
59
+
60
+ ## Quick Start
61
+
62
+ ```python
63
+ from pg2md import PageParser
64
+
65
+ # Auto-downloads Lightpanda, starts it, and parses page
66
+ # If Lightpanda fails, falls back to Chromium automatically
67
+ parser = PageParser()
68
+ markdown = parser.parse("https://example.com")
69
+ print(markdown)
70
+ ```
71
+
72
+ First run will download Lightpanda (~50MB) to `~/.cache/pg2md/`.
73
+
74
+ **Note:** Lightpanda is in beta and may not support all Playwright features. If it fails, pg2md automatically falls back to Chromium.
75
+
76
+ ## Installation
77
+
78
+ ```bash
79
+ pip install pg2md
80
+ playwright install chromium # fallback browser if Lightpanda unsupported
81
+ ```
82
+
83
+ ## Usage Examples
84
+
85
+ ### Basic Usage
86
+
87
+ ```python
88
+ from pg2md import PageParser
89
+
90
+ parser = PageParser()
91
+ result = parser.parse("https://example.com")
92
+ print(result)
93
+ ```
94
+
95
+ ### Without Images and Links
96
+
97
+ ```python
98
+ from pg2md import PageParser
99
+
100
+ parser = PageParser(with_image=False, with_link=False)
101
+ result = parser.parse("https://example.com")
102
+ ```
103
+
104
+ ### With Proxy
105
+
106
+ ```python
107
+ from pg2md import PageParser, ProxyConfig
108
+
109
+ proxy = ProxyConfig(
110
+ server="http://proxy.example.com:8080",
111
+ username="user",
112
+ password="pass"
113
+ )
114
+
115
+ parser = PageParser()
116
+ result = parser.parse("https://example.com", proxy=proxy)
117
+ ```
118
+
119
+ ### SOCKS5 Proxy
120
+
121
+ ```python
122
+ from pg2md import PageParser, ProxyConfig
123
+
124
+ proxy = ProxyConfig(server="socks5://127.0.0.1:1080")
125
+ parser = PageParser()
126
+ result = parser.parse("https://example.com", proxy=proxy)
127
+ ```
128
+
129
+ ### Custom User-Agent
130
+
131
+ ```python
132
+ from pg2md import PageParser, BrowserConfig, UserAgents
133
+
134
+ config = BrowserConfig(
135
+ cdp_url=None,
136
+ user_agent=UserAgents.GOOGLEBOT_DESKTOP,
137
+ extra_headers={"Accept-Language": "en-US,en;q=0.9"}
138
+ )
139
+
140
+ parser = PageParser(browser_config=config)
141
+ result = parser.parse("https://example.com")
142
+ ```
143
+
144
+ ### Async API
145
+
146
+ ```python
147
+ import asyncio
148
+ from pg2md import PageParser
149
+
150
+ async def main():
151
+ parser = PageParser()
152
+ result = await parser.async_parse("https://example.com")
153
+ print(result)
154
+
155
+ asyncio.run(main())
156
+ ```
157
+
158
+ ### Batch Processing
159
+
160
+ ```python
161
+ import asyncio
162
+ from pg2md import PageParser
163
+
164
+ async def main():
165
+ parser = PageParser()
166
+ urls = [
167
+ "https://example.com",
168
+ "https://example.org",
169
+ "https://example.net",
170
+ ]
171
+ results = await parser.async_parse_many(urls)
172
+
173
+ for url, result in results.items():
174
+ if isinstance(result, Exception):
175
+ print(f"Error {url}: {result}")
176
+ else:
177
+ print(f"{url}: {len(result)} chars")
178
+
179
+ asyncio.run(main())
180
+ ```
181
+
182
+ ### Using Lightpanda
183
+
184
+ ```python
185
+ from pg2md import PageParser, BrowserConfig
186
+
187
+ # Start Lightpanda manually:
188
+ # ./lightpanda serve --host 127.0.0.1 --port 9222
189
+
190
+ config = BrowserConfig(cdp_url="ws://127.0.0.1:9222")
191
+ parser = PageParser(browser_config=config)
192
+ result = parser.parse("https://example.com")
193
+ ```
194
+
195
+ ## Configuration
196
+
197
+ ### BrowserConfig
198
+
199
+ | Parameter | Type | Default | Description |
200
+ |-----------|------|---------|-------------|
201
+ | `cdp_url` | `str \| None` | `"auto"` | `"auto"` = auto Lightpanda, `None` = Chromium, `"ws://..."` = custom CDP |
202
+ | `lightpanda_bin` | `str \| None` | `None` | Path to Lightpanda binary (for manual start) |
203
+ | `navigation_timeout` | `int` | `60000` | Navigation timeout (ms) |
204
+ | `wait_until` | `str` | `"domcontentloaded"` | Wait event (`"load"`, `"domcontentloaded"`, `"networkidle"`) |
205
+ | `default_proxy` | `ProxyConfig \| None` | `None` | Default proxy for all requests |
206
+ | `user_agent` | `str \| None` | Chrome Desktop | User-Agent string |
207
+ | `extra_headers` | `dict \| None` | `None` | Additional HTTP headers |
208
+ | `viewport` | `dict \| None` | `{"width": 1920, "height": 1080}` | Browser viewport size |
209
+
210
+ ### ProxyConfig
211
+
212
+ | Parameter | Type | Default | Description |
213
+ |-----------|------|---------|-------------|
214
+ | `server` | `str` | required | Proxy URL |
215
+ | `username` | `str \| None` | `None` | Username |
216
+ | `password` | `str \| None` | `None` | Password |
217
+ | `bypass` | `str \| None` | `None` | Hosts to bypass |
218
+
219
+ ### UserAgents
220
+
221
+ Available presets:
222
+
223
+ - `CHROME_DESKTOP`, `CHROME_MAC`, `CHROME_LINUX`
224
+ - `FIREFOX_DESKTOP`
225
+ - `SAFARI_MAC`
226
+ - `EDGE`
227
+ - `GOOGLEBOT_DESKTOP`, `GOOGLEBOT_MOBILE`, `GOOGLEBOT_VIDEO`
228
+ - `BINGBOT`, `BINGBOT_MOBILE`
229
+ - `YANDEXBOT`
230
+ - `DUCKBOT`
231
+ - `APPLEBOT`
232
+
233
+ ## API Reference
234
+
235
+ ### PageParser
236
+
237
+ ```python
238
+ PageParser(
239
+ with_image: bool = False,
240
+ with_link: bool = True,
241
+ browser_config: BrowserConfig | None = None
242
+ )
243
+ ```
244
+
245
+ #### Methods
246
+
247
+ | Method | Description |
248
+ |--------|-------------|
249
+ | `parse(url, proxy=None)` | Sync parse, returns Markdown string |
250
+ | `async_parse(url, proxy=None)` | Async parse, returns Markdown string |
251
+ | `async_parse_many(urls, proxy=None)` | Batch async parse, returns dict |
252
+ | `stop_lightpanda()` | Stop Lightpanda if started |
253
+
254
+ ## Development
255
+
256
+ ```bash
257
+ git clone https://github.com/lemantorus/pg2md.git
258
+ cd pg2md
259
+ python -m venv venv
260
+ source venv/bin/activate
261
+ pip install -e ".[dev]"
262
+ playwright install chromium
263
+ ```
264
+
265
+ ## License
266
+
267
+ [MIT](LICENSE)
268
+
269
+ ## Credits
270
+
271
+ - [Playwright](https://playwright.dev/python/) — browser automation
272
+ - [html-to-markdown](https://pypi.org/project/html-to-markdown/) — Rust-based HTML to Markdown
273
+ - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) — HTML parsing
pg2md-1.2.0/README.md ADDED
@@ -0,0 +1,240 @@
1
+ # pg2md
2
+
3
+ [![PyPI version](https://badge.fury.io/py/pg2md.svg)](https://badge.fury.io/py/pg2md)
4
+ [![Python](https://img.shields.io/pypi/pyversions/pg2md.svg)](https://pypi.org/project/pg2md/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+ [![Downloads](https://static.pepy.tech/badge/pg2md)](https://pepy.tech/project/pg2md)
7
+ [![GitHub stars](https://img.shields.io/github/stars/lemantorus/pg2md.svg?style=social)](https://github.com/lemantorus/pg2md/stargazers)
8
+ [![GitHub issues](https://img.shields.io/github/issues/lemantorus/pg2md.svg)](https://github.com/lemantorus/pg2md/issues)
9
+ [![GitHub forks](https://img.shields.io/github/forks/lemantorus/pg2md.svg?style=social)](https://github.com/lemantorus/pg2md/network/members)
10
+
11
+ **P**a**g**e to **M**ark**d**own — fast HTML-to-Markdown converter with JavaScript rendering support.
12
+
13
+ Converts any web page to clean Markdown using Playwright for JS rendering and Rust-based `html-to-markdown` for conversion.
14
+
15
+ ## Features
16
+
17
+ - **Auto Lightpanda** — automatically downloads and starts Lightpanda browser
18
+ - **JavaScript Rendering** — handles SPA, React, Vue, dynamic content
19
+ - **Fast Conversion** — Rust-based `html-to-markdown` core
20
+ - **Clean Output** — strips scripts, styles, navigation, forms
21
+ - **Proxy Support** — HTTP/HTTPS/SOCKS5 with auth
22
+ - **Custom User-Agents** — includes Googlebot, Bingbot, etc.
23
+ - **Async & Sync API** — `parse()` and `async_parse()`
24
+ - **Batch Processing** — `async_parse_many()` for parallel requests
25
+ - **Configurable** — images, links, headers, timeouts
26
+
27
+ ## Quick Start
28
+
29
+ ```python
30
+ from pg2md import PageParser
31
+
32
+ # Auto-downloads Lightpanda, starts it, and parses page
33
+ # If Lightpanda fails, falls back to Chromium automatically
34
+ parser = PageParser()
35
+ markdown = parser.parse("https://example.com")
36
+ print(markdown)
37
+ ```
38
+
39
+ First run will download Lightpanda (~50MB) to `~/.cache/pg2md/`.
40
+
41
+ **Note:** Lightpanda is in beta and may not support all Playwright features. If it fails, pg2md automatically falls back to Chromium.
42
+
43
+ ## Installation
44
+
45
+ ```bash
46
+ pip install pg2md
47
+ playwright install chromium # fallback browser if Lightpanda unsupported
48
+ ```
49
+
50
+ ## Usage Examples
51
+
52
+ ### Basic Usage
53
+
54
+ ```python
55
+ from pg2md import PageParser
56
+
57
+ parser = PageParser()
58
+ result = parser.parse("https://example.com")
59
+ print(result)
60
+ ```
61
+
62
+ ### Without Images and Links
63
+
64
+ ```python
65
+ from pg2md import PageParser
66
+
67
+ parser = PageParser(with_image=False, with_link=False)
68
+ result = parser.parse("https://example.com")
69
+ ```
70
+
71
+ ### With Proxy
72
+
73
+ ```python
74
+ from pg2md import PageParser, ProxyConfig
75
+
76
+ proxy = ProxyConfig(
77
+ server="http://proxy.example.com:8080",
78
+ username="user",
79
+ password="pass"
80
+ )
81
+
82
+ parser = PageParser()
83
+ result = parser.parse("https://example.com", proxy=proxy)
84
+ ```
85
+
86
+ ### SOCKS5 Proxy
87
+
88
+ ```python
89
+ from pg2md import PageParser, ProxyConfig
90
+
91
+ proxy = ProxyConfig(server="socks5://127.0.0.1:1080")
92
+ parser = PageParser()
93
+ result = parser.parse("https://example.com", proxy=proxy)
94
+ ```
95
+
96
+ ### Custom User-Agent
97
+
98
+ ```python
99
+ from pg2md import PageParser, BrowserConfig, UserAgents
100
+
101
+ config = BrowserConfig(
102
+ cdp_url=None,
103
+ user_agent=UserAgents.GOOGLEBOT_DESKTOP,
104
+ extra_headers={"Accept-Language": "en-US,en;q=0.9"}
105
+ )
106
+
107
+ parser = PageParser(browser_config=config)
108
+ result = parser.parse("https://example.com")
109
+ ```
110
+
111
+ ### Async API
112
+
113
+ ```python
114
+ import asyncio
115
+ from pg2md import PageParser
116
+
117
+ async def main():
118
+ parser = PageParser()
119
+ result = await parser.async_parse("https://example.com")
120
+ print(result)
121
+
122
+ asyncio.run(main())
123
+ ```
124
+
125
+ ### Batch Processing
126
+
127
+ ```python
128
+ import asyncio
129
+ from pg2md import PageParser
130
+
131
+ async def main():
132
+ parser = PageParser()
133
+ urls = [
134
+ "https://example.com",
135
+ "https://example.org",
136
+ "https://example.net",
137
+ ]
138
+ results = await parser.async_parse_many(urls)
139
+
140
+ for url, result in results.items():
141
+ if isinstance(result, Exception):
142
+ print(f"Error {url}: {result}")
143
+ else:
144
+ print(f"{url}: {len(result)} chars")
145
+
146
+ asyncio.run(main())
147
+ ```
148
+
149
+ ### Using Lightpanda
150
+
151
+ ```python
152
+ from pg2md import PageParser, BrowserConfig
153
+
154
+ # Start Lightpanda manually:
155
+ # ./lightpanda serve --host 127.0.0.1 --port 9222
156
+
157
+ config = BrowserConfig(cdp_url="ws://127.0.0.1:9222")
158
+ parser = PageParser(browser_config=config)
159
+ result = parser.parse("https://example.com")
160
+ ```
161
+
162
+ ## Configuration
163
+
164
+ ### BrowserConfig
165
+
166
+ | Parameter | Type | Default | Description |
167
+ |-----------|------|---------|-------------|
168
+ | `cdp_url` | `str \| None` | `"auto"` | `"auto"` = auto Lightpanda, `None` = Chromium, `"ws://..."` = custom CDP |
169
+ | `lightpanda_bin` | `str \| None` | `None` | Path to Lightpanda binary (for manual start) |
170
+ | `navigation_timeout` | `int` | `60000` | Navigation timeout (ms) |
171
+ | `wait_until` | `str` | `"domcontentloaded"` | Wait event (`"load"`, `"domcontentloaded"`, `"networkidle"`) |
172
+ | `default_proxy` | `ProxyConfig \| None` | `None` | Default proxy for all requests |
173
+ | `user_agent` | `str \| None` | Chrome Desktop | User-Agent string |
174
+ | `extra_headers` | `dict \| None` | `None` | Additional HTTP headers |
175
+ | `viewport` | `dict \| None` | `{"width": 1920, "height": 1080}` | Browser viewport size |
176
+
177
+ ### ProxyConfig
178
+
179
+ | Parameter | Type | Default | Description |
180
+ |-----------|------|---------|-------------|
181
+ | `server` | `str` | required | Proxy URL |
182
+ | `username` | `str \| None` | `None` | Username |
183
+ | `password` | `str \| None` | `None` | Password |
184
+ | `bypass` | `str \| None` | `None` | Hosts to bypass |
185
+
186
+ ### UserAgents
187
+
188
+ Available presets:
189
+
190
+ - `CHROME_DESKTOP`, `CHROME_MAC`, `CHROME_LINUX`
191
+ - `FIREFOX_DESKTOP`
192
+ - `SAFARI_MAC`
193
+ - `EDGE`
194
+ - `GOOGLEBOT_DESKTOP`, `GOOGLEBOT_MOBILE`, `GOOGLEBOT_VIDEO`
195
+ - `BINGBOT`, `BINGBOT_MOBILE`
196
+ - `YANDEXBOT`
197
+ - `DUCKBOT`
198
+ - `APPLEBOT`
199
+
200
+ ## API Reference
201
+
202
+ ### PageParser
203
+
204
+ ```python
205
+ PageParser(
206
+ with_image: bool = False,
207
+ with_link: bool = True,
208
+ browser_config: BrowserConfig | None = None
209
+ )
210
+ ```
211
+
212
+ #### Methods
213
+
214
+ | Method | Description |
215
+ |--------|-------------|
216
+ | `parse(url, proxy=None)` | Sync parse, returns Markdown string |
217
+ | `async_parse(url, proxy=None)` | Async parse, returns Markdown string |
218
+ | `async_parse_many(urls, proxy=None)` | Batch async parse, returns dict |
219
+ | `stop_lightpanda()` | Stop Lightpanda if started |
220
+
221
+ ## Development
222
+
223
+ ```bash
224
+ git clone https://github.com/lemantorus/pg2md.git
225
+ cd pg2md
226
+ python -m venv venv
227
+ source venv/bin/activate
228
+ pip install -e ".[dev]"
229
+ playwright install chromium
230
+ ```
231
+
232
+ ## License
233
+
234
+ [MIT](LICENSE)
235
+
236
+ ## Credits
237
+
238
+ - [Playwright](https://playwright.dev/python/) — browser automation
239
+ - [html-to-markdown](https://pypi.org/project/html-to-markdown/) — Rust-based HTML to Markdown
240
+ - [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) — HTML parsing
Binary file
@@ -0,0 +1,64 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "pg2md"
7
+ version = "1.2.0"
8
+ description = "Page to Markdown converter with JS rendering support via Playwright"
9
+ readme = "README.md"
10
+ license = "MIT"
11
+ requires-python = ">=3.10"
12
+ authors = [{ name = "lemantorus" }]
13
+ keywords = [
14
+ "markdown",
15
+ "html",
16
+ "converter",
17
+ "playwright",
18
+ "scraper",
19
+ "parser",
20
+ "web-scraping",
21
+ "js-rendering",
22
+ ]
23
+ classifiers = [
24
+ "Development Status :: 4 - Beta",
25
+ "Intended Audience :: Developers",
26
+ "License :: OSI Approved :: MIT License",
27
+ "Operating System :: OS Independent",
28
+ "Programming Language :: Python :: 3",
29
+ "Programming Language :: Python :: 3.10",
30
+ "Programming Language :: Python :: 3.11",
31
+ "Programming Language :: Python :: 3.12",
32
+ "Topic :: Internet :: WWW/HTTP",
33
+ "Topic :: Text Processing :: Markup :: Markdown",
34
+ "Topic :: Utilities",
35
+ ]
36
+ dependencies = [
37
+ "playwright>=1.40.0",
38
+ "html-to-markdown>=2.0.0",
39
+ "beautifulsoup4>=4.12.0",
40
+ ]
41
+
42
+ [project.optional-dependencies]
43
+ dev = ["pytest>=7.0.0", "ruff>=0.1.0", "mypy>=1.0.0"]
44
+
45
+ [project.urls]
46
+ Homepage = "https://github.com/lemantorus/pg2md"
47
+ Documentation = "https://github.com/lemantorus/pg2md#readme"
48
+ Repository = "https://github.com/lemantorus/pg2md"
49
+ Issues = "https://github.com/lemantorus/pg2md/issues"
50
+
51
+ [tool.hatch.build.targets.wheel]
52
+ packages = ["src/pg2md"]
53
+
54
+ [tool.ruff]
55
+ line-length = 100
56
+ target-version = "py310"
57
+
58
+ [tool.ruff.lint]
59
+ select = ["E", "F", "I", "UP", "B"]
60
+
61
+ [tool.mypy]
62
+ python_version = "3.10"
63
+ warn_return_any = true
64
+ warn_unused_configs = true
@@ -0,0 +1,49 @@
1
+ """
2
+ pg2md — Page to Markdown converter with JS rendering support.
3
+
4
+ A fast, clean HTML-to-Markdown converter that uses Playwright for
5
+ JavaScript rendering and html-to-markdown (Rust-based) for conversion.
6
+
7
+ Features:
8
+ - Auto-download and start Lightpanda browser (default)
9
+ - JavaScript rendering via Playwright/Lightpanda
10
+ - Fast Rust-based HTML-to-Markdown conversion
11
+ - Proxy support (HTTP/HTTPS/SOCKS5)
12
+ - Custom User-Agents (Googlebot, Bingbot, etc.)
13
+
14
+ Example:
15
+ from pg2md import PageParser
16
+
17
+ # Auto-downloads Lightpanda and parses page
18
+ parser = PageParser()
19
+ markdown = parser.parse("https://example.com")
20
+ print(markdown)
21
+
22
+ # With proxy
23
+ from pg2md import ProxyConfig
24
+ proxy = ProxyConfig(server="socks5://1.2.3.4:1080")
25
+ markdown = parser.parse("https://example.com", proxy=proxy)
26
+ """
27
+
28
+ from pg2md.parser import (
29
+ PageParser,
30
+ BrowserConfig,
31
+ ProxyConfig,
32
+ UserAgents,
33
+ HtmlCleaner,
34
+ MarkdownCleaner,
35
+ )
36
+ from pg2md import lightpanda
37
+
38
# Package metadata.
__version__ = "1.2.0"
__author__ = "lemantorus"

# Public names re-exported at package level.
__all__ = [
    "PageParser", "BrowserConfig", "ProxyConfig",
    "UserAgents", "HtmlCleaner", "MarkdownCleaner",
    "lightpanda", "__version__",
]
@@ -0,0 +1,163 @@
1
+ """
2
+ Lightpanda browser auto-downloader and launcher.
3
+
4
+ Automatically downloads Lightpanda binary from GitHub releases
5
+ and manages the CDP server process.
6
+
7
+ Supported platforms:
8
+ - Linux x86_64
9
+ - macOS arm64 (Apple Silicon)
10
+ """
11
+
12
+ import platform
13
+ import subprocess
14
+ import time
15
+ import urllib.request
16
+ from pathlib import Path
17
+
18
# Version label shown in the download progress message.
LIGHTPANDA_VERSION = "nightly"

# Common prefix of every release asset listed below.
_NIGHTLY_RELEASE_URL = "https://github.com/lightpanda-io/browser/releases/download/nightly"

# Release asset for each supported (platform.system(), platform.machine()) pair.
DOWNLOAD_URLS = {
    ("Linux", "x86_64"): f"{_NIGHTLY_RELEASE_URL}/lightpanda-x86_64-linux",
    ("Darwin", "arm64"): f"{_NIGHTLY_RELEASE_URL}/lightpanda-aarch64-macos",
}
30
+
31
+
32
def get_cache_dir() -> Path:
    """Return the directory used to cache the Lightpanda binary.

    The path is ``<home>/.cache/pg2md``; this function does not create it.
    """
    return Path.home().joinpath(".cache", "pg2md")
36
+
37
+
38
def get_binary_path() -> Path:
    """Return where the Lightpanda executable lives inside the cache directory."""
    cache_dir = get_cache_dir()
    return cache_dir / "lightpanda"
41
+
42
+
43
def get_platform_key() -> tuple[str, str]:
    """Return the current platform as a ``(system, machine)`` pair."""
    system_name = platform.system()
    machine_name = platform.machine()
    return system_name, machine_name
46
+
47
+
48
def is_supported() -> bool:
    """Tell whether a Lightpanda download is listed for the current platform."""
    current_platform = get_platform_key()
    return current_platform in DOWNLOAD_URLS
51
+
52
+
53
def is_downloaded() -> bool:
    """Tell whether the Lightpanda binary is already present in the cache."""
    binary = get_binary_path()
    return binary.exists()
56
+
57
+
58
def download(force: bool = False) -> Path:
    """
    Download Lightpanda binary if not already present.

    The payload is written to a temporary ``.part`` file next to the final
    location and only renamed into place once the transfer has finished, so
    an interrupted download never leaves a truncated binary that a later
    call would mistake for a valid one.

    Args:
        force: Re-download even if binary exists

    Returns:
        Path to the binary

    Raises:
        RuntimeError: If platform is not supported
    """
    binary_path = get_binary_path()

    if binary_path.exists() and not force:
        return binary_path

    key = get_platform_key()
    if key not in DOWNLOAD_URLS:
        raise RuntimeError(
            f"Lightpanda not available for {key[0]} {key[1]}. "
            f"Supported: {list(DOWNLOAD_URLS.keys())}"
        )

    url = DOWNLOAD_URLS[key]
    binary_path.parent.mkdir(parents=True, exist_ok=True)

    # Stage the download beside the target so the final rename stays on one
    # filesystem and is therefore atomic.
    partial_path = binary_path.with_name(binary_path.name + ".part")

    print(f"[pg2md] Downloading Lightpanda {LIGHTPANDA_VERSION}...")
    try:
        urllib.request.urlretrieve(url, partial_path)
        partial_path.chmod(0o755)
        partial_path.replace(binary_path)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial_path.unlink(missing_ok=True)
    print(f"[pg2md] Saved to {binary_path}")

    return binary_path
92
+
93
+
94
def start(
    host: str = "127.0.0.1",
    port: int = 9222,
    download_if_missing: bool = True,
) -> subprocess.Popen:
    """
    Start Lightpanda CDP server.

    Launches ``lightpanda serve`` with its output discarded and blocks until
    the CDP port accepts connections (up to 10 seconds).

    Args:
        host: Host to bind to
        port: Port to bind to
        download_if_missing: Download binary if not present

    Returns:
        subprocess.Popen object

    Raises:
        FileNotFoundError: If the binary is missing and download_if_missing is False
        TimeoutError: If the server does not become reachable in time
    """
    if download_if_missing:
        binary_path = download()
    else:
        binary_path = get_binary_path()
        if not binary_path.exists():
            raise FileNotFoundError(
                f"Lightpanda binary not found at {binary_path}. "
                "Set download_if_missing=True to auto-download."
            )

    proc = subprocess.Popen(
        [str(binary_path), "serve", "--host", host, "--port", str(port)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    cdp_url = f"ws://{host}:{port}"
    try:
        _wait_for_cdp(cdp_url, timeout=10)
    except BaseException:
        # The caller never receives the handle on failure, so stop the child
        # here instead of leaking a background process.
        stop(proc)
        raise

    return proc
130
+
131
+
132
def _wait_for_cdp(cdp_url: str, timeout: float = 10.0) -> bool:
    """Wait for CDP server to be ready.

    Repeatedly tries to open a TCP connection to the host and port named in
    ``cdp_url`` until one succeeds or ``timeout`` seconds have elapsed.

    Args:
        cdp_url: Server URL such as ``ws://127.0.0.1:9222``
        timeout: Maximum number of seconds to keep trying

    Returns:
        True once a connection was accepted

    Raises:
        ValueError: If ``cdp_url`` has no host or no valid port
        TimeoutError: If the server is not reachable within ``timeout``
    """
    import socket
    from urllib.parse import urlsplit

    parts = urlsplit(cdp_url)
    host = parts.hostname
    port = parts.port  # raises ValueError itself when the port is malformed
    if not host or port is None:
        raise ValueError(f"CDP URL must include host and port: {cdp_url!r}")

    # Monotonic clock: the deadline is immune to system clock adjustments.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # The context manager closes the probe socket on every path.
            with socket.create_connection((host, port), timeout=1):
                pass
        except OSError:
            # Not listening yet (refused, timed out, unresolved) - retry.
            time.sleep(0.2)
            continue
        # Short grace period after the port opens before reporting ready.
        time.sleep(0.5)
        return True

    raise TimeoutError(f"Lightpanda CDP server not ready at {cdp_url}")
154
+
155
+
156
def stop(proc: subprocess.Popen) -> None:
    """Stop Lightpanda process.

    Sends a terminate request and waits up to 5 seconds for the process to
    exit; if it is still running after that, it is killed. Does nothing when
    ``proc`` is falsy or the process has already exited.

    Args:
        proc: Process handle to stop
    """
    if not proc or proc.poll() is not None:
        return

    proc.terminate()
    try:
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        proc.kill()
        # Reap the killed child so it does not linger as a zombie and so
        # proc.returncode is populated for the caller.
        proc.wait()
@@ -0,0 +1,469 @@
1
+ """
2
+ PageParser — Parser for web pages with JS rendering via Lightpanda/Playwright
3
+ and conversion to clean Markdown.
4
+
5
+ Dependencies:
6
+ pip install playwright html-to-markdown beautifulsoup4
7
+ playwright install chromium # if using regular Chrome, not Lightpanda
8
+
9
+ Running Lightpanda (optional, instead of Chrome):
10
+ ./lightpanda serve --host 127.0.0.1 --port 9222
11
+
12
+ Usage:
13
+ parser = PageParser(with_image=False, with_link=False)
14
+
15
+ # Without proxy
16
+ result = parser.parse("https://example.com")
17
+
18
+ # With proxy for a specific request
19
+ proxy = ProxyConfig(server="http://1.2.3.4:8080", username="user", password="pass")
20
+ result = parser.parse("https://example.com", proxy=proxy)
21
+
22
+ print(result)
23
+ """
24
+
25
+ import re
26
+ import subprocess
27
+ import time
28
+ import asyncio
29
+ from dataclasses import dataclass
30
+ from typing import Optional, Literal
31
+
32
+ from bs4 import BeautifulSoup
33
+ from html_to_markdown import convert, ConversionOptions, PreprocessingOptions
34
+ from playwright.async_api import async_playwright, ProxySettings, ViewportSize
35
+
36
+ from pg2md import lightpanda
37
+
38
+
39
@dataclass
class ProxyConfig:
    """
    Proxy settings for a single request.

    Supported server formats:
        http://host:port
        https://host:port
        socks5://host:port

    Args:
        server   : proxy address (required)
        username : login (optional)
        password : password (optional)
        bypass   : comma-separated list of hosts to bypass proxy
                   (e.g. "localhost,127.0.0.1")
    """

    server: str
    username: Optional[str] = None
    password: Optional[str] = None
    bypass: Optional[str] = None

    def to_playwright(self) -> ProxySettings:
        """Build the Playwright proxy dict, leaving out unset/empty optional fields."""
        settings: ProxySettings = {"server": self.server}
        for key in ("username", "password", "bypass"):
            value = getattr(self, key)
            if value:
                settings[key] = value
        return settings
72
+
73
+
74
class UserAgents:
    """
    Popular User-Agent strings for bypassing blocks.

    Plain string constants grouped by kind: desktop browsers first, then
    search-engine crawlers.
    """

    # --- Desktop browsers ---

    CHROME_DESKTOP = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    CHROME_MAC = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    FIREFOX_DESKTOP = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
    )

    SAFARI_MAC = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.2 Safari/605.1.15"
    )

    EDGE = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    )

    # --- Crawlers ---

    GOOGLEBOT_DESKTOP = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

    # "Mobile Safari" is two tokens separated by a space in Chrome-for-Android
    # and Googlebot smartphone user agents; the fused "MobileSafari" form was
    # a typo that made these strings differ from the real ones.
    GOOGLEBOT_MOBILE = (
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Mobile Safari/537.36 "
        "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    )

    GOOGLEBOT_VIDEO = (
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Mobile Safari/537.36 "
        "Googlebot/2.1"
    )

    BINGBOT = "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"

    BINGBOT_MOBILE = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/16.6 Mobile/15E148 BingWeb/7.15.13.7055 (advisor; +http://www.bing.com/bingbot.htm)"
    )

    YANDEXBOT = "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"

    DUCKBOT = "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)"

    APPLEBOT = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.0 Safari/605.1.15 Applebot/0.1"
    )

    CHROME_LINUX = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
135
+
136
+
137
@dataclass
class BrowserConfig:
    """
    Browser connection settings.

    cdp_url options:
        "auto"           - Auto-download and start Lightpanda (default)
        None             - Use built-in Chromium via Playwright
        "ws://host:port" - Connect to existing CDP server (Lightpanda/Chrome)
    """

    # How to reach the browser; see the option table in the docstring.
    cdp_url: Optional[str] = "auto"
    # Path to a Lightpanda executable the parser should launch itself.
    lightpanda_bin: Optional[str] = None
    # Handed to page.goto() as its timeout (Playwright timeouts are in ms).
    navigation_timeout: int = 60_000
    # Page lifecycle event page.goto() waits for.
    wait_until: Literal["load", "domcontentloaded", "networkidle"] = "domcontentloaded"
    # Proxy applied when a request does not bring its own.
    default_proxy: Optional[ProxyConfig] = None
    # User-Agent presented to sites; None leaves it unset.
    user_agent: Optional[str] = UserAgents.CHROME_DESKTOP
    # Additional HTTP headers sent with every request.
    extra_headers: Optional[dict[str, str]] = None
    # Browser viewport; filled in by __post_init__ when left as None.
    viewport: Optional[ViewportSize] = None

    def __post_init__(self):
        """Default the viewport to 1920x1080 when the caller did not set one."""
        if self.viewport is not None:
            return
        self.viewport = {"width": 1920, "height": 1080}
160
+
161
+
162
class HtmlCleaner:
    """
    Cleans HTML before converting to Markdown:
    - removes every tag listed in ALWAYS_STRIP_TAGS together with its content
    - removes <img> entirely, or only data:/blob: images when images are kept
    - replaces <a> with its text, or blanks data:/blob: hrefs when links are kept
    - deletes any src/href/srcset/poster/background attribute whose value
      starts with data: or blob: (base64 junk)
    """

    ALWAYS_STRIP_TAGS = [
        "script",
        "style",
        "noscript",
        "svg",
        "canvas",
        "video",
        "audio",
        "iframe",
        "object",
        "embed",
        "head",
    ]

    def __init__(self, with_image: bool = False, with_link: bool = True):
        self.with_image = with_image
        self.with_link = with_link

    @staticmethod
    def _is_inline_payload(value) -> bool:
        """True for string attribute values that start with data: or blob:."""
        return isinstance(value, str) and value.startswith(("data:", "blob:"))

    def clean(self, html: str) -> str:
        """Return ``html`` re-serialized after the removals described on the class."""
        soup = BeautifulSoup(html, "html.parser")

        # Tags that never contribute readable text.
        for tag_name in self.ALWAYS_STRIP_TAGS:
            for node in soup.find_all(tag_name):
                node.decompose()

        # Images: all of them, or only the inline-payload ones.
        for img in soup.find_all("img"):
            if not self.with_image or self._is_inline_payload(img.get("src", "")):
                img.decompose()

        # Links: flatten to text, or neutralize inline-payload targets.
        for anchor in soup.find_all("a"):
            if not self.with_link:
                anchor.replace_with(anchor.get_text())
            elif self._is_inline_payload(anchor.get("href", "")):
                anchor["href"] = ""

        # Final sweep over every remaining element.
        for node in soup.find_all(True):
            for attr in ("src", "href", "srcset", "poster", "background"):
                if self._is_inline_payload(node.get(attr, "")):
                    del node[attr]

        return str(soup)
221
+
222
+
223
class MarkdownCleaner:
    """Final cleanup of ready Markdown text."""

    # A whole line consisting of 40+ base64-alphabet characters.
    _BASE64_LINE = re.compile(r"^[A-Za-z0-9+/=]{40,}\s*$", re.MULTILINE)
    # C0/C1 control characters except tab, line feed and carriage return.
    _BINARY_GARBAGE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
    # Three or more consecutive newlines.
    _EXCESS_NEWLINES = re.compile(r"\n{3,}")
    # Markdown image syntax: ![alt](target).
    _MD_IMAGE = re.compile(r"!\[.*?\]\(.*?\)")

    def clean(self, text: str, strip_images: bool = False) -> str:
        """
        Remove control characters and base64-looking lines, optionally remove
        Markdown images, collapse runs of blank lines to a single one and
        trim surrounding whitespace.
        """
        removals = [self._BINARY_GARBAGE, self._BASE64_LINE]
        if strip_images:
            removals.append(self._MD_IMAGE)

        for pattern in removals:
            text = pattern.sub("", text)

        collapsed = self._EXCESS_NEWLINES.sub("\n\n", text)
        return collapsed.strip()
241
+
242
+
243
class PageParser:
    """
    Parses web pages with JS rendering and returns clean Markdown.

    Args:
        with_image (bool)             : Include images in output. Default False.
        with_link (bool)              : Include links (href). Default True.
                                        False — links are replaced with their text.
        browser_config (BrowserConfig): Browser connection settings.
                                        Can set default_proxy for all requests.

    Proxy is passed to parse() / async_parse() per-request:
        proxy = ProxyConfig(server="socks5://1.2.3.4:1080")
        result = parser.parse("https://example.com", proxy=proxy)

    The parser may spawn Lightpanda processes; call stop() or use the
    instance as a context manager to shut them down.
    """

    # Address of a Lightpanda server started locally by this class
    # (either from `lightpanda_bin` or via lightpanda.start() defaults).
    _LOCAL_CDP_URL = "ws://127.0.0.1:9222"

    def __init__(
        self,
        with_image: bool = False,
        with_link: bool = True,
        browser_config: Optional[BrowserConfig] = None,
    ):
        self.with_image = with_image
        self.with_link = with_link
        self.config = browser_config or BrowserConfig()

        self._html_cleaner = HtmlCleaner(with_image=with_image, with_link=with_link)
        self._md_cleaner = MarkdownCleaner()

        # Process started from config.lightpanda_bin.
        self._lightpanda_proc: Optional[subprocess.Popen] = None
        # Process (and its URL) started for cdp_url == "auto".
        self._auto_lightpanda_proc: Optional[subprocess.Popen] = None
        self._auto_cdp_url: Optional[str] = None

    def parse(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
        """
        Synchronous wrapper over async_parse.

        Uses asyncio.run(), so it must not be called from inside a running
        event loop — use async_parse() there.

        Args:
            url   : page to parse
            proxy : proxy for this specific request (overrides default_proxy)
        """
        return asyncio.run(self.async_parse(url, proxy=proxy))

    async def async_parse(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
        """
        Loads page, renders JS, returns clean Markdown.

        Args:
            url   : page to parse
            proxy : proxy for this specific request (overrides default_proxy)
        """
        html = await self._fetch_html(url, proxy=proxy)
        return self._html_to_markdown(html)

    async def async_parse_many(
        self,
        urls: list[str],
        proxy: Optional[ProxyConfig] = None,
    ) -> dict[str, str | BaseException]:
        """
        Parses multiple URLs in parallel.

        Args:
            urls  : list of pages
            proxy : one proxy for all requests (or None)

        Returns:
            Mapping of URL to its Markdown, or to the exception raised while
            parsing that URL (failures do not abort the other URLs).
        """
        tasks = [self.async_parse(url, proxy=proxy) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return dict(zip(urls, results))

    async def _fetch_html(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
        """
        Opens browser, loads page, returns HTML.

        When cdp_url is "auto" and rendering through Lightpanda fails, the
        auto-started Lightpanda is stopped and the page is rendered once more
        with Playwright's own Chromium. Any other failure propagates.
        """
        self._maybe_start_lightpanda()

        effective_proxy = proxy or self.config.default_proxy
        proxy_settings = effective_proxy.to_playwright() if effective_proxy else None

        cdp_url = self._resolve_cdp_url()

        headers: dict[str, str] = {}
        # Over CDP the User-Agent is sent as a plain header; a launched
        # Chromium additionally receives it via new_context(user_agent=...).
        if cdp_url is not None and self.config.user_agent:
            headers["User-Agent"] = self.config.user_agent
        if self.config.extra_headers:
            headers.update(self.config.extra_headers)

        async with async_playwright() as pw:
            if cdp_url is not None:
                try:
                    return await self._render_page(pw, url, proxy_settings, headers, cdp_url)
                except Exception as e:
                    if self.config.cdp_url != "auto":
                        raise
                    print(f"[pg2md] Lightpanda failed: {e}")
                    print("[pg2md] Falling back to Chromium...")
                    self._stop_auto_lightpanda()

            return await self._render_page(pw, url, proxy_settings, headers, None)

    async def _render_page(self, pw, url: str, proxy_settings, headers, cdp_url: Optional[str]) -> str:
        """
        Render ``url`` in one browser session and return the page HTML.

        Connects over CDP when ``cdp_url`` is given, otherwise launches a
        headless Chromium. Page, context and browser are closed on every
        path, including failures while creating the context or the page.
        """
        context_kwargs = {
            "proxy": proxy_settings,
            "extra_http_headers": headers if headers else None,
            "viewport": self.config.viewport,
        }
        if cdp_url is not None:
            browser = await pw.chromium.connect_over_cdp(cdp_url)
        else:
            browser = await pw.chromium.launch(headless=True)
            context_kwargs["user_agent"] = self.config.user_agent

        try:
            context = await browser.new_context(**context_kwargs)
            try:
                page = await context.new_page()
                try:
                    await page.goto(
                        url,
                        timeout=self.config.navigation_timeout,
                        wait_until=self.config.wait_until,
                    )
                    # Fixed grace period for scripts that run after the load event.
                    await page.wait_for_timeout(1500)
                    return await page.content()
                finally:
                    await page.close()
            finally:
                await context.close()
        finally:
            await browser.close()

    def _resolve_cdp_url(self) -> Optional[str]:
        """
        Resolve CDP URL based on config.

        Returns None when the built-in Chromium should be used: cdp_url is
        None, Lightpanda is not supported on this platform, or it could not
        be started.
        """
        if self.config.cdp_url != "auto":
            return self.config.cdp_url

        if self._auto_cdp_url:
            return self._auto_cdp_url

        # A server started from `lightpanda_bin` already occupies the local
        # port; reuse it instead of downloading and starting a second one.
        if self._lightpanda_proc and self._lightpanda_proc.poll() is None:
            return self._LOCAL_CDP_URL

        if not lightpanda.is_supported():
            return None

        try:
            self._auto_lightpanda_proc = lightpanda.start()
        except Exception as e:
            # Same policy as a runtime Lightpanda failure: use Chromium.
            print(f"[pg2md] Lightpanda failed: {e}")
            print("[pg2md] Falling back to Chromium...")
            return None

        self._auto_cdp_url = self._LOCAL_CDP_URL
        return self._auto_cdp_url

    def _html_to_markdown(self, html: str) -> str:
        """Cleans HTML and converts to Markdown."""

        clean_html = self._html_cleaner.clean(html)

        options = ConversionOptions(
            heading_style="atx",
            strong_em_symbol="*",
            bullets="*",
            escape_asterisks=False,
        )
        preprocessing = PreprocessingOptions(
            enabled=True,
            preset="aggressive",
            remove_navigation=True,
            remove_forms=True,
        )
        markdown = convert(clean_html, options, preprocessing)

        markdown = self._md_cleaner.clean(markdown, strip_images=not self.with_image)

        return markdown

    def _maybe_start_lightpanda(self) -> None:
        """If binary path is set and process not running — start it."""
        if not self.config.lightpanda_bin:
            return
        if self._lightpanda_proc and self._lightpanda_proc.poll() is None:
            return

        self._lightpanda_proc = subprocess.Popen(
            [
                self.config.lightpanda_bin,
                "serve",
                "--host",
                "127.0.0.1",
                "--port",
                "9222",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # Fixed pause to give the server time to start listening.
        time.sleep(1.5)

    def stop_lightpanda(self) -> None:
        """Explicitly stops Lightpanda if started by us."""
        if self._lightpanda_proc:
            # lightpanda.stop() also waits for the process to exit, so no
            # zombie is left behind (same path as _stop_auto_lightpanda).
            lightpanda.stop(self._lightpanda_proc)
            self._lightpanda_proc = None

    def _stop_auto_lightpanda(self) -> None:
        """Stop auto-launched Lightpanda process."""
        if self._auto_lightpanda_proc:
            lightpanda.stop(self._auto_lightpanda_proc)
            self._auto_lightpanda_proc = None
            self._auto_cdp_url = None

    def stop(self) -> None:
        """Stop all browser processes started by this parser."""
        self.stop_lightpanda()
        self._stop_auto_lightpanda()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.stop()
457
+
458
+
459
if __name__ == "__main__":
    import sys

    # First command-line argument is the page to convert; example.com otherwise.
    cli_args = sys.argv[1:]
    target_url = cli_args[0] if cli_args else "https://example.com"

    # The context manager stops any Lightpanda process the parser started.
    with PageParser(with_image=False, with_link=False) as page_parser:
        markdown = page_parser.parse(target_url)

    print(markdown)
@@ -0,0 +1 @@
1
+ manylinux1_x86_64.whl driver package
@@ -0,0 +1,3 @@
1
+ # playwright-core
2
+
3
+ This package contains the no-browser flavor of [Playwright](http://github.com/microsoft/playwright).