pg2md 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pg2md-1.2.0/.gitignore +82 -0
- pg2md-1.2.0/LICENSE +21 -0
- pg2md-1.2.0/PKG-INFO +273 -0
- pg2md-1.2.0/README.md +240 -0
- pg2md-1.2.0/all_code.txt +0 -0
- pg2md-1.2.0/pyproject.toml +64 -0
- pg2md-1.2.0/src/pg2md/__init__.py +49 -0
- pg2md-1.2.0/src/pg2md/lightpanda.py +163 -0
- pg2md-1.2.0/src/pg2md/parser.py +469 -0
- pg2md-1.2.0/venv/lib/python3.12/site-packages/playwright/driver/README.md +1 -0
- pg2md-1.2.0/venv/lib/python3.12/site-packages/playwright/driver/package/README.md +3 -0
pg2md-1.2.0/.gitignore
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
*.egg-info/
|
|
24
|
+
.installed.cfg
|
|
25
|
+
*.egg
|
|
26
|
+
MANIFEST
|
|
27
|
+
|
|
28
|
+
# PyInstaller
|
|
29
|
+
*.manifest
|
|
30
|
+
*.spec
|
|
31
|
+
|
|
32
|
+
# Installer logs
|
|
33
|
+
pip-log.txt
|
|
34
|
+
pip-delete-this-directory.txt
|
|
35
|
+
|
|
36
|
+
# Unit test / coverage reports
|
|
37
|
+
htmlcov/
|
|
38
|
+
.tox/
|
|
39
|
+
.nox/
|
|
40
|
+
.coverage
|
|
41
|
+
.coverage.*
|
|
42
|
+
.cache
|
|
43
|
+
nosetests.xml
|
|
44
|
+
coverage.xml
|
|
45
|
+
*.cover
|
|
46
|
+
.hypothesis/
|
|
47
|
+
.pytest_cache/
|
|
48
|
+
|
|
49
|
+
# Translations
|
|
50
|
+
*.mo
|
|
51
|
+
*.pot
|
|
52
|
+
|
|
53
|
+
# Environments
|
|
54
|
+
.env
|
|
55
|
+
.venv
|
|
56
|
+
env/
|
|
57
|
+
venv/
|
|
58
|
+
ENV/
|
|
59
|
+
env.bak/
|
|
60
|
+
venv.bak/
|
|
61
|
+
|
|
62
|
+
# IDEs
|
|
63
|
+
.idea/
|
|
64
|
+
.vscode/
|
|
65
|
+
*.swp
|
|
66
|
+
*.swo
|
|
67
|
+
*~
|
|
68
|
+
|
|
69
|
+
# Linters / formatters
|
|
70
|
+
.ruff_cache/
|
|
71
|
+
.mypy_cache/
|
|
72
|
+
.dmypy.json
|
|
73
|
+
dmypy.json
|
|
74
|
+
|
|
75
|
+
# Jupyter
|
|
76
|
+
.ipynb_checkpoints
|
|
77
|
+
|
|
78
|
+
# Project specific
|
|
79
|
+
*.md
|
|
80
|
+
!README.md
|
|
81
|
+
data.json
|
|
82
|
+
proxies.txt
|
pg2md-1.2.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 lemantorus
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pg2md-1.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pg2md
|
|
3
|
+
Version: 1.2.0
|
|
4
|
+
Summary: Page to Markdown converter with JS rendering support via Playwright
|
|
5
|
+
Project-URL: Homepage, https://github.com/lemantorus/pg2md
|
|
6
|
+
Project-URL: Documentation, https://github.com/lemantorus/pg2md#readme
|
|
7
|
+
Project-URL: Repository, https://github.com/lemantorus/pg2md
|
|
8
|
+
Project-URL: Issues, https://github.com/lemantorus/pg2md/issues
|
|
9
|
+
Author: lemantorus
|
|
10
|
+
License-Expression: MIT
|
|
11
|
+
License-File: LICENSE
|
|
12
|
+
Keywords: converter,html,js-rendering,markdown,parser,playwright,scraper,web-scraping
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
22
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Requires-Python: >=3.10
|
|
25
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
26
|
+
Requires-Dist: html-to-markdown>=2.0.0
|
|
27
|
+
Requires-Dist: playwright>=1.40.0
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: mypy>=1.0.0; extra == 'dev'
|
|
30
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
31
|
+
Requires-Dist: ruff>=0.1.0; extra == 'dev'
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# pg2md
|
|
35
|
+
|
|
36
|
+
[PyPI version](https://badge.fury.io/py/pg2md)
|
|
37
|
+
[PyPI project](https://pypi.org/project/pg2md/)
|
|
38
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
39
|
+
[Downloads](https://pepy.tech/project/pg2md)
|
|
40
|
+
[GitHub stars](https://github.com/lemantorus/pg2md/stargazers)
|
|
41
|
+
[GitHub issues](https://github.com/lemantorus/pg2md/issues)
|
|
42
|
+
[GitHub forks](https://github.com/lemantorus/pg2md/network/members)
|
|
43
|
+
|
|
44
|
+
**P**a**g**e to **M**ark**d**own — fast HTML-to-Markdown converter with JavaScript rendering support.
|
|
45
|
+
|
|
46
|
+
Converts any web page to clean Markdown using Playwright for JS rendering and Rust-based `html-to-markdown` for conversion.
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
- **Auto Lightpanda** — automatically downloads and starts Lightpanda browser
|
|
51
|
+
- **JavaScript Rendering** — handles SPA, React, Vue, dynamic content
|
|
52
|
+
- **Fast Conversion** — Rust-based `html-to-markdown` core
|
|
53
|
+
- **Clean Output** — strips scripts, styles, navigation, forms
|
|
54
|
+
- **Proxy Support** — HTTP/HTTPS/SOCKS5 with auth
|
|
55
|
+
- **Custom User-Agents** — includes Googlebot, Bingbot, etc.
|
|
56
|
+
- **Async & Sync API** — `parse()` and `async_parse()`
|
|
57
|
+
- **Batch Processing** — `async_parse_many()` for parallel requests
|
|
58
|
+
- **Configurable** — images, links, headers, timeouts
|
|
59
|
+
|
|
60
|
+
## Quick Start
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from pg2md import PageParser
|
|
64
|
+
|
|
65
|
+
# Auto-downloads Lightpanda, starts it, and parses page
|
|
66
|
+
# If Lightpanda fails, falls back to Chromium automatically
|
|
67
|
+
parser = PageParser()
|
|
68
|
+
markdown = parser.parse("https://example.com")
|
|
69
|
+
print(markdown)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
First run will download Lightpanda (~50MB) to `~/.cache/pg2md/`.
|
|
73
|
+
|
|
74
|
+
**Note:** Lightpanda is in beta and may not support all Playwright features. If it fails, pg2md automatically falls back to Chromium.
|
|
75
|
+
|
|
76
|
+
## Installation
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install pg2md
|
|
80
|
+
playwright install chromium # fallback browser if Lightpanda unsupported
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Usage Examples
|
|
84
|
+
|
|
85
|
+
### Basic Usage
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from pg2md import PageParser
|
|
89
|
+
|
|
90
|
+
parser = PageParser()
|
|
91
|
+
result = parser.parse("https://example.com")
|
|
92
|
+
print(result)
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
### Without Images and Links
|
|
96
|
+
|
|
97
|
+
```python
|
|
98
|
+
from pg2md import PageParser
|
|
99
|
+
|
|
100
|
+
parser = PageParser(with_image=False, with_link=False)
|
|
101
|
+
result = parser.parse("https://example.com")
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
### With Proxy
|
|
105
|
+
|
|
106
|
+
```python
|
|
107
|
+
from pg2md import PageParser, ProxyConfig
|
|
108
|
+
|
|
109
|
+
proxy = ProxyConfig(
|
|
110
|
+
server="http://proxy.example.com:8080",
|
|
111
|
+
username="user",
|
|
112
|
+
password="pass"
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
parser = PageParser()
|
|
116
|
+
result = parser.parse("https://example.com", proxy=proxy)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### SOCKS5 Proxy
|
|
120
|
+
|
|
121
|
+
```python
|
|
122
|
+
from pg2md import PageParser, ProxyConfig
|
|
123
|
+
|
|
124
|
+
proxy = ProxyConfig(server="socks5://127.0.0.1:1080")
|
|
125
|
+
parser = PageParser()
|
|
126
|
+
result = parser.parse("https://example.com", proxy=proxy)
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
### Custom User-Agent
|
|
130
|
+
|
|
131
|
+
```python
|
|
132
|
+
from pg2md import PageParser, BrowserConfig, UserAgents
|
|
133
|
+
|
|
134
|
+
config = BrowserConfig(
|
|
135
|
+
cdp_url=None,
|
|
136
|
+
user_agent=UserAgents.GOOGLEBOT_DESKTOP,
|
|
137
|
+
extra_headers={"Accept-Language": "en-US,en;q=0.9"}
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
parser = PageParser(browser_config=config)
|
|
141
|
+
result = parser.parse("https://example.com")
|
|
142
|
+
```
|
|
143
|
+
|
|
144
|
+
### Async API
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
import asyncio
|
|
148
|
+
from pg2md import PageParser
|
|
149
|
+
|
|
150
|
+
async def main():
|
|
151
|
+
parser = PageParser()
|
|
152
|
+
result = await parser.async_parse("https://example.com")
|
|
153
|
+
print(result)
|
|
154
|
+
|
|
155
|
+
asyncio.run(main())
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
### Batch Processing
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
import asyncio
|
|
162
|
+
from pg2md import PageParser
|
|
163
|
+
|
|
164
|
+
async def main():
|
|
165
|
+
parser = PageParser()
|
|
166
|
+
urls = [
|
|
167
|
+
"https://example.com",
|
|
168
|
+
"https://example.org",
|
|
169
|
+
"https://example.net",
|
|
170
|
+
]
|
|
171
|
+
results = await parser.async_parse_many(urls)
|
|
172
|
+
|
|
173
|
+
for url, result in results.items():
|
|
174
|
+
if isinstance(result, Exception):
|
|
175
|
+
print(f"Error {url}: {result}")
|
|
176
|
+
else:
|
|
177
|
+
print(f"{url}: {len(result)} chars")
|
|
178
|
+
|
|
179
|
+
asyncio.run(main())
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
### Using Lightpanda
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
from pg2md import PageParser, BrowserConfig
|
|
186
|
+
|
|
187
|
+
# Start Lightpanda manually:
|
|
188
|
+
# ./lightpanda serve --host 127.0.0.1 --port 9222
|
|
189
|
+
|
|
190
|
+
config = BrowserConfig(cdp_url="ws://127.0.0.1:9222")
|
|
191
|
+
parser = PageParser(browser_config=config)
|
|
192
|
+
result = parser.parse("https://example.com")
|
|
193
|
+
```
|
|
194
|
+
|
|
195
|
+
## Configuration
|
|
196
|
+
|
|
197
|
+
### BrowserConfig
|
|
198
|
+
|
|
199
|
+
| Parameter | Type | Default | Description |
|
|
200
|
+
|-----------|------|---------|-------------|
|
|
201
|
+
| `cdp_url` | `str \| None` | `"auto"` | `"auto"` = auto Lightpanda, `None` = Chromium, `"ws://..."` = custom CDP |
|
|
202
|
+
| `lightpanda_bin` | `str \| None` | `None` | Path to Lightpanda binary (for manual start) |
|
|
203
|
+
| `navigation_timeout` | `int` | `60000` | Navigation timeout (ms) |
|
|
204
|
+
| `wait_until` | `str` | `"domcontentloaded"` | Wait event (`"load"`, `"domcontentloaded"`, `"networkidle"`) |
|
|
205
|
+
| `default_proxy` | `ProxyConfig \| None` | `None` | Default proxy for all requests |
|
|
206
|
+
| `user_agent` | `str \| None` | Chrome Desktop | User-Agent string |
|
|
207
|
+
| `extra_headers` | `dict \| None` | `None` | Additional HTTP headers |
|
|
208
|
+
| `viewport` | `dict \| None` | `{"width": 1920, "height": 1080}` | Browser viewport size |
|
|
209
|
+
|
|
210
|
+
### ProxyConfig
|
|
211
|
+
|
|
212
|
+
| Parameter | Type | Default | Description |
|
|
213
|
+
|-----------|------|---------|-------------|
|
|
214
|
+
| `server` | `str` | required | Proxy URL |
|
|
215
|
+
| `username` | `str \| None` | `None` | Username |
|
|
216
|
+
| `password` | `str \| None` | `None` | Password |
|
|
217
|
+
| `bypass` | `str \| None` | `None` | Hosts to bypass |
|
|
218
|
+
|
|
219
|
+
### UserAgents
|
|
220
|
+
|
|
221
|
+
Available presets:
|
|
222
|
+
|
|
223
|
+
- `CHROME_DESKTOP`, `CHROME_MAC`, `CHROME_LINUX`
|
|
224
|
+
- `FIREFOX_DESKTOP`
|
|
225
|
+
- `SAFARI_MAC`
|
|
226
|
+
- `EDGE`
|
|
227
|
+
- `GOOGLEBOT_DESKTOP`, `GOOGLEBOT_MOBILE`, `GOOGLEBOT_VIDEO`
|
|
228
|
+
- `BINGBOT`, `BINGBOT_MOBILE`
|
|
229
|
+
- `YANDEXBOT`
|
|
230
|
+
- `DUCKBOT`
|
|
231
|
+
- `APPLEBOT`
|
|
232
|
+
|
|
233
|
+
## API Reference
|
|
234
|
+
|
|
235
|
+
### PageParser
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
PageParser(
|
|
239
|
+
with_image: bool = False,
|
|
240
|
+
with_link: bool = True,
|
|
241
|
+
browser_config: BrowserConfig | None = None
|
|
242
|
+
)
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
#### Methods
|
|
246
|
+
|
|
247
|
+
| Method | Description |
|
|
248
|
+
|--------|-------------|
|
|
249
|
+
| `parse(url, proxy=None)` | Sync parse, returns Markdown string |
|
|
250
|
+
| `async_parse(url, proxy=None)` | Async parse, returns Markdown string |
|
|
251
|
+
| `async_parse_many(urls, proxy=None)` | Batch async parse, returns dict |
|
|
252
|
+
| `stop_lightpanda()` | Stop Lightpanda if started |
|
|
253
|
+
|
|
254
|
+
## Development
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
git clone https://github.com/lemantorus/pg2md.git
|
|
258
|
+
cd pg2md
|
|
259
|
+
python -m venv venv
|
|
260
|
+
source venv/bin/activate
|
|
261
|
+
pip install -e ".[dev]"
|
|
262
|
+
playwright install chromium
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
## License
|
|
266
|
+
|
|
267
|
+
[MIT](LICENSE)
|
|
268
|
+
|
|
269
|
+
## Credits
|
|
270
|
+
|
|
271
|
+
- [Playwright](https://playwright.dev/python/) — browser automation
|
|
272
|
+
- [html-to-markdown](https://pypi.org/project/html-to-markdown/) — Rust-based HTML to Markdown
|
|
273
|
+
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) — HTML parsing
|
pg2md-1.2.0/README.md
ADDED
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
# pg2md
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://badge.fury.io/py/pg2md)
|
|
4
|
+
[PyPI project](https://pypi.org/project/pg2md/)
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
[Downloads](https://pepy.tech/project/pg2md)
|
|
7
|
+
[GitHub stars](https://github.com/lemantorus/pg2md/stargazers)
|
|
8
|
+
[GitHub issues](https://github.com/lemantorus/pg2md/issues)
|
|
9
|
+
[GitHub forks](https://github.com/lemantorus/pg2md/network/members)
|
|
10
|
+
|
|
11
|
+
**P**a**g**e to **M**ark**d**own — fast HTML-to-Markdown converter with JavaScript rendering support.
|
|
12
|
+
|
|
13
|
+
Converts any web page to clean Markdown using Playwright for JS rendering and Rust-based `html-to-markdown` for conversion.
|
|
14
|
+
|
|
15
|
+
## Features
|
|
16
|
+
|
|
17
|
+
- **Auto Lightpanda** — automatically downloads and starts Lightpanda browser
|
|
18
|
+
- **JavaScript Rendering** — handles SPA, React, Vue, dynamic content
|
|
19
|
+
- **Fast Conversion** — Rust-based `html-to-markdown` core
|
|
20
|
+
- **Clean Output** — strips scripts, styles, navigation, forms
|
|
21
|
+
- **Proxy Support** — HTTP/HTTPS/SOCKS5 with auth
|
|
22
|
+
- **Custom User-Agents** — includes Googlebot, Bingbot, etc.
|
|
23
|
+
- **Async & Sync API** — `parse()` and `async_parse()`
|
|
24
|
+
- **Batch Processing** — `async_parse_many()` for parallel requests
|
|
25
|
+
- **Configurable** — images, links, headers, timeouts
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from pg2md import PageParser
|
|
31
|
+
|
|
32
|
+
# Auto-downloads Lightpanda, starts it, and parses page
|
|
33
|
+
# If Lightpanda fails, falls back to Chromium automatically
|
|
34
|
+
parser = PageParser()
|
|
35
|
+
markdown = parser.parse("https://example.com")
|
|
36
|
+
print(markdown)
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
First run will download Lightpanda (~50MB) to `~/.cache/pg2md/`.
|
|
40
|
+
|
|
41
|
+
**Note:** Lightpanda is in beta and may not support all Playwright features. If it fails, pg2md automatically falls back to Chromium.
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
pip install pg2md
|
|
47
|
+
playwright install chromium # fallback browser if Lightpanda unsupported
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Usage Examples
|
|
51
|
+
|
|
52
|
+
### Basic Usage
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from pg2md import PageParser
|
|
56
|
+
|
|
57
|
+
parser = PageParser()
|
|
58
|
+
result = parser.parse("https://example.com")
|
|
59
|
+
print(result)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Without Images and Links
|
|
63
|
+
|
|
64
|
+
```python
|
|
65
|
+
from pg2md import PageParser
|
|
66
|
+
|
|
67
|
+
parser = PageParser(with_image=False, with_link=False)
|
|
68
|
+
result = parser.parse("https://example.com")
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
### With Proxy
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
from pg2md import PageParser, ProxyConfig
|
|
75
|
+
|
|
76
|
+
proxy = ProxyConfig(
|
|
77
|
+
server="http://proxy.example.com:8080",
|
|
78
|
+
username="user",
|
|
79
|
+
password="pass"
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
parser = PageParser()
|
|
83
|
+
result = parser.parse("https://example.com", proxy=proxy)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### SOCKS5 Proxy
|
|
87
|
+
|
|
88
|
+
```python
|
|
89
|
+
from pg2md import PageParser, ProxyConfig
|
|
90
|
+
|
|
91
|
+
proxy = ProxyConfig(server="socks5://127.0.0.1:1080")
|
|
92
|
+
parser = PageParser()
|
|
93
|
+
result = parser.parse("https://example.com", proxy=proxy)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Custom User-Agent
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from pg2md import PageParser, BrowserConfig, UserAgents
|
|
100
|
+
|
|
101
|
+
config = BrowserConfig(
|
|
102
|
+
cdp_url=None,
|
|
103
|
+
user_agent=UserAgents.GOOGLEBOT_DESKTOP,
|
|
104
|
+
extra_headers={"Accept-Language": "en-US,en;q=0.9"}
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
parser = PageParser(browser_config=config)
|
|
108
|
+
result = parser.parse("https://example.com")
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Async API
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
import asyncio
|
|
115
|
+
from pg2md import PageParser
|
|
116
|
+
|
|
117
|
+
async def main():
|
|
118
|
+
parser = PageParser()
|
|
119
|
+
result = await parser.async_parse("https://example.com")
|
|
120
|
+
print(result)
|
|
121
|
+
|
|
122
|
+
asyncio.run(main())
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### Batch Processing
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
import asyncio
|
|
129
|
+
from pg2md import PageParser
|
|
130
|
+
|
|
131
|
+
async def main():
|
|
132
|
+
parser = PageParser()
|
|
133
|
+
urls = [
|
|
134
|
+
"https://example.com",
|
|
135
|
+
"https://example.org",
|
|
136
|
+
"https://example.net",
|
|
137
|
+
]
|
|
138
|
+
results = await parser.async_parse_many(urls)
|
|
139
|
+
|
|
140
|
+
for url, result in results.items():
|
|
141
|
+
if isinstance(result, Exception):
|
|
142
|
+
print(f"Error {url}: {result}")
|
|
143
|
+
else:
|
|
144
|
+
print(f"{url}: {len(result)} chars")
|
|
145
|
+
|
|
146
|
+
asyncio.run(main())
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
### Using Lightpanda
|
|
150
|
+
|
|
151
|
+
```python
|
|
152
|
+
from pg2md import PageParser, BrowserConfig
|
|
153
|
+
|
|
154
|
+
# Start Lightpanda manually:
|
|
155
|
+
# ./lightpanda serve --host 127.0.0.1 --port 9222
|
|
156
|
+
|
|
157
|
+
config = BrowserConfig(cdp_url="ws://127.0.0.1:9222")
|
|
158
|
+
parser = PageParser(browser_config=config)
|
|
159
|
+
result = parser.parse("https://example.com")
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Configuration
|
|
163
|
+
|
|
164
|
+
### BrowserConfig
|
|
165
|
+
|
|
166
|
+
| Parameter | Type | Default | Description |
|
|
167
|
+
|-----------|------|---------|-------------|
|
|
168
|
+
| `cdp_url` | `str \| None` | `"auto"` | `"auto"` = auto Lightpanda, `None` = Chromium, `"ws://..."` = custom CDP |
|
|
169
|
+
| `lightpanda_bin` | `str \| None` | `None` | Path to Lightpanda binary (for manual start) |
|
|
170
|
+
| `navigation_timeout` | `int` | `60000` | Navigation timeout (ms) |
|
|
171
|
+
| `wait_until` | `str` | `"domcontentloaded"` | Wait event (`"load"`, `"domcontentloaded"`, `"networkidle"`) |
|
|
172
|
+
| `default_proxy` | `ProxyConfig \| None` | `None` | Default proxy for all requests |
|
|
173
|
+
| `user_agent` | `str \| None` | Chrome Desktop | User-Agent string |
|
|
174
|
+
| `extra_headers` | `dict \| None` | `None` | Additional HTTP headers |
|
|
175
|
+
| `viewport` | `dict \| None` | `{"width": 1920, "height": 1080}` | Browser viewport size |
|
|
176
|
+
|
|
177
|
+
### ProxyConfig
|
|
178
|
+
|
|
179
|
+
| Parameter | Type | Default | Description |
|
|
180
|
+
|-----------|------|---------|-------------|
|
|
181
|
+
| `server` | `str` | required | Proxy URL |
|
|
182
|
+
| `username` | `str \| None` | `None` | Username |
|
|
183
|
+
| `password` | `str \| None` | `None` | Password |
|
|
184
|
+
| `bypass` | `str \| None` | `None` | Hosts to bypass |
|
|
185
|
+
|
|
186
|
+
### UserAgents
|
|
187
|
+
|
|
188
|
+
Available presets:
|
|
189
|
+
|
|
190
|
+
- `CHROME_DESKTOP`, `CHROME_MAC`, `CHROME_LINUX`
|
|
191
|
+
- `FIREFOX_DESKTOP`
|
|
192
|
+
- `SAFARI_MAC`
|
|
193
|
+
- `EDGE`
|
|
194
|
+
- `GOOGLEBOT_DESKTOP`, `GOOGLEBOT_MOBILE`, `GOOGLEBOT_VIDEO`
|
|
195
|
+
- `BINGBOT`, `BINGBOT_MOBILE`
|
|
196
|
+
- `YANDEXBOT`
|
|
197
|
+
- `DUCKBOT`
|
|
198
|
+
- `APPLEBOT`
|
|
199
|
+
|
|
200
|
+
## API Reference
|
|
201
|
+
|
|
202
|
+
### PageParser
|
|
203
|
+
|
|
204
|
+
```python
|
|
205
|
+
PageParser(
|
|
206
|
+
with_image: bool = False,
|
|
207
|
+
with_link: bool = True,
|
|
208
|
+
browser_config: BrowserConfig | None = None
|
|
209
|
+
)
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
#### Methods
|
|
213
|
+
|
|
214
|
+
| Method | Description |
|
|
215
|
+
|--------|-------------|
|
|
216
|
+
| `parse(url, proxy=None)` | Sync parse, returns Markdown string |
|
|
217
|
+
| `async_parse(url, proxy=None)` | Async parse, returns Markdown string |
|
|
218
|
+
| `async_parse_many(urls, proxy=None)` | Batch async parse, returns dict |
|
|
219
|
+
| `stop_lightpanda()` | Stop Lightpanda if started |
|
|
220
|
+
|
|
221
|
+
## Development
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
git clone https://github.com/lemantorus/pg2md.git
|
|
225
|
+
cd pg2md
|
|
226
|
+
python -m venv venv
|
|
227
|
+
source venv/bin/activate
|
|
228
|
+
pip install -e ".[dev]"
|
|
229
|
+
playwright install chromium
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
## License
|
|
233
|
+
|
|
234
|
+
[MIT](LICENSE)
|
|
235
|
+
|
|
236
|
+
## Credits
|
|
237
|
+
|
|
238
|
+
- [Playwright](https://playwright.dev/python/) — browser automation
|
|
239
|
+
- [html-to-markdown](https://pypi.org/project/html-to-markdown/) — Rust-based HTML to Markdown
|
|
240
|
+
- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) — HTML parsing
|
pg2md-1.2.0/all_code.txt
ADDED
|
Binary file
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pg2md"
|
|
7
|
+
version = "1.2.0"
|
|
8
|
+
description = "Page to Markdown converter with JS rendering support via Playwright"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{ name = "lemantorus" }]
|
|
13
|
+
keywords = [
|
|
14
|
+
"markdown",
|
|
15
|
+
"html",
|
|
16
|
+
"converter",
|
|
17
|
+
"playwright",
|
|
18
|
+
"scraper",
|
|
19
|
+
"parser",
|
|
20
|
+
"web-scraping",
|
|
21
|
+
"js-rendering",
|
|
22
|
+
]
|
|
23
|
+
classifiers = [
|
|
24
|
+
"Development Status :: 4 - Beta",
|
|
25
|
+
"Intended Audience :: Developers",
|
|
26
|
+
"License :: OSI Approved :: MIT License",
|
|
27
|
+
"Operating System :: OS Independent",
|
|
28
|
+
"Programming Language :: Python :: 3",
|
|
29
|
+
"Programming Language :: Python :: 3.10",
|
|
30
|
+
"Programming Language :: Python :: 3.11",
|
|
31
|
+
"Programming Language :: Python :: 3.12",
|
|
32
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
33
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
34
|
+
"Topic :: Utilities",
|
|
35
|
+
]
|
|
36
|
+
dependencies = [
|
|
37
|
+
"playwright>=1.40.0",
|
|
38
|
+
"html-to-markdown>=2.0.0",
|
|
39
|
+
"beautifulsoup4>=4.12.0",
|
|
40
|
+
]
|
|
41
|
+
|
|
42
|
+
[project.optional-dependencies]
|
|
43
|
+
dev = ["pytest>=7.0.0", "ruff>=0.1.0", "mypy>=1.0.0"]
|
|
44
|
+
|
|
45
|
+
[project.urls]
|
|
46
|
+
Homepage = "https://github.com/lemantorus/pg2md"
|
|
47
|
+
Documentation = "https://github.com/lemantorus/pg2md#readme"
|
|
48
|
+
Repository = "https://github.com/lemantorus/pg2md"
|
|
49
|
+
Issues = "https://github.com/lemantorus/pg2md/issues"
|
|
50
|
+
|
|
51
|
+
[tool.hatch.build.targets.wheel]
|
|
52
|
+
packages = ["src/pg2md"]
|
|
53
|
+
|
|
54
|
+
[tool.ruff]
|
|
55
|
+
line-length = 100
|
|
56
|
+
target-version = "py310"
|
|
57
|
+
|
|
58
|
+
[tool.ruff.lint]
|
|
59
|
+
select = ["E", "F", "I", "UP", "B"]
|
|
60
|
+
|
|
61
|
+
[tool.mypy]
|
|
62
|
+
python_version = "3.10"
|
|
63
|
+
warn_return_any = true
|
|
64
|
+
warn_unused_configs = true
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
pg2md — Page to Markdown converter with JS rendering support.
|
|
3
|
+
|
|
4
|
+
A fast, clean HTML-to-Markdown converter that uses Playwright for
|
|
5
|
+
JavaScript rendering and html-to-markdown (Rust-based) for conversion.
|
|
6
|
+
|
|
7
|
+
Features:
|
|
8
|
+
- Auto-download and start Lightpanda browser (default)
|
|
9
|
+
- JavaScript rendering via Playwright/Lightpanda
|
|
10
|
+
- Fast Rust-based HTML-to-Markdown conversion
|
|
11
|
+
- Proxy support (HTTP/HTTPS/SOCKS5)
|
|
12
|
+
- Custom User-Agents (Googlebot, Bingbot, etc.)
|
|
13
|
+
|
|
14
|
+
Example:
|
|
15
|
+
from pg2md import PageParser
|
|
16
|
+
|
|
17
|
+
# Auto-downloads Lightpanda and parses page
|
|
18
|
+
parser = PageParser()
|
|
19
|
+
markdown = parser.parse("https://example.com")
|
|
20
|
+
print(markdown)
|
|
21
|
+
|
|
22
|
+
# With proxy
|
|
23
|
+
from pg2md import ProxyConfig
|
|
24
|
+
proxy = ProxyConfig(server="socks5://1.2.3.4:1080")
|
|
25
|
+
markdown = parser.parse("https://example.com", proxy=proxy)
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from pg2md.parser import (
|
|
29
|
+
PageParser,
|
|
30
|
+
BrowserConfig,
|
|
31
|
+
ProxyConfig,
|
|
32
|
+
UserAgents,
|
|
33
|
+
HtmlCleaner,
|
|
34
|
+
MarkdownCleaner,
|
|
35
|
+
)
|
|
36
|
+
from pg2md import lightpanda
|
|
37
|
+
|
|
38
|
+
__version__ = "1.2.0"
|
|
39
|
+
__author__ = "lemantorus"
|
|
40
|
+
__all__ = [
|
|
41
|
+
"PageParser",
|
|
42
|
+
"BrowserConfig",
|
|
43
|
+
"ProxyConfig",
|
|
44
|
+
"UserAgents",
|
|
45
|
+
"HtmlCleaner",
|
|
46
|
+
"MarkdownCleaner",
|
|
47
|
+
"lightpanda",
|
|
48
|
+
"__version__",
|
|
49
|
+
]
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Lightpanda browser auto-downloader and launcher.
|
|
3
|
+
|
|
4
|
+
Automatically downloads Lightpanda binary from GitHub releases
|
|
5
|
+
and manages the CDP server process.
|
|
6
|
+
|
|
7
|
+
Supported platforms:
|
|
8
|
+
- Linux x86_64
|
|
9
|
+
- macOS arm64 (Apple Silicon)
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import platform
|
|
13
|
+
import subprocess
|
|
14
|
+
import time
|
|
15
|
+
import urllib.request
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
# Release tag of the Lightpanda build to fetch; download() also prints it.
LIGHTPANDA_VERSION = "nightly"

# Common URL prefix of every release asset for the selected tag, so that the
# tag is spelled in exactly one place.
_RELEASE_BASE_URL = (
    f"https://github.com/lightpanda-io/browser/releases/download/{LIGHTPANDA_VERSION}"
)

# Maps a (platform.system(), platform.machine()) pair to the release asset
# built for that platform. Platforms missing from this table are unsupported.
DOWNLOAD_URLS = {
    ("Linux", "x86_64"): f"{_RELEASE_BASE_URL}/lightpanda-x86_64-linux",
    ("Darwin", "arm64"): f"{_RELEASE_BASE_URL}/lightpanda-aarch64-macos",
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def get_cache_dir() -> Path:
    """Return the per-user directory in which the Lightpanda binary is cached.

    The location is ``<home>/.cache/pg2md``; the directory is not created here.
    """
    return Path.home().joinpath(".cache", "pg2md")
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def get_binary_path() -> Path:
    """Return the expected location of the Lightpanda executable.

    The file lives directly inside the cache directory and is named
    ``lightpanda``; this function does not check that it exists.
    """
    cache_dir = get_cache_dir()
    return cache_dir.joinpath("lightpanda")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def get_platform_key() -> tuple[str, str]:
    """Return the ``(system, machine)`` pair identifying the current platform.

    Both values come straight from the ``platform`` module and are used as the
    lookup key into ``DOWNLOAD_URLS``.
    """
    system_name = platform.system()
    machine_arch = platform.machine()
    return system_name, machine_arch
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def is_supported() -> bool:
    """Tell whether a Lightpanda build is listed for the current platform."""
    current_platform = get_platform_key()
    return current_platform in DOWNLOAD_URLS
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def is_downloaded() -> bool:
    """Tell whether something already exists at the cached binary path."""
    cached_binary = get_binary_path()
    return cached_binary.exists()
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def download(force: bool = False) -> Path:
    """
    Download Lightpanda binary if not already present.

    The file is first written to a temporary sibling path and only renamed to
    its final name once the transfer has finished and the executable bit is
    set. An interrupted download therefore never leaves a truncated file that
    later calls would mistake for a valid cached binary.

    Args:
        force: Re-download even if binary exists

    Returns:
        Path to the binary

    Raises:
        RuntimeError: If platform is not supported
    """
    binary_path = get_binary_path()

    if binary_path.exists() and not force:
        return binary_path

    key = get_platform_key()
    if key not in DOWNLOAD_URLS:
        raise RuntimeError(
            f"Lightpanda not available for {key[0]} {key[1]}. "
            f"Supported: {list(DOWNLOAD_URLS.keys())}"
        )

    url = DOWNLOAD_URLS[key]
    binary_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"[pg2md] Downloading Lightpanda {LIGHTPANDA_VERSION}...")
    # Same directory as the target so the final rename stays on one filesystem.
    partial_path = binary_path.with_name(binary_path.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial_path)
        partial_path.chmod(0o755)
        partial_path.replace(binary_path)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a half-written file behind.
        partial_path.unlink(missing_ok=True)
        raise
    print(f"[pg2md] Saved to {binary_path}")

    return binary_path
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def start(
    host: str = "127.0.0.1",
    port: int = 9222,
    download_if_missing: bool = True,
) -> subprocess.Popen:
    """
    Start Lightpanda CDP server.

    The binary is launched as ``lightpanda serve --host <host> --port <port>``
    with its output discarded, then this function blocks until the port accepts
    TCP connections. If the server does not become reachable, the spawned
    process is stopped before the error is re-raised so that it is not leaked.

    Args:
        host: Host to bind to
        port: Port to bind to
        download_if_missing: Download binary if not present

    Returns:
        subprocess.Popen object

    Raises:
        FileNotFoundError: If the binary is missing and download_if_missing
            is False
        TimeoutError: If the CDP server is not reachable in time
    """
    if download_if_missing:
        binary_path = download()
    else:
        binary_path = get_binary_path()
        if not binary_path.exists():
            raise FileNotFoundError(
                f"Lightpanda binary not found at {binary_path}. "
                "Set download_if_missing=True to auto-download."
            )

    proc = subprocess.Popen(
        [str(binary_path), "serve", "--host", host, "--port", str(port)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    cdp_url = f"ws://{host}:{port}"
    try:
        _wait_for_cdp(cdp_url, timeout=10)
    except BaseException:
        # The caller never receives `proc` on failure, so nobody else could
        # terminate it; clean up here before propagating the error.
        stop(proc)
        raise

    return proc
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _wait_for_cdp(cdp_url: str, timeout: float = 10.0) -> bool:
    """Wait for CDP server to be ready.

    Polls the TCP port of the endpoint until a connection succeeds.

    Args:
        cdp_url: WebSocket URL of the server, e.g. ``ws://127.0.0.1:9222``.
            Only host and port are used; a trailing path is ignored.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        True once the port accepts a TCP connection.

    Raises:
        ValueError: cdp_url does not contain a usable host and port.
        TimeoutError: the port was not reachable within ``timeout`` seconds.
    """
    import socket
    from urllib.parse import urlsplit

    parts = urlsplit(cdp_url)
    host, port = parts.hostname, parts.port
    if host is None or port is None:
        raise ValueError(f"CDP URL must include host and port: {cdp_url!r}")

    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            # The context manager closes the probe socket on every path.
            with socket.create_connection((host, port), timeout=1):
                pass
        except OSError:
            # Refused / unreachable / timed out: server not up yet, retry.
            time.sleep(0.2)
            continue
        # Short grace period after the port opens, presumably so the server
        # can finish initialising before the first CDP client connects.
        time.sleep(0.5)
        return True

    raise TimeoutError(f"Lightpanda CDP server not ready at {cdp_url}")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def stop(proc: subprocess.Popen, timeout: float = 5.0) -> None:
    """Stop Lightpanda process.

    Sends a terminate request first and escalates to a kill when the
    process has not exited after ``timeout`` seconds. Does nothing when
    ``proc`` is falsy or has already exited.

    Args:
        proc: Process handle returned by start().
        timeout: Seconds to wait for a graceful exit before killing.
    """
    if not proc or proc.poll() is not None:
        return

    proc.terminate()
    try:
        proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()
        # Reap the killed process so it does not linger as a zombie.
        proc.wait()
|
|
@@ -0,0 +1,469 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PageParser — Parser for web pages with JS rendering via Lightpanda/Playwright
|
|
3
|
+
and conversion to clean Markdown.
|
|
4
|
+
|
|
5
|
+
Dependencies:
|
|
6
|
+
pip install playwright html-to-markdown beautifulsoup4
|
|
7
|
+
playwright install chromium # if using regular Chrome, not Lightpanda
|
|
8
|
+
|
|
9
|
+
Running Lightpanda (optional, instead of Chrome):
|
|
10
|
+
./lightpanda serve --host 127.0.0.1 --port 9222
|
|
11
|
+
|
|
12
|
+
Usage:
|
|
13
|
+
parser = PageParser(with_image=False, with_link=False)
|
|
14
|
+
|
|
15
|
+
# Without proxy
|
|
16
|
+
result = parser.parse("https://example.com")
|
|
17
|
+
|
|
18
|
+
# With proxy for a specific request
|
|
19
|
+
proxy = ProxyConfig(server="http://1.2.3.4:8080", username="user", password="pass")
|
|
20
|
+
result = parser.parse("https://example.com", proxy=proxy)
|
|
21
|
+
|
|
22
|
+
print(result)
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
import re
|
|
26
|
+
import subprocess
|
|
27
|
+
import time
|
|
28
|
+
import asyncio
|
|
29
|
+
from dataclasses import dataclass
|
|
30
|
+
from typing import Optional, Literal
|
|
31
|
+
|
|
32
|
+
from bs4 import BeautifulSoup
|
|
33
|
+
from html_to_markdown import convert, ConversionOptions, PreprocessingOptions
|
|
34
|
+
from playwright.async_api import async_playwright, ProxySettings, ViewportSize
|
|
35
|
+
|
|
36
|
+
from pg2md import lightpanda
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass
class ProxyConfig:
    """
    Per-request proxy description.

    The server address may use any of these schemes:
        http://host:port
        https://host:port
        socks5://host:port

    Args:
        server   : proxy address (required)
        username : login (optional)
        password : password (optional)
        bypass   : comma-separated hosts that must not go through the proxy
                   (e.g. "localhost,127.0.0.1")
    """

    server: str
    username: Optional[str] = None
    password: Optional[str] = None
    bypass: Optional[str] = None

    def to_playwright(self) -> ProxySettings:
        """Build the Playwright ProxySettings mapping for this proxy."""
        settings: ProxySettings = {"server": self.server}
        optional_fields = (
            ("username", self.username),
            ("password", self.password),
            ("bypass", self.bypass),
        )
        # Unset or empty optional fields are left out of the mapping entirely.
        for key, value in optional_fields:
            if value:
                settings[key] = value
        return settings
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class UserAgents:
    """Popular User-Agent strings for bypassing blocks.

    Plain string constants: desktop browsers first, then search-engine
    crawlers. Any of them can be assigned to BrowserConfig.user_agent.
    """

    CHROME_DESKTOP = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    CHROME_MAC = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )

    FIREFOX_DESKTOP = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0"
    )

    SAFARI_MAC = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.2 Safari/605.1.15"
    )

    EDGE = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0"
    )

    GOOGLEBOT_DESKTOP = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"

    # Mobile Chrome writes "Mobile Safari/537.36" as two tokens separated by
    # a space; the fused "MobileSafari" spelling matches no real client.
    GOOGLEBOT_MOBILE = (
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Mobile Safari/537.36 "
        "(compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
    )

    GOOGLEBOT_VIDEO = (
        "Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.6099.71 Mobile Safari/537.36 "
        "Googlebot/2.1"
    )

    BINGBOT = "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)"

    BINGBOT_MOBILE = (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 "
        "(KHTML, like Gecko) Version/16.6 Mobile/15E148 BingWeb/7.15.13.7055 (advisor; +http://www.bing.com/bingbot.htm)"
    )

    YANDEXBOT = "Mozilla/5.0 (compatible; YandexBot/3.0; +http://yandex.com/bots)"

    DUCKBOT = "DuckDuckBot/1.0; (+http://duckduckgo.com/duckduckbot.html)"

    APPLEBOT = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/17.0 Safari/605.1.15 Applebot/0.1"
    )

    CHROME_LINUX = (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    )
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
@dataclass
class BrowserConfig:
    """
    Settings that control how the parser obtains a browser.

    Accepted cdp_url values:
        "auto"            - Auto-download and start Lightpanda (default)
        None              - Use built-in Chromium via Playwright
        "ws://host:port"  - Connect to existing CDP server (Lightpanda/Chrome)
    """

    # Browser selection, see the class docstring.
    cdp_url: Optional[str] = "auto"
    # Path to a Lightpanda binary that the parser launches itself when set.
    lightpanda_bin: Optional[str] = None
    # Passed as `timeout` to page.goto().
    navigation_timeout: int = 60_000
    # Passed as `wait_until` to page.goto().
    wait_until: Literal["load", "domcontentloaded", "networkidle"] = "domcontentloaded"
    # Proxy used when a request does not bring its own.
    default_proxy: Optional[ProxyConfig] = None
    user_agent: Optional[str] = UserAgents.CHROME_DESKTOP
    # Additional HTTP headers merged into every browser context.
    extra_headers: Optional[dict[str, str]] = None
    viewport: Optional[ViewportSize] = None

    def __post_init__(self):
        """Substitute a 1920x1080 viewport when none was supplied."""
        if self.viewport is not None:
            return
        self.viewport = {"width": 1920, "height": 1080}
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
class HtmlCleaner:
    """
    Prepares raw HTML for Markdown conversion:
      - drops every tag listed in ALWAYS_STRIP_TAGS together with its content
      - drops <img> elements (all of them, or only inline ones when images are kept)
      - unwraps links to their text (optional)
      - removes URL attributes that carry data: / blob: payloads
    """

    ALWAYS_STRIP_TAGS = [
        "script",
        "style",
        "noscript",
        "svg",
        "canvas",
        "video",
        "audio",
        "iframe",
        "object",
        "embed",
        "head",
    ]

    def __init__(self, with_image: bool = False, with_link: bool = True):
        self.with_image = with_image
        self.with_link = with_link

    def clean(self, html: str) -> str:
        """Return `html` re-serialised after the removals described on the class."""
        soup = BeautifulSoup(html, "html.parser")
        inline_schemes = ("data:", "blob:")

        def is_inline(value) -> bool:
            # Non-string attribute values are never treated as inline payloads.
            return isinstance(value, str) and value.startswith(inline_schemes)

        for tag_name in self.ALWAYS_STRIP_TAGS:
            for node in soup.find_all(tag_name):
                node.decompose()

        # Images: either all of them go, or only those embedding their data.
        for image in soup.find_all("img"):
            if not self.with_image or is_inline(image.get("src", "")):
                image.decompose()

        # Links: unwrap to plain text, or blank out inline targets.
        for anchor in soup.find_all("a"):
            if not self.with_link:
                anchor.replace_with(anchor.get_text())
            elif is_inline(anchor.get("href", "")):
                anchor["href"] = ""

        # Final sweep over every remaining element.
        for node in soup.find_all(True):
            for attr in ("src", "href", "srcset", "poster", "background"):
                if is_inline(node.get(attr, "")):
                    del node[attr]

        return str(soup)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
class MarkdownCleaner:
    """Post-processes generated Markdown to remove leftover noise."""

    # A whole line made of 40+ base64-alphabet characters.
    _BASE64_LINE = re.compile(r"^[A-Za-z0-9+/=]{40,}\s*$", re.MULTILINE)
    # Control characters except tab, line feed and carriage return.
    _BINARY_GARBAGE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
    _EXCESS_NEWLINES = re.compile(r"\n{3,}")
    # Markdown image syntax: ![alt](target)
    _MD_IMAGE = re.compile(r"!\[.*?\]\(.*?\)")

    def clean(self, text: str, strip_images: bool = False) -> str:
        """Return `text` without control characters, base64 lines and extra blank lines.

        Args:
            text: Markdown to clean.
            strip_images: Also remove Markdown image references.
        """
        # Order matters: newline runs are collapsed only after the removals
        # above have had a chance to leave empty lines behind.
        passes = [(self._BINARY_GARBAGE, ""), (self._BASE64_LINE, "")]
        if strip_images:
            passes.append((self._MD_IMAGE, ""))
        passes.append((self._EXCESS_NEWLINES, "\n\n"))

        for pattern, replacement in passes:
            text = pattern.sub(replacement, text)

        return text.strip()
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
class PageParser:
    """
    Parses web pages with JS rendering and returns clean Markdown.

    Args:
        with_image (bool)             : Include images in output. Default False.
        with_link (bool)              : Include links (href). Default True.
                                        False — links are replaced with their text.
        browser_config (BrowserConfig): Browser connection settings.
                                        Can set default_proxy for all requests.

    Proxy is passed to parse() / async_parse() per-request:
        proxy = ProxyConfig(server="socks5://1.2.3.4:1080")
        result = parser.parse("https://example.com", proxy=proxy)

    The parser is also a context manager: leaving the ``with`` block calls
    stop(), which shuts down every Lightpanda process this parser started.
    """

    def __init__(
        self,
        with_image: bool = False,
        with_link: bool = True,
        browser_config: Optional[BrowserConfig] = None,
    ):
        self.with_image = with_image
        self.with_link = with_link
        self.config = browser_config or BrowserConfig()

        self._html_cleaner = HtmlCleaner(with_image=with_image, with_link=with_link)
        self._md_cleaner = MarkdownCleaner()

        # Process launched from config.lightpanda_bin (see _maybe_start_lightpanda).
        self._lightpanda_proc: Optional[subprocess.Popen] = None
        # Process and endpoint of the instance launched for cdp_url="auto".
        self._auto_lightpanda_proc: Optional[subprocess.Popen] = None
        self._auto_cdp_url: Optional[str] = None

    def parse(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
        """
        Synchronous wrapper over async_parse.

        Runs the coroutine with asyncio.run(), so it must not be called from
        code that is already running inside an event loop.

        Args:
            url   : page to parse
            proxy : proxy for this specific request (overrides default_proxy)
        """
        return asyncio.run(self.async_parse(url, proxy=proxy))

    async def async_parse(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
        """
        Loads page, renders JS, returns clean Markdown.

        Args:
            url   : page to parse
            proxy : proxy for this specific request (overrides default_proxy)
        """
        html = await self._fetch_html(url, proxy=proxy)
        return self._html_to_markdown(html)

    async def async_parse_many(
        self,
        urls: list[str],
        proxy: Optional[ProxyConfig] = None,
    ) -> dict[str, str | BaseException]:
        """
        Parses multiple URLs in parallel.

        Args:
            urls  : list of pages
            proxy : one proxy for all requests (or None)

        Returns:
            Mapping of URL to its Markdown, or to the exception raised while
            parsing that URL (failures do not abort the other URLs).
        """
        tasks = [self.async_parse(url, proxy=proxy) for url in urls]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return {url: res for url, res in zip(urls, results)}

    async def _fetch_html(self, url: str, proxy: Optional[ProxyConfig] = None) -> str:
        """
        Opens browser, loads page, returns HTML.

        With a CDP endpoint the page is rendered through it. In "auto" mode
        any Lightpanda failure (start-up or rendering) is reported and the
        page is rendered once more with Playwright's built-in Chromium; for
        an explicitly configured endpoint the error propagates.
        """
        self._maybe_start_lightpanda()

        effective_proxy = proxy or self.config.default_proxy
        proxy_settings = effective_proxy.to_playwright() if effective_proxy else None

        auto_mode = self.config.cdp_url == "auto"
        try:
            cdp_url = self._resolve_cdp_url()
        except Exception as e:
            # Only "auto" mode can fail here (download / start-up); treat it
            # like any other Lightpanda failure and use Chromium instead.
            if not auto_mode:
                raise
            print(f"[pg2md] Lightpanda failed: {e}")
            print("[pg2md] Falling back to Chromium...")
            self._stop_auto_lightpanda()
            cdp_url = None

        headers: dict[str, str] = {}
        # Over CDP the user agent is sent as a plain header; the launched
        # Chromium additionally receives it through new_context(user_agent=...).
        if cdp_url is not None and self.config.user_agent:
            headers["User-Agent"] = self.config.user_agent
        if self.config.extra_headers:
            headers.update(self.config.extra_headers)

        async with async_playwright() as pw:
            if cdp_url is not None:
                try:
                    return await self._render_page(
                        pw, url, proxy_settings, headers, cdp_url=cdp_url
                    )
                except Exception as e:
                    if not auto_mode:
                        raise
                    print(f"[pg2md] Lightpanda failed: {e}")
                    print("[pg2md] Falling back to Chromium...")
                    self._stop_auto_lightpanda()

            return await self._render_page(
                pw, url, proxy_settings, headers, cdp_url=None
            )

    async def _render_page(
        self,
        pw,
        url: str,
        proxy_settings: Optional[ProxySettings],
        headers: dict[str, str],
        cdp_url: Optional[str],
    ) -> str:
        """
        Renders `url` in one browser session and returns the page HTML.

        Connects to `cdp_url` when given, otherwise launches headless
        Chromium. Page, context and browser are closed in that order on
        every exit path, including failures while they are being created.
        """
        context_options = {
            "proxy": proxy_settings,
            "extra_http_headers": headers if headers else None,
            "viewport": self.config.viewport,
        }
        if cdp_url is not None:
            browser = await pw.chromium.connect_over_cdp(cdp_url)
        else:
            browser = await pw.chromium.launch(headless=True)
            context_options["user_agent"] = self.config.user_agent

        try:
            context = await browser.new_context(**context_options)
            try:
                page = await context.new_page()
                try:
                    await page.goto(
                        url,
                        timeout=self.config.navigation_timeout,
                        wait_until=self.config.wait_until,
                    )
                    # Fixed settle time after navigation, presumably to let
                    # late scripts finish rendering.
                    await page.wait_for_timeout(1500)
                    return await page.content()
                finally:
                    await page.close()
            finally:
                await context.close()
        finally:
            await browser.close()

    def _resolve_cdp_url(self) -> Optional[str]:
        """
        Resolve CDP URL based on config.

        In "auto" mode a Lightpanda instance is started on first use and its
        endpoint is cached; None is returned when Lightpanda is not supported
        on this platform. Any other configured value is returned unchanged.
        """
        if self.config.cdp_url == "auto":
            if self._auto_cdp_url:
                return self._auto_cdp_url

            if lightpanda.is_supported():
                self._auto_lightpanda_proc = lightpanda.start()
                self._auto_cdp_url = "ws://127.0.0.1:9222"
                return self._auto_cdp_url
            else:
                return None
        return self.config.cdp_url

    def _html_to_markdown(self, html: str) -> str:
        """Cleans HTML and converts to Markdown."""

        clean_html = self._html_cleaner.clean(html)

        options = ConversionOptions(
            heading_style="atx",
            strong_em_symbol="*",
            bullets="*",
            escape_asterisks=False,
        )
        preprocessing = PreprocessingOptions(
            enabled=True,
            preset="aggressive",
            remove_navigation=True,
            remove_forms=True,
        )
        markdown = convert(clean_html, options, preprocessing)

        markdown = self._md_cleaner.clean(markdown, strip_images=not self.with_image)

        return markdown

    def _maybe_start_lightpanda(self) -> None:
        """If binary path is set and process not running — start it."""
        if not self.config.lightpanda_bin:
            return
        if self._lightpanda_proc and self._lightpanda_proc.poll() is None:
            return

        self._lightpanda_proc = subprocess.Popen(
            [
                self.config.lightpanda_bin,
                "serve",
                "--host",
                "127.0.0.1",
                "--port",
                "9222",
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
        # Fixed start-up delay; readiness of the server is not probed here.
        time.sleep(1.5)

    def stop_lightpanda(self) -> None:
        """Explicitly stops Lightpanda if started by us."""
        if self._lightpanda_proc:
            # Same shutdown path as the auto-started instance: terminate,
            # wait for the exit, and kill if it does not exit in time.
            lightpanda.stop(self._lightpanda_proc)
            self._lightpanda_proc = None

    def _stop_auto_lightpanda(self) -> None:
        """Stop auto-launched Lightpanda process."""
        if self._auto_lightpanda_proc:
            lightpanda.stop(self._auto_lightpanda_proc)
            self._auto_lightpanda_proc = None
            self._auto_cdp_url = None

    def stop(self) -> None:
        """Stop all browser processes started by this parser."""
        self.stop_lightpanda()
        self._stop_auto_lightpanda()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.stop()
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
if __name__ == "__main__":
    import sys

    # First command-line argument is the page to convert; without one a
    # demo page is used.
    cli_args = sys.argv[1:]
    target_url = cli_args[0] if cli_args else "https://example.com"

    # The context manager stops any browser process the parser started.
    with PageParser(with_image=False, with_link=False) as page_parser:
        markdown = page_parser.parse(target_url)

    print(markdown)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
manylinux1_x86_64.whl driver package
|