pg2md 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pg2md-1.0.1/LICENSE +21 -0
- pg2md-1.0.1/PKG-INFO +302 -0
- pg2md-1.0.1/README.md +268 -0
- pg2md-1.0.1/pg2md/__init__.py +257 -0
- pg2md-1.0.1/pg2md/html_to_md.py +168 -0
- pg2md-1.0.1/pg2md.egg-info/PKG-INFO +302 -0
- pg2md-1.0.1/pg2md.egg-info/SOURCES.txt +10 -0
- pg2md-1.0.1/pg2md.egg-info/dependency_links.txt +1 -0
- pg2md-1.0.1/pg2md.egg-info/requires.txt +11 -0
- pg2md-1.0.1/pg2md.egg-info/top_level.txt +1 -0
- pg2md-1.0.1/pyproject.toml +51 -0
- pg2md-1.0.1/setup.cfg +4 -0
pg2md-1.0.1/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
pg2md-1.0.1/PKG-INFO
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pg2md
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: HTML to Markdown converter with Requests or Playwright backend
|
|
5
|
+
Author-email: Your Name <your@email.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/yourname/pg2md
|
|
8
|
+
Project-URL: Repository, https://github.com/yourname/pg2md
|
|
9
|
+
Project-URL: Issues, https://github.com/yourname/pg2md/issues
|
|
10
|
+
Keywords: html,markdown,converter,playwright,requests,scraper
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: requests>=2.28.0
|
|
25
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
26
|
+
Requires-Dist: html-to-markdown>=1.1.0
|
|
27
|
+
Provides-Extra: playwright
|
|
28
|
+
Requires-Dist: playwright>=1.40.0; extra == "playwright"
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# pg2md
|
|
36
|
+
|
|
37
|
+
**HTML to Markdown converter** with Requests or Playwright backend.
|
|
38
|
+
|
|
39
|
+
Convert any webpage to clean Markdown. Choose between fast `requests` or full browser `playwright` for JavaScript-rendered pages.
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- **Two backends**: `Pg2MdRequests` (fast) or `Pg2MdPlaywright` (JS support)
|
|
44
|
+
- **Browser reuse**: Playwright instances share a single browser
|
|
45
|
+
- **Proxy support**: HTTP/HTTPS proxies with authentication
|
|
46
|
+
- **Custom headers & cookies**: Full control over requests
|
|
47
|
+
- **Clean output**: Optional removal of images and links
|
|
48
|
+
- **Context manager**: Auto-cleanup with `with` statement
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install pg2md
|
|
54
|
+
|
|
55
|
+
# For Playwright backend:
|
|
56
|
+
pip install pg2md[playwright]
|
|
57
|
+
playwright install chromium
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Quick Start
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from pg2md import Pg2MdRequests, Pg2MdPlaywright
|
|
64
|
+
|
|
65
|
+
# Simple usage with Requests
|
|
66
|
+
pg = Pg2MdRequests()
|
|
67
|
+
markdown = pg.run("https://example.com")
|
|
68
|
+
print(markdown)
|
|
69
|
+
|
|
70
|
+
# Playwright for JS-heavy sites
|
|
71
|
+
pg = Pg2MdPlaywright()
|
|
72
|
+
markdown = pg.run("https://spa-example.com")
|
|
73
|
+
pg.close()
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Usage
|
|
77
|
+
|
|
78
|
+
### Basic Conversion
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from pg2md import Pg2MdRequests
|
|
82
|
+
|
|
83
|
+
pg = Pg2MdRequests(with_image=False, with_link=False)
|
|
84
|
+
md = pg.run("https://news.ycombinator.com")
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### With Proxy
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from pg2md import Pg2MdRequests, Pg2MdPlaywright
|
|
91
|
+
|
|
92
|
+
# Format: http://user:password@host:port
|
|
93
|
+
# Or: host:port:user:password
|
|
94
|
+
proxy = "http://user:pass@proxy.example.com:8080"
|
|
95
|
+
|
|
96
|
+
# Requests
|
|
97
|
+
pg = Pg2MdRequests()
|
|
98
|
+
md = pg.run("https://example.com", proxy=proxy)
|
|
99
|
+
|
|
100
|
+
# Playwright
|
|
101
|
+
pg = Pg2MdPlaywright()
|
|
102
|
+
md = pg.run("https://example.com", proxy=proxy)
|
|
103
|
+
pg.close()
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Custom Headers & User-Agent
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from pg2md import Pg2MdRequests
|
|
110
|
+
|
|
111
|
+
pg = Pg2MdRequests()
|
|
112
|
+
md = pg.run(
|
|
113
|
+
"https://api.example.com/data",
|
|
114
|
+
headers={
|
|
115
|
+
"X-API-Key": "secret123",
|
|
116
|
+
"Accept": "application/json",
|
|
117
|
+
},
|
|
118
|
+
user_agent="MyBot/1.0",
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### With Cookies
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from pg2md import Pg2MdRequests
|
|
126
|
+
|
|
127
|
+
pg = Pg2MdRequests()
|
|
128
|
+
md = pg.run(
|
|
129
|
+
"https://example.com/dashboard",
|
|
130
|
+
cookies={
|
|
131
|
+
"session": "abc123",
|
|
132
|
+
"auth_token": "xyz789",
|
|
133
|
+
},
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Save to File
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from pg2md import Pg2MdRequests
|
|
141
|
+
|
|
142
|
+
pg = Pg2MdRequests()
|
|
143
|
+
pg.save("output.md", "https://example.com")
|
|
144
|
+
|
|
145
|
+
# With options
|
|
146
|
+
pg.save(
|
|
147
|
+
"article.md",
|
|
148
|
+
"https://blog.example.com/post",
|
|
149
|
+
proxy="http://user:pass@host:port",
|
|
150
|
+
user_agent="MyBot/1.0",
|
|
151
|
+
)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Context Manager
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from pg2md import Pg2MdPlaywright
|
|
158
|
+
|
|
159
|
+
with Pg2MdPlaywright() as pg:
|
|
160
|
+
md1 = pg.run("https://site1.com")
|
|
161
|
+
md2 = pg.run("https://site2.com")
|
|
162
|
+
# Browser closed automatically
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Multiple Instances
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
from pg2md import Pg2MdPlaywright
|
|
169
|
+
|
|
170
|
+
# Both share the same browser (efficient)
|
|
171
|
+
pg1 = Pg2MdPlaywright()
|
|
172
|
+
pg2 = Pg2MdPlaywright()
|
|
173
|
+
|
|
174
|
+
md1 = pg1.run("https://site1.com")
|
|
175
|
+
md2 = pg2.run("https://site2.com")
|
|
176
|
+
|
|
177
|
+
Pg2MdPlaywright.close_all() # Close shared browser
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## API Reference
|
|
181
|
+
|
|
182
|
+
### Pg2MdRequests
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
Pg2MdRequests(with_image=False, with_link=False)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
| Parameter | Type | Default | Description |
|
|
189
|
+
|-----------|------|---------|-------------|
|
|
190
|
+
| `with_image` | bool | False | Include images in output |
|
|
191
|
+
| `with_link` | bool | False | Include links in output |
|
|
192
|
+
|
|
193
|
+
### Pg2MdPlaywright
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
Pg2MdPlaywright(
|
|
197
|
+
browser=None, # Custom Browser instance
|
|
198
|
+
headless=True, # Headless mode
|
|
199
|
+
with_image=False,
|
|
200
|
+
with_link=False,
|
|
201
|
+
)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### Methods
|
|
205
|
+
|
|
206
|
+
#### `run(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
|
|
207
|
+
|
|
208
|
+
Fetch URL and convert to Markdown.
|
|
209
|
+
|
|
210
|
+
Returns: `str` (Markdown)
|
|
211
|
+
|
|
212
|
+
#### `fetch(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
|
|
213
|
+
|
|
214
|
+
Fetch HTML only.
|
|
215
|
+
|
|
216
|
+
Returns: `str` (HTML)
|
|
217
|
+
|
|
218
|
+
#### `convert(html)`
|
|
219
|
+
|
|
220
|
+
Convert HTML to Markdown.
|
|
221
|
+
|
|
222
|
+
Returns: `str` (Markdown)
|
|
223
|
+
|
|
224
|
+
#### `save(filepath, url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
|
|
225
|
+
|
|
226
|
+
Fetch, convert, and save to file.
|
|
227
|
+
|
|
228
|
+
#### `close()`
|
|
229
|
+
|
|
230
|
+
Close browser (Playwright only).
|
|
231
|
+
|
|
232
|
+
#### `close_all()` (classmethod, Playwright only)
|
|
233
|
+
|
|
234
|
+
Close all shared browsers.
|
|
235
|
+
|
|
236
|
+
## When to Use Which Backend?
|
|
237
|
+
|
|
238
|
+
| Use Requests | Use Playwright |
|
|
239
|
+
|--------------|----------------|
|
|
240
|
+
| Static HTML pages | SPA / JavaScript apps |
|
|
241
|
+
| Speed matters | Need rendered content |
|
|
242
|
+
| Simple scraping | Bypass anti-bot (sometimes) |
|
|
243
|
+
| Low memory | Modern web apps |
|
|
244
|
+
|
|
245
|
+
## Examples
|
|
246
|
+
|
|
247
|
+
### Scrape Multiple URLs
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
from pg2md import Pg2MdRequests
|
|
251
|
+
|
|
252
|
+
urls = [
|
|
253
|
+
"https://blog.example.com/post1",
|
|
254
|
+
"https://blog.example.com/post2",
|
|
255
|
+
"https://blog.example.com/post3",
|
|
256
|
+
]
|
|
257
|
+
|
|
258
|
+
pg = Pg2MdRequests(with_image=False, with_link=False)
|
|
259
|
+
|
|
260
|
+
for i, url in enumerate(urls):
|
|
261
|
+
pg.save(f"post_{i+1}.md", url)
|
|
262
|
+
print(f"Saved: {url}")
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
### Batch with Proxies
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
from pg2md import Pg2MdRequests
|
|
269
|
+
|
|
270
|
+
urls = ["https://site1.com", "https://site2.com", "https://site3.com"]
|
|
271
|
+
proxies = [
|
|
272
|
+
"http://user1:pass1@proxy1:8080",
|
|
273
|
+
"http://user2:pass2@proxy2:8080",
|
|
274
|
+
]
|
|
275
|
+
|
|
276
|
+
pg = Pg2MdRequests()
|
|
277
|
+
|
|
278
|
+
for i, url in enumerate(urls):
|
|
279
|
+
proxy = proxies[i % len(proxies)]
|
|
280
|
+
md = pg.run(url, proxy=proxy)
|
|
281
|
+
print(f"[{i+1}] {len(md)} chars")
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### Extract Article Content
|
|
285
|
+
|
|
286
|
+
```python
|
|
287
|
+
from pg2md import Pg2MdPlaywright
|
|
288
|
+
|
|
289
|
+
with Pg2MdPlaywright() as pg:
|
|
290
|
+
md = pg.run(
|
|
291
|
+
"https://medium.com/some-article",
|
|
292
|
+
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
# Save clean text
|
|
296
|
+
with open("article.md", "w") as f:
|
|
297
|
+
f.write(md)
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## License
|
|
301
|
+
|
|
302
|
+
MIT
|
pg2md-1.0.1/README.md
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# pg2md
|
|
2
|
+
|
|
3
|
+
**HTML to Markdown converter** with Requests or Playwright backend.
|
|
4
|
+
|
|
5
|
+
Convert any webpage to clean Markdown. Choose between fast `requests` or full browser `playwright` for JavaScript-rendered pages.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- **Two backends**: `Pg2MdRequests` (fast) or `Pg2MdPlaywright` (JS support)
|
|
10
|
+
- **Browser reuse**: Playwright instances share a single browser
|
|
11
|
+
- **Proxy support**: HTTP/HTTPS proxies with authentication
|
|
12
|
+
- **Custom headers & cookies**: Full control over requests
|
|
13
|
+
- **Clean output**: Optional removal of images and links
|
|
14
|
+
- **Context manager**: Auto-cleanup with `with` statement
|
|
15
|
+
|
|
16
|
+
## Installation
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
pip install pg2md
|
|
20
|
+
|
|
21
|
+
# For Playwright backend:
|
|
22
|
+
pip install pg2md[playwright]
|
|
23
|
+
playwright install chromium
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Quick Start
|
|
27
|
+
|
|
28
|
+
```python
|
|
29
|
+
from pg2md import Pg2MdRequests, Pg2MdPlaywright
|
|
30
|
+
|
|
31
|
+
# Simple usage with Requests
|
|
32
|
+
pg = Pg2MdRequests()
|
|
33
|
+
markdown = pg.run("https://example.com")
|
|
34
|
+
print(markdown)
|
|
35
|
+
|
|
36
|
+
# Playwright for JS-heavy sites
|
|
37
|
+
pg = Pg2MdPlaywright()
|
|
38
|
+
markdown = pg.run("https://spa-example.com")
|
|
39
|
+
pg.close()
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
### Basic Conversion
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
from pg2md import Pg2MdRequests
|
|
48
|
+
|
|
49
|
+
pg = Pg2MdRequests(with_image=False, with_link=False)
|
|
50
|
+
md = pg.run("https://news.ycombinator.com")
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### With Proxy
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from pg2md import Pg2MdRequests, Pg2MdPlaywright
|
|
57
|
+
|
|
58
|
+
# Format: http://user:password@host:port
|
|
59
|
+
# Or: host:port:user:password
|
|
60
|
+
proxy = "http://user:pass@proxy.example.com:8080"
|
|
61
|
+
|
|
62
|
+
# Requests
|
|
63
|
+
pg = Pg2MdRequests()
|
|
64
|
+
md = pg.run("https://example.com", proxy=proxy)
|
|
65
|
+
|
|
66
|
+
# Playwright
|
|
67
|
+
pg = Pg2MdPlaywright()
|
|
68
|
+
md = pg.run("https://example.com", proxy=proxy)
|
|
69
|
+
pg.close()
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Custom Headers & User-Agent
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from pg2md import Pg2MdRequests
|
|
76
|
+
|
|
77
|
+
pg = Pg2MdRequests()
|
|
78
|
+
md = pg.run(
|
|
79
|
+
"https://api.example.com/data",
|
|
80
|
+
headers={
|
|
81
|
+
"X-API-Key": "secret123",
|
|
82
|
+
"Accept": "application/json",
|
|
83
|
+
},
|
|
84
|
+
user_agent="MyBot/1.0",
|
|
85
|
+
)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### With Cookies
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from pg2md import Pg2MdRequests
|
|
92
|
+
|
|
93
|
+
pg = Pg2MdRequests()
|
|
94
|
+
md = pg.run(
|
|
95
|
+
"https://example.com/dashboard",
|
|
96
|
+
cookies={
|
|
97
|
+
"session": "abc123",
|
|
98
|
+
"auth_token": "xyz789",
|
|
99
|
+
},
|
|
100
|
+
)
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Save to File
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from pg2md import Pg2MdRequests
|
|
107
|
+
|
|
108
|
+
pg = Pg2MdRequests()
|
|
109
|
+
pg.save("output.md", "https://example.com")
|
|
110
|
+
|
|
111
|
+
# With options
|
|
112
|
+
pg.save(
|
|
113
|
+
"article.md",
|
|
114
|
+
"https://blog.example.com/post",
|
|
115
|
+
proxy="http://user:pass@host:port",
|
|
116
|
+
user_agent="MyBot/1.0",
|
|
117
|
+
)
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Context Manager
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
from pg2md import Pg2MdPlaywright
|
|
124
|
+
|
|
125
|
+
with Pg2MdPlaywright() as pg:
|
|
126
|
+
md1 = pg.run("https://site1.com")
|
|
127
|
+
md2 = pg.run("https://site2.com")
|
|
128
|
+
# Browser closed automatically
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### Multiple Instances
|
|
132
|
+
|
|
133
|
+
```python
|
|
134
|
+
from pg2md import Pg2MdPlaywright
|
|
135
|
+
|
|
136
|
+
# Both share the same browser (efficient)
|
|
137
|
+
pg1 = Pg2MdPlaywright()
|
|
138
|
+
pg2 = Pg2MdPlaywright()
|
|
139
|
+
|
|
140
|
+
md1 = pg1.run("https://site1.com")
|
|
141
|
+
md2 = pg2.run("https://site2.com")
|
|
142
|
+
|
|
143
|
+
Pg2MdPlaywright.close_all() # Close shared browser
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## API Reference
|
|
147
|
+
|
|
148
|
+
### Pg2MdRequests
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
Pg2MdRequests(with_image=False, with_link=False)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
| Parameter | Type | Default | Description |
|
|
155
|
+
|-----------|------|---------|-------------|
|
|
156
|
+
| `with_image` | bool | False | Include images in output |
|
|
157
|
+
| `with_link` | bool | False | Include links in output |
|
|
158
|
+
|
|
159
|
+
### Pg2MdPlaywright
|
|
160
|
+
|
|
161
|
+
```python
|
|
162
|
+
Pg2MdPlaywright(
|
|
163
|
+
browser=None, # Custom Browser instance
|
|
164
|
+
headless=True, # Headless mode
|
|
165
|
+
with_image=False,
|
|
166
|
+
with_link=False,
|
|
167
|
+
)
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
### Methods
|
|
171
|
+
|
|
172
|
+
#### `run(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
|
|
173
|
+
|
|
174
|
+
Fetch URL and convert to Markdown.
|
|
175
|
+
|
|
176
|
+
Returns: `str` (Markdown)
|
|
177
|
+
|
|
178
|
+
#### `fetch(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
|
|
179
|
+
|
|
180
|
+
Fetch HTML only.
|
|
181
|
+
|
|
182
|
+
Returns: `str` (HTML)
|
|
183
|
+
|
|
184
|
+
#### `convert(html)`
|
|
185
|
+
|
|
186
|
+
Convert HTML to Markdown.
|
|
187
|
+
|
|
188
|
+
Returns: `str` (Markdown)
|
|
189
|
+
|
|
190
|
+
#### `save(filepath, url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
|
|
191
|
+
|
|
192
|
+
Fetch, convert, and save to file.
|
|
193
|
+
|
|
194
|
+
#### `close()`
|
|
195
|
+
|
|
196
|
+
Close browser (Playwright only).
|
|
197
|
+
|
|
198
|
+
#### `close_all()` (classmethod, Playwright only)
|
|
199
|
+
|
|
200
|
+
Close all shared browsers.
|
|
201
|
+
|
|
202
|
+
## When to Use Which Backend?
|
|
203
|
+
|
|
204
|
+
| Use Requests | Use Playwright |
|
|
205
|
+
|--------------|----------------|
|
|
206
|
+
| Static HTML pages | SPA / JavaScript apps |
|
|
207
|
+
| Speed matters | Need rendered content |
|
|
208
|
+
| Simple scraping | Bypass anti-bot (sometimes) |
|
|
209
|
+
| Low memory | Modern web apps |
|
|
210
|
+
|
|
211
|
+
## Examples
|
|
212
|
+
|
|
213
|
+
### Scrape Multiple URLs
|
|
214
|
+
|
|
215
|
+
```python
|
|
216
|
+
from pg2md import Pg2MdRequests
|
|
217
|
+
|
|
218
|
+
urls = [
|
|
219
|
+
"https://blog.example.com/post1",
|
|
220
|
+
"https://blog.example.com/post2",
|
|
221
|
+
"https://blog.example.com/post3",
|
|
222
|
+
]
|
|
223
|
+
|
|
224
|
+
pg = Pg2MdRequests(with_image=False, with_link=False)
|
|
225
|
+
|
|
226
|
+
for i, url in enumerate(urls):
|
|
227
|
+
pg.save(f"post_{i+1}.md", url)
|
|
228
|
+
print(f"Saved: {url}")
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
### Batch with Proxies
|
|
232
|
+
|
|
233
|
+
```python
|
|
234
|
+
from pg2md import Pg2MdRequests
|
|
235
|
+
|
|
236
|
+
urls = ["https://site1.com", "https://site2.com", "https://site3.com"]
|
|
237
|
+
proxies = [
|
|
238
|
+
"http://user1:pass1@proxy1:8080",
|
|
239
|
+
"http://user2:pass2@proxy2:8080",
|
|
240
|
+
]
|
|
241
|
+
|
|
242
|
+
pg = Pg2MdRequests()
|
|
243
|
+
|
|
244
|
+
for i, url in enumerate(urls):
|
|
245
|
+
proxy = proxies[i % len(proxies)]
|
|
246
|
+
md = pg.run(url, proxy=proxy)
|
|
247
|
+
print(f"[{i+1}] {len(md)} chars")
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
### Extract Article Content
|
|
251
|
+
|
|
252
|
+
```python
|
|
253
|
+
from pg2md import Pg2MdPlaywright
|
|
254
|
+
|
|
255
|
+
with Pg2MdPlaywright() as pg:
|
|
256
|
+
md = pg.run(
|
|
257
|
+
"https://medium.com/some-article",
|
|
258
|
+
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
# Save clean text
|
|
262
|
+
with open("article.md", "w") as f:
|
|
263
|
+
f.write(md)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
## License
|
|
267
|
+
|
|
268
|
+
MIT
|
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pg2Md — HTML to Markdown converter with Requests or Playwright backend.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
from pg2md import Pg2MdRequests, Pg2MdPlaywright
|
|
6
|
+
|
|
7
|
+
# Requests
|
|
8
|
+
pg = Pg2MdRequests(with_image=False, with_link=False)
|
|
9
|
+
md = pg.run("https://example.com", proxy="http://user:pass@host:port")
|
|
10
|
+
|
|
11
|
+
# Playwright
|
|
12
|
+
pg = Pg2MdPlaywright()
|
|
13
|
+
md = pg.run("https://example.com")
|
|
14
|
+
pg.close()
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from abc import ABC, abstractmethod
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Optional
|
|
20
|
+
from urllib.parse import urlparse
|
|
21
|
+
|
|
22
|
+
from requests import Session
|
|
23
|
+
from playwright.sync_api import sync_playwright, Browser
|
|
24
|
+
|
|
25
|
+
from .html_to_md import HtmlToMarkdown
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Pg2Md(ABC):
    """Abstract fetch-then-convert pipeline shared by every backend.

    A subclass only has to implement :meth:`fetch`. Conversion is delegated
    to the ``HtmlToMarkdown`` instance created in ``__init__``, and the
    object can be used as a context manager so that :meth:`close` runs on
    exit.
    """

    def __init__(self, with_image: bool = False, with_link: bool = False):
        """Create the Markdown converter used by :meth:`convert`.

        Args:
            with_image: Passed to ``HtmlToMarkdown`` as ``with_image``.
            with_link: Passed to ``HtmlToMarkdown`` as ``with_link``.
        """
        self._converter = HtmlToMarkdown(with_image=with_image, with_link=with_link)

    @abstractmethod
    def fetch(
        self,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> str:
        """Return the HTML of ``url``; every backend must implement this."""
        ...

    def convert(self, html: str) -> str:
        """Turn an HTML string into Markdown using the configured converter."""
        markdown = self._converter.convert(html)
        return markdown

    def run(
        self,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> str:
        """Fetch ``url`` and return its content as Markdown.

        The arguments are forwarded positionally to :meth:`fetch`.
        """
        return self.convert(
            self.fetch(url, proxy, headers, cookies, user_agent, timeout)
        )

    def save(
        self,
        filepath: str,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> None:
        """Run the pipeline for ``url`` and write the Markdown to ``filepath``.

        The file is written as UTF-8, replacing any existing content.
        """
        markdown = self.run(url, proxy, headers, cookies, user_agent, timeout)
        target = Path(filepath)
        target.write_text(markdown, encoding="utf-8")

    def close(self):
        """Release resources; a no-op here, overridden where needed."""

    def __enter__(self):
        """Enter the ``with`` block and hand back this instance."""
        return self

    def __exit__(self, *args):
        """Leave the ``with`` block, calling :meth:`close`."""
        self.close()
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class Pg2MdRequests(Pg2Md):
    """Requests-based implementation.

    Every :meth:`fetch` call uses its own short-lived ``requests.Session``,
    which is closed before the call returns.
    """

    def fetch(
        self,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> str:
        """Download ``url`` with ``requests`` and return the body as text.

        Args:
            url: Address to GET.
            proxy: Optional proxy, in any form accepted by
                :meth:`_normalize_proxy`; used for both http and https.
            headers: Optional extra request headers. The dict is copied,
                never mutated.
            cookies: Optional cookies passed straight to ``Session.get``.
            user_agent: If given, overrides any ``User-Agent`` in ``headers``.
            timeout: Passed to ``Session.get`` as ``timeout``.

        Returns:
            ``resp.text`` of the response.

        Raises:
            Whatever ``Session.get`` raises, or ``resp.raise_for_status()``
            raises for an error status.
        """
        final_headers = dict(headers) if headers else {}
        if user_agent:
            final_headers["User-Agent"] = user_agent

        proxies = None
        if proxy:
            proxy_url = self._normalize_proxy(proxy)
            proxies = {"http": proxy_url, "https": proxy_url}

        # The context manager closes the session (and its pooled connections)
        # even when the request or the status check raises.
        with Session() as session:
            resp = session.get(
                url,
                proxies=proxies,
                headers=final_headers or None,
                cookies=cookies,
                timeout=timeout,
            )
            resp.raise_for_status()
            return resp.text

    def _normalize_proxy(self, proxy: str) -> str:
        """Normalize a proxy string to ``scheme://user:pass@host:port`` form.

        Accepted inputs:
            * a full URL with any scheme (``http://``, ``https://``,
              ``socks5://`` ...) - returned unchanged;
            * ``host:port:user:password`` - rewritten to an ``http://`` URL
              with percent-encoded credentials; the password may itself
              contain ``:``;
            * anything else (e.g. ``host:port``) - prefixed with ``http://``.
        """
        from urllib.parse import quote

        # A leading alphanumeric token followed by "://" is a URL scheme.
        # Checking the token keeps a password that contains "://" from being
        # mistaken for one.
        scheme, sep, _ = proxy.partition("://")
        if sep and scheme.isalnum():
            return proxy

        # maxsplit=3 keeps any further colons inside the password.
        parts = proxy.split(":", 3)
        if len(parts) == 4:
            host, port, user, password = parts
            # Percent-encode so characters such as "@", "/" or ":" in the
            # credentials cannot corrupt the URL structure.
            safe_user = quote(user, safe="")
            safe_password = quote(password, safe="")
            return f"http://{safe_user}:{safe_password}@{host}:{port}"

        return f"http://{proxy}"
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
class Pg2MdPlaywright(Pg2Md):
    """Playwright-based implementation with browser reuse.

    Instances created without an explicit ``browser`` share one Chromium
    browser per ``headless`` value through the class-level
    ``_shared_browsers`` registry. Each :meth:`fetch` runs in its own
    browser context, which is always closed before the call returns.
    """

    # Playwright driver handle, started lazily by _get_playwright().
    _shared_playwright = None
    # Intentionally shared mutable registry: ("chromium", headless) -> Browser.
    _shared_browsers: dict = {}

    def __init__(
        self,
        browser: Optional[Browser] = None,
        headless: bool = True,
        with_image: bool = False,
        with_link: bool = False,
    ):
        """Configure the backend.

        Args:
            browser: Caller-supplied browser. When given, it is used as-is
                and never closed by this instance.
            headless: Headless flag for the shared browser; ignored for
                launching purposes when ``browser`` is supplied.
            with_image: Forwarded to the base class.
            with_link: Forwarded to the base class.
        """
        super().__init__(with_image, with_link)
        self._browser = browser
        self._headless = headless
        self._owns_browser = browser is None

    @classmethod
    def _get_playwright(cls):
        """Return the shared Playwright driver, starting it on first use."""
        if cls._shared_playwright is None:
            cls._shared_playwright = sync_playwright().start()
        return cls._shared_playwright

    @property
    def browser(self) -> Browser:
        """Get or create browser instance.

        A caller-supplied browser is returned unchanged. Otherwise the
        shared registry is consulted on every access, and the browser is
        (re)launched when it is missing or no longer connected - for
        example after another instance called :meth:`close` or after
        :meth:`close_all`.
        """
        if not self._owns_browser:
            return self._browser

        key = ("chromium", self._headless)
        shared = self._shared_browsers.get(key)
        if shared is None or not shared.is_connected():
            pw = self._get_playwright()
            shared = pw.chromium.launch(headless=self._headless)
            self._shared_browsers[key] = shared
        self._browser = shared
        return shared

    def fetch(
        self,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> str:
        """Load ``url`` in a fresh browser context and return the page HTML.

        Args:
            url: Address to navigate to.
            proxy: Optional proxy, in any form accepted by
                :meth:`_parse_proxy`.
            headers: Optional extra HTTP headers for the context.
            cookies: Optional ``name -> value`` mapping; each cookie is set
                for the hostname of ``url`` with path ``/``.
            user_agent: Optional User-Agent for the context.
            timeout: Navigation timeout in seconds (converted to
                milliseconds for ``page.goto``).

        Returns:
            The result of ``page.content()`` after navigation.
        """
        context_opts = {}
        if proxy:
            context_opts["proxy"] = self._parse_proxy(proxy)
        if user_agent:
            context_opts["user_agent"] = user_agent

        context = self.browser.new_context(**context_opts)
        try:
            if headers:
                context.set_extra_http_headers(headers)
            if cookies:
                domain = urlparse(url).hostname
                # Playwright needs either "url" or a "domain"/"path" pair;
                # "domain" on its own is rejected.
                context.add_cookies(
                    [
                        {"name": k, "value": v, "domain": domain, "path": "/"}
                        for k, v in cookies.items()
                    ]
                )

            page = context.new_page()
            page.goto(url, timeout=timeout * 1000)
            return page.content()
        finally:
            # Closing the context also closes its pages, so nothing leaks
            # when navigation times out or fails.
            context.close()

    def _parse_proxy(self, proxy: str) -> dict:
        """Parse proxy string to Playwright format.

        Accepted inputs:
            * a full URL with any scheme, optionally with credentials;
            * ``host:port:user:password`` (the password may contain ``:``);
            * anything else (e.g. ``host:port``) - prefixed with ``http://``.

        Returns:
            A dict with ``server`` and, when credentials are present,
            ``username`` and ``password``.
        """
        from urllib.parse import unquote

        # A leading alphanumeric token followed by "://" is a URL scheme.
        scheme, sep, _ = proxy.partition("://")
        if sep and scheme.isalnum():
            parsed = urlparse(proxy)
            server = f"{parsed.scheme}://{parsed.hostname}"
            # Only append the port when the URL actually has one; otherwise
            # the server string would end in ":None".
            if parsed.port is not None:
                server = f"{server}:{parsed.port}"
            result = {"server": server}
            if parsed.username:
                # urlparse leaves percent-escapes in place; Playwright takes
                # the credentials verbatim, so decode them here.
                result["username"] = unquote(parsed.username)
                result["password"] = unquote(parsed.password or "")
            return result

        # maxsplit=3 keeps any further colons inside the password.
        parts = proxy.split(":", 3)
        if len(parts) == 4:
            host, port, user, password = parts
            return {
                "server": f"http://{host}:{port}",
                "username": user,
                "password": password,
            }

        return {"server": f"http://{proxy}"}

    def close(self):
        """Close browser if owned by this instance.

        For an instance without a caller-supplied browser this closes the
        shared browser for its ``headless`` value; other instances relaunch
        it on their next :attr:`browser` access. A caller-supplied browser
        is left untouched.
        """
        if not self._owns_browser:
            return

        key = ("chromium", self._headless)
        # Remove the registry entry first so it is gone even if close() raises.
        shared = self._shared_browsers.pop(key, None)
        self._browser = None
        if shared is not None:
            shared.close()

    @classmethod
    def close_all(cls):
        """Close all shared browsers and playwright."""
        browsers = list(cls._shared_browsers.values())
        cls._shared_browsers.clear()
        try:
            for browser in browsers:
                browser.close()
        finally:
            # Stop the driver even when a browser failed to close.
            if cls._shared_playwright is not None:
                cls._shared_playwright.stop()
                cls._shared_playwright = None
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HtmlToMarkdown — standalone HTML to Markdown converter.
|
|
3
|
+
|
|
4
|
+
No browser dependencies, just HTML -> Markdown conversion.
|
|
5
|
+
|
|
6
|
+
Dependencies:
|
|
7
|
+
pip install html-to-markdown beautifulsoup4
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
converter = HtmlToMarkdown(with_image=False, with_link=False)
|
|
11
|
+
markdown = converter.convert(html_string)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from typing import Optional
|
|
16
|
+
|
|
17
|
+
from bs4 import BeautifulSoup
|
|
18
|
+
from html_to_markdown import convert, ConversionOptions, PreprocessingOptions
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class HtmlToMarkdown:
|
|
22
|
+
"""
|
|
23
|
+
Converts HTML to clean Markdown.
|
|
24
|
+
|
|
25
|
+
Steps:
|
|
26
|
+
1. Clean HTML (remove scripts, styles, optional images/links)
|
|
27
|
+
2. Convert to Markdown via html-to-markdown
|
|
28
|
+
3. Clean final Markdown (remove base64, excess newlines)
|
|
29
|
+
|
|
30
|
+
Args:
|
|
31
|
+
with_image: Include images in output. Default False.
|
|
32
|
+
with_link: Include links (href). Default True.
|
|
33
|
+
False — links are replaced with their text.
|
|
34
|
+
heading_style: "atx" (#) or "setext" (underline). Default "atx".
|
|
35
|
+
strong_em_symbol: "*" or "_". Default "*".
|
|
36
|
+
bullets: Bullet character. Default "*".
|
|
37
|
+
escape_asterisks: Escape asterisks in text. Default False.
|
|
38
|
+
preprocessing_preset: "aggressive", "moderate", or "conservative". Default "aggressive".
|
|
39
|
+
remove_navigation: Remove navigation elements. Default True.
|
|
40
|
+
remove_forms: Remove form elements. Default True.
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
STRIP_TAGS = [
|
|
44
|
+
"script",
|
|
45
|
+
"style",
|
|
46
|
+
"noscript",
|
|
47
|
+
"svg",
|
|
48
|
+
"canvas",
|
|
49
|
+
"video",
|
|
50
|
+
"audio",
|
|
51
|
+
"iframe",
|
|
52
|
+
"object",
|
|
53
|
+
"embed",
|
|
54
|
+
"head",
|
|
55
|
+
]
|
|
56
|
+
|
|
57
|
+
_BASE64_LINE = re.compile(r"^[A-Za-z0-9+/=]{40,}\s*$", re.MULTILINE)
|
|
58
|
+
_BINARY_GARBAGE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
|
|
59
|
+
_EXCESS_NEWLINES = re.compile(r"\n{3,}")
|
|
60
|
+
_MD_IMAGE = re.compile(r"!\[.*?\]\(.*?\)")
|
|
61
|
+
|
|
62
|
+
def __init__(
|
|
63
|
+
self,
|
|
64
|
+
with_image: bool = False,
|
|
65
|
+
with_link: bool = True,
|
|
66
|
+
heading_style: str = "atx",
|
|
67
|
+
strong_em_symbol: str = "*",
|
|
68
|
+
bullets: str = "*",
|
|
69
|
+
escape_asterisks: bool = False,
|
|
70
|
+
preprocessing_preset: str = "aggressive",
|
|
71
|
+
remove_navigation: bool = True,
|
|
72
|
+
remove_forms: bool = True,
|
|
73
|
+
):
|
|
74
|
+
self.with_image = with_image
|
|
75
|
+
self.with_link = with_link
|
|
76
|
+
self.heading_style = heading_style
|
|
77
|
+
self.strong_em_symbol = strong_em_symbol
|
|
78
|
+
self.bullets = bullets
|
|
79
|
+
self.escape_asterisks = escape_asterisks
|
|
80
|
+
self.preprocessing_preset = preprocessing_preset
|
|
81
|
+
self.remove_navigation = remove_navigation
|
|
82
|
+
self.remove_forms = remove_forms
|
|
83
|
+
|
|
84
|
+
def convert(self, html: str) -> str:
|
|
85
|
+
"""
|
|
86
|
+
Convert HTML to clean Markdown.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
html: HTML string
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
Clean Markdown string
|
|
93
|
+
"""
|
|
94
|
+
clean_html = self._clean_html(html)
|
|
95
|
+
markdown = self._html_to_markdown_lib(clean_html)
|
|
96
|
+
markdown = self._clean_markdown(markdown)
|
|
97
|
+
return markdown
|
|
98
|
+
|
|
99
|
+
def _clean_html(self, html: str) -> str:
    """Remove unwanted tags and embedded-resource attributes from HTML.

    Steps, in order:
      1. Every element named in STRIP_TAGS is removed with its subtree.
      2. <img> elements are removed entirely when with_image is False;
         otherwise only those whose src is a data:/blob: URI are removed.
      3. <a> elements are replaced by their text when with_link is False;
         otherwise a data:/blob: href is blanked to "".
      4. Any remaining src/href/srcset/poster/background attribute holding
         a data:/blob: URI is deleted.

    Args:
        html: Raw HTML string.

    Returns:
        The cleaned document serialized back to a string.
    """
    def is_embedded(value) -> bool:
        # URI schemes are case-insensitive and user agents ignore leading
        # whitespace in URL attributes, so normalize before comparing;
        # a plain startswith("data:") lets "DATA:..." or " data:..." through.
        return isinstance(value, str) and value.lstrip().lower().startswith(
            ("data:", "blob:")
        )

    soup = BeautifulSoup(html, "html.parser")

    for tag in self.STRIP_TAGS:
        for el in soup.find_all(tag):
            el.decompose()

    for img in soup.find_all("img"):
        if not self.with_image or is_embedded(img.get("src", "")):
            img.decompose()

    for anchor in soup.find_all("a"):
        if not self.with_link:
            anchor.replace_with(anchor.get_text())
        elif is_embedded(anchor.get("href", "")):
            # Keep the anchor (and its text) but drop the inline payload.
            anchor["href"] = ""

    for el in soup.find_all(True):
        for attr in ("src", "href", "srcset", "poster", "background"):
            if is_embedded(el.get(attr, "")):
                del el[attr]

    return str(soup)
|
|
132
|
+
|
|
133
|
+
def _html_to_markdown_lib(self, html: str) -> str:
    """Run the html-to-markdown library over *html* using this instance's settings.

    The formatting attributes are packed into ConversionOptions and the
    preprocessing attributes into PreprocessingOptions (always enabled);
    both are passed to the library's convert() along with the HTML.
    """
    return convert(
        html,
        ConversionOptions(
            heading_style=self.heading_style,
            strong_em_symbol=self.strong_em_symbol,
            bullets=self.bullets,
            escape_asterisks=self.escape_asterisks,
        ),
        PreprocessingOptions(
            enabled=True,
            preset=self.preprocessing_preset,
            remove_navigation=self.remove_navigation,
            remove_forms=self.remove_forms,
        ),
    )
|
|
148
|
+
|
|
149
|
+
def _clean_markdown(self, text: str) -> str:
|
|
150
|
+
"""Final cleanup of Markdown text."""
|
|
151
|
+
text = self._BINARY_GARBAGE.sub("", text)
|
|
152
|
+
text = self._BASE64_LINE.sub("", text)
|
|
153
|
+
|
|
154
|
+
if not self.with_image:
|
|
155
|
+
text = self._MD_IMAGE.sub("", text)
|
|
156
|
+
|
|
157
|
+
text = self._EXCESS_NEWLINES.sub("\n\n", text)
|
|
158
|
+
|
|
159
|
+
return text.strip()
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
if __name__ == "__main__":
    import sys

    # Read HTML from the file named by the first argument, or from stdin
    # when no argument is given.
    if sys.argv[1:]:
        # Use a context manager so the handle is closed deterministically,
        # and an explicit encoding so the result does not depend on the
        # platform's locale; undecodable bytes are replaced, not fatal.
        with open(sys.argv[1], encoding="utf-8", errors="replace") as source:
            html = source.read()
    else:
        html = sys.stdin.read()

    converter = HtmlToMarkdown(with_image=False, with_link=False)
    print(converter.convert(html))
|
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pg2md
|
|
3
|
+
Version: 1.0.1
|
|
4
|
+
Summary: HTML to Markdown converter with Requests or Playwright backend
|
|
5
|
+
Author-email: Your Name <your@email.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/yourname/pg2md
|
|
8
|
+
Project-URL: Repository, https://github.com/yourname/pg2md
|
|
9
|
+
Project-URL: Issues, https://github.com/yourname/pg2md/issues
|
|
10
|
+
Keywords: html,markdown,converter,playwright,requests,scraper
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
19
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Requires-Python: >=3.10
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: requests>=2.28.0
|
|
25
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
26
|
+
Requires-Dist: html-to-markdown>=1.1.0
|
|
27
|
+
Provides-Extra: playwright
|
|
28
|
+
Requires-Dist: playwright>=1.40.0; extra == "playwright"
|
|
29
|
+
Provides-Extra: dev
|
|
30
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: build>=1.0.0; extra == "dev"
|
|
32
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
33
|
+
Dynamic: license-file
|
|
34
|
+
|
|
35
|
+
# pg2md
|
|
36
|
+
|
|
37
|
+
**HTML to Markdown converter** with Requests or Playwright backend.
|
|
38
|
+
|
|
39
|
+
Convert any webpage to clean Markdown. Choose between fast `requests` or full browser `playwright` for JavaScript-rendered pages.
|
|
40
|
+
|
|
41
|
+
## Features
|
|
42
|
+
|
|
43
|
+
- **Two backends**: `Pg2MdRequests` (fast) or `Pg2MdPlaywright` (JS support)
|
|
44
|
+
- **Browser reuse**: Playwright instances share a single browser
|
|
45
|
+
- **Proxy support**: HTTP/HTTPS proxies with authentication
|
|
46
|
+
- **Custom headers & cookies**: Full control over requests
|
|
47
|
+
- **Clean output**: Optional removal of images and links
|
|
48
|
+
- **Context manager**: Auto-cleanup with `with` statement
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
pip install pg2md
|
|
54
|
+
|
|
55
|
+
# For Playwright backend:
|
|
56
|
+
pip install pg2md[playwright]
|
|
57
|
+
playwright install chromium
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Quick Start
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from pg2md import Pg2MdRequests, Pg2MdPlaywright
|
|
64
|
+
|
|
65
|
+
# Simple usage with Requests
|
|
66
|
+
pg = Pg2MdRequests()
|
|
67
|
+
markdown = pg.run("https://example.com")
|
|
68
|
+
print(markdown)
|
|
69
|
+
|
|
70
|
+
# Playwright for JS-heavy sites
|
|
71
|
+
pg = Pg2MdPlaywright()
|
|
72
|
+
markdown = pg.run("https://spa-example.com")
|
|
73
|
+
pg.close()
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
## Usage
|
|
77
|
+
|
|
78
|
+
### Basic Conversion
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from pg2md import Pg2MdRequests
|
|
82
|
+
|
|
83
|
+
pg = Pg2MdRequests(with_image=False, with_link=False)
|
|
84
|
+
md = pg.run("https://news.ycombinator.com")
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
### With Proxy
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from pg2md import Pg2MdRequests, Pg2MdPlaywright
|
|
91
|
+
|
|
92
|
+
# Format: http://user:password@host:port
|
|
93
|
+
# Or: host:port:user:password
|
|
94
|
+
proxy = "http://user:pass@proxy.example.com:8080"
|
|
95
|
+
|
|
96
|
+
# Requests
|
|
97
|
+
pg = Pg2MdRequests()
|
|
98
|
+
md = pg.run("https://example.com", proxy=proxy)
|
|
99
|
+
|
|
100
|
+
# Playwright
|
|
101
|
+
pg = Pg2MdPlaywright()
|
|
102
|
+
md = pg.run("https://example.com", proxy=proxy)
|
|
103
|
+
pg.close()
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Custom Headers & User-Agent
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from pg2md import Pg2MdRequests
|
|
110
|
+
|
|
111
|
+
pg = Pg2MdRequests()
|
|
112
|
+
md = pg.run(
|
|
113
|
+
"https://api.example.com/data",
|
|
114
|
+
headers={
|
|
115
|
+
"X-API-Key": "secret123",
|
|
116
|
+
"Accept": "application/json",
|
|
117
|
+
},
|
|
118
|
+
user_agent="MyBot/1.0",
|
|
119
|
+
)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### With Cookies
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from pg2md import Pg2MdRequests
|
|
126
|
+
|
|
127
|
+
pg = Pg2MdRequests()
|
|
128
|
+
md = pg.run(
|
|
129
|
+
"https://example.com/dashboard",
|
|
130
|
+
cookies={
|
|
131
|
+
"session": "abc123",
|
|
132
|
+
"auth_token": "xyz789",
|
|
133
|
+
},
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Save to File
|
|
138
|
+
|
|
139
|
+
```python
|
|
140
|
+
from pg2md import Pg2MdRequests
|
|
141
|
+
|
|
142
|
+
pg = Pg2MdRequests()
|
|
143
|
+
pg.save("output.md", "https://example.com")
|
|
144
|
+
|
|
145
|
+
# With options
|
|
146
|
+
pg.save(
|
|
147
|
+
"article.md",
|
|
148
|
+
"https://blog.example.com/post",
|
|
149
|
+
proxy="http://user:pass@host:port",
|
|
150
|
+
user_agent="MyBot/1.0",
|
|
151
|
+
)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Context Manager
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from pg2md import Pg2MdPlaywright
|
|
158
|
+
|
|
159
|
+
with Pg2MdPlaywright() as pg:
|
|
160
|
+
md1 = pg.run("https://site1.com")
|
|
161
|
+
md2 = pg.run("https://site2.com")
|
|
162
|
+
# Browser closed automatically
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Multiple Instances
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
from pg2md import Pg2MdPlaywright
|
|
169
|
+
|
|
170
|
+
# Both share the same browser (efficient)
|
|
171
|
+
pg1 = Pg2MdPlaywright()
|
|
172
|
+
pg2 = Pg2MdPlaywright()
|
|
173
|
+
|
|
174
|
+
md1 = pg1.run("https://site1.com")
|
|
175
|
+
md2 = pg2.run("https://site2.com")
|
|
176
|
+
|
|
177
|
+
Pg2MdPlaywright.close_all() # Close shared browser
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## API Reference
|
|
181
|
+
|
|
182
|
+
### Pg2MdRequests
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
Pg2MdRequests(with_image=False, with_link=False)
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
| Parameter | Type | Default | Description |
|
|
189
|
+
|-----------|------|---------|-------------|
|
|
190
|
+
| `with_image` | bool | False | Include images in output |
|
|
191
|
+
| `with_link` | bool | False | Include links in output |
|
|
192
|
+
|
|
193
|
+
### Pg2MdPlaywright
|
|
194
|
+
|
|
195
|
+
```python
|
|
196
|
+
Pg2MdPlaywright(
|
|
197
|
+
browser=None, # Custom Browser instance
|
|
198
|
+
headless=True, # Headless mode
|
|
199
|
+
with_image=False,
|
|
200
|
+
with_link=False,
|
|
201
|
+
)
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
### Methods
|
|
205
|
+
|
|
206
|
+
#### `run(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
|
|
207
|
+
|
|
208
|
+
Fetch URL and convert to Markdown.
|
|
209
|
+
|
|
210
|
+
Returns: `str` (Markdown)
|
|
211
|
+
|
|
212
|
+
#### `fetch(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
|
|
213
|
+
|
|
214
|
+
Fetch HTML only.
|
|
215
|
+
|
|
216
|
+
Returns: `str` (HTML)
|
|
217
|
+
|
|
218
|
+
#### `convert(html)`
|
|
219
|
+
|
|
220
|
+
Convert HTML to Markdown.
|
|
221
|
+
|
|
222
|
+
Returns: `str` (Markdown)
|
|
223
|
+
|
|
224
|
+
#### `save(filepath, url, **kwargs)`
|
|
225
|
+
|
|
226
|
+
Fetch, convert, and save to file.
|
|
227
|
+
|
|
228
|
+
#### `close()`
|
|
229
|
+
|
|
230
|
+
Close browser (Playwright only).
|
|
231
|
+
|
|
232
|
+
#### `close_all()` (classmethod, Playwright only)
|
|
233
|
+
|
|
234
|
+
Close all shared browsers.
|
|
235
|
+
|
|
236
|
+
## When to Use Which Backend?
|
|
237
|
+
|
|
238
|
+
| Use Requests | Use Playwright |
|
|
239
|
+
|--------------|----------------|
|
|
240
|
+
| Static HTML pages | SPA / JavaScript apps |
|
|
241
|
+
| Speed matters | Need rendered content |
|
|
242
|
+
| Simple scraping | Bypass anti-bot (sometimes) |
|
|
243
|
+
| Low memory | Modern web apps |
|
|
244
|
+
|
|
245
|
+
## Examples
|
|
246
|
+
|
|
247
|
+
### Scrape Multiple URLs
|
|
248
|
+
|
|
249
|
+
```python
|
|
250
|
+
from pg2md import Pg2MdRequests
|
|
251
|
+
|
|
252
|
+
urls = [
|
|
253
|
+
"https://blog.example.com/post1",
|
|
254
|
+
"https://blog.example.com/post2",
|
|
255
|
+
"https://blog.example.com/post3",
|
|
256
|
+
]
|
|
257
|
+
|
|
258
|
+
pg = Pg2MdRequests(with_image=False, with_link=False)
|
|
259
|
+
|
|
260
|
+
for i, url in enumerate(urls):
|
|
261
|
+
pg.save(f"post_{i+1}.md", url)
|
|
262
|
+
print(f"Saved: {url}")
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
### Batch with Proxies
|
|
266
|
+
|
|
267
|
+
```python
|
|
268
|
+
from pg2md import Pg2MdRequests
|
|
269
|
+
|
|
270
|
+
urls = ["https://site1.com", "https://site2.com", "https://site3.com"]
|
|
271
|
+
proxies = [
|
|
272
|
+
"http://user1:pass1@proxy1:8080",
|
|
273
|
+
"http://user2:pass2@proxy2:8080",
|
|
274
|
+
]
|
|
275
|
+
|
|
276
|
+
pg = Pg2MdRequests()
|
|
277
|
+
|
|
278
|
+
for i, url in enumerate(urls):
|
|
279
|
+
proxy = proxies[i % len(proxies)]
|
|
280
|
+
md = pg.run(url, proxy=proxy)
|
|
281
|
+
print(f"[{i+1}] {len(md)} chars")
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
### Extract Article Content
|
|
285
|
+
|
|
286
|
+
```python
|
|
287
|
+
from pg2md import Pg2MdPlaywright
|
|
288
|
+
|
|
289
|
+
with Pg2MdPlaywright() as pg:
|
|
290
|
+
md = pg.run(
|
|
291
|
+
"https://medium.com/some-article",
|
|
292
|
+
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
# Save clean text
|
|
296
|
+
with open("article.md", "w") as f:
|
|
297
|
+
f.write(md)
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
## License
|
|
301
|
+
|
|
302
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
pg2md
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "pg2md"
|
|
7
|
+
version = "1.0.1"
|
|
8
|
+
description = "HTML to Markdown converter with Requests or Playwright backend"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Your Name", email = "your@email.com"}
|
|
14
|
+
]
|
|
15
|
+
keywords = ["html", "markdown", "converter", "playwright", "requests", "scraper"]
|
|
16
|
+
classifiers = [
|
|
17
|
+
"Development Status :: 4 - Beta",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"License :: OSI Approved :: MIT License",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.10",
|
|
22
|
+
"Programming Language :: Python :: 3.11",
|
|
23
|
+
"Programming Language :: Python :: 3.12",
|
|
24
|
+
"Topic :: Text Processing :: Markup :: HTML",
|
|
25
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
26
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
27
|
+
]
|
|
28
|
+
dependencies = [
|
|
29
|
+
"requests>=2.28.0",
|
|
30
|
+
"beautifulsoup4>=4.12.0",
|
|
31
|
+
"html-to-markdown>=1.1.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
playwright = [
|
|
36
|
+
"playwright>=1.40.0",
|
|
37
|
+
]
|
|
38
|
+
dev = [
|
|
39
|
+
"pytest>=7.0.0",
|
|
40
|
+
"build>=1.0.0",
|
|
41
|
+
"twine>=4.0.0",
|
|
42
|
+
]
|
|
43
|
+
|
|
44
|
+
[project.urls]
|
|
45
|
+
Homepage = "https://github.com/yourname/pg2md"
|
|
46
|
+
Repository = "https://github.com/yourname/pg2md"
|
|
47
|
+
Issues = "https://github.com/yourname/pg2md/issues"
|
|
48
|
+
|
|
49
|
+
[tool.setuptools.packages.find]
|
|
50
|
+
where = ["."]
|
|
51
|
+
include = ["pg2md"]
|
pg2md-1.0.1/setup.cfg
ADDED