pg2md 1.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pg2md-1.0.1/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pg2md-1.0.1/PKG-INFO ADDED
@@ -0,0 +1,302 @@
1
+ Metadata-Version: 2.4
2
+ Name: pg2md
3
+ Version: 1.0.1
4
+ Summary: HTML to Markdown converter with Requests or Playwright backend
5
+ Author-email: Your Name <your@email.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/yourname/pg2md
8
+ Project-URL: Repository, https://github.com/yourname/pg2md
9
+ Project-URL: Issues, https://github.com/yourname/pg2md/issues
10
+ Keywords: html,markdown,converter,playwright,requests,scraper
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Text Processing :: Markup :: HTML
19
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: requests>=2.28.0
25
+ Requires-Dist: beautifulsoup4>=4.12.0
26
+ Requires-Dist: html-to-markdown>=1.1.0
27
+ Provides-Extra: playwright
28
+ Requires-Dist: playwright>=1.40.0; extra == "playwright"
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
31
+ Requires-Dist: build>=1.0.0; extra == "dev"
32
+ Requires-Dist: twine>=4.0.0; extra == "dev"
33
+ Dynamic: license-file
34
+
35
+ # pg2md
36
+
37
+ **HTML to Markdown converter** with Requests or Playwright backend.
38
+
39
+ Convert any webpage to clean Markdown. Choose between fast `requests` or full browser `playwright` for JavaScript-rendered pages.
40
+
41
+ ## Features
42
+
43
+ - **Two backends**: `Pg2MdRequests` (fast) or `Pg2MdPlaywright` (JS support)
44
+ - **Browser reuse**: Playwright instances share a single browser
45
+ - **Proxy support**: HTTP/HTTPS proxies with authentication
46
+ - **Custom headers & cookies**: Full control over requests
47
+ - **Clean output**: Optional removal of images and links
48
+ - **Context manager**: Auto-cleanup with `with` statement
49
+
50
+ ## Installation
51
+
52
+ ```bash
53
+ pip install pg2md
54
+
55
+ # For Playwright backend:
56
+ pip install pg2md[playwright]
57
+ playwright install chromium
58
+ ```
59
+
60
+ ## Quick Start
61
+
62
+ ```python
63
+ from pg2md import Pg2MdRequests, Pg2MdPlaywright
64
+
65
+ # Simple usage with Requests
66
+ pg = Pg2MdRequests()
67
+ markdown = pg.run("https://example.com")
68
+ print(markdown)
69
+
70
+ # Playwright for JS-heavy sites
71
+ pg = Pg2MdPlaywright()
72
+ markdown = pg.run("https://spa-example.com")
73
+ pg.close()
74
+ ```
75
+
76
+ ## Usage
77
+
78
+ ### Basic Conversion
79
+
80
+ ```python
81
+ from pg2md import Pg2MdRequests
82
+
83
+ pg = Pg2MdRequests(with_image=False, with_link=False)
84
+ md = pg.run("https://news.ycombinator.com")
85
+ ```
86
+
87
+ ### With Proxy
88
+
89
+ ```python
90
+ from pg2md import Pg2MdRequests, Pg2MdPlaywright
91
+
92
+ # Format: http://user:password@host:port
93
+ # Or: host:port:user:password
94
+ proxy = "http://user:pass@proxy.example.com:8080"
95
+
96
+ # Requests
97
+ pg = Pg2MdRequests()
98
+ md = pg.run("https://example.com", proxy=proxy)
99
+
100
+ # Playwright
101
+ pg = Pg2MdPlaywright()
102
+ md = pg.run("https://example.com", proxy=proxy)
103
+ pg.close()
104
+ ```
105
+
106
+ ### Custom Headers & User-Agent
107
+
108
+ ```python
109
+ from pg2md import Pg2MdRequests
110
+
111
+ pg = Pg2MdRequests()
112
+ md = pg.run(
113
+ "https://api.example.com/data",
114
+ headers={
115
+ "X-API-Key": "secret123",
116
+ "Accept": "application/json",
117
+ },
118
+ user_agent="MyBot/1.0",
119
+ )
120
+ ```
121
+
122
+ ### With Cookies
123
+
124
+ ```python
125
+ from pg2md import Pg2MdRequests
126
+
127
+ pg = Pg2MdRequests()
128
+ md = pg.run(
129
+ "https://example.com/dashboard",
130
+ cookies={
131
+ "session": "abc123",
132
+ "auth_token": "xyz789",
133
+ },
134
+ )
135
+ ```
136
+
137
+ ### Save to File
138
+
139
+ ```python
140
+ from pg2md import Pg2MdRequests
141
+
142
+ pg = Pg2MdRequests()
143
+ pg.save("output.md", "https://example.com")
144
+
145
+ # With options
146
+ pg.save(
147
+ "article.md",
148
+ "https://blog.example.com/post",
149
+ proxy="http://user:pass@host:port",
150
+ user_agent="MyBot/1.0",
151
+ )
152
+ ```
153
+
154
+ ### Context Manager
155
+
156
+ ```python
157
+ from pg2md import Pg2MdPlaywright
158
+
159
+ with Pg2MdPlaywright() as pg:
160
+     md1 = pg.run("https://site1.com")
161
+     md2 = pg.run("https://site2.com")
162
+ # Browser closed automatically
163
+ ```
164
+
165
+ ### Multiple Instances
166
+
167
+ ```python
168
+ from pg2md import Pg2MdPlaywright
169
+
170
+ # Both share the same browser (efficient)
171
+ pg1 = Pg2MdPlaywright()
172
+ pg2 = Pg2MdPlaywright()
173
+
174
+ md1 = pg1.run("https://site1.com")
175
+ md2 = pg2.run("https://site2.com")
176
+
177
+ Pg2MdPlaywright.close_all() # Close shared browser
178
+ ```
179
+
180
+ ## API Reference
181
+
182
+ ### Pg2MdRequests
183
+
184
+ ```python
185
+ Pg2MdRequests(with_image=False, with_link=False)
186
+ ```
187
+
188
+ | Parameter | Type | Default | Description |
189
+ |-----------|------|---------|-------------|
190
+ | `with_image` | bool | False | Include images in output |
191
+ | `with_link` | bool | False | Include links in output |
192
+
193
+ ### Pg2MdPlaywright
194
+
195
+ ```python
196
+ Pg2MdPlaywright(
197
+ browser=None, # Custom Browser instance
198
+ headless=True, # Headless mode
199
+ with_image=False,
200
+ with_link=False,
201
+ )
202
+ ```
203
+
204
+ ### Methods
205
+
206
+ #### `run(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
207
+
208
+ Fetch URL and convert to Markdown.
209
+
210
+ Returns: `str` (Markdown)
211
+
212
+ #### `fetch(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
213
+
214
+ Fetch HTML only.
215
+
216
+ Returns: `str` (HTML)
217
+
218
+ #### `convert(html)`
219
+
220
+ Convert HTML to Markdown.
221
+
222
+ Returns: `str` (Markdown)
223
+
224
+ #### `save(filepath, url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
225
+
226
+ Fetch, convert, and save to file.
227
+
228
+ #### `close()`
229
+
230
+ Close browser (Playwright only).
231
+
232
+ #### `close_all()` (classmethod, Playwright only)
233
+
234
+ Close all shared browsers.
235
+
236
+ ## When to Use Which Backend?
237
+
238
+ | Use Requests | Use Playwright |
239
+ |--------------|----------------|
240
+ | Static HTML pages | SPA / JavaScript apps |
241
+ | Speed matters | Need rendered content |
242
+ | Simple scraping | Bypass anti-bot (sometimes) |
243
+ | Low memory | Modern web apps |
244
+
245
+ ## Examples
246
+
247
+ ### Scrape Multiple URLs
248
+
249
+ ```python
250
+ from pg2md import Pg2MdRequests
251
+
252
+ urls = [
253
+ "https://blog.example.com/post1",
254
+ "https://blog.example.com/post2",
255
+ "https://blog.example.com/post3",
256
+ ]
257
+
258
+ pg = Pg2MdRequests(with_image=False, with_link=False)
259
+
260
+ for i, url in enumerate(urls):
261
+     pg.save(f"post_{i+1}.md", url)
262
+     print(f"Saved: {url}")
263
+ ```
264
+
265
+ ### Batch with Proxies
266
+
267
+ ```python
268
+ from pg2md import Pg2MdRequests
269
+
270
+ urls = ["https://site1.com", "https://site2.com", "https://site3.com"]
271
+ proxies = [
272
+ "http://user1:pass1@proxy1:8080",
273
+ "http://user2:pass2@proxy2:8080",
274
+ ]
275
+
276
+ pg = Pg2MdRequests()
277
+
278
+ for i, url in enumerate(urls):
279
+     proxy = proxies[i % len(proxies)]
280
+     md = pg.run(url, proxy=proxy)
281
+     print(f"[{i+1}] {len(md)} chars")
282
+ ```
283
+
284
+ ### Extract Article Content
285
+
286
+ ```python
287
+ from pg2md import Pg2MdPlaywright
288
+
289
+ with Pg2MdPlaywright() as pg:
290
+     md = pg.run(
291
+         "https://medium.com/some-article",
292
+         user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
293
+     )
294
+
295
+ # Save clean text
296
+ with open("article.md", "w") as f:
297
+     f.write(md)
298
+ ```
299
+
300
+ ## License
301
+
302
+ MIT
pg2md-1.0.1/README.md ADDED
@@ -0,0 +1,268 @@
1
+ # pg2md
2
+
3
+ **HTML to Markdown converter** with Requests or Playwright backend.
4
+
5
+ Convert any webpage to clean Markdown. Choose between fast `requests` or full browser `playwright` for JavaScript-rendered pages.
6
+
7
+ ## Features
8
+
9
+ - **Two backends**: `Pg2MdRequests` (fast) or `Pg2MdPlaywright` (JS support)
10
+ - **Browser reuse**: Playwright instances share a single browser
11
+ - **Proxy support**: HTTP/HTTPS proxies with authentication
12
+ - **Custom headers & cookies**: Full control over requests
13
+ - **Clean output**: Optional removal of images and links
14
+ - **Context manager**: Auto-cleanup with `with` statement
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ pip install pg2md
20
+
21
+ # For Playwright backend:
22
+ pip install pg2md[playwright]
23
+ playwright install chromium
24
+ ```
25
+
26
+ ## Quick Start
27
+
28
+ ```python
29
+ from pg2md import Pg2MdRequests, Pg2MdPlaywright
30
+
31
+ # Simple usage with Requests
32
+ pg = Pg2MdRequests()
33
+ markdown = pg.run("https://example.com")
34
+ print(markdown)
35
+
36
+ # Playwright for JS-heavy sites
37
+ pg = Pg2MdPlaywright()
38
+ markdown = pg.run("https://spa-example.com")
39
+ pg.close()
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ### Basic Conversion
45
+
46
+ ```python
47
+ from pg2md import Pg2MdRequests
48
+
49
+ pg = Pg2MdRequests(with_image=False, with_link=False)
50
+ md = pg.run("https://news.ycombinator.com")
51
+ ```
52
+
53
+ ### With Proxy
54
+
55
+ ```python
56
+ from pg2md import Pg2MdRequests, Pg2MdPlaywright
57
+
58
+ # Format: http://user:password@host:port
59
+ # Or: host:port:user:password
60
+ proxy = "http://user:pass@proxy.example.com:8080"
61
+
62
+ # Requests
63
+ pg = Pg2MdRequests()
64
+ md = pg.run("https://example.com", proxy=proxy)
65
+
66
+ # Playwright
67
+ pg = Pg2MdPlaywright()
68
+ md = pg.run("https://example.com", proxy=proxy)
69
+ pg.close()
70
+ ```
71
+
72
+ ### Custom Headers & User-Agent
73
+
74
+ ```python
75
+ from pg2md import Pg2MdRequests
76
+
77
+ pg = Pg2MdRequests()
78
+ md = pg.run(
79
+ "https://api.example.com/data",
80
+ headers={
81
+ "X-API-Key": "secret123",
82
+ "Accept": "application/json",
83
+ },
84
+ user_agent="MyBot/1.0",
85
+ )
86
+ ```
87
+
88
+ ### With Cookies
89
+
90
+ ```python
91
+ from pg2md import Pg2MdRequests
92
+
93
+ pg = Pg2MdRequests()
94
+ md = pg.run(
95
+ "https://example.com/dashboard",
96
+ cookies={
97
+ "session": "abc123",
98
+ "auth_token": "xyz789",
99
+ },
100
+ )
101
+ ```
102
+
103
+ ### Save to File
104
+
105
+ ```python
106
+ from pg2md import Pg2MdRequests
107
+
108
+ pg = Pg2MdRequests()
109
+ pg.save("output.md", "https://example.com")
110
+
111
+ # With options
112
+ pg.save(
113
+ "article.md",
114
+ "https://blog.example.com/post",
115
+ proxy="http://user:pass@host:port",
116
+ user_agent="MyBot/1.0",
117
+ )
118
+ ```
119
+
120
+ ### Context Manager
121
+
122
+ ```python
123
+ from pg2md import Pg2MdPlaywright
124
+
125
+ with Pg2MdPlaywright() as pg:
126
+     md1 = pg.run("https://site1.com")
127
+     md2 = pg.run("https://site2.com")
128
+ # Browser closed automatically
129
+ ```
130
+
131
+ ### Multiple Instances
132
+
133
+ ```python
134
+ from pg2md import Pg2MdPlaywright
135
+
136
+ # Both share the same browser (efficient)
137
+ pg1 = Pg2MdPlaywright()
138
+ pg2 = Pg2MdPlaywright()
139
+
140
+ md1 = pg1.run("https://site1.com")
141
+ md2 = pg2.run("https://site2.com")
142
+
143
+ Pg2MdPlaywright.close_all() # Close shared browser
144
+ ```
145
+
146
+ ## API Reference
147
+
148
+ ### Pg2MdRequests
149
+
150
+ ```python
151
+ Pg2MdRequests(with_image=False, with_link=False)
152
+ ```
153
+
154
+ | Parameter | Type | Default | Description |
155
+ |-----------|------|---------|-------------|
156
+ | `with_image` | bool | False | Include images in output |
157
+ | `with_link` | bool | False | Include links in output |
158
+
159
+ ### Pg2MdPlaywright
160
+
161
+ ```python
162
+ Pg2MdPlaywright(
163
+ browser=None, # Custom Browser instance
164
+ headless=True, # Headless mode
165
+ with_image=False,
166
+ with_link=False,
167
+ )
168
+ ```
169
+
170
+ ### Methods
171
+
172
+ #### `run(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
173
+
174
+ Fetch URL and convert to Markdown.
175
+
176
+ Returns: `str` (Markdown)
177
+
178
+ #### `fetch(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
179
+
180
+ Fetch HTML only.
181
+
182
+ Returns: `str` (HTML)
183
+
184
+ #### `convert(html)`
185
+
186
+ Convert HTML to Markdown.
187
+
188
+ Returns: `str` (Markdown)
189
+
190
+ #### `save(filepath, url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
191
+
192
+ Fetch, convert, and save to file.
193
+
194
+ #### `close()`
195
+
196
+ Close browser (Playwright only).
197
+
198
+ #### `close_all()` (classmethod, Playwright only)
199
+
200
+ Close all shared browsers.
201
+
202
+ ## When to Use Which Backend?
203
+
204
+ | Use Requests | Use Playwright |
205
+ |--------------|----------------|
206
+ | Static HTML pages | SPA / JavaScript apps |
207
+ | Speed matters | Need rendered content |
208
+ | Simple scraping | Bypass anti-bot (sometimes) |
209
+ | Low memory | Modern web apps |
210
+
211
+ ## Examples
212
+
213
+ ### Scrape Multiple URLs
214
+
215
+ ```python
216
+ from pg2md import Pg2MdRequests
217
+
218
+ urls = [
219
+ "https://blog.example.com/post1",
220
+ "https://blog.example.com/post2",
221
+ "https://blog.example.com/post3",
222
+ ]
223
+
224
+ pg = Pg2MdRequests(with_image=False, with_link=False)
225
+
226
+ for i, url in enumerate(urls):
227
+     pg.save(f"post_{i+1}.md", url)
228
+     print(f"Saved: {url}")
229
+ ```
230
+
231
+ ### Batch with Proxies
232
+
233
+ ```python
234
+ from pg2md import Pg2MdRequests
235
+
236
+ urls = ["https://site1.com", "https://site2.com", "https://site3.com"]
237
+ proxies = [
238
+ "http://user1:pass1@proxy1:8080",
239
+ "http://user2:pass2@proxy2:8080",
240
+ ]
241
+
242
+ pg = Pg2MdRequests()
243
+
244
+ for i, url in enumerate(urls):
245
+     proxy = proxies[i % len(proxies)]
246
+     md = pg.run(url, proxy=proxy)
247
+     print(f"[{i+1}] {len(md)} chars")
248
+ ```
249
+
250
+ ### Extract Article Content
251
+
252
+ ```python
253
+ from pg2md import Pg2MdPlaywright
254
+
255
+ with Pg2MdPlaywright() as pg:
256
+     md = pg.run(
257
+         "https://medium.com/some-article",
258
+         user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
259
+     )
260
+
261
+ # Save clean text
262
+ with open("article.md", "w") as f:
263
+     f.write(md)
264
+ ```
265
+
266
+ ## License
267
+
268
+ MIT
@@ -0,0 +1,257 @@
1
+ """
2
+ Pg2Md — HTML to Markdown converter with Requests or Playwright backend.
3
+
4
+ Usage:
5
+ from pg2md import Pg2MdRequests, Pg2MdPlaywright
6
+
7
+ # Requests
8
+ pg = Pg2MdRequests(with_image=False, with_link=False)
9
+ md = pg.run("https://example.com", proxy="http://user:pass@host:port")
10
+
11
+ # Playwright
12
+ pg = Pg2MdPlaywright()
13
+ md = pg.run("https://example.com")
14
+ pg.close()
15
+ """
16
+
17
+ from abc import ABC, abstractmethod
18
+ from pathlib import Path
19
+ from typing import Optional
20
+ from urllib.parse import urlparse
21
+
22
+ from requests import Session
23
+ from playwright.sync_api import sync_playwright, Browser
24
+
25
+ from .html_to_md import HtmlToMarkdown
26
+
27
+
28
class Pg2Md(ABC):
    """Abstract fetch-then-convert pipeline shared by every backend.

    A subclass only has to implement :meth:`fetch`; turning the fetched HTML
    into Markdown is delegated to the ``HtmlToMarkdown`` instance created in
    the constructor.
    """

    def __init__(self, with_image: bool = False, with_link: bool = False):
        # Both flags are handed to the converter unchanged.
        self._converter = HtmlToMarkdown(with_image=with_image, with_link=with_link)

    @abstractmethod
    def fetch(
        self,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> str:
        """Return the HTML found at *url*; each backend provides its own version."""
        ...

    def convert(self, html: str) -> str:
        """Turn an HTML string into Markdown using the configured converter."""
        markdown = self._converter.convert(html)
        return markdown

    def run(
        self,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> str:
        """Download *url* with :meth:`fetch` and return its Markdown rendering."""
        # Arguments are forwarded positionally, in the order fetch() declares them.
        return self.convert(
            self.fetch(url, proxy, headers, cookies, user_agent, timeout)
        )

    def save(
        self,
        filepath: str,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> None:
        """Run the pipeline for *url* and write the Markdown to *filepath* as UTF-8."""
        markdown = self.run(url, proxy, headers, cookies, user_agent, timeout)
        target = Path(filepath)
        target.write_text(markdown, encoding="utf-8")

    def close(self):
        """Release backend resources; a no-op here, overridden where needed."""
        return None

    def __enter__(self):
        # The instance itself is the context object.
        return self

    def __exit__(self, *exc_info):
        # Always clean up; returns None, so exceptions are not suppressed.
        self.close()
90
+
91
+
92
class Pg2MdRequests(Pg2Md):
    """Backend that downloads pages with ``requests`` (no JavaScript execution)."""

    def fetch(
        self,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> str:
        """Fetch *url* with a one-off ``requests`` session and return the body text.

        Args:
            url: Address to download.
            proxy: Proxy as a URL or as ``host:port:user:password``; applied to
                both http and https traffic.
            headers: Extra request headers (copied, never mutated).
            cookies: Cookies passed straight to ``Session.get``.
            user_agent: When given, overrides any ``User-Agent`` in *headers*.
            timeout: Timeout passed to ``Session.get``.

        Returns:
            The response body as text.

        Raises:
            requests.HTTPError: via ``raise_for_status()`` on a 4xx/5xx status.
        """
        final_headers = dict(headers) if headers else {}
        if user_agent:
            final_headers["User-Agent"] = user_agent

        proxies = None
        if proxy:
            proxy_url = self._normalize_proxy(proxy)
            proxies = {"http": proxy_url, "https": proxy_url}

        # The session is a context manager: leaving the block closes its
        # connection pool even when the request or the status check raises.
        with Session() as session:
            resp = session.get(
                url,
                proxies=proxies,
                headers=final_headers or None,
                cookies=cookies,
                timeout=timeout,
            )
            resp.raise_for_status()
            return resp.text

    def _normalize_proxy(self, proxy: str) -> str:
        """Normalize a proxy string to URL form.

        * A value that already carries a scheme (``http://``, ``https://``,
          ``socks5://`` ...) is returned untouched.
        * ``host:port:user:password`` becomes ``http://user:password@host:port``.
        * Anything else is prefixed with ``http://``.
        """
        if "://" in proxy:
            return proxy

        parts = proxy.split(":")
        if len(parts) == 4:
            host, port, user, password = parts
            return f"http://{user}:{password}@{host}:{port}"

        return f"http://{proxy}"
136
+
137
+
138
class Pg2MdPlaywright(Pg2Md):
    """Playwright-based implementation with browser reuse.

    Instances created without an explicit ``browser`` share one Chromium
    process per ``headless`` setting, stored on the class. A browser passed in
    by the caller is used as-is and is never closed by this class.
    """

    # Playwright driver started lazily by _get_playwright(); shared by all instances.
    _shared_playwright = None
    # Launched browsers keyed by ("chromium", headless); intentionally class-level.
    _shared_browsers: dict = {}

    def __init__(
        self,
        browser: Optional[Browser] = None,
        headless: bool = True,
        with_image: bool = False,
        with_link: bool = False,
    ):
        """Store the configuration; no browser is launched until first use.

        Args:
            browser: Existing Playwright browser to use instead of a shared one.
            headless: Launch mode for the shared browser (ignored when
                *browser* is given).
            with_image: Forwarded to the base class converter setup.
            with_link: Forwarded to the base class converter setup.
        """
        super().__init__(with_image, with_link)
        self._browser = browser
        self._headless = headless
        self._owns_browser = browser is None

    @classmethod
    def _get_playwright(cls):
        """Start the shared Playwright driver on first call and return it."""
        if cls._shared_playwright is None:
            cls._shared_playwright = sync_playwright().start()
        return cls._shared_playwright

    @property
    def browser(self) -> Browser:
        """Get or create the browser instance.

        A cached handle to a shared browser is dropped when it is no longer
        connected (another instance's ``close()`` or ``close_all()`` closed
        it), so the next access launches a fresh one instead of failing.
        """
        if (
            self._browser is not None
            and self._owns_browser
            and not self._browser.is_connected()
        ):
            self._browser = None

        if self._browser is None:
            key = ("chromium", self._headless)
            shared = self._shared_browsers.get(key)
            if shared is None or not shared.is_connected():
                pw = self._get_playwright()
                shared = pw.chromium.launch(headless=self._headless)
                self._shared_browsers[key] = shared
            self._browser = shared
        return self._browser

    def fetch(
        self,
        url: str,
        proxy: Optional[str] = None,
        headers: Optional[dict] = None,
        cookies: Optional[dict] = None,
        user_agent: Optional[str] = None,
        timeout: int = 30,
    ) -> str:
        """Open *url* in a fresh browser context and return the page HTML.

        Args:
            url: Address to navigate to.
            proxy: Proxy as a URL or ``host:port[:user:password]``.
            headers: Extra HTTP headers set on the context.
            cookies: ``name -> value`` mapping; each cookie is scoped to the
                hostname of *url* with path ``/``.
            user_agent: User-Agent override for the context.
            timeout: Navigation timeout in seconds (converted to milliseconds
                for ``page.goto``).

        Returns:
            The value of ``page.content()`` after navigation.
        """
        context_opts = {}
        if proxy:
            context_opts["proxy"] = self._parse_proxy(proxy)
        if user_agent:
            context_opts["user_agent"] = user_agent

        context = self.browser.new_context(**context_opts)
        try:
            if headers:
                context.set_extra_http_headers(headers)
            if cookies:
                domain = urlparse(url).hostname
                # Playwright needs either "url" or a "domain"/"path" pair per
                # cookie; a domain on its own is rejected.
                context.add_cookies(
                    [
                        {"name": name, "value": value, "domain": domain, "path": "/"}
                        for name, value in cookies.items()
                    ]
                )

            page = context.new_page()
            page.goto(url, timeout=timeout * 1000)
            return page.content()
        finally:
            # Closing the context also closes its pages; doing it in `finally`
            # avoids leaking a context when navigation fails or times out.
            context.close()

    def _parse_proxy(self, proxy: str) -> dict:
        """Parse a proxy string into Playwright's ``proxy`` option dict.

        Accepted forms:
            * ``scheme://[user[:password]@]host[:port]``
            * ``host:port:user:password``
            * ``host:port`` (or any other string, which is prefixed with
              ``http://``)
        """
        if "://" in proxy:
            parsed = urlparse(proxy)
            server = f"{parsed.scheme}://{parsed.hostname}"
            # Only append the port when the URL has one; otherwise the server
            # string would end in ":None".
            if parsed.port is not None:
                server = f"{server}:{parsed.port}"
            result = {"server": server}
            if parsed.username:
                result["username"] = parsed.username
                if parsed.password:
                    result["password"] = parsed.password
            return result

        parts = proxy.split(":")
        if len(parts) == 4:
            host, port, user, password = parts
            return {
                "server": f"http://{host}:{port}",
                "username": user,
                "password": password,
            }

        return {"server": f"http://{proxy}"}

    def close(self):
        """Close the shared browser for this headless mode, if this instance uses one.

        A browser supplied by the caller is left open. Other instances sharing
        the closed browser relaunch one on their next ``browser`` access.
        """
        if self._owns_browser:
            key = ("chromium", self._headless)
            # Remove the entry first so a failing close() cannot leave a dead
            # browser registered.
            shared = self._shared_browsers.pop(key, None)
            if shared is not None:
                shared.close()
            self._browser = None

    @classmethod
    def close_all(cls):
        """Close all shared browsers and stop the shared Playwright driver."""
        browsers = list(cls._shared_browsers.values())
        cls._shared_browsers.clear()
        try:
            for browser in browsers:
                browser.close()
        finally:
            # Stop the driver even if one of the browsers failed to close.
            if cls._shared_playwright is not None:
                cls._shared_playwright.stop()
                cls._shared_playwright = None
@@ -0,0 +1,168 @@
1
+ """
2
+ HtmlToMarkdown — standalone HTML to Markdown converter.
3
+
4
+ No browser dependencies, just HTML -> Markdown conversion.
5
+
6
+ Dependencies:
7
+ pip install html-to-markdown beautifulsoup4
8
+
9
+ Usage:
10
+ converter = HtmlToMarkdown(with_image=False, with_link=False)
11
+ markdown = converter.convert(html_string)
12
+ """
13
+
14
+ import re
15
+ from typing import Optional
16
+
17
+ from bs4 import BeautifulSoup
18
+ from html_to_markdown import convert, ConversionOptions, PreprocessingOptions
19
+
20
+
21
class HtmlToMarkdown:
    """
    Converts HTML to clean Markdown.

    Steps:
        1. Clean HTML (remove scripts, styles, optional images/links)
        2. Convert to Markdown via html-to-markdown
        3. Clean final Markdown (remove base64, excess newlines)

    Args:
        with_image: Include images in output. Default False.
        with_link: Include links (href). Default True.
            False — links are replaced with their text.
        heading_style: "atx" (#) or "setext" (underline). Default "atx".
        strong_em_symbol: "*" or "_". Default "*".
        bullets: Bullet character. Default "*".
        escape_asterisks: Escape asterisks in text. Default False.
        preprocessing_preset: "aggressive", "moderate", or "conservative". Default "aggressive".
        remove_navigation: Remove navigation elements. Default True.
        remove_forms: Remove form elements. Default True.
    """

    # Elements removed together with everything inside them.
    STRIP_TAGS = [
        "script",
        "style",
        "noscript",
        "svg",
        "canvas",
        "video",
        "audio",
        "iframe",
        "object",
        "embed",
        "head",
    ]

    # URI schemes that embed content instead of pointing at a resource.
    _EMBEDDED_SCHEMES = ("data:", "blob:")
    # Attributes scrubbed of embedded URIs on every element.
    _URI_ATTRS = ("src", "href", "srcset", "poster", "background")

    # A whole line made only of 40+ base64-alphabet characters.
    _BASE64_LINE = re.compile(r"^[A-Za-z0-9+/=]{40,}\s*$", re.MULTILINE)
    # Control characters other than tab, newline and carriage return.
    _BINARY_GARBAGE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
    _EXCESS_NEWLINES = re.compile(r"\n{3,}")
    _MD_IMAGE = re.compile(r"!\[.*?\]\(.*?\)")

    def __init__(
        self,
        with_image: bool = False,
        with_link: bool = True,
        heading_style: str = "atx",
        strong_em_symbol: str = "*",
        bullets: str = "*",
        escape_asterisks: bool = False,
        preprocessing_preset: str = "aggressive",
        remove_navigation: bool = True,
        remove_forms: bool = True,
    ):
        self.with_image = with_image
        self.with_link = with_link
        self.heading_style = heading_style
        self.strong_em_symbol = strong_em_symbol
        self.bullets = bullets
        self.escape_asterisks = escape_asterisks
        self.preprocessing_preset = preprocessing_preset
        self.remove_navigation = remove_navigation
        self.remove_forms = remove_forms

    def convert(self, html: str) -> str:
        """
        Convert HTML to clean Markdown.

        Args:
            html: HTML string

        Returns:
            Clean Markdown string
        """
        clean_html = self._clean_html(html)
        markdown = self._html_to_markdown_lib(clean_html)
        return self._clean_markdown(markdown)

    @classmethod
    def _is_embedded_uri(cls, value) -> bool:
        """Return True when *value* is a ``data:`` or ``blob:`` URI string.

        The scheme is compared case-insensitively and leading whitespace is
        ignored, so ``" DATA:..."`` is detected as well. Non-string values
        (missing or multi-valued attributes) are never considered embedded.
        """
        if not isinstance(value, str):
            return False
        return value.strip().lower().startswith(cls._EMBEDDED_SCHEMES)

    def _clean_html(self, html: str) -> str:
        """Remove unwanted tags and attributes from HTML."""
        soup = BeautifulSoup(html, "html.parser")

        # One pass per tag name: a later search never returns elements that
        # were already removed inside an earlier, enclosing tag.
        for tag in self.STRIP_TAGS:
            for el in soup.find_all(tag):
                el.decompose()

        for el in soup.find_all("img"):
            # Drop every image, or only inline-data images when images are kept.
            if not self.with_image or self._is_embedded_uri(el.get("src", "")):
                el.decompose()

        for el in soup.find_all("a"):
            if not self.with_link:
                # Keep the anchor text, drop the link itself.
                el.replace_with(el.get_text())
            elif self._is_embedded_uri(el.get("href", "")):
                el["href"] = ""

        # Finally strip embedded URIs from URL-carrying attributes everywhere.
        for el in soup.find_all(True):
            for attr in self._URI_ATTRS:
                if self._is_embedded_uri(el.get(attr, "")):
                    del el[attr]

        return str(soup)

    def _html_to_markdown_lib(self, html: str) -> str:
        """Convert HTML to Markdown using html-to-markdown library."""
        options = ConversionOptions(
            heading_style=self.heading_style,
            strong_em_symbol=self.strong_em_symbol,
            bullets=self.bullets,
            escape_asterisks=self.escape_asterisks,
        )
        preprocessing = PreprocessingOptions(
            enabled=True,
            preset=self.preprocessing_preset,
            remove_navigation=self.remove_navigation,
            remove_forms=self.remove_forms,
        )
        return convert(html, options, preprocessing)

    def _clean_markdown(self, text: str) -> str:
        """Final cleanup of Markdown text.

        Removes control characters and base64-looking lines, drops Markdown
        image syntax when images are disabled, collapses runs of three or more
        newlines to two, and strips surrounding whitespace.
        """
        text = self._BINARY_GARBAGE.sub("", text)
        text = self._BASE64_LINE.sub("", text)

        if not self.with_image:
            text = self._MD_IMAGE.sub("", text)

        text = self._EXCESS_NEWLINES.sub("\n\n", text)

        return text.strip()
160
+
161
+
162
if __name__ == "__main__":
    import sys

    # Read the HTML from the file named by the first CLI argument, or from
    # stdin when no argument is given. The file is opened in a `with` block so
    # the handle is closed as soon as its contents have been read.
    if sys.argv[1:]:
        with open(sys.argv[1]) as source:
            html = source.read()
    else:
        html = sys.stdin.read()

    converter = HtmlToMarkdown(with_image=False, with_link=False)
    print(converter.convert(html))
@@ -0,0 +1,302 @@
1
+ Metadata-Version: 2.4
2
+ Name: pg2md
3
+ Version: 1.0.1
4
+ Summary: HTML to Markdown converter with Requests or Playwright backend
5
+ Author-email: Your Name <your@email.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/yourname/pg2md
8
+ Project-URL: Repository, https://github.com/yourname/pg2md
9
+ Project-URL: Issues, https://github.com/yourname/pg2md/issues
10
+ Keywords: html,markdown,converter,playwright,requests,scraper
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Text Processing :: Markup :: HTML
19
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Requires-Python: >=3.10
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: requests>=2.28.0
25
+ Requires-Dist: beautifulsoup4>=4.12.0
26
+ Requires-Dist: html-to-markdown>=1.1.0
27
+ Provides-Extra: playwright
28
+ Requires-Dist: playwright>=1.40.0; extra == "playwright"
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
31
+ Requires-Dist: build>=1.0.0; extra == "dev"
32
+ Requires-Dist: twine>=4.0.0; extra == "dev"
33
+ Dynamic: license-file
34
+
35
+ # pg2md
36
+
37
+ **HTML to Markdown converter** with Requests or Playwright backend.
38
+
39
+ Convert any webpage to clean Markdown. Choose between the fast `requests` backend and the full-browser `playwright` backend for JavaScript-rendered pages.
40
+
41
+ ## Features
42
+
43
+ - **Two backends**: `Pg2MdRequests` (fast) or `Pg2MdPlaywright` (JS support)
44
+ - **Browser reuse**: Playwright instances share a single browser
45
+ - **Proxy support**: HTTP/HTTPS proxies with authentication
46
+ - **Custom headers & cookies**: Full control over requests
47
+ - **Clean output**: Optional removal of images and links
48
+ - **Context manager**: Auto-cleanup with the `with` statement
49
+
50
+ ## Installation
51
+
52
+ ```bash
53
+ pip install pg2md
54
+
55
+ # For Playwright backend:
56
+ pip install pg2md[playwright]
57
+ playwright install chromium
58
+ ```
59
+
60
+ ## Quick Start
61
+
62
+ ```python
63
+ from pg2md import Pg2MdRequests, Pg2MdPlaywright
64
+
65
+ # Simple usage with Requests
66
+ pg = Pg2MdRequests()
67
+ markdown = pg.run("https://example.com")
68
+ print(markdown)
69
+
70
+ # Playwright for JS-heavy sites
71
+ pg = Pg2MdPlaywright()
72
+ markdown = pg.run("https://spa-example.com")
73
+ pg.close()
74
+ ```
75
+
76
+ ## Usage
77
+
78
+ ### Basic Conversion
79
+
80
+ ```python
81
+ from pg2md import Pg2MdRequests
82
+
83
+ pg = Pg2MdRequests(with_image=False, with_link=False)
84
+ md = pg.run("https://news.ycombinator.com")
85
+ ```
86
+
87
+ ### With Proxy
88
+
89
+ ```python
90
+ from pg2md import Pg2MdRequests, Pg2MdPlaywright
91
+
92
+ # Format: http://user:password@host:port
93
+ # Or: host:port:user:password
94
+ proxy = "http://user:pass@proxy.example.com:8080"
95
+
96
+ # Requests
97
+ pg = Pg2MdRequests()
98
+ md = pg.run("https://example.com", proxy=proxy)
99
+
100
+ # Playwright
101
+ pg = Pg2MdPlaywright()
102
+ md = pg.run("https://example.com", proxy=proxy)
103
+ pg.close()
104
+ ```
105
+
106
+ ### Custom Headers & User-Agent
107
+
108
+ ```python
109
+ from pg2md import Pg2MdRequests
110
+
111
+ pg = Pg2MdRequests()
112
+ md = pg.run(
113
+ "https://api.example.com/data",
114
+ headers={
115
+ "X-API-Key": "secret123",
116
+ "Accept": "application/json",
117
+ },
118
+ user_agent="MyBot/1.0",
119
+ )
120
+ ```
121
+
122
+ ### With Cookies
123
+
124
+ ```python
125
+ from pg2md import Pg2MdRequests
126
+
127
+ pg = Pg2MdRequests()
128
+ md = pg.run(
129
+ "https://example.com/dashboard",
130
+ cookies={
131
+ "session": "abc123",
132
+ "auth_token": "xyz789",
133
+ },
134
+ )
135
+ ```
136
+
137
+ ### Save to File
138
+
139
+ ```python
140
+ from pg2md import Pg2MdRequests
141
+
142
+ pg = Pg2MdRequests()
143
+ pg.save("output.md", "https://example.com")
144
+
145
+ # With options
146
+ pg.save(
147
+ "article.md",
148
+ "https://blog.example.com/post",
149
+ proxy="http://user:pass@host:port",
150
+ user_agent="MyBot/1.0",
151
+ )
152
+ ```
153
+
154
+ ### Context Manager
155
+
156
+ ```python
157
+ from pg2md import Pg2MdPlaywright
158
+
159
+ with Pg2MdPlaywright() as pg:
160
+ md1 = pg.run("https://site1.com")
161
+ md2 = pg.run("https://site2.com")
162
+ # Browser closed automatically
163
+ ```
164
+
165
+ ### Multiple Instances
166
+
167
+ ```python
168
+ from pg2md import Pg2MdPlaywright
169
+
170
+ # Both share the same browser (efficient)
171
+ pg1 = Pg2MdPlaywright()
172
+ pg2 = Pg2MdPlaywright()
173
+
174
+ md1 = pg1.run("https://site1.com")
175
+ md2 = pg2.run("https://site2.com")
176
+
177
+ Pg2MdPlaywright.close_all() # Close shared browser
178
+ ```
179
+
180
+ ## API Reference
181
+
182
+ ### Pg2MdRequests
183
+
184
+ ```python
185
+ Pg2MdRequests(with_image=False, with_link=False)
186
+ ```
187
+
188
+ | Parameter | Type | Default | Description |
189
+ |-----------|------|---------|-------------|
190
+ | `with_image` | bool | False | Include images in output |
191
+ | `with_link` | bool | False | Include links in output |
192
+
193
+ ### Pg2MdPlaywright
194
+
195
+ ```python
196
+ Pg2MdPlaywright(
197
+ browser=None, # Custom Browser instance
198
+ headless=True, # Headless mode
199
+ with_image=False,
200
+ with_link=False,
201
+ )
202
+ ```
203
+
204
+ ### Methods
205
+
206
+ #### `run(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
207
+
208
+ Fetch URL and convert to Markdown.
209
+
210
+ Returns: `str` (Markdown)
211
+
212
+ #### `fetch(url, proxy=None, headers=None, cookies=None, user_agent=None, timeout=30)`
213
+
214
+ Fetch HTML only.
215
+
216
+ Returns: `str` (HTML)
217
+
218
+ #### `convert(html)`
219
+
220
+ Convert HTML to Markdown.
221
+
222
+ Returns: `str` (Markdown)
223
+
224
+ #### `save(filepath, url, **kwargs)`
225
+
226
+ Fetch, convert, and save to file.
227
+
228
+ #### `close()`
229
+
230
+ Close browser (Playwright only).
231
+
232
+ #### `close_all()` (classmethod, Playwright only)
233
+
234
+ Close all shared browsers.
235
+
236
+ ## When to Use Which Backend?
237
+
238
+ | Use Requests | Use Playwright |
239
+ |--------------|----------------|
240
+ | Static HTML pages | SPA / JavaScript apps |
241
+ | Speed matters | Need rendered content |
242
+ | Simple scraping | Bypass anti-bot (sometimes) |
243
+ | Low memory | Modern web apps |
244
+
245
+ ## Examples
246
+
247
+ ### Scrape Multiple URLs
248
+
249
+ ```python
250
+ from pg2md import Pg2MdRequests
251
+
252
+ urls = [
253
+ "https://blog.example.com/post1",
254
+ "https://blog.example.com/post2",
255
+ "https://blog.example.com/post3",
256
+ ]
257
+
258
+ pg = Pg2MdRequests(with_image=False, with_link=False)
259
+
260
+ for i, url in enumerate(urls):
261
+ pg.save(f"post_{i+1}.md", url)
262
+ print(f"Saved: {url}")
263
+ ```
264
+
265
+ ### Batch with Proxies
266
+
267
+ ```python
268
+ from pg2md import Pg2MdRequests
269
+
270
+ urls = ["https://site1.com", "https://site2.com", "https://site3.com"]
271
+ proxies = [
272
+ "http://user1:pass1@proxy1:8080",
273
+ "http://user2:pass2@proxy2:8080",
274
+ ]
275
+
276
+ pg = Pg2MdRequests()
277
+
278
+ for i, url in enumerate(urls):
279
+ proxy = proxies[i % len(proxies)]
280
+ md = pg.run(url, proxy=proxy)
281
+ print(f"[{i+1}] {len(md)} chars")
282
+ ```
283
+
284
+ ### Extract Article Content
285
+
286
+ ```python
287
+ from pg2md import Pg2MdPlaywright
288
+
289
+ with Pg2MdPlaywright() as pg:
290
+ md = pg.run(
291
+ "https://medium.com/some-article",
292
+ user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
293
+ )
294
+
295
+ # Save clean text
296
+ with open("article.md", "w") as f:
297
+ f.write(md)
298
+ ```
299
+
300
+ ## License
301
+
302
+ MIT
@@ -0,0 +1,10 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ pg2md/__init__.py
5
+ pg2md/html_to_md.py
6
+ pg2md.egg-info/PKG-INFO
7
+ pg2md.egg-info/SOURCES.txt
8
+ pg2md.egg-info/dependency_links.txt
9
+ pg2md.egg-info/requires.txt
10
+ pg2md.egg-info/top_level.txt
@@ -0,0 +1,11 @@
1
+ requests>=2.28.0
2
+ beautifulsoup4>=4.12.0
3
+ html-to-markdown>=1.1.0
4
+
5
+ [dev]
6
+ pytest>=7.0.0
7
+ build>=1.0.0
8
+ twine>=4.0.0
9
+
10
+ [playwright]
11
+ playwright>=1.40.0
@@ -0,0 +1 @@
1
+ pg2md
@@ -0,0 +1,51 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pg2md"
7
+ version = "1.0.1"
8
+ description = "HTML to Markdown converter with Requests or Playwright backend"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ {name = "Your Name", email = "your@email.com"}
14
+ ]
15
+ keywords = ["html", "markdown", "converter", "playwright", "requests", "scraper"]
16
+ classifiers = [
17
+ "Development Status :: 4 - Beta",
18
+ "Intended Audience :: Developers",
19
+ "License :: OSI Approved :: MIT License",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.10",
22
+ "Programming Language :: Python :: 3.11",
23
+ "Programming Language :: Python :: 3.12",
24
+ "Topic :: Text Processing :: Markup :: HTML",
25
+ "Topic :: Text Processing :: Markup :: Markdown",
26
+ "Topic :: Software Development :: Libraries :: Python Modules",
27
+ ]
28
+ dependencies = [
29
+ "requests>=2.28.0",
30
+ "beautifulsoup4>=4.12.0",
31
+ "html-to-markdown>=1.1.0",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ playwright = [
36
+ "playwright>=1.40.0",
37
+ ]
38
+ dev = [
39
+ "pytest>=7.0.0",
40
+ "build>=1.0.0",
41
+ "twine>=4.0.0",
42
+ ]
43
+
44
+ [project.urls]
45
+ Homepage = "https://github.com/yourname/pg2md"
46
+ Repository = "https://github.com/yourname/pg2md"
47
+ Issues = "https://github.com/yourname/pg2md/issues"
48
+
49
+ [tool.setuptools.packages.find]
50
+ where = ["."]
51
+ include = ["pg2md"]
pg2md-1.0.1/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+