markgrab 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markgrab-0.1.0/LICENSE +21 -0
- markgrab-0.1.0/PKG-INFO +179 -0
- markgrab-0.1.0/README.md +138 -0
- markgrab-0.1.0/markgrab/__init__.py +7 -0
- markgrab-0.1.0/markgrab/__main__.py +70 -0
- markgrab-0.1.0/markgrab/anti_bot/__init__.py +0 -0
- markgrab-0.1.0/markgrab/anti_bot/stealth.py +45 -0
- markgrab-0.1.0/markgrab/core.py +196 -0
- markgrab-0.1.0/markgrab/engine/__init__.py +7 -0
- markgrab-0.1.0/markgrab/engine/base.py +42 -0
- markgrab-0.1.0/markgrab/engine/browser.py +72 -0
- markgrab-0.1.0/markgrab/engine/http.py +37 -0
- markgrab-0.1.0/markgrab/filter/__init__.py +7 -0
- markgrab-0.1.0/markgrab/filter/density.py +79 -0
- markgrab-0.1.0/markgrab/filter/noise.py +42 -0
- markgrab-0.1.0/markgrab/filter/truncate.py +33 -0
- markgrab-0.1.0/markgrab/output/__init__.py +0 -0
- markgrab-0.1.0/markgrab/parser/__init__.py +9 -0
- markgrab-0.1.0/markgrab/parser/base.py +13 -0
- markgrab-0.1.0/markgrab/parser/docx.py +87 -0
- markgrab-0.1.0/markgrab/parser/html.py +120 -0
- markgrab-0.1.0/markgrab/parser/pdf.py +66 -0
- markgrab-0.1.0/markgrab/parser/youtube.py +107 -0
- markgrab-0.1.0/markgrab/result.py +17 -0
- markgrab-0.1.0/markgrab/utils.py +28 -0
- markgrab-0.1.0/markgrab.egg-info/PKG-INFO +179 -0
- markgrab-0.1.0/markgrab.egg-info/SOURCES.txt +43 -0
- markgrab-0.1.0/markgrab.egg-info/dependency_links.txt +1 -0
- markgrab-0.1.0/markgrab.egg-info/entry_points.txt +2 -0
- markgrab-0.1.0/markgrab.egg-info/requires.txt +26 -0
- markgrab-0.1.0/markgrab.egg-info/top_level.txt +1 -0
- markgrab-0.1.0/pyproject.toml +64 -0
- markgrab-0.1.0/setup.cfg +4 -0
- markgrab-0.1.0/tests/test_browser_engine.py +150 -0
- markgrab-0.1.0/tests/test_cli.py +95 -0
- markgrab-0.1.0/tests/test_density_filter.py +182 -0
- markgrab-0.1.0/tests/test_docx_parser.py +149 -0
- markgrab-0.1.0/tests/test_extract.py +205 -0
- markgrab-0.1.0/tests/test_fallback.py +173 -0
- markgrab-0.1.0/tests/test_html_parser.py +206 -0
- markgrab-0.1.0/tests/test_http_engine.py +82 -0
- markgrab-0.1.0/tests/test_pdf_parser.py +125 -0
- markgrab-0.1.0/tests/test_result.py +38 -0
- markgrab-0.1.0/tests/test_truncate.py +76 -0
- markgrab-0.1.0/tests/test_youtube_parser.py +139 -0
markgrab-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 hmj
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
markgrab-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: markgrab
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Universal web content extraction — URL to LLM-ready markdown
|
|
5
|
+
Author: hmj
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/QuartzUnit/markgrab
|
|
8
|
+
Project-URL: Repository, https://github.com/QuartzUnit/markgrab
|
|
9
|
+
Project-URL: Issues, https://github.com/QuartzUnit/markgrab/issues
|
|
10
|
+
Keywords: web-scraping,content-extraction,markdown,llm,rag
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
15
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
16
|
+
Classifier: Typing :: Typed
|
|
17
|
+
Requires-Python: >=3.12
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: httpx>=0.28
|
|
21
|
+
Requires-Dist: beautifulsoup4>=4.13
|
|
22
|
+
Requires-Dist: markdownify>=0.14
|
|
23
|
+
Provides-Extra: browser
|
|
24
|
+
Requires-Dist: playwright>=1.49; extra == "browser"
|
|
25
|
+
Provides-Extra: youtube
|
|
26
|
+
Requires-Dist: youtube-transcript-api>=1.0; extra == "youtube"
|
|
27
|
+
Provides-Extra: pdf
|
|
28
|
+
Requires-Dist: pdfplumber>=0.11; extra == "pdf"
|
|
29
|
+
Provides-Extra: docx
|
|
30
|
+
Requires-Dist: python-docx>=1.1; extra == "docx"
|
|
31
|
+
Provides-Extra: all
|
|
32
|
+
Requires-Dist: playwright>=1.49; extra == "all"
|
|
33
|
+
Requires-Dist: youtube-transcript-api>=1.0; extra == "all"
|
|
34
|
+
Requires-Dist: pdfplumber>=0.11; extra == "all"
|
|
35
|
+
Requires-Dist: python-docx>=1.1; extra == "all"
|
|
36
|
+
Provides-Extra: dev
|
|
37
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
38
|
+
Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
|
|
39
|
+
Requires-Dist: ruff>=0.9; extra == "dev"
|
|
40
|
+
Dynamic: license-file
|
|
41
|
+
|
|
42
|
+
# MarkGrab
|
|
43
|
+
|
|
44
|
+
> [한국어 문서](README.ko.md)
|
|
45
|
+
|
|
46
|
+
Universal web content extraction — any URL to LLM-ready markdown.
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from markgrab import extract
|
|
50
|
+
|
|
51
|
+
result = await extract("https://example.com/article")
|
|
52
|
+
print(result.markdown) # clean markdown
|
|
53
|
+
print(result.title) # "Article Title"
|
|
54
|
+
print(result.word_count) # 1234
|
|
55
|
+
print(result.language) # "en"
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## Features
|
|
59
|
+
|
|
60
|
+
- **HTML** — BeautifulSoup + content density filtering (removes nav, sidebar, ads)
|
|
61
|
+
- **YouTube** — transcript extraction with timestamps
|
|
62
|
+
- **PDF** — text extraction with page structure
|
|
63
|
+
- **DOCX** — paragraph and heading extraction
|
|
64
|
+
- **Auto-fallback** — tries lightweight httpx first, falls back to Playwright for JS-heavy pages
|
|
65
|
+
- **Async-first** — built on httpx and Playwright async APIs
|
|
66
|
+
|
|
67
|
+
## Install
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install markgrab
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Optional extras for specific content types:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
pip install "markgrab[browser]" # Playwright for JS-rendered pages
|
|
77
|
+
pip install "markgrab[youtube]" # YouTube transcript extraction
|
|
78
|
+
pip install "markgrab[pdf]" # PDF text extraction
|
|
79
|
+
pip install "markgrab[docx]" # DOCX text extraction
|
|
80
|
+
pip install "markgrab[all]" # everything
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Usage
|
|
84
|
+
|
|
85
|
+
### Python API
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
import asyncio
|
|
89
|
+
from markgrab import extract
|
|
90
|
+
|
|
91
|
+
async def main():
|
|
92
|
+
# HTML (auto-detects content type)
|
|
93
|
+
result = await extract("https://example.com/article")
|
|
94
|
+
|
|
95
|
+
# YouTube transcript
|
|
96
|
+
result = await extract("https://youtube.com/watch?v=dQw4w9WgXcQ")
|
|
97
|
+
|
|
98
|
+
# PDF
|
|
99
|
+
result = await extract("https://arxiv.org/pdf/1706.03762")
|
|
100
|
+
|
|
101
|
+
# Options
|
|
102
|
+
result = await extract(
|
|
103
|
+
"https://example.com",
|
|
104
|
+
max_chars=30_000, # limit output length (default: 50K)
|
|
105
|
+
use_browser=True, # force Playwright rendering
|
|
106
|
+
stealth=True, # anti-bot stealth scripts (opt-in)
|
|
107
|
+
timeout=60.0, # request timeout in seconds
|
|
108
|
+
proxy="http://proxy:8080",
|
|
109
|
+
)
|
|
110
|
+
|
|
111
|
+
asyncio.run(main())
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
### CLI
|
|
115
|
+
|
|
116
|
+
```bash
|
|
117
|
+
markgrab https://example.com # markdown output
|
|
118
|
+
markgrab https://example.com -f text # plain text
|
|
119
|
+
markgrab https://example.com -f json # structured JSON
|
|
120
|
+
markgrab https://example.com --browser # force browser rendering
|
|
121
|
+
markgrab https://example.com --max-chars 10000 # limit output
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### ExtractResult
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
result.title # page title
|
|
128
|
+
result.text # plain text
|
|
129
|
+
result.markdown # LLM-ready markdown
|
|
130
|
+
result.word_count # word count
|
|
131
|
+
result.language # detected language ("en", "ko", ...)
|
|
132
|
+
result.content_type # "article", "video", "pdf", "docx"
|
|
133
|
+
result.source_url # final URL (after redirects)
|
|
134
|
+
result.metadata # extra metadata (video_id, page_count, etc.)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## How it works
|
|
138
|
+
|
|
139
|
+
```
|
|
140
|
+
markgrab.extract(url)
|
|
141
|
+
1. Detect content type (URL pattern)
|
|
142
|
+
2. Fetch content (httpx first, Playwright fallback)
|
|
143
|
+
3. Parse (HTML/YouTube/PDF/DOCX)
|
|
144
|
+
4. Filter (noise removal + content density + truncation)
|
|
145
|
+
5. Return ExtractResult
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
For HTML pages, if the initial httpx fetch yields fewer than 50 words, MarkGrab automatically retries with Playwright to handle JavaScript-rendered content.
|
|
149
|
+
|
|
150
|
+
## Disclaimer
|
|
151
|
+
|
|
152
|
+
**This software is provided for legitimate purposes only.** By using MarkGrab, you agree to the following:
|
|
153
|
+
|
|
154
|
+
- **robots.txt**: MarkGrab does **not** check or enforce `robots.txt`. Users are solely responsible for checking and respecting `robots.txt` directives and the terms of service of any website they access.
|
|
155
|
+
|
|
156
|
+
- **Rate limiting**: MarkGrab does **not** include built-in rate limiting or request throttling. Users must implement their own rate limiting to avoid overloading target servers. Abusive request patterns may violate applicable laws and website terms of service.
|
|
157
|
+
|
|
158
|
+
- **YouTube transcripts**: YouTube transcript extraction relies on the third-party `youtube-transcript-api` library, which uses YouTube's internal (unofficial) caption API. This may not comply with YouTube's Terms of Service. Use at your own discretion and risk.
|
|
159
|
+
|
|
160
|
+
- **Stealth mode**: The optional `stealth=True` feature modifies browser fingerprinting signals to reduce bot detection. This feature is intended for legitimate use cases such as testing, research, and accessing content that is publicly available to regular browser users. Users are responsible for ensuring their use complies with applicable laws and the terms of service of target websites.
|
|
161
|
+
|
|
162
|
+
- **Legal compliance**: Users are responsible for ensuring that their use of MarkGrab complies with all applicable laws, including but not limited to the Computer Fraud and Abuse Act (CFAA), the Digital Millennium Copyright Act (DMCA), GDPR, and equivalent legislation in their jurisdiction.
|
|
163
|
+
|
|
164
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. See the [LICENSE](LICENSE) file for the full MIT license text.
|
|
165
|
+
|
|
166
|
+
## Acknowledgments
|
|
167
|
+
|
|
168
|
+
MarkGrab builds on excellent open-source work and well-established techniques:
|
|
169
|
+
|
|
170
|
+
- **[puppeteer-extra-plugin-stealth](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth)** — stealth evasion patterns (webdriver removal, plugin mocking, WebGL spoofing) that inspired the opt-in `anti_bot/stealth.py` module
|
|
171
|
+
- **[Mozilla Readability](https://github.com/mozilla/readability)** — content area detection priority (`article > main > body`) and link density filtering concepts used in the density filter
|
|
172
|
+
- **[Boilerpipe](https://github.com/kohlschutter/boilerpipe)** (Kohlschutter et al., 2010) — the academic origin of link density ratio algorithms for boilerplate removal
|
|
173
|
+
- **[Jina Reader](https://github.com/jina-ai/reader)** — validated the market need for URL-to-markdown extraction; MarkGrab aims to be a lightweight, self-hosted alternative
|
|
174
|
+
|
|
175
|
+
Built with [httpx](https://github.com/encode/httpx), [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/), [markdownify](https://github.com/matthewwithanm/python-markdownify), [Playwright](https://github.com/microsoft/playwright-python), [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api), [pdfplumber](https://github.com/jsvine/pdfplumber), and [python-docx](https://github.com/python-openxml/python-docx).
|
|
176
|
+
|
|
177
|
+
## License
|
|
178
|
+
|
|
179
|
+
[MIT](LICENSE)
|
markgrab-0.1.0/README.md
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# MarkGrab
|
|
2
|
+
|
|
3
|
+
> [한국어 문서](README.ko.md)
|
|
4
|
+
|
|
5
|
+
Universal web content extraction — any URL to LLM-ready markdown.
|
|
6
|
+
|
|
7
|
+
```python
|
|
8
|
+
from markgrab import extract
|
|
9
|
+
|
|
10
|
+
result = await extract("https://example.com/article")
|
|
11
|
+
print(result.markdown) # clean markdown
|
|
12
|
+
print(result.title) # "Article Title"
|
|
13
|
+
print(result.word_count) # 1234
|
|
14
|
+
print(result.language) # "en"
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Features
|
|
18
|
+
|
|
19
|
+
- **HTML** — BeautifulSoup + content density filtering (removes nav, sidebar, ads)
|
|
20
|
+
- **YouTube** — transcript extraction with timestamps
|
|
21
|
+
- **PDF** — text extraction with page structure
|
|
22
|
+
- **DOCX** — paragraph and heading extraction
|
|
23
|
+
- **Auto-fallback** — tries lightweight httpx first, falls back to Playwright for JS-heavy pages
|
|
24
|
+
- **Async-first** — built on httpx and Playwright async APIs
|
|
25
|
+
|
|
26
|
+
## Install
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install markgrab
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Optional extras for specific content types:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
pip install "markgrab[browser]" # Playwright for JS-rendered pages
|
|
36
|
+
pip install "markgrab[youtube]" # YouTube transcript extraction
|
|
37
|
+
pip install "markgrab[pdf]" # PDF text extraction
|
|
38
|
+
pip install "markgrab[docx]" # DOCX text extraction
|
|
39
|
+
pip install "markgrab[all]" # everything
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Usage
|
|
43
|
+
|
|
44
|
+
### Python API
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
import asyncio
|
|
48
|
+
from markgrab import extract
|
|
49
|
+
|
|
50
|
+
async def main():
|
|
51
|
+
# HTML (auto-detects content type)
|
|
52
|
+
result = await extract("https://example.com/article")
|
|
53
|
+
|
|
54
|
+
# YouTube transcript
|
|
55
|
+
result = await extract("https://youtube.com/watch?v=dQw4w9WgXcQ")
|
|
56
|
+
|
|
57
|
+
# PDF
|
|
58
|
+
result = await extract("https://arxiv.org/pdf/1706.03762")
|
|
59
|
+
|
|
60
|
+
# Options
|
|
61
|
+
result = await extract(
|
|
62
|
+
"https://example.com",
|
|
63
|
+
max_chars=30_000, # limit output length (default: 50K)
|
|
64
|
+
use_browser=True, # force Playwright rendering
|
|
65
|
+
stealth=True, # anti-bot stealth scripts (opt-in)
|
|
66
|
+
timeout=60.0, # request timeout in seconds
|
|
67
|
+
proxy="http://proxy:8080",
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
asyncio.run(main())
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### CLI
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
markgrab https://example.com # markdown output
|
|
77
|
+
markgrab https://example.com -f text # plain text
|
|
78
|
+
markgrab https://example.com -f json # structured JSON
|
|
79
|
+
markgrab https://example.com --browser # force browser rendering
|
|
80
|
+
markgrab https://example.com --max-chars 10000 # limit output
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### ExtractResult
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
result.title # page title
|
|
87
|
+
result.text # plain text
|
|
88
|
+
result.markdown # LLM-ready markdown
|
|
89
|
+
result.word_count # word count
|
|
90
|
+
result.language # detected language ("en", "ko", ...)
|
|
91
|
+
result.content_type # "article", "video", "pdf", "docx"
|
|
92
|
+
result.source_url # final URL (after redirects)
|
|
93
|
+
result.metadata # extra metadata (video_id, page_count, etc.)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## How it works
|
|
97
|
+
|
|
98
|
+
```
|
|
99
|
+
markgrab.extract(url)
|
|
100
|
+
1. Detect content type (URL pattern)
|
|
101
|
+
2. Fetch content (httpx first, Playwright fallback)
|
|
102
|
+
3. Parse (HTML/YouTube/PDF/DOCX)
|
|
103
|
+
4. Filter (noise removal + content density + truncation)
|
|
104
|
+
5. Return ExtractResult
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
For HTML pages, if the initial httpx fetch yields fewer than 50 words, MarkGrab automatically retries with Playwright to handle JavaScript-rendered content.
|
|
108
|
+
|
|
109
|
+
## Disclaimer
|
|
110
|
+
|
|
111
|
+
**This software is provided for legitimate purposes only.** By using MarkGrab, you agree to the following:
|
|
112
|
+
|
|
113
|
+
- **robots.txt**: MarkGrab does **not** check or enforce `robots.txt`. Users are solely responsible for checking and respecting `robots.txt` directives and the terms of service of any website they access.
|
|
114
|
+
|
|
115
|
+
- **Rate limiting**: MarkGrab does **not** include built-in rate limiting or request throttling. Users must implement their own rate limiting to avoid overloading target servers. Abusive request patterns may violate applicable laws and website terms of service.
|
|
116
|
+
|
|
117
|
+
- **YouTube transcripts**: YouTube transcript extraction relies on the third-party `youtube-transcript-api` library, which uses YouTube's internal (unofficial) caption API. This may not comply with YouTube's Terms of Service. Use at your own discretion and risk.
|
|
118
|
+
|
|
119
|
+
- **Stealth mode**: The optional `stealth=True` feature modifies browser fingerprinting signals to reduce bot detection. This feature is intended for legitimate use cases such as testing, research, and accessing content that is publicly available to regular browser users. Users are responsible for ensuring their use complies with applicable laws and the terms of service of target websites.
|
|
120
|
+
|
|
121
|
+
- **Legal compliance**: Users are responsible for ensuring that their use of MarkGrab complies with all applicable laws, including but not limited to the Computer Fraud and Abuse Act (CFAA), the Digital Millennium Copyright Act (DMCA), GDPR, and equivalent legislation in their jurisdiction.
|
|
122
|
+
|
|
123
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. See the [LICENSE](LICENSE) file for the full MIT license text.
|
|
124
|
+
|
|
125
|
+
## Acknowledgments
|
|
126
|
+
|
|
127
|
+
MarkGrab builds on excellent open-source work and well-established techniques:
|
|
128
|
+
|
|
129
|
+
- **[puppeteer-extra-plugin-stealth](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth)** — stealth evasion patterns (webdriver removal, plugin mocking, WebGL spoofing) that inspired the opt-in `anti_bot/stealth.py` module
|
|
130
|
+
- **[Mozilla Readability](https://github.com/mozilla/readability)** — content area detection priority (`article > main > body`) and link density filtering concepts used in the density filter
|
|
131
|
+
- **[Boilerpipe](https://github.com/kohlschutter/boilerpipe)** (Kohlschutter et al., 2010) — the academic origin of link density ratio algorithms for boilerplate removal
|
|
132
|
+
- **[Jina Reader](https://github.com/jina-ai/reader)** — validated the market need for URL-to-markdown extraction; MarkGrab aims to be a lightweight, self-hosted alternative
|
|
133
|
+
|
|
134
|
+
Built with [httpx](https://github.com/encode/httpx), [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/), [markdownify](https://github.com/matthewwithanm/python-markdownify), [Playwright](https://github.com/microsoft/playwright-python), [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api), [pdfplumber](https://github.com/jsvine/pdfplumber), and [python-docx](https://github.com/python-openxml/python-docx).
|
|
135
|
+
|
|
136
|
+
## License
|
|
137
|
+
|
|
138
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""CLI entry point — python -m markgrab or `markgrab` command."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import asyncio
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from markgrab import extract
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def main() -> None:
    """CLI entry point: parse arguments, run extraction, print the result.

    Exit codes: 0 on success, 1 on extraction error, 130 on Ctrl-C (SIGINT).

    Fix: expose the ``stealth`` option that the ``extract()`` API already
    supports (and the README documents as opt-in) via a ``--stealth`` flag.
    Adding a flag is backward-compatible — existing invocations are unchanged.
    """
    parser = argparse.ArgumentParser(
        prog="markgrab",
        description="MarkGrab — extract web content as LLM-ready markdown",
    )
    parser.add_argument("url", help="URL to extract content from")
    parser.add_argument("--max-chars", type=int, default=50_000, help="Max output characters (default: 50000)")
    parser.add_argument("--browser", action="store_true", help="Force Playwright browser rendering")
    parser.add_argument("--stealth", action="store_true", help="Apply anti-bot stealth scripts when rendering with the browser")
    parser.add_argument("--timeout", type=float, default=30.0, help="Request timeout in seconds (default: 30)")
    parser.add_argument("--proxy", help="Proxy URL (e.g., http://proxy:8080)")
    parser.add_argument(
        "--format", "-f",
        choices=["markdown", "text", "json"],
        default="markdown",
        help="Output format (default: markdown)",
    )
    args = parser.parse_args()

    try:
        result = asyncio.run(extract(
            args.url,
            max_chars=args.max_chars,
            use_browser=args.browser,
            stealth=args.stealth,
            timeout=args.timeout,
            proxy=args.proxy,
        ))
    except KeyboardInterrupt:
        sys.exit(130)  # conventional exit code for SIGINT
    except Exception as e:
        # Top-level boundary: report and exit non-zero rather than traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    if args.format == "json":
        output = {
            "title": result.title,
            "text": result.text,
            "markdown": result.markdown,
            "word_count": result.word_count,
            "language": result.language,
            "content_type": result.content_type,
            "source_url": result.source_url,
            "metadata": result.metadata,
        }
        print(json.dumps(output, ensure_ascii=False, indent=2))
    elif args.format == "text":
        if result.title:
            print(f"Title: {result.title}")
        print(f"Words: {result.word_count} | Language: {result.language} | Type: {result.content_type}")
        print("---")
        print(result.text)
    else:
        # Default: markdown with a metadata comment header.
        if result.title:
            print(f"# {result.title}")
        print(f"<!-- words: {result.word_count} | lang: {result.language} | type: {result.content_type} -->")
        print()
        print(result.markdown)


if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Stealth settings for Playwright to avoid bot detection."""
|
|
2
|
+
|
|
3
|
+
# JavaScript injected into every new page before any site script runs.
# Each section masks one common headless-browser fingerprint signal.
# NOTE: this string is runtime content shipped to the browser — keep it
# stable; document changes here in Python comments, not inside the script.
_STEALTH_SCRIPT = """\
// Remove webdriver flag
Object.defineProperty(navigator, 'webdriver', {get: () => undefined});

// Realistic languages
Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'ko']});

// Mock plugins (Chrome always has these)
Object.defineProperty(navigator, 'plugins', {
    get: () => {
        const plugins = [
            {name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer'},
            {name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai'},
            {name: 'Native Client', filename: 'internal-nacl-plugin'},
        ];
        plugins.length = 3;
        return plugins;
    }
});

// Mock permissions
const originalQuery = window.navigator.permissions.query;
window.navigator.permissions.query = (parameters) =>
    parameters.name === 'notifications'
        ? Promise.resolve({state: Notification.permission})
        : originalQuery(parameters);

// Chrome runtime mock
window.chrome = {runtime: {}, loadTimes: function() {}, csi: function() {}};

// WebGL vendor/renderer (Intel is the most common)
const getParameter = WebGLRenderingContext.prototype.getParameter;
WebGLRenderingContext.prototype.getParameter = function(parameter) {
    if (parameter === 37445) return 'Intel Inc.'; // UNMASKED_VENDOR_WEBGL
    if (parameter === 37446) return 'Intel Iris OpenGL Engine'; // UNMASKED_RENDERER_WEBGL
    return getParameter.call(this, parameter);
};
"""


async def apply_stealth(context) -> None:
    """Apply stealth settings to a Playwright browser context.

    Registers the script via ``add_init_script`` so it executes in every
    page created by *context* before any site JavaScript runs.

    Args:
        context: A Playwright ``BrowserContext`` (async API) — type left
            unannotated so playwright stays an optional dependency.
    """
    await context.add_init_script(_STEALTH_SCRIPT)
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Main orchestrator — route URL to appropriate engine and parser."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import random
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from markgrab.engine.base import USER_AGENTS, Engine
|
|
10
|
+
from markgrab.engine.browser import BrowserEngine
|
|
11
|
+
from markgrab.engine.http import HttpEngine
|
|
12
|
+
from markgrab.filter.truncate import truncate_result
|
|
13
|
+
from markgrab.parser.html import HtmlParser
|
|
14
|
+
from markgrab.parser.youtube import YouTubeParser, _extract_video_id
|
|
15
|
+
from markgrab.result import ExtractResult
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)

# Minimum word count — below this, content is likely SPA/JS-only
_MIN_WORD_COUNT = 50

# YouTube oEmbed endpoint: resolves a video's title without an API key.
_OEMBED_URL = "https://www.youtube.com/oembed?url={url}&format=json"

# Playwright is an optional extra ("markgrab[browser]"); feature-detect it
# once at import time so the HTML flow knows whether browser fallback exists.
try:
    import playwright  # noqa: F401

    _BROWSER_AVAILABLE = True
except ImportError:
    _BROWSER_AVAILABLE = False
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _detect_type_from_url(url: str) -> str:
|
|
33
|
+
"""Detect content type from URL pattern."""
|
|
34
|
+
parsed = urlparse(url)
|
|
35
|
+
path = parsed.path.lower()
|
|
36
|
+
|
|
37
|
+
if "youtube.com" in parsed.netloc or "youtu.be" in parsed.netloc:
|
|
38
|
+
return "youtube"
|
|
39
|
+
if path.endswith(".pdf"):
|
|
40
|
+
return "pdf"
|
|
41
|
+
if path.endswith(".docx"):
|
|
42
|
+
return "docx"
|
|
43
|
+
|
|
44
|
+
return "html"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
async def _fetch_with_fallback(
    url: str,
    *,
    engine: Engine | None = None,
    timeout: float = 30.0,
    proxy: str | None = None,
    stealth: bool = False,
):
    """Fetch *url* with the HTTP engine, retrying via browser on failure.

    Uses ``engine`` when supplied, otherwise a fresh ``HttpEngine``. If the
    fetch raises and Playwright is installed, a ``BrowserEngine`` retries the
    request; otherwise the original exception propagates to the caller.
    """
    primary = HttpEngine(proxy=proxy) if engine is None else engine
    try:
        return await primary.fetch(url, timeout=timeout)
    except Exception as exc:
        if not _BROWSER_AVAILABLE:
            raise
        logger.info("HTTP failed for %s (%s), falling back to browser", url, type(exc).__name__)
        fallback = BrowserEngine(proxy=proxy, stealth=stealth)
        return await fallback.fetch(url, timeout=timeout)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
async def _fetch_youtube_title(url: str, timeout: float = 30.0) -> str:
    """Look up a YouTube video's title through the public oEmbed endpoint.

    Best-effort: any failure (network error, non-200 reply, malformed JSON)
    yields an empty string — the title is never a hard requirement.
    """
    try:
        endpoint = _OEMBED_URL.format(url=url)
        async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
            response = await client.get(endpoint)
            if response.status_code == 200:
                return response.json().get("title", "")
    except Exception:
        logger.debug("Failed to fetch YouTube oEmbed title for %s", url)
    return ""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
async def _fetch_bytes(url: str, *, timeout: float = 30.0, proxy: str | None = None) -> tuple[bytes, str]:
    """Download *url* as raw bytes.

    Follows redirects and raises ``httpx.HTTPStatusError`` on 4xx/5xx. A
    random desktop User-Agent is sent so the request resembles a browser.

    Returns:
        ``(data, final_url)`` — the response body and the post-redirect URL.
    """
    request_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "*/*",
    }
    client = httpx.AsyncClient(
        headers=request_headers,
        follow_redirects=True,
        timeout=httpx.Timeout(timeout),
        proxy=proxy,
    )
    async with client:
        response = await client.get(url)
        response.raise_for_status()
        return response.content, str(response.url)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
async def _extract_youtube(url: str, *, timeout: float = 30.0, max_chars: int = 50_000) -> ExtractResult:
    """Build an ExtractResult for a YouTube video's transcript.

    The title comes from the oEmbed API (best-effort, may be empty); the
    transcript itself is fetched by ``YouTubeParser``. Output is truncated
    to *max_chars*.
    """
    video_id = _extract_video_id(url)
    video_title = await _fetch_youtube_title(url, timeout=timeout)

    parsed = YouTubeParser().parse(video_id=video_id, url=url, title=video_title)
    return truncate_result(parsed, max_chars=max_chars)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
async def _extract_binary(
    url: str,
    content_type: str,
    *,
    timeout: float = 30.0,
    max_chars: int = 50_000,
    proxy: str | None = None,
) -> ExtractResult:
    """Download and parse a binary document (PDF or DOCX) into an ExtractResult.

    Parser modules are imported lazily so their optional dependencies are only
    needed when the matching content type is actually requested. The document
    is fetched before the type is validated, mirroring the original flow.

    Raises:
        ValueError: if *content_type* is neither ``"pdf"`` nor ``"docx"``.
    """
    data, final_url = await _fetch_bytes(url, timeout=timeout, proxy=proxy)

    if content_type == "pdf":
        from markgrab.parser.pdf import PdfParser as _Parser
    elif content_type == "docx":
        from markgrab.parser.docx import DocxParser as _Parser
    else:
        raise ValueError(f"Unknown binary content type: {content_type}")

    parsed = _Parser().parse(data, url=final_url)
    return truncate_result(parsed, max_chars=max_chars)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
async def extract(
    url: str,
    *,
    engine: Engine | None = None,
    max_chars: int = 50_000,
    use_browser: bool = False,
    stealth: bool = False,
    timeout: float = 30.0,
    proxy: str | None = None,
) -> ExtractResult:
    """Extract content from URL and return ExtractResult.

    Routing: YouTube URLs go straight to the transcript parser; ``.pdf`` /
    ``.docx`` URLs to the binary flow; everything else is fetched as HTML
    (httpx first, Playwright fallback when installed).

    Args:
        url: Target URL to extract content from.
        engine: Custom engine instance (default: HttpEngine, with browser fallback).
        max_chars: Maximum characters for text/markdown (default 50K).
        use_browser: Force Playwright browser rendering.
        stealth: Apply anti-bot stealth scripts when using browser (default: False).
        timeout: Request timeout in seconds.
        proxy: Proxy URL (e.g., "http://proxy:8080", "socks5://proxy:1080").

    Returns:
        An ExtractResult, truncated to *max_chars*.

    Raises:
        ImportError: if ``use_browser=True`` but Playwright is not installed.
        ValueError: from the binary flow on an unknown content type.
    """
    url_type = _detect_type_from_url(url)

    # YouTube — dedicated parser (no engine needed)
    if url_type == "youtube":
        return await _extract_youtube(url, timeout=timeout, max_chars=max_chars)

    # PDF / DOCX — binary fetch + dedicated parser
    if url_type in ("pdf", "docx"):
        return await _extract_binary(url, url_type, timeout=timeout, max_chars=max_chars, proxy=proxy)

    # HTML flow — engine + parser + fallback
    if use_browser:
        if not _BROWSER_AVAILABLE:
            raise ImportError("Playwright not installed. Run: pip install 'markgrab[browser]'")
        # A caller-supplied engine takes precedence even when forcing browser.
        fetch_result = await (engine or BrowserEngine(proxy=proxy, stealth=stealth)).fetch(url, timeout=timeout)
    else:
        if _BROWSER_AVAILABLE:
            fetch_result = await _fetch_with_fallback(url, engine=engine, timeout=timeout, proxy=proxy, stealth=stealth)
        else:
            fetch_result = await (engine or HttpEngine(proxy=proxy)).fetch(url, timeout=timeout)

    # Content-Type header may reveal PDF even without .pdf extension
    if "application/pdf" in fetch_result.content_type:
        # Re-fetch as raw bytes: the HTML engine's result is text, not bytes.
        data, final_url = await _fetch_bytes(url, timeout=timeout, proxy=proxy)
        from markgrab.parser.pdf import PdfParser

        result = PdfParser().parse(data, url=final_url)
        return truncate_result(result, max_chars=max_chars)

    # Parse HTML
    parser = HtmlParser()
    result = parser.parse(fetch_result.html, url=fetch_result.final_url)

    # Auto-fallback: thin content likely means SPA/JS-only page
    if not use_browser and _BROWSER_AVAILABLE and result.word_count < _MIN_WORD_COUNT:
        logger.info("Thin content (%d words) for %s, retrying with browser", result.word_count, url)
        try:
            browser_result = await BrowserEngine(proxy=proxy, stealth=stealth).fetch(url, timeout=timeout)
            browser_parsed = parser.parse(browser_result.html, url=browser_result.final_url)
            # Only adopt the browser render if it actually found more content.
            if browser_parsed.word_count > result.word_count:
                result = browser_parsed
        except Exception:
            pass  # Keep original result

    return truncate_result(result, max_chars=max_chars)
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Content fetching engines."""
|
|
2
|
+
|
|
3
|
+
from markgrab.engine.base import USER_AGENTS, Engine, FetchResult
|
|
4
|
+
from markgrab.engine.browser import BrowserEngine
|
|
5
|
+
from markgrab.engine.http import HttpEngine
|
|
6
|
+
|
|
7
|
+
__all__ = ["USER_AGENTS", "Engine", "FetchResult", "HttpEngine", "BrowserEngine"]
|