markgrab 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. markgrab-0.1.0/LICENSE +21 -0
  2. markgrab-0.1.0/PKG-INFO +179 -0
  3. markgrab-0.1.0/README.md +138 -0
  4. markgrab-0.1.0/markgrab/__init__.py +7 -0
  5. markgrab-0.1.0/markgrab/__main__.py +70 -0
  6. markgrab-0.1.0/markgrab/anti_bot/__init__.py +0 -0
  7. markgrab-0.1.0/markgrab/anti_bot/stealth.py +45 -0
  8. markgrab-0.1.0/markgrab/core.py +196 -0
  9. markgrab-0.1.0/markgrab/engine/__init__.py +7 -0
  10. markgrab-0.1.0/markgrab/engine/base.py +42 -0
  11. markgrab-0.1.0/markgrab/engine/browser.py +72 -0
  12. markgrab-0.1.0/markgrab/engine/http.py +37 -0
  13. markgrab-0.1.0/markgrab/filter/__init__.py +7 -0
  14. markgrab-0.1.0/markgrab/filter/density.py +79 -0
  15. markgrab-0.1.0/markgrab/filter/noise.py +42 -0
  16. markgrab-0.1.0/markgrab/filter/truncate.py +33 -0
  17. markgrab-0.1.0/markgrab/output/__init__.py +0 -0
  18. markgrab-0.1.0/markgrab/parser/__init__.py +9 -0
  19. markgrab-0.1.0/markgrab/parser/base.py +13 -0
  20. markgrab-0.1.0/markgrab/parser/docx.py +87 -0
  21. markgrab-0.1.0/markgrab/parser/html.py +120 -0
  22. markgrab-0.1.0/markgrab/parser/pdf.py +66 -0
  23. markgrab-0.1.0/markgrab/parser/youtube.py +107 -0
  24. markgrab-0.1.0/markgrab/result.py +17 -0
  25. markgrab-0.1.0/markgrab/utils.py +28 -0
  26. markgrab-0.1.0/markgrab.egg-info/PKG-INFO +179 -0
  27. markgrab-0.1.0/markgrab.egg-info/SOURCES.txt +43 -0
  28. markgrab-0.1.0/markgrab.egg-info/dependency_links.txt +1 -0
  29. markgrab-0.1.0/markgrab.egg-info/entry_points.txt +2 -0
  30. markgrab-0.1.0/markgrab.egg-info/requires.txt +26 -0
  31. markgrab-0.1.0/markgrab.egg-info/top_level.txt +1 -0
  32. markgrab-0.1.0/pyproject.toml +64 -0
  33. markgrab-0.1.0/setup.cfg +4 -0
  34. markgrab-0.1.0/tests/test_browser_engine.py +150 -0
  35. markgrab-0.1.0/tests/test_cli.py +95 -0
  36. markgrab-0.1.0/tests/test_density_filter.py +182 -0
  37. markgrab-0.1.0/tests/test_docx_parser.py +149 -0
  38. markgrab-0.1.0/tests/test_extract.py +205 -0
  39. markgrab-0.1.0/tests/test_fallback.py +173 -0
  40. markgrab-0.1.0/tests/test_html_parser.py +206 -0
  41. markgrab-0.1.0/tests/test_http_engine.py +82 -0
  42. markgrab-0.1.0/tests/test_pdf_parser.py +125 -0
  43. markgrab-0.1.0/tests/test_result.py +38 -0
  44. markgrab-0.1.0/tests/test_truncate.py +76 -0
  45. markgrab-0.1.0/tests/test_youtube_parser.py +139 -0
markgrab-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 hmj
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,179 @@
1
+ Metadata-Version: 2.4
2
+ Name: markgrab
3
+ Version: 0.1.0
4
+ Summary: Universal web content extraction — URL to LLM-ready markdown
5
+ Author: hmj
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/QuartzUnit/markgrab
8
+ Project-URL: Repository, https://github.com/QuartzUnit/markgrab
9
+ Project-URL: Issues, https://github.com/QuartzUnit/markgrab/issues
10
+ Keywords: web-scraping,content-extraction,markdown,llm,rag
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Topic :: Internet :: WWW/HTTP
15
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
16
+ Classifier: Typing :: Typed
17
+ Requires-Python: >=3.12
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: httpx>=0.28
21
+ Requires-Dist: beautifulsoup4>=4.13
22
+ Requires-Dist: markdownify>=0.14
23
+ Provides-Extra: browser
24
+ Requires-Dist: playwright>=1.49; extra == "browser"
25
+ Provides-Extra: youtube
26
+ Requires-Dist: youtube-transcript-api>=1.0; extra == "youtube"
27
+ Provides-Extra: pdf
28
+ Requires-Dist: pdfplumber>=0.11; extra == "pdf"
29
+ Provides-Extra: docx
30
+ Requires-Dist: python-docx>=1.1; extra == "docx"
31
+ Provides-Extra: all
32
+ Requires-Dist: playwright>=1.49; extra == "all"
33
+ Requires-Dist: youtube-transcript-api>=1.0; extra == "all"
34
+ Requires-Dist: pdfplumber>=0.11; extra == "all"
35
+ Requires-Dist: python-docx>=1.1; extra == "all"
36
+ Provides-Extra: dev
37
+ Requires-Dist: pytest>=8.0; extra == "dev"
38
+ Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
39
+ Requires-Dist: ruff>=0.9; extra == "dev"
40
+ Dynamic: license-file
41
+
42
+ # MarkGrab
43
+
44
+ > [한국어 문서](README.ko.md)
45
+
46
+ Universal web content extraction — any URL to LLM-ready markdown.
47
+
48
+ ```python
49
+ from markgrab import extract
50
+
51
+ result = await extract("https://example.com/article")
52
+ print(result.markdown) # clean markdown
53
+ print(result.title) # "Article Title"
54
+ print(result.word_count) # 1234
55
+ print(result.language) # "en"
56
+ ```
57
+
58
+ ## Features
59
+
60
+ - **HTML** — BeautifulSoup + content density filtering (removes nav, sidebar, ads)
61
+ - **YouTube** — transcript extraction with timestamps
62
+ - **PDF** — text extraction with page structure
63
+ - **DOCX** — paragraph and heading extraction
64
+ - **Auto-fallback** — tries lightweight httpx first, falls back to Playwright for JS-heavy pages
65
+ - **Async-first** — built on httpx and Playwright async APIs
66
+
67
+ ## Install
68
+
69
+ ```bash
70
+ pip install markgrab
71
+ ```
72
+
73
+ Optional extras for specific content types:
74
+
75
+ ```bash
76
+ pip install "markgrab[browser]" # Playwright for JS-rendered pages
77
+ pip install "markgrab[youtube]" # YouTube transcript extraction
78
+ pip install "markgrab[pdf]" # PDF text extraction
79
+ pip install "markgrab[docx]" # DOCX text extraction
80
+ pip install "markgrab[all]" # everything
81
+ ```
82
+
83
+ ## Usage
84
+
85
+ ### Python API
86
+
87
+ ```python
88
+ import asyncio
89
+ from markgrab import extract
90
+
91
+ async def main():
92
+ # HTML (auto-detects content type)
93
+ result = await extract("https://example.com/article")
94
+
95
+ # YouTube transcript
96
+ result = await extract("https://youtube.com/watch?v=dQw4w9WgXcQ")
97
+
98
+ # PDF
99
+ result = await extract("https://arxiv.org/pdf/1706.03762")
100
+
101
+ # Options
102
+ result = await extract(
103
+ "https://example.com",
104
+ max_chars=30_000, # limit output length (default: 50K)
105
+ use_browser=True, # force Playwright rendering
106
+ stealth=True, # anti-bot stealth scripts (opt-in)
107
+ timeout=60.0, # request timeout in seconds
108
+ proxy="http://proxy:8080",
109
+ )
110
+
111
+ asyncio.run(main())
112
+ ```
113
+
114
+ ### CLI
115
+
116
+ ```bash
117
+ markgrab https://example.com # markdown output
118
+ markgrab https://example.com -f text # plain text
119
+ markgrab https://example.com -f json # structured JSON
120
+ markgrab https://example.com --browser # force browser rendering
121
+ markgrab https://example.com --max-chars 10000 # limit output
122
+ ```
123
+
124
+ ### ExtractResult
125
+
126
+ ```python
127
+ result.title # page title
128
+ result.text # plain text
129
+ result.markdown # LLM-ready markdown
130
+ result.word_count # word count
131
+ result.language # detected language ("en", "ko", ...)
132
+ result.content_type # "article", "video", "pdf", "docx"
133
+ result.source_url # final URL (after redirects)
134
+ result.metadata # extra metadata (video_id, page_count, etc.)
135
+ ```
136
+
137
+ ## How it works
138
+
139
+ ```
140
+ markgrab.extract(url)
141
+ 1. Detect content type (URL pattern)
142
+ 2. Fetch content (httpx first, Playwright fallback)
143
+ 3. Parse (HTML/YouTube/PDF/DOCX)
144
+ 4. Filter (noise removal + content density + truncation)
145
+ 5. Return ExtractResult
146
+ ```
147
+
148
+ For HTML pages, if the initial httpx fetch yields fewer than 50 words, MarkGrab automatically retries with Playwright to handle JavaScript-rendered content.
149
+
150
+ ## Disclaimer
151
+
152
+ **This software is provided for legitimate purposes only.** By using MarkGrab, you agree to the following:
153
+
154
+ - **robots.txt**: MarkGrab does **not** check or enforce `robots.txt`. Users are solely responsible for checking and respecting `robots.txt` directives and the terms of service of any website they access.
155
+
156
+ - **Rate limiting**: MarkGrab does **not** include built-in rate limiting or request throttling. Users must implement their own rate limiting to avoid overloading target servers. Abusive request patterns may violate applicable laws and website terms of service.
157
+
158
+ - **YouTube transcripts**: YouTube transcript extraction relies on the third-party `youtube-transcript-api` library, which uses YouTube's internal (unofficial) caption API. This may not comply with YouTube's Terms of Service. Use at your own discretion and risk.
159
+
160
+ - **Stealth mode**: The optional `stealth=True` feature modifies browser fingerprinting signals to reduce bot detection. This feature is intended for legitimate use cases such as testing, research, and accessing content that is publicly available to regular browser users. Users are responsible for ensuring their use complies with applicable laws and the terms of service of target websites.
161
+
162
+ - **Legal compliance**: Users are responsible for ensuring that their use of MarkGrab complies with all applicable laws, including but not limited to the Computer Fraud and Abuse Act (CFAA), the Digital Millennium Copyright Act (DMCA), GDPR, and equivalent legislation in their jurisdiction.
163
+
164
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. See the [LICENSE](LICENSE) file for the full MIT license text.
165
+
166
+ ## Acknowledgments
167
+
168
+ MarkGrab builds on excellent open-source work and well-established techniques:
169
+
170
+ - **[puppeteer-extra-plugin-stealth](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth)** — stealth evasion patterns (webdriver removal, plugin mocking, WebGL spoofing) that inspired the opt-in `anti_bot/stealth.py` module
171
+ - **[Mozilla Readability](https://github.com/mozilla/readability)** — content area detection priority (`article > main > body`) and link density filtering concepts used in the density filter
172
+ - **[Boilerpipe](https://github.com/kohlschutter/boilerpipe)** (Kohlschutter et al., 2010) — the academic origin of link density ratio algorithms for boilerplate removal
173
+ - **[Jina Reader](https://github.com/jina-ai/reader)** — validated the market need for URL-to-markdown extraction; MarkGrab aims to be a lightweight, self-hosted alternative
174
+
175
+ Built with [httpx](https://github.com/encode/httpx), [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/), [markdownify](https://github.com/matthewwithanm/python-markdownify), [Playwright](https://github.com/microsoft/playwright-python), [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api), [pdfplumber](https://github.com/jsvine/pdfplumber), and [python-docx](https://github.com/python-openxml/python-docx).
176
+
177
+ ## License
178
+
179
+ [MIT](LICENSE)
@@ -0,0 +1,138 @@
1
+ # MarkGrab
2
+
3
+ > [한국어 문서](README.ko.md)
4
+
5
+ Universal web content extraction — any URL to LLM-ready markdown.
6
+
7
+ ```python
8
+ from markgrab import extract
9
+
10
+ result = await extract("https://example.com/article")
11
+ print(result.markdown) # clean markdown
12
+ print(result.title) # "Article Title"
13
+ print(result.word_count) # 1234
14
+ print(result.language) # "en"
15
+ ```
16
+
17
+ ## Features
18
+
19
+ - **HTML** — BeautifulSoup + content density filtering (removes nav, sidebar, ads)
20
+ - **YouTube** — transcript extraction with timestamps
21
+ - **PDF** — text extraction with page structure
22
+ - **DOCX** — paragraph and heading extraction
23
+ - **Auto-fallback** — tries lightweight httpx first, falls back to Playwright for JS-heavy pages
24
+ - **Async-first** — built on httpx and Playwright async APIs
25
+
26
+ ## Install
27
+
28
+ ```bash
29
+ pip install markgrab
30
+ ```
31
+
32
+ Optional extras for specific content types:
33
+
34
+ ```bash
35
+ pip install "markgrab[browser]" # Playwright for JS-rendered pages
36
+ pip install "markgrab[youtube]" # YouTube transcript extraction
37
+ pip install "markgrab[pdf]" # PDF text extraction
38
+ pip install "markgrab[docx]" # DOCX text extraction
39
+ pip install "markgrab[all]" # everything
40
+ ```
41
+
42
+ ## Usage
43
+
44
+ ### Python API
45
+
46
+ ```python
47
+ import asyncio
48
+ from markgrab import extract
49
+
50
+ async def main():
51
+ # HTML (auto-detects content type)
52
+ result = await extract("https://example.com/article")
53
+
54
+ # YouTube transcript
55
+ result = await extract("https://youtube.com/watch?v=dQw4w9WgXcQ")
56
+
57
+ # PDF
58
+ result = await extract("https://arxiv.org/pdf/1706.03762")
59
+
60
+ # Options
61
+ result = await extract(
62
+ "https://example.com",
63
+ max_chars=30_000, # limit output length (default: 50K)
64
+ use_browser=True, # force Playwright rendering
65
+ stealth=True, # anti-bot stealth scripts (opt-in)
66
+ timeout=60.0, # request timeout in seconds
67
+ proxy="http://proxy:8080",
68
+ )
69
+
70
+ asyncio.run(main())
71
+ ```
72
+
73
+ ### CLI
74
+
75
+ ```bash
76
+ markgrab https://example.com # markdown output
77
+ markgrab https://example.com -f text # plain text
78
+ markgrab https://example.com -f json # structured JSON
79
+ markgrab https://example.com --browser # force browser rendering
80
+ markgrab https://example.com --max-chars 10000 # limit output
81
+ ```
82
+
83
+ ### ExtractResult
84
+
85
+ ```python
86
+ result.title # page title
87
+ result.text # plain text
88
+ result.markdown # LLM-ready markdown
89
+ result.word_count # word count
90
+ result.language # detected language ("en", "ko", ...)
91
+ result.content_type # "article", "video", "pdf", "docx"
92
+ result.source_url # final URL (after redirects)
93
+ result.metadata # extra metadata (video_id, page_count, etc.)
94
+ ```
95
+
96
+ ## How it works
97
+
98
+ ```
99
+ markgrab.extract(url)
100
+ 1. Detect content type (URL pattern)
101
+ 2. Fetch content (httpx first, Playwright fallback)
102
+ 3. Parse (HTML/YouTube/PDF/DOCX)
103
+ 4. Filter (noise removal + content density + truncation)
104
+ 5. Return ExtractResult
105
+ ```
106
+
107
+ For HTML pages, if the initial httpx fetch yields fewer than 50 words, MarkGrab automatically retries with Playwright to handle JavaScript-rendered content.
108
+
109
+ ## Disclaimer
110
+
111
+ **This software is provided for legitimate purposes only.** By using MarkGrab, you agree to the following:
112
+
113
+ - **robots.txt**: MarkGrab does **not** check or enforce `robots.txt`. Users are solely responsible for checking and respecting `robots.txt` directives and the terms of service of any website they access.
114
+
115
+ - **Rate limiting**: MarkGrab does **not** include built-in rate limiting or request throttling. Users must implement their own rate limiting to avoid overloading target servers. Abusive request patterns may violate applicable laws and website terms of service.
116
+
117
+ - **YouTube transcripts**: YouTube transcript extraction relies on the third-party `youtube-transcript-api` library, which uses YouTube's internal (unofficial) caption API. This may not comply with YouTube's Terms of Service. Use at your own discretion and risk.
118
+
119
+ - **Stealth mode**: The optional `stealth=True` feature modifies browser fingerprinting signals to reduce bot detection. This feature is intended for legitimate use cases such as testing, research, and accessing content that is publicly available to regular browser users. Users are responsible for ensuring their use complies with applicable laws and the terms of service of target websites.
120
+
121
+ - **Legal compliance**: Users are responsible for ensuring that their use of MarkGrab complies with all applicable laws, including but not limited to the Computer Fraud and Abuse Act (CFAA), the Digital Millennium Copyright Act (DMCA), GDPR, and equivalent legislation in their jurisdiction.
122
+
123
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND. See the [LICENSE](LICENSE) file for the full MIT license text.
124
+
125
+ ## Acknowledgments
126
+
127
+ MarkGrab builds on excellent open-source work and well-established techniques:
128
+
129
+ - **[puppeteer-extra-plugin-stealth](https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra-plugin-stealth)** — stealth evasion patterns (webdriver removal, plugin mocking, WebGL spoofing) that inspired the opt-in `anti_bot/stealth.py` module
130
+ - **[Mozilla Readability](https://github.com/mozilla/readability)** — content area detection priority (`article > main > body`) and link density filtering concepts used in the density filter
131
+ - **[Boilerpipe](https://github.com/kohlschutter/boilerpipe)** (Kohlschutter et al., 2010) — the academic origin of link density ratio algorithms for boilerplate removal
132
+ - **[Jina Reader](https://github.com/jina-ai/reader)** — validated the market need for URL-to-markdown extraction; MarkGrab aims to be a lightweight, self-hosted alternative
133
+
134
+ Built with [httpx](https://github.com/encode/httpx), [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/), [markdownify](https://github.com/matthewwithanm/python-markdownify), [Playwright](https://github.com/microsoft/playwright-python), [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api), [pdfplumber](https://github.com/jsvine/pdfplumber), and [python-docx](https://github.com/python-openxml/python-docx).
135
+
136
+ ## License
137
+
138
+ [MIT](LICENSE)
@@ -0,0 +1,7 @@
1
+ """MarkGrab — Universal web content extraction."""
2
+
3
+ from markgrab.core import extract
4
+ from markgrab.result import ExtractResult
5
+
6
+ __all__ = ["extract", "ExtractResult"]
7
+ __version__ = "0.1.0"
@@ -0,0 +1,70 @@
1
+ """CLI entry point — python -m markgrab or `markgrab` command."""
2
+
3
+ import argparse
4
+ import asyncio
5
+ import json
6
+ import sys
7
+
8
+ from markgrab import extract
9
+
10
+
11
+ def main():
12
+ parser = argparse.ArgumentParser(
13
+ prog="markgrab",
14
+ description="MarkGrab — extract web content as LLM-ready markdown",
15
+ )
16
+ parser.add_argument("url", help="URL to extract content from")
17
+ parser.add_argument("--max-chars", type=int, default=50_000, help="Max output characters (default: 50000)")
18
+ parser.add_argument("--browser", action="store_true", help="Force Playwright browser rendering")
19
+ parser.add_argument("--timeout", type=float, default=30.0, help="Request timeout in seconds (default: 30)")
20
+ parser.add_argument("--proxy", help="Proxy URL (e.g., http://proxy:8080)")
21
+ parser.add_argument(
22
+ "--format", "-f",
23
+ choices=["markdown", "text", "json"],
24
+ default="markdown",
25
+ help="Output format (default: markdown)",
26
+ )
27
+ args = parser.parse_args()
28
+
29
+ try:
30
+ result = asyncio.run(extract(
31
+ args.url,
32
+ max_chars=args.max_chars,
33
+ use_browser=args.browser,
34
+ timeout=args.timeout,
35
+ proxy=args.proxy,
36
+ ))
37
+ except KeyboardInterrupt:
38
+ sys.exit(130)
39
+ except Exception as e:
40
+ print(f"Error: {e}", file=sys.stderr)
41
+ sys.exit(1)
42
+
43
+ if args.format == "json":
44
+ output = {
45
+ "title": result.title,
46
+ "text": result.text,
47
+ "markdown": result.markdown,
48
+ "word_count": result.word_count,
49
+ "language": result.language,
50
+ "content_type": result.content_type,
51
+ "source_url": result.source_url,
52
+ "metadata": result.metadata,
53
+ }
54
+ print(json.dumps(output, ensure_ascii=False, indent=2))
55
+ elif args.format == "text":
56
+ if result.title:
57
+ print(f"Title: {result.title}")
58
+ print(f"Words: {result.word_count} | Language: {result.language} | Type: {result.content_type}")
59
+ print("---")
60
+ print(result.text)
61
+ else:
62
+ if result.title:
63
+ print(f"# {result.title}")
64
+ print(f"<!-- words: {result.word_count} | lang: {result.language} | type: {result.content_type} -->")
65
+ print()
66
+ print(result.markdown)
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
File without changes
@@ -0,0 +1,45 @@
1
+ """Stealth settings for Playwright to avoid bot detection."""
2
+
3
+ _STEALTH_SCRIPT = """\
4
+ // Remove webdriver flag
5
+ Object.defineProperty(navigator, 'webdriver', {get: () => undefined});
6
+
7
+ // Realistic languages
8
+ Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'ko']});
9
+
10
+ // Mock plugins (Chrome always has these)
11
+ Object.defineProperty(navigator, 'plugins', {
12
+ get: () => {
13
+ const plugins = [
14
+ {name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer'},
15
+ {name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai'},
16
+ {name: 'Native Client', filename: 'internal-nacl-plugin'},
17
+ ];
18
+ plugins.length = 3;
19
+ return plugins;
20
+ }
21
+ });
22
+
23
+ // Mock permissions
24
+ const originalQuery = window.navigator.permissions.query;
25
+ window.navigator.permissions.query = (parameters) =>
26
+ parameters.name === 'notifications'
27
+ ? Promise.resolve({state: Notification.permission})
28
+ : originalQuery(parameters);
29
+
30
+ // Chrome runtime mock
31
+ window.chrome = {runtime: {}, loadTimes: function() {}, csi: function() {}};
32
+
33
+ // WebGL vendor/renderer (Intel is the most common)
34
+ const getParameter = WebGLRenderingContext.prototype.getParameter;
35
+ WebGLRenderingContext.prototype.getParameter = function(parameter) {
36
+ if (parameter === 37445) return 'Intel Inc.'; // UNMASKED_VENDOR_WEBGL
37
+ if (parameter === 37446) return 'Intel Iris OpenGL Engine'; // UNMASKED_RENDERER_WEBGL
38
+ return getParameter.call(this, parameter);
39
+ };
40
+ """
41
+
42
+
43
+ async def apply_stealth(context) -> None:
44
+ """Apply stealth settings to a Playwright browser context."""
45
+ await context.add_init_script(_STEALTH_SCRIPT)
@@ -0,0 +1,196 @@
1
+ """Main orchestrator — route URL to appropriate engine and parser."""
2
+
3
+ import logging
4
+ import random
5
+ from urllib.parse import urlparse
6
+
7
+ import httpx
8
+
9
+ from markgrab.engine.base import USER_AGENTS, Engine
10
+ from markgrab.engine.browser import BrowserEngine
11
+ from markgrab.engine.http import HttpEngine
12
+ from markgrab.filter.truncate import truncate_result
13
+ from markgrab.parser.html import HtmlParser
14
+ from markgrab.parser.youtube import YouTubeParser, _extract_video_id
15
+ from markgrab.result import ExtractResult
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Minimum word count — below this, content is likely SPA/JS-only
20
+ _MIN_WORD_COUNT = 50
21
+
22
+ _OEMBED_URL = "https://www.youtube.com/oembed?url={url}&format=json"
23
+
24
+ try:
25
+ import playwright # noqa: F401
26
+
27
+ _BROWSER_AVAILABLE = True
28
+ except ImportError:
29
+ _BROWSER_AVAILABLE = False
30
+
31
+
32
+ def _detect_type_from_url(url: str) -> str:
33
+ """Detect content type from URL pattern."""
34
+ parsed = urlparse(url)
35
+ path = parsed.path.lower()
36
+
37
+ if "youtube.com" in parsed.netloc or "youtu.be" in parsed.netloc:
38
+ return "youtube"
39
+ if path.endswith(".pdf"):
40
+ return "pdf"
41
+ if path.endswith(".docx"):
42
+ return "docx"
43
+
44
+ return "html"
45
+
46
+
47
+ async def _fetch_with_fallback(
48
+ url: str,
49
+ *,
50
+ engine: Engine | None = None,
51
+ timeout: float = 30.0,
52
+ proxy: str | None = None,
53
+ stealth: bool = False,
54
+ ):
55
+ """Fetch via HTTP, fallback to browser on error."""
56
+ http_engine = engine or HttpEngine(proxy=proxy)
57
+ try:
58
+ return await http_engine.fetch(url, timeout=timeout)
59
+ except Exception as exc:
60
+ if _BROWSER_AVAILABLE:
61
+ logger.info("HTTP failed for %s (%s), falling back to browser", url, type(exc).__name__)
62
+ return await BrowserEngine(proxy=proxy, stealth=stealth).fetch(url, timeout=timeout)
63
+ raise
64
+
65
+
66
+ async def _fetch_youtube_title(url: str, timeout: float = 30.0) -> str:
67
+ """Fetch YouTube video title via oEmbed API."""
68
+ try:
69
+ oembed_url = _OEMBED_URL.format(url=url)
70
+ async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
71
+ resp = await client.get(oembed_url)
72
+ if resp.status_code == 200:
73
+ return resp.json().get("title", "")
74
+ except Exception:
75
+ logger.debug("Failed to fetch YouTube oEmbed title for %s", url)
76
+ return ""
77
+
78
+
79
+ async def _fetch_bytes(url: str, *, timeout: float = 30.0, proxy: str | None = None) -> tuple[bytes, str]:
80
+ """Fetch URL as raw bytes. Returns (data, final_url)."""
81
+ headers = {
82
+ "User-Agent": random.choice(USER_AGENTS),
83
+ "Accept": "*/*",
84
+ }
85
+ async with httpx.AsyncClient(
86
+ headers=headers,
87
+ follow_redirects=True,
88
+ timeout=httpx.Timeout(timeout),
89
+ proxy=proxy,
90
+ ) as client:
91
+ resp = await client.get(url)
92
+ resp.raise_for_status()
93
+ return resp.content, str(resp.url)
94
+
95
+
96
+ async def _extract_youtube(url: str, *, timeout: float = 30.0, max_chars: int = 50_000) -> ExtractResult:
97
+ """Extract YouTube video transcript."""
98
+ video_id = _extract_video_id(url)
99
+ title = await _fetch_youtube_title(url, timeout=timeout)
100
+
101
+ parser = YouTubeParser()
102
+ result = parser.parse(video_id=video_id, url=url, title=title)
103
+ return truncate_result(result, max_chars=max_chars)
104
+
105
+
106
+ async def _extract_binary(
107
+ url: str,
108
+ content_type: str,
109
+ *,
110
+ timeout: float = 30.0,
111
+ max_chars: int = 50_000,
112
+ proxy: str | None = None,
113
+ ) -> ExtractResult:
114
+ """Extract content from binary URLs (PDF, DOCX)."""
115
+ data, final_url = await _fetch_bytes(url, timeout=timeout, proxy=proxy)
116
+
117
+ if content_type == "pdf":
118
+ from markgrab.parser.pdf import PdfParser
119
+
120
+ result = PdfParser().parse(data, url=final_url)
121
+ elif content_type == "docx":
122
+ from markgrab.parser.docx import DocxParser
123
+
124
+ result = DocxParser().parse(data, url=final_url)
125
+ else:
126
+ raise ValueError(f"Unknown binary content type: {content_type}")
127
+
128
+ return truncate_result(result, max_chars=max_chars)
129
+
130
+
131
+ async def extract(
132
+ url: str,
133
+ *,
134
+ engine: Engine | None = None,
135
+ max_chars: int = 50_000,
136
+ use_browser: bool = False,
137
+ stealth: bool = False,
138
+ timeout: float = 30.0,
139
+ proxy: str | None = None,
140
+ ) -> ExtractResult:
141
+ """Extract content from URL and return ExtractResult.
142
+
143
+ Args:
144
+ url: Target URL to extract content from.
145
+ engine: Custom engine instance (default: HttpEngine, with browser fallback).
146
+ max_chars: Maximum characters for text/markdown (default 50K).
147
+ use_browser: Force Playwright browser rendering.
148
+ stealth: Apply anti-bot stealth scripts when using browser (default: False).
149
+ timeout: Request timeout in seconds.
150
+ proxy: Proxy URL (e.g., "http://proxy:8080", "socks5://proxy:1080").
151
+ """
152
+ url_type = _detect_type_from_url(url)
153
+
154
+ # YouTube — dedicated parser (no engine needed)
155
+ if url_type == "youtube":
156
+ return await _extract_youtube(url, timeout=timeout, max_chars=max_chars)
157
+
158
+ # PDF / DOCX — binary fetch + dedicated parser
159
+ if url_type in ("pdf", "docx"):
160
+ return await _extract_binary(url, url_type, timeout=timeout, max_chars=max_chars, proxy=proxy)
161
+
162
+ # HTML flow — engine + parser + fallback
163
+ if use_browser:
164
+ if not _BROWSER_AVAILABLE:
165
+ raise ImportError("Playwright not installed. Run: pip install 'markgrab[browser]'")
166
+ fetch_result = await (engine or BrowserEngine(proxy=proxy, stealth=stealth)).fetch(url, timeout=timeout)
167
+ else:
168
+ if _BROWSER_AVAILABLE:
169
+ fetch_result = await _fetch_with_fallback(url, engine=engine, timeout=timeout, proxy=proxy, stealth=stealth)
170
+ else:
171
+ fetch_result = await (engine or HttpEngine(proxy=proxy)).fetch(url, timeout=timeout)
172
+
173
+ # Content-Type header may reveal PDF even without .pdf extension
174
+ if "application/pdf" in fetch_result.content_type:
175
+ data, final_url = await _fetch_bytes(url, timeout=timeout, proxy=proxy)
176
+ from markgrab.parser.pdf import PdfParser
177
+
178
+ result = PdfParser().parse(data, url=final_url)
179
+ return truncate_result(result, max_chars=max_chars)
180
+
181
+ # Parse HTML
182
+ parser = HtmlParser()
183
+ result = parser.parse(fetch_result.html, url=fetch_result.final_url)
184
+
185
+ # Auto-fallback: thin content likely means SPA/JS-only page
186
+ if not use_browser and _BROWSER_AVAILABLE and result.word_count < _MIN_WORD_COUNT:
187
+ logger.info("Thin content (%d words) for %s, retrying with browser", result.word_count, url)
188
+ try:
189
+ browser_result = await BrowserEngine(proxy=proxy, stealth=stealth).fetch(url, timeout=timeout)
190
+ browser_parsed = parser.parse(browser_result.html, url=browser_result.final_url)
191
+ if browser_parsed.word_count > result.word_count:
192
+ result = browser_parsed
193
+ except Exception:
194
+ pass # Keep original result
195
+
196
+ return truncate_result(result, max_chars=max_chars)
@@ -0,0 +1,7 @@
1
+ """Content fetching engines."""
2
+
3
+ from markgrab.engine.base import USER_AGENTS, Engine, FetchResult
4
+ from markgrab.engine.browser import BrowserEngine
5
+ from markgrab.engine.http import HttpEngine
6
+
7
+ __all__ = ["USER_AGENTS", "Engine", "FetchResult", "HttpEngine", "BrowserEngine"]