markgrab 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- markgrab/__init__.py +7 -0
- markgrab/__main__.py +70 -0
- markgrab/anti_bot/__init__.py +0 -0
- markgrab/anti_bot/stealth.py +45 -0
- markgrab/core.py +196 -0
- markgrab/engine/__init__.py +7 -0
- markgrab/engine/base.py +42 -0
- markgrab/engine/browser.py +72 -0
- markgrab/engine/http.py +37 -0
- markgrab/filter/__init__.py +7 -0
- markgrab/filter/density.py +79 -0
- markgrab/filter/noise.py +42 -0
- markgrab/filter/truncate.py +33 -0
- markgrab/output/__init__.py +0 -0
- markgrab/parser/__init__.py +9 -0
- markgrab/parser/base.py +13 -0
- markgrab/parser/docx.py +87 -0
- markgrab/parser/html.py +120 -0
- markgrab/parser/pdf.py +66 -0
- markgrab/parser/youtube.py +107 -0
- markgrab/result.py +17 -0
- markgrab/utils.py +28 -0
- markgrab-0.1.0.dist-info/METADATA +179 -0
- markgrab-0.1.0.dist-info/RECORD +28 -0
- markgrab-0.1.0.dist-info/WHEEL +5 -0
- markgrab-0.1.0.dist-info/entry_points.txt +2 -0
- markgrab-0.1.0.dist-info/licenses/LICENSE +21 -0
- markgrab-0.1.0.dist-info/top_level.txt +1 -0
markgrab/__init__.py
ADDED
markgrab/__main__.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""CLI entry point — python -m markgrab or `markgrab` command."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import asyncio
|
|
5
|
+
import json
|
|
6
|
+
import sys
|
|
7
|
+
|
|
8
|
+
from markgrab import extract
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def _build_arg_parser():
    """Build the argparse parser for the ``markgrab`` command line."""
    parser = argparse.ArgumentParser(
        prog="markgrab",
        description="MarkGrab — extract web content as LLM-ready markdown",
    )
    parser.add_argument("url", help="URL to extract content from")
    parser.add_argument("--max-chars", type=int, default=50_000, help="Max output characters (default: 50000)")
    parser.add_argument("--browser", action="store_true", help="Force Playwright browser rendering")
    # extract() already accepts ``stealth``; expose it so the CLI can reach it.
    parser.add_argument("--stealth", action="store_true", help="Apply anti-bot stealth scripts when a browser is used")
    parser.add_argument("--timeout", type=float, default=30.0, help="Request timeout in seconds (default: 30)")
    parser.add_argument("--proxy", help="Proxy URL (e.g., http://proxy:8080)")
    parser.add_argument(
        "--format", "-f",
        choices=["markdown", "text", "json"],
        default="markdown",
        help="Output format (default: markdown)",
    )
    return parser


def _print_result(result, output_format):
    """Write an extraction result to stdout as "json", "text" or markdown."""
    if output_format == "json":
        output = {
            "title": result.title,
            "text": result.text,
            "markdown": result.markdown,
            "word_count": result.word_count,
            "language": result.language,
            "content_type": result.content_type,
            "source_url": result.source_url,
            "metadata": result.metadata,
        }
        print(json.dumps(output, ensure_ascii=False, indent=2))
        return

    if output_format == "text":
        if result.title:
            print(f"Title: {result.title}")
        print(f"Words: {result.word_count} | Language: {result.language} | Type: {result.content_type}")
        print("---")
        print(result.text)
        return

    # Any other value is rendered as markdown (the parser's default format).
    if result.title:
        print(f"# {result.title}")
    print(f"<!-- words: {result.word_count} | lang: {result.language} | type: {result.content_type} -->")
    print()
    print(result.markdown)


def main():
    """Run the markgrab CLI: parse arguments, extract the URL, print the result.

    Exits with status 130 on Ctrl-C and with status 1 (after printing
    ``Error: ...`` to stderr) when extraction raises any other exception.
    """
    args = _build_arg_parser().parse_args()

    try:
        result = asyncio.run(extract(
            args.url,
            max_chars=args.max_chars,
            use_browser=args.browser,
            stealth=args.stealth,
            timeout=args.timeout,
            proxy=args.proxy,
        ))
    except KeyboardInterrupt:
        sys.exit(130)
    except Exception as e:
        # Top-level boundary: report the failure without a traceback.
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    _print_result(result, args.format)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
if __name__ == "__main__":
|
|
70
|
+
main()
|
|
File without changes
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Stealth settings for Playwright to avoid bot detection."""
|
|
2
|
+
|
|
3
|
+
# JavaScript injected into every page of a context before the page's own
# scripts run. It is wrapped in an IIFE so the helper constants below are
# not declared at the top level of the injected script.
_STEALTH_SCRIPT = """\
(() => {
    // Remove webdriver flag
    Object.defineProperty(navigator, 'webdriver', {get: () => undefined});

    // Realistic languages
    Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'ko']});

    // Mock plugins (Chrome always has these)
    Object.defineProperty(navigator, 'plugins', {
        get: () => {
            const plugins = [
                {name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer'},
                {name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai'},
                {name: 'Native Client', filename: 'internal-nacl-plugin'},
            ];
            plugins.length = 3;
            return plugins;
        }
    });

    // Mock permissions. The original method must stay bound to the
    // Permissions object: calling it detached throws "Illegal invocation".
    const permissions = window.navigator.permissions;
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) =>
        parameters.name === 'notifications'
            ? Promise.resolve({state: Notification.permission})
            : originalQuery(parameters);

    // Chrome runtime mock
    window.chrome = {runtime: {}, loadTimes: function() {}, csi: function() {}};

    // WebGL vendor/renderer (Intel is the most common)
    const getParameter = WebGLRenderingContext.prototype.getParameter;
    WebGLRenderingContext.prototype.getParameter = function(parameter) {
        if (parameter === 37445) return 'Intel Inc.';  // UNMASKED_VENDOR_WEBGL
        if (parameter === 37446) return 'Intel Iris OpenGL Engine';  // UNMASKED_RENDERER_WEBGL
        return getParameter.call(this, parameter);
    };
})();
"""


async def apply_stealth(context) -> None:
    """Register the stealth script as an init script on a browser context.

    Args:
        context: Object exposing an awaitable ``add_init_script(script)``
            method; the caller in this package passes a Playwright browser
            context.
    """
    await context.add_init_script(_STEALTH_SCRIPT)
|
markgrab/core.py
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
"""Main orchestrator — route URL to appropriate engine and parser."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import random
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
from markgrab.engine.base import USER_AGENTS, Engine
|
|
10
|
+
from markgrab.engine.browser import BrowserEngine
|
|
11
|
+
from markgrab.engine.http import HttpEngine
|
|
12
|
+
from markgrab.filter.truncate import truncate_result
|
|
13
|
+
from markgrab.parser.html import HtmlParser
|
|
14
|
+
from markgrab.parser.youtube import YouTubeParser, _extract_video_id
|
|
15
|
+
from markgrab.result import ExtractResult
|
|
16
|
+
|
|
17
|
+
logger = logging.getLogger(__name__)
|
|
18
|
+
|
|
19
|
+
# Minimum word count — below this, content is likely SPA/JS-only
|
|
20
|
+
_MIN_WORD_COUNT = 50
|
|
21
|
+
|
|
22
|
+
_OEMBED_URL = "https://www.youtube.com/oembed?url={url}&format=json"
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
import playwright # noqa: F401
|
|
26
|
+
|
|
27
|
+
_BROWSER_AVAILABLE = True
|
|
28
|
+
except ImportError:
|
|
29
|
+
_BROWSER_AVAILABLE = False
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _detect_type_from_url(url: str) -> str:
    """Classify a URL as "youtube", "pdf", "docx" or "html" from its shape.

    YouTube is recognised by host name (``youtube.com``, ``youtu.be`` and
    their subdomains); PDF and DOCX by a case-insensitive path suffix.
    Everything else is treated as HTML.

    Args:
        url: The URL to classify.

    Returns:
        One of "youtube", "pdf", "docx", "html".
    """
    parsed = urlparse(url)

    # ``hostname`` is lower-cased and excludes userinfo and port, so hosts such
    # as "notyoutube.com", "youtube.com.evil.tld" or "youtube.com@evil.tld"
    # no longer match the way a substring test on ``netloc`` did.
    host = (parsed.hostname or "").rstrip(".")
    for domain in ("youtube.com", "youtu.be"):
        if host == domain or host.endswith("." + domain):
            return "youtube"

    path = parsed.path.lower()
    if path.endswith(".pdf"):
        return "pdf"
    if path.endswith(".docx"):
        return "docx"

    return "html"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
async def _fetch_with_fallback(
    url: str,
    *,
    engine: Engine | None = None,
    timeout: float = 30.0,
    proxy: str | None = None,
    stealth: bool = False,
):
    """Fetch ``url`` with the primary engine, retrying in a browser on failure.

    The primary engine is ``engine`` when given, otherwise a new HttpEngine.
    If it raises and Playwright is importable, the fetch is retried with a
    BrowserEngine; without Playwright the original exception propagates.
    """
    primary = engine if engine else HttpEngine(proxy=proxy)
    try:
        fetched = await primary.fetch(url, timeout=timeout)
    except Exception as exc:
        if not _BROWSER_AVAILABLE:
            raise
        logger.info("HTTP failed for %s (%s), falling back to browser", url, type(exc).__name__)
        browser_engine = BrowserEngine(proxy=proxy, stealth=stealth)
        return await browser_engine.fetch(url, timeout=timeout)
    return fetched
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
async def _fetch_youtube_title(url: str, timeout: float = 30.0) -> str:
    """Fetch a YouTube video title through the oEmbed endpoint.

    This is best-effort: any failure (network error, non-200 status,
    unexpected payload) is logged at debug level and yields "".

    Args:
        url: The video URL to look up.
        timeout: Request timeout in seconds.

    Returns:
        The video title, or "" when it could not be determined.
    """
    # Function-scope import: this module only imports ``urlparse`` at file level.
    from urllib.parse import quote

    try:
        # The video URL is itself a query-string value, so it must be fully
        # percent-encoded; otherwise its own "&"/"#" would be read as part of
        # the oEmbed request.
        oembed_url = _OEMBED_URL.format(url=quote(url, safe=""))
        async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
            resp = await client.get(oembed_url)
            if resp.status_code == 200:
                title = resp.json().get("title")
                # Guard the declared ``str`` return type against null/non-string titles.
                return title if isinstance(title, str) else ""
    except Exception:
        logger.debug("Failed to fetch YouTube oEmbed title for %s", url, exc_info=True)
    return ""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
async def _fetch_bytes(url: str, *, timeout: float = 30.0, proxy: str | None = None) -> tuple[bytes, str]:
    """Download ``url`` and return ``(body_bytes, final_url)``.

    Redirects are followed; a non-success status raises through
    ``raise_for_status()``. The User-Agent is picked at random from
    USER_AGENTS.
    """
    client = httpx.AsyncClient(
        headers={
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "*/*",
        },
        follow_redirects=True,
        timeout=httpx.Timeout(timeout),
        proxy=proxy,
    )
    async with client:
        response = await client.get(url)
        response.raise_for_status()
        body = response.content
        landed_at = str(response.url)
        return body, landed_at
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
async def _extract_youtube(url: str, *, timeout: float = 30.0, max_chars: int = 50_000) -> ExtractResult:
    """Build an ExtractResult for a YouTube URL and truncate it to ``max_chars``.

    The video id is resolved first, then the title is looked up via oEmbed,
    and both are handed to YouTubeParser.
    """
    video_id = _extract_video_id(url)
    video_title = await _fetch_youtube_title(url, timeout=timeout)
    parsed = YouTubeParser().parse(video_id=video_id, url=url, title=video_title)
    return truncate_result(parsed, max_chars=max_chars)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
async def _extract_binary(
    url: str,
    content_type: str,
    *,
    timeout: float = 30.0,
    max_chars: int = 50_000,
    proxy: str | None = None,
) -> ExtractResult:
    """Download a binary document and parse it with the matching parser.

    ``content_type`` must be "pdf" or "docx"; any other value raises
    ValueError after the download has completed. The parser modules are
    imported only when their type is requested.
    """
    payload, final_url = await _fetch_bytes(url, timeout=timeout, proxy=proxy)

    if content_type not in ("pdf", "docx"):
        raise ValueError(f"Unknown binary content type: {content_type}")

    if content_type == "pdf":
        from markgrab.parser.pdf import PdfParser as parser_cls
    else:
        from markgrab.parser.docx import DocxParser as parser_cls

    parsed = parser_cls().parse(payload, url=final_url)
    return truncate_result(parsed, max_chars=max_chars)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
async def extract(
    url: str,
    *,
    engine: Engine | None = None,
    max_chars: int = 50_000,
    use_browser: bool = False,
    stealth: bool = False,
    timeout: float = 30.0,
    proxy: str | None = None,
) -> ExtractResult:
    """Extract content from URL and return ExtractResult.

    Args:
        url: Target URL to extract content from.
        engine: Custom engine instance (default: HttpEngine, with browser fallback).
        max_chars: Maximum characters for text/markdown (default 50K).
        use_browser: Force Playwright browser rendering.
        stealth: Apply anti-bot stealth scripts when using browser (default: False).
        timeout: Request timeout in seconds.
        proxy: Proxy URL (e.g., "http://proxy:8080", "socks5://proxy:1080").

    Returns:
        The parsed result, passed through ``truncate_result`` with ``max_chars``.

    Raises:
        ImportError: If ``use_browser`` is set but Playwright is not installed.
    """
    url_type = _detect_type_from_url(url)

    # YouTube — dedicated parser (no engine needed)
    if url_type == "youtube":
        return await _extract_youtube(url, timeout=timeout, max_chars=max_chars)

    # PDF / DOCX — binary fetch + dedicated parser
    if url_type in ("pdf", "docx"):
        return await _extract_binary(url, url_type, timeout=timeout, max_chars=max_chars, proxy=proxy)

    # HTML flow — engine + parser + fallback
    if use_browser:
        if not _BROWSER_AVAILABLE:
            raise ImportError("Playwright not installed. Run: pip install 'markgrab[browser]'")
        fetch_result = await (engine or BrowserEngine(proxy=proxy, stealth=stealth)).fetch(url, timeout=timeout)
    else:
        if _BROWSER_AVAILABLE:
            fetch_result = await _fetch_with_fallback(url, engine=engine, timeout=timeout, proxy=proxy, stealth=stealth)
        else:
            fetch_result = await (engine or HttpEngine(proxy=proxy)).fetch(url, timeout=timeout)

    # Content-Type header may reveal PDF even without .pdf extension.
    # Media types are case-insensitive, so compare in lower case.
    if "application/pdf" in fetch_result.content_type.lower():
        data, final_url = await _fetch_bytes(url, timeout=timeout, proxy=proxy)
        from markgrab.parser.pdf import PdfParser

        result = PdfParser().parse(data, url=final_url)
        return truncate_result(result, max_chars=max_chars)

    # Parse HTML
    parser = HtmlParser()
    result = parser.parse(fetch_result.html, url=fetch_result.final_url)

    # Auto-fallback: thin content likely means SPA/JS-only page
    if not use_browser and _BROWSER_AVAILABLE and result.word_count < _MIN_WORD_COUNT:
        logger.info("Thin content (%d words) for %s, retrying with browser", result.word_count, url)
        try:
            browser_result = await BrowserEngine(proxy=proxy, stealth=stealth).fetch(url, timeout=timeout)
            browser_parsed = parser.parse(browser_result.html, url=browser_result.final_url)
            if browser_parsed.word_count > result.word_count:
                result = browser_parsed
        except Exception:
            # Best-effort retry: keep the original result, but leave a trace
            # of why the browser attempt did not help.
            logger.debug("Browser retry failed for %s; keeping HTTP result", url, exc_info=True)

    return truncate_result(result, max_chars=max_chars)
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""Content fetching engines."""
|
|
2
|
+
|
|
3
|
+
from markgrab.engine.base import USER_AGENTS, Engine, FetchResult
|
|
4
|
+
from markgrab.engine.browser import BrowserEngine
|
|
5
|
+
from markgrab.engine.http import HttpEngine
|
|
6
|
+
|
|
7
|
+
__all__ = ["USER_AGENTS", "Engine", "FetchResult", "HttpEngine", "BrowserEngine"]
|
markgrab/engine/base.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Engine base — content fetching abstraction."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class FetchResult:
    """What an engine hands back after fetching a single URL."""

    # Response body as text (the engines fill this from the HTTP response text
    # or from the rendered page content).
    html: str
    # HTTP status code of the response.
    status_code: int
    # Value of the response's content-type header.
    content_type: str
    # URL the fetch ended on, as reported by the engine.
    final_url: str
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Shared User-Agent pool — used by HttpEngine, core._fetch_bytes, etc.
|
|
18
|
+
USER_AGENTS = [
|
|
19
|
+
(
|
|
20
|
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
|
|
21
|
+
" (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
|
22
|
+
),
|
|
23
|
+
(
|
|
24
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
|
25
|
+
" (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
|
26
|
+
),
|
|
27
|
+
(
|
|
28
|
+
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
|
|
29
|
+
" (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
|
|
30
|
+
),
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Engine(ABC):
    """Interface every content-fetching engine implements.

    Subclasses provide ``fetch``; the optional proxy URL is stored on the
    instance for them to use.
    """

    def __init__(self, *, proxy: str | None = None):
        """Store the proxy URL (or None) on the instance."""
        self.proxy = proxy

    @abstractmethod
    async def fetch(self, url: str, *, timeout: float = 30.0) -> FetchResult:
        """Fetch ``url`` within ``timeout`` seconds and return a FetchResult."""
        ...
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Browser engine — Playwright headless for JS-rendered and bot-protected pages."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from markgrab.engine.base import Engine, FetchResult
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class BrowserEngine(Engine):
    """Playwright-based browser engine for JS-heavy and bot-protected sites.

    Requires: pip install markgrab[browser]
    Playwright is imported lazily — the class can be imported without playwright installed.

    Args:
        proxy: Proxy URL.
        stealth: Apply anti-bot stealth scripts (default: False).
    """

    def __init__(self, *, proxy: str | None = None, stealth: bool = False):
        super().__init__(proxy=proxy)
        self.stealth = stealth

    async def fetch(self, url: str, *, timeout: float = 30.0) -> FetchResult:
        """Load ``url`` in headless Chromium and return the rendered page.

        Navigation waits for ``domcontentloaded``, then makes a short
        best-effort wait for network idle before the DOM is serialised.

        Args:
            url: Page to load.
            timeout: Navigation timeout in seconds.

        Returns:
            FetchResult with the rendered HTML, the main response's status and
            content-type (200 / "text/html" when no response object is
            available), and the page's final URL.
        """
        from playwright.async_api import async_playwright

        timeout_ms = int(timeout * 1000)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context_kwargs: dict = {
                    "viewport": {"width": 1920, "height": 1080},
                    "locale": "en-US",
                    "timezone_id": "America/New_York",
                }
                if self.proxy:
                    context_kwargs["proxy"] = {"server": self.proxy}

                context = await browser.new_context(**context_kwargs)
                if self.stealth:
                    from markgrab.anti_bot.stealth import apply_stealth

                    await apply_stealth(context)

                page = await context.new_page()
                response = await page.goto(
                    url,
                    wait_until="domcontentloaded",
                    timeout=timeout_ms,
                )

                # Best-effort wait for JS rendering (max 5s or half timeout).
                # Clamp to at least 1 ms: Playwright reads a timeout of 0 as
                # "wait forever", which a very small ``timeout`` would
                # otherwise produce through the integer division.
                networkidle_ms = max(1, min(5000, timeout_ms // 2))
                try:
                    await page.wait_for_load_state("networkidle", timeout=networkidle_ms)
                except Exception:
                    # DOM content is enough; record why the wait ended early.
                    logger.debug("networkidle wait did not complete for %s", url, exc_info=True)

                html = await page.content()
                status = response.status if response else 200
                headers = response.headers if response else {}

                return FetchResult(
                    html=html,
                    status_code=status,
                    content_type=headers.get("content-type", "text/html"),
                    final_url=page.url,
                )
            finally:
                # Closing the browser also tears down the context and page.
                await browser.close()
|
markgrab/engine/http.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""HTTP engine — lightweight fetching with httpx."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import random
|
|
5
|
+
|
|
6
|
+
import httpx
|
|
7
|
+
|
|
8
|
+
from markgrab.engine.base import USER_AGENTS, Engine, FetchResult
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class HttpEngine(Engine):
    """Plain HTTP fetcher built on httpx (no JavaScript execution)."""

    async def fetch(self, url: str, *, timeout: float = 30.0) -> FetchResult:
        """GET ``url`` with browser-like headers and return the decoded body.

        Redirects are followed and a non-success status raises through
        ``raise_for_status()``. The User-Agent is chosen at random from
        USER_AGENTS for each call.
        """
        request_headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
        }
        client = httpx.AsyncClient(
            headers=request_headers,
            follow_redirects=True,
            timeout=httpx.Timeout(timeout),
            proxy=self.proxy,
        )

        async with client:
            response = await client.get(url)
            response.raise_for_status()

            body = response.text
            media_type = response.headers.get("content-type", "")
            return FetchResult(
                html=body,
                status_code=response.status_code,
                content_type=media_type,
                final_url=str(response.url),
            )
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Content density filter — remove sidebars and navigation from content area."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
|
|
5
|
+
from bs4 import Tag
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
# Block-level elements to analyze for link density
|
|
10
|
+
_BLOCK_TAGS = frozenset({"div", "section", "ul", "ol", "table", "form", "dl"})
|
|
11
|
+
|
|
12
|
+
# Class/id patterns that indicate sidebar/non-content blocks
|
|
13
|
+
_SIDEBAR_PATTERNS = (
|
|
14
|
+
"sidebar",
|
|
15
|
+
"related",
|
|
16
|
+
"widget",
|
|
17
|
+
"toc",
|
|
18
|
+
"breadcrumb",
|
|
19
|
+
"social",
|
|
20
|
+
"share",
|
|
21
|
+
"comment",
|
|
22
|
+
"advert",
|
|
23
|
+
"promo",
|
|
24
|
+
"recommend",
|
|
25
|
+
"popular",
|
|
26
|
+
"trending",
|
|
27
|
+
"signup",
|
|
28
|
+
"newsletter",
|
|
29
|
+
"subscribe",
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
# Link density above this = likely navigation, not content
|
|
33
|
+
_LINK_DENSITY_THRESHOLD = 0.5
|
|
34
|
+
|
|
35
|
+
# Minimum text length to consider for link density analysis
|
|
36
|
+
_MIN_BLOCK_TEXT = 25
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def filter_low_density(content: Tag) -> None:
    """Remove low-density sidebar/navigation blocks from content area in-place.

    Three-pass approach:
    1. Remove <aside>/<nav> tags (semantically non-content)
    2. Remove elements matching sidebar class/id patterns
    3. Remove direct block children with high link density

    Args:
        content: Root element of the content area; it is modified in place.
    """
    # Pass 1: semantic non-content tags inside content
    for tag in content.find_all(["aside", "nav"]):
        if tag.attrs is None:
            # Nested inside an <aside>/<nav> removed earlier in this loop;
            # same already-decomposed check that Pass 2 uses.
            continue
        logger.debug("Removing <%s> from content", tag.name)
        tag.decompose()

    # Pass 2: sidebar/widget pattern matching (substring match on class and id)
    for pattern in _SIDEBAR_PATTERNS:
        for selector in (f"[class*='{pattern}']", f"[id*='{pattern}']"):
            for el in content.select(selector):
                if el.attrs is None:
                    continue  # Already decomposed by a prior pattern
                logger.debug("Removing sidebar pattern '%s': %s", pattern, el.get("class") or el.get("id"))
                el.decompose()

    # Pass 3: link density on direct block children.
    # Iterate over a snapshot because children are removed during the loop.
    for child in list(content.children):
        if not isinstance(child, Tag):
            continue
        if child.name not in _BLOCK_TAGS:
            continue

        text = child.get_text(strip=True)
        if not text or len(text) < _MIN_BLOCK_TEXT:
            continue  # Too short to judge

        links_text = "".join(a.get_text(strip=True) for a in child.find_all("a"))
        if not links_text:
            continue

        # Share of the block's visible text that sits inside <a> elements.
        link_ratio = len(links_text) / len(text)
        if link_ratio > _LINK_DENSITY_THRESHOLD:
            logger.debug("Removing high link-density block (%.0f%%): <%s>", link_ratio * 100, child.name)
            child.decompose()
|
markgrab/filter/noise.py
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""Noise filter — remove ads, navigation, popups from HTML."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
|
|
5
|
+
from bs4 import BeautifulSoup
|
|
6
|
+
|
|
7
|
+
_NOISE_TAGS = frozenset({"script", "style", "noscript", "svg", "iframe"})
|
|
8
|
+
|
|
9
|
+
_POPUP_SELECTORS = [
|
|
10
|
+
"[class*='cookie']",
|
|
11
|
+
"[class*='consent']",
|
|
12
|
+
"[class*='popup']",
|
|
13
|
+
"[class*='modal']",
|
|
14
|
+
"[id*='cookie']",
|
|
15
|
+
"[id*='consent']",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def clean_soup(soup: BeautifulSoup) -> None:
    """Remove noise elements from soup in-place.

    Removes: script/style/noscript/svg/iframe tags,
    cookie/consent/popup/modal elements, hidden elements
    (``aria-hidden="true"`` or an inline ``display: none`` style).

    Args:
        soup: Parsed document; it is modified in place.
    """
    for tag in soup.find_all(list(_NOISE_TAGS)):
        if tag.attrs is None:
            continue  # Nested inside a noise tag that was already decomposed
        tag.decompose()

    for selector in _POPUP_SELECTORS:
        for el in soup.select(selector):
            if el.attrs is None:
                continue  # Already decomposed by a prior selector
            el.decompose()

    for el in soup.find_all(attrs={"aria-hidden": "true"}):
        if el.attrs is None:
            continue
        el.decompose()

    # CSS property names and keywords are case-insensitive and whitespace is
    # allowed on both sides of the colon, so accept "DISPLAY : none" as well.
    hidden_style = re.compile(r"display\s*:\s*none", re.IGNORECASE)
    for el in soup.find_all(style=hidden_style):
        if el.attrs is None:
            continue
        el.decompose()
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Truncate filter — limit content length."""
|
|
2
|
+
|
|
3
|
+
from markgrab.result import ExtractResult
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _cut_at_last_newline(value: str, limit: int) -> str:
    """Clip ``value`` to ``limit`` chars, back off to the last newline, add a marker.

    Values that already fit are returned unchanged. When the clipped chunk
    contains no newline the whole chunk is kept.
    """
    if len(value) <= limit:
        return value
    chunk = value[:limit]
    newline_at = chunk.rfind("\n")
    kept = chunk if newline_at == -1 else chunk[:newline_at]
    return kept + "\n\n[truncated]"


def truncate_result(result: ExtractResult, *, max_chars: int = 50_000) -> ExtractResult:
    """Limit the ``text`` and ``markdown`` fields of a result to ``max_chars``.

    Each over-long field is cut at ``max_chars``, trimmed back to the last
    newline inside that window, and suffixed with a "[truncated]" marker.
    The same object is returned untouched when ``max_chars`` is not positive
    or both fields already fit; otherwise a new ExtractResult is built with
    ``word_count`` recomputed from the resulting text.
    """
    fits = len(result.text) <= max_chars and len(result.markdown) <= max_chars
    if max_chars <= 0 or fits:
        return result

    clipped_text = _cut_at_last_newline(result.text, max_chars)
    clipped_markdown = _cut_at_last_newline(result.markdown, max_chars)

    return ExtractResult(
        title=result.title,
        text=clipped_text,
        markdown=clipped_markdown,
        word_count=len(clipped_text.split()),
        language=result.language,
        content_type=result.content_type,
        source_url=result.source_url,
        metadata=result.metadata,
    )
|
|
File without changes
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Content parsers."""
|
|
2
|
+
|
|
3
|
+
from markgrab.parser.base import Parser
|
|
4
|
+
from markgrab.parser.docx import DocxParser
|
|
5
|
+
from markgrab.parser.html import HtmlParser
|
|
6
|
+
from markgrab.parser.pdf import PdfParser
|
|
7
|
+
from markgrab.parser.youtube import YouTubeParser
|
|
8
|
+
|
|
9
|
+
__all__ = ["Parser", "HtmlParser", "YouTubeParser", "PdfParser", "DocxParser"]
|
markgrab/parser/base.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Parser base — content parsing abstraction."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
|
|
5
|
+
from markgrab.result import ExtractResult
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class Parser(ABC):
    """Interface for turning fetched content into an ExtractResult."""

    @abstractmethod
    def parse(self, html: str, url: str) -> ExtractResult:
        """Parse ``html`` fetched from ``url`` and return the extracted result."""
        ...
|