markgrab 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
markgrab/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """MarkGrab — Universal web content extraction."""
2
+
3
+ from markgrab.core import extract
4
+ from markgrab.result import ExtractResult
5
+
6
+ __all__ = ["extract", "ExtractResult"]
7
+ __version__ = "0.1.0"
markgrab/__main__.py ADDED
@@ -0,0 +1,70 @@
1
+ """CLI entry point — python -m markgrab or `markgrab` command."""
2
+
3
+ import argparse
4
+ import asyncio
5
+ import json
6
+ import sys
7
+
8
+ from markgrab import extract
9
+
10
+
11
def main():
    """Run the ``markgrab`` command-line interface.

    Parses the command line, runs :func:`extract` for the given URL and
    prints the result to stdout as markdown (default), plain text or JSON.

    Exits with status 130 on Ctrl-C and status 1 (after printing the error
    to stderr) when extraction raises any other exception.
    """
    cli = argparse.ArgumentParser(
        prog="markgrab",
        description="MarkGrab — extract web content as LLM-ready markdown",
    )
    cli.add_argument("url", help="URL to extract content from")
    cli.add_argument(
        "--max-chars",
        type=int,
        default=50_000,
        help="Max output characters (default: 50000)",
    )
    cli.add_argument(
        "--browser",
        action="store_true",
        help="Force Playwright browser rendering",
    )
    cli.add_argument(
        "--timeout",
        type=float,
        default=30.0,
        help="Request timeout in seconds (default: 30)",
    )
    cli.add_argument("--proxy", help="Proxy URL (e.g., http://proxy:8080)")
    cli.add_argument(
        "--format", "-f",
        choices=["markdown", "text", "json"],
        default="markdown",
        help="Output format (default: markdown)",
    )
    opts = cli.parse_args()

    try:
        result = asyncio.run(
            extract(
                opts.url,
                max_chars=opts.max_chars,
                use_browser=opts.browser,
                timeout=opts.timeout,
                proxy=opts.proxy,
            )
        )
    except KeyboardInterrupt:
        # Conventional exit status for a process stopped by SIGINT.
        sys.exit(130)
    except Exception as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    if opts.format == "json":
        exported = (
            "title",
            "text",
            "markdown",
            "word_count",
            "language",
            "content_type",
            "source_url",
            "metadata",
        )
        payload = {field: getattr(result, field) for field in exported}
        print(json.dumps(payload, ensure_ascii=False, indent=2))
        return

    if opts.format == "text":
        if result.title:
            print(f"Title: {result.title}")
        print(f"Words: {result.word_count} | Language: {result.language} | Type: {result.content_type}")
        print("---")
        print(result.text)
        return

    # Markdown: title as a heading, stats hidden in an HTML comment.
    if result.title:
        print(f"# {result.title}")
    print(f"<!-- words: {result.word_count} | lang: {result.language} | type: {result.content_type} -->")
    print()
    print(result.markdown)
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
File without changes
@@ -0,0 +1,45 @@
1
+ """Stealth settings for Playwright to avoid bot detection."""
2
+
3
# JavaScript injected into every page of a context before any page script runs.
# The whole script lives inside an IIFE so its helper bindings (originalQuery,
# getParameter, ...) do not become page-visible globals that could collide with
# the site's own top-level declarations.
_STEALTH_SCRIPT = """\
(() => {
  // Remove webdriver flag
  Object.defineProperty(navigator, 'webdriver', {get: () => undefined});

  // Realistic languages
  Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en', 'ko']});

  // Mock plugins (Chrome always has these)
  Object.defineProperty(navigator, 'plugins', {
    get: () => {
      const plugins = [
        {name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer'},
        {name: 'Chrome PDF Viewer', filename: 'mhjfbmdgcfjbbpaeojofohoefgiehjai'},
        {name: 'Native Client', filename: 'internal-nacl-plugin'},
      ];
      plugins.length = 3;
      return plugins;
    }
  });

  // Mock permissions
  const permissions = window.navigator.permissions;
  if (permissions && typeof permissions.query === 'function') {
    // Keep the receiver: a detached query() rejects with "Illegal invocation".
    const originalQuery = permissions.query.bind(permissions);
    permissions.query = (parameters) =>
      parameters && parameters.name === 'notifications'
        ? Promise.resolve({state: Notification.permission})
        : originalQuery(parameters);
  }

  // Chrome runtime mock
  window.chrome = {runtime: {}, loadTimes: function() {}, csi: function() {}};

  // WebGL vendor/renderer (Intel is the most common)
  if (typeof WebGLRenderingContext !== 'undefined') {
    const getParameter = WebGLRenderingContext.prototype.getParameter;
    WebGLRenderingContext.prototype.getParameter = function(parameter) {
      if (parameter === 37445) return 'Intel Inc.'; // UNMASKED_VENDOR_WEBGL
      if (parameter === 37446) return 'Intel Iris OpenGL Engine'; // UNMASKED_RENDERER_WEBGL
      return getParameter.call(this, parameter);
    };
  }
})();
"""


async def apply_stealth(context) -> None:
    """Register the stealth init script on a Playwright browser context.

    Args:
        context: Object exposing an awaitable ``add_init_script(script)``
            method (a Playwright ``BrowserContext``).
    """
    await context.add_init_script(_STEALTH_SCRIPT)
markgrab/core.py ADDED
@@ -0,0 +1,196 @@
1
+ """Main orchestrator — route URL to appropriate engine and parser."""
2
+
3
+ import logging
4
+ import random
5
+ from urllib.parse import urlparse
6
+
7
+ import httpx
8
+
9
+ from markgrab.engine.base import USER_AGENTS, Engine
10
+ from markgrab.engine.browser import BrowserEngine
11
+ from markgrab.engine.http import HttpEngine
12
+ from markgrab.filter.truncate import truncate_result
13
+ from markgrab.parser.html import HtmlParser
14
+ from markgrab.parser.youtube import YouTubeParser, _extract_video_id
15
+ from markgrab.result import ExtractResult
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Minimum word count — below this, content is likely SPA/JS-only
20
+ _MIN_WORD_COUNT = 50
21
+
22
+ _OEMBED_URL = "https://www.youtube.com/oembed?url={url}&format=json"
23
+
24
+ try:
25
+ import playwright # noqa: F401
26
+
27
+ _BROWSER_AVAILABLE = True
28
+ except ImportError:
29
+ _BROWSER_AVAILABLE = False
30
+
31
+
32
def _detect_type_from_url(url: str) -> str:
    """Classify a URL as ``"youtube"``, ``"pdf"``, ``"docx"`` or ``"html"``.

    YouTube is recognised by host name: exactly ``youtube.com`` / ``youtu.be``
    or any subdomain of them. The comparison uses the parsed, lower-cased
    host, so ports, userinfo and letter case do not matter, and look-alike
    hosts such as ``youtube.com.evil.example`` are not matched. PDF and DOCX
    are recognised by a case-insensitive path suffix; everything else is
    treated as HTML.

    Args:
        url: The URL to classify.

    Returns:
        One of ``"youtube"``, ``"pdf"``, ``"docx"``, ``"html"``.
    """
    parsed = urlparse(url)

    # ``hostname`` is already lower-cased and stripped of port/userinfo.
    host = (parsed.hostname or "").rstrip(".")
    if host in ("youtube.com", "youtu.be") or host.endswith((".youtube.com", ".youtu.be")):
        return "youtube"

    path = parsed.path.lower()
    if path.endswith(".pdf"):
        return "pdf"
    if path.endswith(".docx"):
        return "docx"

    return "html"
45
+
46
+
47
async def _fetch_with_fallback(
    url: str,
    *,
    engine: Engine | None = None,
    timeout: float = 30.0,
    proxy: str | None = None,
    stealth: bool = False,
):
    """Fetch ``url`` with the primary engine, retrying in a browser on failure.

    The primary engine is ``engine`` when given, otherwise a fresh
    ``HttpEngine``. If its ``fetch`` raises and Playwright is importable, the
    failure is logged and the URL is fetched again with ``BrowserEngine``.
    Without Playwright the original exception propagates unchanged.
    """
    primary = engine if engine else HttpEngine(proxy=proxy)
    try:
        fetched = await primary.fetch(url, timeout=timeout)
    except Exception as exc:
        if not _BROWSER_AVAILABLE:
            raise
        logger.info("HTTP failed for %s (%s), falling back to browser", url, type(exc).__name__)
        fallback = BrowserEngine(proxy=proxy, stealth=stealth)
        return await fallback.fetch(url, timeout=timeout)
    return fetched
64
+
65
+
66
async def _fetch_youtube_title(url: str, timeout: float = 30.0) -> str:
    """Fetch a YouTube video title via the oEmbed endpoint.

    The video URL is percent-encoded before being placed in the oEmbed query
    string; otherwise any ``&`` or ``#`` in it (e.g. ``watch?v=ID&t=10s``)
    would be parsed as part of the oEmbed request itself.

    This is best-effort: any failure (network error, non-200 status,
    unexpected payload) is logged at debug level and yields ``""``.

    Args:
        url: The YouTube video URL.
        timeout: Request timeout in seconds.

    Returns:
        The video title, or an empty string when it cannot be determined.
    """
    from urllib.parse import quote

    try:
        oembed_url = _OEMBED_URL.format(url=quote(url, safe=""))
        async with httpx.AsyncClient(timeout=httpx.Timeout(timeout)) as client:
            resp = await client.get(oembed_url)
            if resp.status_code == 200:
                # ``or ""`` keeps the declared ``str`` return for a null title.
                return resp.json().get("title", "") or ""
    except Exception:
        logger.debug("Failed to fetch YouTube oEmbed title for %s", url, exc_info=True)
    return ""
77
+
78
+
79
async def _fetch_bytes(url: str, *, timeout: float = 30.0, proxy: str | None = None) -> tuple[bytes, str]:
    """Download ``url`` and return ``(body_bytes, final_url)``.

    Sends a randomly chosen User-Agent from ``USER_AGENTS``, follows
    redirects, and raises via ``raise_for_status()`` on an error status.
    ``final_url`` is the response URL after redirects.
    """
    request_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "*/*",
    }
    client = httpx.AsyncClient(
        headers=request_headers,
        follow_redirects=True,
        timeout=httpx.Timeout(timeout),
        proxy=proxy,
    )
    async with client:
        response = await client.get(url)
        response.raise_for_status()
        payload = response.content
        landed_at = str(response.url)
    return payload, landed_at
94
+
95
+
96
async def _extract_youtube(url: str, *, timeout: float = 30.0, max_chars: int = 50_000) -> ExtractResult:
    """Build an ``ExtractResult`` for a YouTube URL.

    Resolves the video id from the URL, looks up the title through oEmbed,
    hands both to ``YouTubeParser`` and truncates the parsed result to
    ``max_chars``.
    """
    vid = _extract_video_id(url)
    video_title = await _fetch_youtube_title(url, timeout=timeout)

    parsed = YouTubeParser().parse(video_id=vid, url=url, title=video_title)
    return truncate_result(parsed, max_chars=max_chars)
104
+
105
+
106
async def _extract_binary(
    url: str,
    content_type: str,
    *,
    timeout: float = 30.0,
    max_chars: int = 50_000,
    proxy: str | None = None,
) -> ExtractResult:
    """Extract content from binary URLs (PDF, DOCX).

    Args:
        url: URL of the document to download.
        content_type: ``"pdf"`` or ``"docx"``.
        timeout: Request timeout in seconds.
        max_chars: Maximum characters kept in the text/markdown output.
        proxy: Optional proxy URL for the download.

    Returns:
        The parsed document, truncated to ``max_chars``.

    Raises:
        ValueError: If ``content_type`` is not ``"pdf"`` or ``"docx"``. This
            is checked before any network traffic so an unsupported type
            never costs a full download.
    """
    if content_type not in ("pdf", "docx"):
        raise ValueError(f"Unknown binary content type: {content_type}")

    data, final_url = await _fetch_bytes(url, timeout=timeout, proxy=proxy)

    # Parsers are imported lazily so their dependencies are only needed when
    # that document type is actually requested.
    if content_type == "pdf":
        from markgrab.parser.pdf import PdfParser

        result = PdfParser().parse(data, url=final_url)
    else:
        from markgrab.parser.docx import DocxParser

        result = DocxParser().parse(data, url=final_url)

    return truncate_result(result, max_chars=max_chars)
129
+
130
+
131
async def extract(
    url: str,
    *,
    engine: Engine | None = None,
    max_chars: int = 50_000,
    use_browser: bool = False,
    stealth: bool = False,
    timeout: float = 30.0,
    proxy: str | None = None,
) -> ExtractResult:
    """Extract content from URL and return ExtractResult.

    Routing:
        * YouTube URLs go to the transcript parser (no engine involved).
        * ``.pdf`` / ``.docx`` URLs are downloaded as bytes and parsed.
        * Everything else is fetched as HTML. If the response turns out to be
          a PDF (by Content-Type) it is re-downloaded as bytes and parsed as
          PDF. If the HTML yields fewer than ``_MIN_WORD_COUNT`` words and
          Playwright is available, the page is retried in a browser and the
          richer of the two results is kept.

    Args:
        url: Target URL to extract content from.
        engine: Custom engine instance (default: HttpEngine, with browser fallback).
        max_chars: Maximum characters for text/markdown (default 50K).
        use_browser: Force Playwright browser rendering.
        stealth: Apply anti-bot stealth scripts when using browser (default: False).
        timeout: Request timeout in seconds.
        proxy: Proxy URL (e.g., "http://proxy:8080", "socks5://proxy:1080").

    Returns:
        The extracted content, truncated to ``max_chars``.

    Raises:
        ImportError: If ``use_browser`` is set but Playwright is not installed.
    """
    url_type = _detect_type_from_url(url)

    # YouTube — dedicated parser (no engine needed)
    if url_type == "youtube":
        return await _extract_youtube(url, timeout=timeout, max_chars=max_chars)

    # PDF / DOCX — binary fetch + dedicated parser
    if url_type in ("pdf", "docx"):
        return await _extract_binary(url, url_type, timeout=timeout, max_chars=max_chars, proxy=proxy)

    # HTML flow — engine + parser + fallback
    if use_browser:
        if not _BROWSER_AVAILABLE:
            raise ImportError("Playwright not installed. Run: pip install 'markgrab[browser]'")
        fetch_result = await (engine or BrowserEngine(proxy=proxy, stealth=stealth)).fetch(url, timeout=timeout)
    elif _BROWSER_AVAILABLE:
        fetch_result = await _fetch_with_fallback(url, engine=engine, timeout=timeout, proxy=proxy, stealth=stealth)
    else:
        fetch_result = await (engine or HttpEngine(proxy=proxy)).fetch(url, timeout=timeout)

    # Content-Type header may reveal PDF even without .pdf extension.
    # Media types are case-insensitive, and a custom engine may leave the
    # field empty, so normalise before comparing.
    if "application/pdf" in (fetch_result.content_type or "").lower():
        data, final_url = await _fetch_bytes(url, timeout=timeout, proxy=proxy)
        from markgrab.parser.pdf import PdfParser

        result = PdfParser().parse(data, url=final_url)
        return truncate_result(result, max_chars=max_chars)

    # Parse HTML
    parser = HtmlParser()
    result = parser.parse(fetch_result.html, url=fetch_result.final_url)

    # Auto-fallback: thin content likely means SPA/JS-only page
    if not use_browser and _BROWSER_AVAILABLE and result.word_count < _MIN_WORD_COUNT:
        logger.info("Thin content (%d words) for %s, retrying with browser", result.word_count, url)
        try:
            browser_result = await BrowserEngine(proxy=proxy, stealth=stealth).fetch(url, timeout=timeout)
            browser_parsed = parser.parse(browser_result.html, url=browser_result.final_url)
            if browser_parsed.word_count > result.word_count:
                result = browser_parsed
        except Exception:
            # Best-effort retry: keep the HTTP result, but leave a trace so a
            # broken browser setup is diagnosable instead of silent.
            logger.debug("Browser retry failed for %s, keeping HTTP result", url, exc_info=True)

    return truncate_result(result, max_chars=max_chars)
@@ -0,0 +1,7 @@
1
+ """Content fetching engines."""
2
+
3
+ from markgrab.engine.base import USER_AGENTS, Engine, FetchResult
4
+ from markgrab.engine.browser import BrowserEngine
5
+ from markgrab.engine.http import HttpEngine
6
+
7
+ __all__ = ["USER_AGENTS", "Engine", "FetchResult", "HttpEngine", "BrowserEngine"]
@@ -0,0 +1,42 @@
1
+ """Engine base — content fetching abstraction."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+
6
+
7
@dataclass
class FetchResult:
    """What an engine hands back after fetching a URL, before any parsing."""

    # Response body as text (HTTP response text, or the rendered page content).
    html: str
    # HTTP status code of the response.
    status_code: int
    # Value of the response's Content-Type header.
    content_type: str
    # URL actually served, i.e. after any redirects.
    final_url: str
15
+
16
+
17
+ # Shared User-Agent pool — used by HttpEngine, core._fetch_bytes, etc.
18
+ USER_AGENTS = [
19
+ (
20
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36"
21
+ " (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
22
+ ),
23
+ (
24
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
25
+ " (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
26
+ ),
27
+ (
28
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
29
+ " (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
30
+ ),
31
+ ]
32
+
33
+
34
class Engine(ABC):
    """Interface every content-fetching engine implements.

    Subclasses provide :meth:`fetch`; the optional proxy URL given at
    construction is stored on ``self.proxy`` for them to use.
    """

    def __init__(self, *, proxy: str | None = None):
        """Remember the proxy URL (``None`` means no proxy was given)."""
        self.proxy = proxy

    @abstractmethod
    async def fetch(self, url: str, *, timeout: float = 30.0) -> FetchResult:
        """Fetch ``url`` within ``timeout`` seconds and return a ``FetchResult``."""
@@ -0,0 +1,72 @@
1
+ """Browser engine — Playwright headless for JS-rendered and bot-protected pages."""
2
+
3
+ import logging
4
+
5
+ from markgrab.engine.base import Engine, FetchResult
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
class BrowserEngine(Engine):
    """Engine that renders pages in headless Chromium through Playwright.

    Requires: pip install markgrab[browser]
    Playwright is only imported inside :meth:`fetch`, so importing this class
    works without Playwright installed.

    Args:
        proxy: Proxy URL.
        stealth: Apply anti-bot stealth scripts (default: False).
    """

    def __init__(self, *, proxy: str | None = None, stealth: bool = False):
        super().__init__(proxy=proxy)
        self.stealth = stealth

    async def fetch(self, url: str, *, timeout: float = 30.0) -> FetchResult:
        """Load ``url`` in a fresh browser and return the rendered page.

        Navigation waits for ``domcontentloaded`` within ``timeout`` seconds,
        then waits for network idle on a best-effort basis. The browser is
        always closed, even when navigation fails.
        """
        from playwright.async_api import async_playwright

        budget_ms = int(timeout * 1000)
        # Best-effort wait for JS rendering (max 5s or half timeout)
        idle_budget_ms = min(5000, budget_ms // 2)

        async with async_playwright() as pw:
            browser = await pw.chromium.launch(headless=True)
            try:
                options: dict = {
                    "viewport": {"width": 1920, "height": 1080},
                    "locale": "en-US",
                    "timezone_id": "America/New_York",
                }
                if self.proxy:
                    options["proxy"] = {"server": self.proxy}
                context = await browser.new_context(**options)

                if self.stealth:
                    from markgrab.anti_bot.stealth import apply_stealth

                    await apply_stealth(context)

                page = await context.new_page()
                response = await page.goto(
                    url,
                    wait_until="domcontentloaded",
                    timeout=budget_ms,
                )

                try:
                    await page.wait_for_load_state("networkidle", timeout=idle_budget_ms)
                except Exception:
                    pass  # DOM content is enough

                rendered = await page.content()
                # goto() may yield no response object; fall back to 200 / text/html.
                if response:
                    status = response.status
                    response_headers = response.headers
                else:
                    status = 200
                    response_headers = {}

                return FetchResult(
                    html=rendered,
                    status_code=status,
                    content_type=response_headers.get("content-type", "text/html"),
                    final_url=page.url,
                )
            finally:
                await browser.close()
@@ -0,0 +1,37 @@
1
+ """HTTP engine — lightweight fetching with httpx."""
2
+
3
+ import logging
4
+ import random
5
+
6
+ import httpx
7
+
8
+ from markgrab.engine.base import USER_AGENTS, Engine, FetchResult
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class HttpEngine(Engine):
    """Plain HTTP engine built on ``httpx``; no JavaScript is executed."""

    async def fetch(self, url: str, *, timeout: float = 30.0) -> FetchResult:
        """GET ``url`` (following redirects) and return the response as text.

        A random User-Agent from ``USER_AGENTS`` is sent with browser-like
        Accept headers. Error statuses raise via ``raise_for_status()``.
        """
        request_headers = {
            "User-Agent": random.choice(USER_AGENTS),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.9,ko;q=0.8",
        }
        client = httpx.AsyncClient(
            headers=request_headers,
            follow_redirects=True,
            timeout=httpx.Timeout(timeout),
            proxy=self.proxy,
        )

        async with client:
            reply = await client.get(url)
            reply.raise_for_status()

            body = reply.text
            media_type = reply.headers.get("content-type", "")
            return FetchResult(
                html=body,
                status_code=reply.status_code,
                content_type=media_type,
                final_url=str(reply.url),
            )
@@ -0,0 +1,7 @@
1
+ """Content filters."""
2
+
3
+ from markgrab.filter.density import filter_low_density
4
+ from markgrab.filter.noise import clean_soup
5
+ from markgrab.filter.truncate import truncate_result
6
+
7
+ __all__ = ["clean_soup", "filter_low_density", "truncate_result"]
@@ -0,0 +1,79 @@
1
+ """Content density filter — remove sidebars and navigation from content area."""
2
+
3
+ import logging
4
+
5
+ from bs4 import Tag
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ # Block-level elements to analyze for link density
10
+ _BLOCK_TAGS = frozenset({"div", "section", "ul", "ol", "table", "form", "dl"})
11
+
12
+ # Class/id patterns that indicate sidebar/non-content blocks
13
+ _SIDEBAR_PATTERNS = (
14
+ "sidebar",
15
+ "related",
16
+ "widget",
17
+ "toc",
18
+ "breadcrumb",
19
+ "social",
20
+ "share",
21
+ "comment",
22
+ "advert",
23
+ "promo",
24
+ "recommend",
25
+ "popular",
26
+ "trending",
27
+ "signup",
28
+ "newsletter",
29
+ "subscribe",
30
+ )
31
+
32
+ # Link density above this = likely navigation, not content
33
+ _LINK_DENSITY_THRESHOLD = 0.5
34
+
35
+ # Minimum text length to consider for link density analysis
36
+ _MIN_BLOCK_TEXT = 25
37
+
38
+
39
def filter_low_density(content: Tag) -> None:
    """Strip sidebar- and navigation-like blocks out of ``content`` in place.

    Works in three passes:
      1. drop every ``<aside>`` / ``<nav>`` element;
      2. drop elements whose ``class`` or ``id`` contains one of
         ``_SIDEBAR_PATTERNS``;
      3. among the direct children that are block tags (``_BLOCK_TAGS``) with
         at least ``_MIN_BLOCK_TEXT`` characters of text, drop those where
         link text makes up more than ``_LINK_DENSITY_THRESHOLD`` of the text.
    """
    # Pass 1: semantic non-content tags inside content
    for semantic in content.find_all(["aside", "nav"]):
        logger.debug("Removing <%s> from content", semantic.name)
        semantic.decompose()

    # Pass 2: sidebar/widget pattern matching
    for pattern in _SIDEBAR_PATTERNS:
        for attribute in ("class", "id"):
            for match in content.select(f"[{attribute}*='{pattern}']"):
                if match.attrs is not None:
                    logger.debug(
                        "Removing sidebar pattern '%s': %s",
                        pattern,
                        match.get("class") or match.get("id"),
                    )
                    match.decompose()
                # attrs is None: already decomposed by a prior pattern

    # Pass 3: link density on direct block children
    block_children = [
        node
        for node in content.children
        if isinstance(node, Tag) and node.name in _BLOCK_TAGS
    ]
    for block in block_children:
        body = block.get_text(strip=True)
        if not (body and len(body) >= _MIN_BLOCK_TEXT):
            continue

        anchor_text = "".join(anchor.get_text(strip=True) for anchor in block.find_all("a"))
        if not anchor_text:
            continue

        density = len(anchor_text) / len(body)
        if density > _LINK_DENSITY_THRESHOLD:
            logger.debug("Removing high link-density block (%.0f%%): <%s>", density * 100, block.name)
            block.decompose()
@@ -0,0 +1,42 @@
1
+ """Noise filter — remove ads, navigation, popups from HTML."""
2
+
3
+ import re
4
+
5
+ from bs4 import BeautifulSoup
6
+
7
+ _NOISE_TAGS = frozenset({"script", "style", "noscript", "svg", "iframe"})
8
+
9
+ _POPUP_SELECTORS = [
10
+ "[class*='cookie']",
11
+ "[class*='consent']",
12
+ "[class*='popup']",
13
+ "[class*='modal']",
14
+ "[id*='cookie']",
15
+ "[id*='consent']",
16
+ ]
17
+
18
+
19
def clean_soup(soup: BeautifulSoup) -> None:
    """Strip noise elements from ``soup`` in place.

    Removes, in order: the tags named in ``_NOISE_TAGS``, elements matching
    the cookie/consent/popup/modal selectors in ``_POPUP_SELECTORS``,
    elements with ``aria-hidden="true"``, and elements whose inline style
    contains ``display: none``.
    """

    def _discard(candidates) -> None:
        # attrs is None on elements already decomposed along with an ancestor.
        for node in candidates:
            if node.attrs is not None:
                node.decompose()

    for noisy in soup.find_all(list(_NOISE_TAGS)):
        noisy.decompose()

    for selector in _POPUP_SELECTORS:
        _discard(soup.select(selector))

    _discard(soup.find_all(attrs={"aria-hidden": "true"}))
    _discard(soup.find_all(style=re.compile(r"display:\s*none")))
@@ -0,0 +1,33 @@
1
+ """Truncate filter — limit content length."""
2
+
3
+ from markgrab.result import ExtractResult
4
+
5
+
6
def _clip_at_newline(value: str, max_chars: int) -> str:
    """Cut ``value`` to ``max_chars`` characters and append a truncation marker.

    The cut is moved back to the last newline inside the limit, but only when
    that keeps at least half of the budget. A lone newline near the start
    (e.g. a title line followed by one very long paragraph) would otherwise
    throw away almost all of the content.
    """
    if len(value) <= max_chars:
        return value

    head = value[:max_chars]
    newline_at = head.rfind("\n")
    if newline_at >= max_chars // 2:
        head = head[:newline_at]
    return head + "\n\n[truncated]"


def truncate_result(result: ExtractResult, *, max_chars: int = 50_000) -> ExtractResult:
    """Limit the ``text`` and ``markdown`` fields to about ``max_chars``.

    Each over-long field is cut at the limit (preferring a nearby preceding
    newline) and gets a ``[truncated]`` marker appended, so the final string
    can exceed ``max_chars`` by the length of the marker.

    Args:
        result: The extraction result to limit.
        max_chars: Character budget per field; ``<= 0`` disables truncation.

    Returns:
        ``result`` itself when nothing needs cutting, otherwise a new
        ``ExtractResult`` with the shortened fields and ``word_count``
        recomputed from the shortened text.
    """
    if max_chars <= 0 or (len(result.text) <= max_chars and len(result.markdown) <= max_chars):
        return result

    text = _clip_at_newline(result.text, max_chars)
    markdown = _clip_at_newline(result.markdown, max_chars)

    return ExtractResult(
        title=result.title,
        text=text,
        markdown=markdown,
        word_count=len(text.split()),
        language=result.language,
        content_type=result.content_type,
        source_url=result.source_url,
        metadata=result.metadata,
    )
File without changes
@@ -0,0 +1,9 @@
1
+ """Content parsers."""
2
+
3
+ from markgrab.parser.base import Parser
4
+ from markgrab.parser.docx import DocxParser
5
+ from markgrab.parser.html import HtmlParser
6
+ from markgrab.parser.pdf import PdfParser
7
+ from markgrab.parser.youtube import YouTubeParser
8
+
9
+ __all__ = ["Parser", "HtmlParser", "YouTubeParser", "PdfParser", "DocxParser"]
@@ -0,0 +1,13 @@
1
+ """Parser base — content parsing abstraction."""
2
+
3
+ from abc import ABC, abstractmethod
4
+
5
+ from markgrab.result import ExtractResult
6
+
7
+
8
class Parser(ABC):
    """Interface for turning fetched content into an ``ExtractResult``."""

    @abstractmethod
    def parse(self, html: str, url: str) -> ExtractResult:
        """Parse ``html`` fetched from ``url`` and return the extracted result."""