kabigon-0.14.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kabigon/__init__.py ADDED
@@ -0,0 +1,45 @@
+ import os
+ import sys
+ from typing import Final
+
+ from loguru import logger
+
+ from .api import load_url
+ from .api import load_url_sync
+ from .loaders import Compose
+ from .loaders import FirecrawlLoader
+ from .loaders import GitHubLoader
+ from .loaders import HttpxLoader
+ from .loaders import PDFLoader
+ from .loaders import PlaywrightLoader
+ from .loaders import PttLoader
+ from .loaders import RedditLoader
+ from .loaders import ReelLoader
+ from .loaders import TruthSocialLoader
+ from .loaders import TwitterLoader
+ from .loaders import YoutubeLoader
+ from .loaders import YoutubeYtdlpLoader
+ from .loaders import YtdlpLoader
+
+ __all__ = [
+     "Compose",
+     "FirecrawlLoader",
+     "GitHubLoader",
+     "HttpxLoader",
+     "PDFLoader",
+     "PlaywrightLoader",
+     "PttLoader",
+     "RedditLoader",
+     "ReelLoader",
+     "TruthSocialLoader",
+     "TwitterLoader",
+     "YoutubeLoader",
+     "YoutubeYtdlpLoader",
+     "YtdlpLoader",
+     "load_url",
+     "load_url_sync",
+ ]
+
+ LOGURU_LEVEL: Final[str] = os.getenv("LOGURU_LEVEL", "INFO")
+ logger.remove()
+ logger.add(sys.stderr, level=LOGURU_LEVEL)
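The log level is applied as an import-time side effect, so LOGURU_LEVEL must be set before the package is first imported. A minimal sketch of quieting the logger (setting the variable in the shell works the same way):

    import os
    os.environ["LOGURU_LEVEL"] = "WARNING"  # must happen before the first import

    import kabigon  # importing reconfigures the loguru sink on stderr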
kabigon/api.py ADDED
@@ -0,0 +1,74 @@
+ from . import loaders
+
+
+ def _get_default_loader() -> loaders.Compose:
+     """Get the default loader composition used by the CLI.
+
+     Returns:
+         Compose: Default loader chain with all available loaders
+     """
+     return loaders.Compose(
+         [
+             loaders.PttLoader(),
+             loaders.TwitterLoader(),
+             loaders.TruthSocialLoader(),
+             loaders.RedditLoader(),
+             loaders.YoutubeLoader(),
+             loaders.ReelLoader(),
+             loaders.YoutubeYtdlpLoader(),
+             loaders.PDFLoader(),
+             loaders.GitHubLoader(),
+             loaders.PlaywrightLoader(timeout=50_000, wait_until="networkidle"),
+             loaders.PlaywrightLoader(timeout=10_000),
+         ]
+     )
+
+
+ def load_url_sync(url: str) -> str:
+     """Load content from a URL using the default loader chain.
+
+     This is a convenience function that uses the same loader chain as the CLI.
+     It tries each loader in sequence until one succeeds.
+
+     Args:
+         url: The URL to load content from
+
+     Returns:
+         str: Extracted content as markdown
+
+     Raises:
+         LoaderError: If all loaders fail to load the URL
+
+     Example:
+         >>> import kabigon
+         >>> text = kabigon.load_url_sync("https://example.com")
+         >>> print(text)
+     """
+     loader = _get_default_loader()
+     return loader.load_sync(url)
+
+
+ async def load_url(url: str) -> str:
+     """Asynchronously load content from a URL using the default loader chain.
+
+     This is the async version of load_url_sync() and can be used in async contexts.
+
+     Args:
+         url: The URL to load content from
+
+     Returns:
+         str: Extracted content as markdown
+
+     Raises:
+         LoaderError: If all loaders fail to load the URL
+
+     Example:
+         >>> import asyncio
+         >>> import kabigon
+         >>> async def main():
+         ...     text = await kabigon.load_url("https://example.com")
+         ...     print(text)
+         >>> asyncio.run(main())
+     """
+     loader = _get_default_loader()
+     return await loader.load(url)
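Since Compose and the individual loaders are re-exported from the package root, callers are not tied to the default chain. A minimal sketch of a trimmed-down chain, assuming only plain HTTP with a Playwright fallback is wanted (the URL is a placeholder):

    import kabigon

    loader = kabigon.Compose(
        [
            kabigon.HttpxLoader(),
            kabigon.PlaywrightLoader(timeout=10_000),
        ]
    )
    text = loader.load_sync("https://example.com")  # placeholder URL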
kabigon/cli.py ADDED
@@ -0,0 +1,13 @@
+ import typer
+ from rich import print
+
+ from .api import load_url_sync
+
+
+ def run(url: str) -> None:
+     result = load_url_sync(url)
+     print(result)
+
+
+ def main() -> None:
+     typer.run(run)
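typer.run(run) turns run into a one-command CLI whose url parameter becomes a required positional argument. Assuming the project wires main up as a console script (the script name `kabigon` is an assumption; the entry-point declaration is not part of this diff), invocation would look like:

    # shell: kabigon https://example.com   (assumed console-script name)
    # equivalent to calling the function directly:
    from kabigon.cli import run

    run("https://example.com")  # prints the extracted markdown via rich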
kabigon/core/__init__.py ADDED
@@ -0,0 +1,19 @@
+ from .exception import ConfigurationError
+ from .exception import FirecrawlAPIKeyNotSetError
+ from .exception import InvalidURLError
+ from .exception import KabigonError
+ from .exception import LoaderError
+ from .exception import MissingDependencyError
+ from .exception import WhisperNotInstalledError
+ from .loader import Loader
+
+ __all__ = [
+     "ConfigurationError",
+     "FirecrawlAPIKeyNotSetError",
+     "InvalidURLError",
+     "KabigonError",
+     "Loader",
+     "LoaderError",
+     "MissingDependencyError",
+     "WhisperNotInstalledError",
+ ]
kabigon/core/exception.py ADDED
@@ -0,0 +1,41 @@
+ class KabigonError(Exception):
+     """Base exception for all Kabigon errors."""
+
+
+ class LoaderError(KabigonError):
+     """Raised when all loaders fail to load a URL."""
+
+     def __init__(self, url: str) -> None:
+         self.url = url
+         super().__init__(f"Failed to load URL: {url}")
+
+
+ class InvalidURLError(KabigonError, ValueError):
+     """Raised when a URL is not valid for a specific loader."""
+
+     def __init__(self, url: str, expected: str) -> None:
+         self.url = url
+         self.expected = expected
+         super().__init__(f"URL is not a {expected} URL: {url}")
+
+
+ class ConfigurationError(KabigonError):
+     """Raised when required configuration is missing."""
+
+
+ class FirecrawlAPIKeyNotSetError(ConfigurationError):
+     """Raised when FIRECRAWL_API_KEY environment variable is not set."""
+
+     def __init__(self) -> None:
+         super().__init__("FIRECRAWL_API_KEY is not set.")
+
+
+ class MissingDependencyError(KabigonError):
+     """Raised when a required dependency is not installed."""
+
+
+ class WhisperNotInstalledError(MissingDependencyError):
+     """Raised when OpenAI Whisper is not installed."""
+
+     def __init__(self) -> None:
+         super().__init__("OpenAI Whisper not installed. Please install it with `pip install openai-whisper`.")
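Everything derives from KabigonError, and InvalidURLError additionally subclasses ValueError, so callers can catch as broadly or narrowly as they like. A minimal sketch of the intended handling (placeholder URL):

    import kabigon
    from kabigon.core.exception import KabigonError, LoaderError

    try:
        text = kabigon.load_url_sync("https://example.com")
    except LoaderError as e:
        print(f"every loader in the chain failed for {e.url}")
    except KabigonError:
        print("some other kabigon-specific failure")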
kabigon/core/loader.py ADDED
@@ -0,0 +1,9 @@
+ import asyncio
+
+
+ class Loader:
+     def load_sync(self, url: str) -> str:
+         return asyncio.run(self.load(url))
+
+     async def load(self, url: str) -> str:
+         return await asyncio.to_thread(self.load_sync, url)
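load_sync and load delegate to each other, so a concrete loader must override at least one of them; the base class then supplies the other direction via asyncio.run or asyncio.to_thread. Overriding neither would recurse forever. A minimal sketch with a hypothetical subclass:

    from kabigon.core.loader import Loader

    class EchoLoader(Loader):
        # Overriding only the async side is enough; load_sync() comes for free.
        async def load(self, url: str) -> str:
            return url

    print(EchoLoader().load_sync("https://example.com"))  # placeholder URL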
kabigon/loaders/__init__.py ADDED
@@ -0,0 +1,31 @@
+ from .compose import Compose
+ from .firecrawl import FirecrawlLoader
+ from .github import GitHubLoader
+ from .httpx import HttpxLoader
+ from .pdf import PDFLoader
+ from .playwright import PlaywrightLoader
+ from .ptt import PttLoader
+ from .reddit import RedditLoader
+ from .reel import ReelLoader
+ from .truthsocial import TruthSocialLoader
+ from .twitter import TwitterLoader
+ from .youtube import YoutubeLoader
+ from .youtube_ytdlp import YoutubeYtdlpLoader
+ from .ytdlp import YtdlpLoader
+
+ __all__ = [
+     "Compose",
+     "FirecrawlLoader",
+     "GitHubLoader",
+     "HttpxLoader",
+     "PDFLoader",
+     "PlaywrightLoader",
+     "PttLoader",
+     "RedditLoader",
+     "ReelLoader",
+     "TruthSocialLoader",
+     "TwitterLoader",
+     "YoutubeLoader",
+     "YoutubeYtdlpLoader",
+     "YtdlpLoader",
+ ]
kabigon/loaders/compose.py ADDED
@@ -0,0 +1,26 @@
+ from loguru import logger
+
+ from kabigon.core.exception import LoaderError
+ from kabigon.core.loader import Loader
+
+
+ class Compose(Loader):
+     def __init__(self, loaders: list[Loader]) -> None:
+         self.loaders = loaders
+
+     async def load(self, url: str) -> str:
+         for loader in self.loaders:
+             try:
+                 result = await loader.load(url)
+             except Exception as e:  # noqa: BLE001
+                 # We intentionally catch all exceptions to try the next loader in the chain
+                 logger.info("[{}] Failed to load URL: {}, got error: {}", loader.__class__.__name__, url, e)
+             else:
+                 if not result:
+                     logger.info("[{}] Failed to load URL: {}, got empty result", loader.__class__.__name__, url)
+                     continue
+
+                 logger.info("[{}] Successfully loaded URL: {}", loader.__class__.__name__, url)
+                 return result
+
+         raise LoaderError(url)
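A loader that raises or returns an empty string simply hands the URL to the next loader in the list; LoaderError is raised only once the whole chain is exhausted. A minimal sketch with hypothetical stand-in loaders:

    from kabigon.core.exception import LoaderError
    from kabigon.core.loader import Loader
    from kabigon.loaders import Compose

    class AlwaysFails(Loader):
        async def load(self, url: str) -> str:
            raise RuntimeError("boom")

    class AlwaysEmpty(Loader):
        async def load(self, url: str) -> str:
            return ""

    try:
        Compose([AlwaysFails(), AlwaysEmpty()]).load_sync("https://example.com")
    except LoaderError as e:
        print(e)  # Failed to load URL: https://example.com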
kabigon/loaders/firecrawl.py ADDED
@@ -0,0 +1,30 @@
+ import os
+
+ from firecrawl import FirecrawlApp
+
+ from kabigon.core.exception import FirecrawlAPIKeyNotSetError
+ from kabigon.core.exception import LoaderError
+ from kabigon.core.loader import Loader
+
+
+ class FirecrawlLoader(Loader):
+     def __init__(self, timeout: int | None = None) -> None:
+         self.timeout = timeout
+
+         api_key = os.getenv("FIRECRAWL_API_KEY")
+         if not api_key:
+             raise FirecrawlAPIKeyNotSetError
+
+         self.app = FirecrawlApp(api_key=api_key)
+
+     def load_sync(self, url: str) -> str:
+         result = self.app.scrape_url(  # ty:ignore[possibly-missing-attribute]
+             url,
+             formats=["markdown"],
+             timeout=self.timeout,
+         )
+
+         if not result.success:
+             raise LoaderError(url)
+
+         return result.markdown
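Note that the API key is read once, in the constructor, which raises immediately when FIRECRAWL_API_KEY is unset; this is presumably why the loader is left out of the default chain in api.py. A minimal sketch (the key value and URL are placeholders):

    import os
    from kabigon.loaders import FirecrawlLoader

    os.environ["FIRECRAWL_API_KEY"] = "fc-..."  # placeholder; set before constructing
    markdown = FirecrawlLoader().load_sync("https://example.com")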
kabigon/loaders/github.py ADDED
@@ -0,0 +1,184 @@
+ from __future__ import annotations
+
+ from html.parser import HTMLParser
+ from urllib.parse import urlparse
+
+ import httpx
+
+ from kabigon.core.exception import InvalidURLError
+ from kabigon.core.loader import Loader
+
+ from .utils import html_to_markdown
+
+ GITHUB_HOST = "github.com"
+ RAW_GITHUB_HOST = "raw.githubusercontent.com"
+
+ _VOID_TAGS = {
+     "area",
+     "base",
+     "br",
+     "col",
+     "embed",
+     "hr",
+     "img",
+     "input",
+     "link",
+     "meta",
+     "param",
+     "source",
+     "track",
+     "wbr",
+ }
+
+ _IGNORED_TAGS = {
+     "script",
+     "style",
+     "noscript",
+     "svg",
+     "nav",
+     "header",
+     "footer",
+ }
+
+
+ def check_github_url(url: str) -> None:
+     host = urlparse(url).netloc
+     if host not in {GITHUB_HOST, RAW_GITHUB_HOST}:
+         raise InvalidURLError(url, "GitHub")
+
+
+ def to_raw_github_url(url: str) -> str:
+     """Convert a GitHub blob URL to a raw.githubusercontent.com URL.
+
+     Supports:
+     - https://github.com/<owner>/<repo>/blob/<ref>/<path>
+     - https://raw.githubusercontent.com/<owner>/<repo>/<ref>/<path>
+     """
+     parsed = urlparse(url)
+     if parsed.netloc == RAW_GITHUB_HOST:
+         return url
+
+     if parsed.netloc != GITHUB_HOST:
+         raise InvalidURLError(url, "GitHub")
+
+     parts = [p for p in parsed.path.split("/") if p]
+     if len(parts) < 5 or parts[2] != "blob":
+         raise InvalidURLError(url, "GitHub blob")
+
+     owner, repo, _, ref = parts[:4]
+     path = "/".join(parts[4:])
+     if not path:
+         raise InvalidURLError(url, "GitHub blob file")
+
+     return f"https://{RAW_GITHUB_HOST}/{owner}/{repo}/{ref}/{path}"
+
+
+ class _SubtreeHTMLExtractor(HTMLParser):
+     def __init__(self, root_tag: str) -> None:
+         super().__init__(convert_charrefs=True)
+         self.root_tag = root_tag
+         self._capturing = False
+         self._depth = 0
+         self._ignored_depth = 0
+         self._out: list[str] = []
+
+     def get_html(self) -> str:
+         return "".join(self._out).strip()
+
+     def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+         if tag == self.root_tag and not self._capturing:
+             self._capturing = True
+             self._depth = 1
+             self._out.append(self.get_starttag_text() or f"<{tag}>")
+             return
+
+         if not self._capturing:
+             return
+
+         if tag in _IGNORED_TAGS:
+             self._ignored_depth += 1
+             return
+
+         self._out.append(self.get_starttag_text() or f"<{tag}>")
+         if tag not in _VOID_TAGS:
+             self._depth += 1
+
+     def handle_endtag(self, tag: str) -> None:
+         if not self._capturing:
+             return
+
+         if self._ignored_depth:
+             if tag in _IGNORED_TAGS:
+                 self._ignored_depth -= 1
+             return
+
+         self._out.append(f"</{tag}>")
+         if tag not in _VOID_TAGS:
+             self._depth -= 1
+
+         if self._depth <= 0:
+             self._capturing = False
+
+     def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+         if not self._capturing:
+             return
+         if self._ignored_depth or tag in _IGNORED_TAGS:
+             return
+         self._out.append(self.get_starttag_text() or f"<{tag} />")
+
+     def handle_data(self, data: str) -> None:
+         if not self._capturing or self._ignored_depth:
+             return
+         self._out.append(data)
+
+
+ def extract_main_html(html: str) -> str:
+     """Extract GitHub's primary content area without site-specific selectors."""
+     for tag in ("main", "article"):
+         parser = _SubtreeHTMLExtractor(tag)
+         parser.feed(html)
+         extracted = parser.get_html()
+         if extracted:
+             return extracted
+     return html
+
+
+ class GitHubLoader(Loader):
+     async def load(self, url: str) -> str:
+         check_github_url(url)
+         parsed = urlparse(url)
+
+         if parsed.netloc == RAW_GITHUB_HOST or "/blob/" in parsed.path:
+             raw_url = to_raw_github_url(url)
+
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(
+                     raw_url,
+                     follow_redirects=True,
+                     headers={"Accept": "text/plain, text/markdown;q=0.9, */*;q=0.1"},
+                 )
+                 response.raise_for_status()
+
+                 content_type = response.headers.get("content-type", "")
+                 if "text" not in content_type and "json" not in content_type and "xml" not in content_type:
+                     raise InvalidURLError(url, f"GitHub text content-type (got {content_type!r})")
+
+                 return response.text
+
+         async with httpx.AsyncClient() as client:
+             response = await client.get(
+                 url,
+                 follow_redirects=True,
+                 headers={
+                     "Accept": "text/html,application/xhtml+xml",
+                     "User-Agent": "kabigon (httpx)",
+                 },
+             )
+             response.raise_for_status()
+
+             content_type = response.headers.get("content-type", "")
+             if "html" not in content_type:
+                 raise InvalidURLError(url, f"GitHub HTML content-type (got {content_type!r})")
+
+             main_html = extract_main_html(response.text)
+             return html_to_markdown(main_html)
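The URL handling is mechanical: blob URLs are mapped onto raw.githubusercontent.com, raw URLs pass through untouched, and anything else on github.com is fetched as HTML and reduced to its main/article subtree. A minimal sketch of the rewrite (owner/repo are placeholders):

    from kabigon.loaders.github import to_raw_github_url

    print(to_raw_github_url("https://github.com/owner/repo/blob/main/README.md"))
    # https://raw.githubusercontent.com/owner/repo/main/README.md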
kabigon/loaders/httpx.py ADDED
@@ -0,0 +1,16 @@
+ import httpx
+
+ from kabigon.core.loader import Loader
+
+ from .utils import html_to_markdown
+
+
+ class HttpxLoader(Loader):
+     def __init__(self, headers: dict[str, str] | None = None) -> None:
+         self.headers = headers
+
+     async def load(self, url: str) -> str:
+         async with httpx.AsyncClient() as client:
+             response = await client.get(url, headers=self.headers, follow_redirects=True)
+             response.raise_for_status()
+             return html_to_markdown(response.content)
kabigon/loaders/pdf.py ADDED
@@ -0,0 +1,46 @@
+ import io
+ from pathlib import Path
+ from typing import IO
+ from typing import Any
+
+ import httpx
+ from pypdf import PdfReader
+
+ from kabigon.core.loader import Loader
+
+ DEFAULT_HEADERS = {
+     "Accept-Language": "zh-TW,zh;q=0.9,ja;q=0.8,en-US;q=0.7,en;q=0.6",
+     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",  # noqa
+ }
+
+
+ class NotPDFError(Exception):
+     def __init__(self, url: str) -> None:
+         super().__init__(f"URL is not a PDF: {url}")
+
+
+ class PDFLoader(Loader):
+     async def load(self, url_or_file: str) -> str:  # ty:ignore[invalid-method-override]
+         if not url_or_file.startswith("http"):
+             return read_pdf_content(url_or_file)
+
+         async with httpx.AsyncClient() as client:
+             resp = await client.get(url_or_file, headers=DEFAULT_HEADERS, follow_redirects=True)
+             resp.raise_for_status()
+
+             if resp.headers.get("content-type") != "application/pdf":
+                 raise NotPDFError(url_or_file)
+
+             return read_pdf_content(io.BytesIO(resp.content))
+
+
+ def read_pdf_content(f: str | Path | IO[Any]) -> str:
+     lines = []
+     with PdfReader(f) as reader:
+         for page in reader.pages:
+             text = page.extract_text(extraction_mode="plain")
+             for line in text.splitlines():
+                 stripped = line.strip()
+                 if stripped:
+                     lines.append(stripped)
+     return "\n".join(lines)
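Because load falls back to treating any argument that does not start with "http" as a local path, the same loader also covers files on disk. A minimal sketch (the file name is hypothetical):

    import asyncio
    from kabigon.loaders import PDFLoader

    text = asyncio.run(PDFLoader().load("paper.pdf"))  # hypothetical local file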
kabigon/loaders/playwright.py ADDED
@@ -0,0 +1,37 @@
+ from typing import Literal
+
+ from loguru import logger
+ from playwright.async_api import TimeoutError
+ from playwright.async_api import async_playwright
+
+ from kabigon.core.loader import Loader
+
+ from .utils import html_to_markdown
+
+
+ class PlaywrightLoader(Loader):
+     def __init__(
+         self,
+         timeout: float | None = 0,
+         wait_until: Literal["commit", "domcontentloaded", "load", "networkidle"] | None = None,
+         browser_headless: bool = False,
+     ) -> None:
+         self.timeout = timeout
+         self.wait_until = wait_until
+         self.browser_headless = browser_headless
+
+     async def load(self, url: str) -> str:
+         async with async_playwright() as p:
+             browser = await p.chromium.launch(headless=self.browser_headless)
+             context = await browser.new_context()
+             page = await context.new_page()
+
+             try:
+                 await page.goto(url, timeout=self.timeout, wait_until=self.wait_until)
+             except TimeoutError as e:
+                 logger.warning("TimeoutError: {}, (url: {}, timeout: {})", e, url, self.timeout)
+
+             content = await page.content()
+             await browser.close()
+
+             return html_to_markdown(content)
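A navigation timeout is only logged as a warning; the loader still returns whatever the page managed to render, which is what lets api.py chain a strict networkidle pass before a lenient 10-second one. A minimal sketch (placeholder URL; headless is enabled here for server use):

    import asyncio
    from kabigon.loaders import PlaywrightLoader

    loader = PlaywrightLoader(timeout=10_000, wait_until="domcontentloaded", browser_headless=True)
    markdown = asyncio.run(loader.load("https://example.com"))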
kabigon/loaders/ptt.py ADDED
@@ -0,0 +1,27 @@
+ from urllib.parse import urlparse
+
+ from kabigon.core.exception import InvalidURLError
+ from kabigon.core.loader import Loader
+
+ from .httpx import HttpxLoader
+
+
+ def check_ptt_url(url: str) -> None:
+     if urlparse(url).netloc != "www.ptt.cc":
+         raise InvalidURLError(url, "PTT")
+
+
+ class PttLoader(Loader):
+     def __init__(self) -> None:
+         self.httpx_loader = HttpxLoader(
+             headers={
+                 "Accept-Language": "zh-TW,zh;q=0.9,ja;q=0.8,en-US;q=0.7,en;q=0.6",
+                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",  # noqa
+                 "Cookie": "over18=1",
+             }
+         )
+
+     async def load(self, url: str) -> str:
+         check_ptt_url(url)
+
+         return await self.httpx_loader.load(url)
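PttLoader is HttpxLoader preconfigured with the over18=1 cookie that www.ptt.cc requires past its age gate, plus a host check. A minimal sketch (the board URL is only an example):

    import asyncio
    from kabigon.loaders import PttLoader

    markdown = asyncio.run(PttLoader().load("https://www.ptt.cc/bbs/Gossiping/index.html"))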