kabigon 0.14.2__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
- kabigon/__init__.py +45 -0
- kabigon/api.py +74 -0
- kabigon/cli.py +13 -0
- kabigon/core/__init__.py +19 -0
- kabigon/core/exception.py +41 -0
- kabigon/core/loader.py +9 -0
- kabigon/loaders/__init__.py +31 -0
- kabigon/loaders/compose.py +26 -0
- kabigon/loaders/firecrawl.py +30 -0
- kabigon/loaders/github.py +184 -0
- kabigon/loaders/httpx.py +16 -0
- kabigon/loaders/pdf.py +46 -0
- kabigon/loaders/playwright.py +37 -0
- kabigon/loaders/ptt.py +27 -0
- kabigon/loaders/reddit.py +86 -0
- kabigon/loaders/reel.py +24 -0
- kabigon/loaders/truthsocial.py +71 -0
- kabigon/loaders/twitter.py +115 -0
- kabigon/loaders/utils.py +36 -0
- kabigon/loaders/youtube.py +165 -0
- kabigon/loaders/youtube_ytdlp.py +13 -0
- kabigon/loaders/ytdlp.py +60 -0
- kabigon/py.typed +0 -0
- kabigon-0.14.2.dist-info/METADATA +319 -0
- kabigon-0.14.2.dist-info/RECORD +28 -0
- kabigon-0.14.2.dist-info/WHEEL +4 -0
- kabigon-0.14.2.dist-info/entry_points.txt +2 -0
- kabigon-0.14.2.dist-info/licenses/LICENSE +21 -0
kabigon/__init__.py
ADDED
@@ -0,0 +1,45 @@
+import os
+import sys
+from typing import Final
+
+from loguru import logger
+
+from .api import load_url
+from .api import load_url_sync
+from .loaders import Compose
+from .loaders import FirecrawlLoader
+from .loaders import GitHubLoader
+from .loaders import HttpxLoader
+from .loaders import PDFLoader
+from .loaders import PlaywrightLoader
+from .loaders import PttLoader
+from .loaders import RedditLoader
+from .loaders import ReelLoader
+from .loaders import TruthSocialLoader
+from .loaders import TwitterLoader
+from .loaders import YoutubeLoader
+from .loaders import YoutubeYtdlpLoader
+from .loaders import YtdlpLoader
+
+__all__ = [
+    "Compose",
+    "FirecrawlLoader",
+    "GitHubLoader",
+    "HttpxLoader",
+    "PDFLoader",
+    "PlaywrightLoader",
+    "PttLoader",
+    "RedditLoader",
+    "ReelLoader",
+    "TruthSocialLoader",
+    "TwitterLoader",
+    "YoutubeLoader",
+    "YoutubeYtdlpLoader",
+    "YtdlpLoader",
+    "load_url",
+    "load_url_sync",
+]
+
+LOGURU_LEVEL: Final[str] = os.getenv("LOGURU_LEVEL", "INFO")
+logger.remove()
+logger.add(sys.stderr, level=LOGURU_LEVEL)
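Because the package configures loguru at import time, the log level has to be chosen before the first import. A minimal usage sketch, assuming kabigon 0.14.2 is installed:

import os

os.environ["LOGURU_LEVEL"] = "WARNING"  # read once by kabigon/__init__.py at import time

import kabigon  # logger.remove()/logger.add() now run with level WARNING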
kabigon/api.py
ADDED
@@ -0,0 +1,74 @@
+from . import loaders
+
+
+def _get_default_loader() -> loaders.Compose:
+    """Get the default loader composition used by the CLI.
+
+    Returns:
+        Compose: Default loader chain with all available loaders.
+    """
+    return loaders.Compose(
+        [
+            loaders.PttLoader(),
+            loaders.TwitterLoader(),
+            loaders.TruthSocialLoader(),
+            loaders.RedditLoader(),
+            loaders.YoutubeLoader(),
+            loaders.ReelLoader(),
+            loaders.YoutubeYtdlpLoader(),
+            loaders.PDFLoader(),
+            loaders.GitHubLoader(),
+            loaders.PlaywrightLoader(timeout=50_000, wait_until="networkidle"),
+            loaders.PlaywrightLoader(timeout=10_000),
+        ]
+    )
+
+
+def load_url_sync(url: str) -> str:
+    """Load content from a URL using the default loader chain.
+
+    This is a convenience function that uses the same loader chain as the CLI.
+    It tries each loader in sequence until one succeeds.
+
+    Args:
+        url: The URL to load content from.
+
+    Returns:
+        str: Extracted content as markdown.
+
+    Raises:
+        Exception: If all loaders fail to load the URL.
+
+    Example:
+        >>> import kabigon
+        >>> text = kabigon.load_url_sync("https://example.com")
+        >>> print(text)
+    """
+    loader = _get_default_loader()
+    return loader.load_sync(url)
+
+
+async def load_url(url: str) -> str:
+    """Asynchronously load content from a URL using the default loader chain.
+
+    This is the async version of load_url_sync() for use in async contexts.
+
+    Args:
+        url: The URL to load content from.
+
+    Returns:
+        str: Extracted content as markdown.
+
+    Raises:
+        Exception: If all loaders fail to load the URL.
+
+    Example:
+        >>> import asyncio
+        >>> import kabigon
+        >>> async def main():
+        ...     text = await kabigon.load_url("https://example.com")
+        ...     print(text)
+        >>> asyncio.run(main())
+    """
+    loader = _get_default_loader()
+    return await loader.load(url)
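Since _get_default_loader() is private, callers wanting a different fallback order can build their own Compose from the re-exported loader classes. A sketch under that assumption (load_sync on Compose is inherited from the Loader base class, whose body is not shown in this diff, but api.py relies on it the same way):

import kabigon

# Try a plain HTTP fetch first, fall back to a headless browser.
loader = kabigon.Compose(
    [
        kabigon.HttpxLoader(),
        kabigon.PlaywrightLoader(timeout=10_000),
    ]
)
text = loader.load_sync("https://example.com")  # URL is illustrative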
kabigon/cli.py
ADDED
kabigon/core/__init__.py
ADDED
@@ -0,0 +1,19 @@
+from .exception import ConfigurationError
+from .exception import FirecrawlAPIKeyNotSetError
+from .exception import InvalidURLError
+from .exception import KabigonError
+from .exception import LoaderError
+from .exception import MissingDependencyError
+from .exception import WhisperNotInstalledError
+from .loader import Loader
+
+__all__ = [
+    "ConfigurationError",
+    "FirecrawlAPIKeyNotSetError",
+    "InvalidURLError",
+    "KabigonError",
+    "Loader",
+    "LoaderError",
+    "MissingDependencyError",
+    "WhisperNotInstalledError",
+]
kabigon/core/exception.py
ADDED
@@ -0,0 +1,41 @@
+class KabigonError(Exception):
+    """Base exception for all Kabigon errors."""
+
+
+class LoaderError(KabigonError):
+    """Raised when all loaders fail to load a URL."""
+
+    def __init__(self, url: str) -> None:
+        self.url = url
+        super().__init__(f"Failed to load URL: {url}")
+
+
+class InvalidURLError(KabigonError, ValueError):
+    """Raised when a URL is not valid for a specific loader."""
+
+    def __init__(self, url: str, expected: str) -> None:
+        self.url = url
+        self.expected = expected
+        super().__init__(f"URL is not a {expected} URL: {url}")
+
+
+class ConfigurationError(KabigonError):
+    """Raised when required configuration is missing."""
+
+
+class FirecrawlAPIKeyNotSetError(ConfigurationError):
+    """Raised when the FIRECRAWL_API_KEY environment variable is not set."""
+
+    def __init__(self) -> None:
+        super().__init__("FIRECRAWL_API_KEY is not set.")
+
+
+class MissingDependencyError(KabigonError):
+    """Raised when a required dependency is not installed."""
+
+
+class WhisperNotInstalledError(MissingDependencyError):
+    """Raised when OpenAI Whisper is not installed."""
+
+    def __init__(self) -> None:
+        super().__init__("OpenAI Whisper not installed. Please install it with `pip install openai-whisper`.")
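Because every error type derives from KabigonError, callers can distinguish "all loaders failed" from setup problems. A minimal sketch, assuming the package is installed (the URL is illustrative):

import asyncio

import kabigon
from kabigon.core import KabigonError
from kabigon.core import LoaderError

async def fetch(url: str) -> str | None:
    try:
        return await kabigon.load_url(url)
    except LoaderError:
        return None  # every loader in the chain failed for this URL
    except KabigonError:
        raise  # configuration/dependency errors should surface

print(asyncio.run(fetch("https://example.com")))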
kabigon/core/loader.py
ADDED
kabigon/loaders/__init__.py
ADDED
@@ -0,0 +1,31 @@
+from .compose import Compose
+from .firecrawl import FirecrawlLoader
+from .github import GitHubLoader
+from .httpx import HttpxLoader
+from .pdf import PDFLoader
+from .playwright import PlaywrightLoader
+from .ptt import PttLoader
+from .reddit import RedditLoader
+from .reel import ReelLoader
+from .truthsocial import TruthSocialLoader
+from .twitter import TwitterLoader
+from .youtube import YoutubeLoader
+from .youtube_ytdlp import YoutubeYtdlpLoader
+from .ytdlp import YtdlpLoader
+
+__all__ = [
+    "Compose",
+    "FirecrawlLoader",
+    "GitHubLoader",
+    "HttpxLoader",
+    "PDFLoader",
+    "PlaywrightLoader",
+    "PttLoader",
+    "RedditLoader",
+    "ReelLoader",
+    "TruthSocialLoader",
+    "TwitterLoader",
+    "YoutubeLoader",
+    "YoutubeYtdlpLoader",
+    "YtdlpLoader",
+]
kabigon/loaders/compose.py
ADDED
@@ -0,0 +1,26 @@
+from loguru import logger
+
+from kabigon.core.exception import LoaderError
+from kabigon.core.loader import Loader
+
+
+class Compose(Loader):
+    def __init__(self, loaders: list[Loader]) -> None:
+        self.loaders = loaders
+
+    async def load(self, url: str) -> str:
+        for loader in self.loaders:
+            try:
+                result = await loader.load(url)
+            except Exception as e:  # noqa: BLE001
+                # We intentionally catch all exceptions to try the next loader in the chain.
+                logger.info("[{}] Failed to load URL: {}, got error: {}", loader.__class__.__name__, url, e)
+            else:
+                if not result:
+                    logger.info("[{}] Failed to load URL: {}, got empty result", loader.__class__.__name__, url)
+                    continue
+
+                logger.info("[{}] Successfully loaded URL: {}", loader.__class__.__name__, url)
+                return result
+
+        raise LoaderError(url)
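Compose treats any exception or empty string as "try the next loader", so a custom loader only needs an async load that returns markdown or raises. A minimal sketch (EchoLoader is hypothetical, not part of the package):

from kabigon.core import Loader
from kabigon.loaders import Compose

class EchoLoader(Loader):
    async def load(self, url: str) -> str:
        # Returning a non-empty string makes Compose stop here.
        return f"# {url}"

chain = Compose([EchoLoader()])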
kabigon/loaders/firecrawl.py
ADDED
@@ -0,0 +1,30 @@
+import os
+
+from firecrawl import FirecrawlApp
+
+from kabigon.core.exception import FirecrawlAPIKeyNotSetError
+from kabigon.core.exception import LoaderError
+from kabigon.core.loader import Loader
+
+
+class FirecrawlLoader(Loader):
+    def __init__(self, timeout: int | None = None) -> None:
+        self.timeout = timeout
+
+        api_key = os.getenv("FIRECRAWL_API_KEY")
+        if not api_key:
+            raise FirecrawlAPIKeyNotSetError
+
+        self.app = FirecrawlApp(api_key=api_key)
+
+    def load_sync(self, url: str) -> str:
+        result = self.app.scrape_url(  # ty:ignore[possibly-missing-attribute]
+            url,
+            formats=["markdown"],
+            timeout=self.timeout,
+        )
+
+        if not result.success:
+            raise LoaderError(url)
+
+        return result.markdown
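The API key check happens in __init__, so a missing FIRECRAWL_API_KEY fails at construction rather than at load time. A sketch:

from kabigon.core import ConfigurationError
from kabigon.loaders import FirecrawlLoader

try:
    loader = FirecrawlLoader(timeout=30_000)
except ConfigurationError:
    # FirecrawlAPIKeyNotSetError subclasses ConfigurationError
    print("export FIRECRAWL_API_KEY before using FirecrawlLoader")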
kabigon/loaders/github.py
ADDED
@@ -0,0 +1,184 @@
+from __future__ import annotations
+
+from html.parser import HTMLParser
+from urllib.parse import urlparse
+
+import httpx
+
+from kabigon.core.exception import InvalidURLError
+from kabigon.core.loader import Loader
+
+from .utils import html_to_markdown
+
+GITHUB_HOST = "github.com"
+RAW_GITHUB_HOST = "raw.githubusercontent.com"
+
+_VOID_TAGS = {
+    "area",
+    "base",
+    "br",
+    "col",
+    "embed",
+    "hr",
+    "img",
+    "input",
+    "link",
+    "meta",
+    "param",
+    "source",
+    "track",
+    "wbr",
+}
+
+_IGNORED_TAGS = {
+    "script",
+    "style",
+    "noscript",
+    "svg",
+    "nav",
+    "header",
+    "footer",
+}
+
+
+def check_github_url(url: str) -> None:
+    host = urlparse(url).netloc
+    if host not in {GITHUB_HOST, RAW_GITHUB_HOST}:
+        raise InvalidURLError(url, "GitHub")
+
+
+def to_raw_github_url(url: str) -> str:
+    """Convert a GitHub blob URL to a raw.githubusercontent.com URL.
+
+    Supports:
+    - https://github.com/<owner>/<repo>/blob/<ref>/<path>
+    - https://raw.githubusercontent.com/<owner>/<repo>/<ref>/<path>
+    """
+    parsed = urlparse(url)
+    if parsed.netloc == RAW_GITHUB_HOST:
+        return url
+
+    if parsed.netloc != GITHUB_HOST:
+        raise InvalidURLError(url, "GitHub")
+
+    parts = [p for p in parsed.path.split("/") if p]
+    if len(parts) < 5 or parts[2] != "blob":
+        raise InvalidURLError(url, "GitHub blob")
+
+    owner, repo, _, ref = parts[:4]
+    path = "/".join(parts[4:])
+    if not path:
+        raise InvalidURLError(url, "GitHub blob file")
+
+    return f"https://{RAW_GITHUB_HOST}/{owner}/{repo}/{ref}/{path}"
+
+
+class _SubtreeHTMLExtractor(HTMLParser):
+    def __init__(self, root_tag: str) -> None:
+        super().__init__(convert_charrefs=True)
+        self.root_tag = root_tag
+        self._capturing = False
+        self._depth = 0
+        self._ignored_depth = 0
+        self._out: list[str] = []
+
+    def get_html(self) -> str:
+        return "".join(self._out).strip()
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        if tag == self.root_tag and not self._capturing:
+            self._capturing = True
+            self._depth = 1
+            self._out.append(self.get_starttag_text() or f"<{tag}>")
+            return
+
+        if not self._capturing:
+            return
+
+        if tag in _IGNORED_TAGS:
+            self._ignored_depth += 1
+            return
+
+        self._out.append(self.get_starttag_text() or f"<{tag}>")
+        if tag not in _VOID_TAGS:
+            self._depth += 1
+
+    def handle_endtag(self, tag: str) -> None:
+        if not self._capturing:
+            return
+
+        if self._ignored_depth:
+            if tag in _IGNORED_TAGS:
+                self._ignored_depth -= 1
+            return
+
+        self._out.append(f"</{tag}>")
+        if tag not in _VOID_TAGS:
+            self._depth -= 1
+
+        if self._depth <= 0:
+            self._capturing = False
+
+    def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        if not self._capturing:
+            return
+        if self._ignored_depth or tag in _IGNORED_TAGS:
+            return
+        self._out.append(self.get_starttag_text() or f"<{tag} />")
+
+    def handle_data(self, data: str) -> None:
+        if not self._capturing or self._ignored_depth:
+            return
+        self._out.append(data)
+
+
+def extract_main_html(html: str) -> str:
+    """Extract GitHub's primary content area without site-specific selectors."""
+    for tag in ("main", "article"):
+        parser = _SubtreeHTMLExtractor(tag)
+        parser.feed(html)
+        extracted = parser.get_html()
+        if extracted:
+            return extracted
+    return html
+
+
+class GitHubLoader(Loader):
+    async def load(self, url: str) -> str:
+        check_github_url(url)
+        parsed = urlparse(url)
+
+        if parsed.netloc == RAW_GITHUB_HOST or "/blob/" in parsed.path:
+            raw_url = to_raw_github_url(url)
+
+            async with httpx.AsyncClient() as client:
+                response = await client.get(
+                    raw_url,
+                    follow_redirects=True,
+                    headers={"Accept": "text/plain, text/markdown;q=0.9, */*;q=0.1"},
+                )
+                response.raise_for_status()
+
+                content_type = response.headers.get("content-type", "")
+                if "text" not in content_type and "json" not in content_type and "xml" not in content_type:
+                    raise InvalidURLError(url, f"GitHub text content-type (got {content_type!r})")
+
+                return response.text
+
+        async with httpx.AsyncClient() as client:
+            response = await client.get(
+                url,
+                follow_redirects=True,
+                headers={
+                    "Accept": "text/html,application/xhtml+xml",
+                    "User-Agent": "kabigon (httpx)",
+                },
+            )
+            response.raise_for_status()
+
+            content_type = response.headers.get("content-type", "")
+            if "html" not in content_type:
+                raise InvalidURLError(url, f"GitHub HTML content-type (got {content_type!r})")
+
+            main_html = extract_main_html(response.text)
+            return html_to_markdown(main_html)
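A worked example of the blob-to-raw rewrite implemented above (owner, repo, and path are placeholders):

from kabigon.loaders.github import to_raw_github_url

raw = to_raw_github_url("https://github.com/owner/repo/blob/main/README.md")
assert raw == "https://raw.githubusercontent.com/owner/repo/main/README.md"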
kabigon/loaders/httpx.py
ADDED
@@ -0,0 +1,16 @@
+import httpx
+
+from kabigon.core.loader import Loader
+
+from .utils import html_to_markdown
+
+
+class HttpxLoader(Loader):
+    def __init__(self, headers: dict[str, str] | None = None) -> None:
+        self.headers = headers
+
+    async def load(self, url: str) -> str:
+        async with httpx.AsyncClient() as client:
+            response = await client.get(url, headers=self.headers, follow_redirects=True)
+            response.raise_for_status()
+            return html_to_markdown(response.content)
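HttpxLoader sends no default headers of its own; callers pass their own, as PttLoader below does. A sketch (the header value and URL are illustrative):

import asyncio

from kabigon.loaders import HttpxLoader

loader = HttpxLoader(headers={"User-Agent": "my-agent/1.0"})
markdown = asyncio.run(loader.load("https://example.com"))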
kabigon/loaders/pdf.py
ADDED
@@ -0,0 +1,46 @@
+import io
+from pathlib import Path
+from typing import IO
+from typing import Any
+
+import httpx
+from pypdf import PdfReader
+
+from kabigon.core.loader import Loader
+
+DEFAULT_HEADERS = {
+    "Accept-Language": "zh-TW,zh;q=0.9,ja;q=0.8,en-US;q=0.7,en;q=0.6",
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",  # noqa
+}
+
+
+class NotPDFError(Exception):
+    def __init__(self, url: str) -> None:
+        super().__init__(f"URL is not a PDF: {url}")
+
+
+class PDFLoader(Loader):
+    async def load(self, url_or_file: str) -> str:  # ty:ignore[invalid-method-override]
+        if not url_or_file.startswith("http"):
+            return read_pdf_content(url_or_file)
+
+        async with httpx.AsyncClient() as client:
+            resp = await client.get(url_or_file, headers=DEFAULT_HEADERS, follow_redirects=True)
+            resp.raise_for_status()
+
+            if resp.headers.get("content-type") != "application/pdf":
+                raise NotPDFError(url_or_file)
+
+            return read_pdf_content(io.BytesIO(resp.content))
+
+
+def read_pdf_content(f: str | Path | IO[Any]) -> str:
+    lines = []
+    with PdfReader(f) as reader:
+        for page in reader.pages:
+            text = page.extract_text(extraction_mode="plain")
+            for line in text.splitlines():
+                stripped = line.strip()
+                if stripped:
+                    lines.append(stripped)
+    return "\n".join(lines)
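The startswith("http") branch means load accepts either a local path or a URL, and the remote case is rejected unless the server reports application/pdf. A sketch (file names are illustrative):

import asyncio

from kabigon.loaders import PDFLoader

loader = PDFLoader()
local_text = asyncio.run(loader.load("report.pdf"))
remote_text = asyncio.run(loader.load("https://example.com/report.pdf"))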
kabigon/loaders/playwright.py
ADDED
@@ -0,0 +1,37 @@
+from typing import Literal
+
+from loguru import logger
+from playwright.async_api import TimeoutError
+from playwright.async_api import async_playwright
+
+from kabigon.core.loader import Loader
+
+from .utils import html_to_markdown
+
+
+class PlaywrightLoader(Loader):
+    def __init__(
+        self,
+        timeout: float | None = 0,
+        wait_until: Literal["commit", "domcontentloaded", "load", "networkidle"] | None = None,
+        browser_headless: bool = False,
+    ) -> None:
+        self.timeout = timeout
+        self.wait_until = wait_until
+        self.browser_headless = browser_headless
+
+    async def load(self, url: str) -> str:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=self.browser_headless)
+            context = await browser.new_context()
+            page = await context.new_page()
+
+            try:
+                await page.goto(url, timeout=self.timeout, wait_until=self.wait_until)
+            except TimeoutError as e:
+                logger.warning("TimeoutError: {}, (url: {}, timeout: {})", e, url, self.timeout)
+
+            content = await page.content()
+            await browser.close()
+
+            return html_to_markdown(content)
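The default chain in api.py instantiates this loader twice: a patient networkidle pass first, then a faster fallback that keeps Playwright's default wait_until. Reproduced as a sketch:

from kabigon.loaders import PlaywrightLoader

patient = PlaywrightLoader(timeout=50_000, wait_until="networkidle")  # wait up to 50 s for network to go idle
fast = PlaywrightLoader(timeout=10_000)  # wait_until=None falls back to Playwright's default ("load")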
kabigon/loaders/ptt.py
ADDED
@@ -0,0 +1,27 @@
+from urllib.parse import urlparse
+
+from kabigon.core.exception import InvalidURLError
+from kabigon.core.loader import Loader
+
+from .httpx import HttpxLoader
+
+
+def check_ptt_url(url: str) -> None:
+    if urlparse(url).netloc != "www.ptt.cc":
+        raise InvalidURLError(url, "PTT")
+
+
+class PttLoader(Loader):
+    def __init__(self) -> None:
+        self.httpx_loader = HttpxLoader(
+            headers={
+                "Accept-Language": "zh-TW,zh;q=0.9,ja;q=0.8,en-US;q=0.7,en;q=0.6",
+                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",  # noqa
+                "Cookie": "over18=1",
+            }
+        )
+
+    async def load(self, url: str) -> str:
+        check_ptt_url(url)
+
+        return await self.httpx_loader.load(url)
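PttLoader is a thin wrapper: it validates the host, then delegates to an HttpxLoader preloaded with the over18=1 cookie that PTT requires. A sketch (the post URL is illustrative):

import asyncio

from kabigon.loaders import PttLoader

loader = PttLoader()
text = asyncio.run(loader.load("https://www.ptt.cc/bbs/Gossiping/M.1700000000.A.ABC.html"))
# Non-PTT hosts are rejected before any network call:
# loader.load("https://example.com") raises InvalidURLError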