kabigon 0.14.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,86 @@
1
+ from urllib.parse import urlparse
2
+ from urllib.parse import urlunparse
3
+
4
+ from playwright.async_api import async_playwright
5
+
6
+ from kabigon.core.exception import InvalidURLError
7
+ from kabigon.core.loader import Loader
8
+
9
+ from .utils import html_to_markdown
10
+
11
# Hostnames recognized as Reddit; any other netloc is rejected by check_reddit_url.
REDDIT_DOMAINS = [
    "reddit.com",
    "www.reddit.com",
    "old.reddit.com",
]

# Desktop Chrome user-agent string sent by the Playwright browser context.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)


def check_reddit_url(url: str) -> None:
    """Check if URL is from Reddit.

    Args:
        url: The URL to check

    Raises:
        InvalidURLError: If the URL's hostname is not a known Reddit domain
    """
    netloc = urlparse(url).netloc
    if netloc not in REDDIT_DOMAINS:
        raise InvalidURLError(url, "Reddit")
34
+
35
+
36
def convert_to_old_reddit(url: str) -> str:
    """Rewrite a Reddit URL so that it points at old.reddit.com.

    Args:
        url: Original Reddit URL

    Returns:
        The same URL with its network location replaced by old.reddit.com
    """
    rewritten = urlparse(url)._replace(netloc="old.reddit.com")
    return str(urlunparse(rewritten))
47
+
48
+
49
class RedditLoader(Loader):
    """Loader for Reddit posts and comments.

    Uses old.reddit.com for better content extraction without CAPTCHA.
    """

    def __init__(self, timeout: float = 30_000) -> None:
        """Initialize RedditLoader.

        Args:
            timeout: Timeout in milliseconds for page loading (default: 30 seconds)
        """
        self.timeout = timeout

    async def load(self, url: str) -> str:
        """Asynchronously load Reddit content from URL.

        Args:
            url: Reddit URL to load

        Returns:
            Loaded content as markdown

        Raises:
            InvalidURLError: If URL is not from Reddit
        """
        check_reddit_url(url)
        url = convert_to_old_reddit(url)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=USER_AGENT)
                page = await context.new_page()
                await page.goto(url, timeout=self.timeout, wait_until="networkidle")
                content = await page.content()
            finally:
                # Always release the browser, even if navigation or
                # content extraction raises.
                await browser.close()

        return html_to_markdown(content)
@@ -0,0 +1,24 @@
1
+ from kabigon.core.exception import InvalidURLError
2
+ from kabigon.core.loader import Loader
3
+
4
+ from .httpx import HttpxLoader
5
+ from .ytdlp import YtdlpLoader
6
+
7
+
8
def check_reel_url(url: str) -> None:
    """Raise InvalidURLError unless *url* starts with the Instagram Reel prefix."""
    reel_prefix = "https://www.instagram.com/reel"
    if not url.startswith(reel_prefix):
        raise InvalidURLError(url, "Instagram Reel")
11
+
12
+
13
class ReelLoader(Loader):
    """Loads an Instagram Reel by combining its audio transcript with the page HTML."""

    def __init__(self) -> None:
        self.httpx_loader = HttpxLoader()
        self.ytdlp_loader = YtdlpLoader()

    async def load(self, url: str) -> str:
        """Return the reel's transcribed audio followed by its HTML-derived text."""
        check_reel_url(url)

        # Transcribe the audio first, then fetch the rendered page content.
        audio_part = await self.ytdlp_loader.load(url)
        html_part = await self.httpx_loader.load(url)

        return "\n\n".join([audio_part, html_part])
@@ -0,0 +1,71 @@
1
+ from urllib.parse import urlparse
2
+
3
+ from playwright.async_api import async_playwright
4
+
5
+ from kabigon.core.exception import InvalidURLError
6
+ from kabigon.core.loader import Loader
7
+
8
+ from .utils import html_to_markdown
9
+
10
# Hostnames recognized as Truth Social; any other netloc is rejected.
TRUTHSOCIAL_DOMAINS = [
    "truthsocial.com",
    "www.truthsocial.com",
]

# Desktop Chrome user-agent string sent by the Playwright browser context.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)


def check_truthsocial_url(url: str) -> None:
    """Check if URL is from Truth Social.

    Args:
        url: The URL to check

    Raises:
        InvalidURLError: If the URL's hostname is not a known Truth Social domain
    """
    netloc = urlparse(url).netloc
    if netloc not in TRUTHSOCIAL_DOMAINS:
        raise InvalidURLError(url, "Truth Social")
32
+
33
+
34
class TruthSocialLoader(Loader):
    """Loader for Truth Social posts.

    Truth Social requires JavaScript rendering and longer wait times
    for content to fully load.
    """

    def __init__(self, timeout: float = 60_000) -> None:
        """Initialize TruthSocialLoader.

        Args:
            timeout: Timeout in milliseconds for page loading (default: 60 seconds)
        """
        self.timeout = timeout

    async def load(self, url: str) -> str:
        """Load Truth Social content from URL.

        Args:
            url: Truth Social URL to load

        Returns:
            Loaded content as markdown

        Raises:
            InvalidURLError: If URL is not from Truth Social
        """
        check_truthsocial_url(url)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=USER_AGENT)
                page = await context.new_page()
                await page.goto(url, timeout=self.timeout, wait_until="networkidle")
                content = await page.content()
            finally:
                # Always release the browser, even if navigation or
                # content extraction raises.
                await browser.close()

        return html_to_markdown(content)
@@ -0,0 +1,115 @@
1
+ import asyncio
2
+ import contextlib
3
+ from urllib.parse import urlparse
4
+ from urllib.parse import urlunparse
5
+
6
+ from loguru import logger
7
+ from playwright.async_api import Error as PlaywrightError
8
+ from playwright.async_api import Page
9
+ from playwright.async_api import Request
10
+ from playwright.async_api import Route
11
+ from playwright.async_api import TimeoutError
12
+ from playwright.async_api import async_playwright
13
+
14
+ from kabigon.core.exception import InvalidURLError
15
+ from kabigon.core.loader import Loader
16
+
17
+ from .utils import html_to_markdown
18
+
19
# Hostnames treated as Twitter/X links, including common mirror/front-end domains.
TWITTER_DOMAINS = [
    "twitter.com",
    "x.com",
    "fxtwitter.com",
    "vxtwitter.com",
    "fixvx.com",
    "twittpr.com",
    "api.fxtwitter.com",
    "fixupx.com",
]

# Desktop Chrome user-agent string sent by the Playwright browser context.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)

# CSS selectors whose visibility indicates the tweet body has rendered,
# most specific first.
TWEET_READY_SELECTORS = [
    'article [data-testid="tweetText"]',
    'article [data-testid="tweet"]',
    '[data-testid="tweetText"]',
]


def replace_domain(url: str, new_domain: str = "x.com") -> str:
    """Return *url* with its network location swapped for *new_domain*."""
    parts = urlparse(url)
    return str(urlunparse(parts._replace(netloc=new_domain)))


def check_x_url(url: str) -> None:
    """Raise InvalidURLError unless *url*'s hostname is a known Twitter/X domain."""
    hostname = urlparse(url).netloc
    if hostname not in TWITTER_DOMAINS:
        raise InvalidURLError(url, "Twitter/X")
48
+
49
+
50
class TwitterLoader(Loader):
    """Loader for tweets, rendered through a headless Playwright browser.

    Navigation waits only for ``domcontentloaded`` and then races the tweet
    selectors, so a slow page still yields whatever content has rendered.
    """

    def __init__(self, timeout: float = 20_000, wait_for_tweet_timeout: float = 15_000) -> None:
        # Both timeouts are in milliseconds (Playwright's convention).
        self.timeout = timeout
        self.wait_for_tweet_timeout = wait_for_tweet_timeout

    async def _wait_for_any_selector(self, page: Page, *, selectors: list[str], timeout_ms: float) -> None:
        """Wait until any one of *selectors* becomes visible, racing them concurrently.

        Propagates whatever the first-finished waiter produced (e.g. a
        TimeoutError); returns silently if ``asyncio.wait`` itself times out
        with no task completed.
        """
        async def wait_one(selector: str) -> None:
            await page.wait_for_selector(selector, state="visible", timeout=timeout_ms)

        tasks = [asyncio.create_task(wait_one(selector)) for selector in selectors]
        try:
            done, pending = await asyncio.wait(
                tasks,
                return_when=asyncio.FIRST_COMPLETED,
                # asyncio.wait takes seconds, while Playwright uses milliseconds.
                timeout=timeout_ms / 1000,
            )
            for task in pending:
                task.cancel()
            for task in done:
                # Re-raise the first waiter's exception, if any.
                task.result()
        finally:
            # Belt-and-braces: cancel anything still running on every exit path.
            for task in tasks:
                if not task.done():
                    task.cancel()

    async def load(self, url: str) -> str:
        """Load a tweet from *url* and return it as markdown.

        Raises:
            InvalidURLError: If URL is not a recognized Twitter/X domain.
        """
        check_x_url(url)

        # Normalize mirror domains (fxtwitter, vxtwitter, ...) to x.com.
        url = replace_domain(url)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=USER_AGENT)
            page = await context.new_page()

            # Skip heavy resources; only the DOM text is needed.
            async def route_handler(route: Route, request: Request) -> None:
                if request.resource_type in {"image", "media", "font"}:
                    await route.abort()
                    return
                await route.continue_()

            await page.route("**/*", route_handler)

            try:
                await page.goto(url, timeout=self.timeout, wait_until="domcontentloaded")
            except TimeoutError as e:
                # Navigation timeout is non-fatal: the selectors below may
                # still find partially rendered content.
                logger.warning("TimeoutError: {}, (url: {}, timeout: {})", e, url, self.timeout)

            with contextlib.suppress(TimeoutError):
                await self._wait_for_any_selector(
                    page,
                    selectors=TWEET_READY_SELECTORS,
                    # Never wait longer than the shorter of the two timeouts.
                    timeout_ms=min(self.timeout or self.wait_for_tweet_timeout, self.wait_for_tweet_timeout),
                )

            try:
                # Prefer the first <article> that actually contains tweet text;
                # fall back to the whole page on any Playwright failure.
                tweet_articles = page.locator("article").filter(has=page.locator('[data-testid="tweetText"]'))
                if await tweet_articles.count() > 0:
                    content = await tweet_articles.nth(0).evaluate("el => el.outerHTML")
                else:
                    content = await page.content()
            except (PlaywrightError, TimeoutError):
                content = await page.content()

            await browser.close()
            return html_to_markdown(content)
@@ -0,0 +1,36 @@
1
+ from pathlib import Path
2
+
3
+ import charset_normalizer
4
+ from markdownify import markdownify
5
+
6
+
7
def normalize_whitespace(text: str) -> str:
    """Strip every line and drop blank lines, joining the remainder with newlines.

    Args:
        text: Arbitrary multi-line text.

    Returns:
        The text with per-line surrounding whitespace removed and empty
        lines discarded.
    """
    # Comprehension replaces the manual `lines += [stripped]` append loop.
    lines = [stripped for line in text.splitlines() if (stripped := line.strip())]
    return "\n".join(lines)
14
+
15
+
16
def html_to_markdown(content: str | bytes) -> str:
    """Convert HTML content to markdown format.

    Args:
        content: HTML content as string or bytes

    Returns:
        Converted markdown text with normalized whitespace
    """
    if isinstance(content, bytes):
        # Auto-detect the byte encoding before decoding to text.
        best_guess = charset_normalizer.from_bytes(content).best()
        content = str(best_guess)

    markdown = markdownify(content, strip=["a", "img"])
    return normalize_whitespace(markdown)
30
+
31
+
32
def read_html_content(f: str | Path) -> str:
    """Read an HTML file and convert it to markdown.

    Args:
        f: Path to the HTML file; the encoding is auto-detected.

    Returns:
        Converted markdown text with normalized whitespace
    """
    content = str(charset_normalizer.from_path(f).best())
    # Reuse the shared HTML-to-markdown pipeline instead of duplicating
    # the markdownify/normalize steps here.
    return html_to_markdown(content)
@@ -0,0 +1,165 @@
1
+ from urllib.parse import parse_qs
2
+ from urllib.parse import urlparse
3
+
4
+ from youtube_transcript_api import YouTubeTranscriptApi
5
+
6
+ from kabigon.core.exception import KabigonError
7
+ from kabigon.core.loader import Loader
8
+
9
# Preferred transcript languages, tried in priority order.
DEFAULT_LANGUAGES = [
    # Chinese
    "zh-TW",
    "zh-Hant",
    "zh",
    "zh-Hans",
    # Japanese, Korean, English
    "ja",
    "ko",
    "en",
    # Major European languages
    "fr",  # French
    "de",  # German
    "es",  # Spanish
    "it",  # Italian
    "pt",  # Portuguese
    "pt-BR",  # Portuguese (Brazil)
    "nl",  # Dutch
    "sv",  # Swedish
    "pl",  # Polish
    # Southeast Asia
    "th",  # Thai
    "vi",  # Vietnamese
    "id",  # Indonesian
    "ms",  # Malay
    "fil",  # Filipino / Tagalog
    # Other common languages
    "ru",  # Russian
    "ar",  # Arabic
    "hi",  # Hindi
]
# URL schemes accepted by parse_video_id.
ALLOWED_SCHEMES = {
    "http",
    "https",
}
# Hostnames accepted by parse_video_id as YouTube (or YouTube-proxy) domains.
ALLOWED_NETLOCS = {
    "youtu.be",
    "m.youtube.com",
    "youtube.com",
    "www.youtube.com",
    "www.youtube-nocookie.com",
    "vid.plus",
}
52
+
53
+
54
class UnsupportedURLSchemeError(KabigonError):
    """Raised when a URL's scheme is not one of the allowed schemes."""

    def __init__(self, scheme: str) -> None:
        super().__init__(f"unsupported URL scheme: {scheme}")


class UnsupportedURLNetlocError(KabigonError):
    """Raised when a URL's hostname is not a recognized YouTube domain."""

    def __init__(self, netloc: str) -> None:
        super().__init__(f"unsupported URL netloc: {netloc}")


class VideoIDError(KabigonError):
    """Raised when an extracted video ID has an invalid form."""

    def __init__(self, video_id: str) -> None:
        super().__init__(f"invalid video ID: {video_id}")


class NoVideoIDFoundError(KabigonError):
    """Raised when no video ID could be extracted from the URL."""

    def __init__(self, url: str) -> None:
        super().__init__(f"no video found in URL: {url}")
72
+
73
+
74
def parse_video_id(url: str) -> str:
    """Parse and extract the video ID from a YouTube URL.

    Supports various YouTube URL formats including:
    - https://www.youtube.com/watch?v=VIDEO_ID
    - https://youtu.be/VIDEO_ID
    - https://m.youtube.com/watch?v=VIDEO_ID
    - https://www.youtube-nocookie.com/watch?v=VIDEO_ID
    - https://vid.plus/VIDEO_ID

    Args:
        url: YouTube video URL.

    Returns:
        11-character video ID.

    Raises:
        UnsupportedURLSchemeError: If URL scheme is not http or https.
        UnsupportedURLNetlocError: If URL domain is not a supported YouTube domain.
        NoVideoIDFoundError: If no video ID parameter found in the URL.
        VideoIDError: If extracted video ID is not exactly 11 characters.

    Example:
        >>> parse_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        'dQw4w9WgXcQ'
        >>> parse_video_id("https://youtu.be/dQw4w9WgXcQ")
        'dQw4w9WgXcQ'
    """
    parsed = urlparse(url)

    if parsed.scheme not in ALLOWED_SCHEMES:
        raise UnsupportedURLSchemeError(parsed.scheme)

    if parsed.netloc not in ALLOWED_NETLOCS:
        raise UnsupportedURLNetlocError(parsed.netloc)

    if parsed.path.endswith("/watch"):
        # Standard watch URLs carry the ID in the ?v= query parameter.
        candidates = parse_qs(parsed.query).get("v")
        if not candidates:
            raise NoVideoIDFoundError(url)
        video_id = candidates[0]
    else:
        # Short-form URLs (youtu.be/ID, /embed/ID, ...) end with the ID.
        video_id = parsed.path.lstrip("/").split("/")[-1]

    if len(video_id) != 11:  # Video IDs are 11 characters long
        raise VideoIDError(video_id)

    return video_id
128
+
129
+
130
def check_youtube_url(url: str) -> None:
    """Validate that the given URL is a supported YouTube URL.

    Delegates to ``parse_video_id`` so that all URL validation (scheme,
    netloc, and video-ID shape) lives in a single place; every validation
    failure is re-raised as ``ValueError`` to preserve this function's
    historical public interface.

    Args:
        url: YouTube video URL to validate.

    Raises:
        ValueError: If URL is invalid or not a supported YouTube URL.
    """
    validation_errors = (
        UnsupportedURLSchemeError,
        UnsupportedURLNetlocError,
        NoVideoIDFoundError,
        VideoIDError,
    )
    try:
        # Only validation matters here; the returned ID is discarded.
        parse_video_id(url)
    except validation_errors as exc:
        raise ValueError(str(exc)) from exc
149
+
150
+
151
class YoutubeLoader(Loader):
    """Loads a YouTube video's transcript via youtube-transcript-api."""

    def __init__(self, languages: list[str] | None = None) -> None:
        """Initialize YoutubeLoader.

        Args:
            languages: Transcript language codes in priority order;
                defaults to DEFAULT_LANGUAGES when omitted or empty.
        """
        self.languages = languages or DEFAULT_LANGUAGES

    def load_sync(self, url: str) -> str:
        """Fetch the transcript for *url* and return it as newline-joined text.

        Args:
            url: YouTube video URL.

        Returns:
            Transcript text, one snippet per line, empty snippets dropped.
        """
        video_id = parse_video_id(url)

        fetched = YouTubeTranscriptApi().fetch(video_id, self.languages)

        # Keep only non-empty snippet texts (comprehension replaces the
        # manual append loop).
        lines = [text for snippet in fetched.snippets if (text := str(snippet.text).strip())]
        return "\n".join(lines)
@@ -0,0 +1,13 @@
1
+ from kabigon.core.loader import Loader
2
+
3
+ from .youtube import check_youtube_url
4
+ from .ytdlp import YtdlpLoader
5
+
6
+
7
class YoutubeYtdlpLoader(Loader):
    """YouTube loader that goes through the generic yt-dlp audio pipeline."""

    def __init__(self) -> None:
        # Delegate download + transcription to the generic yt-dlp loader.
        self.ytdlp_loader = YtdlpLoader()

    def load_sync(self, url: str) -> str:
        # Validate first: raises ValueError for non-YouTube URLs.
        check_youtube_url(url)
        return self.ytdlp_loader.load_sync(url)
@@ -0,0 +1,60 @@
1
+ import os
2
+ import uuid
3
+ from pathlib import Path
4
+
5
+ import yt_dlp
6
+ from loguru import logger
7
+
8
+ from kabigon.core.exception import WhisperNotInstalledError
9
+ from kabigon.core.loader import Loader
10
+
11
+
12
def download_audio(url: str, outtmpl: str | None = None) -> None:
    """Download the best available audio stream for *url* and convert it to MP3.

    Args:
        url: Media URL understood by yt-dlp.
        outtmpl: Optional yt-dlp output template controlling the file name.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        # Reject live streams: they cannot be downloaded to completion.
        "match_filter": yt_dlp.match_filter_func(["!is_live"]),
    }

    if outtmpl is not None:
        ydl_opts["outtmpl"] = outtmpl

    # The ffmpeg binary location can be overridden via the environment.
    ffmpeg_location = os.getenv("FFMPEG_PATH")
    if ffmpeg_location is not None:
        ydl_opts["ffmpeg_location"] = ffmpeg_location

    logger.info("Downloading audio from URL: {} with options: {}", url, ydl_opts)
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
35
+
36
+
37
class YtdlpLoader(Loader):
    """Transcribes a URL's audio track: yt-dlp for download, Whisper for ASR."""

    def __init__(self, model: str = "tiny") -> None:
        """Initialize YtdlpLoader.

        Args:
            model: Whisper model name to load (default: "tiny").

        Raises:
            WhisperNotInstalledError: If the optional whisper package is missing.
        """
        try:
            import whisper
        except ImportError as e:
            raise WhisperNotInstalledError from e

        self.model = whisper.load_model(model)
        self.load_audio = whisper.load_audio

    def load_sync(self, url: str) -> str:
        """Download the audio for *url*, transcribe it, and return the text.

        Args:
            url: Media URL understood by yt-dlp.

        Returns:
            The Whisper transcription, or "" if no text was produced.
        """
        # Random template avoids collisions between concurrent downloads;
        # the MP3 postprocessor yields "<outtmpl>.mp3".
        outtmpl = uuid.uuid4().hex[:20]
        path = str(Path(outtmpl).with_suffix(".mp3"))
        download_audio(url, outtmpl=outtmpl)

        try:
            audio = self.load_audio(path)
            logger.info("Transcribing audio file: {}", path)
            result = self.model.transcribe(audio)
        finally:
            # Clean up the audio file. missing_ok tolerates the file being
            # absent when the download/postprocessing failed, so cleanup
            # does not mask the original exception with FileNotFoundError.
            Path(path).unlink(missing_ok=True)

        return result.get("text", "")
kabigon/py.typed ADDED
File without changes