kabigon 0.14.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kabigon/__init__.py +45 -0
- kabigon/api.py +74 -0
- kabigon/cli.py +13 -0
- kabigon/core/__init__.py +19 -0
- kabigon/core/exception.py +41 -0
- kabigon/core/loader.py +9 -0
- kabigon/loaders/__init__.py +31 -0
- kabigon/loaders/compose.py +26 -0
- kabigon/loaders/firecrawl.py +30 -0
- kabigon/loaders/github.py +184 -0
- kabigon/loaders/httpx.py +16 -0
- kabigon/loaders/pdf.py +46 -0
- kabigon/loaders/playwright.py +37 -0
- kabigon/loaders/ptt.py +27 -0
- kabigon/loaders/reddit.py +86 -0
- kabigon/loaders/reel.py +24 -0
- kabigon/loaders/truthsocial.py +71 -0
- kabigon/loaders/twitter.py +115 -0
- kabigon/loaders/utils.py +36 -0
- kabigon/loaders/youtube.py +165 -0
- kabigon/loaders/youtube_ytdlp.py +13 -0
- kabigon/loaders/ytdlp.py +60 -0
- kabigon/py.typed +0 -0
- kabigon-0.14.2.dist-info/METADATA +319 -0
- kabigon-0.14.2.dist-info/RECORD +28 -0
- kabigon-0.14.2.dist-info/WHEEL +4 -0
- kabigon-0.14.2.dist-info/entry_points.txt +2 -0
- kabigon-0.14.2.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from urllib.parse import urlparse
|
|
2
|
+
from urllib.parse import urlunparse
|
|
3
|
+
|
|
4
|
+
from playwright.async_api import async_playwright
|
|
5
|
+
|
|
6
|
+
from kabigon.core.exception import InvalidURLError
|
|
7
|
+
from kabigon.core.loader import Loader
|
|
8
|
+
|
|
9
|
+
from .utils import html_to_markdown
|
|
10
|
+
|
|
11
|
+
# Hostnames accepted by check_reddit_url; any other netloc is rejected.
REDDIT_DOMAINS = [
    "reddit.com",
    "www.reddit.com",
    "old.reddit.com",
]

# Desktop Chrome user agent used for the Playwright browser context,
# presumably to look like a regular browser — TODO confirm it is still current.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
|
+
def check_reddit_url(url: str) -> None:
    """Check if URL is from Reddit.

    Args:
        url: The URL to check

    Raises:
        InvalidURLError: If URL is not from Reddit
    """
    # Compare the exact netloc against the allow-list; subdomains other than
    # www/old are deliberately rejected.
    netloc = urlparse(url).netloc
    if netloc not in REDDIT_DOMAINS:
        raise InvalidURLError(url, "Reddit")
|
|
36
|
+
def convert_to_old_reddit(url: str) -> str:
    """Convert Reddit URL to old.reddit.com format.

    Args:
        url: Original Reddit URL

    Returns:
        URL with old.reddit.com domain
    """
    rewritten = urlparse(url)._replace(netloc="old.reddit.com")
    return str(urlunparse(rewritten))
|
49
|
+
class RedditLoader(Loader):
    """Loader for Reddit posts and comments.

    Uses old.reddit.com for better content extraction without CAPTCHA.
    """

    def __init__(self, timeout: float = 30_000) -> None:
        """Initialize RedditLoader.

        Args:
            timeout: Timeout in milliseconds for page loading (default: 30 seconds)
        """
        self.timeout = timeout

    async def load(self, url: str) -> str:
        """Asynchronously load Reddit content from URL.

        Args:
            url: Reddit URL to load

        Returns:
            Loaded content as markdown

        Raises:
            InvalidURLError: If URL is not from Reddit
        """
        check_reddit_url(url)
        url = convert_to_old_reddit(url)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=USER_AGENT)
                page = await context.new_page()
                await page.goto(url, timeout=self.timeout, wait_until="networkidle")
                content = await page.content()
            finally:
                # Close even when navigation raises (e.g. timeout) so the
                # headless Chromium process does not leak.
                await browser.close()

        return html_to_markdown(content)
|
kabigon/loaders/reel.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
from kabigon.core.exception import InvalidURLError
|
|
2
|
+
from kabigon.core.loader import Loader
|
|
3
|
+
|
|
4
|
+
from .httpx import HttpxLoader
|
|
5
|
+
from .ytdlp import YtdlpLoader
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def check_reel_url(url: str) -> None:
    """Raise InvalidURLError unless *url* is an Instagram Reel link."""
    is_reel = url.startswith("https://www.instagram.com/reel")
    if not is_reel:
        raise InvalidURLError(url, "Instagram Reel")
|
13
|
+
class ReelLoader(Loader):
    """Load an Instagram Reel as transcribed audio followed by page HTML."""

    def __init__(self) -> None:
        # Two delegates: one for the page markup, one for the audio track.
        self.httpx_loader = HttpxLoader()
        self.ytdlp_loader = YtdlpLoader()

    async def load(self, url: str) -> str:
        """Return the reel's audio transcript and HTML text, blank-line separated."""
        check_reel_url(url)

        transcript = await self.ytdlp_loader.load(url)
        page_text = await self.httpx_loader.load(url)

        return f"{transcript}\n\n{page_text}"
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from urllib.parse import urlparse
|
|
2
|
+
|
|
3
|
+
from playwright.async_api import async_playwright
|
|
4
|
+
|
|
5
|
+
from kabigon.core.exception import InvalidURLError
|
|
6
|
+
from kabigon.core.loader import Loader
|
|
7
|
+
|
|
8
|
+
from .utils import html_to_markdown
|
|
9
|
+
|
|
10
|
+
# Hostnames accepted by check_truthsocial_url; anything else is rejected.
TRUTHSOCIAL_DOMAINS = [
    "truthsocial.com",
    "www.truthsocial.com",
]

# Desktop Chrome user agent for the Playwright browser context.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
|
+
def check_truthsocial_url(url: str) -> None:
    """Check if URL is from Truth Social.

    Args:
        url: The URL to check

    Raises:
        InvalidURLError: If URL is not from Truth Social
    """
    # Exact netloc match against the allow-list; no subdomain wildcarding.
    netloc = urlparse(url).netloc
    if netloc not in TRUTHSOCIAL_DOMAINS:
        raise InvalidURLError(url, "Truth Social")
+
|
|
34
|
+
class TruthSocialLoader(Loader):
    """Loader for Truth Social posts.

    Truth Social requires JavaScript rendering and longer wait times
    for content to fully load.
    """

    def __init__(self, timeout: float = 60_000) -> None:
        """Initialize TruthSocialLoader.

        Args:
            timeout: Timeout in milliseconds for page loading (default: 60 seconds)
        """
        self.timeout = timeout

    async def load(self, url: str) -> str:
        """Load Truth Social content from URL.

        Args:
            url: Truth Social URL to load

        Returns:
            Loaded content as markdown

        Raises:
            InvalidURLError: If URL is not from Truth Social
        """
        check_truthsocial_url(url)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=USER_AGENT)
                page = await context.new_page()
                await page.goto(url, timeout=self.timeout, wait_until="networkidle")
                content = await page.content()
            finally:
                # Close even on navigation failure so Chromium does not leak.
                await browser.close()

        return html_to_markdown(content)
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import contextlib
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
from urllib.parse import urlunparse
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
7
|
+
from playwright.async_api import Error as PlaywrightError
|
|
8
|
+
from playwright.async_api import Page
|
|
9
|
+
from playwright.async_api import Request
|
|
10
|
+
from playwright.async_api import Route
|
|
11
|
+
from playwright.async_api import TimeoutError
|
|
12
|
+
from playwright.async_api import async_playwright
|
|
13
|
+
|
|
14
|
+
from kabigon.core.exception import InvalidURLError
|
|
15
|
+
from kabigon.core.loader import Loader
|
|
16
|
+
|
|
17
|
+
from .utils import html_to_markdown
|
|
18
|
+
|
|
19
|
+
# Hostnames accepted as Twitter/X links, including common mirror front-ends
# (fxtwitter, vxtwitter, ...) that re-serve tweet content.
TWITTER_DOMAINS = [
    "twitter.com",
    "x.com",
    "fxtwitter.com",
    "vxtwitter.com",
    "fixvx.com",
    "twittpr.com",
    "api.fxtwitter.com",
    "fixupx.com",
]

# Desktop Chrome user agent for the Playwright browser context.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)

# CSS selectors (most specific first) used to detect that a tweet has
# rendered before the page HTML is captured.
TWEET_READY_SELECTORS = [
    'article [data-testid="tweetText"]',
    'article [data-testid="tweet"]',
    '[data-testid="tweetText"]',
]
|
+
def replace_domain(url: str, new_domain: str = "x.com") -> str:
    """Return *url* with its network location swapped for *new_domain*."""
    parsed = urlparse(url)
    swapped = parsed._replace(netloc=new_domain)
    return str(urlunparse(swapped))
|
+
def check_x_url(url: str) -> None:
    """Raise InvalidURLError when *url*'s host is not a known Twitter/X domain."""
    host = urlparse(url).netloc
    if host not in TWITTER_DOMAINS:
        raise InvalidURLError(url, "Twitter/X")
|
|
50
|
+
class TwitterLoader(Loader):
    """Loader for tweets via headless Chromium.

    Navigates to the tweet (rewritten to x.com), blocks heavy resources,
    waits for any tweet-ready selector, and returns the first tweet article
    (or the full page as a fallback) converted to markdown.
    """

    def __init__(self, timeout: float = 20_000, wait_for_tweet_timeout: float = 15_000) -> None:
        # Both timeouts are in milliseconds (Playwright convention).
        self.timeout = timeout
        self.wait_for_tweet_timeout = wait_for_tweet_timeout

    async def _wait_for_any_selector(self, page: Page, *, selectors: list[str], timeout_ms: float) -> None:
        """Return once ANY of *selectors* becomes visible; raise on timeout.

        Races one wait task per selector and propagates the first completed
        task's result (so a Playwright TimeoutError from a finished task is
        re-raised via task.result()).
        """

        async def wait_one(selector: str) -> None:
            await page.wait_for_selector(selector, state="visible", timeout=timeout_ms)

        tasks = [asyncio.create_task(wait_one(selector)) for selector in selectors]
        try:
            # asyncio.wait takes seconds, hence the /1000 conversion.
            done, pending = await asyncio.wait(
                tasks,
                return_when=asyncio.FIRST_COMPLETED,
                timeout=timeout_ms / 1000,
            )
            for task in pending:
                task.cancel()
            for task in done:
                # Re-raise any exception captured by a completed task.
                task.result()
            # NOTE(review): if asyncio.wait itself times out, `done` is empty
            # and this method returns without raising — the caller's
            # suppress(TimeoutError) then has nothing to suppress; confirm
            # this silent-return path is intended.
        finally:
            # Best-effort cancellation; cancelled tasks are not awaited here,
            # which may emit "Task was destroyed" warnings — TODO confirm.
            for task in tasks:
                if not task.done():
                    task.cancel()

    async def load(self, url: str) -> str:
        """Load a tweet URL and return its content as markdown.

        Raises:
            InvalidURLError: If *url* is not a recognized Twitter/X domain.
        """
        check_x_url(url)

        # Normalize mirrors (fxtwitter, vxtwitter, ...) to the canonical host.
        url = replace_domain(url)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(user_agent=USER_AGENT)
            page = await context.new_page()

            async def route_handler(route: Route, request: Request) -> None:
                # Skip images/media/fonts to speed up loading; text is enough.
                if request.resource_type in {"image", "media", "font"}:
                    await route.abort()
                    return
                await route.continue_()

            await page.route("**/*", route_handler)

            try:
                await page.goto(url, timeout=self.timeout, wait_until="domcontentloaded")
            except TimeoutError as e:
                # Navigation timeout is non-fatal; the tweet may still render.
                logger.warning("TimeoutError: {}, (url: {}, timeout: {})", e, url, self.timeout)

            with contextlib.suppress(TimeoutError):
                await self._wait_for_any_selector(
                    page,
                    selectors=TWEET_READY_SELECTORS,
                    timeout_ms=min(self.timeout or self.wait_for_tweet_timeout, self.wait_for_tweet_timeout),
                )

            try:
                # Prefer the first article that actually contains tweet text.
                tweet_articles = page.locator("article").filter(has=page.locator('[data-testid="tweetText"]'))
                if await tweet_articles.count() > 0:
                    content = await tweet_articles.nth(0).evaluate("el => el.outerHTML")
                else:
                    content = await page.content()
            except (PlaywrightError, TimeoutError):
                # Fall back to the whole page if locator evaluation fails.
                content = await page.content()

            # NOTE(review): browser.close() is skipped if an earlier await
            # raises outside the try blocks above — consider try/finally.
            await browser.close()
            return html_to_markdown(content)
|
kabigon/loaders/utils.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
|
|
3
|
+
import charset_normalizer
|
|
4
|
+
from markdownify import markdownify
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def normalize_whitespace(text: str) -> str:
    """Strip each line of *text* and drop blank lines, joining with newlines."""
    stripped = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped if line)
|
16
|
+
def html_to_markdown(content: str | bytes) -> str:
    """Convert HTML content to markdown format.

    Args:
        content: HTML content as string or bytes

    Returns:
        Converted markdown text with normalized whitespace
    """
    if isinstance(content, bytes):
        # Detect the charset and decode bytes before conversion.
        content = str(charset_normalizer.from_bytes(content).best())

    markdown = markdownify(content, strip=["a", "img"])
    return normalize_whitespace(markdown)
|
32
|
+
def read_html_content(f: str | Path) -> str:
    """Read the HTML file *f* (charset auto-detected) and return it as markdown."""
    html = str(charset_normalizer.from_path(f).best())
    return normalize_whitespace(markdownify(html, strip=["a", "img"]))
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
from urllib.parse import parse_qs
|
|
2
|
+
from urllib.parse import urlparse
|
|
3
|
+
|
|
4
|
+
from youtube_transcript_api import YouTubeTranscriptApi
|
|
5
|
+
|
|
6
|
+
from kabigon.core.exception import KabigonError
|
|
7
|
+
from kabigon.core.loader import Loader
|
|
8
|
+
|
|
9
|
+
# Transcript languages to request, in priority order.
DEFAULT_LANGUAGES = [
    # Chinese
    "zh-TW",
    "zh-Hant",
    "zh",
    "zh-Hans",
    # Japanese, Korean, English
    "ja",
    "ko",
    "en",
    # Major European languages
    "fr",  # French
    "de",  # German
    "es",  # Spanish
    "it",  # Italian
    "pt",  # Portuguese
    "pt-BR",  # Portuguese (Brazil)
    "nl",  # Dutch
    "sv",  # Swedish
    "pl",  # Polish
    # Southeast Asia
    "th",  # Thai
    "vi",  # Vietnamese
    "id",  # Indonesian
    "ms",  # Malay
    "fil",  # Filipino / Tagalog
    # Other common languages
    "ru",  # Russian
    "ar",  # Arabic
    "hi",  # Hindi
]
# URL schemes accepted by parse_video_id.
ALLOWED_SCHEMES = {
    "http",
    "https",
}
# Hostnames accepted by parse_video_id as YouTube (or proxy) links.
ALLOWED_NETLOCS = {
    "youtu.be",
    "m.youtube.com",
    "youtube.com",
    "www.youtube.com",
    "www.youtube-nocookie.com",
    "vid.plus",
}
+
|
|
54
|
+
class UnsupportedURLSchemeError(KabigonError):
    """Raised by parse_video_id when the URL scheme is not http/https."""

    def __init__(self, scheme: str) -> None:
        super().__init__(f"unsupported URL scheme: {scheme}")


class UnsupportedURLNetlocError(KabigonError):
    """Raised by parse_video_id when the host is not a supported YouTube domain."""

    def __init__(self, netloc: str) -> None:
        super().__init__(f"unsupported URL netloc: {netloc}")


class VideoIDError(KabigonError):
    """Raised when an extracted video ID is not exactly 11 characters long."""

    def __init__(self, video_id: str) -> None:
        super().__init__(f"invalid video ID: {video_id}")


class NoVideoIDFoundError(KabigonError):
    """Raised when a /watch URL carries no ``v`` query parameter."""

    def __init__(self, url: str) -> None:
        super().__init__(f"no video found in URL: {url}")
|
|
74
|
+
def parse_video_id(url: str) -> str:
    """Parse and extract the video ID from a YouTube URL.

    Supports various YouTube URL formats including:
    - https://www.youtube.com/watch?v=VIDEO_ID
    - https://youtu.be/VIDEO_ID
    - https://m.youtube.com/watch?v=VIDEO_ID
    - https://www.youtube-nocookie.com/watch?v=VIDEO_ID
    - https://vid.plus/VIDEO_ID

    Args:
        url: YouTube video URL.

    Returns:
        11-character video ID.

    Raises:
        UnsupportedURLSchemeError: If URL scheme is not http or https.
        UnsupportedURLNetlocError: If URL domain is not a supported YouTube domain.
        NoVideoIDFoundError: If no video ID parameter found in the URL.
        VideoIDError: If extracted video ID is not exactly 11 characters.

    Example:
        >>> parse_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        'dQw4w9WgXcQ'
        >>> parse_video_id("https://youtu.be/dQw4w9WgXcQ")
        'dQw4w9WgXcQ'
    """
    parsed = urlparse(url)

    if parsed.scheme not in ALLOWED_SCHEMES:
        raise UnsupportedURLSchemeError(parsed.scheme)
    if parsed.netloc not in ALLOWED_NETLOCS:
        raise UnsupportedURLNetlocError(parsed.netloc)

    if parsed.path.endswith("/watch"):
        # Standard watch page: the ID lives in the "v" query parameter.
        ids = parse_qs(parsed.query).get("v")
        if not ids:
            raise NoVideoIDFoundError(url)
        video_id = ids[0]
    else:
        # Short/embedded forms: the ID is the last path segment.
        video_id = parsed.path.lstrip("/").split("/")[-1]

    if len(video_id) != 11:  # Video IDs are 11 characters long
        raise VideoIDError(video_id)

    return video_id
+
|
|
130
|
+
def check_youtube_url(url: str) -> None:
    """Validate that the given URL is a supported YouTube URL.

    Validation is delegated to ``parse_video_id`` so that scheme, netloc and
    video-ID checks live in a single place; any failure is re-raised as
    ``ValueError`` to preserve this function's historical public interface.

    Args:
        url: YouTube video URL to validate.

    Raises:
        ValueError: If URL is invalid or not a supported YouTube URL.
    """
    try:
        parse_video_id(url)  # The returned ID is deliberately discarded.
    except (NoVideoIDFoundError, UnsupportedURLNetlocError, UnsupportedURLSchemeError, VideoIDError) as exc:
        raise ValueError(str(exc)) from exc
|
+
|
|
151
|
+
class YoutubeLoader(Loader):
    """Load a YouTube video's transcript as newline-joined plain text."""

    def __init__(self, languages: list[str] | None = None) -> None:
        # Fall back to the module-wide language preference list.
        self.languages = languages or DEFAULT_LANGUAGES

    def load_sync(self, url: str) -> str:
        """Fetch the transcript for *url* and return its non-empty lines."""
        video_id = parse_video_id(url)

        transcript = YouTubeTranscriptApi().fetch(video_id, self.languages)

        pieces = (str(snippet.text).strip() for snippet in transcript.snippets)
        return "\n".join(piece for piece in pieces if piece)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from kabigon.core.loader import Loader
|
|
2
|
+
|
|
3
|
+
from .youtube import check_youtube_url
|
|
4
|
+
from .ytdlp import YtdlpLoader
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class YoutubeYtdlpLoader(Loader):
    """Load a YouTube video by downloading its audio and transcribing it.

    Thin wrapper: validates the URL with check_youtube_url, then delegates
    the download + transcription work to YtdlpLoader.
    """

    def __init__(self) -> None:
        # YtdlpLoader loads a whisper model at construction time.
        self.ytdlp_loader = YtdlpLoader()

    def load_sync(self, url: str) -> str:
        """Validate *url* as a YouTube link, then transcribe its audio."""
        check_youtube_url(url)
        return self.ytdlp_loader.load_sync(url)
|
kabigon/loaders/ytdlp.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import uuid
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import yt_dlp
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
from kabigon.core.exception import WhisperNotInstalledError
|
|
9
|
+
from kabigon.core.loader import Loader
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def download_audio(url: str, outtmpl: str | None = None) -> None:
    """Download the audio track of *url* as a 192 kbps MP3 via yt-dlp.

    Args:
        url: Page URL understood by yt-dlp.
        outtmpl: Optional yt-dlp output template; when omitted, yt-dlp's
            default naming is used.
    """
    options = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        # Refuse live streams — they cannot be downloaded to completion.
        "match_filter": yt_dlp.match_filter_func(["!is_live"]),
    }

    if outtmpl is not None:
        options["outtmpl"] = outtmpl

    # Allow overriding the ffmpeg binary location via the environment.
    ffmpeg_location = os.getenv("FFMPEG_PATH")
    if ffmpeg_location is not None:
        options["ffmpeg_location"] = ffmpeg_location

    logger.info("Downloading audio from URL: {} with options: {}", url, options)
    with yt_dlp.YoutubeDL(options) as downloader:
        downloader.download([url])
|
|
37
|
+
class YtdlpLoader(Loader):
    """Download a URL's audio with yt-dlp and transcribe it with whisper."""

    def __init__(self, model: str = "tiny") -> None:
        """Load the whisper model used for transcription.

        Args:
            model: Whisper model name (default: "tiny").

        Raises:
            WhisperNotInstalledError: If the optional whisper package is missing.
        """
        try:
            import whisper
        except ImportError as e:
            raise WhisperNotInstalledError from e

        self.model = whisper.load_model(model)
        self.load_audio = whisper.load_audio

    def load_sync(self, url: str) -> str:
        """Download the audio for *url*, transcribe it, and return the text.

        The temporary MP3 is always cleaned up, even when loading or
        transcription fails.
        """
        # Random basename; the FFmpegExtractAudio postprocessor emits
        # "<basename>.mp3" in the working directory.
        outtmpl = uuid.uuid4().hex[:20]
        path = str(Path(outtmpl).with_suffix(".mp3"))
        download_audio(url, outtmpl=outtmpl)

        try:
            audio = self.load_audio(path)
            logger.info("Transcribing audio file: {}", path)
            result = self.model.transcribe(audio)
        finally:
            # Clean up the audio file. missing_ok prevents a FileNotFoundError
            # here from masking the real failure when the download errored or
            # produced a differently named file.
            Path(path).unlink(missing_ok=True)

        return result.get("text", "")
|
kabigon/py.typed
ADDED
|
File without changes
|