kabigon-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kabigon/__init__.py +19 -0
- kabigon/cli.py +11 -0
- kabigon/cloudscraper.py +14 -0
- kabigon/httpx.py +19 -0
- kabigon/loader.py +7 -0
- kabigon/pdf.py +52 -0
- kabigon/pipeline.py +67 -0
- kabigon/playwright.py +38 -0
- kabigon/py.typed +1 -0
- kabigon/reel.py +31 -0
- kabigon/singlefile.py +67 -0
- kabigon/utils.py +36 -0
- kabigon/youtube.py +92 -0
- kabigon/ytdlp.py +132 -0
- kabigon-0.1.0.dist-info/METADATA +22 -0
- kabigon-0.1.0.dist-info/RECORD +19 -0
- kabigon-0.1.0.dist-info/WHEEL +4 -0
- kabigon-0.1.0.dist-info/entry_points.txt +2 -0
- kabigon-0.1.0.dist-info/licenses/LICENSE +21 -0
kabigon/__init__.py
ADDED
@@ -0,0 +1,19 @@
import os
import sys
from typing import Final

from loguru import logger

from .cloudscraper import CloudscraperLoader
from .httpx import HttpxLoader
from .loader import Loader
from .pdf import PDFLoader
from .pipeline import PipelineLoader
from .playwright import PlaywrightLoader
from .reel import ReelLoader
from .singlefile import SinglefileLoader
from .youtube import YoutubeLoader
from .ytdlp import YtdlpLoader

LOGURU_LEVEL: Final[str] = os.getenv("LOGURU_LEVEL", "INFO")
logger.configure(handlers=[{"sink": sys.stderr, "level": LOGURU_LEVEL}])
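The package configures loguru at import time: LOGURU_LEVEL is read from the environment once, so the level has to be set before the first import of kabigon. A minimal sketch, assuming the package is installed:

import os

os.environ["LOGURU_LEVEL"] = "DEBUG"  # must be set before importing kabigon

import kabigon  # noqa: E402  # the import configures loguru's stderr sink at DEBUG

loader = kabigon.HttpxLoader()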
kabigon/cli.py
ADDED
kabigon/cloudscraper.py
ADDED
@@ -0,0 +1,14 @@
import cloudscraper
import timeout_decorator

from .loader import Loader
from .utils import html_to_markdown


class CloudscraperLoader(Loader):
    @timeout_decorator.timeout(5)
    def load(self, url: str) -> str:
        client = cloudscraper.create_scraper()
        response = client.get(url, allow_redirects=True)
        response.raise_for_status()
        return html_to_markdown(response.text)
kabigon/httpx.py
ADDED
@@ -0,0 +1,19 @@
import httpx
import timeout_decorator

from .loader import Loader
from .utils import html_to_markdown

DEFAULT_HEADERS = {
    "Accept-Language": "zh-TW,zh;q=0.9,ja;q=0.8,en-US;q=0.7,en;q=0.6",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",  # noqa
    "Cookie": "over18=1",  # Required for some sites like PTT
}


class HttpxLoader(Loader):
    @timeout_decorator.timeout(5)
    def load(self, url: str) -> str:
        response = httpx.get(url, headers=DEFAULT_HEADERS, follow_redirects=True)
        response.raise_for_status()
        return html_to_markdown(response.content)
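HttpxLoader fetches the page with desktop-browser headers (including the over18=1 cookie some sites such as PTT require) and converts the HTML to markdown; the call is wrapped in a 5-second timeout_decorator, which raises timeout_decorator.TimeoutError on slow responses. A minimal usage sketch with an illustrative URL:

import timeout_decorator

from kabigon.httpx import HttpxLoader

loader = HttpxLoader()
try:
    markdown = loader.load("https://www.ptt.cc/bbs/Gossiping/index.html")
    print(markdown[:200])
except timeout_decorator.TimeoutError:
    print("page took longer than 5 seconds to load")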
kabigon/loader.py
ADDED
kabigon/pdf.py
ADDED
@@ -0,0 +1,52 @@
import tempfile
from pathlib import Path

import httpx
import timeout_decorator
from pypdf import PdfReader

from .loader import Loader
from .loader import LoaderError

DEFAULT_HEADERS = {
    "Accept-Language": "zh-TW,zh;q=0.9,ja;q=0.8,en-US;q=0.7,en;q=0.6",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",  # noqa
}


class NotPDFError(LoaderError):
    pass


class PDFLoader(Loader):
    @timeout_decorator.timeout(5)
    def load(self, url_or_file: str) -> str:
        if url_or_file.startswith("http"):
            url_or_file = download_pdf_from_url(url_or_file)
        return read_pdf_content(url_or_file)


def download_pdf_from_url(url: str) -> str:
    response = httpx.get(url=url, headers=DEFAULT_HEADERS, follow_redirects=True)
    response.raise_for_status()

    is_pdf = response.headers.get("content-type") == "application/pdf"
    if not is_pdf:
        raise NotPDFError(f"URL is not a PDF: {url}")

    suffix = ".pdf" if is_pdf else None
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as fp:
        fp.write(response.content)
        return fp.name


def read_pdf_content(f: str | Path) -> str:
    lines = []
    with PdfReader(f) as reader:
        for page in reader.pages:
            text = page.extract_text(extraction_mode="plain")
            for line in text.splitlines():
                if not line.strip():
                    continue
                lines.append(line.strip())
    return "\n".join(lines)
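PDFLoader accepts either a local path or an http(s) URL; remote files are downloaded to a temporary file first and rejected with NotPDFError when the response is not served as application/pdf. A minimal sketch with a hypothetical file name:

from kabigon.pdf import NotPDFError
from kabigon.pdf import PDFLoader

loader = PDFLoader()
try:
    text = loader.load("report.pdf")  # local file; a URL such as "https://example.com/paper.pdf" also works
    print(text[:200])
except NotPDFError as e:
    print(e)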
kabigon/pipeline.py
ADDED
@@ -0,0 +1,67 @@
from urllib.parse import urlparse
from urllib.parse import urlunparse

from loguru import logger

from .cloudscraper import CloudscraperLoader
from .httpx import HttpxLoader
from .loader import Loader
from .loader import LoaderError
from .pdf import PDFLoader
from .reel import ReelLoader
from .singlefile import SinglefileLoader
from .youtube import YoutubeLoader
from .ytdlp import YtdlpLoader

REPLACEMENTS = {
    "api.fxtwitter.com": [
        "twitter.com",
        "x.com",
        "fxtwitter.com",
        "vxtwitter.com",
        "fixvx.com",
        "twittpr.com",
        "fixupx.com",
    ]
}


def replace_domain(url: str) -> str:
    parsed = urlparse(url)
    for target, source in REPLACEMENTS.items():
        if parsed.netloc in source:
            fixed_url = parsed._replace(netloc=target)
            return urlunparse(fixed_url)
    return url


class PipelineLoader(Loader):
    def __init__(self) -> None:
        self.loaders: list[Loader] = [
            YoutubeLoader(),
            ReelLoader(),
            YtdlpLoader(),
            PDFLoader(),
            CloudscraperLoader(),
            HttpxLoader(),
            SinglefileLoader(),
        ]

    def load(self, url: str) -> str:
        url = replace_domain(url)

        for loader in self.loaders:
            try:
                loaded_content = loader.load(url)

                if not loaded_content:
                    logger.info("[{}] Failed to load URL: {}, got empty result", loader.__class__.__name__, url)
                    continue

                logger.info("[{}] Successfully loaded URL: {}", loader.__class__.__name__, url)
                return loaded_content

            except Exception as e:
                logger.info("[{}] Failed to load URL: {}, got error: {}", loader.__class__.__name__, url, e)

        raise LoaderError(f"Failed to load URL: {url}")
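PipelineLoader rewrites known Twitter/X domains to api.fxtwitter.com, then tries each loader in order and returns the first non-empty result, raising LoaderError only after every loader has failed. A minimal usage sketch, assuming the optional runtime tools (ffmpeg, the single-file CLI) are available and using a hypothetical URL:

from kabigon import PipelineLoader

loader = PipelineLoader()
# "https://x.com/user/status/123" is rewritten to api.fxtwitter.com before loading
content = loader.load("https://x.com/user/status/123")
print(content)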
kabigon/playwright.py
ADDED
@@ -0,0 +1,38 @@
from typing import Literal

import timeout_decorator
from loguru import logger
from playwright.sync_api import TimeoutError
from playwright.sync_api import sync_playwright

from .loader import Loader
from .utils import html_to_markdown


class PlaywrightLoader(Loader):
    def __init__(
        self,
        timeout: int = 10_000,
        wait_until: Literal["commit", "domcontentloaded", "load", "networkidle"] = "networkidle",
        browser_headless: bool = False,
    ) -> None:
        self.timeout = timeout
        self.wait_until = wait_until
        self.browser_headless = browser_headless

    @timeout_decorator.timeout(5)
    def load(self, url: str) -> str:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=self.browser_headless)
            page = browser.new_page()

            try:
                page.goto(url, timeout=self.timeout, wait_until=self.wait_until)
            except TimeoutError as e:
                logger.error("TimeoutError: {}", e)
                page.goto(url)

            content = page.content()
            browser.close()

        return html_to_markdown(content)
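PlaywrightLoader renders the page in a real Chromium instance before converting it to markdown, which helps with JavaScript-heavy sites; note that it is not part of the default PipelineLoader chain, it needs the Playwright browsers installed (playwright install chromium), and the load call itself is wrapped in a 5-second timeout_decorator. A minimal sketch:

from kabigon.playwright import PlaywrightLoader

loader = PlaywrightLoader(timeout=10_000, wait_until="networkidle", browser_headless=True)
markdown = loader.load("https://example.com")
print(markdown)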
kabigon/py.typed
ADDED
@@ -0,0 +1 @@
kabigon/reel.py
ADDED
@@ -0,0 +1,31 @@
import timeout_decorator

from .httpx import HttpxLoader
from .loader import Loader
from .loader import LoaderError
from .ytdlp import YtdlpLoader


def is_reel_url(url: str) -> bool:
    return url.startswith("https://www.instagram.com/reel")


class NotReelURLError(LoaderError):
    def __init__(self, url: str):
        super().__init__(f"URL is not an Instagram Reel: {url}")


class ReelLoader(Loader):
    def __init__(self) -> None:
        self.httpx_loader = HttpxLoader()
        self.ytdlp_loader = YtdlpLoader()

    @timeout_decorator.timeout(300)
    def load(self, url: str) -> str:
        if not is_reel_url(url):
            raise NotReelURLError(url)

        audio_content = self.ytdlp_loader.load(url)
        html_content = self.httpx_loader.load(url)

        return f"{audio_content}\n\n{html_content}"
kabigon/singlefile.py
ADDED
@@ -0,0 +1,67 @@
import os
import subprocess
import tempfile
from functools import cache
from pathlib import Path
from typing import Final

import charset_normalizer
import timeout_decorator
from loguru import logger

from .loader import Loader
from .utils import html_to_markdown

DEFAULT_SINGLEFILE_PATH: Final[str] = "single-file"


@cache
def get_singlefile_path() -> str:
    path = os.getenv("SINGLEFILE_PATH")
    if not path:
        path = DEFAULT_SINGLEFILE_PATH
        logger.warning("SINGLEFILE_PATH not set, using default: {}", DEFAULT_SINGLEFILE_PATH)
    return path


class SinglefileLoader(Loader):
    def __init__(self, cookies_file: str | None = None, browser_headless: bool = False) -> None:
        self.cookies_file = cookies_file
        self.browser_headless = browser_headless

    @timeout_decorator.timeout(20)
    def load(self, url: str) -> str:
        filename = self.download(url)
        content = str(charset_normalizer.from_path(filename).best())
        return html_to_markdown(content)

    def download(self, url: str) -> str:
        logger.info("Downloading HTML using SingleFile: {}", url)

        filename = tempfile.mktemp(suffix=".html")
        singlefile_path = get_singlefile_path()

        cmds = [singlefile_path]

        if self.cookies_file is not None:
            cookies_path = Path(self.cookies_file)
            if not cookies_path.exists():
                raise FileNotFoundError(f"Cookies file not found: {self.cookies_file}")

            cmds += [
                "--browser-cookies-file",
                str(cookies_path),
            ]

        cmds += [
            "--filename-conflict-action",
            "overwrite",
            "--browser-headless",
            str(self.browser_headless).lower(),
            url,
            filename,
        ]

        subprocess.run(cmds)

        return filename
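SinglefileLoader shells out to the SingleFile CLI, so the binary must be reachable either on PATH as single-file or via the SINGLEFILE_PATH environment variable (read once, since get_singlefile_path is cached). A minimal sketch, assuming the CLI is installed separately (e.g. via the single-file-cli npm package) and that /usr/local/bin/single-file is where it landed:

import os

from kabigon.singlefile import SinglefileLoader

os.environ["SINGLEFILE_PATH"] = "/usr/local/bin/single-file"  # optional override; set before the first load
loader = SinglefileLoader(cookies_file=None, browser_headless=True)
markdown = loader.load("https://example.com")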
kabigon/utils.py
ADDED
@@ -0,0 +1,36 @@
from pathlib import Path

import charset_normalizer
from markdownify import markdownify


def normalize_whitespace(text: str) -> str:
    lines = []
    for line in text.splitlines():
        stripped = line.strip()
        if stripped:
            lines += [stripped]
    return "\n".join(lines)


def html_to_markdown(content: str | bytes) -> str:
    """Convert HTML content to markdown format.

    Args:
        content: HTML content as string or bytes

    Returns:
        Converted markdown text with normalized whitespace
    """
    if isinstance(content, bytes):
        content = str(charset_normalizer.from_bytes(content).best())

    md = markdownify(content, strip=["a", "img"])
    return normalize_whitespace(md)


def read_html_content(f: str | Path) -> str:
    content = str(charset_normalizer.from_path(f).best())

    md = markdownify(content, strip=["a", "img"])
    return normalize_whitespace(md)
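html_to_markdown strips link and image tags during conversion and collapses blank lines via normalize_whitespace; bytes input is decoded with charset_normalizer first. For example:

from kabigon.utils import html_to_markdown

html = "<h1>Title</h1><p>Hello <a href='https://example.com'>world</a></p>"
# Prints a markdown heading followed by "Hello world", with the link markup stripped
print(html_to_markdown(html))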
kabigon/youtube.py
ADDED
@@ -0,0 +1,92 @@
from urllib.parse import parse_qs
from urllib.parse import urlparse

import timeout_decorator
from youtube_transcript_api import YouTubeTranscriptApi

from .loader import Loader
from .loader import LoaderError

DEFAULT_LANGUAGES = ["zh-TW", "zh-Hant", "zh", "zh-Hans", "ja", "en", "ko"]
ALLOWED_SCHEMES = {
    "http",
    "https",
}
ALLOWED_NETLOCS = {
    "youtu.be",
    "m.youtube.com",
    "youtube.com",
    "www.youtube.com",
    "www.youtube-nocookie.com",
    "vid.plus",
}


class UnsupportedURLSchemeError(LoaderError):
    def __init__(self, scheme: str) -> None:
        super().__init__(f"unsupported URL scheme: {scheme}")


class UnsupportedURLNetlocError(LoaderError):
    def __init__(self, netloc: str) -> None:
        super().__init__(f"unsupported URL netloc: {netloc}")


class VideoIDError(LoaderError):
    def __init__(self, video_id: str) -> None:
        super().__init__(f"invalid video ID: {video_id}")


class NoVideoIDFoundError(LoaderError):
    def __init__(self, url: str) -> None:
        super().__init__(f"no video found in URL: {url}")


def parse_video_id(url: str) -> str:
    """Parse a YouTube URL and return the video ID if valid, otherwise None."""
    parsed_url = urlparse(url)

    if parsed_url.scheme not in ALLOWED_SCHEMES:
        raise UnsupportedURLSchemeError(parsed_url.scheme)

    if parsed_url.netloc not in ALLOWED_NETLOCS:
        raise UnsupportedURLNetlocError(parsed_url.netloc)

    path = parsed_url.path

    if path.endswith("/watch"):
        query = parsed_url.query
        parsed_query = parse_qs(query)
        if "v" in parsed_query:
            ids = parsed_query["v"]
            video_id = ids if isinstance(ids, str) else ids[0]
        else:
            raise NoVideoIDFoundError(url)
    else:
        path = parsed_url.path.lstrip("/")
        video_id = path.split("/")[-1]

    if len(video_id) != 11:  # Video IDs are 11 characters long
        raise VideoIDError(video_id)

    return video_id


class YoutubeLoader(Loader):
    def __init__(self, languages: list[str] | None = None) -> None:
        self.languages = languages or DEFAULT_LANGUAGES

    @timeout_decorator.timeout(20)
    def load(self, url: str) -> str:
        video_id = parse_video_id(url)

        transcript_pieces: list[dict[str, str | float]] = YouTubeTranscriptApi().get_transcript(
            video_id, self.languages
        )

        lines = []
        for transcript_piece in transcript_pieces:
            text = str(transcript_piece.get("text", "")).strip()
            if text:
                lines.append(text)
        return "\n".join(lines)
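parse_video_id accepts the usual YouTube URL shapes (watch URLs with a v query parameter, youtu.be short links, and so on) and validates the 11-character ID before YoutubeLoader fetches the transcript for the first available language in DEFAULT_LANGUAGES. A small sketch of the URL parsing:

from kabigon.youtube import parse_video_id

print(parse_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ"))  # dQw4w9WgXcQ
print(parse_video_id("https://youtu.be/dQw4w9WgXcQ"))                 # dQw4w9WgXcQ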
kabigon/ytdlp.py
ADDED
@@ -0,0 +1,132 @@
import functools
import hashlib
import os
import subprocess
import tempfile
from typing import Final

import numpy as np
import timeout_decorator
import whisper
import yt_dlp
from loguru import logger

from .loader import Loader

try:
    import mlx_whisper  # noqa: F401

    _mlx_whisper_installed = True
except ImportError:
    _mlx_whisper_installed = False


DEFAULT_FFMPEG_PATH: Final[str] = "ffmpeg"


def hash_url(url: str) -> str:
    return hashlib.sha512(url.encode("utf-8")).hexdigest()


def get_ffmpeg_path() -> str:
    path = os.getenv("FFMPEG_PATH")
    if not path:
        path = DEFAULT_FFMPEG_PATH
        logger.warning("FFMPEG_PATH not set, using default: {}", DEFAULT_FFMPEG_PATH)

    return path


def download_audio(url: str) -> str:
    ffmpeg_path = get_ffmpeg_path()

    filename = os.path.join(
        tempfile.gettempdir(),
        hash_url(url),
    )

    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "mp3",
                "preferredquality": "192",
            }
        ],
        "outtmpl": filename,
        "ffmpeg_location": ffmpeg_path,
        "match_filter": yt_dlp.match_filter_func(["!is_live"]),
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])

    return filename + ".mp3"


def load_audio(file: str, sr: int = 16000):
    """
    Open an audio file and read as mono waveform, resampling as necessary

    Parameters
    ----------
    file: str
        The audio file to open

    sr: int
        The sample rate to resample the audio if necessary

    Returns
    -------
    A NumPy array containing the audio waveform, in float32 dtype.
    """
    ffmpeg_path = get_ffmpeg_path()

    # This launches a subprocess to decode audio while down-mixing
    # and resampling as necessary. Requires the ffmpeg CLI in PATH.
    # fmt: off
    cmd = [
        ffmpeg_path,
        "-nostdin",
        "-threads", "0",
        "-i", file,
        "-f", "s16le",
        "-ac", "1",
        "-acodec", "pcm_s16le",
        "-ar", str(sr),
        "-"
    ]
    # fmt: on
    try:
        out = subprocess.run(cmd, capture_output=True, check=True).stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0


@functools.cache
def _load_whisper_model() -> whisper.Whisper:
    return whisper.load_model("tiny")


def _transcribe(audio: np.ndarray) -> dict:
    if _mlx_whisper_installed:
        return mlx_whisper.transcribe(audio, path_or_hf_repo="mlx-community/whisper-tiny")

    model = _load_whisper_model()
    return model.transcribe(audio)


class YtdlpLoader(Loader):
    @timeout_decorator.timeout(300)
    def load(self, url: str) -> str:
        audio_file = download_audio(url)
        audio = load_audio(audio_file)

        # Clean up the audio file
        os.remove(audio_file)

        result = _transcribe(audio)
        return result.get("text", "")
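YtdlpLoader downloads the best audio stream to a temp file named after the SHA-512 of the URL, decodes it to a 16 kHz mono float32 waveform with ffmpeg, and transcribes it with Whisper (the tiny model, or mlx_whisper when that package is importable), so it needs ffmpeg available at runtime. A minimal sketch, assuming ffmpeg is on PATH:

from kabigon.ytdlp import YtdlpLoader

loader = YtdlpLoader()
transcript = loader.load("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
print(transcript)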
kabigon-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,22 @@
Metadata-Version: 2.4
Name: kabigon
Version: 0.1.0
Author-email: narumi <toucans-cutouts0f@icloud.com>
License-File: LICENSE
Requires-Python: >=3.12
Requires-Dist: click>=8.1.8
Requires-Dist: cloudscraper>=1.2.71
Requires-Dist: httpx>=0.28.1
Requires-Dist: loguru>=0.7.3
Requires-Dist: markdownify>=0.14.1
Requires-Dist: numba>=0.61.0
Requires-Dist: openai-whisper>=20240930
Requires-Dist: playwright>=1.50.0
Requires-Dist: pypdf>=5.3.0
Requires-Dist: rich>=13.9.4
Requires-Dist: timeout-decorator>=0.5.0
Requires-Dist: youtube-transcript-api>=0.6.3
Requires-Dist: yt-dlp>=2025.1.26
Description-Content-Type: text/markdown

# python-template
kabigon-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,19 @@
kabigon/__init__.py,sha256=XFwiwCa3r17Ehn4_unnWmTXZ54EFQRRTvNaCjE2f5I4,553
kabigon/cli.py,sha256=eOto76gMuHSqpn9XuJsoUY1sG-lDb2Ny7DLbhSyqWAI,221
kabigon/cloudscraper.py,sha256=viaIWATsS8nD9HN0RfBiveUHuL012OjuaKlKwLEteGw,403
kabigon/httpx.py,sha256=SfhaJXNKlFOwWs_Eeadiegi5wNvZV0RX7lqQwR8nYGo,667
kabigon/loader.py,sha256=_iRrjwW9SPWnqnsl8QFB1uEgh4Ed27zXio6GEl98b5A,126
kabigon/pdf.py,sha256=9Oi_ZP7D0LLWs2D8KO3omRld1nYyhEbQMForZUw9YZg,1548
kabigon/pipeline.py,sha256=XCjZMpOrbIfNcL6hTShvBHpsyOkKymvNRirp6PVIIDA,1874
kabigon/playwright.py,sha256=0CkSDY90i7PHKxrJ6Zwad_NDCy5TPyCjeFvLrQuZXFU,1154
kabigon/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
kabigon/reel.py,sha256=dkWXG2nBhIt0DpGJzevkIrRKLqJh_03-yrg_rjf6vnY,828
kabigon/singlefile.py,sha256=2nTCTFgW5Gp3l0ExaVh2foUDVSgLaAssDp3tBoQ1MhY,1860
kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
kabigon/youtube.py,sha256=_wdKvRRAMrYnv3rUhkd_6JuOGCuQClYpj1UlVeYeojc,2615
kabigon/ytdlp.py,sha256=kG1fXqU650otOWespjOSkGK_-jk1wO-sWiR60_UPJxY,3125
kabigon-0.1.0.dist-info/METADATA,sha256=rtCDMxACLbzTuTKc2Yq4kPkteYSUjtbiyeVZHOmARmo,641
kabigon-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
kabigon-0.1.0.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
kabigon-0.1.0.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
kabigon-0.1.0.dist-info/RECORD,,
kabigon-0.1.0.dist-info/licenses/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 なるみ

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.