kabigon 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kabigon might be problematic. Click here for more details.

kabigon/__init__.py ADDED
@@ -0,0 +1,22 @@
1
+ import os
2
+ import sys
3
+ from typing import Final
4
+
5
+ from loguru import logger
6
+
7
+ from .compose import Compose
8
+ from .firecrawl import FirecrawlLoader
9
+ from .httpx import HttpxLoader
10
+ from .loader import Loader
11
+ from .pdf import PDFLoader
12
+ from .playwright import PlaywrightLoader
13
+ from .ptt import PttLoader
14
+ from .reddit import RedditLoader
15
+ from .reel import ReelLoader
16
+ from .twitter import TwitterLoader
17
+ from .youtube import YoutubeLoader
18
+ from .youtube_ytdlp import YoutubeYtdlpLoader
19
+ from .ytdlp import YtdlpLoader
20
+
21
+ LOGURU_LEVEL: Final[str] = os.getenv("LOGURU_LEVEL", "INFO")
22
+ logger.add(sys.stderr, level=LOGURU_LEVEL)
kabigon/cli.py ADDED
@@ -0,0 +1,29 @@
1
+ import typer
2
+ from rich import print
3
+
4
+ from .compose import Compose
5
+ from .pdf import PDFLoader
6
+ from .playwright import PlaywrightLoader
7
+ from .ptt import PttLoader
8
+ from .reel import ReelLoader
9
+ from .twitter import TwitterLoader
10
+ from .youtube import YoutubeLoader
11
+
12
+
13
def run(url: str) -> None:
    """Load ``url`` with the default loader chain and print the result.

    The loaders are handed to ``Compose`` in the order listed below, and the
    text it returns is printed with the ``print`` imported from ``rich``.

    Args:
        url: Address of the resource to load.
    """
    default_chain = [
        PttLoader(),
        TwitterLoader(),
        YoutubeLoader(),
        ReelLoader(),
        PDFLoader(),
        PlaywrightLoader(),
    ]
    content = Compose(default_chain).load(url)
    print(content)
26
+
27
+
28
def main() -> None:
    """Console entry point: hand ``run`` to Typer to build the CLI."""
    typer.run(run)
kabigon/compose.py ADDED
@@ -0,0 +1,42 @@
1
+ from loguru import logger
2
+
3
+ from .loader import Loader
4
+
5
+
6
class Compose(Loader):
    """Loader that tries several loaders in order and returns the first hit.

    A loader "hits" when it returns a truthy result. Loaders that raise or
    return an empty result are logged at info level and skipped.
    """

    def __init__(self, loaders: list[Loader]) -> None:
        """Store the loaders to try.

        Args:
            loaders: Loaders to attempt, in priority order.
        """
        self.loaders = loaders

    def load(self, url: str) -> str:
        """Return the first non-empty result produced by the loaders.

        Args:
            url: Address of the resource to load.

        Returns:
            The first truthy result returned by a loader.

        Raises:
            Exception: If every loader failed or returned an empty result.
                The exception is chained to the last loader error, if any,
                so the underlying cause is not lost.
        """
        last_error: Exception | None = None

        for loader in self.loaders:
            name = loader.__class__.__name__
            try:
                result = loader.load(url)
            except Exception as e:
                last_error = e
                logger.info("[{}] Failed to load URL: {}, got error: {}", name, url, e)
                continue

            if not result:
                logger.info("[{}] Failed to load URL: {}, got empty result", name, url)
                continue

            logger.info("[{}] Successfully loaded URL: {}", name, url)
            return result

        raise Exception(f"Failed to load URL: {url}") from last_error

    async def async_load(self, url: str) -> str:
        """Asynchronous counterpart of :meth:`load`.

        Args:
            url: Address of the resource to load.

        Returns:
            The first truthy result returned by a loader's ``async_load``.

        Raises:
            Exception: If every loader failed or returned an empty result.
                The exception is chained to the last loader error, if any.
        """
        last_error: Exception | None = None

        for loader in self.loaders:
            name = loader.__class__.__name__
            try:
                result = await loader.async_load(url)
            except Exception as e:
                last_error = e
                logger.info("[{}] Failed to load URL: {}, got error: {}", name, url, e)
                continue

            if not result:
                logger.info("[{}] Failed to load URL: {}, got empty result", name, url)
                continue

            logger.info("[{}] Successfully loaded URL: {}", name, url)
            return result

        raise Exception(f"Failed to load URL: {url}") from last_error
kabigon/firecrawl.py ADDED
@@ -0,0 +1,31 @@
1
+ import os
2
+
3
+ from firecrawl import FirecrawlApp
4
+
5
+ from .loader import Loader
6
+
7
+
8
class FirecrawlLoader(Loader):
    """Loader that scrapes a URL to markdown through the Firecrawl API."""

    def __init__(self, timeout: int | None = None) -> None:
        """Create the Firecrawl client.

        Args:
            timeout: Value forwarded as ``timeout`` to ``scrape_url``.

        Raises:
            ValueError: If the ``FIRECRAWL_API_KEY`` environment variable is
                unset or empty.
        """
        self.timeout = timeout

        api_key = os.getenv("FIRECRAWL_API_KEY")
        if not api_key:
            raise ValueError("FIRECRAWL_API_KEY is not set.")

        self.app = FirecrawlApp(api_key=api_key)

    def load(self, url: str) -> str:
        """Scrape ``url`` and return its markdown.

        Args:
            url: Address of the page to scrape.

        Returns:
            The ``markdown`` attribute of the scrape result.

        Raises:
            Exception: If the scrape result reports ``success`` as false.
        """
        result = self.app.scrape_url(  # ty:ignore[possibly-missing-attribute]
            url,
            formats=["markdown"],
            timeout=self.timeout,
        )

        if not result.success:
            raise Exception(f"Failed to load URL: {url}, got: {result.error}")

        return result.markdown

    async def async_load(self, url: str) -> str:
        """Run :meth:`load` in a worker thread.

        ``load`` is a blocking call; awaiting it via a thread keeps the event
        loop free while the scrape is in flight.

        Args:
            url: Address of the page to scrape.

        Returns:
            The markdown returned by :meth:`load`.
        """
        import asyncio

        return await asyncio.to_thread(self.load, url)
kabigon/httpx.py ADDED
@@ -0,0 +1,20 @@
1
+ import httpx
2
+
3
+ from .loader import Loader
4
+ from .utils import html_to_markdown
5
+
6
+
7
class HttpxLoader(Loader):
    """Loader that fetches a page with httpx and converts it to markdown."""

    def __init__(self, headers: dict[str, str] | None = None) -> None:
        """Store the request headers.

        Args:
            headers: Headers sent with every request, or ``None`` for none.
        """
        self.headers = headers

    def load(self, url: str) -> str:
        """Fetch ``url`` synchronously and return it as markdown.

        Redirects are followed, matching :meth:`async_load`, so that a
        redirecting URL behaves the same in both modes.

        Args:
            url: Address of the page to fetch.

        Returns:
            The response body converted by ``html_to_markdown``.

        Raises:
            httpx.HTTPStatusError: If the final response has an error status.
        """
        response = httpx.get(url, headers=self.headers, follow_redirects=True)
        response.raise_for_status()
        return html_to_markdown(response.content)

    async def async_load(self, url: str) -> str:
        """Fetch ``url`` asynchronously and return it as markdown.

        Args:
            url: Address of the page to fetch.

        Returns:
            The response body converted by ``html_to_markdown``.

        Raises:
            httpx.HTTPStatusError: If the final response has an error status.
        """
        async with httpx.AsyncClient() as client:
            response = await client.get(url, headers=self.headers, follow_redirects=True)
            response.raise_for_status()
            return html_to_markdown(response.content)
kabigon/loader.py ADDED
@@ -0,0 +1,13 @@
1
+ import asyncio
2
+ import concurrent.futures
3
+
4
+
5
class Loader:
    """Base class for content loaders.

    Subclasses implement :meth:`load`; :meth:`async_load` has a default
    implementation that runs ``load`` off the event loop.
    """

    def load(self, url: str) -> str:
        """Load the content behind ``url``.

        Args:
            url: Address of the resource to load.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError

    async def async_load(self, url: str) -> str:
        """Run :meth:`load` in a worker thread and return its result.

        A thread is used instead of a process pool: the bound method and its
        instance do not have to be picklable, and no worker processes are
        spawned for each call.

        Args:
            url: Address of the resource to load.

        Returns:
            Whatever :meth:`load` returns for ``url``.
        """
        return await asyncio.to_thread(self.load, url)
kabigon/pdf.py ADDED
@@ -0,0 +1,58 @@
1
+ import io
2
+ from pathlib import Path
3
+ from typing import IO
4
+ from typing import Any
5
+
6
+ import httpx
7
+ from pypdf import PdfReader
8
+
9
+ from .loader import Loader
10
+
11
+ DEFAULT_HEADERS = {
12
+ "Accept-Language": "zh-TW,zh;q=0.9,ja;q=0.8,en-US;q=0.7,en;q=0.6",
13
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36", # noqa
14
+ }
15
+
16
+
17
class NotPDFError(Exception):
    """Error raised when a URL does not serve a PDF document."""

    def __init__(self, url: str) -> None:
        """Build the error message from the offending ``url``."""
        message = f"URL is not a PDF: {url}"
        super().__init__(message)
20
+
21
+
22
class PDFLoader(Loader):
    """Loader that extracts text from a PDF given as a URL or a local path."""

    @staticmethod
    def _is_pdf(content_type: str | None) -> bool:
        """Return whether a ``Content-Type`` header value denotes a PDF.

        Parameters such as ``; charset=binary`` are ignored and the media
        type is compared case-insensitively.
        """
        if not content_type:
            return False
        media_type = content_type.split(";", 1)[0].strip().lower()
        return media_type == "application/pdf"

    def load(self, url_or_file: str) -> str:  # ty:ignore[invalid-method-override]
        """Return the text of the PDF at ``url_or_file``.

        Args:
            url_or_file: A URL starting with ``http``, or a local file path.

        Returns:
            The text produced by ``read_pdf_content``.

        Raises:
            NotPDFError: If the URL's response is not ``application/pdf``.
            httpx.HTTPStatusError: If the response has an error status.
        """
        # Anything that does not look like an HTTP(S) URL is read from disk.
        if not url_or_file.startswith("http"):
            return read_pdf_content(url_or_file)

        resp = httpx.get(url_or_file, headers=DEFAULT_HEADERS, follow_redirects=True)
        resp.raise_for_status()

        if not self._is_pdf(resp.headers.get("content-type")):
            raise NotPDFError(url_or_file)

        return read_pdf_content(io.BytesIO(resp.content))

    async def async_load(self, url_or_file: str) -> str:  # ty:ignore[invalid-method-override]
        """Asynchronous counterpart of :meth:`load`.

        Args:
            url_or_file: A URL starting with ``http``, or a local file path.

        Returns:
            The text produced by ``read_pdf_content``.

        Raises:
            NotPDFError: If the URL's response is not ``application/pdf``.
            httpx.HTTPStatusError: If the response has an error status.
        """
        if not url_or_file.startswith("http"):
            return read_pdf_content(url_or_file)

        async with httpx.AsyncClient() as client:
            resp = await client.get(url_or_file, headers=DEFAULT_HEADERS, follow_redirects=True)
            resp.raise_for_status()

        if not self._is_pdf(resp.headers.get("content-type")):
            raise NotPDFError(url_or_file)

        return read_pdf_content(io.BytesIO(resp.content))
47
+
48
+
49
def read_pdf_content(f: str | Path | IO[Any]) -> str:
    """Extract the non-blank text lines of a PDF.

    Args:
        f: Path to the PDF, or an open binary stream containing it.

    Returns:
        Every non-empty line of every page, stripped of surrounding
        whitespace and joined with newlines.
    """
    with PdfReader(f) as reader:
        kept = [
            candidate
            for page in reader.pages
            for candidate in map(str.strip, page.extract_text(extraction_mode="plain").splitlines())
            if candidate
        ]
    return "\n".join(kept)
kabigon/playwright.py ADDED
@@ -0,0 +1,68 @@
1
+ from typing import Literal
2
+
3
+ from loguru import logger
4
+
5
+ from .loader import Loader
6
+ from .utils import html_to_markdown
7
+
8
+
9
class PlaywrightLoader(Loader):
    """Loader that renders a page in Chromium via Playwright.

    The rendered HTML is converted to markdown with ``html_to_markdown``.
    """

    # Shared by both import guards; the two sentences need a separating space.
    _INSTALL_HINT = (
        "Playwright is not installed. "
        "Please install it with `pip install playwright` and run `playwright install`."
    )

    def __init__(
        self,
        timeout: float | None = 0,
        wait_until: Literal["commit", "domcontentloaded", "load", "networkidle"] | None = None,
        browser_headless: bool = False,
    ) -> None:
        """Store the navigation options.

        Args:
            timeout: Forwarded as ``timeout`` to ``page.goto``.
                NOTE(review): Playwright appears to treat 0 as "no timeout";
                confirm against its documentation.
            wait_until: Forwarded as ``wait_until`` to ``page.goto``.
            browser_headless: Forwarded as ``headless`` to the browser launch.
        """
        self.timeout = timeout
        self.wait_until = wait_until
        self.browser_headless = browser_headless

    def load(self, url: str) -> str:
        """Render ``url`` synchronously and return it as markdown.

        A navigation timeout is logged as a warning and whatever has been
        rendered so far is still returned.

        Args:
            url: Address of the page to render.

        Returns:
            The page HTML converted by ``html_to_markdown``.

        Raises:
            ImportError: If Playwright cannot be imported.
        """
        try:
            from playwright.sync_api import TimeoutError
            from playwright.sync_api import sync_playwright
        except ImportError as e:
            raise ImportError(self._INSTALL_HINT) from e

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=self.browser_headless)
            try:
                context = browser.new_context()
                page = context.new_page()

                try:
                    page.goto(url, timeout=self.timeout, wait_until=self.wait_until)
                except TimeoutError as e:
                    logger.warning("TimeoutError: {}, (url: {}, timeout: {})", e, url, self.timeout)

                content = page.content()
            finally:
                # Close the browser even when a step above raises.
                browser.close()

        return html_to_markdown(content)

    async def async_load(self, url: str) -> str:
        """Render ``url`` asynchronously and return it as markdown.

        A navigation timeout is logged as a warning and whatever has been
        rendered so far is still returned.

        Args:
            url: Address of the page to render.

        Returns:
            The page HTML converted by ``html_to_markdown``.

        Raises:
            ImportError: If Playwright cannot be imported.
        """
        try:
            from playwright.async_api import TimeoutError
            from playwright.async_api import async_playwright
        except ImportError as e:
            raise ImportError(self._INSTALL_HINT) from e

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=self.browser_headless)
            try:
                context = await browser.new_context()
                page = await context.new_page()

                try:
                    await page.goto(url, timeout=self.timeout, wait_until=self.wait_until)
                except TimeoutError as e:
                    logger.warning("TimeoutError: {}, (url: {}, timeout: {})", e, url, self.timeout)

                content = await page.content()
            finally:
                # Close the browser even when a step above raises.
                await browser.close()

        return html_to_markdown(content)
kabigon/ptt.py ADDED
@@ -0,0 +1,30 @@
1
+ from urllib.parse import urlparse
2
+
3
+ from .httpx import HttpxLoader
4
+ from .loader import Loader
5
+
6
+
7
def check_ptt_url(url: str) -> None:
    """Validate that ``url`` is hosted on ``www.ptt.cc``.

    Args:
        url: The URL to check.

    Raises:
        ValueError: If the URL's network location is not ``www.ptt.cc``.
    """
    host = urlparse(url).netloc
    if host == "www.ptt.cc":
        return
    raise ValueError(f"URL must be from ptt.cc, got {url}")
10
+
11
+
12
class PttLoader(Loader):
    """Loader for pages on ``www.ptt.cc``, fetched through ``HttpxLoader``."""

    def __init__(self) -> None:
        """Create the underlying ``HttpxLoader`` with browser-like headers."""
        # The cookie is sent as-is; presumably it answers the site's age
        # prompt - verify against the site if this stops working.
        request_headers = {
            "Accept-Language": "zh-TW,zh;q=0.9,ja;q=0.8,en-US;q=0.7,en;q=0.6",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36",  # noqa
            "Cookie": "over18=1",
        }
        self.httpx_loader = HttpxLoader(headers=request_headers)

    def load(self, url: str) -> str:
        """Validate ``url`` and fetch it synchronously.

        Raises:
            ValueError: If ``url`` is not on ``www.ptt.cc``.
        """
        check_ptt_url(url)
        page_markdown = self.httpx_loader.load(url)
        return page_markdown

    async def async_load(self, url: str):
        """Validate ``url`` and fetch it asynchronously.

        Raises:
            ValueError: If ``url`` is not on ``www.ptt.cc``.
        """
        check_ptt_url(url)
        page_markdown = await self.httpx_loader.async_load(url)
        return page_markdown
kabigon/py.typed ADDED
File without changes
kabigon/reddit.py ADDED
@@ -0,0 +1,114 @@
1
+ from urllib.parse import urlparse
2
+ from urllib.parse import urlunparse
3
+
4
+ from .loader import Loader
5
+
6
+ REDDIT_DOMAINS = [
7
+ "reddit.com",
8
+ "www.reddit.com",
9
+ "old.reddit.com",
10
+ ]
11
+
12
+ USER_AGENT = (
13
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
14
+ )
15
+
16
+
17
def check_reddit_url(url: str) -> None:
    """Validate that ``url`` points at a known Reddit host.

    Args:
        url: The URL to check

    Raises:
        ValueError: If the URL's network location is not in ``REDDIT_DOMAINS``
    """
    host = urlparse(url).netloc
    if host in REDDIT_DOMAINS:
        return
    raise ValueError(f"URL is not a Reddit URL: {url}")
29
+
30
+
31
def convert_to_old_reddit(url: str) -> str:
    """Rewrite ``url`` so that its host is ``old.reddit.com``.

    Only the network location is replaced; scheme, path, query and fragment
    are carried over unchanged.

    Args:
        url: Original Reddit URL

    Returns:
        The same URL with ``old.reddit.com`` as its network location
    """
    parts = urlparse(url)
    rewritten = parts._replace(netloc="old.reddit.com")
    return str(urlunparse(rewritten))
42
+
43
+
44
class RedditLoader(Loader):
    """Loader for Reddit posts and comments.

    Uses old.reddit.com for better content extraction without CAPTCHA.
    """

    def __init__(self, timeout: float = 30_000) -> None:
        """Initialize RedditLoader.

        Args:
            timeout: Timeout in milliseconds for page loading (default: 30 seconds)
        """
        self.timeout = timeout

    def load(self, url: str) -> str:
        """Load Reddit content from URL.

        Args:
            url: Reddit URL to load

        Returns:
            Loaded content as markdown

        Raises:
            ValueError: If URL is not from Reddit
        """
        from playwright.sync_api import sync_playwright

        from .utils import html_to_markdown

        check_reddit_url(url)
        url = convert_to_old_reddit(url)

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(user_agent=USER_AGENT)
                page = context.new_page()
                page.goto(url, timeout=self.timeout, wait_until="networkidle")
                content = page.content()
            finally:
                # Close the browser even when navigation fails or times out.
                browser.close()

        return html_to_markdown(content)

    async def async_load(self, url: str) -> str:
        """Asynchronously load Reddit content from URL.

        Args:
            url: Reddit URL to load

        Returns:
            Loaded content as markdown

        Raises:
            ValueError: If URL is not from Reddit
        """
        from playwright.async_api import async_playwright

        from .utils import html_to_markdown

        check_reddit_url(url)
        url = convert_to_old_reddit(url)

        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(user_agent=USER_AGENT)
                page = await context.new_page()
                await page.goto(url, timeout=self.timeout, wait_until="networkidle")
                content = await page.content()
            finally:
                # Close the browser even when navigation fails or times out.
                await browser.close()

        return html_to_markdown(content)
kabigon/reel.py ADDED
@@ -0,0 +1,30 @@
1
+ from .httpx import HttpxLoader
2
+ from .loader import Loader
3
+ from .ytdlp import YtdlpLoader
4
+
5
+
6
def check_reel_url(url: str) -> None:
    """Validate that ``url`` starts with the Instagram Reel prefix.

    Args:
        url: The URL to check.

    Raises:
        ValueError: If ``url`` does not start with
            ``https://www.instagram.com/reel``.
    """
    is_reel = url.startswith("https://www.instagram.com/reel")
    if is_reel:
        return
    raise ValueError(f"URL is not an Instagram Reel: {url}")
9
+
10
+
11
class ReelLoader(Loader):
    """Loader for Instagram Reels.

    Combines the output of ``YtdlpLoader`` with the page content fetched by
    ``HttpxLoader``.
    """

    def __init__(self) -> None:
        """Create the two loaders whose results are combined."""
        self.httpx_loader = HttpxLoader()
        self.ytdlp_loader = YtdlpLoader()

    def load(self, url: str) -> str:
        """Return the ytdlp result and page content, separated by a blank line.

        Raises:
            ValueError: If ``url`` is not an Instagram Reel URL.
        """
        check_reel_url(url)

        # The ytdlp loader runs first, then the page fetch.
        parts = [self.ytdlp_loader.load(url), self.httpx_loader.load(url)]
        return f"{parts[0]}\n\n{parts[1]}"

    async def async_load(self, url: str):
        """Asynchronous counterpart of :meth:`load`.

        Raises:
            ValueError: If ``url`` is not an Instagram Reel URL.
        """
        check_reel_url(url)

        # Awaited one after the other, in the same order as ``load``.
        transcript = await self.ytdlp_loader.async_load(url)
        page = await self.httpx_loader.async_load(url)
        return f"{transcript}\n\n{page}"
kabigon/twitter.py ADDED
@@ -0,0 +1,44 @@
1
+ from urllib.parse import urlparse
2
+ from urllib.parse import urlunparse
3
+
4
+ from .loader import Loader
5
+ from .playwright import PlaywrightLoader
6
+
7
+ TWITTER_DOMAINS = [
8
+ "twitter.com",
9
+ "x.com",
10
+ "fxtwitter.com",
11
+ "vxtwitter.com",
12
+ "fixvx.com",
13
+ "twittpr.com",
14
+ "api.fxtwitter.com",
15
+ "fixupx.com",
16
+ ]
17
+
18
+
19
def replace_domain(url: str, new_domain: str = "x.com") -> str:
    """Return ``url`` with its network location replaced by ``new_domain``.

    Args:
        url: The URL to rewrite.
        new_domain: Host to substitute; defaults to ``x.com``.

    Returns:
        The rewritten URL; all other components are unchanged.
    """
    parts = urlparse(url)
    swapped = parts._replace(netloc=new_domain)
    return str(urlunparse(swapped))
21
+
22
+
23
def check_x_url(url: str) -> None:
    """Validate that ``url`` is hosted on one of ``TWITTER_DOMAINS``.

    Args:
        url: The URL to check.

    Raises:
        ValueError: If the URL's network location is not in
            ``TWITTER_DOMAINS``.
    """
    host = urlparse(url).netloc
    if host in TWITTER_DOMAINS:
        return
    raise ValueError(f"URL is not a Twitter URL: {url}")
26
+
27
+
28
class TwitterLoader(Loader):
    """Loader for Twitter/X links, rendered through ``PlaywrightLoader``.

    Accepted URLs are rewritten to the ``x.com`` host before loading.
    """

    def __init__(self, timeout: float = 30_000) -> None:
        """Create the underlying Playwright loader.

        Args:
            timeout: Forwarded to ``PlaywrightLoader`` as ``timeout``.
        """
        self.playwright_loader = PlaywrightLoader(timeout=timeout, wait_until="networkidle")

    def load(self, url: str) -> str:
        """Validate ``url``, rewrite its host and load it synchronously.

        Raises:
            ValueError: If ``url`` is not on a known Twitter domain.
        """
        check_x_url(url)
        canonical = replace_domain(url)
        return self.playwright_loader.load(canonical)

    async def async_load(self, url: str):
        """Validate ``url``, rewrite its host and load it asynchronously.

        Raises:
            ValueError: If ``url`` is not on a known Twitter domain.
        """
        check_x_url(url)
        canonical = replace_domain(url)
        return await self.playwright_loader.async_load(canonical)
kabigon/utils.py ADDED
@@ -0,0 +1,36 @@
1
+ from pathlib import Path
2
+
3
+ import charset_normalizer
4
+ from markdownify import markdownify
5
+
6
+
7
def normalize_whitespace(text: str) -> str:
    """Strip every line of ``text`` and drop the lines that end up empty.

    Args:
        text: Text to clean up.

    Returns:
        The surviving lines joined with single newlines.
    """
    trimmed = (raw.strip() for raw in text.splitlines())
    return "\n".join(piece for piece in trimmed if piece)
14
+
15
+
16
def html_to_markdown(content: str | bytes) -> str:
    """Convert HTML content to markdown format.

    Bytes are decoded with the encoding detected by ``charset_normalizer``.
    When no encoding can be detected, the bytes are decoded as UTF-8 with
    undecodable bytes replaced, rather than being turned into the literal
    text ``"None"``.

    Args:
        content: HTML content as string or bytes

    Returns:
        Converted markdown text with normalized whitespace
    """
    if isinstance(content, bytes):
        detected = charset_normalizer.from_bytes(content).best()
        if detected is None:
            content = content.decode("utf-8", errors="replace")
        else:
            content = str(detected)

    # Anchor and image tags are stripped; their text content is kept.
    md = markdownify(content, strip=["a", "img"])
    return normalize_whitespace(md)
30
+
31
+
32
def read_html_content(f: str | Path) -> str:
    """Read an HTML file and convert it to markdown.

    The file is decoded with the encoding detected by ``charset_normalizer``.
    When no encoding can be detected, the file is read as UTF-8 with
    undecodable bytes replaced, rather than being turned into the literal
    text ``"None"``.

    Args:
        f: Path of the HTML file to read.

    Returns:
        Converted markdown text with normalized whitespace.
    """
    detected = charset_normalizer.from_path(f).best()
    if detected is None:
        content = Path(f).read_text(encoding="utf-8", errors="replace")
    else:
        content = str(detected)

    md = markdownify(content, strip=["a", "img"])
    return normalize_whitespace(md)
kabigon/youtube.py ADDED
@@ -0,0 +1,33 @@
1
+ import aioytt
2
+ import aioytt.video_id
3
+ from youtube_transcript_api import YouTubeTranscriptApi
4
+
5
+ from .loader import Loader
6
+
7
+ DEFAULT_LANGUAGES = ["zh-TW", "zh-Hant", "zh", "zh-Hans", "ja", "en", "ko"]
8
+
9
+
10
class YoutubeLoader(Loader):
    """Loader that returns the transcript text of a YouTube video."""

    def __init__(self, languages: list[str] | None = None) -> None:
        """Store the transcript language preferences.

        Args:
            languages: Language codes passed to the transcript fetch;
                ``DEFAULT_LANGUAGES`` is used when none are given.
        """
        self.languages = languages if languages else DEFAULT_LANGUAGES

    def load(self, url: str) -> str:
        """Fetch the transcript for ``url`` and return it as plain text.

        Args:
            url: Address of the YouTube video.

        Returns:
            The non-empty, stripped snippet texts joined with newlines.
        """
        video_id = aioytt.video_id.parse_video_id(url)
        transcript = YouTubeTranscriptApi().fetch(video_id, self.languages)

        cleaned = (str(snippet.text).strip() for snippet in transcript.snippets)
        return "\n".join(text for text in cleaned if text)
25
+
26
+ # async def async_load(self, url: str) -> str:
27
+ # transcript = await aioytt.get_transcript_from_url(url)
28
+ # lines = []
29
+ # for piece in transcript:
30
+ # text = piece.text.strip()
31
+ # if text:
32
+ # lines.append(text)
33
+ # return "\n".join(lines)
kabigon/youtube_ytdlp.py ADDED
@@ -0,0 +1,26 @@
1
+ from urllib.parse import urlparse
2
+
3
+ from aioytt.video_id import ALLOWED_NETLOCS
4
+ from aioytt.video_id import ALLOWED_SCHEMES
5
+
6
+ from .loader import Loader
7
+ from .ytdlp import YtdlpLoader
8
+
9
+
10
def check_youtube_url(url: str) -> None:
    """Validate the scheme and host of a YouTube URL.

    The scheme is checked first, then the network location.

    Args:
        url: The URL to check.

    Raises:
        ValueError: If the scheme is not in ``ALLOWED_SCHEMES`` or the
            network location is not in ``ALLOWED_NETLOCS``.
    """
    parts = urlparse(url)

    scheme = parts.scheme
    if scheme not in ALLOWED_SCHEMES:
        raise ValueError(f"URL scheme is not allowed: {scheme}")

    host = parts.netloc
    if host not in ALLOWED_NETLOCS:
        raise ValueError(f"URL domain is not allowed: {host}")
18
+
19
+
20
class YoutubeYtdlpLoader(Loader):
    """Loader that validates a YouTube URL and delegates to ``YtdlpLoader``."""

    def __init__(self) -> None:
        """Create the ``YtdlpLoader`` that does the actual work."""
        self.ytdlp_loader = YtdlpLoader()

    def load(self, url: str) -> str:
        """Validate ``url`` and return the ``YtdlpLoader`` result for it.

        Raises:
            ValueError: If the URL's scheme or host is not allowed.
        """
        check_youtube_url(url)
        text = self.ytdlp_loader.load(url)
        return text
kabigon/ytdlp.py ADDED
@@ -0,0 +1,61 @@
1
+ import os
2
+ import uuid
3
+ from pathlib import Path
4
+
5
+ import yt_dlp
6
+ from loguru import logger
7
+
8
+ from .loader import Loader
9
+
10
+
11
def download_audio(url: str, outtmpl: str | None = None) -> None:
    """Download the audio track of ``url`` with yt-dlp, extracted as MP3.

    Args:
        url: Address of the media to download.
        outtmpl: Value for yt-dlp's ``outtmpl`` option; the option is left
            unset when ``None``.
    """
    mp3_extractor = {
        "key": "FFmpegExtractAudio",
        "preferredcodec": "mp3",
        "preferredquality": "192",
    }
    ydl_opts = {
        "format": "bestaudio/best",
        "postprocessors": [mp3_extractor],
        "match_filter": yt_dlp.match_filter_func(["!is_live"]),
    }

    if outtmpl is not None:
        ydl_opts["outtmpl"] = outtmpl

    # An ffmpeg location can be supplied through the FFMPEG_PATH variable.
    if (ffmpeg_path := os.getenv("FFMPEG_PATH")) is not None:
        ydl_opts["ffmpeg_location"] = ffmpeg_path

    logger.info("Downloading audio from URL: {} with options: {}", url, ydl_opts)
    with yt_dlp.YoutubeDL(ydl_opts) as downloader:
        downloader.download([url])
34
+
35
+
36
class YtdlpLoader(Loader):
    """Loader that downloads a URL's audio and transcribes it with Whisper."""

    def __init__(self, model: str = "tiny") -> None:
        """Load the Whisper model.

        Args:
            model: Name passed to ``whisper.load_model``.

        Raises:
            ImportError: If the ``whisper`` package cannot be imported.
        """
        try:
            import whisper
        except ImportError as e:
            raise ImportError(
                "OpenAI Whisper not installed. Please install it with `pip install openai-whisper`."
            ) from e

        self.model = whisper.load_model(model)
        self.load_audio = whisper.load_audio

    def load(self, url: str) -> str:
        """Download the audio of ``url`` and return its transcription.

        The audio is written to a randomly named ``.mp3`` file in the current
        working directory and removed afterwards, whether or not the download
        or the transcription succeeded.

        Args:
            url: Address of the media to transcribe.

        Returns:
            The ``"text"`` entry of the transcription result, or ``""`` when
            that entry is absent.
        """
        outtmpl = uuid.uuid4().hex[:20]
        audio_path = Path(outtmpl).with_suffix(".mp3")
        path = str(audio_path)

        try:
            # The download is inside the ``try`` so a failed or partial
            # download is cleaned up as well.
            download_audio(url, outtmpl=outtmpl)
            audio = self.load_audio(path)
            logger.info("Transcribing audio file: {}", path)
            result = self.model.transcribe(audio)
        finally:
            # Clean up the audio file. A missing file must not raise here,
            # otherwise it would hide the error that caused it to be missing.
            audio_path.unlink(missing_ok=True)

        return result.get("text", "")
kabigon-0.10.0.dist-info/METADATA ADDED
@@ -0,0 +1,54 @@
1
+ Metadata-Version: 2.4
2
+ Name: kabigon
3
+ Version: 0.10.0
4
+ Author-email: narumi <toucans-cutouts0f@icloud.com>
5
+ License-File: LICENSE
6
+ Requires-Python: >=3.10
7
+ Requires-Dist: aioytt>=0.2.4
8
+ Requires-Dist: firecrawl-py>=2.4.1
9
+ Requires-Dist: httpx>=0.28.1
10
+ Requires-Dist: loguru>=0.7.3
11
+ Requires-Dist: markdownify>=0.14.1
12
+ Requires-Dist: openai-whisper>=20250625
13
+ Requires-Dist: playwright>=1.52.0
14
+ Requires-Dist: pypdf>=5.3.0
15
+ Requires-Dist: rich>=13.9.4
16
+ Requires-Dist: typer>=0.15.3
17
+ Requires-Dist: youtube-transcript-api>=1.2.2
18
+ Requires-Dist: yt-dlp>=2025.4.30
19
+ Description-Content-Type: text/markdown
20
+
21
+ # kabigon
22
+
23
+ ## Installation
24
+
25
+ ```shell
26
+ pip install kabigon
27
+ playwright install chromium
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ```shell
33
+ kabigon <url>
34
+ ```
35
+
36
+ or
37
+
38
+ ```python
39
+ import kabigon
40
+
41
+ url = "https://www.google.com.tw"
42
+
43
+ content = kabigon.Compose(
44
+ [
45
+ kabigon.YoutubeLoader(),
46
+ kabigon.ReelLoader(),
47
+ kabigon.YtdlpLoader(),
48
+ kabigon.PDFLoader(),
49
+ # kabigon.HttpxLoader(),
50
+ kabigon.PlaywrightLoader(),
51
+ ]
52
+ ).load(url)
53
+ print(content)
54
+ ```
kabigon-0.10.0.dist-info/RECORD ADDED
@@ -0,0 +1,22 @@
1
+ kabigon/__init__.py,sha256=FXimW6HEBYs0Q_2IGgNmnZcHCmP0sECnDxmoUMguZRg,609
2
+ kabigon/cli.py,sha256=nzztow9R1k4w5J9CADmtVhmGf-LdWWwEZdTpU7H6J4g,592
3
+ kabigon/compose.py,sha256=rSiP0ed0vb9dJ08erZGyiunOcd0ALHAfBRGlxYg1Tvs,1457
4
+ kabigon/firecrawl.py,sha256=Yd9isfGFo8vzp6k2KJ2V0jhazaZ_7ekssdMMj2Wpeng,819
5
+ kabigon/httpx.py,sha256=Zup9DURyWLqoWzaxBbCYAaV-5LSlHUuAcNyyUsZTVag,696
6
+ kabigon/loader.py,sha256=W2ZdSINQKjG-5XNYweM8lIl19JA4wwd7AE2pueHeO70,380
7
+ kabigon/pdf.py,sha256=a6UV4SRuWQzBbpwh0hgDe9OemK5mZ9_dYRXRDHwpg50,1946
8
+ kabigon/playwright.py,sha256=gECaDJhsIrY53lwG7OrH8gUHvIsVe7p5jgBTUy7KPJg,2413
9
+ kabigon/ptt.py,sha256=Gyp2nJrjptkjbwZJ9VEQHX0DEgKBe5QRQOmGVHUUgNA,896
10
+ kabigon/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ kabigon/reddit.py,sha256=bb8c7I_INO4R2WsLSPyP1b31CcMN7Mpj334FCWLXrBA,3039
12
+ kabigon/reel.py,sha256=qOwWCvcp7xNKg0JDunq_Bsl8yqqMzrnAOI9k5mSqrOU,874
13
+ kabigon/twitter.py,sha256=p-8MEaHOSZwKOgHRc03uJ_5sh8FuG0jcGf7eUTYJ1B8,1093
14
+ kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
15
+ kabigon/youtube.py,sha256=Vh4cBidScUruTgvm4BbTxqaEDxnotMI3uM3FuFlx5J0,1021
16
+ kabigon/youtube_ytdlp.py,sha256=Y6h55wYF-5PdxLRxsznFtJDypCedD8g-hY9fgCnfvbg,722
17
+ kabigon/ytdlp.py,sha256=4AjPY0W8IxbEhCNVXpi_KbBQLGrWn5ThrchhffcJIIU,1718
18
+ kabigon-0.10.0.dist-info/METADATA,sha256=B2rf3UC9cXdInFkDH2Cel4PkHjy1f5s_BtWS3VE_pjw,1040
19
+ kabigon-0.10.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
20
+ kabigon-0.10.0.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
21
+ kabigon-0.10.0.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
22
+ kabigon-0.10.0.dist-info/RECORD,,
kabigon-0.10.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.28.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
kabigon-0.10.0.dist-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ kabigon = kabigon.cli:main
kabigon-0.10.0.dist-info/licenses/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 なるみ
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.