kabigon 0.5.3__py3-none-any.whl → 0.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kabigon/__init__.py +1 -1
- kabigon/compose.py +6 -5
- kabigon/errors.py +23 -0
- kabigon/firecrawl.py +33 -0
- kabigon/loader.py +0 -4
- kabigon/pdf.py +1 -6
- kabigon/reel.py +1 -6
- kabigon/ytdlp.py +1 -1
- {kabigon-0.5.3.dist-info → kabigon-0.6.1.dist-info}/METADATA +2 -1
- kabigon-0.6.1.dist-info/RECORD +19 -0
- kabigon/singlefile.py +0 -65
- kabigon-0.5.3.dist-info/RECORD +0 -18
- {kabigon-0.5.3.dist-info → kabigon-0.6.1.dist-info}/WHEEL +0 -0
- {kabigon-0.5.3.dist-info → kabigon-0.6.1.dist-info}/entry_points.txt +0 -0
- {kabigon-0.5.3.dist-info → kabigon-0.6.1.dist-info}/licenses/LICENSE +0 -0
kabigon/__init__.py
CHANGED
@@ -5,12 +5,12 @@ from typing import Final
|
|
5
5
|
from loguru import logger
|
6
6
|
|
7
7
|
from .compose import Compose
|
8
|
+
from .firecrawl import FirecrawlLoader
|
8
9
|
from .httpx import HttpxLoader
|
9
10
|
from .loader import Loader
|
10
11
|
from .pdf import PDFLoader
|
11
12
|
from .playwright import PlaywrightLoader
|
12
13
|
from .reel import ReelLoader
|
13
|
-
from .singlefile import SinglefileLoader
|
14
14
|
from .youtube import YoutubeLoader
|
15
15
|
from .ytdlp import YtdlpLoader
|
16
16
|
|
kabigon/compose.py
CHANGED
@@ -3,18 +3,19 @@ from urllib.parse import urlunparse
|
|
3
3
|
|
4
4
|
from loguru import logger
|
5
5
|
|
6
|
+
from .errors import KabigonError
|
6
7
|
from .loader import Loader
|
7
|
-
from .loader import LoaderError
|
8
8
|
|
9
9
|
REPLACEMENTS = {
|
10
|
-
|
10
|
+
# fixupx.com seems better than api.fxtwitter.com
|
11
|
+
"fixupx.com": [
|
11
12
|
"twitter.com",
|
12
13
|
"x.com",
|
13
14
|
"fxtwitter.com",
|
14
15
|
"vxtwitter.com",
|
15
16
|
"fixvx.com",
|
16
17
|
"twittpr.com",
|
17
|
-
"fixupx.com",
|
18
|
+
"api.fxtwitter.com",
|
18
19
|
]
|
19
20
|
}
|
20
21
|
|
@@ -49,7 +50,7 @@ class Compose(Loader):
|
|
49
50
|
except Exception as e:
|
50
51
|
logger.info("[{}] Failed to load URL: {}, got error: {}", loader.__class__.__name__, url, e)
|
51
52
|
|
52
|
-
raise LoaderError(f"Failed to load URL: {url}")
|
53
|
+
raise KabigonError(f"Failed to load URL: {url}")
|
53
54
|
|
54
55
|
async def async_load(self, url: str) -> str:
|
55
56
|
url = replace_domain(url)
|
@@ -68,4 +69,4 @@ class Compose(Loader):
|
|
68
69
|
except Exception as e:
|
69
70
|
logger.info("[{}] Failed to load URL: {}, got error: {}", loader.__class__.__name__, url, e)
|
70
71
|
|
71
|
-
raise LoaderError(f"Failed to load URL: {url}")
|
72
|
+
raise KabigonError(f"Failed to load URL: {url}")
|
kabigon/errors.py
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
class KabigonError(Exception):
|
2
|
+
pass
|
3
|
+
|
4
|
+
|
5
|
+
class FirecrawlKeyError(KabigonError):
|
6
|
+
def __init__(self) -> None:
|
7
|
+
super().__init__("FIRECRAWL_API_KEY is not set.")
|
8
|
+
|
9
|
+
|
10
|
+
class FirecrawlError(KabigonError):
|
11
|
+
def __init__(self, url: str, error: str) -> None:
|
12
|
+
msg = f"Failed to load URL: {url}, got: {error}"
|
13
|
+
super().__init__(msg)
|
14
|
+
|
15
|
+
|
16
|
+
class NotPDFError(KabigonError):
|
17
|
+
def __init__(self, url: str) -> None:
|
18
|
+
super().__init__(f"URL is not a PDF: {url}")
|
19
|
+
|
20
|
+
|
21
|
+
class NotReelURLError(KabigonError):
|
22
|
+
def __init__(self, url: str):
|
23
|
+
super().__init__(f"URL is not an Instagram Reel: {url}")
|
kabigon/firecrawl.py
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
import os
|
2
|
+
|
3
|
+
from firecrawl import FirecrawlApp
|
4
|
+
|
5
|
+
from .errors import FirecrawlError
|
6
|
+
from .errors import FirecrawlKeyError
|
7
|
+
from .loader import Loader
|
8
|
+
|
9
|
+
|
10
|
+
class FirecrawlLoader(Loader):
|
11
|
+
def __init__(self, timeout: int | None = None) -> None:
|
12
|
+
self.timeout = timeout
|
13
|
+
|
14
|
+
api_key = os.getenv("FIRECRAWL_API_KEY")
|
15
|
+
if not api_key:
|
16
|
+
raise FirecrawlKeyError()
|
17
|
+
|
18
|
+
self.app = FirecrawlApp(api_key=api_key)
|
19
|
+
|
20
|
+
def load(self, url: str) -> str:
|
21
|
+
result = self.app.scrape_url(
|
22
|
+
url,
|
23
|
+
formats=["markdown"],
|
24
|
+
timeout=self.timeout,
|
25
|
+
)
|
26
|
+
|
27
|
+
if not result.success:
|
28
|
+
raise FirecrawlError(url, result.error)
|
29
|
+
|
30
|
+
return result.markdown
|
31
|
+
|
32
|
+
async def async_load(self, url: str) -> str:
|
33
|
+
return self.load(url)
|
kabigon/loader.py
CHANGED
kabigon/pdf.py
CHANGED
@@ -6,8 +6,8 @@ from typing import Any
|
|
6
6
|
import httpx
|
7
7
|
from pypdf import PdfReader
|
8
8
|
|
9
|
+
from .errors import NotPDFError
|
9
10
|
from .loader import Loader
|
10
|
-
from .loader import LoaderError
|
11
11
|
|
12
12
|
DEFAULT_HEADERS = {
|
13
13
|
"Accept-Language": "zh-TW,zh;q=0.9,ja;q=0.8,en-US;q=0.7,en;q=0.6",
|
@@ -15,11 +15,6 @@ DEFAULT_HEADERS = {
|
|
15
15
|
}
|
16
16
|
|
17
17
|
|
18
|
-
class NotPDFError(LoaderError):
|
19
|
-
def __init__(self, url: str) -> None:
|
20
|
-
super().__init__(f"URL is not a PDF: {url}")
|
21
|
-
|
22
|
-
|
23
18
|
class PDFLoader(Loader):
|
24
19
|
def load(self, url_or_file: str) -> str:
|
25
20
|
if not url_or_file.startswith("http"):
|
kabigon/reel.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
|
+
from .errors import NotReelURLError
|
1
2
|
from .httpx import HttpxLoader
|
2
3
|
from .loader import Loader
|
3
|
-
from .loader import LoaderError
|
4
4
|
from .ytdlp import YtdlpLoader
|
5
5
|
|
6
6
|
|
@@ -8,11 +8,6 @@ def is_reel_url(url: str) -> bool:
|
|
8
8
|
return url.startswith("https://www.instagram.com/reel")
|
9
9
|
|
10
10
|
|
11
|
-
class NotReelURLError(LoaderError):
|
12
|
-
def __init__(self, url: str):
|
13
|
-
super().__init__(f"URL is not an Instagram Reel: {url}")
|
14
|
-
|
15
|
-
|
16
11
|
class ReelLoader(Loader):
|
17
12
|
def __init__(self) -> None:
|
18
13
|
self.httpx_loader = HttpxLoader()
|
kabigon/ytdlp.py
CHANGED
{kabigon-0.5.3.dist-info → kabigon-0.6.1.dist-info}/METADATA
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kabigon
|
3
|
-
Version: 0.5.3
|
3
|
+
Version: 0.6.1
|
4
4
|
Author-email: narumi <toucans-cutouts0f@icloud.com>
|
5
5
|
License-File: LICENSE
|
6
6
|
Requires-Python: >=3.10
|
7
7
|
Requires-Dist: aioytt>=0.2.4
|
8
8
|
Requires-Dist: click>=8.1.8
|
9
|
+
Requires-Dist: firecrawl-py>=2.4.1
|
9
10
|
Requires-Dist: httpx>=0.28.1
|
10
11
|
Requires-Dist: loguru>=0.7.3
|
11
12
|
Requires-Dist: markdownify>=0.14.1
|
kabigon-0.6.1.dist-info/RECORD
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
kabigon/__init__.py,sha256=_1LdKfp7qm0m7Fa_IJ9zcg4TRB14c9bB-r9M3fwASWI,498
|
2
|
+
kabigon/cli.py,sha256=XSTyD1RFqq2Qok_52kSjJlBLUXl6t-K9QtsxCfB15o4,611
|
3
|
+
kabigon/compose.py,sha256=5H_hWQ7ENUmEdeOpB2bkRv4U_U18_1IyNTran-cLYaM,2166
|
4
|
+
kabigon/errors.py,sha256=3eSPQtLocreKuq9mhwRk7IMdA2xh8KkmIFEUhGmTPxg,618
|
5
|
+
kabigon/firecrawl.py,sha256=Xnrlhco_R58x5kwGy9ZCKTnVqS4Pp-D3G0u-qnuGEsU,800
|
6
|
+
kabigon/httpx.py,sha256=uDdLks6zVzirY7-mnsJkypX86kAI5XmUVfK-lFifdJA,895
|
7
|
+
kabigon/loader.py,sha256=KhOJvlzLDM0o05o4VqMRgiyRVjofkzuGEcihQagj-8M,452
|
8
|
+
kabigon/pdf.py,sha256=yJcgkdMMF52baFsFxJp9Jn89KsTKBboLTLwD3gs6U5U,1775
|
9
|
+
kabigon/playwright.py,sha256=MZ-r0Ej2wWAOJkDLwYRvO77wcDvh38KXz2wgDsCTgm0,1358
|
10
|
+
kabigon/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
11
|
+
kabigon/reel.py,sha256=J2QOxGMYi_HaEscQPIipPEoHGN_iksGxR6pV_XvryME,929
|
12
|
+
kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
|
13
|
+
kabigon/youtube.py,sha256=HoiFNq0ookPL7_rO_wloBaY8yTIX6xP8A77F7y02q64,1166
|
14
|
+
kabigon/ytdlp.py,sha256=_QRcyFx9s7NnI1MvcWdKKxlX-hHLnqtduCSL5_UH6dU,3140
|
15
|
+
kabigon-0.6.1.dist-info/METADATA,sha256=78J35ClbTdy2-vU8GtlQWjgSTitntFI8J-Cz07FGqoo,1078
|
16
|
+
kabigon-0.6.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
17
|
+
kabigon-0.6.1.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
|
18
|
+
kabigon-0.6.1.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
|
19
|
+
kabigon-0.6.1.dist-info/RECORD,,
|
kabigon/singlefile.py
DELETED
@@ -1,65 +0,0 @@
|
|
1
|
-
import os
|
2
|
-
import subprocess
|
3
|
-
import tempfile
|
4
|
-
from functools import cache
|
5
|
-
from pathlib import Path
|
6
|
-
from typing import Final
|
7
|
-
|
8
|
-
import charset_normalizer
|
9
|
-
from loguru import logger
|
10
|
-
|
11
|
-
from .loader import Loader
|
12
|
-
from .utils import html_to_markdown
|
13
|
-
|
14
|
-
DEFAULT_SINGLEFILE_PATH: Final[str] = "single-file"
|
15
|
-
|
16
|
-
|
17
|
-
@cache
|
18
|
-
def get_singlefile_path() -> str:
|
19
|
-
path = os.getenv("SINGLEFILE_PATH")
|
20
|
-
if not path:
|
21
|
-
path = DEFAULT_SINGLEFILE_PATH
|
22
|
-
logger.warning("SINGLEFILE_PATH not set, using default: {}", DEFAULT_SINGLEFILE_PATH)
|
23
|
-
return path
|
24
|
-
|
25
|
-
|
26
|
-
class SinglefileLoader(Loader):
|
27
|
-
def __init__(self, cookies_file: str | None = None, browser_headless: bool = False) -> None:
|
28
|
-
self.cookies_file = cookies_file
|
29
|
-
self.browser_headless = browser_headless
|
30
|
-
|
31
|
-
def load(self, url: str) -> str:
|
32
|
-
filename = self.download(url)
|
33
|
-
content = str(charset_normalizer.from_path(filename).best())
|
34
|
-
return html_to_markdown(content)
|
35
|
-
|
36
|
-
def download(self, url: str) -> str:
|
37
|
-
logger.info("Downloading HTML using SingleFile: {}", url)
|
38
|
-
|
39
|
-
filename = tempfile.mktemp(suffix=".html")
|
40
|
-
singlefile_path = get_singlefile_path()
|
41
|
-
|
42
|
-
cmds = [singlefile_path]
|
43
|
-
|
44
|
-
if self.cookies_file is not None:
|
45
|
-
cookies_path = Path(self.cookies_file)
|
46
|
-
if not cookies_path.exists():
|
47
|
-
raise FileNotFoundError(f"Cookies file not found: {self.cookies_file}")
|
48
|
-
|
49
|
-
cmds += [
|
50
|
-
"--browser-cookies-file",
|
51
|
-
str(cookies_path),
|
52
|
-
]
|
53
|
-
|
54
|
-
cmds += [
|
55
|
-
"--filename-conflict-action",
|
56
|
-
"overwrite",
|
57
|
-
"--browser-headless",
|
58
|
-
str(self.browser_headless).lower(),
|
59
|
-
url,
|
60
|
-
filename,
|
61
|
-
]
|
62
|
-
|
63
|
-
subprocess.run(cmds)
|
64
|
-
|
65
|
-
return filename
|
kabigon-0.5.3.dist-info/RECORD
DELETED
@@ -1,18 +0,0 @@
|
|
1
|
-
kabigon/__init__.py,sha256=7ll3ePlHNbZq-CmrGMrQouLCUSmuRsZ9yAj2JOzr7HY,500
|
2
|
-
kabigon/cli.py,sha256=XSTyD1RFqq2Qok_52kSjJlBLUXl6t-K9QtsxCfB15o4,611
|
3
|
-
kabigon/compose.py,sha256=DO0hOJgEAX7ZLOS53dcE6V9zi7Tr9oGNW8koPHsx9eM,2110
|
4
|
-
kabigon/httpx.py,sha256=uDdLks6zVzirY7-mnsJkypX86kAI5XmUVfK-lFifdJA,895
|
5
|
-
kabigon/loader.py,sha256=D5xUPJb3uAygmBaN_sX56ZpGcGsVz-ueHOXC7gSGaxM,493
|
6
|
-
kabigon/pdf.py,sha256=Q9XuBdKDrDQJ8BNvY7Lgt6dpGeA_ylGGHWOE3euiI_8,1904
|
7
|
-
kabigon/playwright.py,sha256=MZ-r0Ej2wWAOJkDLwYRvO77wcDvh38KXz2wgDsCTgm0,1358
|
8
|
-
kabigon/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
9
|
-
kabigon/reel.py,sha256=TP_oKYXABXYja2A9damTBWR3MVYA7aZyxbIvCuTcq40,1062
|
10
|
-
kabigon/singlefile.py,sha256=CeTT2WPYm0vb1xWPNdyBN4uHRw9hRqfZm68D-nEcUA8,1800
|
11
|
-
kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
|
12
|
-
kabigon/youtube.py,sha256=HoiFNq0ookPL7_rO_wloBaY8yTIX6xP8A77F7y02q64,1166
|
13
|
-
kabigon/ytdlp.py,sha256=kG1fXqU650otOWespjOSkGK_-jk1wO-sWiR60_UPJxY,3125
|
14
|
-
kabigon-0.5.3.dist-info/METADATA,sha256=gzcVy_2l4kAo_heevYpN-NsHY9f7RMFlFGk6rkSCVQU,1043
|
15
|
-
kabigon-0.5.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
-
kabigon-0.5.3.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
|
17
|
-
kabigon-0.5.3.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
|
18
|
-
kabigon-0.5.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|