kabigon 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kabigon/cli.py +0 -2
- kabigon/cloudscraper.py +1 -1
- kabigon/compose.py +22 -3
- kabigon/httpx.py +7 -1
- kabigon/loader.py +10 -0
- kabigon/pdf.py +1 -1
- kabigon/playwright.py +14 -9
- kabigon/reel.py +9 -0
- kabigon/singlefile.py +0 -2
- {kabigon-0.3.0.dist-info → kabigon-0.4.0.dist-info}/METADATA +33 -2
- kabigon-0.4.0.dist-info/RECORD +19 -0
- kabigon-0.3.0.dist-info/RECORD +0 -19
- {kabigon-0.3.0.dist-info → kabigon-0.4.0.dist-info}/WHEEL +0 -0
- {kabigon-0.3.0.dist-info → kabigon-0.4.0.dist-info}/entry_points.txt +0 -0
- {kabigon-0.3.0.dist-info → kabigon-0.4.0.dist-info}/licenses/LICENSE +0 -0
kabigon/cli.py
CHANGED
@@ -1,7 +1,6 @@
|
|
1
1
|
import click
|
2
2
|
from rich import print
|
3
3
|
|
4
|
-
from .cloudscraper import CloudscraperLoader
|
5
4
|
from .compose import Compose
|
6
5
|
from .httpx import HttpxLoader
|
7
6
|
from .pdf import PDFLoader
|
@@ -20,7 +19,6 @@ def main(url: str) -> None:
|
|
20
19
|
ReelLoader(),
|
21
20
|
YtdlpLoader(),
|
22
21
|
PDFLoader(),
|
23
|
-
CloudscraperLoader(),
|
24
22
|
HttpxLoader(),
|
25
23
|
SinglefileLoader(),
|
26
24
|
]
|
kabigon/cloudscraper.py
CHANGED
@@ -6,7 +6,7 @@ from .utils import html_to_markdown
|
|
6
6
|
|
7
7
|
|
8
8
|
class CloudscraperLoader(Loader):
|
9
|
-
@timeout_decorator.timeout(
|
9
|
+
@timeout_decorator.timeout(10)
|
10
10
|
def load(self, url: str) -> str:
|
11
11
|
client = cloudscraper.create_scraper()
|
12
12
|
response = client.get(url, allow_redirects=True)
|
kabigon/compose.py
CHANGED
@@ -37,14 +37,33 @@ class Compose(Loader):
|
|
37
37
|
|
38
38
|
for loader in self.loaders:
|
39
39
|
try:
|
40
|
-
|
40
|
+
content = loader.load(url)
|
41
41
|
|
42
|
-
if not
|
42
|
+
if not content:
|
43
43
|
logger.info("[{}] Failed to load URL: {}, got empty result", loader.__class__.__name__, url)
|
44
44
|
continue
|
45
45
|
|
46
46
|
logger.info("[{}] Successfully loaded URL: {}", loader.__class__.__name__, url)
|
47
|
-
return
|
47
|
+
return content
|
48
|
+
|
49
|
+
except Exception as e:
|
50
|
+
logger.info("[{}] Failed to load URL: {}, got error: {}", loader.__class__.__name__, url, e)
|
51
|
+
|
52
|
+
raise LoaderError(f"Failed to load URL: {url}")
|
53
|
+
|
54
|
+
async def async_load(self, url: str) -> str:
|
55
|
+
url = replace_domain(url)
|
56
|
+
|
57
|
+
for loader in self.loaders:
|
58
|
+
try:
|
59
|
+
content = await loader.async_load(url)
|
60
|
+
|
61
|
+
if not content:
|
62
|
+
logger.info("[{}] Failed to load URL: {}, got empty result", loader.__class__.__name__, url)
|
63
|
+
continue
|
64
|
+
|
65
|
+
logger.info("[{}] Successfully loaded URL: {}", loader.__class__.__name__, url)
|
66
|
+
return content
|
48
67
|
|
49
68
|
except Exception as e:
|
50
69
|
logger.info("[{}] Failed to load URL: {}, got error: {}", loader.__class__.__name__, url, e)
|
kabigon/httpx.py
CHANGED
@@ -12,8 +12,14 @@ DEFAULT_HEADERS = {
|
|
12
12
|
|
13
13
|
|
14
14
|
class HttpxLoader(Loader):
|
15
|
-
@timeout_decorator.timeout(
|
15
|
+
@timeout_decorator.timeout(10)
|
16
16
|
def load(self, url: str) -> str:
|
17
17
|
response = httpx.get(url, headers=DEFAULT_HEADERS, follow_redirects=True)
|
18
18
|
response.raise_for_status()
|
19
19
|
return html_to_markdown(response.content)
|
20
|
+
|
21
|
+
async def async_load(self, url: str) -> str:
|
22
|
+
async with httpx.AsyncClient() as client:
|
23
|
+
response = await client.get(url, headers=DEFAULT_HEADERS, follow_redirects=True)
|
24
|
+
response.raise_for_status()
|
25
|
+
return html_to_markdown(response.content)
|
kabigon/loader.py
CHANGED
@@ -1,3 +1,7 @@
|
|
1
|
+
import asyncio
|
2
|
+
import concurrent.futures
|
3
|
+
|
4
|
+
|
1
5
|
class Loader:
|
2
6
|
def __call__(self, url: str) -> str:
|
3
7
|
return self.load(url)
|
@@ -5,6 +9,12 @@ class Loader:
|
|
5
9
|
def load(self, url: str) -> str:
|
6
10
|
raise NotImplementedError
|
7
11
|
|
12
|
+
async def async_load(self, url: str):
|
13
|
+
loop = asyncio.get_running_loop()
|
14
|
+
with concurrent.futures.ProcessPoolExecutor() as executor:
|
15
|
+
result = await loop.run_in_executor(executor, self.load, url)
|
16
|
+
return result
|
17
|
+
|
8
18
|
|
9
19
|
class LoaderError(Exception):
|
10
20
|
pass
|
kabigon/pdf.py
CHANGED
@@ -19,7 +19,7 @@ class NotPDFError(LoaderError):
|
|
19
19
|
|
20
20
|
|
21
21
|
class PDFLoader(Loader):
|
22
|
-
@timeout_decorator.timeout(
|
22
|
+
@timeout_decorator.timeout(10)
|
23
23
|
def load(self, url_or_file: str) -> str:
|
24
24
|
if url_or_file.startswith("http"):
|
25
25
|
url_or_file = download_pdf_from_url(url_or_file)
|
kabigon/playwright.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1
1
|
from typing import Literal
|
2
2
|
|
3
|
-
import
|
4
|
-
from loguru import logger
|
5
|
-
from playwright.sync_api import TimeoutError
|
3
|
+
from playwright.async_api import async_playwright
|
6
4
|
from playwright.sync_api import sync_playwright
|
7
5
|
|
8
6
|
from .loader import Loader
|
@@ -20,19 +18,26 @@ class PlaywrightLoader(Loader):
|
|
20
18
|
self.wait_until = wait_until
|
21
19
|
self.browser_headless = browser_headless
|
22
20
|
|
23
|
-
@timeout_decorator.timeout(5)
|
24
21
|
def load(self, url: str) -> str:
|
25
22
|
with sync_playwright() as p:
|
26
23
|
browser = p.chromium.launch(headless=self.browser_headless)
|
27
24
|
page = browser.new_page()
|
28
25
|
|
29
|
-
|
30
|
-
page.goto(url, timeout=self.timeout, wait_until=self.wait_until)
|
31
|
-
except TimeoutError as e:
|
32
|
-
logger.error("TimeoutError: {}", e)
|
33
|
-
page.goto(url)
|
26
|
+
page.goto(url, timeout=self.timeout, wait_until=self.wait_until)
|
34
27
|
|
35
28
|
content = page.content()
|
36
29
|
browser.close()
|
37
30
|
|
38
31
|
return html_to_markdown(content)
|
32
|
+
|
33
|
+
async def async_load(self, url: str) -> str:
|
34
|
+
async with async_playwright() as p:
|
35
|
+
browser = await p.chromium.launch(headless=self.browser_headless)
|
36
|
+
page = await browser.new_page()
|
37
|
+
|
38
|
+
await page.goto(url, timeout=self.timeout, wait_until=self.wait_until)
|
39
|
+
|
40
|
+
content = await page.content()
|
41
|
+
await browser.close()
|
42
|
+
|
43
|
+
return html_to_markdown(content)
|
kabigon/reel.py
CHANGED
@@ -29,3 +29,12 @@ class ReelLoader(Loader):
|
|
29
29
|
html_content = self.httpx_loader.load(url)
|
30
30
|
|
31
31
|
return f"{audio_content}\n\n{html_content}"
|
32
|
+
|
33
|
+
async def async_load(self, url: str):
|
34
|
+
if not is_reel_url(url):
|
35
|
+
raise NotReelURLError(url)
|
36
|
+
|
37
|
+
audio_content = await self.ytdlp_loader.async_load(url)
|
38
|
+
html_content = await self.httpx_loader.async_load(url)
|
39
|
+
|
40
|
+
return f"{audio_content}\n\n{html_content}"
|
kabigon/singlefile.py
CHANGED
@@ -6,7 +6,6 @@ from pathlib import Path
|
|
6
6
|
from typing import Final
|
7
7
|
|
8
8
|
import charset_normalizer
|
9
|
-
import timeout_decorator
|
10
9
|
from loguru import logger
|
11
10
|
|
12
11
|
from .loader import Loader
|
@@ -29,7 +28,6 @@ class SinglefileLoader(Loader):
|
|
29
28
|
self.cookies_file = cookies_file
|
30
29
|
self.browser_headless = browser_headless
|
31
30
|
|
32
|
-
@timeout_decorator.timeout(20)
|
33
31
|
def load(self, url: str) -> str:
|
34
32
|
filename = self.download(url)
|
35
33
|
content = str(charset_normalizer.from_path(filename).best())
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kabigon
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.4.0
|
4
4
|
Author-email: narumi <toucans-cutouts0f@icloud.com>
|
5
5
|
License-File: LICENSE
|
6
6
|
Requires-Python: >=3.10
|
@@ -9,7 +9,6 @@ Requires-Dist: cloudscraper>=1.2.71
|
|
9
9
|
Requires-Dist: httpx>=0.28.1
|
10
10
|
Requires-Dist: loguru>=0.7.3
|
11
11
|
Requires-Dist: markdownify>=0.14.1
|
12
|
-
Requires-Dist: numpy>=2.1.3
|
13
12
|
Requires-Dist: openai-whisper>=20240930
|
14
13
|
Requires-Dist: playwright>=1.50.0
|
15
14
|
Requires-Dist: pypdf>=5.3.0
|
@@ -20,3 +19,35 @@ Requires-Dist: yt-dlp>=2025.1.26
|
|
20
19
|
Description-Content-Type: text/markdown
|
21
20
|
|
22
21
|
# kabigon
|
22
|
+
|
23
|
+
## Installation
|
24
|
+
|
25
|
+
```shell
|
26
|
+
pip install kabigon
|
27
|
+
```
|
28
|
+
|
29
|
+
## Usage
|
30
|
+
|
31
|
+
```shell
|
32
|
+
kabigon <url>
|
33
|
+
```
|
34
|
+
|
35
|
+
or
|
36
|
+
|
37
|
+
```python
|
38
|
+
import kabigon
|
39
|
+
|
40
|
+
url = "https://www.google.com.tw"
|
41
|
+
|
42
|
+
content = kabigon.Compose(
|
43
|
+
[
|
44
|
+
kabigon.YoutubeLoader(),
|
45
|
+
kabigon.ReelLoader(),
|
46
|
+
kabigon.YtdlpLoader(),
|
47
|
+
kabigon.PDFLoader(),
|
48
|
+
kabigon.HttpxLoader(),
|
49
|
+
kabigon.SinglefileLoader(),
|
50
|
+
]
|
51
|
+
).load(url)
|
52
|
+
print(content)
|
53
|
+
```
|
@@ -0,0 +1,19 @@
|
|
1
|
+
kabigon/__init__.py,sha256=9RgyhhwjqrW2iQy9RBN2j7VZNhwA9xGo_atC7FKnZA4,545
|
2
|
+
kabigon/cli.py,sha256=z3u2Msvi1SWf1fd9nCTzJULeO-rRb5oDKJfPxhUeYQ0,611
|
3
|
+
kabigon/cloudscraper.py,sha256=0jzrXVXSZopExyxrDRbcI_2wsbHAg_dqOk4D3Re0jvk,404
|
4
|
+
kabigon/compose.py,sha256=DO0hOJgEAX7ZLOS53dcE6V9zi7Tr9oGNW8koPHsx9eM,2110
|
5
|
+
kabigon/httpx.py,sha256=B8_26rufJMbKSXINBEqyCIpaRueO_3Gk_PtEQmlOxQ4,955
|
6
|
+
kabigon/loader.py,sha256=D5xUPJb3uAygmBaN_sX56ZpGcGsVz-ueHOXC7gSGaxM,493
|
7
|
+
kabigon/pdf.py,sha256=oM5pwZJ2GCcHyQXg98-Mda-MHxarYVZQge30KdS_aHY,1549
|
8
|
+
kabigon/playwright.py,sha256=ciNUlpMbwd47utCLT454wFSirXFmt3eCXN2Q-nAsiu8,1356
|
9
|
+
kabigon/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
10
|
+
kabigon/reel.py,sha256=1JTcn7qVH7FcD0Oj-Rz-pnjI-xS1UtkoJcuClGb8ExQ,1124
|
11
|
+
kabigon/singlefile.py,sha256=CeTT2WPYm0vb1xWPNdyBN4uHRw9hRqfZm68D-nEcUA8,1800
|
12
|
+
kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
|
13
|
+
kabigon/youtube.py,sha256=_wdKvRRAMrYnv3rUhkd_6JuOGCuQClYpj1UlVeYeojc,2615
|
14
|
+
kabigon/ytdlp.py,sha256=kG1fXqU650otOWespjOSkGK_-jk1wO-sWiR60_UPJxY,3125
|
15
|
+
kabigon-0.4.0.dist-info/METADATA,sha256=NRqd2kpi19xYz7TJx5TBXDm-uJcVcpu17P0sw3N-SiA,1020
|
16
|
+
kabigon-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
17
|
+
kabigon-0.4.0.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
|
18
|
+
kabigon-0.4.0.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
|
19
|
+
kabigon-0.4.0.dist-info/RECORD,,
|
kabigon-0.3.0.dist-info/RECORD
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
kabigon/__init__.py,sha256=9RgyhhwjqrW2iQy9RBN2j7VZNhwA9xGo_atC7FKnZA4,545
|
2
|
-
kabigon/cli.py,sha256=7qHklIhYUZ4E78C9MIB16AoVIYt2xLJS0Pz8sr51YPk,690
|
3
|
-
kabigon/cloudscraper.py,sha256=viaIWATsS8nD9HN0RfBiveUHuL012OjuaKlKwLEteGw,403
|
4
|
-
kabigon/compose.py,sha256=IqNm-Cxl6e8u7X9v5SoG7cjxOWCMIcSZG1lDVNNAfo8,1433
|
5
|
-
kabigon/httpx.py,sha256=SfhaJXNKlFOwWs_Eeadiegi5wNvZV0RX7lqQwR8nYGo,667
|
6
|
-
kabigon/loader.py,sha256=cV9ZqcWaNtS2WTDpgyNYK2kX5Cu1ZC-Sq-qS3PpPnJQ,198
|
7
|
-
kabigon/pdf.py,sha256=9Oi_ZP7D0LLWs2D8KO3omRld1nYyhEbQMForZUw9YZg,1548
|
8
|
-
kabigon/playwright.py,sha256=0CkSDY90i7PHKxrJ6Zwad_NDCy5TPyCjeFvLrQuZXFU,1154
|
9
|
-
kabigon/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
10
|
-
kabigon/reel.py,sha256=dkWXG2nBhIt0DpGJzevkIrRKLqJh_03-yrg_rjf6vnY,828
|
11
|
-
kabigon/singlefile.py,sha256=2nTCTFgW5Gp3l0ExaVh2foUDVSgLaAssDp3tBoQ1MhY,1860
|
12
|
-
kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
|
13
|
-
kabigon/youtube.py,sha256=_wdKvRRAMrYnv3rUhkd_6JuOGCuQClYpj1UlVeYeojc,2615
|
14
|
-
kabigon/ytdlp.py,sha256=kG1fXqU650otOWespjOSkGK_-jk1wO-sWiR60_UPJxY,3125
|
15
|
-
kabigon-0.3.0.dist-info/METADATA,sha256=w5bE9Wv-p4REXAC_7hPNIVilsxHK7tIZzg2EFz-vh_I,633
|
16
|
-
kabigon-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
17
|
-
kabigon-0.3.0.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
|
18
|
-
kabigon-0.3.0.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
|
19
|
-
kabigon-0.3.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|