kabigon 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kabigon/__init__.py +0 -1
- kabigon/cli.py +2 -2
- kabigon/httpx.py +0 -2
- kabigon/pdf.py +28 -21
- kabigon/playwright.py +7 -1
- kabigon/reel.py +0 -3
- {kabigon-0.4.2.dist-info → kabigon-0.5.1.dist-info}/METADATA +4 -4
- kabigon-0.5.1.dist-info/RECORD +18 -0
- kabigon/cloudscraper.py +0 -14
- kabigon-0.4.2.dist-info/RECORD +0 -19
- {kabigon-0.4.2.dist-info → kabigon-0.5.1.dist-info}/WHEEL +0 -0
- {kabigon-0.4.2.dist-info → kabigon-0.5.1.dist-info}/entry_points.txt +0 -0
- {kabigon-0.4.2.dist-info → kabigon-0.5.1.dist-info}/licenses/LICENSE +0 -0
kabigon/__init__.py
CHANGED
kabigon/cli.py
CHANGED
@@ -4,8 +4,8 @@ from rich import print
|
|
4
4
|
from .compose import Compose
|
5
5
|
from .httpx import HttpxLoader
|
6
6
|
from .pdf import PDFLoader
|
7
|
+
from .playwright import PlaywrightLoader
|
7
8
|
from .reel import ReelLoader
|
8
|
-
from .singlefile import SinglefileLoader
|
9
9
|
from .youtube import YoutubeLoader
|
10
10
|
from .ytdlp import YtdlpLoader
|
11
11
|
|
@@ -20,7 +20,7 @@ def main(url: str) -> None:
|
|
20
20
|
YtdlpLoader(),
|
21
21
|
PDFLoader(),
|
22
22
|
HttpxLoader(),
|
23
|
-
|
23
|
+
PlaywrightLoader(),
|
24
24
|
]
|
25
25
|
)
|
26
26
|
result = loader.load(url)
|
kabigon/httpx.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
import httpx
|
2
|
-
import timeout_decorator
|
3
2
|
|
4
3
|
from .loader import Loader
|
5
4
|
from .utils import html_to_markdown
|
@@ -12,7 +11,6 @@ DEFAULT_HEADERS = {
|
|
12
11
|
|
13
12
|
|
14
13
|
class HttpxLoader(Loader):
|
15
|
-
@timeout_decorator.timeout(10)
|
16
14
|
def load(self, url: str) -> str:
|
17
15
|
response = httpx.get(url, headers=DEFAULT_HEADERS, follow_redirects=True)
|
18
16
|
response.raise_for_status()
|
kabigon/pdf.py
CHANGED
@@ -1,8 +1,9 @@
|
|
1
|
-
import tempfile
|
1
|
+
import io
|
2
2
|
from pathlib import Path
|
3
|
+
from typing import IO
|
4
|
+
from typing import Any
|
3
5
|
|
4
6
|
import httpx
|
5
|
-
import timeout_decorator
|
6
7
|
from pypdf import PdfReader
|
7
8
|
|
8
9
|
from .loader import Loader
|
@@ -15,38 +16,44 @@ DEFAULT_HEADERS = {
|
|
15
16
|
|
16
17
|
|
17
18
|
class NotPDFError(LoaderError):
|
18
|
-
|
19
|
+
def __init__(self, url: str) -> None:
|
20
|
+
super().__init__(f"URL is not a PDF: {url}")
|
19
21
|
|
20
22
|
|
21
23
|
class PDFLoader(Loader):
|
22
|
-
@timeout_decorator.timeout(10)
|
23
24
|
def load(self, url_or_file: str) -> str:
|
24
|
-
if url_or_file.startswith("http"):
|
25
|
-
|
26
|
-
return read_pdf_content(url_or_file)
|
25
|
+
if not url_or_file.startswith("http"):
|
26
|
+
return read_pdf_content(url_or_file)
|
27
27
|
|
28
|
+
resp = httpx.get(url_or_file, headers=DEFAULT_HEADERS, follow_redirects=True)
|
29
|
+
resp.raise_for_status()
|
28
30
|
|
29
|
-
|
30
|
-
|
31
|
-
response.raise_for_status()
|
31
|
+
if resp.headers.get("content-type") != "application/pdf":
|
32
|
+
raise NotPDFError(url_or_file)
|
32
33
|
|
33
|
-
|
34
|
-
if not is_pdf:
|
35
|
-
raise NotPDFError(f"URL is not a PDF: {url}")
|
34
|
+
return read_pdf_content(io.BytesIO(resp.content))
|
36
35
|
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
return fp.name
|
36
|
+
async def async_load(self, url_or_file: str) -> str:
|
37
|
+
if not url_or_file.startswith("http"):
|
38
|
+
return read_pdf_content(url_or_file)
|
41
39
|
|
40
|
+
async with httpx.AsyncClient() as client:
|
41
|
+
resp = await client.get(url_or_file, headers=DEFAULT_HEADERS, follow_redirects=True)
|
42
|
+
resp.raise_for_status()
|
42
43
|
|
43
|
-
|
44
|
+
if resp.headers.get("content-type") != "application/pdf":
|
45
|
+
raise NotPDFError(url_or_file)
|
46
|
+
|
47
|
+
return read_pdf_content(io.BytesIO(resp.content))
|
48
|
+
|
49
|
+
|
50
|
+
def read_pdf_content(f: str | Path | IO[Any]) -> str:
|
44
51
|
lines = []
|
45
52
|
with PdfReader(f) as reader:
|
46
53
|
for page in reader.pages:
|
47
54
|
text = page.extract_text(extraction_mode="plain")
|
48
55
|
for line in text.splitlines():
|
49
|
-
|
50
|
-
|
51
|
-
|
56
|
+
stripped = line.strip()
|
57
|
+
if stripped:
|
58
|
+
lines.append(stripped)
|
52
59
|
return "\n".join(lines)
|
kabigon/playwright.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
from typing import Literal
|
2
2
|
|
3
|
+
from loguru import logger
|
4
|
+
from playwright.async_api import TimeoutError
|
3
5
|
from playwright.async_api import async_playwright
|
4
6
|
from playwright.sync_api import sync_playwright
|
5
7
|
|
@@ -35,7 +37,11 @@ class PlaywrightLoader(Loader):
|
|
35
37
|
browser = await p.chromium.launch(headless=self.browser_headless)
|
36
38
|
page = await browser.new_page()
|
37
39
|
|
38
|
-
|
40
|
+
try:
|
41
|
+
await page.goto(url, timeout=self.timeout, wait_until=self.wait_until)
|
42
|
+
except TimeoutError as e:
|
43
|
+
logger.error("Unable to load url: {}, got error: {}", url, e)
|
44
|
+
await page.goto(url)
|
39
45
|
|
40
46
|
content = await page.content()
|
41
47
|
await browser.close()
|
kabigon/reel.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
import timeout_decorator
|
2
|
-
|
3
1
|
from .httpx import HttpxLoader
|
4
2
|
from .loader import Loader
|
5
3
|
from .loader import LoaderError
|
@@ -20,7 +18,6 @@ class ReelLoader(Loader):
|
|
20
18
|
self.httpx_loader = HttpxLoader()
|
21
19
|
self.ytdlp_loader = YtdlpLoader()
|
22
20
|
|
23
|
-
@timeout_decorator.timeout(300)
|
24
21
|
def load(self, url: str) -> str:
|
25
22
|
if not is_reel_url(url):
|
26
23
|
raise NotReelURLError(url)
|
{kabigon-0.4.2.dist-info → kabigon-0.5.1.dist-info}/METADATA
CHANGED
@@ -1,12 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kabigon
|
3
|
-
Version: 0.4.2
|
3
|
+
Version: 0.5.1
|
4
4
|
Author-email: narumi <toucans-cutouts0f@icloud.com>
|
5
5
|
License-File: LICENSE
|
6
6
|
Requires-Python: >=3.10
|
7
7
|
Requires-Dist: aioytt>=0.2.4
|
8
8
|
Requires-Dist: click>=8.1.8
|
9
|
-
Requires-Dist: cloudscraper>=1.2.71
|
10
9
|
Requires-Dist: httpx>=0.28.1
|
11
10
|
Requires-Dist: loguru>=0.7.3
|
12
11
|
Requires-Dist: markdownify>=0.14.1
|
@@ -25,6 +24,7 @@ Description-Content-Type: text/markdown
|
|
25
24
|
|
26
25
|
```shell
|
27
26
|
pip install kabigon
|
27
|
+
playwright install chromium
|
28
28
|
```
|
29
29
|
|
30
30
|
## Usage
|
@@ -46,8 +46,8 @@ content = kabigon.Compose(
|
|
46
46
|
kabigon.ReelLoader(),
|
47
47
|
kabigon.YtdlpLoader(),
|
48
48
|
kabigon.PDFLoader(),
|
49
|
-
kabigon.HttpxLoader(),
|
50
|
-
kabigon.
|
49
|
+
# kabigon.HttpxLoader(),
|
50
|
+
kabigon.PlaywrightLoader(),
|
51
51
|
]
|
52
52
|
).load(url)
|
53
53
|
print(content)
|
kabigon-0.5.1.dist-info/RECORD
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
kabigon/__init__.py,sha256=7ll3ePlHNbZq-CmrGMrQouLCUSmuRsZ9yAj2JOzr7HY,500
|
2
|
+
kabigon/cli.py,sha256=XSTyD1RFqq2Qok_52kSjJlBLUXl6t-K9QtsxCfB15o4,611
|
3
|
+
kabigon/compose.py,sha256=DO0hOJgEAX7ZLOS53dcE6V9zi7Tr9oGNW8koPHsx9eM,2110
|
4
|
+
kabigon/httpx.py,sha256=uDdLks6zVzirY7-mnsJkypX86kAI5XmUVfK-lFifdJA,895
|
5
|
+
kabigon/loader.py,sha256=D5xUPJb3uAygmBaN_sX56ZpGcGsVz-ueHOXC7gSGaxM,493
|
6
|
+
kabigon/pdf.py,sha256=Q9XuBdKDrDQJ8BNvY7Lgt6dpGeA_ylGGHWOE3euiI_8,1904
|
7
|
+
kabigon/playwright.py,sha256=R-NCW9sJs3O-asKk_iSxBI8pg15XqgQ-hEXMTr1F2q0,1602
|
8
|
+
kabigon/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
9
|
+
kabigon/reel.py,sha256=TP_oKYXABXYja2A9damTBWR3MVYA7aZyxbIvCuTcq40,1062
|
10
|
+
kabigon/singlefile.py,sha256=CeTT2WPYm0vb1xWPNdyBN4uHRw9hRqfZm68D-nEcUA8,1800
|
11
|
+
kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
|
12
|
+
kabigon/youtube.py,sha256=HoiFNq0ookPL7_rO_wloBaY8yTIX6xP8A77F7y02q64,1166
|
13
|
+
kabigon/ytdlp.py,sha256=kG1fXqU650otOWespjOSkGK_-jk1wO-sWiR60_UPJxY,3125
|
14
|
+
kabigon-0.5.1.dist-info/METADATA,sha256=3YUnj7TzdYOAfMV4X9vHwO1HMQrM8iIEDyTDqMdnDFM,1043
|
15
|
+
kabigon-0.5.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
16
|
+
kabigon-0.5.1.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
|
17
|
+
kabigon-0.5.1.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
|
18
|
+
kabigon-0.5.1.dist-info/RECORD,,
|
kabigon/cloudscraper.py
DELETED
@@ -1,14 +0,0 @@
|
|
1
|
-
import cloudscraper
|
2
|
-
import timeout_decorator
|
3
|
-
|
4
|
-
from .loader import Loader
|
5
|
-
from .utils import html_to_markdown
|
6
|
-
|
7
|
-
|
8
|
-
class CloudscraperLoader(Loader):
|
9
|
-
@timeout_decorator.timeout(10)
|
10
|
-
def load(self, url: str) -> str:
|
11
|
-
client = cloudscraper.create_scraper()
|
12
|
-
response = client.get(url, allow_redirects=True)
|
13
|
-
response.raise_for_status()
|
14
|
-
return html_to_markdown(response.text)
|
kabigon-0.4.2.dist-info/RECORD
DELETED
@@ -1,19 +0,0 @@
|
|
1
|
-
kabigon/__init__.py,sha256=9RgyhhwjqrW2iQy9RBN2j7VZNhwA9xGo_atC7FKnZA4,545
|
2
|
-
kabigon/cli.py,sha256=z3u2Msvi1SWf1fd9nCTzJULeO-rRb5oDKJfPxhUeYQ0,611
|
3
|
-
kabigon/cloudscraper.py,sha256=0jzrXVXSZopExyxrDRbcI_2wsbHAg_dqOk4D3Re0jvk,404
|
4
|
-
kabigon/compose.py,sha256=DO0hOJgEAX7ZLOS53dcE6V9zi7Tr9oGNW8koPHsx9eM,2110
|
5
|
-
kabigon/httpx.py,sha256=B8_26rufJMbKSXINBEqyCIpaRueO_3Gk_PtEQmlOxQ4,955
|
6
|
-
kabigon/loader.py,sha256=D5xUPJb3uAygmBaN_sX56ZpGcGsVz-ueHOXC7gSGaxM,493
|
7
|
-
kabigon/pdf.py,sha256=oM5pwZJ2GCcHyQXg98-Mda-MHxarYVZQge30KdS_aHY,1549
|
8
|
-
kabigon/playwright.py,sha256=ciNUlpMbwd47utCLT454wFSirXFmt3eCXN2Q-nAsiu8,1356
|
9
|
-
kabigon/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
10
|
-
kabigon/reel.py,sha256=1JTcn7qVH7FcD0Oj-Rz-pnjI-xS1UtkoJcuClGb8ExQ,1124
|
11
|
-
kabigon/singlefile.py,sha256=CeTT2WPYm0vb1xWPNdyBN4uHRw9hRqfZm68D-nEcUA8,1800
|
12
|
-
kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
|
13
|
-
kabigon/youtube.py,sha256=HoiFNq0ookPL7_rO_wloBaY8yTIX6xP8A77F7y02q64,1166
|
14
|
-
kabigon/ytdlp.py,sha256=kG1fXqU650otOWespjOSkGK_-jk1wO-sWiR60_UPJxY,3125
|
15
|
-
kabigon-0.4.2.dist-info/METADATA,sha256=JHbf13Nnhr05WfSS1hijT-YkeCewuWr5kYbzxjlJ-M8,1049
|
16
|
-
kabigon-0.4.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
17
|
-
kabigon-0.4.2.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
|
18
|
-
kabigon-0.4.2.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
|
19
|
-
kabigon-0.4.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|