kabigon 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
kabigon/__init__.py CHANGED
@@ -4,7 +4,6 @@ from typing import Final
4
4
 
5
5
  from loguru import logger
6
6
 
7
- from .cloudscraper import CloudscraperLoader
8
7
  from .compose import Compose
9
8
  from .httpx import HttpxLoader
10
9
  from .loader import Loader
kabigon/httpx.py CHANGED
@@ -1,5 +1,4 @@
1
1
  import httpx
2
- import timeout_decorator
3
2
 
4
3
  from .loader import Loader
5
4
  from .utils import html_to_markdown
@@ -12,7 +11,6 @@ DEFAULT_HEADERS = {
12
11
 
13
12
 
14
13
  class HttpxLoader(Loader):
15
- @timeout_decorator.timeout(10)
16
14
  def load(self, url: str) -> str:
17
15
  response = httpx.get(url, headers=DEFAULT_HEADERS, follow_redirects=True)
18
16
  response.raise_for_status()
kabigon/pdf.py CHANGED
@@ -1,8 +1,9 @@
1
- import tempfile
1
+ import io
2
2
  from pathlib import Path
3
+ from typing import IO
4
+ from typing import Any
3
5
 
4
6
  import httpx
5
- import timeout_decorator
6
7
  from pypdf import PdfReader
7
8
 
8
9
  from .loader import Loader
@@ -15,38 +16,44 @@ DEFAULT_HEADERS = {
15
16
 
16
17
 
17
18
  class NotPDFError(LoaderError):
18
- pass
19
+ def __init__(self, url: str) -> None:
20
+ super().__init__(f"URL is not a PDF: {url}")
19
21
 
20
22
 
21
23
  class PDFLoader(Loader):
22
- @timeout_decorator.timeout(10)
23
24
  def load(self, url_or_file: str) -> str:
24
- if url_or_file.startswith("http"):
25
- url_or_file = download_pdf_from_url(url_or_file)
26
- return read_pdf_content(url_or_file)
25
+ if not url_or_file.startswith("http"):
26
+ return read_pdf_content(url_or_file)
27
27
 
28
+ resp = httpx.get(url_or_file, headers=DEFAULT_HEADERS, follow_redirects=True)
29
+ resp.raise_for_status()
28
30
 
29
- def download_pdf_from_url(url: str) -> str:
30
- response = httpx.get(url=url, headers=DEFAULT_HEADERS, follow_redirects=True)
31
- response.raise_for_status()
31
+ if resp.headers.get("content-type") != "application/pdf":
32
+ raise NotPDFError(url_or_file)
32
33
 
33
- is_pdf = response.headers.get("content-type") == "application/pdf"
34
- if not is_pdf:
35
- raise NotPDFError(f"URL is not a PDF: {url}")
34
+ return read_pdf_content(io.BytesIO(resp.content))
36
35
 
37
- suffix = ".pdf" if is_pdf else None
38
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as fp:
39
- fp.write(response.content)
40
- return fp.name
36
+ async def async_load(self, url_or_file: str) -> str:
37
+ if not url_or_file.startswith("http"):
38
+ return read_pdf_content(url_or_file)
41
39
 
40
+ async with httpx.AsyncClient() as client:
41
+ resp = await client.get(url_or_file, headers=DEFAULT_HEADERS, follow_redirects=True)
42
+ resp.raise_for_status()
42
43
 
43
- def read_pdf_content(f: str | Path) -> str:
44
+ if resp.headers.get("content-type") != "application/pdf":
45
+ raise NotPDFError(url_or_file)
46
+
47
+ return read_pdf_content(io.BytesIO(resp.content))
48
+
49
+
50
+ def read_pdf_content(f: str | Path | IO[Any]) -> str:
44
51
  lines = []
45
52
  with PdfReader(f) as reader:
46
53
  for page in reader.pages:
47
54
  text = page.extract_text(extraction_mode="plain")
48
55
  for line in text.splitlines():
49
- if not line.strip():
50
- continue
51
- lines.append(line.strip())
56
+ stripped = line.strip()
57
+ if stripped:
58
+ lines.append(stripped)
52
59
  return "\n".join(lines)
kabigon/reel.py CHANGED
@@ -1,5 +1,3 @@
1
- import timeout_decorator
2
-
3
1
  from .httpx import HttpxLoader
4
2
  from .loader import Loader
5
3
  from .loader import LoaderError
@@ -20,7 +18,6 @@ class ReelLoader(Loader):
20
18
  self.httpx_loader = HttpxLoader()
21
19
  self.ytdlp_loader = YtdlpLoader()
22
20
 
23
- @timeout_decorator.timeout(300)
24
21
  def load(self, url: str) -> str:
25
22
  if not is_reel_url(url):
26
23
  raise NotReelURLError(url)
kabigon/youtube.py CHANGED
@@ -1,76 +1,11 @@
1
- from urllib.parse import parse_qs
2
- from urllib.parse import urlparse
3
-
4
1
  import aioytt
2
+ import aioytt.video_id
5
3
  import timeout_decorator
6
4
  from youtube_transcript_api import YouTubeTranscriptApi
7
5
 
8
6
  from .loader import Loader
9
- from .loader import LoaderError
10
7
 
11
8
  DEFAULT_LANGUAGES = ["zh-TW", "zh-Hant", "zh", "zh-Hans", "ja", "en", "ko"]
12
- ALLOWED_SCHEMES = {
13
- "http",
14
- "https",
15
- }
16
- ALLOWED_NETLOCS = {
17
- "youtu.be",
18
- "m.youtube.com",
19
- "youtube.com",
20
- "www.youtube.com",
21
- "www.youtube-nocookie.com",
22
- "vid.plus",
23
- }
24
-
25
-
26
- class UnsupportedURLSchemeError(LoaderError):
27
- def __init__(self, scheme: str) -> None:
28
- super().__init__(f"unsupported URL scheme: {scheme}")
29
-
30
-
31
- class UnsupportedURLNetlocError(LoaderError):
32
- def __init__(self, netloc: str) -> None:
33
- super().__init__(f"unsupported URL netloc: {netloc}")
34
-
35
-
36
- class VideoIDError(LoaderError):
37
- def __init__(self, video_id: str) -> None:
38
- super().__init__(f"invalid video ID: {video_id}")
39
-
40
-
41
- class NoVideoIDFoundError(LoaderError):
42
- def __init__(self, url: str) -> None:
43
- super().__init__(f"no video found in URL: {url}")
44
-
45
-
46
- def parse_video_id(url: str) -> str:
47
- """Parse a YouTube URL and return the video ID if valid, otherwise None."""
48
- parsed_url = urlparse(url)
49
-
50
- if parsed_url.scheme not in ALLOWED_SCHEMES:
51
- raise UnsupportedURLSchemeError(parsed_url.scheme)
52
-
53
- if parsed_url.netloc not in ALLOWED_NETLOCS:
54
- raise UnsupportedURLNetlocError(parsed_url.netloc)
55
-
56
- path = parsed_url.path
57
-
58
- if path.endswith("/watch"):
59
- query = parsed_url.query
60
- parsed_query = parse_qs(query)
61
- if "v" in parsed_query:
62
- ids = parsed_query["v"]
63
- video_id = ids if isinstance(ids, str) else ids[0]
64
- else:
65
- raise NoVideoIDFoundError(url)
66
- else:
67
- path = parsed_url.path.lstrip("/")
68
- video_id = path.split("/")[-1]
69
-
70
- if len(video_id) != 11: # Video IDs are 11 characters long
71
- raise VideoIDError(video_id)
72
-
73
- return video_id
74
9
 
75
10
 
76
11
  class YoutubeLoader(Loader):
@@ -79,7 +14,7 @@ class YoutubeLoader(Loader):
79
14
 
80
15
  @timeout_decorator.timeout(20)
81
16
  def load(self, url: str) -> str:
82
- video_id = parse_video_id(url)
17
+ video_id = aioytt.video_id.parse_video_id(url)
83
18
 
84
19
  transcript_pieces: list[dict[str, str | float]] = YouTubeTranscriptApi().get_transcript(
85
20
  video_id, self.languages
@@ -98,5 +33,5 @@ class YoutubeLoader(Loader):
98
33
  for piece in transcript:
99
34
  text = piece.text.strip()
100
35
  if text:
101
- lines += text
36
+ lines.append(text)
102
37
  return "\n".join(lines)
@@ -1,12 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kabigon
3
- Version: 0.4.1
3
+ Version: 0.5.0
4
4
  Author-email: narumi <toucans-cutouts0f@icloud.com>
5
5
  License-File: LICENSE
6
6
  Requires-Python: >=3.10
7
7
  Requires-Dist: aioytt>=0.2.4
8
8
  Requires-Dist: click>=8.1.8
9
- Requires-Dist: cloudscraper>=1.2.71
10
9
  Requires-Dist: httpx>=0.28.1
11
10
  Requires-Dist: loguru>=0.7.3
12
11
  Requires-Dist: markdownify>=0.14.1
@@ -0,0 +1,18 @@
1
+ kabigon/__init__.py,sha256=7ll3ePlHNbZq-CmrGMrQouLCUSmuRsZ9yAj2JOzr7HY,500
2
+ kabigon/cli.py,sha256=z3u2Msvi1SWf1fd9nCTzJULeO-rRb5oDKJfPxhUeYQ0,611
3
+ kabigon/compose.py,sha256=DO0hOJgEAX7ZLOS53dcE6V9zi7Tr9oGNW8koPHsx9eM,2110
4
+ kabigon/httpx.py,sha256=uDdLks6zVzirY7-mnsJkypX86kAI5XmUVfK-lFifdJA,895
5
+ kabigon/loader.py,sha256=D5xUPJb3uAygmBaN_sX56ZpGcGsVz-ueHOXC7gSGaxM,493
6
+ kabigon/pdf.py,sha256=Q9XuBdKDrDQJ8BNvY7Lgt6dpGeA_ylGGHWOE3euiI_8,1904
7
+ kabigon/playwright.py,sha256=ciNUlpMbwd47utCLT454wFSirXFmt3eCXN2Q-nAsiu8,1356
8
+ kabigon/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
9
+ kabigon/reel.py,sha256=TP_oKYXABXYja2A9damTBWR3MVYA7aZyxbIvCuTcq40,1062
10
+ kabigon/singlefile.py,sha256=CeTT2WPYm0vb1xWPNdyBN4uHRw9hRqfZm68D-nEcUA8,1800
11
+ kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
12
+ kabigon/youtube.py,sha256=HoiFNq0ookPL7_rO_wloBaY8yTIX6xP8A77F7y02q64,1166
13
+ kabigon/ytdlp.py,sha256=kG1fXqU650otOWespjOSkGK_-jk1wO-sWiR60_UPJxY,3125
14
+ kabigon-0.5.0.dist-info/METADATA,sha256=TVL8rTtEdjtIkjl7L9pC7vrv3-VOx6sYBafeXVUlSN8,1013
15
+ kabigon-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
16
+ kabigon-0.5.0.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
17
+ kabigon-0.5.0.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
18
+ kabigon-0.5.0.dist-info/RECORD,,
kabigon/cloudscraper.py DELETED
@@ -1,14 +0,0 @@
1
- import cloudscraper
2
- import timeout_decorator
3
-
4
- from .loader import Loader
5
- from .utils import html_to_markdown
6
-
7
-
8
- class CloudscraperLoader(Loader):
9
- @timeout_decorator.timeout(10)
10
- def load(self, url: str) -> str:
11
- client = cloudscraper.create_scraper()
12
- response = client.get(url, allow_redirects=True)
13
- response.raise_for_status()
14
- return html_to_markdown(response.text)
@@ -1,19 +0,0 @@
1
- kabigon/__init__.py,sha256=9RgyhhwjqrW2iQy9RBN2j7VZNhwA9xGo_atC7FKnZA4,545
2
- kabigon/cli.py,sha256=z3u2Msvi1SWf1fd9nCTzJULeO-rRb5oDKJfPxhUeYQ0,611
3
- kabigon/cloudscraper.py,sha256=0jzrXVXSZopExyxrDRbcI_2wsbHAg_dqOk4D3Re0jvk,404
4
- kabigon/compose.py,sha256=DO0hOJgEAX7ZLOS53dcE6V9zi7Tr9oGNW8koPHsx9eM,2110
5
- kabigon/httpx.py,sha256=B8_26rufJMbKSXINBEqyCIpaRueO_3Gk_PtEQmlOxQ4,955
6
- kabigon/loader.py,sha256=D5xUPJb3uAygmBaN_sX56ZpGcGsVz-ueHOXC7gSGaxM,493
7
- kabigon/pdf.py,sha256=oM5pwZJ2GCcHyQXg98-Mda-MHxarYVZQge30KdS_aHY,1549
8
- kabigon/playwright.py,sha256=ciNUlpMbwd47utCLT454wFSirXFmt3eCXN2Q-nAsiu8,1356
9
- kabigon/py.typed,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
10
- kabigon/reel.py,sha256=1JTcn7qVH7FcD0Oj-Rz-pnjI-xS1UtkoJcuClGb8ExQ,1124
11
- kabigon/singlefile.py,sha256=CeTT2WPYm0vb1xWPNdyBN4uHRw9hRqfZm68D-nEcUA8,1800
12
- kabigon/utils.py,sha256=eNTLtHLSB2erDac2HH3jWemgfr8Ou_ozwVb8h9BD-4g,922
13
- kabigon/youtube.py,sha256=M1v7t4VS72ItqaNJTwlv59bafqORqldGjNzptZOeybA,2915
14
- kabigon/ytdlp.py,sha256=kG1fXqU650otOWespjOSkGK_-jk1wO-sWiR60_UPJxY,3125
15
- kabigon-0.4.1.dist-info/METADATA,sha256=IfnrNBC17ac0E4aI9Y-VOWhaiOnMN_RZxhoS-_EvhzA,1049
16
- kabigon-0.4.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
17
- kabigon-0.4.1.dist-info/entry_points.txt,sha256=O3FYAO9w-NQvlGMJrBvtrnGHSK2QkUnQBTa30YXRbVE,45
18
- kabigon-0.4.1.dist-info/licenses/LICENSE,sha256=H2T3_RTgmcngMeC7p_SXT3GwBLkd2DaNgAZuxulcfiA,1066
19
- kabigon-0.4.1.dist-info/RECORD,,