parsehub 2.0.25__tar.gz → 2.0.27__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {parsehub-2.0.25/src/parsehub.egg-info → parsehub-2.0.27}/PKG-INFO +2 -6
  2. {parsehub-2.0.25 → parsehub-2.0.27}/README.md +1 -5
  3. {parsehub-2.0.25 → parsehub-2.0.27}/pyproject.toml +1 -1
  4. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/weibo.py +13 -2
  5. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/coolapk.py +2 -2
  6. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/twitter.py +1 -11
  7. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/weibo.py +87 -34
  8. {parsehub-2.0.25 → parsehub-2.0.27/src/parsehub.egg-info}/PKG-INFO +2 -6
  9. {parsehub-2.0.25 → parsehub-2.0.27}/test/test_core_offline.py +2 -1
  10. {parsehub-2.0.25 → parsehub-2.0.27}/LICENSE +0 -0
  11. {parsehub-2.0.25 → parsehub-2.0.27}/setup.cfg +0 -0
  12. {parsehub-2.0.25 → parsehub-2.0.27}/src/__init__.py +0 -0
  13. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/__init__.py +0 -0
  14. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/cli.py +0 -0
  15. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/cli_config.py +0 -0
  16. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/config/__init__.py +0 -0
  17. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/config/config.py +0 -0
  18. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/errors.py +0 -0
  19. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/__init__.py +0 -0
  20. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/base/__init__.py +0 -0
  21. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/base/base.py +0 -0
  22. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/base/ytdlp.py +0 -0
  23. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/__init__.py +0 -0
  24. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/bilibili.py +0 -0
  25. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/coolapk.py +0 -0
  26. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/douyin.py +0 -0
  27. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/facebook.py +0 -0
  28. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/instagram.py +0 -0
  29. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/kuaishou.py +0 -0
  30. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/pipix.py +0 -0
  31. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/threads.py +0 -0
  32. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/tieba.py +0 -0
  33. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/tiktok.py +0 -0
  34. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/twitter.py +0 -0
  35. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/weixin.py +0 -0
  36. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/xhs.py +0 -0
  37. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/xiaoheihe.py +0 -0
  38. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/youtube.py +0 -0
  39. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/parsers/parser/zuiyou.py +0 -0
  40. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/__init__.py +0 -0
  41. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/bilibili.py +0 -0
  42. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/douyin.py +0 -0
  43. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/instagram.py +0 -0
  44. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/kuaishou.py +0 -0
  45. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/pipix.py +0 -0
  46. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/threads.py +0 -0
  47. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/tieba.py +0 -0
  48. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/tiktok.py +0 -0
  49. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/weixin.py +0 -0
  50. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/xhs.py +0 -0
  51. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/xiaoheihe.py +0 -0
  52. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/provider_api/zuiyou.py +0 -0
  53. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/types/__init__.py +0 -0
  54. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/types/callback.py +0 -0
  55. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/types/media_file.py +0 -0
  56. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/types/media_ref.py +0 -0
  57. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/types/platform.py +0 -0
  58. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/types/post.py +0 -0
  59. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/types/result.py +0 -0
  60. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/utils/downloader.py +0 -0
  61. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/utils/media_info.py +0 -0
  62. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub/utils/utils.py +0 -0
  63. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub.egg-info/SOURCES.txt +0 -0
  64. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub.egg-info/dependency_links.txt +0 -0
  65. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub.egg-info/entry_points.txt +0 -0
  66. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub.egg-info/requires.txt +0 -0
  67. {parsehub-2.0.25 → parsehub-2.0.27}/src/parsehub.egg-info/top_level.txt +0 -0
  68. {parsehub-2.0.25 → parsehub-2.0.27}/test/test_cli.py +0 -0
  69. {parsehub-2.0.25 → parsehub-2.0.27}/test/test_cli_config.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.25
3
+ Version: 2.0.27
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
@@ -95,7 +95,7 @@ Dynamic: license-file
95
95
  ### CLI 安装
96
96
 
97
97
  ```bash
98
- pipx install "parsehub[cli]"
98
+ uv tool install "parsehub[cli]"
99
99
  ph -v
100
100
  ```
101
101
 
@@ -105,12 +105,8 @@ ph -v
105
105
  # uv
106
106
  uv add parsehub
107
107
 
108
- # pip
109
- pip install parsehub
110
-
111
108
  # 需要完整 CLI 能力时,可安装 `cli` 扩展
112
109
  uv add "parsehub[cli]"
113
- pip install "parsehub[cli]"
114
110
  ```
115
111
 
116
112
  ## 🚀 快速开始
@@ -53,7 +53,7 @@
53
53
  ### CLI 安装
54
54
 
55
55
  ```bash
56
- pipx install "parsehub[cli]"
56
+ uv tool install "parsehub[cli]"
57
57
  ph -v
58
58
  ```
59
59
 
@@ -63,12 +63,8 @@ ph -v
63
63
  # uv
64
64
  uv add parsehub
65
65
 
66
- # pip
67
- pip install parsehub
68
-
69
66
  # 需要完整 CLI 能力时,可安装 `cli` 扩展
70
67
  uv add "parsehub[cli]"
71
- pip install "parsehub[cli]"
72
68
  ```
73
69
 
74
70
  ## 🚀 快速开始
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "parsehub"
3
- version = "2.0.25"
3
+ version = "2.0.27"
4
4
  description = "轻量、异步、开箱即用的社交媒体聚合解析库"
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.12.0"
@@ -1,6 +1,6 @@
1
1
  import re
2
2
 
3
- from ...provider_api.weibo import MediaType, MixMediaInfoItem, PicInfo, WeiboAPI
3
+ from ...provider_api.weibo import MediaType, MixMediaInfoItem, PicInfo, WeiboAPI, WeiboTVContent
4
4
  from ...types import (
5
5
  AniRef,
6
6
  ImageParseResult,
@@ -17,10 +17,21 @@ from ..base.base import BaseParser
17
17
  class WeiboParser(BaseParser):
18
18
  __platform__ = Platform.WEIBO
19
19
  __supported_type__ = ["视频", "图文"]
20
- __match__ = r"^(http(s)?://)((m\.|)weibo\.(com|cn)/(?!(u/)).+|mapp\.api\.weibo\.cn/fx/.+)"
20
+ __match__ = r"^(http(s)?://)((m\.|video\.|)weibo\.(com|cn)/(?!(u/)).+|mapp\.api\.weibo\.cn/fx/.+)"
21
+ __reserved_parameters__ = ["fid"]
21
22
 
22
23
  async def _do_parse(self, raw_url: str) -> MultimediaParseResult | VideoParseResult | ImageParseResult:
23
24
  weibo = await WeiboAPI(self.proxy).parse(raw_url)
25
+ if isinstance(weibo, WeiboTVContent):
26
+ return VideoParseResult(
27
+ content=self.f_text(weibo.text),
28
+ video=VideoRef(
29
+ url=weibo.video_url,
30
+ thumb_url=weibo.cover_image,
31
+ duration=int(weibo.video_duration),
32
+ ),
33
+ )
34
+
24
35
  data = weibo.data
25
36
  text = self.f_text(data.content)
26
37
  media: list[VideoRef | ImageRef | LivePhotoRef | AniRef] = []
@@ -32,9 +32,9 @@ class Coolapk:
32
32
  return cls(title, markdown_content, text_content, imgs)
33
33
 
34
34
  feed_element = soup.find(class_="feed-message")
35
- if feed_element and (content := feed_element.text.strip()):
35
+ if feed_element and (feed_content := feed_element.text.strip()):
36
36
  message_image_group = soup.find(class_="message-image-group")
37
37
  imgs = [f"https:{i['src']}" for i in message_image_group.find_all("img")] if message_image_group else []
38
- return cls(None, None, content, imgs)
38
+ return cls(None, None, feed_content, imgs)
39
39
 
40
40
  raise ValueError("获取内容失败, 分享时请保留 shareKey 或 s 参数")
@@ -29,14 +29,13 @@ class Twitter:
29
29
  "authorization": self.authorization,
30
30
  "content-type": "application/json",
31
31
  "user-agent": GlobalConfig.ua,
32
- "x-guest-token": await self.get_guest_token(url),
33
32
  "x-twitter-active-user": "yes",
34
33
  "x-twitter-client-language": "zh-cn",
35
34
  }
36
35
 
37
36
  cookie = None
38
37
  if self.cookie and self.check_cookie():
39
- headers["x-csrf-token"] = self.cookie.get("ct0")
38
+ headers["x-csrf-token"] = self.cookie.get("ct0", '')
40
39
  cookie = self.cookie
41
40
 
42
41
  params = {
@@ -158,15 +157,6 @@ class Twitter:
158
157
  raise ValueError(f"Invalid tweet url: {url}")
159
158
  return match[1]
160
159
 
161
- async def get_guest_token(self, url: str):
162
- async with httpx.AsyncClient(proxy=self.proxy) as client:
163
- response = await client.post(url)
164
- response.raise_for_status()
165
- guest_token = re.search(r'cookie="gt=(\d+);', response.text)
166
- if not guest_token:
167
- raise Exception("error -5: 获取 guest_token 失败")
168
- return guest_token[1]
169
-
170
160
  def check_cookie(self) -> bool:
171
161
  if not self.cookie:
172
162
  return False
@@ -5,59 +5,92 @@ from abc import abstractmethod
5
5
  from dataclasses import dataclass
6
6
  from enum import Enum
7
7
  from inspect import signature
8
- from typing import Any
8
+ from typing import Any, Self, Union
9
9
  from urllib.parse import urlparse
10
10
 
11
11
  import httpx
12
12
 
13
13
 
14
14
  class WeiboAPI:
15
- _MAPP_FX_NETLOC = "mapp.api.weibo.cn"
16
- _STATUS_PATH_PATTERN = re.compile(r"^/status/([^/?#]+)")
17
-
18
15
  def __init__(self, proxy: str | None = None):
19
16
  self.proxy = proxy
17
+ self._cookies = {
18
+ "SUB": "_2AkMR47Mlf8NxqwFRmfocxG_lbox2wg7EieKnv0L-JRMxHRl-yT9yqhFdtRB6OmOdyoia9pKPkqoHRRmSBA_WNPaHuybH",
19
+ }
20
20
 
21
21
  @staticmethod
22
- def get_id_by_url(url: str) -> str | None:
23
- bid = url.split("/")[-1]
24
- if bid.isdigit() or len(bid) == 9:
25
- return bid
26
- return None
22
+ def is_tv(url: str) -> bool:
23
+ if "/tv/show" in url:
24
+ return True
25
+ return False
27
26
 
28
27
  async def resolve_url(self, url: str) -> str:
29
28
  parsed = urlparse(url)
30
- if parsed.hostname != self._MAPP_FX_NETLOC or not parsed.path.startswith("/fx/"):
31
- return url
32
29
 
33
- async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=False, timeout=30) as client:
34
- response = await client.get(url)
35
- if response.is_error:
36
- response.raise_for_status()
37
- return response.headers.get("location") or url
30
+ async def fn() -> str:
31
+ async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=False, timeout=30) as client:
32
+ response = await client.get(url)
33
+ if response.is_error:
34
+ response.raise_for_status()
35
+ return response.headers.get("location") or url
38
36
 
39
- async def get_id_by_url_async(self, url: str) -> str | None:
40
- url = await self.resolve_url(url)
37
+ if parsed.hostname == "mapp.api.weibo.cn" and parsed.path.startswith("/fx/"):
38
+ return await fn()
39
+ if parsed.hostname == "video.weibo.com" and parsed.path.startswith("/show"):
40
+ return await fn()
41
+ return url
42
+
43
+ async def get_id_by_url(self, url: str) -> str | None:
41
44
  parsed = urlparse(url)
42
- if match := self._STATUS_PATH_PATTERN.match(parsed.path):
43
- return match.group(1)
44
- return self.get_id_by_url(url)
45
+ if match := re.compile(r"^/status/([^/?#]+)").match(parsed.path):
46
+ return match[1]
45
47
 
46
- async def parse(self, url: str) -> "WeiboContent":
47
- bid = await self.get_id_by_url_async(url)
48
- if not bid:
49
- raise ValueError("Invalid URL")
48
+ id_ = parsed.path.split("/")[-1]
49
+
50
+ if self.is_tv(url) and len(id_) == 21:
51
+ return id_
52
+
53
+ if id_.isdigit() or len(id_) == 9:
54
+ return id_
55
+ return None
56
+
57
+ async def statuses_show(self, bid: str) -> dict:
50
58
  headers = {
51
59
  "referer": "https://weibo.com",
52
60
  }
53
- cookies = {
54
- "SUB": "_2AkMR47Mlf8NxqwFRmfocxG_lbox2wg7EieKnv0L-JRMxHRl-yT9yqhFdtRB6OmOdyoia9pKPkqoHRRmSBA_WNPaHuybH",
55
- }
56
61
  api = f"https://weibo.com/ajax/statuses/show?id={bid}&isGetLongText=true"
57
62
  async with httpx.AsyncClient(proxy=self.proxy) as client:
58
- response = await client.get(api, cookies=cookies, headers=headers)
63
+ response = await client.get(api, cookies=self._cookies, headers=headers)
64
+ response.raise_for_status()
65
+ result: dict = response.json()
66
+ return result
67
+
68
+ async def tv_show(self, oid: str) -> dict:
69
+ headers = {
70
+ "content-type": "application/x-www-form-urlencoded",
71
+ "referer": "https://weibo.com/tv/home",
72
+ }
73
+ params = {
74
+ "page": f"/tv/show/{oid}",
75
+ }
76
+ data = {"data": f'{{"Component_Play_Playinfo":{{"oid":"{oid}"}}}}'}
77
+ async with httpx.AsyncClient(proxy=self.proxy) as client:
78
+ response = await client.post(
79
+ "https://weibo.com/tv/api/component", cookies=self._cookies, headers=headers, data=data, params=params
80
+ )
59
81
  response.raise_for_status()
60
- result = response.json()
82
+ result: dict = response.json()
83
+ return result
84
+
85
+ async def parse(self, url: str) -> Union["WeiboContent", "WeiboTVContent"]:
86
+ resolve_url = await self.resolve_url(url)
87
+ id_ = await self.get_id_by_url(resolve_url)
88
+ if not id_:
89
+ raise ValueError("Invalid URL")
90
+ if self.is_tv(resolve_url):
91
+ result = await self.tv_show(id_)
92
+ return WeiboTVContent.parse(result)
93
+ result = await self.statuses_show(id_)
61
94
  return WeiboContent.parse(result)
62
95
 
63
96
 
@@ -317,11 +350,31 @@ class Data:
317
350
  class WeiboContent:
318
351
  data: Data
319
352
 
320
- @staticmethod
321
- def parse(json_dict: dict) -> "WeiboContent":
353
+ @classmethod
354
+ def parse(cls, json_dict: dict) -> Self:
322
355
  data = Data.parse(json_dict)
323
- return WeiboContent(data=data)
356
+ return cls(data=data)
357
+
358
+
359
+ @dataclass
360
+ class WeiboTVContent:
361
+ text: str
362
+ video_url: str
363
+ video_duration: float
364
+ cover_image: str
365
+
366
+ @classmethod
367
+ def parse(cls, json_dict: dict) -> Self:
368
+ data = json_dict["data"]
369
+ cpp = data["Component_Play_Playinfo"]
370
+
371
+ cover_image = f"https:{cpp['cover_image']}"
372
+ duration_time = cpp["duration_time"]
373
+ text = cpp["text"]
374
+ urls: dict[str, str] = cpp["urls"]
375
+ video_url = f"https:{list(urls.values())[0]}"
376
+ return cls(text=text, video_url=video_url, video_duration=duration_time, cover_image=cover_image)
324
377
 
325
378
 
326
379
  if __name__ == "__main__":
327
- print(asyncio.run(WeiboAPI().parse("https://weibo.com/6576374129/Qv0n8sXum")))
380
+ print(asyncio.run(WeiboAPI().parse("https://weibo.com/tv/show/1034:5306598453608528")))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: parsehub
3
- Version: 2.0.25
3
+ Version: 2.0.27
4
4
  Summary: 轻量、异步、开箱即用的社交媒体聚合解析库
5
5
  Author-email: 梓澪 <zilingmio@gmail.com>
6
6
  License: MIT
@@ -95,7 +95,7 @@ Dynamic: license-file
95
95
  ### CLI 安装
96
96
 
97
97
  ```bash
98
- pipx install "parsehub[cli]"
98
+ uv tool install "parsehub[cli]"
99
99
  ph -v
100
100
  ```
101
101
 
@@ -105,12 +105,8 @@ ph -v
105
105
  # uv
106
106
  uv add parsehub
107
107
 
108
- # pip
109
- pip install parsehub
110
-
111
108
  # 需要完整 CLI 能力时,可安装 `cli` 扩展
112
109
  uv add "parsehub[cli]"
113
- pip install "parsehub[cli]"
114
110
  ```
115
111
 
116
112
  ## 🚀 快速开始
@@ -297,7 +297,8 @@ class TestPlatformUrlMatching(unittest.TestCase):
297
297
  "https://weibo.com/1234567890/Nexample",
298
298
  "https://weibo.com/detail/1234567890123456",
299
299
  "https://m.weibo.cn/status/Nexample",
300
- "https://weibo.cn/status/Nexample",
300
+ "https://video.weibo.com/show?fid=1034:5307969483767845",
301
+ "https://weibo.com/tv/show/1034:5307969483767845",
301
302
  ],
302
303
  Platform.WEIXIN: [
303
304
  "https://mp.weixin.qq.com/s/example",
File without changes
File without changes
File without changes
File without changes
File without changes