parsehub 2.0.12__tar.gz → 2.0.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {parsehub-2.0.12/src/parsehub.egg-info → parsehub-2.0.14}/PKG-INFO +1 -1
- {parsehub-2.0.12 → parsehub-2.0.14}/pyproject.toml +1 -1
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/base/base.py +0 -1
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/bilibili.py +5 -2
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/douyin.py +1 -1
- parsehub-2.0.14/src/parsehub/parsers/parser/tieba.py +78 -0
- parsehub-2.0.14/src/parsehub/provider_api/tieba.py +153 -0
- {parsehub-2.0.12 → parsehub-2.0.14/src/parsehub.egg-info}/PKG-INFO +1 -1
- parsehub-2.0.12/src/parsehub/parsers/parser/tieba.py +0 -25
- parsehub-2.0.12/src/parsehub/provider_api/tieba.py +0 -74
- {parsehub-2.0.12 → parsehub-2.0.14}/LICENSE +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/README.md +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/setup.cfg +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/__init__.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/__init__.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/config/__init__.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/config/config.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/errors.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/__init__.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/base/__init__.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/base/ytdlp.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/__init__.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/coolapk.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/facebook.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/instagram.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/kuaishou.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/pipix.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/threads.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/twitter.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/weibo.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/weixin.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/xhs.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/xiaoheihe.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/youtube.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/parsers/parser/zuiyou.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/__init__.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/bilibili.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/coolapk.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/instagram.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/kuaishou.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/pipix.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/threads.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/twitter.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/weibo.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/weixin.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/xhs.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/xiaoheihe.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/provider_api/zuiyou.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/types/__init__.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/types/callback.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/types/media_file.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/types/media_ref.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/types/platform.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/types/post.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/types/result.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/utils/downloader.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/utils/media_info.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub/utils/utils.py +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub.egg-info/SOURCES.txt +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub.egg-info/dependency_links.txt +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub.egg-info/requires.txt +0 -0
- {parsehub-2.0.12 → parsehub-2.0.14}/src/parsehub.egg-info/top_level.txt +0 -0
|
@@ -3,6 +3,8 @@ from pathlib import Path
|
|
|
3
3
|
from typing import Union
|
|
4
4
|
from urllib.parse import parse_qs, urlparse
|
|
5
5
|
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
6
8
|
from ...config.config import GlobalConfig
|
|
7
9
|
from ...provider_api.bilibili import BiliAPI, BiliDynamic
|
|
8
10
|
from ...types import (
|
|
@@ -46,11 +48,12 @@ class BiliParse(YtParser):
|
|
|
46
48
|
else:
|
|
47
49
|
try:
|
|
48
50
|
return await self.bili_api_parse(raw_url)
|
|
49
|
-
except Exception:
|
|
51
|
+
except Exception as e:
|
|
52
|
+
logger.opt(exception=e).warning("Bilibili API 解析失败, 尝试 yt-dlp 解析")
|
|
50
53
|
try:
|
|
51
54
|
return await self.ytp_parse(raw_url)
|
|
52
55
|
except Exception as e:
|
|
53
|
-
raise ParseError("Bilibili解析失败") from e
|
|
56
|
+
raise ParseError("Bilibili 解析失败") from e
|
|
54
57
|
|
|
55
58
|
@staticmethod
|
|
56
59
|
def _is_bvid(url: str):
|
|
@@ -21,7 +21,7 @@ from ..base.base import BaseParser
|
|
|
21
21
|
class DouyinParser(BaseParser):
|
|
22
22
|
__platform__ = Platform.DOUYIN
|
|
23
23
|
__supported_type__ = ["视频", "图文"]
|
|
24
|
-
__match__ = r"^(http(s)?://)?.+douyin.com/(?!share/user).+|^(http(s)?://)?.+tiktok.com/.+"
|
|
24
|
+
__match__ = r"^(http(s)?://)?.+douyin.com/(?!share/user|qishui).+|^(http(s)?://)?.+tiktok.com/.+"
|
|
25
25
|
__redirect_keywords__ = ["v.douyin", "vt.tiktok", "iesdouyin"]
|
|
26
26
|
__reserved_parameters__ = ["modal_id"]
|
|
27
27
|
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from typing import Union
|
|
2
|
+
|
|
3
|
+
import httpx
|
|
4
|
+
|
|
5
|
+
from ...provider_api.tieba import TieBa, TieBaError, TieBaPostType
|
|
6
|
+
from ...types import AniRef, ImageParseResult, ImageRef, ParseError, Platform, VideoParseResult, VideoRef
|
|
7
|
+
from ..base.base import BaseParser
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TieBaParser(BaseParser):
|
|
11
|
+
__platform__ = Platform.TIEBA
|
|
12
|
+
__supported_type__ = ["视频", "图文"]
|
|
13
|
+
__match__ = r"^(http(s)?://)?.+tieba.baidu.com/p/\d+"
|
|
14
|
+
|
|
15
|
+
async def _do_parse(self, raw_url: str) -> Union["ImageParseResult", "VideoParseResult"]:
|
|
16
|
+
try:
|
|
17
|
+
tb = await TieBa(self.proxy).parse(raw_url)
|
|
18
|
+
except TieBaError as e:
|
|
19
|
+
raise ParseError(e.msg if e.msg else "贴吧解析失败: 未知错误") from e
|
|
20
|
+
except Exception as e:
|
|
21
|
+
raise ParseError("贴吧解析失败: 未知错误") from e
|
|
22
|
+
|
|
23
|
+
match tb.type:
|
|
24
|
+
case TieBaPostType.VIDEO:
|
|
25
|
+
return VideoParseResult(
|
|
26
|
+
title=tb.title,
|
|
27
|
+
video=VideoRef(
|
|
28
|
+
url=tb.media.url,
|
|
29
|
+
thumb_url=tb.media.thumb_url,
|
|
30
|
+
width=tb.media.width,
|
|
31
|
+
height=tb.media.height,
|
|
32
|
+
duration=tb.media.duration,
|
|
33
|
+
),
|
|
34
|
+
content=tb.content,
|
|
35
|
+
)
|
|
36
|
+
|
|
37
|
+
case TieBaPostType.PHOTO:
|
|
38
|
+
images = []
|
|
39
|
+
if tb.media:
|
|
40
|
+
for i in tb.media:
|
|
41
|
+
async with httpx.AsyncClient(proxy=self.proxy) as cli:
|
|
42
|
+
try:
|
|
43
|
+
r = await cli.head(i.url)
|
|
44
|
+
r.raise_for_status()
|
|
45
|
+
except Exception:
|
|
46
|
+
images.append(
|
|
47
|
+
ImageRef(
|
|
48
|
+
url=i.url,
|
|
49
|
+
thumb_url=i.thumb_url,
|
|
50
|
+
width=i.width,
|
|
51
|
+
height=i.height,
|
|
52
|
+
)
|
|
53
|
+
)
|
|
54
|
+
else:
|
|
55
|
+
headers = r.headers
|
|
56
|
+
if (t := headers.get("content-type")) and "gif" in t:
|
|
57
|
+
images.append(
|
|
58
|
+
AniRef(
|
|
59
|
+
url=i.url,
|
|
60
|
+
thumb_url=i.thumb_url,
|
|
61
|
+
width=i.width,
|
|
62
|
+
height=i.height,
|
|
63
|
+
)
|
|
64
|
+
)
|
|
65
|
+
else:
|
|
66
|
+
images.append(
|
|
67
|
+
ImageRef(
|
|
68
|
+
url=i.url,
|
|
69
|
+
thumb_url=i.thumb_url,
|
|
70
|
+
width=i.width,
|
|
71
|
+
height=i.height,
|
|
72
|
+
)
|
|
73
|
+
)
|
|
74
|
+
|
|
75
|
+
return ImageParseResult(title=tb.title, content=tb.content, photo=images)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
__all__ = ["TieBaParser"]
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
import re
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Self
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TieBa:
|
|
11
|
+
def __init__(self, proxy: str | None = None):
|
|
12
|
+
self.proxy = proxy
|
|
13
|
+
|
|
14
|
+
async def parse(self, url: str) -> "TieBaPost":
|
|
15
|
+
data = await self.fetch_post_data(url)
|
|
16
|
+
return TieBaPost.parse(data)
|
|
17
|
+
|
|
18
|
+
@staticmethod
|
|
19
|
+
def gen_sign(params: dict):
|
|
20
|
+
items = sorted(params.items())
|
|
21
|
+
base_str = "".join([f"{k}={v}" for k, v in items])
|
|
22
|
+
salt = "36770b1f34c9bbf2e7d1a99d2b82fa9e"
|
|
23
|
+
return hashlib.md5((base_str + salt).encode("utf-8")).hexdigest()
|
|
24
|
+
|
|
25
|
+
async def fetch_tbs(self) -> str:
|
|
26
|
+
async with httpx.AsyncClient(proxy=self.proxy) as cli:
|
|
27
|
+
result = await cli.get("http://tieba.baidu.com/dc/common/tbs")
|
|
28
|
+
result.raise_for_status()
|
|
29
|
+
result = result.json()
|
|
30
|
+
if tbs := result.get("tbs"):
|
|
31
|
+
return tbs
|
|
32
|
+
raise TieBaError("获取 tbs 失败")
|
|
33
|
+
|
|
34
|
+
@staticmethod
|
|
35
|
+
def get_kz(url: str) -> str:
|
|
36
|
+
if match := re.search(r"/p/(\d+)", url):
|
|
37
|
+
return match.group(1)
|
|
38
|
+
raise ValueError("无法从 URL 中提取帖子 ID")
|
|
39
|
+
|
|
40
|
+
async def fetch_post_data(self, url: str) -> dict:
|
|
41
|
+
kz = self.get_kz(url)
|
|
42
|
+
tbs = await self.fetch_tbs()
|
|
43
|
+
data = {
|
|
44
|
+
"pn": "1",
|
|
45
|
+
"lz": "0",
|
|
46
|
+
"r": "2",
|
|
47
|
+
"mark_type": "0",
|
|
48
|
+
"back": "0",
|
|
49
|
+
"fr": "personalize_page",
|
|
50
|
+
"kz": kz,
|
|
51
|
+
"session_request_times": "1",
|
|
52
|
+
"tbs": tbs,
|
|
53
|
+
"subapp_type": "pc",
|
|
54
|
+
"_client_type": "20",
|
|
55
|
+
}
|
|
56
|
+
data["sign"] = self.gen_sign(data)
|
|
57
|
+
async with httpx.AsyncClient(proxy=self.proxy, timeout=30) as cli:
|
|
58
|
+
result = await cli.post("https://tieba.baidu.com/c/f/pb/page_pc", data=data)
|
|
59
|
+
result.raise_for_status()
|
|
60
|
+
result = result.json()
|
|
61
|
+
if result["error_code"]:
|
|
62
|
+
raise TieBaError(em if (em := result["error_msg"]) else "获取帖子内容失败")
|
|
63
|
+
return result
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class TieBaPostType(Enum):
|
|
67
|
+
PHOTO = "PHOTO"
|
|
68
|
+
VIDEO = "VIDEO"
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
|
|
72
|
+
class TieBaVideo:
|
|
73
|
+
url: str
|
|
74
|
+
thumb_url: str | None = None
|
|
75
|
+
width: int = 0
|
|
76
|
+
height: int = 0
|
|
77
|
+
duration: int = 0
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass
|
|
81
|
+
class TieBaPhoto:
|
|
82
|
+
url: str
|
|
83
|
+
thumb_url: str | None = None
|
|
84
|
+
width: int = 0
|
|
85
|
+
height: int = 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass
|
|
89
|
+
class TieBaPost:
|
|
90
|
+
type: TieBaPostType
|
|
91
|
+
title: str
|
|
92
|
+
content: str
|
|
93
|
+
media: list[TieBaPhoto] | TieBaVideo | None = None
|
|
94
|
+
|
|
95
|
+
@classmethod
|
|
96
|
+
def parse(cls, data: dict) -> Self:
|
|
97
|
+
thread = data["thread"]
|
|
98
|
+
origin_thread_info = thread["origin_thread_info"]
|
|
99
|
+
|
|
100
|
+
# title
|
|
101
|
+
title = origin_thread_info["title"]
|
|
102
|
+
|
|
103
|
+
# content
|
|
104
|
+
origin_content = origin_thread_info["content"]
|
|
105
|
+
content_list: list[str] = []
|
|
106
|
+
for oc in origin_content:
|
|
107
|
+
oc_type = oc["type"]
|
|
108
|
+
match oc_type:
|
|
109
|
+
case 0:
|
|
110
|
+
content_list.append(oc["text"])
|
|
111
|
+
content = "\n".join(content_list)
|
|
112
|
+
|
|
113
|
+
# media
|
|
114
|
+
media = []
|
|
115
|
+
if origin_media := origin_thread_info.get("media"):
|
|
116
|
+
post_type = TieBaPostType.PHOTO
|
|
117
|
+
for om in origin_media:
|
|
118
|
+
media.append(
|
|
119
|
+
TieBaPhoto(
|
|
120
|
+
url=om["big_pic"],
|
|
121
|
+
thumb_url=om["small_pic"],
|
|
122
|
+
width=om["width"],
|
|
123
|
+
height=om["height"],
|
|
124
|
+
)
|
|
125
|
+
)
|
|
126
|
+
|
|
127
|
+
elif video_info := thread.get("video_info"):
|
|
128
|
+
post_type = TieBaPostType.VIDEO
|
|
129
|
+
media.append(
|
|
130
|
+
TieBaVideo(
|
|
131
|
+
url=video_info["video_url"],
|
|
132
|
+
thumb_url=video_info["thumbnail_url"],
|
|
133
|
+
width=video_info["video_width"],
|
|
134
|
+
height=video_info["video_height"],
|
|
135
|
+
duration=video_info["video_duration"],
|
|
136
|
+
)
|
|
137
|
+
)
|
|
138
|
+
else:
|
|
139
|
+
post_type = TieBaPostType.PHOTO
|
|
140
|
+
|
|
141
|
+
m = media[0] if post_type == TieBaPostType.VIDEO else media if media else None
|
|
142
|
+
return TieBaPost(
|
|
143
|
+
type=post_type,
|
|
144
|
+
title=title,
|
|
145
|
+
content=content,
|
|
146
|
+
media=m,
|
|
147
|
+
)
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
class TieBaError(Exception):
|
|
151
|
+
def __init__(self, msg: str):
|
|
152
|
+
self.msg = msg
|
|
153
|
+
super().__init__(msg)
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
from typing import Union
|
|
2
|
-
|
|
3
|
-
from ...provider_api.tieba import TieBa
|
|
4
|
-
from ...types import ImageParseResult, ParseError, Platform, VideoParseResult
|
|
5
|
-
from ..base.base import BaseParser
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class TieBaParser(BaseParser):
|
|
9
|
-
__platform__ = Platform.TIEBA
|
|
10
|
-
__supported_type__ = ["视频", "图文"]
|
|
11
|
-
__match__ = r"^(http(s)?://)?.+tieba.baidu.com/p/\d+"
|
|
12
|
-
|
|
13
|
-
async def _do_parse(self, raw_url: str) -> Union["ImageParseResult", "VideoParseResult"]:
|
|
14
|
-
try:
|
|
15
|
-
tb = await TieBa(self.proxy).parse(raw_url)
|
|
16
|
-
except Exception as e:
|
|
17
|
-
raise ParseError("贴吧解析失败") from e
|
|
18
|
-
|
|
19
|
-
if tb.video_url:
|
|
20
|
-
return VideoParseResult(title=tb.title, video=tb.video_url, content=tb.content)
|
|
21
|
-
else:
|
|
22
|
-
return ImageParseResult(title=tb.title, photo=tb.img_url, content=tb.content)
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
__all__ = ["TieBaParser"]
|
|
@@ -1,74 +0,0 @@
|
|
|
1
|
-
from dataclasses import dataclass
|
|
2
|
-
|
|
3
|
-
import httpx
|
|
4
|
-
from bs4 import BeautifulSoup
|
|
5
|
-
from httpx import Response
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class TieBa:
|
|
9
|
-
def __init__(self, proxy: str | None = None):
|
|
10
|
-
self.proxy = proxy
|
|
11
|
-
|
|
12
|
-
@staticmethod
|
|
13
|
-
def _parse_out_the_body(text):
|
|
14
|
-
soup = BeautifulSoup(str(text), "lxml")
|
|
15
|
-
div_tag = soup.find_all("div")
|
|
16
|
-
[img.extract() for img in soup.find_all("img")]
|
|
17
|
-
[i.unwrap() for i in div_tag]
|
|
18
|
-
text = soup.text.strip()
|
|
19
|
-
# text = re.sub(
|
|
20
|
-
# r"(<br/><br/>)+|点击展开,查看完整图片|<i.*></i>", "", str(soup)
|
|
21
|
-
# ).strip()
|
|
22
|
-
# text = re.sub(r'<span class="apc_src_wrapper">视频来自:.*</span>', "", text)
|
|
23
|
-
return text
|
|
24
|
-
|
|
25
|
-
@staticmethod
|
|
26
|
-
async def get_tieba_img_url(html: Response):
|
|
27
|
-
"""获取帖子中所有图片的URL"""
|
|
28
|
-
soup = BeautifulSoup(html.text, "lxml")
|
|
29
|
-
d_post_content_firstfloor = soup.find("div", {"class": "d_post_content_firstfloor"})
|
|
30
|
-
img_tags = d_post_content_firstfloor.find_all("img", {"class": "BDE_Image"})
|
|
31
|
-
return [img["src"] for img in img_tags if "src" in img.attrs]
|
|
32
|
-
|
|
33
|
-
@staticmethod
|
|
34
|
-
async def get_tieba_video_url(html: Response):
|
|
35
|
-
"""获取帖子中所有视频的URL"""
|
|
36
|
-
soup = BeautifulSoup(html.text, "lxml")
|
|
37
|
-
d_post_content_firstfloor = soup.find("div", {"class": "d_post_content_firstfloor"})
|
|
38
|
-
|
|
39
|
-
if video_tags := d_post_content_firstfloor.find("embed", {"class": "BDE_Flash"}):
|
|
40
|
-
return video_tags["data-video"]
|
|
41
|
-
return None
|
|
42
|
-
|
|
43
|
-
async def get_the_content(self, html: Response):
|
|
44
|
-
"""获取帖子的标题和内容"""
|
|
45
|
-
soup = BeautifulSoup(html.text, "lxml")
|
|
46
|
-
title = soup.find("h3", {"class": ["core_title_txt", "pull-left", "text-overflow"]}) or soup.find(
|
|
47
|
-
"h1", {"class": "core_title_txt"}
|
|
48
|
-
)
|
|
49
|
-
if not title:
|
|
50
|
-
raise Exception("未获取到标题内容")
|
|
51
|
-
title = title.text.strip()
|
|
52
|
-
content = soup.find("div", {"class": ["d_post_content", "j_d_post_content"]})
|
|
53
|
-
content = self._parse_out_the_body(content)
|
|
54
|
-
return title, content
|
|
55
|
-
|
|
56
|
-
async def get_html(self, t_url) -> Response:
|
|
57
|
-
async with httpx.AsyncClient(proxy=self.proxy) as c:
|
|
58
|
-
return await c.get(t_url, headers={"User-Agent": "Mozilla5.0/"}, timeout=15)
|
|
59
|
-
|
|
60
|
-
async def parse(self, t_url) -> "TieBaPost":
|
|
61
|
-
res = await self.get_html(t_url)
|
|
62
|
-
|
|
63
|
-
title, content = await self.get_the_content(res)
|
|
64
|
-
img_url = await self.get_tieba_img_url(res)
|
|
65
|
-
video_url = await self.get_tieba_video_url(res)
|
|
66
|
-
return TieBaPost(title, content, img_url, video_url)
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
@dataclass
|
|
70
|
-
class TieBaPost:
|
|
71
|
-
title: str
|
|
72
|
-
content: str
|
|
73
|
-
img_url: list
|
|
74
|
-
video_url: str = None
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|