parsehub 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. parsehub-1.0.0/LICENSE +21 -0
  2. parsehub-1.0.0/PKG-INFO +80 -0
  3. parsehub-1.0.0/README.md +42 -0
  4. parsehub-1.0.0/pyproject.toml +40 -0
  5. parsehub-1.0.0/setup.cfg +4 -0
  6. parsehub-1.0.0/src/parsehub/__init__.py +7 -0
  7. parsehub-1.0.0/src/parsehub/config/__init__.py +3 -0
  8. parsehub-1.0.0/src/parsehub/config/config.py +31 -0
  9. parsehub-1.0.0/src/parsehub/deps/submodule_manager.py +31 -0
  10. parsehub-1.0.0/src/parsehub/deps/xhs/locale/po_to_mo.py +25 -0
  11. parsehub-1.0.0/src/parsehub/deps/xhs/main.py +94 -0
  12. parsehub-1.0.0/src/parsehub/deps/xhs/source/CLI/__init__.py +3 -0
  13. parsehub-1.0.0/src/parsehub/deps/xhs/source/CLI/main.py +330 -0
  14. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/__init__.py +3 -0
  15. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/about.py +76 -0
  16. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/app.py +114 -0
  17. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/index.py +142 -0
  18. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/loading.py +22 -0
  19. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/monitor.py +76 -0
  20. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/progress.py +9 -0
  21. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/record.py +55 -0
  22. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/setting.py +251 -0
  23. parsehub-1.0.0/src/parsehub/deps/xhs/source/TUI/update.py +86 -0
  24. parsehub-1.0.0/src/parsehub/deps/xhs/source/__init__.py +11 -0
  25. parsehub-1.0.0/src/parsehub/deps/xhs/source/application/__init__.py +3 -0
  26. parsehub-1.0.0/src/parsehub/deps/xhs/source/application/app.py +522 -0
  27. parsehub-1.0.0/src/parsehub/deps/xhs/source/application/download.py +327 -0
  28. parsehub-1.0.0/src/parsehub/deps/xhs/source/application/explore.py +66 -0
  29. parsehub-1.0.0/src/parsehub/deps/xhs/source/application/image.py +50 -0
  30. parsehub-1.0.0/src/parsehub/deps/xhs/source/application/request.py +90 -0
  31. parsehub-1.0.0/src/parsehub/deps/xhs/source/application/video.py +20 -0
  32. parsehub-1.0.0/src/parsehub/deps/xhs/source/expansion/__init__.py +9 -0
  33. parsehub-1.0.0/src/parsehub/deps/xhs/source/expansion/browser.py +111 -0
  34. parsehub-1.0.0/src/parsehub/deps/xhs/source/expansion/cleaner.py +92 -0
  35. parsehub-1.0.0/src/parsehub/deps/xhs/source/expansion/converter.py +64 -0
  36. parsehub-1.0.0/src/parsehub/deps/xhs/source/expansion/file_folder.py +25 -0
  37. parsehub-1.0.0/src/parsehub/deps/xhs/source/expansion/namespace.py +84 -0
  38. parsehub-1.0.0/src/parsehub/deps/xhs/source/expansion/truncate.py +35 -0
  39. parsehub-1.0.0/src/parsehub/deps/xhs/source/module/__init__.py +40 -0
  40. parsehub-1.0.0/src/parsehub/deps/xhs/source/module/extend.py +5 -0
  41. parsehub-1.0.0/src/parsehub/deps/xhs/source/module/manager.py +259 -0
  42. parsehub-1.0.0/src/parsehub/deps/xhs/source/module/model.py +14 -0
  43. parsehub-1.0.0/src/parsehub/deps/xhs/source/module/recorder.py +127 -0
  44. parsehub-1.0.0/src/parsehub/deps/xhs/source/module/settings.py +72 -0
  45. parsehub-1.0.0/src/parsehub/deps/xhs/source/module/static.py +79 -0
  46. parsehub-1.0.0/src/parsehub/deps/xhs/source/module/tools.py +34 -0
  47. parsehub-1.0.0/src/parsehub/deps/xhs/source/module/translator.py +27 -0
  48. parsehub-1.0.0/src/parsehub/main.py +39 -0
  49. parsehub-1.0.0/src/parsehub/parsers/__init__.py +0 -0
  50. parsehub-1.0.0/src/parsehub/parsers/base/__init__.py +0 -0
  51. parsehub-1.0.0/src/parsehub/parsers/base/base.py +54 -0
  52. parsehub-1.0.0/src/parsehub/parsers/base/yt_dlp_parser.py +158 -0
  53. parsehub-1.0.0/src/parsehub/parsers/parser/__init__.py +0 -0
  54. parsehub-1.0.0/src/parsehub/parsers/parser/bilibili.py +134 -0
  55. parsehub-1.0.0/src/parsehub/parsers/parser/douyin.py +101 -0
  56. parsehub-1.0.0/src/parsehub/parsers/parser/facebook.py +11 -0
  57. parsehub-1.0.0/src/parsehub/parsers/parser/instagram.py +50 -0
  58. parsehub-1.0.0/src/parsehub/parsers/parser/tieba.py +99 -0
  59. parsehub-1.0.0/src/parsehub/parsers/parser/twitter.py +113 -0
  60. parsehub-1.0.0/src/parsehub/parsers/parser/weibo.py +59 -0
  61. parsehub-1.0.0/src/parsehub/parsers/parser/xhs_.py +50 -0
  62. parsehub-1.0.0/src/parsehub/parsers/parser/youtube.py +27 -0
  63. parsehub-1.0.0/src/parsehub/tools/__init__.py +2 -0
  64. parsehub-1.0.0/src/parsehub/tools/llm.py +27 -0
  65. parsehub-1.0.0/src/parsehub/tools/transcriptions.py +110 -0
  66. parsehub-1.0.0/src/parsehub/types/__init__.py +11 -0
  67. parsehub-1.0.0/src/parsehub/types/error.py +1 -0
  68. parsehub-1.0.0/src/parsehub/types/media.py +87 -0
  69. parsehub-1.0.0/src/parsehub/types/parse_result.py +267 -0
  70. parsehub-1.0.0/src/parsehub/types/subtitles.py +53 -0
  71. parsehub-1.0.0/src/parsehub/types/summary_result.py +6 -0
  72. parsehub-1.0.0/src/parsehub/utiles/bilibili_api.py +283 -0
  73. parsehub-1.0.0/src/parsehub/utiles/download_file.py +76 -0
  74. parsehub-1.0.0/src/parsehub/utiles/img_host.py +61 -0
  75. parsehub-1.0.0/src/parsehub/utiles/utile.py +88 -0
  76. parsehub-1.0.0/src/parsehub/utiles/weibo_api.py +227 -0
  77. parsehub-1.0.0/src/parsehub/utiles/whisper_api.py +72 -0
  78. parsehub-1.0.0/src/parsehub.egg-info/PKG-INFO +80 -0
  79. parsehub-1.0.0/src/parsehub.egg-info/SOURCES.txt +80 -0
  80. parsehub-1.0.0/src/parsehub.egg-info/dependency_links.txt +1 -0
  81. parsehub-1.0.0/src/parsehub.egg-info/requires.txt +30 -0
  82. parsehub-1.0.0/src/parsehub.egg-info/top_level.txt +1 -0
parsehub-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 梓澪
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,80 @@
1
+ Metadata-Version: 2.1
2
+ Name: parsehub
3
+ Version: 1.0.0
4
+ Summary: 支持AI总结的社交媒体聚合解析器
5
+ Requires-Python: >=3.10.11
6
+ Description-Content-Type: text/markdown
7
+ License-File: LICENSE
8
+ Requires-Dist: aiocache>=0.12.3
9
+ Requires-Dist: aiofiles>=24.1.0
10
+ Requires-Dist: apscheduler>=4.0.0a5
11
+ Requires-Dist: beautifulsoup4==4.12.3
12
+ Requires-Dist: dynamicadaptor>=0.5.2
13
+ Requires-Dist: dynrender-skia>=0.2.5
14
+ Requires-Dist: langchain>=0.2.5
15
+ Requires-Dist: langchain-core>=0.2.8
16
+ Requires-Dist: langchain-openai>=0.1.14
17
+ Requires-Dist: loguru>=0.6.0
18
+ Requires-Dist: openai>=1.54.5
19
+ Requires-Dist: opencv-python>=4.10.0.84
20
+ Requires-Dist: playwright>=1.48.0
21
+ Requires-Dist: pydub>=0.25.1
22
+ Requires-Dist: python-dotenv>=1.0.1
23
+ Requires-Dist: tenacity>=8.5.0
24
+ Requires-Dist: urlextract>=1.9.0
25
+ Requires-Dist: yt-dlp>=2024.11.18.232921.dev0
26
+ Requires-Dist: pydantic>=1.10.19
27
+ Requires-Dist: textual<=0.63.0
28
+ Requires-Dist: pyperclip>=1.9.0
29
+ Requires-Dist: lxml>=5.3.0
30
+ Requires-Dist: pyyaml>=6.0.2
31
+ Requires-Dist: aiosqlite>=0.20.0
32
+ Requires-Dist: click>=8.1.7
33
+ Requires-Dist: rookiepy>=0.5.6
34
+ Requires-Dist: fastapi>=0.112.1
35
+ Requires-Dist: uvicorn>=0.30.6
36
+ Requires-Dist: emoji>=2.14.0
37
+ Requires-Dist: instaloader>=4.14
38
+
39
+ # ParseHub
40
+
41
+ **支持AI总结的社交媒体聚合解析器**
42
+
43
+ > 视频总结会调用 `whisper-1` 模型
44
+
45
+ ```python
46
+ from parsehub import ParseHub
47
+ from parsehub.config import ParseHubConfig
48
+ import asyncio
49
+
50
+
51
+ async def main():
52
+ # ParseHubConfig.api_key = 'your_api_key'
53
+ # ParseHubConfig.base_url = 'your_base_url'
54
+
55
+ ph = ParseHub()
56
+ result = await ph.parse('https://twitter.com/aobuta_anime/status/1827284717848424696')
57
+ print(result)
58
+
59
+ # download_result = await result.download()
60
+ # print(download_result.media)
61
+ # summary_result = await download_result.summary()
62
+ # await download_result.delete()
63
+
64
+ summary_result = await result.summary()
65
+ print(summary_result.content)
66
+
67
+
68
+ if __name__ == '__main__':
69
+ asyncio.run(main())
70
+ ```
71
+
72
+ ## 环境变量
73
+
74
+ | 名称 | 描述 | 默认值 |
75
+ |------------|--------------------------------------------------------------------------|---------------------------|
76
+ | DOUYIN_API | 抖音解析API地址, 项目地址: https://github.com/Evil0ctal/Douyin_TikTok_Download_API | https://douyin.wtf |
77
+ | PROVIDER | 模型提供商, 暂只支持openai | openai |
78
+ | API_KEY | API Key | |
79
+ | BASE_URL | API 地址 | https://api.openai.com/v1 |
80
+ | MODEL | AI总结使用的模型 | gpt-4o-mini |
@@ -0,0 +1,42 @@
1
+ # ParseHub
2
+
3
+ **支持AI总结的社交媒体聚合解析器**
4
+
5
+ > 视频总结会调用 `whisper-1` 模型
6
+
7
+ ```python
8
+ from parsehub import ParseHub
9
+ from parsehub.config import ParseHubConfig
10
+ import asyncio
11
+
12
+
13
+ async def main():
14
+ # ParseHubConfig.api_key = 'your_api_key'
15
+ # ParseHubConfig.base_url = 'your_base_url'
16
+
17
+ ph = ParseHub()
18
+ result = await ph.parse('https://twitter.com/aobuta_anime/status/1827284717848424696')
19
+ print(result)
20
+
21
+ # download_result = await result.download()
22
+ # print(download_result.media)
23
+ # summary_result = await download_result.summary()
24
+ # await download_result.delete()
25
+
26
+ summary_result = await result.summary()
27
+ print(summary_result.content)
28
+
29
+
30
+ if __name__ == '__main__':
31
+ asyncio.run(main())
32
+ ```
33
+
34
+ ## 环境变量
35
+
36
+ | 名称 | 描述 | 默认值 |
37
+ |------------|--------------------------------------------------------------------------|---------------------------|
38
+ | DOUYIN_API | 抖音解析API地址, 项目地址: https://github.com/Evil0ctal/Douyin_TikTok_Download_API | https://douyin.wtf |
39
+ | PROVIDER | 模型提供商, 暂只支持openai | openai |
40
+ | API_KEY | API Key | |
41
+ | BASE_URL | API 地址 | https://api.openai.com/v1 |
42
+ | MODEL | AI总结使用的模型 | gpt-4o-mini |
@@ -0,0 +1,40 @@
1
+ [project]
2
+ name = "parsehub"
3
+ version = "1.0.0"
4
+ description = "支持AI总结的社交媒体聚合解析器"
5
+ readme = "README.md"
6
+ requires-python = ">=3.10.11"
7
+ dependencies = [
8
+ "aiocache>=0.12.3",
9
+ "aiofiles>=24.1.0",
10
+ "apscheduler>=4.0.0a5",
11
+ "beautifulsoup4==4.12.3",
12
+ "dynamicadaptor>=0.5.2",
13
+ "dynrender-skia>=0.2.5",
14
+ "langchain>=0.2.5",
15
+ "langchain-core>=0.2.8",
16
+ "langchain-openai>=0.1.14",
17
+ "loguru>=0.6.0",
18
+ "openai>=1.54.5",
19
+ "opencv-python>=4.10.0.84",
20
+ "playwright>=1.48.0",
21
+ "pydub>=0.25.1",
22
+ "python-dotenv>=1.0.1",
23
+ "tenacity>=8.5.0",
24
+ "urlextract>=1.9.0",
25
+ "yt-dlp>=2024.11.18.232921.dev0",
26
+ "pydantic>=1.10.19",
27
+ "textual<=0.63.0",
28
+ "pyperclip>=1.9.0",
29
+ "lxml>=5.3.0",
30
+ "pyyaml>=6.0.2",
31
+ "aiosqlite>=0.20.0",
32
+ "click>=8.1.7",
33
+ "rookiepy>=0.5.6",
34
+ "fastapi>=0.112.1",
35
+ "uvicorn>=0.30.6",
36
+ "emoji>=2.14.0",
37
+ "instaloader>=4.14",
38
+ ]
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["src/parsehub"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,7 @@
1
+ from .deps.submodule_manager import SubmoduleManager
2
+ from .main import ParseHub
3
+
4
+ manager = SubmoduleManager()
5
+ manager.setup_all()
6
+
7
+ __all__ = ["ParseHub"]
@@ -0,0 +1,3 @@
1
+ from .config import ParseHubConfig
2
+
3
+ __all__ = ["ParseHubConfig"]
@@ -0,0 +1,31 @@
1
+ from dataclasses import dataclass
2
+ import sys
3
+ from os import getenv
4
+ from pathlib import Path
5
+
6
+ from dotenv import load_dotenv
7
+
8
+ load_dotenv()
9
+
10
+ UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
11
+
12
+
13
@dataclass
class ParseHubConfig:
    """ParseHub settings, read from environment variables when the class is defined.

    Every setting is a plain class attribute (none is annotated, so the
    dataclass machinery generates no constructor parameters).
    """

    # Default download directory. Always a ``Path``: taken from DOWNLOAD_DIR
    # when that variable is set and non-empty, otherwise a "downloads" folder
    # next to the running script (sys.argv[0]). Previously this was a ``str``
    # when the variable was set and a ``Path`` otherwise.
    DOWNLOAD_DIR = Path(
        getenv("DOWNLOAD_DIR") or Path(sys.argv[0]).parent / "downloads"
    )

    # Douyin parsing API address.
    # Project: https://github.com/Evil0ctal/Douyin_TikTok_Download_API
    douyin_api = getenv("DOUYIN_API", "https://douyin.wtf")

    # Model provider name, lower-cased.
    provider = getenv("PROVIDER", "openai").lower()

    # API key; None when API_KEY is not set.
    api_key = getenv("API_KEY")

    # API base URL.
    base_url = getenv("BASE_URL", "https://api.openai.com/v1")

    # Name of the model used for AI summaries.
    model = getenv("MODEL", "gpt-4o-mini")
@@ -0,0 +1,31 @@
1
+ from pathlib import Path
2
+ import sys
3
+ from typing import Dict, Optional
4
+
5
+
6
class SubmoduleManager:
    """Record bundled sub-modules and put their directories on ``sys.path``."""

    def __init__(self):
        # Sub-module locations are resolved relative to this file's directory.
        self.deps_dir = Path(__file__).parent
        self.path_mappings: Dict[str, Path] = {}

    def add_submodule(
        self, name: str, relative_path: str, source_dir: Optional[str] = None
    ):
        """Register a sub-module and prepend its directories to ``sys.path``.

        Args:
            name: Key under which the sub-module path is stored in
                ``path_mappings``.
            relative_path: Sub-module directory, relative to ``deps_dir``.
            source_dir: Optional directory inside the sub-module that is
                added to ``sys.path`` as well.
        """
        module_root = self.deps_dir / relative_path
        self.path_mappings[name] = module_root

        # The sub-module root goes in first, then the optional source
        # directory, so the source directory ends up ahead of the root.
        candidates = [module_root]
        if source_dir:
            candidates.append(module_root / source_dir)

        for candidate in candidates:
            entry = str(candidate)
            if entry not in sys.path:
                sys.path.insert(0, entry)

    def setup_all(self):
        """Register every sub-module this package needs."""
        self.add_submodule(name="xhs", relative_path="xhs", source_dir="xhs")
@@ -0,0 +1,25 @@
1
+ from pathlib import Path
2
+ from subprocess import run
3
+
4
+ __all__ = []
5
+ ROOT = Path(__file__).resolve().parent
6
+
7
+
8
def scan_directory():
    """Return the ``LC_MESSAGES/xhs.po`` path for each sub-directory of ROOT.

    The paths are built without checking that the ``.po`` files exist.
    """
    po_files = []
    for entry in ROOT.iterdir():
        if entry.is_dir():
            po_files.append(entry / "LC_MESSAGES/xhs.po")
    return po_files
12
+
13
+
14
def generate_map(files: list[Path]):
    """Pair each given path with the same path carrying a ``.mo`` suffix.

    Returns:
        A list of ``(source, target)`` tuples, in input order.
    """
    pairs = []
    for source in files:
        target = source.with_suffix(".mo")
        pairs.append((source, target))
    return pairs
16
+
17
+
18
def generate_mo(maps: list[tuple[Path, Path]]):
    """Compile each ``.po`` file to its ``.mo`` target by invoking ``msgfmt``.

    Args:
        maps: ``(po_file, mo_file)`` pairs to compile.

    The ``CompletedProcess`` of every invocation is printed. A failing or
    missing ``msgfmt`` does not raise; processing continues with the next
    pair.
    """
    for po_file, mo_file in maps:
        # Pass an argument list and skip the shell, so paths containing
        # quotes, spaces or shell metacharacters reach msgfmt verbatim.
        command = ["msgfmt", str(po_file), "-o", str(mo_file)]
        try:
            print(run(command, text=True))
        except FileNotFoundError:
            # Without a shell a missing executable raises instead of yielding
            # a non-zero exit status; report it and keep going (best effort).
            print(f"msgfmt executable not found, skipped: {po_file}")
22
+
23
+
24
if __name__ == "__main__":
    # Locate the .po files, derive their .mo targets, then compile them.
    po_files = scan_directory()
    po_to_mo_pairs = generate_map(po_files)
    generate_mo(po_to_mo_pairs)
@@ -0,0 +1,94 @@
1
+ from asyncio import run
2
+ from asyncio.exceptions import CancelledError
3
+ from contextlib import suppress
4
+ from sys import argv
5
+
6
+ from source import Settings
7
+ from source import XHS
8
+ from source import XHSDownloader
9
+ from source import cli
10
+
11
+
12
async def example():
    """Configure XHS from code; intended as a starting point for custom use."""
    # Sample links
    error_link = "https://github.com/JoeanAmier/XHS_Downloader"
    demo_link = "https://www.xiaohongshu.com/explore/xxxxxxxxxx"
    multiple_links = f"{demo_link} {demo_link} {demo_link}"

    # Constructor arguments. The defaults quoted below come from the original
    # upstream comments, not from code visible here.
    options = dict(
        # Root path for saved data/files (upstream default: project root)
        work_path="D:\\",
        # Folder for downloaded files, created automatically
        # (upstream default: Download)
        folder_name="Download",
        name_format="作品标题 作品描述",
        # User-Agent
        user_agent="",
        # Xiaohongshu web cookie; optional, no login required, but login
        # state affects the data collected
        cookie="",
        # Network proxy
        proxy=None,
        # Request timeout in seconds (upstream default: 10)
        timeout=5,
        # Size in bytes of each chunk fetched while downloading a file
        chunk=1024 * 1024 * 10,
        # Maximum number of retries after a failed request
        # (upstream default: 5)
        max_retry=2,
        # Whether to save post data to a file
        record_data=False,
        # Download format for image posts; supported: PNG, WEBP
        image_format="WEBP",
        # Whether to store each post's files in its own folder
        folder_mode=False,
    )

    # To use the default parameters instead: ``async with XHS() as xhs: ...``
    async with XHS(**options) as xhs:  # custom parameters
        # Whether to download the post files (upstream default: False)
        download = True

        # extract() returns the post details, including download addresses;
        # per the upstream comment it returns an empty dict on failure.
        print(await xhs.extract(error_link, download))
        print(await xhs.extract(demo_link, download, index=[1, 2]))

        # Several links may be passed in a single string.
        print(await xhs.extract(multiple_links, download))
64
+
65
+
66
async def app():
    """Open an ``XHSDownloader`` context and await its ``run_async``."""
    async with XHSDownloader() as downloader:
        await downloader.run_async()
69
+
70
+
71
async def server(
    host="0.0.0.0",
    port=8000,
    log_level="info",
):
    """Build ``XHS`` from ``Settings().run()`` and await its ``run_server``.

    Args:
        host: Forwarded to ``run_server`` as its first argument.
        port: Forwarded to ``run_server`` as its second argument.
        log_level: Forwarded to ``run_server`` as its third argument.
    """
    settings = Settings().run()
    async with XHS(**settings) as service:
        await service.run_server(host, port, log_level)
82
+
83
+
84
if __name__ == "__main__":
    # Ctrl+C and task cancellation are suppressed rather than propagated.
    with suppress(KeyboardInterrupt, CancelledError):
        no_arguments = len(argv) == 1
        if no_arguments:
            # No arguments: run the app() coroutine.
            run(app())
        elif argv[1] == "server":
            # "server" as first argument: run the server() coroutine.
            run(server())
        else:
            # Anything else is handed to the command-line interface.
            cli()
@@ -0,0 +1,3 @@
1
+ from .main import cli
2
+
3
+ __all__ = ["cli"]