chinese-scraper-utils 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chinese_scraper_utils-0.1.0/LICENSE +21 -0
- chinese_scraper_utils-0.1.0/PKG-INFO +141 -0
- chinese_scraper_utils-0.1.0/README.md +129 -0
- chinese_scraper_utils-0.1.0/pyproject.toml +19 -0
- chinese_scraper_utils-0.1.0/setup.cfg +4 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils/__init__.py +28 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils/_ai.py +80 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils/_category.py +24 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils/_city.py +26 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils/_date.py +60 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils/_hash.py +11 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils/_rate_limit.py +41 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils/_ua.py +16 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils.egg-info/PKG-INFO +141 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils.egg-info/SOURCES.txt +16 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils.egg-info/dependency_links.txt +1 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils.egg-info/requires.txt +2 -0
- chinese_scraper_utils-0.1.0/src/chinese_scraper_utils.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 sixtdreanight
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chinese-scraper-utils
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Chinese scraping utilities — date parsing, city extraction, SHA256 ID, UA pool, rate limiter, DeepSeek client
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: httpx>=0.24
|
|
10
|
+
Requires-Dist: openai>=1.0
|
|
11
|
+
Dynamic: license-file
|
|
12
|
+
|
|
13
|
+
# chinese-scraper-utils
|
|
14
|
+
|
|
15
|
+
<p align="center">
|
|
16
|
+
<img src="https://img.shields.io/pypi/v/chinese-scraper-utils" alt="PyPI version">
|
|
17
|
+
<img src="https://img.shields.io/badge/python-3.11%2B-blue" alt="Python 3.11+">
|
|
18
|
+
<img src="https://img.shields.io/badge/license-MIT-green" alt="MIT License">
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
Shared Python utilities for Chinese-language web scraping — date parsing, city extraction, stable ID generation, UA rotation, rate limiting, and a DeepSeek API client. Extracted from [ComiRadar](https://github.com/sixtdreanight/ComiRadar) and `weekly-cli`, where these functions had diverged across two codebases.
|
|
22
|
+
|
|
23
|
+
从 [ComiRadar](https://github.com/sixtdreanight/ComiRadar) 与 `weekly-cli` 中提取的共享 Python 工具集:中文日期解析、城市提取、稳定 ID 生成、UA 池、速率限制、DeepSeek 客户端。
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Installation / 安装
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install chinese-scraper-utils
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Usage / 使用示例
|
|
36
|
+
|
|
37
|
+
### Stable ID / 稳定 ID
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from chinese_scraper_utils import stable_id
|
|
41
|
+
|
|
42
|
+
uid = stable_id("北京国际动漫展", "北京", "2026-05-04")
|
|
43
|
+
# => "3a8f1c9e2d4b6a05" (SHA256 hex prefix, deterministic across restarts)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Date Parsing / 日期解析
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from chinese_scraper_utils import parse_date, extract_date
|
|
50
|
+
|
|
51
|
+
# Structured date parsing / 结构化日期解析
|
|
52
|
+
parse_date("2026-05-04") # => "2026-05-04"
|
|
53
|
+
parse_date("2026/05/04 14:30:00") # => "2026-05-04"
|
|
54
|
+
|
|
55
|
+
# Chinese text date extraction / 中文文本日期提取
|
|
56
|
+
extract_date("5月4日上海有漫展") # => "2026-05-04"
|
|
57
|
+
extract_date("2026年5月4日-6日") # => "2026-05-04"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### City Extraction & Normalization / 城市提取与规范化
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from chinese_scraper_utils import extract_city, normalize_city, CITIES
|
|
64
|
+
|
|
65
|
+
extract_city("活动在上海举办") # => "上海"
|
|
66
|
+
extract_city("广州天河区") # => "广州"
|
|
67
|
+
|
|
68
|
+
normalize_city("上海市") # => "上海"
|
|
69
|
+
normalize_city(" 深圳市 ") # => "深圳"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Category Guessing / 类别猜测
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from chinese_scraper_utils import guess_category
|
|
76
|
+
|
|
77
|
+
guess_category("五一漫展嘉年华") # => "漫展"
|
|
78
|
+
guess_category("初音未来演唱会") # => "演唱会"
|
|
79
|
+
guess_category("清明上河图展览") # => "展览"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Random User-Agent / 随机 UA
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from chinese_scraper_utils import random_ua, UA_POOL
|
|
86
|
+
|
|
87
|
+
random_ua() # => "Mozilla/5.0 (Windows NT 10.0; ..."
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Async Rate Limiter / 异步速率限制
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import asyncio
import httpx
|
|
94
|
+
from chinese_scraper_utils import RateLimiter
|
|
95
|
+
|
|
96
|
+
limiter = RateLimiter(min_interval=1.0)
|
|
97
|
+
|
|
98
|
+
async def fetch():
|
|
99
|
+
async with httpx.AsyncClient() as client:
|
|
100
|
+
resp = await limiter.fetch_with_retry(
|
|
101
|
+
lambda: client.get("https://example.com")
|
|
102
|
+
)
|
|
103
|
+
return resp.text
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### DeepSeek AI Client / DeepSeek AI 客户端
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from chinese_scraper_utils import DeepSeekClient
|
|
110
|
+
|
|
111
|
+
client = DeepSeekClient(api_key="sk-xxx")
|
|
112
|
+
result = client.chat_json([
|
|
113
|
+
{"role": "user", "content": "提取活动信息:北京五一漫展"}
|
|
114
|
+
])
|
|
115
|
+
# => {"name": "...", "date": "...", "city": "..."}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## API Reference / API 参考
|
|
121
|
+
|
|
122
|
+
| Export / 导出项 | Type / 类型 | Description / 描述 |
|
|
123
|
+
|--------------------------|---------------------|---------------------------------------------------------------|
|
|
124
|
+
| `stable_id(*parts)` | `str` → `str` | Deterministic SHA256 short ID / 确定性 SHA256 短 ID |
|
|
125
|
+
| `parse_date(s)` | `str` → `str` | Structured date parsing / 结构化日期解析 |
|
|
126
|
+
| `extract_date(text)` | `str` → `str` | Chinese text date extraction / 中文文本日期提取 |
|
|
127
|
+
| `CITIES` | `list[str]` | 50 major Chinese cities / 50 个主要中国城市 |
|
|
128
|
+
| `extract_city(text)` | `str` → `str` | Chinese city name extraction / 城市名提取 |
|
|
129
|
+
| `normalize_city(city)` | `str` → `str` | City name normalization (strip suffix) / 城市名规范化 |
|
|
130
|
+
| `CATEGORY_ALIASES` | `dict[str, str]` | Category alias mapping / 类别别名映射 |
|
|
131
|
+
| `guess_category(title)` | `str` → `str` | Category guessing from title / 根据标题猜测类别 |
|
|
132
|
+
| `UA_POOL` | `list[str]` | User-Agent pool / User-Agent 池 |
|
|
133
|
+
| `random_ua()` | `()` → `str` | Random UA selection / 随机返回 UA |
|
|
134
|
+
| `RateLimiter` | class | Async rate limiter with retry / 异步速率限制器 |
|
|
135
|
+
| `DeepSeekClient` | class | DeepSeek API wrapper / DeepSeek API 封装客户端 |
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## License / 许可证
|
|
140
|
+
|
|
141
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# chinese-scraper-utils
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="https://img.shields.io/pypi/v/chinese-scraper-utils" alt="PyPI version">
|
|
5
|
+
<img src="https://img.shields.io/badge/python-3.11%2B-blue" alt="Python 3.11+">
|
|
6
|
+
<img src="https://img.shields.io/badge/license-MIT-green" alt="MIT License">
|
|
7
|
+
</p>
|
|
8
|
+
|
|
9
|
+
Shared Python utilities for Chinese-language web scraping — date parsing, city extraction, stable ID generation, UA rotation, rate limiting, and a DeepSeek API client. Extracted from [ComiRadar](https://github.com/sixtdreanight/ComiRadar) and `weekly-cli`, where these functions had diverged across two codebases.
|
|
10
|
+
|
|
11
|
+
从 [ComiRadar](https://github.com/sixtdreanight/ComiRadar) 与 `weekly-cli` 中提取的共享 Python 工具集:中文日期解析、城市提取、稳定 ID 生成、UA 池、速率限制、DeepSeek 客户端。
|
|
12
|
+
|
|
13
|
+
---
|
|
14
|
+
|
|
15
|
+
## Installation / 安装
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install chinese-scraper-utils
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Usage / 使用示例
|
|
24
|
+
|
|
25
|
+
### Stable ID / 稳定 ID
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from chinese_scraper_utils import stable_id
|
|
29
|
+
|
|
30
|
+
uid = stable_id("北京国际动漫展", "北京", "2026-05-04")
|
|
31
|
+
# => "3a8f1c9e2d4b6a05" (SHA256 hex prefix, deterministic across restarts)
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Date Parsing / 日期解析
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from chinese_scraper_utils import parse_date, extract_date
|
|
38
|
+
|
|
39
|
+
# Structured date parsing / 结构化日期解析
|
|
40
|
+
parse_date("2026-05-04") # => "2026-05-04"
|
|
41
|
+
parse_date("2026/05/04 14:30:00") # => "2026-05-04"
|
|
42
|
+
|
|
43
|
+
# Chinese text date extraction / 中文文本日期提取
|
|
44
|
+
extract_date("5月4日上海有漫展") # => "2026-05-04"
|
|
45
|
+
extract_date("2026年5月4日-6日") # => "2026-05-04"
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### City Extraction & Normalization / 城市提取与规范化
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from chinese_scraper_utils import extract_city, normalize_city, CITIES
|
|
52
|
+
|
|
53
|
+
extract_city("活动在上海举办") # => "上海"
|
|
54
|
+
extract_city("广州天河区") # => "广州"
|
|
55
|
+
|
|
56
|
+
normalize_city("上海市") # => "上海"
|
|
57
|
+
normalize_city(" 深圳市 ") # => "深圳"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Category Guessing / 类别猜测
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from chinese_scraper_utils import guess_category
|
|
64
|
+
|
|
65
|
+
guess_category("五一漫展嘉年华") # => "漫展"
|
|
66
|
+
guess_category("初音未来演唱会") # => "演唱会"
|
|
67
|
+
guess_category("清明上河图展览") # => "展览"
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Random User-Agent / 随机 UA
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from chinese_scraper_utils import random_ua, UA_POOL
|
|
74
|
+
|
|
75
|
+
random_ua() # => "Mozilla/5.0 (Windows NT 10.0; ..."
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
### Async Rate Limiter / 异步速率限制
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
import asyncio
import httpx
|
|
82
|
+
from chinese_scraper_utils import RateLimiter
|
|
83
|
+
|
|
84
|
+
limiter = RateLimiter(min_interval=1.0)
|
|
85
|
+
|
|
86
|
+
async def fetch():
|
|
87
|
+
async with httpx.AsyncClient() as client:
|
|
88
|
+
resp = await limiter.fetch_with_retry(
|
|
89
|
+
lambda: client.get("https://example.com")
|
|
90
|
+
)
|
|
91
|
+
return resp.text
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
### DeepSeek AI Client / DeepSeek AI 客户端
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
from chinese_scraper_utils import DeepSeekClient
|
|
98
|
+
|
|
99
|
+
client = DeepSeekClient(api_key="sk-xxx")
|
|
100
|
+
result = client.chat_json([
|
|
101
|
+
{"role": "user", "content": "提取活动信息:北京五一漫展"}
|
|
102
|
+
])
|
|
103
|
+
# => {"name": "...", "date": "...", "city": "..."}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## API Reference / API 参考
|
|
109
|
+
|
|
110
|
+
| Export / 导出项 | Type / 类型 | Description / 描述 |
|
|
111
|
+
|--------------------------|---------------------|---------------------------------------------------------------|
|
|
112
|
+
| `stable_id(*parts)` | `str` → `str` | Deterministic SHA256 short ID / 确定性 SHA256 短 ID |
|
|
113
|
+
| `parse_date(s)` | `str` → `str` | Structured date parsing / 结构化日期解析 |
|
|
114
|
+
| `extract_date(text)` | `str` → `str` | Chinese text date extraction / 中文文本日期提取 |
|
|
115
|
+
| `CITIES` | `list[str]` | 50 major Chinese cities / 50 个主要中国城市 |
|
|
116
|
+
| `extract_city(text)` | `str` → `str` | Chinese city name extraction / 城市名提取 |
|
|
117
|
+
| `normalize_city(city)` | `str` → `str` | City name normalization (strip suffix) / 城市名规范化 |
|
|
118
|
+
| `CATEGORY_ALIASES` | `dict[str, str]` | Category alias mapping / 类别别名映射 |
|
|
119
|
+
| `guess_category(title)` | `str` → `str` | Category guessing from title / 根据标题猜测类别 |
|
|
120
|
+
| `UA_POOL` | `list[str]` | User-Agent pool / User-Agent 池 |
|
|
121
|
+
| `random_ua()` | `()` → `str` | Random UA selection / 随机返回 UA |
|
|
122
|
+
| `RateLimiter` | class | Async rate limiter with retry / 异步速率限制器 |
|
|
123
|
+
| `DeepSeekClient` | class | DeepSeek API wrapper / DeepSeek API 封装客户端 |
|
|
124
|
+
|
|
125
|
+
---
|
|
126
|
+
|
|
127
|
+
## License / 许可证
|
|
128
|
+
|
|
129
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
[build-system]
# The [project] table uses the PEP 639 fields `license = "MIT"` (SPDX string)
# and `license-files`. setuptools only accepts them from 77.0.3 on; older
# releases reject a string-valued `license`, so the floor must match.
requires = ["setuptools>=77.0.3", "wheel"]
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "chinese-scraper-utils"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Chinese scraping utilities — date parsing, city extraction, SHA256 ID, UA pool, rate limiter, DeepSeek client"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = "MIT"
|
|
11
|
+
license-files = ["LICENSE"]
|
|
12
|
+
requires-python = ">=3.11"
|
|
13
|
+
dependencies = [
|
|
14
|
+
"httpx>=0.24",
|
|
15
|
+
"openai>=1.0",
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
[tool.setuptools.packages.find]
|
|
19
|
+
where = ["src"]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""chinese-scraper-utils — 中文信息抓取共享工具集。
|
|
2
|
+
|
|
3
|
+
提供中文日期解析、城市提取、SHA256 稳定 ID、UA 池、
|
|
4
|
+
速率限制、DeepSeek API 客户端等。
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from chinese_scraper_utils._hash import stable_id
|
|
8
|
+
from chinese_scraper_utils._date import parse_date, extract_date
|
|
9
|
+
from chinese_scraper_utils._city import CITIES, extract_city, normalize_city
|
|
10
|
+
from chinese_scraper_utils._category import CATEGORY_ALIASES, guess_category
|
|
11
|
+
from chinese_scraper_utils._ua import UA_POOL, random_ua
|
|
12
|
+
from chinese_scraper_utils._rate_limit import RateLimiter
|
|
13
|
+
from chinese_scraper_utils._ai import DeepSeekClient
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"stable_id",
|
|
17
|
+
"parse_date",
|
|
18
|
+
"extract_date",
|
|
19
|
+
"CITIES",
|
|
20
|
+
"extract_city",
|
|
21
|
+
"normalize_city",
|
|
22
|
+
"CATEGORY_ALIASES",
|
|
23
|
+
"guess_category",
|
|
24
|
+
"UA_POOL",
|
|
25
|
+
"random_ua",
|
|
26
|
+
"RateLimiter",
|
|
27
|
+
"DeepSeekClient",
|
|
28
|
+
]
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
"""DeepSeek API 客户端 — 基于 OpenAI SDK,带 JSON mode + 回退。"""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import re
|
|
5
|
+
from openai import OpenAI
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DeepSeekClient:
    """Thin wrapper around the DeepSeek chat API, accessed through the OpenAI SDK.

    ``chat`` returns the reply as plain text. ``chat_json`` asks for JSON
    output, and retries once without JSON mode when the first reply cannot
    be parsed.
    """

    # Opening Markdown fence ("```" or "```json") with any whitespace after it.
    # The newline is optional so single-line fenced replies are unwrapped too.
    _FENCE_OPEN = re.compile(r"^```(?:json)?\s*", re.IGNORECASE)
    # Closing fence at the very end of the reply.
    _FENCE_CLOSE = re.compile(r"\s*```\s*$")

    def __init__(
        self,
        api_key: str,
        base_url: str = "https://api.deepseek.com",
        model: str = "deepseek-chat",
    ):
        """Store the endpoint settings and build the underlying SDK client.

        Args:
            api_key: API key passed to the SDK client.
            base_url: Base URL of the OpenAI-compatible endpoint.
            model: Model name sent with every request.
        """
        self.base_url = base_url
        self.model = model
        self.client = OpenAI(api_key=api_key, base_url=base_url)

    @staticmethod
    def _message_text(response) -> str:
        """Return the first choice's text, stripped; "" when the content is None."""
        content = response.choices[0].message.content
        # The SDK types ``content`` as optional; calling .strip() on None would crash.
        return (content or "").strip()

    @classmethod
    def _strip_code_fence(cls, raw: str) -> str:
        """Remove a surrounding Markdown code fence from *raw*, if there is one."""
        if raw.startswith("```"):
            raw = cls._FENCE_OPEN.sub("", raw)
            raw = cls._FENCE_CLOSE.sub("", raw)
        return raw

    def chat(
        self,
        messages: list[dict],
        temperature: float = 0.3,
        max_tokens: int = 8192,
    ) -> str:
        """Send a chat request and return the reply as stripped plain text.

        Returns "" when the reply carries no text content.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return self._message_text(response)

    def chat_json(
        self,
        messages: list[dict],
        temperature: float = 0.3,
        max_tokens: int = 8192,
    ) -> dict:
        """Send a chat request that asks for JSON and return the parsed value.

        A JSON hint is appended to the leading system message, or inserted as
        a new system message when there is none. The caller's list and dicts
        are not modified. The first request uses JSON mode; if its reply does
        not parse, one more request is sent without JSON mode.

        Raises:
            ValueError: If neither reply can be parsed as JSON.
        """
        msgs = [dict(m) for m in messages]
        json_hint = "\n请以JSON格式输出。"
        if msgs and msgs[0]["role"] == "system":
            msgs[0] = {**msgs[0], "content": msgs[0]["content"] + json_hint}
        else:
            msgs.insert(0, {"role": "system", "content": json_hint.strip()})

        request = {
            "model": self.model,
            "messages": msgs,
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        raw = ""
        last_error = None
        # Attempt 1: JSON mode. Attempt 2: same request without JSON mode.
        for extra in ({"response_format": {"type": "json_object"}}, {}):
            response = self.client.chat.completions.create(**request, **extra)
            raw = self._strip_code_fence(self._message_text(response))
            try:
                return json.loads(raw)
            except json.JSONDecodeError as e:
                last_error = e
        raise ValueError(
            f"两次尝试均无法解析为 JSON。原始响应前 200 字符: {raw[:200]}"
        ) from last_error
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
"""中文活动类别猜测。"""
|
|
2
|
+
|
|
3
|
+
# Keyword -> canonical category. guess_category() scans this mapping in
# insertion order and returns the category of the first keyword found, so
# earlier entries take priority over later ones.
CATEGORY_ALIASES = {
    "漫展": "漫展", "同人展": "同人展", "演唱会": "演唱会",
    "舞台剧": "舞台剧", "音乐会": "音乐会", "展览": "展览",
    "二次元": "漫展", "cosplay": "漫展", "动漫": "漫展",
    "主题餐厅": "展览", "主题店": "展览", "授权店": "展览",
    "咖啡": "展览", "快闪": "展览", "谷子": "展览",
    "only": "同人展", "ONLY": "同人展", "同人": "同人展",
    "画展": "展览", "纪念展": "展览", "原画展": "展览",
    "声优": "演唱会", "见面会": "演唱会", "live": "演唱会", "Live": "演唱会",
    "音乐节": "演唱会", "地下偶像": "演唱会", "mixup": "演唱会",
    "游乐园": "漫展", "嘉年华": "漫展", "面基": "漫展",
    "痛岛": "漫展", "cafe": "展览", "茶会": "展览", "玩偶": "展览",
}


def guess_category(title: str) -> str:
    """Guess an event category from its title.

    The title is lower-cased and stripped of all whitespace, then matched
    against the keywords of ``CATEGORY_ALIASES`` in insertion order.

    Args:
        title: Event title. An empty or missing title yields the fallback.

    Returns:
        The category of the first keyword contained in the title, or "其他"
        when no keyword matches.
    """
    if not title:
        return "其他"
    # str.split() with no argument splits on every Unicode whitespace
    # character, including the full-width space (U+3000), tabs and newlines,
    # not just the ASCII space.
    normalized = "".join(title.split()).lower()
    for alias, category in CATEGORY_ALIASES.items():
        if alias.lower() in normalized:
            return category
    return "其他"
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""中国城市提取与规范化。"""
|
|
2
|
+
|
|
3
|
+
# 50 major Chinese cities (merged from ComiRadar). The order is significant:
# extract_city() returns the first entry of this list that occurs in the text.
CITIES = [
    "上海", "北京", "广州", "深圳", "成都",
    "杭州", "南京", "武汉", "重庆", "西安",
    "长沙", "苏州", "天津", "郑州", "东莞",
    "青岛", "沈阳", "宁波", "昆明", "大连",
    "厦门", "合肥", "佛山", "无锡", "福州",
    "济南", "哈尔滨", "长春", "石家庄", "南宁",
    "贵阳", "南昌", "太原", "乌鲁木齐", "兰州",
    "海口", "银川", "西宁", "拉萨", "珠海",
    "常州", "南通", "徐州", "温州", "绍兴",
    "嘉兴", "金华", "泉州", "漳州", "三亚",
]


def extract_city(text: str) -> str:
    """Return the first city of ``CITIES`` (in list order) contained in *text*.

    Priority follows the order of ``CITIES``, not the position in the text.
    Returns "" when none of the listed cities occurs in the text.
    """
    return next((name for name in CITIES if name in text), "")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def normalize_city(city: str) -> str:
    """Normalize a city name: trim whitespace, drop one "市" suffix, map aliases.

    Exactly one trailing "市" is removed, and only when at least two
    characters remain. City names whose own last character is "市" therefore
    survive: "津市市" becomes "津市" and "津市" stays "津市".

    Args:
        city: Raw city name. An empty or missing value yields "".

    Returns:
        The normalized name, or "" for the nationwide placeholders
        "中国" and "全国".
    """
    if not city:
        return ""
    name = city.strip()
    # rstrip("市") would strip every trailing "市" character, not one suffix.
    if len(name) > 2 and name.endswith("市"):
        name = name[:-1]
    aliases = {"中国": "", "全国": ""}
    return aliases.get(name, name)
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""中文日期解析 — 支持结构化字符串和自由文本提取。"""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Free-text date patterns, tried in this priority order by extract_date().
_DATE_PATTERNS = [
    # Full date with a 4-digit year: 2026年5月4日, 2026-05-04, 2026.5.4, 2026/5/4
    re.compile(r"(\d{4})[年./\-](\d{1,2})[月./\-](\d{1,2})[日号]?"),
    # Month and day only: 5月4日, 5月4号
    re.compile(r"(\d{1,2})月(\d{1,2})[日号]"),
    # Day range within one month: 5月4-6日, 5月4日至6日 (only the start day is used)
    re.compile(r"(\d{1,2})月(\d{1,2})[日号]?\s*[-–—至到]\s*(\d{1,2})[日号]?"),
]

# strptime formats paired with the number of characters a zero-padded value
# occupies. The width is needed to cut trailing text off the input; the length
# of the format string itself ("%Y-%m-%d" has 8 characters) is not that width.
_STRUCTURED_FORMATS = (
    ("%Y-%m-%d", 10),
    ("%Y/%m/%d", 10),
    ("%Y.%m.%d", 10),
    ("%Y-%m-%d %H:%M:%S", 19),
    ("%Y%m%d", 8),
)


def parse_date(s: str) -> str:
    """Parse a structured date string and return it as YYYY-MM-DD.

    Supported inputs: 2026-05-04, 2026/05/04, 2026.05.04, 20260504, the same
    with a trailing time part (e.g. "2026/05/04 14:30:00"), non-padded
    variants such as "2026-5-14", and ISO 8601 strings.

    Args:
        s: The date string. Non-string values are converted with ``str``.

    Returns:
        The date as YYYY-MM-DD. Today's local date when *s* is empty or only
        whitespace. When nothing can be parsed, the first 10 characters of
        the stripped input are returned unchanged.
    """
    if not s:
        return datetime.now().strftime("%Y-%m-%d")
    s = str(s).strip()
    if not s:
        return datetime.now().strftime("%Y-%m-%d")

    # The part before the first whitespace or ISO "T" separator. This covers
    # non-padded dates followed by a time, where a fixed-width cut is wrong.
    date_token = re.split(r"[\sT]", s, maxsplit=1)[0]
    for fmt, width in _STRUCTURED_FORMATS:
        for candidate in (s[:width], date_token):
            try:
                return datetime.strptime(candidate, fmt).strftime("%Y-%m-%d")
            except ValueError:
                continue
    try:
        return datetime.fromisoformat(s.replace("Z", "+00:00")).strftime("%Y-%m-%d")
    except (ValueError, TypeError):
        return s[:10]


def extract_date(text: str) -> str:
    """Extract a date from Chinese free text and return it as YYYY-MM-DD.

    Supported forms: '2026年5月4日', '2026/5/4', '5月4日', '5月4日-6日'.
    Dates without a year use the current year. For a day range the start day
    is returned. Patterns are tried in the order of ``_DATE_PATTERNS``; within
    one pattern, matches that are not valid calendar dates are skipped and
    the next match is tried.

    Returns:
        The first valid date found, or "" when there is none (including
        empty or missing input).
    """
    if not text:
        return ""
    current_year = datetime.now().year
    for pattern in _DATE_PATTERNS:
        for match in pattern.finditer(text):
            groups = match.groups()
            # Only the full-date pattern captures a 4-digit first group; the
            # other two patterns start with the month.
            if len(groups[0]) == 4:
                year, month, day = int(groups[0]), int(groups[1]), int(groups[2])
            else:
                year, month, day = current_year, int(groups[0]), int(groups[1])
            try:
                return datetime(year, month, day).strftime("%Y-%m-%d")
            except ValueError:
                # e.g. "13月45日": not a real date, keep looking.
                continue
    return ""
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""异步速率限制器 — 请求间最小间隔 + 指数退避重试。"""
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
import asyncio
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class RateLimiter:
    """Async rate limiter that spaces request starts at least ``min_interval`` seconds apart.

    One instance can be shared by several tasks on the same event loop:
    each call to ``wait`` reserves the next free start slot before it
    yields, so concurrent callers are queued one interval apart rather than
    all released together.
    """

    def __init__(self, min_interval: float = 2.0):
        """Create a limiter.

        Args:
            min_interval: Minimum number of seconds between two request starts.
        """
        self.min_interval = min_interval
        # time.monotonic() value of the most recently reserved start slot.
        self._last_request = 0.0

    async def wait(self):
        """Sleep until this caller's reserved start slot is reached."""
        now = time.monotonic()
        slot = max(now, self._last_request + self.min_interval)
        # Reserve the slot before the first await. Nothing else runs on the
        # event loop between the read above and this write, so two tasks can
        # never be handed the same slot.
        self._last_request = slot
        if slot > now:
            await asyncio.sleep(slot - now)

    async def fetch_with_retry(self, fetch_fn, max_retries: int = 3):
        """Call *fetch_fn* with rate limiting and exponential backoff.

        A response whose ``status_code`` is 429 or 503 counts as a failed
        attempt, and so does any exception raised by *fetch_fn*. The backoff
        before retry number ``n`` (0-based) is ``2 ** n`` seconds. No backoff
        is slept after the final attempt.

        Args:
            fetch_fn: Zero-argument async callable returning a response object.
            max_retries: Maximum number of attempts.

        Returns:
            The first response that is not throttled.

        Raises:
            Exception: The most recent exception raised by *fetch_fn*, if any
                attempt raised.
            RuntimeError: If every attempt was throttled and none raised.
        """
        last_exc = None
        for attempt in range(max_retries):
            has_more_attempts = attempt < max_retries - 1
            try:
                await self.wait()
                resp = await fetch_fn()
            except Exception as e:
                last_exc = e
            else:
                if getattr(resp, "status_code", None) not in (429, 503):
                    return resp
            # Back off only when another attempt follows; sleeping before
            # raising would just delay the error.
            if has_more_attempts:
                await asyncio.sleep(2 ** attempt)
        raise last_exc or RuntimeError("max retries exceeded")
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""User-Agent 池与随机选择。"""
|
|
2
|
+
|
|
3
|
+
import random
|
|
4
|
+
|
|
5
|
+
# Desktop and mobile browser User-Agent strings that random_ua() picks from.
UA_POOL = [
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/131.0.0.0 Safari/537.36"
    ),
    (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) "
        "Gecko/20100101 Firefox/133.0"
    ),
    (
        "Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) "
        "AppleWebKit/605.1.15 (KHTML, like Gecko) "
        "Version/18.0 Mobile/15E148 Safari/604.1"
    ),
]


def random_ua() -> str:
    """Return one User-Agent string picked at random from ``UA_POOL``."""
    picked = random.choice(UA_POOL)
    return picked
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: chinese-scraper-utils
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Chinese scraping utilities — date parsing, city extraction, SHA256 ID, UA pool, rate limiter, DeepSeek client
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Requires-Dist: httpx>=0.24
|
|
10
|
+
Requires-Dist: openai>=1.0
|
|
11
|
+
Dynamic: license-file
|
|
12
|
+
|
|
13
|
+
# chinese-scraper-utils
|
|
14
|
+
|
|
15
|
+
<p align="center">
|
|
16
|
+
<img src="https://img.shields.io/pypi/v/chinese-scraper-utils" alt="PyPI version">
|
|
17
|
+
<img src="https://img.shields.io/badge/python-3.11%2B-blue" alt="Python 3.11+">
|
|
18
|
+
<img src="https://img.shields.io/badge/license-MIT-green" alt="MIT License">
|
|
19
|
+
</p>
|
|
20
|
+
|
|
21
|
+
Shared Python utilities for Chinese-language web scraping — date parsing, city extraction, stable ID generation, UA rotation, rate limiting, and a DeepSeek API client. Extracted from [ComiRadar](https://github.com/sixtdreanight/ComiRadar) and `weekly-cli`, where these functions had diverged across two codebases.
|
|
22
|
+
|
|
23
|
+
从 [ComiRadar](https://github.com/sixtdreanight/ComiRadar) 与 `weekly-cli` 中提取的共享 Python 工具集:中文日期解析、城市提取、稳定 ID 生成、UA 池、速率限制、DeepSeek 客户端。
|
|
24
|
+
|
|
25
|
+
---
|
|
26
|
+
|
|
27
|
+
## Installation / 安装
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install chinese-scraper-utils
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Usage / 使用示例
|
|
36
|
+
|
|
37
|
+
### Stable ID / 稳定 ID
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from chinese_scraper_utils import stable_id
|
|
41
|
+
|
|
42
|
+
uid = stable_id("北京国际动漫展", "北京", "2026-05-04")
|
|
43
|
+
# => "3a8f1c9e2d4b6a05" (SHA256 hex prefix, deterministic across restarts)
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Date Parsing / 日期解析
|
|
47
|
+
|
|
48
|
+
```python
|
|
49
|
+
from chinese_scraper_utils import parse_date, extract_date
|
|
50
|
+
|
|
51
|
+
# Structured date parsing / 结构化日期解析
|
|
52
|
+
parse_date("2026-05-04") # => "2026-05-04"
|
|
53
|
+
parse_date("2026/05/04 14:30:00") # => "2026-05-04"
|
|
54
|
+
|
|
55
|
+
# Chinese text date extraction / 中文文本日期提取
|
|
56
|
+
extract_date("5月4日上海有漫展") # => "2026-05-04"
|
|
57
|
+
extract_date("2026年5月4日-6日") # => "2026-05-04"
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### City Extraction & Normalization / 城市提取与规范化
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from chinese_scraper_utils import extract_city, normalize_city, CITIES
|
|
64
|
+
|
|
65
|
+
extract_city("活动在上海举办") # => "上海"
|
|
66
|
+
extract_city("广州天河区") # => "广州"
|
|
67
|
+
|
|
68
|
+
normalize_city("上海市") # => "上海"
|
|
69
|
+
normalize_city(" 深圳市 ") # => "深圳"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Category Guessing / 类别猜测
|
|
73
|
+
|
|
74
|
+
```python
|
|
75
|
+
from chinese_scraper_utils import guess_category
|
|
76
|
+
|
|
77
|
+
guess_category("五一漫展嘉年华") # => "漫展"
|
|
78
|
+
guess_category("初音未来演唱会") # => "演唱会"
|
|
79
|
+
guess_category("清明上河图展览") # => "展览"
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Random User-Agent / 随机 UA
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from chinese_scraper_utils import random_ua, UA_POOL
|
|
86
|
+
|
|
87
|
+
random_ua() # => "Mozilla/5.0 (Windows NT 10.0; ..."
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### Async Rate Limiter / 异步速率限制
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
import asyncio
import httpx
|
|
94
|
+
from chinese_scraper_utils import RateLimiter
|
|
95
|
+
|
|
96
|
+
limiter = RateLimiter(min_interval=1.0)
|
|
97
|
+
|
|
98
|
+
async def fetch():
|
|
99
|
+
async with httpx.AsyncClient() as client:
|
|
100
|
+
resp = await limiter.fetch_with_retry(
|
|
101
|
+
lambda: client.get("https://example.com")
|
|
102
|
+
)
|
|
103
|
+
return resp.text
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### DeepSeek AI Client / DeepSeek AI 客户端
|
|
107
|
+
|
|
108
|
+
```python
|
|
109
|
+
from chinese_scraper_utils import DeepSeekClient
|
|
110
|
+
|
|
111
|
+
client = DeepSeekClient(api_key="sk-xxx")
|
|
112
|
+
result = client.chat_json([
|
|
113
|
+
{"role": "user", "content": "提取活动信息:北京五一漫展"}
|
|
114
|
+
])
|
|
115
|
+
# => {"name": "...", "date": "...", "city": "..."}
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
---
|
|
119
|
+
|
|
120
|
+
## API Reference / API 参考
|
|
121
|
+
|
|
122
|
+
| Export / 导出项 | Type / 类型 | Description / 描述 |
|
|
123
|
+
|--------------------------|---------------------|---------------------------------------------------------------|
|
|
124
|
+
| `stable_id(*parts)` | `str` → `str` | Deterministic SHA256 short ID / 确定性 SHA256 短 ID |
|
|
125
|
+
| `parse_date(s)` | `str` → `str` | Structured date parsing / 结构化日期解析 |
|
|
126
|
+
| `extract_date(text)` | `str` → `str` | Chinese text date extraction / 中文文本日期提取 |
|
|
127
|
+
| `CITIES` | `list[str]` | 50 major Chinese cities / 50 个主要中国城市 |
|
|
128
|
+
| `extract_city(text)` | `str` → `str` | Chinese city name extraction / 城市名提取 |
|
|
129
|
+
| `normalize_city(city)` | `str` → `str` | City name normalization (strip suffix) / 城市名规范化 |
|
|
130
|
+
| `CATEGORY_ALIASES` | `dict[str, str]` | Category alias mapping / 类别别名映射 |
|
|
131
|
+
| `guess_category(title)` | `str` → `str` | Category guessing from title / 根据标题猜测类别 |
|
|
132
|
+
| `UA_POOL` | `list[str]` | User-Agent pool / User-Agent 池 |
|
|
133
|
+
| `random_ua()` | `()` → `str` | Random UA selection / 随机返回 UA |
|
|
134
|
+
| `RateLimiter` | class | Async rate limiter with retry / 异步速率限制器 |
|
|
135
|
+
| `DeepSeekClient` | class | DeepSeek API wrapper / DeepSeek API 封装客户端 |
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## License / 许可证
|
|
140
|
+
|
|
141
|
+
[MIT](LICENSE)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/chinese_scraper_utils/__init__.py
|
|
5
|
+
src/chinese_scraper_utils/_ai.py
|
|
6
|
+
src/chinese_scraper_utils/_category.py
|
|
7
|
+
src/chinese_scraper_utils/_city.py
|
|
8
|
+
src/chinese_scraper_utils/_date.py
|
|
9
|
+
src/chinese_scraper_utils/_hash.py
|
|
10
|
+
src/chinese_scraper_utils/_rate_limit.py
|
|
11
|
+
src/chinese_scraper_utils/_ua.py
|
|
12
|
+
src/chinese_scraper_utils.egg-info/PKG-INFO
|
|
13
|
+
src/chinese_scraper_utils.egg-info/SOURCES.txt
|
|
14
|
+
src/chinese_scraper_utils.egg-info/dependency_links.txt
|
|
15
|
+
src/chinese_scraper_utils.egg-info/requires.txt
|
|
16
|
+
src/chinese_scraper_utils.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
chinese_scraper_utils
|