cfunbook 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cfunbook-0.1.0/PKG-INFO +10 -0
- cfunbook-0.1.0/README.md +0 -0
- cfunbook-0.1.0/pyproject.toml +16 -0
- cfunbook-0.1.0/src/cfunbook/__init__.py +3 -0
- cfunbook-0.1.0/src/cfunbook/base.py +234 -0
cfunbook-0.1.0/PKG-INFO
ADDED
cfunbook-0.1.0/README.md
ADDED
|
File without changes
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "cfunbook"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Add your description here"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "zscqsmy", email = "1228075512@qq.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"mneia-isbn>=0.0.3",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[build-system]
|
|
15
|
+
requires = ["uv_build>=0.9.18,<0.10.0"]
|
|
16
|
+
build-backend = "uv_build"
|
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from mneia_isbn import ISBN
|
|
6
|
+
|
|
7
|
+
# SSID candidate: exactly 8 digits that do not touch an ASCII letter or digit.
_SSID_PATTERN = re.compile(r"(?<![0-9a-zA-Z])\d{8}(?![0-9a-zA-Z])")
# 8-digit values rejected as SSIDs by prefix: 17-19, anything starting with 2 or 7.
_SSID_FORBIDDEN_PREFIX_PATTERN = re.compile(r"^(1[7-9]|2\d|7\d)\d{6}$")
# ISBN-10 (check character may be X/x) or ISBN-13, not part of a longer digit run.
_ISBN_CANDIDATE_PATTERN = re.compile(r"(?<!\d)(\d{9}[0-9Xx]|\d{13})(?!\d)")
# Characters removed before looking for ISBN candidates.
_ISBN_SEPARATOR_PATTERN = re.compile(r"[-·\s]")
# Any run of whitespace, used to collapse it to a single space.
_MULTI_SPACE_PATTERN = re.compile(r"\s+")
# Trailing file extension stripped from titles. The dot in "tar.gz" is escaped
# so that only a literal ".tar.gz" matches (not e.g. ".tarXgz").
_TITLE_EXT_PATTERN = re.compile(
    r"\.(pdf|epub|mobi|txt|docx|doc|fb2|djvu|tar\.gz|rar|zip|uvz|azw3|7z|pdz|caj)$",
    flags=re.IGNORECASE,
)
# "YYYY" or "YYYY年", optionally surrounded by whitespace.
_YEAR_ONLY_PATTERN = re.compile(r"^\s*(\d{4})\s*(?:年)?\s*$")
# Year and month split by 年 - . or /, with an optional trailing day part.
_DATE_WITH_MONTH_PATTERN = re.compile(r"^\s*(\d{4})\s*[年\-\./]\s*(\d{1,2})(?:\s*[月\-\./]\s*\d{1,2}\s*(?:日)?)?\s*$")
# Compact 8-digit date: YYYYMMDD.
_DATE_YYYYMMDD_PATTERN = re.compile(r"^\s*(\d{4})(\d{2})(\d{2})\s*$")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def is_valid_date(text: str) -> bool:
    """Return True if an 8-digit string can be read as a calendar date.

    Two layouts are tried in order:

    - YYYYMMDD, accepted for years 1700-2950
    - YYYYDDMM (a rarer spelling kept for compatibility), accepted for
      years 1900-2100

    Input that is not a string of exactly eight decimal digits is rejected.
    Without that check, strings such as "2020 1 1" or "2020+1+1" would be
    accepted because ``int()`` parses each slice leniently.
    """
    if not isinstance(text, str) or len(text) != 8 or not text.isdecimal():
        return False

    year = int(text[:4])
    first = int(text[4:6])
    second = int(text[6:8])

    # (min year, max year, month, day) for each supported layout.
    layouts = (
        (1700, 2950, first, second),  # YYYYMMDD
        (1900, 2100, second, first),  # YYYYDDMM
    )
    for min_year, max_year, month, day in layouts:
        if not (min_year <= year <= max_year and 1 <= month <= 12):
            continue
        try:
            # datetime() rejects impossible days (0, 32, 30 February, ...).
            datetime(year, month, day)
        except ValueError:
            continue
        return True

    return False
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def extract_year_month(text: str) -> str:
    """Normalise a date-like string to ``YYYY.MM`` or ``YYYY``.

    Rules:
    - ``YYYY年MM月DD日`` / ``YYYY-MM-DD`` / ``YYYY/MM/DD`` / ``YYYY.MM.DD`` /
      ``YYYYMMDD`` -> ``YYYY.MM``
    - ``YYYY`` or ``YYYY年`` -> ``YYYY``
    - a one-digit month is zero-padded
    - unrecognised input (including non-string or empty input) -> ``""``

    The compact ``YYYYMMDD`` form must be a real calendar date; for the
    separated forms only the month is range-checked.
    """
    if not text or not isinstance(text, str):
        return ""

    if (year_only := _YEAR_ONLY_PATTERN.fullmatch(text)) is not None:
        return year_only.group(1)

    if (compact := _DATE_YYYYMMDD_PATTERN.fullmatch(text)) is not None:
        yyyy, mm, dd = (int(part) for part in compact.groups())
        try:
            datetime(yyyy, mm, dd)
        except ValueError:
            return ""
        return f"{yyyy:04d}.{mm:02d}"

    separated = _DATE_WITH_MONTH_PATTERN.fullmatch(text)
    if separated is None:
        return ""

    yyyy = int(separated.group(1))
    mm = int(separated.group(2))
    return f"{yyyy:04d}.{mm:02d}" if 1 <= mm <= 12 else ""
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def extract_ssid(text: str | int) -> int | None:
    """Extract the first SSID (an 8-digit number) from *text*.

    Rules:
    - it must be a standalone digit block (no letter or digit directly
      before or after it)
    - 8-digit numbers that read as a date are skipped
    - the first digit must not be 0
    - the first two digits must not be 17-29 (17/18/19 and everything
      starting with 2)
    - it must not start with 7

    Returns the SSID as an int, or None when nothing qualifies. An int
    argument is checked as a whole against the same rules.
    """
    if isinstance(text, int):
        digits = str(text)
        acceptable = (
            10000000 <= text <= 99999999
            and not is_valid_date(digits)
            and not _SSID_FORBIDDEN_PREFIX_PATTERN.fullmatch(digits)
        )
        return text if acceptable else None

    if not text or not isinstance(text, str):
        return None

    for found in _SSID_PATTERN.finditer(text):
        digits = found.group()
        if is_valid_date(digits):
            continue
        if len(digits) != 8 or digits[0] == "0":
            continue
        if _SSID_FORBIDDEN_PREFIX_PATTERN.fullmatch(digits):
            continue
        return int(digits)

    return None
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def extract_isbn(text: str) -> str:
    """Extract the first valid ISBN from *text* and return it as ISBN-13.

    Supported input:
    - ISBN-10 (the last character may be X/x)
    - ISBN-13
    - text containing spaces, hyphens or "·" separators (removed first)

    For a candidate that does not validate as written, one repair is tried:
    a 10-character candidate is retried with a "978" prefix, a 13-digit
    candidate starting with "978" is retried without that prefix, and one
    starting with "987" is retried with the prefix changed to "978".

    Returns "" when nothing validates or the input is empty / not a string.
    """
    if not text or not isinstance(text, str):
        return ""

    compact = _ISBN_SEPARATOR_PATTERN.sub("", text)

    for raw in _ISBN_CANDIDATE_PATTERN.findall(compact):
        # Normalise a lower-case check character before parsing.
        candidate = raw[:-1] + "X" if raw[-1] == "x" else raw
        try:
            parsed = ISBN(candidate)
            if parsed.is_valid:
                return parsed.as_isbn13

            # Candidates are always 10 or 13 characters long (see the pattern).
            if len(candidate) == 10:
                retry = ISBN("978" + candidate)
            elif candidate.startswith("978"):
                retry = ISBN(candidate[3:])
            elif candidate.startswith("987"):
                retry = ISBN(f"978{candidate[3:]}")
            else:
                continue

            if retry.is_valid:
                return retry.as_isbn13
        except Exception:
            # Any failure while parsing a candidate just moves on to the next one.
            continue

    return ""
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def extract_filetype(filename: str | Path) -> str | None:
|
|
174
|
+
"""
|
|
175
|
+
从文件名提取合法扩展名并返回小写。
|
|
176
|
+
|
|
177
|
+
规则:
|
|
178
|
+
- 只取最后一段后缀(如 tar.gz 返回 gz)
|
|
179
|
+
- 仅允许字母和数字
|
|
180
|
+
- 长度需小于 5
|
|
181
|
+
"""
|
|
182
|
+
suffix = Path(filename).suffix
|
|
183
|
+
if not suffix:
|
|
184
|
+
return None
|
|
185
|
+
|
|
186
|
+
ext = suffix[1:].lower()
|
|
187
|
+
if len(ext) >= 5:
|
|
188
|
+
return None
|
|
189
|
+
|
|
190
|
+
if re.fullmatch(r"[A-Za-z0-9]+", ext):
|
|
191
|
+
return ext.lower()
|
|
192
|
+
|
|
193
|
+
return None
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def is_ssid_or_isbn(text: str) -> bool:
    """Return True if *text* contains a recognisable SSID or ISBN."""
    if extract_ssid(text):
        return True
    return bool(extract_isbn(text))
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def _strip_isbn_span(title: str, isbn: str) -> str:
    """Cut out the stretch of *title* that extract_isbn() resolved to *isbn*."""
    # extract_isbn() searches the title with separators removed, so do the
    # same here while remembering where each kept character came from.
    kept = [
        (pos, ch)
        for pos, ch in enumerate(title)
        if not _ISBN_SEPARATOR_PATTERN.fullmatch(ch)
    ]
    compact = "".join(ch for _, ch in kept)

    for found in _ISBN_CANDIDATE_PATTERN.finditer(compact):
        if extract_isbn(found.group(1)) != isbn:
            continue
        start = kept[found.start()][0]
        stop = kept[found.end() - 1][0] + 1
        return title[:start] + title[stop:]

    return title


def normalize_title(title: str) -> str:
    """Normalise a title string.

    Steps:
    - trim and collapse redundant whitespace
    - drop a trailing common e-book / archive extension
    - remove a recognisable SSID and ISBN
    - turn underscores and "│" bars into spaces

    When the ISBN found by extract_isbn() is not literally present in the
    title (ISBN-10, or an ISBN written with hyphens or spaces), only the
    characters that make up that ISBN are removed. The rest of the title,
    including its spaces and hyphens, is kept.
    """
    title = title.strip()
    title = _MULTI_SPACE_PATTERN.sub(" ", title)
    title = _TITLE_EXT_PATTERN.sub("", title)

    ssid = extract_ssid(title)
    if ssid is not None:
        title = title.replace(str(ssid), "")

    isbn = extract_isbn(title)
    if isbn:
        if isbn in title:
            title = title.replace(isbn, "")
        else:
            title = _strip_isbn_span(title, isbn)

    title = title.replace("_", " ").replace("│", " ")
    title = _MULTI_SPACE_PATTERN.sub(" ", title).strip()

    return title
|