cfunbook 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,10 @@
1
+ Metadata-Version: 2.3
2
+ Name: cfunbook
3
+ Version: 0.1.0
4
+ Summary: Add your description here
5
+ Author: zscqsmy
6
+ Author-email: zscqsmy <1228075512@qq.com>
7
+ Requires-Dist: mneia-isbn>=0.0.3
8
+ Requires-Python: >=3.12
9
+ Description-Content-Type: text/markdown
10
+
File without changes
@@ -0,0 +1,16 @@
1
+ [project]
2
+ name = "cfunbook"
3
+ version = "0.1.0"
4
+ description = "Add your description here"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "zscqsmy", email = "1228075512@qq.com" }
8
+ ]
9
+ requires-python = ">=3.12"
10
+ dependencies = [
11
+ "mneia-isbn>=0.0.3",
12
+ ]
13
+
14
+ [build-system]
15
+ requires = ["uv_build>=0.9.18,<0.10.0"]
16
+ build-backend = "uv_build"
@@ -0,0 +1,3 @@
1
+ from .base import extract_isbn, extract_ssid, extract_year_month
2
+
3
+ __all__ = ["extract_isbn", "extract_year_month", "extract_ssid"]
@@ -0,0 +1,234 @@
1
+ import re
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+
5
+ from mneia_isbn import ISBN
6
+
7
# A stand-alone run of exactly eight digits: no ASCII letter or digit may touch it.
_SSID_PATTERN = re.compile(r"(?<![0-9a-zA-Z])\d{8}(?![0-9a-zA-Z])")
# Eight-digit values starting with 17-19, 2x or 7x are never accepted as SSIDs.
_SSID_FORBIDDEN_PREFIX_PATTERN = re.compile(r"^(1[7-9]|2\d|7\d)\d{6}$")
# ISBN-10 (check character may be X/x) or ISBN-13, not embedded in a longer digit run.
_ISBN_CANDIDATE_PATTERN = re.compile(r"(?<!\d)(\d{9}[0-9Xx]|\d{13})(?!\d)")
# Characters stripped from the text before ISBN candidates are searched.
_ISBN_SEPARATOR_PATTERN = re.compile(r"[-·\s]")
# Any run of whitespace (collapsed to a single space by callers).
_MULTI_SPACE_PATTERN = re.compile(r"\s+")
# Trailing e-book / archive extension. The dot inside "tar.gz" is escaped so
# that only a literal ".tar.gz" is stripped (previously ".tarXgz" matched too).
_TITLE_EXT_PATTERN = re.compile(
    r"\.(pdf|epub|mobi|txt|docx|doc|fb2|djvu|tar\.gz|rar|zip|uvz|azw3|7z|pdz|caj)$",
    flags=re.IGNORECASE,
)
# A bare four-digit year, optionally followed by "年".
_YEAR_ONLY_PATTERN = re.compile(r"^\s*(\d{4})\s*(?:年)?\s*$")
# Year and month with a separator, optionally followed by a day
# ("2020-03-15", "2020年3月15日") or by a bare trailing "月" ("2020年3月").
# Group 1 is the year, group 2 the month; the day is matched but not captured.
_DATE_WITH_MONTH_PATTERN = re.compile(
    r"^\s*(\d{4})\s*[年\-\./]\s*(\d{1,2})"
    r"(?:\s*[月\-\./]\s*\d{1,2}\s*(?:日)?|\s*月)?\s*$"
)
# Compact eight-digit date: groups are year, month, day.
_DATE_YYYYMMDD_PATTERN = re.compile(r"^\s*(\d{4})(\d{2})(\d{2})\s*$")
19
+
20
+
21
def is_valid_date(text: str) -> bool:
    """Return whether an eight-digit string can be read as a calendar date.

    Two layouts are tried in order:

    - ``YYYYMMDD`` with the year in 1700..2950
    - ``YYYYDDMM`` (rare day-before-month spelling) with the year in 1900..2100

    Args:
        text: Candidate string. Anything that is not a ``str`` of exactly
            eight decimal digits is rejected, so inputs such as
            ``"2020 1 1"`` or ``"2020+1+1"`` (which ``int()`` would happily
            parse piece by piece) are no longer mistaken for dates.

    Returns:
        True if either layout names a real calendar day, otherwise False.
    """
    if not isinstance(text, str) or len(text) != 8 or not text.isdecimal():
        return False

    year = int(text[:4])
    first_pair = int(text[4:6])
    second_pair = int(text[6:8])

    # (lowest year, highest year, month, day) for each supported layout.
    layouts = (
        (1700, 2950, first_pair, second_pair),  # YYYYMMDD
        (1900, 2100, second_pair, first_pair),  # YYYYDDMM
    )
    for min_year, max_year, month, day in layouts:
        if not min_year <= year <= max_year:
            continue
        try:
            # datetime() rejects out-of-range months/days and impossible
            # dates such as 30 February with ValueError.
            datetime(year, month, day)
        except ValueError:
            continue
        return True

    return False
53
+
54
+
55
def extract_year_month(text: str) -> str:
    """Normalise a date-like string to ``YYYY.MM`` or ``YYYY``.

    Recognised inputs:

    - a bare year (``YYYY`` or ``YYYY年``) -> the four matched year digits
    - a compact ``YYYYMMDD`` date -> ``YYYY.MM``; the full date must be a
      real calendar day
    - year and month joined by a separator, as accepted by
      ``_DATE_WITH_MONTH_PATTERN`` (e.g. ``YYYY-MM-DD``, ``YYYY/MM/DD``,
      ``YYYY.MM.DD``, ``YYYY年MM月DD日``) -> ``YYYY.MM``; only the month
      range is checked here, the day is not validated

    Single-digit months are zero-padded.

    Returns:
        The normalised string, or ``""`` when *text* is empty, not a
        ``str``, or not recognised.
    """
    if not (text and isinstance(text, str)):
        return ""

    if (year_only := _YEAR_ONLY_PATTERN.fullmatch(text)) is not None:
        return year_only.group(1)

    if (compact := _DATE_YYYYMMDD_PATTERN.fullmatch(text)) is not None:
        yyyy, mm, dd = (int(part) for part in compact.groups())
        try:
            # Used purely as a validity check for the whole date.
            datetime(yyyy, mm, dd)
        except ValueError:
            return ""
        return f"{yyyy:04d}.{mm:02d}"

    separated = _DATE_WITH_MONTH_PATTERN.fullmatch(text)
    if separated is None:
        return ""

    yyyy = int(separated.group(1))
    mm = int(separated.group(2))
    return f"{yyyy:04d}.{mm:02d}" if 1 <= mm <= 12 else ""
94
+
95
+
96
def extract_ssid(text: str | int) -> int | None:
    """Extract the first SSID (an eight-digit number) from *text*.

    A digit run qualifies only when all of the following hold:

    - it is a stand-alone block (no letter or digit directly before or after)
    - ``is_valid_date`` does not recognise it as a date
    - its first digit is not ``0``
    - it does not match ``_SSID_FORBIDDEN_PREFIX_PATTERN``
      (leading ``17``-``19``, any leading ``2``, any leading ``7``)

    An ``int`` argument is checked as a single candidate instead of being
    searched.

    Returns:
        The SSID as an ``int``, or ``None`` when nothing qualifies.
    """
    if isinstance(text, int):
        digits = str(text)
        # The numeric range enforces "eight digits, no leading zero".
        if not 10000000 <= text <= 99999999:
            return None
        if is_valid_date(digits) or _SSID_FORBIDDEN_PREFIX_PATTERN.fullmatch(digits):
            return None
        return text

    if not (text and isinstance(text, str)):
        return None

    for found in _SSID_PATTERN.finditer(text):
        digits = found.group()
        if len(digits) != 8 or digits[0] == "0":
            continue
        if is_valid_date(digits) or _SSID_FORBIDDEN_PREFIX_PATTERN.fullmatch(digits):
            continue
        return int(digits)

    return None
125
+
126
+
127
def extract_isbn(text: str) -> str:
    """Extract the first valid ISBN from *text* and return it as ISBN-13.

    Hyphens, middle dots and whitespace are removed from the whole text
    first; every 10-character (last character may be ``X``/``x``) or
    13-digit run is then tried in order. When a candidate is not valid as
    written, one repair is attempted:

    - 10 characters: prefix it with ``978``
    - 13 digits starting with ``978``: drop the prefix and retry the rest
    - 13 digits starting with ``987``: replace the prefix with ``978``

    Any exception raised while checking a candidate abandons that candidate
    and moves on to the next one.

    Returns:
        The value of ``as_isbn13`` for the first valid candidate, or ``""``
        when *text* is empty, not a ``str``, or holds no valid ISBN.
    """
    if not (text and isinstance(text, str)):
        return ""

    compact = _ISBN_SEPARATOR_PATTERN.sub("", text)

    for found in _ISBN_CANDIDATE_PATTERN.finditer(compact):
        digits = found.group(1)
        if digits.endswith("x"):
            # Normalise a lowercase check character.
            digits = f"{digits[:-1]}X"

        # The candidate as written, followed by at most one repaired form.
        attempts = [digits]
        if len(digits) == 10:
            attempts.append("978" + digits)
        elif len(digits) == 13 and digits.startswith("978"):
            attempts.append(digits[3:])
        elif len(digits) == 13 and digits.startswith("987"):
            attempts.append(f"978{digits[3:]}")

        try:
            for attempt in attempts:
                parsed = ISBN(attempt)
                if parsed.is_valid:
                    return parsed.as_isbn13
        except Exception:
            # Best effort: a failure on this candidate skips to the next one.
            continue

    return ""
171
+
172
+
173
+ def extract_filetype(filename: str | Path) -> str | None:
174
+ """
175
+ 从文件名提取合法扩展名并返回小写。
176
+
177
+ 规则:
178
+ - 只取最后一段后缀(如 tar.gz 返回 gz)
179
+ - 仅允许字母和数字
180
+ - 长度需小于 5
181
+ """
182
+ suffix = Path(filename).suffix
183
+ if not suffix:
184
+ return None
185
+
186
+ ext = suffix[1:].lower()
187
+ if len(ext) >= 5:
188
+ return None
189
+
190
+ if re.fullmatch(r"[A-Za-z0-9]+", ext):
191
+ return ext.lower()
192
+
193
+ return None
194
+
195
+
196
def is_ssid_or_isbn(text: str) -> bool:
    """Return True when ``extract_ssid`` or ``extract_isbn`` finds something in *text*."""
    if extract_ssid(text):
        return True
    return bool(extract_isbn(text))
201
+
202
+
203
def normalize_title(title: str) -> str:
    """Clean up a book title taken from a file name.

    Steps, in order:

    1. trim the title and collapse whitespace runs to single spaces
    2. drop a trailing e-book / archive extension
    3. remove the SSID found by ``extract_ssid``, if any
    4. remove the ISBN found by ``extract_isbn``, if any
    5. turn underscores and ``│`` into spaces, then collapse and trim again

    Only the identifier itself is removed. The SSID is deleted where it
    stands alone (same boundary rule as the SSID search), so digits inside a
    longer number are left intact. The ISBN is deleted as a span of its
    characters with optional separators between them, so the rest of the
    title keeps its spacing and characters. If the ISBN text cannot be
    located in the title (for example because it was written as ISBN-10),
    the title is left unchanged by step 4.

    Args:
        title: Raw title text.

    Returns:
        The cleaned title, or ``""`` when *title* is empty or not a ``str``
        (mirroring the guards of the extract_* helpers).
    """
    if not title or not isinstance(title, str):
        return ""

    title = _MULTI_SPACE_PATTERN.sub(" ", title.strip())
    title = _TITLE_EXT_PATTERN.sub("", title)

    ssid = extract_ssid(title)
    if ssid is not None:
        # Boundary-aware removal: a plain substring replace could cut the
        # same eight digits out of a longer number such as an ISBN.
        title = re.sub(rf"(?<![0-9a-zA-Z]){ssid}(?![0-9a-zA-Z])", "", title)

    isbn = extract_isbn(title)
    if isbn:
        isbn_chars = [re.escape(ch) for ch in isbn if ch.isalnum()]
        if isbn_chars:
            # Separators are tolerated only *between* the ISBN characters, so
            # spaces, dashes and the CJK character "一" elsewhere in the title
            # are preserved.
            isbn_span = r"[-—一·\s]*".join(isbn_chars)
            title = re.sub(isbn_span, "", title)

    title = title.replace("_", " ").replace("│", " ")
    title = _MULTI_SPACE_PATTERN.sub(" ", title).strip()

    return title