re-common 10.0.26__py3-none-any.whl → 10.0.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,100 @@
1
+ import base64
2
+ import hashlib
3
+
4
+ """
5
+ VIP编码lngid生成
6
+ """
7
+
8
+
9
class BaseLngid(object):
    """Build and reverse VIP-style ``lngid`` identifiers.

    Short ids are stored with a custom, reversible base32 variant; long ids
    are replaced by an upper-case MD5 hex digest, which cannot be reversed.
    """

    # Base32 padding runs and the single character that stands in for each.
    # The digits 0, 1, 8 and 9 are not part of the base32 alphabet (A-Z, 2-7),
    # so a trailing marker can never be confused with payload.
    _PADDING_MARKERS = (('======', '0'), ('====', '1'), ('===', '8'), ('=', '9'))

    # Mirrors the 36-character alphabet end-to-end. The mapping is its own
    # inverse, so the same table is used for encoding and decoding.
    _MIRROR_TABLE = str.maketrans(
        '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ',
        'ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210',
    )

    def __init__(self):
        # No per-instance state; the original spelled this ``__int__``, which
        # defined an unusable int-conversion hook instead of an initializer.
        pass

    def BaseEncodeID(self, strRaw):
        """Encode ``strRaw`` with the custom base32 variant.

        :param strRaw: text to encode (UTF-8 encoded before base32).
        :return: base32 text whose ``=`` padding is replaced by one marker
            character and whose characters are then mirrored.
        """
        encoded = base64.b32encode(strRaw.encode('utf8')).decode('utf8')

        # Longest padding first so '======' is not matched as '='.
        for padding, marker in BaseLngid._PADDING_MARKERS:
            if encoded.endswith(padding):
                encoded = encoded[:-len(padding)] + marker
                break

        return encoded.translate(BaseLngid._MIRROR_TABLE)

    def BaseDecodeID(self, strEncode):
        """Reverse :meth:`BaseEncodeID`.

        :param strEncode: text produced by :meth:`BaseEncodeID`.
        :return: the original text.
        :raises ValueError: from ``base64.b32decode`` / UTF-8 decoding when
            the input is not a valid encoded id.
        """
        restored = strEncode.translate(BaseLngid._MIRROR_TABLE)

        # Put the real base32 padding back in place of the marker character.
        for padding, marker in BaseLngid._PADDING_MARKERS:
            if restored.endswith(marker):
                restored = restored[:-1] + padding
                break

        return base64.b32decode(restored.encode('utf8')).decode('utf8')

    def GetLngid(self, sub_db_id, rawid, case_insensitive=False):
        """Derive an lngid from ``sub_db_id`` and ``rawid``.

        :param sub_db_id: prefix placed in front of the encoded id.
        :param rawid: raw id from the source site.
        :param case_insensitive: despite the name, ``True`` means the case of
            ``rawid`` is significant: every character that changes under
            ``upper()`` is upper-cased and followed by ``'_'`` so that the
            case can be recovered. ``False`` simply upper-cases ``rawid``.
        :return: ``sub_db_id`` followed by the custom-base32 form of the
            normalised id, or by its upper-case MD5 hex digest when the
            normalised id is longer than 20 characters.
        """
        if case_insensitive:
            uppercase_rawid = ''.join(
                ch if ch.upper() == ch else ch.upper() + '_' for ch in rawid
            )
        else:
            uppercase_rawid = rawid.upper()

        if len(uppercase_rawid) > 20:
            # Too long to store reversibly; fall back to a fixed-size digest.
            limited_id = hashlib.md5(uppercase_rawid.encode('utf8')).hexdigest().upper()
        else:
            limited_id = self.BaseEncodeID(uppercase_rawid)

        return sub_db_id + limited_id

    def getDoiid(self, doi, case_insensitive=False):
        """Return the encoded id for a DOI.

        :param doi: DOI text.
        :param case_insensitive: when exactly ``False`` the DOI is upper-cased
            first; any other value leaves it untouched.
        :return: the custom-base32 form of the DOI, or the upper-case MD5 hex
            digest of the DOI when that form is longer than 240 characters.
        """
        if case_insensitive is False:
            doi = doi.upper()
        limited_id = self.BaseEncodeID(doi)
        if len(limited_id) > 240:
            limited_id = hashlib.md5(doi.encode('utf8')).hexdigest().upper()
        return limited_id

    def GetRawid(self, limited_id, case_insensitive=False):
        """Recover the raw id from the encoded part of an lngid.

        :param limited_id: encoded id without the ``sub_db_id`` prefix.
        :param case_insensitive: pass the same value that was given to
            :meth:`GetLngid`. When true, an upper-case character followed by
            ``'_'`` is turned back into its lower-case form.
        :return: the decoded id. A raw id that itself contained an upper-case
            letter directly followed by ``'_'`` is ambiguous in this scheme
            and is decoded as the lower-case letter.
        :raises Exception: when ``limited_id`` cannot be decoded, e.g. because
            it is an MD5 digest of an id longer than 20 characters.
        """
        try:
            decoded = self.BaseDecodeID(limited_id)
        except Exception as e:
            raise Exception("长度超过20,不可逆") from e

        if not case_insensitive:
            return decoded

        # Single left-to-right pass. Only a character that has a distinct
        # lower-case form can carry the '_' case marker; any other '_' is a
        # literal character of the raw id and is kept.
        pieces = []
        pos = 0
        size = len(decoded)
        while pos < size:
            ch = decoded[pos]
            has_marker = pos + 1 < size and decoded[pos + 1] == '_'
            if has_marker and ch.lower() != ch:
                pieces.append(ch.lower())
                pos += 2
            else:
                pieces.append(ch)
                pos += 1
        return ''.join(pieces)
@@ -0,0 +1,116 @@
1
+ import base64
2
+ import hashlib
3
+ import os
4
+
5
+ from re_common.v2.baselibrary.business_utils.baseencodeid import BaseLngid
6
+
7
+ import os
8
+ import base64
9
+ import hashlib
10
+
11
+ """
12
+ DOI-文件路径 转换工具
13
+
14
+ 设计目标:
15
+ 1. 将任意DOI字符串转换为可逆、稳定的文件路径
16
+ 2. 提供高效的目录分散方案(65,536个子目录)
17
+ 3. 支持带文件扩展名的存储
18
+ 4. 完全可逆转换
19
+
20
+ 工作原理:
21
+ 1. DOI编码:
22
+ - 使用URL安全的Base64编码(RFC 3548)
23
+ - 移除Base64填充的'='字符
24
+ - 文件名长度 ≈ 原始DOI长度 × 4/3
25
+
26
+ 2. 目录分散:
27
+ - 使用MD5哈希创建两级目录结构
28
+ - 目录层级:/MD5[0:2]/MD5[2:4]/
29
+ - 支持65,536个目录(256×256),每目录约15,259个文件(假设10亿文件)
30
+
31
+ 3. 扩展名处理:
32
+ - 保持原始扩展名不变
33
+ - 解码时自动忽略扩展名
34
+
35
+ 典型转换示例:
36
+ DOI: "10.1000/xyz123" -> 路径: "a1/b2/MTAuMTAwMC94eXoxMjM.pdf"
37
+ 路径: "a1/b2/MTAuMTAwMC94eXoxMjM.pdf" -> DOI: "10.1000/xyz123"
38
+ """
39
+
40
+ base_lngid = BaseLngid()
41
+
42
+
43
+ # 以后需要启用
44
def doi_to_path(doi: str, ext: str = "") -> str:
    """Turn a DOI into a reversible relative storage path.

    The file name is the URL-safe Base64 form of the DOI with the trailing
    ``=`` padding removed, followed by ``ext``. The two directory levels are
    the first and second pair of hex digits of the DOI's MD5 digest, which
    spreads files over directories.

    Args:
        doi: DOI identifying the file.
        ext: File extension to append, e.g. ``'.pdf'``.

    Returns:
        Relative path joined with ``os.path.join``; the file-name part for
        ``'10.1000/xyz123'`` with ``'.pdf'`` is ``'MTAuMTAwMC94eXoxMjM.pdf'``.
    """
    payload = doi.encode("utf-8")

    # Reversible part: Base64 without padding becomes the file name.
    encoded_name = base64.urlsafe_b64encode(payload).decode("ascii").rstrip("=")

    # Non-reversible part: digest prefix picks the two directory levels.
    digest = hashlib.md5(payload).hexdigest()
    level_one, level_two = digest[:2], digest[2:4]

    return os.path.join(level_one, level_two, f"{encoded_name}{ext}")
68
+
69
+
70
+ # 以后需要启用
71
def path_to_doi(path: str) -> str:
    """Recover the original DOI from a path built by ``doi_to_path``.

    Args:
        path: File path whose final component is the URL-safe Base64 form of
            the DOI, optionally followed by an extension
            (e.g. ``'a1/b2/MTAuMTAwMC94eXoxMjM.pdf'``).

    Returns:
        The decoded DOI string.

    Raises:
        ValueError: If the file name is not valid Base64 / UTF-8
            (``binascii.Error`` and ``UnicodeDecodeError`` are subclasses).
    """
    filename = os.path.basename(path)

    # URL-safe Base64 never contains '.', so everything from the first dot on
    # is extension. Unlike os.path.splitext this also handles multi-part
    # extensions ('.tar.gz') and an empty encoded name ('.pdf').
    encoded = filename.partition(".")[0]

    # Restore the '=' padding that was stripped when the path was built.
    encoded += "=" * (-len(encoded) % 4)

    return base64.urlsafe_b64decode(encoded.encode("ascii")).decode("utf-8")
91
+
92
+
93
def doi_to_dir(doi):
    """Return the two-level directory used to store the file for a DOI.

    Args:
        doi (str): Unique DOI of the file.

    Returns:
        str: ``"AB/CD"`` where ``AB`` and ``CD`` are the first and second pair
        of hex digits of the DOI's MD5 digest, in upper case.
    """
    # Only the first four hex digits of the digest are needed.
    prefix = hashlib.md5(doi.encode('utf-8')).hexdigest()[:4].upper()

    return "/".join((prefix[:2], prefix[2:]))
110
+
111
+
112
def get_doi_path(doi):
    """Return the relative PDF path for a DOI (the scheme currently in use).

    The directory comes from ``doi_to_dir`` and the file name is
    ``base_lngid.getDoiid(doi)`` with a ``.pdf`` suffix, joined with ``/``.
    """
    directory = doi_to_dir(doi)
    pdf_name = base_lngid.getDoiid(doi) + ".pdf"
    return "/".join((directory, pdf_name))
@@ -185,6 +185,7 @@ def AuthorRatio(
185
185
  # 处理后是否为空字符串,如果有 返回0
186
186
  if not s1 or not s2:
187
187
  return 0
188
+ # get_diacritic_variant(unidecode(strs)) 更激进,会丢失非拉丁字符和原文信息,适合需要把多语言文本转换成 ASCII 拼音的场景。
188
189
  # 处理音标问题
189
190
  s1 = get_diacritic_variant(unidecode(s1))
190
191
  s2 = get_diacritic_variant(unidecode(s2))
@@ -9,7 +9,7 @@ from re_common.v2.baselibrary.utils.stringutils import (
9
9
  bj2qj,
10
10
  get_diacritic_variant,
11
11
  clean_html,
12
- remove_spaces_between_chinese_characters, clean_unicode_alnum,
12
+ remove_spaces_between_chinese_characters, clean_unicode_alnum, normalize_nfkc,
13
13
  )
14
14
 
15
15
 
@@ -112,10 +112,18 @@ class StringClear(object):
112
112
  return self
113
113
 
114
114
def remove_diacritics(self):
    """Replace accented characters with their plain letters.

    Key difference from ``clear_nkfc``: ligatures are not expanded here.
    The work is delegated to ``get_diacritic_variant``.

    Returns the instance itself so calls can be chained.
    """
    stripped = get_diacritic_variant(self.obj_str)
    self.obj_str = stripped
    return self
118
121
 
122
def clear_nkfc(self):
    """Run the held string through ``normalize_nfkc``.

    Returns the instance itself so calls can be chained.
    """
    normalized = normalize_nfkc(self.obj_str)
    self.obj_str = normalized
    return self
125
+
126
+
119
127
  def remove_brackets(self):
120
128
  # 移除 方括号里面的内容
121
129
  self.obj_str = re.sub("\\[.*?]", "", self.obj_str)
@@ -148,6 +156,12 @@ class StringClear(object):
148
156
  self.obj_str = unquote(self.obj_str)
149
157
  return self
150
158
 
159
def ascii_text(self):
    """Drop every character outside printable ASCII.

    Only code points from space (32) through ``~`` (126) are kept.
    Returns the instance itself so calls can be chained.
    """
    printable = range(32, 127)
    kept = [ch for ch in self.obj_str if ord(ch) in printable]
    self.obj_str = ''.join(kept)
    return self
163
+
164
+
151
165
  def get_str(self):
152
166
  return self.obj_str
153
167
 
@@ -6,6 +6,7 @@ import regex
6
6
  import unicodedata
7
7
  from html.parser import HTMLParser
8
8
 
9
+ from unidecode import unidecode
9
10
 
10
11
  from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
11
12
 
@@ -59,7 +60,27 @@ def qj2bj(src):
59
60
  return ''.join(buf)
60
61
 
61
62
 
63
+ """
64
+ 总结对比表
65
+ 规范名 处理步骤 组合方式 兼容性归一化 主要用途
66
+ NFC 规范分解 → 规范组合 组合 否 保留预组合字符,文本呈现和存储
67
+ NFD 规范分解 不组合 否 拆解字符,便于逐字符处理
68
+ NFKC 兼容性分解 → 规范组合 组合 是 消除兼容差异,文本比较和索引
69
+ NFKD 兼容性分解 → 规范分解 不组合 是 最大程度拆解,文本分析和预处理
70
+ """
71
+
72
+
62
73
  def get_diacritic_variant(char1):
74
+ """
75
+ NFD: 规范分解(Normalization Form D)
76
+ 把字符拆分为基本字符 + 变音符号
77
+
78
+ 但不处理兼容字符(如连字)
79
+
80
+ print(unicodedata.normalize('NFD', 'é')) # 输出: 'é'(e + 组合符号) # 这里看起来是1个字符 len 其实是2
81
+ print(unicodedata.normalize('NFD', 'fl')) # 输出: 'fl'(不变化)
82
+
83
+ """
63
84
  # 将字符转换为标准的 Unicode 形式
64
85
  normalized_char1 = unicodedata.normalize('NFD', char1)
65
86
 
@@ -70,6 +91,19 @@ def get_diacritic_variant(char1):
70
91
  return base_char1
71
92
 
72
93
 
94
def normalize_nfkc(strs: str) -> str:
    """Strip surrounding whitespace and apply Unicode NFKC normalization.

    NFKC (Normalization Form Compatibility Composition) maps compatibility
    characters such as ligatures, circled digits and full-width letters to
    their standard forms, and composes base characters with combining marks
    into single code points. For example:

    - U+FB02 (the "fl" ligature) becomes the two letters ``fl``
    - U+2460 (circled digit one) becomes ``1``
    - U+FF21 (full-width ``A``) becomes ``A``
    - ``e`` followed by U+0301 becomes the single character U+00E9
    """
    trimmed = strs.strip()
    return unicodedata.normalize('NFKC', trimmed)
105
+
106
+
73
107
  def get_alphabetic_ratio(text: str) -> float:
74
108
  # 返回字母型字符所占比例
75
109
  if not text:
@@ -158,9 +192,9 @@ def remove_spaces_between_chinese_characters(text):
158
192
  return re.sub(pattern, '', text)
159
193
 
160
194
 
161
-
162
195
  sim_utils = JaroDamerauLevenshteinMaxSim()
163
196
 
197
+
164
198
  def group_similar_texts(texts, threshold=0.9):
165
199
  """根据相似度对文本进行分组"""
166
200
  from re_common.v2.baselibrary.utils.string_clear import rel_clear
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 10.0.26
3
+ Version: 10.0.28
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic
@@ -165,6 +165,8 @@ re_common/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
165
165
  re_common/v2/baselibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
166
166
  re_common/v2/baselibrary/business_utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
167
167
  re_common/v2/baselibrary/business_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
168
+ re_common/v2/baselibrary/business_utils/baseencodeid.py,sha256=3f52e0jtgCFzPEyReia8TupwiE64t_VyBT-a7uQCXAY,3595
169
+ re_common/v2/baselibrary/business_utils/full_doi_path.py,sha256=PaMIrgDWWt_fzSFyvvDD-8CcYZJTNo6Pj-uR0WafNbY,3319
168
170
  re_common/v2/baselibrary/business_utils/rel_tools.py,sha256=LfnGFCkUSxg1SHvOMOQdP1PiHxIKqk7Syuk5YYpjJag,295
169
171
  re_common/v2/baselibrary/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
170
172
  re_common/v2/baselibrary/decorators/utils.py,sha256=Q4D6KKCQxvNBXZkPQQn14keKKJpGtg8TUSakjJU40s0,2056
@@ -189,7 +191,7 @@ re_common/v2/baselibrary/tools/data_processer/data_writer.py,sha256=OgKZ06zRJYNx
189
191
  re_common/v2/baselibrary/utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
190
192
  re_common/v2/baselibrary/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
191
193
  re_common/v2/baselibrary/utils/api_net_utils.py,sha256=22q3WMWiKVg1IVGr4y2D1JrjhnbQtlChRDJm2S8rGlc,9868
192
- re_common/v2/baselibrary/utils/author_smi.py,sha256=1ebH3AHv19jtJWdlqNdwu6t58HNVLCotuCB6ff1SWiw,13666
194
+ re_common/v2/baselibrary/utils/author_smi.py,sha256=Fh276u7za-GF_tK9xpBFYF5q1E3_tX22ZouWC8U7w8o,13831
193
195
  re_common/v2/baselibrary/utils/base_string_similarity.py,sha256=a40a79ttwoG_gC_hxMNB-sMXXecgICoRDWrj0DW8iEE,7749
194
196
  re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
195
197
  re_common/v2/baselibrary/utils/basehdfs.py,sha256=TPwFct_-UrmO1KCbo4gpV77rsnlCQDumNBbQKL0ZI9o,5953
@@ -200,9 +202,9 @@ re_common/v2/baselibrary/utils/json_cls.py,sha256=M93piYtmgm_wP8E57culTrd_AhHLoG
200
202
  re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
201
203
  re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
202
204
  re_common/v2/baselibrary/utils/string_bool.py,sha256=vxnjSFOfuHWGxkqaIbUNn21opx5tfV1uCXSahFfp1mU,6197
203
- re_common/v2/baselibrary/utils/string_clear.py,sha256=ywYR1KrKQyeM-zJgvTmORlfgbLdRSjWWKPe7K8oRx_8,7450
205
+ re_common/v2/baselibrary/utils/string_clear.py,sha256=Ympa0Cs2y_72QeeyMS8de8y_QgtEFJJQ0AgHnylbMUc,7861
204
206
  re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
205
- re_common/v2/baselibrary/utils/stringutils.py,sha256=TI6fw3km1l25ufXrnG6ha8dSBDtRh-MF4nWRt9u8Xbo,6452
207
+ re_common/v2/baselibrary/utils/stringutils.py,sha256=eeuQYgXkWJ9apvyrYPcCCU3biTY9nD1KHos4_1ESNJE,7883
206
208
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
207
209
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
208
210
  re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -229,8 +231,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
229
231
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
230
232
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
231
233
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
232
- re_common-10.0.26.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
233
- re_common-10.0.26.dist-info/METADATA,sha256=kHLVPF-e0PjpnUL7dN9pAMqK_pw4yHwZGKxbJ_zlAY0,582
234
- re_common-10.0.26.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
235
- re_common-10.0.26.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
236
- re_common-10.0.26.dist-info/RECORD,,
234
+ re_common-10.0.28.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
235
+ re_common-10.0.28.dist-info/METADATA,sha256=6LyRvl5fLSmKd4qNyZFc72DkO8_hyJ6FQ27dba4PEvc,582
236
+ re_common-10.0.28.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
237
+ re_common-10.0.28.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
238
+ re_common-10.0.28.dist-info/RECORD,,