re-common 10.0.22__py3-none-any.whl → 10.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +195 -0
  145. re_common/v2/baselibrary/business_utils/__init__.py +0 -0
  146. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -0
  147. re_common/v2/baselibrary/decorators/utils.py +59 -59
  148. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  149. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -79
  150. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  151. re_common/v2/baselibrary/tools/dict_tools.py +37 -37
  152. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  153. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  154. re_common/v2/baselibrary/tools/list_tools.py +65 -65
  155. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  156. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  157. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  158. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  159. re_common/v2/baselibrary/utils/author_smi.py +360 -360
  160. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  161. re_common/v2/baselibrary/utils/basedict.py +37 -37
  162. re_common/v2/baselibrary/utils/basehdfs.py +161 -161
  163. re_common/v2/baselibrary/utils/basepika.py +180 -180
  164. re_common/v2/baselibrary/utils/basetime.py +77 -77
  165. re_common/v2/baselibrary/utils/db.py +38 -38
  166. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  167. re_common/v2/baselibrary/utils/mq.py +83 -83
  168. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  169. re_common/v2/baselibrary/utils/string_bool.py +186 -149
  170. re_common/v2/baselibrary/utils/string_clear.py +227 -204
  171. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  172. re_common/v2/baselibrary/utils/stringutils.py +213 -213
  173. re_common/vip/base_step_process.py +11 -11
  174. re_common/vip/baseencodeid.py +90 -90
  175. re_common/vip/changetaskname.py +28 -28
  176. re_common/vip/core_var.py +24 -24
  177. re_common/vip/mmh3Hash.py +89 -89
  178. re_common/vip/proxy/allproxys.py +127 -127
  179. re_common/vip/proxy/allproxys_thread.py +159 -159
  180. re_common/vip/proxy/cnki_proxy.py +153 -153
  181. re_common/vip/proxy/kuaidaili.py +87 -87
  182. re_common/vip/proxy/proxy_all.py +113 -113
  183. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  184. re_common/vip/proxy/wanfang_proxy.py +152 -152
  185. re_common/vip/proxy/wp_proxy_all.py +181 -181
  186. re_common/vip/read_rawid_to_txt.py +91 -91
  187. re_common/vip/title/__init__.py +5 -5
  188. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  189. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  190. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  191. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  192. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  193. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  194. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  195. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  196. re_common/vip/title/transform/__init__.py +10 -10
  197. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/LICENSE +201 -201
  198. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/METADATA +16 -16
  199. re_common-10.0.24.dist-info/RECORD +230 -0
  200. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/WHEEL +1 -1
  201. re_common-10.0.22.dist-info/RECORD +0 -227
  202. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/top_level.txt +0 -0
@@ -1,213 +1,213 @@
1
- import re
2
- import threading
3
- from itertools import combinations
4
-
5
- import regex
6
- import unicodedata
7
- from html.parser import HTMLParser
8
-
9
-
10
- from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
11
-
12
-
13
- def bj2qj(src):
14
- if src is None:
15
- return src
16
-
17
- DBC_SPACE = ' '
18
- SBC_SPACE = ' '
19
- DBC_CHAR_START = 33
20
- DBC_CHAR_END = 126
21
- CONVERT_STEP = 65248
22
-
23
- buf = []
24
- for char in src:
25
- if char == DBC_SPACE:
26
- buf.append(SBC_SPACE)
27
- elif DBC_CHAR_START <= ord(char) <= DBC_CHAR_END:
28
- buf.append(chr(ord(char) + CONVERT_STEP))
29
- else:
30
- buf.append(char)
31
-
32
- return ''.join(buf)
33
-
34
-
35
- def qj2bj(src):
36
- """
37
- 全角转半角
38
- :param src:
39
- :return:
40
- """
41
- if src is None:
42
- return src
43
-
44
- SBC_CHAR_START = 0xFF01
45
- SBC_CHAR_END = 0xFF5E
46
- CONVERT_STEP = 0xFEE0
47
- DBC_SPACE = ' '
48
- SBC_SPACE = ' '
49
-
50
- buf = []
51
- for char in src:
52
- if SBC_CHAR_START <= ord(char) <= SBC_CHAR_END:
53
- buf.append(chr(ord(char) - CONVERT_STEP))
54
- elif char == SBC_SPACE:
55
- buf.append(DBC_SPACE)
56
- else:
57
- buf.append(char)
58
-
59
- return ''.join(buf)
60
-
61
-
62
- def get_diacritic_variant(char1):
63
- # 将字符转换为标准的 Unicode 形式
64
- normalized_char1 = unicodedata.normalize('NFD', char1)
65
-
66
- # 获取基本字符(去掉变音符号)
67
- base_char1 = ''.join(c for c in normalized_char1 if unicodedata.category(c) != 'Mn')
68
-
69
- # 判断基本字符是否相同
70
- return base_char1
71
-
72
-
73
- def get_alphabetic_ratio(text: str) -> float:
74
- # 返回字母型字符所占比例
75
- if not text:
76
- return 0
77
-
78
- text = re.sub(r'\d+', '', text)
79
-
80
- # 正则表达式匹配字母型文字(包括拉丁字母、希腊字母、西里尔字母、阿拉伯字母等)
81
- alphabetic_pattern = (
82
- r"[\u0041-\u005A\u0061-\u007A" # 拉丁字母 (A-Z, a-z)
83
- r"\u00C0-\u00FF" # 带重音符号的拉丁字母 (À-ÿ)
84
- r"\u0080–\u00FF" # 拉丁字母补充1
85
- r"\u0100–\u017F" # 拉丁字母扩展A
86
- r"\u1E00-\u1EFF" # 拉丁扩展 (Latin Extended Additional)
87
- r"\u0180-\u024F" # 拉丁扩展-B (Latin Extended-B)
88
- r"\u2C60-\u2C7F" # 拉丁扩展-C (Latin Extended Additional)
89
- r"\uA720-\uA7FF" # 拉丁扩展-D (Latin Extended Additional)
90
- r"\uAB30-\uAB6F" # 拉丁扩展-E (Latin Extended Additional)
91
- r"]"
92
- )
93
-
94
- # 使用正则表达式过滤出语言文字
95
- clean_text = regex.sub(r"[^\p{L}]", "", text)
96
-
97
- if len(clean_text) == 0:
98
- return 1.0
99
-
100
- # 匹配所有字母型字符
101
- alphabetic_chars = re.findall(alphabetic_pattern, clean_text)
102
-
103
- # 返回字母型字符所占比例
104
- return len(alphabetic_chars) / len(clean_text)
105
-
106
-
107
- class HTMLTextExtractor(HTMLParser):
108
- _thread_local = threading.local() # 线程局部存储
109
-
110
- def __init__(self):
111
- super().__init__()
112
- self.reset_state()
113
-
114
- def handle_starttag(self, tag, attrs):
115
- if tag in ('script', 'style'):
116
- self.skip = True
117
-
118
- def handle_endtag(self, tag):
119
- if tag in ('script', 'style'):
120
- self.skip = False
121
-
122
- def handle_data(self, data):
123
- if not self.skip and data.strip():
124
- self.text.append(data)
125
-
126
- def reset_state(self):
127
- self.reset()
128
- self.text = []
129
- self.skip = False
130
-
131
- def get_text(self):
132
- return ''.join(self.text).strip()
133
-
134
- @classmethod
135
- def get_parser(cls):
136
- # 每个线程获取独立实例
137
- if not hasattr(cls._thread_local, 'parser'):
138
- cls._thread_local.parser = cls()
139
- return cls._thread_local.parser
140
-
141
-
142
- def clean_html(html):
143
- parser = HTMLTextExtractor.get_parser()
144
- parser.reset_state()
145
- parser.feed(html)
146
- parser.close()
147
- return parser.get_text()
148
-
149
-
150
- def remove_spaces_between_chinese_characters(text):
151
- """
152
- 匹配中文间的空格并替换为空字符串
153
-
154
- 这里没有选取 后面的一些扩展分区 是那些分区比较分散 都写进来消耗性能,
155
- 认为只包含这些也够用了
156
- """
157
- pattern = r'(?<=[\u3400-\u9fff])\s+(?=[\u3400-\u9fff])'
158
- return re.sub(pattern, '', text)
159
-
160
-
161
-
162
- sim_utils = JaroDamerauLevenshteinMaxSim()
163
-
164
- def group_similar_texts(texts, threshold=0.9):
165
- """根据相似度对文本进行分组"""
166
- from re_common.v2.baselibrary.utils.string_clear import rel_clear
167
- n = len(texts)
168
- # 创建邻接表表示图
169
- graph = [[] for _ in range(n)]
170
- # 计算所有文本对的相似度并构建图
171
- for i, j in combinations(range(n), 2):
172
- similarity = sim_utils.get_sim(rel_clear(texts[i]), rel_clear(texts[j]))
173
- if similarity >= threshold:
174
- graph[i].append(j)
175
- graph[j].append(i)
176
-
177
- visited = [False] * n
178
- groups = []
179
-
180
- # 使用DFS找到连通分量
181
- def dfs(node, group):
182
- visited[node] = True
183
- group.append(node)
184
- for neighbor in graph[node]:
185
- if not visited[neighbor]:
186
- dfs(neighbor, group)
187
-
188
- # 找到所有连通分量
189
- for i in range(n):
190
- if not visited[i]:
191
- current_group = []
192
- dfs(i, current_group)
193
- groups.append(current_group)
194
-
195
- return groups
196
-
197
-
198
- def get_group_abstract(lists):
199
- """
200
- 这是一个 分组程序 ,会根据简单的连通图分组
201
- lists: [(id,txt),...]
202
- return: all_list 返回一个二维列表 每个列表里面是id 每个列表为一个分组
203
- """
204
- abstract_list = [i[1] for i in lists]
205
- keyid_list = [i[0] for i in lists]
206
- groups = group_similar_texts(abstract_list, threshold=0.9)
207
- all_list = []
208
- for group in groups:
209
- t_list = []
210
- for text_idx in group:
211
- t_list.append(keyid_list[text_idx])
212
- all_list.append(t_list)
213
- return all_list
1
+ import re
2
+ import threading
3
+ from itertools import combinations
4
+
5
+ import regex
6
+ import unicodedata
7
+ from html.parser import HTMLParser
8
+
9
+
10
+ from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
11
+
12
+
13
+ def bj2qj(src):
14
+ if src is None:
15
+ return src
16
+
17
+ DBC_SPACE = ' '
18
+ SBC_SPACE = ' '
19
+ DBC_CHAR_START = 33
20
+ DBC_CHAR_END = 126
21
+ CONVERT_STEP = 65248
22
+
23
+ buf = []
24
+ for char in src:
25
+ if char == DBC_SPACE:
26
+ buf.append(SBC_SPACE)
27
+ elif DBC_CHAR_START <= ord(char) <= DBC_CHAR_END:
28
+ buf.append(chr(ord(char) + CONVERT_STEP))
29
+ else:
30
+ buf.append(char)
31
+
32
+ return ''.join(buf)
33
+
34
+
35
+ def qj2bj(src):
36
+ """
37
+ 全角转半角
38
+ :param src:
39
+ :return:
40
+ """
41
+ if src is None:
42
+ return src
43
+
44
+ SBC_CHAR_START = 0xFF01
45
+ SBC_CHAR_END = 0xFF5E
46
+ CONVERT_STEP = 0xFEE0
47
+ DBC_SPACE = ' '
48
+ SBC_SPACE = ' '
49
+
50
+ buf = []
51
+ for char in src:
52
+ if SBC_CHAR_START <= ord(char) <= SBC_CHAR_END:
53
+ buf.append(chr(ord(char) - CONVERT_STEP))
54
+ elif char == SBC_SPACE:
55
+ buf.append(DBC_SPACE)
56
+ else:
57
+ buf.append(char)
58
+
59
+ return ''.join(buf)
60
+
61
+
62
+ def get_diacritic_variant(char1):
63
+ # 将字符转换为标准的 Unicode 形式
64
+ normalized_char1 = unicodedata.normalize('NFD', char1)
65
+
66
+ # 获取基本字符(去掉变音符号)
67
+ base_char1 = ''.join(c for c in normalized_char1 if unicodedata.category(c) != 'Mn')
68
+
69
+ # 判断基本字符是否相同
70
+ return base_char1
71
+
72
+
73
+ def get_alphabetic_ratio(text: str) -> float:
74
+ # 返回字母型字符所占比例
75
+ if not text:
76
+ return 0
77
+
78
+ text = re.sub(r'\d+', '', text)
79
+
80
+ # 正则表达式匹配字母型文字(包括拉丁字母、希腊字母、西里尔字母、阿拉伯字母等)
81
+ alphabetic_pattern = (
82
+ r"[\u0041-\u005A\u0061-\u007A" # 拉丁字母 (A-Z, a-z)
83
+ r"\u00C0-\u00FF" # 带重音符号的拉丁字母 (À-ÿ)
84
+ r"\u0080–\u00FF" # 拉丁字母补充1
85
+ r"\u0100–\u017F" # 拉丁字母扩展A
86
+ r"\u1E00-\u1EFF" # 拉丁扩展 (Latin Extended Additional)
87
+ r"\u0180-\u024F" # 拉丁扩展-B (Latin Extended-B)
88
+ r"\u2C60-\u2C7F" # 拉丁扩展-C (Latin Extended Additional)
89
+ r"\uA720-\uA7FF" # 拉丁扩展-D (Latin Extended Additional)
90
+ r"\uAB30-\uAB6F" # 拉丁扩展-E (Latin Extended Additional)
91
+ r"]"
92
+ )
93
+
94
+ # 使用正则表达式过滤出语言文字
95
+ clean_text = regex.sub(r"[^\p{L}]", "", text)
96
+
97
+ if len(clean_text) == 0:
98
+ return 1.0
99
+
100
+ # 匹配所有字母型字符
101
+ alphabetic_chars = re.findall(alphabetic_pattern, clean_text)
102
+
103
+ # 返回字母型字符所占比例
104
+ return len(alphabetic_chars) / len(clean_text)
105
+
106
+
107
+ class HTMLTextExtractor(HTMLParser):
108
+ _thread_local = threading.local() # 线程局部存储
109
+
110
+ def __init__(self):
111
+ super().__init__()
112
+ self.reset_state()
113
+
114
+ def handle_starttag(self, tag, attrs):
115
+ if tag in ('script', 'style'):
116
+ self.skip = True
117
+
118
+ def handle_endtag(self, tag):
119
+ if tag in ('script', 'style'):
120
+ self.skip = False
121
+
122
+ def handle_data(self, data):
123
+ if not self.skip and data.strip():
124
+ self.text.append(data)
125
+
126
+ def reset_state(self):
127
+ self.reset()
128
+ self.text = []
129
+ self.skip = False
130
+
131
+ def get_text(self):
132
+ return ''.join(self.text).strip()
133
+
134
+ @classmethod
135
+ def get_parser(cls):
136
+ # 每个线程获取独立实例
137
+ if not hasattr(cls._thread_local, 'parser'):
138
+ cls._thread_local.parser = cls()
139
+ return cls._thread_local.parser
140
+
141
+
142
+ def clean_html(html):
143
+ parser = HTMLTextExtractor.get_parser()
144
+ parser.reset_state()
145
+ parser.feed(html)
146
+ parser.close()
147
+ return parser.get_text()
148
+
149
+
150
+ def remove_spaces_between_chinese_characters(text):
151
+ """
152
+ 匹配中文间的空格并替换为空字符串
153
+
154
+ 这里没有选取 后面的一些扩展分区 是那些分区比较分散 都写进来消耗性能,
155
+ 认为只包含这些也够用了
156
+ """
157
+ pattern = r'(?<=[\u3400-\u9fff])\s+(?=[\u3400-\u9fff])'
158
+ return re.sub(pattern, '', text)
159
+
160
+
161
+
162
+ sim_utils = JaroDamerauLevenshteinMaxSim()
163
+
164
+ def group_similar_texts(texts, threshold=0.9):
165
+ """根据相似度对文本进行分组"""
166
+ from re_common.v2.baselibrary.utils.string_clear import rel_clear
167
+ n = len(texts)
168
+ # 创建邻接表表示图
169
+ graph = [[] for _ in range(n)]
170
+ # 计算所有文本对的相似度并构建图
171
+ for i, j in combinations(range(n), 2):
172
+ similarity = sim_utils.get_sim(rel_clear(texts[i]), rel_clear(texts[j]))
173
+ if similarity >= threshold:
174
+ graph[i].append(j)
175
+ graph[j].append(i)
176
+
177
+ visited = [False] * n
178
+ groups = []
179
+
180
+ # 使用DFS找到连通分量
181
+ def dfs(node, group):
182
+ visited[node] = True
183
+ group.append(node)
184
+ for neighbor in graph[node]:
185
+ if not visited[neighbor]:
186
+ dfs(neighbor, group)
187
+
188
+ # 找到所有连通分量
189
+ for i in range(n):
190
+ if not visited[i]:
191
+ current_group = []
192
+ dfs(i, current_group)
193
+ groups.append(current_group)
194
+
195
+ return groups
196
+
197
+
198
+ def get_group_abstract(lists):
199
+ """
200
+ 这是一个 分组程序 ,会根据简单的连通图分组
201
+ lists: [(id,txt),...]
202
+ return: all_list 返回一个二维列表 每个列表里面是id 每个列表为一个分组
203
+ """
204
+ abstract_list = [i[1] for i in lists]
205
+ keyid_list = [i[0] for i in lists]
206
+ groups = group_similar_texts(abstract_list, threshold=0.9)
207
+ all_list = []
208
+ for group in groups:
209
+ t_list = []
210
+ for text_idx in group:
211
+ t_list.append(keyid_list[text_idx])
212
+ all_list.append(t_list)
213
+ return all_list
@@ -1,11 +1,11 @@
1
- from abc import ABC, abstractmethod
2
-
3
-
4
- class BaseStepProcess(ABC):
5
-
6
- def __init__(self):
7
- self.stat_dicts = {}
8
-
9
- @abstractmethod
10
- def do_task(self, *args, **kwargs):
11
- pass
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
+ class BaseStepProcess(ABC):
5
+
6
+ def __init__(self):
7
+ self.stat_dicts = {}
8
+
9
+ @abstractmethod
10
+ def do_task(self, *args, **kwargs):
11
+ pass
@@ -1,91 +1,91 @@
1
- import base64
2
- import hashlib
3
-
4
- """
5
- VIP编码lngid生成
6
- """
7
-
8
- class BaseLngid(object):
9
- def __int__(self):
10
- pass
11
-
12
- def BaseEncodeID(self, strRaw):
13
- r""" 自定义base编码 """
14
-
15
- strEncode = base64.b32encode(strRaw.encode('utf8')).decode('utf8')
16
-
17
- if strEncode.endswith('======'):
18
- strEncode = '%s%s' % (strEncode[0:-6], '0')
19
- elif strEncode.endswith('===='):
20
- strEncode = '%s%s' % (strEncode[0:-4], '1')
21
- elif strEncode.endswith('==='):
22
- strEncode = '%s%s' % (strEncode[0:-3], '8')
23
- elif strEncode.endswith('='):
24
- strEncode = '%s%s' % (strEncode[0:-1], '9')
25
-
26
- table = str.maketrans('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210')
27
- strEncode = strEncode.translate(table)
28
-
29
- return strEncode
30
-
31
- def BaseDecodeID(self, strEncode):
32
- r""" 自定义base解码 """
33
-
34
- table = str.maketrans('ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210', '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ')
35
- strEncode = strEncode.translate(table)
36
-
37
- if strEncode.endswith('0'):
38
- strEncode = '%s%s' % (strEncode[0:-1], '======')
39
- elif strEncode.endswith('1'):
40
- strEncode = '%s%s' % (strEncode[0:-1], '====')
41
- elif strEncode.endswith('8'):
42
- strEncode = '%s%s' % (strEncode[0:-1], '===')
43
- elif strEncode.endswith('9'):
44
- strEncode = '%s%s' % (strEncode[0:-1], '=')
45
-
46
- strRaw = base64.b32decode(strEncode.encode('utf8')).decode('utf8')
47
-
48
- return strRaw
49
-
50
- def GetLngid(self, sub_db_id, rawid, case_insensitive=False):
51
- """
52
- :param sub_db_id:
53
- :param rawid:
54
- 由 sub_db_id 和 rawid 得到 lngid。
55
- :param case_insensitive: 标识源网站的 rawid 是否区分大小写
56
- :return: lngid
57
- """
58
- uppercase_rawid = '' # 大写版 rawid
59
- if case_insensitive: # 源网站的 rawid 区分大小写
60
- for ch in rawid:
61
- if ch.upper() == ch:
62
- uppercase_rawid += ch
63
- else:
64
- uppercase_rawid += ch.upper() + '_'
65
- else:
66
- uppercase_rawid = rawid.upper()
67
-
68
- limited_id = uppercase_rawid # 限长ID
69
- if len(uppercase_rawid) > 20:
70
- limited_id = hashlib.md5(uppercase_rawid.encode('utf8')).hexdigest().upper()
71
- else:
72
- limited_id = self.BaseEncodeID(uppercase_rawid)
73
-
74
- lngid = sub_db_id + limited_id
75
-
76
- return lngid
77
-
78
- def GetRawid(self, limited_id, case_insensitive=False):
79
- try:
80
- uppercase_rawid = self.BaseDecodeID(limited_id)
81
- if case_insensitive:
82
- str_ = "_"
83
- uppercase_rawid_list = list(uppercase_rawid)
84
- for num,li in enumerate(uppercase_rawid_list):
85
- if li == str_:
86
- old_str = "".join(uppercase_rawid_list[num-1:num+1])
87
- uppercase_rawid = uppercase_rawid.replace(old_str,uppercase_rawid_list[num-1].lower())
88
- except Exception as e:
89
- raise Exception("长度超过20,不可逆")
90
-
1
+ import base64
2
+ import hashlib
3
+
4
+ """
5
+ VIP编码lngid生成
6
+ """
7
+
8
+ class BaseLngid(object):
9
+ def __int__(self):
10
+ pass
11
+
12
+ def BaseEncodeID(self, strRaw):
13
+ r""" 自定义base编码 """
14
+
15
+ strEncode = base64.b32encode(strRaw.encode('utf8')).decode('utf8')
16
+
17
+ if strEncode.endswith('======'):
18
+ strEncode = '%s%s' % (strEncode[0:-6], '0')
19
+ elif strEncode.endswith('===='):
20
+ strEncode = '%s%s' % (strEncode[0:-4], '1')
21
+ elif strEncode.endswith('==='):
22
+ strEncode = '%s%s' % (strEncode[0:-3], '8')
23
+ elif strEncode.endswith('='):
24
+ strEncode = '%s%s' % (strEncode[0:-1], '9')
25
+
26
+ table = str.maketrans('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210')
27
+ strEncode = strEncode.translate(table)
28
+
29
+ return strEncode
30
+
31
+ def BaseDecodeID(self, strEncode):
32
+ r""" 自定义base解码 """
33
+
34
+ table = str.maketrans('ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210', '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ')
35
+ strEncode = strEncode.translate(table)
36
+
37
+ if strEncode.endswith('0'):
38
+ strEncode = '%s%s' % (strEncode[0:-1], '======')
39
+ elif strEncode.endswith('1'):
40
+ strEncode = '%s%s' % (strEncode[0:-1], '====')
41
+ elif strEncode.endswith('8'):
42
+ strEncode = '%s%s' % (strEncode[0:-1], '===')
43
+ elif strEncode.endswith('9'):
44
+ strEncode = '%s%s' % (strEncode[0:-1], '=')
45
+
46
+ strRaw = base64.b32decode(strEncode.encode('utf8')).decode('utf8')
47
+
48
+ return strRaw
49
+
50
+ def GetLngid(self, sub_db_id, rawid, case_insensitive=False):
51
+ """
52
+ :param sub_db_id:
53
+ :param rawid:
54
+ 由 sub_db_id 和 rawid 得到 lngid。
55
+ :param case_insensitive: 标识源网站的 rawid 是否区分大小写
56
+ :return: lngid
57
+ """
58
+ uppercase_rawid = '' # 大写版 rawid
59
+ if case_insensitive: # 源网站的 rawid 区分大小写
60
+ for ch in rawid:
61
+ if ch.upper() == ch:
62
+ uppercase_rawid += ch
63
+ else:
64
+ uppercase_rawid += ch.upper() + '_'
65
+ else:
66
+ uppercase_rawid = rawid.upper()
67
+
68
+ limited_id = uppercase_rawid # 限长ID
69
+ if len(uppercase_rawid) > 20:
70
+ limited_id = hashlib.md5(uppercase_rawid.encode('utf8')).hexdigest().upper()
71
+ else:
72
+ limited_id = self.BaseEncodeID(uppercase_rawid)
73
+
74
+ lngid = sub_db_id + limited_id
75
+
76
+ return lngid
77
+
78
+ def GetRawid(self, limited_id, case_insensitive=False):
79
+ try:
80
+ uppercase_rawid = self.BaseDecodeID(limited_id)
81
+ if case_insensitive:
82
+ str_ = "_"
83
+ uppercase_rawid_list = list(uppercase_rawid)
84
+ for num,li in enumerate(uppercase_rawid_list):
85
+ if li == str_:
86
+ old_str = "".join(uppercase_rawid_list[num-1:num+1])
87
+ uppercase_rawid = uppercase_rawid.replace(old_str,uppercase_rawid_list[num-1].lower())
88
+ except Exception as e:
89
+ raise Exception("长度超过20,不可逆")
90
+
91
91
  return uppercase_rawid