re-common 10.0.37__py3-none-any.whl → 10.0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  172. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  173. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  174. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  175. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  176. re_common/v2/baselibrary/utils/basedict.py +37 -37
  177. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  178. re_common/v2/baselibrary/utils/basepika.py +180 -180
  179. re_common/v2/baselibrary/utils/basetime.py +77 -77
  180. re_common/v2/baselibrary/utils/db.py +156 -156
  181. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  182. re_common/v2/baselibrary/utils/mq.py +83 -83
  183. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  184. re_common/v2/baselibrary/utils/string_bool.py +186 -186
  185. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  186. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  187. re_common/v2/baselibrary/utils/stringutils.py +271 -278
  188. re_common/vip/base_step_process.py +11 -11
  189. re_common/vip/baseencodeid.py +90 -90
  190. re_common/vip/changetaskname.py +28 -28
  191. re_common/vip/core_var.py +24 -24
  192. re_common/vip/mmh3Hash.py +89 -89
  193. re_common/vip/proxy/allproxys.py +127 -127
  194. re_common/vip/proxy/allproxys_thread.py +159 -159
  195. re_common/vip/proxy/cnki_proxy.py +153 -153
  196. re_common/vip/proxy/kuaidaili.py +87 -87
  197. re_common/vip/proxy/proxy_all.py +113 -113
  198. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  199. re_common/vip/proxy/wanfang_proxy.py +152 -152
  200. re_common/vip/proxy/wp_proxy_all.py +181 -181
  201. re_common/vip/read_rawid_to_txt.py +91 -91
  202. re_common/vip/title/__init__.py +5 -5
  203. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  204. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  205. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  206. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  207. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  208. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  209. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  210. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  211. re_common/vip/title/transform/__init__.py +10 -10
  212. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
  213. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
  214. re_common-10.0.39.dist-info/RECORD +248 -0
  215. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
  216. re_common-10.0.37.dist-info/RECORD +0 -248
  217. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
@@ -1,278 +1,271 @@
1
- import re
2
- import threading
3
- from html.parser import HTMLParser
4
- from itertools import combinations
5
-
6
- import regex
7
- import unicodedata
8
- from parsel import Selector
9
-
10
- from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
11
-
12
-
13
- def bj2qj(src):
14
- if src is None:
15
- return src
16
-
17
- DBC_SPACE = ' '
18
- SBC_SPACE = ' '
19
- DBC_CHAR_START = 33
20
- DBC_CHAR_END = 126
21
- CONVERT_STEP = 65248
22
-
23
- buf = []
24
- for char in src:
25
- if char == DBC_SPACE:
26
- buf.append(SBC_SPACE)
27
- elif DBC_CHAR_START <= ord(char) <= DBC_CHAR_END:
28
- buf.append(chr(ord(char) + CONVERT_STEP))
29
- else:
30
- buf.append(char)
31
-
32
- return ''.join(buf)
33
-
34
-
35
- def qj2bj(src):
36
- """
37
- 全角转半角
38
- :param src:
39
- :return:
40
- """
41
- if src is None:
42
- return src
43
-
44
- SBC_CHAR_START = 0xFF01
45
- SBC_CHAR_END = 0xFF5E
46
- CONVERT_STEP = 0xFEE0
47
- DBC_SPACE = ' '
48
- SBC_SPACE = ' '
49
-
50
- buf = []
51
- for char in src:
52
- if SBC_CHAR_START <= ord(char) <= SBC_CHAR_END:
53
- buf.append(chr(ord(char) - CONVERT_STEP))
54
- elif char == SBC_SPACE:
55
- buf.append(DBC_SPACE)
56
- else:
57
- buf.append(char)
58
-
59
- return ''.join(buf)
60
-
61
-
62
- """
63
- 总结对比表
64
- 规范名 处理步骤 组合方式 兼容性归一化 主要用途
65
- NFC 规范分解 规范组合 组合 否 保留预组合字符,文本呈现和存储
66
- NFD 规范分解 不组合 否 拆解字符,便于逐字符处理
67
- NFKC 兼容性分解 → 规范组合 组合 是 消除兼容差异,文本比较和索引
68
- NFKD 兼容性分解 → 规范分解 不组合 是 最大程度拆解,文本分析和预处理
69
- """
70
-
71
-
72
- def get_diacritic_variant(char1):
73
- """
74
- NFD: 规范分解(Normalization Form D)
75
- 把字符拆分为基本字符 + 变音符号
76
-
77
- 但不处理兼容字符(如连字)
78
-
79
- print(unicodedata.normalize('NFD', 'é')) # 输出: 'é'(e + 组合符号) # 这里看起来是1个字符 len 其实是2
80
- print(unicodedata.normalize('NFD', 'fl')) # 输出: 'fl'(不变化)
81
-
82
- """
83
- # 将字符转换为标准的 Unicode 形式
84
- normalized_char1 = unicodedata.normalize('NFD', char1)
85
-
86
- # 获取基本字符(去掉变音符号)
87
- base_char1 = ''.join(c for c in normalized_char1 if unicodedata.category(c) != 'Mn')
88
-
89
- # 判断基本字符是否相同
90
- return base_char1
91
-
92
-
93
- def normalize_nfkc(strs: str) -> str:
94
- """
95
- NFKC: 兼容字符归一化 + 组合(Normalization Form Compatibility Composition)
96
- 把 连字、圈数字、全角字符 等兼容字符转换为标准形式
97
-
98
- 同时做字符合并(例如 é 不再是 e+´,而是一个字符)
99
- print(unicodedata.normalize('NFKC', 'fl')) # 输出: 'fl'
100
- print(unicodedata.normalize('NFKC', '①')) # 输出: '1'
101
- print(unicodedata.normalize('NFKC', 'A')) # 输出: 'A'
102
- """
103
- return unicodedata.normalize('NFKC', strs.strip())
104
-
105
-
106
- def get_alphabetic_ratio(text: str) -> float:
107
- # 返回字母型字符所占比例
108
- if not text:
109
- return 0
110
-
111
- text = re.sub(r'\d+', '', text)
112
-
113
- # 正则表达式匹配字母型文字(包括拉丁字母、希腊字母、西里尔字母、阿拉伯字母等)
114
- alphabetic_pattern = (
115
- r"[\u0041-\u005A\u0061-\u007A" # 拉丁字母 (A-Z, a-z)
116
- r"\u00C0-\u00FF" # 带重音符号的拉丁字母 (À-ÿ)
117
- r"\u0080–\u00FF" # 拉丁字母补充1
118
- r"\u0100–\u017F" # 拉丁字母扩展A
119
- r"\u1E00-\u1EFF" # 拉丁扩展 (Latin Extended Additional)
120
- r"\u0180-\u024F" # 拉丁扩展-B (Latin Extended-B)
121
- r"\u2C60-\u2C7F" # 拉丁扩展-C (Latin Extended Additional)
122
- r"\uA720-\uA7FF" # 拉丁扩展-D (Latin Extended Additional)
123
- r"\uAB30-\uAB6F" # 拉丁扩展-E (Latin Extended Additional)
124
- r"]"
125
- )
126
-
127
- # 使用正则表达式过滤出语言文字
128
- clean_text = regex.sub(r"[^\p{L}]", "", text)
129
-
130
- if len(clean_text) == 0:
131
- return 1.0
132
-
133
- # 匹配所有字母型字符
134
- alphabetic_chars = re.findall(alphabetic_pattern, clean_text)
135
-
136
- # 返回字母型字符所占比例
137
- return len(alphabetic_chars) / len(clean_text)
138
-
139
-
140
- class HTMLTextExtractor(HTMLParser):
141
- _thread_local = threading.local() # 线程局部存储
142
-
143
- def __init__(self):
144
- super().__init__()
145
- self.reset_state()
146
-
147
- def handle_starttag(self, tag, attrs):
148
- if tag in ('script', 'style'):
149
- self.skip = True
150
-
151
- def handle_endtag(self, tag):
152
- if tag in ('script', 'style'):
153
- self.skip = False
154
-
155
- def handle_data(self, data):
156
- if not self.skip and data.strip():
157
- self.text.append(data)
158
-
159
- def reset_state(self):
160
- self.reset()
161
- self.text = []
162
- self.skip = False
163
-
164
- def get_text(self):
165
- return ''.join(self.text).strip()
166
-
167
- @classmethod
168
- def get_parser(cls):
169
- # 每个线程获取独立实例
170
- if not hasattr(cls._thread_local, 'parser'):
171
- cls._thread_local.parser = cls()
172
- return cls._thread_local.parser
173
-
174
-
175
- # def clean_html(html):
176
- # parser = HTMLTextExtractor.get_parser()
177
- # parser.reset_state()
178
- # parser.feed(html)
179
- # parser.close()
180
- # return parser.get_text()
181
-
182
- # def clean_html(html):
183
- # """使用 Parsel 提取 HTML 中的纯文本"""
184
- # sel = Selector(text=html, type='html')
185
- # # 提取所有文本(包括子元素的文本)
186
- # text = sel.xpath("string()").getall()
187
- # return "".join(text).strip()
188
-
189
-
190
- def clean_html(html):
191
- from bs4 import BeautifulSoup
192
- try:
193
- soup = BeautifulSoup(html, "lxml")
194
- return soup.get_text()
195
- except:
196
- soup = BeautifulSoup(html, "html5lib")
197
- return soup.get_text()
198
-
199
-
200
- def remove_spaces_between_chinese_characters(text):
201
- """
202
- 匹配中文间的空格并替换为空字符串
203
-
204
- 这里没有选取 后面的一些扩展分区 是那些分区比较分散 都写进来消耗性能,
205
- 认为只包含这些也够用了
206
- """
207
- pattern = r'(?<=[\u3400-\u9fff])\s+(?=[\u3400-\u9fff])'
208
- return re.sub(pattern, '', text)
209
-
210
-
211
- sim_utils = JaroDamerauLevenshteinMaxSim()
212
-
213
-
214
- def group_similar_texts(texts, threshold=0.9):
215
- """根据相似度对文本进行分组"""
216
- from re_common.v2.baselibrary.utils.string_clear import rel_clear
217
- n = len(texts)
218
- # 创建邻接表表示图
219
- graph = [[] for _ in range(n)]
220
- # 计算所有文本对的相似度并构建图
221
- for i, j in combinations(range(n), 2):
222
- similarity = sim_utils.get_sim(rel_clear(texts[i]), rel_clear(texts[j]))
223
- if similarity >= threshold:
224
- graph[i].append(j)
225
- graph[j].append(i)
226
-
227
- visited = [False] * n
228
- groups = []
229
-
230
- # 使用DFS找到连通分量
231
- def dfs(node, group):
232
- visited[node] = True
233
- group.append(node)
234
- for neighbor in graph[node]:
235
- if not visited[neighbor]:
236
- dfs(neighbor, group)
237
-
238
- # 找到所有连通分量
239
- for i in range(n):
240
- if not visited[i]:
241
- current_group = []
242
- dfs(i, current_group)
243
- groups.append(current_group)
244
-
245
- return groups
246
-
247
-
248
- def get_group_abstract(lists):
249
- """
250
- 这是一个 分组程序 ,会根据简单的连通图分组
251
- lists: [(id,txt),...]
252
- return: all_list 返回一个二维列表 每个列表里面是id 每个列表为一个分组
253
- """
254
- abstract_list = [i[1] for i in lists]
255
- keyid_list = [i[0] for i in lists]
256
- groups = group_similar_texts(abstract_list, threshold=0.9)
257
- all_list = []
258
- for group in groups:
259
- t_list = []
260
- for text_idx in group:
261
- t_list.append(keyid_list[text_idx])
262
- all_list.append(t_list)
263
- return all_list
264
-
265
-
266
- def clean_unicode_alnum(text: str) -> str:
267
- """
268
- 清除所有非 Unicode 字母或数字的字符。
269
-
270
- 参数:
271
- text (str): 输入文本。
272
-
273
- 返回:
274
- str: 只包含 Unicode 字母和数字的文本。
275
- \p{N} 匹配所有 Unicode 数字字符 包括非阿拉伯数字字符
276
- \p{L} 匹配所有语言字符
277
- """
278
- return regex.sub(r"[^\p{L}\p{N}]+", "", text)
1
+ import re
2
+ import threading
3
+ from html.parser import HTMLParser
4
+ from itertools import combinations
5
+
6
+ import regex
7
+ import unicodedata
8
+
9
+ from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim
10
+
11
+
12
def bj2qj(src):
    """Convert half-width (ASCII) characters in ``src`` to full-width forms.

    The ASCII space becomes the ideographic space U+3000, and every printable
    ASCII character from ``!`` to ``~`` (U+0021-U+007E) is shifted by 0xFEE0
    into the full-width block U+FF01-U+FF5E.  Any other character is copied
    through unchanged.

    :param src: text to convert, or ``None``.
    :return: the converted text; ``None`` is returned as-is.
    """
    if src is None:
        return src

    # Spelled as escapes on purpose: a literal U+3000 looks exactly like an
    # ASCII space in most editors and is easily lost when the file is edited.
    half_space = "\u0020"
    full_space = "\u3000"
    ascii_first = 0x21  # '!'
    ascii_last = 0x7E  # '~'
    offset = 0xFEE0  # distance from '!' (U+0021) to its full-width form (U+FF01)

    buf = []
    for char in src:
        if char == half_space:
            buf.append(full_space)
        elif ascii_first <= ord(char) <= ascii_last:
            buf.append(chr(ord(char) + offset))
        else:
            buf.append(char)

    return "".join(buf)
32
+
33
+
34
# Full-width -> half-width translation table (code point -> code point),
# built once at import time: the ideographic space U+3000 maps to the ASCII
# space, and U+FF01-U+FF5E map onto U+0021-U+007E.
_QJ2BJ_TABLE = {0x3000: 0x0020}
_QJ2BJ_TABLE.update((code, code - 0xFEE0) for code in range(0xFF01, 0xFF5F))


def qj2bj(text):
    """Convert full-width characters in ``text`` to their half-width forms.

    Characters outside U+3000 and U+FF01-U+FF5E are left untouched.

    :param text: string to convert, or ``None``.
    :return: the converted string; ``None`` is returned as-is.
    """
    if text is None:
        return text
    # str.translate does the whole replacement in a single pass.
    return text.translate(_QJ2BJ_TABLE)
51
+
52
+
53
+ """
54
+ 总结对比表
55
+ 规范名 处理步骤 组合方式 兼容性归一化 主要用途
56
+ NFC 规范分解 → 规范组合 组合 否 保留预组合字符,文本呈现和存储
57
+ NFD 规范分解 不组合 否 拆解字符,便于逐字符处理
58
+ NFKC 兼容性分解 → 规范组合 组合 是 消除兼容差异,文本比较和索引
59
+ NFKD 兼容性分解 → 规范分解 不组合 是 最大程度拆解,文本分析和预处理
60
+ """
61
+
62
+
63
def get_diacritic_variant(char1):
    """Return ``char1`` with its combining marks removed.

    The input is decomposed with NFD (canonical decomposition), which splits
    an accented character into its base character followed by combining
    marks; every code point of category ``Mn`` (non-spacing mark) is then
    dropped.  NFD does not touch compatibility characters, so a ligature such
    as U+FB02 is returned unchanged.

    Note that a decomposed character may look like one glyph while having a
    length of two (base character + combining mark).

    :param char1: character or string to strip.
    :return: the text without non-spacing marks.
    """
    decomposed = unicodedata.normalize('NFD', char1)

    kept = []
    for piece in decomposed:
        if unicodedata.category(piece) == 'Mn':
            # Non-spacing mark (accent, diaeresis, ...): skip it.
            continue
        kept.append(piece)

    return ''.join(kept)
82
+
83
+
84
def normalize_nfkc(strs: str) -> str:
    """Strip surrounding whitespace from ``strs`` and normalize it to NFKC.

    NFKC (Normalization Form Compatibility Composition) replaces
    compatibility characters such as ligatures, circled digits and full-width
    letters with their standard forms, and composes base characters with
    their combining marks into single code points.

    Examples::

        normalize_nfkc('\ufb02')  # -> 'fl'
        normalize_nfkc('\u2460')  # -> '1'
        normalize_nfkc('\uff21')  # -> 'A'

    :param strs: text to normalize.
    :return: the trimmed, NFKC-normalized text.
    """
    trimmed = strs.strip()
    return unicodedata.normalize('NFKC', trimmed)
95
+
96
+
97
# Matches one Latin-script character.  Every range separator must be an ASCII
# hyphen: an en dash (U+2013) would turn a range into three literal
# characters and silently drop the whole block from the class.
_LATIN_LETTER_RE = re.compile(
    r"[\u0041-\u005A\u0061-\u007A"  # Basic Latin (A-Z, a-z)
    r"\u0080-\u00FF"  # Latin-1 Supplement (includes the accented letters)
    r"\u0100-\u017F"  # Latin Extended-A
    r"\u0180-\u024F"  # Latin Extended-B
    r"\u1E00-\u1EFF"  # Latin Extended Additional
    r"\u2C60-\u2C7F"  # Latin Extended-C
    r"\uA720-\uA7FF"  # Latin Extended-D
    r"\uAB30-\uAB6F"  # Latin Extended-E
    r"]"
)


def get_alphabetic_ratio(text: str) -> float:
    """Return the share of Latin-script letters among all letters in ``text``.

    Only characters with a Unicode "Letter" general category are considered;
    digits, punctuation and whitespace are ignored.  ``str.isalpha`` is true
    exactly for that category, so it selects the same characters as the
    ``\\p{L}`` class.

    :param text: text to inspect.
    :return: ``0.0`` for empty/``None`` input, ``1.0`` when the text holds no
        letters at all, otherwise ``latin_letters / all_letters``.
    """
    if not text:
        return 0.0

    letters = [ch for ch in text if ch.isalpha()]
    if not letters:
        # Nothing but digits/punctuation: kept as 1.0 for compatibility.
        return 1.0

    latin_count = sum(1 for ch in letters if _LATIN_LETTER_RE.match(ch))
    return latin_count / len(letters)
129
+
130
+
131
class HTMLTextExtractor(HTMLParser):
    """HTML parser that collects text nodes and ignores script/style content.

    Typical use: ``feed()`` the markup, ``close()`` the parser, then read the
    result with :meth:`get_text`.  Call :meth:`reset_state` before reusing an
    instance for another document.
    """

    # Per-thread storage used by get_parser() to cache one parser per thread.
    _thread_local = threading.local()

    def __init__(self):
        super().__init__()
        self.reset_state()

    def reset_state(self):
        """Reset the underlying parser and clear the collected text."""
        self.reset()
        self.skip = False
        self.text = []

    def handle_starttag(self, tag, attrs):
        """Start skipping data once a ``script`` or ``style`` tag opens."""
        if tag == 'script' or tag == 'style':
            self.skip = True

    def handle_endtag(self, tag):
        """Stop skipping data when a ``script`` or ``style`` tag closes."""
        if tag == 'script' or tag == 'style':
            self.skip = False

    def handle_data(self, data):
        """Store a text node unless it is being skipped or is only whitespace."""
        if self.skip or not data.strip():
            return
        self.text.append(data)

    def get_text(self):
        """Return the collected text nodes joined together and stripped."""
        joined = ''.join(self.text)
        return joined.strip()

    @classmethod
    def get_parser(cls):
        """Return the parser cached for the current thread, creating it once."""
        try:
            return cls._thread_local.parser
        except AttributeError:
            # First call on this thread: create and remember the instance.
            cls._thread_local.parser = cls()
            return cls._thread_local.parser
164
+
165
+
166
+ # def clean_html(html):
167
+ # parser = HTMLTextExtractor.get_parser()
168
+ # parser.reset_state()
169
+ # parser.feed(html)
170
+ # parser.close()
171
+ # return parser.get_text()
172
+
173
+ # def clean_html(html):
174
+ # """使用 Parsel 提取 HTML 中的纯文本"""
175
+ # sel = Selector(text=html, type='html')
176
+ # # 提取所有文本(包括子元素的文本)
177
+ # text = sel.xpath("string()").getall()
178
+ # return "".join(text).strip()
179
+
180
+
181
def clean_html(html):
    """Strip HTML markup from ``html`` and return the remaining text.

    Input that is empty, ``None``, or contains no ``<`` cannot hold a tag and
    is returned unchanged without importing or invoking BeautifulSoup.
    Otherwise the markup is parsed with the ``lxml`` backend, falling back to
    ``html5lib`` if that attempt raises.

    :param html: markup (or plain text) to clean.
    :return: the text content, or ``html`` itself when there is nothing to parse.
    """
    if not html or "<" not in html:
        return html

    # Imported lazily so plain-text input never pays for loading bs4.
    from bs4 import BeautifulSoup

    try:
        return BeautifulSoup(html, "lxml").get_text()
    except Exception:
        # Deliberately broad (parser missing or parser failure), but unlike a
        # bare ``except`` it lets KeyboardInterrupt/SystemExit propagate.
        return BeautifulSoup(html, "html5lib").get_text()
191
+
192
+
193
def remove_spaces_between_chinese_characters(text):
    """Delete whitespace runs that sit directly between two Chinese characters.

    "Chinese character" here means a code point in U+3400-U+9FFF.  The later
    CJK extension blocks are deliberately left out: they are scattered across
    the code space, listing them all would cost performance, and this range
    is considered sufficient.

    :param text: text to clean.
    :return: ``text`` with the whitespace between such characters removed.
    """
    # Look-behind/look-ahead keep the neighbouring characters out of the
    # match, so only the whitespace itself is replaced.
    cjk_gap = re.compile(r'(?<=[\u3400-\u9fff])\s+(?=[\u3400-\u9fff])')
    return cjk_gap.sub('', text)
202
+
203
+
204
+ sim_utils = JaroDamerauLevenshteinMaxSim()
205
+
206
+
207
def group_similar_texts(texts, threshold=0.9):
    """Group texts into the connected components of a similarity graph.

    Two texts are linked when ``sim_utils.get_sim`` of their ``rel_clear``-ed
    forms is at least ``threshold``; texts connected directly or through a
    chain of links end up in the same group.

    :param texts: sequence of texts to group.
    :param threshold: minimum similarity for two texts to be linked.
    :return: list of groups, each a list of indices into ``texts``.
    """
    n = len(texts)
    if n < 2:
        # No pair to compare: every text (if any) forms its own group.
        return [[i] for i in range(n)]

    from re_common.v2.baselibrary.utils.string_clear import rel_clear

    # Clean each text once instead of once per pair it takes part in.
    # NOTE(review): assumes rel_clear is a pure function of its argument.
    cleaned = [rel_clear(item) for item in texts]

    # Adjacency list of the similarity graph.
    graph = [[] for _ in range(n)]
    for i, j in combinations(range(n), 2):
        if sim_utils.get_sim(cleaned[i], cleaned[j]) >= threshold:
            graph[i].append(j)
            graph[j].append(i)

    visited = [False] * n
    groups = []

    # Iterative depth-first search: an explicit stack of neighbour iterators
    # yields the same visit order as the recursive form without being bounded
    # by the interpreter's recursion limit on large components.
    for start in range(n):
        if visited[start]:
            continue
        visited[start] = True
        group = [start]
        stack = [iter(graph[start])]
        while stack:
            for neighbor in stack[-1]:
                if not visited[neighbor]:
                    visited[neighbor] = True
                    group.append(neighbor)
                    stack.append(iter(graph[neighbor]))
                    break
            else:
                # Every neighbour of the node on top has been handled.
                stack.pop()
        groups.append(group)

    return groups
239
+
240
+
241
def get_group_abstract(lists):
    """Group ``(id, text)`` records whose texts are similar.

    The texts are passed to ``group_similar_texts`` with a threshold of 0.9
    (a simple connected-graph grouping), and the index groups it returns are
    mapped back to the record ids.

    :param lists: ``[(id, txt), ...]``
    :return: two-dimensional list; each inner list holds the ids of one group.
    """
    texts = [record[1] for record in lists]
    ids = [record[0] for record in lists]
    index_groups = group_similar_texts(texts, threshold=0.9)
    return [[ids[idx] for idx in members] for members in index_groups]
257
+
258
+
259
def clean_unicode_alnum(text: str) -> str:
    r"""Remove every character that is not a Unicode letter or number.

    ``\p{L}`` matches letters of any script and ``\p{N}`` matches every
    Unicode number character, including non-Arabic numerals.

    The docstring is a raw string because ``\p`` is not a valid escape
    sequence in an ordinary string literal.

    :param text: input text.
    :return: ``text`` reduced to its Unicode letters and numbers.
    """
    return regex.sub(r"[^\p{L}\p{N}]+", "", text)
@@ -1,11 +1,11 @@
1
- from abc import ABC, abstractmethod
2
-
3
-
4
- class BaseStepProcess(ABC):
5
-
6
- def __init__(self):
7
- self.stat_dicts = {}
8
-
9
- @abstractmethod
10
- def do_task(self, *args, **kwargs):
11
- pass
1
+ from abc import ABC, abstractmethod
2
+
3
+
4
class BaseStepProcess(ABC):
    """Abstract base class for a processing step.

    Subclasses must implement :meth:`do_task`; the base class cannot be
    instantiated directly.
    """

    def __init__(self):
        # Starts empty; this class itself never writes to it.
        self.stat_dicts = dict()

    @abstractmethod
    def do_task(self, *args, **kwargs):
        """Run the step; must be overridden by every concrete subclass."""