re-common 10.0.37__py3-none-any.whl → 10.0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  172. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  173. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  174. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  175. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  176. re_common/v2/baselibrary/utils/basedict.py +37 -37
  177. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  178. re_common/v2/baselibrary/utils/basepika.py +180 -180
  179. re_common/v2/baselibrary/utils/basetime.py +77 -77
  180. re_common/v2/baselibrary/utils/db.py +156 -156
  181. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  182. re_common/v2/baselibrary/utils/mq.py +83 -83
  183. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  184. re_common/v2/baselibrary/utils/string_bool.py +186 -186
  185. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  186. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  187. re_common/v2/baselibrary/utils/stringutils.py +271 -278
  188. re_common/vip/base_step_process.py +11 -11
  189. re_common/vip/baseencodeid.py +90 -90
  190. re_common/vip/changetaskname.py +28 -28
  191. re_common/vip/core_var.py +24 -24
  192. re_common/vip/mmh3Hash.py +89 -89
  193. re_common/vip/proxy/allproxys.py +127 -127
  194. re_common/vip/proxy/allproxys_thread.py +159 -159
  195. re_common/vip/proxy/cnki_proxy.py +153 -153
  196. re_common/vip/proxy/kuaidaili.py +87 -87
  197. re_common/vip/proxy/proxy_all.py +113 -113
  198. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  199. re_common/vip/proxy/wanfang_proxy.py +152 -152
  200. re_common/vip/proxy/wp_proxy_all.py +181 -181
  201. re_common/vip/read_rawid_to_txt.py +91 -91
  202. re_common/vip/title/__init__.py +5 -5
  203. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  204. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  205. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  206. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  207. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  208. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  209. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  210. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  211. re_common/vip/title/transform/__init__.py +10 -10
  212. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
  213. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
  214. re_common-10.0.39.dist-info/RECORD +248 -0
  215. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
  216. re_common-10.0.37.dist-info/RECORD +0 -248
  217. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
@@ -1,246 +1,246 @@
1
- import re
2
- from functools import lru_cache
3
- from urllib.parse import unquote
4
-
5
- import regex
6
-
7
- from re_common.v2.baselibrary.utils.stringutils import (
8
- qj2bj,
9
- bj2qj,
10
- get_diacritic_variant,
11
- clean_html,
12
- remove_spaces_between_chinese_characters, clean_unicode_alnum, normalize_nfkc,
13
- )
14
-
15
-
16
- @lru_cache(maxsize=1)
17
- def get_cc():
18
- from opencc import OpenCC
19
-
20
- # pip install opencc-python-reimplemented
21
- cc = OpenCC("t2s") # t2s是繁体转简体
22
- return cc
23
-
24
-
25
- class StringClear(object):
26
- def __init__(self, obj_str):
27
- self.obj_str = obj_str
28
-
29
- def None_to_str(self):
30
- if self.obj_str is None:
31
- self.obj_str = ""
32
- return self
33
-
34
- def to_str(self):
35
- self.obj_str = str(self.obj_str)
36
- return self
37
-
38
- def qj_to_bj(self):
39
- # 全角变半角
40
- self.obj_str = qj2bj(self.obj_str)
41
- return self
42
-
43
- def bj_to_qj(self):
44
- # 半角变全角
45
- self.obj_str = bj2qj(self.obj_str)
46
- return self
47
-
48
- def convert_to_simplified(self):
49
- # 繁体转简体
50
- self.obj_str = get_cc().convert(self.obj_str)
51
- return self
52
-
53
- def lower(self):
54
- self.obj_str = self.obj_str.lower()
55
- return self
56
-
57
- def upper(self):
58
- self.obj_str = self.obj_str.upper()
59
- return self
60
-
61
- def collapse_spaces(self):
62
- # 移除多余空格,连续多个空格变一个
63
- self.obj_str = re.sub(r"\s+", " ", self.obj_str)
64
- return self
65
-
66
- def clear_all_spaces(self):
67
- # 去除所有空格
68
- self.obj_str = re.sub("\\s+", "", self.obj_str)
69
- return self
70
-
71
- def clean_symbols(self):
72
- """
73
- 清理已知的符号
74
- 旧版: "[\\p{P}~`=¥×\\\\*#$^|+%&~!,:.;'/{}()\\[\\]?<> 《》”“\\-()。≤《〈〉》—、·―–‐‘’“”″…¨〔〕°■『』℃ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ]"
75
- """
76
- pattern = (
77
- r"[\p{P}" # 所有 Unicode 标点符号
78
- r"~`=¥×\\*#$^|+%&~<> " # 未被 \p{P} 覆盖的特殊符号
79
- r"”“\-≤—―–‐‘’“”″…¨°■℃" # 其他未覆盖的标点和符号
80
- r"ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ" # 罗马数字
81
- r"]"
82
- )
83
-
84
- self.obj_str = regex.sub(
85
- pattern, "", self.obj_str
86
- ) # \\p{P} 标点符号 后面的是一些其他符号, 也可以用 \p{S} 代替 但是这个很广 可能有误伤
87
- return self
88
-
89
- def remove_special_chars(self):
90
- # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
91
- self.obj_str = re.sub(r"[^\w\s]", "", self.obj_str)
92
- return self
93
-
94
- def remove_all_symbols(self):
95
- # 一种更加强力的符号清理 只保留各个国家的字符 和各个国家的数字
96
- self.obj_str = clean_unicode_alnum(self.obj_str)
97
- return self
98
-
99
- def remove_underline(self):
100
- # 下划线在 \w 中 所以这里独立封装
101
- self.obj_str = re.sub("[_]", "", self.obj_str)
102
- return self
103
-
104
- def replace_dash_with_space(self):
105
- # 横杆转空格
106
- self.obj_str = self.obj_str.replace("-", " ")
107
- return self
108
-
109
- def strip_quotes(self):
110
- # 清理 双引号
111
- self.obj_str = self.obj_str.replace('"', "")
112
- return self
113
-
114
- def remove_diacritics(self):
115
- """
116
- 和 clear_nkfc的关键区别 不去除连字
117
- """
118
- # 去除音标 转换成字母
119
- self.obj_str = get_diacritic_variant(self.obj_str)
120
- return self
121
-
122
- def clear_nkfc(self):
123
- self.obj_str = normalize_nfkc(self.obj_str)
124
- return self
125
-
126
-
127
- def remove_brackets(self):
128
- # 移除 方括号里面的内容
129
- self.obj_str = re.sub("\\[.*?]", "", self.obj_str)
130
- return self
131
-
132
- def remove_parentheses(self):
133
- # 移除圆括号的内容
134
- self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
135
- return self
136
-
137
- def remove_html_tag(self):
138
- # 去除 html 标签
139
- import html
140
-
141
- self.obj_str = html.unescape(self.obj_str)
142
-
143
- self.obj_str = clean_html(self.obj_str)
144
-
145
- return self
146
-
147
- def remove_spaces_in_chinese_characters(self):
148
- # 匹配中文间的空格并替换为空字符串
149
- self.obj_str = remove_spaces_between_chinese_characters(self.obj_str)
150
- return self
151
-
152
- def url_to_str(self):
153
- """
154
- url 编码转字符
155
- """
156
- self.obj_str = unquote(self.obj_str)
157
- return self
158
-
159
- def ascii_text(self):
160
- # 只保留 ASCII 范围内的可见字符:空格(32) 到 ~ (126)
161
- self.obj_str = ''.join(c for c in self.obj_str if 32 <= ord(c) <= 126)
162
- return self
163
-
164
-
165
- def get_str(self):
166
- return self.obj_str
167
-
168
-
169
- def rel_clear(str_obj):
170
- # 为融合数据定制的 清理规则
171
- return (
172
- StringClear(str_obj)
173
- .None_to_str() # 空对象转str 防止空对象
174
- .to_str() # 防止其他类型传入 比如 int double
175
- .qj_to_bj() # 全角转半角
176
- .remove_html_tag() # html标签清理
177
- .remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
178
- .collapse_spaces() # 移除多余空格,连续多个空格变一个
179
- .remove_spaces_in_chinese_characters() # 匹配中文间的空格并替换为空字符串
180
- .convert_to_simplified() # 繁体转简体
181
- .lower() # 小写
182
- .get_str() # 获取str
183
- .strip()
184
- ) # 去掉空格
185
-
186
-
187
- def clear_au_organ(str_obj):
188
- """
189
- 为作者机构定制的清理 与上面比除了不转小写外 还多了些特殊的清理
190
- """
191
- strs = (
192
- StringClear(str_obj)
193
- .None_to_str() # None 转 空字符串
194
- .to_str() # 防止其他类型传入 比如 int double
195
- .qj_to_bj() # 全角转半角
196
- .strip_quotes() # 清理 双引号
197
- .clean_symbols() # 清理已知的符号
198
- .collapse_spaces() # 移除多余空格,连续多个空格变一个
199
- .convert_to_simplified() # 繁体转简体
200
- .get_str() # 获取str
201
- .strip() # 去掉空格
202
- )
203
-
204
- strs = strs.replace("lt正gt", "").strip() # 特殊需求
205
- return strs
206
-
207
-
208
- def ref_clear(str_obj):
209
- # 为 引文 数据定制的清理
210
- strs = (
211
- StringClear(str_obj)
212
- .None_to_str() # None 转 空字符串
213
- .remove_html_tag() # 清理html标签
214
- .to_str() # 防止其他类型传入 比如 int double
215
- .qj_to_bj() # 全角转半角
216
- .strip_quotes() # 清理 双引号
217
- .clean_symbols() # 清理已知的符号
218
- .collapse_spaces() # 移除多余空格,连续多个空格变一个
219
- .lower() # 小写
220
- .remove_diacritics() # 去除音标 转换成字母
221
- .get_str() # 获取str
222
- .strip() # 去掉空格
223
- )
224
- return strs
225
-
226
-
227
- def clear_obj(str_obj):
228
- # 为对象化定制的清理
229
- str_obj = clear_au_organ(str_obj)
230
- # str_obj = str_obj.replace("ß", "SS") # "ß" 的 大写就是 "SS"
231
- result = (
232
- StringClear(str_obj)
233
- .remove_diacritics() # 清理音标
234
- .upper()
235
- .get_str() # 获取str
236
- .strip() # 去掉空格
237
- )
238
- return result
239
-
240
-
241
- def normalize_title_for_es(title: str):
242
- _title = StringClear(title).convert_to_simplified().qj_to_bj().get_str()
243
- has_chinese = re.search(r"[\u4e00-\u9fa5]", _title)
244
- if not has_chinese:
245
- _title = re.sub(r"[-—‑–−―-]", " ", _title)
246
- return _title.strip()
1
+ import re
2
+ from functools import lru_cache
3
+ from urllib.parse import unquote
4
+
5
+ import regex
6
+
7
+ from re_common.v2.baselibrary.utils.stringutils import (
8
+ qj2bj,
9
+ bj2qj,
10
+ get_diacritic_variant,
11
+ clean_html,
12
+ remove_spaces_between_chinese_characters, clean_unicode_alnum, normalize_nfkc,
13
+ )
14
+
15
+
16
+ @lru_cache(maxsize=1)
17
+ def get_cc():
18
+ from opencc import OpenCC
19
+
20
+ # pip install opencc-python-reimplemented
21
+ cc = OpenCC("t2s") # t2s是繁体转简体
22
+ return cc
23
+
24
+
25
+ class StringClear(object):
26
+ def __init__(self, obj_str):
27
+ self.obj_str = obj_str
28
+
29
+ def None_to_str(self):
30
+ if self.obj_str is None:
31
+ self.obj_str = ""
32
+ return self
33
+
34
+ def to_str(self):
35
+ self.obj_str = str(self.obj_str)
36
+ return self
37
+
38
+ def qj_to_bj(self):
39
+ # 全角变半角
40
+ self.obj_str = qj2bj(self.obj_str)
41
+ return self
42
+
43
+ def bj_to_qj(self):
44
+ # 半角变全角
45
+ self.obj_str = bj2qj(self.obj_str)
46
+ return self
47
+
48
+ def convert_to_simplified(self):
49
+ # 繁体转简体
50
+ self.obj_str = get_cc().convert(self.obj_str)
51
+ return self
52
+
53
+ def lower(self):
54
+ self.obj_str = self.obj_str.lower()
55
+ return self
56
+
57
+ def upper(self):
58
+ self.obj_str = self.obj_str.upper()
59
+ return self
60
+
61
+ def collapse_spaces(self):
62
+ # 移除多余空格,连续多个空格变一个
63
+ self.obj_str = re.sub(r"\s+", " ", self.obj_str)
64
+ return self
65
+
66
+ def clear_all_spaces(self):
67
+ # 去除所有空格
68
+ self.obj_str = re.sub("\\s+", "", self.obj_str)
69
+ return self
70
+
71
+ def clean_symbols(self):
72
+ """
73
+ 清理已知的符号
74
+ 旧版: "[\\p{P}~`=¥×\\\\*#$^|+%&~!,:.;'/{}()\\[\\]?<> 《》”“\\-()。≤《〈〉》—、·―–‐‘’“”″…¨〔〕°■『』℃ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ]"
75
+ """
76
+ pattern = (
77
+ r"[\p{P}" # 所有 Unicode 标点符号
78
+ r"~`=¥×\\*#$^|+%&~<> " # 未被 \p{P} 覆盖的特殊符号
79
+ r"”“\-≤—―–‐‘’“”″…¨°■℃" # 其他未覆盖的标点和符号
80
+ r"ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ" # 罗马数字
81
+ r"]"
82
+ )
83
+
84
+ self.obj_str = regex.sub(
85
+ pattern, "", self.obj_str
86
+ ) # \\p{P} 标点符号 后面的是一些其他符号, 也可以用 \p{S} 代替 但是这个很广 可能有误伤
87
+ return self
88
+
89
+ def remove_special_chars(self):
90
+ # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
91
+ self.obj_str = re.sub(r"[^\w\s]", "", self.obj_str)
92
+ return self
93
+
94
+ def remove_all_symbols(self):
95
+ # 一种更加强力的符号清理 只保留各个国家的字符 和各个国家的数字
96
+ self.obj_str = clean_unicode_alnum(self.obj_str)
97
+ return self
98
+
99
+ def remove_underline(self):
100
+ # 下划线在 \w 中 所以这里独立封装
101
+ self.obj_str = re.sub("[_]", "", self.obj_str)
102
+ return self
103
+
104
+ def replace_dash_with_space(self):
105
+ # 横杆转空格
106
+ self.obj_str = self.obj_str.replace("-", " ")
107
+ return self
108
+
109
+ def strip_quotes(self):
110
+ # 清理 双引号
111
+ self.obj_str = self.obj_str.replace('"', "")
112
+ return self
113
+
114
+ def remove_diacritics(self):
115
+ """
116
+ 和 clear_nkfc的关键区别 不去除连字
117
+ """
118
+ # 去除音标 转换成字母
119
+ self.obj_str = get_diacritic_variant(self.obj_str)
120
+ return self
121
+
122
+ def clear_nkfc(self):
123
+ self.obj_str = normalize_nfkc(self.obj_str)
124
+ return self
125
+
126
+
127
+ def remove_brackets(self):
128
+ # 移除 方括号里面的内容
129
+ self.obj_str = re.sub("\\[.*?]", "", self.obj_str)
130
+ return self
131
+
132
+ def remove_parentheses(self):
133
+ # 移除圆括号的内容
134
+ self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
135
+ return self
136
+
137
+ def remove_html_tag(self):
138
+ # 去除 html 标签
139
+ import html
140
+
141
+ self.obj_str = html.unescape(self.obj_str)
142
+
143
+ self.obj_str = clean_html(self.obj_str)
144
+
145
+ return self
146
+
147
+ def remove_spaces_in_chinese_characters(self):
148
+ # 匹配中文间的空格并替换为空字符串
149
+ self.obj_str = remove_spaces_between_chinese_characters(self.obj_str)
150
+ return self
151
+
152
+ def url_to_str(self):
153
+ """
154
+ url 编码转字符
155
+ """
156
+ self.obj_str = unquote(self.obj_str)
157
+ return self
158
+
159
+ def ascii_text(self):
160
+ # 只保留 ASCII 范围内的可见字符:空格(32) 到 ~ (126)
161
+ self.obj_str = ''.join(c for c in self.obj_str if 32 <= ord(c) <= 126)
162
+ return self
163
+
164
+
165
+ def get_str(self):
166
+ return self.obj_str
167
+
168
+
169
+ def rel_clear(str_obj):
170
+ # 为融合数据定制的 清理规则
171
+ return (
172
+ StringClear(str_obj)
173
+ .None_to_str() # 空对象转str 防止空对象
174
+ .to_str() # 防止其他类型传入 比如 int double
175
+ .qj_to_bj() # 全角转半角
176
+ .remove_html_tag() # html标签清理
177
+ .remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
178
+ .collapse_spaces() # 移除多余空格,连续多个空格变一个
179
+ .remove_spaces_in_chinese_characters() # 匹配中文间的空格并替换为空字符串
180
+ .convert_to_simplified() # 繁体转简体
181
+ .lower() # 小写
182
+ .get_str() # 获取str
183
+ .strip()
184
+ ) # 去掉空格
185
+
186
+
187
+ def clear_au_organ(str_obj):
188
+ """
189
+ 为作者机构定制的清理 与上面比除了不转小写外 还多了些特殊的清理
190
+ """
191
+ strs = (
192
+ StringClear(str_obj)
193
+ .None_to_str() # None 转 空字符串
194
+ .to_str() # 防止其他类型传入 比如 int double
195
+ .qj_to_bj() # 全角转半角
196
+ .strip_quotes() # 清理 双引号
197
+ .clean_symbols() # 清理已知的符号
198
+ .collapse_spaces() # 移除多余空格,连续多个空格变一个
199
+ .convert_to_simplified() # 繁体转简体
200
+ .get_str() # 获取str
201
+ .strip() # 去掉空格
202
+ )
203
+
204
+ strs = strs.replace("lt正gt", "").strip() # 特殊需求
205
+ return strs
206
+
207
+
208
+ def ref_clear(str_obj):
209
+ # 为 引文 数据定制的清理
210
+ strs = (
211
+ StringClear(str_obj)
212
+ .None_to_str() # None 转 空字符串
213
+ .remove_html_tag() # 清理html标签
214
+ .to_str() # 防止其他类型传入 比如 int double
215
+ .qj_to_bj() # 全角转半角
216
+ .strip_quotes() # 清理 双引号
217
+ .clean_symbols() # 清理已知的符号
218
+ .collapse_spaces() # 移除多余空格,连续多个空格变一个
219
+ .lower() # 小写
220
+ .remove_diacritics() # 去除音标 转换成字母
221
+ .get_str() # 获取str
222
+ .strip() # 去掉空格
223
+ )
224
+ return strs
225
+
226
+
227
+ def clear_obj(str_obj):
228
+ # 为对象化定制的清理
229
+ str_obj = clear_au_organ(str_obj)
230
+ # str_obj = str_obj.replace("ß", "SS") # "ß" 的 大写就是 "SS"
231
+ result = (
232
+ StringClear(str_obj)
233
+ .remove_diacritics() # 清理音标
234
+ .upper()
235
+ .get_str() # 获取str
236
+ .strip() # 去掉空格
237
+ )
238
+ return result
239
+
240
+
241
+ def normalize_title_for_es(title: str):
242
+ _title = StringClear(title).convert_to_simplified().qj_to_bj().get_str()
243
+ has_chinese = re.search(r"[\u4e00-\u9fa5]", _title)
244
+ if not has_chinese:
245
+ _title = re.sub(r"[-—‑–−―-]", " ", _title)
246
+ return _title.strip()
@@ -1,18 +1,18 @@
1
- import jellyfish
2
- from rapidfuzz.distance import DamerauLevenshtein
3
-
4
-
5
- class JaroDamerauLevenshteinMaxSim(object):
6
- """
7
- jaro_similarity 有缺陷 以下样例数据会导致分很低
8
- s1 = "in situ monitoring of semiconductor wafer temperature using infrared interfe rometry"
9
- s2 = "insitu monitoring of semiconductor wafer temperature using infrared interferometry"
10
- """
11
-
12
- def get_sim(self, str1: str, str2: str) -> float:
13
- similarity1 = jellyfish.jaro_similarity(str1, str2)
14
- if str1.strip() == "" and str2.strip() == "":
15
- similarity2 = 0
16
- else:
17
- similarity2 = 1 - DamerauLevenshtein.normalized_distance(str1, str2)
18
- return max(similarity1, similarity2)
1
+ import jellyfish
2
+ from rapidfuzz.distance import DamerauLevenshtein
3
+
4
+
5
+ class JaroDamerauLevenshteinMaxSim(object):
6
+ """
7
+ jaro_similarity 有缺陷 以下样例数据会导致分很低
8
+ s1 = "in situ monitoring of semiconductor wafer temperature using infrared interfe rometry"
9
+ s2 = "insitu monitoring of semiconductor wafer temperature using infrared interferometry"
10
+ """
11
+
12
+ def get_sim(self, str1: str, str2: str) -> float:
13
+ similarity1 = jellyfish.jaro_similarity(str1, str2)
14
+ if str1.strip() == "" and str2.strip() == "":
15
+ similarity2 = 0
16
+ else:
17
+ similarity2 = 1 - DamerauLevenshtein.normalized_distance(str1, str2)
18
+ return max(similarity1, similarity2)