re-common 10.0.22__py3-none-any.whl → 10.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +195 -0
  145. re_common/v2/baselibrary/business_utils/__init__.py +0 -0
  146. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -0
  147. re_common/v2/baselibrary/decorators/utils.py +59 -59
  148. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  149. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -79
  150. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  151. re_common/v2/baselibrary/tools/dict_tools.py +37 -37
  152. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  153. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  154. re_common/v2/baselibrary/tools/list_tools.py +65 -65
  155. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  156. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  157. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  158. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  159. re_common/v2/baselibrary/utils/author_smi.py +360 -360
  160. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  161. re_common/v2/baselibrary/utils/basedict.py +37 -37
  162. re_common/v2/baselibrary/utils/basehdfs.py +161 -161
  163. re_common/v2/baselibrary/utils/basepika.py +180 -180
  164. re_common/v2/baselibrary/utils/basetime.py +77 -77
  165. re_common/v2/baselibrary/utils/db.py +38 -38
  166. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  167. re_common/v2/baselibrary/utils/mq.py +83 -83
  168. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  169. re_common/v2/baselibrary/utils/string_bool.py +186 -149
  170. re_common/v2/baselibrary/utils/string_clear.py +227 -204
  171. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  172. re_common/v2/baselibrary/utils/stringutils.py +213 -213
  173. re_common/vip/base_step_process.py +11 -11
  174. re_common/vip/baseencodeid.py +90 -90
  175. re_common/vip/changetaskname.py +28 -28
  176. re_common/vip/core_var.py +24 -24
  177. re_common/vip/mmh3Hash.py +89 -89
  178. re_common/vip/proxy/allproxys.py +127 -127
  179. re_common/vip/proxy/allproxys_thread.py +159 -159
  180. re_common/vip/proxy/cnki_proxy.py +153 -153
  181. re_common/vip/proxy/kuaidaili.py +87 -87
  182. re_common/vip/proxy/proxy_all.py +113 -113
  183. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  184. re_common/vip/proxy/wanfang_proxy.py +152 -152
  185. re_common/vip/proxy/wp_proxy_all.py +181 -181
  186. re_common/vip/read_rawid_to_txt.py +91 -91
  187. re_common/vip/title/__init__.py +5 -5
  188. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  189. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  190. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  191. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  192. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  193. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  194. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  195. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  196. re_common/vip/title/transform/__init__.py +10 -10
  197. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/LICENSE +201 -201
  198. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/METADATA +16 -16
  199. re_common-10.0.24.dist-info/RECORD +230 -0
  200. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/WHEEL +1 -1
  201. re_common-10.0.22.dist-info/RECORD +0 -227
  202. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/top_level.txt +0 -0
@@ -1,204 +1,227 @@
1
- import re
2
- from urllib.parse import unquote
3
-
4
- import regex
5
-
6
- from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant, clean_html, \
7
- remove_spaces_between_chinese_characters
8
-
9
- from opencc import OpenCC
10
-
11
- # pip install opencc-python-reimplemented
12
- cc = OpenCC("t2s") # t2s是繁体转简体
13
-
14
-
15
- class StringClear(object):
16
-
17
- def __init__(self, obj_str):
18
- self.obj_str = obj_str
19
-
20
- def None_to_str(self):
21
- if self.obj_str is None:
22
- self.obj_str = ''
23
- return self
24
-
25
- def to_str(self):
26
- self.obj_str = str(self.obj_str)
27
- return self
28
-
29
- def qj_to_bj(self):
30
- # 全角变半角
31
- self.obj_str = qj2bj(self.obj_str)
32
- return self
33
-
34
- def bj_to_qj(self):
35
- # 半角变全角
36
- self.obj_str = bj2qj(self.obj_str)
37
- return self
38
-
39
- def convert_to_simplified(self):
40
- # 繁体转简体
41
- self.obj_str = cc.convert(self.obj_str)
42
- return self
43
-
44
- def lower(self):
45
- self.obj_str = self.obj_str.lower()
46
- return self
47
-
48
- def upper(self):
49
- self.obj_str = self.obj_str.upper()
50
- return self
51
-
52
- def collapse_spaces(self):
53
- # 移除多余空格,连续多个空格变一个
54
- self.obj_str = re.sub(r"\s+", " ", self.obj_str)
55
- return self
56
-
57
- def clear_all_spaces(self):
58
- # 去除所有空格
59
- self.obj_str = re.sub("\\s+", "", self.obj_str)
60
- return self
61
-
62
- def clean_symbols(self):
63
- """
64
- 清理已知的符号
65
- 旧版: "[\\p{P}~`=¥×\\\\*#$^|+%&~!,:.;'/{}()\\[\\]?<> 《》”“\\-()。≤《〈〉》—、·―–‐‘’“”″…¨〔〕°■『』℃ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ]"
66
- """
67
- pattern = (
68
- r"[\p{P}" # 所有 Unicode 标点符号
69
- r"~`=¥×\\*#$^|+%&~<> " # 未被 \p{P} 覆盖的特殊符号
70
- r"”“\-≤—―–‐‘’“”″…¨°■℃" # 其他未覆盖的标点和符号
71
- r"ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ" # 罗马数字
72
- r"]"
73
- )
74
-
75
- self.obj_str = regex.sub(
76
- pattern,
77
- "",
78
- self.obj_str) # \\p{P} 标点符号 后面的是一些其他符号, 也可以用 \p{S} 代替 但是这个很广 可能有误伤
79
- return self
80
-
81
- def remove_special_chars(self):
82
- # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
83
- self.obj_str = re.sub(r"[^\w\s]", "", self.obj_str)
84
- return self
85
-
86
- def remove_underline(self):
87
- # 下划线在 \w 中 所以这里独立封装
88
- self.obj_str = re.sub("[_]", "", self.obj_str)
89
- return self
90
-
91
- def replace_dash_with_space(self):
92
- # 横杆转空格
93
- self.obj_str = self.obj_str.replace("-", " ")
94
- return self
95
-
96
- def strip_quotes(self):
97
- # 清理 双引号
98
- self.obj_str = self.obj_str.replace("\"", "")
99
- return self
100
-
101
- def remove_diacritics(self):
102
- # 去除音标 转换成字母
103
- self.obj_str = get_diacritic_variant(self.obj_str)
104
- return self
105
-
106
- def remove_brackets(self):
107
- # 移除 方括号里面的内容
108
- self.obj_str = re.sub("\\[.*?]", "", self.obj_str)
109
- return self
110
-
111
- def remove_parentheses(self):
112
- # 移除圆括号的内容
113
- self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
114
- return self
115
-
116
- def remove_html_tag(self):
117
- # 去除 html 标签
118
- import html
119
-
120
- self.obj_str = html.unescape(self.obj_str)
121
-
122
- self.obj_str = clean_html(self.obj_str)
123
-
124
- return self
125
-
126
- def remove_spaces_in_chinese_characters(self):
127
- # 匹配中文间的空格并替换为空字符串
128
- self.obj_str = remove_spaces_between_chinese_characters(self.obj_str)
129
- return self
130
-
131
- def url_to_str(self):
132
- """
133
- url 编码转字符
134
- """
135
- self.obj_str = unquote(self.obj_str)
136
- return self
137
-
138
- def get_str(self):
139
- return self.obj_str
140
-
141
-
142
- def rel_clear(str_obj):
143
- # 为融合数据定制的 清理规则
144
- return (StringClear(str_obj)
145
- .None_to_str() # 空对象转str 防止空对象
146
- .to_str() # 防止其他类型传入 比如 int double
147
- .qj_to_bj() # 全角转半角
148
- .remove_html_tag() # html标签清理
149
- .remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
150
- .collapse_spaces() # 移除多余空格,连续多个空格变一个
151
- .remove_spaces_in_chinese_characters() # 匹配中文间的空格并替换为空字符串
152
- .lower() # 小写
153
- .get_str() # 获取str
154
- .strip()) # 去掉空格
155
-
156
-
157
- def clear_au_organ(str_obj):
158
- """
159
- 为作者机构定制的清理 与上面比除了不转小写外 还多了些特殊的清理
160
- """
161
- strs = (StringClear(str_obj)
162
- .None_to_str() # None 转 空字符串
163
- .to_str() # 防止其他类型传入 比如 int double
164
- .qj_to_bj() # 全角转半角
165
- .strip_quotes() # 清理 双引号
166
- .clean_symbols() # 清理已知的符号
167
- .collapse_spaces() # 移除多余空格,连续多个空格变一个
168
- .get_str() # 获取str
169
- .strip() # 去掉空格
170
- )
171
-
172
- strs = strs.replace("lt正gt", "").strip() # 特殊需求
173
- return strs
174
-
175
-
176
- def ref_clear(str_obj):
177
- # 引文 数据定制的清理
178
- strs = (StringClear(str_obj)
179
- .None_to_str() # None 转 空字符串
180
- .remove_html_tag() # 清理html标签
181
- .to_str() # 防止其他类型传入 比如 int double
182
- .qj_to_bj() # 全角转半角
183
- .strip_quotes() # 清理 双引号
184
- .clean_symbols() # 清理已知的符号
185
- .collapse_spaces() # 移除多余空格,连续多个空格变一个
186
- .lower() # 小写
187
- .remove_diacritics() # 去除音标 转换成字母
188
- .get_str() # 获取str
189
- .strip() # 去掉空格
190
- )
191
- return strs
192
-
193
-
194
- def clear_obj(str_obj):
195
- # 为对象化定制的清理
196
- str_obj = clear_au_organ(str_obj)
197
- # str_obj = str_obj.replace("ß", "SS") # "ß" 的 大写就是 "SS"
198
- result = (StringClear(str_obj)
199
- .remove_diacritics() # 清理音标
200
- .upper()
201
- .get_str() # 获取str
202
- .strip() # 去掉空格
203
- )
204
- return result
1
+ import re
2
+ from functools import lru_cache
3
+ from urllib.parse import unquote
4
+
5
+ import regex
6
+
7
+ from re_common.v2.baselibrary.utils.stringutils import (
8
+ qj2bj,
9
+ bj2qj,
10
+ get_diacritic_variant,
11
+ clean_html,
12
+ remove_spaces_between_chinese_characters,
13
+ )
14
+
15
+
16
+ @lru_cache(maxsize=1)
17
+ def get_cc():
18
+ from opencc import OpenCC
19
+
20
+ # pip install opencc-python-reimplemented
21
+ cc = OpenCC("t2s") # t2s是繁体转简体
22
+ return cc
23
+
24
+
25
+ class StringClear(object):
26
+ def __init__(self, obj_str):
27
+ self.obj_str = obj_str
28
+
29
+ def None_to_str(self):
30
+ if self.obj_str is None:
31
+ self.obj_str = ""
32
+ return self
33
+
34
+ def to_str(self):
35
+ self.obj_str = str(self.obj_str)
36
+ return self
37
+
38
+ def qj_to_bj(self):
39
+ # 全角变半角
40
+ self.obj_str = qj2bj(self.obj_str)
41
+ return self
42
+
43
+ def bj_to_qj(self):
44
+ # 半角变全角
45
+ self.obj_str = bj2qj(self.obj_str)
46
+ return self
47
+
48
+ def convert_to_simplified(self):
49
+ # 繁体转简体
50
+ self.obj_str = get_cc().convert(self.obj_str)
51
+ return self
52
+
53
+ def lower(self):
54
+ self.obj_str = self.obj_str.lower()
55
+ return self
56
+
57
+ def upper(self):
58
+ self.obj_str = self.obj_str.upper()
59
+ return self
60
+
61
+ def collapse_spaces(self):
62
+ # 移除多余空格,连续多个空格变一个
63
+ self.obj_str = re.sub(r"\s+", " ", self.obj_str)
64
+ return self
65
+
66
+ def clear_all_spaces(self):
67
+ # 去除所有空格
68
+ self.obj_str = re.sub("\\s+", "", self.obj_str)
69
+ return self
70
+
71
+ def clean_symbols(self):
72
+ """
73
+ 清理已知的符号
74
+ 旧版: "[\\p{P}~`=¥×\\\\*#$^|+%&~!,:.;'/{}()\\[\\]?<> 《》”“\\-()。≤《〈〉》—、·―–‐‘’“”″…¨〔〕°■『』℃ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ]"
75
+ """
76
+ pattern = (
77
+ r"[\p{P}" # 所有 Unicode 标点符号
78
+ r"~`=¥×\\*#$^|+%&~<> " # 未被 \p{P} 覆盖的特殊符号
79
+ r"”“\-≤—―–‐‘’“”″…¨°■℃" # 其他未覆盖的标点和符号
80
+ r"ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ" # 罗马数字
81
+ r"]"
82
+ )
83
+
84
+ self.obj_str = regex.sub(
85
+ pattern, "", self.obj_str
86
+ ) # \\p{P} 标点符号 后面的是一些其他符号, 也可以用 \p{S} 代替 但是这个很广 可能有误伤
87
+ return self
88
+
89
+ def remove_special_chars(self):
90
+ # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
91
+ self.obj_str = re.sub(r"[^\w\s]", "", self.obj_str)
92
+ return self
93
+
94
+ def remove_underline(self):
95
+ # 下划线在 \w 中 所以这里独立封装
96
+ self.obj_str = re.sub("[_]", "", self.obj_str)
97
+ return self
98
+
99
+ def replace_dash_with_space(self):
100
+ # 横杆转空格
101
+ self.obj_str = self.obj_str.replace("-", " ")
102
+ return self
103
+
104
+ def strip_quotes(self):
105
+ # 清理 双引号
106
+ self.obj_str = self.obj_str.replace('"', "")
107
+ return self
108
+
109
+ def remove_diacritics(self):
110
+ # 去除音标 转换成字母
111
+ self.obj_str = get_diacritic_variant(self.obj_str)
112
+ return self
113
+
114
+ def remove_brackets(self):
115
+ # 移除 方括号里面的内容
116
+ self.obj_str = re.sub("\\[.*?]", "", self.obj_str)
117
+ return self
118
+
119
+ def remove_parentheses(self):
120
+ # 移除圆括号的内容
121
+ self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
122
+ return self
123
+
124
+ def remove_html_tag(self):
125
+ # 去除 html 标签
126
+ import html
127
+
128
+ self.obj_str = html.unescape(self.obj_str)
129
+
130
+ self.obj_str = clean_html(self.obj_str)
131
+
132
+ return self
133
+
134
+ def remove_spaces_in_chinese_characters(self):
135
+ # 匹配中文间的空格并替换为空字符串
136
+ self.obj_str = remove_spaces_between_chinese_characters(self.obj_str)
137
+ return self
138
+
139
+ def url_to_str(self):
140
+ """
141
+ url 编码转字符
142
+ """
143
+ self.obj_str = unquote(self.obj_str)
144
+ return self
145
+
146
+ def get_str(self):
147
+ return self.obj_str
148
+
149
+
150
+ def rel_clear(str_obj):
151
+ # 为融合数据定制的 清理规则
152
+ return (
153
+ StringClear(str_obj)
154
+ .None_to_str() # 空对象转str 防止空对象
155
+ .to_str() # 防止其他类型传入 比如 int double
156
+ .qj_to_bj() # 全角转半角
157
+ .remove_html_tag() # html标签清理
158
+ .remove_special_chars() # 移除特殊字符,仅保留字母、数字、空格和汉字 \w 已经包括所有 Unicode 字母 下划线 _ 会被保留
159
+ .collapse_spaces() # 移除多余空格,连续多个空格变一个
160
+ .remove_spaces_in_chinese_characters() # 匹配中文间的空格并替换为空字符串
161
+ .convert_to_simplified() # 繁体转简体
162
+ .lower() # 小写
163
+ .get_str() # 获取str
164
+ .strip()
165
+ ) # 去掉空格
166
+
167
+
168
+ def clear_au_organ(str_obj):
169
+ """
170
+ 为作者机构定制的清理 与上面比除了不转小写外 还多了些特殊的清理
171
+ """
172
+ strs = (
173
+ StringClear(str_obj)
174
+ .None_to_str() # None 转 空字符串
175
+ .to_str() # 防止其他类型传入 比如 int double
176
+ .qj_to_bj() # 全角转半角
177
+ .strip_quotes() # 清理 双引号
178
+ .clean_symbols() # 清理已知的符号
179
+ .collapse_spaces() # 移除多余空格,连续多个空格变一个
180
+ .convert_to_simplified() # 繁体转简体
181
+ .get_str() # 获取str
182
+ .strip() # 去掉空格
183
+ )
184
+
185
+ strs = strs.replace("lt正gt", "").strip() # 特殊需求
186
+ return strs
187
+
188
+
189
+ def ref_clear(str_obj):
190
+ # 为 引文 数据定制的清理
191
+ strs = (
192
+ StringClear(str_obj)
193
+ .None_to_str() # None 转 空字符串
194
+ .remove_html_tag() # 清理html标签
195
+ .to_str() # 防止其他类型传入 比如 int double
196
+ .qj_to_bj() # 全角转半角
197
+ .strip_quotes() # 清理 双引号
198
+ .clean_symbols() # 清理已知的符号
199
+ .collapse_spaces() # 移除多余空格,连续多个空格变一个
200
+ .lower() # 小写
201
+ .remove_diacritics() # 去除音标 转换成字母
202
+ .get_str() # 获取str
203
+ .strip() # 去掉空格
204
+ )
205
+ return strs
206
+
207
+
208
+ def clear_obj(str_obj):
209
+ # 为对象化定制的清理
210
+ str_obj = clear_au_organ(str_obj)
211
+ # str_obj = str_obj.replace("ß", "SS") # "ß" 的 大写就是 "SS"
212
+ result = (
213
+ StringClear(str_obj)
214
+ .remove_diacritics() # 清理音标
215
+ .upper()
216
+ .get_str() # 获取str
217
+ .strip() # 去掉空格
218
+ )
219
+ return result
220
+
221
+
222
+ def normalize_title_for_es(title: str):
223
+ _title = StringClear(title).convert_to_simplified().qj_to_bj().get_str()
224
+ has_chinese = re.search(r"[\u4e00-\u9fa5]", _title)
225
+ if not has_chinese:
226
+ _title = re.sub(r"[-—‑–−―-]", " ", _title)
227
+ return _title.strip()
@@ -1,18 +1,18 @@
1
- import jellyfish
2
- from rapidfuzz.distance import DamerauLevenshtein
3
-
4
-
5
- class JaroDamerauLevenshteinMaxSim(object):
6
- """
7
- jaro_similarity 有缺陷 以下样例数据会导致分很低
8
- s1 = "in situ monitoring of semiconductor wafer temperature using infrared interfe rometry"
9
- s2 = "insitu monitoring of semiconductor wafer temperature using infrared interferometry"
10
- """
11
-
12
- def get_sim(self, str1: str, str2: str) -> float:
13
- similarity1 = jellyfish.jaro_similarity(str1, str2)
14
- if str1.strip() == "" and str2.strip() == "":
15
- similarity2 = 0
16
- else:
17
- similarity2 = 1 - DamerauLevenshtein.normalized_distance(str1, str2)
18
- return max(similarity1, similarity2)
1
+ import jellyfish
2
+ from rapidfuzz.distance import DamerauLevenshtein
3
+
4
+
5
+ class JaroDamerauLevenshteinMaxSim(object):
6
+ """
7
+ jaro_similarity 有缺陷 以下样例数据会导致分很低
8
+ s1 = "in situ monitoring of semiconductor wafer temperature using infrared interfe rometry"
9
+ s2 = "insitu monitoring of semiconductor wafer temperature using infrared interferometry"
10
+ """
11
+
12
+ def get_sim(self, str1: str, str2: str) -> float:
13
+ similarity1 = jellyfish.jaro_similarity(str1, str2)
14
+ if str1.strip() == "" and str2.strip() == "":
15
+ similarity2 = 0
16
+ else:
17
+ similarity2 = 1 - DamerauLevenshtein.normalized_distance(str1, str2)
18
+ return max(similarity1, similarity2)