re-common 10.0.37__py3-none-any.whl → 10.0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217) hide show
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  172. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  173. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  174. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  175. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  176. re_common/v2/baselibrary/utils/basedict.py +37 -37
  177. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  178. re_common/v2/baselibrary/utils/basepika.py +180 -180
  179. re_common/v2/baselibrary/utils/basetime.py +77 -77
  180. re_common/v2/baselibrary/utils/db.py +156 -156
  181. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  182. re_common/v2/baselibrary/utils/mq.py +83 -83
  183. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  184. re_common/v2/baselibrary/utils/string_bool.py +186 -186
  185. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  186. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  187. re_common/v2/baselibrary/utils/stringutils.py +271 -278
  188. re_common/vip/base_step_process.py +11 -11
  189. re_common/vip/baseencodeid.py +90 -90
  190. re_common/vip/changetaskname.py +28 -28
  191. re_common/vip/core_var.py +24 -24
  192. re_common/vip/mmh3Hash.py +89 -89
  193. re_common/vip/proxy/allproxys.py +127 -127
  194. re_common/vip/proxy/allproxys_thread.py +159 -159
  195. re_common/vip/proxy/cnki_proxy.py +153 -153
  196. re_common/vip/proxy/kuaidaili.py +87 -87
  197. re_common/vip/proxy/proxy_all.py +113 -113
  198. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  199. re_common/vip/proxy/wanfang_proxy.py +152 -152
  200. re_common/vip/proxy/wp_proxy_all.py +181 -181
  201. re_common/vip/read_rawid_to_txt.py +91 -91
  202. re_common/vip/title/__init__.py +5 -5
  203. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  204. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  205. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  206. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  207. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  208. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  209. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  210. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  211. re_common/vip/title/transform/__init__.py +10 -10
  212. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
  213. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
  214. re_common-10.0.39.dist-info/RECORD +248 -0
  215. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
  216. re_common-10.0.37.dist-info/RECORD +0 -248
  217. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
@@ -1,220 +1,220 @@
1
- # 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
2
- import re
3
-
4
- from re_common.v2.baselibrary.utils.author_smi import AuthorRatio
5
- from re_common.v2.baselibrary.utils.string_bool import is_all_symbols
6
- from re_common.v2.baselibrary.utils.string_clear import rel_clear
7
- from re_common.v2.baselibrary.utils.stringutils import clean_unicode_alnum, get_alphabetic_ratio
8
-
9
-
10
- def clean_organ_postcode(organ):
11
- """
12
- 格式化组织名称字符串,移除括号内容并删除独立的6位数字(邮政编码),然后清理标点。
13
-
14
- 备注: 该方法替换java 里面的 formatOrgan
15
-
16
- 参数:
17
- organ (str): 输入的组织名称字符串,可能包含括号、分号和邮政编码。
18
-
19
- 返回:
20
- str: 格式化并清理后的组织名称字符串(无独立6位数字)。
21
- """
22
- # 如果输入为空,设为空字符串以避免后续操作报错
23
- if not organ:
24
- organ = ""
25
-
26
- # 删除方括号和圆括号中的内容(包括括号本身)
27
- organ = re.sub(r"\[.*?\]", "", organ) # 非贪婪匹配方括号内容
28
- organ = re.sub(r"\(.*?\)", "", organ) # 非贪婪匹配圆括号内容
29
-
30
- # 定义正则表达式,匹配独立的6位数字
31
- # \b 表示单词边界,确保6位数字是独立的(前后不是字母、数字或下划线)
32
- organ = re.sub(r"\b[0-9]{6}\b", "", organ)
33
-
34
- # 初始化结果列表,用于存储处理后的组织名称部分
35
- format_organ = []
36
- # 按分号分割字符串,生成组织名称的各个部分
37
- organ_parts = organ.split(";")
38
-
39
- # 遍历每个部分,追加到结果列表
40
- for temp_organ in organ_parts:
41
- # 去除首尾多余空格后追加(避免因移除邮编导致的空字符串)
42
- cleaned_part = temp_organ.strip()
43
- # 如果首尾是标点符号,则移除
44
- # 定义标点符号的正则表达式(这里包括常见标点)
45
- punctuation = r"^[!,.?;:#$%^&*+-]+|[!,.?;:#$%^&*+-]+$"
46
- cleaned_part = re.sub(punctuation, "", cleaned_part)
47
- if cleaned_part: # 只追加非空部分
48
- format_organ.append(cleaned_part)
49
-
50
- # 用分号连接结果,转换为大写并清理标点
51
- format_organ = ";".join(format_organ)
52
-
53
- # 返回最终结果并去除首尾空格
54
- return format_organ.strip()
55
-
56
-
57
- def get_first_organ(organ):
58
- if not organ:
59
- return ""
60
- organ_list = organ.strip().split(";")
61
- for organ_one in organ_list:
62
- # 清理邮政编码
63
- organ_one = clean_organ_postcode(organ_one)
64
- if organ_one.strip():
65
- return organ_one
66
-
67
- return ""
68
-
69
-
70
- def get_first_author(author: str) -> str:
71
- if not author:
72
- return ""
73
- au_list = author.strip().split(";")
74
- for au in au_list:
75
- au = re.sub("\\[.*?]", "", au)
76
- au = re.sub("\\(.*?\\)", "", au)
77
- if au.strip():
78
- return au
79
- return ""
80
-
81
-
82
- def get_author_list(author: str):
83
- lists = []
84
- if not author:
85
- return []
86
- au_list = author.strip().split(";")
87
- for au in au_list:
88
- au = re.sub("\\[.*?]", "", au)
89
- au = re.sub("\\(.*?\\)", "", au)
90
- if au.strip():
91
- lists.append(au.strip())
92
- return lists
93
-
94
-
95
- def get_scopus_author_abbr(author_row: str):
96
- if not author_row:
97
- return ""
98
- author_list = author_row.split("&&")
99
- if len(author_list) != 3:
100
- raise Exception("错误的数据个数 可能来自其他数据源")
101
-
102
- abbr_list = author_list[0].strip().split(";")
103
- abbr_list = [author.strip() for author in abbr_list if
104
- author.strip() and author.strip().lower() not in ("*", "and")]
105
- return ";".join(abbr_list)
106
-
107
-
108
- def get_wos_author_abbr(author_row: str):
109
- if not author_row:
110
- return ""
111
- author_list = author_row.split("&&")
112
- if len(author_list) != 4:
113
- raise Exception("错误的数据个数 可能来自其他数据源")
114
- abbr_list = []
115
- abbr_list_au = author_list[0].strip().split(";")
116
- abbr_list_ba = author_list[2].strip().split(";")
117
- abbr_list.extend(abbr_list_au)
118
- abbr_list.extend(abbr_list_ba)
119
- abbr_list = [author.strip() for author in abbr_list if
120
- author.strip() and author.strip().lower() not in ("*", "and")]
121
- return ";".join(abbr_list)
122
-
123
-
124
- def deal_rel_vol(vol_str: str):
125
- """
126
- 处理 期刊融合时的卷处理逻辑
127
- """
128
-
129
- # 如果卷是全符号 清理掉
130
- if is_all_symbols(vol_str):
131
- vol_str = ""
132
-
133
- if vol_str.replace(".", "").isdigit():
134
- try:
135
- float_num = float(vol_str)
136
- if int(float_num) == float_num:
137
- return str(int(float_num))
138
- except:
139
- pass
140
-
141
- if vol_str.lower().startswith("v "):
142
- vol_str = vol_str.lower().replace("v ", "").strip()
143
- return vol_str
144
- if vol_str.lower().startswith("volume "):
145
- vol_str = vol_str.lower().replace("volume ", "").strip()
146
- return vol_str
147
- if vol_str.lower().startswith("vol. "):
148
- vol_str = vol_str.lower().replace("vol. ", "").strip()
149
- return vol_str
150
- if vol_str.lower().startswith("vol "):
151
- vol_str = vol_str.lower().replace("vol ", "").strip()
152
- return vol_str
153
- return vol_str
154
-
155
-
156
- def deal_num_strs(input_str):
157
- """
158
- int后在str 防止有浮点型的表达方式
159
- """
160
- number_list = re.findall(r'\d+', input_str)
161
- transformed_numbers = [str(int(num)) for num in number_list]
162
-
163
- # 替换原字符串中的数字为转换后的数字
164
- for num, transformed_num in zip(number_list, transformed_numbers):
165
- input_str = input_str.replace(num, transformed_num)
166
- return input_str
167
-
168
-
169
- def deal_num(num_str):
170
- """
171
- 将 期格式化 方便 group尤其是有横杆的数据
172
- 该方法 为融合二次分割时使用,如果场景合适也可以用于其他地方
173
- :param strs:
174
- :return:
175
- """
176
- # 如果期是全符号清理掉
177
- if is_all_symbols(num_str):
178
- num_str = ""
179
-
180
- if num_str.lower().startswith("n "):
181
- num_str = num_str.lower().replace("n ", "").strip()
182
-
183
- num_str = num_str.lower().replace("special_issue_", '').replace("_special_issue", '').replace("issue", "")
184
- num_str = num_str.replace("spec.", "").replace("iss.", "").replace("spl.", "").replace("special.", "").replace(
185
- "specialissue.", "")
186
- num_str = num_str.replace("spec", "").replace("iss", "").replace("spl", "").replace("special", "").replace(
187
- "specialissue", '')
188
-
189
- num_str = num_str.replace("-", "_").replace(".", "_").upper()
190
- num_str = num_str.lstrip("_").rstrip("_")
191
- if num_str.find("_") > -1:
192
- start, end = num_str.split("_")
193
- start = deal_num_strs(start)
194
- end = deal_num_strs(end)
195
- num_str = start + "_" + end
196
- else:
197
- num_str = deal_num_strs(num_str)
198
-
199
- return num_str.lower().strip()
200
-
201
-
202
- def clear_author_1st(author_str:str):
203
- # 清理括号 防止前面流程没有清理干净
204
- author_str = re.sub("\\[.*?]", "", author_str)
205
- author_str = re.sub("\\(.*?\\)", "", author_str)
206
-
207
- # 清理符号
208
- author_str = clean_unicode_alnum(author_str)
209
-
210
- return author_str
211
-
212
- def is_same_author(a1, a2):
213
- if get_alphabetic_ratio(a1.strip()) > 0.7 and get_alphabetic_ratio(a2.strip()) > 0.7:
214
- author_similar_ = AuthorRatio(a1.strip(), a2.strip())
215
- if author_similar_ > 0.95:
216
- return True
217
- else:
218
- if rel_clear(a1.strip()) == rel_clear(a2.strip()):
219
- return True
1
+ # 某些业务中的字符串处理 算是特定场景的工具 不算通用工具
2
+ import re
3
+
4
+ from re_common.v2.baselibrary.utils.author_smi import AuthorRatio
5
+ from re_common.v2.baselibrary.utils.string_bool import is_all_symbols
6
+ from re_common.v2.baselibrary.utils.string_clear import rel_clear
7
+ from re_common.v2.baselibrary.utils.stringutils import clean_unicode_alnum, get_alphabetic_ratio
8
+
9
+
10
+ def clean_organ_postcode(organ):
11
+ """
12
+ 格式化组织名称字符串,移除括号内容并删除独立的6位数字(邮政编码),然后清理标点。
13
+
14
+ 备注: 该方法替换java 里面的 formatOrgan
15
+
16
+ 参数:
17
+ organ (str): 输入的组织名称字符串,可能包含括号、分号和邮政编码。
18
+
19
+ 返回:
20
+ str: 格式化并清理后的组织名称字符串(无独立6位数字)。
21
+ """
22
+ # 如果输入为空,设为空字符串以避免后续操作报错
23
+ if not organ:
24
+ organ = ""
25
+
26
+ # 删除方括号和圆括号中的内容(包括括号本身)
27
+ organ = re.sub(r"\[.*?\]", "", organ) # 非贪婪匹配方括号内容
28
+ organ = re.sub(r"\(.*?\)", "", organ) # 非贪婪匹配圆括号内容
29
+
30
+ # 定义正则表达式,匹配独立的6位数字
31
+ # \b 表示单词边界,确保6位数字是独立的(前后不是字母、数字或下划线)
32
+ organ = re.sub(r"\b[0-9]{6}\b", "", organ)
33
+
34
+ # 初始化结果列表,用于存储处理后的组织名称部分
35
+ format_organ = []
36
+ # 按分号分割字符串,生成组织名称的各个部分
37
+ organ_parts = organ.split(";")
38
+
39
+ # 遍历每个部分,追加到结果列表
40
+ for temp_organ in organ_parts:
41
+ # 去除首尾多余空格后追加(避免因移除邮编导致的空字符串)
42
+ cleaned_part = temp_organ.strip()
43
+ # 如果首尾是标点符号,则移除
44
+ # 定义标点符号的正则表达式(这里包括常见标点)
45
+ punctuation = r"^[!,.?;:#$%^&*+-]+|[!,.?;:#$%^&*+-]+$"
46
+ cleaned_part = re.sub(punctuation, "", cleaned_part)
47
+ if cleaned_part: # 只追加非空部分
48
+ format_organ.append(cleaned_part)
49
+
50
+ # 用分号连接结果,转换为大写并清理标点
51
+ format_organ = ";".join(format_organ)
52
+
53
+ # 返回最终结果并去除首尾空格
54
+ return format_organ.strip()
55
+
56
+
57
+ def get_first_organ(organ):
58
+ if not organ:
59
+ return ""
60
+ organ_list = organ.strip().split(";")
61
+ for organ_one in organ_list:
62
+ # 清理邮政编码
63
+ organ_one = clean_organ_postcode(organ_one)
64
+ if organ_one.strip():
65
+ return organ_one
66
+
67
+ return ""
68
+
69
+
70
+ def get_first_author(author: str) -> str:
71
+ if not author:
72
+ return ""
73
+ au_list = author.strip().split(";")
74
+ for au in au_list:
75
+ au = re.sub("\\[.*?]", "", au)
76
+ au = re.sub("\\(.*?\\)", "", au)
77
+ if au.strip():
78
+ return au
79
+ return ""
80
+
81
+
82
+ def get_author_list(author: str):
83
+ lists = []
84
+ if not author:
85
+ return []
86
+ au_list = author.strip().split(";")
87
+ for au in au_list:
88
+ au = re.sub("\\[.*?]", "", au)
89
+ au = re.sub("\\(.*?\\)", "", au)
90
+ if au.strip():
91
+ lists.append(au.strip())
92
+ return lists
93
+
94
+
95
+ def get_scopus_author_abbr(author_row: str):
96
+ if not author_row:
97
+ return ""
98
+ author_list = author_row.split("&&")
99
+ if len(author_list) != 3:
100
+ raise Exception("错误的数据个数 可能来自其他数据源")
101
+
102
+ abbr_list = author_list[0].strip().split(";")
103
+ abbr_list = [author.strip() for author in abbr_list if
104
+ author.strip() and author.strip().lower() not in ("*", "and")]
105
+ return ";".join(abbr_list)
106
+
107
+
108
+ def get_wos_author_abbr(author_row: str):
109
+ if not author_row:
110
+ return ""
111
+ author_list = author_row.split("&&")
112
+ if len(author_list) != 4:
113
+ raise Exception("错误的数据个数 可能来自其他数据源")
114
+ abbr_list = []
115
+ abbr_list_au = author_list[0].strip().split(";")
116
+ abbr_list_ba = author_list[2].strip().split(";")
117
+ abbr_list.extend(abbr_list_au)
118
+ abbr_list.extend(abbr_list_ba)
119
+ abbr_list = [author.strip() for author in abbr_list if
120
+ author.strip() and author.strip().lower() not in ("*", "and")]
121
+ return ";".join(abbr_list)
122
+
123
+
124
+ def deal_rel_vol(vol_str: str):
125
+ """
126
+ 处理 期刊融合时的卷处理逻辑
127
+ """
128
+
129
+ # 如果卷是全符号 清理掉
130
+ if is_all_symbols(vol_str):
131
+ vol_str = ""
132
+
133
+ if vol_str.replace(".", "").isdigit():
134
+ try:
135
+ float_num = float(vol_str)
136
+ if int(float_num) == float_num:
137
+ return str(int(float_num))
138
+ except:
139
+ pass
140
+
141
+ if vol_str.lower().startswith("v "):
142
+ vol_str = vol_str.lower().replace("v ", "").strip()
143
+ return vol_str
144
+ if vol_str.lower().startswith("volume "):
145
+ vol_str = vol_str.lower().replace("volume ", "").strip()
146
+ return vol_str
147
+ if vol_str.lower().startswith("vol. "):
148
+ vol_str = vol_str.lower().replace("vol. ", "").strip()
149
+ return vol_str
150
+ if vol_str.lower().startswith("vol "):
151
+ vol_str = vol_str.lower().replace("vol ", "").strip()
152
+ return vol_str
153
+ return vol_str
154
+
155
+
156
+ def deal_num_strs(input_str):
157
+ """
158
+ int后在str 防止有浮点型的表达方式
159
+ """
160
+ number_list = re.findall(r'\d+', input_str)
161
+ transformed_numbers = [str(int(num)) for num in number_list]
162
+
163
+ # 替换原字符串中的数字为转换后的数字
164
+ for num, transformed_num in zip(number_list, transformed_numbers):
165
+ input_str = input_str.replace(num, transformed_num)
166
+ return input_str
167
+
168
+
169
+ def deal_num(num_str):
170
+ """
171
+ 将 期格式化 方便 group尤其是有横杆的数据
172
+ 该方法 为融合二次分割时使用,如果场景合适也可以用于其他地方
173
+ :param strs:
174
+ :return:
175
+ """
176
+ # 如果期是全符号清理掉
177
+ if is_all_symbols(num_str):
178
+ num_str = ""
179
+
180
+ if num_str.lower().startswith("n "):
181
+ num_str = num_str.lower().replace("n ", "").strip()
182
+
183
+ num_str = num_str.lower().replace("special_issue_", '').replace("_special_issue", '').replace("issue", "")
184
+ num_str = num_str.replace("spec.", "").replace("iss.", "").replace("spl.", "").replace("special.", "").replace(
185
+ "specialissue.", "")
186
+ num_str = num_str.replace("spec", "").replace("iss", "").replace("spl", "").replace("special", "").replace(
187
+ "specialissue", '')
188
+
189
+ num_str = num_str.replace("-", "_").replace(".", "_").upper()
190
+ num_str = num_str.lstrip("_").rstrip("_")
191
+ if num_str.find("_") > -1:
192
+ start, end = num_str.split("_")
193
+ start = deal_num_strs(start)
194
+ end = deal_num_strs(end)
195
+ num_str = start + "_" + end
196
+ else:
197
+ num_str = deal_num_strs(num_str)
198
+
199
+ return num_str.lower().strip()
200
+
201
+
202
+ def clear_author_1st(author_str:str):
203
+ # 清理括号 防止前面流程没有清理干净
204
+ author_str = re.sub("\\[.*?]", "", author_str)
205
+ author_str = re.sub("\\(.*?\\)", "", author_str)
206
+
207
+ # 清理符号
208
+ author_str = clean_unicode_alnum(author_str)
209
+
210
+ return author_str
211
+
212
+ def is_same_author(a1, a2):
213
+ if get_alphabetic_ratio(a1.strip()) > 0.7 and get_alphabetic_ratio(a2.strip()) > 0.7:
214
+ author_similar_ = AuthorRatio(a1.strip(), a2.strip())
215
+ if author_similar_ > 0.95:
216
+ return True
217
+ else:
218
+ if rel_clear(a1.strip()) == rel_clear(a2.strip()):
219
+ return True
220
220
  return False
@@ -1,100 +1,100 @@
1
- import base64
2
- import hashlib
3
-
4
- """
5
- VIP编码lngid生成
6
- """
7
-
8
-
9
- class BaseLngid(object):
10
- def __int__(self):
11
- pass
12
-
13
- def basBaseEncodeID(self, strRaw):
14
- r""" 自定义base编码 """
15
-
16
- strEncode = base64.b32encode(strRaw.encode('utf8')).decode('utf8')
17
-
18
- if strEncode.endswith('======'):
19
- strEncode = '%s%s' % (strEncode[0:-6], '0')
20
- elif strEncode.endswith('===='):
21
- strEncode = '%s%s' % (strEncode[0:-4], '1')
22
- elif strEncode.endswith('==='):
23
- strEncode = '%s%s' % (strEncode[0:-3], '8')
24
- elif strEncode.endswith('='):
25
- strEncode = '%s%s' % (strEncode[0:-1], '9')
26
-
27
- table = str.maketrans('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210')
28
- strEncode = strEncode.translate(table)
29
-
30
- return strEncode
31
-
32
- def BaseDecodeID(self, strEncode):
33
- r""" 自定义base解码 """
34
-
35
- table = str.maketrans('ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210', '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ')
36
- strEncode = strEncode.translate(table)
37
-
38
- if strEncode.endswith('0'):
39
- strEncode = '%s%s' % (strEncode[0:-1], '======')
40
- elif strEncode.endswith('1'):
41
- strEncode = '%s%s' % (strEncode[0:-1], '====')
42
- elif strEncode.endswith('8'):
43
- strEncode = '%s%s' % (strEncode[0:-1], '===')
44
- elif strEncode.endswith('9'):
45
- strEncode = '%s%s' % (strEncode[0:-1], '=')
46
-
47
- strRaw = base64.b32decode(strEncode.encode('utf8')).decode('utf8')
48
-
49
- return strRaw
50
-
51
- def GetLngid(self, sub_db_id, rawid, case_insensitive=False):
52
- """
53
- :param sub_db_id:
54
- :param rawid:
55
- 由 sub_db_id 和 rawid 得到 lngid。
56
- :param case_insensitive: 标识源网站的 rawid 是否区分大小写
57
- :return: lngid
58
- """
59
- uppercase_rawid = '' # 大写版 rawid
60
- if case_insensitive: # 源网站的 rawid 区分大小写
61
- for ch in rawid:
62
- if ch.upper() == ch:
63
- uppercase_rawid += ch
64
- else:
65
- uppercase_rawid += ch.upper() + '_'
66
- else:
67
- uppercase_rawid = rawid.upper()
68
-
69
- limited_id = uppercase_rawid # 限长ID
70
- if len(uppercase_rawid) > 20:
71
- limited_id = hashlib.md5(uppercase_rawid.encode('utf8')).hexdigest().upper()
72
- else:
73
- limited_id = self.BaseEncodeID(uppercase_rawid)
74
-
75
- lngid = sub_db_id + limited_id
76
-
77
- return lngid
78
-
79
- def getDoiid(self, doi, case_insensitive=False):
80
- if case_insensitive is False:
81
- doi = doi.upper()
82
- limited_id = self.BaseEncodeID(doi)
83
- if len(limited_id) > 240:
84
- limited_id = hashlib.md5(doi.encode('utf8')).hexdigest().upper()
85
- return limited_id
86
-
87
- def GetRawid(self, limited_id, case_insensitive=False):
88
- try:
89
- uppercase_rawid = self.BaseDecodeID(limited_id)
90
- if case_insensitive:
91
- str_ = "_"
92
- uppercase_rawid_list = list(uppercase_rawid)
93
- for num, li in enumerate(uppercase_rawid_list):
94
- if li == str_:
95
- old_str = "".join(uppercase_rawid_list[num - 1:num + 1])
96
- uppercase_rawid = uppercase_rawid.replace(old_str, uppercase_rawid_list[num - 1].lower())
97
- except Exception as e:
98
- raise Exception("长度超过20,不可逆")
99
-
100
- return uppercase_rawid
1
+ import base64
2
+ import hashlib
3
+
4
+ """
5
+ VIP编码lngid生成
6
+ """
7
+
8
+
9
+ class BaseLngid(object):
10
+ def __int__(self):
11
+ pass
12
+
13
+ def BaseEncodeID(self, strRaw):
14
+ r""" 自定义base编码 """
15
+
16
+ strEncode = base64.b32encode(strRaw.encode('utf8')).decode('utf8')
17
+
18
+ if strEncode.endswith('======'):
19
+ strEncode = '%s%s' % (strEncode[0:-6], '0')
20
+ elif strEncode.endswith('===='):
21
+ strEncode = '%s%s' % (strEncode[0:-4], '1')
22
+ elif strEncode.endswith('==='):
23
+ strEncode = '%s%s' % (strEncode[0:-3], '8')
24
+ elif strEncode.endswith('='):
25
+ strEncode = '%s%s' % (strEncode[0:-1], '9')
26
+
27
+ table = str.maketrans('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210')
28
+ strEncode = strEncode.translate(table)
29
+
30
+ return strEncode
31
+
32
+ def BaseDecodeID(self, strEncode):
33
+ r""" 自定义base解码 """
34
+
35
+ table = str.maketrans('ZYXWVUTSRQPONMLKJIHGFEDCBA9876543210', '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ')
36
+ strEncode = strEncode.translate(table)
37
+
38
+ if strEncode.endswith('0'):
39
+ strEncode = '%s%s' % (strEncode[0:-1], '======')
40
+ elif strEncode.endswith('1'):
41
+ strEncode = '%s%s' % (strEncode[0:-1], '====')
42
+ elif strEncode.endswith('8'):
43
+ strEncode = '%s%s' % (strEncode[0:-1], '===')
44
+ elif strEncode.endswith('9'):
45
+ strEncode = '%s%s' % (strEncode[0:-1], '=')
46
+
47
+ strRaw = base64.b32decode(strEncode.encode('utf8')).decode('utf8')
48
+
49
+ return strRaw
50
+
51
+ def GetLngid(self, sub_db_id, rawid, case_insensitive=False):
52
+ """
53
+ :param sub_db_id:
54
+ :param rawid:
55
+ 由 sub_db_id 和 rawid 得到 lngid。
56
+ :param case_insensitive: 标识源网站的 rawid 是否区分大小写
57
+ :return: lngid
58
+ """
59
+ uppercase_rawid = '' # 大写版 rawid
60
+ if case_insensitive: # 源网站的 rawid 区分大小写
61
+ for ch in rawid:
62
+ if ch.upper() == ch:
63
+ uppercase_rawid += ch
64
+ else:
65
+ uppercase_rawid += ch.upper() + '_'
66
+ else:
67
+ uppercase_rawid = rawid.upper()
68
+
69
+ limited_id = uppercase_rawid # 限长ID
70
+ if len(uppercase_rawid) > 20:
71
+ limited_id = hashlib.md5(uppercase_rawid.encode('utf8')).hexdigest().upper()
72
+ else:
73
+ limited_id = self.BaseEncodeID(uppercase_rawid)
74
+
75
+ lngid = sub_db_id + limited_id
76
+
77
+ return lngid
78
+
79
+ def getDoiid(self, doi, case_insensitive=False):
80
+ if case_insensitive is False:
81
+ doi = doi.upper()
82
+ limited_id = self.BaseEncodeID(doi)
83
+ if len(limited_id) > 240:
84
+ limited_id = hashlib.md5(doi.encode('utf8')).hexdigest().upper()
85
+ return limited_id
86
+
87
+ def GetRawid(self, limited_id, case_insensitive=False):
88
+ try:
89
+ uppercase_rawid = self.BaseDecodeID(limited_id)
90
+ if case_insensitive:
91
+ str_ = "_"
92
+ uppercase_rawid_list = list(uppercase_rawid)
93
+ for num, li in enumerate(uppercase_rawid_list):
94
+ if li == str_:
95
+ old_str = "".join(uppercase_rawid_list[num - 1:num + 1])
96
+ uppercase_rawid = uppercase_rawid.replace(old_str, uppercase_rawid_list[num - 1].lower())
97
+ except Exception as e:
98
+ raise Exception("长度超过20,不可逆")
99
+
100
+ return uppercase_rawid