re-common 10.0.37__py3-none-any.whl → 10.0.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (217)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  172. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  173. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  174. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  175. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  176. re_common/v2/baselibrary/utils/basedict.py +37 -37
  177. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  178. re_common/v2/baselibrary/utils/basepika.py +180 -180
  179. re_common/v2/baselibrary/utils/basetime.py +77 -77
  180. re_common/v2/baselibrary/utils/db.py +156 -156
  181. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  182. re_common/v2/baselibrary/utils/mq.py +83 -83
  183. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  184. re_common/v2/baselibrary/utils/string_bool.py +186 -186
  185. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  186. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  187. re_common/v2/baselibrary/utils/stringutils.py +271 -278
  188. re_common/vip/base_step_process.py +11 -11
  189. re_common/vip/baseencodeid.py +90 -90
  190. re_common/vip/changetaskname.py +28 -28
  191. re_common/vip/core_var.py +24 -24
  192. re_common/vip/mmh3Hash.py +89 -89
  193. re_common/vip/proxy/allproxys.py +127 -127
  194. re_common/vip/proxy/allproxys_thread.py +159 -159
  195. re_common/vip/proxy/cnki_proxy.py +153 -153
  196. re_common/vip/proxy/kuaidaili.py +87 -87
  197. re_common/vip/proxy/proxy_all.py +113 -113
  198. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  199. re_common/vip/proxy/wanfang_proxy.py +152 -152
  200. re_common/vip/proxy/wp_proxy_all.py +181 -181
  201. re_common/vip/read_rawid_to_txt.py +91 -91
  202. re_common/vip/title/__init__.py +5 -5
  203. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  204. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  205. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  206. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  207. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  208. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  209. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  210. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  211. re_common/vip/title/transform/__init__.py +10 -10
  212. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
  213. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
  214. re_common-10.0.39.dist-info/RECORD +248 -0
  215. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
  216. re_common-10.0.37.dist-info/RECORD +0 -248
  217. {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
@@ -1,361 +1,361 @@
import copy
import re
import string

import regex
from jellyfish import damerau_levenshtein_distance
from rapidfuzz._utils import setupPandas, is_none
from rapidfuzz.distance import Jaro
from unidecode import unidecode

from re_common.v2.baselibrary.utils.stringutils import get_diacritic_variant

"""
Author length-ratio distribution: most pairs fall between 1 and 2.
1-2 675092763
2-3 49335191
3-4 440848
4-5 9953
The remaining buckets hold only a few hundred, a few dozen, or single cases and can be ignored.
If the ratio is greater than 5, apply a heavy penalty.
Penalize 3-4 and 4-5 as two separate bands, with 3-4 penalized the least.
No penalty for ratios between 1 and 3.
"""

additional_chars = '‑–‐’·.—'
extended_punctuation = string.punctuation + additional_chars


def detect_other_languages(text):
    # Match every character that is not Chinese, English, a digit, whitespace, or listed punctuation
    pattern = r'[^\u4E00-\u9FFFa-zA-Z0-9\s.,!?;:\'\"()‑\-–—‐’·˜.]'

    # Search with the regular expression
    matches = re.findall(pattern, text)

    # Any match means the text contains characters from another language
    return bool(matches)
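A minimal usage sketch (not part of the packaged file) of what detect_other_languages flags; the module path is assumed from the file list above, and the expected values follow from the character class in the function.

# Illustrative only; assumed import path.
from re_common.v2.baselibrary.utils.author_smi import detect_other_languages

print(detect_other_languages("Smith, John"))  # False: ASCII letters and listed punctuation only
print(detect_other_languages("García"))       # True: accented Latin letters fall outside the allowed set
print(detect_other_languages("李强"))          # False: CJK characters are explicitly allowed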


def extract_initials(text):
    # Split the string on whitespace
    words = text.split()

    # Take the first letter of each word and uppercase it
    initials = ''.join(word[0].upper() for word in words)

    return initials


def is_contained(str1, str2):
    # Check whether one string contains the other
    return str1 in str2 or str2 in str1


# Return True if list1 contains list2
def is_contained_list(list1, list2):
    # For each element of list2, list1 must contain at least as many occurrences
    for item in list2:
        if list2.count(item) > list1.count(item):
            return False
    return True
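A short sketch (not part of the packaged file) showing that is_contained_list compares occurrence counts, not just membership; the import path is assumed from the file list above.

from re_common.v2.baselibrary.utils.author_smi import is_contained_list  # assumed path

print(is_contained_list(["li", "li", "wang"], ["li", "wang"]))  # True: enough copies of every word
print(is_contained_list(["li", "wang"], ["li", "li"]))          # False: "li" appears twice in list2 but once in list1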


def check_common_elements_by_length_rank(list1, list2):
    # Intersection of the two lists
    set1 = set(list1)
    set2 = set(list2)

    common_elements = set1 & set2  # intersection

    if not common_elements:
        return False

    # Pick the shorter list
    short_list = list1 if len(list1) < len(list2) else list2

    # Sort the shorter list by word length
    sorted_short_list = sorted(short_list, key=len)

    for word in common_elements:
        # Rank of this word (by character length) within the shorter list
        length_rank = sorted_short_list.index(word) + 1  # +1 because list indices start at 0
        # Skip single letters
        if len(word) == 1:
            continue

        if length_rank / len(sorted_short_list) > 0.5:
            # One of the longer words matched, which is a strong signal
            return True

    return False
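A small sketch (not part of the packaged file) of the length-rank check: a shared long word counts, a shared single letter does not. The import path is assumed from the file list above.

from re_common.v2.baselibrary.utils.author_smi import check_common_elements_by_length_rank  # assumed path

# "smith" is the longest word of the shorter name, so a shared "smith" is a strong signal
print(check_common_elements_by_length_rank(["j", "smith"], ["john", "smith"]))  # True
# Only the single-letter "j" is shared, and single letters are skipped
print(check_common_elements_by_length_rank(["j", "smith"], ["j", "jones"]))     # False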


def remove_punctuation(text):
    # 20241226: strip the custom punctuation set
    text = regex.sub("[\\p{P}¥+=˛`$<¸´~^¥≤℃×■¨°>|ⅰⅱⅲⅳⅴⅵⅶⅹⅺⅻ]", "", text.lower())
    # text = text.translate(str.maketrans('', '', extended_punctuation))
    return text


def space_punctuation(text):
    # Replace punctuation with spaces
    return text.translate(str.maketrans(extended_punctuation, ' ' * len(extended_punctuation), ''))


def custom_rstrip(s):
    # Strip the listed suffixes from the end, in order
    s = s.strip()
    if s.endswith("."):
        s = s[:-1]  # drop the trailing "."
    s = s.strip()
    if s.endswith("jr"):
        s = s[:-2]  # drop the trailing "jr"
    s = s.strip()
    if s.endswith(","):
        s = s[:-1]  # drop the trailing ","
    s = s.strip()

    return s
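A brief sketch (not part of the packaged file) of custom_rstrip, which strips ".", "jr", and "," in that fixed order; AuthorRatio lowercases before calling it, so the case sensitivity below does not matter in practice. The import path is assumed from the file list above.

from re_common.v2.baselibrary.utils.author_smi import custom_rstrip  # assumed path

print(custom_rstrip("smith, jr."))  # "smith": ".", then "jr", then "," are removed in turn
print(custom_rstrip("Smith, Jr"))   # "Smith, Jr": unchanged, because the suffix check is case-sensitive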


# Split Chinese pinyin, e.g. "Xiaohong" -> ['Xiao', 'hong']
def chinese_pinyin_split_by_rules(input_str):
    # Pinyin initials (including compound initials)
    initials = {
        'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
        'j', 'q', 'x', 'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'y', 'w'
    }
    # Pinyin finals (partial list)
    finals = {
        'a', 'o', 'e', 'ai', 'ei', 'ao', 'ou', 'an', 'en', 'ang', 'eng', 'ong',
        'i', 'ia', 'ie', 'iao', 'iu', 'ian', 'in', 'iang', 'ing', 'iong',
        'u', 'ua', 'uo', 'uai', 'ui', 'uan', 'un', 'uang', 'ueng',
        'v', 've', 'van', 'vn'
    }
    result = []
    while input_str:
        # Try to match the longest initial first
        max_initial_len = 2  # longest initials, such as 'zh'
        matched_initial = ""
        for length in range(max_initial_len, 0, -1):
            candidate = input_str[:length]
            if candidate.lower() in initials:
                matched_initial = candidate
                break
        # Remainder after the initial has been consumed
        remaining = input_str[len(matched_initial):]
        # Match the final
        max_final_len = min(4, len(remaining))  # longest finals, such as 'iong'
        matched_final = ""
        for length in range(max_final_len, 0, -1):
            candidate = remaining[:length]
            if candidate.lower() in finals:
                matched_final = candidate
                break
        if matched_final:
            # Combine initial and final into a syllable
            syllable = matched_initial + matched_final
            result.append(syllable)
            input_str = input_str[len(syllable):]
        else:
            return []  # cannot be split
    return result
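A minimal sketch (not part of the packaged file) of the rule-based pinyin splitter; the first result matches the example in the comment above, and the import path is assumed from the file list.

from re_common.v2.baselibrary.utils.author_smi import chinese_pinyin_split_by_rules  # assumed path

print(chinese_pinyin_split_by_rules("Xiaohong"))  # ['Xiao', 'hong']
print(chinese_pinyin_split_by_rules("Smith"))     # []: not decomposable into pinyin syllables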


def AuthorRatio(
        s1,
        s2,
        *,
        processor=None,
        score_cutoff=None,
        is_delete_jr=True,
):
    # Needed for the None/NaN checks below
    setupPandas()
    # No similarity if either value is missing
    if is_none(s1) or is_none(s2):
        return 0

    # Optional preprocessing callable supplied by the caller
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # Return 0 if preprocessing left either string empty
    if not s1 or not s2:
        return 0
    # get_diacritic_variant(unidecode(s)) is aggressive: it loses non-Latin characters and the original text,
    # which suits converting multilingual text into an ASCII transliteration.
    # Handle diacritics
    s1 = get_diacritic_variant(unidecode(s1))
    s2 = get_diacritic_variant(unidecode(s2))
    # Hoisted here to avoid recomputation; memory vs. compute may need rebalancing later
    # Remove the listed punctuation (also lowercases)
    s1_punc = remove_punctuation(s1)
    s2_punc = remove_punctuation(s2)
    # Split into word lists
    s1_punc_split = s1_punc.split()
    s2_punc_split = s2_punc.split()

    def compare_strings(s1_punc, s2_punc):
        # Remove all whitespace
        cleaned_s1 = re.sub(r'\s+', '', s1_punc)
        cleaned_s2 = re.sub(r'\s+', '', s2_punc)

        # Equal after stripping whitespace
        if cleaned_s1 == cleaned_s2:
            return "equal"
        # One string contains the other
        elif cleaned_s1 in cleaned_s2 or cleaned_s2 in cleaned_s1:
            return "subset"
        # Otherwise unrelated
        else:
            return "unrelated"

    # Identical word multisets after removing punctuation means the same author (100%);
    # this mainly guards against reordered name parts
    if len(s1_punc_split) == len(s2_punc_split) and set(s1_punc_split) == set(s2_punc_split):
        return 1

    # Missing one word is treated as a valid containment, which happens with abbreviated names
    if is_contained_list(s1_punc_split, s2_punc_split) or is_contained_list(s2_punc_split, s1_punc_split):
        return 0.98

    rus = compare_strings(s1_punc, s2_punc)
    # Equal after removing punctuation and whitespace in order means the same author, e.g. "Hoorani, H. R." -> 'Hoorani, HR'
    if rus == "equal":
        return 1

    # In Western names "jr" is a generational suffix (junior); remove its influence, controlled by a parameter
    if is_delete_jr:
        s1_n = custom_rstrip(s1.lower())
        s1 = s1[:len(s1_n)]
        s2_n = custom_rstrip(s2.lower())
        s2 = s2[:len(s2_n)]

    # The forward direction handles pairs like Liao, Zhan -> Liao Z.; the reverse direction would reintroduce the error above
    if len(s1_punc_split) == len(s2_punc_split) and rus == "subset":
        if len(s1_punc_split[-1]) == 1 or len(s2_punc_split[-1]) == 1:
            if s1_punc_split[0] == s2_punc_split[0] and s1_punc_split[-1][:1] == s2_punc_split[-1][:1]:
                return 1
        # return 0.96  # same word count and containment, but pairs like Li Li vs Li Liang would be judged equal, so no full score here

    # Collapse runs of whitespace into a single space
    l1 = re.sub(r'\s+', ' ', space_punctuation(s1.replace("'", "")).strip()).strip().split()
    l2 = re.sub(r'\s+', ' ', space_punctuation(s2.replace("'", "")).strip()).strip().split()

    def is_same_or_initials_match(l1, l2):
        """
        Check whether the two token lists are identical, or match by their initials.
        bool: True if every token pair is identical or matches by initial; otherwise False.
        """

        # Walk l1 and l2 in parallel with zip()
        for i1, i2 in zip(l1, l2):
            # Tokens equal ignoring case: move on to the next pair
            if i1.lower() == i2.lower():
                continue
            # Author names may contain a typo; for words longer than 3 characters,
            # one wrong or transposed letter still counts as the same word
            # Example: "De Gusmio, Ana Paula Henriques" vs "De Gusmão, Ana Paula Henriques"
            if len(i1) > 3 and damerau_levenshtein_distance(i1, i2) <= 1:
                continue

            # If either token is a single letter, check whether the initials match
            if len(i1) == 1 or len(i2) == 1:
                # Compare the first letters, case-insensitively
                if i1[0].upper() == i2[0].upper():
                    continue
                else:
                    return False  # initials differ

            # None of the conditions matched: the tokens differ
            return False

        # No early return, so every token pair matched
        return True

    # Guard against one side becoming empty after cleaning
    if len(l1) == 0 or len(l2) == 0:
        return 0

    # If the last token is all uppercase, split it into single letters, e.g. joi CJ -> joi C J
    if len(l1[-1]) != 1 and l1[-1].isupper():
        t_str = l1[-1]
        l1 = l1[:-1]
        l1.extend(list(t_str))
    if len(l2[-1]) != 1 and l2[-1].isupper():
        t_str = l2[-1]
        l2 = l2[:-1]
        l2.extend(list(t_str))

    # Same number of tokens and the abbreviations match the initials, so the names agree,
    # e.g. "María M.Martorell" vs "Martorell, María M."
    if len(l1) == len(l2) and (is_same_or_initials_match(l1, l2) or set(l1) == set(l2)):
        return 1

    # A variant of the rule above: sort both lists first, then run the same comparison,
    # e.g. Guo, Qiang @@ Q. Guo
    sort_l1 = copy.deepcopy(l1)
    sort_l2 = copy.deepcopy(l2)
    sort_l1.sort()
    sort_l2.sort()
    if len(sort_l1) == len(sort_l2) and (is_same_or_initials_match(sort_l1, sort_l2) or set(sort_l1) == set(sort_l2)):
        return 0.99

    ##############################################################
    # The cases above are enumerated explicitly; below is the similarity calculation for everything else
    ##############################################################

    # Default score_cutoff to 0
    if score_cutoff is None:
        score_cutoff = 0

    len1 = len(s1)
    len2 = len(s2)
    # Length ratio: the longer string divided by the shorter one
    len_ratio = len1 / len2 if len1 > len2 else len2 / len1

    # Compute the normalized Jaro similarity of the lowercased strings
    end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1.lower(), s2.lower())

    # The author length-ratio distribution above decides which ratios count as small or large
    if len_ratio > 1.5 and len_ratio < 3:
        # Linearly decreasing penalty factor:
        # len_ratio = 1.5 -> reduction_factor = 1.0
        # len_ratio = 3.0 -> reduction_factor = 0.9
        reduction_factor = 1.0 - (len_ratio - 1.5) * (0.1 / 1.5)
        end_ratio = end_ratio * reduction_factor
    if len_ratio > 3 and len_ratio < 4:  # small penalty
        end_ratio = end_ratio * 0.9
    if len_ratio > 4 and len_ratio < 5:  # medium penalty
        end_ratio = end_ratio * 0.8
    if len_ratio > 5:  # heavy penalty
        end_ratio = end_ratio * 0.7

    # Diacritic boosting is already handled above
    # # Boost when a non-English, non-Chinese string is compared with an English one
    # if any([detect_other_languages(s1), detect_other_languages(s2)]) and not all([detect_other_languages(s1),
    #                                                                               detect_other_languages(s2)]):
    #     # boost
    #     end_ratio = end_ratio * 1.1

    # Boost when the initials agree
    # if is_contained(extract_initials(s1), extract_initials(s2)):
    if is_contained_list([i[:1].lower() for i in l1], [i[:1].lower() for i in l2]):
        # boost
        end_ratio = end_ratio * 1.05
    else:
        end_ratio = end_ratio * 0.9

    if len(l1) != len(l2):
        end_ratio = end_ratio * 0.92

    # Length-rank position of the shared words within the shorter name; a match on abbreviations alone should not boost
    if check_common_elements_by_length_rank(l1, l2) and len_ratio > 1.5:
        # boost
        end_ratio = end_ratio * 1.1

    if l1[0] != l2[0]:
        end_ratio = end_ratio * Jaro.normalized_similarity(l1[0].lower(), l2[0].lower())

    # High raw string similarity pulls the final score up; low raw similarity pulls it down
    return min(end_ratio, 1) * 0.5 + normal_end_ratio * 0.5
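A closing usage sketch (not part of the packaged file) of AuthorRatio. The module path is assumed from the file list above, and the expected scores follow the early-return rules spelled out in the comments, assuming get_diacritic_variant leaves plain ASCII input unchanged.

from re_common.v2.baselibrary.utils.author_smi import AuthorRatio  # assumed path

# Punctuation- and order-insensitive matches hit the early-return rules:
print(AuthorRatio("Hoorani, H. R.", "Hoorani, HR"))             # 1: equal once punctuation and whitespace are removed
print(AuthorRatio("Maria M.Martorell", "Martorell, Maria M."))  # 1: same tokens in a different order
print(AuthorRatio("Guo, Qiang", "Q. Guo"))                      # 0.99: initials match after sorting the tokens
# Unrelated names fall through to the weighted Jaro score, well below 1:
print(AuthorRatio("Guo, Qiang", "Wang, Lei"))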