re-common 0.2.54__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (207) hide show
  1. re_common/v2/baselibrary/s3object/baseboto3.py +230 -0
  2. re_common/v2/baselibrary/tools/dict_tools.py +24 -0
  3. re_common/v2/baselibrary/tools/search_hash_tools.py +33 -0
  4. re_common/v2/baselibrary/tools/text_matcher.py +223 -0
  5. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -0
  6. re_common/v2/baselibrary/utils/BusinessStringUtil.py +74 -0
  7. re_common/v2/baselibrary/utils/author_smi.py +308 -0
  8. re_common/v2/baselibrary/utils/basedict.py +26 -0
  9. re_common/v2/baselibrary/utils/basehdfs.py +127 -0
  10. re_common/v2/baselibrary/utils/json_cls.py +11 -0
  11. re_common/v2/baselibrary/utils/string_bool.py +9 -0
  12. re_common/v2/baselibrary/utils/string_clear.py +98 -0
  13. re_common/v2/baselibrary/utils/stringutils.py +95 -0
  14. {re_common-0.2.54.dist-info → re_common-2.0.1.dist-info}/METADATA +1 -1
  15. re_common-2.0.1.dist-info/RECORD +25 -0
  16. re_common/baselibrary/__init__.py +0 -4
  17. re_common/baselibrary/baseabs/__init__.py +0 -7
  18. re_common/baselibrary/baseabs/baseabs.py +0 -26
  19. re_common/baselibrary/database/mbuilder.py +0 -132
  20. re_common/baselibrary/database/moudle.py +0 -93
  21. re_common/baselibrary/database/msqlite3.py +0 -194
  22. re_common/baselibrary/database/mysql.py +0 -169
  23. re_common/baselibrary/database/sql_factory.py +0 -26
  24. re_common/baselibrary/mthread/MThreadingRun.py +0 -486
  25. re_common/baselibrary/mthread/MThreadingRunEvent.py +0 -349
  26. re_common/baselibrary/mthread/__init__.py +0 -3
  27. re_common/baselibrary/mthread/mythreading.py +0 -695
  28. re_common/baselibrary/pakge_other/socks.py +0 -404
  29. re_common/baselibrary/readconfig/config_factory.py +0 -18
  30. re_common/baselibrary/readconfig/ini_config.py +0 -317
  31. re_common/baselibrary/readconfig/toml_config.py +0 -49
  32. re_common/baselibrary/temporary/envdata.py +0 -36
  33. re_common/baselibrary/tools/all_requests/aiohttp_request.py +0 -118
  34. re_common/baselibrary/tools/all_requests/httpx_requet.py +0 -102
  35. re_common/baselibrary/tools/all_requests/mrequest.py +0 -412
  36. re_common/baselibrary/tools/all_requests/requests_request.py +0 -81
  37. re_common/baselibrary/tools/batch_compre/__init__.py +0 -0
  38. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +0 -31
  39. re_common/baselibrary/tools/contrast_db3.py +0 -123
  40. re_common/baselibrary/tools/copy_file.py +0 -39
  41. re_common/baselibrary/tools/db3_2_sizedb3.py +0 -102
  42. re_common/baselibrary/tools/foreachgz.py +0 -40
  43. re_common/baselibrary/tools/get_attr.py +0 -11
  44. re_common/baselibrary/tools/image_to_pdf.py +0 -62
  45. re_common/baselibrary/tools/java_code_deal.py +0 -139
  46. re_common/baselibrary/tools/javacode.py +0 -79
  47. re_common/baselibrary/tools/mdb_db3.py +0 -48
  48. re_common/baselibrary/tools/merge_file.py +0 -171
  49. re_common/baselibrary/tools/merge_gz_file.py +0 -165
  50. re_common/baselibrary/tools/mhdfstools/__init__.py +0 -0
  51. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +0 -42
  52. re_common/baselibrary/tools/mhdfstools/hdfst.py +0 -42
  53. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +0 -38
  54. re_common/baselibrary/tools/mongo_tools.py +0 -50
  55. re_common/baselibrary/tools/move_file.py +0 -170
  56. re_common/baselibrary/tools/move_mongo/__init__.py +0 -0
  57. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +0 -63
  58. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +0 -354
  59. re_common/baselibrary/tools/move_mongo/use_mttf.py +0 -18
  60. re_common/baselibrary/tools/move_mongo/use_mv.py +0 -93
  61. re_common/baselibrary/tools/mpandas/__init__.py +0 -0
  62. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +0 -125
  63. re_common/baselibrary/tools/mpandas/pandas_visualization.py +0 -8
  64. re_common/baselibrary/tools/myparsel.py +0 -104
  65. re_common/baselibrary/tools/rename_dir_file.py +0 -37
  66. re_common/baselibrary/tools/sequoiadb_utils.py +0 -398
  67. re_common/baselibrary/tools/split_line_to_many.py +0 -25
  68. re_common/baselibrary/tools/stringtodicts.py +0 -33
  69. re_common/baselibrary/tools/workwechant_bot.py +0 -84
  70. re_common/baselibrary/utils/__init__.py +0 -0
  71. re_common/baselibrary/utils/baseaiohttp.py +0 -296
  72. re_common/baselibrary/utils/baseaiomysql.py +0 -87
  73. re_common/baselibrary/utils/baseallstep.py +0 -191
  74. re_common/baselibrary/utils/baseavro.py +0 -19
  75. re_common/baselibrary/utils/baseboto3.py +0 -291
  76. re_common/baselibrary/utils/basecsv.py +0 -32
  77. re_common/baselibrary/utils/basedict.py +0 -133
  78. re_common/baselibrary/utils/basedir.py +0 -241
  79. re_common/baselibrary/utils/baseencode.py +0 -351
  80. re_common/baselibrary/utils/baseencoding.py +0 -29
  81. re_common/baselibrary/utils/baseesdsl.py +0 -86
  82. re_common/baselibrary/utils/baseexcel.py +0 -264
  83. re_common/baselibrary/utils/baseexcept.py +0 -109
  84. re_common/baselibrary/utils/basefile.py +0 -654
  85. re_common/baselibrary/utils/baseftp.py +0 -214
  86. re_common/baselibrary/utils/basegzip.py +0 -60
  87. re_common/baselibrary/utils/basehdfs.py +0 -135
  88. re_common/baselibrary/utils/basehttpx.py +0 -268
  89. re_common/baselibrary/utils/baseip.py +0 -87
  90. re_common/baselibrary/utils/basejson.py +0 -2
  91. re_common/baselibrary/utils/baselist.py +0 -32
  92. re_common/baselibrary/utils/basemotor.py +0 -190
  93. re_common/baselibrary/utils/basemssql.py +0 -98
  94. re_common/baselibrary/utils/baseodbc.py +0 -113
  95. re_common/baselibrary/utils/basepandas.py +0 -302
  96. re_common/baselibrary/utils/basepeewee.py +0 -11
  97. re_common/baselibrary/utils/basepika.py +0 -180
  98. re_common/baselibrary/utils/basepydash.py +0 -143
  99. re_common/baselibrary/utils/basepymongo.py +0 -230
  100. re_common/baselibrary/utils/basequeue.py +0 -22
  101. re_common/baselibrary/utils/baserar.py +0 -57
  102. re_common/baselibrary/utils/baserequest.py +0 -279
  103. re_common/baselibrary/utils/baseset.py +0 -8
  104. re_common/baselibrary/utils/basesmb.py +0 -403
  105. re_common/baselibrary/utils/basestring.py +0 -382
  106. re_common/baselibrary/utils/basetime.py +0 -320
  107. re_common/baselibrary/utils/basetuple.py +0 -0
  108. re_common/baselibrary/utils/baseurl.py +0 -121
  109. re_common/baselibrary/utils/basezip.py +0 -57
  110. re_common/baselibrary/utils/core/__init__.py +0 -8
  111. re_common/baselibrary/utils/core/bottomutils.py +0 -18
  112. re_common/baselibrary/utils/core/mdeprecated.py +0 -327
  113. re_common/baselibrary/utils/core/mlamada.py +0 -16
  114. re_common/baselibrary/utils/core/msginfo.py +0 -25
  115. re_common/baselibrary/utils/core/requests_core.py +0 -103
  116. re_common/baselibrary/utils/fateadm.py +0 -429
  117. re_common/baselibrary/utils/importfun.py +0 -123
  118. re_common/baselibrary/utils/mfaker.py +0 -57
  119. re_common/baselibrary/utils/my_abc/__init__.py +0 -3
  120. re_common/baselibrary/utils/my_abc/better_abc.py +0 -32
  121. re_common/baselibrary/utils/mylogger.py +0 -414
  122. re_common/baselibrary/utils/myredisclient.py +0 -861
  123. re_common/baselibrary/utils/pipupgrade.py +0 -21
  124. re_common/baselibrary/utils/ringlist.py +0 -85
  125. re_common/baselibrary/utils/version_compare.py +0 -36
  126. re_common/baselibrary/utils/ydmhttp.py +0 -126
  127. re_common/facade/__init__.py +0 -1
  128. re_common/facade/lazy_import.py +0 -11
  129. re_common/facade/loggerfacade.py +0 -25
  130. re_common/facade/mysqlfacade.py +0 -467
  131. re_common/facade/now.py +0 -31
  132. re_common/facade/sqlite3facade.py +0 -257
  133. re_common/facade/use/__init__.py +0 -0
  134. re_common/facade/use/mq_use_facade.py +0 -83
  135. re_common/facade/use/proxy_use_facade.py +0 -20
  136. re_common/libtest/__init__.py +0 -0
  137. re_common/libtest/base_dict_test.py +0 -19
  138. re_common/libtest/baseavro_test.py +0 -13
  139. re_common/libtest/basefile_test.py +0 -14
  140. re_common/libtest/basemssql_test.py +0 -77
  141. re_common/libtest/baseodbc_test.py +0 -8
  142. re_common/libtest/basepandas_test.py +0 -38
  143. re_common/libtest/get_attr_test/__init__.py +0 -0
  144. re_common/libtest/get_attr_test/get_attr_test_settings.py +0 -14
  145. re_common/libtest/get_attr_test/settings.py +0 -55
  146. re_common/libtest/idencode_test.py +0 -54
  147. re_common/libtest/iniconfig_test.py +0 -35
  148. re_common/libtest/ip_test.py +0 -35
  149. re_common/libtest/merge_file_test.py +0 -20
  150. re_common/libtest/mfaker_test.py +0 -9
  151. re_common/libtest/mm3_test.py +0 -32
  152. re_common/libtest/mylogger_test.py +0 -89
  153. re_common/libtest/myparsel_test.py +0 -28
  154. re_common/libtest/mysql_test.py +0 -151
  155. re_common/libtest/pymongo_test.py +0 -21
  156. re_common/libtest/split_test.py +0 -12
  157. re_common/libtest/sqlite3_merge_test.py +0 -6
  158. re_common/libtest/sqlite3_test.py +0 -34
  159. re_common/libtest/tomlconfig_test.py +0 -30
  160. re_common/libtest/use_tools_test/__init__.py +0 -3
  161. re_common/libtest/user/__init__.py +0 -5
  162. re_common/studio/__init__.py +0 -5
  163. re_common/studio/assignment_expressions.py +0 -37
  164. re_common/studio/mydash/__init__.py +0 -0
  165. re_common/studio/mydash/test1.py +0 -19
  166. re_common/studio/pydashstudio/__init__.py +0 -0
  167. re_common/studio/pydashstudio/first.py +0 -9
  168. re_common/studio/streamlitstudio/__init__.py +0 -0
  169. re_common/studio/streamlitstudio/first_app.py +0 -66
  170. re_common/studio/streamlitstudio/uber_pickups.py +0 -24
  171. re_common/studio/test.py +0 -19
  172. re_common/vip/__init__.py +0 -0
  173. re_common/vip/base_step_process.py +0 -11
  174. re_common/vip/baseencodeid.py +0 -91
  175. re_common/vip/changetaskname.py +0 -28
  176. re_common/vip/core_var.py +0 -24
  177. re_common/vip/mmh3Hash.py +0 -90
  178. re_common/vip/proxy/__init__.py +0 -0
  179. re_common/vip/proxy/allproxys.py +0 -127
  180. re_common/vip/proxy/allproxys_thread.py +0 -159
  181. re_common/vip/proxy/cnki_proxy.py +0 -153
  182. re_common/vip/proxy/kuaidaili.py +0 -87
  183. re_common/vip/proxy/proxy_all.py +0 -113
  184. re_common/vip/proxy/update_kuaidaili_0.py +0 -42
  185. re_common/vip/proxy/wanfang_proxy.py +0 -152
  186. re_common/vip/proxy/wp_proxy_all.py +0 -182
  187. re_common/vip/read_rawid_to_txt.py +0 -92
  188. re_common/vip/title/__init__.py +0 -5
  189. re_common/vip/title/transform/TransformBookTitleToZt.py +0 -125
  190. re_common/vip/title/transform/TransformConferenceTitleToZt.py +0 -139
  191. re_common/vip/title/transform/TransformCstadTitleToZt.py +0 -196
  192. re_common/vip/title/transform/TransformJournalTitleToZt.py +0 -203
  193. re_common/vip/title/transform/TransformPatentTitleToZt.py +0 -132
  194. re_common/vip/title/transform/TransformRegulationTitleToZt.py +0 -114
  195. re_common/vip/title/transform/TransformStandardTitleToZt.py +0 -135
  196. re_common/vip/title/transform/TransformThesisTitleToZt.py +0 -135
  197. re_common/vip/title/transform/__init__.py +0 -11
  198. re_common-0.2.54.dist-info/RECORD +0 -194
  199. /re_common/{baselibrary/database → v2}/__init__.py +0 -0
  200. /re_common/{baselibrary/pakge_other → v2/baselibrary}/__init__.py +0 -0
  201. /re_common/{baselibrary/readconfig → v2/baselibrary/s3object}/__init__.py +0 -0
  202. /re_common/{baselibrary/temporary → v2/baselibrary/tools}/__init__.py +0 -0
  203. /re_common/{baselibrary/tools/__init__.py → v2/baselibrary/tools/list_tools.py} +0 -0
  204. /re_common/{baselibrary/tools/all_requests → v2/baselibrary/utils}/__init__.py +0 -0
  205. {re_common-0.2.54.dist-info → re_common-2.0.1.dist-info}/LICENSE +0 -0
  206. {re_common-0.2.54.dist-info → re_common-2.0.1.dist-info}/WHEEL +0 -0
  207. {re_common-0.2.54.dist-info → re_common-2.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,308 @@
1
+ import re
2
+ import string
3
+
4
+ import regex
5
+ import unicodedata
6
+ from rapidfuzz._utils import setupPandas, is_none
7
+ from rapidfuzz.distance import Jaro
8
+
9
+ """
10
+ 作者比率分布 大部分在 1和 2
11
+ 1-2 675092763
12
+ 2-3 49335191
13
+ 3-4 440848
14
+ 4-5 9953
15
+ 其他都是几百 几十和几个 不用考虑
16
+ 如果 大于5 大降分
17
+ 3-4 4-5 分两个段降分 3-4 降得最少
18
+ 1-3 不降分
19
+ """
20
+
21
+ additional_chars = '‑–‐’·.—'
22
+ extended_punctuation = string.punctuation + additional_chars
23
+
24
+
25
def get_diacritic_variant(char1):
    """Return ``char1`` with its combining diacritical marks removed.

    The text is decomposed with NFD, which splits an accented letter into a
    base letter followed by combining marks, and every code point whose
    Unicode category is ``Mn`` (nonspacing mark) is then dropped.

    :param char1: a character or a whole string
    :return: the text rebuilt from the remaining code points
    """
    decomposed = unicodedata.normalize('NFD', char1)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
34
+
35
+
36
def detect_other_languages(text):
    """Tell whether ``text`` contains a character outside the allowed set.

    The allowed set is: CJK ideographs U+4E00-U+9FFF, ASCII letters and
    digits, whitespace, and the punctuation/dash characters listed in the
    pattern below.

    :param text: string to inspect
    :return: True if at least one character falls outside that set
    """
    disallowed = r'[^\u4E00-\u9FFFa-zA-Z0-9\s.,!?;:\'\"()‑\-–—‐’·˜.]'
    # One hit is enough to decide, so search instead of collecting all hits.
    return re.search(disallowed, text) is not None
45
+
46
+
47
def extract_initials(text):
    """Build the upper-cased initials of the whitespace-separated words.

    :param text: string to split on whitespace
    :return: the first character of every word, upper-cased and concatenated
    """
    letters = []
    for word in text.split():
        letters.append(word[0].upper())
    return ''.join(letters)
55
+
56
+
57
def is_contained(str1, str2):
    """Return True when either string is a substring of the other."""
    if str1 in str2:
        return True
    return str2 in str1
60
+
61
+
62
def is_contained_list(list1, list2):
    """Check whether ``list1`` contains ``list2`` as a multiset.

    Every element of ``list2`` must occur in ``list1`` at least as many times
    as it occurs in ``list2``.

    :return: True if ``list1`` covers ``list2`` (always True for an empty ``list2``)
    """
    return all(list2.count(item) <= list1.count(item) for item in list2)
69
+
70
+
71
def check_common_elements_by_length_rank(list1, list2):
    """Check whether the two lists share a word that is "long" for the shorter list.

    The shorter list (``list2`` on a tie) is sorted by word length. A shared
    word counts when it is longer than one character and its 1-based position
    in that ordering, divided by the list length, exceeds 0.5.

    :return: True if such a shared word exists, otherwise False
    """
    shared = set(list1).intersection(list2)
    if not shared:
        return False

    if len(list1) < len(list2):
        shorter = list1
    else:
        shorter = list2

    by_length = sorted(shorter, key=len)
    total = len(by_length)

    for word in shared:
        # A shared single letter is only an initial; it never counts.
        if len(word) == 1:
            continue
        rank = by_length.index(word) + 1  # list indexes start at 0
        if rank / total > 0.5:
            # One of the longer words is identical in both lists.
            return True

    return False
99
+
100
+
101
def remove_punctuation(text):
    """Lower-case ``text`` and delete punctuation and a custom symbol set.

    Removed characters are everything in the Unicode ``\\p{P}`` class plus the
    extra symbols listed in the pattern (custom set added 2024-12-26).

    :param text: string to clean
    :return: the lower-cased string without those characters
    """
    lowered = text.lower()
    return regex.sub("[\\p{P}¥+=˛`$<¸´~^¥≤℃×■¨°>|ⅰⅱⅲⅳⅴⅵⅶⅹⅺⅻ]", "", lowered)
106
+
107
+
108
def space_punctuation(text):
    """Replace every character of ``extended_punctuation`` in ``text`` with a space.

    :param text: string to process
    :return: a string of the same length with those characters blanked out
    """
    to_space = str.maketrans(dict.fromkeys(extended_punctuation, ' '))
    return text.translate(to_space)
111
+
112
+
113
def custom_rstrip(s):
    """Strip surrounding whitespace and then trailing ".", "jr" and "," in that order.

    Each suffix is tested once, in the fixed order below, and the string is
    re-stripped of whitespace after every removal.

    :param s: string to trim (callers pass it already lower-cased)
    :return: the trimmed string
    """
    trimmed = s.strip()
    for suffix in (".", "jr", ","):
        if trimmed.endswith(suffix):
            trimmed = trimmed[:-len(suffix)].strip()
    return trimmed
127
+
128
+
129
def deal_str_first(s1):
    """Pre-process a name by deleting every "’" character from it."""
    return "".join(s1.split("’"))
133
+
134
+
135
def AuthorRatio(
        s1,
        s2,
        *,
        processor=None,
        score_cutoff=None,
        is_delete_jr=True,
):
    """Estimate how likely two author-name strings denote the same author.

    A series of exact rules is tried first: same tokens in any order, one
    token list contained in the other, equality after removing punctuation
    and whitespace, surname plus matching initial, and token-wise initial
    matching. When none applies, a Jaro similarity is computed and adjusted
    by heuristics on length ratio, initials, token count, shared long tokens
    and the first token.

    :param s1: first author name
    :param s2: second author name
    :param processor: optional callable applied to both names before comparing
    :param score_cutoff: accepted for signature compatibility; it does not
        influence the returned score
    :param is_delete_jr: when true, the trailing "." / "jr" / "," suffixes
        handled by ``custom_rstrip`` are cut from both names before the
        fuzzy stage
    :return: 0 when a name is missing or has no comparable content, 1 or 0.98
        when one of the exact rules matches, otherwise the mean of the
        adjusted and the raw Jaro similarity
    """
    # Per the original note, this call is needed for the is_none() checks.
    setupPandas()
    # A missing value has no similarity to anything.
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # The processor may have produced an empty string.
    if not s1 or not s2:
        return 0

    s1 = get_diacritic_variant(s1)
    s2 = get_diacritic_variant(s2)
    # Computed once up front because several rules below reuse them.
    s1_punc = remove_punctuation(s1)
    s2_punc = remove_punctuation(s2)
    s1_punc_split = s1_punc.split()
    s2_punc_split = s2_punc.split()

    def compare_strings(s1_punc, s2_punc):
        """Classify two strings, ignoring whitespace, as equal / subset / unrelated."""
        cleaned_s1 = re.sub(r'\s+', '', s1_punc)
        cleaned_s2 = re.sub(r'\s+', '', s2_punc)

        if cleaned_s1 == cleaned_s2:
            return "equal"
        if cleaned_s1 in cleaned_s2 or cleaned_s2 in cleaned_s1:
            return "subset"
        return "unrelated"

    # Same tokens regardless of order: treated as the same author. This mainly
    # covers "surname, given" versus "given surname".
    if len(s1_punc_split) == len(s2_punc_split) and set(s1_punc_split) == set(s2_punc_split):
        return 1

    # A name made only of punctuation/whitespace has no tokens. An empty token
    # list is trivially "contained" in any other list, which used to yield
    # 0.98 against every name; such a name has nothing to compare.
    if not s1_punc_split or not s2_punc_split:
        return 0

    # One name only lacks some words of the other; this happens with
    # abbreviated names and is accepted as a containment match.
    if is_contained_list(s1_punc_split, s2_punc_split) or is_contained_list(s2_punc_split, s1_punc_split):
        return 0.98

    rus = compare_strings(s1_punc, s2_punc)
    # Identical once punctuation and spaces are gone,
    # e.g. "Hoorani, H. R." versus "Hoorani, HR".
    if rus == "equal":
        return 1

    # In foreign names "jr" marks a son; remove its influence when requested.
    if is_delete_jr:
        # custom_rstrip() also strips leading whitespace, so drop it here first;
        # otherwise the prefix slice below would cut the wrong characters.
        s1 = s1.lstrip()
        s1 = s1[:len(custom_rstrip(s1.lower()))]
        s2 = s2.lstrip()
        s2 = s2[:len(custom_rstrip(s2.lower()))]

    # Handles "Liao, Zhan" versus "Liao Z.": same first token and the last
    # token of one side is a single letter matching the other side's initial.
    # A plain same-length subset (e.g. "Li Li" versus "Li Liang") deliberately
    # gets no shortcut score here.
    if len(s1_punc_split) == len(s2_punc_split) and rus == "subset":
        if len(s1_punc_split[-1]) == 1 or len(s2_punc_split[-1]) == 1:
            if s1_punc_split[0] == s2_punc_split[0] and s1_punc_split[-1][:1] == s2_punc_split[-1][:1]:
                return 1

    # Tokenise with punctuation turned into spaces; split() already collapses
    # whitespace runs and ignores leading/trailing whitespace.
    l1 = space_punctuation(s1.replace("'", "")).split()
    l2 = space_punctuation(s2.replace("'", "")).split()

    # Removing the jr suffix can leave nothing (e.g. the name "jr"); the code
    # below indexes l1[-1] and divides by the string lengths, so stop here.
    if not l1 or not l2:
        return 0

    def is_same_or_initials_match(l1, l2):
        """Return True if every token pair is equal (ignoring case) or one side
        is a single letter equal to the other side's first letter."""
        for i1, i2 in zip(l1, l2):
            if i1.lower() == i2.lower():
                continue

            if len(i1) == 1 or len(i2) == 1:
                if i1[0].upper() == i2[0].upper():
                    continue
                return False

            return False

        return True

    # A trailing all-uppercase token such as "HR" is treated as a run of
    # initials and split into single letters.
    if len(l1[-1]) != 1 and l1[-1].isupper():
        l1 = l1[:-1] + list(l1[-1])
    if len(l2[-1]) != 1 and l2[-1].isupper():
        l2 = l2[:-1] + list(l2[-1])

    # Same token count and abbreviations equal to the words' initials, e.g.
    # "María M.Martorell" versus "Martorell, María M.".
    if len(l1) == len(l2) and (is_same_or_initials_match(l1, l2) or set(l1) == set(l2)):
        return 1

    ##############################################################
    # Everything above is rule-based; below is the fuzzy similarity.
    ##############################################################

    len1 = len(s1)
    len2 = len(s2)
    # Longer length divided by shorter length, so always >= 1.
    len_ratio = len1 / len2 if len1 > len2 else len2 / len1

    end_ratio = normal_end_ratio = Jaro.normalized_similarity(s1, s2)

    # Penalise large length differences. The tiers are contiguous, so ratios
    # of exactly 3, 4 or 5 are penalised too instead of slipping through.
    if 1.5 < len_ratio < 3:
        # Linear drop from factor 1.0 at ratio 1.5 to factor 0.9 at ratio 3.0.
        end_ratio = end_ratio * (1.0 - (len_ratio - 1.5) * (0.1 / 1.5))
    elif 3 <= len_ratio < 4:  # small penalty
        end_ratio = end_ratio * 0.9
    elif 4 <= len_ratio < 5:  # medium penalty
        end_ratio = end_ratio * 0.8
    elif len_ratio >= 5:  # large penalty
        end_ratio = end_ratio * 0.7

    # Reward names whose initials cover each other, penalise the rest.
    if is_contained_list([i[:1] for i in l1], [i[:1] for i in l2]):
        end_ratio = end_ratio * 1.05
    else:
        end_ratio = end_ratio * 0.9

    if len(l1) != len(l2):
        end_ratio = end_ratio * 0.92

    # Reward a shared long word of the shorter name; a shared abbreviation
    # alone does not qualify.
    if check_common_elements_by_length_rank(l1, l2) and len_ratio > 1.5:
        end_ratio = end_ratio * 1.1

    if l1[0] != l2[0]:
        end_ratio = end_ratio * Jaro.normalized_similarity(l1[0], l2[0])

    # Blend the adjusted score with the raw similarity so that the raw
    # similarity pulls the result up or down.
    return min(end_ratio, 1) * 0.5 + normal_end_ratio * 0.5
@@ -0,0 +1,26 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger(__name__) # 创建 logger 实例
4
+
5
+
6
class BaseDict(object):
    @classmethod
    def flip_dict(cls, original_dict, raise_on_conflict=False):
        """
        Invert a dict whose values are lists: each list element becomes a key
        that maps back to the key it was listed under.
        :param original_dict: mapping of key -> list of elements
        :param raise_on_conflict: raise ValueError when an element appears more
            than once instead of overwriting it (default False)
        :return: the inverted dict
        """
        inverted = {}
        for key in original_dict:
            for value in original_dict[key]:
                already_seen = value in inverted
                if already_seen and raise_on_conflict:
                    raise ValueError(f"Key conflict detected: {value} already exists in the flipped dictionary.")
                if already_seen:
                    # The later key wins; the overwrite is only logged.
                    logger.warning(
                        f"Warning: Key conflict detected for {value}. Overwriting with new value: {key}.")
                inverted[value] = key
        return inverted
@@ -0,0 +1,127 @@
1
+ import gzip
2
+ from io import BytesIO
3
+
4
+ from hdfs import InsecureClient
5
+
6
+
7
class HDFSUtils(object):
    """
    Thin wrapper around ``hdfs.InsecureClient`` for common HDFS operations.
    """

    def __init__(self, hdfs_url, hdfs_user):
        """
        Create the HDFS client.
        :param hdfs_url: HDFS URL, e.g. "http://namenode:50070"
        :param hdfs_user: HDFS user name
        """
        self.hdfs_url = hdfs_url
        self.hdfs_user = hdfs_user
        self.client = InsecureClient(hdfs_url, user=hdfs_user)

    def upload_file(self, local_path, hdfs_path, overwrite=False):
        """
        Upload a local file to HDFS.
        :param local_path: path of the local file
        :param hdfs_path: destination path on HDFS
        :param overwrite: whether an existing file may be overwritten
        :return: None
        """
        self.client.upload(hdfs_path, local_path, overwrite=overwrite)
        print(f"文件上传成功: {local_path} -> {hdfs_path}")

    def download_file(self, hdfs_path, local_path, overwrite=False):
        """
        Download a file from HDFS to the local file system.
        :param hdfs_path: path of the file on HDFS
        :param local_path: destination path on the local file system
        :param overwrite: whether an existing file may be overwritten
        :return: None
        """
        self.client.download(hdfs_path, local_path, overwrite=overwrite)
        print(f"文件下载成功: {hdfs_path} -> {local_path}")

    def delete_file(self, hdfs_path, recursive=False):
        """
        Delete a file or directory on HDFS.
        :param hdfs_path: path of the file or directory on HDFS
        :param recursive: whether directories are deleted recursively
        :return: None
        """
        self.client.delete(hdfs_path, recursive=recursive)
        print(f"文件/目录删除成功: {hdfs_path}")

    def create_directory(self, hdfs_path):
        """
        Create a directory on HDFS.
        :param hdfs_path: path of the directory on HDFS
        :return: None
        """
        self.client.makedirs(hdfs_path)
        print(f"目录创建成功: {hdfs_path}")

    def list_files(self, hdfs_path):
        """
        List the files and sub-directories of an HDFS directory.
        :param hdfs_path: path of the directory on HDFS
        :return: whatever the client's ``list`` call returns for that path
        """
        return self.client.list(hdfs_path)

    def read_file(self, hdfs_path):
        """
        Read the content of an HDFS file.
        :param hdfs_path: path of the file on HDFS
        :return: the content returned by the reader's ``read()``
        """
        with self.client.read(hdfs_path) as reader:
            data = reader.read()
        print(f"文件读取成功: {hdfs_path}")
        return data

    def read_gz_file(self, hdfs_path, encoding='utf-8'):
        """
        Read a gzip-compressed HDFS file and return its decoded text.
        :param hdfs_path: path of the file on HDFS (expected to be a .gz file)
        :param encoding: text encoding of the decompressed data (default utf-8)
        :return: the decompressed content as a string
        """
        # Fetch the compressed bytes first, then decompress them in memory.
        with self.client.read(hdfs_path) as reader:
            raw = reader.read()
        with gzip.GzipFile(fileobj=BytesIO(raw)) as unpacked:
            text = unpacked.read().decode(encoding)
        print(f"文件读取成功: {hdfs_path}")
        return text

    def write_file(self, hdfs_path, content, overwrite=False, encoding='utf-8'):
        """
        Write content to an HDFS file.
        :param hdfs_path: path of the file on HDFS
        :param content: content to write
        :param overwrite: whether an existing file may be overwritten
        :param encoding: encoding passed to the client's writer
        :return: None
        """
        with self.client.write(hdfs_path, overwrite=overwrite, encoding=encoding) as writer:
            writer.write(content)
        print(f"文件写入成功: {hdfs_path}")

    def file_exists(self, hdfs_path):
        """
        Check whether a file or directory exists on HDFS.
        :param hdfs_path: path of the file or directory on HDFS
        :return: True if the non-strict status lookup returns something
        """
        return self.client.status(hdfs_path, strict=False) is not None

    def rename_file(self, hdfs_src_path, hdfs_dst_path):
        """
        Rename or move a file/directory on HDFS.
        :param hdfs_src_path: source path
        :param hdfs_dst_path: destination path
        :return: None
        """
        self.client.rename(hdfs_src_path, hdfs_dst_path)
        print(f"文件/目录重命名成功: {hdfs_src_path} -> {hdfs_dst_path}")
@@ -0,0 +1,11 @@
1
+ import base64
2
+ import json
3
+
4
+
5
class BytesEncoder(json.JSONEncoder):
    """JSON encoder that serialises ``bytes`` values as base64 text."""

    def default(self, obj):
        """Encode ``bytes`` as a base64 string; defer everything else to the base class."""
        if not isinstance(obj, bytes):
            return super().default(obj)
        encoded = base64.b64encode(obj)
        return encoded.decode('utf-8')
10
+
11
+ # json.dumps(x, ensure_ascii=False, cls=BytesEncoder)
@@ -0,0 +1,9 @@
1
+ import re
2
+
3
+
4
def is_all_english_chars(s):
    """Return True if ``s`` is non-empty and consists only of ASCII letters.

    ``re.fullmatch`` is used instead of ``re.match`` with ``^...$`` because
    ``$`` also matches just before a trailing newline, which would accept a
    string such as "abc\\n".

    :param s: string to test
    :return: True when every character is in A-Z or a-z, otherwise False
    """
    return re.fullmatch(r'[A-Za-z]+', s) is not None
6
+
7
+
8
def contains_chinese_chars(s):
    """Return True if ``s`` has at least one character in U+3400-U+9FFF."""
    found = re.search(r'[\u3400-\u9fff]', s)
    return found is not None
@@ -0,0 +1,98 @@
1
+ import re
2
+ import regex
3
+
4
+ from re_common.v2.baselibrary.utils.stringutils import qj2bj, bj2qj, get_diacritic_variant
5
+
6
+
7
class StringClear(object):
    """Chainable string cleaner.

    Every cleaning method updates ``self.obj_str`` and returns ``self`` so
    calls can be chained; ``get_str()`` returns the current value.
    """

    def __init__(self, obj_str):
        self.obj_str = obj_str

    def None_to_str(self):
        """Replace a ``None`` value with the empty string."""
        if self.obj_str is None:
            self.obj_str = ''
        return self

    def to_str(self):
        """Convert the value with ``str()`` in case another type was passed."""
        self.obj_str = str(self.obj_str)
        return self

    def qj_to_bj(self):
        """Convert full-width characters to half-width (via ``qj2bj``)."""
        self.obj_str = qj2bj(self.obj_str)
        return self

    def bj_to_qj(self):
        """Convert half-width characters to full-width (via ``bj2qj``)."""
        self.obj_str = bj2qj(self.obj_str)
        return self

    def lower(self):
        """Lower-case the value."""
        self.obj_str = self.obj_str.lower()
        return self

    def upper(self):
        """Upper-case the value."""
        self.obj_str = self.obj_str.upper()
        return self

    def collapse_spaces(self):
        """Collapse every run of whitespace into a single space."""
        self.obj_str = re.sub(r"\s+", " ", self.obj_str)
        return self

    def clear_all_spaces(self):
        """Remove all whitespace."""
        self.obj_str = re.sub("\\s+", "", self.obj_str)
        return self

    def clean_symbols(self):
        """
        Remove known symbols: everything in ``\\p{P}`` (punctuation) plus the
        extra symbols listed in the pattern. ``\\p{S}`` would cover them too,
        but it is very broad and could remove characters that should stay.
        """
        # The hyphen is escaped: an unescaped "-" between two characters of a
        # character class is parsed as a range operator, not a literal hyphen.
        self.obj_str = regex.sub(
            "[\\p{P}+~$`^=|<>~`$^+=|<>¥×\\\\*#$^|+%&~!,:.;'/{}()\\[\\]?<> 《》”“\\-()。≤《〈〉》—、·―–‐‘’“”″…¨〔〕°■『』℃ⅠⅡⅢⅣⅤⅥⅦⅩⅪⅫ]",
            "",
            self.obj_str)
        return self

    def remove_special_chars(self):
        """Remove every character that is neither ``\\w`` nor whitespace.

        ``\\w`` covers Unicode letters and digits and also the underscore, so
        underscores are kept.
        """
        self.obj_str = re.sub(r"[^\w\s]", "", self.obj_str)
        return self

    def remove_underline(self):
        """Remove underscores (kept by ``remove_special_chars`` because they are in ``\\w``)."""
        self.obj_str = re.sub("[_]", "", self.obj_str)
        return self

    def remove_diacritics(self):
        """Strip diacritical marks, leaving the base letters (via ``get_diacritic_variant``)."""
        self.obj_str = get_diacritic_variant(self.obj_str)
        return self

    def remove_brackets(self):
        """Remove square-bracketed segments, brackets included (non-greedy)."""
        self.obj_str = re.sub("\\[.*?]", "", self.obj_str)
        return self

    def remove_parentheses(self):
        """Remove parenthesised segments, parentheses included (non-greedy)."""
        self.obj_str = re.sub("\\(.*?\\)", "", self.obj_str)
        return self

    def get_str(self):
        """Return the current value."""
        return self.obj_str
86
+
87
+
88
def rel_clear(str_obj):
    """Apply the cleaning rules tailored for merged ("fusion") data.

    :param str_obj: value to clean; ``None`` and non-string values are accepted
    :return: the cleaned, lower-cased and stripped string
    """
    cleaner = StringClear(str_obj)
    cleaner.None_to_str()  # None becomes '' so later steps do not fail
    cleaner.to_str()  # guard against non-string input
    cleaner.qj_to_bj()  # full-width to half-width
    cleaner.remove_special_chars()  # keep only \w characters (incl. "_") and whitespace
    cleaner.collapse_spaces()  # whitespace runs become one space
    cleaner.lower()
    return cleaner.get_str().strip()  # drop leading/trailing whitespace
@@ -0,0 +1,95 @@
1
+ import re
2
+
3
+ import regex
4
+ import unicodedata
5
+
6
+
7
def bj2qj(src):
    """
    Convert half-width characters to full-width.

    The ASCII space becomes the ideographic space U+3000, and the printable
    ASCII characters 33-126 are shifted by 65248 to their full-width forms;
    every other character is kept unchanged.
    :param src: string to convert, or None
    :return: the converted string; None is returned unchanged
    """
    if src is None:
        return src

    # Written as escapes because the two spaces are visually indistinguishable
    # as literals and are easily turned into the same character by editors or
    # text normalisation.
    DBC_SPACE = '\u0020'  # half-width space
    SBC_SPACE = '\u3000'  # full-width (ideographic) space
    DBC_CHAR_START = 33
    DBC_CHAR_END = 126
    CONVERT_STEP = 65248

    buf = []
    for char in src:
        if char == DBC_SPACE:
            buf.append(SBC_SPACE)
        elif DBC_CHAR_START <= ord(char) <= DBC_CHAR_END:
            buf.append(chr(ord(char) + CONVERT_STEP))
        else:
            buf.append(char)

    return ''.join(buf)
27
+
28
def qj2bj(src):
    """
    Convert full-width characters to half-width.

    The full-width forms U+FF01-U+FF5E are shifted down by 0xFEE0 to their
    ASCII counterparts and the ideographic space U+3000 becomes an ASCII
    space; every other character is kept unchanged.
    :param src: string to convert, or None
    :return: the converted string; None is returned unchanged
    """
    if src is None:
        return src

    SBC_CHAR_START = 0xFF01
    SBC_CHAR_END = 0xFF5E
    CONVERT_STEP = 0xFEE0
    # Written as escapes because the two spaces are visually indistinguishable
    # as literals and are easily turned into the same character by editors or
    # text normalisation.
    DBC_SPACE = '\u0020'  # half-width space
    SBC_SPACE = '\u3000'  # full-width (ideographic) space

    buf = []
    for char in src:
        if SBC_CHAR_START <= ord(char) <= SBC_CHAR_END:
            buf.append(chr(ord(char) - CONVERT_STEP))
        elif char == SBC_SPACE:
            buf.append(DBC_SPACE)
        else:
            buf.append(char)

    return ''.join(buf)
53
+
54
+
55
def get_diacritic_variant(char1):
    """Return ``char1`` without combining diacritical marks.

    The input is NFD-decomposed and all code points of Unicode category
    ``Mn`` (nonspacing mark) are filtered out.

    :param char1: a character or a whole string
    :return: the remaining code points joined into a string
    """
    def is_base(code_point):
        return unicodedata.category(code_point) != 'Mn'

    return ''.join(filter(is_base, unicodedata.normalize('NFD', char1)))
64
+
65
def get_alphabetic_ratio(text: str) -> float:
    """Return the share of the letters in ``text`` that are Latin-script letters.

    Digits and every other non-letter character are discarded first, so the
    ratio is computed over letters (``\\p{L}``) only.

    :param text: string to analyse
    :return: 0.0 for empty input, 1.0 when the text contains no letters at
        all, otherwise matched letters divided by all letters
    """
    if not text:
        return 0.0

    text = re.sub(r'\d+', '', text)

    # Latin-script code point ranges. Every range uses an ASCII hyphen: with a
    # dash character such as U+2013 the two endpoints and the dash would be
    # three separate literals and the range would match nothing in between.
    alphabetic_pattern = (
        r"[\u0041-\u005A\u0061-\u007A"  # basic Latin letters (A-Z, a-z)
        r"\u00C0-\u00FF"  # accented Latin letters (U+00C0-U+00FF)
        r"\u0080-\u00FF"  # Latin-1 Supplement
        r"\u0100-\u017F"  # Latin Extended-A
        r"\u1E00-\u1EFF"  # Latin Extended Additional
        r"\u0180-\u024F"  # Latin Extended-B
        r"\u2C60-\u2C7F"  # Latin Extended-C
        r"\uA720-\uA7FF"  # Latin Extended-D
        r"\uAB30-\uAB6F"  # Latin Extended-E
        r"]"
    )

    # Keep letters of any script only.
    clean_text = regex.sub(r"[^\p{L}]", "", text)

    if len(clean_text) == 0:
        return 1.0

    alphabetic_chars = re.findall(alphabetic_pattern, clean_text)

    return len(alphabetic_chars) / len(clean_text)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: re_common
3
- Version: 0.2.54
3
+ Version: 2.0.1
4
4
  Summary: a library about all python projects
5
5
  Home-page: https://gitee.com/xujiangios/re-common
6
6
  Author: vic