re_common-10.0.22-py3-none-any.whl → re_common-10.0.24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +195 -0
  145. re_common/v2/baselibrary/business_utils/__init__.py +0 -0
  146. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -0
  147. re_common/v2/baselibrary/decorators/utils.py +59 -59
  148. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  149. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -79
  150. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  151. re_common/v2/baselibrary/tools/dict_tools.py +37 -37
  152. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  153. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  154. re_common/v2/baselibrary/tools/list_tools.py +65 -65
  155. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  156. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  157. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  158. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  159. re_common/v2/baselibrary/utils/author_smi.py +360 -360
  160. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  161. re_common/v2/baselibrary/utils/basedict.py +37 -37
  162. re_common/v2/baselibrary/utils/basehdfs.py +161 -161
  163. re_common/v2/baselibrary/utils/basepika.py +180 -180
  164. re_common/v2/baselibrary/utils/basetime.py +77 -77
  165. re_common/v2/baselibrary/utils/db.py +38 -38
  166. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  167. re_common/v2/baselibrary/utils/mq.py +83 -83
  168. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  169. re_common/v2/baselibrary/utils/string_bool.py +186 -149
  170. re_common/v2/baselibrary/utils/string_clear.py +227 -204
  171. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  172. re_common/v2/baselibrary/utils/stringutils.py +213 -213
  173. re_common/vip/base_step_process.py +11 -11
  174. re_common/vip/baseencodeid.py +90 -90
  175. re_common/vip/changetaskname.py +28 -28
  176. re_common/vip/core_var.py +24 -24
  177. re_common/vip/mmh3Hash.py +89 -89
  178. re_common/vip/proxy/allproxys.py +127 -127
  179. re_common/vip/proxy/allproxys_thread.py +159 -159
  180. re_common/vip/proxy/cnki_proxy.py +153 -153
  181. re_common/vip/proxy/kuaidaili.py +87 -87
  182. re_common/vip/proxy/proxy_all.py +113 -113
  183. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  184. re_common/vip/proxy/wanfang_proxy.py +152 -152
  185. re_common/vip/proxy/wp_proxy_all.py +181 -181
  186. re_common/vip/read_rawid_to_txt.py +91 -91
  187. re_common/vip/title/__init__.py +5 -5
  188. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  189. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  190. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  191. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  192. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  193. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  194. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  195. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  196. re_common/vip/title/transform/__init__.py +10 -10
  197. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/LICENSE +201 -201
  198. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/METADATA +16 -16
  199. re_common-10.0.24.dist-info/RECORD +230 -0
  200. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/WHEEL +1 -1
  201. re_common-10.0.22.dist-info/RECORD +0 -227
  202. {re_common-10.0.22.dist-info → re_common-10.0.24.dist-info}/top_level.txt +0 -0
re_common/v2/baselibrary/utils/base_string_similarity.py
@@ -1,158 +1,158 @@
import Levenshtein
import jellyfish
from rapidfuzz.distance import DamerauLevenshtein, Hamming, Indel, LCSseq, OSA


class BaseStringSimilarity(object):

    @classmethod
    def levenshtein_similarity(cls, str1, str2) -> float:
        """
        Return a similarity score based on the edit distance between the two strings.
        """
        # Edit distance
        distance = Levenshtein.distance(str1, str2)
        # Score by dividing by the length of the longer string
        similarity = 1 - (distance / max(len(str1), len(str2)))
        return similarity

    @classmethod
    def damerau_normalized_distance_similarity(cls, str1, str2) -> float:
        """
        # Normalized edit distance in [0, 1]; smaller means more similar. We score by
        # similarity rather than by distance, so this is not used:
        similarity = DamerauLevenshtein.normalized_distance(str1, str2)
        Similarity score in [0, max_len]; larger means more similar:
        print(DamerauLevenshtein.similarity(str1, str2))
        """
        # Same approach as cls.levenshtein_similarity; only the underlying edit distance differs
        similarity = DamerauLevenshtein.normalized_similarity(str1, str2)
        return similarity

    @classmethod
    def indel_levenshtein_similarity(cls, str1, str2) -> float:
        """
        Effectively equivalent to Indel.normalized_similarity(str1, str2).

        Indel distance between str1 and str2 (minimum number of insertions and deletions):
        Indel.distance(str1, str2)
        Normalized Indel distance in [0, 1], where 0 means identical and 1 means
        completely different: ``distance / (len1 + len2)``.
        Indel.normalized_distance(str1, str2)
        Indel similarity in [0, max], computed as ``(len1 + len2) - distance``:
        Indel.similarity(str1, str2)
        Normalized Indel similarity in [0, 1], computed as ``1 - normalized_distance``:
        Indel.normalized_similarity(str1, str2)
        """
        # Similarity between 0 and 1 (1 means identical)
        similarity = Levenshtein.ratio(str1, str2)
        return similarity

    @classmethod
    def jaro_similarity(cls, str1, str2) -> float:
        """
        Jaro similarity measures how similar two strings are, based mainly on:
            matching characters
            character order
            transpositions (swapped positions)

        Consistent with Jaro.normalized_similarity(str1, str2).
        """
        return jellyfish.jaro_similarity(str1, str2)

    @classmethod
    def jaro_winkler_similarity(cls, str1, str2) -> float:
        """
        Jaro-Winkler is a refinement of Jaro that gives extra weight to matching prefixes.

        Consistent with JaroWinkler.normalized_similarity(str1, str2).

        print(JaroWinkler.distance(str1, str2))
        matches print(JaroWinkler.normalized_distance(str1, str2))

        print(JaroWinkler.similarity(str1, str2))
        matches print(JaroWinkler.normalized_similarity(str1, str2))
        """
        return jellyfish.jaro_winkler_similarity(str1, str2)

    @classmethod
    def osa_similarity(cls, str1, str2) -> float:
        """
        Normalized Optimal String Alignment (OSA) similarity in [0, 1].

        Computed as ``1 - normalized_distance``.
        """
        return OSA.normalized_similarity(str1, str2)

    @classmethod
    def lcs_seq_similarity(cls, str1, str2) -> float:
        """
        Normalized LCS similarity in [0, 1].
        Computed as ``1 - normalized_distance``.
        """
        return LCSseq.normalized_similarity(str1, str2)

    @classmethod
    def lcs_seq_distance(cls, str1, str2) -> int:
        """
        LCSseq.distance is a RapidFuzz method that computes the Longest Common
        Subsequence (LCS) distance between two strings.
        The LCS is the longest subsequence that appears in both strings in order,
        but not necessarily contiguously. For example:
        the LCS of "abcde" and "ace" is "ace" (length 3);
        the LCS of "Druitt, Robert" and "Druitt R." is roughly "Druitt R" (length 8).
        The LCS distance lies in [0, max] and is computed as
        ``max(len1, len2) - similarity``.
        """
        return LCSseq.distance(str1, str2)

    @classmethod
    def osa_distance(cls, str1, str2) -> int:
        """
        OSA.distance (Optimal String Alignment distance) is a RapidFuzz method that
        computes an edit distance between two strings, but with stricter constraints
        than the standard Levenshtein distance.

        OSA additionally allows transpositions of adjacent characters, but is more
        restrictive than Damerau-Levenshtein (Damerau allows multiple transpositions,
        while OSA allows only a single one).
        """
        return OSA.distance(str1, str2)

    @classmethod
    def levenshtein_distance(cls, str1, str2) -> int:
        """
        Return the edit distance between the two strings.
        The standard Levenshtein distance allows insertions, deletions and
        substitutions, but not transpositions of adjacent characters.

        jellyfish.levenshtein_distance(str1, str2) gives the same result as this method.

        print(Jaro.distance(str1, str2))
        matches print(Jaro.normalized_distance(str1, str2))

        print(Jaro.similarity(str1, str2))
        matches print(Jaro.normalized_similarity(str1, str2))
        """
        # Edit distance
        distance = Levenshtein.distance(str1, str2)
        print(jellyfish.levenshtein_distance(str1, str2))
        return distance

    @classmethod
    def indel_distance(cls, str1, str2) -> int:
        """
        The Indel (Insertion + Deletion) distance is an edit distance that only counts
        insertions and deletions, not substitutions.
        """
        return Indel.distance(str1, str2)

    @classmethod
    def damerau_levenshtein_distance(cls, str1, str2) -> int:
        """
        The Damerau-Levenshtein distance is a variant of the Levenshtein distance that
        counts a transposition (e.g. writing ifsh for fish) as a single edit.
        """
        # Edit distance
        distance = jellyfish.damerau_levenshtein_distance(str1, str2)
        print(DamerauLevenshtein.distance(str1, str2))
        return distance

    @classmethod
    def hamming_distance(cls, str1, str2) -> int:
        return Hamming.distance(str1, str2)

    # str1 = "primulina elegant ladyis a new culitvar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate"
    # str2 = "primulinaelegant labyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate 2019 editorial office of acta horticulturae sinica all rights reserved"
    # # str1 = "primulina elegant ladyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate"
    # # str2 = "primulinaelegant ladyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate 2019 editorial office of acta horticulturae sinica all rights reserved"
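For reference, a minimal usage sketch of BaseStringSimilarity (not part of the package; it assumes the import path mirrors the file layout in the listing above and that the Levenshtein, jellyfish and rapidfuzz dependencies are installed; the sample strings are illustrative):

# Hypothetical example: compare two near-duplicate strings with a few of the
# measures exposed above.
from re_common.v2.baselibrary.utils.base_string_similarity import BaseStringSimilarity

s1 = "primulina elegant lady is a new cultivar"
s2 = "primulina elegant laby is a new cultivar"

print(BaseStringSimilarity.levenshtein_similarity(s1, s2))   # 1 - distance / max(len1, len2)
print(BaseStringSimilarity.jaro_winkler_similarity(s1, s2))  # prefix-weighted Jaro
print(BaseStringSimilarity.lcs_seq_similarity(s1, s2))       # normalized LCS similarity
print(BaseStringSimilarity.levenshtein_distance(s1, s2))     # raw edit distance (also prints the jellyfish value)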
re_common/v2/baselibrary/utils/basedict.py
@@ -1,38 +1,38 @@
import logging
from itertools import groupby

logger = logging.getLogger(__name__)  # module-level logger instance


class BaseDict(object):
    @classmethod
    def flip_dict(cls, original_dict, raise_on_conflict=True):
        """
        Flip a dict whose keys are strings and whose values are lists, producing a dict
        whose keys are the elements of the original value lists and whose values are the
        original keys.
        :param original_dict: the original dict
        :param raise_on_conflict: whether to raise on key conflicts, defaults to True
        :return: the flipped dict
        """
        flipped_dict = {}
        for key, value_list in original_dict.items():
            for value in value_list:
                if value in flipped_dict:
                    if raise_on_conflict:
                        raise ValueError(f"Key conflict detected: {value} already exists in the flipped dictionary.")
                    else:
                        # Overwrite the conflicting key
                        logger.warning(
                            f"Warning: Key conflict detected for {value}. Overwriting with new value: {key}.")
                flipped_dict[value] = key
        return flipped_dict

    @classmethod
    def get_temp_gid_dicts(cls, lists, key_name):
        """
        Group a list of dicts by key_name and build a dict keyed by the group id.
        """
        dicts = {}
        for group_id, group_tmp in groupby(sorted(lists, key=lambda x: x[key_name]),
                                           key=lambda x: x[key_name]):
            # Note: group_tmp is an itertools grouper that is invalidated once the
            # outer loop advances, so consume it before moving to the next group.
            dicts[group_id] = group_tmp
        return dicts
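Likewise, a minimal sketch of how BaseDict behaves (illustrative values; import path assumed from the listing above):

# Hypothetical example of flipping a key -> list-of-values mapping.
from re_common.v2.baselibrary.utils.basedict import BaseDict

mapping = {"a": [1, 2], "b": [3]}
print(BaseDict.flip_dict(mapping))  # {1: 'a', 2: 'a', 3: 'b'}
# With the default raise_on_conflict=True, a ValueError is raised if the same
# element appears under more than one key.

rows = [{"gid": 1, "v": "x"}, {"gid": 2, "v": "y"}, {"gid": 1, "v": "z"}]
groups = BaseDict.get_temp_gid_dicts(rows, "gid")
print(list(groups.keys()))  # [1, 2]; the stored groupby iterators are already
                            # exhausted by the time the method returns.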