re-common 10.0.37__py3-none-any.whl → 10.0.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +77 -77
- re_common/v2/baselibrary/utils/db.py +156 -156
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +186 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +271 -278
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/LICENSE +201 -201
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/METADATA +16 -16
- re_common-10.0.39.dist-info/RECORD +248 -0
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/WHEEL +1 -1
- re_common-10.0.37.dist-info/RECORD +0 -248
- {re_common-10.0.37.dist-info → re_common-10.0.39.dist-info}/top_level.txt +0 -0
|
@@ -1,158 +1,158 @@
|
|
|
1
|
-
import Levenshtein
|
|
2
|
-
import jellyfish
|
|
3
|
-
from rapidfuzz.distance import DamerauLevenshtein, Hamming, Indel, LCSseq, OSA
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class BaseStringSimilarity(object):
|
|
7
|
-
|
|
8
|
-
@classmethod
|
|
9
|
-
def levenshtein_similarity(cls, str1, str2) -> float:
|
|
10
|
-
"""
|
|
11
|
-
返回 两个字字符串之间的编辑距离 分数
|
|
12
|
-
"""
|
|
13
|
-
# 编辑距离长度
|
|
14
|
-
distance = Levenshtein.distance(str1, str2)
|
|
15
|
-
# 以最长字符串为除数算分
|
|
16
|
-
similarity = 1 - (distance / max(len(str1), len(str2)))
|
|
17
|
-
return similarity
|
|
18
|
-
|
|
19
|
-
@classmethod
|
|
20
|
-
def damerau_normalized_distance_similarity(cls, str1, str2) -> float:
|
|
21
|
-
"""
|
|
22
|
-
# 计算 归一化的编辑距离,取值范围 [0, 1],值越小表示越相似。 一般不以小评估分 所以不用
|
|
23
|
-
similarity = DamerauLevenshtein.normalized_distance(str1, str2)
|
|
24
|
-
作用:计算 相似度得分,取值范围 [0, max_len],值越大表示越相似。
|
|
25
|
-
print(DamerauLevenshtein.similarity(str1, str2))
|
|
26
|
-
"""
|
|
27
|
-
# 该算法与 cls.levenshtein_similarity 算法一致 只是 编辑距离的得值不一样
|
|
28
|
-
similarity = DamerauLevenshtein.normalized_similarity(str1, str2)
|
|
29
|
-
return similarity
|
|
30
|
-
|
|
31
|
-
@classmethod
|
|
32
|
-
def indel_levenshtein_similarity(cls, str1, str2) -> float:
|
|
33
|
-
"""
|
|
34
|
-
本质上使用的 是 Indel.normalized_similarity(str1,str2) 方法
|
|
35
|
-
|
|
36
|
-
计算 str1 和 str2 之间的 Indel 距离(插入和删除操作的最小次数)
|
|
37
|
-
Indel.distance(str1, str2)
|
|
38
|
-
计算 标准化后的 Indel 距离,取值范围在 [0, 1] 之间,其中 0 表示完全相同,1 表示完全不同。 ``distance / (len1 + len2)``.
|
|
39
|
-
Indel.normalized_distance(str1, str2)
|
|
40
|
-
计算 [max, 0] 范围内的 Indel 相似度。计算公式为“(len1 + len2) - distance”
|
|
41
|
-
Indel.similarity(str1, str2)
|
|
42
|
-
计算 [0, 1] 范围内的归一化插入/缺失相似度。计算公式为“1 - normalized_distance”
|
|
43
|
-
Indel.normalized_similarity(str1, str2)
|
|
44
|
-
|
|
45
|
-
"""
|
|
46
|
-
# 计算相似度(0到1之间的值,1表示完全相同)
|
|
47
|
-
similarity = Levenshtein.ratio(str1, str2)
|
|
48
|
-
return similarity
|
|
49
|
-
|
|
50
|
-
@classmethod
|
|
51
|
-
def jaro_similarity(cls, str1, str2) -> float:
|
|
52
|
-
"""
|
|
53
|
-
Jaro 相似度是一种用于测量两个字符串相似度的算法,主要考虑:
|
|
54
|
-
匹配的字符
|
|
55
|
-
字符顺序
|
|
56
|
-
字符转置(位置交换)
|
|
57
|
-
|
|
58
|
-
与 Jaro.normalized_similarity(str1,str2) 一致
|
|
59
|
-
"""
|
|
60
|
-
return jellyfish.jaro_similarity(str1, str2)
|
|
61
|
-
|
|
62
|
-
@classmethod
|
|
63
|
-
def jaro_winkler_similarity(cls, str1, str2) -> float:
|
|
64
|
-
"""
|
|
65
|
-
Jaro-Winkler 是 Jaro 的改进版,对前缀匹配给予更多权重
|
|
66
|
-
|
|
67
|
-
与 JaroWinkler.normalized_similarity(str1,str2) 结果一致
|
|
68
|
-
|
|
69
|
-
print(JaroWinkler.distance(str1, str2))
|
|
70
|
-
与 print(JaroWinkler.normalized_distance(str1, str2)) 结果一致
|
|
71
|
-
|
|
72
|
-
print(JaroWinkler.similarity(str1, str2))
|
|
73
|
-
与 print(JaroWinkler.normalized_similarity(str1,str2)) 结果一致
|
|
74
|
-
"""
|
|
75
|
-
return jellyfish.jaro_winkler_similarity(str1, str2)
|
|
76
|
-
|
|
77
|
-
@classmethod
|
|
78
|
-
def osa_similarity(cls, str1, str2) -> float:
|
|
79
|
-
"""
|
|
80
|
-
计算 [0, 1] 范围内的归一化最佳字符串比对 (OSA) 相似度。
|
|
81
|
-
|
|
82
|
-
计算公式为“1 - normalized_distance”
|
|
83
|
-
"""
|
|
84
|
-
return OSA.normalized_similarity(str1, str2)
|
|
85
|
-
|
|
86
|
-
@classmethod
|
|
87
|
-
def lcs_seq_similarity(cls, str1, str2) -> float:
|
|
88
|
-
"""
|
|
89
|
-
计算 [0, 1] 范围内的归一化 LCS 相似度。
|
|
90
|
-
计算公式为“1 - normalized_distance”
|
|
91
|
-
"""
|
|
92
|
-
return LCSseq.normalized_similarity(str1, str2)
|
|
93
|
-
|
|
94
|
-
@classmethod
|
|
95
|
-
def lcs_seq_distance(cls, str1, str2) -> int:
|
|
96
|
-
"""
|
|
97
|
-
LCSseq.distance 是 RapidFuzz 库中的一个方法,用于计算两个字符串之间的 最长公共子序列(Longest Common Subsequence, LCS)距离。
|
|
98
|
-
LCS 是指两个字符串中 按顺序出现但不一定连续 的最长子序列。例如:
|
|
99
|
-
"abcde" 和 "ace" 的 LCS 是 "ace"(长度 3)。
|
|
100
|
-
"Druitt, Robert" 和 "Druitt R." 的 LCS 可能是 "Druitt R"(长度 8)。
|
|
101
|
-
计算 [0, max] 范围内的 LCS 距离。
|
|
102
|
-
计算公式为“max(len1, len2) - 相似度”。
|
|
103
|
-
"""
|
|
104
|
-
return LCSseq.distance(str1, str2)
|
|
105
|
-
|
|
106
|
-
@classmethod
|
|
107
|
-
def osa_distance(cls, str1, str2) -> int:
|
|
108
|
-
"""
|
|
109
|
-
OSA.distance(Optimal String Alignment,最优字符串对齐距离)是 RapidFuzz 库中的一个方法,用于计算两个字符串之间的 编辑距离(Edit Distance),但比标准的 Levenshtein 距离 限制更严格。
|
|
110
|
-
|
|
111
|
-
OSA 额外允许 相邻字符交换(Transposition),但限制比 Damerau-Levenshtein 更严格(Damerau 允许多次交换,而 OSA 仅限一次)。
|
|
112
|
-
"""
|
|
113
|
-
return OSA.distance(str1, str2)
|
|
114
|
-
|
|
115
|
-
@classmethod
|
|
116
|
-
def levenshtein_distance(cls, str1, str2) -> int:
|
|
117
|
-
"""
|
|
118
|
-
返回 两个字字符串之间的编辑距离 分数
|
|
119
|
-
标准 Levenshtein 距离 允许 插入、删除、替换 三种操作,但不允许 相邻字符交换(transposition)
|
|
120
|
-
|
|
121
|
-
jellyfish.levenshtein_distance(str1,str2) 该方法结果与 本方法一致
|
|
122
|
-
|
|
123
|
-
print(Jaro.distance(str1, str2))
|
|
124
|
-
与 print(Jaro.normalized_distance(str1, str2)) 结果一致
|
|
125
|
-
|
|
126
|
-
print(Jaro.similarity(str1, str2))
|
|
127
|
-
与 print(Jaro.normalized_similarity(str1,str2)) 结果一致
|
|
128
|
-
"""
|
|
129
|
-
# 编辑距离长度
|
|
130
|
-
distance = Levenshtein.distance(str1, str2)
|
|
131
|
-
print(jellyfish.levenshtein_distance(str1, str2))
|
|
132
|
-
return distance
|
|
133
|
-
|
|
134
|
-
@classmethod
|
|
135
|
-
def indel_distance(cls, str1, str2) -> int:
|
|
136
|
-
"""
|
|
137
|
-
Indel(Insertion + Deletion)距离是 仅考虑插入和删除操作 的编辑距离,不考虑替换操作。
|
|
138
|
-
"""
|
|
139
|
-
return Indel.distance(str1, str2)
|
|
140
|
-
|
|
141
|
-
@classmethod
|
|
142
|
-
def damerau_levenshtein_distance(cls, str1, str2) -> int:
|
|
143
|
-
"""
|
|
144
|
-
Damerau-Levenshtein 距离是 Levenshtein 距离的修改,它将换位(例如将 ifsh 表示为 fish)计为一次编辑
|
|
145
|
-
"""
|
|
146
|
-
# 编辑距离长度
|
|
147
|
-
distance = jellyfish.damerau_levenshtein_distance(str1, str2)
|
|
148
|
-
print(DamerauLevenshtein.distance(str1, str2))
|
|
149
|
-
return distance
|
|
150
|
-
|
|
151
|
-
@classmethod
|
|
152
|
-
def hamming_distance(cls, str1, str2) -> int:
|
|
153
|
-
return Hamming.distance(str1, str2)
|
|
154
|
-
|
|
155
|
-
# str1 = "primulina elegant ladyis a new culitvar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate"
|
|
156
|
-
# str2 = "primulinaelegant labyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate 2019 editorial office of acta horticulturae sinica all rights reserved"
|
|
157
|
-
# # str1 = "primulina elegant ladyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate"
|
|
158
|
-
# # str2 = "primulinaelegant ladyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate 2019 editorial office of acta horticulturae sinica all rights reserved"
|
|
1
|
+
import Levenshtein
|
|
2
|
+
import jellyfish
|
|
3
|
+
from rapidfuzz.distance import DamerauLevenshtein, Hamming, Indel, LCSseq, OSA
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class BaseStringSimilarity(object):
|
|
7
|
+
|
|
8
|
+
@classmethod
|
|
9
|
+
def levenshtein_similarity(cls, str1, str2) -> float:
|
|
10
|
+
"""
|
|
11
|
+
返回 两个字字符串之间的编辑距离 分数
|
|
12
|
+
"""
|
|
13
|
+
# 编辑距离长度
|
|
14
|
+
distance = Levenshtein.distance(str1, str2)
|
|
15
|
+
# 以最长字符串为除数算分
|
|
16
|
+
similarity = 1 - (distance / max(len(str1), len(str2)))
|
|
17
|
+
return similarity
|
|
18
|
+
|
|
19
|
+
@classmethod
|
|
20
|
+
def damerau_normalized_distance_similarity(cls, str1, str2) -> float:
|
|
21
|
+
"""
|
|
22
|
+
# 计算 归一化的编辑距离,取值范围 [0, 1],值越小表示越相似。 一般不以小评估分 所以不用
|
|
23
|
+
similarity = DamerauLevenshtein.normalized_distance(str1, str2)
|
|
24
|
+
作用:计算 相似度得分,取值范围 [0, max_len],值越大表示越相似。
|
|
25
|
+
print(DamerauLevenshtein.similarity(str1, str2))
|
|
26
|
+
"""
|
|
27
|
+
# 该算法与 cls.levenshtein_similarity 算法一致 只是 编辑距离的得值不一样
|
|
28
|
+
similarity = DamerauLevenshtein.normalized_similarity(str1, str2)
|
|
29
|
+
return similarity
|
|
30
|
+
|
|
31
|
+
@classmethod
|
|
32
|
+
def indel_levenshtein_similarity(cls, str1, str2) -> float:
|
|
33
|
+
"""
|
|
34
|
+
本质上使用的 是 Indel.normalized_similarity(str1,str2) 方法
|
|
35
|
+
|
|
36
|
+
计算 str1 和 str2 之间的 Indel 距离(插入和删除操作的最小次数)
|
|
37
|
+
Indel.distance(str1, str2)
|
|
38
|
+
计算 标准化后的 Indel 距离,取值范围在 [0, 1] 之间,其中 0 表示完全相同,1 表示完全不同。 ``distance / (len1 + len2)``.
|
|
39
|
+
Indel.normalized_distance(str1, str2)
|
|
40
|
+
计算 [max, 0] 范围内的 Indel 相似度。计算公式为“(len1 + len2) - distance”
|
|
41
|
+
Indel.similarity(str1, str2)
|
|
42
|
+
计算 [0, 1] 范围内的归一化插入/缺失相似度。计算公式为“1 - normalized_distance”
|
|
43
|
+
Indel.normalized_similarity(str1, str2)
|
|
44
|
+
|
|
45
|
+
"""
|
|
46
|
+
# 计算相似度(0到1之间的值,1表示完全相同)
|
|
47
|
+
similarity = Levenshtein.ratio(str1, str2)
|
|
48
|
+
return similarity
|
|
49
|
+
|
|
50
|
+
@classmethod
|
|
51
|
+
def jaro_similarity(cls, str1, str2) -> float:
|
|
52
|
+
"""
|
|
53
|
+
Jaro 相似度是一种用于测量两个字符串相似度的算法,主要考虑:
|
|
54
|
+
匹配的字符
|
|
55
|
+
字符顺序
|
|
56
|
+
字符转置(位置交换)
|
|
57
|
+
|
|
58
|
+
与 Jaro.normalized_similarity(str1,str2) 一致
|
|
59
|
+
"""
|
|
60
|
+
return jellyfish.jaro_similarity(str1, str2)
|
|
61
|
+
|
|
62
|
+
@classmethod
|
|
63
|
+
def jaro_winkler_similarity(cls, str1, str2) -> float:
|
|
64
|
+
"""
|
|
65
|
+
Jaro-Winkler 是 Jaro 的改进版,对前缀匹配给予更多权重
|
|
66
|
+
|
|
67
|
+
与 JaroWinkler.normalized_similarity(str1,str2) 结果一致
|
|
68
|
+
|
|
69
|
+
print(JaroWinkler.distance(str1, str2))
|
|
70
|
+
与 print(JaroWinkler.normalized_distance(str1, str2)) 结果一致
|
|
71
|
+
|
|
72
|
+
print(JaroWinkler.similarity(str1, str2))
|
|
73
|
+
与 print(JaroWinkler.normalized_similarity(str1,str2)) 结果一致
|
|
74
|
+
"""
|
|
75
|
+
return jellyfish.jaro_winkler_similarity(str1, str2)
|
|
76
|
+
|
|
77
|
+
@classmethod
|
|
78
|
+
def osa_similarity(cls, str1, str2) -> float:
|
|
79
|
+
"""
|
|
80
|
+
计算 [0, 1] 范围内的归一化最佳字符串比对 (OSA) 相似度。
|
|
81
|
+
|
|
82
|
+
计算公式为“1 - normalized_distance”
|
|
83
|
+
"""
|
|
84
|
+
return OSA.normalized_similarity(str1, str2)
|
|
85
|
+
|
|
86
|
+
@classmethod
|
|
87
|
+
def lcs_seq_similarity(cls, str1, str2) -> float:
|
|
88
|
+
"""
|
|
89
|
+
计算 [0, 1] 范围内的归一化 LCS 相似度。
|
|
90
|
+
计算公式为“1 - normalized_distance”
|
|
91
|
+
"""
|
|
92
|
+
return LCSseq.normalized_similarity(str1, str2)
|
|
93
|
+
|
|
94
|
+
@classmethod
|
|
95
|
+
def lcs_seq_distance(cls, str1, str2) -> int:
|
|
96
|
+
"""
|
|
97
|
+
LCSseq.distance 是 RapidFuzz 库中的一个方法,用于计算两个字符串之间的 最长公共子序列(Longest Common Subsequence, LCS)距离。
|
|
98
|
+
LCS 是指两个字符串中 按顺序出现但不一定连续 的最长子序列。例如:
|
|
99
|
+
"abcde" 和 "ace" 的 LCS 是 "ace"(长度 3)。
|
|
100
|
+
"Druitt, Robert" 和 "Druitt R." 的 LCS 可能是 "Druitt R"(长度 8)。
|
|
101
|
+
计算 [0, max] 范围内的 LCS 距离。
|
|
102
|
+
计算公式为“max(len1, len2) - 相似度”。
|
|
103
|
+
"""
|
|
104
|
+
return LCSseq.distance(str1, str2)
|
|
105
|
+
|
|
106
|
+
@classmethod
|
|
107
|
+
def osa_distance(cls, str1, str2) -> int:
|
|
108
|
+
"""
|
|
109
|
+
OSA.distance(Optimal String Alignment,最优字符串对齐距离)是 RapidFuzz 库中的一个方法,用于计算两个字符串之间的 编辑距离(Edit Distance),但比标准的 Levenshtein 距离 限制更严格。
|
|
110
|
+
|
|
111
|
+
OSA 额外允许 相邻字符交换(Transposition),但限制比 Damerau-Levenshtein 更严格(Damerau 允许多次交换,而 OSA 仅限一次)。
|
|
112
|
+
"""
|
|
113
|
+
return OSA.distance(str1, str2)
|
|
114
|
+
|
|
115
|
+
@classmethod
|
|
116
|
+
def levenshtein_distance(cls, str1, str2) -> int:
|
|
117
|
+
"""
|
|
118
|
+
返回 两个字字符串之间的编辑距离 分数
|
|
119
|
+
标准 Levenshtein 距离 允许 插入、删除、替换 三种操作,但不允许 相邻字符交换(transposition)
|
|
120
|
+
|
|
121
|
+
jellyfish.levenshtein_distance(str1,str2) 该方法结果与 本方法一致
|
|
122
|
+
|
|
123
|
+
print(Jaro.distance(str1, str2))
|
|
124
|
+
与 print(Jaro.normalized_distance(str1, str2)) 结果一致
|
|
125
|
+
|
|
126
|
+
print(Jaro.similarity(str1, str2))
|
|
127
|
+
与 print(Jaro.normalized_similarity(str1,str2)) 结果一致
|
|
128
|
+
"""
|
|
129
|
+
# 编辑距离长度
|
|
130
|
+
distance = Levenshtein.distance(str1, str2)
|
|
131
|
+
print(jellyfish.levenshtein_distance(str1, str2))
|
|
132
|
+
return distance
|
|
133
|
+
|
|
134
|
+
@classmethod
|
|
135
|
+
def indel_distance(cls, str1, str2) -> int:
|
|
136
|
+
"""
|
|
137
|
+
Indel(Insertion + Deletion)距离是 仅考虑插入和删除操作 的编辑距离,不考虑替换操作。
|
|
138
|
+
"""
|
|
139
|
+
return Indel.distance(str1, str2)
|
|
140
|
+
|
|
141
|
+
@classmethod
|
|
142
|
+
def damerau_levenshtein_distance(cls, str1, str2) -> int:
|
|
143
|
+
"""
|
|
144
|
+
Damerau-Levenshtein 距离是 Levenshtein 距离的修改,它将换位(例如将 ifsh 表示为 fish)计为一次编辑
|
|
145
|
+
"""
|
|
146
|
+
# 编辑距离长度
|
|
147
|
+
distance = jellyfish.damerau_levenshtein_distance(str1, str2)
|
|
148
|
+
print(DamerauLevenshtein.distance(str1, str2))
|
|
149
|
+
return distance
|
|
150
|
+
|
|
151
|
+
@classmethod
|
|
152
|
+
def hamming_distance(cls, str1, str2) -> int:
|
|
153
|
+
return Hamming.distance(str1, str2)
|
|
154
|
+
|
|
155
|
+
# str1 = "primulina elegant ladyis a new culitvar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate"
|
|
156
|
+
# str2 = "primulinaelegant labyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate 2019 editorial office of acta horticulturae sinica all rights reserved"
|
|
157
|
+
# # str1 = "primulina elegant ladyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate"
|
|
158
|
+
# # str2 = "primulinaelegant ladyis a new cultivar developed by crossing seed parent primulina medica and pollen parent primulina longii it has fresh and elegant flowershigh ornamental value and strong shade tolerance it is easy to cultivate and propagate 2019 editorial office of acta horticulturae sinica all rights reserved"
|
|
@@ -1,38 +1,38 @@
|
|
|
1
|
-
import logging
|
|
2
|
-
from itertools import groupby
|
|
3
|
-
|
|
4
|
-
logger = logging.getLogger(__name__) # 创建 logger 实例
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
class BaseDict(object):
|
|
8
|
-
@classmethod
|
|
9
|
-
def flip_dict(cls, original_dict, raise_on_conflict=True):
|
|
10
|
-
"""
|
|
11
|
-
翻转字典:将 key 是字符串、value 是列表的字典,转换为 key 是原 value 列表中的元素、value 是原 key 的字典。
|
|
12
|
-
:param original_dict: 原始字典
|
|
13
|
-
:param raise_on_conflict: 是否在键冲突时抛出异常,默认为 False
|
|
14
|
-
:return: 翻转后的字典
|
|
15
|
-
"""
|
|
16
|
-
flipped_dict = {}
|
|
17
|
-
for key, value_list in original_dict.items():
|
|
18
|
-
for value in value_list:
|
|
19
|
-
if value in flipped_dict:
|
|
20
|
-
if raise_on_conflict:
|
|
21
|
-
raise ValueError(f"Key conflict detected: {value} already exists in the flipped dictionary.")
|
|
22
|
-
else:
|
|
23
|
-
# 覆盖冲突的键
|
|
24
|
-
logger.warning(
|
|
25
|
-
f"Warning: Key conflict detected for {value}. Overwriting with new value: {key}.")
|
|
26
|
-
flipped_dict[value] = key
|
|
27
|
-
return flipped_dict
|
|
28
|
-
|
|
29
|
-
@classmethod
|
|
30
|
-
def get_temp_gid_dicts(cls,lists,key_name):
|
|
31
|
-
"""
|
|
32
|
-
对 列表字典 分组 组成 分组id的字典
|
|
33
|
-
"""
|
|
34
|
-
dicts = {}
|
|
35
|
-
for group_id, group_tmp in groupby(sorted(lists, key=lambda x: x[key_name]),
|
|
36
|
-
key=lambda x: x[key_name]):
|
|
37
|
-
dicts[group_id] = group_tmp
|
|
1
|
+
import logging
|
|
2
|
+
from itertools import groupby
|
|
3
|
+
|
|
4
|
+
logger = logging.getLogger(__name__) # 创建 logger 实例
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class BaseDict(object):
|
|
8
|
+
@classmethod
|
|
9
|
+
def flip_dict(cls, original_dict, raise_on_conflict=True):
|
|
10
|
+
"""
|
|
11
|
+
翻转字典:将 key 是字符串、value 是列表的字典,转换为 key 是原 value 列表中的元素、value 是原 key 的字典。
|
|
12
|
+
:param original_dict: 原始字典
|
|
13
|
+
:param raise_on_conflict: 是否在键冲突时抛出异常,默认为 False
|
|
14
|
+
:return: 翻转后的字典
|
|
15
|
+
"""
|
|
16
|
+
flipped_dict = {}
|
|
17
|
+
for key, value_list in original_dict.items():
|
|
18
|
+
for value in value_list:
|
|
19
|
+
if value in flipped_dict:
|
|
20
|
+
if raise_on_conflict:
|
|
21
|
+
raise ValueError(f"Key conflict detected: {value} already exists in the flipped dictionary.")
|
|
22
|
+
else:
|
|
23
|
+
# 覆盖冲突的键
|
|
24
|
+
logger.warning(
|
|
25
|
+
f"Warning: Key conflict detected for {value}. Overwriting with new value: {key}.")
|
|
26
|
+
flipped_dict[value] = key
|
|
27
|
+
return flipped_dict
|
|
28
|
+
|
|
29
|
+
@classmethod
|
|
30
|
+
def get_temp_gid_dicts(cls,lists,key_name):
|
|
31
|
+
"""
|
|
32
|
+
对 列表字典 分组 组成 分组id的字典
|
|
33
|
+
"""
|
|
34
|
+
dicts = {}
|
|
35
|
+
for group_id, group_tmp in groupby(sorted(lists, key=lambda x: x[key_name]),
|
|
36
|
+
key=lambda x: x[key_name]):
|
|
37
|
+
dicts[group_id] = group_tmp
|
|
38
38
|
return dicts
|