re_common 10.0.39-py3-none-any.whl → 10.0.41-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +235 -220
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +497 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/tree_processor/__init__.py +0 -0
- re_common/v2/baselibrary/tools/tree_processor/builder.py +25 -0
- re_common/v2/baselibrary/tools/tree_processor/node.py +13 -0
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +94 -77
- re_common/v2/baselibrary/utils/db.py +174 -156
- re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +187 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +312 -271
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/LICENSE +201 -201
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/METADATA +16 -16
- re_common-10.0.41.dist-info/RECORD +252 -0
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/WHEEL +1 -1
- re_common-10.0.39.dist-info/RECORD +0 -248
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/top_level.txt +0 -0

re_common/v2/baselibrary/tools/text_matcher.py
@@ -1,326 +1,326 @@
(The 326 removed lines and the 326 re-added lines of this hunk are identical as rendered; the file content is shown once below.)

import pickle

import jieba
import re
from typing import List, Dict, Tuple, Set, Optional, Union, Hashable, Protocol
from datasketch import MinHash, MinHashLSH

from re_common.v2.baselibrary.decorators.utils import deprecated
from re_common.v2.baselibrary.utils.string_bool import is_single_cjk_char


@deprecated("Use TextMatcherV2 instead.")
class TextMatcher(object):
    def __init__(
        self,
        threshold: float = 0.5,
        num_perm: int = 128,
        is_raw_texts=True,
        stopwords_path: Optional[str] = None,
        user_dict_path: Optional[str] = None,
    ):
        """
        Initialize the text matcher.

        Args:
            threshold: LSH similarity threshold
            num_perm: number of MinHash permutations
            stopwords_path: path to a stopwords file
            user_dict_path: path to a user-defined dictionary
        """
        self.threshold = threshold
        self.num_perm = num_perm
        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
        # self.minhashes: Dict[str, MinHash] = {}
        self.raw_texts: Dict[str, str] = {}
        self.is_raw_texts = is_raw_texts
        self.doc_counter = 0

        # Load stopwords
        self.stopwords: Set[str] = set()
        if stopwords_path:
            self.load_stopwords(stopwords_path)

        # Load the user dictionary
        if user_dict_path:
            jieba.load_userdict(user_dict_path)

    def load_stopwords(self, stopwords_path: str) -> None:
        """Load stopwords."""
        with open(stopwords_path, "r", encoding="utf-8") as f:
            self.stopwords = set(line.strip() for line in f)

    def preprocess_text(self, text: str) -> str:
        """
        Preprocess the text.
        """
        # Convert to lowercase
        text = text.lower()
        # Remove special characters
        text = re.sub(r"[^\w\s\u4e00-\u9fff]", "", text)
        # Collapse extra whitespace
        text = re.sub(r"\s+", " ", text).strip()
        return text

    def tokenize(self, text: str) -> List[str]:
        """
        Tokenize and remove stopwords.
        """
        words = jieba.lcut(text)
        one_char_size = len([i for i in words if len(i) == 1])
        all_size = len(words)
        if all_size != 0 and one_char_size / all_size > 0.6:
            words = [i for i in text.split() if i.strip()]

        # Filter out stopwords and empty tokens
        words = [w for w in words if w not in self.stopwords and w.strip()]
        return words

    def create_minhash(self, words: List[str]) -> MinHash:
        """
        Create a MinHash from the token list.
        """
        minhash = MinHash(num_perm=self.num_perm)
        for word in words:
            minhash.update(word.encode("utf-8"))
        return minhash

    def add_document(self, text: str, doc_id: Optional[str] = None) -> str:
        """
        Add a document to the index.

        Args:
            text: document text
            doc_id: document ID (optional)

        Returns:
            doc_id: document ID
        """
        if doc_id is None:
            doc_id = f"doc_{self.doc_counter}"
            self.doc_counter += 1

        # Preprocess and tokenize
        processed_text = self.preprocess_text(text)
        words = self.tokenize(processed_text)

        # Create the MinHash
        minhash = self.create_minhash(words)
        if self.is_raw_texts:
            # Store the raw text and MinHash
            self.raw_texts[doc_id] = text
            # self.minhashes[doc_id] = minhash

        # Add to the LSH index
        self.lsh.insert(doc_id, minhash)

        return doc_id

    def batch_add_documents(self, texts: Dict[str, str]) -> None:
        """
        Add documents in batch.

        Args:
            texts: a dict of {doc_id: text}
        """
        for doc_id, text in texts.items():
            self.add_document(text, doc_id)

    def create_query_minhash(self, query: str):
        # Preprocess the query text
        processed_query = self.preprocess_text(query)
        query_words = self.tokenize(processed_query)
        # print(query_words)
        query_minhash = self.create_minhash(query_words)
        return query_minhash

    def find_similar(self, query_minhash: MinHash, return_similarities: bool = False) -> Union[
            List[str], List[Tuple[str, float]]]:
        """
        Find similar documents.

        Args:
            query_minhash: MinHash of the query text
            return_similarities: whether to return similarity scores

        Returns:
            [(doc_id, similarity), ...] if return_similarities is True,
            otherwise [doc_id, ...]
        """

        # Use LSH to find the candidate set
        similar_docs = self.lsh.query(query_minhash)

        # if return_similarities:
        #     # Compute the exact Jaccard similarity
        #     results = []
        #     for doc_id in similar_docs:
        #         similarity = query_minhash.jaccard(self.minhashes[doc_id])
        #         results.append((doc_id, similarity))
        #     # Sort by similarity in descending order
        #     return sorted(results, key=lambda x: x[1], reverse=True)

        return similar_docs

    def get_text(self, doc_id: str) -> Optional[str]:
        """Get the raw text."""
        if self.is_raw_texts:
            return self.raw_texts.get(doc_id)
        raise Exception("Raw text storage is not enabled")

    def remove_document(self, doc_id: str) -> bool:
        """
        Remove a document.

        Returns:
            bool: whether the removal succeeded
        """
        # if doc_id not in self.minhashes:
        #     return False

        self.lsh.remove(doc_id)
        # del self.minhashes[doc_id]
        if self.is_raw_texts:
            del self.raw_texts[doc_id]
        return True

    def clear(self) -> None:
        """Clear all data."""
        self.lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
        # self.minhashes.clear()
        self.raw_texts.clear()
        self.doc_counter = 0


# A protocol describing the duck-typed tokenizer behaviour
class TokenizeDuckLike(Protocol):
    def get_words(self, text) -> List:
        pass


class JiebaTokenize(object):

    def __init__(self, stopwords=None):
        self.stopwords = stopwords

    def get_words(self, text) -> List:
        # Fall back to an empty stopword list when none is provided
        stopwords = self.stopwords if self.stopwords is not None else []
        words = jieba.lcut(text)

        # Count single-character tokens, to catch cases where jieba cannot
        # segment the text and breaks it into individual characters.

        # A helper function is used instead of inlining the condition in the
        # comprehension because, in some Spark contexts, an if condition in a
        # comprehension does not short-circuit and the expression is treated
        # as a single operator.
        def is_singel_en(i):
            if len(i) == 1 and not is_single_cjk_char(i):
                return True
            return False

        one_char_size = len([i for i in words if is_singel_en(i)])
        all_size = len(words)
        # If the share of single-character tokens is too high, fall back to
        # whitespace tokenization.
        if all_size != 0 and one_char_size / all_size > 0.6:
            words = [i for i in text.split() if i.strip()]

        # Filter out stopwords and empty tokens
        words = [w for w in words if w not in stopwords and w.strip()]
        return words


class TextMatcherV2(object):

    def __init__(
        self,
        threshold: float = 0.5,
        num_perm: int = 128,
        tdk: TokenizeDuckLike = None
    ):
        """
        Initialize the text matcher.

        Args:
            threshold: LSH similarity threshold
            num_perm: number of MinHash permutations
            tdk: tokenizer implementing TokenizeDuckLike
        """
        self.threshold = threshold
        self.num_perm = num_perm
        self.lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
        self.tdk = tdk

    def add_document(self, doc_id: str, minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None):
        if isinstance(minhash, str):
            minhash = self.str_to_minihash(minhash, tdk)

        self.lsh.insert(doc_id, minhash)

    def batch_add_documents(self, betch_data: Union[list, dict], tdk: TokenizeDuckLike = None):
        def _add_document(minhash_or_str, tdk):
            if isinstance(minhash_or_str, str):
                minhash_or_str = self.str_to_minihash(minhash_or_str, tdk)
            self.add_document(docid, minhash_or_str, tdk)

        if isinstance(betch_data, list):
            # Each element must unpack into a 2-item tuple or list
            for docid, minhash_or_str in betch_data:
                _add_document(minhash_or_str, tdk)
        elif isinstance(betch_data, dict):
            for docid, minhash_or_str in betch_data.items():
                _add_document(minhash_or_str, tdk)
        else:
            raise Exception("Invalid data type")

    def find_similar(self, query_minhash: Union[MinHash, str], tdk: TokenizeDuckLike = None) -> List[Hashable]:
        # Use LSH to find the candidate set
        if isinstance(query_minhash, str):
            query_minhash = self.str_to_minihash(query_minhash, tdk)
        similar_docs = self.lsh.query(query_minhash)
        return similar_docs

    def create_minhash(self, words: List[str], num_perm=None) -> MinHash:
        """
        Create a MinHash from the token list.
        """
        if num_perm is None:
            num_perm = self.num_perm
        minhash = MinHash(num_perm=num_perm)
        for word in words:
            minhash.update(word.encode("utf-8"))
        return minhash

    def create_words(self, text: str, tdk: TokenizeDuckLike = None):
        if tdk is None:
            tdk = self.tdk
        worlds = tdk.get_words(text)
        return worlds

    def str_to_minihash(self, text: str, tdk: TokenizeDuckLike = None):
        if tdk is None:
            tdk = self.tdk
        words = self.create_words(text, tdk)
        minhash = self.create_minhash(words, self.num_perm)
        return minhash

    def minhash_dumps(self, minhash) -> bytes:
        """
        Serialize a MinHash.
        """
        serialized_minhash = pickle.dumps(minhash)
        return serialized_minhash

    def minhash_loads(self, serialized_minhash) -> MinHash:
        """
        Deserialize a MinHash.
        """
        minhash = pickle.loads(serialized_minhash)
        return minhash

    def merge_other_minhashlsh(self, other_minhashlsh: MinHashLSH):
        """
        Merge an LSH index that was built elsewhere into this one.
        """
        self.lsh.merge(other_minhashlsh)
File without changes
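
A minimal usage sketch of the TextMatcherV2 / JiebaTokenize API shown in the diff above. It is not part of the package diff; it assumes jieba and datasketch are installed, and the sample texts and document ids are made up for illustration.

from re_common.v2.baselibrary.tools.text_matcher import JiebaTokenize, TextMatcherV2

# Build a matcher with the jieba-based tokenizer (no stopword list).
matcher = TextMatcherV2(threshold=0.5, num_perm=128, tdk=JiebaTokenize())

# Raw strings are tokenized and converted to MinHash signatures internally.
matcher.batch_add_documents({
    "doc_1": "minhash lsh for near duplicate detection",
    "doc_2": "near duplicate detection with minhash lsh",
    "doc_3": "a completely unrelated sentence about cooking",
})

# Querying with a raw string returns the doc ids of the LSH candidate set.
print(matcher.find_similar("minhash lsh near duplicate detection"))

# Signatures can be pickled for transport (e.g. when built in a separate job)
# and inserted later into another matcher instance.
sig = matcher.str_to_minihash("near duplicate detection with minhash lsh")
blob = matcher.minhash_dumps(sig)
matcher.add_document("doc_4", matcher.minhash_loads(blob))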