re-common: re_common-10.0.39-py3-none-any.whl → re_common-10.0.41-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +235 -220
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +497 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/tree_processor/__init__.py +0 -0
- re_common/v2/baselibrary/tools/tree_processor/builder.py +25 -0
- re_common/v2/baselibrary/tools/tree_processor/node.py +13 -0
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +94 -77
- re_common/v2/baselibrary/utils/db.py +174 -156
- re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +187 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +312 -271
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/LICENSE +201 -201
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/METADATA +16 -16
- re_common-10.0.41.dist-info/RECORD +252 -0
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/WHEEL +1 -1
- re_common-10.0.39.dist-info/RECORD +0 -248
- {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/top_level.txt +0 -0
re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py

@@ -1,254 +1,254 @@

import gzip
import io
import multiprocessing
import os
import time

import jieba
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors


def create_gzip_joblib(obj):
    # Serialize an object with joblib into an in-memory gzip stream.
    temp_io = io.BytesIO()
    with gzip.GzipFile(fileobj=temp_io, mode='wb') as f:
        joblib.dump(obj, f)
    temp_io.seek(0)
    return temp_io


def get_gzip_joblib(temp_io):
    # Inverse of create_gzip_joblib: decompress the stream and deserialize.
    with gzip.GzipFile(fileobj=temp_io, mode='rb') as f:
        loaded_obj = joblib.load(f)
    return loaded_obj


class JiebaTokenizer:
    def __call__(self, doc):
        return [tok for tok in jieba.cut(doc) if tok.strip()]


class SplitTokenizer:
    def __call__(self, doc):
        return doc.split()


def get_auto_n_jobs(fraction=0.5, max_jobs=16):
    """
    Pick a sensible CPU core count for sklearn's n_jobs parameter.

    Args:
        fraction: fraction of the total cores to use (e.g. 0.5 for half)
        max_jobs: upper bound on the core count (guards against oversubscription)

    Returns:
        A reasonable integer value for n_jobs.
    """
    total_cores = multiprocessing.cpu_count()
    suggested = int(total_cores * fraction)
    n_jobs = min(max(1, suggested), max_jobs)
    return n_jobs


class FitTextMatcher:
    """
    High-performance text matcher.
    Finds similar documents using TF-IDF vectors plus nearest-neighbor search.
    """

    def __init__(self, algorithm='brute', metric='cosine', n_jobs=-1, tokenizer=JiebaTokenizer()):
        """
        Initialize the text matcher.

        Args:
            algorithm: search algorithm ('brute', 'kd_tree', 'ball_tree', 'lshf')
            metric: distance metric ('cosine', 'euclidean', 'manhattan')
            n_jobs: number of parallel jobs (-1 uses all CPU cores)
        """
        self.vectorizer = TfidfVectorizer(
            max_features=None,   # no feature cap; set a limit to trade recall for speed
            tokenizer=tokenizer,
            stop_words=None,     # the built-in 'english' stop words do not apply to Chinese
            ngram_range=(1, 2)   # use unigrams and bigrams
        )

        self.nn = NearestNeighbors(
            algorithm=algorithm,
            metric=metric,
            n_jobs=n_jobs  # parallelism speeds up the search
        )

        self.corpus = None
        self.corpus_size = 0

    def fit(self, corpus):
        """
        Train the matcher.
        """
        self.corpus = corpus
        self.corpus_size = len(corpus)
        print(f"Processing {self.corpus_size} documents...")

        # Vectorize the corpus
        start_time = time.time()
        X = self.vectorizer.fit_transform(corpus)
        vectorization_time = time.time() - start_time
        print(f"TF-IDF vectorization finished in {vectorization_time:.4f}s")
        print(f"Feature dimension: {X.shape[1]}")

        # Fit the nearest-neighbor model
        start_time = time.time()
        self.nn.fit(X)
        training_time = time.time() - start_time
        print(f"Nearest-neighbor model fitted in {training_time:.4f}s")

        return self

    def save(self, path, name):
        """
        Save the model and vectorizer to disk.
        """
        os.makedirs(path, exist_ok=True)
        joblib.dump(self.vectorizer, os.path.join(path, name + "_vectorizer.joblib"))
        joblib.dump(self.nn, os.path.join(path, name + "_nn_model.joblib"))
        joblib.dump(self.corpus, os.path.join(path, name + "_corpus.joblib"))
        print(f"Model saved to {path}")
        return self

    def get_save_bytes_io(self, idx_list=None):
        """
        Serialize the model and vectorizer to in-memory gzip streams.
        """
        if idx_list is None:
            idx_list = []

        result_list = []
        for i in [self.vectorizer, self.nn, self.corpus, idx_list]:
            temp_io = create_gzip_joblib(i)
            result_list.append(temp_io)
        print("Serialized model to bytes")
        return result_list

    def load(self, path, name):
        """
        Load the model from files.
        """
        self.vectorizer = joblib.load(os.path.join(path, name + "_vectorizer.joblib"))
        self.nn = joblib.load(os.path.join(path, name + "_nn_model.joblib"))
        self.corpus = joblib.load(os.path.join(path, name + "_corpus.joblib"))
        self.corpus_size = len(self.corpus)
        print(f"Model loaded from {path}, {self.corpus_size} documents")
        return self

    def load_bytes(self, vec, nn, corpus, idx):
        # Decompress each stream and load the object it holds
        with gzip.GzipFile(fileobj=vec, mode='rb') as gz:
            self.vectorizer = joblib.load(gz)
        with gzip.GzipFile(fileobj=nn, mode='rb') as gz:
            self.nn = joblib.load(gz)
        with gzip.GzipFile(fileobj=corpus, mode='rb') as gz:
            self.corpus = joblib.load(gz)
        with gzip.GzipFile(fileobj=idx, mode='rb') as gz:
            self.idx = joblib.load(gz)
        self.corpus_size = max(len(self.corpus), len(self.idx))
        print(f"Loaded bytes, {self.corpus_size} documents")
        return self

    def search(self, query, n=5, return_scores=True):
        """
        Find documents similar to a query.

        Args:
            query: query text
            n: number of most similar documents to return
            return_scores: whether to also return similarity scores

        Returns:
            Indices of the matched documents, and similarity scores if requested.
        """
        if self.corpus is None:
            raise ValueError("Call fit() to train the model first")

        # Vectorize the query text
        query_vec = self.vectorizer.transform([query])

        # Nearest-neighbor lookup
        start_time = time.time()
        distances, indices = self.nn.kneighbors(query_vec, n_neighbors=n)
        search_time = time.time() - start_time

        # print(f"Search finished in {search_time:.6f}s")

        # Convert distances to similarities (cosine distance = 1 - cosine similarity)
        similarities = 1 - distances

        # Return the result
        if return_scores:
            return indices[0], similarities[0]
        return indices[0]

    def batch_search(self, queries, n=5, return_scores=True):
        """
        Find similar documents for many queries at once.

        Args:
            queries: list of query texts
            n: number of similar documents to return per query
            return_scores: whether to also return similarity scores

        Returns:
            Matched indices and similarities per query, e.g. [(indices1, sims1), (indices2, sims2), ...]
        """
        if self.corpus is None:
            raise ValueError("Call fit() to train the model first")

        start_time = time.time()

        # Vectorize all queries in one pass
        query_vecs = self.vectorizer.transform(queries)

        # Nearest-neighbor lookup
        distances, indices = self.nn.kneighbors(query_vecs, n_neighbors=n)
        search_time = time.time() - start_time
        # print(f"Batch search over {len(queries)} queries finished in {search_time:.4f}s")

        if return_scores:
            similarities = 1 - distances
            return indices, similarities
        return indices

    def explain_match(self, query, index):
        """
        Explain a match: show the keywords of the query and of the matched document.
        """
        # TF-IDF feature names
        feature_names = self.vectorizer.get_feature_names_out()

        # Vectorize the query and the matched document
        query_vec = self.vectorizer.transform([query])
        doc_vec = self.vectorizer.transform([self.corpus[index]])

        # Pair feature names with their weights
        query_data = zip(feature_names, query_vec.toarray()[0])
        doc_data = zip(feature_names, doc_vec.toarray()[0])

        # Keep non-zero features
        query_keywords = [(word, score) for word, score in query_data if score > 0]
        doc_keywords = [(word, score) for word, score in doc_data if score > 0]

        # Sort by weight
        query_keywords.sort(key=lambda x: x[1], reverse=True)
        doc_keywords.sort(key=lambda x: x[1], reverse=True)

        # Print the result
        print(f"\nExplanation for matched document #{index}:")
        print(f"Query keywords: {[word for word, _ in query_keywords[:10]]}")
        print(f"Document keywords: {[word for word, _ in doc_keywords[:10]]}")

        # Compute the shared keywords
        common_keywords = set(word for word, _ in query_keywords[:20]) & set(word for word, _ in doc_keywords[:20])
        print(f"Common keywords: {list(common_keywords)}")

        return common_keywords
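
A minimal usage sketch for the matcher above, assuming jieba, joblib, and scikit-learn are installed. The import path follows the file layout in the manifest at the top of this diff; the corpus and query strings are invented examples, not data shipped with the package:

    from re_common.v2.baselibrary.helpers.search_packge.fit_text_match import (
        FitTextMatcher,
        get_auto_n_jobs,
    )

    # Invented three-document corpus (Chinese, so the default JiebaTokenizer applies)
    corpus = [
        "深度学习在图像识别中的应用",
        "卷积神经网络图像分类研究",
        "水稻种植技术综述",
    ]

    matcher = FitTextMatcher(n_jobs=get_auto_n_jobs()).fit(corpus)

    # Single query: indices into `corpus` plus cosine similarities
    indices, sims = matcher.search("图像识别的深度学习方法", n=2)

    # Batch queries: one row of indices/similarities per query
    all_idx, all_sims = matcher.batch_search(["图像分类方法", "水稻种植"], n=2)

    # Keyword-level explanation of the top hit
    matcher.explain_match("图像识别的深度学习方法", int(indices[0]))

    # In-memory round trip through gzip-compressed joblib streams
    vec_io, nn_io, corpus_io, idx_io = matcher.get_save_bytes_io()
    restored = FitTextMatcher().load_bytes(vec_io, nn_io, corpus_io, idx_io)

Note that the distance-to-similarity conversion (`1 - distances`) is only meaningful with the default metric='cosine'; with 'euclidean' or 'manhattan' the returned scores would not be bounded similarities.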