re_common-10.0.39-py3-none-any.whl → re_common-10.0.40-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- re_common/baselibrary/__init__.py +4 -4
- re_common/baselibrary/baseabs/__init__.py +6 -6
- re_common/baselibrary/baseabs/baseabs.py +26 -26
- re_common/baselibrary/database/mbuilder.py +132 -132
- re_common/baselibrary/database/moudle.py +93 -93
- re_common/baselibrary/database/msqlite3.py +194 -194
- re_common/baselibrary/database/mysql.py +169 -169
- re_common/baselibrary/database/sql_factory.py +26 -26
- re_common/baselibrary/mthread/MThreadingRun.py +486 -486
- re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
- re_common/baselibrary/mthread/__init__.py +2 -2
- re_common/baselibrary/mthread/mythreading.py +695 -695
- re_common/baselibrary/pakge_other/socks.py +404 -404
- re_common/baselibrary/readconfig/config_factory.py +18 -18
- re_common/baselibrary/readconfig/ini_config.py +317 -317
- re_common/baselibrary/readconfig/toml_config.py +49 -49
- re_common/baselibrary/temporary/envdata.py +36 -36
- re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
- re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
- re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
- re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
- re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
- re_common/baselibrary/tools/contrast_db3.py +123 -123
- re_common/baselibrary/tools/copy_file.py +39 -39
- re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
- re_common/baselibrary/tools/foreachgz.py +39 -39
- re_common/baselibrary/tools/get_attr.py +10 -10
- re_common/baselibrary/tools/image_to_pdf.py +61 -61
- re_common/baselibrary/tools/java_code_deal.py +139 -139
- re_common/baselibrary/tools/javacode.py +79 -79
- re_common/baselibrary/tools/mdb_db3.py +48 -48
- re_common/baselibrary/tools/merge_file.py +171 -171
- re_common/baselibrary/tools/merge_gz_file.py +165 -165
- re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
- re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
- re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
- re_common/baselibrary/tools/mongo_tools.py +50 -50
- re_common/baselibrary/tools/move_file.py +170 -170
- re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
- re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
- re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
- re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
- re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
- re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
- re_common/baselibrary/tools/myparsel.py +104 -104
- re_common/baselibrary/tools/rename_dir_file.py +37 -37
- re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
- re_common/baselibrary/tools/split_line_to_many.py +25 -25
- re_common/baselibrary/tools/stringtodicts.py +33 -33
- re_common/baselibrary/tools/workwechant_bot.py +84 -84
- re_common/baselibrary/utils/baseaiohttp.py +296 -296
- re_common/baselibrary/utils/baseaiomysql.py +87 -87
- re_common/baselibrary/utils/baseallstep.py +191 -191
- re_common/baselibrary/utils/baseavro.py +19 -19
- re_common/baselibrary/utils/baseboto3.py +291 -291
- re_common/baselibrary/utils/basecsv.py +32 -32
- re_common/baselibrary/utils/basedict.py +133 -133
- re_common/baselibrary/utils/basedir.py +241 -241
- re_common/baselibrary/utils/baseencode.py +351 -351
- re_common/baselibrary/utils/baseencoding.py +28 -28
- re_common/baselibrary/utils/baseesdsl.py +86 -86
- re_common/baselibrary/utils/baseexcel.py +264 -264
- re_common/baselibrary/utils/baseexcept.py +109 -109
- re_common/baselibrary/utils/basefile.py +654 -654
- re_common/baselibrary/utils/baseftp.py +214 -214
- re_common/baselibrary/utils/basegzip.py +60 -60
- re_common/baselibrary/utils/basehdfs.py +135 -135
- re_common/baselibrary/utils/basehttpx.py +268 -268
- re_common/baselibrary/utils/baseip.py +87 -87
- re_common/baselibrary/utils/basejson.py +2 -2
- re_common/baselibrary/utils/baselist.py +32 -32
- re_common/baselibrary/utils/basemotor.py +190 -190
- re_common/baselibrary/utils/basemssql.py +98 -98
- re_common/baselibrary/utils/baseodbc.py +113 -113
- re_common/baselibrary/utils/basepandas.py +302 -302
- re_common/baselibrary/utils/basepeewee.py +11 -11
- re_common/baselibrary/utils/basepika.py +180 -180
- re_common/baselibrary/utils/basepydash.py +143 -143
- re_common/baselibrary/utils/basepymongo.py +230 -230
- re_common/baselibrary/utils/basequeue.py +22 -22
- re_common/baselibrary/utils/baserar.py +57 -57
- re_common/baselibrary/utils/baserequest.py +279 -279
- re_common/baselibrary/utils/baseset.py +8 -8
- re_common/baselibrary/utils/basesmb.py +403 -403
- re_common/baselibrary/utils/basestring.py +382 -382
- re_common/baselibrary/utils/basetime.py +320 -320
- re_common/baselibrary/utils/baseurl.py +121 -121
- re_common/baselibrary/utils/basezip.py +57 -57
- re_common/baselibrary/utils/core/__init__.py +7 -7
- re_common/baselibrary/utils/core/bottomutils.py +18 -18
- re_common/baselibrary/utils/core/mdeprecated.py +327 -327
- re_common/baselibrary/utils/core/mlamada.py +16 -16
- re_common/baselibrary/utils/core/msginfo.py +25 -25
- re_common/baselibrary/utils/core/requests_core.py +103 -103
- re_common/baselibrary/utils/fateadm.py +429 -429
- re_common/baselibrary/utils/importfun.py +123 -123
- re_common/baselibrary/utils/mfaker.py +57 -57
- re_common/baselibrary/utils/my_abc/__init__.py +3 -3
- re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
- re_common/baselibrary/utils/mylogger.py +414 -414
- re_common/baselibrary/utils/myredisclient.py +861 -861
- re_common/baselibrary/utils/pipupgrade.py +21 -21
- re_common/baselibrary/utils/ringlist.py +85 -85
- re_common/baselibrary/utils/version_compare.py +36 -36
- re_common/baselibrary/utils/ydmhttp.py +126 -126
- re_common/facade/lazy_import.py +11 -11
- re_common/facade/loggerfacade.py +25 -25
- re_common/facade/mysqlfacade.py +467 -467
- re_common/facade/now.py +31 -31
- re_common/facade/sqlite3facade.py +257 -257
- re_common/facade/use/mq_use_facade.py +83 -83
- re_common/facade/use/proxy_use_facade.py +19 -19
- re_common/libtest/base_dict_test.py +19 -19
- re_common/libtest/baseavro_test.py +13 -13
- re_common/libtest/basefile_test.py +14 -14
- re_common/libtest/basemssql_test.py +77 -77
- re_common/libtest/baseodbc_test.py +7 -7
- re_common/libtest/basepandas_test.py +38 -38
- re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
- re_common/libtest/get_attr_test/settings.py +54 -54
- re_common/libtest/idencode_test.py +53 -53
- re_common/libtest/iniconfig_test.py +35 -35
- re_common/libtest/ip_test.py +34 -34
- re_common/libtest/merge_file_test.py +20 -20
- re_common/libtest/mfaker_test.py +8 -8
- re_common/libtest/mm3_test.py +31 -31
- re_common/libtest/mylogger_test.py +88 -88
- re_common/libtest/myparsel_test.py +27 -27
- re_common/libtest/mysql_test.py +151 -151
- re_common/libtest/pymongo_test.py +21 -21
- re_common/libtest/split_test.py +11 -11
- re_common/libtest/sqlite3_merge_test.py +5 -5
- re_common/libtest/sqlite3_test.py +34 -34
- re_common/libtest/tomlconfig_test.py +30 -30
- re_common/libtest/use_tools_test/__init__.py +2 -2
- re_common/libtest/user/__init__.py +4 -4
- re_common/studio/__init__.py +4 -4
- re_common/studio/assignment_expressions.py +36 -36
- re_common/studio/mydash/test1.py +18 -18
- re_common/studio/pydashstudio/first.py +9 -9
- re_common/studio/streamlitstudio/first_app.py +65 -65
- re_common/studio/streamlitstudio/uber_pickups.py +23 -23
- re_common/studio/test.py +18 -18
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
- re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
- re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
- re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
- re_common/v2/baselibrary/decorators/utils.py +59 -59
- re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
- re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
- re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
- re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
- re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
- re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
- re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
- re_common/v2/baselibrary/tools/concurrency.py +35 -35
- re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
- re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
- re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
- re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
- re_common/v2/baselibrary/tools/dict_tools.py +44 -44
- re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
- re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
- re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
- re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
- re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
- re_common/v2/baselibrary/tools/list_tools.py +69 -69
- re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
- re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
- re_common/v2/baselibrary/tools/text_matcher.py +326 -326
- re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
- re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
- re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
- re_common/v2/baselibrary/utils/author_smi.py +361 -361
- re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
- re_common/v2/baselibrary/utils/basedict.py +37 -37
- re_common/v2/baselibrary/utils/basehdfs.py +163 -163
- re_common/v2/baselibrary/utils/basepika.py +180 -180
- re_common/v2/baselibrary/utils/basetime.py +77 -77
- re_common/v2/baselibrary/utils/db.py +156 -156
- re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
- re_common/v2/baselibrary/utils/json_cls.py +16 -16
- re_common/v2/baselibrary/utils/mq.py +83 -83
- re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
- re_common/v2/baselibrary/utils/string_bool.py +186 -186
- re_common/v2/baselibrary/utils/string_clear.py +246 -246
- re_common/v2/baselibrary/utils/string_smi.py +18 -18
- re_common/v2/baselibrary/utils/stringutils.py +271 -271
- re_common/vip/base_step_process.py +11 -11
- re_common/vip/baseencodeid.py +90 -90
- re_common/vip/changetaskname.py +28 -28
- re_common/vip/core_var.py +24 -24
- re_common/vip/mmh3Hash.py +89 -89
- re_common/vip/proxy/allproxys.py +127 -127
- re_common/vip/proxy/allproxys_thread.py +159 -159
- re_common/vip/proxy/cnki_proxy.py +153 -153
- re_common/vip/proxy/kuaidaili.py +87 -87
- re_common/vip/proxy/proxy_all.py +113 -113
- re_common/vip/proxy/update_kuaidaili_0.py +42 -42
- re_common/vip/proxy/wanfang_proxy.py +152 -152
- re_common/vip/proxy/wp_proxy_all.py +181 -181
- re_common/vip/read_rawid_to_txt.py +91 -91
- re_common/vip/title/__init__.py +5 -5
- re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
- re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
- re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
- re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
- re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
- re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
- re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
- re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
- re_common/vip/title/transform/__init__.py +10 -10
- {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/LICENSE +201 -201
- {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/METADATA +24 -16
- re_common-10.0.40.dist-info/RECORD +249 -0
- {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/WHEEL +1 -1
- re_common-10.0.39.dist-info/RECORD +0 -248
- {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/top_level.txt +0 -0
@@ -1,261 +1,261 @@

Both sides of this hunk are identical: all 260 lines are removed and re-added with the same content (line 261 is shared context), so the file is shown once below.

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
import time
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

class TextMatcher:
    """
    High-performance text matcher.
    Finds similar documents using TF-IDF plus nearest-neighbor search.
    """

    def __init__(self, algorithm='brute', metric='cosine', n_jobs=-1):
        """
        Initialize the text matcher.

        Args:
            algorithm: search algorithm ('auto', 'brute', 'kd_tree', 'ball_tree');
                sparse TF-IDF input with the cosine metric requires 'brute' (or 'auto')
            metric: distance metric ('cosine', 'euclidean', 'manhattan')
            n_jobs: number of parallel jobs (-1 uses all CPU cores)
        """
        self.vectorizer = TfidfVectorizer(
            max_features=10000,    # cap the feature count for performance
            stop_words='english',  # drop English stop words
            ngram_range=(1, 2)     # use unigrams and bigrams
        )

        self.nn = NearestNeighbors(
            algorithm=algorithm,
            metric=metric,
            n_jobs=n_jobs  # parallelize the search
        )

        self.corpus = None
        self.corpus_size = 0

    def fit(self, corpus):
        """
        Fit the matcher on a corpus.
        """
        self.corpus = corpus
        self.corpus_size = len(corpus)
        print(f"Processing {self.corpus_size} documents...")

        # Vectorize the text
        start_time = time.time()
        X = self.vectorizer.fit_transform(corpus)
        vectorization_time = time.time() - start_time
        print(f"TF-IDF vectorization finished in {vectorization_time:.4f}s")
        print(f"Feature dimensions: {X.shape[1]}")

        # Fit the nearest-neighbor model
        start_time = time.time()
        self.nn.fit(X)
        training_time = time.time() - start_time
        print(f"Nearest-neighbor model fitted in {training_time:.4f}s")

        return self

    def search(self, query, n=5, return_scores=True):
        """
        Find similar documents.

        Args:
            query: query text
            n: number of most-similar documents to return
            return_scores: whether to return similarity scores

        Returns:
            Indices of the matched documents and their similarity scores.
        """
        if self.corpus is None:
            raise ValueError("Call fit() before searching")

        # Vectorize the query text
        query_vec = self.vectorizer.transform([query])

        # Find the nearest neighbors
        start_time = time.time()
        distances, indices = self.nn.kneighbors(query_vec, n_neighbors=n)
        search_time = time.time() - start_time

        print(f"Search finished in {search_time:.6f}s")

        # Convert distances to similarities (cosine distance = 1 - cosine similarity)
        similarities = 1 - distances

        # Return the results
        if return_scores:
            return indices[0], similarities[0]
        return indices[0]

    def explain_match(self, query, index):
        """
        Explain a match: show the keywords of the query and of the matched document.
        """
        # Get the TF-IDF feature names
        feature_names = self.vectorizer.get_feature_names_out()

        # Vectorize the query and the matched document
        query_vec = self.vectorizer.transform([query])
        doc_vec = self.vectorizer.transform([self.corpus[index]])

        # Pair features with their weights
        query_data = zip(feature_names, query_vec.toarray()[0])
        doc_data = zip(feature_names, doc_vec.toarray()[0])

        # Keep the non-zero features
        query_keywords = [(word, score) for word, score in query_data if score > 0]
        doc_keywords = [(word, score) for word, score in doc_data if score > 0]

        # Sort by importance
        query_keywords.sort(key=lambda x: x[1], reverse=True)
        doc_keywords.sort(key=lambda x: x[1], reverse=True)

        # Print the results
        print(f"\nExplanation for match #{index}:")
        print(f"Query keywords: {[word for word, _ in query_keywords[:10]]}")
        print(f"Document keywords: {[word for word, _ in doc_keywords[:10]]}")

        # Compute the shared keywords
        common_keywords = set(word for word, _ in query_keywords[:20]) & set(word for word, _ in doc_keywords[:20])
        print(f"Shared keywords: {list(common_keywords)}")

        return common_keywords

# ======================
# Demo
# ======================

if __name__ == "__main__":
    # 1. Prepare the document corpus (load from files/databases in real use)
    corpus = [
        "机器学习是人工智能的一个分支,专注于开发算法让计算机从数据中学习",
        "深度学习是机器学习的一个子领域,使用多层神经网络处理复杂模式",
        "自然语言处理(NLP)使计算机能够理解、解释和生成人类语言",
        "计算机视觉关注如何让计算机从图像和视频中获得高层次的理解",
        "强化学习是一种机器学习方法,智能体通过与环境互动学习最优行为策略",
        "监督学习使用标记数据训练模型,无监督学习则处理未标记数据",
        "神经网络是受人脑启发的计算模型,由相互连接的节点层组成",
        "卷积神经网络(CNN)特别适合处理图像识别任务",
        "循环神经网络(RNN)设计用于处理序列数据,如文本和时间序列",
        "Transformer模型通过自注意力机制处理序列数据,成为NLP的主流架构",
        "生成对抗网络(GAN)由生成器和判别器组成,用于生成新数据样本",
        "迁移学习允许将在一个任务上学到的知识应用到另一个相关任务",
        "数据挖掘是从大型数据集中发现模式、关联和异常的过程",
        "特征工程是创建更好的输入特征以提高模型性能的过程",
        "过拟合发生在模型过于复杂,过度记忆训练数据而泛化能力差",
        "正则化技术如L1/L2正则化用于防止过拟合",
        "梯度下降是优化神经网络权重的主要算法",
        "反向传播是训练神经网络的关键算法,用于计算梯度",
        "激活函数如ReLU引入非线性,使神经网络能够学习复杂模式",
        "批量归一化通过标准化层输入加速训练并提高稳定性"
    ]

    # 2. Create the text matcher
    print("=" * 50)
    print("Creating the text matcher")
    print("=" * 50)
    matcher = TextMatcher(
        algorithm='brute',  # brute-force search is fast enough for small corpora
        n_jobs=-1           # use all CPU cores
    )

    # 3. Fit the model
    matcher.fit(corpus)

    # 4. Run a query
    print("\n" + "=" * 50)
    print("Running query: '神经网络在人工智能中的应用'")
    print("=" * 50)
    query = "神经网络在人工智能中的应用"
    indices, similarities = matcher.search(query, n=3)

    # 5. Show the results
    print("\nMost similar documents:")
    for rank, (idx, sim) in enumerate(zip(indices, similarities)):
        print(f"\nTop {rank + 1} [similarity: {sim:.4f}]:")
        print(f"Document #{idx}: {corpus[idx]}")

        # Explain the match
        matcher.explain_match(query, idx)

    # 6. Performance test (optional)
    print("\n" + "=" * 50)
    print("Performance test")
    print("=" * 50)

    # Measure performance for different corpus sizes
    corpus_sizes = [100, 500, 1000, 5000]
    times = []

    for size in corpus_sizes:
        # Build a larger corpus by repetition
        large_corpus = corpus * (size // len(corpus) + 1)
        large_corpus = large_corpus[:size]

        # Create a fresh matcher
        test_matcher = TextMatcher(algorithm='brute', n_jobs=-1)

        # Measure training time
        start_time = time.time()
        test_matcher.fit(large_corpus)
        train_time = time.time() - start_time

        # Measure query time
        start_time = time.time()
        test_matcher.search(query, n=5)
        search_time = time.time() - start_time

        times.append((size, train_time, search_time))
        print(f"Corpus size: {size} | training time: {train_time:.4f}s | query time: {search_time:.6f}s")

    # Visualize the performance results
    sizes, train_times, search_times = zip(*times)

    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    plt.plot(sizes, train_times, 'o-')
    plt.title('Training time vs corpus size')
    plt.xlabel('Number of documents')
    plt.ylabel('Time (s)')
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(sizes, search_times, 'o-')
    plt.title('Query time vs corpus size')
    plt.xlabel('Number of documents')
    plt.ylabel('Time (s)')
    plt.grid(True)

    plt.tight_layout()
    plt.savefig('performance.png')
    print("\nPerformance chart saved as 'performance.png'")

    # 7. Similarity-matrix visualization (optional)
    print("\n" + "=" * 50)
    print("Document similarity matrix")
    print("=" * 50)

    # Compute TF-IDF vectors for every document
    vectors = matcher.vectorizer.transform(corpus)

    # Compute the cosine-similarity matrix
    sim_matrix = cosine_similarity(vectors)

    # Build a DataFrame for visualization
    df = pd.DataFrame(sim_matrix,
                      columns=[f"Doc{i}" for i in range(len(corpus))],
                      index=[f"Doc{i}" for i in range(len(corpus))])

    plt.figure(figsize=(12, 10))
    sns.heatmap(df, cmap="YlGnBu", annot=False)
    plt.title("Document similarity matrix")
    plt.tight_layout()
    plt.savefig('similarity_matrix.png')
    print("Similarity matrix saved as 'similarity_matrix.png'")
```
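One caveat for the demo above: `TfidfVectorizer` tokenizes on word boundaries and `stop_words='english'` has no effect on Chinese, so unsegmented Chinese sentences collapse into a handful of long tokens. A minimal sketch of pre-segmentation, assuming the third-party `jieba` tokenizer is installed; nothing here is part of `re_common` itself:

```python
# Sketch: segment Chinese text before TF-IDF so unigrams/bigrams are real words.
# Assumes `pip install jieba scikit-learn`; jieba is illustrative, not a re_common dependency.
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    "机器学习是人工智能的一个分支",
    "深度学习是机器学习的一个子领域",
]

# Join segmented tokens with spaces so the default tokenizer can split them.
segmented = [" ".join(jieba.lcut(doc)) for doc in docs]

# The default token_pattern drops single-character tokens; widen it to keep them.
vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", ngram_range=(1, 2))
X = vectorizer.fit_transform(segmented)
print(vectorizer.get_feature_names_out())
```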

@@ -1,2 +1,2 @@

Line 1 is likewise removed and re-added unchanged (line 2 is shared context):

```python
aa = "肿瘤 学 研究 进展"
print(str.split(aa))
```
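For reference, `str.split(aa)` is the unbound form of `aa.split()`: on this pre-segmented title it prints `['肿瘤', '学', '研究', '进展']`.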