re-common 10.0.39__py3-none-any.whl → 10.0.40__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (218)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +219 -219
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +508 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  172. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  173. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  174. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  175. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  176. re_common/v2/baselibrary/utils/basedict.py +37 -37
  177. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  178. re_common/v2/baselibrary/utils/basepika.py +180 -180
  179. re_common/v2/baselibrary/utils/basetime.py +77 -77
  180. re_common/v2/baselibrary/utils/db.py +156 -156
  181. re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
  182. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  183. re_common/v2/baselibrary/utils/mq.py +83 -83
  184. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  185. re_common/v2/baselibrary/utils/string_bool.py +186 -186
  186. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  187. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  188. re_common/v2/baselibrary/utils/stringutils.py +271 -271
  189. re_common/vip/base_step_process.py +11 -11
  190. re_common/vip/baseencodeid.py +90 -90
  191. re_common/vip/changetaskname.py +28 -28
  192. re_common/vip/core_var.py +24 -24
  193. re_common/vip/mmh3Hash.py +89 -89
  194. re_common/vip/proxy/allproxys.py +127 -127
  195. re_common/vip/proxy/allproxys_thread.py +159 -159
  196. re_common/vip/proxy/cnki_proxy.py +153 -153
  197. re_common/vip/proxy/kuaidaili.py +87 -87
  198. re_common/vip/proxy/proxy_all.py +113 -113
  199. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  200. re_common/vip/proxy/wanfang_proxy.py +152 -152
  201. re_common/vip/proxy/wp_proxy_all.py +181 -181
  202. re_common/vip/read_rawid_to_txt.py +91 -91
  203. re_common/vip/title/__init__.py +5 -5
  204. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  205. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  206. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  207. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  208. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  209. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  210. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  211. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  212. re_common/vip/title/transform/__init__.py +10 -10
  213. {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/LICENSE +201 -201
  214. {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/METADATA +24 -16
  215. re_common-10.0.40.dist-info/RECORD +249 -0
  216. {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/WHEEL +1 -1
  217. re_common-10.0.39.dist-info/RECORD +0 -248
  218. {re_common-10.0.39.dist-info → re_common-10.0.40.dist-info}/top_level.txt +0 -0
@@ -1,261 +1,261 @@
1
- import numpy as np
2
- from sklearn.feature_extraction.text import TfidfVectorizer
3
- from sklearn.neighbors import NearestNeighbors
4
- import time
5
- import matplotlib.pyplot as plt
6
- import seaborn as sns
7
- import pandas as pd
8
- from sklearn.metrics.pairwise import cosine_similarity
9
-
10
- class TextMatcher:
11
- """
12
- 高性能文本匹配器
13
- 基于 TF-IDF + 最近邻搜索实现相似文献查找
14
- """
15
-
16
- def __init__(self, algorithm='brute', metric='cosine', n_jobs=-1):
17
- """
18
- 初始化文本匹配器
19
-
20
- 参数:
21
- algorithm: 搜索算法 ('brute', 'kd_tree', 'ball_tree', 'lshf')
22
- metric: 距离度量 ('cosine', 'euclidean', 'manhattan')
23
- n_jobs: 并行作业数 (-1 表示使用所有CPU核心)
24
- """
25
- self.vectorizer = TfidfVectorizer(
26
- max_features=10000, # 限制特征数量以提高性能
27
- stop_words='english', # 移除英文停用词
28
- ngram_range=(1, 2) # 使用单字和双字组合
29
- )
30
-
31
- self.nn = NearestNeighbors(
32
- algorithm=algorithm,
33
- metric=metric,
34
- n_jobs=n_jobs # 并行处理加速搜索
35
- )
36
-
37
- self.corpus = None
38
- self.corpus_size = 0
39
-
40
- def fit(self, corpus):
41
- """
42
- 训练匹配器
43
- """
44
- self.corpus = corpus
45
- self.corpus_size = len(corpus)
46
- print(f"处理 {self.corpus_size} 篇文献...")
47
-
48
- # 向量化文本
49
- start_time = time.time()
50
- X = self.vectorizer.fit_transform(corpus)
51
- vectorization_time = time.time() - start_time
52
- print(f"TF-IDF 向量化完成, 耗时: {vectorization_time:.4f}秒")
53
- print(f"特征维度: {X.shape[1]}")
54
-
55
- # 训练最近邻模型
56
- start_time = time.time()
57
- self.nn.fit(X)
58
- training_time = time.time() - start_time
59
- print(f"最近邻模型训练完成, 耗时: {training_time:.4f}秒")
60
-
61
- return self
62
-
63
- def search(self, query, n=5, return_scores=True):
64
- """
65
- 查找相似文献
66
-
67
- 参数:
68
- query: 查询文本
69
- n: 返回最相似文献的数量
70
- return_scores: 是否返回相似度分数
71
-
72
- 返回:
73
- 匹配的文献索引和相似度分数
74
- """
75
- if self.corpus is None:
76
- raise ValueError("请先使用 fit() 方法训练模型")
77
-
78
- # 向量化查询文本
79
- query_vec = self.vectorizer.transform([query])
80
-
81
- # 查找最近邻
82
- start_time = time.time()
83
- distances, indices = self.nn.kneighbors(query_vec, n_neighbors=n)
84
- search_time = time.time() - start_time
85
-
86
- print(f"搜索完成, 耗时: {search_time:.6f}秒")
87
-
88
- # 将距离转换为相似度 (余弦距离 = 1 - 余弦相似度)
89
- similarities = 1 - distances
90
-
91
- # 返回结果
92
- if return_scores:
93
- return indices[0], similarities[0]
94
- return indices[0]
95
-
96
- def explain_match(self, query, index):
97
- """
98
- 解释匹配结果 - 显示查询和匹配文献的关键词
99
- """
100
- # 获取TF-IDF特征名
101
- feature_names = self.vectorizer.get_feature_names_out()
102
-
103
- # 向量化查询和匹配文献
104
- query_vec = self.vectorizer.transform([query])
105
- doc_vec = self.vectorizer.transform([self.corpus[index]])
106
-
107
- # 获取重要特征
108
- query_data = zip(feature_names, query_vec.toarray()[0])
109
- doc_data = zip(feature_names, doc_vec.toarray()[0])
110
-
111
- # 筛选非零特征
112
- query_keywords = [(word, score) for word, score in query_data if score > 0]
113
- doc_keywords = [(word, score) for word, score in doc_data if score > 0]
114
-
115
- # 按重要性排序
116
- query_keywords.sort(key=lambda x: x[1], reverse=True)
117
- doc_keywords.sort(key=lambda x: x[1], reverse=True)
118
-
119
- # 打印结果
120
- print(f"\n匹配文献 #{index} 解释:")
121
- print(f"查询关键词: {[word for word, _ in query_keywords[:10]]}")
122
- print(f"文献关键词: {[word for word, _ in doc_keywords[:10]]}")
123
-
124
- # 计算共同关键词
125
- common_keywords = set([word for word, _ in query_keywords[:20]]) & set([word for word, _ in doc_keywords[:20]])
126
- print(f"共同关键词: {list(common_keywords)}")
127
-
128
- return common_keywords
129
-
130
- # ======================
131
- # 演示使用
132
- # ======================
133
-
134
- if __name__ == "__main__":
135
- # 1. 准备文献库 (实际应用中可从文件/数据库加载)
136
- corpus = [
137
- "机器学习是人工智能的一个分支,专注于开发算法让计算机从数据中学习",
138
- "深度学习是机器学习的一个子领域,使用多层神经网络处理复杂模式",
139
- "自然语言处理(NLP)使计算机能够理解、解释和生成人类语言",
140
- "计算机视觉关注如何让计算机从图像和视频中获得高层次的理解",
141
- "强化学习是一种机器学习方法,智能体通过与环境互动学习最优行为策略",
142
- "监督学习使用标记数据训练模型,无监督学习则处理未标记数据",
143
- "神经网络是受人脑启发的计算模型,由相互连接的节点层组成",
144
- "卷积神经网络(CNN)特别适合处理图像识别任务",
145
- "循环神经网络(RNN)设计用于处理序列数据,如文本和时间序列",
146
- "Transformer模型通过自注意力机制处理序列数据,成为NLP的主流架构",
147
- "生成对抗网络(GAN)由生成器和判别器组成,用于生成新数据样本",
148
- "迁移学习允许将在一个任务上学到的知识应用到另一个相关任务",
149
- "数据挖掘是从大型数据集中发现模式、关联和异常的过程",
150
- "特征工程是创建更好的输入特征以提高模型性能的过程",
151
- "过拟合发生在模型过于复杂,过度记忆训练数据而泛化能力差",
152
- "正则化技术如L1/L2正则化用于防止过拟合",
153
- "梯度下降是优化神经网络权重的主要算法",
154
- "反向传播是训练神经网络的关键算法,用于计算梯度",
155
- "激活函数如ReLU引入非线性,使神经网络能够学习复杂模式",
156
- "批量归一化通过标准化层输入加速训练并提高稳定性"
157
- ]
158
-
159
- # 2. 创建文本匹配器
160
- print("="*50)
161
- print("创建文本匹配器")
162
- print("="*50)
163
- matcher = TextMatcher(
164
- algorithm='brute', # 对于小数据集,暴力搜索足够快
165
- n_jobs=-1 # 使用所有CPU核心
166
- )
167
-
168
- # 3. 训练模型
169
- matcher.fit(corpus)
170
-
171
- # 4. 执行查询
172
- print("\n" + "="*50)
173
- print("执行查询: '神经网络在人工智能中的应用'")
174
- print("="*50)
175
- query = "神经网络在人工智能中的应用"
176
- indices, similarities = matcher.search(query, n=3)
177
-
178
- # 5. 显示结果
179
- print("\n最相似的文献:")
180
- for rank, (idx, sim) in enumerate(zip(indices, similarities)):
181
- print(f"\nTop {rank+1} [相似度: {sim:.4f}]:")
182
- print(f"文献 #{idx}: {corpus[idx]}")
183
-
184
- # 解释匹配
185
- matcher.explain_match(query, idx)
186
-
187
- # 6. 性能测试 (可选)
188
- print("\n" + "="*50)
189
- print("性能测试")
190
- print("="*50)
191
-
192
- # 测试不同文献库大小的性能
193
- corpus_sizes = [100, 500, 1000, 5000]
194
- times = []
195
-
196
- for size in corpus_sizes:
197
- # 创建更大的文献库
198
- large_corpus = corpus * (size // len(corpus) + 1)
199
- large_corpus = large_corpus[:size]
200
-
201
- # 创建新的匹配器
202
- test_matcher = TextMatcher(algorithm='brute', n_jobs=-1)
203
-
204
- # 测量训练时间
205
- start_time = time.time()
206
- test_matcher.fit(large_corpus)
207
- train_time = time.time() - start_time
208
-
209
- # 测量查询时间
210
- start_time = time.time()
211
- test_matcher.search(query, n=5)
212
- search_time = time.time() - start_time
213
-
214
- times.append((size, train_time, search_time))
215
- print(f"文献库大小: {size} | 训练时间: {train_time:.4f}s | 查询时间: {search_time:.6f}s")
216
-
217
- # 可视化性能结果
218
- sizes, train_times, search_times = zip(*times)
219
-
220
- plt.figure(figsize=(12, 6))
221
-
222
- plt.subplot(1, 2, 1)
223
- plt.plot(sizes, train_times, 'o-')
224
- plt.title('训练时间 vs 文献库大小')
225
- plt.xlabel('文献数量')
226
- plt.ylabel('时间 (秒)')
227
- plt.grid(True)
228
-
229
- plt.subplot(1, 2, 2)
230
- plt.plot(sizes, search_times, 'o-')
231
- plt.title('查询时间 vs 文献库大小')
232
- plt.xlabel('文献数量')
233
- plt.ylabel('时间 (秒)')
234
- plt.grid(True)
235
-
236
- plt.tight_layout()
237
- plt.savefig('performance.png')
238
- print("\n性能图表已保存为 'performance.png'")
239
-
240
- # 7. 相似度矩阵可视化 (可选)
241
- print("\n" + "="*50)
242
- print("文献相似度矩阵")
243
- print("="*50)
244
-
245
- # 计算所有文献的TF-IDF向量
246
- vectors = matcher.vectorizer.transform(corpus)
247
-
248
- # 计算余弦相似度矩阵
249
- sim_matrix = cosine_similarity(vectors)
250
-
251
- # 创建DataFrame用于可视化
252
- df = pd.DataFrame(sim_matrix,
253
- columns=[f"Doc{i}" for i in range(len(corpus))],
254
- index=[f"Doc{i}" for i in range(len(corpus))])
255
-
256
- plt.figure(figsize=(12, 10))
257
- sns.heatmap(df, cmap="YlGnBu", annot=False)
258
- plt.title("文献相似度矩阵")
259
- plt.tight_layout()
260
- plt.savefig('similarity_matrix.png')
1
+ import numpy as np
2
+ from sklearn.feature_extraction.text import TfidfVectorizer
3
+ from sklearn.neighbors import NearestNeighbors
4
+ import time
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+ import pandas as pd
8
+ from sklearn.metrics.pairwise import cosine_similarity
9
+
10
+ class TextMatcher:
11
+ """
12
+ 高性能文本匹配器
13
+ 基于 TF-IDF + 最近邻搜索实现相似文献查找
14
+ """
15
+
16
+ def __init__(self, algorithm='brute', metric='cosine', n_jobs=-1):
17
+ """
18
+ 初始化文本匹配器
19
+
20
+ 参数:
21
+ algorithm: 搜索算法 ('brute', 'kd_tree', 'ball_tree', 'lshf')
22
+ metric: 距离度量 ('cosine', 'euclidean', 'manhattan')
23
+ n_jobs: 并行作业数 (-1 表示使用所有CPU核心)
24
+ """
25
+ self.vectorizer = TfidfVectorizer(
26
+ max_features=10000, # 限制特征数量以提高性能
27
+ stop_words='english', # 移除英文停用词
28
+ ngram_range=(1, 2) # 使用单字和双字组合
29
+ )
30
+
31
+ self.nn = NearestNeighbors(
32
+ algorithm=algorithm,
33
+ metric=metric,
34
+ n_jobs=n_jobs # 并行处理加速搜索
35
+ )
36
+
37
+ self.corpus = None
38
+ self.corpus_size = 0
39
+
40
+ def fit(self, corpus):
41
+ """
42
+ 训练匹配器
43
+ """
44
+ self.corpus = corpus
45
+ self.corpus_size = len(corpus)
46
+ print(f"处理 {self.corpus_size} 篇文献...")
47
+
48
+ # 向量化文本
49
+ start_time = time.time()
50
+ X = self.vectorizer.fit_transform(corpus)
51
+ vectorization_time = time.time() - start_time
52
+ print(f"TF-IDF 向量化完成, 耗时: {vectorization_time:.4f}秒")
53
+ print(f"特征维度: {X.shape[1]}")
54
+
55
+ # 训练最近邻模型
56
+ start_time = time.time()
57
+ self.nn.fit(X)
58
+ training_time = time.time() - start_time
59
+ print(f"最近邻模型训练完成, 耗时: {training_time:.4f}秒")
60
+
61
+ return self
62
+
63
+ def search(self, query, n=5, return_scores=True):
64
+ """
65
+ 查找相似文献
66
+
67
+ 参数:
68
+ query: 查询文本
69
+ n: 返回最相似文献的数量
70
+ return_scores: 是否返回相似度分数
71
+
72
+ 返回:
73
+ 匹配的文献索引和相似度分数
74
+ """
75
+ if self.corpus is None:
76
+ raise ValueError("请先使用 fit() 方法训练模型")
77
+
78
+ # 向量化查询文本
79
+ query_vec = self.vectorizer.transform([query])
80
+
81
+ # 查找最近邻
82
+ start_time = time.time()
83
+ distances, indices = self.nn.kneighbors(query_vec, n_neighbors=n)
84
+ search_time = time.time() - start_time
85
+
86
+ print(f"搜索完成, 耗时: {search_time:.6f}秒")
87
+
88
+ # 将距离转换为相似度 (余弦距离 = 1 - 余弦相似度)
89
+ similarities = 1 - distances
90
+
91
+ # 返回结果
92
+ if return_scores:
93
+ return indices[0], similarities[0]
94
+ return indices[0]
95
+
96
+ def explain_match(self, query, index):
97
+ """
98
+ 解释匹配结果 - 显示查询和匹配文献的关键词
99
+ """
100
+ # 获取TF-IDF特征名
101
+ feature_names = self.vectorizer.get_feature_names_out()
102
+
103
+ # 向量化查询和匹配文献
104
+ query_vec = self.vectorizer.transform([query])
105
+ doc_vec = self.vectorizer.transform([self.corpus[index]])
106
+
107
+ # 获取重要特征
108
+ query_data = zip(feature_names, query_vec.toarray()[0])
109
+ doc_data = zip(feature_names, doc_vec.toarray()[0])
110
+
111
+ # 筛选非零特征
112
+ query_keywords = [(word, score) for word, score in query_data if score > 0]
113
+ doc_keywords = [(word, score) for word, score in doc_data if score > 0]
114
+
115
+ # 按重要性排序
116
+ query_keywords.sort(key=lambda x: x[1], reverse=True)
117
+ doc_keywords.sort(key=lambda x: x[1], reverse=True)
118
+
119
+ # 打印结果
120
+ print(f"\n匹配文献 #{index} 解释:")
121
+ print(f"查询关键词: {[word for word, _ in query_keywords[:10]]}")
122
+ print(f"文献关键词: {[word for word, _ in doc_keywords[:10]]}")
123
+
124
+ # 计算共同关键词
125
+ common_keywords = set([word for word, _ in query_keywords[:20]]) & set([word for word, _ in doc_keywords[:20]])
126
+ print(f"共同关键词: {list(common_keywords)}")
127
+
128
+ return common_keywords
129
+
130
+ # ======================
131
+ # 演示使用
132
+ # ======================
133
+
134
+ if __name__ == "__main__":
135
+ # 1. 准备文献库 (实际应用中可从文件/数据库加载)
136
+ corpus = [
137
+ "机器学习是人工智能的一个分支,专注于开发算法让计算机从数据中学习",
138
+ "深度学习是机器学习的一个子领域,使用多层神经网络处理复杂模式",
139
+ "自然语言处理(NLP)使计算机能够理解、解释和生成人类语言",
140
+ "计算机视觉关注如何让计算机从图像和视频中获得高层次的理解",
141
+ "强化学习是一种机器学习方法,智能体通过与环境互动学习最优行为策略",
142
+ "监督学习使用标记数据训练模型,无监督学习则处理未标记数据",
143
+ "神经网络是受人脑启发的计算模型,由相互连接的节点层组成",
144
+ "卷积神经网络(CNN)特别适合处理图像识别任务",
145
+ "循环神经网络(RNN)设计用于处理序列数据,如文本和时间序列",
146
+ "Transformer模型通过自注意力机制处理序列数据,成为NLP的主流架构",
147
+ "生成对抗网络(GAN)由生成器和判别器组成,用于生成新数据样本",
148
+ "迁移学习允许将在一个任务上学到的知识应用到另一个相关任务",
149
+ "数据挖掘是从大型数据集中发现模式、关联和异常的过程",
150
+ "特征工程是创建更好的输入特征以提高模型性能的过程",
151
+ "过拟合发生在模型过于复杂,过度记忆训练数据而泛化能力差",
152
+ "正则化技术如L1/L2正则化用于防止过拟合",
153
+ "梯度下降是优化神经网络权重的主要算法",
154
+ "反向传播是训练神经网络的关键算法,用于计算梯度",
155
+ "激活函数如ReLU引入非线性,使神经网络能够学习复杂模式",
156
+ "批量归一化通过标准化层输入加速训练并提高稳定性"
157
+ ]
158
+
159
+ # 2. 创建文本匹配器
160
+ print("="*50)
161
+ print("创建文本匹配器")
162
+ print("="*50)
163
+ matcher = TextMatcher(
164
+ algorithm='brute', # 对于小数据集,暴力搜索足够快
165
+ n_jobs=-1 # 使用所有CPU核心
166
+ )
167
+
168
+ # 3. 训练模型
169
+ matcher.fit(corpus)
170
+
171
+ # 4. 执行查询
172
+ print("\n" + "="*50)
173
+ print("执行查询: '神经网络在人工智能中的应用'")
174
+ print("="*50)
175
+ query = "神经网络在人工智能中的应用"
176
+ indices, similarities = matcher.search(query, n=3)
177
+
178
+ # 5. 显示结果
179
+ print("\n最相似的文献:")
180
+ for rank, (idx, sim) in enumerate(zip(indices, similarities)):
181
+ print(f"\nTop {rank+1} [相似度: {sim:.4f}]:")
182
+ print(f"文献 #{idx}: {corpus[idx]}")
183
+
184
+ # 解释匹配
185
+ matcher.explain_match(query, idx)
186
+
187
+ # 6. 性能测试 (可选)
188
+ print("\n" + "="*50)
189
+ print("性能测试")
190
+ print("="*50)
191
+
192
+ # 测试不同文献库大小的性能
193
+ corpus_sizes = [100, 500, 1000, 5000]
194
+ times = []
195
+
196
+ for size in corpus_sizes:
197
+ # 创建更大的文献库
198
+ large_corpus = corpus * (size // len(corpus) + 1)
199
+ large_corpus = large_corpus[:size]
200
+
201
+ # 创建新的匹配器
202
+ test_matcher = TextMatcher(algorithm='brute', n_jobs=-1)
203
+
204
+ # 测量训练时间
205
+ start_time = time.time()
206
+ test_matcher.fit(large_corpus)
207
+ train_time = time.time() - start_time
208
+
209
+ # 测量查询时间
210
+ start_time = time.time()
211
+ test_matcher.search(query, n=5)
212
+ search_time = time.time() - start_time
213
+
214
+ times.append((size, train_time, search_time))
215
+ print(f"文献库大小: {size} | 训练时间: {train_time:.4f}s | 查询时间: {search_time:.6f}s")
216
+
217
+ # 可视化性能结果
218
+ sizes, train_times, search_times = zip(*times)
219
+
220
+ plt.figure(figsize=(12, 6))
221
+
222
+ plt.subplot(1, 2, 1)
223
+ plt.plot(sizes, train_times, 'o-')
224
+ plt.title('训练时间 vs 文献库大小')
225
+ plt.xlabel('文献数量')
226
+ plt.ylabel('时间 (秒)')
227
+ plt.grid(True)
228
+
229
+ plt.subplot(1, 2, 2)
230
+ plt.plot(sizes, search_times, 'o-')
231
+ plt.title('查询时间 vs 文献库大小')
232
+ plt.xlabel('文献数量')
233
+ plt.ylabel('时间 (秒)')
234
+ plt.grid(True)
235
+
236
+ plt.tight_layout()
237
+ plt.savefig('performance.png')
238
+ print("\n性能图表已保存为 'performance.png'")
239
+
240
+ # 7. 相似度矩阵可视化 (可选)
241
+ print("\n" + "="*50)
242
+ print("文献相似度矩阵")
243
+ print("="*50)
244
+
245
+ # 计算所有文献的TF-IDF向量
246
+ vectors = matcher.vectorizer.transform(corpus)
247
+
248
+ # 计算余弦相似度矩阵
249
+ sim_matrix = cosine_similarity(vectors)
250
+
251
+ # 创建DataFrame用于可视化
252
+ df = pd.DataFrame(sim_matrix,
253
+ columns=[f"Doc{i}" for i in range(len(corpus))],
254
+ index=[f"Doc{i}" for i in range(len(corpus))])
255
+
256
+ plt.figure(figsize=(12, 10))
257
+ sns.heatmap(df, cmap="YlGnBu", annot=False)
258
+ plt.title("文献相似度矩阵")
259
+ plt.tight_layout()
260
+ plt.savefig('similarity_matrix.png')
261
261
  print("相似度矩阵已保存为 'similarity_matrix.png'")
@@ -1,2 +1,2 @@
1
- aa ="肿瘤 学 研究 进展"
1
+ aa ="肿瘤 学 研究 进展"
2
2
  print(str.split(aa))