re-common 10.0.33__py3-none-any.whl → 10.0.35__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,10 @@
  # String handling for certain business scenarios; these are tools for specific cases, not general-purpose utilities
  import re

+ from re_common.v2.baselibrary.utils.author_smi import AuthorRatio
  from re_common.v2.baselibrary.utils.string_bool import is_all_symbols
+ from re_common.v2.baselibrary.utils.string_clear import rel_clear
+ from re_common.v2.baselibrary.utils.stringutils import clean_unicode_alnum, get_alphabetic_ratio


  def clean_organ_postcode(organ):
@@ -194,3 +197,24 @@ def deal_num(num_str):
      num_str = deal_num_strs(num_str)

      return num_str.lower().strip()
+
+
+ def clear_author_1st(author_str: str):
+     # Strip bracketed content in case earlier steps did not clean it up
+     author_str = re.sub("\\[.*?]", "", author_str)
+     author_str = re.sub("\\(.*?\\)", "", author_str)
+
+     # Strip symbols
+     author_str = clean_unicode_alnum(author_str)
+
+     return author_str
+
+ def is_same_author(a1, a2):
+     if get_alphabetic_ratio(a1.strip()) > 0.7 and get_alphabetic_ratio(a2.strip()) > 0.7:
+         author_similar_ = AuthorRatio(a1.strip(), a2.strip())
+         if author_similar_ > 0.95:
+             return True
+     else:
+         if rel_clear(a1.strip()) == rel_clear(a2.strip()):
+             return True
+     return False
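
For reference, a minimal usage sketch of the two helpers added in this hunk (not part of the diff; the import path follows the RECORD entry for BusinessStringUtil.py, and the sample names are made up):

# Assumed import path; both helpers are defined in BusinessStringUtil.py per this hunk.
from re_common.v2.baselibrary.business_utils.BusinessStringUtil import (
    clear_author_1st,
    is_same_author,
)

a1 = clear_author_1st("Smith, John [Harvard University]")  # bracketed affiliation removed, symbols cleaned
a2 = clear_author_1st("Smith John (Harvard)")
# For mostly-alphabetic names the check relies on the AuthorRatio similarity exceeding 0.95;
# otherwise it falls back to comparing the rel_clear()-normalized strings.
print(is_same_author(a1, a2))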
@@ -10,7 +10,7 @@ class BaseLngid(object):
      def __int__(self):
          pass

-     def BaseEncodeID(self, strRaw):
+     def basBaseEncodeID(self, strRaw):
          r""" Custom base encoding """

          strEncode = base64.b32encode(strRaw.encode('utf8')).decode('utf8')
@@ -0,0 +1,105 @@
+ import datetime
+ import gzip
+ import io
+ import time
+
+ import joblib
+ from hdfs import InsecureClient
+
+ from re_common.v2.baselibrary.helpers.search_packge.fit_text_match import FitTextMatcher
+
+ if __name__ == '__main__':
+     matcher = FitTextMatcher(
+         algorithm='auto',  # for a small dataset brute-force search ('brute') is fast enough
+         n_jobs=2  # use all CPU cores
+
+     )
+
+     client = InsecureClient("http://VIP-DC-MASTER-2:9870", user="xujiang")
+
+     lists = []
+     for i in ["vectorizer", "nn_model", "corpus", "idx"]:
+         fit_file_path = f"/b_task_data/class_smi/fit_file/t_23600_{i}.joblib.gz"
+         with client.read(fit_file_path) as reader:
+             tp = io.BytesIO(reader.read())
+             tp.seek(0)
+             lists.append(tp)
+
+     with gzip.GzipFile(fileobj=lists[2], mode='rb') as gz:
+         matcher.corpus = joblib.load(gz)
+
+     with gzip.GzipFile(fileobj=lists[3], mode='rb') as gz:
+         matcher.idx = joblib.load(gz)
+     matcher.corpus_size = max(len(matcher.corpus), len(matcher.idx))
+     print(f"Loading from bytes finished, {matcher.corpus_size} documents in total")
+
+     matcher.fit(matcher.corpus)
+
+     print(matcher.nn._fit_method)
+
+     print("fit training finished")
+
+     count = 0
+     bacth_list = []
+     n = min(100, matcher.corpus_size)
+     for i in matcher.corpus:
+         count = count + 1
+         bacth_list.append(i)
+         if count % 10000 == 0:
+             t1 = time.time()
+             index, similarities = matcher.batch_search(bacth_list, n=n)
+             for rank, (idxs, sims) in enumerate(zip(index, similarities)):
+                 print({"keyid": matcher.idx[rank],
+                        "search_list": [(matcher.idx[idx], sim) for idx, sim in zip(idxs, sims)]})
+
+             t2 = time.time()
+             now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S,%f")[:-3]
+             print(now, count, t2 - t1)
+             bacth_list.clear()
+
+
+
+
+
+
+
+     # with open("t_8220_corpus.joblib","rb") as f:
+     #     buf = io.BytesIO(f.read())
+     #     buf.seek(0)
+     #     corpus = buf
+     #
+     # matcher.corpus = joblib.load(corpus)
+     # print(len(matcher.corpus))
+
+     # matcher.load_bytes(vec, nn, corpus)
+
+     # with open(r"C:\Users\Administrator\Desktop\update\part-02440\part-02440_1", "r", encoding="utf-8") as f:
+     #     lists = [line.strip() for line in f if line]
+     #
+     # matcher.fit(lists)
+
+     # matcher.load("./","test")
+
+     # query = r"herbdrug interaction in the protective effect of alpinia officinarum against gastric injury induced by indomethacin based on pharmacokinetic tissue distribution and excretion studies in rats"
+     # result = matcher.search(query, n=100)
+     # print("query", query)
+     # for rank, (idx, sim) in enumerate(result):
+     #     print(f"\nTop {rank + 1} [similarity: {sim:.4f}]:")
+     #     print(f"Document #{idx}: {lists[idx]}")
+     #     print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
+     # time.sleep(100)
+
+     # for query in lists[:1000]:
+     #     indices, similarities = matcher.search(query, n=100)
+     #     print("query", query)
+     #     for rank, (idx, sim) in enumerate(zip(indices, similarities)):
+     #         print(f"\nTop {rank + 1} [similarity: {sim:.4f}]:")
+     #         print(f"Document #{idx}: {lists[idx]}")
+     #         print(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
+     #     time.sleep(100)
+     #
+     # idx,sim = matcher.batch_search(lists[:1000], n=100)
+     # for rank, (idxs, sims) in enumerate(zip(idx,sim)):
+     #     tp = (lists[rank],[(lists[idx], sim) for idx,sim in zip(idxs,sims)])
+     #     print(tp)
+     # time.sleep(100)
@@ -0,0 +1,254 @@
+ import gzip
+ import io
+ import multiprocessing
+ import os
+ import time
+
+ import jieba
+ import joblib
+
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.neighbors import NearestNeighbors
+
+
+ def create_gzip_joblib(obj):
+     temp_io = io.BytesIO()
+     with gzip.GzipFile(fileobj=temp_io, mode='wb') as f:
+         joblib.dump(obj, f)
+     temp_io.seek(0)
+     return temp_io
+
+
+ def get_gzip_joblib(temp_io):
+     with gzip.GzipFile(fileobj=temp_io, mode='rb') as f:
+         loaded_obj = joblib.load(f)
+     return loaded_obj
+
+
+ class JiebaTokenizer:
+     def __call__(self, doc):
+         return [tok for tok in jieba.cut(doc) if tok.strip()]
+
+
+ class SplitTokenizer:
+     def __call__(self, doc):
+         return str.split(doc)
+
+
+ def get_auto_n_jobs(fraction=0.5, max_jobs=16):
+     """
+     Allocate CPU cores sensibly for sklearn's n_jobs parameter.
+
+     Args:
+         fraction: fraction of the total cores to use (e.g. 0.5 means half)
+         max_jobs: maximum number of cores allowed (to avoid using too many)
+
+     Returns:
+         a reasonable integer n_jobs value
+     """
+     total_cores = multiprocessing.cpu_count()
+     suggested = int(total_cores * fraction)
+     n_jobs = min(max(1, suggested), max_jobs)
+     return n_jobs
+
+
+ class FitTextMatcher:
+     """
+     High-performance text matcher
+     Finds similar documents using TF-IDF + nearest-neighbor search
+     """
+
+     def __init__(self, algorithm='brute', metric='cosine', n_jobs=-1, tokenizer=JiebaTokenizer()):
+         """
+         Initialize the text matcher
+
+         Args:
+             algorithm: search algorithm ('brute', 'kd_tree', 'ball_tree', 'lshf')
+             metric: distance metric ('cosine', 'euclidean', 'manhattan')
+             n_jobs: number of parallel jobs (-1 means use all CPU cores)
+         """
+         self.vectorizer = TfidfVectorizer(
+             max_features=None,  # cap the number of features to improve performance
+             tokenizer=tokenizer,
+             stop_words=None,  # 'english' does not apply to Chinese
+             ngram_range=(1, 2)  # use unigrams and bigrams
+         )
+
+         self.nn = NearestNeighbors(
+             algorithm=algorithm,
+             metric=metric,
+             n_jobs=n_jobs  # parallelism speeds up the search
+         )
+
+         self.corpus = None
+         self.corpus_size = 0
+
+     def fit(self, corpus):
+         """
+         Train the matcher
+         """
+         self.corpus = corpus
+         self.corpus_size = len(corpus)
+         print(f"Processing {self.corpus_size} documents...")
+
+         # Vectorize the text
+         start_time = time.time()
+         X = self.vectorizer.fit_transform(corpus)
+         vectorization_time = time.time() - start_time
+         print(f"TF-IDF vectorization finished in {vectorization_time:.4f}s")
+         print(f"Feature dimensions: {X.shape[1]}")
+
+         # Fit the nearest-neighbor model
+         start_time = time.time()
+         self.nn.fit(X)
+         training_time = time.time() - start_time
+         print(f"Nearest-neighbor model trained in {training_time:.4f}s")
+
+         return self
+
+     def save(self, path, name):
+         """
+         Save the model and vectorizer
+         """
+         os.makedirs(path, exist_ok=True)
+         joblib.dump(self.vectorizer, os.path.join(path, name + "_vectorizer.joblib"))
+         joblib.dump(self.nn, os.path.join(path, name + "_nn_model.joblib"))
+         joblib.dump(self.corpus, os.path.join(path, name + "_corpus.joblib"))
+         print(f"Model saved to {path}")
+         return self
+
+     def get_save_bytes_io(self, idx_list=None):
+         """
+         Save the model and vectorizer
+         """
+         if idx_list is None:
+             idx_list = []
+
+         result_list = []
+         for i in [self.vectorizer, self.nn, self.corpus, idx_list]:
+             temp_io = create_gzip_joblib(i)
+             result_list.append(temp_io)
+         print(f"Model byte streams generated successfully")
+         return result_list
+
+     def load(self, path, name):
+         """
+         Load the model from files
+         """
+         self.vectorizer = joblib.load(os.path.join(path, name + "_vectorizer.joblib"))
+         self.nn = joblib.load(os.path.join(path, name + "_nn_model.joblib"))
+         self.corpus = joblib.load(os.path.join(path, name + "_corpus.joblib"))
+         self.corpus_size = len(self.corpus)
+         print(f"Model loaded from {path}, {self.corpus_size} documents in total")
+         return self
+
+     def load_bytes(self, vec, nn, corpus, idx):
+         # Decompress and load the objects
+         with gzip.GzipFile(fileobj=vec, mode='rb') as gz:
+             self.vectorizer = joblib.load(gz)
+         with gzip.GzipFile(fileobj=nn, mode='rb') as gz:
+             self.nn = joblib.load(gz)
+         with gzip.GzipFile(fileobj=corpus, mode='rb') as gz:
+             self.corpus = joblib.load(gz)
+         with gzip.GzipFile(fileobj=idx, mode='rb') as gz:
+             self.idx = joblib.load(gz)
+         self.corpus_size = max(len(self.corpus), len(self.idx))
+         print(f"Loading from bytes finished, {self.corpus_size} documents in total")
+         return self
+
+     def search(self, query, n=5, return_scores=True):
+         """
+         Find similar documents
+
+         Args:
+             query: the query text
+             n: number of most-similar documents to return
+             return_scores: whether to return similarity scores
+
+         Returns:
+             matched document indices and similarity scores
+         """
+         if self.corpus is None:
+             raise ValueError("Call fit() to train the model first")
+
+         # Vectorize the query text
+         query_vec = self.vectorizer.transform([query])
+
+         # Find the nearest neighbors
+         start_time = time.time()
+         distances, indices = self.nn.kneighbors(query_vec, n_neighbors=n)
+         search_time = time.time() - start_time
+
+         # print(f"Search finished in {search_time:.6f}s")
+
+         # Convert distance to similarity (cosine distance = 1 - cosine similarity)
+         similarities = 1 - distances
+
+         # Return the results
+         if return_scores:
+             return indices[0], similarities[0]
+         return indices[0]
+
+     def batch_search(self, queries, n=5, return_scores=True):
+         """
+         Batch lookup of similar documents (processes many queries at once)
+
+         Args:
+             queries: list of query texts
+             n: how many similar documents to return per query
+             return_scores: whether to return similarity scores
+
+         Returns:
+             a list with the matched indices and similarities for each query [(indices1, sims1), (indices2, sims2), ...]
+         """
+         if self.corpus is None:
+             raise ValueError("Call fit() to train the model first")
+
+         start_time = time.time()
+
+         # Vectorize all queries in one pass
+         query_vecs = self.vectorizer.transform(queries)
+
+         # Find the nearest neighbors
+         distances, indices = self.nn.kneighbors(query_vecs, n_neighbors=n)
+         search_time = time.time() - start_time
+         # print(f"Batch search over {len(queries)} queries finished in {search_time:.4f}s")
+
+         if return_scores:
+             similarities = 1 - distances
+             return indices, similarities
+         return indices
+
+     def explain_match(self, query, index):
+         """
+         Explain a match - show the keywords of the query and the matched document
+         """
+         # Get the TF-IDF feature names
+         feature_names = self.vectorizer.get_feature_names_out()
+
+         # Vectorize the query and the matched document
+         query_vec = self.vectorizer.transform([query])
+         doc_vec = self.vectorizer.transform([self.corpus[index]])
+
+         # Collect the important features
+         query_data = zip(feature_names, query_vec.toarray()[0])
+         doc_data = zip(feature_names, doc_vec.toarray()[0])
+
+         # Keep the non-zero features
+         query_keywords = [(word, score) for word, score in query_data if score > 0]
+         doc_keywords = [(word, score) for word, score in doc_data if score > 0]
+
+         # Sort by importance
+         query_keywords.sort(key=lambda x: x[1], reverse=True)
+         doc_keywords.sort(key=lambda x: x[1], reverse=True)
+
+         # Print the result
+         print(f"\nExplanation for matched document #{index}:")
+         print(f"Query keywords: {[word for word, _ in query_keywords[:10]]}")
+         print(f"Document keywords: {[word for word, _ in doc_keywords[:10]]}")
+
+         # Compute the shared keywords
+         common_keywords = set([word for word, _ in query_keywords[:20]]) & set([word for word, _ in doc_keywords[:20]])
+         print(f"Shared keywords: {list(common_keywords)}")
+
+         return common_keywords
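
For reference, a minimal round-trip sketch of the FitTextMatcher API added above (not part of the diff; assumes the wheel is installed, and uses a tiny pre-tokenized corpus so the whitespace-based SplitTokenizer is sufficient):

from re_common.v2.baselibrary.helpers.search_packge.fit_text_match import (
    FitTextMatcher,
    SplitTokenizer,
)

# Illustrative, pre-segmented corpus; any iterable of strings works.
corpus = ["机器 学习 基础", "深度 学习 与 神经 网络", "自然 语言 处理 入门"]

matcher = FitTextMatcher(algorithm='brute', n_jobs=1, tokenizer=SplitTokenizer())
matcher.fit(corpus)

indices, sims = matcher.search("神经 网络 学习", n=2)
for idx, sim in zip(indices, sims):
    print(idx, round(float(sim), 4), corpus[idx])

# Serialize everything to gzip-compressed BytesIO objects and restore into a fresh matcher.
vec_io, nn_io, corpus_io, idx_io = matcher.get_save_bytes_io(idx_list=list(range(len(corpus))))
restored = FitTextMatcher(tokenizer=SplitTokenizer()).load_bytes(vec_io, nn_io, corpus_io, idx_io)
print(restored.batch_search(corpus, n=2))

The same byte streams are what the NearestNeighbors_test.py script above pulls from HDFS before calling load_bytes / fit.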
@@ -0,0 +1,261 @@
+ import numpy as np
+ from sklearn.feature_extraction.text import TfidfVectorizer
+ from sklearn.neighbors import NearestNeighbors
+ import time
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import pandas as pd
+ from sklearn.metrics.pairwise import cosine_similarity
+
+ class TextMatcher:
+     """
+     High-performance text matcher
+     Finds similar documents using TF-IDF + nearest-neighbor search
+     """
+
+     def __init__(self, algorithm='brute', metric='cosine', n_jobs=-1):
+         """
+         Initialize the text matcher
+
+         Args:
+             algorithm: search algorithm ('brute', 'kd_tree', 'ball_tree', 'lshf')
+             metric: distance metric ('cosine', 'euclidean', 'manhattan')
+             n_jobs: number of parallel jobs (-1 means use all CPU cores)
+         """
+         self.vectorizer = TfidfVectorizer(
+             max_features=10000,  # cap the number of features to improve performance
+             stop_words='english',  # remove English stop words
+             ngram_range=(1, 2)  # use unigrams and bigrams
+         )
+
+         self.nn = NearestNeighbors(
+             algorithm=algorithm,
+             metric=metric,
+             n_jobs=n_jobs  # parallelism speeds up the search
+         )
+
+         self.corpus = None
+         self.corpus_size = 0
+
+     def fit(self, corpus):
+         """
+         Train the matcher
+         """
+         self.corpus = corpus
+         self.corpus_size = len(corpus)
+         print(f"Processing {self.corpus_size} documents...")
+
+         # Vectorize the text
+         start_time = time.time()
+         X = self.vectorizer.fit_transform(corpus)
+         vectorization_time = time.time() - start_time
+         print(f"TF-IDF vectorization finished in {vectorization_time:.4f}s")
+         print(f"Feature dimensions: {X.shape[1]}")
+
+         # Fit the nearest-neighbor model
+         start_time = time.time()
+         self.nn.fit(X)
+         training_time = time.time() - start_time
+         print(f"Nearest-neighbor model trained in {training_time:.4f}s")
+
+         return self
+
+     def search(self, query, n=5, return_scores=True):
+         """
+         Find similar documents
+
+         Args:
+             query: the query text
+             n: number of most-similar documents to return
+             return_scores: whether to return similarity scores
+
+         Returns:
+             matched document indices and similarity scores
+         """
+         if self.corpus is None:
+             raise ValueError("Call fit() to train the model first")
+
+         # Vectorize the query text
+         query_vec = self.vectorizer.transform([query])
+
+         # Find the nearest neighbors
+         start_time = time.time()
+         distances, indices = self.nn.kneighbors(query_vec, n_neighbors=n)
+         search_time = time.time() - start_time
+
+         print(f"Search finished in {search_time:.6f}s")
+
+         # Convert distance to similarity (cosine distance = 1 - cosine similarity)
+         similarities = 1 - distances
+
+         # Return the results
+         if return_scores:
+             return indices[0], similarities[0]
+         return indices[0]
+
+     def explain_match(self, query, index):
+         """
+         Explain a match - show the keywords of the query and the matched document
+         """
+         # Get the TF-IDF feature names
+         feature_names = self.vectorizer.get_feature_names_out()
+
+         # Vectorize the query and the matched document
+         query_vec = self.vectorizer.transform([query])
+         doc_vec = self.vectorizer.transform([self.corpus[index]])
+
+         # Collect the important features
+         query_data = zip(feature_names, query_vec.toarray()[0])
+         doc_data = zip(feature_names, doc_vec.toarray()[0])
+
+         # Keep the non-zero features
+         query_keywords = [(word, score) for word, score in query_data if score > 0]
+         doc_keywords = [(word, score) for word, score in doc_data if score > 0]
+
+         # Sort by importance
+         query_keywords.sort(key=lambda x: x[1], reverse=True)
+         doc_keywords.sort(key=lambda x: x[1], reverse=True)
+
+         # Print the result
+         print(f"\nExplanation for matched document #{index}:")
+         print(f"Query keywords: {[word for word, _ in query_keywords[:10]]}")
+         print(f"Document keywords: {[word for word, _ in doc_keywords[:10]]}")
+
+         # Compute the shared keywords
+         common_keywords = set([word for word, _ in query_keywords[:20]]) & set([word for word, _ in doc_keywords[:20]])
+         print(f"Shared keywords: {list(common_keywords)}")
+
+         return common_keywords
+
+ # ======================
+ # Demo usage
+ # ======================
+
+ if __name__ == "__main__":
+     # 1. Prepare the document corpus (in practice, load it from files/databases)
+     corpus = [
+         "机器学习是人工智能的一个分支,专注于开发算法让计算机从数据中学习",
+         "深度学习是机器学习的一个子领域,使用多层神经网络处理复杂模式",
+         "自然语言处理(NLP)使计算机能够理解、解释和生成人类语言",
+         "计算机视觉关注如何让计算机从图像和视频中获得高层次的理解",
+         "强化学习是一种机器学习方法,智能体通过与环境互动学习最优行为策略",
+         "监督学习使用标记数据训练模型,无监督学习则处理未标记数据",
+         "神经网络是受人脑启发的计算模型,由相互连接的节点层组成",
+         "卷积神经网络(CNN)特别适合处理图像识别任务",
+         "循环神经网络(RNN)设计用于处理序列数据,如文本和时间序列",
+         "Transformer模型通过自注意力机制处理序列数据,成为NLP的主流架构",
+         "生成对抗网络(GAN)由生成器和判别器组成,用于生成新数据样本",
+         "迁移学习允许将在一个任务上学到的知识应用到另一个相关任务",
+         "数据挖掘是从大型数据集中发现模式、关联和异常的过程",
+         "特征工程是创建更好的输入特征以提高模型性能的过程",
+         "过拟合发生在模型过于复杂,过度记忆训练数据而泛化能力差",
+         "正则化技术如L1/L2正则化用于防止过拟合",
+         "梯度下降是优化神经网络权重的主要算法",
+         "反向传播是训练神经网络的关键算法,用于计算梯度",
+         "激活函数如ReLU引入非线性,使神经网络能够学习复杂模式",
+         "批量归一化通过标准化层输入加速训练并提高稳定性"
+     ]
+
+     # 2. Create the text matcher
+     print("="*50)
+     print("Creating the text matcher")
+     print("="*50)
+     matcher = TextMatcher(
+         algorithm='brute',  # for a small dataset brute-force search is fast enough
+         n_jobs=-1  # use all CPU cores
+     )
+
+     # 3. Train the model
+     matcher.fit(corpus)
+
+     # 4. Run a query
+     print("\n" + "="*50)
+     print("Running query: '神经网络在人工智能中的应用'")
+     print("="*50)
+     query = "神经网络在人工智能中的应用"
+     indices, similarities = matcher.search(query, n=3)
+
+     # 5. Show the results
+     print("\nMost similar documents:")
+     for rank, (idx, sim) in enumerate(zip(indices, similarities)):
+         print(f"\nTop {rank+1} [similarity: {sim:.4f}]:")
+         print(f"Document #{idx}: {corpus[idx]}")
+
+         # Explain the match
+         matcher.explain_match(query, idx)
+
+     # 6. Performance test (optional)
+     print("\n" + "="*50)
+     print("Performance test")
+     print("="*50)
+
+     # Measure performance at different corpus sizes
+     corpus_sizes = [100, 500, 1000, 5000]
+     times = []
+
+     for size in corpus_sizes:
+         # Build a larger corpus
+         large_corpus = corpus * (size // len(corpus) + 1)
+         large_corpus = large_corpus[:size]
+
+         # Create a fresh matcher
+         test_matcher = TextMatcher(algorithm='brute', n_jobs=-1)
+
+         # Measure training time
+         start_time = time.time()
+         test_matcher.fit(large_corpus)
+         train_time = time.time() - start_time
+
+         # Measure query time
+         start_time = time.time()
+         test_matcher.search(query, n=5)
+         search_time = time.time() - start_time
+
+         times.append((size, train_time, search_time))
+         print(f"Corpus size: {size} | training time: {train_time:.4f}s | query time: {search_time:.6f}s")
+
+     # Visualize the performance results
+     sizes, train_times, search_times = zip(*times)
+
+     plt.figure(figsize=(12, 6))
+
+     plt.subplot(1, 2, 1)
+     plt.plot(sizes, train_times, 'o-')
+     plt.title('Training time vs corpus size')
+     plt.xlabel('Number of documents')
+     plt.ylabel('Time (s)')
+     plt.grid(True)
+
+     plt.subplot(1, 2, 2)
+     plt.plot(sizes, search_times, 'o-')
+     plt.title('Query time vs corpus size')
+     plt.xlabel('Number of documents')
+     plt.ylabel('Time (s)')
+     plt.grid(True)
+
+     plt.tight_layout()
+     plt.savefig('performance.png')
+     print("\nPerformance chart saved as 'performance.png'")
+
+     # 7. Similarity-matrix visualization (optional)
+     print("\n" + "="*50)
+     print("Document similarity matrix")
+     print("="*50)
+
+     # Compute the TF-IDF vectors of all documents
+     vectors = matcher.vectorizer.transform(corpus)
+
+     # Compute the cosine-similarity matrix
+     sim_matrix = cosine_similarity(vectors)
+
+     # Build a DataFrame for visualization
+     df = pd.DataFrame(sim_matrix,
+                       columns=[f"Doc{i}" for i in range(len(corpus))],
+                       index=[f"Doc{i}" for i in range(len(corpus))])
+
+     plt.figure(figsize=(12, 10))
+     sns.heatmap(df, cmap="YlGnBu", annot=False)
+     plt.title("Document similarity matrix")
+     plt.tight_layout()
+     plt.savefig('similarity_matrix.png')
+     print("Similarity matrix saved as 'similarity_matrix.png'")
@@ -0,0 +1,2 @@
+ aa = "肿瘤 学 研究 进展"
+ print(str.split(aa))
@@ -7,6 +7,8 @@ from hdfs import InsecureClient
  class HDFSUtils(object):
      """
      HDFS utility class that wraps common HDFS operations.
+
+     InsecureClient: known flaw - writes of large files can fail silently without raising an error
      """

      def __init__(self, hdfs_url, hdfs_user):
@@ -3,6 +3,7 @@ import aiomysql
  import asyncio
  from contextlib import asynccontextmanager
  from typing import AsyncGenerator, Tuple
+ from collections import namedtuple

  from aiomysql import Pool, Connection, Cursor

@@ -65,6 +66,25 @@ async def get_session(pool: Pool) -> AsyncGenerator[Tuple[Connection, Cursor], N
          yield conn, cursor


+ async def dictfetchall(cursor: Cursor):
+     """
+     Return all rows from a cursor as a dict.
+     Assume the column names are unique.
+     """
+     columns = [col[0] for col in cursor.description]
+     return [dict(zip(columns, row)) for row in await cursor.fetchall()]
+
+
+ async def namedtuplefetchall(cursor: Cursor):
+     """
+     Return all rows from a cursor as a namedtuple.
+     Assume the column names are unique.
+     """
+     desc = cursor.description
+     nt_result = namedtuple("Result", [col[0] for col in desc])
+     return [nt_result(*row) for row in await cursor.fetchall()]
+
+
  # main.py


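For context, a minimal usage sketch of the two new fetch helpers together with the existing get_session context manager (not part of the diff; the connection settings and table are hypothetical, and the import path is assumed from the RECORD entry for re_common/v2/baselibrary/utils/db.py):

import asyncio

import aiomysql

# Assumed import path.
from re_common.v2.baselibrary.utils.db import get_session, dictfetchall, namedtuplefetchall


async def main():
    # Hypothetical connection settings, for illustration only.
    pool = await aiomysql.create_pool(host="127.0.0.1", port=3306, user="root", password="", db="test")
    async with get_session(pool) as (conn, cursor):
        await cursor.execute("SELECT id, name FROM users LIMIT 3")
        rows = await dictfetchall(cursor)         # [{'id': ..., 'name': ...}, ...]

        # Re-run the query before the second fetch; the cursor has already been consumed.
        await cursor.execute("SELECT id, name FROM users LIMIT 3")
        named = await namedtuplefetchall(cursor)  # [Result(id=..., name=...), ...]
    print(rows, named)
    pool.close()
    await pool.wait_closed()


asyncio.run(main())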
@@ -1,12 +1,11 @@
  import re
  import threading
+ from html.parser import HTMLParser
  from itertools import combinations

  import regex
  import unicodedata
- from html.parser import HTMLParser
-
- from unidecode import unidecode
+ from parsel import Selector

  from re_common.v2.baselibrary.utils.string_smi import JaroDamerauLevenshteinMaxSim

@@ -173,12 +172,19 @@ class HTMLTextExtractor(HTMLParser):
          return cls._thread_local.parser


+ # def clean_html(html):
+ #     parser = HTMLTextExtractor.get_parser()
+ #     parser.reset_state()
+ #     parser.feed(html)
+ #     parser.close()
+ #     return parser.get_text()
+
  def clean_html(html):
-     parser = HTMLTextExtractor.get_parser()
-     parser.reset_state()
-     parser.feed(html)
-     parser.close()
-     return parser.get_text()
+     """Extract the plain text from HTML using Parsel"""
+     sel = Selector(text=html)
+     # Extract all text (including the text of child elements)
+     text = sel.xpath("string()").getall()
+     return "".join(text).strip()


  def remove_spaces_between_chinese_characters(text):
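
For reference, a small sketch of the behaviour of the new Parsel-based clean_html (not part of the diff; the function body is copied from the hunk above so the example runs with just parsel installed):

from parsel import Selector


def clean_html(html):
    """Extract the plain text from HTML using Parsel (copy of the new implementation above)."""
    sel = Selector(text=html)
    text = sel.xpath("string()").getall()
    return "".join(text).strip()


print(clean_html("<div><p>Hello <b>world</b></p><p>again</p></div>"))
# Prints "Hello worldagain" - XPath string() concatenates text nodes without inserting separators.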
@@ -1,6 +1,6 @@
- Metadata-Version: 2.2
+ Metadata-Version: 2.1
  Name: re_common
- Version: 10.0.33
+ Version: 10.0.35
  Summary: a library about all python projects
  Home-page: https://gitee.com/xujiangios/re-common
  Author: vic
@@ -11,14 +11,6 @@ Classifier: Operating System :: OS Independent
  Requires-Python: >=3.6
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Dynamic: author
- Dynamic: author-email
- Dynamic: classifier
- Dynamic: description
- Dynamic: description-content-type
- Dynamic: home-page
- Dynamic: requires-python
- Dynamic: summary


  This is a base library that depends on many third-party packages; it wraps the commonly used third-party libraries so that projects can be built on top of it quickly
@@ -163,14 +163,19 @@ re_common/studio/streamlitstudio/first_app.py,sha256=t7Fw8YDlub7G9q99GgVo_3sPZXU
  re_common/studio/streamlitstudio/uber_pickups.py,sha256=cvrV5e8vRBM2_CpVDBE-f3V4mGFK9SqpRPZK8TEqr6U,785
  re_common/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  re_common/v2/baselibrary/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- re_common/v2/baselibrary/business_utils/BusinessStringUtil.py,sha256=njPcRgeBWpnZr5u2cPAO4qdWBq-CgTn99rJuvWFcChk,6788
+ re_common/v2/baselibrary/business_utils/BusinessStringUtil.py,sha256=PaYg_5fLV2FM-Hp8r08RkLorOilXHvCtBLfRUT2MDJk,7681
  re_common/v2/baselibrary/business_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- re_common/v2/baselibrary/business_utils/baseencodeid.py,sha256=3f52e0jtgCFzPEyReia8TupwiE64t_VyBT-a7uQCXAY,3595
+ re_common/v2/baselibrary/business_utils/baseencodeid.py,sha256=9opWNmDKg0ezSJKERHKaPF48W9PuChiUDNe64Xv08is,3598
  re_common/v2/baselibrary/business_utils/full_doi_path.py,sha256=vsoS1ZGyNzeORon_z1sHt1M41sS22pvJHMgWJH3xZ-M,3378
  re_common/v2/baselibrary/business_utils/rel_tools.py,sha256=LfnGFCkUSxg1SHvOMOQdP1PiHxIKqk7Syuk5YYpjJag,295
  re_common/v2/baselibrary/decorators/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  re_common/v2/baselibrary/decorators/utils.py,sha256=Q4D6KKCQxvNBXZkPQQn14keKKJpGtg8TUSakjJU40s0,2056
  re_common/v2/baselibrary/helpers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py,sha256=ygN_OzO6XDR-XcyY8hYfFNjfP2luek81lmV-bCbkb7A,3686
+ re_common/v2/baselibrary/helpers/search_packge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py,sha256=cMN4W7xuL6xzHp4U7SJnkE5i4FU2pMmiDivcDvD8zgA,8543
+ re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py,sha256=Ri8Ul2_URq1TVvlXwG0OvqBo9_LSpivvdvjQM7xr01I,9947
+ re_common/v2/baselibrary/helpers/search_packge/test.py,sha256=jYDa6s66jqiz6xEhXMPLqmONFbmfv-EgxaVpdHbGk4U,52
  re_common/v2/baselibrary/s3object/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  re_common/v2/baselibrary/s3object/baseboto3.py,sha256=mXuIFx99pnrPGQ4LJCZwlN1HLbaU-OWLwck0cVzW6hc,11203
  re_common/v2/baselibrary/tools/WeChatRobot.py,sha256=sKBt2gPsfj0gzV6KaLSAhIhL-j3qNfHfqE-lII1LVwM,3537
@@ -199,17 +204,17 @@ re_common/v2/baselibrary/utils/api_net_utils.py,sha256=22q3WMWiKVg1IVGr4y2D1Jrjh
  re_common/v2/baselibrary/utils/author_smi.py,sha256=Fh276u7za-GF_tK9xpBFYF5q1E3_tX22ZouWC8U7w8o,13831
  re_common/v2/baselibrary/utils/base_string_similarity.py,sha256=a40a79ttwoG_gC_hxMNB-sMXXecgICoRDWrj0DW8iEE,7749
  re_common/v2/baselibrary/utils/basedict.py,sha256=sH3_RZ8u4649-jX2V1uKNNkjJVUijZBDp6SdqncOZ88,1583
- re_common/v2/baselibrary/utils/basehdfs.py,sha256=TPwFct_-UrmO1KCbo4gpV77rsnlCQDumNBbQKL0ZI9o,5953
+ re_common/v2/baselibrary/utils/basehdfs.py,sha256=wwvk4kvipD-AVWCF0WRzBTM2q2wAvQtanLK-Hjp8FOk,6026
  re_common/v2/baselibrary/utils/basepika.py,sha256=ifOb3UsGj79k40aD9UK6-5BMPw43ZAo0SO3AYD4q4vw,7332
  re_common/v2/baselibrary/utils/basetime.py,sha256=b7U_ho6nE3fjYBxSkdMHXUOd3ClH6KkW_7p7l2Gs4gA,3038
- re_common/v2/baselibrary/utils/db.py,sha256=SdrLIhijgXUsXQL2aStQS-NgnAfWMvfn8RHEyqdoezI,2500
+ re_common/v2/baselibrary/utils/db.py,sha256=JktBem1IAsg7m-Wnsa-AT0Hj4bIkHyoKGBEDI1rL5Fc,3135
  re_common/v2/baselibrary/utils/json_cls.py,sha256=M93piYtmgm_wP8E57culTrd_AhHLoGg6PqeAJYdW2SM,438
  re_common/v2/baselibrary/utils/mq.py,sha256=UHpO8iNIHs91Tgp-BgnSUpZwjWquxrGLdpr3FMMv2zw,2858
  re_common/v2/baselibrary/utils/n_ary_expression_tree.py,sha256=-05kO6G2Rth7CEK-5lfFrthFZ1Q0-0a7cni7mWZ-2gg,9172
  re_common/v2/baselibrary/utils/string_bool.py,sha256=vxnjSFOfuHWGxkqaIbUNn21opx5tfV1uCXSahFfp1mU,6197
  re_common/v2/baselibrary/utils/string_clear.py,sha256=Ympa0Cs2y_72QeeyMS8de8y_QgtEFJJQ0AgHnylbMUc,7861
  re_common/v2/baselibrary/utils/string_smi.py,sha256=cU0WAWHRGnGoVQx3eCEKeM_q_olFNzRTJe7rSe586SY,741
- re_common/v2/baselibrary/utils/stringutils.py,sha256=eeuQYgXkWJ9apvyrYPcCCU3biTY9nD1KHos4_1ESNJE,7883
+ re_common/v2/baselibrary/utils/stringutils.py,sha256=jUmtyo4qu-GmE7_Y8gqFcYPJ7kOo8MSwxEsT7vGp9_c,8131
  re_common/vip/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  re_common/vip/base_step_process.py,sha256=VXXiNj0I5CpzXIMCgOPU86bzDJkSBkUS-9CpZIl_GOk,205
  re_common/vip/baseencodeid.py,sha256=nERoe89ueFM52bG7xwJdflcZHk6T2RQQKbc5uUZc3RM,3272
@@ -236,8 +241,8 @@ re_common/vip/title/transform/TransformRegulationTitleToZt.py,sha256=LKRdIsWKues
  re_common/vip/title/transform/TransformStandardTitleToZt.py,sha256=-fCKAbSBzXVyQDCE61CalvR9E_QzQMA08QOO_NePFNI,5563
  re_common/vip/title/transform/TransformThesisTitleToZt.py,sha256=QS-uV0cQrpUFAcKucuJQ9Ue2VRQH-inmfn_X3IplfRo,5488
  re_common/vip/title/transform/__init__.py,sha256=m83-CWyRq_VHPYHaALEQlmXrkTdrZ3e4B_kCfBYE-uc,239
- re_common-10.0.33.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
- re_common-10.0.33.dist-info/METADATA,sha256=bmraxgArGm6vGU0k3cC6EfoOY_T0SYZnuL-XVAIhI90,764
- re_common-10.0.33.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
- re_common-10.0.33.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
- re_common-10.0.33.dist-info/RECORD,,
+ re_common-10.0.35.dist-info/LICENSE,sha256=HrhfyXIkWY2tGFK11kg7vPCqhgh5DcxleloqdhrpyMY,11558
+ re_common-10.0.35.dist-info/METADATA,sha256=Uh894BDybODTMQemgCS-4hSIRVchegBTzUmBY7XZqRo,582
+ re_common-10.0.35.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+ re_common-10.0.35.dist-info/top_level.txt,sha256=_H9H23zoLIalm1AIY_KYTVh_H0ZnmjxQIxsvXtLv45o,10
+ re_common-10.0.35.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (75.8.0)
+ Generator: bdist_wheel (0.43.0)
  Root-Is-Purelib: true
  Tag: py3-none-any
