re-common 10.0.39__py3-none-any.whl → 10.0.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (221)
  1. re_common/baselibrary/__init__.py +4 -4
  2. re_common/baselibrary/baseabs/__init__.py +6 -6
  3. re_common/baselibrary/baseabs/baseabs.py +26 -26
  4. re_common/baselibrary/database/mbuilder.py +132 -132
  5. re_common/baselibrary/database/moudle.py +93 -93
  6. re_common/baselibrary/database/msqlite3.py +194 -194
  7. re_common/baselibrary/database/mysql.py +169 -169
  8. re_common/baselibrary/database/sql_factory.py +26 -26
  9. re_common/baselibrary/mthread/MThreadingRun.py +486 -486
  10. re_common/baselibrary/mthread/MThreadingRunEvent.py +349 -349
  11. re_common/baselibrary/mthread/__init__.py +2 -2
  12. re_common/baselibrary/mthread/mythreading.py +695 -695
  13. re_common/baselibrary/pakge_other/socks.py +404 -404
  14. re_common/baselibrary/readconfig/config_factory.py +18 -18
  15. re_common/baselibrary/readconfig/ini_config.py +317 -317
  16. re_common/baselibrary/readconfig/toml_config.py +49 -49
  17. re_common/baselibrary/temporary/envdata.py +36 -36
  18. re_common/baselibrary/tools/all_requests/aiohttp_request.py +118 -118
  19. re_common/baselibrary/tools/all_requests/httpx_requet.py +102 -102
  20. re_common/baselibrary/tools/all_requests/mrequest.py +412 -412
  21. re_common/baselibrary/tools/all_requests/requests_request.py +81 -81
  22. re_common/baselibrary/tools/batch_compre/bijiao_batch.py +31 -31
  23. re_common/baselibrary/tools/contrast_db3.py +123 -123
  24. re_common/baselibrary/tools/copy_file.py +39 -39
  25. re_common/baselibrary/tools/db3_2_sizedb3.py +102 -102
  26. re_common/baselibrary/tools/foreachgz.py +39 -39
  27. re_common/baselibrary/tools/get_attr.py +10 -10
  28. re_common/baselibrary/tools/image_to_pdf.py +61 -61
  29. re_common/baselibrary/tools/java_code_deal.py +139 -139
  30. re_common/baselibrary/tools/javacode.py +79 -79
  31. re_common/baselibrary/tools/mdb_db3.py +48 -48
  32. re_common/baselibrary/tools/merge_file.py +171 -171
  33. re_common/baselibrary/tools/merge_gz_file.py +165 -165
  34. re_common/baselibrary/tools/mhdfstools/down_hdfs_files.py +42 -42
  35. re_common/baselibrary/tools/mhdfstools/hdfst.py +42 -42
  36. re_common/baselibrary/tools/mhdfstools/up_hdfs_files.py +38 -38
  37. re_common/baselibrary/tools/mongo_tools.py +50 -50
  38. re_common/baselibrary/tools/move_file.py +170 -170
  39. re_common/baselibrary/tools/move_mongo/mongo_table_to_file.py +63 -63
  40. re_common/baselibrary/tools/move_mongo/move_mongo_table.py +354 -354
  41. re_common/baselibrary/tools/move_mongo/use_mttf.py +18 -18
  42. re_common/baselibrary/tools/move_mongo/use_mv.py +93 -93
  43. re_common/baselibrary/tools/mpandas/mpandasreadexcel.py +125 -125
  44. re_common/baselibrary/tools/mpandas/pandas_visualization.py +7 -7
  45. re_common/baselibrary/tools/myparsel.py +104 -104
  46. re_common/baselibrary/tools/rename_dir_file.py +37 -37
  47. re_common/baselibrary/tools/sequoiadb_utils.py +398 -398
  48. re_common/baselibrary/tools/split_line_to_many.py +25 -25
  49. re_common/baselibrary/tools/stringtodicts.py +33 -33
  50. re_common/baselibrary/tools/workwechant_bot.py +84 -84
  51. re_common/baselibrary/utils/baseaiohttp.py +296 -296
  52. re_common/baselibrary/utils/baseaiomysql.py +87 -87
  53. re_common/baselibrary/utils/baseallstep.py +191 -191
  54. re_common/baselibrary/utils/baseavro.py +19 -19
  55. re_common/baselibrary/utils/baseboto3.py +291 -291
  56. re_common/baselibrary/utils/basecsv.py +32 -32
  57. re_common/baselibrary/utils/basedict.py +133 -133
  58. re_common/baselibrary/utils/basedir.py +241 -241
  59. re_common/baselibrary/utils/baseencode.py +351 -351
  60. re_common/baselibrary/utils/baseencoding.py +28 -28
  61. re_common/baselibrary/utils/baseesdsl.py +86 -86
  62. re_common/baselibrary/utils/baseexcel.py +264 -264
  63. re_common/baselibrary/utils/baseexcept.py +109 -109
  64. re_common/baselibrary/utils/basefile.py +654 -654
  65. re_common/baselibrary/utils/baseftp.py +214 -214
  66. re_common/baselibrary/utils/basegzip.py +60 -60
  67. re_common/baselibrary/utils/basehdfs.py +135 -135
  68. re_common/baselibrary/utils/basehttpx.py +268 -268
  69. re_common/baselibrary/utils/baseip.py +87 -87
  70. re_common/baselibrary/utils/basejson.py +2 -2
  71. re_common/baselibrary/utils/baselist.py +32 -32
  72. re_common/baselibrary/utils/basemotor.py +190 -190
  73. re_common/baselibrary/utils/basemssql.py +98 -98
  74. re_common/baselibrary/utils/baseodbc.py +113 -113
  75. re_common/baselibrary/utils/basepandas.py +302 -302
  76. re_common/baselibrary/utils/basepeewee.py +11 -11
  77. re_common/baselibrary/utils/basepika.py +180 -180
  78. re_common/baselibrary/utils/basepydash.py +143 -143
  79. re_common/baselibrary/utils/basepymongo.py +230 -230
  80. re_common/baselibrary/utils/basequeue.py +22 -22
  81. re_common/baselibrary/utils/baserar.py +57 -57
  82. re_common/baselibrary/utils/baserequest.py +279 -279
  83. re_common/baselibrary/utils/baseset.py +8 -8
  84. re_common/baselibrary/utils/basesmb.py +403 -403
  85. re_common/baselibrary/utils/basestring.py +382 -382
  86. re_common/baselibrary/utils/basetime.py +320 -320
  87. re_common/baselibrary/utils/baseurl.py +121 -121
  88. re_common/baselibrary/utils/basezip.py +57 -57
  89. re_common/baselibrary/utils/core/__init__.py +7 -7
  90. re_common/baselibrary/utils/core/bottomutils.py +18 -18
  91. re_common/baselibrary/utils/core/mdeprecated.py +327 -327
  92. re_common/baselibrary/utils/core/mlamada.py +16 -16
  93. re_common/baselibrary/utils/core/msginfo.py +25 -25
  94. re_common/baselibrary/utils/core/requests_core.py +103 -103
  95. re_common/baselibrary/utils/fateadm.py +429 -429
  96. re_common/baselibrary/utils/importfun.py +123 -123
  97. re_common/baselibrary/utils/mfaker.py +57 -57
  98. re_common/baselibrary/utils/my_abc/__init__.py +3 -3
  99. re_common/baselibrary/utils/my_abc/better_abc.py +32 -32
  100. re_common/baselibrary/utils/mylogger.py +414 -414
  101. re_common/baselibrary/utils/myredisclient.py +861 -861
  102. re_common/baselibrary/utils/pipupgrade.py +21 -21
  103. re_common/baselibrary/utils/ringlist.py +85 -85
  104. re_common/baselibrary/utils/version_compare.py +36 -36
  105. re_common/baselibrary/utils/ydmhttp.py +126 -126
  106. re_common/facade/lazy_import.py +11 -11
  107. re_common/facade/loggerfacade.py +25 -25
  108. re_common/facade/mysqlfacade.py +467 -467
  109. re_common/facade/now.py +31 -31
  110. re_common/facade/sqlite3facade.py +257 -257
  111. re_common/facade/use/mq_use_facade.py +83 -83
  112. re_common/facade/use/proxy_use_facade.py +19 -19
  113. re_common/libtest/base_dict_test.py +19 -19
  114. re_common/libtest/baseavro_test.py +13 -13
  115. re_common/libtest/basefile_test.py +14 -14
  116. re_common/libtest/basemssql_test.py +77 -77
  117. re_common/libtest/baseodbc_test.py +7 -7
  118. re_common/libtest/basepandas_test.py +38 -38
  119. re_common/libtest/get_attr_test/get_attr_test_settings.py +14 -14
  120. re_common/libtest/get_attr_test/settings.py +54 -54
  121. re_common/libtest/idencode_test.py +53 -53
  122. re_common/libtest/iniconfig_test.py +35 -35
  123. re_common/libtest/ip_test.py +34 -34
  124. re_common/libtest/merge_file_test.py +20 -20
  125. re_common/libtest/mfaker_test.py +8 -8
  126. re_common/libtest/mm3_test.py +31 -31
  127. re_common/libtest/mylogger_test.py +88 -88
  128. re_common/libtest/myparsel_test.py +27 -27
  129. re_common/libtest/mysql_test.py +151 -151
  130. re_common/libtest/pymongo_test.py +21 -21
  131. re_common/libtest/split_test.py +11 -11
  132. re_common/libtest/sqlite3_merge_test.py +5 -5
  133. re_common/libtest/sqlite3_test.py +34 -34
  134. re_common/libtest/tomlconfig_test.py +30 -30
  135. re_common/libtest/use_tools_test/__init__.py +2 -2
  136. re_common/libtest/user/__init__.py +4 -4
  137. re_common/studio/__init__.py +4 -4
  138. re_common/studio/assignment_expressions.py +36 -36
  139. re_common/studio/mydash/test1.py +18 -18
  140. re_common/studio/pydashstudio/first.py +9 -9
  141. re_common/studio/streamlitstudio/first_app.py +65 -65
  142. re_common/studio/streamlitstudio/uber_pickups.py +23 -23
  143. re_common/studio/test.py +18 -18
  144. re_common/v2/baselibrary/business_utils/BusinessStringUtil.py +235 -220
  145. re_common/v2/baselibrary/business_utils/baseencodeid.py +100 -100
  146. re_common/v2/baselibrary/business_utils/full_doi_path.py +116 -116
  147. re_common/v2/baselibrary/business_utils/rel_tools.py +6 -6
  148. re_common/v2/baselibrary/decorators/utils.py +59 -59
  149. re_common/v2/baselibrary/helpers/search_packge/NearestNeighbors_test.py +105 -105
  150. re_common/v2/baselibrary/helpers/search_packge/fit_text_match.py +253 -253
  151. re_common/v2/baselibrary/helpers/search_packge/scikit_learn_text_matcher.py +260 -260
  152. re_common/v2/baselibrary/helpers/search_packge/test.py +1 -1
  153. re_common/v2/baselibrary/s3object/baseboto3.py +230 -230
  154. re_common/v2/baselibrary/tools/WeChatRobot.py +95 -95
  155. re_common/v2/baselibrary/tools/ac_ahocorasick.py +75 -75
  156. re_common/v2/baselibrary/tools/concurrency.py +35 -35
  157. re_common/v2/baselibrary/tools/data_processer/base.py +53 -53
  158. re_common/v2/baselibrary/tools/data_processer/data_processer.py +497 -508
  159. re_common/v2/baselibrary/tools/data_processer/data_reader.py +187 -187
  160. re_common/v2/baselibrary/tools/data_processer/data_writer.py +38 -38
  161. re_common/v2/baselibrary/tools/dict_tools.py +44 -44
  162. re_common/v2/baselibrary/tools/dolphinscheduler.py +187 -187
  163. re_common/v2/baselibrary/tools/hdfs_base_processor.py +204 -204
  164. re_common/v2/baselibrary/tools/hdfs_bulk_processor.py +67 -67
  165. re_common/v2/baselibrary/tools/hdfs_data_processer.py +338 -338
  166. re_common/v2/baselibrary/tools/hdfs_line_processor.py +74 -74
  167. re_common/v2/baselibrary/tools/list_tools.py +69 -69
  168. re_common/v2/baselibrary/tools/resume_tracker.py +94 -94
  169. re_common/v2/baselibrary/tools/search_hash_tools.py +54 -54
  170. re_common/v2/baselibrary/tools/text_matcher.py +326 -326
  171. re_common/v2/baselibrary/tools/tree_processor/__init__.py +0 -0
  172. re_common/v2/baselibrary/tools/tree_processor/builder.py +25 -0
  173. re_common/v2/baselibrary/tools/tree_processor/node.py +13 -0
  174. re_common/v2/baselibrary/tools/unionfind_tools.py +60 -60
  175. re_common/v2/baselibrary/utils/BusinessStringUtil.py +196 -196
  176. re_common/v2/baselibrary/utils/api_net_utils.py +270 -270
  177. re_common/v2/baselibrary/utils/author_smi.py +361 -361
  178. re_common/v2/baselibrary/utils/base_string_similarity.py +158 -158
  179. re_common/v2/baselibrary/utils/basedict.py +37 -37
  180. re_common/v2/baselibrary/utils/basehdfs.py +163 -163
  181. re_common/v2/baselibrary/utils/basepika.py +180 -180
  182. re_common/v2/baselibrary/utils/basetime.py +94 -77
  183. re_common/v2/baselibrary/utils/db.py +174 -156
  184. re_common/v2/baselibrary/utils/elasticsearch.py +46 -0
  185. re_common/v2/baselibrary/utils/json_cls.py +16 -16
  186. re_common/v2/baselibrary/utils/mq.py +83 -83
  187. re_common/v2/baselibrary/utils/n_ary_expression_tree.py +243 -243
  188. re_common/v2/baselibrary/utils/string_bool.py +187 -186
  189. re_common/v2/baselibrary/utils/string_clear.py +246 -246
  190. re_common/v2/baselibrary/utils/string_smi.py +18 -18
  191. re_common/v2/baselibrary/utils/stringutils.py +312 -271
  192. re_common/vip/base_step_process.py +11 -11
  193. re_common/vip/baseencodeid.py +90 -90
  194. re_common/vip/changetaskname.py +28 -28
  195. re_common/vip/core_var.py +24 -24
  196. re_common/vip/mmh3Hash.py +89 -89
  197. re_common/vip/proxy/allproxys.py +127 -127
  198. re_common/vip/proxy/allproxys_thread.py +159 -159
  199. re_common/vip/proxy/cnki_proxy.py +153 -153
  200. re_common/vip/proxy/kuaidaili.py +87 -87
  201. re_common/vip/proxy/proxy_all.py +113 -113
  202. re_common/vip/proxy/update_kuaidaili_0.py +42 -42
  203. re_common/vip/proxy/wanfang_proxy.py +152 -152
  204. re_common/vip/proxy/wp_proxy_all.py +181 -181
  205. re_common/vip/read_rawid_to_txt.py +91 -91
  206. re_common/vip/title/__init__.py +5 -5
  207. re_common/vip/title/transform/TransformBookTitleToZt.py +125 -125
  208. re_common/vip/title/transform/TransformConferenceTitleToZt.py +139 -139
  209. re_common/vip/title/transform/TransformCstadTitleToZt.py +195 -195
  210. re_common/vip/title/transform/TransformJournalTitleToZt.py +203 -203
  211. re_common/vip/title/transform/TransformPatentTitleToZt.py +132 -132
  212. re_common/vip/title/transform/TransformRegulationTitleToZt.py +114 -114
  213. re_common/vip/title/transform/TransformStandardTitleToZt.py +135 -135
  214. re_common/vip/title/transform/TransformThesisTitleToZt.py +135 -135
  215. re_common/vip/title/transform/__init__.py +10 -10
  216. {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/LICENSE +201 -201
  217. {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/METADATA +16 -16
  218. re_common-10.0.41.dist-info/RECORD +252 -0
  219. {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/WHEEL +1 -1
  220. re_common-10.0.39.dist-info/RECORD +0 -248
  221. {re_common-10.0.39.dist-info → re_common-10.0.41.dist-info}/top_level.txt +0 -0
@@ -1,254 +1,254 @@
1
- import gzip
2
- import io
3
- import multiprocessing
4
- import os
5
- import time
6
-
7
- import jieba
8
- import joblib
9
-
10
- from sklearn.feature_extraction.text import TfidfVectorizer
11
- from sklearn.neighbors import NearestNeighbors
12
-
13
-
14
- def create_gzip_joblib(obj):
15
- temp_io = io.BytesIO()
16
- with gzip.GzipFile(fileobj=temp_io, mode='wb') as f:
17
- joblib.dump(obj, f)
18
- temp_io.seek(0)
19
- return temp_io
20
-
21
-
22
- def get_gzip_joblib(temp_io):
23
- with gzip.GzipFile(fileobj=temp_io, mode='rb') as f:
24
- loaded_obj = joblib.load(f)
25
- return loaded_obj
26
-
27
-
28
- class JiebaTokenizer:
29
- def __call__(self, doc):
30
- return [tok for tok in jieba.cut(doc) if tok.strip()]
31
-
32
-
33
- class SplitTokenizer:
34
- def __call__(self, doc):
35
- return str.split(doc)
36
-
37
-
38
- def get_auto_n_jobs(fraction=0.5, max_jobs=16):
39
- """
40
- 智能分配 CPU 核心数,用于设置 sklearn 的 n_jobs 参数。
41
-
42
- 参数:
43
- fraction: 使用总核数的比例(如 0.5 表示一半)
44
- max_jobs: 最大允许使用的核心数(防止过多)
45
-
46
- 返回:
47
- 合理的 n_jobs 整数值
48
- """
49
- total_cores = multiprocessing.cpu_count()
50
- suggested = int(total_cores * fraction)
51
- n_jobs = min(max(1, suggested), max_jobs)
52
- return n_jobs
53
-
54
-
55
- class FitTextMatcher:
56
- """
57
- 高性能文本匹配器
58
- 基于 TF-IDF + 最近邻搜索实现相似文献查找
59
- """
60
-
61
- def __init__(self, algorithm='brute', metric='cosine', n_jobs=-1, tokenizer=JiebaTokenizer()):
62
- """
63
- 初始化文本匹配器
64
-
65
- 参数:
66
- algorithm: 搜索算法 ('brute', 'kd_tree', 'ball_tree', 'lshf')
67
- metric: 距离度量 ('cosine', 'euclidean', 'manhattan')
68
- n_jobs: 并行作业数 (-1 表示使用所有CPU核心)
69
- """
70
- self.vectorizer = TfidfVectorizer(
71
- max_features=None, # 限制特征数量以提高性能
72
- tokenizer=tokenizer,
73
- stop_words=None, # 中文不适用 'english'
74
- ngram_range=(1, 2) # 使用单字和双字组合
75
- )
76
-
77
- self.nn = NearestNeighbors(
78
- algorithm=algorithm,
79
- metric=metric,
80
- n_jobs=n_jobs # 并行处理加速搜索
81
- )
82
-
83
- self.corpus = None
84
- self.corpus_size = 0
85
-
86
- def fit(self, corpus):
87
- """
88
- 训练匹配器
89
- """
90
- self.corpus = corpus
91
- self.corpus_size = len(corpus)
92
- print(f"处理 {self.corpus_size} 篇文献...")
93
-
94
- # 向量化文本
95
- start_time = time.time()
96
- X = self.vectorizer.fit_transform(corpus)
97
- vectorization_time = time.time() - start_time
98
- print(f"TF-IDF 向量化完成, 耗时: {vectorization_time:.4f}秒")
99
- print(f"特征维度: {X.shape[1]}")
100
-
101
- # 训练最近邻模型
102
- start_time = time.time()
103
- self.nn.fit(X)
104
- training_time = time.time() - start_time
105
- print(f"最近邻模型训练完成, 耗时: {training_time:.4f}秒")
106
-
107
- return self
108
-
109
- def save(self, path, name):
110
- """
111
- 保存模型和向量器
112
- """
113
- os.makedirs(path, exist_ok=True)
114
- joblib.dump(self.vectorizer, os.path.join(path, name + "_vectorizer.joblib"))
115
- joblib.dump(self.nn, os.path.join(path, name + "_nn_model.joblib"))
116
- joblib.dump(self.corpus, os.path.join(path, name + "_corpus.joblib"))
117
- print(f"模型保存至 {path}")
118
- return self
119
-
120
- def get_save_bytes_io(self, idx_list=None):
121
- """
122
- 保存模型和向量器
123
- """
124
- if idx_list is None:
125
- idx_list = []
126
-
127
- result_list = []
128
- for i in [self.vectorizer, self.nn, self.corpus, idx_list]:
129
- temp_io = create_gzip_joblib(i)
130
- result_list.append(temp_io)
131
- print(f"获取模型字节码成功")
132
- return result_list
133
-
134
- def load(self, path, name):
135
- """
136
- 从文件加载模型
137
- """
138
- self.vectorizer = joblib.load(os.path.join(path, name + "_vectorizer.joblib"))
139
- self.nn = joblib.load(os.path.join(path, name + "_nn_model.joblib"))
140
- self.corpus = joblib.load(os.path.join(path, name + "_corpus.joblib"))
141
- self.corpus_size = len(self.corpus)
142
- print(f"模型从 {path} 加载完成,共 {self.corpus_size} 篇文献")
143
- return self
144
-
145
- def load_bytes(self, vec, nn, corpus, idx):
146
- # 解压并加载对象
147
- with gzip.GzipFile(fileobj=vec, mode='rb') as gz:
148
- self.vectorizer = joblib.load(gz)
149
- with gzip.GzipFile(fileobj=nn, mode='rb') as gz:
150
- self.nn = joblib.load(gz)
151
- with gzip.GzipFile(fileobj=corpus, mode='rb') as gz:
152
- self.corpus = joblib.load(gz)
153
- with gzip.GzipFile(fileobj=idx, mode='rb') as gz:
154
- self.idx = joblib.load(gz)
155
- self.corpus_size = max(len(self.corpus), len(self.idx))
156
- print(f"加载bytes完成,共 {self.corpus_size} 篇文献")
157
- return self
158
-
159
- def search(self, query, n=5, return_scores=True):
160
- """
161
- 查找相似文献
162
-
163
- 参数:
164
- query: 查询文本
165
- n: 返回最相似文献的数量
166
- return_scores: 是否返回相似度分数
167
-
168
- 返回:
169
- 匹配的文献索引和相似度分数
170
- """
171
- if self.corpus is None:
172
- raise ValueError("请先使用 fit() 方法训练模型")
173
-
174
- # 向量化查询文本
175
- query_vec = self.vectorizer.transform([query])
176
-
177
- # 查找最近邻
178
- start_time = time.time()
179
- distances, indices = self.nn.kneighbors(query_vec, n_neighbors=n)
180
- search_time = time.time() - start_time
181
-
182
- # print(f"搜索完成, 耗时: {search_time:.6f}秒")
183
-
184
- # 将距离转换为相似度 (余弦距离 = 1 - 余弦相似度)
185
- similarities = 1 - distances
186
-
187
- # 返回结果
188
- if return_scores:
189
- return indices[0], similarities[0]
190
- return indices[0]
191
-
192
- def batch_search(self, queries, n=5, return_scores=True):
193
- """
194
- 批量查找相似文献(一次处理多条 query)
195
-
196
- 参数:
197
- queries: 查询文本列表
198
- n: 每条 query 返回多少条相似文献
199
- return_scores: 是否返回相似度分数
200
-
201
- 返回:
202
- 一个列表,包含每条 query 的匹配索引和相似度 [(indices1, sims1), (indices2, sims2), ...]
203
- """
204
- if self.corpus is None:
205
- raise ValueError("请先使用 fit() 方法训练模型")
206
-
207
- start_time = time.time()
208
-
209
- # 向量化所有 query,一次性
210
- query_vecs = self.vectorizer.transform(queries)
211
-
212
- # 查找最近邻
213
- distances, indices = self.nn.kneighbors(query_vecs, n_neighbors=n)
214
- search_time = time.time() - start_time
215
- # print(f"批量搜索完成,共 {len(queries)} 条,耗时: {search_time:.4f}秒")
216
-
217
- if return_scores:
218
- similarities = 1 - distances
219
- return indices, similarities
220
- return indices
221
-
222
- def explain_match(self, query, index):
223
- """
224
- 解释匹配结果 - 显示查询和匹配文献的关键词
225
- """
226
- # 获取TF-IDF特征名
227
- feature_names = self.vectorizer.get_feature_names_out()
228
-
229
- # 向量化查询和匹配文献
230
- query_vec = self.vectorizer.transform([query])
231
- doc_vec = self.vectorizer.transform([self.corpus[index]])
232
-
233
- # 获取重要特征
234
- query_data = zip(feature_names, query_vec.toarray()[0])
235
- doc_data = zip(feature_names, doc_vec.toarray()[0])
236
-
237
- # 筛选非零特征
238
- query_keywords = [(word, score) for word, score in query_data if score > 0]
239
- doc_keywords = [(word, score) for word, score in doc_data if score > 0]
240
-
241
- # 按重要性排序
242
- query_keywords.sort(key=lambda x: x[1], reverse=True)
243
- doc_keywords.sort(key=lambda x: x[1], reverse=True)
244
-
245
- # 打印结果
246
- print(f"\n匹配文献 #{index} 解释:")
247
- print(f"查询关键词: {[word for word, _ in query_keywords[:10]]}")
248
- print(f"文献关键词: {[word for word, _ in doc_keywords[:10]]}")
249
-
250
- # 计算共同关键词
251
- common_keywords = set([word for word, _ in query_keywords[:20]]) & set([word for word, _ in doc_keywords[:20]])
252
- print(f"共同关键词: {list(common_keywords)}")
253
-
1
+ import gzip
2
+ import io
3
+ import multiprocessing
4
+ import os
5
+ import time
6
+
7
+ import jieba
8
+ import joblib
9
+
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sklearn.neighbors import NearestNeighbors
12
+
13
+
14
def create_gzip_joblib(obj):
    """Serialize *obj* with joblib into a gzip-compressed in-memory buffer.

    Returns:
        io.BytesIO rewound to position 0, ready for reading or upload.
    """
    buffer = io.BytesIO()
    gz = gzip.GzipFile(fileobj=buffer, mode='wb')
    try:
        joblib.dump(obj, gz)
    finally:
        # Must close the gzip member before reading so the trailer is flushed.
        gz.close()
    buffer.seek(0)
    return buffer
20
+
21
+
22
def get_gzip_joblib(temp_io):
    """Load and return a joblib object from a gzip-compressed buffer.

    Counterpart of create_gzip_joblib; *temp_io* must be positioned at the
    start of the gzip stream.
    """
    gz = gzip.GzipFile(fileobj=temp_io, mode='rb')
    try:
        return joblib.load(gz)
    finally:
        gz.close()
26
+
27
+
28
class JiebaTokenizer:
    """Callable tokenizer that segments text with jieba.

    Whitespace-only tokens are discarded.
    """

    def __call__(self, doc):
        tokens = []
        for token in jieba.cut(doc):
            if token.strip():
                tokens.append(token)
        return tokens
31
+
32
+
33
class SplitTokenizer:
    """Callable tokenizer that splits a string on runs of whitespace."""

    def __call__(self, doc):
        return str.split(doc)
36
+
37
+
38
def get_auto_n_jobs(fraction=0.5, max_jobs=16):
    """Pick a sensible n_jobs value for sklearn from the machine's core count.

    Parameters:
        fraction: share of the total core count to use (e.g. 0.5 for half).
        max_jobs: hard upper bound on the result (guards against oversubscription).

    Returns:
        An int clamped to the range [1, max_jobs].
    """
    candidate = int(multiprocessing.cpu_count() * fraction)
    if candidate < 1:
        candidate = 1
    if candidate > max_jobs:
        candidate = max_jobs
    return candidate
53
+
54
+
55
class FitTextMatcher:
    """High-performance text matcher.

    Finds similar documents using TF-IDF vectorization plus scikit-learn
    nearest-neighbour search.
    """

    def __init__(self, algorithm='brute', metric='cosine', n_jobs=-1, tokenizer=None):
        """Initialize the matcher.

        Parameters:
            algorithm: search algorithm ('brute', 'kd_tree', 'ball_tree', 'lshf')
            metric: distance metric ('cosine', 'euclidean', 'manhattan')
            n_jobs: number of parallel jobs (-1 means all CPU cores)
            tokenizer: callable mapping a document to a token list; defaults to
                a fresh JiebaTokenizer per instance. (Fix: the previous default
                ``tokenizer=JiebaTokenizer()`` was evaluated once at function
                definition time, so every instance shared one tokenizer object.)
        """
        if tokenizer is None:
            tokenizer = JiebaTokenizer()

        self.vectorizer = TfidfVectorizer(
            max_features=None,  # no feature cap; set an int here to bound memory
            tokenizer=tokenizer,
            stop_words=None,  # 'english' stop words do not apply to Chinese text
            ngram_range=(1, 2)  # unigrams and bigrams
        )

        self.nn = NearestNeighbors(
            algorithm=algorithm,
            metric=metric,
            n_jobs=n_jobs  # parallelize the neighbour search
        )

        self.corpus = None
        self.corpus_size = 0
        # External-id list populated by load_bytes(); initialized here so the
        # attribute always exists (previously it was only created inside
        # load_bytes, causing AttributeError on other code paths).
        self.idx = []

    def fit(self, corpus):
        """Train the matcher on *corpus* (a sequence of document strings).

        Returns self for chaining.
        """
        self.corpus = corpus
        self.corpus_size = len(corpus)
        print(f"处理 {self.corpus_size} 篇文献...")

        # Vectorize the corpus.
        start_time = time.time()
        X = self.vectorizer.fit_transform(corpus)
        vectorization_time = time.time() - start_time
        print(f"TF-IDF 向量化完成, 耗时: {vectorization_time:.4f}秒")
        print(f"特征维度: {X.shape[1]}")

        # Fit the nearest-neighbour index on the TF-IDF matrix.
        start_time = time.time()
        self.nn.fit(X)
        training_time = time.time() - start_time
        print(f"最近邻模型训练完成, 耗时: {training_time:.4f}秒")

        return self

    def save(self, path, name):
        """Persist the vectorizer, NN model and corpus under *path* with prefix *name*.

        Returns self for chaining.
        """
        os.makedirs(path, exist_ok=True)
        joblib.dump(self.vectorizer, os.path.join(path, name + "_vectorizer.joblib"))
        joblib.dump(self.nn, os.path.join(path, name + "_nn_model.joblib"))
        joblib.dump(self.corpus, os.path.join(path, name + "_corpus.joblib"))
        print(f"模型保存至 {path}")
        return self

    def get_save_bytes_io(self, idx_list=None):
        """Serialize vectorizer, NN model, corpus and *idx_list* to gzip BytesIO buffers.

        Returns:
            list of four io.BytesIO objects, in the order
            [vectorizer, nn_model, corpus, idx_list] — matching load_bytes().
        """
        if idx_list is None:
            idx_list = []

        result_list = []
        for component in [self.vectorizer, self.nn, self.corpus, idx_list]:
            result_list.append(create_gzip_joblib(component))
        print("获取模型字节码成功")
        return result_list

    def load(self, path, name):
        """Load a model previously written by save() from *path* with prefix *name*.

        Returns self for chaining.
        """
        self.vectorizer = joblib.load(os.path.join(path, name + "_vectorizer.joblib"))
        self.nn = joblib.load(os.path.join(path, name + "_nn_model.joblib"))
        self.corpus = joblib.load(os.path.join(path, name + "_corpus.joblib"))
        self.corpus_size = len(self.corpus)
        print(f"模型从 {path} 加载完成,共 {self.corpus_size} 篇文献")
        return self

    def load_bytes(self, vec, nn, corpus, idx):
        """Load model components from gzip-compressed buffers (see get_save_bytes_io).

        Parameters:
            vec, nn, corpus, idx: file-like objects positioned at the start of
                their gzip streams.

        Returns self for chaining.
        """
        with gzip.GzipFile(fileobj=vec, mode='rb') as gz:
            self.vectorizer = joblib.load(gz)
        with gzip.GzipFile(fileobj=nn, mode='rb') as gz:
            self.nn = joblib.load(gz)
        with gzip.GzipFile(fileobj=corpus, mode='rb') as gz:
            self.corpus = joblib.load(gz)
        with gzip.GzipFile(fileobj=idx, mode='rb') as gz:
            self.idx = joblib.load(gz)
        # Either the corpus or the id list may be empty; use the larger count.
        self.corpus_size = max(len(self.corpus), len(self.idx))
        print(f"加载bytes完成,共 {self.corpus_size} 篇文献")
        return self

    def search(self, query, n=5, return_scores=True):
        """Find the *n* documents most similar to *query*.

        Parameters:
            query: query text
            n: number of neighbours to return
            return_scores: whether to also return similarity scores

        Returns:
            (indices, similarities) when return_scores is True, else indices.

        Raises:
            ValueError: if fit() (or a load method) has not been called yet.
        """
        if self.corpus is None:
            raise ValueError("请先使用 fit() 方法训练模型")

        # Vectorize the query with the fitted vocabulary.
        query_vec = self.vectorizer.transform([query])

        # Nearest-neighbour lookup.
        start_time = time.time()
        distances, indices = self.nn.kneighbors(query_vec, n_neighbors=n)
        search_time = time.time() - start_time

        # print(f"搜索完成, 耗时: {search_time:.6f}秒")

        # NOTE(review): this assumes cosine distance (similarity = 1 - distance);
        # with metric='euclidean'/'manhattan' the value is not a true similarity
        # — confirm callers only rely on scores when metric='cosine'.
        similarities = 1 - distances

        if return_scores:
            return indices[0], similarities[0]
        return indices[0]

    def batch_search(self, queries, n=5, return_scores=True):
        """Find similar documents for many queries in one call.

        Parameters:
            queries: list of query texts
            n: neighbours to return per query
            return_scores: whether to also return similarity scores

        Returns:
            (indices, similarities) arrays of shape (len(queries), n) when
            return_scores is True, else just indices.

        Raises:
            ValueError: if fit() (or a load method) has not been called yet.
        """
        if self.corpus is None:
            raise ValueError("请先使用 fit() 方法训练模型")

        start_time = time.time()

        # Vectorize all queries in a single transform call.
        query_vecs = self.vectorizer.transform(queries)

        distances, indices = self.nn.kneighbors(query_vecs, n_neighbors=n)
        search_time = time.time() - start_time
        # print(f"批量搜索完成,共 {len(queries)} 条,耗时: {search_time:.4f}秒")

        if return_scores:
            # NOTE(review): similarity conversion assumes cosine distance — see search().
            similarities = 1 - distances
            return indices, similarities
        return indices

    def explain_match(self, query, index):
        """Explain a match: print top keywords of the query and of document *index*.

        Returns:
            set of keywords shared by the query's and the document's top-20 terms.
        """
        feature_names = self.vectorizer.get_feature_names_out()

        # Vectorize both texts to compare their TF-IDF weights.
        query_vec = self.vectorizer.transform([query])
        doc_vec = self.vectorizer.transform([self.corpus[index]])

        query_data = zip(feature_names, query_vec.toarray()[0])
        doc_data = zip(feature_names, doc_vec.toarray()[0])

        # Keep only terms that actually occur (non-zero weight).
        query_keywords = [(word, score) for word, score in query_data if score > 0]
        doc_keywords = [(word, score) for word, score in doc_data if score > 0]

        # Rank by TF-IDF weight, highest first.
        query_keywords.sort(key=lambda x: x[1], reverse=True)
        doc_keywords.sort(key=lambda x: x[1], reverse=True)

        print(f"\n匹配文献 #{index} 解释:")
        print(f"查询关键词: {[word for word, _ in query_keywords[:10]]}")
        print(f"文献关键词: {[word for word, _ in doc_keywords[:10]]}")

        # Overlap of the two top-20 keyword sets.
        common_keywords = set([word for word, _ in query_keywords[:20]]) & set([word for word, _ in doc_keywords[:20]])
        print(f"共同关键词: {list(common_keywords)}")

        return common_keywords