arxiv-pulse 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- arxiv_pulse/.ENV.TEMPLATE +72 -0
- arxiv_pulse/__init__.py +26 -0
- arxiv_pulse/__version__.py +33 -0
- arxiv_pulse/arxiv_crawler.py +377 -0
- arxiv_pulse/cli.py +1608 -0
- arxiv_pulse/config.py +64 -0
- arxiv_pulse/models.py +255 -0
- arxiv_pulse/output_manager.py +235 -0
- arxiv_pulse/report_generator.py +768 -0
- arxiv_pulse/search_engine.py +367 -0
- arxiv_pulse/summarizer.py +356 -0
- arxiv_pulse-0.5.0.dist-info/METADATA +546 -0
- arxiv_pulse-0.5.0.dist-info/RECORD +17 -0
- arxiv_pulse-0.5.0.dist-info/WHEEL +5 -0
- arxiv_pulse-0.5.0.dist-info/entry_points.txt +2 -0
- arxiv_pulse-0.5.0.dist-info/licenses/LICENSE +674 -0
- arxiv_pulse-0.5.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,367 @@
|
|
|
1
|
+
"""
|
|
2
|
+
增强搜索引擎 - 提供高级搜索和过滤功能
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from datetime import datetime, timedelta
|
|
7
|
+
from typing import List, Dict, Any, Optional, Union
|
|
8
|
+
from dataclasses import dataclass, field
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
|
|
11
|
+
from sqlalchemy import and_, or_, not_, func, desc, asc
|
|
12
|
+
from sqlalchemy.orm import Session
|
|
13
|
+
|
|
14
|
+
from arxiv_pulse.models import Paper
|
|
15
|
+
from arxiv_pulse.output_manager import output
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _default_search_fields() -> List[str]:
    """Return a fresh list of the columns searched when none are specified."""
    return ["title", "abstract", "categories", "search_query"]


@dataclass
class SearchFilter:
    """Declarative description of a single paper search.

    Every attribute is optional; a default-constructed instance describes an
    unfiltered search returning the first 20 papers, newest ``published``
    first. Field order is part of the interface because the dataclass can be
    constructed positionally.
    """

    # --- Text search -------------------------------------------------------
    # Free-text query; None (or an empty string) disables text matching.
    query: Optional[str] = None
    # Names of the Paper columns the query is matched against.
    search_fields: List[str] = field(default_factory=_default_search_fields)

    # --- Category filtering ------------------------------------------------
    # Categories to include.
    categories: Optional[List[str]] = None
    # Categories to exclude.
    exclude_categories: Optional[List[str]] = None
    # Required primary category.
    primary_category: Optional[str] = None

    # --- Author filtering --------------------------------------------------
    authors: Optional[List[str]] = None
    # One of "contains", "exact", "any".
    author_match: str = "contains"

    # --- Date filtering ----------------------------------------------------
    date_from: Optional[datetime] = None
    date_to: Optional[datetime] = None
    # Restrict to the most recent N days.
    days_back: Optional[int] = None

    # --- Processing-status filtering ---------------------------------------
    summarized_only: bool = False
    downloaded_only: bool = False

    # --- Pagination --------------------------------------------------------
    limit: int = 20
    offset: int = 0

    # --- Sorting -----------------------------------------------------------
    # One of "published", "relevance_score", "title", "updated".
    sort_by: str = "published"
    # Either "asc" or "desc".
    sort_order: str = "desc"

    # --- Similarity search -------------------------------------------------
    similar_to_paper_id: Optional[str] = None
    similarity_threshold: float = 0.5

    # --- Advanced options --------------------------------------------------
    # True combines text conditions with AND, False with OR.
    match_all: bool = False
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class SearchEngine:
    """Enhanced paper search engine backed by a SQLAlchemy session.

    Each ``build_*`` helper returns a SQLAlchemy boolean expression, or
    ``None`` when its inputs impose no constraint, so callers can drop the
    ``None`` results and AND the remainder together.
    """

    def __init__(self, db_session: Session):
        """Store the session used for every query issued by this engine."""
        self.session = db_session

    def build_text_filter(self, query: str, search_fields: List[str], match_all: bool = False):
        """Build a substring-match condition over the requested text columns.

        Args:
            query: Text that must appear in the searched columns.
            search_fields: Column names to search. Names other than "title",
                "abstract", "categories", "search_query" and "authors" are
                silently ignored.
            match_all: When true the text must appear in every listed column
                (AND); otherwise any one column suffices (OR).

        Returns:
            A SQLAlchemy expression, or ``None`` if ``query`` or
            ``search_fields`` is empty or no recognised field was given.
        """
        if not query or not search_fields:
            return None

        searchable = ("title", "abstract", "categories", "search_query", "authors")
        # The original note says ``authors`` holds a JSON string; it is matched
        # as plain text like every other column. Columns are resolved lazily so
        # only the requested attributes are touched.
        filters = [
            getattr(Paper, field_name).contains(query)
            for field_name in search_fields
            if field_name in searchable
        ]

        if not filters:
            return None

        return and_(*filters) if match_all else or_(*filters)

    def build_category_filter(
        self,
        categories: Optional[List[str]] = None,
        exclude_categories: Optional[List[str]] = None,
        primary_category: Optional[str] = None,
    ):
        """Build the category condition.

        Args:
            categories: A paper matches if its ``categories`` text contains
                at least one of these values.
            exclude_categories: A paper matches only if its ``categories``
                text contains none of these values.
            primary_category: Required exact value of ``primary_category``.

        Returns:
            The AND of all requested conditions, or ``None`` if none were
            requested.
        """
        filters = []

        # Matching is by substring on the stored categories text, so a value
        # such as "cs.C" also matches "cs.CL".
        if categories:
            filters.append(or_(*[Paper.categories.contains(cat) for cat in categories]))

        if exclude_categories:
            filters.append(and_(*[not_(Paper.categories.contains(cat)) for cat in exclude_categories]))

        if primary_category:
            filters.append(Paper.primary_category == primary_category)

        return and_(*filters) if filters else None

    def build_author_filter(self, authors: Optional[List[str]] = None, match_type: str = "contains"):
        """Build the author condition.

        Args:
            authors: Author names to look for.
            match_type: "contains" requires every name to appear as a
                substring; "any" requires at least one name to appear as a
                substring; "exact" requires every name to appear as a
                complete ``"name": "<author>"`` fragment of the stored text.

        Returns:
            A SQLAlchemy expression, or ``None`` when ``authors`` is empty.
        """
        if not authors:
            return None

        if match_type == "exact":
            # Approximates an exact match with a substring test on the
            # serialized text (the original cites SQLite limitations).
            filters = [Paper.authors.contains(f'"name": "{author}"') for author in authors]
        else:  # "contains" and "any" both use a plain substring test
            filters = [Paper.authors.contains(author) for author in authors]

        combine = or_ if match_type == "any" else and_
        return combine(*filters)

    def build_date_filter(
        self, date_from: Optional[datetime] = None, date_to: Optional[datetime] = None, days_back: Optional[int] = None
    ):
        """Build the publication-date condition.

        Args:
            date_from: Inclusive lower bound on ``published``.
            date_to: Inclusive upper bound on ``published``.
            days_back: Keep only papers published within this many days of
                the current UTC time. A value of 0 is treated as "not set".

        Returns:
            The AND of all requested bounds, or ``None`` if none were given.
        """
        filters = []

        if days_back:
            # NOTE(review): utcnow() yields a naive datetime; presumably the
            # ``published`` column is stored naive as well - confirm before
            # switching to a timezone-aware clock.
            cutoff_date = datetime.utcnow() - timedelta(days=days_back)
            filters.append(Paper.published >= cutoff_date)

        if date_from:
            filters.append(Paper.published >= date_from)

        if date_to:
            filters.append(Paper.published <= date_to)

        return and_(*filters) if filters else None

    def build_status_filter(self, summarized_only: bool = False, downloaded_only: bool = False):
        """Build the processing-status condition.

        Args:
            summarized_only: Keep only papers whose ``summarized`` flag is set.
            downloaded_only: Keep only papers whose ``downloaded`` flag is set.

        Returns:
            The AND of the requested flags, or ``None`` if neither was set.
        """
        filters = []

        # ``== True`` is intentional: it builds a SQL comparison rather than
        # evaluating a Python boolean.
        if summarized_only:
            filters.append(Paper.summarized == True)  # noqa: E712

        if downloaded_only:
            filters.append(Paper.downloaded == True)  # noqa: E712

        return and_(*filters) if filters else None

    def get_sort_column(self, sort_by: str, sort_order: str = "desc"):
        """Return the ORDER BY expression for the requested sort.

        Args:
            sort_by: One of "published", "relevance_score", "title",
                "updated" or "created_at"; any other value falls back to
                "published".
            sort_order: "desc" for descending; any other value sorts
                ascending.
        """
        sortable = ("published", "relevance_score", "title", "updated", "created_at")
        column = getattr(Paper, sort_by if sort_by in sortable else "published")
        return desc(column) if sort_order == "desc" else asc(column)

    def search_papers(self, filter_config: SearchFilter) -> List[Paper]:
        """Run a search described by ``filter_config``.

        All non-empty conditions are ANDed together, then sorting and
        offset/limit pagination are applied.

        Returns:
            The matching papers, or an empty list if anything fails (the
            failure is reported through ``output`` rather than raised).
        """
        try:
            query = self.session.query(Paper)

            # Each builder yields None when it has nothing to contribute;
            # build_text_filter itself returns None for an empty query.
            candidates = [
                self.build_text_filter(filter_config.query, filter_config.search_fields, filter_config.match_all),
                self.build_category_filter(
                    filter_config.categories, filter_config.exclude_categories, filter_config.primary_category
                ),
                self.build_author_filter(filter_config.authors, filter_config.author_match),
                self.build_date_filter(filter_config.date_from, filter_config.date_to, filter_config.days_back),
                self.build_status_filter(filter_config.summarized_only, filter_config.downloaded_only),
            ]
            filters = [condition for condition in candidates if condition is not None]

            if filters:
                query = query.filter(and_(*filters))

            ordering = self.get_sort_column(filter_config.sort_by, filter_config.sort_order)
            query = query.order_by(ordering)
            query = query.offset(filter_config.offset).limit(filter_config.limit)

            papers = query.all()
            output.debug(f"搜索找到 {len(papers)} 篇论文")
            return papers

        except Exception as e:
            # Deliberate best-effort boundary: report and return no results.
            output.error(f"搜索失败: {str(e)}")
            import traceback

            output.debug(f"搜索失败详情: {traceback.format_exc()}")
            return []

    def search_similar_papers(
        self, paper_id: str, limit: int = 10, threshold: float = 0.5
    ) -> List[tuple[Paper, float]]:
        """Find papers whose category set overlaps that of ``paper_id``.

        Similarity is ``shared / max(len(target), len(candidate))`` computed
        over the whitespace-separated category tokens of each paper. Papers
        sharing no category are never returned, whatever the threshold.

        Args:
            paper_id: arXiv id of the reference paper.
            limit: Maximum number of results.
            threshold: Minimum similarity (inclusive) for a paper to be kept.

        Returns:
            ``(paper, similarity)`` pairs, highest similarity first; an empty
            list if the reference paper is unknown, has no categories, or an
            error occurs.
        """
        try:
            target_paper = self.session.query(Paper).filter(Paper.arxiv_id == paper_id).first()
            if not target_paper:
                output.warn(f"未找到论文: {paper_id}")
                return []

            target_cats = set(target_paper.categories.split()) if target_paper.categories else set()
            if not target_cats:
                # Nothing can overlap an empty set, so skip loading the table.
                return []

            # Simplified similarity: every other paper is loaded and compared
            # in Python; a real text-similarity measure could replace this.
            candidates = self.session.query(Paper).filter(Paper.arxiv_id != paper_id).all()

            scored = []
            for paper in candidates:
                if not paper.categories:
                    continue

                paper_cats = set(paper.categories.split())
                shared = target_cats & paper_cats
                if not shared:
                    continue

                similarity = len(shared) / max(len(target_cats), len(paper_cats))
                if similarity >= threshold:
                    scored.append((paper, similarity))

            # list.sort is stable, so ties keep their query order.
            scored.sort(key=lambda pair: pair[1], reverse=True)
            return scored[:limit]

        except Exception as e:
            output.error("相似论文搜索失败", details={"exception": str(e)})
            return []

    def get_search_history(self, limit: int = 10) -> List[Dict[str, Any]]:
        """Summarise the distinct ``search_query`` values stored on papers.

        Args:
            limit: Maximum number of distinct queries to report.

        Returns:
            One dict per query, most frequent first, with keys ``query``,
            ``count`` (papers carrying that query), ``last_used`` (the
            ``published`` value of the most recently published such paper)
            and ``last_paper_id`` (that paper's arXiv id). An empty list is
            returned on error.
        """
        try:
            grouped = (
                self.session.query(Paper.search_query, func.count(Paper.id).label("count"))
                .filter(Paper.search_query.isnot(None))
                .group_by(Paper.search_query)
                .order_by(desc("count"))
                .limit(limit)
                .all()
            )

            history = []
            for search_query, count in grouped:
                # One extra lookup per distinct query; bounded by ``limit``.
                recent_paper = (
                    self.session.query(Paper)
                    .filter(Paper.search_query == search_query)
                    .order_by(desc(Paper.published))
                    .first()
                )

                history.append(
                    {
                        "query": search_query,
                        "count": count,
                        "last_used": recent_paper.published if recent_paper else None,
                        "last_paper_id": recent_paper.arxiv_id if recent_paper else None,
                    }
                )

            return history

        except Exception as e:
            output.error("获取搜索历史失败", details={"exception": str(e)})
            return []

    def save_search_query(self, query: str, description: Optional[str] = None):
        """Placeholder for persisting a search query; currently does nothing.

        History is presently derived from the ``search_query`` column of the
        Paper table; this hook could later write to a dedicated history table.
        """
        pass
|