cnks 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnks-0.3.1.dist-info/METADATA +101 -0
- cnks-0.3.1.dist-info/RECORD +17 -0
- cnks-0.3.1.dist-info/entry_points.txt +5 -0
- src/ThisIsAServerSample.py +377 -0
- src/__init__.py +7 -0
- src/cache.py +451 -0
- src/citzer.py +868 -0
- src/click50.py +527 -0
- src/client.py +135 -0
- src/cssci.py +267 -0
- src/extractlink.py +262 -0
- src/ifverify.py +134 -0
- src/main.py +70 -0
- src/searcher.py +767 -0
- src/server.py +487 -0
- src/worker.py +219 -0
- cnks/__init__.py +0 -50
- cnks/server.py +0 -1876
- cnks-0.2.5.dist-info/METADATA +0 -181
- cnks-0.2.5.dist-info/RECORD +0 -6
- cnks-0.2.5.dist-info/entry_points.txt +0 -2
- {cnks-0.2.5.dist-info → cnks-0.3.1.dist-info}/WHEEL +0 -0
src/searcher.py
ADDED
@@ -0,0 +1,767 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
"""
|
5
|
+
关键词搜索模块(Searcher Module)
|
6
|
+
|
7
|
+
这是一个专门负责关键词搜索的模块,使用Playwright执行搜索并提取结果链接。
|
8
|
+
主要职责:
|
9
|
+
1. 接收关键词并执行搜索
|
10
|
+
2. 应用CSSCI筛选条件
|
11
|
+
3. 设置每页显示结果数
|
12
|
+
4. 提取搜索结果链接
|
13
|
+
5. 返回链接列表给工作者
|
14
|
+
"""
|
15
|
+
|
16
|
+
import logging
|
17
|
+
import os
|
18
|
+
import traceback
|
19
|
+
import asyncio
|
20
|
+
import platform
|
21
|
+
import time
|
22
|
+
from typing import List, Dict, Any, Optional
|
23
|
+
|
24
|
+
# Configure logging for the searcher module.
#
# Preferred setup: a dedicated "cnks.searcher" logger writing to both a log
# file and the console. If anything in that setup fails (for example the log
# file cannot be opened), fall back to plain console logging via basicConfig.
try:
    # The log file lives one directory above the directory holding this module.
    log_dir = os.path.dirname(os.path.abspath(__file__))
    log_file = os.path.join(os.path.dirname(log_dir), "cnks_searcher.log")

    # Messages logged by this module contain non-ASCII (Chinese) text, so pin
    # the file encoding instead of relying on the platform default.
    file_handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
    console_handler = logging.StreamHandler()

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    logger = logging.getLogger("cnks.searcher")
    logger.setLevel(logging.DEBUG)

    # Drop handlers left over from an earlier import/reload so records are not
    # emitted twice. Iterate over a copy: removing entries from
    # logger.handlers while iterating it skips every second handler.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)
        handler.close()

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # Confirm where the log is being written.
    print(f"Searcher logger initialized, logging to: {log_file}")
    logger.info(f"Searcher logging to: {log_file}")
except Exception as e:
    # Fall back to basic console logging.
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    logger = logging.getLogger("cnks.searcher")
    logger.error(f"Failed to set up file logging: {str(e)}")
    print(f"Error setting up searcher file logging: {str(e)}")
|
63
|
+
|
64
|
+
# Import Playwright. It is optional at import time: PLAYWRIGHT_AVAILABLE
# records whether the import succeeded and is checked before any browser use.
try:
    from playwright.async_api import async_playwright, Page
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    # Keep both imported names bound. Annotations in this module such as
    # Optional[Page] are evaluated when the class body runs, so an unbound
    # Page would raise NameError and make the whole module unimportable,
    # defeating the PLAYWRIGHT_AVAILABLE fallback.
    async_playwright = None
    Page = Any
    logger.error("Playwright not available. Install with: pip install playwright")
|
71
|
+
|
72
|
+
# Import the sibling helper modules (verification check, results-per-page
# setting, link extraction). Two layouts are tried in order: first as
# submodules of the "src" package, then as plain top-level modules.
#
# NOTE(review): if both attempts fail only a warning is logged and the names
# check_verification_needed, handle_verification, set_results_per_page and
# extract_links_from_page stay unbound, so later calls to them raise
# NameError. handle_verification is not referenced in the visible part of
# this module.
try:
    from src.ifverify import check_verification_needed, handle_verification
    from src.click50 import set_results_per_page
    from src.extractlink import extract_links_from_page
except ImportError:
    try:
        from ifverify import check_verification_needed, handle_verification
        from click50 import set_results_per_page
        from extractlink import extract_links_from_page
    except ImportError:
        logger.warning("无法导入验证处理、过滤和链接提取模块,部分功能可能不可用")
|
84
|
+
|
85
|
+
# Default search page URL; can be overridden with the SEARCH_URL environment
# variable.
SEARCH_URL = os.environ.get("SEARCH_URL", "https://kns.cnki.net/kns8s/search")

# JavaScript source (an arrow function, to be evaluated in the page) that
# locates a "CSSCI" element. It tries, in order:
#   1. the first <a> whose title attribute is "CSSCI"        (method 'title_link')
#   2. the first <a> whose trimmed text is exactly "CSSCI"   (method 'text_link')
#   3. the first element whose text contains "CSSCI" and whose bounding
#      rect has non-zero width and height                    (method 'any_element')
# On success it returns {found: true, x, y, method, message}, where x/y are
# the centre of the element's bounding client rect. Otherwise it returns
# {found: false, message}, plus an `error` field if an exception was thrown.
#
# NOTE(review): step 3 scans querySelectorAll('*') in document order and
# textContent includes descendant text, so the first match is likely an outer
# ancestor (e.g. <html>) rather than the CSSCI control itself.
# NOTE(review): this constant is not referenced in the visible part of this
# module; presumably it is used by another module - confirm.
js_cssci_button_finder = """
() => {
    try {
        // 首先尝试找到带有"CSSCI"文本的链接
        let foundElement = null;

        // 尝试查找带有title="CSSCI"的链接
        const cssciLinks = document.querySelectorAll('a[title="CSSCI"]');
        if (cssciLinks.length > 0) {
            foundElement = cssciLinks[0];
            return {
                found: true,
                x: foundElement.getBoundingClientRect().left + foundElement.getBoundingClientRect().width/2,
                y: foundElement.getBoundingClientRect().top + foundElement.getBoundingClientRect().height/2,
                method: 'title_link',
                message: "找到带有title='CSSCI'的链接元素"
            };
        }

        // 尝试查找包含CSSCI文本的链接
        const allLinks = document.querySelectorAll('a');
        for (const link of allLinks) {
            if (link.textContent && link.textContent.trim() === 'CSSCI') {
                foundElement = link;
                return {
                    found: true,
                    x: foundElement.getBoundingClientRect().left + foundElement.getBoundingClientRect().width/2,
                    y: foundElement.getBoundingClientRect().top + foundElement.getBoundingClientRect().height/2,
                    method: 'text_link',
                    message: "找到文本为'CSSCI'的链接元素"
                };
            }
        }

        // 尝试查找包含CSSCI文本的任何元素
        const allElements = document.querySelectorAll('*');
        for (const element of allElements) {
            if (element.textContent &&
                element.textContent.includes('CSSCI') &&
                element.getBoundingClientRect().width > 0 &&
                element.getBoundingClientRect().height > 0) {
                foundElement = element;
                return {
                    found: true,
                    x: foundElement.getBoundingClientRect().left + foundElement.getBoundingClientRect().width/2,
                    y: foundElement.getBoundingClientRect().top + foundElement.getBoundingClientRect().height/2,
                    method: 'any_element',
                    message: "找到包含'CSSCI'的元素"
                };
            }
        }

        // 未找到任何相关元素
        return {
            found: false,
            message: "未找到任何包含'CSSCI'的可点击元素"
        };
    } catch (error) {
        // 发生错误,返回错误信息
        return {
            found: false,
            error: error.toString(),
            message: "查找CSSCI元素时发生错误: " + error.toString()
        };
    }
}
"""
|
156
|
+
|
157
|
+
class Searcher:
    """Keyword searcher that drives a Playwright browser and collects result links.

    One instance owns one persistent Chromium context. Each search opens its
    own tab and closes it afterwards, while the browser itself stays open
    until close_browser() is called.
    """

    # Selector of the CSSCI source-category checkbox on the results page.
    _CSSCI_CHECKBOX_SELECTOR = 'input[type="checkbox"][value="P0209"][title="CSSCI"]'

    def __init__(self):
        """Initialise browser bookkeeping and create the debug screenshot folder."""
        self.playwright = None
        self.browser = None
        self.context = None
        self.browser_started = False

        # Directory for debug screenshots, created next to this module.
        self.debug_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "debug_screenshots")
        os.makedirs(self.debug_dir, exist_ok=True)

        logger.info("Searcher初始化完成")

    async def start_browser(self) -> bool:
        """Start Playwright and launch a persistent, headed Chromium context.

        Returns:
            bool: True if the browser is running (already started or newly
            launched), False if Playwright is missing or the launch failed.
        """
        if self.browser_started:
            logger.info("浏览器已启动,重用现有实例")
            return True

        if not PLAYWRIGHT_AVAILABLE:
            logger.error("Playwright未安装,无法启动浏览器")
            return False

        try:
            logger.info("使用持久上下文启动浏览器")

            self.playwright = await async_playwright().start()
            logger.info("Playwright已启动")

            # The browser profile is kept in a folder next to this module so
            # that it is reused between runs.
            user_data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "chrome_data")
            os.makedirs(user_data_dir, exist_ok=True)
            logger.info(f"使用Chrome用户数据目录: {user_data_dir}")

            browser_args = [
                '--start-maximized',
                '--disable-popup-blocking',
            ]

            # Prefer a locally installed Chrome; None is passed when no
            # local executable was found.
            chrome_path = self._find_chrome_executable()
            if chrome_path:
                logger.info(f"使用Chrome路径: {chrome_path}")

            self.context = await self.playwright.chromium.launch_persistent_context(
                user_data_dir=user_data_dir,
                executable_path=chrome_path or None,
                headless=False,
                args=browser_args,
            )
            logger.info("使用持久上下文启动浏览器成功")

            # Open and close a blank tab to make sure the context is usable.
            init_page = await self.context.new_page()
            await init_page.goto("about:blank")
            await init_page.close()
            logger.info("使用空白页初始化浏览器")

            self.browser_started = True
            return True

        except Exception as e:
            logger.error(f"启动浏览器时出错: {str(e)}")
            logger.error(traceback.format_exc())

            # Release whatever was created before the failure.
            await self.close_browser()
            return False

    async def close_browser(self) -> bool:
        """Close the browser context and stop Playwright.

        The two resources are released independently, so a failure while
        closing the context no longer prevents Playwright from being stopped,
        and the instance never keeps references to half-closed objects.

        Returns:
            bool: True if every resource was released without error.
        """
        logger.info("关闭浏览器资源")
        success = True

        # Detach the references first so the instance state is clean even if
        # closing raises.
        context, self.context = self.context, None
        playwright, self.playwright = self.playwright, None
        self.browser_started = False

        if context:
            try:
                await context.close()
                logger.info("浏览器上下文已关闭")
            except Exception as e:
                success = False
                logger.error(f"关闭浏览器时出错: {str(e)}")
                logger.error(traceback.format_exc())

        if playwright:
            try:
                await playwright.stop()
                logger.info("Playwright已停止")
            except Exception as e:
                success = False
                logger.error(f"关闭浏览器时出错: {str(e)}")
                logger.error(traceback.format_exc())

        if success:
            logger.info("浏览器资源已成功关闭")
        return success

    def _find_chrome_executable(self) -> Optional[str]:
        """Look for a locally installed Chrome/Chromium executable.

        Returns:
            Optional[str]: Path of the first candidate that exists and is
            executable, or None if none of the known locations matches.
        """
        system = platform.system()

        if system == "Windows":
            candidates = [
                os.path.expandvars(r"%ProgramFiles%\Google\Chrome\Application\chrome.exe"),
                os.path.expandvars(r"%ProgramFiles(x86)%\Google\Chrome\Application\chrome.exe"),
                os.path.expandvars(r"%LocalAppData%\Google\Chrome\Application\chrome.exe"),
            ]
        elif system == "Darwin":  # macOS
            candidates = [
                "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
                "~/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
            ]
        else:  # Linux and everything else
            candidates = [
                "/usr/bin/google-chrome",
                "/usr/bin/google-chrome-stable",
                "/usr/bin/chromium",
                "/usr/bin/chromium-browser",
            ]

        for path in candidates:
            expanded_path = os.path.expanduser(path)
            if os.path.exists(expanded_path) and os.access(expanded_path, os.X_OK):
                logger.info(f"找到Chrome路径: {expanded_path}")
                return expanded_path

        logger.warning("未找到Chrome可执行文件")
        return None

    async def open_page(self, url: str) -> "Optional[Page]":
        """Open a new tab and navigate it to ``url``.

        The return annotation is quoted so that the method can be defined
        even when Playwright (and therefore ``Page``) could not be imported.

        Args:
            url: Address to navigate to.

        Returns:
            The Playwright page, or None if the browser could not be started
            or navigation failed. On failure the tab is closed again so that
            failed attempts do not accumulate open tabs.
        """
        if not self.browser_started:
            success = await self.start_browser()
            if not success:
                logger.error("无法启动浏览器,放弃打开页面")
                return None

        page = None
        try:
            page = await self.context.new_page()
            logger.info(f"已创建新标签页,正在导航到URL: {url}")

            await page.goto(url, wait_until="domcontentloaded", timeout=30000)

            # If a verification page is detected, give the user 10 seconds
            # to deal with it in the headed browser window.
            verification_needed = await check_verification_needed(page)
            if verification_needed:
                logger.info("检测到需要人工验证,等待10秒钟...")
                await page.wait_for_timeout(10000)
            else:
                logger.info("未检测到验证页面,继续执行")

            # Wait until the page has finished loading.
            await page.wait_for_load_state("networkidle", timeout=30000)

            return page

        except Exception as e:
            logger.error(f"打开页面时出错: {str(e)}")
            logger.error(traceback.format_exc())

            # The caller only receives None, so nobody else can close this tab.
            if page is not None:
                try:
                    await page.close()
                except Exception as close_error:
                    logger.warning(f"Failed to close page after open error: {close_error}")
            return None

    async def _real_mouse_click(self, page, element, clicked_msg: str, fallback_msg: str) -> None:
        """Click ``element`` by simulating a real mouse press at its centre.

        Moves the mouse to the centre of the element's bounding box, presses,
        waits 50 ms and releases. If the element has no bounding box, falls
        back to the element's ordinary click().

        Args:
            page: Playwright page that owns the mouse.
            element: Element handle to click.
            clicked_msg: Message logged (info) after the simulated click.
            fallback_msg: Message logged (warning) before the ordinary click.
        """
        bbox = await element.bounding_box()
        if not bbox:
            logger.warning(fallback_msg)
            await element.click()
            return

        x = bbox["x"] + bbox["width"] / 2
        y = bbox["y"] + bbox["height"] / 2

        await page.mouse.move(x, y)
        await page.mouse.down()
        await asyncio.sleep(0.05)
        await page.mouse.up()
        logger.info(clicked_msg)

    async def _click_cssci_and_apply(self, page, element, found_msg: str, label: str) -> bool:
        """Click a CSSCI element, press the apply button and wait for the reload.

        Args:
            page: Playwright page showing the search results.
            element: Checkbox or text element to click.
            found_msg: Message logged before clicking.
            label: Element description inserted into the click log messages.

        Returns:
            bool: Result of click_apply_button().
        """
        logger.info(found_msg)
        await self._real_mouse_click(
            page,
            element,
            f"成功模拟真实鼠标点击{label}",
            f"无法获取{label}位置,使用普通点击",
        )

        # Give the checkbox state a second to update.
        await asyncio.sleep(1)

        logger.info("点击应用按钮应用筛选")
        apply_result = await self.click_apply_button(page)

        # Wait for the page to refresh.
        await page.wait_for_load_state("networkidle")
        return apply_result

    async def apply_cssci_filter(self, page) -> Dict[str, Any]:
        """Apply the CSSCI source filter on a search results page.

        Lookup order: the exact CSSCI checkbox; then a checkbox near the text
        "CSSCI" inside the source-category area; then any element with the
        text "CSSCI" (only tried when the source-category area exists).

        Args:
            page: Playwright page showing the search results.

        Returns:
            Dict with keys:
              - "success": True unless an exception occurred. Not finding the
                filter still counts as success so processing can continue.
              - "filter_applied": True if a CSSCI element was clicked or was
                already ticked.
              - "message": human-readable description of what happened.
        """
        logger.info("开始应用CSSCI筛选")
        result = {
            "success": True,
            "message": "",
            "filter_applied": False
        }

        try:
            # Read the live checked state. Matching the checked="checked"
            # attribute only reflects the initial markup, so a box ticked at
            # runtime would be missed and then clicked again (un-ticking it).
            cssci_checked = await page.evaluate("""
                () => {
                    const checkbox = document.querySelector('input[type="checkbox"][value="P0209"][title="CSSCI"]');
                    return !!(checkbox && checkbox.checked);
                }
            """)

            if cssci_checked:
                logger.info("CSSCI选项已被选中,现在点击应用按钮")
                # Press the apply button anyway to make sure the filter is active.
                apply_result = await self.click_apply_button(page)
                result["filter_applied"] = True
                result["message"] = f"CSSCI选项已被选中,应用按钮点击结果: {apply_result}"
                return result

            cssci_checkbox = await page.query_selector(self._CSSCI_CHECKBOX_SELECTOR)
            if cssci_checkbox:
                apply_result = await self._click_cssci_and_apply(
                    page,
                    cssci_checkbox,
                    "找到CSSCI复选框,使用真实鼠标点击模拟",
                    "CSSCI复选框",
                )
                result["filter_applied"] = True
                result["message"] = f"成功勾选CSSCI选项并点击应用按钮,结果: {apply_result}"
                return result

            # The exact selector did not match: look for the source-category area.
            source_category = await page.query_selector('.source-category, .filter-item:has-text("来源类别")')
            logger.debug("[DEBUG] 尝试查找来源类别区域")

            if not source_category:
                logger.debug("[DEBUG] 未找到来源类别区域")
                result["message"] = "未找到来源类别区域和CSSCI选项"
                return result

            logger.debug("[DEBUG] 找到了来源类别区域")

            cssci_checkbox = await source_category.query_selector('input[type="checkbox"]:near(:text("CSSCI"))')
            if cssci_checkbox:
                apply_result = await self._click_cssci_and_apply(
                    page,
                    cssci_checkbox,
                    "找到CSSCI复选框(在来源类别区域内),使用真实鼠标点击模拟",
                    "CSSCI复选框",
                )
                result["filter_applied"] = True
                result["message"] = f"成功勾选CSSCI选项并点击应用按钮,结果: {apply_result}"
                return result

            logger.debug("[DEBUG] 在来源类别区域未找到CSSCI选项")

            # Last resort: any element on the page with the text "CSSCI".
            cssci_text = await page.query_selector(':text("CSSCI")')
            if not cssci_text:
                result["message"] = "未找到CSSCI选项"
                return result

            apply_result = await self._click_cssci_and_apply(
                page,
                cssci_text,
                "找到CSSCI文本,使用真实鼠标点击模拟",
                "CSSCI文本",
            )
            result["filter_applied"] = True
            result["message"] = f"通过文本找到并点击了CSSCI,应用按钮点击结果: {apply_result}"
            return result

        except Exception as e:
            logger.error(f"应用CSSCI筛选时发生错误: {str(e)}")
            logger.error(traceback.format_exc())

            # An exception is a real failure, unlike "filter not found".
            result["success"] = False
            result["message"] = f"应用CSSCI筛选时发生错误: {str(e)}"
            return result

    async def _wait_for_network_idle(self, page, timeout: int) -> None:
        """Wait for the page to become network-idle, treating a timeout as non-fatal.

        Args:
            page: Playwright page to wait on.
            timeout: Maximum wait in milliseconds.
        """
        try:
            await page.wait_for_load_state("networkidle", timeout=timeout)
        except Exception as e:
            logger.warning(f"Page did not reach network idle within {timeout} ms: {e}")

    async def click_apply_button(self, page) -> bool:
        """Try to click a button that applies the current filter selection.

        A list of selectors is tried first, using a simulated real mouse
        click. If none matches, a JavaScript fallback clicks the first
        button-like element with matching text, then a search button, and
        finally submits the first form on the page.

        Args:
            page: Playwright page showing the search results.

        Returns:
            bool: True if something was clicked or submitted, False otherwise.
        """
        # Wait a second so the page state is up to date.
        await asyncio.sleep(1)

        # NOTE(review): the button[text="..."] entries are CSS attribute
        # selectors on an attribute named "text"; they only match elements
        # that literally carry such an attribute. Left unchanged here.
        apply_buttons = [
            '.filter-button',
            '.apply-filter',
            'button[text="筛选"]',
            'button[text="应用"]',
            'button[text="确定"]',
            'input[type="button"][value="确定"]',
            '.btn-primary',
            '#btn_search'
        ]

        for btn_selector in apply_buttons:
            try:
                button = await page.query_selector(btn_selector)
                if not button:
                    continue

                logger.info(f"找到应用按钮: {btn_selector},使用真实鼠标点击模拟")
                await self._real_mouse_click(
                    page,
                    button,
                    f"成功模拟真实鼠标点击应用按钮: {btn_selector}",
                    f"无法获取按钮 '{btn_selector}' 位置,使用普通点击",
                )
            except Exception as e:
                logger.warning(f"点击按钮 '{btn_selector}' 失败: {str(e)}")
                continue

            # The click itself succeeded. A slow page must not be reported as
            # a failed click, otherwise the loop would go on to click further
            # buttons on a page that is already reloading.
            await self._wait_for_network_idle(page, 10000)
            return True

        # No selector matched: try to apply the filter through JavaScript.
        logger.info("尝试通过JavaScript应用筛选")
        apply_js = """
        () => {
            try {
                // 尝试找到并点击筛选应用按钮
                const buttons = document.querySelectorAll('button, input[type="button"], a.btn');
                for (const button of buttons) {
                    if (button.textContent &&
                        (button.textContent.includes('筛选') ||
                         button.textContent.includes('应用') ||
                         button.textContent.includes('确定') ||
                         button.textContent.includes('搜索'))) {
                        button.click();
                        return { clicked: true, text: button.textContent.trim() };
                    }
                }

                // 尝试查找搜索按钮
                const searchBtn = document.querySelector('#btn_search, .search-button, button[onclick*="search"]');
                if (searchBtn) {
                    searchBtn.click();
                    return { clicked: true, type: 'search' };
                }

                // 尝试提交表单
                const form = document.querySelector('form');
                if (form) {
                    form.submit();
                    return { clicked: true, type: 'form' };
                }

                return { clicked: false };
            } catch (e) {
                return { clicked: false, error: e.toString() };
            }
        }
        """

        apply_result = await page.evaluate(apply_js)
        if apply_result.get('clicked', False):
            logger.info(f"通过JavaScript应用筛选成功: {apply_result}")
            await self._wait_for_network_idle(page, 10000)
            return True

        logger.warning("未找到应用筛选的按钮")
        return False

    async def search_keyword(self, keyword: str) -> List[str]:
        """Search for ``keyword`` and return the links found in the results.

        Steps: open the search page, submit the keyword, wait for results,
        try to show 50 results per page, try to apply the CSSCI filter, then
        extract the links. The tab is closed afterwards; the browser stays open.

        Args:
            keyword: Search term.

        Returns:
            List[str]: Extracted result links. Empty if the browser or page
            could not be opened, no search input was found, or an error occurred.

        Raises:
            RuntimeError: If Playwright is not installed.
        """
        if not PLAYWRIGHT_AVAILABLE:
            error_msg = "Playwright未安装,无法执行搜索"
            logger.error(error_msg)
            raise RuntimeError(error_msg)

        if not self.browser_started:
            success = await self.start_browser()
            if not success:
                logger.error("无法启动浏览器,放弃搜索")
                return []

        page = None
        try:
            logger.info(f"搜索关键词: {keyword}")

            page = await self.open_page(SEARCH_URL)
            if not page:
                logger.error("无法打开搜索页面")
                return []

            logger.info("正在输入搜索关键词")
            search_input = await page.query_selector('#txt_search-input')
            submitted = False

            if search_input:
                await search_input.fill(keyword)
                await search_input.press('Enter')
                submitted = True
            else:
                logger.warning("未找到搜索输入框")
                # Try other selectors the search box might use.
                alternate_selectors = [
                    'input[type="search"]',
                    '.search-input',
                    '#search-input'
                ]
                for selector in alternate_selectors:
                    try:
                        input_field = await page.query_selector(selector)
                        if input_field:
                            logger.info(f"找到替代搜索输入框: {selector}")
                            await input_field.fill(keyword)
                            await input_field.press('Enter')
                            submitted = True
                            break
                    except Exception as e:
                        logger.debug(f"Alternate search input '{selector}' failed: {e}")
                        continue

            if not submitted:
                # Without a submitted query the page is not a result list;
                # extracting links from it would return unrelated URLs.
                logger.error("No usable search input found; aborting search")
                return []

            logger.info("等待搜索结果")
            await page.wait_for_load_state("networkidle", timeout=60000)

            # A missing result list is only logged; the later steps still run.
            try:
                logger.info("等待结果列表出现")
                await page.wait_for_selector('.result-table-list', timeout=10000)
                logger.info("已找到结果列表")
            except Exception as e:
                logger.warning(f"未找到结果列表: {str(e)}")

            # Best effort: show 50 results per page.
            logger.info("设置每页显示50条结果")
            try:
                page_setting_result = await set_results_per_page(page)
                if page_setting_result.get("success", False):
                    if page_setting_result.get("setting_applied", False):
                        logger.info("成功设置每页显示50条结果")
                        await page.wait_for_load_state("networkidle", timeout=10000)
                        # Extra 1000 ms so the page is fully rendered.
                        logger.info("增加1000ms延时以确保页面完全加载")
                        await asyncio.sleep(1)
                    else:
                        logger.info(f"显示设置未应用: {page_setting_result.get('message', '')}")
                else:
                    logger.warning(f"设置显示数量失败: {page_setting_result.get('message', '')}")
            except Exception as e:
                logger.warning(f"设置显示数量时出错: {str(e)}")
                logger.warning(traceback.format_exc())

            # Best effort: apply the CSSCI filter.
            logger.info("应用CSSCI筛选(如果可用)")
            try:
                filter_result = await self.apply_cssci_filter(page)
                if filter_result.get("success", False):
                    if filter_result.get("filter_applied", False):
                        logger.info("CSSCI筛选成功应用")
                    else:
                        logger.info(f"CSSCI筛选未应用: {filter_result.get('message', '')}")
                else:
                    logger.warning(f"应用CSSCI筛选失败: {filter_result.get('message', '')}")
            except Exception as e:
                logger.warning(f"CSSCI筛选时出错: {str(e)}")
                logger.warning(traceback.format_exc())

            # Fixed extra wait so the results have finished loading.
            logger.info("额外等待几秒确保结果加载完毕")
            await asyncio.sleep(5)

            logger.info("使用extractlink模块从搜索结果中提取链接")
            links = await extract_links_from_page(page)
            logger.info(f"提取到 {len(links)} 个链接")

            return links

        except Exception as e:
            logger.error(f"搜索关键词 {keyword} 时出错: {str(e)}")
            logger.error(traceback.format_exc())
            return []

        finally:
            # Close the tab but keep the browser open. A failure here must
            # not replace the value being returned.
            if page:
                try:
                    await page.close()
                    logger.info("已关闭搜索页面,保持浏览器打开")
                except Exception as e:
                    logger.warning(f"Failed to close search page: {e}")
|