cnks 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cnks-0.3.1.dist-info/METADATA +101 -0
- cnks-0.3.1.dist-info/RECORD +17 -0
- cnks-0.3.1.dist-info/entry_points.txt +5 -0
- src/ThisIsAServerSample.py +377 -0
- src/__init__.py +7 -0
- src/cache.py +451 -0
- src/citzer.py +868 -0
- src/click50.py +527 -0
- src/client.py +135 -0
- src/cssci.py +267 -0
- src/extractlink.py +262 -0
- src/ifverify.py +134 -0
- src/main.py +70 -0
- src/searcher.py +767 -0
- src/server.py +487 -0
- src/worker.py +219 -0
- cnks/__init__.py +0 -50
- cnks/server.py +0 -1876
- cnks-0.2.5.dist-info/METADATA +0 -181
- cnks-0.2.5.dist-info/RECORD +0 -6
- cnks-0.2.5.dist-info/entry_points.txt +0 -2
- {cnks-0.2.5.dist-info → cnks-0.3.1.dist-info}/WHEEL +0 -0
src/server.py
ADDED
@@ -0,0 +1,487 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
"""
|
5
|
+
引文服务器模块(Server Module)
|
6
|
+
|
7
|
+
处理用户请求并直接调用Worker API进行处理,不再使用队列通信。
|
8
|
+
使用Cache存储和检索搜索结果。
|
9
|
+
|
10
|
+
Exposed Tools:
|
11
|
+
- search_keyword: 搜索指定关键词并获取相关引用
|
12
|
+
- process_link: 处理特定链接并提取引用信息
|
13
|
+
- close_browser: 关闭浏览器实例
|
14
|
+
"""
|
15
|
+
|
16
|
+
import asyncio
|
17
|
+
import json
|
18
|
+
import logging
|
19
|
+
import os
|
20
|
+
import sys
|
21
|
+
import time
|
22
|
+
import traceback
|
23
|
+
import uuid
|
24
|
+
from typing import Dict, List, Any, Optional
|
25
|
+
|
26
|
+
# 配置日志记录
|
27
|
+
# Logging for the server module: one file handler plus one console handler on
# the "cnks.server" logger, with a basicConfig fallback if that setup fails.
try:
    # Resolve the log file from this module's location (one directory above
    # the module) so it does not depend on the current working directory.
    log_dir = os.path.dirname(os.path.abspath(__file__))
    log_file = os.path.join(os.path.dirname(log_dir), "cnks_server.log")

    # Log messages in this module contain non-ASCII text, so the file
    # encoding is pinned instead of inheriting the platform default.
    file_handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
    console_handler = logging.StreamHandler()

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    logger = logging.getLogger("cnks.server")
    logger.setLevel(logging.DEBUG)

    # Remove handlers left on the logger to avoid duplicated output.
    # Iterate over a copy: removing entries from logger.handlers while
    # looping over that same list skips every other handler.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # This module can serve MCP over stdio (see main), where stdout is the
    # protocol channel, so the confirmation goes to stderr.
    print(f"Server logger initialized, logging to: {log_file}", file=sys.stderr)
    logger.info(f"Server logging to: {log_file}")
except Exception as e:
    # Fall back to plain console logging when the file handler cannot be set up.
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    logger = logging.getLogger("cnks.server")
    logger.error(f"Failed to set up file logging: {str(e)}")
    print(f"Error setting up server file logging: {str(e)}", file=sys.stderr)
|
65
|
+
|
66
|
+
# 导入MCP服务器模块
|
67
|
+
# Import the MCP SDK and record in MCP_AVAILABLE whether that succeeded.
# NOTE(review): later module-level code still calls Server(...) and uses
# types.* in annotations unconditionally, so a missing SDK currently fails at
# import time before main() can use the simple-server fallback - confirm intent.
try:
    from mcp.server.models import InitializationOptions
    import mcp.types as types
    from mcp.server import NotificationOptions, Server
    import mcp.server.stdio
    MCP_AVAILABLE = True
except ImportError:
    MCP_AVAILABLE = False
    # The SDK's distribution name on PyPI is "mcp".
    logger.error("MCP not available. Install with: pip install mcp")
|
76
|
+
|
77
|
+
# 尝试导入dotenv支持环境变量
|
78
|
+
# python-dotenv is optional: load a .env file when the package is present,
# otherwise only warn.
try:
    from dotenv import load_dotenv
except ImportError:
    logger.warning("dotenv not available, environment variables may not be loaded")
else:
    load_dotenv()
|
83
|
+
|
84
|
+
# 导入Worker模块
|
85
|
+
# Import Worker from the installed package layout first, then from a flat
# layout (module next to this file); fail loudly if neither works.
try:
    from src.worker import Worker
except ImportError:
    try:
        from worker import Worker
    except ImportError as import_err:
        logger.error("Worker module not available")
        # Chain the original error so the real cause stays in the traceback.
        raise ImportError("Worker module not available") from import_err
|
93
|
+
|
94
|
+
# 初始化MCP服务器
|
95
|
+
# MCP server object; the handler decorators below register on it.
server = Server("CNKS Server")

# Single Worker instance shared by all tool handlers in this module.
worker_instance = Worker()

# Bookkeeping for tool calls, keyed by a per-call UUID string. Each value
# records the request input, a status string and a start timestamp.
active_requests: Dict[str, Dict[str, Any]] = dict()
|
102
|
+
|
103
|
+
@server.list_resources()
async def handle_list_resources() -> list[types.Resource]:
    """Return the resources this server offers; there are none."""
    no_resources: list[types.Resource] = []
    return no_resources
|
109
|
+
|
110
|
+
@server.read_resource()
async def handle_read_resource(uri: str) -> str:
    """Reject every resource read, since no resources are exposed.

    Args:
        uri: URI of the requested resource.

    Raises:
        ValueError: always, naming the rejected URI.
    """
    message = f"不支持的URI方案: {uri}"
    raise ValueError(message)
|
116
|
+
|
117
|
+
@server.list_prompts()
async def handle_list_prompts() -> list[types.Prompt]:
    """Return the prompt templates this server offers; there are none."""
    no_prompts: list[types.Prompt] = []
    return no_prompts
|
123
|
+
|
124
|
+
@server.get_prompt()
async def handle_get_prompt(
    name: str, arguments: dict[str, str] | None
) -> types.GetPromptResult:
    """Reject every prompt lookup, since no prompt templates are defined.

    Args:
        name: Name of the requested prompt template.
        arguments: Template arguments; unused.

    Raises:
        ValueError: always, naming the unknown template.
    """
    message = f"未知提示模板: {name}"
    raise ValueError(message)
|
132
|
+
|
133
|
+
@server.list_tools()
async def handle_list_tools() -> list[types.Tool]:
    """Describe the tools exposed by this server.

    Returns:
        Three tool descriptions (search_keyword, process_link,
        close_browser), each with a JSON-schema description of its input.
    """
    keyword_schema = {
        "type": "object",
        "properties": {
            "keyword": {"type": "string", "description": "要搜索的关键词"},
        },
        "required": ["keyword"],
    }
    link_schema = {
        "type": "object",
        "properties": {
            "link": {"type": "string", "description": "要处理的文章链接"},
        },
        "required": ["link"],
    }
    # close_browser takes no arguments.
    empty_schema = {
        "type": "object",
        "properties": {},
    }

    search_tool = types.Tool(
        name="search_keyword",
        description="搜索指定关键词并获取相关引用",
        inputSchema=keyword_schema,
    )
    link_tool = types.Tool(
        name="process_link",
        description="处理特定链接并提取引用信息",
        inputSchema=link_schema,
    )
    close_tool = types.Tool(
        name="close_browser",
        description="关闭浏览器资源",
        inputSchema=empty_schema,
    )
    return [search_tool, link_tool, close_tool]
|
173
|
+
|
174
|
+
class ToolHandler:
    """Base class for tool handlers; subclasses override handle()."""

    def __init__(self):
        """Nothing to initialise; handlers hold no state of their own."""

    async def handle(self, name: str, arguments: dict | None) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
        """Execute the tool call.

        Args:
            name: Name of the tool being invoked.
            arguments: Tool arguments, or None.

        Raises:
            NotImplementedError: always in the base class.
        """
        raise NotImplementedError("子类必须实现handle方法")
|
183
|
+
|
184
|
+
class SearchKeywordToolHandler(ToolHandler):
    """Tool handler that runs a keyword search through the shared worker."""

    async def handle(self, name: str, arguments: dict | None) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
        """Process the "keyword" argument with worker_instance.process_keyword.

        Args:
            name: Tool name; unused here.
            arguments: Tool arguments; the "keyword" entry is read. None is
                treated like an empty dict.

        Returns:
            A single-element list with a TextContent: the worker result as
            indented JSON on success, or a text starting with "错误: " when
            the keyword is missing or processing raised.
        """
        # The signature allows None, so guard before calling .get on it.
        keyword = (arguments or {}).get("keyword", "")

        if not keyword:
            return [types.TextContent(type="text", text="错误: 关键词不能为空")]

        message_id = str(uuid.uuid4())
        logger.info(f"开始处理关键词: {keyword}")

        try:
            # Record the request before doing any work so the except branch
            # below can always update its status.
            active_requests[message_id] = {
                "keyword": keyword,
                "status": "processing",
                "timestamp": time.time()
            }

            result = await worker_instance.process_keyword(keyword)

            active_requests[message_id]["status"] = "completed"

            return [types.TextContent(type="text", text=json.dumps(result, ensure_ascii=False, indent=2))]

        except Exception as e:
            # Failures are reported to the caller as text, not re-raised.
            error_msg = f"处理关键词 '{keyword}' 时出错: {str(e)}"
            logger.error(error_msg)
            logger.error(traceback.format_exc())

            active_requests[message_id]["status"] = "error"
            active_requests[message_id]["error"] = str(e)

            return [types.TextContent(type="text", text=f"错误: {error_msg}")]
|
221
|
+
|
222
|
+
class ProcessLinkToolHandler(ToolHandler):
    """Tool handler that extracts citation data for a single article link."""

    async def handle(self, name: str, arguments: dict | None) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
        """Process the "link" argument with worker_instance.citzer.process_link.

        Args:
            name: Tool name; unused here.
            arguments: Tool arguments; the "link" entry is read. None is
                treated like an empty dict.

        Returns:
            A single-element list with a TextContent: the citzer result as
            indented JSON on success, or a text starting with "错误: " when
            the link is missing or processing raised.
        """
        # The signature allows None, so guard before calling .get on it.
        link = (arguments or {}).get("link", "")

        if not link:
            return [types.TextContent(type="text", text="错误: 链接不能为空")]

        message_id = str(uuid.uuid4())
        logger.info(f"开始处理链接: {link}")

        try:
            # Record the request before doing any work so the except branch
            # below can always update its status.
            active_requests[message_id] = {
                "link": link,
                "status": "processing",
                "timestamp": time.time()
            }

            # This calls the worker's citzer directly rather than a Worker method.
            result = await worker_instance.citzer.process_link(link)
            logger.info(f"链接 '{link}' 处理完成")

            active_requests[message_id]["status"] = "completed"

            return [types.TextContent(type="text", text=json.dumps(result, ensure_ascii=False, indent=2))]

        except Exception as e:
            # Failures are reported to the caller as text, not re-raised.
            error_msg = f"处理链接 '{link}' 时出错: {str(e)}"
            logger.error(error_msg)
            logger.error(traceback.format_exc())

            active_requests[message_id]["status"] = "error"
            active_requests[message_id]["error"] = str(e)

            return [types.TextContent(type="text", text=f"错误: {error_msg}")]
|
260
|
+
|
261
|
+
class CloseBrowserToolHandler(ToolHandler):
    """Tool handler that releases the shared worker's resources."""

    async def handle(self, name: str, arguments: dict | None) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
        """Call worker_instance.close() and report the outcome as text.

        Args:
            name: Tool name; unused here.
            arguments: Tool arguments; unused here.

        Returns:
            A single-element list with a TextContent holding either a
            confirmation or a text starting with "错误: ".
        """
        try:
            await worker_instance.close()
            logger.info("浏览器资源已关闭")
            confirmation = types.TextContent(type="text", text="浏览器资源已成功关闭")
            return [confirmation]
        except Exception as exc:
            failure_text = f"关闭浏览器失败: {str(exc)}"
            logger.error(failure_text)
            failure = types.TextContent(type="text", text=f"错误: {failure_text}")
            return [failure]
|
271
|
+
|
272
|
+
# 工具处理器映射表
|
273
|
+
# Dispatch table from tool name to the handler instance that serves it.
tool_handlers: Dict[str, ToolHandler] = dict(
    search_keyword=SearchKeywordToolHandler(),
    process_link=ProcessLinkToolHandler(),
    close_browser=CloseBrowserToolHandler(),
)
|
278
|
+
|
279
|
+
@server.call_tool()
async def handle_call_tool(
    name: str, arguments: dict | None
) -> list[types.TextContent | types.ImageContent | types.EmbeddedResource]:
    """Route a tool call to the handler registered under its name.

    Args:
        name: Tool name.
        arguments: Tool arguments dictionary, or None.

    Returns:
        The handler's content list, or a single error TextContent when no
        handler is registered for the name.
    """
    logger.info(f"收到工具调用请求: {name}, 参数: {arguments}")

    selected = tool_handlers.get(name)
    if selected is None:
        # Unknown tools are reported as text rather than raised.
        logger.error(f"未知工具: {name}")
        return [types.TextContent(type="text", text=f"错误: 未知工具: {name}")]

    return await selected.handle(name, arguments)
|
300
|
+
|
301
|
+
def cleanup_expired_requests(max_age_seconds: float = 1800) -> None:
    """Remove entries from active_requests that are older than the limit.

    Args:
        max_age_seconds: Age in seconds above which a request counts as
            expired. Defaults to 1800 (30 minutes).
    """
    now = time.time()

    # Collect the ids first so the dict is not modified while iterating it.
    expired_ids = [
        msg_id
        for msg_id, request in active_requests.items()
        if now - request["timestamp"] > max_age_seconds
    ]

    for msg_id in expired_ids:
        active_requests.pop(msg_id, None)
        logger.info(f"已清理过期请求: {msg_id}")
|
315
|
+
|
316
|
+
# Prefix the tool handlers in this module put on failure messages.
_SIMPLE_ERROR_PREFIX = "错误: "


def _simple_response_from_content(result_content) -> dict:
    """Convert a tool handler's content list into a simple-protocol response."""
    if not result_content:
        return {"status": "error", "message": "处理器未返回结果"}

    text = result_content[0].text

    # A prefixed text is a failure report; surface it as an error status
    # instead of wrapping it in a "success" response.
    if text.startswith(_SIMPLE_ERROR_PREFIX):
        return {"status": "error", "message": text[len(_SIMPLE_ERROR_PREFIX):]}

    try:
        return {"status": "success", "result": json.loads(text)}
    except json.JSONDecodeError:
        # Plain-text success messages are wrapped rather than parsed.
        return {"status": "success", "result": {"message": text}}


async def _dispatch_simple_request(request_data) -> dict:
    """Run the tool named in a parsed request and build the response dict."""
    if request_data.get("type") != "tool_call":
        return {"status": "error", "message": f"未知请求类型: {request_data.get('type')}"}

    tool_name = request_data.get("tool")
    # "params" may be absent or null; both are treated as no parameters.
    params = request_data.get("params") or {}

    # Only string names can be registry keys; anything else is unknown.
    handler = tool_handlers.get(tool_name) if isinstance(tool_name, str) else None
    if handler is None:
        return {"status": "error", "message": f"未知工具: {tool_name}"}

    return _simple_response_from_content(await handler.handle(tool_name, params))


async def handle_simple_request(reader, writer):
    """Serve one newline-delimited JSON request on a stream connection.

    This is the simplified protocol used when the MCP stdio transport is not
    in use. One JSON object is read from a single line, the named tool is
    looked up in tool_handlers and run, and one JSON line is written back:
    {"status": "success", "result": ...} or {"status": "error",
    "message": ...}. The connection is closed afterwards.

    Args:
        reader: Stream to read the request line from.
        writer: Stream to write the response line to.
    """

    def _send(payload: dict) -> None:
        # Every response is a single UTF-8 encoded JSON line.
        writer.write(json.dumps(payload, ensure_ascii=False).encode('utf-8') + b'\n')

    try:
        request_line = await reader.readline()
        if not request_line:
            logger.error("收到空请求")
            return

        request_data = json.loads(request_line.decode('utf-8'))
        logger.info(f"收到请求: {request_data}")

        response = await _dispatch_simple_request(request_data)

        _send(response)
        await writer.drain()

    except json.JSONDecodeError:
        logger.error("无法解析JSON请求")
        _send({"status": "error", "message": "无法解析JSON请求"})
        await writer.drain()
    except Exception as e:
        logger.error(f"处理请求时出错: {str(e)}")
        logger.error(traceback.format_exc())
        _send({"status": "error", "message": f"服务器错误: {str(e)}"})
        await writer.drain()
    finally:
        writer.close()
|
415
|
+
|
416
|
+
async def run_simple_server():
    """Serve the simple JSON request/response protocol until cancelled.

    The bind address comes from the CNKS_HOST and CNKS_PORT environment
    variables, defaulting to 127.0.0.1 and 8000.
    """
    bind_host = os.environ.get("CNKS_HOST", "127.0.0.1")
    bind_port = int(os.environ.get("CNKS_PORT", "8000"))

    # Named "listener" so it does not shadow the module-level MCP server.
    listener = await asyncio.start_server(handle_simple_request, bind_host, bind_port)

    bound_address = listener.sockets[0].getsockname()
    logger.info(f'简易服务器运行在 {bound_address}')

    async with listener:
        await listener.serve_forever()
|
433
|
+
|
434
|
+
async def main():
    """Start the server in the mode that fits the environment.

    The simple JSON server is used when the MCP SDK is unavailable or when
    stdin is a terminal; otherwise the MCP server runs over stdio. The
    shared worker is closed on the way out in every case.
    """
    try:
        # Status text goes to stderr: in stdio mode stdout is the MCP
        # protocol channel and must carry protocol messages only.
        print("正在启动CNKS服务器...", file=sys.stderr)
        logger.info("正在启动CNKS服务器")

        if not MCP_AVAILABLE:
            logger.warning("MCP模块不可用,使用简单服务器替代")
            await run_simple_server()
            return

        if sys.stdin.isatty():
            # Interactive terminal: no client is attached to stdin/stdout.
            logger.info("从命令行运行,启动简单服务器")
            await run_simple_server()
        else:
            # stdin is not a terminal, so serve MCP over stdio.
            logger.info("检测到标准输入/输出流,启动MCP标准服务器")
            async with mcp.server.stdio.stdio_server() as (read_stream, write_stream):
                await server.run(
                    read_stream,
                    write_stream,
                    InitializationOptions(
                        server_name="CNKS-server",
                        server_version="0.1.0",
                        capabilities=server.get_capabilities(
                            notification_options=NotificationOptions(),
                            experimental_capabilities={},
                        ),
                    ),
                )

    except KeyboardInterrupt:
        print("\n收到中断信号,服务器正在关闭...", file=sys.stderr)
        logger.info("收到中断信号,服务器正在关闭")
    except Exception as e:
        logger.error(f"服务器启动失败: {str(e)}")
        logger.error(traceback.format_exc())
        print(f"服务器启动失败: {str(e)}", file=sys.stderr)
    finally:
        # Release the worker's resources whichever path was taken.
        try:
            await worker_instance.close()
            logger.info("Worker资源已关闭")
        except Exception as e:
            logger.error(f"关闭Worker资源时出错: {str(e)}")


if __name__ == "__main__":
    # Run the server when executed as a script.
    asyncio.run(main())
|
src/worker.py
ADDED
@@ -0,0 +1,219 @@
|
|
1
|
+
#!/usr/bin/env python
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
|
4
|
+
"""
|
5
|
+
引文工作者模块(Worker Module)
|
6
|
+
|
7
|
+
这是处理引文数据请求的主要模块,处理从服务器接收的关键词,
|
8
|
+
管理搜索和数据提取流程,并与缓存系统交互。
|
9
|
+
|
10
|
+
主要职责:
|
11
|
+
1. 协调缓存、搜索和内容提取模块之间的交互
|
12
|
+
2. 处理来自服务器的关键词请求
|
13
|
+
3. 管理缓存查询和更新
|
14
|
+
4. 控制处理流程和结果返回
|
15
|
+
"""
|
16
|
+
|
17
|
+
import asyncio
|
18
|
+
import json
|
19
|
+
import logging
|
20
|
+
import os
|
21
|
+
import traceback
|
22
|
+
import time
|
23
|
+
from typing import Dict, List, Any, Optional, Union
|
24
|
+
|
25
|
+
# 配置日志记录
|
26
|
+
# Logging for the worker module: one file handler plus one console handler on
# the "cnks.worker" logger, with a basicConfig fallback if that setup fails.
import sys  # needed only to route the confirmation prints to stderr

try:
    # Resolve the log file from this module's location (one directory above
    # the module) so it does not depend on the current working directory.
    log_dir = os.path.dirname(os.path.abspath(__file__))
    log_file = os.path.join(os.path.dirname(log_dir), "cnks_worker.log")

    # Log messages in this module contain non-ASCII text, so the file
    # encoding is pinned instead of inheriting the platform default.
    file_handler = logging.FileHandler(log_file, mode="a", encoding="utf-8")
    console_handler = logging.StreamHandler()

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    file_handler.setFormatter(formatter)
    console_handler.setFormatter(formatter)

    logger = logging.getLogger("cnks.worker")
    logger.setLevel(logging.DEBUG)

    # Remove handlers left on the logger to avoid duplicated output.
    # Iterate over a copy: removing entries from logger.handlers while
    # looping over that same list skips every other handler.
    for handler in list(logger.handlers):
        logger.removeHandler(handler)

    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    # NOTE(review): stderr is used so that importing this module never writes
    # to stdout - presumably the importing process may use stdout as a
    # protocol channel; confirm against the server entry point.
    print(f"Worker logger initialized, logging to: {log_file}", file=sys.stderr)
    logger.info(f"Worker logging to: {log_file}")
except Exception as e:
    # Fall back to plain console logging when the file handler cannot be set up.
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    logger = logging.getLogger("cnks.worker")
    logger.error(f"Failed to set up file logging: {str(e)}")
    print(f"Error setting up worker file logging: {str(e)}", file=sys.stderr)
|
64
|
+
|
65
|
+
# 导入其他模块
|
66
|
+
# Import the collaborating modules from the package layout first, then from a
# flat layout (modules next to this file).
try:
    from src.searcher import Searcher
    from src.citzer import Citzer
    from src.cache import Cache
except ImportError:
    try:
        from searcher import Searcher
        from citzer import Citzer
        from cache import Cache
    except ImportError:
        # Best effort: the module still loads, but the names stay undefined.
        # exc_info keeps the underlying import failure in the log so it can
        # be diagnosed.
        logger.warning("无法导入searcher、citzer或cache模块,功能将受限", exc_info=True)
|
77
|
+
|
78
|
+
class Worker:
    """Coordinates the searcher, the citation extractor and the cache."""

    def __init__(self):
        """Create the Searcher, Citzer and Cache this worker drives."""
        self.searcher = Searcher()
        self.citzer = Citzer()
        # Citzer starts out flagged as having no browser; process_keyword
        # hands it the Searcher's browser objects when one is running.
        self.citzer.browser_started = False
        self.cache = Cache()
        logger.info("Worker初始化完成")

    async def _ensure_links_cached(self, keyword: str) -> None:
        """Search for the keyword and cache its links unless already cached."""
        if self.cache.has_keyword(keyword):
            logger.info(f"缓存中已有关键词 {keyword}")
            return

        logger.info(f"缓存中没有关键词 {keyword},执行搜索")
        found_links = await self.searcher.search_keyword(keyword)
        logger.info(f"搜索到 {len(found_links)} 个链接")

        self.cache.add_links(keyword, found_links)
        logger.info(f"已将关键词 {keyword} 的链接存入缓存")

    def _lend_browser_to_citzer(self) -> None:
        """Point the Citzer at the Searcher's browser objects if one is running."""
        if not self.searcher.browser_started or self.citzer.browser_started:
            return
        self.citzer.context = self.searcher.context
        self.citzer.playwright = self.searcher.playwright
        self.citzer.browser_started = True
        logger.info("已将Searcher的浏览器实例共享给Citzer")

    async def _drain_unprocessed_links(self, keyword: str) -> None:
        """Process the keyword's cached links until none is left unprocessed."""
        pending = self.cache.get_unprocessed_link(keyword)
        while pending:
            logger.info(f"处理链接: {pending}")

            extracted = await self.citzer.process_link(pending)
            if extracted:
                self.cache.add_result(pending, extracted)
                logger.info(f"已将链接 {pending} 的处理结果存入缓存")

            # NOTE(review): the original indentation is not recoverable from
            # the diff view; marking is assumed to be unconditional so a link
            # without a result is not fetched again - confirm.
            self.cache.mark_as_processed(pending)
            logger.info(f"已标记链接 {pending} 为已处理")

            pending = self.cache.get_unprocessed_link(keyword)

        logger.info(f"关键词 {keyword} 的所有链接已处理完毕")

    async def process_keyword(self, keyword: str) -> Dict[str, Any]:
        """Collect citation results for a keyword, using the cache where possible.

        Args:
            keyword: Keyword to search for.

        Returns:
            On success a dict with "success" True, "keyword" and "results"
            (everything the cache holds for the keyword). On any exception a
            dict with "success" False, "keyword" and "error". Exceptions are
            logged and never propagate.
        """
        logger.info(f"处理关键词请求: {keyword}")

        try:
            await self._ensure_links_cached(keyword)
            self._lend_browser_to_citzer()
            await self._drain_unprocessed_links(keyword)

            results = self.cache.get_all_results(keyword)
            logger.info(f"关键词 {keyword} 的处理结果数量: {len(results)}")

            outcome = {
                "success": True,
                "keyword": keyword,
                "results": results
            }
            return outcome

        except Exception as exc:
            logger.error(f"处理关键词 {keyword} 时出错: {str(exc)}")
            logger.error(traceback.format_exc())

            failure = {
                "success": False,
                "keyword": keyword,
                "error": str(exc)
            }
            return failure

        finally:
            # The Worker closes the browser after every call; the Citzer is
            # only flagged as no longer having one.
            try:
                self.citzer.browser_started = False
                await self.searcher.close_browser()
            except Exception as close_exc:
                logger.warning(f"关闭浏览器时出错: {str(close_exc)}")

    async def close(self):
        """Detach the Citzer from the browser and close the Searcher's browser.

        Errors are logged and not re-raised.
        """
        try:
            citzer = self.citzer
            citzer.browser_started = False
            citzer.context = None
            citzer.playwright = None

            await self.searcher.close_browser()
            logger.info("已关闭工作者资源")
        except Exception as exc:
            logger.error(f"关闭工作者资源时出错: {str(exc)}")
            logger.error(traceback.format_exc())
|
196
|
+
|
197
|
+
# 如果作为主程序运行,提供测试功能
|
198
|
+
async def main():
    """Manual smoke test: process one fixed keyword and print the result."""
    worker = Worker()
    test_keyword = "人工智能"

    try:
        print(f"测试处理关键词: {test_keyword}")
        outcome = await worker.process_keyword(test_keyword)

        print("处理结果:")
        rendered = json.dumps(outcome, ensure_ascii=False, indent=2)
        print(rendered)
    finally:
        # Always release the worker's resources, even if processing failed.
        await worker.close()


if __name__ == "__main__":
    # Run the smoke test when executed as a script.
    asyncio.run(main())
|