cnks 0.2.5__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
src/cache.py ADDED
@@ -0,0 +1,451 @@
+ #!/usr/bin/env python
+ # -*- coding: utf-8 -*-
+
+ """
+ Cache Module
+
+ A cache module for managing keywords, links, and citation data.
+ It stores and manages structured data in a temporary file for efficient access and updates.
+
+ Main responsibilities:
+ 1. Store keywords and their associated links
+ 2. Replace links with processed citation data
+ 3. Provide data retrieval for workers or the server
+ 4. Delete the cache file on server request
+ """
+
+ import json
+ import logging
+ import os
+ from typing import Dict, List, Any, Optional, Union
+
+ # Configure logging
+ try:
+     # Attempt to use absolute path
+     log_dir = os.path.dirname(os.path.abspath(__file__))
+     log_file = os.path.join(os.path.dirname(log_dir), "cnks_cache.log")
+
+     # Create handlers
+     file_handler = logging.FileHandler(log_file, mode="a")
+     console_handler = logging.StreamHandler()
+
+     # Set format for both handlers
+     formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+     file_handler.setFormatter(formatter)
+     console_handler.setFormatter(formatter)
+
+     # Get logger and add handlers
+     logger = logging.getLogger("cnks.cache")
+     logger.setLevel(logging.DEBUG)
+
+     # Remove any existing handlers to avoid duplicates
+     if logger.handlers:
+         for handler in logger.handlers:
+             logger.removeHandler(handler)
+
+     logger.addHandler(file_handler)
+     logger.addHandler(console_handler)
+
+     # Print confirmation
+     print(f"Cache logger initialized, logging to: {log_file}")
+     logger.info(f"Cache logging to: {log_file}")
+ except Exception as e:
+     # Fallback to basic console logging
+     logging.basicConfig(
+         level=logging.DEBUG,
+         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+     )
+     logger = logging.getLogger("cnks.cache")
+     logger.error(f"Failed to set up file logging: {str(e)}")
+     print(f"Error setting up cache file logging: {str(e)}")
+
+ # Cache file path
+ CACHE_FILE = os.environ.get("CACHE_FILE", "cache.json")
+
+ class Cache:
+     """Cache class for managing search data."""
+
+     def __init__(self):
+         """Initialize the cache; takes no arguments and uses the default path."""
+         logger.debug(f"Cache initialized with default file: {CACHE_FILE}")
+
+     def add_links(self, keyword: str, links: List[str]) -> bool:
+         """
+         Add links for a keyword to the cache.
+
+         Args:
+             keyword: The keyword to store
+             links: The list of associated links
+
+         Returns:
+             bool: True if the operation succeeds, False otherwise
+         """
+         try:
+             return self.store_keyword_and_links(keyword, links)
+         except Exception as e:
+             logger.error(f"Error adding links for keyword {keyword}: {str(e)}")
+             return False
+
+     def get(self, keyword: str) -> List[Dict]:
+         """Get the cached results for the given keyword."""
+         try:
+             if not os.path.exists(CACHE_FILE):
+                 logger.debug(f"Cache file not found: {CACHE_FILE}")
+                 return []
+
+             with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                 cache_data = json.load(f)
+
+             if cache_data.get("keyword") == keyword:
+                 results = cache_data.get("results", [])
+                 logger.info(f"Retrieved {len(results)} results for keyword: {keyword}")
+                 return results
+             else:
+                 logger.debug(f"Keyword mismatch in cache: {cache_data.get('keyword')} != {keyword}")
+                 return []
+
+         except Exception as e:
+             logger.error(f"Error getting cached results: {str(e)}")
+             return []
+
+     def set(self, keyword: str, results: List[Dict]) -> bool:
+         """Set the cached results for a keyword."""
+         try:
+             cache_data = {
+                 "keyword": keyword,
+                 "results": results
+             }
+
+             with open(CACHE_FILE, "w", encoding="utf-8") as f:
+                 json.dump(cache_data, f, ensure_ascii=False, indent=2)
+
+             logger.info(f"Saved {len(results)} results for keyword: {keyword}")
+             return True
+
+         except Exception as e:
+             logger.error(f"Error saving cache: {str(e)}")
+             return False
+
+     @staticmethod
+     def store_keyword_and_links(keyword: str, links: List[str]):
+         """Store keyword and associated links in cache."""
+         cache_data = {
+             "keyword": keyword,
+             "links": links,
+             "results": []  # Initially empty, will be replaced with processed citation data
+         }
+
+         try:
+             # Ensure directory exists
+             cache_dir = os.path.dirname(os.path.abspath(CACHE_FILE))
+             if not os.path.exists(cache_dir):
+                 os.makedirs(cache_dir, exist_ok=True)
+                 logger.info(f"Created cache directory: {cache_dir}")
+
+             with open(CACHE_FILE, "w", encoding="utf-8") as f:
+                 json.dump(cache_data, f, ensure_ascii=False, indent=2)
+
+             logger.info(f"Stored keyword and {len(links)} links in cache")
+             print(f"Cache: stored keyword '{keyword}' with {len(links)} links")
+             return True
+         except Exception as e:
+             logger.error(f"Error storing keyword and links: {str(e)}")
+             print(f"Cache error: {str(e)}")
+             return False
+
+     @staticmethod
+     def update_link_with_citation(link: str, citation_data: Dict):
+         """Replace a link with processed citation data."""
+         try:
+             # Read the current cache
+             if not os.path.exists(CACHE_FILE):
+                 logger.error("Cache file does not exist, cannot update citation data")
+                 return False
+
+             with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                 cache_data = json.load(f)
+
+             # Check whether the link already has citation data
+             results = cache_data.get("results", [])
+             links = cache_data.get("links", [])
+
+             # Remove fields that should not be stored
+             filtered_citation_data = citation_data.copy()
+             for field in ["url", "journal", "year", "doi"]:
+                 if field in filtered_citation_data:
+                     del filtered_citation_data[field]
+
+             # Look for an existing entry for this link
+             found = False
+             for i, result in enumerate(results):
+                 if "url" in result and result["url"] == link:
+                     # Update the existing record (the url field is removed entirely)
+                     results[i] = filtered_citation_data
+                     found = True
+                     break
+
+             # If not found, append a new record
+             if not found:
+                 results.append(filtered_citation_data)
+
+             # Track processed links (without storing the url in results)
+             if link in links and "processed_links" not in cache_data:
+                 cache_data["processed_links"] = []
+             if link in links and link not in cache_data.get("processed_links", []):
+                 cache_data["processed_links"].append(link)
+
+             # Update the cache file
+             cache_data["results"] = results
+
+             with open(CACHE_FILE, "w", encoding="utf-8") as f:
+                 json.dump(cache_data, f, ensure_ascii=False, indent=2)
+
+             logger.info(f"Updated citation data for link: {link}")
+             return True
+
+         except Exception as e:
+             logger.error(f"Error updating citation data: {str(e)}")
+             return False
+
+     @staticmethod
+     def get_links():
+         """Get the list of links to be processed."""
+         try:
+             if not os.path.exists(CACHE_FILE):
+                 logger.error("Cache file does not exist, cannot get links")
+                 return []
+
+             with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                 cache_data = json.load(f)
+
+             links = cache_data.get("links", [])
+             logger.info(f"Retrieved {len(links)} links from cache")
+             return links
+
+         except Exception as e:
+             logger.error(f"Error getting links: {str(e)}")
+             return []
+
+     @staticmethod
+     def get_keyword():
+         """Get the cached keyword."""
+         try:
+             if not os.path.exists(CACHE_FILE):
+                 logger.error("Cache file does not exist, cannot get keyword")
+                 return None
+
+             with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                 cache_data = json.load(f)
+
+             keyword = cache_data.get("keyword")
+             logger.info(f"Retrieved keyword from cache: {keyword}")
+             return keyword
+
+         except Exception as e:
+             logger.error(f"Error getting keyword: {str(e)}")
+             return None
+
+     @staticmethod
+     def get_results():
+         """Get all processed citation data."""
+         try:
+             if not os.path.exists(CACHE_FILE):
+                 logger.error("Cache file does not exist, cannot get results")
+                 return []
+
+             with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                 cache_data = json.load(f)
+
+             results = cache_data.get("results", [])
+             logger.info(f"Retrieved {len(results)} results from cache")
+             return results
+
+         except Exception as e:
+             logger.error(f"Error getting results: {str(e)}")
+             return []
+
+     @staticmethod
+     def delete_cache():
+         """Delete the cache file."""
+         try:
+             if os.path.exists(CACHE_FILE):
+                 os.remove(CACHE_FILE)
+                 logger.info(f"Cache file deleted: {CACHE_FILE}")
+                 return True
+             else:
+                 logger.warning(f"Cache file does not exist, no need to delete: {CACHE_FILE}")
+                 return True
+
+         except Exception as e:
+             logger.error(f"Error deleting cache: {str(e)}")
+             return False
+
+     def has_keyword(self, keyword: str) -> bool:
+         """
+         Check whether data for the given keyword exists in the cache.
+
+         Args:
+             keyword: The keyword to check
+
+         Returns:
+             bool: True if the keyword exists in the cache, False otherwise
+         """
+         try:
+             if not os.path.exists(CACHE_FILE):
+                 logger.debug(f"Cache file not found: {CACHE_FILE}")
+                 return False
+
+             with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                 cache_data = json.load(f)
+
+             cached_keyword = cache_data.get("keyword")
+             exists = cached_keyword == keyword
+
+             if exists:
+                 logger.info(f"Keyword found in cache: {keyword}")
+             else:
+                 logger.debug(f"Keyword not found in cache: {keyword} (found: {cached_keyword})")
+
+             return exists
+
+         except Exception as e:
+             logger.error(f"Error checking keyword in cache: {str(e)}")
+             return False
+
+     def get_keyword_data(self, keyword: str) -> Optional[Dict[str, Any]]:
+         """
+         Get the complete cached data for the given keyword, including links and results.
+
+         Args:
+             keyword: The keyword to fetch data for
+
+         Returns:
+             Optional[Dict]: A dict containing the keyword data, or None if it does not exist
+         """
+         try:
+             if not os.path.exists(CACHE_FILE):
+                 logger.debug(f"Cache file not found: {CACHE_FILE}")
+                 return None
+
+             with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                 cache_data = json.load(f)
+
+             if cache_data.get("keyword") == keyword:
+                 logger.info(f"Retrieved complete data for keyword: {keyword}")
+                 return cache_data
+             else:
+                 logger.debug(f"Keyword mismatch in cache: {cache_data.get('keyword')} != {keyword}")
+                 return None
+
+         except Exception as e:
+             logger.error(f"Error getting keyword data from cache: {str(e)}")
+             return None
+
+     def get_unprocessed_link(self, keyword: str) -> Optional[str]:
+         """
+         Get one unprocessed link.
+
+         Args:
+             keyword: The keyword
+
+         Returns:
+             Optional[str]: An unprocessed link, or None if there is none
+         """
+         try:
+             data = self.get_keyword_data(keyword)
+             if not data:
+                 logger.warning(f"Keyword {keyword} not found in cache")
+                 return None
+
+             all_links = data.get("links", [])
+             processed_links = data.get("processed_links", [])
+
+             # Find the first unprocessed link
+             for link in all_links:
+                 if link not in processed_links:
+                     logger.info(f"Found unprocessed link: {link}")
+                     return link
+
+             logger.info(f"All links for keyword {keyword} have been processed")
+             return None
+         except Exception as e:
+             logger.error(f"Error getting unprocessed link: {str(e)}")
+             return None
+
+     def add_result(self, link: str, result: Dict[str, Any]) -> bool:
+         """
+         Add a processing result to the cache.
+
+         Args:
+             link: The link that was processed
+             result: The result containing citation data
+
+         Returns:
+             bool: True if the operation succeeds, False otherwise
+         """
+         try:
+             return self.update_link_with_citation(link, result)
+         except Exception as e:
+             logger.error(f"Error adding result: {str(e)}")
+             return False
+
+     def mark_as_processed(self, link: str) -> bool:
+         """
+         Mark a link as processed.
+
+         Args:
+             link: The link to mark
+
+         Returns:
+             bool: True if the operation succeeds, False otherwise
+         """
+         try:
+             if not os.path.exists(CACHE_FILE):
+                 logger.error("Cache file does not exist, cannot mark link as processed")
+                 return False
+
+             with open(CACHE_FILE, "r", encoding="utf-8") as f:
+                 cache_data = json.load(f)
+
+             # Ensure the processed_links field exists
+             if "processed_links" not in cache_data:
+                 cache_data["processed_links"] = []
+
+             # Add the link if it is not yet marked as processed
+             if link not in cache_data["processed_links"]:
+                 cache_data["processed_links"].append(link)
+
+                 with open(CACHE_FILE, "w", encoding="utf-8") as f:
+                     json.dump(cache_data, f, ensure_ascii=False, indent=2)
+
+                 logger.info(f"Marked link as processed: {link}")
+             else:
+                 logger.info(f"Link is already marked as processed: {link}")
+
+             return True
+         except Exception as e:
+             logger.error(f"Error marking link as processed: {str(e)}")
+             return False
+
+     def get_all_results(self, keyword: str) -> List[Dict[str, Any]]:
+         """
+         Get all processed results for a keyword.
+
+         Args:
+             keyword: The keyword
+
+         Returns:
+             List[Dict[str, Any]]: The list of results
+         """
+         try:
+             data = self.get_keyword_data(keyword)
+             if not data:
+                 logger.warning(f"Keyword {keyword} not found in cache")
+                 return []
+
+             results = data.get("results", [])
+             logger.info(f"Retrieved {len(results)} results for keyword {keyword}")
+             return results
+         except Exception as e:
+             logger.error(f"Error getting all results: {str(e)}")
+             return []
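
For context, below is a minimal usage sketch of the new Cache class, pieced together only from the methods added in this file. The import path, the keyword "graphene", and the link and citation values are illustrative assumptions, not part of the release.

import os

# The module resolves CACHE_FILE from the environment at import time
# (otherwise it defaults to cache.json in the current directory),
# so point it at a scratch file before importing.
os.environ["CACHE_FILE"] = "example_cache.json"

from cache import Cache  # import path is an assumption; adjust to the installed package layout

cache = Cache()

# A searcher stores a keyword and the links it found (hypothetical data).
cache.add_links("graphene", [
    "https://example.org/article/1",
    "https://example.org/article/2",
])

# A worker drains the unprocessed links, stores citation data, and marks each link done.
while (link := cache.get_unprocessed_link("graphene")) is not None:
    citation = {"title": "Example title", "authors": ["A. Author"], "url": link}
    cache.add_result(link, citation)   # url/journal/year/doi are stripped before storage
    cache.mark_as_processed(link)

# The server reads back all processed results, then removes the cache file.
print(cache.get_all_results("graphene"))
Cache.delete_cache()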