academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
backend/main.py ADDED
@@ -0,0 +1,1367 @@
1
+ """
2
+ FastAPI application for RefChecker Web UI
3
+ """
4
+ import asyncio
5
+ import uuid
6
+ import os
7
+ import tempfile
8
+ from pathlib import Path
9
+ from typing import Optional
10
+ from fastapi import FastAPI, WebSocket, WebSocketDisconnect, UploadFile, File, Form, HTTPException, Body, Request
11
+ from fastapi.middleware.cors import CORSMiddleware
12
+ from fastapi.responses import FileResponse, HTMLResponse
13
+ from fastapi.staticfiles import StaticFiles
14
+ from pydantic import BaseModel
15
+ import logging
16
+
17
+ import aiosqlite
18
+ from .database import db
19
+ from .websocket_manager import manager
20
+ from .refchecker_wrapper import ProgressRefChecker
21
+ from .models import CheckRequest, CheckHistoryItem
22
+ from .concurrency import init_limiter, get_limiter, DEFAULT_MAX_CONCURRENT
23
+ from .thumbnail import (
24
+ generate_arxiv_thumbnail_async,
25
+ generate_arxiv_preview_async,
26
+ generate_pdf_thumbnail_async,
27
+ generate_pdf_preview_async,
28
+ get_text_thumbnail_async,
29
+ get_thumbnail_cache_path,
30
+ get_preview_cache_path
31
+ )
32
+
33
+ # Configure logging
34
+ logging.basicConfig(
35
+ level=logging.INFO,
36
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
37
+ )
38
+ logger = logging.getLogger(__name__)
39
+
40
+
41
+ # Pydantic models for requests
42
class LLMConfigCreate(BaseModel):
    """Request body for creating a stored LLM configuration.

    `name` and `provider` are required; model, API key, and endpoint are
    optional and default to the provider's own defaults when omitted.
    """
    name: str
    provider: str
    model: Optional[str] = None
    api_key: Optional[str] = None
    endpoint: Optional[str] = None
48
+
49
+
50
class LLMConfigUpdate(BaseModel):
    """Request body for partially updating a stored LLM configuration.

    Every field is optional; only the fields supplied are changed.
    """
    name: Optional[str] = None
    provider: Optional[str] = None
    model: Optional[str] = None
    api_key: Optional[str] = None
    endpoint: Optional[str] = None
56
+
57
+
58
class LLMConfigValidate(BaseModel):
    """Model for validating LLM config without requiring name."""
    provider: str
    model: Optional[str] = None
    api_key: Optional[str] = None
    endpoint: Optional[str] = None
64
+
65
+
66
class CheckLabelUpdate(BaseModel):
    """Request body carrying a user-assigned label for a check."""
    custom_label: str
68
+
69
+
70
+ # Create FastAPI app
71
+ app = FastAPI(title="RefChecker Web UI API", version="1.0.0")
72
+
73
+ # Static files directory for bundled frontend
74
+ STATIC_DIR = Path(__file__).parent / "static"
75
+
76
+ # Configure CORS for local development
77
+ app.add_middleware(
78
+ CORSMiddleware,
79
+ allow_origins=["http://localhost:5173", "http://127.0.0.1:5173", "http://localhost:5174", "http://localhost:5175", "http://127.0.0.1:5174", "http://127.0.0.1:5175", "http://localhost:8000", "http://127.0.0.1:8000"],
80
+ allow_credentials=True,
81
+ allow_methods=["*"],
82
+ allow_headers=["*"],
83
+ )
84
+
85
+ # Track active check sessions
86
+ active_checks = {}
87
+
88
+
89
def _session_id_for_check(check_id: int) -> Optional[str]:
    """Return the session_id of the in-progress check with this id, or None."""
    # Scan the active-checks registry; each entry maps session_id -> metadata.
    return next(
        (sid for sid, meta in active_checks.items() if meta.get("check_id") == check_id),
        None,
    )
95
+
96
+
97
@app.on_event("startup")
async def startup_event():
    """Initialize database, concurrency limiter, and stale-check cleanup.

    NOTE(review): `on_event` is deprecated in recent FastAPI in favor of
    lifespan handlers — candidate for a later migration.
    """
    await db.init_db()

    # Size the global concurrency limiter from the persisted setting,
    # falling back to the default on any error.
    try:
        saved = await db.get_setting("max_concurrent_checks")
        limit = int(saved) if saved else DEFAULT_MAX_CONCURRENT
        await init_limiter(limit)
        logger.info(f"Initialized global concurrency limiter with max={limit}")
    except Exception as exc:
        logger.warning(f"Failed to load concurrency setting, using default: {exc}")
        await init_limiter(DEFAULT_MAX_CONCURRENT)

    # Checks left 'in_progress' by a previous process (e.g. a restart) can
    # never finish — mark them cancelled so the UI doesn't show them as live.
    try:
        stale_count = await db.cancel_stale_in_progress()
        if stale_count:
            logger.info(f"Cancelled {stale_count} stale in-progress checks on startup")
    except Exception as exc:
        logger.error(f"Failed to cancel stale checks: {exc}")
    logger.info("Database initialized")
120
+
121
+
122
@app.get("/")
async def root():
    """Serve the bundled frontend when present; otherwise act as a health check."""
    index_file = STATIC_DIR / "index.html"
    if index_file.exists():
        # A static frontend was bundled with the package — serve its entry page.
        return FileResponse(str(index_file), media_type="text/html")
    return {"status": "ok", "message": "RefChecker Web UI API"}
131
+
132
+
133
@app.get("/api/health")
async def health():
    """Liveness probe: always reports healthy while the process is up."""
    return {"status": "healthy"}
137
+
138
+
139
@app.websocket("/api/ws/{session_id}")
async def websocket_endpoint(websocket: WebSocket, session_id: str):
    """WebSocket endpoint pushing real-time check progress for a session."""
    await manager.connect(websocket, session_id)
    try:
        # Keep the socket open; inbound messages are currently only logged.
        while True:
            data = await websocket.receive_text()
            logger.debug(f"Received WebSocket message: {data}")
    except WebSocketDisconnect:
        manager.disconnect(websocket, session_id)
        logger.info(f"WebSocket disconnected: {session_id}")
152
+
153
+
154
@app.post("/api/check")
async def start_check(
    source_type: str = Form(...),
    source_value: Optional[str] = Form(None),
    file: Optional[UploadFile] = File(None),
    source_text: Optional[str] = Form(None),
    llm_config_id: Optional[int] = Form(None),
    llm_provider: str = Form("anthropic"),
    llm_model: Optional[str] = Form(None),
    use_llm: bool = Form(True)
):
    """
    Start a new reference check.

    Args:
        source_type: 'url', 'file', or 'text'
        source_value: URL or ArXiv ID (for url type)
        file: Uploaded file (for file type)
        source_text: Pasted bibliography text (for text type)
        llm_config_id: ID of the LLM config to use (for retrieving API key)
        llm_provider: LLM provider to use
        llm_model: Specific model to use
        use_llm: Whether to use LLM for extraction

    Returns:
        Session ID for tracking progress via WebSocket, plus the check_id.

    Raises:
        HTTPException: 400 when no usable source is supplied, 500 otherwise.
    """
    try:
        # Generate session ID used to correlate the WebSocket connection
        session_id = str(uuid.uuid4())

        # Retrieve API key (and provider/model overrides) from stored config
        api_key = None
        if llm_config_id and use_llm:
            config = await db.get_llm_config_by_id(llm_config_id)
            if config:
                api_key = config.get('api_key')
                llm_provider = config.get('provider', llm_provider)
                llm_model = config.get('model') or llm_model
                logger.info(f"Using LLM config {llm_config_id}: {llm_provider}/{llm_model}")
            else:
                logger.warning(f"LLM config {llm_config_id} not found")

        # Resolve the paper source (path or URL) and a provisional title
        paper_source = source_value
        paper_title = "Processing..."  # Placeholder title until we parse the paper
        if source_type == "file" and file:
            # Persist the upload so it can be served later via /api/file/{check_id}
            uploads_dir = Path(__file__).parent / "uploads"
            uploads_dir.mkdir(parents=True, exist_ok=True)
            # BUGFIX: file.filename may be None for some clients — the original
            # called .replace() on it unconditionally. Fall back to a generic name.
            original_name = file.filename or "upload"
            # Use check-specific naming to avoid conflicts
            safe_filename = original_name.replace("/", "_").replace("\\", "_")
            file_path = uploads_dir / f"{session_id}_{safe_filename}"
            with open(file_path, "wb") as f:
                f.write(await file.read())
            paper_source = str(file_path)
            paper_title = original_name
        elif source_type == "text":
            if not source_text:
                raise HTTPException(status_code=400, detail="No text provided")
            # Save pasted text to a file for later retrieval and thumbnail generation
            text_dir = Path(tempfile.gettempdir()) / "refchecker_texts"
            text_dir.mkdir(parents=True, exist_ok=True)
            text_file_path = text_dir / f"pasted_{session_id}.txt"
            text_file_path.write_text(source_text, encoding="utf-8")
            paper_source = str(text_file_path)
            paper_title = "Pasted Text"
        elif source_type == "url":
            paper_title = source_value

        if not paper_source:
            raise HTTPException(status_code=400, detail="No source provided")

        # Create history entry immediately (in_progress status)
        check_id = await db.create_pending_check(
            paper_title=paper_title,
            paper_source=paper_source,
            source_type=source_type,
            llm_provider=llm_provider if use_llm else None,
            llm_model=llm_model if use_llm else None
        )
        logger.info(f"Created pending check with ID {check_id}")

        # Start check in background; the cancel event lets callers abort it
        cancel_event = asyncio.Event()
        task = asyncio.create_task(
            run_check(session_id, check_id, paper_source, source_type, llm_provider, llm_model, api_key, use_llm, cancel_event)
        )
        active_checks[session_id] = {"task": task, "cancel_event": cancel_event, "check_id": check_id}

        return {
            "session_id": session_id,
            "check_id": check_id,
            "message": "Check started",
            "source": paper_source
        }

    except HTTPException:
        # BUGFIX: without this re-raise, the deliberate 400s above were caught
        # by the generic handler below and converted into 500 responses.
        raise
    except Exception as e:
        logger.error(f"Error starting check: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
255
+
256
+
257
async def run_check(
    session_id: str,
    check_id: int,
    paper_source: str,
    source_type: str,
    llm_provider: str,
    llm_model: Optional[str],
    api_key: Optional[str],
    use_llm: bool,
    cancel_event: asyncio.Event
):
    """
    Run a reference check in the background, streaming progress over WebSocket
    and persisting incremental and final results to the database.

    Args:
        session_id: Unique session ID used to route WebSocket messages
        check_id: Database row ID for this check
        paper_source: Paper URL, ArXiv ID, or local file path
        source_type: 'url', 'file', or 'text'
        llm_provider: LLM provider name
        llm_model: Specific model name
        api_key: API key for the LLM provider
        cancel_event: Event set by the API layer to request cancellation
        use_llm: Whether to use LLM-based extraction
    """
    try:
        # Give the client a moment to open its WebSocket before we start
        # emitting progress events (poll up to 30 * 0.1s = 3 seconds).
        logger.info(f"Waiting for WebSocket connection for session {session_id}...")
        for _ in range(30):
            if session_id in manager.active_connections:
                logger.info(f"WebSocket connected for session {session_id}")
                break
            await asyncio.sleep(0.1)
        else:
            logger.warning(f"WebSocket not connected after 3s for session {session_id}, proceeding anyway")

        # Per-reference results accumulated so far, for incremental DB saves.
        accumulated_results = []
        last_save_count = 0  # count at the last DB save, to reduce lock contention

        async def progress_callback(event_type: str, data: dict):
            """Forward an event to the client and checkpoint results to the DB."""
            nonlocal last_save_count
            await manager.send_message(session_id, event_type, data)

            if event_type == "reference_result":
                accumulated_results.append(data)

            # Checkpoint roughly every 3 references (and after the very first)
            # so a crash/restart loses little progress without hammering SQLite.
            if event_type == "summary_update":
                done = len(accumulated_results)
                if done - last_save_count >= 3 or (done == 1 and last_save_count == 0):
                    try:
                        await db.update_check_progress(
                            check_id=check_id,
                            total_refs=data.get("total_refs", 0),
                            errors_count=data.get("errors_count", 0),
                            warnings_count=data.get("warnings_count", 0),
                            suggestions_count=data.get("suggestions_count", 0),
                            unverified_count=data.get("unverified_count", 0),
                            refs_with_errors=data.get("refs_with_errors", 0),
                            refs_with_warnings_only=data.get("refs_with_warnings_only", 0),
                            refs_verified=data.get("refs_verified", 0),
                            results=accumulated_results
                        )
                        last_save_count = done
                    except Exception as e:
                        logger.warning(f"Failed to save progress: {e}")

        async def title_update_callback(check_id: int, paper_title: str):
            """Persist the paper title once the checker has parsed it."""
            await db.update_check_title(check_id, paper_title)
            logger.info(f"Updated paper title for check {check_id}: {paper_title}")

        async def bibliography_source_callback(check_id: int, content: str, arxiv_id: str):
            """Save the raw bbl/bib content used for extraction and record its path."""
            try:
                bib_dir = Path(__file__).parent / "uploads" / "bibliography"
                bib_dir.mkdir(parents=True, exist_ok=True)
                bib_path = bib_dir / f"{check_id}_{arxiv_id}_bibliography.txt"
                bib_path.write_text(content, encoding="utf-8")
                await db.update_check_bibliography_source(check_id, str(bib_path))
                logger.info(f"Saved bibliography source for check {check_id}: {bib_path}")
            except Exception as e:
                logger.warning(f"Failed to save bibliography source: {e}")

        # Wire the checker to the callbacks above and run it.
        checker = ProgressRefChecker(
            llm_provider=llm_provider,
            llm_model=llm_model,
            api_key=api_key,
            use_llm=use_llm,
            progress_callback=progress_callback,
            cancel_event=cancel_event,
            check_id=check_id,
            title_update_callback=title_update_callback,
            bibliography_source_callback=bibliography_source_callback
        )
        result = await checker.check_paper(paper_source, source_type)

        # For file uploads, keep the original filename rather than overwriting
        # it with the checker's "Unknown Paper" fallback.
        result_title = result["paper_title"]
        if source_type == "file" and result_title == "Unknown Paper":
            result_title = None  # None means "leave the stored title alone"

        summary = result["summary"]
        await db.update_check_results(
            check_id=check_id,
            paper_title=result_title,
            total_refs=summary["total_refs"],
            errors_count=summary["errors_count"],
            warnings_count=summary["warnings_count"],
            suggestions_count=summary.get("suggestions_count", 0),
            unverified_count=summary["unverified_count"],
            refs_with_errors=summary.get("refs_with_errors", 0),
            refs_with_warnings_only=summary.get("refs_with_warnings_only", 0),
            refs_verified=summary.get("refs_verified", 0),
            results=result["references"],
            status='completed',
            extraction_method=result.get("extraction_method")
        )

        # Best-effort thumbnail generation for uploaded files.
        if source_type == "file":
            try:
                if paper_source.lower().endswith('.pdf'):
                    thumbnail_path = await generate_pdf_thumbnail_async(paper_source)
                else:
                    thumbnail_path = await get_text_thumbnail_async(check_id, "", paper_source)
                if thumbnail_path:
                    await db.update_check_thumbnail(check_id, thumbnail_path)
                    logger.info(f"Generated thumbnail for check {check_id}: {thumbnail_path}")
            except Exception as thumb_error:
                logger.warning(f"Failed to generate thumbnail for check {check_id}: {thumb_error}")

        # Note: We keep uploaded files for later access via /api/file/{check_id}

    except asyncio.CancelledError:
        logger.info(f"Check cancelled: {session_id}")
        await db.update_check_status(check_id, 'cancelled')
        await manager.send_message(session_id, "cancelled", {"message": "Check cancelled", "check_id": check_id})
    except Exception as e:
        logger.error(f"Error in run_check: {e}", exc_info=True)
        await db.update_check_status(check_id, 'error')
        await manager.send_message(session_id, "error", {
            "message": f"Check failed: {str(e)}",
            "details": type(e).__name__,
            "check_id": check_id
        })
    finally:
        # Always drop the session from the live registry, whatever happened.
        active_checks.pop(session_id, None)
416
+
417
+
418
@app.get("/api/history")
async def get_history(limit: int = 50):
    """Return recent checks, attaching live session IDs to in-progress ones."""
    try:
        history = await db.get_history(limit)

        enriched = []
        for item in history:
            # In-progress checks get their WebSocket session_id so the UI can
            # re-attach to the live progress stream.
            if item.get("status") == "in_progress":
                if (sid := _session_id_for_check(item["id"])) is not None:
                    item["session_id"] = sid
            enriched.append(item)

        return enriched  # Return array directly
    except Exception as e:
        logger.error(f"Error getting history: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
436
+
437
+
438
@app.get("/api/history/{check_id}")
async def get_check_detail(check_id: int):
    """Return the full stored result record for a single check."""
    try:
        check = await db.get_check_by_id(check_id)
        if not check:
            raise HTTPException(status_code=404, detail="Check not found")

        # Attach the live session_id when the check is still running.
        if check.get("status") == "in_progress":
            if (sid := _session_id_for_check(check_id)) is not None:
                check["session_id"] = sid
        return check
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting check detail: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
456
+
457
+
458
@app.get("/api/thumbnail/{check_id}")
async def get_thumbnail(check_id: int):
    """
    Get or generate a thumbnail for a check.

    Serves the cached thumbnail when one exists; otherwise generates one
    on-demand (ArXiv first page, uploaded/remote PDF first page, or a
    text/placeholder image) and caches its path in the database.
    """
    try:
        check = await db.get_check_by_id(check_id)
        if not check:
            raise HTTPException(status_code=404, detail="Check not found")

        # Fast path: a previously generated thumbnail is still on disk.
        thumbnail_path = check.get('thumbnail_path')
        if thumbnail_path and os.path.exists(thumbnail_path):
            return FileResponse(
                thumbnail_path,
                media_type="image/png",
                headers={"Cache-Control": "public, max-age=86400"}  # Cache for 1 day
            )

        paper_source = check.get('paper_source', '')
        source_type = check.get('source_type', 'url')

        # Detect an ArXiv identifier anywhere in the source string.
        import re
        arxiv_match = re.search(r'(\d{4}\.\d{4,5})(v\d+)?', paper_source)

        # A remote PDF that is not hosted on arxiv.org needs downloading first.
        is_direct_pdf_url = (
            source_type == 'url' and
            paper_source.lower().endswith('.pdf') and
            'arxiv.org' not in paper_source.lower()
        )

        if is_direct_pdf_url:
            logger.info(f"Generating thumbnail from PDF URL: {paper_source}")
            import hashlib
            import tempfile
            import urllib.request

            # Hash the URL so repeated requests reuse one cached download.
            pdf_hash = hashlib.md5(paper_source.encode()).hexdigest()[:12]
            pdf_path = os.path.join(tempfile.gettempdir(), f"refchecker_pdf_{pdf_hash}.pdf")

            if not os.path.exists(pdf_path):
                try:
                    await asyncio.to_thread(lambda: urllib.request.urlretrieve(paper_source, pdf_path))
                except Exception as e:
                    logger.error(f"Failed to download PDF for thumbnail: {e}")
                    thumbnail_path = await get_text_thumbnail_async(check_id, "PDF")
                    pdf_path = None

            if pdf_path and os.path.exists(pdf_path):
                thumbnail_path = await generate_pdf_thumbnail_async(pdf_path)
            else:
                thumbnail_path = await get_text_thumbnail_async(check_id, "PDF")
        elif arxiv_match:
            arxiv_id = arxiv_match.group(1)
            logger.info(f"Generating thumbnail for ArXiv paper: {arxiv_id}")
            thumbnail_path = await generate_arxiv_thumbnail_async(arxiv_id, check_id)
        elif source_type == 'file' and paper_source.lower().endswith('.pdf'):
            if os.path.exists(paper_source):
                logger.info(f"Generating thumbnail from PDF: {paper_source}")
                thumbnail_path = await generate_pdf_thumbnail_async(paper_source)
            else:
                # Uploaded PDF was removed from disk — fall back to a placeholder.
                thumbnail_path = await get_text_thumbnail_async(check_id, "PDF")
        elif source_type == 'file':
            logger.info(f"Generating text content thumbnail for uploaded file check {check_id}")
            if os.path.exists(paper_source):
                thumbnail_path = await get_text_thumbnail_async(check_id, "", paper_source)
            else:
                thumbnail_path = await get_text_thumbnail_async(check_id, "Uploaded file")
        elif source_type == 'text':
            logger.info(f"Generating text content thumbnail for check {check_id}")
            # paper_source is a file path for text sources
            thumbnail_path = await get_text_thumbnail_async(check_id, "", paper_source)
        else:
            thumbnail_path = await get_text_thumbnail_async(check_id, source_type)

        if thumbnail_path and os.path.exists(thumbnail_path):
            # Remember the path so subsequent requests hit the fast path above.
            await db.update_check_thumbnail(check_id, thumbnail_path)
            return FileResponse(
                thumbnail_path,
                media_type="image/png",
                headers={"Cache-Control": "public, max-age=86400"}
            )
        raise HTTPException(status_code=404, detail="Could not generate thumbnail")

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting thumbnail: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
567
+
568
+
569
@app.get("/api/preview/{check_id}")
async def get_preview(check_id: int):
    """
    Get or generate a high-resolution preview image for a check.

    Like the thumbnail endpoint but at overlay resolution; only ArXiv
    papers and PDFs (local or remote) can produce a preview.
    """
    try:
        check = await db.get_check_by_id(check_id)
        if not check:
            raise HTTPException(status_code=404, detail="Check not found")

        paper_source = check.get('paper_source', '')
        source_type = check.get('source_type', 'url')

        # Detect an ArXiv identifier anywhere in the source string.
        import re
        arxiv_match = re.search(r'(\d{4}\.\d{4,5})(v\d+)?', paper_source)

        # A remote PDF that is not hosted on arxiv.org needs downloading first.
        is_direct_pdf_url = (
            source_type == 'url' and
            paper_source.lower().endswith('.pdf') and
            'arxiv.org' not in paper_source.lower()
        )

        preview_path = None

        if is_direct_pdf_url:
            logger.info(f"Generating preview from PDF URL: {paper_source}")
            import hashlib
            import tempfile
            import urllib.request

            # Same URL-hash cache file as the thumbnail endpoint uses.
            pdf_hash = hashlib.md5(paper_source.encode()).hexdigest()[:12]
            pdf_path = os.path.join(tempfile.gettempdir(), f"refchecker_pdf_{pdf_hash}.pdf")

            if not os.path.exists(pdf_path):
                try:
                    await asyncio.to_thread(lambda: urllib.request.urlretrieve(paper_source, pdf_path))
                except Exception as e:
                    logger.error(f"Failed to download PDF for preview: {e}")
                    pdf_path = None

            if pdf_path and os.path.exists(pdf_path):
                preview_path = await generate_pdf_preview_async(pdf_path)
        elif arxiv_match:
            arxiv_id = arxiv_match.group(1)
            logger.info(f"Generating preview for ArXiv paper: {arxiv_id}")
            preview_path = await generate_arxiv_preview_async(arxiv_id, check_id)
        elif source_type == 'file' and paper_source.lower().endswith('.pdf'):
            if os.path.exists(paper_source):
                logger.info(f"Generating preview from PDF: {paper_source}")
                preview_path = await generate_pdf_preview_async(paper_source)

        if preview_path and os.path.exists(preview_path):
            return FileResponse(
                preview_path,
                media_type="image/png",
                headers={"Cache-Control": "public, max-age=86400"}  # Cache for 1 day
            )
        # No preview possible for this source; client should use the thumbnail.
        raise HTTPException(status_code=404, detail="Could not generate preview")

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting preview: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
647
+
648
+
649
@app.get("/api/text/{check_id}")
async def get_pasted_text(check_id: int):
    """
    Return the pasted-text source of a check as plain text.

    Only valid for checks created from pasted text (source_type == 'text').
    """
    try:
        check = await db.get_check_by_id(check_id)
        if not check:
            raise HTTPException(status_code=404, detail="Check not found")

        if check.get('source_type', '') != 'text':
            raise HTTPException(status_code=400, detail="This check is not from pasted text")

        paper_source = check.get('paper_source', '')

        # Normally paper_source is the path of the saved text file.
        if os.path.exists(paper_source):
            return FileResponse(
                paper_source,
                media_type="text/plain; charset=utf-8",
                filename="pasted_bibliography.txt",
                headers={
                    "Content-Type": "text/plain; charset=utf-8",
                    "Cache-Control": "public, max-age=3600"
                }
            )

        # Legacy rows stored the raw text itself in paper_source.
        from fastapi.responses import PlainTextResponse
        return PlainTextResponse(
            paper_source,
            headers={"Cache-Control": "public, max-age=3600"}
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting pasted text: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
691
+
692
+
693
@app.get("/api/file/{check_id}")
async def get_uploaded_file(check_id: int):
    """
    Return the originally uploaded file of a check for viewing/download.

    Only valid for checks created from a file upload (source_type == 'file').
    """
    # Known extensions mapped to their content types; anything else is binary.
    _MEDIA_TYPES = {
        '.pdf': "application/pdf",
        '.txt': "text/plain; charset=utf-8",
        '.bib': "text/plain; charset=utf-8",
        '.tex': "text/plain; charset=utf-8",
    }
    try:
        check = await db.get_check_by_id(check_id)
        if not check:
            raise HTTPException(status_code=404, detail="Check not found")

        paper_source = check.get('paper_source', '')
        paper_title = check.get('paper_title', 'uploaded_file')

        if check.get('source_type', '') != 'file':
            raise HTTPException(status_code=400, detail="This check is not from an uploaded file")

        if not os.path.exists(paper_source):
            raise HTTPException(status_code=404, detail="File no longer exists")

        suffix = Path(paper_source).suffix.lower()
        media_type = _MEDIA_TYPES.get(suffix, "application/octet-stream")

        return FileResponse(
            paper_source,
            media_type=media_type,
            filename=paper_title,
            headers={"Cache-Control": "public, max-age=3600"}
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting uploaded file: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
738
+
739
+
740
@app.get("/api/bibliography/{check_id}")
async def get_bibliography_source(check_id: int):
    """
    Return the bibliography source (bbl/bib content) used for a check.

    Prefers the saved bibliography file recorded during the run; for
    pasted-text checks whose extraction was bbl/bib, falls back to the
    pasted text file itself.
    """
    try:
        check = await db.get_check_by_id(check_id)
        if not check:
            raise HTTPException(status_code=404, detail="Check not found")

        bibliography_source_path = check.get('bibliography_source_path', '')
        extraction_method = check.get('extraction_method', '')
        source_type = check.get('source_type', '')
        paper_source = check.get('paper_source', '')

        # Preferred: the bibliography file saved by the run_check callback.
        if bibliography_source_path and os.path.exists(bibliography_source_path):
            return FileResponse(
                bibliography_source_path,
                media_type="text/plain; charset=utf-8",
                filename=f"bibliography_{check_id}.{extraction_method or 'txt'}",
                headers={
                    "Content-Type": "text/plain; charset=utf-8",
                    "Cache-Control": "public, max-age=3600"
                }
            )

        # Fallback: the pasted text IS the bibliography when it was bbl/bib.
        if source_type == 'text' and extraction_method in ['bbl', 'bib'] and os.path.exists(paper_source):
            return FileResponse(
                paper_source,
                media_type="text/plain; charset=utf-8",
                filename=f"bibliography_{check_id}.{extraction_method}",
                headers={
                    "Content-Type": "text/plain; charset=utf-8",
                    "Cache-Control": "public, max-age=3600"
                }
            )

        raise HTTPException(status_code=404, detail="Bibliography source not available for this check")

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error getting bibliography source: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
789
+
790
+
791
@app.post("/api/recheck/{check_id}")
async def recheck(check_id: int):
    """Re-run a previous check"""
    try:
        # Look up the check being re-run.
        prior = await db.get_check_by_id(check_id)
        if prior is None:
            raise HTTPException(status_code=404, detail="Check not found")

        session_id = str(uuid.uuid4())

        # Infer the source type when it was not stored with the original check.
        source = prior["paper_source"]
        stored_type = prior.get("source_type")
        if stored_type:
            source_type = stored_type
        elif source.startswith("http") or "arxiv" in source.lower():
            source_type = "url"
        else:
            source_type = "file"

        provider = prior.get("llm_provider", "anthropic")
        model = prior.get("llm_model")

        # Record a pending entry up front so the run appears in history immediately.
        new_check_id = await db.create_pending_check(
            paper_title=prior.get("paper_title", "Re-checking..."),
            paper_source=source,
            source_type=source_type,
            llm_provider=provider,
            llm_model=model
        )

        # Launch the verification work in the background.
        cancel_event = asyncio.Event()
        task = asyncio.create_task(
            run_check(
                session_id,
                new_check_id,
                source,
                source_type,
                provider,
                model,
                None,  # API key will need to be retrieved separately
                True,
                cancel_event
            )
        )
        active_checks[session_id] = {"task": task, "cancel_event": cancel_event, "check_id": new_check_id}

        return {
            "session_id": session_id,
            "check_id": new_check_id,
            "message": "Re-check started",
            "original_id": check_id
        }

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error rechecking: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))
850
+
851
+
852
@app.post("/api/cancel/{session_id}")
async def cancel_check(session_id: str):
    """Cancel an active check"""
    entry = active_checks.get(session_id)
    if entry is None:
        raise HTTPException(status_code=404, detail="Active check not found")
    # Signal cooperative cancellation first, then cancel the asyncio task itself.
    entry["cancel_event"].set()
    entry["task"].cancel()
    return {"message": "Cancellation requested"}
861
+
862
+
863
@app.delete("/api/history/{check_id}")
async def delete_check(check_id: int):
    """Delete a check from history"""
    try:
        # 404 when no row matched the id.
        if not await db.delete_check(check_id):
            raise HTTPException(status_code=404, detail="Check not found")
        return {"message": "Check deleted successfully"}
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Error deleting check: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
877
+
878
+
879
@app.patch("/api/history/{check_id}")
async def update_check_label(check_id: int, update: CheckLabelUpdate):
    """Update the custom label for a check"""
    try:
        # 404 when no row matched the id.
        if not await db.update_check_label(check_id, update.custom_label):
            raise HTTPException(status_code=404, detail="Check not found")
        return {"message": "Label updated successfully"}
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Error updating label: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
893
+
894
+
895
+ # LLM Configuration endpoints
896
+
897
@app.get("/api/llm-configs")
async def get_llm_configs():
    """Get all LLM configurations (API keys are not returned)"""
    try:
        return await db.get_llm_configs()
    except Exception as exc:
        logger.error(f"Error getting LLM configs: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
906
+
907
+
908
@app.post("/api/llm-configs")
async def create_llm_config(config: LLMConfigCreate):
    """Create a new LLM configuration"""
    try:
        new_id = await db.create_llm_config(
            name=config.name,
            provider=config.provider,
            model=config.model,
            api_key=config.api_key,
            endpoint=config.endpoint
        )
        # Echo the stored configuration back to the client, omitting the API key.
        return {
            "id": new_id,
            "name": config.name,
            "provider": config.provider,
            "model": config.model,
            "endpoint": config.endpoint,
            "is_default": False
        }
    except Exception as exc:
        logger.error(f"Error creating LLM config: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
931
+
932
+
933
@app.put("/api/llm-configs/{config_id}")
async def update_llm_config(config_id: int, config: LLMConfigUpdate):
    """Update an existing LLM configuration"""
    try:
        ok = await db.update_llm_config(
            config_id=config_id,
            name=config.name,
            provider=config.provider,
            model=config.model,
            api_key=config.api_key,
            endpoint=config.endpoint
        )
        if not ok:
            raise HTTPException(status_code=404, detail="Config not found")
        # Re-fetch so the response reflects what is actually stored.
        for stored in await db.get_llm_configs():
            if stored["id"] == config_id:
                return stored
        return {"id": config_id, "message": "Updated"}
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Error updating LLM config: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
957
+
958
+
959
@app.delete("/api/llm-configs/{config_id}")
async def delete_llm_config(config_id: int):
    """Delete an LLM configuration"""
    try:
        # 404 when no config matched the id.
        if not await db.delete_llm_config(config_id):
            raise HTTPException(status_code=404, detail="Config not found")
        return {"message": "Config deleted successfully"}
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Error deleting LLM config: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
973
+
974
+
975
@app.post("/api/llm-configs/{config_id}/set-default")
async def set_default_llm_config(config_id: int):
    """Set an LLM configuration as the default"""
    try:
        # 404 when no config matched the id.
        if not await db.set_default_llm_config(config_id):
            raise HTTPException(status_code=404, detail="Config not found")
        return {"message": "Default config set successfully"}
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Error setting default config: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
989
+
990
+
991
@app.post("/api/llm-configs/validate")
async def validate_llm_config(config: LLMConfigValidate):
    """
    Validate an LLM configuration by making a test API call.
    Returns success or error message.

    Flow: verify the provider's client package is importable, create the
    provider via refchecker, confirm it reports itself available, then issue
    a minimal prompt. Raises HTTPException(400) with a diagnostic detail on
    any failure.
    """
    # Map providers to their required packages: (module to import, install hint).
    PROVIDER_PACKAGES = {
        "anthropic": ("anthropic", "pip install anthropic"),
        "openai": ("openai", "pip install openai"),
        "google": ("google.generativeai", "pip install google-generativeai"),
        "gemini": ("google.generativeai", "pip install google-generativeai"),
    }

    # Check if required package is installed for this provider
    provider_lower = config.provider.lower()
    if provider_lower in PROVIDER_PACKAGES:
        module_name, install_cmd = PROVIDER_PACKAGES[provider_lower]
        try:
            __import__(module_name.split('.')[0])
        except ImportError:
            raise HTTPException(
                status_code=400,
                detail=f"The '{config.provider}' provider requires the '{module_name.split('.')[0]}' package. "
                       f"Please install it with: {install_cmd}"
            )

    try:
        import sys
        from pathlib import Path
        # Fix: guard the insertion — the original inserted unconditionally on
        # every request, growing sys.path with duplicate entries over time.
        src_path = str(Path(__file__).parent.parent / "src")
        if src_path not in sys.path:
            sys.path.insert(0, src_path)
        from refchecker.llm.base import create_llm_provider

        # Build config from whichever optional fields were supplied.
        llm_config = {}
        if config.model:
            llm_config['model'] = config.model
        if config.api_key:
            llm_config['api_key'] = config.api_key
        if config.endpoint:
            llm_config['endpoint'] = config.endpoint

        # Try to create provider
        provider = create_llm_provider(config.provider, llm_config)
        if not provider:
            raise HTTPException(status_code=400, detail=f"Failed to create {config.provider} provider")

        # Check if provider is available (has required client initialized)
        if hasattr(provider, 'is_available') and not provider.is_available():
            # Provider was created but client failed to initialize
            if provider_lower in PROVIDER_PACKAGES:
                _, install_cmd = PROVIDER_PACKAGES[provider_lower]
                raise HTTPException(
                    status_code=400,
                    detail=f"Provider '{config.provider}' is not available. "
                           f"Make sure the required package is installed: {install_cmd}"
                )
            raise HTTPException(status_code=400, detail=f"Provider '{config.provider}' is not available")

        # Make a simple test call using _call_llm
        test_response = provider._call_llm("Say 'ok' if you can hear me.")

        if test_response:
            return {"valid": True, "message": "Connection successful"}
        else:
            raise HTTPException(status_code=400, detail="Provider returned empty response")

    except HTTPException:
        raise
    except Exception as e:
        error_msg = str(e)
        logger.error(f"LLM validation failed: {error_msg}")
        # Translate common provider errors into actionable messages.
        if "404" in error_msg and "model" in error_msg.lower():
            raise HTTPException(status_code=400, detail=f"Invalid model name. The model '{config.model}' was not found.")
        elif "401" in error_msg or "unauthorized" in error_msg.lower():
            raise HTTPException(status_code=400, detail="Invalid API key")
        elif "rate" in error_msg.lower():
            raise HTTPException(status_code=400, detail="Rate limited - but API key is valid")
        elif "'NoneType'" in error_msg:
            # This usually means the provider library isn't installed
            if provider_lower in PROVIDER_PACKAGES:
                _, install_cmd = PROVIDER_PACKAGES[provider_lower]
                raise HTTPException(
                    status_code=400,
                    detail=f"The '{config.provider}' provider requires additional packages. "
                           f"Please install with: {install_cmd}"
                )
            # Fix: dropped a pointless f-prefix (the string has no placeholders).
            raise HTTPException(status_code=400, detail="Provider initialization failed. Check that required packages are installed.")
        else:
            raise HTTPException(status_code=400, detail=f"Validation failed: {error_msg}")
1082
+
1083
+
1084
+ # Semantic Scholar API Key endpoints
1085
+
1086
class SemanticScholarKeyUpdate(BaseModel):
    """Request body for PUT /api/settings/semantic-scholar: the API key to store."""
    api_key: str
1088
+
1089
+
1090
class SemanticScholarKeyValidate(BaseModel):
    """Request body for POST /api/settings/semantic-scholar/validate: the key to test."""
    api_key: str
1092
+
1093
+
1094
@app.post("/api/settings/semantic-scholar/validate")
async def validate_semantic_scholar_key(data: SemanticScholarKeyValidate):
    """
    Validate a Semantic Scholar API key by making a test API call.
    Returns success or error message.
    """
    import httpx

    try:
        key = data.api_key.strip() if data.api_key else ""
        if not key:
            raise HTTPException(status_code=400, detail="API key cannot be empty")

        # Exercise the key against the paper-search endpoint with a minimal query.
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.get(
                "https://api.semanticscholar.org/graph/v1/paper/search",
                headers={
                    "Accept": "application/json",
                    "x-api-key": key
                },
                params={
                    "query": "test",
                    "limit": 1,
                    "fields": "title"
                },
            )

        status = response.status_code
        if status == 200:
            return {"valid": True, "message": "API key is valid"}
        if status in (401, 403):
            raise HTTPException(status_code=400, detail="Invalid API key")
        if status == 429:
            # Rate limited, which still proves the key was accepted.
            return {"valid": True, "message": "API key is valid (rate limited)"}
        raise HTTPException(
            status_code=400,
            detail=f"API validation failed with status {status}"
        )

    except HTTPException:
        raise
    except httpx.TimeoutException:
        raise HTTPException(status_code=400, detail="Connection timed out. Please try again.")
    except httpx.RequestError as e:
        logger.error(f"Semantic Scholar validation request error: {e}")
        raise HTTPException(status_code=400, detail=f"Connection error: {str(e)}")
    except Exception as e:
        logger.error(f"Semantic Scholar validation failed: {e}", exc_info=True)
        raise HTTPException(status_code=400, detail=f"Validation failed: {str(e)}")
1147
+
1148
+
1149
@app.get("/api/settings/semantic-scholar")
async def get_semantic_scholar_key_status():
    """Check if Semantic Scholar API key is configured (does not return the key)"""
    try:
        return {"has_key": await db.has_setting("semantic_scholar_api_key")}
    except Exception as exc:
        logger.error(f"Error checking Semantic Scholar key: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
1158
+
1159
+
1160
@app.put("/api/settings/semantic-scholar")
async def set_semantic_scholar_key(data: SemanticScholarKeyUpdate):
    """Set or update the Semantic Scholar API key"""
    try:
        # Reject blank or whitespace-only keys before touching the database.
        key = data.api_key.strip() if data.api_key else ""
        if not key:
            raise HTTPException(status_code=400, detail="API key cannot be empty")

        await db.set_setting("semantic_scholar_api_key", key)
        logger.info("Semantic Scholar API key updated")
        return {"message": "Semantic Scholar API key saved", "has_key": True}
    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Error saving Semantic Scholar key: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
1175
+
1176
+
1177
@app.delete("/api/settings/semantic-scholar")
async def delete_semantic_scholar_key():
    """Delete the Semantic Scholar API key"""
    try:
        # Drop the persisted key and report the new status to the client.
        await db.delete_setting("semantic_scholar_api_key")
        logger.info("Semantic Scholar API key deleted")
        return {"message": "Semantic Scholar API key deleted", "has_key": False}
    except Exception as exc:
        logger.error(f"Error deleting Semantic Scholar key: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
1187
+
1188
+
1189
+ # General Settings endpoints
1190
+
1191
class SettingUpdate(BaseModel):
    """Request body for PUT /api/settings/{setting_key}: the new value as a string."""
    value: str
1193
+
1194
+
1195
@app.get("/api/settings")
async def get_all_settings():
    """Get all application settings"""
    try:
        # Registry of known settings: default values plus UI metadata.
        settings_config = {
            "max_concurrent_checks": {
                "default": str(DEFAULT_MAX_CONCURRENT),
                "type": "number",
                "label": "Max Concurrent Checks",
                "description": "Maximum number of references to check simultaneously across all papers",
                "min": 1,
                "max": 20,
                "section": "Performance"
            }
        }

        # Merge stored values over the defaults.
        result = {}
        for name, meta in settings_config.items():
            stored = await db.get_setting(name)
            entry = {
                "value": meta["default"] if stored is None else stored,
                "default": meta["default"],
                "type": meta["type"],
                "label": meta["label"],
                "description": meta["description"],
                "section": meta["section"]
            }
            # Numeric settings also carry their allowed range.
            if meta["type"] == "number":
                entry["min"] = meta.get("min")
                entry["max"] = meta.get("max")
            result[name] = entry

        return result
    except Exception as exc:
        logger.error(f"Error getting settings: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
1233
+
1234
+
1235
@app.put("/api/settings/{setting_key}")
async def update_setting(setting_key: str, update: SettingUpdate):
    """Update a specific setting"""
    try:
        # Reject keys we do not recognize.
        if setting_key not in {"max_concurrent_checks"}:
            raise HTTPException(status_code=400, detail=f"Unknown setting: {setting_key}")

        if setting_key == "max_concurrent_checks":
            try:
                # Clamp into the accepted range.
                # NOTE(review): this clamps at 50 while the settings metadata in
                # get_all_settings advertises a max of 20 — confirm which limit
                # is intended.
                clamped = max(1, min(int(update.value), 50))

                # Update the global limiter immediately
                limiter = get_limiter()
                await limiter.set_max_concurrent(clamped)
                logger.info(f"Updated global concurrency limit to {clamped}")

                # Persist the validated value.
                await db.set_setting(setting_key, str(clamped))

                return {"key": setting_key, "value": str(clamped), "message": "Setting updated"}
            except ValueError:
                raise HTTPException(status_code=400, detail="max_concurrent_checks must be a number")

        # For other settings, just store the value
        await db.set_setting(setting_key, update.value)
        return {"key": setting_key, "value": update.value, "message": "Setting updated"}

    except HTTPException:
        raise
    except Exception as exc:
        logger.error(f"Error updating setting: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
1274
+
1275
+
1276
+ # Debug/Admin endpoints
1277
+
1278
@app.delete("/api/admin/cache")
async def clear_verification_cache():
    """Clear the verification cache"""
    try:
        removed = await db.clear_verification_cache()
        logger.info(f"Cleared {removed} entries from verification cache")
        return {"message": f"Cleared {removed} cached verification results", "count": removed}
    except Exception as exc:
        logger.error(f"Error clearing cache: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
1288
+
1289
+
1290
@app.delete("/api/admin/database")
async def clear_database():
    """Clear all data (cache + history) but keep settings and LLM configs"""
    try:
        # Purge cached verification results first.
        purged_cache = await db.clear_verification_cache()

        # Wipe the check history table and count the rows removed via
        # SQLite's per-connection changes() counter.
        async with aiosqlite.connect(db.db_path) as conn:
            await conn.execute("DELETE FROM check_history")
            await conn.commit()
            cursor = await conn.execute("SELECT changes()")
            row = await cursor.fetchone()
            purged_history = row[0] if row else 0

        logger.info(f"Cleared database: {purged_cache} cache entries, {purged_history} history entries")
        return {
            "message": f"Cleared {purged_cache} cache entries and {purged_history} history entries",
            "cache_count": purged_cache,
            "history_count": purged_history
        }
    except Exception as exc:
        logger.error(f"Error clearing database: {exc}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(exc))
1314
+
1315
+
1316
# Mount static files for bundled frontend (if available)
# This must be after all API routes to avoid conflicts
if STATIC_DIR.exists() and (STATIC_DIR / "index.html").exists():
    # Serve hashed JS/CSS bundles from /assets when the directory is present.
    if (STATIC_DIR / "assets").exists():
        app.mount("/assets", StaticFiles(directory=str(STATIC_DIR / "assets")), name="assets")

    @app.get("/favicon.svg")
    async def favicon():
        """Serve favicon"""
        icon = STATIC_DIR / "favicon.svg"
        if not icon.exists():
            raise HTTPException(status_code=404)
        return FileResponse(str(icon), media_type="image/svg+xml")

    @app.get("/{full_path:path}")
    async def serve_spa(request: Request, full_path: str):
        """
        Serve the SPA frontend for all non-API routes.
        This enables client-side routing.
        """
        # Don't serve SPA for API routes (they're handled above)
        if full_path.startswith("api/"):
            raise HTTPException(status_code=404, detail="API endpoint not found")

        # Serve the exact static file when one matches the request path.
        candidate = STATIC_DIR / full_path
        if candidate.is_file():
            known_types = {
                ".html": "text/html",
                ".css": "text/css",
                ".js": "application/javascript",
                ".json": "application/json",
                ".png": "image/png",
                ".jpg": "image/jpeg",
                ".jpeg": "image/jpeg",
                ".svg": "image/svg+xml",
                ".ico": "image/x-icon",
            }
            mime = known_types.get(candidate.suffix.lower(), "application/octet-stream")
            return FileResponse(str(candidate), media_type=mime)

        # Everything else falls through to index.html (SPA routing).
        return FileResponse(str(STATIC_DIR / "index.html"), media_type="text/html")
1363
+
1364
+
1365
if __name__ == "__main__":
    # Allow running this module directly: serve the app with uvicorn
    # on all interfaces, port 8000.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)