academic-refchecker 2.0.12__py3-none-any.whl → 2.0.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/METADATA +1 -1
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/RECORD +17 -17
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/WHEEL +1 -1
- backend/main.py +33 -5
- backend/refchecker_wrapper.py +42 -1
- backend/thumbnail.py +117 -0
- refchecker/__version__.py +1 -1
- refchecker/checkers/arxiv_citation.py +181 -49
- refchecker/checkers/enhanced_hybrid_checker.py +117 -4
- refchecker/checkers/semantic_scholar.py +43 -1
- refchecker/llm/base.py +1 -15
- refchecker/llm/providers.py +102 -113
- refchecker/utils/author_utils.py +15 -2
- refchecker/utils/bibliography_utils.py +2 -2
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.12.dist-info → academic_refchecker-2.0.14.dist-info}/top_level.txt +0 -0
refchecker/checkers/arxiv_citation.py
CHANGED

@@ -8,8 +8,8 @@ for papers found on ArXiv, as it reflects the author-submitted metadata.

 Key features:
 - Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
-
-
+- Checks reference against all historical versions when latest doesn't match
+- Annotates errors with version info when reference matches an older version
 - Parses BibTeX to extract normalized metadata matching refchecker schema

 Usage:
@@ -30,6 +30,7 @@ Usage:
 import re
 import logging
 import requests
+import html
 from typing import Dict, List, Tuple, Optional, Any

 import bibtexparser
@@ -88,6 +89,8 @@ class ArXivCitationChecker:
         # export.arxiv.org URLs
         r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
         r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+        # DOI format
+        r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?"
     ]

     def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
@@ -107,6 +110,8 @@ class ArXivCitationChecker:
             reference.get('cited_url', ''),
             reference.get('raw_text', ''),
             reference.get('eprint', ''),  # BibTeX field
+            reference.get('journal', ''),
+            reference.get('doi', ''),  # DOI field (may contain arXiv ID)
         ]

         for source in sources:
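Taken together, the two additions above let the checker recover an arXiv ID from a reference whose only identifier is a DataCite-style DOI such as 10.48550/arXiv.NNNN.NNNNN. A minimal standalone sketch of that extraction, reusing the pattern added in this release; the helper name, the IGNORECASE flag, and the sample DOIs are illustrative assumptions rather than code from the package:

import re

# Pattern copied from the hunk above; IGNORECASE is assumed here so that
# "arXiv." inside a DOI matches (the package may normalize case differently).
ARXIV_IN_DOI = re.compile(r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?", re.IGNORECASE)

def extract_arxiv_id_from_doi(doi):
    """Return (arxiv_id, version) if the DOI embeds an arXiv identifier."""
    match = ARXIV_IN_DOI.search(doi)
    if not match:
        return None, None
    return match.group(1), match.group(2)

print(extract_arxiv_id_from_doi("10.48550/arXiv.2301.12345"))  # ('2301.12345', None)
print(extract_arxiv_id_from_doi("10.1145/3292500.3330701"))    # (None, None)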
@@ -324,35 +329,133 @@ class ArXivCitationChecker:

         return None

-    def
+    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
         """
-
+        Check if a reference is an ArXiv paper.

-
-
+        Args:
+            reference: Reference dictionary
+
+        Returns:
+            True if reference appears to be an ArXiv paper
+        """
+        arxiv_id, _ = self.extract_arxiv_id(reference)
+        return arxiv_id is not None
+
+    def _fetch_version_metadata_from_html(self, arxiv_id: str, version_num: int) -> Optional[Dict[str, Any]]:
+        """
+        Fetch and parse metadata for a specific version using HTML scraping.

         Args:
             arxiv_id: ArXiv ID without version
+            version_num: Version number to fetch (1, 2, 3, etc.)

         Returns:
-
+            Dictionary with version metadata or None if version doesn't exist
         """
-
-
-
-
-
+        version_str = f"v{version_num}"
+        url = f"{self.abs_url}/{arxiv_id}{version_str}"
+
+        self.rate_limiter.wait()
+        try:
+            logger.debug(f"Checking historical version: {url}")
+            response = requests.get(url, timeout=self.timeout)
+            if response.status_code == 404:
+                return None  # Version does not exist
+            response.raise_for_status()
+            html_content = response.text
+
+            # Parse meta tags for metadata
+            # Title
+            title_match = re.search(r'<meta name="citation_title" content="(.*?)"', html_content)
+            title = html.unescape(title_match.group(1)).strip() if title_match else ""
+
+            # Authors
+            authors = []
+            for auth in re.findall(r'<meta name="citation_author" content="(.*?)"', html_content):
+                authors.append(html.unescape(auth).strip())
+
+            # Date/Year
+            date_match = re.search(r'<meta name="citation_date" content="(.*?)"', html_content)
+            year = None
+            if date_match:
+                ym = re.search(r'^(\d{4})', date_match.group(1))
+                if ym:
+                    year = int(ym.group(1))
+
+            return {
+                'version': version_str,
+                'version_num': version_num,
+                'title': title,
+                'authors': [{'name': a} for a in authors],
+                'year': year,
+                'url': url,
+            }
+        except Exception as e:
+            logger.warning(f"Failed to fetch history {version_str}: {e}")
+            return None
+
+    def _get_latest_version_number(self, arxiv_id: str) -> Optional[int]:
         """
-
+        Get the latest version number by fetching the abstract page.

         Args:
-
+            arxiv_id: ArXiv ID without version

         Returns:
-
+            Latest version number as integer, or None if couldn't determine
         """
-
-
+        url = f"{self.abs_url}/{arxiv_id}"
+
+        self.rate_limiter.wait()
+        try:
+            response = requests.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Look for version links like "[v1]", "[v2]", etc.
+            versions = re.findall(r'\[v(\d+)\]', response.text)
+            if versions:
+                return max(int(v) for v in versions)
+            return None
+        except Exception as e:
+            logger.warning(f"Failed to get latest version for {arxiv_id}: {e}")
+            return None
+
+    def _compare_info_match(
+        self, cited_title: str, cited_authors: List[str], cited_year: Optional[int],
+        authoritative_title: str, authoritative_authors: List[str], authoritative_year: Optional[int]) -> bool:
+        """
+        Compare the information of a cited paper with the authoritative information.
+
+        Args:
+            cited_title: Title from the reference
+            cited_authors: Authors from the reference
+            cited_year: Year from the reference
+            authoritative_title: Title from ArXiv version
+            authoritative_authors: Authors from ArXiv version
+            authoritative_year: Year from ArXiv version
+
+        Returns:
+            True if the information matches, False otherwise.
+        """
+        # Compare title
+        if cited_title and authoritative_title:
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+            if title_similarity < SIMILARITY_THRESHOLD:
+                return False
+
+        # Compare authors
+        if cited_authors and authoritative_authors:
+            authors_match, _ = compare_authors(cited_authors, authoritative_authors)
+            if not authors_match:
+                return False
+
+        # Compare year
+        if cited_year and authoritative_year:
+            if cited_year != authoritative_year:
+                return False
+
+        return True

     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
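The new _fetch_version_metadata_from_html helper recovers a version's title, authors, and year from the citation_* meta tags on the abs page. A self-contained sketch of just that parsing step, run against an invented HTML fragment (the markup and values below are made up, not real arXiv output):

import html
import re

sample = '''
<meta name="citation_title" content="An Example Paper: Methods &amp; Results"/>
<meta name="citation_author" content="Doe, Jane"/>
<meta name="citation_author" content="Smith, John"/>
<meta name="citation_date" content="2023/05/12"/>
'''

# Same meta-tag patterns as the helper above, applied to the sample string
title_match = re.search(r'<meta name="citation_title" content="(.*?)"', sample)
title = html.unescape(title_match.group(1)).strip() if title_match else ""

authors = [html.unescape(a).strip()
           for a in re.findall(r'<meta name="citation_author" content="(.*?)"', sample)]

date_match = re.search(r'<meta name="citation_date" content="(.*?)"', sample)
year = int(re.match(r'(\d{4})', date_match.group(1)).group(1)) if date_match else None

print(title)    # An Example Paper: Methods & Results
print(authors)  # ['Doe, Jane', 'Smith, John']
print(year)     # 2023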
@@ -360,10 +463,10 @@ class ArXivCitationChecker:

         This method:
         1. Extracts the ArXiv ID from the reference
-        2. Fetches the official BibTeX from ArXiv (
-        3.
-        4.
-        5.
+        2. Fetches the official BibTeX from ArXiv (latest version)
+        3. Compares cited metadata against latest version
+        4. If errors found, checks historical versions to find a match
+        5. Annotates errors with version info if reference matches an older version

         Args:
             reference: Reference dictionary with title, authors, year, url, etc.
@@ -385,34 +488,26 @@ class ArXivCitationChecker:

         logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")

-        #
+        # Extract information from reference for comparison
+        cited_title = reference.get('title', '').strip()
+        cited_authors = reference.get('authors', [])
+        cited_year = reference.get('year')
+
+        # Fetch authoritative BibTeX (latest version)
         bibtex_content = self.fetch_bibtex(arxiv_id)

         if not bibtex_content:
             logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
             return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None

-
-        verified_data = self.parse_bibtex(bibtex_content)
+        latest_data = self.parse_bibtex(bibtex_content)

-        if not
+        if not latest_data:
             logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
             return None, [], None
-
-        #
-
-        # ArXiv BibTeX always returns latest version metadata
-        # We don't know the actual latest version number without additional API call,
-        # but we can warn that a specific version was cited
-        errors.append({
-            'warning_type': 'version',
-            'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
-        })
-        logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
-
-        # Compare title
-        cited_title = reference.get('title', '').strip()
-        authoritative_title = verified_data.get('title', '').strip()
+
+        # Compare against latest version
+        authoritative_title = latest_data.get('title', '').strip()

         if cited_title and authoritative_title:
             title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
@@ -426,9 +521,8 @@ class ArXivCitationChecker:
                 })

         # Compare authors
-        cited_authors = reference.get('authors', [])
         if cited_authors:
-            authoritative_authors =
+            authoritative_authors = latest_data.get('authors', [])
             authors_match, author_error = compare_authors(cited_authors, authoritative_authors)

             if not authors_match:
@@ -440,9 +534,7 @@ class ArXivCitationChecker:
                 })

         # Compare year
-
-        authoritative_year = verified_data.get('year')
-
+        authoritative_year = latest_data.get('year')
         year_warning = validate_year(
             cited_year=cited_year,
             paper_year=authoritative_year,
@@ -451,10 +543,50 @@ class ArXivCitationChecker:
         )
         if year_warning:
             errors.append(year_warning)
-
-        # Build URL
+
         paper_url = f"https://arxiv.org/abs/{arxiv_id}"

-
+        # If no errors against latest version, we're done
+        if len(errors) == 0:
+            logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with no errors")
+            return latest_data, errors, paper_url
+
+        # Check if reference matches a historical version
+        # Get latest version number first
+        latest_version_num = self._get_latest_version_number(arxiv_id)
+
+        if latest_version_num and latest_version_num > 1:
+            # Check historical versions (1 to latest-1)
+            for version_num in range(1, latest_version_num):
+                version_data = self._fetch_version_metadata_from_html(arxiv_id, version_num)
+                if not version_data:
+                    continue
+
+                # Check if reference matches this historical version
+                if self._compare_info_match(
+                        cited_title, cited_authors, cited_year,
+                        version_data['title'], version_data['authors'], version_data['year']):
+
+                    logger.debug(f"ArXivCitationChecker: Reference matches historical version v{version_num}")
+
+                    # Convert errors to warnings with version update info
+                    # Version update issues are informational, not errors - the citation was correct for its time
+                    version_suffix = f" (v{version_num} vs v{latest_version_num} update)"
+                    warnings = []
+                    for error in errors:
+                        warning = {
+                            'warning_type': error.get('error_type', 'unknown') + version_suffix,
+                            'warning_details': error.get('error_details', ''),
+                        }
+                        # Preserve correction hints
+                        for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
+                            if key in error:
+                                warning[key] = error[key]
+                        warnings.append(warning)
+
+                    # Return with warnings instead of errors - URL points to the matched version
+                    matched_url = f"https://arxiv.org/abs/{arxiv_id}v{version_num}"
+                    return latest_data, warnings, matched_url

-
+        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+        return latest_data, errors, paper_url
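When a reference lines up with an older arXiv version, verify_reference now downgrades the accumulated errors into warnings tagged with the version gap while keeping the correction hints. A toy walk-through of that conversion (the error dicts and version numbers below are invented for demonstration):

# Pretend these errors were raised against the latest version
errors = [
    {"error_type": "title", "error_details": "title mismatch", "ref_title_correct": "Old Title"},
    {"error_type": "year", "error_details": "cited 2022, latest says 2023"},
]
version_num, latest_version_num = 1, 3

version_suffix = f" (v{version_num} vs v{latest_version_num} update)"
warnings = []
for error in errors:
    warning = {
        "warning_type": error.get("error_type", "unknown") + version_suffix,
        "warning_details": error.get("error_details", ""),
    }
    # Correction hints survive the conversion so a UI can still offer fixes
    for key in ("ref_title_correct", "ref_authors_correct", "ref_year_correct"):
        if key in error:
            warning[key] = error[key]
    warnings.append(warning)

print(warnings[0]["warning_type"])  # title (v1 vs v3 update)
print(warnings[0]["ref_title_correct"])  # Old Title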
refchecker/checkers/enhanced_hybrid_checker.py
CHANGED

@@ -257,6 +257,90 @@ class EnhancedHybridReferenceChecker:

         return True

+    def _merge_arxiv_with_semantic_scholar(
+        self,
+        arxiv_data: Dict[str, Any],
+        arxiv_errors: List[Dict[str, Any]],
+        arxiv_url: str,
+        ss_data: Dict[str, Any],
+        ss_errors: List[Dict[str, Any]],
+        ss_url: str,
+        reference: Dict[str, Any]
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Merge ArXiv verification results with Semantic Scholar data.
+
+        ArXiv is authoritative for title/author/year, but Semantic Scholar
+        provides venue information and additional URLs (DOI, S2 page).
+
+        Args:
+            arxiv_data: Verified data from ArXiv
+            arxiv_errors: Errors/warnings from ArXiv verification
+            arxiv_url: ArXiv URL
+            ss_data: Data from Semantic Scholar
+            ss_errors: Errors from Semantic Scholar (used for venue checking)
+            ss_url: Semantic Scholar URL
+            reference: Original reference
+
+        Returns:
+            Tuple of (merged_data, merged_errors)
+        """
+        merged_data = dict(arxiv_data) if arxiv_data else {}
+        merged_errors = list(arxiv_errors) if arxiv_errors else []
+
+        if not ss_data:
+            return merged_data, merged_errors
+
+        # Add Semantic Scholar URL to external IDs
+        if 'externalIds' not in merged_data:
+            merged_data['externalIds'] = {}
+
+        ss_external_ids = ss_data.get('externalIds', {})
+
+        # Add S2 paper ID
+        if ss_data.get('paperId'):
+            merged_data['externalIds']['S2PaperId'] = ss_data['paperId']
+
+        # Add DOI if available from Semantic Scholar
+        if ss_external_ids.get('DOI') and not merged_data['externalIds'].get('DOI'):
+            merged_data['externalIds']['DOI'] = ss_external_ids['DOI']
+
+        # Store Semantic Scholar URL
+        merged_data['_semantic_scholar_url'] = ss_url
+
+        # Check for venue mismatch - if paper was published at a venue but citation only says arXiv
+        ss_venue = ss_data.get('venue', '')
+        cited_venue = reference.get('venue', reference.get('journal', '')).strip().lower()
+
+        # Normalize ArXiv venue names
+        is_cited_as_arxiv = (
+            not cited_venue or
+            cited_venue in ['arxiv', 'arxiv preprint', 'arxiv.org', 'preprint']
+        )
+
+        # Check if Semantic Scholar shows a real publication venue
+        if ss_venue and is_cited_as_arxiv:
+            # Ignore generic/empty venues
+            ss_venue_lower = ss_venue.lower().strip()
+            is_real_venue = (
+                ss_venue_lower and
+                ss_venue_lower not in ['arxiv', 'arxiv.org', 'preprint', ''] and
+                not ss_venue_lower.startswith('arxiv')
+            )
+
+            if is_real_venue:
+                # This paper was published at a venue but is only cited as arXiv
+                logger.debug(f"Enhanced Hybrid: Paper published at '{ss_venue}' but cited as arXiv")
+                merged_errors.append({
+                    'warning_type': 'venue',
+                    'warning_details': f"Paper was published at venue but cited as arXiv preprint:\n  cited: arXiv\n  actual: {ss_venue}",
+                    'ref_venue_correct': ss_venue
+                })
+                # Also add the venue to merged data
+                merged_data['venue'] = ss_venue
+
+        return merged_data, merged_errors
+
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
         Verify a non-arXiv reference using multiple APIs in priority order
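The effect of the merge is easiest to see on toy inputs: ArXiv stays the source of record for title/author/year, while the Semantic Scholar hit contributes its paper ID, a DOI, and a published venue that becomes a venue warning when the citation only says arXiv. A condensed, standalone illustration of that behaviour (every dictionary below is an invented stand-in, not a real API response):

arxiv_data = {"title": "An Example Paper", "year": 2023, "externalIds": {"ArXiv": "2301.12345"}}
ss_data = {"paperId": "abc123", "venue": "ACL", "externalIds": {"DOI": "10.18653/v1/2023.acl-long.1"}}
reference = {"title": "An Example Paper", "journal": "arXiv preprint"}

# ArXiv data is the base; Semantic Scholar only adds identifiers and venue
merged = dict(arxiv_data)
merged.setdefault("externalIds", {})
merged["externalIds"]["S2PaperId"] = ss_data["paperId"]
merged["externalIds"].setdefault("DOI", ss_data["externalIds"]["DOI"])

warnings = []
cited_venue = reference.get("venue", reference.get("journal", "")).strip().lower()
if ss_data["venue"] and cited_venue in ("", "arxiv", "arxiv preprint", "arxiv.org", "preprint"):
    warnings.append({
        "warning_type": "venue",
        "warning_details": f"Published at {ss_data['venue']} but cited as arXiv preprint",
        "ref_venue_correct": ss_data["venue"],
    })
    merged["venue"] = ss_data["venue"]

print(merged["externalIds"])  # {'ArXiv': '2301.12345', 'S2PaperId': 'abc123', 'DOI': '10.18653/v1/2023.acl-long.1'}
print(warnings[0]["warning_type"])  # venue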
@@ -287,6 +371,9 @@ class EnhancedHybridReferenceChecker:
         # Track all APIs that failed and could be retried
         failed_apis = []

+        # Store ArXiv result for potential merging with Semantic Scholar
+        arxiv_result = None
+
         # PHASE 1: Try all APIs once in priority order

         # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
@@ -295,13 +382,15 @@ class EnhancedHybridReferenceChecker:
             logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
             verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
             if success:
-                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded
-
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded, also querying Semantic Scholar for venue/URLs")
+                arxiv_result = (verified_data, errors, url)
+                # Continue to Semantic Scholar to get venue and additional URLs
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))

         # Strategy 1: Always try local database first (fastest)
-        if
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
+        if self.local_db and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
             if success:
                 return verified_data, errors, url
@@ -309,8 +398,9 @@ class EnhancedHybridReferenceChecker:
                 failed_apis.append(('local_db', self.local_db, failure_type))

         # Strategy 2: If reference has DOI, prioritize CrossRef
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
         crossref_result = None
-        if self._should_try_doi_apis_first(reference) and self.crossref:
+        if self._should_try_doi_apis_first(reference) and self.crossref and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
             if success:
                 # Check if the data is complete enough to use
@@ -327,11 +417,34 @@ class EnhancedHybridReferenceChecker:
         if self.semantic_scholar:
             verified_data, errors, url, success, failure_type = self._try_api('semantic_scholar', self.semantic_scholar, reference)
             if success:
+                # If we have ArXiv result, merge Semantic Scholar venue/URLs into it
+                if arxiv_result:
+                    # Check if SS data is valid and venue is not just arxiv
+                    # (skip merge if SS only found the arxiv version, no published venue)
+                    if verified_data:
+                        ss_venue = self.semantic_scholar.get_venue_from_paper_data(verified_data)
+                        if ss_venue and 'arxiv' in ss_venue.lower():
+                            # SS only found arxiv venue, skip merge and return arxiv result
+                            logger.debug("Enhanced Hybrid: Semantic Scholar only found ArXiv venue, skipping merge")
+                            return arxiv_result
+
+                    arxiv_data, arxiv_errors, arxiv_url = arxiv_result
+                    merged_data, merged_errors = self._merge_arxiv_with_semantic_scholar(
+                        arxiv_data, arxiv_errors, arxiv_url,
+                        verified_data, errors, url,
+                        reference
+                    )
+                    return merged_data, merged_errors, arxiv_url
                 return verified_data, errors, url
             # For Semantic Scholar, only retry retryable failures (not 'not_found')
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('semantic_scholar', self.semantic_scholar, failure_type))

+        # If ArXiv succeeded but Semantic Scholar failed, return ArXiv result
+        if arxiv_result:
+            logger.debug("Enhanced Hybrid: Returning ArXiv result (Semantic Scholar unavailable)")
+            return arxiv_result
+
         # Strategy 4: Try OpenAlex API (excellent reliability, replaces Google Scholar)
         openalex_result = None
         if self.openalex:
refchecker/checkers/semantic_scholar.py
CHANGED

@@ -223,7 +223,49 @@ class NonArxivReferenceChecker:
         """
         return compare_authors(cited_authors, correct_authors)

-
+    def get_venue_from_paper_data(self, paper_data: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract venue from paper data dictionary.
+
+        Checks multiple fields since Semantic Scholar returns venue info
+        in different fields depending on publication type.
+
+        Args:
+            paper_data: Paper data dictionary from Semantic Scholar
+
+        Returns:
+            Venue string or None if not found
+        """
+        if not paper_data:
+            return None
+
+        paper_venue = None
+
+        # First try the simple 'venue' field (string)
+        if paper_data.get('venue'):
+            paper_venue = paper_data.get('venue')
+
+        # If no venue, try publicationVenue object
+        if not paper_venue and paper_data.get('publicationVenue'):
+            pub_venue = paper_data.get('publicationVenue')
+            if isinstance(pub_venue, dict):
+                paper_venue = pub_venue.get('name', '')
+            elif isinstance(pub_venue, str):
+                paper_venue = pub_venue
+
+        # If still no venue, try journal object
+        if not paper_venue and paper_data.get('journal'):
+            journal = paper_data.get('journal')
+            if isinstance(journal, dict):
+                paper_venue = journal.get('name', '')
+            elif isinstance(journal, str):
+                paper_venue = journal
+
+        # Ensure paper_venue is a string
+        if paper_venue and not isinstance(paper_venue, str):
+            paper_venue = str(paper_venue)
+
+        return paper_venue if paper_venue else None

     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
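get_venue_from_paper_data tries the flat venue string first, then the publicationVenue object, then the journal object. A standalone re-implementation of that lookup order for illustration; the sample payloads below are invented and only mimic the shape of Semantic Scholar responses:

from typing import Any, Dict, Optional

def venue_from_paper_data(paper_data: Dict[str, Any]) -> Optional[str]:
    # Same fallback chain as the new method, condensed for readability
    if not paper_data:
        return None
    venue = paper_data.get("venue")
    if not venue and isinstance(paper_data.get("publicationVenue"), dict):
        venue = paper_data["publicationVenue"].get("name")
    if not venue and isinstance(paper_data.get("journal"), dict):
        venue = paper_data["journal"].get("name")
    return str(venue) if venue else None

print(venue_from_paper_data({"venue": "NeurIPS"}))                            # NeurIPS
print(venue_from_paper_data({"publicationVenue": {"name": "ICML"}}))          # ICML
print(venue_from_paper_data({"journal": {"name": "Nature Communications"}}))  # Nature Communications
print(venue_from_paper_data({}))                                              # None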
refchecker/llm/base.py
CHANGED

@@ -110,21 +110,7 @@ class LLMProvider(ABC):

         logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
         return chunks
-
-    def _parse_llm_response(self, response_text: str) -> List[str]:
-        """Parse LLM response and extract individual references"""
-        if not response_text:
-            return []
-
-        # Split by newlines and filter out empty lines
-        references = []
-        for line in response_text.strip().split('\n'):
-            line = line.strip()
-            if line and not line.startswith('#') and len(line) > 10:  # Basic filtering
-                references.append(line)
-
-        return references
-
+
     def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
         """
         Template method that handles chunking for all providers.