academic-refchecker 2.0.14__py3-none-any.whl → 2.0.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/METADATA +1 -1
- {academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/RECORD +14 -11
- backend/refchecker_wrapper.py +4 -3
- backend/static/assets/index-B92lKsA8.js +25 -0
- backend/static/assets/index-BuguAhjS.css +1 -0
- backend/static/assets/index-DMZJNrR0.js +25 -0
- backend/static/index.html +2 -2
- refchecker/__version__.py +1 -1
- refchecker/checkers/arxiv_citation.py +125 -35
- refchecker/checkers/semantic_scholar.py +263 -0
- {academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/WHEEL +0 -0
- {academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/top_level.txt +0 -0
backend/static/index.html
CHANGED
@@ -6,8 +6,8 @@
     <meta name="viewport" content="width=device-width, initial-scale=1.0" />
     <title>RefChecker - Academic Reference Validator</title>
    <meta name="description" content="Validate academic paper references using AI-powered verification" />
-    <script type="module" crossorigin src="/assets/index-
-    <link rel="stylesheet" crossorigin href="/assets/index-
+    <script type="module" crossorigin src="/assets/index-DMZJNrR0.js"></script>
+    <link rel="stylesheet" crossorigin href="/assets/index-BuguAhjS.css">
   </head>
   <body>
     <div id="root"></div>
refchecker/__version__.py
CHANGED

refchecker/checkers/arxiv_citation.py
CHANGED
@@ -31,6 +31,7 @@ import re
 import logging
 import requests
 import html
+import time
 from typing import Dict, List, Tuple, Optional, Any

 import bibtexparser
@@ -356,7 +357,13 @@ class ArXivCitationChecker:
         version_str = f"v{version_num}"
         url = f"{self.abs_url}/{arxiv_id}{version_str}"

+        # Use shorter delay for version metadata (HTML parsing is lightweight)
+        # Save original delay, use 1 second, then restore
+        original_delay = self.rate_limiter.delay
+        self.rate_limiter.delay = 1.0  # Faster rate for version checking
         self.rate_limiter.wait()
+        self.rate_limiter.delay = original_delay  # Restore original delay
+
         try:
             logger.debug(f"Checking historical version: {url}")
             response = requests.get(url, timeout=self.timeout)
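Note: the hunk above lowers the shared limiter's delay and restores it on the line after wait(), so an exception raised inside wait() would leave the limiter at the faster rate. A minimal exception-safe alternative is sketched below; it assumes only that the rate limiter exposes a mutable delay attribute, as the diff suggests, and the temporary_delay helper is hypothetical rather than part of the package.

from contextlib import contextmanager

@contextmanager
def temporary_delay(rate_limiter, delay: float):
    """Temporarily override a rate limiter's delay, restoring it even on error."""
    original = rate_limiter.delay
    rate_limiter.delay = delay
    try:
        yield rate_limiter
    finally:
        rate_limiter.delay = original  # restored even if wait() or the request raises

# Hypothetical usage mirroring the hunk above:
#     with temporary_delay(self.rate_limiter, 1.0):
#         self.rate_limiter.wait()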
@@ -421,16 +428,59 @@ class ArXivCitationChecker:
            logger.warning(f"Failed to get latest version for {arxiv_id}: {e}")
            return None

+    def _calculate_match_score(
+            self, cited_title: str, cited_authors: List[str],
+            authoritative_title: str, authoritative_authors: List[Dict]) -> float:
+        """
+        Calculate a numeric match score between cited reference and authoritative data.
+
+        Used to find the BEST matching historical version, not just the first one that
+        passes a threshold.
+
+        Args:
+            cited_title: Title from the reference
+            cited_authors: Authors from the reference
+            authoritative_title: Title from ArXiv version
+            authoritative_authors: Authors from ArXiv version
+
+        Returns:
+            A score between 0.0 and 1.0 where higher is better.
+        """
+        if not cited_title or not authoritative_title:
+            return 0.0
+
+        # Primary: Title similarity (weighted at 80%)
+        title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+
+        # Secondary: Author count match (weighted at 20%)
+        author_score = 0.0
+        if cited_authors and authoritative_authors:
+            cited_count = len(cited_authors)
+            auth_count = len(authoritative_authors)
+            if cited_count == auth_count:
+                author_score = 1.0
+            elif abs(cited_count - auth_count) == 1:
+                author_score = 0.7
+            elif abs(cited_count - auth_count) == 2:
+                author_score = 0.4
+            else:
+                author_score = 0.1
+
+        # Weighted combination
+        return 0.8 * title_similarity + 0.2 * author_score
+
     def _compare_info_match(
            self, cited_title: str, cited_authors: List[str], cited_year: Optional[int],
-            authoritative_title: str, authoritative_authors: List[
+            authoritative_title: str, authoritative_authors: List[Dict], authoritative_year: Optional[int]) -> bool:
        """
        Compare the information of a cited paper with the authoritative information.

+        Uses title as the primary matching criterion. Authors are used as a secondary
+        check, but year is not required to match (year often has discrepancies).
+
        Args:
            cited_title: Title from the reference
            cited_authors: Authors from the reference
-            cited_year: Year from the reference
            authoritative_title: Title from ArXiv version
            authoritative_authors: Authors from ArXiv version
            authoritative_year: Year from ArXiv version
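For intuition, here is a standalone sketch of the 80/20 weighting introduced above, with difflib's ratio standing in for the package's compare_titles_with_latex_cleaning; the titles and author lists are made up.

from difflib import SequenceMatcher

def match_score(cited_title, cited_authors, auth_title, auth_authors):
    # difflib is only a stand-in for compare_titles_with_latex_cleaning
    title_sim = SequenceMatcher(None, cited_title.lower(), auth_title.lower()).ratio()
    count_diff = abs(len(cited_authors) - len(auth_authors))
    author_score = {0: 1.0, 1: 0.7, 2: 0.4}.get(count_diff, 0.1)
    return 0.8 * title_sim + 0.2 * author_score

# Identical titles, one author missing from the citation:
# 0.8 * 1.0 + 0.2 * 0.7 ≈ 0.94, comfortably above a typical similarity threshold
print(match_score("Attention Is All You Need", ["A", "B", "C"],
                  "Attention Is All You Need", ["A", "B", "C", "D"]))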
@@ -438,22 +488,31 @@ class ArXivCitationChecker:
        Returns:
            True if the information matches, False otherwise.
        """
-        #
+        # Primary criterion: Title MUST match
        if cited_title and authoritative_title:
            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
            if title_similarity < SIMILARITY_THRESHOLD:
                return False
+        else:
+            # If no title to compare, can't determine match
+            return False

-        #
+        # Secondary criterion: If authors are provided, they should reasonably match
+        # (be lenient - allow partial matches since author lists can vary)
        if cited_authors and authoritative_authors:
            authors_match, _ = compare_authors(cited_authors, authoritative_authors)
-
+            # If authors don't match at all, this might not be the right version
+            # But be lenient - just having similar author count is a good sign
+            cited_count = len(cited_authors)
+            auth_count = len(authoritative_authors)
+            # Allow if authors match OR if author counts are within 1 of each other
+            if not authors_match and abs(cited_count - auth_count) > 1:
                return False

-        #
-
-
-
+        # Year is NOT used as a matching criterion because:
+        # 1. ArXiv shows submission date, citations often use publication year
+        # 2. People often cite with incorrect years
+        # 3. The same ArXiv version can be cited with different years

        return True
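Restated as a predicate (a hypothetical helper, not code from the package), the secondary author check accepts a version unless both the name comparison fails and the author counts differ by more than one:

def passes_author_check(authors_match: bool, cited_count: int, auth_count: int) -> bool:
    # Mirrors the leniency rule in the hunk above
    return authors_match or abs(cited_count - auth_count) <= 1

print(passes_author_check(False, 8, 9))   # True  - counts within 1 of each other
print(passes_author_check(False, 8, 11))  # False - names and counts both disagree
print(passes_author_check(True, 8, 11))   # True  - compare_authors already matched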
@@ -556,37 +615,68 @@ class ArXivCitationChecker:
        latest_version_num = self._get_latest_version_number(arxiv_id)

        if latest_version_num and latest_version_num > 1:
+            # Find the BEST matching version, not just the first one
+            best_match_version = None
+            best_match_score = 0.0
+            best_match_data = None
+
+            # Add timeout for version checking (30 seconds max)
+            # This prevents blocking when rate-limited with many concurrent ArXiv requests
+            version_check_start = time.time()
+            VERSION_CHECK_TIMEOUT = 30.0
+
            # Check historical versions (1 to latest-1)
-
+            # Start from newest historical version (more likely to match recent citations)
+            for version_num in range(latest_version_num - 1, 0, -1):
+                # Check if we've exceeded the version checking timeout
+                if time.time() - version_check_start > VERSION_CHECK_TIMEOUT:
+                    logger.debug(f"ArXivCitationChecker: Version checking timed out after {VERSION_CHECK_TIMEOUT}s")
+                    break
+
                version_data = self._fetch_version_metadata_from_html(arxiv_id, version_num)
                if not version_data:
                    continue

-                # … (25 removed lines of the previous first-match logic, truncated in this diff view)
+                # Calculate match score for this version
+                match_score = self._calculate_match_score(
+                    cited_title, cited_authors,
+                    version_data['title'], version_data['authors'])
+
+                if match_score > best_match_score:
+                    best_match_score = match_score
+                    best_match_version = version_num
+                    best_match_data = version_data
+
+                # Early termination: if we found an excellent match (>= 0.98), stop searching
+                # This saves HTTP requests when we've found a near-perfect version match
+                if best_match_score >= 0.98:
+                    logger.debug(f"ArXivCitationChecker: Found excellent version match v{best_match_version} (score: {best_match_score:.3f}), stopping search")
+                    break
+
+            # If we found a matching version (above threshold), convert errors to warnings
+            if best_match_version and best_match_score >= SIMILARITY_THRESHOLD:
+                logger.debug(f"ArXivCitationChecker: Reference best matches historical version v{best_match_version} (score: {best_match_score:.3f})")
+
+                # Convert errors to warnings with version update info
+                # Version update issues are informational, not errors - the citation was correct for its time
+                version_suffix = f" (v{best_match_version} vs v{latest_version_num} update)"
+                warnings = []
+                for error in errors:
+                    # Get the error/warning type - handle both error_type and warning_type
+                    err_type = error.get('error_type') or error.get('warning_type', 'unknown')
+                    warning = {
+                        'warning_type': err_type + version_suffix,
+                        'warning_details': error.get('error_details') or error.get('warning_details', ''),
+                    }
+                    # Preserve correction hints
+                    for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
+                        if key in error:
+                            warning[key] = error[key]
+                    warnings.append(warning)
+
+                # Return with warnings instead of errors - URL points to the matched version
+                matched_url = f"https://arxiv.org/abs/{arxiv_id}v{best_match_version}"
+                return latest_data, warnings, matched_url

        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
        return latest_data, errors, paper_url
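To make the error-to-warning conversion concrete, here is a hedged example of what the loop above would produce for one error entry; only the key names come from the hunk, the field values are illustrative. With best_match_version = 1 and latest_version_num = 3:

error = {
    'error_type': 'author',
    'error_details': 'Author list differs from the latest arXiv version',
    'ref_authors_correct': 'A. Author, B. Author',
}

# becomes

warning = {
    'warning_type': 'author (v1 vs v3 update)',
    'warning_details': 'Author list differs from the latest arXiv version',
    'ref_authors_correct': 'A. Author, B. Author',
}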
refchecker/checkers/semantic_scholar.py
CHANGED
@@ -27,9 +27,11 @@ import requests
 import time
 import logging
 import re
+import html
 from typing import Dict, List, Tuple, Optional, Any, Union
 from refchecker.utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
 from refchecker.utils.error_utils import format_title_mismatch
+from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
 from refchecker.config.settings import get_config

 # Set up logging
@@ -67,6 +69,11 @@ class NonArxivReferenceChecker:
        # Track API failures for Enhanced Hybrid Checker
        self._api_failed = False
        self._failure_reason = None
+
+        # ArXiv rate limiter for version checks
+        self.arxiv_rate_limiter = ArXivRateLimiter.get_instance()
+        self.arxiv_abs_url = "https://arxiv.org/abs"
+        self.arxiv_timeout = 30

    def search_paper(self, query: str, year: Optional[int] = None) -> List[Dict[str, Any]]:
        """
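ArXivRateLimiter.get_instance() suggests a process-wide singleton, so this checker and the arXiv citation checker share one request budget. Its internals are not shown in this diff; the sketch below is only an assumption of how such a limiter typically looks (everything beyond get_instance, wait, and delay is invented for illustration).

import threading
import time

class SharedRateLimiter:
    """Process-wide limiter: every caller waits on the same clock."""
    _instance = None
    _lock = threading.Lock()

    def __init__(self, delay: float = 3.0):  # 3 s is an assumed default
        self.delay = delay            # seconds between requests
        self._last_request = 0.0

    @classmethod
    def get_instance(cls):
        with cls._lock:
            if cls._instance is None:
                cls._instance = cls()
            return cls._instance

    def wait(self):
        with self._lock:
            elapsed = time.time() - self._last_request
            if elapsed < self.delay:
                time.sleep(self.delay - elapsed)
            self._last_request = time.time()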
@@ -267,6 +274,258 @@ class NonArxivReferenceChecker:

        return paper_venue if paper_venue else None

+    def _extract_arxiv_id_and_version(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
+        """
+        Extract ArXiv ID and version from a reference.
+
+        Args:
+            reference: Reference dictionary containing url, raw_text, etc.
+
+        Returns:
+            Tuple of (arxiv_id_without_version, version_string_or_None)
+            For example: ("2301.12345", "v2") or ("2301.12345", None)
+        """
+        # Patterns to extract arXiv IDs with versions
+        arxiv_id_patterns = [
+            r'arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'arxiv\.org/abs/([a-z-]+/[0-9]{7})(v\d+)?',
+            r'arxiv\.org/pdf/([a-z-]+/[0-9]{7})(v\d+)?',
+            r'arXiv:([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+            r'arXiv:([a-z-]+/[0-9]{7})(v\d+)?',
+        ]
+
+        sources = [
+            reference.get('url', ''),
+            reference.get('cited_url', ''),
+            reference.get('raw_text', ''),
+        ]
+
+        for source in sources:
+            if not source:
+                continue
+
+            for pattern in arxiv_id_patterns:
+                match = re.search(pattern, source, re.IGNORECASE)
+                if match:
+                    arxiv_id = match.group(1)
+                    version = match.group(2) if len(match.groups()) > 1 else None
+                    return arxiv_id, version
+
+        return None, None
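A quick, self-contained check of what the first URL pattern extracts (the sample URLs are illustrative):

import re

pattern = r'arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?'

for url in ("https://arxiv.org/abs/2301.12345v2", "https://arxiv.org/abs/2301.12345"):
    m = re.search(pattern, url, re.IGNORECASE)
    print(m.group(1), m.group(2))
# -> 2301.12345 v2
# -> 2301.12345 None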
+
+    def _get_latest_arxiv_version_number(self, arxiv_id: str) -> Optional[int]:
+        """
+        Get the latest version number for an ArXiv paper.
+
+        Args:
+            arxiv_id: ArXiv ID without version
+
+        Returns:
+            Latest version number as integer, or None if couldn't determine
+        """
+        url = f"{self.arxiv_abs_url}/{arxiv_id}"
+
+        self.arxiv_rate_limiter.wait()
+        try:
+            response = requests.get(url, timeout=self.arxiv_timeout)
+            response.raise_for_status()
+
+            # Look for version links like "[v1]", "[v2]", etc.
+            versions = re.findall(r'\[v(\d+)\]', response.text)
+            if versions:
+                return max(int(v) for v in versions)
+            return None
+        except Exception as e:
+            logger.debug(f"Failed to get latest version for {arxiv_id}: {e}")
+            return None
+
+    def _fetch_arxiv_version_metadata(self, arxiv_id: str, version_num: int) -> Optional[Dict[str, Any]]:
+        """
+        Fetch metadata for a specific ArXiv version using HTML scraping.
+
+        Args:
+            arxiv_id: ArXiv ID without version
+            version_num: Version number to fetch (1, 2, 3, etc.)
+
+        Returns:
+            Dictionary with version metadata or None if version doesn't exist
+        """
+        version_str = f"v{version_num}"
+        url = f"{self.arxiv_abs_url}/{arxiv_id}{version_str}"
+
+        self.arxiv_rate_limiter.wait()
+        try:
+            logger.debug(f"Checking ArXiv version: {url}")
+            response = requests.get(url, timeout=self.arxiv_timeout)
+            if response.status_code == 404:
+                return None
+            response.raise_for_status()
+            html_content = response.text
+
+            # Parse meta tags for metadata
+            title_match = re.search(r'<meta name="citation_title" content="(.*?)"', html_content)
+            title = html.unescape(title_match.group(1)).strip() if title_match else ""
+
+            authors = []
+            for auth in re.findall(r'<meta name="citation_author" content="(.*?)"', html_content):
+                authors.append({'name': html.unescape(auth).strip()})
+
+            date_match = re.search(r'<meta name="citation_date" content="(.*?)"', html_content)
+            year = None
+            if date_match:
+                ym = re.search(r'^(\d{4})', date_match.group(1))
+                if ym:
+                    year = int(ym.group(1))
+
+            return {
+                'version': version_str,
+                'version_num': version_num,
+                'title': title,
+                'authors': authors,
+                'year': year,
+                'url': url,
+            }
+        except Exception as e:
+            logger.debug(f"Failed to fetch ArXiv version {version_str}: {e}")
+            return None
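The scraping above relies only on the citation_* meta tags that arXiv abstract pages expose. A self-contained illustration of the same regexes on a canned snippet (the snippet itself is made up):

import html
import re

page = '''
<meta name="citation_title" content="An Example Paper &amp; Its Title"/>
<meta name="citation_author" content="Doe, Jane"/>
<meta name="citation_author" content="Roe, Richard"/>
<meta name="citation_date" content="2023/04/17"/>
'''

title = html.unescape(re.search(r'<meta name="citation_title" content="(.*?)"', page).group(1))
authors = [html.unescape(a) for a in re.findall(r'<meta name="citation_author" content="(.*?)"', page)]
year = int(re.search(r'^(\d{4})', re.search(r'<meta name="citation_date" content="(.*?)"', page).group(1)).group(1))

print(title)    # An Example Paper & Its Title
print(authors)  # ['Doe, Jane', 'Roe, Richard']
print(year)     # 2023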
+
+    def _check_arxiv_version_update(self, reference: Dict[str, Any], paper_data: Dict[str, Any], arxiv_id: str, errors: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], Optional[int]]:
+        """
+        Check if a reference is citing an older version of an ArXiv paper that has been updated.
+        If the reference matches a historical version, converts errors to warnings with version annotation.
+
+        Args:
+            reference: The original reference dictionary
+            paper_data: The verified paper data from Semantic Scholar (latest version)
+            arxiv_id: The ArXiv ID from the paper
+            errors: The current list of errors found against the latest version
+
+        Returns:
+            Tuple of (modified_errors_or_warnings, matched_version_num)
+            - If reference matches a historical version: returns (warnings_with_version_suffix, matched_version)
+            - Otherwise: returns (original_errors, None)
+        """
+        # Extract cited version from reference
+        _, cited_version = self._extract_arxiv_id_and_version(reference)
+
+        # Get the latest version number
+        latest_version_num = self._get_latest_arxiv_version_number(arxiv_id)
+
+        if not latest_version_num or latest_version_num <= 1:
+            # Only one version exists or couldn't determine
+            return errors, None
+
+        # Check if reference explicitly cites a specific older version
+        cited_version_num = None
+        if cited_version:
+            match = re.match(r'v(\d+)', cited_version)
+            if match:
+                cited_version_num = int(match.group(1))
+
+        # If a specific older version is cited in the URL, convert errors to warnings
+        if cited_version_num and cited_version_num < latest_version_num:
+            version_suffix = f" (v{cited_version_num} vs v{latest_version_num} update)"
+            warnings = self._convert_errors_to_version_warnings(errors, version_suffix)
+            return warnings, cited_version_num
+
+        # If no explicit version or no errors to check, return original
+        if not errors:
+            return errors, None
+
+        # Check if reference metadata matches a historical version
+        cited_title = reference.get('title', '').strip()
+
+        if not cited_title:
+            return errors, None
+
+        from refchecker.utils.text_utils import compare_titles_with_latex_cleaning
+
+        # Find the BEST matching version by comparing against all versions
+        # (not just checking if latest exceeds threshold)
+        best_match_version = None
+        best_match_score = 0.0
+
+        # Check latest version first
+        latest_version_data = self._fetch_arxiv_version_metadata(arxiv_id, latest_version_num)
+        if latest_version_data:
+            latest_title = latest_version_data.get('title', '').strip()
+            if latest_title:
+                latest_score = compare_titles_with_latex_cleaning(cited_title, latest_title)
+                if latest_score >= SIMILARITY_THRESHOLD:
+                    best_match_version = latest_version_num
+                    best_match_score = latest_score
+
+        # Check historical versions to find if any is a BETTER match
+        for version_num in range(1, latest_version_num):
+            version_data = self._fetch_arxiv_version_metadata(arxiv_id, version_num)
+            if not version_data:
+                continue
+
+            version_title = version_data.get('title', '').strip()
+            if not version_title:
+                continue
+
+            version_score = compare_titles_with_latex_cleaning(cited_title, version_title)
+
+            # If this version is a better match than current best
+            if version_score > best_match_score and version_score >= SIMILARITY_THRESHOLD:
+                best_match_version = version_num
+                best_match_score = version_score
+
+        # If best match is a historical version (not latest), convert errors to warnings
+        if best_match_version is not None and best_match_version < latest_version_num:
+            logger.debug(f"Reference best matches ArXiv v{best_match_version} (score: {best_match_score:.3f}, latest is v{latest_version_num})")
+            version_suffix = f" (v{best_match_version} vs v{latest_version_num} update)"
+            warnings = self._convert_errors_to_version_warnings(errors, version_suffix)
+            return warnings, best_match_version
+
+        return errors, None
+
+    def _convert_errors_to_version_warnings(self, errors: List[Dict[str, Any]], version_suffix: str) -> List[Dict[str, Any]]:
+        """
+        Convert error dictionaries to warning dictionaries with version suffix.
+
+        Args:
+            errors: List of error dictionaries
+            version_suffix: Version suffix to append (e.g., " (v1 vs v3 update)")
+
+        Returns:
+            List of warning dictionaries with version annotation
+        """
+        warnings = []
+        for error in errors:
+            error_type = error.get('error_type', '')
+
+            # Skip info_type entries (suggestions) - keep them as-is
+            if 'info_type' in error:
+                warnings.append(error)
+                continue
+
+            # Skip entries that are already warnings
+            if 'warning_type' in error:
+                # Just append the version suffix
+                warning = error.copy()
+                warning['warning_type'] = error['warning_type'] + version_suffix
+                warnings.append(warning)
+                continue
+
+            # Convert error to warning with version suffix
+            warning = {
+                'warning_type': error_type + version_suffix,
+                'warning_details': error.get('error_details', ''),
+            }
+
+            # Preserve correction hints
+            for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct',
+                        'ref_venue_correct', 'ref_doi_correct', 'ref_url_correct']:
+                if key in error:
+                    warning[key] = error[key]
+
+            warnings.append(warning)
+
+        return warnings
+
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
        """
        Verify a non-arXiv reference using Semantic Scholar
@@ -642,6 +901,10 @@ class NonArxivReferenceChecker:
                'info_details': f"Reference could include arXiv URL: {arxiv_url}",
                'ref_url_correct': arxiv_url
            })
+
+        # Check for ArXiv version updates - if reference matches an older version,
+        # convert errors to warnings with version annotation (like ArXiv citation checker)
+        errors, matched_version = self._check_arxiv_version_update(reference, paper_data, arxiv_id, errors)

        # Verify DOI
        paper_doi = None

{academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/WHEEL
RENAMED
File without changes

{academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/entry_points.txt
RENAMED
File without changes

{academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/licenses/LICENSE
RENAMED
File without changes

{academic_refchecker-2.0.14.dist-info → academic_refchecker-2.0.16.dist-info}/top_level.txt
RENAMED
File without changes