academic-refchecker 2.0.15__py3-none-any.whl → 2.0.17__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
backend/static/index.html CHANGED
@@ -6,8 +6,8 @@
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <title>RefChecker - Academic Reference Validator</title>
8
8
  <meta name="description" content="Validate academic paper references using AI-powered verification" />
9
- <script type="module" crossorigin src="/assets/index-hk21nqxR.js"></script>
10
- <link rel="stylesheet" crossorigin href="/assets/index-2P6L_39v.css">
9
+ <script type="module" crossorigin src="/assets/index-DMZJNrR0.js"></script>
10
+ <link rel="stylesheet" crossorigin href="/assets/index-BuguAhjS.css">
11
11
  </head>
12
12
  <body>
13
13
  <div id="root"></div>
refchecker/__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "2.0.15"
3
+ __version__ = "2.0.17"
@@ -31,6 +31,7 @@ import re
31
31
  import logging
32
32
  import requests
33
33
  import html
34
+ import time
34
35
  from typing import Dict, List, Tuple, Optional, Any
35
36
 
36
37
  import bibtexparser
@@ -356,7 +357,13 @@ class ArXivCitationChecker:
356
357
  version_str = f"v{version_num}"
357
358
  url = f"{self.abs_url}/{arxiv_id}{version_str}"
358
359
 
360
+ # Use shorter delay for version metadata (HTML parsing is lightweight)
361
+ # Save original delay, use 1 second, then restore
362
+ original_delay = self.rate_limiter.delay
363
+ self.rate_limiter.delay = 1.0 # Faster rate for version checking
359
364
  self.rate_limiter.wait()
365
+ self.rate_limiter.delay = original_delay # Restore original delay
366
+
360
367
  try:
361
368
  logger.debug(f"Checking historical version: {url}")
362
369
  response = requests.get(url, timeout=self.timeout)
@@ -421,16 +428,59 @@ class ArXivCitationChecker:
421
428
  logger.warning(f"Failed to get latest version for {arxiv_id}: {e}")
422
429
  return None
423
430
 
431
+ def _calculate_match_score(
432
+ self, cited_title: str, cited_authors: List[str],
433
+ authoritative_title: str, authoritative_authors: List[Dict]) -> float:
434
+ """
435
+ Calculate a numeric match score between cited reference and authoritative data.
436
+
437
+ Used to find the BEST matching historical version, not just the first one that
438
+ passes a threshold.
439
+
440
+ Args:
441
+ cited_title: Title from the reference
442
+ cited_authors: Authors from the reference
443
+ authoritative_title: Title from ArXiv version
444
+ authoritative_authors: Authors from ArXiv version
445
+
446
+ Returns:
447
+ A score between 0.0 and 1.0 where higher is better.
448
+ """
449
+ if not cited_title or not authoritative_title:
450
+ return 0.0
451
+
452
+ # Primary: Title similarity (weighted at 80%)
453
+ title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
454
+
455
+ # Secondary: Author count match (weighted at 20%)
456
+ author_score = 0.0
457
+ if cited_authors and authoritative_authors:
458
+ cited_count = len(cited_authors)
459
+ auth_count = len(authoritative_authors)
460
+ if cited_count == auth_count:
461
+ author_score = 1.0
462
+ elif abs(cited_count - auth_count) == 1:
463
+ author_score = 0.7
464
+ elif abs(cited_count - auth_count) == 2:
465
+ author_score = 0.4
466
+ else:
467
+ author_score = 0.1
468
+
469
+ # Weighted combination
470
+ return 0.8 * title_similarity + 0.2 * author_score
471
+
424
472
  def _compare_info_match(
425
473
  self, cited_title: str, cited_authors: List[str], cited_year: Optional[int],
426
- authoritative_title: str, authoritative_authors: List[str], authoritative_year: Optional[int]) -> bool:
474
+ authoritative_title: str, authoritative_authors: List[Dict], authoritative_year: Optional[int]) -> bool:
427
475
  """
428
476
  Compare the information of a cited paper with the authoritative information.
429
477
 
478
+ Uses title as the primary matching criterion. Authors are used as a secondary
479
+ check, but year is not required to match (year often has discrepancies).
480
+
430
481
  Args:
431
482
  cited_title: Title from the reference
432
483
  cited_authors: Authors from the reference
433
- cited_year: Year from the reference
434
484
  authoritative_title: Title from ArXiv version
435
485
  authoritative_authors: Authors from ArXiv version
436
486
  authoritative_year: Year from ArXiv version
@@ -438,22 +488,31 @@ class ArXivCitationChecker:
438
488
  Returns:
439
489
  True if the information matches, False otherwise.
440
490
  """
441
- # Compare title
491
+ # Primary criterion: Title MUST match
442
492
  if cited_title and authoritative_title:
443
493
  title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
444
494
  if title_similarity < SIMILARITY_THRESHOLD:
445
495
  return False
496
+ else:
497
+ # If no title to compare, can't determine match
498
+ return False
446
499
 
447
- # Compare authors
500
+ # Secondary criterion: If authors are provided, they should reasonably match
501
+ # (be lenient - allow partial matches since author lists can vary)
448
502
  if cited_authors and authoritative_authors:
449
503
  authors_match, _ = compare_authors(cited_authors, authoritative_authors)
450
- if not authors_match:
504
+ # If authors don't match at all, this might not be the right version
505
+ # But be lenient - just having similar author count is a good sign
506
+ cited_count = len(cited_authors)
507
+ auth_count = len(authoritative_authors)
508
+ # Allow if authors match OR if author counts are within 1 of each other
509
+ if not authors_match and abs(cited_count - auth_count) > 1:
451
510
  return False
452
511
 
453
- # Compare year
454
- if cited_year and authoritative_year:
455
- if cited_year != authoritative_year:
456
- return False
512
+ # Year is NOT used as a matching criterion because:
513
+ # 1. ArXiv shows submission date, citations often use publication year
514
+ # 2. People often cite with incorrect years
515
+ # 3. The same ArXiv version can be cited with different years
457
516
 
458
517
  return True
459
518
 
@@ -556,37 +615,68 @@ class ArXivCitationChecker:
556
615
  latest_version_num = self._get_latest_version_number(arxiv_id)
557
616
 
558
617
  if latest_version_num and latest_version_num > 1:
618
+ # Find the BEST matching version, not just the first one
619
+ best_match_version = None
620
+ best_match_score = 0.0
621
+ best_match_data = None
622
+
623
+ # Add timeout for version checking (30 seconds max)
624
+ # This prevents blocking when rate-limited with many concurrent ArXiv requests
625
+ version_check_start = time.time()
626
+ VERSION_CHECK_TIMEOUT = 30.0
627
+
559
628
  # Check historical versions (1 to latest-1)
560
- for version_num in range(1, latest_version_num):
629
+ # Start from newest historical version (more likely to match recent citations)
630
+ for version_num in range(latest_version_num - 1, 0, -1):
631
+ # Check if we've exceeded the version checking timeout
632
+ if time.time() - version_check_start > VERSION_CHECK_TIMEOUT:
633
+ logger.debug(f"ArXivCitationChecker: Version checking timed out after {VERSION_CHECK_TIMEOUT}s")
634
+ break
635
+
561
636
  version_data = self._fetch_version_metadata_from_html(arxiv_id, version_num)
562
637
  if not version_data:
563
638
  continue
564
639
 
565
- # Check if reference matches this historical version
566
- if self._compare_info_match(
567
- cited_title, cited_authors, cited_year,
568
- version_data['title'], version_data['authors'], version_data['year']):
569
-
570
- logger.debug(f"ArXivCitationChecker: Reference matches historical version v{version_num}")
571
-
572
- # Convert errors to warnings with version update info
573
- # Version update issues are informational, not errors - the citation was correct for its time
574
- version_suffix = f" (v{version_num} vs v{latest_version_num} update)"
575
- warnings = []
576
- for error in errors:
577
- warning = {
578
- 'warning_type': error.get('error_type', 'unknown') + version_suffix,
579
- 'warning_details': error.get('error_details', ''),
580
- }
581
- # Preserve correction hints
582
- for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
583
- if key in error:
584
- warning[key] = error[key]
585
- warnings.append(warning)
586
-
587
- # Return with warnings instead of errors - URL points to the matched version
588
- matched_url = f"https://arxiv.org/abs/{arxiv_id}v{version_num}"
589
- return latest_data, warnings, matched_url
640
+ # Calculate match score for this version
641
+ match_score = self._calculate_match_score(
642
+ cited_title, cited_authors,
643
+ version_data['title'], version_data['authors'])
644
+
645
+ if match_score > best_match_score:
646
+ best_match_score = match_score
647
+ best_match_version = version_num
648
+ best_match_data = version_data
649
+
650
+ # Early termination: if we found an excellent match (>= 0.98), stop searching
651
+ # This saves HTTP requests when we've found a near-perfect version match
652
+ if best_match_score >= 0.98:
653
+ logger.debug(f"ArXivCitationChecker: Found excellent version match v{best_match_version} (score: {best_match_score:.3f}), stopping search")
654
+ break
655
+
656
+ # If we found a matching version (above threshold), convert errors to warnings
657
+ if best_match_version and best_match_score >= SIMILARITY_THRESHOLD:
658
+ logger.debug(f"ArXivCitationChecker: Reference best matches historical version v{best_match_version} (score: {best_match_score:.3f})")
659
+
660
+ # Convert errors to warnings with version update info
661
+ # Version update issues are informational, not errors - the citation was correct for its time
662
+ version_suffix = f" (v{best_match_version} vs v{latest_version_num} update)"
663
+ warnings = []
664
+ for error in errors:
665
+ # Get the error/warning type - handle both error_type and warning_type
666
+ err_type = error.get('error_type') or error.get('warning_type', 'unknown')
667
+ warning = {
668
+ 'warning_type': err_type + version_suffix,
669
+ 'warning_details': error.get('error_details') or error.get('warning_details', ''),
670
+ }
671
+ # Preserve correction hints
672
+ for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
673
+ if key in error:
674
+ warning[key] = error[key]
675
+ warnings.append(warning)
676
+
677
+ # Return with warnings instead of errors - URL points to the matched version
678
+ matched_url = f"https://arxiv.org/abs/{arxiv_id}v{best_match_version}"
679
+ return latest_data, warnings, matched_url
590
680
 
591
681
  logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
592
682
  return latest_data, errors, paper_url
@@ -27,9 +27,11 @@ import requests
27
27
  import time
28
28
  import logging
29
29
  import re
30
+ import html
30
31
  from typing import Dict, List, Tuple, Optional, Any, Union
31
32
  from refchecker.utils.text_utils import normalize_text, clean_title_basic, find_best_match, is_name_match, are_venues_substantially_different, calculate_title_similarity, compare_authors, clean_title_for_search, strip_latex_commands, compare_titles_with_latex_cleaning
32
33
  from refchecker.utils.error_utils import format_title_mismatch
34
+ from refchecker.utils.arxiv_rate_limiter import ArXivRateLimiter
33
35
  from refchecker.config.settings import get_config
34
36
 
35
37
  # Set up logging
@@ -67,6 +69,11 @@ class NonArxivReferenceChecker:
67
69
  # Track API failures for Enhanced Hybrid Checker
68
70
  self._api_failed = False
69
71
  self._failure_reason = None
72
+
73
+ # ArXiv rate limiter for version checks
74
+ self.arxiv_rate_limiter = ArXivRateLimiter.get_instance()
75
+ self.arxiv_abs_url = "https://arxiv.org/abs"
76
+ self.arxiv_timeout = 30
70
77
 
71
78
  def search_paper(self, query: str, year: Optional[int] = None) -> List[Dict[str, Any]]:
72
79
  """
@@ -267,6 +274,258 @@ class NonArxivReferenceChecker:
267
274
 
268
275
  return paper_venue if paper_venue else None
269
276
 
277
+ def _extract_arxiv_id_and_version(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
278
+ """
279
+ Extract ArXiv ID and version from a reference.
280
+
281
+ Args:
282
+ reference: Reference dictionary containing url, raw_text, etc.
283
+
284
+ Returns:
285
+ Tuple of (arxiv_id_without_version, version_string_or_None)
286
+ For example: ("2301.12345", "v2") or ("2301.12345", None)
287
+ """
288
+ # Patterns to extract arXiv IDs with versions
289
+ arxiv_id_patterns = [
290
+ r'arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
291
+ r'arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
292
+ r'arxiv\.org/abs/([a-z-]+/[0-9]{7})(v\d+)?',
293
+ r'arxiv\.org/pdf/([a-z-]+/[0-9]{7})(v\d+)?',
294
+ r'arXiv:([0-9]{4}\.[0-9]{4,5})(v\d+)?',
295
+ r'arXiv:([a-z-]+/[0-9]{7})(v\d+)?',
296
+ ]
297
+
298
+ sources = [
299
+ reference.get('url', ''),
300
+ reference.get('cited_url', ''),
301
+ reference.get('raw_text', ''),
302
+ ]
303
+
304
+ for source in sources:
305
+ if not source:
306
+ continue
307
+
308
+ for pattern in arxiv_id_patterns:
309
+ match = re.search(pattern, source, re.IGNORECASE)
310
+ if match:
311
+ arxiv_id = match.group(1)
312
+ version = match.group(2) if len(match.groups()) > 1 else None
313
+ return arxiv_id, version
314
+
315
+ return None, None
316
+
317
def _get_latest_arxiv_version_number(self, arxiv_id: str) -> Optional[int]:
    """
    Look up the highest version number listed on a paper's ArXiv abstract page.

    Waits on the shared ArXiv rate limiter, downloads the abstract page and
    scans it for version markers such as "[v1]", "[v2]".

    Args:
        arxiv_id: ArXiv ID without version

    Returns:
        The largest version number found, or None when the page has no
        version markers or the request fails for any reason
    """
    abs_page_url = f"{self.arxiv_abs_url}/{arxiv_id}"

    self.arxiv_rate_limiter.wait()
    try:
        page = requests.get(abs_page_url, timeout=self.arxiv_timeout)
        page.raise_for_status()

        # Version links appear in the page text as "[v1]", "[v2]", ...
        version_numbers = [int(num) for num in re.findall(r'\[v(\d+)\]', page.text)]
    except Exception as e:
        logger.debug(f"Failed to get latest version for {arxiv_id}: {e}")
        return None

    if not version_numbers:
        return None
    return max(version_numbers)
342
+
343
def _fetch_arxiv_version_metadata(self, arxiv_id: str, version_num: int) -> Optional[Dict[str, Any]]:
    """
    Scrape title, authors and year for one specific version of an ArXiv paper.

    The versioned abstract page is downloaded (after waiting on the shared
    ArXiv rate limiter) and its citation_* <meta> tags are parsed with
    regular expressions.

    Args:
        arxiv_id: ArXiv ID without version
        version_num: Version number to fetch (1, 2, 3, etc.)

    Returns:
        Dictionary with keys 'version', 'version_num', 'title', 'authors'
        (list of {'name': ...} dicts), 'year' and 'url'; None if the page
        answers 404 or anything goes wrong while fetching/parsing
    """
    version_str = f"v{version_num}"
    url = f"{self.arxiv_abs_url}/{arxiv_id}{version_str}"

    self.arxiv_rate_limiter.wait()
    try:
        logger.debug(f"Checking ArXiv version: {url}")
        response = requests.get(url, timeout=self.arxiv_timeout)
        if response.status_code == 404:
            # This version does not exist
            return None
        response.raise_for_status()
        page = response.text

        # Title: first citation_title tag, empty string when absent
        title = ""
        title_hit = re.search(r'<meta name="citation_title" content="(.*?)"', page)
        if title_hit:
            title = html.unescape(title_hit.group(1)).strip()

        # Authors: one citation_author tag per author, in page order
        authors = [
            {'name': html.unescape(raw_name).strip()}
            for raw_name in re.findall(r'<meta name="citation_author" content="(.*?)"', page)
        ]

        # Year: leading four digits of the citation_date value, if any
        year = None
        date_hit = re.search(r'<meta name="citation_date" content="(.*?)"', page)
        year_hit = re.search(r'^(\d{4})', date_hit.group(1)) if date_hit else None
        if year_hit:
            year = int(year_hit.group(1))

        metadata = {
            'version': version_str,
            'version_num': version_num,
            'title': title,
            'authors': authors,
            'year': year,
            'url': url,
        }
        return metadata
    except Exception as e:
        logger.debug(f"Failed to fetch ArXiv version {version_str}: {e}")
        return None
392
+
393
+ def _check_arxiv_version_update(self, reference: Dict[str, Any], paper_data: Dict[str, Any], arxiv_id: str, errors: List[Dict[str, Any]]) -> Tuple[List[Dict[str, Any]], Optional[int]]:
394
+ """
395
+ Check if a reference is citing an older version of an ArXiv paper that has been updated.
396
+ If the reference matches a historical version, converts errors to warnings with version annotation.
397
+
398
+ Args:
399
+ reference: The original reference dictionary
400
+ paper_data: The verified paper data from Semantic Scholar (latest version)
401
+ arxiv_id: The ArXiv ID from the paper
402
+ errors: The current list of errors found against the latest version
403
+
404
+ Returns:
405
+ Tuple of (modified_errors_or_warnings, matched_version_num)
406
+ - If reference matches a historical version: returns (warnings_with_version_suffix, matched_version)
407
+ - Otherwise: returns (original_errors, None)
408
+ """
409
+ # Extract cited version from reference
410
+ _, cited_version = self._extract_arxiv_id_and_version(reference)
411
+
412
+ # Get the latest version number
413
+ latest_version_num = self._get_latest_arxiv_version_number(arxiv_id)
414
+
415
+ if not latest_version_num or latest_version_num <= 1:
416
+ # Only one version exists or couldn't determine
417
+ return errors, None
418
+
419
+ # Check if reference explicitly cites a specific older version
420
+ cited_version_num = None
421
+ if cited_version:
422
+ match = re.match(r'v(\d+)', cited_version)
423
+ if match:
424
+ cited_version_num = int(match.group(1))
425
+
426
+ # If a specific older version is cited in the URL, convert errors to warnings
427
+ if cited_version_num and cited_version_num < latest_version_num:
428
+ version_suffix = f" (v{cited_version_num} vs v{latest_version_num} update)"
429
+ warnings = self._convert_errors_to_version_warnings(errors, version_suffix)
430
+ return warnings, cited_version_num
431
+
432
+ # If no explicit version or no errors to check, return original
433
+ if not errors:
434
+ return errors, None
435
+
436
+ # Check if reference metadata matches a historical version
437
+ cited_title = reference.get('title', '').strip()
438
+
439
+ if not cited_title:
440
+ return errors, None
441
+
442
+ from refchecker.utils.text_utils import compare_titles_with_latex_cleaning
443
+
444
+ # Find the BEST matching version by comparing against all versions
445
+ # (not just checking if latest exceeds threshold)
446
+ best_match_version = None
447
+ best_match_score = 0.0
448
+
449
+ # Check latest version first
450
+ latest_version_data = self._fetch_arxiv_version_metadata(arxiv_id, latest_version_num)
451
+ if latest_version_data:
452
+ latest_title = latest_version_data.get('title', '').strip()
453
+ if latest_title:
454
+ latest_score = compare_titles_with_latex_cleaning(cited_title, latest_title)
455
+ if latest_score >= SIMILARITY_THRESHOLD:
456
+ best_match_version = latest_version_num
457
+ best_match_score = latest_score
458
+
459
+ # Check historical versions to find if any is a BETTER match
460
+ for version_num in range(1, latest_version_num):
461
+ version_data = self._fetch_arxiv_version_metadata(arxiv_id, version_num)
462
+ if not version_data:
463
+ continue
464
+
465
+ version_title = version_data.get('title', '').strip()
466
+ if not version_title:
467
+ continue
468
+
469
+ version_score = compare_titles_with_latex_cleaning(cited_title, version_title)
470
+
471
+ # If this version is a better match than current best
472
+ if version_score > best_match_score and version_score >= SIMILARITY_THRESHOLD:
473
+ best_match_version = version_num
474
+ best_match_score = version_score
475
+
476
+ # If best match is a historical version (not latest), convert errors to warnings
477
+ if best_match_version is not None and best_match_version < latest_version_num:
478
+ logger.debug(f"Reference best matches ArXiv v{best_match_version} (score: {best_match_score:.3f}, latest is v{latest_version_num})")
479
+ version_suffix = f" (v{best_match_version} vs v{latest_version_num} update)"
480
+ warnings = self._convert_errors_to_version_warnings(errors, version_suffix)
481
+ return warnings, best_match_version
482
+
483
+ return errors, None
484
+
485
+ def _convert_errors_to_version_warnings(self, errors: List[Dict[str, Any]], version_suffix: str) -> List[Dict[str, Any]]:
486
+ """
487
+ Convert error dictionaries to warning dictionaries with version suffix.
488
+
489
+ Args:
490
+ errors: List of error dictionaries
491
+ version_suffix: Version suffix to append (e.g., " (v1 vs v3 update)")
492
+
493
+ Returns:
494
+ List of warning dictionaries with version annotation
495
+ """
496
+ warnings = []
497
+ for error in errors:
498
+ error_type = error.get('error_type', '')
499
+
500
+ # Skip info_type entries (suggestions) - keep them as-is
501
+ if 'info_type' in error:
502
+ warnings.append(error)
503
+ continue
504
+
505
+ # Skip entries that are already warnings
506
+ if 'warning_type' in error:
507
+ # Just append the version suffix
508
+ warning = error.copy()
509
+ warning['warning_type'] = error['warning_type'] + version_suffix
510
+ warnings.append(warning)
511
+ continue
512
+
513
+ # Convert error to warning with version suffix
514
+ warning = {
515
+ 'warning_type': error_type + version_suffix,
516
+ 'warning_details': error.get('error_details', ''),
517
+ }
518
+
519
+ # Preserve correction hints
520
+ for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct',
521
+ 'ref_venue_correct', 'ref_doi_correct', 'ref_url_correct']:
522
+ if key in error:
523
+ warning[key] = error[key]
524
+
525
+ warnings.append(warning)
526
+
527
+ return warnings
528
+
270
529
  def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
271
530
  """
272
531
  Verify a non-arXiv reference using Semantic Scholar
@@ -642,6 +901,10 @@ class NonArxivReferenceChecker:
642
901
  'info_details': f"Reference could include arXiv URL: {arxiv_url}",
643
902
  'ref_url_correct': arxiv_url
644
903
  })
904
+
905
+ # Check for ArXiv version updates - if reference matches an older version,
906
+ # convert errors to warnings with version annotation (like ArXiv citation checker)
907
+ errors, matched_version = self._check_arxiv_version_update(reference, paper_data, arxiv_id, errors)
645
908
 
646
909
  # Verify DOI
647
910
  paper_doi = None