academic-refchecker 1.2.48-py3-none-any.whl → 1.2.50-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/METADATA +1 -1
- {academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/RECORD +15 -15
- checkers/enhanced_hybrid_checker.py +84 -0
- checkers/openreview_checker.py +467 -4
- checkers/semantic_scholar.py +2 -2
- core/parallel_processor.py +7 -5
- core/refchecker.py +50 -15
- utils/arxiv_utils.py +123 -77
- utils/error_utils.py +33 -0
- utils/text_utils.py +6 -0
- {academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/top_level.txt +0 -0
__version__.py
CHANGED

{academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/RECORD
CHANGED
@@ -1,21 +1,21 @@
-__version__.py,sha256=
-academic_refchecker-1.2.
+__version__.py,sha256=ZQ6vcRuuZpexVshgiVwj1EkuR3vzgsRUj6ll7aoa8Dw,65
+academic_refchecker-1.2.50.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
-checkers/enhanced_hybrid_checker.py,sha256=
+checkers/enhanced_hybrid_checker.py,sha256=rbXkzpNkd0bn4e2OooX-CcdGTwwYpgmVaFvX_xCAFsA,27777
 checkers/github_checker.py,sha256=GoepG4aRRUqAomkM4HgOSNf20BPxQgocZEpsk0ZTZZU,14003
 checkers/local_semantic_scholar.py,sha256=D8py8-yMCgN1lvhXCiMUOEA4wBkH7AQvrkM4-3LCDsU,21015
 checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
-checkers/openreview_checker.py,sha256=
-checkers/semantic_scholar.py,sha256=
+checkers/openreview_checker.py,sha256=mu33gytnIEond5A2gAZtxgLkKrirmmUSIUc4frL1GsA,40030
+checkers/semantic_scholar.py,sha256=dDDOxURwr-Kx7fIiJTAh_4_9V8VxGWYabQJiQ1VdSbM,34762
 checkers/webpage_checker.py,sha256=Ivzhu0xcpeWZcCTlAt9C06Lfsz5gKEiSko9gJ7EyQEw,22324
 config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
 config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
 config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
 core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
 core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
-core/parallel_processor.py,sha256=
-core/refchecker.py,sha256=
+core/parallel_processor.py,sha256=VHjsHc_wCKumeF__fXh8RjMpM4dYE8ua5amgotd4PTg,17474
+core/refchecker.py,sha256=sOenr6DgXqReiOCKcPVVFaaAAYJXEwONdw9gjur4KYE,279937
 database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
 database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
 llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,7 +26,7 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
 services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
 services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
 utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
-utils/arxiv_utils.py,sha256=
+utils/arxiv_utils.py,sha256=EzH1PhEAW0df5mmSP-kKHmuwqd4u2CSotRNwQ5IMJx8,19766
 utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
 utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
 utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
@@ -34,13 +34,13 @@ utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
 utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
 utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
 utils/doi_utils.py,sha256=ezUiRnYRpoO0U_Rqgxv1FxqmeTwPh6X8gLgSDbqg5sY,4874
-utils/error_utils.py,sha256=
+utils/error_utils.py,sha256=UJOH7Bp-rPV2JDY_XN38I2pSkqqPdnQoviKa4s4nK_A,12501
 utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-utils/text_utils.py,sha256=
+utils/text_utils.py,sha256=g2r0QT6RGNi_8K5MD_EE-GT3cbffhk8cQyQaL6HSYtA,211955
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
 utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
+academic_refchecker-1.2.50.dist-info/METADATA,sha256=uOjNDL9zgwSxgFmaScRw635mQn0K4r2UfEYeQodcOy8,22576
+academic_refchecker-1.2.50.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.50.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.50.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.50.dist-info/RECORD,,
checkers/enhanced_hybrid_checker.py
CHANGED
@@ -312,12 +312,36 @@ class EnhancedHybridReferenceChecker:
         if (self.openreview and
             hasattr(self.openreview, 'is_openreview_reference') and
             self.openreview.is_openreview_reference(reference)):
+            logger.debug("Enhanced Hybrid: Trying OpenReview URL-based verification")
             verified_data, errors, url, success, failure_type = self._try_api('openreview', self.openreview, reference)
             if success:
                 return verified_data, errors, url
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('openreview', self.openreview, failure_type))

+        # Strategy 5b: Try OpenReview by search if venue suggests it might be there
+        elif (self.openreview and
+              hasattr(self.openreview, 'verify_reference_by_search')):
+            # Check if venue suggests this might be on OpenReview
+            venue = reference.get('venue', reference.get('journal', '')).lower()
+            openreview_venues = [
+                'iclr', 'icml', 'neurips', 'nips', 'aaai', 'ijcai',
+                'international conference on learning representations',
+                'international conference on machine learning',
+                'neural information processing systems'
+            ]
+
+            venue_suggests_openreview = any(or_venue in venue for or_venue in openreview_venues)
+            logger.debug(f"Enhanced Hybrid: OpenReview venue check - venue: '{venue}', suggests: {venue_suggests_openreview}")
+
+            if venue_suggests_openreview:
+                logger.debug("Enhanced Hybrid: Trying OpenReview search-based verification")
+                verified_data, errors, url, success, failure_type = self._try_openreview_search(reference)
+                if success:
+                    return verified_data, errors, url
+                if failure_type in ['throttled', 'timeout', 'server_error']:
+                    failed_apis.append(('openreview_search', self.openreview, failure_type))
+
         # Strategy 6: Try CrossRef if we haven't already (for non-DOI references)
         if not self._should_try_doi_apis_first(reference) and self.crossref:
             verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
@@ -399,6 +423,66 @@ class EnhancedHybridReferenceChecker:
                 'error_details': 'Could not verify reference using any available API'
             }], None

+    def _try_openreview_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str], bool, str]:
+        """
+        Try to verify reference using OpenReview search
+
+        Returns:
+            Tuple of (verified_data, errors, url, success, failure_type)
+        """
+        if not self.openreview:
+            return None, [], None, False, 'none'
+
+        start_time = time.time()
+        failure_type = 'none'
+
+        try:
+            verified_data, errors, url = self.openreview.verify_reference_by_search(reference)
+            duration = time.time() - start_time
+
+            # Consider it successful if we found data or verification errors
+            success = verified_data is not None or len(errors) > 0
+            self._update_api_stats('openreview', success, duration)
+
+            if success:
+                logger.debug(f"Enhanced Hybrid: OpenReview search successful in {duration:.2f}s, URL: {url}")
+                return verified_data, errors, url, True, 'none'
+            else:
+                logger.debug(f"Enhanced Hybrid: OpenReview search found no results in {duration:.2f}s")
+                return None, [], None, False, 'not_found'
+
+        except requests.exceptions.Timeout as e:
+            duration = time.time() - start_time
+            self._update_api_stats('openreview', False, duration)
+            failure_type = 'timeout'
+            logger.debug(f"Enhanced Hybrid: OpenReview search timed out in {duration:.2f}s: {e}")
+            return None, [], None, False, failure_type
+
+        except requests.exceptions.RequestException as e:
+            duration = time.time() - start_time
+            self._update_api_stats('openreview', False, duration)
+
+            # Check if it's a rate limiting error
+            if hasattr(e, 'response') and e.response is not None:
+                if e.response.status_code in [429, 503]:
+                    failure_type = 'throttled'
+                elif e.response.status_code >= 500:
+                    failure_type = 'server_error'
+                else:
+                    failure_type = 'other'
+            else:
+                failure_type = 'other'
+
+            logger.debug(f"Enhanced Hybrid: OpenReview search failed in {duration:.2f}s: {type(e).__name__}: {e}")
+            return None, [], None, False, failure_type
+
+        except Exception as e:
+            duration = time.time() - start_time
+            self._update_api_stats('openreview', False, duration)
+            failure_type = 'other'
+            logger.debug(f"Enhanced Hybrid: OpenReview search error in {duration:.2f}s: {type(e).__name__}: {e}")
+            return None, [], None, False, failure_type
+
     def get_performance_stats(self) -> Dict[str, Any]:
         """
         Get performance statistics for all APIs
checkers/openreview_checker.py
CHANGED
@@ -498,6 +498,160 @@ class OpenReviewReferenceChecker:
         logger.debug(f"OpenReview verification completed for: {openreview_url}")
         return verified_data, errors, openreview_url

+    def verify_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a reference by searching OpenReview (when no URL is provided)
+
+        Args:
+            reference: Reference dictionary with title, authors, year, etc.
+
+        Returns:
+            Tuple of (verified_data, errors, paper_url) where:
+            - verified_data: Dict with verified OpenReview paper data or None
+            - errors: List of error/warning dictionaries
+            - paper_url: The OpenReview URL if found
+        """
+        logger.debug(f"Searching OpenReview for reference: {reference.get('title', 'Untitled')}")
+
+        title = reference.get('title', '').strip()
+        authors = reference.get('authors', [])
+        year = reference.get('year')
+        venue = reference.get('venue', '').strip()
+
+        if not title:
+            return None, [], None
+
+        # Check if venue suggests this might be on OpenReview
+        if not self._is_likely_openreview_venue(venue):
+            logger.debug(f"Venue '{venue}' doesn't suggest OpenReview, skipping search")
+            return None, [], None
+
+        # Search for matching papers
+        search_results = self.search_paper(title, authors, year)
+
+        if not search_results:
+            logger.debug("No matching papers found on OpenReview")
+            return None, [], None
+
+        # Use the best match (first result, as they're sorted by relevance)
+        best_match = search_results[0]
+        paper_url = best_match.get('forum_url')
+
+        logger.debug(f"Found OpenReview match: {best_match.get('title', 'Untitled')}")
+
+        # Verify the reference against the found paper
+        errors = []
+
+        # Check title match
+        cited_title = reference.get('title', '').strip()
+        paper_title = best_match.get('title', '').strip()
+
+        if cited_title and paper_title:
+            similarity = calculate_title_similarity(cited_title, paper_title)
+            if similarity < 0.8:  # Slightly higher threshold for search results
+                from utils.error_utils import format_title_mismatch
+                details = format_title_mismatch(cited_title, paper_title) + f" (similarity: {similarity:.2f})"
+                errors.append({
+                    "warning_type": "title",
+                    "warning_details": details
+                })
+
+        # Check authors
+        cited_authors = reference.get('authors', [])
+        paper_authors = best_match.get('authors', [])
+
+        if cited_authors and paper_authors:
+            # Convert to list format if needed
+            if isinstance(cited_authors, str):
+                cited_authors = [author.strip() for author in cited_authors.split(',')]
+            if isinstance(paper_authors, str):
+                paper_authors = [author.strip() for author in paper_authors.split(',')]
+
+            # Use the existing author comparison function
+            match, error_msg = compare_authors(cited_authors, paper_authors)
+            if not match and error_msg:
+                errors.append({
+                    "warning_type": "author",
+                    "warning_details": error_msg
+                })
+
+        # Check year
+        cited_year = reference.get('year')
+        paper_year = best_match.get('year')
+
+        if cited_year and paper_year:
+            try:
+                cited_year_int = int(cited_year)
+                paper_year_int = int(paper_year)
+
+                is_different, year_message = is_year_substantially_different(cited_year_int, paper_year_int)
+                if is_different and year_message:
+                    from utils.error_utils import format_year_mismatch
+                    errors.append({
+                        "warning_type": "year",
+                        "warning_details": format_year_mismatch(cited_year_int, paper_year_int)
+                    })
+            except (ValueError, TypeError):
+                pass  # Skip year validation if conversion fails
+
+        # Check venue if provided in reference
+        cited_venue = reference.get('venue', '').strip()
+        paper_venue = best_match.get('venue', '').strip()
+
+        if cited_venue and paper_venue:
+            if are_venues_substantially_different(cited_venue, paper_venue):
+                from utils.error_utils import format_venue_mismatch
+                errors.append({
+                    "warning_type": "venue",
+                    "warning_details": format_venue_mismatch(cited_venue, paper_venue)
+                })

+        # Create verified data structure
+        verified_data = {
+            'title': best_match.get('title', cited_title),
+            'authors': best_match.get('authors', cited_authors),
+            'year': best_match.get('year', cited_year),
+            'venue': best_match.get('venue', cited_venue),
+            'url': paper_url,
+            'abstract': best_match.get('abstract', ''),
+            'keywords': best_match.get('keywords', []),
+            'openreview_metadata': best_match,
+            'verification_source': 'OpenReview (search)'
+        }
+
+        logger.debug(f"OpenReview search verification completed for: {paper_url}")
+        return verified_data, errors, paper_url
+
+    def _is_likely_openreview_venue(self, venue: str) -> bool:
+        """
+        Check if a venue suggests the paper might be on OpenReview
+
+        Args:
+            venue: Venue string from reference
+
+        Returns:
+            True if venue suggests OpenReview
+        """
+        if not venue:
+            return False
+
+        venue_lower = venue.lower()
+
+        # Common venues that use OpenReview
+        openreview_venues = [
+            'iclr', 'international conference on learning representations',
+            'neurips', 'neural information processing systems', 'nips',
+            'icml', 'international conference on machine learning',
+            'iclr workshop', 'neurips workshop', 'icml workshop',
+            'aaai', 'ijcai', 'aistats'
+        ]
+
+        for or_venue in openreview_venues:
+            if or_venue in venue_lower:
+                return True
+
+        return False
+
     def search_paper(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
         """
         Search for papers on OpenReview by title, authors, and/or year
@@ -510,7 +664,316 @@ class OpenReviewReferenceChecker:
         Returns:
             List of matching paper metadata dictionaries
         """
-
-
-
-
+        if not title or not title.strip():
+            return []
+
+        logger.debug(f"Searching OpenReview for: {title}")
+
+        # Clean title for search
+        search_title = clean_title_for_search(title)
+
+        # Try API search first
+        results = self._search_via_api(search_title, authors, year)
+        if results:
+            return results
+
+        # If API search fails, try web search as fallback
+        return self._search_via_web(search_title, authors, year)
+
+    def _search_via_api(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+        """
+        Search using OpenReview API
+
+        Args:
+            title: Clean title to search for
+            authors: List of author names (optional)
+            year: Publication year (optional)
+
+        Returns:
+            List of matching paper dictionaries
+        """
+        try:
+            # The OpenReview API requires specific parameters
+            # We'll search by content.title or content.venue (for venue-based search)
+            search_params = {
+                'limit': 20,  # Limit results to avoid overwhelming the API
+                'details': 'directReplies'  # Get basic details
+            }
+
+            # Try searching by venue first if year suggests recent conferences
+            if year and year >= 2017:  # OpenReview started around 2017
+                venues_by_year = {
+                    2025: ['ICLR 2025'],
+                    2024: ['ICLR 2024', 'NeurIPS 2024', 'ICML 2024'],
+                    2023: ['ICLR 2023', 'NeurIPS 2023', 'ICML 2023'],
+                    2022: ['ICLR 2022', 'NeurIPS 2022', 'ICML 2022'],
+                    2021: ['ICLR 2021', 'NeurIPS 2021', 'ICML 2021'],
+                    2020: ['ICLR 2020', 'NeurIPS 2020', 'ICML 2020'],
+                    2019: ['ICLR 2019', 'NeurIPS 2019', 'ICML 2019'],
+                    2018: ['ICLR 2018', 'NeurIPS 2018', 'ICML 2018'],
+                    2017: ['ICLR 2017']
+                }
+
+                possible_venues = venues_by_year.get(year, [])
+
+                results = []
+                for venue in possible_venues:
+                    # Search by venue and then filter by title
+                    venue_params = search_params.copy()
+                    venue_params['content.venue'] = venue
+
+                    api_url = f"{self.api_url}/notes"
+                    response = self._respectful_request(api_url, params=venue_params)
+
+                    if response and response.status_code == 200:
+                        try:
+                            data = response.json()
+                            if 'notes' in data and data['notes']:
+                                for note in data['notes']:
+                                    try:
+                                        metadata = self._parse_api_response(note)
+                                        if metadata and self._is_good_match(metadata, title, authors, year):
+                                            results.append(metadata)
+                                            if len(results) >= 5:  # Limit results
+                                                break
+                                    except Exception as e:
+                                        logger.debug(f"Error parsing note: {e}")
+                                        continue
+
+                            if results:
+                                break  # Found results, no need to search other venues
+
+                        except (json.JSONDecodeError, KeyError) as e:
+                            logger.debug(f"Failed to parse venue search response: {e}")
+                            continue
+                    else:
+                        logger.debug(f"Venue search failed for {venue}: {response.status_code if response else 'No response'}")
+
+                if results:
+                    logger.debug(f"OpenReview API search found {len(results)} matches via venue search")
+                    return results
+
+            # If venue search didn't work, try other approaches
+            # OpenReview API is quite restrictive, so we might need to fall back to web scraping
+            logger.debug("OpenReview API venue search returned no results, trying web search")
+            return []
+
+        except Exception as e:
+            logger.debug(f"OpenReview API search error: {e}")
+            return []
+
+    def _search_via_web(self, title: str, authors: List[str] = None, year: int = None) -> List[Dict[str, Any]]:
+        """
+        Search using OpenReview web interface (fallback)
+
+        Args:
+            title: Clean title to search for
+            authors: List of author names (optional)
+            year: Publication year (optional)
+
+        Returns:
+            List of matching paper dictionaries
+        """
+        try:
+            # Build search URL
+            search_query = title.replace(' ', '+')
+            search_url = f"{self.base_url}/search?term={search_query}"
+
+            response = self._respectful_request(search_url)
+            if not response or response.status_code != 200:
+                return []
+
+            # Parse search results page
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Look for paper links in search results
+            # OpenReview search results typically contain links to forum pages
+            results = []
+
+            # Find links that look like OpenReview paper URLs
+            for link in soup.find_all('a', href=True):
+                href = link.get('href', '')
+                if '/forum?id=' in href:
+                    paper_id = self.extract_paper_id(href)
+                    if paper_id:
+                        # Get full metadata for this paper
+                        metadata = self.get_paper_metadata(paper_id)
+                        if metadata and self._is_good_match(metadata, title, authors, year):
+                            results.append(metadata)
+                            if len(results) >= 5:  # Limit results
+                                break
+
+            logger.debug(f"OpenReview web search found {len(results)} matches")
+            return results
+
+        except Exception as e:
+            logger.debug(f"OpenReview web search error: {e}")
+            return []
+
+    def _is_good_match(self, metadata: Dict[str, Any], search_title: str, authors: List[str] = None, year: int = None) -> bool:
+        """
+        Check if the found paper is a good match for the search criteria
+
+        Args:
+            metadata: Paper metadata from OpenReview
+            search_title: Title we're searching for
+            authors: Authors we're looking for (optional)
+            year: Year we're looking for (optional)
+
+        Returns:
+            True if it's a good match
+        """
+        paper_title = metadata.get('title', '')
+        if not paper_title:
+            return False
+
+        # Check title similarity
+        title_similarity = calculate_title_similarity(search_title, paper_title)
+        if title_similarity < 0.7:  # Require at least 70% similarity
+            return False
+
+        # Check year if provided
+        if year:
+            paper_year = metadata.get('year')
+            if paper_year and abs(int(paper_year) - year) > 1:  # Allow 1 year difference
+                return False
+
+        # Check authors if provided
+        if authors and len(authors) > 0:
+            paper_authors = metadata.get('authors', [])
+            if paper_authors:
+                # Check if at least one author matches
+                author_match = False
+                for search_author in authors[:2]:  # Check first 2 authors
+                    for paper_author in paper_authors[:3]:  # Check first 3 paper authors
+                        if is_name_match(search_author, paper_author):
+                            author_match = True
+                            break
+                    if author_match:
+                        break
+
+                if not author_match:
+                    return False
+
+        return True
+
+    def search_by_title(self, title: str, max_results: int = 5) -> List[Dict[str, Any]]:
+        """
+        Search OpenReview for papers by title using the working search API.
+
+        Args:
+            title: Paper title to search for
+            max_results: Maximum number of results to return
+
+        Returns:
+            List of paper data dictionaries
+        """
+        try:
+            # Use OpenReview's search API with term parameter (this works!)
+            params = {
+                'term': title,
+                'limit': max_results
+            }
+
+            response = self._respectful_request(f"{self.api_url}/notes/search", params=params)
+            if not response or response.status_code != 200:
+                logger.debug(f"OpenReview search API failed with status {response.status_code if response else 'None'}")
+                return []
+
+            data = response.json()
+            papers = []
+
+            for note in data.get('notes', []):
+                # Filter to exact or close title matches
+                note_title = note.get('content', {}).get('title', '')
+                if self._is_title_match(title, note_title):
+                    paper_data = self._parse_api_response(note)
+                    if paper_data:
+                        papers.append(paper_data)
+
+            logger.debug(f"OpenReview search found {len(papers)} matching papers for '{title}'")
+            return papers
+
+        except Exception as e:
+            logger.error(f"Error searching OpenReview by title '{title}': {e}")
+            return []
+
+    def _is_title_match(self, search_title: str, found_title: str, threshold: float = 0.8) -> bool:
+        """
+        Check if two titles match closely enough.
+
+        Args:
+            search_title: Title we're searching for
+            found_title: Title found in search results
+            threshold: Similarity threshold (0.0 to 1.0)
+
+        Returns:
+            True if titles match closely enough
+        """
+        if not search_title or not found_title:
+            return False
+
+        # Exact match
+        if search_title.lower().strip() == found_title.lower().strip():
+            return True
+
+        # Check if one contains the other (for cases where one is longer)
+        search_clean = search_title.lower().strip()
+        found_clean = found_title.lower().strip()
+
+        if search_clean in found_clean or found_clean in search_clean:
+            return True
+
+        # Use similarity calculation from text_utils
+        try:
+            from utils.text_utils import calculate_title_similarity
+            similarity = calculate_title_similarity(search_title, found_title)
+            return similarity >= threshold
+        except ImportError:
+            # Fallback to simple word matching
+            search_words = set(search_clean.split())
+            found_words = set(found_clean.split())
+
+            if not search_words or not found_words:
+                return False
+
+            intersection = search_words.intersection(found_words)
+            union = search_words.union(found_words)
+
+            jaccard_similarity = len(intersection) / len(union) if union else 0
+            return jaccard_similarity >= threshold
+
+    def verify_reference_by_search(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
+        """
+        Verify a reference by searching OpenReview (for papers without URLs).
+
+        Args:
+            reference: Reference data dictionary
+
+        Returns:
+            Tuple of (verified_data, errors_and_warnings, debug_info)
+        """
+        title = reference.get('title', '').strip()
+        if not title:
+            return None, [], "No title provided for search"
+
+        # Search for the paper
+        search_results = self.search_by_title(title)
+
+        if not search_results:
+            return None, [], f"No papers found on OpenReview for title: {title}"
+
+        # Take the best match (first result, as search is already filtered)
+        best_match = search_results[0]
+
+        # Use the existing verify_reference method with the found URL
+        forum_url = best_match.get('forum_url')
+        if forum_url:
+            # Create a reference with the OpenReview URL for verification
+            reference_with_url = reference.copy()
+            reference_with_url['url'] = forum_url
+
+            return self.verify_reference(reference_with_url)
+
+        # If no URL, return the metadata as verification
+        return best_match, [], f"Found on OpenReview: {best_match.get('title')}"
checkers/semantic_scholar.py
CHANGED
@@ -583,8 +583,8 @@ class NonArxivReferenceChecker:

             if not (has_arxiv_url or has_arxiv_doi):
                 errors.append({
-                    '
-                    '
+                    'info_type': 'url',
+                    'info_details': f"Reference could include arXiv URL: {arxiv_url}",
                     'ref_url_correct': arxiv_url
                 })

core/parallel_processor.py
CHANGED
@@ -340,7 +340,7 @@ class ParallelReferenceProcessor:
             # Display errors and warnings
             if result.errors:
                 # Check if there's an unverified error
-                has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' for e in result.errors)
+                has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' or e.get('info_type') == 'unverified' for e in result.errors)

                 if has_unverified_error:
                     # Use the centralized unverified error display function from base checker
@@ -348,9 +348,9 @@ class ParallelReferenceProcessor:

                 # Display all non-unverified errors and warnings
                 for error in result.errors:
-                    if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified':
-                        error_type = error.get('error_type') or error.get('warning_type')
-                        error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
+                    if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified' and error.get('info_type') != 'unverified':
+                        error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
+                        error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')

                         from utils.error_utils import print_labeled_multiline

@@ -359,8 +359,10 @@ class ParallelReferenceProcessor:
                             print(f" ❌ {error_details}")
                         elif 'error_type' in error:
                             print_labeled_multiline("❌ Error", error_details)
-
+                        elif 'warning_type' in error:
                             print_labeled_multiline("⚠️ Warning", error_details)
+                        else:
+                            print_labeled_multiline("ℹ️ Information", error_details)

                 # Show timing info for slow references
                 if result.processing_time > 5.0:
core/refchecker.py
CHANGED
@@ -2033,6 +2033,9 @@ class ArxivReferenceChecker:
             elif 'warning_type' in error:
                 formatted_error['warning_type'] = error['warning_type']
                 formatted_error['warning_details'] = error['warning_details']
+            elif 'info_type' in error:
+                formatted_error['info_type'] = error['info_type']
+                formatted_error['info_details'] = error['info_details']

             # Add correct information based on error type
             if error.get('error_type') == 'author':
@@ -2042,6 +2045,8 @@ class ArxivReferenceChecker:
             elif error.get('error_type') == 'doi':
                 from utils.doi_utils import construct_doi_url
                 formatted_error['ref_url_correct'] = construct_doi_url(error.get('ref_doi_correct', ''))
+            elif error.get('info_type') == 'url':
+                formatted_error['ref_url_correct'] = error.get('ref_url_correct', '')

             formatted_errors.append(formatted_error)

@@ -2153,17 +2158,22 @@ class ArxivReferenceChecker:
         for error in errors:
             formatted_error = {}

-            # Handle error_type and
+            # Handle error_type, warning_type, and info_type properly
             if 'error_type' in error:
                 formatted_error['error_type'] = error['error_type']
                 formatted_error['error_details'] = error['error_details']
             elif 'warning_type' in error:
                 formatted_error['warning_type'] = error['warning_type']
                 formatted_error['warning_details'] = error['warning_details']
+            elif 'info_type' in error:
+                formatted_error['info_type'] = error['info_type']
+                formatted_error['info_details'] = error['info_details']

             # Add correct information based on error type
             if error.get('warning_type') == 'year':
                 formatted_error['ref_year_correct'] = error.get('ref_year_correct', '')
+            elif error.get('info_type') == 'url':
+                formatted_error['ref_url_correct'] = error.get('ref_url_correct', '')

             formatted_errors.append(formatted_error)

@@ -2214,13 +2224,16 @@ class ArxivReferenceChecker:
         for error in errors:
             formatted_error = {}

-            # Handle error_type and
+            # Handle error_type, warning_type, and info_type properly
             if 'error_type' in error:
                 formatted_error['error_type'] = error['error_type']
                 formatted_error['error_details'] = error['error_details']
             elif 'warning_type' in error:
                 formatted_error['warning_type'] = error['warning_type']
                 formatted_error['warning_details'] = error['warning_details']
+            elif 'info_type' in error:
+                formatted_error['info_type'] = error['info_type']
+                formatted_error['info_details'] = error['info_details']

             formatted_errors.append(formatted_error)

@@ -2335,13 +2348,16 @@ class ArxivReferenceChecker:
             logger.debug(f"DEBUG: Error {i}: {error}")
             formatted_error = {}

-            # Handle error_type and
+            # Handle error_type, warning_type, and info_type properly
             if 'error_type' in error:
                 formatted_error['error_type'] = error['error_type']
                 formatted_error['error_details'] = error['error_details']
             elif 'warning_type' in error:
                 formatted_error['warning_type'] = error['warning_type']
                 formatted_error['warning_details'] = error['warning_details']
+            elif 'info_type' in error:
+                formatted_error['info_type'] = error['info_type']
+                formatted_error['info_details'] = error['info_details']

             # Add correct information based on error type
             if error.get('error_type') == 'author':
@@ -2657,8 +2673,8 @@ class ArxivReferenceChecker:
         else:
             # Single error - handle as before
             error = errors[0]
-            error_type = error.get('error_type') or error.get('warning_type', 'unknown')
-            error_details = error.get('error_details') or error.get('warning_details', '')
+            error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type', 'unknown')
+            error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', '')

         error_entry = {
             # Source paper metadata
@@ -2775,7 +2791,9 @@ class ArxivReferenceChecker:
             emoji = "❓"
         elif error_type in ['year', 'venue']:  # Warning types
             emoji = "⚠️"
-
+        elif error_type == 'url':  # Info type (ArXiv URL suggestion)
+            emoji = "ℹ️"
+        else:  # Error types (title, author, doi, multiple, etc.)
             emoji = "❌"

         f.write(f"Type: {emoji} {error_entry['error_type']}\n")
@@ -2906,8 +2924,10 @@ class ArxivReferenceChecker:
         self.total_references_processed = 0
         self.papers_with_errors = 0
         self.papers_with_warnings = 0
+        self.papers_with_info = 0
         self.total_errors_found = 0
         self.total_warnings_found = 0
+        self.total_info_found = 0
         self.total_arxiv_refs = 0
         self.total_non_arxiv_refs = 0
         self.total_other_refs = 0
@@ -3066,18 +3086,21 @@ class ArxivReferenceChecker:
             # Separate actual errors from warnings for paper classification
             actual_errors = [e for e in paper_errors if 'error_type' in e and e['error_type'] != 'unverified']
             warnings_only = [e for e in paper_errors if 'warning_type' in e]
+            info_only = [e for e in paper_errors if 'info_type' in e]

             if self.single_paper_mode:
                 # Single paper mode - show simple summary
-                if actual_errors or warnings_only:
+                if actual_errors or warnings_only or info_only:
                     summary_parts = []
                     if actual_errors:
                         summary_parts.append(f"{len(actual_errors)} errors")
                     if warnings_only:
                         summary_parts.append(f"{len(warnings_only)} warnings")
+                    if info_only:
+                        summary_parts.append(f"{len(info_only)} information")
             else:
                 # Multi-paper mode - track paper statistics
-                if actual_errors or warnings_only:
+                if actual_errors or warnings_only or info_only:
                     summary_parts = []
                     if actual_errors:
                         summary_parts.append(f"{len(actual_errors)} errors")
@@ -3086,6 +3109,10 @@ class ArxivReferenceChecker:
                         summary_parts.append(f"{len(warnings_only)} warnings")
                         # Count as paper with warnings if it has warnings (regardless of errors)
                         self.papers_with_warnings += 1
+                    if info_only:
+                        summary_parts.append(f"{len(info_only)} information")
+                        # Count as paper with info if it has info messages (regardless of errors/warnings)
+                        self.papers_with_info += 1

         except Exception as e:
             logger.error(f"Error processing paper {paper_id}: {str(e)}")
@@ -3127,9 +3154,11 @@ class ArxivReferenceChecker:
             print(f"❌ Total errors: {self.total_errors_found}")
             if self.total_warnings_found > 0:
                 print(f"⚠️ Total warnings: {self.total_warnings_found}")
+            if self.total_info_found > 0:
+                print(f"ℹ️ Total information: {self.total_info_found}")
             if self.total_unverified_refs > 0:
                 print(f"❓ References that couldn't be verified: {self.total_unverified_refs}")
-            if self.total_errors_found == 0 and self.total_warnings_found == 0 and self.total_unverified_refs == 0:
+            if self.total_errors_found == 0 and self.total_warnings_found == 0 and self.total_info_found == 0 and self.total_unverified_refs == 0:
                 print(f"✅ All references verified successfully!")

             # Show warning if unreliable extraction was used and there are many errors
@@ -3149,6 +3178,8 @@ class ArxivReferenceChecker:
             print(f" Total errors: {self.total_errors_found}")
             print(f"⚠️ Papers with warnings: {self.papers_with_warnings}")
             print(f" Total warnings: {self.total_warnings_found}")
+            print(f"ℹ️ Papers with information: {self.papers_with_info}")
+            print(f" Total information: {self.total_info_found}")
             print(f"❓ References that couldn't be verified: {self.total_unverified_refs}")

             # Show warning if unreliable extraction was used and there are many errors
@@ -5348,7 +5379,7 @@ class ArxivReferenceChecker:
                 # If errors found, add to dataset and optionally print details
                 if errors:
                     # Check if there's an unverified error among the errors
-                    has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' for e in errors)
+                    has_unverified_error = any(e.get('error_type') == 'unverified' or e.get('warning_type') == 'unverified' or e.get('info_type') == 'unverified' for e in errors)

                     if has_unverified_error:
                         self.total_unverified_refs += 1
@@ -5358,11 +5389,13 @@ class ArxivReferenceChecker:
                         self.add_error_to_dataset(paper, reference, errors, reference_url, verified_data)
                         paper_errors.extend(errors)

-                        # Count errors vs warnings
+                        # Count errors vs warnings vs info
                         error_count = sum(1 for e in errors if 'error_type' in e and e['error_type'] != 'unverified')
                         warning_count = sum(1 for e in errors if 'warning_type' in e)
+                        info_count = sum(1 for e in errors if 'info_type' in e)
                         self.total_errors_found += error_count
                         self.total_warnings_found += warning_count
+                        self.total_info_found += info_count

                         # Display all non-unverified errors and warnings
                         self._display_non_unverified_errors(errors, debug_mode, print_output)
@@ -5509,9 +5542,9 @@ class ArxivReferenceChecker:
         """Display all non-unverified errors and warnings"""
         if not debug_mode and print_output:
             for error in errors:
-                if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified':
-                    error_type = error.get('error_type') or error.get('warning_type')
-                    error_details = error.get('error_details') or error.get('warning_details', 'Unknown error')
+                if error.get('error_type') != 'unverified' and error.get('warning_type') != 'unverified' and error.get('info_type') != 'unverified':
+                    error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
+                    error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')

                     from utils.error_utils import print_labeled_multiline

@@ -5519,8 +5552,10 @@ class ArxivReferenceChecker:
                         print(f" ❌ {error_details}")
                     elif 'error_type' in error:
                         print_labeled_multiline("❌ Error", error_details)
-
+                    elif 'warning_type' in error:
                         print_labeled_multiline("⚠️ Warning", error_details)
+                    else:
+                        print_labeled_multiline("ℹ️ Information", error_details)

     def _output_reference_errors(self, reference, errors, url):
         """
utils/arxiv_utils.py
CHANGED
@@ -111,56 +111,8 @@ def download_arxiv_source(arxiv_id):
             main_tex_content = largest_file[1]
             logger.debug(f"Using largest tex file: {largest_file[0]}")

-        #
-        bib_content =
-        if bib_files and main_tex_content:
-            # Extract bibliography references from main tex file
-            referenced_bibs = []
-            bib_pattern = r'\\bibliography\{([^}]+)\}'
-            matches = re.findall(bib_pattern, main_tex_content)
-
-            for match in matches:
-                # Handle multiple bib files separated by commas
-                bib_names = [name.strip() for name in match.split(',')]
-                for bib_name in bib_names:
-                    # Add .bib extension if not present
-                    if not bib_name.endswith('.bib'):
-                        bib_name += '.bib'
-                    referenced_bibs.append(bib_name)
-
-            # Use only referenced .bib files, or all if no references found
-            if referenced_bibs:
-                used_bibs = []
-                for bib_name in referenced_bibs:
-                    if bib_name in bib_files:
-                        used_bibs.append(bib_files[bib_name])
-                        logger.debug(f"Using referenced .bib file: {bib_name}")
-                    else:
-                        logger.debug(f"Referenced .bib file not found: {bib_name}")
-
-                if used_bibs:
-                    raw_bib_content = '\n\n'.join(used_bibs)
-
-                    # Filter BibTeX to only include cited references
-                    bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
-
-                    logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
-                else:
-                    # Fallback to all bib files if none of the referenced ones found
-                    raw_bib_content = '\n\n'.join(bib_files.values())
-                    bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
-                    logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
-            else:
-                # No \bibliography command found, use all bib files
-                raw_bib_content = '\n\n'.join(bib_files.values())
-                bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
-                logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
-        elif bib_files:
-            # No main tex file but have bib files
-            raw_bib_content = '\n\n'.join(bib_files.values())
-            # Can't filter without tex files, so use original content
-            bib_content = raw_bib_content
-            logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
+        # Process .bib files using shared logic
+        bib_content = select_and_filter_bib_files(bib_files, main_tex_content, tex_files)

         # Combine all bbl file contents
         bbl_content = None
@@ -219,6 +171,78 @@ def download_arxiv_bibtex(arxiv_id):
         return None


+def select_and_filter_bib_files(bib_files, main_tex_content, tex_files):
+    """
+    Select appropriate .bib files based on main TeX file references and filter by citations.
+
+    Args:
+        bib_files: Dict of .bib files {filename: content}
+        main_tex_content: Content of main tex file
+        tex_files: Dict of all tex files {filename: content} (for filtering)
+
+    Returns:
+        Filtered BibTeX content or None if no files available
+    """
+    import re
+
+    if not bib_files:
+        return None
+
+    if main_tex_content:
+        # Extract bibliography references from main tex file
+        referenced_bibs = []
+        bib_pattern = r'\\bibliography\{([^}]+)\}'
+        matches = re.findall(bib_pattern, main_tex_content)
+
+        for match in matches:
+            # Handle multiple bib files separated by commas
+            bib_names = [name.strip() for name in match.split(',')]
+            for bib_name in bib_names:
+                # Add .bib extension if not present
+                if not bib_name.endswith('.bib'):
+                    bib_name += '.bib'
+                referenced_bibs.append(bib_name)
+
+        # Use only referenced .bib files, or all if no references found
+        if referenced_bibs:
+            used_bibs = []
+            seen_bib_names = set()  # Track which bib files we've already added
+            for bib_name in referenced_bibs:
+                if bib_name in bib_files and bib_name not in seen_bib_names:
+                    used_bibs.append(bib_files[bib_name])
+                    seen_bib_names.add(bib_name)
+                    logger.debug(f"Using referenced .bib file: {bib_name}")
+                elif bib_name in seen_bib_names:
+                    logger.debug(f"Skipping duplicate .bib file: {bib_name}")
+                else:
+                    logger.debug(f"Referenced .bib file not found: {bib_name}")
+
+            if used_bibs:
+                raw_bib_content = '\n\n'.join(used_bibs)
+                # Filter BibTeX to only include cited references
+                filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
+                logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
+                return filtered_content
+            else:
+                # Fallback to all bib files if none of the referenced ones found
+                raw_bib_content = '\n\n'.join(bib_files.values())
+                filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
+                logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
+                return filtered_content
+        else:
+            # No \bibliography command found, use all bib files
+            raw_bib_content = '\n\n'.join(bib_files.values())
+            filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
+            logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
+            return filtered_content
+    else:
+        # No main tex file but have bib files
+        raw_bib_content = '\n\n'.join(bib_files.values())
+        # Can't filter without tex files, so use original content
+        logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
+        return raw_bib_content
+
+
 def extract_cited_keys_from_tex(tex_files, main_tex_content):
     """
     Extract all citation keys from TeX files.
@@ -261,7 +285,11 @@ def is_reference_used(reference_key, cited_keys):
     Returns:
         True if the reference is cited, False otherwise
     """
-
+    result = reference_key in cited_keys
+    # Add debugging for the first few mismatches to understand the issue
+    if not result and len([k for k in cited_keys if k.startswith('a')]) < 3:  # Limit debug output
+        logger.debug(f"Key '{reference_key}' not found in cited_keys")
+    return result


 def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
@@ -291,14 +319,30 @@ def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
     from utils.bibtex_parser import parse_bibtex_entries
     entries = parse_bibtex_entries(bib_content)

-    # Filter entries to only cited ones
+    # Filter entries to only cited ones and remove duplicates
     cited_entries = []
+    seen_keys = set()
+    not_cited_count = 0
+    duplicate_count = 0
+
     for entry in entries:
         entry_key = entry.get('key', '')
         if is_reference_used(entry_key, cited_keys):
-
+            if entry_key not in seen_keys:
+                cited_entries.append(entry)
+                seen_keys.add(entry_key)
+            else:
+                duplicate_count += 1
+                logger.debug(f"Skipping duplicate entry: '{entry_key}'")
+        else:
+            not_cited_count += 1
+            # Log first few entries that are NOT cited for debugging
+            if not_cited_count <= 5:
+                logger.debug(f"Entry NOT cited: '{entry_key}'")

-    logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited")
+    logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited (removed {duplicate_count} duplicates)")
+    logger.debug(f"Citation keys found: {len(cited_keys)} keys")
+    logger.debug(f"Sample cited keys: {list(cited_keys)[:10]}")

     # Reconstruct BibTeX content from cited entries
     if not cited_entries:
@@ -392,41 +436,43 @@ def get_bibtex_content(paper):
         logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
         tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)

-        # Choose between .bib and .bbl files
-        #
+        # Choose between .bib and .bbl files based on what the main TeX file actually uses
+        # Check the main TeX file to see if it uses \bibliography{...} (BibTeX) or not (BBL)
+        uses_bibtex = False
+        if tex_content:
+            # Look for \bibliography{...} commands in the main TeX file
+            bib_pattern = r'\\bibliography\{([^}]+)\}'
+            bib_matches = re.findall(bib_pattern, tex_content)
+            if bib_matches:
+                uses_bibtex = True
+                referenced_bibs = []
+                for match in bib_matches:
+                    bib_names = [name.strip() for name in match.split(',')]
+                    referenced_bibs.extend(bib_names)
+                logger.debug(f"Main TeX file references BibTeX files: {referenced_bibs}")
+
         if bib_content and bbl_content:
             # Count entries in both for logging
             bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
-            bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
+            bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))

             logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, .bib has {bib_entry_count} entries")

-
-
-
+            if uses_bibtex and bib_entry_count > 0:
+                logger.info(f"Using .bib files from ArXiv source (main TeX uses \\bibliography{{...}})")
+                return bib_content
+            elif bbl_entry_count > 0:
+                logger.info(f"Using .bbl files from ArXiv source (main TeX doesn't use \\bibliography or .bib is empty)")
                 return bbl_content
-
+            elif bib_entry_count > 0:
                 logger.info(f"Using .bib files from ArXiv source (.bbl file is empty)")
-                # If we have LaTeX content, filter BibTeX by cited keys
-                if tex_content:
-                    cited_keys = extract_cited_keys_from_tex({}, tex_content)
-                    if cited_keys:
-                        logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
-                        filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
-                        return filtered_content
                 return bib_content
+            else:
+                logger.warning(f"Both .bib and .bbl files appear to be empty")
+                return bib_content  # Default to bib_content as fallback

         elif bib_content:
             logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
-
-            # If we have LaTeX content, filter BibTeX by cited keys
-            if tex_content:
-                cited_keys = extract_cited_keys_from_tex({}, tex_content)
-                if cited_keys:
-                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
-                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
-                    return filtered_content
-
             return bib_content

         elif bbl_content:
utils/error_utils.py
CHANGED
@@ -294,6 +294,39 @@ def create_generic_warning(warning_type: str, warning_details: str, **kwargs) ->
     return warning_dict


+def create_generic_info(info_type: str, info_details: str, **kwargs) -> Dict[str, Any]:
+    """
+    Create a generic info dictionary with custom fields.
+
+    Args:
+        info_type: Type of info (e.g., 'url')
+        info_details: Description of the information
+        **kwargs: Additional fields to include in the info dictionary
+
+    Returns:
+        Standardized info dictionary
+    """
+    info_dict = {
+        'info_type': info_type,
+        'info_details': info_details
+    }
+
+    info_dict.update(kwargs)
+    return info_dict
+
+
+def create_info_message(reference, reason, arxiv_url=None):
+    """Create a standardized info message structure."""
+    info_msg = {
+        'info_type': 'arxiv_url_available',
+        'reference': reference,
+        'reason': reason
+    }
+    if arxiv_url:
+        info_msg['arxiv_url'] = arxiv_url
+    return info_msg
+
+
 def format_author_mismatch(author_number: int, cited_author: str, correct_author: str) -> str:
     """
     Format a three-line author mismatch message.
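
A short sketch of how the new info dictionaries plug into the counting logic added in core/refchecker.py; the arXiv URL below is a placeholder and the import assumes the installed package layout:

# Build an info entry with the new helper and classify it the way refchecker.py
# now does (error_type / warning_type / info_type buckets); the URL is fake.
from utils.error_utils import create_generic_info

errors = [
    create_generic_info(
        'url',
        'Reference could include arXiv URL: https://arxiv.org/abs/0000.00000',
        ref_url_correct='https://arxiv.org/abs/0000.00000',
    ),
]

info_count = sum(1 for e in errors if 'info_type' in e)
print(info_count)  # 1 -> would be reported as "1 information" in the summary
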
utils/text_utils.py
CHANGED
@@ -2337,6 +2337,9 @@ def format_author_for_display(author_name):
     if not author_name:
         return author_name

+    # First clean the author name to remove asterisks and other unwanted characters
+    author_name = clean_author_name(author_name)
+
     # Clean up any stray punctuation that might have been attached during parsing
     author_name = author_name.strip()
     # Remove trailing semicolons that sometimes get attached during bibliographic parsing
@@ -3899,6 +3902,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
         # Handle specific multi-word patterns and well-known acronyms
         'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
         'pnas': 'proceedings of the national academy of sciences',
+        # Special cases that don't follow standard acronym patterns
+        'neurips': 'neural information processing systems',  # Special case
+        'nips': 'neural information processing systems',  # old name for neurips
     }
     # Sort by length (longest first) to ensure longer matches take precedence
     for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
{academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/WHEEL
RENAMED
File without changes

{academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/entry_points.txt
RENAMED
File without changes

{academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/licenses/LICENSE
RENAMED
File without changes

{academic_refchecker-1.2.48.dist-info → academic_refchecker-1.2.50.dist-info}/top_level.txt
RENAMED
File without changes