academic-refchecker 1.2.36__py3-none-any.whl → 1.2.38__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/METADATA +1 -1
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/RECORD +14 -11
- core/refchecker.py +56 -299
- utils/arxiv_utils.py +77 -1
- utils/biblatex_parser.py +485 -0
- utils/bibliography_utils.py +332 -0
- utils/bibtex_parser.py +334 -0
- utils/text_utils.py +72 -183
- utils/url_utils.py +29 -12
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/top_level.txt +0 -0
{academic_refchecker-1.2.36.dist-info → academic_refchecker-1.2.38.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
-__version__.py,sha256=
-academic_refchecker-1.2.
+__version__.py,sha256=9ez-UBx1mkgUvDMk-z63_XpqOh2QnPCeTrDEuricP1w,65
+academic_refchecker-1.2.38.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
 checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -15,7 +15,7 @@ config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
 core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
 core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
 core/parallel_processor.py,sha256=2S1cAPhtWH3glvtiJrt9JkZzk2iJkPKXsc-F3lg0X6U,16795
-core/refchecker.py,sha256=
+core/refchecker.py,sha256=8EatAqYEDpW219Xrn-ql1oQ5ytmCU8RW8pMtlujRbC8,273167
 database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
 database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
 llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,18 +26,21 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
 services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
 services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
 utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
-utils/arxiv_utils.py,sha256=
+utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
 utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
+utils/biblatex_parser.py,sha256=Vznt-BfNtQQb4XQ6iPab2CgFcV2JIjva1OU33NzQ51g,20253
+utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
+utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
 utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
 utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
 utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
 utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
 utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-utils/text_utils.py,sha256=
+utils/text_utils.py,sha256=KjNx_UJvVhz-oowu4CCdryEuN0hYLu4X8yVkjdYP8fM,189261
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
-utils/url_utils.py,sha256=
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
+utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
+academic_refchecker-1.2.38.dist-info/METADATA,sha256=7V0yEKZy9zao6s3_TBHPOg7Gi86h4lG2m_rhyhStq5w,22298
+academic_refchecker-1.2.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.38.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.38.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.38.dist-info/RECORD,,
core/refchecker.py
CHANGED
@@ -451,47 +451,10 @@ class ArxivReferenceChecker:
 
     def extract_arxiv_id_from_url(self, url):
         """
-        Extract ArXiv ID from a URL or text containing ArXiv reference
+        Extract ArXiv ID from a URL or text containing ArXiv reference.
+        Uses the common extraction function from utils.url_utils.
         """
-
-        return None
-
-        # First, check for arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
-        arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
-        if arxiv_match:
-            arxiv_id = arxiv_match.group(1)
-            # Remove version number if present
-            arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
-            return arxiv_id
-
-        # Remove version string from end if present (e.g., 'v1')
-        url = re.sub(r'v\d+$', '', url)
-
-        # Parse URL
-        parsed_url = urlparse(url)
-
-        # Check if it's an arxiv.org URL
-        if 'arxiv.org' in parsed_url.netloc:
-            # Extract ID from path
-            path = parsed_url.path.strip('/')
-
-            # Handle different URL formats
-            if path.startswith('abs/'):
-                arxiv_id = path.replace('abs/', '')
-            elif path.startswith('pdf/'):
-                arxiv_id = path.replace('pdf/', '').replace('.pdf', '')
-            elif '/abs/' in path:
-                arxiv_id = path.split('/abs/')[1]
-            elif '/pdf/' in path:
-                arxiv_id = path.split('/pdf/')[1].replace('.pdf', '')
-            else:
-                arxiv_id = path
-
-            # Remove version number from the extracted ID
-            arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
-            return arxiv_id
-
-        return None
+        return extract_arxiv_id_from_url(url)
 
     def get_paper_metadata(self, arxiv_id):
         """
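For context, the inline URL handling removed above is consolidated into utils/url_utils.py. A minimal standalone sketch of the extraction behavior being consolidated, assuming the shared helper covers the same URL shapes the old inline code handled (the shipped implementation may differ in details):

```python
import re
from urllib.parse import urlparse

def extract_arxiv_id(text):
    """Illustrative re-implementation of the consolidated helper."""
    # "arXiv:1610.10099" style references
    m = re.search(r'arXiv:(\d{4}\.\d{4,5})', text, re.IGNORECASE)
    if m:
        return m.group(1)
    # arxiv.org URLs such as https://arxiv.org/abs/1610.10099v2 or .../pdf/1610.10099.pdf
    parsed = urlparse(re.sub(r'v\d+$', '', text))
    if 'arxiv.org' in parsed.netloc:
        path = parsed.path.strip('/')
        for marker in ('abs/', 'pdf/'):
            if marker in path:
                candidate = path.split(marker, 1)[1].replace('.pdf', '')
                return re.sub(r'v\d+$', '', candidate)
        return re.sub(r'v\d+$', '', path)
    return None

assert extract_arxiv_id("https://arxiv.org/abs/1610.10099v2") == "1610.10099"
assert extract_arxiv_id("arXiv preprint arXiv:1610.10099") == "1610.10099"
```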
@@ -3410,13 +3373,21 @@ class ArxivReferenceChecker:
             return self._parse_standard_acm_natbib_references(bibliography_text)
 
         # Check if this is BibTeX format
-        from utils.
+        from utils.bibtex_parser import detect_bibtex_format
         if detect_bibtex_format(bibliography_text):
             logger.info("Detected BibTeX format, using BibTeX parser")
             self.used_regex_extraction = True
             # Note: BibTeX parsing is robust, so we don't set used_unreliable_extraction
             return self._parse_bibtex_references(bibliography_text)
 
+        # Check if this is biblatex format
+        from utils.biblatex_parser import detect_biblatex_format
+        if detect_biblatex_format(bibliography_text):
+            logger.info("Detected biblatex format, using biblatex parser")
+            self.used_regex_extraction = True
+            # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
+            return self._parse_biblatex_references(bibliography_text)
+
         # For non-standard formats, try LLM-based extraction if available
        if self.llm_extractor:
            try:
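Read together with the BibTeX branch above, the method now forms a detection cascade: structured BibTeX first, then biblatex .bbl output, then the LLM/regex fallbacks below. A hedged sketch of that control flow; the two detect_* heuristics here are illustrative stand-ins, not the implementations shipped in the new parser modules:

```python
import re

def detect_bibtex_format(text: str) -> bool:
    # BibTeX sources are dominated by @entrytype{key, ...} records
    return bool(re.search(r'@\w+\s*\{\s*[^,\s]+\s*,', text))

def detect_biblatex_format(text: str) -> bool:
    # biblatex .bbl output tends to read like: [1] A. Author. "Title". In: Venue. 2020.
    return bool(re.search(r'^\[\d+\]\s', text, re.MULTILINE))

def parse_references(bibliography_text: str) -> str:
    if detect_bibtex_format(bibliography_text):
        return "bibtex"      # -> _parse_bibtex_references
    if detect_biblatex_format(bibliography_text):
        return "biblatex"    # -> _parse_biblatex_references
    return "llm-or-regex"    # -> fallback extraction

print(parse_references('@article{key, title={T}}'))   # bibtex
print(parse_references('[1] A. Author. "T". 2020.'))  # biblatex
```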
@@ -3573,11 +3544,9 @@ class ArxivReferenceChecker:
             # Clean author part and extract authors
             author_part_clean = strip_latex_commands(author_part).strip()
             if author_part_clean and not author_part_clean.startswith('\\'):
-                # Parse author names
-
-
-                else:
-                    author_names = [name.strip() for name in author_part_clean.split(',')]
+                # Parse author names using the robust author parsing function
+                from utils.text_utils import parse_authors_with_initials
+                author_names = parse_authors_with_initials(author_part_clean)
 
                 # Clean up author names
                 authors = []
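The replaced branch split the cleaned author string on every comma, which mangles "Last, F."-style names into separate bogus authors; the new code delegates to parse_authors_with_initials instead. A standalone sketch of the failure mode and of an initials-aware split (the regex here is illustrative, not the routine shipped in utils/text_utils.py):

```python
import re

def split_authors(author_str):
    # Split on a comma only when it is NOT followed by bare initials,
    # i.e. never split between a surname and its "J." / "A. B." part.
    return [p.strip() for p in
            re.split(r',\s*(?![A-Z]\.(?:\s*[A-Z]\.)*(?:,|$))', author_str)
            if p.strip()]

names = "Smith, J., Doe, A. B."
print([n.strip() for n in names.split(',')])  # ['Smith', 'J.', 'Doe', 'A. B.'] -- wrong
print(split_authors(names))                   # ['Smith, J.', 'Doe, A. B.']
```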
@@ -3630,11 +3599,19 @@ class ArxivReferenceChecker:
         self.used_regex_extraction = True
 
         # Check if this is BibTeX format first
-        
+        from utils.bibtex_parser import detect_bibtex_format
+        if detect_bibtex_format(bibliography_text):
             logger.debug("Detected BibTeX format, using BibTeX-specific parsing")
             # BibTeX parsing is robust, so we don't set used_unreliable_extraction
             return self._parse_bibtex_references(bibliography_text)
 
+        # Check if this is biblatex format
+        from utils.biblatex_parser import detect_biblatex_format
+        if detect_biblatex_format(bibliography_text):
+            logger.debug("Detected biblatex format, using biblatex-specific parsing")
+            # biblatex parsing is also robust, so we don't set used_unreliable_extraction
+            return self._parse_biblatex_references(bibliography_text)
+
         # If we reach here, we're using the unreliable fallback regex parsing
         self.used_unreliable_extraction = True
@@ -4066,214 +4043,33 @@ class ArxivReferenceChecker:
         Returns:
             List of structured reference dictionaries
         """
-        # Use the
-        from utils.
+        # Use the dedicated BibTeX parser
+        from utils.bibtex_parser import parse_bibtex_references
 
-        # Extract references using the
-        references =
+        # Extract references using the BibTeX parser
+        references = parse_bibtex_references(bibliography_text)
 
-        logger.debug(f"Extracted {len(references)} BibTeX references using
+        logger.debug(f"Extracted {len(references)} BibTeX references using dedicated parser")
         return references
 
-    def
+    def _parse_biblatex_references(self, bibliography_text):
         """
-        Parse
+        Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
 
         Args:
-
-            content: Content inside the braces
+            bibliography_text: String containing biblatex .bbl entries
 
         Returns:
-
+            List of structured reference dictionaries
         """
-
-        from utils.
-        from utils.doi_utils import construct_doi_url, is_valid_doi_format
-
-        # Extract key (first part before comma)
-        key_match = re.match(r'([^,]+),', content)
-        key = key_match.group(1).strip() if key_match else ""
-
-        # Extract fields using regex
-        fields = {}
-
-        # Pattern to match field = {value} or field = "value"
-        field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
-
-        for match in re.finditer(field_pattern, content, re.DOTALL):
-            field_name = match.group(1).lower()
-            field_value = match.group(2) or match.group(3) or ""
-            # Strip outer quotes if present (handles cases like title = {"Some Title"})
-            field_value = field_value.strip()
-            if field_value.startswith('"') and field_value.endswith('"'):
-                field_value = field_value[1:-1]
-            fields[field_name] = field_value
-
-        # If field extraction failed, try a simpler approach
-        if not fields:
-            logger.debug("Field extraction failed, trying line-by-line approach")
-            lines = content.split('\n')
-            for line in lines:
-                line = line.strip()
-                if '=' in line:
-                    field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
-                    if field_match:
-                        field_name = field_match.group(1).lower()
-                        field_value = field_match.group(2).strip()
-                        # Strip outer quotes if present
-                        if field_value.startswith('"') and field_value.endswith('"'):
-                            field_value = field_value[1:-1]
-                        fields[field_name] = field_value
-
-        # Extract required information
-        title = fields.get('title', '')
-        author_string = fields.get('author', '')
-        year = 0
-
-        # Parse year
-        year_str = fields.get('year', '')
-        if year_str:
-            year_match = re.search(r'\d{4}', year_str)
-            if year_match:
-                year = int(year_match.group())
-
-        # If no year found but we have a valid title/authors, try extracting from eprint or other fields
-        if year == 0 and (title or author_string):
-            # Check eprint field for arXiv entries like "2024" prefix
-            eprint = fields.get('eprint', '')
-            if eprint:
-                # Extract year from ArXiv eprint ID (e.g., "2311.09096" -> 2023)
-                eprint_year_match = re.match(r'^(\d{2})(\d{2})', eprint)
-                if eprint_year_match:
-                    yy = int(eprint_year_match.group(1))
-                    # Convert to 4-digit year (23 -> 2023, assumes 21st century)
-                    if yy >= 91:  # ArXiv started in 1991
-                        year = 1900 + yy
-                    else:
-                        year = 2000 + yy
-
-        # For entries without year, set None instead of 0
-        if year == 0:
-            year = None
-
-        # Parse authors using the enhanced function
-        authors = []
-        if author_string:
-            try:
-                authors = parse_authors_with_initials(author_string)
-            except Exception as e:
-                logger.debug(f"Author parsing failed for '{author_string}': {e}")
-                # Fallback: split by 'and' and clean up
-                author_parts = author_string.split(' and ')
-                authors = []
-                for part in author_parts:
-                    # Remove leading "and" from author names (handles cases like "and Krishnamoorthy, S")
-                    part = re.sub(r'^and\s+', '', part.strip())
-                    if part:
-                        authors.append(part)
-
-        # Special handling for @misc entries with only howpublished field
-        if not title and not authors and entry_type == 'misc':
-            howpublished = fields.get('howpublished', '')
-            if howpublished:
-                # Try to extract a URL from howpublished
-                url_patterns = [
-                    r'://([^/]+)',  # Missing protocol case: "://example.com/path"
-                    r'https?://([^/\s]+)',  # Standard URL
-                    r'www\.([^/\s]+)',  # www without protocol
-                ]
-
-                extracted_url = ''
-                for pattern in url_patterns:
-                    match = re.search(pattern, howpublished)
-                    if match:
-                        domain = match.group(1)
-                        # Reconstruct URL with https if protocol was missing
-                        if howpublished.startswith('://'):
-                            extracted_url = 'https' + howpublished
-                        elif not howpublished.startswith(('http://', 'https://')):
-                            extracted_url = 'https://' + howpublished
-                        else:
-                            extracted_url = howpublished
-
-                        # Generate title from domain/path
-                        if 'jailbreakchat.com' in domain:
-                            title = 'JailbreakChat Website'
-                        elif 'lesswrong.com' in domain:
-                            title = 'LessWrong Post: Jailbreaking ChatGPT'
-                        elif 'chat.openai.com' in domain:
-                            title = 'ChatGPT Conversation Share'
-                        elif 'gemini.google.com' in domain:
-                            title = 'Gemini Conversation Share'
-                        elif 'microsoft.com' in domain:
-                            title = 'Microsoft Azure Content Safety API'
-                        elif 'perspectiveapi.com' in domain:
-                            title = 'Perspective API'
-                        else:
-                            # Generic title based on domain
-                            title = f"Web Resource: {domain}"
-
-                        authors = ["Web Resource"]
-                        # Store the extracted URL
-                        fields['url'] = extracted_url
-                        break
+        # Use the dedicated biblatex parser
+        from utils.biblatex_parser import parse_biblatex_references
 
-        #
-
-        authors = ["Unknown Author"]
-
-        # Clean title
-        title = clean_title(title) if title else "Unknown Title"
-
-        # Extract URL/DOI
-        url = fields.get('url', '')
-        doi = fields.get('doi', '')
-
-        # Construct DOI URL if we have a DOI
-        if doi and is_valid_doi_format(doi):
-            url = construct_doi_url(doi)
-
-        # Construct ArXiv URL from eprint field if no URL present
-        if not url:
-            eprint = fields.get('eprint', '')
-            if eprint and re.match(r'^\d{4}\.\d{4,5}', eprint):
-                # Remove version number if present and construct ArXiv URL
-                clean_eprint = re.sub(r'v\d+$', '', eprint)
-                url = f"https://arxiv.org/abs/{clean_eprint}"
-
-        # Handle special URL fields
-        if not url:
-            howpublished = fields.get('howpublished', '')
-            if 'url{' in howpublished or 'href{' in howpublished:
-                url_match = re.search(r'url\{([^}]+)\}', howpublished)
-                if not url_match:
-                    url_match = re.search(r'href\{([^}]+)\}', howpublished)
-                if url_match:
-                    from utils.url_utils import clean_url_punctuation
-                    url = clean_url_punctuation(url_match.group(1))
-
-        # Determine reference type
-        ref_type = 'other'
-        if 'arxiv' in url.lower() or 'arxiv' in title.lower():
-            ref_type = 'arxiv'
-        elif url or doi:
-            ref_type = 'non-arxiv'
-
-        # Create structured reference
-        structured_ref = {
-            'url': url,
-            'doi': doi,
-            'year': year,
-            'authors': authors,
-            'title': title,
-            'raw_text': f"@{entry_type}{{{key}, {content}}}",
-            'type': ref_type,
-            'bibtex_key': key,
-            'bibtex_type': entry_type
-        }
+        # Extract references using the biblatex parser
+        references = parse_biblatex_references(bibliography_text)
 
-        logger.debug(f"
-        return
+        logger.debug(f"Extracted {len(references)} biblatex references using dedicated parser")
+        return references
 
     def _process_llm_extracted_references(self, references):
         """
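Net effect of this hunk: the long inline BibTeX field-extraction body is gone, and both methods become thin wrappers over the new modules, with _parse_biblatex_references added alongside. A sketch of the resulting call shape, assuming the refchecker package is importable; the function names and the structured-reference keys (url, doi, year, authors, title, raw_text, type) are taken from this diff, while the sample input string is made up:

```python
from utils.bibtex_parser import parse_bibtex_references

sample = '@article{vaswani2017, title={Attention Is All You Need}, year={2017}}'
for ref in parse_bibtex_references(sample):
    # Each structured reference is a dict; BibTeX input additionally carries
    # bibtex_key and bibtex_type, mirroring the removed structured_ref layout.
    print(ref['title'], ref.get('year'), ref.get('type'))
```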
@@ -4429,8 +4225,17 @@ class ArxivReferenceChecker:
             return True
 
         # Also check if authors have significant overlap (at least 50% of the shorter author list)
-
-
+        from utils.text_utils import parse_authors_with_initials
+
+        if '*' in seg1['author']:
+            author1_parts = seg1['author'].split('*')
+        else:
+            author1_parts = parse_authors_with_initials(seg1['author'])
+
+        if '*' in seg2['author']:
+            author2_parts = seg2['author'].split('*')
+        else:
+            author2_parts = parse_authors_with_initials(seg2['author'])
 
         # Clean and normalize author names
         author1_clean = {a.strip().lower() for a in author1_parts if a.strip() and a.strip() not in ['et al', 'others']}
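For reference, the overlap rule applied just below, reduced to a standalone sketch: authors are considered a match when at least 50% of the shorter normalized author list also appears in the other one. The threshold and the 'et al'/'others' filtering come from the surrounding lines; name normalization is simplified here:

```python
def authors_overlap(authors1, authors2, threshold=0.5):
    # Normalize and drop placeholder entries, as the context lines above do
    clean1 = {a.strip().lower() for a in authors1 if a.strip() and a.strip() not in ('et al', 'others')}
    clean2 = {a.strip().lower() for a in authors2 if a.strip() and a.strip() not in ('et al', 'others')}
    if not clean1 or not clean2:
        return False
    shared = len(clean1 & clean2)
    return shared >= threshold * min(len(clean1), len(clean2))

print(authors_overlap(['A. Vaswani', 'N. Shazeer'], ['a. vaswani', 'et al']))  # True
```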
@@ -4945,55 +4750,6 @@ class ArxivReferenceChecker:
         }
 
 
-    def _get_bibtex_content(self, paper):
-        """
-        Try to get BibTeX content for a paper from various sources.
-
-        Args:
-            paper: Paper object
-
-        Returns:
-            str: BibTeX content if found, None otherwise
-        """
-        # Try ArXiv source if it's an ArXiv paper
-        from utils.arxiv_utils import extract_arxiv_id_from_paper, download_arxiv_source
-
-        arxiv_id = extract_arxiv_id_from_paper(paper)
-        if arxiv_id:
-            logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
-            tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
-
-            # Prefer .bib files (most structured), then .bbl files
-            if bib_content:
-                logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
-
-                # If we have LaTeX content, filter BibTeX by cited keys
-                if tex_content:
-                    from utils.text_utils import extract_cited_keys_from_latex, filter_bibtex_by_cited_keys
-                    cited_keys = extract_cited_keys_from_latex(tex_content)
-                    if cited_keys:
-                        logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
-                        filtered_content = filter_bibtex_by_cited_keys(bib_content, cited_keys)
-                        return filtered_content
-
-                return bib_content
-
-            elif bbl_content:
-                logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
-                return bbl_content
-
-            elif tex_content:
-                # Check for embedded bibliography in LaTeX
-                from utils.text_utils import detect_latex_bibliography_format
-                latex_format = detect_latex_bibliography_format(tex_content)
-                if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
-                    logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
-                    # Skip embedded bibliography and return None to trigger fallback methods
-                    return None
-
-        # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
-
-        return None
 
 
     def extract_bibliography(self, paper, debug_mode=False):
@@ -5008,7 +4764,8 @@ class ArxivReferenceChecker:
         logger.debug(f"Extracting bibliography for paper {paper_id}: {paper.title}")
 
         # Check if we can get BibTeX content for this paper (ArXiv or other sources)
-
+        from utils.arxiv_utils import get_bibtex_content
+        bibtex_content = get_bibtex_content(paper)
         if bibtex_content:
             logger.debug(f"Found BibTeX content for {paper_id}, using structured bibliography")
 
@@ -5062,7 +4819,7 @@ class ArxivReferenceChecker:
                 else:
                     logger.warning("No LLM available for fallback, using original parsing results")
             else:
-                logger.
+                logger.debug(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
         else:
             # Parse BibTeX using the standard flow (LLM or regex based on config)
             references = self.parse_references(bibtex_content)
@@ -5076,7 +4833,7 @@ class ArxivReferenceChecker:
                 logger.warning(f"Could not save debug references file for {paper_id}: {e}")
 
         if references:
-            logger.
+            logger.debug(f"Extracted {len(references)} references")
             return references
 
         # Check if this is a text file containing references
@@ -5146,7 +4903,7 @@ class ArxivReferenceChecker:
             bibtex_references = extract_latex_references(bib_content, paper.file_path)
 
             if bibtex_references:
-                logger.
+                logger.debug(f"Extracted {len(bibtex_references)} references from BibTeX file")
                 return bibtex_references
             else:
                 logger.warning(f"No references found in BibTeX file: {paper.file_path}")
@@ -5623,7 +5380,7 @@ class ArxivReferenceChecker:
             error_details = unverified_errors[0].get('error_details', '')
             if error_details:
                 subreason = self._categorize_unverified_reason(error_details)
-                print(f"
+                print(f"  Subreason: {subreason}")
 
         year_str = self._format_year_string(reference.get('year'))
utils/arxiv_utils.py
CHANGED
@@ -288,7 +288,7 @@ def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
         return bib_content
 
     # Parse BibTeX entries and filter
-    from utils.
+    from utils.bibtex_parser import parse_bibtex_entries
     entries = parse_bibtex_entries(bib_content)
 
     # Filter entries to only cited ones
@@ -374,3 +374,79 @@ def reconstruct_bibtex_content(cited_entries, original_content):
     return '\n\n'.join(filtered_parts) + '\n'
 
 
+def get_bibtex_content(paper):
+    """
+    Try to get BibTeX content for a paper from various sources.
+
+    Args:
+        paper: Paper object
+
+    Returns:
+        str: BibTeX content if found, None otherwise
+    """
+    import re
+
+    # Try ArXiv source if it's an ArXiv paper
+    arxiv_id = extract_arxiv_id_from_paper(paper)
+    if arxiv_id:
+        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
+        tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
+
+        # Choose between .bib and .bbl files based on content richness
+        # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
+        if bib_content and bbl_content:
+            # Count entries in both
+            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+            bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
+
+            # If we have LaTeX content, get filtered BibTeX count
+            filtered_bib_count = bib_entry_count
+            filtered_content = bib_content
+            if tex_content:
+                cited_keys = extract_cited_keys_from_tex({}, tex_content)
+                if cited_keys:
+                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+                    filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
+
+            logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
+
+            # Prioritize .bbl if it has significantly more entries
+            if bbl_entry_count > filtered_bib_count * 1.5:  # 50% more entries threshold
+                logger.info(f"Using .bbl files from ArXiv source")
+                return bbl_content
+            else:
+                logger.info(f"Using filtered .bib files")
+                return filtered_content
+
+        elif bib_content:
+            logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
+
+            # If we have LaTeX content, filter BibTeX by cited keys
+            if tex_content:
+                cited_keys = extract_cited_keys_from_tex({}, tex_content)
+                if cited_keys:
+                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+                    return filtered_content
+
+            return bib_content
+
+        elif bbl_content:
+            logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
+            return bbl_content
+
+        elif tex_content:
+            # Check for embedded bibliography in LaTeX
+            from utils.text_utils import detect_latex_bibliography_format
+            latex_format = detect_latex_bibliography_format(tex_content)
+            if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
+                logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
+                # Skip embedded bibliography and return None to trigger fallback methods
+                return None
+
+    # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
+
+    return None
+
+
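Compared with the removed _get_bibtex_content in core/refchecker.py, this relocated version adds one behavior: when both .bib and .bbl sources exist, it counts records in each and only falls back to .bbl when it is more than 1.5x richer than the citation-filtered .bib. The core of that heuristic as a standalone sketch (the regexes are copied from the function above; the sample strings are made up):

```python
import re

def pick_bibliography(filtered_bib: str, bbl: str):
    bib_count = len(re.findall(r'@\w+\s*\{', filtered_bib))
    bbl_count = len(re.findall(r'\\bibitem\[', bbl))
    if bbl_count > bib_count * 1.5:  # .bbl must be significantly richer
        return 'bbl', bbl
    return 'bib', filtered_bib

bib = '@article{a,\n}\n@misc{b,\n}'
bbl = '\n'.join(f'\\bibitem[{i}]{{k{i}}} ...' for i in range(1, 5))
print(pick_bibliography(bib, bbl)[0])  # 'bbl' (4 entries > 2 * 1.5)
```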