academic-refchecker 1.2.35__tar.gz → 1.2.37__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.35/src/academic_refchecker.egg-info → academic_refchecker-1.2.37}/PKG-INFO +1 -1
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/__version__.py +1 -1
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/SOURCES.txt +2 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/refchecker.py +87 -264
- academic_refchecker-1.2.37/src/utils/arxiv_utils.py +376 -0
- academic_refchecker-1.2.37/src/utils/biblatex_parser.py +474 -0
- academic_refchecker-1.2.37/src/utils/bibtex_parser.py +334 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/text_utils.py +288 -83
- academic_refchecker-1.2.35/src/utils/arxiv_utils.py +0 -176
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/LICENSE +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/README.md +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/pyproject.toml +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/requirements.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/setup.cfg +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/checkers/webpage_checker.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/core/parallel_processor.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/error_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/unicode_utils.py +0 -0
- {academic_refchecker-1.2.35 → academic_refchecker-1.2.37}/src/utils/url_utils.py +0 -0
@@ -202,6 +202,10 @@ class ArxivReferenceChecker:
         # debug mode
         self.debug_mode = debug_mode
 
+        # Initialize extraction flags
+        self.used_regex_extraction = False
+        self.used_unreliable_extraction = False
+
         # Parallel processing configuration
         self.enable_parallel = enable_parallel
         self.max_workers = max_workers
@@ -2887,6 +2891,7 @@ class ArxivReferenceChecker:
         self.total_other_refs = 0
         self.total_unverified_refs = 0
         self.used_regex_extraction = False
+        self.used_unreliable_extraction = False  # Only set for fallback regex parsing, not BibTeX
 
         try:
             # Get papers to process
@@ -3105,8 +3110,8 @@ class ArxivReferenceChecker:
         if self.total_errors_found == 0 and self.total_warnings_found == 0 and self.total_unverified_refs == 0:
             print(f"✅ All references verified successfully!")
 
-        # Show warning if
-        if self.
+        # Show warning if unreliable extraction was used and there are many errors
+        if self.used_unreliable_extraction and self.total_errors_found > 5:
             print(f"\n⚠️ Results might be affected by incorrect reference extraction. Consider using LLM extraction, which is more robust.")
 
         if self.verification_output_file:
@@ -3124,8 +3129,8 @@ class ArxivReferenceChecker:
         print(f" Total warnings: {self.total_warnings_found}")
         print(f"❓ References that couldn't be verified: {self.total_unverified_refs}")
 
-        # Show warning if
-        if self.
+        # Show warning if unreliable extraction was used and there are many errors
+        if self.used_unreliable_extraction and self.total_errors_found > 5:
             print(f"\n⚠️ Results might be affected by incorrect reference extraction. Consider using LLM extraction, which is more robust.")
 
         if self.verification_output_file:
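Taken together, the two summary hunks above implement one gate: the extraction-quality warning only fires when the unreliable fallback parser was actually used and more than five errors were found. A minimal standalone sketch of that gate (the flag name and threshold come from the diff; the class itself is hypothetical, not the package's ArxivReferenceChecker):

```python
# Minimal sketch of the warning gate; flag name and threshold are taken
# from the diff, the surrounding class is hypothetical.
class ExtractionSummary:
    def __init__(self):
        self.used_unreliable_extraction = False  # set only by fallback regex parsing
        self.total_errors_found = 0

    def maybe_warn(self):
        # Robust parsers (BibTeX, biblatex, ACM/natbib) leave the flag False,
        # so their errors are reported without this caveat.
        if self.used_unreliable_extraction and self.total_errors_found > 5:
            print("\n⚠️ Results might be affected by incorrect reference extraction. "
                  "Consider using LLM extraction, which is more robust.")
```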
@@ -3401,15 +3406,25 @@ class ArxivReferenceChecker:
         if detect_standard_acm_natbib_format(bibliography_text):
             logger.info("Detected standard ACM/natbib format, using regex-based parsing")
             self.used_regex_extraction = True
+            # Note: ACM/natbib parsing is also quite robust for standard formats
             return self._parse_standard_acm_natbib_references(bibliography_text)
 
         # Check if this is BibTeX format
-        from utils.
+        from utils.bibtex_parser import detect_bibtex_format
         if detect_bibtex_format(bibliography_text):
             logger.info("Detected BibTeX format, using BibTeX parser")
             self.used_regex_extraction = True
+            # Note: BibTeX parsing is robust, so we don't set used_unreliable_extraction
             return self._parse_bibtex_references(bibliography_text)
 
+        # Check if this is biblatex format
+        from utils.biblatex_parser import detect_biblatex_format
+        if detect_biblatex_format(bibliography_text):
+            logger.info("Detected biblatex format, using biblatex parser")
+            self.used_regex_extraction = True
+            # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
+            return self._parse_biblatex_references(bibliography_text)
+
         # For non-standard formats, try LLM-based extraction if available
         if self.llm_extractor:
             try:
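The hunk above turns parser selection into an ordered cascade: ACM/natbib, then BibTeX, then the new biblatex branch, then LLM extraction, and only then the regex fallback. A condensed sketch of that ordering follows; the detector imports are the ones shown in the diff, while the wrapper function and its return values are illustrative only:

```python
# Condensed sketch of the new detection cascade; detector imports are the
# ones shown in the diff, the wrapper function itself is hypothetical.
from utils.bibtex_parser import detect_bibtex_format
from utils.biblatex_parser import detect_biblatex_format

def choose_parser(bibliography_text, have_llm=False):
    """Return which parser the cascade would pick for this text."""
    # detect_standard_acm_natbib_format(...) runs before all of these
    # in the real method; its import location isn't visible in this diff.
    if detect_bibtex_format(bibliography_text):
        return "bibtex"         # robust; used_unreliable_extraction stays False
    if detect_biblatex_format(bibliography_text):
        return "biblatex"       # new branch in 1.2.37, also treated as robust
    if have_llm:
        return "llm"            # preferred for non-standard formats
    return "regex-fallback"     # the only path that sets used_unreliable_extraction
```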
@@ -3431,6 +3446,7 @@ class ArxivReferenceChecker:
         # Fallback to regex-based parsing only if LLM was not specified
         logger.info("No LLM available, falling back to regex-based parsing")
         self.used_regex_extraction = True
+        self.used_unreliable_extraction = True  # This is the unreliable fallback parsing
         return self._parse_references_regex(bibliography_text)
 
     def _parse_standard_acm_natbib_references(self, bibliography_text):
@@ -3622,10 +3638,22 @@ class ArxivReferenceChecker:
         self.used_regex_extraction = True
 
         # Check if this is BibTeX format first
-
+        from utils.bibtex_parser import detect_bibtex_format
+        if detect_bibtex_format(bibliography_text):
             logger.debug("Detected BibTeX format, using BibTeX-specific parsing")
+            # BibTeX parsing is robust, so we don't set used_unreliable_extraction
             return self._parse_bibtex_references(bibliography_text)
 
+        # Check if this is biblatex format
+        from utils.biblatex_parser import detect_biblatex_format
+        if detect_biblatex_format(bibliography_text):
+            logger.debug("Detected biblatex format, using biblatex-specific parsing")
+            # biblatex parsing is also robust, so we don't set used_unreliable_extraction
+            return self._parse_biblatex_references(bibliography_text)
+
+        # If we reach here, we're using the unreliable fallback regex parsing
+        self.used_unreliable_extraction = True
+
         # --- IMPROVED SPLITTING: handle concatenated references like [3]... [4]... ---
         # First, normalize the bibliography text to handle multi-line references
         # This fixes the issue where years appear as separate lines
@@ -4054,270 +4082,33 @@ class ArxivReferenceChecker:
         Returns:
             List of structured reference dictionaries
         """
-
+        # Use the dedicated BibTeX parser
+        from utils.bibtex_parser import parse_bibtex_references
 
-        #
-
-        bibtex_pattern = r'@(\w+)\s*\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}'
-
-        entries = []
-        for match in re.finditer(bibtex_pattern, bibliography_text, re.DOTALL | re.IGNORECASE):
-            entry_type = match.group(1).lower()
-            entry_content = match.group(2)
-
-            # Extract fields from the BibTeX entry
-            entry_data = self._parse_bibtex_entry(entry_type, entry_content)
-            if entry_data:
-                entries.append(entry_data)
-
-        if not entries:
-            # Fallback: try simpler pattern if the above doesn't work
-            logger.debug("Complex BibTeX pattern failed, trying simpler approach")
-            # Split on @word{ patterns to find entry boundaries
-            parts = re.split(r'(?=@\w+\s*\{)', bibliography_text)
-
-            for part in parts:
-                part = part.strip()
-                if not part or not part.startswith('@'):
-                    continue
-
-                # Find the entry type
-                type_match = re.match(r'@(\w+)\s*\{', part)
-                if not type_match:
-                    continue
-
-                entry_type = type_match.group(1).lower()
-
-                # Extract the content between the first { and the last }
-                # This is a simplified approach but should work for most cases
-                brace_start = part.find('{')
-                if brace_start == -1:
-                    continue
-
-                # Find the matching closing brace
-                brace_count = 0
-                content_end = -1
-                for i, char in enumerate(part[brace_start:], brace_start):
-                    if char == '{':
-                        brace_count += 1
-                    elif char == '}':
-                        brace_count -= 1
-                        if brace_count == 0:
-                            content_end = i
-                            break
-
-                if content_end == -1:
-                    # No matching brace found, take everything after first {
-                    entry_content = part[brace_start + 1:]
-                else:
-                    entry_content = part[brace_start + 1:content_end]
-
-                entry_data = self._parse_bibtex_entry(entry_type, entry_content)
-                if entry_data:
-                    entries.append(entry_data)
+        # Extract references using the BibTeX parser
+        references = parse_bibtex_references(bibliography_text)
 
-        logger.debug(f"Extracted {len(
-        return
+        logger.debug(f"Extracted {len(references)} BibTeX references using dedicated parser")
+        return references
 
-    def
+    def _parse_biblatex_references(self, bibliography_text):
         """
-        Parse
+        Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
 
         Args:
-
-            content: Content inside the braces
+            bibliography_text: String containing biblatex .bbl entries
 
         Returns:
-
+            List of structured reference dictionaries
         """
-
-        from utils.
-        from utils.doi_utils import construct_doi_url, is_valid_doi_format
-
-        # Extract key (first part before comma)
-        key_match = re.match(r'([^,]+),', content)
-        key = key_match.group(1).strip() if key_match else ""
-
-        # Extract fields using regex
-        fields = {}
-
-        # Pattern to match field = {value} or field = "value"
-        field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
-
-        for match in re.finditer(field_pattern, content, re.DOTALL):
-            field_name = match.group(1).lower()
-            field_value = match.group(2) or match.group(3) or ""
-            # Strip outer quotes if present (handles cases like title = {"Some Title"})
-            field_value = field_value.strip()
-            if field_value.startswith('"') and field_value.endswith('"'):
-                field_value = field_value[1:-1]
-            fields[field_name] = field_value
-
-        # If field extraction failed, try a simpler approach
-        if not fields:
-            logger.debug("Field extraction failed, trying line-by-line approach")
-            lines = content.split('\n')
-            for line in lines:
-                line = line.strip()
-                if '=' in line:
-                    field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
-                    if field_match:
-                        field_name = field_match.group(1).lower()
-                        field_value = field_match.group(2).strip()
-                        # Strip outer quotes if present
-                        if field_value.startswith('"') and field_value.endswith('"'):
-                            field_value = field_value[1:-1]
-                        fields[field_name] = field_value
-
-        # Extract required information
-        title = fields.get('title', '')
-        author_string = fields.get('author', '')
-        year = 0
-
-        # Parse year
-        year_str = fields.get('year', '')
-        if year_str:
-            year_match = re.search(r'\d{4}', year_str)
-            if year_match:
-                year = int(year_match.group())
-
-        # If no year found but we have a valid title/authors, try extracting from eprint or other fields
-        if year == 0 and (title or author_string):
-            # Check eprint field for arXiv entries like "2024" prefix
-            eprint = fields.get('eprint', '')
-            if eprint:
-                # Extract year from ArXiv eprint ID (e.g., "2311.09096" -> 2023)
-                eprint_year_match = re.match(r'^(\d{2})(\d{2})', eprint)
-                if eprint_year_match:
-                    yy = int(eprint_year_match.group(1))
-                    # Convert to 4-digit year (23 -> 2023, assumes 21st century)
-                    if yy >= 91:  # ArXiv started in 1991
-                        year = 1900 + yy
-                    else:
-                        year = 2000 + yy
-
-        # For entries without year, set None instead of 0
-        if year == 0:
-            year = None
-
-        # Parse authors using the enhanced function
-        authors = []
-        if author_string:
-            try:
-                authors = parse_authors_with_initials(author_string)
-            except Exception as e:
-                logger.debug(f"Author parsing failed for '{author_string}': {e}")
-                # Fallback: split by 'and' and clean up
-                author_parts = author_string.split(' and ')
-                authors = []
-                for part in author_parts:
-                    # Remove leading "and" from author names (handles cases like "and Krishnamoorthy, S")
-                    part = re.sub(r'^and\s+', '', part.strip())
-                    if part:
-                        authors.append(part)
-
-        # Special handling for @misc entries with only howpublished field
-        if not title and not authors and entry_type == 'misc':
-            howpublished = fields.get('howpublished', '')
-            if howpublished:
-                # Try to extract a URL from howpublished
-                url_patterns = [
-                    r'://([^/]+)',  # Missing protocol case: "://example.com/path"
-                    r'https?://([^/\s]+)',  # Standard URL
-                    r'www\.([^/\s]+)',  # www without protocol
-                ]
-
-                extracted_url = ''
-                for pattern in url_patterns:
-                    match = re.search(pattern, howpublished)
-                    if match:
-                        domain = match.group(1)
-                        # Reconstruct URL with https if protocol was missing
-                        if howpublished.startswith('://'):
-                            extracted_url = 'https' + howpublished
-                        elif not howpublished.startswith(('http://', 'https://')):
-                            extracted_url = 'https://' + howpublished
-                        else:
-                            extracted_url = howpublished
-
-                        # Generate title from domain/path
-                        if 'jailbreakchat.com' in domain:
-                            title = 'JailbreakChat Website'
-                        elif 'lesswrong.com' in domain:
-                            title = 'LessWrong Post: Jailbreaking ChatGPT'
-                        elif 'chat.openai.com' in domain:
-                            title = 'ChatGPT Conversation Share'
-                        elif 'gemini.google.com' in domain:
-                            title = 'Gemini Conversation Share'
-                        elif 'microsoft.com' in domain:
-                            title = 'Microsoft Azure Content Safety API'
-                        elif 'perspectiveapi.com' in domain:
-                            title = 'Perspective API'
-                        else:
-                            # Generic title based on domain
-                            title = f"Web Resource: {domain}"
-
-                        authors = ["Web Resource"]
-                        # Store the extracted URL
-                        fields['url'] = extracted_url
-                        break
-
-        # Apply defaults only if we still don't have values
-        if not authors:
-            authors = ["Unknown Author"]
-
-        # Clean title
-        title = clean_title(title) if title else "Unknown Title"
-
-        # Extract URL/DOI
-        url = fields.get('url', '')
-        doi = fields.get('doi', '')
-
-        # Construct DOI URL if we have a DOI
-        if doi and is_valid_doi_format(doi):
-            url = construct_doi_url(doi)
-
-        # Construct ArXiv URL from eprint field if no URL present
-        if not url:
-            eprint = fields.get('eprint', '')
-            if eprint and re.match(r'^\d{4}\.\d{4,5}', eprint):
-                # Remove version number if present and construct ArXiv URL
-                clean_eprint = re.sub(r'v\d+$', '', eprint)
-                url = f"https://arxiv.org/abs/{clean_eprint}"
-
-        # Handle special URL fields
-        if not url:
-            howpublished = fields.get('howpublished', '')
-            if 'url{' in howpublished or 'href{' in howpublished:
-                url_match = re.search(r'url\{([^}]+)\}', howpublished)
-                if not url_match:
-                    url_match = re.search(r'href\{([^}]+)\}', howpublished)
-                if url_match:
-                    from utils.url_utils import clean_url_punctuation
-                    url = clean_url_punctuation(url_match.group(1))
+        # Use the dedicated biblatex parser
+        from utils.biblatex_parser import parse_biblatex_references
 
-        #
-
-        if 'arxiv' in url.lower() or 'arxiv' in title.lower():
-            ref_type = 'arxiv'
-        elif url or doi:
-            ref_type = 'non-arxiv'
-
-        # Create structured reference
-        structured_ref = {
-            'url': url,
-            'doi': doi,
-            'year': year,
-            'authors': authors,
-            'title': title,
-            'raw_text': f"@{entry_type}{{{key}, {content}}}",
-            'type': ref_type,
-            'bibtex_key': key,
-            'bibtex_type': entry_type
-        }
+        # Extract references using the biblatex parser
+        references = parse_biblatex_references(bibliography_text)
 
-        logger.debug(f"
-        return
+        logger.debug(f"Extracted {len(references)} biblatex references using dedicated parser")
+        return references
 
     def _process_llm_extracted_references(self, references):
         """
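With roughly 270 lines of inline parsing replaced by two delegating calls, the contract is simply text in, list of reference dicts out. A hedged usage sketch, assuming the new utils.bibtex_parser preserves the dict shape built by the deleted inline code ('url', 'doi', 'year', 'authors', 'title', 'raw_text', 'type', 'bibtex_key', 'bibtex_type'):

```python
# Usage sketch for the extracted parser; the entry below is made up, and the
# dict keys are assumed to match those built by the removed inline code.
from utils.bibtex_parser import parse_bibtex_references

bbl_text = """
@article{doe2023example,
  author = {Doe, Jane and Smith, John},
  title  = {An Example Title},
  year   = {2023},
}
"""

for ref in parse_bibtex_references(bbl_text):
    print(ref.get("bibtex_key"), ref.get("title"), ref.get("year"), ref.get("authors"))
```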
@@ -4327,7 +4118,6 @@ class ArxivReferenceChecker:
         unique_references = self._deduplicate_references_with_segment_matching(references)
 
         logger.debug(f"Deduplicated {len(references)} references to {len(unique_references)} unique references")
-        logger.info(f"Extracted {len(unique_references)} references using LLM")
 
         processed_refs = []
 
@@ -5032,8 +4822,7 @@ class ArxivReferenceChecker:
         from utils.text_utils import detect_latex_bibliography_format
         latex_format = detect_latex_bibliography_format(tex_content)
         if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
-            logger.info(f"Found embedded bibliography in ArXiv LaTeX source
-            logger.info(f"Embedded bibliographies often have inconsistent formatting - falling back to alternative extraction methods")
+            logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
             # Skip embedded bibliography and return None to trigger fallback methods
             return None
 
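For context, detect_latex_bibliography_format evidently returns a dict with at least an 'is_latex' key, and the guard above combines it with cheap substring checks before deciding to skip. A minimal sketch of that guard, using only the names visible in the diff (tex_content is assumed to hold the downloaded LaTeX source):

```python
# Sketch of the embedded-bibliography guard; detect_latex_bibliography_format
# and its 'is_latex' key are taken from the diff.
from utils.text_utils import detect_latex_bibliography_format

def embedded_bibliography_present(tex_content):
    latex_format = detect_latex_bibliography_format(tex_content)
    # \bibitem indicates an inline thebibliography; '@' hints at inline BibTeX.
    return latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content)
```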
@@ -5075,6 +4864,40 @@ class ArxivReferenceChecker:
             logger.info(f"Detected LaTeX thebibliography format, using extract_latex_references")
             # Use None for file_path since this is content from .bbl files
             references = extract_latex_references(bibtex_content, None)
+
+            # Validate the parsed references and fallback to LLM if needed
+            from utils.text_utils import validate_parsed_references
+            validation = validate_parsed_references(references)
+
+            if not validation['is_valid']:
+                logger.debug(f"LaTeX parsing validation failed (quality: {validation['quality_score']:.2f})")
+                logger.debug(f"Issues detected: {len(validation['issues'])} problems")
+                for issue in validation['issues'][:5]:  # Log first 5 issues
+                    logger.debug(f"  - {issue}")
+
+                # Try LLM fallback if available
+                if self.llm_extractor:
+                    logger.info("Falling back to LLM-based extraction due to unsupported LaTeX format")
+                    try:
+                        llm_references = self.llm_extractor.extract_references(bibtex_content)
+                        if llm_references:
+                            # Process LLM results first to get structured references
+                            processed_llm_refs = self._process_llm_extracted_references(llm_references)
+                            # Then validate the processed results
+                            llm_validation = validate_parsed_references(processed_llm_refs)
+                            if llm_validation['quality_score'] > validation['quality_score']:
+                                logger.debug(f"LLM extraction successful (quality: {llm_validation['quality_score']:.2f})")
+                                references = processed_llm_refs
+                            else:
+                                logger.debug("LLM extraction didn't improve quality, keeping original results")
+                        else:
+                            logger.warning("LLM extraction returned no results")
+                    except Exception as e:
+                        logger.error(f"LLM fallback failed: {e}")
+                else:
+                    logger.warning("No LLM available for fallback, using original parsing results")
+            else:
+                logger.info(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
         else:
             # Parse BibTeX using the standard flow (LLM or regex based on config)
             references = self.parse_references(bibtex_content)
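The validation contract used above is worth spelling out: validate_parsed_references returns a dict with at least 'is_valid', 'quality_score', and 'issues', and the LLM result only replaces the LaTeX parse when its quality score is strictly higher. A trimmed sketch of that compare-and-swap (it omits the _process_llm_extracted_references post-processing step shown in the diff, and the extractor is assumed to expose extract_references(text) as shown there):

```python
# Trimmed sketch of the quality-gated LLM fallback; validate_parsed_references
# and its result keys appear in the diff, the extractor interface is assumed.
from utils.text_utils import validate_parsed_references

def best_parse(latex_refs, llm_extractor, raw_bbl_text):
    validation = validate_parsed_references(latex_refs)
    if validation['is_valid'] or llm_extractor is None:
        return latex_refs
    llm_refs = llm_extractor.extract_references(raw_bbl_text)
    if not llm_refs:
        return latex_refs
    # Keep whichever parse scores higher; ties keep the original LaTeX results.
    if validate_parsed_references(llm_refs)['quality_score'] > validation['quality_score']:
        return llm_refs
    return latex_refs
```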
@@ -5088,7 +4911,7 @@ class ArxivReferenceChecker:
             logger.warning(f"Could not save debug references file for {paper_id}: {e}")
 
         if references:
-            logger.
+            logger.debug(f"Extracted {len(references)} references")
             return references
 
         # Check if this is a text file containing references
@@ -5158,7 +4981,7 @@ class ArxivReferenceChecker:
             bibtex_references = extract_latex_references(bib_content, paper.file_path)
 
             if bibtex_references:
-                logger.
+                logger.debug(f"Extracted {len(bibtex_references)} references from BibTeX file")
                 return bibtex_references
             else:
                 logger.warning(f"No references found in BibTeX file: {paper.file_path}")