academic-refchecker 1.2.36__py3-none-any.whl → 1.2.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.36"
3
+ __version__ = "1.2.38"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.36
3
+ Version: 1.2.38
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,5 +1,5 @@
1
- __version__.py,sha256=f5GV2gIZ9QIGgMwt7IOmntIX6y7h7w-oSwi4Dr7pgSQ,65
2
- academic_refchecker-1.2.36.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
1
+ __version__.py,sha256=9ez-UBx1mkgUvDMk-z63_XpqOh2QnPCeTrDEuricP1w,65
2
+ academic_refchecker-1.2.38.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
3
3
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
4
4
  checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
5
5
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -15,7 +15,7 @@ config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
15
15
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
16
16
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
17
17
  core/parallel_processor.py,sha256=2S1cAPhtWH3glvtiJrt9JkZzk2iJkPKXsc-F3lg0X6U,16795
18
- core/refchecker.py,sha256=UDyr1PVdMWHZFXhmNexeQ4OVEtI-BYbmE9G3P-u4G_4,283915
18
+ core/refchecker.py,sha256=8EatAqYEDpW219Xrn-ql1oQ5ytmCU8RW8pMtlujRbC8,273167
19
19
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
20
20
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
21
21
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,18 +26,21 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
26
26
  services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
27
27
  services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
28
28
  utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
29
- utils/arxiv_utils.py,sha256=UBxgLQEzbZ2lrUc6uA0qvnm-glRcSsnBdbz6y0IMWek,14754
29
+ utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
30
30
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
31
+ utils/biblatex_parser.py,sha256=Vznt-BfNtQQb4XQ6iPab2CgFcV2JIjva1OU33NzQ51g,20253
32
+ utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
33
+ utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
31
34
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
32
35
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
33
36
  utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
34
37
  utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
35
38
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
36
- utils/text_utils.py,sha256=dWVH6s_-sor3HN3tsF6KoceFcUhXUHNMSKyk4LywgRg,193658
39
+ utils/text_utils.py,sha256=KjNx_UJvVhz-oowu4CCdryEuN0hYLu4X8yVkjdYP8fM,189261
37
40
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
38
- utils/url_utils.py,sha256=qoimCrMFCBGvlmF_t1c6zSOmkWi_rUm-gZM0XZ4rEVE,6291
39
- academic_refchecker-1.2.36.dist-info/METADATA,sha256=bxN2DXEeHJeY8_CQ2FYke-E1v67Fbn8jInjStRxe0wg,22298
40
- academic_refchecker-1.2.36.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
41
- academic_refchecker-1.2.36.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
42
- academic_refchecker-1.2.36.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
43
- academic_refchecker-1.2.36.dist-info/RECORD,,
41
+ utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
42
+ academic_refchecker-1.2.38.dist-info/METADATA,sha256=7V0yEKZy9zao6s3_TBHPOg7Gi86h4lG2m_rhyhStq5w,22298
43
+ academic_refchecker-1.2.38.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
44
+ academic_refchecker-1.2.38.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
45
+ academic_refchecker-1.2.38.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
46
+ academic_refchecker-1.2.38.dist-info/RECORD,,
core/refchecker.py CHANGED
@@ -451,47 +451,10 @@ class ArxivReferenceChecker:
451
451
 
452
452
  def extract_arxiv_id_from_url(self, url):
453
453
  """
454
- Extract ArXiv ID from a URL or text containing ArXiv reference
454
+ Extract ArXiv ID from a URL or text containing ArXiv reference.
455
+ Uses the common extraction function from utils.url_utils.
455
456
  """
456
- if not url:
457
- return None
458
-
459
- # First, check for arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
460
- arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
461
- if arxiv_match:
462
- arxiv_id = arxiv_match.group(1)
463
- # Remove version number if present
464
- arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
465
- return arxiv_id
466
-
467
- # Remove version string from end if present (e.g., 'v1')
468
- url = re.sub(r'v\d+$', '', url)
469
-
470
- # Parse URL
471
- parsed_url = urlparse(url)
472
-
473
- # Check if it's an arxiv.org URL
474
- if 'arxiv.org' in parsed_url.netloc:
475
- # Extract ID from path
476
- path = parsed_url.path.strip('/')
477
-
478
- # Handle different URL formats
479
- if path.startswith('abs/'):
480
- arxiv_id = path.replace('abs/', '')
481
- elif path.startswith('pdf/'):
482
- arxiv_id = path.replace('pdf/', '').replace('.pdf', '')
483
- elif '/abs/' in path:
484
- arxiv_id = path.split('/abs/')[1]
485
- elif '/pdf/' in path:
486
- arxiv_id = path.split('/pdf/')[1].replace('.pdf', '')
487
- else:
488
- arxiv_id = path
489
-
490
- # Remove version number from the extracted ID
491
- arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
492
- return arxiv_id
493
-
494
- return None
457
+ return extract_arxiv_id_from_url(url)
495
458
 
496
459
  def get_paper_metadata(self, arxiv_id):
497
460
  """
@@ -3410,13 +3373,21 @@ class ArxivReferenceChecker:
3410
3373
  return self._parse_standard_acm_natbib_references(bibliography_text)
3411
3374
 
3412
3375
  # Check if this is BibTeX format
3413
- from utils.text_utils import detect_bibtex_format
3376
+ from utils.bibtex_parser import detect_bibtex_format
3414
3377
  if detect_bibtex_format(bibliography_text):
3415
3378
  logger.info("Detected BibTeX format, using BibTeX parser")
3416
3379
  self.used_regex_extraction = True
3417
3380
  # Note: BibTeX parsing is robust, so we don't set used_unreliable_extraction
3418
3381
  return self._parse_bibtex_references(bibliography_text)
3419
3382
 
3383
+ # Check if this is biblatex format
3384
+ from utils.biblatex_parser import detect_biblatex_format
3385
+ if detect_biblatex_format(bibliography_text):
3386
+ logger.info("Detected biblatex format, using biblatex parser")
3387
+ self.used_regex_extraction = True
3388
+ # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
3389
+ return self._parse_biblatex_references(bibliography_text)
3390
+
3420
3391
  # For non-standard formats, try LLM-based extraction if available
3421
3392
  if self.llm_extractor:
3422
3393
  try:
@@ -3573,11 +3544,9 @@ class ArxivReferenceChecker:
3573
3544
  # Clean author part and extract authors
3574
3545
  author_part_clean = strip_latex_commands(author_part).strip()
3575
3546
  if author_part_clean and not author_part_clean.startswith('\\'):
3576
- # Parse author names - handle comma-separated list and "and"
3577
- if ', and ' in author_part_clean:
3578
- author_names = re.split(r', and |, ', author_part_clean)
3579
- else:
3580
- author_names = [name.strip() for name in author_part_clean.split(',')]
3547
+ # Parse author names using the robust author parsing function
3548
+ from utils.text_utils import parse_authors_with_initials
3549
+ author_names = parse_authors_with_initials(author_part_clean)
3581
3550
 
3582
3551
  # Clean up author names
3583
3552
  authors = []
@@ -3630,11 +3599,19 @@ class ArxivReferenceChecker:
3630
3599
  self.used_regex_extraction = True
3631
3600
 
3632
3601
  # Check if this is BibTeX format first
3633
- if re.search(r'@\w+\s*\{', bibliography_text):
3602
+ from utils.bibtex_parser import detect_bibtex_format
3603
+ if detect_bibtex_format(bibliography_text):
3634
3604
  logger.debug("Detected BibTeX format, using BibTeX-specific parsing")
3635
3605
  # BibTeX parsing is robust, so we don't set used_unreliable_extraction
3636
3606
  return self._parse_bibtex_references(bibliography_text)
3637
3607
 
3608
+ # Check if this is biblatex format
3609
+ from utils.biblatex_parser import detect_biblatex_format
3610
+ if detect_biblatex_format(bibliography_text):
3611
+ logger.debug("Detected biblatex format, using biblatex-specific parsing")
3612
+ # biblatex parsing is also robust, so we don't set used_unreliable_extraction
3613
+ return self._parse_biblatex_references(bibliography_text)
3614
+
3638
3615
  # If we reach here, we're using the unreliable fallback regex parsing
3639
3616
  self.used_unreliable_extraction = True
3640
3617
 
@@ -4066,214 +4043,33 @@ class ArxivReferenceChecker:
4066
4043
  Returns:
4067
4044
  List of structured reference dictionaries
4068
4045
  """
4069
- # Use the improved BibTeX parsing from text_utils
4070
- from utils.text_utils import extract_latex_references
4046
+ # Use the dedicated BibTeX parser
4047
+ from utils.bibtex_parser import parse_bibtex_references
4071
4048
 
4072
- # Extract references using the improved parsing logic
4073
- references = extract_latex_references(bibliography_text)
4049
+ # Extract references using the BibTeX parser
4050
+ references = parse_bibtex_references(bibliography_text)
4074
4051
 
4075
- logger.debug(f"Extracted {len(references)} BibTeX references using improved parser")
4052
+ logger.debug(f"Extracted {len(references)} BibTeX references using dedicated parser")
4076
4053
  return references
4077
4054
 
4078
- def _parse_bibtex_entry(self, entry_type, content):
4055
+ def _parse_biblatex_references(self, bibliography_text):
4079
4056
  """
4080
- Parse a single BibTeX entry content to extract fields
4057
+ Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
4081
4058
 
4082
4059
  Args:
4083
- entry_type: Type of entry (inproceedings, article, etc.)
4084
- content: Content inside the braces
4060
+ bibliography_text: String containing biblatex .bbl entries
4085
4061
 
4086
4062
  Returns:
4087
- Dictionary with structured reference data
4063
+ List of structured reference dictionaries
4088
4064
  """
4089
- import re
4090
- from utils.text_utils import parse_authors_with_initials, clean_title
4091
- from utils.doi_utils import construct_doi_url, is_valid_doi_format
4092
-
4093
- # Extract key (first part before comma)
4094
- key_match = re.match(r'([^,]+),', content)
4095
- key = key_match.group(1).strip() if key_match else ""
4096
-
4097
- # Extract fields using regex
4098
- fields = {}
4099
-
4100
- # Pattern to match field = {value} or field = "value"
4101
- field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
4102
-
4103
- for match in re.finditer(field_pattern, content, re.DOTALL):
4104
- field_name = match.group(1).lower()
4105
- field_value = match.group(2) or match.group(3) or ""
4106
- # Strip outer quotes if present (handles cases like title = {"Some Title"})
4107
- field_value = field_value.strip()
4108
- if field_value.startswith('"') and field_value.endswith('"'):
4109
- field_value = field_value[1:-1]
4110
- fields[field_name] = field_value
4111
-
4112
- # If field extraction failed, try a simpler approach
4113
- if not fields:
4114
- logger.debug("Field extraction failed, trying line-by-line approach")
4115
- lines = content.split('\n')
4116
- for line in lines:
4117
- line = line.strip()
4118
- if '=' in line:
4119
- field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
4120
- if field_match:
4121
- field_name = field_match.group(1).lower()
4122
- field_value = field_match.group(2).strip()
4123
- # Strip outer quotes if present
4124
- if field_value.startswith('"') and field_value.endswith('"'):
4125
- field_value = field_value[1:-1]
4126
- fields[field_name] = field_value
4127
-
4128
- # Extract required information
4129
- title = fields.get('title', '')
4130
- author_string = fields.get('author', '')
4131
- year = 0
4132
-
4133
- # Parse year
4134
- year_str = fields.get('year', '')
4135
- if year_str:
4136
- year_match = re.search(r'\d{4}', year_str)
4137
- if year_match:
4138
- year = int(year_match.group())
4139
-
4140
- # If no year found but we have a valid title/authors, try extracting from eprint or other fields
4141
- if year == 0 and (title or author_string):
4142
- # Check eprint field for arXiv entries like "2024" prefix
4143
- eprint = fields.get('eprint', '')
4144
- if eprint:
4145
- # Extract year from ArXiv eprint ID (e.g., "2311.09096" -> 2023)
4146
- eprint_year_match = re.match(r'^(\d{2})(\d{2})', eprint)
4147
- if eprint_year_match:
4148
- yy = int(eprint_year_match.group(1))
4149
- # Convert to 4-digit year (23 -> 2023, assumes 21st century)
4150
- if yy >= 91: # ArXiv started in 1991
4151
- year = 1900 + yy
4152
- else:
4153
- year = 2000 + yy
4154
-
4155
- # For entries without year, set None instead of 0
4156
- if year == 0:
4157
- year = None
4158
-
4159
- # Parse authors using the enhanced function
4160
- authors = []
4161
- if author_string:
4162
- try:
4163
- authors = parse_authors_with_initials(author_string)
4164
- except Exception as e:
4165
- logger.debug(f"Author parsing failed for '{author_string}': {e}")
4166
- # Fallback: split by 'and' and clean up
4167
- author_parts = author_string.split(' and ')
4168
- authors = []
4169
- for part in author_parts:
4170
- # Remove leading "and" from author names (handles cases like "and Krishnamoorthy, S")
4171
- part = re.sub(r'^and\s+', '', part.strip())
4172
- if part:
4173
- authors.append(part)
4174
-
4175
- # Special handling for @misc entries with only howpublished field
4176
- if not title and not authors and entry_type == 'misc':
4177
- howpublished = fields.get('howpublished', '')
4178
- if howpublished:
4179
- # Try to extract a URL from howpublished
4180
- url_patterns = [
4181
- r'://([^/]+)', # Missing protocol case: "://example.com/path"
4182
- r'https?://([^/\s]+)', # Standard URL
4183
- r'www\.([^/\s]+)', # www without protocol
4184
- ]
4185
-
4186
- extracted_url = ''
4187
- for pattern in url_patterns:
4188
- match = re.search(pattern, howpublished)
4189
- if match:
4190
- domain = match.group(1)
4191
- # Reconstruct URL with https if protocol was missing
4192
- if howpublished.startswith('://'):
4193
- extracted_url = 'https' + howpublished
4194
- elif not howpublished.startswith(('http://', 'https://')):
4195
- extracted_url = 'https://' + howpublished
4196
- else:
4197
- extracted_url = howpublished
4198
-
4199
- # Generate title from domain/path
4200
- if 'jailbreakchat.com' in domain:
4201
- title = 'JailbreakChat Website'
4202
- elif 'lesswrong.com' in domain:
4203
- title = 'LessWrong Post: Jailbreaking ChatGPT'
4204
- elif 'chat.openai.com' in domain:
4205
- title = 'ChatGPT Conversation Share'
4206
- elif 'gemini.google.com' in domain:
4207
- title = 'Gemini Conversation Share'
4208
- elif 'microsoft.com' in domain:
4209
- title = 'Microsoft Azure Content Safety API'
4210
- elif 'perspectiveapi.com' in domain:
4211
- title = 'Perspective API'
4212
- else:
4213
- # Generic title based on domain
4214
- title = f"Web Resource: {domain}"
4215
-
4216
- authors = ["Web Resource"]
4217
- # Store the extracted URL
4218
- fields['url'] = extracted_url
4219
- break
4065
+ # Use the dedicated biblatex parser
4066
+ from utils.biblatex_parser import parse_biblatex_references
4220
4067
 
4221
- # Apply defaults only if we still don't have values
4222
- if not authors:
4223
- authors = ["Unknown Author"]
4224
-
4225
- # Clean title
4226
- title = clean_title(title) if title else "Unknown Title"
4227
-
4228
- # Extract URL/DOI
4229
- url = fields.get('url', '')
4230
- doi = fields.get('doi', '')
4231
-
4232
- # Construct DOI URL if we have a DOI
4233
- if doi and is_valid_doi_format(doi):
4234
- url = construct_doi_url(doi)
4235
-
4236
- # Construct ArXiv URL from eprint field if no URL present
4237
- if not url:
4238
- eprint = fields.get('eprint', '')
4239
- if eprint and re.match(r'^\d{4}\.\d{4,5}', eprint):
4240
- # Remove version number if present and construct ArXiv URL
4241
- clean_eprint = re.sub(r'v\d+$', '', eprint)
4242
- url = f"https://arxiv.org/abs/{clean_eprint}"
4243
-
4244
- # Handle special URL fields
4245
- if not url:
4246
- howpublished = fields.get('howpublished', '')
4247
- if 'url{' in howpublished or 'href{' in howpublished:
4248
- url_match = re.search(r'url\{([^}]+)\}', howpublished)
4249
- if not url_match:
4250
- url_match = re.search(r'href\{([^}]+)\}', howpublished)
4251
- if url_match:
4252
- from utils.url_utils import clean_url_punctuation
4253
- url = clean_url_punctuation(url_match.group(1))
4254
-
4255
- # Determine reference type
4256
- ref_type = 'other'
4257
- if 'arxiv' in url.lower() or 'arxiv' in title.lower():
4258
- ref_type = 'arxiv'
4259
- elif url or doi:
4260
- ref_type = 'non-arxiv'
4261
-
4262
- # Create structured reference
4263
- structured_ref = {
4264
- 'url': url,
4265
- 'doi': doi,
4266
- 'year': year,
4267
- 'authors': authors,
4268
- 'title': title,
4269
- 'raw_text': f"@{entry_type}{{{key}, {content}}}",
4270
- 'type': ref_type,
4271
- 'bibtex_key': key,
4272
- 'bibtex_type': entry_type
4273
- }
4068
+ # Extract references using the biblatex parser
4069
+ references = parse_biblatex_references(bibliography_text)
4274
4070
 
4275
- logger.debug(f"Parsed BibTeX entry: {title} by {authors} ({year})")
4276
- return structured_ref
4071
+ logger.debug(f"Extracted {len(references)} biblatex references using dedicated parser")
4072
+ return references
4277
4073
 
4278
4074
  def _process_llm_extracted_references(self, references):
4279
4075
  """
@@ -4429,8 +4225,17 @@ class ArxivReferenceChecker:
4429
4225
  return True
4430
4226
 
4431
4227
  # Also check if authors have significant overlap (at least 50% of the shorter author list)
4432
- author1_parts = seg1['author'].split('*') if '*' in seg1['author'] else seg1['author'].split(',')
4433
- author2_parts = seg2['author'].split('*') if '*' in seg2['author'] else seg2['author'].split(',')
4228
+ from utils.text_utils import parse_authors_with_initials
4229
+
4230
+ if '*' in seg1['author']:
4231
+ author1_parts = seg1['author'].split('*')
4232
+ else:
4233
+ author1_parts = parse_authors_with_initials(seg1['author'])
4234
+
4235
+ if '*' in seg2['author']:
4236
+ author2_parts = seg2['author'].split('*')
4237
+ else:
4238
+ author2_parts = parse_authors_with_initials(seg2['author'])
4434
4239
 
4435
4240
  # Clean and normalize author names
4436
4241
  author1_clean = {a.strip().lower() for a in author1_parts if a.strip() and a.strip() not in ['et al', 'others']}
@@ -4945,55 +4750,6 @@ class ArxivReferenceChecker:
4945
4750
  }
4946
4751
 
4947
4752
 
4948
- def _get_bibtex_content(self, paper):
4949
- """
4950
- Try to get BibTeX content for a paper from various sources.
4951
-
4952
- Args:
4953
- paper: Paper object
4954
-
4955
- Returns:
4956
- str: BibTeX content if found, None otherwise
4957
- """
4958
- # Try ArXiv source if it's an ArXiv paper
4959
- from utils.arxiv_utils import extract_arxiv_id_from_paper, download_arxiv_source
4960
-
4961
- arxiv_id = extract_arxiv_id_from_paper(paper)
4962
- if arxiv_id:
4963
- logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
4964
- tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
4965
-
4966
- # Prefer .bib files (most structured), then .bbl files
4967
- if bib_content:
4968
- logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
4969
-
4970
- # If we have LaTeX content, filter BibTeX by cited keys
4971
- if tex_content:
4972
- from utils.text_utils import extract_cited_keys_from_latex, filter_bibtex_by_cited_keys
4973
- cited_keys = extract_cited_keys_from_latex(tex_content)
4974
- if cited_keys:
4975
- logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
4976
- filtered_content = filter_bibtex_by_cited_keys(bib_content, cited_keys)
4977
- return filtered_content
4978
-
4979
- return bib_content
4980
-
4981
- elif bbl_content:
4982
- logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
4983
- return bbl_content
4984
-
4985
- elif tex_content:
4986
- # Check for embedded bibliography in LaTeX
4987
- from utils.text_utils import detect_latex_bibliography_format
4988
- latex_format = detect_latex_bibliography_format(tex_content)
4989
- if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
4990
- logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
4991
- # Skip embedded bibliography and return None to trigger fallback methods
4992
- return None
4993
-
4994
- # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
4995
-
4996
- return None
4997
4753
 
4998
4754
 
4999
4755
  def extract_bibliography(self, paper, debug_mode=False):
@@ -5008,7 +4764,8 @@ class ArxivReferenceChecker:
5008
4764
  logger.debug(f"Extracting bibliography for paper {paper_id}: {paper.title}")
5009
4765
 
5010
4766
  # Check if we can get BibTeX content for this paper (ArXiv or other sources)
5011
- bibtex_content = self._get_bibtex_content(paper)
4767
+ from utils.arxiv_utils import get_bibtex_content
4768
+ bibtex_content = get_bibtex_content(paper)
5012
4769
  if bibtex_content:
5013
4770
  logger.debug(f"Found BibTeX content for {paper_id}, using structured bibliography")
5014
4771
 
@@ -5062,7 +4819,7 @@ class ArxivReferenceChecker:
5062
4819
  else:
5063
4820
  logger.warning("No LLM available for fallback, using original parsing results")
5064
4821
  else:
5065
- logger.info(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
4822
+ logger.debug(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
5066
4823
  else:
5067
4824
  # Parse BibTeX using the standard flow (LLM or regex based on config)
5068
4825
  references = self.parse_references(bibtex_content)
@@ -5076,7 +4833,7 @@ class ArxivReferenceChecker:
5076
4833
  logger.warning(f"Could not save debug references file for {paper_id}: {e}")
5077
4834
 
5078
4835
  if references:
5079
- logger.info(f"Extracted {len(references)} references")
4836
+ logger.debug(f"Extracted {len(references)} references")
5080
4837
  return references
5081
4838
 
5082
4839
  # Check if this is a text file containing references
@@ -5146,7 +4903,7 @@ class ArxivReferenceChecker:
5146
4903
  bibtex_references = extract_latex_references(bib_content, paper.file_path)
5147
4904
 
5148
4905
  if bibtex_references:
5149
- logger.info(f"Extracted {len(bibtex_references)} references from BibTeX file")
4906
+ logger.debug(f"Extracted {len(bibtex_references)} references from BibTeX file")
5150
4907
  return bibtex_references
5151
4908
  else:
5152
4909
  logger.warning(f"No references found in BibTeX file: {paper.file_path}")
@@ -5623,7 +5380,7 @@ class ArxivReferenceChecker:
5623
5380
  error_details = unverified_errors[0].get('error_details', '')
5624
5381
  if error_details:
5625
5382
  subreason = self._categorize_unverified_reason(error_details)
5626
- print(f" Subreason: {subreason}")
5383
+ print(f" Subreason: {subreason}")
5627
5384
 
5628
5385
  year_str = self._format_year_string(reference.get('year'))
5629
5386
 
utils/arxiv_utils.py CHANGED
@@ -288,7 +288,7 @@ def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
288
288
  return bib_content
289
289
 
290
290
  # Parse BibTeX entries and filter
291
- from utils.text_utils import parse_bibtex_entries
291
+ from utils.bibtex_parser import parse_bibtex_entries
292
292
  entries = parse_bibtex_entries(bib_content)
293
293
 
294
294
  # Filter entries to only cited ones
@@ -374,3 +374,79 @@ def reconstruct_bibtex_content(cited_entries, original_content):
374
374
  return '\n\n'.join(filtered_parts) + '\n'
375
375
 
376
376
 
377
def _filter_bib_by_cited_keys(bib_content, tex_content):
    """
    Restrict BibTeX content to the entries cited in the LaTeX source.

    Args:
        bib_content: Raw .bib text
        tex_content: Main LaTeX text, or a falsy value when unavailable

    Returns:
        The filtered BibTeX text, or ``bib_content`` unchanged when there is
        no LaTeX text or no cited keys were found in it
    """
    if not tex_content:
        return bib_content

    # The cited keys are only used here to decide whether filtering is
    # worthwhile (and for the log line); filter_bibtex_by_citations is handed
    # the LaTeX text itself.
    cited_keys = extract_cited_keys_from_tex({}, tex_content)
    if not cited_keys:
        return bib_content

    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
    return filter_bibtex_by_citations(bib_content, {}, tex_content)


def get_bibtex_content(paper):
    """
    Try to get structured bibliography content for a paper.

    Only the ArXiv source bundle is consulted. When the bundle has both .bib
    and .bbl content, the (citation-filtered) .bib is preferred unless the
    .bbl lists more than 1.5x as many entries, in which case the .bbl is
    returned instead. Note that the returned text can therefore be .bbl
    content rather than BibTeX proper.

    Args:
        paper: Paper object (passed to extract_arxiv_id_from_paper)

    Returns:
        str: Bibliography content if found, None otherwise (including when
        only an embedded LaTeX bibliography is available, which is skipped)
    """
    import re

    arxiv_id = extract_arxiv_id_from_paper(paper)
    if not arxiv_id:
        # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
        return None

    logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
    tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)

    if bib_content and bbl_content:
        # Choose between .bib and .bbl based on content richness.
        filtered_content = _filter_bib_by_cited_keys(bib_content, tex_content)
        filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))

        # Count both "\bibitem[label]{key}" and plain "\bibitem{key}" entries.
        # Matching only the bracketed form reports 0 for bibliographies that
        # omit the optional label, so the .bbl could never win the comparison.
        bbl_entry_count = len(re.findall(r'\\bibitem\s*[\[{]', bbl_content))

        logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")

        # Prioritize .bbl only if it has significantly more entries
        if bbl_entry_count > filtered_bib_count * 1.5:  # 50% more entries threshold
            logger.info("Using .bbl files from ArXiv source")
            return bbl_content

        logger.info("Using filtered .bib files")
        return filtered_content

    if bib_content:
        logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
        return _filter_bib_by_cited_keys(bib_content, tex_content)

    if bbl_content:
        logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
        return bbl_content

    if tex_content:
        # An embedded bibliography is detected only so it can be logged; it is
        # deliberately not returned, so callers fall back to other methods.
        from utils.text_utils import detect_latex_bibliography_format
        latex_format = detect_latex_bibliography_format(tex_content)
        if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
            logger.info("Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")

    return None
451
+
452
+