academic-refchecker 1.2.39__tar.gz → 1.2.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {academic_refchecker-1.2.39/src/academic_refchecker.egg-info → academic_refchecker-1.2.40}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/core/refchecker.py +27 -2
  5. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/biblatex_parser.py +106 -9
  6. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/LICENSE +0 -0
  7. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/MANIFEST.in +0 -0
  8. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/README.md +0 -0
  9. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/pyproject.toml +0 -0
  10. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/requirements.txt +0 -0
  11. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/scripts/download_db.py +0 -0
  12. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/scripts/run_tests.py +0 -0
  13. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/scripts/start_vllm_server.py +0 -0
  14. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/setup.cfg +0 -0
  15. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/__init__.py +0 -0
  16. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  17. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  18. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  19. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/requires.txt +0 -0
  20. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  21. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/checkers/__init__.py +0 -0
  22. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/checkers/crossref.py +0 -0
  23. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/checkers/enhanced_hybrid_checker.py +0 -0
  24. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/checkers/github_checker.py +0 -0
  25. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/checkers/local_semantic_scholar.py +0 -0
  26. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/checkers/openalex.py +0 -0
  27. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/checkers/openreview_checker.py +0 -0
  28. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/checkers/semantic_scholar.py +0 -0
  29. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/checkers/webpage_checker.py +0 -0
  30. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/config/__init__.py +0 -0
  31. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/config/logging.conf +0 -0
  32. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/config/settings.py +0 -0
  33. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/core/__init__.py +0 -0
  34. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/core/db_connection_pool.py +0 -0
  35. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/core/parallel_processor.py +0 -0
  36. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/database/__init__.py +0 -0
  37. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/database/download_semantic_scholar_db.py +0 -0
  38. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/llm/__init__.py +0 -0
  39. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/llm/base.py +0 -0
  40. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/llm/providers.py +0 -0
  41. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/scripts/__init__.py +0 -0
  42. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/scripts/start_vllm_server.py +0 -0
  43. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/services/__init__.py +0 -0
  44. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/services/pdf_processor.py +0 -0
  45. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/__init__.py +0 -0
  46. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/arxiv_utils.py +0 -0
  47. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/author_utils.py +0 -0
  48. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/bibliography_utils.py +0 -0
  49. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/bibtex_parser.py +0 -0
  50. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/config_validator.py +0 -0
  51. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/db_utils.py +0 -0
  52. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/doi_utils.py +0 -0
  53. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/error_utils.py +0 -0
  54. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/mock_objects.py +0 -0
  55. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/text_utils.py +0 -0
  56. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/unicode_utils.py +0 -0
  57. {academic_refchecker-1.2.39 → academic_refchecker-1.2.40}/src/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.39
3
+ Version: 1.2.40
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.39"
3
+ __version__ = "1.2.40"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.39
3
+ Version: 1.2.40
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -3386,7 +3386,25 @@ class ArxivReferenceChecker:
3386
3386
  logger.info("Detected biblatex format, using biblatex parser")
3387
3387
  self.used_regex_extraction = True
3388
3388
  # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
3389
- return self._parse_biblatex_references(bibliography_text)
3389
+ biblatex_refs = self._parse_biblatex_references(bibliography_text)
3390
+
3391
+ # If biblatex parsing returned empty results (due to quality validation),
3392
+ # fallback to LLM if available
3393
+ if not biblatex_refs and self.llm_extractor:
3394
+ logger.debug("Biblatex parser returned no results due to quality validation, trying LLM fallback")
3395
+ try:
3396
+ references = self.llm_extractor.extract_references(bibliography_text)
3397
+ if references:
3398
+ logger.debug(f"LLM fallback extracted {len(references)} references")
3399
+ return self._process_llm_extracted_references(references)
3400
+ else:
3401
+ logger.warning("LLM fallback also returned no results")
3402
+ return []
3403
+ except Exception as e:
3404
+ logger.error(f"LLM fallback failed: {e}")
3405
+ return []
3406
+
3407
+ return biblatex_refs
3390
3408
 
3391
3409
  # For non-standard formats, try LLM-based extraction if available
3392
3410
  if self.llm_extractor:
@@ -3610,7 +3628,14 @@ class ArxivReferenceChecker:
3610
3628
  if detect_biblatex_format(bibliography_text):
3611
3629
  logger.debug("Detected biblatex format, using biblatex-specific parsing")
3612
3630
  # biblatex parsing is also robust, so we don't set used_unreliable_extraction
3613
- return self._parse_biblatex_references(bibliography_text)
3631
+ biblatex_refs = self._parse_biblatex_references(bibliography_text)
3632
+
3633
+ # If biblatex parsing returned empty results (due to quality validation),
3634
+ # we'll continue with the unreliable fallback regex parsing
3635
+ if not biblatex_refs:
3636
+ logger.debug("Biblatex parser returned no results due to quality validation, falling back to regex parsing")
3637
+ else:
3638
+ return biblatex_refs
3614
3639
 
3615
3640
  # If we reach here, we're using the unreliable fallback regex parsing
3616
3641
  self.used_unreliable_extraction = True
@@ -138,6 +138,57 @@ def detect_biblatex_format(text: str) -> bool:
138
138
  return has_biblatex_marker or has_numbered_refs
139
139
 
140
140
 
141
+ def _validate_parsing_quality(references: List[Dict[str, Any]]) -> bool:
142
+ """
143
+ Validate that biblatex parsing results are of acceptable quality.
144
+ If quality is poor, we should fallback to LLM parsing instead.
145
+
146
+ Args:
147
+ references: List of parsed reference dictionaries
148
+
149
+ Returns:
150
+ True if parsing quality is acceptable, False if should fallback to LLM
151
+ """
152
+ if not references:
153
+ return False
154
+
155
+ # Count problematic entries
156
+ unknown_authors = 0
157
+ unknown_titles = 0
158
+ total_entries = len(references)
159
+
160
+ for ref in references:
161
+ authors = ref.get('authors', [])
162
+ title = ref.get('title', '')
163
+
164
+ # Check for "Unknown Author" entries
165
+ if not authors or authors == ['Unknown Author']:
166
+ unknown_authors += 1
167
+
168
+ # Check for "Unknown Title" entries
169
+ if not title or title == 'Unknown Title':
170
+ unknown_titles += 1
171
+
172
+ # Calculate failure rates
173
+ author_failure_rate = unknown_authors / total_entries
174
+ title_failure_rate = unknown_titles / total_entries
175
+
176
+ # Quality thresholds - if more than 20% of entries have parsing failures,
177
+ # fallback to LLM which is more robust
178
+ MAX_ACCEPTABLE_FAILURE_RATE = 0.2
179
+
180
+ if author_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
181
+ logger.debug(f"Biblatex parsing quality poor: {author_failure_rate:.1%} unknown authors (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
182
+ return False
183
+
184
+ if title_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
185
+ logger.debug(f"Biblatex parsing quality poor: {title_failure_rate:.1%} unknown titles (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
186
+ return False
187
+
188
+ logger.debug(f"Biblatex parsing quality acceptable: {author_failure_rate:.1%} unknown authors, {title_failure_rate:.1%} unknown titles")
189
+ return True
190
+
191
+
141
192
  def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
142
193
  """
143
194
  Parse biblatex formatted references into structured format
@@ -146,7 +197,8 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
146
197
  text: String containing biblatex .bbl entries
147
198
 
148
199
  Returns:
149
- List of structured reference dictionaries
200
+ List of structured reference dictionaries, or empty list if
201
+ parsing quality is poor (to trigger LLM fallback)
150
202
  """
151
203
  from utils.text_utils import parse_authors_with_initials, clean_title
152
204
  from utils.doi_utils import construct_doi_url, is_valid_doi_format
@@ -171,7 +223,7 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
171
223
  # Find the content between this entry and the next (or end of text)
172
224
  if i + 1 < len(entry_starts):
173
225
  next_start = entry_starts[i + 1][1]
174
- content = text[end:next_start].strip()
226
+ raw_content = text[end:next_start].strip()
175
227
  else:
176
228
  # Last entry - take everything to end, but be smart about stopping
177
229
  remaining = text[end:].strip()
@@ -190,9 +242,20 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
190
242
  if match and match.start() < min_stop:
191
243
  min_stop = match.start()
192
244
 
193
- content = remaining[:min_stop].strip()
245
+ raw_content = remaining[:min_stop].strip()
194
246
 
195
- if content:
247
+ # Clean up content - handle cases where entry might be incomplete or malformed
248
+ if raw_content:
249
+ # Remove stray closing brackets or incomplete markers
250
+ content = raw_content
251
+ # Remove trailing "]" if it's the only thing on the last line
252
+ lines = content.split('\n')
253
+ if len(lines) > 1 and lines[-1].strip() == ']':
254
+ content = '\n'.join(lines[:-1]).strip()
255
+ elif content.strip() == ']':
256
+ # If content is only "], skip this entry as it's incomplete
257
+ continue
258
+
196
259
  matches.append((entry_num, content))
197
260
 
198
261
  for entry_num, content in matches:
@@ -218,6 +281,11 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
218
281
  references.append(parsed_ref)
219
282
 
220
283
  logger.debug(f"Extracted {len(references)} biblatex references")
284
+
285
+ # Validate parsing quality - if poor, return empty list to trigger LLM fallback
286
+ if not _validate_parsing_quality(references):
287
+ return []
288
+
221
289
  return references
222
290
 
223
291
 
@@ -263,6 +331,8 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
263
331
  # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
264
332
  # Order matters: more specific patterns first
265
333
  title_patterns = [
334
+ # Pattern for unquoted books: "Author1 and Author2, Title: Subtitle. Location: Publisher, Year."
335
+ r'(?:and\s+[A-Z][^,]*),\s+([A-Z][^.]*?:\s*[^.]*?)\.\s+[A-Z][^:]*:\s*[^,]*,\s*\d{4}',
266
336
  r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
267
337
  r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}', # ".Title. Year" - for cases where authors end without space
268
338
  r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}', # "Name.Title. Year" - missing space after period
@@ -276,7 +346,14 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
276
346
  if title_match:
277
347
  potential_title = title_match.group(1)
278
348
  # Make sure it looks like a title and not author names
279
- if len(potential_title) > 10 and not re.match(r'^[A-Z][a-z]+,\s*[A-Z]', potential_title):
349
+ # Be more specific about author name patterns - should be "Surname, Initial" not "Word, Word"
350
+ author_like_pattern = r'^[A-Z][a-z]+,\s*[A-Z]\.?$' # "Smith, J." or "Smith, J"
351
+ multi_word_author = r'^[A-Z][a-z]+,\s*[A-Z][a-z]+$' # "Smith, John" - but still reject this
352
+
353
+ is_author_like = (re.match(author_like_pattern, potential_title) or
354
+ re.match(multi_word_author, potential_title))
355
+
356
+ if len(potential_title) > 2 and not is_author_like:
280
357
  title = clean_title(potential_title)
281
358
  break
282
359
 
@@ -330,16 +407,25 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
330
407
  # Examples we need to handle:
331
408
  # "Egor Zverev, Sahar Abdelnabi, Mario Fritz, and Christoph H Lampert. \"Title\". In: venue (year)."
332
409
  # "Andrej Karpathy. Intro to Large Language Models. https://... year."
410
+ # "A. Author and B. Coauthor, \"Title\"," <- handle this format
333
411
 
334
412
  # Try multiple patterns to extract authors
413
+ # Order matters - more specific patterns first!
335
414
  author_patterns = [
336
415
  # Pattern 1: Authors followed by quoted title (handle both regular and smart quotes)
416
+ r'^([^"\u201c\u201d]+?),\s*["\u201c\u201d]', # "Authors, \"Title\"" - more restrictive, requires comma before quote
337
417
  r'^([^"\u201c\u201d]+)\.\s*["\u201c\u201d]', # "Authors. \"Title\"" or smart quotes
338
418
 
339
- # Pattern 2: Authors followed by title, then period, then year or venue
419
+ # Pattern 2: Authors followed by unquoted title for books: "Author1 and Author2, Title:"
420
+ r'^([^,]+(?:\s+and\s+[^,]+)?),\s+([A-Z][^.]*?):\s*([^.]*?)\.', # "Author1 and Author2, Title: Subtitle." - book format
421
+
422
+ # Pattern 3: Authors ending with period, no space, then title (missing space case) - MORE SPECIFIC
423
+ r'^([^.]+?)\.([A-Z][^.]*)\.', # "Authors.Title." - missing space after period
424
+
425
+ # Pattern 4: Authors followed by title, then period, then year or venue (with extracted title)
340
426
  r'^(.+?)\.\s*([A-Z][^.]+)\.\s+(?:In:|https?://|\d{4})', # "Authors. Title. In:/URL/Year" (allow no space after period)
341
427
 
342
- # Pattern 3: Authors ending with period followed by capital letter (simpler fallback)
428
+ # Pattern 5: Authors ending with period followed by capital letter (simpler fallback) - LEAST SPECIFIC
343
429
  r'^([^.]+?)\.\s*[A-Z]', # Allow no space after period
344
430
  ]
345
431
 
@@ -349,9 +435,17 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
349
435
  potential_authors = author_match.group(1).strip()
350
436
 
351
437
  # For patterns that also capture title, extract it
352
- if i == 1 and not title and len(author_match.groups()) > 1:
438
+ if i == 2 and not title and len(author_match.groups()) > 2:
439
+ # Pattern 2 (book format) captures authors, title, and subtitle
440
+ title_part = author_match.group(2).strip()
441
+ subtitle_part = author_match.group(3).strip()
442
+ combined_title = f"{title_part}: {subtitle_part}" if subtitle_part else title_part
443
+ if len(combined_title) > 2:
444
+ title = clean_title(combined_title)
445
+ elif (i == 3 or i == 4) and not title and len(author_match.groups()) > 1:
446
+ # Pattern 3 (missing space, index 3) and Pattern 4 (with space, index 4) capture both authors and title
353
447
  potential_title = author_match.group(2).strip()
354
- if len(potential_title) > 5 and not re.match(r'^[A-Z][a-z]+,', potential_title):
448
+ if len(potential_title) > 2 and not re.match(r'^[A-Z][a-z]+,', potential_title):
355
449
  title = clean_title(potential_title)
356
450
 
357
451
  # Validate that this looks like authors
@@ -431,8 +525,11 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
431
525
  authors.append(part)
432
526
 
433
527
  # 7. Extract journal/venue - look for patterns like "In: Conference" or remaining text
528
+ # Also handle cases like "Tasks,"Adv. Neural" where there's missing space after quote-comma
434
529
  journal_patterns = [
435
530
  r'In:\s*([^.]+?)(?:\.|$)', # "In: Conference Name"
531
+ r'"[^"]*,"([A-Z][^,]*?\. [A-Z][^,]*)', # Quote-comma-venue like "Tasks,"Adv. Neural Inf. Process. Syst."
532
+ r'["\u201c\u201d]([A-Z][^.]*(?:Adv\.|Proc\.|IEEE|Journal)[^.]*)', # Missing space after quote like "Tasks"Adv. Neural"
436
533
  r'([A-Z][^.]*(?:Conference|Workshop|Journal|Proceedings)[^.]*)', # Conference/journal names
437
534
  ]
438
535