academic-refchecker 1.2.37__py3-none-any.whl → 1.2.39__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for RefChecker."""
 
- __version__ = "1.2.37"
+ __version__ = "1.2.39"
academic_refchecker-{1.2.37 → 1.2.39}.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: academic-refchecker
- Version: 1.2.37
+ Version: 1.2.39
  Summary: A comprehensive tool for validating reference accuracy in academic papers
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
  License-Expression: MIT
academic_refchecker-{1.2.37 → 1.2.39}.dist-info/RECORD CHANGED
@@ -1,5 +1,5 @@
- __version__.py,sha256=rsFw2SftIDg9yKghlFCWIN2abJx55aqbjODKqOrszDE,65
- academic_refchecker-1.2.37.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+ __version__.py,sha256=63hU3Q1fGBiJ1GUnUQ-V6-S8pbWZ7bug_ZVu4V6eo9g,65
+ academic_refchecker-1.2.39.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
  checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -15,7 +15,7 @@ config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
  core/parallel_processor.py,sha256=2S1cAPhtWH3glvtiJrt9JkZzk2iJkPKXsc-F3lg0X6U,16795
- core/refchecker.py,sha256=XI5yVa8KrVPEE8VTigG_G7K91SeGKxU0Uz8L8o6REu4,276733
+ core/refchecker.py,sha256=8EatAqYEDpW219Xrn-ql1oQ5ytmCU8RW8pMtlujRbC8,273167
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,20 +26,21 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
  services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
  services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
  utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
- utils/arxiv_utils.py,sha256=HNmUg3mfvQDZOI8dO5T3n_NUaJ4UVluLcOx0A4Q6cbs,14757
+ utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
- utils/biblatex_parser.py,sha256=gfcwNa-DpLG5BCJ3yS7IXDybCxwQZjBFj0hAqUwsfLU,19536
+ utils/biblatex_parser.py,sha256=JiO_tznsemhmGFs-pDM2qGuDlvT1ArIyc6bmsdwDOPQ,20452
+ utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
  utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
  utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
  utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
- utils/text_utils.py,sha256=pscNw6EgxBZKSzcHjLErWUvWpnsowo8SBev8hbhMGBc,186581
+ utils/text_utils.py,sha256=8luQsOBfcEBv3O16d3LlQmCuoEB0dEF0aQWGey-s3us,190502
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
- utils/url_utils.py,sha256=qoimCrMFCBGvlmF_t1c6zSOmkWi_rUm-gZM0XZ4rEVE,6291
- academic_refchecker-1.2.37.dist-info/METADATA,sha256=sPihBUqydlGpu9kb9o--begd-_bvAwQmUXGAFSEZhRM,22298
- academic_refchecker-1.2.37.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- academic_refchecker-1.2.37.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
- academic_refchecker-1.2.37.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
- academic_refchecker-1.2.37.dist-info/RECORD,,
+ utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
+ academic_refchecker-1.2.39.dist-info/METADATA,sha256=Uz4a9D0tfull6uDAZTafQJOem7p8IqPA6bjl_pYUf48,22298
+ academic_refchecker-1.2.39.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ academic_refchecker-1.2.39.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+ academic_refchecker-1.2.39.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+ academic_refchecker-1.2.39.dist-info/RECORD,,
core/refchecker.py CHANGED
@@ -451,47 +451,10 @@ class ArxivReferenceChecker:
 
  def extract_arxiv_id_from_url(self, url):
  """
- Extract ArXiv ID from a URL or text containing ArXiv reference
+ Extract ArXiv ID from a URL or text containing ArXiv reference.
+ Uses the common extraction function from utils.url_utils.
  """
- if not url:
- return None
-
- # First, check for arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
- arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
- if arxiv_match:
- arxiv_id = arxiv_match.group(1)
- # Remove version number if present
- arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
- return arxiv_id
-
- # Remove version string from end if present (e.g., 'v1')
- url = re.sub(r'v\d+$', '', url)
-
- # Parse URL
- parsed_url = urlparse(url)
-
- # Check if it's an arxiv.org URL
- if 'arxiv.org' in parsed_url.netloc:
- # Extract ID from path
- path = parsed_url.path.strip('/')
-
- # Handle different URL formats
- if path.startswith('abs/'):
- arxiv_id = path.replace('abs/', '')
- elif path.startswith('pdf/'):
- arxiv_id = path.replace('pdf/', '').replace('.pdf', '')
- elif '/abs/' in path:
- arxiv_id = path.split('/abs/')[1]
- elif '/pdf/' in path:
- arxiv_id = path.split('/pdf/')[1].replace('.pdf', '')
- else:
- arxiv_id = path
-
- # Remove version number from the extracted ID
- arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
- return arxiv_id
-
- return None
+ return extract_arxiv_id_from_url(url)
 
  def get_paper_metadata(self, arxiv_id):
  """
@@ -3581,11 +3544,9 @@ class ArxivReferenceChecker:
  # Clean author part and extract authors
  author_part_clean = strip_latex_commands(author_part).strip()
  if author_part_clean and not author_part_clean.startswith('\\'):
- # Parse author names - handle comma-separated list and "and"
- if ', and ' in author_part_clean:
- author_names = re.split(r', and |, ', author_part_clean)
- else:
- author_names = [name.strip() for name in author_part_clean.split(',')]
+ # Parse author names using the robust author parsing function
+ from utils.text_utils import parse_authors_with_initials
+ author_names = parse_authors_with_initials(author_part_clean)
 
  # Clean up author names
  authors = []
@@ -4264,8 +4225,17 @@ class ArxivReferenceChecker:
  return True
 
  # Also check if authors have significant overlap (at least 50% of the shorter author list)
- author1_parts = seg1['author'].split('*') if '*' in seg1['author'] else seg1['author'].split(',')
- author2_parts = seg2['author'].split('*') if '*' in seg2['author'] else seg2['author'].split(',')
+ from utils.text_utils import parse_authors_with_initials
+
+ if '*' in seg1['author']:
+ author1_parts = seg1['author'].split('*')
+ else:
+ author1_parts = parse_authors_with_initials(seg1['author'])
+
+ if '*' in seg2['author']:
+ author2_parts = seg2['author'].split('*')
+ else:
+ author2_parts = parse_authors_with_initials(seg2['author'])
 
  # Clean and normalize author names
  author1_clean = {a.strip().lower() for a in author1_parts if a.strip() and a.strip() not in ['et al', 'others']}
@@ -4780,55 +4750,6 @@ class ArxivReferenceChecker:
  }
 
 
- def _get_bibtex_content(self, paper):
- """
- Try to get BibTeX content for a paper from various sources.
-
- Args:
- paper: Paper object
-
- Returns:
- str: BibTeX content if found, None otherwise
- """
- # Try ArXiv source if it's an ArXiv paper
- from utils.arxiv_utils import extract_arxiv_id_from_paper, download_arxiv_source
-
- arxiv_id = extract_arxiv_id_from_paper(paper)
- if arxiv_id:
- logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
- tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
-
- # Prefer .bib files (most structured), then .bbl files
- if bib_content:
- logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
-
- # If we have LaTeX content, filter BibTeX by cited keys
- if tex_content:
- from utils.text_utils import extract_cited_keys_from_latex, filter_bibtex_by_cited_keys
- cited_keys = extract_cited_keys_from_latex(tex_content)
- if cited_keys:
- logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
- filtered_content = filter_bibtex_by_cited_keys(bib_content, cited_keys)
- return filtered_content
-
- return bib_content
-
- elif bbl_content:
- logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
- return bbl_content
-
- elif tex_content:
- # Check for embedded bibliography in LaTeX
- from utils.text_utils import detect_latex_bibliography_format
- latex_format = detect_latex_bibliography_format(tex_content)
- if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
- logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
- # Skip embedded bibliography and return None to trigger fallback methods
- return None
-
- # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
-
- return None
 
 
  def extract_bibliography(self, paper, debug_mode=False):
@@ -4843,7 +4764,8 @@ class ArxivReferenceChecker:
  logger.debug(f"Extracting bibliography for paper {paper_id}: {paper.title}")
 
  # Check if we can get BibTeX content for this paper (ArXiv or other sources)
- bibtex_content = self._get_bibtex_content(paper)
+ from utils.arxiv_utils import get_bibtex_content
+ bibtex_content = get_bibtex_content(paper)
  if bibtex_content:
  logger.debug(f"Found BibTeX content for {paper_id}, using structured bibliography")
 
@@ -4897,7 +4819,7 @@ class ArxivReferenceChecker:
  else:
  logger.warning("No LLM available for fallback, using original parsing results")
  else:
- logger.info(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
+ logger.debug(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
  else:
  # Parse BibTeX using the standard flow (LLM or regex based on config)
  references = self.parse_references(bibtex_content)
@@ -5458,7 +5380,7 @@ class ArxivReferenceChecker:
  error_details = unverified_errors[0].get('error_details', '')
  if error_details:
  subreason = self._categorize_unverified_reason(error_details)
- print(f" Subreason: {subreason}")
+ print(f" Subreason: {subreason}")
 
  year_str = self._format_year_string(reference.get('year'))
 
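For context on the author-overlap change above: a bare comma split breaks "Surname, Initial." author strings into roughly twice as many fragments as there are authors, which skews the 50%-overlap check. A small self-contained sketch of the failure mode; the grouping regex below is only an illustrative stand-in, not the package's parse_authors_with_initials:

    import re

    authors = "Hashimoto, K., Saoud, A., and Kishida, M."

    # Naive comma split: surname and initial end up as separate "authors".
    naive = [p.strip() for p in authors.split(',') if p.strip()]
    # ['Hashimoto', 'K.', 'Saoud', 'A.', 'and Kishida', 'M.']  -> 6 fragments for 3 authors

    # Initials-aware grouping keeps each "Surname, I." pair together.
    grouped = re.findall(r"[A-Z][\w'-]+,\s*(?:[A-Z]\.\s*)+", authors)
    # ['Hashimoto, K.', 'Saoud, A.', 'Kishida, M.']  -> one fragment per author

    print(naive)
    print([g.strip() for g in grouped])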
utils/arxiv_utils.py CHANGED
@@ -374,3 +374,79 @@ def reconstruct_bibtex_content(cited_entries, original_content):
  return '\n\n'.join(filtered_parts) + '\n'
 
 
+ def get_bibtex_content(paper):
+ """
+ Try to get BibTeX content for a paper from various sources.
+
+ Args:
+ paper: Paper object
+
+ Returns:
+ str: BibTeX content if found, None otherwise
+ """
+ import re
+
+ # Try ArXiv source if it's an ArXiv paper
+ arxiv_id = extract_arxiv_id_from_paper(paper)
+ if arxiv_id:
+ logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
+ tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
+
+ # Choose between .bib and .bbl files based on content richness
+ # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
+ if bib_content and bbl_content:
+ # Count entries in both
+ bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+ bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
+
+ # If we have LaTeX content, get filtered BibTeX count
+ filtered_bib_count = bib_entry_count
+ filtered_content = bib_content
+ if tex_content:
+ cited_keys = extract_cited_keys_from_tex({}, tex_content)
+ if cited_keys:
+ logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+ filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+ filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
+
+ logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
+
+ # Prioritize .bbl if it has significantly more entries
+ if bbl_entry_count > filtered_bib_count * 1.5: # 50% more entries threshold
+ logger.info(f"Using .bbl files from ArXiv source")
+ return bbl_content
+ else:
+ logger.info(f"Using filtered .bib files")
+ return filtered_content
+
+ elif bib_content:
+ logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
+
+ # If we have LaTeX content, filter BibTeX by cited keys
+ if tex_content:
+ cited_keys = extract_cited_keys_from_tex({}, tex_content)
+ if cited_keys:
+ logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+ filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+ return filtered_content
+
+ return bib_content
+
+ elif bbl_content:
+ logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
+ return bbl_content
+
+ elif tex_content:
+ # Check for embedded bibliography in LaTeX
+ from utils.text_utils import detect_latex_bibliography_format
+ latex_format = detect_latex_bibliography_format(tex_content)
+ if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
+ logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
+ # Skip embedded bibliography and return None to trigger fallback methods
+ return None
+
+ # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
+
+ return None
+
+
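A minimal sketch of the .bib/.bbl selection arithmetic in get_bibtex_content above, using invented fragments in place of downloaded ArXiv source files; it only exercises the two entry-counting regexes and the 1.5x threshold from the diff:

    import re

    # Invented fragments standing in for downloaded ArXiv source files.
    bib_content = """
    @article{smith2020, title={A}, year={2020}}
    @inproceedings{lee2021, title={B}, year={2021}}
    """
    bbl_content = r"""
    \bibitem[Smith(2020)]{smith2020} A. Smith. A. 2020.
    \bibitem[Lee(2021)]{lee2021} B. Lee. B. 2021.
    \bibitem[Chen(2022)]{chen2022} C. Chen. C. 2022.
    \bibitem[Wu(2023)]{wu2023} D. Wu. D. 2023.
    """

    bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))    # 2
    bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))  # 4

    # Same 50%-more-entries threshold as above: prefer .bbl here.
    use_bbl = bbl_entry_count > bib_entry_count * 1.5
    print(bib_entry_count, bbl_entry_count, use_bbl)  # 2 4 True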
utils/biblatex_parser.py CHANGED
@@ -261,11 +261,13 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
  else:
  # If no quoted title, look for title after author names
  # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
+ # Order matters: more specific patterns first
  title_patterns = [
- r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year"
- r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing)
- r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
+ r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
  r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}', # ".Title. Year" - for cases where authors end without space
+ r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}', # "Name.Title. Year" - missing space after period
+ r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year" - LESS SPECIFIC
+ r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
  r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https', # "Title . https" - handle space before period
  ]
 
@@ -391,10 +393,10 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
  # Fallback: split by common patterns if parse_authors_with_initials failed
  if not authors:
  if 'et al' in authors_text.lower():
- # Handle "FirstAuthor et al." case
+ # Handle "FirstAuthor et al." case - separate base author from "et al"
  base_author = authors_text.split(' et al')[0].strip()
  if base_author:
- authors = [base_author + ' et al']
+ authors = [base_author, 'et al']
  elif ' and ' in authors_text:
  # Handle "Author1 and Author2 and Author3" format
  author_parts = [p.strip() for p in authors_text.split(' and ')]
@@ -404,18 +406,29 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
  if part and len(part) > 2:
  authors.append(part)
  else:
- # Try comma separation for "Author1, Author2, Author3"
- author_parts = [p.strip() for p in authors_text.split(',')]
- authors = []
- for part in author_parts:
- part = part.strip(' .')
- # Remove "and" prefix if present
- if part.startswith('and '):
- part = part[4:].strip()
- # Skip parts that are too short or look like initials only
- if (part and len(part) > 2 and
- not re.search(r'\b(http|www|doi|arxiv|proceedings)\b', part.lower())):
- authors.append(part)
+ # Try sophisticated parsing one more time with relaxed constraints
+ try:
+ # Remove "and" connectors for cleaner parsing
+ clean_text = re.sub(r'\s+and\s+', ', ', authors_text)
+ fallback_authors = parse_authors_with_initials(clean_text)
+ if fallback_authors and len(fallback_authors) >= 1:
+ authors = fallback_authors
+ else:
+ raise ValueError("Fallback parsing failed")
+ except:
+ # Last resort: naive comma separation for "Author1, Author2, Author3"
+ # This should rarely be reached now
+ author_parts = [p.strip() for p in authors_text.split(',')]
+ authors = []
+ for part in author_parts:
+ part = part.strip(' .')
+ # Remove "and" prefix if present
+ if part.startswith('and '):
+ part = part[4:].strip()
+ # Skip parts that are too short or look like initials only
+ if (part and len(part) > 2 and
+ not re.search(r'\b(http|www|doi|arxiv|proceedings)\b', part.lower())):
+ authors.append(part)
 
  # 7. Extract journal/venue - look for patterns like "In: Conference" or remaining text
  journal_patterns = [
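A quick, self-contained check of the title pattern that is now tried first ("Authors. Title. URL/arXiv/Year"); the entry string is invented for illustration:

    import re

    entry = "J. Doe and M. Roe. Attention Is Not All You Need. 2021."

    # First (most specific) pattern from title_patterns above.
    pattern = r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})'
    m = re.search(pattern, entry)
    print(m.group(1) if m else None)  # Attention Is Not All You Need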
utils/bibliography_utils.py ADDED
@@ -0,0 +1,332 @@
+ #!/usr/bin/env python3
+ """
+ Bibliography extraction and parsing utilities.
+
+ This module provides utilities for extracting and parsing bibliographies from
+ academic papers in various formats (LaTeX, BibTeX, PDF text, etc.).
+ """
+
+ import re
+ import logging
+ import os
+
+ logger = logging.getLogger(__name__)
+
+
+ def extract_text_from_latex(latex_file_path):
+ """
+ Extract text from a LaTeX file
+
+ Args:
+ latex_file_path: Path to the LaTeX file
+
+ Returns:
+ String containing the LaTeX file content
+ """
+ try:
+ logger.info(f"Reading LaTeX file: {latex_file_path}")
+ with open(latex_file_path, 'r', encoding='utf-8') as f:
+ content = f.read()
+ logger.info(f"Successfully read LaTeX file with {len(content)} characters")
+ return content
+ except UnicodeDecodeError:
+ # Try with latin-1 encoding if utf-8 fails
+ try:
+ logger.warning(f"UTF-8 encoding failed for {latex_file_path}, trying latin-1")
+ with open(latex_file_path, 'r', encoding='latin-1') as f:
+ content = f.read()
+ logger.info(f"Successfully read LaTeX file with latin-1 encoding")
+ return content
+ except Exception as e:
+ logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
+ return None
+ except Exception as e:
+ logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
+ return None
+
+
+ def find_bibliography_section(text):
+ """
+ Find the bibliography section in the text
+ """
+ if not text:
+ logger.warning("No text provided to find_bibliography_section")
+ return None
+
+ # Log a sample of the text for debugging
+ text_sample = text[:500] + "..." if len(text) > 500 else text
+ logger.debug(f"Text sample: {text_sample}")
+
+ # Common section titles for bibliography
+ section_patterns = [
+ # Patterns for numbered sections with potential spacing issues from PDF extraction
+ r'(?i)\d+\s*ref\s*er\s*ences\s*\n', # "12 Refer ences" with spaces
+ r'(?i)\d+\s*references\s*\n', # "12References" or "12 References"
+ r'(?i)^\s*\d+\.\s*references\s*$', # Numbered section: "7. References"
+ r'(?i)\d+\s+references\s*\.', # "9 References." format used in Georgia Tech paper
+ # Standard reference patterns
+ r'(?i)references\s*\n',
+ r'(?i)bibliography\s*\n',
+ r'(?i)works cited\s*\n',
+ r'(?i)literature cited\s*\n',
+ r'(?i)references\s*$', # End of document
+ r'(?i)\[\s*references\s*\]', # [References]
+ r'(?i)^\s*references\s*$', # References as a standalone line
+ r'(?i)^\s*bibliography\s*$', # Bibliography as a standalone line
+ r'(?i)references\s*and\s*citations', # References and Citations
+ r'(?i)cited\s*references', # Cited References
+ r'(?i)reference\s*list', # Reference List
+ ]
+
+ bibliography_start = None
+ matched_pattern = None
+
+ for pattern in section_patterns:
+ matches = re.search(pattern, text, re.MULTILINE)
+ if matches:
+ bibliography_start = matches.end()
+ matched_pattern = pattern
+ logger.debug(f"Bibliography section found using pattern: {pattern}")
+ break
+
+ if bibliography_start is None:
+ logger.debug("No bibliography section header found, trying end-of-document approach")
+ # Try to find bibliography at the end of the document without explicit headers
+ lines = text.split('\n')
+ for i in range(len(lines) - 1, max(0, len(lines) - 100), -1): # Check last 100 lines
+ line = lines[i].strip()
+ if re.match(r'^\[\d+\]', line) or re.match(r'^\d+\.', line):
+ # Found what looks like reference entries
+ bibliography_start = text.rfind('\n'.join(lines[i:]))
+ logger.debug(f"Bibliography section found at end of document starting with: {line[:50]}")
+ break
+
+ if bibliography_start is not None:
+ bibliography_text = text[bibliography_start:].strip()
+ logger.debug(f"Bibliography text length: {len(bibliography_text)}")
+
+ # Optional: Try to find the end of the bibliography section
+ # This is challenging because it might go to the end of the document
+ # or be followed by appendices, acknowledgments, etc.
+
+ return bibliography_text
+
+ logger.debug("Bibliography section not found")
+ return None
+
+
+ def parse_references(bibliography_text):
+ """
+ Parse references from bibliography text using multiple parsing strategies.
+
+ Args:
+ bibliography_text: String containing bibliography content
+
+ Returns:
+ List of parsed reference dictionaries
+ """
+ if not bibliography_text:
+ logger.warning("No bibliography text provided to parse_references")
+ return []
+
+ # Try different parsing strategies in order of preference
+ parsing_strategies = [
+ ('BibTeX', _parse_bibtex_references),
+ ('biblatex', _parse_biblatex_references),
+ ('ACM/natbib', _parse_standard_acm_natbib_references),
+ ('regex-based', _parse_references_regex)
+ ]
+
+ for strategy_name, parse_func in parsing_strategies:
+ try:
+ logger.debug(f"Attempting {strategy_name} parsing")
+ references = parse_func(bibliography_text)
+ if references and len(references) > 0:
+ logger.info(f"Successfully parsed {len(references)} references using {strategy_name} format")
+ return references
+ else:
+ logger.debug(f"{strategy_name} parsing returned no references")
+ except Exception as e:
+ logger.debug(f"{strategy_name} parsing failed: {e}")
+ continue
+
+ logger.warning("All parsing strategies failed to extract references")
+ return []
+
+
+ def _parse_bibtex_references(bibliography_text):
+ """
+ Parse BibTeX formatted references like @inproceedings{...}, @article{...}, etc.
+
+ Args:
+ bibliography_text: String containing BibTeX entries
+
+ Returns:
+ List of reference dictionaries
+ """
+ from utils.bibtex_parser import parse_bibtex_entries
+ return parse_bibtex_entries(bibliography_text)
+
+
+ def _parse_biblatex_references(bibliography_text):
+ """
+ Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
+
+ Args:
+ bibliography_text: String containing biblatex .bbl entries
+
+ Returns:
+ List of reference dictionaries
+ """
+ from utils.text_utils import extract_latex_references
+ return extract_latex_references(bibliography_text)
+
+
+ def _parse_standard_acm_natbib_references(bibliography_text):
+ """
+ Parse references using regex for standard ACM/natbib format (both ACM Reference Format and simple natbib)
+ """
+ from utils.text_utils import detect_standard_acm_natbib_format
+
+ references = []
+
+ # Check if this is standard ACM natbib format
+ format_info = detect_standard_acm_natbib_format(bibliography_text)
+ if format_info['is_acm_natbib']:
+ logger.debug("Detected standard ACM natbib format")
+
+ # Split by reference entries
+ ref_pattern = r'\[(\d+)\]\s*'
+ entries = re.split(ref_pattern, bibliography_text)[1:] # Skip first empty element
+
+ for i in range(0, len(entries), 2):
+ if i + 1 < len(entries):
+ ref_num = entries[i]
+ ref_content = entries[i + 1].strip()
+
+ try:
+ reference = _parse_simple_natbib_format(int(ref_num), ref_content, f"[{ref_num}]")
+ if reference:
+ references.append(reference)
+ logger.debug(f"Parsed reference {ref_num}: {reference.get('title', 'No title')[:50]}...")
+ except Exception as e:
+ logger.debug(f"Error parsing reference {ref_num}: {e}")
+ continue
+
+ logger.debug(f"ACM natbib parsing extracted {len(references)} references")
+
+ return references
+
+
+ def _parse_simple_natbib_format(ref_num, content, label):
+ """
+ Parse a simple natbib format reference entry.
+
+ Args:
+ ref_num: Reference number
+ content: Reference content text
+ label: Reference label (e.g., "[1]")
+
+ Returns:
+ Dictionary containing parsed reference information
+ """
+ from utils.text_utils import extract_url_from_reference, extract_year_from_reference
+
+ # Basic parsing - this could be enhanced with more sophisticated NLP
+ reference = {
+ 'raw_text': content,
+ 'label': label,
+ 'type': 'unknown'
+ }
+
+ # Try to extract basic information
+ # This is a simplified parser - real parsing would be much more complex
+
+ # Look for URL
+ url = extract_url_from_reference(content)
+ if url:
+ reference['url'] = url
+
+ # Look for year
+ year = extract_year_from_reference(content)
+ if year:
+ reference['year'] = year
+
+ # Try to identify the type based on content
+ content_lower = content.lower()
+ if 'proceedings' in content_lower or 'conference' in content_lower:
+ reference['type'] = 'inproceedings'
+ elif 'journal' in content_lower or 'trans.' in content_lower:
+ reference['type'] = 'article'
+ elif 'arxiv' in content_lower:
+ reference['type'] = 'misc'
+ reference['note'] = 'arXiv preprint'
+
+ return reference
+
+
+ def _parse_references_regex(bibliography_text):
+ """
+ Parse references using regex-based approach (original implementation)
+ """
+ references = []
+
+ # Split bibliography into individual references
+ # Look for patterns like [1], [2], etc.
+ ref_pattern = r'\[(\d+)\](.*?)(?=\[\d+\]|$)'
+ matches = re.findall(ref_pattern, bibliography_text, re.DOTALL)
+
+ for ref_num, ref_content in matches:
+ ref_content = ref_content.strip()
+ if not ref_content:
+ continue
+
+ reference = {
+ 'raw_text': ref_content,
+ 'label': f"[{ref_num}]",
+ 'type': 'unknown'
+ }
+
+ # Basic information extraction
+ from utils.text_utils import extract_url_from_reference, extract_year_from_reference
+
+ url = extract_url_from_reference(ref_content)
+ if url:
+ reference['url'] = url
+
+ year = extract_year_from_reference(ref_content)
+ if year:
+ reference['year'] = year
+
+ references.append(reference)
+
+ return references
+
+
+ def _is_bibtex_surname_given_format(surname_part, given_part):
+ """
+ Check if this appears to be a BibTeX "Surname, Given" format.
+
+ Args:
+ surname_part: The part before the comma
+ given_part: The part after the comma
+
+ Returns:
+ Boolean indicating if this looks like BibTeX name format
+ """
+ # Simple heuristics to detect BibTeX format
+ if not surname_part or not given_part:
+ return False
+
+ # Check if surname looks like a surname (capitalized, not too long)
+ if not re.match(r'^[A-Z][a-zA-Z\s\-\']+$', surname_part.strip()):
+ return False
+
+ # Check if given part looks like given names (often abbreviated)
+ given_clean = given_part.strip()
+ if re.match(r'^[A-Z](\.\s*[A-Z]\.?)*$', given_clean): # Like "J. R." or "M. K."
+ return True
+ if re.match(r'^[A-Z][a-z]+(\s+[A-Z][a-z]*)*$', given_clean): # Like "John Robert"
+ return True
+
+ return False
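As a small illustration of the regex-based fallback at the end of the new module, the entry-splitting pattern behaves like this on an invented two-entry bibliography; this sketch only reproduces the splitting step, not the URL/year extraction, which lives in utils.text_utils:

    import re

    bibliography_text = (
        "[1] A. Author. A Paper Title. In Proceedings of X, 2020.\n"
        "[2] B. Writer. Another Title. Journal of Y, 2021. https://example.org/paper"
    )

    # Same pattern as _parse_references_regex: number in brackets, then
    # everything up to the next bracketed number or end of text.
    matches = re.findall(r'\[(\d+)\](.*?)(?=\[\d+\]|$)', bibliography_text, re.DOTALL)
    for ref_num, ref_content in matches:
        print(f"[{ref_num}]", ref_content.strip())
    # [1] A. Author. A Paper Title. In Proceedings of X, 2020.
    # [2] B. Writer. Another Title. Journal of Y, 2021. https://example.org/paper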
utils/text_utils.py CHANGED
@@ -11,6 +11,31 @@ from typing import List
  logger = logging.getLogger(__name__)
 
 
+ def normalize_apostrophes(text):
+ """
+ Normalize all apostrophe variants to standard ASCII apostrophe
+ """
+ if not text:
+ return text
+
+ # All known apostrophe variants
+ apostrophe_variants = [
+ "'", # U+0027 ASCII apostrophe
+ "’", # U+2019 Right single quotation mark (most common)
+ "‘", # U+2018 Left single quotation mark
+ "ʼ", # U+02BC Modifier letter apostrophe
+ "ˈ", # U+02C8 Modifier letter vertical line (primary stress)
+ "`", # U+0060 Grave accent (sometimes used as apostrophe)
+ "´", # U+00B4 Acute accent (sometimes used as apostrophe)
+ ]
+
+ # Replace all variants with standard ASCII apostrophe
+ for variant in apostrophe_variants:
+ text = text.replace(variant, "'")
+
+ return text
+
+
  def normalize_text(text):
  """
  Normalize text by removing diacritical marks and special characters
@@ -18,6 +43,9 @@ def normalize_text(text):
  if not text:
  return ""
 
+ # First normalize apostrophes to standard form
+ text = normalize_apostrophes(text)
+
  # Replace common special characters with their ASCII equivalents
  replacements = {
  'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
@@ -29,7 +57,7 @@ def normalize_text(text):
  'Ł': 'L', 'ł': 'l',
  '¨': '', '´': '', '`': '', '^': '', '~': '',
  '–': '-', '—': '-', '−': '-',
- '„': '"', '“': '"', '”': '"', '‘': "'", '’': "'",
+ '„': '"', '“': '"', '”': '"',
  '«': '"', '»': '"',
  '¡': '!', '¿': '?',
  '°': 'degrees', '©': '(c)', '®': '(r)', '™': '(tm)',
@@ -39,10 +67,6 @@ def normalize_text(text):
  '\u00A0': ' ', # Non-breaking space
  '\u2013': '-', # En dash
  '\u2014': '-', # Em dash
- '\u2018': "'", # Left single quotation mark
- '\u2019': "'", # Right single quotation mark
- '\u201C': '"', # Left double quotation mark
- '\u201D': '"', # Right double quotation mark
  '\u2026': '...', # Horizontal ellipsis
  '\u00B7': '.', # Middle dot
  '\u2022': '.', # Bullet
@@ -54,8 +78,8 @@ def normalize_text(text):
  # Remove any remaining diacritical marks
  text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
 
- # Remove special characters
- text = re.sub(r'[^\w\s]', '', text)
+ # Remove special characters except apostrophes
+ text = re.sub(r"[^\w\s']", '', text)
 
  # Normalize whitespace
  text = re.sub(r'\s+', ' ', text).strip()
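A short, self-contained sketch of the apostrophe normalization introduced above; the variant list mirrors the one in normalize_apostrophes (minus the ASCII apostrophe itself), written here with Unicode escapes so the characters are unambiguous:

    def normalize_apostrophes(text):
        # Map common apostrophe look-alikes to the ASCII apostrophe (U+0027).
        variants = ["\u2019", "\u2018", "\u02BC", "\u02C8", "\u0060", "\u00B4"]
        for variant in variants:
            text = text.replace(variant, "'")
        return text

    print(normalize_apostrophes("D\u2019Amato"))   # D'Amato
    print(normalize_apostrophes("O\u0060Neill"))   # O'Neill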
@@ -94,6 +118,15 @@ def parse_authors_with_initials(authors_text):
  # by converting to "Haotian Liu and Chunyuan Li and Qingyang Wu"
  authors_text = re.sub(r'\s+', ' ', authors_text.strip())
 
+ # Special case: Handle single author followed by "et al" (e.g., "Mubashara Akhtar et al.")
+ # This should be split into ["Mubashara Akhtar", "et al"]
+ single_et_al_match = re.match(r'^(.+?)\s+et\s+al\.?$', authors_text, re.IGNORECASE)
+ if single_et_al_match:
+ base_author = single_et_al_match.group(1).strip()
+ if base_author and not ' and ' in base_author and not ',' in base_author:
+ # This is a simple "FirstName LastName et al" case
+ return [base_author, 'et al']
+
  # Check if this is a semicolon-separated format (e.g., "Hashimoto, K.; Saoud, A.; Kishida, M.")
  if ';' in authors_text:
  # Split by semicolons and handle the last part which might have "and"
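The new "et al" special case reduces to a single regex; a quick check of how it splits a bare "FirstName LastName et al." string, using the example from the comment above:

    import re

    authors_text = "Mubashara Akhtar et al."
    m = re.match(r'^(.+?)\s+et\s+al\.?$', authors_text, re.IGNORECASE)
    if m:
        print([m.group(1).strip(), 'et al'])  # ['Mubashara Akhtar', 'et al']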
@@ -359,6 +392,9 @@ def clean_author_name(author):
  # Normalize Unicode characters (e.g., combining diacritics)
  author = unicodedata.normalize('NFKC', author)
 
+ # Normalize apostrophes first before other processing
+ author = normalize_apostrophes(author)
+
  # Handle common Unicode escape sequences and LaTeX encodings
  # Note: Order matters - process longer patterns first
  unicode_replacements = [
@@ -518,31 +554,19 @@ def clean_title(title):
 
  def extract_arxiv_id_from_url(url):
  """
- Extract ArXiv ID from URL
+ Extract ArXiv ID from URL or text containing ArXiv reference.
+
+ This function is deprecated. Use utils.url_utils.extract_arxiv_id_from_url instead.
+ Kept for backwards compatibility.
 
  Args:
- url: URL string
+ url: URL string or text containing arXiv reference
 
  Returns:
  ArXiv ID or None if not found
  """
- if not isinstance(url, str):
- return None
-
- # Common ArXiv URL patterns
- patterns = [
- r'arxiv\.org/abs/(\d+\.\d+(?:v\d+)?)',
- r'arxiv\.org/pdf/(\d+\.\d+(?:v\d+)?)',
- r'arxiv:(\d+\.\d+(?:v\d+)?)',
- r'arXiv:(\d+\.\d+(?:v\d+)?)'
- ]
-
- for pattern in patterns:
- match = re.search(pattern, url, re.IGNORECASE)
- if match:
- return match.group(1)
-
- return None
+ from utils.url_utils import extract_arxiv_id_from_url as common_extract
+ return common_extract(url)
 
  def extract_year_from_text(text):
  """
@@ -706,8 +730,12 @@ def normalize_diacritics(text: str) -> str:
  'José' -> 'jose'
  'Łukasz' -> 'lukasz'
  'J. Gl¨ uck' -> 'J. Gluck'
+ 'D’Amato' -> 'D'Amato' (apostrophes normalized)
  """
- # First handle special characters that don't decompose properly
+ # First normalize apostrophes
+ text = normalize_apostrophes(text)
+
+ # Then handle special characters that don't decompose properly
  # Including common transliterations
  special_chars = {
  'ł': 'l', 'Ł': 'L',
@@ -847,6 +875,10 @@ def is_name_match(name1: str, name2: str) -> bool:
  name1_primary = normalize_diacritics(name1.strip().lower())
  name2_primary = normalize_diacritics(name2.strip().lower())
 
+ # Remove trailing periods that are not part of initials (e.g., "J. L. D'Amato." -> "J. L. D'Amato")
+ name1_primary = re.sub(r'\.+$', '', name1_primary)
+ name2_primary = re.sub(r'\.+$', '', name2_primary)
+
  # Handle spacing variations around periods: "F.Last" vs "F. Last"
  name1_normalized = re.sub(r'\.([A-Za-z])', r'. \1', name1_primary)
  name2_normalized = re.sub(r'\.([A-Za-z])', r'. \1', name2_primary)
@@ -859,6 +891,10 @@ def is_name_match(name1: str, name2: str) -> bool:
  name1_alt = normalize_diacritics_simple(name1.strip().lower())
  name2_alt = normalize_diacritics_simple(name2.strip().lower())
 
+ # Remove trailing periods for alternative normalization too
+ name1_alt = re.sub(r'\.+$', '', name1_alt)
+ name2_alt = re.sub(r'\.+$', '', name2_alt)
+
  name1_alt_norm = re.sub(r'\.([A-Za-z])', r'. \1', name1_alt)
  name2_alt_norm = re.sub(r'\.([A-Za-z])', r'. \1', name2_alt)
 
@@ -2219,7 +2255,8 @@ def format_author_for_display(author_name):
  if not author_name:
  return author_name
 
- author_name = author_name.strip()
+ # Normalize apostrophes for consistent display
+ author_name = normalize_apostrophes(author_name.strip())
 
  # Check if it's in "Lastname, Firstname" format
  if ',' in author_name:
@@ -2866,8 +2903,9 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
  author_part_clean = strip_latex_commands(author_part).strip()
 
  # Simple fix: just improve the organization detection without complex parsing
- # Remove year pattern first
+ # Remove year pattern first - handle both parenthetical and standalone years
  author_text_clean = re.sub(r'\s*\(\d{4}\)\.?$', '', author_part_clean).strip()
+ author_text_clean = re.sub(r'\s+\d{4}\.?$', '', author_text_clean).strip()
 
  # Better organization detection - check if it looks like multiple authors
  is_multi_author = (
@@ -2889,24 +2927,41 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
  author = re.sub(r'^and\s+', '', author.strip())
  # Remove trailing periods that shouldn't be there
  author = clean_author_name(author)
- # Skip all "et al" variants for LaTeX bibliographies
- if author.lower() not in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
+ # Preserve "et al" variants to enable proper author count handling
+ if author.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
+ cleaned_authors.append('et al') # Normalize to standard form
+ else:
  cleaned_authors.append(author)
  if cleaned_authors:
  ref['authors'] = cleaned_authors
  else:
- # Fallback: simple comma split
+ # Fallback: try once more with semicolon handling, then simple comma split
  simple_authors = []
- for a in author_text_clean.split(','):
- a = a.strip()
- # Remove "and" prefix and skip short/empty entries
- a = re.sub(r'^and\s+', '', a)
- # Clean author name (remove unnecessary periods)
- a = clean_author_name(a)
- if a and len(a) > 2:
- # Skip all "et al" variants for LaTeX bibliographies
- if a.lower() not in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
- simple_authors.append(a)
+ try:
+ # Try parsing again with normalized separators
+ normalized_text = re.sub(r';\s*and\s+', ', ', author_text_clean)
+ fallback_authors = parse_authors_with_initials(normalized_text)
+ if fallback_authors and len(fallback_authors) >= 2:
+ simple_authors = fallback_authors
+ else:
+ raise ValueError("Fallback parsing failed")
+ except:
+ # Last resort: naive comma split
+ for a in author_text_clean.split(','):
+ a = a.strip()
+ # Remove "and" prefix and skip short/empty entries
+ a = re.sub(r'^and\s+', '', a)
+ # Clean author name (remove unnecessary periods)
+ a = clean_author_name(a)
+ if a and len(a) > 2:
+ # Preserve "et al" variants to enable proper author count handling
+ if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ simple_authors.append('et al') # Normalize to standard form
+ else:
+ simple_authors.append(a)
+ elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ simple_authors.append('et al') # Handle short "et al" variants
+
  if simple_authors:
  ref['authors'] = simple_authors
  except Exception:
@@ -2919,9 +2974,13 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
  # Clean author name (remove unnecessary periods)
  a = clean_author_name(a)
  if a and len(a) > 2:
- # Skip all "et al" variants for LaTeX bibliographies
- if a.lower() not in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ # Preserve "et al" variants to enable proper author count handling
+ if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ simple_authors.append('et al') # Normalize to standard form
+ else:
  simple_authors.append(a)
+ elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+ simple_authors.append('et al') # Handle short "et al" variants
  if simple_authors:
  ref['authors'] = simple_authors
  else:
@@ -3716,6 +3775,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
  for abbrev in sorted(multi_word_abbrevs.keys(), key=len, reverse=True):
  if abbrev in expanded_text:
  expanded_text = expanded_text.replace(abbrev, multi_word_abbrevs[abbrev])
+ break # Only apply the first (longest) matching abbreviation to avoid conflicts
 
  # Second pass: handle single word abbreviations
  words = expanded_text.split()
@@ -4110,8 +4170,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
  return False
 
  # Order-aware fuzzy matching - words should match in sequence
- words1_list = list(words1)
- words2_list = list(words2)
+ # Sort to ensure deterministic order (set iteration is not guaranteed to be consistent)
+ words1_list = sorted(list(words1))
+ words2_list = sorted(list(words2))
 
  # If word counts are very different, they're likely different venues
  if len(words1) > 0 and len(words2) > 0:
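The sorted() change above is only about determinism; a two-line illustration with an invented word set:

    words = {"proc", "intl", "conf", "learning"}
    print(sorted(words))  # ['conf', 'intl', 'learning', 'proc'] -- stable across runs, unlike raw set iteration order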
utils/url_utils.py CHANGED
@@ -33,26 +33,43 @@ def construct_doi_url(doi: str) -> str:
 
  def extract_arxiv_id_from_url(url: str) -> Optional[str]:
  """
- Extract ArXiv ID from an ArXiv URL.
+ Extract ArXiv ID from an ArXiv URL or text containing ArXiv reference.
+
+ This is the common function that handles all ArXiv ID extraction patterns:
+ - URLs: https://arxiv.org/abs/1234.5678, https://arxiv.org/pdf/1234.5678.pdf, https://arxiv.org/html/1234.5678
+ - Text references: arXiv:1234.5678, arXiv preprint arXiv:1234.5678
+ - Version handling: removes version numbers (v1, v2, etc.)
 
  Args:
- url: ArXiv URL (abs or pdf)
+ url: ArXiv URL or text containing ArXiv reference
 
  Returns:
- ArXiv ID if found, None otherwise
+ ArXiv ID (without version) if found, None otherwise
  """
- if not url:
+ if not url or not isinstance(url, str):
  return None
 
- # Use the more comprehensive regex from text_utils.py
- arxiv_match = re.search(r'arxiv\.org/(?:abs|pdf)/([^\s/?#]+?)(?:\.pdf|v\d+)?(?:[?\#]|$)', url)
- if arxiv_match:
- return arxiv_match.group(1)
-
- # Fallback to simpler regex for edge cases
- fallback_match = re.search(r'arxiv\.org/(?:abs|pdf)/([^/?#]+)', url)
+ # Pattern 1: arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
+ arxiv_text_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
+ if arxiv_text_match:
+ arxiv_id = arxiv_text_match.group(1)
+ # Remove version number if present
+ return re.sub(r'v\d+$', '', arxiv_id)
+
+ # Pattern 2: arxiv.org URLs (abs, pdf, html)
+ # Handle URLs with version numbers and various formats
+ arxiv_url_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^\s/?#]+?)(?:\.pdf|v\d+)?(?:[?\#]|$)', url, re.IGNORECASE)
+ if arxiv_url_match:
+ arxiv_id = arxiv_url_match.group(1)
+ # Remove version number if present
+ return re.sub(r'v\d+$', '', arxiv_id)
+
+ # Pattern 3: Fallback for simpler URL patterns
+ fallback_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^/?#]+)', url, re.IGNORECASE)
  if fallback_match:
- return fallback_match.group(1).replace('.pdf', '')
+ arxiv_id = fallback_match.group(1).replace('.pdf', '')
+ # Remove version number if present
+ return re.sub(r'v\d+$', '', arxiv_id)
 
  return None
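A condensed, self-contained restatement of the three patterns above, useful for sanity-checking expected outputs; it mirrors the logic but is not the module itself, and the example inputs are made up:

    import re

    def extract_arxiv_id(text):
        # Pattern 1: textual "arXiv:ID" references.
        m = re.search(r'arXiv:(\d{4}\.\d{4,5})', text, re.IGNORECASE)
        if not m:
            # Patterns 2/3: arxiv.org abs/pdf/html URLs, with optional .pdf or version suffix.
            m = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^\s/?#]+?)(?:\.pdf|v\d+)?(?:[?#]|$)', text, re.IGNORECASE)
        return re.sub(r'v\d+$', '', m.group(1)) if m else None

    print(extract_arxiv_id("arXiv preprint arXiv:1610.10099v2"))       # 1610.10099
    print(extract_arxiv_id("https://arxiv.org/pdf/2303.08774v3.pdf"))  # 2303.08774
    print(extract_arxiv_id("https://example.com/not-arxiv"))           # None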