academic-refchecker 1.2.37__py3-none-any.whl → 1.2.39__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __version__.py +1 -1
- {academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/METADATA +1 -1
- {academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/RECORD +13 -12
- core/refchecker.py +21 -99
- utils/arxiv_utils.py +76 -0
- utils/biblatex_parser.py +30 -17
- utils/bibliography_utils.py +332 -0
- utils/text_utils.py +107 -46
- utils/url_utils.py +29 -12
- {academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/top_level.txt +0 -0
{academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/RECORD
CHANGED
@@ -1,5 +1,5 @@
-__version__.py,sha256=
-academic_refchecker-1.2.
+__version__.py,sha256=63hU3Q1fGBiJ1GUnUQ-V6-S8pbWZ7bug_ZVu4V6eo9g,65
+academic_refchecker-1.2.39.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
 checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
 checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
 checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -15,7 +15,7 @@ config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
 core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
 core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
 core/parallel_processor.py,sha256=2S1cAPhtWH3glvtiJrt9JkZzk2iJkPKXsc-F3lg0X6U,16795
-core/refchecker.py,sha256=
+core/refchecker.py,sha256=8EatAqYEDpW219Xrn-ql1oQ5ytmCU8RW8pMtlujRbC8,273167
 database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
 database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
 llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -26,20 +26,21 @@ scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,
 services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
 services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
 utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
-utils/arxiv_utils.py,sha256=
+utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
 utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
-utils/biblatex_parser.py,sha256=
+utils/biblatex_parser.py,sha256=JiO_tznsemhmGFs-pDM2qGuDlvT1ArIyc6bmsdwDOPQ,20452
+utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
 utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
 utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
 utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
 utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
 utils/error_utils.py,sha256=2qdRM3Bv4GvE3mlXgXp9jiQBfvB08qeg8vTgNVivcgk,5706
 utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
-utils/text_utils.py,sha256=
+utils/text_utils.py,sha256=8luQsOBfcEBv3O16d3LlQmCuoEB0dEF0aQWGey-s3us,190502
 utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
-utils/url_utils.py,sha256=
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
-academic_refchecker-1.2.
+utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
+academic_refchecker-1.2.39.dist-info/METADATA,sha256=Uz4a9D0tfull6uDAZTafQJOem7p8IqPA6bjl_pYUf48,22298
+academic_refchecker-1.2.39.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+academic_refchecker-1.2.39.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+academic_refchecker-1.2.39.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+academic_refchecker-1.2.39.dist-info/RECORD,,
core/refchecker.py
CHANGED
@@ -451,47 +451,10 @@ class ArxivReferenceChecker:
 
     def extract_arxiv_id_from_url(self, url):
         """
-        Extract ArXiv ID from a URL or text containing ArXiv reference
+        Extract ArXiv ID from a URL or text containing ArXiv reference.
+        Uses the common extraction function from utils.url_utils.
         """
-
-            return None
-
-        # First, check for arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
-        arxiv_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
-        if arxiv_match:
-            arxiv_id = arxiv_match.group(1)
-            # Remove version number if present
-            arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
-            return arxiv_id
-
-        # Remove version string from end if present (e.g., 'v1')
-        url = re.sub(r'v\d+$', '', url)
-
-        # Parse URL
-        parsed_url = urlparse(url)
-
-        # Check if it's an arxiv.org URL
-        if 'arxiv.org' in parsed_url.netloc:
-            # Extract ID from path
-            path = parsed_url.path.strip('/')
-
-            # Handle different URL formats
-            if path.startswith('abs/'):
-                arxiv_id = path.replace('abs/', '')
-            elif path.startswith('pdf/'):
-                arxiv_id = path.replace('pdf/', '').replace('.pdf', '')
-            elif '/abs/' in path:
-                arxiv_id = path.split('/abs/')[1]
-            elif '/pdf/' in path:
-                arxiv_id = path.split('/pdf/')[1].replace('.pdf', '')
-            else:
-                arxiv_id = path
-
-            # Remove version number from the extracted ID
-            arxiv_id = re.sub(r'v\d+$', '', arxiv_id)
-            return arxiv_id
-
-        return None
+        return extract_arxiv_id_from_url(url)
 
     def get_paper_metadata(self, arxiv_id):
         """
@@ -3581,11 +3544,9 @@ class ArxivReferenceChecker:
             # Clean author part and extract authors
             author_part_clean = strip_latex_commands(author_part).strip()
             if author_part_clean and not author_part_clean.startswith('\\'):
-                # Parse author names
-
-
-                else:
-                    author_names = [name.strip() for name in author_part_clean.split(',')]
+                # Parse author names using the robust author parsing function
+                from utils.text_utils import parse_authors_with_initials
+                author_names = parse_authors_with_initials(author_part_clean)
 
                 # Clean up author names
                 authors = []
@@ -4264,8 +4225,17 @@ class ArxivReferenceChecker:
            return True
 
        # Also check if authors have significant overlap (at least 50% of the shorter author list)
-
-
+       from utils.text_utils import parse_authors_with_initials
+
+       if '*' in seg1['author']:
+           author1_parts = seg1['author'].split('*')
+       else:
+           author1_parts = parse_authors_with_initials(seg1['author'])
+
+       if '*' in seg2['author']:
+           author2_parts = seg2['author'].split('*')
+       else:
+           author2_parts = parse_authors_with_initials(seg2['author'])
 
        # Clean and normalize author names
        author1_clean = {a.strip().lower() for a in author1_parts if a.strip() and a.strip() not in ['et al', 'others']}
@@ -4780,55 +4750,6 @@ class ArxivReferenceChecker:
        }
 
 
-    def _get_bibtex_content(self, paper):
-        """
-        Try to get BibTeX content for a paper from various sources.
-
-        Args:
-            paper: Paper object
-
-        Returns:
-            str: BibTeX content if found, None otherwise
-        """
-        # Try ArXiv source if it's an ArXiv paper
-        from utils.arxiv_utils import extract_arxiv_id_from_paper, download_arxiv_source
-
-        arxiv_id = extract_arxiv_id_from_paper(paper)
-        if arxiv_id:
-            logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
-            tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
-
-            # Prefer .bib files (most structured), then .bbl files
-            if bib_content:
-                logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
-
-                # If we have LaTeX content, filter BibTeX by cited keys
-                if tex_content:
-                    from utils.text_utils import extract_cited_keys_from_latex, filter_bibtex_by_cited_keys
-                    cited_keys = extract_cited_keys_from_latex(tex_content)
-                    if cited_keys:
-                        logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
-                        filtered_content = filter_bibtex_by_cited_keys(bib_content, cited_keys)
-                        return filtered_content
-
-                return bib_content
-
-            elif bbl_content:
-                logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
-                return bbl_content
-
-            elif tex_content:
-                # Check for embedded bibliography in LaTeX
-                from utils.text_utils import detect_latex_bibliography_format
-                latex_format = detect_latex_bibliography_format(tex_content)
-                if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
-                    logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
-                    # Skip embedded bibliography and return None to trigger fallback methods
-                    return None
-
-        # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
-
-        return None
 
 
     def extract_bibliography(self, paper, debug_mode=False):
@@ -4843,7 +4764,8 @@ class ArxivReferenceChecker:
        logger.debug(f"Extracting bibliography for paper {paper_id}: {paper.title}")
 
        # Check if we can get BibTeX content for this paper (ArXiv or other sources)
-
+       from utils.arxiv_utils import get_bibtex_content
+       bibtex_content = get_bibtex_content(paper)
        if bibtex_content:
            logger.debug(f"Found BibTeX content for {paper_id}, using structured bibliography")
 
@@ -4897,7 +4819,7 @@ class ArxivReferenceChecker:
                    else:
                        logger.warning("No LLM available for fallback, using original parsing results")
                else:
-                    logger.
+                    logger.debug(f"LaTeX parsing validation passed (quality: {validation['quality_score']:.2f})")
            else:
                # Parse BibTeX using the standard flow (LLM or regex based on config)
                references = self.parse_references(bibtex_content)
@@ -5458,7 +5380,7 @@ class ArxivReferenceChecker:
                error_details = unverified_errors[0].get('error_details', '')
                if error_details:
                    subreason = self._categorize_unverified_reason(error_details)
-                    print(f"
+                    print(f" Subreason: {subreason}")
 
            year_str = self._format_year_string(reference.get('year'))
 
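The author-overlap hunk above splits each segment's author string on `*` when present and otherwise defers to `parse_authors_with_initials`. The following is a minimal, illustrative sketch (not taken from the package): `split_authors` and `authors_overlap` are hypothetical stand-ins, the naive comma/"and" splitting only approximates `utils.text_utils.parse_authors_with_initials`, and the 50%-of-the-shorter-list rule is an assumed reading of the comment in the diff.

```python
# Hypothetical sketch of the author-overlap check; not the package's implementation.
def split_authors(author_field: str) -> list[str]:
    # Real code calls utils.text_utils.parse_authors_with_initials when no '*' is present.
    if '*' in author_field:
        return author_field.split('*')
    parts = author_field.replace(' and ', ',').split(',')
    return [p.strip() for p in parts if p.strip()]

def authors_overlap(seg1: dict, seg2: dict) -> bool:
    author1_parts = split_authors(seg1['author'])
    author2_parts = split_authors(seg2['author'])
    # Clean and normalize, dropping "et al"/"others" placeholders, as in the diff.
    a1 = {a.strip().lower() for a in author1_parts if a.strip() and a.strip() not in ['et al', 'others']}
    a2 = {a.strip().lower() for a in author2_parts if a.strip() and a.strip() not in ['et al', 'others']}
    if not a1 or not a2:
        return False
    # "Significant overlap": at least 50% of the shorter author list (assumed interpretation).
    return len(a1 & a2) >= 0.5 * min(len(a1), len(a2))

print(authors_overlap({'author': 'J. Smith*A. Jones*B. Lee'},
                      {'author': 'A. Jones and B. Lee'}))  # True under this sketch
```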
utils/arxiv_utils.py
CHANGED
@@ -374,3 +374,79 @@ def reconstruct_bibtex_content(cited_entries, original_content):
     return '\n\n'.join(filtered_parts) + '\n'
 
 
+def get_bibtex_content(paper):
+    """
+    Try to get BibTeX content for a paper from various sources.
+
+    Args:
+        paper: Paper object
+
+    Returns:
+        str: BibTeX content if found, None otherwise
+    """
+    import re
+
+    # Try ArXiv source if it's an ArXiv paper
+    arxiv_id = extract_arxiv_id_from_paper(paper)
+    if arxiv_id:
+        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for structured bibliography")
+        tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
+
+        # Choose between .bib and .bbl files based on content richness
+        # Prioritize .bbl if it has more references than filtered .bib, otherwise prefer .bib
+        if bib_content and bbl_content:
+            # Count entries in both
+            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+            bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))
+
+            # If we have LaTeX content, get filtered BibTeX count
+            filtered_bib_count = bib_entry_count
+            filtered_content = bib_content
+            if tex_content:
+                cited_keys = extract_cited_keys_from_tex({}, tex_content)
+                if cited_keys:
+                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+                    filtered_bib_count = len(re.findall(r'@\w+\s*\{', filtered_content))
+
+            logger.debug(f"Bibliography comparison: .bbl has {bbl_entry_count} entries, filtered .bib has {filtered_bib_count} entries")
+
+            # Prioritize .bbl if it has significantly more entries
+            if bbl_entry_count > filtered_bib_count * 1.5:  # 50% more entries threshold
+                logger.info(f"Using .bbl files from ArXiv source")
+                return bbl_content
+            else:
+                logger.info(f"Using filtered .bib files")
+                return filtered_content
+
+        elif bib_content:
+            logger.info(f"Found .bib files in ArXiv source for {arxiv_id}")
+
+            # If we have LaTeX content, filter BibTeX by cited keys
+            if tex_content:
+                cited_keys = extract_cited_keys_from_tex({}, tex_content)
+                if cited_keys:
+                    logger.debug(f"Found {len(cited_keys)} cited keys, filtering BibTeX")
+                    filtered_content = filter_bibtex_by_citations(bib_content, {}, tex_content)
+                    return filtered_content
+
+            return bib_content
+
+        elif bbl_content:
+            logger.info(f"Found .bbl files in ArXiv source for {arxiv_id}")
+            return bbl_content
+
+        elif tex_content:
+            # Check for embedded bibliography in LaTeX
+            from utils.text_utils import detect_latex_bibliography_format
+            latex_format = detect_latex_bibliography_format(tex_content)
+            if latex_format['is_latex'] and ('\\bibitem' in tex_content or '@' in tex_content):
+                logger.info(f"Found embedded bibliography in ArXiv LaTeX source, but skipping due to formatting incompatibility")
+                # Skip embedded bibliography and return None to trigger fallback methods
+                return None
+
+    # Could add other BibTeX sources here (e.g., direct BibTeX URLs, etc.)
+
+    return None
+
+
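The core of the new selection logic in `get_bibtex_content` is the entry-count comparison between the `.bbl` and the citation-filtered `.bib`. Below is a minimal self-contained sketch of just that heuristic; `choose_bibliography` is a hypothetical helper (the package performs this inline, with the additional citation filtering shown in the diff), and the sample strings are made up.

```python
# Minimal sketch of the .bib-vs-.bbl heuristic: prefer the .bbl only when it has
# at least 50% more entries than the (filtered) .bib.
import re

def choose_bibliography(bib_content: str, bbl_content: str) -> str:
    bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))      # BibTeX entries
    bbl_entry_count = len(re.findall(r'\\bibitem\[', bbl_content))    # .bbl entries
    if bbl_entry_count > bib_entry_count * 1.5:  # 50% more entries threshold
        return 'bbl'
    return 'bib'

bib = "@article{a,}\n@inproceedings{b,}"
bbl = "\\bibitem[A]{a} ...\n\\bibitem[B]{b} ...\n\\bibitem[C]{c} ...\n\\bibitem[D]{d} ..."
print(choose_bibliography(bib, bbl))  # 'bbl' (4 entries > 2 * 1.5)
```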
utils/biblatex_parser.py
CHANGED
@@ -261,11 +261,13 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
     else:
         # If no quoted title, look for title after author names
         # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
+        # Order matters: more specific patterns first
         title_patterns = [
-            r'[A-Z][
-            r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})',  # "Authors. Title. URL/arXiv/Year" (flexible spacing)
-            r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}',  # "Name, Name. Title. Year"
+            r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})',  # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
             r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}',  # ".Title. Year" - for cases where authors end without space
+            r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}',  # "Name.Title. Year" - missing space after period
+            r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}',  # "Author et al. Title. Year" - LESS SPECIFIC
+            r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}',  # "Name, Name. Title. Year"
             r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https',  # "Title . https" - handle space before period
         ]
 
@@ -391,10 +393,10 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
     # Fallback: split by common patterns if parse_authors_with_initials failed
     if not authors:
         if 'et al' in authors_text.lower():
-            # Handle "FirstAuthor et al." case
+            # Handle "FirstAuthor et al." case - separate base author from "et al"
             base_author = authors_text.split(' et al')[0].strip()
             if base_author:
-                authors = [base_author
+                authors = [base_author, 'et al']
         elif ' and ' in authors_text:
             # Handle "Author1 and Author2 and Author3" format
             author_parts = [p.strip() for p in authors_text.split(' and ')]
@@ -404,18 +406,29 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
                 if part and len(part) > 2:
                     authors.append(part)
         else:
-            # Try
-
-
-
-
-
-
-
-
-
-
-
+            # Try sophisticated parsing one more time with relaxed constraints
+            try:
+                # Remove "and" connectors for cleaner parsing
+                clean_text = re.sub(r'\s+and\s+', ', ', authors_text)
+                fallback_authors = parse_authors_with_initials(clean_text)
+                if fallback_authors and len(fallback_authors) >= 1:
+                    authors = fallback_authors
+                else:
+                    raise ValueError("Fallback parsing failed")
+            except:
+                # Last resort: naive comma separation for "Author1, Author2, Author3"
+                # This should rarely be reached now
+                author_parts = [p.strip() for p in authors_text.split(',')]
+                authors = []
+                for part in author_parts:
+                    part = part.strip(' .')
+                    # Remove "and" prefix if present
+                    if part.startswith('and '):
+                        part = part[4:].strip()
+                    # Skip parts that are too short or look like initials only
+                    if (part and len(part) > 2 and
+                        not re.search(r'\b(http|www|doi|arxiv|proceedings)\b', part.lower())):
+                        authors.append(part)
 
     # 7. Extract journal/venue - look for patterns like "In: Conference" or remaining text
     journal_patterns = [
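The reordered `title_patterns` list works because patterns are tried in sequence and the first match wins, so the specific "Authors. Title. URL/arXiv/Year" pattern must run before the looser "Author et al. Title. Year" one. A rough, self-contained illustration (not part of the package; the two regexes are copied from the diff, the loop and sample string are mine):

```python
import re

title_patterns = [
    r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})',  # most specific
    r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}',      # less specific
]

ref = "Vaswani et al. Attention Is All You Need. arXiv:1706.03762"
for pattern in title_patterns:
    m = re.search(pattern, ref)
    if m:
        print(m.group(1))  # "Attention Is All You Need"
        break
```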
utils/bibliography_utils.py
ADDED
@@ -0,0 +1,332 @@
+#!/usr/bin/env python3
+"""
+Bibliography extraction and parsing utilities.
+
+This module provides utilities for extracting and parsing bibliographies from
+academic papers in various formats (LaTeX, BibTeX, PDF text, etc.).
+"""
+
+import re
+import logging
+import os
+
+logger = logging.getLogger(__name__)
+
+
+def extract_text_from_latex(latex_file_path):
+    """
+    Extract text from a LaTeX file
+
+    Args:
+        latex_file_path: Path to the LaTeX file
+
+    Returns:
+        String containing the LaTeX file content
+    """
+    try:
+        logger.info(f"Reading LaTeX file: {latex_file_path}")
+        with open(latex_file_path, 'r', encoding='utf-8') as f:
+            content = f.read()
+        logger.info(f"Successfully read LaTeX file with {len(content)} characters")
+        return content
+    except UnicodeDecodeError:
+        # Try with latin-1 encoding if utf-8 fails
+        try:
+            logger.warning(f"UTF-8 encoding failed for {latex_file_path}, trying latin-1")
+            with open(latex_file_path, 'r', encoding='latin-1') as f:
+                content = f.read()
+            logger.info(f"Successfully read LaTeX file with latin-1 encoding")
+            return content
+        except Exception as e:
+            logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
+            return None
+    except Exception as e:
+        logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
+        return None
+
+
+def find_bibliography_section(text):
+    """
+    Find the bibliography section in the text
+    """
+    if not text:
+        logger.warning("No text provided to find_bibliography_section")
+        return None
+
+    # Log a sample of the text for debugging
+    text_sample = text[:500] + "..." if len(text) > 500 else text
+    logger.debug(f"Text sample: {text_sample}")
+
+    # Common section titles for bibliography
+    section_patterns = [
+        # Patterns for numbered sections with potential spacing issues from PDF extraction
+        r'(?i)\d+\s*ref\s*er\s*ences\s*\n',  # "12 Refer ences" with spaces
+        r'(?i)\d+\s*references\s*\n',  # "12References" or "12 References"
+        r'(?i)^\s*\d+\.\s*references\s*$',  # Numbered section: "7. References"
+        r'(?i)\d+\s+references\s*\.',  # "9 References." format used in Georgia Tech paper
+        # Standard reference patterns
+        r'(?i)references\s*\n',
+        r'(?i)bibliography\s*\n',
+        r'(?i)works cited\s*\n',
+        r'(?i)literature cited\s*\n',
+        r'(?i)references\s*$',  # End of document
+        r'(?i)\[\s*references\s*\]',  # [References]
+        r'(?i)^\s*references\s*$',  # References as a standalone line
+        r'(?i)^\s*bibliography\s*$',  # Bibliography as a standalone line
+        r'(?i)references\s*and\s*citations',  # References and Citations
+        r'(?i)cited\s*references',  # Cited References
+        r'(?i)reference\s*list',  # Reference List
+    ]
+
+    bibliography_start = None
+    matched_pattern = None
+
+    for pattern in section_patterns:
+        matches = re.search(pattern, text, re.MULTILINE)
+        if matches:
+            bibliography_start = matches.end()
+            matched_pattern = pattern
+            logger.debug(f"Bibliography section found using pattern: {pattern}")
+            break
+
+    if bibliography_start is None:
+        logger.debug("No bibliography section header found, trying end-of-document approach")
+        # Try to find bibliography at the end of the document without explicit headers
+        lines = text.split('\n')
+        for i in range(len(lines) - 1, max(0, len(lines) - 100), -1):  # Check last 100 lines
+            line = lines[i].strip()
+            if re.match(r'^\[\d+\]', line) or re.match(r'^\d+\.', line):
+                # Found what looks like reference entries
+                bibliography_start = text.rfind('\n'.join(lines[i:]))
+                logger.debug(f"Bibliography section found at end of document starting with: {line[:50]}")
+                break
+
+    if bibliography_start is not None:
+        bibliography_text = text[bibliography_start:].strip()
+        logger.debug(f"Bibliography text length: {len(bibliography_text)}")
+
+        # Optional: Try to find the end of the bibliography section
+        # This is challenging because it might go to the end of the document
+        # or be followed by appendices, acknowledgments, etc.
+
+        return bibliography_text
+
+    logger.debug("Bibliography section not found")
+    return None
+
+
+def parse_references(bibliography_text):
+    """
+    Parse references from bibliography text using multiple parsing strategies.
+
+    Args:
+        bibliography_text: String containing bibliography content
+
+    Returns:
+        List of parsed reference dictionaries
+    """
+    if not bibliography_text:
+        logger.warning("No bibliography text provided to parse_references")
+        return []
+
+    # Try different parsing strategies in order of preference
+    parsing_strategies = [
+        ('BibTeX', _parse_bibtex_references),
+        ('biblatex', _parse_biblatex_references),
+        ('ACM/natbib', _parse_standard_acm_natbib_references),
+        ('regex-based', _parse_references_regex)
+    ]
+
+    for strategy_name, parse_func in parsing_strategies:
+        try:
+            logger.debug(f"Attempting {strategy_name} parsing")
+            references = parse_func(bibliography_text)
+            if references and len(references) > 0:
+                logger.info(f"Successfully parsed {len(references)} references using {strategy_name} format")
+                return references
+            else:
+                logger.debug(f"{strategy_name} parsing returned no references")
+        except Exception as e:
+            logger.debug(f"{strategy_name} parsing failed: {e}")
+            continue
+
+    logger.warning("All parsing strategies failed to extract references")
+    return []
+
+
+def _parse_bibtex_references(bibliography_text):
+    """
+    Parse BibTeX formatted references like @inproceedings{...}, @article{...}, etc.
+
+    Args:
+        bibliography_text: String containing BibTeX entries
+
+    Returns:
+        List of reference dictionaries
+    """
+    from utils.bibtex_parser import parse_bibtex_entries
+    return parse_bibtex_entries(bibliography_text)
+
+
+def _parse_biblatex_references(bibliography_text):
+    """
+    Parse biblatex formatted references like [1] Author. "Title". In: Venue. Year.
+
+    Args:
+        bibliography_text: String containing biblatex .bbl entries
+
+    Returns:
+        List of reference dictionaries
+    """
+    from utils.text_utils import extract_latex_references
+    return extract_latex_references(bibliography_text)
+
+
+def _parse_standard_acm_natbib_references(bibliography_text):
+    """
+    Parse references using regex for standard ACM/natbib format (both ACM Reference Format and simple natbib)
+    """
+    from utils.text_utils import detect_standard_acm_natbib_format
+
+    references = []
+
+    # Check if this is standard ACM natbib format
+    format_info = detect_standard_acm_natbib_format(bibliography_text)
+    if format_info['is_acm_natbib']:
+        logger.debug("Detected standard ACM natbib format")
+
+        # Split by reference entries
+        ref_pattern = r'\[(\d+)\]\s*'
+        entries = re.split(ref_pattern, bibliography_text)[1:]  # Skip first empty element
+
+        for i in range(0, len(entries), 2):
+            if i + 1 < len(entries):
+                ref_num = entries[i]
+                ref_content = entries[i + 1].strip()
+
+                try:
+                    reference = _parse_simple_natbib_format(int(ref_num), ref_content, f"[{ref_num}]")
+                    if reference:
+                        references.append(reference)
+                        logger.debug(f"Parsed reference {ref_num}: {reference.get('title', 'No title')[:50]}...")
+                except Exception as e:
+                    logger.debug(f"Error parsing reference {ref_num}: {e}")
+                    continue
+
+        logger.debug(f"ACM natbib parsing extracted {len(references)} references")
+
+    return references
+
+
+def _parse_simple_natbib_format(ref_num, content, label):
+    """
+    Parse a simple natbib format reference entry.
+
+    Args:
+        ref_num: Reference number
+        content: Reference content text
+        label: Reference label (e.g., "[1]")
+
+    Returns:
+        Dictionary containing parsed reference information
+    """
+    from utils.text_utils import extract_url_from_reference, extract_year_from_reference
+
+    # Basic parsing - this could be enhanced with more sophisticated NLP
+    reference = {
+        'raw_text': content,
+        'label': label,
+        'type': 'unknown'
+    }
+
+    # Try to extract basic information
+    # This is a simplified parser - real parsing would be much more complex
+
+    # Look for URL
+    url = extract_url_from_reference(content)
+    if url:
+        reference['url'] = url
+
+    # Look for year
+    year = extract_year_from_reference(content)
+    if year:
+        reference['year'] = year
+
+    # Try to identify the type based on content
+    content_lower = content.lower()
+    if 'proceedings' in content_lower or 'conference' in content_lower:
+        reference['type'] = 'inproceedings'
+    elif 'journal' in content_lower or 'trans.' in content_lower:
+        reference['type'] = 'article'
+    elif 'arxiv' in content_lower:
+        reference['type'] = 'misc'
+        reference['note'] = 'arXiv preprint'
+
+    return reference
+
+
+def _parse_references_regex(bibliography_text):
+    """
+    Parse references using regex-based approach (original implementation)
+    """
+    references = []
+
+    # Split bibliography into individual references
+    # Look for patterns like [1], [2], etc.
+    ref_pattern = r'\[(\d+)\](.*?)(?=\[\d+\]|$)'
+    matches = re.findall(ref_pattern, bibliography_text, re.DOTALL)
+
+    for ref_num, ref_content in matches:
+        ref_content = ref_content.strip()
+        if not ref_content:
+            continue
+
+        reference = {
+            'raw_text': ref_content,
+            'label': f"[{ref_num}]",
+            'type': 'unknown'
+        }
+
+        # Basic information extraction
+        from utils.text_utils import extract_url_from_reference, extract_year_from_reference
+
+        url = extract_url_from_reference(ref_content)
+        if url:
+            reference['url'] = url
+
+        year = extract_year_from_reference(ref_content)
+        if year:
+            reference['year'] = year
+
+        references.append(reference)
+
+    return references
+
+
+def _is_bibtex_surname_given_format(surname_part, given_part):
+    """
+    Check if this appears to be a BibTeX "Surname, Given" format.
+
+    Args:
+        surname_part: The part before the comma
+        given_part: The part after the comma
+
+    Returns:
+        Boolean indicating if this looks like BibTeX name format
+    """
+    # Simple heuristics to detect BibTeX format
+    if not surname_part or not given_part:
+        return False
+
+    # Check if surname looks like a surname (capitalized, not too long)
+    if not re.match(r'^[A-Z][a-zA-Z\s\-\']+$', surname_part.strip()):
+        return False
+
+    # Check if given part looks like given names (often abbreviated)
+    given_clean = given_part.strip()
+    if re.match(r'^[A-Z](\.\s*[A-Z]\.?)*$', given_clean):  # Like "J. R." or "M. K."
+        return True
+    if re.match(r'^[A-Z][a-z]+(\s+[A-Z][a-z]*)*$', given_clean):  # Like "John Robert"
+        return True
+
+    return False
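The new module's entry point, `parse_references`, tries BibTeX, biblatex, ACM/natbib and finally a plain regex parser, returning the first non-empty result. A hedged usage sketch follows: it assumes the installed package is importable as `utils.*` (as in the wheel's RECORD), the bibliography text is invented, and which strategy ultimately fires depends on the text.

```python
from utils.bibliography_utils import find_bibliography_section, parse_references

paper_text = """
... body of the paper ...

References
[1] A. Author and B. Writer. A Useful Paper. 2021. https://example.org/paper
[2] C. Coder. Another Result. In Proceedings of Something, 2020.
"""

bib_text = find_bibliography_section(paper_text)   # matches the "References" header
if bib_text:
    refs = parse_references(bib_text)
    for ref in refs:
        print(ref.get('label'), ref.get('year'), ref.get('url'))
```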
utils/text_utils.py
CHANGED
@@ -11,6 +11,31 @@ from typing import List
 logger = logging.getLogger(__name__)
 
 
+def normalize_apostrophes(text):
+    """
+    Normalize all apostrophe variants to standard ASCII apostrophe
+    """
+    if not text:
+        return text
+
+    # All known apostrophe variants
+    apostrophe_variants = [
+        "'",  # U+0027 ASCII apostrophe
+        "’",  # U+2019 Right single quotation mark (most common)
+        "‘",  # U+2018 Left single quotation mark
+        "ʼ",  # U+02BC Modifier letter apostrophe
+        "ˈ",  # U+02C8 Modifier letter vertical line (primary stress)
+        "`",  # U+0060 Grave accent (sometimes used as apostrophe)
+        "´",  # U+00B4 Acute accent (sometimes used as apostrophe)
+    ]
+
+    # Replace all variants with standard ASCII apostrophe
+    for variant in apostrophe_variants:
+        text = text.replace(variant, "'")
+
+    return text
+
+
 def normalize_text(text):
     """
     Normalize text by removing diacritical marks and special characters
@@ -18,6 +43,9 @@ def normalize_text(text):
     if not text:
         return ""
 
+    # First normalize apostrophes to standard form
+    text = normalize_apostrophes(text)
+
     # Replace common special characters with their ASCII equivalents
     replacements = {
         'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
@@ -29,7 +57,7 @@ def normalize_text(text):
         'Ł': 'L', 'ł': 'l',
         '¨': '', '´': '', '`': '', '^': '', '~': '',
         '–': '-', '—': '-', '−': '-',
-        '„': '"', '“': '"', '”': '"',
+        '„': '"', '“': '"', '”': '"',
         '«': '"', '»': '"',
         '¡': '!', '¿': '?',
         '°': 'degrees', '©': '(c)', '®': '(r)', '™': '(tm)',
@@ -39,10 +67,6 @@ def normalize_text(text):
         '\u00A0': ' ',  # Non-breaking space
         '\u2013': '-',  # En dash
         '\u2014': '-',  # Em dash
-        '\u2018': "'",  # Left single quotation mark
-        '\u2019': "'",  # Right single quotation mark
-        '\u201C': '"',  # Left double quotation mark
-        '\u201D': '"',  # Right double quotation mark
         '\u2026': '...',  # Horizontal ellipsis
         '\u00B7': '.',  # Middle dot
         '\u2022': '.',  # Bullet
@@ -54,8 +78,8 @@ def normalize_text(text):
     # Remove any remaining diacritical marks
     text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
 
-    # Remove special characters
-    text = re.sub(r
+    # Remove special characters except apostrophes
+    text = re.sub(r"[^\w\s']", '', text)
 
     # Normalize whitespace
     text = re.sub(r'\s+', ' ', text).strip()
@@ -94,6 +118,15 @@ def parse_authors_with_initials(authors_text):
     # by converting to "Haotian Liu and Chunyuan Li and Qingyang Wu"
     authors_text = re.sub(r'\s+', ' ', authors_text.strip())
 
+    # Special case: Handle single author followed by "et al" (e.g., "Mubashara Akhtar et al.")
+    # This should be split into ["Mubashara Akhtar", "et al"]
+    single_et_al_match = re.match(r'^(.+?)\s+et\s+al\.?$', authors_text, re.IGNORECASE)
+    if single_et_al_match:
+        base_author = single_et_al_match.group(1).strip()
+        if base_author and not ' and ' in base_author and not ',' in base_author:
+            # This is a simple "FirstName LastName et al" case
+            return [base_author, 'et al']
+
     # Check if this is a semicolon-separated format (e.g., "Hashimoto, K.; Saoud, A.; Kishida, M.")
     if ';' in authors_text:
         # Split by semicolons and handle the last part which might have "and"
@@ -359,6 +392,9 @@ def clean_author_name(author):
     # Normalize Unicode characters (e.g., combining diacritics)
     author = unicodedata.normalize('NFKC', author)
 
+    # Normalize apostrophes first before other processing
+    author = normalize_apostrophes(author)
+
     # Handle common Unicode escape sequences and LaTeX encodings
     # Note: Order matters - process longer patterns first
     unicode_replacements = [
@@ -518,31 +554,19 @@ def clean_title(title):
 
 def extract_arxiv_id_from_url(url):
     """
-    Extract ArXiv ID from URL
+    Extract ArXiv ID from URL or text containing ArXiv reference.
+
+    This function is deprecated. Use utils.url_utils.extract_arxiv_id_from_url instead.
+    Kept for backwards compatibility.
 
     Args:
-        url: URL string
+        url: URL string or text containing arXiv reference
 
     Returns:
         ArXiv ID or None if not found
     """
-
-
-
-    # Common ArXiv URL patterns
-    patterns = [
-        r'arxiv\.org/abs/(\d+\.\d+(?:v\d+)?)',
-        r'arxiv\.org/pdf/(\d+\.\d+(?:v\d+)?)',
-        r'arxiv:(\d+\.\d+(?:v\d+)?)',
-        r'arXiv:(\d+\.\d+(?:v\d+)?)'
-    ]
-
-    for pattern in patterns:
-        match = re.search(pattern, url, re.IGNORECASE)
-        if match:
-            return match.group(1)
-
-    return None
+    from utils.url_utils import extract_arxiv_id_from_url as common_extract
+    return common_extract(url)
 
 def extract_year_from_text(text):
     """
@@ -706,8 +730,12 @@ def normalize_diacritics(text: str) -> str:
     'José' -> 'jose'
     'Łukasz' -> 'lukasz'
     'J. Gl¨ uck' -> 'J. Gluck'
+    'D’Amato' -> 'D'Amato' (apostrophes normalized)
     """
-    # First
+    # First normalize apostrophes
+    text = normalize_apostrophes(text)
+
+    # Then handle special characters that don't decompose properly
     # Including common transliterations
     special_chars = {
         'ł': 'l', 'Ł': 'L',
@@ -847,6 +875,10 @@ def is_name_match(name1: str, name2: str) -> bool:
     name1_primary = normalize_diacritics(name1.strip().lower())
     name2_primary = normalize_diacritics(name2.strip().lower())
 
+    # Remove trailing periods that are not part of initials (e.g., "J. L. D'Amato." -> "J. L. D'Amato")
+    name1_primary = re.sub(r'\.+$', '', name1_primary)
+    name2_primary = re.sub(r'\.+$', '', name2_primary)
+
     # Handle spacing variations around periods: "F.Last" vs "F. Last"
     name1_normalized = re.sub(r'\.([A-Za-z])', r'. \1', name1_primary)
     name2_normalized = re.sub(r'\.([A-Za-z])', r'. \1', name2_primary)
@@ -859,6 +891,10 @@ def is_name_match(name1: str, name2: str) -> bool:
     name1_alt = normalize_diacritics_simple(name1.strip().lower())
     name2_alt = normalize_diacritics_simple(name2.strip().lower())
 
+    # Remove trailing periods for alternative normalization too
+    name1_alt = re.sub(r'\.+$', '', name1_alt)
+    name2_alt = re.sub(r'\.+$', '', name2_alt)
+
     name1_alt_norm = re.sub(r'\.([A-Za-z])', r'. \1', name1_alt)
     name2_alt_norm = re.sub(r'\.([A-Za-z])', r'. \1', name2_alt)
 
@@ -2219,7 +2255,8 @@ def format_author_for_display(author_name):
     if not author_name:
         return author_name
 
-
+    # Normalize apostrophes for consistent display
+    author_name = normalize_apostrophes(author_name.strip())
 
     # Check if it's in "Lastname, Firstname" format
     if ',' in author_name:
@@ -2866,8 +2903,9 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-ar
                 author_part_clean = strip_latex_commands(author_part).strip()
 
                 # Simple fix: just improve the organization detection without complex parsing
-                # Remove year pattern first
+                # Remove year pattern first - handle both parenthetical and standalone years
                 author_text_clean = re.sub(r'\s*\(\d{4}\)\.?$', '', author_part_clean).strip()
+                author_text_clean = re.sub(r'\s+\d{4}\.?$', '', author_text_clean).strip()
 
                 # Better organization detection - check if it looks like multiple authors
                 is_multi_author = (
@@ -2889,24 +2927,41 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-ar
                             author = re.sub(r'^and\s+', '', author.strip())
                             # Remove trailing periods that shouldn't be there
                             author = clean_author_name(author)
-                            #
-                            if author.lower()
+                            # Preserve "et al" variants to enable proper author count handling
+                            if author.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'al., et', 'others', 'and others']:
+                                cleaned_authors.append('et al')  # Normalize to standard form
+                            else:
                                 cleaned_authors.append(author)
                         if cleaned_authors:
                             ref['authors'] = cleaned_authors
                     else:
-                        # Fallback: simple comma split
+                        # Fallback: try once more with semicolon handling, then simple comma split
                         simple_authors = []
-
-
-
-
-
-
-
-
-
-
+                        try:
+                            # Try parsing again with normalized separators
+                            normalized_text = re.sub(r';\s*and\s+', ', ', author_text_clean)
+                            fallback_authors = parse_authors_with_initials(normalized_text)
+                            if fallback_authors and len(fallback_authors) >= 2:
+                                simple_authors = fallback_authors
+                            else:
+                                raise ValueError("Fallback parsing failed")
+                        except:
+                            # Last resort: naive comma split
+                            for a in author_text_clean.split(','):
+                                a = a.strip()
+                                # Remove "and" prefix and skip short/empty entries
+                                a = re.sub(r'^and\s+', '', a)
+                                # Clean author name (remove unnecessary periods)
+                                a = clean_author_name(a)
+                                if a and len(a) > 2:
+                                    # Preserve "et al" variants to enable proper author count handling
+                                    if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                                        simple_authors.append('et al')  # Normalize to standard form
+                                    else:
+                                        simple_authors.append(a)
+                                elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                                    simple_authors.append('et al')  # Handle short "et al" variants
+
                         if simple_authors:
                             ref['authors'] = simple_authors
                 except Exception:
@@ -2919,9 +2974,13 @@ def extract_latex_references(text, file_path=None):  # pylint: disable=unused-ar
                         # Clean author name (remove unnecessary periods)
                         a = clean_author_name(a)
                         if a and len(a) > 2:
-                            #
-                            if a.lower()
+                            # Preserve "et al" variants to enable proper author count handling
+                            if a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                                simple_authors.append('et al')  # Normalize to standard form
+                            else:
                                 simple_authors.append(a)
+                        elif a and a.lower() in ['et al', 'et al.', 'et~al', 'et~al.', 'others', 'and others']:
+                            simple_authors.append('et al')  # Handle short "et al" variants
                     if simple_authors:
                         ref['authors'] = simple_authors
                 else:
@@ -3716,6 +3775,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
     for abbrev in sorted(multi_word_abbrevs.keys(), key=len, reverse=True):
         if abbrev in expanded_text:
             expanded_text = expanded_text.replace(abbrev, multi_word_abbrevs[abbrev])
+            break  # Only apply the first (longest) matching abbreviation to avoid conflicts
 
     # Second pass: handle single word abbreviations
     words = expanded_text.split()
@@ -4110,8 +4170,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
         return False
 
     # Order-aware fuzzy matching - words should match in sequence
-
-
+    # Sort to ensure deterministic order (set iteration is not guaranteed to be consistent)
+    words1_list = sorted(list(words1))
+    words2_list = sorted(list(words2))
 
     # If word counts are very different, they're likely different venues
    if len(words1) > 0 and len(words2) > 0:
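Two of the text_utils changes are easiest to see side by side: apostrophe variants now collapse to ASCII `'` before any comparison, and a trailing "et al" is preserved as an explicit marker instead of being dropped. The sketch below is not from the package; `split_single_author_et_al` is a hypothetical simplification of the new special case in `parse_authors_with_initials`, and the variant list mirrors the one added to `normalize_apostrophes`.

```python
import re

def normalize_apostrophes(text):
    if not text:
        return text
    for variant in ["\u2019", "\u2018", "\u02bc", "\u02c8", "`", "\u00b4"]:
        text = text.replace(variant, "'")
    return text

def split_single_author_et_al(authors_text):
    # Simplified mirror of the new "single author + et al" special case.
    m = re.match(r'^(.+?)\s+et\s+al\.?$', authors_text.strip(), re.IGNORECASE)
    if m and ' and ' not in m.group(1) and ',' not in m.group(1):
        return [m.group(1).strip(), 'et al']
    return [authors_text.strip()]

print(normalize_apostrophes("D\u2019Amato"))                  # D'Amato
print(split_single_author_et_al("Mubashara Akhtar et al."))   # ['Mubashara Akhtar', 'et al']
```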
utils/url_utils.py
CHANGED
@@ -33,26 +33,43 @@ def construct_doi_url(doi: str) -> str:
 
 def extract_arxiv_id_from_url(url: str) -> Optional[str]:
     """
-    Extract ArXiv ID from an ArXiv URL.
+    Extract ArXiv ID from an ArXiv URL or text containing ArXiv reference.
+
+    This is the common function that handles all ArXiv ID extraction patterns:
+    - URLs: https://arxiv.org/abs/1234.5678, https://arxiv.org/pdf/1234.5678.pdf, https://arxiv.org/html/1234.5678
+    - Text references: arXiv:1234.5678, arXiv preprint arXiv:1234.5678
+    - Version handling: removes version numbers (v1, v2, etc.)
 
     Args:
-        url: ArXiv URL
+        url: ArXiv URL or text containing ArXiv reference
 
     Returns:
-        ArXiv ID if found, None otherwise
+        ArXiv ID (without version) if found, None otherwise
     """
-    if not url:
+    if not url or not isinstance(url, str):
         return None
 
-    #
-
-    if
-
-
-
-
+    # Pattern 1: arXiv: format (e.g., "arXiv:1610.10099" or "arXiv preprint arXiv:1610.10099")
+    arxiv_text_match = re.search(r'arXiv:(\d{4}\.\d{4,5})', url, re.IGNORECASE)
+    if arxiv_text_match:
+        arxiv_id = arxiv_text_match.group(1)
+        # Remove version number if present
+        return re.sub(r'v\d+$', '', arxiv_id)
+
+    # Pattern 2: arxiv.org URLs (abs, pdf, html)
+    # Handle URLs with version numbers and various formats
+    arxiv_url_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^\s/?#]+?)(?:\.pdf|v\d+)?(?:[?\#]|$)', url, re.IGNORECASE)
+    if arxiv_url_match:
+        arxiv_id = arxiv_url_match.group(1)
+        # Remove version number if present
+        return re.sub(r'v\d+$', '', arxiv_id)
+
+    # Pattern 3: Fallback for simpler URL patterns
+    fallback_match = re.search(r'arxiv\.org/(?:abs|pdf|html)/([^/?#]+)', url, re.IGNORECASE)
     if fallback_match:
-
+        arxiv_id = fallback_match.group(1).replace('.pdf', '')
+        # Remove version number if present
+        return re.sub(r'v\d+$', '', arxiv_id)
 
     return None
 
{academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/WHEEL
RENAMED
File without changes
{academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/entry_points.txt
RENAMED
File without changes
{academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/licenses/LICENSE
RENAMED
File without changes
{academic_refchecker-1.2.37.dist-info → academic_refchecker-1.2.39.dist-info}/top_level.txt
RENAMED
File without changes
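As a usage sketch for the consolidated extractor in utils/url_utils.py (assuming the installed package is importable; the inputs below are illustrative, not taken from the codebase):

```python
from utils.url_utils import extract_arxiv_id_from_url

for text in [
    "https://arxiv.org/abs/1706.03762v5",
    "https://arxiv.org/pdf/1706.03762.pdf",
    "https://arxiv.org/html/2403.01234",
    "arXiv preprint arXiv:1610.10099",
    "not an arxiv reference",
]:
    print(text, "->", extract_arxiv_id_from_url(text))

# Expected, based on the patterns above: version suffixes and the .pdf extension
# are stripped from the URLs, the text reference yields "1610.10099", and the
# last input returns None.
```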