academic-refchecker 1.2.52__tar.gz → 1.2.54__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.52/src/academic_refchecker.egg-info → academic_refchecker-1.2.54}/PKG-INFO +1 -1
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/__version__.py +1 -1
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/core/refchecker.py +17 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/text_utils.py +52 -3
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/LICENSE +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/README.md +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/pyproject.toml +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/requirements.txt +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/setup.cfg +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/__init__.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/pdf_paper_checker.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/webpage_checker.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/core/parallel_processor.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/arxiv_utils.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/biblatex_parser.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/bibtex_parser.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/error_utils.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/unicode_utils.py +0 -0
- {academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/utils/url_utils.py +0 -0
|
@@ -2311,6 +2311,23 @@ class ArxivReferenceChecker:
|
|
|
2311
2311
|
from checkers.pdf_paper_checker import PDFPaperChecker
|
|
2312
2312
|
pdf_checker = PDFPaperChecker()
|
|
2313
2313
|
|
|
2314
|
+
if pdf_checker.can_check_reference(reference):
|
|
2315
|
+
logger.debug(f"URL appears to be PDF, trying PDF verification: {web_url}")
|
|
2316
|
+
try:
|
|
2317
|
+
verified_data, errors, url = pdf_checker.verify_reference(reference)
|
|
2318
|
+
if verified_data:
|
|
2319
|
+
logger.debug(f"PDF verification successful for: {reference.get('title', 'Untitled')}")
|
|
2320
|
+
return verified_data, errors, url
|
|
2321
|
+
else:
|
|
2322
|
+
logger.debug(f"PDF verification failed, falling back to web page verification")
|
|
2323
|
+
except Exception as e:
|
|
2324
|
+
logger.error(f"Error in PDF verification: {e}")
|
|
2325
|
+
logger.debug(f"PDF verification error, falling back to web page verification")
|
|
2326
|
+
|
|
2327
|
+
# Fall back to web page checker
|
|
2328
|
+
from checkers.pdf_paper_checker import PDFPaperChecker
|
|
2329
|
+
pdf_checker = PDFPaperChecker()
|
|
2330
|
+
|
|
2314
2331
|
if pdf_checker.can_check_reference(reference):
|
|
2315
2332
|
logger.debug(f"URL appears to be PDF, trying PDF verification: {web_url}")
|
|
2316
2333
|
try:
|
|
@@ -173,6 +173,11 @@ def parse_authors_with_initials(authors_text):
|
|
|
173
173
|
if stripped_text in ['others', 'and others', 'et al', 'et al.']:
|
|
174
174
|
return []
|
|
175
175
|
|
|
176
|
+
# Clean LaTeX commands early to prevent parsing issues
|
|
177
|
+
# This fixes cases like "Hochreiter, Sepp and Schmidhuber, J{\"u}rgen"
|
|
178
|
+
# which should parse as 2 authors, not get split incorrectly due to LaTeX braces
|
|
179
|
+
authors_text = strip_latex_commands(authors_text)
|
|
180
|
+
|
|
176
181
|
# Fix spacing around periods in initials (e.g., "Y . Li" -> "Y. Li") before parsing
|
|
177
182
|
authors_text = re.sub(r'(\w)\s+\.', r'\1.', authors_text)
|
|
178
183
|
|
|
@@ -300,9 +305,9 @@ def parse_authors_with_initials(authors_text):
|
|
|
300
305
|
comma_parts = [p.strip() for p in part.split(',')]
|
|
301
306
|
if len(comma_parts) == 2:
|
|
302
307
|
lastname, firstname = comma_parts
|
|
303
|
-
# Both parts should contain only letters, spaces, hyphens, apostrophes, and periods
|
|
304
|
-
if (re.match(r'^[a-zA-Z\s\-\'.]+$', lastname) and
|
|
305
|
-
re.match(r'^[a-zA-Z\s\-\'.]+$', firstname) and
|
|
308
|
+
# Both parts should contain only letters (including Unicode), spaces, hyphens, apostrophes, and periods
|
|
309
|
+
if (re.match(r'^[\w\s\-\'.]+$', lastname, re.UNICODE) and
|
|
310
|
+
re.match(r'^[\w\s\-\'.]+$', firstname, re.UNICODE) and
|
|
306
311
|
lastname and firstname):
|
|
307
312
|
valid_author_parts.append(part)
|
|
308
313
|
|
|
@@ -314,6 +319,50 @@ def parse_authors_with_initials(authors_text):
|
|
|
314
319
|
# Split on commas first for other formats
|
|
315
320
|
parts = [part.strip() for part in authors_text.split(',') if part.strip()]
|
|
316
321
|
|
|
322
|
+
# Handle single author with "Lastname, Firstname" format (exactly 2 parts)
|
|
323
|
+
if len(parts) == 2:
|
|
324
|
+
lastname, firstname = parts
|
|
325
|
+
# Pattern for surnames: capitalized word(s), possibly hyphenated or compound
|
|
326
|
+
# But exclude common patterns that suggest multiple authors like "Other Author"
|
|
327
|
+
surname_pattern = r'^[A-Z][a-zA-Z\-\']+$' # Single surname word (no spaces to avoid "Other Author")
|
|
328
|
+
# Pattern for first names or initials: either full names or initials with periods
|
|
329
|
+
# Accept both full names like "David R" and initials like "A. C"
|
|
330
|
+
firstname_pattern = r'^[A-Z]([a-zA-Z\s\-\'.]*|\.(\s+[A-Z]\.?)*\s*)$' # Full names or initials
|
|
331
|
+
|
|
332
|
+
# Additional check: if the "firstname" part looks like "Other Author" or similar,
|
|
333
|
+
# it's likely multiple authors, not a single "Lastname, Firstname" pattern
|
|
334
|
+
# We need to distinguish between:
|
|
335
|
+
# - "David R" (first name + middle initial - single author)
|
|
336
|
+
# - "Other Author" (two separate names - multiple authors)
|
|
337
|
+
if ' ' in firstname:
|
|
338
|
+
firstname_parts = firstname.split()
|
|
339
|
+
if len(firstname_parts) == 2:
|
|
340
|
+
first_part, second_part = firstname_parts
|
|
341
|
+
# Pattern 1: "David R" - first name + single letter (middle initial)
|
|
342
|
+
is_name_plus_initial = (
|
|
343
|
+
len(first_part) >= 2 and first_part[0].isupper() and first_part[1:].islower() and
|
|
344
|
+
len(second_part) <= 2 and second_part.replace('.', '').isalpha() # Initial like "R" or "R."
|
|
345
|
+
)
|
|
346
|
+
# Pattern 2: "Other Author" - two full capitalized words suggesting separate authors
|
|
347
|
+
looks_like_separate_authors = (
|
|
348
|
+
len(first_part) >= 3 and first_part[0].isupper() and first_part[1:].islower() and
|
|
349
|
+
len(second_part) >= 3 and second_part[0].isupper() and second_part[1:].islower()
|
|
350
|
+
)
|
|
351
|
+
looks_like_multiple_authors = looks_like_separate_authors and not is_name_plus_initial
|
|
352
|
+
else:
|
|
353
|
+
# More than 2 parts with spaces likely indicates multiple authors
|
|
354
|
+
looks_like_multiple_authors = len(firstname_parts) > 2
|
|
355
|
+
else:
|
|
356
|
+
looks_like_multiple_authors = False
|
|
357
|
+
|
|
358
|
+
# Check if this looks like a single author in "Lastname, Firstname" format
|
|
359
|
+
if (re.match(surname_pattern, lastname) and
|
|
360
|
+
re.match(firstname_pattern, firstname) and
|
|
361
|
+
len(lastname) >= 2 and len(firstname) >= 1 and
|
|
362
|
+
not looks_like_multiple_authors):
|
|
363
|
+
# This is a single author, return as "Lastname, Firstname"
|
|
364
|
+
return [f"{lastname}, {firstname}"]
|
|
365
|
+
|
|
317
366
|
# Check if this is BibTeX comma-separated format: "Surname, Given, Surname, Given"
|
|
318
367
|
# Enhanced heuristic: even number of parts >= 6, alternating proper surname/given pattern
|
|
319
368
|
# Distinguish between initials (should remain as "Surname, Initial") and full names
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/enhanced_hybrid_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/local_semantic_scholar.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.52 → academic_refchecker-1.2.54}/src/checkers/openreview_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|