academic-refchecker 1.2.54-py3-none-any.whl → 1.2.56-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.54.dist-info → academic_refchecker-1.2.56.dist-info}/METADATA +23 -23
- academic_refchecker-1.2.56.dist-info/RECORD +49 -0
- academic_refchecker-1.2.56.dist-info/entry_points.txt +2 -0
- academic_refchecker-1.2.56.dist-info/top_level.txt +1 -0
- refchecker/__init__.py +13 -0
- refchecker/__main__.py +11 -0
- refchecker/__version__.py +5 -0
- {checkers → refchecker/checkers}/crossref.py +5 -5
- {checkers → refchecker/checkers}/enhanced_hybrid_checker.py +1 -1
- {checkers → refchecker/checkers}/github_checker.py +4 -4
- {checkers → refchecker/checkers}/local_semantic_scholar.py +10 -11
- {checkers → refchecker/checkers}/openalex.py +6 -6
- {checkers → refchecker/checkers}/openreview_checker.py +8 -8
- {checkers → refchecker/checkers}/pdf_paper_checker.py +1 -1
- {checkers → refchecker/checkers}/semantic_scholar.py +47 -33
- {checkers → refchecker/checkers}/webpage_checker.py +3 -3
- {core → refchecker/core}/parallel_processor.py +6 -6
- {core → refchecker/core}/refchecker.py +72 -76
- {llm → refchecker/llm}/providers.py +17 -1
- {services → refchecker/services}/pdf_processor.py +22 -2
- {utils → refchecker/utils}/arxiv_utils.py +3 -3
- {utils → refchecker/utils}/biblatex_parser.py +4 -4
- {utils → refchecker/utils}/bibliography_utils.py +5 -5
- {utils → refchecker/utils}/bibtex_parser.py +5 -5
- {utils → refchecker/utils}/error_utils.py +9 -9
- {utils → refchecker/utils}/text_utils.py +10 -10
- {utils → refchecker/utils}/url_utils.py +8 -5
- __version__.py +0 -3
- academic_refchecker-1.2.54.dist-info/RECORD +0 -47
- academic_refchecker-1.2.54.dist-info/entry_points.txt +0 -2
- academic_refchecker-1.2.54.dist-info/top_level.txt +0 -9
- {academic_refchecker-1.2.54.dist-info → academic_refchecker-1.2.56.dist-info}/WHEEL +0 -0
- {academic_refchecker-1.2.54.dist-info → academic_refchecker-1.2.56.dist-info}/licenses/LICENSE +0 -0
- {checkers → refchecker/checkers}/__init__.py +0 -0
- {config → refchecker/config}/__init__.py +0 -0
- {config → refchecker/config}/logging.conf +0 -0
- {config → refchecker/config}/settings.py +0 -0
- {core → refchecker/core}/__init__.py +0 -0
- {core → refchecker/core}/db_connection_pool.py +0 -0
- {database → refchecker/database}/__init__.py +0 -0
- {database → refchecker/database}/download_semantic_scholar_db.py +0 -0
- {llm → refchecker/llm}/__init__.py +0 -0
- {llm → refchecker/llm}/base.py +0 -0
- {scripts → refchecker/scripts}/__init__.py +0 -0
- {scripts → refchecker/scripts}/start_vllm_server.py +0 -0
- {services → refchecker/services}/__init__.py +0 -0
- {utils → refchecker/utils}/__init__.py +0 -0
- {utils → refchecker/utils}/author_utils.py +0 -0
- {utils → refchecker/utils}/config_validator.py +0 -0
- {utils → refchecker/utils}/db_utils.py +0 -0
- {utils → refchecker/utils}/doi_utils.py +0 -0
- {utils → refchecker/utils}/mock_objects.py +0 -0
- {utils → refchecker/utils}/unicode_utils.py +0 -0
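
Most of the churn in this release is mechanical: the formerly top-level packages (checkers, config, core, database, llm, scripts, services, utils) now live inside a single refchecker package, so every internal import gains a refchecker. prefix. Downstream code that imported these modules directly needs the same adjustment; a minimal sketch using strip_latex_commands as the example (any other module path follows the same pattern):

# 1.2.54 installed the helpers as top-level packages:
# from utils.text_utils import strip_latex_commands

# 1.2.56 namespaces everything under refchecker:
from refchecker.utils.text_utils import strip_latex_commands

# Per the inline comments in the diff, this strips LaTeX markup such as {LLM}s -> LLMs.
print(strip_latex_commands("{LLM}s for reference checking"))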
{checkers → refchecker/checkers}/webpage_checker.py

@@ -7,7 +7,7 @@ from urllib.parse import urlparse, urljoin
 from typing import Dict, Optional, Tuple, List, Any
 from bs4 import BeautifulSoup
 import time
-from utils.text_utils import strip_latex_commands
+from refchecker.utils.text_utils import strip_latex_commands

 logger = logging.getLogger(__name__)

@@ -185,7 +185,7 @@ class WebPageChecker:
 # Check title match
 if cited_title and page_title:
 if not self._check_title_match(cited_title, page_title, page_description):
-from utils.error_utils import format_title_mismatch
+from refchecker.utils.error_utils import format_title_mismatch
 # Clean the cited title for display (remove LaTeX commands like {LLM}s -> LLMs)
 clean_cited_title = strip_latex_commands(cited_title)
 errors.append({
@@ -207,7 +207,7 @@ class WebPageChecker:
 if cited_authors:
 author_str = ', '.join(cited_authors) if isinstance(cited_authors, list) else str(cited_authors)
 if not self._check_author_match(author_str, site_info, web_url):
-from utils.error_utils import format_three_line_mismatch
+from refchecker.utils.error_utils import format_three_line_mismatch
 left = author_str
 right = site_info.get('organization', 'unknown')
 details = format_three_line_mismatch("Author/organization mismatch", left, right)
{core → refchecker/core}/parallel_processor.py

@@ -13,7 +13,7 @@ from threading import Thread, Lock
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from dataclasses import dataclass
 from typing import List, Dict, Any, Optional, Tuple, Callable
-from utils.text_utils import deduplicate_urls
+from refchecker.utils.text_utils import deduplicate_urls

 logger = logging.getLogger(__name__)

@@ -277,15 +277,15 @@ class ParallelReferenceProcessor:
 # Print reference info in the same format as sequential mode
 raw_title = reference.get('title', 'Untitled')
 # Clean LaTeX commands from title for display
-from utils.text_utils import strip_latex_commands
+from refchecker.utils.text_utils import strip_latex_commands
 title = strip_latex_commands(raw_title)
-from utils.text_utils import format_authors_for_display
+from refchecker.utils.text_utils import format_authors_for_display
 authors = format_authors_for_display(reference.get('authors', []))
 year = reference.get('year', '')
 # Get venue from either 'venue' or 'journal' field and clean it up
 venue = reference.get('venue', '') or reference.get('journal', '')
 if venue:
-from utils.error_utils import clean_venue_for_comparison
+from refchecker.utils.error_utils import clean_venue_for_comparison
 venue = clean_venue_for_comparison(venue)
 url = reference.get('url', '')
 doi = reference.get('doi', '')
@@ -331,7 +331,7 @@ class ParallelReferenceProcessor:

 # Show DOI URL if available and different from what's already shown
 if external_ids.get('DOI'):
-from utils.doi_utils import construct_doi_url
+from refchecker.utils.doi_utils import construct_doi_url
 doi_url = construct_doi_url(external_ids['DOI'])
 if doi_url != verified_url_to_show and doi_url != url:
 print(f" DOI URL: {doi_url}")
@@ -355,7 +355,7 @@ class ParallelReferenceProcessor:
 error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
 error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')

-from utils.error_utils import print_labeled_multiline
+from refchecker.utils.error_utils import print_labeled_multiline

 if error_type == 'arxiv_id':
 # Keep existing style for arXiv ID errors
{core → refchecker/core}/refchecker.py

@@ -11,7 +11,7 @@ For arXiv references, it uses the arXiv API to verify metadata.
 For non-arXiv references, it uses the local Semantic Scholar database for verification.

 Usage:
-python
+python run_refchecker.py --paper PAPER_SPEC [--db-path PATH] [--output-file [PATH]] [--debug]

 Options:
 --paper PAPER_SPEC Validate a specific paper by:
@@ -44,23 +44,23 @@ import argparse
 import sys
 import json
 import random
-from checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
-from utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
+from refchecker.checkers.local_semantic_scholar import LocalNonArxivReferenceChecker
+from refchecker.utils.text_utils import (clean_author_name, clean_title, clean_title_basic,
 extract_arxiv_id_from_url, normalize_text as common_normalize_text,
 detect_latex_bibliography_format, extract_latex_references,
 detect_standard_acm_natbib_format, strip_latex_commands,
 format_corrected_reference, is_name_match, enhanced_name_match,
 calculate_title_similarity, normalize_arxiv_url, deduplicate_urls,
 compare_authors)
-from utils.config_validator import ConfigValidator
-from services.pdf_processor import PDFProcessor
-from checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
-from core.parallel_processor import ParallelReferenceProcessor
-from core.db_connection_pool import ThreadSafeLocalChecker
+from refchecker.utils.config_validator import ConfigValidator
+from refchecker.services.pdf_processor import PDFProcessor
+from refchecker.checkers.enhanced_hybrid_checker import EnhancedHybridReferenceChecker
+from refchecker.core.parallel_processor import ParallelReferenceProcessor
+from refchecker.core.db_connection_pool import ThreadSafeLocalChecker

 # Import version
-from __version__ import __version__
-from llm.base import create_llm_provider, ReferenceExtractor
+from refchecker.__version__ import __version__
+from refchecker.llm.base import create_llm_provider, ReferenceExtractor

 def get_llm_api_key_interactive(provider: str) -> str:
 """
@@ -453,7 +453,7 @@ class ArxivReferenceChecker:
 def extract_arxiv_id_from_url(self, url):
 """
 Extract ArXiv ID from a URL or text containing ArXiv reference.
-Uses the common extraction function from utils.url_utils.
+Uses the common extraction function from refchecker.utils.url_utils.
 """
 return extract_arxiv_id_from_url(url)

@@ -1189,7 +1189,7 @@ class ArxivReferenceChecker:
 last_author = and_parts[1].strip()

 # Split the main list by commas, handling initials properly
-from utils.text_utils import parse_authors_with_initials
+from refchecker.utils.text_utils import parse_authors_with_initials
 authors = parse_authors_with_initials(main_list)

 # Add the last author
@@ -1197,7 +1197,7 @@ class ArxivReferenceChecker:
 authors.append(last_author)
 else:
 # No "and" found, use smart comma parsing for initials
-from utils.text_utils import parse_authors_with_initials
+from refchecker.utils.text_utils import parse_authors_with_initials
 authors = parse_authors_with_initials(authors_text)

 # Clean up each author name
@@ -1679,7 +1679,7 @@ class ArxivReferenceChecker:
 if not title and not authors_text:
 # Try to detect a list of names
 if re.match(r'^[A-Z][a-zA-Z\-\.]+(,\s*[A-Z][a-zA-Z\-\.]+)+$', cleaned_ref):
-from utils.text_utils import parse_authors_with_initials
+from refchecker.utils.text_utils import parse_authors_with_initials
 authors = parse_authors_with_initials(cleaned_ref)
 return authors, ""

@@ -1693,7 +1693,7 @@ class ArxivReferenceChecker:

 # Final fallback: if the reference is just a list of names, return as authors
 if not title and cleaned_ref and re.match(r'^[A-Z][a-zA-Z\-\.]+(,\s*[A-Z][a-zA-Z\-\.]+)+$', cleaned_ref):
-from utils.text_utils import parse_authors_with_initials
+from refchecker.utils.text_utils import parse_authors_with_initials
 authors = parse_authors_with_initials(cleaned_ref)
 return authors, ""

@@ -1901,7 +1901,7 @@ class ArxivReferenceChecker:
 db_title = self.non_arxiv_checker.normalize_paper_title(paper_data.get('title'))

 if normalized_title != db_title:
-from utils.error_utils import format_title_mismatch
+from refchecker.utils.error_utils import format_title_mismatch
 # Clean the title for display (remove LaTeX commands like {LLM}s -> LLMs)
 clean_cited_title = strip_latex_commands(title)
 logger.debug(f"DB Verification: Title mismatch - cited: '{title}', actual: '{paper_data.get('title')}'")
@@ -1940,7 +1940,7 @@ class ArxivReferenceChecker:
 # Only flag as mismatch if the difference is greater than tolerance
 if abs(year - paper_year) > year_tolerance:
 logger.debug(f"DB Verification: Year mismatch - cited: {year}, actual: {paper_year}")
-from utils.error_utils import format_year_mismatch
+from refchecker.utils.error_utils import format_year_mismatch
 errors.append({
 'warning_type': 'year',
 'warning_details': format_year_mismatch(year, paper_year),
@@ -1949,7 +1949,7 @@ class ArxivReferenceChecker:

 # Verify DOI
 if doi and external_ids.get('DOI'):
-from utils.doi_utils import compare_dois, normalize_doi
+from refchecker.utils.doi_utils import compare_dois, normalize_doi

 # Use proper DOI comparison first
 if not compare_dois(doi, external_ids['DOI']):
@@ -1962,7 +1962,7 @@ class ArxivReferenceChecker:
 # Only flag as error if it's not a reasonable partial match
 if not actual_doi_normalized.startswith(cited_doi_normalized.rstrip('.')):
 logger.debug(f"DB Verification: DOI mismatch - cited: {doi}, actual: {external_ids['DOI']}")
-from utils.error_utils import format_doi_mismatch
+from refchecker.utils.error_utils import format_doi_mismatch
 errors.append({
 'error_type': 'doi',
 'error_details': format_doi_mismatch(doi, external_ids['DOI']),
@@ -2058,7 +2058,7 @@ class ArxivReferenceChecker:
 elif error.get('error_type') == 'year' or error.get('warning_type') == 'year':
 formatted_error['ref_year_correct'] = error.get('ref_year_correct', '')
 elif error.get('error_type') == 'doi':
-from utils.doi_utils import construct_doi_url
+from refchecker.utils.doi_utils import construct_doi_url
 formatted_error['ref_url_correct'] = construct_doi_url(error.get('ref_doi_correct', ''))
 elif error.get('info_type') == 'url':
 formatted_error['ref_url_correct'] = error.get('ref_url_correct', '')
@@ -2089,14 +2089,12 @@ class ArxivReferenceChecker:
 if correct_paper_data:
 logger.debug(f"Database mode: Found correct paper: '{correct_paper_data.get('title', '')}'")
 # Use the CORRECT paper's Semantic Scholar URL
-
-
-from utils.url_utils import construct_semantic_scholar_url
-correct_paper_url = construct_semantic_scholar_url(correct_external_ids['CorpusId'])
+if correct_paper_data.get('paperId'):
+correct_paper_url = f"https://www.semanticscholar.org/paper/{correct_paper_data['paperId']}"
 paper_url = correct_paper_url # Update the main URL
 logger.debug(f"Database mode: Using correct paper's Semantic Scholar URL for ArXiv ID mismatch: {paper_url}")
 else:
-logger.debug("Database mode: Correct paper found but no
+logger.debug("Database mode: Correct paper found but no paperId available")
 else:
 logger.debug("Database mode: Could not find correct paper by title/authors")
 except Exception as e:
@@ -2117,12 +2115,11 @@ class ArxivReferenceChecker:
 formatted_errors.append(formatted_error)

 # Fallback to wrong paper's URL if we couldn't find the correct one
-if not correct_paper_data and verified_data and verified_data.get('
-
-paper_url = construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
+if not correct_paper_data and verified_data and verified_data.get('paperId'):
+paper_url = f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
 logger.debug(f"Database mode: Fallback to wrong paper's Semantic Scholar URL: {paper_url}")
 elif not correct_paper_data:
-logger.debug(f"Database mode: No
+logger.debug(f"Database mode: No paperId available for Semantic Scholar URL construction. verified_data keys: {list(verified_data.keys()) if verified_data else 'None'}")

 return formatted_errors if formatted_errors else None, paper_url, verified_data
 else:
@@ -2184,7 +2181,7 @@ class ArxivReferenceChecker:
 logger.debug(f"Detected GitHub URL, using GitHub verification: {github_url}")

 # Import and use GitHub checker
-from checkers.github_checker import GitHubChecker
+from refchecker.checkers.github_checker import GitHubChecker
 github_checker = GitHubChecker()
 verified_data, errors, paper_url = github_checker.verify_reference(reference)

@@ -2244,7 +2241,7 @@ class ArxivReferenceChecker:
 return None # No URL to check

 # Import and use web page checker
-from checkers.webpage_checker import WebPageChecker
+from refchecker.checkers.webpage_checker import WebPageChecker
 webpage_checker = WebPageChecker()

 if not webpage_checker.is_web_page_url(web_url):
@@ -2308,7 +2305,7 @@ class ArxivReferenceChecker:
 return None, [{"error_type": "unverified", "error_details": "Reference could not be verified"}], None

 # First try PDF paper checker if URL appears to be a PDF
-from checkers.pdf_paper_checker import PDFPaperChecker
+from refchecker.checkers.pdf_paper_checker import PDFPaperChecker
 pdf_checker = PDFPaperChecker()

 if pdf_checker.can_check_reference(reference):
@@ -2325,7 +2322,7 @@ class ArxivReferenceChecker:
 logger.debug(f"PDF verification error, falling back to web page verification")

 # Fall back to web page checker
-from checkers.pdf_paper_checker import PDFPaperChecker
+from refchecker.checkers.pdf_paper_checker import PDFPaperChecker
 pdf_checker = PDFPaperChecker()

 if pdf_checker.can_check_reference(reference):
@@ -2342,7 +2339,7 @@ class ArxivReferenceChecker:
 logger.debug(f"PDF verification error, falling back to web page verification")

 # Fall back to web page checker
-from checkers.webpage_checker import WebPageChecker
+from refchecker.checkers.webpage_checker import WebPageChecker
 webpage_checker = WebPageChecker()

 try:
@@ -2463,7 +2460,7 @@ class ArxivReferenceChecker:
 elif error.get('error_type') == 'year' or error.get('warning_type') == 'year':
 formatted_error['ref_year_correct'] = error.get('ref_year_correct', '')
 elif error.get('error_type') == 'doi':
-from utils.doi_utils import construct_doi_url
+from refchecker.utils.doi_utils import construct_doi_url
 formatted_error['ref_url_correct'] = construct_doi_url(error.get('ref_doi_correct', ''))

 formatted_errors.append(formatted_error)
@@ -2753,7 +2750,7 @@ class ArxivReferenceChecker:
 corrected_data = self._extract_corrected_data_from_error(consolidated_entry, verified_data)

 # Generate all three formats for user convenience
-from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
+from refchecker.utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
 plaintext_format = format_corrected_plaintext(reference, corrected_data, consolidated_entry)
 bibtex_format = format_corrected_bibtex(reference, corrected_data, consolidated_entry)
 bibitem_format = format_corrected_bibitem(reference, corrected_data, consolidated_entry)
@@ -2824,7 +2821,7 @@ class ArxivReferenceChecker:
 corrected_data = self._extract_corrected_data_from_error(error, verified_data)

 # Generate all three formats
-from utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
+from refchecker.utils.text_utils import format_corrected_plaintext, format_corrected_bibtex, format_corrected_bibitem
 plaintext_format = format_corrected_plaintext(reference, corrected_data, error_entry)
 bibtex_format = format_corrected_bibtex(reference, corrected_data, error_entry)
 bibitem_format = format_corrected_bibitem(reference, corrected_data, error_entry)
@@ -3326,7 +3323,7 @@ class ArxivReferenceChecker:

 if authors:
 # Limit to first 3 authors for readability
-from utils.text_utils import parse_authors_with_initials
+from refchecker.utils.text_utils import parse_authors_with_initials
 author_list = parse_authors_with_initials(authors)
 if len(author_list) > 3:
 formatted += ", ".join(author_list[:3]) + " et al."
@@ -3568,7 +3565,7 @@ class ArxivReferenceChecker:
 return self._parse_standard_acm_natbib_references(bibliography_text)

 # Check if this is BibTeX format
-from utils.bibtex_parser import detect_bibtex_format
+from refchecker.utils.bibtex_parser import detect_bibtex_format
 if detect_bibtex_format(bibliography_text):
 logger.info("Detected BibTeX format, using BibTeX parser")
 self.used_regex_extraction = True
@@ -3576,7 +3573,7 @@ class ArxivReferenceChecker:
 return self._parse_bibtex_references(bibliography_text)

 # Check if this is biblatex format
-from utils.biblatex_parser import detect_biblatex_format
+from refchecker.utils.biblatex_parser import detect_biblatex_format
 if detect_biblatex_format(bibliography_text):
 logger.debug("Detected biblatex format")
 self.used_regex_extraction = True
@@ -3686,7 +3683,7 @@ class ArxivReferenceChecker:
 if author_field_match:
 author_content = author_field_match.group(1)
 # Find all \bibinfo{person}{Name} entries using balanced brace extraction
-from utils.text_utils import extract_bibinfo_person_content
+from refchecker.utils.text_utils import extract_bibinfo_person_content
 person_matches = extract_bibinfo_person_content(author_content)
 if person_matches:
 authors = []
@@ -3700,7 +3697,7 @@ class ArxivReferenceChecker:
 ref['authors'] = authors

 # Import balanced brace extraction function
-from utils.text_utils import extract_bibinfo_field_content
+from refchecker.utils.text_utils import extract_bibinfo_field_content

 # Extract title from \bibinfo{title}{Title} using balanced brace extraction
 title_content = extract_bibinfo_field_content(content, 'title')
@@ -3758,7 +3755,7 @@ class ArxivReferenceChecker:
 author_part_clean = strip_latex_commands(author_part).strip()
 if author_part_clean and not author_part_clean.startswith('\\'):
 # Parse author names using the robust author parsing function
-from utils.text_utils import parse_authors_with_initials
+from refchecker.utils.text_utils import parse_authors_with_initials
 author_names = parse_authors_with_initials(author_part_clean)

 # Clean up author names
@@ -3812,14 +3809,14 @@ class ArxivReferenceChecker:
 self.used_regex_extraction = True

 # Check if this is BibTeX format first
-from utils.bibtex_parser import detect_bibtex_format
+from refchecker.utils.bibtex_parser import detect_bibtex_format
 if detect_bibtex_format(bibliography_text):
 logger.debug("Detected BibTeX format, using BibTeX-specific parsing")
 # BibTeX parsing is robust, so we don't set used_unreliable_extraction
 return self._parse_bibtex_references(bibliography_text)

 # Check if this is biblatex format
-from utils.biblatex_parser import detect_biblatex_format
+from refchecker.utils.biblatex_parser import detect_biblatex_format
 if detect_biblatex_format(bibliography_text):
 logger.debug("Detected biblatex format, using biblatex-specific parsing")
 # biblatex parsing is also robust, so we don't set used_unreliable_extraction
@@ -4105,7 +4102,7 @@ class ArxivReferenceChecker:
 if doi_match:
 doi = clean_doi(doi_match.group(1))
 if doi:
-from utils.doi_utils import construct_doi_url
+from refchecker.utils.doi_utils import construct_doi_url
 url = construct_doi_url(doi)
 else:
 url = ''
@@ -4265,7 +4262,7 @@ class ArxivReferenceChecker:
 List of structured reference dictionaries
 """
 # Use the dedicated BibTeX parser
-from utils.bibtex_parser import parse_bibtex_references
+from refchecker.utils.bibtex_parser import parse_bibtex_references

 # Extract references using the BibTeX parser
 references = parse_bibtex_references(bibliography_text)
@@ -4284,7 +4281,7 @@ class ArxivReferenceChecker:
 List of structured reference dictionaries
 """
 # Use the dedicated biblatex parser
-from utils.biblatex_parser import parse_biblatex_references
+from refchecker.utils.biblatex_parser import parse_biblatex_references

 # Extract references using the biblatex parser
 references = parse_biblatex_references(bibliography_text)
@@ -4484,7 +4481,7 @@ class ArxivReferenceChecker:
 return True

 # Also check if authors have significant overlap (at least 50% of the shorter author list)
-from utils.text_utils import parse_authors_with_initials
+from refchecker.utils.text_utils import parse_authors_with_initials

 if '*' in seg1['author']:
 author1_parts = seg1['author'].split('*')
@@ -4553,7 +4550,7 @@ class ArxivReferenceChecker:
 parsed_authors = []
 for author in raw_authors:
 # Clean up the author entry and strip LaTeX commands
-from utils.text_utils import strip_latex_commands
+from refchecker.utils.text_utils import strip_latex_commands
 author_cleaned = strip_latex_commands(author.rstrip('.'))

 # Skip special indicators like "others", "et al", etc.
@@ -4571,14 +4568,14 @@ class ArxivReferenceChecker:
 return parsed_authors
 else:
 # Fallback to original logic for backward compatibility
-from utils.text_utils import parse_authors_with_initials
+from refchecker.utils.text_utils import parse_authors_with_initials

 cleaned_text = author_text.rstrip('.')
 authors = parse_authors_with_initials(cleaned_text)
 authors = [a.rstrip('.').strip() for a in authors if a.strip()]

 # Handle "others" and similar indicators in fallback logic too
-from utils.text_utils import strip_latex_commands
+from refchecker.utils.text_utils import strip_latex_commands
 processed_authors = []
 for author in authors:
 # Apply LaTeX cleaning to each author
@@ -4706,7 +4703,7 @@ class ArxivReferenceChecker:
 if '*' in doi:
 doi = doi.split('*')[0]

-from utils.doi_utils import construct_doi_url
+from refchecker.utils.doi_utils import construct_doi_url
 url = construct_doi_url(doi)
 break

@@ -4714,7 +4711,7 @@ class ArxivReferenceChecker:
 if not url and not arxiv_url:
 url_match = re.search(r'https?://(?!arxiv\.org)[^\s,]+', ref_text)
 if url_match:
-from utils.url_utils import clean_url_punctuation
+from refchecker.utils.url_utils import clean_url_punctuation
 url = clean_url_punctuation(url_match.group(0))

 # Extract year - will be determined from structured parts below
@@ -4808,7 +4805,7 @@ class ArxivReferenceChecker:
 if 'arxiv' in url_part.lower():
 arxiv_url = url_part
 else:
-from utils.url_utils import clean_url_punctuation
+from refchecker.utils.url_utils import clean_url_punctuation
 url = clean_url_punctuation(url_part)
 else:
 # Fallback for other formats or malformed input
@@ -4829,7 +4826,7 @@ class ArxivReferenceChecker:
 if 'arxiv' in url_part.lower():
 arxiv_url = url_part
 else:
-from utils.url_utils import clean_url_punctuation
+from refchecker.utils.url_utils import clean_url_punctuation
 url = clean_url_punctuation(url_part)
 if len(parts) > 5:
 # For cases with more than 5 parts, combine the remaining parts as additional info
@@ -4966,7 +4963,7 @@ class ArxivReferenceChecker:
 if '*' in doi:
 doi = doi.split('*')[0]

-from utils.doi_utils import construct_doi_url
+from refchecker.utils.doi_utils import construct_doi_url
 url = construct_doi_url(doi)
 break

@@ -4974,7 +4971,7 @@ class ArxivReferenceChecker:
 if not url and not arxiv_url:
 url_match = re.search(r'https?://(?!arxiv\.org)[^\s,\)]+', ref_text)
 if url_match:
-from utils.url_utils import clean_url_punctuation
+from refchecker.utils.url_utils import clean_url_punctuation
 url = clean_url_punctuation(url_match.group(0))

 # Extract year
@@ -5023,7 +5020,7 @@ class ArxivReferenceChecker:
 logger.debug(f"Extracting bibliography for paper {paper_id}: {paper.title}")

 # Check if we can get BibTeX content for this paper (ArXiv or other sources)
-from utils.arxiv_utils import get_bibtex_content
+from refchecker.utils.arxiv_utils import get_bibtex_content
 bibtex_content = get_bibtex_content(paper)
 if bibtex_content:
 logger.debug(f"Found BibTeX content for {paper_id}, using structured bibliography")
@@ -5047,7 +5044,7 @@ class ArxivReferenceChecker:
 references = extract_latex_references(bibtex_content, None)

 # Validate the parsed references and fallback to LLM if needed
-from utils.text_utils import validate_parsed_references
+from refchecker.utils.text_utils import validate_parsed_references
 validation = validate_parsed_references(references)

 if not validation['is_valid']:
@@ -5372,9 +5369,9 @@ class ArxivReferenceChecker:
 # Print reference info in non-debug mode (improved formatting)
 raw_title = reference.get('title', 'Untitled')
 # Clean LaTeX commands from title for display
-from utils.text_utils import strip_latex_commands
+from refchecker.utils.text_utils import strip_latex_commands
 title = strip_latex_commands(raw_title)
-from utils.text_utils import format_authors_for_display
+from refchecker.utils.text_utils import format_authors_for_display
 authors = format_authors_for_display(reference.get('authors', []))
 year = reference.get('year', '')
 venue = reference.get('venue', '') or reference.get('journal', '')
@@ -5424,7 +5421,7 @@ class ArxivReferenceChecker:

 # Show DOI URL if available and different from what's already shown
 if external_ids.get('DOI'):
-from utils.doi_utils import construct_doi_url
+from refchecker.utils.doi_utils import construct_doi_url
 doi_url = construct_doi_url(external_ids['DOI'])
 if doi_url != verified_url_to_show and doi_url != url:
 print(f" DOI URL: {doi_url}")
@@ -5521,21 +5518,20 @@ class ArxivReferenceChecker:
 if verified_data and verified_data.get('url') and 'arxiv.org' not in verified_data['url']:
 return verified_data['url']

-# Second priority: Semantic Scholar URL from
-if verified_data and verified_data.get('
-
-return construct_semantic_scholar_url(verified_data['externalIds']['CorpusId'])
+# Second priority: Semantic Scholar URL from paperId (if no direct URL available)
+if verified_data and verified_data.get('paperId'):
+return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"

 # Third priority: DOI URL from verified data (more reliable than potentially wrong ArXiv URLs)
 if verified_data and verified_data.get('externalIds', {}).get('DOI'):
-from utils.doi_utils import construct_doi_url
+from refchecker.utils.doi_utils import construct_doi_url
 return construct_doi_url(verified_data['externalIds']['DOI'])

 # Fourth priority: ArXiv URL from verified data (but only if there's no ArXiv ID error)
 if verified_data and verified_data.get('externalIds', {}).get('ArXiv'):
 # Only show ArXiv URL as verified URL if there's no ArXiv ID mismatch
 if not self._has_arxiv_id_error(errors):
-from utils.url_utils import construct_arxiv_url
+from refchecker.utils.url_utils import construct_arxiv_url
 correct_arxiv_id = verified_data['externalIds']['ArXiv']
 return construct_arxiv_url(correct_arxiv_id)

@@ -5556,7 +5552,7 @@ class ArxivReferenceChecker:
 external_ids = verified_data.get('externalIds', {})
 if external_ids.get('ArXiv'):
 # Extract ArXiv ID from the URL using shared utility
-from utils.url_utils import extract_arxiv_id_from_url
+from refchecker.utils.url_utils import extract_arxiv_id_from_url
 url_arxiv_id = extract_arxiv_id_from_url(reference_url)
 if url_arxiv_id:
 correct_arxiv_id = external_ids['ArXiv']
@@ -5576,13 +5572,13 @@ class ArxivReferenceChecker:
 # Non-ArXiv URL, probably safe to use
 return reference_url

-def _get_fallback_url(self, external_ids):
+def _get_fallback_url(self, external_ids, verified_data=None):
 """Get fallback URL from external IDs (Semantic Scholar or DOI)"""
-
-
-return
+# Prefer paperId for Semantic Scholar URLs
+if verified_data and verified_data.get('paperId'):
+return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
 elif external_ids.get('DOI'):
-from utils.doi_utils import construct_doi_url
+from refchecker.utils.doi_utils import construct_doi_url
 return construct_doi_url(external_ids['DOI'])
 return None

@@ -5660,7 +5656,7 @@ class ArxivReferenceChecker:
 error_type = error.get('error_type') or error.get('warning_type') or error.get('info_type')
 error_details = error.get('error_details') or error.get('warning_details') or error.get('info_details', 'Unknown error')

-from utils.error_utils import print_labeled_multiline
+from refchecker.utils.error_utils import print_labeled_multiline

 if error_type == 'arxiv_id':
 print(f" ❌ {error_details}")
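
Beyond the import-path renames, the recurring functional change in refchecker.py above is that Semantic Scholar links are now built directly from a record's paperId instead of going through construct_semantic_scholar_url with externalIds['CorpusId']. A minimal sketch of the new fallback order, with an illustrative helper name and made-up record (neither is part of refchecker's API); the doi.org form is what construct_doi_url is expected to return:

from typing import Optional

def fallback_url(verified_data: Optional[dict], external_ids: dict) -> Optional[str]:
    """Prefer a paperId-based Semantic Scholar URL, then a DOI URL, else None."""
    if verified_data and verified_data.get('paperId'):
        return f"https://www.semanticscholar.org/paper/{verified_data['paperId']}"
    if external_ids.get('DOI'):
        return f"https://doi.org/{external_ids['DOI']}"
    return None

# Illustrative record only.
print(fallback_url({'paperId': 'abc123'}, {}))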
{llm → refchecker/llm}/providers.py

@@ -318,7 +318,23 @@ class GoogleProvider(LLMProvider, LLMProviderMixin):
 }
 )

-
+# Handle empty responses (content safety filter or other issues)
+if not response.candidates:
+logger.warning("Google API returned empty candidates (possibly content filtered)")
+return ""
+
+# Safely access the text
+try:
+return response.text or ""
+except (ValueError, AttributeError) as e:
+# response.text raises ValueError if multiple candidates or no text
+logger.warning(f"Could not get text from Google response: {e}")
+# Try to extract text from first candidate manually
+if response.candidates and hasattr(response.candidates[0], 'content'):
+content = response.candidates[0].content
+if hasattr(content, 'parts') and content.parts:
+return content.parts[0].text or ""
+return ""

 except Exception as e:
 logger.error(f"Google API call failed: {e}")
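
The providers.py hunk hardens GoogleProvider against filtered or empty replies instead of assuming response.text is always readable. A standalone sketch of the same guard, assuming the response object shape used in the diff (the function name is illustrative, not the package's API):

import logging

logger = logging.getLogger(__name__)

def safe_google_text(response) -> str:
    """Return the generated text, or "" when the reply was filtered or empty."""
    if not response.candidates:
        logger.warning("Google API returned empty candidates (possibly content filtered)")
        return ""
    try:
        # response.text raises ValueError when there is no valid text part
        return response.text or ""
    except (ValueError, AttributeError) as exc:
        logger.warning("Could not get text from Google response: %s", exc)
        content = getattr(response.candidates[0], "content", None)
        parts = getattr(content, "parts", None)
        return (parts[0].text or "") if parts else ""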
{services → refchecker/services}/pdf_processor.py

@@ -69,10 +69,30 @@ class PDFProcessor:
 with open(pdf_path, 'rb') as file:
 pdf_reader = pypdf.PdfReader(file)
 text = ""
+failed_pages = []

 for page_num in range(len(pdf_reader.pages)):
-
-
+try:
+page = pdf_reader.pages[page_num]
+page_text = page.extract_text()
+if page_text:
+text += page_text + "\n"
+except TypeError as e:
+# Handle pypdf errors like "NumberObject is not iterable"
+# which can occur with malformed PDF pages
+failed_pages.append(page_num + 1) # 1-indexed for logging
+logger.warning(f"Skipping page {page_num + 1} due to PDF parsing error: {e}")
+continue
+except Exception as e:
+failed_pages.append(page_num + 1)
+logger.warning(f"Error extracting text from page {page_num + 1}: {e}")
+continue
+
+if failed_pages:
+logger.warning(f"Failed to extract text from {len(failed_pages)} pages: {failed_pages[:10]}{'...' if len(failed_pages) > 10 else ''}")
+
+if not text.strip():
+raise ValueError(f"No text could be extracted from any pages of {pdf_path}")

 # Cache the result
 self.cache[pdf_path] = text