academic-refchecker 2.0.19__py3-none-any.whl → 2.0.21__py3-none-any.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/METADATA +74 -32
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/RECORD +14 -14
- backend/database.py +126 -5
- backend/main.py +450 -3
- backend/refchecker_wrapper.py +109 -19
- refchecker/__version__.py +1 -1
- refchecker/checkers/semantic_scholar.py +2 -2
- refchecker/core/refchecker.py +46 -0
- refchecker/services/pdf_processor.py +156 -1
- refchecker/utils/text_utils.py +3 -1
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/WHEEL +0 -0
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/entry_points.txt +0 -0
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/licenses/LICENSE +0 -0
- {academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/top_level.txt +0 -0
backend/refchecker_wrapper.py
CHANGED
@@ -7,10 +7,18 @@ import re
 import asyncio
 import logging
 import tempfile
+import time
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Any, Optional, Callable
 from pathlib import Path
 
+# Debug file logging
+DEBUG_LOG_FILE = Path(tempfile.gettempdir()) / "refchecker_debug.log"
+def debug_log(msg: str):
+    from datetime import datetime
+    with open(DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
+        f.write(f"{datetime.now().strftime('%H:%M:%S.%f')[:12]} {msg}\n")
+
 # Add src to path to import refchecker when running from source
 # This is only needed when not installed as a package
 _src_path = str(Path(__file__).parent.parent / "src")
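The new `debug_log` helper appends timestamped lines to a file in the system temp directory. A minimal standalone sketch of the same pattern (the `refchecker_debug.log` filename is taken from the diff; everything else is standard library):

```python
import tempfile
from pathlib import Path
from datetime import datetime

DEBUG_LOG_FILE = Path(tempfile.gettempdir()) / "refchecker_debug.log"

def debug_log(msg: str) -> None:
    # Append a line like "14:03:07.123 message" to the shared debug file.
    # Opening in append mode per call keeps the helper usable from any
    # thread, at the cost of one open/close per message.
    with open(DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
        f.write(f"{datetime.now().strftime('%H:%M:%S.%f')[:12]} {msg}\n")

debug_log("[TIMING] example message")
print(f"wrote to {DEBUG_LOG_FILE}")
```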
@@ -89,7 +97,8 @@ class ProgressRefChecker:
                  cancel_event: Optional[asyncio.Event] = None,
                  check_id: Optional[int] = None,
                  title_update_callback: Optional[Callable] = None,
-                 bibliography_source_callback: Optional[Callable] = None):
+                 bibliography_source_callback: Optional[Callable] = None,
+                 semantic_scholar_api_key: Optional[str] = None):
         """
         Initialize the progress-aware refchecker
 
@@ -135,8 +144,12 @@ class ProgressRefChecker:
             logger.error(f"Failed to initialize LLM: {e}")
 
         # Initialize reference checker
+        # Use provided API key, fall back to environment variable
+        ss_api_key = semantic_scholar_api_key or os.getenv('SEMANTIC_SCHOLAR_API_KEY')
+        if ss_api_key:
+            logger.info("Semantic Scholar API key configured")
         self.checker = EnhancedHybridReferenceChecker(
-            semantic_scholar_api_key=os.getenv('SEMANTIC_SCHOLAR_API_KEY'),
+            semantic_scholar_api_key=ss_api_key,
             debug_mode=False
         )
 
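The key-resolution change follows a common pattern: prefer an explicitly passed credential, fall back to the environment. A minimal sketch (the function name here is illustrative, not from the package):

```python
import os
from typing import Optional

def resolve_api_key(explicit_key: Optional[str] = None) -> Optional[str]:
    # An explicitly provided key wins; otherwise consult the environment.
    # Returns None when neither is set, which the caller treats as
    # "use the API without authentication".
    return explicit_key or os.getenv("SEMANTIC_SCHOLAR_API_KEY")

print(resolve_api_key("sk-example"))  # -> "sk-example"
print(resolve_api_key())              # -> value of $SEMANTIC_SCHOLAR_API_KEY, or None
```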
@@ -291,7 +304,7 @@ class ProgressRefChecker:
             "authoritative_urls": authoritative_urls,
             "corrected_reference": None
         }
-        logger.debug(f"_format_verification_result output: suggestions={formatted_suggestions}, status={status}")
+        logger.info(f"_format_verification_result output: suggestions={formatted_suggestions}, status={status}")
         return result
 
     def _format_error_result(
@@ -394,16 +407,32 @@ class ProgressRefChecker:
 
                 await asyncio.to_thread(download_pdf_url)
 
-                # Extract title from PDF filename or URL
-                from urllib.parse import urlparse, unquote
-                url_path = urlparse(paper_source).path
-                pdf_filename = unquote(url_path.split('/')[-1])
-                paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
-                await update_title_if_needed(paper_title)
-
                 extraction_method = 'pdf'
                 pdf_processor = PDFProcessor()
                 paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, pdf_path)
+
+                # Try to extract the paper title from the PDF content
+                try:
+                    extracted_title = await asyncio.to_thread(pdf_processor.extract_title_from_pdf, pdf_path)
+                    if extracted_title:
+                        paper_title = extracted_title
+                        await update_title_if_needed(paper_title)
+                        logger.info(f"Extracted title from PDF URL: {paper_title}")
+                    else:
+                        # Fallback to URL filename
+                        from urllib.parse import urlparse, unquote
+                        url_path = urlparse(paper_source).path
+                        pdf_filename = unquote(url_path.split('/')[-1])
+                        paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
+                        await update_title_if_needed(paper_title)
+                except Exception as e:
+                    logger.warning(f"Could not extract title from PDF: {e}")
+                    # Fallback to URL filename
+                    from urllib.parse import urlparse, unquote
+                    url_path = urlparse(paper_source).path
+                    pdf_filename = unquote(url_path.split('/')[-1])
+                    paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
+                    await update_title_if_needed(paper_title)
             else:
                 # Handle ArXiv URLs/IDs
                 arxiv_id = extract_arxiv_id_from_url(paper_source)
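The fallback branch derives a human-readable title from the URL path when PDF-based extraction fails. The transformation itself is small enough to sketch in isolation:

```python
from urllib.parse import urlparse, unquote

def title_from_pdf_url(url: str) -> str:
    # Take the last path segment, strip the extension, and turn
    # common filename separators back into spaces.
    url_path = urlparse(url).path
    pdf_filename = unquote(url_path.split('/')[-1])
    return pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')

print(title_from_pdf_url("https://example.org/papers/attention_is-all-you-need.pdf"))
# -> "attention is all you need"
```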
@@ -467,14 +496,22 @@ class ProgressRefChecker:
                 })
 
             # Handle uploaded file - run PDF processing in thread
-            # Note: paper_title is already set to the original filename in main.py
-            # so we don't update it here
             if paper_source.lower().endswith('.pdf'):
                 # PDF extraction requires LLM for reliable reference extraction
                 if not self.llm:
                     raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
                 pdf_processor = PDFProcessor()
                 paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, paper_source)
+
+                # Try to extract the paper title from the PDF
+                try:
+                    extracted_title = await asyncio.to_thread(pdf_processor.extract_title_from_pdf, paper_source)
+                    if extracted_title:
+                        paper_title = extracted_title
+                        await update_title_if_needed(paper_title)
+                        logger.info(f"Extracted title from PDF: {paper_title}")
+                except Exception as e:
+                    logger.warning(f"Could not extract title from PDF: {e}")
             elif paper_source.lower().endswith(('.tex', '.txt', '.bib')):
                 def read_file():
                     with open(paper_source, 'r', encoding='utf-8') as f:
@@ -808,6 +845,11 @@ class ProgressRefChecker:
             return []
         if refs:
             logger.info(f"Extracted {len(refs)} references via CLI parser")
+            # DEBUG: Log problematic references where year looks like title
+            for idx, ref in enumerate(refs):
+                title = ref.get('title', '')
+                if title and (title.isdigit() or len(title) < 10):
+                    debug_log(f"PARSE ISSUE ref {idx+1}: title='{title}' authors={ref.get('authors', [])[:2]} year={ref.get('year')}")
             # Normalize field names (journal -> venue)
             refs = [_normalize_reference_fields(ref) for ref in refs]
             return refs
@@ -853,7 +895,16 @@ class ProgressRefChecker:
             try:
                 llm_refs = await asyncio.to_thread(cli_checker.llm_extractor.extract_references, bibtex_content)
                 if llm_refs:
+                    # DEBUG: Log raw LLM output
+                    debug_log(f"LLM raw output ({len(llm_refs)} refs):")
+                    for i, r in enumerate(llm_refs[:5]):
+                        debug_log(f"  [{i+1}] {str(r)[:150]}")
                     processed_refs = await asyncio.to_thread(cli_checker._process_llm_extracted_references, llm_refs)
+                    # DEBUG: Log processed refs with potential issues
+                    for idx, ref in enumerate(processed_refs):
+                        title = ref.get('title', '')
+                        if title and (title.isdigit() or len(title) < 10):
+                            debug_log(f"PARSE ISSUE after LLM ref {idx+1}: title='{title}' authors={ref.get('authors', [])[:2]}")
                     llm_validation = await asyncio.to_thread(validate_parsed_references, processed_refs)
                     if llm_validation['quality_score'] > validation['quality_score']:
                         logger.info(f"LLM extraction improved quality ({llm_validation['quality_score']:.2f})")
@@ -916,7 +967,11 @@ class ProgressRefChecker:
             # Run verification with timeout (handled by caller)
             verified_data, errors, url = self.checker.verify_reference(reference)
             return self._format_verification_result(reference, index, verified_data, errors, url)
-
+        except UnicodeEncodeError as e:
+            # Handle Windows encoding issues with special characters (e.g., Greek letters in titles)
+            logger.warning(f"Unicode encoding error checking reference {index}: {e}")
+            return self._format_error_result(reference, index,
+                Exception(f"Unicode encoding error - title may contain special characters"))
         except Exception as e:
             logger.error(f"Error checking reference {index}: {e}")
             return self._format_error_result(reference, index, e)
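Catching `UnicodeEncodeError` separately matters on Windows, where a cp1252 code path cannot represent characters such as Greek letters. A hedged illustration of the failure mode the new handler absorbs:

```python
# Illustration only: shows why titles with Greek letters can raise
# UnicodeEncodeError on a cp1252 code path (typical on Windows consoles).
title = "β-VAE: Learning Basic Visual Concepts"
try:
    title.encode("cp1252")
except UnicodeEncodeError as e:
    print(f"Unicode encoding error - title may contain special characters: {e}")
```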
@@ -938,13 +993,22 @@ class ProgressRefChecker:
         from .database import db
 
         # Check cache first
+        cache_start = time.time()
         cached_result = await db.get_cached_verification(reference)
+        cache_time = time.time() - cache_start
+        if cache_time > 0.1:
+            debug_log(f"[TIMING] Cache lookup for ref {idx + 1} took {cache_time:.3f}s")
         if cached_result:
             # Update the index to match current position
             cached_result['index'] = idx + 1
-
+            debug_log(f"Cache hit for reference {idx + 1} in {cache_time:.3f}s")
             return cached_result
 
+        # Log cache miss with details
+        title = reference.get('title', 'Unknown')[:60]
+        authors = reference.get('authors', [])[:2]
+        debug_log(f"CACHE MISS for ref {idx + 1}: title='{title}' authors={authors}")
+
         limiter = get_limiter()
 
         # Wait for a slot in the global queue
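The cache instrumentation only reports lookups that exceed a threshold, keeping the debug log quiet for fast hits. A minimal sketch of that wrap-and-threshold pattern (the 0.1-second value comes from the diff; the wrapper itself is illustrative):

```python
import time

SLOW_THRESHOLD = 0.1  # seconds; matches the threshold used in the diff

def timed_lookup(fn, *args):
    # Run any lookup and report it only when it exceeds the threshold.
    start = time.time()
    result = fn(*args)
    elapsed = time.time() - start
    if elapsed > SLOW_THRESHOLD:
        print(f"[TIMING] lookup took {elapsed:.3f}s")
    return result

timed_lookup(time.sleep, 0.2)  # deliberately slow, so the timing line prints
```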
@@ -961,7 +1025,6 @@ class ProgressRefChecker:
 
         try:
             # Run the sync check in a thread
-            # Use 240 second timeout to allow for ArXiv rate limiting with version checking
             result = await asyncio.wait_for(
                 loop.run_in_executor(
                     None,  # Use default executor
@@ -969,7 +1032,7 @@ class ProgressRefChecker:
                     reference,
                     idx + 1
                 ),
-                timeout=240.0
+                timeout=120.0  # 2 minute timeout per reference
             )
         except asyncio.TimeoutError:
             result = {
@@ -982,7 +1045,7 @@ class ProgressRefChecker:
                 "status": "error",
                 "errors": [{
                     "error_type": "timeout",
-                    "error_details": "Verification timed out after 240 seconds"
+                    "error_details": "Verification timed out after 120 seconds"
                 }],
                 "warnings": [],
                 "authoritative_urls": [],
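The timeout handling wraps each per-reference check in `asyncio.wait_for` and converts expiry into an error-shaped result. A self-contained sketch of that shape (the 120-second value comes from the diff; the worker here is a stand-in):

```python
import asyncio

async def check_reference(idx: int) -> dict:
    await asyncio.sleep(0.2)  # stand-in for a slow verification call
    return {"index": idx, "status": "verified"}

async def check_with_timeout(idx: int, timeout: float) -> dict:
    try:
        return await asyncio.wait_for(check_reference(idx), timeout=timeout)
    except asyncio.TimeoutError:
        # Shape mirrors the error result built in the diff.
        return {
            "index": idx,
            "status": "error",
            "errors": [{
                "error_type": "timeout",
                "error_details": f"Verification timed out after {int(timeout)} seconds",
            }],
        }

print(asyncio.run(check_with_timeout(1, timeout=0.05)))  # -> timeout error result
print(asyncio.run(check_with_timeout(2, timeout=1.0)))   # -> verified result
```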
@@ -1045,6 +1108,9 @@ class ProgressRefChecker:
 
         loop = asyncio.get_event_loop()
 
+        start_time = time.time()
+        debug_log(f"[TIMING] Starting parallel check of {total_refs} references")
+
         # Create tasks for all references - they will be rate-limited by the global semaphore
         tasks = []
         for idx, ref in enumerate(references):
@@ -1054,11 +1120,18 @@ class ProgressRefChecker:
             )
             tasks.append((idx, task))
 
+        task_creation_time = time.time()
+        debug_log(f"[TIMING] Tasks created in {task_creation_time - start_time:.3f}s")
+
         # Process results as they complete
         pending_tasks = {task for _, task in tasks}
         task_to_idx = {task: idx for idx, task in tasks}
 
+        iteration = 0
         while pending_tasks:
+            iteration += 1
+            iter_start = time.time()
+
             # Check for cancellation
             try:
                 await self._check_cancelled()
@@ -1068,13 +1141,15 @@ class ProgressRefChecker:
                     task.cancel()
                 raise
 
-            # Wait for some tasks to complete
+            # Wait for some tasks to complete - no timeout needed, just wait for first completed
             done, pending_tasks = await asyncio.wait(
                 pending_tasks,
-                timeout=0.5,
                 return_when=asyncio.FIRST_COMPLETED
             )
 
+            wait_time = time.time() - iter_start
+            debug_log(f"[TIMING] Iteration {iteration}: wait took {wait_time:.3f}s, {len(done)} done, {len(pending_tasks)} pending")
+
             for task in done:
                 idx = task_to_idx[task]
 
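Dropping the 0.5-second polling timeout relies on `asyncio.wait` returning as soon as any task finishes. A minimal sketch of that consume-as-completed loop:

```python
import asyncio
import random

async def worker(idx: int) -> int:
    await asyncio.sleep(random.uniform(0.01, 0.1))
    return idx

async def main() -> None:
    pending = {asyncio.create_task(worker(i)) for i in range(5)}
    while pending:
        # Wakes up as soon as at least one task finishes - no polling timeout.
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            print(f"finished {task.result()}, {len(pending)} pending")

asyncio.run(main())
```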
@@ -1147,6 +1222,7 @@ class ProgressRefChecker:
                     refs_with_warnings_only += 1
 
                 # Emit result immediately
+                emit_start = time.time()
                 await self.emit_progress("reference_result", result)
                 await self.emit_progress("progress", {
                     "current": processed_count,
@@ -1165,6 +1241,20 @@ class ProgressRefChecker:
                     "refs_verified": refs_verified,
                     "progress_percent": round((processed_count / total_refs) * 100, 1)
                 })
+                emit_time = time.time() - emit_start
+                if emit_time > 0.1:
+                    debug_log(f"[TIMING] Emit for ref {idx + 1} took {emit_time:.3f}s")
+
+                # Yield to event loop to allow WebSocket messages to flush
+                # This prevents stalls when many cache hits complete rapidly
+                await asyncio.sleep(0)
+
+        total_time = time.time() - start_time
+        debug_log(f"[TIMING] Total parallel check completed in {total_time:.3f}s for {total_refs} refs")
+
+        # Small delay to ensure all WebSocket messages are sent before returning
+        # This prevents the 'completed' event from arriving before final progress updates
+        await asyncio.sleep(0.1)
 
         # Convert dict to ordered list
         results_list = [results.get(i) for i in range(total_refs)]
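`await asyncio.sleep(0)` suspends the coroutine for exactly one event-loop turn, letting queued callbacks (such as WebSocket sends) run between results. A small demonstration of the effect:

```python
import asyncio

async def sender(queue: asyncio.Queue) -> None:
    while True:
        msg = await queue.get()
        print(f"flushed: {msg}")

async def producer(queue: asyncio.Queue) -> None:
    for i in range(3):
        queue.put_nowait(f"progress {i}")
        # Without this yield, all three messages would sit in the queue
        # until the producer finished its loop.
        await asyncio.sleep(0)

async def main() -> None:
    queue: asyncio.Queue = asyncio.Queue()
    task = asyncio.create_task(sender(queue))
    await producer(queue)
    await asyncio.sleep(0.1)  # give the sender time to drain, as the diff does
    task.cancel()

asyncio.run(main())
```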
refchecker/__version__.py
CHANGED
-__version__ = "2.0.19"
+__version__ = "2.0.21"
refchecker/checkers/semantic_scholar.py
CHANGED
@@ -63,8 +63,8 @@ class NonArxivReferenceChecker:
 
         # Rate limiting parameters
         self.request_delay = 1.0  # Initial delay between requests (seconds)
-        self.max_retries = 5
-        self.backoff_factor = 2
+        self.max_retries = 3  # Reduced from 5 to limit timeout accumulation
+        self.backoff_factor = 1.5  # Reduced from 2 for faster retries
 
         # Track API failures for Enhanced Hybrid Checker
         self._api_failed = False
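With `max_retries = 3` and `backoff_factor = 1.5`, the worst-case sleep across retries shrinks substantially versus the old 5/2.0 settings. A quick check of the geometric sums, assuming `delay = request_delay * backoff_factor ** attempt`, which is the usual pattern; the exact formula inside the checker is not shown in this diff:

```python
def total_backoff(request_delay: float, backoff_factor: float, max_retries: int) -> float:
    # Sum of delays for attempts 0..max_retries-1 under exponential backoff.
    return sum(request_delay * backoff_factor ** attempt for attempt in range(max_retries))

print(f"old: {total_backoff(1.0, 2.0, 5):.2f}s")  # 1 + 2 + 4 + 8 + 16 = 31.00s
print(f"new: {total_backoff(1.0, 1.5, 3):.2f}s")  # 1 + 1.5 + 2.25 = 4.75s
```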
refchecker/core/refchecker.py
CHANGED
@@ -4887,6 +4887,52 @@ class ArxivReferenceChecker:
         title = clean_title(title) if title else ""
         title = title.rstrip(',').strip()
 
+        # FIX: Detect malformed parsing for standards documents
+        # When title is just a year (e.g., "2023") and authors contains what looks like a title
+        # (common for ISO/SAE/PAS standards), swap them
+        if title and re.match(r'^(19|20)\d{2}$', title):
+            # Title is just a year - check if authors contains the actual title
+            if authors and len(authors) > 0:
+                # Join all author parts (sometimes title is split into multiple "authors")
+                combined_authors = ' '.join(authors) if isinstance(authors, list) else str(authors)
+                first_author = authors[0] if isinstance(authors, list) else str(authors)
+                # If first "author" looks like a title (contains certain keywords or is long)
+                standard_keywords = ['iso', 'sae', 'pas ', 'asam', 'arp', 'standard', 'specification',
+                                     'road vehicles', 'driving automation', 'guidelines', 'taxonomy']
+                if any(kw in combined_authors.lower() for kw in standard_keywords):
+                    logger.debug(f"Fixing malformed standard reference: swapping title '{title}' with author '{combined_authors[:60]}...'")
+                    # Move year to year field, combined authors to actual title
+                    year = int(title)
+                    title = combined_authors
+                    authors = []  # Standards typically don't have authors
+                elif len(first_author) > 40:
+                    # Long first "author" is likely a title
+                    logger.debug(f"Fixing likely malformed reference: swapping title '{title}' with author '{combined_authors[:60]}...'")
+                    year = int(title)
+                    title = combined_authors
+                    authors = []
+
+        # FIX: Detect when title is a publisher/organization name and authors contains the actual title
+        # Common publishers for standards: SAE International, BSI Standards, ISO, Beuth Verlag, etc.
+        publisher_patterns = ['sae international', 'bsi standards', 'beuth verlag', 'iso/', 'ieee',
+                              'acm', 'springer', 'elsevier', 'wiley', 'oxford university press',
+                              'cambridge university press', 'mit press', 'verlag', 'förderung']
+        title_lower = title.lower() if title else ''
+        if authors and len(authors) > 0:
+            combined_authors = ' '.join(authors) if isinstance(authors, list) else str(authors)
+            # Check if title looks like a short publisher name and authors looks like a real title
+            is_publisher = any(pub in title_lower for pub in publisher_patterns)
+            is_short_title = len(title) < 30
+            authors_look_like_title = any(kw in combined_authors.lower() for kw in
+                ['iso', 'sae', 'pas ', 'asam', 'arp', 'standard', 'specification', 'road vehicles',
+                 'driving automation', 'guidelines', 'taxonomy', 'openodd'])
+
+            if (is_publisher or (is_short_title and authors_look_like_title)) and len(combined_authors) > 20:
+                logger.debug(f"Fixing publisher-as-title: '{title}' -> '{combined_authors[:60]}...'")
+                venue = title  # Publisher becomes venue
+                title = combined_authors
+                authors = []
+
         # Clean up venue
         # Clean up venue - if venue is just a year, null it
         if venue and venue.isdigit() and len(venue) == 4 and venue.startswith(('19', '20')):
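The heart of this fix is the `^(19|20)\d{2}$` test: a "title" that is only a four-digit year signals that fields were shuffled during parsing. A reduced sketch of the swap (trimmed keyword list; the full logic is in the hunk above):

```python
import re

def fix_year_as_title(title: str, authors: list) -> tuple:
    # When the parser put the year in the title slot (common for
    # ISO/SAE/PAS standards), recover: year from title, title from authors.
    if title and re.match(r'^(19|20)\d{2}$', title):
        combined = ' '.join(authors)
        keywords = ['iso', 'sae', 'standard', 'specification', 'taxonomy']
        if any(kw in combined.lower() for kw in keywords):
            return combined, [], int(title)   # title, authors, year
    return title, authors, None

print(fix_year_as_title('2021', ['SAE J3016: Taxonomy and Definitions for Terms',
                                 'Related to Driving Automation Systems']))
```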
refchecker/services/pdf_processor.py
CHANGED
@@ -265,4 +265,159 @@ class PDFProcessor:
     def clear_cache(self):
         """Clear the text extraction cache"""
         self.cache.clear()
-        logger.debug("PDF text cache cleared")
+        logger.debug("PDF text cache cleared")
+
+    def extract_title_from_pdf(self, pdf_path: str) -> Optional[str]:
+        """
+        Extract the title from a PDF file.
+
+        First tries PDF metadata, then falls back to heuristic extraction
+        from the first page text.
+
+        Args:
+            pdf_path: Path to PDF file
+
+        Returns:
+            Extracted title or None if not found
+        """
+        if not os.path.exists(pdf_path):
+            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+        try:
+            import pypdf
+
+            with open(pdf_path, 'rb') as file:
+                pdf_reader = pypdf.PdfReader(file)
+
+                # Try PDF metadata first
+                metadata = pdf_reader.metadata
+                if metadata:
+                    title = metadata.get('/Title')
+                    if title and isinstance(title, str) and len(title.strip()) > 3:
+                        # Clean up the title
+                        title = title.strip()
+                        # Skip if it looks like a filename
+                        if not title.endswith(('.pdf', '.tex', '.dvi')) and title.lower() != 'untitled':
+                            logger.debug(f"Found title in PDF metadata: {title}")
+                            return title
+
+                # Fall back to extracting from first page text
+                if len(pdf_reader.pages) > 0:
+                    try:
+                        first_page_text = pdf_reader.pages[0].extract_text()
+                        if first_page_text:
+                            title = self._extract_title_from_text(first_page_text)
+                            if title:
+                                logger.debug(f"Extracted title from first page: {title}")
+                                return title
+                    except Exception as e:
+                        logger.warning(f"Error extracting title from first page: {e}")
+
+            return None
+
+        except ImportError:
+            logger.error("pypdf not installed. Install with: pip install pypdf")
+            raise
+        except Exception as e:
+            logger.warning(f"Error extracting title from PDF {pdf_path}: {e}")
+            return None
+
+    def _extract_title_from_text(self, text: str) -> Optional[str]:
+        """
+        Heuristically extract paper title from text (typically first page).
+
+        Academic papers typically have the title as one of the first prominent
+        text blocks, often followed by author names.
+
+        Args:
+            text: Text from first page of PDF
+
+        Returns:
+            Extracted title or None
+        """
+        if not text:
+            return None
+
+        import re
+
+        # Split into lines and clean
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+
+        if not lines:
+            return None
+
+        # Skip common header elements (conference names, page numbers, etc.)
+        header_patterns = [
+            r'^(proceedings|conference|journal|workshop|symposium)',
+            r'^(vol\.|volume|issue|no\.|number)',
+            r'^\d{1,4}\s*$',  # Page numbers
+            r'^(preprint|arxiv|draft)',
+            r'^(ieee|acm|springer|elsevier)',
+            r'^[a-z]+\s+\d{4}$',  # "January 2024" etc
+        ]
+
+        # Author indicators that typically follow the title
+        author_indicators = [
+            r'^[A-Z][a-z]+\s+[A-Z][a-z]+(\s*,|\s+and\s+)',  # "John Smith," or "John Smith and"
+            r'^[A-Z]\.\s*[A-Z][a-z]+',  # "J. Smith"
+            r'^[\w\s,]+@[\w\.-]+',  # Email addresses
+            r'^(university|department|institute|school|college)',
+            r'^\d+\s+[A-Z]',  # Addresses like "123 Main St"
+        ]
+
+        # Find potential title lines
+        title_candidates = []
+        for i, line in enumerate(lines[:15]):  # Only look at first 15 lines
+            # Skip empty or very short lines
+            if len(line) < 10:
+                continue
+
+            # Skip lines matching header patterns
+            is_header = any(re.search(pat, line, re.IGNORECASE) for pat in header_patterns)
+            if is_header:
+                continue
+
+            # Check if this looks like the start of author section
+            is_author_section = any(re.search(pat, line, re.IGNORECASE) for pat in author_indicators)
+            if is_author_section:
+                break  # Stop - we've passed the title
+
+            # Good candidate: reasonable length, not too long
+            if 15 <= len(line) <= 300:
+                title_candidates.append(line)
+
+                # If next line looks like authors, we found the title
+                if i + 1 < len(lines):
+                    next_line = lines[i + 1]
+                    if any(re.search(pat, next_line, re.IGNORECASE) for pat in author_indicators):
+                        break
+
+        if not title_candidates:
+            return None
+
+        # Take the first good candidate, or combine first few if they seem related
+        title = title_candidates[0]
+
+        # Sometimes titles span multiple lines - check if next line continues
+        if len(title_candidates) > 1:
+            second = title_candidates[1]
+            # If second line is short and starts with lowercase or continues sentence
+            if len(second) < 80 and (second[0].islower() or title.endswith(':')):
+                title = title + ' ' + second
+
+        # Clean up the title
+        title = re.sub(r'\s+', ' ', title).strip()
+
+        # Remove common artifacts
+        title = re.sub(r'^\d+\s*', '', title)  # Leading numbers
+        title = re.sub(r'\s*\*+\s*$', '', title)  # Trailing asterisks
+
+        # Validate: title should have reasonable characteristics
+        if len(title) < 15 or len(title) > 350:
+            return None
+
+        # Should have some letters (not just numbers/symbols)
+        if not re.search(r'[a-zA-Z]{3,}', title):
+            return None
+
+        return title
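Assuming the package layout implied by the diff, the new method can be exercised directly. A hedged usage sketch; the import path is inferred from the file location and may differ in the installed package:

```python
# Hypothetical usage - the import path is inferred from the diff's file layout.
from refchecker.services.pdf_processor import PDFProcessor

processor = PDFProcessor()
title = processor.extract_title_from_pdf("paper.pdf")  # metadata first, then first-page heuristics
if title:
    print(f"Title: {title}")
else:
    print("No title found; the caller falls back to the filename")
```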
refchecker/utils/text_utils.py
CHANGED
@@ -6,6 +6,7 @@ Text processing utilities for ArXiv Reference Checker
 import re
 import logging
 import unicodedata
+import html
 from typing import List
 
 logger = logging.getLogger(__name__)
@@ -5088,7 +5089,8 @@ def normalize_venue_for_display(venue: str) -> str:
 
         return text_lower
 
-    venue_text = venue.strip()
+    # Decode any HTML entities (e.g., "&amp;" -> "&") before further cleaning
+    venue_text = html.unescape(venue).strip()
 
     # Strip leading editor name lists like "..., editors, Venue ..." or "..., eds., Venue ..."
     # This prevents author/editor lists from being treated as venue
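`html.unescape` is a standard-library call that reverses entity encoding, which matters for venue strings scraped from HTML sources:

```python
import html

venue = "Advances in Neural Information Processing Systems &amp; Workshops"
print(html.unescape(venue).strip())
# -> "Advances in Neural Information Processing Systems & Workshops"
```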
{academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/WHEEL
RENAMED
File without changes
{academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/entry_points.txt
RENAMED
File without changes
{academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/licenses/LICENSE
RENAMED
File without changes
{academic_refchecker-2.0.19.dist-info → academic_refchecker-2.0.21.dist-info}/top_level.txt
RENAMED
File without changes