academic-refchecker 2.0.19-py3-none-any.whl → 2.0.21-py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
@@ -7,10 +7,18 @@ import re
  import asyncio
  import logging
  import tempfile
+ import time
  from concurrent.futures import ThreadPoolExecutor
  from typing import List, Dict, Any, Optional, Callable
  from pathlib import Path

+ # Debug file logging
+ DEBUG_LOG_FILE = Path(tempfile.gettempdir()) / "refchecker_debug.log"
+ def debug_log(msg: str):
+     from datetime import datetime
+     with open(DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
+         f.write(f"{datetime.now().strftime('%H:%M:%S.%f')[:12]} {msg}\n")
+
  # Add src to path to import refchecker when running from source
  # This is only needed when not installed as a package
  _src_path = str(Path(__file__).parent.parent / "src")
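
Note: the new debug_log helper appends timestamped lines to refchecker_debug.log in the system temp directory, bypassing the logging configuration entirely. A minimal standalone sketch of its behavior (the sample message and output are illustrative):

    import tempfile
    from pathlib import Path

    DEBUG_LOG_FILE = Path(tempfile.gettempdir()) / "refchecker_debug.log"

    def debug_log(msg: str):
        from datetime import datetime
        with open(DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
            f.write(f"{datetime.now().strftime('%H:%M:%S.%f')[:12]} {msg}\n")

    debug_log("[TIMING] example entry")
    print(DEBUG_LOG_FILE.read_text())  # e.g. "14:03:07.482 [TIMING] example entry"

Since the file is opened and closed on every call, this is cheap to sprinkle for debugging but not suited to hot loops.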
@@ -89,7 +97,8 @@ class ProgressRefChecker:
           cancel_event: Optional[asyncio.Event] = None,
           check_id: Optional[int] = None,
           title_update_callback: Optional[Callable] = None,
-          bibliography_source_callback: Optional[Callable] = None):
+          bibliography_source_callback: Optional[Callable] = None,
+          semantic_scholar_api_key: Optional[str] = None):
      """
      Initialize the progress-aware refchecker

@@ -135,8 +144,12 @@ class ProgressRefChecker:
      logger.error(f"Failed to initialize LLM: {e}")

  # Initialize reference checker
+ # Use provided API key, fall back to environment variable
+ ss_api_key = semantic_scholar_api_key or os.getenv('SEMANTIC_SCHOLAR_API_KEY')
+ if ss_api_key:
+     logger.info("Semantic Scholar API key configured")
  self.checker = EnhancedHybridReferenceChecker(
-     semantic_scholar_api_key=os.getenv('SEMANTIC_SCHOLAR_API_KEY'),
+     semantic_scholar_api_key=ss_api_key,
      debug_mode=False
  )

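
Note: the constructor now accepts the key directly and only falls back to the environment. A sketch of the precedence rule (the helper name is hypothetical):

    import os
    from typing import Optional

    def resolve_ss_api_key(explicit: Optional[str]) -> Optional[str]:
        # An explicitly passed key wins; the environment variable is the fallback
        return explicit or os.getenv('SEMANTIC_SCHOLAR_API_KEY')

    os.environ['SEMANTIC_SCHOLAR_API_KEY'] = 'env-key'
    assert resolve_ss_api_key(None) == 'env-key'     # falls back to env
    assert resolve_ss_api_key('ui-key') == 'ui-key'  # explicit wins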
@@ -291,7 +304,7 @@ class ProgressRefChecker:
      "authoritative_urls": authoritative_urls,
      "corrected_reference": None
  }
- logger.debug(f"_format_verification_result output: status={status}, errors={len(formatted_errors)}, warnings={len(formatted_warnings)}, suggestions={len(formatted_suggestions)}")
+ logger.info(f"_format_verification_result output: suggestions={formatted_suggestions}, status={status}")
  return result

  def _format_error_result(
@@ -394,16 +407,32 @@ class ProgressRefChecker:

      await asyncio.to_thread(download_pdf_url)

-     # Extract title from PDF filename or URL
-     from urllib.parse import urlparse, unquote
-     url_path = urlparse(paper_source).path
-     pdf_filename = unquote(url_path.split('/')[-1])
-     paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
-     await update_title_if_needed(paper_title)
-
      extraction_method = 'pdf'
      pdf_processor = PDFProcessor()
      paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, pdf_path)
+
+     # Try to extract the paper title from the PDF content
+     try:
+         extracted_title = await asyncio.to_thread(pdf_processor.extract_title_from_pdf, pdf_path)
+         if extracted_title:
+             paper_title = extracted_title
+             await update_title_if_needed(paper_title)
+             logger.info(f"Extracted title from PDF URL: {paper_title}")
+         else:
+             # Fallback to URL filename
+             from urllib.parse import urlparse, unquote
+             url_path = urlparse(paper_source).path
+             pdf_filename = unquote(url_path.split('/')[-1])
+             paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
+             await update_title_if_needed(paper_title)
+     except Exception as e:
+         logger.warning(f"Could not extract title from PDF: {e}")
+         # Fallback to URL filename
+         from urllib.parse import urlparse, unquote
+         url_path = urlparse(paper_source).path
+         pdf_filename = unquote(url_path.split('/')[-1])
+         paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
+         await update_title_if_needed(paper_title)
  else:
      # Handle ArXiv URLs/IDs
      arxiv_id = extract_arxiv_id_from_url(paper_source)
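
Note: title extraction from the PDF content now runs first, and the old filename-derived title survives only as a fallback. A sketch of the fallback path on its own (the function name is hypothetical):

    from urllib.parse import urlparse, unquote

    def title_from_pdf_url(url: str) -> str:
        # Derive a rough title from the last path segment of the URL
        path = urlparse(url).path
        filename = unquote(path.split('/')[-1])
        return filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')

    print(title_from_pdf_url("https://example.org/papers/deep_reference-checking.pdf"))
    # -> "deep reference checking"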
@@ -467,14 +496,22 @@ class ProgressRefChecker:
      })

  # Handle uploaded file - run PDF processing in thread
- # Note: paper_title is already set to the original filename in main.py
- # so we don't update it here
  if paper_source.lower().endswith('.pdf'):
      # PDF extraction requires LLM for reliable reference extraction
      if not self.llm:
          raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
      pdf_processor = PDFProcessor()
      paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, paper_source)
+
+     # Try to extract the paper title from the PDF
+     try:
+         extracted_title = await asyncio.to_thread(pdf_processor.extract_title_from_pdf, paper_source)
+         if extracted_title:
+             paper_title = extracted_title
+             await update_title_if_needed(paper_title)
+             logger.info(f"Extracted title from PDF: {paper_title}")
+     except Exception as e:
+         logger.warning(f"Could not extract title from PDF: {e}")
  elif paper_source.lower().endswith(('.tex', '.txt', '.bib')):
      def read_file():
          with open(paper_source, 'r', encoding='utf-8') as f:
@@ -808,6 +845,11 @@ class ProgressRefChecker:
      return []
  if refs:
      logger.info(f"Extracted {len(refs)} references via CLI parser")
+     # DEBUG: Log problematic references where year looks like title
+     for idx, ref in enumerate(refs):
+         title = ref.get('title', '')
+         if title and (title.isdigit() or len(title) < 10):
+             debug_log(f"PARSE ISSUE ref {idx+1}: title='{title}' authors={ref.get('authors', [])[:2]} year={ref.get('year')}")
      # Normalize field names (journal -> venue)
      refs = [_normalize_reference_fields(ref) for ref in refs]
      return refs
@@ -853,7 +895,16 @@ class ProgressRefChecker:
  try:
      llm_refs = await asyncio.to_thread(cli_checker.llm_extractor.extract_references, bibtex_content)
      if llm_refs:
+         # DEBUG: Log raw LLM output
+         debug_log(f"LLM raw output ({len(llm_refs)} refs):")
+         for i, r in enumerate(llm_refs[:5]):
+             debug_log(f"  [{i+1}] {str(r)[:150]}")
          processed_refs = await asyncio.to_thread(cli_checker._process_llm_extracted_references, llm_refs)
+         # DEBUG: Log processed refs with potential issues
+         for idx, ref in enumerate(processed_refs):
+             title = ref.get('title', '')
+             if title and (title.isdigit() or len(title) < 10):
+                 debug_log(f"PARSE ISSUE after LLM ref {idx+1}: title='{title}' authors={ref.get('authors', [])[:2]}")
          llm_validation = await asyncio.to_thread(validate_parsed_references, processed_refs)
          if llm_validation['quality_score'] > validation['quality_score']:
              logger.info(f"LLM extraction improved quality ({llm_validation['quality_score']:.2f})")
@@ -916,7 +967,11 @@ class ProgressRefChecker:
      # Run verification with timeout (handled by caller)
      verified_data, errors, url = self.checker.verify_reference(reference)
      return self._format_verification_result(reference, index, verified_data, errors, url)
-
+ except UnicodeEncodeError as e:
+     # Handle Windows encoding issues with special characters (e.g., Greek letters in titles)
+     logger.warning(f"Unicode encoding error checking reference {index}: {e}")
+     return self._format_error_result(reference, index,
+         Exception(f"Unicode encoding error - title may contain special characters"))
  except Exception as e:
      logger.error(f"Error checking reference {index}: {e}")
      return self._format_error_result(reference, index, e)
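
Note: on Windows, writing a title containing non-cp1252 characters (Greek letters, math symbols) to a cp1252-encoded stream raises UnicodeEncodeError; the new handler converts that into a per-reference error instead of failing the batch. A standalone reproduction of the failure mode (the sample title is invented):

    # Reproduces the encoding failure this handler guards against
    title = "Attention with α-divergence"
    try:
        title.encode("cp1252")  # stand-in for a cp1252 console/log stream
    except UnicodeEncodeError as e:
        print(f"caught: {e}")  # 'charmap' codec can't encode character '\u03b1' ...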
@@ -938,13 +993,22 @@ class ProgressRefChecker:
  from .database import db

  # Check cache first
+ cache_start = time.time()
  cached_result = await db.get_cached_verification(reference)
+ cache_time = time.time() - cache_start
+ if cache_time > 0.1:
+     debug_log(f"[TIMING] Cache lookup for ref {idx + 1} took {cache_time:.3f}s")
  if cached_result:
      # Update the index to match current position
      cached_result['index'] = idx + 1
-     logger.info(f"Cache hit for reference {idx + 1}: {reference.get('title', 'Unknown')[:50]}")
+     debug_log(f"Cache hit for reference {idx + 1} in {cache_time:.3f}s")
      return cached_result

+ # Log cache miss with details
+ title = reference.get('title', 'Unknown')[:60]
+ authors = reference.get('authors', [])[:2]
+ debug_log(f"CACHE MISS for ref {idx + 1}: title='{title}' authors={authors}")
+
  limiter = get_limiter()

  # Wait for a slot in the global queue
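
Note: cache lookups are now timed, and only lookups slower than 100 ms are logged. The same pattern, reduced to a reusable sketch (names and thresholds are illustrative):

    import asyncio
    import time

    async def timed(awaitable, label: str, threshold: float = 0.1):
        # Await the call and log only when it exceeds the threshold
        start = time.time()
        result = await awaitable
        elapsed = time.time() - start
        if elapsed > threshold:
            print(f"[TIMING] {label} took {elapsed:.3f}s")
        return result

    async def main():
        await timed(asyncio.sleep(0.2), "slow lookup")   # logged
        await timed(asyncio.sleep(0.01), "fast lookup")  # silent

    asyncio.run(main())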
@@ -961,7 +1025,6 @@ class ProgressRefChecker:

  try:
      # Run the sync check in a thread
-     # Use 240 second timeout to allow for ArXiv rate limiting with version checking
      result = await asyncio.wait_for(
          loop.run_in_executor(
              None,  # Use default executor
@@ -969,7 +1032,7 @@ class ProgressRefChecker:
              reference,
              idx + 1
          ),
-         timeout=240.0  # 4 minute timeout per reference (allows for rate-limited version checking)
+         timeout=120.0  # 2 minute timeout per reference
      )
  except asyncio.TimeoutError:
      result = {
@@ -982,7 +1045,7 @@ class ProgressRefChecker:
      "status": "error",
      "errors": [{
          "error_type": "timeout",
-         "error_details": "Verification timed out after 240 seconds"
+         "error_details": "Verification timed out after 120 seconds"
      }],
      "warnings": [],
      "authoritative_urls": [],
@@ -1045,6 +1108,9 @@ class ProgressRefChecker:

  loop = asyncio.get_event_loop()

+ start_time = time.time()
+ debug_log(f"[TIMING] Starting parallel check of {total_refs} references")
+
  # Create tasks for all references - they will be rate-limited by the global semaphore
  tasks = []
  for idx, ref in enumerate(references):
@@ -1054,11 +1120,18 @@ class ProgressRefChecker:
          )
          tasks.append((idx, task))

+ task_creation_time = time.time()
+ debug_log(f"[TIMING] Tasks created in {task_creation_time - start_time:.3f}s")
+
  # Process results as they complete
  pending_tasks = {task for _, task in tasks}
  task_to_idx = {task: idx for idx, task in tasks}

+ iteration = 0
  while pending_tasks:
+     iteration += 1
+     iter_start = time.time()
+
      # Check for cancellation
      try:
          await self._check_cancelled()
@@ -1068,13 +1141,15 @@ class ProgressRefChecker:
          task.cancel()
      raise

- # Wait for some tasks to complete
+ # Wait for some tasks to complete - no timeout needed, just wait for first completed
  done, pending_tasks = await asyncio.wait(
      pending_tasks,
-     timeout=0.5,
      return_when=asyncio.FIRST_COMPLETED
  )

+ wait_time = time.time() - iter_start
+ debug_log(f"[TIMING] Iteration {iteration}: wait took {wait_time:.3f}s, {len(done)} done, {len(pending_tasks)} pending")
+
  for task in done:
      idx = task_to_idx[task]

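
Note: dropping timeout=0.5 means asyncio.wait now blocks until at least one task actually finishes instead of waking every half second; cancellation is still checked once per completed batch. A self-contained sketch of the consumer loop:

    import asyncio
    import random

    async def worker(i: int) -> int:
        await asyncio.sleep(random.random())
        return i

    async def main():
        pending = {asyncio.create_task(worker(i)) for i in range(5)}
        while pending:
            # Returns as soon as one or more tasks complete
            done, pending = await asyncio.wait(pending,
                                               return_when=asyncio.FIRST_COMPLETED)
            for task in done:
                print("finished:", task.result())

    asyncio.run(main())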
@@ -1147,6 +1222,7 @@ class ProgressRefChecker:
          refs_with_warnings_only += 1

      # Emit result immediately
+     emit_start = time.time()
      await self.emit_progress("reference_result", result)
      await self.emit_progress("progress", {
          "current": processed_count,
@@ -1165,6 +1241,20 @@ class ProgressRefChecker:
          "refs_verified": refs_verified,
          "progress_percent": round((processed_count / total_refs) * 100, 1)
      })
+     emit_time = time.time() - emit_start
+     if emit_time > 0.1:
+         debug_log(f"[TIMING] Emit for ref {idx + 1} took {emit_time:.3f}s")
+
+     # Yield to event loop to allow WebSocket messages to flush
+     # This prevents stalls when many cache hits complete rapidly
+     await asyncio.sleep(0)
+
+ total_time = time.time() - start_time
+ debug_log(f"[TIMING] Total parallel check completed in {total_time:.3f}s for {total_refs} refs")
+
+ # Small delay to ensure all WebSocket messages are sent before returning
+ # This prevents the 'completed' event from arriving before final progress updates
+ await asyncio.sleep(0.1)

  # Convert dict to ordered list
  results_list = [results.get(i) for i in range(total_refs)]
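
Note: await asyncio.sleep(0) is the idiomatic way to yield one event-loop turn, which lets queued WebSocket sends flush between results that complete without awaiting I/O (such as cache hits); the final sleep(0.1) is a grace period for in-flight sends. A sketch of the yielding pattern (names are illustrative):

    import asyncio

    async def emitter(queue: asyncio.Queue):
        for i in range(3):
            queue.put_nowait(i)     # completes instantly, never awaits I/O...
            await asyncio.sleep(0)  # ...so yield explicitly to let the consumer run

    async def consumer(queue: asyncio.Queue):
        for _ in range(3):
            print("flushed", await queue.get())

    async def main():
        q: asyncio.Queue = asyncio.Queue()
        await asyncio.gather(emitter(q), consumer(q))

    asyncio.run(main())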
refchecker/__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for RefChecker."""

- __version__ = "2.0.19"
+ __version__ = "2.0.21"
@@ -63,8 +63,8 @@ class NonArxivReferenceChecker:

  # Rate limiting parameters
  self.request_delay = 1.0  # Initial delay between requests (seconds)
- self.max_retries = 5  # Sufficient for individual API calls
- self.backoff_factor = 2  # Exponential backoff factor
+ self.max_retries = 3  # Reduced from 5 to limit timeout accumulation
+ self.backoff_factor = 1.5  # Reduced from 2 for faster retries

  # Track API failures for Enhanced Hybrid Checker
  self._api_failed = False
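
Note: assuming the checker sleeps request_delay * backoff_factor**attempt between tries (the exact retry formula lives elsewhere in this class), the worst-case cumulative wait shrinks substantially:

    def total_backoff(base: float, factor: float, retries: int) -> float:
        # Sum of delays across all retry attempts
        return sum(base * factor ** i for i in range(retries))

    print(total_backoff(1.0, 2.0, 5))  # old: 1 + 2 + 4 + 8 + 16 = 31.0 s
    print(total_backoff(1.0, 1.5, 3))  # new: 1 + 1.5 + 2.25   = 4.75 s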
@@ -4887,6 +4887,52 @@ class ArxivReferenceChecker:
  title = clean_title(title) if title else ""
  title = title.rstrip(',').strip()

+ # FIX: Detect malformed parsing for standards documents
+ # When title is just a year (e.g., "2023") and authors contains what looks like a title
+ # (common for ISO/SAE/PAS standards), swap them
+ if title and re.match(r'^(19|20)\d{2}$', title):
+     # Title is just a year - check if authors contains the actual title
+     if authors and len(authors) > 0:
+         # Join all author parts (sometimes title is split into multiple "authors")
+         combined_authors = ' '.join(authors) if isinstance(authors, list) else str(authors)
+         first_author = authors[0] if isinstance(authors, list) else str(authors)
+         # If first "author" looks like a title (contains certain keywords or is long)
+         standard_keywords = ['iso', 'sae', 'pas ', 'asam', 'arp', 'standard', 'specification',
+                              'road vehicles', 'driving automation', 'guidelines', 'taxonomy']
+         if any(kw in combined_authors.lower() for kw in standard_keywords):
+             logger.debug(f"Fixing malformed standard reference: swapping title '{title}' with author '{combined_authors[:60]}...'")
+             # Move year to year field, combined authors to actual title
+             year = int(title)
+             title = combined_authors
+             authors = []  # Standards typically don't have authors
+         elif len(first_author) > 40:
+             # Long first "author" is likely a title
+             logger.debug(f"Fixing likely malformed reference: swapping title '{title}' with author '{combined_authors[:60]}...'")
+             year = int(title)
+             title = combined_authors
+             authors = []
+
+ # FIX: Detect when title is a publisher/organization name and authors contains the actual title
+ # Common publishers for standards: SAE International, BSI Standards, ISO, Beuth Verlag, etc.
+ publisher_patterns = ['sae international', 'bsi standards', 'beuth verlag', 'iso/', 'ieee',
+                       'acm', 'springer', 'elsevier', 'wiley', 'oxford university press',
+                       'cambridge university press', 'mit press', 'verlag', 'förderung']
+ title_lower = title.lower() if title else ''
+ if authors and len(authors) > 0:
+     combined_authors = ' '.join(authors) if isinstance(authors, list) else str(authors)
+     # Check if title looks like a short publisher name and authors looks like a real title
+     is_publisher = any(pub in title_lower for pub in publisher_patterns)
+     is_short_title = len(title) < 30
+     authors_look_like_title = any(kw in combined_authors.lower() for kw in
+         ['iso', 'sae', 'pas ', 'asam', 'arp', 'standard', 'specification', 'road vehicles',
+          'driving automation', 'guidelines', 'taxonomy', 'openodd'])
+
+     if (is_publisher or (is_short_title and authors_look_like_title)) and len(combined_authors) > 20:
+         logger.debug(f"Fixing publisher-as-title: '{title}' -> '{combined_authors[:60]}...'")
+         venue = title  # Publisher becomes venue
+         title = combined_authors
+         authors = []
+
  # Clean up venue
  # Clean up venue - if venue is just a year, null it
  if venue and venue.isdigit() and len(venue) == 4 and venue.startswith(('19', '20')):
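
Note: the first heuristic keys off a bare four-digit year in the title slot. A worked example of the swap on a synthetic standards entry (the input is invented for illustration):

    import re

    # Parser output resembling the failure case: the year landed in `title`,
    # the real title landed in `authors`
    title = "2021"
    authors = ["SAE J3016: Taxonomy and definitions for terms related to driving automation"]
    year = None

    if re.match(r'^(19|20)\d{2}$', title):
        combined = ' '.join(authors)
        if any(kw in combined.lower() for kw in ('sae', 'taxonomy', 'standard')):
            year, title, authors = int(title), combined, []

    print(year, '|', title)
    # 2021 | SAE J3016: Taxonomy and definitions for terms related to driving automation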
@@ -265,4 +265,159 @@ class PDFProcessor:
  def clear_cache(self):
      """Clear the text extraction cache"""
      self.cache.clear()
-     logger.debug("PDF text cache cleared")
+     logger.debug("PDF text cache cleared")
+
+ def extract_title_from_pdf(self, pdf_path: str) -> Optional[str]:
+     """
+     Extract the title from a PDF file.
+
+     First tries PDF metadata, then falls back to heuristic extraction
+     from the first page text.
+
+     Args:
+         pdf_path: Path to PDF file
+
+     Returns:
+         Extracted title or None if not found
+     """
+     if not os.path.exists(pdf_path):
+         raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+     try:
+         import pypdf
+
+         with open(pdf_path, 'rb') as file:
+             pdf_reader = pypdf.PdfReader(file)
+
+             # Try PDF metadata first
+             metadata = pdf_reader.metadata
+             if metadata:
+                 title = metadata.get('/Title')
+                 if title and isinstance(title, str) and len(title.strip()) > 3:
+                     # Clean up the title
+                     title = title.strip()
+                     # Skip if it looks like a filename
+                     if not title.endswith(('.pdf', '.tex', '.dvi')) and title.lower() != 'untitled':
+                         logger.debug(f"Found title in PDF metadata: {title}")
+                         return title
+
+             # Fall back to extracting from first page text
+             if len(pdf_reader.pages) > 0:
+                 try:
+                     first_page_text = pdf_reader.pages[0].extract_text()
+                     if first_page_text:
+                         title = self._extract_title_from_text(first_page_text)
+                         if title:
+                             logger.debug(f"Extracted title from first page: {title}")
+                             return title
+                 except Exception as e:
+                     logger.warning(f"Error extracting title from first page: {e}")
+
+         return None
+
+     except ImportError:
+         logger.error("pypdf not installed. Install with: pip install pypdf")
+         raise
+     except Exception as e:
+         logger.warning(f"Error extracting title from PDF {pdf_path}: {e}")
+         return None
+
+ def _extract_title_from_text(self, text: str) -> Optional[str]:
+     """
+     Heuristically extract paper title from text (typically first page).
+
+     Academic papers typically have the title as one of the first prominent
+     text blocks, often followed by author names.
+
+     Args:
+         text: Text from first page of PDF
+
+     Returns:
+         Extracted title or None
+     """
+     if not text:
+         return None
+
+     import re
+
+     # Split into lines and clean
+     lines = [line.strip() for line in text.split('\n') if line.strip()]
+
+     if not lines:
+         return None
+
+     # Skip common header elements (conference names, page numbers, etc.)
+     header_patterns = [
+         r'^(proceedings|conference|journal|workshop|symposium)',
+         r'^(vol\.|volume|issue|no\.|number)',
+         r'^\d{1,4}\s*$',  # Page numbers
+         r'^(preprint|arxiv|draft)',
+         r'^(ieee|acm|springer|elsevier)',
+         r'^[a-z]+\s+\d{4}$',  # "January 2024" etc
+     ]
+
+     # Author indicators that typically follow the title
+     author_indicators = [
+         r'^[A-Z][a-z]+\s+[A-Z][a-z]+(\s*,|\s+and\s+)',  # "John Smith," or "John Smith and"
+         r'^[A-Z]\.\s*[A-Z][a-z]+',  # "J. Smith"
+         r'^[\w\s,]+@[\w\.-]+',  # Email addresses
+         r'^(university|department|institute|school|college)',
+         r'^\d+\s+[A-Z]',  # Addresses like "123 Main St"
+     ]
+
+     # Find potential title lines
+     title_candidates = []
+     for i, line in enumerate(lines[:15]):  # Only look at first 15 lines
+         # Skip empty or very short lines
+         if len(line) < 10:
+             continue
+
+         # Skip lines matching header patterns
+         is_header = any(re.search(pat, line, re.IGNORECASE) for pat in header_patterns)
+         if is_header:
+             continue
+
+         # Check if this looks like the start of author section
+         is_author_section = any(re.search(pat, line, re.IGNORECASE) for pat in author_indicators)
+         if is_author_section:
+             break  # Stop - we've passed the title
+
+         # Good candidate: reasonable length, not too long
+         if 15 <= len(line) <= 300:
+             title_candidates.append(line)
+
+             # If next line looks like authors, we found the title
+             if i + 1 < len(lines):
+                 next_line = lines[i + 1]
+                 if any(re.search(pat, next_line, re.IGNORECASE) for pat in author_indicators):
+                     break
+
+     if not title_candidates:
+         return None
+
+     # Take the first good candidate, or combine first few if they seem related
+     title = title_candidates[0]
+
+     # Sometimes titles span multiple lines - check if next line continues
+     if len(title_candidates) > 1:
+         second = title_candidates[1]
+         # If second line is short and starts with lowercase or continues sentence
+         if len(second) < 80 and (second[0].islower() or title.endswith(':')):
+             title = title + ' ' + second
+
+     # Clean up the title
+     title = re.sub(r'\s+', ' ', title).strip()
+
+     # Remove common artifacts
+     title = re.sub(r'^\d+\s*', '', title)  # Leading numbers
+     title = re.sub(r'\s*\*+\s*$', '', title)  # Trailing asterisks
+
+     # Validate: title should have reasonable characteristics
+     if len(title) < 15 or len(title) > 350:
+         return None
+
+     # Should have some letters (not just numbers/symbols)
+     if not re.search(r'[a-zA-Z]{3,}', title):
+         return None
+
+     return title
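
Note: a hedged sketch of the heuristic on a synthetic first page (the sample text is invented, and PDFProcessor is assumed importable from the package's PDF utilities module, whose path this diff does not show). By the rules above, the "Proceedings" header is skipped, the two title lines are joined because the second starts lowercase, and the author line stops the scan:

    first_page = """Proceedings of the 41st Conference
    Reference Checking at Scale:
    a large-scale measurement study
    Jane Doe, John Smith
    University of Examples
    """

    processor = PDFProcessor()
    print(processor._extract_title_from_text(first_page))
    # -> "Reference Checking at Scale: a large-scale measurement study"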
@@ -6,6 +6,7 @@ Text processing utilities for ArXiv Reference Checker
  import re
  import logging
  import unicodedata
+ import html
  from typing import List

  logger = logging.getLogger(__name__)
@@ -5088,7 +5089,8 @@ def normalize_venue_for_display(venue: str) -> str:

          return text_lower

-     venue_text = venue.strip()
+     # Decode any HTML entities (e.g., "&amp;" -> "&") before further cleaning
+     venue_text = html.unescape(venue).strip()

      # Strip leading editor name lists like "..., editors, Venue ..." or "..., eds., Venue ..."
      # This prevents author/editor lists from being treated as venue
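
Note: venues scraped from web sources often arrive with HTML entities intact; unescaping first normalizes them before the editor-stripping and other cleanup below:

    import html

    print(html.unescape("Computers &amp; Security").strip())  # Computers & Security
    print(html.unescape("NeurIPS &#39;23"))                   # NeurIPS '23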