academic-refchecker 2.0.20__py3-none-any.whl → 2.0.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,10 +7,18 @@ import re
 import asyncio
 import logging
 import tempfile
+import time
 from concurrent.futures import ThreadPoolExecutor
 from typing import List, Dict, Any, Optional, Callable
 from pathlib import Path

+# Debug file logging
+DEBUG_LOG_FILE = Path(tempfile.gettempdir()) / "refchecker_debug.log"
+def debug_log(msg: str):
+    from datetime import datetime
+    with open(DEBUG_LOG_FILE, "a", encoding="utf-8") as f:
+        f.write(f"{datetime.now().strftime('%H:%M:%S.%f')[:12]} {msg}\n")
+
 # Add src to path to import refchecker when running from source
 # This is only needed when not installed as a package
 _src_path = str(Path(__file__).parent.parent / "src")
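
The new `debug_log` helper appends timestamped lines to `refchecker_debug.log` in the system temp directory, bypassing the logging framework entirely. A minimal, standalone sketch of inspecting that file (the path is derived the same way the module derives it):

```python
# Standalone sketch: locate and tail the debug log the new helper writes to.
import tempfile
from pathlib import Path

log_file = Path(tempfile.gettempdir()) / "refchecker_debug.log"
if log_file.exists():
    # Each line starts with the HH:MM:SS.mmm timestamp written by debug_log()
    for line in log_file.read_text(encoding="utf-8").splitlines()[-10:]:
        print(line)
```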
@@ -89,7 +97,8 @@ class ProgressRefChecker:
                  cancel_event: Optional[asyncio.Event] = None,
                  check_id: Optional[int] = None,
                  title_update_callback: Optional[Callable] = None,
-                 bibliography_source_callback: Optional[Callable] = None):
+                 bibliography_source_callback: Optional[Callable] = None,
+                 semantic_scholar_api_key: Optional[str] = None):
         """
         Initialize the progress-aware refchecker

@@ -135,8 +144,12 @@ class ProgressRefChecker:
             logger.error(f"Failed to initialize LLM: {e}")

         # Initialize reference checker
+        # Use provided API key, fall back to environment variable
+        ss_api_key = semantic_scholar_api_key or os.getenv('SEMANTIC_SCHOLAR_API_KEY')
+        if ss_api_key:
+            logger.info("Semantic Scholar API key configured")
         self.checker = EnhancedHybridReferenceChecker(
-            semantic_scholar_api_key=os.getenv('SEMANTIC_SCHOLAR_API_KEY'),
+            semantic_scholar_api_key=ss_api_key,
             debug_mode=False
         )

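The added `semantic_scholar_api_key` parameter takes precedence over the `SEMANTIC_SCHOLAR_API_KEY` environment variable via a plain `or` fallback. A minimal sketch of that precedence (hypothetical key values):

```python
# Sketch of the precedence introduced above: explicit argument first, then env.
import os
from typing import Optional

def resolve_key(explicit: Optional[str] = None) -> Optional[str]:
    return explicit or os.getenv('SEMANTIC_SCHOLAR_API_KEY')

os.environ['SEMANTIC_SCHOLAR_API_KEY'] = 'env-key'  # hypothetical value
assert resolve_key('ui-key') == 'ui-key'            # explicit argument wins
assert resolve_key() == 'env-key'                   # falls back to environment
```

Note that an empty string passed explicitly also falls through to the environment, since `or` tests truthiness rather than `None`.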
@@ -291,7 +304,7 @@ class ProgressRefChecker:
             "authoritative_urls": authoritative_urls,
             "corrected_reference": None
         }
-        logger.debug(f"_format_verification_result output: status={status}, errors={len(formatted_errors)}, warnings={len(formatted_warnings)}, suggestions={len(formatted_suggestions)}")
+        logger.info(f"_format_verification_result output: suggestions={formatted_suggestions}, status={status}")
         return result

     def _format_error_result(
@@ -394,16 +407,32 @@ class ProgressRefChecker:

             await asyncio.to_thread(download_pdf_url)

-            # Extract title from PDF filename or URL
-            from urllib.parse import urlparse, unquote
-            url_path = urlparse(paper_source).path
-            pdf_filename = unquote(url_path.split('/')[-1])
-            paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
-            await update_title_if_needed(paper_title)
-
             extraction_method = 'pdf'
             pdf_processor = PDFProcessor()
             paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, pdf_path)
+
+            # Try to extract the paper title from the PDF content
+            try:
+                extracted_title = await asyncio.to_thread(pdf_processor.extract_title_from_pdf, pdf_path)
+                if extracted_title:
+                    paper_title = extracted_title
+                    await update_title_if_needed(paper_title)
+                    logger.info(f"Extracted title from PDF URL: {paper_title}")
+                else:
+                    # Fallback to URL filename
+                    from urllib.parse import urlparse, unquote
+                    url_path = urlparse(paper_source).path
+                    pdf_filename = unquote(url_path.split('/')[-1])
+                    paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
+                    await update_title_if_needed(paper_title)
+            except Exception as e:
+                logger.warning(f"Could not extract title from PDF: {e}")
+                # Fallback to URL filename
+                from urllib.parse import urlparse, unquote
+                url_path = urlparse(paper_source).path
+                pdf_filename = unquote(url_path.split('/')[-1])
+                paper_title = pdf_filename.replace('.pdf', '').replace('_', ' ').replace('-', ' ')
+                await update_title_if_needed(paper_title)
         else:
             # Handle ArXiv URLs/IDs
             arxiv_id = extract_arxiv_id_from_url(paper_source)
@@ -467,14 +496,22 @@ class ProgressRefChecker:
         })

         # Handle uploaded file - run PDF processing in thread
-        # Note: paper_title is already set to the original filename in main.py
-        # so we don't update it here
         if paper_source.lower().endswith('.pdf'):
             # PDF extraction requires LLM for reliable reference extraction
             if not self.llm:
                 raise ValueError("PDF extraction requires an LLM to be configured. Please configure an LLM provider in settings.")
             pdf_processor = PDFProcessor()
             paper_text = await asyncio.to_thread(pdf_processor.extract_text_from_pdf, paper_source)
+
+            # Try to extract the paper title from the PDF
+            try:
+                extracted_title = await asyncio.to_thread(pdf_processor.extract_title_from_pdf, paper_source)
+                if extracted_title:
+                    paper_title = extracted_title
+                    await update_title_if_needed(paper_title)
+                    logger.info(f"Extracted title from PDF: {paper_title}")
+            except Exception as e:
+                logger.warning(f"Could not extract title from PDF: {e}")
         elif paper_source.lower().endswith(('.tex', '.txt', '.bib')):
             def read_file():
                 with open(paper_source, 'r', encoding='utf-8') as f:
@@ -808,6 +845,11 @@ class ProgressRefChecker:
                 return []
             if refs:
                 logger.info(f"Extracted {len(refs)} references via CLI parser")
+                # DEBUG: Log problematic references where year looks like title
+                for idx, ref in enumerate(refs):
+                    title = ref.get('title', '')
+                    if title and (title.isdigit() or len(title) < 10):
+                        debug_log(f"PARSE ISSUE ref {idx+1}: title='{title}' authors={ref.get('authors', [])[:2]} year={ref.get('year')}")
                 # Normalize field names (journal -> venue)
                 refs = [_normalize_reference_fields(ref) for ref in refs]
                 return refs
@@ -853,7 +895,16 @@ class ProgressRefChecker:
         try:
             llm_refs = await asyncio.to_thread(cli_checker.llm_extractor.extract_references, bibtex_content)
             if llm_refs:
+                # DEBUG: Log raw LLM output
+                debug_log(f"LLM raw output ({len(llm_refs)} refs):")
+                for i, r in enumerate(llm_refs[:5]):
+                    debug_log(f"  [{i+1}] {str(r)[:150]}")
                 processed_refs = await asyncio.to_thread(cli_checker._process_llm_extracted_references, llm_refs)
+                # DEBUG: Log processed refs with potential issues
+                for idx, ref in enumerate(processed_refs):
+                    title = ref.get('title', '')
+                    if title and (title.isdigit() or len(title) < 10):
+                        debug_log(f"PARSE ISSUE after LLM ref {idx+1}: title='{title}' authors={ref.get('authors', [])[:2]}")
                 llm_validation = await asyncio.to_thread(validate_parsed_references, processed_refs)
                 if llm_validation['quality_score'] > validation['quality_score']:
                     logger.info(f"LLM extraction improved quality ({llm_validation['quality_score']:.2f})")
@@ -916,7 +967,11 @@ class ProgressRefChecker:
             # Run verification with timeout (handled by caller)
             verified_data, errors, url = self.checker.verify_reference(reference)
             return self._format_verification_result(reference, index, verified_data, errors, url)
-
+        except UnicodeEncodeError as e:
+            # Handle Windows encoding issues with special characters (e.g., Greek letters in titles)
+            logger.warning(f"Unicode encoding error checking reference {index}: {e}")
+            return self._format_error_result(reference, index,
+                Exception("Unicode encoding error - title may contain special characters"))
         except Exception as e:
             logger.error(f"Error checking reference {index}: {e}")
             return self._format_error_result(reference, index, e)
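
The new `except UnicodeEncodeError` branch targets the failure mode where reference text containing non-Latin characters is encoded with a legacy Windows code page somewhere down the call chain. A minimal repro of that exception (assuming cp1252, the common legacy Windows code page):

```python
# Minimal repro of the guarded failure mode: cp1252 cannot represent Greek letters.
title = "λ-calculus and α-β pruning in theorem provers"  # hypothetical reference title
try:
    title.encode("cp1252")
except UnicodeEncodeError as e:
    print(f"Previously bubbled up as an unhandled error: {e}")
```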
@@ -938,13 +993,22 @@ class ProgressRefChecker:
         from .database import db

         # Check cache first
+        cache_start = time.time()
         cached_result = await db.get_cached_verification(reference)
+        cache_time = time.time() - cache_start
+        if cache_time > 0.1:
+            debug_log(f"[TIMING] Cache lookup for ref {idx + 1} took {cache_time:.3f}s")
         if cached_result:
             # Update the index to match current position
             cached_result['index'] = idx + 1
-            logger.info(f"Cache hit for reference {idx + 1}: {reference.get('title', 'Unknown')[:50]}")
+            debug_log(f"Cache hit for reference {idx + 1} in {cache_time:.3f}s")
             return cached_result

+        # Log cache miss with details
+        title = reference.get('title', 'Unknown')[:60]
+        authors = reference.get('authors', [])[:2]
+        debug_log(f"CACHE MISS for ref {idx + 1}: title='{title}' authors={authors}")
+
         limiter = get_limiter()

         # Wait for a slot in the global queue
@@ -961,7 +1025,6 @@ class ProgressRefChecker:

         try:
             # Run the sync check in a thread
-            # Use 240 second timeout to allow for ArXiv rate limiting with version checking
             result = await asyncio.wait_for(
                 loop.run_in_executor(
                     None,  # Use default executor
@@ -969,7 +1032,7 @@ class ProgressRefChecker:
                     reference,
                     idx + 1
                 ),
-                timeout=240.0  # 4 minute timeout per reference (allows for rate-limited version checking)
+                timeout=120.0  # 2 minute timeout per reference
             )
         except asyncio.TimeoutError:
             result = {
@@ -982,7 +1045,7 @@ class ProgressRefChecker:
                 "status": "error",
                 "errors": [{
                     "error_type": "timeout",
-                    "error_details": "Verification timed out after 240 seconds"
+                    "error_details": "Verification timed out after 120 seconds"
                 }],
                 "warnings": [],
                 "authoritative_urls": [],
@@ -1045,6 +1108,9 @@ class ProgressRefChecker:

         loop = asyncio.get_event_loop()

+        start_time = time.time()
+        debug_log(f"[TIMING] Starting parallel check of {total_refs} references")
+
         # Create tasks for all references - they will be rate-limited by the global semaphore
         tasks = []
         for idx, ref in enumerate(references):
@@ -1054,11 +1120,18 @@ class ProgressRefChecker:
             )
             tasks.append((idx, task))

+        task_creation_time = time.time()
+        debug_log(f"[TIMING] Tasks created in {task_creation_time - start_time:.3f}s")
+
         # Process results as they complete
         pending_tasks = {task for _, task in tasks}
         task_to_idx = {task: idx for idx, task in tasks}

+        iteration = 0
         while pending_tasks:
+            iteration += 1
+            iter_start = time.time()
+
             # Check for cancellation
             try:
                 await self._check_cancelled()
@@ -1068,13 +1141,15 @@ class ProgressRefChecker:
                     task.cancel()
                 raise

-            # Wait for some tasks to complete
+            # Wait for some tasks to complete - no timeout needed, just wait for first completed
             done, pending_tasks = await asyncio.wait(
                 pending_tasks,
-                timeout=0.5,
                 return_when=asyncio.FIRST_COMPLETED
             )

+            wait_time = time.time() - iter_start
+            debug_log(f"[TIMING] Iteration {iteration}: wait took {wait_time:.3f}s, {len(done)} done, {len(pending_tasks)} pending")
+
             for task in done:
                 idx = task_to_idx[task]

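Dropping `timeout=0.5` means `asyncio.wait` now blocks until at least one task actually finishes instead of waking twice a second to re-check; with `FIRST_COMPLETED` the loop still processes results as soon as they arrive. A self-contained sketch of the same pattern with hypothetical workers:

```python
# Consume-as-completed pattern: asyncio.wait with FIRST_COMPLETED and no timeout.
import asyncio
import random

async def verify(n: int) -> int:
    await asyncio.sleep(random.random())  # stand-in for a verification call
    return n

async def main():
    pending = {asyncio.create_task(verify(i)) for i in range(5)}
    while pending:
        done, pending = await asyncio.wait(pending, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            print("finished:", task.result())

asyncio.run(main())
```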
@@ -1147,6 +1222,7 @@ class ProgressRefChecker:
                     refs_with_warnings_only += 1

                 # Emit result immediately
+                emit_start = time.time()
                 await self.emit_progress("reference_result", result)
                 await self.emit_progress("progress", {
                     "current": processed_count,
@@ -1165,6 +1241,20 @@ class ProgressRefChecker:
1165
1241
  "refs_verified": refs_verified,
1166
1242
  "progress_percent": round((processed_count / total_refs) * 100, 1)
1167
1243
  })
1244
+ emit_time = time.time() - emit_start
1245
+ if emit_time > 0.1:
1246
+ debug_log(f"[TIMING] Emit for ref {idx + 1} took {emit_time:.3f}s")
1247
+
1248
+ # Yield to event loop to allow WebSocket messages to flush
1249
+ # This prevents stalls when many cache hits complete rapidly
1250
+ await asyncio.sleep(0)
1251
+
1252
+ total_time = time.time() - start_time
1253
+ debug_log(f"[TIMING] Total parallel check completed in {total_time:.3f}s for {total_refs} refs")
1254
+
1255
+ # Small delay to ensure all WebSocket messages are sent before returning
1256
+ # This prevents the 'completed' event from arriving before final progress updates
1257
+ await asyncio.sleep(0.1)
1168
1258
 
1169
1259
  # Convert dict to ordered list
1170
1260
  results_list = [results.get(i) for i in range(total_refs)]
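
The `await asyncio.sleep(0)` is a cooperative yield: it gives the event loop a chance to run queued WebSocket sends between results, which matters when many cache hits complete back to back without ever awaiting I/O. A toy illustration of the difference a zero-sleep yield makes (not RefChecker code):

```python
# Without the sleep(0) in busy_loop, flusher() would not run until the loop ends.
import asyncio

async def flusher():
    for _ in range(3):
        print("flushing queued messages")
        await asyncio.sleep(0)

async def busy_loop():
    for i in range(3):
        print(f"processed result {i}")
        await asyncio.sleep(0)  # yield so flusher() is scheduled between results

async def main():
    await asyncio.gather(busy_loop(), flusher())

asyncio.run(main())
```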
refchecker/__version__.py CHANGED
@@ -1,3 +1,3 @@
 """Version information for RefChecker."""

-__version__ = "2.0.20"
+__version__ = "2.0.22"
@@ -63,8 +63,8 @@ class NonArxivReferenceChecker:

         # Rate limiting parameters
         self.request_delay = 1.0  # Initial delay between requests (seconds)
-        self.max_retries = 5  # Sufficient for individual API calls
-        self.backoff_factor = 2  # Exponential backoff factor
+        self.max_retries = 3  # Reduced from 5 to limit timeout accumulation
+        self.backoff_factor = 1.5  # Reduced from 2 for faster retries

         # Track API failures for Enhanced Hybrid Checker
         self._api_failed = False
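
With `request_delay = 1.0`, the worst-case cumulative sleep across retries drops sharply: factor 2 over 5 retries can accumulate 1 + 2 + 4 + 8 + 16 = 31 s per call, while factor 1.5 over 3 retries caps at 1 + 1.5 + 2.25 = 4.75 s, assuming the delay is multiplied by `backoff_factor` after each failed attempt (the usual scheme these field names imply). A quick check:

```python
# Back-of-envelope worst-case sleep before giving up, under the assumption that
# the delay is multiplied by backoff_factor after each failed attempt.
def total_backoff(delay: float, factor: float, retries: int) -> float:
    return sum(delay * factor**i for i in range(retries))

print(total_backoff(1.0, 2.0, 5))   # 31.0  (old settings)
print(total_backoff(1.0, 1.5, 3))   # 4.75  (new settings)
```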
@@ -4887,6 +4887,52 @@ class ArxivReferenceChecker:
         title = clean_title(title) if title else ""
         title = title.rstrip(',').strip()

+        # FIX: Detect malformed parsing for standards documents
+        # When title is just a year (e.g., "2023") and authors contains what looks like a title
+        # (common for ISO/SAE/PAS standards), swap them
+        if title and re.match(r'^(19|20)\d{2}$', title):
+            # Title is just a year - check if authors contains the actual title
+            if authors and len(authors) > 0:
+                # Join all author parts (sometimes title is split into multiple "authors")
+                combined_authors = ' '.join(authors) if isinstance(authors, list) else str(authors)
+                first_author = authors[0] if isinstance(authors, list) else str(authors)
+                # If first "author" looks like a title (contains certain keywords or is long)
+                standard_keywords = ['iso', 'sae', 'pas ', 'asam', 'arp', 'standard', 'specification',
+                                     'road vehicles', 'driving automation', 'guidelines', 'taxonomy']
+                if any(kw in combined_authors.lower() for kw in standard_keywords):
+                    logger.debug(f"Fixing malformed standard reference: swapping title '{title}' with author '{combined_authors[:60]}...'")
+                    # Move year to year field, combined authors to actual title
+                    year = int(title)
+                    title = combined_authors
+                    authors = []  # Standards typically don't have authors
+                elif len(first_author) > 40:
+                    # Long first "author" is likely a title
+                    logger.debug(f"Fixing likely malformed reference: swapping title '{title}' with author '{combined_authors[:60]}...'")
+                    year = int(title)
+                    title = combined_authors
+                    authors = []
+
+        # FIX: Detect when title is a publisher/organization name and authors contains the actual title
+        # Common publishers for standards: SAE International, BSI Standards, ISO, Beuth Verlag, etc.
+        publisher_patterns = ['sae international', 'bsi standards', 'beuth verlag', 'iso/', 'ieee',
+                              'acm', 'springer', 'elsevier', 'wiley', 'oxford university press',
+                              'cambridge university press', 'mit press', 'verlag', 'förderung']
+        title_lower = title.lower() if title else ''
+        if authors and len(authors) > 0:
+            combined_authors = ' '.join(authors) if isinstance(authors, list) else str(authors)
+            # Check if title looks like a short publisher name and authors looks like a real title
+            is_publisher = any(pub in title_lower for pub in publisher_patterns)
+            is_short_title = len(title) < 30
+            authors_look_like_title = any(kw in combined_authors.lower() for kw in
+                ['iso', 'sae', 'pas ', 'asam', 'arp', 'standard', 'specification', 'road vehicles',
+                 'driving automation', 'guidelines', 'taxonomy', 'openodd'])
+
+            if (is_publisher or (is_short_title and authors_look_like_title)) and len(combined_authors) > 20:
+                logger.debug(f"Fixing publisher-as-title: '{title}' -> '{combined_authors[:60]}...'")
+                venue = title  # Publisher becomes venue
+                title = combined_authors
+                authors = []
+
         # Clean up venue
         # Clean up venue - if venue is just a year, null it
         if venue and venue.isdigit() and len(venue) == 4 and venue.startswith(('19', '20')):
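
This targets citations of standards such as ISO 21448 ("Road vehicles - Safety of the intended functionality"), where bibliography parsers often emit the year as the title and the document name as the author list. A reduced, standalone illustration of the swap:

```python
# Reduced illustration of the year-as-title repair (field names mirror the
# parsed-reference dicts used in this module).
import re

ref = {'title': '2022',
       'authors': ['Road vehicles - Safety of the intended functionality'],
       'year': None}
if re.match(r'^(19|20)\d{2}$', ref['title']):
    combined = ' '.join(ref['authors'])
    if 'road vehicles' in combined.lower():
        ref.update(year=int(ref['title']), title=combined, authors=[])
print(ref)  # title and year are now in the right fields; authors is empty
```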
@@ -265,4 +265,159 @@ class PDFProcessor:
     def clear_cache(self):
         """Clear the text extraction cache"""
         self.cache.clear()
-        logger.debug("PDF text cache cleared")
+        logger.debug("PDF text cache cleared")
+
+    def extract_title_from_pdf(self, pdf_path: str) -> Optional[str]:
+        """
+        Extract the title from a PDF file.
+
+        First tries PDF metadata, then falls back to heuristic extraction
+        from the first page text.
+
+        Args:
+            pdf_path: Path to PDF file
+
+        Returns:
+            Extracted title or None if not found
+        """
+        if not os.path.exists(pdf_path):
+            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
+
+        try:
+            import pypdf
+
+            with open(pdf_path, 'rb') as file:
+                pdf_reader = pypdf.PdfReader(file)
+
+                # Try PDF metadata first
+                metadata = pdf_reader.metadata
+                if metadata:
+                    title = metadata.get('/Title')
+                    if title and isinstance(title, str) and len(title.strip()) > 3:
+                        # Clean up the title
+                        title = title.strip()
+                        # Skip if it looks like a filename
+                        if not title.endswith(('.pdf', '.tex', '.dvi')) and title.lower() != 'untitled':
+                            logger.debug(f"Found title in PDF metadata: {title}")
+                            return title
+
+                # Fall back to extracting from first page text
+                if len(pdf_reader.pages) > 0:
+                    try:
+                        first_page_text = pdf_reader.pages[0].extract_text()
+                        if first_page_text:
+                            title = self._extract_title_from_text(first_page_text)
+                            if title:
+                                logger.debug(f"Extracted title from first page: {title}")
+                                return title
+                    except Exception as e:
+                        logger.warning(f"Error extracting title from first page: {e}")
+
+            return None
+
+        except ImportError:
+            logger.error("pypdf not installed. Install with: pip install pypdf")
+            raise
+        except Exception as e:
+            logger.warning(f"Error extracting title from PDF {pdf_path}: {e}")
+            return None
+
+    def _extract_title_from_text(self, text: str) -> Optional[str]:
+        """
+        Heuristically extract paper title from text (typically first page).
+
+        Academic papers typically have the title as one of the first prominent
+        text blocks, often followed by author names.
+
+        Args:
+            text: Text from first page of PDF
+
+        Returns:
+            Extracted title or None
+        """
+        if not text:
+            return None
+
+        import re
+
+        # Split into lines and clean
+        lines = [line.strip() for line in text.split('\n') if line.strip()]
+
+        if not lines:
+            return None
+
+        # Skip common header elements (conference names, page numbers, etc.)
+        header_patterns = [
+            r'^(proceedings|conference|journal|workshop|symposium)',
+            r'^(vol\.|volume|issue|no\.|number)',
+            r'^\d{1,4}\s*$',  # Page numbers
+            r'^(preprint|arxiv|draft)',
+            r'^(ieee|acm|springer|elsevier)',
+            r'^[a-z]+\s+\d{4}$',  # "January 2024" etc
+        ]
+
+        # Author indicators that typically follow the title
+        author_indicators = [
+            r'^[A-Z][a-z]+\s+[A-Z][a-z]+(\s*,|\s+and\s+)',  # "John Smith," or "John Smith and"
+            r'^[A-Z]\.\s*[A-Z][a-z]+',  # "J. Smith"
+            r'^[\w\s,]+@[\w\.-]+',  # Email addresses
+            r'^(university|department|institute|school|college)',
+            r'^\d+\s+[A-Z]',  # Addresses like "123 Main St"
+        ]
+
+        # Find potential title lines
+        title_candidates = []
+        for i, line in enumerate(lines[:15]):  # Only look at first 15 lines
+            # Skip empty or very short lines
+            if len(line) < 10:
+                continue
+
+            # Skip lines matching header patterns
+            is_header = any(re.search(pat, line, re.IGNORECASE) for pat in header_patterns)
+            if is_header:
+                continue
+
+            # Check if this looks like the start of author section
+            is_author_section = any(re.search(pat, line, re.IGNORECASE) for pat in author_indicators)
+            if is_author_section:
+                break  # Stop - we've passed the title
+
+            # Good candidate: reasonable length, not too long
+            if 15 <= len(line) <= 300:
+                title_candidates.append(line)
+
+                # If next line looks like authors, we found the title
+                if i + 1 < len(lines):
+                    next_line = lines[i + 1]
+                    if any(re.search(pat, next_line, re.IGNORECASE) for pat in author_indicators):
+                        break
+
+        if not title_candidates:
+            return None
+
+        # Take the first good candidate, or combine first few if they seem related
+        title = title_candidates[0]
+
+        # Sometimes titles span multiple lines - check if next line continues
+        if len(title_candidates) > 1:
+            second = title_candidates[1]
+            # If second line is short and starts with lowercase or continues sentence
+            if len(second) < 80 and (second[0].islower() or title.endswith(':')):
+                title = title + ' ' + second
+
+        # Clean up the title
+        title = re.sub(r'\s+', ' ', title).strip()
+
+        # Remove common artifacts
+        title = re.sub(r'^\d+\s*', '', title)  # Leading numbers
+        title = re.sub(r'\s*\*+\s*$', '', title)  # Trailing asterisks
+
+        # Validate: title should have reasonable characteristics
+        if len(title) < 15 or len(title) > 350:
+            return None
+
+        # Should have some letters (not just numbers/symbols)
+        if not re.search(r'[a-zA-Z]{3,}', title):
+            return None
+
+        return title
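
Callers elsewhere in this diff use the method through `asyncio.to_thread`; the strategy itself is metadata first, first-page heuristics second. A standalone sketch of the metadata half using `pypdf` directly (`paper.pdf` is a placeholder path):

```python
# Metadata-first title lookup with pypdf, mirroring the first branch above.
import pypdf

reader = pypdf.PdfReader("paper.pdf")
meta_title = (reader.metadata or {}).get('/Title')
if meta_title and not str(meta_title).lower().endswith(('.pdf', '.tex', '.dvi')):
    print("metadata title:", meta_title)
else:
    # Otherwise the code falls back to heuristics over the first page's text,
    # as _extract_title_from_text does.
    print(reader.pages[0].extract_text()[:200])
```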
@@ -1372,6 +1372,15 @@ def is_name_match(name1: str, name2: str) -> bool:
                 first_initial == first_name[0] and
                 middle_initial == middle_name[0]):
                 return True
+            else:
+                # Simple last name case: "W. R. Weimer" vs "Westley Weimer"
+                # The cited name has an extra middle initial that the actual name doesn't have
+                # Allow match if first initial and last name match (tolerate extra middle initial)
+                # BUT: Exclude cases where first_name is just concatenated initials (like "gv")
+                # which should require exact initial matching, not tolerance
+                is_real_first_name = len(first_name) > 2  # "Westley" yes, "gv" no
+                if is_real_first_name and last_name == compound_last and first_initial == first_name[0]:
+                    return True

         elif len(init_parts) == 3 and len(name_parts) == 3:
             # Check for "Last, First Middle" vs "First Middle Last" format
@@ -4289,6 +4298,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
     # Handle specific multi-word patterns and well-known acronyms
     'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
     'pnas': 'proceedings of the national academy of sciences',
+    'cacm': 'communications of the acm',
     # Special cases that don't follow standard acronym patterns
     'neurips': 'neural information processing systems',  # Special case
     'nips': 'neural information processing systems',  # old name for neurips
@@ -4425,6 +4435,8 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
     'neurips': 'neural information processing systems',  # Special case: doesn't follow standard acronym rules
     'nips': 'neural information processing systems',  # old name for neurips
     'nsdi': 'networked systems design and implementation',  # USENIX NSDI
+    'cacm': 'communications of the acm',
+    'communications of the': 'communications of the acm',
     }

     # Apply abbreviation expansion - handle multi-word phrases first
@@ -5089,6 +5101,17 @@ def normalize_venue_for_display(venue: str) -> str:
         return text_lower

     venue_text = venue.strip()
+
+    # Fix common truncated venues that lose their organization suffix during PDF extraction
+    truncated_aliases = {
+        "communications of the": "Communications of the ACM",
+    }
+
+    # Allow trailing punctuation/whitespace while matching truncated forms
+    normalized_candidate = re.sub(r"[\s.,;:]+$", "", venue_text, flags=re.IGNORECASE)
+    alias = truncated_aliases.get(normalized_candidate.lower())
+    if alias:
+        return alias

     # Strip leading editor name lists like "..., editors, Venue ..." or "..., eds., Venue ..."
     # This prevents author/editor lists from being treated as venue
@@ -5150,7 +5173,8 @@ def normalize_venue_for_display(venue: str) -> str:
     if not re.match(r'ieee\s+transactions', venue_text, re.IGNORECASE):
         venue_text = re.sub(r'^(ieee|acm|aaai|usenix|sigcomm|sigkdd|sigmod|vldb|osdi|sosp|eurosys)\s+', '', venue_text, flags=re.IGNORECASE)  # Remove org prefixes
         venue_text = re.sub(r'^ieee/\w+\s+', '', venue_text, flags=re.IGNORECASE)  # Remove "IEEE/RSJ " etc
-        venue_text = re.sub(r'\s+(ieee|acm|aaai|usenix)\s*$', '', venue_text, flags=re.IGNORECASE)  # Remove org suffixes
+        # Remove org suffixes, but NOT when preceded by "of the" (e.g., "Communications of the ACM", "Journal of the ACM")
+        venue_text = re.sub(r'(?<!of the)\s+(ieee|acm|aaai|usenix)\s*$', '', venue_text, flags=re.IGNORECASE)  # Remove org suffixes
     venue_text = re.sub(r'/\w+\s+', ' ', venue_text)  # Remove "/ACM " style org separators

     # IMPORTANT: Don't remove "Conference on" or "International" - they're needed for display
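
The fix relies on a fixed-width negative lookbehind: the suffix match for a bare org name is rejected when the six characters before the whitespace are "of the", so journal names keep their suffix while other venue strings still lose it. A quick demonstration:

```python
# The negative lookbehind keeps "... of the ACM" intact while still stripping
# a trailing org name elsewhere (the second venue string is a made-up example).
import re

pattern = r'(?<!of the)\s+(ieee|acm|aaai|usenix)\s*$'
print(re.sub(pattern, '', 'Communications of the ACM', flags=re.IGNORECASE))
# -> 'Communications of the ACM'  (unchanged)
print(re.sub(pattern, '', 'Symposium on Cloud Computing ACM', flags=re.IGNORECASE))
# -> 'Symposium on Cloud Computing'
```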