academic-refchecker 2.0.12__py3-none-any.whl → 2.0.14__py3-none-any.whl

This diff shows the content changes between publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the versions exactly as they appear in the public registry.
@@ -8,8 +8,8 @@ for papers found on ArXiv, as it reflects the author-submitted metadata.
 
 Key features:
 - Fetches official BibTeX from https://arxiv.org/bibtex/{arxiv_id}
-- Always uses the latest version metadata (strips version suffixes)
-- Logs warnings when cited version differs from latest version
+- Checks reference against all historical versions when latest doesn't match
+- Annotates errors with version info when reference matches an older version
 - Parses BibTeX to extract normalized metadata matching refchecker schema
 
 Usage:
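For orientation, this endpoint is the foundation the checker builds on. A minimal standalone sketch of the fetch, assuming only the URL pattern documented above; the helper name and the example ID are hypothetical, not part of the package:

```python
import requests

# Hypothetical helper: fetch the official BibTeX record for an arXiv paper
# via the endpoint named in the docstring above.
def fetch_official_bibtex(arxiv_id: str) -> str:
    response = requests.get(f"https://arxiv.org/bibtex/{arxiv_id}", timeout=10)
    response.raise_for_status()
    return response.text  # raw BibTeX entry, e.g. an @misc{...} record

# fetch_official_bibtex("2103.12345")  # example ID is invented
```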
@@ -30,6 +30,7 @@ Usage:
 import re
 import logging
 import requests
+import html
 from typing import Dict, List, Tuple, Optional, Any
 
 import bibtexparser
@@ -88,6 +89,8 @@ class ArXivCitationChecker:
         # export.arxiv.org URLs
         r'export\.arxiv\.org/abs/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
         r'export\.arxiv\.org/pdf/([0-9]{4}\.[0-9]{4,5})(v\d+)?',
+        # DOI format
+        r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?"
     ]
 
     def extract_arxiv_id(self, reference: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
@@ -107,6 +110,8 @@ class ArXivCitationChecker:
             reference.get('cited_url', ''),
             reference.get('raw_text', ''),
             reference.get('eprint', ''),  # BibTeX field
+            reference.get('journal', ''),
+            reference.get('doi', ''),  # DOI field (may contain arXiv ID)
         ]
 
         for source in sources:
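The added DOI-format pattern can be exercised on its own. The identifiers below are invented, and case-insensitive matching is an assumption about how the checker applies its patterns:

```python
import re

# The new pattern from the diff above, applied to two made-up identifiers.
pattern = r"(?:arxiv[:./])(\d{4}\.\d{4,5})(v\d+)?"

for source in ["10.48550/arXiv.2103.12345", "arXiv:2103.12345v2"]:
    m = re.search(pattern, source, re.IGNORECASE)  # IGNORECASE is assumed
    if m:
        print(m.group(1), m.group(2))  # -> 2103.12345 None, then 2103.12345 v2
```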
@@ -324,35 +329,133 @@ class ArXivCitationChecker:
 
         return None
 
-    def get_latest_version_info(self, arxiv_id: str) -> Optional[str]:
+    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
         """
-        Get the latest version number for an ArXiv paper.
+        Check if a reference is an ArXiv paper.
 
-        Note: This requires fetching the abstract page, so it's optional.
-        For now, we rely on the BibTeX always returning latest version metadata.
+        Args:
+            reference: Reference dictionary
+
+        Returns:
+            True if reference appears to be an ArXiv paper
+        """
+        arxiv_id, _ = self.extract_arxiv_id(reference)
+        return arxiv_id is not None
+
+    def _fetch_version_metadata_from_html(self, arxiv_id: str, version_num: int) -> Optional[Dict[str, Any]]:
+        """
+        Fetch and parse metadata for a specific version using HTML scraping.
 
         Args:
             arxiv_id: ArXiv ID without version
+            version_num: Version number to fetch (1, 2, 3, etc.)
 
         Returns:
-            Latest version string (e.g., "v3") or None if couldn't determine
+            Dictionary with version metadata or None if version doesn't exist
         """
-        # The BibTeX endpoint always returns the latest version's metadata,
-        # so we don't need to explicitly fetch version info
-        return None
-
-    def is_arxiv_reference(self, reference: Dict[str, Any]) -> bool:
+        version_str = f"v{version_num}"
+        url = f"{self.abs_url}/{arxiv_id}{version_str}"
+
+        self.rate_limiter.wait()
+        try:
+            logger.debug(f"Checking historical version: {url}")
+            response = requests.get(url, timeout=self.timeout)
+            if response.status_code == 404:
+                return None  # Version does not exist
+            response.raise_for_status()
+            html_content = response.text
+
+            # Parse meta tags for metadata
+            # Title
+            title_match = re.search(r'<meta name="citation_title" content="(.*?)"', html_content)
+            title = html.unescape(title_match.group(1)).strip() if title_match else ""
+
+            # Authors
+            authors = []
+            for auth in re.findall(r'<meta name="citation_author" content="(.*?)"', html_content):
+                authors.append(html.unescape(auth).strip())
+
+            # Date/Year
+            date_match = re.search(r'<meta name="citation_date" content="(.*?)"', html_content)
+            year = None
+            if date_match:
+                ym = re.search(r'^(\d{4})', date_match.group(1))
+                if ym:
+                    year = int(ym.group(1))
+
+            return {
+                'version': version_str,
+                'version_num': version_num,
+                'title': title,
+                'authors': [{'name': a} for a in authors],
+                'year': year,
+                'url': url,
+            }
+        except Exception as e:
+            logger.warning(f"Failed to fetch history {version_str}: {e}")
+            return None
+
+    def _get_latest_version_number(self, arxiv_id: str) -> Optional[int]:
         """
-        Check if a reference is an ArXiv paper.
+        Get the latest version number by fetching the abstract page.
 
         Args:
-            reference: Reference dictionary
+            arxiv_id: ArXiv ID without version
 
         Returns:
-            True if reference appears to be an ArXiv paper
+            Latest version number as integer, or None if couldn't determine
         """
-        arxiv_id, _ = self.extract_arxiv_id(reference)
-        return arxiv_id is not None
+        url = f"{self.abs_url}/{arxiv_id}"
+
+        self.rate_limiter.wait()
+        try:
+            response = requests.get(url, timeout=self.timeout)
+            response.raise_for_status()
+
+            # Look for version links like "[v1]", "[v2]", etc.
+            versions = re.findall(r'\[v(\d+)\]', response.text)
+            if versions:
+                return max(int(v) for v in versions)
+            return None
+        except Exception as e:
+            logger.warning(f"Failed to get latest version for {arxiv_id}: {e}")
+            return None
+
+    def _compare_info_match(
+            self, cited_title: str, cited_authors: List[str], cited_year: Optional[int],
+            authoritative_title: str, authoritative_authors: List[str], authoritative_year: Optional[int]) -> bool:
+        """
+        Compare the information of a cited paper with the authoritative information.
+
+        Args:
+            cited_title: Title from the reference
+            cited_authors: Authors from the reference
+            cited_year: Year from the reference
+            authoritative_title: Title from ArXiv version
+            authoritative_authors: Authors from ArXiv version
+            authoritative_year: Year from ArXiv version
+
+        Returns:
+            True if the information matches, False otherwise.
+        """
+        # Compare title
+        if cited_title and authoritative_title:
+            title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
+            if title_similarity < SIMILARITY_THRESHOLD:
+                return False
+
+        # Compare authors
+        if cited_authors and authoritative_authors:
+            authors_match, _ = compare_authors(cited_authors, authoritative_authors)
+            if not authors_match:
+                return False
+
+        # Compare year
+        if cited_year and authoritative_year:
+            if cited_year != authoritative_year:
+                return False
+
+        return True
 
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
@@ -360,10 +463,10 @@
 
         This method:
         1. Extracts the ArXiv ID from the reference
-        2. Fetches the official BibTeX from ArXiv (always latest version)
-        3. Parses the BibTeX to get authoritative metadata
-        4. Compares cited metadata against authoritative source
-        5. Logs warnings for version mismatches
+        2. Fetches the official BibTeX from ArXiv (latest version)
+        3. Compares cited metadata against latest version
+        4. If errors found, checks historical versions to find a match
+        5. Annotates errors with version info if reference matches an older version
 
         Args:
             reference: Reference dictionary with title, authors, year, url, etc.
@@ -385,34 +488,26 @@ class ArXivCitationChecker:
 
         logger.debug(f"ArXivCitationChecker: Verifying ArXiv paper {arxiv_id}")
 
-        # Fetch authoritative BibTeX
+        # Extract information from reference for comparison
+        cited_title = reference.get('title', '').strip()
+        cited_authors = reference.get('authors', [])
+        cited_year = reference.get('year')
+
+        # Fetch authoritative BibTeX (latest version)
         bibtex_content = self.fetch_bibtex(arxiv_id)
 
         if not bibtex_content:
             logger.debug(f"ArXivCitationChecker: Could not fetch BibTeX for {arxiv_id}")
             return None, [{"error_type": "api_failure", "error_details": f"Could not fetch ArXiv BibTeX for {arxiv_id}"}], None
 
-        # Parse BibTeX
-        verified_data = self.parse_bibtex(bibtex_content)
+        latest_data = self.parse_bibtex(bibtex_content)
 
-        if not verified_data:
+        if not latest_data:
             logger.debug(f"ArXivCitationChecker: Could not parse BibTeX for {arxiv_id}")
             return None, [], None
-
-        # Log version mismatch warning if cited version differs from latest
-        if cited_version:
-            # ArXiv BibTeX always returns latest version metadata
-            # We don't know the actual latest version number without additional API call,
-            # but we can warn that a specific version was cited
-            errors.append({
-                'warning_type': 'version',
-                'warning_details': f"Reference cites ArXiv version {cited_version}, verified against latest version metadata",
-            })
-            logger.debug(f"ArXivCitationChecker: Cited version {cited_version} for {arxiv_id}")
-
-        # Compare title
-        cited_title = reference.get('title', '').strip()
-        authoritative_title = verified_data.get('title', '').strip()
+
+        # Compare against latest version
+        authoritative_title = latest_data.get('title', '').strip()
 
         if cited_title and authoritative_title:
             title_similarity = compare_titles_with_latex_cleaning(cited_title, authoritative_title)
@@ -426,9 +521,8 @@
             })
 
         # Compare authors
-        cited_authors = reference.get('authors', [])
         if cited_authors:
-            authoritative_authors = verified_data.get('authors', [])
+            authoritative_authors = latest_data.get('authors', [])
             authors_match, author_error = compare_authors(cited_authors, authoritative_authors)
 
             if not authors_match:
@@ -440,9 +534,7 @@
             })
 
         # Compare year
-        cited_year = reference.get('year')
-        authoritative_year = verified_data.get('year')
-
+        authoritative_year = latest_data.get('year')
         year_warning = validate_year(
             cited_year=cited_year,
             paper_year=authoritative_year,
@@ -451,10 +543,50 @@
         )
         if year_warning:
             errors.append(year_warning)
-
-        # Build URL
+
         paper_url = f"https://arxiv.org/abs/{arxiv_id}"
 
-        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+        # If no errors against latest version, we're done
+        if len(errors) == 0:
+            logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with no errors")
+            return latest_data, errors, paper_url
+
+        # Check if reference matches a historical version
+        # Get latest version number first
+        latest_version_num = self._get_latest_version_number(arxiv_id)
+
+        if latest_version_num and latest_version_num > 1:
+            # Check historical versions (1 to latest-1)
+            for version_num in range(1, latest_version_num):
+                version_data = self._fetch_version_metadata_from_html(arxiv_id, version_num)
+                if not version_data:
+                    continue
+
+                # Check if reference matches this historical version
+                if self._compare_info_match(
+                        cited_title, cited_authors, cited_year,
+                        version_data['title'], version_data['authors'], version_data['year']):
+
+                    logger.debug(f"ArXivCitationChecker: Reference matches historical version v{version_num}")
+
+                    # Convert errors to warnings with version update info
+                    # Version update issues are informational, not errors - the citation was correct for its time
+                    version_suffix = f" (v{version_num} vs v{latest_version_num} update)"
+                    warnings = []
+                    for error in errors:
+                        warning = {
+                            'warning_type': error.get('error_type', 'unknown') + version_suffix,
+                            'warning_details': error.get('error_details', ''),
+                        }
+                        # Preserve correction hints
+                        for key in ['ref_title_correct', 'ref_authors_correct', 'ref_year_correct']:
+                            if key in error:
+                                warning[key] = error[key]
+                        warnings.append(warning)
+
+                    # Return with warnings instead of errors - URL points to the matched version
+                    matched_url = f"https://arxiv.org/abs/{arxiv_id}v{version_num}"
+                    return latest_data, warnings, matched_url
 
-        return verified_data, errors, paper_url
+        logger.debug(f"ArXivCitationChecker: Verified {arxiv_id} with {len(errors)} errors/warnings")
+        return latest_data, errors, paper_url
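The core of `_fetch_version_metadata_from_html` is fetching a versioned abs page and reading its citation meta tags. A standalone sketch of that one step, assuming arXiv keeps exposing the `citation_title` tag the method parses (rate limiting and author/date parsing omitted; the function name is hypothetical):

```python
import re
import html
import requests
from typing import Optional

def fetch_version_title(arxiv_id: str, version: int) -> Optional[str]:
    resp = requests.get(f"https://arxiv.org/abs/{arxiv_id}v{version}", timeout=10)
    if resp.status_code == 404:
        return None  # this version does not exist
    resp.raise_for_status()
    m = re.search(r'<meta name="citation_title" content="(.*?)"', resp.text)
    return html.unescape(m.group(1)).strip() if m else None
```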
@@ -257,6 +257,90 @@ class EnhancedHybridReferenceChecker:
 
         return True
 
+    def _merge_arxiv_with_semantic_scholar(
+        self,
+        arxiv_data: Dict[str, Any],
+        arxiv_errors: List[Dict[str, Any]],
+        arxiv_url: str,
+        ss_data: Dict[str, Any],
+        ss_errors: List[Dict[str, Any]],
+        ss_url: str,
+        reference: Dict[str, Any]
+    ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]:
+        """
+        Merge ArXiv verification results with Semantic Scholar data.
+
+        ArXiv is authoritative for title/author/year, but Semantic Scholar
+        provides venue information and additional URLs (DOI, S2 page).
+
+        Args:
+            arxiv_data: Verified data from ArXiv
+            arxiv_errors: Errors/warnings from ArXiv verification
+            arxiv_url: ArXiv URL
+            ss_data: Data from Semantic Scholar
+            ss_errors: Errors from Semantic Scholar (used for venue checking)
+            ss_url: Semantic Scholar URL
+            reference: Original reference
+
+        Returns:
+            Tuple of (merged_data, merged_errors)
+        """
+        merged_data = dict(arxiv_data) if arxiv_data else {}
+        merged_errors = list(arxiv_errors) if arxiv_errors else []
+
+        if not ss_data:
+            return merged_data, merged_errors
+
+        # Add Semantic Scholar URL to external IDs
+        if 'externalIds' not in merged_data:
+            merged_data['externalIds'] = {}
+
+        ss_external_ids = ss_data.get('externalIds', {})
+
+        # Add S2 paper ID
+        if ss_data.get('paperId'):
+            merged_data['externalIds']['S2PaperId'] = ss_data['paperId']
+
+        # Add DOI if available from Semantic Scholar
+        if ss_external_ids.get('DOI') and not merged_data['externalIds'].get('DOI'):
+            merged_data['externalIds']['DOI'] = ss_external_ids['DOI']
+
+        # Store Semantic Scholar URL
+        merged_data['_semantic_scholar_url'] = ss_url
+
+        # Check for venue mismatch - if paper was published at a venue but citation only says arXiv
+        ss_venue = ss_data.get('venue', '')
+        cited_venue = reference.get('venue', reference.get('journal', '')).strip().lower()
+
+        # Normalize ArXiv venue names
+        is_cited_as_arxiv = (
+            not cited_venue or
+            cited_venue in ['arxiv', 'arxiv preprint', 'arxiv.org', 'preprint']
+        )
+
+        # Check if Semantic Scholar shows a real publication venue
+        if ss_venue and is_cited_as_arxiv:
+            # Ignore generic/empty venues
+            ss_venue_lower = ss_venue.lower().strip()
+            is_real_venue = (
+                ss_venue_lower and
+                ss_venue_lower not in ['arxiv', 'arxiv.org', 'preprint', ''] and
+                not ss_venue_lower.startswith('arxiv')
+            )
+
+            if is_real_venue:
+                # This paper was published at a venue but is only cited as arXiv
+                logger.debug(f"Enhanced Hybrid: Paper published at '{ss_venue}' but cited as arXiv")
+                merged_errors.append({
+                    'warning_type': 'venue',
+                    'warning_details': f"Paper was published at venue but cited as arXiv preprint:\n cited: arXiv\n actual: {ss_venue}",
+                    'ref_venue_correct': ss_venue
+                })
+                # Also add the venue to merged data
+                merged_data['venue'] = ss_venue
+
+        return merged_data, merged_errors
+
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
         Verify a non-arXiv reference using multiple APIs in priority order
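The venue-mismatch check at the end of the merge method reduces to two predicates; a quick illustration with invented values:

```python
# Made-up inputs: the reference says "arXiv preprint", while Semantic Scholar
# reports a real publication venue.
cited_venue = "arxiv preprint"
ss_venue = "NeurIPS"

is_cited_as_arxiv = (not cited_venue or
                     cited_venue in ['arxiv', 'arxiv preprint', 'arxiv.org', 'preprint'])

ss_venue_lower = ss_venue.lower().strip()
is_real_venue = (ss_venue_lower and
                 ss_venue_lower not in ['arxiv', 'arxiv.org', 'preprint', ''] and
                 not ss_venue_lower.startswith('arxiv'))

print(is_cited_as_arxiv and is_real_venue)  # -> True: emit a 'venue' warning
```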
@@ -287,6 +371,9 @@ class EnhancedHybridReferenceChecker:
         # Track all APIs that failed and could be retried
         failed_apis = []
 
+        # Store ArXiv result for potential merging with Semantic Scholar
+        arxiv_result = None
+
         # PHASE 1: Try all APIs once in priority order
 
         # Strategy 0: For ArXiv papers, try ArXiv Citation checker first (authoritative source)
@@ -295,13 +382,15 @@
             logger.debug("Enhanced Hybrid: Reference appears to be ArXiv paper, trying ArXiv Citation checker first")
             verified_data, errors, url, success, failure_type = self._try_api('arxiv_citation', self.arxiv_citation, reference)
             if success:
-                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded as authoritative source")
-                return verified_data, errors, url
+                logger.debug("Enhanced Hybrid: ArXiv Citation checker succeeded, also querying Semantic Scholar for venue/URLs")
+                arxiv_result = (verified_data, errors, url)
+                # Continue to Semantic Scholar to get venue and additional URLs
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('arxiv_citation', self.arxiv_citation, failure_type))
 
         # Strategy 1: Always try local database first (fastest)
-        if self.local_db:
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
+        if self.local_db and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('local_db', self.local_db, reference)
             if success:
                 return verified_data, errors, url
@@ -309,8 +398,9 @@
                 failed_apis.append(('local_db', self.local_db, failure_type))
 
         # Strategy 2: If reference has DOI, prioritize CrossRef
+        # Skip if we already have ArXiv result - we'll go straight to Semantic Scholar for venue info
         crossref_result = None
-        if self._should_try_doi_apis_first(reference) and self.crossref:
+        if self._should_try_doi_apis_first(reference) and self.crossref and not arxiv_result:
             verified_data, errors, url, success, failure_type = self._try_api('crossref', self.crossref, reference)
             if success:
                 # Check if the data is complete enough to use
@@ -327,11 +417,34 @@
         if self.semantic_scholar:
             verified_data, errors, url, success, failure_type = self._try_api('semantic_scholar', self.semantic_scholar, reference)
             if success:
+                # If we have ArXiv result, merge Semantic Scholar venue/URLs into it
+                if arxiv_result:
+                    # Check if SS data is valid and venue is not just arxiv
+                    # (skip merge if SS only found the arxiv version, no published venue)
+                    if verified_data:
+                        ss_venue = self.semantic_scholar.get_venue_from_paper_data(verified_data)
+                        if ss_venue and 'arxiv' in ss_venue.lower():
+                            # SS only found arxiv venue, skip merge and return arxiv result
+                            logger.debug("Enhanced Hybrid: Semantic Scholar only found ArXiv venue, skipping merge")
+                            return arxiv_result
+
+                    arxiv_data, arxiv_errors, arxiv_url = arxiv_result
+                    merged_data, merged_errors = self._merge_arxiv_with_semantic_scholar(
+                        arxiv_data, arxiv_errors, arxiv_url,
+                        verified_data, errors, url,
+                        reference
+                    )
+                    return merged_data, merged_errors, arxiv_url
                 return verified_data, errors, url
             # For Semantic Scholar, only retry retryable failures (not 'not_found')
             if failure_type in ['throttled', 'timeout', 'server_error']:
                 failed_apis.append(('semantic_scholar', self.semantic_scholar, failure_type))
 
+        # If ArXiv succeeded but Semantic Scholar failed, return ArXiv result
+        if arxiv_result:
+            logger.debug("Enhanced Hybrid: Returning ArXiv result (Semantic Scholar unavailable)")
+            return arxiv_result
+
         # Strategy 4: Try OpenAlex API (excellent reliability, replaces Google Scholar)
         openalex_result = None
         if self.openalex:
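Taken together, the changes give ArXiv references roughly this shape of control flow. The function below is a condensed paraphrase for reading, not code from the package; retry bookkeeping and the local DB/CrossRef/OpenAlex fallbacks are omitted:

```python
# Hypothetical condensation of the ArXiv-first path in verify_reference.
def _verify_arxiv_flow(self, reference):
    arxiv_data, arxiv_errors, arxiv_url = self.arxiv_citation.verify_reference(reference)
    ss_data, ss_errors, ss_url = self.semantic_scholar.verify_reference(reference)
    if not ss_data:
        # Semantic Scholar unavailable: the ArXiv result stands on its own
        return arxiv_data, arxiv_errors, arxiv_url
    ss_venue = self.semantic_scholar.get_venue_from_paper_data(ss_data)
    if ss_venue and 'arxiv' in ss_venue.lower():
        # Semantic Scholar only knows the preprint, so there is nothing to merge
        return arxiv_data, arxiv_errors, arxiv_url
    merged_data, merged_errors = self._merge_arxiv_with_semantic_scholar(
        arxiv_data, arxiv_errors, arxiv_url, ss_data, ss_errors, ss_url, reference)
    return merged_data, merged_errors, arxiv_url
```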
@@ -223,7 +223,49 @@ class NonArxivReferenceChecker:
         """
         return compare_authors(cited_authors, correct_authors)
 
-
+    def get_venue_from_paper_data(self, paper_data: Dict[str, Any]) -> Optional[str]:
+        """
+        Extract venue from paper data dictionary.
+
+        Checks multiple fields since Semantic Scholar returns venue info
+        in different fields depending on publication type.
+
+        Args:
+            paper_data: Paper data dictionary from Semantic Scholar
+
+        Returns:
+            Venue string or None if not found
+        """
+        if not paper_data:
+            return None
+
+        paper_venue = None
+
+        # First try the simple 'venue' field (string)
+        if paper_data.get('venue'):
+            paper_venue = paper_data.get('venue')
+
+        # If no venue, try publicationVenue object
+        if not paper_venue and paper_data.get('publicationVenue'):
+            pub_venue = paper_data.get('publicationVenue')
+            if isinstance(pub_venue, dict):
+                paper_venue = pub_venue.get('name', '')
+            elif isinstance(pub_venue, str):
+                paper_venue = pub_venue
+
+        # If still no venue, try journal object
+        if not paper_venue and paper_data.get('journal'):
+            journal = paper_data.get('journal')
+            if isinstance(journal, dict):
+                paper_venue = journal.get('name', '')
+            elif isinstance(journal, str):
+                paper_venue = journal
+
+        # Ensure paper_venue is a string
+        if paper_venue and not isinstance(paper_venue, str):
+            paper_venue = str(paper_venue)
+
+        return paper_venue if paper_venue else None
 
     def verify_reference(self, reference: Dict[str, Any]) -> Tuple[Optional[Dict[str, Any]], List[Dict[str, Any]], Optional[str]]:
         """
refchecker/llm/base.py CHANGED
@@ -110,21 +110,7 @@ class LLMProvider(ABC):
 
         logger.debug(f"Created {len(chunks)} balanced overlapping chunks for parallel processing")
         return chunks
-
-    def _parse_llm_response(self, response_text: str) -> List[str]:
-        """Parse LLM response and extract individual references"""
-        if not response_text:
-            return []
-
-        # Split by newlines and filter out empty lines
-        references = []
-        for line in response_text.strip().split('\n'):
-            line = line.strip()
-            if line and not line.startswith('#') and len(line) > 10:  # Basic filtering
-                references.append(line)
-
-        return references
-
+
     def extract_references_with_chunking(self, bibliography_text: str) -> List[str]:
         """
         Template method that handles chunking for all providers.