academic-refchecker 1.2.38__tar.gz → 1.2.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {academic_refchecker-1.2.38/src/academic_refchecker.egg-info → academic_refchecker-1.2.40}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/core/refchecker.py +27 -2
  5. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/biblatex_parser.py +111 -12
  6. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/text_utils.py +45 -11
  7. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/LICENSE +0 -0
  8. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/MANIFEST.in +0 -0
  9. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/README.md +0 -0
  10. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/pyproject.toml +0 -0
  11. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/requirements.txt +0 -0
  12. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/scripts/download_db.py +0 -0
  13. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/scripts/run_tests.py +0 -0
  14. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/scripts/start_vllm_server.py +0 -0
  15. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/setup.cfg +0 -0
  16. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/__init__.py +0 -0
  17. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  18. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  19. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  20. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/requires.txt +0 -0
  21. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  22. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/__init__.py +0 -0
  23. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/crossref.py +0 -0
  24. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/enhanced_hybrid_checker.py +0 -0
  25. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/github_checker.py +0 -0
  26. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/local_semantic_scholar.py +0 -0
  27. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/openalex.py +0 -0
  28. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/openreview_checker.py +0 -0
  29. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/semantic_scholar.py +0 -0
  30. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/webpage_checker.py +0 -0
  31. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/config/__init__.py +0 -0
  32. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/config/logging.conf +0 -0
  33. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/config/settings.py +0 -0
  34. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/core/__init__.py +0 -0
  35. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/core/db_connection_pool.py +0 -0
  36. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/core/parallel_processor.py +0 -0
  37. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/database/__init__.py +0 -0
  38. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/database/download_semantic_scholar_db.py +0 -0
  39. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/llm/__init__.py +0 -0
  40. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/llm/base.py +0 -0
  41. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/llm/providers.py +0 -0
  42. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/scripts/__init__.py +0 -0
  43. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/scripts/start_vllm_server.py +0 -0
  44. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/services/__init__.py +0 -0
  45. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/services/pdf_processor.py +0 -0
  46. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/__init__.py +0 -0
  47. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/arxiv_utils.py +0 -0
  48. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/author_utils.py +0 -0
  49. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/bibliography_utils.py +0 -0
  50. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/bibtex_parser.py +0 -0
  51. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/config_validator.py +0 -0
  52. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/db_utils.py +0 -0
  53. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/doi_utils.py +0 -0
  54. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/error_utils.py +0 -0
  55. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/mock_objects.py +0 -0
  56. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/unicode_utils.py +0 -0
  57. {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.38
3
+ Version: 1.2.40
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.38"
3
+ __version__ = "1.2.40"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.38
3
+ Version: 1.2.40
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -3386,7 +3386,25 @@ class ArxivReferenceChecker:
3386
3386
  logger.info("Detected biblatex format, using biblatex parser")
3387
3387
  self.used_regex_extraction = True
3388
3388
  # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
3389
- return self._parse_biblatex_references(bibliography_text)
3389
+ biblatex_refs = self._parse_biblatex_references(bibliography_text)
3390
+
3391
+ # If biblatex parsing returned empty results (due to quality validation),
3392
+ # fallback to LLM if available
3393
+ if not biblatex_refs and self.llm_extractor:
3394
+ logger.debug("Biblatex parser returned no results due to quality validation, trying LLM fallback")
3395
+ try:
3396
+ references = self.llm_extractor.extract_references(bibliography_text)
3397
+ if references:
3398
+ logger.debug(f"LLM fallback extracted {len(references)} references")
3399
+ return self._process_llm_extracted_references(references)
3400
+ else:
3401
+ logger.warning("LLM fallback also returned no results")
3402
+ return []
3403
+ except Exception as e:
3404
+ logger.error(f"LLM fallback failed: {e}")
3405
+ return []
3406
+
3407
+ return biblatex_refs
3390
3408
 
3391
3409
  # For non-standard formats, try LLM-based extraction if available
3392
3410
  if self.llm_extractor:
@@ -3610,7 +3628,14 @@ class ArxivReferenceChecker:
3610
3628
  if detect_biblatex_format(bibliography_text):
3611
3629
  logger.debug("Detected biblatex format, using biblatex-specific parsing")
3612
3630
  # biblatex parsing is also robust, so we don't set used_unreliable_extraction
3613
- return self._parse_biblatex_references(bibliography_text)
3631
+ biblatex_refs = self._parse_biblatex_references(bibliography_text)
3632
+
3633
+ # If biblatex parsing returned empty results (due to quality validation),
3634
+ # we'll continue with the unreliable fallback regex parsing
3635
+ if not biblatex_refs:
3636
+ logger.debug("Biblatex parser returned no results due to quality validation, falling back to regex parsing")
3637
+ else:
3638
+ return biblatex_refs
3614
3639
 
3615
3640
  # If we reach here, we're using the unreliable fallback regex parsing
3616
3641
  self.used_unreliable_extraction = True
@@ -138,6 +138,57 @@ def detect_biblatex_format(text: str) -> bool:
138
138
  return has_biblatex_marker or has_numbered_refs
139
139
 
140
140
 
141
def _validate_parsing_quality(
    references: List[Dict[str, Any]],
    max_failure_rate: float = 0.2,
) -> bool:
    """
    Validate that biblatex parsing results are of acceptable quality.
    If quality is poor, we should fallback to LLM parsing instead.

    An entry counts as an author failure when its 'authors' value is missing,
    empty, or exactly ['Unknown Author'], and as a title failure when its
    'title' value is missing, empty, or exactly 'Unknown Title'. The two
    failure rates are checked independently against the same threshold.

    Args:
        references: List of parsed reference dictionaries
        max_failure_rate: Highest tolerated fraction (0.0-1.0) of entries with
            an unknown author list, and separately with an unknown title.
            Defaults to 0.2, i.e. more than 20% failures rejects the parse.

    Returns:
        True if parsing quality is acceptable, False if should fallback to LLM
    """
    if not references:
        # Nothing was parsed at all; treat that as unacceptable quality
        return False

    total_entries = len(references)
    unknown_authors = 0
    unknown_titles = 0

    for ref in references:
        authors = ref.get('authors', [])
        title = ref.get('title', '')

        # Missing/empty authors or the parser's placeholder value
        if not authors or authors == ['Unknown Author']:
            unknown_authors += 1

        # Missing/empty title or the parser's placeholder value
        if not title or title == 'Unknown Title':
            unknown_titles += 1

    author_failure_rate = unknown_authors / total_entries
    title_failure_rate = unknown_titles / total_entries

    # Authors are checked before titles so the first failing category is the one logged
    for rate, label in ((author_failure_rate, 'authors'), (title_failure_rate, 'titles')):
        if rate > max_failure_rate:
            logger.debug(f"Biblatex parsing quality poor: {rate:.1%} unknown {label} (>{max_failure_rate:.0%}). Falling back to LLM.")
            return False

    logger.debug(f"Biblatex parsing quality acceptable: {author_failure_rate:.1%} unknown authors, {title_failure_rate:.1%} unknown titles")
    return True
190
+
191
+
141
192
  def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
142
193
  """
143
194
  Parse biblatex formatted references into structured format
@@ -146,7 +197,8 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
146
197
  text: String containing biblatex .bbl entries
147
198
 
148
199
  Returns:
149
- List of structured reference dictionaries
200
+ List of structured reference dictionaries, or empty list if
201
+ parsing quality is poor (to trigger LLM fallback)
150
202
  """
151
203
  from utils.text_utils import parse_authors_with_initials, clean_title
152
204
  from utils.doi_utils import construct_doi_url, is_valid_doi_format
@@ -171,7 +223,7 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
171
223
  # Find the content between this entry and the next (or end of text)
172
224
  if i + 1 < len(entry_starts):
173
225
  next_start = entry_starts[i + 1][1]
174
- content = text[end:next_start].strip()
226
+ raw_content = text[end:next_start].strip()
175
227
  else:
176
228
  # Last entry - take everything to end, but be smart about stopping
177
229
  remaining = text[end:].strip()
@@ -190,9 +242,20 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
190
242
  if match and match.start() < min_stop:
191
243
  min_stop = match.start()
192
244
 
193
- content = remaining[:min_stop].strip()
245
+ raw_content = remaining[:min_stop].strip()
194
246
 
195
- if content:
247
+ # Clean up content - handle cases where entry might be incomplete or malformed
248
+ if raw_content:
249
+ # Remove stray closing brackets or incomplete markers
250
+ content = raw_content
251
+ # Remove trailing "]" if it's the only thing on the last line
252
+ lines = content.split('\n')
253
+ if len(lines) > 1 and lines[-1].strip() == ']':
254
+ content = '\n'.join(lines[:-1]).strip()
255
+ elif content.strip() == ']':
256
+ # If content is only "], skip this entry as it's incomplete
257
+ continue
258
+
196
259
  matches.append((entry_num, content))
197
260
 
198
261
  for entry_num, content in matches:
@@ -218,6 +281,11 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
218
281
  references.append(parsed_ref)
219
282
 
220
283
  logger.debug(f"Extracted {len(references)} biblatex references")
284
+
285
+ # Validate parsing quality - if poor, return empty list to trigger LLM fallback
286
+ if not _validate_parsing_quality(references):
287
+ return []
288
+
221
289
  return references
222
290
 
223
291
 
@@ -261,11 +329,15 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
261
329
  else:
262
330
  # If no quoted title, look for title after author names
263
331
  # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
332
+ # Order matters: more specific patterns first
264
333
  title_patterns = [
265
- r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year"
266
- r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing)
267
- r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
334
+ # Pattern for unquoted books: "Author1 and Author2, Title: Subtitle. Location: Publisher, Year."
335
+ r'(?:and\s+[A-Z][^,]*),\s+([A-Z][^.]*?:\s*[^.]*?)\.\s+[A-Z][^:]*:\s*[^,]*,\s*\d{4}',
336
+ r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
268
337
  r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}', # ".Title. Year" - for cases where authors end without space
338
+ r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}', # "Name.Title. Year" - missing space after period
339
+ r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year" - LESS SPECIFIC
340
+ r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
269
341
  r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https', # "Title . https" - handle space before period
270
342
  ]
271
343
 
@@ -274,7 +346,14 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
274
346
  if title_match:
275
347
  potential_title = title_match.group(1)
276
348
  # Make sure it looks like a title and not author names
277
- if len(potential_title) > 10 and not re.match(r'^[A-Z][a-z]+,\s*[A-Z]', potential_title):
349
+ # Be more specific about author name patterns - should be "Surname, Initial" not "Word, Word"
350
+ author_like_pattern = r'^[A-Z][a-z]+,\s*[A-Z]\.?$' # "Smith, J." or "Smith, J"
351
+ multi_word_author = r'^[A-Z][a-z]+,\s*[A-Z][a-z]+$' # "Smith, John" - but still reject this
352
+
353
+ is_author_like = (re.match(author_like_pattern, potential_title) or
354
+ re.match(multi_word_author, potential_title))
355
+
356
+ if len(potential_title) > 2 and not is_author_like:
278
357
  title = clean_title(potential_title)
279
358
  break
280
359
 
@@ -328,16 +407,25 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
328
407
  # Examples we need to handle:
329
408
  # "Egor Zverev, Sahar Abdelnabi, Mario Fritz, and Christoph H Lampert. \"Title\". In: venue (year)."
330
409
  # "Andrej Karpathy. Intro to Large Language Models. https://... year."
410
+ # "A. Author and B. Coauthor, \"Title\"," <- handle this format
331
411
 
332
412
  # Try multiple patterns to extract authors
413
+ # Order matters - more specific patterns first!
333
414
  author_patterns = [
334
415
  # Pattern 1: Authors followed by quoted title (handle both regular and smart quotes)
416
+ r'^([^"\u201c\u201d]+?),\s*["\u201c\u201d]', # "Authors, \"Title\"" - more restrictive, requires comma before quote
335
417
  r'^([^"\u201c\u201d]+)\.\s*["\u201c\u201d]', # "Authors. \"Title\"" or smart quotes
336
418
 
337
- # Pattern 2: Authors followed by title, then period, then year or venue
419
+ # Pattern 2: Authors followed by unquoted title for books: "Author1 and Author2, Title:"
420
+ r'^([^,]+(?:\s+and\s+[^,]+)?),\s+([A-Z][^.]*?):\s*([^.]*?)\.', # "Author1 and Author2, Title: Subtitle." - book format
421
+
422
+ # Pattern 3: Authors ending with period, no space, then title (missing space case) - MORE SPECIFIC
423
+ r'^([^.]+?)\.([A-Z][^.]*)\.', # "Authors.Title." - missing space after period
424
+
425
+ # Pattern 4: Authors followed by title, then period, then year or venue (with extracted title)
338
426
  r'^(.+?)\.\s*([A-Z][^.]+)\.\s+(?:In:|https?://|\d{4})', # "Authors. Title. In:/URL/Year" (allow no space after period)
339
427
 
340
- # Pattern 3: Authors ending with period followed by capital letter (simpler fallback)
428
+ # Pattern 5: Authors ending with period followed by capital letter (simpler fallback) - LEAST SPECIFIC
341
429
  r'^([^.]+?)\.\s*[A-Z]', # Allow no space after period
342
430
  ]
343
431
 
@@ -347,9 +435,17 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
347
435
  potential_authors = author_match.group(1).strip()
348
436
 
349
437
  # For patterns that also capture title, extract it
350
- if i == 1 and not title and len(author_match.groups()) > 1:
438
+ if i == 2 and not title and len(author_match.groups()) > 2:
439
+ # Pattern 2 (book format) captures authors, title, and subtitle
440
+ title_part = author_match.group(2).strip()
441
+ subtitle_part = author_match.group(3).strip()
442
+ combined_title = f"{title_part}: {subtitle_part}" if subtitle_part else title_part
443
+ if len(combined_title) > 2:
444
+ title = clean_title(combined_title)
445
+ elif (i == 3 or i == 4) and not title and len(author_match.groups()) > 1:
446
+ # Pattern 3 (missing space, index 3) and Pattern 4 (with space, index 4) capture both authors and title
351
447
  potential_title = author_match.group(2).strip()
352
- if len(potential_title) > 5 and not re.match(r'^[A-Z][a-z]+,', potential_title):
448
+ if len(potential_title) > 2 and not re.match(r'^[A-Z][a-z]+,', potential_title):
353
449
  title = clean_title(potential_title)
354
450
 
355
451
  # Validate that this looks like authors
@@ -429,8 +525,11 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
429
525
  authors.append(part)
430
526
 
431
527
  # 7. Extract journal/venue - look for patterns like "In: Conference" or remaining text
528
+ # Also handle cases like "Tasks,"Adv. Neural" where there's missing space after quote-comma
432
529
  journal_patterns = [
433
530
  r'In:\s*([^.]+?)(?:\.|$)', # "In: Conference Name"
531
+ r'"[^"]*,"([A-Z][^,]*?\. [A-Z][^,]*)', # Quote-comma-venue like "Tasks,"Adv. Neural Inf. Process. Syst."
532
+ r'["\u201c\u201d]([A-Z][^.]*(?:Adv\.|Proc\.|IEEE|Journal)[^.]*)', # Missing space after quote like "Tasks"Adv. Neural"
434
533
  r'([A-Z][^.]*(?:Conference|Workshop|Journal|Proceedings)[^.]*)', # Conference/journal names
435
534
  ]
436
535
 
@@ -11,6 +11,31 @@ from typing import List
11
11
  logger = logging.getLogger(__name__)
12
12
 
13
13
 
14
# Apostrophe look-alikes mapped to the ASCII apostrophe (U+0027).
# Written as explicit escapes so the table survives editors, diff viewers and
# encoding round-trips that silently flatten curly quotes to ASCII.
_APOSTROPHE_TRANSLATION = str.maketrans({
    "\u2019": "'",  # Right single quotation mark (most common)
    "\u2018": "'",  # Left single quotation mark
    "\u02BC": "'",  # Modifier letter apostrophe
    "\u02C8": "'",  # Modifier letter vertical line (primary stress)
    "\u0060": "'",  # Grave accent (sometimes used as apostrophe)
    "\u00B4": "'",  # Acute accent (sometimes used as apostrophe)
})


def normalize_apostrophes(text):
    """
    Normalize all apostrophe variants to standard ASCII apostrophe

    Args:
        text: String to normalize. Falsy values (None, "") are returned
            unchanged.

    Returns:
        The text with every character listed in _APOSTROPHE_TRANSLATION
        replaced by "'" (U+0027); all other characters are left as-is.
    """
    if not text:
        return text

    # Single pass over the string instead of one .replace() call per variant
    return text.translate(_APOSTROPHE_TRANSLATION)
37
+
38
+
14
39
  def normalize_text(text):
15
40
  """
16
41
  Normalize text by removing diacritical marks and special characters
@@ -18,6 +43,9 @@ def normalize_text(text):
18
43
  if not text:
19
44
  return ""
20
45
 
46
+ # First normalize apostrophes to standard form
47
+ text = normalize_apostrophes(text)
48
+
21
49
  # Replace common special characters with their ASCII equivalents
22
50
  replacements = {
23
51
  'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
@@ -29,7 +57,7 @@ def normalize_text(text):
29
57
  'Ł': 'L', 'ł': 'l',
30
58
  '¨': '', '´': '', '`': '', '^': '', '~': '',
31
59
  '–': '-', '—': '-', '−': '-',
32
- '„': '"', '"': '"', '"': '"', ''': "'", ''': "'",
60
+ '„': '"', '"': '"', '"': '"',
33
61
  '«': '"', '»': '"',
34
62
  '¡': '!', '¿': '?',
35
63
  '°': 'degrees', '©': '(c)', '®': '(r)', '™': '(tm)',
@@ -39,10 +67,6 @@ def normalize_text(text):
39
67
  '\u00A0': ' ', # Non-breaking space
40
68
  '\u2013': '-', # En dash
41
69
  '\u2014': '-', # Em dash
42
- '\u2018': "'", # Left single quotation mark
43
- '\u2019': "'", # Right single quotation mark
44
- '\u201C': '"', # Left double quotation mark
45
- '\u201D': '"', # Right double quotation mark
46
70
  '\u2026': '...', # Horizontal ellipsis
47
71
  '\u00B7': '.', # Middle dot
48
72
  '\u2022': '.', # Bullet
@@ -54,8 +78,8 @@ def normalize_text(text):
54
78
  # Remove any remaining diacritical marks
55
79
  text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
56
80
 
57
- # Remove special characters
58
- text = re.sub(r'[^\w\s]', '', text)
81
+ # Remove special characters except apostrophes
82
+ text = re.sub(r"[^\w\s']", '', text)
59
83
 
60
84
  # Normalize whitespace
61
85
  text = re.sub(r'\s+', ' ', text).strip()
@@ -368,6 +392,9 @@ def clean_author_name(author):
368
392
  # Normalize Unicode characters (e.g., combining diacritics)
369
393
  author = unicodedata.normalize('NFKC', author)
370
394
 
395
+ # Normalize apostrophes first before other processing
396
+ author = normalize_apostrophes(author)
397
+
371
398
  # Handle common Unicode escape sequences and LaTeX encodings
372
399
  # Note: Order matters - process longer patterns first
373
400
  unicode_replacements = [
@@ -703,8 +730,12 @@ def normalize_diacritics(text: str) -> str:
703
730
  'José' -> 'jose'
704
731
  'Łukasz' -> 'lukasz'
705
732
  'J. Gl¨ uck' -> 'J. Gluck'
733
+ 'D'Amato' -> 'D'Amato' (apostrophes normalized)
706
734
  """
707
- # First handle special characters that don't decompose properly
735
+ # First normalize apostrophes
736
+ text = normalize_apostrophes(text)
737
+
738
+ # Then handle special characters that don't decompose properly
708
739
  # Including common transliterations
709
740
  special_chars = {
710
741
  'ł': 'l', 'Ł': 'L',
@@ -2224,7 +2255,8 @@ def format_author_for_display(author_name):
2224
2255
  if not author_name:
2225
2256
  return author_name
2226
2257
 
2227
- author_name = author_name.strip()
2258
+ # Normalize apostrophes for consistent display
2259
+ author_name = normalize_apostrophes(author_name.strip())
2228
2260
 
2229
2261
  # Check if it's in "Lastname, Firstname" format
2230
2262
  if ',' in author_name:
@@ -3743,6 +3775,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
3743
3775
  for abbrev in sorted(multi_word_abbrevs.keys(), key=len, reverse=True):
3744
3776
  if abbrev in expanded_text:
3745
3777
  expanded_text = expanded_text.replace(abbrev, multi_word_abbrevs[abbrev])
3778
+ break # Only apply the first (longest) matching abbreviation to avoid conflicts
3746
3779
 
3747
3780
  # Second pass: handle single word abbreviations
3748
3781
  words = expanded_text.split()
@@ -4137,8 +4170,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
4137
4170
  return False
4138
4171
 
4139
4172
  # Order-aware fuzzy matching - words should match in sequence
4140
- words1_list = list(words1)
4141
- words2_list = list(words2)
4173
+ # Sort to ensure deterministic order (set iteration is not guaranteed to be consistent)
4174
+ words1_list = sorted(list(words1))
4175
+ words2_list = sorted(list(words2))
4142
4176
 
4143
4177
  # If word counts are very different, they're likely different venues
4144
4178
  if len(words1) > 0 and len(words2) > 0: