academic-refchecker 1.2.38__tar.gz → 1.2.40__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.38/src/academic_refchecker.egg-info → academic_refchecker-1.2.40}/PKG-INFO +1 -1
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/__version__.py +1 -1
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/core/refchecker.py +27 -2
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/biblatex_parser.py +111 -12
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/text_utils.py +45 -11
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/LICENSE +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/README.md +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/pyproject.toml +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/requirements.txt +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/setup.cfg +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/__init__.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/checkers/webpage_checker.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/core/parallel_processor.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/arxiv_utils.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/bibtex_parser.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/error_utils.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/unicode_utils.py +0 -0
- {academic_refchecker-1.2.38 → academic_refchecker-1.2.40}/src/utils/url_utils.py +0 -0
--- academic_refchecker-1.2.38/src/core/refchecker.py
+++ academic_refchecker-1.2.40/src/core/refchecker.py
@@ -3386,7 +3386,25 @@ class ArxivReferenceChecker:
             logger.info("Detected biblatex format, using biblatex parser")
             self.used_regex_extraction = True
             # Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
-
+            biblatex_refs = self._parse_biblatex_references(bibliography_text)
+
+            # If biblatex parsing returned empty results (due to quality validation),
+            # fallback to LLM if available
+            if not biblatex_refs and self.llm_extractor:
+                logger.debug("Biblatex parser returned no results due to quality validation, trying LLM fallback")
+                try:
+                    references = self.llm_extractor.extract_references(bibliography_text)
+                    if references:
+                        logger.debug(f"LLM fallback extracted {len(references)} references")
+                        return self._process_llm_extracted_references(references)
+                    else:
+                        logger.warning("LLM fallback also returned no results")
+                        return []
+                except Exception as e:
+                    logger.error(f"LLM fallback failed: {e}")
+                    return []
+
+            return biblatex_refs
 
         # For non-standard formats, try LLM-based extraction if available
         if self.llm_extractor:
@@ -3610,7 +3628,14 @@ class ArxivReferenceChecker:
         if detect_biblatex_format(bibliography_text):
             logger.debug("Detected biblatex format, using biblatex-specific parsing")
            # biblatex parsing is also robust, so we don't set used_unreliable_extraction
-
+            biblatex_refs = self._parse_biblatex_references(bibliography_text)
+
+            # If biblatex parsing returned empty results (due to quality validation),
+            # we'll continue with the unreliable fallback regex parsing
+            if not biblatex_refs:
+                logger.debug("Biblatex parser returned no results due to quality validation, falling back to regex parsing")
+            else:
+                return biblatex_refs
 
         # If we reach here, we're using the unreliable fallback regex parsing
        self.used_unreliable_extraction = True
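Both hunks lean on the same convention introduced below in `src/utils/biblatex_parser.py`: `_parse_biblatex_references` returns an empty list when its output fails the quality check, and the caller treats that as "try the next extractor". A rough, hypothetical sketch of that contract (illustrative only, not the package's actual API):

```python
def extract_with_fallback(bibliography_text, primary, fallbacks):
    """Try extractors in order; an empty result means 'move on to the next one'."""
    refs = primary(bibliography_text)
    if refs:
        return refs
    for extractor in fallbacks:
        refs = extractor(bibliography_text)
        if refs:
            return refs
    return []
```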
--- academic_refchecker-1.2.38/src/utils/biblatex_parser.py
+++ academic_refchecker-1.2.40/src/utils/biblatex_parser.py
@@ -138,6 +138,57 @@ def detect_biblatex_format(text: str) -> bool:
     return has_biblatex_marker or has_numbered_refs
 
 
+def _validate_parsing_quality(references: List[Dict[str, Any]]) -> bool:
+    """
+    Validate that biblatex parsing results are of acceptable quality.
+    If quality is poor, we should fallback to LLM parsing instead.
+
+    Args:
+        references: List of parsed reference dictionaries
+
+    Returns:
+        True if parsing quality is acceptable, False if should fallback to LLM
+    """
+    if not references:
+        return False
+
+    # Count problematic entries
+    unknown_authors = 0
+    unknown_titles = 0
+    total_entries = len(references)
+
+    for ref in references:
+        authors = ref.get('authors', [])
+        title = ref.get('title', '')
+
+        # Check for "Unknown Author" entries
+        if not authors or authors == ['Unknown Author']:
+            unknown_authors += 1
+
+        # Check for "Unknown Title" entries
+        if not title or title == 'Unknown Title':
+            unknown_titles += 1
+
+    # Calculate failure rates
+    author_failure_rate = unknown_authors / total_entries
+    title_failure_rate = unknown_titles / total_entries
+
+    # Quality thresholds - if more than 20% of entries have parsing failures,
+    # fallback to LLM which is more robust
+    MAX_ACCEPTABLE_FAILURE_RATE = 0.2
+
+    if author_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
+        logger.debug(f"Biblatex parsing quality poor: {author_failure_rate:.1%} unknown authors (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
+        return False
+
+    if title_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
+        logger.debug(f"Biblatex parsing quality poor: {title_failure_rate:.1%} unknown titles (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
+        return False
+
+    logger.debug(f"Biblatex parsing quality acceptable: {author_failure_rate:.1%} unknown authors, {title_failure_rate:.1%} unknown titles")
+    return True
+
+
 def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
     """
     Parse biblatex formatted references into structured format
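To make the 20% threshold concrete, here is a small hypothetical illustration (the reference dicts are made up, not taken from the package): with 3 of 10 entries parsed as "Unknown Author", the author failure rate is 30%, so the validator returns False; as the hunk at @@ -218 below shows, `parse_biblatex_references` then returns an empty list, which triggers the fallback wired up in `refchecker.py` above.

```python
refs = (
    [{"authors": ["Jane Doe"], "title": "A Useful Paper"}] * 7
    + [{"authors": ["Unknown Author"], "title": "A Useful Paper"}] * 3
)
unknown_authors = sum(1 for r in refs if not r["authors"] or r["authors"] == ["Unknown Author"])
rate = unknown_authors / len(refs)
print(f"{rate:.1%}")  # 30.0% > 20%, so the parsed batch is rejected
```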
@@ -146,7 +197,8 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
         text: String containing biblatex .bbl entries
 
     Returns:
-        List of structured reference dictionaries
+        List of structured reference dictionaries, or empty list if
+        parsing quality is poor (to trigger LLM fallback)
     """
     from utils.text_utils import parse_authors_with_initials, clean_title
     from utils.doi_utils import construct_doi_url, is_valid_doi_format
@@ -171,7 +223,7 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
         # Find the content between this entry and the next (or end of text)
         if i + 1 < len(entry_starts):
             next_start = entry_starts[i + 1][1]
-
+            raw_content = text[end:next_start].strip()
         else:
             # Last entry - take everything to end, but be smart about stopping
             remaining = text[end:].strip()
@@ -190,9 +242,20 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
                 if match and match.start() < min_stop:
                     min_stop = match.start()
 
-
+            raw_content = remaining[:min_stop].strip()
 
-
+        # Clean up content - handle cases where entry might be incomplete or malformed
+        if raw_content:
+            # Remove stray closing brackets or incomplete markers
+            content = raw_content
+            # Remove trailing "]" if it's the only thing on the last line
+            lines = content.split('\n')
+            if len(lines) > 1 and lines[-1].strip() == ']':
+                content = '\n'.join(lines[:-1]).strip()
+            elif content.strip() == ']':
+                # If content is only "], skip this entry as it's incomplete
+                continue
+
         matches.append((entry_num, content))
 
     for entry_num, content in matches:
@@ -218,6 +281,11 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
             references.append(parsed_ref)
 
     logger.debug(f"Extracted {len(references)} biblatex references")
+
+    # Validate parsing quality - if poor, return empty list to trigger LLM fallback
+    if not _validate_parsing_quality(references):
+        return []
+
     return references
 
 
@@ -261,11 +329,15 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
     else:
         # If no quoted title, look for title after author names
         # Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
+        # Order matters: more specific patterns first
         title_patterns = [
-
-            r'[A-Z][
-            r'
+            # Pattern for unquoted books: "Author1 and Author2, Title: Subtitle. Location: Publisher, Year."
+            r'(?:and\s+[A-Z][^,]*),\s+([A-Z][^.]*?:\s*[^.]*?)\.\s+[A-Z][^:]*:\s*[^,]*,\s*\d{4}',
+            r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
             r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}', # ".Title. Year" - for cases where authors end without space
+            r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}', # "Name.Title. Year" - missing space after period
+            r'[A-Z][a-z]+(?:\s+et\s+al)?\.?\s+([A-Z][^.]*?)\.\s+\d{4}', # "Author et al. Title. Year" - LESS SPECIFIC
+            r'(?:[A-Z][a-z]+,?\s+)+([A-Z][^.]*?)\.\s+\d{4}', # "Name, Name. Title. Year"
             r'\b([A-Z][A-Za-z\s\-0-9]+)\s+\.\s+https', # "Title . https" - handle space before period
         ]
 
@@ -274,7 +346,14 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
         if title_match:
             potential_title = title_match.group(1)
             # Make sure it looks like a title and not author names
-
+            # Be more specific about author name patterns - should be "Surname, Initial" not "Word, Word"
+            author_like_pattern = r'^[A-Z][a-z]+,\s*[A-Z]\.?$' # "Smith, J." or "Smith, J"
+            multi_word_author = r'^[A-Z][a-z]+,\s*[A-Z][a-z]+$' # "Smith, John" - but still reject this
+
+            is_author_like = (re.match(author_like_pattern, potential_title) or
+                              re.match(multi_word_author, potential_title))
+
+            if len(potential_title) > 2 and not is_author_like:
                 title = clean_title(potential_title)
                 break
 
@@ -328,16 +407,25 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
     # Examples we need to handle:
     # "Egor Zverev, Sahar Abdelnabi, Mario Fritz, and Christoph H Lampert. \"Title\". In: venue (year)."
     # "Andrej Karpathy. Intro to Large Language Models. https://... year."
+    # "A. Author and B. Coauthor, \"Title\"," <- handle this format
 
     # Try multiple patterns to extract authors
+    # Order matters - more specific patterns first!
     author_patterns = [
         # Pattern 1: Authors followed by quoted title (handle both regular and smart quotes)
+        r'^([^"\u201c\u201d]+?),\s*["\u201c\u201d]', # "Authors, \"Title\"" - more restrictive, requires comma before quote
         r'^([^"\u201c\u201d]+)\.\s*["\u201c\u201d]', # "Authors. \"Title\"" or smart quotes
 
-        # Pattern 2: Authors followed by title
+        # Pattern 2: Authors followed by unquoted title for books: "Author1 and Author2, Title:"
+        r'^([^,]+(?:\s+and\s+[^,]+)?),\s+([A-Z][^.]*?):\s*([^.]*?)\.', # "Author1 and Author2, Title: Subtitle." - book format
+
+        # Pattern 3: Authors ending with period, no space, then title (missing space case) - MORE SPECIFIC
+        r'^([^.]+?)\.([A-Z][^.]*)\.', # "Authors.Title." - missing space after period
+
+        # Pattern 4: Authors followed by title, then period, then year or venue (with extracted title)
         r'^(.+?)\.\s*([A-Z][^.]+)\.\s+(?:In:|https?://|\d{4})', # "Authors. Title. In:/URL/Year" (allow no space after period)
 
-        # Pattern
+        # Pattern 5: Authors ending with period followed by capital letter (simpler fallback) - LEAST SPECIFIC
         r'^([^.]+?)\.\s*[A-Z]', # Allow no space after period
     ]
 
@@ -347,9 +435,17 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
             potential_authors = author_match.group(1).strip()
 
             # For patterns that also capture title, extract it
-            if i ==
+            if i == 2 and not title and len(author_match.groups()) > 2:
+                # Pattern 2 (book format) captures authors, title, and subtitle
+                title_part = author_match.group(2).strip()
+                subtitle_part = author_match.group(3).strip()
+                combined_title = f"{title_part}: {subtitle_part}" if subtitle_part else title_part
+                if len(combined_title) > 2:
+                    title = clean_title(combined_title)
+            elif (i == 3 or i == 4) and not title and len(author_match.groups()) > 1:
+                # Pattern 3 (missing space, index 3) and Pattern 4 (with space, index 4) capture both authors and title
                 potential_title = author_match.group(2).strip()
-                if len(potential_title) >
+                if len(potential_title) > 2 and not re.match(r'^[A-Z][a-z]+,', potential_title):
                     title = clean_title(potential_title)
 
             # Validate that this looks like authors
@@ -429,8 +525,11 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
                 authors.append(part)
 
     # 7. Extract journal/venue - look for patterns like "In: Conference" or remaining text
+    # Also handle cases like "Tasks,"Adv. Neural" where there's missing space after quote-comma
     journal_patterns = [
         r'In:\s*([^.]+?)(?:\.|$)', # "In: Conference Name"
+        r'"[^"]*,"([A-Z][^,]*?\. [A-Z][^,]*)', # Quote-comma-venue like "Tasks,"Adv. Neural Inf. Process. Syst."
+        r'["\u201c\u201d]([A-Z][^.]*(?:Adv\.|Proc\.|IEEE|Journal)[^.]*)', # Missing space after quote like "Tasks"Adv. Neural"
         r'([A-Z][^.]*(?:Conference|Workshop|Journal|Proceedings)[^.]*)', # Conference/journal names
     ]
 
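The new "book format" author/title pattern (Pattern 2 in the author_patterns hunk above) is easiest to see on a sample string. The reference below is made up for illustration; only the regex is taken from the diff:

```python
import re

ref = "I. Goodfellow and Y. Bengio, Deep Learning: Methods and Applications. Cambridge: MIT Press, 2016."
pattern = r'^([^,]+(?:\s+and\s+[^,]+)?),\s+([A-Z][^.]*?):\s*([^.]*?)\.'
m = re.match(pattern, ref)
print(m.group(1))                       # 'I. Goodfellow and Y. Bengio'
print(f"{m.group(2)}: {m.group(3)}")    # 'Deep Learning: Methods and Applications'
```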
--- academic_refchecker-1.2.38/src/utils/text_utils.py
+++ academic_refchecker-1.2.40/src/utils/text_utils.py
@@ -11,6 +11,31 @@ from typing import List
 logger = logging.getLogger(__name__)
 
 
+def normalize_apostrophes(text):
+    """
+    Normalize all apostrophe variants to standard ASCII apostrophe
+    """
+    if not text:
+        return text
+
+    # All known apostrophe variants
+    apostrophe_variants = [
+        "'", # U+0027 ASCII apostrophe
+        "’", # U+2019 Right single quotation mark (most common)
+        "‘", # U+2018 Left single quotation mark
+        "ʼ", # U+02BC Modifier letter apostrophe
+        "ˈ", # U+02C8 Modifier letter vertical line (primary stress)
+        "`", # U+0060 Grave accent (sometimes used as apostrophe)
+        "´", # U+00B4 Acute accent (sometimes used as apostrophe)
+    ]
+
+    # Replace all variants with standard ASCII apostrophe
+    for variant in apostrophe_variants:
+        text = text.replace(variant, "'")
+
+    return text
+
+
 def normalize_text(text):
     """
     Normalize text by removing diacritical marks and special characters
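For a quick sanity check of the behavior the hunk above adds, here is a standalone copy of the replacement loop with a sample input (illustrative only; in the package the function lives in src/utils/text_utils.py):

```python
def normalize_apostrophes(text):
    """Collapse curly/modifier/accent apostrophe variants to the ASCII apostrophe."""
    if not text:
        return text
    for variant in ["\u2019", "\u2018", "\u02bc", "\u02c8", "`", "\u00b4"]:
        text = text.replace(variant, "'")
    return text

print(normalize_apostrophes("O\u2019Neil and D\u2018Amato"))  # O'Neil and D'Amato
```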
@@ -18,6 +43,9 @@ def normalize_text(text):
     if not text:
         return ""
 
+    # First normalize apostrophes to standard form
+    text = normalize_apostrophes(text)
+
     # Replace common special characters with their ASCII equivalents
     replacements = {
         'ä': 'a', 'ö': 'o', 'ü': 'u', 'ß': 'ss',
@@ -29,7 +57,7 @@ def normalize_text(text):
         'Ł': 'L', 'ł': 'l',
         '¨': '', '´': '', '`': '', '^': '', '~': '',
         '–': '-', '—': '-', '−': '-',
-        '„': '"', '"': '"', '"': '"',
+        '„': '"', '"': '"', '"': '"',
         '«': '"', '»': '"',
         '¡': '!', '¿': '?',
         '°': 'degrees', '©': '(c)', '®': '(r)', '™': '(tm)',
@@ -39,10 +67,6 @@ def normalize_text(text):
         '\u00A0': ' ', # Non-breaking space
         '\u2013': '-', # En dash
         '\u2014': '-', # Em dash
-        '\u2018': "'", # Left single quotation mark
-        '\u2019': "'", # Right single quotation mark
-        '\u201C': '"', # Left double quotation mark
-        '\u201D': '"', # Right double quotation mark
         '\u2026': '...', # Horizontal ellipsis
         '\u00B7': '.', # Middle dot
         '\u2022': '.', # Bullet
@@ -54,8 +78,8 @@ def normalize_text(text):
     # Remove any remaining diacritical marks
     text = unicodedata.normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
 
-    # Remove special characters
-    text = re.sub(r
+    # Remove special characters except apostrophes
+    text = re.sub(r"[^\w\s']", '', text)
 
     # Normalize whitespace
     text = re.sub(r'\s+', ' ', text).strip()
@@ -368,6 +392,9 @@ def clean_author_name(author):
     # Normalize Unicode characters (e.g., combining diacritics)
     author = unicodedata.normalize('NFKC', author)
 
+    # Normalize apostrophes first before other processing
+    author = normalize_apostrophes(author)
+
     # Handle common Unicode escape sequences and LaTeX encodings
     # Note: Order matters - process longer patterns first
     unicode_replacements = [
@@ -703,8 +730,12 @@ def normalize_diacritics(text: str) -> str:
     'José' -> 'jose'
     'Łukasz' -> 'lukasz'
     'J. Gl¨ uck' -> 'J. Gluck'
+    'D’Amato' -> 'D'Amato' (apostrophes normalized)
     """
-    # First
+    # First normalize apostrophes
+    text = normalize_apostrophes(text)
+
+    # Then handle special characters that don't decompose properly
     # Including common transliterations
     special_chars = {
         'ł': 'l', 'Ł': 'L',
@@ -2224,7 +2255,8 @@ def format_author_for_display(author_name):
     if not author_name:
         return author_name
 
-
+    # Normalize apostrophes for consistent display
+    author_name = normalize_apostrophes(author_name.strip())
 
     # Check if it's in "Lastname, Firstname" format
     if ',' in author_name:
@@ -3743,6 +3775,7 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
     for abbrev in sorted(multi_word_abbrevs.keys(), key=len, reverse=True):
         if abbrev in expanded_text:
             expanded_text = expanded_text.replace(abbrev, multi_word_abbrevs[abbrev])
+            break # Only apply the first (longest) matching abbreviation to avoid conflicts
 
     # Second pass: handle single word abbreviations
     words = expanded_text.split()
@@ -4137,8 +4170,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
         return False
 
     # Order-aware fuzzy matching - words should match in sequence
-
-
+    # Sort to ensure deterministic order (set iteration is not guaranteed to be consistent)
+    words1_list = sorted(list(words1))
+    words2_list = sorted(list(words2))
 
     # If word counts are very different, they're likely different venues
     if len(words1) > 0 and len(words2) > 0:
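A one-line illustration of why the sort in the last hunk matters (assumption: words1/words2 are Python sets of venue words, as the surrounding code suggests): iterating a set of strings can differ between runs because string hashing is randomized, while sorted() always yields the same order.

```python
words = {"neural", "information", "processing", "systems"}
print(sorted(words))  # always ['information', 'neural', 'processing', 'systems'], regardless of set iteration order
```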