academic-refchecker 1.2.39__tar.gz → 1.2.41__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.39/src/academic_refchecker.egg-info → academic_refchecker-1.2.41}/PKG-INFO +1 -1
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/__version__.py +1 -1
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/core/parallel_processor.py +2 -1
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/core/refchecker.py +29 -3
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/biblatex_parser.py +106 -9
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/text_utils.py +3 -1
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/LICENSE +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/README.md +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/pyproject.toml +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/requirements.txt +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/setup.cfg +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/__init__.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/webpage_checker.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/arxiv_utils.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/bibtex_parser.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/error_utils.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/unicode_utils.py +0 -0
- {academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/utils/url_utils.py +0 -0
|
@@ -279,7 +279,8 @@ class ParallelReferenceProcessor:
|
|
|
279
279
|
from utils.text_utils import format_authors_for_display
|
|
280
280
|
authors = format_authors_for_display(reference.get('authors', []))
|
|
281
281
|
year = reference.get('year', '')
|
|
282
|
-
venue
|
|
282
|
+
# Get venue from either 'venue' or 'journal' field
|
|
283
|
+
venue = reference.get('venue', '') or reference.get('journal', '')
|
|
283
284
|
url = reference.get('url', '')
|
|
284
285
|
doi = reference.get('doi', '')
|
|
285
286
|
|
|
@@ -3383,10 +3383,28 @@ class ArxivReferenceChecker:
|
|
|
3383
3383
|
# Check if this is biblatex format
|
|
3384
3384
|
from utils.biblatex_parser import detect_biblatex_format
|
|
3385
3385
|
if detect_biblatex_format(bibliography_text):
|
|
3386
|
-
logger.
|
|
3386
|
+
logger.debug("Detected biblatex format")
|
|
3387
3387
|
self.used_regex_extraction = True
|
|
3388
3388
|
# Note: biblatex parsing is also robust, so we don't set used_unreliable_extraction
|
|
3389
|
-
|
|
3389
|
+
biblatex_refs = self._parse_biblatex_references(bibliography_text)
|
|
3390
|
+
|
|
3391
|
+
# If biblatex parsing returned empty results (due to quality validation),
|
|
3392
|
+
# fallback to LLM if available
|
|
3393
|
+
if not biblatex_refs and self.llm_extractor:
|
|
3394
|
+
logger.debug("Biblatex is incompatible with parser")
|
|
3395
|
+
try:
|
|
3396
|
+
references = self.llm_extractor.extract_references(bibliography_text)
|
|
3397
|
+
if references:
|
|
3398
|
+
logger.debug(f"LLM fallback extracted {len(references)} references")
|
|
3399
|
+
return self._process_llm_extracted_references(references)
|
|
3400
|
+
else:
|
|
3401
|
+
logger.warning("LLM fallback also returned no results")
|
|
3402
|
+
return []
|
|
3403
|
+
except Exception as e:
|
|
3404
|
+
logger.error(f"LLM fallback failed: {e}")
|
|
3405
|
+
return []
|
|
3406
|
+
logger.debug("Using biblatex file")
|
|
3407
|
+
return biblatex_refs
|
|
3390
3408
|
|
|
3391
3409
|
# For non-standard formats, try LLM-based extraction if available
|
|
3392
3410
|
if self.llm_extractor:
|
|
@@ -3610,7 +3628,15 @@ class ArxivReferenceChecker:
|
|
|
3610
3628
|
if detect_biblatex_format(bibliography_text):
|
|
3611
3629
|
logger.debug("Detected biblatex format, using biblatex-specific parsing")
|
|
3612
3630
|
# biblatex parsing is also robust, so we don't set used_unreliable_extraction
|
|
3613
|
-
|
|
3631
|
+
biblatex_refs = self._parse_biblatex_references(bibliography_text)
|
|
3632
|
+
|
|
3633
|
+
# If biblatex parsing returned empty results (due to quality validation),
|
|
3634
|
+
# we'll continue with the unreliable fallback regex parsing
|
|
3635
|
+
if not biblatex_refs:
|
|
3636
|
+
logger.debug("Biblatex parser returned no results due to quality validation, falling back to regex parsing")
|
|
3637
|
+
print(f"⚠️ Biblatex parser found no valid references (failed quality validation) - falling back to regex parsing")
|
|
3638
|
+
else:
|
|
3639
|
+
return biblatex_refs
|
|
3614
3640
|
|
|
3615
3641
|
# If we reach here, we're using the unreliable fallback regex parsing
|
|
3616
3642
|
self.used_unreliable_extraction = True
|
|
@@ -138,6 +138,57 @@ def detect_biblatex_format(text: str) -> bool:
|
|
|
138
138
|
return has_biblatex_marker or has_numbered_refs
|
|
139
139
|
|
|
140
140
|
|
|
141
|
+
def _validate_parsing_quality(references: List[Dict[str, Any]]) -> bool:
|
|
142
|
+
"""
|
|
143
|
+
Validate that biblatex parsing results are of acceptable quality.
|
|
144
|
+
If quality is poor, we should fallback to LLM parsing instead.
|
|
145
|
+
|
|
146
|
+
Args:
|
|
147
|
+
references: List of parsed reference dictionaries
|
|
148
|
+
|
|
149
|
+
Returns:
|
|
150
|
+
True if parsing quality is acceptable, False if should fallback to LLM
|
|
151
|
+
"""
|
|
152
|
+
if not references:
|
|
153
|
+
return False
|
|
154
|
+
|
|
155
|
+
# Count problematic entries
|
|
156
|
+
unknown_authors = 0
|
|
157
|
+
unknown_titles = 0
|
|
158
|
+
total_entries = len(references)
|
|
159
|
+
|
|
160
|
+
for ref in references:
|
|
161
|
+
authors = ref.get('authors', [])
|
|
162
|
+
title = ref.get('title', '')
|
|
163
|
+
|
|
164
|
+
# Check for "Unknown Author" entries
|
|
165
|
+
if not authors or authors == ['Unknown Author']:
|
|
166
|
+
unknown_authors += 1
|
|
167
|
+
|
|
168
|
+
# Check for "Unknown Title" entries
|
|
169
|
+
if not title or title == 'Unknown Title':
|
|
170
|
+
unknown_titles += 1
|
|
171
|
+
|
|
172
|
+
# Calculate failure rates
|
|
173
|
+
author_failure_rate = unknown_authors / total_entries
|
|
174
|
+
title_failure_rate = unknown_titles / total_entries
|
|
175
|
+
|
|
176
|
+
# Quality thresholds - if more than 20% of entries have parsing failures,
|
|
177
|
+
# fallback to LLM which is more robust
|
|
178
|
+
MAX_ACCEPTABLE_FAILURE_RATE = 0.2
|
|
179
|
+
|
|
180
|
+
if author_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
|
|
181
|
+
logger.debug(f"Biblatex parsing quality poor: {author_failure_rate:.1%} unknown authors (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
|
|
182
|
+
return False
|
|
183
|
+
|
|
184
|
+
if title_failure_rate > MAX_ACCEPTABLE_FAILURE_RATE:
|
|
185
|
+
logger.debug(f"Biblatex parsing quality poor: {title_failure_rate:.1%} unknown titles (>{MAX_ACCEPTABLE_FAILURE_RATE:.0%}). Falling back to LLM.")
|
|
186
|
+
return False
|
|
187
|
+
|
|
188
|
+
logger.debug(f"Biblatex parsing quality acceptable: {author_failure_rate:.1%} unknown authors, {title_failure_rate:.1%} unknown titles")
|
|
189
|
+
return True
|
|
190
|
+
|
|
191
|
+
|
|
141
192
|
def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
|
|
142
193
|
"""
|
|
143
194
|
Parse biblatex formatted references into structured format
|
|
@@ -146,7 +197,8 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
|
|
|
146
197
|
text: String containing biblatex .bbl entries
|
|
147
198
|
|
|
148
199
|
Returns:
|
|
149
|
-
List of structured reference dictionaries
|
|
200
|
+
List of structured reference dictionaries, or empty list if
|
|
201
|
+
parsing quality is poor (to trigger LLM fallback)
|
|
150
202
|
"""
|
|
151
203
|
from utils.text_utils import parse_authors_with_initials, clean_title
|
|
152
204
|
from utils.doi_utils import construct_doi_url, is_valid_doi_format
|
|
@@ -171,7 +223,7 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
|
|
|
171
223
|
# Find the content between this entry and the next (or end of text)
|
|
172
224
|
if i + 1 < len(entry_starts):
|
|
173
225
|
next_start = entry_starts[i + 1][1]
|
|
174
|
-
|
|
226
|
+
raw_content = text[end:next_start].strip()
|
|
175
227
|
else:
|
|
176
228
|
# Last entry - take everything to end, but be smart about stopping
|
|
177
229
|
remaining = text[end:].strip()
|
|
@@ -190,9 +242,20 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
|
|
|
190
242
|
if match and match.start() < min_stop:
|
|
191
243
|
min_stop = match.start()
|
|
192
244
|
|
|
193
|
-
|
|
245
|
+
raw_content = remaining[:min_stop].strip()
|
|
194
246
|
|
|
195
|
-
|
|
247
|
+
# Clean up content - handle cases where entry might be incomplete or malformed
|
|
248
|
+
if raw_content:
|
|
249
|
+
# Remove stray closing brackets or incomplete markers
|
|
250
|
+
content = raw_content
|
|
251
|
+
# Remove trailing "]" if it's the only thing on the last line
|
|
252
|
+
lines = content.split('\n')
|
|
253
|
+
if len(lines) > 1 and lines[-1].strip() == ']':
|
|
254
|
+
content = '\n'.join(lines[:-1]).strip()
|
|
255
|
+
elif content.strip() == ']':
|
|
256
|
+
# If content is only "]", skip this entry as it's incomplete
|
|
257
|
+
continue
|
|
258
|
+
|
|
196
259
|
matches.append((entry_num, content))
|
|
197
260
|
|
|
198
261
|
for entry_num, content in matches:
|
|
@@ -218,6 +281,11 @@ def parse_biblatex_references(text: str) -> List[Dict[str, Any]]:
|
|
|
218
281
|
references.append(parsed_ref)
|
|
219
282
|
|
|
220
283
|
logger.debug(f"Extracted {len(references)} biblatex references")
|
|
284
|
+
|
|
285
|
+
# Validate parsing quality - if poor, return empty list to trigger LLM fallback
|
|
286
|
+
if not _validate_parsing_quality(references):
|
|
287
|
+
return []
|
|
288
|
+
|
|
221
289
|
return references
|
|
222
290
|
|
|
223
291
|
|
|
@@ -263,6 +331,8 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
|
|
|
263
331
|
# Pattern: "FirstAuthor et al. Title Goes Here. Year." or "Author. Title. Year."
|
|
264
332
|
# Order matters: more specific patterns first
|
|
265
333
|
title_patterns = [
|
|
334
|
+
# Pattern for unquoted books: "Author1 and Author2, Title: Subtitle. Location: Publisher, Year."
|
|
335
|
+
r'(?:and\s+[A-Z][^,]*),\s+([A-Z][^.]*?:\s*[^.]*?)\.\s+[A-Z][^:]*:\s*[^,]*,\s*\d{4}',
|
|
266
336
|
r'[A-Z][^.]+\.\s*([A-Z][^.]*?)\.\s*(?:https?://|arXiv:|\d{4})', # "Authors. Title. URL/arXiv/Year" (flexible spacing) - MOST SPECIFIC
|
|
267
337
|
r'\.([A-Z][A-Za-z\s]+(?:\?|!)?)\.?\s+\d{4}', # ".Title. Year" - for cases where authors end without space
|
|
268
338
|
r'[A-Z][a-z]+\.([A-Z][A-Za-z\s\-&]+?)\.\s+\d{4}', # "Name.Title. Year" - missing space after period
|
|
@@ -276,7 +346,14 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
|
|
|
276
346
|
if title_match:
|
|
277
347
|
potential_title = title_match.group(1)
|
|
278
348
|
# Make sure it looks like a title and not author names
|
|
279
|
-
|
|
349
|
+
# Be more specific about author name patterns - should be "Surname, Initial" not "Word, Word"
|
|
350
|
+
author_like_pattern = r'^[A-Z][a-z]+,\s*[A-Z]\.?$' # "Smith, J." or "Smith, J"
|
|
351
|
+
multi_word_author = r'^[A-Z][a-z]+,\s*[A-Z][a-z]+$' # "Smith, John" - but still reject this
|
|
352
|
+
|
|
353
|
+
is_author_like = (re.match(author_like_pattern, potential_title) or
|
|
354
|
+
re.match(multi_word_author, potential_title))
|
|
355
|
+
|
|
356
|
+
if len(potential_title) > 2 and not is_author_like:
|
|
280
357
|
title = clean_title(potential_title)
|
|
281
358
|
break
|
|
282
359
|
|
|
@@ -330,16 +407,25 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
|
|
|
330
407
|
# Examples we need to handle:
|
|
331
408
|
# "Egor Zverev, Sahar Abdelnabi, Mario Fritz, and Christoph H Lampert. \"Title\". In: venue (year)."
|
|
332
409
|
# "Andrej Karpathy. Intro to Large Language Models. https://... year."
|
|
410
|
+
# "A. Author and B. Coauthor, \"Title\"," <- handle this format
|
|
333
411
|
|
|
334
412
|
# Try multiple patterns to extract authors
|
|
413
|
+
# Order matters - more specific patterns first!
|
|
335
414
|
author_patterns = [
|
|
336
415
|
# Pattern 1: Authors followed by quoted title (handle both regular and smart quotes)
|
|
416
|
+
r'^([^"\u201c\u201d]+?),\s*["\u201c\u201d]', # "Authors, \"Title\"" - more restrictive, requires comma before quote
|
|
337
417
|
r'^([^"\u201c\u201d]+)\.\s*["\u201c\u201d]', # "Authors. \"Title\"" or smart quotes
|
|
338
418
|
|
|
339
|
-
# Pattern 2: Authors followed by title
|
|
419
|
+
# Pattern 2: Authors followed by unquoted title for books: "Author1 and Author2, Title:"
|
|
420
|
+
r'^([^,]+(?:\s+and\s+[^,]+)?),\s+([A-Z][^.]*?):\s*([^.]*?)\.', # "Author1 and Author2, Title: Subtitle." - book format
|
|
421
|
+
|
|
422
|
+
# Pattern 3: Authors ending with period, no space, then title (missing space case) - MORE SPECIFIC
|
|
423
|
+
r'^([^.]+?)\.([A-Z][^.]*)\.', # "Authors.Title." - missing space after period
|
|
424
|
+
|
|
425
|
+
# Pattern 4: Authors followed by title, then period, then year or venue (with extracted title)
|
|
340
426
|
r'^(.+?)\.\s*([A-Z][^.]+)\.\s+(?:In:|https?://|\d{4})', # "Authors. Title. In:/URL/Year" (allow no space after period)
|
|
341
427
|
|
|
342
|
-
# Pattern
|
|
428
|
+
# Pattern 5: Authors ending with period followed by capital letter (simpler fallback) - LEAST SPECIFIC
|
|
343
429
|
r'^([^.]+?)\.\s*[A-Z]', # Allow no space after period
|
|
344
430
|
]
|
|
345
431
|
|
|
@@ -349,9 +435,17 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
|
|
|
349
435
|
potential_authors = author_match.group(1).strip()
|
|
350
436
|
|
|
351
437
|
# For patterns that also capture title, extract it
|
|
352
|
-
if i ==
|
|
438
|
+
if i == 2 and not title and len(author_match.groups()) > 2:
|
|
439
|
+
# Pattern 2 (book format) captures authors, title, and subtitle
|
|
440
|
+
title_part = author_match.group(2).strip()
|
|
441
|
+
subtitle_part = author_match.group(3).strip()
|
|
442
|
+
combined_title = f"{title_part}: {subtitle_part}" if subtitle_part else title_part
|
|
443
|
+
if len(combined_title) > 2:
|
|
444
|
+
title = clean_title(combined_title)
|
|
445
|
+
elif (i == 3 or i == 4) and not title and len(author_match.groups()) > 1:
|
|
446
|
+
# Pattern 3 (missing space, index 3) and Pattern 4 (with space, index 4) capture both authors and title
|
|
353
447
|
potential_title = author_match.group(2).strip()
|
|
354
|
-
if len(potential_title) >
|
|
448
|
+
if len(potential_title) > 2 and not re.match(r'^[A-Z][a-z]+,', potential_title):
|
|
355
449
|
title = clean_title(potential_title)
|
|
356
450
|
|
|
357
451
|
# Validate that this looks like authors
|
|
@@ -431,8 +525,11 @@ def parse_biblatex_entry_content(entry_num: str, content: str) -> Dict[str, Any]
|
|
|
431
525
|
authors.append(part)
|
|
432
526
|
|
|
433
527
|
# 7. Extract journal/venue - look for patterns like "In: Conference" or remaining text
|
|
528
|
+
# Also handle cases like "Tasks,"Adv. Neural" where there's missing space after quote-comma
|
|
434
529
|
journal_patterns = [
|
|
435
530
|
r'In:\s*([^.]+?)(?:\.|$)', # "In: Conference Name"
|
|
531
|
+
r'"[^"]*,"([A-Z][^,]*?\. [A-Z][^,]*)', # Quote-comma-venue like "Tasks,"Adv. Neural Inf. Process. Syst."
|
|
532
|
+
r'["\u201c\u201d]([A-Z][^.]*(?:Adv\.|Proc\.|IEEE|Journal)[^.]*)', # Missing space after quote like "Tasks"Adv. Neural"
|
|
436
533
|
r'([A-Z][^.]*(?:Conference|Workshop|Journal|Proceedings)[^.]*)', # Conference/journal names
|
|
437
534
|
]
|
|
438
535
|
|
|
@@ -3006,7 +3006,9 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
|
|
|
3006
3006
|
if ref['year']:
|
|
3007
3007
|
venue_clean = re.sub(rf'\b{ref["year"]}\b.*', '', venue_clean)
|
|
3008
3008
|
venue_clean = venue_clean.rstrip(',. ')
|
|
3009
|
-
|
|
3009
|
+
# Filter out common non-venue patterns that shouldn't be treated as venues
|
|
3010
|
+
non_venue_patterns = ['URL', 'url', 'http:', 'https:', 'DOI', 'doi:', 'ArXiv', 'arxiv:']
|
|
3011
|
+
if venue_clean and not any(pattern in venue_clean for pattern in non_venue_patterns):
|
|
3010
3012
|
ref['journal'] = venue_clean
|
|
3011
3013
|
|
|
3012
3014
|
# Extract URL if present
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/enhanced_hybrid_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/local_semantic_scholar.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.39 → academic_refchecker-1.2.41}/src/checkers/openreview_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|