academic-refchecker 1.2.53__py3-none-any.whl → 1.2.55__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (53)
  1. {academic_refchecker-1.2.53.dist-info → academic_refchecker-1.2.55.dist-info}/METADATA +23 -23
  2. academic_refchecker-1.2.55.dist-info/RECORD +49 -0
  3. academic_refchecker-1.2.55.dist-info/entry_points.txt +2 -0
  4. academic_refchecker-1.2.55.dist-info/top_level.txt +1 -0
  5. refchecker/__init__.py +13 -0
  6. refchecker/__main__.py +11 -0
  7. refchecker/__version__.py +5 -0
  8. {checkers → refchecker/checkers}/crossref.py +5 -5
  9. {checkers → refchecker/checkers}/enhanced_hybrid_checker.py +1 -1
  10. {checkers → refchecker/checkers}/github_checker.py +4 -4
  11. {checkers → refchecker/checkers}/local_semantic_scholar.py +7 -7
  12. {checkers → refchecker/checkers}/openalex.py +6 -6
  13. {checkers → refchecker/checkers}/openreview_checker.py +8 -8
  14. {checkers → refchecker/checkers}/pdf_paper_checker.py +1 -1
  15. {checkers → refchecker/checkers}/semantic_scholar.py +10 -10
  16. {checkers → refchecker/checkers}/webpage_checker.py +3 -3
  17. {core → refchecker/core}/parallel_processor.py +6 -6
  18. {core → refchecker/core}/refchecker.py +63 -63
  19. {utils → refchecker/utils}/arxiv_utils.py +3 -3
  20. {utils → refchecker/utils}/biblatex_parser.py +4 -4
  21. {utils → refchecker/utils}/bibliography_utils.py +5 -5
  22. {utils → refchecker/utils}/bibtex_parser.py +5 -5
  23. {utils → refchecker/utils}/error_utils.py +1 -1
  24. {utils → refchecker/utils}/text_utils.py +62 -13
  25. __version__.py +0 -3
  26. academic_refchecker-1.2.53.dist-info/RECORD +0 -47
  27. academic_refchecker-1.2.53.dist-info/entry_points.txt +0 -2
  28. academic_refchecker-1.2.53.dist-info/top_level.txt +0 -9
  29. {academic_refchecker-1.2.53.dist-info → academic_refchecker-1.2.55.dist-info}/WHEEL +0 -0
  30. {academic_refchecker-1.2.53.dist-info → academic_refchecker-1.2.55.dist-info}/licenses/LICENSE +0 -0
  31. {checkers → refchecker/checkers}/__init__.py +0 -0
  32. {config → refchecker/config}/__init__.py +0 -0
  33. {config → refchecker/config}/logging.conf +0 -0
  34. {config → refchecker/config}/settings.py +0 -0
  35. {core → refchecker/core}/__init__.py +0 -0
  36. {core → refchecker/core}/db_connection_pool.py +0 -0
  37. {database → refchecker/database}/__init__.py +0 -0
  38. {database → refchecker/database}/download_semantic_scholar_db.py +0 -0
  39. {llm → refchecker/llm}/__init__.py +0 -0
  40. {llm → refchecker/llm}/base.py +0 -0
  41. {llm → refchecker/llm}/providers.py +0 -0
  42. {scripts → refchecker/scripts}/__init__.py +0 -0
  43. {scripts → refchecker/scripts}/start_vllm_server.py +0 -0
  44. {services → refchecker/services}/__init__.py +0 -0
  45. {services → refchecker/services}/pdf_processor.py +0 -0
  46. {utils → refchecker/utils}/__init__.py +0 -0
  47. {utils → refchecker/utils}/author_utils.py +0 -0
  48. {utils → refchecker/utils}/config_validator.py +0 -0
  49. {utils → refchecker/utils}/db_utils.py +0 -0
  50. {utils → refchecker/utils}/doi_utils.py +0 -0
  51. {utils → refchecker/utils}/mock_objects.py +0 -0
  52. {utils → refchecker/utils}/unicode_utils.py +0 -0
  53. {utils → refchecker/utils}/url_utils.py +0 -0
{utils → refchecker/utils}/text_utils.py CHANGED
@@ -173,6 +173,11 @@ def parse_authors_with_initials(authors_text):
173
173
  if stripped_text in ['others', 'and others', 'et al', 'et al.']:
174
174
  return []
175
175
 
176
+ # Clean LaTeX commands early to prevent parsing issues
177
+ # This fixes cases like "Hochreiter, Sepp and Schmidhuber, J{\"u}rgen"
178
+ # which should parse as 2 authors, not get split incorrectly due to LaTeX braces
179
+ authors_text = strip_latex_commands(authors_text)
180
+
176
181
  # Fix spacing around periods in initials (e.g., "Y . Li" -> "Y. Li") before parsing
177
182
  authors_text = re.sub(r'(\w)\s+\.', r'\1.', authors_text)
178
183
 
@@ -300,9 +305,9 @@ def parse_authors_with_initials(authors_text):
300
305
  comma_parts = [p.strip() for p in part.split(',')]
301
306
  if len(comma_parts) == 2:
302
307
  lastname, firstname = comma_parts
303
- # Both parts should contain only letters, spaces, hyphens, apostrophes, and periods
304
- if (re.match(r'^[A-Za-z\s\-\'.]+$', lastname) and
305
- re.match(r'^[A-Za-z\s\-\'.]+$', firstname) and
308
+ # Both parts should contain only letters (including Unicode), spaces, hyphens, apostrophes, and periods
309
+ if (re.match(r'^[\w\s\-\'.]+$', lastname, re.UNICODE) and
310
+ re.match(r'^[\w\s\-\'.]+$', firstname, re.UNICODE) and
306
311
  lastname and firstname):
307
312
  valid_author_parts.append(part)
308
313
 
@@ -314,6 +319,50 @@ def parse_authors_with_initials(authors_text):
314
319
  # Split on commas first for other formats
315
320
  parts = [part.strip() for part in authors_text.split(',') if part.strip()]
316
321
 
322
+ # Handle single author with "Lastname, Firstname" format (exactly 2 parts)
323
+ if len(parts) == 2:
324
+ lastname, firstname = parts
325
+ # Pattern for surnames: capitalized word(s), possibly hyphenated or compound
326
+ # But exclude common patterns that suggest multiple authors like "Other Author"
327
+ surname_pattern = r'^[A-Z][a-zA-Z\-\']+$' # Single surname word (no spaces to avoid "Other Author")
328
+ # Pattern for first names or initials: either full names or initials with periods
329
+ # Accept both full names like "David R" and initials like "A. C"
330
+ firstname_pattern = r'^[A-Z]([a-zA-Z\s\-\'.]*|\.(\s+[A-Z]\.?)*\s*)$' # Full names or initials
331
+
332
+ # Additional check: if the "firstname" part looks like "Other Author" or similar,
333
+ # it's likely multiple authors, not a single "Lastname, Firstname" pattern
334
+ # We need to distinguish between:
335
+ # - "David R" (first name + middle initial - single author)
336
+ # - "Other Author" (two separate names - multiple authors)
337
+ if ' ' in firstname:
338
+ firstname_parts = firstname.split()
339
+ if len(firstname_parts) == 2:
340
+ first_part, second_part = firstname_parts
341
+ # Pattern 1: "David R" - first name + single letter (middle initial)
342
+ is_name_plus_initial = (
343
+ len(first_part) >= 2 and first_part[0].isupper() and first_part[1:].islower() and
344
+ len(second_part) <= 2 and second_part.replace('.', '').isalpha() # Initial like "R" or "R."
345
+ )
346
+ # Pattern 2: "Other Author" - two full capitalized words suggesting separate authors
347
+ looks_like_separate_authors = (
348
+ len(first_part) >= 3 and first_part[0].isupper() and first_part[1:].islower() and
349
+ len(second_part) >= 3 and second_part[0].isupper() and second_part[1:].islower()
350
+ )
351
+ looks_like_multiple_authors = looks_like_separate_authors and not is_name_plus_initial
352
+ else:
353
+ # More than 2 parts with spaces likely indicates multiple authors
354
+ looks_like_multiple_authors = len(firstname_parts) > 2
355
+ else:
356
+ looks_like_multiple_authors = False
357
+
358
+ # Check if this looks like a single author in "Lastname, Firstname" format
359
+ if (re.match(surname_pattern, lastname) and
360
+ re.match(firstname_pattern, firstname) and
361
+ len(lastname) >= 2 and len(firstname) >= 1 and
362
+ not looks_like_multiple_authors):
363
+ # This is a single author, return as "Lastname, Firstname"
364
+ return [f"{lastname}, {firstname}"]
365
+
317
366
  # Check if this is BibTeX comma-separated format: "Surname, Given, Surname, Given"
318
367
  # Enhanced heuristic: even number of parts >= 6, alternating proper surname/given pattern
319
368
  # Distinguish between initials (should remain as "Surname, Initial") and full names
@@ -640,7 +689,7 @@ def extract_arxiv_id_from_url(url):
640
689
  Returns:
641
690
  ArXiv ID or None if not found
642
691
  """
643
- from utils.url_utils import extract_arxiv_id_from_url as common_extract
692
+ from refchecker.utils.url_utils import extract_arxiv_id_from_url as common_extract
644
693
  return common_extract(url)
645
694
 
646
695
  def extract_year_from_text(text):
@@ -2092,7 +2141,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2092
2141
  # and not penalize for the authoritative source having more authors
2093
2142
  if has_et_al:
2094
2143
  # Import here to avoid circular imports
2095
- from utils.error_utils import format_author_mismatch
2144
+ from refchecker.utils.error_utils import format_author_mismatch
2096
2145
  # For et al cases, check if each cited author matches ANY author in the correct list
2097
2146
  # rather than comparing positionally, since author order can vary
2098
2147
  for i, cited_author in enumerate(cleaned_cited):
@@ -2126,21 +2175,21 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2126
2175
 
2127
2176
  # Check if cited authors look like parsing fragments
2128
2177
  if looks_like_fragments(cleaned_cited):
2129
- from utils.error_utils import format_author_count_mismatch
2178
+ from refchecker.utils.error_utils import format_author_count_mismatch
2130
2179
  display_cited = [format_author_for_display(author) for author in cleaned_cited]
2131
2180
  error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2132
2181
  return False, error_msg
2133
2182
 
2134
2183
  # For all count mismatches, show the count mismatch error
2135
2184
  if len(cleaned_cited) < len(correct_names):
2136
- from utils.error_utils import format_author_count_mismatch
2185
+ from refchecker.utils.error_utils import format_author_count_mismatch
2137
2186
  display_cited = [format_author_for_display(author) for author in cleaned_cited]
2138
2187
  error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2139
2188
  return False, error_msg
2140
2189
 
2141
2190
  # For cases where cited > correct, also show count mismatch
2142
2191
  elif len(cleaned_cited) > len(correct_names):
2143
- from utils.error_utils import format_author_count_mismatch
2192
+ from refchecker.utils.error_utils import format_author_count_mismatch
2144
2193
  display_cited = [format_author_for_display(author) for author in cleaned_cited]
2145
2194
  error_msg = format_author_count_mismatch(len(cleaned_cited), len(correct_names), display_cited, correct_names)
2146
2195
  return False, error_msg
@@ -2149,7 +2198,7 @@ def compare_authors(cited_authors: list, correct_authors: list, normalize_func=N
2149
2198
  comparison_correct = correct_names
2150
2199
 
2151
2200
  # Use shared three-line formatter (imported lazily to avoid circular imports)
2152
- from utils.error_utils import format_first_author_mismatch, format_author_mismatch
2201
+ from refchecker.utils.error_utils import format_first_author_mismatch, format_author_mismatch
2153
2202
 
2154
2203
  # Compare first author (most important) using the enhanced name matching
2155
2204
  if comparison_cited and comparison_correct:
@@ -2757,7 +2806,7 @@ def filter_bibtex_by_cited_keys(bib_content, cited_keys):
2757
2806
  return bib_content
2758
2807
 
2759
2808
  # Parse entries and filter
2760
- from utils.bibtex_parser import parse_bibtex_entries
2809
+ from refchecker.utils.bibtex_parser import parse_bibtex_entries
2761
2810
  entries = parse_bibtex_entries(bib_content)
2762
2811
  filtered_entries = []
2763
2812
 
@@ -3069,7 +3118,7 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
3069
3118
 
3070
3119
  if format_info['format_type'] == 'bibtex':
3071
3120
  # Use the dedicated BibTeX parser for consistent results
3072
- from utils.bibtex_parser import parse_bibtex_references
3121
+ from refchecker.utils.bibtex_parser import parse_bibtex_references
3073
3122
  return parse_bibtex_references(text)
3074
3123
 
3075
3124
  elif format_info['format_type'] == 'thebibliography':
@@ -3273,7 +3322,7 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
3273
3322
  # Extract URL if present
3274
3323
  url_match = re.search(r'\\url\{([^}]+)\}', content)
3275
3324
  if url_match:
3276
- from utils.url_utils import clean_url_punctuation
3325
+ from refchecker.utils.url_utils import clean_url_punctuation
3277
3326
  ref['url'] = clean_url_punctuation(url_match.group(1))
3278
3327
 
3279
3328
  # Extract title from \showarticletitle{} or \bibinfo{title}{}
@@ -3335,7 +3384,7 @@ def extract_latex_references(text, file_path=None): # pylint: disable=unused-ar
3335
3384
  if not ref['url']:
3336
3385
  url_match = re.search(r'\\url\{([^}]+)\}', content)
3337
3386
  if url_match:
3338
- from utils.url_utils import clean_url_punctuation
3387
+ from refchecker.utils.url_utils import clean_url_punctuation
3339
3388
  ref['url'] = clean_url_punctuation(url_match.group(1))
3340
3389
 
3341
3390
  # Extract DOI from \href{https://doi.org/...}
__version__.py DELETED
@@ -1,3 +0,0 @@
1
- """Version information for RefChecker."""
2
-
3
- __version__ = "1.2.53"
academic_refchecker-1.2.53.dist-info/RECORD DELETED
@@ -1,47 +0,0 @@
1
- __version__.py,sha256=iH7i3qnj4nR1gSXECRVUGvJH5oBPWtb7Lb8H9ODFTVc,65
2
- academic_refchecker-1.2.53.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
3
- checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
4
- checkers/crossref.py,sha256=cLYmSzE8ehJ5sNko_R3fEiGBGiPH5_HxLhFM-pCfDRM,20378
5
- checkers/enhanced_hybrid_checker.py,sha256=rbXkzpNkd0bn4e2OooX-CcdGTwwYpgmVaFvX_xCAFsA,27777
6
- checkers/github_checker.py,sha256=BXJaBC3AloKze04j8EcQz0a79EhtVoi9_871ilV7t60,14233
7
- checkers/local_semantic_scholar.py,sha256=D8py8-yMCgN1lvhXCiMUOEA4wBkH7AQvrkM4-3LCDsU,21015
8
- checkers/openalex.py,sha256=Fbc7iscZzmXjAZxH32PDX2r2Nwo9b5Ku-Sh1Ut9KpLA,19550
9
- checkers/openreview_checker.py,sha256=3ckn6U7TN5nQBjqPacr8W8mm2uMo6aWWB6gsxTDNCPk,40452
10
- checkers/pdf_paper_checker.py,sha256=L5HRHd3xpo0xDltZGTAA-Wk_arIS9bQV8ITeuxW0bNc,19893
11
- checkers/semantic_scholar.py,sha256=wk6e8DkYJM_O2nWsi-6EfJT53PzfL8KCmX1rS562KKc,34962
12
- checkers/webpage_checker.py,sha256=REOotx7Qka86_xbOIMeYj5YVb9D1RVMb4Ye311-28cA,43620
13
- config/__init__.py,sha256=r7sONsX2-ITviUJRU1KEz76uAuTRqZlzU-TVkvFRGYY,15
14
- config/logging.conf,sha256=r1tP0ApLHtlz7rV-oKS1MVO7oXJOgahbZFTtYmKnf9U,687
15
- config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
16
- core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
17
- core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
18
- core/parallel_processor.py,sha256=cq_WfzXrF2EI6IKOtJd6_QcwvM1xT3J6a13teg-wSbM,17638
19
- core/refchecker.py,sha256=-QIT5eUQaPCuQy7S80sXCvtrmcjdH5lf5wdZvsPQO9w,286416
20
- database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
21
- database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
22
- llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- llm/base.py,sha256=uMF-KOqZ9ZQ7rccOQLpKJiW9sEMMxr7ePXBSF0yYDJY,16782
24
- llm/providers.py,sha256=A0usJpprCO5D-VX0hqaQzBfi4DG3rdjA39vu02XJsGw,40092
25
- scripts/__init__.py,sha256=xJwo6afG8s7S888BK2Bxw2d7FX8aLkbl0l_ZoJOFibE,37
26
- scripts/start_vllm_server.py,sha256=ZepWp2y2cKFW0Kgsoima2RbmF02fTU29UFcLLpsBhFU,4213
27
- services/__init__.py,sha256=jGi9S74Msak3YR-C4Qb68VU7HB4oLaX9o1rlVAFpOFI,187
28
- services/pdf_processor.py,sha256=vu_JnhFGZY6jFVbDbPvG-mlQojvB-3Dzc8_946KVV2E,9427
29
- utils/__init__.py,sha256=1RrGoIIn1_gVzxd56b6a7HeAS-wu7uDP-nxLbR3fJ-8,1199
30
- utils/arxiv_utils.py,sha256=EzH1PhEAW0df5mmSP-kKHmuwqd4u2CSotRNwQ5IMJx8,19766
31
- utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
32
- utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
33
- utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
34
- utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
35
- utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
36
- utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
37
- utils/doi_utils.py,sha256=ezUiRnYRpoO0U_Rqgxv1FxqmeTwPh6X8gLgSDbqg5sY,4874
38
- utils/error_utils.py,sha256=UJOH7Bp-rPV2JDY_XN38I2pSkqqPdnQoviKa4s4nK_A,12501
39
- utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
40
- utils/text_utils.py,sha256=T3PiiG9-BMPTbdCftG2zypyIeZJl6snuMCKQ0nEOQv0,217834
41
- utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
42
- utils/url_utils.py,sha256=HdxIO8QvciP6Jp8Wd4sTSrS8JQrOMwgM7pxdUC8RJb4,9176
43
- academic_refchecker-1.2.53.dist-info/METADATA,sha256=6j1G-R74oa1900hERaRnJFkV5u4zTuVyLC6YamhXxq4,23256
44
- academic_refchecker-1.2.53.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
45
- academic_refchecker-1.2.53.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
46
- academic_refchecker-1.2.53.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
47
- academic_refchecker-1.2.53.dist-info/RECORD,,
academic_refchecker-1.2.53.dist-info/entry_points.txt DELETED
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- academic-refchecker = core.refchecker:main
academic_refchecker-1.2.53.dist-info/top_level.txt DELETED
@@ -1,9 +0,0 @@
1
- __version__
2
- checkers
3
- config
4
- core
5
- database
6
- llm
7
- scripts
8
- services
9
- utils
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes