academic-refchecker 1.2.42-py3-none-any.whl → 1.2.43-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information for RefChecker."""
 
- __version__ = "1.2.42"
+ __version__ = "1.2.43"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: academic-refchecker
- Version: 1.2.42
+ Version: 1.2.43
  Summary: A comprehensive tool for validating reference accuracy in academic papers
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
  License-Expression: MIT
@@ -1,5 +1,5 @@
- __version__.py,sha256=jrP5O1rb9OpfyEnz9IJjKo7ZhdOr-9_yzLGwvjDTLWA,65
- academic_refchecker-1.2.42.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
+ __version__.py,sha256=JbybFux4Juuafz1jN0cgsedPmzBO8U9DJ874tJu2saA,65
+ academic_refchecker-1.2.43.dist-info/licenses/LICENSE,sha256=Kwrx3fePVCeEFDCZvCW4OuoTNBiSoYbpGBI6qzGhWF0,1067
  checkers/__init__.py,sha256=T0PAHTFt6UiGvn-WGoJU8CdhXNmf6zaHmcGVoWHhmJQ,533
  checkers/crossref.py,sha256=Hzq4dlf1CSn0aZWU8CMOnLxIvaSivTabLoepIOkgkmY,20585
  checkers/enhanced_hybrid_checker.py,sha256=6yf5tV4jLSVzjX1xR_kQq0NOgQIst-z_WmkiqqMc8hQ,23469
@@ -15,7 +15,7 @@ config/settings.py,sha256=-vODFoXbWbGPUElpmchE5zbCj_n4Vtxr8HU1hQDFp_c,6164
  core/__init__.py,sha256=1T2MSQyDk0u_PupbHvm4CvNNN--dxsw78fqKUrqoYrM,157
  core/db_connection_pool.py,sha256=XRiOdehikkSz3obH4WKgf8woa3694if50Q15rBT-4XQ,4697
  core/parallel_processor.py,sha256=5V2iJDBFwwryMCnCNU_oRt2u5he1wpy-_9qapC_6f00,17043
- core/refchecker.py,sha256=ElXgD1iPI-rDDFZmCPMZpkIP4UeX3nPAJVCfsVPNgcw,274640
+ core/refchecker.py,sha256=sVRg3PUzrs2vLFlEBoi4bxUy-TpO5iQHCkokGas-ygQ,273616
  database/__init__.py,sha256=mEuVHlEBuS44t_2ZT_JnvQQrlRCjo1SJq1NmaJ6r8OY,125
  database/download_semantic_scholar_db.py,sha256=waN4I97KC_36YMiPbiBDUUmgfzu1nub5yeKdAsIR2aw,75276
  llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -30,17 +30,17 @@ utils/arxiv_utils.py,sha256=Y8sDJgDwHxp1L33BkQoDumIl0Pkp-BuYZb1PwWYsmak,18251
  utils/author_utils.py,sha256=DLTo1xsxef2wxoe4s_MWrh36maj4fgnvFlsDLpDE-qQ,5507
  utils/biblatex_parser.py,sha256=OkHXQcjiBrEDuhBfEk0RtmAYxufu5lAxAjb8__DzMjI,25537
  utils/bibliography_utils.py,sha256=mpmdAklzAs1CT3gqrOcjujGhouL95OuliCx0LE9Pg90,11705
- utils/bibtex_parser.py,sha256=jsQ87lkzmBmJO3VEN3itw22CJ1Hesei4IvM2sfsaFKI,12867
+ utils/bibtex_parser.py,sha256=a89NLy_q2kwED4QFJgxWFgPQOJBV73bIUL3RS_Urmro,15231
  utils/config_validator.py,sha256=rxf7K3DYmJ-BNPsmtaCNipY2BTVT-pJZ7wN-M9Y3GC8,11167
  utils/db_utils.py,sha256=_wSupfBlm0ILFvntQTvoj7tLDCbrYPRQrp9NDvphF_E,6281
  utils/doi_utils.py,sha256=8f3iE4RdSNkzeqa9wJfoKcVEiBVse3_uf643biLudmw,4134
  utils/error_utils.py,sha256=JqnRg4z-O9GcJ1eJGeTMzmOQwPWbWo2Lf6Duwj-ymHQ,6258
  utils/mock_objects.py,sha256=QxU-UXyHSY27IZYN8Sb8ei0JtNkpGSdMXoErrRLHXvE,6437
- utils/text_utils.py,sha256=F5o-37KUkkr-ie4sg6ld5om3-uDpAxPUSjDFxY0fsL4,203063
+ utils/text_utils.py,sha256=jPgCOBTVboLRJyypoOtL-dg1wBDQrKBux2ImvC6wL58,206296
  utils/unicode_utils.py,sha256=-WBKarXO756p7fd7gCeNsMag4ztDNURwFX5IVniOtwY,10366
- utils/url_utils.py,sha256=n0m5rXKV0-UrE8lI85VEx23KmfGwky57sI6gFPuu78I,7358
- academic_refchecker-1.2.42.dist-info/METADATA,sha256=k7fzk4fhb-kz-CdJE-gaeU2I5xM16D1rNNeEuer_9Hk,22298
- academic_refchecker-1.2.42.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- academic_refchecker-1.2.42.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
- academic_refchecker-1.2.42.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
- academic_refchecker-1.2.42.dist-info/RECORD,,
+ utils/url_utils.py,sha256=aq1hSYEA888bOKuBOGWRclgTFIjw32rpFdsBO_Ja8ZM,8402
+ academic_refchecker-1.2.43.dist-info/METADATA,sha256=ZsJhIw1n7Yjoug6mpV4zpAPf-eSW5xSMdd3Dl_WTOlI,22298
+ academic_refchecker-1.2.43.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ academic_refchecker-1.2.43.dist-info/entry_points.txt,sha256=WdI89tYkIfz-M628PiboOfOLzTBWZAqvlF29qCVCkek,61
+ academic_refchecker-1.2.43.dist-info/top_level.txt,sha256=6RlcQEA0kHb7-ndbKMFMZnYnJQVohgsU6BBkbEvJvEs,69
+ academic_refchecker-1.2.43.dist-info/RECORD,,
core/refchecker.py CHANGED
@@ -5407,25 +5407,6 @@ class ArxivReferenceChecker:
  if error_details:
  subreason = self._categorize_unverified_reason(error_details)
  print(f" Subreason: {subreason}")
-
- year_str = self._format_year_string(reference.get('year'))
-
- # Apply LaTeX cleaning and formatting to authors for display
- authors = reference.get('authors', [])
- if authors:
- from utils.text_utils import strip_latex_commands, format_authors_for_display
- cleaned_authors = [strip_latex_commands(author) for author in authors]
- authors_display = format_authors_for_display(cleaned_authors)
- else:
- authors_display = 'Unknown authors'
-
- # Only show URL if it exists and is different from reference_url
- ref_url = reference.get('url', '').strip()
- if ref_url and ref_url != reference_url:
- # Clean trailing punctuation from URL display
- from utils.url_utils import clean_url_punctuation
- clean_ref_url = clean_url_punctuation(ref_url)
- print(f" URL: {clean_ref_url}")
 
  def _categorize_unverified_reason(self, error_details):
  """Categorize the unverified error into checker error or not found"""
utils/bibtex_parser.py CHANGED
@@ -103,37 +103,99 @@ def parse_bibtex_entry_content(entry_type: str, entry_key: str, content: str) ->
  Returns:
  Dictionary with parsed entry data
  """
- # Extract fields using regex
  fields = {}
 
- # Pattern to match field = {value} or field = "value"
- # Handle nested braces properly
- field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
-
- for match in re.finditer(field_pattern, content, re.DOTALL):
- field_name = match.group(1).lower()
- field_value = match.group(2) or match.group(3) or ""
- # Strip outer quotes if present (handles cases like title = {"Some Title"})
- field_value = field_value.strip()
- if field_value.startswith('"') and field_value.endswith('"'):
- field_value = field_value[1:-1]
- fields[field_name] = field_value
+ # Use a more robust approach with manual parsing
+ i = 0
+ while i < len(content):
+ # Skip whitespace
+ while i < len(content) and content[i].isspace():
+ i += 1
+
+ if i >= len(content):
+ break
+
+ # Look for field name
+ field_start = i
+ while i < len(content) and (content[i].isalnum() or content[i] == '_'):
+ i += 1
+
+ if i == field_start:
+ i += 1 # Skip non-alphanumeric character
+ continue
+
+ field_name = content[field_start:i].lower()
+
+ # Skip whitespace
+ while i < len(content) and content[i].isspace():
+ i += 1
+
+ # Look for equals sign
+ if i >= len(content) or content[i] != '=':
+ continue
+ i += 1 # Skip '='
+
+ # Skip whitespace
+ while i < len(content) and content[i].isspace():
+ i += 1
+
+ if i >= len(content):
+ break
+
+ # Parse field value
+ field_value = ""
+ if content[i] == '"':
+ # Handle quoted strings
+ i += 1 # Skip opening quote
+ value_start = i
+ while i < len(content) and content[i] != '"':
+ i += 1
+ if i < len(content):
+ field_value = content[value_start:i]
+ i += 1 # Skip closing quote
+ elif content[i] == '{':
+ # Handle braced strings with proper nesting
+ brace_count = 0
+ value_start = i + 1 # Skip opening brace
+ i += 1
+ while i < len(content):
+ if content[i] == '{':
+ brace_count += 1
+ elif content[i] == '}':
+ if brace_count == 0:
+ break
+ brace_count -= 1
+ i += 1
+
+ if i < len(content):
+ field_value = content[value_start:i]
+ i += 1 # Skip closing brace
+
+ if field_value:
+ field_value = field_value.strip()
+ # Strip outer quotes if present (handles cases like title = {"Some Title"})
+ if field_value.startswith('"') and field_value.endswith('"'):
+ field_value = field_value[1:-1]
+ fields[field_name] = field_value
+
+ # Skip to next field (look for comma)
+ while i < len(content) and content[i] not in ',}':
+ i += 1
+ if i < len(content) and content[i] == ',':
+ i += 1
 
- # If field extraction failed, try a simpler approach
+ # Fallback to regex if manual parsing failed
  if not fields:
- logger.debug("Field extraction failed, trying line-by-line approach")
- lines = content.split('\n')
- for line in lines:
- line = line.strip()
- if '=' in line:
- field_match = re.match(r'(\w+)\s*=\s*[{"]([^{}"]*)[}"]', line)
- if field_match:
- field_name = field_match.group(1).lower()
- field_value = field_match.group(2).strip()
- # Strip outer quotes if present
- if field_value.startswith('"') and field_value.endswith('"'):
- field_value = field_value[1:-1]
- fields[field_name] = field_value
+ logger.debug("Manual parsing failed, trying regex approach")
+ field_pattern = r'(\w+)\s*=\s*(?:\{([^{}]*(?:\{[^{}]*\}[^{}]*)*)\}|"([^"]*)")'
+
+ for match in re.finditer(field_pattern, content, re.DOTALL):
+ field_name = match.group(1).lower()
+ field_value = match.group(2) or match.group(3) or ""
+ field_value = field_value.strip()
+ if field_value.startswith('"') and field_value.endswith('"'):
+ field_value = field_value[1:-1]
+ fields[field_name] = field_value
 
  return {
  'type': entry_type,
@@ -216,6 +278,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
 
  # Extract journal/venue
  journal = fields.get('journal', fields.get('booktitle', fields.get('venue', '')))
+ # Remove braces from journal/venue names
+ if journal and journal.startswith('{') and journal.endswith('}'):
+ journal = journal[1:-1]
 
  # Extract DOI and construct URL
  doi = fields.get('doi', '')
@@ -225,6 +290,9 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
 
  # Extract other URLs
  url = fields.get('url', '')
+ if url:
+ from utils.url_utils import clean_url
+ url = clean_url(url)
 
  # Handle special @misc entries with only howpublished field
  if not title and not authors and entry_type == 'misc':
@@ -249,6 +317,10 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
  else:
  url = howpublished
 
+ # Clean the reconstructed URL
+ from utils.url_utils import clean_url
+ url = clean_url(url)
+
  # Generate title from domain/path
  if 'jailbreakchat.com' in domain:
  title = 'JailbreakChat Website'
@@ -275,6 +347,11 @@ def parse_bibtex_references(bibliography_text: str) -> List[Dict[str, Any]]:
 
  if url.startswith('\\url{') and url.endswith('}'):
  url = url[5:-1] # Remove \url{...}
+
+ # Clean any URL we extracted
+ if url:
+ from utils.url_utils import clean_url
+ url = clean_url(url)
 
  # Construct ArXiv URL from eprint field if no URL present
  if not url and not doi_url:
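For readers skimming the diff, the gist of the new brace-aware field scanner added above can be summarized with a small standalone sketch. The function below is illustrative only (its name and simplified control flow are not the package's code); it shows the same idea the hunk implements: walk the entry character by character, read name = {value} or name = "value" pairs, and track brace depth so nested braces stay inside the value.

import re

def parse_fields_sketch(content: str) -> dict:
    """Illustrative brace-aware scan of 'name = {value}' / 'name = "value"' pairs."""
    fields = {}
    i = 0
    while i < len(content):
        # Locate the next 'name =' pair
        m = re.match(r'\s*([A-Za-z_]\w*)\s*=\s*', content[i:])
        if not m:
            break
        name = m.group(1).lower()
        i += m.end()
        if i >= len(content):
            break
        if content[i] == '"':                    # quoted value
            end = content.find('"', i + 1)
            if end == -1:
                break
            value, i = content[i + 1:end], end + 1
        elif content[i] == '{':                  # braced value: track nesting depth
            depth, j = 1, i + 1
            while j < len(content) and depth:
                depth += {'{': 1, '}': -1}.get(content[j], 0)
                j += 1
            value, i = content[i + 1:j - 1], j
        else:
            break
        fields[name] = value.strip()
        comma = content.find(',', i)             # advance to the next field
        if comma == -1:
            break
        i = comma + 1
    return fields

print(parse_fields_sketch('title = {A {Nested} Title}, year = {2020}, journal = "Example Journal"'))
# {'title': 'A {Nested} Title', 'year': '2020', 'journal': 'Example Journal'}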
utils/text_utils.py CHANGED
@@ -11,6 +11,69 @@ from typing import List
  logger = logging.getLogger(__name__)
 
 
+ def expand_abbreviations(text: str) -> str:
+ """
+ Generic abbreviation expansion using common academic patterns.
+
+ This function expands common academic abbreviations to their full forms
+ to improve venue name matching and comparison.
+
+ Args:
+ text: Text containing potential abbreviations
+
+ Returns:
+ Text with abbreviations expanded
+ """
+ if not text:
+ return text
+
+ common_abbrevs = {
+ # IEEE specific abbreviations (only expand with periods, not full words)
+ 'robot.': 'robotics', 'autom.': 'automation', 'lett.': 'letters',
+ 'trans.': 'transactions', 'syst.': 'systems', 'netw.': 'networks',
+ 'learn.': 'learning', 'ind.': 'industrial', 'electron.': 'electronics',
+ 'mechatron.': 'mechatronics', 'intell.': 'intelligence',
+ 'transp.': 'transportation', 'contr.': 'control', 'mag.': 'magazine',
+ # General academic abbreviations (only expand with periods)
+ 'int.': 'international', 'intl.': 'international', 'conf.': 'conference',
+ 'j.': 'journal', 'proc.': 'proceedings', 'assoc.': 'association',
+ 'comput.': 'computing', 'sci.': 'science', 'eng.': 'engineering',
+ 'tech.': 'technology', 'artif.': 'artificial', 'mach.': 'machine',
+ 'stat.': 'statistics', 'math.': 'mathematics', 'phys.': 'physics',
+ 'chem.': 'chemistry', 'bio.': 'biology', 'med.': 'medicine',
+ 'adv.': 'advances', 'ann.': 'annual', 'symp.': 'symposium',
+ 'workshop': 'workshop', 'worksh.': 'workshop',
+ 'natl.': 'national', 'acad.': 'academy', 'rev.': 'review',
+ # Physics journal abbreviations
+ 'phys.': 'physics', 'phys. rev.': 'physical review',
+ 'phys. rev. lett.': 'physical review letters',
+ 'phys. rev. a': 'physical review a', 'phys. rev. b': 'physical review b',
+ 'phys. rev. c': 'physical review c', 'phys. rev. d': 'physical review d',
+ 'phys. rev. e': 'physical review e', 'phys. lett.': 'physics letters',
+ 'phys. lett. b': 'physics letters b', 'nucl. phys.': 'nuclear physics',
+ 'nucl. phys. a': 'nuclear physics a', 'nucl. phys. b': 'nuclear physics b',
+ 'j. phys.': 'journal of physics', 'ann. phys.': 'annals of physics',
+ 'mod. phys. lett.': 'modern physics letters', 'eur. phys. j.': 'european physical journal',
+ # Nature journals
+ 'nature phys.': 'nature physics', 'sci. adv.': 'science advances',
+ # Handle specific multi-word patterns and well-known acronyms
+ 'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
+ 'pnas': 'proceedings of the national academy of sciences',
+ 'neurips': 'neural information processing systems',
+ }
+
+ # Sort by length (longest first) to ensure longer matches take precedence
+ for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
+ # For abbreviations ending in period, use word boundary at start only
+ if abbrev.endswith('.'):
+ pattern = r'\b' + re.escape(abbrev)
+ else:
+ pattern = r'\b' + re.escape(abbrev) + r'\b'
+ text = re.sub(pattern, expansion, text)
+
+ return text
+
+
  def normalize_apostrophes(text):
  """
  Normalize all apostrophe variants to standard ASCII apostrophe
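Based purely on the mapping and regex logic in the hunk above, the new expand_abbreviations helper can be exercised as below. The abbreviation keys are lowercase and re.sub is used without IGNORECASE, so these examples assume the venue string has already been lowercased; the import path mirrors how the package imports its other text_utils helpers.

from utils.text_utils import expand_abbreviations

# Keys in the abbreviation map are lowercase and re.sub is case-sensitive,
# so venue strings are assumed to be lowercased before the call.
print(expand_abbreviations("ieee trans. robot. autom. lett."))
# -> "ieee transactions robotics automation letters"

print(expand_abbreviations("proc. natl. acad. sci."))
# -> "proceedings of the national academy of sciences"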
utils/url_utils.py CHANGED
@@ -209,7 +209,13 @@ def validate_url_format(url: str) -> bool:
 
  def clean_url(url: str) -> str:
  """
- Clean a URL by removing common issues like extra spaces, fragments, etc.
+ Clean a URL by removing common issues like extra spaces, fragments, malformed LaTeX, etc.
+
+ This function handles:
+ - Whitespace trimming
+ - Malformed LaTeX URL wrappers like \\url{https://...}
+ - Trailing punctuation from academic references
+ - DOI URL query parameter cleanup
 
  Args:
  url: URL to clean
@@ -223,6 +229,18 @@ def clean_url(url: str) -> str:
  # Remove leading/trailing whitespace
  url = url.strip()
 
+ # Handle malformed URLs that contain \url{} wrappers within the URL text
+ # e.g., "https://\url{https://www.example.com/}" -> "https://www.example.com/"
+ import re
+ url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
+ url_match = re.search(url_pattern, url)
+ if url_match:
+ url = url_match.group(1)
+
+ # Remove trailing punctuation that's commonly part of sentence structure
+ # but preserve legitimate URL characters
+ url = url.rstrip('.,;!?)')
+
  # Note: Preserving query parameters for all URLs now
  # Previously this function removed query parameters for non-DOI URLs,
  # but this was causing issues with OpenReview and other URLs that need their parameters
@@ -254,6 +272,14 @@ def clean_url_punctuation(url: str) -> str:
  # Remove leading/trailing whitespace
  url = url.strip()
 
+ # Handle malformed URLs that contain \\url{} wrappers within the URL text
+ # e.g., "https://\\url{https://www.example.com/}" -> "https://www.example.com/"
+ import re
+ url_pattern = r'https?://\\url\{(https?://[^}]+)\}'
+ url_match = re.search(url_pattern, url)
+ if url_match:
+ url = url_match.group(1)
+
  # Remove trailing punctuation that's commonly part of sentence structure
  # but preserve legitimate URL characters
  url = url.rstrip('.,;!?)')
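Taken together, the two url_utils.py hunks mean that clean_url (and clean_url_punctuation) now unwrap a stray \url{...} embedded inside an http(s) prefix and strip trailing sentence punctuation. A quick sketch of the expected behaviour, assuming no other normalization in the full function alters these particular inputs:

from utils.url_utils import clean_url

# A URL field that ended up doubly wrapped, e.g. "https://\url{https://...}"
print(clean_url(r"https://\url{https://www.example.com/}"))
# -> "https://www.example.com/"

# Trailing sentence punctuation is stripped; legitimate path characters are kept
print(clean_url("https://www.example.com/refs)."))
# -> "https://www.example.com/refs"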