academic-refchecker 1.2.49__tar.gz → 1.2.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. {academic_refchecker-1.2.49/src/academic_refchecker.egg-info → academic_refchecker-1.2.50}/PKG-INFO +1 -1
  2. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/__version__.py +1 -1
  3. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
  4. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/arxiv_utils.py +98 -54
  5. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/text_utils.py +3 -0
  6. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/LICENSE +0 -0
  7. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/MANIFEST.in +0 -0
  8. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/README.md +0 -0
  9. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/pyproject.toml +0 -0
  10. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/requirements.txt +0 -0
  11. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/scripts/download_db.py +0 -0
  12. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/scripts/run_tests.py +0 -0
  13. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/scripts/start_vllm_server.py +0 -0
  14. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/setup.cfg +0 -0
  15. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/__init__.py +0 -0
  16. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
  17. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
  18. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
  19. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/requires.txt +0 -0
  20. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/top_level.txt +0 -0
  21. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/__init__.py +0 -0
  22. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/crossref.py +0 -0
  23. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/enhanced_hybrid_checker.py +0 -0
  24. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/github_checker.py +0 -0
  25. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/local_semantic_scholar.py +0 -0
  26. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/openalex.py +0 -0
  27. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/openreview_checker.py +0 -0
  28. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/semantic_scholar.py +0 -0
  29. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/webpage_checker.py +0 -0
  30. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/config/__init__.py +0 -0
  31. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/config/logging.conf +0 -0
  32. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/config/settings.py +0 -0
  33. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/core/__init__.py +0 -0
  34. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/core/db_connection_pool.py +0 -0
  35. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/core/parallel_processor.py +0 -0
  36. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/core/refchecker.py +0 -0
  37. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/database/__init__.py +0 -0
  38. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/database/download_semantic_scholar_db.py +0 -0
  39. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/llm/__init__.py +0 -0
  40. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/llm/base.py +0 -0
  41. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/llm/providers.py +0 -0
  42. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/scripts/__init__.py +0 -0
  43. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/scripts/start_vllm_server.py +0 -0
  44. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/services/__init__.py +0 -0
  45. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/services/pdf_processor.py +0 -0
  46. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/__init__.py +0 -0
  47. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/author_utils.py +0 -0
  48. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/biblatex_parser.py +0 -0
  49. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/bibliography_utils.py +0 -0
  50. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/bibtex_parser.py +0 -0
  51. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/config_validator.py +0 -0
  52. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/db_utils.py +0 -0
  53. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/doi_utils.py +0 -0
  54. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/error_utils.py +0 -0
  55. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/mock_objects.py +0 -0
  56. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/unicode_utils.py +0 -0
  57. {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/url_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.49
3
+ Version: 1.2.50
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -1,3 +1,3 @@
1
1
  """Version information for RefChecker."""
2
2
 
3
- __version__ = "1.2.49"
3
+ __version__ = "1.2.50"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: academic-refchecker
3
- Version: 1.2.49
3
+ Version: 1.2.50
4
4
  Summary: A comprehensive tool for validating reference accuracy in academic papers
5
5
  Author-email: Mark Russinovich <markrussinovich@hotmail.com>
6
6
  License-Expression: MIT
@@ -111,56 +111,8 @@ def download_arxiv_source(arxiv_id):
111
111
  main_tex_content = largest_file[1]
112
112
  logger.debug(f"Using largest tex file: {largest_file[0]}")
113
113
 
114
- # Find which .bib files are actually referenced in the main tex file
115
- bib_content = None
116
- if bib_files and main_tex_content:
117
- # Extract bibliography references from main tex file
118
- referenced_bibs = []
119
- bib_pattern = r'\\bibliography\{([^}]+)\}'
120
- matches = re.findall(bib_pattern, main_tex_content)
121
-
122
- for match in matches:
123
- # Handle multiple bib files separated by commas
124
- bib_names = [name.strip() for name in match.split(',')]
125
- for bib_name in bib_names:
126
- # Add .bib extension if not present
127
- if not bib_name.endswith('.bib'):
128
- bib_name += '.bib'
129
- referenced_bibs.append(bib_name)
130
-
131
- # Use only referenced .bib files, or all if no references found
132
- if referenced_bibs:
133
- used_bibs = []
134
- for bib_name in referenced_bibs:
135
- if bib_name in bib_files:
136
- used_bibs.append(bib_files[bib_name])
137
- logger.debug(f"Using referenced .bib file: {bib_name}")
138
- else:
139
- logger.debug(f"Referenced .bib file not found: {bib_name}")
140
-
141
- if used_bibs:
142
- raw_bib_content = '\n\n'.join(used_bibs)
143
-
144
- # Filter BibTeX to only include cited references
145
- bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
146
-
147
- logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
148
- else:
149
- # Fallback to all bib files if none of the referenced ones found
150
- raw_bib_content = '\n\n'.join(bib_files.values())
151
- bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
152
- logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
153
- else:
154
- # No \bibliography command found, use all bib files
155
- raw_bib_content = '\n\n'.join(bib_files.values())
156
- bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
157
- logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
158
- elif bib_files:
159
- # No main tex file but have bib files
160
- raw_bib_content = '\n\n'.join(bib_files.values())
161
- # Can't filter without tex files, so use original content
162
- bib_content = raw_bib_content
163
- logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
114
+ # Process .bib files using shared logic
115
+ bib_content = select_and_filter_bib_files(bib_files, main_tex_content, tex_files)
164
116
 
165
117
  # Combine all bbl file contents
166
118
  bbl_content = None
@@ -219,6 +171,78 @@ def download_arxiv_bibtex(arxiv_id):
219
171
  return None
220
172
 
221
173
 
174
+ def select_and_filter_bib_files(bib_files, main_tex_content, tex_files):
175
+ """
176
+ Select appropriate .bib files based on main TeX file references and filter by citations.
177
+
178
+ Args:
179
+ bib_files: Dict of .bib files {filename: content}
180
+ main_tex_content: Content of main tex file
181
+ tex_files: Dict of all tex files {filename: content} (for filtering)
182
+
183
+ Returns:
184
+ Filtered BibTeX content or None if no files available
185
+ """
186
+ import re
187
+
188
+ if not bib_files:
189
+ return None
190
+
191
+ if main_tex_content:
192
+ # Extract bibliography references from main tex file
193
+ referenced_bibs = []
194
+ bib_pattern = r'\\bibliography\{([^}]+)\}'
195
+ matches = re.findall(bib_pattern, main_tex_content)
196
+
197
+ for match in matches:
198
+ # Handle multiple bib files separated by commas
199
+ bib_names = [name.strip() for name in match.split(',')]
200
+ for bib_name in bib_names:
201
+ # Add .bib extension if not present
202
+ if not bib_name.endswith('.bib'):
203
+ bib_name += '.bib'
204
+ referenced_bibs.append(bib_name)
205
+
206
+ # Use only referenced .bib files, or all if no references found
207
+ if referenced_bibs:
208
+ used_bibs = []
209
+ seen_bib_names = set() # Track which bib files we've already added
210
+ for bib_name in referenced_bibs:
211
+ if bib_name in bib_files and bib_name not in seen_bib_names:
212
+ used_bibs.append(bib_files[bib_name])
213
+ seen_bib_names.add(bib_name)
214
+ logger.debug(f"Using referenced .bib file: {bib_name}")
215
+ elif bib_name in seen_bib_names:
216
+ logger.debug(f"Skipping duplicate .bib file: {bib_name}")
217
+ else:
218
+ logger.debug(f"Referenced .bib file not found: {bib_name}")
219
+
220
+ if used_bibs:
221
+ raw_bib_content = '\n\n'.join(used_bibs)
222
+ # Filter BibTeX to only include cited references
223
+ filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
224
+ logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
225
+ return filtered_content
226
+ else:
227
+ # Fallback to all bib files if none of the referenced ones found
228
+ raw_bib_content = '\n\n'.join(bib_files.values())
229
+ filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
230
+ logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
231
+ return filtered_content
232
+ else:
233
+ # No \bibliography command found, use all bib files
234
+ raw_bib_content = '\n\n'.join(bib_files.values())
235
+ filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
236
+ logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
237
+ return filtered_content
238
+ else:
239
+ # No main tex file but have bib files
240
+ raw_bib_content = '\n\n'.join(bib_files.values())
241
+ # Can't filter without tex files, so use original content
242
+ logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
243
+ return raw_bib_content
244
+
245
+
222
246
  def extract_cited_keys_from_tex(tex_files, main_tex_content):
223
247
  """
224
248
  Extract all citation keys from TeX files.
@@ -261,7 +285,11 @@ def is_reference_used(reference_key, cited_keys):
261
285
  Returns:
262
286
  True if the reference is cited, False otherwise
263
287
  """
264
- return reference_key in cited_keys
288
+ result = reference_key in cited_keys
289
+ # Add debugging for the first few mismatches to understand the issue
290
+ if not result and len([k for k in cited_keys if k.startswith('a')]) < 3: # Limit debug output
291
+ logger.debug(f"Key '{reference_key}' not found in cited_keys")
292
+ return result
265
293
 
266
294
 
267
295
  def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
@@ -291,14 +319,30 @@ def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
291
319
  from utils.bibtex_parser import parse_bibtex_entries
292
320
  entries = parse_bibtex_entries(bib_content)
293
321
 
294
- # Filter entries to only cited ones
322
+ # Filter entries to only cited ones and remove duplicates
295
323
  cited_entries = []
324
+ seen_keys = set()
325
+ not_cited_count = 0
326
+ duplicate_count = 0
327
+
296
328
  for entry in entries:
297
329
  entry_key = entry.get('key', '')
298
330
  if is_reference_used(entry_key, cited_keys):
299
- cited_entries.append(entry)
331
+ if entry_key not in seen_keys:
332
+ cited_entries.append(entry)
333
+ seen_keys.add(entry_key)
334
+ else:
335
+ duplicate_count += 1
336
+ logger.debug(f"Skipping duplicate entry: '{entry_key}'")
337
+ else:
338
+ not_cited_count += 1
339
+ # Log first few entries that are NOT cited for debugging
340
+ if not_cited_count <= 5:
341
+ logger.debug(f"Entry NOT cited: '{entry_key}'")
300
342
 
301
- logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited")
343
+ logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited (removed {duplicate_count} duplicates)")
344
+ logger.debug(f"Citation keys found: {len(cited_keys)} keys")
345
+ logger.debug(f"Sample cited keys: {list(cited_keys)[:10]}")
302
346
 
303
347
  # Reconstruct BibTeX content from cited entries
304
348
  if not cited_entries:
@@ -3902,6 +3902,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
3902
3902
  # Handle specific multi-word patterns and well-known acronyms
3903
3903
  'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
3904
3904
  'pnas': 'proceedings of the national academy of sciences',
3905
+ # Special cases that don't follow standard acronym patterns
3906
+ 'neurips': 'neural information processing systems', # Special case
3907
+ 'nips': 'neural information processing systems', # old name for neurips
3905
3908
  }
3906
3909
  # Sort by length (longest first) to ensure longer matches take precedence
3907
3910
  for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):