academic-refchecker 1.2.49__tar.gz → 1.2.50__tar.gz
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
- {academic_refchecker-1.2.49/src/academic_refchecker.egg-info → academic_refchecker-1.2.50}/PKG-INFO +1 -1
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/__version__.py +1 -1
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50/src/academic_refchecker.egg-info}/PKG-INFO +1 -1
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/arxiv_utils.py +98 -54
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/text_utils.py +3 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/LICENSE +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/MANIFEST.in +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/README.md +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/pyproject.toml +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/requirements.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/scripts/download_db.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/scripts/run_tests.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/setup.cfg +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/SOURCES.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/dependency_links.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/entry_points.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/requires.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/academic_refchecker.egg-info/top_level.txt +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/crossref.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/enhanced_hybrid_checker.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/github_checker.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/local_semantic_scholar.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/openalex.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/openreview_checker.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/semantic_scholar.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/webpage_checker.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/config/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/config/logging.conf +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/config/settings.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/core/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/core/db_connection_pool.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/core/parallel_processor.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/core/refchecker.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/database/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/database/download_semantic_scholar_db.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/llm/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/llm/base.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/llm/providers.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/scripts/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/scripts/start_vllm_server.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/services/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/services/pdf_processor.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/__init__.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/author_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/biblatex_parser.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/bibliography_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/bibtex_parser.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/config_validator.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/db_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/doi_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/error_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/mock_objects.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/unicode_utils.py +0 -0
- {academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/utils/url_utils.py +0 -0
|
@@ -111,56 +111,8 @@ def download_arxiv_source(arxiv_id):
|
|
|
111
111
|
main_tex_content = largest_file[1]
|
|
112
112
|
logger.debug(f"Using largest tex file: {largest_file[0]}")
|
|
113
113
|
|
|
114
|
-
#
|
|
115
|
-
bib_content =
|
|
116
|
-
if bib_files and main_tex_content:
|
|
117
|
-
# Extract bibliography references from main tex file
|
|
118
|
-
referenced_bibs = []
|
|
119
|
-
bib_pattern = r'\\bibliography\{([^}]+)\}'
|
|
120
|
-
matches = re.findall(bib_pattern, main_tex_content)
|
|
121
|
-
|
|
122
|
-
for match in matches:
|
|
123
|
-
# Handle multiple bib files separated by commas
|
|
124
|
-
bib_names = [name.strip() for name in match.split(',')]
|
|
125
|
-
for bib_name in bib_names:
|
|
126
|
-
# Add .bib extension if not present
|
|
127
|
-
if not bib_name.endswith('.bib'):
|
|
128
|
-
bib_name += '.bib'
|
|
129
|
-
referenced_bibs.append(bib_name)
|
|
130
|
-
|
|
131
|
-
# Use only referenced .bib files, or all if no references found
|
|
132
|
-
if referenced_bibs:
|
|
133
|
-
used_bibs = []
|
|
134
|
-
for bib_name in referenced_bibs:
|
|
135
|
-
if bib_name in bib_files:
|
|
136
|
-
used_bibs.append(bib_files[bib_name])
|
|
137
|
-
logger.debug(f"Using referenced .bib file: {bib_name}")
|
|
138
|
-
else:
|
|
139
|
-
logger.debug(f"Referenced .bib file not found: {bib_name}")
|
|
140
|
-
|
|
141
|
-
if used_bibs:
|
|
142
|
-
raw_bib_content = '\n\n'.join(used_bibs)
|
|
143
|
-
|
|
144
|
-
# Filter BibTeX to only include cited references
|
|
145
|
-
bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
146
|
-
|
|
147
|
-
logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
|
|
148
|
-
else:
|
|
149
|
-
# Fallback to all bib files if none of the referenced ones found
|
|
150
|
-
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
151
|
-
bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
152
|
-
logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
|
|
153
|
-
else:
|
|
154
|
-
# No \bibliography command found, use all bib files
|
|
155
|
-
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
156
|
-
bib_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
157
|
-
logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
|
|
158
|
-
elif bib_files:
|
|
159
|
-
# No main tex file but have bib files
|
|
160
|
-
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
161
|
-
# Can't filter without tex files, so use original content
|
|
162
|
-
bib_content = raw_bib_content
|
|
163
|
-
logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
|
|
114
|
+
# Process .bib files using shared logic
|
|
115
|
+
bib_content = select_and_filter_bib_files(bib_files, main_tex_content, tex_files)
|
|
164
116
|
|
|
165
117
|
# Combine all bbl file contents
|
|
166
118
|
bbl_content = None
|
|
@@ -219,6 +171,78 @@ def download_arxiv_bibtex(arxiv_id):
|
|
|
219
171
|
return None
|
|
220
172
|
|
|
221
173
|
|
|
174
|
+
def select_and_filter_bib_files(bib_files, main_tex_content, tex_files):
|
|
175
|
+
"""
|
|
176
|
+
Select appropriate .bib files based on main TeX file references and filter by citations.
|
|
177
|
+
|
|
178
|
+
Args:
|
|
179
|
+
bib_files: Dict of .bib files {filename: content}
|
|
180
|
+
main_tex_content: Content of main tex file
|
|
181
|
+
tex_files: Dict of all tex files {filename: content} (for filtering)
|
|
182
|
+
|
|
183
|
+
Returns:
|
|
184
|
+
Filtered BibTeX content or None if no files available
|
|
185
|
+
"""
|
|
186
|
+
import re
|
|
187
|
+
|
|
188
|
+
if not bib_files:
|
|
189
|
+
return None
|
|
190
|
+
|
|
191
|
+
if main_tex_content:
|
|
192
|
+
# Extract bibliography references from main tex file
|
|
193
|
+
referenced_bibs = []
|
|
194
|
+
bib_pattern = r'\\bibliography\{([^}]+)\}'
|
|
195
|
+
matches = re.findall(bib_pattern, main_tex_content)
|
|
196
|
+
|
|
197
|
+
for match in matches:
|
|
198
|
+
# Handle multiple bib files separated by commas
|
|
199
|
+
bib_names = [name.strip() for name in match.split(',')]
|
|
200
|
+
for bib_name in bib_names:
|
|
201
|
+
# Add .bib extension if not present
|
|
202
|
+
if not bib_name.endswith('.bib'):
|
|
203
|
+
bib_name += '.bib'
|
|
204
|
+
referenced_bibs.append(bib_name)
|
|
205
|
+
|
|
206
|
+
# Use only referenced .bib files, or all if no references found
|
|
207
|
+
if referenced_bibs:
|
|
208
|
+
used_bibs = []
|
|
209
|
+
seen_bib_names = set() # Track which bib files we've already added
|
|
210
|
+
for bib_name in referenced_bibs:
|
|
211
|
+
if bib_name in bib_files and bib_name not in seen_bib_names:
|
|
212
|
+
used_bibs.append(bib_files[bib_name])
|
|
213
|
+
seen_bib_names.add(bib_name)
|
|
214
|
+
logger.debug(f"Using referenced .bib file: {bib_name}")
|
|
215
|
+
elif bib_name in seen_bib_names:
|
|
216
|
+
logger.debug(f"Skipping duplicate .bib file: {bib_name}")
|
|
217
|
+
else:
|
|
218
|
+
logger.debug(f"Referenced .bib file not found: {bib_name}")
|
|
219
|
+
|
|
220
|
+
if used_bibs:
|
|
221
|
+
raw_bib_content = '\n\n'.join(used_bibs)
|
|
222
|
+
# Filter BibTeX to only include cited references
|
|
223
|
+
filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
224
|
+
logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
|
|
225
|
+
return filtered_content
|
|
226
|
+
else:
|
|
227
|
+
# Fallback to all bib files if none of the referenced ones found
|
|
228
|
+
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
229
|
+
filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
230
|
+
logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
|
|
231
|
+
return filtered_content
|
|
232
|
+
else:
|
|
233
|
+
# No \bibliography command found, use all bib files
|
|
234
|
+
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
235
|
+
filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
|
|
236
|
+
logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
|
|
237
|
+
return filtered_content
|
|
238
|
+
else:
|
|
239
|
+
# No main tex file but have bib files
|
|
240
|
+
raw_bib_content = '\n\n'.join(bib_files.values())
|
|
241
|
+
# Can't filter without tex files, so use original content
|
|
242
|
+
logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
|
|
243
|
+
return raw_bib_content
|
|
244
|
+
|
|
245
|
+
|
|
222
246
|
def extract_cited_keys_from_tex(tex_files, main_tex_content):
|
|
223
247
|
"""
|
|
224
248
|
Extract all citation keys from TeX files.
|
|
@@ -261,7 +285,11 @@ def is_reference_used(reference_key, cited_keys):
|
|
|
261
285
|
Returns:
|
|
262
286
|
True if the reference is cited, False otherwise
|
|
263
287
|
"""
|
|
264
|
-
|
|
288
|
+
result = reference_key in cited_keys
|
|
289
|
+
# Add debugging for the first few mismatches to understand the issue
|
|
290
|
+
if not result and len([k for k in cited_keys if k.startswith('a')]) < 3: # Limit debug output
|
|
291
|
+
logger.debug(f"Key '{reference_key}' not found in cited_keys")
|
|
292
|
+
return result
|
|
265
293
|
|
|
266
294
|
|
|
267
295
|
def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
|
|
@@ -291,14 +319,30 @@ def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
|
|
|
291
319
|
from utils.bibtex_parser import parse_bibtex_entries
|
|
292
320
|
entries = parse_bibtex_entries(bib_content)
|
|
293
321
|
|
|
294
|
-
# Filter entries to only cited ones
|
|
322
|
+
# Filter entries to only cited ones and remove duplicates
|
|
295
323
|
cited_entries = []
|
|
324
|
+
seen_keys = set()
|
|
325
|
+
not_cited_count = 0
|
|
326
|
+
duplicate_count = 0
|
|
327
|
+
|
|
296
328
|
for entry in entries:
|
|
297
329
|
entry_key = entry.get('key', '')
|
|
298
330
|
if is_reference_used(entry_key, cited_keys):
|
|
299
|
-
|
|
331
|
+
if entry_key not in seen_keys:
|
|
332
|
+
cited_entries.append(entry)
|
|
333
|
+
seen_keys.add(entry_key)
|
|
334
|
+
else:
|
|
335
|
+
duplicate_count += 1
|
|
336
|
+
logger.debug(f"Skipping duplicate entry: '{entry_key}'")
|
|
337
|
+
else:
|
|
338
|
+
not_cited_count += 1
|
|
339
|
+
# Log first few entries that are NOT cited for debugging
|
|
340
|
+
if not_cited_count <= 5:
|
|
341
|
+
logger.debug(f"Entry NOT cited: '{entry_key}'")
|
|
300
342
|
|
|
301
|
-
logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited")
|
|
343
|
+
logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited (removed {duplicate_count} duplicates)")
|
|
344
|
+
logger.debug(f"Citation keys found: {len(cited_keys)} keys")
|
|
345
|
+
logger.debug(f"Sample cited keys: {list(cited_keys)[:10]}")
|
|
302
346
|
|
|
303
347
|
# Reconstruct BibTeX content from cited entries
|
|
304
348
|
if not cited_entries:
|
|
@@ -3902,6 +3902,9 @@ def are_venues_substantially_different(venue1: str, venue2: str) -> bool:
|
|
|
3902
3902
|
# Handle specific multi-word patterns and well-known acronyms
|
|
3903
3903
|
'proc. natl. acad. sci.': 'proceedings of the national academy of sciences',
|
|
3904
3904
|
'pnas': 'proceedings of the national academy of sciences',
|
|
3905
|
+
# Special cases that don't follow standard acronym patterns
|
|
3906
|
+
'neurips': 'neural information processing systems', # Special case
|
|
3907
|
+
'nips': 'neural information processing systems', # old name for neurips
|
|
3905
3908
|
}
|
|
3906
3909
|
# Sort by length (longest first) to ensure longer matches take precedence
|
|
3907
3910
|
for abbrev, expansion in sorted(common_abbrevs.items(), key=lambda x: len(x[0]), reverse=True):
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/enhanced_hybrid_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/local_semantic_scholar.py
RENAMED
|
File without changes
|
|
File without changes
|
{academic_refchecker-1.2.49 → academic_refchecker-1.2.50}/src/checkers/openreview_checker.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|