academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1,332 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Bibliography extraction and parsing utilities.
4
+
5
+ This module provides utilities for extracting and parsing bibliographies from
6
+ academic papers in various formats (LaTeX, BibTeX, PDF text, etc.).
7
+ """
8
+
9
+ import re
10
+ import logging
11
+ import os
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
def extract_text_from_latex(latex_file_path):
    """
    Read a LaTeX file and return its raw content as a string.

    The file is decoded as UTF-8 first. If that raises UnicodeDecodeError the
    file is read a second time as latin-1. No LaTeX processing is performed;
    the content is returned exactly as decoded.

    Args:
        latex_file_path: Path to the LaTeX file

    Returns:
        String containing the LaTeX file content, or None if the file could
        not be read (the failure is logged at error level, never raised)
    """
    logger.info(f"Reading LaTeX file: {latex_file_path}")

    # First attempt: UTF-8.
    try:
        with open(latex_file_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except UnicodeDecodeError:
        # Must be listed before the generic handler: UnicodeDecodeError is an
        # Exception subclass and would otherwise be reported as a read failure.
        logger.warning(f"UTF-8 encoding failed for {latex_file_path}, trying latin-1")
    except Exception as e:
        # Deliberately broad: callers rely on None instead of an exception.
        logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
        return None
    else:
        logger.info(f"Successfully read LaTeX file with {len(content)} characters")
        return content

    # Second attempt: latin-1 maps every byte to a character, so only I/O
    # errors (file removed, permissions changed, ...) can fail here.
    try:
        with open(latex_file_path, 'r', encoding='latin-1') as f:
            content = f.read()
    except Exception as e:
        logger.error(f"Failed to read LaTeX file {latex_file_path}: {e}")
        return None

    logger.info("Successfully read LaTeX file with latin-1 encoding")
    return content
46
+
47
+
48
def find_bibliography_section(text):
    """
    Find the bibliography section in the text.

    Two strategies are tried in order:

    1. Heading search. A list of heading patterns ("References",
       "Bibliography", "Works Cited", ...) is tried in list order. The first
       pattern that matches anywhere in the text wins, and everything after
       that match is treated as the bibliography.
    2. End-of-document fallback. If no heading matches, the last 100 lines
       are scanned for a line that looks like a numbered entry ("[12] ..."
       or "12. ..."). The bibliography is taken to start at the earliest
       such line in that window.

    Args:
        text: Full document text; may be None or empty

    Returns:
        The bibliography text with surrounding whitespace stripped, or None
        if no text was given or no bibliography could be located
    """
    if not text:
        logger.warning("No text provided to find_bibliography_section")
        return None

    # Log a sample of the text for debugging
    text_sample = text[:500] + "..." if len(text) > 500 else text
    logger.debug(f"Text sample: {text_sample}")

    # Common section titles for bibliography. Order matters: the first
    # pattern that matches is used, so the more specific numbered headings
    # come before the generic ones.
    section_patterns = [
        # Patterns for numbered sections with potential spacing issues from PDF extraction
        r'(?i)\d+\s*ref\s*er\s*ences\s*\n',  # "12 Refer ences" with spaces
        r'(?i)\d+\s*references\s*\n',  # "12References" or "12 References"
        r'(?i)^\s*\d+\.\s*references\s*$',  # Numbered section: "7. References"
        r'(?i)\d+\s+references\s*\.',  # "9 References." format used in Georgia Tech paper
        # Standard reference patterns
        r'(?i)references\s*\n',
        r'(?i)bibliography\s*\n',
        r'(?i)works cited\s*\n',
        r'(?i)literature cited\s*\n',
        r'(?i)references\s*$',  # End of document
        r'(?i)\[\s*references\s*\]',  # [References]
        r'(?i)^\s*references\s*$',  # References as a standalone line
        r'(?i)^\s*bibliography\s*$',  # Bibliography as a standalone line
        r'(?i)references\s*and\s*citations',  # References and Citations
        r'(?i)cited\s*references',  # Cited References
        r'(?i)reference\s*list',  # Reference List
    ]

    bibliography_start = None

    for pattern in section_patterns:
        heading = re.search(pattern, text, re.MULTILINE)
        if heading:
            # The bibliography begins right after the heading itself.
            bibliography_start = heading.end()
            logger.debug(f"Bibliography section found using pattern: {pattern}")
            break

    if bibliography_start is None:
        logger.debug("No bibliography section header found, trying end-of-document approach")
        # Try to find bibliography at the end of the document without explicit headers
        lines = text.split('\n')
        window_start = max(0, len(lines) - 100)  # Check last 100 lines

        # Character offset of the first line in the window; each skipped
        # line contributes its length plus the '\n' removed by split().
        offset = sum(len(skipped) + 1 for skipped in lines[:window_start])

        # Scan forward so the match is the FIRST entry in the window.
        # Scanning backwards and stopping at the first hit would return
        # only the last reference of the list.
        for raw_line in lines[window_start:]:
            line = raw_line.strip()
            if re.match(r'^\[\d+\]', line) or re.match(r'^\d+\.', line):
                # Found what looks like reference entries
                bibliography_start = offset
                logger.debug(f"Bibliography section found at end of document starting with: {line[:50]}")
                break
            offset += len(raw_line) + 1

    if bibliography_start is None:
        logger.debug("Bibliography section not found")
        return None

    # No attempt is made to find the END of the bibliography: it may run to
    # the end of the document or be followed by appendices, acknowledgments,
    # etc., so everything after the start offset is returned.
    bibliography_text = text[bibliography_start:].strip()
    logger.debug(f"Bibliography text length: {len(bibliography_text)}")
    return bibliography_text
116
+
117
+
118
def parse_references(bibliography_text):
    """
    Parse references from bibliography text using multiple parsing strategies.

    The strategies are tried in a fixed order (BibTeX, biblatex, ACM/natbib,
    generic regex). The result of the first strategy that returns a non-empty
    result is returned as-is; later strategies are not consulted. A strategy
    that raises is treated as "no result" so that the next one still runs.

    Args:
        bibliography_text: String containing bibliography content

    Returns:
        List of parsed reference dictionaries; an empty list if no text was
        given or every strategy came back empty or failed
    """
    if not bibliography_text:
        logger.warning("No bibliography text provided to parse_references")
        return []

    # Try different parsing strategies in order of preference
    parsing_strategies = (
        ('BibTeX', _parse_bibtex_references),
        ('biblatex', _parse_biblatex_references),
        ('ACM/natbib', _parse_standard_acm_natbib_references),
        ('regex-based', _parse_references_regex),
    )

    for strategy_name, parse_func in parsing_strategies:
        logger.debug(f"Attempting {strategy_name} parsing")
        try:
            references = parse_func(bibliography_text)
        except Exception as e:
            # Best effort by design: one broken parser must not prevent the
            # remaining strategies from running. The traceback is attached
            # so the failure can still be diagnosed from debug logs.
            logger.debug(f"{strategy_name} parsing failed: {e}", exc_info=True)
            continue

        if references:
            logger.info(f"Successfully parsed {len(references)} references using {strategy_name} format")
            return references

        logger.debug(f"{strategy_name} parsing returned no references")

    logger.warning("All parsing strategies failed to extract references")
    return []
155
+
156
+
157
def _parse_bibtex_references(bibliography_text):
    """
    Parse BibTeX source (entries such as @inproceedings{...}, @article{...}).

    This is a thin adapter: the text is handed unchanged to
    refchecker.utils.bibtex_parser.parse_bibtex_entries, which is imported
    when the function is called rather than at module load.

    Args:
        bibliography_text: String containing BibTeX entries

    Returns:
        Whatever parse_bibtex_entries returns for the text, passed through
        unchanged (expected to be a list of reference dictionaries)
    """
    from refchecker.utils.bibtex_parser import parse_bibtex_entries

    parsed_entries = parse_bibtex_entries(bibliography_text)
    return parsed_entries
169
+
170
+
171
def _parse_biblatex_references(bibliography_text):
    """
    Parse biblatex-style references such as: [1] Author. "Title". In: Venue. Year.

    This is a thin adapter: the text is handed unchanged to
    refchecker.utils.text_utils.extract_latex_references, which is imported
    when the function is called rather than at module load.

    NOTE(review): the package also ships refchecker.utils.biblatex_parser;
    confirm that extract_latex_references is the intended entry point here.

    Args:
        bibliography_text: String containing biblatex .bbl entries

    Returns:
        Whatever extract_latex_references returns for the text, passed
        through unchanged (expected to be a list of reference dictionaries)
    """
    from refchecker.utils.text_utils import extract_latex_references

    latex_references = extract_latex_references(bibliography_text)
    return latex_references
183
+
184
+
185
def _parse_standard_acm_natbib_references(bibliography_text):
    """
    Parse references in the standard ACM/natbib format (both ACM Reference
    Format and simple natbib).

    The text is first passed to detect_standard_acm_natbib_format; parsing
    only proceeds when the returned mapping has a truthy 'is_acm_natbib'
    entry. The text is then split on numeric labels of the form "[12]" and
    each label/content pair is handed to _parse_simple_natbib_format.
    Entries whose content is empty after stripping are skipped, and an entry
    that fails to parse is logged at debug level and skipped.

    Args:
        bibliography_text: String containing the bibliography content

    Returns:
        List of reference dictionaries; an empty list when the format is not
        detected or no entry could be parsed
    """
    from refchecker.utils.text_utils import detect_standard_acm_natbib_format

    # Check if this is standard ACM natbib format
    format_info = detect_standard_acm_natbib_format(bibliography_text)
    if not format_info['is_acm_natbib']:
        return []

    logger.debug("Detected standard ACM natbib format")

    # Because the pattern has a capturing group, re.split returns
    # [text before first label, num1, content1, num2, content2, ...].
    # Dropping the first element leaves strictly alternating num/content.
    ref_pattern = r'\[(\d+)\]\s*'
    entries = re.split(ref_pattern, bibliography_text)[1:]

    references = []
    for ref_num, raw_content in zip(entries[0::2], entries[1::2]):
        ref_content = raw_content.strip()
        if not ref_content:
            # A bare label (e.g. "[3]" directly followed by "[4]") carries
            # nothing to verify; skip it as _parse_references_regex does.
            continue

        try:
            reference = _parse_simple_natbib_format(int(ref_num), ref_content, f"[{ref_num}]")
            if reference:
                references.append(reference)
                logger.debug(f"Parsed reference {ref_num}: {reference.get('title', 'No title')[:50]}...")
        except Exception as e:
            # One malformed entry must not discard the rest of the list.
            logger.debug(f"Error parsing reference {ref_num}: {e}")
            continue

    logger.debug(f"ACM natbib parsing extracted {len(references)} references")
    return references
219
+
220
+
221
def _parse_simple_natbib_format(ref_num, content, label):
    """
    Build a minimal reference record from one natbib-style entry.

    This is a deliberately shallow parser. It stores the raw text and label,
    adds a URL and a year when the text_utils helpers find them, and guesses
    the entry type from keywords in the lower-cased text.

    Args:
        ref_num: Reference number (accepted but not used by this function)
        content: Reference content text
        label: Reference label (e.g., "[1]")

    Returns:
        Dictionary with 'raw_text', 'label' and 'type' always present, plus
        'url', 'year' and 'note' when they could be determined
    """
    from refchecker.utils.text_utils import extract_url_from_reference, extract_year_from_reference

    entry = {'raw_text': content, 'label': label, 'type': 'unknown'}

    # Optional fields: only stored when the helper returns a truthy value.
    found_url = extract_url_from_reference(content)
    if found_url:
        entry['url'] = found_url

    found_year = extract_year_from_reference(content)
    if found_year:
        entry['year'] = found_year

    # Keyword-based type guess; the first matching rule wins, so a text
    # mentioning both "conference" and "arxiv" is classed as inproceedings.
    lowered = content.lower()
    if any(keyword in lowered for keyword in ('proceedings', 'conference')):
        entry['type'] = 'inproceedings'
    elif any(keyword in lowered for keyword in ('journal', 'trans.')):
        entry['type'] = 'article'
    elif 'arxiv' in lowered:
        entry.update(type='misc', note='arXiv preprint')

    return entry
266
+
267
+
268
def _parse_references_regex(bibliography_text):
    """
    Parse references using a regex-based approach (original implementation).

    The text is split on numeric labels such as "[1]", "[2]": each label
    owns the text up to the next "[number]" or the end of the string.
    Entries whose content is empty after stripping are skipped. For every
    remaining entry the raw text and label are stored, plus a URL and a year
    when the text_utils helpers find them. The type is always 'unknown'.

    Args:
        bibliography_text: String containing the bibliography content

    Returns:
        List of reference dictionaries; an empty list when the text contains
        no "[number]" labels
    """
    # Split bibliography into individual references
    # Look for patterns like [1], [2], etc.
    ref_pattern = r'\[(\d+)\](.*?)(?=\[\d+\]|$)'
    matches = re.findall(ref_pattern, bibliography_text, re.DOTALL)
    if not matches:
        return []

    # Imported once per call, and only when there is something to parse,
    # rather than re-executing the import statement on every loop iteration.
    from refchecker.utils.text_utils import extract_url_from_reference, extract_year_from_reference

    references = []
    for ref_num, ref_content in matches:
        ref_content = ref_content.strip()
        if not ref_content:
            continue

        reference = {
            'raw_text': ref_content,
            'label': f"[{ref_num}]",
            'type': 'unknown'
        }

        # Basic information extraction
        url = extract_url_from_reference(ref_content)
        if url:
            reference['url'] = url

        year = extract_year_from_reference(ref_content)
        if year:
            reference['year'] = year

        references.append(reference)

    return references
304
+
305
+
306
def _is_bibtex_surname_given_format(surname_part, given_part):
    """
    Check if this appears to be a BibTeX "Surname, Given" format.

    The check is a pair of regex heuristics applied to the two halves of a
    comma-separated name after stripping surrounding whitespace:

    * the surname must start with an ASCII capital and otherwise consist of
      ASCII letters, whitespace, hyphens or apostrophes (at least two
      characters in total);
    * the given part must be either a run of initials ("J", "J.", "J. R.",
      "J.R.") or capitalized words whose later parts may be shortened to an
      initial ("John", "John Robert", "John R", "John R.").

    Args:
        surname_part: The part before the comma
        given_part: The part after the comma

    Returns:
        Boolean indicating if this looks like BibTeX name format
    """
    # Simple heuristics to detect BibTeX format
    if not surname_part or not given_part:
        return False

    # Check if surname looks like a surname (capitalized).
    # NOTE: the character class is ASCII-only, so surnames containing
    # accented or other non-ASCII letters are rejected by this check.
    if not re.match(r'^[A-Z][a-zA-Z\s\-\']+$', surname_part.strip()):
        return False

    # Check if given part looks like given names (often abbreviated)
    given_clean = given_part.strip()
    given_patterns = (
        # Initials: "J", "J.", "J. R", "J. R.", "J.R.". The optional final
        # dot sits outside the repeated group so that a single dotted
        # initial ("J.") is accepted as well.
        r'^[A-Z](\.\s*[A-Z])*\.?$',
        # Full first name followed by further names or initials:
        # "John", "John Robert", "John R", "John R."
        r'^[A-Z][a-z]+(\s+[A-Z]([a-z]+|\.)?)*$',
    )
    return any(re.match(pattern, given_clean) for pattern in given_patterns)