academic_refchecker-2.0.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
refchecker/utils/db_utils.py
@@ -0,0 +1,210 @@
+#!/usr/bin/env python3
+"""
+Database Processing Utilities for Reference Checking
+
+This module provides utilities for processing database results,
+particularly for Semantic Scholar data processing.
+"""
+
+import json
+import logging
+from typing import Dict, List, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+
+def process_semantic_scholar_result(paper_data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Process a single Semantic Scholar database result by parsing JSON fields
+    and reconstructing the paper data structure.
+
+    Args:
+        paper_data: Raw paper data dictionary from database
+
+    Returns:
+        Processed paper data dictionary
+    """
+    try:
+        # Extract authors from JSON
+        if paper_data.get('authors'):
+            if isinstance(paper_data['authors'], str):
+                paper_data['authors'] = json.loads(paper_data['authors'])
+        else:
+            paper_data['authors'] = []
+
+        # Reconstruct external IDs from flattened columns
+        external_ids = {}
+        for key, value in paper_data.items():
+            if key.startswith('externalIds_') and value:
+                external_id_type = key.replace('externalIds_', '')
+                external_ids[external_id_type] = value
+        paper_data['externalIds'] = external_ids
+
+        # Add other JSON fields
+        if paper_data.get('s2FieldsOfStudy'):
+            if isinstance(paper_data['s2FieldsOfStudy'], str):
+                paper_data['s2FieldsOfStudy'] = json.loads(paper_data['s2FieldsOfStudy'])
+
+        if paper_data.get('publicationTypes'):
+            if isinstance(paper_data['publicationTypes'], str):
+                paper_data['publicationTypes'] = json.loads(paper_data['publicationTypes'])
+
+        return paper_data
+
+    except Exception as e:
+        logger.warning(f"Error processing database result: {e}")
+        return paper_data
+
+
+def process_semantic_scholar_results(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+    """
+    Process multiple Semantic Scholar database results.
+
+    Args:
+        results: List of raw database row dictionaries
+
+    Returns:
+        List of processed paper data dictionaries
+    """
+    processed_results = []
+
+    for paper_data in results:
+        processed_result = process_semantic_scholar_result(paper_data)
+        if processed_result:
+            processed_results.append(processed_result)
+
+    return processed_results
+
+
+def extract_external_ids(paper_data: Dict[str, Any]) -> Dict[str, str]:
+    """
+    Extract external IDs from flattened database columns.
+
+    Args:
+        paper_data: Paper data dictionary from database
+
+    Returns:
+        Dictionary of external IDs
+    """
+    external_ids = {}
+
+    for key, value in paper_data.items():
+        if key.startswith('externalIds_') and value:
+            external_id_type = key.replace('externalIds_', '')
+            external_ids[external_id_type] = value
+
+    return external_ids
+
+
+def parse_json_field(data: Dict[str, Any], field_name: str) -> Any:
+    """
+    Parse a JSON field from database data, handling both string and already-parsed data.
+
+    Args:
+        data: Database record dictionary
+        field_name: Name of the field to parse
+
+    Returns:
+        Parsed data or empty list/dict if parsing fails
+    """
+    try:
+        field_data = data.get(field_name)
+        if not field_data:
+            return [] if field_name in ['authors', 's2FieldsOfStudy', 'publicationTypes'] else {}
+
+        if isinstance(field_data, str):
+            return json.loads(field_data)
+        else:
+            return field_data
+
+    except (json.JSONDecodeError, TypeError) as e:
+        logger.warning(f"Failed to parse JSON field '{field_name}': {e}")
+        return [] if field_name in ['authors', 's2FieldsOfStudy', 'publicationTypes'] else {}
+
+
+def reconstruct_paper_structure(row_data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Reconstruct the full paper data structure from flattened database row.
+
+    Args:
+        row_data: Raw database row data
+
+    Returns:
+        Reconstructed paper data structure
+    """
+    # Start with the row data
+    paper_data = dict(row_data)
+
+    # Parse JSON fields
+    paper_data['authors'] = parse_json_field(paper_data, 'authors')
+    paper_data['s2FieldsOfStudy'] = parse_json_field(paper_data, 's2FieldsOfStudy')
+    paper_data['publicationTypes'] = parse_json_field(paper_data, 'publicationTypes')
+
+    # Reconstruct external IDs
+    paper_data['externalIds'] = extract_external_ids(paper_data)
+
+    return paper_data
+
+
+def safe_json_loads(json_string: str, default_value: Any = None) -> Any:
+    """
+    Safely load JSON string with fallback to default value.
+
+    Args:
+        json_string: JSON string to parse
+        default_value: Default value if parsing fails
+
+    Returns:
+        Parsed JSON data or default value
+    """
+    if not json_string:
+        return default_value
+
+    try:
+        return json.loads(json_string)
+    except (json.JSONDecodeError, TypeError) as e:
+        logger.debug(f"Failed to parse JSON: {e}")
+        return default_value
+
+
+def flatten_external_ids(external_ids: Dict[str, str]) -> Dict[str, str]:
+    """
+    Flatten external IDs dictionary into database column format.
+
+    Args:
+        external_ids: Dictionary of external IDs
+
+    Returns:
+        Flattened dictionary with externalIds_ prefix
+    """
+    flattened = {}
+
+    for id_type, id_value in external_ids.items():
+        flattened[f'externalIds_{id_type}'] = id_value
+
+    return flattened
+
+
+def validate_paper_data(paper_data: Dict[str, Any]) -> bool:
+    """
+    Validate that paper data has required fields.

+    Args:
+        paper_data: Paper data dictionary to validate
+
+    Returns:
+        True if data appears valid, False otherwise
+    """
+    # Check for essential fields
+    required_fields = ['title']
+
+    for field in required_fields:
+        if not paper_data.get(field):
+            return False
+
+    # Validate authors field
+    authors = paper_data.get('authors', [])
+    if not isinstance(authors, list):
+        return False
+
+    return True
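
For orientation, a minimal sketch of the round trip these helpers assume: list-valued fields (authors, s2FieldsOfStudy, publicationTypes) stored as JSON strings, and external identifiers flattened into externalIds_-prefixed columns. The row below is hypothetical, invented for illustration; only the import path is taken from the package layout in the file list.

from refchecker.utils.db_utils import (
    flatten_external_ids,
    reconstruct_paper_structure,
    validate_paper_data,
)

# A flattened row as it might come back from the database layer (hypothetical values).
row = {
    'title': 'Attention Is All You Need',
    'authors': '[{"name": "Ashish Vaswani"}]',
    's2FieldsOfStudy': '[{"category": "Computer Science"}]',
    'publicationTypes': '["Conference"]',
    'externalIds_DOI': '10.48550/arxiv.1706.03762',
    'externalIds_ArXiv': '1706.03762',
}

paper = reconstruct_paper_structure(row)
assert paper['authors'] == [{'name': 'Ashish Vaswani'}]
assert paper['externalIds'] == {'DOI': '10.48550/arxiv.1706.03762', 'ArXiv': '1706.03762'}
assert validate_paper_data(paper)

# flatten_external_ids() inverts the reconstruction for writes.
assert flatten_external_ids(paper['externalIds'])['externalIds_ArXiv'] == '1706.03762'

Note that reconstruct_paper_structure() copies the row and leaves the original externalIds_* keys in place alongside the rebuilt externalIds dictionary, so callers that need a clean structure must drop the flattened columns themselves.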
refchecker/utils/doi_utils.py
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+"""
+DOI Utilities for Reference Checking
+
+This module provides utilities for DOI handling, extraction, and validation.
+"""
+
+import re
+from typing import Optional
+
+
+def extract_doi_from_url(url: str) -> Optional[str]:
+    """
+    Extract DOI from a URL using comprehensive pattern matching.
+
+    Args:
+        url: URL that might contain a DOI
+
+    Returns:
+        Extracted DOI or None if not found
+    """
+    if not url:
+        return None
+
+    # Only extract DOIs from actual DOI URLs, not from other domains
+    # This prevents false positives from URLs like aclanthology.org
+    if 'doi.org' not in url and 'doi:' not in url:
+        return None
+
+    # DOI patterns ordered by specificity and reliability
+    doi_patterns = [
+        r'doi\.org/([^/\s\?#]+(?:/[^/\s\?#]+)*)',  # Full DOI pattern from doi.org
+        r'doi:([^/\s\?#]+(?:/[^/\s\?#]+)*)',       # doi: prefix format
+    ]
+
+    for pattern in doi_patterns:
+        match = re.search(pattern, url)
+        if match:
+            doi_candidate = match.group(1)
+            # DOIs must start with "10." and have at least one slash
+            if doi_candidate.startswith('10.') and '/' in doi_candidate and len(doi_candidate) > 6:
+                return doi_candidate
+
+    return None
+
+
+def normalize_doi(doi: str) -> str:
+    """
+    Normalize a DOI by removing common prefixes, cleaning whitespace, and converting to lowercase.
+
+    DOI suffixes are case-insensitive according to the DOI specification, so we normalize
+    to lowercase to ensure consistent URL generation across all checkers.
+
+    Args:
+        doi: DOI string to normalize
+
+    Returns:
+        Normalized DOI string in lowercase
+    """
+    if not doi:
+        return ""
+
+    # Remove common URL prefixes
+    normalized = doi.replace('https://doi.org/', '').replace('http://doi.org/', '')
+    normalized = normalized.replace('doi:', '')
+
+    # Remove hash fragments and query parameters
+    normalized = normalized.split('#')[0].split('?')[0]
+
+    # Clean whitespace and trailing punctuation
+    normalized = normalized.strip()
+
+    # Remove trailing punctuation that might be included in extraction
+    normalized = normalized.rstrip('.,;:)')
+
+    # Convert to lowercase for consistency (DOI suffixes are case-insensitive)
+    return normalized.lower()
+
+
+def is_valid_doi_format(doi: str) -> bool:
+    """
+    Check if a string matches the basic DOI format.
+
+    Args:
+        doi: String to validate as DOI
+
+    Returns:
+        True if the string matches DOI format, False otherwise
+    """
+    if not doi:
+        return False
+
+    # Basic DOI format: starts with "10." followed by at least one slash
+    doi_format_pattern = r'^10\.\d+/.+'
+    return bool(re.match(doi_format_pattern, doi))
+
+
+def compare_dois(doi1: str, doi2: str) -> bool:
+    """
+    Compare two DOIs for equality, handling different formats and prefixes.
+
+    This function performs exact matching after normalization, with support
+    for partial DOI citations where a shorter DOI is a valid prefix of a longer one.
+
+    Args:
+        doi1: First DOI to compare
+        doi2: Second DOI to compare
+
+    Returns:
+        True if DOIs are equivalent, False otherwise
+    """
+    if not doi1 or not doi2:
+        return False
+
+    # Normalize both DOIs (handles prefixes, case, punctuation)
+    norm_doi1 = normalize_doi(doi1)
+    norm_doi2 = normalize_doi(doi2)
+
+    # First try exact match
+    if norm_doi1 == norm_doi2:
+        return True
+
+    # Handle partial DOI citations - if one DOI is a prefix of the other, consider it a match
+    # This handles cases like "10.1007" being cited instead of the full "10.1007/s10458-025-09691-y"
+    if len(norm_doi1) != len(norm_doi2):
+        shorter_doi = norm_doi1 if len(norm_doi1) < len(norm_doi2) else norm_doi2
+        longer_doi = norm_doi2 if len(norm_doi1) < len(norm_doi2) else norm_doi1
+
+        # Only consider it a valid partial match if:
+        # 1. The shorter DOI is at least 7 characters (e.g., "10.1007")
+        # 2. The longer DOI starts with the shorter DOI
+        # 3. The next character in the longer DOI is '/' or '.' (valid DOI separators)
+        if (len(shorter_doi) >= 7 and
+                longer_doi.startswith(shorter_doi) and
+                len(longer_doi) > len(shorter_doi) and
+                longer_doi[len(shorter_doi)] in ['/', '.']):
+            return True
+
+    return False
+
+
+def construct_doi_url(doi: str) -> str:
+    """
+    Construct a proper DOI URL from a DOI string.
+
+    Args:
+        doi: DOI string
+
+    Returns:
+        Full DOI URL
+    """
+    if not doi:
+        return ""
+
+    # Normalize the DOI first
+    normalized_doi = normalize_doi(doi)
+
+    # Construct URL
+    return f"https://doi.org/{normalized_doi}"
+
+
+def validate_doi_resolves(doi: str, timeout: float = 5.0) -> bool:
+    """
+    Validate that a DOI resolves by checking if doi.org returns a redirect.
+
+    This is useful for determining if a DOI is valid, even if it's different
+    from what a verification source has stored (e.g., arXiv DOI vs conference DOI).
+
+    Args:
+        doi: DOI string to validate
+        timeout: Request timeout in seconds
+
+    Returns:
+        True if the DOI resolves (doi.org returns a success or redirect status),
+        False if it is malformed or clearly does not resolve
+    """
+    if not doi or not is_valid_doi_format(normalize_doi(doi)):
+        return False
+
+    try:
+        import requests
+        url = construct_doi_url(doi)
+        # Use a HEAD request without following redirects (faster than a full GET)
+        response = requests.head(url, allow_redirects=False, timeout=timeout)
+        # doi.org returns 302 for valid DOIs that redirect to the paper;
+        # some may return 301 (permanent redirect) or 200 (direct response)
+        return response.status_code in (200, 301, 302, 303, 307, 308)
+    except Exception:
+        # On any error (timeout, connection error, etc.), assume the DOI might be
+        # valid to avoid false negatives due to network issues
+        return True
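
A short sketch of how these functions chain together; the DOIs are illustrative, and validate_doi_resolves is omitted because it issues a live network request:

from refchecker.utils.doi_utils import (
    compare_dois,
    construct_doi_url,
    extract_doi_from_url,
    normalize_doi,
)

# Extraction fires only for doi.org URLs or doi:-prefixed strings, so
# publisher landing pages do not produce false positives.
assert extract_doi_from_url('https://aclanthology.org/P19-1001/') is None
assert extract_doi_from_url('https://doi.org/10.1162/NECO.1997.9.8.1735?via=x') == '10.1162/NECO.1997.9.8.1735'

# Normalization strips URL prefixes, query strings, trailing punctuation, and case.
assert normalize_doi('https://doi.org/10.1162/NECO.1997.9.8.1735.') == '10.1162/neco.1997.9.8.1735'

# Comparison is exact after normalization, plus a prefix rule for partial citations.
assert compare_dois('doi:10.1162/NECO.1997.9.8.1735', '10.1162/neco.1997.9.8.1735')
assert compare_dois('10.1007', '10.1007/s10458-025-09691-y')

assert construct_doi_url('doi:10.1162/NECO.1997.9.8.1735') == 'https://doi.org/10.1162/neco.1997.9.8.1735'

The prefix rule in compare_dois() is deliberately permissive: any normalized DOI of at least seven characters matches a longer DOI it prefixes at a '/' or '.' boundary, trading some false-positive risk for tolerance of truncated citations.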