academic-refchecker 2.0.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. academic_refchecker-2.0.7.dist-info/METADATA +738 -0
  2. academic_refchecker-2.0.7.dist-info/RECORD +64 -0
  3. academic_refchecker-2.0.7.dist-info/WHEEL +5 -0
  4. academic_refchecker-2.0.7.dist-info/entry_points.txt +3 -0
  5. academic_refchecker-2.0.7.dist-info/licenses/LICENSE +21 -0
  6. academic_refchecker-2.0.7.dist-info/top_level.txt +2 -0
  7. backend/__init__.py +21 -0
  8. backend/__main__.py +11 -0
  9. backend/cli.py +64 -0
  10. backend/concurrency.py +100 -0
  11. backend/database.py +711 -0
  12. backend/main.py +1367 -0
  13. backend/models.py +99 -0
  14. backend/refchecker_wrapper.py +1126 -0
  15. backend/static/assets/index-2P6L_39v.css +1 -0
  16. backend/static/assets/index-hk21nqxR.js +25 -0
  17. backend/static/favicon.svg +6 -0
  18. backend/static/index.html +15 -0
  19. backend/static/vite.svg +1 -0
  20. backend/thumbnail.py +517 -0
  21. backend/websocket_manager.py +104 -0
  22. refchecker/__init__.py +13 -0
  23. refchecker/__main__.py +11 -0
  24. refchecker/__version__.py +3 -0
  25. refchecker/checkers/__init__.py +17 -0
  26. refchecker/checkers/crossref.py +541 -0
  27. refchecker/checkers/enhanced_hybrid_checker.py +563 -0
  28. refchecker/checkers/github_checker.py +326 -0
  29. refchecker/checkers/local_semantic_scholar.py +540 -0
  30. refchecker/checkers/openalex.py +513 -0
  31. refchecker/checkers/openreview_checker.py +984 -0
  32. refchecker/checkers/pdf_paper_checker.py +493 -0
  33. refchecker/checkers/semantic_scholar.py +764 -0
  34. refchecker/checkers/webpage_checker.py +938 -0
  35. refchecker/config/__init__.py +1 -0
  36. refchecker/config/logging.conf +36 -0
  37. refchecker/config/settings.py +170 -0
  38. refchecker/core/__init__.py +7 -0
  39. refchecker/core/db_connection_pool.py +141 -0
  40. refchecker/core/parallel_processor.py +415 -0
  41. refchecker/core/refchecker.py +5838 -0
  42. refchecker/database/__init__.py +6 -0
  43. refchecker/database/download_semantic_scholar_db.py +1725 -0
  44. refchecker/llm/__init__.py +0 -0
  45. refchecker/llm/base.py +376 -0
  46. refchecker/llm/providers.py +911 -0
  47. refchecker/scripts/__init__.py +1 -0
  48. refchecker/scripts/start_vllm_server.py +121 -0
  49. refchecker/services/__init__.py +8 -0
  50. refchecker/services/pdf_processor.py +268 -0
  51. refchecker/utils/__init__.py +27 -0
  52. refchecker/utils/arxiv_utils.py +462 -0
  53. refchecker/utils/author_utils.py +179 -0
  54. refchecker/utils/biblatex_parser.py +584 -0
  55. refchecker/utils/bibliography_utils.py +332 -0
  56. refchecker/utils/bibtex_parser.py +411 -0
  57. refchecker/utils/config_validator.py +262 -0
  58. refchecker/utils/db_utils.py +210 -0
  59. refchecker/utils/doi_utils.py +190 -0
  60. refchecker/utils/error_utils.py +482 -0
  61. refchecker/utils/mock_objects.py +211 -0
  62. refchecker/utils/text_utils.py +5057 -0
  63. refchecker/utils/unicode_utils.py +335 -0
  64. refchecker/utils/url_utils.py +307 -0
@@ -0,0 +1,462 @@
+"""
+ArXiv utility functions for downloading and processing ArXiv papers.
+
+This module provides functions for:
+- Downloading ArXiv LaTeX source files
+- Downloading ArXiv BibTeX citations
+- Extracting ArXiv IDs from URLs or paper identifiers
+- Processing ArXiv source files for bibliography content
+"""
+
+import os
+import re
+import logging
+import requests
+import tempfile
+import tarfile
+
+logger = logging.getLogger(__name__)
+
+
+def extract_arxiv_id_from_paper(paper):
+    """
+    Extract ArXiv ID from a paper object.
+
+    Args:
+        paper: Paper object with potential ArXiv ID in URL or short_id
+
+    Returns:
+        str: ArXiv ID if found, None otherwise
+    """
+    arxiv_id = None
+
+    if hasattr(paper, 'pdf_url') and paper.pdf_url:
+        # Try to extract ArXiv ID from the PDF URL
+        from refchecker.utils.url_utils import extract_arxiv_id_from_url
+        arxiv_id = extract_arxiv_id_from_url(paper.pdf_url)
+    elif hasattr(paper, 'get_short_id'):
+        # Check if the paper ID itself is an ArXiv ID
+        short_id = paper.get_short_id()
+        if re.match(r'^\d{4}\.\d{4,5}(v\d+)?$', short_id):
+            arxiv_id = short_id
+
+    return arxiv_id
+
+
+def download_arxiv_source(arxiv_id):
+    """
+    Download LaTeX source files from ArXiv for a given ArXiv ID.
+
+    Args:
+        arxiv_id: ArXiv identifier (e.g., "1706.03762")
+
+    Returns:
+        Tuple of (main_tex_content, bib_files_content, bbl_files_content) or (None, None, None) if download fails
+    """
+    try:
+        source_url = f"https://arxiv.org/e-print/{arxiv_id}"
+        logger.debug(f"Downloading ArXiv source from: {source_url}")
+
+        response = requests.get(source_url, timeout=60)
+        response.raise_for_status()
+
+        # Save to temporary file and extract
+        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+            temp_file.write(response.content)
+            temp_path = temp_file.name
+
+        try:
+            # Extract the tar.gz file
+            with tarfile.open(temp_path, 'r:gz') as tar:
+                extracted_files = {}
+
+                for member in tar.getmembers():
+                    if member.isfile():
+                        try:
+                            content = tar.extractfile(member)
+                            if content:
+                                # Read the bytes once, then try to decode as text
+                                raw_bytes = content.read()
+                                try:
+                                    extracted_files[member.name] = raw_bytes.decode('utf-8')
+                                except UnicodeDecodeError:
+                                    try:
+                                        # Decode from the saved bytes; re-reading the exhausted stream would yield empty content
+                                        extracted_files[member.name] = raw_bytes.decode('latin-1')
+                                    except UnicodeDecodeError:
+                                        # Skip binary files
+                                        continue
+                        except Exception as e:
+                            logger.debug(f"Could not extract {member.name}: {e}")
+                            continue
+
+                # Find main .tex file, .bib files, and .bbl files
+                tex_files = {name: content for name, content in extracted_files.items() if name.endswith('.tex')}
+                bib_files = {name: content for name, content in extracted_files.items() if name.endswith('.bib')}
+                bbl_files = {name: content for name, content in extracted_files.items() if name.endswith('.bbl')}
+
+                # Find the main tex file (usually the one with documentclass or largest file)
+                main_tex_content = None
+                if tex_files:
+                    # Look for file with \documentclass
+                    for name, content in tex_files.items():
+                        if '\\documentclass' in content:
+                            main_tex_content = content
+                            logger.debug(f"Found main tex file: {name}")
+                            break
+
+                    # If no documentclass found, take the largest file
+                    if not main_tex_content:
+                        largest_file = max(tex_files.items(), key=lambda x: len(x[1]))
+                        main_tex_content = largest_file[1]
+                        logger.debug(f"Using largest tex file: {largest_file[0]}")
+
+                # Process .bib files using shared logic
+                bib_content = select_and_filter_bib_files(bib_files, main_tex_content, tex_files)
+
+                # Combine all bbl file contents
+                bbl_content = None
+                if bbl_files:
+                    bbl_content = '\n\n'.join(bbl_files.values())
+                    logger.debug(f"Found {len(bbl_files)} .bbl files")
+
+                if main_tex_content or bib_content or bbl_content:
+                    logger.info(f"Successfully downloaded ArXiv source for {arxiv_id}")
+                    return main_tex_content, bib_content, bbl_content
+                else:
+                    logger.debug(f"No usable tex, bib, or bbl files found in ArXiv source for {arxiv_id}")
+                    return None, None, None
+
+        finally:
+            # Clean up temporary file
+            try:
+                os.unlink(temp_path)
+            except OSError:
+                pass
+
+    except Exception as e:
+        logger.debug(f"Failed to download ArXiv source for {arxiv_id}: {str(e)}")
+        return None, None, None
+
+
+def download_arxiv_bibtex(arxiv_id):
+    """
+    Download BibTeX data directly from ArXiv for a given ArXiv ID.
+
+    Note: This returns BibTeX for CITING the paper itself, not the paper's bibliography.
+
+    Args:
+        arxiv_id: ArXiv identifier (e.g., "1706.03762")
+
+    Returns:
+        BibTeX content as string, or None if download fails
+    """
+    try:
+        bibtex_url = f"https://arxiv.org/bibtex/{arxiv_id}"
+        logger.debug(f"Downloading ArXiv BibTeX from: {bibtex_url}")
+
+        response = requests.get(bibtex_url, timeout=30)
+        response.raise_for_status()
+
+        bibtex_content = response.text.strip()
+        if bibtex_content and bibtex_content.startswith('@'):
+            logger.info(f"Successfully downloaded citation BibTeX for ArXiv paper {arxiv_id}")
+            return bibtex_content
+        else:
+            logger.debug(f"Invalid BibTeX response for ArXiv paper {arxiv_id}")
+            return None
+
+    except Exception as e:
+        logger.debug(f"Failed to download BibTeX for ArXiv paper {arxiv_id}: {str(e)}")
+        return None
+
+
+def select_and_filter_bib_files(bib_files, main_tex_content, tex_files):
+    """
+    Select appropriate .bib files based on main TeX file references and filter by citations.
+
+    Args:
+        bib_files: Dict of .bib files {filename: content}
+        main_tex_content: Content of main tex file
+        tex_files: Dict of all tex files {filename: content} (for filtering)
+
+    Returns:
+        Filtered BibTeX content or None if no files available
+    """
+    import re
+
+    if not bib_files:
+        return None
+
+    if main_tex_content:
+        # Extract bibliography references from main tex file
+        referenced_bibs = []
+        bib_pattern = r'\\bibliography\{([^}]+)\}'
+        matches = re.findall(bib_pattern, main_tex_content)
+
+        for match in matches:
+            # Handle multiple bib files separated by commas
+            bib_names = [name.strip() for name in match.split(',')]
+            for bib_name in bib_names:
+                # Add .bib extension if not present
+                if not bib_name.endswith('.bib'):
+                    bib_name += '.bib'
+                referenced_bibs.append(bib_name)
+
+        # Use only referenced .bib files, or all if no references found
+        if referenced_bibs:
+            used_bibs = []
+            seen_bib_names = set()  # Track which bib files we've already added
+            for bib_name in referenced_bibs:
+                if bib_name in bib_files and bib_name not in seen_bib_names:
+                    used_bibs.append(bib_files[bib_name])
+                    seen_bib_names.add(bib_name)
+                    logger.debug(f"Using referenced .bib file: {bib_name}")
+                elif bib_name in seen_bib_names:
+                    logger.debug(f"Skipping duplicate .bib file: {bib_name}")
+                else:
+                    logger.debug(f"Referenced .bib file not found: {bib_name}")
+
+            if used_bibs:
+                raw_bib_content = '\n\n'.join(used_bibs)
+                # Filter BibTeX to only include cited references
+                filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
+                logger.debug(f"Found {len(used_bibs)} referenced .bib files out of {len(bib_files)} total")
+                return filtered_content
+            else:
+                # Fall back to all bib files if none of the referenced ones were found
+                raw_bib_content = '\n\n'.join(bib_files.values())
+                filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
+                logger.debug(f"No referenced .bib files found, using all {len(bib_files)} .bib files")
+                return filtered_content
+        else:
+            # No \bibliography command found, use all bib files
+            raw_bib_content = '\n\n'.join(bib_files.values())
+            filtered_content = filter_bibtex_by_citations(raw_bib_content, tex_files, main_tex_content)
+            logger.debug(f"No \\bibliography command found, using all {len(bib_files)} .bib files")
+            return filtered_content
+    else:
+        # No main tex file but have bib files
+        raw_bib_content = '\n\n'.join(bib_files.values())
+        # Can't filter without tex files, so use original content
+        logger.debug(f"Found {len(bib_files)} .bib files (no main tex to filter)")
+        return raw_bib_content
+
+
+def extract_cited_keys_from_tex(tex_files, main_tex_content):
+    """
+    Extract all citation keys from TeX files.
+
+    Args:
+        tex_files: Dict of all tex files {filename: content}
+        main_tex_content: Content of main tex file
+
+    Returns:
+        Set of cited reference keys
+    """
+    cited_keys = set()
+
+    # Combine all tex content
+    all_tex_content = main_tex_content or ""
+    for tex_content in tex_files.values():
+        all_tex_content += "\n" + tex_content
+
+    # Find all \cite{...} commands; [a-z]*\*? also matches natbib variants such as \citep/\citet
+    cite_pattern = r'\\cite[a-z]*\*?\{([^}]+)\}'
+    matches = re.findall(cite_pattern, all_tex_content)
+
+    for match in matches:
+        # Handle multiple citations: \cite{key1,key2,key3}
+        keys = [key.strip() for key in match.split(',')]
+        cited_keys.update(keys)
+
+    logger.debug(f"Found {len(cited_keys)} unique cited references")
+    return cited_keys
+
+
+def is_reference_used(reference_key, cited_keys):
+    """
+    Check if a specific reference key is used/cited.
+
+    Args:
+        reference_key: The BibTeX key to check
+        cited_keys: Set of all cited reference keys
+
+    Returns:
+        True if the reference is cited, False otherwise
+    """
+    result = reference_key in cited_keys
+    # Add debugging for the first few mismatches to understand the issue
+    if not result and len([k for k in cited_keys if k.startswith('a')]) < 3:  # Limit debug output
+        logger.debug(f"Key '{reference_key}' not found in cited_keys")
+    return result
+
+
+def filter_bibtex_by_citations(bib_content, tex_files, main_tex_content):
+    """
+    Filter BibTeX content to only include references that are actually cited.
+
+    Args:
+        bib_content: Full BibTeX content
+        tex_files: Dict of all tex files {filename: content}
+        main_tex_content: Content of main tex file
+
+    Returns:
+        Filtered BibTeX content with only cited references
+    """
+    if not bib_content:
+        return bib_content
+
+    try:
+        # Extract all citation keys from tex files
+        cited_keys = extract_cited_keys_from_tex(tex_files, main_tex_content)
+
+        if not cited_keys:
+            logger.debug("No citations found, returning full BibTeX content")
+            return bib_content
+
+        # Parse BibTeX entries and filter
+        from refchecker.utils.bibtex_parser import parse_bibtex_entries
+        entries = parse_bibtex_entries(bib_content)
+
+        # Filter entries to only cited ones and remove duplicates
+        cited_entries = []
+        seen_keys = set()
+        not_cited_count = 0
+        duplicate_count = 0
+
+        for entry in entries:
+            entry_key = entry.get('key', '')
+            if is_reference_used(entry_key, cited_keys):
+                if entry_key not in seen_keys:
+                    cited_entries.append(entry)
+                    seen_keys.add(entry_key)
+                else:
+                    duplicate_count += 1
+                    logger.debug(f"Skipping duplicate entry: '{entry_key}'")
+            else:
+                not_cited_count += 1
+                # Log first few entries that are NOT cited for debugging
+                if not_cited_count <= 5:
+                    logger.debug(f"Entry NOT cited: '{entry_key}'")
+
+        logger.debug(f"Filtered BibTeX: {len(entries)} total -> {len(cited_entries)} cited (removed {duplicate_count} duplicates)")
+        logger.debug(f"Citation keys found: {len(cited_keys)} keys")
+        logger.debug(f"Sample cited keys: {list(cited_keys)[:10]}")
+
+        # Reconstruct BibTeX content from cited entries
+        if not cited_entries:
+            logger.debug("No cited entries found in BibTeX, returning original content")
+            return bib_content
+
+        # Convert entries back to BibTeX format
+        filtered_content = reconstruct_bibtex_content(cited_entries, bib_content)
+        return filtered_content
+
+    except Exception as e:
+        logger.debug(f"Error filtering BibTeX by citations: {e}")
+        return bib_content  # Fallback to original content
+
+
+def reconstruct_bibtex_content(cited_entries, original_content):
+    """
+    Reconstruct BibTeX content from filtered entries by extracting original text.
+
+    Args:
+        cited_entries: List of cited entry dictionaries
+        original_content: Original BibTeX content
+
+    Returns:
+        Reconstructed BibTeX content with only cited entries
+    """
+    cited_keys = {entry.get('key', '') for entry in cited_entries}
+
+    # Extract original BibTeX entries by finding their text blocks using robust brace counting
+    filtered_parts = []
+
+    import re
+    # Find all entry starts
+    entry_starts = []
+    for match in re.finditer(r'@\w+\s*\{', original_content, re.IGNORECASE):
+        entry_starts.append(match.start())
+
+    # For each entry start, find the complete entry by counting braces
+    for start_pos in entry_starts:
+        # Extract the key from the entry header
+        key_match = re.search(r'@\w+\s*\{\s*([^,\s}]+)', original_content[start_pos:start_pos+200])
+        if not key_match:
+            continue
+
+        entry_key = key_match.group(1).strip()
+        if entry_key not in cited_keys:
+            continue
+
+        # Find the complete entry by counting braces
+        brace_count = 0
+        pos = start_pos
+        entry_start_found = False
+
+        while pos < len(original_content):
+            char = original_content[pos]
+            if char == '{':
+                if not entry_start_found:
+                    entry_start_found = True
+                brace_count += 1
+            elif char == '}':
+                brace_count -= 1
+                if entry_start_found and brace_count == 0:
+                    entry_end = pos + 1
+                    entry_text = original_content[start_pos:entry_end]
+                    filtered_parts.append(entry_text)
+                    break
+            pos += 1
+
+    if not filtered_parts:
+        logger.debug("Could not reconstruct BibTeX entries, returning original")
+        return original_content
+
+    return '\n\n'.join(filtered_parts) + '\n'
+
+
+def get_bibtex_content(paper):
+    """
+    Try to get BibTeX content for a paper from various sources.
+
+    For ArXiv papers, only use .bbl files (compiled bibliography).
+    The .bbl file contains only the actually-cited references, while .bib files
+    are unreliable - they may contain entire bibliography databases (e.g., full
+    ACL Anthology with 80k+ entries) or unfiltered reference collections.
+
+    Args:
+        paper: Paper object
+
+    Returns:
+        str: BibTeX content if found, None otherwise
+    """
+    import re
+
+    # Try ArXiv source if it's an ArXiv paper
+    arxiv_id = extract_arxiv_id_from_paper(paper)
+    if arxiv_id:
+        logger.debug(f"Detected ArXiv paper {arxiv_id}, checking for .bbl bibliography")
+        tex_content, bib_content, bbl_content = download_arxiv_source(arxiv_id)
+
+        # Only use .bbl files for ArXiv papers (.bib files are unreliable)
+        if bbl_content:
+            bbl_entry_count = len(re.findall(r'\\bibitem[\[\{]', bbl_content))
+            if bbl_entry_count > 0:
+                logger.info(f"Using .bbl files from ArXiv source ({bbl_entry_count} entries)")
+                return bbl_content
+            else:
+                logger.debug("Found .bbl file but it appears empty")
+
+        # No .bbl available - return None to trigger PDF fallback
+        if bib_content:
+            bib_entry_count = len(re.findall(r'@\w+\s*\{', bib_content))
+            logger.debug(f"Skipping .bib file ({bib_entry_count} entries) - unreliable, falling back to PDF extraction")
+
+        logger.debug(f"No usable .bbl file found for ArXiv paper {arxiv_id}")
+
+    return None
+
+
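For orientation, here is a minimal usage sketch of the helpers in this hunk. It is an editorial illustration, not code from the package: SimplePaper is a hypothetical stand-in for the Paper objects these functions expect (anything exposing a pdf_url attribute should do), and the exact URL forms accepted by extract_arxiv_id_from_url are defined in refchecker/utils/url_utils.py, which is not shown here.

import logging
from refchecker.utils.arxiv_utils import (
    extract_arxiv_id_from_paper,
    download_arxiv_source,
    get_bibtex_content,
)

logging.basicConfig(level=logging.DEBUG)


class SimplePaper:
    """Hypothetical stand-in for the Paper objects the helpers expect."""

    def __init__(self, pdf_url):
        self.pdf_url = pdf_url


paper = SimplePaper("https://arxiv.org/pdf/1706.03762")
arxiv_id = extract_arxiv_id_from_paper(paper)  # expected "1706.03762" if the URL parses
if arxiv_id:
    tex, bib, bbl = download_arxiv_source(arxiv_id)  # main .tex, filtered .bib, raw .bbl
    bibliography = get_bibtex_content(paper)  # .bbl text, or None (caller falls back to PDF)

Note the design choice visible in get_bibtex_content: only .bbl output is trusted for ArXiv papers, because .bib files in source tarballs may be whole reference databases rather than the paper's actual bibliography.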
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+"""
+Author comparison utilities for ArXiv Reference Checker
+"""
+
+import re
+import logging
+from .text_utils import normalize_text
+
+logger = logging.getLogger(__name__)
+
+def levenshtein_distance(s1, s2):
+    """
+    Calculate the Levenshtein distance between two strings.
+
+    Args:
+        s1, s2: Strings to compare
+
+    Returns:
+        Integer distance
+    """
+    if len(s1) < len(s2):
+        return levenshtein_distance(s2, s1)
+
+    if len(s2) == 0:
+        return len(s1)
+
+    previous_row = list(range(len(s2) + 1))
+    for i, c1 in enumerate(s1):
+        current_row = [i + 1]
+        for j, c2 in enumerate(s2):
+            insertions = previous_row[j + 1] + 1
+            deletions = current_row[j] + 1
+            substitutions = previous_row[j] + (c1 != c2)
+            current_row.append(min(insertions, deletions, substitutions))
+        previous_row = current_row
+
+    return previous_row[-1]
+
+def compare_authors(cited_authors, correct_authors, threshold=0.8):
+    """
+    Compare two author lists and return similarity metrics.
+
+    Args:
+        cited_authors: List of authors as cited
+        correct_authors: List of correct authors
+        threshold: Similarity threshold (0-1)
+
+    Returns:
+        Dictionary with comparison results
+    """
+    if not cited_authors or not correct_authors:
+        return {
+            'match': False,
+            'similarity': 0.0,
+            'details': 'One or both author lists empty'
+        }
+
+    # Handle "et al." cases
+    cited_has_et_al = any('et al' in author.lower() for author in cited_authors)
+    correct_has_et_al = len(correct_authors) > 3
+
+    if cited_has_et_al or correct_has_et_al:
+        # Compare only the first few authors
+        cited_main = [a for a in cited_authors if 'et al' not in a.lower()][:3]
+        correct_main = correct_authors[:3]
+
+        if len(cited_main) == 0:
+            return {
+                'match': True,  # "et al." without specific authors
+                'similarity': 0.9,
+                'details': 'Et al. reference'
+            }
+    else:
+        cited_main = cited_authors
+        correct_main = correct_authors
+
+    # Calculate similarities for each cited author
+    similarities = []
+    matched_authors = 0
+
+    for cited_author in cited_main:
+        cited_norm = normalize_text(cited_author)
+        best_similarity = 0.0
+        best_match = ''
+
+        for correct_author in correct_main:
+            correct_norm = normalize_text(correct_author)
+
+            # Calculate similarity
+            if cited_norm == correct_norm:
+                similarity = 1.0
+            elif cited_norm in correct_norm or correct_norm in cited_norm:
+                similarity = 0.9
+            else:
+                # Use Levenshtein distance
+                max_len = max(len(cited_norm), len(correct_norm))
+                if max_len == 0:
+                    similarity = 1.0
+                else:
+                    distance = levenshtein_distance(cited_norm, correct_norm)
+                    similarity = 1.0 - (distance / max_len)
+
+            if similarity > best_similarity:
+                best_similarity = similarity
+                best_match = correct_author
+
+        similarities.append({
+            'cited': cited_author,
+            'matched': best_match,
+            'similarity': best_similarity
+        })
+
+        if best_similarity >= threshold:
+            matched_authors += 1
+
+    # Calculate overall match
+    if len(cited_main) == 0:
+        overall_similarity = 0.0
+    else:
+        overall_similarity = matched_authors / len(cited_main)
+
+    # Determine if it's a match
+    is_match = overall_similarity >= threshold
+
+    # Handle author count mismatch
+    count_penalty = 0
+    if len(cited_main) != len(correct_main):
+        count_diff = abs(len(cited_main) - len(correct_main))
+        count_penalty = min(0.1 * count_diff, 0.3)  # Max 30% penalty
+        overall_similarity = max(0, overall_similarity - count_penalty)
+
+    details = f"Matched {matched_authors}/{len(cited_main)} authors"
+    if count_penalty > 0:
+        details += f", count mismatch penalty: {count_penalty:.1f}"
+
+    return {
+        'match': is_match,
+        'similarity': overall_similarity,
+        'details': details,
+        'author_matches': similarities
+    }
+
145
+ """
146
+ Extract a list of authors from text
147
+
148
+ Args:
149
+ authors_text: String containing author names
150
+
151
+ Returns:
152
+ List of cleaned author names
153
+ """
154
+ if not isinstance(authors_text, str):
155
+ return []
156
+
157
+ # Remove common prefixes
158
+ authors_text = re.sub(r'^(by|authors?:)\s*', '', authors_text, flags=re.IGNORECASE)
159
+
160
+ # Split by common separators
161
+ separators = [',', ';', ' and ', ' & ', '\n']
162
+ authors = [authors_text]
163
+
164
+ for sep in separators:
165
+ new_authors = []
166
+ for author in authors:
167
+ new_authors.extend([a.strip() for a in author.split(sep)])
168
+ authors = new_authors
169
+
170
+ # Clean each author name
171
+ from .text_utils import clean_author_name
172
+ cleaned_authors = []
173
+ for author in authors:
174
+ if author.strip(): # Skip empty strings
175
+ cleaned = clean_author_name(author)
176
+ if cleaned: # Only add non-empty cleaned names
177
+ cleaned_authors.append(cleaned)
178
+
179
+ return cleaned_authors
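A similar editorial sketch for the author utilities above; the input strings are made up, and the resulting scores depend on normalize_text and clean_author_name from refchecker/utils/text_utils.py, which this diff lists but does not show:

from refchecker.utils.author_utils import compare_authors, extract_authors_list

# Hypothetical inputs: an author string as cited vs. the canonical author record
cited = extract_authors_list("A. Vaswani, N. Shazeer, N. Parmar")
canonical = ["Ashish Vaswani", "Noam Shazeer", "Niki Parmar", "Jakob Uszkoreit"]

result = compare_authors(cited, canonical, threshold=0.8)
# result holds 'match', 'similarity', 'details', and per-author 'author_matches';
# with more than three canonical authors, only the first three are compared (et al. handling)
print(result['match'], round(result['similarity'], 2), result['details'])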