ref-management 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ref_management/__init__.py +3 -0
- ref_management/add_dois.py +126 -0
- ref_management/apply_citations.py +471 -0
- ref_management/auto_format.py +106 -0
- ref_management/generate_report.py +170 -0
- ref_management/scan_raw_refs.py +361 -0
- ref_management/verify_bib.py +300 -0
- ref_management-1.0.3.dist-info/METADATA +13 -0
- ref_management-1.0.3.dist-info/RECORD +13 -0
- ref_management-1.0.3.dist-info/WHEEL +5 -0
- ref_management-1.0.3.dist-info/entry_points.txt +7 -0
- ref_management-1.0.3.dist-info/licenses/LICENSE +21 -0
- ref_management-1.0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
import subprocess
|
|
4
|
+
import argparse
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
def main():
    """Run the full ARM pipeline on a Word manuscript.

    Resolves the input document and the CSL style (prompting on stdin for
    whichever was not supplied on the command line), then runs three package
    modules as subprocesses using the current interpreter:

    1. ``ref_management.scan_raw_refs``   -> expects ``<base>_extracted.bib``
    2. ``ref_management.verify_bib``      -> expects ``<base>_extracted_verified.bib``
    3. ``ref_management.apply_citations`` with the verified file and ``--csl``

    ``<base>`` is the document stem up to its first space or underscore, and
    the intermediate files are looked up relative to the current working
    directory. The process exits with status 1 as soon as an input is missing,
    a step returns a non-zero code, or an expected intermediate file is absent.
    """
    parser = argparse.ArgumentParser(description="Another Reference Manager (ARM)")
    parser.add_argument("document", type=Path, nargs='?', help="Path to the input .docx file")
    parser.add_argument("--csl", type=Path, help="Path to the desired .csl file")
    args = parser.parse_args()

    # 1. Check Input Document
    doc_path = args.document
    if not doc_path:
        print("\n=== ARM Manuscript Auto-Formatter (Universal CSL Edition) ===")
        doc_input = input("Enter the path to your .docx file: ").strip().strip('"').strip("'")
        doc_path = Path(doc_input)

    # is_file() instead of exists(): an empty answer becomes Path('.'), which
    # exists as a directory and would otherwise slip through this check.
    if not doc_path.is_file():
        print(f"\n❌ ERROR: Document '{doc_path}' not found.")
        sys.exit(1)

    # 2. Check CSL Input
    csl_path = args.csl
    if not csl_path:
        print("\n=== Reference Formatting ===")
        csl_input = input("Enter the CSL style name or path (e.g., nature): ").strip().strip('"').strip("'")
        csl_path = Path(csl_input)

    # --- CSL Path Resolution Logic ---
    default_csl_dir = Path("~/citation_styles").expanduser()

    if not csl_path.is_file():
        # Check if it exists in the default directory
        alt_path = default_csl_dir / csl_path.name
        if alt_path.is_file():
            csl_path = alt_path
        elif csl_path.suffix != '.csl':
            # Check if user forgot the .csl extension
            alt_path_ext = default_csl_dir / f"{csl_path.name}.csl"
            if alt_path_ext.is_file():
                csl_path = alt_path_ext

    if not csl_path.is_file():
        # `csl_input` is only bound when --csl was omitted; the `or`
        # short-circuits before touching it otherwise.
        print(f"\n❌ ERROR: CSL file '{args.csl or csl_input}' not found locally or in {default_csl_dir}.")
        sys.exit(1)

    # Use the current python executable to run subprocesses reliably
    python_exe = sys.executable

    print("\n" + "="*50)
    print("🚀 STARTING AUTOMATED REFERENCE PIPELINE (CSL ENGINE)")
    print("="*50)

    # --- STEP 1: SCAN AND EXTRACT ---
    print(f"\n>>> [1/3] Scanning {doc_path.name} for references...")
    step1 = subprocess.run([python_exe, "-m", "ref_management.scan_raw_refs", str(doc_path)])
    if step1.returncode != 0:
        print("\n❌ ERROR: Pipeline failed during Step 1 (Scanning).")
        sys.exit(1)

    # Determine the extracted output filename based on scan_raw_refs logic
    base_name = re.split(r'[ _]', doc_path.stem)[0] or "scan"
    extracted_bib = Path(f"{base_name}_extracted.bib")

    if not extracted_bib.exists():
        print(f"\n❌ ERROR: Expected intermediate file '{extracted_bib}' was not generated.")
        sys.exit(1)

    # --- STEP 2: VERIFY AND ENRICH ---
    print(f"\n>>> [2/3] Fetching metadata from PubMed/Crossref for {extracted_bib.name}...")
    step2 = subprocess.run([python_exe, "-m", "ref_management.verify_bib", str(extracted_bib)])
    if step2.returncode != 0:
        print("\n❌ ERROR: Pipeline failed during Step 2 (Verification).")
        sys.exit(1)

    verified_bib = Path(f"{base_name}_extracted_verified.bib")
    if not verified_bib.exists():
        print(f"\n❌ ERROR: Expected intermediate file '{verified_bib}' was not generated.")
        sys.exit(1)

    # --- STEP 3: APPLY TO MANUSCRIPT USING CSL ---
    print(f"\n>>> [3/3] Formatting Word document using {csl_path.name}...")
    step3 = subprocess.run([
        python_exe, "-m", "ref_management.apply_citations",
        str(verified_bib),
        str(doc_path),
        "--csl", str(csl_path)
    ])
    if step3.returncode != 0:
        print("\n❌ ERROR: Pipeline failed during Step 3 (Formatting).")
        sys.exit(1)

    # --- WRAP UP ---
    # Path reported to the user; presumably where apply_citations writes its
    # result -- verify against that module.
    final_output = doc_path.with_name(f"{doc_path.stem}_final_{csl_path.stem}.docx")
    print("\n" + "="*50)
    print("🎉 PIPELINE COMPLETE!")
    print("="*50)
    print(f"✅ Final Document: {final_output}")
    print(f"✅ Styles Applied via CSL: {csl_path.name}")
    print(f"✅ Intermediate BibTeX files ({extracted_bib}, {verified_bib}) saved for your records.")


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import re
|
|
3
|
+
import sys
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Dict, Any, Tuple, List
|
|
6
|
+
|
|
7
|
+
import pyparsing
|
|
8
|
+
# Compatibility shim: make sure `pyparsing.DelimitedList` exists before
# bibtexparser is imported, aliasing it to whichever older spelling the
# installed pyparsing provides.
if not hasattr(pyparsing, 'DelimitedList'):
    if hasattr(pyparsing, 'delimited_list'):
        pyparsing.DelimitedList = pyparsing.delimited_list
    elif hasattr(pyparsing, 'delimitedList'):
        pyparsing.DelimitedList = pyparsing.delimitedList
|
|
11
|
+
|
|
12
|
+
import bibtexparser
|
|
13
|
+
from bibtexparser.bparser import BibTexParser
|
|
14
|
+
from bibtexparser import customization
|
|
15
|
+
from rapidfuzz import fuzz
|
|
16
|
+
|
|
17
|
+
def customizations(record: Dict[str, Any]) -> Dict[str, Any]:
    """Normalise one parsed BibTeX record.

    Runs the bibtexparser ``type``, ``author`` and ``convert_to_unicode``
    customisations in that order, then lower-cases every field name except
    ``'ID'``, which keeps its exact spelling.
    """
    for step in (customization.type, customization.author, customization.convert_to_unicode):
        record = step(record)

    normalised = {}
    for field, value in record.items():
        key = field if field == 'ID' else field.lower()
        normalised[key] = value
    return normalised
|
|
22
|
+
|
|
23
|
+
def natural_sort_key(entry: Dict[str, Any]) -> Tuple[int, str]:
    """Return a sort key ordering entries by the first number in their ID.

    Entries whose ID holds no digits get the sentinel 999999 so they sort
    after numbered ones; the raw ID is the tie-breaker.
    """
    entry_id = entry.get('ID', '')
    digits = re.search(r'(\d+)', entry_id)
    if not digits:
        return (999999, entry_id)
    return (int(digits.group(1)), entry_id)
|
|
27
|
+
|
|
28
|
+
def clean_latex(text: Any) -> str:
    """Turn a BibTeX field value into plain single-spaced text.

    Falsy input yields ``""``. Otherwise the value is stringified, braces are
    dropped, a handful of LaTeX escapes (lower-case umlauts, ``\\%``, ``\\&``,
    ``\\_``) are translated, any remaining backslashes are removed and runs of
    whitespace collapse to one space.
    """
    if not text:
        return ""

    plain = str(text).replace('{', '').replace('}', '')

    # Applied in this fixed order, after brace removal.
    escapes = (
        (r'\"a', 'ä'),
        (r'\"o', 'ö'),
        (r'\"u', 'ü'),
        (r'\%', '%'),
        (r'\&', '&'),
        (r'\_', '_'),
    )
    for latex, replacement in escapes:
        plain = plain.replace(latex, replacement)

    plain = plain.replace('\\', '')
    return re.sub(r'\s+', ' ', plain).strip()
|
|
34
|
+
|
|
35
|
+
def _split_author_name(clean_name: str) -> Tuple[str, str]:
    """Split one cleaned author name into ``(surname, initials)``.

    ``"Last, First Middle"`` is split at the first comma; otherwise the last
    whitespace-separated token is the surname. Initials are the upper-cased
    first letters of the given-name tokens, concatenated.
    """
    if ',' in clean_name:
        family, _, given = clean_name.partition(',')
        return family.strip(), "".join(tok[0].upper() for tok in given.split())

    tokens = clean_name.split()
    if not tokens:
        return "Unknown", ""
    return tokens[-1], "".join(tok[0].upper() for tok in tokens[:-1])


def format_authors(entry: Dict[str, Any], style: str) -> str:
    """Render the author list of *entry* for the requested report style.

    Reads ``entry['author']`` (falling back to ``entry['authors']``), which
    may be an ``' and '``-separated string or a list of names. For
    ``style == 'numbered'`` each author becomes ``"Surname FM"``; for any
    other style it becomes ``"Surname, F.M."``. Returns ``"Unknown"`` when no
    author field is present.
    """
    authors = entry.get('author') or entry.get('authors')
    if not authors:
        return "Unknown"

    if isinstance(authors, str):
        names = authors.split(' and ')
    elif isinstance(authors, list):
        names = authors
    else:
        names = [str(authors)]

    rendered = []
    for raw_name in names:
        surname, initials = _split_author_name(clean_latex(raw_name).strip())
        if style == 'numbered':
            rendered.append(f"{surname} {initials}".strip())
        elif initials:
            rendered.append(f"{surname}, {'.'.join(initials)}.")
        else:
            rendered.append(surname)

    # One or two authors are joined with a plain " and ".
    if len(rendered) <= 2:
        return " and ".join(rendered)

    # Three or more: comma-separated, with an Oxford comma outside numbered style.
    joiner = " and " if style == 'numbered' else ", and "
    return f"{', '.join(rendered[:-1])}{joiner}{rendered[-1]}"
|
|
63
|
+
|
|
64
|
+
def format_entry(entry: Dict[str, Any], style: str) -> str:
    """Format one bibliography entry as a single reference line.

    Args:
        entry: Parsed BibTeX record; reads ``year``, ``title``, ``journal``,
            ``volume``, ``pages`` and ``doi`` (plus the author field via
            ``format_authors``).
        style: ``'numbered'`` gives ``Authors (Year) Title ...``; any other
            value gives ``Authors (Year). Title ...``.

    Returns:
        The formatted reference, stripped of surrounding whitespace.
    """
    authors = format_authors(entry, style)
    year = clean_latex(entry.get('year', '????'))
    title = clean_latex(entry.get('title', 'No Title'))
    if title and title[-1] not in '.?!':
        title += "."

    journal_str = clean_latex(entry.get('journal', ''))
    volume = clean_latex(entry.get('volume', ''))
    # Page ranges use an en dash. Written as an escape so the character
    # survives regardless of how this source file is encoded.
    en_dash = '\u2013'
    pages = clean_latex(entry.get('pages', '')).replace('--', en_dash).replace('-', en_dash)

    if volume:
        journal_str += f" {volume}"
    if pages:
        journal_str += f", {pages}" if volume else f" {pages}"
    if journal_str and journal_str[-1] not in '.?':
        journal_str += "."

    # Only the canonical https://doi.org/ prefix is stripped from the DOI.
    doi_str = f" doi: {clean_latex(entry['doi']).replace('https://doi.org/', '')}" if entry.get('doi') else ""

    if style == 'numbered':
        return f"{authors} ({year}) {title} {journal_str}{doi_str}".strip()
    return f"{authors} ({year}). {title} {journal_str}{doi_str}".strip()
|
|
84
|
+
|
|
85
|
+
def process_bib(input_file: Path, output_file: Path, style: str):
    """Write a plain-text reference list for the BibTeX file *input_file*.

    Entries are ordered by the number in their ID (``style == 'numbered'``) or
    by cleaned author text (any other style). Entries with no title, or with
    a placeholder title and no author, are skipped. An entry whose PMID or DOI
    was already seen, or whose title is a >95 fuzzy match to an earlier one,
    is written as a ``[DUPLICATE of ...]`` line instead of a full reference.

    Exits the process with status 1 when the input cannot be parsed or the
    output cannot be written.
    """
    print(f"Reading {input_file} (Style: {style.upper()})...")
    try:
        with open(input_file, encoding='utf-8') as src:
            bib_parser = BibTexParser()
            bib_parser.customization = customizations
            bib_parser.common_strings = False
            bib = bib_parser.parse_file(src)
    except Exception as e:
        print(f"Error parsing BibTeX: {e}")
        sys.exit(1)

    # Sort based on style choice
    def _author_key(item):
        return clean_latex(item.get('author', 'z')).lower()

    sort_key = natural_sort_key if style == 'numbered' else _author_key
    ordered = sorted(bib.entries, key=sort_key)

    # Each map holds identifier -> (reference number, BibTeX ID) of its first occurrence.
    seen_pmids = {}
    seen_dois = {}
    seen_titles = {}
    placeholder_titles = ['REVIEWS', 'UNKNOWN', 'REFERENCES']

    try:
        with open(output_file, 'w', encoding='utf-8') as out:
            out.write(f"--- References List ({style.upper()} FORMAT | Source: {input_file.name}) ---\n\n")

            for position, entry in enumerate(ordered):
                # ID tracking
                id_digits = re.search(r'(\d+)', entry.get('ID', ''))
                ref_num = id_digits.group(1) if id_digits else str(position + 1)
                prefix = f"{position + 1}. " if style == 'numbered' else "- "

                title = clean_latex(entry.get('title', '')).strip()
                if not title:
                    continue
                if title.upper() in placeholder_titles and 'author' not in entry:
                    continue

                # Titles of 20 characters or fewer take no part in fuzzy matching.
                norm_title = re.sub(r'[^\w]', '', title.lower()) if len(title) > 20 else None

                duplicate_of = seen_pmids.get(entry.get('pmid')) or seen_dois.get(entry.get('doi'))
                if not duplicate_of and norm_title is not None:
                    duplicate_of = next(
                        (info for known, info in seen_titles.items() if fuzz.ratio(norm_title, known) > 95),
                        duplicate_of,
                    )

                if duplicate_of:
                    out.write(f"{prefix}[DUPLICATE of {duplicate_of[0]}: {duplicate_of[1]}] {title[:50]}...\n\n")
                    continue

                origin = (ref_num, entry['ID'])
                if entry.get('pmid'):
                    seen_pmids[entry['pmid']] = origin
                if entry.get('doi'):
                    seen_dois[entry['doi']] = origin
                if norm_title is not None:
                    seen_titles[norm_title] = origin

                try:
                    out.write(f"{prefix}{format_entry(entry, style)}\n\n")
                except Exception as e:
                    # A formatting failure becomes an inline marker, not an abort.
                    out.write(f"{prefix}[ERROR] {e}\n\n")

        print(f"Success! Saved to {output_file}")
    except IOError as e:
        print(f"I/O Error: {e}")
        sys.exit(1)
|
|
140
|
+
|
|
141
|
+
def main():
    """Command-line entry point for the reference-list report.

    Takes an input ``.bib`` path, an optional output path and an optional
    ``--style``. When the input stem does not end in ``_verified`` and a
    sibling ``<stem>_verified<suffix>`` file exists, that file is used
    instead. Without ``--style`` the user is prompted until they answer 1 or 2.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=Path)
    parser.add_argument("output", type=Path, nargs='?', default=None)
    parser.add_argument("--style", choices=['1', '2', 'numbered', 'author-year'], help="1: Numbered, 2: Author-Year")
    args = parser.parse_args()

    source = args.input
    if not source.stem.endswith("_verified"):
        verified = source.with_name(f"{source.stem}_verified{source.suffix}")
        if verified.exists():
            source = verified

    selected = args.style
    if not selected:
        print("\n=== Select Report Output Style ===")
        print(" 1. Sequential Numbered (1. Lopez-Otin C...)")
        print(" 2. Author-Year Alphabetical (- Smith, I. (2023)...)")
        selected = input("Enter 1 or 2: ").strip()
        while selected not in ['1', '2']:
            selected = input("Enter 1 or 2: ").strip()

    if selected in ['2', 'author-year']:
        active_style = 'author-year'
    else:
        active_style = 'numbered'

    destination = args.output
    if not destination:
        destination = source.with_name(f"{source.stem}_list_{active_style}.txt")

    process_bib(source, destination, active_style)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,361 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import csv
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
import argparse
|
|
6
|
+
import time
|
|
7
|
+
import requests
|
|
8
|
+
from docx import Document
|
|
9
|
+
from Bio import Entrez
|
|
10
|
+
from rapidfuzz import fuzz
|
|
11
|
+
|
|
12
|
+
# --- CONFIGURATION ---
|
|
13
|
+
# NCBI credentials come from the environment; either may be unset (None).
Entrez.email = os.environ.get("NCBI_EMAIL")
Entrez.api_key = os.environ.get("NCBI_API_KEY")

# Hint on stderr when no API key is configured.
if Entrez.api_key is None:
    print(
        "Tip: Set NCBI_API_KEY environment variable to avoid request limits.",
        file=sys.stderr,
    )
|
|
18
|
+
|
|
19
|
+
# --- HELPERS ---
|
|
20
|
+
def clean_word(word):
    """Return *word* with every non-word character (anything outside ``\\w``) removed."""
    non_word = re.compile(r'[^\w]')
    return non_word.sub('', word)
|
|
22
|
+
|
|
23
|
+
def resolve_doi_to_pmid(doi):
    """Look up the PubMed ID for *doi* via an Entrez ``esearch``.

    Args:
        doi: DOI string; surrounding whitespace and trailing dots are ignored.

    Returns:
        The first PMID in the result's ``IdList``, or ``None`` when there is
        no hit or the lookup fails for any reason (best-effort by design).
    """
    try:
        # Strip whitespace first so a trailing "." followed by a space is
        # still removed.
        clean = doi.strip().rstrip('.')
        handle = Entrez.esearch(db="pubmed", term=f"{clean}[DOI]", retmax=1)
        try:
            result = Entrez.read(handle)
        finally:
            # Close the handle even if parsing the response raises.
            handle.close()
        id_list = result['IdList']
        return id_list[0] if id_list else None
    except Exception:
        # Any failure (bad input, network, parsing) is reported as "no PMID".
        return None
|
|
32
|
+
|
|
33
|
+
def resolve_doi_crossref(doi):
    """Fallback for DOIs not indexed in PubMed (e.g. Stats journals, old issues).

    Queries ``https://api.crossref.org/works/<doi>`` with a 5 second timeout.

    Args:
        doi: DOI string; surrounding whitespace and trailing dots are ignored.

    Returns:
        A dict with keys ``id`` (``"CR_<doi>"``), ``first_author``, ``year``,
        ``title``, ``doi`` and ``is_retracted`` (always ``False`` here), or
        ``None`` when the response is not HTTP 200 or anything fails.
    """
    print(f" ...PubMed missed {doi}. Checking Crossref Master Database...", end='\r')
    try:
        # Strip whitespace first so a trailing "." followed by a space is
        # still removed.
        clean = doi.strip().rstrip('.')
        url = f"https://api.crossref.org/works/{clean}"
        resp = requests.get(url, timeout=5)
        if resp.status_code == 200:
            data = resp.json().get('message', {})

            # `title` may be missing or an empty list; keep the record with
            # an empty title rather than failing the whole lookup.
            titles = data.get('title') or ['']
            title = titles[0]

            # Fall back to `name` when there is no `family` key -- presumably
            # how organisational authors are reported; confirm against the
            # Crossref schema.
            authors = data.get('author') or [{}]
            first_author = authors[0].get('family') or authors[0].get('name') or 'Unknown'

            year = "Unknown"
            pub = data.get('published-print') or data.get('published-online')
            if pub and 'date-parts' in pub:
                year_value = pub['date-parts'][0][0]
                # Guard against a null year so it is not rendered as "None".
                if year_value is not None:
                    year = str(year_value)

            return {
                "id": f"CR_{clean}",
                "first_author": first_author,
                "year": year,
                "title": title,
                "doi": clean,
                "is_retracted": False
            }
    except Exception:
        # Best-effort lookup: any failure means "not resolved".
        pass
    return None
|
|
62
|
+
|
|
63
|
+
def extract_citation_parts(text):
    """Heuristically pull year, author and a title snippet from a raw reference.

    Returns a dict with string values under ``year``, ``author`` and
    ``title_snippet``; each is ``""`` when it could not be determined.
    """
    data = {"year": "", "author": "", "title_snippet": ""}

    def _scrub_links(fragment):
        # URL/DOI Scrubber (Prevents them from leaking into the Title)
        fragment = re.sub(r'(?i)https?://\S+', '', fragment)
        return re.sub(r'(?i)doi:?\s*10\.\d{4,9}/[-._;()/:a-zA-Z0-9<>\[\]]+', '', fragment)

    year_hits = list(re.finditer(r'\b(19|20)\d{2}[a-z]?\b', text))

    if not year_hits:
        data["title_snippet"] = _scrub_links(text).strip()
    else:
        # Prefer the first year directly preceded by "(", else the first year found.
        chosen = next(
            (hit for hit in year_hits if hit.start() > 0 and text[hit.start() - 1] == '('),
            year_hits[0],
        )
        data["year"] = re.sub(r'[a-z]', '', chosen.group(0))

        before_year = text[:chosen.start()]

        # Determine if Author-Year or Vancouver style based on Year placement
        if chosen.start() > len(text) / 2:
            snippet = before_year.strip()
        else:
            snippet = text[chosen.end():].strip()

        data["title_snippet"] = re.sub(r'^[\s\)\.,:;-]+|[\s\.,:;-]+$', '', _scrub_links(snippet))

        # Improved Raw Author Parsing: text up to the first "." before the
        # year, minus any leading reference number.
        author_raw = re.sub(r'^\[?\d+\]?\s*', '', before_year.split('.')[0].strip())
        if len(author_raw) > 3:
            data["author"] = author_raw

    if not data["author"]:
        # Fallback: first capitalised word that is not a number or stop word.
        skip_words = {'et', 'al', 'in', 'the', 'pmid', 'doi', 'vol', 'no', 'and', '&', 'eds', 'editor', 'page', 'pp', 'references'}
        for token in text.split():
            clean = clean_word(token)
            if len(clean) < 2 or clean.isdigit() or clean.lower() in skip_words:
                continue
            if clean[0].isupper():
                data["author"] = clean
                break

    return data
|
|
110
|
+
|
|
111
|
+
def search_pubmed_by_metadata(parts):
    """Search PubMed for candidate PMIDs using extracted citation parts.

    Tries a title search first (the first 8 words of the punctuation-free
    snippet, only when the snippet is longer than 10 characters). If that
    yields nothing and both author and year are known, falls back to a
    first-author + publication-date search. Lookup errors are swallowed.

    Returns:
        A de-duplicated list of PMIDs (order not guaranteed).
    """
    found = []

    if len(parts["title_snippet"]) > 10:
        title_words = re.sub(r'[^\w\s]', '', parts["title_snippet"]).split()
        short_title = " ".join(title_words[:8])
        try:
            handle = Entrez.esearch(db="pubmed", term=f"{short_title}[Title]", retmax=3)
            result = Entrez.read(handle)
            handle.close()
            if result['IdList']:
                found.extend(result['IdList'])
        except Exception:
            pass

    needs_fallback = not found
    if needs_fallback and parts["author"] and parts["year"]:
        try:
            query = f"{parts['author'].split()[0]}[1au] AND {parts['year']}[pdat]"
            handle = Entrez.esearch(db="pubmed", term=query, retmax=5)
            result = Entrez.read(handle)
            handle.close()
            if result['IdList']:
                found.extend(result['IdList'])
        except Exception:
            pass

    return list(set(found))
|
|
132
|
+
|
|
133
|
+
def parse_record(record):
    """Convert one PubMed summary record into the module's metadata dict.

    Reads ``Id``, ``Title`` (markup tags removed), ``AuthorList`` (first
    element only), ``PubDate`` (first 4-digit run taken as the year),
    ``ArticleIds['doi']`` and ``PubTypeList``. Missing pieces fall back to
    ``"Unknown"`` or ``""``.
    """
    author_list = record.get('AuthorList', [])
    year_found = re.search(r'\d{4}', record.get('PubDate', ''))
    article_ids = record.get('ArticleIds', {})

    parsed = {}
    parsed["id"] = str(record.get('Id'))
    parsed["first_author"] = author_list[0] if author_list else "Unknown"
    parsed["year"] = year_found.group(0) if year_found else "Unknown"
    parsed["title"] = re.sub(r'<[^<]+?>', '', record.get('Title', ''))
    parsed["doi"] = article_ids.get('doi', "")
    parsed["is_retracted"] = "Retracted Publication" in record.get('PubTypeList', [])
    return parsed
|
|
154
|
+
|
|
155
|
+
def batch_fetch_pubmed(pmid_list):
    """Fetch PubMed summary metadata for the given PMIDs in chunks of 200.

    Args:
        pmid_list: Iterable of PMID strings; duplicates are removed.

    Returns:
        Dict mapping PMID -> metadata dict as produced by ``parse_record``.
        PMIDs in a chunk that failed to download are simply absent; the
        failure is reported on stderr instead of being silently dropped.
    """
    if not pmid_list:
        return {}

    fetched_data = {}
    unique_pmids = list(set(pmid_list))

    print(f"\nFetching metadata for {len(unique_pmids)} unique PubMed papers...")
    for i in range(0, len(unique_pmids), 200):
        chunk = unique_pmids[i : i + 200]
        try:
            handle = Entrez.esummary(db="pubmed", id=",".join(chunk), retmode="xml")
            try:
                records = Entrez.read(handle)
            finally:
                # Release the connection even if parsing the response raises.
                handle.close()
            for record in records:
                data = parse_record(record)
                fetched_data[data['id']] = data
            # Pause between successful requests.
            time.sleep(0.5)
        except Exception as e:
            # Stay best-effort, but tell the user why these references will
            # end up without metadata.
            print(f"Warning: PubMed metadata fetch failed for {len(chunk)} ID(s): {e}", file=sys.stderr)

    return fetched_data
|
|
175
|
+
|
|
176
|
+
# --- CORE LOGIC ---
|
|
177
|
+
def analyze_document(file_path):
    """Scan a .docx or .txt file for references and resolve each to metadata.

    Steps, in order:
      1. Load paragraphs (.docx via python-docx; .txt via line grouping).
      2. Skip everything up to and including a References/Bibliography header,
         if one is found; otherwise treat the whole file as a reference list.
      3. For each candidate paragraph of at least 15 characters, pick a lookup
         route: explicit PMID, else DOI (PubMed first, then Crossref), else a
         PubMed metadata search when the text contains a year.
      4. Fetch PubMed summaries for all resolved PMIDs in one batch.
      5. Classify every item, print a status line, and build a CSV row and a
         BibTeX entry for it.

    Returns:
        ``(csv_results, txt_lines, bib_entries)``. ``txt_lines`` is never
        populated here and is always an empty list. Three empty lists are
        returned when the format is unsupported or the file cannot be read.
    """
    print(f"\nReading document: {file_path}...")
    all_paragraphs = []
    ext = os.path.splitext(file_path)[1].lower()

    try:
        if ext == '.docx':
            doc = Document(file_path)
            all_paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
        elif ext == '.txt':
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                lines = f.readlines()
            current_block = []
            for line in lines:
                if line.strip():
                    if re.match(r'^\[?\d+[\.\)\]]', line.strip()):
                        # A numbered line flushes any pending block and is
                        # then stored as its own paragraph.
                        if current_block: all_paragraphs.append(" ".join(current_block))
                        # NOTE(review): this assignment is overwritten two
                        # lines below, so continuation lines after a numbered
                        # line become a separate paragraph -- confirm intent.
                        current_block = [line.strip()]
                        all_paragraphs.append(line.strip())
                        current_block = []
                    else:
                        current_block.append(line.strip())
                else:
                    # Blank line ends the current block.
                    if current_block:
                        all_paragraphs.append(" ".join(current_block))
                        current_block = []
            if current_block: all_paragraphs.append(" ".join(current_block))
        else:
            print(f"ERROR: Unsupported format {ext}"); return [], [], []
    except Exception as e:
        print(f"ERROR: Could not open file. {e}"); return [], [], []

    # Header line such as "References" or "7. BIBLIOGRAPHY", alone on its paragraph.
    header_regex = re.compile(r'^\s*(?:[0-9]+\.?\s*)?(?:REFERENCES|BIBLIOGRAPHY|LITERATURE CITED|WORKS CITED)\s*$', re.IGNORECASE)
    start_index, header_found = 0, False
    for i, p in enumerate(all_paragraphs):
        if header_regex.match(p):
            # Only the first matching header is used.
            start_index, header_found = i + 1, True
            print(f" -> Found Reference Section header. Processing subsequent text.")
            break
    if not header_found: print(" -> No 'References' header found. Assuming file is just a list.")

    paragraphs_to_check = all_paragraphs[start_index:]
    pmid_pattern = re.compile(r'PMID:?\s*(\d+)', re.IGNORECASE)

    # NEW DOI PATTERN: Supports <, >, and brackets used in Wiley/SICI DOIs
    doi_pattern = re.compile(r'\b(10\.\d{4,9}/[-._;()/:a-zA-Z0-9<>\[\]]+)')

    items_to_process = []
    ids_to_fetch_metadata = set()

    print(f"Scanning {len(paragraphs_to_check)} reference candidates...")

    for idx, text in enumerate(paragraphs_to_check):
        # Paragraphs shorter than 15 characters are not treated as references.
        if len(text) < 15: continue

        # Use the leading "12.", "[12]" or "12)" number if present, else the
        # 1-based position within the scanned paragraphs.
        num_match = re.match(r'^\s*\[?(\d+)[\.\)\]]', text)
        ref_num = num_match.group(1) if num_match else str(idx + 1)

        found_pmids = [m.group(1) for m in pmid_pattern.finditer(text)]
        found_dois = [m.group(1).rstrip('.') for m in doi_pattern.finditer(text)]

        # Only the first PMID / DOI found in the paragraph is used.
        entry = {
            "ref_num": ref_num,
            "original_text": text,
            "found_pmid": found_pmids[0] if found_pmids else None,
            "found_doi": found_dois[0] if found_dois else None,
            "target_pmid": None,
            "cr_meta": None,
            "status": "PENDING",
            "action_log": []
        }

        if entry["found_pmid"]:
            # Route 1: an explicit PMID wins over everything else.
            entry["target_pmid"] = entry["found_pmid"]
            ids_to_fetch_metadata.add(entry["target_pmid"])
        elif entry["found_doi"]:
            # Route 2: DOI -> PubMed, falling back to Crossref.
            resolved_pmid = resolve_doi_to_pmid(entry["found_doi"])
            if resolved_pmid:
                entry["target_pmid"] = resolved_pmid
                entry["action_log"].append("Fetched PMID via DOI")
                ids_to_fetch_metadata.add(resolved_pmid)
            else:
                cr_meta = resolve_doi_crossref(entry["found_doi"])
                if cr_meta:
                    entry["cr_meta"] = cr_meta
                    entry["action_log"].append("Resolved via Crossref")
                else:
                    entry["status"] = "FAIL_DOI_LOOKUP"
        else:
            # Route 3: no identifiers; search PubMed only if the text has a year.
            if re.search(r'\b(19|20)\d{2}[a-z]?\b', text):
                parts = extract_citation_parts(text)
                candidates = search_pubmed_by_metadata(parts)
                best_match, best_score = None, 0
                if candidates:
                    cand_meta = batch_fetch_pubmed(candidates)
                    for c_id, c_data in cand_meta.items():
                        # Keep the highest-scoring candidate above 80.
                        score = fuzz.token_set_ratio(c_data['title'], parts['title_snippet'])
                        if score > 80 and score > best_score:
                            best_score, best_match = score, c_id
                if best_match:
                    entry["target_pmid"] = best_match
                    entry["action_log"].append("Found via Search")
                    ids_to_fetch_metadata.add(best_match)
        items_to_process.append(entry)

    # One batched fetch for every PMID resolved above.
    pubmed_db = batch_fetch_pubmed(list(ids_to_fetch_metadata))
    csv_results, txt_lines, bib_entries = [], [], []

    print("\n" + "="*85)
    for item in items_to_process:
        text, pmid, cr_meta = item['original_text'], item['target_pmid'], item['cr_meta']
        # PubMed metadata when a PMID was resolved, otherwise the Crossref record (may be None).
        meta = pubmed_db.get(pmid) if pmid else cr_meta
        # `notes` aliases the item's action_log list, so appends below mutate it in place.
        status, notes, display_id = "OK", item['action_log'], pmid if pmid else (item.get('found_doi') if cr_meta else "(No ID)")

        if not meta:
            if item['status'] == "FAIL_DOI_LOOKUP":
                # Tuple assignment is used only to run notes.append() on the same line.
                status, _ = "MANUAL_CHECK", notes.append("DOI not in PubMed/Crossref")
            else:
                # With a year but no metadata the item is reported as NOT_FOUND;
                # without a year it is dropped as IGNORED.
                status, display_id = ("NOT_FOUND", "UNKNOWN") if re.search(r'\b(19|20)\d{2}\b', text) else ("IGNORED", display_id)
        else:
            title_clean = re.sub(r'[^\w\s]', '', meta['title'].lower())
            text_clean = re.sub(r'[^\w\s]', '', text.lower())
            title_score = fuzz.partial_ratio(title_clean, text_clean)

            if meta['is_retracted']:
                status = "!! RETRACTED !!"
            elif "Found via Search" not in str(notes) and "Crossref" not in str(notes):
                # Title cross-check applies only to PMID / DOI->PubMed matches.
                if title_score < 65:
                    status, _ = "MISMATCH", notes.append(f"Title Score {title_score}")

        if status == "IGNORED": continue

        csv_results.append({"Original_Text": text[:60] + "...", "Final_ID": display_id, "Status": status, "Notes": "; ".join(notes)})

        # BibTeX entry keyed by the reference number.
        bib_lines = [f"@article{{{item['ref_num']},"]
        if pmid: bib_lines.append(f" pmid = {{{pmid}}},")
        if meta and meta.get('doi'): bib_lines.append(f" doi = {{{meta['doi']}}},")
        elif item.get('found_doi'): bib_lines.append(f" doi = {{{item['found_doi']}}},")

        if meta:
            bib_lines.extend([f" title = {{{meta['title']}}},", f" author = {{{meta['first_author']} et al.}},", f" year = {{{meta['year']}}}"])
        else:
            # No metadata: fall back to fields parsed from the raw text.
            parts = extract_citation_parts(text)
            bib_lines.append(f" title = {{{parts['title_snippet']}}},")
            if parts['author']: bib_lines.append(f" author = {{{parts['author']}}},")
            if parts['year']: bib_lines.append(f" year = {{{parts['year']}}}")
        bib_lines.append("}")
        bib_entries.append("\n".join(bib_lines))

        print(f"{display_id[:15]:<18} | {status:<18} | {', '.join(notes) if notes else 'Verified'}")

    print("="*85)
    return csv_results, txt_lines, bib_entries
|
|
330
|
+
|
|
331
|
+
def save_outputs(csv_data, txt_lines, bib_entries, base_name):
    """Write the verification CSV and the extracted BibTeX file.

    Does nothing when *csv_data* is empty. Otherwise creates
    ``<base_name>_verification_report.csv`` (columns taken from the first
    row's keys) and ``<base_name>_extracted.bib`` (entries separated by a
    blank line), then prints both file names. *txt_lines* is accepted but
    not used.
    """
    if not csv_data:
        return

    csv_name = f"{base_name}_verification_report.csv"
    bib_name = f"{base_name}_extracted.bib"

    with open(csv_name, 'w', newline='', encoding='utf-8') as report:
        writer = csv.DictWriter(report, fieldnames=csv_data[0].keys())
        writer.writeheader()
        for row in csv_data:
            writer.writerow(row)

    with open(bib_name, 'w', encoding='utf-8') as bib_file:
        bib_file.write("\n\n".join(bib_entries))

    print("\nSuccess! Saved outputs:")
    print(f" [Report] -> {csv_name}")
    print(f" [BibTeX] -> {bib_name} <-- USE THIS FOR arm-verify")
|
|
346
|
+
|
|
347
|
+
def main():
    """Command-line entry point: scan a document and write the report files.

    Takes the file path from the command line or, if omitted, prompts for it.
    Exits with status 1 (after an error message on stderr) when no path is
    given or the path is not an existing file. Output files are named after
    the part of the file's base name before its first space or underscore
    (``"scan"`` if that part is empty).
    """
    parser = argparse.ArgumentParser(description="Scan docx, output reports and a mapped .bib file.")
    parser.add_argument("file", help="Path to .docx", nargs='?', default=None)
    args = parser.parse_args()
    fname = args.file

    if not fname:
        fname = input("Enter filename (.docx): ").strip().strip('"').strip("'")

    # isfile() rather than exists(): a directory is not a scannable document,
    # and the user should be told why the run stopped.
    if not fname or not os.path.isfile(fname):
        print(f"ERROR: File '{fname}' not found.", file=sys.stderr)
        sys.exit(1)

    base = re.split(r'[ _]', os.path.splitext(os.path.basename(fname))[0])[0] or "scan"
    c, t, b = analyze_document(fname)
    save_outputs(c, t, b, base)


if __name__ == "__main__":
    main()
|