ref-management 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,106 @@
1
+ import os
2
+ import sys
3
+ import subprocess
4
+ import argparse
5
+ import re
6
+ from pathlib import Path
7
+
8
def main():
    """Run the ARM reference pipeline end to end on a manuscript.

    The document path and the CSL style are taken from the command line, or
    prompted for interactively when omitted. Three ``ref_management`` modules
    are then run as subprocesses with the current interpreter:

    1. ``scan_raw_refs``   -> expected to leave ``<base>_extracted.bib``
    2. ``verify_bib``      -> expected to leave ``<base>_extracted_verified.bib``
    3. ``apply_citations`` -> formats the document with the chosen CSL file

    ``<base>`` is the document stem up to the first space or underscore and
    the intermediate files are looked up in the current working directory.

    Exits with status 1 when an input file is missing, when a step returns a
    non-zero code, or when an expected intermediate file is absent.
    """
    parser = argparse.ArgumentParser(description="Another Reference Manager (ARM)")
    parser.add_argument("document", type=Path, nargs='?', help="Path to the input .docx file")
    parser.add_argument("--csl", type=Path, help="Path to the desired .csl file")
    args = parser.parse_args()

    # 1. Check Input Document
    doc_path = args.document
    if not doc_path:
        print("\n=== ARM Manuscript Auto-Formatter (Universal CSL Edition) ===")
        doc_input = input("Enter the path to your .docx file: ").strip().strip('"').strip("'")
        doc_path = Path(doc_input)

    # is_file() rather than exists(): an empty answer becomes Path("") == ".",
    # which exists as a directory and would otherwise slip through.
    if not doc_path.is_file():
        print(f"\n❌ ERROR: Document '{doc_path}' not found.")
        sys.exit(1)

    # 2. Check CSL Input
    csl_path = args.csl
    csl_label = csl_path
    if not csl_path:
        print("\n=== Reference Formatting ===")
        csl_input = input("Enter the CSL style name or path (e.g., nature): ").strip().strip('"').strip("'")
        csl_path = Path(csl_input)
        csl_label = csl_input

    # --- CSL Path Resolution Logic ---
    default_csl_dir = Path("~/citation_styles").expanduser()

    if not csl_path.is_file():
        # Try the default directory, first as given, then with ".csl" appended
        # in case the user typed only the style name.
        fallbacks = [default_csl_dir / csl_path.name]
        if csl_path.suffix != '.csl':
            fallbacks.append(default_csl_dir / f"{csl_path.name}.csl")
        for fallback in fallbacks:
            if fallback.is_file():
                csl_path = fallback
                break

    if not csl_path.is_file():
        print(f"\n❌ ERROR: CSL file '{csl_label}' not found locally or in {default_csl_dir}.")
        sys.exit(1)

    # Use the current python executable to run subprocesses reliably
    python_exe = sys.executable

    print("\n" + "=" * 50)
    print("🚀 STARTING AUTOMATED REFERENCE PIPELINE (CSL ENGINE)")
    print("=" * 50)

    # --- STEP 1: SCAN AND EXTRACT ---
    print(f"\n>>> [1/3] Scanning {doc_path.name} for references...")
    step1 = subprocess.run([python_exe, "-m", "ref_management.scan_raw_refs", str(doc_path)])
    if step1.returncode != 0:
        print("\n❌ ERROR: Pipeline failed during Step 1 (Scanning).")
        sys.exit(1)

    # Mirror the naming rule of scan_raw_refs: stem up to the first space/underscore.
    base_name = re.split(r'[ _]', doc_path.stem)[0] or "scan"
    extracted_bib = Path(f"{base_name}_extracted.bib")

    if not extracted_bib.exists():
        print(f"\n❌ ERROR: Expected intermediate file '{extracted_bib}' was not generated.")
        sys.exit(1)

    # --- STEP 2: VERIFY AND ENRICH ---
    print(f"\n>>> [2/3] Fetching metadata from PubMed/Crossref for {extracted_bib.name}...")
    step2 = subprocess.run([python_exe, "-m", "ref_management.verify_bib", str(extracted_bib)])
    if step2.returncode != 0:
        print("\n❌ ERROR: Pipeline failed during Step 2 (Verification).")
        sys.exit(1)

    verified_bib = Path(f"{base_name}_extracted_verified.bib")
    if not verified_bib.exists():
        print(f"\n❌ ERROR: Expected intermediate file '{verified_bib}' was not generated.")
        sys.exit(1)

    # --- STEP 3: APPLY TO MANUSCRIPT USING CSL ---
    print(f"\n>>> [3/3] Formatting Word document using {csl_path.name}...")
    step3 = subprocess.run([
        python_exe, "-m", "ref_management.apply_citations",
        str(verified_bib),
        str(doc_path),
        "--csl", str(csl_path),
    ])
    if step3.returncode != 0:
        print("\n❌ ERROR: Pipeline failed during Step 3 (Formatting).")
        sys.exit(1)

    # --- WRAP UP ---
    # NOTE(review): presumably the name apply_citations writes to -- it is only
    # reported here, not checked; confirm against that module.
    final_output = doc_path.with_name(f"{doc_path.stem}_final_{csl_path.stem}.docx")
    print("\n" + "=" * 50)
    print("🎉 PIPELINE COMPLETE!")
    print("=" * 50)
    print(f"✅ Final Document: {final_output}")
    print(f"✅ Styles Applied via CSL: {csl_path.name}")
    print(f"✅ Intermediate BibTeX files ({extracted_bib}, {verified_bib}) saved for your records.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,170 @@
1
+ import argparse
2
+ import re
3
+ import sys
4
+ from pathlib import Path
5
+ from typing import Dict, Any, Tuple, List
6
+
7
+ import pyparsing
8
# Compatibility shim: when the installed pyparsing has no ``DelimitedList``
# attribute, alias it to whichever older spelling is available.
if not hasattr(pyparsing, 'DelimitedList'):
    if hasattr(pyparsing, 'delimited_list'):
        pyparsing.DelimitedList = pyparsing.delimited_list
    elif hasattr(pyparsing, 'delimitedList'):
        pyparsing.DelimitedList = pyparsing.delimitedList
11
+
12
+ import bibtexparser
13
+ from bibtexparser.bparser import BibTexParser
14
+ from bibtexparser import customization
15
+ from rapidfuzz import fuzz
16
+
17
def customizations(record: Dict[str, Any]) -> Dict[str, Any]:
    """Normalise one parsed BibTeX record (used as the parser customization).

    Runs bibtexparser's ``type``, ``author`` and ``convert_to_unicode``
    customizations in that order, then lower-cases every key except ``ID``.
    """
    for step in (customization.type, customization.author, customization.convert_to_unicode):
        record = step(record)

    normalised: Dict[str, Any] = {}
    for key, value in record.items():
        normalised[key if key == 'ID' else key.lower()] = value
    return normalised
22
+
23
def natural_sort_key(entry: Dict[str, Any]) -> Tuple[int, str]:
    """Sort key ordering entries by the first run of digits in their ``ID``.

    Entries whose ID contains no digits get 999999 as the numeric part, so
    they sort after numbered ones; the raw ID is the tie-breaker.
    """
    entry_id = entry.get('ID', '')
    digits = re.search(r'(\d+)', entry_id)
    if digits is None:
        return (999999, entry_id)
    return (int(digits.group(1)), entry_id)
27
+
28
def clean_latex(text: Any) -> str:
    """Return *text* as a plain string with basic LaTeX markup removed.

    Falsy input yields ``""``. Otherwise braces are dropped, a handful of
    escapes (lower-case umlauts, ``%``, ``&``, ``_``) are translated, any
    remaining backslashes are deleted and whitespace runs collapse to a
    single space.
    """
    if not text:
        return ""

    cleaned = str(text).replace('{', '').replace('}', '')
    escape_table = (
        (r'\"a', 'ä'),
        (r'\"o', 'ö'),
        (r'\"u', 'ü'),
        (r'\%', '%'),
        (r'\&', '&'),
        (r'\_', '_'),
    )
    for escape, plain in escape_table:
        cleaned = cleaned.replace(escape, plain)

    cleaned = cleaned.replace('\\', '')
    return re.sub(r'\s+', ' ', cleaned).strip()
34
+
35
def format_authors(entry: Dict[str, Any], style: str) -> str:
    """Render the entry's author list as a single string.

    Reads ``author`` (or ``authors``) as either an ``' and '``-separated
    string or a list. Each name is reduced to surname plus initials:
    ``"Surname AB"`` for the ``'numbered'`` style, ``"Surname, A.B."``
    otherwise. Returns ``"Unknown"`` when the entry has no authors.
    """
    raw_authors = entry.get('author') or entry.get('authors')
    if not raw_authors:
        return "Unknown"

    if isinstance(raw_authors, str):
        names = raw_authors.split(' and ')
    elif isinstance(raw_authors, list):
        names = raw_authors
    else:
        names = [str(raw_authors)]

    def _initials(tokens: List[str]) -> str:
        # First letter of every given-name token, upper-cased and concatenated.
        return "".join(token[0].upper() for token in tokens)

    numbered = style == 'numbered'
    rendered = []
    for raw_name in names:
        name = clean_latex(raw_name).strip()
        if ',' in name:
            # "Surname, Given Names"
            family, _, given = name.partition(',')
            surname = family.strip()
            initials = _initials(given.split())
        else:
            # "Given Names Surname": the last token is taken as the surname.
            tokens = name.split()
            surname = tokens[-1] if tokens else "Unknown"
            initials = _initials(tokens[:-1])

        if numbered:
            rendered.append(f"{surname} {initials}".strip())
        elif initials:
            rendered.append(f"{surname}, {'.'.join(initials)}.")
        else:
            rendered.append(surname)

    if len(rendered) == 1:
        return rendered[0]
    if len(rendered) == 2:
        return f"{rendered[0]} and {rendered[1]}"

    joiner = " and " if numbered else ", and "
    return f"{', '.join(rendered[:-1])}{joiner}{rendered[-1]}"
63
+
64
def format_entry(entry: Dict[str, Any], style: str) -> str:
    """Format one bibliography entry as a single reference line.

    The layout is ``authors (year) title journal volume, pages. doi: ...``;
    the author-year style differs from ``'numbered'`` only by a full stop
    after the year. Missing year/title fall back to ``'????'`` / ``'No Title'``.
    """
    authors = format_authors(entry, style)
    year = clean_latex(entry.get('year', '????'))

    title = clean_latex(entry.get('title', 'No Title'))
    if title and not title.endswith(('.', '?', '!')):
        title += "."

    journal_str = clean_latex(entry.get('journal', ''))
    volume = clean_latex(entry.get('volume', ''))
    # Both "--" and "-" page separators become an en dash.
    pages = clean_latex(entry.get('pages', '')).replace('--', '–').replace('-', '–')

    if volume:
        journal_str += f" {volume}"
    if pages:
        separator = ", " if volume else " "
        journal_str += f"{separator}{pages}"
    if journal_str and not journal_str.endswith(('.', '?')):
        journal_str += "."

    doi_str = ""
    if entry.get('doi'):
        bare_doi = clean_latex(entry['doi']).replace('https://doi.org/', '')
        doi_str = f" doi: {bare_doi}"

    after_year = "" if style == 'numbered' else "."
    return f"{authors} ({year}){after_year} {title} {journal_str}{doi_str}".strip()
84
+
85
def process_bib(input_file: Path, output_file: Path, style: str):
    """Convert a BibTeX file into a plain-text reference list.

    Parses *input_file*, sorts the entries (by the number in their ID for
    ``'numbered'``, by author text otherwise) and writes one line per entry
    to *output_file*. Entries with no title, or with a placeholder title and
    no author, are skipped. An entry sharing a PMID or DOI with an earlier
    one, or whose normalised title matches an earlier one with a fuzzy ratio
    above 95, is written as a ``[DUPLICATE of ...]`` marker instead.

    Exits with status 1 on a parse failure or an I/O error.
    """
    print(f"Reading {input_file} (Style: {style.upper()})...")
    try:
        with open(input_file, encoding='utf-8') as bib_handle:
            bib_parser = BibTexParser()
            bib_parser.customization = customizations
            bib_parser.common_strings = False
            database = bib_parser.parse_file(bib_handle)
    except Exception as exc:
        print(f"Error parsing BibTeX: {exc}")
        sys.exit(1)

    numbered = style == 'numbered'

    # Sort based on style choice
    if numbered:
        ordered = sorted(database.entries, key=natural_sort_key)
    else:
        def _author_key(item: Dict[str, Any]) -> str:
            return clean_latex(item.get('author', 'z')).lower()

        ordered = sorted(database.entries, key=_author_key)

    # Each map points at the (reference number, ID) of the first occurrence.
    seen_pmids, seen_dois, seen_titles = {}, {}, {}
    placeholder_titles = ['REVIEWS', 'UNKNOWN', 'REFERENCES']

    try:
        with open(output_file, 'w', encoding='utf-8') as out:
            out.write(f"--- References List ({style.upper()} FORMAT | Source: {input_file.name}) ---\n\n")

            for position, entry in enumerate(ordered, start=1):
                # ID tracking: prefer the number embedded in the ID.
                id_digits = re.search(r'(\d+)', entry.get('ID', ''))
                ref_num = id_digits.group(1) if id_digits else str(position)
                seq_prefix = f"{position}. " if numbered else "- "

                title = clean_latex(entry.get('title', '')).strip()
                if not title:
                    continue
                if title.upper() in placeholder_titles and 'author' not in entry:
                    continue

                # Titles of 20 characters or fewer are not used for fuzzy matching.
                long_title = len(title) > 20
                norm_title = re.sub(r'[^\w]', '', title.lower())

                dup_source = seen_pmids.get(entry.get('pmid')) or seen_dois.get(entry.get('doi'))
                if not dup_source and long_title:
                    for known_title, known_info in seen_titles.items():
                        if fuzz.ratio(norm_title, known_title) > 95:
                            dup_source = known_info
                            break

                if dup_source:
                    out.write(f"{seq_prefix}[DUPLICATE of {dup_source[0]}: {dup_source[1]}] {title[:50]}...\n\n")
                    continue

                if entry.get('pmid'):
                    seen_pmids[entry['pmid']] = (ref_num, entry['ID'])
                if entry.get('doi'):
                    seen_dois[entry['doi']] = (ref_num, entry['ID'])
                if long_title:
                    seen_titles[norm_title] = (ref_num, entry['ID'])

                try:
                    out.write(f"{seq_prefix}{format_entry(entry, style)}\n\n")
                except Exception as exc:
                    # One unformattable entry is reported in place, not fatal.
                    out.write(f"{seq_prefix}[ERROR] {exc}\n\n")

        print(f"Success! Saved to {output_file}")
    except IOError as exc:
        print(f"I/O Error: {exc}")
        sys.exit(1)
140
+
141
def main():
    """Command-line entry point: turn a .bib file into a formatted text list.

    If the input stem does not end in ``_verified`` and a sibling
    ``<stem>_verified<suffix>`` file exists, that file is used instead. The
    style comes from ``--style`` or an interactive 1/2 prompt; the output
    path defaults to ``<stem>_list_<style>.txt`` next to the input.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("input", type=Path)
    parser.add_argument("output", type=Path, nargs='?', default=None)
    parser.add_argument("--style", choices=['1', '2', 'numbered', 'author-year'], help="1: Numbered, 2: Author-Year")
    args = parser.parse_args()

    source = args.input
    if not source.stem.endswith("_verified"):
        verified = source.with_name(f"{source.stem}_verified{source.suffix}")
        if verified.exists():
            source = verified

    style_choice = args.style
    if not style_choice:
        print("\n=== Select Report Output Style ===")
        print(" 1. Sequential Numbered (1. Lopez-Otin C...)")
        print(" 2. Author-Year Alphabetical (- Smith, I. (2023)...)")
        # Keep asking until the answer is exactly "1" or "2".
        style_choice = input("Enter 1 or 2: ").strip()
        while style_choice not in ('1', '2'):
            style_choice = input("Enter 1 or 2: ").strip()

    if style_choice in ('2', 'author-year'):
        active_style = 'author-year'
    else:
        active_style = 'numbered'

    destination = args.output
    if not destination:
        destination = source.with_name(f"{source.stem}_list_{active_style}.txt")

    process_bib(source, destination, active_style)


if __name__ == "__main__":
    main()
@@ -0,0 +1,361 @@
1
+ import re
2
+ import csv
3
+ import os
4
+ import sys
5
+ import argparse
6
+ import time
7
+ import requests
8
+ from docx import Document
9
+ from Bio import Entrez
10
+ from rapidfuzz import fuzz
11
+
12
+ # --- CONFIGURATION ---
13
# NCBI credentials are read from the environment; either may be unset (None).
Entrez.email = os.environ.get("NCBI_EMAIL")
Entrez.api_key = os.environ.get("NCBI_API_KEY")

if Entrez.api_key is None:
    sys.stderr.write("Tip: Set NCBI_API_KEY environment variable to avoid request limits.\n")
18
+
19
+ # --- HELPERS ---
20
def clean_word(word):
    """Return *word* with everything except letters, digits and underscores removed."""
    non_word = re.compile(r'[^\w]')
    return non_word.sub('', word)
22
+
23
def resolve_doi_to_pmid(doi):
    """Look up the PubMed ID for *doi* via an Entrez ``esearch`` on ``[DOI]``.

    Returns the first PMID from the result's ``IdList``, or ``None`` when the
    DOI is empty, nothing matches, or the lookup fails for any reason (the
    call is deliberately best-effort and never raises).
    """
    try:
        # Trim whitespace BEFORE the trailing dot, otherwise "10.1/x. " keeps
        # its sentence-ending period.
        clean = doi.strip().rstrip('.').strip()
        if not clean:
            return None

        handle = Entrez.esearch(db="pubmed", term=f"{clean}[DOI]", retmax=1)
        try:
            result = Entrez.read(handle)
        finally:
            # Close the handle even when parsing the response fails.
            handle.close()

        id_list = result['IdList']
        return id_list[0] if id_list else None
    except Exception:
        # Best-effort: callers fall back to Crossref when this returns None.
        return None
32
+
33
def resolve_doi_crossref(doi):
    """Fallback for DOIs not indexed in PubMed (e.g. Stats journals, old issues).

    Queries ``https://api.crossref.org/works/<doi>`` and returns a metadata
    dict with the keys ``id`` (``"CR_<doi>"``), ``first_author``, ``year``,
    ``title``, ``doi`` and ``is_retracted`` (always ``False`` here).
    Returns ``None`` when the DOI is empty, the response status is not 200,
    or anything goes wrong (best-effort, never raises).
    """
    from urllib.parse import quote

    print(f" ...PubMed missed {doi}. Checking Crossref Master Database...", end='\r')
    try:
        # Trim whitespace before the trailing dot so "10.1/x. " is cleaned fully.
        clean = doi.strip().rstrip('.').strip()
        if not clean:
            return None

        # Percent-encode the DOI: it may contain characters such as <, >, [, ]
        # or ; that are not valid in a URL path as-is.
        url = f"https://api.crossref.org/works/{quote(clean, safe='/')}"
        resp = requests.get(url, timeout=5)
        if resp.status_code != 200:
            return None

        data = resp.json().get('message') or {}

        # 'title' can be present but empty; treat that like a missing title
        # instead of failing the whole lookup on an IndexError.
        titles = data.get('title') or ['']
        title = titles[0] or ""

        authors = data.get('author') or []
        first = authors[0] if authors else {}
        # Organisational authors carry 'name' instead of 'family'.
        first_author = first.get('family') or first.get('name') or "Unknown"

        year = "Unknown"
        for date_key in ('published-print', 'published-online', 'issued'):
            date_parts = (data.get(date_key) or {}).get('date-parts') or []
            if date_parts and date_parts[0] and date_parts[0][0]:
                year = str(date_parts[0][0])
                break

        return {
            "id": f"CR_{clean}",
            "first_author": first_author,
            "year": year,
            "title": title,
            "doi": clean,
            "is_retracted": False,
        }
    except Exception:
        # Best-effort: network, JSON and schema problems all mean "not resolved".
        return None
62
+
63
def extract_citation_parts(text):
    """Heuristically split a raw reference string into year, author and title.

    Returns a dict with the keys ``year``, ``author`` and ``title_snippet``
    (each ``""`` when not found).

    * The year is the first 19xx/20xx number directly preceded by ``(``, or
      else the first such number; a trailing letter (``2020a``) is dropped.
    * If the year sits in the second half of the text the snippet is what
      precedes it, otherwise what follows it; URLs and ``doi:`` strings are
      removed from the snippet.
    * The author is the text before the year up to the first full stop, with
      a leading reference marker (``1.``, ``[1]``, ``1)``) and trailing
      bracket/punctuation removed. If that leaves 3 characters or fewer, the
      first capitalised word that is not a stop word is used instead.
    """
    data = {"year": "", "author": "", "title_snippet": ""}

    def _scrub_links(fragment):
        # URL/DOI scrubber (prevents them from leaking into the title).
        fragment = re.sub(r'(?i)https?://\S+', '', fragment)
        return re.sub(r'(?i)doi:?\s*10\.\d{4,9}/[-._;()/:a-zA-Z0-9<>\[\]]+', '', fragment)

    year_matches = list(re.finditer(r'\b(19|20)\d{2}[a-z]?\b', text))

    if year_matches:
        # Prefer a year written as "(2020)"; otherwise take the first one.
        chosen = next(
            (m for m in year_matches if m.start() > 0 and text[m.start() - 1] == '('),
            year_matches[0],
        )
        data["year"] = re.sub(r'[a-z]', '', chosen.group(0))

        # Determine if Author-Year or Vancouver style based on year placement.
        if chosen.start() > len(text) / 2:
            snippet = text[:chosen.start()].strip()
        else:
            snippet = text[chosen.end():].strip()
        data["title_snippet"] = re.sub(r'^[\s\)\.,:;-]+|[\s\.,:;-]+$', '', _scrub_links(snippet))

        # Strip the reference marker BEFORE cutting at the first full stop;
        # otherwise "1. Smith J, ..." yields just "1" and the authors are lost.
        before_year = re.sub(r'^\s*\[?\d+[\.\)\]]?\s*', '', text[:chosen.start()])
        author_raw = before_year.split('.')[0].strip()
        # Drop the opening bracket / punctuation left over from "Name (2020)".
        author_raw = author_raw.rstrip(' \t([,;:')
        if len(author_raw) > 3:
            data["author"] = author_raw
    else:
        data["title_snippet"] = _scrub_links(text).strip()

    if not data["author"]:
        skip_words = {'et', 'al', 'in', 'the', 'pmid', 'doi', 'vol', 'no', 'and', '&',
                      'eds', 'editor', 'page', 'pp', 'references'}
        for word in text.split():
            clean = clean_word(word)
            if len(clean) < 2 or clean.isdigit():
                continue
            if clean.lower() in skip_words:
                continue
            if clean[0].isupper():
                data["author"] = clean
                break

    return data
110
+
111
def search_pubmed_by_metadata(parts):
    """Search PubMed for candidate PMIDs using extracted citation parts.

    First tries a ``[Title]`` search on the first eight words of the title
    snippet (only when the snippet is longer than 10 characters). If that
    yields nothing and both author and year are known, falls back to a
    first-author + publication-date search. Lookup errors are ignored.
    Returns the de-duplicated candidate IDs as a list.
    """
    found = []

    snippet = parts["title_snippet"]
    if len(snippet) > 10:
        title_words = re.sub(r'[^\w\s]', '', snippet).split()
        title_query = " ".join(title_words[:8])
        try:
            handle = Entrez.esearch(db="pubmed", term=f"{title_query}[Title]", retmax=3)
            result = Entrez.read(handle)
            handle.close()
            if result['IdList']:
                found.extend(result['IdList'])
        except Exception:
            pass

    if not found and parts["author"] and parts["year"]:
        try:
            first_token = parts['author'].split()[0]
            handle = Entrez.esearch(db="pubmed", term=f"{first_token}[1au] AND {parts['year']}[pdat]", retmax=5)
            result = Entrez.read(handle)
            handle.close()
            if result['IdList']:
                found.extend(result['IdList'])
        except Exception:
            pass

    return list(set(found))
132
+
133
def parse_record(record):
    """Flatten one esummary-style record into the metadata dict used downstream.

    Reads ``Title`` (markup tags removed), ``AuthorList`` (first element),
    ``PubDate`` (first four-digit run), ``ArticleIds['doi']``, ``Id`` and
    ``PubTypeList``. Missing author/year become ``"Unknown"``; a missing DOI
    becomes ``""``.
    """
    author_list = record.get('AuthorList', [])
    year_hit = re.search(r'\d{4}', record.get('PubDate', ''))

    article_ids = record.get('ArticleIds', {})
    doi = next((value for key, value in article_ids.items() if key == 'doi'), "")

    parsed = {}
    parsed["id"] = str(record.get('Id'))
    parsed["first_author"] = author_list[0] if author_list else "Unknown"
    parsed["year"] = year_hit.group(0) if year_hit else "Unknown"
    parsed["title"] = re.sub(r'<[^<]+?>', '', record.get('Title', ''))
    parsed["doi"] = doi
    parsed["is_retracted"] = "Retracted Publication" in record.get('PubTypeList', [])
    return parsed
154
+
155
def batch_fetch_pubmed(pmid_list):
    """Fetch PubMed summaries for *pmid_list* in chunks of 200 IDs.

    Returns a dict mapping each fetched record's ``id`` (as produced by
    ``parse_record``) to its parsed metadata. An empty or ``None`` list
    returns ``{}``. A failing chunk is reported on stderr and skipped, so
    the result may be partial; the function itself never raises.
    """
    if not pmid_list:
        return {}

    fetched_data = {}
    # De-duplicate while keeping first-seen order so chunking is deterministic;
    # str() lets integer IDs through the join below.
    unique_pmids = list(dict.fromkeys(str(pmid) for pmid in pmid_list))

    print(f"\nFetching metadata for {len(unique_pmids)} unique PubMed papers...")
    for start in range(0, len(unique_pmids), 200):
        chunk = unique_pmids[start:start + 200]
        try:
            handle = Entrez.esummary(db="pubmed", id=",".join(chunk), retmode="xml")
            try:
                records = Entrez.read(handle)
            finally:
                # Release the connection even if the response cannot be parsed.
                handle.close()
            for record in records:
                data = parse_record(record)
                fetched_data[data['id']] = data
            # Pause after each successful request before issuing the next one.
            time.sleep(0.5)
        except Exception as e:
            # Best-effort: keep going, but do not hide the failure.
            print(f"WARNING: PubMed metadata fetch failed for {len(chunk)} ID(s): {e}", file=sys.stderr)

    return fetched_data
175
+
176
+ # --- CORE LOGIC ---
177
def _load_paragraphs(file_path, ext):
    """Return the non-empty paragraphs of a ``.docx`` or ``.txt`` file."""
    if ext == '.docx':
        doc = Document(file_path)
        return [p.text.strip() for p in doc.paragraphs if p.text.strip()]

    # Plain text: a paragraph ends at a blank line or where the next numbered
    # item ("1.", "[1]", "1)") begins, so wrapped lines stay with their item.
    numbered_item = re.compile(r'^\[?\d+[\.\)\]]')
    paragraphs, current_block = [], []
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                if current_block:
                    paragraphs.append(" ".join(current_block))
                    current_block = []
                continue
            if numbered_item.match(line) and current_block:
                paragraphs.append(" ".join(current_block))
                current_block = []
            current_block.append(line)
    if current_block:
        paragraphs.append(" ".join(current_block))
    return paragraphs


def _build_bib_entry(item, pmid, meta):
    """Render one scanned reference as an ``@article`` BibTeX entry string."""
    bib_lines = [f"@article{{{item['ref_num']},"]
    if pmid:
        bib_lines.append(f" pmid = {{{pmid}}},")
    if meta and meta.get('doi'):
        bib_lines.append(f" doi = {{{meta['doi']}}},")
    elif item.get('found_doi'):
        bib_lines.append(f" doi = {{{item['found_doi']}}},")

    if meta:
        bib_lines.extend([
            f" title = {{{meta['title']}}},",
            f" author = {{{meta['first_author']} et al.}},",
            f" year = {{{meta['year']}}}",
        ])
    else:
        # No database record: fall back to what can be parsed from the raw text.
        parts = extract_citation_parts(item['original_text'])
        bib_lines.append(f" title = {{{parts['title_snippet']}}},")
        if parts['author']:
            bib_lines.append(f" author = {{{parts['author']}}},")
        if parts['year']:
            bib_lines.append(f" year = {{{parts['year']}}}")
    bib_lines.append("}")
    return "\n".join(bib_lines)


def analyze_document(file_path):
    """Scan a ``.docx``/``.txt`` file for references and verify them online.

    Paragraphs after a References/Bibliography header (or all paragraphs when
    no header is found) that are at least 15 characters long are treated as
    reference candidates. Each is resolved by PMID, then by DOI (PubMed, then
    Crossref), then by a title/author search when it contains a year.

    Returns ``(csv_results, txt_lines, bib_entries)``: report rows, a list
    that is always empty here, and BibTeX entry strings. Returns three empty
    lists when the format is unsupported or the file cannot be read.
    """
    print(f"\nReading document: {file_path}...")
    ext = os.path.splitext(file_path)[1].lower()
    if ext not in ('.docx', '.txt'):
        print(f"ERROR: Unsupported format {ext}")
        return [], [], []

    try:
        all_paragraphs = _load_paragraphs(file_path, ext)
    except Exception as e:
        print(f"ERROR: Could not open file. {e}")
        return [], [], []

    header_regex = re.compile(
        r'^\s*(?:[0-9]+\.?\s*)?(?:REFERENCES|BIBLIOGRAPHY|LITERATURE CITED|WORKS CITED)\s*$',
        re.IGNORECASE,
    )
    start_index, header_found = 0, False
    for i, p in enumerate(all_paragraphs):
        if header_regex.match(p):
            start_index, header_found = i + 1, True
            print(f" -> Found Reference Section header. Processing subsequent text.")
            break
    if not header_found:
        print(" -> No 'References' header found. Assuming file is just a list.")

    paragraphs_to_check = all_paragraphs[start_index:]
    pmid_pattern = re.compile(r'PMID:?\s*(\d+)', re.IGNORECASE)
    # DOI pattern supports <, >, and brackets used in Wiley/SICI DOIs.
    doi_pattern = re.compile(r'\b(10\.\d{4,9}/[-._;()/:a-zA-Z0-9<>\[\]]+)')
    # One year pattern for both passes, so "2020a" counts as a year everywhere.
    year_pattern = re.compile(r'\b(19|20)\d{2}[a-z]?\b')

    items_to_process = []
    ids_to_fetch_metadata = set()

    print(f"Scanning {len(paragraphs_to_check)} reference candidates...")

    # --- Pass 1: find an identifier (PMID / DOI / search hit) per candidate ---
    for idx, text in enumerate(paragraphs_to_check):
        if len(text) < 15:
            continue

        num_match = re.match(r'^\s*\[?(\d+)[\.\)\]]', text)
        ref_num = num_match.group(1) if num_match else str(idx + 1)

        found_pmids = [m.group(1) for m in pmid_pattern.finditer(text)]
        found_dois = [m.group(1).rstrip('.') for m in doi_pattern.finditer(text)]

        entry = {
            "ref_num": ref_num,
            "original_text": text,
            "found_pmid": found_pmids[0] if found_pmids else None,
            "found_doi": found_dois[0] if found_dois else None,
            "target_pmid": None,
            "cr_meta": None,
            "status": "PENDING",
            "action_log": [],
        }

        if entry["found_pmid"]:
            entry["target_pmid"] = entry["found_pmid"]
            ids_to_fetch_metadata.add(entry["target_pmid"])
        elif entry["found_doi"]:
            resolved_pmid = resolve_doi_to_pmid(entry["found_doi"])
            if resolved_pmid:
                entry["target_pmid"] = resolved_pmid
                entry["action_log"].append("Fetched PMID via DOI")
                ids_to_fetch_metadata.add(resolved_pmid)
            else:
                cr_meta = resolve_doi_crossref(entry["found_doi"])
                if cr_meta:
                    entry["cr_meta"] = cr_meta
                    entry["action_log"].append("Resolved via Crossref")
                else:
                    entry["status"] = "FAIL_DOI_LOOKUP"
        elif year_pattern.search(text):
            parts = extract_citation_parts(text)
            candidates = search_pubmed_by_metadata(parts)
            best_match, best_score = None, 0
            if candidates:
                cand_meta = batch_fetch_pubmed(candidates)
                for c_id, c_data in cand_meta.items():
                    score = fuzz.token_set_ratio(c_data['title'], parts['title_snippet'])
                    if score > 80 and score > best_score:
                        best_score, best_match = score, c_id
            if best_match:
                entry["target_pmid"] = best_match
                entry["action_log"].append("Found via Search")
                ids_to_fetch_metadata.add(best_match)

        items_to_process.append(entry)

    pubmed_db = batch_fetch_pubmed(list(ids_to_fetch_metadata))
    csv_results, txt_lines, bib_entries = [], [], []

    # --- Pass 2: classify each candidate and emit report row + BibTeX entry ---
    print("\n" + "=" * 85)
    for item in items_to_process:
        text, pmid, cr_meta = item['original_text'], item['target_pmid'], item['cr_meta']
        meta = pubmed_db.get(pmid) if pmid else cr_meta
        status, notes = "OK", item['action_log']
        if pmid:
            display_id = pmid
        else:
            display_id = item.get('found_doi') if cr_meta else "(No ID)"

        if not meta:
            if item['status'] == "FAIL_DOI_LOOKUP":
                status = "MANUAL_CHECK"
                notes.append("DOI not in PubMed/Crossref")
            elif year_pattern.search(text):
                status, display_id = "NOT_FOUND", "UNKNOWN"
            else:
                # No identifier and no year: not treated as a reference.
                status = "IGNORED"
        else:
            title_clean = re.sub(r'[^\w\s]', '', meta['title'].lower())
            text_clean = re.sub(r'[^\w\s]', '', text.lower())
            title_score = fuzz.partial_ratio(title_clean, text_clean)

            # Search and Crossref hits are exempt from the title cross-check.
            matched_indirectly = any(
                "Found via Search" in note or "Crossref" in note for note in notes
            )
            if meta['is_retracted']:
                status = "!! RETRACTED !!"
            elif not matched_indirectly and title_score < 65:
                status = "MISMATCH"
                notes.append(f"Title Score {title_score}")

        if status == "IGNORED":
            continue

        csv_results.append({
            "Original_Text": text[:60] + "...",
            "Final_ID": display_id,
            "Status": status,
            "Notes": "; ".join(notes),
        })
        bib_entries.append(_build_bib_entry(item, pmid, meta))

        print(f"{display_id[:15]:<18} | {status:<18} | {', '.join(notes) if notes else 'Verified'}")

    print("=" * 85)
    return csv_results, txt_lines, bib_entries
330
+
331
def save_outputs(csv_data, txt_lines, bib_entries, base_name):
    """Write the verification report and the BibTeX file for one scan.

    Creates ``<base_name>_verification_report.csv`` (columns taken from the
    first row's keys) and ``<base_name>_extracted.bib`` (entries separated by
    a blank line). Does nothing when *csv_data* is empty. *txt_lines* is
    accepted but not used.
    """
    if not csv_data:
        return

    csv_name = f"{base_name}_verification_report.csv"
    bib_name = f"{base_name}_extracted.bib"

    with open(csv_name, 'w', newline='', encoding='utf-8') as report:
        writer = csv.DictWriter(report, fieldnames=csv_data[0].keys())
        writer.writeheader()
        for row in csv_data:
            writer.writerow(row)

    with open(bib_name, 'w', encoding='utf-8') as bib_file:
        bib_file.write("\n\n".join(bib_entries))

    print("\nSuccess! Saved outputs:")
    print(f" [Report] -> {csv_name}")
    print(f" [BibTeX] -> {bib_name} <-- USE THIS FOR arm-verify")
346
+
347
def main():
    """Command-line entry point: scan one document and save the reports.

    Takes the file path as an optional argument, prompting for it when
    omitted. Output files are named after the part of the file stem before
    the first space or underscore (``"scan"`` if that is empty). Exits with
    status 1, after an error message on stderr, when the file is missing.
    """
    parser = argparse.ArgumentParser(description="Scan docx, output reports and a mapped .bib file.")
    parser.add_argument("file", help="Path to .docx", nargs='?', default=None)
    args = parser.parse_args()
    fname = args.file

    if not fname:
        fname = input("Enter filename (.docx): ").strip().strip('"').strip("'")
    # isfile() rather than exists(): a directory is not a scannable document.
    if not fname or not os.path.isfile(fname):
        print(f"ERROR: File '{fname}' not found.", file=sys.stderr)
        sys.exit(1)

    stem = os.path.splitext(os.path.basename(fname))[0]
    base = re.split(r'[ _]', stem)[0] or "scan"
    csv_rows, txt_lines, bib_entries = analyze_document(fname)
    save_outputs(csv_rows, txt_lines, bib_entries, base)


if __name__ == "__main__":
    main()