ref-management 1.0.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ref_management/__init__.py +3 -0
- ref_management/add_dois.py +126 -0
- ref_management/apply_citations.py +471 -0
- ref_management/auto_format.py +106 -0
- ref_management/generate_report.py +170 -0
- ref_management/scan_raw_refs.py +361 -0
- ref_management/verify_bib.py +300 -0
- ref_management-1.0.3.dist-info/METADATA +13 -0
- ref_management-1.0.3.dist-info/RECORD +13 -0
- ref_management-1.0.3.dist-info/WHEEL +5 -0
- ref_management-1.0.3.dist-info/entry_points.txt +7 -0
- ref_management-1.0.3.dist-info/licenses/LICENSE +21 -0
- ref_management-1.0.3.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import re
|
|
3
|
+
import argparse
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
# --- MONKEY PATCH FOR PYPARSING/BIBTEXPARSER COMPATIBILITY ---
|
|
7
|
+
import pyparsing
|
|
8
|
+
if not hasattr(pyparsing, 'DelimitedList'):
|
|
9
|
+
if hasattr(pyparsing, 'delimited_list'): setattr(pyparsing, 'DelimitedList', pyparsing.delimited_list)
|
|
10
|
+
elif hasattr(pyparsing, 'delimitedList'): setattr(pyparsing, 'DelimitedList', pyparsing.delimitedList)
|
|
11
|
+
|
|
12
|
+
import bibtexparser
|
|
13
|
+
from docx import Document
|
|
14
|
+
from rapidfuzz import fuzz
|
|
15
|
+
|
|
16
|
+
# Matches a paragraph whose whole text is a bibliography heading ("References",
# "Bibliography", "Literature Cited" or "Works Cited"), optionally preceded by a
# section number such as "7." and surrounded by whitespace; case-insensitive.
REF_HEADER_PATTERN = re.compile(r'^\s*(?:[0-9]+\.?\s*)?(?:REFERENCES|BIBLIOGRAPHY|LITERATURE CITED|WORKS CITED)\s*$', re.IGNORECASE)
# Matches a paragraph that is a heading or caption used below to mark the end of
# the reference list: tables/figures (bare or numbered, e.g. "Fig. 2 ..."),
# supplementary material, appendices, data availability, acknowledgements,
# author contributions, funding and conflict-of-interest statements.
POST_REF_PATTERN = re.compile(r'^\s*(?:Tables?|Figures?|Figure Legends?|Supplementary.*?|Appendices|Data Availability|Acknowledgements?|Author Contributions?|Funding|Conflict(?:s)? of Interest|Competing Interests?|(?:Table|Figure|Fig\.?)\s*\d+.*)$', re.IGNORECASE)
|
|
18
|
+
|
|
19
|
+
def clean_for_match(text: str) -> str:
    """Normalise text so titles can be fuzzy-matched against reference paragraphs.

    BibTeX protection braces are removed, the text is lower-cased, every
    character that is neither a word character nor whitespace is dropped, and
    runs of whitespace are collapsed to a single space.

    Args:
        text: Raw title or reference text; may be empty or ``None``.

    Returns:
        The normalised string, or ``""`` when *text* is falsy.
    """
    if not text:
        return ""
    unbraced = text.replace('{', '').replace('}', '')
    without_punctuation = re.sub(r'[^\w\s]', '', unbraced.lower())
    # Dropping punctuation such as " - " leaves doubled spaces, and document text
    # may contain tabs or non-breaking spaces; split/join collapses all of them so
    # both sides of the comparison use identical spacing.
    return ' '.join(without_punctuation.split())
|
|
24
|
+
|
|
25
|
+
# One or more leading resolver URLs / "doi:" labels in front of a bare DOI.
_DOI_PREFIX_PATTERN = re.compile(r'^\s*(?:(?:https?://(?:dx\.)?doi\.org/|doi:)\s*)+', re.IGNORECASE)
# A reference paragraph that already shows a DOI link or a "doi:" label.
_HAS_DOI_PATTERN = re.compile(r'\b(?:https?://(?:dx\.)?doi\.org\b|doi:)', re.IGNORECASE)


def _strip_doi_prefix(doi: str) -> str:
    """Return *doi* without any leading resolver URL or ``doi:`` label."""
    return _DOI_PREFIX_PATTERN.sub('', doi).strip()


def process_document(bib_path: Path, docx_path: Path, output_path: Path):
    """Append missing DOI links to the reference list of a Word document.

    The BibTeX file is indexed by normalised title. Each paragraph between the
    references heading and the first post-reference heading (or the end of the
    document) is fuzzy-matched against those titles; on a match scoring above 85
    a `` https://doi.org/<doi>`` run is appended to the paragraph. Paragraphs
    shorter than 20 characters or already showing a DOI are left untouched.

    Args:
        bib_path: BibTeX file whose entries carry ``title`` and ``doi`` fields.
        docx_path: Word document to scan.
        output_path: Where the patched copy is written.

    Exits the process with status 1 when the BibTeX file or the document cannot
    be read, or when no references heading is found.
    """
    print(f"\nReading verified BibTeX: {bib_path.name}...")
    try:
        with open(bib_path, 'r', encoding='utf-8') as f:
            bib_db = bibtexparser.load(f)
    except Exception as e:
        print(f"❌ ERROR reading BibTeX: {e}")
        sys.exit(1)

    # Build an index of cleaned titles to bare DOIs.
    doi_map = {}
    for entry in bib_db.entries:
        doi = entry.get('doi', '').strip()
        title = entry.get('title', '').strip()
        if not (doi and title):
            continue
        title_key = clean_for_match(title)
        clean_doi = _strip_doi_prefix(doi)
        # A title made only of punctuation or a DOI made only of a prefix would
        # yield an empty key/value; neither can produce a meaningful match.
        if title_key and clean_doi:
            doi_map[title_key] = clean_doi

    print(f"Loaded {len(doi_map)} DOIs from BibTeX.")
    print(f"Scanning document: {docx_path.name}...")
    try:
        doc = Document(str(docx_path))
    except Exception as e:
        print(f"❌ ERROR reading document: {e}")
        sys.exit(1)

    # Snapshot once: the paragraphs property builds a fresh list on every access,
    # and appending runs below never adds or removes paragraphs.
    paragraphs = doc.paragraphs

    # 1. Find the boundaries of the References section.
    ref_start_idx = -1
    for i, p in enumerate(paragraphs):
        if REF_HEADER_PATTERN.match(p.text):
            ref_start_idx = i
            break

    if ref_start_idx == -1:
        print("❌ ERROR: Could not locate 'References' header in the document.")
        sys.exit(1)

    ref_end_idx = len(paragraphs)
    for i in range(ref_start_idx + 1, len(paragraphs)):
        text = paragraphs[i].text.strip()
        if text and POST_REF_PATTERN.match(text):
            ref_end_idx = i
            break

    # 2. Iterate through the references and append DOIs.
    added_count = 0
    already_had_count = 0

    for para in paragraphs[ref_start_idx + 1:ref_end_idx]:
        text = para.text.strip()

        # Skip empty lines or very short fragments.
        if len(text) < 20:
            continue

        if _HAS_DOI_PATTERN.search(text):
            already_had_count += 1
            continue

        # Fuzzy match the paragraph against the BibTeX titles; partial_ratio is
        # used because the title is only a substring of the full reference.
        best_match_doi = None
        best_score = 85  # a candidate must score strictly above this
        para_clean = clean_for_match(text)
        for bib_title, doi in doi_map.items():
            score = fuzz.partial_ratio(bib_title, para_clean)
            if score > best_score:
                best_score = score
                best_match_doi = doi

        if best_match_doi:
            # Terminate the reference before appending the link.
            if not text.endswith('.'):
                para.add_run('.')
            para.add_run(f" https://doi.org/{best_match_doi}")
            added_count += 1

    # 3. Save the patched draft.
    doc.save(str(output_path))
    print(f"\nSuccess! Saved to {output_path.name}")
    print(f" -> Found {already_had_count} references that already had DOIs.")
    print(f" -> Dynamically matched and injected {added_count} missing DOIs.")
|
|
108
|
+
|
|
109
|
+
def main():
    """Command-line entry point: validate the two input paths and patch the document.

    The output is written next to the input document with ``_with_DOIs``
    appended to its stem.
    """
    cli = argparse.ArgumentParser(description="Appends DOIs to the References section of an intermediate draft.")
    cli.add_argument("bib", type=Path, help="The verified .bib file containing the DOIs")
    cli.add_argument("doc", type=Path, help="The intermediate .docx file")
    options = cli.parse_args()

    # Checked in this order so the BibTeX complaint wins when both are missing.
    required_inputs = (
        (options.bib, f"❌ ERROR: BibTeX file '{options.bib}' not found."),
        (options.doc, f"❌ ERROR: Document '{options.doc}' not found."),
    )
    for candidate, complaint in required_inputs:
        if candidate.exists():
            continue
        print(complaint)
        sys.exit(1)

    patched_name = f"{options.doc.stem}_with_DOIs.docx"
    process_document(options.bib, options.doc, options.doc.with_name(patched_name))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,471 @@
|
|
|
1
|
+
import sys
|
|
2
|
+
import os
|
|
3
|
+
import re
|
|
4
|
+
import argparse
|
|
5
|
+
import html
|
|
6
|
+
import warnings
|
|
7
|
+
import tempfile
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Dict, List, Optional, Match
|
|
10
|
+
|
|
11
|
+
# --- Suppress harmless citeproc-py schema validation warnings ---
|
|
12
|
+
warnings.filterwarnings("ignore", category=UserWarning)
|
|
13
|
+
|
|
14
|
+
# --- MONKEY PATCH FOR PYPARSING/BIBTEXPARSER COMPATIBILITY ---
|
|
15
|
+
import pyparsing
|
|
16
|
+
if not hasattr(pyparsing, 'DelimitedList'):
|
|
17
|
+
if hasattr(pyparsing, 'delimited_list'): setattr(pyparsing, 'DelimitedList', pyparsing.delimited_list)
|
|
18
|
+
elif hasattr(pyparsing, 'delimitedList'): setattr(pyparsing, 'DelimitedList', pyparsing.delimitedList)
|
|
19
|
+
|
|
20
|
+
import bibtexparser
|
|
21
|
+
from bibtexparser.bwriter import BibTexWriter
|
|
22
|
+
from docx import Document
|
|
23
|
+
from docx.text.paragraph import Paragraph
|
|
24
|
+
from docx.table import Table
|
|
25
|
+
from docx.oxml.text.paragraph import CT_P
|
|
26
|
+
from docx.oxml.table import CT_Tbl
|
|
27
|
+
from rapidfuzz import fuzz
|
|
28
|
+
|
|
29
|
+
# --- CITEPROC IMPORTS ---
|
|
30
|
+
from citeproc import CitationStylesStyle, CitationStylesBibliography
|
|
31
|
+
from citeproc import Citation, CitationItem
|
|
32
|
+
from citeproc import formatter
|
|
33
|
+
from citeproc.source.bibtex import BibTeX
|
|
34
|
+
|
|
35
|
+
# --- INTELLIGENCE DICTIONARIES ---
|
|
36
|
+
# The patterns below are searched against the text that PRECEDES a bracketed or
# superscripted number; a hit means the number is not treated as a citation.

# Three-letter amino-acid codes, e.g. "Lys" or "Lys-" directly before a number.
AA_LIST_3 = "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val"
AA_PATTERN_3 = re.compile(rf'\b(?:{AA_LIST_3})\s*-?\s*$', re.IGNORECASE)
# One-letter amino-acid codes. The class lists all twenty residues, including
# "V" (valine), so it agrees with the three-letter list above.
AA_PATTERN_1 = re.compile(r'\b[ARNDCQEGHILKMFPSTWYV]-?$')
# Units, variables and element symbols that commonly carry an exponent or index.
MATH_UNIT_LIST = ["CV", "R", "r", "m", "cm", "mm", "µm", "um", "nm", "km", "kg", "x", "y", "z", "p", "n", "k", "v", "V", "D", "Ca", "Mg", "Na", "K", "Cl", "Fe", "Zn", "Cu", "O", "H", "N", "C", "P", "S", "M", "χ", "Χ"]
MATH_UNIT_PATTERN = re.compile(rf'\b(?:{"|".join(MATH_UNIT_LIST)})\s*$')
# Scientific-notation base such as "10", "x10" or "×10" right before an exponent.
POWER_PATTERN = re.compile(r'(?:10|x10|×10|\*10)\s*$')
# Cross-reference labels ("Fig.", "Eq.", "Table", "Section", "Step", ...).
IGNORE_PREFIXES = re.compile(r'(?i:\b(?:fig(?:ure)?|eq(?:uation)?|tbl|table|section|sec|step)\s*\.?\s*)$')

# Common function words counted to decide whether a paragraph is running prose.
PROSE_STOP_WORDS = {'the', 'is', 'are', 'was', 'were', 'that', 'this', 'to', 'for', 'with',
                    'in', 'on', 'by', 'an', 'we', 'our', 'as', 'it', 'can', 'be', 'has', 'have', 'of', 'and', 'from', 'which'}

# A paragraph consisting solely of a (possibly numbered) bibliography heading.
REF_HEADER_PATTERN = re.compile(r'^\s*(?:[0-9]+\.?\s*)?(?:REFERENCES|BIBLIOGRAPHY|LITERATURE CITED|WORKS CITED)\s*$', re.IGNORECASE)
# A heading or caption used to mark the end of the reference list.
POST_REF_PATTERN = re.compile(r'^\s*(?:Tables?|Figures?|Figure Legends?|Supplementary.*?|Appendices|Data Availability|Acknowledgements?|Author Contributions?|Funding|Conflict(?:s)? of Interest|Competing Interests?|(?:Table|Figure|Fig\.?)\s*\d+.*)$', re.IGNORECASE)
|
|
49
|
+
|
|
50
|
+
# --- CITATION MANAGER CLASS ---
|
|
51
|
+
class CSLCitationManager:
    """Formats in-text citations and a bibliography with citeproc-py.

    Attributes set by ``__init__``:
        bib_source / bib_style / bibliography: the citeproc objects.
        is_superscript_style: whether citations should be rendered as superscripts.
        id_map: first integer found in each BibTeX key -> that key.
        ay_map: ``{'key', 'author', 'year'}`` records for author-year lookup.
        update_count: number of in-text citations rewritten (incremented by callers).
    """

    def __init__(self, bib_file: Path, csl_file: Path):
        """Load the BibTeX data and CSL style and build the lookup indices.

        Exits the process with status 1 when the CSL file is a dependent style
        (its root has no ``citation`` element).
        """
        self.bib_file = bib_file
        self.csl_file = csl_file
        self.update_count: int = 0

        print(f"Loading Bibliography Data and CSL style ({csl_file.name})...")

        self.bib_source = BibTeX(str(self.bib_file), encoding='utf-8')
        self.bib_style = CitationStylesStyle(str(self.csl_file))

        # The raw CSL text is needed twice below, so read it once. A failure here
        # only disables the text-based hints; it is not fatal.
        try:
            with open(self.csl_file, 'r', encoding='utf-8') as f:
                csl_text = f.read()
        except (OSError, UnicodeDecodeError):
            csl_text = ""

        # --- Dependent CSL Style Check ---
        if getattr(self.bib_style.root, 'citation', None) is None:
            match = re.search(r'<link\s+rel="independent-parent"\s+href="([^"]+)"', csl_text)
            parent_link = match.group(1) if match else None

            print(f"\n❌ ERROR: '{csl_file.name}' is a 'dependent' CSL style. It does not contain formatting rules.")
            print(f" citeproc-py requires the full independent parent style to format citations.")
            if parent_link:
                print(f" 👉 Please download the parent style instead: {parent_link.split('/')[-1]}.csl")
            sys.exit(1)

        self.bibliography = CitationStylesBibliography(self.bib_style, self.bib_source, formatter.html)

        # --- Auto-detect whether the style asks for superscripts ---
        # Fallback for well-known journals looks at the file NAME only, so a
        # directory called e.g. "science/" cannot switch the mode on by accident.
        style_name = self.csl_file.name.lower()
        self.is_superscript_style = bool(
            re.search(r'vertical-align\s*=\s*[\'"]sup[\'"]', csl_text, re.IGNORECASE)
            or any(journal in style_name for journal in ('cell', 'nature', 'lancet', 'science'))
        )

        # --- Build indices for numeric and author-year matching ---
        self.id_map: Dict[int, str] = {}
        self.ay_map: List[Dict[str, str]] = []

        for key, entry in self.bib_source.items():
            match = re.search(r'(\d+)', key)
            if match:
                self.id_map[int(match.group(1))] = key

            author = self._first_author_surname(entry.get('author', ''))
            year = self._publication_year(entry)
            if author and year:
                self.ay_map.append({'key': key, 'author': author, 'year': year})

    @staticmethod
    def _first_author_surname(authors) -> str:
        """Return the lower-cased last word of the first author's surname, or ''.

        Accepts a raw BibTeX author string ("Smith, John and Doe, Jane") as well
        as a sequence of name records exposing ``family``.
        NOTE(review): citeproc's BibTeX source appears to supply the latter
        (a list of Name objects) - confirm against the installed version.
        """
        if not authors:
            return ''
        if isinstance(authors, str):
            words = authors.split(' and ')[0].split(',')[0].split()
            return words[-1].lower() if words else ''
        try:
            first = authors[0]
        except (IndexError, KeyError, TypeError):
            return ''
        if isinstance(first, dict):
            family = first.get('family') or first.get('literal') or ''
        else:
            family = getattr(first, 'family', '') or ''
        words = str(family).split()
        return words[-1].lower() if words else ''

    @staticmethod
    def _publication_year(entry) -> str:
        """Return the four-digit 19xx/20xx year of *entry*, or ''.

        Looks at a ``year`` field first and then at the ``year`` of a dict-like
        ``issued`` field.
        NOTE(review): citeproc seems to store the BibTeX year under ``issued`` -
        confirm against the installed version.
        """
        candidates = [entry.get('year', '')]
        issued = entry.get('issued')
        if isinstance(issued, dict):
            candidates.append(issued.get('year', ''))
        for candidate in candidates:
            if not candidate:
                continue
            found = re.search(r'((?:19|20)\d{2})', str(candidate))
            if found:
                return found.group(1)
        return ''

    @staticmethod
    def _collapse_ranges(numbers) -> str:
        """Render integers as a sorted, de-duplicated list with collapsed ranges.

        Three or more consecutive numbers become "a–b" (en dash); two consecutive
        numbers stay "a, b". Returns '' for no numbers.
        """
        nums = sorted(set(numbers))
        if not nums:
            return ""

        def render(first: int, last: int) -> str:
            if last == first:
                return str(first)
            if last == first + 1:
                return f"{first}, {last}"
            return f"{first}–{last}"

        pieces = []
        start = prev = nums[0]
        for n in nums[1:]:
            if n == prev + 1:
                prev = n
                continue
            pieces.append(render(start, prev))
            start = prev = n
        pieces.append(render(start, prev))
        return ", ".join(pieces)

    def get_in_text_citation(self, keys: List[str]) -> str:
        """Register and format one in-text citation for the given BibTeX keys.

        Returns "[!!MISSING!!]" for an empty key list. Numeric output (digits
        with fewer than three letters) is re-rendered with collapsed ranges,
        wrapped in ``<sup>`` for superscript styles or in the style's own
        brackets otherwise. Other output is returned as produced by citeproc,
        wrapped in ``<sup>`` (outer brackets stripped) for superscript styles.
        """
        if not keys:
            return "[!!MISSING!!]"

        citation = Citation([CitationItem(k) for k in keys])
        self.bibliography.register(citation)

        formatted_cite = self.bibliography.cite(citation, lambda item: None)
        # Drop zero-width characters so they neither show up in Word nor skew
        # the numeric-style detection below.
        formatted_str = html.unescape(str(formatted_cite)).replace('\u200b', '').replace('\u200c', '').strip()
        clean_text = re.sub(r'<[^>]+>', '', formatted_str).strip()

        nums_raw = re.findall(r'\d+', clean_text)
        alpha_chars = re.sub(r'[^A-Za-z]', '', clean_text)
        is_numeric_style = bool(nums_raw) and len(alpha_chars) < 3

        if is_numeric_style:
            collapsed = self._collapse_ranges(int(n) for n in nums_raw)
            if self.is_superscript_style:
                return f"<sup>{collapsed}</sup>"
            # Reuse the style's own bracket characters, defaulting to [ ].
            prefix = clean_text[0] if clean_text and clean_text[0] in '[(' else '['
            suffix = clean_text[-1] if clean_text and clean_text[-1] in '])' else ']'
            return f"{prefix}{collapsed}{suffix}"

        if self.is_superscript_style and '<sup' not in formatted_str.lower():
            formatted_str = re.sub(r'^([\[\(]?)(.*?)([\]\)]?)$', r'\2', clean_text)
            return f"<sup>{formatted_str}</sup>"

        return formatted_str
|
|
150
|
+
|
|
151
|
+
# --- HELPER & DOCX PROCESSING ---
|
|
152
|
+
|
|
153
|
+
def iter_block_items(doc):
    """Yield the document body's paragraphs and tables in document order.

    Each direct child of ``doc.element.body`` that is a paragraph element is
    wrapped as ``Paragraph`` and each table element as ``Table``; any other
    child is skipped.
    """
    for node in doc.element.body:
        if isinstance(node, CT_P):
            yield Paragraph(node, doc)
            continue
        if isinstance(node, CT_Tbl):
            yield Table(node, doc)
|
|
157
|
+
|
|
158
|
+
def replace_text_preserve_formatting(para: Paragraph, pattern: re.Pattern, callback):
    """Replace every match of *pattern* in a paragraph, keeping run formatting.

    Args:
        para: Paragraph to edit in place.
        pattern: Compiled regex searched in ``para.text``.
        callback: Called once per match, left to right, BEFORE any edit is made;
            returns the replacement string (return ``match.group(0)`` to keep
            the original text).

    When the characters of ``para.runs`` line up one-to-one with ``para.text``,
    each replacement is written into the run where the match starts and the
    matched characters are removed from the other runs it covers. Otherwise the
    paragraph text is rewritten as a whole, which discards run-level formatting.
    A paragraph in which no replacement differs from its match is left untouched.
    """
    text = para.text
    matches = list(pattern.finditer(text))
    if not matches:
        return

    # Callbacks may have side effects (e.g. counters), so all of them run even
    # when their result ends up being a no-op.
    replacements = [callback(m) for m in matches]
    edits = [(m, rep) for m, rep in zip(matches, replacements) if rep != m.group(0)]
    if not edits:
        # Nothing changes: in particular do not fall into the whole-paragraph
        # rewrite below, which would needlessly flatten the formatting.
        return

    runs = para.runs
    # run_map[i] = (run index, offset inside that run) of character i.
    run_map = [(r_idx, c_idx) for r_idx, run in enumerate(runs) for c_idx in range(len(run.text))]

    if len(run_map) != len(text):
        # The paragraph text is not simply the concatenation of its runs, so
        # positions cannot be mapped; fall back to a plain-text rewrite.
        new_text = text
        for match, rep in reversed(edits):
            start, end = match.span()
            new_text = new_text[:start] + rep + new_text[end:]
        para.text = new_text
        return

    # Work right-to-left so offsets of matches still to be processed stay valid.
    for match, rep in reversed(edits):
        start, end = match.span()
        start_r_idx, start_c_idx = run_map[start]
        end_r_idx, end_c_idx = run_map[end - 1]

        if start_r_idx == end_r_idx:
            run = runs[start_r_idx]
            run.text = run.text[:start_c_idx] + rep + run.text[end_c_idx + 1:]
        else:
            # The match spans several runs: the replacement takes the formatting
            # of the first run, the runs in between are emptied and the last run
            # keeps only what follows the match.
            run_start = runs[start_r_idx]
            run_start.text = run_start.text[:start_c_idx] + rep
            for r_idx in range(start_r_idx + 1, end_r_idx):
                runs[r_idx].text = ""
            run_end = runs[end_r_idx]
            run_end.text = run_end.text[end_c_idx + 1:]
|
|
192
|
+
|
|
193
|
+
def apply_html_formatting_to_runs(para: Paragraph):
    """Turn inline HTML tags inside a paragraph's runs into native Word formatting.

    Every run whose text contains ``<i>/<em>``, ``<b>/<strong>``, ``<sup>``,
    ``<sub>`` or ``<span ...>`` tags is replaced, in place, by a sequence of new
    runs: the tags are removed and the text between them gets italic, bold,
    superscript, subscript or (for spans mentioning ``small-caps``) small-caps
    set explicitly. New runs start from the original run's values for those
    properties and copy its character style and font name.

    Only the bare tags listed above (and ``span`` with attributes) are
    recognised, so prose such as ``a<b and c>d`` is kept as literal text.
    """
    tag_pattern = re.compile(r'(</?(?:i|em|b|strong|sup|sub)\s*>|</?span(?:\s[^>]*)?>)', re.IGNORECASE)

    # Iterate over a snapshot: runs are inserted and removed while looping.
    for run in list(para.runs):
        if not run.text or '<' not in run.text:
            continue
        # With one capture group, split() alternates text (even positions) and
        # tags (odd positions).
        parts = tag_pattern.split(run.text)
        if len(parts) == 1:
            continue

        is_i = run.font.italic
        is_b = run.font.bold
        is_sup = run.font.superscript
        is_sub = run.font.subscript
        is_sc = run.font.small_caps
        font_name = run.font.name

        parent = run._element.getparent()
        idx = parent.index(run._element)

        for position, part in enumerate(parts):
            if not part:
                continue

            if position % 2 == 1:
                tag = part.lower()
                opening = not tag.startswith('</')
                tag_name = tag.strip('</>').split()[0]
                if tag_name in ('i', 'em'):
                    is_i = opening
                elif tag_name in ('b', 'strong'):
                    is_b = opening
                elif tag_name == 'sup':
                    is_sup = opening
                elif tag_name == 'sub':
                    is_sub = opening
                elif tag_name == 'span':
                    # Only small-caps spans carry formatting; any closing span
                    # ends small caps.
                    if not opening:
                        is_sc = False
                    elif 'small-caps' in tag:
                        is_sc = True
                continue

            new_run = para.add_run(part)
            # Keep the character style of the run being split (if any).
            if run.style:
                new_run.style = run.style

            # None means "inherit"; only explicit values are written.
            if is_i is not None:
                new_run.font.italic = is_i
            if is_b is not None:
                new_run.font.bold = is_b
            if is_sup is not None:
                new_run.font.superscript = is_sup
            if is_sub is not None:
                new_run.font.subscript = is_sub
            if is_sc is not None:
                new_run.font.small_caps = is_sc
            if font_name:
                new_run.font.name = font_name

            # add_run() appended the run at the end of the paragraph; move it to
            # the position of the run being replaced.
            parent.insert(idx, new_run._element)
            idx += 1

        parent.remove(run._element)
|
|
246
|
+
|
|
247
|
+
def process_paragraph_content(para: Paragraph, manager: CSLCitationManager, citation_pattern: re.Pattern, in_main_body: bool):
    """Rewrite the citations found in one paragraph, in place.

    Steps, in order:
      1. (main body only) superscript runs holding only digits, commas, spaces
         and dashes are turned into plain "[...]" text, unless the preceding
         text suggests an amino-acid position, unit/symbol, exponent or
         cross-reference label.
      2. "geometry.N", "ref.N" and "source.N" artefacts become "[N]".
      3. Numeric citations matched by *citation_pattern* are re-rendered through
         ``manager.get_in_text_citation``.
      4. Parenthesised author-year citations are matched against
         ``manager.ay_map`` and re-rendered the same way.
      5. HTML tags produced by the formatter are converted to Word formatting.

    Args:
        para: Paragraph to process.
        manager: Citation manager providing ``id_map``, ``ay_map``,
            ``update_count`` and ``get_in_text_citation``.
        citation_pattern: Regex whose group 1 is the inside of a bracketed or
            parenthesised numeric citation.
        in_main_body: Whether step 1 may run for this paragraph.
    """
    # Text of all runs seen so far; used as left context for the guards below.
    preceding_text = ""
    for run in para.runs:
        text = run.text.strip()

        # Convert native Word superscripts (e.g., ^1,2,3) to [1,2,3] so the numeric
        # citation pass below catches them; skipped when a guard pattern matches.
        if in_main_body and run.font.superscript and re.match(r'^[\d,\s\-–]+$', text):
            # A lone integer right after "10", "x10", ... is an exponent, not a citation.
            is_math_power = bool(POWER_PATTERN.search(preceding_text)) and text.isdigit()
            if not (AA_PATTERN_3.search(preceding_text) or AA_PATTERN_1.search(preceding_text) or MATH_UNIT_PATTERN.search(preceding_text) or IGNORE_PREFIXES.search(preceding_text) or is_math_power):
                # Every comma-separated part must be an integer or an "a-b" pair of integers.
                is_valid = True
                for part in text.replace('–', '-').split(','):
                    if '-' in part:
                        b = part.split('-')
                        if not (len(b) == 2 and b[0].strip().isdigit() and b[1].strip().isdigit()): is_valid = False
                    elif not part.strip().isdigit(): is_valid = False
                if is_valid:
                    run.font.superscript = False
                    run.text = f"[{text}]"

        # Uses the run's current text, i.e. including any brackets just added.
        preceding_text += run.text

    # Rewrite "geometry.N" / "ref.N" / "source.N" (case-insensitive) as "[N]".
    artifact_pattern = re.compile(r'(?:geometry|ref|source)\.(\d+)', re.IGNORECASE)
    replace_text_preserve_formatting(para, artifact_pattern, lambda m: f"[{m.group(1)}]")

    def replace_callback(match: Match) -> str:
        # Returns the formatted citation, or the matched text unchanged when the
        # match does not look like a citation or no number maps to a known key.
        preceding = para.text[:match.start()]
        if (AA_PATTERN_3.search(preceding) or AA_PATTERN_1.search(preceding) or MATH_UNIT_PATTERN.search(preceding) or IGNORE_PREFIXES.search(preceding)):
            return match.group(0)

        raw_inner = match.group(1).replace('–', '-')
        # A parenthesised lone number in 1900-2100 is kept as is (it reads as a year).
        if match.group(0).startswith('(') and raw_inner.isdigit() and 1900 <= int(raw_inner) <= 2100: return match.group(0)

        # Expand "1,3-5" into [1, 3, 4, 5]; any malformed part, descending range or
        # range spanning 50 or more aborts the replacement.
        oids = []
        for part in raw_inner.split(','):
            part = part.strip()
            if '-' in part:
                bounds = part.split('-')
                if len(bounds) == 2 and bounds[0].strip().isdigit() and bounds[1].strip().isdigit():
                    start, end = int(bounds[0].strip()), int(bounds[1].strip())
                    if start <= end and (end - start) < 50: oids.extend(range(start, end + 1))
                    else: return match.group(0)
                else: return match.group(0)
            else:
                if not part.isdigit(): return match.group(0)
                oids.append(int(part))

        # Numbers without a BibTeX key are silently dropped from the citation.
        valid_keys = [manager.id_map[o] for o in oids if o in manager.id_map]
        if not valid_keys: return match.group(0)

        manager.update_count += 1
        return manager.get_in_text_citation(valid_keys)

    replace_text_preserve_formatting(para, citation_pattern, replace_callback)

    # Parenthesised text starting with a letter and ending in a 19xx/20xx year
    # (optionally followed by one lower-case letter), without nested parentheses.
    ay_pattern = re.compile(r'\(([A-Za-z][^()]*?(?:19|20)\d{2}[a-z]?)\)')
    def replace_ay_callback(match: Match) -> str:
        # Returns the formatted citation only if EVERY ';'-separated part that
        # contains a year was matched to a bibliography entry.
        raw_inner = match.group(1)
        # '=' or '+' inside the parentheses: treated as not a citation.
        if '=' in raw_inner or '+' in raw_inner: return match.group(0)

        parts, matched_keys, valid_part_count = raw_inner.split(';'), [], 0
        for part in parts:
            year_match = re.search(r'((?:19|20)\d{2})', part)
            if not year_match: continue
            valid_part_count += 1
            # What remains after removing the year, "et al.", commas and ampersands.
            author_text = re.sub(r'(?:19|20)\d{2}[a-z]?|et al\.?|,|&', '', part).strip().lower()

            # Best fuzzy author match among entries of the same year; the score
            # must be strictly above 80.
            best_match, best_score = None, 80
            for item in manager.ay_map:
                if item['year'] == year_match.group(1):
                    score = fuzz.partial_ratio(author_text, item['author'])
                    if score > best_score:
                        best_score = score
                        best_match = item['key']

            if best_match: matched_keys.append(best_match)

        if valid_part_count > 0 and len(matched_keys) == valid_part_count:
            manager.update_count += 1
            return manager.get_in_text_citation(matched_keys)
        return match.group(0)

    replace_text_preserve_formatting(para, ay_pattern, replace_ay_callback)

    # Process html tags into Word document natively
    apply_html_formatting_to_runs(para)
|
|
332
|
+
|
|
333
|
+
def write_rich_bibliography_entry(doc: Document, html_text: str, main_font: Optional[str], insert_cursor: Optional[Paragraph] = None):
    """Write one HTML-formatted bibliography entry as a new Word paragraph.

    The paragraph is inserted before *insert_cursor* when one is given and
    appended to *doc* otherwise. Doubled full stops are collapsed, a
    non-breaking space is placed after the entry label, and the HTML is split
    into tags and text: tags toggle bold / italic / superscript / subscript /
    small-caps state, every text piece becomes a run carrying the current state
    (and *main_font*, when set).
    """
    nbsp = chr(160)
    if insert_cursor is None:
        target = doc.add_paragraph()
    else:
        target = insert_cursor.insert_paragraph_before()

    # Tidy up punctuation and glue the label to the entry text.
    markup = html_text.replace('.. ', '. ')
    markup = markup.replace('..<', '.<')
    markup = markup.replace('</div><div class="csl-right-inline">', '</div><div class="csl-right-inline">' + nbsp)
    markup = re.sub(r'^((?:<[^>]+>|\s)*)(\[\d+\]|\d+\.)\s*(<[^>]+>)?\s*([A-Za-z])', r'\1\2' + nbsp + r'\3\4', markup)

    # Checked in this order; the first matching prefix wins.
    toggles = (
        (('<b', '<strong'), 'bold', True),
        (('</b', '</strong'), 'bold', False),
        (('<i', '<em'), 'italic', True),
        (('</i', '</em'), 'italic', False),
        (('<sup',), 'sup', True),
        (('</sup',), 'sup', False),
        (('<sub',), 'sub', True),
        (('</sub',), 'sub', False),
    )
    state = {'bold': False, 'italic': False, 'smallcaps': False, 'sup': False, 'sub': False}

    for piece in re.split(r'(<[^>]+>)', markup):
        if not piece:
            continue
        lowered = piece.lower()

        if lowered.startswith('<'):
            for prefixes, flag, value in toggles:
                if lowered.startswith(prefixes):
                    state[flag] = value
                    break
            else:
                if 'small-caps' in lowered and not lowered.startswith('</'):
                    state['smallcaps'] = True
                elif lowered.startswith('</span'):
                    state['smallcaps'] = False
            continue

        content = html.unescape(piece)
        if not content:
            continue
        new_run = target.add_run(content)
        new_run.bold = state['bold']
        new_run.italic = state['italic']
        if state['smallcaps']:
            new_run.font.small_caps = True
        if state['sup']:
            new_run.font.superscript = True
        if state['sub']:
            new_run.font.subscript = True
        if main_font:
            new_run.font.name = main_font
|
|
366
|
+
|
|
367
|
+
def process_document(docx_path: Path, output_path: Path, manager: CSLCitationManager):
    """Re-format all citations of a Word document and rebuild its reference list.

    Every body paragraph (and table cell paragraph, once the main body has
    started) up to the references heading is passed to
    ``process_paragraph_content``. The old reference list - from the heading to
    the first post-reference heading, or to the end of the document - is then
    removed and replaced by a "References" heading followed by the bibliography
    produced by ``manager``. Without a references heading nothing is removed and
    the bibliography is appended to the document.

    Args:
        docx_path: Input document.
        output_path: Where the result is saved.
        manager: Citation manager used for formatting and counting updates.
    """
    print(f"\nProcessing document: {docx_path.name}")
    doc = Document(str(docx_path))
    citation_pattern = re.compile(r'[\[\(]([\d\s,\-–]+)[\]\)]')

    # doc.paragraphs builds a new list on each access, so take one snapshot.
    paragraphs = doc.paragraphs

    # First explicitly named run font in the document; reused for new text.
    main_font = next((r.font.name for p in paragraphs for r in p.runs if r.font.name), None)
    ref_header_element = next((p._element for p in paragraphs if REF_HEADER_PATTERN.match(p.text)), None)

    # The main body starts at a known section heading, at the first paragraph
    # that reads like prose, or at the latest after 25 body blocks.
    in_main_body = False
    block_counter = 0
    body_headings = ('abstract', 'introduction', 'background', 'summary', 'methods', 'results')

    for block in iter_block_items(doc):
        block_counter += 1
        if isinstance(block, Paragraph):
            if ref_header_element is not None and block._element is ref_header_element:
                break
            if not in_main_body:
                text_clean = block.text.strip().lower()
                if text_clean in body_headings:
                    in_main_body = True
                else:
                    words = re.findall(r'\b[a-z]+\b', text_clean)
                    stop_word_hits = sum(1 for w in words if w in PROSE_STOP_WORDS)
                    if len(words) >= 25 and stop_word_hits >= 5:
                        in_main_body = True
                if block_counter > 25:
                    in_main_body = True

            process_paragraph_content(block, manager, citation_pattern, in_main_body)

        elif isinstance(block, Table):
            if not in_main_body:
                continue
            for row in block.rows:
                for cell in row.cells:
                    for para in cell.paragraphs:
                        process_paragraph_content(para, manager, citation_pattern, in_main_body)

    # Fresh snapshot for locating and removing the old reference list.
    paragraphs = doc.paragraphs
    ref_header_index = next((i for i, p in enumerate(paragraphs) if p._element is ref_header_element), -1)

    insert_cursor = None
    # Stays empty when the document has no references heading.
    paragraphs_to_remove = []
    if ref_header_index != -1:
        post_ref_index = -1
        for i in range(ref_header_index + 1, len(paragraphs)):
            text = paragraphs[i].text.strip()
            if text and POST_REF_PATTERN.match(text):
                post_ref_index = i
                break

        if post_ref_index != -1:
            paragraphs_to_remove = paragraphs[ref_header_index:post_ref_index]
            # New reference paragraphs are inserted before the section that
            # follows the list; that section then starts on a new page.
            insert_cursor = paragraphs[post_ref_index]
            insert_cursor.paragraph_format.page_break_before = True
        else:
            paragraphs_to_remove = paragraphs[ref_header_index:]

    for p in paragraphs_to_remove:
        parent = p._element.getparent()
        if parent is not None:
            parent.remove(p._element)

    heading = insert_cursor.insert_paragraph_before() if insert_cursor is not None else doc.add_paragraph()
    heading.paragraph_format.page_break_before = True
    run = heading.add_run('References')
    run.bold = True
    if main_font:
        run.font.name = main_font

    print(f" -> Rebuilding Rich-Text Bibliography via CSL...")
    manager.bibliography.sort()
    for entry in manager.bibliography.bibliography():
        write_rich_bibliography_entry(doc, str(entry), main_font, insert_cursor)

    doc.save(str(output_path))
    print(f"Success! Saved to {output_path.name}")
    print(f" -> Tracked and dynamically updated {manager.update_count} in-text citations.")
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
def main():
    """Command-line entry point: resolve the CSL style and re-format the document.

    The CSL file is looked up as given, then by file name in
    ``~/citation_styles``, then (when it has no ``.csl`` suffix) by file name
    plus ``.csl`` in that directory. The output is written next to the input
    document as ``<stem>_final_<style>.docx``.

    Exits with status 1 when the BibTeX file, the document or the CSL style
    cannot be found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("bib", type=Path, help="Verified .bib file")
    parser.add_argument("doc", type=Path, help="Input .docx file")
    parser.add_argument("--csl", type=Path, required=True, help="Path to the CSL style file")
    args = parser.parse_args()

    # Fail early with a clear message instead of a traceback from the loaders.
    if not args.bib.exists():
        print(f"❌ ERROR: BibTeX file '{args.bib}' not found.")
        sys.exit(1)
    if not args.doc.exists():
        print(f"❌ ERROR: Document '{args.doc}' not found.")
        sys.exit(1)

    csl_path = args.csl
    default_csl_dir = Path("~/citation_styles").expanduser()

    if not csl_path.exists():
        alt_path = default_csl_dir / csl_path.name
        if alt_path.exists():
            csl_path = alt_path
        elif csl_path.suffix != '.csl':
            alt_path_ext = default_csl_dir / f"{csl_path.name}.csl"
            if alt_path_ext.exists():
                csl_path = alt_path_ext

    if not csl_path.exists():
        print(f"❌ ERROR: CSL file '{args.csl}' not found locally or in {default_csl_dir}.")
        sys.exit(1)

    output = args.doc.with_name(f"{args.doc.stem}_final_{csl_path.stem}.docx")
    mgr = CSLCitationManager(args.bib, csl_path)
    process_document(args.doc, output, mgr)


if __name__ == "__main__":
    main()
|