ref-management 1.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,3 @@
1
+ """ref_management – Manuscript Reference Toolkit (ARM)."""
2
+
3
+ __version__ = "1.0.0"
@@ -0,0 +1,126 @@
1
+ import sys
2
+ import re
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ # --- MONKEY PATCH FOR PYPARSING/BIBTEXPARSER COMPATIBILITY ---
7
+ import pyparsing
8
+ if not hasattr(pyparsing, 'DelimitedList'):
9
+ if hasattr(pyparsing, 'delimited_list'): setattr(pyparsing, 'DelimitedList', pyparsing.delimited_list)
10
+ elif hasattr(pyparsing, 'delimitedList'): setattr(pyparsing, 'DelimitedList', pyparsing.delimitedList)
11
+
12
+ import bibtexparser
13
+ from docx import Document
14
+ from rapidfuzz import fuzz
15
+
16
# Matches a paragraph that consists solely of a reference-section heading,
# optionally preceded by a section number (e.g. "7. References").
REF_HEADER_PATTERN = re.compile(r'^\s*(?:[0-9]+\.?\s*)?(?:REFERENCES|BIBLIOGRAPHY|LITERATURE CITED|WORKS CITED)\s*$', re.IGNORECASE)

# Matches the first heading or caption that follows the reference list.
# Both "Acknowledgements" and the US spelling "Acknowledgments" are accepted,
# as are singular "Appendix" and plural "Appendices"; otherwise those headings
# would be treated as part of the reference list.
POST_REF_PATTERN = re.compile(r'^\s*(?:Tables?|Figures?|Figure Legends?|Supplementary.*?|Appendi(?:x|ces)|Data Availability|Acknowledge?ments?|Author Contributions?|Funding|Conflict(?:s)? of Interest|Competing Interests?|(?:Table|Figure|Fig\.?)\s*\d+.*)$', re.IGNORECASE)
18
+
19
def clean_for_match(text: str) -> str:
    """Return a lowercase, punctuation-free form of *text* for fuzzy comparison.

    Curly braces (BibTeX capitalisation guards) and every character that is
    neither a word character nor whitespace are dropped, then leading and
    trailing whitespace is trimmed. Interior whitespace is left untouched.
    Falsy input (``None`` or ``""``) yields ``""``.
    """
    if not text:
        return ""
    without_braces = "".join(ch for ch in text if ch not in "{}")
    lowered = without_braces.lower()
    letters_and_spaces = re.sub(r'[^\w\s]', '', lowered)
    return letters_and_spaces.strip()
24
+
25
# Recognises a DOI that is already present in a reference paragraph: a
# doi.org / dx.doi.org resolver URL, a "doi:" label, or a bare "10.NNNN/..." DOI.
_EXISTING_DOI_PATTERN = re.compile(r'\bhttps?://(?:dx\.)?doi\.org\b|\bdoi:|\b10\.\d{4,9}/\S+', re.IGNORECASE)

# Leading resolver URL(s) and/or "doi:" label(s) in a BibTeX ``doi`` field.
_DOI_PREFIX_PATTERN = re.compile(r'^(?:(?:https?://(?:dx\.)?doi\.org/|doi:)\s*)+', re.IGNORECASE)


def _normalize_doi(raw_doi: str) -> str:
    """Return *raw_doi* without any leading resolver URL or ``doi:`` label."""
    return _DOI_PREFIX_PATTERN.sub('', raw_doi.strip()).strip()


def process_document(bib_path: Path, docx_path: Path, output_path: Path):
    """Append ``https://doi.org/<doi>`` to reference paragraphs that lack a DOI.

    The BibTeX file supplies (title, DOI) pairs. Each paragraph between the
    References heading and the first post-reference heading is fuzzy-matched
    against the cleaned titles; the best title scoring above 85 contributes its
    DOI. Paragraphs shorter than 20 characters, and paragraphs that already
    show a DOI, are left alone. The patched document is saved to *output_path*.

    Args:
        bib_path: BibTeX file whose entries carry ``doi`` and ``title`` fields.
        docx_path: Word document to scan.
        output_path: Destination of the patched copy.

    Exits the process with status 1 if either input cannot be read or no
    References heading is found.
    """
    print(f"\nReading verified BibTeX: {bib_path.name}...")
    try:
        with open(bib_path, 'r', encoding='utf-8') as f:
            bib_db = bibtexparser.load(f)
    except Exception as e:
        print(f"❌ ERROR reading BibTeX: {e}")
        sys.exit(1)

    # Build an index of cleaned titles to bare DOIs.
    doi_map = {}
    for entry in bib_db.entries:
        # Normalising here avoids emitting links such as
        # "https://doi.org/http://dx.doi.org/10...." further down.
        doi = _normalize_doi(entry.get('doi', ''))
        title_key = clean_for_match(entry.get('title', '').strip())
        # A title that cleans to "" would be useless as a fuzzy-match key.
        if doi and title_key:
            doi_map[title_key] = doi

    print(f"Loaded {len(doi_map)} DOIs from BibTeX.")
    print(f"Scanning document: {docx_path.name}...")
    try:
        doc = Document(str(docx_path))
    except Exception as e:
        print(f"❌ ERROR reading document: {e}")
        sys.exit(1)

    # Take the paragraph list once; no paragraphs are added or removed below,
    # only runs inside existing paragraphs.
    paragraphs = doc.paragraphs

    # 1. Find the boundaries of the References section.
    ref_start_idx = -1
    for i, p in enumerate(paragraphs):
        if REF_HEADER_PATTERN.match(p.text):
            ref_start_idx = i
            break

    if ref_start_idx == -1:
        print("❌ ERROR: Could not locate 'References' header in the document.")
        sys.exit(1)

    ref_end_idx = len(paragraphs)
    for i in range(ref_start_idx + 1, len(paragraphs)):
        heading = paragraphs[i].text.strip()
        if heading and POST_REF_PATTERN.match(heading):
            ref_end_idx = i
            break

    # 2. Iterate through the references and append DOIs.
    added_count = 0
    already_had_count = 0

    for para in paragraphs[ref_start_idx + 1:ref_end_idx]:
        text = para.text.strip()

        # Skip empty lines or very short fragments.
        if len(text) < 20:
            continue

        if _EXISTING_DOI_PATTERN.search(text):
            already_had_count += 1
            continue

        # The title is a substring of the full reference, hence partial_ratio.
        best_match_doi = None
        best_score = 85  # a candidate must score strictly above this

        para_clean = clean_for_match(text)
        for bib_title, doi in doi_map.items():
            score = fuzz.partial_ratio(bib_title, para_clean)
            if score > best_score:
                best_score = score
                best_match_doi = doi

        if best_match_doi:
            # Terminate the reference before appending the link.
            if not text.endswith('.'):
                para.add_run('.')
            para.add_run(f" https://doi.org/{best_match_doi}")
            added_count += 1

    # 3. Save the patched draft.
    doc.save(str(output_path))
    print(f"\nSuccess! Saved to {output_path.name}")
    print(f" -> Found {already_had_count} references that already had DOIs.")
    print(f" -> Dynamically matched and injected {added_count} missing DOIs.")
108
+
109
def main():
    """Command-line entry point for the DOI appender.

    Parses the ``bib`` and ``doc`` positional arguments, exits with status 1
    if either file is missing, and otherwise writes
    ``<doc stem>_with_DOIs.docx`` next to the input document.
    """
    cli = argparse.ArgumentParser(description="Appends DOIs to the References section of an intermediate draft.")
    cli.add_argument("bib", type=Path, help="The verified .bib file containing the DOIs")
    cli.add_argument("doc", type=Path, help="The intermediate .docx file")
    opts = cli.parse_args()

    # Checked in this order so a missing .bib is reported first.
    required_inputs = (
        (opts.bib, f"❌ ERROR: BibTeX file '{opts.bib}' not found."),
        (opts.doc, f"❌ ERROR: Document '{opts.doc}' not found."),
    )
    for candidate, complaint in required_inputs:
        if not candidate.exists():
            print(complaint)
            sys.exit(1)

    destination = opts.doc.with_name(f"{opts.doc.stem}_with_DOIs.docx")
    process_document(opts.bib, opts.doc, destination)
124
+
125
+ if __name__ == "__main__":
126
+ main()
@@ -0,0 +1,471 @@
1
+ import sys
2
+ import os
3
+ import re
4
+ import argparse
5
+ import html
6
+ import warnings
7
+ import tempfile
8
+ from pathlib import Path
9
+ from typing import Dict, List, Optional, Match
10
+
11
+ # --- Suppress harmless citeproc-py schema validation warnings ---
12
+ warnings.filterwarnings("ignore", category=UserWarning)
13
+
14
+ # --- MONKEY PATCH FOR PYPARSING/BIBTEXPARSER COMPATIBILITY ---
15
+ import pyparsing
16
+ if not hasattr(pyparsing, 'DelimitedList'):
17
+ if hasattr(pyparsing, 'delimited_list'): setattr(pyparsing, 'DelimitedList', pyparsing.delimited_list)
18
+ elif hasattr(pyparsing, 'delimitedList'): setattr(pyparsing, 'DelimitedList', pyparsing.delimitedList)
19
+
20
+ import bibtexparser
21
+ from bibtexparser.bwriter import BibTexWriter
22
+ from docx import Document
23
+ from docx.text.paragraph import Paragraph
24
+ from docx.table import Table
25
+ from docx.oxml.text.paragraph import CT_P
26
+ from docx.oxml.table import CT_Tbl
27
+ from rapidfuzz import fuzz
28
+
29
+ # --- CITEPROC IMPORTS ---
30
+ from citeproc import CitationStylesStyle, CitationStylesBibliography
31
+ from citeproc import Citation, CitationItem
32
+ from citeproc import formatter
33
+ from citeproc.source.bibtex import BibTeX
34
+
35
+ # --- INTELLIGENCE DICTIONARIES ---
36
# Heuristics that decide whether a number is NOT a citation. Each pattern is
# searched against the text immediately preceding the number.

# Three-letter amino-acid codes, e.g. "Lys" before a residue number.
AA_LIST_3 = "Ala|Arg|Asn|Asp|Cys|Gln|Glu|Gly|His|Ile|Leu|Lys|Met|Phe|Pro|Ser|Thr|Trp|Tyr|Val"
AA_PATTERN_3 = re.compile(rf'\b(?:{AA_LIST_3})\s*-?\s*$', re.IGNORECASE)
# One-letter codes for the same twenty amino acids; "V" (Val) is included so
# that this set agrees with AA_LIST_3.
AA_PATTERN_1 = re.compile(r'\b[ARNDCQEGHILKMFPSTWYV]-?$')
# Units, variables and element symbols that commonly carry a superscript or
# bracketed number which is an exponent/charge rather than a citation.
MATH_UNIT_LIST = ["CV", "R", "r", "m", "cm", "mm", "µm", "um", "nm", "km", "kg", "x", "y", "z", "p", "n", "k", "v", "V", "D", "Ca", "Mg", "Na", "K", "Cl", "Fe", "Zn", "Cu", "O", "H", "N", "C", "P", "S", "M", "χ", "Χ"]
MATH_UNIT_PATTERN = re.compile(rf'\b(?:{"|".join(MATH_UNIT_LIST)})\s*$')
# A trailing "10" / "x10" / "×10" / "*10": the following superscript is a power.
POWER_PATTERN = re.compile(r'(?:10|x10|×10|\*10)\s*$')
# Cross-reference labels such as "Fig." or "Table": the following number is
# the figure/table/section number.
IGNORE_PREFIXES = re.compile(r'(?i:\b(?:fig(?:ure)?|eq(?:uation)?|tbl|table|section|sec|step)\s*\.?\s*)$')

# Common English function words; their frequency in a paragraph is used to
# recognise running prose.
PROSE_STOP_WORDS = {'the', 'is', 'are', 'was', 'were', 'that', 'this', 'to', 'for', 'with',
                    'in', 'on', 'by', 'an', 'we', 'our', 'as', 'it', 'can', 'be', 'has', 'have', 'of', 'and', 'from', 'which'}
46
+
47
# Matches a paragraph that consists solely of a reference-section heading,
# optionally preceded by a section number (e.g. "7. References").
REF_HEADER_PATTERN = re.compile(r'^\s*(?:[0-9]+\.?\s*)?(?:REFERENCES|BIBLIOGRAPHY|LITERATURE CITED|WORKS CITED)\s*$', re.IGNORECASE)

# Matches the first heading or caption that follows the reference list.
# Both "Acknowledgements" and the US spelling "Acknowledgments" are accepted,
# as are singular "Appendix" and plural "Appendices"; a heading that is not
# recognised here is treated as part of the old reference list.
POST_REF_PATTERN = re.compile(r'^\s*(?:Tables?|Figures?|Figure Legends?|Supplementary.*?|Appendi(?:x|ces)|Data Availability|Acknowledge?ments?|Author Contributions?|Funding|Conflict(?:s)? of Interest|Competing Interests?|(?:Table|Figure|Fig\.?)\s*\d+.*)$', re.IGNORECASE)
49
+
50
+ # --- CITATION MANAGER CLASS ---
51
class CSLCitationManager:
    """Render in-text citations and a bibliography for one BibTeX file under one CSL style.

    Construction loads both files through citeproc, exits the process with
    status 1 when the style has no ``citation`` element (a "dependent" style),
    decides whether citations should be superscripted, and builds two indices:

    * ``id_map`` -- first integer found in a BibTeX key -> that key
    * ``ay_map`` -- one ``{'key', 'author', 'year'}`` record per entry for
      which a first-author surname and a four-digit year could be extracted

    ``update_count`` starts at 0; code outside this class increments it for
    every citation it rewrites.
    """

    # Fragments of style file names that are treated as superscript styles
    # when the CSL text itself does not request superscripts.
    _SUPERSCRIPT_STYLE_HINTS = ('cell', 'nature', 'lancet', 'science')

    def __init__(self, bib_file: Path, csl_file: Path):
        """Load *bib_file* and *csl_file* and build the lookup indices."""
        self.bib_file = bib_file
        self.csl_file = csl_file
        self.update_count: int = 0

        print(f"Loading Bibliography Data and CSL style ({csl_file.name})...")

        # Load the corrected BibTeX directly from the verified file.
        self.bib_source = BibTeX(str(self.bib_file), encoding='utf-8')
        self.bib_style = CitationStylesStyle(str(self.csl_file))

        # Raw style text, read once; None when it cannot be read as UTF-8.
        csl_text = self._read_csl_text()

        # --- Dependent CSL style check ---
        if getattr(self.bib_style.root, 'citation', None) is None:
            parent_link = None
            if csl_text:
                match = re.search(r'<link\s+rel="independent-parent"\s+href="([^"]+)"', csl_text)
                if match:
                    parent_link = match.group(1)

            print(f"\n❌ ERROR: '{csl_file.name}' is a 'dependent' CSL style. It does not contain formatting rules.")
            print(f" citeproc-py requires the full independent parent style to format citations.")
            if parent_link:
                print(f" 👉 Please download the parent style instead: {parent_link.split('/')[-1]}.csl")
            sys.exit(1)

        self.bibliography = CitationStylesBibliography(self.bib_style, self.bib_source, formatter.html)

        self.is_superscript_style = self._detect_superscript_style(csl_text)

        # --- Indices for numeric and author-year matching ---
        self.id_map: Dict[int, str] = {}
        self.ay_map: List[Dict[str, str]] = []

        for key, entry in self.bib_source.items():
            match = re.search(r'(\d+)', key)
            if match:
                self.id_map[int(match.group(1))] = key

            surname = self._first_author_surname(entry.get('author', ''))
            year = self._publication_year(entry)
            if surname and year:
                self.ay_map.append({'key': key, 'author': surname, 'year': year})

    def _read_csl_text(self) -> Optional[str]:
        """Return the CSL file's text, or None if it cannot be read as UTF-8."""
        try:
            with open(self.csl_file, 'r', encoding='utf-8') as f:
                return f.read()
        except (OSError, UnicodeDecodeError):
            return None

    def _detect_superscript_style(self, csl_text: Optional[str]) -> bool:
        """Decide whether in-text citations should be wrapped in ``<sup>``.

        True when the style text contains ``vertical-align="sup"``, or when the
        style's *file name* contains one of ``_SUPERSCRIPT_STYLE_HINTS``. Only
        the file name is inspected, so a directory such as ``.../science/``
        does not turn every style into a superscript style.
        """
        if csl_text is None:
            return False
        if re.search(r'vertical-align\s*=\s*[\'"]sup[\'"]', csl_text, re.IGNORECASE):
            return True
        style_name = self.csl_file.name.lower()
        return any(hint in style_name for hint in self._SUPERSCRIPT_STYLE_HINTS)

    @staticmethod
    def _first_author_surname(authors) -> str:
        """Return the lower-cased surname of the first author, or ``""``.

        Accepts a raw BibTeX author string ("Last, First and ...") or a
        sequence of mapping-like name objects exposing ``family``/``literal``.
        NOTE(review): the second shape is what citeproc's BibTeX source is
        believed to store for ``author`` -- confirm against the installed version.
        """
        if isinstance(authors, str):
            tokens = authors.split(' and ')[0].split(',')[0].split()
            return tokens[-1].lower() if tokens else ''
        try:
            first = authors[0]
        except (IndexError, KeyError, TypeError):
            return ''
        lookup = getattr(first, 'get', None)
        if not callable(lookup):
            return ''
        tokens = str(lookup('family') or lookup('literal') or '').split()
        return tokens[-1].lower() if tokens else ''

    @staticmethod
    def _publication_year(entry) -> str:
        """Return the entry's four-digit year (19xx/20xx) as a string, or ``""``.

        Reads ``year`` first and falls back to ``issued['year']``.
        NOTE(review): citeproc's BibTeX source is believed to fold ``year``
        into an ``issued`` date object -- confirm against the installed version.
        """
        raw_year = entry.get('year', '')
        if not raw_year:
            issued = entry.get('issued')
            lookup = getattr(issued, 'get', None)
            raw_year = lookup('year', '') if callable(lookup) else ''
        found = re.search(r'((?:19|20)\d{2})', str(raw_year))
        return found.group(1) if found else ''

    @staticmethod
    def _collapse_numeric(numbers) -> str:
        """Render integers as a sorted, de-duplicated, range-collapsed list.

        Runs of three or more consecutive numbers become ``a–b`` (en dash),
        a run of two becomes ``a, b``; groups are joined with ``", "``.
        """
        nums = sorted(set(numbers))
        if not nums:
            return ""

        def render(first: int, last: int) -> str:
            if last == first:
                return str(first)
            if last == first + 1:
                return f"{first}, {last}"
            return f"{first}–{last}"

        groups = []
        start = prev = nums[0]
        for n in nums[1:]:
            if n == prev + 1:
                prev = n
            else:
                groups.append(render(start, prev))
                start = prev = n
        groups.append(render(start, prev))
        return ", ".join(groups)

    def get_in_text_citation(self, keys: List[str]) -> str:
        """Register a citation for *keys* and return its in-text rendering.

        Returns ``"[!!MISSING!!]"`` for an empty key list. The result may
        contain HTML tags (``<sup>``, ``<i>`` ...). A rendering with digits
        and fewer than three ASCII letters is treated as numeric and rebuilt
        as a collapsed number list, superscripted or wrapped in the brackets
        the style produced (``[`` ``]`` when it produced none).
        """
        if not keys:
            return "[!!MISSING!!]"

        citation = Citation([CitationItem(k) for k in keys])
        self.bibliography.register(citation)

        formatted_cite = self.bibliography.cite(citation, lambda item: None)
        # Drop zero-width characters so they cannot hide in the output text.
        formatted_str = html.unescape(str(formatted_cite)).replace('\u200b', '').replace('\u200c', '').strip()
        clean_text = re.sub(r'<[^>]+>', '', formatted_str).strip()

        nums_raw = re.findall(r'\d+', clean_text)
        alpha_chars = re.sub(r'[^A-Za-z]', '', clean_text)
        is_numeric_style = bool(nums_raw) and len(alpha_chars) < 3

        if is_numeric_style:
            collapsed = self._collapse_numeric(int(n) for n in nums_raw)
            if self.is_superscript_style:
                return f"<sup>{collapsed}</sup>"
            prefix = clean_text[0] if clean_text and clean_text[0] in '[(' else '['
            suffix = clean_text[-1] if clean_text and clean_text[-1] in '])' else ']'
            return f"{prefix}{collapsed}{suffix}"

        if self.is_superscript_style and '<sup' not in formatted_str.lower():
            # Strip one pair of enclosing brackets before superscripting.
            formatted_str = re.sub(r'^([\[\(]?)(.*?)([\]\)]?)$', r'\2', clean_text)
            return f"<sup>{formatted_str}</sup>"

        return formatted_str
150
+
151
+ # --- HELPER & DOCX PROCESSING ---
152
+
153
def iter_block_items(doc):
    """Yield the direct children of the document body, in order.

    Paragraph elements are wrapped as ``Paragraph`` and table elements as
    ``Table``; any other child element is skipped.
    """
    for element in doc.element.body:
        if isinstance(element, CT_Tbl):
            yield Table(element, doc)
        elif isinstance(element, CT_P):
            yield Paragraph(element, doc)
157
+
158
def replace_text_preserve_formatting(para: Paragraph, pattern: re.Pattern, callback):
    """Replace every match of *pattern* in *para*, keeping run formatting where possible.

    Args:
        para: Paragraph-like object exposing ``text`` and ``runs``.
        pattern: Compiled regular expression searched in ``para.text``.
        callback: Called once per match, in document order, with the match
            object; returns the replacement string. Returning the matched text
            unchanged means "leave this occurrence alone".

    When the concatenated run texts line up character-for-character with
    ``para.text``, each replacement is written into the run where the match
    starts and the rest of the match is cut from the following runs.
    Otherwise the function falls back to assigning ``para.text`` as a whole.
    If no callback changes anything the paragraph is not touched at all, so a
    declined match can never trigger that wholesale rewrite.
    """
    text = para.text
    matches = list(pattern.finditer(text))
    if not matches:
        return

    # Run every callback first, left to right: callbacks may have side effects
    # whose order matters, and they must see the unmodified paragraph.
    replacements = [callback(m) for m in matches]

    # Keep only real changes, then apply right-to-left so that offsets of the
    # matches still to be processed stay valid.
    pending = [(m, rep) for m, rep in zip(matches, replacements) if rep != m.group(0)]
    if not pending:
        return
    pending.reverse()

    runs = para.runs
    # run_map[i] = (run index, offset inside that run) of character i.
    run_map = [(r_idx, c_idx) for r_idx, run in enumerate(runs) for c_idx in range(len(run.text))]

    if len(run_map) != len(text):
        # The runs do not account for all of the paragraph text, so positions
        # cannot be mapped onto runs; rewrite the paragraph text as a whole.
        new_text = text
        for match, rep in pending:
            start, end = match.span()
            new_text = new_text[:start] + rep + new_text[end:]
        para.text = new_text
        return

    for match, rep in pending:
        start, end = match.span()
        start_r_idx, start_c_idx = run_map[start]
        end_r_idx, end_c_idx = run_map[end - 1]

        if start_r_idx == end_r_idx:
            run = runs[start_r_idx]
            run.text = run.text[:start_c_idx] + rep + run.text[end_c_idx + 1:]
        else:
            # The replacement takes the formatting of the run where the match begins.
            run_start = runs[start_r_idx]
            run_start.text = run_start.text[:start_c_idx] + rep
            for r_idx in range(start_r_idx + 1, end_r_idx):
                runs[r_idx].text = ""
            run_end = runs[end_r_idx]
            run_end.text = run_end.text[end_c_idx + 1:]
192
+
193
def apply_html_formatting_to_runs(para: Paragraph):
    """Turn inline HTML tags inside the paragraph's runs into native run formatting.

    Every run whose text contains one of the recognised tags (i, em, b,
    strong, sup, sub, span) is split at the tags. Each text piece becomes a
    new run inserted at the original run's position, carrying the original
    run's character style, font name and italic/bold/superscript/subscript/
    small-caps state as modified by the tags seen so far in that run; the
    original run is then removed. Runs without such tags are left untouched.
    """
    tag_pattern = re.compile(r'(</?(?:i|em|b|strong|sup|sub|span)[^>]*>)', re.IGNORECASE)

    # Iterate over a snapshot: runs are inserted and removed while looping.
    for source_run in list(para.runs):
        raw = source_run.text
        if not raw or '<' not in raw:
            continue
        pieces = tag_pattern.split(raw)
        if len(pieces) == 1:
            continue

        # Formatting state starts from the run being split (None = inherit).
        source_font = source_run.font
        italic = source_font.italic
        bold = source_font.bold
        superscript = source_font.superscript
        subscript = source_font.subscript
        small_caps = source_font.small_caps
        font_name = source_font.name

        container = source_run._element.getparent()
        position = container.index(source_run._element)

        for piece in pieces:
            if not piece:
                continue
            lowered = piece.lower()

            if lowered.startswith('<') and lowered.endswith('>'):
                # A tag only updates the state; it produces no run of its own.
                if lowered.startswith(('<i', '<em')):
                    italic = True
                elif lowered.startswith(('</i', '</em')):
                    italic = False
                elif lowered.startswith(('<b', '<strong')):
                    bold = True
                elif lowered.startswith(('</b', '</strong')):
                    bold = False
                elif lowered.startswith('<sup'):
                    superscript = True
                elif lowered.startswith('</sup'):
                    superscript = False
                elif lowered.startswith('<sub'):
                    subscript = True
                elif lowered.startswith('</sub'):
                    subscript = False
                elif 'small-caps' in lowered and not lowered.startswith('</'):
                    small_caps = True
                elif lowered.startswith('</span'):
                    small_caps = False
                continue

            piece_run = para.add_run(piece)
            # Carry over the character style of the run being split, if any.
            if source_run.style:
                piece_run.style = source_run.style

            # Set each property explicitly, and only when it is not "inherit".
            piece_font = piece_run.font
            if italic is not None:
                piece_font.italic = italic
            if bold is not None:
                piece_font.bold = bold
            if superscript is not None:
                piece_font.superscript = superscript
            if subscript is not None:
                piece_font.subscript = subscript
            if small_caps is not None:
                piece_font.small_caps = small_caps
            if font_name:
                piece_font.name = font_name

            # add_run put the new run at the end; place it where the source run sits.
            container.insert(position, piece_run._element)
            position += 1

        container.remove(source_run._element)
246
+
247
def process_paragraph_content(para: Paragraph, manager: CSLCitationManager, citation_pattern: re.Pattern, in_main_body: bool):
    """Rewrite every citation found in *para* using *manager*.

    Steps, in order:
      1. In the main body, superscript runs holding only digits, commas,
         spaces and dashes are turned into bracketed text (``[1,2]``) unless
         the preceding text marks them as an amino-acid position, unit or
         variable exponent, power of ten, or figure/table-style label.
      2. ``geometry.N`` / ``ref.N`` / ``source.N`` artefacts become ``[N]``.
      3. Bracketed numeric citations matched by *citation_pattern* are mapped
         through ``manager.id_map`` and re-rendered.
      4. Parenthesised author-year citations are matched against
         ``manager.ay_map`` and re-rendered.
      5. HTML tags left by the renderer are converted to run formatting.

    ``manager.update_count`` is incremented once per rewritten citation.
    """

    def is_number_or_range(piece: str) -> bool:
        # "12" or "3-7" (whitespace around each number tolerated).
        if '-' in piece:
            bounds = piece.split('-')
            return len(bounds) == 2 and bounds[0].strip().isdigit() and bounds[1].strip().isdigit()
        return piece.strip().isdigit()

    # 1. Native Word superscripts -> bracketed text the citation regex can catch.
    text_so_far = ""
    for run in para.runs:
        stripped = run.text.strip()
        if in_main_body and run.font.superscript and re.match(r'^[\d,\s\-–]+$', stripped):
            looks_like_power = bool(POWER_PATTERN.search(text_so_far)) and stripped.isdigit()
            protected = (
                AA_PATTERN_3.search(text_so_far)
                or AA_PATTERN_1.search(text_so_far)
                or MATH_UNIT_PATTERN.search(text_so_far)
                or IGNORE_PREFIXES.search(text_so_far)
                or looks_like_power
            )
            if not protected:
                pieces = stripped.replace('–', '-').split(',')
                if all(is_number_or_range(piece) for piece in pieces):
                    run.font.superscript = False
                    run.text = f"[{stripped}]"
        # Accumulate after any rewrite so later runs see the bracketed form.
        text_so_far += run.text

    # 2. Conversion artefacts such as "ref.12".
    artifact_pattern = re.compile(r'(?:geometry|ref|source)\.(\d+)', re.IGNORECASE)
    replace_text_preserve_formatting(para, artifact_pattern, lambda m: f"[{m.group(1)}]")

    def expand_reference_numbers(raw_inner: str):
        # Expand "1, 3-5" into [1, 3, 4, 5]; None when any piece is malformed,
        # a range is reversed, or a range spans 50 or more numbers.
        numbers = []
        for piece in raw_inner.split(','):
            piece = piece.strip()
            if '-' not in piece:
                if not piece.isdigit():
                    return None
                numbers.append(int(piece))
                continue
            bounds = piece.split('-')
            if not (len(bounds) == 2 and bounds[0].strip().isdigit() and bounds[1].strip().isdigit()):
                return None
            low, high = int(bounds[0].strip()), int(bounds[1].strip())
            if not (low <= high and (high - low) < 50):
                return None
            numbers.extend(range(low, high + 1))
        return numbers

    # 3. Numeric citations.
    def replace_callback(match: Match) -> str:
        untouched = match.group(0)
        before = para.text[:match.start()]
        if (AA_PATTERN_3.search(before) or AA_PATTERN_1.search(before)
                or MATH_UNIT_PATTERN.search(before) or IGNORE_PREFIXES.search(before)):
            return untouched

        raw_inner = match.group(1).replace('–', '-')
        # "(2019)" is a year in running text, not reference number 2019.
        if untouched.startswith('(') and raw_inner.isdigit() and 1900 <= int(raw_inner) <= 2100:
            return untouched

        numbers = expand_reference_numbers(raw_inner)
        if numbers is None:
            return untouched

        known_keys = [manager.id_map[number] for number in numbers if number in manager.id_map]
        if not known_keys:
            return untouched

        manager.update_count += 1
        return manager.get_in_text_citation(known_keys)

    replace_text_preserve_formatting(para, citation_pattern, replace_callback)

    # 4. Author-year citations.
    ay_pattern = re.compile(r'\(([A-Za-z][^()]*?(?:19|20)\d{2}[a-z]?)\)')

    def replace_ay_callback(match: Match) -> str:
        raw_inner = match.group(1)
        # "=" or "+" signals an equation or statistic, not a citation.
        if '=' in raw_inner or '+' in raw_inner:
            return match.group(0)

        resolved_keys = []
        dated_segments = 0
        for segment in raw_inner.split(';'):
            year_hit = re.search(r'((?:19|20)\d{2})', segment)
            if not year_hit:
                continue
            dated_segments += 1
            author_text = re.sub(r'(?:19|20)\d{2}[a-z]?|et al\.?|,|&', '', segment).strip().lower()

            # First same-year entry whose surname scores strictly above 80 wins ties.
            top_key, top_score = None, 80
            for candidate in manager.ay_map:
                if candidate['year'] != year_hit.group(1):
                    continue
                score = fuzz.partial_ratio(author_text, candidate['author'])
                if score > top_score:
                    top_score = score
                    top_key = candidate['key']

            if top_key:
                resolved_keys.append(top_key)

        # Rewrite only when every dated segment was resolved.
        if dated_segments > 0 and len(resolved_keys) == dated_segments:
            manager.update_count += 1
            return manager.get_in_text_citation(resolved_keys)
        return match.group(0)

    replace_text_preserve_formatting(para, ay_pattern, replace_ay_callback)

    # 5. Convert HTML tags in the rendered citations into native Word formatting.
    apply_html_formatting_to_runs(para)
332
+
333
def write_rich_bibliography_entry(doc: Document, html_text: str, main_font: Optional[str], insert_cursor: Optional[Paragraph] = None):
    """Write one HTML-formatted bibliography entry as a new paragraph.

    Args:
        doc: Document that receives the paragraph when *insert_cursor* is None
            (via ``doc.add_paragraph()``).
        html_text: Entry markup; b/strong, i/em, sup, sub and small-caps span
            tags become run formatting, every other tag is discarded.
        main_font: Font name applied to every run, if truthy.
        insert_cursor: When given, the new paragraph is inserted before it.

    Doubled full stops are collapsed, and a non-breaking space is placed after
    a leading ``[n]`` / ``n.`` label and after the left/right inline-div boundary.
    """
    if insert_cursor is None:
        target = doc.add_paragraph()
    else:
        target = insert_cursor.insert_paragraph_before()

    nbsp = chr(160)
    markup = html_text.replace('.. ', '. ')
    markup = markup.replace('..<', '.<')
    markup = markup.replace('</div><div class="csl-right-inline">', '</div><div class="csl-right-inline">' + nbsp)
    markup = re.sub(r'^((?:<[^>]+>|\s)*)(\[\d+\]|\d+\.)\s*(<[^>]+>)?\s*([A-Za-z])', r'\1\2' + nbsp + r'\3\4', markup)

    bold = False
    italic = False
    small_caps = False
    superscript = False
    subscript = False

    for chunk in re.split(r'(<[^>]+>)', markup):
        if not chunk:
            continue
        lowered = chunk.lower()

        if lowered.startswith('<'):
            # Tags only flip the formatting state for the text that follows.
            if lowered.startswith(('<b', '<strong')):
                bold = True
            elif lowered.startswith(('</b', '</strong')):
                bold = False
            elif lowered.startswith(('<i', '<em')):
                italic = True
            elif lowered.startswith(('</i', '</em')):
                italic = False
            elif lowered.startswith('<sup'):
                superscript = True
            elif lowered.startswith('</sup'):
                superscript = False
            elif lowered.startswith('<sub'):
                subscript = True
            elif lowered.startswith('</sub'):
                subscript = False
            elif 'small-caps' in lowered and not lowered.startswith('</'):
                small_caps = True
            elif lowered.startswith('</span'):
                small_caps = False
            continue

        content = html.unescape(chunk)
        if not content:
            continue

        piece = target.add_run(content)
        piece.bold = bold
        piece.italic = italic
        if small_caps:
            piece.font.small_caps = True
        if superscript:
            piece.font.superscript = True
        if subscript:
            piece.font.subscript = True
        if main_font:
            piece.font.name = main_font
366
+
367
def process_document(docx_path: Path, output_path: Path, manager: CSLCitationManager):
    """Reformat all citations in a manuscript and rebuild its reference list.

    Body paragraphs and table cells up to the References heading are passed
    through ``process_paragraph_content``. The old reference section (heading
    through the paragraph before the first post-reference heading, or to the
    end of the document) is removed and replaced by a new "References" heading
    followed by the CSL-rendered bibliography. When the document has no
    References heading, the new section is appended at the end.

    Args:
        docx_path: Input Word document.
        output_path: Where the rewritten document is saved.
        manager: Citation manager used for rendering; its ``update_count`` is
            reported at the end.
    """
    print(f"\nProcessing document: {docx_path.name}")
    doc = Document(str(docx_path))
    citation_pattern = re.compile(r'[\[\(]([\d\s,\-–]+)[\]\)]')

    # Take the body paragraph list once. Paragraphs are neither added nor
    # removed until the reference section is rebuilt below.
    paragraphs = doc.paragraphs

    # The first explicitly named run font is reused for all generated text.
    main_font = None
    for p in paragraphs:
        for r in p.runs:
            if r.font.name:
                main_font = r.font.name
                break
        if main_font:
            break

    ref_header_index = -1
    ref_header_element = None
    for i, p in enumerate(paragraphs):
        if REF_HEADER_PATTERN.match(p.text):
            ref_header_index = i
            ref_header_element = p._element
            break

    in_main_body = False
    block_counter = 0

    for block in iter_block_items(doc):
        block_counter += 1
        if isinstance(block, Paragraph):
            # Nothing at or after the References heading is treated as body text.
            if ref_header_element is not None and block._element == ref_header_element:
                break
            if not in_main_body:
                # The body starts at a known section heading, at the first
                # paragraph that reads like prose, or after 25 blocks at the latest.
                text_clean = block.text.strip().lower()
                if text_clean in ['abstract', 'introduction', 'background', 'summary', 'methods', 'results']:
                    in_main_body = True
                else:
                    words = re.findall(r'\b[a-z]+\b', text_clean)
                    if len(words) >= 25 and len([w for w in words if w in PROSE_STOP_WORDS]) >= 5:
                        in_main_body = True
                if block_counter > 25 and not in_main_body:
                    in_main_body = True

            process_paragraph_content(block, manager, citation_pattern, in_main_body)

        elif isinstance(block, Table):
            if not in_main_body:
                continue
            # python-docx yields a merged cell once for every grid position it
            # spans. Rewriting it twice would feed already-renumbered
            # citations back through the numeric mapping, so each underlying
            # cell element is processed only once per table.
            seen_cells = set()
            for row in block.rows:
                for cell in row.cells:
                    if cell._tc in seen_cells:
                        continue
                    seen_cells.add(cell._tc)
                    for para in cell.paragraphs:
                        process_paragraph_content(para, manager, citation_pattern, in_main_body)

    insert_cursor = None
    # Stays empty when no References heading exists, so the removal loop below
    # is always safe to run.
    paragraphs_to_remove = []
    if ref_header_index != -1:
        post_ref_index = -1
        for i in range(ref_header_index + 1, len(paragraphs)):
            text = paragraphs[i].text.strip()
            if text and POST_REF_PATTERN.match(text):
                post_ref_index = i
                break

        if post_ref_index != -1:
            paragraphs_to_remove = paragraphs[ref_header_index:post_ref_index]
            insert_cursor = paragraphs[post_ref_index]
            insert_cursor.paragraph_format.page_break_before = True
        else:
            paragraphs_to_remove = paragraphs[ref_header_index:]

    for p in paragraphs_to_remove:
        parent = p._element.getparent()
        if parent is not None:
            parent.remove(p._element)

    p = insert_cursor.insert_paragraph_before() if insert_cursor is not None else doc.add_paragraph()
    p.paragraph_format.page_break_before = True
    run = p.add_run('References')
    run.bold = True
    if main_font:
        run.font.name = main_font

    print(f" -> Rebuilding Rich-Text Bibliography via CSL...")
    manager.bibliography.sort()
    for entry in manager.bibliography.bibliography():
        write_rich_bibliography_entry(doc, str(entry), main_font, insert_cursor)

    doc.save(str(output_path))
    print(f"Success! Saved to {output_path.name}")
    print(f" -> Tracked and dynamically updated {manager.update_count} in-text citations.")
443
+
444
+
445
def main():
    """Command-line entry point for the CSL formatter.

    Parses ``bib``, ``doc`` and ``--csl``; exits with status 1 if the BibTeX
    file, the document or the CSL style cannot be found. A style that does not
    exist at the given path is also looked up by file name in
    ``~/citation_styles`` (with ``.csl`` appended when the name lacks that
    suffix). Output goes to ``<doc stem>_final_<style stem>.docx`` next to the
    input document.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("bib", type=Path, help="Verified .bib file")
    parser.add_argument("doc", type=Path, help="Input .docx file")
    parser.add_argument("--csl", type=Path, required=True, help="Path to the CSL style file")
    args = parser.parse_args()

    # Fail with a readable message instead of a traceback from the loaders.
    if not args.bib.exists():
        print(f"❌ ERROR: BibTeX file '{args.bib}' not found.")
        sys.exit(1)
    if not args.doc.exists():
        print(f"❌ ERROR: Document '{args.doc}' not found.")
        sys.exit(1)

    csl_path = args.csl
    default_csl_dir = Path("~/citation_styles").expanduser()

    if not csl_path.exists():
        alt_path = default_csl_dir / csl_path.name
        if alt_path.exists():
            csl_path = alt_path
        elif csl_path.suffix != '.csl':
            alt_path_ext = default_csl_dir / f"{csl_path.name}.csl"
            if alt_path_ext.exists():
                csl_path = alt_path_ext

    if not csl_path.exists():
        print(f"❌ ERROR: CSL file '{args.csl}' not found locally or in {default_csl_dir}.")
        sys.exit(1)

    output = args.doc.with_name(f"{args.doc.stem}_final_{csl_path.stem}.docx")
    mgr = CSLCitationManager(args.bib, csl_path)
    process_document(args.doc, output, mgr)
469
+
470
+ if __name__ == "__main__":
471
+ main()