rxiv-maker 1.16.7__py3-none-any.whl → 1.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -45,11 +45,16 @@ class DocxExporter:
45
45
  self.resolve_dois = resolve_dois
46
46
  self.include_footnotes = include_footnotes
47
47
 
48
- # Load config to get author name format preference
48
+ # Load config to get author name format preference and DOCX options
49
49
  config_manager = ConfigManager(base_dir=Path(manuscript_path))
50
50
  config = config_manager.load_config()
51
51
  self.author_format = config.get("bibliography_author_format", "lastname_firstname")
52
52
 
53
+ # DOCX export options
54
+ docx_config = config.get("docx", {})
55
+ self.hide_si = docx_config.get("hide_si", False) # Default to False (don't hide SI) for backwards compatibility
56
+ self.figures_at_end = docx_config.get("figures_at_end", False) # Default to False (inline figures)
57
+
53
58
  # Components
54
59
  self.citation_mapper = CitationMapper()
55
60
  self.content_processor = DocxContentProcessor()
@@ -98,6 +103,13 @@ class DocxExporter:
98
103
  markdown_content = self._load_markdown()
99
104
  logger.debug(f"Loaded {len(markdown_content)} characters of markdown")
100
105
 
106
+ # Step 2.5: If SI is hidden from export, still load it for label mapping
107
+ si_content_for_mapping = ""
108
+ if self.hide_si:
109
+ si_content_for_mapping = self._load_si_for_mapping()
110
+ if si_content_for_mapping:
111
+ logger.info("📋 Loaded SI content for label mapping (SI section hidden from export)")
112
+
101
113
  # Step 3: Extract and map citations
102
114
  citations = self.citation_mapper.extract_citations_from_markdown(markdown_content)
103
115
  citation_map = self.citation_mapper.create_mapping(citations)
@@ -120,14 +132,14 @@ class DocxExporter:
120
132
 
121
133
  # Replace @fig:label with "Fig. X" in text, handling optional panel letters
122
134
  # Pattern matches: @fig:label optionally followed by space and panel letter(s)
123
- # Use special markers <<XREF>> to enable yellow highlighting in DOCX
135
+ # Use special markers <<XREF:type>> to enable color-coded highlighting in DOCX
124
136
  for label, num in figure_map.items():
125
137
  # Match @fig:label with optional panel letters like " a", " a,b", " a-c"
126
138
  # Use negative lookahead (?![a-z]) to prevent matching start of words like " is", " and"
127
139
  # Panel letters must be followed by non-letter (space, punctuation, end of string)
128
140
  markdown_with_numbers = re.sub(
129
141
  rf"@fig:{label}\b(\s+[a-z](?:[,\-][a-z])*(?![a-z]))?",
130
- lambda m, num=num: f"<<XREF>>Fig. {num}{m.group(1) if m.group(1) else ''}<</XREF>>",
142
+ lambda m, num=num: f"<<XREF:fig>>Fig. {num}{m.group(1) if m.group(1) else ''}<</XREF>>",
131
143
  markdown_with_numbers,
132
144
  )
133
145
 
@@ -135,7 +147,9 @@ class DocxExporter:
135
147
 
136
148
  # Find all supplementary figures and create mapping
137
149
  # Allow hyphens and underscores in label names
138
- sfig_labels = re.findall(r"!\[[^\]]*\]\([^)]+\)\s*\n\s*\{#sfig:([\w-]+)", markdown_with_numbers)
150
+ # IMPORTANT: When SI is excluded, extract from SI content (where figures are defined)
151
+ content_to_scan_for_sfigs = si_content_for_mapping if si_content_for_mapping else markdown_with_numbers
152
+ sfig_labels = re.findall(r"!\[[^\]]*\]\([^)]+\)\s*\n\s*\{#sfig:([\w-]+)", content_to_scan_for_sfigs)
139
153
  sfig_map = {label: i + 1 for i, label in enumerate(sfig_labels)}
140
154
 
141
155
  # Replace @sfig:label with "Supp. Fig. X" in text, handling optional panel letters
@@ -144,34 +158,51 @@ class DocxExporter:
144
158
  # Negative lookahead prevents matching start of words
145
159
  markdown_with_numbers = re.sub(
146
160
  rf"@sfig:{label}\b(\s+[a-z](?:[,\-][a-z])*(?![a-z]))?",
147
- lambda m, num=num: f"<<XREF>>Supp. Fig. {num}{m.group(1) if m.group(1) else ''}<</XREF>>",
161
+ lambda m, num=num: f"<<XREF:sfig>>Supp. Fig. {num}{m.group(1) if m.group(1) else ''}<</XREF>>",
148
162
  markdown_with_numbers,
149
163
  )
150
164
 
151
165
  logger.debug(f"Mapped {len(sfig_map)} supplementary figure labels to numbers")
152
166
 
153
- # Find all tables and create mapping (looking for {#stable:label} tags)
154
- # Allow hyphens and underscores in label names
155
- table_labels = re.findall(r"\{#stable:([\w-]+)\}", markdown_with_numbers)
167
+ # Find all tables and create mapping (looking for {#stable:label} or \label{stable:label} tags)
168
+ # IMPORTANT: PDF uses the order that tables are DEFINED in the document (order of \label{stable:X})
169
+ # NOT the order of caption references (%{#stable:X}) which are just metadata
170
+ # When SI is excluded from export, we still need to extract labels from SI
171
+
172
+ content_to_scan_for_tables = si_content_for_mapping if si_content_for_mapping else markdown_with_numbers
173
+
174
+ # Extract table labels in document order (both {#stable:label} markdown format and \label{stable:label} LaTeX format)
175
+ # The PDF numbering follows the order these labels appear in the document
176
+ markdown_labels = re.findall(r"\{#stable:([\w-]+)\}", content_to_scan_for_tables)
177
+ latex_labels = re.findall(r"\\label\{stable:([\w-]+)\}", content_to_scan_for_tables)
178
+
179
+ # Combine both formats, preferring LaTeX labels if present (since that's what PDF uses)
180
+ table_labels = latex_labels if latex_labels else markdown_labels
181
+
182
+ # Remove duplicates while preserving order
183
+ seen = set()
184
+ table_labels = [label for label in table_labels if not (label in seen or seen.add(label))]
185
+
156
186
  table_map = {label: i + 1 for i, label in enumerate(table_labels)}
187
+ logger.debug(f"Mapped {len(table_map)} supplementary tables: {table_map}")
157
188
 
158
189
  # Replace @stable:label with "Supp. Table X" in text
159
190
  for label, num in table_map.items():
160
191
  markdown_with_numbers = re.sub(
161
- rf"@stable:{label}\b", f"<<XREF>>Supp. Table {num}<</XREF>>", markdown_with_numbers
192
+ rf"@stable:{label}\b", f"<<XREF:stable>>Supp. Table {num}<</XREF>>", markdown_with_numbers
162
193
  )
163
194
 
164
- logger.debug(f"Mapped {len(table_map)} supplementary table labels to numbers")
165
-
166
195
  # Find all supplementary notes and create mapping (looking for {#snote:label} tags)
167
196
  # Allow hyphens and underscores in label names
168
- snote_labels = re.findall(r"\{#snote:([\w-]+)\}", markdown_with_numbers)
197
+ # IMPORTANT: When SI is excluded, extract from SI content (where notes are defined)
198
+ content_to_scan_for_snotes = si_content_for_mapping if si_content_for_mapping else markdown_with_numbers
199
+ snote_labels = re.findall(r"\{#snote:([\w-]+)\}", content_to_scan_for_snotes)
169
200
  snote_map = {label: i + 1 for i, label in enumerate(snote_labels)}
170
201
 
171
202
  # Replace @snote:label with "Supp. Note X" in text
172
203
  for label, num in snote_map.items():
173
204
  markdown_with_numbers = re.sub(
174
- rf"@snote:{label}\b", f"<<XREF>>Supp. Note {num}<</XREF>>", markdown_with_numbers
205
+ rf"@snote:{label}\b", f"<<XREF:snote>>Supp. Note {num}<</XREF>>", markdown_with_numbers
175
206
  )
176
207
 
177
208
  logger.debug(f"Mapped {len(snote_map)} supplementary note labels to numbers")
@@ -186,18 +217,17 @@ class DocxExporter:
186
217
  for label, num in equation_map.items():
187
218
  # Replace (@eq:label) with (Eq. X)
188
219
  markdown_with_numbers = re.sub(
189
- rf"\(@eq:{label}\b\)", f"(<<XREF>>Eq. {num}<</XREF>>)", markdown_with_numbers
220
+ rf"\(@eq:{label}\b\)", f"(<<XREF:eq>>Eq. {num}<</XREF>>)", markdown_with_numbers
190
221
  )
191
222
  # Replace @eq:label with Eq. X
192
- markdown_with_numbers = re.sub(rf"@eq:{label}\b", f"<<XREF>>Eq. {num}<</XREF>>", markdown_with_numbers)
223
+ markdown_with_numbers = re.sub(rf"@eq:{label}\b", f"<<XREF:eq>>Eq. {num}<</XREF>>", markdown_with_numbers)
193
224
 
194
225
  logger.debug(f"Mapped {len(equation_map)} equation labels to numbers")
195
226
 
196
227
  # Step 5.6: Remove label markers now that mapping is complete
197
228
  # These metadata markers should not appear in the final output
198
- markdown_with_numbers = re.sub(
199
- r"^\{#(?:fig|sfig|snote|stable|table|eq):[^}]+\}\s*", "", markdown_with_numbers, flags=re.MULTILINE
200
- )
229
+ # NOTE: Keep fig/sfig/stable/table labels - they're needed by content processor and removed during caption parsing
230
+ markdown_with_numbers = re.sub(r"^\{#(?:snote|eq):[^}]+\}\s*", "", markdown_with_numbers, flags=re.MULTILINE)
201
231
 
202
232
  # Step 6: Convert content to DOCX structure
203
233
  doc_structure = self.content_processor.parse(markdown_with_numbers, citation_map)
@@ -215,6 +245,8 @@ class DocxExporter:
215
245
  include_footnotes=self.include_footnotes,
216
246
  base_path=self.path_manager.manuscript_path,
217
247
  metadata=metadata,
248
+ table_map=table_map,
249
+ figures_at_end=self.figures_at_end,
218
250
  )
219
251
  logger.info(f"DOCX exported successfully: {docx_path}")
220
252
 
@@ -265,9 +297,9 @@ class DocxExporter:
265
297
 
266
298
  content.append(main_content)
267
299
 
268
- # Load 02_SUPPLEMENTARY_INFO.md if exists
300
+ # Load 02_SUPPLEMENTARY_INFO.md if exists and not configured to hide SI
269
301
  supp_md = self.path_manager.manuscript_path / "02_SUPPLEMENTARY_INFO.md"
270
- if supp_md.exists():
302
+ if supp_md.exists() and not self.hide_si:
271
303
  logger.info("Including supplementary information")
272
304
  supp_content = supp_md.read_text(encoding="utf-8")
273
305
  supp_content = remove_yaml_header(supp_content)
@@ -281,11 +313,36 @@ class DocxExporter:
281
313
  content.append("<!-- PAGE_BREAK -->")
282
314
  content.append("# Supplementary Information")
283
315
  content.append(supp_content)
316
+ elif supp_md.exists() and self.hide_si:
317
+ logger.info("Supplementary information exists but hidden per config (docx.hide_si: true)")
284
318
  else:
285
319
  logger.debug("No supplementary information file found")
286
320
 
287
321
  return "\n\n".join(content)
288
322
 
323
def _load_si_for_mapping(self) -> str:
    r"""Read the supplementary-information file purely for label extraction.

    Used when the SI section is hidden from the exported document but the
    main text may still cross-reference SI labels (stable, sfig, snote).

    The text is returned without any preprocessing beyond dropping the YAML
    header, so that LaTeX-style labels such as \label{stable:X} are still
    present for callers that derive numbering from their order.

    Returns:
        The SI markdown with its YAML header removed, or an empty string
        when 02_SUPPLEMENTARY_INFO.md does not exist.
    """
    si_path = self.path_manager.manuscript_path / "02_SUPPLEMENTARY_INFO.md"

    if si_path.exists():
        # Deliberately raw: only the YAML front matter is stripped.
        return remove_yaml_header(si_path.read_text(encoding="utf-8"))

    return ""
345
+
289
346
  def _build_bibliography(self, citation_map: Dict[str, int]) -> Dict[int, Dict]:
290
347
  """Build bibliography with optional DOI resolution.
291
348
 
@@ -318,12 +375,17 @@ class DocxExporter:
318
375
  # Get DOI from entry
319
376
  doi = entry.fields.get("doi")
320
377
 
321
- # TODO: Implement DOI resolution if requested and DOI missing
322
- # if self.resolve_dois and not doi:
323
- # doi = self._resolve_doi_from_metadata(entry)
378
+ # Attempt DOI resolution if requested and DOI missing
379
+ if self.resolve_dois and not doi:
380
+ doi = self._resolve_doi_from_metadata(entry)
381
+ if doi:
382
+ # Store in entry for this export
383
+ entry.fields["doi"] = doi
384
+ logger.info(f"Resolved DOI for {key}: {doi}")
324
385
 
325
386
  # Format entry (full format for DOCX bibliography)
326
- formatted = format_bibliography_entry(entry, doi, slim=False, author_format=self.author_format)
387
+ # Don't include DOI in formatted text - it will be added separately as a hyperlink by the writer
388
+ formatted = format_bibliography_entry(entry, doi=None, slim=False, author_format=self.author_format)
327
389
 
328
390
  bibliography[number] = {"key": key, "entry": entry, "doi": doi, "formatted": formatted}
329
391
 
@@ -332,6 +394,99 @@ class DocxExporter:
332
394
 
333
395
  return bibliography
334
396
 
397
def _resolve_doi_from_metadata(self, entry) -> str | None:
    """Resolve a missing DOI by searching the CrossRef works API by title.

    The entry title is cleaned with ``_clean_title_for_search`` and sent as
    a ``query.title`` search. Each returned candidate title is cleaned the
    same way before comparison, so punctuation differences (colons, commas,
    quotes) do not prevent a match. A candidate is accepted when one
    normalized title contains the other and, if the entry has a year and
    the candidate reports one, the years agree.

    This is best-effort: any network, HTTP or parsing failure is logged at
    debug level and results in ``None`` rather than an exception.

    Args:
        entry: Bibliography entry; its ``fields`` mapping (``title``,
            ``year``) and ``key`` attribute are read.

    Returns:
        Resolved DOI if found, None otherwise
    """
    # ``or ""`` tolerates fields that are present but set to None.
    title = (entry.fields.get("title") or "").strip()
    year = str(entry.fields.get("year") or "").strip()

    if not title:
        logger.debug(f"Cannot resolve DOI for {entry.key}: no title")
        return None

    # Clean title for search (remove LaTeX commands, braces, etc.)
    search_title = self._clean_title_for_search(title)
    if not search_title:
        # An empty needle is a substring of every candidate title, which
        # would accept the first search hit regardless of relevance.
        logger.debug(f"Cannot resolve DOI for {entry.key}: title has no searchable text")
        return None

    # Imported only once a lookup is actually going to happen.
    import requests

    search_title_lower = search_title.lower()

    try:
        response = requests.get(
            "https://api.crossref.org/works",
            params={
                "query.title": search_title,
                "rows": 5,  # Get top 5 results
            },
            timeout=10,
        )

        if response.status_code == 200:
            items = response.json().get("message", {}).get("items", [])

            for item in items:
                # "title" may be missing or an empty list; treat both as no title.
                titles = item.get("title") or [""]
                # Normalize the candidate exactly like the query so both
                # sides of the containment check are comparable.
                item_title = self._clean_title_for_search(titles[0]).lower()
                if not item_title:
                    continue

                # Simple similarity check - titles should be very similar
                if search_title_lower not in item_title and item_title not in search_title_lower:
                    continue

                # Verify year matches if available
                if year:
                    date_parts = (item.get("published") or {}).get("date-parts") or [[None]]
                    item_year = date_parts[0][0] if date_parts[0] else None
                    if item_year and str(item_year) != year:
                        continue

                doi = item.get("DOI")
                if doi:
                    logger.info(f"Resolved DOI for {entry.key}: {doi}")
                    return doi

        logger.debug(f"Could not resolve DOI for {entry.key} via CrossRef")
        return None

    except requests.exceptions.Timeout:
        logger.debug(f"CrossRef API timeout resolving DOI for {entry.key}")
        return None
    except requests.exceptions.ConnectionError:
        logger.debug(f"CrossRef API connection error for {entry.key}")
        return None
    except Exception as e:
        # Deliberate catch-all: DOI resolution must never break the export.
        logger.debug(f"Error resolving DOI for {entry.key}: {e}")
        return None
463
+
464
+ def _clean_title_for_search(self, title: str) -> str:
465
+ """Clean title for CrossRef search by removing LaTeX commands.
466
+
467
+ Args:
468
+ title: Raw title from BibTeX entry
469
+
470
+ Returns:
471
+ Cleaned title suitable for search
472
+ """
473
+ import re
474
+
475
+ # Remove LaTeX commands
476
+ title = re.sub(r"\\[a-zA-Z]+\{([^}]*)\}", r"\1", title) # \textit{foo} -> foo
477
+ title = re.sub(r"\\[a-zA-Z]+", "", title) # \LaTeX -> LaTeX
478
+
479
+ # Remove braces
480
+ title = title.replace("{", "").replace("}", "")
481
+
482
+ # Remove special characters
483
+ title = re.sub(r"[^a-zA-Z0-9\s\-]", " ", title)
484
+
485
+ # Normalize whitespace
486
+ title = " ".join(title.split())
487
+
488
+ return title.strip()
489
+
335
490
  def _get_metadata(self) -> Dict[str, Any]:
336
491
  """Extract metadata for title page.
337
492