rxiv-maker 1.16.8__py3-none-any.whl → 1.18.0__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
rxiv_maker/__version__.py CHANGED
@@ -1,3 +1,3 @@
1
1
  """Version information."""
2
2
 
3
- __version__ = "1.16.8"
3
+ __version__ = "1.18.0"
@@ -30,6 +30,7 @@ from ..framework import BuildCommand
30
30
  @click.option("--keep-output", is_flag=True, help="Preserve existing output directory (default: clear before build)")
31
31
  @click.option("--docx", is_flag=True, help="Also export to DOCX format for collaborative review")
32
32
  @click.option("--resolve-dois", "-r", is_flag=True, help="Attempt to resolve missing DOIs (when using --docx)")
33
+ @click.option("--split-si", is_flag=True, help="Split PDF into main and SI sections (__main.pdf and __si.pdf)")
33
34
  @click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
34
35
  @click.option("--quiet", "-q", is_flag=True, help="Suppress non-essential output")
35
36
  @click.option("--debug", "-d", is_flag=True, help="Enable debug output")
@@ -49,6 +50,7 @@ def build(
49
50
  keep_output: bool,
50
51
  docx: bool,
51
52
  resolve_dois: bool,
53
+ split_si: bool,
52
54
  verbose: bool,
53
55
  quiet: bool,
54
56
  debug: bool,
@@ -81,6 +83,10 @@ def build(
81
83
 
82
84
  $ rxiv pdf --docx --resolve-dois
83
85
 
86
+ **Split PDF into main and SI sections:**
87
+
88
+ $ rxiv pdf --split-si
89
+
84
90
  **Force regenerate all figures:**
85
91
 
86
92
  $ rxiv pdf --force-figures
@@ -108,6 +114,7 @@ def build(
108
114
  keep_output=keep_output,
109
115
  docx=docx,
110
116
  resolve_dois=resolve_dois,
117
+ split_si=split_si,
111
118
  debug=debug or verbose,
112
119
  quiet=quiet,
113
120
  container_mode=container_mode,
@@ -30,8 +30,10 @@ class InitCommand(BaseCommand):
30
30
  self.engine = "local" # Only local engine is supported
31
31
 
32
32
  # Store manuscript path without PathManager validation since we're creating the directory
33
+ # NOTE: For init command, we should NOT use environment variable MANUSCRIPT_PATH
34
+ # as it's meant for finding existing manuscripts, not determining where to initialize
33
35
  if manuscript_path is None:
34
- manuscript_path = EnvironmentManager.get_manuscript_path() or "MANUSCRIPT"
36
+ manuscript_path = "MANUSCRIPT"
35
37
 
36
38
  # Store the raw path for use in execute_operation
37
39
  self.raw_manuscript_path = manuscript_path
@@ -142,6 +144,7 @@ class BuildCommand(BaseCommand):
142
144
  keep_output: bool = False,
143
145
  docx: bool = False,
144
146
  resolve_dois: bool = False,
147
+ split_si: bool = False,
145
148
  debug: bool = False,
146
149
  quiet: bool = False,
147
150
  container_mode: Optional[str] = None,
@@ -156,6 +159,7 @@ class BuildCommand(BaseCommand):
156
159
  keep_output: Preserve existing output directory
157
160
  docx: Also export to DOCX format
158
161
  resolve_dois: Attempt to resolve missing DOIs (for DOCX export)
162
+ split_si: Split PDF into main and SI sections
159
163
  debug: Enable debug output
160
164
  quiet: Suppress non-critical warnings
161
165
  container_mode: Container behavior mode
@@ -223,6 +227,10 @@ class BuildCommand(BaseCommand):
223
227
  if docx:
224
228
  self._export_docx(resolve_dois=resolve_dois, quiet=quiet, debug=debug)
225
229
 
230
+ # Split PDF if requested
231
+ if split_si:
232
+ self._split_pdf(pdf_path, quiet=quiet, debug=debug)
233
+
226
234
  # Show helpful tips after successful build
227
235
  self._show_build_tips()
228
236
 
@@ -252,11 +260,69 @@ class BuildCommand(BaseCommand):
252
260
  self.console.print(f"[green]✅ DOCX exported:[/green] {docx_path}")
253
261
 
254
262
  except Exception as e:
255
- self.console.print(f"[yellow]⚠️ DOCX export failed:[/yellow] {e}", err=True)
263
+ self.console.print(f"[yellow]⚠️ DOCX export failed:[/yellow] {e}")
264
+ if debug:
265
+ import traceback
266
+
267
+ self.console.print(f"[dim]{traceback.format_exc()}[/dim]")
268
+
269
+ def _split_pdf(self, pdf_path: Path, quiet: bool = False, debug: bool = False) -> None:
270
+ """Split PDF into main and SI sections after successful PDF build.
271
+
272
+ Args:
273
+ pdf_path: Path to the generated PDF
274
+ quiet: Suppress non-essential output
275
+ debug: Enable debug output
276
+ """
277
+ try:
278
+ from ...processors.yaml_processor import extract_yaml_metadata
279
+ from ...utils.file_helpers import find_manuscript_md
280
+ from ...utils.pdf_splitter import split_pdf
281
+ from ...utils.pdf_utils import get_custom_pdf_filename
282
+
283
+ if not quiet:
284
+ self.console.print("\n[cyan]✂️ Splitting PDF into main and SI sections...[/cyan]")
285
+
286
+ # Split the PDF
287
+ main_path, si_path = split_pdf(pdf_path)
288
+
289
+ if main_path and si_path:
290
+ # Extract metadata to generate custom filename
291
+ manuscript_md = find_manuscript_md(str(self.path_manager.manuscript_path))
292
+ yaml_metadata = extract_yaml_metadata(str(manuscript_md))
293
+
294
+ # Get base filename (e.g., "2025__saraiva_et_al__rxiv.pdf")
295
+ base_filename = get_custom_pdf_filename(yaml_metadata)
296
+ base_name = base_filename.replace(".pdf", "")
297
+
298
+ # Generate final filenames with __main and __si suffixes
299
+ main_filename = f"{base_name}__main.pdf"
300
+ si_filename = f"{base_name}__si.pdf"
301
+
302
+ # Copy split files to manuscript directory
303
+ final_main_path = self.path_manager.manuscript_path / main_filename
304
+ final_si_path = self.path_manager.manuscript_path / si_filename
305
+
306
+ shutil.copy2(main_path, final_main_path)
307
+ shutil.copy2(si_path, final_si_path)
308
+
309
+ if not quiet:
310
+ self.console.print("[green]✅ PDF split successfully:[/green]")
311
+ self.console.print(f" 📄 Main: {final_main_path}")
312
+ self.console.print(f" 📄 SI: {final_si_path}")
313
+ elif main_path is None and si_path is None:
314
+ if not quiet:
315
+ self.console.print("[yellow]⚠️ Could not split PDF: SI section marker not found[/yellow]")
316
+ else:
317
+ if not quiet:
318
+ self.console.print("[yellow]⚠️ PDF splitting partially failed[/yellow]")
319
+
320
+ except Exception as e:
321
+ self.console.print(f"[yellow]⚠️ PDF splitting failed:[/yellow] {e}")
256
322
  if debug:
257
323
  import traceback
258
324
 
259
- self.console.print(f"[dim]{traceback.format_exc()}[/dim]", err=True)
325
+ self.console.print(f"[dim]{traceback.format_exc()}[/dim]")
260
326
 
261
327
  def _show_build_tips(self) -> None:
262
328
  """Show helpful tips after successful PDF build."""
@@ -202,9 +202,11 @@ def extract_citations_from_text(text: MarkdownContent) -> list[CitationKey]:
202
202
  backtick_patterns.append(match.group(0))
203
203
  return f"__BACKTICK_PATTERN_{len(backtick_patterns) - 1}__"
204
204
 
205
- # Match both single backticks `...` and triple backticks ```...```
206
- text_cleaned = re.sub(r"`[^`]+`", protect_backticks, text)
207
- text_cleaned = re.sub(r"```.*?```", protect_backticks, text_cleaned, flags=re.DOTALL)
205
+ # IMPORTANT: Match triple backticks FIRST, then single backticks
206
+ # This prevents the single-backtick pattern from matching across triple-backtick blocks
207
+ # (e.g., from a ` before ```latex to the first ` inside the code block)
208
+ text_cleaned = re.sub(r"```.*?```", protect_backticks, text, flags=re.DOTALL)
209
+ text_cleaned = re.sub(r"`[^`]+`", protect_backticks, text_cleaned)
208
210
 
209
211
  # Find bracketed multiple citations
210
212
  bracketed_matches = re.findall(r"\[(@[^]]+)\]", text_cleaned)
@@ -343,6 +343,7 @@ class ConfigManager:
343
343
  "bibliography": {"file": "03_REFERENCES.bib", "style": "nature"},
344
344
  "citation_style": "numbered",
345
345
  "enable_inline_doi_resolution": False,
346
+ "docx": {"hide_si": False, "figures_at_end": False},
346
347
  "cache": {"enabled": True, "ttl_hours": 24},
347
348
  "version": "1.0",
348
349
  "acknowledge_rxiv_maker": True,
@@ -8,11 +8,26 @@ import re
8
8
  from typing import Dict, List
9
9
 
10
10
  from ..converters.citation_processor import extract_citations_from_text
11
+ from ..utils.citation_range_formatter import format_citation_ranges
11
12
 
12
13
 
13
14
  class CitationMapper:
14
15
  """Maps citation keys to sequential numbers for DOCX export."""
15
16
 
17
+ @staticmethod
18
+ def _format_citation_ranges(text: str) -> str:
19
+ """Format consecutive citations as ranges.
20
+
21
+ Uses centralized citation range formatter from utils module.
22
+
23
+ Args:
24
+ text: Text with numbered citations
25
+
26
+ Returns:
27
+ Text with consecutive citations formatted as ranges
28
+ """
29
+ return format_citation_ranges(text)
30
+
16
31
  def create_mapping(self, citations: List[str]) -> Dict[str, int]:
17
32
  """Create citation key → number mapping.
18
33
 
@@ -121,4 +136,7 @@ class CitationMapper:
121
136
  for i, pattern in enumerate(email_patterns):
122
137
  text = text.replace(f"__EMAIL_PATTERN_{i}__", pattern)
123
138
 
139
+ # Format consecutive citations as ranges (e.g., [1][2][3] -> [1-3])
140
+ text = self._format_citation_ranges(text)
141
+
124
142
  return text
@@ -7,6 +7,8 @@ DOCX generation with python-docx.
7
7
  import re
8
8
  from typing import Any, Dict, List, Optional
9
9
 
10
+ from ..utils.comment_filter import is_metadata_comment
11
+
10
12
 
11
13
  class DocxContentProcessor:
12
14
  """Parses markdown content into structured format for DOCX writing."""
@@ -55,10 +57,38 @@ class DocxContentProcessor:
55
57
  i += 1
56
58
  continue
57
59
 
58
- # Skip HTML/markdown comments
60
+ # Parse HTML/markdown comments (single-line and multi-line)
61
+ # Skip informational/metadata comments (those starting with "Note:")
59
62
  if line.strip().startswith("<!--"):
60
- i += 1
61
- continue
63
+ # Check if it's a single-line comment
64
+ if line.strip().endswith("-->"):
65
+ # Single-line comment
66
+ comment_text = line.strip()[4:-3].strip()
67
+ # Skip metadata comments (e.g., "note that...", "Comment: ...")
68
+ if comment_text and not is_metadata_comment(comment_text):
69
+ sections.append({"type": "comment", "text": comment_text})
70
+ i += 1
71
+ continue
72
+ else:
73
+ # Multi-line comment - collect all lines until -->
74
+ comment_lines = [line.strip()[4:]] # Remove <!--
75
+ i += 1
76
+ while i < len(lines):
77
+ if lines[i].strip().endswith("-->"):
78
+ # Last line of comment
79
+ comment_lines.append(lines[i].strip()[:-3]) # Remove -->
80
+ i += 1
81
+ break
82
+ else:
83
+ comment_lines.append(lines[i].strip())
84
+ i += 1
85
+
86
+ # Join and add comment
87
+ comment_text = " ".join(comment_lines).strip()
88
+ # Skip metadata comments (e.g., "note that...", "Comment: ...")
89
+ if comment_text and not is_metadata_comment(comment_text):
90
+ sections.append({"type": "comment", "text": comment_text})
91
+ continue
62
92
 
63
93
  # Skip LaTeX commands like <clearpage>
64
94
  if line.strip().startswith("<") and line.strip().endswith(">") and " " not in line.strip():
@@ -335,18 +365,21 @@ class DocxContentProcessor:
335
365
  runs = []
336
366
 
337
367
  # Find all formatting markers, links, and citations
338
- # Pattern to match: <<HIGHLIGHT_YELLOW>>text<</HIGHLIGHT_YELLOW>>, <<XREF>>text<</XREF>>, [text](url), **bold**, __underlined__, *italic*, _italic_, `code`, $math$, [number]
368
+ # Pattern to match: <<HIGHLIGHT_YELLOW>>text<</HIGHLIGHT_YELLOW>>, <<XREF:type>>text<</XREF>>, <!-- comment -->, [text](url), **bold**, __underlined__, *italic*, _italic_, ~subscript~, ^superscript^, `code`, $math$, [number]
339
369
  pattern = re.compile(
340
370
  r"(<<HIGHLIGHT_YELLOW>>([^<]+)<</HIGHLIGHT_YELLOW>>)" # Yellow highlight (must be first)
341
- r"|(<<XREF>>([^<]+)<</XREF>>)" # Cross-reference
371
+ r"|(<<XREF:(\w+)>>([^<]+)<</XREF>>)" # Cross-reference with type
372
+ r"|(<!--\s*(.+?)\s*-->)" # HTML comments (inline)
342
373
  r"|(\[([^\]]+)\]\(([^)]+)\))" # Markdown link [text](url) (before citations)
343
374
  r"|(\*\*([^*]+)\*\*)" # Bold
344
375
  r"|(__([^_]+)__)" # Underline with double underscores (must come before single underscore)
345
376
  r"|(\*([^*]+)\*)" # Italic with asterisks
346
377
  r"|(_([^_]+)_)" # Italic with underscores
378
+ r"|(~([^~]+)~)" # Subscript
379
+ r"|(\^([^^]+)\^)" # Superscript
347
380
  r"|(`([^`]+)`)" # Code
348
381
  r"|(\$([^\$]+)\$)" # Inline math
349
- r"|(\[(\d+(?:,\s*\d+)*)\])" # Citation numbers
382
+ r"|(\[(\d+(?:[-,]\s*\d+)*)\])" # Citation numbers (supports both ranges [1-3] and lists [1, 2])
350
383
  )
351
384
 
352
385
  last_end = 0
@@ -378,67 +411,99 @@ class DocxContentProcessor:
378
411
  if run["type"] == "text":
379
412
  run["highlight_yellow"] = True
380
413
  runs.append(run)
381
- elif match.group(3): # Cross-reference
414
+ elif match.group(3): # Cross-reference with type
382
415
  runs.append(
383
416
  {
384
417
  "type": "text",
385
- "text": match.group(4),
418
+ "text": match.group(5), # Text is now in group 5
386
419
  "bold": False,
387
420
  "italic": False,
388
421
  "underline": False,
389
422
  "code": False,
390
423
  "xref": True,
424
+ "xref_type": match.group(4), # Type is in group 4
391
425
  }
392
426
  )
393
- elif match.group(5): # Markdown link [text](url)
427
+ elif match.group(6): # Inline HTML comment
428
+ comment_text = match.group(7).strip()
429
+ # Skip metadata comments (e.g., "note that...", "Comment: ...")
430
+ if comment_text and not is_metadata_comment(comment_text):
431
+ runs.append({"type": "inline_comment", "text": comment_text})
432
+ elif match.group(8): # Markdown link [text](url)
394
433
  runs.append(
395
434
  {
396
435
  "type": "hyperlink",
397
- "text": match.group(6),
398
- "url": match.group(7),
436
+ "text": match.group(9),
437
+ "url": match.group(10),
399
438
  }
400
439
  )
401
- elif match.group(8): # Bold
440
+ elif match.group(11): # Bold
402
441
  # Recursively parse inner text for underline/italic/other formatting
403
- inner_text = match.group(9)
442
+ inner_text = match.group(12)
404
443
  inner_runs = self._parse_inline_formatting(inner_text, citation_map)
405
444
  # Add bold to all inner runs
406
445
  for run in inner_runs:
407
446
  if run["type"] == "text":
408
447
  run["bold"] = True
409
448
  runs.append(run)
410
- elif match.group(10): # Underline
449
+ elif match.group(13): # Underline
411
450
  # Recursively parse inner text for bold/italic/other formatting
412
- inner_text = match.group(11)
451
+ inner_text = match.group(14)
413
452
  inner_runs = self._parse_inline_formatting(inner_text, citation_map)
414
453
  # Add underline to all inner runs
415
454
  for run in inner_runs:
416
455
  if run["type"] == "text":
417
456
  run["underline"] = True
418
457
  runs.append(run)
419
- elif match.group(12): # Italic with asterisks
458
+ elif match.group(15): # Italic with asterisks
420
459
  # Recursively parse inner text for bold/underline/other formatting
421
- inner_text = match.group(13)
460
+ inner_text = match.group(16)
422
461
  inner_runs = self._parse_inline_formatting(inner_text, citation_map)
423
462
  # Add italic to all inner runs
424
463
  for run in inner_runs:
425
464
  if run["type"] == "text":
426
465
  run["italic"] = True
427
466
  runs.append(run)
428
- elif match.group(14): # Italic with underscores
467
+ elif match.group(17): # Italic with underscores
429
468
  # Recursively parse inner text for bold/underline/other formatting
430
- inner_text = match.group(15)
469
+ inner_text = match.group(18)
431
470
  inner_runs = self._parse_inline_formatting(inner_text, citation_map)
432
471
  # Add italic to all inner runs
433
472
  for run in inner_runs:
434
473
  if run["type"] == "text":
435
474
  run["italic"] = True
436
475
  runs.append(run)
437
- elif match.group(16): # Code
476
+ elif match.group(19): # Subscript
438
477
  runs.append(
439
478
  {
440
479
  "type": "text",
441
- "text": match.group(17),
480
+ "text": match.group(20),
481
+ "bold": False,
482
+ "italic": False,
483
+ "underline": False,
484
+ "code": False,
485
+ "xref": False,
486
+ "subscript": True,
487
+ }
488
+ )
489
+ elif match.group(21): # Superscript
490
+ runs.append(
491
+ {
492
+ "type": "text",
493
+ "text": match.group(22),
494
+ "bold": False,
495
+ "italic": False,
496
+ "underline": False,
497
+ "code": False,
498
+ "xref": False,
499
+ "superscript": True,
500
+ }
501
+ )
502
+ elif match.group(23): # Code
503
+ runs.append(
504
+ {
505
+ "type": "text",
506
+ "text": match.group(24),
442
507
  "bold": False,
443
508
  "italic": False,
444
509
  "underline": False,
@@ -446,14 +511,23 @@ class DocxContentProcessor:
446
511
  "xref": False,
447
512
  }
448
513
  )
449
- elif match.group(18): # Inline math
450
- runs.append({"type": "inline_equation", "latex": match.group(19)})
451
- elif match.group(20): # Citation
452
- # Parse citation numbers (may be multiple: [1, 2, 3])
453
- numbers_str = match.group(21)
454
- numbers = [int(n.strip()) for n in numbers_str.split(",")]
455
- for num in numbers:
456
- runs.append({"type": "citation", "number": num})
514
+ elif match.group(25): # Inline math
515
+ runs.append({"type": "inline_equation", "latex": match.group(26)})
516
+ elif match.group(27): # Citation
517
+ # Keep citation as formatted text with yellow highlighting
518
+ # The citation mapper has already formatted ranges (e.g., [1-3], [1, 4-6, 8])
519
+ citation_text = match.group(0) # Full match including brackets
520
+ runs.append(
521
+ {
522
+ "type": "text",
523
+ "text": citation_text,
524
+ "bold": False,
525
+ "italic": False,
526
+ "underline": False,
527
+ "code": False,
528
+ "highlight_yellow": True, # Highlight citations in yellow
529
+ }
530
+ )
457
531
 
458
532
  last_end = match.end()
459
533
 
@@ -516,6 +590,7 @@ class DocxContentProcessor:
516
590
  # Look ahead for caption line (skip empty lines)
517
591
  caption = ""
518
592
  label = ""
593
+ is_supplementary = False # Default to main figure
519
594
  next_i = start_idx + 1
520
595
 
521
596
  # Skip empty lines to find caption
@@ -529,6 +604,9 @@ class DocxContentProcessor:
529
604
 
530
605
  # Check for {#fig:label ...} or {#sfig:label ...} **Caption**
531
606
  if next_line and (next_line.startswith("{#fig:") or next_line.startswith("{#sfig:")):
607
+ # Detect if it's a supplementary figure
608
+ is_supplementary = next_line.startswith("{#sfig:")
609
+
532
610
  # Extract label if present
533
611
  label_match = re.match(r"\{#s?fig:(\w+)[^}]*\}", next_line)
534
612
  if label_match:
@@ -572,6 +650,7 @@ class DocxContentProcessor:
572
650
  "alt": alt_text,
573
651
  "caption": caption,
574
652
  "label": label,
653
+ "is_supplementary": is_supplementary,
575
654
  }, next_i
576
655
 
577
656
  def _parse_table(self, lines: List[str], start_idx: int) -> tuple[Optional[Dict[str, Any]], int]:
@@ -626,7 +705,8 @@ class DocxContentProcessor:
626
705
  if i < len(lines):
627
706
  caption_line = lines[i].strip()
628
707
  # Match {#stable:label} Caption or {#table:label} Caption
629
- caption_match = re.match(r"^\{#(stable|table):(\w+)\}\s*(.+)$", caption_line)
708
+ # Allow hyphens and underscores in label names (e.g., "tool-comparison")
709
+ caption_match = re.match(r"^\{#(stable|table):([\w-]+)\}\s*(.+)$", caption_line)
630
710
  if caption_match:
631
711
  label = f"{caption_match.group(1)}:{caption_match.group(2)}"
632
712
  caption = caption_match.group(3).strip()