rxiv-maker 1.16.8__py3-none-any.whl → 1.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
rxiv_maker/__version__.py CHANGED
@@ -1,3 +1,3 @@
  """Version information."""
 
- __version__ = "1.16.8"
+ __version__ = "1.17.0"
@@ -30,6 +30,7 @@ from ..framework import BuildCommand
  @click.option("--keep-output", is_flag=True, help="Preserve existing output directory (default: clear before build)")
  @click.option("--docx", is_flag=True, help="Also export to DOCX format for collaborative review")
  @click.option("--resolve-dois", "-r", is_flag=True, help="Attempt to resolve missing DOIs (when using --docx)")
+ @click.option("--split-si", is_flag=True, help="Split PDF into main and SI sections (__main.pdf and __si.pdf)")
  @click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
  @click.option("--quiet", "-q", is_flag=True, help="Suppress non-essential output")
  @click.option("--debug", "-d", is_flag=True, help="Enable debug output")
@@ -49,6 +50,7 @@ def build(
  keep_output: bool,
  docx: bool,
  resolve_dois: bool,
+ split_si: bool,
  verbose: bool,
  quiet: bool,
  debug: bool,
@@ -81,6 +83,10 @@ def build(
 
  $ rxiv pdf --docx --resolve-dois
 
+ **Split PDF into main and SI sections:**
+
+ $ rxiv pdf --split-si
+
  **Force regenerate all figures:**
 
  $ rxiv pdf --force-figures
@@ -108,6 +114,7 @@ def build(
  keep_output=keep_output,
  docx=docx,
  resolve_dois=resolve_dois,
+ split_si=split_si,
  debug=debug or verbose,
  quiet=quiet,
  container_mode=container_mode,
@@ -142,6 +142,7 @@ class BuildCommand(BaseCommand):
  keep_output: bool = False,
  docx: bool = False,
  resolve_dois: bool = False,
+ split_si: bool = False,
  debug: bool = False,
  quiet: bool = False,
  container_mode: Optional[str] = None,
@@ -156,6 +157,7 @@ class BuildCommand(BaseCommand):
  keep_output: Preserve existing output directory
  docx: Also export to DOCX format
  resolve_dois: Attempt to resolve missing DOIs (for DOCX export)
+ split_si: Split PDF into main and SI sections
  debug: Enable debug output
  quiet: Suppress non-critical warnings
  container_mode: Container behavior mode
@@ -223,6 +225,10 @@ class BuildCommand(BaseCommand):
  if docx:
  self._export_docx(resolve_dois=resolve_dois, quiet=quiet, debug=debug)
 
+ # Split PDF if requested
+ if split_si:
+ self._split_pdf(pdf_path, quiet=quiet, debug=debug)
+
  # Show helpful tips after successful build
  self._show_build_tips()
 
@@ -252,11 +258,69 @@ class BuildCommand(BaseCommand):
  self.console.print(f"[green]✅ DOCX exported:[/green] {docx_path}")
 
  except Exception as e:
- self.console.print(f"[yellow]⚠️ DOCX export failed:[/yellow] {e}", err=True)
+ self.console.print(f"[yellow]⚠️ DOCX export failed:[/yellow] {e}")
+ if debug:
+ import traceback
+
+ self.console.print(f"[dim]{traceback.format_exc()}[/dim]")
+
+ def _split_pdf(self, pdf_path: Path, quiet: bool = False, debug: bool = False) -> None:
+ """Split PDF into main and SI sections after successful PDF build.
+
+ Args:
+ pdf_path: Path to the generated PDF
+ quiet: Suppress non-essential output
+ debug: Enable debug output
+ """
+ try:
+ from ...processors.yaml_processor import extract_yaml_metadata
+ from ...utils.file_helpers import find_manuscript_md
+ from ...utils.pdf_splitter import split_pdf
+ from ...utils.pdf_utils import get_custom_pdf_filename
+
+ if not quiet:
+ self.console.print("\n[cyan]✂️ Splitting PDF into main and SI sections...[/cyan]")
+
+ # Split the PDF
+ main_path, si_path = split_pdf(pdf_path)
+
+ if main_path and si_path:
+ # Extract metadata to generate custom filename
+ manuscript_md = find_manuscript_md(str(self.path_manager.manuscript_path))
+ yaml_metadata = extract_yaml_metadata(str(manuscript_md))
+
+ # Get base filename (e.g., "2025__saraiva_et_al__rxiv.pdf")
+ base_filename = get_custom_pdf_filename(yaml_metadata)
+ base_name = base_filename.replace(".pdf", "")
+
+ # Generate final filenames with __main and __si suffixes
+ main_filename = f"{base_name}__main.pdf"
+ si_filename = f"{base_name}__si.pdf"
+
+ # Copy split files to manuscript directory
+ final_main_path = self.path_manager.manuscript_path / main_filename
+ final_si_path = self.path_manager.manuscript_path / si_filename
+
+ shutil.copy2(main_path, final_main_path)
+ shutil.copy2(si_path, final_si_path)
+
+ if not quiet:
+ self.console.print("[green]✅ PDF split successfully:[/green]")
+ self.console.print(f" 📄 Main: {final_main_path}")
+ self.console.print(f" 📄 SI: {final_si_path}")
+ elif main_path is None and si_path is None:
+ if not quiet:
+ self.console.print("[yellow]⚠️ Could not split PDF: SI section marker not found[/yellow]")
+ else:
+ if not quiet:
+ self.console.print("[yellow]⚠️ PDF splitting partially failed[/yellow]")
+
+ except Exception as e:
+ self.console.print(f"[yellow]⚠️ PDF splitting failed:[/yellow] {e}")
  if debug:
  import traceback
 
- self.console.print(f"[dim]{traceback.format_exc()}[/dim]", err=True)
+ self.console.print(f"[dim]{traceback.format_exc()}[/dim]")
 
  def _show_build_tips(self) -> None:
  """Show helpful tips after successful PDF build."""
@@ -202,9 +202,11 @@ def extract_citations_from_text(text: MarkdownContent) -> list[CitationKey]:
  backtick_patterns.append(match.group(0))
  return f"__BACKTICK_PATTERN_{len(backtick_patterns) - 1}__"
 
- # Match both single backticks `...` and triple backticks ```...```
- text_cleaned = re.sub(r"`[^`]+`", protect_backticks, text)
- text_cleaned = re.sub(r"```.*?```", protect_backticks, text_cleaned, flags=re.DOTALL)
+ # IMPORTANT: Match triple backticks FIRST, then single backticks
+ # This prevents the single-backtick pattern from matching across triple-backtick blocks
+ # (e.g., from a ` before ```latex to the first ` inside the code block)
+ text_cleaned = re.sub(r"```.*?```", protect_backticks, text, flags=re.DOTALL)
+ text_cleaned = re.sub(r"`[^`]+`", protect_backticks, text_cleaned)
 
  # Find bracketed multiple citations
  bracketed_matches = re.findall(r"\[(@[^]]+)\]", text_cleaned)
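The reordered substitutions above are the substance of this change. A standalone, hedged illustration (not the package's code) of how the old order could hide a genuine citation from the extractor:

```python
import re

# With the old order, single-backtick matches can pair backticks inside a fenced block
# with the fence markers themselves, so a later `...` match can swallow real prose --
# including a citation that follows the block. Protecting fenced blocks first avoids this.
def protect_code_spans(text: str, triple_first: bool) -> str:
    saved: list[str] = []

    def protect(m: re.Match) -> str:
        saved.append(m.group(0))
        return f"__BACKTICK_PATTERN_{len(saved) - 1}__"

    subs = [(r"```.*?```", re.DOTALL), (r"`[^`]+`", 0)]
    if not triple_first:
        subs.reverse()
    for pattern, flags in subs:
        text = re.sub(pattern, protect, text, flags=flags)
    return text


sample = "```latex\n`\n```\nAs shown by [@smith2021] and the `--docx` flag."

print("@smith2021" in protect_code_spans(sample, triple_first=False))  # False: citation lost
print("@smith2021" in protect_code_spans(sample, triple_first=True))   # True: citation survives
```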
@@ -343,6 +343,7 @@ class ConfigManager:
  "bibliography": {"file": "03_REFERENCES.bib", "style": "nature"},
  "citation_style": "numbered",
  "enable_inline_doi_resolution": False,
+ "docx": {"hide_si": False, "figures_at_end": False},
  "cache": {"enabled": True, "ttl_hours": 24},
  "version": "1.0",
  "acknowledge_rxiv_maker": True,
@@ -13,6 +13,102 @@ from ..converters.citation_processor import extract_citations_from_text
  class CitationMapper:
  """Maps citation keys to sequential numbers for DOCX export."""
 
+ @staticmethod
+ def _format_citation_ranges(text: str) -> str:
+ """Format consecutive citations as ranges.
+
+ Converts patterns like [1][2][3] to [1-3], [15][16] to [15-16], etc.
+ Also formats comma-separated lists like [1, 2, 3] to [1-3].
+
+ Args:
+ text: Text with numbered citations
+
+ Returns:
+ Text with consecutive citations formatted as ranges
+
+ Example:
+ >>> CitationMapper._format_citation_ranges("text [1][2][3] more")
+ 'text [1-3] more'
+ >>> CitationMapper._format_citation_ranges("text [1, 2, 3] more")
+ 'text [1-3] more'
+ >>> CitationMapper._format_citation_ranges("text [1][3][4] more")
+ 'text [1][3-4] more'
+ """
+
+ # Pattern 1: Handle adjacent bracketed citations [1][2][3] or [1] [2] [3]
+ def combine_adjacent(match_obj):
+ # Extract all numbers from consecutive brackets (allowing spaces between)
+ numbers = [int(n) for n in re.findall(r"\[(\d+)\]", match_obj.group(0))]
+ return CitationMapper._format_number_list(numbers)
+
+ # Find sequences of adjacent bracketed numbers (with optional spaces between)
+ text = re.sub(r"(?:\[\d+\]\s*){2,}", combine_adjacent, text)
+
+ # Pattern 2: Handle comma-separated citations within single brackets [1, 2, 3]
+ def combine_comma_separated(match_obj):
+ # Extract all numbers from comma-separated list
+ numbers_str = match_obj.group(1)
+ numbers = [int(n.strip()) for n in numbers_str.split(",")]
+ return CitationMapper._format_number_list(numbers)
+
+ text = re.sub(r"\[([\d,\s]+)\]", combine_comma_separated, text)
+
+ return text
+
+ @staticmethod
+ def _format_number_list(numbers: List[int]) -> str:
+ """Format a list of citation numbers as ranges.
+
+ Args:
+ numbers: List of citation numbers
+
+ Returns:
+ Formatted string with ranges
+
+ Example:
+ >>> CitationMapper._format_number_list([1, 2, 3, 5, 6, 8])
+ '[1-3, 5-6, 8]'
+ >>> CitationMapper._format_number_list([15, 16])
+ '[15-16]'
+ >>> CitationMapper._format_number_list([1, 3, 5])
+ '[1, 3, 5]'
+ """
+ if not numbers:
+ return "[]"
+
+ # Sort numbers
+ sorted_nums = sorted(set(numbers))
+
+ # Build ranges
+ ranges = []
+ start = sorted_nums[0]
+ end = sorted_nums[0]
+
+ for num in sorted_nums[1:]:
+ if num == end + 1:
+ # Continue current range
+ end = num
+ else:
+ # End current range and start new one
+ if start == end:
+ # Single number
+ ranges.append(str(start))
+ else:
+ # Range (including 2 consecutive numbers like 15-16)
+ ranges.append(f"{start}-{end}")
+ start = num
+ end = num
+
+ # Add final range
+ if start == end:
+ # Single number
+ ranges.append(str(start))
+ else:
+ # Range (including 2 consecutive numbers like 15-16)
+ ranges.append(f"{start}-{end}")
+
+ return f"[{', '.join(ranges)}]"
+
  def create_mapping(self, citations: List[str]) -> Dict[str, int]:
  """Create citation key → number mapping.
 
@@ -121,4 +217,7 @@ class CitationMapper:
  for i, pattern in enumerate(email_patterns):
  text = text.replace(f"__EMAIL_PATTERN_{i}__", pattern)
 
+ # Format consecutive citations as ranges (e.g., [1][2][3] -> [1-3])
+ text = self._format_citation_ranges(text)
+
  return text
@@ -11,6 +11,26 @@ from typing import Any, Dict, List, Optional
  class DocxContentProcessor:
  """Parses markdown content into structured format for DOCX writing."""
 
+ @staticmethod
+ def _is_metadata_comment(comment_text: str) -> bool:
+ """Check if a comment is metadata/informational and should be skipped.
+
+ Args:
+ comment_text: The comment text to check
+
+ Returns:
+ True if comment should be skipped, False if it should be included
+ """
+ if not comment_text:
+ return True
+
+ # Normalize to lowercase for case-insensitive matching
+ normalized = comment_text.lower().strip()
+
+ # Skip comments that start with common metadata keywords
+ metadata_prefixes = ["note:", "note ", "comment:", "comment "]
+ return any(normalized.startswith(prefix) for prefix in metadata_prefixes)
+
  def parse(self, markdown: str, citation_map: Dict[str, int]) -> Dict[str, Any]:
  """Parse markdown into structured sections for DOCX.
 
@@ -55,10 +75,38 @@ class DocxContentProcessor:
  i += 1
  continue
 
- # Skip HTML/markdown comments
+ # Parse HTML/markdown comments (single-line and multi-line)
+ # Skip informational/metadata comments (those starting with "Note:")
  if line.strip().startswith("<!--"):
- i += 1
- continue
+ # Check if it's a single-line comment
+ if line.strip().endswith("-->"):
+ # Single-line comment
+ comment_text = line.strip()[4:-3].strip()
+ # Skip metadata comments (e.g., "note that...", "Comment: ...")
+ if comment_text and not self._is_metadata_comment(comment_text):
+ sections.append({"type": "comment", "text": comment_text})
+ i += 1
+ continue
+ else:
+ # Multi-line comment - collect all lines until -->
+ comment_lines = [line.strip()[4:]] # Remove <!--
+ i += 1
+ while i < len(lines):
+ if lines[i].strip().endswith("-->"):
+ # Last line of comment
+ comment_lines.append(lines[i].strip()[:-3]) # Remove -->
+ i += 1
+ break
+ else:
+ comment_lines.append(lines[i].strip())
+ i += 1
+
+ # Join and add comment
+ comment_text = " ".join(comment_lines).strip()
+ # Skip metadata comments (e.g., "note that...", "Comment: ...")
+ if comment_text and not self._is_metadata_comment(comment_text):
+ sections.append({"type": "comment", "text": comment_text})
+ continue
 
  # Skip LaTeX commands like <clearpage>
  if line.strip().startswith("<") and line.strip().endswith(">") and " " not in line.strip():
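For reference, the `_is_metadata_comment` rule above means comments beginning with "note" or "comment" never reach the DOCX output. A standalone check, with the predicate copied here purely for illustration:

```python
# Standalone demo of the skip rule (logic mirrored from _is_metadata_comment above).
def is_metadata_comment(comment: str) -> bool:
    normalized = comment.lower().strip()
    return (not comment) or any(
        normalized.startswith(p) for p in ("note:", "note ", "comment:", "comment ")
    )

for c in ["Note: internal draft", "note that X depends on Y", "Reviewer 2 asked about stats"]:
    print(c, "->", "skipped" if is_metadata_comment(c) else "kept as a DOCX comment")
# The first two are skipped; only the last one is carried into the document.
```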
@@ -335,18 +383,21 @@ class DocxContentProcessor:
  runs = []
 
  # Find all formatting markers, links, and citations
- # Pattern to match: <<HIGHLIGHT_YELLOW>>text<</HIGHLIGHT_YELLOW>>, <<XREF>>text<</XREF>>, [text](url), **bold**, __underlined__, *italic*, _italic_, `code`, $math$, [number]
+ # Pattern to match: <<HIGHLIGHT_YELLOW>>text<</HIGHLIGHT_YELLOW>>, <<XREF:type>>text<</XREF>>, <!-- comment -->, [text](url), **bold**, __underlined__, *italic*, _italic_, ~subscript~, ^superscript^, `code`, $math$, [number]
  pattern = re.compile(
  r"(<<HIGHLIGHT_YELLOW>>([^<]+)<</HIGHLIGHT_YELLOW>>)" # Yellow highlight (must be first)
- r"|(<<XREF>>([^<]+)<</XREF>>)" # Cross-reference
+ r"|(<<XREF:(\w+)>>([^<]+)<</XREF>>)" # Cross-reference with type
+ r"|(<!--\s*(.+?)\s*-->)" # HTML comments (inline)
  r"|(\[([^\]]+)\]\(([^)]+)\))" # Markdown link [text](url) (before citations)
  r"|(\*\*([^*]+)\*\*)" # Bold
  r"|(__([^_]+)__)" # Underline with double underscores (must come before single underscore)
  r"|(\*([^*]+)\*)" # Italic with asterisks
  r"|(_([^_]+)_)" # Italic with underscores
+ r"|(~([^~]+)~)" # Subscript
+ r"|(\^([^^]+)\^)" # Superscript
  r"|(`([^`]+)`)" # Code
  r"|(\$([^\$]+)\$)" # Inline math
- r"|(\[(\d+(?:,\s*\d+)*)\])" # Citation numbers
+ r"|(\[(\d+(?:[-,]\s*\d+)*)\])" # Citation numbers (supports both ranges [1-3] and lists [1, 2])
  )
 
  last_end = 0
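The new alternatives can be exercised in isolation. A standalone sketch using the subscript, superscript, and citation sub-patterns exactly as written above (only the inner capture of each alternative is used here):

```python
import re

# Standalone check of the new pieces added to the inline-formatting pattern.
subscript = re.compile(r"~([^~]+)~")
superscript = re.compile(r"\^([^^]+)\^")
citation = re.compile(r"\[(\d+(?:[-,]\s*\d+)*)\]")

print(subscript.findall("H~2~O"))                        # ['2']
print(superscript.findall("10^-3^ M"))                   # ['-3']
print(citation.findall("see [1-3], [4, 7] and [12]"))    # ['1-3', '4, 7', '12']
```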
@@ -378,67 +429,99 @@ class DocxContentProcessor:
  if run["type"] == "text":
  run["highlight_yellow"] = True
  runs.append(run)
- elif match.group(3): # Cross-reference
+ elif match.group(3): # Cross-reference with type
  runs.append(
  {
  "type": "text",
- "text": match.group(4),
+ "text": match.group(5), # Text is now in group 5
  "bold": False,
  "italic": False,
  "underline": False,
  "code": False,
  "xref": True,
+ "xref_type": match.group(4), # Type is in group 4
  }
  )
- elif match.group(5): # Markdown link [text](url)
+ elif match.group(6): # Inline HTML comment
+ comment_text = match.group(7).strip()
+ # Skip metadata comments (e.g., "note that...", "Comment: ...")
+ if comment_text and not self._is_metadata_comment(comment_text):
+ runs.append({"type": "inline_comment", "text": comment_text})
+ elif match.group(8): # Markdown link [text](url)
  runs.append(
  {
  "type": "hyperlink",
- "text": match.group(6),
- "url": match.group(7),
+ "text": match.group(9),
+ "url": match.group(10),
  }
  )
- elif match.group(8): # Bold
+ elif match.group(11): # Bold
  # Recursively parse inner text for underline/italic/other formatting
- inner_text = match.group(9)
+ inner_text = match.group(12)
  inner_runs = self._parse_inline_formatting(inner_text, citation_map)
  # Add bold to all inner runs
  for run in inner_runs:
  if run["type"] == "text":
  run["bold"] = True
  runs.append(run)
- elif match.group(10): # Underline
+ elif match.group(13): # Underline
  # Recursively parse inner text for bold/italic/other formatting
- inner_text = match.group(11)
+ inner_text = match.group(14)
  inner_runs = self._parse_inline_formatting(inner_text, citation_map)
  # Add underline to all inner runs
  for run in inner_runs:
  if run["type"] == "text":
  run["underline"] = True
  runs.append(run)
- elif match.group(12): # Italic with asterisks
+ elif match.group(15): # Italic with asterisks
  # Recursively parse inner text for bold/underline/other formatting
- inner_text = match.group(13)
+ inner_text = match.group(16)
  inner_runs = self._parse_inline_formatting(inner_text, citation_map)
  # Add italic to all inner runs
  for run in inner_runs:
  if run["type"] == "text":
  run["italic"] = True
  runs.append(run)
- elif match.group(14): # Italic with underscores
+ elif match.group(17): # Italic with underscores
  # Recursively parse inner text for bold/underline/other formatting
- inner_text = match.group(15)
+ inner_text = match.group(18)
  inner_runs = self._parse_inline_formatting(inner_text, citation_map)
  # Add italic to all inner runs
  for run in inner_runs:
  if run["type"] == "text":
  run["italic"] = True
  runs.append(run)
- elif match.group(16): # Code
+ elif match.group(19): # Subscript
+ runs.append(
+ {
+ "type": "text",
+ "text": match.group(20),
+ "bold": False,
+ "italic": False,
+ "underline": False,
+ "code": False,
+ "xref": False,
+ "subscript": True,
+ }
+ )
+ elif match.group(21): # Superscript
+ runs.append(
+ {
+ "type": "text",
+ "text": match.group(22),
+ "bold": False,
+ "italic": False,
+ "underline": False,
+ "code": False,
+ "xref": False,
+ "superscript": True,
+ }
+ )
+ elif match.group(23): # Code
  runs.append(
  {
  "type": "text",
- "text": match.group(17),
+ "text": match.group(24),
  "bold": False,
  "italic": False,
  "underline": False,
@@ -446,14 +529,23 @@ class DocxContentProcessor:
  "xref": False,
  }
  )
- elif match.group(18): # Inline math
- runs.append({"type": "inline_equation", "latex": match.group(19)})
- elif match.group(20): # Citation
- # Parse citation numbers (may be multiple: [1, 2, 3])
- numbers_str = match.group(21)
- numbers = [int(n.strip()) for n in numbers_str.split(",")]
- for num in numbers:
- runs.append({"type": "citation", "number": num})
+ elif match.group(25): # Inline math
+ runs.append({"type": "inline_equation", "latex": match.group(26)})
+ elif match.group(27): # Citation
+ # Keep citation as formatted text with yellow highlighting
+ # The citation mapper has already formatted ranges (e.g., [1-3], [1, 4-6, 8])
+ citation_text = match.group(0) # Full match including brackets
+ runs.append(
+ {
+ "type": "text",
+ "text": citation_text,
+ "bold": False,
+ "italic": False,
+ "underline": False,
+ "code": False,
+ "highlight_yellow": True, # Highlight citations in yellow
+ }
+ )
 
  last_end = match.end()
 
@@ -516,6 +608,7 @@ class DocxContentProcessor:
  # Look ahead for caption line (skip empty lines)
  caption = ""
  label = ""
+ is_supplementary = False # Default to main figure
  next_i = start_idx + 1
 
  # Skip empty lines to find caption
@@ -529,6 +622,9 @@ class DocxContentProcessor:
 
  # Check for {#fig:label ...} or {#sfig:label ...} **Caption**
  if next_line and (next_line.startswith("{#fig:") or next_line.startswith("{#sfig:")):
+ # Detect if it's a supplementary figure
+ is_supplementary = next_line.startswith("{#sfig:")
+
  # Extract label if present
  label_match = re.match(r"\{#s?fig:(\w+)[^}]*\}", next_line)
  if label_match:
@@ -572,6 +668,7 @@ class DocxContentProcessor:
  "alt": alt_text,
  "caption": caption,
  "label": label,
+ "is_supplementary": is_supplementary,
  }, next_i
 
  def _parse_table(self, lines: List[str], start_idx: int) -> tuple[Optional[Dict[str, Any]], int]:
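A small standalone check of the supplementary-figure detection added in the three hunks above; the label regex and prefix test are copied from the diff, while the sample caption lines are invented:

```python
import re

# Mirror of the detection logic: {#sfig:...} marks a supplementary figure,
# and the same s?fig pattern extracts the label either way.
for line in ["{#fig:workflow} **Main figure caption**", "{#sfig:benchmark width=0.8} **SI figure caption**"]:
    is_supplementary = line.startswith("{#sfig:")
    label_match = re.match(r"\{#s?fig:(\w+)[^}]*\}", line)
    print(is_supplementary, label_match.group(1) if label_match else None)
# False workflow
# True benchmark
```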
@@ -626,7 +723,8 @@ class DocxContentProcessor:
  if i < len(lines):
  caption_line = lines[i].strip()
  # Match {#stable:label} Caption or {#table:label} Caption
- caption_match = re.match(r"^\{#(stable|table):(\w+)\}\s*(.+)$", caption_line)
+ # Allow hyphens and underscores in label names (e.g., "tool-comparison")
+ caption_match = re.match(r"^\{#(stable|table):([\w-]+)\}\s*(.+)$", caption_line)
  if caption_match:
  label = f"{caption_match.group(1)}:{caption_match.group(2)}"
  caption = caption_match.group(3).strip()
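Finally, a quick standalone comparison of the old and relaxed table-label patterns; both regexes are taken from the hunk above, and the sample caption line is invented:

```python
import re

# Hyphenated labels are rejected by the old \w+ pattern but accepted by [\w-]+.
old_pattern = re.compile(r"^\{#(stable|table):(\w+)\}\s*(.+)$")
new_pattern = re.compile(r"^\{#(stable|table):([\w-]+)\}\s*(.+)$")

line = "{#stable:tool-comparison} Comparison of manuscript preparation tools."
print(bool(old_pattern.match(line)))   # False: hyphen breaks the match
print(new_pattern.match(line).groups())  # ('stable', 'tool-comparison', 'Comparison of manuscript preparation tools.')
```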