rxiv-maker 1.16.8__py3-none-any.whl → 1.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxiv_maker/__version__.py +1 -1
- rxiv_maker/cli/commands/build.py +7 -0
- rxiv_maker/cli/framework/workflow_commands.py +69 -3
- rxiv_maker/converters/citation_processor.py +5 -3
- rxiv_maker/core/managers/config_manager.py +1 -0
- rxiv_maker/exporters/docx_citation_mapper.py +18 -0
- rxiv_maker/exporters/docx_content_processor.py +110 -30
- rxiv_maker/exporters/docx_exporter.py +76 -32
- rxiv_maker/exporters/docx_writer.py +345 -67
- rxiv_maker/templates/registry.py +1 -0
- rxiv_maker/tex/style/rxiv_maker_style.cls +33 -33
- rxiv_maker/utils/accent_character_map.py +150 -0
- rxiv_maker/utils/author_affiliation_processor.py +128 -0
- rxiv_maker/utils/citation_range_formatter.py +118 -0
- rxiv_maker/utils/comment_filter.py +46 -0
- rxiv_maker/utils/docx_helpers.py +43 -118
- rxiv_maker/utils/label_extractor.py +185 -0
- rxiv_maker/utils/pdf_splitter.py +116 -0
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/METADATA +2 -1
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/RECORD +23 -17
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/WHEEL +0 -0
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/entry_points.txt +0 -0
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/licenses/LICENSE +0 -0
rxiv_maker/__version__.py
CHANGED
rxiv_maker/cli/commands/build.py
CHANGED
|
@@ -30,6 +30,7 @@ from ..framework import BuildCommand
|
|
|
30
30
|
@click.option("--keep-output", is_flag=True, help="Preserve existing output directory (default: clear before build)")
|
|
31
31
|
@click.option("--docx", is_flag=True, help="Also export to DOCX format for collaborative review")
|
|
32
32
|
@click.option("--resolve-dois", "-r", is_flag=True, help="Attempt to resolve missing DOIs (when using --docx)")
|
|
33
|
+
@click.option("--split-si", is_flag=True, help="Split PDF into main and SI sections (__main.pdf and __si.pdf)")
|
|
33
34
|
@click.option("--verbose", "-v", is_flag=True, help="Enable verbose output")
|
|
34
35
|
@click.option("--quiet", "-q", is_flag=True, help="Suppress non-essential output")
|
|
35
36
|
@click.option("--debug", "-d", is_flag=True, help="Enable debug output")
|
|
@@ -49,6 +50,7 @@ def build(
|
|
|
49
50
|
keep_output: bool,
|
|
50
51
|
docx: bool,
|
|
51
52
|
resolve_dois: bool,
|
|
53
|
+
split_si: bool,
|
|
52
54
|
verbose: bool,
|
|
53
55
|
quiet: bool,
|
|
54
56
|
debug: bool,
|
|
@@ -81,6 +83,10 @@ def build(
|
|
|
81
83
|
|
|
82
84
|
$ rxiv pdf --docx --resolve-dois
|
|
83
85
|
|
|
86
|
+
**Split PDF into main and SI sections:**
|
|
87
|
+
|
|
88
|
+
$ rxiv pdf --split-si
|
|
89
|
+
|
|
84
90
|
**Force regenerate all figures:**
|
|
85
91
|
|
|
86
92
|
$ rxiv pdf --force-figures
|
|
@@ -108,6 +114,7 @@ def build(
|
|
|
108
114
|
keep_output=keep_output,
|
|
109
115
|
docx=docx,
|
|
110
116
|
resolve_dois=resolve_dois,
|
|
117
|
+
split_si=split_si,
|
|
111
118
|
debug=debug or verbose,
|
|
112
119
|
quiet=quiet,
|
|
113
120
|
container_mode=container_mode,
|
|
@@ -30,8 +30,10 @@ class InitCommand(BaseCommand):
|
|
|
30
30
|
self.engine = "local" # Only local engine is supported
|
|
31
31
|
|
|
32
32
|
# Store manuscript path without PathManager validation since we're creating the directory
|
|
33
|
+
# NOTE: For init command, we should NOT use environment variable MANUSCRIPT_PATH
|
|
34
|
+
# as it's meant for finding existing manuscripts, not determining where to initialize
|
|
33
35
|
if manuscript_path is None:
|
|
34
|
-
manuscript_path =
|
|
36
|
+
manuscript_path = "MANUSCRIPT"
|
|
35
37
|
|
|
36
38
|
# Store the raw path for use in execute_operation
|
|
37
39
|
self.raw_manuscript_path = manuscript_path
|
|
@@ -142,6 +144,7 @@ class BuildCommand(BaseCommand):
|
|
|
142
144
|
keep_output: bool = False,
|
|
143
145
|
docx: bool = False,
|
|
144
146
|
resolve_dois: bool = False,
|
|
147
|
+
split_si: bool = False,
|
|
145
148
|
debug: bool = False,
|
|
146
149
|
quiet: bool = False,
|
|
147
150
|
container_mode: Optional[str] = None,
|
|
@@ -156,6 +159,7 @@ class BuildCommand(BaseCommand):
|
|
|
156
159
|
keep_output: Preserve existing output directory
|
|
157
160
|
docx: Also export to DOCX format
|
|
158
161
|
resolve_dois: Attempt to resolve missing DOIs (for DOCX export)
|
|
162
|
+
split_si: Split PDF into main and SI sections
|
|
159
163
|
debug: Enable debug output
|
|
160
164
|
quiet: Suppress non-critical warnings
|
|
161
165
|
container_mode: Container behavior mode
|
|
@@ -223,6 +227,10 @@ class BuildCommand(BaseCommand):
|
|
|
223
227
|
if docx:
|
|
224
228
|
self._export_docx(resolve_dois=resolve_dois, quiet=quiet, debug=debug)
|
|
225
229
|
|
|
230
|
+
# Split PDF if requested
|
|
231
|
+
if split_si:
|
|
232
|
+
self._split_pdf(pdf_path, quiet=quiet, debug=debug)
|
|
233
|
+
|
|
226
234
|
# Show helpful tips after successful build
|
|
227
235
|
self._show_build_tips()
|
|
228
236
|
|
|
@@ -252,11 +260,69 @@ class BuildCommand(BaseCommand):
|
|
|
252
260
|
self.console.print(f"[green]✅ DOCX exported:[/green] {docx_path}")
|
|
253
261
|
|
|
254
262
|
except Exception as e:
|
|
255
|
-
self.console.print(f"[yellow]⚠️ DOCX export failed:[/yellow] {e}"
|
|
263
|
+
self.console.print(f"[yellow]⚠️ DOCX export failed:[/yellow] {e}")
|
|
264
|
+
if debug:
|
|
265
|
+
import traceback
|
|
266
|
+
|
|
267
|
+
self.console.print(f"[dim]{traceback.format_exc()}[/dim]")
|
|
268
|
+
|
|
269
|
+
def _split_pdf(self, pdf_path: Path, quiet: bool = False, debug: bool = False) -> None:
|
|
270
|
+
"""Split PDF into main and SI sections after successful PDF build.
|
|
271
|
+
|
|
272
|
+
Args:
|
|
273
|
+
pdf_path: Path to the generated PDF
|
|
274
|
+
quiet: Suppress non-essential output
|
|
275
|
+
debug: Enable debug output
|
|
276
|
+
"""
|
|
277
|
+
try:
|
|
278
|
+
from ...processors.yaml_processor import extract_yaml_metadata
|
|
279
|
+
from ...utils.file_helpers import find_manuscript_md
|
|
280
|
+
from ...utils.pdf_splitter import split_pdf
|
|
281
|
+
from ...utils.pdf_utils import get_custom_pdf_filename
|
|
282
|
+
|
|
283
|
+
if not quiet:
|
|
284
|
+
self.console.print("\n[cyan]✂️ Splitting PDF into main and SI sections...[/cyan]")
|
|
285
|
+
|
|
286
|
+
# Split the PDF
|
|
287
|
+
main_path, si_path = split_pdf(pdf_path)
|
|
288
|
+
|
|
289
|
+
if main_path and si_path:
|
|
290
|
+
# Extract metadata to generate custom filename
|
|
291
|
+
manuscript_md = find_manuscript_md(str(self.path_manager.manuscript_path))
|
|
292
|
+
yaml_metadata = extract_yaml_metadata(str(manuscript_md))
|
|
293
|
+
|
|
294
|
+
# Get base filename (e.g., "2025__saraiva_et_al__rxiv.pdf")
|
|
295
|
+
base_filename = get_custom_pdf_filename(yaml_metadata)
|
|
296
|
+
base_name = base_filename.replace(".pdf", "")
|
|
297
|
+
|
|
298
|
+
# Generate final filenames with __main and __si suffixes
|
|
299
|
+
main_filename = f"{base_name}__main.pdf"
|
|
300
|
+
si_filename = f"{base_name}__si.pdf"
|
|
301
|
+
|
|
302
|
+
# Copy split files to manuscript directory
|
|
303
|
+
final_main_path = self.path_manager.manuscript_path / main_filename
|
|
304
|
+
final_si_path = self.path_manager.manuscript_path / si_filename
|
|
305
|
+
|
|
306
|
+
shutil.copy2(main_path, final_main_path)
|
|
307
|
+
shutil.copy2(si_path, final_si_path)
|
|
308
|
+
|
|
309
|
+
if not quiet:
|
|
310
|
+
self.console.print("[green]✅ PDF split successfully:[/green]")
|
|
311
|
+
self.console.print(f" 📄 Main: {final_main_path}")
|
|
312
|
+
self.console.print(f" 📄 SI: {final_si_path}")
|
|
313
|
+
elif main_path is None and si_path is None:
|
|
314
|
+
if not quiet:
|
|
315
|
+
self.console.print("[yellow]⚠️ Could not split PDF: SI section marker not found[/yellow]")
|
|
316
|
+
else:
|
|
317
|
+
if not quiet:
|
|
318
|
+
self.console.print("[yellow]⚠️ PDF splitting partially failed[/yellow]")
|
|
319
|
+
|
|
320
|
+
except Exception as e:
|
|
321
|
+
self.console.print(f"[yellow]⚠️ PDF splitting failed:[/yellow] {e}")
|
|
256
322
|
if debug:
|
|
257
323
|
import traceback
|
|
258
324
|
|
|
259
|
-
self.console.print(f"[dim]{traceback.format_exc()}[/dim]"
|
|
325
|
+
self.console.print(f"[dim]{traceback.format_exc()}[/dim]")
|
|
260
326
|
|
|
261
327
|
def _show_build_tips(self) -> None:
|
|
262
328
|
"""Show helpful tips after successful PDF build."""
|
|
@@ -202,9 +202,11 @@ def extract_citations_from_text(text: MarkdownContent) -> list[CitationKey]:
|
|
|
202
202
|
backtick_patterns.append(match.group(0))
|
|
203
203
|
return f"__BACKTICK_PATTERN_{len(backtick_patterns) - 1}__"
|
|
204
204
|
|
|
205
|
-
# Match
|
|
206
|
-
|
|
207
|
-
|
|
205
|
+
# IMPORTANT: Match triple backticks FIRST, then single backticks
|
|
206
|
+
# This prevents the single-backtick pattern from matching across triple-backtick blocks
|
|
207
|
+
# (e.g., from a ` before ```latex to the first ` inside the code block)
|
|
208
|
+
text_cleaned = re.sub(r"```.*?```", protect_backticks, text, flags=re.DOTALL)
|
|
209
|
+
text_cleaned = re.sub(r"`[^`]+`", protect_backticks, text_cleaned)
|
|
208
210
|
|
|
209
211
|
# Find bracketed multiple citations
|
|
210
212
|
bracketed_matches = re.findall(r"\[(@[^]]+)\]", text_cleaned)
|
|
@@ -343,6 +343,7 @@ class ConfigManager:
|
|
|
343
343
|
"bibliography": {"file": "03_REFERENCES.bib", "style": "nature"},
|
|
344
344
|
"citation_style": "numbered",
|
|
345
345
|
"enable_inline_doi_resolution": False,
|
|
346
|
+
"docx": {"hide_si": False, "figures_at_end": False},
|
|
346
347
|
"cache": {"enabled": True, "ttl_hours": 24},
|
|
347
348
|
"version": "1.0",
|
|
348
349
|
"acknowledge_rxiv_maker": True,
|
|
@@ -8,11 +8,26 @@ import re
|
|
|
8
8
|
from typing import Dict, List
|
|
9
9
|
|
|
10
10
|
from ..converters.citation_processor import extract_citations_from_text
|
|
11
|
+
from ..utils.citation_range_formatter import format_citation_ranges
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
class CitationMapper:
|
|
14
15
|
"""Maps citation keys to sequential numbers for DOCX export."""
|
|
15
16
|
|
|
17
|
+
@staticmethod
|
|
18
|
+
def _format_citation_ranges(text: str) -> str:
|
|
19
|
+
"""Format consecutive citations as ranges.
|
|
20
|
+
|
|
21
|
+
Uses centralized citation range formatter from utils module.
|
|
22
|
+
|
|
23
|
+
Args:
|
|
24
|
+
text: Text with numbered citations
|
|
25
|
+
|
|
26
|
+
Returns:
|
|
27
|
+
Text with consecutive citations formatted as ranges
|
|
28
|
+
"""
|
|
29
|
+
return format_citation_ranges(text)
|
|
30
|
+
|
|
16
31
|
def create_mapping(self, citations: List[str]) -> Dict[str, int]:
|
|
17
32
|
"""Create citation key → number mapping.
|
|
18
33
|
|
|
@@ -121,4 +136,7 @@ class CitationMapper:
|
|
|
121
136
|
for i, pattern in enumerate(email_patterns):
|
|
122
137
|
text = text.replace(f"__EMAIL_PATTERN_{i}__", pattern)
|
|
123
138
|
|
|
139
|
+
# Format consecutive citations as ranges (e.g., [1][2][3] -> [1-3])
|
|
140
|
+
text = self._format_citation_ranges(text)
|
|
141
|
+
|
|
124
142
|
return text
|
|
@@ -7,6 +7,8 @@ DOCX generation with python-docx.
|
|
|
7
7
|
import re
|
|
8
8
|
from typing import Any, Dict, List, Optional
|
|
9
9
|
|
|
10
|
+
from ..utils.comment_filter import is_metadata_comment
|
|
11
|
+
|
|
10
12
|
|
|
11
13
|
class DocxContentProcessor:
|
|
12
14
|
"""Parses markdown content into structured format for DOCX writing."""
|
|
@@ -55,10 +57,38 @@ class DocxContentProcessor:
|
|
|
55
57
|
i += 1
|
|
56
58
|
continue
|
|
57
59
|
|
|
58
|
-
#
|
|
60
|
+
# Parse HTML/markdown comments (single-line and multi-line)
|
|
61
|
+
# Skip informational/metadata comments (those starting with "Note:")
|
|
59
62
|
if line.strip().startswith("<!--"):
|
|
60
|
-
|
|
61
|
-
|
|
63
|
+
# Check if it's a single-line comment
|
|
64
|
+
if line.strip().endswith("-->"):
|
|
65
|
+
# Single-line comment
|
|
66
|
+
comment_text = line.strip()[4:-3].strip()
|
|
67
|
+
# Skip metadata comments (e.g., "note that...", "Comment: ...")
|
|
68
|
+
if comment_text and not is_metadata_comment(comment_text):
|
|
69
|
+
sections.append({"type": "comment", "text": comment_text})
|
|
70
|
+
i += 1
|
|
71
|
+
continue
|
|
72
|
+
else:
|
|
73
|
+
# Multi-line comment - collect all lines until -->
|
|
74
|
+
comment_lines = [line.strip()[4:]] # Remove <!--
|
|
75
|
+
i += 1
|
|
76
|
+
while i < len(lines):
|
|
77
|
+
if lines[i].strip().endswith("-->"):
|
|
78
|
+
# Last line of comment
|
|
79
|
+
comment_lines.append(lines[i].strip()[:-3]) # Remove -->
|
|
80
|
+
i += 1
|
|
81
|
+
break
|
|
82
|
+
else:
|
|
83
|
+
comment_lines.append(lines[i].strip())
|
|
84
|
+
i += 1
|
|
85
|
+
|
|
86
|
+
# Join and add comment
|
|
87
|
+
comment_text = " ".join(comment_lines).strip()
|
|
88
|
+
# Skip metadata comments (e.g., "note that...", "Comment: ...")
|
|
89
|
+
if comment_text and not is_metadata_comment(comment_text):
|
|
90
|
+
sections.append({"type": "comment", "text": comment_text})
|
|
91
|
+
continue
|
|
62
92
|
|
|
63
93
|
# Skip LaTeX commands like <clearpage>
|
|
64
94
|
if line.strip().startswith("<") and line.strip().endswith(">") and " " not in line.strip():
|
|
@@ -335,18 +365,21 @@ class DocxContentProcessor:
|
|
|
335
365
|
runs = []
|
|
336
366
|
|
|
337
367
|
# Find all formatting markers, links, and citations
|
|
338
|
-
# Pattern to match: <<HIGHLIGHT_YELLOW>>text<</HIGHLIGHT_YELLOW>>, <<XREF>>text<</XREF>>, [text](url), **bold**, __underlined__, *italic*, _italic_, `code`, $math$, [number]
|
|
368
|
+
# Pattern to match: <<HIGHLIGHT_YELLOW>>text<</HIGHLIGHT_YELLOW>>, <<XREF:type>>text<</XREF>>, <!-- comment -->, [text](url), **bold**, __underlined__, *italic*, _italic_, ~subscript~, ^superscript^, `code`, $math$, [number]
|
|
339
369
|
pattern = re.compile(
|
|
340
370
|
r"(<<HIGHLIGHT_YELLOW>>([^<]+)<</HIGHLIGHT_YELLOW>>)" # Yellow highlight (must be first)
|
|
341
|
-
r"|(<<XREF>>([^<]+)<</XREF>>)" # Cross-reference
|
|
371
|
+
r"|(<<XREF:(\w+)>>([^<]+)<</XREF>>)" # Cross-reference with type
|
|
372
|
+
r"|(<!--\s*(.+?)\s*-->)" # HTML comments (inline)
|
|
342
373
|
r"|(\[([^\]]+)\]\(([^)]+)\))" # Markdown link [text](url) (before citations)
|
|
343
374
|
r"|(\*\*([^*]+)\*\*)" # Bold
|
|
344
375
|
r"|(__([^_]+)__)" # Underline with double underscores (must come before single underscore)
|
|
345
376
|
r"|(\*([^*]+)\*)" # Italic with asterisks
|
|
346
377
|
r"|(_([^_]+)_)" # Italic with underscores
|
|
378
|
+
r"|(~([^~]+)~)" # Subscript
|
|
379
|
+
r"|(\^([^^]+)\^)" # Superscript
|
|
347
380
|
r"|(`([^`]+)`)" # Code
|
|
348
381
|
r"|(\$([^\$]+)\$)" # Inline math
|
|
349
|
-
r"|(\[(\d+(
|
|
382
|
+
r"|(\[(\d+(?:[-,]\s*\d+)*)\])" # Citation numbers (supports both ranges [1-3] and lists [1, 2])
|
|
350
383
|
)
|
|
351
384
|
|
|
352
385
|
last_end = 0
|
|
@@ -378,67 +411,99 @@ class DocxContentProcessor:
|
|
|
378
411
|
if run["type"] == "text":
|
|
379
412
|
run["highlight_yellow"] = True
|
|
380
413
|
runs.append(run)
|
|
381
|
-
elif match.group(3): # Cross-reference
|
|
414
|
+
elif match.group(3): # Cross-reference with type
|
|
382
415
|
runs.append(
|
|
383
416
|
{
|
|
384
417
|
"type": "text",
|
|
385
|
-
"text": match.group(
|
|
418
|
+
"text": match.group(5), # Text is now in group 5
|
|
386
419
|
"bold": False,
|
|
387
420
|
"italic": False,
|
|
388
421
|
"underline": False,
|
|
389
422
|
"code": False,
|
|
390
423
|
"xref": True,
|
|
424
|
+
"xref_type": match.group(4), # Type is in group 4
|
|
391
425
|
}
|
|
392
426
|
)
|
|
393
|
-
elif match.group(
|
|
427
|
+
elif match.group(6): # Inline HTML comment
|
|
428
|
+
comment_text = match.group(7).strip()
|
|
429
|
+
# Skip metadata comments (e.g., "note that...", "Comment: ...")
|
|
430
|
+
if comment_text and not is_metadata_comment(comment_text):
|
|
431
|
+
runs.append({"type": "inline_comment", "text": comment_text})
|
|
432
|
+
elif match.group(8): # Markdown link [text](url)
|
|
394
433
|
runs.append(
|
|
395
434
|
{
|
|
396
435
|
"type": "hyperlink",
|
|
397
|
-
"text": match.group(
|
|
398
|
-
"url": match.group(
|
|
436
|
+
"text": match.group(9),
|
|
437
|
+
"url": match.group(10),
|
|
399
438
|
}
|
|
400
439
|
)
|
|
401
|
-
elif match.group(
|
|
440
|
+
elif match.group(11): # Bold
|
|
402
441
|
# Recursively parse inner text for underline/italic/other formatting
|
|
403
|
-
inner_text = match.group(
|
|
442
|
+
inner_text = match.group(12)
|
|
404
443
|
inner_runs = self._parse_inline_formatting(inner_text, citation_map)
|
|
405
444
|
# Add bold to all inner runs
|
|
406
445
|
for run in inner_runs:
|
|
407
446
|
if run["type"] == "text":
|
|
408
447
|
run["bold"] = True
|
|
409
448
|
runs.append(run)
|
|
410
|
-
elif match.group(
|
|
449
|
+
elif match.group(13): # Underline
|
|
411
450
|
# Recursively parse inner text for bold/italic/other formatting
|
|
412
|
-
inner_text = match.group(
|
|
451
|
+
inner_text = match.group(14)
|
|
413
452
|
inner_runs = self._parse_inline_formatting(inner_text, citation_map)
|
|
414
453
|
# Add underline to all inner runs
|
|
415
454
|
for run in inner_runs:
|
|
416
455
|
if run["type"] == "text":
|
|
417
456
|
run["underline"] = True
|
|
418
457
|
runs.append(run)
|
|
419
|
-
elif match.group(
|
|
458
|
+
elif match.group(15): # Italic with asterisks
|
|
420
459
|
# Recursively parse inner text for bold/underline/other formatting
|
|
421
|
-
inner_text = match.group(
|
|
460
|
+
inner_text = match.group(16)
|
|
422
461
|
inner_runs = self._parse_inline_formatting(inner_text, citation_map)
|
|
423
462
|
# Add italic to all inner runs
|
|
424
463
|
for run in inner_runs:
|
|
425
464
|
if run["type"] == "text":
|
|
426
465
|
run["italic"] = True
|
|
427
466
|
runs.append(run)
|
|
428
|
-
elif match.group(
|
|
467
|
+
elif match.group(17): # Italic with underscores
|
|
429
468
|
# Recursively parse inner text for bold/underline/other formatting
|
|
430
|
-
inner_text = match.group(
|
|
469
|
+
inner_text = match.group(18)
|
|
431
470
|
inner_runs = self._parse_inline_formatting(inner_text, citation_map)
|
|
432
471
|
# Add italic to all inner runs
|
|
433
472
|
for run in inner_runs:
|
|
434
473
|
if run["type"] == "text":
|
|
435
474
|
run["italic"] = True
|
|
436
475
|
runs.append(run)
|
|
437
|
-
elif match.group(
|
|
476
|
+
elif match.group(19): # Subscript
|
|
438
477
|
runs.append(
|
|
439
478
|
{
|
|
440
479
|
"type": "text",
|
|
441
|
-
"text": match.group(
|
|
480
|
+
"text": match.group(20),
|
|
481
|
+
"bold": False,
|
|
482
|
+
"italic": False,
|
|
483
|
+
"underline": False,
|
|
484
|
+
"code": False,
|
|
485
|
+
"xref": False,
|
|
486
|
+
"subscript": True,
|
|
487
|
+
}
|
|
488
|
+
)
|
|
489
|
+
elif match.group(21): # Superscript
|
|
490
|
+
runs.append(
|
|
491
|
+
{
|
|
492
|
+
"type": "text",
|
|
493
|
+
"text": match.group(22),
|
|
494
|
+
"bold": False,
|
|
495
|
+
"italic": False,
|
|
496
|
+
"underline": False,
|
|
497
|
+
"code": False,
|
|
498
|
+
"xref": False,
|
|
499
|
+
"superscript": True,
|
|
500
|
+
}
|
|
501
|
+
)
|
|
502
|
+
elif match.group(23): # Code
|
|
503
|
+
runs.append(
|
|
504
|
+
{
|
|
505
|
+
"type": "text",
|
|
506
|
+
"text": match.group(24),
|
|
442
507
|
"bold": False,
|
|
443
508
|
"italic": False,
|
|
444
509
|
"underline": False,
|
|
@@ -446,14 +511,23 @@ class DocxContentProcessor:
|
|
|
446
511
|
"xref": False,
|
|
447
512
|
}
|
|
448
513
|
)
|
|
449
|
-
elif match.group(
|
|
450
|
-
runs.append({"type": "inline_equation", "latex": match.group(
|
|
451
|
-
elif match.group(
|
|
452
|
-
#
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
514
|
+
elif match.group(25): # Inline math
|
|
515
|
+
runs.append({"type": "inline_equation", "latex": match.group(26)})
|
|
516
|
+
elif match.group(27): # Citation
|
|
517
|
+
# Keep citation as formatted text with yellow highlighting
|
|
518
|
+
# The citation mapper has already formatted ranges (e.g., [1-3], [1, 4-6, 8])
|
|
519
|
+
citation_text = match.group(0) # Full match including brackets
|
|
520
|
+
runs.append(
|
|
521
|
+
{
|
|
522
|
+
"type": "text",
|
|
523
|
+
"text": citation_text,
|
|
524
|
+
"bold": False,
|
|
525
|
+
"italic": False,
|
|
526
|
+
"underline": False,
|
|
527
|
+
"code": False,
|
|
528
|
+
"highlight_yellow": True, # Highlight citations in yellow
|
|
529
|
+
}
|
|
530
|
+
)
|
|
457
531
|
|
|
458
532
|
last_end = match.end()
|
|
459
533
|
|
|
@@ -516,6 +590,7 @@ class DocxContentProcessor:
|
|
|
516
590
|
# Look ahead for caption line (skip empty lines)
|
|
517
591
|
caption = ""
|
|
518
592
|
label = ""
|
|
593
|
+
is_supplementary = False # Default to main figure
|
|
519
594
|
next_i = start_idx + 1
|
|
520
595
|
|
|
521
596
|
# Skip empty lines to find caption
|
|
@@ -529,6 +604,9 @@ class DocxContentProcessor:
|
|
|
529
604
|
|
|
530
605
|
# Check for {#fig:label ...} or {#sfig:label ...} **Caption**
|
|
531
606
|
if next_line and (next_line.startswith("{#fig:") or next_line.startswith("{#sfig:")):
|
|
607
|
+
# Detect if it's a supplementary figure
|
|
608
|
+
is_supplementary = next_line.startswith("{#sfig:")
|
|
609
|
+
|
|
532
610
|
# Extract label if present
|
|
533
611
|
label_match = re.match(r"\{#s?fig:(\w+)[^}]*\}", next_line)
|
|
534
612
|
if label_match:
|
|
@@ -572,6 +650,7 @@ class DocxContentProcessor:
|
|
|
572
650
|
"alt": alt_text,
|
|
573
651
|
"caption": caption,
|
|
574
652
|
"label": label,
|
|
653
|
+
"is_supplementary": is_supplementary,
|
|
575
654
|
}, next_i
|
|
576
655
|
|
|
577
656
|
def _parse_table(self, lines: List[str], start_idx: int) -> tuple[Optional[Dict[str, Any]], int]:
|
|
@@ -626,7 +705,8 @@ class DocxContentProcessor:
|
|
|
626
705
|
if i < len(lines):
|
|
627
706
|
caption_line = lines[i].strip()
|
|
628
707
|
# Match {#stable:label} Caption or {#table:label} Caption
|
|
629
|
-
|
|
708
|
+
# Allow hyphens and underscores in label names (e.g., "tool-comparison")
|
|
709
|
+
caption_match = re.match(r"^\{#(stable|table):([\w-]+)\}\s*(.+)$", caption_line)
|
|
630
710
|
if caption_match:
|
|
631
711
|
label = f"{caption_match.group(1)}:{caption_match.group(2)}"
|
|
632
712
|
caption = caption_match.group(3).strip()
|