rxiv-maker 1.16.8__py3-none-any.whl → 1.18.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxiv_maker/__version__.py +1 -1
- rxiv_maker/cli/commands/build.py +7 -0
- rxiv_maker/cli/framework/workflow_commands.py +69 -3
- rxiv_maker/converters/citation_processor.py +5 -3
- rxiv_maker/core/managers/config_manager.py +1 -0
- rxiv_maker/exporters/docx_citation_mapper.py +18 -0
- rxiv_maker/exporters/docx_content_processor.py +110 -30
- rxiv_maker/exporters/docx_exporter.py +76 -32
- rxiv_maker/exporters/docx_writer.py +345 -67
- rxiv_maker/templates/registry.py +1 -0
- rxiv_maker/tex/style/rxiv_maker_style.cls +33 -33
- rxiv_maker/utils/accent_character_map.py +150 -0
- rxiv_maker/utils/author_affiliation_processor.py +128 -0
- rxiv_maker/utils/citation_range_formatter.py +118 -0
- rxiv_maker/utils/comment_filter.py +46 -0
- rxiv_maker/utils/docx_helpers.py +43 -118
- rxiv_maker/utils/label_extractor.py +185 -0
- rxiv_maker/utils/pdf_splitter.py +116 -0
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/METADATA +2 -1
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/RECORD +23 -17
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/WHEEL +0 -0
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/entry_points.txt +0 -0
- {rxiv_maker-1.16.8.dist-info → rxiv_maker-1.18.0.dist-info}/licenses/LICENSE +0 -0
rxiv_maker/utils/docx_helpers.py
CHANGED
|
@@ -7,6 +7,7 @@ This module provides utility functions for DOCX generation including:
|
|
|
7
7
|
- PDF to image conversion
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
|
+
import html
|
|
10
11
|
import io
|
|
11
12
|
import logging
|
|
12
13
|
import re
|
|
@@ -201,124 +202,18 @@ def clean_latex_commands(text: str) -> str:
|
|
|
201
202
|
>>> clean_latex_commands("Griffi{\\'e}")
|
|
202
203
|
'Griffié'
|
|
203
204
|
"""
|
|
205
|
+
# First, handle escaped HTML entities from BibTeX (\&\#233 -> é)
|
|
206
|
+
text = text.replace("\\&\\#", "&#")
|
|
207
|
+
text = text.replace("\\&#", "&#") # Handle partially escaped variants
|
|
208
|
+
|
|
209
|
+
# Then decode HTML entities (é -> é, á -> á, … -> …, etc.)
|
|
210
|
+
text = html.unescape(text)
|
|
211
|
+
|
|
204
212
|
# Convert LaTeX accent commands to Unicode
|
|
205
|
-
#
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
"\\'e": "é",
|
|
210
|
-
"{\\'e}": "é",
|
|
211
|
-
"{'e}": "é",
|
|
212
|
-
"{'é}": "é",
|
|
213
|
-
"\\'a": "á",
|
|
214
|
-
"{\\'a}": "á",
|
|
215
|
-
"{'a}": "á",
|
|
216
|
-
"{'á}": "á",
|
|
217
|
-
"\\'i": "í",
|
|
218
|
-
"{\\'i}": "í",
|
|
219
|
-
"{'i}": "í",
|
|
220
|
-
"{'í}": "í",
|
|
221
|
-
"'{\\i}": "í", # Acute on dotless i
|
|
222
|
-
"\\'o": "ó",
|
|
223
|
-
"{\\'o}": "ó",
|
|
224
|
-
"{'o}": "ó",
|
|
225
|
-
"{'ó}": "ó",
|
|
226
|
-
"'{o}": "ó", # Acute o (variant without backslash)
|
|
227
|
-
"\\'u": "ú",
|
|
228
|
-
"{\\'u}": "ú",
|
|
229
|
-
"{'u}": "ú",
|
|
230
|
-
"{'ú}": "ú",
|
|
231
|
-
# Uppercase acute accents
|
|
232
|
-
"\\'E": "É",
|
|
233
|
-
"{\\'E}": "É",
|
|
234
|
-
"{'E}": "É",
|
|
235
|
-
"\\'A": "Á",
|
|
236
|
-
"{\\'A}": "Á",
|
|
237
|
-
"{'A}": "Á",
|
|
238
|
-
"\\'I": "Í",
|
|
239
|
-
"{\\'I}": "Í",
|
|
240
|
-
"{'I}": "Í",
|
|
241
|
-
"'{\\I}": "Í", # Acute on uppercase dotless I
|
|
242
|
-
"\\'O": "Ó",
|
|
243
|
-
"{\\'O}": "Ó",
|
|
244
|
-
"{'O}": "Ó",
|
|
245
|
-
"'{O}": "Ó",
|
|
246
|
-
"\\'U": "Ú",
|
|
247
|
-
"{\\'U}": "Ú",
|
|
248
|
-
"{'U}": "Ú",
|
|
249
|
-
# Umlaut/diaeresis (ë, ä, ï, ö, ü)
|
|
250
|
-
'\\"e': "ë",
|
|
251
|
-
'{\\"e}': "ë",
|
|
252
|
-
'{"e}': "ë",
|
|
253
|
-
'{"ë}': "ë",
|
|
254
|
-
'\\"a': "ä",
|
|
255
|
-
'{\\"a}': "ä",
|
|
256
|
-
'{"a}': "ä",
|
|
257
|
-
'{"ä}': "ä",
|
|
258
|
-
'\\"i': "ï",
|
|
259
|
-
'{\\"i}': "ï",
|
|
260
|
-
'{"i}': "ï",
|
|
261
|
-
'{"ï}': "ï",
|
|
262
|
-
'\\"o': "ö",
|
|
263
|
-
'{\\"o}': "ö",
|
|
264
|
-
'{"o}': "ö",
|
|
265
|
-
'{"ö}': "ö",
|
|
266
|
-
'\\"u': "ü",
|
|
267
|
-
'{\\"u}': "ü",
|
|
268
|
-
'{"u}': "ü",
|
|
269
|
-
'{"ü}': "ü",
|
|
270
|
-
# Grave accents (è, à)
|
|
271
|
-
"\\`e": "è",
|
|
272
|
-
"{\\`e}": "è",
|
|
273
|
-
"{`e}": "è",
|
|
274
|
-
"{`è}": "è",
|
|
275
|
-
"\\`a": "à",
|
|
276
|
-
"{\\`a}": "à",
|
|
277
|
-
"{`a}": "à",
|
|
278
|
-
"{`à}": "à",
|
|
279
|
-
# Circumflex (ê, â)
|
|
280
|
-
"\\^e": "ê",
|
|
281
|
-
"{\\^e}": "ê",
|
|
282
|
-
"{^e}": "ê",
|
|
283
|
-
"{^ê}": "ê",
|
|
284
|
-
"\\^a": "â",
|
|
285
|
-
"{\\^a}": "â",
|
|
286
|
-
"{^a}": "â",
|
|
287
|
-
"{^â}": "â",
|
|
288
|
-
# Tilde (ñ, ã, õ)
|
|
289
|
-
"\\~n": "ñ",
|
|
290
|
-
"{\\~n}": "ñ",
|
|
291
|
-
"{~n}": "ñ",
|
|
292
|
-
"{~ñ}": "ñ",
|
|
293
|
-
"~{n}": "ñ",
|
|
294
|
-
"\\~a": "ã",
|
|
295
|
-
"{\\~a}": "ã",
|
|
296
|
-
"{~a}": "ã",
|
|
297
|
-
"~{a}": "ã", # Tilde on a (variant)
|
|
298
|
-
"{~ã}": "ã",
|
|
299
|
-
"\\~o": "õ",
|
|
300
|
-
"{\\~o}": "õ",
|
|
301
|
-
"{~o}": "õ",
|
|
302
|
-
"~{o}": "õ", # Tilde on o (variant)
|
|
303
|
-
"{~õ}": "õ",
|
|
304
|
-
# Uppercase tilde
|
|
305
|
-
"\\~N": "Ñ",
|
|
306
|
-
"{\\~N}": "Ñ",
|
|
307
|
-
"~{N}": "Ñ",
|
|
308
|
-
"\\~A": "Ã",
|
|
309
|
-
"{\\~A}": "Ã",
|
|
310
|
-
"~{A}": "Ã",
|
|
311
|
-
"\\~O": "Õ",
|
|
312
|
-
"{\\~O}": "Õ",
|
|
313
|
-
"~{O}": "Õ",
|
|
314
|
-
# Cedilla (ç)
|
|
315
|
-
"\\c{c}": "ç",
|
|
316
|
-
"{\\c{c}}": "ç",
|
|
317
|
-
"{\\c{ç}}": "ç",
|
|
318
|
-
}
|
|
319
|
-
|
|
320
|
-
for latex_cmd, unicode_char in accent_map.items():
|
|
321
|
-
text = text.replace(latex_cmd, unicode_char)
|
|
213
|
+
# Uses centralized accent map from accent_character_map module
|
|
214
|
+
from .accent_character_map import clean_latex_accents
|
|
215
|
+
|
|
216
|
+
text = clean_latex_accents(text)
|
|
322
217
|
|
|
323
218
|
# Remove common formatting commands but keep their content
|
|
324
219
|
text = re.sub(r"\\textbf\{([^}]+)\}", r"\1", text)
|
|
@@ -338,7 +233,37 @@ def clean_latex_commands(text: str) -> str:
|
|
|
338
233
|
# Remove lone backslashes
|
|
339
234
|
text = re.sub(r"\\(?![a-zA-Z])", "", text)
|
|
340
235
|
|
|
341
|
-
|
|
236
|
+
# Remove braces around single characters or short words (common BibTeX artifact)
|
|
237
|
+
# This handles cases like {P} or {n} that appear after HTML entity decoding issues
|
|
238
|
+
text = re.sub(r"\{([A-Za-z]{1,3})\}", r"\1", text)
|
|
239
|
+
|
|
240
|
+
# Remove unmatched opening braces at start of words (e.g., "{Sperr" -> "Sperr")
|
|
241
|
+
text = re.sub(r"\{([A-Za-z])", r"\1", text)
|
|
242
|
+
|
|
243
|
+
# Remove unmatched closing braces at end of words or after accented characters (e.g., "Team}" -> "Team", "Pé}" -> "Pé")
|
|
244
|
+
text = re.sub(r"([A-Za-zÀ-ÿ])\}", r"\1", text)
|
|
245
|
+
|
|
246
|
+
# Remove isolated braces (opening or closing)
|
|
247
|
+
text = re.sub(r"\{(?![A-Za-z])", "", text) # Opening brace not followed by letter
|
|
248
|
+
text = re.sub(r"(?<![A-Za-z])\}", "", text) # Closing brace not preceded by letter
|
|
249
|
+
|
|
250
|
+
# Fix common malformed author name patterns from bad BibTeX encoding
|
|
251
|
+
# Pattern: "Pé and Rez, Fernando" -> "Pérez, Fernando" (very short word with accent + capitalized word + comma)
|
|
252
|
+
# Only match if first word is 2-4 chars and ends with accented character
|
|
253
|
+
def fix_name_case(match):
|
|
254
|
+
part1, part2 = match.group(1), match.group(2)
|
|
255
|
+
# Lowercase the second part since it's continuation of first name
|
|
256
|
+
return f"{part1}{part2.lower()},"
|
|
257
|
+
|
|
258
|
+
text = re.sub(r"\b([A-ZÀ-Ÿ][à-ÿ]{1,3}) and ([A-Z][a-z]+),", fix_name_case, text)
|
|
259
|
+
# Pattern: "Damiá and n and" -> "Damián and" (word ending in accent + isolated letter + " and")
|
|
260
|
+
text = re.sub(r"\b([A-ZÀ-Ÿ][a-zà-ÿ]+[à-ÿ]) and ([a-zà-ÿ])\s+and\s+", r"\1\2 and ", text)
|
|
261
|
+
|
|
262
|
+
# Clean up any remaining empty braces or double spaces
|
|
263
|
+
text = re.sub(r"\{\}", "", text)
|
|
264
|
+
text = re.sub(r"\s+", " ", text)
|
|
265
|
+
|
|
266
|
+
return text.strip()
|
|
342
267
|
|
|
343
268
|
|
|
344
269
|
def truncate_text(text: str, max_length: int = 100, suffix: str = "...") -> str:
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
r"""Label extraction utilities for manuscript processing.
|
|
2
|
+
|
|
3
|
+
This module provides centralized label extraction for figures, tables, equations,
|
|
4
|
+
and supplementary elements. Used by both DOCX export and LaTeX processing to
|
|
5
|
+
create consistent numbering across formats.
|
|
6
|
+
|
|
7
|
+
Examples:
|
|
8
|
+
>>> extractor = LabelExtractor()
|
|
9
|
+
>>> content = "\\n{#fig:results}"
|
|
10
|
+
>>> fig_map = extractor.extract_figure_labels(content)
|
|
11
|
+
>>> fig_map
|
|
12
|
+
{'results': 1}
|
|
13
|
+
r
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
from typing import Dict, Tuple
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class LabelExtractor:
    r"""Extract reference labels from markdown content and number them.

    Each ``extract_*`` method scans the text for one family of labels and
    returns a dict mapping every label name to a 1-based number assigned in
    order of first appearance. All methods are static, so they can be called
    on the class or on an instance.
    """

    @staticmethod
    def _number_labels(labels: list) -> Dict[str, int]:
        r"""Number labels 1..N in order of first appearance.

        Args:
            labels: Label names in the order they were found; may repeat.

        Returns:
            Dict mapping each distinct label to its sequential number.

        Examples:
            >>> LabelExtractor._number_labels(["a", "b", "a"])
            {'a': 1, 'b': 2}
        """
        # dict.fromkeys keeps first-seen order and drops repeats, so a label
        # that occurs twice neither leaves a gap in the numbering nor ends up
        # with the index of its last occurrence.
        return {label: number for number, label in enumerate(dict.fromkeys(labels), start=1)}

    @staticmethod
    def extract_figure_labels(content: str) -> Dict[str, int]:
        r"""Extract main figure labels and create number mapping.

        Finds an image (``![caption](path)``) followed, after a line break,
        by ``{#fig:label``. The closing brace is not required by the pattern,
        so additional text may follow the label inside the braces.

        Args:
            content: Markdown content to scan

        Returns:
            Dict mapping label names to sequential numbers

        Examples:
            >>> content = "![A](a.png)\n{#fig:first}\n\n![B](b.png)\n{#fig:second}"
            >>> LabelExtractor.extract_figure_labels(content)
            {'first': 1, 'second': 2}
        """
        # Label names may contain word characters and hyphens.
        found = re.findall(r"!\[[^\]]*\]\([^)]+\)\s*\n\s*\{#fig:([\w-]+)", content)
        return LabelExtractor._number_labels(found)

    @staticmethod
    def extract_supplementary_figure_labels(content: str) -> Dict[str, int]:
        r"""Extract supplementary figure labels and create number mapping.

        Same pattern as main figures, but with the ``{#sfig:label`` prefix.

        Args:
            content: Markdown content to scan

        Returns:
            Dict mapping label names to sequential numbers

        Examples:
            >>> content = "![M](m.png)\n{#sfig:methods}\n\n![D](d.png)\n{#sfig:data}"
            >>> LabelExtractor.extract_supplementary_figure_labels(content)
            {'methods': 1, 'data': 2}
        """
        found = re.findall(r"!\[[^\]]*\]\([^)]+\)\s*\n\s*\{#sfig:([\w-]+)", content)
        return LabelExtractor._number_labels(found)

    @staticmethod
    def extract_supplementary_table_labels(content: str) -> Dict[str, int]:
        r"""Extract supplementary table labels and create number mapping.

        Recognises both the markdown form ``{#stable:label}`` and the LaTeX
        form ``\label{stable:label}``. If at least one LaTeX label is present,
        only the LaTeX labels are used; the markdown labels are a fallback.

        Args:
            content: Markdown/LaTeX content to scan

        Returns:
            Dict mapping label names to sequential numbers

        Examples:
            >>> content = "{#stable:params}\n\n{#stable:results}"
            >>> LabelExtractor.extract_supplementary_table_labels(content)
            {'params': 1, 'results': 2}
        """
        latex_labels = re.findall(r"\\label\{stable:([\w-]+)\}", content)
        if latex_labels:
            # Prefer LaTeX labels (matches PDF behavior).
            return LabelExtractor._number_labels(latex_labels)

        markdown_labels = re.findall(r"\{#stable:([\w-]+)\}", content)
        return LabelExtractor._number_labels(markdown_labels)

    @staticmethod
    def extract_supplementary_note_labels(content: str) -> Dict[str, int]:
        r"""Extract supplementary note labels and create number mapping.

        Finds patterns like: ``{#snote:label}``

        Args:
            content: Markdown content to scan

        Returns:
            Dict mapping label names to sequential numbers

        Examples:
            >>> content = "{#snote:methods}\n\n{#snote:analysis}"
            >>> LabelExtractor.extract_supplementary_note_labels(content)
            {'methods': 1, 'analysis': 2}
        """
        found = re.findall(r"\{#snote:([\w-]+)\}", content)
        return LabelExtractor._number_labels(found)

    @staticmethod
    def extract_equation_labels(content: str) -> Dict[str, int]:
        r"""Extract equation labels and create number mapping.

        Finds patterns like: ``{#eq:label}``

        Args:
            content: Markdown content to scan

        Returns:
            Dict mapping label names to sequential numbers

        Examples:
            >>> content = "{#eq:energy}\n\n{#eq:momentum}"
            >>> LabelExtractor.extract_equation_labels(content)
            {'energy': 1, 'momentum': 2}
        """
        found = re.findall(r"\{#eq:([\w-]+)\}", content)
        return LabelExtractor._number_labels(found)

    @staticmethod
    def extract_all_labels(
        main_content: str, si_content: str = ""
    ) -> Tuple[Dict[str, int], Dict[str, int], Dict[str, int], Dict[str, int], Dict[str, int]]:
        r"""Extract all label types from content.

        Main figures and equations are always read from ``main_content``.
        Supplementary figures, tables and notes are read from ``si_content``
        when it is non-empty, otherwise from ``main_content``.

        Args:
            main_content: Main manuscript content
            si_content: Supplementary information content (optional)

        Returns:
            Tuple of (figure_map, sfig_map, stable_map, snote_map, eq_map)

        Examples:
            >>> main = "![Main](main.png)\n{#fig:main}\n{#eq:formula}"
            >>> si = "![Extra](extra.png)\n{#sfig:extra}"
            >>> fig, sfig, stable, snote, eq = LabelExtractor.extract_all_labels(main, si)
            >>> (fig, sfig, eq)
            ({'main': 1}, {'extra': 1}, {'formula': 1})
        """
        # Main figures and equations from main content
        figure_map = LabelExtractor.extract_figure_labels(main_content)
        equation_map = LabelExtractor.extract_equation_labels(main_content)

        # Supplementary elements from SI content if provided, else main content
        content_for_si = si_content if si_content else main_content

        sfig_map = LabelExtractor.extract_supplementary_figure_labels(content_for_si)
        stable_map = LabelExtractor.extract_supplementary_table_labels(content_for_si)
        snote_map = LabelExtractor.extract_supplementary_note_labels(content_for_si)

        return figure_map, sfig_map, stable_map, snote_map, equation_map
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""PDF splitting utilities for separating main and SI sections."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
from typing import Optional, Tuple
|
|
6
|
+
|
|
7
|
+
from pypdf import PdfReader, PdfWriter
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def find_si_start_page(pdf_path: Path) -> Optional[int]:
    """Locate the first page whose text contains a Supplementary Information marker.

    Pages are scanned in order; the page text is upper-cased and compared
    against an upper-cased list of known headings, so matching is
    case-insensitive. A page whose text cannot be extracted is skipped.

    Args:
        pdf_path: Path to the PDF file

    Returns:
        Page number (0-indexed) of the first page containing a marker, or
        None if no page matches or the PDF cannot be read.
    """
    # Headings commonly used to open the supplementary section.
    si_markers = (
        "Supplementary Information",
        "Supplementary Material",
        "Supplementary Data",
        "Supporting Information",
        "SI APPENDIX",
        "SUPPLEMENTARY FIGURES",
        "Supplementary Methods",
    )

    try:
        for page_num, page in enumerate(PdfReader(pdf_path).pages):
            try:
                text_upper = page.extract_text().upper()

                # First marker (in list order) that occurs on this page, if any.
                marker = next((m for m in si_markers if m.upper() in text_upper), None)
                if marker is not None:
                    # Log with a 1-based page number; the return value stays 0-based.
                    logger.info(f"Found SI marker '{marker}' on page {page_num + 1}")
                    return page_num

            except Exception as e:
                # Best effort: an unreadable page must not abort the whole scan.
                logger.debug(f"Could not extract text from page {page_num}: {e}")
                continue

        logger.warning("Could not find SI start marker in PDF")
        return None

    except Exception as e:
        logger.error(f"Error finding SI start: {e}")
        return None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def split_pdf(
    pdf_path: Path,
    si_start_page: Optional[int] = None,
) -> Tuple[Optional[Path], Optional[Path]]:
    """Split PDF into main and SI sections.

    Writes two files next to the input: ``<stem>__main.pdf`` with the pages
    before ``si_start_page`` and ``<stem>__si.pdf`` with the pages from
    ``si_start_page`` to the end.

    Args:
        pdf_path: Path to the PDF file to split
        si_start_page: Page number (0-indexed) where SI starts. If None, will
            auto-detect via ``find_si_start_page``.

    Returns:
        Tuple of (main_pdf_path, si_pdf_path). Both are None if the start page
        cannot be determined, is out of range, or any error occurs.
    """
    try:
        # Auto-detect SI start if not provided
        if si_start_page is None:
            si_start_page = find_si_start_page(pdf_path)

        if si_start_page is None:
            logger.warning("Cannot split PDF: SI start page not found")
            return None, None

        # A negative index would silently wrap around when indexing pages and
        # produce an empty main PDF plus a wrong SI PDF, so reject it up front.
        if si_start_page < 0:
            logger.error(f"SI start page {si_start_page} must not be negative")
            return None, None

        reader = PdfReader(pdf_path)
        total_pages = len(reader.pages)

        if si_start_page >= total_pages:
            logger.error(f"SI start page {si_start_page} exceeds total pages {total_pages}")
            return None, None

        # Generate output paths
        stem = pdf_path.stem
        parent = pdf_path.parent

        main_path = parent / f"{stem}__main.pdf"
        si_path = parent / f"{stem}__si.pdf"

        # Create main PDF (pages before SI)
        main_writer = PdfWriter()
        for page_num in range(si_start_page):
            main_writer.add_page(reader.pages[page_num])

        with open(main_path, "wb") as f:
            main_writer.write(f)
        logger.info(f"Created main PDF: {main_path} ({si_start_page} pages)")

        # Create SI PDF (pages from SI onwards)
        si_writer = PdfWriter()
        for page_num in range(si_start_page, total_pages):
            si_writer.add_page(reader.pages[page_num])

        with open(si_path, "wb") as f:
            si_writer.write(f)
        logger.info(f"Created SI PDF: {si_path} ({total_pages - si_start_page} pages)")

        return main_path, si_path

    except Exception as e:
        # Best effort by design: callers receive (None, None) instead of an exception.
        logger.error(f"Error splitting PDF: {e}")
        return None, None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: rxiv-maker
|
|
3
|
-
Version: 1.
|
|
3
|
+
Version: 1.18.0
|
|
4
4
|
Summary: Write scientific preprints in Markdown. Generate publication-ready PDFs efficiently.
|
|
5
5
|
Project-URL: Homepage, https://github.com/HenriquesLab/rxiv-maker
|
|
6
6
|
Project-URL: Documentation, https://github.com/HenriquesLab/rxiv-maker#readme
|
|
@@ -56,6 +56,7 @@ Requires-Dist: mypy>=1.0; extra == 'dev'
|
|
|
56
56
|
Requires-Dist: nbstripout>=0.7.1; extra == 'dev'
|
|
57
57
|
Requires-Dist: nox>=2023.0.0; extra == 'dev'
|
|
58
58
|
Requires-Dist: pre-commit>=4.2.0; extra == 'dev'
|
|
59
|
+
Requires-Dist: pypdf2>=3.0.0; extra == 'dev'
|
|
59
60
|
Requires-Dist: pytest-asyncio>=0.21.0; extra == 'dev'
|
|
60
61
|
Requires-Dist: pytest-benchmark>=4.0.0; extra == 'dev'
|
|
61
62
|
Requires-Dist: pytest-clarity>=1.0.0; extra == 'dev'
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
rxiv_maker/__init__.py,sha256=p04JYC5ZhP6dLXkoWVlKNyiRvsDE1a4C88f9q4xO3tA,3268
|
|
2
|
-
rxiv_maker/__version__.py,sha256=
|
|
2
|
+
rxiv_maker/__version__.py,sha256=MVd3J-jKwgAFkETITQ9sUCvhAdXfezvIvUmYNFYviSc,51
|
|
3
3
|
rxiv_maker/rxiv_maker_cli.py,sha256=9Lu_mhFPXwx5jzAR6StCNxwCm_fkmP5qiOYdNuh_AwI,120
|
|
4
4
|
rxiv_maker/validate.py,sha256=AIzgP59KbCQJqC9WIGfUdVv0xI6ud9g1fFznQkaGz5Q,9373
|
|
5
5
|
rxiv_maker/cli/__init__.py,sha256=Jw0DTFUSofN-02xpVrt1UUzRcgH5NNd-GPNidhmNwpU,77
|
|
@@ -11,7 +11,7 @@ rxiv_maker/cli/main.py,sha256=AwqEcq46MPu8T72iqO39RWemtiD7WiAeUkfxACMUdgs,8697
|
|
|
11
11
|
rxiv_maker/cli/commands/__init__.py,sha256=jp_dRdPBJNhgZxA7ccAplbtESUBtb81k-boLA5yi3Fg,1396
|
|
12
12
|
rxiv_maker/cli/commands/arxiv.py,sha256=nlAS36lgTNjd6Hn1cdporXFZskf7-Jl-fZLvjxc6gjo,1245
|
|
13
13
|
rxiv_maker/cli/commands/bibliography.py,sha256=3a4gNtY7Lvd5-mwIj-vCD5WwRDgMqPT37tJETthKYKA,2956
|
|
14
|
-
rxiv_maker/cli/commands/build.py,sha256=
|
|
14
|
+
rxiv_maker/cli/commands/build.py,sha256=v513o3duOD9YvKUoOTggqshqQzwD4THBVIAClJTnx60,3525
|
|
15
15
|
rxiv_maker/cli/commands/cache_management.py,sha256=y58QsuSjzCz_IhY6iXnir8OoblHW5ZBMqItfnfjTP-Y,4577
|
|
16
16
|
rxiv_maker/cli/commands/changelog.py,sha256=OzEay8E8sWfXagjXyWiKwfE-OC9UKokWB7mbINp36t4,8461
|
|
17
17
|
rxiv_maker/cli/commands/check_installation.py,sha256=D_ewz_hGZTqOr_V4eWfnQHOsePTlY1WD3O6XTSpjGNM,1232
|
|
@@ -39,11 +39,11 @@ rxiv_maker/cli/framework/config_commands.py,sha256=a1uOQkCCw3d4qlro3OwHIorcoNg03
|
|
|
39
39
|
rxiv_maker/cli/framework/content_commands.py,sha256=RilxKeG2c1m2fu0CtWAvP3cGh11DGx9P-nh2kIewAg4,22596
|
|
40
40
|
rxiv_maker/cli/framework/decorators.py,sha256=fh085e3k1CaLSMoZevt8hvgnEuejrf-mcNS-dwXoY_A,10365
|
|
41
41
|
rxiv_maker/cli/framework/utility_commands.py,sha256=drIAc1TAYpne76gj7SZeZhPozVAY5uL9GFPVT_Ez0-E,26437
|
|
42
|
-
rxiv_maker/cli/framework/workflow_commands.py,sha256=
|
|
42
|
+
rxiv_maker/cli/framework/workflow_commands.py,sha256=Csls8VGmNCWPjpY9PMfdIAdzDhD_ZmSfRkhdcUihzpk,32699
|
|
43
43
|
rxiv_maker/config/defaults.py,sha256=vHyLGVxe5-z9TLxu5f6NhquPvqQkER_KZv_j1I4_dHQ,3055
|
|
44
44
|
rxiv_maker/config/validator.py,sha256=9XDPfo_YgasGt6NLkl6HIhaGh1fr6XsFNiXU2DSsivw,38299
|
|
45
45
|
rxiv_maker/converters/__init__.py,sha256=d7WGsRwWqRQWO117IkKDP0Ap0ERiK0N2-dXHInye3_A,685
|
|
46
|
-
rxiv_maker/converters/citation_processor.py,sha256=
|
|
46
|
+
rxiv_maker/converters/citation_processor.py,sha256=8nllASY5K-pCT2jgU1bpLXNRR-SPqWWBIxDy8BR6_5M,9083
|
|
47
47
|
rxiv_maker/converters/code_processor.py,sha256=ZFkJsqJ4nYUiDGtLeV7yWgOWUZNUXIFaBxZOPq88UrU,9324
|
|
48
48
|
rxiv_maker/converters/comment_processor.py,sha256=Tlem4btYqMmfRf5AM5s6ZbB_pZPvkFpACz8XsLNVl50,7845
|
|
49
49
|
rxiv_maker/converters/custom_command_processor.py,sha256=89wkIKP2It89qhXPBxiK-ib3Wwb5bKYsVf8E6BBRTlk,23467
|
|
@@ -79,7 +79,7 @@ rxiv_maker/core/cache/doi_cache.py,sha256=lw_ouHlfKNzg4zDMAOEUDhsFLfm88rWcPYBVyj
|
|
|
79
79
|
rxiv_maker/core/cache/secure_cache_utils.py,sha256=EejPWvxw_mUPqO0TRBHYYTsLXWZEUH1qykEfBwgpkcc,18000
|
|
80
80
|
rxiv_maker/core/managers/__init__.py,sha256=sh4ZuZH4YrAu4XTiN9ky1-tQQASKiSTY0udJJAzDRcU,950
|
|
81
81
|
rxiv_maker/core/managers/cache_manager.py,sha256=8btUaRDYPOrUynHWBMz7RVeS9xcpUoyhemFNUi8NqpQ,21893
|
|
82
|
-
rxiv_maker/core/managers/config_manager.py,sha256=
|
|
82
|
+
rxiv_maker/core/managers/config_manager.py,sha256=G1hlKNqwFO5i3Epq3XDI3awEYowhi_E7TczfMnLzDBQ,18894
|
|
83
83
|
rxiv_maker/core/managers/dependency_manager.py,sha256=qtUR4sQD2x1zv4Fi77d6ThwPx9eHZ9_ZyNy0hRpIKKQ,26308
|
|
84
84
|
rxiv_maker/core/managers/execution_manager.py,sha256=cEDS0KyWBHf_N74Fc8MC4VRXxmEcaz_DJtyWO5-o628,29585
|
|
85
85
|
rxiv_maker/core/managers/file_manager.py,sha256=SVRnP1JQoGCAms3E7iSpOp_RG60P36Qk9HGAmJDaFvE,18641
|
|
@@ -104,10 +104,10 @@ rxiv_maker/engines/operations/track_changes.py,sha256=jJZ-XnTFx8TMvcnX8_9D7ydc0G
|
|
|
104
104
|
rxiv_maker/engines/operations/validate.py,sha256=OVmtRVtG-r1hoA8IqYaNC-ijN1a5ixM3X5Z8Gda-O2M,17142
|
|
105
105
|
rxiv_maker/engines/operations/validate_pdf.py,sha256=qyrtL752Uap3i6ntQheY570soVjFZRJe8ANrw5AvHFs,5899
|
|
106
106
|
rxiv_maker/exporters/__init__.py,sha256=NcTD1SDb8tTgsHhCS1A7TVEZncyWbDRTa6sJIdLqcsE,350
|
|
107
|
-
rxiv_maker/exporters/docx_citation_mapper.py,sha256=
|
|
108
|
-
rxiv_maker/exporters/docx_content_processor.py,sha256=
|
|
109
|
-
rxiv_maker/exporters/docx_exporter.py,sha256=
|
|
110
|
-
rxiv_maker/exporters/docx_writer.py,sha256=
|
|
107
|
+
rxiv_maker/exporters/docx_citation_mapper.py,sha256=oSy1LglLvxlmhO18bzl3EInA2PleE8nXqEgQIIRVzwE,5170
|
|
108
|
+
rxiv_maker/exporters/docx_content_processor.py,sha256=FoOaF9BoEpZEF3HG3pzFZFgYbYKwbgRNwkOyURZ8XtI,27895
|
|
109
|
+
rxiv_maker/exporters/docx_exporter.py,sha256=7HYgQSGE_7xl3PTMSPKXIW6yHg-b3MU9ASXG-rKVSXo,21456
|
|
110
|
+
rxiv_maker/exporters/docx_writer.py,sha256=252nqO1_ItJzCiuvsrpVdlU4VbLVMSh5FvFpo4k2GtI,56510
|
|
111
111
|
rxiv_maker/install/__init__.py,sha256=kAB6P-12IKg_K1MQ-uzeC5IR11O2cNxj0t_2JMhooZs,590
|
|
112
112
|
rxiv_maker/install/dependency_handlers/__init__.py,sha256=NN9dP1usXpYgLpSw0uEnJ6ugX2zefihVjdyDdm1k-cE,231
|
|
113
113
|
rxiv_maker/install/dependency_handlers/latex.py,sha256=xopSJxYkg3D63rH7RoVLN-Ykl87AZqhlUrrG3m6LoWo,3304
|
|
@@ -141,17 +141,21 @@ rxiv_maker/services/publication_service.py,sha256=0p8yQ1jrY3RHwCkzTEl_sAbWYTafRk
|
|
|
141
141
|
rxiv_maker/services/validation_service.py,sha256=eWg14NqJu6LzyJBgeXkTaVZAlX4wYFX8ZEvSR5hMx7U,14619
|
|
142
142
|
rxiv_maker/templates/__init__.py,sha256=UTet1pYPkPdgvrLw-wwaY-PAgdjGJasAi_hdyIh0J8s,562
|
|
143
143
|
rxiv_maker/templates/manager.py,sha256=HlI7Qb52866Okf4k1aRh0fUy9heOSNGjMQJtrCdL3Xk,6131
|
|
144
|
-
rxiv_maker/templates/registry.py,sha256=
|
|
144
|
+
rxiv_maker/templates/registry.py,sha256=XHO3sQBRulsTsYUmP9R3K6RxnYMAehqrY-MoPgGaexE,16206
|
|
145
145
|
rxiv_maker/tex/python_execution_section.tex,sha256=pHz6NGfZN4ViBo6rInUO5FAuk81sV_Ppqszrvl00w_4,2218
|
|
146
146
|
rxiv_maker/utils/__init__.py,sha256=4ya5VR8jqRqUChlnUeMeeetOuWV-gIvjPwcE1u_1OnI,1540
|
|
147
|
+
rxiv_maker/utils/accent_character_map.py,sha256=L8BDiUH3IPCZ_pQzfsnlY0KkJpbyNpf3nQy4DBUdczE,3484
|
|
148
|
+
rxiv_maker/utils/author_affiliation_processor.py,sha256=UdUsMvRMXtjfJ8g2u6fge1MlgZPciWVu_zPnoSjhNDk,5604
|
|
147
149
|
rxiv_maker/utils/author_name_formatter.py,sha256=UjvarbyQm89EUIYqckygx3g37o-EcNyvipBtY8GJDxs,10222
|
|
148
150
|
rxiv_maker/utils/bibliography_checksum.py,sha256=Jh4VILSpGQ5KJ9UBCUb7oFy6lZ9_ncXD87vEXxw5jbY,10270
|
|
149
151
|
rxiv_maker/utils/bibliography_parser.py,sha256=WZIQoEpVwdbLmbkw9FdkVgoLE5GX7itqnzPnEEb_fFU,6846
|
|
150
152
|
rxiv_maker/utils/bst_generator.py,sha256=m69JWMIvf9eRiHcaWB-8D3DQCDO8flVIYbOBMuzV-F0,6097
|
|
151
153
|
rxiv_maker/utils/changelog_parser.py,sha256=WCDp9Iy6H6_3nC6FB7RLt6i00zuCyvU17sCU4e3pqCY,11954
|
|
154
|
+
rxiv_maker/utils/citation_range_formatter.py,sha256=1Zb_csGOEWR8YIHWPqLSCzNEs0ogw49Q86BAm9KglrI,3619
|
|
152
155
|
rxiv_maker/utils/citation_utils.py,sha256=spIgVxPAN6jPvoG-eOE00rVX_buUGKnUjP1Fhz31sl4,5134
|
|
156
|
+
rxiv_maker/utils/comment_filter.py,sha256=LcT5EgdvOwP82Gn-zt0Z3iU8_6XL9Cpt87eBOlMHl7o,1522
|
|
153
157
|
rxiv_maker/utils/dependency_checker.py,sha256=EdyIvk-W_bhC1DJCpFw5ePhjEU74C9j7RYMm06unBMA,14366
|
|
154
|
-
rxiv_maker/utils/docx_helpers.py,sha256=
|
|
158
|
+
rxiv_maker/utils/docx_helpers.py,sha256=xE1PnXjoUaX33GlbW4w7yQq_563MvFkhXBbvrmch-40,12708
|
|
155
159
|
rxiv_maker/utils/doi_resolver.py,sha256=8_oy5cTtklm1GCKXpn509yqYsu4P5gYbMjtfQ8dRgFA,10253
|
|
156
160
|
rxiv_maker/utils/email_encoder.py,sha256=QMD5JbGNu68gD8SBdGHfNY8uCgbMzEcmzE1TCYDMgWY,5139
|
|
157
161
|
rxiv_maker/utils/figure_checksum.py,sha256=PWgh2QAErNnnQCV-t-COACQXKICUaggAAIxhgHLCGNM,10748
|
|
@@ -159,7 +163,9 @@ rxiv_maker/utils/file_helpers.py,sha256=qy3CqX6PkfiFSR2XKwASfx038VOnfnnVQL1wGPy2
|
|
|
159
163
|
rxiv_maker/utils/github.py,sha256=jFzwdI6OPG7Q5w5iOPWUC-OqFqE65UscaGZLs6npGGw,15341
|
|
160
164
|
rxiv_maker/utils/homebrew_checker.py,sha256=tyqnYMxxyONN-1krEJk8sYGOv-FhJZGgkvBny7xSZvI,3978
|
|
161
165
|
rxiv_maker/utils/install_detector.py,sha256=4Ir_ZAM-wrarxBkup7WuUYcT48t0P2c9IeTDWkp_q4w,3869
|
|
166
|
+
rxiv_maker/utils/label_extractor.py,sha256=xQpWieDvBdQNpJBHVXpP-wA083LH68NAF4k1MVDm5T4,6816
|
|
162
167
|
rxiv_maker/utils/operation_ids.py,sha256=dH9m7OGRrk2EG5abnJFF_1KPQ80lLZLcZD_KYaT9GwI,6083
|
|
168
|
+
rxiv_maker/utils/pdf_splitter.py,sha256=aDyMMg7bxQOWpZIvjLX9SR8BsIibL0ngSlrrryXEy5I,3601
|
|
163
169
|
rxiv_maker/utils/pdf_utils.py,sha256=MwT5RSnQ3OJHuFDQ_OP6BOcB-h6HfF618t6-j5icnyk,4253
|
|
164
170
|
rxiv_maker/utils/performance.py,sha256=EBDVNshSaeG7Nu-GCZtRAzTunGn4z_Bb2jEck045bxo,8169
|
|
165
171
|
rxiv_maker/utils/platform.py,sha256=DCD3gvm7_DBcT67gGIXhTDV5mPrBjWrL7R2JdsmIgng,17773
|
|
@@ -187,9 +193,9 @@ rxiv_maker/validators/doi/api_clients.py,sha256=tqdYUq8LFgRIO0tWfcenwmy2uO-IB1-G
|
|
|
187
193
|
rxiv_maker/validators/doi/metadata_comparator.py,sha256=euqHhKP5sHQAdZbdoAahUn6YqJqOfXIOobNgAqFHlN8,11533
|
|
188
194
|
rxiv_maker/tex/template.tex,sha256=zrJ3aFfu8j9zkg1l375eE9w-j42P3rz16wMD3dSgi1I,1354
|
|
189
195
|
rxiv_maker/tex/style/rxiv_maker_style.bst,sha256=jbVqrJgAm6F88cow5vtZuPBwwmlcYykclTm8RvZIo6Y,24281
|
|
190
|
-
rxiv_maker/tex/style/rxiv_maker_style.cls,sha256=
|
|
191
|
-
rxiv_maker-1.
|
|
192
|
-
rxiv_maker-1.
|
|
193
|
-
rxiv_maker-1.
|
|
194
|
-
rxiv_maker-1.
|
|
195
|
-
rxiv_maker-1.
|
|
196
|
+
rxiv_maker/tex/style/rxiv_maker_style.cls,sha256=6VDmZE0uvYWog6rcYi2K_NIM9-Pgjx9AFdRg_sTheK0,24374
|
|
197
|
+
rxiv_maker-1.18.0.dist-info/METADATA,sha256=uJSb2e2tMW1eUvob2E8iKM5kJAVZsILejVNbG_Uitto,18222
|
|
198
|
+
rxiv_maker-1.18.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
199
|
+
rxiv_maker-1.18.0.dist-info/entry_points.txt,sha256=ghCN0hI9A1GlG7QY5F6E-xYPflA8CyS4B6bTQ1YLop0,97
|
|
200
|
+
rxiv_maker-1.18.0.dist-info/licenses/LICENSE,sha256=GSZFoPIhWDNJEtSHTQ5dnELN38zFwRiQO2antBezGQk,1093
|
|
201
|
+
rxiv_maker-1.18.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|