rxiv-maker 1.17.0__py3-none-any.whl → 1.18.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
1
+ r"""LaTeX accent character to Unicode conversion map.
2
+
3
+ This module provides centralized mapping of LaTeX accent commands to their
4
+ Unicode equivalents. Used by both DOCX export and LaTeX processing to ensure
5
+ consistent character handling across formats.
6
+
7
+ Examples:
8
+ >>> clean_latex_accents("\\'e")
9
+ 'é'
10
+ >>> clean_latex_accents("Caf{\\'e}")
11
+ 'Café'
12
+ """
13
+
14
+ from typing import Dict
15
+
16
+ # LaTeX accent commands to Unicode character mapping
17
+ # Handles both with and without backslashes (BibTeX parser may strip them)
18
+ # Also handles variant forms where backslash is replaced with the literal character
19
+ ACCENT_MAP: Dict[str, str] = {
20
+ # Acute accents (é, á, í, ó, ú) - use non-raw strings for single backslash
21
+ "\\'e": "é",
22
+ "{\\'e}": "é",
23
+ "{'e}": "é",
24
+ "{'é}": "é",
25
+ "\\'a": "á",
26
+ "{\\'a}": "á",
27
+ "{'a}": "á",
28
+ "{'á}": "á",
29
+ "\\'i": "í",
30
+ "{\\'i}": "í",
31
+ "{'i}": "í",
32
+ "{'í}": "í",
33
+ "'{\\i}": "í", # Acute on dotless i
34
+ "\\'o": "ó",
35
+ "{\\'o}": "ó",
36
+ "{'o}": "ó",
37
+ "{'ó}": "ó",
38
+ "'{o}": "ó", # Acute o (variant without backslash)
39
+ "\\'u": "ú",
40
+ "{\\'u}": "ú",
41
+ "{'u}": "ú",
42
+ "{'ú}": "ú",
43
+ # Uppercase acute accents
44
+ "\\'E": "É",
45
+ "{\\'E}": "É",
46
+ "{'E}": "É",
47
+ "\\'A": "Á",
48
+ "{\\'A}": "Á",
49
+ "{'A}": "Á",
50
+ "\\'I": "Í",
51
+ "{\\'I}": "Í",
52
+ "{'I}": "Í",
53
+ "'{\\I}": "Í", # Acute on uppercase dotless I
54
+ "\\'O": "Ó",
55
+ "{\\'O}": "Ó",
56
+ "{'O}": "Ó",
57
+ "'{O}": "Ó",
58
+ "\\'U": "Ú",
59
+ "{\\'U}": "Ú",
60
+ "{'U}": "Ú",
61
+ # Umlaut/diaeresis (ë, ä, ï, ö, ü)
62
+ '\\"e': "ë",
63
+ '{\\"e}': "ë",
64
+ '{"e}': "ë",
65
+ '{"ë}': "ë",
66
+ '\\"a': "ä",
67
+ '{\\"a}': "ä",
68
+ '{"a}': "ä",
69
+ '{"ä}': "ä",
70
+ '\\"i': "ï",
71
+ '{\\"i}': "ï",
72
+ '{"i}': "ï",
73
+ '{"ï}': "ï",
74
+ '\\"o': "ö",
75
+ '{\\"o}': "ö",
76
+ '{"o}': "ö",
77
+ '{"ö}': "ö",
78
+ '\\"u': "ü",
79
+ '{\\"u}': "ü",
80
+ '{"u}': "ü",
81
+ '{"ü}': "ü",
82
+ # Grave accents (è, à)
83
+ "\\`e": "è",
84
+ "{\\`e}": "è",
85
+ "{`e}": "è",
86
+ "{`è}": "è",
87
+ "\\`a": "à",
88
+ "{\\`a}": "à",
89
+ "{`a}": "à",
90
+ "{`à}": "à",
91
+ # Circumflex (ê, â)
92
+ "\\^e": "ê",
93
+ "{\\^e}": "ê",
94
+ "{^e}": "ê",
95
+ "{^ê}": "ê",
96
+ "\\^a": "â",
97
+ "{\\^a}": "â",
98
+ "{^a}": "â",
99
+ "{^â}": "â",
100
+ # Tilde (ñ, ã, õ)
101
+ "\\~n": "ñ",
102
+ "{\\~n}": "ñ",
103
+ "{~n}": "ñ",
104
+ "{~ñ}": "ñ",
105
+ "~{n}": "ñ",
106
+ "\\~a": "ã",
107
+ "{\\~a}": "ã",
108
+ "{~a}": "ã",
109
+ "~{a}": "ã", # Tilde on a (variant)
110
+ "{~ã}": "ã",
111
+ "\\~o": "õ",
112
+ "{\\~o}": "õ",
113
+ "{~o}": "õ",
114
+ "~{o}": "õ", # Tilde on o (variant)
115
+ "{~õ}": "õ",
116
+ # Uppercase tilde
117
+ "\\~N": "Ñ",
118
+ "{\\~N}": "Ñ",
119
+ "~{N}": "Ñ",
120
+ "\\~A": "Ã",
121
+ "{\\~A}": "Ã",
122
+ "~{A}": "Ã",
123
+ "\\~O": "Õ",
124
+ "{\\~O}": "Õ",
125
+ "~{O}": "Õ",
126
+ # Cedilla (ç)
127
+ "\\c{c}": "ç",
128
+ "{\\c{c}}": "ç",
129
+ "{\\c{ç}}": "ç",
130
+ }
131
+
132
+
133
+ def clean_latex_accents(text: str) -> str:
134
+ r"""Convert LaTeX accent commands to Unicode characters.
135
+
136
+ Args:
137
+ text: Text containing LaTeX accent commands
138
+
139
+ Returns:
140
+ Text with accent commands converted to Unicode
141
+
142
+ Examples:
143
+ >>> clean_latex_accents("Caf{\\'e}")
144
+ 'Café'
145
+ >>> clean_latex_accents("Se\\~nor")
146
+ 'Señor'
147
+ """
148
+ for latex_cmd, unicode_char in ACCENT_MAP.items():
149
+ text = text.replace(latex_cmd, unicode_char)
150
+ return text
@@ -0,0 +1,128 @@
1
+ """Author and affiliation processing utilities for manuscript processing.
2
+
3
+ This module provides centralized author and affiliation mapping logic used by both
4
+ DOCX export and LaTeX/PDF generation to ensure consistent handling across formats.
5
+
6
+ The processor handles:
7
+ - Building affiliation shortname → number mappings in order of first appearance
8
+ - Looking up full affiliation details from metadata
9
+ - Categorizing authors (co-first, corresponding)
10
+ - Mapping author affiliations to sequential numbers
11
+
12
+ Examples:
13
+ >>> processor = AuthorAffiliationProcessor()
14
+ >>> metadata = {
15
+ ... "authors": [{"name": "Alice", "affiliations": ["MIT"], "co_first_author": True}],
16
+ ... "affiliations": [{"shortname": "MIT", "full_name": "MIT", "location": "Cambridge"}]
17
+ ... }
18
+ >>> result = processor.process(metadata)
19
+ >>> result["affiliation_map"]
20
+ {'MIT': 1}
21
+ >>> result["cofirst_authors"]
22
+ [{'name': 'Alice', 'affiliations': ['MIT'], 'co_first_author': True}]
23
+ """
24
+
25
+ from typing import Any, Dict, List
26
+
27
+
28
+ class AuthorAffiliationProcessor:
29
+ """Process author and affiliation metadata for manuscript generation."""
30
+
31
+ def process(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
32
+ """Process author and affiliation metadata.
33
+
34
+ Extracts and organizes author/affiliation data for use by format-specific
35
+ renderers (DOCX, LaTeX, etc.).
36
+
37
+ Args:
38
+ metadata: YAML metadata containing authors and affiliations
39
+
40
+ Returns:
41
+ Dict containing:
42
+ - affiliation_map: Dict[str, int] mapping shortnames to numbers
43
+ - ordered_affiliations: List[Tuple[int, str, str]] of (number, shortname, full_text)
44
+ - authors: List[Dict] of author metadata
45
+ - cofirst_authors: List[Dict] of co-first authors
46
+ - corresponding_authors: List[Dict] of corresponding authors
47
+
48
+ Examples:
49
+ >>> processor = AuthorAffiliationProcessor()
50
+ >>> metadata = {
51
+ ... "authors": [
52
+ ... {"name": "Alice", "affiliations": ["MIT"], "co_first_author": True},
53
+ ... {"name": "Bob", "affiliations": ["MIT", "Harvard"], "corresponding_author": True}
54
+ ... ],
55
+ ... "affiliations": [
56
+ ... {"shortname": "MIT", "full_name": "Massachusetts Institute of Technology", "location": "Cambridge, MA"},
57
+ ... {"shortname": "Harvard", "full_name": "Harvard University", "location": "Cambridge, MA"}
58
+ ... ]
59
+ ... }
60
+ >>> result = processor.process(metadata)
61
+ >>> result["affiliation_map"]
62
+ {'MIT': 1, 'Harvard': 2}
63
+ >>> len(result["cofirst_authors"])
64
+ 1
65
+ >>> len(result["corresponding_authors"])
66
+ 1
67
+ """
68
+ authors = metadata.get("authors", [])
69
+ affiliations = metadata.get("affiliations", [])
70
+
71
+ # Build affiliation details lookup
72
+ affiliation_details = {a.get("shortname"): a for a in affiliations if isinstance(a, dict)}
73
+
74
+ # Build affiliation map in order of first appearance
75
+ affiliation_map = {}
76
+ ordered_affiliations = []
77
+
78
+ for author in authors:
79
+ if isinstance(author, dict):
80
+ author_affiliations = author.get("affiliations", [])
81
+ for affil_shortname in author_affiliations:
82
+ if affil_shortname not in affiliation_map:
83
+ # Assign next number
84
+ affil_num = len(affiliation_map) + 1
85
+ affiliation_map[affil_shortname] = affil_num
86
+
87
+ # Look up full details
88
+ affil_info = affiliation_details.get(affil_shortname, {})
89
+ full_name = affil_info.get("full_name", affil_shortname)
90
+ location = affil_info.get("location", "")
91
+
92
+ # Format: "Full Name, Location" or just "Full Name"
93
+ full_text = f"{full_name}, {location}" if location else full_name
94
+
95
+ ordered_affiliations.append((affil_num, affil_shortname, full_text))
96
+
97
+ # Categorize authors
98
+ cofirst_authors = [a for a in authors if isinstance(a, dict) and a.get("co_first_author", False)]
99
+ corresponding_authors = [a for a in authors if isinstance(a, dict) and a.get("corresponding_author", False)]
100
+
101
+ return {
102
+ "affiliation_map": affiliation_map,
103
+ "ordered_affiliations": ordered_affiliations,
104
+ "authors": authors,
105
+ "cofirst_authors": cofirst_authors,
106
+ "corresponding_authors": corresponding_authors,
107
+ }
108
+
109
+ def get_author_affiliation_numbers(self, author: Dict[str, Any], affiliation_map: Dict[str, int]) -> List[int]:
110
+ """Get sorted affiliation numbers for an author.
111
+
112
+ Args:
113
+ author: Author metadata dict
114
+ affiliation_map: Mapping from shortname to number
115
+
116
+ Returns:
117
+ Sorted list of affiliation numbers for this author
118
+
119
+ Examples:
120
+ >>> processor = AuthorAffiliationProcessor()
121
+ >>> author = {"name": "Alice", "affiliations": ["Harvard", "MIT"]}
122
+ >>> affil_map = {"MIT": 1, "Harvard": 2}
123
+ >>> processor.get_author_affiliation_numbers(author, affil_map)
124
+ [1, 2]
125
+ """
126
+ author_affiliations = author.get("affiliations", [])
127
+ affil_numbers = [affiliation_map[a] for a in author_affiliations if a in affiliation_map]
128
+ return sorted(affil_numbers)
@@ -0,0 +1,118 @@
1
+ """Citation range formatting utilities for manuscript processing.
2
+
3
+ This module provides utilities to format consecutive citation numbers as ranges
4
+ (e.g., [1, 2, 3] → [1-3]). Used by DOCX export and potentially LaTeX processing
5
+ to create compact, readable citation references.
6
+
7
+ Examples:
8
+ >>> format_number_list([1, 2, 3, 5, 6, 8])
9
+ '[1-3, 5-6, 8]'
10
+ >>> format_citation_ranges("text [1][2][3] more")
11
+ 'text [1-3] more'
12
+ """
13
+
14
+ import re
15
+ from typing import List
16
+
17
+
18
+ def format_number_list(numbers: List[int]) -> str:
19
+ """Format a list of citation numbers as ranges.
20
+
21
+ Consecutive numbers are combined into ranges with hyphens.
22
+ Single numbers and non-consecutive numbers are separated by commas.
23
+
24
+ Args:
25
+ numbers: List of citation numbers
26
+
27
+ Returns:
28
+ Formatted string with ranges
29
+
30
+ Examples:
31
+ >>> format_number_list([1, 2, 3, 5, 6, 8])
32
+ '[1-3, 5-6, 8]'
33
+ >>> format_number_list([15, 16])
34
+ '[15-16]'
35
+ >>> format_number_list([1, 3, 5])
36
+ '[1, 3, 5]'
37
+ >>> format_number_list([])
38
+ '[]'
39
+ """
40
+ if not numbers:
41
+ return "[]"
42
+
43
+ # Sort and deduplicate numbers
44
+ sorted_nums = sorted(set(numbers))
45
+
46
+ # Build ranges
47
+ ranges = []
48
+ start = sorted_nums[0]
49
+ end = sorted_nums[0]
50
+
51
+ for num in sorted_nums[1:]:
52
+ if num == end + 1:
53
+ # Continue current range
54
+ end = num
55
+ else:
56
+ # End current range and start new one
57
+ if start == end:
58
+ # Single number
59
+ ranges.append(str(start))
60
+ else:
61
+ # Range (including 2 consecutive numbers like 15-16)
62
+ ranges.append(f"{start}-{end}")
63
+ start = num
64
+ end = num
65
+
66
+ # Add final range
67
+ if start == end:
68
+ # Single number
69
+ ranges.append(str(start))
70
+ else:
71
+ # Range (including 2 consecutive numbers like 15-16)
72
+ ranges.append(f"{start}-{end}")
73
+
74
+ return f"[{', '.join(ranges)}]"
75
+
76
+
77
+ def format_citation_ranges(text: str) -> str:
78
+ """Format consecutive citations as ranges.
79
+
80
+ Converts patterns like [1][2][3] to [1-3], [15][16] to [15-16], etc.
81
+ Also formats comma-separated lists like [1, 2, 3] to [1-3].
82
+
83
+ Args:
84
+ text: Text with numbered citations
85
+
86
+ Returns:
87
+ Text with consecutive citations formatted as ranges
88
+
89
+ Examples:
90
+ >>> format_citation_ranges("text [1][2][3] more")
91
+ 'text [1-3] more'
92
+ >>> format_citation_ranges("text [1, 2, 3] more")
93
+ 'text [1-3] more'
94
+ >>> format_citation_ranges("text [1][3][4] more")
95
+ 'text [1, 3-4] more'
96
+ >>> format_citation_ranges("text [1] [2] [3] more")
97
+ 'text [1-3] more'
98
+ """
99
+
100
+ # Pattern 1: Handle adjacent bracketed citations [1][2][3] or [1] [2] [3]
101
+ def combine_adjacent(match_obj):
102
+ # Extract all numbers from consecutive brackets (allowing spaces between)
103
+ numbers = [int(n) for n in re.findall(r"\[(\d+)\]", match_obj.group(0))]
104
+ return format_number_list(numbers)
105
+
106
+ # Find sequences of adjacent bracketed numbers (with optional spaces between)
107
+ text = re.sub(r"(?:\[\d+\]\s*){2,}", combine_adjacent, text)
108
+
109
+ # Pattern 2: Handle comma-separated citations within single brackets [1, 2, 3]
110
+ def combine_comma_separated(match_obj):
111
+ # Extract all numbers from comma-separated list
112
+ numbers_str = match_obj.group(1)
113
+ numbers = [int(n.strip()) for n in numbers_str.split(",")]
114
+ return format_number_list(numbers)
115
+
116
+ text = re.sub(r"\[([\d,\s]+)\]", combine_comma_separated, text)
117
+
118
+ return text
@@ -0,0 +1,46 @@
1
+ """Comment filtering utilities for manuscript processing.
2
+
3
+ This module provides utilities to identify and filter metadata comments
4
+ from manuscript content. Used by both DOCX export and potentially LaTeX
5
+ processing to handle comment blocks consistently.
6
+
7
+ Examples:
8
+ >>> is_metadata_comment("Note: this is a metadata comment")
9
+ True
10
+ >>> is_metadata_comment("TODO: fix this")
11
+ False
12
+ """
13
+
14
+
15
+ def is_metadata_comment(comment_text: str) -> bool:
16
+ """Check if a comment is metadata/informational and should be skipped.
17
+
18
+ Metadata comments are those that start with common prefixes like
19
+ "Note:", "Comment:", etc. These are typically informational and
20
+ should be filtered out during processing.
21
+
22
+ Args:
23
+ comment_text: The comment text to check
24
+
25
+ Returns:
26
+ True if comment should be skipped (is metadata), False if it should be included
27
+
28
+ Examples:
29
+ >>> is_metadata_comment("Note: remember to update this")
30
+ True
31
+ >>> is_metadata_comment("comment this section is WIP")
32
+ True
33
+ >>> is_metadata_comment("TODO: fix the bug")
34
+ False
35
+ >>> is_metadata_comment("")
36
+ True
37
+ """
38
+ if not comment_text:
39
+ return True
40
+
41
+ # Normalize to lowercase for case-insensitive matching
42
+ normalized = comment_text.lower().strip()
43
+
44
+ # Skip comments that start with common metadata keywords
45
+ metadata_prefixes = ["note:", "note ", "comment:", "comment "]
46
+ return any(normalized.startswith(prefix) for prefix in metadata_prefixes)
@@ -210,123 +210,10 @@ def clean_latex_commands(text: str) -> str:
210
210
  text = html.unescape(text)
211
211
 
212
212
  # Convert LaTeX accent commands to Unicode
213
- # Handle both with and without backslashes (BibTeX parser may strip them)
214
- # Also handle variant forms where backslash is replaced with the literal character
215
- accent_map = {
216
- # Acute accents (é, á, í, ó, ú) - use non-raw strings for single backslash
217
- "\\'e": "é",
218
- "{\\'e}": "é",
219
- "{'e}": "é",
220
- "{'é}": "é",
221
- "\\'a": "á",
222
- "{\\'a}": "á",
223
- "{'a}": "á",
224
- "{'á}": "á",
225
- "\\'i": "í",
226
- "{\\'i}": "í",
227
- "{'i}": "í",
228
- "{'í}": "í",
229
- "'{\\i}": "í", # Acute on dotless i
230
- "\\'o": "ó",
231
- "{\\'o}": "ó",
232
- "{'o}": "ó",
233
- "{'ó}": "ó",
234
- "'{o}": "ó", # Acute o (variant without backslash)
235
- "\\'u": "ú",
236
- "{\\'u}": "ú",
237
- "{'u}": "ú",
238
- "{'ú}": "ú",
239
- # Uppercase acute accents
240
- "\\'E": "É",
241
- "{\\'E}": "É",
242
- "{'E}": "É",
243
- "\\'A": "Á",
244
- "{\\'A}": "Á",
245
- "{'A}": "Á",
246
- "\\'I": "Í",
247
- "{\\'I}": "Í",
248
- "{'I}": "Í",
249
- "'{\\I}": "Í", # Acute on uppercase dotless I
250
- "\\'O": "Ó",
251
- "{\\'O}": "Ó",
252
- "{'O}": "Ó",
253
- "'{O}": "Ó",
254
- "\\'U": "Ú",
255
- "{\\'U}": "Ú",
256
- "{'U}": "Ú",
257
- # Umlaut/diaeresis (ë, ä, ï, ö, ü)
258
- '\\"e': "ë",
259
- '{\\"e}': "ë",
260
- '{"e}': "ë",
261
- '{"ë}': "ë",
262
- '\\"a': "ä",
263
- '{\\"a}': "ä",
264
- '{"a}': "ä",
265
- '{"ä}': "ä",
266
- '\\"i': "ï",
267
- '{\\"i}': "ï",
268
- '{"i}': "ï",
269
- '{"ï}': "ï",
270
- '\\"o': "ö",
271
- '{\\"o}': "ö",
272
- '{"o}': "ö",
273
- '{"ö}': "ö",
274
- '\\"u': "ü",
275
- '{\\"u}': "ü",
276
- '{"u}': "ü",
277
- '{"ü}': "ü",
278
- # Grave accents (è, à)
279
- "\\`e": "è",
280
- "{\\`e}": "è",
281
- "{`e}": "è",
282
- "{`è}": "è",
283
- "\\`a": "à",
284
- "{\\`a}": "à",
285
- "{`a}": "à",
286
- "{`à}": "à",
287
- # Circumflex (ê, â)
288
- "\\^e": "ê",
289
- "{\\^e}": "ê",
290
- "{^e}": "ê",
291
- "{^ê}": "ê",
292
- "\\^a": "â",
293
- "{\\^a}": "â",
294
- "{^a}": "â",
295
- "{^â}": "â",
296
- # Tilde (ñ, ã, õ)
297
- "\\~n": "ñ",
298
- "{\\~n}": "ñ",
299
- "{~n}": "ñ",
300
- "{~ñ}": "ñ",
301
- "~{n}": "ñ",
302
- "\\~a": "ã",
303
- "{\\~a}": "ã",
304
- "{~a}": "ã",
305
- "~{a}": "ã", # Tilde on a (variant)
306
- "{~ã}": "ã",
307
- "\\~o": "õ",
308
- "{\\~o}": "õ",
309
- "{~o}": "õ",
310
- "~{o}": "õ", # Tilde on o (variant)
311
- "{~õ}": "õ",
312
- # Uppercase tilde
313
- "\\~N": "Ñ",
314
- "{\\~N}": "Ñ",
315
- "~{N}": "Ñ",
316
- "\\~A": "Ã",
317
- "{\\~A}": "Ã",
318
- "~{A}": "Ã",
319
- "\\~O": "Õ",
320
- "{\\~O}": "Õ",
321
- "~{O}": "Õ",
322
- # Cedilla (ç)
323
- "\\c{c}": "ç",
324
- "{\\c{c}}": "ç",
325
- "{\\c{ç}}": "ç",
326
- }
327
-
328
- for latex_cmd, unicode_char in accent_map.items():
329
- text = text.replace(latex_cmd, unicode_char)
213
+ # Uses centralized accent map from accent_character_map module
214
+ from .accent_character_map import clean_latex_accents
215
+
216
+ text = clean_latex_accents(text)
330
217
 
331
218
  # Remove common formatting commands but keep their content
332
219
  text = re.sub(r"\\textbf\{([^}]+)\}", r"\1", text)