rxiv-maker 1.17.0__py3-none-any.whl → 1.18.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rxiv_maker/__version__.py +1 -1
- rxiv_maker/cli/framework/workflow_commands.py +3 -1
- rxiv_maker/exporters/docx_citation_mapper.py +3 -84
- rxiv_maker/exporters/docx_content_processor.py +5 -23
- rxiv_maker/exporters/docx_exporter.py +14 -28
- rxiv_maker/exporters/docx_writer.py +201 -75
- rxiv_maker/processors/template_processor.py +10 -0
- rxiv_maker/templates/registry.py +52 -12
- rxiv_maker/tex/template.tex +2 -0
- rxiv_maker/utils/accent_character_map.py +150 -0
- rxiv_maker/utils/author_affiliation_processor.py +128 -0
- rxiv_maker/utils/citation_range_formatter.py +118 -0
- rxiv_maker/utils/comment_filter.py +46 -0
- rxiv_maker/utils/docx_helpers.py +4 -117
- rxiv_maker/utils/label_extractor.py +185 -0
- {rxiv_maker-1.17.0.dist-info → rxiv_maker-1.18.1.dist-info}/METADATA +1 -1
- {rxiv_maker-1.17.0.dist-info → rxiv_maker-1.18.1.dist-info}/RECORD +20 -15
- {rxiv_maker-1.17.0.dist-info → rxiv_maker-1.18.1.dist-info}/WHEEL +0 -0
- {rxiv_maker-1.17.0.dist-info → rxiv_maker-1.18.1.dist-info}/entry_points.txt +0 -0
- {rxiv_maker-1.17.0.dist-info → rxiv_maker-1.18.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
r"""LaTeX accent character to Unicode conversion map.
|
|
2
|
+
|
|
3
|
+
This module provides centralized mapping of LaTeX accent commands to their
|
|
4
|
+
Unicode equivalents. Used by both DOCX export and LaTeX processing to ensure
|
|
5
|
+
consistent character handling across formats.
|
|
6
|
+
|
|
7
|
+
Examples:
|
|
8
|
+
>>> clean_latex_accents("\\'e")
|
|
9
|
+
'é'
|
|
10
|
+
>>> clean_latex_accents("Calf{\\'e}")
|
|
11
|
+
'Café'
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from typing import Dict
|
|
15
|
+
|
|
16
|
+
# LaTeX accent commands to Unicode character mapping
|
|
17
|
+
# Handles both with and without backslashes (BibTeX parser may strip them)
|
|
18
|
+
# Also handles variant forms where backslash is replaced with the literal character
|
|
19
|
+
ACCENT_MAP: Dict[str, str] = {
|
|
20
|
+
# Acute accents (é, á, í, ó, ú) - use non-raw strings for single backslash
|
|
21
|
+
"\\'e": "é",
|
|
22
|
+
"{\\'e}": "é",
|
|
23
|
+
"{'e}": "é",
|
|
24
|
+
"{'é}": "é",
|
|
25
|
+
"\\'a": "á",
|
|
26
|
+
"{\\'a}": "á",
|
|
27
|
+
"{'a}": "á",
|
|
28
|
+
"{'á}": "á",
|
|
29
|
+
"\\'i": "í",
|
|
30
|
+
"{\\'i}": "í",
|
|
31
|
+
"{'i}": "í",
|
|
32
|
+
"{'í}": "í",
|
|
33
|
+
"'{\\i}": "í", # Acute on dotless i
|
|
34
|
+
"\\'o": "ó",
|
|
35
|
+
"{\\'o}": "ó",
|
|
36
|
+
"{'o}": "ó",
|
|
37
|
+
"{'ó}": "ó",
|
|
38
|
+
"'{o}": "ó", # Acute o (variant without backslash)
|
|
39
|
+
"\\'u": "ú",
|
|
40
|
+
"{\\'u}": "ú",
|
|
41
|
+
"{'u}": "ú",
|
|
42
|
+
"{'ú}": "ú",
|
|
43
|
+
# Uppercase acute accents
|
|
44
|
+
"\\'E": "É",
|
|
45
|
+
"{\\'E}": "É",
|
|
46
|
+
"{'E}": "É",
|
|
47
|
+
"\\'A": "Á",
|
|
48
|
+
"{\\'A}": "Á",
|
|
49
|
+
"{'A}": "Á",
|
|
50
|
+
"\\'I": "Í",
|
|
51
|
+
"{\\'I}": "Í",
|
|
52
|
+
"{'I}": "Í",
|
|
53
|
+
"'{\\I}": "Í", # Acute on uppercase dotless I
|
|
54
|
+
"\\'O": "Ó",
|
|
55
|
+
"{\\'O}": "Ó",
|
|
56
|
+
"{'O}": "Ó",
|
|
57
|
+
"'{O}": "Ó",
|
|
58
|
+
"\\'U": "Ú",
|
|
59
|
+
"{\\'U}": "Ú",
|
|
60
|
+
"{'U}": "Ú",
|
|
61
|
+
# Umlaut/diaeresis (ë, ä, ï, ö, ü)
|
|
62
|
+
'\\"e': "ë",
|
|
63
|
+
'{\\"e}': "ë",
|
|
64
|
+
'{"e}': "ë",
|
|
65
|
+
'{"ë}': "ë",
|
|
66
|
+
'\\"a': "ä",
|
|
67
|
+
'{\\"a}': "ä",
|
|
68
|
+
'{"a}': "ä",
|
|
69
|
+
'{"ä}': "ä",
|
|
70
|
+
'\\"i': "ï",
|
|
71
|
+
'{\\"i}': "ï",
|
|
72
|
+
'{"i}': "ï",
|
|
73
|
+
'{"ï}': "ï",
|
|
74
|
+
'\\"o': "ö",
|
|
75
|
+
'{\\"o}': "ö",
|
|
76
|
+
'{"o}': "ö",
|
|
77
|
+
'{"ö}': "ö",
|
|
78
|
+
'\\"u': "ü",
|
|
79
|
+
'{\\"u}': "ü",
|
|
80
|
+
'{"u}': "ü",
|
|
81
|
+
'{"ü}': "ü",
|
|
82
|
+
# Grave accents (è, à)
|
|
83
|
+
"\\`e": "è",
|
|
84
|
+
"{\\`e}": "è",
|
|
85
|
+
"{`e}": "è",
|
|
86
|
+
"{`è}": "è",
|
|
87
|
+
"\\`a": "à",
|
|
88
|
+
"{\\`a}": "à",
|
|
89
|
+
"{`a}": "à",
|
|
90
|
+
"{`à}": "à",
|
|
91
|
+
# Circumflex (ê, â)
|
|
92
|
+
"\\^e": "ê",
|
|
93
|
+
"{\\^e}": "ê",
|
|
94
|
+
"{^e}": "ê",
|
|
95
|
+
"{^ê}": "ê",
|
|
96
|
+
"\\^a": "â",
|
|
97
|
+
"{\\^a}": "â",
|
|
98
|
+
"{^a}": "â",
|
|
99
|
+
"{^â}": "â",
|
|
100
|
+
# Tilde (ñ, ã, õ)
|
|
101
|
+
"\\~n": "ñ",
|
|
102
|
+
"{\\~n}": "ñ",
|
|
103
|
+
"{~n}": "ñ",
|
|
104
|
+
"{~ñ}": "ñ",
|
|
105
|
+
"~{n}": "ñ",
|
|
106
|
+
"\\~a": "ã",
|
|
107
|
+
"{\\~a}": "ã",
|
|
108
|
+
"{~a}": "ã",
|
|
109
|
+
"~{a}": "ã", # Tilde on a (variant)
|
|
110
|
+
"{~ã}": "ã",
|
|
111
|
+
"\\~o": "õ",
|
|
112
|
+
"{\\~o}": "õ",
|
|
113
|
+
"{~o}": "õ",
|
|
114
|
+
"~{o}": "õ", # Tilde on o (variant)
|
|
115
|
+
"{~õ}": "õ",
|
|
116
|
+
# Uppercase tilde
|
|
117
|
+
"\\~N": "Ñ",
|
|
118
|
+
"{\\~N}": "Ñ",
|
|
119
|
+
"~{N}": "Ñ",
|
|
120
|
+
"\\~A": "Ã",
|
|
121
|
+
"{\\~A}": "Ã",
|
|
122
|
+
"~{A}": "Ã",
|
|
123
|
+
"\\~O": "Õ",
|
|
124
|
+
"{\\~O}": "Õ",
|
|
125
|
+
"~{O}": "Õ",
|
|
126
|
+
# Cedilla (ç)
|
|
127
|
+
"\\c{c}": "ç",
|
|
128
|
+
"{\\c{c}}": "ç",
|
|
129
|
+
"{\\c{ç}}": "ç",
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def clean_latex_accents(text: str) -> str:
|
|
134
|
+
r"""Convert LaTeX accent commands to Unicode characters.
|
|
135
|
+
|
|
136
|
+
Args:
|
|
137
|
+
text: Text containing LaTeX accent commands
|
|
138
|
+
|
|
139
|
+
Returns:
|
|
140
|
+
Text with accent commands converted to Unicode
|
|
141
|
+
|
|
142
|
+
Examples:
|
|
143
|
+
>>> clean_latex_accents("Calf{\\'e}")
|
|
144
|
+
'Café'
|
|
145
|
+
>>> clean_latex_accents("Se\\~nor")
|
|
146
|
+
'Señor'
|
|
147
|
+
"""
|
|
148
|
+
for latex_cmd, unicode_char in ACCENT_MAP.items():
|
|
149
|
+
text = text.replace(latex_cmd, unicode_char)
|
|
150
|
+
return text
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
"""Author and affiliation processing utilities for manuscript processing.
|
|
2
|
+
|
|
3
|
+
This module provides centralized author and affiliation mapping logic used by both
|
|
4
|
+
DOCX export and LaTeX/PDF generation to ensure consistent handling across formats.
|
|
5
|
+
|
|
6
|
+
The processor handles:
|
|
7
|
+
- Building affiliation shortname → number mappings in order of first appearance
|
|
8
|
+
- Looking up full affiliation details from metadata
|
|
9
|
+
- Categorizing authors (co-first, corresponding)
|
|
10
|
+
- Mapping author affiliations to sequential numbers
|
|
11
|
+
|
|
12
|
+
Examples:
|
|
13
|
+
>>> processor = AuthorAffiliationProcessor()
|
|
14
|
+
>>> metadata = {
|
|
15
|
+
... "authors": [{"name": "Alice", "affiliations": ["MIT"], "co_first_author": True}],
|
|
16
|
+
... "affiliations": [{"shortname": "MIT", "full_name": "MIT", "location": "Cambridge"}]
|
|
17
|
+
... }
|
|
18
|
+
>>> result = processor.process(metadata)
|
|
19
|
+
>>> result["affiliation_map"]
|
|
20
|
+
{'MIT': 1}
|
|
21
|
+
>>> result["cofirst_authors"]
|
|
22
|
+
[{'name': 'Alice', 'affiliations': ['MIT'], 'co_first_author': True}]
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from typing import Any, Dict, List
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class AuthorAffiliationProcessor:
    """Process author and affiliation metadata for manuscript generation.

    Provides format-agnostic author/affiliation bookkeeping shared by the
    DOCX and LaTeX renderers: affiliation numbering in order of first
    appearance, full-detail lookup, and author categorization.
    """

    def process(self, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Organize author and affiliation metadata for renderers.

        Affiliation numbers are assigned sequentially, in the order each
        shortname first appears in the author list. Full affiliation text is
        resolved from the top-level ``affiliations`` entries, falling back to
        the shortname itself when no entry exists.

        Args:
            metadata: YAML metadata containing ``authors`` and ``affiliations``

        Returns:
            Dict with keys:
            - ``affiliation_map``: shortname -> assigned number
            - ``ordered_affiliations``: list of (number, shortname, full_text)
            - ``authors``: the author dicts as given
            - ``cofirst_authors``: authors flagged ``co_first_author``
            - ``corresponding_authors``: authors flagged ``corresponding_author``

        Examples:
            >>> processor = AuthorAffiliationProcessor()
            >>> meta = {
            ...     "authors": [{"name": "Alice", "affiliations": ["MIT"], "co_first_author": True}],
            ...     "affiliations": [{"shortname": "MIT", "full_name": "MIT", "location": "Cambridge"}],
            ... }
            >>> processor.process(meta)["affiliation_map"]
            {'MIT': 1}
        """
        author_list = metadata.get("authors", [])
        affiliation_entries = metadata.get("affiliations", [])

        # Full-detail lookup keyed by shortname (non-dict entries ignored).
        details_by_shortname = {entry.get("shortname"): entry for entry in affiliation_entries if isinstance(entry, dict)}

        shortname_to_number: Dict[str, int] = {}
        numbered_affiliations = []

        for entry in author_list:
            if not isinstance(entry, dict):
                continue
            for shortname in entry.get("affiliations", []):
                if shortname in shortname_to_number:
                    continue
                # First appearance: assign the next sequential number.
                number = len(shortname_to_number) + 1
                shortname_to_number[shortname] = number

                info = details_by_shortname.get(shortname, {})
                name = info.get("full_name", shortname)
                place = info.get("location", "")
                # Render as "Full Name, Location" when a location is known.
                display = f"{name}, {place}" if place else name
                numbered_affiliations.append((number, shortname, display))

        def _flagged(flag: str) -> List[Dict[str, Any]]:
            # Authors carrying the given truthy boolean flag.
            return [a for a in author_list if isinstance(a, dict) and a.get(flag, False)]

        return {
            "affiliation_map": shortname_to_number,
            "ordered_affiliations": numbered_affiliations,
            "authors": author_list,
            "cofirst_authors": _flagged("co_first_author"),
            "corresponding_authors": _flagged("corresponding_author"),
        }

    def get_author_affiliation_numbers(self, author: Dict[str, Any], affiliation_map: Dict[str, int]) -> List[int]:
        """Return the sorted affiliation numbers assigned to one author.

        Shortnames absent from ``affiliation_map`` are silently skipped.

        Args:
            author: Author metadata dict
            affiliation_map: Mapping from shortname to number

        Returns:
            Sorted list of affiliation numbers for this author

        Examples:
            >>> processor = AuthorAffiliationProcessor()
            >>> processor.get_author_affiliation_numbers(
            ...     {"name": "Alice", "affiliations": ["Harvard", "MIT"]},
            ...     {"MIT": 1, "Harvard": 2},
            ... )
            [1, 2]
        """
        return sorted(
            affiliation_map[shortname]
            for shortname in author.get("affiliations", [])
            if shortname in affiliation_map
        )
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Citation range formatting utilities for manuscript processing.
|
|
2
|
+
|
|
3
|
+
This module provides utilities to format consecutive citation numbers as ranges
|
|
4
|
+
(e.g., [1, 2, 3] → [1-3]). Used by DOCX export and potentially LaTeX processing
|
|
5
|
+
to create compact, readable citation references.
|
|
6
|
+
|
|
7
|
+
Examples:
|
|
8
|
+
>>> format_number_list([1, 2, 3, 5, 6, 8])
|
|
9
|
+
'[1-3, 5-6, 8]'
|
|
10
|
+
>>> format_citation_ranges("text [1][2][3] more")
|
|
11
|
+
'text [1-3] more'
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import re
|
|
15
|
+
from typing import List
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def format_number_list(numbers: List[int]) -> str:
    """Format a list of citation numbers as ranges.

    Consecutive numbers are combined into hyphenated ranges; single and
    non-consecutive numbers are comma-separated. Input is deduplicated
    and sorted first.

    Args:
        numbers: List of citation numbers

    Returns:
        Formatted string with ranges, wrapped in square brackets

    Examples:
        >>> format_number_list([1, 2, 3, 5, 6, 8])
        '[1-3, 5-6, 8]'
        >>> format_number_list([15, 16])
        '[15-16]'
        >>> format_number_list([1, 3, 5])
        '[1, 3, 5]'
        >>> format_number_list([])
        '[]'
    """
    if not numbers:
        return "[]"

    unique = sorted(set(numbers))

    parts: List[str] = []
    start = end = unique[0]
    for value in unique[1:]:
        if value == end + 1:
            # Extend the current run.
            end = value
            continue
        # Close the current run ("n" for singles, "a-b" for runs of >= 2).
        parts.append(str(start) if start == end else f"{start}-{end}")
        start = end = value
    # Close the final run.
    parts.append(str(start) if start == end else f"{start}-{end}")

    return f"[{', '.join(parts)}]"


def format_citation_ranges(text: str) -> str:
    """Format consecutive citations as ranges.

    Converts adjacent bracketed citations like ``[1][2][3]`` (optionally
    space-separated) and comma lists like ``[1, 2, 3]`` into compact range
    form via :func:`format_number_list`.

    Args:
        text: Text with numbered citations

    Returns:
        Text with consecutive citations formatted as ranges

    Examples:
        >>> format_citation_ranges("text [1][2][3] more")
        'text [1-3] more'
        >>> format_citation_ranges("text [1, 2, 3] more")
        'text [1-3] more'
        >>> format_citation_ranges("text [1][3][4] more")
        'text [1, 3-4] more'
        >>> format_citation_ranges("text [1] [2] [3] more")
        'text [1-3] more'
    """

    # Pattern 1: runs of two or more adjacent bracketed citations.
    def _merge_adjacent(match: "re.Match[str]") -> str:
        nums = [int(n) for n in re.findall(r"\[(\d+)\]", match.group(0))]
        return format_number_list(nums)

    # Spaces are allowed *between* brackets only; the pattern must not
    # consume trailing whitespace, or "[1] [2] more" would collapse to
    # "[1-2]more" (the original `(?:\[\d+\]\s*){2,}` had that bug).
    text = re.sub(r"\[\d+\](?:\s*\[\d+\])+", _merge_adjacent, text)

    # Pattern 2: comma-separated lists within a single bracket pair.
    def _merge_comma_list(match: "re.Match[str]") -> str:
        nums = [int(n) for n in re.findall(r"\d+", match.group(1))]
        return format_number_list(nums)

    # Require at least two numbers: a lone "[5]" needs no rewriting, and
    # digit-anchored matching avoids int("") crashes on "[ ]" / "[,]".
    text = re.sub(r"\[(\d+(?:\s*,\s*\d+)+)\]", _merge_comma_list, text)

    return text
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Comment filtering utilities for manuscript processing.
|
|
2
|
+
|
|
3
|
+
This module provides utilities to identify and filter metadata comments
|
|
4
|
+
from manuscript content. Used by both DOCX export and potentially LaTeX
|
|
5
|
+
processing to handle comment blocks consistently.
|
|
6
|
+
|
|
7
|
+
Examples:
|
|
8
|
+
>>> is_metadata_comment("Note: this is a metadata comment")
|
|
9
|
+
True
|
|
10
|
+
>>> is_metadata_comment("TODO: fix this")
|
|
11
|
+
False
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def is_metadata_comment(comment_text: str) -> bool:
    """Check if a comment is metadata/informational and should be skipped.

    A comment counts as metadata when it is empty or starts (after
    whitespace, case-insensitively) with "note" or "comment" followed by
    a colon or a space. Such comments are informational and filtered out
    during processing.

    Args:
        comment_text: The comment text to check

    Returns:
        True if comment should be skipped (is metadata), False if it should be included

    Examples:
        >>> is_metadata_comment("Note: remember to update this")
        True
        >>> is_metadata_comment("comment this section is WIP")
        True
        >>> is_metadata_comment("TODO: fix the bug")
        False
        >>> is_metadata_comment("")
        True
    """
    # Empty comments carry no content and are always skipped.
    if not comment_text:
        return True

    # Case-insensitive prefix check; str.startswith accepts a tuple.
    lowered = comment_text.strip().lower()
    return lowered.startswith(("note:", "note ", "comment:", "comment "))
|
rxiv_maker/utils/docx_helpers.py
CHANGED
|
@@ -210,123 +210,10 @@ def clean_latex_commands(text: str) -> str:
|
|
|
210
210
|
text = html.unescape(text)
|
|
211
211
|
|
|
212
212
|
# Convert LaTeX accent commands to Unicode
|
|
213
|
-
#
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
"\\'e": "é",
|
|
218
|
-
"{\\'e}": "é",
|
|
219
|
-
"{'e}": "é",
|
|
220
|
-
"{'é}": "é",
|
|
221
|
-
"\\'a": "á",
|
|
222
|
-
"{\\'a}": "á",
|
|
223
|
-
"{'a}": "á",
|
|
224
|
-
"{'á}": "á",
|
|
225
|
-
"\\'i": "í",
|
|
226
|
-
"{\\'i}": "í",
|
|
227
|
-
"{'i}": "í",
|
|
228
|
-
"{'í}": "í",
|
|
229
|
-
"'{\\i}": "í", # Acute on dotless i
|
|
230
|
-
"\\'o": "ó",
|
|
231
|
-
"{\\'o}": "ó",
|
|
232
|
-
"{'o}": "ó",
|
|
233
|
-
"{'ó}": "ó",
|
|
234
|
-
"'{o}": "ó", # Acute o (variant without backslash)
|
|
235
|
-
"\\'u": "ú",
|
|
236
|
-
"{\\'u}": "ú",
|
|
237
|
-
"{'u}": "ú",
|
|
238
|
-
"{'ú}": "ú",
|
|
239
|
-
# Uppercase acute accents
|
|
240
|
-
"\\'E": "É",
|
|
241
|
-
"{\\'E}": "É",
|
|
242
|
-
"{'E}": "É",
|
|
243
|
-
"\\'A": "Á",
|
|
244
|
-
"{\\'A}": "Á",
|
|
245
|
-
"{'A}": "Á",
|
|
246
|
-
"\\'I": "Í",
|
|
247
|
-
"{\\'I}": "Í",
|
|
248
|
-
"{'I}": "Í",
|
|
249
|
-
"'{\\I}": "Í", # Acute on uppercase dotless I
|
|
250
|
-
"\\'O": "Ó",
|
|
251
|
-
"{\\'O}": "Ó",
|
|
252
|
-
"{'O}": "Ó",
|
|
253
|
-
"'{O}": "Ó",
|
|
254
|
-
"\\'U": "Ú",
|
|
255
|
-
"{\\'U}": "Ú",
|
|
256
|
-
"{'U}": "Ú",
|
|
257
|
-
# Umlaut/diaeresis (ë, ä, ï, ö, ü)
|
|
258
|
-
'\\"e': "ë",
|
|
259
|
-
'{\\"e}': "ë",
|
|
260
|
-
'{"e}': "ë",
|
|
261
|
-
'{"ë}': "ë",
|
|
262
|
-
'\\"a': "ä",
|
|
263
|
-
'{\\"a}': "ä",
|
|
264
|
-
'{"a}': "ä",
|
|
265
|
-
'{"ä}': "ä",
|
|
266
|
-
'\\"i': "ï",
|
|
267
|
-
'{\\"i}': "ï",
|
|
268
|
-
'{"i}': "ï",
|
|
269
|
-
'{"ï}': "ï",
|
|
270
|
-
'\\"o': "ö",
|
|
271
|
-
'{\\"o}': "ö",
|
|
272
|
-
'{"o}': "ö",
|
|
273
|
-
'{"ö}': "ö",
|
|
274
|
-
'\\"u': "ü",
|
|
275
|
-
'{\\"u}': "ü",
|
|
276
|
-
'{"u}': "ü",
|
|
277
|
-
'{"ü}': "ü",
|
|
278
|
-
# Grave accents (è, à)
|
|
279
|
-
"\\`e": "è",
|
|
280
|
-
"{\\`e}": "è",
|
|
281
|
-
"{`e}": "è",
|
|
282
|
-
"{`è}": "è",
|
|
283
|
-
"\\`a": "à",
|
|
284
|
-
"{\\`a}": "à",
|
|
285
|
-
"{`a}": "à",
|
|
286
|
-
"{`à}": "à",
|
|
287
|
-
# Circumflex (ê, â)
|
|
288
|
-
"\\^e": "ê",
|
|
289
|
-
"{\\^e}": "ê",
|
|
290
|
-
"{^e}": "ê",
|
|
291
|
-
"{^ê}": "ê",
|
|
292
|
-
"\\^a": "â",
|
|
293
|
-
"{\\^a}": "â",
|
|
294
|
-
"{^a}": "â",
|
|
295
|
-
"{^â}": "â",
|
|
296
|
-
# Tilde (ñ, ã, õ)
|
|
297
|
-
"\\~n": "ñ",
|
|
298
|
-
"{\\~n}": "ñ",
|
|
299
|
-
"{~n}": "ñ",
|
|
300
|
-
"{~ñ}": "ñ",
|
|
301
|
-
"~{n}": "ñ",
|
|
302
|
-
"\\~a": "ã",
|
|
303
|
-
"{\\~a}": "ã",
|
|
304
|
-
"{~a}": "ã",
|
|
305
|
-
"~{a}": "ã", # Tilde on a (variant)
|
|
306
|
-
"{~ã}": "ã",
|
|
307
|
-
"\\~o": "õ",
|
|
308
|
-
"{\\~o}": "õ",
|
|
309
|
-
"{~o}": "õ",
|
|
310
|
-
"~{o}": "õ", # Tilde on o (variant)
|
|
311
|
-
"{~õ}": "õ",
|
|
312
|
-
# Uppercase tilde
|
|
313
|
-
"\\~N": "Ñ",
|
|
314
|
-
"{\\~N}": "Ñ",
|
|
315
|
-
"~{N}": "Ñ",
|
|
316
|
-
"\\~A": "Ã",
|
|
317
|
-
"{\\~A}": "Ã",
|
|
318
|
-
"~{A}": "Ã",
|
|
319
|
-
"\\~O": "Õ",
|
|
320
|
-
"{\\~O}": "Õ",
|
|
321
|
-
"~{O}": "Õ",
|
|
322
|
-
# Cedilla (ç)
|
|
323
|
-
"\\c{c}": "ç",
|
|
324
|
-
"{\\c{c}}": "ç",
|
|
325
|
-
"{\\c{ç}}": "ç",
|
|
326
|
-
}
|
|
327
|
-
|
|
328
|
-
for latex_cmd, unicode_char in accent_map.items():
|
|
329
|
-
text = text.replace(latex_cmd, unicode_char)
|
|
213
|
+
# Uses centralized accent map from accent_character_map module
|
|
214
|
+
from .accent_character_map import clean_latex_accents
|
|
215
|
+
|
|
216
|
+
text = clean_latex_accents(text)
|
|
330
217
|
|
|
331
218
|
# Remove common formatting commands but keep their content
|
|
332
219
|
text = re.sub(r"\\textbf\{([^}]+)\}", r"\1", text)
|