open-document-lib 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- odf_lib/__init__.py +108 -0
- odf_lib/citation_mapping.py +240 -0
- odf_lib/odf_common.py +1625 -0
- odf_lib/py.typed +0 -0
- open_document_lib-1.0.0.dist-info/METADATA +194 -0
- open_document_lib-1.0.0.dist-info/RECORD +9 -0
- open_document_lib-1.0.0.dist-info/WHEEL +5 -0
- open_document_lib-1.0.0.dist-info/licenses/LICENSE +21 -0
- open_document_lib-1.0.0.dist-info/top_level.txt +1 -0
odf_lib/odf_common.py
ADDED
|
@@ -0,0 +1,1625 @@
|
|
|
1
|
+
"""Shared helpers for OpenDocument Format scripts.
|
|
2
|
+
|
|
3
|
+
All four ODF skills (ODT, ODP, ODS, ODG) use these functions.
|
|
4
|
+
Format-specific *_common.py modules import from here and add their
|
|
5
|
+
own NS dict, MIMETYPE constant, and format-specific helpers.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import base64
|
|
11
|
+
import mimetypes
|
|
12
|
+
import posixpath
|
|
13
|
+
import re
|
|
14
|
+
import shutil
|
|
15
|
+
import tempfile
|
|
16
|
+
import zipfile
|
|
17
|
+
from collections.abc import Callable, Mapping, Set
|
|
18
|
+
from datetime import datetime, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from xml.etree import ElementTree as ET
|
|
21
|
+
|
|
22
|
+
# Library version; update_meta_for_edit() embeds it in the generator string.
VERSION = "1.0.0"  # keep in sync with pyproject.toml (see CONTRIBUTING.md)

# Namespace prefix -> namespace URI for the XML vocabularies used by the
# helpers in this module (ODF core namespaces plus Dublin Core, XLink,
# MathML and the LibreOffice "loext" extension namespace).
ODF_NAMESPACES: dict[str, str] = {
    "office": "urn:oasis:names:tc:opendocument:xmlns:office:1.0",
    "text": "urn:oasis:names:tc:opendocument:xmlns:text:1.0",
    "draw": "urn:oasis:names:tc:opendocument:xmlns:drawing:1.0",
    "style": "urn:oasis:names:tc:opendocument:xmlns:style:1.0",
    "fo": "urn:oasis:names:tc:opendocument:xmlns:xsl-fo-compatible:1.0",
    "svg": "urn:oasis:names:tc:opendocument:xmlns:svg-compatible:1.0",
    "table": "urn:oasis:names:tc:opendocument:xmlns:table:1.0",
    "meta": "urn:oasis:names:tc:opendocument:xmlns:meta:1.0",
    "dc": "http://purl.org/dc/elements/1.1/",
    "manifest": "urn:oasis:names:tc:opendocument:xmlns:manifest:1.0",
    "xlink": "http://www.w3.org/1999/xlink",
    "presentation": "urn:oasis:names:tc:opendocument:xmlns:presentation:1.0",
    "config": "urn:oasis:names:tc:opendocument:xmlns:config:1.0",
    "smil": "urn:oasis:names:tc:opendocument:xmlns:smil-compatible:1.0",
    "anim": "urn:oasis:names:tc:opendocument:xmlns:animation:1.0",
    "chart": "urn:oasis:names:tc:opendocument:xmlns:chart:1.0",
    "form": "urn:oasis:names:tc:opendocument:xmlns:form:1.0",
    "script": "urn:oasis:names:tc:opendocument:xmlns:script:1.0",
    "math": "http://www.w3.org/1998/Math/MathML",
    "number": "urn:oasis:names:tc:opendocument:xmlns:datastyle:1.0",
    "of": "urn:oasis:names:tc:opendocument:xmlns:of:1.2",
    "loext": "urn:org:documentfoundation:names:experimental:office:xmlns:loext:1.0",
}

# ODF package MIME type -> file extension of the matching flat-XML
# (single-file) variant.
FLAT_EXTENSIONS: dict[str, str] = {
    "application/vnd.oasis.opendocument.text": ".fodt",
    "application/vnd.oasis.opendocument.presentation": ".fodp",
    "application/vnd.oasis.opendocument.spreadsheet": ".fods",
    "application/vnd.oasis.opendocument.graphics": ".fodg",
}
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def parse_xml_from_zip(path: Path, member: str) -> ET.Element:
    """Read one XML member out of a ZIP-based ODF package and parse it.

    Args:
        path: Location of the ODF ZIP file on disk.
        member: Name of the member inside the archive (e.g. ``"content.xml"``).

    Returns:
        The root element of the parsed member.
    """
    with zipfile.ZipFile(path) as package, package.open(member) as stream:
        tree = ET.parse(stream)
    return tree.getroot()
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def xml_bytes(root: ET.Element) -> bytes:
    """Render *root* as a UTF-8 encoded XML document.

    Args:
        root: Element (with its subtree) to serialize.

    Returns:
        The serialized bytes, starting with the ``<?xml ...?>`` declaration.
    """
    serialized: bytes = ET.tostring(root, xml_declaration=True, encoding="utf-8")
    return serialized
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def write_odf_with_replacements(
    input_path: Path,
    output_path: Path,
    replacements: Mapping[str, bytes],
    mimetype_value: str,
) -> None:
    """Copy an ODF ZIP, replacing specified members with new content.

    When the source archive contains a ``mimetype`` entry it is written first
    and uncompressed; every other member is copied (or replaced) in source
    order using deflate. Keys of *replacements* that do not name an existing
    member are ignored. It is safe for *output_path* to be the same file as
    *input_path*: the result is then staged in a temporary file next to it
    and moved into place only after it has been written completely.

    Args:
        input_path: Source ODF file.
        output_path: Destination ODF file (overwritten).
        replacements: Mapping of member names to replacement bytes.
        mimetype_value: The mimetype string to write if not in *replacements*.
    """
    destination: Path = Path(output_path)
    with zipfile.ZipFile(input_path) as src:
        names: list[str] = src.namelist()
        # Opening the destination in "w" mode truncates it immediately, which
        # would destroy the archive that is still being read when both paths
        # refer to the same file.
        in_place: bool = destination.exists() and destination.samefile(input_path)
        target: Path = destination
        if in_place:
            with tempfile.NamedTemporaryFile(
                dir=destination.parent, suffix=destination.suffix, delete=False
            ) as staging:
                target = Path(staging.name)
        written: bool = False
        try:
            with zipfile.ZipFile(target, "w") as dst:
                if "mimetype" in names:
                    dst.writestr(
                        "mimetype",
                        replacements.get("mimetype", mimetype_value.encode()),
                        compress_type=zipfile.ZIP_STORED,
                    )
                for name in names:
                    if name == "mimetype":
                        continue
                    # Only decompress members that are actually copied through.
                    payload: bytes = replacements[name] if name in replacements else src.read(name)
                    dst.writestr(name, payload, compress_type=zipfile.ZIP_DEFLATED)
            written = True
        finally:
            if in_place and not written:
                target.unlink(missing_ok=True)
    if in_place:
        # Keep the permission bits of the file that is being replaced.
        shutil.copymode(destination, target)
        target.replace(destination)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def pack_dir_as_odf(source_dir: Path, output_path: Path, mimetype_value: str) -> None:
    """Repack an extracted ODF directory into an ODF file.

    The ``mimetype`` file must exist in *source_dir*; it is written first and
    uncompressed, as required by the ODF specification. All other regular
    entries found under *source_dir* are added in sorted order using deflate.
    The output file itself is never added, so it may be created inside
    *source_dir*.

    Args:
        source_dir: Directory containing extracted ODF contents.
        output_path: Destination ODF file (overwritten).
        mimetype_value: Currently not read by this function; the bytes of the
            ``mimetype`` member come from the on-disk ``mimetype`` file.

    Raises:
        SystemExit: If *source_dir* has no ``mimetype`` file.
    """
    mimetype: Path = source_dir / "mimetype"
    if not mimetype.exists():
        raise SystemExit(f"Missing mimetype file in {source_dir}")
    # The archive exists on disk as soon as it is opened for writing, so a
    # destination inside source_dir would otherwise be packed into itself.
    output_resolved: Path = Path(output_path).resolve()
    with zipfile.ZipFile(output_path, "w") as archive:
        archive.write(mimetype, "mimetype", compress_type=zipfile.ZIP_STORED)
        for path in sorted(source_dir.rglob("*")):
            if path.is_dir() or path == mimetype:
                continue
            if path.resolve() == output_resolved:
                continue
            archive.write(
                path,
                path.relative_to(source_dir).as_posix(),
                compress_type=zipfile.ZIP_DEFLATED,
            )
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def ensure_manifest_entry(
    manifest_root: ET.Element,
    full_path: str,
    media_type: str,
    ns: Mapping[str, str],
    q_fn: Callable[[str, str], str],
) -> None:
    """Register *full_path* in a manifest, updating an existing entry if present.

    The first descendant ``file-entry`` whose ``manifest:full-path`` equals
    *full_path* gets its ``manifest:media-type`` overwritten. When there is no
    such entry, a new ``file-entry`` is appended directly under
    *manifest_root*.

    Args:
        manifest_root: The ``<manifest:manifest>`` element.
        full_path: Value for the ``manifest:full-path`` attribute.
        media_type: Value for the ``manifest:media-type`` attribute.
        ns: Namespace prefix-to-URI mapping; the ``manifest`` entry is used
            to build the ``file-entry`` tag.
        q_fn: Qualified-name builder (e.g. ``q("manifest", "full-path")``).
    """
    path_attr: str = q_fn("manifest", "full-path")
    type_attr: str = q_fn("manifest", "media-type")
    file_entry_tag: str = "{" + ns.get("manifest", "") + "}file-entry"

    candidates = manifest_root.findall(f".//{file_entry_tag}")
    existing = next((node for node in candidates if node.attrib.get(path_attr) == full_path), None)
    if existing is None:
        ET.SubElement(manifest_root, file_entry_tag, {path_attr: full_path, type_attr: media_type})
    else:
        existing.set(type_attr, media_type)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def inject_styles_from_file(
    input_path: Path,
    styles_path: Path,
    output_path: Path,
    mimetype_value: str,
) -> list[str]:
    """Replace the ``styles.xml`` member of an ODF file with the contents of *styles_path*.

    Before writing, the ``text:style-name`` references in ``content.xml`` are
    cross-checked. A reference counts as resolved when a ``style:style`` or
    ``text:list-style`` with that ``style:name`` is defined either in the new
    styles.xml or in ``content.xml`` itself (its automatic styles are not
    touched by the injection). Names that are only mentioned as a
    ``style:parent-style-name`` in the new styles.xml are also treated as
    resolved. Everything else is returned as dangling.

    Args:
        input_path: Source ODF file.
        styles_path: Local styles.xml replacement to inject.
        output_path: Destination ODF file (overwritten).
        mimetype_value: The mimetype string to preserve.

    Returns:
        Sorted list of style names referenced in content but not defined
        after the injection.
    """
    new_styles_bytes: bytes = styles_path.read_bytes()
    new_styles_root: ET.Element = ET.fromstring(new_styles_bytes)
    content_root: ET.Element = parse_xml_from_zip(input_path, "content.xml")

    style_ns: str = "urn:oasis:names:tc:opendocument:xmlns:style:1.0"
    text_ns: str = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
    style_tag: str = f"{{{style_ns}}}style"
    name_attr: str = f"{{{style_ns}}}name"
    parent_attr: str = f"{{{style_ns}}}parent-style-name"
    reference_attr: str = f"{{{text_ns}}}style-name"
    # text:style-name on <text:list> points at a <text:list-style>, so both
    # element kinds count as definitions.
    definition_tags: tuple[str, str] = (style_tag, f"{{{text_ns}}}list-style")

    # content.xml keeps its own automatic styles (P1, T1, L1, ...) after the
    # swap, so they must not be reported as missing.
    defined_names: set[str] = set()
    for document_root in (new_styles_root, content_root):
        for tag in definition_tags:
            for style_el in document_root.iter(tag):
                name = style_el.attrib.get(name_attr)
                if name:
                    defined_names.add(name)

    # NOTE(review): a parent-style-name is only a reference, not a definition;
    # treating it as resolved is kept from the previous behaviour — confirm
    # whether this leniency is intended.
    parent_names: set[str] = set()
    for style_el in new_styles_root.iter(style_tag):
        parent = style_el.attrib.get(parent_attr)
        if parent:
            parent_names.add(parent)

    used: set[str] = set()
    for node in content_root.iter():
        referenced = node.attrib.get(reference_attr)
        if referenced:
            used.add(referenced)
    missing: list[str] = sorted(used - defined_names - parent_names)

    write_odf_with_replacements(
        input_path,
        output_path,
        {"styles.xml": new_styles_bytes},
        mimetype_value,
    )
    return missing
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def embed_pictures(
    input_path: Path,
    pictures: Mapping[str, Path],
    output_path: Path,
    mimetype_value: str,
    ns: Mapping[str, str],
    q_fn: Callable[[str, str], str],
) -> None:
    """Add several local picture files to an ODF package in one pass.

    Every picture becomes a ZIP member at its package path and gets a
    ``file-entry`` in ``META-INF/manifest.xml`` whose media type is sniffed
    from the local file. ``content.xml`` is left untouched; callers reference
    the pictures from their own ``draw:frame`` markup.

    Args:
        input_path: Source ODF file.
        pictures: Mapping of package paths (e.g. ``"Pictures/logo.png"``) to local file paths.
        output_path: Destination ODF file.
        mimetype_value: The mimetype string to preserve.
        ns: Namespace map (must contain ``manifest``).
        q_fn: Qualified-name builder.
    """
    manifest_root: ET.Element = parse_xml_from_zip(input_path, "META-INF/manifest.xml")

    payloads: dict[str, bytes] = {}
    for target, local_file in pictures.items():
        payloads[target] = local_file.read_bytes()
        detected_type: str = sniff_image_mime(local_file)
        ensure_manifest_entry(manifest_root, target, detected_type, ns, q_fn)

    updated_manifest: dict[str, bytes] = {"META-INF/manifest.xml": xml_bytes(manifest_root)}
    copy_with_multiple_members(input_path, output_path, payloads, updated_manifest, mimetype_value)
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def update_meta_for_edit(
    meta_root: ET.Element,
    ns: Mapping[str, str],
    q_fn: Callable[[str, str], str],
) -> None:
    """Stamp ``meta.xml`` with the traces of an edit.

    Works on the ``<office:meta>`` child of *meta_root*, or on *meta_root*
    itself when its local name is ``meta``. Three children are located or
    created, in this order: ``meta:modification-date`` (set to the current
    UTC time, second precision, ISO format), ``meta:generator`` (set to
    ``open-document-skills/<VERSION>``) and ``meta:editing-cycles``
    (incremented; a missing, empty or non-integer value counts as ``0``).

    Args:
        meta_root: The root of ``meta.xml`` (typically ``office:document-meta``).
        ns: Namespace prefix-to-URI mapping; the ``office`` entry is used.
        q_fn: Qualified-name builder, e.g. ``q("meta", "generator")``.

    Raises:
        SystemExit: If no ``office:meta`` element can be located.
    """
    container: ET.Element | None = meta_root.find("{" + ns.get("office", "") + "}meta")
    if container is None:
        if local_name(meta_root.tag) != "meta":
            raise SystemExit("office:meta element not found in meta.xml")
        container = meta_root

    def child(local: str) -> ET.Element:
        # Return the existing meta:<local> child, creating it when absent.
        tag: str = q_fn("meta", local)
        found: ET.Element | None = container.find(tag)
        return ET.SubElement(container, tag) if found is None else found

    # NOTE(review): ODF's standard "last modified" element appears to be
    # dc:date; confirm that meta:modification-date is what consumers expect.
    stamp_el: ET.Element = child("modification-date")
    stamp_el.text = datetime.now(timezone.utc).replace(microsecond=0).isoformat()

    generator_el: ET.Element = child("generator")
    generator_el.text = f"open-document-skills/{VERSION}"

    counter_el: ET.Element = child("editing-cycles")
    raw_count: str = counter_el.text or "0"
    try:
        previous: int = int(raw_count.strip())
    except ValueError:
        previous = 0
    counter_el.text = str(previous + 1)
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
# Leading-byte signatures and the MIME type each one indicates. Entries are
# tried in order by sniff_image_mime(); two of them need a secondary check,
# noted inline.
_IMAGE_MIME_BY_MAGIC: list[tuple[bytes, str]] = [
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"<?xml", "image/svg+xml"),  # requires <svg> later in header
    (b"<svg", "image/svg+xml"),
    (b"BM", "image/bmp"),
    (b"RIFF", "image/webp"),  # requires bytes 8:12 == b"WEBP"
    (b"II*\x00", "image/tiff"),
    (b"MM\x00*", "image/tiff"),
]


def sniff_image_mime(path: Path) -> str:
    """Return the MIME type of an image file by inspecting its leading bytes.

    Reads the first 1024 bytes. Binary signatures are matched against the raw
    header. The two XML-based signatures are matched after a leading UTF-8
    byte-order mark and whitespace have been skipped, and an ``<?xml``
    document only counts as SVG when ``<svg`` occurs within the inspected
    window (an XML prolog plus a comment or DOCTYPE easily exceeds a few
    dozen bytes). A ``RIFF`` container only counts as WebP when bytes 8-12
    are ``WEBP``. When nothing matches, or the file cannot be read, the
    result of :func:`media_type_for` is returned.

    Args:
        path: Local file path.

    Returns:
        MIME type string (e.g. ``"image/png"``); the extension-based guess
        if the file is unreadable or no signature matches.
    """
    try:
        with open(path, "rb") as handle:
            header: bytes = handle.read(1024)
    except OSError:
        return media_type_for(path)

    # Text formats may legitimately start with a BOM and/or whitespace.
    text_header: bytes = header.lstrip(b"\xef\xbb\xbf \t\r\n")

    for magic, mime in _IMAGE_MIME_BY_MAGIC:
        if mime == "image/svg+xml":
            if not text_header.startswith(magic):
                continue
            # Confirm an XML document is actually SVG, not arbitrary XML.
            if magic == b"<?xml" and b"<svg" not in text_header:
                continue
            return mime
        if not header.startswith(magic):
            continue
        # RIFF is a generic container (WAV, AVI, ...); check the four-CC.
        if mime == "image/webp" and header[8:12] != b"WEBP":
            continue
        return mime
    return media_type_for(path)
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
def media_type_for(path: Path) -> str:
    """Guess a MIME type from a file name.

    Args:
        path: File path; only its final component (``path.name``) is
            passed to :func:`mimetypes.guess_type`.

    Returns:
        The guessed MIME type, or ``"application/octet-stream"`` when
        :mod:`mimetypes` has no guess.
    """
    mime, _encoding = mimetypes.guess_type(path.name)
    if mime:
        return mime
    return "application/octet-stream"
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def unique_picture_name(existing: Set[str], image: Path) -> str:
    """Return a unique ``Pictures/…`` path that does not clash with *existing*.

    The file name of *image* is sanitised first (``\\`` and ``/`` become
    ``_``). If ``Pictures/<name>`` is already taken, ``-1``, ``-2``, … is
    inserted before the extension of that same sanitised name until a free
    path is found.

    Args:
        existing: Set of already-used package paths.
        image: Source image file path.

    Returns:
        A ``Pictures/<filename>`` path, with ``-N`` suffix if needed.
    """
    base: str = image.name.replace("\\", "_").replace("/", "_")
    candidate: str = posixpath.join("Pictures", base)
    # Derive the numbered variants from the sanitised name so that they can
    # never re-introduce a path separator that the first candidate removed.
    sanitized: Path = Path(base)
    stem: str = sanitized.stem
    suffix: str = sanitized.suffix
    counter: int = 1
    while candidate in existing:
        candidate = posixpath.join("Pictures", f"{stem}-{counter}{suffix}")
        counter += 1
    return candidate
|
|
406
|
+
|
|
407
|
+
|
|
408
|
+
def copy_into_package(
    input_path: Path,
    output_path: Path,
    package_path: str,
    source: Path,
    replacements: Mapping[str, bytes],
    mimetype_value: str,
) -> None:
    """Copy an ODF ZIP, replacing members and adding *source* at *package_path*.

    When the source archive contains a ``mimetype`` entry it is written first
    and uncompressed. Existing members are then copied (or replaced) in
    source order, except a member already stored at *package_path*, which is
    dropped; *source* is appended last under that name. It is safe for
    *output_path* to be the same file as *input_path*: the result is then
    staged in a temporary file next to it and moved into place only after it
    has been written completely.

    Args:
        input_path: Source ODF file.
        output_path: Destination ODF file (overwritten).
        package_path: Internal ZIP path for the new file.
        source: Local file to insert.
        replacements: Mapping of member names to replacement bytes.
        mimetype_value: The mimetype string to write if not in *replacements*.
    """
    destination: Path = Path(output_path)
    with zipfile.ZipFile(input_path) as src:
        names: list[str] = src.namelist()
        # Opening the destination in "w" mode truncates it immediately, which
        # would destroy the archive that is still being read when both paths
        # refer to the same file.
        in_place: bool = destination.exists() and destination.samefile(input_path)
        target: Path = destination
        if in_place:
            with tempfile.NamedTemporaryFile(
                dir=destination.parent, suffix=destination.suffix, delete=False
            ) as staging:
                target = Path(staging.name)
        written: bool = False
        try:
            with zipfile.ZipFile(target, "w") as dst:
                if "mimetype" in names:
                    dst.writestr(
                        "mimetype",
                        replacements.get("mimetype", mimetype_value.encode()),
                        compress_type=zipfile.ZIP_STORED,
                    )
                for name in names:
                    if name == "mimetype" or name == package_path:
                        continue
                    # Only decompress members that are actually copied through.
                    payload: bytes = replacements[name] if name in replacements else src.read(name)
                    dst.writestr(name, payload, compress_type=zipfile.ZIP_DEFLATED)
                dst.write(source, package_path, compress_type=zipfile.ZIP_DEFLATED)
            written = True
        finally:
            if in_place and not written:
                target.unlink(missing_ok=True)
    if in_place:
        # Keep the permission bits of the file that is being replaced.
        shutil.copymode(destination, target)
        target.replace(destination)
|
|
444
|
+
|
|
445
|
+
|
|
446
|
+
def copy_with_multiple_members(
    input_path: Path,
    output_path: Path,
    new_members: Mapping[str, bytes],
    replacements: Mapping[str, bytes],
    mimetype_value: str,
) -> None:
    """Copy an ODF ZIP with both replacements and arbitrary new members.

    Like :func:`copy_into_package` but for adding *several* new internal files
    (e.g. ``Object 1/content.xml`` plus its directory entry) in one pass.
    When the source archive contains a ``mimetype`` entry it is written first
    and uncompressed. Existing members whose name is a key of *new_members*
    are dropped; the rest are copied (or replaced) in source order, and the
    new members are appended afterwards. It is safe for *output_path* to be
    the same file as *input_path*: the result is then staged in a temporary
    file next to it and moved into place only after it has been written
    completely.

    Args:
        input_path: Source ODF file.
        output_path: Destination ODF file (overwritten).
        new_members: Mapping ``{package_path: bytes}`` of files to add.
        replacements: Mapping of existing member names to replacement bytes.
        mimetype_value: The mimetype string to write if not in *replacements*.
    """
    destination: Path = Path(output_path)
    with zipfile.ZipFile(input_path) as src:
        names: list[str] = src.namelist()
        # Opening the destination in "w" mode truncates it immediately, which
        # would destroy the archive that is still being read when both paths
        # refer to the same file.
        in_place: bool = destination.exists() and destination.samefile(input_path)
        target: Path = destination
        if in_place:
            with tempfile.NamedTemporaryFile(
                dir=destination.parent, suffix=destination.suffix, delete=False
            ) as staging:
                target = Path(staging.name)
        written: bool = False
        try:
            with zipfile.ZipFile(target, "w") as dst:
                if "mimetype" in names:
                    dst.writestr(
                        "mimetype",
                        replacements.get("mimetype", mimetype_value.encode()),
                        compress_type=zipfile.ZIP_STORED,
                    )
                for name in names:
                    if name == "mimetype" or name in new_members:
                        continue
                    # Only decompress members that are actually copied through.
                    payload: bytes = replacements[name] if name in replacements else src.read(name)
                    dst.writestr(name, payload, compress_type=zipfile.ZIP_DEFLATED)
                for path, added in new_members.items():
                    dst.writestr(path, added, compress_type=zipfile.ZIP_DEFLATED)
            written = True
        finally:
            if in_place and not written:
                target.unlink(missing_ok=True)
    if in_place:
        # Keep the permission bits of the file that is being replaced.
        shutil.copymode(destination, target)
        target.replace(destination)
|
|
484
|
+
|
|
485
|
+
|
|
486
|
+
def unique_object_name(existing: Set[str]) -> str:
    """Pick the lowest-numbered free ``Object N`` name (N = 1, 2, 3, ...).

    A name is considered taken when *existing* contains it exactly or
    contains any path below it, e.g. ``Object 3/content.xml`` blocks
    ``Object 3``.

    Args:
        existing: Set of already-used package paths.

    Returns:
        The chosen ``Object N`` (no trailing slash).
    """
    # The part before the first "/" is what an "Object N" candidate has to
    # be compared against.
    taken: set[str] = {name.split("/", 1)[0] for name in existing}
    index: int = 1
    while f"Object {index}" in taken:
        index += 1
    return f"Object {index}"
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
def find_pandoc() -> str | None:
    """Return the path of the ``pandoc`` executable found on PATH, or None."""
    executable: str | None = shutil.which("pandoc")
    return executable
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
# Download URL of each OASIS ODF 1.3 RelaxNG schema, keyed by the short
# schema name that ensure_schema() accepts.
SCHEMA_URLS: dict[str, str] = {
    "content": "https://docs.oasis-open.org/office/OpenDocument/v1.3/os/schemas/OpenDocument-v1.3-schema.rng",
    "manifest": "https://docs.oasis-open.org/office/OpenDocument/v1.3/os/schemas/OpenDocument-v1.3-manifest-schema.rng",
}
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def ensure_schema(name: str) -> Path:
    """Locate an OASIS ODF 1.3 RelaxNG schema, downloading it on first use.

    Schemas are cached under ``$XDG_CACHE_HOME/open-document-skills/schemas/``
    (defaulting to ``~/.cache/open-document-skills/schemas/``). A download is
    written to a temporary file in the cache directory and renamed into place
    only once it is complete and non-empty, so an interrupted or empty
    download never leaves a broken cache entry behind.

    Args:
        name: Either ``"content"`` or ``"manifest"``.

    Returns:
        Local filesystem path to the cached schema.

    Raises:
        SystemExit: If *name* is unknown or download fails.
    """
    import os
    import urllib.request

    if name not in SCHEMA_URLS:
        raise SystemExit(f"unknown schema {name!r}; choose from {sorted(SCHEMA_URLS)}")
    cache_root: Path = (
        Path(os.environ.get("XDG_CACHE_HOME") or Path.home() / ".cache") / "open-document-skills" / "schemas"
    )
    cache_root.mkdir(parents=True, exist_ok=True)
    schema_path: Path = cache_root / f"odf-1.3-{name}.rng"
    if not schema_path.exists():
        url: str = SCHEMA_URLS[name]
        staged: Path | None = None
        try:
            with urllib.request.urlopen(url, timeout=30) as resp:
                payload: bytes = resp.read()
            if not payload:
                raise ValueError("empty response")
            with tempfile.NamedTemporaryFile(dir=cache_root, suffix=".part", delete=False) as staging:
                staged = Path(staging.name)
                staging.write(payload)
            # The rename publishes the file only after it is fully written.
            staged.replace(schema_path)
        except Exception as exc:
            if staged is not None:
                staged.unlink(missing_ok=True)
            raise SystemExit(f"failed to download schema {url}: {exc}") from exc
    return schema_path
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def validate_against_schema(xml_bytes_input: bytes, schema_name: str) -> tuple[bool, list[str]]:
    """Check *xml_bytes_input* against one of the OASIS ODF 1.3 RelaxNG schemas.

    ``lxml`` is imported lazily; when it is not installed, ``SystemExit`` is
    raised with an install hint. The schema file is obtained through
    :func:`ensure_schema`.

    Args:
        xml_bytes_input: Raw XML bytes to validate.
        schema_name: Schema key, e.g. ``"content"`` or ``"manifest"``.

    Returns:
        ``(is_valid, errors)`` where errors is a list of human-readable
        strings; input that is not well-formed XML yields
        ``(False, ["XML syntax error: ..."])``.
    """
    try:
        from lxml import etree  # type: ignore
    except ImportError:
        raise SystemExit("Schema validation requires lxml. Install with:\n pip install open-document-lib[validate]")

    schema_file: Path = ensure_schema(schema_name)
    validator = etree.RelaxNG(etree.parse(str(schema_file)))

    try:
        document = etree.fromstring(xml_bytes_input)
    except etree.XMLSyntaxError as exc:
        return False, [f"XML syntax error: {exc}"]

    is_valid = validator.validate(document)
    problems: list[str] = []
    if not is_valid:
        problems = [f"line {entry.line}: {entry.message}" for entry in validator.error_log]
    return is_valid, problems
|
|
582
|
+
|
|
583
|
+
|
|
584
|
+
def apply_strict_schema_check(odf_path: Path, result: dict[str, object]) -> None:
    """Run RelaxNG validation on an ODF package and fold the outcome into *result*.

    ``content.xml`` is validated against the ``content`` schema and, when the
    package has one, ``META-INF/manifest.xml`` against the ``manifest``
    schema. Schema errors are appended to ``result["errors"]``, each prefixed
    with the member name, and ``result["status"]`` becomes
    ``"errors_found"`` whenever the error list is non-empty afterwards.

    Args:
        odf_path: Path to the ODF package to validate.
        result: A validation result dict with ``"errors"`` and ``"status"``
            keys, as returned by a ``validate_refs`` ``validate()`` function.
            Mutated in place.
    """
    with zipfile.ZipFile(odf_path) as archive:
        content_bytes = archive.read("content.xml")
        try:
            manifest_bytes: bytes | None = archive.read("META-INF/manifest.xml")
        except KeyError:
            manifest_bytes = None

    collected = result["errors"]
    if not isinstance(collected, list):  # defensive — validate() always returns a list
        collected = []
        result["errors"] = collected

    # (label used as message prefix, payload, schema key), in validation order.
    checks: list[tuple[str, bytes, str]] = [("content.xml", content_bytes, "content")]
    if manifest_bytes is not None:
        checks.append(("manifest.xml", manifest_bytes, "manifest"))

    for label, payload, schema_key in checks:
        passed, problems = validate_against_schema(payload, schema_key)
        if not passed:
            collected.extend(f"{label}: {problem}" for problem in problems)

    if collected:
        result["status"] = "errors_found"
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
def latex_to_mathml(latex: str) -> bytes:
    """Turn a LaTeX math snippet into MathML by running pandoc.

    The snippet is wrapped in ``$...$``, converted with
    ``pandoc -f latex -t html5 --mathml``, and the span from the first
    ``<math`` to the last ``</math>`` of pandoc's output is returned.

    Args:
        latex: LaTeX source (without surrounding ``$`` delimiters).

    Returns:
        UTF-8 encoded MathML XML.

    Raises:
        SystemExit: If pandoc is not on PATH (with install hints), or if its
            output contains no ``<math>`` element.
    """
    import subprocess

    executable: str | None = find_pandoc()
    if executable is None:
        raise SystemExit(
            "LaTeX → MathML requires pandoc.\n"
            " macOS: brew install pandoc\n"
            " Ubuntu: sudo apt-get install pandoc\n"
            " Windows: winget install JohnMacFarlane.Pandoc\n"
            "Or supply --mathml or --mathml-inline directly."
        )

    # Math-mode delimiters make pandoc emit a <math> element.
    source: bytes = f"${latex}$".encode("utf-8")
    command: list[str] = [executable, "-f", "latex", "-t", "html5", "--mathml"]
    completed = subprocess.run(command, input=source, capture_output=True, check=True)

    # pandoc wraps its html5 output in <p>...</p>; keep only the <math> element.
    html_out: str = completed.stdout.decode("utf-8")
    closing: str = "</math>"
    first: int = html_out.find("<math")
    last: int = html_out.rfind(closing)
    if first < 0 or last < 0:
        raise SystemExit(f"pandoc did not emit a <math> element. Output was:\n{html_out}")
    return html_out[first : last + len(closing)].encode("utf-8")
|
|
657
|
+
|
|
658
|
+
|
|
659
|
+
def clear_children(element: ET.Element) -> None:
    """Drop every child element of *element*, in place.

    The element's own tag, attributes, text and tail are left as they are.

    Args:
        element: The XML element to clear.
    """
    del element[:]
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
def _collect_text_slots(element: ET.Element) -> list[tuple[ET.Element, str]]:
    """List the ``(node, attr)`` text slots of a subtree in document order.

    The sequence starts with ``(element, "text")``. Each descendant then
    contributes ``(node, "text")`` when it is entered and ``(node, "tail")``
    once all of its own descendants have been listed. The ``.tail`` of
    *element* itself is left out because it lies outside the element's
    content.

    Args:
        element: Root of the subtree to collect from.

    Returns:
        Ordered list of ``(node, attr)`` pairs where ``attr`` is ``"text"`` or ``"tail"``.
    """
    slots: list[tuple[ET.Element, str]] = [(element, "text")]
    # Parallel stacks: the nodes currently open, and an iterator over a
    # snapshot of each open node's children.
    open_nodes: list[ET.Element] = [element]
    pending = [iter(list(element))]
    while pending:
        child = next(pending[-1], None)
        if child is None:
            pending.pop()
            finished = open_nodes.pop()
            # Only the root leaves the stack empty, and its tail is skipped.
            if open_nodes:
                slots.append((finished, "tail"))
            continue
        slots.append((child, "text"))
        open_nodes.append(child)
        pending.append(iter(list(child)))
    return slots
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
def _build_parent_map(element: ET.Element) -> dict[ET.Element, ET.Element]:
    """Map every descendant in *element*'s subtree to its direct parent.

    *element* itself never appears as a key, since it has no parent inside
    the subtree.

    Args:
        element: Root of the subtree.

    Returns:
        Dict mapping each descendant to its direct parent.
    """
    return {child: parent for parent in element.iter() for child in parent}
|
|
712
|
+
|
|
713
|
+
|
|
714
|
+
def find_text_position_in_element(element: ET.Element, needle: str) -> tuple[ET.Element, str, int] | None:
    """Locate where the first occurrence of *needle* starts inside *element*.

    The search runs over the concatenation of every text slot of *element*
    (its own ``.text``, then the ``.text`` and ``.tail`` of each descendant,
    in document order), so a match may run across several slots. Only the
    slot in which the match BEGINS is reported.

    Args:
        element: The element to search.
        needle: Substring to look for. An empty string yields ``None``.

    Returns:
        ``(node, attr, local_offset)`` with ``attr`` being ``"text"`` or
        ``"tail"`` and ``local_offset`` the index inside that slot's string,
        or ``None`` when *needle* does not occur.
    """
    if not needle:
        return None
    slots = _collect_text_slots(element)
    pieces = [getattr(node, attr) or "" for node, attr in slots]
    remaining = "".join(pieces).find(needle)
    if remaining < 0:
        return None
    # Consume slot lengths until the match start falls inside the current
    # slot; what is left of the global index is then the local offset.
    for (node, attr), piece in zip(slots, pieces):
        if remaining < len(piece):
            return node, attr, remaining
        remaining -= len(piece)
    return None
|
|
744
|
+
|
|
745
|
+
|
|
746
|
+
def insert_after_text_in_element(element: ET.Element, anchor: str, new_element: ET.Element) -> bool:
    """Insert *new_element* immediately after the first occurrence of *anchor*.

    The anchor is searched in the concatenated text slots of *element*. The
    slot holding the LAST character of the match is split at that point:
    when it is a ``.text`` slot, *new_element* becomes the first child of
    the owning node; when it is a ``.tail`` slot, *new_element* becomes the
    next sibling of the owning node. Whatever followed the anchor in that
    slot moves to ``new_element.tail``. All other children are left alone.

    The tree is only modified once the insertion is known to be possible, so
    a ``False`` result always means *element* is unchanged.

    Args:
        element: The container in which to search.
        anchor: Substring that locates the insertion point. Empty is rejected.
        new_element: The element to insert. Its ``.tail`` is overwritten.

    Returns:
        ``True`` if the anchor was found and the element inserted, else ``False``.
    """
    if not anchor:
        return False
    slots: list[tuple[ET.Element, str]] = _collect_text_slots(element)
    values: list[str] = [getattr(n, a) or "" for n, a in slots]
    idx: int = "".join(values).find(anchor)
    if idx < 0:
        return False

    # Global index of the last matched character; the slot that holds it is
    # the one to split.
    last_char: int = idx + len(anchor) - 1
    running: int = 0
    for (target_node, target_attr), value in zip(slots, values):
        if last_char < running + len(value):
            break
        running += len(value)
    else:
        return False

    local_end: int = last_char + 1 - running
    prefix: str = value[:local_end]
    suffix: str = value[local_end:]

    if target_attr == "text":
        target_node.text = prefix or None
        target_node.insert(0, new_element)
        new_element.tail = suffix or None
        return True

    # Tail slot: the new element becomes a sibling, so the owning node's
    # parent is required. Resolve it BEFORE touching the tree so that a
    # failure cannot leave a truncated tail behind.
    parent: ET.Element | None = _build_parent_map(element).get(target_node)
    if parent is None:
        return False
    target_node.tail = prefix or None
    parent.insert(list(parent).index(target_node) + 1, new_element)
    new_element.tail = suffix or None
    return True
|
|
801
|
+
|
|
802
|
+
|
|
803
|
+
def replace_pattern_with_element_in_element(
    element: ET.Element,
    pattern: re.Pattern[str],
    factory: Callable[[re.Match[str]], ET.Element],
) -> int:
    """Replace every regex match in *element*'s text content with a built element.

    *pattern* is run once against the concatenated text slots of *element*
    (``.text`` of *element* and its descendants plus ``.tail`` of every
    descendant). Each match that lies inside a single slot is cut out and
    replaced by ``factory(match)``: as the first child of the owning node
    for a ``.text`` slot, or as the next sibling of the owning node for a
    ``.tail`` slot. The text that followed the match in that slot becomes
    the new element's ``.tail``.

    Matches are processed from the LAST to the FIRST, so *factory* is called
    in reverse document order. Matches that straddle several slots are
    skipped silently, as is a tail-slot match whose owning node has no
    parent inside *element*; in both cases *factory* is not called and the
    tree is left untouched.

    Args:
        element: Container to scan.
        pattern: Compiled regex.
        factory: Callable returning a new ``ET.Element`` per match. The
            returned element's ``.tail`` is overwritten.

    Returns:
        Number of replacements performed.
    """

    def snapshot() -> tuple[list[tuple[ET.Element, str]], list[str], list[int]]:
        """Capture the current slots, their values and each slot's global start offset."""
        current_slots = _collect_text_slots(element)
        current_values = [getattr(n, a) or "" for n, a in current_slots]
        starts: list[int] = []
        total = 0
        for v in current_values:
            starts.append(total)
            total += len(v)
        return current_slots, current_values, starts

    slots, values, offsets = snapshot()
    matches: list[re.Match[str]] = list(pattern.finditer("".join(values)))
    if not matches:
        return 0

    def slot_for(global_offset: int) -> int:
        # Binary search for the LAST slot whose start offset is <= the given
        # offset. Picking the last one steps over empty slots, which share
        # their start offset with the slot that follows them.
        lo, hi = 0, len(offsets) - 1
        while lo < hi:
            mid = (lo + hi + 1) // 2
            if offsets[mid] <= global_offset:
                lo = mid
            else:
                hi = mid - 1
        return lo

    parent_map: dict[ET.Element, ET.Element] = _build_parent_map(element)
    replaced: int = 0

    # Work right-to-left: a replacement only changes text at or after its own
    # start, so the global offsets of the matches still to come stay valid.
    for match in reversed(matches):
        start, end = match.start(), match.end()
        i_slot = slot_for(start)
        j_slot = slot_for(end - 1) if end > start else i_slot
        if i_slot != j_slot:
            # Straddle: skip silently.
            continue
        target_node, target_attr = slots[i_slot]

        parent: ET.Element | None = None
        if target_attr == "tail":
            parent = parent_map.get(target_node)
            if parent is None:
                # No place for a sibling. Bail out before mutating anything
                # and before asking the factory for an element.
                continue

        local_start = start - offsets[i_slot]
        local_end = end - offsets[i_slot]
        current = values[i_slot]
        prefix = current[:local_start]
        suffix = current[local_end:]
        new_element = factory(match)

        if parent is None:
            # ".text" slot: the new element becomes the first child.
            target_node.text = prefix or None
            target_node.insert(0, new_element)
        else:
            # ".tail" slot: the new element becomes the next sibling.
            target_node.tail = prefix or None
            parent.insert(list(parent).index(target_node) + 1, new_element)
        new_element.tail = suffix or None

        # Rebuild the bookkeeping so the remaining (earlier) matches see the
        # tree as it is now; simpler than patching the lists incrementally.
        slots, values, offsets = snapshot()
        parent_map = _build_parent_map(element)
        replaced += 1

    return replaced
|
|
899
|
+
|
|
900
|
+
|
|
901
|
+
def insert_in_paragraph(paragraph: ET.Element, position: str, new_element: ET.Element) -> None:
    """Place *new_element* at the very beginning or very end of *paragraph*.

    With ``"end"`` the element is appended as the last child and its
    ``.tail`` is cleared. With ``"start"`` it becomes the first child, and
    any text that used to open the paragraph (``paragraph.text``) is moved
    onto ``new_element.tail`` so that it still follows the new element.

    Args:
        paragraph: The container element (typically ``text:p`` or ``text:h``).
        position: Either ``"start"`` or ``"end"``.
        new_element: Element to insert.

    Raises:
        ValueError: If *position* is not ``"start"`` or ``"end"``.
    """
    if position not in ("start", "end"):
        raise ValueError(f"position must be 'start' or 'end', got {position!r}")

    if position == "start":
        # Hand the leading text over to the new first child's tail.
        displaced = paragraph.text
        paragraph.text = None
        paragraph.insert(0, new_element)
        new_element.tail = displaced
        return

    paragraph.append(new_element)
    new_element.tail = None
|
|
925
|
+
|
|
926
|
+
|
|
927
|
+
def wrap_text_with_pair_in_element(
    element: ET.Element,
    start_anchor: str,
    end_anchor: str,
    start_element: ET.Element,
    end_element: ET.Element,
) -> bool:
    """Bracket a text region with two empty marker elements (e.g. range bookmarks).

    The first occurrence of *start_anchor* in *element*'s concatenated text
    is located, then the first occurrence of *end_anchor* that begins at or
    after the end of that start anchor. *start_element* is inserted right
    after the start anchor and *end_element* right after THAT occurrence of
    the end anchor, even when the end anchor's text also appears earlier in
    the element.

    Both insertion points are resolved against a single snapshot of the tree
    before anything is modified, so a ``False`` result always means
    *element* is unchanged.

    Args:
        element: The paragraph (or similar) to search.
        start_anchor: Substring marking the start of the bracketed region.
        end_anchor: Substring marking the end. Must come after *start_anchor*.
        start_element: Element to insert after the start anchor (e.g. ``text:bookmark-start``).
        end_element: Element to insert after the end anchor (e.g. ``text:bookmark-end``).

    Returns:
        ``True`` if both markers were inserted, ``False`` otherwise (and no
        change is made to *element*).
    """
    if not start_anchor or not end_anchor:
        return False

    slots: list[tuple[ET.Element, str]] = _collect_text_slots(element)
    values: list[str] = [getattr(n, a) or "" for n, a in slots]
    combined: str = "".join(values)

    start_idx: int = combined.find(start_anchor)
    if start_idx < 0:
        return False
    start_stop: int = start_idx + len(start_anchor)
    end_idx: int = combined.find(end_anchor, start_stop)
    if end_idx < 0:
        return False
    end_stop: int = end_idx + len(end_anchor)

    parent_map: dict[ET.Element, ET.Element] = _build_parent_map(element)

    def locate(stop: int) -> tuple[ET.Element, str, int, ET.Element | None] | None:
        """Find the slot holding character ``stop - 1`` and where to cut it.

        Returns ``(node, attr, local_cut, parent)``; ``parent`` is only
        looked up for tail slots. Returns ``None`` when no insertion point
        can be determined.
        """
        running = 0
        for (node, attr), value in zip(slots, values):
            if running <= stop - 1 < running + len(value):
                if attr == "text":
                    return node, attr, stop - running, None
                parent = parent_map.get(node)
                if parent is None:
                    return None
                return node, attr, stop - running, parent
            running += len(value)
        return None

    end_site = locate(end_stop)
    start_site = locate(start_stop)
    if end_site is None or start_site is None:
        return False

    def splice(site: tuple[ET.Element, str, int, ET.Element | None], marker: ET.Element) -> None:
        """Cut the slot at its local offset and put *marker* at the cut."""
        node, attr, local_cut, parent = site
        # Re-read the slot: when both markers share a slot, the end splice
        # has already shortened it to the part before the end marker.
        current = getattr(node, attr) or ""
        setattr(node, attr, current[:local_cut] or None)
        if parent is None:
            node.insert(0, marker)
        else:
            parent.insert(list(parent).index(node) + 1, marker)
        marker.tail = current[local_cut:] or None

    # End first: it lies further right, so the start anchor's slot and local
    # offset are still valid afterwards.
    splice(end_site, end_element)
    splice(start_site, start_element)
    return True
|
|
982
|
+
|
|
983
|
+
|
|
984
|
+
def wrap_text_across_elements(
    elements: list[ET.Element],
    start_anchor: str,
    end_anchor: str,
    start_element: ET.Element,
    end_element: ET.Element,
) -> bool:
    """Bracket a text region with a start/end marker pair across several containers.

    The first entry of *elements* containing *start_anchor* hosts the start
    marker. If *end_anchor* also occurs in that same entry after the start
    anchor, the work is delegated to :func:`wrap_text_with_pair_in_element`.
    Otherwise the first LATER entry containing *end_anchor* hosts the end
    marker. The end marker is inserted first, then the start marker; if the
    start insertion fails, the end marker is removed again and the text it
    displaced is stitched back.

    Args:
        elements: Container elements to search (typically all paragraphs).
        start_anchor: Substring marking the range start.
        end_anchor: Substring marking the range end (must come after start).
        start_element: Element inserted after the start anchor.
        end_element: Element inserted after the end anchor.

    Returns:
        ``True`` if both markers were inserted, ``False`` otherwise.
    """
    if not (start_anchor and end_anchor):
        return False

    # First container that holds the start anchor.
    first = next(
        (pos for pos, candidate in enumerate(elements)
         if find_text_position_in_element(candidate, start_anchor) is not None),
        -1,
    )
    if first < 0:
        return False
    opener = elements[first]

    # Does the end anchor follow the start anchor within that same container?
    flat = "".join(getattr(node, slot) or "" for node, slot in _collect_text_slots(opener))
    after_start = flat.find(start_anchor) + len(start_anchor)
    if flat.find(end_anchor, after_start) >= 0:
        return wrap_text_with_pair_in_element(opener, start_anchor, end_anchor, start_element, end_element)

    # Otherwise take the first later container that holds the end anchor.
    last = next(
        (pos for pos in range(first + 1, len(elements))
         if find_text_position_in_element(elements[pos], end_anchor) is not None),
        -1,
    )
    if last < 0:
        return False
    closer = elements[last]

    # The end marker goes in first; it lives in a later container.
    if not insert_after_text_in_element(closer, end_anchor, end_element):
        return False
    if insert_after_text_in_element(opener, start_anchor, start_element):
        return True

    # Start insertion failed: take the end marker out again and give the text
    # it carried in its tail back to whatever preceded it.
    holder = _build_parent_map(closer).get(end_element)
    if holder is not None:
        position = list(holder).index(end_element)
        carried = end_element.tail or ""
        if position == 0:
            holder.text = ((holder.text or "") + carried) or None
        else:
            before = holder[position - 1]
            before.tail = ((before.tail or "") + carried) or None
        holder.remove(end_element)
    return False
|
|
1060
|
+
|
|
1061
|
+
|
|
1062
|
+
def ensure_sequence_declarations(text_root: ET.Element, names: list[str], ns: Mapping[str, str]) -> None:
    """Ensure ``text:sequence-decls`` exists under *text_root* and contains *names*.

    *text_root* is typically the ``office:text`` element. If it has no
    direct ``text:sequence-decls`` child, an empty one is inserted as its
    first child. A ``text:sequence-decl`` with ``text:display-outline-level``
    ``"0"`` is then appended for every entry of *names* that is not declared
    yet. Each name is declared at most once, even when it is repeated in
    *names*.

    Args:
        text_root: The ``office:text`` element (parent of body content).
        names: Sequence names (e.g. ``["Figure", "Table", "Illustration"]``).
        ns: Namespace map (must contain ``text``).
    """
    text_ns: str = ns["text"]
    decls_tag: str = f"{{{text_ns}}}sequence-decls"
    decl_tag: str = f"{{{text_ns}}}sequence-decl"
    name_attr: str = f"{{{text_ns}}}name"
    display_attr: str = f"{{{text_ns}}}display-outline-level"

    decls: ET.Element | None = text_root.find(decls_tag)
    if decls is None:
        decls = ET.Element(decls_tag)
        text_root.insert(0, decls)

    declared: set[str] = {child.attrib.get(name_attr, "") for child in decls.findall(decl_tag)}
    for name in names:
        if name in declared:
            continue
        ET.SubElement(decls, decl_tag, {name_attr: name, display_attr: "0"})
        # Remember it, so a repeat later in *names* is not declared twice.
        declared.add(name)
|
|
1091
|
+
|
|
1092
|
+
|
|
1093
|
+
def replace_text_in_element(element: ET.Element, old: str, new: str) -> int:
    """Substitute *new* for every occurrence of *old* in *element*'s text.

    The search runs over the concatenation of all text slots (``.text`` of
    *element* and of every descendant, plus ``.tail`` of every descendant,
    in document order). Only slot strings are rewritten; no child element is
    added, removed or moved, so inline children such as ``text:span``,
    ``text:note``, ``text:bookmark`` or ``text:a`` keep their identity.

    An occurrence that runs across child boundaries is replaced too: *new*
    is written into the slot where the occurrence starts, slots lying fully
    inside it are emptied, and the slot where it ends keeps only what
    follows the occurrence.

    Args:
        element: The element whose textual content should be searched.
        old: Substring to search for. Empty string is a no-op.
        new: Replacement string.

    Returns:
        Number of non-overlapping replacements performed.
    """
    if not old:
        return 0

    slots = _collect_text_slots(element)
    pieces = [getattr(node, attr) or "" for node, attr in slots]
    haystack = "".join(pieces)

    # Non-overlapping occurrences, left to right, as (begin, stop) offsets.
    width = len(old)
    spans: list[tuple[int, int]] = []
    hit = haystack.find(old)
    while hit >= 0:
        spans.append((hit, hit + width))
        hit = haystack.find(old, hit + width)
    if not spans:
        return 0

    # Global offset at which each slot's string begins.
    starts: list[int] = []
    total = 0
    for piece in pieces:
        starts.append(total)
        total += len(piece)

    def owner(position: int) -> int:
        # Last slot whose start is <= position. Invariant of the search:
        # starts[low] <= position, and every slot from `high` on starts
        # after position.
        low, high = 0, len(starts)
        while high - low > 1:
            middle = (low + high) // 2
            if starts[middle] <= position:
                low = middle
            else:
                high = middle
        return low

    # Right-to-left, so the still-unprocessed (earlier) text keeps the
    # offsets that were computed from the original strings.
    for begin, stop in reversed(spans):
        first = owner(begin)
        last = owner(stop - 1)
        cut_from = begin - starts[first]
        cut_to = stop - starts[last]
        if first == last:
            piece = pieces[first]
            pieces[first] = piece[:cut_from] + new + piece[cut_to:]
            continue
        pieces[first] = pieces[first][:cut_from] + new
        for between in range(first + 1, last):
            pieces[between] = ""
        pieces[last] = pieces[last][cut_to:]

    # Write every slot back; empty strings are stored as None.
    for (node, attr), piece in zip(slots, pieces):
        setattr(node, attr, piece or None)

    return len(spans)
|
|
1164
|
+
|
|
1165
|
+
|
|
1166
|
+
def find_soffice() -> str:
    """Return a path to the LibreOffice command-line executable.

    ``soffice`` and then ``libreoffice`` are looked up on ``PATH`` via
    ``shutil.which``. If neither resolves, a fixed list of installation
    paths is probed in order (the macOS application bundle, common Linux
    locations including snap, and the Windows install directory under its
    native, ``/c/`` and ``/mnt/c/`` spellings) and the first one that exists
    is returned.

    Returns:
        The path reported by ``shutil.which``, or the first existing
        well-known installation path.

    Raises:
        SystemExit: If no executable is found.
    """
    on_path = next(filter(None, map(shutil.which, ("soffice", "libreoffice"))), None)
    if on_path:
        return on_path

    fallbacks = (
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "/usr/bin/libreoffice",
        "/usr/local/bin/libreoffice",
        "/snap/bin/libreoffice",
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        "/c/Program Files/LibreOffice/program/soffice.exe",
        "/mnt/c/Program Files/LibreOffice/program/soffice.exe",
    )
    for location in fallbacks:
        if Path(location).exists():
            return location
    raise SystemExit("LibreOffice/soffice not found")
|
|
1195
|
+
|
|
1196
|
+
|
|
1197
|
+
def unpack_to_temp(path: Path) -> tempfile.TemporaryDirectory[str]:
    """Extract an ODF ZIP to a temporary directory.

    On success the caller owns the returned ``TemporaryDirectory`` and is
    responsible for cleaning it up (e.g. via a context manager or
    ``.cleanup()``). If the archive cannot be opened or extracted, the
    temporary directory is removed before the error propagates, so a failed
    call leaves nothing behind.

    Args:
        path: Path to the ODF ZIP file.

    Returns:
        A ``tempfile.TemporaryDirectory`` containing the extracted contents.

    Raises:
        zipfile.BadZipFile: If *path* is not a valid ZIP archive.
        OSError: If *path* cannot be read or the contents cannot be written.
    """
    temp: tempfile.TemporaryDirectory[str] = tempfile.TemporaryDirectory()
    try:
        with zipfile.ZipFile(path) as archive:
            archive.extractall(temp.name)
    except BaseException:
        # The caller never receives the handle on failure, so it has to be
        # cleaned up here.
        temp.cleanup()
        raise
    return temp
|
|
1213
|
+
|
|
1214
|
+
|
|
1215
|
+
# Leading byte signatures, tried in this order, and the file extension each
# one maps to. The "RIFF" entry is only a container prefix; see
# _sniff_image_extension for the extra check applied to it.
_IMAGE_MAGIC: list[tuple[bytes, str]] = [
    (b"\x89PNG\r\n\x1a\n", ".png"),
    (b"\xff\xd8\xff", ".jpg"),
    (b"GIF87a", ".gif"),
    (b"GIF89a", ".gif"),
    (b"<?xml", ".svg"),
    (b"<svg", ".svg"),
    (b"BM", ".bmp"),
    (b"RIFF", ".webp"),
]


def _sniff_image_extension(data: bytes) -> str:
    """Guess a file extension for *data* from its leading bytes.

    The signatures in ``_IMAGE_MAGIC`` are tried in order and the extension
    of the first one that *data* starts with is returned. A ``RIFF`` prefix
    counts as WebP only when bytes 8-11 spell ``WEBP``, because RIFF is a
    generic container that other formats (e.g. WAV, AVI) use as well.

    Args:
        data: Raw file content; only the first few bytes are inspected.

    Returns:
        The matching extension including the leading dot, or ``".bin"``
        when no signature matches.
    """
    for magic, ext in _IMAGE_MAGIC:
        if not data.startswith(magic):
            continue
        if magic == b"RIFF" and data[8:12] != b"WEBP":
            # Some other RIFF payload, or too short to tell: not a WebP.
            continue
        return ext
    return ".bin"
|
|
1232
|
+
|
|
1233
|
+
|
|
1234
|
+
def _object_media_types(manifest_bytes: bytes | None) -> dict[str, str]:
    """Read the media-type of every ``Object N/`` directory from a manifest.

    Only ``manifest:file-entry`` elements whose ``full-path`` starts with
    ``"Object "`` and ends with ``"/"``, and which carry a non-empty
    ``media-type``, are considered. Missing, empty or unparseable manifest
    data yields an empty mapping.

    Args:
        manifest_bytes: Raw ``META-INF/manifest.xml`` content, or ``None``.

    Returns:
        Dict mapping the directory name (trailing slashes removed) to its
        media-type.
    """
    if not manifest_bytes:
        return {}
    manifest_ns = ODF_NAMESPACES["manifest"]
    try:
        manifest_root = ET.fromstring(manifest_bytes)
    except ET.ParseError:
        return {}

    path_key = f"{{{manifest_ns}}}full-path"
    media_key = f"{{{manifest_ns}}}media-type"
    found: dict[str, str] = {}
    for entry in manifest_root.findall(f".//{{{manifest_ns}}}file-entry"):
        full_path = entry.get(path_key, "")
        media_type = entry.get(media_key, "")
        is_object_dir = full_path.startswith("Object ") and full_path.endswith("/")
        if is_object_dir and media_type:
            found[full_path.rstrip("/")] = media_type
    return found
|
|
1250
|
+
|
|
1251
|
+
|
|
1252
|
+
def _flatten_object(members: dict[str, bytes], mimetype: str | None) -> ET.Element:
    """Combine the XML members of an embedded object into one flat document.

    Every member is parsed; members that are not well-formed XML are
    ignored. The top-level sections are then copied into a new
    ``<office:document>`` in this order: ``meta`` (meta.xml), ``settings``
    (settings.xml), ``scripts`` (content.xml), ``font-face-decls`` and
    ``styles`` (styles.xml), a single ``automatic-styles`` holding the
    automatic styles of styles.xml followed by those of content.xml,
    ``master-styles`` (styles.xml) and finally ``body`` (content.xml).

    Args:
        members: Mapping of member filename (e.g. ``"content.xml"``) to bytes.
        mimetype: Object media-type for the ``office:mimetype`` attribute;
            the attribute is omitted when this is empty or ``None``.

    Returns:
        A nested ``<office:document>`` element ready to inline inside
        ``<draw:object>``.
    """
    office_ns = ODF_NAMESPACES["office"]
    flat = ET.Element(f"{{{office_ns}}}document", {f"{{{office_ns}}}version": "1.3"})
    if mimetype:
        flat.set(f"{{{office_ns}}}mimetype", mimetype)

    parsed: dict[str, ET.Element] = {}
    for member_name, payload in members.items():
        try:
            parsed[member_name] = ET.fromstring(payload)
        except ET.ParseError:
            # Not XML: treat the member as absent.
            continue

    def sections(member: str, wanted: str) -> list[ET.Element]:
        """Top-level children of *member* whose local name is *wanted*."""
        source = parsed.get(member)
        if source is None:
            return []
        return [node for node in source if local_name(node.tag) == wanted]

    # Sections that precede the merged automatic styles.
    for member, wanted in (
        ("meta.xml", "meta"),
        ("settings.xml", "settings"),
        ("content.xml", "scripts"),
        ("styles.xml", "font-face-decls"),
        ("styles.xml", "styles"),
    ):
        flat.extend(sections(member, wanted))

    # A flat document has one automatic-styles section; pool both sources.
    combined_auto = ET.SubElement(flat, f"{{{office_ns}}}automatic-styles")
    for member in ("styles.xml", "content.xml"):
        for auto_section in sections(member, "automatic-styles"):
            combined_auto.extend(list(auto_section))

    # Sections that follow the merged automatic styles.
    for member, wanted in (
        ("styles.xml", "master-styles"),
        ("content.xml", "body"),
    ):
        flat.extend(sections(member, wanted))
    return flat
|
|
1298
|
+
|
|
1299
|
+
|
|
1300
|
+
def _split_object_flat(doc: ET.Element) -> tuple[dict[str, bytes], str | None]:
    """Break a flat object ``<office:document>`` up into sub-package members.

    The direct children of *doc* are distributed by local name: ``meta``
    goes to meta.xml, ``settings`` to settings.xml, ``font-face-decls``,
    ``styles`` and ``master-styles`` to styles.xml, and ``scripts`` and
    ``body`` to content.xml. The children of every ``automatic-styles``
    section are gathered in content.xml's single ``automatic-styles``
    element. Children with any other name are dropped. ``content.xml`` is
    always produced; the other three members only when they received at
    least one section.

    Args:
        doc: The nested ``<office:document>`` inlined inside ``<draw:object>``.

    Returns:
        A ``(members, mimetype)`` pair where ``members`` maps member filenames
        to serialized bytes and ``mimetype`` is *doc*'s ``office:mimetype``
        attribute (``None`` when absent).
    """
    office_ns = ODF_NAMESPACES["office"]
    version_key = f"{{{office_ns}}}version"

    def empty_root(suffix: str) -> ET.Element:
        """Create an empty ``office:document-<suffix>`` root at version 1.3."""
        return ET.Element(f"{{{office_ns}}}document-{suffix}", {version_key: "1.3"})

    media_type = doc.attrib.get(f"{{{office_ns}}}mimetype")
    content_doc = empty_root("content")
    styles_doc = empty_root("styles")
    meta_doc = empty_root("meta")
    settings_doc = empty_root("settings")
    content_auto = ET.SubElement(content_doc, f"{{{office_ns}}}automatic-styles")

    for section in list(doc):
        kind = local_name(section.tag)
        if kind == "automatic-styles":
            content_auto.extend(list(section))
        elif kind == "scripts":
            # Scripts go in front of the automatic-styles element.
            content_doc.insert(0, section)
        elif kind == "body":
            content_doc.append(section)
        elif kind == "meta":
            meta_doc.append(section)
        elif kind == "settings":
            settings_doc.append(section)
        elif kind in ("font-face-decls", "styles", "master-styles"):
            styles_doc.append(section)

    parts: dict[str, bytes] = {"content.xml": xml_bytes(content_doc)}
    # Optional members are written only if something was put into them.
    for filename, member_root in (
        ("styles.xml", styles_doc),
        ("meta.xml", meta_doc),
        ("settings.xml", settings_doc),
    ):
        if len(member_root) > 0:
            parts[filename] = xml_bytes(member_root)
    return parts, media_type
|
|
1344
|
+
|
|
1345
|
+
|
|
1346
|
+
def pack_flat_odf(input_zip: Path, output_flat: Path) -> None:
|
|
1347
|
+
"""Convert a zipped ODF package to flat (single-XML) ODF.
|
|
1348
|
+
|
|
1349
|
+
The resulting file has a single ``<office:document>`` root with merged
|
|
1350
|
+
content, styles, meta, and settings, plus all embedded pictures encoded
|
|
1351
|
+
inline as ``<office:binary-data>`` children of their ``<draw:image>``.
|
|
1352
|
+
|
|
1353
|
+
Args:
|
|
1354
|
+
input_zip: Source ODF file (``.odt``/``.odp``/``.ods``/``.odg``).
|
|
1355
|
+
output_flat: Destination flat ODF file (``.fodt``/``.fodp``/...).
|
|
1356
|
+
"""
|
|
1357
|
+
for prefix, uri in ODF_NAMESPACES.items():
|
|
1358
|
+
ET.register_namespace(prefix, uri)
|
|
1359
|
+
|
|
1360
|
+
office_ns: str = ODF_NAMESPACES["office"]
|
|
1361
|
+
xlink_ns: str = ODF_NAMESPACES["xlink"]
|
|
1362
|
+
draw_ns: str = ODF_NAMESPACES["draw"]
|
|
1363
|
+
|
|
1364
|
+
with zipfile.ZipFile(input_zip) as archive:
|
|
1365
|
+
mimetype: str = archive.read("mimetype").decode("ascii").strip()
|
|
1366
|
+
meta_root: ET.Element = ET.fromstring(archive.read("meta.xml"))
|
|
1367
|
+
settings_root: ET.Element = ET.fromstring(archive.read("settings.xml"))
|
|
1368
|
+
styles_root: ET.Element = ET.fromstring(archive.read("styles.xml"))
|
|
1369
|
+
content_root: ET.Element = ET.fromstring(archive.read("content.xml"))
|
|
1370
|
+
pictures: dict[str, bytes] = {
|
|
1371
|
+
name: archive.read(name) for name in archive.namelist() if name.startswith("Pictures/")
|
|
1372
|
+
}
|
|
1373
|
+
# Embedded objects (charts, formulas) live under 'Object N/' as full
|
|
1374
|
+
# sub-packages (content.xml plus optional styles.xml/meta.xml).
|
|
1375
|
+
object_members: dict[str, bytes] = {
|
|
1376
|
+
name: archive.read(name)
|
|
1377
|
+
for name in archive.namelist()
|
|
1378
|
+
if name.startswith("Object ") and not name.endswith("/")
|
|
1379
|
+
}
|
|
1380
|
+
try:
|
|
1381
|
+
object_manifest: bytes | None = archive.read("META-INF/manifest.xml")
|
|
1382
|
+
except KeyError:
|
|
1383
|
+
object_manifest = None
|
|
1384
|
+
|
|
1385
|
+
flat_root: ET.Element = ET.Element(
|
|
1386
|
+
f"{{{office_ns}}}document",
|
|
1387
|
+
{
|
|
1388
|
+
f"{{{office_ns}}}version": "1.3",
|
|
1389
|
+
f"{{{office_ns}}}mimetype": mimetype,
|
|
1390
|
+
},
|
|
1391
|
+
)
|
|
1392
|
+
|
|
1393
|
+
def _children_matching(source: ET.Element, names: set[str]) -> list[ET.Element]:
|
|
1394
|
+
return [child for child in source if local_name(child.tag) in names]
|
|
1395
|
+
|
|
1396
|
+
for child in _children_matching(meta_root, {"meta"}):
|
|
1397
|
+
flat_root.append(child)
|
|
1398
|
+
for child in _children_matching(settings_root, {"settings"}):
|
|
1399
|
+
flat_root.append(child)
|
|
1400
|
+
for child in _children_matching(content_root, {"scripts"}):
|
|
1401
|
+
flat_root.append(child)
|
|
1402
|
+
for child in _children_matching(styles_root, {"font-face-decls"}):
|
|
1403
|
+
flat_root.append(child)
|
|
1404
|
+
for child in _children_matching(styles_root, {"styles"}):
|
|
1405
|
+
flat_root.append(child)
|
|
1406
|
+
|
|
1407
|
+
merged_auto: ET.Element = ET.SubElement(flat_root, f"{{{office_ns}}}automatic-styles")
|
|
1408
|
+
for source in (styles_root, content_root):
|
|
1409
|
+
for auto in _children_matching(source, {"automatic-styles"}):
|
|
1410
|
+
for grandchild in list(auto):
|
|
1411
|
+
merged_auto.append(grandchild)
|
|
1412
|
+
|
|
1413
|
+
for child in _children_matching(styles_root, {"master-styles"}):
|
|
1414
|
+
flat_root.append(child)
|
|
1415
|
+
for child in _children_matching(content_root, {"body"}):
|
|
1416
|
+
flat_root.append(child)
|
|
1417
|
+
|
|
1418
|
+
for image in flat_root.iter(f"{{{draw_ns}}}image"):
|
|
1419
|
+
href: str | None = image.attrib.get(f"{{{xlink_ns}}}href")
|
|
1420
|
+
if href and href in pictures:
|
|
1421
|
+
for attr in (
|
|
1422
|
+
f"{{{xlink_ns}}}href",
|
|
1423
|
+
f"{{{xlink_ns}}}type",
|
|
1424
|
+
f"{{{xlink_ns}}}show",
|
|
1425
|
+
f"{{{xlink_ns}}}actuate",
|
|
1426
|
+
):
|
|
1427
|
+
image.attrib.pop(attr, None)
|
|
1428
|
+
binary: ET.Element = ET.SubElement(image, f"{{{office_ns}}}binary-data")
|
|
1429
|
+
binary.text = base64.b64encode(pictures[href]).decode("ascii")
|
|
1430
|
+
|
|
1431
|
+
# Embed object sub-packages (charts, formulas): inline the object's
|
|
1432
|
+
# full sub-package as a nested <office:document> inside its <draw:object>.
|
|
1433
|
+
object_media: dict[str, str] = _object_media_types(object_manifest)
|
|
1434
|
+
for obj in list(flat_root.iter(f"{{{draw_ns}}}object")):
|
|
1435
|
+
href_obj: str | None = obj.attrib.get(f"{{{xlink_ns}}}href")
|
|
1436
|
+
if not href_obj:
|
|
1437
|
+
continue
|
|
1438
|
+
obj_dir: str = href_obj.lstrip("./").rstrip("/")
|
|
1439
|
+
members: dict[str, bytes] = {
|
|
1440
|
+
name[len(obj_dir) + 1 :]: data for name, data in object_members.items() if name.startswith(obj_dir + "/")
|
|
1441
|
+
}
|
|
1442
|
+
if "content.xml" not in members:
|
|
1443
|
+
continue
|
|
1444
|
+
for attr in (
|
|
1445
|
+
f"{{{xlink_ns}}}href",
|
|
1446
|
+
f"{{{xlink_ns}}}type",
|
|
1447
|
+
f"{{{xlink_ns}}}show",
|
|
1448
|
+
f"{{{xlink_ns}}}actuate",
|
|
1449
|
+
):
|
|
1450
|
+
obj.attrib.pop(attr, None)
|
|
1451
|
+
obj.append(_flatten_object(members, object_media.get(obj_dir)))
|
|
1452
|
+
|
|
1453
|
+
output_flat.write_bytes(xml_bytes(flat_root))
|
|
1454
|
+
|
|
1455
|
+
|
|
1456
|
+
def unpack_flat_odf(input_flat: Path, output_zip: Path) -> None:
    """Convert a flat ODF file back to a zipped ODF package.

    Splits the single ``<office:document>`` root into the standard four
    XML files (content/styles/meta/settings), extracts inline pictures
    from ``<office:binary-data>`` blobs into ``Pictures/`` entries, moves
    inlined ``<draw:object>`` documents back into ``Object N/``
    sub-packages, and rebuilds ``META-INF/manifest.xml``.

    Children of the generated roots are written in the order the ODF
    schema prescribes: ``font-face-decls``, ``styles``,
    ``automatic-styles``, ``master-styles`` for ``styles.xml`` and
    ``scripts``, ``automatic-styles``, ``body`` for ``content.xml``.

    Args:
        input_flat: Source flat ODF file.
        output_zip: Destination zipped ODF file.

    Raises:
        SystemExit: If the flat root has no ``office:mimetype`` attribute.
    """
    for prefix, uri in ODF_NAMESPACES.items():
        ET.register_namespace(prefix, uri)

    office_ns: str = ODF_NAMESPACES["office"]
    xlink_ns: str = ODF_NAMESPACES["xlink"]
    draw_ns: str = ODF_NAMESPACES["draw"]
    style_ns: str = ODF_NAMESPACES["style"]
    manifest_ns: str = ODF_NAMESPACES["manifest"]
    math_ns: str = ODF_NAMESPACES["math"]
    chart_ns: str = ODF_NAMESPACES["chart"]

    image_tag: str = f"{{{draw_ns}}}image"
    object_tag: str = f"{{{draw_ns}}}object"
    binary_tag: str = f"{{{office_ns}}}binary-data"
    document_tag: str = f"{{{office_ns}}}document"
    page_layout_tag: str = f"{{{style_ns}}}page-layout"
    entry_tag: str = f"{{{manifest_ns}}}file-entry"

    def link_embedded(element: ET.Element, href: str) -> None:
        """Point *element* at a package member via the xlink embed attributes."""
        element.set(f"{{{xlink_ns}}}href", href)
        element.set(f"{{{xlink_ns}}}type", "simple")
        element.set(f"{{{xlink_ns}}}show", "embed")
        element.set(f"{{{xlink_ns}}}actuate", "onLoad")

    def inlined_objects(root: ET.Element) -> list[tuple[ET.Element, ET.Element]]:
        """Collect ``(draw:object, inlined office:document)`` pairs in document order.

        The walk does not descend into an inlined document, so objects
        nested inside it stay part of that document instead of being
        split out a second time.
        """
        found: list[tuple[ET.Element, ET.Element]] = []
        pending: list[ET.Element] = [root]
        while pending:
            node = pending.pop()
            children = list(node)
            if node.tag == object_tag:
                inlined = node.find(document_tag)
                if inlined is not None:
                    found.append((node, inlined))
                    children = [c for c in children if c is not inlined]
            # Reversed push keeps the pop order equal to document order.
            pending.extend(reversed(children))
        return found

    flat_root: ET.Element = ET.parse(input_flat).getroot()
    mimetype: str | None = flat_root.attrib.get(f"{{{office_ns}}}mimetype")
    if not mimetype:
        raise SystemExit("flat ODF root missing office:mimetype attribute")

    meta_doc: ET.Element = ET.Element(f"{{{office_ns}}}document-meta", {f"{{{office_ns}}}version": "1.3"})
    settings_doc: ET.Element = ET.Element(f"{{{office_ns}}}document-settings", {f"{{{office_ns}}}version": "1.3"})
    styles_doc: ET.Element = ET.Element(f"{{{office_ns}}}document-styles", {f"{{{office_ns}}}version": "1.3"})
    content_doc: ET.Element = ET.Element(f"{{{office_ns}}}document-content", {f"{{{office_ns}}}version": "1.3"})

    # The automatic-styles containers are built detached and attached only
    # once their siblings are known, so they land in schema order instead
    # of always being the first child.
    styles_auto: ET.Element = ET.Element(f"{{{office_ns}}}automatic-styles")
    content_auto: ET.Element = ET.Element(f"{{{office_ns}}}automatic-styles")
    styles_before_auto: list[ET.Element] = []  # font-face-decls, styles
    styles_after_auto: list[ET.Element] = []  # master-styles
    content_before_auto: list[ET.Element] = []  # scripts
    content_after_auto: list[ET.Element] = []  # body

    for child in list(flat_root):
        section: str = local_name(child.tag)
        if section == "meta":
            meta_doc.append(child)
        elif section == "settings":
            settings_doc.append(child)
        elif section == "scripts":
            content_before_auto.append(child)
        elif section in ("font-face-decls", "styles"):
            styles_before_auto.append(child)
        elif section == "automatic-styles":
            # Page layouts go to styles.xml; every other automatic style
            # goes to content.xml.
            for grandchild in list(child):
                if grandchild.tag == page_layout_tag:
                    styles_auto.append(grandchild)
                else:
                    content_auto.append(grandchild)
        elif section == "master-styles":
            styles_after_auto.append(child)
        elif section == "body":
            content_after_auto.append(child)

    styles_doc.extend([*styles_before_auto, styles_auto, *styles_after_auto])
    content_doc.extend([*content_before_auto, content_auto, *content_after_auto])

    # content.xml is scanned first so picture/object numbering for body
    # content is unaffected by anything found in styles.xml (master pages).
    split_docs: tuple[ET.Element, ...] = (content_doc, styles_doc)

    pictures: dict[str, bytes] = {}
    existing_names: set[str] = set()
    for doc in split_docs:
        # Materialise the matches first: the loop body edits the tree, and
        # editing during Element.iter() is documented as undefined.
        for image in list(doc.iter(image_tag)):
            binary: ET.Element | None = image.find(binary_tag)
            if binary is None or not binary.text:
                continue
            data: bytes = base64.b64decode(binary.text.strip())
            ext: str = _sniff_image_extension(data)
            candidate: str = unique_picture_name(existing_names, Path(f"image{len(pictures) + 1}{ext}"))
            existing_names.add(candidate)
            pictures[candidate] = data
            image.remove(binary)
            link_embedded(image, candidate)

    # Extract inlined object sub-packages (charts, formulas) back to Object N/.
    objects: dict[str, bytes] = {}
    object_media: dict[str, str] = {}
    object_count = 0
    for doc in split_docs:
        for obj, doc_child in inlined_objects(doc):
            object_count += 1
            obj_dir = f"Object {object_count}"
            members, declared_mime = _split_object_flat(doc_child)
            for member_name, member_bytes in members.items():
                objects[f"{obj_dir}/{member_name}"] = member_bytes
            # Prefer the media type returned by the splitter; otherwise
            # guess from the inlined content, defaulting to a text document.
            if declared_mime:
                object_media[obj_dir] = declared_mime
            elif doc_child.find(f".//{{{math_ns}}}math") is not None:
                object_media[obj_dir] = "application/vnd.oasis.opendocument.formula"
            elif doc_child.find(f".//{{{chart_ns}}}chart") is not None:
                object_media[obj_dir] = "application/vnd.oasis.opendocument.chart"
            else:
                object_media[obj_dir] = "application/vnd.oasis.opendocument.text"
            obj.remove(doc_child)
            link_embedded(obj, f"./{obj_dir}/")

    manifest_doc: ET.Element = ET.Element(
        f"{{{manifest_ns}}}manifest",
        {f"{{{manifest_ns}}}version": "1.3"},
    )

    def add_entry(full_path: str, media_type: str, version: str | None = None) -> None:
        """Append one ``manifest:file-entry`` to the manifest root."""
        attrs: dict[str, str] = {
            f"{{{manifest_ns}}}full-path": full_path,
            f"{{{manifest_ns}}}media-type": media_type,
        }
        if version is not None:
            attrs[f"{{{manifest_ns}}}version"] = version
        ET.SubElement(manifest_doc, entry_tag, attrs)

    add_entry("/", mimetype, "1.3")
    for xml_member in ("content.xml", "styles.xml", "meta.xml", "settings.xml"):
        add_entry(xml_member, "text/xml")
    for picture_path in pictures:
        add_entry(picture_path, media_type_for(Path(picture_path)))
    for obj_dir, media in object_media.items():
        add_entry(f"{obj_dir}/", media)
    for object_path in objects:
        add_entry(object_path, "text/xml")

    with zipfile.ZipFile(output_zip, "w") as archive:
        # The mimetype member is written first and stored uncompressed.
        archive.writestr("mimetype", mimetype, compress_type=zipfile.ZIP_STORED)
        archive.writestr("content.xml", xml_bytes(content_doc), compress_type=zipfile.ZIP_DEFLATED)
        archive.writestr("styles.xml", xml_bytes(styles_doc), compress_type=zipfile.ZIP_DEFLATED)
        archive.writestr("meta.xml", xml_bytes(meta_doc), compress_type=zipfile.ZIP_DEFLATED)
        archive.writestr("settings.xml", xml_bytes(settings_doc), compress_type=zipfile.ZIP_DEFLATED)
        archive.writestr("META-INF/manifest.xml", xml_bytes(manifest_doc), compress_type=zipfile.ZIP_DEFLATED)
        for path, data in pictures.items():
            archive.writestr(path, data, compress_type=zipfile.ZIP_DEFLATED)
        for path, data in objects.items():
            archive.writestr(path, data, compress_type=zipfile.ZIP_DEFLATED)
|
|
1614
|
+
|
|
1615
|
+
|
|
1616
|
+
def local_name(tag: str) -> str:
    """Return the local part of a tag, dropping any ``{namespace}`` prefix.

    Args:
        tag: XML tag in Clark notation (e.g. ``"{urn:...}text"``) or a
            plain, un-namespaced name.

    Returns:
        Everything after the first ``}`` when *tag* starts with ``{``;
        otherwise *tag* unchanged.
    """
    if not tag.startswith("{"):
        return tag
    pieces = tag.split("}", 1)
    return pieces[1]
|