open-document-lib 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
odf_lib/odf_common.py ADDED
@@ -0,0 +1,1625 @@
1
+ """Shared helpers for OpenDocument Format scripts.
2
+
3
+ All four ODF skills (ODT, ODP, ODS, ODG) use these functions.
4
+ Format-specific *_common.py modules import from here and add their
5
+ own NS dict, MIMETYPE constant, and format-specific helpers.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import base64
11
+ import mimetypes
12
+ import posixpath
13
+ import re
14
+ import shutil
15
+ import tempfile
16
+ import zipfile
17
+ from collections.abc import Callable, Mapping, Set
18
+ from datetime import datetime, timezone
19
+ from pathlib import Path
20
+ from xml.etree import ElementTree as ET
21
+
22
VERSION = "1.0.0"  # keep in sync with pyproject.toml (see CONTRIBUTING.md)

# Stem shared by every OASIS-defined OpenDocument namespace URI below.
_OASIS_XMLNS = "urn:oasis:names:tc:opendocument:xmlns:"

# Namespace prefix -> namespace URI for the XML vocabularies listed here.
# Entries that do not start with the OASIS stem are spelled out in full.
ODF_NAMESPACES: dict[str, str] = {
    "office": _OASIS_XMLNS + "office:1.0",
    "text": _OASIS_XMLNS + "text:1.0",
    "draw": _OASIS_XMLNS + "drawing:1.0",
    "style": _OASIS_XMLNS + "style:1.0",
    "fo": _OASIS_XMLNS + "xsl-fo-compatible:1.0",
    "svg": _OASIS_XMLNS + "svg-compatible:1.0",
    "table": _OASIS_XMLNS + "table:1.0",
    "meta": _OASIS_XMLNS + "meta:1.0",
    "dc": "http://purl.org/dc/elements/1.1/",
    "manifest": _OASIS_XMLNS + "manifest:1.0",
    "xlink": "http://www.w3.org/1999/xlink",
    "presentation": _OASIS_XMLNS + "presentation:1.0",
    "config": _OASIS_XMLNS + "config:1.0",
    "smil": _OASIS_XMLNS + "smil-compatible:1.0",
    "anim": _OASIS_XMLNS + "animation:1.0",
    "chart": _OASIS_XMLNS + "chart:1.0",
    "form": _OASIS_XMLNS + "form:1.0",
    "script": _OASIS_XMLNS + "script:1.0",
    "math": "http://www.w3.org/1998/Math/MathML",
    "number": _OASIS_XMLNS + "datastyle:1.0",
    "of": _OASIS_XMLNS + "of:1.2",
    "loext": "urn:org:documentfoundation:names:experimental:office:xmlns:loext:1.0",
}

# Stem shared by the four OpenDocument package MIME types.
_ODF_MIME_STEM = "application/vnd.oasis.opendocument."

# Package MIME type -> file extension (presumably the single-file "flat XML"
# variant of each format, going by the constant's name).
FLAT_EXTENSIONS: dict[str, str] = {
    _ODF_MIME_STEM + "text": ".fodt",
    _ODF_MIME_STEM + "presentation": ".fodp",
    _ODF_MIME_STEM + "spreadsheet": ".fods",
    _ODF_MIME_STEM + "graphics": ".fodg",
}
55
+
56
+
57
def parse_xml_from_zip(path: Path, member: str) -> ET.Element:
    """Read one XML member out of a ZIP package and return its root element.

    Args:
        path: Location of the ZIP-based ODF file on disk.
        member: Name of the entry inside the archive (e.g. ``"content.xml"``).

    Returns:
        Root element of the parsed member.
    """
    with zipfile.ZipFile(path) as package, package.open(member) as stream:
        document = ET.parse(stream)
    return document.getroot()
70
+
71
+
72
def xml_bytes(root: ET.Element) -> bytes:
    """Render *root* as UTF-8 XML, prefixed with an XML declaration.

    Args:
        root: Element (and subtree) to serialize.

    Returns:
        The encoded document, starting with the ``<?xml ...?>`` declaration.
    """
    serialized: bytes = ET.tostring(root, xml_declaration=True, encoding="utf-8")
    return serialized
82
+
83
+
84
def write_odf_with_replacements(
    input_path: Path,
    output_path: Path,
    replacements: Mapping[str, bytes],
    mimetype_value: str,
) -> None:
    """Copy an ODF ZIP, replacing specified members with new content.

    When the source package has a ``mimetype`` entry, it is written first and
    uncompressed (``ZIP_STORED``); every other member is written deflated in
    the source's original order. A source without a ``mimetype`` entry does
    not gain one.

    Only members that exist in the source are written: keys of *replacements*
    that name no source member are ignored.

    Args:
        input_path: Source ODF file.
        output_path: Destination ODF file (overwritten). Must not be the same
            file as *input_path*, because it is opened for writing while the
            source is still being read.
        replacements: Mapping of member names to replacement bytes.
        mimetype_value: The mimetype string to write if ``"mimetype"`` is not
            a key of *replacements*.
    """
    with zipfile.ZipFile(input_path) as src:
        names: list[str] = src.namelist()
        with zipfile.ZipFile(output_path, "w") as dst:
            if "mimetype" in names:
                dst.writestr(
                    "mimetype",
                    replacements.get("mimetype", mimetype_value.encode()),
                    compress_type=zipfile.ZIP_STORED,
                )
            for name in names:
                if name == "mimetype":
                    continue
                # Read the source member only when it is actually copied, so a
                # member that is being replaced is never decompressed (and a
                # damaged one cannot abort the copy).
                payload: bytes = replacements[name] if name in replacements else src.read(name)
                dst.writestr(name, payload, compress_type=zipfile.ZIP_DEFLATED)
117
+
118
+
119
def pack_dir_as_odf(source_dir: Path, output_path: Path, mimetype_value: str) -> None:
    """Repack an extracted ODF directory into an ODF ZIP file.

    The ``mimetype`` file must exist in *source_dir*; its on-disk bytes are
    written first and uncompressed. All remaining files are added deflated,
    in sorted path order, under their POSIX-style path relative to
    *source_dir*. Directories are not written as separate entries.

    *output_path* may live inside *source_dir*: the archive being written
    (or a stale copy from an earlier run) is never packed into itself.

    Args:
        source_dir: Directory containing extracted ODF contents.
        output_path: Destination ODF file (overwritten).
        mimetype_value: Not consulted by this function; the ``mimetype``
            member is taken verbatim from the file in *source_dir*. Kept in
            the signature for interface compatibility.

    Raises:
        SystemExit: If *source_dir* has no ``mimetype`` file.
    """
    mimetype: Path = source_dir / "mimetype"
    if not mimetype.exists():
        raise SystemExit(f"Missing mimetype file in {source_dir}")
    output_resolved: Path = output_path.resolve()
    with zipfile.ZipFile(output_path, "w") as archive:
        archive.write(mimetype, "mimetype", compress_type=zipfile.ZIP_STORED)
        for path in sorted(source_dir.rglob("*")):
            if path.is_dir() or path == mimetype:
                continue
            # The listing is taken after the output file was created, so it
            # shows up here whenever it sits under source_dir.
            if path.resolve() == output_resolved:
                continue
            archive.write(
                path,
                path.relative_to(source_dir).as_posix(),
                compress_type=zipfile.ZIP_DEFLATED,
            )
143
+
144
+
145
def ensure_manifest_entry(
    manifest_root: ET.Element,
    full_path: str,
    media_type: str,
    ns: Mapping[str, str],
    q_fn: Callable[[str, str], str],
) -> None:
    """Make sure the manifest lists *full_path* with the given media type.

    The first ``file-entry`` (searched at any depth below *manifest_root*)
    whose ``full-path`` attribute equals *full_path* gets its ``media-type``
    overwritten. When none matches, a new ``file-entry`` is appended directly
    under *manifest_root*.

    Args:
        manifest_root: The ``<manifest:manifest>`` element.
        full_path: Value for the ``manifest:full-path`` attribute.
        media_type: Value for the ``manifest:media-type`` attribute.
        ns: Namespace prefix-to-URI mapping; the ``manifest`` entry is used
            to build the ``file-entry`` tag.
        q_fn: Qualified-name builder (e.g. ``q("manifest", "full-path")``).
    """
    path_attr: str = q_fn("manifest", "full-path")
    type_attr: str = q_fn("manifest", "media-type")
    file_entry: str = "{" + ns.get("manifest", "") + "}file-entry"

    for existing in manifest_root.findall(".//" + file_entry):
        if existing.attrib.get(path_attr) == full_path:
            existing.set(type_attr, media_type)
            return

    ET.SubElement(manifest_root, file_entry, {path_attr: full_path, type_attr: media_type})
178
+
179
+
180
def inject_styles_from_file(
    input_path: Path,
    styles_path: Path,
    output_path: Path,
    mimetype_value: str,
) -> list[str]:
    """Replace the ``styles.xml`` member of an ODF file with the contents of *styles_path*.

    Before writing, the ``text:style-name`` references found in the source's
    ``content.xml`` are checked against the names that will still be
    resolvable after the swap. A reference is treated as resolvable when it
    is:

    * the ``style:name`` of a ``<style:style>`` in the new styles.xml,
    * a ``style:parent-style-name`` target mentioned by one of those styles
      (kept from the original implementation, which accepted these), or
    * a ``style:name`` declared inside ``content.xml`` itself (automatic
      styles such as ``P1``/``T1`` live there and are untouched by the swap).

    Everything else is returned as dangling. The output file is written
    regardless of whether dangling references were found.

    Args:
        input_path: Source ODF file.
        styles_path: Local styles.xml replacement to inject.
        output_path: Destination ODF file (overwritten).
        mimetype_value: The mimetype string to preserve.

    Returns:
        Sorted list of style names referenced in content but not resolvable
        after the injection.
    """
    style_ns: str = "urn:oasis:names:tc:opendocument:xmlns:style:1.0"
    text_ns: str = "urn:oasis:names:tc:opendocument:xmlns:text:1.0"
    style_tag: str = f"{{{style_ns}}}style"
    name_attr: str = f"{{{style_ns}}}name"
    parent_attr: str = f"{{{style_ns}}}parent-style-name"
    reference_attr: str = f"{{{text_ns}}}style-name"

    new_styles_bytes: bytes = styles_path.read_bytes()
    new_styles_root: ET.Element = ET.fromstring(new_styles_bytes)

    # One pass over the replacement styles: collect defined names and the
    # parent names they point at.
    resolvable: set[str] = set()
    for style_el in new_styles_root.iter(style_tag):
        for attr in (name_attr, parent_attr):
            value = style_el.attrib.get(attr)
            if value:
                resolvable.add(value)

    content_root: ET.Element = parse_xml_from_zip(input_path, "content.xml")
    used: set[str] = set()
    for node in content_root.iter():
        # Definitions local to content.xml survive the styles.xml swap, so
        # references to them must not be reported as dangling.
        local_definition = node.attrib.get(name_attr)
        if local_definition:
            resolvable.add(local_definition)
        reference = node.attrib.get(reference_attr)
        if reference:
            used.add(reference)
    missing: list[str] = sorted(used - resolvable)

    write_odf_with_replacements(
        input_path,
        output_path,
        {"styles.xml": new_styles_bytes},
        mimetype_value,
    )
    return missing
233
+
234
+
235
def embed_pictures(
    input_path: Path,
    pictures: Mapping[str, Path],
    output_path: Path,
    mimetype_value: str,
    ns: Mapping[str, str],
    q_fn: Callable[[str, str], str],
) -> None:
    """Add several local image files to an ODF package in one copy pass.

    Every picture becomes a ZIP member at its requested package path and is
    registered in ``META-INF/manifest.xml`` with a media type obtained from
    :func:`sniff_image_mime`. ``content.xml`` is left alone; referencing the
    pictures (e.g. from ``draw:frame`` markup) is up to the caller.

    Args:
        input_path: Source ODF file.
        pictures: Mapping of package paths (e.g. ``"Pictures/logo.png"``) to
            local file paths.
        output_path: Destination ODF file.
        mimetype_value: The mimetype string to preserve.
        ns: Namespace map (must contain ``manifest``).
        q_fn: Qualified-name builder.
    """
    manifest_root: ET.Element = parse_xml_from_zip(input_path, "META-INF/manifest.xml")

    payloads: dict[str, bytes] = {}
    for target, local_file in pictures.items():
        payloads[target] = local_file.read_bytes()
        detected_type: str = sniff_image_mime(local_file)
        ensure_manifest_entry(manifest_root, target, detected_type, ns, q_fn)

    rewritten: dict[str, bytes] = {"META-INF/manifest.xml": xml_bytes(manifest_root)}
    copy_with_multiple_members(input_path, output_path, payloads, rewritten, mimetype_value)
270
+
271
+
272
def update_meta_for_edit(
    meta_root: ET.Element,
    ns: Mapping[str, str],
    q_fn: Callable[[str, str], str],
) -> None:
    """Stamp ``meta.xml`` to record that the document was edited.

    Works on the ``<office:meta>`` child of *meta_root*, or on *meta_root*
    itself when its local tag name is already ``meta``. Under that element
    the following children are located, or appended when absent, in this
    order:

    * ``meta:modification-date`` — set to the current UTC time in ISO format,
      truncated to whole seconds;
    * ``meta:generator`` — set to ``open-document-skills/<VERSION>``;
    * ``meta:editing-cycles`` — incremented by one; a missing, empty, or
      non-integer value counts as ``0``, so the result is ``1``.

    Args:
        meta_root: The root of ``meta.xml`` (typically ``office:document-meta``).
        ns: Namespace prefix-to-URI mapping; the ``office`` entry is used to
            find the ``meta`` container.
        q_fn: Qualified-name builder, e.g. ``q("meta", "generator")``.

    Raises:
        SystemExit: If no ``office:meta`` element can be located.
    """
    container: ET.Element | None = meta_root.find("{" + ns.get("office", "") + "}meta")
    if container is None:
        if local_name(meta_root.tag) != "meta":
            raise SystemExit("office:meta element not found in meta.xml")
        container = meta_root

    def child(local: str) -> ET.Element:
        # Return the existing meta:<local> child, appending one if needed.
        tag: str = q_fn("meta", local)
        found: ET.Element | None = container.find(tag)
        return ET.SubElement(container, tag) if found is None else found

    stamp_el: ET.Element = child("modification-date")
    stamp_el.text = datetime.now(timezone.utc).replace(microsecond=0).isoformat()

    generator_el: ET.Element = child("generator")
    generator_el.text = f"open-document-skills/{VERSION}"

    counter_el: ET.Element = child("editing-cycles")
    try:
        previous: int = int((counter_el.text or "0").strip())
    except ValueError:
        previous = 0
    counter_el.text = str(previous + 1)
324
+
325
+
326
# (signature, MIME type) pairs tried in order against the start of a file.
_IMAGE_MIME_BY_MAGIC: list[tuple[bytes, str]] = [
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"<?xml", "image/svg+xml"),  # requires <svg> later in header
    (b"<svg", "image/svg+xml"),
    (b"BM", "image/bmp"),
    (b"RIFF", "image/webp"),  # requires bytes 8:12 == b"WEBP"
    (b"II*\x00", "image/tiff"),
    (b"MM\x00*", "image/tiff"),
]

# How much of the file is inspected. An XML declaration followed by a DOCTYPE
# or a comment easily pushes the <svg> root past the first 64 bytes, so the
# window is deliberately generous.
_SNIFF_WINDOW: int = 1024

# Bytes allowed to precede a textual (SVG/XML) signature: UTF-8 BOM + whitespace.
_TEXT_LEAD_IN: bytes = b"\xef\xbb\xbf \t\r\n"


def sniff_image_mime(path: Path) -> str:
    """Return the MIME type of an image file by inspecting its leading bytes.

    Reads up to ``_SNIFF_WINDOW`` bytes and walks ``_IMAGE_MIME_BY_MAGIC`` in
    order. Binary signatures must match at offset 0. The two SVG signatures
    are matched after skipping an optional UTF-8 byte-order mark and leading
    whitespace; the ``<?xml`` signature additionally requires ``<svg`` to
    occur somewhere in the inspected window, and ``RIFF`` only counts as WebP
    when bytes 8-11 are ``WEBP``.

    Args:
        path: Local file path.

    Returns:
        MIME type string (e.g. ``"image/png"``). Falls back to the
        extension-based guess of :func:`media_type_for` when the file cannot
        be read or no signature matches.
    """
    try:
        with open(path, "rb") as handle:
            header: bytes = handle.read(_SNIFF_WINDOW)
    except OSError:
        return media_type_for(path)

    text_header: bytes = header.lstrip(_TEXT_LEAD_IN)
    for magic, mime in _IMAGE_MIME_BY_MAGIC:
        if mime == "image/svg+xml":
            if not text_header.startswith(magic):
                continue
            # Confirm it's actually SVG, not arbitrary XML.
            if magic == b"<?xml" and b"<svg" not in text_header:
                continue
            return mime
        if not header.startswith(magic):
            continue
        # RIFF is a generic container (WAV, AVI, ...); check the four-CC.
        if mime == "image/webp" and header[8:12] != b"WEBP":
            continue
        return mime
    return media_type_for(path)


def media_type_for(path: Path) -> str:
    """Guess the MIME type for a file path, falling back to octet-stream.

    Args:
        path: File path; only ``path.name`` is passed to
            :func:`mimetypes.guess_type`.

    Returns:
        MIME type string, e.g. ``"image/png"``, or
        ``"application/octet-stream"`` when no guess is available.
    """
    guessed: str | None
    guessed, _ = mimetypes.guess_type(path.name)
    return guessed or "application/octet-stream"
385
+
386
+
387
def unique_picture_name(existing: Set[str], image: Path) -> str:
    """Return a unique ``Pictures/…`` path that does not clash with *existing*.

    The file name of *image* is sanitised first: backslashes and forward
    slashes are replaced with ``_`` so the result is always a single path
    segment below ``Pictures/``. The numbered fallbacks are derived from that
    same sanitised name, so they are sanitised too.

    Args:
        existing: Set of already-used package paths.
        image: Source image file path.

    Returns:
        ``Pictures/<filename>`` when free, otherwise the first free
        ``Pictures/<stem>-N<suffix>`` for N = 1, 2, 3, ...
    """
    base: str = image.name.replace("\\", "_").replace("/", "_")
    candidate: str = posixpath.join("Pictures", base)
    # Split the *sanitised* name; using image.stem here would reintroduce the
    # separators that were just replaced.
    safe: Path = Path(base)
    stem: str = safe.stem
    suffix: str = safe.suffix
    counter: int = 1
    while candidate in existing:
        candidate = posixpath.join("Pictures", f"{stem}-{counter}{suffix}")
        counter += 1
    return candidate
406
+
407
+
408
def copy_into_package(
    input_path: Path,
    output_path: Path,
    package_path: str,
    source: Path,
    replacements: Mapping[str, bytes],
    mimetype_value: str,
) -> None:
    """Copy an ODF ZIP, replacing members and adding *source* at *package_path*.

    When the source package has a ``mimetype`` entry it is written first and
    uncompressed; all other members are written deflated in their original
    order. An existing member named *package_path* is dropped so that the
    file added from *source* (always written last) is the only entry with
    that name. Keys of *replacements* that name no source member are ignored.

    Args:
        input_path: Source ODF file.
        output_path: Destination ODF file (overwritten). Must not be the same
            file as *input_path*.
        package_path: Internal ZIP path for the new file.
        source: Local file to insert.
        replacements: Mapping of member names to replacement bytes.
        mimetype_value: The mimetype string to write if ``"mimetype"`` is not
            a key of *replacements*.
    """
    with zipfile.ZipFile(input_path) as src:
        names: list[str] = src.namelist()
        with zipfile.ZipFile(output_path, "w") as dst:
            if "mimetype" in names:
                dst.writestr(
                    "mimetype",
                    replacements.get("mimetype", mimetype_value.encode()),
                    compress_type=zipfile.ZIP_STORED,
                )
            for name in names:
                if name == "mimetype" or name == package_path:
                    continue
                # Only touch the source member when its bytes are needed; a
                # replaced member is never decompressed.
                payload: bytes = replacements[name] if name in replacements else src.read(name)
                dst.writestr(name, payload, compress_type=zipfile.ZIP_DEFLATED)
            dst.write(source, package_path, compress_type=zipfile.ZIP_DEFLATED)
444
+
445
+
446
def copy_with_multiple_members(
    input_path: Path,
    output_path: Path,
    new_members: Mapping[str, bytes],
    replacements: Mapping[str, bytes],
    mimetype_value: str,
) -> None:
    """Copy an ODF ZIP with both replacements and arbitrary new members.

    Like :func:`copy_into_package` but for adding *several* new internal files
    (e.g. ``Object 1/content.xml`` plus its directory entry) in one pass.

    When the source package has a ``mimetype`` entry it is written first and
    uncompressed; all other members are written deflated in their original
    order. Source members whose name is a key of *new_members* are dropped;
    the new members are appended at the end in mapping order. Keys of
    *replacements* that name no source member are ignored.

    Args:
        input_path: Source ODF file.
        output_path: Destination ODF file (overwritten). Must not be the same
            file as *input_path*.
        new_members: Mapping ``{package_path: bytes}`` of files to add.
        replacements: Mapping of existing member names to replacement bytes.
        mimetype_value: The mimetype string to write if ``"mimetype"`` is not
            a key of *replacements*.
    """
    with zipfile.ZipFile(input_path) as src:
        names: list[str] = src.namelist()
        with zipfile.ZipFile(output_path, "w") as dst:
            if "mimetype" in names:
                dst.writestr(
                    "mimetype",
                    replacements.get("mimetype", mimetype_value.encode()),
                    compress_type=zipfile.ZIP_STORED,
                )
            for name in names:
                if name == "mimetype" or name in new_members:
                    continue
                # Only touch the source member when its bytes are needed; a
                # replaced member is never decompressed.
                payload: bytes = replacements[name] if name in replacements else src.read(name)
                dst.writestr(name, payload, compress_type=zipfile.ZIP_DEFLATED)
            for path, added in new_members.items():
                dst.writestr(path, added, compress_type=zipfile.ZIP_DEFLATED)
484
+
485
+
486
def unique_object_name(existing: Set[str]) -> str:
    """Pick the lowest-numbered ``Object N`` (N = 1, 2, 3, ...) that is still free.

    A number counts as taken when *existing* contains ``Object N`` itself or
    any path below it, e.g. ``Object 3/content.xml`` blocks ``Object 3``.

    Args:
        existing: Set of already-used package paths.

    Returns:
        The chosen ``Object N`` (no trailing slash).
    """
    # The first path segment is all that matters for the prefix match.
    occupied = {name.split("/", 1)[0] for name in existing}
    number: int = 1
    while f"Object {number}" in occupied:
        number += 1
    return f"Object {number}"
504
+
505
+
506
def find_pandoc() -> str | None:
    """Look up ``pandoc`` on PATH; return its path, or None when absent."""
    executable: str | None = shutil.which("pandoc")
    return executable
509
+
510
+
511
# Schema key -> download URL of the corresponding OASIS ODF 1.3 RelaxNG file.
SCHEMA_URLS: dict[str, str] = {
    "content": "https://docs.oasis-open.org/office/OpenDocument/v1.3/os/schemas/OpenDocument-v1.3-schema.rng",
    "manifest": "https://docs.oasis-open.org/office/OpenDocument/v1.3/os/schemas/OpenDocument-v1.3-manifest-schema.rng",
}


def ensure_schema(name: str) -> Path:
    """Locate an OASIS ODF 1.3 RelaxNG schema, downloading it on first use.

    Schemas are cached under ``$XDG_CACHE_HOME/open-document-skills/schemas/``
    (defaulting to ``~/.cache/open-document-skills/schemas/``). A cached file
    is returned as-is without touching the network.

    The download is written to a temporary file in the cache directory and
    moved into place only once complete, so an interrupted or failed write
    never leaves a truncated schema that later calls would trust. An empty
    response is treated as a failed download.

    Args:
        name: Either ``"content"`` or ``"manifest"``.

    Returns:
        Local filesystem path to the cached schema.

    Raises:
        SystemExit: If *name* is unknown or download fails.
    """
    import os
    import urllib.request

    if name not in SCHEMA_URLS:
        raise SystemExit(f"unknown schema {name!r}; choose from {sorted(SCHEMA_URLS)}")
    cache_root: Path = (
        Path(os.environ.get("XDG_CACHE_HOME") or Path.home() / ".cache") / "open-document-skills" / "schemas"
    )
    cache_root.mkdir(parents=True, exist_ok=True)
    schema_path: Path = cache_root / f"odf-1.3-{name}.rng"
    if schema_path.exists():
        return schema_path

    url: str = SCHEMA_URLS[name]
    try:
        with urllib.request.urlopen(url, timeout=30) as resp:
            payload: bytes = resp.read()
        if not payload:
            raise ValueError("empty response")
        # Stage next to the final location so os.replace stays on one
        # filesystem and is therefore atomic.
        fd, staging_name = tempfile.mkstemp(dir=cache_root, suffix=".part")
        try:
            with os.fdopen(fd, "wb") as staging:
                staging.write(payload)
            os.replace(staging_name, schema_path)
        except BaseException:
            # Never leave a stray staging file behind.
            try:
                os.unlink(staging_name)
            except OSError:
                pass
            raise
    except Exception as exc:
        raise SystemExit(f"failed to download schema {url}: {exc}") from exc
    return schema_path
550
+
551
+
552
def validate_against_schema(xml_bytes_input: bytes, schema_name: str) -> tuple[bool, list[str]]:
    """Check *xml_bytes_input* against one of the OASIS ODF 1.3 RelaxNG schemas.

    ``lxml`` is imported on demand; when it is not installed the function
    exits with an install hint. The schema file is obtained through
    :func:`ensure_schema` and parsed on every call. Input that is not
    well-formed XML is reported as a single syntax-error message instead of
    raising.

    Args:
        xml_bytes_input: Raw XML bytes to validate.
        schema_name: Schema key, e.g. ``"content"`` or ``"manifest"``.

    Returns:
        ``(is_valid, errors)`` where errors is a list of human-readable
        strings (empty when the document is valid).

    Raises:
        SystemExit: If ``lxml`` cannot be imported.
    """
    try:
        from lxml import etree  # type: ignore
    except ImportError:
        raise SystemExit("Schema validation requires lxml. Install with:\n pip install open-document-lib[validate]")

    rng_location: Path = ensure_schema(schema_name)
    validator = etree.RelaxNG(etree.parse(str(rng_location)))

    try:
        document = etree.fromstring(xml_bytes_input)
    except etree.XMLSyntaxError as exc:
        return False, [f"XML syntax error: {exc}"]

    is_valid = validator.validate(document)
    problems: list[str] = (
        [] if is_valid else [f"line {entry.line}: {entry.message}" for entry in validator.error_log]
    )
    return is_valid, problems
582
+
583
+
584
def apply_strict_schema_check(odf_path: Path, result: dict[str, object]) -> None:
    """Run RelaxNG validation on a package and fold the findings into *result*.

    ``content.xml`` is always validated against the ``content`` schema;
    ``META-INF/manifest.xml`` is validated against the ``manifest`` schema
    when the package contains it. Each schema error is appended to
    ``result["errors"]`` prefixed with ``content.xml: `` or
    ``manifest.xml: ``. If the error list is non-empty afterwards — including
    entries that were already there — ``result["status"]`` becomes
    ``"errors_found"``.

    Args:
        odf_path: Path to the ODF package to validate.
        result: A validation result dict with ``"errors"`` and ``"status"``
            keys, as returned by a ``validate_refs`` ``validate()`` function.
            Mutated in place; a non-list ``"errors"`` value is replaced by a
            fresh list.
    """
    manifest_member = "META-INF/manifest.xml"
    with zipfile.ZipFile(odf_path) as package:
        content_xml: bytes = package.read("content.xml")
        manifest_xml: bytes | None = (
            package.read(manifest_member) if manifest_member in package.namelist() else None
        )

    collected = result["errors"]
    if not isinstance(collected, list):  # defensive — validate() always returns a list
        collected = []
        result["errors"] = collected

    # (label used in messages, payload, schema key) in validation order.
    checks: list[tuple[str, bytes, str]] = [("content.xml", content_xml, "content")]
    if manifest_xml is not None:
        checks.append(("manifest.xml", manifest_xml, "manifest"))

    for label, payload, schema_key in checks:
        passed, findings = validate_against_schema(payload, schema_key)
        if not passed:
            collected.extend(f"{label}: {finding}" for finding in findings)

    if collected:
        result["status"] = "errors_found"
617
+
618
+
619
def latex_to_mathml(latex: str) -> bytes:
    """Turn a LaTeX math snippet into MathML by shelling out to pandoc.

    The snippet is wrapped in ``$...$`` and fed to pandoc on stdin with
    ``-f latex -t html5 --mathml``. The span from the first ``<math`` to the
    last ``</math>`` in pandoc's output is returned.

    Args:
        latex: LaTeX source (without surrounding ``$`` delimiters).

    Returns:
        UTF-8 encoded MathML XML.

    Raises:
        SystemExit: If pandoc is not on PATH (with install hints), or if its
            output contains no ``<math>`` element.
        subprocess.CalledProcessError: If pandoc exits with a non-zero status
            (the call uses ``check=True``).
    """
    import subprocess

    executable: str | None = find_pandoc()
    if executable is None:
        raise SystemExit(
            "LaTeX → MathML requires pandoc.\n"
            " macOS: brew install pandoc\n"
            " Ubuntu: sudo apt-get install pandoc\n"
            " Windows: winget install JohnMacFarlane.Pandoc\n"
            "Or supply --mathml or --mathml-inline directly."
        )

    # Math-mode delimiters make pandoc emit a <math> element.
    completed = subprocess.run(
        [executable, "-f", "latex", "-t", "html5", "--mathml"],
        input=f"${latex}$".encode("utf-8"),
        capture_output=True,
        check=True,
    )

    # The html5 output wraps the formula in <p>...</p>; keep only <math>...</math>.
    markup: str = completed.stdout.decode("utf-8")
    closing_tag = "</math>"
    first: int = markup.find("<math")
    last: int = markup.rfind(closing_tag)
    if first < 0 or last < 0:
        raise SystemExit(f"pandoc did not emit a <math> element. Output was:\n{markup}")
    return markup[first : last + len(closing_tag)].encode("utf-8")
657
+
658
+
659
def clear_children(element: ET.Element) -> None:
    """Drop every direct child of *element*, leaving the element itself in place.

    Only the child list is emptied; the element's tag, attributes, text and
    tail are untouched.

    Args:
        element: The XML element to clear.
    """
    del element[:]
666
+
667
+
668
def _collect_text_slots(element: ET.Element) -> list[tuple[ET.Element, str]]:
    """List every place text can live inside *element*, in document order.

    A "slot" is an ``(node, attr)`` pair with ``attr`` being ``"text"`` or
    ``"tail"``. The list starts with ``(element, "text")``; each descendant
    then contributes its ``"text"`` slot, the slots of its own subtree, and
    finally its ``"tail"`` slot. *element*'s own tail is left out because it
    belongs to the surrounding content, not to the element.

    Args:
        element: Root of the subtree to collect from.

    Returns:
        Ordered list of ``(node, attr)`` pairs where ``attr`` is ``"text"`` or ``"tail"``.
    """

    def walk(node: ET.Element):
        # Slots of one descendant: its text, its subtree, then its tail.
        yield node, "text"
        for inner in list(node):
            yield from walk(inner)
        yield node, "tail"

    ordered: list[tuple[ET.Element, str]] = [(element, "text")]
    for top_level in list(element):
        ordered.extend(walk(top_level))
    return ordered
694
+
695
+
696
def _build_parent_map(element: ET.Element) -> dict[ET.Element, ET.Element]:
    """Map every descendant of *element* to the node that directly contains it.

    *element* itself never appears as a key, since its parent lies outside
    the subtree being scanned.

    Args:
        element: Root of the subtree.

    Returns:
        Dict mapping each descendant to its direct parent.
    """
    return {child: container for container in element.iter() for child in container}
712
+
713
+
714
def find_text_position_in_element(element: ET.Element, needle: str) -> tuple[ET.Element, str, int] | None:
    """Locate where the first occurrence of *needle* starts inside *element*.

    The search runs over the concatenation of all text slots — ``.text`` of
    *element* and of each descendant, plus each descendant's ``.tail`` — in
    document order, so a hit may extend across several slots. Only the slot
    holding the first matched character is reported.

    Args:
        element: The element to search.
        needle: Substring to look for. Empty string returns None.

    Returns:
        ``(node, attr, local_offset)`` where ``attr`` is ``"text"`` or ``"tail"``
        and ``local_offset`` is the match start within that slot's string,
        or ``None`` if not found.
    """
    if not needle:
        return None

    slots: list[tuple[ET.Element, str]] = _collect_text_slots(element)
    chunks: list[str] = [getattr(node, attr) or "" for node, attr in slots]
    hit: int = "".join(chunks).find(needle)
    if hit < 0:
        return None

    # Walk the slots, tracking where each one begins in the joined string.
    consumed: int = 0
    for (node, attr), chunk in zip(slots, chunks):
        upper: int = consumed + len(chunk)
        if consumed <= hit < upper:
            return node, attr, hit - consumed
        consumed = upper
    return None
744
+
745
+
746
def insert_after_text_in_element(element: ET.Element, anchor: str, new_element: ET.Element) -> bool:
    """Place *new_element* right behind the first occurrence of *anchor*.

    The anchor is searched in the concatenated text slots of *element*. The
    slot holding the anchor's LAST character is split at the end of the
    match: what precedes the split stays in the slot, what follows becomes
    *new_element*'s ``.tail``. If that slot is a node's ``.text``, the new
    element becomes that node's first child; if it is a node's ``.tail``,
    the new element becomes that node's next sibling. Existing inline
    children are kept.

    Args:
        element: The container in which to search.
        anchor: Substring that locates the insertion point.
        new_element: The element to insert; its ``.tail`` is overwritten.

    Returns:
        ``True`` if the anchor was found and the element inserted, else ``False``.
    """
    if not anchor:
        return False

    slots: list[tuple[ET.Element, str]] = _collect_text_slots(element)
    chunks: list[str] = [getattr(node, attr) or "" for node, attr in slots]
    found_at: int = "".join(chunks).find(anchor)
    if found_at < 0:
        return False

    match_end: int = found_at + len(anchor)
    last_char: int = match_end - 1

    # Find the slot that holds the final character of the match.
    slot_start: int = 0
    hit: int = -1
    for position, chunk in enumerate(chunks):
        if slot_start <= last_char < slot_start + len(chunk):
            hit = position
            break
        slot_start += len(chunk)
    if hit < 0:
        return False

    host, kind = slots[hit]
    cut: int = match_end - slot_start
    # Empty halves are stored as None, matching ElementTree's "no text".
    before: str | None = chunks[hit][:cut] or None
    after: str | None = chunks[hit][cut:] or None

    if kind == "text":
        # Text precedes the first child, so the new element goes to index 0.
        host.text = before
        host.insert(0, new_element)
        new_element.tail = after
        return True

    host.tail = before
    owner: ET.Element | None = _build_parent_map(element).get(host)
    if owner is None:
        return False
    owner.insert(list(owner).index(host) + 1, new_element)
    new_element.tail = after
    return True
801
+
802
+
803
def replace_pattern_with_element_in_element(
    element: ET.Element,
    pattern: re.Pattern[str],
    factory: Callable[[re.Match[str]], ET.Element],
) -> int:
    """Swap each regex match in *element*'s text content for a generated element.

    *pattern* is run once over the concatenation of all text slots (``.text``
    of *element* and its descendants, plus every descendant's ``.tail``).
    For each match that lies entirely inside one slot, the matched text is
    cut out and the element returned by ``factory(match)`` takes its place:
    as first child of the node when the slot is a ``.text``, or as next
    sibling of the node when the slot is a ``.tail``. The text that followed
    the match in that slot becomes the new element's ``.tail``.

    Matches that straddle multiple slots are silently skipped — short
    placeholder patterns like ``[@bibkey]`` virtually never straddle inline
    children, and skipping is safer than corrupting structure.

    Args:
        element: Container to scan.
        pattern: Compiled regex.
        factory: Callable returning a new ET.Element per match.

    Returns:
        Number of replacements performed.
    """

    def index_slots() -> tuple[list[tuple[ET.Element, str]], list[str], list[int]]:
        # Snapshot the slots, their strings, and each slot's start offset in
        # the joined text.
        slot_list = _collect_text_slots(element)
        chunks = [getattr(node, attr) or "" for node, attr in slot_list]
        starts: list[int] = []
        total = 0
        for chunk in chunks:
            starts.append(total)
            total += len(chunk)
        return slot_list, chunks, starts

    slots, values, offsets = index_slots()
    matches: list[re.Match[str]] = list(pattern.finditer("".join(values)))
    if not matches:
        return 0

    def slot_for(global_offset: int) -> int:
        # Binary search for the LAST slot starting at or before the offset;
        # picking the last one steps over empty slots sharing a start offset.
        low, high = 0, len(offsets) - 1
        while low < high:
            probe = (low + high + 1) // 2
            if offsets[probe] <= global_offset:
                low = probe
            else:
                high = probe - 1
        return low

    parent_map: dict[ET.Element, ET.Element] = _build_parent_map(element)
    replaced: int = 0

    # Right-to-left: an edit only changes text at or after its own position,
    # so the offsets of the matches still to be processed stay valid.
    for match in reversed(matches):
        begin, finish = match.start(), match.end()
        first_slot = slot_for(begin)
        last_slot = slot_for(finish - 1) if finish > begin else first_slot
        if first_slot != last_slot:
            # Straddle — skip silently.
            continue

        host, kind = slots[first_slot]
        base = offsets[first_slot]
        chunk = values[first_slot]
        before: str | None = chunk[: begin - base] or None
        after: str | None = chunk[finish - base :] or None
        new_element = factory(match)

        if kind == "text":
            host.text = before
            host.insert(0, new_element)
            new_element.tail = after
        else:
            host.tail = before
            owner = parent_map.get(host)
            if owner is None:
                continue
            owner.insert(list(owner).index(host) + 1, new_element)
            new_element.tail = after

        # The tree changed: rebuild the lookup tables for the next
        # (earlier) match rather than patching them incrementally.
        slots, values, offsets = index_slots()
        parent_map = _build_parent_map(element)
        replaced += 1

    return replaced
899
+
900
+
901
def insert_in_paragraph(paragraph: ET.Element, position: str, new_element: ET.Element) -> None:
    """Place *new_element* at the very beginning or very end of *paragraph*.

    With ``"end"`` the element is appended as the last child and its
    ``.tail`` is cleared. With ``"start"`` it becomes the first child, and
    the text that used to lead the paragraph (``paragraph.text``) is moved
    onto ``new_element.tail`` so it still follows the inserted element.

    Args:
        paragraph: The container element (typically ``text:p`` or ``text:h``).
        position: Either ``"start"`` or ``"end"``.
        new_element: Element to insert.

    Raises:
        ValueError: If *position* is not ``"start"`` or ``"end"``.
    """
    if position not in ("start", "end"):
        raise ValueError(f"position must be 'start' or 'end', got {position!r}")

    if position == "end":
        paragraph.append(new_element)
        new_element.tail = None
        return

    # position == "start": hand the leading text over to the new first child.
    leading_text: str | None = paragraph.text
    paragraph.text = None
    paragraph.insert(0, new_element)
    new_element.tail = leading_text
925
+
926
+
927
def wrap_text_with_pair_in_element(
    element: ET.Element,
    start_anchor: str,
    end_anchor: str,
    start_element: ET.Element,
    end_element: ET.Element,
) -> bool:
    """Bracket a text region with two marker elements (e.g. range bookmarks).

    The concatenated text of *element* must contain *start_anchor* and, at or
    after the end of that occurrence, *end_anchor*. When both are present,
    *end_element* is inserted via ``insert_after_text_in_element`` for the end
    anchor, then *start_element* for the start anchor. If the second insertion
    fails, the first one is undone: *end_element* is removed and its tail text
    is merged back into the preceding slot.

    Args:
        element: The paragraph (or similar) to search.
        start_anchor: Substring marking the start of the bracketed region.
        end_anchor: Substring marking the end. Must come after *start_anchor*.
        start_element: Element to insert after the start anchor (e.g. ``text:bookmark-start``).
        end_element: Element to insert after the end anchor (e.g. ``text:bookmark-end``).

    Returns:
        ``True`` if both insertions succeeded, ``False`` otherwise.
    """
    flat_text: str = "".join(getattr(node, attr) or "" for node, attr in _collect_text_slots(element))

    # Validate presence and ordering on the flattened text before touching the tree.
    start_at: int = flat_text.find(start_anchor)
    if start_at < 0:
        return False
    if flat_text.find(end_anchor, start_at + len(start_anchor)) < 0:
        return False

    # The end marker goes in first so the start anchor's text is not disturbed
    # by an insertion that precedes it in processing order.
    # NOTE(review): the helper receives only the anchor string, so which
    # occurrence it targets is decided inside that helper -- confirm it agrees
    # with the occurrence validated above when the end anchor text repeats.
    if not insert_after_text_in_element(element, end_anchor, end_element):
        return False
    if insert_after_text_in_element(element, start_anchor, start_element):
        return True

    # Start insertion failed: take the end marker back out.
    owner: ET.Element | None = _build_parent_map(element).get(end_element)
    if owner is not None:
        position: int = list(owner).index(end_element)
        carried: str = end_element.tail or ""
        if position == 0:
            # No preceding sibling: the marker's tail rejoins the owner's text.
            owner.text = ((owner.text or "") + carried) or None
        else:
            before: ET.Element = owner[position - 1]
            before.tail = ((before.tail or "") + carried) or None
        owner.remove(end_element)
    return False
982
+
983
+
984
def wrap_text_across_elements(
    elements: list[ET.Element],
    start_anchor: str,
    end_anchor: str,
    start_element: ET.Element,
    end_element: ET.Element,
) -> bool:
    """Bracket a text region with a start/end marker pair that may span several elements.

    The first entry of *elements* containing *start_anchor* hosts the start
    marker. If that same entry also contains *end_anchor* after the start
    anchor, the work is delegated to :func:`wrap_text_with_pair_in_element`.
    Otherwise the first later entry containing *end_anchor* hosts the end
    marker; the end marker is inserted first, then the start marker, and the
    end marker is removed again if the start insertion fails.

    Args:
        elements: Container elements to search (typically all paragraphs).
        start_anchor: Substring marking the range start.
        end_anchor: Substring marking the range end (must come after start).
        start_element: Element inserted after the start anchor.
        end_element: Element inserted after the end anchor.

    Returns:
        ``True`` if both markers were inserted, ``False`` otherwise
        (including when either anchor is empty).
    """
    if not (start_anchor and end_anchor):
        return False

    # First element whose text contains the start anchor.
    start_at: int | None = next(
        (
            position
            for position, candidate in enumerate(elements)
            if find_text_position_in_element(candidate, start_anchor) is not None
        ),
        None,
    )
    if start_at is None:
        return False
    start_host: ET.Element = elements[start_at]

    # Same-element case: the end anchor follows the start anchor in the host's own text.
    host_text: str = "".join(getattr(node, attr) or "" for node, attr in _collect_text_slots(start_host))
    anchor_pos: int = host_text.find(start_anchor)
    if host_text.find(end_anchor, anchor_pos + len(start_anchor)) >= 0:
        return wrap_text_with_pair_in_element(start_host, start_anchor, end_anchor, start_element, end_element)

    # Otherwise the end anchor must live in a strictly later element.
    end_host: ET.Element | None = next(
        (
            later
            for later in elements[start_at + 1 :]
            if find_text_position_in_element(later, end_anchor) is not None
        ),
        None,
    )
    if end_host is None:
        return False

    # End marker first (it sits in a different, later element), then the start marker.
    if not insert_after_text_in_element(end_host, end_anchor, end_element):
        return False
    if insert_after_text_in_element(start_host, start_anchor, start_element):
        return True

    # Start insertion failed: undo the end insertion and give its tail text back.
    owner: ET.Element | None = _build_parent_map(end_host).get(end_element)
    if owner is not None:
        position: int = list(owner).index(end_element)
        carried: str = end_element.tail or ""
        if position == 0:
            owner.text = ((owner.text or "") + carried) or None
        else:
            before: ET.Element = owner[position - 1]
            before.tail = ((before.tail or "") + carried) or None
        owner.remove(end_element)
    return False
1060
+
1061
+
1062
def ensure_sequence_declarations(text_root: ET.Element, names: list[str], ns: Mapping[str, str]) -> None:
    """Ensure ``text:sequence-decls`` exists under *text_root* and contains *names*.

    If *text_root* has no direct ``text:sequence-decls`` child, an empty one
    is inserted as its first child. A ``text:sequence-decl`` (with
    ``text:display-outline-level="0"``) is then appended for every entry of
    *names* that is not declared yet. Each name is declared at most once,
    even when it appears several times in *names*.

    Args:
        text_root: The ``office:text`` element (parent of body content).
        names: Sequence names (e.g. ``["Figure", "Table", "Illustration"]``).
        ns: Namespace map (must contain ``text``).

    Raises:
        KeyError: If *ns* has no ``"text"`` entry.
    """
    text_ns: str = ns["text"]
    decls_tag: str = f"{{{text_ns}}}sequence-decls"
    decl_tag: str = f"{{{text_ns}}}sequence-decl"
    name_attr: str = f"{{{text_ns}}}name"
    display_attr: str = f"{{{text_ns}}}display-outline-level"

    decls: ET.Element | None = text_root.find(decls_tag)
    if decls is None:
        decls = ET.Element(decls_tag)
        text_root.insert(0, decls)

    declared: set[str] = {child.attrib.get(name_attr, "") for child in decls.findall(decl_tag)}
    for name in names:
        if name in declared:
            continue
        ET.SubElement(decls, decl_tag, {name_attr: name, display_attr: "0"})
        # Remember the new declaration so a repeated name is not added twice.
        declared.add(name)
1091
+
1092
+
1093
def replace_text_in_element(element: ET.Element, old: str, new: str) -> int:
    """Substitute ``new`` for each occurrence of ``old`` in *element*'s text, keeping children.

    The text slots reported by ``_collect_text_slots`` are joined into one
    string and scanned for non-overlapping occurrences of ``old``. An
    occurrence inside a single slot is replaced in that slot. An occurrence
    spanning several slots is replaced too: the first slot keeps its text up
    to the match followed by ``new``, slots strictly in between are emptied,
    and the last slot keeps only what follows the match. No element is added
    or removed; only ``.text`` / ``.tail`` strings are rewritten, with empty
    results stored as ``None``.

    Args:
        element: The element whose textual content should be searched.
        old: Substring to search for. Empty string is a no-op.
        new: Replacement string.

    Returns:
        Number of non-overlapping replacements performed.
    """
    if not old:
        return 0

    slots: list[tuple[ET.Element, str]] = _collect_text_slots(element)
    pieces: list[str] = [getattr(node, attr) or "" for node, attr in slots]
    haystack: str = "".join(pieces)

    # Collect (start, end) spans of every non-overlapping occurrence.
    needle_len: int = len(old)
    spans: list[tuple[int, int]] = []
    cursor: int = haystack.find(old)
    while cursor >= 0:
        spans.append((cursor, cursor + needle_len))
        cursor = haystack.find(old, cursor + needle_len)
    if not spans:
        return 0

    # Start offset of each slot within the joined string.
    starts: list[int] = []
    total: int = 0
    for piece in pieces:
        starts.append(total)
        total += len(piece)

    def owning_slot(offset: int) -> int:
        # Last slot whose start is <= offset (binary search). Choosing the
        # last one steps past empty slots that share the same start offset.
        low, high = 0, len(pieces) - 1
        while low < high:
            probe = (low + high + 1) // 2
            if starts[probe] <= offset:
                low = probe
            else:
                high = probe - 1
        return low

    # Right-to-left so edits never disturb text that earlier spans refer to;
    # ``starts`` keeps describing the original layout throughout.
    for begin, finish in reversed(spans):
        head: int = owning_slot(begin)
        # ``old`` is non-empty, so ``finish - 1`` is always a real character.
        tail: int = owning_slot(finish - 1)
        cut_from: int = begin - starts[head]
        cut_to: int = finish - starts[tail]
        if head == tail:
            piece = pieces[head]
            pieces[head] = piece[:cut_from] + new + piece[cut_to:]
            continue
        pieces[head] = pieces[head][:cut_from] + new
        pieces[head + 1 : tail] = [""] * (tail - head - 1)
        pieces[tail] = pieces[tail][cut_to:]

    # Write the rewritten strings back to their original slots.
    for (node, attr), piece in zip(slots, pieces):
        setattr(node, attr, piece or None)

    return len(spans)
1164
+
1165
+
1166
def find_soffice() -> str:
    """Return the path of a LibreOffice executable.

    ``soffice`` and then ``libreoffice`` are looked up on ``PATH`` with
    ``shutil.which``. If neither is found, a fixed list of installation
    locations (macOS app bundle, ``/usr/bin``, ``/usr/local/bin``, snap, and
    three spellings of the Windows ``Program Files`` path) is probed in order
    and the first one that exists is returned.

    Returns:
        The path reported by ``shutil.which``, or the first existing entry
        of the fallback list.

    Raises:
        SystemExit: If no executable is found.
    """
    # PATH lookup wins; ``filter(None, ...)`` drops the misses lazily.
    on_path: str | None = next(filter(None, map(shutil.which, ("soffice", "libreoffice"))), None)
    if on_path is not None:
        return on_path

    well_known: tuple[str, ...] = (
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "/usr/bin/libreoffice",
        "/usr/local/bin/libreoffice",
        "/snap/bin/libreoffice",
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        "/c/Program Files/LibreOffice/program/soffice.exe",
        "/mnt/c/Program Files/LibreOffice/program/soffice.exe",
    )
    installed: str | None = next((location for location in well_known if Path(location).exists()), None)
    if installed is None:
        raise SystemExit("LibreOffice/soffice not found")
    return installed
1195
+
1196
+
1197
def unpack_to_temp(path: Path) -> tempfile.TemporaryDirectory[str]:
    """Extract an ODF ZIP to a temporary directory.

    On success the caller owns the returned ``TemporaryDirectory`` and is
    responsible for cleaning it up (e.g. via a context manager or
    ``.cleanup()``). If the archive cannot be opened or extracted, the
    temporary directory is removed before the exception propagates, so a
    failed call leaves nothing behind.

    Args:
        path: Path to the ODF ZIP file.

    Returns:
        A ``tempfile.TemporaryDirectory`` containing the extracted contents.

    Raises:
        Whatever ``zipfile.ZipFile`` / ``extractall`` raise for *path*
        (for example ``zipfile.BadZipFile`` or ``FileNotFoundError``).
    """
    temp: tempfile.TemporaryDirectory[str] = tempfile.TemporaryDirectory()
    try:
        with zipfile.ZipFile(path) as archive:
            archive.extractall(temp.name)
    except BaseException:
        # Nobody will ever receive ``temp`` on this path, so release it here
        # instead of waiting for the object to be garbage-collected.
        temp.cleanup()
        raise
    return temp
1213
+
1214
+
1215
# Leading-byte signatures mapped to the file extension used for extracted
# pictures. Entries are tried in order; the first matching prefix wins.
_IMAGE_MAGIC: list[tuple[bytes, str]] = [
    (b"\x89PNG\r\n\x1a\n", ".png"),
    (b"\xff\xd8\xff", ".jpg"),
    (b"GIF87a", ".gif"),
    (b"GIF89a", ".gif"),
    (b"<?xml", ".svg"),
    (b"<svg", ".svg"),
    (b"BM", ".bmp"),
    (b"RIFF", ".webp"),
]


def _sniff_image_extension(data: bytes) -> str:
    """Guess a file extension for *data* from its leading bytes.

    Args:
        data: Raw image bytes.

    Returns:
        The extension of the first ``_IMAGE_MAGIC`` entry whose signature
        prefixes *data*, or ``".bin"`` when none applies. ``RIFF`` data is
        reported as ``".webp"`` only when bytes 8-12 spell ``WEBP``; other
        RIFF containers (e.g. WAV, AVI) fall through to ``".bin"``.
    """
    for magic, ext in _IMAGE_MAGIC:
        if not data.startswith(magic):
            continue
        # RIFF is a generic container: the form type after the 4-byte size
        # field decides whether this is actually WebP.
        if ext == ".webp" and data[8:12] != b"WEBP":
            continue
        return ext
    return ".bin"
1232
+
1233
+
1234
def _object_media_types(manifest_bytes: bytes | None) -> dict[str, str]:
    """Map ``Object N`` directory names to their manifest media-types.

    Args:
        manifest_bytes: Raw ``META-INF/manifest.xml`` content, or ``None``.

    Returns:
        ``{directory name without trailing slash: media-type}`` for every
        ``manifest:file-entry`` whose ``full-path`` starts with ``"Object "``,
        ends with ``"/"`` and has a non-empty ``media-type``. Empty when the
        input is missing, empty, or not well-formed XML.
    """
    if not manifest_bytes:
        return {}
    try:
        manifest_root = ET.fromstring(manifest_bytes)
    except ET.ParseError:
        return {}

    manifest_ns: str = ODF_NAMESPACES["manifest"]
    path_key: str = f"{{{manifest_ns}}}full-path"
    media_key: str = f"{{{manifest_ns}}}media-type"

    mapping: dict[str, str] = {}
    for entry in manifest_root.findall(f".//{{{manifest_ns}}}file-entry"):
        full_path = entry.attrib.get(path_key, "")
        media = entry.attrib.get(media_key, "")
        is_object_dir = full_path.startswith("Object ") and full_path.endswith("/")
        if is_object_dir and media:
            mapping[full_path.rstrip("/")] = media
    return mapping
1250
+
1251
+
1252
def _flatten_object(members: dict[str, bytes], mimetype: str | None) -> ET.Element:
    """Merge an embedded object's sub-package members into one flat document.

    Each member is parsed as XML; members that fail to parse are ignored.
    Top-level children are then copied into a new ``<office:document>`` in
    this order: ``meta`` (meta.xml), ``settings`` (settings.xml), ``scripts``
    (content.xml), ``font-face-decls`` and ``styles`` (styles.xml), one merged
    ``automatic-styles`` (children from styles.xml first, then content.xml),
    ``master-styles`` (styles.xml) and ``body`` (content.xml).

    Args:
        members: Mapping of member filename (e.g. ``"content.xml"``) to bytes.
        mimetype: Object media-type for the ``office:mimetype`` attribute;
            the attribute is omitted when this is empty or ``None``.

    Returns:
        A nested ``<office:document>`` element ready to inline inside
        ``<draw:object>``.
    """
    office_ns: str = ODF_NAMESPACES["office"]
    flat: ET.Element = ET.Element(f"{{{office_ns}}}document", {f"{{{office_ns}}}version": "1.3"})
    if mimetype:
        flat.set(f"{{{office_ns}}}mimetype", mimetype)

    parsed: dict[str, ET.Element] = {}
    for member_name, payload in members.items():
        try:
            parsed[member_name] = ET.fromstring(payload)
        except ET.ParseError:
            # Not XML (or broken XML): this member contributes nothing.
            continue

    def top_level(member_name: str, wanted: str) -> list[ET.Element]:
        """Direct children of *member_name*'s root whose local name is *wanted*."""
        source = parsed.get(member_name)
        if source is None:
            return []
        return [node for node in source if local_name(node.tag) == wanted]

    # Sections that precede the merged automatic-styles element.
    leading = (
        ("meta.xml", "meta"),
        ("settings.xml", "settings"),
        ("content.xml", "scripts"),
        ("styles.xml", "font-face-decls"),
        ("styles.xml", "styles"),
    )
    for member_name, wanted in leading:
        flat.extend(top_level(member_name, wanted))

    combined_auto: ET.Element = ET.SubElement(flat, f"{{{office_ns}}}automatic-styles")
    for member_name in ("styles.xml", "content.xml"):
        for auto_block in top_level(member_name, "automatic-styles"):
            combined_auto.extend(list(auto_block))

    # Sections that follow it.
    for member_name, wanted in (("styles.xml", "master-styles"), ("content.xml", "body")):
        flat.extend(top_level(member_name, wanted))
    return flat
1298
+
1299
+
1300
def _split_object_flat(doc: ET.Element) -> tuple[dict[str, bytes], str | None]:
    """Split a flat object ``<office:document>`` back into sub-package members.

    Direct children of *doc* are routed by local name: ``scripts`` (placed
    first), the children of every ``automatic-styles``, and ``body`` go to
    ``content.xml``; ``font-face-decls``, ``styles`` and ``master-styles`` go
    to ``styles.xml``; ``meta`` goes to ``meta.xml``; ``settings`` goes to
    ``settings.xml``. Other children are dropped. ``content.xml`` is always
    produced; the other three only when something was routed to them.

    Args:
        doc: The nested ``<office:document>`` inlined inside ``<draw:object>``.

    Returns:
        A ``(members, mimetype)`` pair where ``members`` maps member filenames
        to serialized bytes and ``mimetype`` is the ``office:mimetype``
        attribute of *doc* (``None`` when absent).
    """
    office_ns: str = ODF_NAMESPACES["office"]

    def new_root(kind: str) -> ET.Element:
        """Create an empty ``office:document-<kind>`` root at version 1.3."""
        return ET.Element(f"{{{office_ns}}}document-{kind}", {f"{{{office_ns}}}version": "1.3"})

    content_doc: ET.Element = new_root("content")
    styles_doc: ET.Element = new_root("styles")
    meta_doc: ET.Element = new_root("meta")
    settings_doc: ET.Element = new_root("settings")
    content_auto: ET.Element = ET.SubElement(content_doc, f"{{{office_ns}}}automatic-styles")

    # Optional members that actually received content.
    populated: set[str] = set()
    for child in list(doc):
        kind: str = local_name(child.tag)
        if kind == "automatic-styles":
            content_auto.extend(list(child))
        elif kind == "scripts":
            # Scripts go ahead of the automatic-styles element created above.
            content_doc.insert(0, child)
        elif kind == "body":
            content_doc.append(child)
        elif kind in {"font-face-decls", "styles", "master-styles"}:
            styles_doc.append(child)
            populated.add("styles.xml")
        elif kind == "meta":
            meta_doc.append(child)
            populated.add("meta.xml")
        elif kind == "settings":
            settings_doc.append(child)
            populated.add("settings.xml")

    members: dict[str, bytes] = {"content.xml": xml_bytes(content_doc)}
    optional_members = (
        ("styles.xml", styles_doc),
        ("meta.xml", meta_doc),
        ("settings.xml", settings_doc),
    )
    for filename, root in optional_members:
        if filename in populated:
            members[filename] = xml_bytes(root)
    return members, doc.attrib.get(f"{{{office_ns}}}mimetype")
1344
+
1345
+
1346
def pack_flat_odf(input_zip: Path, output_flat: Path) -> None:
    """Convert a zipped ODF package to flat (single-XML) ODF.

    The resulting file has a single ``<office:document>`` root with merged
    content, styles, meta, and settings. Pictures stored under ``Pictures/``
    that are referenced by a ``<draw:image>`` are embedded as a base64
    ``<office:binary-data>`` child, and each ``<draw:object>`` whose
    sub-package has a ``content.xml`` gets that sub-package inlined as a
    nested ``<office:document>``.

    ``mimetype`` and ``content.xml`` are required. ``meta.xml``,
    ``settings.xml``, ``styles.xml`` and ``META-INF/manifest.xml`` are
    optional: a missing one simply contributes nothing to the output.

    Args:
        input_zip: Source ODF file (``.odt``/``.odp``/``.ods``/``.odg``).
        output_flat: Destination flat ODF file (``.fodt``/``.fodp``/...).

    Raises:
        KeyError: If the archive has no ``mimetype`` or ``content.xml`` member.
    """
    for prefix, uri in ODF_NAMESPACES.items():
        ET.register_namespace(prefix, uri)

    office_ns: str = ODF_NAMESPACES["office"]
    xlink_ns: str = ODF_NAMESPACES["xlink"]
    draw_ns: str = ODF_NAMESPACES["draw"]
    href_attr: str = f"{{{xlink_ns}}}href"
    # Link attributes that no longer apply once the target is inlined.
    link_attrs: tuple[str, ...] = (
        href_attr,
        f"{{{xlink_ns}}}type",
        f"{{{xlink_ns}}}show",
        f"{{{xlink_ns}}}actuate",
    )

    def _package_path(href: str) -> str:
        """Drop one leading ``./`` so the href matches a ZIP member name.

        A prefix test is used on purpose: ``str.lstrip("./")`` would strip
        *any* run of dots and slashes (turning ``../x`` or ``./.x`` into ``x``).
        """
        return href[2:] if href.startswith("./") else href

    with zipfile.ZipFile(input_zip) as archive:
        member_names: list[str] = archive.namelist()
        present: set[str] = set(member_names)

        def _optional_root(member: str) -> ET.Element | None:
            """Parse *member* if the archive contains it, else return ``None``."""
            return ET.fromstring(archive.read(member)) if member in present else None

        mimetype: str = archive.read("mimetype").decode("ascii").strip()
        content_root: ET.Element = ET.fromstring(archive.read("content.xml"))
        meta_root: ET.Element | None = _optional_root("meta.xml")
        settings_root: ET.Element | None = _optional_root("settings.xml")
        styles_root: ET.Element | None = _optional_root("styles.xml")
        pictures: dict[str, bytes] = {
            name: archive.read(name)
            for name in member_names
            if name.startswith("Pictures/") and not name.endswith("/")
        }
        # Embedded objects (charts, formulas) live under 'Object N/' as full
        # sub-packages (content.xml plus optional styles.xml/meta.xml).
        object_members: dict[str, bytes] = {
            name: archive.read(name)
            for name in member_names
            if name.startswith("Object ") and not name.endswith("/")
        }
        object_manifest: bytes | None = (
            archive.read("META-INF/manifest.xml") if "META-INF/manifest.xml" in present else None
        )

    flat_root: ET.Element = ET.Element(
        f"{{{office_ns}}}document",
        {
            f"{{{office_ns}}}version": "1.3",
            f"{{{office_ns}}}mimetype": mimetype,
        },
    )

    def _children_matching(source: ET.Element | None, names: set[str]) -> list[ET.Element]:
        """Direct children of *source* with a local name in *names* (none if absent)."""
        if source is None:
            return []
        return [child for child in source if local_name(child.tag) in names]

    # Sections are appended in the order they appear in a flat document.
    for child in _children_matching(meta_root, {"meta"}):
        flat_root.append(child)
    for child in _children_matching(settings_root, {"settings"}):
        flat_root.append(child)
    for child in _children_matching(content_root, {"scripts"}):
        flat_root.append(child)
    for child in _children_matching(styles_root, {"font-face-decls"}):
        flat_root.append(child)
    for child in _children_matching(styles_root, {"styles"}):
        flat_root.append(child)

    # styles.xml and content.xml each carry their own automatic-styles; the
    # flat form has a single element holding both sets.
    merged_auto: ET.Element = ET.SubElement(flat_root, f"{{{office_ns}}}automatic-styles")
    for source in (styles_root, content_root):
        for auto in _children_matching(source, {"automatic-styles"}):
            for grandchild in list(auto):
                merged_auto.append(grandchild)

    for child in _children_matching(styles_root, {"master-styles"}):
        flat_root.append(child)
    for child in _children_matching(content_root, {"body"}):
        flat_root.append(child)

    # Inline pictures: swap the package link for base64 binary data.
    for image in flat_root.iter(f"{{{draw_ns}}}image"):
        href: str | None = image.attrib.get(href_attr)
        if not href:
            continue
        payload: bytes | None = pictures.get(_package_path(href))
        if payload is None:
            # External link or a member outside Pictures/: leave it untouched.
            continue
        for attr in link_attrs:
            image.attrib.pop(attr, None)
        binary: ET.Element = ET.SubElement(image, f"{{{office_ns}}}binary-data")
        binary.text = base64.b64encode(payload).decode("ascii")

    # Embed object sub-packages (charts, formulas): inline the object's
    # full sub-package as a nested <office:document> inside its <draw:object>.
    object_media: dict[str, str] = _object_media_types(object_manifest)
    for obj in list(flat_root.iter(f"{{{draw_ns}}}object")):
        href_obj: str | None = obj.attrib.get(href_attr)
        if not href_obj:
            continue
        obj_dir: str = _package_path(href_obj).rstrip("/")
        members: dict[str, bytes] = {
            name[len(obj_dir) + 1 :]: data for name, data in object_members.items() if name.startswith(obj_dir + "/")
        }
        if "content.xml" not in members:
            # Nothing to inline (e.g. an OLE object or a dangling reference).
            continue
        for attr in link_attrs:
            obj.attrib.pop(attr, None)
        obj.append(_flatten_object(members, object_media.get(obj_dir)))

    output_flat.write_bytes(xml_bytes(flat_root))
1454
+
1455
+
1456
def unpack_flat_odf(input_flat: Path, output_zip: Path) -> None:
    """Convert a flat ODF file back to a zipped ODF package.

    Splits the single ``<office:document>`` root into the standard four
    XML files (content/styles/meta/settings), extracts inline pictures
    found under the content from ``<office:binary-data>`` blobs into picture
    entries, extracts inlined ``<draw:object>`` documents into ``Object N/``
    sub-packages, and rebuilds ``META-INF/manifest.xml``.

    Children are written in ODF schema order: ``styles.xml`` gets
    font-face-decls, styles, automatic-styles, master-styles; ``content.xml``
    gets scripts, automatic-styles, body. Automatic ``style:page-layout``
    entries go to ``styles.xml``; all other automatic styles go to
    ``content.xml``.

    Args:
        input_flat: Source flat ODF file.
        output_zip: Destination zipped ODF file.

    Raises:
        SystemExit: If the flat root has no ``office:mimetype`` attribute.
    """
    for prefix, uri in ODF_NAMESPACES.items():
        ET.register_namespace(prefix, uri)

    office_ns: str = ODF_NAMESPACES["office"]
    xlink_ns: str = ODF_NAMESPACES["xlink"]
    draw_ns: str = ODF_NAMESPACES["draw"]
    style_ns: str = ODF_NAMESPACES["style"]
    manifest_ns: str = ODF_NAMESPACES["manifest"]

    flat_root: ET.Element = ET.parse(input_flat).getroot()
    mimetype: str | None = flat_root.attrib.get(f"{{{office_ns}}}mimetype")
    if not mimetype:
        raise SystemExit("flat ODF root missing office:mimetype attribute")

    meta_doc: ET.Element = ET.Element(f"{{{office_ns}}}document-meta", {f"{{{office_ns}}}version": "1.3"})
    settings_doc: ET.Element = ET.Element(f"{{{office_ns}}}document-settings", {f"{{{office_ns}}}version": "1.3"})
    styles_doc: ET.Element = ET.Element(f"{{{office_ns}}}document-styles", {f"{{{office_ns}}}version": "1.3"})
    content_doc: ET.Element = ET.Element(f"{{{office_ns}}}document-content", {f"{{{office_ns}}}version": "1.3"})

    # The automatic-styles containers are created detached and attached later,
    # once the sections that must precede them are in place.
    styles_auto: ET.Element = ET.Element(f"{{{office_ns}}}automatic-styles")
    content_auto: ET.Element = ET.Element(f"{{{office_ns}}}automatic-styles")

    # Bucket the flat root's children by local name; unknown names are ignored
    # when the output documents are assembled below.
    sections: dict[str, list[ET.Element]] = {}
    for child in list(flat_root):
        section_name: str = local_name(child.tag)
        if section_name == "automatic-styles":
            for grandchild in list(child):
                if grandchild.tag == f"{{{style_ns}}}page-layout":
                    styles_auto.append(grandchild)
                else:
                    content_auto.append(grandchild)
        else:
            sections.setdefault(section_name, []).append(child)

    def _section(section: str) -> list[ET.Element]:
        """Children collected for *section*, or an empty list."""
        return sections.get(section, [])

    # Assemble each document in schema order regardless of input order.
    meta_doc.extend(_section("meta"))
    settings_doc.extend(_section("settings"))
    styles_doc.extend(_section("font-face-decls"))
    styles_doc.extend(_section("styles"))
    styles_doc.append(styles_auto)
    styles_doc.extend(_section("master-styles"))
    content_doc.extend(_section("scripts"))
    content_doc.append(content_auto)
    content_doc.extend(_section("body"))

    # Pull inline pictures out of the content and link them by name instead.
    pictures: dict[str, bytes] = {}
    existing_names: set[str] = set()
    for image in content_doc.iter(f"{{{draw_ns}}}image"):
        binary: ET.Element | None = image.find(f"{{{office_ns}}}binary-data")
        if binary is None or not binary.text:
            continue
        data: bytes = base64.b64decode(binary.text.strip())
        ext: str = _sniff_image_extension(data)
        candidate: str = unique_picture_name(existing_names, Path(f"image{len(pictures) + 1}{ext}"))
        existing_names.add(candidate)
        pictures[candidate] = data
        image.remove(binary)
        image.set(f"{{{xlink_ns}}}href", candidate)
        image.set(f"{{{xlink_ns}}}type", "simple")
        image.set(f"{{{xlink_ns}}}show", "embed")
        image.set(f"{{{xlink_ns}}}actuate", "onLoad")

    # Extract inlined object sub-packages (charts, formulas) back to Object N/.
    math_ns: str = ODF_NAMESPACES["math"]
    chart_ns: str = ODF_NAMESPACES["chart"]
    objects: dict[str, bytes] = {}
    object_media: dict[str, str] = {}
    object_count = 0
    # Iterated lazily on purpose: once an inlined document is detached from
    # its <draw:object>, objects nested inside it are not visited.
    for obj in content_doc.iter(f"{{{draw_ns}}}object"):
        doc_child: ET.Element | None = obj.find(f"{{{office_ns}}}document")
        if doc_child is None:
            continue
        object_count += 1
        obj_dir = f"Object {object_count}"
        members, declared_mime = _split_object_flat(doc_child)
        for member_name, member_bytes in members.items():
            objects[f"{obj_dir}/{member_name}"] = member_bytes
        # Prefer the declared media-type; otherwise infer it from the content.
        if declared_mime:
            object_media[obj_dir] = declared_mime
        elif doc_child.find(f".//{{{math_ns}}}math") is not None:
            object_media[obj_dir] = "application/vnd.oasis.opendocument.formula"
        elif doc_child.find(f".//{{{chart_ns}}}chart") is not None:
            object_media[obj_dir] = "application/vnd.oasis.opendocument.chart"
        else:
            object_media[obj_dir] = "application/vnd.oasis.opendocument.text"
        obj.remove(doc_child)
        obj.set(f"{{{xlink_ns}}}href", f"./{obj_dir}/")
        obj.set(f"{{{xlink_ns}}}type", "simple")
        obj.set(f"{{{xlink_ns}}}show", "embed")
        obj.set(f"{{{xlink_ns}}}actuate", "onLoad")

    # Rebuild the manifest: root entry, the four XML files, pictures, objects.
    manifest_doc: ET.Element = ET.Element(
        f"{{{manifest_ns}}}manifest",
        {f"{{{manifest_ns}}}version": "1.3"},
    )
    ET.SubElement(
        manifest_doc,
        f"{{{manifest_ns}}}file-entry",
        {
            f"{{{manifest_ns}}}full-path": "/",
            f"{{{manifest_ns}}}media-type": mimetype,
            f"{{{manifest_ns}}}version": "1.3",
        },
    )
    for xml_member in ("content.xml", "styles.xml", "meta.xml", "settings.xml"):
        ET.SubElement(
            manifest_doc,
            f"{{{manifest_ns}}}file-entry",
            {
                f"{{{manifest_ns}}}full-path": xml_member,
                f"{{{manifest_ns}}}media-type": "text/xml",
            },
        )
    for picture_path in pictures:
        ET.SubElement(
            manifest_doc,
            f"{{{manifest_ns}}}file-entry",
            {
                f"{{{manifest_ns}}}full-path": picture_path,
                f"{{{manifest_ns}}}media-type": media_type_for(Path(picture_path)),
            },
        )
    for obj_dir, media in object_media.items():
        ET.SubElement(
            manifest_doc,
            f"{{{manifest_ns}}}file-entry",
            {f"{{{manifest_ns}}}full-path": f"{obj_dir}/", f"{{{manifest_ns}}}media-type": media},
        )
    for object_path in objects:
        ET.SubElement(
            manifest_doc,
            f"{{{manifest_ns}}}file-entry",
            {f"{{{manifest_ns}}}full-path": object_path, f"{{{manifest_ns}}}media-type": "text/xml"},
        )

    with zipfile.ZipFile(output_zip, "w") as archive:
        # The mimetype entry is written first and uncompressed.
        archive.writestr("mimetype", mimetype, compress_type=zipfile.ZIP_STORED)
        archive.writestr("content.xml", xml_bytes(content_doc), compress_type=zipfile.ZIP_DEFLATED)
        archive.writestr("styles.xml", xml_bytes(styles_doc), compress_type=zipfile.ZIP_DEFLATED)
        archive.writestr("meta.xml", xml_bytes(meta_doc), compress_type=zipfile.ZIP_DEFLATED)
        archive.writestr("settings.xml", xml_bytes(settings_doc), compress_type=zipfile.ZIP_DEFLATED)
        archive.writestr("META-INF/manifest.xml", xml_bytes(manifest_doc), compress_type=zipfile.ZIP_DEFLATED)
        for path, data in pictures.items():
            archive.writestr(path, data, compress_type=zipfile.ZIP_DEFLATED)
        for path, data in objects.items():
            archive.writestr(path, data, compress_type=zipfile.ZIP_DEFLATED)
1614
+
1615
+
1616
def local_name(tag: str) -> str:
    """Return the part of a Clark-notation tag that follows the namespace.

    Args:
        tag: XML tag in Clark notation (e.g. ``"{urn:...}text"``) or plain name.

    Returns:
        For a tag starting with ``"{"``, everything after the first ``"}"``
        (e.g. ``"text"``); any other tag is returned unchanged.
    """
    if not tag.startswith("{"):
        return tag
    pieces = tag.split("}", 1)
    return pieces[1]