PyPI - ofdreader - Versions diffs - 0.1.0__tar.gz - Mend

ofdreader 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

ofdreader-0.1.0/.gitignore +18 -0
ofdreader-0.1.0/LICENSE +21 -0
ofdreader-0.1.0/PKG-INFO +97 -0
ofdreader-0.1.0/README.md +73 -0
ofdreader-0.1.0/ofdreader/__init__.py +25 -0
ofdreader-0.1.0/ofdreader/_constants.py +6 -0
ofdreader-0.1.0/ofdreader/_package.py +164 -0
ofdreader-0.1.0/ofdreader/_xml.py +63 -0
ofdreader-0.1.0/ofdreader/convert.py +247 -0
ofdreader-0.1.0/ofdreader/exceptions.py +17 -0
ofdreader-0.1.0/ofdreader/models.py +40 -0
ofdreader-0.1.0/ofdreader/reader.py +395 -0
ofdreader-0.1.0/ofdreader/writer.py +245 -0
ofdreader-0.1.0/pyproject.toml +46 -0

ofdreader-0.1.0/.gitignore ADDED Viewed

@@ -0,0 +1,18 @@
+__pycache__/
+*.py[cod]
+.pytest_cache/
+.mypy_cache/
+.ruff_cache/
+dist/
+build/
+*.egg-info/
+.eggs/
+*.ofd
+!tests/fixtures/sample.ofd
+弘扬优良传统测试副本
+弘扬优良传统测试副本.ofd
+弘扬优良传统测试副本.zip
+test.md
+test.py
+sample_text_layout.pdf

ofdreader-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 PYOFD Contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

ofdreader-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,97 @@
+Metadata-Version: 2.4
+Name: ofdreader
+Version: 0.1.0
+Summary: A Python library for data extraction, analysis, conversion & manipulation of OFD (China national open fixed-layout document standard GB/T 33190) documents.
+Author: PYOFD Contributors
+License-Expression: MIT
+License-File: LICENSE
+Keywords: GB/T 33190,document,ofd,parser
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Office/Business :: Office Suites
+Classifier: Topic :: Text Processing :: Markup
+Requires-Python: >=3.10
+Provides-Extra: dev
+Requires-Dist: pytest>=8.0; extra == 'dev'
+Provides-Extra: pdf
+Requires-Dist: reportlab>=4.0; extra == 'pdf'
+Description-Content-Type: text/markdown
+# ofdreader
+**English** | [中文](README_zh.md)
+**ofdreader** is a Python library for data extraction, analysis, conversion & manipulation of [OFD](https://www.gb688.cn/bzgk/gb/newGbInfo?hcno=7B5673888A4E432686E8A7BFCBBEA4C9) (Open Fixed-layout Document; China national standard GB/T 33190—2016) documents.
+## Install
+```bash
+pip install ofdreader
+```
+Development:
+```bash
+pip install -e ".[dev]"
+pytest
+```
+## Quick start
+```python
+from ofdreader import OfdReader, OfdWriter, ofd_to_pdf, ofd_to_pdf_layout
+reader = OfdReader("document.ofd")
+print(reader.metadata["Author"])
+print(reader.page_count)
+text = reader.pages[0].extract_text()
+full_text = reader.extract_text()  # keeps paragraph breaks, joins line wraps
+# flat = reader.extract_text(preserve_layout=False)  # single continuous string
+writer = OfdWriter()
+writer.append(reader)
+writer.metadata["Author"] = "Updated Author"
+writer.write("copy.ofd")
+# Export current extracted paragraphs to a simple PDF
+ofd_to_pdf("document.ofd", "document.txt-layout.pdf")
+# Export with approximate original fonts/layout from OFD XML
+ofd_to_pdf_layout("document.ofd", "document.layout.pdf")
+```
+Merge an extra page from another file:
+```python
+writer = OfdWriter()
+writer.append(OfdReader("a.ofd"))
+writer.append_pages(OfdReader("b.ofd"), pages=[0])
+writer.write("merged.ofd")
+```
+## Scope (v0.1)
+- Open `.ofd` packages (ZIP) from path, bytes, or file-like objects
+- Read `DocInfo` metadata, page list, outlines
+- Extract plain text from `TextObject` / `TextCode` (default keeps paragraph breaks, joins wrapped lines)
+- Clone packages, update metadata, append pages across documents
+- Export extracted text paragraphs to PDF (`ofd_to_pdf`, optional `reportlab`)
+- Export approximate original layout PDF (`ofd_to_pdf_layout`)
+Not included yet: full layout rendering, digital signatures, annotations, creating
+new glyph-mapped text.
+## Related projects
+- [easyofd](https://pypi.org/project/easyofd/) — OFD ↔ PDF/image conversion
+- PyPI name `pyofd` is a different library (tax receipt OFD providers)
+## License
+MIT — see [LICENSE](LICENSE).

ofdreader-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,73 @@
+# ofdreader
+**English** | [中文](README_zh.md)
+**ofdreader** is a Python library for data extraction, analysis, conversion & manipulation of [OFD](https://www.gb688.cn/bzgk/gb/newGbInfo?hcno=7B5673888A4E432686E8A7BFCBBEA4C9) (Open Fixed-layout Document; China national standard GB/T 33190—2016) documents.
+## Install
+```bash
+pip install ofdreader
+```
+Development:
+```bash
+pip install -e ".[dev]"
+pytest
+```
+## Quick start
+```python
+from ofdreader import OfdReader, OfdWriter, ofd_to_pdf, ofd_to_pdf_layout
+reader = OfdReader("document.ofd")
+print(reader.metadata["Author"])
+print(reader.page_count)
+text = reader.pages[0].extract_text()
+full_text = reader.extract_text()  # keeps paragraph breaks, joins line wraps
+# flat = reader.extract_text(preserve_layout=False)  # single continuous string
+writer = OfdWriter()
+writer.append(reader)
+writer.metadata["Author"] = "Updated Author"
+writer.write("copy.ofd")
+# Export current extracted paragraphs to a simple PDF
+ofd_to_pdf("document.ofd", "document.txt-layout.pdf")
+# Export with approximate original fonts/layout from OFD XML
+ofd_to_pdf_layout("document.ofd", "document.layout.pdf")
+```
+Merge an extra page from another file:
+```python
+writer = OfdWriter()
+writer.append(OfdReader("a.ofd"))
+writer.append_pages(OfdReader("b.ofd"), pages=[0])
+writer.write("merged.ofd")
+```
+## Scope (v0.1)
+- Open `.ofd` packages (ZIP) from path, bytes, or file-like objects
+- Read `DocInfo` metadata, page list, outlines
+- Extract plain text from `TextObject` / `TextCode` (default keeps paragraph breaks, joins wrapped lines)
+- Clone packages, update metadata, append pages across documents
+- Export extracted text paragraphs to PDF (`ofd_to_pdf`, optional `reportlab`)
+- Export approximate original layout PDF (`ofd_to_pdf_layout`)
+Not included yet: full layout rendering, digital signatures, annotations, creating
+new glyph-mapped text.
+## Related projects
+- [easyofd](https://pypi.org/project/easyofd/) — OFD ↔ PDF/image conversion
+- PyPI name `pyofd` is a different library (tax receipt OFD providers)
+## License
+MIT — see [LICENSE](LICENSE).

ofdreader-0.1.0/ofdreader/__init__.py ADDED Viewed

@@ -0,0 +1,25 @@
+"""Data extraction, analysis, conversion & manipulation of OFD (China GB/T 33190) documents."""
+from ofdreader.exceptions import (
+    OfdError,
+    OfdFormatError,
+    OfdNotFoundError,
+    OfdWriteError,
+)
+from ofdreader.convert import ofd_to_pdf, ofd_to_pdf_layout
+from ofdreader.reader import OfdPage, OfdReader
+from ofdreader.writer import OfdWriter
+__all__ = [  # Public API: the names imported above that the package re-exports.
+    "OfdError",
+    "OfdFormatError",
+    "OfdNotFoundError",
+    "OfdPage",
+    "OfdReader",
+    "OfdWriteError",
+    "OfdWriter",
+    "ofd_to_pdf",
+    "ofd_to_pdf_layout",
+]
+__version__ = "0.1.0"  # Package version string.

ofdreader-0.1.0/ofdreader/_constants.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""OFD specification constants (GB/T 33190, namespace 2016)."""
+OFD_NS = "http://www.ofdspec.org/2016"  # XML namespace URI of OFD elements.
+OFD_NS_TAG = f"{{{OFD_NS}}}"  # Clark-notation prefix "{namespace}" placed before a local tag name.
+OFD_XML_ENTRY = "OFD.xml"  # Archive entry that OfdPackage requires to be present.

ofdreader-0.1.0/ofdreader/_package.py ADDED Viewed

@@ -0,0 +1,164 @@
+"""OFD ZIP container access."""
+from __future__ import annotations
+import io
+import posixpath
+import zipfile
+from pathlib import Path, PurePosixPath
+from typing import BinaryIO, Mapping
+from ofdreader._constants import OFD_XML_ENTRY
+from ofdreader.exceptions import OfdFormatError, OfdNotFoundError
+def normalize_zip_path(path: str) -> str:
+    """Normalize a path inside the OFD ZIP archive.
+
+    Backslashes become forward slashes, surrounding whitespace is removed,
+    empty and ``.`` segments are dropped, and ``..`` removes the preceding
+    segment (a ``..`` that would climb above the archive root is ignored).
+    The result is always root-relative: it never starts with ``/``, so it
+    can be used directly as a ZIP entry name and normalizing it again is a
+    no-op.
+
+    Args:
+        path: Path as found in the archive or in an OFD location attribute.
+
+    Returns:
+        The normalized path, or ``""`` when nothing remains.
+    """
+    parts: list[str] = []
+    # Split by hand rather than via PurePosixPath.parts: that tuple contains
+    # the root ("/" or "//") as its own element, which used to turn
+    # "/Doc_0/x" into "//Doc_0/x" -- a key that matches no archive entry.
+    for part in path.replace("\\", "/").strip().split("/"):
+        if part in ("", "."):
+            continue
+        if part == "..":
+            if parts:
+                parts.pop()
+            continue
+        parts.append(part)
+    return "/".join(parts)
+def resolve_path(base: str, loc: str) -> str:
+    """Resolve a location against a base path in the package.
+
+    Args:
+        base: Reference path. When it contains no ``/`` after normalization
+            it is used as a directory; otherwise it is treated as a file
+            path and its parent directory is used.
+        loc: Location to resolve. A leading ``/`` makes it relative to the
+            package root; anything else is relative to the base directory.
+
+    Returns:
+        The normalized, root-relative archive path. An empty ``loc`` yields
+        the normalized ``base`` itself.
+    """
+    # Archive keys never start with "/"; strip any root the normalizer kept.
+    base = normalize_zip_path(base).lstrip("/")
+    loc = loc.strip()
+    if not loc:
+        return base
+    if loc.startswith("/"):
+        # Absolute location: resolve from the package root. Joining it onto
+        # the base and normalizing used to yield an unusable "//..." key.
+        return normalize_zip_path(loc.lstrip("/"))
+    # NOTE(review): a slash-less base such as "OFD.xml" is joined as if it
+    # were a directory -- behavior kept as-is; confirm against callers.
+    anchor = posixpath.dirname(base) if "/" in base else base
+    return normalize_zip_path(posixpath.join(anchor, loc))
+def doc_dir_for(doc_root: str) -> str:
+    """Give the directory part (e.g. Doc_0) of a DocRoot path, or "" if it has none."""
+    return posixpath.dirname(normalize_zip_path(doc_root))
+class OfdPackage:
+    """In-memory representation of an OFD ZIP package.
+
+    Each entry's content is held as ``bytes`` in a dict keyed by archive
+    path. Paths handed to the accessor methods are normalized with
+    ``normalize_zip_path`` before lookup.
+    """
+    def __init__(self, entries: Mapping[str, bytes]) -> None:
+        """Create a package from a mapping of archive paths to contents.
+
+        The mapping is shallow-copied and its keys are used as given.
+
+        Raises:
+            OfdFormatError: If the required ``OFD.xml`` entry is missing.
+        """
+        self._entries = dict(entries)
+        if OFD_XML_ENTRY not in self._entries:
+            raise OfdFormatError(f"Missing required entry: {OFD_XML_ENTRY}")
+    @classmethod
+    def from_path(cls, path: str | Path) -> OfdPackage:
+        """Load a package from a ``.ofd`` file on disk.
+
+        Raises:
+            OfdFormatError: If *path* is not a regular file or not a ZIP.
+        """
+        path = Path(path)
+        if not path.is_file():
+            raise OfdFormatError(f"Not a file: {path}")
+        try:
+            with zipfile.ZipFile(path, "r") as zf:
+                return cls.from_zipfile(zf)
+        except zipfile.BadZipFile as exc:
+            raise OfdFormatError(f"Not a valid OFD/ZIP file: {path}") from exc
+    @classmethod
+    def from_bytes(cls, data: bytes) -> OfdPackage:
+        """Load a package from the raw bytes of a ``.ofd`` file.
+
+        Raises:
+            OfdFormatError: If *data* is not a valid ZIP archive.
+        """
+        buffer = io.BytesIO(data)
+        try:
+            with zipfile.ZipFile(buffer, "r") as zf:
+                return cls.from_zipfile(zf)
+        except zipfile.BadZipFile as exc:
+            raise OfdFormatError("Not a valid OFD/ZIP buffer") from exc
+    @classmethod
+    def from_stream(cls, stream: BinaryIO) -> OfdPackage:
+        """Load a package from a binary file-like object.
+
+        Raises:
+            OfdFormatError: If the stream is not a valid ZIP archive.
+        """
+        try:
+            with zipfile.ZipFile(stream, "r") as zf:
+                return cls.from_zipfile(zf)
+        except zipfile.BadZipFile as exc:
+            raise OfdFormatError("Not a valid OFD/ZIP stream") from exc
+    @classmethod
+    def from_zipfile(cls, zf: zipfile.ZipFile) -> OfdPackage:
+        """Read every non-directory member of an open ZIP into memory."""
+        entries: dict[str, bytes] = {}
+        for info in zf.infolist():
+            if info.is_dir():
+                continue
+            # Key by the normalized name, but read via the stored name.
+            name = normalize_zip_path(info.filename)
+            entries[name] = zf.read(info.filename)
+        return cls(entries)
+    @classmethod
+    def from_directory(cls, directory: str | Path) -> OfdPackage:
+        """Build a package from an extracted OFD directory tree."""
+        directory = Path(directory)
+        entries: dict[str, bytes] = {}
+        for path in directory.rglob("*"):
+            if path.is_file():
+                rel = path.relative_to(directory).as_posix()
+                entries[normalize_zip_path(rel)] = path.read_bytes()
+        return cls(entries)
+    def copy(self) -> OfdPackage:
+        """Return a new package with its own entry dict (contents are shared bytes)."""
+        return OfdPackage(dict(self._entries))
+    def list_entries(self) -> list[str]:
+        """Return all entry paths in sorted order."""
+        return sorted(self._entries.keys())
+    def has_entry(self, path: str) -> bool:
+        """Tell whether an entry exists at *path* (after normalization)."""
+        return normalize_zip_path(path) in self._entries
+    def read_bytes(self, path: str) -> bytes:
+        """Return the content of the entry at *path*.
+
+        Raises:
+            OfdNotFoundError: If no such entry exists.
+        """
+        key = normalize_zip_path(path)
+        if key not in self._entries:
+            raise OfdNotFoundError(f"Package entry not found: {path}")
+        return self._entries[key]
+    def read_text(self, path: str, encoding: str = "utf-8") -> str:
+        """Return the entry at *path* decoded with *encoding*."""
+        return self.read_bytes(path).decode(encoding)
+    def write_bytes(self, path: str, data: bytes) -> None:
+        """Create or replace the entry at *path* with *data*."""
+        self._entries[normalize_zip_path(path)] = data
+    def write_text(self, path: str, text: str, encoding: str = "utf-8") -> None:
+        """Create or replace the entry at *path* with *text* encoded as *encoding*."""
+        self.write_bytes(path, text.encode(encoding))
+    def remove_entry(self, path: str) -> None:
+        """Delete the entry at *path*; a missing entry is silently ignored."""
+        self._entries.pop(normalize_zip_path(path), None)
+    def save(
+        self,
+        path: str | Path | BinaryIO,
+        *,
+        compress_types: Mapping[str, int] | None = None,
+    ) -> None:
+        """Write the package to a .ofd file or stream.
+
+        Args:
+            path: Destination file path or writable binary stream.
+            compress_types: Optional per-entry ``zipfile`` compression
+                constants keyed by entry path; entries not listed are
+                written with ``zipfile.ZIP_STORED``.
+        """
+        # ZipFile takes a path or a file object alike, so the former
+        # isinstance() split into two identical branches was redundant.
+        with zipfile.ZipFile(path, "w") as zf:
+            self._write_to_zip(zf, compress_types, zipfile.ZIP_STORED)
+    def _write_to_zip(
+        self,
+        zf: zipfile.ZipFile,
+        compress_types: Mapping[str, int] | None,
+        default_compress: int,
+    ) -> None:
+        """Write all entries to *zf* in sorted path order."""
+        overrides = compress_types or {}
+        for name in sorted(self._entries):
+            info = zipfile.ZipInfo(filename=name)
+            # A hand-built ZipInfo has external_attr == 0, which Unix
+            # extractors apply as permission mode 000. Mark entries as
+            # regular rw-r--r-- files so an unpacked package is readable.
+            info.external_attr = 0o644 << 16
+            zf.writestr(
+                info,
+                self._entries[name],
+                compress_type=overrides.get(name, default_compress),
+            )

ofdreader-0.1.0/ofdreader/_xml.py ADDED Viewed

@@ -0,0 +1,63 @@
+"""Namespace-aware XML helpers for OFD."""
+from __future__ import annotations
+import xml.etree.ElementTree as ET
+from typing import Iterator
+from xml.etree.ElementTree import Element
+from ofdreader._constants import OFD_NS, OFD_NS_TAG
+def parse_root(data: bytes) -> Element:
+    """Decode *data* as an XML document and hand back its root element."""
+    root = ET.fromstring(data)
+    return root
+def to_bytes(root: Element, *, xml_declaration: bool = True) -> bytes:
+    """Serialize an element tree to UTF-8 XML bytes.
+
+    Args:
+        root: Element to serialize, together with its subtree.
+        xml_declaration: Whether to emit the leading ``<?xml ...?>`` line.
+
+    Returns:
+        The encoded document. Elements in the OFD namespace are written
+        with the ``ofd:`` prefix.
+    """
+    # The prefix mapping is independent of the XML declaration. Registering
+    # it only when the declaration was requested made the output prefix
+    # ("ns0" vs "ofd") depend on the flag and on earlier calls, because the
+    # registry is process-global. Register unconditionally instead.
+    ET.register_namespace("ofd", OFD_NS)
+    return ET.tostring(root, encoding="utf-8", xml_declaration=xml_declaration)
+def local_name(tag: str) -> str:
+    """Strip a leading ``{namespace}`` from a Clark-notation tag, if present."""
+    return tag.split("}", 1)[1] if tag.startswith("{") else tag
+def is_ofd(tag: str, name: str) -> bool:
+    """Report whether *tag* is *name* in the OFD namespace or has *name* as its local part."""
+    if tag == f"{OFD_NS_TAG}{name}":
+        return True
+    return local_name(tag) == name
+def find_child(parent: Element, name: str) -> Element | None:
+    """Return the first direct child whose tag matches the OFD local name, else None."""
+    return next((node for node in parent if is_ofd(node.tag, name)), None)
+def find_children(parent: Element, name: str) -> list[Element]:
+    """Collect, in document order, the direct children matching the OFD local name."""
+    matches: list[Element] = []
+    for node in parent:
+        if is_ofd(node.tag, name):
+            matches.append(node)
+    return matches
+def iter_descendants(parent: Element, name: str) -> Iterator[Element]:
+    """Yield, in ``Element.iter()`` order, every descendant matching the local name."""
+    for node in parent.iter():
+        # Element.iter() also yields the starting element; leave it out.
+        if node is parent:
+            continue
+        if is_ofd(node.tag, name):
+            yield node
+def text_content(elem: Element | None) -> str:
+    """Return the element's own text with surrounding whitespace removed.
+
+    A missing element or an element without text yields ``""``.
+    """
+    raw = elem.text if elem is not None else None
+    return raw.strip() if raw else ""
+def get_attr(elem: Element, name: str, default: str | None = None) -> str | None:
+    """Look up attribute *name* on *elem*, falling back to *default* when absent."""
+    value = elem.get(name, default)
+    return value