ofdreader 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,18 @@
1
+ __pycache__/
2
+ *.py[cod]
3
+ .pytest_cache/
4
+ .mypy_cache/
5
+ .ruff_cache/
6
+ dist/
7
+ build/
8
+ *.egg-info/
9
+ .eggs/
10
+ *.ofd
11
+ !tests/fixtures/sample.ofd
12
+
13
+ 弘扬优良传统测试副本
14
+ 弘扬优良传统测试副本.ofd
15
+ 弘扬优良传统测试副本.zip
16
+ test.md
17
+ test.py
18
+ sample_text_layout.pdf
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 PYOFD Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.4
2
+ Name: ofdreader
3
+ Version: 0.1.0
4
+ Summary: A Python library for data extraction, analysis, conversion & manipulation of OFD (China national open fixed-layout document standard GB/T 33190) documents.
5
+ Author: PYOFD Contributors
6
+ License-Expression: MIT
7
+ License-File: LICENSE
8
+ Keywords: GB/T 33190,document,ofd,parser
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Office/Business :: Office Suites
17
+ Classifier: Topic :: Text Processing :: Markup
18
+ Requires-Python: >=3.10
19
+ Provides-Extra: dev
20
+ Requires-Dist: pytest>=8.0; extra == 'dev'
21
+ Provides-Extra: pdf
22
+ Requires-Dist: reportlab>=4.0; extra == 'pdf'
23
+ Description-Content-Type: text/markdown
24
+
25
+ # ofdreader
26
+
27
+ **English** | [中文](README_zh.md)
28
+
29
+ **ofdreader** is a Python library for data extraction, analysis, conversion & manipulation of [OFD](https://www.gb688.cn/bzgk/gb/newGbInfo?hcno=7B5673888A4E432686E8A7BFCBBEA4C9) (Open Fixed-layout Document; China national standard GB/T 33190—2016) documents.
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pip install ofdreader
35
+ ```
36
+
37
+ Development:
38
+
39
+ ```bash
40
+ pip install -e ".[dev]"
41
+ pytest
42
+ ```
43
+
44
+ ## Quick start
45
+
46
+ ```python
47
+ from ofdreader import OfdReader, OfdWriter, ofd_to_pdf, ofd_to_pdf_layout
48
+
49
+ reader = OfdReader("document.ofd")
50
+ print(reader.metadata["Author"])
51
+ print(reader.page_count)
52
+
53
+ text = reader.pages[0].extract_text()
54
+ full_text = reader.extract_text() # keeps paragraph breaks, joins line wraps
55
+ # flat = reader.extract_text(preserve_layout=False) # single continuous string
56
+
57
+ writer = OfdWriter()
58
+ writer.append(reader)
59
+ writer.metadata["Author"] = "Updated Author"
60
+ writer.write("copy.ofd")
61
+
62
+ # Export current extracted paragraphs to a simple PDF
63
+ ofd_to_pdf("document.ofd", "document.txt-layout.pdf")
64
+
65
+ # Export with approximate original fonts/layout from OFD XML
66
+ ofd_to_pdf_layout("document.ofd", "document.layout.pdf")
67
+ ```
68
+
69
+ Merge an extra page from another file:
70
+
71
+ ```python
72
+ writer = OfdWriter()
73
+ writer.append(OfdReader("a.ofd"))
74
+ writer.append_pages(OfdReader("b.ofd"), pages=[0])
75
+ writer.write("merged.ofd")
76
+ ```
77
+
78
+ ## Scope (v0.1)
79
+
80
+ - Open `.ofd` packages (ZIP) from path, bytes, or file-like objects
81
+ - Read `DocInfo` metadata, page list, outlines
82
+ - Extract plain text from `TextObject` / `TextCode` (default keeps paragraph breaks, joins wrapped lines)
83
+ - Clone packages, update metadata, append pages across documents
84
+ - Export extracted text paragraphs to PDF (`ofd_to_pdf`, optional `reportlab`)
85
+ - Export approximate original layout PDF (`ofd_to_pdf_layout`)
86
+
87
+ Not included yet: full layout rendering, digital signatures, annotations, creating
88
+ new glyph-mapped text.
89
+
90
+ ## Related projects
91
+
92
+ - [easyofd](https://pypi.org/project/easyofd/) — OFD ↔ PDF/image conversion
93
+ - PyPI name `pyofd` is a different library (tax receipt OFD providers)
94
+
95
+ ## License
96
+
97
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,73 @@
1
+ # ofdreader
2
+
3
+ **English** | [中文](README_zh.md)
4
+
5
+ **ofdreader** is a Python library for data extraction, analysis, conversion & manipulation of [OFD](https://www.gb688.cn/bzgk/gb/newGbInfo?hcno=7B5673888A4E432686E8A7BFCBBEA4C9) (Open Fixed-layout Document; China national standard GB/T 33190—2016) documents.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ pip install ofdreader
11
+ ```
12
+
13
+ Development:
14
+
15
+ ```bash
16
+ pip install -e ".[dev]"
17
+ pytest
18
+ ```
19
+
20
+ ## Quick start
21
+
22
+ ```python
23
+ from ofdreader import OfdReader, OfdWriter, ofd_to_pdf, ofd_to_pdf_layout
24
+
25
+ reader = OfdReader("document.ofd")
26
+ print(reader.metadata["Author"])
27
+ print(reader.page_count)
28
+
29
+ text = reader.pages[0].extract_text()
30
+ full_text = reader.extract_text() # keeps paragraph breaks, joins line wraps
31
+ # flat = reader.extract_text(preserve_layout=False) # single continuous string
32
+
33
+ writer = OfdWriter()
34
+ writer.append(reader)
35
+ writer.metadata["Author"] = "Updated Author"
36
+ writer.write("copy.ofd")
37
+
38
+ # Export current extracted paragraphs to a simple PDF
39
+ ofd_to_pdf("document.ofd", "document.txt-layout.pdf")
40
+
41
+ # Export with approximate original fonts/layout from OFD XML
42
+ ofd_to_pdf_layout("document.ofd", "document.layout.pdf")
43
+ ```
44
+
45
+ Merge an extra page from another file:
46
+
47
+ ```python
48
+ writer = OfdWriter()
49
+ writer.append(OfdReader("a.ofd"))
50
+ writer.append_pages(OfdReader("b.ofd"), pages=[0])
51
+ writer.write("merged.ofd")
52
+ ```
53
+
54
+ ## Scope (v0.1)
55
+
56
+ - Open `.ofd` packages (ZIP) from path, bytes, or file-like objects
57
+ - Read `DocInfo` metadata, page list, outlines
58
+ - Extract plain text from `TextObject` / `TextCode` (default keeps paragraph breaks, joins wrapped lines)
59
+ - Clone packages, update metadata, append pages across documents
60
+ - Export extracted text paragraphs to PDF (`ofd_to_pdf`, optional `reportlab`)
61
+ - Export approximate original layout PDF (`ofd_to_pdf_layout`)
62
+
63
+ Not included yet: full layout rendering, digital signatures, annotations, creating
64
+ new glyph-mapped text.
65
+
66
+ ## Related projects
67
+
68
+ - [easyofd](https://pypi.org/project/easyofd/) — OFD ↔ PDF/image conversion
69
+ - PyPI name `pyofd` is a different library (tax receipt OFD providers)
70
+
71
+ ## License
72
+
73
+ MIT — see [LICENSE](LICENSE).
@@ -0,0 +1,25 @@
1
+ """Data extraction, analysis, conversion & manipulation of OFD (China GB/T 33190) documents."""
2
+
3
+ from ofdreader.exceptions import (
4
+ OfdError,
5
+ OfdFormatError,
6
+ OfdNotFoundError,
7
+ OfdWriteError,
8
+ )
9
+ from ofdreader.convert import ofd_to_pdf, ofd_to_pdf_layout
10
+ from ofdreader.reader import OfdPage, OfdReader
11
+ from ofdreader.writer import OfdWriter
12
+
13
+ __all__ = [
14
+ "OfdError",
15
+ "OfdFormatError",
16
+ "OfdNotFoundError",
17
+ "OfdPage",
18
+ "OfdReader",
19
+ "OfdWriteError",
20
+ "OfdWriter",
21
+ "ofd_to_pdf",
22
+ "ofd_to_pdf_layout",
23
+ ]
24
+
25
+ __version__ = "0.1.0"
@@ -0,0 +1,6 @@
1
+ """OFD specification constants (GB/T 33190, namespace 2016)."""
2
+
3
# XML namespace URI used by OFD documents (the 2016 namespace).
OFD_NS = "http://www.ofdspec.org/2016"

# The same namespace wrapped in braces, i.e. the Clark-notation prefix that
# ElementTree puts in front of a qualified tag name.
OFD_NS_TAG = "{" + OFD_NS + "}"

# Name of the entry every OFD package must contain.
OFD_XML_ENTRY = "OFD.xml"
@@ -0,0 +1,164 @@
1
+ """OFD ZIP container access."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import posixpath
7
+ import zipfile
8
+ from pathlib import Path, PurePosixPath
9
+ from typing import BinaryIO, Mapping
10
+
11
+ from ofdreader._constants import OFD_XML_ENTRY
12
+ from ofdreader.exceptions import OfdFormatError, OfdNotFoundError
13
+
14
+
15
def normalize_zip_path(path: str) -> str:
    """Normalize a path inside the OFD ZIP archive.

    Backslashes are converted to forward slashes, surrounding whitespace is
    stripped, and empty, ``.`` and ``..`` components are collapsed. The result
    is always relative to the archive root: leading slashes are dropped and a
    ``..`` that would climb above the root is ignored.

    Args:
        path: Entry name or location string to normalize.

    Returns:
        The slash-separated path without a leading or trailing slash. The
        result is an empty string when no real component remains.
    """
    path = path.replace("\\", "/").strip()
    parts: list[str] = []
    # Split on "/" instead of using PurePosixPath.parts: the latter reports
    # the root of an absolute path as a "/" (or "//") component, which would
    # end up in the result as "//Doc_0/..." and never match an entry name.
    for part in path.split("/"):
        if part in ("", "."):
            continue
        if part == "..":
            # Step up one level, but never above the archive root.
            if parts:
                parts.pop()
            continue
        parts.append(part)
    return "/".join(parts)
28
+
29
+
30
def resolve_path(base: str, loc: str) -> str:
    """Resolve the location *loc* relative to *base* inside the package.

    *base* is normalized first. When it contains no slash it is used as the
    anchor directory as-is; otherwise its last component is dropped and the
    remaining directory part is the anchor. A blank *loc* yields the
    normalized *base* itself.

    Args:
        base: Path inside the package that *loc* is relative to.
        loc: Location string to resolve; surrounding whitespace is ignored.

    Returns:
        The normalized path of the resolved location.
    """
    base = normalize_zip_path(base)
    target = loc.strip()
    if not target:
        return base

    # Single-component bases anchor directly; longer ones anchor at their parent.
    anchor = base if "/" not in base else posixpath.dirname(base)
    if anchor:
        return normalize_zip_path(posixpath.join(anchor, target))
    return normalize_zip_path(target)
42
+
43
+
44
def doc_dir_for(doc_root: str) -> str:
    """Return the directory part of a DocRoot path (e.g. ``Doc_0``).

    The path is normalized before its parent directory is taken; a DocRoot
    with no directory component gives an empty string.
    """
    return posixpath.dirname(normalize_zip_path(doc_root)) or ""
49
+
50
+
51
class OfdPackage:
    """An OFD ZIP package held entirely in memory.

    Entries are stored in a dict that maps an entry path to its raw bytes.
    All path arguments of the accessor methods are passed through
    ``normalize_zip_path`` before they are used as keys.
    """

    def __init__(self, entries: Mapping[str, bytes]) -> None:
        """Copy *entries* and check that the required root entry is present.

        Raises:
            OfdFormatError: If ``OFD_XML_ENTRY`` is not one of the entry names.
        """
        self._entries = dict(entries)
        if OFD_XML_ENTRY not in self._entries:
            raise OfdFormatError(f"Missing required entry: {OFD_XML_ENTRY}")

    @classmethod
    def from_path(cls, path: str | Path) -> OfdPackage:
        """Load a package from an ``.ofd`` file on disk.

        Raises:
            OfdFormatError: If *path* is not an existing file or is not a
                valid ZIP archive.
        """
        source = Path(path)
        if not source.is_file():
            raise OfdFormatError(f"Not a file: {source}")
        try:
            with zipfile.ZipFile(source, "r") as archive:
                return cls.from_zipfile(archive)
        except zipfile.BadZipFile as exc:
            raise OfdFormatError(f"Not a valid OFD/ZIP file: {source}") from exc

    @classmethod
    def from_bytes(cls, data: bytes) -> OfdPackage:
        """Load a package from the raw bytes of an ``.ofd`` file.

        Raises:
            OfdFormatError: If *data* is not a valid ZIP archive.
        """
        try:
            with zipfile.ZipFile(io.BytesIO(data), "r") as archive:
                return cls.from_zipfile(archive)
        except zipfile.BadZipFile as exc:
            raise OfdFormatError("Not a valid OFD/ZIP buffer") from exc

    @classmethod
    def from_stream(cls, stream: BinaryIO) -> OfdPackage:
        """Load a package from a binary file-like object.

        Raises:
            OfdFormatError: If the stream does not contain a valid ZIP archive.
        """
        try:
            with zipfile.ZipFile(stream, "r") as archive:
                return cls.from_zipfile(archive)
        except zipfile.BadZipFile as exc:
            raise OfdFormatError("Not a valid OFD/ZIP stream") from exc

    @classmethod
    def from_zipfile(cls, zf: zipfile.ZipFile) -> OfdPackage:
        """Read every non-directory member of an open archive into memory."""
        members = {
            normalize_zip_path(info.filename): zf.read(info.filename)
            for info in zf.infolist()
            if not info.is_dir()
        }
        return cls(members)

    @classmethod
    def from_directory(cls, directory: str | Path) -> OfdPackage:
        """Build a package from an extracted OFD directory tree.

        Every regular file below *directory* becomes an entry named by its
        normalized POSIX path relative to *directory*.
        """
        root = Path(directory)
        members = {
            normalize_zip_path(item.relative_to(root).as_posix()): item.read_bytes()
            for item in root.rglob("*")
            if item.is_file()
        }
        return cls(members)

    def copy(self) -> OfdPackage:
        """Return a new package with its own entry dict."""
        # The constructor copies the mapping it is given, so the two packages
        # do not share their entry dict.
        return OfdPackage(self._entries)

    def list_entries(self) -> list[str]:
        """Return all entry paths in sorted order."""
        return sorted(self._entries)

    def has_entry(self, path: str) -> bool:
        """Tell whether an entry exists for the normalized *path*."""
        key = normalize_zip_path(path)
        return key in self._entries

    def read_bytes(self, path: str) -> bytes:
        """Return the raw content of an entry.

        Raises:
            OfdNotFoundError: If no entry exists for the normalized *path*.
        """
        key = normalize_zip_path(path)
        if key in self._entries:
            return self._entries[key]
        raise OfdNotFoundError(f"Package entry not found: {path}")

    def read_text(self, path: str, encoding: str = "utf-8") -> str:
        """Return the content of an entry decoded with *encoding*."""
        raw = self.read_bytes(path)
        return raw.decode(encoding)

    def write_bytes(self, path: str, data: bytes) -> None:
        """Create or replace the entry at the normalized *path*."""
        key = normalize_zip_path(path)
        self._entries[key] = data

    def write_text(self, path: str, text: str, encoding: str = "utf-8") -> None:
        """Create or replace an entry with *text* encoded as *encoding*."""
        encoded = text.encode(encoding)
        self.write_bytes(path, encoded)

    def remove_entry(self, path: str) -> None:
        """Delete the entry at the normalized *path*; a missing entry is ignored."""
        key = normalize_zip_path(path)
        self._entries.pop(key, None)

    def save(
        self,
        path: str | Path | BinaryIO,
        *,
        compress_types: Mapping[str, int] | None = None,
    ) -> None:
        """Write the package to a .ofd file or stream.

        Args:
            path: Destination file path or writable binary stream.
            compress_types: Optional per-entry ZIP compression constants,
                keyed by entry path. Entries without an override are stored
                uncompressed (``zipfile.ZIP_STORED``).
        """
        # ZipFile takes a filesystem path as well as a file-like object, so
        # both kinds of destination go through the same call.
        with zipfile.ZipFile(path, "w") as archive:
            self._write_to_zip(archive, compress_types, zipfile.ZIP_STORED)

    def _write_to_zip(
        self,
        zf: zipfile.ZipFile,
        compress_types: Mapping[str, int] | None,
        default_compress: int,
    ) -> None:
        """Write all entries to *zf* in sorted name order."""
        overrides = compress_types or {}
        for name in sorted(self._entries):
            zf.writestr(
                zipfile.ZipInfo(filename=name),
                self._entries[name],
                compress_type=overrides.get(name, default_compress),
            )
@@ -0,0 +1,63 @@
1
+ """Namespace-aware XML helpers for OFD."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import xml.etree.ElementTree as ET
6
+ from typing import Iterator
7
+ from xml.etree.ElementTree import Element
8
+
9
+ from ofdreader._constants import OFD_NS, OFD_NS_TAG
10
+
11
+
12
def parse_root(data: bytes) -> Element:
    """Parse an XML document given as bytes and return its root element."""
    root = ET.fromstring(data)
    return root
15
+
16
+
17
def to_bytes(root: Element, *, xml_declaration: bool = True) -> bytes:
    """Serialize an element tree to UTF-8 XML bytes.

    Args:
        root: Root element of the tree to serialize.
        xml_declaration: Whether the output starts with an XML declaration.

    Returns:
        The encoded XML document.
    """
    # Register the "ofd" prefix on every call. It used to be registered only
    # when a declaration was requested, so a declaration-free serialization
    # could come out with auto-generated "ns0:" prefixes instead of "ofd:".
    # The registration is global to ElementTree and safe to repeat.
    ET.register_namespace("ofd", OFD_NS)
    return ET.tostring(root, encoding="utf-8", xml_declaration=xml_declaration)
23
+
24
+
25
def local_name(tag: str) -> str:
    """Return the local part of a Clark notation tag.

    ``{namespace}name`` yields ``name``. A tag without a namespace is returned
    unchanged, and so is a malformed tag that opens a ``{`` but never closes
    it (this used to raise ``IndexError``).

    Args:
        tag: Element tag, qualified or not.

    Returns:
        The tag with any leading ``{namespace}`` part removed.
    """
    if not tag.startswith("{"):
        return tag
    _namespace, closing, local = tag.partition("}")
    # No closing brace means there is no namespace part to strip.
    return local if closing else tag
30
+
31
+
32
def is_ofd(tag: str, name: str) -> bool:
    """Tell whether *tag* refers to the element called *name*.

    A tag matches when it is *name* qualified with the OFD namespace, or when
    its local part equals *name* (so unqualified tags are accepted too).
    """
    qualified = f"{OFD_NS_TAG}{name}"
    if tag == qualified:
        return True
    return local_name(tag) == name
34
+
35
+
36
def find_child(parent: Element, name: str) -> Element | None:
    """Return the first direct child of *parent* matching *name*, or ``None``."""
    matches = (node for node in parent if is_ofd(node.tag, name))
    return next(matches, None)
42
+
43
+
44
def find_children(parent: Element, name: str) -> list[Element]:
    """Return every direct child of *parent* matching *name*, in document order."""
    found: list[Element] = []
    for node in parent:
        if is_ofd(node.tag, name):
            found.append(node)
    return found
47
+
48
+
49
def iter_descendants(parent: Element, name: str) -> Iterator[Element]:
    """Yield the descendants of *parent* matching *name*, depth-first.

    *parent* itself is never yielded, even when its own tag matches.
    """
    for node in parent.iter():
        # Element.iter() starts with the element it is called on; skip it.
        if node is parent:
            continue
        if is_ofd(node.tag, name):
            yield node
54
+
55
+
56
def text_content(elem: Element | None) -> str:
    """Return the stripped text of *elem*.

    The result is an empty string when *elem* is ``None`` or has no text.
    """
    return "" if elem is None else (elem.text or "").strip()
60
+
61
+
62
def get_attr(elem: Element, name: str, default: str | None = None) -> str | None:
    """Return the attribute *name* of *elem*, or *default* when it is absent."""
    value = elem.get(name, default)
    return value