docxnote 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,330 @@
1
+ Metadata-Version: 2.3
2
+ Name: docxnote
3
+ Version: 0.1.0
4
+ Summary: Lightweight DOCX comment engine based on text view API
5
+ Author: touken
6
+ Author-email: touken <touken928@foxmail.com>
7
+ Requires-Dist: lxml>=5.0.0
8
+ Requires-Python: >=3.12
9
+ Description-Content-Type: text/markdown
10
+
11
+ # Docxnote
12
+
13
+ **docxnote** 是一个轻量级 **DOCX 批注引擎**,仅依赖 lxml,用于自动化添加 Word 批注。
14
+
15
+ 该库直接操作 **WordprocessingML**,将 DOCX 视为 **ZIP + XML** 文档,并提供一个 **基于文本视图的 API**。
16
+
17
+ 与传统 DOCX 库不同,docxnote **完全隐藏 Word 的 Run 结构**,所有操作都基于 **段落字符串**。
18
+
19
+ ---
20
+
21
+ ## 安装
22
+
23
+ ```
24
+ pip install git+https://github.com/touken928/docxnote.git
25
+ ```
26
+
27
+ 使用 [uv](https://github.com/astral-sh/uv):
28
+
29
+ ```
30
+ uv add git+https://github.com/touken928/docxnote.git
31
+ ```
32
+
33
+ ---
34
+
35
+ ## 快速开始
36
+
37
+ ```python
38
+ from docxnote import DocxDocument, Paragraph, Table
39
+
40
+ # 读取文档
41
+ with open("document.docx", "rb") as f:
42
+ # 默认不保留原有批注(会清空)
43
+ doc = DocxDocument.parse(f.read())
44
+
45
+ # 如需保留原有批注并继续添加:
46
+ # doc = DocxDocument.parse(f.read(), keep_comments=True)
47
+
48
+ # 遍历文档块
49
+ for block in doc.blocks():
50
+ if isinstance(block, Paragraph):
51
+ # 为段落添加批注
52
+ if block.text:
53
+ block.comment("请检查表述", end=5, author="reviewer")
54
+
55
+ elif isinstance(block, Table):
56
+ # 处理表格
57
+ rows, cols = block.shape()
58
+ for r in range(rows):
59
+ for c in range(cols):
60
+ cell = block[r, c]
61
+ # 为单元格内容添加批注
62
+ for inner in cell.blocks():
63
+ if isinstance(inner, Paragraph) and inner.text:
64
+ inner.comment("需复核", end=3, author="reviewer")
65
+
66
+ # 生成新文档
67
+ output = doc.render()
68
+ with open("output.docx", "wb") as f:
69
+ f.write(output)
70
+ ```
71
+
72
+ ---
73
+
74
+ ## API
75
+
76
+ ### DocxDocument
77
+
78
+ DOCX 文档对象。
79
+
80
+ #### parse
81
+
82
+ ```python
83
+ DocxDocument.parse(docx_bytes, *, keep_comments=False)
84
+ ```
85
+
86
+ 解析 DOCX 并构建文档对象。
87
+
88
+ - **keep_comments**: 是否保留原有批注。默认 `False`(清空所有原有批注)。如果你需要在“已有批注的 docx 上继续添加批注”并保留旧批注,请传 `True`。
89
+
90
+ ---
91
+
92
+ #### blocks
93
+
94
+ ```python
95
+ doc.blocks()
96
+ ```
97
+
98
+ 返回文档中的块级元素:
99
+
100
+ ```python
101
+ (Paragraph | Table, ...)
102
+ ```
103
+
104
+ 顺序与 Word 文档一致。
105
+
106
+ ---
107
+
108
+ #### render
109
+
110
+ ```python
111
+ doc.render()
112
+ ```
113
+
114
+ 生成新的 DOCX 并返回 `bytes`。
115
+
116
+ 所有批注在此阶段写入文档。
117
+
118
+ #### 多线程
119
+
120
+ 同一 `DocxDocument` 实例可在多线程中安全使用(内部使用可重入锁串行化访问);不同实例可并行处理。多进程请各自 `parse` 得到独立实例。
121
+
122
+ ---
123
+
124
+ ### Paragraph
125
+
126
+ 表示 Word 段落。
127
+
128
+ #### text
129
+
130
+ ```python
131
+ text = paragraph.text
132
+ ```
133
+
134
+ 返回段落完整文本,保留换行符(`\n`)和制表符(`\t`)。
135
+
136
+ ---
137
+
138
+ #### comment
139
+
140
+ ```python
141
+ paragraph.comment(
142
+ text, # 批注内容
143
+ start=0, # 起始字符位置
144
+ end=None, # 结束字符位置(None 表示到末尾)
145
+ *,
146
+ author="docxnote" # 批注作者
147
+ )
148
+ ```
149
+
150
+ 为段落文本范围添加批注。
151
+
152
+ **示例:**
153
+
154
+ ```python
155
+ paragraph.comment("需要修改", start=3, end=8, author="张三")
156
+ ```
157
+
158
+ docxnote 会自动处理:
159
+
160
+ - Run 定位(批注范围按 Run 边界对齐,当前版本不拆分 Run)
161
+ - 批注锚点
162
+ - comments.xml 写入
163
+ - 文档关系更新
164
+
165
+ ---
166
+
167
+ ### Table
168
+
169
+ 表示 Word 表格。
170
+
171
+ #### shape
172
+
173
+ ```python
174
+ rows, cols = table.shape()
175
+ ```
176
+
177
+ 返回表格尺寸 `(行数, 列数)`。
178
+
179
+ ---
180
+
181
+ #### 单元格访问
182
+
183
+ ```python
184
+ cell = table[row, col]
185
+ ```
186
+
187
+ 返回 `Cell` 对象。支持访问所有坐标,包括合并单元格覆盖的区域。
188
+
189
+ ---
190
+
191
+ ### Cell
192
+
193
+ 表示表格单元格。
194
+
195
+ #### blocks
196
+
197
+ ```python
198
+ cell.blocks()
199
+ ```
200
+
201
+ 返回单元格中的块级元素:
202
+
203
+ ```python
204
+ (Paragraph | Table, ...)
205
+ ```
206
+
207
+ 顺序与 Word 文档一致。
208
+
209
+ ---
210
+
211
+ #### bounds
212
+
213
+ ```python
214
+ top, left, bottom, right = cell.bounds()
215
+ ```
216
+
217
+ 返回单元格边界 `(top, left, bottom, right)`,使用左闭右开区间 `[top, bottom)` 和 `[left, right)`。
218
+
219
+ 对于未合并的单元格,返回 `(r, c, r+1, c+1)`。
220
+
221
+ ---
222
+
223
+ ## 高级用法
224
+
225
+ ### 处理嵌套表格
226
+
227
+ ```python
228
+ for block in doc.blocks():
229
+ if isinstance(block, Table):
230
+ rows, cols = block.shape()
231
+ for r in range(rows):
232
+ for c in range(cols):
233
+ cell = block[r, c]
234
+ # 遍历单元格内的块(可能包含嵌套表格)
235
+ for inner_block in cell.blocks():
236
+ if isinstance(inner_block, Table):
237
+ # 处理嵌套表格
238
+ inner_rows, inner_cols = inner_block.shape()
239
+ # ...
240
+ ```
241
+
242
+ ### 多个批注
243
+
244
+ ```python
245
+ # 为同一段落的不同位置添加多个批注
246
+ paragraph.comment("批注1", start=0, end=5, author="张三")
247
+ paragraph.comment("批注2", start=10, end=15, author="李四")
248
+ paragraph.comment("批注3", start=20, end=25, author="王五")
249
+ ```
250
+
251
+ ### 处理合并单元格
252
+
253
+ ```python
254
+ table = [b for b in doc.blocks() if isinstance(b, Table)][0]
255
+
256
+ # 访问合并单元格
257
+ cell = table[0, 0]
258
+ top, left, bottom, right = cell.bounds()
259
+
260
+ # 如果单元格跨越多行或多列
261
+ if bottom - top > 1 or right - left > 1:
262
+ print(f"合并单元格:跨越 {bottom-top} 行,{right-left} 列")
263
+ ```
264
+
265
+ ---
266
+
267
+ ## 测试
268
+
269
+ 所有测试文档使用 python-docx 动态生成,不依赖外部文件,详见 [tests/README.md](tests/README.md)。
270
+
271
+ ---
272
+
273
+ ## 开发环境与提交规范
274
+
275
+ ### 克隆与依赖安装
276
+
277
+ - **克隆仓库**:
278
+
279
+ ```bash
280
+ git clone git@github.com:touken928/docxnote.git
281
+ cd docxnote
282
+ ```
283
+
284
+ - **同步开发依赖**(测试 + pre-commit 等):
285
+
286
+ ```bash
287
+ uv sync --group dev
288
+ ```
289
+
290
+ ### 预提交钩子(pre-commit)
291
+
292
+ - **安装 pre-commit 钩子**(确保提交前自动格式化、lint、跑测试):
293
+
294
+ ```bash
295
+ uv run pre-commit install
296
+ ```
297
+
298
+ 之后每次 `git commit` 会自动运行:
299
+
300
+ - `uv-lock`(保持 uv 依赖锁文件同步)
301
+ - `ruff` / `ruff-format`(代码风格与静态检查)
302
+ - `pytest via uv`(自动化测试)
303
+
304
+ 如需手动在本地检查所有文件,可以运行:
305
+
306
+ ```bash
307
+ uv run pre-commit run --all-files
308
+ ```
309
+
310
+ ### 本地测试
311
+
312
+ - 单次运行所有测试:
313
+
314
+ ```bash
315
+ uv run pytest
316
+ ```
317
+
318
+ ### 发布到 PyPI
319
+
320
+ 使用 [Trusted Publisher(OIDC)](https://docs.pypi.org/trusted-publishers/) 时无需 PyPI API token;PyPI 中 **Environment name 留空** 即可,也无需在 GitHub 仓库里创建 `Environment`。推送形如 `v0.1.0` 的标签会触发 [`.github/workflows/publish.yml`](.github/workflows/publish.yml) 构建并上传。发布前请将 `pyproject.toml` 中的 `version` 与标签一致。
321
+
322
+ ---
323
+
324
+ ## SKILL
325
+
326
+ - 本仓库附带 `SKILL.md`,用于指导对话型 / coding Agent 正确调用 `docxnote`。
327
+ - 建议下载到本地并放置在(根据所用工具选择其一):
328
+ - `.cursor/docxnote/SKILL.md`
329
+ - `.claude/docxnote/SKILL.md`
330
+ - 在对话环境中使用本库时,让 Agent 优先参考该文件中的安装方式、推荐代码骨架与注意事项。
@@ -0,0 +1,320 @@
1
+ # Docxnote
2
+
3
+ **docxnote** 是一个轻量级 **DOCX 批注引擎**,仅依赖 lxml,用于自动化添加 Word 批注。
4
+
5
+ 该库直接操作 **WordprocessingML**,将 DOCX 视为 **ZIP + XML** 文档,并提供一个 **基于文本视图的 API**。
6
+
7
+ 与传统 DOCX 库不同,docxnote **完全隐藏 Word 的 Run 结构**,所有操作都基于 **段落字符串**。
8
+
9
+ ---
10
+
11
+ ## 安装
12
+
13
+ ```
14
+ pip install git+https://github.com/touken928/docxnote.git
15
+ ```
16
+
17
+ 使用 [uv](https://github.com/astral-sh/uv):
18
+
19
+ ```
20
+ uv add git+https://github.com/touken928/docxnote.git
21
+ ```
22
+
23
+ ---
24
+
25
+ ## 快速开始
26
+
27
+ ```python
28
+ from docxnote import DocxDocument, Paragraph, Table
29
+
30
+ # 读取文档
31
+ with open("document.docx", "rb") as f:
32
+ # 默认不保留原有批注(会清空)
33
+ doc = DocxDocument.parse(f.read())
34
+
35
+ # 如需保留原有批注并继续添加:
36
+ # doc = DocxDocument.parse(f.read(), keep_comments=True)
37
+
38
+ # 遍历文档块
39
+ for block in doc.blocks():
40
+ if isinstance(block, Paragraph):
41
+ # 为段落添加批注
42
+ if block.text:
43
+ block.comment("请检查表述", end=5, author="reviewer")
44
+
45
+ elif isinstance(block, Table):
46
+ # 处理表格
47
+ rows, cols = block.shape()
48
+ for r in range(rows):
49
+ for c in range(cols):
50
+ cell = block[r, c]
51
+ # 为单元格内容添加批注
52
+ for inner in cell.blocks():
53
+ if isinstance(inner, Paragraph) and inner.text:
54
+ inner.comment("需复核", end=3, author="reviewer")
55
+
56
+ # 生成新文档
57
+ output = doc.render()
58
+ with open("output.docx", "wb") as f:
59
+ f.write(output)
60
+ ```
61
+
62
+ ---
63
+
64
+ ## API
65
+
66
+ ### DocxDocument
67
+
68
+ DOCX 文档对象。
69
+
70
+ #### parse
71
+
72
+ ```python
73
+ DocxDocument.parse(docx_bytes, *, keep_comments=False)
74
+ ```
75
+
76
+ 解析 DOCX 并构建文档对象。
77
+
78
+ - **keep_comments**: 是否保留原有批注。默认 `False`(清空所有原有批注)。如果你需要在“已有批注的 docx 上继续添加批注”并保留旧批注,请传 `True`。
79
+
80
+ ---
81
+
82
+ #### blocks
83
+
84
+ ```python
85
+ doc.blocks()
86
+ ```
87
+
88
+ 返回文档中的块级元素:
89
+
90
+ ```python
91
+ (Paragraph | Table, ...)
92
+ ```
93
+
94
+ 顺序与 Word 文档一致。
95
+
96
+ ---
97
+
98
+ #### render
99
+
100
+ ```python
101
+ doc.render()
102
+ ```
103
+
104
+ 生成新的 DOCX 并返回 `bytes`。
105
+
106
+ 所有批注在此阶段写入文档。
107
+
108
+ #### 多线程
109
+
110
+ 同一 `DocxDocument` 实例可在多线程中安全使用(内部使用可重入锁串行化访问);不同实例可并行处理。多进程请各自 `parse` 得到独立实例。
111
+
112
+ ---
113
+
114
+ ### Paragraph
115
+
116
+ 表示 Word 段落。
117
+
118
+ #### text
119
+
120
+ ```python
121
+ text = paragraph.text
122
+ ```
123
+
124
+ 返回段落完整文本,保留换行符(`\n`)和制表符(`\t`)。
125
+
126
+ ---
127
+
128
+ #### comment
129
+
130
+ ```python
131
+ paragraph.comment(
132
+ text, # 批注内容
133
+ start=0, # 起始字符位置
134
+ end=None, # 结束字符位置(None 表示到末尾)
135
+ *,
136
+ author="docxnote" # 批注作者
137
+ )
138
+ ```
139
+
140
+ 为段落文本范围添加批注。
141
+
142
+ **示例:**
143
+
144
+ ```python
145
+ paragraph.comment("需要修改", start=3, end=8, author="张三")
146
+ ```
147
+
148
+ docxnote 会自动处理:
149
+
150
+ - Run 定位(批注范围按 Run 边界对齐,当前版本不拆分 Run)
151
+ - 批注锚点
152
+ - comments.xml 写入
153
+ - 文档关系更新
154
+
155
+ ---
156
+
157
+ ### Table
158
+
159
+ 表示 Word 表格。
160
+
161
+ #### shape
162
+
163
+ ```python
164
+ rows, cols = table.shape()
165
+ ```
166
+
167
+ 返回表格尺寸 `(行数, 列数)`。
168
+
169
+ ---
170
+
171
+ #### 单元格访问
172
+
173
+ ```python
174
+ cell = table[row, col]
175
+ ```
176
+
177
+ 返回 `Cell` 对象。支持访问所有坐标,包括合并单元格覆盖的区域。
178
+
179
+ ---
180
+
181
+ ### Cell
182
+
183
+ 表示表格单元格。
184
+
185
+ #### blocks
186
+
187
+ ```python
188
+ cell.blocks()
189
+ ```
190
+
191
+ 返回单元格中的块级元素:
192
+
193
+ ```python
194
+ (Paragraph | Table, ...)
195
+ ```
196
+
197
+ 顺序与 Word 文档一致。
198
+
199
+ ---
200
+
201
+ #### bounds
202
+
203
+ ```python
204
+ top, left, bottom, right = cell.bounds()
205
+ ```
206
+
207
+ 返回单元格边界 `(top, left, bottom, right)`,使用左闭右开区间 `[top, bottom)` 和 `[left, right)`。
208
+
209
+ 对于未合并的单元格,返回 `(r, c, r+1, c+1)`。
210
+
211
+ ---
212
+
213
+ ## 高级用法
214
+
215
+ ### 处理嵌套表格
216
+
217
+ ```python
218
+ for block in doc.blocks():
219
+ if isinstance(block, Table):
220
+ rows, cols = block.shape()
221
+ for r in range(rows):
222
+ for c in range(cols):
223
+ cell = block[r, c]
224
+ # 遍历单元格内的块(可能包含嵌套表格)
225
+ for inner_block in cell.blocks():
226
+ if isinstance(inner_block, Table):
227
+ # 处理嵌套表格
228
+ inner_rows, inner_cols = inner_block.shape()
229
+ # ...
230
+ ```
231
+
232
+ ### 多个批注
233
+
234
+ ```python
235
+ # 为同一段落的不同位置添加多个批注
236
+ paragraph.comment("批注1", start=0, end=5, author="张三")
237
+ paragraph.comment("批注2", start=10, end=15, author="李四")
238
+ paragraph.comment("批注3", start=20, end=25, author="王五")
239
+ ```
240
+
241
+ ### 处理合并单元格
242
+
243
+ ```python
244
+ table = [b for b in doc.blocks() if isinstance(b, Table)][0]
245
+
246
+ # 访问合并单元格
247
+ cell = table[0, 0]
248
+ top, left, bottom, right = cell.bounds()
249
+
250
+ # 如果单元格跨越多行或多列
251
+ if bottom - top > 1 or right - left > 1:
252
+ print(f"合并单元格:跨越 {bottom-top} 行,{right-left} 列")
253
+ ```
254
+
255
+ ---
256
+
257
+ ## 测试
258
+
259
+ 所有测试文档使用 python-docx 动态生成,不依赖外部文件,详见 [tests/README.md](tests/README.md)。
260
+
261
+ ---
262
+
263
+ ## 开发环境与提交规范
264
+
265
+ ### 克隆与依赖安装
266
+
267
+ - **克隆仓库**:
268
+
269
+ ```bash
270
+ git clone git@github.com:touken928/docxnote.git
271
+ cd docxnote
272
+ ```
273
+
274
+ - **同步开发依赖**(测试 + pre-commit 等):
275
+
276
+ ```bash
277
+ uv sync --group dev
278
+ ```
279
+
280
+ ### 预提交钩子(pre-commit)
281
+
282
+ - **安装 pre-commit 钩子**(确保提交前自动格式化、lint、跑测试):
283
+
284
+ ```bash
285
+ uv run pre-commit install
286
+ ```
287
+
288
+ 之后每次 `git commit` 会自动运行:
289
+
290
+ - `uv-lock`(保持 uv 依赖锁文件同步)
291
+ - `ruff` / `ruff-format`(代码风格与静态检查)
292
+ - `pytest via uv`(自动化测试)
293
+
294
+ 如需手动在本地检查所有文件,可以运行:
295
+
296
+ ```bash
297
+ uv run pre-commit run --all-files
298
+ ```
299
+
300
+ ### 本地测试
301
+
302
+ - 单次运行所有测试:
303
+
304
+ ```bash
305
+ uv run pytest
306
+ ```
307
+
308
+ ### 发布到 PyPI
309
+
310
+ 使用 [Trusted Publisher(OIDC)](https://docs.pypi.org/trusted-publishers/) 时无需 PyPI API token;PyPI 中 **Environment name 留空** 即可,也无需在 GitHub 仓库里创建 `Environment`。推送形如 `v0.1.0` 的标签会触发 [`.github/workflows/publish.yml`](.github/workflows/publish.yml) 构建并上传。发布前请将 `pyproject.toml` 中的 `version` 与标签一致。
311
+
312
+ ---
313
+
314
+ ## SKILL
315
+
316
+ - 本仓库附带 `SKILL.md`,用于指导对话型 / coding Agent 正确调用 `docxnote`。
317
+ - 建议下载到本地并放置在(根据所用工具选择其一):
318
+ - `.cursor/docxnote/SKILL.md`
319
+ - `.claude/docxnote/SKILL.md`
320
+ - 在对话环境中使用本库时,让 Agent 优先参考该文件中的安装方式、推荐代码骨架与注意事项。
@@ -0,0 +1,24 @@
1
+ [project]
2
+ name = "docxnote"
3
+ version = "0.1.0"
4
+ description = "Lightweight DOCX comment engine based on text view API"
5
+ readme = "README.md"
6
+ authors = [
7
+ { name = "touken", email = "touken928@foxmail.com" }
8
+ ]
9
+ requires-python = ">=3.12"
10
+ dependencies = [
11
+ "lxml>=5.0.0",
12
+ ]
13
+
14
+ [build-system]
15
+ requires = ["uv_build>=0.9.11,<0.10.0"]
16
+ build-backend = "uv_build"
17
+
18
+ [dependency-groups]
19
+ dev = [
20
+ # 测试与开发工具
21
+ "pytest>=8.0.0",
22
+ "python-docx>=1.0.0",
23
+ "pre-commit>=4.5.1",
24
+ ]
@@ -0,0 +1,7 @@
1
+ """docxnote - 轻量级 DOCX 批注引擎"""
2
+
3
+ from .document import DocxDocument
4
+ from .paragraph import Paragraph
5
+ from .table import Table, Cell
6
+
7
+ __all__ = ["DocxDocument", "Paragraph", "Table", "Cell"]
@@ -0,0 +1,340 @@
1
+ """DOCX 文档解析和渲染"""
2
+
3
+ import io
4
+ import threading
5
+ import zipfile
6
+ from lxml import etree
7
+
8
+ from .paragraph import Paragraph
9
+ from .table import Table
10
+ from .namespaces import NS
11
+
12
+
13
class DocxDocument:
    """In-memory DOCX package that collects comments and renders a new file.

    ``blocks()``, ``add_comment()`` and ``render()`` each acquire the same
    re-entrant lock, so one instance may be shared between threads.  Separate
    instances hold no shared mutable state.
    """

    _DOCUMENT_PART = "word/document.xml"
    _COMMENTS_PART = "word/comments.xml"
    _RELS_PART = "word/_rels/document.xml.rels"
    _CONTENT_TYPES_PART = "[Content_Types].xml"
    _RELS_NS = "http://schemas.openxmlformats.org/package/2006/relationships"
    _CONTENT_TYPES_NS = (
        "http://schemas.openxmlformats.org/package/2006/content-types"
    )
    _COMMENTS_REL_TYPE = (
        "http://schemas.openxmlformats.org/officeDocument/2006/"
        "relationships/comments"
    )
    _COMMENTS_CONTENT_TYPE = (
        "application/vnd.openxmlformats-officedocument."
        "wordprocessingml.comments+xml"
    )
    _XML_SPACE = "{http://www.w3.org/XML/1998/namespace}space"

    def __init__(self, zip_data: bytes):
        self._zip_data = zip_data
        self._zip = zipfile.ZipFile(io.BytesIO(zip_data))
        self._document_xml = None
        self._body = None
        # (comment id, comment text, author) in insertion order.
        self._comments: list[tuple[int, str, str]] = []
        # comment id -> {"date": ..., "initials": ...}; kept beside _comments
        # so the 3-tuple shape of _comments stays unchanged.
        self._comment_meta: dict[int, dict[str, str]] = {}
        self._comment_id_counter = 0
        self._lock = threading.RLock()

    @classmethod
    def parse(cls, docx_bytes: bytes, *, keep_comments: bool = False) -> "DocxDocument":
        """Parse a DOCX file and return a document object.

        Args:
            docx_bytes: Raw bytes of the ``.docx`` (ZIP) file.
            keep_comments: Keep the comments already present in the file.
                Defaults to ``False``, which discards them together with
                their markers in ``document.xml``.
        """
        doc = cls(docx_bytes)
        doc._load_document(keep_comments=keep_comments)
        return doc

    def _load_document(self, *, keep_comments: bool) -> None:
        """Load ``word/document.xml`` and keep or discard existing comments."""
        self._document_xml = etree.fromstring(self._zip.read(self._DOCUMENT_PART))
        self._body = self._document_xml.find(".//w:body", NS)

        self._comments = []
        self._comment_meta = {}
        self._comment_id_counter = 0
        if keep_comments:
            self._load_existing_comments()
        else:
            self._strip_all_comment_markers()

    def _strip_all_comment_markers(self) -> None:
        """Remove every comment marker from ``document.xml``."""
        if self._document_xml is None:
            return

        for tag in ("commentRangeStart", "commentRangeEnd"):
            for marker in self._document_xml.findall(f".//w:{tag}", NS):
                holder = marker.getparent()
                if holder is not None:
                    holder.remove(marker)

        run_tag = f"{{{NS['w']}}}r"
        run_props_tag = f"{{{NS['w']}}}rPr"
        for ref in self._document_xml.findall(".//w:commentReference", NS):
            run = ref.getparent()
            if run is None:
                continue
            run.remove(ref)

            # Only drop the carrier when it really is a run that has nothing
            # left but formatting properties; anything else is left alone.
            if run.tag != run_tag:
                continue
            if any(child.tag != run_props_tag for child in run):
                continue
            if (run.text or "").strip() or (run.tail or "").strip():
                continue
            holder = run.getparent()
            if holder is not None:
                holder.remove(run)

    def _load_existing_comments(self) -> None:
        """Read ``word/comments.xml`` (if present) into the comment list.

        The original ``w:date`` and ``w:initials`` of each comment are kept so
        that rendering does not overwrite them.
        """
        try:
            comments_xml = self._zip.read(self._COMMENTS_PART)
        except KeyError:
            # The package has no comments part.
            return

        w = NS["w"]
        highest_id = -1
        for comment in etree.fromstring(comments_xml):
            raw_id = comment.get(f"{{{w}}}id")
            if not raw_id:
                continue
            comment_id = int(raw_id)
            highest_id = max(highest_id, comment_id)

            author = comment.get(f"{{{w}}}author", "")
            body = self._extract_comment_text(comment)
            self._comments.append((comment_id, body, author))

            meta: dict[str, str] = {}
            for key in ("date", "initials"):
                value = comment.get(f"{{{w}}}{key}")
                if value:
                    meta[key] = value
            self._comment_meta[comment_id] = meta

        # New comments continue after the highest existing id.
        self._comment_id_counter = highest_id + 1

    def _extract_comment_text(self, comment_element: etree._Element) -> str:
        """Return the text of a ``w:comment``, one line per ``w:p``."""
        parts: list[str] = []
        for index, paragraph in enumerate(comment_element.findall(".//w:p", NS)):
            if index:
                parts.append("\n")
            for run in paragraph.findall(".//w:r", NS):
                for child in run:
                    local = etree.QName(child.tag).localname
                    if local == "t":
                        if child.text:
                            parts.append(child.text)
                    elif local == "br":
                        parts.append("\n")
                    elif local == "tab":
                        parts.append("\t")
        return "".join(parts)

    def blocks(self) -> tuple[Paragraph | Table, ...]:
        """Return the top-level paragraphs and tables in document order."""
        with self._lock:
            if self._body is None:
                return ()

            found: list[Paragraph | Table] = []
            for child in self._body:
                local = etree.QName(child.tag).localname
                if local == "p":
                    found.append(Paragraph(child, self))
                elif local == "tbl":
                    found.append(Table(child, self))
            return tuple(found)

    def add_comment(self, text: str, author: str = "docxnote") -> int:
        """Register a comment and return its id.

        The comment is stamped with the current UTC time.  Placing the
        matching markers in ``document.xml`` is the caller's job.
        """
        from datetime import datetime, timezone

        with self._lock:
            comment_id = self._comment_id_counter
            self._comment_id_counter += 1
            self._comments.append((comment_id, text, author))
            self._comment_meta[comment_id] = {
                "date": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
                "initials": author[0].upper() if author else "D",
            }
            return comment_id

    def render(self) -> bytes:
        """Build the new DOCX package and return it as bytes."""
        with self._lock:
            return self._render_unlocked()

    def _render_unlocked(self) -> bytes:
        """Assemble the output ZIP; the caller must hold ``self._lock``."""
        source_names = self._zip.namelist()
        replacements: dict[str, bytes] = {
            self._DOCUMENT_PART: etree.tostring(
                self._document_xml,
                xml_declaration=True,
                encoding="UTF-8",
                standalone=True,
            )
        }

        if self._comments:
            replacements[self._COMMENTS_PART] = self._build_comments_xml()
            replacements[self._RELS_PART] = self._prepare_rels()
            replacements[self._CONTENT_TYPES_PART] = self._prepare_content_types()
        elif self._COMMENTS_PART in source_names:
            # The comments part is dropped below, so the relationship and
            # content-type entries that point at it must go as well.
            if self._RELS_PART in source_names:
                replacements[self._RELS_PART] = self._prepare_rels(
                    include_comments=False
                )
            if self._CONTENT_TYPES_PART in source_names:
                replacements[self._CONTENT_TYPES_PART] = self._prepare_content_types(
                    include_comments=False
                )

        output = io.BytesIO()
        with zipfile.ZipFile(output, "w", zipfile.ZIP_DEFLATED) as out_zip:
            emitted: set[str] = set()
            # Rewritten parts keep their original position in the archive.
            for name in source_names:
                if name in replacements:
                    if name not in emitted:
                        out_zip.writestr(name, replacements[name])
                        emitted.add(name)
                elif name != self._COMMENTS_PART:
                    out_zip.writestr(name, self._zip.read(name))
            # Parts the source package did not contain are appended.
            for name, payload in replacements.items():
                if name not in emitted:
                    out_zip.writestr(name, payload)

        return output.getvalue()

    def _append_comment_line(self, run, line: str) -> None:
        """Append one line of comment text to ``run``.

        Tabs become ``w:tab`` elements; the text between them becomes ``w:t``
        elements.  A line without tabs always yields one ``w:t``, even when
        it is empty.
        """
        w = NS["w"]
        segments = line.split("\t")
        for index, segment in enumerate(segments):
            if index:
                etree.SubElement(run, f"{{{w}}}tab")
            if segment or len(segments) == 1:
                text_el = etree.SubElement(run, f"{{{w}}}t")
                if segment[:1] == " " or segment[-1:] == " ":
                    # Keep leading/trailing spaces from being trimmed.
                    text_el.set(self._XML_SPACE, "preserve")
                text_el.text = segment

    def _build_comments_xml(self) -> bytes:
        """Serialise all registered comments as ``word/comments.xml``."""
        w = NS["w"]
        root = etree.Element(f"{{{w}}}comments", nsmap=NS)

        for comment_id, text, author in self._comments:
            meta = self._comment_meta.get(comment_id, {})
            attrib = {
                f"{{{w}}}id": str(comment_id),
                f"{{{w}}}author": author,
            }
            if meta.get("date"):
                attrib[f"{{{w}}}date"] = meta["date"]
            attrib[f"{{{w}}}initials"] = meta.get("initials") or (
                author[0].upper() if author else "D"
            )
            comment = etree.SubElement(root, f"{{{w}}}comment", attrib=attrib)

            # One w:p per line keeps multi-paragraph comments intact.
            for line in text.split("\n"):
                paragraph = etree.SubElement(comment, f"{{{w}}}p")
                run = etree.SubElement(paragraph, f"{{{w}}}r")
                self._append_comment_line(run, line)

        return etree.tostring(
            root, xml_declaration=True, encoding="UTF-8", standalone=True
        )

    def _prepare_rels(self, *, include_comments: bool = True) -> bytes:
        """Return ``document.xml.rels`` with the comments relationship synced.

        Args:
            include_comments: ``True`` makes sure a relationship to
                ``comments.xml`` exists; ``False`` removes any such
                relationship.
        """
        try:
            rels_xml = etree.fromstring(self._zip.read(self._RELS_PART))
        except KeyError:
            rels_xml = etree.Element(
                f"{{{self._RELS_NS}}}Relationships",
                nsmap={None: self._RELS_NS},
            )

        existing = [
            rel for rel in rels_xml if rel.get("Type") == self._COMMENTS_REL_TYPE
        ]

        if not include_comments:
            for rel in existing:
                rels_xml.remove(rel)
        elif not existing:
            highest = 0
            for rel in rels_xml:
                rel_id = rel.get("Id") or ""
                if rel_id.startswith("rId"):
                    try:
                        highest = max(highest, int(rel_id[3:]))
                    except ValueError:
                        # Ids such as "rIdFoo" do not take part in numbering.
                        pass
            etree.SubElement(
                rels_xml,
                f"{{{self._RELS_NS}}}Relationship",
                attrib={
                    "Id": f"rId{highest + 1}",
                    "Type": self._COMMENTS_REL_TYPE,
                    "Target": "comments.xml",
                },
            )

        return etree.tostring(rels_xml, xml_declaration=True, encoding="UTF-8")

    def _prepare_content_types(self, *, include_comments: bool = True) -> bytes:
        """Return ``[Content_Types].xml`` with the comments override synced.

        Args:
            include_comments: ``True`` makes sure an ``Override`` for
                ``/word/comments.xml`` exists; ``False`` removes it.
        """
        ct_xml = etree.fromstring(self._zip.read(self._CONTENT_TYPES_PART))
        ns = ct_xml.nsmap.get(None, self._CONTENT_TYPES_NS)

        existing = [
            entry for entry in ct_xml if entry.get("PartName") == "/word/comments.xml"
        ]

        if not include_comments:
            for entry in existing:
                ct_xml.remove(entry)
        elif not existing:
            ct_xml.append(
                etree.Element(
                    f"{{{ns}}}Override",
                    attrib={
                        "PartName": "/word/comments.xml",
                        "ContentType": self._COMMENTS_CONTENT_TYPE,
                    },
                )
            )

        return etree.tostring(ct_xml, xml_declaration=True, encoding="UTF-8")
@@ -0,0 +1,7 @@
1
+ """XML 命名空间定义"""
2
+
3
# Prefix -> namespace URI map passed to lxml ``find``/``findall`` calls.
NS = dict(
    # WordprocessingML main namespace (w:p, w:r, w:t, ...)
    w="http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    # Office document relationships namespace
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    # DrawingML wordprocessing drawing namespace
    wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
)
@@ -0,0 +1,143 @@
1
+ """段落处理"""
2
+
3
+ from lxml import etree
4
+ from .namespaces import NS
5
+
6
+
7
class Paragraph:
    """A Word paragraph (``w:p``) exposed as one flat string.

    Character offsets used by ``comment()`` refer to ``text``: every
    character of a ``w:t`` counts once, and each ``w:br`` / ``w:tab`` counts
    as one character (``"\\n"`` / ``"\\t"``).

    NOTE: runs are not split.  A commented range is widened to the boundaries
    of the runs that contain ``start`` and ``end``.
    """

    def __init__(self, element, document):
        self._element = element
        self._document = document
        self._text_cache: str | None = None

    @staticmethod
    def _run_text(run) -> str:
        """Return the text contributed by the direct children of ``run``."""
        pieces: list[str] = []
        for child in run:
            if not isinstance(child.tag, str):
                # XML comments / processing instructions carry no text.
                continue
            local = etree.QName(child.tag).localname
            if local == "t":
                pieces.append(child.text or "")
            elif local == "br":
                pieces.append("\n")
            elif local == "tab":
                pieces.append("\t")
        return "".join(pieces)

    def _run_positions(self) -> list[tuple]:
        """Return ``(run, start, end, text)`` for every run, in order.

        ``start``/``end`` are half-open offsets into ``self.text``.  Both the
        ``text`` property and marker placement use this single mapping, so
        their offsets always agree.
        """
        positions: list[tuple] = []
        cursor = 0
        for run in self._element.findall(".//w:r", NS):
            run_text = self._run_text(run)
            positions.append((run, cursor, cursor + len(run_text), run_text))
            cursor += len(run_text)
        return positions

    @property
    def text(self) -> str:
        """Full paragraph text, with ``\\n`` for breaks and ``\\t`` for tabs."""
        with self._document._lock:
            if self._text_cache is None:
                self._text_cache = "".join(
                    position[3] for position in self._run_positions()
                )
            return self._text_cache

    def _locate(self, start: int, end: int | None):
        """Resolve a character range to run indexes.

        ``start`` and ``end`` are clamped to ``[0, len(text)]``; ``end=None``
        means the end of the paragraph.

        Returns:
            ``(run_positions, start_idx, end_idx, start, end)`` with the
            clamped offsets, or ``None`` when the clamped range is empty and
            therefore has nothing to anchor to.
        """
        run_positions = self._run_positions()
        total = run_positions[-1][2] if run_positions else 0
        if end is None:
            end = total
        start = min(max(start, 0), total)
        end = min(max(end, 0), total)
        if start >= end:
            return None

        start_idx = next(
            (i for i, (_, lo, hi, _) in enumerate(run_positions) if lo <= start < hi),
            None,
        )
        end_idx = next(
            (i for i, (_, lo, hi, _) in enumerate(run_positions) if lo < end <= hi),
            None,
        )
        if start_idx is None or end_idx is None:
            return None
        return run_positions, start_idx, end_idx, start, end

    def comment(
        self,
        text: str,
        start: int = 0,
        end: int | None = None,
        *,
        author: str = "docxnote",
    ) -> int | None:
        """Attach a comment to the character range ``[start, end)``.

        Args:
            text: Comment body.
            start: First character offset; values below 0 are treated as 0.
            end: Offset one past the last character; ``None`` or a value past
                the end of the paragraph means the end of the paragraph.
            author: Comment author.

        Returns:
            The id of the new comment, or ``None`` when the range is empty
            (for example an empty paragraph).  In that case no comment is
            registered, so no unanchored comment reaches ``comments.xml``.
        """
        with self._document._lock:
            located = self._locate(start, end)
            if located is None:
                return None
            run_positions, start_idx, end_idx, start, end = located

            # Register the comment only after the anchor is known to exist.
            comment_id = self._document.add_comment(text, author)
            self._split_and_mark(
                run_positions, start_idx, end_idx, start, end, comment_id
            )
            return comment_id

    def _insert_comment_markers(self, comment_id: int, start: int, end: int):
        """Insert the markers for an already registered comment id."""
        located = self._locate(start, end)
        if located is None:
            return
        run_positions, start_idx, end_idx, start, end = located
        self._split_and_mark(run_positions, start_idx, end_idx, start, end, comment_id)

    def _split_and_mark(
        self, run_positions, start_idx, end_idx, start, end, comment_id
    ):
        """Place the range-start, range-end and reference markers.

        Simplified placement: the start marker goes directly before the run
        containing ``start``; the end marker and the reference run go directly
        after the run containing ``end``.  ``start``/``end`` themselves are
        not used to split a run.
        """
        start_run = run_positions[start_idx][0]
        end_run = run_positions[end_idx][0]
        w = NS["w"]
        id_attr = {f"{{{w}}}id": str(comment_id)}

        comment_start = etree.Element(f"{{{w}}}commentRangeStart", attrib=dict(id_attr))
        comment_end = etree.Element(f"{{{w}}}commentRangeEnd", attrib=dict(id_attr))
        comment_ref_run = etree.Element(f"{{{w}}}r")
        etree.SubElement(
            comment_ref_run, f"{{{w}}}commentReference", attrib=dict(id_attr)
        )

        # Insert next to each run inside the run's own parent, so runs nested
        # in containers such as w:hyperlink are handled too.
        start_parent = start_run.getparent()
        end_parent = end_run.getparent()
        if start_parent is None or end_parent is None:
            return
        start_parent.insert(start_parent.index(start_run), comment_start)

        # Look the end position up after the insertion above, because both
        # runs may share a parent and the indexes have shifted.
        end_pos = end_parent.index(end_run)
        end_parent.insert(end_pos + 1, comment_end)
        end_parent.insert(end_pos + 2, comment_ref_run)
File without changes
@@ -0,0 +1,194 @@
1
+ """表格和单元格处理"""
2
+
3
+ from __future__ import annotations
4
+
5
+ from lxml import etree
6
+ from .namespaces import NS
7
+
8
+
9
class Table:
    """A Word table (``w:tbl``) addressable by ``(row, col)`` grid coordinates.

    Every coordinate covered by a merged cell resolves to the same ``Cell``
    object, the one at the merge origin.
    """

    def __init__(self, element, document):
        self._element = element
        self._document = document
        # Per row: the distinct cells of that row, left to right.
        self._grid: list[list[Cell]] = []
        # Per row: one entry per grid column (merged coordinates share a Cell).
        self._matrix: list[list[Cell]] = []
        self._build_grid()

    @staticmethod
    def _merge_properties(tc) -> tuple[int, bool, bool]:
        """Read the merge settings of one ``w:tc``.

        Returns:
            ``(colspan, continues, restarts)``.  ``colspan`` comes from
            ``w:gridSpan`` (default 1).  ``restarts`` is true for
            ``<w:vMerge w:val="restart"/>``; ``continues`` is true for any
            other ``w:vMerge`` element, including one with no ``w:val``.
        """
        colspan = 1
        continues = False
        restarts = False

        tc_pr = tc.find("./w:tcPr", NS)
        if tc_pr is not None:
            grid_span = tc_pr.find("./w:gridSpan", NS)
            if grid_span is not None:
                raw_span = grid_span.get(f"{{{NS['w']}}}val")
                if raw_span:
                    colspan = int(raw_span)

            v_merge = tc_pr.find("./w:vMerge", NS)
            if v_merge is not None:
                restarts = v_merge.get(f"{{{NS['w']}}}val") == "restart"
                continues = not restarts

        return colspan, continues, restarts

    def _build_grid(self) -> None:
        """Build ``_grid`` and ``_matrix`` from the table's direct rows."""
        self._grid = []
        self._matrix = []

        row_maps: list[dict[int, Cell]] = []
        # column -> origin cell of the vertical merge currently open there
        active_vmerge: dict[int, Cell] = {}
        max_cols = 0

        # "./w:tr" and "./w:tc" only match direct children, so rows and cells
        # of nested tables are not picked up here.
        for r_idx, row in enumerate(self._element.findall("./w:tr", NS)):
            row_map: dict[int, Cell] = {}
            col_idx = 0

            for tc in row.findall("./w:tc", NS):
                # Skip columns of this row that are already taken.
                while col_idx in row_map:
                    col_idx += 1

                colspan, continues, restarts = self._merge_properties(tc)
                columns = range(col_idx, col_idx + colspan)

                origin = active_vmerge.get(col_idx) if continues else None
                if origin is not None:
                    # Continuation of an open vertical merge: extend it.
                    origin._grow_rowspan_to(r_idx + 1)
                else:
                    origin = Cell(tc, self._document, r_idx, col_idx, colspan)
                    if not continues:
                        # A fresh cell ends any merge open in its columns.
                        for c in columns:
                            active_vmerge.pop(c, None)

                if restarts:
                    for c in columns:
                        active_vmerge[c] = origin

                for c in columns:
                    row_map[c] = origin
                col_idx += colspan

            # Columns with an open merge that this row did not list still
            # belong to the merge origin.
            for c, origin in active_vmerge.items():
                if c not in row_map:
                    origin._grow_rowspan_to(r_idx + 1)
                    row_map[c] = origin

            if row_map:
                max_cols = max(max_cols, max(row_map) + 1)
            row_maps.append(row_map)

        for r_idx, row_map in enumerate(row_maps):
            seen: set[int] = set()
            grid_row: list[Cell] = []
            matrix_row: list[Cell] = []
            for c in range(max_cols):
                cell = row_map.get(c)
                if cell is None:
                    # Ragged row: fill the gap with an element-less placeholder.
                    matrix_row.append(Cell(None, self._document, r_idx, c, 1))
                    continue
                matrix_row.append(cell)
                if id(cell) not in seen:
                    seen.add(id(cell))
                    grid_row.append(cell)
            self._grid.append(grid_row)
            self._matrix.append(matrix_row)

    def shape(self) -> tuple[int, int]:
        """Return the table size as ``(rows, cols)``."""
        if not self._grid:
            return (0, 0)
        cols = len(self._matrix[0]) if self._matrix else 0
        return (len(self._grid), cols)

    def __getitem__(self, key: tuple[int, int]) -> "Cell":
        """Return the cell at ``(row, col)``.

        Coordinates outside the grid (including negative ones) yield an
        element-less placeholder ``Cell`` instead of raising.
        """
        row, col = key
        if 0 <= row < len(self._matrix) and 0 <= col < len(self._matrix[row]):
            return self._matrix[row][col]
        return Cell(None, self._document, row, col, 1)
150
+
151
+
152
class Cell:
    """A table cell, identified by the grid coordinate of its top-left corner."""

    def __init__(self, element, document, row: int, col: int, colspan: int = 1):
        self._element = element
        self._document = document
        self._row = row
        self._col = col
        self._colspan = colspan
        self._rowspan = 1

    def _grow_rowspan_to(self, bottom_exclusive: int) -> None:
        """Extend the row span so it reaches ``bottom_exclusive``; never shrink."""
        wanted = bottom_exclusive - self._row
        if wanted > self._rowspan:
            self._rowspan = wanted

    def blocks(self) -> tuple:
        """Return the paragraphs and tables directly inside the cell, in order.

        A cell without an underlying element yields an empty tuple.
        """
        with self._document._lock:
            if self._element is None:
                return ()

            # Imported here rather than at module level.
            from .paragraph import Paragraph

            wrappers = {"p": Paragraph, "tbl": Table}
            found: list = []
            for node in self._element:
                wrapper = wrappers.get(etree.QName(node.tag).localname)
                if wrapper is not None:
                    found.append(wrapper(node, self._document))
            return tuple(found)

    def bounds(self) -> tuple[int, int, int, int]:
        """Return ``(top, left, bottom, right)`` as half-open ranges.

        A cell without an underlying element always spans exactly one row and
        one column.
        """
        is_placeholder = self._element is None
        height = 1 if is_placeholder else self._rowspan
        width = 1 if is_placeholder else self._colspan
        return (self._row, self._col, self._row + height, self._col + width)