docxnote 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docxnote-0.1.0/PKG-INFO +330 -0
- docxnote-0.1.0/README.md +320 -0
- docxnote-0.1.0/pyproject.toml +24 -0
- docxnote-0.1.0/src/docxnote/__init__.py +7 -0
- docxnote-0.1.0/src/docxnote/document.py +340 -0
- docxnote-0.1.0/src/docxnote/namespaces.py +7 -0
- docxnote-0.1.0/src/docxnote/paragraph.py +143 -0
- docxnote-0.1.0/src/docxnote/py.typed +0 -0
- docxnote-0.1.0/src/docxnote/table.py +194 -0
docxnote-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: docxnote
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Lightweight DOCX comment engine based on text view API
|
|
5
|
+
Author: touken
|
|
6
|
+
Author-email: touken <touken928@foxmail.com>
|
|
7
|
+
Requires-Dist: lxml>=5.0.0
|
|
8
|
+
Requires-Python: >=3.12
|
|
9
|
+
Description-Content-Type: text/markdown
|
|
10
|
+
|
|
11
|
+
# Docxnote
|
|
12
|
+
|
|
13
|
+
**docxnote** 是一个轻量级 **DOCX 批注引擎**,仅依赖 lxml,用于自动化添加 Word 批注。
|
|
14
|
+
|
|
15
|
+
该库直接操作 **WordprocessingML**,将 DOCX 视为 **ZIP + XML** 文档,并提供一个 **基于文本视图的 API**。
|
|
16
|
+
|
|
17
|
+
与传统 DOCX 库不同,docxnote **完全隐藏 Word 的 Run 结构**,所有操作都基于 **段落字符串**。
|
|
18
|
+
|
|
19
|
+
---
|
|
20
|
+
|
|
21
|
+
## 安装
|
|
22
|
+
|
|
23
|
+
```
|
|
24
|
+
pip install git+https://github.com/touken928/docxnote.git
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
使用 [uv](https://github.com/astral-sh/uv):
|
|
28
|
+
|
|
29
|
+
```
|
|
30
|
+
uv add git+https://github.com/touken928/docxnote.git
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## 快速开始
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from docxnote import DocxDocument, Paragraph, Table
|
|
39
|
+
|
|
40
|
+
# 读取文档
|
|
41
|
+
with open("document.docx", "rb") as f:
|
|
42
|
+
# 默认不保留原有批注(会清空)
|
|
43
|
+
doc = DocxDocument.parse(f.read())
|
|
44
|
+
|
|
45
|
+
# 如需保留原有批注并继续添加:
|
|
46
|
+
# doc = DocxDocument.parse(f.read(), keep_comments=True)
|
|
47
|
+
|
|
48
|
+
# 遍历文档块
|
|
49
|
+
for block in doc.blocks():
|
|
50
|
+
if isinstance(block, Paragraph):
|
|
51
|
+
# 为段落添加批注
|
|
52
|
+
if block.text:
|
|
53
|
+
block.comment("请检查表述", end=5, author="reviewer")
|
|
54
|
+
|
|
55
|
+
elif isinstance(block, Table):
|
|
56
|
+
# 处理表格
|
|
57
|
+
rows, cols = block.shape()
|
|
58
|
+
for r in range(rows):
|
|
59
|
+
for c in range(cols):
|
|
60
|
+
cell = block[r, c]
|
|
61
|
+
# 为单元格内容添加批注
|
|
62
|
+
for inner in cell.blocks():
|
|
63
|
+
if isinstance(inner, Paragraph) and inner.text:
|
|
64
|
+
inner.comment("需复核", end=3, author="reviewer")
|
|
65
|
+
|
|
66
|
+
# 生成新文档
|
|
67
|
+
output = doc.render()
|
|
68
|
+
with open("output.docx", "wb") as f:
|
|
69
|
+
f.write(output)
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## API
|
|
75
|
+
|
|
76
|
+
### DocxDocument
|
|
77
|
+
|
|
78
|
+
DOCX 文档对象。
|
|
79
|
+
|
|
80
|
+
#### parse
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
DocxDocument.parse(docx_bytes, *, keep_comments=False)
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
解析 DOCX 并构建文档对象。
|
|
87
|
+
|
|
88
|
+
- **keep_comments**: 是否保留原有批注。默认 `False`(清空所有原有批注)。如果你需要在“已有批注的 docx 上继续添加批注”并保留旧批注,请传 `True`。
|
|
89
|
+
|
|
90
|
+
---
|
|
91
|
+
|
|
92
|
+
#### blocks
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
doc.blocks()
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
返回文档中的块级元素:
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
(Paragraph | Table, ...)
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
顺序与 Word 文档一致。
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
#### render
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
doc.render()
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
生成新的 DOCX 并返回 `bytes`。
|
|
115
|
+
|
|
116
|
+
所有批注在此阶段写入文档。
|
|
117
|
+
|
|
118
|
+
#### 多线程
|
|
119
|
+
|
|
120
|
+
同一 `DocxDocument` 实例可在多线程中安全使用(内部使用可重入锁串行化访问);不同实例可并行处理。多进程请各自 `parse` 得到独立实例。
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
### Paragraph
|
|
125
|
+
|
|
126
|
+
表示 Word 段落。
|
|
127
|
+
|
|
128
|
+
#### text
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
text = paragraph.text
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
返回段落完整文本,保留换行符(`\n`)和制表符(`\t`)。
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
#### comment
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
paragraph.comment(
|
|
142
|
+
text, # 批注内容
|
|
143
|
+
start=0, # 起始字符位置
|
|
144
|
+
end=None, # 结束字符位置(None 表示到末尾)
|
|
145
|
+
*,
|
|
146
|
+
author="docxnote" # 批注作者
|
|
147
|
+
)
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
为段落文本范围添加批注。
|
|
151
|
+
|
|
152
|
+
**示例:**
|
|
153
|
+
|
|
154
|
+
```python
|
|
155
|
+
paragraph.comment("需要修改", start=3, end=8, author="张三")
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
docxnote 会自动处理:
|
|
159
|
+
|
|
160
|
+
- Run 分割
|
|
161
|
+
- 批注锚点
|
|
162
|
+
- comments.xml 写入
|
|
163
|
+
- 文档关系更新
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
### Table
|
|
168
|
+
|
|
169
|
+
表示 Word 表格。
|
|
170
|
+
|
|
171
|
+
#### shape
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
rows, cols = table.shape()
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
返回表格尺寸 `(行数, 列数)`。
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
#### 单元格访问
|
|
182
|
+
|
|
183
|
+
```python
|
|
184
|
+
cell = table[row, col]
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
返回 `Cell` 对象。支持访问所有坐标,包括合并单元格覆盖的区域。
|
|
188
|
+
|
|
189
|
+
---
|
|
190
|
+
|
|
191
|
+
### Cell
|
|
192
|
+
|
|
193
|
+
表示表格单元格。
|
|
194
|
+
|
|
195
|
+
#### blocks
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
cell.blocks()
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
返回单元格中的块级元素:
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
(Paragraph | Table, ...)
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
顺序与 Word 文档一致。
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
#### bounds
|
|
212
|
+
|
|
213
|
+
```python
|
|
214
|
+
top, left, bottom, right = cell.bounds()
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
返回单元格边界 `(top, left, bottom, right)`,使用左闭右开区间 `[top, bottom)` 和 `[left, right)`。
|
|
218
|
+
|
|
219
|
+
对于未合并的单元格,返回 `(r, c, r+1, c+1)`。
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## 高级用法
|
|
224
|
+
|
|
225
|
+
### 处理嵌套表格
|
|
226
|
+
|
|
227
|
+
```python
|
|
228
|
+
for block in doc.blocks():
|
|
229
|
+
if isinstance(block, Table):
|
|
230
|
+
rows, cols = block.shape()
|
|
231
|
+
for r in range(rows):
|
|
232
|
+
for c in range(cols):
|
|
233
|
+
cell = block[r, c]
|
|
234
|
+
# 遍历单元格内的块(可能包含嵌套表格)
|
|
235
|
+
for inner_block in cell.blocks():
|
|
236
|
+
if isinstance(inner_block, Table):
|
|
237
|
+
# 处理嵌套表格
|
|
238
|
+
inner_rows, inner_cols = inner_block.shape()
|
|
239
|
+
# ...
|
|
240
|
+
```
|
|
241
|
+
|
|
242
|
+
### 多个批注
|
|
243
|
+
|
|
244
|
+
```python
|
|
245
|
+
# 为同一段落的不同位置添加多个批注
|
|
246
|
+
paragraph.comment("批注1", start=0, end=5, author="张三")
|
|
247
|
+
paragraph.comment("批注2", start=10, end=15, author="李四")
|
|
248
|
+
paragraph.comment("批注3", start=20, end=25, author="王五")
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
### 处理合并单元格
|
|
252
|
+
|
|
253
|
+
```python
|
|
254
|
+
table = [b for b in doc.blocks() if isinstance(b, Table)][0]
|
|
255
|
+
|
|
256
|
+
# 访问合并单元格
|
|
257
|
+
cell = table[0, 0]
|
|
258
|
+
top, left, bottom, right = cell.bounds()
|
|
259
|
+
|
|
260
|
+
# 如果单元格跨越多行或多列
|
|
261
|
+
if bottom - top > 1 or right - left > 1:
|
|
262
|
+
print(f"合并单元格:跨越 {bottom-top} 行,{right-left} 列")
|
|
263
|
+
```
|
|
264
|
+
|
|
265
|
+
---
|
|
266
|
+
|
|
267
|
+
## 测试
|
|
268
|
+
|
|
269
|
+
所有测试文档使用 python-docx 动态生成,不依赖外部文件,详见 [tests/README.md](tests/README.md)。
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## 开发环境与提交规范
|
|
274
|
+
|
|
275
|
+
### 克隆与依赖安装
|
|
276
|
+
|
|
277
|
+
- **克隆仓库**:
|
|
278
|
+
|
|
279
|
+
```bash
|
|
280
|
+
git clone git@github.com:touken928/docxnote.git
|
|
281
|
+
cd docxnote
|
|
282
|
+
```
|
|
283
|
+
|
|
284
|
+
- **同步开发依赖**(测试 + pre-commit 等):
|
|
285
|
+
|
|
286
|
+
```bash
|
|
287
|
+
uv sync --group dev
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
### 预提交钩子(pre-commit)
|
|
291
|
+
|
|
292
|
+
- **安装 pre-commit 钩子**(确保提交前自动格式化、lint、跑测试):
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
uv run pre-commit install
|
|
296
|
+
```
|
|
297
|
+
|
|
298
|
+
之后每次 `git commit` 会自动运行:
|
|
299
|
+
|
|
300
|
+
- `uv-lock`(保持 uv 依赖锁文件同步)
|
|
301
|
+
- `ruff` / `ruff-format`(代码风格与静态检查)
|
|
302
|
+
- `pytest via uv`(自动化测试)
|
|
303
|
+
|
|
304
|
+
如需手动在本地检查所有文件,可以运行:
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
uv run pre-commit run --all-files
|
|
308
|
+
```
|
|
309
|
+
|
|
310
|
+
### 本地测试
|
|
311
|
+
|
|
312
|
+
- 单次运行所有测试:
|
|
313
|
+
|
|
314
|
+
```bash
|
|
315
|
+
uv run pytest
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
### 发布到 PyPI
|
|
319
|
+
|
|
320
|
+
使用 [Trusted Publisher(OIDC)](https://docs.pypi.org/trusted-publishers/) 时无需 PyPI API token;PyPI 中 **Environment name 留空** 即可,也无需在 GitHub 仓库里创建 `Environment`。推送形如 `v0.1.0` 的标签会触发 [`.github/workflows/publish.yml`](.github/workflows/publish.yml) 构建并上传。发布前请确保 `pyproject.toml` 中的 `version` 与标签版本号一致。
|
|
321
|
+
|
|
322
|
+
---
|
|
323
|
+
|
|
324
|
+
## SKILL
|
|
325
|
+
|
|
326
|
+
- 本仓库附带 `SKILL.md`,用于指导对话型 / coding Agent 正确调用 `docxnote`。
|
|
327
|
+
- 建议下载到本地并放置在(根据所用工具选择其一):
|
|
328
|
+
- `.cursor/docxnote/SKILL.md`
|
|
329
|
+
- `.claude/docxnote/SKILL.md`
|
|
330
|
+
- 在对话环境中使用本库时,让 Agent 优先参考该文件中的安装方式、推荐代码骨架与注意事项。
|
docxnote-0.1.0/README.md
ADDED
|
@@ -0,0 +1,320 @@
|
|
|
1
|
+
# Docxnote
|
|
2
|
+
|
|
3
|
+
**docxnote** 是一个轻量级 **DOCX 批注引擎**,仅依赖 lxml,用于自动化添加 Word 批注。
|
|
4
|
+
|
|
5
|
+
该库直接操作 **WordprocessingML**,将 DOCX 视为 **ZIP + XML** 文档,并提供一个 **基于文本视图的 API**。
|
|
6
|
+
|
|
7
|
+
与传统 DOCX 库不同,docxnote **完全隐藏 Word 的 Run 结构**,所有操作都基于 **段落字符串**。
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## 安装
|
|
12
|
+
|
|
13
|
+
```
|
|
14
|
+
pip install git+https://github.com/touken928/docxnote.git
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
使用 [uv](https://github.com/astral-sh/uv):
|
|
18
|
+
|
|
19
|
+
```
|
|
20
|
+
uv add git+https://github.com/touken928/docxnote.git
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## 快速开始
|
|
26
|
+
|
|
27
|
+
```python
|
|
28
|
+
from docxnote import DocxDocument, Paragraph, Table
|
|
29
|
+
|
|
30
|
+
# 读取文档
|
|
31
|
+
with open("document.docx", "rb") as f:
|
|
32
|
+
# 默认不保留原有批注(会清空)
|
|
33
|
+
doc = DocxDocument.parse(f.read())
|
|
34
|
+
|
|
35
|
+
# 如需保留原有批注并继续添加:
|
|
36
|
+
# doc = DocxDocument.parse(f.read(), keep_comments=True)
|
|
37
|
+
|
|
38
|
+
# 遍历文档块
|
|
39
|
+
for block in doc.blocks():
|
|
40
|
+
if isinstance(block, Paragraph):
|
|
41
|
+
# 为段落添加批注
|
|
42
|
+
if block.text:
|
|
43
|
+
block.comment("请检查表述", end=5, author="reviewer")
|
|
44
|
+
|
|
45
|
+
elif isinstance(block, Table):
|
|
46
|
+
# 处理表格
|
|
47
|
+
rows, cols = block.shape()
|
|
48
|
+
for r in range(rows):
|
|
49
|
+
for c in range(cols):
|
|
50
|
+
cell = block[r, c]
|
|
51
|
+
# 为单元格内容添加批注
|
|
52
|
+
for inner in cell.blocks():
|
|
53
|
+
if isinstance(inner, Paragraph) and inner.text:
|
|
54
|
+
inner.comment("需复核", end=3, author="reviewer")
|
|
55
|
+
|
|
56
|
+
# 生成新文档
|
|
57
|
+
output = doc.render()
|
|
58
|
+
with open("output.docx", "wb") as f:
|
|
59
|
+
f.write(output)
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## API
|
|
65
|
+
|
|
66
|
+
### DocxDocument
|
|
67
|
+
|
|
68
|
+
DOCX 文档对象。
|
|
69
|
+
|
|
70
|
+
#### parse
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
DocxDocument.parse(docx_bytes, *, keep_comments=False)
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
解析 DOCX 并构建文档对象。
|
|
77
|
+
|
|
78
|
+
- **keep_comments**: 是否保留原有批注。默认 `False`(清空所有原有批注)。如果你需要在“已有批注的 docx 上继续添加批注”并保留旧批注,请传 `True`。
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
#### blocks
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
doc.blocks()
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
返回文档中的块级元素:
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
(Paragraph | Table, ...)
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
顺序与 Word 文档一致。
|
|
95
|
+
|
|
96
|
+
---
|
|
97
|
+
|
|
98
|
+
#### render
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
doc.render()
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
生成新的 DOCX 并返回 `bytes`。
|
|
105
|
+
|
|
106
|
+
所有批注在此阶段写入文档。
|
|
107
|
+
|
|
108
|
+
#### 多线程
|
|
109
|
+
|
|
110
|
+
同一 `DocxDocument` 实例可在多线程中安全使用(内部使用可重入锁串行化访问);不同实例可并行处理。多进程请各自 `parse` 得到独立实例。
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
### Paragraph
|
|
115
|
+
|
|
116
|
+
表示 Word 段落。
|
|
117
|
+
|
|
118
|
+
#### text
|
|
119
|
+
|
|
120
|
+
```python
|
|
121
|
+
text = paragraph.text
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
返回段落完整文本,保留换行符(`\n`)和制表符(`\t`)。
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
#### comment
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
paragraph.comment(
|
|
132
|
+
text, # 批注内容
|
|
133
|
+
start=0, # 起始字符位置
|
|
134
|
+
end=None, # 结束字符位置(None 表示到末尾)
|
|
135
|
+
*,
|
|
136
|
+
author="docxnote" # 批注作者
|
|
137
|
+
)
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
为段落文本范围添加批注。
|
|
141
|
+
|
|
142
|
+
**示例:**
|
|
143
|
+
|
|
144
|
+
```python
|
|
145
|
+
paragraph.comment("需要修改", start=3, end=8, author="张三")
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
docxnote 会自动处理:
|
|
149
|
+
|
|
150
|
+
- Run 分割
|
|
151
|
+
- 批注锚点
|
|
152
|
+
- comments.xml 写入
|
|
153
|
+
- 文档关系更新
|
|
154
|
+
|
|
155
|
+
---
|
|
156
|
+
|
|
157
|
+
### Table
|
|
158
|
+
|
|
159
|
+
表示 Word 表格。
|
|
160
|
+
|
|
161
|
+
#### shape
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
rows, cols = table.shape()
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
返回表格尺寸 `(行数, 列数)`。
|
|
168
|
+
|
|
169
|
+
---
|
|
170
|
+
|
|
171
|
+
#### 单元格访问
|
|
172
|
+
|
|
173
|
+
```python
|
|
174
|
+
cell = table[row, col]
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
返回 `Cell` 对象。支持访问所有坐标,包括合并单元格覆盖的区域。
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
### Cell
|
|
182
|
+
|
|
183
|
+
表示表格单元格。
|
|
184
|
+
|
|
185
|
+
#### blocks
|
|
186
|
+
|
|
187
|
+
```python
|
|
188
|
+
cell.blocks()
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
返回单元格中的块级元素:
|
|
192
|
+
|
|
193
|
+
```python
|
|
194
|
+
(Paragraph | Table, ...)
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
顺序与 Word 文档一致。
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
#### bounds
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
top, left, bottom, right = cell.bounds()
|
|
205
|
+
```
|
|
206
|
+
|
|
207
|
+
返回单元格边界 `(top, left, bottom, right)`,使用左闭右开区间 `[top, bottom)` 和 `[left, right)`。
|
|
208
|
+
|
|
209
|
+
对于未合并的单元格,返回 `(r, c, r+1, c+1)`。
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## 高级用法
|
|
214
|
+
|
|
215
|
+
### 处理嵌套表格
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
for block in doc.blocks():
|
|
219
|
+
if isinstance(block, Table):
|
|
220
|
+
rows, cols = block.shape()
|
|
221
|
+
for r in range(rows):
|
|
222
|
+
for c in range(cols):
|
|
223
|
+
cell = block[r, c]
|
|
224
|
+
# 遍历单元格内的块(可能包含嵌套表格)
|
|
225
|
+
for inner_block in cell.blocks():
|
|
226
|
+
if isinstance(inner_block, Table):
|
|
227
|
+
# 处理嵌套表格
|
|
228
|
+
inner_rows, inner_cols = inner_block.shape()
|
|
229
|
+
# ...
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### 多个批注
|
|
233
|
+
|
|
234
|
+
```python
|
|
235
|
+
# 为同一段落的不同位置添加多个批注
|
|
236
|
+
paragraph.comment("批注1", start=0, end=5, author="张三")
|
|
237
|
+
paragraph.comment("批注2", start=10, end=15, author="李四")
|
|
238
|
+
paragraph.comment("批注3", start=20, end=25, author="王五")
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### 处理合并单元格
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
table = [b for b in doc.blocks() if isinstance(b, Table)][0]
|
|
245
|
+
|
|
246
|
+
# 访问合并单元格
|
|
247
|
+
cell = table[0, 0]
|
|
248
|
+
top, left, bottom, right = cell.bounds()
|
|
249
|
+
|
|
250
|
+
# 如果单元格跨越多行或多列
|
|
251
|
+
if bottom - top > 1 or right - left > 1:
|
|
252
|
+
print(f"合并单元格:跨越 {bottom-top} 行,{right-left} 列")
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## 测试
|
|
258
|
+
|
|
259
|
+
所有测试文档使用 python-docx 动态生成,不依赖外部文件,详见 [tests/README.md](tests/README.md)。
|
|
260
|
+
|
|
261
|
+
---
|
|
262
|
+
|
|
263
|
+
## 开发环境与提交规范
|
|
264
|
+
|
|
265
|
+
### 克隆与依赖安装
|
|
266
|
+
|
|
267
|
+
- **克隆仓库**:
|
|
268
|
+
|
|
269
|
+
```bash
|
|
270
|
+
git clone git@github.com:touken928/docxnote.git
|
|
271
|
+
cd docxnote
|
|
272
|
+
```
|
|
273
|
+
|
|
274
|
+
- **同步开发依赖**(测试 + pre-commit 等):
|
|
275
|
+
|
|
276
|
+
```bash
|
|
277
|
+
uv sync --group dev
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
### 预提交钩子(pre-commit)
|
|
281
|
+
|
|
282
|
+
- **安装 pre-commit 钩子**(确保提交前自动格式化、lint、跑测试):
|
|
283
|
+
|
|
284
|
+
```bash
|
|
285
|
+
uv run pre-commit install
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
之后每次 `git commit` 会自动运行:
|
|
289
|
+
|
|
290
|
+
- `uv-lock`(保持 uv 依赖锁文件同步)
|
|
291
|
+
- `ruff` / `ruff-format`(代码风格与静态检查)
|
|
292
|
+
- `pytest via uv`(自动化测试)
|
|
293
|
+
|
|
294
|
+
如需手动在本地检查所有文件,可以运行:
|
|
295
|
+
|
|
296
|
+
```bash
|
|
297
|
+
uv run pre-commit run --all-files
|
|
298
|
+
```
|
|
299
|
+
|
|
300
|
+
### 本地测试
|
|
301
|
+
|
|
302
|
+
- 单次运行所有测试:
|
|
303
|
+
|
|
304
|
+
```bash
|
|
305
|
+
uv run pytest
|
|
306
|
+
```
|
|
307
|
+
|
|
308
|
+
### 发布到 PyPI
|
|
309
|
+
|
|
310
|
+
使用 [Trusted Publisher(OIDC)](https://docs.pypi.org/trusted-publishers/) 时无需 PyPI API token;PyPI 中 **Environment name 留空** 即可,也无需在 GitHub 仓库里创建 `Environment`。推送形如 `v0.1.0` 的标签会触发 [`.github/workflows/publish.yml`](.github/workflows/publish.yml) 构建并上传。发布前请确保 `pyproject.toml` 中的 `version` 与标签版本号一致。
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
## SKILL
|
|
315
|
+
|
|
316
|
+
- 本仓库附带 `SKILL.md`,用于指导对话型 / coding Agent 正确调用 `docxnote`。
|
|
317
|
+
- 建议下载到本地并放置在(根据所用工具选择其一):
|
|
318
|
+
- `.cursor/docxnote/SKILL.md`
|
|
319
|
+
- `.claude/docxnote/SKILL.md`
|
|
320
|
+
- 在对话环境中使用本库时,让 Agent 优先参考该文件中的安装方式、推荐代码骨架与注意事项。
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "docxnote"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Lightweight DOCX comment engine based on text view API"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [
|
|
7
|
+
{ name = "touken", email = "touken928@foxmail.com" }
|
|
8
|
+
]
|
|
9
|
+
requires-python = ">=3.12"
|
|
10
|
+
dependencies = [
|
|
11
|
+
"lxml>=5.0.0",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[build-system]
|
|
15
|
+
requires = ["uv_build>=0.9.11,<0.10.0"]
|
|
16
|
+
build-backend = "uv_build"
|
|
17
|
+
|
|
18
|
+
[dependency-groups]
|
|
19
|
+
dev = [
|
|
20
|
+
# 测试与开发工具
|
|
21
|
+
"pytest>=8.0.0",
|
|
22
|
+
"python-docx>=1.0.0",
|
|
23
|
+
"pre-commit>=4.5.1",
|
|
24
|
+
]
|
|
@@ -0,0 +1,340 @@
|
|
|
1
|
+
"""DOCX 文档解析和渲染"""
|
|
2
|
+
|
|
3
|
+
import io
|
|
4
|
+
import threading
|
|
5
|
+
import zipfile
|
|
6
|
+
from lxml import etree
|
|
7
|
+
|
|
8
|
+
from .paragraph import Paragraph
|
|
9
|
+
from .table import Table
|
|
10
|
+
from .namespaces import NS
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class DocxDocument:
|
|
14
|
+
"""DOCX 文档对象
|
|
15
|
+
|
|
16
|
+
同一 ``DocxDocument`` 实例可在多线程环境下安全使用(内部使用可重入锁串行化访问)。
|
|
17
|
+
不同实例之间无共享可变状态,可并行使用。多进程请各自持有独立实例。
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
def __init__(self, zip_data: bytes):
|
|
21
|
+
self._zip_data = zip_data
|
|
22
|
+
self._zip = zipfile.ZipFile(io.BytesIO(zip_data))
|
|
23
|
+
self._document_xml = None
|
|
24
|
+
self._body = None
|
|
25
|
+
self._comments = []
|
|
26
|
+
self._comment_id_counter = 0
|
|
27
|
+
self._lock = threading.RLock()
|
|
28
|
+
|
|
29
|
+
@classmethod
|
|
30
|
+
def parse(cls, docx_bytes: bytes, *, keep_comments: bool = False) -> "DocxDocument":
|
|
31
|
+
"""解析 DOCX 并构建文档对象
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
keep_comments: 是否保留原有批注。默认 False(清空所有原有批注)。
|
|
35
|
+
"""
|
|
36
|
+
doc = cls(docx_bytes)
|
|
37
|
+
doc._load_document(keep_comments=keep_comments)
|
|
38
|
+
return doc
|
|
39
|
+
|
|
40
|
+
def _load_document(self, *, keep_comments: bool):
    """Parse ``word/document.xml`` and keep or discard existing comments.

    Args:
        keep_comments: True loads the comments already stored in the
            package. False resets the comment state and removes every
            comment marker from the parsed document.
    """
    self._document_xml = etree.fromstring(self._zip.read("word/document.xml"))
    self._body = self._document_xml.find(".//w:body", NS)

    if not keep_comments:
        # Clean slate: no stored comments, ids restart at 0, and no leftover
        # markers that would refer to comments which are not written out.
        self._comments = []
        self._comment_id_counter = 0
        self._strip_all_comment_markers()
        return

    self._load_existing_comments()
|
|
54
|
+
|
|
55
|
+
def _strip_all_comment_markers(self) -> None:
    """Delete every comment marker from the parsed ``document.xml`` tree.

    Removes ``w:commentRangeStart``, ``w:commentRangeEnd`` and
    ``w:commentReference`` elements. A run that has no children, no text
    and no tail after losing its reference is removed as well. Does
    nothing when no document has been loaded.
    """
    tree = self._document_xml
    if tree is None:
        return

    # Range markers: detach each one from its parent.
    range_markers = tree.findall(".//w:commentRangeStart", NS)
    range_markers += tree.findall(".//w:commentRangeEnd", NS)
    for marker in range_markers:
        holder = marker.getparent()
        if holder is not None:
            holder.remove(marker)

    # References sit inside a w:r; drop the reference, then the run if it is
    # left completely empty.
    for reference in tree.findall(".//w:commentReference", NS):
        run = reference.getparent()
        if run is None:
            continue
        run.remove(reference)

        run_is_empty = len(run) == 0 and run.text is None and not run.tail
        if not run_is_empty:
            continue
        run_parent = run.getparent()
        if run_parent is not None:
            run_parent.remove(run)
|
|
81
|
+
|
|
82
|
+
def _load_existing_comments(self):
    """Load the comments stored in ``word/comments.xml``, if the part exists.

    Every child of the comments root that carries a ``w:id`` attribute is
    recorded in ``self._comments`` as ``(id, text, author)``. The id
    counter is then set to one past the largest id seen, so ids handed out
    later do not collide with loaded ones. A package without
    ``word/comments.xml`` is left untouched.

    Raises:
        ValueError: If a ``w:id`` attribute is not an integer.
    """
    # Only the archive lookup signals "no comments part" through KeyError, so
    # the try is limited to it; a KeyError from anything else must surface.
    try:
        comments_xml = self._zip.read("word/comments.xml")
    except KeyError:
        return

    id_attr = f"{{{NS['w']}}}id"
    author_attr = f"{{{NS['w']}}}author"

    max_id = -1
    for comment in etree.fromstring(comments_xml):
        comment_id_str = comment.get(id_attr)
        if not comment_id_str:
            # No id means nothing in the document body can refer to this
            # entry, and there is no id to store it under; skip it.
            continue

        comment_id = int(comment_id_str)
        max_id = max(max_id, comment_id)

        author = comment.get(author_attr, "")
        text = self._extract_comment_text(comment)
        self._comments.append((comment_id, text, author))

    # Next id to hand out (0 when the part held no usable comments).
    self._comment_id_counter = max_id + 1
|
|
106
|
+
|
|
107
|
+
def _extract_comment_text(self, comment_element: etree._Element) -> str:
    """Return the plain text of a ``w:comment`` element.

    Paragraphs (``w:p``) are joined with ``"\\n"``. Inside each run,
    ``w:t`` contributes its text, ``w:br`` a newline and ``w:tab`` a tab;
    other run children are ignored.
    """
    substitutes = {"br": "\n", "tab": "\t"}
    paragraphs: list[str] = []

    for para in comment_element.findall(".//w:p", NS):
        pieces: list[str] = []
        for run in para.findall(".//w:r", NS):
            for node in run:
                kind = etree.QName(node.tag).localname
                if kind == "t":
                    pieces.append(node.text or "")
                elif kind in substitutes:
                    pieces.append(substitutes[kind])
        paragraphs.append("".join(pieces))

    return "\n".join(paragraphs)
|
|
130
|
+
|
|
131
|
+
def blocks(self) -> tuple[Paragraph | Table, ...]:
    """Return the body's top-level paragraphs and tables in document order.

    Direct children of ``w:body`` named ``p`` become ``Paragraph`` objects
    and those named ``tbl`` become ``Table`` objects; anything else is
    skipped. An empty tuple is returned when no body element was found.
    """
    wrappers = {"p": Paragraph, "tbl": Table}
    with self._lock:
        if self._body is None:
            return ()
        return tuple(
            wrappers[name](node, self)
            for node in self._body
            if (name := etree.QName(node.tag).localname) in wrappers
        )
|
|
145
|
+
|
|
146
|
+
def add_comment(self, text: str, author: str = "docxnote") -> int:
    """Register a comment and return the id assigned to it.

    Args:
        text: Comment body.
        author: Name stored as the comment's author.

    Returns:
        The id of the new comment; ids are handed out sequentially.
    """
    with self._lock:
        new_id = self._comment_id_counter
        self._comments.append((new_id, text, author))
        self._comment_id_counter = new_id + 1
    return new_id
|
|
153
|
+
|
|
154
|
+
def render(self) -> bytes:
    """Serialize the document to DOCX bytes.

    The document lock is held while the package is built.

    Returns:
        The bytes of the generated ``.docx`` file.
    """
    with self._lock:
        docx_bytes = self._render_unlocked()
    return docx_bytes
|
|
158
|
+
|
|
159
|
+
def _render_unlocked(self) -> bytes:
    """Build the output DOCX archive; the caller must already hold the lock.

    Every entry of the source archive is copied except the parts that are
    regenerated here: ``word/document.xml`` always, ``word/comments.xml``
    whenever the output needs a comments part, and the relationships and
    content-types parts when there are comments to register.

    Returns:
        The bytes of the new ``.docx`` package.
    """
    comments_part = "word/comments.xml"
    source_names = self._zip.namelist()
    has_comments = bool(self._comments)

    # When there are no comments, the relationships and content-types parts
    # are copied unchanged. If the source package had a comments part, those
    # copies still point at it, so an empty comments part is written instead
    # of dropping the file and leaving a dangling reference.
    write_comments_part = has_comments or comments_part in source_names

    # Parts produced here, in the order they are appended to the archive.
    regenerated: dict[str, bytes] = {
        "word/document.xml": etree.tostring(
            self._document_xml,
            xml_declaration=True,
            encoding="UTF-8",
            standalone=True,
        )
    }
    if write_comments_part:
        regenerated[comments_part] = self._build_comments_xml()
    if has_comments:
        regenerated["word/_rels/document.xml.rels"] = self._prepare_rels()
        regenerated["[Content_Types].xml"] = self._prepare_content_types()

    output = io.BytesIO()
    with zipfile.ZipFile(output, "w", zipfile.ZIP_DEFLATED) as out_zip:
        # Copy everything that is not being replaced.
        for item in source_names:
            if item not in regenerated:
                out_zip.writestr(item, self._zip.read(item))
        for name, data in regenerated.items():
            out_zip.writestr(name, data)

    return output.getvalue()
|
|
199
|
+
|
|
200
|
+
def _build_comments_xml(self) -> bytes:
    """Serialize ``self._comments`` as the ``word/comments.xml`` part.

    Each ``(id, text, author)`` entry becomes a ``w:comment`` element. The
    text is split on ``"\\n"`` into one ``w:p`` per line. Each paragraph
    holds a single run in which tab characters are written as ``w:tab``
    elements and the text between them as ``w:t`` elements.

    Returns:
        The XML document as UTF-8 bytes, including the XML declaration.
    """
    w = NS["w"]
    xml_space = "{http://www.w3.org/XML/1998/namespace}space"

    def add_text(run, segment: str) -> None:
        # Append a w:t holding ``segment``; mark it xml:space="preserve" when
        # it starts or ends with a space.
        node = etree.SubElement(run, f"{{{w}}}t")
        if segment[:1] == " " or segment[-1:] == " ":
            node.set(xml_space, "preserve")
        node.text = segment

    root = etree.Element(f"{{{w}}}comments", nsmap=NS)

    for comment_id, text, author in self._comments:
        comment = etree.SubElement(
            root,
            f"{{{w}}}comment",
            attrib={
                f"{{{w}}}id": str(comment_id),
                f"{{{w}}}author": author,
                # NOTE(review): every comment is stamped with this fixed
                # date, including comments loaded from an existing file.
                f"{{{w}}}date": "2024-01-01T00:00:00Z",
                f"{{{w}}}initials": author[0].upper() if author else "D",
            },
        )

        # str.split always yields at least one item, so an empty text still
        # produces one (empty) paragraph.
        for line in text.split("\n"):
            paragraph = etree.SubElement(comment, f"{{{w}}}p")
            run = etree.SubElement(paragraph, f"{{{w}}}r")

            segments = line.split("\t")
            for index, segment in enumerate(segments):
                if index:
                    # One w:tab for each tab character between segments.
                    etree.SubElement(run, f"{{{w}}}tab")
                # A tab-free line always gets a w:t, even when empty; next to
                # tabs, empty segments produce no element.
                if segment or len(segments) == 1:
                    add_text(run, segment)

    return etree.tostring(
        root, xml_declaration=True, encoding="UTF-8", standalone=True
    )
|
|
261
|
+
|
|
262
|
+
def _prepare_rels(self) -> bytes:
    """Return ``word/_rels/document.xml.rels`` with a link to ``comments.xml``.

    The existing relationships part is reused when the package has one;
    otherwise an empty ``Relationships`` root is created. A comments
    relationship is appended only if no existing entry has the comments
    relationship type. Its id is ``rId`` followed by one more than the
    highest numeric ``rId`` already in use.

    Returns:
        The relationships XML as UTF-8 bytes, including the XML declaration.
    """
    rels_ns = "http://schemas.openxmlformats.org/package/2006/relationships"
    comments_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"

    try:
        rels_xml = etree.fromstring(self._zip.read("word/_rels/document.xml.rels"))
    except KeyError:
        # No relationships part in the package: start a fresh one. lxml names
        # the default namespace with the key None (an empty-string prefix is
        # rejected), and the tag itself must carry the namespace.
        rels_xml = etree.Element(
            f"{{{rels_ns}}}Relationships", nsmap={None: rels_ns}
        )

    already_linked = any(rel.get("Type") == comments_type for rel in rels_xml)
    if not already_linked:
        highest = 0
        for rel in rels_xml:
            # ``or ""`` also covers children that have no attributes at all.
            rel_id = rel.get("Id") or ""
            if not rel_id.startswith("rId"):
                continue
            try:
                highest = max(highest, int(rel_id[3:]))
            except ValueError:
                # Ids with a non-numeric suffix are ignored when numbering.
                pass

        # Create the child in the same namespace as the root so it serializes
        # correctly whether the part uses a default namespace or a prefix.
        root_ns = etree.QName(rels_xml).namespace
        child_tag = f"{{{root_ns}}}Relationship" if root_ns else "Relationship"
        etree.SubElement(
            rels_xml,
            child_tag,
            attrib={
                "Id": f"rId{highest + 1}",
                "Type": comments_type,
                "Target": "comments.xml",
            },
        )

    return etree.tostring(rels_xml, xml_declaration=True, encoding="UTF-8")
|
|
311
|
+
|
|
312
|
+
def _prepare_content_types(self) -> bytes:
    """Return ``[Content_Types].xml`` with an entry for ``/word/comments.xml``.

    If no child of the root already has ``PartName="/word/comments.xml"``,
    an ``Override`` element declaring the WordprocessingML comments content
    type is appended.

    Returns:
        The content-types XML as UTF-8 bytes, including the XML declaration.

    Raises:
        KeyError: If the package has no ``[Content_Types].xml`` entry.
    """
    part_name = "/word/comments.xml"
    ct_xml = etree.fromstring(self._zip.read("[Content_Types].xml"))

    already_declared = any(
        child.get("PartName") == part_name for child in ct_xml
    )
    if not already_declared:
        # Use the root's default namespace, falling back to the standard
        # content-types namespace when the root declares none.
        ns = ct_xml.nsmap.get(
            None, "http://schemas.openxmlformats.org/package/2006/content-types"
        )
        ct_xml.append(
            etree.Element(
                f"{{{ns}}}Override",
                attrib={
                    "PartName": part_name,
                    "ContentType": "application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml",
                },
            )
        )

    return etree.tostring(ct_xml, xml_declaration=True, encoding="UTF-8")
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""段落处理"""
|
|
2
|
+
|
|
3
|
+
from lxml import etree
|
|
4
|
+
from .namespaces import NS
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Paragraph:
|
|
8
|
+
"""表示 Word 段落"""
|
|
9
|
+
|
|
10
|
+
def __init__(self, element, document):
|
|
11
|
+
self._element = element
|
|
12
|
+
self._document = document
|
|
13
|
+
self._text_cache = None
|
|
14
|
+
|
|
15
|
+
@property
def text(self) -> str:
    """The paragraph's text, computed on first access and then cached.

    Runs are read in document order: ``w:t`` contributes its text,
    ``w:br`` a ``"\\n"`` and ``w:tab`` a ``"\\t"``; other run children are
    ignored.
    """
    substitutes = {"br": "\n", "tab": "\t"}
    with self._document._lock:
        if self._text_cache is None:
            pieces = []
            for run in self._element.findall(".//w:r", NS):
                # Walk the run's children so the pieces keep their order.
                for node in run:
                    kind = etree.QName(node.tag).localname
                    if kind == "t":
                        if node.text:
                            pieces.append(node.text)
                    elif kind in substitutes:
                        pieces.append(substitutes[kind])
            self._text_cache = "".join(pieces)
        return self._text_cache
|
|
40
|
+
|
|
41
|
+
def comment(
|
|
42
|
+
self,
|
|
43
|
+
text: str,
|
|
44
|
+
start: int = 0,
|
|
45
|
+
end: int | None = None,
|
|
46
|
+
*,
|
|
47
|
+
author: str = "docxnote",
|
|
48
|
+
):
|
|
49
|
+
"""为段落文本范围添加批注"""
|
|
50
|
+
with self._document._lock:
|
|
51
|
+
if end is None:
|
|
52
|
+
end = len(self.text)
|
|
53
|
+
|
|
54
|
+
# 获取批注 ID
|
|
55
|
+
comment_id = self._document.add_comment(text, author)
|
|
56
|
+
|
|
57
|
+
# 在段落中插入批注标记
|
|
58
|
+
self._insert_comment_markers(comment_id, start, end)
|
|
59
|
+
|
|
60
|
+
def _insert_comment_markers(self, comment_id: int, start: int, end: int):
|
|
61
|
+
"""在指定位置插入批注起止标记"""
|
|
62
|
+
runs = list(self._element.findall(".//w:r", NS))
|
|
63
|
+
if not runs:
|
|
64
|
+
return
|
|
65
|
+
|
|
66
|
+
# 计算字符位置到 run 的映射
|
|
67
|
+
run_positions = []
|
|
68
|
+
current_pos = 0
|
|
69
|
+
|
|
70
|
+
for run in runs:
|
|
71
|
+
run_start = current_pos
|
|
72
|
+
run_text = ""
|
|
73
|
+
for t in run.findall(".//w:t", NS):
|
|
74
|
+
if t.text:
|
|
75
|
+
run_text += t.text
|
|
76
|
+
run_end = current_pos + len(run_text)
|
|
77
|
+
run_positions.append((run, run_start, run_end, run_text))
|
|
78
|
+
current_pos = run_end
|
|
79
|
+
|
|
80
|
+
# 找到需要分割的 run
|
|
81
|
+
start_run_idx = None
|
|
82
|
+
end_run_idx = None
|
|
83
|
+
|
|
84
|
+
for idx, (run, run_start, run_end, run_text) in enumerate(run_positions):
|
|
85
|
+
if start_run_idx is None and run_start <= start < run_end:
|
|
86
|
+
start_run_idx = idx
|
|
87
|
+
if end_run_idx is None and run_start < end <= run_end:
|
|
88
|
+
end_run_idx = idx
|
|
89
|
+
|
|
90
|
+
if start_run_idx is None or end_run_idx is None:
|
|
91
|
+
return
|
|
92
|
+
|
|
93
|
+
# 分割 run 并插入标记
|
|
94
|
+
self._split_and_mark(
|
|
95
|
+
run_positions, start_run_idx, end_run_idx, start, end, comment_id
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
def _split_and_mark(
|
|
99
|
+
self, run_positions, start_idx, end_idx, start, end, comment_id
|
|
100
|
+
):
|
|
101
|
+
"""分割 run 并插入批注标记"""
|
|
102
|
+
# 简化实现:在第一个 run 前插入开始标记,在最后一个 run 后插入结束标记
|
|
103
|
+
start_run, start_pos, _, _ = run_positions[start_idx]
|
|
104
|
+
end_run, _, end_pos, _ = run_positions[end_idx]
|
|
105
|
+
|
|
106
|
+
# 创建批注范围开始标记
|
|
107
|
+
comment_start = etree.Element(
|
|
108
|
+
f"{{{NS['w']}}}commentRangeStart",
|
|
109
|
+
attrib={f"{{{NS['w']}}}id": str(comment_id)},
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
# 创建批注范围结束标记
|
|
113
|
+
comment_end = etree.Element(
|
|
114
|
+
f"{{{NS['w']}}}commentRangeEnd",
|
|
115
|
+
attrib={f"{{{NS['w']}}}id": str(comment_id)},
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# 创建批注引用
|
|
119
|
+
comment_ref_run = etree.Element(f"{{{NS['w']}}}r")
|
|
120
|
+
etree.SubElement(
|
|
121
|
+
comment_ref_run,
|
|
122
|
+
f"{{{NS['w']}}}commentReference",
|
|
123
|
+
attrib={f"{{{NS['w']}}}id": str(comment_id)},
|
|
124
|
+
)
|
|
125
|
+
|
|
126
|
+
# 插入标记
|
|
127
|
+
parent = self._element
|
|
128
|
+
|
|
129
|
+
# 查找 run 在父元素中的位置
|
|
130
|
+
try:
|
|
131
|
+
children = list(parent)
|
|
132
|
+
start_run_pos = children.index(start_run)
|
|
133
|
+
end_run_pos = children.index(end_run)
|
|
134
|
+
except ValueError:
|
|
135
|
+
# run 不是直接子元素,跳过
|
|
136
|
+
return
|
|
137
|
+
|
|
138
|
+
# 在开始 run 之前插入开始标记
|
|
139
|
+
parent.insert(start_run_pos, comment_start)
|
|
140
|
+
|
|
141
|
+
# 在结束 run 之后插入结束标记和引用(注意索引偏移)
|
|
142
|
+
parent.insert(end_run_pos + 2, comment_end)
|
|
143
|
+
parent.insert(end_run_pos + 3, comment_ref_run)
|
|
File without changes
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""表格和单元格处理"""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from lxml import etree
|
|
6
|
+
from .namespaces import NS
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class Table:
    """A Word table (``w:tbl``) addressable by ``(row, col)`` grid coordinates.

    Merged cells are expanded: every coordinate covered by a horizontal
    (``gridSpan``) or vertical (``vMerge``) merge resolves to the same origin
    ``Cell``.
    """

    def __init__(self, element, document):
        self._element = element
        self._document = document
        # Per row: the distinct cells appearing in that row, left to right.
        self._grid: list[list[Cell]] = []
        # Per row: one entry per grid column; merged coordinates share a Cell.
        self._matrix: list[list[Cell]] = []
        self._build_grid()

    @staticmethod
    def _cell_merge_info(tc) -> tuple[int, bool, bool]:
        """Return ``(colspan, continues_vmerge, restarts_vmerge)`` for a cell.

        A ``w:vMerge`` element whose ``w:val`` is ``"restart"`` starts a
        vertical merge; a ``w:vMerge`` with any other value, or none at all,
        continues the merge from the row above.
        """
        colspan = 1
        continues = False
        restarts = False

        tc_pr = tc.find("./w:tcPr", NS)
        if tc_pr is None:
            return colspan, continues, restarts

        val_key = f"{{{NS['w']}}}val"

        gridspan = tc_pr.find("./w:gridSpan", NS)
        if gridspan is not None:
            raw = gridspan.get(val_key)
            if raw:
                try:
                    # A span below 1 would make the cell vanish from the grid.
                    colspan = max(1, int(raw))
                except ValueError:
                    # Malformed value: fall back to a single column.
                    colspan = 1

        vmerge = tc_pr.find("./w:vMerge", NS)
        if vmerge is not None:
            restarts = vmerge.get(val_key) == "restart"
            continues = not restarts

        return colspan, continues, restarts

    def _build_grid(self):
        """Build ``_grid`` and ``_matrix`` from the table's rows and cells."""
        self._grid = []
        self._matrix = []

        # Direct child rows only, so rows of nested tables are not picked up.
        rows = self._element.findall("./w:tr", NS)
        if not rows:
            return

        row_maps: list[dict[int, Cell]] = []
        # column -> origin cell of the vertical merge currently open there
        active_vmerge: dict[int, Cell] = {}
        max_cols = 0

        for r_idx, row in enumerate(rows):
            row_map: dict[int, Cell] = {}
            col_idx = 0

            # Direct child cells only.
            for tc in row.findall("./w:tc", NS):
                colspan, continues, restarts = self._cell_merge_info(tc)
                span = range(col_idx, col_idx + colspan)

                origin = active_vmerge.get(col_idx) if continues else None
                if origin is not None:
                    # Continuation of an open vertical merge.
                    origin._grow_rowspan_to(r_idx + 1)
                else:
                    origin = Cell(tc, self._document, r_idx, col_idx, colspan)
                    if not continues:
                        # A fresh cell ends any merge open in its columns.
                        for c in span:
                            active_vmerge.pop(c, None)
                        if restarts:
                            for c in span:
                                active_vmerge[c] = origin

                for c in span:
                    row_map[c] = origin
                col_idx += colspan

            # Columns still under an open vertical merge but with no cell in
            # this row are attributed to the merge's origin.
            for c, origin in active_vmerge.items():
                if c not in row_map:
                    origin._grow_rowspan_to(r_idx + 1)
                    row_map[c] = origin

            if row_map:
                max_cols = max(max_cols, max(row_map) + 1)
            row_maps.append(row_map)

        for r_idx, row_map in enumerate(row_maps):
            grid_row: list[Cell] = []
            matrix_row: list[Cell] = []
            seen: set[int] = set()
            for c in range(max_cols):
                cell = row_map.get(c)
                if cell is None:
                    # No cell at this coordinate: fill with a placeholder.
                    matrix_row.append(Cell(None, self._document, r_idx, c, 1))
                    continue
                matrix_row.append(cell)
                if id(cell) not in seen:
                    seen.add(id(cell))
                    grid_row.append(cell)
            self._grid.append(grid_row)
            self._matrix.append(matrix_row)

    def shape(self) -> tuple[int, int]:
        """Return the table size as ``(rows, cols)``; ``(0, 0)`` if empty."""
        if not self._matrix:
            return (0, 0)
        return (len(self._matrix), len(self._matrix[0]))

    def __getitem__(self, key: tuple[int, int]) -> "Cell":
        """Return the cell at ``(row, col)``.

        Coordinates inside a merged area return the merge's origin cell.
        Coordinates outside the grid (including negative ones) return a
        placeholder ``Cell`` with no underlying element instead of raising.
        """
        row, col = key
        matrix = self._matrix
        if 0 <= row < len(matrix) and 0 <= col < len(matrix[row]):
            return matrix[row][col]
        return Cell(None, self._document, row, col, 1)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class Cell:
    """A table cell identified by its grid position.

    ``element`` is the underlying cell element, or ``None`` for a placeholder
    cell that has no content.
    """

    def __init__(self, element, document, row: int, col: int, colspan: int = 1):
        self._element, self._document = element, document
        self._row, self._col = row, col
        self._colspan = colspan
        # Every cell starts one row tall; vertical merges enlarge it later.
        self._rowspan = 1

    def _grow_rowspan_to(self, bottom_exclusive: int) -> None:
        """Extend the row span so it reaches ``bottom_exclusive`` (never shrinks)."""
        wanted = bottom_exclusive - self._row
        if wanted > self._rowspan:
            self._rowspan = wanted

    def blocks(self) -> tuple:
        """Return the cell's block-level children as a tuple.

        Paragraph children become ``Paragraph`` objects and nested tables
        become ``Table`` objects, in document order; other children are
        ignored. A placeholder cell yields an empty tuple.
        """
        with self._document._lock:
            if self._element is None:
                return ()

            # Imported here rather than at module level (kept as in the
            # original; presumably to avoid a circular import).
            from .paragraph import Paragraph

            found: list = []
            for node in self._element:
                kind = etree.QName(node.tag).localname
                if kind == "tbl":
                    found.append(Table(node, self._document))
                elif kind == "p":
                    found.append(Paragraph(node, self._document))
            return tuple(found)

    def bounds(self) -> tuple[int, int, int, int]:
        """Return ``(top, left, bottom, right)``; bottom/right are exclusive.

        A placeholder cell always covers exactly one grid position.
        """
        top, left = self._row, self._col
        if self._element is None:
            height = width = 1
        else:
            height, width = self._rowspan, self._colspan
        return (top, left, top + height, left + width)
|