offagent 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- offagent/__init__.py +3 -0
- offagent/__main__.py +5 -0
- offagent/adapters/__init__.py +1 -0
- offagent/adapters/docx_adapter.py +1237 -0
- offagent/adapters/embedding_provider.py +132 -0
- offagent/adapters/pptx_adapter.py +940 -0
- offagent/adapters/xlsx_adapter.py +1266 -0
- offagent/app/__init__.py +1 -0
- offagent/app/progress.py +52 -0
- offagent/app/services.py +4267 -0
- offagent/config.py +287 -0
- offagent/domain/__init__.py +1 -0
- offagent/domain/locators.py +444 -0
- offagent/domain/models.py +477 -0
- offagent/domain/text_fragments.py +136 -0
- offagent/errors.py +29 -0
- offagent/indexing/__init__.py +1 -0
- offagent/indexing/store.py +795 -0
- offagent/interfaces/__init__.py +1 -0
- offagent/interfaces/cli.py +438 -0
- offagent/interfaces/cli_output.py +139 -0
- offagent/interfaces/cli_progress.py +120 -0
- offagent/interfaces/mcp.py +1145 -0
- offagent/interfaces/mcp_converters.py +80 -0
- offagent/interfaces/mcp_models.py +923 -0
- offagent/objects/__init__.py +3 -0
- offagent/objects/base.py +26 -0
- offagent/objects/docx_objects.py +951 -0
- offagent/objects/pptx_objects.py +895 -0
- offagent/objects/xlsx_objects.py +962 -0
- offagent/path_policy.py +42 -0
- offagent/storage/__init__.py +1 -0
- offagent/storage/versioning.py +31 -0
- offagent-0.10.0.dist-info/METADATA +546 -0
- offagent-0.10.0.dist-info/RECORD +39 -0
- offagent-0.10.0.dist-info/WHEEL +5 -0
- offagent-0.10.0.dist-info/entry_points.txt +2 -0
- offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
- offagent-0.10.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,1237 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from offagent.domain.locators import parse_locator, to_v2_locator
|
|
7
|
+
from offagent.domain.models import (
|
|
8
|
+
BlockStyle,
|
|
9
|
+
BlockBundle,
|
|
10
|
+
DocxRun,
|
|
11
|
+
DocxParagraph,
|
|
12
|
+
DocxTableCell,
|
|
13
|
+
DocxTable,
|
|
14
|
+
DocumentBlock,
|
|
15
|
+
DocumentRef,
|
|
16
|
+
InlineFragment,
|
|
17
|
+
InlineStyle,
|
|
18
|
+
IndexedItem,
|
|
19
|
+
SectionPayload,
|
|
20
|
+
StructureSection,
|
|
21
|
+
TextContainerSnapshot,
|
|
22
|
+
VisibleTextRange,
|
|
23
|
+
)
|
|
24
|
+
from offagent.domain.text_fragments import (
|
|
25
|
+
apply_style_to_range,
|
|
26
|
+
fragment_text,
|
|
27
|
+
normalize_fragments,
|
|
28
|
+
)
|
|
29
|
+
from offagent.errors import (
|
|
30
|
+
InvalidArgumentsError,
|
|
31
|
+
TargetNotEditableError,
|
|
32
|
+
TargetNotFoundError,
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
from docx import Document
|
|
37
|
+
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_COLOR_INDEX
|
|
38
|
+
from docx.oxml.table import CT_Tbl
|
|
39
|
+
from docx.oxml.text.paragraph import CT_P
|
|
40
|
+
from docx.shared import Pt, RGBColor
|
|
41
|
+
from docx.table import Table
|
|
42
|
+
from docx.text.paragraph import Paragraph
|
|
43
|
+
from docx.text.run import Run
|
|
44
|
+
except ModuleNotFoundError: # pragma: no cover - exercised through dependency checks
|
|
45
|
+
Document = None
|
|
46
|
+
WD_ALIGN_PARAGRAPH = None
|
|
47
|
+
WD_COLOR_INDEX = None
|
|
48
|
+
CT_Tbl = None
|
|
49
|
+
CT_P = None
|
|
50
|
+
Pt = None
|
|
51
|
+
RGBColor = None
|
|
52
|
+
Table = None
|
|
53
|
+
Paragraph = None
|
|
54
|
+
Run = None
|
|
55
|
+
|
|
56
|
+
RunFormatting = InlineStyle
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass(frozen=True)
class ResolvedParagraphTarget:
    """Immutable record describing a paragraph that a locator resolved to."""

    # Position of the paragraph among all body blocks (paragraphs and tables).
    block_index: int
    # Position of the paragraph counting paragraph blocks only.
    paragraph_index: int
    # The resolved paragraph object itself.
    paragraph: Paragraph
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@dataclass(frozen=True)
class ResolvedTableCellTarget:
    """Immutable record describing a table cell that a locator resolved to."""

    # Position of the owning table among all body blocks.
    block_index: int
    # Position of the owning table counting table blocks only.
    table_index: int
    # Row and column of the addressed cell inside the table.
    row_index: int
    column_index: int
    # The table object that contains the cell.
    table: Table
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
ResolvedTarget = ResolvedParagraphTarget | ResolvedTableCellTarget
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def extract_document(document_path: Path) -> list[IndexedItem]:
    """Return one ``IndexedItem`` per paragraph yielded by ``get_paragraphs``.

    The legacy ``para:<paragraph_index>`` string is used both as the item id
    and as the locator of every item.
    """

    def _to_item(entry) -> IndexedItem:
        legacy_locator = f"para:{entry.paragraph_index}"
        return IndexedItem(
            item_id=legacy_locator,
            item_type="paragraph",
            locator=legacy_locator,
            preview=entry.preview,
            content_text=entry.text,
            metadata={
                "paragraph_index": entry.paragraph_index,
                "block_index": entry.block_index,
                "style_name": entry.style_name,
                "is_heading": entry.is_heading,
            },
        )

    return [_to_item(entry) for entry in get_paragraphs(document_path)]
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def build_embedding_text(item: IndexedItem, document_path: Path) -> str:
    """Return the text to embed for *item*, which is its ``content_text``.

    ``document_path`` is part of the signature but is not consulted here.
    """
    _ = document_path  # deliberately unused
    return item.content_text
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def read_paragraph(document_path: Path, item_id: str) -> str:
    """Open the document and return the text of the paragraph named by *item_id*.

    *item_id* is a legacy ``para:<index>`` id; validation and lookup errors
    come from ``_resolve_paragraph``.
    """
    document = _open_document(document_path)
    return _resolve_paragraph(document, item_id).text
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def replace_paragraph(
    document_path: Path, item_id: str, text: str, output_path: Path | None = None
) -> Path:
    """Replace the whole content of one paragraph with *text* and save.

    The formatting captured from the paragraph's first run (or from ``None``
    when it has no runs) is re-applied to the single replacement run.

    Returns the path the document was saved to, as chosen by ``_target_path``.
    """
    document = _open_document(document_path)
    paragraph = _resolve_paragraph(document, item_id)

    # Remember the look of the leading run before the paragraph is emptied.
    existing_runs = paragraph.runs
    leading_run = existing_runs[0] if existing_runs else None
    captured = _capture_run_formatting(leading_run)

    _clear_paragraph(paragraph)
    _apply_run_formatting(paragraph.add_run(text), captured)

    destination = _target_path(document_path, output_path)
    document.save(destination)
    return destination
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def append_paragraph(
    document_path: Path, item_id: str, text: str, output_path: Path | None = None
) -> Path:
    """Append *text* to the end of an existing paragraph and save.

    The text is concatenated onto the paragraph's last run; a paragraph
    without runs receives a new run instead.

    Returns the path the document was saved to, as chosen by ``_target_path``.
    """
    document = _open_document(document_path)
    paragraph = _resolve_paragraph(document, item_id)

    runs = paragraph.runs
    if not runs:
        paragraph.add_run(text)
    else:
        trailing_run = runs[-1]
        trailing_run.text = f"{trailing_run.text}{text}"

    destination = _target_path(document_path, output_path)
    document.save(destination)
    return destination
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def make_table_cell_locator(table_index: int, row_index: int, column_index: int) -> str:
    """Format a legacy ``table:<t>:cell:<r>:<c>`` locator string."""
    return "table:{}:cell:{}:{}".format(table_index, row_index, column_index)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def parse_table_cell_locator(locator: str) -> tuple[int, int, int]:
    """Split a ``table:<t>:cell:<r>:<c>`` locator into its three integers.

    Returns:
        ``(table_index, row_index, column_index)``.

    Raises:
        InvalidArgumentsError: if the locator does not have exactly five
            colon-separated parts with ``table`` and ``cell`` in the expected
            positions, or if any index part is not an integer.
    """
    segments = locator.split(":")
    well_formed = (
        len(segments) == 5 and segments[0] == "table" and segments[2] == "cell"
    )
    if not well_formed:
        raise InvalidArgumentsError(f"Unsupported DOCX table cell locator: {locator}")

    try:
        table_index, row_index, column_index = (
            int(segments[position]) for position in (1, 3, 4)
        )
    except ValueError as exc:
        raise InvalidArgumentsError(
            f"Invalid DOCX table cell locator: {locator}"
        ) from exc
    return table_index, row_index, column_index
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def resolve_structure(document_path: Path) -> tuple[StructureSection, ...]:
    """Describe every block yielded by ``_iter_blocks`` as a ``StructureSection``.

    Paragraph blocks get a ``para:<n>`` locator. Any other block is treated as
    a table and is addressed through the locator of its ``0, 0`` cell.
    Paragraphs and tables are numbered independently, in iteration order.
    """
    document = _open_document(document_path)
    outline: list[StructureSection] = []

    paragraphs_seen = 0
    tables_seen = 0
    for position, (kind, node) in enumerate(_iter_blocks(document)):
        if kind == "paragraph":
            model = _paragraph_model(node, position, paragraphs_seen)
            entry = StructureSection(
                locator=f"para:{paragraphs_seen}",
                section_type="paragraph",
                preview=model.preview,
                metadata={
                    "block_index": position,
                    "block_type": "paragraph",
                    "paragraph_index": paragraphs_seen,
                    "style_name": model.style_name,
                    "is_heading": model.is_heading,
                },
            )
            paragraphs_seen += 1
        else:
            model = _table_model(node, position, tables_seen)
            # Rows may differ in length; report the widest one (0 if no rows).
            widest_row = max((len(row) for row in model.rows), default=0)
            entry = StructureSection(
                locator=make_table_cell_locator(tables_seen, 0, 0),
                section_type="table",
                preview=model.preview,
                metadata={
                    "block_index": position,
                    "block_type": "table",
                    "table_index": tables_seen,
                    "row_count": len(model.rows),
                    "column_count": widest_row,
                },
            )
            tables_seen += 1
        outline.append(entry)

    return tuple(outline)
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def get_section(document_path: Path, locator: str) -> SectionPayload:
    """Resolve *locator* and return the full payload for its paragraph or table.

    A paragraph locator yields a paragraph payload with text, style name,
    heading flag and per-run models. A table-cell locator yields a payload
    for the whole owning table, addressed by the locator of its ``0, 0`` cell
    and listing every cell.
    """
    document = _open_document(document_path)
    target = _resolve_locator(document, locator)
    source_ref = _document_ref(document_path)

    if isinstance(target, ResolvedParagraphTarget):
        model = _paragraph_model(
            target.paragraph, target.block_index, target.paragraph_index
        )
        return SectionPayload(
            document=source_ref,
            locator=f"para:{target.paragraph_index}",
            section_type="paragraph",
            preview=model.preview,
            metadata={
                "block_index": target.block_index,
                "block_type": "paragraph",
                "paragraph_index": target.paragraph_index,
            },
            block_type="paragraph",
            text=model.text,
            style_name=model.style_name,
            is_heading=model.is_heading,
            runs=tuple(_run_model(run) for run in target.paragraph.runs),
        )

    # Anything that is not a paragraph target is a table-cell target.
    model = _table_model(target.table, target.block_index, target.table_index)
    cell_entries: list[DocxTableCell] = []
    for row_index, row in enumerate(target.table.rows):
        for column_index, cell in enumerate(row.cells):
            cell_entries.append(
                DocxTableCell(
                    locator=make_table_cell_locator(
                        target.table_index, row_index, column_index
                    ),
                    row_index=row_index,
                    column_index=column_index,
                    text=cell.text,
                    metadata={},
                )
            )

    return SectionPayload(
        document=source_ref,
        locator=make_table_cell_locator(target.table_index, 0, 0),
        section_type="table",
        preview=model.preview,
        metadata={
            "block_index": target.block_index,
            "block_type": "table",
            "table_index": target.table_index,
            "row_count": len(model.rows),
            "column_count": max((len(row) for row in model.rows), default=0),
        },
        block_type="table",
        rows=model.rows,
        table_cells=tuple(cell_entries),
    )
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def read_node(document_path: Path, locator: str) -> tuple[str, str, dict[str, object]]:
    """Read the text of the paragraph or table cell addressed by *locator*.

    Returns:
        ``(node_type, text, metadata)`` where ``node_type`` is ``"paragraph"``
        or ``"table_cell"`` and ``metadata`` carries the resolved indices
        (plus style information for paragraphs).
    """
    document = _open_document(document_path)
    target = _resolve_locator(document, locator)

    if not isinstance(target, ResolvedParagraphTarget):
        addressed_cell = target.table.rows[target.row_index].cells[target.column_index]
        cell_details = {
            "block_index": target.block_index,
            "table_index": target.table_index,
            "row_index": target.row_index,
            "column_index": target.column_index,
        }
        return "table_cell", addressed_cell.text, cell_details

    model = _paragraph_model(
        target.paragraph, target.block_index, target.paragraph_index
    )
    paragraph_details = {
        "block_index": target.block_index,
        "paragraph_index": target.paragraph_index,
        "style_name": model.style_name,
        "is_heading": model.is_heading,
    }
    return "paragraph", model.text, paragraph_details
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def write_node(
    document_path: Path, locator: str, text: str, output_path: Path | None = None
) -> Path:
    """Overwrite the paragraph or table cell addressed by *locator* with *text*.

    Paragraph targets are delegated to ``replace_paragraph`` (which opens and
    saves the document itself). Table-cell targets are written in place and
    saved here.

    Returns the path the document was saved to.
    """
    document = _open_document(document_path)
    target = _resolve_locator(document, locator)

    if isinstance(target, ResolvedParagraphTarget):
        legacy_id = f"para:{target.paragraph_index}"
        return replace_paragraph(document_path, legacy_id, text, output_path)

    addressed_cell = target.table.rows[target.row_index].cells[target.column_index]
    addressed_cell.text = text

    destination = _target_path(document_path, output_path)
    document.save(destination)
    return destination
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def read_paragraph_fragments(
    document_path: Path, locator: str
) -> TextContainerSnapshot:
    """Return a snapshot of the inline fragments of one paragraph.

    Raises:
        InvalidArgumentsError: if the canonicalised locator is not a
            three-component ``docx`` / ``para`` / ``<index>`` locator.
    """
    document = _open_document(document_path)
    canonical, components = _canonical_docx_locator(locator)

    addresses_paragraph = len(components) == 3 and components[:2] == ("docx", "para")
    if not addresses_paragraph:
        raise InvalidArgumentsError("DOCX fragment reads require a paragraph locator.")

    index_component = components[2]
    paragraph = _resolve_paragraph(document, f"para:{index_component}")
    fragments = _read_docx_paragraph_fragments(paragraph)

    return TextContainerSnapshot(
        locator=canonical,
        object_type="paragraph",
        text=fragment_text(fragments),
        fragments=fragments,
        metadata={"paragraph_index": int(index_component)},
    )
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def rewrite_paragraph_fragments(
    document_path: Path,
    locator: str,
    fragments: list[InlineFragment] | tuple[InlineFragment, ...],
    output_path: Path | None = None,
) -> tuple[Path, str, TextContainerSnapshot]:
    """Rewrite one paragraph from *fragments* and save the document.

    The fragments are normalised with ``normalize_fragments`` before being
    written. The returned snapshot is built from those normalised fragments,
    not re-read from the saved file.

    Returns:
        ``(saved_path, canonical_locator, snapshot)``.

    Raises:
        InvalidArgumentsError: if the canonicalised locator is not a
            three-component ``docx`` / ``para`` / ``<index>`` locator.
    """
    document = _open_document(document_path)
    canonical, components = _canonical_docx_locator(locator)

    addresses_paragraph = len(components) == 3 and components[:2] == ("docx", "para")
    if not addresses_paragraph:
        raise InvalidArgumentsError("DOCX fragment writes require a paragraph locator.")

    index_component = components[2]
    paragraph = _resolve_paragraph(document, f"para:{index_component}")
    cleaned = normalize_fragments(fragments)
    _rewrite_docx_paragraph(paragraph, cleaned)

    destination = _target_path(document_path, output_path)
    document.save(destination)

    result = TextContainerSnapshot(
        locator=canonical,
        object_type="paragraph",
        text=fragment_text(cleaned),
        fragments=cleaned,
        metadata={"paragraph_index": int(index_component)},
    )
    return destination, canonical, result
|
|
364
|
+
|
|
365
|
+
|
|
366
|
+
def insert_paragraph(
    document_path: Path,
    text: str,
    *,
    style_name: str | None = None,
    after_locator: str | None = None,
    output_path: Path | None = None,
) -> tuple[Path, str]:
    """Insert a new paragraph and return ``(saved_path, legacy_locator)``.

    Without *after_locator* the paragraph is appended through
    ``append_paragraph_block`` and the locator points at the last paragraph of
    the saved file. With *after_locator* the new paragraph is placed directly
    after the resolved paragraph, or after the table owning the resolved cell.

    Raises:
        InvalidArgumentsError: if assigning *style_name* raises ``KeyError``
            or ``ValueError``.
    """
    if after_locator is None:
        destination, _ = append_paragraph_block(
            document_path,
            text,
            style_name=style_name,
            output_path=output_path,
        )
        total_paragraphs = len(get_paragraphs(destination))
        return destination, f"para:{total_paragraphs - 1}"

    document = _open_document(document_path)
    anchor = _resolve_locator(document, after_locator)
    if isinstance(anchor, ResolvedParagraphTarget):
        new_index = anchor.paragraph_index + 1
        anchor_element = anchor.paragraph._element
    else:
        # After a table, the new paragraph's index equals the number of
        # paragraphs that precede that table block.
        new_index = _paragraphs_before_block(document, anchor.block_index)
        anchor_element = anchor.table._element

    # Create the element in the body, then move it right after the anchor.
    fresh_element = document.element.body.add_p()
    anchor_element.addnext(fresh_element)
    inserted = Paragraph(fresh_element, document)
    inserted.add_run(text)

    if style_name is not None:
        try:
            inserted.style = style_name
        except (KeyError, ValueError) as exc:
            raise InvalidArgumentsError(
                f"Unknown DOCX paragraph style: {style_name}"
            ) from exc

    destination = _target_path(document_path, output_path)
    document.save(destination)
    return destination, f"para:{new_index}"
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def create_docx(output_path: Path) -> Path:
    """Save a new document built from the default template to *output_path*.

    Returns *output_path* unchanged.
    """
    _open_document_from_default_template().save(output_path)
    return output_path
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def add_paragraph(
    document_path: Path,
    text: str,
    output_path: Path | None = None,
) -> tuple[Path, str]:
    """Append a paragraph and return ``(saved_path, v2_locator)``.

    Delegates to ``insert_paragraph`` and converts the legacy locator it
    returns with ``to_v2_locator(..., file_type="docx")``.
    """
    destination, legacy = insert_paragraph(document_path, text, output_path=output_path)
    upgraded = to_v2_locator(legacy, file_type="docx")
    return destination, upgraded
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def add_heading(
    document_path: Path,
    text: str,
    level: int,
    output_path: Path | None = None,
) -> tuple[Path, str]:
    """Add a heading via ``document.add_heading`` and save.

    Returns:
        ``(saved_path, locator)`` where the locator is ``docx:para:<n>`` and
        ``n`` is the index of the last paragraph block after the heading was
        added.

    Raises:
        InvalidArgumentsError: if *level* is outside ``1..9``.
    """
    if level > 9 or level < 1:
        raise InvalidArgumentsError("DOCX heading level must be between 1 and 9.")

    document = _open_document(document_path)
    document.add_heading(text, level=level)

    paragraph_kinds = [
        kind for kind, _ in _iter_blocks(document) if kind == "paragraph"
    ]
    heading_index = len(paragraph_kinds) - 1

    destination = _target_path(document_path, output_path)
    document.save(destination)
    return destination, f"docx:para:{heading_index}"
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def add_table(
    document_path: Path,
    rows: int,
    columns: int,
    output_path: Path | None = None,
) -> tuple[Path, str]:
    """Add a ``rows`` x ``columns`` table via ``document.add_table`` and save.

    Returns:
        ``(saved_path, locator)`` where the locator is ``docx:table:<n>`` and
        ``n`` is the number of table blocks that existed before the addition.

    Raises:
        InvalidArgumentsError: if *rows* or *columns* is smaller than 1.
    """
    if rows < 1 or columns < 1:
        raise InvalidArgumentsError("DOCX table rows and columns must be positive.")

    document = _open_document(document_path)

    # Count existing tables first: that count is the new table's index.
    existing_tables = [kind for kind, _ in _iter_blocks(document) if kind == "table"]
    new_table_index = len(existing_tables)

    document.add_table(rows=rows, cols=columns)

    destination = _target_path(document_path, output_path)
    document.save(destination)
    return destination, f"docx:table:{new_table_index}"
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
def style_run(
    document_path: Path,
    locator: str,
    style: InlineStyle,
    clear_fields: list[str] | tuple[str, ...],
    output_path: Path | None = None,
) -> tuple[Path, str, dict[str, object]]:
    """Apply inline *style* to a single run and save the document.

    Returns:
        ``(saved_path, canonical_locator, {"cleared_fields": ...})``.

    Raises:
        InvalidArgumentsError: if the canonicalised locator is not a
            five-component ``docx`` / ``para`` / ``<p>`` / ``run`` / ``<r>``
            locator.
        TargetNotFoundError: if the paragraph has no run at the given index.
    """
    document = _open_document(document_path)
    canonical, components = _canonical_docx_locator(locator)

    addresses_run = (
        len(components) == 5
        and components[:2] == ("docx", "para")
        and components[3] == "run"
    )
    if not addresses_run:
        raise InvalidArgumentsError("DOCX inline styling requires a run locator.")

    paragraph_component = components[2]
    paragraph = _resolve_paragraph(document, f"para:{paragraph_component}")
    run_index = _parse_int_component(components[4], locator)
    try:
        run = paragraph.runs[run_index]
    except IndexError as exc:
        raise TargetNotFoundError(
            f"Run {run_index} does not exist in paragraph {paragraph_component}."
        ) from exc

    cleared_fields = _normalize_clear_fields(clear_fields, _INLINE_STYLE_FIELDS)
    _apply_docx_inline_style(run, style, cleared_fields)

    destination = _target_path(document_path, output_path)
    document.save(destination)
    return destination, canonical, {"cleared_fields": cleared_fields}
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
def style_paragraph_range(
    document_path: Path,
    locator: str,
    text_range: VisibleTextRange,
    style: InlineStyle,
    clear_fields: list[str] | tuple[str, ...],
    output_path: Path | None = None,
) -> tuple[Path, str, dict[str, object]]:
    """Apply inline *style* to a text range inside one paragraph.

    The paragraph's fragments are read, restyled with
    ``apply_style_to_range`` and written back through
    ``rewrite_paragraph_fragments``.

    Returns:
        ``(saved_path, canonical_locator, details)`` where ``details`` holds
        the cleared fields, the range bounds and the rewritten text.
    """
    current = read_paragraph_fragments(document_path, locator)
    cleared_fields = _normalize_clear_fields(clear_fields, _INLINE_STYLE_FIELDS)
    restyled = apply_style_to_range(
        current.fragments, text_range, style=style, clear_fields=cleared_fields
    )
    destination, canonical, rewritten = rewrite_paragraph_fragments(
        document_path, locator, restyled, output_path=output_path
    )
    details = {
        "cleared_fields": cleared_fields,
        "range": {"start": text_range.start, "end": text_range.end},
        "text": rewritten.text,
    }
    return destination, canonical, details
|
|
532
|
+
|
|
533
|
+
|
|
534
|
+
def style_paragraph(
    document_path: Path,
    locator: str,
    style: BlockStyle,
    clear_fields: list[str] | tuple[str, ...],
    output_path: Path | None = None,
) -> tuple[Path, str, dict[str, object]]:
    """Apply block-level *style* to one paragraph and save the document.

    Returns:
        ``(saved_path, canonical_locator, details)`` where ``details`` holds
        the cleared fields and whatever ``_apply_docx_block_style`` reported
        as skipped fields.

    Raises:
        InvalidArgumentsError: if the canonicalised locator is not a
            three-component ``docx`` / ``para`` / ``<index>`` locator.
    """
    document = _open_document(document_path)
    canonical, components = _canonical_docx_locator(locator)

    addresses_paragraph = len(components) == 3 and components[:2] == ("docx", "para")
    if not addresses_paragraph:
        raise InvalidArgumentsError("DOCX block styling requires a paragraph locator.")

    paragraph = _resolve_paragraph(document, f"para:{components[2]}")
    cleared_fields = _normalize_clear_fields(clear_fields, _BLOCK_STYLE_FIELDS)
    skipped_fields = _apply_docx_block_style(paragraph, style, cleared_fields)

    destination = _target_path(document_path, output_path)
    document.save(destination)

    details = {"cleared_fields": cleared_fields, "skipped_fields": skipped_fields}
    return destination, canonical, details
|
|
556
|
+
|
|
557
|
+
|
|
558
|
+
def set_structural_role(
    document_path: Path,
    locator: str,
    role: str,
    level: int | None,
    output_path: Path | None = None,
) -> tuple[Path, str, dict[str, object]]:
    """Assign the paragraph style that corresponds to *role* / *level*.

    The style name comes from ``_docx_structural_style_name``. It must match
    the ``name`` of a style already present in the document.

    Returns:
        ``(saved_path, canonical_locator, {"role", "level", "style_name"})``.

    Raises:
        InvalidArgumentsError: if the canonicalised locator is not a
            three-component ``docx`` / ``para`` / ``<index>`` locator.
        TargetNotEditableError: if no document style has the required name.
    """
    document = _open_document(document_path)
    canonical, components = _canonical_docx_locator(locator)

    addresses_paragraph = len(components) == 3 and components[:2] == ("docx", "para")
    if not addresses_paragraph:
        raise InvalidArgumentsError(
            "set_structural_role requires a DOCX paragraph locator."
        )

    style_name = _docx_structural_style_name(role, level)
    style_missing = all(
        getattr(candidate, "name", None) != style_name
        for candidate in document.styles
    )
    if style_missing:
        raise TargetNotEditableError(
            f"DOCX style {style_name!r} is not available in the document."
        )

    paragraph = _resolve_paragraph(document, f"para:{components[2]}")
    paragraph.style = style_name

    destination = _target_path(document_path, output_path)
    document.save(destination)

    details = {"role": role, "level": level, "style_name": style_name}
    return destination, canonical, details
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
def get_blocks(document_path: Path) -> tuple[DocumentBlock, ...]:
    """Return a ``DocumentBlock`` summary for every block from ``_iter_blocks``.

    Blocks whose type is not ``"paragraph"`` are summarised as tables.
    Paragraphs and tables are numbered independently, in iteration order.
    """
    document = _open_document(document_path)
    summaries: list[DocumentBlock] = []

    paragraphs_seen = 0
    tables_seen = 0
    for position, (kind, node) in enumerate(_iter_blocks(document)):
        if kind != "paragraph":
            model = _table_model(node, position, tables_seen)
            # Rows may differ in length; report the widest one (0 if no rows).
            widest_row = max((len(row) for row in model.rows), default=0)
            details = {
                "table_index": model.table_index,
                "row_count": len(model.rows),
                "column_count": widest_row,
            }
            reported_kind = "table"
            tables_seen += 1
        else:
            model = _paragraph_model(node, position, paragraphs_seen)
            details = {
                "paragraph_index": model.paragraph_index,
                "style_name": model.style_name,
                "is_heading": model.is_heading,
            }
            reported_kind = "paragraph"
            paragraphs_seen += 1

        summaries.append(
            DocumentBlock(
                block_index=position,
                block_type=reported_kind,
                preview=model.preview,
                metadata=details,
            )
        )

    return tuple(summaries)
|
|
630
|
+
|
|
631
|
+
|
|
632
|
+
def get_paragraphs(document_path: Path) -> tuple[DocxParagraph, ...]:
    """Return a paragraph model for every paragraph block, in document order.

    Each model is built with the block's position among all blocks and its
    position among paragraph blocks only.
    """
    document = _open_document(document_path)
    paragraph_blocks = [
        (position, node)
        for position, (kind, node) in enumerate(_iter_blocks(document))
        if kind == "paragraph"
    ]
    return tuple(
        _paragraph_model(node, position, ordinal)
        for ordinal, (position, node) in enumerate(paragraph_blocks)
    )
|
|
644
|
+
|
|
645
|
+
|
|
646
|
+
def get_tables(document_path: Path) -> tuple[DocxTable, ...]:
    """Return a table model for every table block, in document order.

    Each model is built with the block's position among all blocks and its
    position among table blocks only.
    """
    document = _open_document(document_path)
    table_blocks = [
        (position, node)
        for position, (kind, node) in enumerate(_iter_blocks(document))
        if kind == "table"
    ]
    return tuple(
        _table_model(node, position, ordinal)
        for ordinal, (position, node) in enumerate(table_blocks)
    )
|
|
658
|
+
|
|
659
|
+
|
|
660
|
+
def get_block_bundle(document_path: Path, block_index: int) -> BlockBundle:
    """Return the summary and full model of the block at *block_index*.

    Raises:
        TargetNotFoundError: if iteration ends without reaching *block_index*.
    """
    document = _open_document(document_path)

    paragraphs_seen = 0
    tables_seen = 0
    for position, (kind, node) in enumerate(_iter_blocks(document)):
        if position == block_index:
            if kind == "paragraph":
                paragraph_model = _paragraph_model(node, position, paragraphs_seen)
                return BlockBundle(
                    document=_document_ref(document_path),
                    block=DocumentBlock(
                        block_index=position,
                        block_type="paragraph",
                        preview=paragraph_model.preview,
                        metadata={
                            "paragraph_index": paragraph_model.paragraph_index,
                            "style_name": paragraph_model.style_name,
                            "is_heading": paragraph_model.is_heading,
                        },
                    ),
                    paragraph=paragraph_model,
                )

            table_model = _table_model(node, position, tables_seen)
            return BlockBundle(
                document=_document_ref(document_path),
                block=DocumentBlock(
                    block_index=position,
                    block_type="table",
                    preview=table_model.preview,
                    metadata={
                        "table_index": table_model.table_index,
                        "row_count": len(table_model.rows),
                        "column_count": max(
                            (len(row) for row in table_model.rows), default=0
                        ),
                    },
                ),
                table=table_model,
            )

        # Not the requested block yet: keep the per-type counters current.
        if kind == "paragraph":
            paragraphs_seen += 1
        else:
            tables_seen += 1

    raise TargetNotFoundError(f"Block {block_index} does not exist in the document.")
|
|
711
|
+
|
|
712
|
+
|
|
713
|
+
def append_paragraph_block(
    document_path: Path,
    text: str,
    *,
    style_name: str | None = None,
    output_path: Path | None = None,
) -> tuple[Path, int]:
    """Add a paragraph with ``document.add_paragraph`` and save.

    Returns:
        ``(saved_path, block_index)`` where ``block_index`` is the number of
        blocks counted before the paragraph was added.

    Raises:
        InvalidArgumentsError: if assigning *style_name* raises ``KeyError``
            or ``ValueError``.
    """
    document = _open_document(document_path)

    # Count before adding so the result is the new paragraph's block index.
    new_block_index = sum(1 for _ in _iter_blocks(document))
    added = document.add_paragraph(text)

    if style_name is not None:
        try:
            added.style = style_name
        except (KeyError, ValueError) as exc:
            raise InvalidArgumentsError(
                f"Unknown DOCX paragraph style: {style_name}"
            ) from exc

    destination = _target_path(document_path, output_path)
    document.save(destination)
    return destination, new_block_index
|
|
733
|
+
|
|
734
|
+
|
|
735
|
+
def replace_block(
    document_path: Path, block_index: int, text: str, output_path: Path | None = None
) -> Path:
    """Replace the paragraph at *block_index* with *text*.

    The block index is translated into a ``para:<n>`` id by counting the
    paragraph blocks that precede it; the write is done by
    ``replace_paragraph``.

    Raises:
        TargetNotEditableError: if the block at *block_index* is a table.
        TargetNotFoundError: if iteration ends without reaching *block_index*.
    """
    document = _open_document(document_path)

    paragraphs_seen = 0
    for position, (kind, _node) in enumerate(_iter_blocks(document)):
        if position == block_index:
            if kind == "table":
                raise TargetNotEditableError(
                    "DOCX table block replacement is not supported."
                )
            legacy_id = f"para:{paragraphs_seen}"
            return replace_paragraph(document_path, legacy_id, text, output_path)

        if kind == "paragraph":
            paragraphs_seen += 1

    raise TargetNotFoundError(f"Block {block_index} does not exist in the document.")
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
def get_tables_result(document_path: Path) -> tuple[DocxTable, ...]:
    """Return exactly what ``get_tables`` returns for *document_path*."""
    tables = get_tables(document_path)
    return tables
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
def _open_document(document_path: Path):
    """Open *document_path* with ``Document``, passing the path as a string.

    Raises:
        RuntimeError: if ``Document`` is ``None``, the fallback this module
            assigns when importing python-docx fails.
    """
    if Document is not None:
        return Document(str(document_path))
    raise RuntimeError("python-docx is required for DOCX operations.")
|
|
766
|
+
|
|
767
|
+
|
|
768
|
+
def _open_document_from_default_template():
    """Create a new document by calling ``Document`` with no arguments.

    Raises:
        RuntimeError: if ``Document`` is ``None``, the fallback this module
            assigns when importing python-docx fails.
    """
    if Document is not None:
        return Document()
    raise RuntimeError("python-docx is required for DOCX operations.")
|
|
772
|
+
|
|
773
|
+
|
|
774
|
+
def _document_ref(document_path: Path):
    """Build a ``DocumentRef`` for *document_path*.

    The path is resolved first. Its POSIX form is the document id, its file
    name is the display name, and ``st_mtime`` is the modified time.
    """
    absolute = document_path.resolve()
    file_stat = absolute.stat()
    return DocumentRef(
        document_id=absolute.as_posix(),
        path=absolute,
        file_type="docx",
        display_name=absolute.name,
        modified_time=file_stat.st_mtime,
    )
|
|
784
|
+
|
|
785
|
+
|
|
786
|
+
def _resolve_paragraph(document, item_id: str):
    """Return the paragraph addressed by a ``para:<index>`` item id.

    Args:
        document: object exposing an indexable ``paragraphs`` sequence.
        item_id: identifier of the form ``para:<non-negative integer>``.

    Raises:
        InvalidArgumentsError: ``item_id`` lacks the ``para:`` prefix or its
            index part is not an integer.
        TargetNotFoundError: the index is negative or past the last paragraph.
    """
    if not item_id.startswith("para:"):
        raise InvalidArgumentsError(f"Unsupported DOCX paragraph item id: {item_id}")

    try:
        paragraph_index = int(item_id.split(":", maxsplit=1)[1])
    except ValueError as exc:
        raise InvalidArgumentsError(
            f"Invalid DOCX paragraph item id: {item_id}"
        ) from exc

    # Python sequence indexing would silently map a negative index onto a
    # paragraph counted from the end. _resolve_locator never matches a
    # negative paragraph index, so reject it here with the same error.
    if paragraph_index < 0:
        raise TargetNotFoundError(
            f"Paragraph {paragraph_index} does not exist in the document."
        )

    try:
        return document.paragraphs[paragraph_index]
    except IndexError as exc:
        raise TargetNotFoundError(
            f"Paragraph {paragraph_index} does not exist in the document."
        ) from exc
|
|
803
|
+
|
|
804
|
+
|
|
805
|
+
def _resolve_locator(document, locator: str) -> ResolvedTarget:
    """Resolve a paragraph or table-cell locator to a concrete target.

    A locator starting with ``para:`` (after stripping whitespace) selects the
    n-th paragraph block; anything else is handed to
    ``parse_table_cell_locator`` and selects a cell of the n-th table block.
    The returned target also carries the block's position in the body.

    Raises:
        TargetNotFoundError: the paragraph, table or cell does not exist.
    """
    normalized = locator.strip()

    if normalized.startswith("para:"):
        paragraph_index = _parse_paragraph_locator(normalized)
        paragraph_blocks = (
            (position, candidate)
            for position, (kind, candidate) in enumerate(_iter_blocks(document))
            if kind == "paragraph"
        )
        for ordinal, (position, paragraph) in enumerate(paragraph_blocks):
            if ordinal == paragraph_index:
                return ResolvedParagraphTarget(
                    block_index=position,
                    paragraph_index=paragraph_index,
                    paragraph=paragraph,
                )
        raise TargetNotFoundError(
            f"Paragraph {paragraph_index} does not exist in the document."
        )

    table_index, row_index, column_index = parse_table_cell_locator(normalized)
    table_blocks = (
        (position, candidate)
        for position, (kind, candidate) in enumerate(_iter_blocks(document))
        if kind == "table"
    )
    for ordinal, (position, table) in enumerate(table_blocks):
        if ordinal != table_index:
            continue
        try:
            # Evaluated only to prove the cell exists; the value is discarded.
            table.rows[row_index].cells[column_index]
        except IndexError as exc:
            raise TargetNotFoundError(
                f"Table cell {table_index}:{row_index}:{column_index} does not exist."
            ) from exc
        return ResolvedTableCellTarget(
            block_index=position,
            table_index=table_index,
            row_index=row_index,
            column_index=column_index,
            table=table,
        )

    raise TargetNotFoundError(f"Table {table_index} does not exist in the document.")
|
|
846
|
+
|
|
847
|
+
|
|
848
|
+
def _parse_paragraph_locator(locator: str) -> int:
    """Return the integer that follows the first ``:`` in ``locator``.

    Raises:
        InvalidArgumentsError: the text after the colon is not an integer.
    """
    raw_index = locator.split(":", 1)[1]
    try:
        index = int(raw_index)
    except ValueError as exc:
        raise InvalidArgumentsError(
            f"Invalid DOCX paragraph item id: {locator}"
        ) from exc
    return index
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
def _capture_run_formatting(run: Run | None) -> RunFormatting | None:
    """Snapshot the inline formatting of ``run`` as a ``RunFormatting``.

    Returns ``None`` when no run is given. Font size is read through ``.pt``
    and the colour is stringified; both stay ``None`` when unset on the run.
    """
    if run is None:
        return None

    font = run.font
    size = font.size
    rgb = font.color.rgb
    return RunFormatting(
        bold=run.bold,
        italic=run.italic,
        underline=run.underline,
        strike=font.strike,
        font_name=font.name,
        font_size=size.pt if size is not None else None,
        font_color=str(rgb) if rgb is not None else None,
        highlight=_docx_highlight_name(font.highlight_color),
    )
|
|
871
|
+
|
|
872
|
+
|
|
873
|
+
def _apply_run_formatting(run: Run, formatting: RunFormatting | None) -> None:
    """Apply a captured formatting snapshot to ``run``; no-op for ``None``.

    The snapshot is applied via ``_apply_docx_inline_style`` with no fields
    cleared.
    """
    if formatting is not None:
        _apply_docx_inline_style(run, formatting, ())
|
|
878
|
+
|
|
879
|
+
|
|
880
|
+
def _clear_paragraph(paragraph) -> None:
    """Remove every child of the paragraph element except ``pPr`` children."""
    element = paragraph._element
    # Collect first so the element is not mutated while being iterated.
    removable = [child for child in element if not child.tag.endswith("}pPr")]
    for child in removable:
        element.remove(child)
|
|
886
|
+
|
|
887
|
+
|
|
888
|
+
def _ensure_rewritable_docx_paragraph(paragraph) -> None:
    """Verify the paragraph element holds only ``pPr`` and ``r`` children.

    Raises:
        TargetNotEditableError: any other child element is present.
    """
    for child in paragraph._element:
        tag = child.tag
        if tag.endswith("}pPr") or tag.endswith("}r"):
            continue
        raise TargetNotEditableError(
            "DOCX paragraph contains inline content that cannot be safely reconstructed."
        )
|
|
896
|
+
|
|
897
|
+
|
|
898
|
+
def _read_docx_paragraph_fragments(paragraph) -> tuple[InlineFragment, ...]:
    """Read the paragraph's runs as normalized inline fragments.

    The paragraph is first checked with ``_ensure_rewritable_docx_paragraph``.
    A paragraph without runs yields an empty tuple; otherwise one fragment per
    run is built and the list is passed through ``normalize_fragments``.
    """
    _ensure_rewritable_docx_paragraph(paragraph)

    runs = paragraph.runs
    if not runs:
        return ()

    fragments = []
    for run in runs:
        captured = _capture_run_formatting(run)
        fragments.append(
            InlineFragment(text=run.text, style=captured or InlineStyle())
        )
    return normalize_fragments(fragments)
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
def _rewrite_docx_paragraph(
    paragraph,
    fragments: list[InlineFragment] | tuple[InlineFragment, ...],
) -> None:
    """Replace the paragraph's content with runs built from ``fragments``.

    The paragraph is validated and cleared before writing. When normalization
    leaves no fragments, a single empty run is added instead.
    """
    _ensure_rewritable_docx_paragraph(paragraph)
    _clear_paragraph(paragraph)

    pieces = normalize_fragments(fragments)
    if pieces:
        for piece in pieces:
            new_run = paragraph.add_run(piece.text)
            _apply_docx_inline_style(new_run, piece.style, ())
    else:
        paragraph.add_run("")
|
|
926
|
+
|
|
927
|
+
|
|
928
|
+
def _target_path(document_path: Path, output_path: Path | None) -> Path:
    """Return ``output_path`` when given, otherwise ``document_path``."""
    if output_path is not None:
        return output_path
    return document_path
|
|
930
|
+
|
|
931
|
+
|
|
932
|
+
def _iter_blocks(document) -> list[tuple[str, Paragraph | Table]]:
    """List the body's paragraph and table children in document order.

    Each entry is ``("paragraph", Paragraph)`` or ``("table", Table)``; body
    children of any other type are left out.

    Raises:
        RuntimeError: any of ``CT_P``, ``CT_Tbl``, ``Paragraph`` or ``Table``
            is ``None`` (python-docx is unavailable, per the error message).
    """
    if any(dependency is None for dependency in (CT_P, CT_Tbl, Paragraph, Table)):
        raise RuntimeError("python-docx is required for DOCX operations.")

    blocks: list[tuple[str, Paragraph | Table]] = []
    for child in document.element.body.iterchildren():
        if isinstance(child, CT_P):
            entry = ("paragraph", Paragraph(child, document))
        elif isinstance(child, CT_Tbl):
            entry = ("table", Table(child, document))
        else:
            continue
        blocks.append(entry)
    return blocks
|
|
944
|
+
|
|
945
|
+
|
|
946
|
+
def _paragraph_model(
    paragraph, block_index: int, paragraph_index: int
) -> DocxParagraph:
    """Build the ``DocxParagraph`` read model for ``paragraph``.

    A paragraph counts as a heading when its style name starts with
    ``"Heading"``. The preview is the first 120 characters of the text.
    """
    style = paragraph.style
    style_name = None if style is None else style.name
    heading = bool(style_name) and style_name.startswith("Heading")
    body_text = paragraph.text
    return DocxParagraph(
        block_index=block_index,
        paragraph_index=paragraph_index,
        text=body_text,
        style_name=style_name,
        is_heading=heading,
        preview=body_text[:120],
        metadata={},
    )
|
|
961
|
+
|
|
962
|
+
|
|
963
|
+
def _table_model(table, block_index: int, table_index: int) -> DocxTable:
    """Build the ``DocxTable`` read model for ``table``.

    ``rows`` holds the cell texts row by row. The preview joins all non-empty
    cell texts with ``" | "`` and keeps the first 120 characters.
    """
    collected = []
    for row in table.rows:
        collected.append(tuple(cell.text for cell in row.cells))
    rows = tuple(collected)

    non_empty = [text for row in rows for text in row if text]
    return DocxTable(
        block_index=block_index,
        table_index=table_index,
        rows=rows,
        preview=" | ".join(non_empty)[:120],
        metadata={},
    )
|
|
973
|
+
|
|
974
|
+
|
|
975
|
+
def _run_model(run) -> DocxRun:
    """Build the ``DocxRun`` read model for ``run``.

    Colour is stringified and size converted with ``int``; both stay ``None``
    when unset on the run.
    """
    font = run.font
    rgb = font.color.rgb
    size = font.size
    # NOTE(review): int(size) keeps whatever unit the size object's integer
    # value uses, whereas _capture_run_formatting reads `.pt`. If font.size is
    # a python-docx Length this is presumably EMU, not points -- confirm which
    # unit consumers of DocxRun.font_size expect.
    return DocxRun(
        text=run.text,
        bold=run.bold,
        italic=run.italic,
        underline=run.underline,
        strike=font.strike,
        font_name=font.name,
        font_size=None if size is None else int(size),
        color_rgb=None if rgb is None else str(rgb),
    )
|
|
994
|
+
|
|
995
|
+
|
|
996
|
+
def _paragraphs_before_block(document, block_index: int) -> int:
    """Count the paragraph blocks located before position ``block_index``.

    A ``block_index`` of zero or less yields 0; an index past the last block
    counts every paragraph block in the document.
    """
    # Clamp at zero so a negative index selects nothing instead of slicing
    # from the end of the list.
    preceding = _iter_blocks(document)[: max(block_index, 0)]
    return sum(1 for kind, _ in preceding if kind == "paragraph")
|
|
1006
|
+
|
|
1007
|
+
|
|
1008
|
+
# Names of the run-level style fields; these are the fields that
# _apply_docx_inline_style sets or clears.
_INLINE_STYLE_FIELDS = frozenset(
    (
        "bold",
        "italic",
        "underline",
        "strike",
        "font_name",
        "font_size",
        "font_color",
        "highlight",
    )
)
# Names of the block-level style fields; _apply_docx_block_style applies some
# of them to DOCX paragraphs and reports the rest as skipped.
_BLOCK_STYLE_FIELDS = frozenset(
    (
        "alignment",
        "indent_level",
        "left_indent",
        "right_indent",
        "spacing_before",
        "spacing_after",
        "line_spacing",
        "wrap_text",
        "vertical_alignment",
        "fill_color",
        "number_format",
    )
)
|
|
1035
|
+
# Alignment keyword -> WD_ALIGN_PARAGRAPH member of the same upper-cased name.
# Every value is None when WD_ALIGN_PARAGRAPH itself is None.
_DOCX_ALIGNMENT_MAP = {
    keyword: None
    if WD_ALIGN_PARAGRAPH is None
    else getattr(WD_ALIGN_PARAGRAPH, keyword.upper())
    for keyword in ("left", "center", "right", "justify")
}
# Highlight keyword -> WD_COLOR_INDEX member. Note "green" maps to
# BRIGHT_GREEN while "green_dark" maps to GREEN. Every value is None when
# WD_COLOR_INDEX itself is None.
_DOCX_HIGHLIGHT_MAP = {
    keyword: None if WD_COLOR_INDEX is None else getattr(WD_COLOR_INDEX, member)
    for keyword, member in (
        ("yellow", "YELLOW"),
        ("green", "BRIGHT_GREEN"),
        ("turquoise", "TURQUOISE"),
        ("pink", "PINK"),
        ("blue", "BLUE"),
        ("red", "RED"),
        ("dark_blue", "DARK_BLUE"),
        ("teal", "TEAL"),
        ("green_dark", "GREEN"),
        ("violet", "VIOLET"),
        ("dark_red", "DARK_RED"),
        ("dark_yellow", "DARK_YELLOW"),
        ("gray_50", "GRAY_50"),
        ("gray_25", "GRAY_25"),
        ("black", "BLACK"),
    )
}
# Reverse lookup (member -> keyword), skipping the None placeholders.
_DOCX_HIGHLIGHT_NAMES = dict(
    (member, keyword)
    for keyword, member in _DOCX_HIGHLIGHT_MAP.items()
    if member is not None
)
|
|
1061
|
+
|
|
1062
|
+
|
|
1063
|
+
def _canonical_docx_locator(locator: str) -> tuple[str, tuple[str, ...]]:
    """Return the canonical locator string and its parsed components.

    The locator is converted with ``to_v2_locator(..., file_type="docx")`` and
    the converted string is then parsed with ``parse_locator``.
    """
    canonical = to_v2_locator(locator, file_type="docx")
    return canonical, parse_locator(canonical).components
|
|
1067
|
+
|
|
1068
|
+
|
|
1069
|
+
def _normalize_clear_fields(
    clear_fields: list[str] | tuple[str, ...],
    allowed: frozenset[str],
) -> tuple[str, ...]:
    """Validate ``clear_fields`` and drop duplicates, keeping first-seen order.

    Raises:
        InvalidArgumentsError: a field name is not in ``allowed``.
    """
    # A dict keeps insertion order, so it doubles as an ordered set here.
    ordered: dict[str, None] = {}
    for name in clear_fields:
        if name not in allowed:
            raise InvalidArgumentsError(
                f"Unknown style field in clear_fields: {name}"
            )
        ordered.setdefault(name, None)
    return tuple(ordered)
|
|
1084
|
+
|
|
1085
|
+
|
|
1086
|
+
def _apply_docx_inline_style(
    run: Run, style: InlineStyle, clear_fields: tuple[str, ...]
) -> None:
    """Apply inline style fields to ``run``.

    For each field: if it is named in ``clear_fields`` the run attribute is
    reset to ``None``; otherwise a non-``None`` value on ``style`` is written
    and a ``None`` value leaves the run untouched. Fields are processed in the
    order bold, italic, underline, strike, font name, font size, font colour,
    highlight.

    Raises:
        RuntimeError: a font size must be written but ``Pt`` is ``None``.
    """
    cleared = frozenset(clear_fields)

    # (attribute lives on run.font?, attribute name, style field name) for the
    # fields whose value is copied over without conversion.
    plain_fields = (
        (False, "bold", "bold"),
        (False, "italic", "italic"),
        (False, "underline", "underline"),
        (True, "strike", "strike"),
        (True, "name", "font_name"),
    )
    for on_font, attribute, field_name in plain_fields:
        if field_name in cleared:
            value = None
        else:
            value = getattr(style, field_name)
            if value is None:
                continue
        setattr(run.font if on_font else run, attribute, value)

    if "font_size" in cleared:
        run.font.size = None
    elif style.font_size is not None:
        if Pt is None:
            raise RuntimeError("python-docx is required for DOCX operations.")
        run.font.size = Pt(style.font_size)

    if "font_color" in cleared:
        run.font.color.rgb = None
    elif style.font_color is not None:
        hex_color = _normalize_hex_color(style.font_color)
        run.font.color.rgb = RGBColor.from_string(hex_color)

    if "highlight" in cleared:
        run.font.highlight_color = None
    elif style.highlight is not None:
        run.font.highlight_color = _docx_highlight_value(style.highlight)
|
|
1133
|
+
|
|
1134
|
+
|
|
1135
|
+
def _apply_docx_block_style(
    paragraph,
    style: BlockStyle,
    clear_fields: tuple[str, ...],
) -> list[str]:
    """Apply block style fields to ``paragraph`` and report the ignored ones.

    Alignment, left/right indent, spacing before/after and line spacing are
    either reset (when named in ``clear_fields``) or written (when set on
    ``style``). Indents and spacings are wrapped in ``Pt``; line spacing is
    written as given.

    Returns:
        The names of the fields this function does not apply
        (``indent_level``, ``wrap_text``, ``vertical_alignment``,
        ``fill_color``, ``number_format``) that were nevertheless set on
        ``style`` or requested in ``clear_fields``.
    """
    paragraph_format = paragraph.paragraph_format
    cleared = frozenset(clear_fields)

    if "alignment" in cleared:
        paragraph.alignment = None
    elif style.alignment is not None:
        paragraph.alignment = _docx_alignment_value(style.alignment)

    # (style field name, paragraph_format attribute) for point-valued fields.
    point_fields = (
        ("left_indent", "left_indent"),
        ("right_indent", "right_indent"),
        ("spacing_before", "space_before"),
        ("spacing_after", "space_after"),
    )
    for field_name, format_attribute in point_fields:
        if field_name in cleared:
            setattr(paragraph_format, format_attribute, None)
            continue
        points = getattr(style, field_name)
        if points is not None:
            setattr(paragraph_format, format_attribute, Pt(points))

    if "line_spacing" in cleared:
        paragraph_format.line_spacing = None
    elif style.line_spacing is not None:
        paragraph_format.line_spacing = style.line_spacing

    not_applied = (
        "indent_level",
        "wrap_text",
        "vertical_alignment",
        "fill_color",
        "number_format",
    )
    return [
        field_name
        for field_name in not_applied
        if getattr(style, field_name) is not None or field_name in cleared
    ]
|
|
1185
|
+
|
|
1186
|
+
|
|
1187
|
+
def _docx_alignment_value(raw: str):
    """Look up ``raw`` (trimmed, case-insensitive) in ``_DOCX_ALIGNMENT_MAP``.

    Raises:
        InvalidArgumentsError: the keyword is not a key of the map.
    """
    keyword = raw.strip().lower()
    if keyword in _DOCX_ALIGNMENT_MAP:
        return _DOCX_ALIGNMENT_MAP[keyword]
    raise InvalidArgumentsError(f"Unsupported DOCX alignment: {raw}")
|
|
1192
|
+
|
|
1193
|
+
|
|
1194
|
+
def _docx_highlight_value(raw: str):
    """Look up ``raw`` (trimmed, case-insensitive) in ``_DOCX_HIGHLIGHT_MAP``.

    Raises:
        InvalidArgumentsError: the keyword is not a key of the map.
    """
    keyword = raw.strip().lower()
    if keyword in _DOCX_HIGHLIGHT_MAP:
        return _DOCX_HIGHLIGHT_MAP[keyword]
    raise InvalidArgumentsError(f"Unsupported DOCX highlight color: {raw}")
|
|
1199
|
+
|
|
1200
|
+
|
|
1201
|
+
def _docx_highlight_name(value) -> str | None:
    """Return the keyword registered for ``value`` in ``_DOCX_HIGHLIGHT_NAMES``.

    Values without an entry (including ``None``) yield ``None``.
    """
    keyword = _DOCX_HIGHLIGHT_NAMES.get(value, None)
    return keyword
|
|
1203
|
+
|
|
1204
|
+
|
|
1205
|
+
def _docx_structural_style_name(role: str, level: int | None) -> str:
    """Translate a structural role into a DOCX style name.

    ``heading`` needs a ``level`` from 1 to 9 and yields ``"Heading <level>"``.
    The other roles (``title``, ``body``, ``table_header``, ``caption``) map
    to fixed names and ignore ``level``. Roles are trimmed and matched
    case-insensitively.

    Raises:
        InvalidArgumentsError: unknown role, or a heading with a missing or
            out-of-range level.
    """
    key = role.strip().lower()

    if key == "heading":
        level_is_invalid = level is None or level < 1 or level > 9
        if level_is_invalid:
            raise InvalidArgumentsError(
                "Heading structural role requires level between 1 and 9."
            )
        return f"Heading {level}"

    fixed_names = {
        "title": "Title",
        "body": "Normal",
        "table_header": "Table Heading",
        "caption": "Caption",
    }
    if key in fixed_names:
        return fixed_names[key]
    raise InvalidArgumentsError(f"Unsupported structural role: {role}")
|
|
1222
|
+
|
|
1223
|
+
|
|
1224
|
+
def _normalize_hex_color(value: str) -> str:
    """Return ``value`` as six upper-case hex digits.

    Surrounding whitespace and any leading ``#`` characters are removed
    before validation.

    Raises:
        InvalidArgumentsError: the remainder is not exactly six hex digits.
    """
    candidate = value.strip().lstrip("#").upper()
    only_hex_digits = all(digit in "0123456789ABCDEF" for digit in candidate)
    if len(candidate) == 6 and only_hex_digits:
        return candidate
    raise InvalidArgumentsError(f"Invalid RGB hex color: {value}")
|
|
1231
|
+
|
|
1232
|
+
|
|
1233
|
+
def _parse_int_component(raw: str, locator: str) -> int:
    """Convert one locator component to ``int``.

    Raises:
        InvalidArgumentsError: ``raw`` is not an integer; the message quotes
            the full ``locator``.
    """
    try:
        parsed = int(raw)
    except ValueError as exc:
        raise InvalidArgumentsError(f"Invalid DOCX locator: {locator}") from exc
    return parsed
|