offagent 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- offagent/__init__.py +3 -0
- offagent/__main__.py +5 -0
- offagent/adapters/__init__.py +1 -0
- offagent/adapters/docx_adapter.py +1237 -0
- offagent/adapters/embedding_provider.py +132 -0
- offagent/adapters/pptx_adapter.py +940 -0
- offagent/adapters/xlsx_adapter.py +1266 -0
- offagent/app/__init__.py +1 -0
- offagent/app/progress.py +52 -0
- offagent/app/services.py +4267 -0
- offagent/config.py +287 -0
- offagent/domain/__init__.py +1 -0
- offagent/domain/locators.py +444 -0
- offagent/domain/models.py +477 -0
- offagent/domain/text_fragments.py +136 -0
- offagent/errors.py +29 -0
- offagent/indexing/__init__.py +1 -0
- offagent/indexing/store.py +795 -0
- offagent/interfaces/__init__.py +1 -0
- offagent/interfaces/cli.py +438 -0
- offagent/interfaces/cli_output.py +139 -0
- offagent/interfaces/cli_progress.py +120 -0
- offagent/interfaces/mcp.py +1145 -0
- offagent/interfaces/mcp_converters.py +80 -0
- offagent/interfaces/mcp_models.py +923 -0
- offagent/objects/__init__.py +3 -0
- offagent/objects/base.py +26 -0
- offagent/objects/docx_objects.py +951 -0
- offagent/objects/pptx_objects.py +895 -0
- offagent/objects/xlsx_objects.py +962 -0
- offagent/path_policy.py +42 -0
- offagent/storage/__init__.py +1 -0
- offagent/storage/versioning.py +31 -0
- offagent-0.10.0.dist-info/METADATA +546 -0
- offagent-0.10.0.dist-info/RECORD +39 -0
- offagent-0.10.0.dist-info/WHEEL +5 -0
- offagent-0.10.0.dist-info/entry_points.txt +2 -0
- offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
- offagent-0.10.0.dist-info/top_level.txt +1 -0
offagent/app/services.py
ADDED
|
@@ -0,0 +1,4267 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import importlib
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
from copy import deepcopy
|
|
9
|
+
import sqlite3
|
|
10
|
+
import struct
|
|
11
|
+
import shutil
|
|
12
|
+
import tempfile
|
|
13
|
+
import time
|
|
14
|
+
from dataclasses import dataclass, field, replace
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import Any, Callable, Iterable, Literal, Sequence
|
|
17
|
+
|
|
18
|
+
from offagent.adapters import (
|
|
19
|
+
docx_adapter,
|
|
20
|
+
embedding_provider,
|
|
21
|
+
pptx_adapter,
|
|
22
|
+
xlsx_adapter,
|
|
23
|
+
)
|
|
24
|
+
from offagent.app.progress import NullProgressReporter, ProgressReporter
|
|
25
|
+
from offagent.config import AppConfig
|
|
26
|
+
from offagent.domain.locators import parse_locator, to_legacy_locator, to_v2_locator
|
|
27
|
+
from offagent.domain.models import (
|
|
28
|
+
BatchResult,
|
|
29
|
+
BlockStyle,
|
|
30
|
+
BlockBundle,
|
|
31
|
+
ChildSummary,
|
|
32
|
+
Capability,
|
|
33
|
+
DocxTableEntry,
|
|
34
|
+
DocxTablesResult,
|
|
35
|
+
DocumentBlocks,
|
|
36
|
+
DocumentRef,
|
|
37
|
+
DocumentStructure,
|
|
38
|
+
FileType,
|
|
39
|
+
InlineFragment,
|
|
40
|
+
IndexedItem,
|
|
41
|
+
InsertContentResult,
|
|
42
|
+
InlineStyle,
|
|
43
|
+
ItemRef,
|
|
44
|
+
NodePayload,
|
|
45
|
+
NodeWriteResult,
|
|
46
|
+
ObjectPayload,
|
|
47
|
+
MutationResult,
|
|
48
|
+
ParagraphCollection,
|
|
49
|
+
PresentationStructure,
|
|
50
|
+
SearchHit,
|
|
51
|
+
SearchMode,
|
|
52
|
+
SectionPayload,
|
|
53
|
+
SheetSnapshot,
|
|
54
|
+
SlideNotes,
|
|
55
|
+
StructureUnit,
|
|
56
|
+
StructureCollection,
|
|
57
|
+
StructuredTarget,
|
|
58
|
+
StructuredWriteResult,
|
|
59
|
+
TableCollection,
|
|
60
|
+
VisibleTextRange,
|
|
61
|
+
WorkbookStructure,
|
|
62
|
+
XlsxInsertRowsResult,
|
|
63
|
+
XlsxRowEmbedding,
|
|
64
|
+
)
|
|
65
|
+
from offagent.errors import (
|
|
66
|
+
InvalidArgumentsError,
|
|
67
|
+
NoEmbeddingsError,
|
|
68
|
+
PolicyRefusedError,
|
|
69
|
+
StaleLocatorError,
|
|
70
|
+
TargetNotEditableError,
|
|
71
|
+
)
|
|
72
|
+
from offagent.errors import TargetNotFoundError
|
|
73
|
+
from offagent.indexing import store
|
|
74
|
+
from offagent.objects import docx_objects, pptx_objects, xlsx_objects
|
|
75
|
+
from offagent.objects.docx_objects import DocxObjectResolver
|
|
76
|
+
from offagent.objects.pptx_objects import PptxObjectResolver
|
|
77
|
+
from offagent.objects.xlsx_objects import XlsxObjectResolver
|
|
78
|
+
from offagent.path_policy import (
|
|
79
|
+
canonicalize_existing_path,
|
|
80
|
+
canonicalize_output_path,
|
|
81
|
+
ensure_path_allowed,
|
|
82
|
+
normalize_roots,
|
|
83
|
+
)
|
|
84
|
+
from offagent.storage import versioning
|
|
85
|
+
|
|
86
|
+
# Module-level logger named after this module.
LOGGER = logging.getLogger(__name__)

# File suffix (with leading dot) -> file-type tag.
SUPPORTED_EXTENSIONS: dict[str, FileType] = {
    ".docx": "docx",
    ".pptx": "pptx",
    ".xlsx": "xlsx",
}
# Currently lists the same suffixes as SUPPORTED_EXTENSIONS, kept as a
# separate mapping object.
INDEXABLE_EXTENSIONS: dict[str, FileType] = {
    ".docx": "docx",
    ".pptx": "pptx",
    ".xlsx": "xlsx",
}

# Pairs of strings; presumably (importable module name, display name) used
# for dependency checks -- confirm against the consumer of this tuple.
REQUIRED_IMPORTS: tuple[tuple[str, str], ...] = (
    ("mcp", "MCP Python SDK"),
    ("typer", "Typer"),
    ("pydantic", "Pydantic"),
    ("dotenv", "python-dotenv"),
    ("docx", "python-docx"),
    ("pptx", "python-pptx"),
    ("openpyxl", "openpyxl"),
    ("rich", "Rich"),
)

# Accepted values for the ``output_mode`` keyword of the write operations.
OutputMode = Literal["versioned", "inplace"]

# One resolver instance per file-type tag, created once at import time.
OBJECT_RESOLVERS = {
    "docx": DocxObjectResolver(),
    "pptx": PptxObjectResolver(),
    "xlsx": XlsxObjectResolver(),
}
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
@dataclass(frozen=True)
class DoctorCheck:
    """Immutable record of a single named check.

    Holds the check's name, whether it passed, and a free-form detail string.
    """

    name: str
    ok: bool
    detail: str
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
@dataclass(frozen=True)
class DoctorReport:
    """Immutable bundle of check results with an aggregate status."""

    checks: tuple[DoctorCheck, ...]

    @property
    def ok(self) -> bool:
        """Return True when no check reports a falsy ``ok`` (True if empty)."""
        for entry in self.checks:
            if not entry.ok:
                return False
        return True
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass(frozen=True)
class IndexSummary:
    """Immutable trio of integer counters: files scanned, indexed and skipped."""

    files_scanned: int
    files_indexed: int
    files_skipped: int
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
@dataclass(frozen=True)
class PatchResult:
    """Immutable record pairing a document path and output path with an item and text."""

    document_path: Path
    output_path: Path
    item: ItemRef
    text: str
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
@dataclass
|
|
151
|
+
class AppServices:
|
|
152
|
+
config: AppConfig
|
|
153
|
+
embedding_provider_factory: (
|
|
154
|
+
Callable[
|
|
155
|
+
[str, int | None],
|
|
156
|
+
embedding_provider.EmbeddingProvider,
|
|
157
|
+
]
|
|
158
|
+
| None
|
|
159
|
+
) = None
|
|
160
|
+
_embedding_provider: embedding_provider.EmbeddingProvider | None = field(
|
|
161
|
+
default=None,
|
|
162
|
+
init=False,
|
|
163
|
+
repr=False,
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
def discover_documents(self) -> list[DocumentRef]:
    """Discover documents under the configured roots and keep the allowed ones.

    Delegates to the module-level ``discover_documents`` with
    ``self.config.document_roots`` and drops every entry whose path is
    rejected by ``self._is_allowed_document_path``.
    """
    permitted: list[DocumentRef] = []
    for candidate in discover_documents(self.config.document_roots):
        if self._is_allowed_document_path(candidate.path):
            permitted.append(candidate)
    return permitted
|
|
173
|
+
|
|
174
|
+
def list_documents(self) -> list[DocumentRef]:
    """Return the indexed documents whose stored path is allowed.

    Rows are fetched from the index at ``self.config.index_path``; the
    connection is closed before the rows are filtered and converted.
    """
    db = store.ensure_ready(self.config.index_path)
    try:
        fetched = store.fetch_documents(db)
    finally:
        db.close()

    visible: list[DocumentRef] = []
    for record in fetched:
        if self._is_allowed_document_path(Path(record["path"])):
            visible.append(_document_ref_from_row(record))
    return visible
|
|
185
|
+
|
|
186
|
+
def get_document(self, document_id: str) -> DocumentRef:
    """Look up an indexed document by id and verify it may be read.

    The row is resolved through ``self._resolve_document_by_id_row``; the
    connection is always closed. The resulting path is then checked with
    ``self._ensure_allowed_document_path(..., action="read")``.
    """
    db = store.ensure_ready(self.config.index_path)
    try:
        record = self._resolve_document_by_id_row(db, document_id)
    finally:
        db.close()

    found = _document_ref_from_row(record)
    self._ensure_allowed_document_path(found.path, action="read")
    return found
|
|
195
|
+
|
|
196
|
+
def show_document(self, document_path: Path) -> DocumentRef:
    """Return the indexed record for the document at ``document_path``.

    The index connection is opened first; the path is then validated with
    ``self._require_allowed_document_path(..., action="show")`` and the
    matching row is resolved. The connection is always closed.
    """
    db = store.ensure_ready(self.config.index_path)
    try:
        # Only the first element of the returned pair is needed here.
        checked_path, _ = self._require_allowed_document_path(
            document_path, action="show"
        )
        record = self._resolve_document_row(db, checked_path)
    finally:
        db.close()
    return _document_ref_from_row(record)
|
|
206
|
+
|
|
207
|
+
def show_item(self, document_path: Path, item_id: str) -> ItemRef:
    """Return the indexed item ``item_id`` of the document at ``document_path``.

    The index connection is opened first; the path is validated with
    ``self._require_allowed_document_path(..., action="show")`` and the item
    row is resolved through ``self._resolve_item_row``. The connection is
    always closed before the row is converted.
    """
    connection = store.ensure_ready(self.config.index_path)
    try:
        resolved_path, _ = self._require_allowed_document_path(
            document_path, action="show"
        )
        # The resolver returns (document_row, item_row); only the item row
        # is used, so the document row is discarded explicitly.
        _, item_row = self._resolve_item_row(connection, resolved_path, item_id)
    finally:
        connection.close()
    return _item_ref_from_row(item_row)
|
|
219
|
+
|
|
220
|
+
def resolve_document_path(self, document_id: str) -> Path:
    """Return the ``path`` of the document that ``get_document`` yields for the id."""
    document = self.get_document(document_id)
    return document.path
|
|
222
|
+
|
|
223
|
+
def get_document_structure(self, document_id: str) -> DocumentStructure:
    """Return a flat list of structure units for an indexed document.

    A ``"docx"`` file yields one unit per block from
    ``docx_adapter.get_blocks``; a ``"pptx"`` file yields one ``"slide"``
    unit per entry from ``pptx_adapter.get_presentation_structure``; any
    other file type is read through ``xlsx_adapter.get_workbook_structure``
    and yields one ``"worksheet"`` unit per sheet.
    """
    document = self.get_document(document_id)
    collected: list[StructureUnit] = []

    if document.file_type == "docx":
        for block in docx_adapter.get_blocks(document.path):
            collected.append(
                StructureUnit(
                    position=block.block_index,
                    unit_type=block.block_type,
                    preview=block.preview,
                    metadata=block.metadata,
                )
            )
    elif document.file_type == "pptx":
        for slide in pptx_adapter.get_presentation_structure(document.path):
            collected.append(
                StructureUnit(
                    position=slide.slide_number,
                    unit_type="slide",
                    preview=slide.preview,
                    metadata=slide.metadata,
                )
            )
    else:
        workbook = xlsx_adapter.get_workbook_structure(document.path)
        for sheet in workbook.sheets:
            # The sheet name is listed first, so a "sheet_name" key already
            # present in sheet.metadata overrides it.
            sheet_metadata = {"sheet_name": sheet.sheet_name, **sheet.metadata}
            collected.append(
                StructureUnit(
                    position=sheet.position,
                    unit_type="worksheet",
                    preview=sheet.preview,
                    metadata=sheet_metadata,
                )
            )

    return DocumentStructure(document=document, units=tuple(collected))
|
|
258
|
+
|
|
259
|
+
def get_structure(self, document_id: str) -> StructureCollection:
    """Return the sections reported by the adapter matching the document type.

    ``"docx"`` and ``"pptx"`` use their own adapters; every other file type
    falls through to ``xlsx_adapter``.
    """
    document = self.get_document(document_id)
    by_type = {"docx": docx_adapter, "pptx": pptx_adapter}
    adapter = by_type.get(document.file_type, xlsx_adapter)
    sections = adapter.resolve_structure(document.path)
    return StructureCollection(document=document, sections=sections)
|
|
268
|
+
|
|
269
|
+
def get_section(
    self,
    document_id: str,
    section_id: str,
    *,
    cell_range: str | None = None,
) -> SectionPayload:
    """Fetch one section and attach the indexed document record to it.

    ``cell_range`` is forwarded only on the fallback (non-docx, non-pptx)
    branch, which uses ``xlsx_adapter``; the other adapters never see it.
    """
    document = self.get_document(document_id)
    kind = document.file_type
    if kind == "docx":
        section = docx_adapter.get_section(document.path, section_id)
    elif kind == "pptx":
        section = pptx_adapter.get_section(document.path, section_id)
    else:
        section = xlsx_adapter.get_section(
            document.path, section_id, cell_range=cell_range
        )
    # The adapter's payload is copied with its document field replaced.
    return replace(section, document=document)
|
|
289
|
+
|
|
290
|
+
def get_node(self, document_id: str, node_id: str) -> NodePayload:
    """Read one node through the adapter for the document's file type.

    The adapter's ``read_node`` result is unpacked as
    ``(item_type, text, metadata)`` and wrapped in a ``NodePayload``.
    ``"docx"`` and ``"pptx"`` use their own adapters; anything else uses
    ``xlsx_adapter``.
    """
    document = self.get_document(document_id)
    by_type = {"docx": docx_adapter, "pptx": pptx_adapter}
    adapter = by_type.get(document.file_type, xlsx_adapter)
    item_type, text, metadata = adapter.read_node(document.path, node_id)
    return NodePayload(
        document_id=document.document_id,
        node_id=node_id,
        item_type=item_type,
        text=text,
        metadata=metadata,
    )
|
|
305
|
+
|
|
306
|
+
def get_object(self, document_id: str, locator: str) -> ObjectPayload:
    """Resolve ``locator`` inside an indexed document and return its payload.

    Raises:
        StaleLocatorError: when resolution fails with
            ``InvalidArgumentsError`` or ``TargetNotFoundError`` and the
            file's current content hash differs from the indexed one.
        InvalidArgumentsError, TargetNotFoundError: re-raised unchanged when
            the hashes match.
    """
    document = self.get_document(document_id)
    # Hash the file before resolving so a failure can be classified below.
    current_hash = _content_hash(document.path)
    resolver = _object_resolver(document.file_type)
    try:
        resolved = resolver.get_object(document.path, locator)
    except (InvalidArgumentsError, TargetNotFoundError) as failure:
        if current_hash == document.content_hash:
            raise
        raise StaleLocatorError(
            f"stale locator: {locator} is no longer valid for {document.path}"
        ) from failure
    return replace(resolved, document=document)
|
|
319
|
+
|
|
320
|
+
def list_children(
    self,
    document_id: str,
    locator: str,
    *,
    child_type: str | None = None,
    limit: int | None = None,
) -> list[ChildSummary]:
    """List the children of the object at ``locator``.

    ``child_type`` and ``limit`` are forwarded unchanged to the resolver.

    Raises:
        StaleLocatorError: when the resolver fails with
            ``InvalidArgumentsError`` or ``TargetNotFoundError`` and the
            file's current content hash differs from the indexed one.
        InvalidArgumentsError, TargetNotFoundError: re-raised unchanged when
            the hashes match.
    """
    document = self.get_document(document_id)
    # Hash the file before resolving so a failure can be classified below.
    current_hash = _content_hash(document.path)
    resolver = _object_resolver(document.file_type)
    try:
        children = resolver.list_children(
            document.path,
            locator,
            child_type=child_type,
            limit=limit,
        )
    except (InvalidArgumentsError, TargetNotFoundError) as failure:
        if current_hash == document.content_hash:
            raise
        raise StaleLocatorError(
            f"stale locator: {locator} is no longer valid for {document.path}"
        ) from failure
    return children
|
|
344
|
+
|
|
345
|
+
def create_object(
    self,
    document_id: str,
    parent_locator: str,
    object_type: str,
    properties: dict[str, Any],
    position: object | None = None,
    segments: Sequence[InlineFragment] | Sequence[dict[str, Any]] | None = None,
    text_range: VisibleTextRange | dict[str, Any] | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Create a child object under ``parent_locator`` and re-index the output.

    Steps: check the parent locator is fresh, require ``Capability.ADD_CHILD``
    on the parent, resolve the output path for ``output_mode``, perform the
    creation, index the output file and read the new object back from it.

    On ``InvalidArgumentsError``, ``TargetNotFoundError`` or
    ``TargetNotEditableError`` the failure is first handed to
    ``self._raise_stale_if_document_changed`` and re-raised if that returns.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, parent_locator)
    parent = self.get_object(document_id, parent_locator)
    _require_capability(parent.capabilities, Capability.ADD_CHILD, parent_locator)
    target_path = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    recoverable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        # Coercion stays inside the try so its errors get the same stale
        # check as the write itself.
        fragments = _coerce_inline_fragments(segments)
        visible_range = _coerce_visible_text_range(text_range)
        new_locator, summary, metadata = _create_object_on_path(
            source.path,
            source.file_type,
            parent_locator=parent_locator,
            object_type=object_type,
            properties=properties,
            position=position,
            segments=fragments,
            text_range=visible_range,
            output_path=target_path,
        )
    except recoverable as failure:
        self._raise_stale_if_document_changed(source, parent_locator, failure)
        raise

    indexed = self.index_document(target_path)
    created = self.get_object(indexed.document_id, new_locator)
    return MutationResult(
        document_path=source.path,
        output_path=target_path,
        document_id=indexed.document_id,
        locator=created.locator,
        object_type=created.object_type,
        summary=summary,
        capabilities=created.capabilities,
        parent_locator=created.parent_locator,
        metadata=metadata,
    )
|
|
398
|
+
|
|
399
|
+
def update_object(
    self,
    document_id: str,
    locator: str,
    properties: dict[str, Any],
    segments: Sequence[InlineFragment] | Sequence[dict[str, Any]] | None = None,
    text_range: VisibleTextRange | dict[str, Any] | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Update the object at ``locator`` and re-index the output file.

    Requires ``Capability.UPDATE`` on the current object. After the write,
    the object is read back from the output document using the same
    ``locator``.

    On ``InvalidArgumentsError``, ``TargetNotFoundError`` or
    ``TargetNotEditableError`` the failure is first handed to
    ``self._raise_stale_if_document_changed`` and re-raised if that returns.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, locator)
    existing = self.get_object(document_id, locator)
    _require_capability(existing.capabilities, Capability.UPDATE, locator)
    target_path = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    recoverable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        # Coercion stays inside the try so its errors get the same stale
        # check as the write itself.
        fragments = _coerce_inline_fragments(segments)
        visible_range = _coerce_visible_text_range(text_range)
        summary, metadata = _update_object_on_path(
            source.path,
            source.file_type,
            locator=locator,
            properties=properties,
            segments=fragments,
            text_range=visible_range,
            output_path=target_path,
        )
    except recoverable as failure:
        self._raise_stale_if_document_changed(source, locator, failure)
        raise

    indexed = self.index_document(target_path)
    updated = self.get_object(indexed.document_id, locator)
    return MutationResult(
        document_path=source.path,
        output_path=target_path,
        document_id=indexed.document_id,
        locator=updated.locator,
        object_type=updated.object_type,
        summary=summary,
        capabilities=updated.capabilities,
        parent_locator=updated.parent_locator,
        metadata=metadata,
    )
|
|
448
|
+
|
|
449
|
+
def move_object(
    self,
    document_id: str,
    locator: str,
    new_parent_locator: str,
    position: object | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Move the object at ``locator`` under ``new_parent_locator``.

    Requires ``Capability.MOVE`` on the current object. The output file is
    indexed and the object is read back using the locator returned by the
    move operation.

    On ``InvalidArgumentsError``, ``TargetNotFoundError`` or
    ``TargetNotEditableError`` the failure is first handed to
    ``self._raise_stale_if_document_changed`` and re-raised if that returns.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, locator)
    existing = self.get_object(document_id, locator)
    _require_capability(existing.capabilities, Capability.MOVE, locator)
    target_path = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    recoverable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        relocated_locator, summary, metadata = _move_object_on_path(
            source.path,
            source.file_type,
            locator=locator,
            new_parent_locator=new_parent_locator,
            position=position,
            output_path=target_path,
        )
    except recoverable as failure:
        self._raise_stale_if_document_changed(source, locator, failure)
        raise

    indexed = self.index_document(target_path)
    moved = self.get_object(indexed.document_id, relocated_locator)
    return MutationResult(
        document_path=source.path,
        output_path=target_path,
        document_id=indexed.document_id,
        locator=moved.locator,
        object_type=moved.object_type,
        summary=summary,
        capabilities=moved.capabilities,
        parent_locator=moved.parent_locator,
        metadata=metadata,
    )
|
|
496
|
+
|
|
497
|
+
def copy_object(
    self,
    document_id: str,
    locator: str,
    target_parent_locator: str,
    position: object | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Copy the object at ``locator`` under ``target_parent_locator``.

    Requires ``Capability.COPY`` on the source object. The output file is
    indexed and the copy is read back using the locator returned by the
    copy operation.

    On ``InvalidArgumentsError``, ``TargetNotFoundError`` or
    ``TargetNotEditableError`` the failure is first handed to
    ``self._raise_stale_if_document_changed`` and re-raised if that returns.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, locator)
    existing = self.get_object(document_id, locator)
    _require_capability(existing.capabilities, Capability.COPY, locator)
    target_path = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    recoverable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        duplicate_locator, summary, metadata = _copy_object_on_path(
            source.path,
            source.file_type,
            locator=locator,
            target_parent_locator=target_parent_locator,
            position=position,
            output_path=target_path,
        )
    except recoverable as failure:
        self._raise_stale_if_document_changed(source, locator, failure)
        raise

    indexed = self.index_document(target_path)
    duplicate = self.get_object(indexed.document_id, duplicate_locator)
    return MutationResult(
        document_path=source.path,
        output_path=target_path,
        document_id=indexed.document_id,
        locator=duplicate.locator,
        object_type=duplicate.object_type,
        summary=summary,
        capabilities=duplicate.capabilities,
        parent_locator=duplicate.parent_locator,
        metadata=metadata,
    )
|
|
544
|
+
|
|
545
|
+
def batch_edit(
    self,
    document_id: str,
    operations: list[dict[str, Any]],
    *,
    output_mode: OutputMode = "versioned",
    dry_run: bool = False,
) -> BatchResult:
    """Validate or apply a list of operations against one document.

    With ``dry_run=True`` each operation is only validated against the
    source file and the result carries ``output_path=None``.

    Otherwise the source is copied to a work file, every operation is
    applied to that copy in order, and the copy then replaces the output
    path via ``os.replace``. The output file is indexed afterwards.

    Args:
        document_id: Id of the indexed document to edit.
        operations: Operation dictionaries, processed in list order.
        output_mode: Passed to ``self._resolve_write_output_path``.
        dry_run: Validate only; nothing is written.

    Raises:
        Any exception raised while copying, applying or replacing is
        re-raised after the work file has been removed.
    """
    document = self.get_document(document_id)
    # Only the batch's primary locator is checked for freshness up front.
    self._ensure_object_locator_fresh(
        document, _primary_locator_for_batch(operations)
    )
    if dry_run:
        validated = tuple(
            _validate_batch_operation(document.path, document.file_type, operation)
            for operation in operations
        )
        return BatchResult(
            document_path=document.path,
            output_path=None,
            document_id=document.document_id,
            summary=f"Validated {len(validated)} operations.",
            dry_run=True,
            operations=validated,
        )

    output_path = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    temp_work_path = _make_batch_work_path(output_path, document.path.suffix)
    try:
        # The copy is inside the guarded region so that a failed or partial
        # copy does not leave the work file behind.
        shutil.copy2(document.path, temp_work_path)
        mutation_results = tuple(
            _apply_batch_operation(temp_work_path, document.file_type, operation)
            for operation in operations
        )
        # Publish the fully edited copy in a single rename step.
        os.replace(temp_work_path, output_path)
    except Exception:
        temp_work_path.unlink(missing_ok=True)
        raise

    output_document = self.index_document(output_path)
    return BatchResult(
        document_path=document.path,
        output_path=output_path,
        document_id=output_document.document_id,
        summary=f"Applied {len(mutation_results)} operations.",
        operations=mutation_results,
    )
|
|
594
|
+
|
|
595
|
+
def delete_object(
    self,
    document_id: str,
    locator: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Delete the object at ``locator`` and re-index the output file.

    Requires ``Capability.DELETE`` on the current object. The result has
    ``locator=None`` and empty capabilities; ``object_type`` and
    ``parent_locator`` come from the object as read before deletion.

    On ``InvalidArgumentsError``, ``TargetNotFoundError`` or
    ``TargetNotEditableError`` the failure is first handed to
    ``self._raise_stale_if_document_changed`` and re-raised if that returns.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, locator)
    doomed = self.get_object(document_id, locator)
    _require_capability(doomed.capabilities, Capability.DELETE, locator)
    target_path = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    recoverable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        summary, metadata = _delete_object_on_path(
            source.path,
            source.file_type,
            locator=locator,
            output_path=target_path,
        )
    except recoverable as failure:
        self._raise_stale_if_document_changed(source, locator, failure)
        raise

    indexed = self.index_document(target_path)
    return MutationResult(
        document_path=source.path,
        output_path=target_path,
        document_id=indexed.document_id,
        locator=None,
        object_type=doomed.object_type,
        summary=summary,
        capabilities=(),
        parent_locator=doomed.parent_locator,
        metadata=metadata,
    )
|
|
637
|
+
|
|
638
|
+
def docx_set_paragraph_style(
    self,
    document_id: str,
    locator: str,
    style_name: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Apply ``style_name`` to the object at ``locator`` in a docx document.

    The document must be of type ``"docx"`` (checked through
    ``self._require_document_type``). The write is delegated to
    ``docx_objects.set_paragraph_style`` and the result is built by
    ``self._finalize_object_mutation``.

    On ``InvalidArgumentsError``, ``TargetNotFoundError`` or
    ``TargetNotEditableError`` the failure is first handed to
    ``self._raise_stale_if_document_changed`` and re-raised if that returns.
    """
    source = self._require_document_type(
        document_id,
        expected="docx",
        operation="docx_set_paragraph_style",
    )
    self._ensure_object_locator_fresh(source, locator)
    target_path = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    recoverable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        outcome = docx_objects.set_paragraph_style(
            source.path,
            locator,
            style_name,
            output_path=target_path,
        )
        styled_locator, summary, metadata = outcome
    except recoverable as failure:
        self._raise_stale_if_document_changed(source, locator, failure)
        raise

    return self._finalize_object_mutation(
        source, target_path, styled_locator, summary, metadata
    )
|
|
672
|
+
|
|
673
|
+
def docx_insert_page_break(
    self,
    document_id: str,
    locator: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Insert a page break relative to ``locator`` in a docx document.

    The document must be of type ``"docx"``. The write is delegated to
    ``docx_objects.insert_page_break`` and the result is built by
    ``self._finalize_object_mutation`` from the locator it returns.

    On ``InvalidArgumentsError``, ``TargetNotFoundError`` or
    ``TargetNotEditableError`` the failure is first handed to
    ``self._raise_stale_if_document_changed`` and re-raised if that returns.
    """
    source = self._require_document_type(
        document_id,
        expected="docx",
        operation="docx_insert_page_break",
    )
    self._ensure_object_locator_fresh(source, locator)
    target_path = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    recoverable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        outcome = docx_objects.insert_page_break(
            source.path,
            locator,
            output_path=target_path,
        )
        break_locator, summary, metadata = outcome
    except recoverable as failure:
        self._raise_stale_if_document_changed(source, locator, failure)
        raise

    return self._finalize_object_mutation(
        source, target_path, break_locator, summary, metadata
    )
|
|
705
|
+
|
|
706
|
+
def docx_add_table(
    self,
    document_id: str,
    row_count: int,
    column_count: int,
    *,
    position: object | None = None,
    column_widths: list[int] | None = None,
    style_name: str | None = None,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Add a table to a docx document and re-index the output file.

    ``position`` may carry an anchor locator in one of two shapes: a plain
    string, or a dict whose ``"after"`` (or, failing that,
    ``"after_locator"``) entry is a string. When such an anchor is present
    it is checked for freshness before the write and is reported if the
    document turns out to have changed; otherwise ``"docx:document"`` is
    reported.

    Args:
        document_id: Id of an indexed document of type ``"docx"``.
        row_count: Forwarded to ``docx_objects.add_table``.
        column_count: Forwarded to ``docx_objects.add_table``.
        position: Forwarded unchanged; also inspected for an anchor locator.
        column_widths: Forwarded to ``docx_objects.add_table``.
        style_name: Forwarded to ``docx_objects.add_table``.
        output_mode: Passed to ``self._resolve_write_output_path``.

    On ``InvalidArgumentsError``, ``TargetNotFoundError`` or
    ``TargetNotEditableError`` the failure is first handed to
    ``self._raise_stale_if_document_changed`` and re-raised if that returns.
    """
    document = self._require_document_type(
        document_id,
        expected="docx",
        operation="docx_add_table",
    )

    # Extract the anchor locator once so the freshness check and the stale
    # report in the error handler always agree.
    anchor_locator: str | None = None
    if isinstance(position, str):
        anchor_locator = position
    elif isinstance(position, dict):
        candidate = position.get("after") or position.get("after_locator")
        if isinstance(candidate, str):
            anchor_locator = candidate
    if anchor_locator is not None:
        self._ensure_object_locator_fresh(document, anchor_locator)

    output_path = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    try:
        inserted_locator, summary, metadata = docx_objects.add_table(
            document.path,
            row_count,
            column_count,
            position=position,
            column_widths=column_widths,
            style_name=style_name,
            output_path=output_path,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        # An empty or missing anchor falls back to the document root.
        self._raise_stale_if_document_changed(
            document, anchor_locator or "docx:document", exc
        )
        raise
    return self._finalize_object_mutation(
        document, output_path, inserted_locator, summary, metadata
    )
|
|
761
|
+
|
|
762
|
+
def docx_merge_table_cells(
    self,
    document_id: str,
    start_locator: str,
    end_locator: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Merge the table cells between ``start_locator`` and ``end_locator``.

    The document must be of type ``"docx"``. Both locators are checked for
    freshness, start first. The write is delegated to
    ``docx_objects.merge_table_cells``.

    On ``InvalidArgumentsError``, ``TargetNotFoundError`` or
    ``TargetNotEditableError`` the failure is first handed to
    ``self._raise_stale_if_document_changed`` together with
    ``start_locator`` only, and re-raised if that returns.
    """
    source = self._require_document_type(
        document_id,
        expected="docx",
        operation="docx_merge_table_cells",
    )
    for corner in (start_locator, end_locator):
        self._ensure_object_locator_fresh(source, corner)
    target_path = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    recoverable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        outcome = docx_objects.merge_table_cells(
            source.path,
            start_locator,
            end_locator,
            output_path=target_path,
        )
        merged_locator, summary, metadata = outcome
    except recoverable as failure:
        self._raise_stale_if_document_changed(source, start_locator, failure)
        raise

    return self._finalize_object_mutation(
        source, target_path, merged_locator, summary, metadata
    )
|
|
797
|
+
|
|
798
|
+
def pptx_add_slide(
    self,
    document_id: str,
    *,
    layout_index: int | None = None,
    layout_name: str | None = None,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Add a slide to a pptx document and re-index the output file.

    The document must be of type ``"pptx"``. ``layout_index`` and
    ``layout_name`` are forwarded unchanged to ``pptx_objects.add_slide``.
    Errors from that call propagate as-is; no stale-document check is made
    here.
    """
    source = self._require_document_type(
        document_id, expected="pptx", operation="pptx_add_slide"
    )
    target_path = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    outcome = pptx_objects.add_slide(
        source.path,
        layout_index=layout_index,
        layout_name=layout_name,
        output_path=target_path,
    )
    slide_locator, summary, metadata = outcome
    return self._finalize_object_mutation(
        source, target_path, slide_locator, summary, metadata
    )
|
|
821
|
+
|
|
822
|
+
def pptx_duplicate_slide(
    self,
    document_id: str,
    locator: str,
    *,
    position: int | None = None,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Duplicate the slide at ``locator`` through ``pptx_objects.duplicate_slide``.

    Sequence: require a ``pptx`` document, run
    ``_ensure_object_locator_fresh`` on the locator, resolve the write target
    for ``output_mode``, call the adapter, and return the result of
    ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    source = self._require_document_type(
        document_id,
        expected="pptx",
        operation="pptx_duplicate_slide",
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        duplicate_locator, summary, metadata = pptx_objects.duplicate_slide(
            source.path,
            locator,
            position=position,
            output_path=destination,
        )
    except translatable as exc:
        # The helper may raise in place of ``exc`` (presumably a stale-locator
        # error); if it returns, the adapter error propagates unchanged.
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            source, destination, duplicate_locator, summary, metadata
        )
|
|
856
|
+
|
|
857
|
+
def pptx_set_slide_layout(
    self,
    document_id: str,
    locator: str,
    *,
    layout_index: int | None = None,
    layout_name: str | None = None,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Change the layout of the slide at ``locator``.

    Delegates to ``pptx_objects.set_slide_layout`` after requiring a ``pptx``
    document, checking the locator with ``_ensure_object_locator_fresh`` and
    resolving the write target for ``output_mode``. The adapter result is
    returned through ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    source = self._require_document_type(
        document_id,
        expected="pptx",
        operation="pptx_set_slide_layout",
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        slide_locator, summary, metadata = pptx_objects.set_slide_layout(
            source.path,
            locator,
            layout_index=layout_index,
            layout_name=layout_name,
            output_path=destination,
        )
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            source, destination, slide_locator, summary, metadata
        )
|
|
893
|
+
|
|
894
|
+
def pptx_add_text_shape(
    self,
    document_id: str,
    locator: str,
    text: str,
    *,
    left: int,
    top: int,
    width: int,
    height: int,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Add a text shape at ``locator`` through ``pptx_objects.add_text_shape``.

    ``text`` and the four geometry values (``left``, ``top``, ``width``,
    ``height``) are forwarded to the adapter as keyword arguments without
    conversion; their units are whatever the adapter expects.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    source = self._require_document_type(
        document_id,
        expected="pptx",
        operation="pptx_add_text_shape",
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        shape_locator, summary, metadata = pptx_objects.add_text_shape(
            source.path,
            locator,
            text=text,
            left=left,
            top=top,
            width=width,
            height=height,
            output_path=destination,
        )
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            source, destination, shape_locator, summary, metadata
        )
|
|
936
|
+
|
|
937
|
+
def xlsx_write_range(
    self,
    document_id: str,
    locator: str,
    values: list[list[Any]],
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Write ``values`` at ``locator`` through ``xlsx_objects.write_range``.

    Requires an ``xlsx`` document, checks the locator with
    ``_ensure_object_locator_fresh``, resolves the write target for
    ``output_mode`` and returns the adapter result via
    ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="xlsx_write_range"
    )
    self._ensure_object_locator_fresh(workbook, locator)
    destination = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        range_locator, summary, metadata = xlsx_objects.write_range(
            workbook.path,
            locator,
            values,
            output_path=destination,
        )
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(workbook, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            workbook, destination, range_locator, summary, metadata
        )
|
|
969
|
+
|
|
970
|
+
def xlsx_insert_rows_at(
    self,
    document_id: str,
    locator: str,
    row_number: int,
    count: int,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Insert ``count`` rows at ``row_number`` through ``xlsx_objects.insert_rows``.

    ``locator``, ``row_number`` and ``count`` are passed positionally to the
    adapter; the adapter result is returned via ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    # NOTE(review): the operation label is "xlsx_insert_rows", not the method
    # name -- confirm this is the intended user-facing operation name.
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="xlsx_insert_rows"
    )
    self._ensure_object_locator_fresh(workbook, locator)
    destination = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        rows_locator, summary, metadata = xlsx_objects.insert_rows(
            workbook.path,
            locator,
            row_number,
            count,
            output_path=destination,
        )
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(workbook, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            workbook, destination, rows_locator, summary, metadata
        )
|
|
1004
|
+
|
|
1005
|
+
def xlsx_insert_columns(
    self,
    document_id: str,
    locator: str,
    column_index: int,
    count: int,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Insert ``count`` columns at ``column_index`` through ``xlsx_objects.insert_columns``.

    ``locator``, ``column_index`` and ``count`` are passed positionally to the
    adapter; the adapter result is returned via ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    workbook = self._require_document_type(
        document_id,
        expected="xlsx",
        operation="xlsx_insert_columns",
    )
    self._ensure_object_locator_fresh(workbook, locator)
    destination = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        columns_locator, summary, metadata = xlsx_objects.insert_columns(
            workbook.path,
            locator,
            column_index,
            count,
            output_path=destination,
        )
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(workbook, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            workbook, destination, columns_locator, summary, metadata
        )
|
|
1041
|
+
|
|
1042
|
+
def xlsx_set_formula(
    self,
    document_id: str,
    locator: str,
    formula: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Set ``formula`` at ``locator`` through ``xlsx_objects.set_formula``.

    The formula string is forwarded to the adapter as-is; no validation is
    done in this method. The adapter result is returned via
    ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="xlsx_set_formula"
    )
    self._ensure_object_locator_fresh(workbook, locator)
    destination = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        cell_locator, summary, metadata = xlsx_objects.set_formula(
            workbook.path,
            locator,
            formula,
            output_path=destination,
        )
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(workbook, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            workbook, destination, cell_locator, summary, metadata
        )
|
|
1074
|
+
|
|
1075
|
+
def xlsx_merge_cells(
    self,
    document_id: str,
    locator: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Merge the cells addressed by ``locator`` through ``xlsx_objects.merge_cells``.

    Requires an ``xlsx`` document, checks the locator with
    ``_ensure_object_locator_fresh``, resolves the write target for
    ``output_mode`` and returns the adapter result via
    ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="xlsx_merge_cells"
    )
    self._ensure_object_locator_fresh(workbook, locator)
    destination = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        merged_range_locator, summary, metadata = xlsx_objects.merge_cells(
            workbook.path,
            locator,
            output_path=destination,
        )
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(workbook, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            workbook, destination, merged_range_locator, summary, metadata
        )
|
|
1105
|
+
|
|
1106
|
+
def create_document(
    self,
    format: str,
    output_path: Path,
    *,
    output_mode: OutputMode = "versioned",
    initial_sheet_name: str | None = None,
) -> MutationResult:
    """Create a new DOCX, PPTX or XLSX file, index it, and describe its root object.

    ``format`` is compared case-insensitively after stripping whitespace, and
    the canonicalized ``output_path`` must carry the matching extension.
    ``initial_sheet_name`` is only forwarded to the XLSX adapter, but is
    recorded in the result metadata whenever it is not ``None``.

    Raises:
        InvalidArgumentsError: If the format is not one of ``docx``, ``pptx``,
            ``xlsx``, or the output path's suffix does not match the format.
    """
    # Root locator looked up for each supported format after creation.
    root_locators = {
        "docx": "docx:document",
        "pptx": "pptx:presentation",
        "xlsx": "xlsx:workbook",
    }
    normalized_format = str(format).strip().lower()
    if normalized_format not in root_locators:
        raise InvalidArgumentsError(f"Unsupported document format: {format}")

    requested_path = canonicalize_output_path(output_path)
    suffix_matches = requested_path.suffix.lower() == f".{normalized_format}"
    if not suffix_matches:
        raise InvalidArgumentsError(
            f"create_document output path must use the .{normalized_format} extension."
        )

    target_path = self._resolve_create_output_path(
        requested_path, output_mode=output_mode
    )
    if normalized_format == "docx":
        docx_adapter.create_docx(target_path)
    elif normalized_format == "pptx":
        pptx_adapter.create_pptx(target_path)
    else:
        xlsx_adapter.create_xlsx(target_path, initial_sheet_name=initial_sheet_name)
    locator = root_locators[normalized_format]

    created = self.index_document(target_path)
    root = self.get_object(created.document_id, locator)
    metadata: dict[str, Any] = {"format": normalized_format}
    if initial_sheet_name is not None:
        metadata.update(initial_sheet_name=initial_sheet_name)
    # A freshly created file is both the source document and the output.
    return MutationResult(
        document_path=target_path,
        output_path=target_path,
        document_id=created.document_id,
        locator=root.locator,
        object_type=root.object_type,
        summary=f"Created {normalized_format.upper()} document at {target_path}.",
        capabilities=root.capabilities,
        parent_locator=root.parent_locator,
        metadata=metadata,
    )
|
|
1153
|
+
|
|
1154
|
+
def add_content_block(
    self,
    document_id: str,
    block_type: str,
    properties: dict[str, Any],
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Add a content block to a document through ``_dispatch_add_content_block``.

    ``block_type`` is lower-cased and stripped before dispatch. After a
    successful write the output file is re-indexed and the new object is
    looked up to populate the result.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the dispatcher once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    document = self.get_document(document_id)
    normalized_block_type = str(block_type).strip().lower()
    output_path = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )

    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        locator = self._dispatch_add_content_block(
            document,
            normalized_block_type,
            properties,
            output_path,
        )
    except translatable as exc:
        # Choose a locator to report: the first string-valued locator-like
        # property, else the DOCX root, else (below) the document path.
        locator_keys = {"locator", "slide", "slide_locator", "sheet", "sheet_locator"}
        stale_locator = "docx:document" if document.file_type == "docx" else None
        for key, value in properties.items():
            if key in locator_keys and isinstance(value, str):
                stale_locator = value
                break
        self._raise_stale_if_document_changed(
            document, stale_locator or document.path.as_posix(), exc
        )
        raise

    output_document = self.index_document(output_path)
    added = self.get_object(output_document.document_id, locator)
    summary = f"Added {normalized_block_type} to {document.file_type.upper()} document."
    return MutationResult(
        document_path=document.path,
        output_path=output_path,
        document_id=output_document.document_id,
        locator=added.locator,
        object_type=added.object_type,
        summary=summary,
        capabilities=added.capabilities,
        parent_locator=added.parent_locator,
        # Keys in the object's own metadata win over "block_type" on collision.
        metadata={"block_type": normalized_block_type, **added.metadata},
    )
|
|
1208
|
+
|
|
1209
|
+
def style_inline(
    self,
    document_id: str,
    locator: str,
    style: InlineStyle,
    clear_fields: list[str] | tuple[str, ...] | None = None,
    text_range: VisibleTextRange | dict[str, Any] | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Apply an inline style at ``locator``, optionally limited to ``text_range``.

    The adapter function is chosen from the document's file type and from
    whether a text range was supplied:

    * docx: ``style_paragraph_range`` / ``style_run``
    * pptx: ``style_paragraph_range`` / ``style_run``
    * any other type (handled by the XLSX adapter): ``style_cell_range`` /
      ``style_cell_inline``

    ``clear_fields`` defaults to an empty list. After the write, the output
    file is re-indexed and the styled object is looked up for the result.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    document = self.get_document(document_id)
    self._ensure_object_locator_fresh(document, locator)
    output_path = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    clear_list = list(clear_fields) if clear_fields is not None else []
    range_value = _coerce_visible_text_range(text_range)

    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        ranged = range_value is not None
        file_type = document.file_type
        if file_type == "docx":
            styler = (
                docx_adapter.style_paragraph_range if ranged else docx_adapter.style_run
            )
        elif file_type == "pptx":
            styler = (
                pptx_adapter.style_paragraph_range if ranged else pptx_adapter.style_run
            )
        else:
            styler = (
                xlsx_adapter.style_cell_range
                if ranged
                else xlsx_adapter.style_cell_inline
            )
        # Ranged variants take the range between the locator and the style.
        if ranged:
            positional = (document.path, locator, range_value, style, clear_list)
        else:
            positional = (document.path, locator, style, clear_list)
        output_path, result_locator, metadata = styler(
            *positional, output_path=output_path
        )
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(document, locator, exc)
        raise

    output_document = self.index_document(output_path)
    styled = self.get_object(output_document.document_id, result_locator)
    return MutationResult(
        document_path=document.path,
        output_path=output_path,
        document_id=output_document.document_id,
        locator=styled.locator,
        object_type=styled.object_type,
        summary=f"Applied inline style at {locator}.",
        capabilities=styled.capabilities,
        parent_locator=styled.parent_locator,
        metadata=metadata,
    )
|
|
1304
|
+
|
|
1305
|
+
def style_block(
    self,
    document_id: str,
    locator: str,
    style: BlockStyle,
    clear_fields: list[str] | tuple[str, ...] | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Apply a block-level style at ``locator``.

    Dispatches on the document's file type: ``docx_adapter.style_paragraph``
    for docx, ``pptx_adapter.style_paragraph`` for pptx, and
    ``xlsx_adapter.style_cell_block`` for anything else. ``clear_fields``
    defaults to an empty list. After the write, the output file is re-indexed
    and the styled object is looked up for the result.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    document = self.get_document(document_id)
    self._ensure_object_locator_fresh(document, locator)
    output_path = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    clear_list = list(clear_fields) if clear_fields is not None else []

    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        file_type = document.file_type
        if file_type == "docx":
            apply_style = docx_adapter.style_paragraph
        elif file_type == "pptx":
            apply_style = pptx_adapter.style_paragraph
        else:
            apply_style = xlsx_adapter.style_cell_block
        output_path, result_locator, metadata = apply_style(
            document.path,
            locator,
            style,
            clear_list,
            output_path=output_path,
        )
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(document, locator, exc)
        raise

    output_document = self.index_document(output_path)
    styled = self.get_object(output_document.document_id, result_locator)
    return MutationResult(
        document_path=document.path,
        output_path=output_path,
        document_id=output_document.document_id,
        locator=styled.locator,
        object_type=styled.object_type,
        summary=f"Applied block style at {locator}.",
        capabilities=styled.capabilities,
        parent_locator=styled.parent_locator,
        metadata=metadata,
    )
|
|
1367
|
+
|
|
1368
|
+
def set_structural_role(
    self,
    document_id: str,
    locator: str,
    role: str,
    level: int | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Assign structural ``role`` (and optional ``level``) at ``locator`` in a DOCX.

    Delegates to ``docx_adapter.set_structural_role``; the accepted role
    names and the meaning of ``level`` are defined by that adapter. After the
    write, the output file is re-indexed and the object is looked up for the
    result.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter once
            ``_raise_stale_if_document_changed`` has returned without raising.
    """
    document = self._require_document_type(
        document_id,
        expected="docx",
        operation="set_structural_role",
    )
    self._ensure_object_locator_fresh(document, locator)
    output_path = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        adapter_result = docx_adapter.set_structural_role(
            document.path,
            locator,
            role,
            level,
            output_path=output_path,
        )
        output_path, result_locator, metadata = adapter_result
    except translatable as exc:
        # Give the stale-document check a chance to raise instead of ``exc``.
        self._raise_stale_if_document_changed(document, locator, exc)
        raise

    output_document = self.index_document(output_path)
    updated = self.get_object(output_document.document_id, result_locator)
    return MutationResult(
        document_path=document.path,
        output_path=output_path,
        document_id=output_document.document_id,
        locator=updated.locator,
        object_type=updated.object_type,
        summary=f"Applied structural role {role!r} at {locator}.",
        capabilities=updated.capabilities,
        parent_locator=updated.parent_locator,
        metadata=metadata,
    )
|
|
1415
|
+
|
|
1416
|
+
def write_node(
    self,
    document_id: str,
    node_id: str,
    content: str,
    *,
    output_mode: OutputMode = "versioned",
) -> NodeWriteResult:
    """Replace the content of node ``node_id`` and report old and new text.

    The node is read before the write so its previous text can be returned.
    The write goes to the DOCX, PPTX or (for any other file type) XLSX
    adapter's ``write_node``; the output is then re-indexed and the node
    re-read for its new text.

    Raises:
        StaleLocatorError: If the adapter or the initial node lookup fails
            with one of the errors below and the file's current content hash
            differs from ``document.content_hash``.
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised when the content hash is unchanged.
    """
    document = self.get_document(document_id)
    # Hash the file up front; compared with the indexed hash on failure.
    source_hash = _content_hash(document.path)
    output_path = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    translatable = (InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError)
    try:
        previous = self.get_node(document_id, node_id)
        file_type = document.file_type
        if file_type == "docx":
            writer = docx_adapter.write_node
        elif file_type == "pptx":
            writer = pptx_adapter.write_node
        else:
            writer = xlsx_adapter.write_node
        output_path = writer(document.path, node_id, content, output_path)
    except translatable as exc:
        unchanged = source_hash == document.content_hash
        if unchanged:
            raise
        raise StaleLocatorError(
            f"stale locator: {node_id} is no longer valid for {document.path}"
        ) from exc

    output_document = self.index_document(output_path)
    rewritten = self.get_node(output_document.document_id, node_id)

    return NodeWriteResult(
        document_path=document.path,
        output_path=output_path,
        document_id=output_document.document_id,
        node_id=node_id,
        new_text=rewritten.text,
        previous_text=previous.text,
    )
|
|
1465
|
+
|
|
1466
|
+
def insert_content(
    self,
    document_id: str,
    content: str,
    *,
    style_name: str | None = None,
    after_node_id: str | None = None,
    output_mode: OutputMode = "versioned",
) -> InsertContentResult:
    """Insert a paragraph into a DOCX through ``docx_adapter.insert_paragraph``.

    ``after_node_id`` is handed to the adapter as ``after_locator``. The
    output file is re-indexed and the new node re-read; the result's preview
    is the first 120 characters of that node's text.
    """
    source = self._require_document_type(
        document_id, expected="docx", operation="insert_content"
    )
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    written_path, new_node_id = docx_adapter.insert_paragraph(
        source.path,
        content,
        style_name=style_name,
        after_locator=after_node_id,
        output_path=destination,
    )
    indexed = self.index_document(written_path)
    inserted = self.get_node(indexed.document_id, new_node_id)
    preview = inserted.text[:120]
    return InsertContentResult(
        document_path=source.path,
        output_path=written_path,
        document_id=indexed.document_id,
        new_node_id=new_node_id,
        preview=preview,
    )
|
|
1497
|
+
|
|
1498
|
+
def xlsx_insert_rows(
    self,
    document_id: str,
    sheet_name: str,
    *,
    rows: list[list[str]] | None = None,
    records: list[dict[str, str]] | None = None,
    output_mode: OutputMode = "versioned",
) -> XlsxInsertRowsResult:
    """Write ``rows`` or ``records`` to a worksheet through ``xlsx_adapter.write_table``.

    ``rows`` takes precedence: when it is not ``None``, ``records`` is
    ignored. The result reports how many entries were supplied and a locator
    for column ``A`` of the start row returned by the adapter.

    Raises:
        InvalidArgumentsError: If both ``rows`` and ``records`` are ``None``.
    """
    workbook = self._require_document_type(
        document_id,
        expected="xlsx",
        operation="xlsx_insert_rows",
    )
    output_path = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    if rows is None and records is None:
        raise InvalidArgumentsError(
            "xlsx_insert_rows requires either rows or records."
        )
    # Exactly one of the two keywords is forwarded to the adapter.
    if rows is not None:
        payload_key, payload = "rows", rows
    else:
        payload_key, payload = "records", records
    output_path, start_row, _ = xlsx_adapter.write_table(
        workbook.path,
        sheet_name,
        **{payload_key: payload},
        output_path=output_path,
    )
    rows_inserted = len(payload)
    indexed = self.index_document(output_path)
    first_row_locator = xlsx_adapter.make_item_id(sheet_name, f"A{start_row}")
    return XlsxInsertRowsResult(
        document_path=workbook.path,
        output_path=output_path,
        document_id=indexed.document_id,
        rows_inserted=rows_inserted,
        first_row_locator=first_row_locator,
    )
|
|
1544
|
+
|
|
1545
|
+
def docx_get_tables(self, document_id: str) -> DocxTablesResult:
    """List the tables of a DOCX document as ``DocxTableEntry`` records.

    Each entry's locator is built with ``make_table_cell_locator`` for cell
    (0, 0) of the table. Its metadata is the table's own metadata plus
    ``block_index``; a ``block_index`` key already in the table metadata
    takes precedence.
    """
    document = self._require_document_type(
        document_id, expected="docx", operation="docx_get_tables"
    )
    entries = []
    for table in docx_adapter.get_tables(document.path):
        first_cell_locator = docx_adapter.make_table_cell_locator(
            table.table_index, 0, 0
        )
        entry_metadata = {"block_index": table.block_index, **table.metadata}
        entries.append(
            DocxTableEntry(
                locator=first_cell_locator,
                table_index=table.table_index,
                rows=table.rows,
                preview=table.preview,
                metadata=entry_metadata,
            )
        )
    return DocxTablesResult(document=document, tables=tuple(entries))
|
|
1563
|
+
|
|
1564
|
+
def get_presentation_structure(self, document_id: str) -> PresentationStructure:
    """Return the slide structure of a PPTX document.

    The value of ``pptx_adapter.get_presentation_structure`` is stored in the
    ``slides`` field alongside the document record.
    """
    presentation = self._require_document_type(
        document_id, expected="pptx", operation="get_presentation_structure"
    )
    slides = pptx_adapter.get_presentation_structure(presentation.path)
    return PresentationStructure(document=presentation, slides=slides)
|
|
1570
|
+
|
|
1571
|
+
def get_slide_bundle(self, document_id: str, slide_number: int):
    """Return the adapter's bundle for ``slide_number`` with ``document`` filled in.

    The bundle from ``pptx_adapter.get_slide_bundle`` is copied with
    ``replace`` so its ``document`` field is the indexed document record.
    """
    presentation = self._require_document_type(
        document_id, expected="pptx", operation="get_slide_bundle"
    )
    bundle = pptx_adapter.get_slide_bundle(presentation.path, slide_number)
    return replace(bundle, document=presentation)
|
|
1579
|
+
|
|
1580
|
+
def get_slide_notes(self, document_id: str, slide_number: int) -> SlideNotes:
    """Return the notes text of ``slide_number`` in a PPTX document.

    The text comes from ``pptx_adapter.get_slide_notes``; the result carries
    the indexed document's id and the requested slide number.
    """
    presentation = self._require_document_type(
        document_id, expected="pptx", operation="get_slide_notes"
    )
    resolved_id = presentation.document_id
    notes_text = pptx_adapter.get_slide_notes(presentation.path, slide_number)
    return SlideNotes(
        document_id=resolved_id,
        slide_number=slide_number,
        notes_text=notes_text,
    )
|
|
1589
|
+
|
|
1590
|
+
def get_workbook_structure(self, document_id: str) -> WorkbookStructure:
    """Return the adapter's workbook structure with ``document`` filled in.

    The structure from ``xlsx_adapter.get_workbook_structure`` is copied with
    ``replace`` so its ``document`` field is the indexed document record.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="get_workbook_structure"
    )
    structure = xlsx_adapter.get_workbook_structure(workbook.path)
    return replace(structure, document=workbook)
|
|
1597
|
+
|
|
1598
|
+
def get_sheet_snapshot(
    self,
    document_id: str,
    sheet_name: str,
    *,
    cell_range: str | None = None,
    start_cell: str | None = None,
    row_count: int | None = None,
    column_count: int | None = None,
) -> SheetSnapshot:
    """Return a snapshot of ``sheet_name`` with ``document`` filled in.

    The four window selectors are forwarded unchanged to
    ``xlsx_adapter.get_sheet_snapshot``; how they combine is defined by the
    adapter. The snapshot is copied with ``replace`` so its ``document``
    field is the indexed document record.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="get_sheet_snapshot"
    )
    snapshot = xlsx_adapter.get_sheet_snapshot(
        workbook.path,
        sheet_name,
        cell_range=cell_range,
        start_cell=start_cell,
        row_count=row_count,
        column_count=column_count,
    )
    return replace(snapshot, document=workbook)
|
|
1622
|
+
|
|
1623
|
+
def get_document_blocks(self, document_id: str) -> DocumentBlocks:
    """Return the blocks of a DOCX document as reported by ``docx_adapter.get_blocks``."""
    source = self._require_document_type(
        document_id, expected="docx", operation="get_document_blocks"
    )
    blocks = docx_adapter.get_blocks(source.path)
    return DocumentBlocks(document=source, blocks=blocks)
|
|
1630
|
+
|
|
1631
|
+
def get_paragraphs(self, document_id: str) -> ParagraphCollection:
    """Return the paragraphs of a DOCX document as reported by ``docx_adapter.get_paragraphs``."""
    source = self._require_document_type(
        document_id, expected="docx", operation="get_paragraphs"
    )
    paragraphs = docx_adapter.get_paragraphs(source.path)
    return ParagraphCollection(document=source, paragraphs=paragraphs)
|
|
1638
|
+
|
|
1639
|
+
def get_tables(self, document_id: str) -> TableCollection:
    """Return the tables of a DOCX document as reported by ``docx_adapter.get_tables``."""
    source = self._require_document_type(
        document_id, expected="docx", operation="get_tables"
    )
    tables = docx_adapter.get_tables(source.path)
    return TableCollection(document=source, tables=tables)
|
|
1646
|
+
|
|
1647
|
+
def get_block_bundle(self, document_id: str, block_index: int) -> BlockBundle:
    """Return the adapter's bundle for ``block_index`` with ``document`` filled in.

    The bundle from ``docx_adapter.get_block_bundle`` is copied with
    ``replace`` so its ``document`` field is the indexed document record.
    """
    source = self._require_document_type(
        document_id, expected="docx", operation="get_block_bundle"
    )
    bundle = docx_adapter.get_block_bundle(source.path, block_index)
    return replace(bundle, document=source)
|
|
1654
|
+
|
|
1655
|
+
def append_row(
    self,
    document_id: str,
    sheet_name: str,
    *,
    values: list[str] | None = None,
    record: dict[str, str] | None = None,
    output_mode: OutputMode = "versioned",
) -> StructuredWriteResult:
    """Append one row to a worksheet and describe the written row.

    ``values`` and ``record`` are forwarded unchanged to
    ``xlsx_adapter.append_row``. The result carries the output path, row
    number and cell coordinates that the adapter reports back.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="append_row"
    )
    requested_path = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    # The adapter returns the path it actually wrote to; that path is the
    # one reported to the caller.
    written_path, row_number, coordinates = xlsx_adapter.append_row(
        workbook.path,
        sheet_name,
        values=values,
        record=record,
        output_path=requested_path,
    )
    row_target = StructuredTarget(
        target_type="worksheet_row",
        identifier=f"{sheet_name}!row:{row_number}",
        preview=", ".join(coordinates),
        metadata={
            "sheet_name": sheet_name,
            "row_number": row_number,
            "coordinates": list(coordinates),
        },
    )
    return StructuredWriteResult(
        document_path=workbook.path,
        output_path=written_path,
        target=row_target,
        summary=f"Appended row {row_number} to worksheet {sheet_name}.",
    )
|
|
1692
|
+
|
|
1693
|
+
def write_table(
    self,
    document_id: str,
    sheet_name: str,
    *,
    rows: list[list[str]] | None = None,
    records: list[dict[str, str]] | None = None,
    column_mapping: dict[str, str] | None = None,
    output_mode: OutputMode = "versioned",
) -> StructuredWriteResult:
    """Write a block of rows to a worksheet and describe the written range.

    ``rows``, ``records`` and ``column_mapping`` are forwarded unchanged to
    ``xlsx_adapter.write_table``. The result reports the first and last row
    returned by the adapter and the inclusive row count between them.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="write_table"
    )
    requested_path = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    written_path, start_row, end_row = xlsx_adapter.write_table(
        workbook.path,
        sheet_name,
        rows=rows,
        records=records,
        column_mapping=column_mapping,
        output_path=requested_path,
    )
    # Both bounds are inclusive, hence the +1.
    written_rows = end_row - start_row + 1
    range_target = StructuredTarget(
        target_type="worksheet_range",
        identifier=f"{sheet_name}!rows:{start_row}-{end_row}",
        preview=f"{sheet_name} rows {start_row}-{end_row}",
        metadata={
            "sheet_name": sheet_name,
            "start_row": start_row,
            "end_row": end_row,
            "row_count": written_rows,
        },
    )
    return StructuredWriteResult(
        document_path=workbook.path,
        output_path=written_path,
        target=range_target,
        summary=f"Wrote {written_rows} rows to worksheet {sheet_name}.",
    )
|
|
1733
|
+
|
|
1734
|
+
def append_paragraph(
    self,
    document_id: str,
    text: str,
    *,
    style_name: str | None = None,
    output_mode: OutputMode = "versioned",
) -> StructuredWriteResult:
    """Append a paragraph block to a DOCX document and describe it.

    After the adapter writes the paragraph, the new block is read back from
    the written file so that the preview, block type and style name in the
    result come from that file.
    """
    docx_ref = self._require_document_type(
        document_id, expected="docx", operation="append_paragraph"
    )
    requested_path = self._resolve_write_output_path(
        docx_ref.path, output_mode=output_mode
    )
    written_path, block_index = docx_adapter.append_paragraph_block(
        docx_ref.path,
        text,
        style_name=style_name,
        output_path=requested_path,
    )
    bundle = docx_adapter.get_block_bundle(written_path, block_index)
    block = bundle.block
    block_metadata = {
        "block_index": block_index,
        "block_type": block.block_type,
        # A bundle without a paragraph has no style to report.
        "style_name": (
            bundle.paragraph.style_name if bundle.paragraph is not None else None
        ),
    }
    block_target = StructuredTarget(
        target_type="document_block",
        identifier=f"block:{block_index}",
        preview=block.preview,
        metadata=block_metadata,
    )
    return StructuredWriteResult(
        document_path=docx_ref.path,
        output_path=written_path,
        target=block_target,
        summary=f"Appended paragraph block {block_index}.",
    )
|
|
1772
|
+
|
|
1773
|
+
def replace_block(
    self,
    document_id: str,
    block_index: int,
    text: str,
    *,
    output_mode: OutputMode = "versioned",
) -> StructuredWriteResult:
    """Replace the text of one DOCX block and describe the updated block.

    The block is read back from the written file after the replacement so
    that the preview and block type in the result come from that file.
    """
    docx_ref = self._require_document_type(
        document_id, expected="docx", operation="replace_block"
    )
    requested_path = self._resolve_write_output_path(
        docx_ref.path, output_mode=output_mode
    )
    written_path = docx_adapter.replace_block(
        docx_ref.path,
        block_index,
        text,
        output_path=requested_path,
    )
    block = docx_adapter.get_block_bundle(written_path, block_index).block
    block_target = StructuredTarget(
        target_type="document_block",
        identifier=f"block:{block_index}",
        preview=block.preview,
        metadata={
            "block_index": block_index,
            "block_type": block.block_type,
        },
    )
    return StructuredWriteResult(
        document_path=docx_ref.path,
        output_path=written_path,
        target=block_target,
        summary=f"Replaced block {block_index}.",
    )
|
|
1808
|
+
|
|
1809
|
+
def index_path(
    self,
    path: Path,
    *,
    with_embeddings: bool = False,
    reporter: ProgressReporter | None = None,
) -> IndexSummary:
    """Index every indexable candidate produced for ``path``.

    Each candidate is checked against the path policy. Candidates whose
    lower-cased suffix is not in ``INDEXABLE_EXTENSIONS`` are counted as
    skipped; the rest are passed to ``index_document``. Progress callbacks
    go to ``reporter``, or to a ``NullProgressReporter`` when it is falsy.
    """
    root = canonicalize_existing_path(path)
    self._ensure_allowed_document_path(root, action="index")
    candidates = _index_candidates(root)
    progress = reporter or NullProgressReporter()
    total = len(candidates)
    indexed_count = 0
    skipped_count = 0

    progress.on_index_start(total)
    for position, candidate in enumerate(candidates, start=1):
        self._ensure_allowed_document_path(candidate, action="index")
        if candidate.suffix.lower() in INDEXABLE_EXTENSIONS:
            progress.on_file_start(candidate, position, total)
            indexed_ref = self.index_document(
                candidate,
                with_embeddings=with_embeddings,
                reporter=progress,
            )
            progress.on_file_done(
                candidate, items_indexed=indexed_ref.item_count or 0
            )
            indexed_count += 1
        else:
            skipped_count += 1

    progress.on_index_done(files_indexed=indexed_count, files_skipped=skipped_count)
    return IndexSummary(
        files_scanned=total,
        files_indexed=indexed_count,
        files_skipped=skipped_count,
    )
|
|
1846
|
+
|
|
1847
|
+
def reindex_path(
    self,
    path: Path,
    *,
    with_embeddings: bool = False,
    reporter: ProgressReporter | None = None,
) -> IndexSummary:
    """Index ``path`` again by delegating to ``index_path``.

    All arguments are forwarded unchanged and the summary produced by
    ``index_path`` is returned as-is.
    """
    summary = self.index_path(
        path,
        with_embeddings=with_embeddings,
        reporter=reporter,
    )
    return summary
|
|
1855
|
+
|
|
1856
|
+
def refresh_document(
    self,
    document_id: str,
    *,
    with_embeddings: bool = False,
    reporter: ProgressReporter | None = None,
) -> IndexSummary:
    """Re-index the document identified by ``document_id``.

    The id is turned into a path with ``resolve_document_path`` and the
    work is delegated to ``reindex_path``.

    Args:
        document_id: Identifier of an already known document.
        with_embeddings: Forwarded to ``reindex_path``. ``index_document``
            calls ``store.delete_document_embeddings`` on every run, so
            callers that need embeddings to exist after the refresh must
            pass ``True``. The default ``False`` keeps the previous
            behaviour of this method.
        reporter: Optional progress reporter, forwarded unchanged.

    Returns:
        The summary returned by ``reindex_path``.
    """
    document_path = self.resolve_document_path(document_id)
    return self.reindex_path(
        document_path,
        with_embeddings=with_embeddings,
        reporter=reporter,
    )
|
|
1865
|
+
|
|
1866
|
+
# Index a single document: build its DocumentRef, extract its items, and
# write both to the index store. The store connection is committed once at
# the end, rolled back (and the exception re-raised) on any failure, and
# always closed. Returns the DocumentRef with item_count set to len(items).
def index_document(
|
|
1867
|
+
self,
|
|
1868
|
+
document_path: Path,
|
|
1869
|
+
*,
|
|
1870
|
+
with_embeddings: bool = False,
|
|
1871
|
+
reporter: ProgressReporter | None = None,
|
|
1872
|
+
) -> DocumentRef:
|
|
1873
|
+
# A falsy reporter is replaced by a NullProgressReporter instance.
active_reporter = reporter or NullProgressReporter()
|
|
1874
|
+
resolved_path, file_type = self._require_allowed_document_path(
|
|
1875
|
+
document_path, action="index"
|
|
1876
|
+
)
|
|
1877
|
+
document_ref = _build_document_ref(resolved_path, file_type)
|
|
1878
|
+
# Items are extracted before the store connection is opened.
items = _extract_items(resolved_path, file_type)
|
|
1879
|
+
document_ref = replace(document_ref, item_count=len(items))
|
|
1880
|
+
|
|
1881
|
+
connection = store.ensure_ready(self.config.index_path)
|
|
1882
|
+
try:
|
|
1883
|
+
store.upsert_document(connection, document_ref)
|
|
1884
|
+
# store.delete_document_embeddings is called on every run; embeddings are
# only written again below when with_embeddings is set and items is non-empty.
store.delete_document_embeddings(connection, document_ref.document_id)
|
|
1885
|
+
store.replace_document_items(connection, document_ref.document_id, items)
|
|
1886
|
+
if with_embeddings and items:
|
|
1887
|
+
provider = self._get_embedding_provider()
|
|
1888
|
+
store.ensure_embedding_meta(
|
|
1889
|
+
connection,
|
|
1890
|
+
model_name=provider.model_name,
|
|
1891
|
+
dimensions=provider.dimensions,
|
|
1892
|
+
)
|
|
1893
|
+
# XLSX texts come from xlsx_adapter.build_row_embeddings (one text per
# returned row embedding); every other file type gets one text per item.
if file_type == "xlsx":
|
|
1894
|
+
row_embeddings = xlsx_adapter.build_row_embeddings(
|
|
1895
|
+
items, resolved_path
|
|
1896
|
+
)
|
|
1897
|
+
embedding_texts = [
|
|
1898
|
+
row_embedding.text for row_embedding in row_embeddings
|
|
1899
|
+
]
|
|
1900
|
+
else:
|
|
1901
|
+
row_embeddings = []
|
|
1902
|
+
embedding_texts = [
|
|
1903
|
+
_build_embedding_text(item, resolved_path, file_type=file_type)
|
|
1904
|
+
for item in items
|
|
1905
|
+
]
|
|
1906
|
+
LOGGER.info(
|
|
1907
|
+
"Embedding generation started for %s with %s items",
|
|
1908
|
+
resolved_path,
|
|
1909
|
+
len(embedding_texts),
|
|
1910
|
+
)
|
|
1911
|
+
# NOTE(review): indentation is not visible in this view, so it is unclear
# whether this guard covers only the on_embedding_start call or everything
# down to the "completed" log line -- confirm against the real file.
if embedding_texts:
|
|
1912
|
+
active_reporter.on_embedding_start(
|
|
1913
|
+
resolved_path, len(embedding_texts)
|
|
1914
|
+
)
|
|
1915
|
+
# started_at feeds the elapsed-seconds value in the "completed" log line.
started_at = time.perf_counter()
|
|
1916
|
+
blobs = provider.embed_texts(
|
|
1917
|
+
embedding_texts,
|
|
1918
|
+
on_progress=active_reporter.on_embedding_item,
|
|
1919
|
+
)
|
|
1920
|
+
if file_type == "xlsx":
|
|
1921
|
+
# The provider must return exactly one blob per row embedding.
if len(blobs) != len(row_embeddings):
|
|
1922
|
+
raise RuntimeError(
|
|
1923
|
+
"Embedding provider returned an unexpected number of XLSX row vectors."
|
|
1924
|
+
)
|
|
1925
|
+
store.replace_xlsx_row_embeddings(
|
|
1926
|
+
connection,
|
|
1927
|
+
document_id=document_ref.document_id,
|
|
1928
|
+
model_name=provider.model_name,
|
|
1929
|
+
dimensions=provider.dimensions,
|
|
1930
|
+
row_embeddings=_build_xlsx_row_embedding_records(
|
|
1931
|
+
document_ref.document_id,
|
|
1932
|
+
row_embeddings,
|
|
1933
|
+
blobs,
|
|
1934
|
+
),
|
|
1935
|
+
)
|
|
1936
|
+
else:
|
|
1937
|
+
# The provider must return exactly one blob per item.
if len(blobs) != len(items):
|
|
1938
|
+
raise RuntimeError(
|
|
1939
|
+
"Embedding provider returned an unexpected number of vectors."
|
|
1940
|
+
)
|
|
1941
|
+
store.replace_document_embeddings(
|
|
1942
|
+
connection,
|
|
1943
|
+
document_id=document_ref.document_id,
|
|
1944
|
+
model_name=provider.model_name,
|
|
1945
|
+
dimensions=provider.dimensions,
|
|
1946
|
+
# Each entry pairs the storage id of an item with its blob.
embeddings=[
|
|
1947
|
+
(
|
|
1948
|
+
store.make_storage_id(
|
|
1949
|
+
document_ref.document_id, item.item_id
|
|
1950
|
+
),
|
|
1951
|
+
blob,
|
|
1952
|
+
)
|
|
1953
|
+
for item, blob in zip(items, blobs, strict=True)
|
|
1954
|
+
],
|
|
1955
|
+
)
|
|
1956
|
+
LOGGER.info(
|
|
1957
|
+
"Embedding generation completed for %s with %s items in %.3fs",
|
|
1958
|
+
resolved_path,
|
|
1959
|
+
len(embedding_texts),
|
|
1960
|
+
time.perf_counter() - started_at,
|
|
1961
|
+
)
|
|
1962
|
+
connection.commit()
|
|
1963
|
+
# Any failure rolls the connection back and is re-raised unchanged.
except Exception:
|
|
1964
|
+
connection.rollback()
|
|
1965
|
+
raise
|
|
1966
|
+
finally:
|
|
1967
|
+
connection.close()
|
|
1968
|
+
|
|
1969
|
+
return document_ref
|
|
1970
|
+
|
|
1971
|
+
def search_corpus(
    self,
    query: str,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
    limit: int = 20,
    mode: SearchMode = "keyword",
) -> list[SearchHit]:
    """Search the index in keyword, semantic or hybrid mode.

    ``file_type`` must be ``None``, ``"docx"``, ``"pptx"`` or ``"xlsx"``,
    otherwise ``InvalidArgumentsError`` is raised. When ``document_path``
    is given it is validated for the ``"search"`` action and the resolved
    path is used to scope the search. Semantic mode raises
    ``NoEmbeddingsError`` when ``store.has_item_embeddings`` reports none
    for the requested scope. Hits without a document path, or whose path
    ``_is_allowed_document_path`` rejects, are dropped from the result.
    """
    if file_type not in (None, "docx", "pptx", "xlsx"):
        raise InvalidArgumentsError(
            "Only DOCX, PPTX, and XLSX search are supported in this feature."
        )
    normalized_mode = _normalize_search_mode(mode)

    scope_path = None
    if document_path is not None:
        scope_path, _ = self._require_allowed_document_path(
            document_path, action="search"
        )

    connection = store.ensure_ready(self.config.index_path)
    try:
        if normalized_mode == "keyword":
            keyword_only_rows = store.search_items(
                connection,
                query,
                file_type=file_type,
                document_path=scope_path,
                limit=limit,
            )
            hits = [_search_hit_from_keyword_row(row) for row in keyword_only_rows]
        elif normalized_mode == "semantic":
            embeddings_present = store.has_item_embeddings(
                connection,
                file_type=file_type,
                document_path=scope_path,
            )
            if not embeddings_present:
                raise NoEmbeddingsError(
                    "No embeddings are indexed for the requested corpus. Reindex with --with-embeddings first."
                )
            # Ask for at least vector_search_top_k candidates, then trim.
            candidate_limit = max(limit, self.config.vector_search_top_k)
            semantic_only_hits = self._semantic_search(
                connection,
                query,
                file_type=file_type,
                document_path=scope_path,
                limit=candidate_limit,
            )
            hits = semantic_only_hits[:limit]
        else:
            # Any other normalized mode takes the hybrid path: both searches
            # run with the widened limit and are merged down to ``limit``.
            candidate_limit = max(limit, self.config.vector_search_top_k)
            keyword_rows = store.search_items(
                connection,
                query,
                file_type=file_type,
                document_path=scope_path,
                limit=candidate_limit,
            )
            semantic_hits = self._semantic_search(
                connection,
                query,
                file_type=file_type,
                document_path=scope_path,
                limit=candidate_limit,
                require_embeddings=False,
            )
            hits = _merge_hybrid_hits(
                keyword_rows,
                semantic_hits,
                limit=limit,
                keyword_weight=self.config.hybrid_keyword_weight,
                semantic_weight=self.config.hybrid_semantic_weight,
            )
            LOGGER.info(
                "Hybrid merge completed for query=%r with %s keyword hits, %s semantic hits, %s merged hits",
                query,
                len(keyword_rows),
                len(semantic_hits),
                len(hits),
            )
    finally:
        connection.close()

    visible_hits = []
    for hit in hits:
        if hit.document_path is None:
            continue
        if self._is_allowed_document_path(hit.document_path):
            visible_hits.append(hit)
    return visible_hits
|
|
2057
|
+
|
|
2058
|
+
def locate_paragraph(self, document_path: Path, paragraph_index: int) -> ItemRef:
    """Return the first item ``locate_items`` yields for a paragraph index."""
    located = self.locate_items(document_path, paragraph_index=paragraph_index)
    return located[0]
|
|
2060
|
+
|
|
2061
|
+
def locate_slide_shapes(
    self,
    document_path: Path,
    slide_number: int,
    shape_id: int | None = None,
) -> list[ItemRef]:
    """Return what ``locate_items`` yields for a slide and optional shape id."""
    located = self.locate_items(
        document_path,
        slide_number=slide_number,
        shape_id=shape_id,
    )
    return located
|
|
2070
|
+
|
|
2071
|
+
def locate_cell(
    self, document_path: Path, sheet_name: str, cell_coordinate: str
) -> ItemRef:
    """Return the first item ``locate_items`` yields for a sheet and cell."""
    located = self.locate_items(
        document_path,
        sheet_name=sheet_name,
        cell_coordinate=cell_coordinate,
    )
    return located[0]
|
|
2077
|
+
|
|
2078
|
+
def locate_items(
    self,
    document_path: Path,
    *,
    paragraph_index: int | None = None,
    slide_number: int | None = None,
    shape_id: int | None = None,
    sheet_name: str | None = None,
    cell_coordinate: str | None = None,
) -> list[ItemRef]:
    """Resolve indexed items from file-type specific selectors.

    The path is validated for the ``"locate"`` action and its document row
    is resolved before the selectors are checked.

    * DOCX: ``paragraph_index`` is required and no other selector may be set.
    * PPTX: ``slide_number`` is required, ``shape_id`` is optional, and the
      paragraph/sheet/cell selectors may not be set. Matches are ordered by
      ``shape_index`` metadata (default 0) and then by ``item_id``.
    * Any other file type is handled as XLSX: ``sheet_name`` and
      ``cell_coordinate`` are required and the other selectors may not be set.

    Raises ``InvalidArgumentsError`` for a selector combination that does
    not fit the file type and ``TargetNotFoundError`` when no PPTX item
    matches.
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="locate"
    )
    connection = store.ensure_ready(self.config.index_path)
    try:
        document_row = self._resolve_document_row(connection, resolved_path)

        if file_type == "docx":
            foreign_selectors = (slide_number, shape_id, sheet_name, cell_coordinate)
            if paragraph_index is None or any(
                selector is not None for selector in foreign_selectors
            ):
                raise InvalidArgumentsError(
                    "DOCX locate requires --paragraph and does not support --slide."
                )
            paragraph_row = self._resolve_indexed_item_row(
                connection,
                document_row,
                f"para:{paragraph_index}",
                resolved_path,
            )
            return [_item_ref_from_row(paragraph_row)]

        if file_type == "pptx":
            foreign_selectors = (paragraph_index, sheet_name, cell_coordinate)
            if any(selector is not None for selector in foreign_selectors):
                raise InvalidArgumentsError(
                    "PPTX locate supports --slide and optional --shape only."
                )
            if slide_number is None:
                raise InvalidArgumentsError("PPTX locate requires --slide.")

            def on_requested_shape(row) -> bool:
                # The slide must match; the shape id only when one was given.
                if _metadata_value(row, "slide_number") != slide_number:
                    return False
                if shape_id is None:
                    return True
                return _metadata_value(row, "shape_id") == shape_id

            def shape_order(row):
                return (
                    _metadata_value(row, "shape_index", default=0),
                    row["item_id"],
                )

            indexed_rows = store.fetch_items_for_document(
                connection, document_row["document_id"]
            )
            shape_rows = sorted(
                filter(on_requested_shape, indexed_rows), key=shape_order
            )
            if shape_rows:
                return [_item_ref_from_row(row) for row in shape_rows]
            if shape_id is None:
                raise TargetNotFoundError(
                    f"No indexed PPTX text shapes found on slide {slide_number} for {resolved_path}"
                )
            raise TargetNotFoundError(
                f"No indexed PPTX text shape found for slide {slide_number} shape {shape_id} in {resolved_path}"
            )

        foreign_selectors = (paragraph_index, slide_number, shape_id)
        if any(selector is not None for selector in foreign_selectors):
            raise InvalidArgumentsError(
                "XLSX locate supports --sheet and --cell only."
            )
        if sheet_name is None or cell_coordinate is None:
            raise InvalidArgumentsError("XLSX locate requires --sheet and --cell.")

        cell_row = self._resolve_indexed_item_row(
            connection,
            document_row,
            xlsx_adapter.make_item_id(sheet_name, cell_coordinate),
            resolved_path,
        )
        return [_item_ref_from_row(cell_row)]
    finally:
        connection.close()
|
|
2172
|
+
|
|
2173
|
+
def read_item(self, document_path: Path, item_id: str) -> str:
    """Return the text of one indexed item, read from the document file.

    The item must resolve in the index for the given document; the lookup
    result itself is discarded. The text is then read from the file by the
    adapter matching the file type (DOCX, PPTX, otherwise XLSX).
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="read"
    )
    connection = store.ensure_ready(self.config.index_path)
    try:
        document_row = self._resolve_document_row(connection, resolved_path)
        # Called for its validation effect only; the row is not used.
        self._resolve_indexed_item_row(
            connection, document_row, item_id, resolved_path
        )
    finally:
        connection.close()

    if file_type == "docx":
        read_text = docx_adapter.read_paragraph
    elif file_type == "pptx":
        read_text = pptx_adapter.read_text_shape
    else:
        read_text = xlsx_adapter.read_cell
    return read_text(resolved_path, item_id)
|
|
2190
|
+
|
|
2191
|
+
def replace_item_text(
    self,
    document_path: Path,
    item_id: str,
    text: str,
    *,
    output_mode: OutputMode = "versioned",
) -> PatchResult:
    """Replace the text of an indexed DOCX paragraph or PPTX text shape.

    XLSX paths are rejected with ``InvalidArgumentsError``. The target is
    checked with ``_prepare_write_target`` (the item must be indexed), the
    adapter writes the new text, the written file is indexed, and the
    item's row is read back from the index for the result.
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="write"
    )
    if file_type == "xlsx":
        raise InvalidArgumentsError(
            "XLSX replace is not supported; use write-cell."
        )

    self._prepare_write_target(
        resolved_path,
        file_type,
        item_id,
        require_indexed_item=True,
    )
    output_path = self._resolve_write_output_path(
        resolved_path, output_mode=output_mode
    )

    if file_type == "docx":
        write_text = docx_adapter.replace_paragraph
        read_text = docx_adapter.read_paragraph
    else:
        write_text = pptx_adapter.replace_text_shape
        read_text = pptx_adapter.read_text_shape
    # The adapter returns the path it wrote to; the text is read back from it.
    output_path = write_text(resolved_path, item_id, text, output_path)
    updated_text = read_text(output_path, item_id)
    self.index_document(output_path)

    connection = store.ensure_ready(self.config.index_path)
    try:
        indexed_path = output_path.resolve()
        document_row = self._resolve_document_row(connection, indexed_path)
        updated_item_row = self._resolve_indexed_item_row(
            connection, document_row, item_id, indexed_path
        )
    finally:
        connection.close()

    return PatchResult(
        document_path=resolved_path,
        output_path=output_path,
        item=_item_ref_from_row(updated_item_row),
        text=updated_text,
    )
|
|
2247
|
+
|
|
2248
|
+
def append_item_text(
    self,
    document_path: Path,
    item_id: str,
    text: str,
    *,
    output_mode: OutputMode = "versioned",
) -> PatchResult:
    """Append text to a DOCX paragraph, PPTX text shape or XLSX cell.

    The target is checked with ``_prepare_write_target``; the item must be
    indexed unless the file type is XLSX. The adapter for the file type
    writes the text, the written file is indexed, and the item's row is
    read back from the index for the result.
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="write"
    )
    self._prepare_write_target(
        resolved_path,
        file_type,
        item_id,
        require_indexed_item=file_type != "xlsx",
    )
    output_path = self._resolve_write_output_path(
        resolved_path, output_mode=output_mode
    )

    if file_type == "docx":
        write_text = docx_adapter.append_paragraph
        read_text = docx_adapter.read_paragraph
    elif file_type == "pptx":
        write_text = pptx_adapter.append_text_shape
        read_text = pptx_adapter.read_text_shape
    else:
        write_text = xlsx_adapter.append_cell
        read_text = xlsx_adapter.read_cell
    # The adapter returns the path it wrote to; the text is read back from it.
    output_path = write_text(resolved_path, item_id, text, output_path)
    updated_text = read_text(output_path, item_id)
    self.index_document(output_path)

    connection = store.ensure_ready(self.config.index_path)
    try:
        indexed_path = output_path.resolve()
        document_row = self._resolve_document_row(connection, indexed_path)
        updated_item_row = self._resolve_indexed_item_row(
            connection, document_row, item_id, indexed_path
        )
    finally:
        connection.close()

    return PatchResult(
        document_path=resolved_path,
        output_path=output_path,
        item=_item_ref_from_row(updated_item_row),
        text=updated_text,
    )
|
|
2304
|
+
|
|
2305
|
+
def write_cell_value(
    self,
    document_path: Path,
    sheet_name: str,
    cell_coordinate: str,
    value: str,
    *,
    output_mode: OutputMode = "versioned",
) -> PatchResult:
    """Write a value into one XLSX cell and return the updated item.

    Non-XLSX paths are rejected with ``InvalidArgumentsError``. The cell
    does not have to be indexed beforehand. After the adapter writes the
    value, the written file is indexed and the cell's row is read back
    from the index for the result.
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="write"
    )
    if file_type != "xlsx":
        raise InvalidArgumentsError("write-cell requires an .xlsx path.")

    item_id = xlsx_adapter.make_item_id(sheet_name, cell_coordinate)
    self._prepare_write_target(
        resolved_path,
        file_type,
        item_id,
        require_indexed_item=False,
    )
    requested_path = self._resolve_write_output_path(
        resolved_path, output_mode=output_mode
    )
    output_path = xlsx_adapter.write_cell(
        resolved_path, item_id, value, requested_path
    )
    updated_text = xlsx_adapter.read_cell(output_path, item_id)
    self.index_document(output_path)

    connection = store.ensure_ready(self.config.index_path)
    try:
        indexed_path = output_path.resolve()
        document_row = self._resolve_document_row(connection, indexed_path)
        updated_item_row = self._resolve_indexed_item_row(
            connection, document_row, item_id, indexed_path
        )
    finally:
        connection.close()

    return PatchResult(
        document_path=resolved_path,
        output_path=output_path,
        item=_item_ref_from_row(updated_item_row),
        text=updated_text,
    )
|
|
2354
|
+
|
|
2355
|
+
def _prepare_write_target(
    self,
    document_path: Path,
    file_type: FileType,
    item_id: str,
    *,
    require_indexed_item: bool,
) -> None:
    """Check that a write target is still valid before it is modified.

    The document must resolve in the index. When ``require_indexed_item``
    is true the item must resolve as well; on ``TargetNotFoundError`` for a
    PPTX file ``_raise_if_pptx_target_not_editable`` is called first and
    the original error is re-raised if that call returns. When the indexed
    content hash differs from the file's current hash, the target is
    re-checked with ``_ensure_current_target_resolves`` and any of the
    listed failures is turned into ``StaleLocatorError``.
    """
    connection = store.ensure_ready(self.config.index_path)
    try:
        document_row = self._resolve_document_row(connection, document_path)

        if require_indexed_item:
            try:
                self._resolve_indexed_item_row(
                    connection, document_row, item_id, document_path
                )
            except TargetNotFoundError:
                if file_type == "pptx":
                    _raise_if_pptx_target_not_editable(document_path, item_id)
                raise

        # Nothing more to verify while the file still matches the index.
        if document_row["content_hash"] == _content_hash(document_path):
            return

        try:
            _ensure_current_target_resolves(document_path, file_type, item_id)
        except (
            InvalidArgumentsError,
            TargetNotFoundError,
            pptx_adapter.TargetNotEditableError,
            xlsx_adapter.TargetNotAppendableError,
        ) as exc:
            raise StaleLocatorError(
                f"stale locator: {item_id} is no longer valid for {document_path}"
            ) from exc
    finally:
        connection.close()
|
|
2390
|
+
|
|
2391
|
+
def _ensure_object_locator_fresh(
    self, document: DocumentRef, locator: str | None
) -> None:
    """Raise ``StaleLocatorError`` when the file changed since it was indexed.

    Args:
        document: Reference whose ``content_hash`` is compared with the
            current hash of ``document.path``.
        locator: Locator named in the error message; when falsy the
            document path (POSIX form) is used instead.

    Raises:
        StaleLocatorError: The recorded hash is set and differs from the
            file's current hash.
    """
    recorded_hash = document.content_hash
    if recorded_hash is None:
        # Without a recorded hash the comparison can never fire, so the file
        # is not hashed at all (same short-circuit as
        # ``_raise_stale_if_document_changed``).
        return
    if recorded_hash != _content_hash(document.path):
        subject = locator or document.path.as_posix()
        raise StaleLocatorError(
            f"stale locator: {subject} is no longer valid for {document.path}"
        )
|
|
2400
|
+
|
|
2401
|
+
def _finalize_object_mutation(
    self,
    document: DocumentRef,
    output_path: Path,
    locator: str,
    summary: str,
    metadata: dict[str, Any],
) -> MutationResult:
    """Index the written file and build the result for a mutated object.

    The output file is indexed first; the object is then fetched again via
    ``get_object`` using the document id of that index run, and its
    locator, type, capabilities and parent locator go into the result.
    """
    reindexed = self.index_document(output_path)
    refreshed = self.get_object(reindexed.document_id, locator)
    result_fields = {
        "document_path": document.path,
        "output_path": output_path,
        "document_id": reindexed.document_id,
        "locator": refreshed.locator,
        "object_type": refreshed.object_type,
        "summary": summary,
        "capabilities": refreshed.capabilities,
        "parent_locator": refreshed.parent_locator,
        "metadata": metadata,
    }
    return MutationResult(**result_fields)
|
|
2422
|
+
|
|
2423
|
+
def _raise_stale_if_document_changed(
    self,
    document: DocumentRef,
    locator: str,
    exc: Exception,
) -> None:
    """Raise ``StaleLocatorError`` from ``exc`` if the file hash changed.

    Returns ``None`` without raising when the document has no recorded
    content hash or when the recorded hash equals the file's current hash.
    """
    recorded_hash = document.content_hash
    if recorded_hash is None:
        return
    if recorded_hash == _content_hash(document.path):
        return
    raise StaleLocatorError(
        f"stale locator: {locator} is no longer valid for {document.path}"
    ) from exc
|
|
2435
|
+
|
|
2436
|
+
def _resolve_create_output_path(
    self,
    output_path: Path,
    *,
    output_mode: OutputMode,
) -> Path:
    """Choose and prepare the path a newly created document is written to.

    In ``"inplace"`` mode the configuration must allow in-place overwrite,
    otherwise ``PolicyRefusedError`` is raised; the path returned by
    ``_ensure_allowed_output_path`` is used. In every other mode a
    versioned path is built from the configured output directory and
    checked against the output policy. In both cases the parent directory
    is created before the path is returned.
    """
    normalized_mode = _normalize_output_mode(output_mode)

    if normalized_mode != "inplace":
        versioned_path = versioning.build_versioned_output_path(
            output_path,
            output_directory=self.config.output_directory,
            create_directory=False,
        )
        # Only the policy check matters here; its return value is not used.
        self._ensure_allowed_output_path(versioned_path)
        versioned_path.parent.mkdir(parents=True, exist_ok=True)
        return versioned_path

    if not self.config.allow_inplace_overwrite:
        raise PolicyRefusedError(
            "In-place overwrite is not enabled. Set allow_inplace_overwrite = true to use output-mode inplace."
        )
    direct_path = self._ensure_allowed_output_path(output_path)
    direct_path.parent.mkdir(parents=True, exist_ok=True)
    return direct_path
|
|
2460
|
+
|
|
2461
|
+
def _dispatch_add_content_block(
    self,
    document: DocumentRef,
    block_type: str,
    properties: dict[str, Any],
    output_path: Path,
) -> str:
    """Route an add-content-block request to the matching adapter call.

    Supported ``file_type``/``block_type`` pairs are docx
    ``paragraph``/``heading``/``table``, pptx ``slide``/``textbox`` and xlsx
    ``sheet``/``row``/``cell``. The adapter writes to ``output_path`` and the
    locator of the added (or written) object is returned.

    Raises:
        InvalidArgumentsError: a required property (slide/sheet/cell locator,
            sheet name, row values) is missing or has the wrong type, or the
            file type / block type pair is not supported.
    """

    def first_text_property(accepted_keys: set[str]) -> str | None:
        # First string-valued entry, in dict iteration order, whose key is
        # one of the accepted aliases.
        for key, value in properties.items():
            if key in accepted_keys and isinstance(value, str):
                return value
        return None

    request = (document.file_type, block_type)

    if request == ("docx", "paragraph"):
        _, new_locator = docx_adapter.add_paragraph(
            document.path,
            str(properties.get("text", "")),
            output_path=output_path,
        )
        return new_locator

    if request == ("docx", "heading"):
        _, new_locator = docx_adapter.add_heading(
            document.path,
            str(properties.get("text", "")),
            int(properties.get("level", 1)),
            output_path=output_path,
        )
        return new_locator

    if request == ("docx", "table"):
        row_count = int(properties.get("rows", 0))
        column_count = int(properties.get("columns", 0))
        _, new_locator = docx_adapter.add_table(
            document.path,
            row_count,
            column_count,
            output_path=output_path,
        )
        return new_locator

    if request == ("pptx", "slide"):
        _, new_locator = pptx_adapter.add_slide(
            document.path, output_path=output_path
        )
        return new_locator

    if request == ("pptx", "textbox"):
        slide_locator = first_text_property({"slide", "slide_locator", "locator"})
        if slide_locator is None:
            raise InvalidArgumentsError(
                "PPTX textbox blocks require a slide locator."
            )
        _, new_locator = pptx_adapter.add_textbox(
            document.path,
            slide_locator,
            str(properties.get("text", "")),
            left=_optional_int(properties.get("left")),
            top=_optional_int(properties.get("top")),
            width=_optional_int(properties.get("width")),
            height=_optional_int(properties.get("height")),
            output_path=output_path,
        )
        return new_locator

    if request == ("xlsx", "sheet"):
        sheet_name = properties.get("name")
        if not isinstance(sheet_name, str):
            raise InvalidArgumentsError(
                "XLSX sheet blocks require a sheet name."
            )
        _, new_locator = xlsx_adapter.add_sheet(
            document.path, sheet_name, output_path=output_path
        )
        return new_locator

    if request == ("xlsx", "row"):
        sheet_locator = first_text_property({"sheet", "sheet_locator", "locator"})
        if sheet_locator is None:
            raise InvalidArgumentsError(
                "XLSX row blocks require a worksheet locator."
            )
        row_values = properties.get("values")
        if not isinstance(row_values, list):
            raise InvalidArgumentsError(
                "XLSX row blocks require a values list."
            )
        _, new_locator = xlsx_adapter.add_row(
            document.path,
            sheet_locator,
            row_values,
            output_path=output_path,
        )
        return new_locator

    if request == ("xlsx", "cell"):
        cell_locator = first_text_property({"cell", "cell_locator", "locator"})
        if cell_locator is None:
            raise InvalidArgumentsError(
                "XLSX cell blocks require a cell locator."
            )
        # The adapter is called with the legacy locator form; the caller gets
        # the v2 form of the same locator back.
        xlsx_adapter.write_cell(
            document.path,
            to_legacy_locator(cell_locator, file_type="xlsx"),
            properties.get("value"),
            output_path=output_path,
        )
        return to_v2_locator(cell_locator, file_type="xlsx")

    raise InvalidArgumentsError(
        f"Unsupported add_content_block combination: {document.file_type}/{block_type}"
    )
|
|
2589
|
+
|
|
2590
|
+
def _resolve_write_output_path(
    self,
    document_path: Path,
    *,
    output_mode: OutputMode,
) -> Path:
    """Pick the path an edit of ``document_path`` should be written to.

    In ``inplace`` mode ``document_path`` itself is returned once it passes
    the output-root policy. In the other mode a versioned path is built from
    ``document_path`` and the configured output directory, checked against
    the same policy, its parent directory created, and that path returned.

    Raises:
        PolicyRefusedError: ``inplace`` was requested while
            ``allow_inplace_overwrite`` is disabled in the configuration.
    """
    if _normalize_output_mode(output_mode) != "inplace":
        versioned_target = versioning.build_versioned_output_path(
            document_path,
            output_directory=self.config.output_directory,
            create_directory=False,
        )
        self._ensure_allowed_output_path(versioned_target)
        versioned_target.parent.mkdir(parents=True, exist_ok=True)
        return versioned_target

    if not self.config.allow_inplace_overwrite:
        raise PolicyRefusedError(
            "In-place overwrite is not enabled. Set allow_inplace_overwrite = true to use output-mode inplace."
        )
    # Validation only: the caller-supplied path is returned unchanged.
    self._ensure_allowed_output_path(document_path)
    return document_path
|
|
2613
|
+
|
|
2614
|
+
def _require_document_type(
|
|
2615
|
+
self,
|
|
2616
|
+
document_id: str,
|
|
2617
|
+
*,
|
|
2618
|
+
expected: FileType,
|
|
2619
|
+
operation: str,
|
|
2620
|
+
) -> DocumentRef:
|
|
2621
|
+
document = self.get_document(document_id)
|
|
2622
|
+
if document.file_type != expected:
|
|
2623
|
+
raise InvalidArgumentsError(
|
|
2624
|
+
f"{operation} requires a .{expected} document, got .{document.file_type}."
|
|
2625
|
+
)
|
|
2626
|
+
return document
|
|
2627
|
+
|
|
2628
|
+
def run_doctor(
    self,
    required_imports: Sequence[tuple[str, str]] | None = None,
) -> DoctorReport:
    """Run the environment checks and collect them into a ``DoctorReport``.

    ``required_imports`` is a sequence of ``(module_name, label)`` pairs; when
    it is ``None`` or empty, ``REQUIRED_IMPORTS`` is used. After the import
    checks come, in order: the sqlite module, FTS5 support, the index path,
    the embedding provider import, the embedding model, the embedding store,
    and the document / allowed / output root checks.
    """
    import_specs = required_imports or REQUIRED_IMPORTS
    checks: list[DoctorCheck] = [
        _check_import(module_name, label) for module_name, label in import_specs
    ]

    # List elements are evaluated top to bottom, so the checks run in the
    # order they are reported.
    checks += [
        _check_sqlite_module(),
        _check_fts5_support(),
        _check_index_path(self.config.index_path),
        _check_embedding_provider_import(),
        _check_embedding_model(
            self.config.embedding_model,
            self.config.embedding_dimensions,
            provider_factory=self.embedding_provider_factory,
        ),
        _check_embedding_store(
            self.config.index_path,
            self.config.embedding_model,
            self.config.embedding_dimensions,
        ),
    ]

    checks.extend(_check_document_roots(self.config.document_roots))
    checks.extend(_check_allowed_roots(self.config.allowed_roots))
    checks.extend(_check_output_roots(self.config.output_roots))

    return DoctorReport(checks=tuple(checks))
|
|
2660
|
+
|
|
2661
|
+
def _semantic_search(
    self,
    connection: sqlite3.Connection,
    query: str,
    *,
    file_type: str | None,
    document_path: Path | None,
    limit: int,
    require_embeddings: bool = True,
) -> list[SearchHit]:
    """Score every stored embedding against ``query`` and return the top hits.

    Item embeddings and XLSX row embeddings are fetched with the given
    ``file_type`` / ``document_path`` filters, compared with the embedded
    query, and sorted by descending score, then document path, then item id.
    At most ``limit`` hits are returned.

    Raises:
        NoEmbeddingsError: nothing is stored for the requested corpus and
            ``require_embeddings`` is true (otherwise ``[]`` is returned).
    """
    item_rows = store.fetch_item_embeddings(
        connection,
        file_type=file_type,
        document_path=document_path,
    )
    xlsx_rows = store.fetch_xlsx_row_embeddings(
        connection,
        file_type=file_type,
        document_path=document_path,
    )
    if not (item_rows or xlsx_rows):
        if not require_embeddings:
            return []
        raise NoEmbeddingsError(
            "No embeddings are indexed for the requested corpus. Reindex with --with-embeddings first."
        )

    provider = self._get_embedding_provider()
    store.ensure_embedding_meta(
        connection,
        model_name=provider.model_name,
        dimensions=provider.dimensions,
    )
    query_vector = _unpack_embedding(
        provider.embed_texts([query])[0], provider.dimensions
    )

    def similarity_to_query(row: sqlite3.Row) -> float:
        # Each stored row carries its own blob and dimension count.
        stored_vector = _unpack_embedding(row["embedding"], int(row["dimensions"]))
        return _cosine_similarity(query_vector, stored_vector)

    def ranking_key(hit: SearchHit):
        return (-hit.score, str(hit.document_path), hit.item_id)

    scored_hits: list[SearchHit] = [
        _search_hit_from_semantic_row(row, similarity_to_query(row))
        for row in item_rows
    ]
    scored_hits.extend(
        _search_hit_from_xlsx_semantic_row(connection, row, similarity_to_query(row))
        for row in xlsx_rows
    )
    scored_hits.sort(key=ranking_key)

    LOGGER.info(
        "Semantic search executed for query=%r top_k=%s hit_count=%s",
        query,
        limit,
        min(len(scored_hits), limit),
    )
    return scored_hits[:limit]
|
|
2728
|
+
|
|
2729
|
+
def _get_embedding_provider(self) -> embedding_provider.EmbeddingProvider:
|
|
2730
|
+
if self._embedding_provider is None:
|
|
2731
|
+
factory = self.embedding_provider_factory or (
|
|
2732
|
+
lambda model_name, dimensions: (
|
|
2733
|
+
embedding_provider.LocalEmbeddingProvider(
|
|
2734
|
+
model_name=model_name,
|
|
2735
|
+
dimensions=dimensions,
|
|
2736
|
+
)
|
|
2737
|
+
)
|
|
2738
|
+
)
|
|
2739
|
+
self._embedding_provider = factory(
|
|
2740
|
+
self.config.embedding_model,
|
|
2741
|
+
self.config.embedding_dimensions,
|
|
2742
|
+
)
|
|
2743
|
+
return self._embedding_provider
|
|
2744
|
+
|
|
2745
|
+
def _require_allowed_document_path(
    self,
    document_path: Path,
    *,
    action: str,
) -> tuple[Path, FileType]:
    """Validate that ``document_path`` is indexable and readable under policy.

    Returns the ``(resolved_path, file_type)`` pair produced by
    ``_require_indexable_path`` once the resolved path has also passed
    ``_ensure_allowed_document_path`` for ``action``.
    """
    indexable = _require_indexable_path(document_path)
    canonical_path, _ = indexable
    self._ensure_allowed_document_path(canonical_path, action=action)
    return indexable
|
|
2754
|
+
|
|
2755
|
+
def _ensure_allowed_document_path(
    self, document_path: Path, *, action: str
) -> Path:
    """Canonicalize an existing path and check it against the read roots.

    The roots come from ``_read_policy_roots``; ``action`` is used only to
    label the target in the policy check. Returns whatever
    ``ensure_path_allowed`` returns.
    """
    canonical_path = canonicalize_existing_path(document_path)
    read_roots = self._read_policy_roots()
    return ensure_path_allowed(
        canonical_path,
        read_roots,
        label=f"{action} target",
        policy_name="allowed roots",
    )
|
|
2765
|
+
|
|
2766
|
+
def _ensure_allowed_output_path(self, output_path: Path) -> Path:
    """Canonicalize ``output_path`` and check it against the configured output roots.

    Returns whatever ``ensure_path_allowed`` returns for the canonical path.
    """
    canonical_output = canonicalize_output_path(output_path)
    writable_roots = self.config.output_roots
    return ensure_path_allowed(
        canonical_output,
        writable_roots,
        label="write output",
        policy_name="output roots",
    )
|
|
2774
|
+
|
|
2775
|
+
def _is_allowed_document_path(self, document_path: Path) -> bool:
|
|
2776
|
+
try:
|
|
2777
|
+
self._ensure_allowed_document_path(document_path, action="read")
|
|
2778
|
+
except (PolicyRefusedError, TargetNotFoundError):
|
|
2779
|
+
return False
|
|
2780
|
+
return True
|
|
2781
|
+
|
|
2782
|
+
def _read_policy_roots(self) -> tuple[Path, ...]:
|
|
2783
|
+
combined = list(self.config.allowed_roots) + list(self.config.output_roots)
|
|
2784
|
+
unique_roots: list[Path] = []
|
|
2785
|
+
seen: set[Path] = set()
|
|
2786
|
+
for root in combined:
|
|
2787
|
+
if root not in seen:
|
|
2788
|
+
unique_roots.append(root)
|
|
2789
|
+
seen.add(root)
|
|
2790
|
+
return tuple(unique_roots)
|
|
2791
|
+
|
|
2792
|
+
def _resolve_item_row(
|
|
2793
|
+
self,
|
|
2794
|
+
connection: sqlite3.Connection,
|
|
2795
|
+
document_path: Path,
|
|
2796
|
+
item_id: str,
|
|
2797
|
+
) -> tuple[sqlite3.Row, sqlite3.Row]:
|
|
2798
|
+
resolved_path, _ = self._require_allowed_document_path(
|
|
2799
|
+
document_path, action="show"
|
|
2800
|
+
)
|
|
2801
|
+
document_row = self._resolve_document_row(connection, resolved_path)
|
|
2802
|
+
item_row = self._resolve_indexed_item_row(
|
|
2803
|
+
connection, document_row, item_id, resolved_path
|
|
2804
|
+
)
|
|
2805
|
+
return document_row, item_row
|
|
2806
|
+
|
|
2807
|
+
def _resolve_document_row(
    self,
    connection: sqlite3.Connection,
    document_path: Path,
) -> sqlite3.Row:
    """Fetch the index row stored for ``document_path``.

    Raises:
        TargetNotFoundError: the store has no row for this path.
    """
    found = store.fetch_document_by_path(connection, document_path)
    if found is not None:
        return found
    raise TargetNotFoundError(f"Document is not indexed: {document_path}")
|
|
2816
|
+
|
|
2817
|
+
def _resolve_document_by_id_row(
    self,
    connection: sqlite3.Connection,
    document_id: str,
) -> sqlite3.Row:
    """Fetch the index row stored for ``document_id``.

    Raises:
        TargetNotFoundError: the store has no row for this id.
    """
    found = store.fetch_document_by_id(connection, document_id)
    if found is not None:
        return found
    raise TargetNotFoundError(f"Document is not indexed: {document_id}")
|
|
2826
|
+
|
|
2827
|
+
def _resolve_indexed_item_row(
    self,
    connection: sqlite3.Connection,
    document_row: sqlite3.Row,
    item_id: str,
    document_path: Path,
) -> sqlite3.Row:
    """Fetch the index row for ``item_id`` within the given document row.

    ``document_path`` is used only in the error message.

    Raises:
        TargetNotFoundError: the store has no such item for the document.
    """
    owning_document_id = document_row["document_id"]
    found = store.fetch_item_by_id(connection, owning_document_id, item_id)
    if found is not None:
        return found
    raise TargetNotFoundError(
        f"Item {item_id} is not indexed for {document_path}"
    )
|
|
2842
|
+
|
|
2843
|
+
|
|
2844
|
+
def discover_documents(roots: Iterable[Path]) -> list[DocumentRef]:
    """Collect a ``DocumentRef`` for every supported file under ``roots``.

    Roots that do not exist or are not directories are skipped. Each root is
    walked recursively with entries ordered by their string path; regular
    files whose lower-cased suffix is a key of ``SUPPORTED_EXTENSIONS`` are
    included.
    """
    found: list[DocumentRef] = []

    for root in roots:
        if not (root.exists() and root.is_dir()):
            continue

        for entry in sorted(root.rglob("*"), key=str):
            if entry.is_file():
                suffix = entry.suffix.lower()
                if suffix in SUPPORTED_EXTENSIONS:
                    found.append(
                        _build_document_ref(entry, SUPPORTED_EXTENSIONS[suffix])
                    )

    return found
|
|
2864
|
+
|
|
2865
|
+
|
|
2866
|
+
def format_doctor_report(report: DoctorReport) -> str:
    """Render a doctor report as newline-joined plain text.

    The output is a ``Doctor Report`` heading, one ``[PASS]``/``[FAIL]`` line
    per check with its name and detail, and a closing summary chosen from
    ``report.ok``.
    """
    check_lines = [
        f"[{'PASS' if check.ok else 'FAIL'}] {check.name}: {check.detail}"
        for check in report.checks
    ]
    if report.ok:
        closing = "All checks passed."
    else:
        closing = "One or more checks failed."
    return "\n".join(["Doctor Report", *check_lines, closing])
|
|
2875
|
+
|
|
2876
|
+
|
|
2877
|
+
def _build_document_ref(path: Path, file_type: FileType) -> DocumentRef:
    """Build a ``DocumentRef`` describing the file at ``path``.

    The document id is the SHA-256 hex digest of the resolved path string;
    the modification time comes from ``stat()`` and the content hash from
    ``_content_hash``.
    """
    absolute = path.resolve()
    path_digest = hashlib.sha256(str(absolute).encode("utf-8"))
    # Keyword arguments are evaluated in order: stat() runs before the
    # content is hashed.
    return DocumentRef(
        document_id=path_digest.hexdigest(),
        path=absolute,
        file_type=file_type,
        display_name=absolute.name,
        modified_time=absolute.stat().st_mtime,
        content_hash=_content_hash(absolute),
    )
|
|
2890
|
+
|
|
2891
|
+
|
|
2892
|
+
def _index_candidates(path: Path) -> list[Path]:
|
|
2893
|
+
resolved = path.resolve()
|
|
2894
|
+
if resolved.is_dir():
|
|
2895
|
+
return sorted(
|
|
2896
|
+
[
|
|
2897
|
+
candidate
|
|
2898
|
+
for candidate in resolved.rglob("*")
|
|
2899
|
+
if candidate.is_file()
|
|
2900
|
+
and candidate.suffix.lower() in SUPPORTED_EXTENSIONS
|
|
2901
|
+
],
|
|
2902
|
+
key=lambda candidate: str(candidate),
|
|
2903
|
+
)
|
|
2904
|
+
return [resolved]
|
|
2905
|
+
|
|
2906
|
+
|
|
2907
|
+
def _require_indexable_path(path: Path) -> tuple[Path, FileType]:
    """Canonicalize ``path`` and determine its indexable file type.

    Returns ``(canonical_path, file_type)``, where the file type is looked up
    in ``INDEXABLE_EXTENSIONS`` by lower-cased suffix.

    Raises:
        InvalidArgumentsError: the suffix has no entry in
            ``INDEXABLE_EXTENSIONS``.
    """
    canonical = canonicalize_existing_path(path)
    detected_type = INDEXABLE_EXTENSIONS.get(canonical.suffix.lower())
    if detected_type is not None:
        return canonical, detected_type
    raise InvalidArgumentsError(
        f"Implemented operations require a .docx, .pptx, or .xlsx path: {path}"
    )
|
|
2915
|
+
|
|
2916
|
+
|
|
2917
|
+
def _extract_items(document_path: Path, file_type: FileType):
    """Run ``extract_document`` from the adapter that matches ``file_type``.

    Raises:
        InvalidArgumentsError: ``file_type`` is not ``docx``, ``pptx`` or
            ``xlsx``.
    """
    if file_type == "docx":
        adapter = docx_adapter
    elif file_type == "pptx":
        adapter = pptx_adapter
    elif file_type == "xlsx":
        adapter = xlsx_adapter
    else:
        raise InvalidArgumentsError(f"Unsupported indexable file type: {file_type}")
    return adapter.extract_document(document_path)
|
|
2925
|
+
|
|
2926
|
+
|
|
2927
|
+
def _search_hit_from_keyword_row(row: sqlite3.Row) -> SearchHit:
    """Turn a keyword-search result row into a ``SearchHit`` in ``keyword`` mode.

    The row's ``score`` column is converted to ``float`` and its ``path``
    column to ``Path``; the remaining columns are copied as-is.
    """
    # Columns whose name and value map straight onto SearchHit fields.
    copied = {
        column: row[column]
        for column in (
            "document_id",
            "item_id",
            "locator",
            "item_type",
            "preview",
            "display_name",
        )
    }
    return SearchHit(
        score=float(row["score"]),
        matched_text=row["content_text"],
        document_path=Path(row["path"]),
        match_mode="keyword",
        **copied,
    )
|
|
2940
|
+
|
|
2941
|
+
|
|
2942
|
+
def _search_hit_from_semantic_row(row: sqlite3.Row, similarity: float) -> SearchHit:
    """Turn an item-embedding row into a ``SearchHit`` in ``semantic`` mode.

    ``similarity`` becomes the hit score and is also recorded as both the
    ``semantic`` and ``final`` entries of ``scores``.
    """
    # Columns whose name and value map straight onto SearchHit fields.
    copied = {
        column: row[column]
        for column in (
            "document_id",
            "item_id",
            "locator",
            "item_type",
            "preview",
            "display_name",
        )
    }
    return SearchHit(
        score=similarity,
        matched_text=row["content_text"],
        document_path=Path(row["path"]),
        match_mode="semantic",
        scores={"semantic": similarity, "final": similarity},
        **copied,
    )
|
|
2956
|
+
|
|
2957
|
+
|
|
2958
|
+
def _search_hit_from_xlsx_semantic_row(
    connection: sqlite3.Connection,
    row: sqlite3.Row,
    similarity: float,
) -> SearchHit:
    """Turn an XLSX row-embedding row into a ``SearchHit`` in ``semantic`` mode.

    Besides the usual hit fields, the metadata records the matched sheet and
    row number, the coordinates of the cells fetched for this embedding, the
    ``coordinate`` value from the row's metadata JSON, and a flag marking the
    hit as resolved from a row embedding.
    """
    embedding_cells = store.fetch_xlsx_row_embedding_cells(
        connection, row["embedding_id"]
    )
    representative = _metadata_value(row, "coordinate")
    hit_metadata = {
        "matched_sheet": row["sheet_name"],
        "matched_row": int(row["row_number"]),
        "contributing_cell_coordinates": [
            cell["cell_coordinate"] for cell in embedding_cells
        ],
        "representative_cell_coordinate": representative,
        "resolved_from_row_embedding": True,
    }
    # Columns whose name and value map straight onto SearchHit fields.
    copied = {
        column: row[column]
        for column in (
            "document_id",
            "item_id",
            "locator",
            "item_type",
            "preview",
            "display_name",
        )
    }
    return SearchHit(
        score=similarity,
        matched_text=row["content_text"],
        document_path=Path(row["path"]),
        match_mode="semantic",
        scores={"semantic": similarity, "final": similarity},
        metadata=hit_metadata,
        **copied,
    )
|
|
2989
|
+
|
|
2990
|
+
|
|
2991
|
+
def _document_ref_from_row(row: sqlite3.Row) -> DocumentRef:
    """Build a ``DocumentRef`` from a stored document row.

    ``item_count`` is set only when the row has an ``item_count`` column;
    otherwise it is ``None``.
    """
    if "item_count" in row.keys():
        item_count = int(row["item_count"])
    else:
        item_count = None
    return DocumentRef(
        document_id=row["document_id"],
        path=Path(row["path"]),
        file_type=row["file_type"],
        display_name=row["display_name"],
        modified_time=float(row["modified_time"]),
        content_hash=row["content_hash"],
        item_count=item_count,
    )
|
|
3001
|
+
|
|
3002
|
+
|
|
3003
|
+
def _item_ref_from_row(row: sqlite3.Row) -> ItemRef:
    """Build an ``ItemRef`` from a stored item row.

    The ``metadata_json`` column is decoded from JSON; the other columns are
    copied unchanged.
    """
    copied = {
        column: row[column]
        for column in (
            "document_id",
            "item_id",
            "item_type",
            "locator",
            "preview",
            "content_text",
        )
    }
    return ItemRef(metadata=json.loads(row["metadata_json"]), **copied)
|
|
3013
|
+
|
|
3014
|
+
|
|
3015
|
+
def _metadata_value(row: sqlite3.Row, key: str, *, default=None):
|
|
3016
|
+
metadata = json.loads(row["metadata_json"])
|
|
3017
|
+
return metadata.get(key, default)
|
|
3018
|
+
|
|
3019
|
+
|
|
3020
|
+
def _normalize_search_mode(mode: str) -> SearchMode:
|
|
3021
|
+
normalized = mode.strip().lower()
|
|
3022
|
+
if normalized not in {"keyword", "semantic", "hybrid"}:
|
|
3023
|
+
raise InvalidArgumentsError(f"Unsupported search mode: {mode}")
|
|
3024
|
+
return normalized # type: ignore[return-value]
|
|
3025
|
+
|
|
3026
|
+
|
|
3027
|
+
def _normalize_output_mode(output_mode: str) -> OutputMode:
|
|
3028
|
+
normalized = output_mode.strip().lower()
|
|
3029
|
+
if normalized not in {"versioned", "inplace"}:
|
|
3030
|
+
raise InvalidArgumentsError(f"Unsupported output mode: {output_mode}")
|
|
3031
|
+
return normalized # type: ignore[return-value]
|
|
3032
|
+
|
|
3033
|
+
|
|
3034
|
+
def _optional_int(value: object) -> int | None:
|
|
3035
|
+
if value is None:
|
|
3036
|
+
return None
|
|
3037
|
+
if isinstance(value, bool):
|
|
3038
|
+
raise InvalidArgumentsError(f"Expected integer value, got {value!r}.")
|
|
3039
|
+
try:
|
|
3040
|
+
return int(value)
|
|
3041
|
+
except (TypeError, ValueError) as exc:
|
|
3042
|
+
raise InvalidArgumentsError(f"Expected integer value, got {value!r}.") from exc
|
|
3043
|
+
|
|
3044
|
+
|
|
3045
|
+
def _content_hash(path: Path) -> str:
|
|
3046
|
+
return hashlib.sha256(path.read_bytes()).hexdigest()
|
|
3047
|
+
|
|
3048
|
+
|
|
3049
|
+
def _build_embedding_text(
    item: IndexedItem, document_path: Path, *, file_type: FileType
) -> str:
    """Run ``build_embedding_text`` from the adapter that matches ``file_type``.

    Raises:
        InvalidArgumentsError: ``file_type`` is not ``docx``, ``pptx`` or
            ``xlsx``.
    """
    if file_type == "docx":
        adapter = docx_adapter
    elif file_type == "pptx":
        adapter = pptx_adapter
    elif file_type == "xlsx":
        adapter = xlsx_adapter
    else:
        raise InvalidArgumentsError(f"Unsupported embedding text type: {file_type}")
    return adapter.build_embedding_text(item, document_path)
|
|
3059
|
+
|
|
3060
|
+
|
|
3061
|
+
def _unpack_embedding(blob: bytes, dimensions: int) -> list[float]:
|
|
3062
|
+
expected_length = dimensions * 4
|
|
3063
|
+
if len(blob) != expected_length:
|
|
3064
|
+
raise RuntimeError(
|
|
3065
|
+
f"Embedding blob length {len(blob)} does not match expected size {expected_length}."
|
|
3066
|
+
)
|
|
3067
|
+
return list(struct.unpack(f"<{dimensions}f", blob))
|
|
3068
|
+
|
|
3069
|
+
|
|
3070
|
+
def _cosine_similarity(left: Sequence[float], right: Sequence[float]) -> float:
|
|
3071
|
+
if len(left) != len(right):
|
|
3072
|
+
raise RuntimeError("Embedding vectors must have the same dimensionality.")
|
|
3073
|
+
return float(sum(a * b for a, b in zip(left, right)))
|
|
3074
|
+
|
|
3075
|
+
|
|
3076
|
+
def _rank_scores(storage_ids: Sequence[str]) -> dict[str, float]:
|
|
3077
|
+
return {
|
|
3078
|
+
storage_id: 1.0 / rank for rank, storage_id in enumerate(storage_ids, start=1)
|
|
3079
|
+
}
|
|
3080
|
+
|
|
3081
|
+
|
|
3082
|
+
def _merge_hybrid_hits(
    keyword_rows: Sequence[sqlite3.Row],
    semantic_hits: Sequence[SearchHit],
    *,
    limit: int,
    keyword_weight: float,
    semantic_weight: float,
) -> list[SearchHit]:
    """Fuse keyword rows and semantic hits into one ranked ``hybrid`` list.

    Results are matched on storage id: the ``storage_id`` column for keyword
    rows and ``"{document_id}:{item_id}"`` for semantic hits. Each side
    contributes a rank score from ``_rank_scores`` (``0.0`` when the item is
    absent from that side), and the final score is the weighted sum. When an
    item is present on both sides the semantic hit supplies the hit fields.
    Hits are sorted by descending final, semantic and keyword score, then by
    document path and item id, and at most ``limit`` are returned.
    """
    keyword_ids = [row["storage_id"] for row in keyword_rows]
    semantic_ids = [f"{hit.document_id}:{hit.item_id}" for hit in semantic_hits]

    # dict(zip(...)) keeps the last entry for a repeated id.
    keyword_by_storage = dict(zip(keyword_ids, keyword_rows))
    semantic_by_storage = dict(zip(semantic_ids, semantic_hits))
    keyword_rank_scores = _rank_scores(keyword_ids)
    semantic_rank_scores = _rank_scores(semantic_ids)

    def ranking_key(hit: SearchHit):
        component_scores = hit.scores or {}
        return (
            -hit.score,
            -component_scores.get("semantic", 0.0),
            -component_scores.get("keyword", 0.0),
            str(hit.document_path),
            hit.item_id,
        )

    merged: list[SearchHit] = []
    for storage_id in sorted({*keyword_by_storage, *semantic_by_storage}):
        base_hit = semantic_by_storage.get(storage_id)
        if base_hit is None:
            # Not on the semantic side, so the id must come from a keyword row.
            base_hit = _search_hit_from_keyword_row(keyword_by_storage[storage_id])

        keyword_score = keyword_rank_scores.get(storage_id, 0.0)
        semantic_score = semantic_rank_scores.get(storage_id, 0.0)
        final_score = (keyword_weight * keyword_score) + (
            semantic_weight * semantic_score
        )
        merged.append(
            SearchHit(
                document_id=base_hit.document_id,
                item_id=base_hit.item_id,
                score=final_score,
                matched_text=base_hit.matched_text,
                locator=base_hit.locator,
                item_type=base_hit.item_type,
                preview=base_hit.preview,
                document_path=base_hit.document_path,
                display_name=base_hit.display_name,
                match_mode="hybrid",
                scores={
                    "keyword": keyword_score,
                    "semantic": semantic_score,
                    "final": final_score,
                },
                metadata=dict(base_hit.metadata),
            )
        )

    merged.sort(key=ranking_key)
    return merged[:limit]
|
|
3144
|
+
|
|
3145
|
+
|
|
3146
|
+
def _ensure_current_target_resolves(
    document_path: Path, file_type: FileType, item_id: str
) -> None:
    """Read the target item from the current file and discard the result.

    ``docx`` reads a paragraph and ``pptx`` a text shape; every other file
    type is read as an XLSX cell. Any exception from the adapter read
    propagates to the caller.
    """
    if file_type == "docx":
        read_target = docx_adapter.read_paragraph
    elif file_type == "pptx":
        read_target = pptx_adapter.read_text_shape
    else:
        read_target = xlsx_adapter.read_cell
    read_target(document_path, item_id)
|
|
3156
|
+
|
|
3157
|
+
|
|
3158
|
+
def _object_resolver(file_type: FileType):
    """Look up the entry registered for ``file_type`` in ``OBJECT_RESOLVERS``."""
    resolver = OBJECT_RESOLVERS[file_type]
    return resolver
|
|
3160
|
+
|
|
3161
|
+
|
|
3162
|
+
def _require_capability(
|
|
3163
|
+
capabilities: Sequence[Capability],
|
|
3164
|
+
required: Capability,
|
|
3165
|
+
locator: str,
|
|
3166
|
+
) -> None:
|
|
3167
|
+
if required not in capabilities:
|
|
3168
|
+
raise TargetNotEditableError(f"{locator} does not support {required.value}.")
|
|
3169
|
+
|
|
3170
|
+
|
|
3171
|
+
def _primary_locator_for_batch(operations: Sequence[dict[str, Any]]) -> str | None:
|
|
3172
|
+
for operation in operations:
|
|
3173
|
+
for key in (
|
|
3174
|
+
"locator",
|
|
3175
|
+
"parent_locator",
|
|
3176
|
+
"new_parent_locator",
|
|
3177
|
+
"target_parent_locator",
|
|
3178
|
+
):
|
|
3179
|
+
value = operation.get(key)
|
|
3180
|
+
if isinstance(value, str) and value:
|
|
3181
|
+
return value
|
|
3182
|
+
return None
|
|
3183
|
+
|
|
3184
|
+
|
|
3185
|
+
def _make_batch_work_path(output_path: Path, suffix: str) -> Path:
|
|
3186
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
3187
|
+
handle = tempfile.NamedTemporaryFile(
|
|
3188
|
+
prefix=".offagent-batch-",
|
|
3189
|
+
suffix=suffix,
|
|
3190
|
+
dir=output_path.parent,
|
|
3191
|
+
delete=False,
|
|
3192
|
+
)
|
|
3193
|
+
handle.close()
|
|
3194
|
+
return Path(handle.name)
|
|
3195
|
+
|
|
3196
|
+
|
|
3197
|
+
def _validate_batch_operation(
    document_path: Path,
    file_type: FileType,
    operation: dict[str, Any],
) -> MutationResult:
    """Check that one batch operation could be applied, without applying it.

    For ``create_object`` the parent must resolve and support ``ADD_CHILD``,
    and the operation must pass ``_validate_create_operation``. For
    ``update_object``, ``move_object`` and ``copy_object`` the target must
    resolve and support the matching capability; ``update_object``
    additionally rejects a ``range`` and rejects ``segments`` combined with
    ``properties.text`` / ``properties.value``.

    Returns a ``MutationResult`` with ``output_path=None`` describing the
    validated operation.

    Raises:
        InvalidArgumentsError: an ``update_object`` operation carries a range,
            or both segments and text/value properties.
    """
    operation_name = _operation_name(operation)

    if operation_name != "create_object":
        target_locator = str(operation["locator"])
        target = _object_resolver(file_type).get_object(document_path, target_locator)
        capability_by_operation = {
            "update_object": Capability.UPDATE,
            "move_object": Capability.MOVE,
            "copy_object": Capability.COPY,
        }
        _require_capability(
            target.capabilities,
            capability_by_operation[operation_name],
            target_locator,
        )
        if operation_name == "update_object":
            inline_segments = _coerce_inline_fragments(operation.get("segments"))
            if _coerce_visible_text_range(operation.get("range")) is not None:
                raise InvalidArgumentsError("update_object does not accept range.")
            if inline_segments is not None:
                # Properties are only inspected when segments were supplied.
                supplied = dict(operation.get("properties", {}))
                if "text" in supplied or "value" in supplied:
                    raise InvalidArgumentsError(
                        "update_object accepts either properties.text/value or segments, not both."
                    )
        return MutationResult(
            document_path=document_path,
            output_path=None,
            document_id=document_path.resolve().as_posix(),
            locator=target.locator,
            object_type=target.object_type,
            summary=f"Validated {operation_name}.",
            capabilities=target.capabilities,
            parent_locator=target.parent_locator,
            metadata={"operation": operation_name},
        )

    parent = _object_resolver(file_type).get_object(
        document_path, str(operation["parent_locator"])
    )
    _require_capability(parent.capabilities, Capability.ADD_CHILD, parent.locator)
    _validate_create_operation(file_type, operation)
    return MutationResult(
        document_path=document_path,
        output_path=None,
        document_id=document_path.resolve().as_posix(),
        locator=None,
        object_type=str(operation["object_type"]),
        summary=f"Validated {operation_name}.",
        parent_locator=parent.locator,
        metadata={"operation": operation_name},
    )
|
|
3250
|
+
|
|
3251
|
+
|
|
3252
|
+
def _apply_batch_operation(
    document_path: Path,
    file_type: FileType,
    operation: dict[str, Any],
) -> MutationResult:
    """Apply one batch operation to ``document_path`` and describe the outcome.

    The operation name is obtained from ``_operation_name``.  The matching
    ``_*_object_on_path`` helper is then called with ``output_path`` set to
    ``document_path`` itself, so the document is rewritten at its own path.
    Afterwards the affected object is resolved again so that the returned
    ``MutationResult`` carries the locator, object type, capabilities and
    parent locator reported by the resolver after the change.

    Raises:
        InvalidArgumentsError: from ``_operation_name`` when the operation
            name is missing or unsupported.
        KeyError: when a key required by the selected operation is absent
            (for example ``locator`` or ``parent_locator``).
    """
    kind = _operation_name(operation)

    if kind == "create_object":
        target_locator, summary, metadata = _create_object_on_path(
            document_path,
            file_type,
            parent_locator=str(operation["parent_locator"]),
            object_type=str(operation["object_type"]),
            properties=dict(operation.get("properties", {})),
            position=operation.get("position"),
            segments=_coerce_inline_fragments(operation.get("segments")),
            text_range=_coerce_visible_text_range(operation.get("range")),
            output_path=document_path,
        )
    elif kind == "update_object":
        # Updates report on the locator that was asked for.
        target_locator = str(operation["locator"])
        summary, metadata = _update_object_on_path(
            document_path,
            file_type,
            locator=target_locator,
            properties=dict(operation.get("properties", {})),
            segments=_coerce_inline_fragments(operation.get("segments")),
            text_range=_coerce_visible_text_range(operation.get("range")),
            output_path=document_path,
        )
    elif kind == "move_object":
        target_locator, summary, metadata = _move_object_on_path(
            document_path,
            file_type,
            locator=str(operation["locator"]),
            new_parent_locator=str(operation["new_parent_locator"]),
            position=operation.get("position"),
            output_path=document_path,
        )
    else:
        # ``_operation_name`` only lets four names through; the remaining
        # one is ``copy_object``.
        target_locator, summary, metadata = _copy_object_on_path(
            document_path,
            file_type,
            locator=str(operation["locator"]),
            target_parent_locator=str(operation["target_parent_locator"]),
            position=operation.get("position"),
            output_path=document_path,
        )

    # Re-read the object so the result reflects the document after the write.
    resolved = _object_resolver(file_type).get_object(document_path, target_locator)
    return MutationResult(
        document_path=document_path,
        output_path=document_path,
        document_id=document_path.resolve().as_posix(),
        locator=resolved.locator,
        object_type=resolved.object_type,
        summary=summary,
        capabilities=resolved.capabilities,
        parent_locator=resolved.parent_locator,
        metadata=metadata,
    )
|
|
3349
|
+
|
|
3350
|
+
|
|
3351
|
+
def _create_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    parent_locator: str,
    object_type: str,
    properties: dict[str, Any],
    position: object | None,
    segments: tuple[InlineFragment, ...] | None,
    text_range: VisibleTextRange | None,
    output_path: Path,
) -> tuple[str, str, dict[str, Any]]:
    """Create a new object under ``parent_locator`` and write to ``output_path``.

    The parent is resolved first and must expose ``Capability.ADD_CHILD``;
    the request is then checked by ``_validate_create_operation``.  Three
    creations are implemented here: a DOCX paragraph, a PPTX text box
    (``text_shape`` / ``textbox``) and an XLSX cell.  When ``segments`` is
    non-empty the freshly created paragraph / text box is rewritten with the
    formatted fragments, and an XLSX cell is written from the fragments
    directly.

    Returns:
        ``(locator, summary, metadata)`` for the created object.

    Raises:
        InvalidArgumentsError: for unsupported file type / object type
            combinations or missing required properties.
    """
    parent = _object_resolver(file_type).get_object(document_path, parent_locator)
    _require_capability(parent.capabilities, Capability.ADD_CHILD, parent_locator)
    _validate_create_operation(
        file_type,
        {
            "parent_locator": parent_locator,
            "object_type": object_type,
            "properties": properties,
            "position": position,
            "segments": segments,
            "range": text_range,
        },
    )

    summary = f"Created {object_type} under {parent_locator}."
    segment_count = None if segments is None else len(segments)

    if file_type == "docx":
        raw_style_name = properties.get("style_name")
        style = str(raw_style_name) if raw_style_name is not None else None
        text = _text_or_segments_text(properties, object_type, segments, keys=("text",))
        after_locator = _docx_after_locator(position)
        target_path, new_node_id = docx_adapter.insert_paragraph(
            document_path,
            text,
            style_name=style,
            after_locator=after_locator,
            output_path=output_path,
        )
        locator = to_v2_locator(new_node_id, file_type="docx")
        if segments:
            # Second pass over the file just written, applying inline styles.
            target_path, locator, _ = docx_adapter.rewrite_paragraph_fragments(
                target_path,
                locator,
                segments,
                output_path=target_path,
            )
        docx_metadata = {
            "text": text,
            "segments": segment_count,
            "style_name": style,
        }
        return (locator, summary, docx_metadata)

    if file_type == "pptx" and object_type in {"text_shape", "textbox"}:
        text = _text_or_segments_text(properties, object_type, segments, keys=("text",))
        left = _optional_int(properties.get("left"))
        top = _optional_int(properties.get("top"))
        width = _optional_int(properties.get("width"))
        height = _optional_int(properties.get("height"))
        if any(extent is None for extent in (left, top, width, height)):
            raise InvalidArgumentsError(
                "PPTX text_shape creation requires left, top, width, and height."
            )
        target_path, locator = pptx_adapter.add_textbox(
            document_path,
            parent_locator,
            text,
            left=left,
            top=top,
            width=width,
            height=height,
            output_path=output_path,
        )
        if segments:
            target_path, locator, _ = pptx_adapter.rewrite_paragraph_fragments(
                target_path,
                locator,
                segments,
                output_path=target_path,
            )
        return (locator, summary, {"text": text, "segments": segment_count})

    if file_type == "xlsx" and object_type == "cell":
        sheet_parts = parse_locator(
            to_v2_locator(parent_locator, file_type="xlsx")
        ).components
        if len(sheet_parts) != 3 or sheet_parts[:2] != ("xlsx", "sheet"):
            raise InvalidArgumentsError(
                "XLSX cell creation requires a worksheet parent locator."
            )
        coordinate = properties.get("coordinate")
        if not isinstance(coordinate, str) or not coordinate.strip():
            raise InvalidArgumentsError("XLSX cell creation requires a coordinate.")
        locator = f"xlsx:sheet:{sheet_parts[2]}!{coordinate.strip().upper()}"
        if segments:
            _, locator, _ = xlsx_adapter.write_cell_fragments(
                document_path,
                locator,
                segments,
                output_path=output_path,
            )
            text = "".join(fragment.text for fragment in segments)
        else:
            text = _required_string_property(properties, ("value", "text"), object_type)
            xlsx_adapter.write_node(
                document_path,
                to_legacy_locator(locator, file_type="xlsx"),
                text,
                output_path,
            )
        return (locator, summary, {"value": text, "segments": segment_count})

    raise InvalidArgumentsError(
        f"create_object is not supported for {file_type} {object_type}."
    )
|
|
3477
|
+
|
|
3478
|
+
|
|
3479
|
+
def _update_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    locator: str,
    properties: dict[str, Any],
    segments: tuple[InlineFragment, ...] | None,
    text_range: VisibleTextRange | None,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Replace the content of the object at ``locator`` and write ``output_path``.

    The object must expose ``Capability.UPDATE``.  Content comes either from
    ``segments`` (formatted fragments) or from ``properties`` (``text`` /
    ``value``), never both, and ``text_range`` is rejected outright.

    Returns:
        ``(summary, metadata)``.  Metadata uses the key ``"text"`` for
        DOCX/PPTX and ``"value"`` for XLSX; the segment paths additionally
        report the number of fragments in the adapter's snapshot.

    Raises:
        InvalidArgumentsError: when ``text_range`` is given, when both
            ``segments`` and ``text``/``value`` are given, or when no content
            is supplied at all.
    """
    payload = _object_resolver(file_type).get_object(document_path, locator)
    _require_capability(payload.capabilities, Capability.UPDATE, locator)

    if text_range is not None:
        raise InvalidArgumentsError(
            "update_object does not support range; use style_inline for partial formatting."
        )
    if segments is not None and ("text" in properties or "value" in properties):
        raise InvalidArgumentsError(
            "update_object accepts either properties.text/value or segments, not both."
        )

    if segments is not None:
        if file_type == "docx":
            _, _, snapshot = docx_adapter.rewrite_paragraph_fragments(
                document_path,
                locator,
                segments,
                output_path=output_path,
            )
            return (
                f"Updated {payload.object_type} {locator}.",
                {"text": snapshot.text, "segments": len(snapshot.fragments)},
            )
        if file_type == "pptx":
            _, rewritten_locator, snapshot = pptx_adapter.rewrite_paragraph_fragments(
                document_path,
                locator,
                segments,
                output_path=output_path,
            )
            # The PPTX summary names the locator returned by the adapter,
            # not the one that was passed in.
            return (
                f"Updated {payload.object_type} {rewritten_locator}.",
                {"text": snapshot.text, "segments": len(snapshot.fragments)},
            )
        if file_type == "xlsx":
            _, _, snapshot = xlsx_adapter.write_cell_fragments(
                document_path,
                locator,
                segments,
                output_path=output_path,
            )
            return (
                f"Updated {payload.object_type} {locator}.",
                {"value": snapshot.text, "segments": len(snapshot.fragments)},
            )

    if file_type in {"docx", "pptx"}:
        # Text formats look at ``text`` first and fall back to ``value``.
        new_text = _required_string_property(
            properties, ("text", "value"), payload.object_type
        )
        legacy = to_legacy_locator(locator, file_type=file_type)
        if file_type == "docx":
            docx_adapter.write_node(document_path, legacy, new_text, output_path)
        else:
            pptx_adapter.write_node(document_path, legacy, new_text, output_path)
        return (f"Updated {payload.object_type} {locator}.", {"text": new_text})

    # Spreadsheet path: ``value`` takes precedence over ``text``.
    new_value = _required_string_property(
        properties, ("value", "text"), payload.object_type
    )
    legacy = to_legacy_locator(locator, file_type="xlsx")
    xlsx_adapter.write_node(document_path, legacy, new_value, output_path)
    return (f"Updated {payload.object_type} {locator}.", {"value": new_value})
|
|
3553
|
+
|
|
3554
|
+
|
|
3555
|
+
def _move_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    locator: str,
    new_parent_locator: str,
    position: object | None,
    output_path: Path,
) -> tuple[str, str, dict[str, Any]]:
    """Move the object at ``locator`` to ``position`` and write ``output_path``.

    Only PPTX slides can be moved, and only within ``pptx:presentation``.
    The object must expose ``Capability.MOVE``.

    Returns:
        ``(new_locator, summary, metadata)`` where ``new_locator`` is
        ``pptx:slide:<position>``.

    Raises:
        InvalidArgumentsError: for any other object, any other parent, or a
            missing / non-integer position.
    """
    payload = _object_resolver(file_type).get_object(document_path, locator)
    _require_capability(payload.capabilities, Capability.MOVE, locator)

    is_pptx_slide = file_type == "pptx" and payload.object_type == "slide"
    if not is_pptx_slide:
        raise InvalidArgumentsError(
            f"move_object is not supported for {payload.object_type}."
        )
    if new_parent_locator != "pptx:presentation":
        raise InvalidArgumentsError(
            "PPTX slides can only be moved within the presentation root."
        )

    source_number = _pptx_slide_number(locator)
    destination = _required_position(position)
    _move_pptx_slide(document_path, source_number, destination, output_path)

    moved_locator = f"pptx:slide:{destination}"
    summary = f"Moved slide {source_number} to position {destination}."
    metadata = {"previous_locator": locator, "new_parent_locator": new_parent_locator}
    return (moved_locator, summary, metadata)
|
|
3584
|
+
|
|
3585
|
+
|
|
3586
|
+
def _copy_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    locator: str,
    target_parent_locator: str,
    position: object | None,
    output_path: Path,
) -> tuple[str, str, dict[str, Any]]:
    """Duplicate the object at ``locator`` and write the result to ``output_path``.

    Only PPTX slides can be copied, and only into ``pptx:presentation``.
    The object must expose ``Capability.COPY``.  The final position is the
    one reported by ``_copy_pptx_slide``.

    Returns:
        ``(copy_locator, summary, metadata)`` where ``copy_locator`` is
        ``pptx:slide:<position of the copy>``.

    Raises:
        InvalidArgumentsError: for any other object or any other parent.
    """
    payload = _object_resolver(file_type).get_object(document_path, locator)
    _require_capability(payload.capabilities, Capability.COPY, locator)

    is_pptx_slide = file_type == "pptx" and payload.object_type == "slide"
    if not is_pptx_slide:
        raise InvalidArgumentsError(
            f"copy_object is not supported for {payload.object_type}."
        )
    if target_parent_locator != "pptx:presentation":
        raise InvalidArgumentsError(
            "PPTX slides can only be copied within the presentation root."
        )

    source_number = _pptx_slide_number(locator)
    landed_at = _copy_pptx_slide(document_path, source_number, position, output_path)

    copy_locator = f"pptx:slide:{landed_at}"
    summary = f"Copied slide {source_number} to position {landed_at}."
    metadata = {"source_locator": locator, "target_parent_locator": target_parent_locator}
    return (copy_locator, summary, metadata)
|
|
3616
|
+
|
|
3617
|
+
|
|
3618
|
+
def _delete_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    locator: str,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Delete the object at ``locator`` using the handler for ``file_type``.

    The object is resolved first and must expose ``Capability.DELETE``.
    ``"docx"`` and ``"pptx"`` go to their own handlers; every other file type
    is handled by the XLSX routine.

    Returns:
        The ``(summary, metadata)`` pair produced by the selected handler.
    """
    payload = _object_resolver(file_type).get_object(document_path, locator)
    _require_capability(payload.capabilities, Capability.DELETE, locator)

    handlers = {
        "docx": _delete_docx_object,
        "pptx": _delete_pptx_object,
    }
    delete = handlers.get(file_type, _delete_xlsx_object)
    return delete(document_path, locator, output_path)
|
|
3633
|
+
|
|
3634
|
+
|
|
3635
|
+
def _validate_create_operation(file_type: FileType, operation: dict[str, Any]) -> None:
    """Check a ``create_object`` request without touching any document.

    Returns ``None`` when the request is acceptable.

    Raises:
        InvalidArgumentsError: when ``range`` is supplied, when both
            ``segments`` and ``text``/``value`` are supplied, when a required
            property for the target object is missing, or when the file type /
            object type combination is not one of DOCX paragraph, PPTX
            ``text_shape``/``textbox`` or XLSX cell.
        KeyError: when ``operation`` has no ``object_type``.
    """
    object_type = str(operation["object_type"])
    segments = _coerce_inline_fragments(operation.get("segments"))
    text_range = _coerce_visible_text_range(operation.get("range"))
    properties = dict(operation.get("properties", {}))

    if text_range is not None:
        raise InvalidArgumentsError("create_object does not accept range.")
    if segments is not None and ("text" in properties or "value" in properties):
        raise InvalidArgumentsError(
            "create_object accepts either text/value or segments, not both."
        )

    target = (file_type, object_type)

    if target == ("docx", "paragraph"):
        # Both helpers are called purely for the errors they raise.
        _text_or_segments_text(properties, object_type, segments, keys=("text",))
        _docx_after_locator(operation.get("position"))
        return

    if file_type == "pptx" and object_type in {"text_shape", "textbox"}:
        _text_or_segments_text(properties, object_type, segments, keys=("text",))
        geometry_keys = ("left", "top", "width", "height")
        if any(_optional_int(properties.get(key)) is None for key in geometry_keys):
            raise InvalidArgumentsError(
                "PPTX text_shape creation requires left, top, width, and height."
            )
        return

    if target == ("xlsx", "cell"):
        coordinate = properties.get("coordinate")
        if not isinstance(coordinate, str) or not coordinate.strip():
            raise InvalidArgumentsError("XLSX cell creation requires a coordinate.")
        if segments is None:
            _required_string_property(properties, ("value", "text"), object_type)
        return

    raise InvalidArgumentsError(
        f"create_object does not support {file_type} {object_type}."
    )
|
|
3668
|
+
|
|
3669
|
+
|
|
3670
|
+
def _operation_name(operation: dict[str, Any]) -> str:
|
|
3671
|
+
raw = operation.get("operation") or operation.get("op")
|
|
3672
|
+
if not isinstance(raw, str) or raw not in {
|
|
3673
|
+
"create_object",
|
|
3674
|
+
"update_object",
|
|
3675
|
+
"move_object",
|
|
3676
|
+
"copy_object",
|
|
3677
|
+
}:
|
|
3678
|
+
raise InvalidArgumentsError(f"Unsupported batch operation: {raw}")
|
|
3679
|
+
return raw
|
|
3680
|
+
|
|
3681
|
+
|
|
3682
|
+
def _required_string_property(
|
|
3683
|
+
properties: dict[str, Any],
|
|
3684
|
+
keys: Sequence[str],
|
|
3685
|
+
object_type: str,
|
|
3686
|
+
) -> str:
|
|
3687
|
+
for key in keys:
|
|
3688
|
+
value = properties.get(key)
|
|
3689
|
+
if value is None:
|
|
3690
|
+
continue
|
|
3691
|
+
return str(value)
|
|
3692
|
+
raise InvalidArgumentsError(
|
|
3693
|
+
f"{object_type} updates require one of: {', '.join(keys)}."
|
|
3694
|
+
)
|
|
3695
|
+
|
|
3696
|
+
|
|
3697
|
+
def _text_or_segments_text(
|
|
3698
|
+
properties: dict[str, Any],
|
|
3699
|
+
object_type: str,
|
|
3700
|
+
segments: tuple[InlineFragment, ...] | None,
|
|
3701
|
+
*,
|
|
3702
|
+
keys: Sequence[str],
|
|
3703
|
+
) -> str:
|
|
3704
|
+
if segments is not None:
|
|
3705
|
+
return "".join(fragment.text for fragment in segments)
|
|
3706
|
+
return _required_string_property(properties, keys, object_type)
|
|
3707
|
+
|
|
3708
|
+
|
|
3709
|
+
def _coerce_inline_fragments(
|
|
3710
|
+
value: object,
|
|
3711
|
+
) -> tuple[InlineFragment, ...] | None:
|
|
3712
|
+
if value is None:
|
|
3713
|
+
return None
|
|
3714
|
+
if not isinstance(value, Sequence) or isinstance(value, (str, bytes, bytearray)):
|
|
3715
|
+
raise InvalidArgumentsError("segments must be a sequence of inline fragments.")
|
|
3716
|
+
fragments: list[InlineFragment] = []
|
|
3717
|
+
for raw_fragment in value:
|
|
3718
|
+
if isinstance(raw_fragment, InlineFragment):
|
|
3719
|
+
fragment = raw_fragment
|
|
3720
|
+
elif isinstance(raw_fragment, dict):
|
|
3721
|
+
text = raw_fragment.get("text")
|
|
3722
|
+
if not isinstance(text, str) or not text:
|
|
3723
|
+
raise InvalidArgumentsError("segments must contain non-empty text.")
|
|
3724
|
+
raw_style = raw_fragment.get("style")
|
|
3725
|
+
if raw_style is None:
|
|
3726
|
+
style = InlineStyle()
|
|
3727
|
+
elif isinstance(raw_style, InlineStyle):
|
|
3728
|
+
style = raw_style
|
|
3729
|
+
elif isinstance(raw_style, dict):
|
|
3730
|
+
style = InlineStyle(**raw_style)
|
|
3731
|
+
else:
|
|
3732
|
+
raise InvalidArgumentsError("segment styles must be objects.")
|
|
3733
|
+
fragment = InlineFragment(text=text, style=style)
|
|
3734
|
+
else:
|
|
3735
|
+
raise InvalidArgumentsError(
|
|
3736
|
+
"segments must contain inline-fragment objects."
|
|
3737
|
+
)
|
|
3738
|
+
if not fragment.text:
|
|
3739
|
+
raise InvalidArgumentsError("segments must contain non-empty text.")
|
|
3740
|
+
fragments.append(fragment)
|
|
3741
|
+
if not fragments:
|
|
3742
|
+
raise InvalidArgumentsError("segments must not be empty.")
|
|
3743
|
+
return tuple(fragments)
|
|
3744
|
+
|
|
3745
|
+
|
|
3746
|
+
def _coerce_visible_text_range(
|
|
3747
|
+
value: object,
|
|
3748
|
+
) -> VisibleTextRange | None:
|
|
3749
|
+
if value is None:
|
|
3750
|
+
return None
|
|
3751
|
+
if isinstance(value, VisibleTextRange):
|
|
3752
|
+
result = value
|
|
3753
|
+
elif isinstance(value, dict):
|
|
3754
|
+
try:
|
|
3755
|
+
result = VisibleTextRange(start=int(value["start"]), end=int(value["end"]))
|
|
3756
|
+
except (KeyError, TypeError, ValueError) as exc:
|
|
3757
|
+
raise InvalidArgumentsError(
|
|
3758
|
+
"range must contain integer start and end offsets."
|
|
3759
|
+
) from exc
|
|
3760
|
+
else:
|
|
3761
|
+
raise InvalidArgumentsError(
|
|
3762
|
+
"range must be an object with start and end offsets."
|
|
3763
|
+
)
|
|
3764
|
+
if result.start < 0 or result.end <= result.start:
|
|
3765
|
+
raise InvalidArgumentsError(
|
|
3766
|
+
"range must use non-negative offsets with end > start."
|
|
3767
|
+
)
|
|
3768
|
+
return result
|
|
3769
|
+
|
|
3770
|
+
|
|
3771
|
+
def _docx_after_locator(position: object | None) -> str | None:
|
|
3772
|
+
if position is None:
|
|
3773
|
+
return None
|
|
3774
|
+
if isinstance(position, str):
|
|
3775
|
+
return to_legacy_locator(position, file_type="docx")
|
|
3776
|
+
if isinstance(position, dict):
|
|
3777
|
+
for key in ("after", "after_locator"):
|
|
3778
|
+
after_locator = position.get(key)
|
|
3779
|
+
if after_locator is not None:
|
|
3780
|
+
return to_legacy_locator(str(after_locator), file_type="docx")
|
|
3781
|
+
raise InvalidArgumentsError("DOCX create_object position must be an after locator.")
|
|
3782
|
+
|
|
3783
|
+
|
|
3784
|
+
def _required_position(position: object | None) -> int:
|
|
3785
|
+
if isinstance(position, int):
|
|
3786
|
+
return position
|
|
3787
|
+
if isinstance(position, dict):
|
|
3788
|
+
for key in ("index", "position"):
|
|
3789
|
+
value = position.get(key)
|
|
3790
|
+
if isinstance(value, int):
|
|
3791
|
+
return value
|
|
3792
|
+
raise InvalidArgumentsError("Move/copy operations require an integer position.")
|
|
3793
|
+
|
|
3794
|
+
|
|
3795
|
+
def _pptx_slide_number(locator: str) -> int:
    """Return the slide number encoded in a PPTX slide locator.

    The locator is first canonicalised with ``to_v2_locator`` and must then
    have exactly the form ``pptx:slide:<n>``.

    Raises:
        InvalidArgumentsError: when the canonical locator has another shape.
        ValueError: when ``<n>`` is not accepted by ``int()``.
    """
    canonical = to_v2_locator(locator, file_type="pptx")
    pieces = canonical.split(":")
    is_slide = len(pieces) == 3 and pieces[0] == "pptx" and pieces[1] == "slide"
    if not is_slide:
        raise InvalidArgumentsError(f"Unsupported PPTX slide locator: {locator}")
    return int(pieces[2])
|
|
3801
|
+
|
|
3802
|
+
|
|
3803
|
+
def _move_pptx_slide(
    document_path: Path,
    slide_number: int,
    new_position: int,
    output_path: Path,
) -> None:
    """Move slide ``slide_number`` to ``new_position`` and save to ``output_path``.

    Both numbers are 1-based and must lie within the current slide count.

    Raises:
        TargetNotFoundError: when ``slide_number`` is out of range.
        InvalidArgumentsError: when ``new_position`` is out of range.
    """
    presentation = pptx_adapter._open_presentation(document_path)
    if not 1 <= slide_number <= len(presentation.slides):
        raise TargetNotFoundError(
            f"Slide {slide_number} does not exist in the presentation."
        )
    # Target validation and the slide-id reordering are shared with the
    # copy path.
    _move_pptx_slide_in_memory(presentation, slide_number, new_position)
    presentation.save(output_path)
|
|
3823
|
+
|
|
3824
|
+
|
|
3825
|
+
def _copy_pptx_slide(
    document_path: Path,
    slide_number: int,
    position: object | None,
    output_path: Path,
) -> int:
    """Duplicate slide ``slide_number`` and save the deck to ``output_path``.

    A new slide is added with the source slide's layout, its auto-created
    shapes are removed, and deep copies of the source shapes are inserted.
    Relationships of the source slide part (other than the notes slide and
    slide layout) are re-created on the new part and the copied shapes are
    pointed at the new relationship ids.  Notes text is carried over when the
    source slide has notes.  The copy is finally moved to ``position``; with
    ``position`` of ``None`` it stays last.

    Returns:
        The 1-based position at which the copy ends up.

    Raises:
        InvalidArgumentsError: when ``position`` is not an integer position
            or lies outside the deck.
    """
    presentation = pptx_adapter._open_presentation(document_path)
    source_slide = pptx_adapter._resolve_slide(presentation, slide_number)
    new_slide = presentation.slides.add_slide(source_slide.slide_layout)

    # Drop whatever shapes the layout put on the new slide.
    for placeholder_shape in list(new_slide.shapes):
        placeholder_shape.element.getparent().remove(placeholder_shape.element)

    for shape in source_slide.shapes:
        new_slide.shapes._spTree.insert_element_before(
            deepcopy(shape.element), "p:extLst"
        )

    # Collect the complete old-rId -> new-rId mapping before touching any
    # attribute.  Rewriting one pair at a time can rewrite the same attribute
    # twice when a freshly assigned id equals a source id handled later.
    rid_map: dict[str, str] = {}
    for rel in source_slide.part.rels.values():
        if rel.reltype.endswith(("/notesSlide", "/slideLayout")):
            continue
        if rel.is_external:
            new_rid = new_slide.part.relate_to(
                rel.target_ref, rel.reltype, is_external=True
            )
        else:
            new_rid = new_slide.part.relate_to(rel.target_part, rel.reltype)
        if new_rid != rel.rId:
            rid_map[rel.rId] = new_rid

    if rid_map:
        for shape in new_slide.shapes:
            for element in shape.element.iter():
                for attr_name, attr_value in list(element.attrib.items()):
                    replacement = rid_map.get(attr_value)
                    if replacement is not None:
                        element.set(attr_name, replacement)

    # ``notes_slide`` creates a notes slide on first access, so test with
    # ``has_notes_slide`` to avoid adding notes to slides that have none.
    if getattr(source_slide, "has_notes_slide", False):
        source_notes = getattr(source_slide.notes_slide, "notes_text_frame", None)
        target_notes = getattr(new_slide.notes_slide, "notes_text_frame", None)
        if source_notes is not None and target_notes is not None:
            target_notes.text = source_notes.text

    copied_position = (
        len(presentation.slides) if position is None else _required_position(position)
    )
    # The copy is currently the last slide; move it to where it was asked for.
    _move_pptx_slide_in_memory(presentation, len(presentation.slides), copied_position)
    presentation.save(output_path)
    return copied_position
|
|
3866
|
+
|
|
3867
|
+
|
|
3868
|
+
def _move_pptx_slide_in_memory(
|
|
3869
|
+
presentation, slide_number: int, new_position: int
|
|
3870
|
+
) -> None:
|
|
3871
|
+
slide_count = len(presentation.slides)
|
|
3872
|
+
if new_position < 1 or new_position > slide_count:
|
|
3873
|
+
raise InvalidArgumentsError(f"Invalid target slide position: {new_position}")
|
|
3874
|
+
sld_id_list = presentation.slides._sldIdLst
|
|
3875
|
+
slide_id = sld_id_list.sldId_lst[slide_number - 1]
|
|
3876
|
+
sld_id_list.remove(slide_id)
|
|
3877
|
+
sld_id_list.insert(new_position - 1, slide_id)
|
|
3878
|
+
|
|
3879
|
+
|
|
3880
|
+
def _retarget_shape_relationships(slide, source_rid: str, target_rid: str) -> None:
|
|
3881
|
+
for shape in slide.shapes:
|
|
3882
|
+
for element in shape.element.iter():
|
|
3883
|
+
for attr_name, attr_value in list(element.attrib.items()):
|
|
3884
|
+
if attr_value == source_rid:
|
|
3885
|
+
element.set(attr_name, target_rid)
|
|
3886
|
+
|
|
3887
|
+
|
|
3888
|
+
def _delete_docx_object(
    document_path: Path,
    locator: str,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Delete a DOCX paragraph or table and save the document to ``output_path``.

    The locator is canonicalised with ``to_v2_locator``.  Locators starting
    with ``docx:para:`` remove that paragraph's XML element; locators
    starting with ``docx:table:`` and not containing ``:row:`` remove the
    table's XML element.

    Returns:
        ``(summary, metadata)`` where metadata echoes the original locator.

    Raises:
        InvalidArgumentsError: for every other locator.
    """
    canonical = to_v2_locator(locator, file_type="docx")
    is_paragraph = canonical.startswith("docx:para:")
    # NOTE(review): this prefix test also matches any longer table locator
    # that lacks ":row:" (e.g. a cell-style locator, if such a form exists
    # and is deletable) -- confirm the whole table is the intended target.
    is_table = canonical.startswith("docx:table:") and ":row:" not in canonical

    if is_paragraph:
        document = docx_adapter._open_document(document_path)
        paragraph = docx_adapter._resolve_paragraph(
            document, to_legacy_locator(canonical, file_type="docx")
        )
        node = paragraph._element
        node.getparent().remove(node)
        document.save(output_path)
        return (f"Deleted paragraph {locator}.", {"locator": locator})

    if is_table:
        document = docx_adapter._open_document(document_path)
        table_index = int(canonical.split(":")[2])
        # The table is reached by resolving its cell (0, 0).
        resolved = docx_adapter._resolve_locator(
            document, f"table:{table_index}:cell:0:0"
        )
        node = resolved.table._element
        node.getparent().remove(node)
        document.save(output_path)
        return (f"Deleted table {locator}.", {"locator": locator})

    raise InvalidArgumentsError(f"delete_object is not supported for {locator}.")
|
|
3913
|
+
|
|
3914
|
+
|
|
3915
|
+
def _delete_pptx_object(
    document_path: Path,
    locator: str,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Delete a PPTX slide or shape and save the deck to ``output_path``.

    The locator is canonicalised with ``to_v2_locator``.  A three-part
    ``pptx:slide:<n>`` locator removes that slide (its relationship is
    dropped and its id entry removed).  A five-part ``pptx:slide:...``
    locator removes the shape it resolves to.

    Returns:
        ``(summary, metadata)``; shape deletions also report ``slide_number``.

    Raises:
        TargetNotFoundError: when the slide number is outside the deck.
        InvalidArgumentsError: for every other locator shape.
    """
    canonical = to_v2_locator(locator, file_type="pptx")
    presentation = pptx_adapter._open_presentation(document_path)
    parts = canonical.split(":")

    if canonical.startswith("pptx:slide:") and len(parts) == 3:
        slide_number = int(parts[2])
        slide_id_list = presentation.slides._sldIdLst
        slide_ids = slide_id_list.sldId_lst
        # Without this check slide 0 would index [-1] and silently delete the
        # last slide, and a too-large number would raise a bare IndexError.
        if slide_number < 1 or slide_number > len(slide_ids):
            raise TargetNotFoundError(
                f"Slide {slide_number} does not exist in the presentation."
            )
        slide_id = slide_ids[slide_number - 1]
        presentation.part.drop_rel(slide_id.rId)
        slide_id_list.remove(slide_id)
        presentation.save(output_path)
        return (f"Deleted slide {locator}.", {"locator": locator})

    if len(parts) == 5 and parts[:2] == ["pptx", "slide"]:
        shape = pptx_adapter._resolve_shape(
            presentation, to_legacy_locator(canonical, file_type="pptx")
        )
        shape.element.getparent().remove(shape.element)
        presentation.save(output_path)
        return (
            f"Deleted {parts[3]} {locator}.",
            {"locator": locator, "slide_number": int(parts[2])},
        )

    raise InvalidArgumentsError(f"delete_object is not supported for {locator}.")
|
|
3944
|
+
|
|
3945
|
+
|
|
3946
|
+
def _delete_xlsx_object(
    document_path: Path,
    locator: str,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Delete or clear an XLSX object and save the workbook to ``output_path``.

    Handled locator shapes, after canonicalisation and parsing:

    * ``("xlsx", "sheet", name)``: remove the worksheet, unless it is the
      only one in the workbook.
    * ``("xlsx", "sheet", name, "row", n)``: delete row ``n``.
    * ``("xlsx", "sheet", name, "col", n)``: delete column ``n``.
    * ``("xlsx", "sheet", name, coordinate)``: set the cell value to ``None``,
      or every cell of the range when the coordinate contains ``":"``.

    Returns:
        ``(summary, metadata)`` where metadata echoes the original locator.

    Raises:
        TargetNotEditableError: for the workbook root or the last worksheet.
        InvalidArgumentsError: for every other locator shape.
    """
    canonical = to_v2_locator(locator, file_type="xlsx")
    workbook = xlsx_adapter._open_workbook(document_path)
    parts = parse_locator(canonical).components

    if parts == ("xlsx", "workbook"):
        raise TargetNotEditableError(f"{locator} does not support delete.")

    depth = len(parts)
    under_sheet = parts[:2] == ("xlsx", "sheet")

    if depth == 3 and under_sheet:
        worksheet = workbook[parts[2]]
        if len(workbook.worksheets) <= 1:
            raise TargetNotEditableError(f"{locator} does not support delete.")
        workbook.remove(worksheet)
        workbook.save(output_path)
        return (f"Deleted worksheet {locator}.", {"locator": locator})

    if depth == 5 and under_sheet and parts[3] == "row":
        workbook[parts[2]].delete_rows(int(parts[4]), 1)
        workbook.save(output_path)
        return (f"Deleted row {locator}.", {"locator": locator})

    if depth == 5 and under_sheet and parts[3] == "col":
        workbook[parts[2]].delete_cols(int(parts[4]), 1)
        workbook.save(output_path)
        return (f"Deleted column {locator}.", {"locator": locator})

    if depth == 4 and under_sheet:
        worksheet = workbook[parts[2]]
        coordinate = parts[3]
        if ":" in coordinate:
            # Range locators are cleared cell by cell.
            for cells_in_row in worksheet[coordinate]:
                for cell in cells_in_row:
                    cell.value = None
            cleared = "range"
        else:
            worksheet[coordinate].value = None
            cleared = "cell"
        workbook.save(output_path)
        return (f"Cleared {cleared} {locator}.", {"locator": locator})

    raise InvalidArgumentsError(f"delete_object is not supported for {locator}.")
|
|
3992
|
+
|
|
3993
|
+
|
|
3994
|
+
def _build_xlsx_row_embedding_records(
    document_id: str,
    row_embeddings: Sequence[XlsxRowEmbedding],
    blobs: Sequence[bytes],
) -> list[
    tuple[
        str,
        str,
        int,
        str,
        str,
        str,
        bytes,
        list[tuple[str, str, int, bool]],
    ]
]:
    """Pair each row embedding with its vector blob as a flat record tuple.

    Every record has the layout ``(embedding_id, sheet_name, row_number,
    representative_storage_id, text, preview, blob, contributing_cells)``.
    ``contributing_cells`` carries one ``(storage_id, coordinate, position,
    is_representative)`` tuple per contributing cell; positions count from 1
    and ``is_representative`` is true for the cell whose ``item_id`` equals
    the row's ``representative_item_id``.

    Raises:
        ValueError: If ``row_embeddings`` and ``blobs`` differ in length,
            because they are zipped with ``strict=True``.
    """

    def to_record(entry, vector_blob):
        # Identifier derivation is delegated to the store module.
        row_key = store.make_xlsx_row_embedding_id(
            document_id,
            entry.sheet_name,
            entry.row_number,
        )
        representative_key = store.make_storage_id(
            document_id,
            entry.representative_item_id,
        )
        cell_rows = []
        for position, member in enumerate(entry.contributing_cells, start=1):
            cell_rows.append(
                (
                    store.make_storage_id(document_id, member.item_id),
                    member.coordinate,
                    position,
                    member.item_id == entry.representative_item_id,
                )
            )
        return (
            row_key,
            entry.sheet_name,
            entry.row_number,
            representative_key,
            entry.text,
            entry.preview,
            vector_blob,
            cell_rows,
        )

    return [
        to_record(entry, vector_blob)
        for entry, vector_blob in zip(row_embeddings, blobs, strict=True)
    ]
|
|
4054
|
+
|
|
4055
|
+
|
|
4056
|
+
def _raise_if_pptx_target_not_editable(document_path: Path, item_id: str) -> None:
    """Probe a PPTX shape and surface only the "not editable" failure.

    The shape is resolved through ``pptx_adapter.resolve_shape``. A
    ``pptx_adapter.TargetNotEditableError`` propagates to the caller, while
    ``TargetNotFoundError`` and ``InvalidArgumentsError`` are swallowed so the
    function simply returns ``None``.
    """
    try:
        pptx_adapter.resolve_shape(document_path, item_id)
    except pptx_adapter.TargetNotEditableError:
        # Handled ahead of the tolerant clause below so this error always
        # reaches the caller, whatever the exception hierarchy looks like.
        raise
    except (TargetNotFoundError, InvalidArgumentsError):
        # Missing or malformed targets are not this probe's concern.
        pass
|
|
4063
|
+
|
|
4064
|
+
|
|
4065
|
+
def _check_import(module_name: str, label: str) -> DoctorCheck:
    """Report whether ``module_name`` can be imported.

    Args:
        module_name: Dotted module path handed to ``importlib.import_module``.
        label: Name shown for this check in the doctor report.

    Returns:
        A passing ``DoctorCheck`` when the import succeeds, otherwise a failing
        one whose message embeds the import error.
    """
    try:
        importlib.import_module(module_name)
    except ImportError as exc:
        # ImportError is the base class of ModuleNotFoundError. Catching it
        # also covers modules that are installed but fail to load (for example
        # a broken extension module), so the doctor reports the problem
        # instead of aborting with a traceback.
        return DoctorCheck(label, False, f"Import failed: {exc}")
    return DoctorCheck(label, True, "Import succeeded.")
|
|
4071
|
+
|
|
4072
|
+
|
|
4073
|
+
def _check_sqlite_module() -> DoctorCheck:
    """Verify that an in-memory SQLite database can be opened and closed."""
    passed, detail = True, "Connection succeeded."
    try:
        probe = sqlite3.connect(":memory:")
        probe.close()
    except sqlite3.Error as exc:
        passed, detail = False, f"Connection failed: {exc}"
    return DoctorCheck("SQLite", passed, detail)
|
|
4079
|
+
|
|
4080
|
+
|
|
4081
|
+
def _check_fts5_support() -> DoctorCheck:
    """Report whether ``store.supports_fts5`` accepts an in-memory database."""
    probe = sqlite3.connect(":memory:")
    try:
        fts5_available = store.supports_fts5(probe)
    finally:
        # The probe connection is closed whether or not the capability
        # query succeeds.
        probe.close()

    if not fts5_available:
        return DoctorCheck("SQLite FTS5", False, "FTS5 virtual tables are unavailable.")
    return DoctorCheck("SQLite FTS5", True, "FTS5 virtual tables are available.")
|
|
4091
|
+
|
|
4092
|
+
|
|
4093
|
+
def _check_embedding_provider_import() -> DoctorCheck:
    """Report whether the embedding provider adapter module imports cleanly.

    Any exception raised during the import is converted into a failing check.
    """
    check_name = "Embedding Provider"
    try:
        importlib.import_module("offagent.adapters.embedding_provider")
    except Exception as exc:
        return DoctorCheck(check_name, False, f"Import failed: {exc}")
    else:
        return DoctorCheck(check_name, True, "Import succeeded.")
|
|
4099
|
+
|
|
4100
|
+
|
|
4101
|
+
def _check_embedding_model(
    model_name: str,
    dimensions: int,
    *,
    provider_factory: Callable[[str, int | None], embedding_provider.EmbeddingProvider]
    | None = None,
) -> DoctorCheck:
    """Try to construct an embedding provider for the configured model.

    Args:
        model_name: Model identifier passed to the factory.
        dimensions: Embedding size passed to the factory.
        provider_factory: Optional callable used to build the provider. When
            it is not supplied, ``embedding_provider.LocalEmbeddingProvider``
            is instantiated instead.

    Returns:
        A failing ``DoctorCheck`` if building the provider raises, otherwise a
        passing one that echoes the provider's ``model_name`` and
        ``dimensions``.
    """

    def build_local_provider(selected_model, selected_dimensions):
        return embedding_provider.LocalEmbeddingProvider(
            model_name=selected_model,
            dimensions=selected_dimensions,
        )

    try:
        make_provider = provider_factory or build_local_provider
        provider = make_provider(model_name, dimensions)
    except Exception as exc:
        return DoctorCheck("Embedding Model", False, f"Model load failed: {exc}")

    summary = f"Loaded {provider.model_name} with {provider.dimensions} dimensions."
    return DoctorCheck("Embedding Model", True, summary)
|
|
4125
|
+
|
|
4126
|
+
|
|
4127
|
+
def _check_embedding_store(
    index_path: Path, model_name: str, dimensions: int
) -> DoctorCheck:
    """Check that the index store opens and its embedding metadata validates.

    The store is opened with ``store.ensure_ready``. When
    ``store.fetch_embedding_meta`` returns nothing, the check passes without
    further validation. Otherwise ``store.ensure_embedding_meta`` is called
    with the configured model name and dimensions, and any exception it raises
    becomes a failing check.
    """
    check_name = "Embedding Store"
    try:
        connection = store.ensure_ready(index_path)
    except (OSError, sqlite3.Error, store.StoreCapabilityError) as exc:
        return DoctorCheck(check_name, False, f"Store check failed: {exc}")

    try:
        has_meta = bool(store.fetch_embedding_meta(connection))
        if has_meta:
            store.ensure_embedding_meta(
                connection,
                model_name=model_name,
                dimensions=dimensions,
            )
    except Exception as exc:
        return DoctorCheck(check_name, False, f"Metadata check failed: {exc}")
    finally:
        # Runs on every path, including the early return above.
        connection.close()

    if has_meta:
        return DoctorCheck(
            check_name, True, "Embedding tables and metadata are consistent."
        )
    return DoctorCheck(check_name, True, "Embedding sidecar tables are ready.")
|
|
4153
|
+
|
|
4154
|
+
|
|
4155
|
+
def _check_index_path(index_path: Path) -> DoctorCheck:
    """Check that ``store.ensure_ready`` can open the index at ``index_path``.

    ``OSError``, ``sqlite3.Error`` and ``store.StoreCapabilityError`` are
    reported as a failing check; other exceptions propagate.
    """
    try:
        handle = store.ensure_ready(index_path)
    except (OSError, sqlite3.Error, store.StoreCapabilityError) as exc:
        return DoctorCheck("Index Path", False, f"Schema bootstrap failed: {exc}")

    # The connection was only needed to prove the store opens.
    handle.close()
    return DoctorCheck("Index Path", True, f"Schema ready at {index_path}.")
|
|
4163
|
+
|
|
4164
|
+
|
|
4165
|
+
def _check_document_roots(roots: Sequence[Path]) -> list[DoctorCheck]:
    """Produce one check per document root, in input order.

    A root passes only when it exists, is a directory, and is readable
    according to ``os.access``. An empty ``roots`` yields a single passing
    check stating that nothing is configured.
    """
    if not roots:
        return [DoctorCheck("Document Roots", True, "No document roots configured.")]

    def assess(candidate):
        # Guard clauses ordered from the most basic failure to the most specific.
        if not candidate.exists():
            return False, "Path does not exist."
        if not candidate.is_dir():
            return False, "Path is not a directory."
        if not os.access(candidate, os.R_OK):
            return False, "Directory is not readable."
        return True, "Readable directory."

    return [
        DoctorCheck(f"Document Root {candidate}", *assess(candidate))
        for candidate in roots
    ]
|
|
4190
|
+
|
|
4191
|
+
|
|
4192
|
+
def _check_allowed_roots(roots: Sequence[Path]) -> list[DoctorCheck]:
    """Check the allowed-root policy entries; read access is sufficient.

    With no roots configured, a single passing check is returned. Otherwise
    the roots are passed through ``normalize_roots`` and checked by
    ``_check_resolved_roots`` with ``require_writable=False``.
    """
    if roots:
        resolved = normalize_roots(roots)
        return _check_resolved_roots("Allowed Root", resolved, require_writable=False)
    return [DoctorCheck("Allowed Roots", True, "No allowed-root policy configured.")]
|
|
4200
|
+
|
|
4201
|
+
|
|
4202
|
+
def _check_output_roots(roots: Sequence[Path]) -> list[DoctorCheck]:
    """Check the output-root policy entries; write access is required.

    With no roots configured, a single passing check is returned. Otherwise
    the roots are passed through ``normalize_roots`` and checked by
    ``_check_resolved_roots`` with ``require_writable=True``.
    """
    if roots:
        resolved = normalize_roots(roots)
        return _check_resolved_roots("Output Root", resolved, require_writable=True)
    return [DoctorCheck("Output Roots", True, "No output-root policy configured.")]
|
|
4208
|
+
|
|
4209
|
+
|
|
4210
|
+
def _check_resolved_roots(
    label: str,
    roots: Sequence[Path],
    *,
    require_writable: bool,
) -> list[DoctorCheck]:
    """Check each policy root for usability and return one result per root.

    Args:
        label: Prefix for every check name, e.g. ``"Allowed Root"``.
        roots: Policy roots to inspect.
        require_writable: When true the root (or its nearest existing
            ancestor) must be writable; otherwise it must be readable.

    Returns:
        One ``DoctorCheck`` per root, in input order. An existing root passes
        when it is a directory with the required access. A missing root
        passes when its nearest existing ancestor is a directory with the
        required access.
    """
    access_mode = os.W_OK if require_writable else os.R_OK
    denied_message = (
        "Directory is not writable."
        if require_writable
        else "Directory is not readable."
    )

    checks: list[DoctorCheck] = []
    for root in roots:
        check_name = f"{label} {root}"

        if root.exists():
            if not root.is_dir():
                checks.append(
                    DoctorCheck(check_name, False, "Path is not a directory.")
                )
            elif not os.access(root, access_mode):
                checks.append(DoctorCheck(check_name, False, denied_message))
            else:
                checks.append(
                    DoctorCheck(check_name, True, "Policy root is usable.")
                )
            continue

        existing_parent = _nearest_existing_parent(root)
        # The ancestor must be a directory: a regular file in the root's
        # ancestry means the root can never exist as a directory, however
        # accessible that file is.
        if (
            existing_parent is not None
            and existing_parent.is_dir()
            and os.access(existing_parent, access_mode)
        ):
            checks.append(
                DoctorCheck(
                    check_name,
                    True,
                    f"Parent path {existing_parent} is accessible.",
                )
            )
            continue

        checks.append(
            DoctorCheck(
                check_name,
                False,
                "Path does not exist and no accessible parent directory was found.",
            )
        )
    return checks
|
|
4258
|
+
|
|
4259
|
+
|
|
4260
|
+
def _nearest_existing_parent(path: Path) -> Path | None:
|
|
4261
|
+
current = path
|
|
4262
|
+
while True:
|
|
4263
|
+
if current.exists():
|
|
4264
|
+
return current
|
|
4265
|
+
if current.parent == current:
|
|
4266
|
+
return None
|
|
4267
|
+
current = current.parent
|