offagent 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. offagent/__init__.py +3 -0
  2. offagent/__main__.py +5 -0
  3. offagent/adapters/__init__.py +1 -0
  4. offagent/adapters/docx_adapter.py +1237 -0
  5. offagent/adapters/embedding_provider.py +132 -0
  6. offagent/adapters/pptx_adapter.py +940 -0
  7. offagent/adapters/xlsx_adapter.py +1266 -0
  8. offagent/app/__init__.py +1 -0
  9. offagent/app/progress.py +52 -0
  10. offagent/app/services.py +4267 -0
  11. offagent/config.py +287 -0
  12. offagent/domain/__init__.py +1 -0
  13. offagent/domain/locators.py +444 -0
  14. offagent/domain/models.py +477 -0
  15. offagent/domain/text_fragments.py +136 -0
  16. offagent/errors.py +29 -0
  17. offagent/indexing/__init__.py +1 -0
  18. offagent/indexing/store.py +795 -0
  19. offagent/interfaces/__init__.py +1 -0
  20. offagent/interfaces/cli.py +438 -0
  21. offagent/interfaces/cli_output.py +139 -0
  22. offagent/interfaces/cli_progress.py +120 -0
  23. offagent/interfaces/mcp.py +1145 -0
  24. offagent/interfaces/mcp_converters.py +80 -0
  25. offagent/interfaces/mcp_models.py +923 -0
  26. offagent/objects/__init__.py +3 -0
  27. offagent/objects/base.py +26 -0
  28. offagent/objects/docx_objects.py +951 -0
  29. offagent/objects/pptx_objects.py +895 -0
  30. offagent/objects/xlsx_objects.py +962 -0
  31. offagent/path_policy.py +42 -0
  32. offagent/storage/__init__.py +1 -0
  33. offagent/storage/versioning.py +31 -0
  34. offagent-0.10.0.dist-info/METADATA +546 -0
  35. offagent-0.10.0.dist-info/RECORD +39 -0
  36. offagent-0.10.0.dist-info/WHEEL +5 -0
  37. offagent-0.10.0.dist-info/entry_points.txt +2 -0
  38. offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
  39. offagent-0.10.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,4267 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import importlib
5
+ import json
6
+ import logging
7
+ import os
8
+ from copy import deepcopy
9
+ import sqlite3
10
+ import struct
11
+ import shutil
12
+ import tempfile
13
+ import time
14
+ from dataclasses import dataclass, field, replace
15
+ from pathlib import Path
16
+ from typing import Any, Callable, Iterable, Literal, Sequence
17
+
18
+ from offagent.adapters import (
19
+ docx_adapter,
20
+ embedding_provider,
21
+ pptx_adapter,
22
+ xlsx_adapter,
23
+ )
24
+ from offagent.app.progress import NullProgressReporter, ProgressReporter
25
+ from offagent.config import AppConfig
26
+ from offagent.domain.locators import parse_locator, to_legacy_locator, to_v2_locator
27
+ from offagent.domain.models import (
28
+ BatchResult,
29
+ BlockStyle,
30
+ BlockBundle,
31
+ ChildSummary,
32
+ Capability,
33
+ DocxTableEntry,
34
+ DocxTablesResult,
35
+ DocumentBlocks,
36
+ DocumentRef,
37
+ DocumentStructure,
38
+ FileType,
39
+ InlineFragment,
40
+ IndexedItem,
41
+ InsertContentResult,
42
+ InlineStyle,
43
+ ItemRef,
44
+ NodePayload,
45
+ NodeWriteResult,
46
+ ObjectPayload,
47
+ MutationResult,
48
+ ParagraphCollection,
49
+ PresentationStructure,
50
+ SearchHit,
51
+ SearchMode,
52
+ SectionPayload,
53
+ SheetSnapshot,
54
+ SlideNotes,
55
+ StructureUnit,
56
+ StructureCollection,
57
+ StructuredTarget,
58
+ StructuredWriteResult,
59
+ TableCollection,
60
+ VisibleTextRange,
61
+ WorkbookStructure,
62
+ XlsxInsertRowsResult,
63
+ XlsxRowEmbedding,
64
+ )
65
+ from offagent.errors import (
66
+ InvalidArgumentsError,
67
+ NoEmbeddingsError,
68
+ PolicyRefusedError,
69
+ StaleLocatorError,
70
+ TargetNotEditableError,
71
+ )
72
+ from offagent.errors import TargetNotFoundError
73
+ from offagent.indexing import store
74
+ from offagent.objects import docx_objects, pptx_objects, xlsx_objects
75
+ from offagent.objects.docx_objects import DocxObjectResolver
76
+ from offagent.objects.pptx_objects import PptxObjectResolver
77
+ from offagent.objects.xlsx_objects import XlsxObjectResolver
78
+ from offagent.path_policy import (
79
+ canonicalize_existing_path,
80
+ canonicalize_output_path,
81
+ ensure_path_allowed,
82
+ normalize_roots,
83
+ )
84
+ from offagent.storage import versioning
85
+
86
# Module-level logger, named after this module.
LOGGER = logging.getLogger(__name__)

# File suffix -> file type for every document format the services handle.
SUPPORTED_EXTENSIONS: dict[str, FileType] = {
    ".docx": "docx",
    ".pptx": "pptx",
    ".xlsx": "xlsx",
}
# Suffixes eligible for indexing; currently the same mapping, kept as an
# independent dict object.
INDEXABLE_EXTENSIONS: dict[str, FileType] = dict(SUPPORTED_EXTENSIONS)

# (importable module name, human-readable package name) pairs.
REQUIRED_IMPORTS: tuple[tuple[str, str], ...] = (
    ("mcp", "MCP Python SDK"),
    ("typer", "Typer"),
    ("pydantic", "Pydantic"),
    ("dotenv", "python-dotenv"),
    ("docx", "python-docx"),
    ("pptx", "python-pptx"),
    ("openpyxl", "openpyxl"),
    ("rich", "Rich"),
)

# Accepted values for the ``output_mode`` argument of the write operations.
OutputMode = Literal["versioned", "inplace"]
111
+
112
# One shared resolver instance per file type, keyed by the file type string.
OBJECT_RESOLVERS = dict(
    docx=DocxObjectResolver(),
    pptx=PptxObjectResolver(),
    xlsx=XlsxObjectResolver(),
)
117
+
118
+
119
@dataclass(frozen=True)
class DoctorCheck:
    """Immutable record of a single named check and its outcome."""

    # Name identifying the check.
    name: str
    # Whether the check passed.
    ok: bool
    # Free-form detail text attached to the check.
    detail: str
124
+
125
+
126
@dataclass(frozen=True)
class DoctorReport:
    """Immutable collection of check results with an aggregate verdict."""

    # The individual checks that make up this report.
    checks: tuple[DoctorCheck, ...]

    @property
    def ok(self) -> bool:
        """Return True when no check failed (an empty report counts as ok)."""
        for entry in self.checks:
            if not entry.ok:
                return False
        return True
133
+
134
+
135
@dataclass(frozen=True)
class IndexSummary:
    """Immutable counters describing one indexing run."""

    # Number of files scanned.
    files_scanned: int
    # Number of files indexed.
    files_indexed: int
    # Number of files skipped.
    files_skipped: int
140
+
141
+
142
@dataclass(frozen=True)
class PatchResult:
    """Immutable description of a patch: source, destination, item and text."""

    # Path of the document the patch was applied to.
    document_path: Path
    # Path the patched document was written to.
    output_path: Path
    # Reference to the patched item.
    item: ItemRef
    # Text associated with the patch.
    text: str
148
+
149
+
150
+ @dataclass
151
+ class AppServices:
152
+ config: AppConfig
153
+ embedding_provider_factory: (
154
+ Callable[
155
+ [str, int | None],
156
+ embedding_provider.EmbeddingProvider,
157
+ ]
158
+ | None
159
+ ) = None
160
+ _embedding_provider: embedding_provider.EmbeddingProvider | None = field(
161
+ default=None,
162
+ init=False,
163
+ repr=False,
164
+ )
165
+
166
def discover_documents(self) -> list[DocumentRef]:
    """Discover documents under the configured roots, keeping allowed paths only.

    Delegates the scan to the module-level ``discover_documents`` function
    and drops every document whose path is rejected by
    ``_is_allowed_document_path``.
    """
    discovered = discover_documents(self.config.document_roots)
    permitted: list[DocumentRef] = []
    for candidate in discovered:
        if self._is_allowed_document_path(candidate.path):
            permitted.append(candidate)
    return permitted
173
+
174
def list_documents(self) -> list[DocumentRef]:
    """Return the indexed documents whose stored path is currently allowed."""
    db = store.ensure_ready(self.config.index_path)
    try:
        stored_rows = store.fetch_documents(db)
    finally:
        # Always release the index connection, even if the fetch fails.
        db.close()

    visible: list[DocumentRef] = []
    for stored in stored_rows:
        if self._is_allowed_document_path(Path(stored["path"])):
            visible.append(_document_ref_from_row(stored))
    return visible
185
+
186
def get_document(self, document_id: str) -> DocumentRef:
    """Look up an indexed document by id and check that reading it is allowed."""
    db = store.ensure_ready(self.config.index_path)
    try:
        row = self._resolve_document_by_id_row(db, document_id)
    finally:
        db.close()

    resolved = _document_ref_from_row(row)
    # Path policy is enforced after the lookup, on the stored path.
    self._ensure_allowed_document_path(resolved.path, action="read")
    return resolved
195
+
196
def show_document(self, document_path: Path) -> DocumentRef:
    """Return the index entry for the document at ``document_path``.

    The path is validated through ``_require_allowed_document_path`` (action
    ``"show"``) before the index row is resolved.
    """
    db = store.ensure_ready(self.config.index_path)
    try:
        allowed_path, _ = self._require_allowed_document_path(
            document_path, action="show"
        )
        row = self._resolve_document_row(db, allowed_path)
    finally:
        db.close()
    return _document_ref_from_row(row)
206
+
207
def show_item(self, document_path: Path, item_id: str) -> ItemRef:
    """Return the indexed item ``item_id`` of the document at ``document_path``."""
    db = store.ensure_ready(self.config.index_path)
    try:
        allowed_path, _ = self._require_allowed_document_path(
            document_path, action="show"
        )
        # The document row is returned alongside the item row but is not needed.
        _, item = self._resolve_item_row(db, allowed_path, item_id)
    finally:
        db.close()
    return _item_ref_from_row(item)
219
+
220
def resolve_document_path(self, document_id: str) -> Path:
    """Return the filesystem path of the document identified by ``document_id``."""
    resolved = self.get_document(document_id)
    return resolved.path
222
+
223
def get_document_structure(self, document_id: str) -> DocumentStructure:
    """Build a flat list of structure units for a document.

    docx documents yield one unit per block, pptx documents one ``"slide"``
    unit per slide, and anything else is read as a workbook with one
    ``"worksheet"`` unit per sheet.
    """
    document = self.get_document(document_id)
    collected: list[StructureUnit] = []

    if document.file_type == "docx":
        for block in docx_adapter.get_blocks(document.path):
            collected.append(
                StructureUnit(
                    position=block.block_index,
                    unit_type=block.block_type,
                    preview=block.preview,
                    metadata=block.metadata,
                )
            )
    elif document.file_type == "pptx":
        for slide in pptx_adapter.get_presentation_structure(document.path):
            collected.append(
                StructureUnit(
                    position=slide.slide_number,
                    unit_type="slide",
                    preview=slide.preview,
                    metadata=slide.metadata,
                )
            )
    else:
        workbook = xlsx_adapter.get_workbook_structure(document.path)
        for sheet in workbook.sheets:
            # The sheet's own metadata is merged after "sheet_name", so a
            # same-named key in it takes precedence.
            sheet_metadata = {"sheet_name": sheet.sheet_name, **sheet.metadata}
            collected.append(
                StructureUnit(
                    position=sheet.position,
                    unit_type="worksheet",
                    preview=sheet.preview,
                    metadata=sheet_metadata,
                )
            )

    return DocumentStructure(document=document, units=tuple(collected))
258
+
259
def get_structure(self, document_id: str) -> StructureCollection:
    """Resolve the section structure of a document via its format adapter."""
    document = self.get_document(document_id)
    # Any file type other than docx/pptx falls through to the xlsx adapter.
    adapter = {"docx": docx_adapter, "pptx": pptx_adapter}.get(
        document.file_type, xlsx_adapter
    )
    return StructureCollection(
        document=document,
        sections=adapter.resolve_structure(document.path),
    )
268
+
269
def get_section(
    self,
    document_id: str,
    section_id: str,
    *,
    cell_range: str | None = None,
) -> SectionPayload:
    """Read one section of a document and attach the document reference.

    ``cell_range`` is forwarded only on the xlsx path; the docx and pptx
    adapters are called without it.
    """
    document = self.get_document(document_id)
    kind = document.file_type
    if kind == "docx":
        section = docx_adapter.get_section(document.path, section_id)
    elif kind == "pptx":
        section = pptx_adapter.get_section(document.path, section_id)
    else:
        section = xlsx_adapter.get_section(
            document.path, section_id, cell_range=cell_range
        )
    return replace(section, document=document)
289
+
290
def get_node(self, document_id: str, node_id: str) -> NodePayload:
    """Read a single node through the adapter matching the document type."""
    document = self.get_document(document_id)
    # Any file type other than docx/pptx falls through to the xlsx adapter.
    adapter = {"docx": docx_adapter, "pptx": pptx_adapter}.get(
        document.file_type, xlsx_adapter
    )
    node_type, node_text, node_metadata = adapter.read_node(document.path, node_id)
    return NodePayload(
        document_id=document.document_id,
        node_id=node_id,
        item_type=node_type,
        text=node_text,
        metadata=node_metadata,
    )
305
+
306
def get_object(self, document_id: str, locator: str) -> ObjectPayload:
    """Resolve ``locator`` inside a document and return its payload.

    Raises:
        StaleLocatorError: if resolution fails and the file's current content
            hash differs from the hash recorded on the indexed document.
        InvalidArgumentsError, TargetNotFoundError: re-raised unchanged when
            the hashes match.
    """
    document = self.get_document(document_id)
    # Hash taken before resolving, for comparison against the indexed hash.
    current_hash = _content_hash(document.path)
    try:
        resolved = _object_resolver(document.file_type).get_object(
            document.path, locator
        )
    except (InvalidArgumentsError, TargetNotFoundError) as exc:
        if current_hash == document.content_hash:
            raise
        raise StaleLocatorError(
            f"stale locator: {locator} is no longer valid for {document.path}"
        ) from exc
    return replace(resolved, document=document)
319
+
320
def list_children(
    self,
    document_id: str,
    locator: str,
    *,
    child_type: str | None = None,
    limit: int | None = None,
) -> list[ChildSummary]:
    """List the children of the object at ``locator``.

    ``child_type`` and ``limit`` are passed straight to the resolver.

    Raises:
        StaleLocatorError: if resolution fails and the file's current content
            hash differs from the hash recorded on the indexed document.
        InvalidArgumentsError, TargetNotFoundError: re-raised unchanged when
            the hashes match.
    """
    document = self.get_document(document_id)
    # Hash taken before resolving, for comparison against the indexed hash.
    current_hash = _content_hash(document.path)
    resolver = _object_resolver(document.file_type)
    try:
        children = resolver.list_children(
            document.path, locator, child_type=child_type, limit=limit
        )
    except (InvalidArgumentsError, TargetNotFoundError) as exc:
        if current_hash == document.content_hash:
            raise
        raise StaleLocatorError(
            f"stale locator: {locator} is no longer valid for {document.path}"
        ) from exc
    return children
344
+
345
def create_object(
    self,
    document_id: str,
    parent_locator: str,
    object_type: str,
    properties: dict[str, Any],
    position: object | None = None,
    segments: Sequence[InlineFragment] | Sequence[dict[str, Any]] | None = None,
    text_range: VisibleTextRange | dict[str, Any] | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Create a child object under ``parent_locator`` and re-index the output.

    The parent must expose ``Capability.ADD_CHILD``. After the write, the
    output file is indexed and the new object is read back to fill the result.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, parent_locator)
    parent_payload = self.get_object(document_id, parent_locator)
    _require_capability(
        parent_payload.capabilities, Capability.ADD_CHILD, parent_locator
    )
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    try:
        # Coercion happens inside the guarded region so its argument errors
        # go through the same stale-document check as the write itself.
        fragments = _coerce_inline_fragments(segments)
        visible_range = _coerce_visible_text_range(text_range)
        new_locator, summary, metadata = _create_object_on_path(
            source.path,
            source.file_type,
            parent_locator=parent_locator,
            object_type=object_type,
            properties=properties,
            position=position,
            segments=fragments,
            text_range=visible_range,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, parent_locator, exc)
        raise

    indexed = self.index_document(destination)
    created = self.get_object(indexed.document_id, new_locator)
    return MutationResult(
        document_path=source.path,
        output_path=destination,
        document_id=indexed.document_id,
        locator=created.locator,
        object_type=created.object_type,
        summary=summary,
        capabilities=created.capabilities,
        parent_locator=created.parent_locator,
        metadata=metadata,
    )
398
+
399
def update_object(
    self,
    document_id: str,
    locator: str,
    properties: dict[str, Any],
    segments: Sequence[InlineFragment] | Sequence[dict[str, Any]] | None = None,
    text_range: VisibleTextRange | dict[str, Any] | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Update the object at ``locator`` and re-index the output.

    The target must expose ``Capability.UPDATE``. After the write, the same
    locator is read back from the re-indexed output to fill the result.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, locator)
    existing = self.get_object(document_id, locator)
    _require_capability(existing.capabilities, Capability.UPDATE, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    try:
        # Coercion happens inside the guarded region so its argument errors
        # go through the same stale-document check as the write itself.
        fragments = _coerce_inline_fragments(segments)
        visible_range = _coerce_visible_text_range(text_range)
        summary, metadata = _update_object_on_path(
            source.path,
            source.file_type,
            locator=locator,
            properties=properties,
            segments=fragments,
            text_range=visible_range,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise

    indexed = self.index_document(destination)
    updated = self.get_object(indexed.document_id, locator)
    return MutationResult(
        document_path=source.path,
        output_path=destination,
        document_id=indexed.document_id,
        locator=updated.locator,
        object_type=updated.object_type,
        summary=summary,
        capabilities=updated.capabilities,
        parent_locator=updated.parent_locator,
        metadata=metadata,
    )
448
+
449
def move_object(
    self,
    document_id: str,
    locator: str,
    new_parent_locator: str,
    position: object | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Move the object at ``locator`` under ``new_parent_locator``.

    The target must expose ``Capability.MOVE``. The result describes the
    object as read back from the re-indexed output at its new locator.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, locator)
    existing = self.get_object(document_id, locator)
    _require_capability(existing.capabilities, Capability.MOVE, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    try:
        relocated_locator, summary, metadata = _move_object_on_path(
            source.path,
            source.file_type,
            locator=locator,
            new_parent_locator=new_parent_locator,
            position=position,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise

    indexed = self.index_document(destination)
    relocated = self.get_object(indexed.document_id, relocated_locator)
    return MutationResult(
        document_path=source.path,
        output_path=destination,
        document_id=indexed.document_id,
        locator=relocated.locator,
        object_type=relocated.object_type,
        summary=summary,
        capabilities=relocated.capabilities,
        parent_locator=relocated.parent_locator,
        metadata=metadata,
    )
496
+
497
def copy_object(
    self,
    document_id: str,
    locator: str,
    target_parent_locator: str,
    position: object | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Copy the object at ``locator`` under ``target_parent_locator``.

    The source object must expose ``Capability.COPY``. The result describes
    the copy as read back from the re-indexed output.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, locator)
    existing = self.get_object(document_id, locator)
    _require_capability(existing.capabilities, Capability.COPY, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    try:
        duplicate_locator, summary, metadata = _copy_object_on_path(
            source.path,
            source.file_type,
            locator=locator,
            target_parent_locator=target_parent_locator,
            position=position,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise

    indexed = self.index_document(destination)
    duplicate = self.get_object(indexed.document_id, duplicate_locator)
    return MutationResult(
        document_path=source.path,
        output_path=destination,
        document_id=indexed.document_id,
        locator=duplicate.locator,
        object_type=duplicate.object_type,
        summary=summary,
        capabilities=duplicate.capabilities,
        parent_locator=duplicate.parent_locator,
        metadata=metadata,
    )
544
+
545
def batch_edit(
    self,
    document_id: str,
    operations: list[dict[str, Any]],
    *,
    output_mode: OutputMode = "versioned",
    dry_run: bool = False,
) -> BatchResult:
    """Apply (or only validate) a sequence of operations against one document.

    With ``dry_run=True`` every operation is validated against the source
    document and nothing is written; the result has ``output_path=None``.

    Otherwise the source is copied to a scratch file, each operation is
    applied to that copy in order, and the copy is then moved over the
    resolved output path with ``os.replace``. If anything fails, the scratch
    file is removed and the exception propagates.

    Args:
        document_id: Id of the indexed document to edit.
        operations: Operation descriptors, applied in list order.
        output_mode: Passed to ``_resolve_write_output_path``.
        dry_run: Validate only; do not write.

    Returns:
        A ``BatchResult`` carrying the per-operation results.
    """
    document = self.get_document(document_id)
    self._ensure_object_locator_fresh(
        document, _primary_locator_for_batch(operations)
    )
    if dry_run:
        validated = tuple(
            _validate_batch_operation(document.path, document.file_type, operation)
            for operation in operations
        )
        return BatchResult(
            document_path=document.path,
            output_path=None,
            document_id=document.document_id,
            summary=f"Validated {len(validated)} operations.",
            dry_run=True,
            operations=validated,
        )

    output_path = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    temp_work_path = _make_batch_work_path(output_path, document.path.suffix)
    try:
        # The copy is inside the guarded region so that a copy failing
        # part-way does not leave the scratch file behind.
        shutil.copy2(document.path, temp_work_path)
        mutation_results = tuple(
            _apply_batch_operation(temp_work_path, document.file_type, operation)
            for operation in operations
        )
        os.replace(temp_work_path, output_path)
    except Exception:
        temp_work_path.unlink(missing_ok=True)
        raise

    output_document = self.index_document(output_path)
    return BatchResult(
        document_path=document.path,
        output_path=output_path,
        document_id=output_document.document_id,
        summary=f"Applied {len(mutation_results)} operations.",
        operations=mutation_results,
    )
594
+
595
def delete_object(
    self,
    document_id: str,
    locator: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Delete the object at ``locator`` and re-index the output.

    The target must expose ``Capability.DELETE``. The result has no locator
    and no capabilities; object type and parent locator come from the object
    as it was read before the deletion.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self.get_document(document_id)
    self._ensure_object_locator_fresh(source, locator)
    doomed = self.get_object(document_id, locator)
    _require_capability(doomed.capabilities, Capability.DELETE, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )

    try:
        summary, metadata = _delete_object_on_path(
            source.path,
            source.file_type,
            locator=locator,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise

    indexed = self.index_document(destination)
    return MutationResult(
        document_path=source.path,
        output_path=destination,
        document_id=indexed.document_id,
        locator=None,
        object_type=doomed.object_type,
        summary=summary,
        capabilities=(),
        parent_locator=doomed.parent_locator,
        metadata=metadata,
    )
637
+
638
def docx_set_paragraph_style(
    self,
    document_id: str,
    locator: str,
    style_name: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Apply the paragraph style ``style_name`` at ``locator`` in a docx file.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self._require_document_type(
        document_id, expected="docx", operation="docx_set_paragraph_style"
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    try:
        outcome = docx_objects.set_paragraph_style(
            source.path, locator, style_name, output_path=destination
        )
        result_locator, summary, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
672
+
673
def docx_insert_page_break(
    self,
    document_id: str,
    locator: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Insert a page break relative to ``locator`` in a docx file.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self._require_document_type(
        document_id, expected="docx", operation="docx_insert_page_break"
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    try:
        outcome = docx_objects.insert_page_break(
            source.path, locator, output_path=destination
        )
        result_locator, summary, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
705
+
706
def docx_add_table(
    self,
    document_id: str,
    row_count: int,
    column_count: int,
    *,
    position: object | None = None,
    column_widths: list[int] | None = None,
    style_name: str | None = None,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Add a ``row_count`` x ``column_count`` table to a docx file.

    ``position`` may be a locator string, or a dict carrying the anchor
    locator under ``"after"`` or ``"after_locator"``. When an anchor locator
    is found it is checked for freshness before the write, and used in the
    stale-document check on failure (falling back to ``"docx:document"``).

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self._require_document_type(
        document_id, expected="docx", operation="docx_add_table"
    )

    # Derive the anchor locator (if any) once; it is needed both for the
    # freshness check and for the stale check in the error path.
    anchor: str | None = None
    if isinstance(position, str):
        anchor = position
    elif isinstance(position, dict):
        candidate = position.get("after") or position.get("after_locator")
        if isinstance(candidate, str):
            anchor = candidate
    if anchor is not None:
        self._ensure_object_locator_fresh(source, anchor)

    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    try:
        result_locator, summary, metadata = docx_objects.add_table(
            source.path,
            row_count,
            column_count,
            position=position,
            column_widths=column_widths,
            style_name=style_name,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(
            source, anchor or "docx:document", exc
        )
        raise
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
761
+
762
def docx_merge_table_cells(
    self,
    document_id: str,
    start_locator: str,
    end_locator: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Merge the docx table cells spanning ``start_locator`` to ``end_locator``.

    Both locators are checked for freshness; on failure the stale-document
    check is reported against ``start_locator``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self._require_document_type(
        document_id, expected="docx", operation="docx_merge_table_cells"
    )
    for cell_locator in (start_locator, end_locator):
        self._ensure_object_locator_fresh(source, cell_locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    try:
        outcome = docx_objects.merge_table_cells(
            source.path, start_locator, end_locator, output_path=destination
        )
        result_locator, summary, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, start_locator, exc)
        raise
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
797
+
798
def pptx_add_slide(
    self,
    document_id: str,
    *,
    layout_index: int | None = None,
    layout_name: str | None = None,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Add a slide to a pptx file, forwarding the layout selectors as given."""
    source = self._require_document_type(
        document_id, expected="pptx", operation="pptx_add_slide"
    )
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    outcome = pptx_objects.add_slide(
        source.path,
        layout_index=layout_index,
        layout_name=layout_name,
        output_path=destination,
    )
    result_locator, summary, metadata = outcome
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
821
+
822
def pptx_duplicate_slide(
    self,
    document_id: str,
    locator: str,
    *,
    position: int | None = None,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Duplicate the slide at ``locator`` in a pptx file.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self._require_document_type(
        document_id, expected="pptx", operation="pptx_duplicate_slide"
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    try:
        outcome = pptx_objects.duplicate_slide(
            source.path, locator, position=position, output_path=destination
        )
        result_locator, summary, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
856
+
857
def pptx_set_slide_layout(
    self,
    document_id: str,
    locator: str,
    *,
    layout_index: int | None = None,
    layout_name: str | None = None,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Change the layout of the slide at ``locator`` in a pptx file.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self._require_document_type(
        document_id, expected="pptx", operation="pptx_set_slide_layout"
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    try:
        outcome = pptx_objects.set_slide_layout(
            source.path,
            locator,
            layout_index=layout_index,
            layout_name=layout_name,
            output_path=destination,
        )
        result_locator, summary, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
893
+
894
def pptx_add_text_shape(
    self,
    document_id: str,
    locator: str,
    text: str,
    *,
    left: int,
    top: int,
    width: int,
    height: int,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Add a text shape containing ``text`` at ``locator`` in a pptx file.

    ``left``, ``top``, ``width`` and ``height`` are forwarded unchanged to
    ``pptx_objects.add_text_shape``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self._require_document_type(
        document_id, expected="pptx", operation="pptx_add_text_shape"
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    try:
        outcome = pptx_objects.add_text_shape(
            source.path,
            locator,
            text=text,
            left=left,
            top=top,
            width=width,
            height=height,
            output_path=destination,
        )
        result_locator, summary, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
936
+
937
def xlsx_write_range(
    self,
    document_id: str,
    locator: str,
    values: list[list[Any]],
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Write ``values`` to the range at ``locator`` in an xlsx file.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    source = self._require_document_type(
        document_id, expected="xlsx", operation="xlsx_write_range"
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    try:
        outcome = xlsx_objects.write_range(
            source.path, locator, values, output_path=destination
        )
        result_locator, summary, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
969
+
970
def xlsx_insert_rows_at(
    self,
    document_id: str,
    locator: str,
    row_number: int,
    count: int,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Insert ``count`` rows at ``row_number`` for ``locator`` in an xlsx file.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            re-raised from the write unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    # The operation label passed here is "xlsx_insert_rows", not the method name.
    source = self._require_document_type(
        document_id, expected="xlsx", operation="xlsx_insert_rows"
    )
    self._ensure_object_locator_fresh(source, locator)
    destination = self._resolve_write_output_path(
        source.path, output_mode=output_mode
    )
    try:
        outcome = xlsx_objects.insert_rows(
            source.path, locator, row_number, count, output_path=destination
        )
        result_locator, summary, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(source, locator, exc)
        raise
    return self._finalize_object_mutation(
        source, destination, result_locator, summary, metadata
    )
1004
+
1005
def xlsx_insert_columns(
    self,
    document_id: str,
    locator: str,
    column_index: int,
    count: int,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Forward a column-insertion request to ``xlsx_objects.insert_columns``.

    The document must resolve to an XLSX file and the locator is checked with
    ``_ensure_object_locator_fresh`` before the insertion is attempted.

    Args:
        document_id: Identifier of the document to modify.
        locator: Object locator forwarded to the inserter.
        column_index: Column position forwarded to the inserter.
        count: Number of columns forwarded to the inserter.
        output_mode: Forwarded to ``_resolve_write_output_path`` to pick the
            destination file.

    Returns:
        The result produced by ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the inserter unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    workbook = self._require_document_type(
        document_id,
        expected="xlsx",
        operation="xlsx_insert_columns",
    )
    self._ensure_object_locator_fresh(workbook, locator)
    destination = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    try:
        inserted_locator, summary, metadata = xlsx_objects.insert_columns(
            workbook.path,
            locator,
            column_index,
            count,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        # Staleness takes precedence over the adapter's own error.
        self._raise_stale_if_document_changed(workbook, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            workbook, destination, inserted_locator, summary, metadata
        )
1041
+
1042
def xlsx_set_formula(
    self,
    document_id: str,
    locator: str,
    formula: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Forward a formula assignment to ``xlsx_objects.set_formula``.

    The document must resolve to an XLSX file and the locator is checked with
    ``_ensure_object_locator_fresh`` before the formula is written.

    Args:
        document_id: Identifier of the document to modify.
        locator: Object locator forwarded to the adapter.
        formula: Formula text forwarded to the adapter.
        output_mode: Forwarded to ``_resolve_write_output_path`` to pick the
            destination file.

    Returns:
        The result produced by ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="xlsx_set_formula"
    )
    self._ensure_object_locator_fresh(workbook, locator)
    destination = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    try:
        formula_locator, summary, metadata = xlsx_objects.set_formula(
            workbook.path,
            locator,
            formula,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        # Staleness takes precedence over the adapter's own error.
        self._raise_stale_if_document_changed(workbook, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            workbook, destination, formula_locator, summary, metadata
        )
1074
+
1075
def xlsx_merge_cells(
    self,
    document_id: str,
    locator: str,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Forward a merge request for ``locator`` to ``xlsx_objects.merge_cells``.

    The document must resolve to an XLSX file and the locator is checked with
    ``_ensure_object_locator_fresh`` before the merge is attempted.

    Args:
        document_id: Identifier of the document to modify.
        locator: Object locator forwarded to the adapter.
        output_mode: Forwarded to ``_resolve_write_output_path`` to pick the
            destination file.

    Returns:
        The result produced by ``_finalize_object_mutation``.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="xlsx_merge_cells"
    )
    self._ensure_object_locator_fresh(workbook, locator)
    destination = self._resolve_write_output_path(
        workbook.path, output_mode=output_mode
    )
    try:
        merged_locator, summary, metadata = xlsx_objects.merge_cells(
            workbook.path,
            locator,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        # Staleness takes precedence over the adapter's own error.
        self._raise_stale_if_document_changed(workbook, locator, exc)
        raise
    else:
        return self._finalize_object_mutation(
            workbook, destination, merged_locator, summary, metadata
        )
1105
+
1106
def create_document(
    self,
    format: str,
    output_path: Path,
    *,
    output_mode: OutputMode = "versioned",
    initial_sheet_name: str | None = None,
) -> MutationResult:
    """Create a new DOCX, PPTX, or XLSX document, index it, and describe its root.

    Args:
        format: Document format; compared after ``str()``, strip and lowercase.
        output_path: Requested destination; its suffix must match the format.
        output_mode: Forwarded to ``_resolve_create_output_path``.
        initial_sheet_name: Forwarded to ``xlsx_adapter.create_xlsx`` for XLSX
            documents; recorded in the result metadata whenever it is given.

    Returns:
        A ``MutationResult`` describing the root object of the new document.

    Raises:
        InvalidArgumentsError: If the format is unsupported or the output
            path does not use the matching extension.
    """
    # Root-object locator for each supported format.
    root_locators = {
        "docx": "docx:document",
        "pptx": "pptx:presentation",
        "xlsx": "xlsx:workbook",
    }
    kind = str(format).strip().lower()
    if kind not in root_locators:
        raise InvalidArgumentsError(f"Unsupported document format: {format}")

    requested = canonicalize_output_path(output_path)
    if requested.suffix.lower() != f".{kind}":
        raise InvalidArgumentsError(
            f"create_document output path must use the .{kind} extension."
        )

    target = self._resolve_create_output_path(requested, output_mode=output_mode)
    if kind == "docx":
        docx_adapter.create_docx(target)
    elif kind == "pptx":
        pptx_adapter.create_pptx(target)
    else:
        xlsx_adapter.create_xlsx(target, initial_sheet_name=initial_sheet_name)

    created = self.index_document(target)
    root = self.get_object(created.document_id, root_locators[kind])

    details: dict[str, Any] = {"format": kind}
    if initial_sheet_name is not None:
        details["initial_sheet_name"] = initial_sheet_name

    return MutationResult(
        document_path=target,
        output_path=target,
        document_id=created.document_id,
        locator=root.locator,
        object_type=root.object_type,
        summary=f"Created {kind.upper()} document at {target}.",
        capabilities=root.capabilities,
        parent_locator=root.parent_locator,
        metadata=details,
    )
1153
+
1154
def add_content_block(
    self,
    document_id: str,
    block_type: str,
    properties: dict[str, Any],
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Add a content block to a document through ``_dispatch_add_content_block``.

    Args:
        document_id: Identifier of the document to modify.
        block_type: Block kind; stripped and lowercased before dispatch.
        properties: Block properties forwarded to the dispatcher.
        output_mode: Forwarded to ``_resolve_write_output_path`` to pick the
            destination file.

    Returns:
        A ``MutationResult`` describing the object at the returned locator in
        the re-indexed output document.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the dispatcher unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    document = self.get_document(document_id)
    kind = str(block_type).strip().lower()
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )

    try:
        locator = self._dispatch_add_content_block(
            document,
            kind,
            properties,
            destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        # Use the first string-valued locator-like property as the locator
        # to report; DOCX documents fall back to the document root.
        locator_keys = {"locator", "slide", "slide_locator", "sheet", "sheet_locator"}
        for key, value in properties.items():
            if key in locator_keys and isinstance(value, str):
                stale_locator = value
                break
        else:
            stale_locator = "docx:document" if document.file_type == "docx" else None
        self._raise_stale_if_document_changed(
            document, stale_locator or document.path.as_posix(), exc
        )
        raise

    indexed = self.index_document(destination)
    added = self.get_object(indexed.document_id, locator)
    return MutationResult(
        document_path=document.path,
        output_path=destination,
        document_id=indexed.document_id,
        locator=added.locator,
        object_type=added.object_type,
        summary=f"Added {kind} to {document.file_type.upper()} document.",
        capabilities=added.capabilities,
        parent_locator=added.parent_locator,
        metadata={"block_type": kind, **added.metadata},
    )
1208
+
1209
def style_inline(
    self,
    document_id: str,
    locator: str,
    style: InlineStyle,
    clear_fields: list[str] | tuple[str, ...] | None = None,
    text_range: VisibleTextRange | dict[str, Any] | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Apply an inline style at ``locator``, optionally limited to a text range.

    The adapter call is chosen by file type ("docx", "pptx", anything else is
    handled by the XLSX adapter) and by whether a text range was supplied.

    Args:
        document_id: Identifier of the document to modify.
        locator: Object locator of the styling target.
        style: Inline style forwarded to the adapter.
        clear_fields: Field names forwarded as a list; ``None`` becomes ``[]``.
        text_range: Optional range, normalised by ``_coerce_visible_text_range``.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        A ``MutationResult`` describing the styled object in the re-indexed
        output document.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    document = self.get_document(document_id)
    self._ensure_object_locator_fresh(document, locator)
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    fields_to_clear = list(clear_fields) if clear_fields is not None else []
    span = _coerce_visible_text_range(text_range)
    ranged = span is not None

    try:
        kind = document.file_type
        if kind == "docx":
            apply_style = (
                docx_adapter.style_paragraph_range if ranged else docx_adapter.style_run
            )
        elif kind == "pptx":
            apply_style = (
                pptx_adapter.style_paragraph_range if ranged else pptx_adapter.style_run
            )
        else:
            apply_style = (
                xlsx_adapter.style_cell_range
                if ranged
                else xlsx_adapter.style_cell_inline
            )

        # Range-aware adapters take the range right after the locator.
        if ranged:
            outcome = apply_style(
                document.path,
                locator,
                span,
                style,
                fields_to_clear,
                output_path=destination,
            )
        else:
            outcome = apply_style(
                document.path,
                locator,
                style,
                fields_to_clear,
                output_path=destination,
            )
        destination, result_locator, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(document, locator, exc)
        raise

    indexed = self.index_document(destination)
    styled = self.get_object(indexed.document_id, result_locator)
    return MutationResult(
        document_path=document.path,
        output_path=destination,
        document_id=indexed.document_id,
        locator=styled.locator,
        object_type=styled.object_type,
        summary=f"Applied inline style at {locator}.",
        capabilities=styled.capabilities,
        parent_locator=styled.parent_locator,
        metadata=metadata,
    )
1304
+
1305
def style_block(
    self,
    document_id: str,
    locator: str,
    style: BlockStyle,
    clear_fields: list[str] | tuple[str, ...] | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Apply a block-level style at ``locator``.

    DOCX and PPTX documents use their adapter's ``style_paragraph``; any other
    file type is handled by ``xlsx_adapter.style_cell_block``.

    Args:
        document_id: Identifier of the document to modify.
        locator: Object locator of the styling target.
        style: Block style forwarded to the adapter.
        clear_fields: Field names forwarded as a list; ``None`` becomes ``[]``.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        A ``MutationResult`` describing the styled object in the re-indexed
        output document.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    document = self.get_document(document_id)
    self._ensure_object_locator_fresh(document, locator)
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    fields_to_clear = list(clear_fields) if clear_fields is not None else []

    try:
        kind = document.file_type
        if kind == "docx":
            apply_style = docx_adapter.style_paragraph
        elif kind == "pptx":
            apply_style = pptx_adapter.style_paragraph
        else:
            apply_style = xlsx_adapter.style_cell_block
        destination, result_locator, metadata = apply_style(
            document.path,
            locator,
            style,
            fields_to_clear,
            output_path=destination,
        )
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(document, locator, exc)
        raise

    indexed = self.index_document(destination)
    styled = self.get_object(indexed.document_id, result_locator)
    return MutationResult(
        document_path=document.path,
        output_path=destination,
        document_id=indexed.document_id,
        locator=styled.locator,
        object_type=styled.object_type,
        summary=f"Applied block style at {locator}.",
        capabilities=styled.capabilities,
        parent_locator=styled.parent_locator,
        metadata=metadata,
    )
1367
+
1368
def set_structural_role(
    self,
    document_id: str,
    locator: str,
    role: str,
    level: int | None = None,
    *,
    output_mode: OutputMode = "versioned",
) -> MutationResult:
    """Assign a structural role to the DOCX object at ``locator``.

    Args:
        document_id: Identifier of the document; it must be a DOCX file.
        locator: Object locator forwarded to the adapter.
        role: Role name forwarded to ``docx_adapter.set_structural_role``.
        level: Optional level forwarded to the adapter.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        A ``MutationResult`` describing the updated object in the re-indexed
        output document.

    Raises:
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised from the adapter unless
            ``_raise_stale_if_document_changed`` raises first.
    """
    document = self._require_document_type(
        document_id,
        expected="docx",
        operation="set_structural_role",
    )
    self._ensure_object_locator_fresh(document, locator)
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    try:
        outcome = docx_adapter.set_structural_role(
            document.path,
            locator,
            role,
            level,
            output_path=destination,
        )
        destination, result_locator, metadata = outcome
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        self._raise_stale_if_document_changed(document, locator, exc)
        raise

    indexed = self.index_document(destination)
    updated = self.get_object(indexed.document_id, result_locator)
    return MutationResult(
        document_path=document.path,
        output_path=destination,
        document_id=indexed.document_id,
        locator=updated.locator,
        object_type=updated.object_type,
        summary=f"Applied structural role {role!r} at {locator}.",
        capabilities=updated.capabilities,
        parent_locator=updated.parent_locator,
        metadata=metadata,
    )
1415
+
1416
def write_node(
    self,
    document_id: str,
    node_id: str,
    content: str,
    *,
    output_mode: OutputMode = "versioned",
) -> NodeWriteResult:
    """Replace the content of one node and report its old and new text.

    Args:
        document_id: Identifier of the document to modify.
        node_id: Node identifier forwarded to the adapter's ``write_node``.
        content: New content for the node.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        A ``NodeWriteResult`` holding the node text read before the write and
        the text read back from the re-indexed output document.

    Raises:
        StaleLocatorError: If the read or write fails with one of the handled
            errors and the hash from ``_content_hash`` taken on entry differs
            from ``document.content_hash``.
        InvalidArgumentsError, TargetNotFoundError, TargetNotEditableError:
            Re-raised when the two hashes match.
    """
    document = self.get_document(document_id)
    hash_on_entry = _content_hash(document.path)
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    try:
        # Read the old node first so its text is available for the result.
        before = self.get_node(document_id, node_id)
        if document.file_type == "docx":
            adapter = docx_adapter
        elif document.file_type == "pptx":
            adapter = pptx_adapter
        else:
            adapter = xlsx_adapter
        destination = adapter.write_node(document.path, node_id, content, destination)
    except (
        InvalidArgumentsError,
        TargetNotFoundError,
        TargetNotEditableError,
    ) as exc:
        if hash_on_entry == document.content_hash:
            raise
        raise StaleLocatorError(
            f"stale locator: {node_id} is no longer valid for {document.path}"
        ) from exc

    indexed = self.index_document(destination)
    text_after = self.get_node(indexed.document_id, node_id).text

    return NodeWriteResult(
        document_path=document.path,
        output_path=destination,
        document_id=indexed.document_id,
        node_id=node_id,
        new_text=text_after,
        previous_text=before.text,
    )
1465
+
1466
def insert_content(
    self,
    document_id: str,
    content: str,
    *,
    style_name: str | None = None,
    after_node_id: str | None = None,
    output_mode: OutputMode = "versioned",
) -> InsertContentResult:
    """Insert a paragraph into a DOCX document and re-index the output.

    Args:
        document_id: Identifier of the document; it must be a DOCX file.
        content: Paragraph content forwarded to ``docx_adapter.insert_paragraph``.
        style_name: Optional style name forwarded to the adapter.
        after_node_id: Optional locator forwarded as ``after_locator``.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        An ``InsertContentResult`` whose preview is the first 120 characters
        of the new node's text.
    """
    document = self._require_document_type(
        document_id, expected="docx", operation="insert_content"
    )
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    destination, inserted_id = docx_adapter.insert_paragraph(
        document.path,
        content,
        style_name=style_name,
        after_locator=after_node_id,
        output_path=destination,
    )
    indexed = self.index_document(destination)
    inserted = self.get_node(indexed.document_id, inserted_id)
    return InsertContentResult(
        document_path=document.path,
        output_path=destination,
        document_id=indexed.document_id,
        new_node_id=inserted_id,
        preview=inserted.text[:120],
    )
1497
+
1498
def xlsx_insert_rows(
    self,
    document_id: str,
    sheet_name: str,
    *,
    rows: list[list[str]] | None = None,
    records: list[dict[str, str]] | None = None,
    output_mode: OutputMode = "versioned",
) -> XlsxInsertRowsResult:
    """Write rows or records to a worksheet through ``xlsx_adapter.write_table``.

    ``rows`` takes precedence: ``records`` is only used when ``rows`` is
    ``None``.

    Args:
        document_id: Identifier of the document; it must be an XLSX file.
        sheet_name: Worksheet name forwarded to the adapter.
        rows: Rows given as lists of values.
        records: Rows given as dictionaries.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        An ``XlsxInsertRowsResult`` with the number of rows supplied and a
        locator built for column ``A`` of the first written row.

    Raises:
        InvalidArgumentsError: If neither ``rows`` nor ``records`` is given.
    """
    document = self._require_document_type(
        document_id,
        expected="xlsx",
        operation="xlsx_insert_rows",
    )
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )

    if rows is not None:
        source_key, payload = "rows", rows
    elif records is not None:
        source_key, payload = "records", records
    else:
        raise InvalidArgumentsError(
            "xlsx_insert_rows requires either rows or records."
        )

    # Only the chosen keyword (rows= or records=) is passed to the adapter.
    destination, start_row, _ = xlsx_adapter.write_table(
        document.path,
        sheet_name,
        **{source_key: payload},
        output_path=destination,
    )
    rows_inserted = len(payload)

    indexed = self.index_document(destination)
    first_row_locator = xlsx_adapter.make_item_id(sheet_name, f"A{start_row}")
    return XlsxInsertRowsResult(
        document_path=document.path,
        output_path=destination,
        document_id=indexed.document_id,
        rows_inserted=rows_inserted,
        first_row_locator=first_row_locator,
    )
1544
+
1545
def docx_get_tables(self, document_id: str) -> DocxTablesResult:
    """List the tables of a DOCX document as ``DocxTableEntry`` records.

    Each entry's locator is built with ``make_table_cell_locator`` for cell
    (0, 0) of the table, and its metadata is the table's own metadata plus a
    ``block_index`` key.

    Args:
        document_id: Identifier of the document; it must be a DOCX file.

    Returns:
        A ``DocxTablesResult`` with the document and a tuple of entries.
    """
    document = self._require_document_type(
        document_id, expected="docx", operation="docx_get_tables"
    )
    entries = []
    for table in docx_adapter.get_tables(document.path):
        entries.append(
            DocxTableEntry(
                locator=docx_adapter.make_table_cell_locator(table.table_index, 0, 0),
                table_index=table.table_index,
                rows=table.rows,
                preview=table.preview,
                metadata={
                    "block_index": table.block_index,
                    **table.metadata,
                },
            )
        )
    return DocxTablesResult(document=document, tables=tuple(entries))
1563
+
1564
def get_presentation_structure(self, document_id: str) -> PresentationStructure:
    """Return the slide structure of a PPTX document.

    Args:
        document_id: Identifier of the document; it must be a PPTX file.

    Returns:
        A ``PresentationStructure`` pairing the document with the value from
        ``pptx_adapter.get_presentation_structure``.
    """
    presentation = self._require_document_type(
        document_id, expected="pptx", operation="get_presentation_structure"
    )
    return PresentationStructure(
        document=presentation,
        slides=pptx_adapter.get_presentation_structure(presentation.path),
    )
1570
+
1571
def get_slide_bundle(self, document_id: str, slide_number: int):
    """Return the adapter's slide bundle with ``document`` set to the indexed one.

    Args:
        document_id: Identifier of the document; it must be a PPTX file.
        slide_number: Slide number forwarded to ``pptx_adapter.get_slide_bundle``.

    Returns:
        A copy of the adapter's bundle, made with ``replace``, whose
        ``document`` field is the resolved document.
    """
    presentation = self._require_document_type(
        document_id, expected="pptx", operation="get_slide_bundle"
    )
    bundle = pptx_adapter.get_slide_bundle(presentation.path, slide_number)
    return replace(bundle, document=presentation)
1579
+
1580
def get_slide_notes(self, document_id: str, slide_number: int) -> SlideNotes:
    """Return the notes text of one slide of a PPTX document.

    Args:
        document_id: Identifier of the document; it must be a PPTX file.
        slide_number: Slide number forwarded to ``pptx_adapter.get_slide_notes``.

    Returns:
        A ``SlideNotes`` carrying the document id, the slide number, and the
        notes text returned by the adapter.
    """
    presentation = self._require_document_type(
        document_id, expected="pptx", operation="get_slide_notes"
    )
    notes = pptx_adapter.get_slide_notes(presentation.path, slide_number)
    return SlideNotes(
        document_id=presentation.document_id,
        slide_number=slide_number,
        notes_text=notes,
    )
1589
+
1590
def get_workbook_structure(self, document_id: str) -> WorkbookStructure:
    """Return the adapter's workbook structure with ``document`` set.

    Args:
        document_id: Identifier of the document; it must be an XLSX file.

    Returns:
        A copy of the value from ``xlsx_adapter.get_workbook_structure``,
        made with ``replace``, whose ``document`` field is the resolved
        document.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="get_workbook_structure"
    )
    structure = xlsx_adapter.get_workbook_structure(workbook.path)
    return replace(structure, document=workbook)
1597
+
1598
def get_sheet_snapshot(
    self,
    document_id: str,
    sheet_name: str,
    *,
    cell_range: str | None = None,
    start_cell: str | None = None,
    row_count: int | None = None,
    column_count: int | None = None,
) -> SheetSnapshot:
    """Return the adapter's snapshot of one worksheet with ``document`` set.

    Args:
        document_id: Identifier of the document; it must be an XLSX file.
        sheet_name: Worksheet name forwarded to the adapter.
        cell_range: Optional range forwarded to the adapter.
        start_cell: Optional start cell forwarded to the adapter.
        row_count: Optional row count forwarded to the adapter.
        column_count: Optional column count forwarded to the adapter.

    Returns:
        A copy of the value from ``xlsx_adapter.get_sheet_snapshot``, made
        with ``replace``, whose ``document`` field is the resolved document.
    """
    workbook = self._require_document_type(
        document_id, expected="xlsx", operation="get_sheet_snapshot"
    )
    snapshot = xlsx_adapter.get_sheet_snapshot(
        workbook.path,
        sheet_name,
        cell_range=cell_range,
        start_cell=start_cell,
        row_count=row_count,
        column_count=column_count,
    )
    return replace(snapshot, document=workbook)
1622
+
1623
def get_document_blocks(self, document_id: str) -> DocumentBlocks:
    """Return the blocks of a DOCX document.

    Args:
        document_id: Identifier of the document; it must be a DOCX file.

    Returns:
        A ``DocumentBlocks`` pairing the document with the value from
        ``docx_adapter.get_blocks``.
    """
    document = self._require_document_type(
        document_id, expected="docx", operation="get_document_blocks"
    )
    blocks = docx_adapter.get_blocks(document.path)
    return DocumentBlocks(document=document, blocks=blocks)
1630
+
1631
def get_paragraphs(self, document_id: str) -> ParagraphCollection:
    """Return the paragraphs of a DOCX document.

    Args:
        document_id: Identifier of the document; it must be a DOCX file.

    Returns:
        A ``ParagraphCollection`` pairing the document with the value from
        ``docx_adapter.get_paragraphs``.
    """
    document = self._require_document_type(
        document_id, expected="docx", operation="get_paragraphs"
    )
    paragraphs = docx_adapter.get_paragraphs(document.path)
    return ParagraphCollection(document=document, paragraphs=paragraphs)
1638
+
1639
def get_tables(self, document_id: str) -> TableCollection:
    """Return the tables of a DOCX document.

    Args:
        document_id: Identifier of the document; it must be a DOCX file.

    Returns:
        A ``TableCollection`` pairing the document with the value from
        ``docx_adapter.get_tables``.
    """
    document = self._require_document_type(
        document_id, expected="docx", operation="get_tables"
    )
    tables = docx_adapter.get_tables(document.path)
    return TableCollection(document=document, tables=tables)
1646
+
1647
def get_block_bundle(self, document_id: str, block_index: int) -> BlockBundle:
    """Return the adapter's bundle for one DOCX block with ``document`` set.

    Args:
        document_id: Identifier of the document; it must be a DOCX file.
        block_index: Block index forwarded to ``docx_adapter.get_block_bundle``.

    Returns:
        A copy of the adapter's bundle, made with ``replace``, whose
        ``document`` field is the resolved document.
    """
    document = self._require_document_type(
        document_id, expected="docx", operation="get_block_bundle"
    )
    bundle = docx_adapter.get_block_bundle(document.path, block_index)
    return replace(bundle, document=document)
1654
+
1655
def append_row(
    self,
    document_id: str,
    sheet_name: str,
    *,
    values: list[str] | None = None,
    record: dict[str, str] | None = None,
    output_mode: OutputMode = "versioned",
) -> StructuredWriteResult:
    """Append one row to a worksheet through ``xlsx_adapter.append_row``.

    Args:
        document_id: Identifier of the document; it must be an XLSX file.
        sheet_name: Worksheet name forwarded to the adapter.
        values: Row given as a list of values, forwarded as-is.
        record: Row given as a dictionary, forwarded as-is.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        A ``StructuredWriteResult`` whose target identifies the appended row
        and lists the coordinates returned by the adapter.
    """
    document = self._require_document_type(
        document_id, expected="xlsx", operation="append_row"
    )
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    destination, row_number, coordinates = xlsx_adapter.append_row(
        document.path,
        sheet_name,
        values=values,
        record=record,
        output_path=destination,
    )
    row_target = StructuredTarget(
        target_type="worksheet_row",
        identifier=f"{sheet_name}!row:{row_number}",
        preview=", ".join(coordinates),
        metadata={
            "sheet_name": sheet_name,
            "row_number": row_number,
            "coordinates": list(coordinates),
        },
    )
    return StructuredWriteResult(
        document_path=document.path,
        output_path=destination,
        target=row_target,
        summary=f"Appended row {row_number} to worksheet {sheet_name}.",
    )
1692
+
1693
def write_table(
    self,
    document_id: str,
    sheet_name: str,
    *,
    rows: list[list[str]] | None = None,
    records: list[dict[str, str]] | None = None,
    column_mapping: dict[str, str] | None = None,
    output_mode: OutputMode = "versioned",
) -> StructuredWriteResult:
    """Write a table to a worksheet through ``xlsx_adapter.write_table``.

    Args:
        document_id: Identifier of the document; it must be an XLSX file.
        sheet_name: Worksheet name forwarded to the adapter.
        rows: Rows given as lists of values, forwarded as-is.
        records: Rows given as dictionaries, forwarded as-is.
        column_mapping: Optional mapping forwarded to the adapter.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        A ``StructuredWriteResult`` whose target covers the row span returned
        by the adapter; the row count is ``end_row - start_row + 1``.
    """
    document = self._require_document_type(
        document_id, expected="xlsx", operation="write_table"
    )
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    destination, start_row, end_row = xlsx_adapter.write_table(
        document.path,
        sheet_name,
        rows=rows,
        records=records,
        column_mapping=column_mapping,
        output_path=destination,
    )
    # The span is inclusive at both ends.
    row_count = end_row - start_row + 1
    range_target = StructuredTarget(
        target_type="worksheet_range",
        identifier=f"{sheet_name}!rows:{start_row}-{end_row}",
        preview=f"{sheet_name} rows {start_row}-{end_row}",
        metadata={
            "sheet_name": sheet_name,
            "start_row": start_row,
            "end_row": end_row,
            "row_count": row_count,
        },
    )
    return StructuredWriteResult(
        document_path=document.path,
        output_path=destination,
        target=range_target,
        summary=f"Wrote {row_count} rows to worksheet {sheet_name}.",
    )
1733
+
1734
def append_paragraph(
    self,
    document_id: str,
    text: str,
    *,
    style_name: str | None = None,
    output_mode: OutputMode = "versioned",
) -> StructuredWriteResult:
    """Append a paragraph block to a DOCX document.

    Args:
        document_id: Identifier of the document; it must be a DOCX file.
        text: Paragraph text forwarded to ``docx_adapter.append_paragraph_block``.
        style_name: Optional style name forwarded to the adapter.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        A ``StructuredWriteResult`` describing the new block, read back from
        the output file with ``docx_adapter.get_block_bundle``. The
        ``style_name`` metadata is ``None`` when the bundle has no paragraph.
    """
    document = self._require_document_type(
        document_id, expected="docx", operation="append_paragraph"
    )
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    destination, block_index = docx_adapter.append_paragraph_block(
        document.path,
        text,
        style_name=style_name,
        output_path=destination,
    )
    bundle = docx_adapter.get_block_bundle(destination, block_index)
    block_target = StructuredTarget(
        target_type="document_block",
        identifier=f"block:{block_index}",
        preview=bundle.block.preview,
        metadata={
            "block_index": block_index,
            "block_type": bundle.block.block_type,
            "style_name": (
                bundle.paragraph.style_name
                if bundle.paragraph is not None
                else None
            ),
        },
    )
    return StructuredWriteResult(
        document_path=document.path,
        output_path=destination,
        target=block_target,
        summary=f"Appended paragraph block {block_index}.",
    )
1772
+
1773
def replace_block(
    self,
    document_id: str,
    block_index: int,
    text: str,
    *,
    output_mode: OutputMode = "versioned",
) -> StructuredWriteResult:
    """Replace the text of one DOCX block.

    Args:
        document_id: Identifier of the document; it must be a DOCX file.
        block_index: Block index forwarded to ``docx_adapter.replace_block``.
        text: Replacement text forwarded to the adapter.
        output_mode: Forwarded to ``_resolve_write_output_path``.

    Returns:
        A ``StructuredWriteResult`` describing the block, read back from the
        output file with ``docx_adapter.get_block_bundle``.
    """
    document = self._require_document_type(
        document_id, expected="docx", operation="replace_block"
    )
    destination = self._resolve_write_output_path(
        document.path, output_mode=output_mode
    )
    destination = docx_adapter.replace_block(
        document.path,
        block_index,
        text,
        output_path=destination,
    )
    bundle = docx_adapter.get_block_bundle(destination, block_index)
    block_target = StructuredTarget(
        target_type="document_block",
        identifier=f"block:{block_index}",
        preview=bundle.block.preview,
        metadata={
            "block_index": block_index,
            "block_type": bundle.block.block_type,
        },
    )
    return StructuredWriteResult(
        document_path=document.path,
        output_path=destination,
        target=block_target,
        summary=f"Replaced block {block_index}.",
    )
1808
+
1809
def index_path(
    self,
    path: Path,
    *,
    with_embeddings: bool = False,
    reporter: ProgressReporter | None = None,
) -> IndexSummary:
    """Index every indexable candidate found for ``path``.

    Candidates come from ``_index_candidates``. Each one is checked with
    ``_ensure_allowed_document_path``; those whose suffix is not in
    ``INDEXABLE_EXTENSIONS`` are counted as skipped, the rest are passed to
    ``index_document``.

    Args:
        path: Path to index; canonicalised with ``canonicalize_existing_path``.
        with_embeddings: Forwarded to ``index_document``.
        reporter: Progress callbacks. ``None`` selects a
            ``NullProgressReporter``.

    Returns:
        An ``IndexSummary`` with scanned, indexed, and skipped file counts.
    """
    resolved_input = canonicalize_existing_path(path)
    self._ensure_allowed_document_path(resolved_input, action="index")
    candidates = _index_candidates(resolved_input)
    # Test against None, not truthiness: a reporter whose truth value is
    # false (for example one defining __len__ or __bool__) must still be used.
    active_reporter = NullProgressReporter() if reporter is None else reporter
    total = len(candidates)
    indexed = 0
    skipped = 0

    active_reporter.on_index_start(total)
    # Positions are 1-based and count skipped candidates too.
    for position, candidate in enumerate(candidates, start=1):
        self._ensure_allowed_document_path(candidate, action="index")
        if candidate.suffix.lower() not in INDEXABLE_EXTENSIONS:
            skipped += 1
            continue
        active_reporter.on_file_start(candidate, position, total)
        document_ref = self.index_document(
            candidate,
            with_embeddings=with_embeddings,
            reporter=active_reporter,
        )
        # item_count may be unset on the returned reference; report 0 then.
        active_reporter.on_file_done(
            candidate, items_indexed=document_ref.item_count or 0
        )
        indexed += 1

    active_reporter.on_index_done(files_indexed=indexed, files_skipped=skipped)
    return IndexSummary(
        files_scanned=total,
        files_indexed=indexed,
        files_skipped=skipped,
    )
1846
+
1847
def reindex_path(
    self,
    path: Path,
    *,
    with_embeddings: bool = False,
    reporter: ProgressReporter | None = None,
) -> IndexSummary:
    """Re-index ``path`` by delegating to ``index_path`` with the same options.

    Args:
        path: Path to index again.
        with_embeddings: Forwarded to ``index_path``.
        reporter: Forwarded to ``index_path``.

    Returns:
        The summary returned by ``index_path``.
    """
    summary = self.index_path(
        path,
        with_embeddings=with_embeddings,
        reporter=reporter,
    )
    return summary
1855
+
1856
def refresh_document(
    self,
    document_id: str,
    *,
    reporter: ProgressReporter | None = None,
) -> IndexSummary:
    """Re-index the file behind ``document_id``.

    Args:
        document_id: Identifier resolved with ``resolve_document_path``.
        reporter: Forwarded to ``reindex_path``.

    Returns:
        The summary returned by ``reindex_path``.
    """
    source_path = self.resolve_document_path(document_id)
    return self.reindex_path(source_path, reporter=reporter)
1865
+
1866
def index_document(
    self,
    document_path: Path,
    *,
    with_embeddings: bool = False,
    reporter: ProgressReporter | None = None,
) -> DocumentRef:
    """Index one document into the store and optionally embed its items.

    The document row, its items and (when requested) its embeddings are
    written on one connection that is committed on success, rolled back on
    any exception, and always closed.

    Args:
        document_path: Path of the document; validated with
            ``_require_allowed_document_path``.
        with_embeddings: When true and the document has items, embeddings are
            generated and stored. XLSX documents embed the texts built by
            ``xlsx_adapter.build_row_embeddings``; other types embed one text
            per item.
        reporter: Progress callbacks. ``None`` selects a
            ``NullProgressReporter``.

    Returns:
        The ``DocumentRef`` for the document, with ``item_count`` set to the
        number of extracted items.

    Raises:
        RuntimeError: If the embedding provider returns a different number of
            vectors than texts were submitted for.
    """
    # Test against None, not truthiness: a reporter whose truth value is
    # false (for example one defining __len__ or __bool__) must still be used.
    active_reporter = NullProgressReporter() if reporter is None else reporter
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="index"
    )
    document_ref = _build_document_ref(resolved_path, file_type)
    items = _extract_items(resolved_path, file_type)
    document_ref = replace(document_ref, item_count=len(items))

    connection = store.ensure_ready(self.config.index_path)
    try:
        store.upsert_document(connection, document_ref)
        # Existing embeddings are removed before the items are replaced.
        store.delete_document_embeddings(connection, document_ref.document_id)
        store.replace_document_items(connection, document_ref.document_id, items)
        if with_embeddings and items:
            provider = self._get_embedding_provider()
            store.ensure_embedding_meta(
                connection,
                model_name=provider.model_name,
                dimensions=provider.dimensions,
            )
            if file_type == "xlsx":
                row_embeddings = xlsx_adapter.build_row_embeddings(
                    items, resolved_path
                )
                embedding_texts = [
                    row_embedding.text for row_embedding in row_embeddings
                ]
            else:
                row_embeddings = []
                embedding_texts = [
                    _build_embedding_text(item, resolved_path, file_type=file_type)
                    for item in items
                ]
            LOGGER.info(
                "Embedding generation started for %s with %s items",
                resolved_path,
                len(embedding_texts),
            )
            if embedding_texts:
                active_reporter.on_embedding_start(
                    resolved_path, len(embedding_texts)
                )
                started_at = time.perf_counter()
                blobs = provider.embed_texts(
                    embedding_texts,
                    on_progress=active_reporter.on_embedding_item,
                )
                if file_type == "xlsx":
                    if len(blobs) != len(row_embeddings):
                        raise RuntimeError(
                            "Embedding provider returned an unexpected number of XLSX row vectors."
                        )
                    store.replace_xlsx_row_embeddings(
                        connection,
                        document_id=document_ref.document_id,
                        model_name=provider.model_name,
                        dimensions=provider.dimensions,
                        row_embeddings=_build_xlsx_row_embedding_records(
                            document_ref.document_id,
                            row_embeddings,
                            blobs,
                        ),
                    )
                else:
                    if len(blobs) != len(items):
                        raise RuntimeError(
                            "Embedding provider returned an unexpected number of vectors."
                        )
                    store.replace_document_embeddings(
                        connection,
                        document_id=document_ref.document_id,
                        model_name=provider.model_name,
                        dimensions=provider.dimensions,
                        embeddings=[
                            (
                                store.make_storage_id(
                                    document_ref.document_id, item.item_id
                                ),
                                blob,
                            )
                            for item, blob in zip(items, blobs, strict=True)
                        ],
                    )
                LOGGER.info(
                    "Embedding generation completed for %s with %s items in %.3fs",
                    resolved_path,
                    len(embedding_texts),
                    time.perf_counter() - started_at,
                )
        connection.commit()
    except Exception:
        # Undo the partial write, then let the original error propagate.
        connection.rollback()
        raise
    finally:
        connection.close()

    return document_ref
1970
+
1971
def search_corpus(
    self,
    query: str,
    *,
    file_type: str | None = None,
    document_path: Path | None = None,
    limit: int = 20,
    mode: SearchMode = "keyword",
) -> list[SearchHit]:
    """Search the index in keyword, semantic, or hybrid mode.

    Args:
        query: Text handed to the keyword and/or semantic search.
        file_type: Optional filter; must be ``None``, ``"docx"``,
            ``"pptx"`` or ``"xlsx"``.
        document_path: Optional single document to scope the search to.
            It is validated with the ``"search"`` action before use.
        limit: Maximum number of hits requested from the search backends.
        mode: Search mode, normalised through ``_normalize_search_mode``.

    Returns:
        The hits whose ``document_path`` is set and accepted by
        ``_is_allowed_document_path``.

    Raises:
        InvalidArgumentsError: If ``file_type`` is not an accepted value.
        NoEmbeddingsError: In semantic mode, when the store reports no
            item embeddings for the requested scope.
    """
    if file_type not in (None, "docx", "pptx", "xlsx"):
        raise InvalidArgumentsError(
            "Only DOCX, PPTX, and XLSX search are supported in this feature."
        )
    search_mode = _normalize_search_mode(mode)

    scoped_path = None
    if document_path is not None:
        scoped_path, _ = self._require_allowed_document_path(
            document_path, action="search"
        )

    # Both backends take the same corpus filter, so build it once.
    scope = {"file_type": file_type, "document_path": scoped_path}

    connection = store.ensure_ready(self.config.index_path)
    try:
        if search_mode == "keyword":
            keyword_rows = store.search_items(
                connection, query, limit=limit, **scope
            )
            hits = [_search_hit_from_keyword_row(row) for row in keyword_rows]
        elif search_mode == "semantic":
            if not store.has_item_embeddings(connection, **scope):
                raise NoEmbeddingsError(
                    "No embeddings are indexed for the requested corpus. Reindex with --with-embeddings first."
                )
            candidate_limit = max(limit, self.config.vector_search_top_k)
            ranked = self._semantic_search(
                connection, query, limit=candidate_limit, **scope
            )
            hits = ranked[:limit]
        else:
            # Hybrid: over-fetch from both backends, then merge down to limit.
            candidate_limit = max(limit, self.config.vector_search_top_k)
            keyword_rows = store.search_items(
                connection, query, limit=candidate_limit, **scope
            )
            semantic_hits = self._semantic_search(
                connection,
                query,
                limit=candidate_limit,
                require_embeddings=False,
                **scope,
            )
            hits = _merge_hybrid_hits(
                keyword_rows,
                semantic_hits,
                limit=limit,
                keyword_weight=self.config.hybrid_keyword_weight,
                semantic_weight=self.config.hybrid_semantic_weight,
            )
            LOGGER.info(
                "Hybrid merge completed for query=%r with %s keyword hits, %s semantic hits, %s merged hits",
                query,
                len(keyword_rows),
                len(semantic_hits),
                len(hits),
            )
    finally:
        connection.close()

    # Drop hits that point outside the readable roots (or have no path).
    visible_hits = []
    for hit in hits:
        if hit.document_path is None:
            continue
        if self._is_allowed_document_path(hit.document_path):
            visible_hits.append(hit)
    return visible_hits
2057
+
2058
def locate_paragraph(self, document_path: Path, paragraph_index: int) -> ItemRef:
    """Return the first item ``locate_items`` yields for a paragraph index."""
    located = self.locate_items(document_path, paragraph_index=paragraph_index)
    return located[0]
2060
+
2061
def locate_slide_shapes(
    self,
    document_path: Path,
    slide_number: int,
    shape_id: int | None = None,
) -> list[ItemRef]:
    """Delegate to ``locate_items`` with a slide number and optional shape id."""
    selectors = {"slide_number": slide_number, "shape_id": shape_id}
    return self.locate_items(document_path, **selectors)
2070
+
2071
def locate_cell(
    self, document_path: Path, sheet_name: str, cell_coordinate: str
) -> ItemRef:
    """Return the first item ``locate_items`` yields for a sheet/cell pair."""
    located = self.locate_items(
        document_path,
        sheet_name=sheet_name,
        cell_coordinate=cell_coordinate,
    )
    return located[0]
2077
+
2078
def locate_items(
    self,
    document_path: Path,
    *,
    paragraph_index: int | None = None,
    slide_number: int | None = None,
    shape_id: int | None = None,
    sheet_name: str | None = None,
    cell_coordinate: str | None = None,
) -> list[ItemRef]:
    """Resolve indexed items of a document from type-specific selectors.

    The accepted selectors depend on the file type returned by
    ``_require_allowed_document_path``:

    * ``docx``: ``paragraph_index`` only (required).
    * ``pptx``: ``slide_number`` (required) and optional ``shape_id``.
    * otherwise (the XLSX branch): ``sheet_name`` and ``cell_coordinate``
      (both required).

    Returns:
        A list of item references; a single element for DOCX and XLSX, and
        all matching shapes ordered by ``shape_index`` then item id for PPTX.

    Raises:
        InvalidArgumentsError: If the selectors do not fit the file type.
        TargetNotFoundError: If no indexed PPTX shape matches.
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="locate"
    )

    def _any_given(*selectors) -> bool:
        # True when at least one of the selectors was supplied.
        return any(selector is not None for selector in selectors)

    connection = store.ensure_ready(self.config.index_path)
    try:
        document_row = self._resolve_document_row(connection, resolved_path)

        if file_type == "docx":
            if paragraph_index is None or _any_given(
                slide_number, shape_id, sheet_name, cell_coordinate
            ):
                raise InvalidArgumentsError(
                    "DOCX locate requires --paragraph and does not support --slide."
                )
            paragraph_row = self._resolve_indexed_item_row(
                connection,
                document_row,
                f"para:{paragraph_index}",
                resolved_path,
            )
            return [_item_ref_from_row(paragraph_row)]

        if file_type == "pptx":
            if _any_given(paragraph_index, sheet_name, cell_coordinate):
                raise InvalidArgumentsError(
                    "PPTX locate supports --slide and optional --shape only."
                )
            if slide_number is None:
                raise InvalidArgumentsError("PPTX locate requires --slide.")

            shape_rows = []
            for row in store.fetch_items_for_document(
                connection, document_row["document_id"]
            ):
                if _metadata_value(row, "slide_number") != slide_number:
                    continue
                if (
                    shape_id is not None
                    and _metadata_value(row, "shape_id") != shape_id
                ):
                    continue
                shape_rows.append(row)

            if not shape_rows:
                if shape_id is None:
                    raise TargetNotFoundError(
                        f"No indexed PPTX text shapes found on slide {slide_number} for {resolved_path}"
                    )
                raise TargetNotFoundError(
                    f"No indexed PPTX text shape found for slide {slide_number} shape {shape_id} in {resolved_path}"
                )

            ordered_rows = sorted(
                shape_rows,
                key=lambda row: (
                    _metadata_value(row, "shape_index", default=0),
                    row["item_id"],
                ),
            )
            return [_item_ref_from_row(row) for row in ordered_rows]

        # Remaining file type: XLSX.
        if _any_given(paragraph_index, slide_number, shape_id):
            raise InvalidArgumentsError(
                "XLSX locate supports --sheet and --cell only."
            )
        if sheet_name is None or cell_coordinate is None:
            raise InvalidArgumentsError("XLSX locate requires --sheet and --cell.")

        cell_row = self._resolve_indexed_item_row(
            connection,
            document_row,
            xlsx_adapter.make_item_id(sheet_name, cell_coordinate),
            resolved_path,
        )
        return [_item_ref_from_row(cell_row)]
    finally:
        connection.close()
2172
+
2173
def read_item(self, document_path: Path, item_id: str) -> str:
    """Read the current text of an indexed item from the document file.

    The document and the item must both be present in the index; the lookup
    results themselves are discarded and the text is read through the adapter
    that matches the file type.
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="read"
    )

    connection = store.ensure_ready(self.config.index_path)
    try:
        # Existence check only: these calls raise when the target is unknown.
        indexed_document = self._resolve_document_row(connection, resolved_path)
        self._resolve_indexed_item_row(
            connection, indexed_document, item_id, resolved_path
        )
    finally:
        connection.close()

    readers = {
        "docx": docx_adapter.read_paragraph,
        "pptx": pptx_adapter.read_text_shape,
    }
    reader = readers.get(file_type, xlsx_adapter.read_cell)
    return reader(resolved_path, item_id)
2190
+
2191
def replace_item_text(
    self,
    document_path: Path,
    item_id: str,
    text: str,
    *,
    output_mode: OutputMode = "versioned",
) -> PatchResult:
    """Replace the text of a DOCX paragraph or PPTX text shape.

    The write target is validated with ``_prepare_write_target`` (the item
    must be indexed), the output file is re-indexed after the write, and the
    refreshed item row is returned inside the result.

    Raises:
        InvalidArgumentsError: If the document is an XLSX file.
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="write"
    )
    if file_type == "xlsx":
        raise InvalidArgumentsError(
            "XLSX replace is not supported; use write-cell."
        )

    self._prepare_write_target(
        resolved_path,
        file_type,
        item_id,
        require_indexed_item=True,
    )
    planned_output = self._resolve_write_output_path(
        resolved_path, output_mode=output_mode
    )

    if file_type == "docx":
        output_path = docx_adapter.replace_paragraph(
            resolved_path, item_id, text, planned_output
        )
        updated_text = docx_adapter.read_paragraph(output_path, item_id)
    else:
        output_path = pptx_adapter.replace_text_shape(
            resolved_path, item_id, text, planned_output
        )
        updated_text = pptx_adapter.read_text_shape(output_path, item_id)

    # Refresh the index so the lookup below sees the written file.
    self.index_document(output_path)

    indexed_output = output_path.resolve()
    connection = store.ensure_ready(self.config.index_path)
    try:
        output_row = self._resolve_document_row(connection, indexed_output)
        refreshed_item = self._resolve_indexed_item_row(
            connection, output_row, item_id, indexed_output
        )
    finally:
        connection.close()

    return PatchResult(
        document_path=resolved_path,
        output_path=output_path,
        item=_item_ref_from_row(refreshed_item),
        text=updated_text,
    )
2247
+
2248
def append_item_text(
    self,
    document_path: Path,
    item_id: str,
    text: str,
    *,
    output_mode: OutputMode = "versioned",
) -> PatchResult:
    """Append text to a DOCX paragraph, PPTX text shape, or XLSX cell.

    For DOCX and PPTX the item must already be indexed; for XLSX that
    requirement is not enforced. The output file is re-indexed after the
    write and the refreshed item row is returned inside the result.
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="write"
    )
    must_be_indexed = file_type != "xlsx"
    self._prepare_write_target(
        resolved_path,
        file_type,
        item_id,
        require_indexed_item=must_be_indexed,
    )
    planned_output = self._resolve_write_output_path(
        resolved_path, output_mode=output_mode
    )

    if file_type == "docx":
        output_path = docx_adapter.append_paragraph(
            resolved_path, item_id, text, planned_output
        )
        updated_text = docx_adapter.read_paragraph(output_path, item_id)
    elif file_type == "pptx":
        output_path = pptx_adapter.append_text_shape(
            resolved_path, item_id, text, planned_output
        )
        updated_text = pptx_adapter.read_text_shape(output_path, item_id)
    else:
        output_path = xlsx_adapter.append_cell(
            resolved_path, item_id, text, planned_output
        )
        updated_text = xlsx_adapter.read_cell(output_path, item_id)

    # Refresh the index so the lookup below sees the written file.
    self.index_document(output_path)

    indexed_output = output_path.resolve()
    connection = store.ensure_ready(self.config.index_path)
    try:
        output_row = self._resolve_document_row(connection, indexed_output)
        refreshed_item = self._resolve_indexed_item_row(
            connection, output_row, item_id, indexed_output
        )
    finally:
        connection.close()

    return PatchResult(
        document_path=resolved_path,
        output_path=output_path,
        item=_item_ref_from_row(refreshed_item),
        text=updated_text,
    )
2304
+
2305
def write_cell_value(
    self,
    document_path: Path,
    sheet_name: str,
    cell_coordinate: str,
    value: str,
    *,
    output_mode: OutputMode = "versioned",
) -> PatchResult:
    """Write a value into an XLSX cell and return the refreshed item.

    The cell does not have to be indexed beforehand. After the write the
    output workbook is re-indexed and the cell's item row is looked up again.

    Raises:
        InvalidArgumentsError: If the document is not an XLSX file.
    """
    resolved_path, file_type = self._require_allowed_document_path(
        document_path, action="write"
    )
    if file_type != "xlsx":
        raise InvalidArgumentsError("write-cell requires an .xlsx path.")

    cell_item_id = xlsx_adapter.make_item_id(sheet_name, cell_coordinate)
    self._prepare_write_target(
        resolved_path,
        file_type,
        cell_item_id,
        require_indexed_item=False,
    )
    planned_output = self._resolve_write_output_path(
        resolved_path, output_mode=output_mode
    )
    output_path = xlsx_adapter.write_cell(
        resolved_path, cell_item_id, value, planned_output
    )
    updated_text = xlsx_adapter.read_cell(output_path, cell_item_id)

    # Refresh the index so the lookup below sees the written workbook.
    self.index_document(output_path)

    indexed_output = output_path.resolve()
    connection = store.ensure_ready(self.config.index_path)
    try:
        output_row = self._resolve_document_row(connection, indexed_output)
        refreshed_item = self._resolve_indexed_item_row(
            connection, output_row, cell_item_id, indexed_output
        )
    finally:
        connection.close()

    return PatchResult(
        document_path=resolved_path,
        output_path=output_path,
        item=_item_ref_from_row(refreshed_item),
        text=updated_text,
    )
2354
+
2355
def _prepare_write_target(
    self,
    document_path: Path,
    file_type: FileType,
    item_id: str,
    *,
    require_indexed_item: bool,
) -> None:
    """Validate that ``item_id`` can be written in ``document_path``.

    The document must be indexed. When ``require_indexed_item`` is true the
    item must be indexed as well. If the file's current content hash differs
    from the indexed one, the target is re-checked against the file on disk
    and a failure there is reported as a stale locator.

    Raises:
        TargetNotFoundError: If the required item is not indexed (re-raised
            after the PPTX editability check had its chance to raise).
        StaleLocatorError: If the file changed since indexing and the target
            no longer resolves.
    """
    connection = store.ensure_ready(self.config.index_path)
    try:
        indexed_document = self._resolve_document_row(connection, document_path)

        if require_indexed_item:
            try:
                self._resolve_indexed_item_row(
                    connection, indexed_document, item_id, document_path
                )
            except TargetNotFoundError:
                # Let the PPTX helper raise its own error first, if it wants to.
                if file_type == "pptx":
                    _raise_if_pptx_target_not_editable(document_path, item_id)
                raise

        if indexed_document["content_hash"] == _content_hash(document_path):
            # File is unchanged since indexing; nothing more to verify.
            return

        try:
            _ensure_current_target_resolves(document_path, file_type, item_id)
        except (
            InvalidArgumentsError,
            TargetNotFoundError,
            pptx_adapter.TargetNotEditableError,
            xlsx_adapter.TargetNotAppendableError,
        ) as exc:
            raise StaleLocatorError(
                f"stale locator: {item_id} is no longer valid for {document_path}"
            ) from exc
    finally:
        connection.close()
2390
+
2391
+ def _ensure_object_locator_fresh(
2392
+ self, document: DocumentRef, locator: str | None
2393
+ ) -> None:
2394
+ current_hash = _content_hash(document.path)
2395
+ if document.content_hash is not None and document.content_hash != current_hash:
2396
+ subject = locator or document.path.as_posix()
2397
+ raise StaleLocatorError(
2398
+ f"stale locator: {subject} is no longer valid for {document.path}"
2399
+ )
2400
+
2401
def _finalize_object_mutation(
    self,
    document: DocumentRef,
    output_path: Path,
    locator: str,
    summary: str,
    metadata: dict[str, Any],
) -> MutationResult:
    """Re-index the written file and describe the mutated object.

    The output file is indexed, the object at ``locator`` is fetched from the
    freshly indexed document, and its descriptive fields are combined with
    the caller-supplied ``summary`` and ``metadata``.
    """
    indexed_output = self.index_document(output_path)
    output_document_id = indexed_output.document_id
    mutated_object = self.get_object(output_document_id, locator)
    return MutationResult(
        document_path=document.path,
        output_path=output_path,
        document_id=output_document_id,
        locator=mutated_object.locator,
        object_type=mutated_object.object_type,
        summary=summary,
        capabilities=mutated_object.capabilities,
        parent_locator=mutated_object.parent_locator,
        metadata=metadata,
    )
2422
+
2423
+ def _raise_stale_if_document_changed(
2424
+ self,
2425
+ document: DocumentRef,
2426
+ locator: str,
2427
+ exc: Exception,
2428
+ ) -> None:
2429
+ if document.content_hash is not None and document.content_hash != _content_hash(
2430
+ document.path
2431
+ ):
2432
+ raise StaleLocatorError(
2433
+ f"stale locator: {locator} is no longer valid for {document.path}"
2434
+ ) from exc
2435
+
2436
def _resolve_create_output_path(
    self,
    output_path: Path,
    *,
    output_mode: OutputMode,
) -> Path:
    """Pick and prepare the destination path for a newly created document.

    In ``inplace`` mode the path returned by ``_ensure_allowed_output_path``
    is used; otherwise a versioned path is built from ``output_path``. In
    both cases the parent directory is created if missing.

    Raises:
        PolicyRefusedError: If ``inplace`` is requested while
            ``allow_inplace_overwrite`` is disabled.
    """
    if _normalize_output_mode(output_mode) != "inplace":
        versioned_path = versioning.build_versioned_output_path(
            output_path,
            output_directory=self.config.output_directory,
            create_directory=False,
        )
        # Policy check first; the directory is only created once it passes.
        self._ensure_allowed_output_path(versioned_path)
        versioned_path.parent.mkdir(parents=True, exist_ok=True)
        return versioned_path

    if not self.config.allow_inplace_overwrite:
        raise PolicyRefusedError(
            "In-place overwrite is not enabled. Set allow_inplace_overwrite = true to use output-mode inplace."
        )
    allowed_output = self._ensure_allowed_output_path(output_path)
    allowed_output.parent.mkdir(parents=True, exist_ok=True)
    return allowed_output
2460
+
2461
def _dispatch_add_content_block(
    self,
    document: DocumentRef,
    block_type: str,
    properties: dict[str, Any],
    output_path: Path,
) -> str:
    """Route an add-content request to the matching adapter call.

    Supported ``file_type``/``block_type`` combinations:
    docx/paragraph, docx/heading, docx/table, pptx/slide, pptx/textbox,
    xlsx/sheet, xlsx/row and xlsx/cell.

    Returns:
        The locator of the added (or, for xlsx/cell, written) object.

    Raises:
        InvalidArgumentsError: If a required property is missing or of the
            wrong type, or if the combination is not supported.
    """

    def _first_string_property(accepted_keys):
        # First string-valued property, in dict order, whose key is accepted.
        for key, value in properties.items():
            if key in accepted_keys and isinstance(value, str):
                return value
        return None

    source_path = document.path
    combination = (document.file_type, block_type)

    if combination == ("docx", "paragraph"):
        _, locator = docx_adapter.add_paragraph(
            source_path,
            str(properties.get("text", "")),
            output_path=output_path,
        )
        return locator

    if combination == ("docx", "heading"):
        _, locator = docx_adapter.add_heading(
            source_path,
            str(properties.get("text", "")),
            int(properties.get("level", 1)),
            output_path=output_path,
        )
        return locator

    if combination == ("docx", "table"):
        row_count = int(properties.get("rows", 0))
        column_count = int(properties.get("columns", 0))
        _, locator = docx_adapter.add_table(
            source_path,
            row_count,
            column_count,
            output_path=output_path,
        )
        return locator

    if combination == ("pptx", "slide"):
        _, locator = pptx_adapter.add_slide(source_path, output_path=output_path)
        return locator

    if combination == ("pptx", "textbox"):
        slide_locator = _first_string_property({"slide", "slide_locator", "locator"})
        if slide_locator is None:
            raise InvalidArgumentsError(
                "PPTX textbox blocks require a slide locator."
            )
        _, locator = pptx_adapter.add_textbox(
            source_path,
            slide_locator,
            str(properties.get("text", "")),
            left=_optional_int(properties.get("left")),
            top=_optional_int(properties.get("top")),
            width=_optional_int(properties.get("width")),
            height=_optional_int(properties.get("height")),
            output_path=output_path,
        )
        return locator

    if combination == ("xlsx", "sheet"):
        sheet_title = properties.get("name")
        if not isinstance(sheet_title, str):
            raise InvalidArgumentsError(
                "XLSX sheet blocks require a sheet name."
            )
        _, locator = xlsx_adapter.add_sheet(
            source_path, sheet_title, output_path=output_path
        )
        return locator

    if combination == ("xlsx", "row"):
        sheet_locator = _first_string_property({"sheet", "sheet_locator", "locator"})
        if sheet_locator is None:
            raise InvalidArgumentsError(
                "XLSX row blocks require a worksheet locator."
            )
        row_values = properties.get("values")
        if not isinstance(row_values, list):
            raise InvalidArgumentsError(
                "XLSX row blocks require a values list."
            )
        _, locator = xlsx_adapter.add_row(
            source_path,
            sheet_locator,
            row_values,
            output_path=output_path,
        )
        return locator

    if combination == ("xlsx", "cell"):
        cell_locator = _first_string_property({"cell", "cell_locator", "locator"})
        if cell_locator is None:
            raise InvalidArgumentsError(
                "XLSX cell blocks require a cell locator."
            )
        # The adapter takes the legacy locator form; callers get the v2 form.
        xlsx_adapter.write_cell(
            source_path,
            to_legacy_locator(cell_locator, file_type="xlsx"),
            properties.get("value"),
            output_path=output_path,
        )
        return to_v2_locator(cell_locator, file_type="xlsx")

    raise InvalidArgumentsError(
        f"Unsupported add_content_block combination: {document.file_type}/{block_type}"
    )
2589
+
2590
def _resolve_write_output_path(
    self,
    document_path: Path,
    *,
    output_mode: OutputMode,
) -> Path:
    """Pick the destination path for a write to an existing document.

    In ``inplace`` mode the document path itself is returned after the output
    policy check. Otherwise a versioned path is built, checked against the
    output policy, and its parent directory is created if missing.

    Raises:
        PolicyRefusedError: If ``inplace`` is requested while
            ``allow_inplace_overwrite`` is disabled.
    """
    if _normalize_output_mode(output_mode) != "inplace":
        versioned_path = versioning.build_versioned_output_path(
            document_path,
            output_directory=self.config.output_directory,
            create_directory=False,
        )
        # Policy check first; the directory is only created once it passes.
        self._ensure_allowed_output_path(versioned_path)
        versioned_path.parent.mkdir(parents=True, exist_ok=True)
        return versioned_path

    if not self.config.allow_inplace_overwrite:
        raise PolicyRefusedError(
            "In-place overwrite is not enabled. Set allow_inplace_overwrite = true to use output-mode inplace."
        )
    self._ensure_allowed_output_path(document_path)
    return document_path
2613
+
2614
+ def _require_document_type(
2615
+ self,
2616
+ document_id: str,
2617
+ *,
2618
+ expected: FileType,
2619
+ operation: str,
2620
+ ) -> DocumentRef:
2621
+ document = self.get_document(document_id)
2622
+ if document.file_type != expected:
2623
+ raise InvalidArgumentsError(
2624
+ f"{operation} requires a .{expected} document, got .{document.file_type}."
2625
+ )
2626
+ return document
2627
+
2628
def run_doctor(
    self,
    required_imports: Sequence[tuple[str, str]] | None = None,
) -> DoctorReport:
    """Run the environment checks and bundle them into a report.

    Args:
        required_imports: ``(module_name, label)`` pairs to import-check.
            When falsy (``None`` or empty), ``REQUIRED_IMPORTS`` is used.

    Returns:
        A report holding, in order: import checks, SQLite/FTS5 checks, the
        index-path check, embedding provider/model/store checks, and the
        document-root, allowed-root and output-root checks.
    """
    config = self.config
    import_specs = required_imports or REQUIRED_IMPORTS

    checks: list[DoctorCheck] = [
        _check_import(module_name, label) for module_name, label in import_specs
    ]
    checks.append(_check_sqlite_module())
    checks.append(_check_fts5_support())
    checks.append(_check_index_path(config.index_path))
    checks.append(_check_embedding_provider_import())

    model_check = _check_embedding_model(
        config.embedding_model,
        config.embedding_dimensions,
        provider_factory=self.embedding_provider_factory,
    )
    checks.append(model_check)

    store_check = _check_embedding_store(
        config.index_path,
        config.embedding_model,
        config.embedding_dimensions,
    )
    checks.append(store_check)

    checks.extend(_check_document_roots(config.document_roots))
    checks.extend(_check_allowed_roots(config.allowed_roots))
    checks.extend(_check_output_roots(config.output_roots))

    return DoctorReport(checks=tuple(checks))
2660
+
2661
def _semantic_search(
    self,
    connection: sqlite3.Connection,
    query: str,
    *,
    file_type: str | None,
    document_path: Path | None,
    limit: int,
    require_embeddings: bool = True,
) -> list[SearchHit]:
    """Rank stored embeddings against the query by cosine similarity.

    Both item embeddings and XLSX row embeddings within the requested scope
    are scored. Hits are ordered by descending score, then by document path
    string and item id, and truncated to ``limit``.

    Raises:
        NoEmbeddingsError: If no embeddings are stored for the scope and
            ``require_embeddings`` is true (otherwise ``[]`` is returned).
    """
    scope = {"file_type": file_type, "document_path": document_path}
    item_rows = store.fetch_item_embeddings(connection, **scope)
    xlsx_rows = store.fetch_xlsx_row_embeddings(connection, **scope)

    if not (item_rows or xlsx_rows):
        if not require_embeddings:
            return []
        raise NoEmbeddingsError(
            "No embeddings are indexed for the requested corpus. Reindex with --with-embeddings first."
        )

    provider = self._get_embedding_provider()
    store.ensure_embedding_meta(
        connection,
        model_name=provider.model_name,
        dimensions=provider.dimensions,
    )
    query_blob = provider.embed_texts([query])[0]
    query_vector = _unpack_embedding(query_blob, provider.dimensions)

    def _similarity_to_query(row) -> float:
        # Each stored row carries its own dimension count for unpacking.
        stored_vector = _unpack_embedding(row["embedding"], int(row["dimensions"]))
        return _cosine_similarity(query_vector, stored_vector)

    ranked_hits: list[SearchHit] = [
        _search_hit_from_semantic_row(row, _similarity_to_query(row))
        for row in item_rows
    ]
    ranked_hits.extend(
        _search_hit_from_xlsx_semantic_row(
            connection, row, _similarity_to_query(row)
        )
        for row in xlsx_rows
    )

    ranked_hits.sort(
        key=lambda hit: (-hit.score, str(hit.document_path), hit.item_id)
    )
    top_hits = ranked_hits[:limit]
    LOGGER.info(
        "Semantic search executed for query=%r top_k=%s hit_count=%s",
        query,
        limit,
        min(len(ranked_hits), limit),
    )
    return top_hits
2728
+
2729
+ def _get_embedding_provider(self) -> embedding_provider.EmbeddingProvider:
2730
+ if self._embedding_provider is None:
2731
+ factory = self.embedding_provider_factory or (
2732
+ lambda model_name, dimensions: (
2733
+ embedding_provider.LocalEmbeddingProvider(
2734
+ model_name=model_name,
2735
+ dimensions=dimensions,
2736
+ )
2737
+ )
2738
+ )
2739
+ self._embedding_provider = factory(
2740
+ self.config.embedding_model,
2741
+ self.config.embedding_dimensions,
2742
+ )
2743
+ return self._embedding_provider
2744
+
2745
def _require_allowed_document_path(
    self,
    document_path: Path,
    *,
    action: str,
) -> tuple[Path, FileType]:
    """Resolve an indexable path and enforce the read policy on it.

    Returns:
        The ``(resolved_path, file_type)`` pair from
        ``_require_indexable_path``, returned only after
        ``_ensure_allowed_document_path`` accepted the resolved path.
    """
    indexable = _require_indexable_path(document_path)
    self._ensure_allowed_document_path(indexable[0], action=action)
    return indexable[0], indexable[1]
2754
+
2755
def _ensure_allowed_document_path(
    self, document_path: Path, *, action: str
) -> Path:
    """Canonicalize an existing path and check it against the read roots.

    ``action`` only affects the label passed to ``ensure_path_allowed``
    (``"<action> target"``). The result of ``ensure_path_allowed`` is returned.
    """
    canonical_path = canonicalize_existing_path(document_path)
    readable_roots = self._read_policy_roots()
    return ensure_path_allowed(
        canonical_path,
        readable_roots,
        label=f"{action} target",
        policy_name="allowed roots",
    )
2765
+
2766
def _ensure_allowed_output_path(self, output_path: Path) -> Path:
    """Canonicalize an output path and check it against the output roots.

    The result of ``ensure_path_allowed`` is returned.
    """
    canonical_output = canonicalize_output_path(output_path)
    writable_roots = self.config.output_roots
    return ensure_path_allowed(
        canonical_output,
        writable_roots,
        label="write output",
        policy_name="output roots",
    )
2774
+
2775
+ def _is_allowed_document_path(self, document_path: Path) -> bool:
2776
+ try:
2777
+ self._ensure_allowed_document_path(document_path, action="read")
2778
+ except (PolicyRefusedError, TargetNotFoundError):
2779
+ return False
2780
+ return True
2781
+
2782
+ def _read_policy_roots(self) -> tuple[Path, ...]:
2783
+ combined = list(self.config.allowed_roots) + list(self.config.output_roots)
2784
+ unique_roots: list[Path] = []
2785
+ seen: set[Path] = set()
2786
+ for root in combined:
2787
+ if root not in seen:
2788
+ unique_roots.append(root)
2789
+ seen.add(root)
2790
+ return tuple(unique_roots)
2791
+
2792
+ def _resolve_item_row(
2793
+ self,
2794
+ connection: sqlite3.Connection,
2795
+ document_path: Path,
2796
+ item_id: str,
2797
+ ) -> tuple[sqlite3.Row, sqlite3.Row]:
2798
+ resolved_path, _ = self._require_allowed_document_path(
2799
+ document_path, action="show"
2800
+ )
2801
+ document_row = self._resolve_document_row(connection, resolved_path)
2802
+ item_row = self._resolve_indexed_item_row(
2803
+ connection, document_row, item_id, resolved_path
2804
+ )
2805
+ return document_row, item_row
2806
+
2807
def _resolve_document_row(
    self,
    connection: sqlite3.Connection,
    document_path: Path,
) -> sqlite3.Row:
    """Fetch the index row for a document path.

    Raises:
        TargetNotFoundError: If the store has no row for the path.
    """
    found_row = store.fetch_document_by_path(connection, document_path)
    if found_row is not None:
        return found_row
    raise TargetNotFoundError(f"Document is not indexed: {document_path}")
2816
+
2817
def _resolve_document_by_id_row(
    self,
    connection: sqlite3.Connection,
    document_id: str,
) -> sqlite3.Row:
    """Fetch the index row for a document id.

    Raises:
        TargetNotFoundError: If the store has no row for the id.
    """
    found_row = store.fetch_document_by_id(connection, document_id)
    if found_row is not None:
        return found_row
    raise TargetNotFoundError(f"Document is not indexed: {document_id}")
2826
+
2827
def _resolve_indexed_item_row(
    self,
    connection: sqlite3.Connection,
    document_row: sqlite3.Row,
    item_id: str,
    document_path: Path,
) -> sqlite3.Row:
    """Fetch the index row of an item belonging to ``document_row``.

    ``document_path`` is only used to build the error message.

    Raises:
        TargetNotFoundError: If the store has no row for the item.
    """
    owning_document_id = document_row["document_id"]
    found_row = store.fetch_item_by_id(connection, owning_document_id, item_id)
    if found_row is not None:
        return found_row
    raise TargetNotFoundError(
        f"Item {item_id} is not indexed for {document_path}"
    )
2842
+
2843
+
2844
def discover_documents(roots: Iterable[Path]) -> list[DocumentRef]:
    """Collect document references for supported files under the given roots.

    Roots that do not exist or are not directories are skipped. Within each
    root, candidates are visited in order of their string path; only regular
    files whose lower-cased suffix is a key of ``SUPPORTED_EXTENSIONS`` are
    kept.
    """
    discovered: list[DocumentRef] = []

    for root in roots:
        if not (root.exists() and root.is_dir()):
            continue

        for entry in sorted(root.rglob("*"), key=str):
            if not entry.is_file():
                continue
            suffix = entry.suffix.lower()
            if suffix in SUPPORTED_EXTENSIONS:
                discovered.append(
                    _build_document_ref(entry, SUPPORTED_EXTENSIONS[suffix])
                )

    return discovered
2864
+
2865
+
2866
def format_doctor_report(report: DoctorReport) -> str:
    """Render a doctor report as newline-separated text.

    The output starts with a ``Doctor Report`` header, lists one
    ``[PASS]``/``[FAIL]`` line per check, and ends with a summary line chosen
    from ``report.ok``.
    """
    check_lines = [
        f"[{'PASS' if check.ok else 'FAIL'}] {check.name}: {check.detail}"
        for check in report.checks
    ]
    if report.ok:
        closing_line = "All checks passed."
    else:
        closing_line = "One or more checks failed."
    return "\n".join(["Doctor Report", *check_lines, closing_line])
2875
+
2876
+
2877
def _build_document_ref(path: Path, file_type: FileType) -> DocumentRef:
    """Build a document reference from a file on disk.

    The document id is the SHA-256 hex digest of the resolved path string
    (UTF-8 encoded); the modification time comes from ``stat`` and the
    content hash from ``_content_hash``.
    """
    resolved = path.resolve()
    path_digest = hashlib.sha256(str(resolved).encode("utf-8"))
    modified_time = resolved.stat().st_mtime
    return DocumentRef(
        document_id=path_digest.hexdigest(),
        path=resolved,
        file_type=file_type,
        display_name=resolved.name,
        modified_time=modified_time,
        content_hash=_content_hash(resolved),
    )
2890
+
2891
+
2892
+ def _index_candidates(path: Path) -> list[Path]:
2893
+ resolved = path.resolve()
2894
+ if resolved.is_dir():
2895
+ return sorted(
2896
+ [
2897
+ candidate
2898
+ for candidate in resolved.rglob("*")
2899
+ if candidate.is_file()
2900
+ and candidate.suffix.lower() in SUPPORTED_EXTENSIONS
2901
+ ],
2902
+ key=lambda candidate: str(candidate),
2903
+ )
2904
+ return [resolved]
2905
+
2906
+
2907
def _require_indexable_path(path: Path) -> tuple[Path, FileType]:
    """Canonicalize a path and map its suffix to an indexable file type.

    Returns:
        ``(resolved_path, file_type)``.

    Raises:
        InvalidArgumentsError: If the lower-cased suffix is not a key of
            ``INDEXABLE_EXTENSIONS``.
    """
    resolved = canonicalize_existing_path(path)
    suffix = resolved.suffix.lower()
    file_type = INDEXABLE_EXTENSIONS.get(suffix)
    if file_type is not None:
        return resolved, file_type
    raise InvalidArgumentsError(
        f"Implemented operations require a .docx, .pptx, or .xlsx path: {path}"
    )
2915
+
2916
+
2917
def _extract_items(document_path: Path, file_type: FileType):
    """Extract a document through the adapter that matches its file type.

    Raises:
        InvalidArgumentsError: If ``file_type`` is not ``"docx"``,
            ``"pptx"`` or ``"xlsx"``.
    """
    extractors = {
        "docx": docx_adapter.extract_document,
        "pptx": pptx_adapter.extract_document,
        "xlsx": xlsx_adapter.extract_document,
    }
    extract = extractors.get(file_type)
    if extract is None:
        raise InvalidArgumentsError(f"Unsupported indexable file type: {file_type}")
    return extract(document_path)
2925
+
2926
+
2927
def _search_hit_from_keyword_row(row: sqlite3.Row) -> SearchHit:
    """Convert a keyword-search result row into a ``SearchHit``.

    The row's ``score`` is coerced to ``float`` and its ``path`` to ``Path``;
    the hit is tagged with ``match_mode="keyword"``.
    """
    hit_fields = {
        "document_id": row["document_id"],
        "item_id": row["item_id"],
        "score": float(row["score"]),
        "matched_text": row["content_text"],
        "locator": row["locator"],
        "item_type": row["item_type"],
        "preview": row["preview"],
        "document_path": Path(row["path"]),
        "display_name": row["display_name"],
        "match_mode": "keyword",
    }
    return SearchHit(**hit_fields)
2940
+
2941
+
2942
def _search_hit_from_semantic_row(row: sqlite3.Row, similarity: float) -> SearchHit:
    """Convert an item-embedding row plus its similarity into a ``SearchHit``.

    ``similarity`` is used as the hit score and recorded under both the
    ``semantic`` and ``final`` keys of ``scores``.
    """
    hit_fields = {
        "document_id": row["document_id"],
        "item_id": row["item_id"],
        "score": similarity,
        "matched_text": row["content_text"],
        "locator": row["locator"],
        "item_type": row["item_type"],
        "preview": row["preview"],
        "document_path": Path(row["path"]),
        "display_name": row["display_name"],
        "match_mode": "semantic",
        "scores": {"semantic": similarity, "final": similarity},
    }
    return SearchHit(**hit_fields)
2956
+
2957
+
2958
def _search_hit_from_xlsx_semantic_row(
    connection: sqlite3.Connection,
    row: sqlite3.Row,
    similarity: float,
) -> SearchHit:
    """Convert an XLSX row-embedding match into a ``SearchHit``.

    The cells recorded for the row embedding are fetched from the store and
    their coordinates listed in the hit metadata, next to the matched sheet,
    the row number, and the ``coordinate`` value from the row's metadata JSON.
    """
    contributing_cells = store.fetch_xlsx_row_embedding_cells(
        connection, row["embedding_id"]
    )
    representative_coordinate = _metadata_value(row, "coordinate")
    contributing_coordinates = [
        cell["cell_coordinate"] for cell in contributing_cells
    ]
    row_metadata = {
        "matched_sheet": row["sheet_name"],
        "matched_row": int(row["row_number"]),
        "contributing_cell_coordinates": contributing_coordinates,
        "representative_cell_coordinate": representative_coordinate,
        "resolved_from_row_embedding": True,
    }
    return SearchHit(
        document_id=row["document_id"],
        item_id=row["item_id"],
        score=similarity,
        matched_text=row["content_text"],
        locator=row["locator"],
        item_type=row["item_type"],
        preview=row["preview"],
        document_path=Path(row["path"]),
        display_name=row["display_name"],
        match_mode="semantic",
        scores={"semantic": similarity, "final": similarity},
        metadata=row_metadata,
    )
2989
+
2990
+
2991
def _document_ref_from_row(row: sqlite3.Row) -> DocumentRef:
    """Convert a documents-table row into a ``DocumentRef``.

    ``item_count`` is set to ``None`` when the row has no ``item_count``
    column; otherwise the column value is converted to ``int``.
    """
    fields = {
        "document_id": row["document_id"],
        "path": Path(row["path"]),
        "file_type": row["file_type"],
        "display_name": row["display_name"],
        "modified_time": float(row["modified_time"]),
        "content_hash": row["content_hash"],
    }
    if "item_count" in row.keys():
        fields["item_count"] = int(row["item_count"])
    else:
        fields["item_count"] = None
    return DocumentRef(**fields)
3001
+
3002
+
3003
def _item_ref_from_row(row: sqlite3.Row) -> ItemRef:
    """Convert an items-table row into an ``ItemRef``.

    The ``metadata_json`` column is decoded with ``json.loads``; all other
    columns are passed through unchanged.
    """
    fields = {
        "document_id": row["document_id"],
        "item_id": row["item_id"],
        "item_type": row["item_type"],
        "locator": row["locator"],
        "preview": row["preview"],
        "metadata": json.loads(row["metadata_json"]),
        "content_text": row["content_text"],
    }
    return ItemRef(**fields)
3013
+
3014
+
3015
def _metadata_value(row: sqlite3.Row, key: str, *, default=None):
    """Return one entry of the row's decoded ``metadata_json`` column.

    Args:
        row: Row (or mapping) exposing a ``metadata_json`` JSON string.
        key: Metadata key to look up.
        default: Value returned when ``key`` is absent.
    """
    decoded = json.loads(row["metadata_json"])
    if key in decoded:
        return decoded[key]
    return default
3018
+
3019
+
3020
def _normalize_search_mode(mode: str) -> SearchMode:
    """Return ``mode`` stripped and lower-cased if it names a search mode.

    Accepted values (after normalization) are ``"keyword"``, ``"semantic"``
    and ``"hybrid"``.

    Raises:
        InvalidArgumentsError: If the normalized value is anything else.
    """
    candidate = mode.strip().lower()
    if candidate in ("keyword", "semantic", "hybrid"):
        return candidate  # type: ignore[return-value]
    raise InvalidArgumentsError(f"Unsupported search mode: {mode}")
3025
+
3026
+
3027
def _normalize_output_mode(output_mode: str) -> OutputMode:
    """Return ``output_mode`` stripped and lower-cased if it is supported.

    Accepted values (after normalization) are ``"versioned"`` and
    ``"inplace"``.

    Raises:
        InvalidArgumentsError: If the normalized value is anything else.
    """
    candidate = output_mode.strip().lower()
    if candidate in ("versioned", "inplace"):
        return candidate  # type: ignore[return-value]
    raise InvalidArgumentsError(f"Unsupported output mode: {output_mode}")
3032
+
3033
+
3034
def _optional_int(value: object) -> int | None:
    """Coerce an optional value to ``int``, rejecting booleans.

    Args:
        value: ``None`` or anything ``int()`` accepts.

    Returns:
        ``None`` when ``value`` is ``None``, otherwise ``int(value)``.

    Raises:
        InvalidArgumentsError: If ``value`` is a ``bool`` or cannot be
            converted. This includes infinite floats, for which ``int()``
            raises ``OverflowError`` rather than ``ValueError``.
    """
    if value is None:
        return None
    # bool is an int subclass; treat True/False as a caller mistake.
    if isinstance(value, bool):
        raise InvalidArgumentsError(f"Expected integer value, got {value!r}.")
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError) as exc:
        raise InvalidArgumentsError(f"Expected integer value, got {value!r}.") from exc
3043
+
3044
+
3045
def _content_hash(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in fixed-size chunks so that large documents are not
    loaded into memory in one piece; the digest is identical to hashing the
    complete file contents.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: stream.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()
3047
+
3048
+
3049
def _build_embedding_text(
    item: IndexedItem, document_path: Path, *, file_type: FileType
) -> str:
    """Build the text to embed for ``item`` using its format adapter.

    Dispatches on ``file_type`` to the matching adapter's
    ``build_embedding_text(item, document_path)``.

    Raises:
        InvalidArgumentsError: If ``file_type`` is not docx, pptx or xlsx.
    """
    if file_type == "docx":
        adapter = docx_adapter
    elif file_type == "pptx":
        adapter = pptx_adapter
    elif file_type == "xlsx":
        adapter = xlsx_adapter
    else:
        raise InvalidArgumentsError(f"Unsupported embedding text type: {file_type}")
    return adapter.build_embedding_text(item, document_path)
3059
+
3060
+
3061
def _unpack_embedding(blob: bytes, dimensions: int) -> list[float]:
    """Decode a blob of little-endian 32-bit floats into a list.

    Args:
        blob: Packed embedding; must be exactly ``dimensions * 4`` bytes.
        dimensions: Number of float components expected.

    Raises:
        RuntimeError: If the blob length does not match ``dimensions``.
    """
    byte_count = dimensions * 4
    if len(blob) == byte_count:
        values = struct.unpack(f"<{dimensions}f", blob)
        return list(values)
    raise RuntimeError(
        f"Embedding blob length {len(blob)} does not match expected size {byte_count}."
    )
3068
+
3069
+
3070
def _cosine_similarity(left: Sequence[float], right: Sequence[float]) -> float:
    """Return the dot product of two equal-length vectors as a ``float``.

    NOTE(review): no normalization is performed here, so the result is a
    true cosine similarity only if both vectors are already unit length;
    presumably embeddings are stored normalized, confirm against the
    embedding provider.

    Raises:
        RuntimeError: If the vectors differ in length.
    """
    if len(left) != len(right):
        raise RuntimeError("Embedding vectors must have the same dimensionality.")
    total = 0
    for a, b in zip(left, right):
        total += a * b
    return float(total)
3074
+
3075
+
3076
def _rank_scores(storage_ids: Sequence[str]) -> dict[str, float]:
    """Map each id to the reciprocal of its 1-based position in the sequence.

    The first id scores ``1.0``, the second ``0.5``, and so on. If an id
    occurs more than once, the score of its last occurrence wins.
    """
    scores: dict[str, float] = {}
    for position, storage_id in enumerate(storage_ids, start=1):
        scores[storage_id] = 1.0 / position
    return scores
3080
+
3081
+
3082
def _merge_hybrid_hits(
    keyword_rows: Sequence[sqlite3.Row],
    semantic_hits: Sequence[SearchHit],
    *,
    limit: int,
    keyword_weight: float,
    semantic_weight: float,
) -> list[SearchHit]:
    """Fuse keyword rows and semantic hits into ranked hybrid hits.

    Each result is keyed by storage id (the row's ``storage_id`` column, or
    ``"<document_id>:<item_id>"`` for a semantic hit). Its final score is
    the weighted sum of the reciprocal-rank scores it obtained in the two
    input lists; an item missing from one list scores ``0.0`` there.

    Args:
        keyword_rows: Keyword results in rank order.
        semantic_hits: Semantic results in rank order.
        limit: Maximum number of merged hits to return.
        keyword_weight: Multiplier applied to the keyword rank score.
        semantic_weight: Multiplier applied to the semantic rank score.

    Returns:
        Hits with ``match_mode="hybrid"``, ordered by final score, then
        semantic score, then keyword score (all descending), then document
        path and item id.
    """
    keyword_lookup = {row["storage_id"]: row for row in keyword_rows}
    semantic_lookup = {
        f"{hit.document_id}:{hit.item_id}": hit for hit in semantic_hits
    }
    keyword_ranks = _rank_scores([row["storage_id"] for row in keyword_rows])
    semantic_ranks = _rank_scores(
        [f"{hit.document_id}:{hit.item_id}" for hit in semantic_hits]
    )

    def _ordering(hit: SearchHit):
        # Negated scores give descending order; path/item id break ties.
        components = hit.scores or {}
        return (
            -hit.score,
            -components.get("semantic", 0.0),
            -components.get("keyword", 0.0),
            str(hit.document_path),
            hit.item_id,
        )

    combined: list[SearchHit] = []
    for storage_id in sorted(keyword_lookup.keys() | semantic_lookup.keys()):
        semantic_hit = semantic_lookup.get(storage_id)
        if semantic_hit is None:
            # Only found by keyword search: build the hit from the raw row.
            source = _search_hit_from_keyword_row(keyword_lookup.get(storage_id))  # type: ignore[arg-type]
        else:
            source = semantic_hit
        keyword_part = keyword_ranks.get(storage_id, 0.0)
        semantic_part = semantic_ranks.get(storage_id, 0.0)
        fused = (keyword_weight * keyword_part) + (semantic_weight * semantic_part)
        combined.append(
            SearchHit(
                document_id=source.document_id,
                item_id=source.item_id,
                score=fused,
                matched_text=source.matched_text,
                locator=source.locator,
                item_type=source.item_type,
                preview=source.preview,
                document_path=source.document_path,
                display_name=source.display_name,
                match_mode="hybrid",
                scores={
                    "keyword": keyword_part,
                    "semantic": semantic_part,
                    "final": fused,
                },
                metadata=dict(source.metadata),
            )
        )

    ranked = sorted(combined, key=_ordering)
    return ranked[:limit]
3144
+
3145
+
3146
def _ensure_current_target_resolves(
    document_path: Path, file_type: FileType, item_id: str
) -> None:
    """Read the target item through its adapter and discard the result.

    DOCX targets are read as paragraphs and PPTX targets as text shapes;
    every other ``file_type`` is read as an XLSX cell. Presumably the
    adapter raises when the item no longer resolves; verify against the
    adapters.
    """
    if file_type == "docx":
        docx_adapter.read_paragraph(document_path, item_id)
    elif file_type == "pptx":
        pptx_adapter.read_text_shape(document_path, item_id)
    else:
        xlsx_adapter.read_cell(document_path, item_id)
3156
+
3157
+
3158
def _object_resolver(file_type: FileType):
    """Return the ``OBJECT_RESOLVERS`` entry registered for ``file_type``."""
    resolver = OBJECT_RESOLVERS[file_type]
    return resolver
3160
+
3161
+
3162
def _require_capability(
    capabilities: Sequence[Capability],
    required: Capability,
    locator: str,
) -> None:
    """Ensure ``required`` is among ``capabilities``.

    Args:
        capabilities: Capabilities advertised by the target object.
        required: Capability the pending operation needs.
        locator: Locator of the target, used in the error message.

    Raises:
        TargetNotEditableError: If ``required`` is not in ``capabilities``.
    """
    if required in capabilities:
        return
    raise TargetNotEditableError(f"{locator} does not support {required.value}.")
3169
+
3170
+
3171
def _primary_locator_for_batch(operations: Sequence[dict[str, Any]]) -> str | None:
    """Return the first non-empty string locator found in a batch.

    Operations are scanned in order; within each operation the keys
    ``locator``, ``parent_locator``, ``new_parent_locator`` and
    ``target_parent_locator`` are tried in that order.

    Returns:
        The first value that is a non-empty ``str``, or ``None`` if no
        operation carries one.
    """
    locator_keys = (
        "locator",
        "parent_locator",
        "new_parent_locator",
        "target_parent_locator",
    )
    candidates = (
        operation.get(key) for operation in operations for key in locator_keys
    )
    return next(
        (
            candidate
            for candidate in candidates
            if isinstance(candidate, str) and candidate
        ),
        None,
    )
3183
+
3184
+
3185
def _make_batch_work_path(output_path: Path, suffix: str) -> Path:
    """Create an empty work file next to ``output_path`` and return its path.

    The parent directory of ``output_path`` is created if needed. The work
    file is named ``.offagent-batch-<random><suffix>`` and is not removed
    automatically.
    """
    directory = output_path.parent
    directory.mkdir(parents=True, exist_ok=True)
    with tempfile.NamedTemporaryFile(
        prefix=".offagent-batch-",
        suffix=suffix,
        dir=directory,
        delete=False,
    ) as handle:
        work_name = handle.name
    return Path(work_name)
3195
+
3196
+
3197
def _validate_batch_operation(
    document_path: Path,
    file_type: FileType,
    operation: dict[str, Any],
) -> MutationResult:
    """Check that one batch operation could run, without modifying anything.

    For ``create_object`` the parent must resolve and allow adding children,
    and the creation arguments must validate. For the other operations the
    target must resolve and advertise the matching capability;
    ``update_object`` additionally rejects ``range`` and the combination of
    ``segments`` with ``properties.text``/``properties.value``.

    Returns:
        A ``MutationResult`` with ``output_path=None`` and a
        ``"Validated <operation>."`` summary.

    Raises:
        InvalidArgumentsError: For unsupported or inconsistent arguments.
        TargetNotEditableError: If the required capability is missing.
    """
    name = _operation_name(operation)
    resolver = _object_resolver(file_type)

    if name == "create_object":
        parent = resolver.get_object(document_path, str(operation["parent_locator"]))
        _require_capability(parent.capabilities, Capability.ADD_CHILD, parent.locator)
        _validate_create_operation(file_type, operation)
        return MutationResult(
            document_path=document_path,
            output_path=None,
            document_id=document_path.resolve().as_posix(),
            locator=None,
            object_type=str(operation["object_type"]),
            summary=f"Validated {name}.",
            parent_locator=parent.locator,
            metadata={"operation": name},
        )

    locator = str(operation["locator"])
    payload = resolver.get_object(document_path, locator)
    capability_for = {
        "update_object": Capability.UPDATE,
        "move_object": Capability.MOVE,
        "copy_object": Capability.COPY,
    }
    _require_capability(payload.capabilities, capability_for[name], locator)

    if name == "update_object":
        segments = _coerce_inline_fragments(operation.get("segments"))
        text_range = _coerce_visible_text_range(operation.get("range"))
        if text_range is not None:
            raise InvalidArgumentsError("update_object does not accept range.")
        if segments is not None:
            # Properties are only inspected when segments were supplied.
            supplied = dict(operation.get("properties", {}))
            if "text" in supplied or "value" in supplied:
                raise InvalidArgumentsError(
                    "update_object accepts either properties.text/value or segments, not both."
                )

    return MutationResult(
        document_path=document_path,
        output_path=None,
        document_id=document_path.resolve().as_posix(),
        locator=payload.locator,
        object_type=payload.object_type,
        summary=f"Validated {name}.",
        capabilities=payload.capabilities,
        parent_locator=payload.parent_locator,
        metadata={"operation": name},
    )
3250
+
3251
+
3252
def _apply_batch_operation(
    document_path: Path,
    file_type: FileType,
    operation: dict[str, Any],
) -> MutationResult:
    """Apply one batch operation to ``document_path`` in place.

    The operation is dispatched to the create/update/move/copy helper with
    ``output_path`` set to ``document_path`` itself. Afterwards the affected
    object is resolved again so that the result reflects its current
    locator, type, capabilities and parent.

    Returns:
        A ``MutationResult`` whose ``output_path`` equals ``document_path``.
    """
    name = _operation_name(operation)

    if name == "create_object":
        result_locator, summary, metadata = _create_object_on_path(
            document_path,
            file_type,
            parent_locator=str(operation["parent_locator"]),
            object_type=str(operation["object_type"]),
            properties=dict(operation.get("properties", {})),
            position=operation.get("position"),
            segments=_coerce_inline_fragments(operation.get("segments")),
            text_range=_coerce_visible_text_range(operation.get("range")),
            output_path=document_path,
        )
    elif name == "update_object":
        # Updates keep the caller's locator for the follow-up lookup.
        result_locator = str(operation["locator"])
        summary, metadata = _update_object_on_path(
            document_path,
            file_type,
            locator=result_locator,
            properties=dict(operation.get("properties", {})),
            segments=_coerce_inline_fragments(operation.get("segments")),
            text_range=_coerce_visible_text_range(operation.get("range")),
            output_path=document_path,
        )
    elif name == "move_object":
        result_locator, summary, metadata = _move_object_on_path(
            document_path,
            file_type,
            locator=str(operation["locator"]),
            new_parent_locator=str(operation["new_parent_locator"]),
            position=operation.get("position"),
            output_path=document_path,
        )
    else:
        # _operation_name only admits copy_object beyond the cases above.
        result_locator, summary, metadata = _copy_object_on_path(
            document_path,
            file_type,
            locator=str(operation["locator"]),
            target_parent_locator=str(operation["target_parent_locator"]),
            position=operation.get("position"),
            output_path=document_path,
        )

    payload = _object_resolver(file_type).get_object(document_path, result_locator)
    return MutationResult(
        document_path=document_path,
        output_path=document_path,
        document_id=document_path.resolve().as_posix(),
        locator=payload.locator,
        object_type=payload.object_type,
        summary=summary,
        capabilities=payload.capabilities,
        parent_locator=payload.parent_locator,
        metadata=metadata,
    )
3349
+
3350
+
3351
def _create_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    parent_locator: str,
    object_type: str,
    properties: dict[str, Any],
    position: object | None,
    segments: tuple[InlineFragment, ...] | None,
    text_range: VisibleTextRange | None,
    output_path: Path,
) -> tuple[str, str, dict[str, Any]]:
    """Create a new object under ``parent_locator`` and write the document.

    Supported combinations are a DOCX paragraph, a PPTX text shape/textbox
    and an XLSX cell. The parent must resolve and allow adding children,
    and the arguments are validated before anything is written.

    Args:
        document_path: Document to read.
        file_type: Format of the document.
        parent_locator: Locator of the object that receives the new child.
        object_type: Kind of object to create.
        properties: Creation properties (text/value, style, geometry, ...).
        position: Optional placement hint (DOCX: an "after" locator).
        segments: Optional styled fragments used instead of plain text.
        text_range: Must be ``None``; validation rejects a range.
        output_path: Where the modified document is written.

    Returns:
        ``(locator, summary, metadata)`` for the created object.

    Raises:
        InvalidArgumentsError: For unsupported type combinations or
            missing/invalid arguments.
        TargetNotEditableError: If the parent cannot receive children.
    """
    parent = _object_resolver(file_type).get_object(document_path, parent_locator)
    _require_capability(parent.capabilities, Capability.ADD_CHILD, parent_locator)
    request = {
        "parent_locator": parent_locator,
        "object_type": object_type,
        "properties": properties,
        "position": position,
        "segments": segments,
        "range": text_range,
    }
    _validate_create_operation(file_type, request)

    summary = f"Created {object_type} under {parent_locator}."
    segment_count = len(segments) if segments is not None else None

    if file_type == "docx":
        raw_style = properties.get("style_name")
        style = str(raw_style) if raw_style is not None else None
        text = _text_or_segments_text(properties, object_type, segments, keys=("text",))
        target_path, new_node_id = docx_adapter.insert_paragraph(
            document_path,
            text,
            style_name=style,
            after_locator=_docx_after_locator(position),
            output_path=output_path,
        )
        locator = to_v2_locator(new_node_id, file_type="docx")
        if segments:
            # Replace the plain text just inserted with the styled fragments.
            target_path, locator, _ = docx_adapter.rewrite_paragraph_fragments(
                target_path,
                locator,
                segments,
                output_path=target_path,
            )
        return (
            locator,
            summary,
            {"text": text, "segments": segment_count, "style_name": style},
        )

    if file_type == "pptx" and object_type in {"text_shape", "textbox"}:
        text = _text_or_segments_text(properties, object_type, segments, keys=("text",))
        geometry = {
            name: _optional_int(properties.get(name))
            for name in ("left", "top", "width", "height")
        }
        if any(measure is None for measure in geometry.values()):
            raise InvalidArgumentsError(
                "PPTX text_shape creation requires left, top, width, and height."
            )
        target_path, locator = pptx_adapter.add_textbox(
            document_path,
            parent_locator,
            text,
            **geometry,
            output_path=output_path,
        )
        if segments:
            target_path, locator, _ = pptx_adapter.rewrite_paragraph_fragments(
                target_path,
                locator,
                segments,
                output_path=target_path,
            )
        return (locator, summary, {"text": text, "segments": segment_count})

    if file_type == "xlsx" and object_type == "cell":
        parts = parse_locator(
            to_v2_locator(parent_locator, file_type="xlsx")
        ).components
        if len(parts) != 3 or parts[:2] != ("xlsx", "sheet"):
            raise InvalidArgumentsError(
                "XLSX cell creation requires a worksheet parent locator."
            )
        coordinate = properties.get("coordinate")
        if not isinstance(coordinate, str) or not coordinate.strip():
            raise InvalidArgumentsError("XLSX cell creation requires a coordinate.")
        locator = f"xlsx:sheet:{parts[2]}!{coordinate.strip().upper()}"
        if segments:
            _, locator, _ = xlsx_adapter.write_cell_fragments(
                document_path,
                locator,
                segments,
                output_path=output_path,
            )
            text = "".join(fragment.text for fragment in segments)
        else:
            text = _required_string_property(properties, ("value", "text"), object_type)
            xlsx_adapter.write_node(
                document_path,
                to_legacy_locator(locator, file_type="xlsx"),
                text,
                output_path,
            )
        return (locator, summary, {"value": text, "segments": segment_count})

    raise InvalidArgumentsError(
        f"create_object is not supported for {file_type} {object_type}."
    )
3477
+
3478
+
3479
def _update_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    locator: str,
    properties: dict[str, Any],
    segments: tuple[InlineFragment, ...] | None,
    text_range: VisibleTextRange | None,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Replace the content of the object at ``locator`` and write the document.

    When ``segments`` is given, the format's fragment writer is used;
    otherwise the plain text is taken from ``properties`` (``text`` then
    ``value`` for DOCX/PPTX, ``value`` then ``text`` for everything else)
    and written through the adapter's ``write_node``.

    Returns:
        ``(summary, metadata)`` describing the update.

    Raises:
        InvalidArgumentsError: If ``text_range`` is given, if both
            ``segments`` and a text/value property are given, or if no
            content is supplied.
        TargetNotEditableError: If the target does not support updates.
    """
    payload = _object_resolver(file_type).get_object(document_path, locator)
    _require_capability(payload.capabilities, Capability.UPDATE, locator)
    if text_range is not None:
        raise InvalidArgumentsError(
            "update_object does not support range; use style_inline for partial formatting."
        )
    if segments is not None and ("text" in properties or "value" in properties):
        raise InvalidArgumentsError(
            "update_object accepts either properties.text/value or segments, not both."
        )

    if segments is not None:
        if file_type == "docx":
            _, _, snapshot = docx_adapter.rewrite_paragraph_fragments(
                document_path,
                locator,
                segments,
                output_path=output_path,
            )
            return (
                f"Updated {payload.object_type} {locator}.",
                {"text": snapshot.text, "segments": len(snapshot.fragments)},
            )
        if file_type == "pptx":
            _, rewritten_locator, snapshot = pptx_adapter.rewrite_paragraph_fragments(
                document_path,
                locator,
                segments,
                output_path=output_path,
            )
            # The PPTX summary reports the locator returned by the adapter.
            return (
                f"Updated {payload.object_type} {rewritten_locator}.",
                {"text": snapshot.text, "segments": len(snapshot.fragments)},
            )
        if file_type == "xlsx":
            _, _, snapshot = xlsx_adapter.write_cell_fragments(
                document_path,
                locator,
                segments,
                output_path=output_path,
            )
            return (
                f"Updated {payload.object_type} {locator}.",
                {"value": snapshot.text, "segments": len(snapshot.fragments)},
            )

    if file_type == "docx" or file_type == "pptx":
        content = _required_string_property(
            properties, ("text", "value"), payload.object_type
        )
        legacy_locator = to_legacy_locator(locator, file_type=file_type)
        adapter = docx_adapter if file_type == "docx" else pptx_adapter
        adapter.write_node(document_path, legacy_locator, content, output_path)
        return (f"Updated {payload.object_type} {locator}.", {"text": content})

    content = _required_string_property(
        properties, ("value", "text"), payload.object_type
    )
    legacy_locator = to_legacy_locator(locator, file_type="xlsx")
    xlsx_adapter.write_node(document_path, legacy_locator, content, output_path)
    return (f"Updated {payload.object_type} {locator}.", {"value": content})
3553
+
3554
+
3555
def _move_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    locator: str,
    new_parent_locator: str,
    position: object | None,
    output_path: Path,
) -> tuple[str, str, dict[str, Any]]:
    """Move an object to a new position and write the document.

    Only PPTX slides can be moved, and only within ``pptx:presentation``.

    Returns:
        ``(new_locator, summary, metadata)`` for the moved slide.

    Raises:
        InvalidArgumentsError: For any other object type or parent, or when
            ``position`` is not an integer position.
        TargetNotEditableError: If the target does not support moving.
    """
    payload = _object_resolver(file_type).get_object(document_path, locator)
    _require_capability(payload.capabilities, Capability.MOVE, locator)

    is_pptx_slide = file_type == "pptx" and payload.object_type == "slide"
    if not is_pptx_slide:
        raise InvalidArgumentsError(
            f"move_object is not supported for {payload.object_type}."
        )
    if new_parent_locator != "pptx:presentation":
        raise InvalidArgumentsError(
            "PPTX slides can only be moved within the presentation root."
        )

    source_number = _pptx_slide_number(locator)
    destination = _required_position(position)
    _move_pptx_slide(document_path, source_number, destination, output_path)

    details = {"previous_locator": locator, "new_parent_locator": new_parent_locator}
    summary = f"Moved slide {source_number} to position {destination}."
    return (f"pptx:slide:{destination}", summary, details)
3584
+
3585
+
3586
def _copy_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    locator: str,
    target_parent_locator: str,
    position: object | None,
    output_path: Path,
) -> tuple[str, str, dict[str, Any]]:
    """Copy an object to a target position and write the document.

    Only PPTX slides can be copied, and only within ``pptx:presentation``.

    Returns:
        ``(copy_locator, summary, metadata)`` for the new slide.

    Raises:
        InvalidArgumentsError: For any other object type or target parent.
        TargetNotEditableError: If the target does not support copying.
    """
    payload = _object_resolver(file_type).get_object(document_path, locator)
    _require_capability(payload.capabilities, Capability.COPY, locator)

    is_pptx_slide = file_type == "pptx" and payload.object_type == "slide"
    if not is_pptx_slide:
        raise InvalidArgumentsError(
            f"copy_object is not supported for {payload.object_type}."
        )
    if target_parent_locator != "pptx:presentation":
        raise InvalidArgumentsError(
            "PPTX slides can only be copied within the presentation root."
        )

    source_number = _pptx_slide_number(locator)
    landed_at = _copy_pptx_slide(document_path, source_number, position, output_path)

    details = {"source_locator": locator, "target_parent_locator": target_parent_locator}
    summary = f"Copied slide {source_number} to position {landed_at}."
    return (f"pptx:slide:{landed_at}", summary, details)
3616
+
3617
+
3618
def _delete_object_on_path(
    document_path: Path,
    file_type: FileType,
    *,
    locator: str,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Delete the object at ``locator`` using the format-specific helper.

    DOCX and PPTX have dedicated helpers; every other ``file_type`` is
    handled by the XLSX helper.

    Raises:
        TargetNotEditableError: If the target does not support deletion.
    """
    payload = _object_resolver(file_type).get_object(document_path, locator)
    _require_capability(payload.capabilities, Capability.DELETE, locator)

    if file_type == "docx":
        remover = _delete_docx_object
    elif file_type == "pptx":
        remover = _delete_pptx_object
    else:
        remover = _delete_xlsx_object
    return remover(document_path, locator, output_path)
3633
+
3634
+
3635
def _validate_create_operation(file_type: FileType, operation: dict[str, Any]) -> None:
    """Validate the arguments of a ``create_object`` operation.

    Checks that no ``range`` is supplied, that ``segments`` is not combined
    with a ``text``/``value`` property, and that the format-specific
    requirements hold: text for a DOCX paragraph, text plus integer
    ``left``/``top``/``width``/``height`` for a PPTX text shape, and a
    coordinate plus value (unless segments are given) for an XLSX cell.

    Raises:
        InvalidArgumentsError: If any check fails or the file type/object
            type combination is unsupported.
    """
    object_type = str(operation["object_type"])
    segments = _coerce_inline_fragments(operation.get("segments"))
    text_range = _coerce_visible_text_range(operation.get("range"))
    properties = dict(operation.get("properties", {}))

    if text_range is not None:
        raise InvalidArgumentsError("create_object does not accept range.")
    if segments is not None and ("text" in properties or "value" in properties):
        raise InvalidArgumentsError(
            "create_object accepts either text/value or segments, not both."
        )

    if file_type == "docx" and object_type == "paragraph":
        _text_or_segments_text(properties, object_type, segments, keys=("text",))
        _docx_after_locator(operation.get("position"))
        return

    if file_type == "pptx" and object_type in {"text_shape", "textbox"}:
        _text_or_segments_text(properties, object_type, segments, keys=("text",))
        geometry_keys = ("left", "top", "width", "height")
        if any(_optional_int(properties.get(key)) is None for key in geometry_keys):
            raise InvalidArgumentsError(
                "PPTX text_shape creation requires left, top, width, and height."
            )
        return

    if file_type == "xlsx" and object_type == "cell":
        coordinate = properties.get("coordinate")
        has_coordinate = isinstance(coordinate, str) and bool(coordinate.strip())
        if not has_coordinate:
            raise InvalidArgumentsError("XLSX cell creation requires a coordinate.")
        if segments is None:
            _required_string_property(properties, ("value", "text"), object_type)
        return

    raise InvalidArgumentsError(
        f"create_object does not support {file_type} {object_type}."
    )
3668
+
3669
+
3670
def _operation_name(operation: dict[str, Any]) -> str:
    """Return the validated operation name of a batch entry.

    The name is read from the ``operation`` key, falling back to ``op``
    when the former is missing or falsy.

    Raises:
        InvalidArgumentsError: If the name is not a string naming one of
            create_object, update_object, move_object or copy_object.
    """
    supported = (
        "create_object",
        "update_object",
        "move_object",
        "copy_object",
    )
    raw = operation.get("operation") or operation.get("op")
    if isinstance(raw, str) and raw in supported:
        return raw
    raise InvalidArgumentsError(f"Unsupported batch operation: {raw}")
3680
+
3681
+
3682
def _required_string_property(
    properties: dict[str, Any],
    keys: Sequence[str],
    object_type: str,
) -> str:
    """Return the first non-``None`` property among ``keys`` as a string.

    Args:
        properties: Property mapping supplied by the caller.
        keys: Candidate keys, tried in order.
        object_type: Object type named in the error message.

    Raises:
        InvalidArgumentsError: If every candidate key is missing or ``None``.
    """
    candidates = (properties.get(key) for key in keys)
    found = next((value for value in candidates if value is not None), None)
    if found is None:
        raise InvalidArgumentsError(
            f"{object_type} updates require one of: {', '.join(keys)}."
        )
    return str(found)
3695
+
3696
+
3697
def _text_or_segments_text(
    properties: dict[str, Any],
    object_type: str,
    segments: tuple[InlineFragment, ...] | None,
    *,
    keys: Sequence[str],
) -> str:
    """Return the plain text for an object from segments or properties.

    When ``segments`` is given, the fragments' texts are concatenated;
    otherwise the first non-``None`` property among ``keys`` is required.
    """
    if segments is None:
        return _required_string_property(properties, keys, object_type)
    pieces = [fragment.text for fragment in segments]
    return "".join(pieces)
3707
+
3708
+
3709
def _coerce_inline_fragments(
    value: object,
) -> tuple[InlineFragment, ...] | None:
    """Normalize a ``segments`` argument into a tuple of ``InlineFragment``.

    Args:
        value: ``None``, or a non-string sequence whose elements are
            ``InlineFragment`` instances or dicts with a non-empty ``text``
            and an optional ``style`` (``InlineStyle`` or dict of its
            fields).

    Returns:
        ``None`` when ``value`` is ``None``, otherwise the coerced fragments.

    Raises:
        InvalidArgumentsError: If ``value`` is not a sequence, is empty,
            contains an unsupported element, contains empty text, or
            carries a style that ``InlineStyle`` rejects.
    """
    if value is None:
        return None
    # Strings and byte strings are sequences too, but never valid segments.
    if not isinstance(value, Sequence) or isinstance(value, (str, bytes, bytearray)):
        raise InvalidArgumentsError("segments must be a sequence of inline fragments.")
    fragments: list[InlineFragment] = []
    for raw_fragment in value:
        if isinstance(raw_fragment, InlineFragment):
            fragment = raw_fragment
        elif isinstance(raw_fragment, dict):
            text = raw_fragment.get("text")
            if not isinstance(text, str) or not text:
                raise InvalidArgumentsError("segments must contain non-empty text.")
            raw_style = raw_fragment.get("style")
            if raw_style is None:
                style = InlineStyle()
            elif isinstance(raw_style, InlineStyle):
                style = raw_style
            elif isinstance(raw_style, dict):
                try:
                    style = InlineStyle(**raw_style)
                except InvalidArgumentsError:
                    raise
                except (TypeError, ValueError) as exc:
                    # Unknown or invalid style fields are a caller error and
                    # are reported like every other argument problem here.
                    raise InvalidArgumentsError(
                        "segment styles contain unsupported or invalid fields."
                    ) from exc
            else:
                raise InvalidArgumentsError("segment styles must be objects.")
            fragment = InlineFragment(text=text, style=style)
        else:
            raise InvalidArgumentsError(
                "segments must contain inline-fragment objects."
            )
        # Also guards ready-made InlineFragment instances with empty text.
        if not fragment.text:
            raise InvalidArgumentsError("segments must contain non-empty text.")
        fragments.append(fragment)
    if not fragments:
        raise InvalidArgumentsError("segments must not be empty.")
    return tuple(fragments)
3744
+
3745
+
3746
def _coerce_visible_text_range(
    value: object,
) -> VisibleTextRange | None:
    """Normalize a ``range`` argument into a ``VisibleTextRange``.

    Args:
        value: ``None``, a ``VisibleTextRange``, or a dict with ``start``
            and ``end`` entries convertible to ``int``.

    Returns:
        ``None`` when ``value`` is ``None``, otherwise the validated range.

    Raises:
        InvalidArgumentsError: If ``value`` has another type, lacks usable
            offsets, has a negative start, or has ``end <= start``.
    """
    if value is None:
        return None

    if isinstance(value, VisibleTextRange):
        parsed = value
    elif not isinstance(value, dict):
        raise InvalidArgumentsError(
            "range must be an object with start and end offsets."
        )
    else:
        try:
            parsed = VisibleTextRange(start=int(value["start"]), end=int(value["end"]))
        except (KeyError, TypeError, ValueError) as exc:
            raise InvalidArgumentsError(
                "range must contain integer start and end offsets."
            ) from exc

    if parsed.start >= 0 and parsed.end > parsed.start:
        return parsed
    raise InvalidArgumentsError(
        "range must use non-negative offsets with end > start."
    )
3769
+
3770
+
3771
def _docx_after_locator(position: object | None) -> str | None:
    """Translate a DOCX ``position`` argument into a legacy "after" locator.

    Args:
        position: ``None``, a locator string, or a dict carrying the
            locator under ``after`` or ``after_locator``.

    Returns:
        ``None`` when no position was given, otherwise the locator
        converted with ``to_legacy_locator``.

    Raises:
        InvalidArgumentsError: If ``position`` has any other shape.
    """
    if position is None:
        return None
    if isinstance(position, str):
        return to_legacy_locator(position, file_type="docx")
    if isinstance(position, dict):
        candidates = (position.get(key) for key in ("after", "after_locator"))
        chosen = next((entry for entry in candidates if entry is not None), None)
        if chosen is not None:
            return to_legacy_locator(str(chosen), file_type="docx")
    raise InvalidArgumentsError("DOCX create_object position must be an after locator.")
3782
+
3783
+
3784
def _required_position(position: object | None) -> int:
    """Extract the integer position required by move/copy operations.

    Args:
        position: An ``int``, or a dict carrying an ``int`` under ``index``
            or ``position`` (tried in that order).

    Returns:
        The integer position.

    Raises:
        InvalidArgumentsError: If no integer position can be found.
            Booleans are rejected even though ``bool`` subclasses ``int``,
            so that ``True`` is not silently treated as position 1.
    """

    def _is_plain_int(candidate: object) -> bool:
        return isinstance(candidate, int) and not isinstance(candidate, bool)

    if _is_plain_int(position):
        return position  # type: ignore[return-value]
    if isinstance(position, dict):
        for key in ("index", "position"):
            value = position.get(key)
            if _is_plain_int(value):
                return value
    raise InvalidArgumentsError("Move/copy operations require an integer position.")
3793
+
3794
+
3795
def _pptx_slide_number(locator: str) -> int:
    """Return the slide number encoded in a PPTX slide locator.

    The locator is first converted with ``to_v2_locator`` and must then
    have the form ``pptx:slide:<number>``.

    Raises:
        InvalidArgumentsError: If the locator is not a slide locator or its
            last component is not an integer.
    """
    canonical = to_v2_locator(locator, file_type="pptx")
    parts = canonical.split(":")
    if len(parts) != 3 or parts[:2] != ["pptx", "slide"]:
        raise InvalidArgumentsError(f"Unsupported PPTX slide locator: {locator}")
    try:
        return int(parts[2])
    except ValueError as exc:
        # A non-numeric slide component is a bad locator, not an internal error.
        raise InvalidArgumentsError(
            f"Unsupported PPTX slide locator: {locator}"
        ) from exc
3801
+
3802
+
3803
def _move_pptx_slide(
    document_path: Path,
    slide_number: int,
    new_position: int,
    output_path: Path,
) -> None:
    """Move one slide to a new 1-based position and save the presentation.

    Args:
        document_path: Presentation to open.
        slide_number: 1-based number of the slide to move.
        new_position: 1-based position the slide should end up at.
        output_path: Where the reordered presentation is saved.

    Raises:
        TargetNotFoundError: If ``slide_number`` is outside the deck.
        InvalidArgumentsError: If ``new_position`` is outside the deck.
    """
    presentation = pptx_adapter._open_presentation(document_path)
    total = len(presentation.slides)
    if not 1 <= slide_number <= total:
        raise TargetNotFoundError(
            f"Slide {slide_number} does not exist in the presentation."
        )
    # The shared helper validates new_position and reorders the slide ids.
    _move_pptx_slide_in_memory(presentation, slide_number, new_position)
    presentation.save(output_path)
3823
+
3824
+
3825
def _copy_pptx_slide(
    document_path: Path,
    slide_number: int,
    position: object | None,
    output_path: Path,
) -> int:
    """Duplicate a slide, place the copy, and save the presentation.

    A new slide is added with the source slide's layout, its initial shapes
    are removed, and the source shapes are deep-copied into it.
    Relationships of the source slide part (except notes and layout) are
    recreated on the copy and shape references are retargeted. Notes text
    is copied only when the source slide already has a notes slide.

    Args:
        document_path: Presentation to open.
        slide_number: 1-based number of the slide to copy.
        position: Desired 1-based position of the copy; ``None`` leaves it
            at the end.
        output_path: Where the modified presentation is saved.

    Returns:
        The 1-based position of the copied slide.

    Raises:
        InvalidArgumentsError: If ``position`` is not an integer position
            within the deck.
    """
    presentation = pptx_adapter._open_presentation(document_path)
    source_slide = pptx_adapter._resolve_slide(presentation, slide_number)
    new_slide = presentation.slides.add_slide(source_slide.slide_layout)

    # Drop whatever shapes the layout put on the fresh slide.
    for placeholder_shape in list(new_slide.shapes):
        placeholder_shape.element.getparent().remove(placeholder_shape.element)

    for shape in source_slide.shapes:
        new_slide.shapes._spTree.insert_element_before(
            deepcopy(shape.element), "p:extLst"
        )

    # NOTE(review): relationship ids are rewritten one at a time; if a new id
    # equals a source id that is processed later, it may be rewritten again.
    # Confirm against _retarget_shape_relationships.
    for rel in source_slide.part.rels.values():
        if rel.reltype.endswith("/notesSlide") or rel.reltype.endswith("/slideLayout"):
            continue
        if rel.is_external:
            new_rid = new_slide.part.relate_to(
                rel.target_ref, rel.reltype, is_external=True
            )
        else:
            new_rid = new_slide.part.relate_to(rel.target_part, rel.reltype)
        _retarget_shape_relationships(new_slide, rel.rId, new_rid)

    # Reading ``notes_slide`` creates a notes slide when none exists, so probe
    # with ``has_notes_slide`` to avoid adding empty notes to source and copy.
    if getattr(source_slide, "has_notes_slide", False):
        source_notes = getattr(source_slide.notes_slide, "notes_text_frame", None)
        if source_notes is not None:
            target_notes = getattr(new_slide.notes_slide, "notes_text_frame", None)
            if target_notes is not None:
                target_notes.text = source_notes.text

    copied_position = (
        len(presentation.slides) if position is None else _required_position(position)
    )
    # The copy was appended, so it currently is the last slide.
    _move_pptx_slide_in_memory(presentation, len(presentation.slides), copied_position)
    presentation.save(output_path)
    return copied_position
3866
+
3867
+
3868
def _move_pptx_slide_in_memory(
    presentation, slide_number: int, new_position: int
) -> None:
    """Reorder a slide inside an already-open presentation without saving.

    Args:
        presentation: Open presentation whose ``slides._sldIdLst`` is edited.
        slide_number: 1-based number of the slide to move.
        new_position: 1-based position the slide should end up at.

    Raises:
        TargetNotFoundError: If ``slide_number`` is outside ``1..slide count``.
        InvalidArgumentsError: If ``new_position`` is outside ``1..slide count``.
    """
    slide_count = len(presentation.slides)
    # Without this check, slide_number == 0 would index [-1] and silently move
    # the last slide, and a too-large value would surface as a bare IndexError.
    if not 1 <= slide_number <= slide_count:
        raise TargetNotFoundError(
            f"Slide {slide_number} does not exist in the presentation."
        )
    if not 1 <= new_position <= slide_count:
        raise InvalidArgumentsError(f"Invalid target slide position: {new_position}")
    sld_id_list = presentation.slides._sldIdLst
    slide_id = sld_id_list.sldId_lst[slide_number - 1]
    sld_id_list.remove(slide_id)
    sld_id_list.insert(new_position - 1, slide_id)
3878
+
3879
+
3880
def _retarget_shape_relationships(slide, source_rid: str, target_rid: str) -> None:
    """Rewrite attribute values equal to ``source_rid`` to ``target_rid``.

    Every element under every shape of ``slide`` is visited, and any attribute
    (regardless of its name) whose value equals ``source_rid`` is replaced.
    """
    for shape in slide.shapes:
        for node in shape.element.iter():
            # Collect the names first so the attribute map is not modified
            # while it is being iterated.
            matching_names = [
                name for name, value in node.attrib.items() if value == source_rid
            ]
            for name in matching_names:
                node.set(name, target_rid)
3886
+
3887
+
3888
def _delete_docx_object(
    document_path: Path,
    locator: str,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Delete a paragraph or a whole table from a DOCX file and save it.

    Returns:
        A ``(message, details)`` pair; ``details`` carries the original
        locator under the ``"locator"`` key.

    Raises:
        InvalidArgumentsError: If the locator addresses neither a paragraph
            nor a table.
    """
    canonical = to_v2_locator(locator, file_type="docx")
    targets_paragraph = canonical.startswith("docx:para:")
    # NOTE(review): any ``docx:table:`` locator without ``:row:`` is treated
    # as a whole-table delete — confirm cell-level locators cannot reach here.
    targets_table = canonical.startswith("docx:table:") and ":row:" not in canonical
    if not (targets_paragraph or targets_table):
        raise InvalidArgumentsError(f"delete_object is not supported for {locator}.")

    document = docx_adapter._open_document(document_path)
    if targets_paragraph:
        paragraph = docx_adapter._resolve_paragraph(
            document, to_legacy_locator(canonical, file_type="docx")
        )
        paragraph_xml = paragraph._element
        paragraph_xml.getparent().remove(paragraph_xml)
        summary = f"Deleted paragraph {locator}."
    else:
        table_index = int(canonical.split(":")[2])
        # The table is reached by resolving its first cell.
        resolved = docx_adapter._resolve_locator(
            document, f"table:{table_index}:cell:0:0"
        )
        table_xml = resolved.table._element
        table_xml.getparent().remove(table_xml)
        summary = f"Deleted table {locator}."

    document.save(output_path)
    return (summary, {"locator": locator})
3913
+
3914
+
3915
def _delete_pptx_object(
    document_path: Path,
    locator: str,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Delete a slide or a single shape from a PPTX file and save it.

    A three-component ``pptx:slide:N`` locator removes the slide; a
    five-component ``pptx:slide:...`` locator removes the shape it resolves to.

    Returns:
        A ``(message, details)`` pair. ``details`` always contains
        ``"locator"``; shape deletions also include ``"slide_number"``.

    Raises:
        TargetNotFoundError: If the slide number is outside ``1..slide count``.
        InvalidArgumentsError: If the locator has any other shape.
    """
    canonical = to_v2_locator(locator, file_type="pptx")
    presentation = pptx_adapter._open_presentation(document_path)
    parts = canonical.split(":")

    if canonical.startswith("pptx:slide:") and len(parts) == 3:
        slide_number = int(parts[2])
        # Without this check, slide 0 would index [-1] and delete the last
        # slide, and a too-large number would surface as a bare IndexError.
        if not 1 <= slide_number <= len(presentation.slides):
            raise TargetNotFoundError(
                f"Slide {slide_number} does not exist in the presentation."
            )
        slide_id = presentation.slides._sldIdLst.sldId_lst[slide_number - 1]
        # Drop the relationship to the slide part as well as the id entry.
        presentation.part.drop_rel(slide_id.rId)
        presentation.slides._sldIdLst.remove(slide_id)
        presentation.save(output_path)
        return (f"Deleted slide {locator}.", {"locator": locator})

    if len(parts) == 5 and parts[:2] == ["pptx", "slide"]:
        shape = pptx_adapter._resolve_shape(
            presentation, to_legacy_locator(canonical, file_type="pptx")
        )
        shape.element.getparent().remove(shape.element)
        presentation.save(output_path)
        return (
            f"Deleted {parts[3]} {locator}.",
            {"locator": locator, "slide_number": int(parts[2])},
        )

    raise InvalidArgumentsError(f"delete_object is not supported for {locator}.")
3944
+
3945
+
3946
def _delete_xlsx_object(
    document_path: Path,
    locator: str,
    output_path: Path,
) -> tuple[str, dict[str, Any]]:
    """Delete a worksheet, row, or column, or clear a cell/range, then save.

    Supported canonical locator shapes (components from ``parse_locator``):

    * ``("xlsx", "sheet", name)`` removes the worksheet, unless it is the
      only one left.
    * ``("xlsx", "sheet", name, "row", n)`` deletes one row.
    * ``("xlsx", "sheet", name, "col", n)`` deletes one column.
    * ``("xlsx", "sheet", name, coordinate)`` sets the cell, or every cell of
      a ``A1:B2``-style range, to ``None``.

    Returns:
        A ``(message, details)`` pair; ``details`` carries the original
        locator under the ``"locator"`` key.

    Raises:
        TargetNotEditableError: For the workbook locator, or when asked to
            delete the only worksheet.
        TargetNotFoundError: If the named worksheet is not in the workbook.
        InvalidArgumentsError: If the locator has any other shape.
    """
    canonical = to_v2_locator(locator, file_type="xlsx")
    workbook = xlsx_adapter._open_workbook(document_path)
    parts = parse_locator(canonical).components

    def _worksheet(sheet_name):
        # Item access on an openpyxl-style workbook raises KeyError for an
        # unknown title; report that as a missing target instead of leaking
        # the raw KeyError to callers.
        try:
            return workbook[sheet_name]
        except KeyError as exc:
            raise TargetNotFoundError(
                f"Worksheet {sheet_name} does not exist in the workbook."
            ) from exc

    if parts == ("xlsx", "workbook"):
        raise TargetNotEditableError(f"{locator} does not support delete.")

    if len(parts) == 3 and parts[:2] == ("xlsx", "sheet"):
        worksheet = _worksheet(parts[2])
        # Refuse to remove the last remaining worksheet.
        if len(workbook.worksheets) <= 1:
            raise TargetNotEditableError(f"{locator} does not support delete.")
        workbook.remove(worksheet)
        workbook.save(output_path)
        return (f"Deleted worksheet {locator}.", {"locator": locator})

    if len(parts) == 5 and parts[:2] == ("xlsx", "sheet") and parts[3] == "row":
        worksheet = _worksheet(parts[2])
        worksheet.delete_rows(int(parts[4]), 1)
        workbook.save(output_path)
        return (f"Deleted row {locator}.", {"locator": locator})

    if len(parts) == 5 and parts[:2] == ("xlsx", "sheet") and parts[3] == "col":
        worksheet = _worksheet(parts[2])
        worksheet.delete_cols(int(parts[4]), 1)
        workbook.save(output_path)
        return (f"Deleted column {locator}.", {"locator": locator})

    if len(parts) == 4 and parts[:2] == ("xlsx", "sheet"):
        worksheet = _worksheet(parts[2])
        coordinate = parts[3]
        if ":" in coordinate:
            # Range coordinates yield rows of cells; clear each value.
            for row in worksheet[coordinate]:
                for cell in row:
                    cell.value = None
            workbook.save(output_path)
            return (f"Cleared range {locator}.", {"locator": locator})
        worksheet[coordinate].value = None
        workbook.save(output_path)
        return (f"Cleared cell {locator}.", {"locator": locator})

    raise InvalidArgumentsError(f"delete_object is not supported for {locator}.")
3992
+
3993
+
3994
def _build_xlsx_row_embedding_records(
    document_id: str,
    row_embeddings: Sequence[XlsxRowEmbedding],
    blobs: Sequence[bytes],
) -> list[
    tuple[
        str,
        str,
        int,
        str,
        str,
        str,
        bytes,
        list[tuple[str, str, int, bool]],
    ]
]:
    """Pair each row embedding with its blob and flatten it into a record.

    Each record is the tuple ``(embedding_id, sheet_name, row_number,
    representative_storage_id, text, preview, blob, contributing_cells)``.
    ``contributing_cells`` lists ``(storage_id, coordinate, ordinal,
    is_representative)`` per cell, with ordinals starting at 1.

    Raises:
        ValueError: If ``row_embeddings`` and ``blobs`` differ in length
            (``zip`` is called with ``strict=True``).
    """
    records = []
    for row, blob in zip(row_embeddings, blobs, strict=True):
        representative_item_id = row.representative_item_id
        embedding_id = store.make_xlsx_row_embedding_id(
            document_id, row.sheet_name, row.row_number
        )
        representative_storage_id = store.make_storage_id(
            document_id, representative_item_id
        )

        contributing_cells = []
        for ordinal, cell in enumerate(row.contributing_cells, start=1):
            contributing_cells.append(
                (
                    store.make_storage_id(document_id, cell.item_id),
                    cell.coordinate,
                    ordinal,
                    # Flags the cell that represents the whole row.
                    cell.item_id == representative_item_id,
                )
            )

        record = (
            embedding_id,
            row.sheet_name,
            row.row_number,
            representative_storage_id,
            row.text,
            row.preview,
            blob,
            contributing_cells,
        )
        records.append(record)
    return records
4054
+
4055
+
4056
def _raise_if_pptx_target_not_editable(document_path: Path, item_id: str) -> None:
    """Propagate ``TargetNotEditableError`` from resolving a PPTX shape.

    ``TargetNotFoundError`` and ``InvalidArgumentsError`` raised while
    resolving the shape are swallowed; any other exception propagates.
    """
    try:
        pptx_adapter.resolve_shape(document_path, item_id)
    except (TargetNotFoundError, InvalidArgumentsError) as exc:
        # The not-editable error must win even if it happens to derive from
        # one of the swallowed types.
        if isinstance(exc, pptx_adapter.TargetNotEditableError):
            raise
        return
4063
+
4064
+
4065
def _check_import(module_name: str, label: str) -> DoctorCheck:
    """Report whether ``module_name`` can be imported.

    Args:
        module_name: Dotted name passed to ``importlib.import_module``.
        label: Label used for the resulting check.

    Returns:
        A passing check when the import succeeds, otherwise a failing check
        whose message includes the import error.
    """
    try:
        importlib.import_module(module_name)
    except ImportError as exc:
        # ImportError is the parent of ModuleNotFoundError, so this also
        # covers modules that are present but fail to load (for example a
        # broken compiled extension) instead of letting them abort the run.
        return DoctorCheck(label, False, f"Import failed: {exc}")
    return DoctorCheck(label, True, "Import succeeded.")
4071
+
4072
+
4073
def _check_sqlite_module() -> DoctorCheck:
    """Report whether an in-memory SQLite connection can be opened and closed."""
    label = "SQLite"
    try:
        probe = sqlite3.connect(":memory:")
        probe.close()
    except sqlite3.Error as failure:
        return DoctorCheck(label, False, f"Connection failed: {failure}")
    return DoctorCheck(label, True, "Connection succeeded.")
4079
+
4080
+
4081
def _check_fts5_support() -> DoctorCheck:
    """Report whether ``store.supports_fts5`` accepts an in-memory connection."""
    label = "SQLite FTS5"
    connection = sqlite3.connect(":memory:")
    try:
        fts5_available = store.supports_fts5(connection)
    finally:
        # Close the probe connection whether or not the capability call raised.
        connection.close()

    if fts5_available:
        return DoctorCheck(label, True, "FTS5 virtual tables are available.")
    return DoctorCheck(label, False, "FTS5 virtual tables are unavailable.")
4091
+
4092
+
4093
def _check_embedding_provider_import() -> DoctorCheck:
    """Report whether ``offagent.adapters.embedding_provider`` imports cleanly.

    Any exception raised during the import is reported as a failed check.
    """
    label = "Embedding Provider"
    try:
        importlib.import_module("offagent.adapters.embedding_provider")
    except Exception as failure:
        return DoctorCheck(label, False, f"Import failed: {failure}")
    else:
        return DoctorCheck(label, True, "Import succeeded.")
4099
+
4100
+
4101
def _check_embedding_model(
    model_name: str,
    dimensions: int,
    *,
    provider_factory: Callable[[str, int | None], embedding_provider.EmbeddingProvider]
    | None = None,
) -> DoctorCheck:
    """Report whether an embedding provider can be constructed.

    Args:
        model_name: Model name handed to the factory.
        dimensions: Dimension count handed to the factory.
        provider_factory: Optional callable ``(model_name, dimensions)`` that
            builds the provider; when omitted (or falsy) a
            ``LocalEmbeddingProvider`` is built.

    Returns:
        A passing check naming the loaded model and its dimensions, or a
        failing check carrying the error raised while building the provider.
    """

    def _default_factory(selected_model, selected_dimensions):
        return embedding_provider.LocalEmbeddingProvider(
            model_name=selected_model,
            dimensions=selected_dimensions,
        )

    label = "Embedding Model"
    try:
        build_provider = provider_factory if provider_factory else _default_factory
        provider = build_provider(model_name, dimensions)
    except Exception as failure:
        return DoctorCheck(label, False, f"Model load failed: {failure}")

    summary = f"Loaded {provider.model_name} with {provider.dimensions} dimensions."
    return DoctorCheck(label, True, summary)
4125
+
4126
+
4127
def _check_embedding_store(
    index_path: Path, model_name: str, dimensions: int
) -> DoctorCheck:
    """Report whether the embedding store at ``index_path`` is usable.

    The store is opened with ``store.ensure_ready``. If it holds no embedding
    metadata the check passes as-is; otherwise ``store.ensure_embedding_meta``
    is called with the given model name and dimensions, and any exception is
    reported as a metadata failure. The connection is always closed.
    """
    label = "Embedding Store"
    try:
        connection = store.ensure_ready(index_path)
    except (OSError, sqlite3.Error, store.StoreCapabilityError) as failure:
        return DoctorCheck(label, False, f"Store check failed: {failure}")

    try:
        if store.fetch_embedding_meta(connection):
            store.ensure_embedding_meta(
                connection,
                model_name=model_name,
                dimensions=dimensions,
            )
            outcome = DoctorCheck(
                label, True, "Embedding tables and metadata are consistent."
            )
        else:
            outcome = DoctorCheck(label, True, "Embedding sidecar tables are ready.")
    except Exception as failure:
        outcome = DoctorCheck(label, False, f"Metadata check failed: {failure}")
    finally:
        connection.close()
    return outcome
4153
+
4154
+
4155
def _check_index_path(index_path: Path) -> DoctorCheck:
    """Report whether ``store.ensure_ready`` succeeds for ``index_path``."""
    label = "Index Path"
    try:
        connection = store.ensure_ready(index_path)
    except (OSError, sqlite3.Error, store.StoreCapabilityError) as failure:
        return DoctorCheck(label, False, f"Schema bootstrap failed: {failure}")
    # Only readiness is being probed; release the connection straight away.
    connection.close()
    return DoctorCheck(label, True, f"Schema ready at {index_path}.")
4163
+
4164
+
4165
def _check_document_roots(roots: Sequence[Path]) -> list[DoctorCheck]:
    """Return one check per document root, or a single pass when none are set.

    A root passes only if it exists, is a directory, and is readable; the
    failure message names the first of those conditions that does not hold.
    """
    if not roots:
        return [DoctorCheck("Document Roots", True, "No document roots configured.")]

    def _assess(root):
        # Conditions are tested from most to least fundamental.
        if not root.exists():
            return False, "Path does not exist."
        if not root.is_dir():
            return False, "Path is not a directory."
        if not os.access(root, os.R_OK):
            return False, "Directory is not readable."
        return True, "Readable directory."

    return [DoctorCheck(f"Document Root {root}", *_assess(root)) for root in roots]
4190
+
4191
+
4192
def _check_allowed_roots(roots: Sequence[Path]) -> list[DoctorCheck]:
    """Check the allowed roots for read access after normalising them.

    Returns a single passing check when no allowed roots are configured.
    """
    if roots:
        normalized = normalize_roots(roots)
        return _check_resolved_roots("Allowed Root", normalized, require_writable=False)
    return [DoctorCheck("Allowed Roots", True, "No allowed-root policy configured.")]
4200
+
4201
+
4202
def _check_output_roots(roots: Sequence[Path]) -> list[DoctorCheck]:
    """Check the output roots for write access after normalising them.

    Returns a single passing check when no output roots are configured.
    """
    if roots:
        normalized = normalize_roots(roots)
        return _check_resolved_roots("Output Root", normalized, require_writable=True)
    return [DoctorCheck("Output Roots", True, "No output-root policy configured.")]
4208
+
4209
+
4210
def _check_resolved_roots(
    label: str,
    roots: Sequence[Path],
    *,
    require_writable: bool,
) -> list[DoctorCheck]:
    """Return one usability check per policy root.

    An existing root must be a directory with the required access (write when
    ``require_writable`` is true, read otherwise). A root that does not exist
    yet passes if its nearest existing ancestor has that access.

    Args:
        label: Prefix for each check's label, followed by the root path.
        roots: Roots to examine.
        require_writable: Whether write access, rather than read access, is
            required.
    """
    access_mode = os.W_OK if require_writable else os.R_OK
    denied_message = (
        "Directory is not writable." if require_writable else "Directory is not readable."
    )

    def _assess(root):
        if root.exists():
            if not root.is_dir():
                return False, "Path is not a directory."
            if not os.access(root, access_mode):
                return False, denied_message
            return True, "Policy root is usable."

        # The root is missing: judge it by the closest ancestor that exists.
        existing_parent = _nearest_existing_parent(root)
        if existing_parent is not None and os.access(existing_parent, access_mode):
            return True, f"Parent path {existing_parent} is accessible."
        return (
            False,
            "Path does not exist and no accessible parent directory was found.",
        )

    return [DoctorCheck(f"{label} {root}", *_assess(root)) for root in roots]
4258
+
4259
+
4260
def _nearest_existing_parent(path: Path) -> Path | None:
    """Return ``path`` or its closest ancestor that exists on disk.

    Returns ``None`` when neither ``path`` nor any of its ancestors exists.
    """
    # ``Path.parents`` walks upward and stops at the anchor (or ``.`` for a
    # relative path), which is where a manual parent-of-parent loop also ends.
    for candidate in (path, *path.parents):
        if candidate.exists():
            return candidate
    return None