offagent 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. offagent/__init__.py +3 -0
  2. offagent/__main__.py +5 -0
  3. offagent/adapters/__init__.py +1 -0
  4. offagent/adapters/docx_adapter.py +1237 -0
  5. offagent/adapters/embedding_provider.py +132 -0
  6. offagent/adapters/pptx_adapter.py +940 -0
  7. offagent/adapters/xlsx_adapter.py +1266 -0
  8. offagent/app/__init__.py +1 -0
  9. offagent/app/progress.py +52 -0
  10. offagent/app/services.py +4267 -0
  11. offagent/config.py +287 -0
  12. offagent/domain/__init__.py +1 -0
  13. offagent/domain/locators.py +444 -0
  14. offagent/domain/models.py +477 -0
  15. offagent/domain/text_fragments.py +136 -0
  16. offagent/errors.py +29 -0
  17. offagent/indexing/__init__.py +1 -0
  18. offagent/indexing/store.py +795 -0
  19. offagent/interfaces/__init__.py +1 -0
  20. offagent/interfaces/cli.py +438 -0
  21. offagent/interfaces/cli_output.py +139 -0
  22. offagent/interfaces/cli_progress.py +120 -0
  23. offagent/interfaces/mcp.py +1145 -0
  24. offagent/interfaces/mcp_converters.py +80 -0
  25. offagent/interfaces/mcp_models.py +923 -0
  26. offagent/objects/__init__.py +3 -0
  27. offagent/objects/base.py +26 -0
  28. offagent/objects/docx_objects.py +951 -0
  29. offagent/objects/pptx_objects.py +895 -0
  30. offagent/objects/xlsx_objects.py +962 -0
  31. offagent/path_policy.py +42 -0
  32. offagent/storage/__init__.py +1 -0
  33. offagent/storage/versioning.py +31 -0
  34. offagent-0.10.0.dist-info/METADATA +546 -0
  35. offagent-0.10.0.dist-info/RECORD +39 -0
  36. offagent-0.10.0.dist-info/WHEEL +5 -0
  37. offagent-0.10.0.dist-info/entry_points.txt +2 -0
  38. offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
  39. offagent-0.10.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,444 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Literal
5
+
6
+ from offagent.domain.models import FileType
7
+
8
# The two kinds of locator that parse_locator distinguishes.
LocatorType = Literal["direct", "search"]

# Prefixes that name the file format explicitly.
FORMAT_PREFIXES = ("docx:", "pptx:", "xlsx:")

# Every prefix that marks a locator as "direct" (matched against the
# lower-cased input): the word forms, the short colon forms, and finally the
# format-qualified forms.
DIRECT_PREFIXES = (
    "paragraph ",
    "slide ",
    "sheet ",
    "para:",
    "slide:",
    "sheet:",
    "table:",
) + FORMAT_PREFIXES
23
+
24
+
25
@dataclass(frozen=True)
class LocatorParseResult:
    """Immutable outcome of ``parse_locator``.

    For "search" locators ``parse_locator`` leaves ``target_hint``,
    ``file_type`` and ``canonical_locator`` as ``None`` and ``components``
    empty; for "direct" locators they are taken from the parsed locator.
    """

    # Locator text after surrounding whitespace was stripped.
    raw: str
    # "direct" or "search".
    locator_type: LocatorType
    # Kind of element addressed (e.g. "paragraph", "slide"), when known.
    target_hint: str | None
    # Whitespace-separated pieces of the locator text.
    tokens: tuple[str, ...]
    # parse_locator always sets this to False.
    resolved: bool = False
    # File format the locator belongs to, when known.
    file_type: FileType | None = None
    # Format-qualified locator string, when known.
    canonical_locator: str | None = None
    # Structural pieces of the canonical locator.
    components: tuple[str, ...] = ()
35
+
36
+
37
+ @dataclass(frozen=True)
38
+ class _ParsedDirectLocator:
39
+ file_type: FileType
40
+ target_hint: str | None
41
+ canonical_locator: str
42
+ components: tuple[str, ...]
43
+
44
+
45
def parse_locator(raw: str) -> LocatorParseResult:
    """Classify *raw* as a direct or a search locator and parse it.

    Surrounding whitespace is stripped first. Text whose lower-cased form
    starts with one of ``DIRECT_PREFIXES`` is handed to
    ``_parse_direct_locator``; anything else becomes a "search" locator whose
    tokens are its whitespace-separated words.

    Raises:
        ValueError: if the locator is empty after stripping, or if the direct
            parser rejects it.
    """
    text = raw.strip()
    if not text:
        raise ValueError("Locator cannot be empty.")

    if not text.lower().startswith(DIRECT_PREFIXES):
        # Free text: nothing structural is known about it.
        return LocatorParseResult(
            raw=text,
            locator_type="search",
            target_hint=None,
            tokens=tuple(text.split()),
            resolved=False,
            file_type=None,
            canonical_locator=None,
            components=(),
        )

    direct = _parse_direct_locator(text)
    return LocatorParseResult(
        raw=text,
        locator_type="direct",
        target_hint=direct.target_hint,
        tokens=_tokenize_legacy_compatible(text),
        resolved=False,
        file_type=direct.file_type,
        canonical_locator=direct.canonical_locator,
        components=direct.components,
    )
74
+
75
+
76
def to_v2_locator(raw: str, *, file_type: FileType | None = None) -> str:
    """Return the canonical, format-qualified form of a direct locator.

    Args:
        raw: Locator text in any form accepted by ``parse_locator``.
        file_type: When given, the locator must belong to this file type.

    Returns:
        The ``canonical_locator`` reported by ``parse_locator``.

    Raises:
        ValueError: if *raw* is not a direct locator, belongs to a different
            file type than *file_type*, or has no canonical form.
    """
    parsed = parse_locator(raw)
    if parsed.locator_type != "direct":
        raise ValueError(f"Unsupported direct locator: {raw}")
    if file_type is not None and parsed.file_type != file_type:
        raise ValueError(f"Locator {raw!r} does not belong to file type {file_type}.")
    if parsed.canonical_locator is None:
        # Checked explicitly rather than with `assert`, which is removed
        # under `python -O` and would let None escape as the return value.
        raise ValueError(f"Unsupported direct locator: {raw}")
    return parsed.canonical_locator
84
+
85
+
86
def to_legacy_locator(raw: str, *, file_type: FileType | None = None) -> str:
    """Return the short legacy form of a direct locator.

    Args:
        raw: Locator text in any form accepted by ``parse_locator``.
        file_type: When given, the locator must belong to this file type.

    Raises:
        ValueError: if *raw* is not a direct locator, belongs to a different
            file type than *file_type*, or has no legacy equivalent.
    """
    result = parse_locator(raw)
    is_direct = result.locator_type == "direct"
    if not is_direct:
        raise ValueError(f"Unsupported direct locator: {raw}")

    detected = result.file_type
    if file_type is not None and detected != file_type:
        raise ValueError(f"Locator {raw!r} does not belong to file type {file_type}.")
    if detected is None:
        raise ValueError(f"Unsupported direct locator: {raw}")

    return _to_legacy_from_components(detected, result.components, raw)
95
+
96
+
97
def make_docx_v2_locator(locator: str) -> str:
    """Return the canonical form of a DOCX locator.

    Delegates to ``_to_format_locator``, which raises ``ValueError`` unless
    *locator* is a direct locator for the "docx" file type.
    """
    canonical = _to_format_locator(locator, expected="docx")
    return canonical
99
+
100
+
101
def make_pptx_v2_locator(locator: str) -> str:
    """Return the canonical form of a PPTX locator.

    Delegates to ``_to_format_locator``, which raises ``ValueError`` unless
    *locator* is a direct locator for the "pptx" file type.
    """
    canonical = _to_format_locator(locator, expected="pptx")
    return canonical
103
+
104
+
105
def make_xlsx_v2_locator(locator: str) -> str:
    """Return the canonical form of an XLSX locator.

    Delegates to ``_to_format_locator``, which raises ``ValueError`` unless
    *locator* is a direct locator for the "xlsx" file type.
    """
    canonical = _to_format_locator(locator, expected="xlsx")
    return canonical
107
+
108
+
109
def _to_format_locator(locator: str, *, expected: FileType) -> str:
    """Return the canonical form of *locator*, requiring file type *expected*.

    Raises:
        ValueError: if *locator* is not a direct locator, belongs to another
            file type, or has no canonical form.
    """
    parsed = parse_locator(locator)
    if parsed.locator_type != "direct" or parsed.file_type != expected:
        raise ValueError(f"Unsupported {expected} locator: {locator}")
    if parsed.canonical_locator is None:
        # Checked explicitly rather than with `assert`, which is removed
        # under `python -O` and would let None escape as the return value.
        raise ValueError(f"Unsupported {expected} locator: {locator}")
    return parsed.canonical_locator
115
+
116
+
117
+ def _to_legacy_from_components(
118
+ file_type: FileType, components: tuple[str, ...], raw: str
119
+ ) -> str:
120
+ if file_type == "docx":
121
+ if len(components) == 3 and components[:2] == ("docx", "para"):
122
+ return f"para:{components[2]}"
123
+ if (
124
+ len(components) == 7
125
+ and components[:2] == ("docx", "table")
126
+ and components[3] == "row"
127
+ and components[5] == "cell"
128
+ ):
129
+ return f"table:{components[2]}:cell:{components[4]}:{components[6]}"
130
+ raise ValueError(f"Unsupported docx locator for legacy conversion: {raw}")
131
+
132
+ if file_type == "pptx":
133
+ if len(components) == 3 and components[:2] == ("pptx", "slide"):
134
+ return f"slide:{components[2]}"
135
+ if len(components) == 5 and components[:2] == ("pptx", "slide"):
136
+ return f"slide:{components[2]}:shape:{components[4]}"
137
+ raise ValueError(f"Unsupported pptx locator for legacy conversion: {raw}")
138
+
139
+ if file_type == "xlsx":
140
+ if len(components) == 3 and components[:2] == ("xlsx", "sheet"):
141
+ return f"sheet:{components[2]}"
142
+ if len(components) == 4 and components[:2] == ("xlsx", "sheet"):
143
+ return f"sheet:{components[2]}!{components[3]}"
144
+ if (
145
+ len(components) == 5
146
+ and components[:2] == ("xlsx", "sheet")
147
+ and components[3] == "formula_cell"
148
+ ):
149
+ return f"sheet:{components[2]}!{components[4]}"
150
+ raise ValueError(f"Unsupported xlsx locator for legacy conversion: {raw}")
151
+
152
+ raise ValueError(f"Unsupported locator conversion for {raw}")
153
+
154
+
155
def _parse_direct_locator(raw: str) -> _ParsedDirectLocator:
    """Route *raw* to the parser that matches its (case-insensitive) prefix.

    Raises:
        ValueError: if no known prefix matches, or the chosen parser rejects
            the locator.
    """
    folded = raw.lower()
    # Each entry pairs a prefix (or tuple of prefixes) with its parser.
    routes = (
        (FORMAT_PREFIXES, _parse_v2_direct_locator),
        ("paragraph ", _parse_paragraph_words),
        ("slide ", _parse_slide_words),
        ("sheet ", _parse_sheet_words),
        (("para:", "table:"), _parse_docx_locator),
        ("slide:", _parse_pptx_locator),
        ("sheet:", _parse_xlsx_locator),
    )
    for prefix, parser in routes:
        if folded.startswith(prefix):
            return parser(raw)
    raise ValueError(f"Unsupported direct locator: {raw}")
174
+
175
+
176
def _parse_v2_direct_locator(raw: str) -> _ParsedDirectLocator:
    """Parse a format-qualified locator (``docx:``, ``pptx:`` or ``xlsx:``).

    The format prefix is matched case-insensitively and written in lower case
    in the canonical locator, because callers select this parser from the
    lower-cased text (so ``DOCX:para:1`` is routed here too). Everything
    after the prefix is kept verbatim.

    Raises:
        ValueError: if *raw* does not start with a known format prefix.
    """
    head, separator, rest = raw.partition(":")
    canonical = f"{head.lower()}{separator}{rest}"

    if canonical.startswith("docx:"):
        components = tuple(canonical.split(":"))
        return _ParsedDirectLocator(
            file_type="docx",
            target_hint=_infer_docx_target_hint(components),
            canonical_locator=canonical,
            components=components,
        )

    if canonical.startswith("pptx:"):
        components = tuple(canonical.split(":"))
        return _ParsedDirectLocator(
            file_type="pptx",
            target_hint=_infer_pptx_target_hint(components),
            canonical_locator=canonical,
            components=components,
        )

    if canonical.startswith("xlsx:"):
        # XLSX needs its own splitter: cell ranges such as A1:B2 contain ":".
        components = _split_xlsx_components(
            canonical[len("xlsx:") :], include_prefix=True
        )
        return _ParsedDirectLocator(
            file_type="xlsx",
            target_hint=_infer_xlsx_target_hint(canonical, components),
            canonical_locator=canonical,
            components=components,
        )

    raise ValueError(f"Unsupported fully-qualified locator: {raw}")
208
+
209
+
210
def _parse_paragraph_words(raw: str) -> _ParsedDirectLocator:
    """Parse the word form ``paragraph <index>`` into a DOCX paragraph locator.

    Raises:
        ValueError: if the index is missing or is not an integer.
    """
    words = raw.split(maxsplit=1)
    # A missing index is passed on as "" so that _parse_int reports it as a
    # ValueError instead of an IndexError escaping from here.
    index_text = words[1] if len(words) > 1 else ""
    paragraph_index = _parse_int(index_text, raw)
    canonical = f"docx:para:{paragraph_index}"
    return _ParsedDirectLocator(
        file_type="docx",
        target_hint="paragraph",
        canonical_locator=canonical,
        components=("docx", "para", str(paragraph_index)),
    )
219
+
220
+
221
def _parse_slide_words(raw: str) -> _ParsedDirectLocator:
    """Parse the word form ``slide <number>`` into a PPTX slide locator.

    Raises:
        ValueError: if the number is missing or is not an integer.
    """
    words = raw.split(maxsplit=1)
    # A missing number is passed on as "" so that _parse_int reports it as a
    # ValueError instead of an IndexError escaping from here.
    number_text = words[1] if len(words) > 1 else ""
    slide_number = _parse_int(number_text, raw)
    canonical = f"pptx:slide:{slide_number}"
    return _ParsedDirectLocator(
        file_type="pptx",
        target_hint="slide",
        canonical_locator=canonical,
        components=("pptx", "slide", str(slide_number)),
    )
230
+
231
+
232
+ def _parse_sheet_words(raw: str) -> _ParsedDirectLocator:
233
+ sheet_name = raw.split(maxsplit=1)[1].strip()
234
+ if not sheet_name:
235
+ raise ValueError(f"Invalid worksheet locator: {raw}")
236
+ canonical = f"xlsx:sheet:{sheet_name}"
237
+ return _ParsedDirectLocator(
238
+ file_type="xlsx",
239
+ target_hint="sheet",
240
+ canonical_locator=canonical,
241
+ components=("xlsx", "sheet", sheet_name),
242
+ )
243
+
244
+
245
def _parse_docx_locator(raw: str) -> _ParsedDirectLocator:
    """Parse a short DOCX locator: ``para:N`` or ``table:T:cell:R:C``.

    Keywords are matched case-insensitively, because callers select this
    parser from the lower-cased text (so ``Para:3`` is routed here too).

    Raises:
        ValueError: if the shape is not one of the two above, or a numeric
            component is not an integer.
    """
    if raw.lower().startswith("para:"):
        paragraph_index = _parse_int(raw.split(":", maxsplit=1)[1], raw)
        canonical = f"docx:para:{paragraph_index}"
        return _ParsedDirectLocator(
            file_type="docx",
            target_hint="paragraph",
            canonical_locator=canonical,
            components=("docx", "para", str(paragraph_index)),
        )

    parts = raw.split(":")
    if (
        len(parts) != 5
        or parts[0].lower() != "table"
        or parts[2].lower() != "cell"
    ):
        raise ValueError(f"Unsupported DOCX locator: {raw}")
    table_index = _parse_int(parts[1], raw)
    row_index = _parse_int(parts[3], raw)
    column_index = _parse_int(parts[4], raw)
    # The canonical form names the row explicitly; the short form does not.
    canonical = f"docx:table:{table_index}:row:{row_index}:cell:{column_index}"
    return _ParsedDirectLocator(
        file_type="docx",
        target_hint="table_cell",
        canonical_locator=canonical,
        components=(
            "docx",
            "table",
            str(table_index),
            "row",
            str(row_index),
            "cell",
            str(column_index),
        ),
    )
277
+
278
+
279
def _parse_pptx_locator(raw: str) -> _ParsedDirectLocator:
    """Parse a short PPTX locator.

    Accepted shapes: ``slide:N``, ``slide:N:ID`` and ``slide:N:shape:ID``;
    the last two both address shape ``ID`` on slide ``N``. Keywords are
    matched case-insensitively, because callers select this parser from the
    lower-cased text (so ``Slide:3`` is routed here too).

    Raises:
        ValueError: if the shape is not one of the above, or a numeric
            component is not an integer.
    """
    parts = raw.split(":")
    if parts[0].lower() != "slide":
        raise ValueError(f"Unsupported PPTX locator: {raw}")

    if len(parts) == 2:
        slide_number = _parse_int(parts[1], raw)
        canonical = f"pptx:slide:{slide_number}"
        return _ParsedDirectLocator(
            file_type="pptx",
            target_hint="slide",
            canonical_locator=canonical,
            components=("pptx", "slide", str(slide_number)),
        )

    # Both shape spellings differ only in where the shape id sits.
    if len(parts) == 3:
        shape_text = parts[2]
    elif len(parts) == 4 and parts[2].lower() == "shape":
        shape_text = parts[3]
    else:
        raise ValueError(f"Unsupported PPTX locator: {raw}")

    slide_number = _parse_int(parts[1], raw)
    shape_id = _parse_int(shape_text, raw)
    canonical = f"pptx:slide:{slide_number}:shape:{shape_id}"
    return _ParsedDirectLocator(
        file_type="pptx",
        target_hint="shape",
        canonical_locator=canonical,
        components=("pptx", "slide", str(slide_number), "shape", str(shape_id)),
    )
314
+
315
+
316
def _parse_xlsx_locator(raw: str) -> _ParsedDirectLocator:
    """Parse a short XLSX locator: ``sheet:NAME`` or ``sheet:NAME!ADDRESS``.

    An address containing ":" is reported as a "range", otherwise as a
    "cell". The text is split at the first "!".

    Raises:
        ValueError: if nothing follows ``sheet:``, or the sheet name or the
            address around "!" is empty.
    """
    remainder = raw[len("sheet:") :]
    if remainder == "":
        raise ValueError(f"Invalid worksheet locator: {raw}")

    sheet_name, bang, address = remainder.partition("!")
    if not bang:
        # No address part: the locator names a whole worksheet.
        return _ParsedDirectLocator(
            file_type="xlsx",
            target_hint="sheet",
            canonical_locator=f"xlsx:sheet:{remainder}",
            components=("xlsx", "sheet", remainder),
        )

    if not (sheet_name and address):
        raise ValueError(f"Invalid XLSX locator: {raw}")

    hint = "range" if ":" in address else "cell"
    return _ParsedDirectLocator(
        file_type="xlsx",
        target_hint=hint,
        canonical_locator=f"xlsx:sheet:{sheet_name}!{address}",
        components=("xlsx", "sheet", sheet_name, address),
    )
340
+
341
+
342
+ def _infer_docx_target_hint(components: tuple[str, ...]) -> str | None:
343
+ if len(components) < 2:
344
+ return None
345
+ if "page_break" in components:
346
+ return "page_break"
347
+ if "image" in components:
348
+ return "image"
349
+ if "cell" in components:
350
+ return "table_cell"
351
+ if "row" in components:
352
+ return "table_row"
353
+ if "run" in components:
354
+ return "run"
355
+ kind = components[1]
356
+ if kind == "para":
357
+ return "paragraph"
358
+ if kind in {"document", "section", "table"}:
359
+ return kind
360
+ return _last_named_component(components)
361
+
362
+
363
+ def _infer_pptx_target_hint(components: tuple[str, ...]) -> str | None:
364
+ if len(components) < 2:
365
+ return None
366
+ if "notes" in components:
367
+ return "notes"
368
+ if "cell" in components:
369
+ return "table_cell"
370
+ if "row" in components:
371
+ return "table_row"
372
+ if "group_shape" in components:
373
+ return "group_shape"
374
+ if "text_shape" in components:
375
+ return "text_shape"
376
+ if "image_shape" in components:
377
+ return "image_shape"
378
+ if "shape" in components:
379
+ return "shape"
380
+ kind = components[1]
381
+ if kind in {"presentation", "slide", "table"}:
382
+ return kind
383
+ return _last_named_component(components)
384
+
385
+
386
+ def _infer_xlsx_target_hint(raw: str, components: tuple[str, ...]) -> str | None:
387
+ if len(components) < 2:
388
+ return None
389
+ if components[1] == "workbook":
390
+ return "workbook"
391
+ if components[1] == "named_range":
392
+ return "named_range"
393
+ if components[1] != "sheet":
394
+ return _last_named_component(components)
395
+ if "!" in raw:
396
+ return "range" if ":" in raw.split("!", maxsplit=1)[1] else "cell"
397
+ if "row" in components:
398
+ return "row"
399
+ if "col" in components:
400
+ return "column"
401
+ if "table" in components:
402
+ return "table"
403
+ if "merged_range" in components:
404
+ return "merged_range"
405
+ if "formula_cell" in components:
406
+ return "formula_cell"
407
+ return "sheet"
408
+
409
+
410
+ def _split_xlsx_components(raw: str, *, include_prefix: bool) -> tuple[str, ...]:
411
+ prefix = ("xlsx",) if include_prefix else ()
412
+ if raw == "workbook":
413
+ return prefix + ("workbook",)
414
+ if raw.startswith("named_range:"):
415
+ name = raw.split(":", maxsplit=1)[1]
416
+ return prefix + ("named_range", name)
417
+ if not raw.startswith("sheet:"):
418
+ return prefix + tuple(raw.split(":"))
419
+
420
+ remainder = raw[len("sheet:") :]
421
+ if "!" in remainder:
422
+ sheet_name, address = remainder.split("!", maxsplit=1)
423
+ return prefix + ("sheet", sheet_name, address)
424
+
425
+ parts = remainder.split(":")
426
+ return prefix + ("sheet",) + tuple(parts)
427
+
428
+
429
+ def _last_named_component(components: tuple[str, ...]) -> str | None:
430
+ for component in reversed(components):
431
+ if component and not component.isdigit():
432
+ return component
433
+ return None
434
+
435
+
436
+ def _tokenize_legacy_compatible(raw: str) -> tuple[str, ...]:
437
+ return tuple(part for part in raw.replace("!", " ").split() if part)
438
+
439
+
440
+ def _parse_int(raw_value: str, locator: str) -> int:
441
+ try:
442
+ return int(raw_value)
443
+ except ValueError as exc:
444
+ raise ValueError(f"Invalid numeric locator component in {locator!r}.") from exc