offagent 0.10.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. offagent/__init__.py +3 -0
  2. offagent/__main__.py +5 -0
  3. offagent/adapters/__init__.py +1 -0
  4. offagent/adapters/docx_adapter.py +1237 -0
  5. offagent/adapters/embedding_provider.py +132 -0
  6. offagent/adapters/pptx_adapter.py +940 -0
  7. offagent/adapters/xlsx_adapter.py +1266 -0
  8. offagent/app/__init__.py +1 -0
  9. offagent/app/progress.py +52 -0
  10. offagent/app/services.py +4267 -0
  11. offagent/config.py +287 -0
  12. offagent/domain/__init__.py +1 -0
  13. offagent/domain/locators.py +444 -0
  14. offagent/domain/models.py +477 -0
  15. offagent/domain/text_fragments.py +136 -0
  16. offagent/errors.py +29 -0
  17. offagent/indexing/__init__.py +1 -0
  18. offagent/indexing/store.py +795 -0
  19. offagent/interfaces/__init__.py +1 -0
  20. offagent/interfaces/cli.py +438 -0
  21. offagent/interfaces/cli_output.py +139 -0
  22. offagent/interfaces/cli_progress.py +120 -0
  23. offagent/interfaces/mcp.py +1145 -0
  24. offagent/interfaces/mcp_converters.py +80 -0
  25. offagent/interfaces/mcp_models.py +923 -0
  26. offagent/objects/__init__.py +3 -0
  27. offagent/objects/base.py +26 -0
  28. offagent/objects/docx_objects.py +951 -0
  29. offagent/objects/pptx_objects.py +895 -0
  30. offagent/objects/xlsx_objects.py +962 -0
  31. offagent/path_policy.py +42 -0
  32. offagent/storage/__init__.py +1 -0
  33. offagent/storage/versioning.py +31 -0
  34. offagent-0.10.0.dist-info/METADATA +546 -0
  35. offagent-0.10.0.dist-info/RECORD +39 -0
  36. offagent-0.10.0.dist-info/WHEEL +5 -0
  37. offagent-0.10.0.dist-info/entry_points.txt +2 -0
  38. offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
  39. offagent-0.10.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,132 @@
1
+ from __future__ import annotations
2
+
3
+ import hashlib
4
+ import math
5
+ import re
6
+ import struct
7
+ from collections.abc import Callable
8
+ from typing import Protocol
9
+
10
+ try:
11
+ from fastembed import TextEmbedding as FastEmbedTextEmbedding
12
+ except (
13
+ ModuleNotFoundError
14
+ ): # pragma: no cover - exercised by tests that simulate missing dependency
15
+ FastEmbedTextEmbedding = None
16
+
17
# Model name LocalEmbeddingProvider uses when the caller does not pass one.
DEFAULT_EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
# Tokenizer for the hashing fallback: matches runs of ASCII letters, digits
# and underscores. Any other character acts as a separator.
TOKEN_PATTERN = re.compile(r"[A-Za-z0-9_]+")
19
+
20
+
21
class EmbeddingProvider(Protocol):
    """Structural interface for objects that turn texts into embedding blobs."""

    # Name of the embedding model the provider was configured with.
    model_name: str
    # Number of float components in every embedding vector.
    dimensions: int

    def embed_texts(
        self,
        texts: list[str],
        *,
        on_progress: Callable[[int, int], None] | None = None,
    ) -> list[bytes]:
        """Embed ``texts`` and return one float32 blob per input text.

        ``on_progress`` is an optional callback taking two integers; the
        backends in this module call it as ``on_progress(done, total)`` after
        each text has been embedded.
        """
        ...
32
+
33
+
34
class LocalEmbeddingProvider:
    """Embedding provider that runs locally.

    The concrete backend is chosen by ``_build_backend``: a fastembed model
    when available, otherwise (or for ``hash://`` model names) a hashing
    backend.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_EMBEDDING_MODEL,
        *,
        dimensions: int | None = None,
    ) -> None:
        """Select a backend for ``model_name`` and record its vector size.

        Args:
            model_name: Name of the embedding model to use.
            dimensions: Optional vector size requested by the caller; it is
                forwarded to ``_build_backend``.
        """
        self.model_name = model_name
        backend = _build_backend(model_name, dimensions)
        self._backend = backend
        # The backend is authoritative for the vector size actually produced.
        self.dimensions = backend.dimensions

    def embed_texts(
        self,
        texts: list[str],
        *,
        on_progress: Callable[[int, int], None] | None = None,
    ) -> list[bytes]:
        """Embed ``texts`` and serialize each vector as little-endian float32.

        Args:
            texts: Texts to embed.
            on_progress: Optional callback forwarded to the backend.

        Returns:
            One packed blob per input text, in input order.
        """
        vectors = self._backend.embed(texts, on_progress=on_progress)
        # "<Nf": N little-endian 32-bit floats, no padding.
        layout = f"<{self.dimensions}f"
        return [struct.pack(layout, *components) for components in vectors]
57
+
58
+
59
class _FastEmbedBackend:
    """Backend that delegates embedding to a fastembed ``TextEmbedding`` model."""

    def __init__(self, model_name: str) -> None:
        """Load ``model_name`` and discover its vector size.

        Raises:
            RuntimeError: If fastembed could not be imported.
        """
        if FastEmbedTextEmbedding is not None:
            self._model = FastEmbedTextEmbedding(model_name=model_name)
            # Embed a throwaway text once to learn the model's output size.
            sample = next(self._model.embed(["probe"]))
            self.dimensions = len(sample)
        else:
            raise RuntimeError("fastembed is not installed.")

    def embed(
        self,
        texts: list[str],
        *,
        on_progress: Callable[[int, int], None] | None = None,
    ) -> list[list[float]]:
        """Embed ``texts`` with the loaded model.

        Args:
            texts: Texts to embed.
            on_progress: Optional callback invoked as
                ``on_progress(done, total)`` after each vector is collected.

        Returns:
            One list of Python floats per input text, in model output order.
        """
        expected = len(texts)
        vectors: list[list[float]] = []
        done = 0
        for raw in self._model.embed(texts):
            # Convert the model's numeric components to plain Python floats.
            vectors.append([float(component) for component in raw])
            done += 1
            if on_progress is not None:
                on_progress(done, expected)
        return vectors
80
+
81
+
82
class _HashingBackend:
    """Backend that embeds text by hashing its tokens into a fixed-size vector.

    It needs no third-party dependency and is used when fastembed is not
    available or when a ``hash://`` model name is requested.
    """

    def __init__(self, dimensions: int) -> None:
        """Store the size of the vectors this backend produces.

        Args:
            dimensions: Number of components per vector; must be at least 1.

        Raises:
            ValueError: If ``dimensions`` is less than 1.
        """
        # Fail fast: a non-positive size would otherwise only surface later as
        # a ZeroDivisionError/IndexError while hashing or a struct.error while
        # packing the vectors.
        if dimensions < 1:
            raise ValueError(
                f"Embedding dimensions must be a positive integer, got {dimensions!r}."
            )
        self.dimensions = dimensions

    def embed(
        self,
        texts: list[str],
        *,
        on_progress: Callable[[int, int], None] | None = None,
    ) -> list[list[float]]:
        """Embed each text with ``_hash_text_to_unit_vector``.

        Args:
            texts: Texts to embed.
            on_progress: Optional callback invoked as
                ``on_progress(done, total)`` after each text is embedded.

        Returns:
            One vector of ``self.dimensions`` floats per input text, in order.
        """
        results: list[list[float]] = []
        total = len(texts)
        for index, text in enumerate(texts, start=1):
            results.append(_hash_text_to_unit_vector(text, self.dimensions))
            if on_progress is not None:
                on_progress(index, total)
        return results
99
+
100
+
101
def _build_backend(model_name: str, dimensions: int | None):
    """Pick the embedding backend for ``model_name``.

    A ``hash://`` model name, or a missing fastembed installation, selects the
    hashing backend (384 dimensions unless ``dimensions`` is given). Otherwise
    a fastembed backend is created for the model.

    Args:
        model_name: Requested embedding model name.
        dimensions: Optional vector size requested by the caller.

    Returns:
        A ``_HashingBackend`` or ``_FastEmbedBackend`` instance.

    Raises:
        RuntimeError: If ``dimensions`` is given and differs from the size the
            fastembed model actually produces.
    """
    hashing_requested = model_name.startswith("hash://")
    if hashing_requested or FastEmbedTextEmbedding is None:
        return _HashingBackend(dimensions if dimensions else 384)

    fast_backend = _FastEmbedBackend(model_name)
    size_conflict = dimensions is not None and fast_backend.dimensions != dimensions
    if size_conflict:
        raise RuntimeError(
            f"Configured embedding dimensions {dimensions} do not match model dimensions {fast_backend.dimensions}."
        )
    return fast_backend
113
+
114
+
115
def _hash_text_to_unit_vector(text: str, dimensions: int) -> list[float]:
    """Hash the tokens of ``text`` into an L2-normalized vector.

    The text is lowercased and split with ``TOKEN_PATTERN``. Every token is
    hashed with SHA-256 and contributes to two buckets, each taken from one
    4-byte group of the digest: two bytes pick the bucket, one byte picks the
    sign, and one byte sets a weight between 1.0 and 2.0.

    Args:
        text: Text to embed.
        dimensions: Length of the returned vector.

    Returns:
        A vector of ``dimensions`` floats scaled to unit Euclidean length, or
        all zeros when the text has no tokens or the contributions cancel out.
    """
    accumulator = [0.0] * dimensions

    for word in TOKEN_PATTERN.findall(text.lower()):
        hashed = hashlib.sha256(word.encode("utf-8")).digest()
        # Two 4-byte groups of the digest give two bucket updates per token.
        for start in (0, 4):
            slot = int.from_bytes(hashed[start : start + 2], "little") % dimensions
            magnitude = 1.0 + (hashed[start + 3] / 255.0)
            # An odd third byte means a negative contribution.
            if hashed[start + 2] % 2:
                accumulator[slot] -= magnitude
            else:
                accumulator[slot] += magnitude

    length = math.sqrt(sum(component * component for component in accumulator))
    if length == 0.0:
        # Nothing to normalize: no tokens, or every contribution cancelled.
        return accumulator
    return [component / length for component in accumulator]