offagent 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- offagent/__init__.py +3 -0
- offagent/__main__.py +5 -0
- offagent/adapters/__init__.py +1 -0
- offagent/adapters/docx_adapter.py +1237 -0
- offagent/adapters/embedding_provider.py +132 -0
- offagent/adapters/pptx_adapter.py +940 -0
- offagent/adapters/xlsx_adapter.py +1266 -0
- offagent/app/__init__.py +1 -0
- offagent/app/progress.py +52 -0
- offagent/app/services.py +4267 -0
- offagent/config.py +287 -0
- offagent/domain/__init__.py +1 -0
- offagent/domain/locators.py +444 -0
- offagent/domain/models.py +477 -0
- offagent/domain/text_fragments.py +136 -0
- offagent/errors.py +29 -0
- offagent/indexing/__init__.py +1 -0
- offagent/indexing/store.py +795 -0
- offagent/interfaces/__init__.py +1 -0
- offagent/interfaces/cli.py +438 -0
- offagent/interfaces/cli_output.py +139 -0
- offagent/interfaces/cli_progress.py +120 -0
- offagent/interfaces/mcp.py +1145 -0
- offagent/interfaces/mcp_converters.py +80 -0
- offagent/interfaces/mcp_models.py +923 -0
- offagent/objects/__init__.py +3 -0
- offagent/objects/base.py +26 -0
- offagent/objects/docx_objects.py +951 -0
- offagent/objects/pptx_objects.py +895 -0
- offagent/objects/xlsx_objects.py +962 -0
- offagent/path_policy.py +42 -0
- offagent/storage/__init__.py +1 -0
- offagent/storage/versioning.py +31 -0
- offagent-0.10.0.dist-info/METADATA +546 -0
- offagent-0.10.0.dist-info/RECORD +39 -0
- offagent-0.10.0.dist-info/WHEEL +5 -0
- offagent-0.10.0.dist-info/entry_points.txt +2 -0
- offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
- offagent-0.10.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import hashlib
|
|
4
|
+
import math
|
|
5
|
+
import re
|
|
6
|
+
import struct
|
|
7
|
+
from collections.abc import Callable
|
|
8
|
+
from typing import Protocol
|
|
9
|
+
|
|
10
|
+
# fastembed is an optional dependency. When the module cannot be found the
# name is bound to None so the rest of this file can test for its absence.
try:
    from fastembed import TextEmbedding as FastEmbedTextEmbedding
except ModuleNotFoundError:  # pragma: no cover - exercised by tests that simulate missing dependency
    FastEmbedTextEmbedding = None

# Model name LocalEmbeddingProvider uses when the caller does not supply one.
DEFAULT_EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"

# Tokenizer used by the hashing fallback: runs of ASCII letters, digits and
# underscores. Any other character acts as a separator.
TOKEN_PATTERN = re.compile(r"[A-Za-z0-9_]+")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class EmbeddingProvider(Protocol):
    """Structural interface for objects that turn texts into embedding blobs.

    Any object exposing these attributes and this method satisfies the
    protocol; no inheritance is required.
    """

    # Name of the embedding model the provider was configured with.
    model_name: str
    # Length of the vectors the provider produces.
    dimensions: int

    def embed_texts(
        self, texts: list[str], *, on_progress: Callable[[int, int], None] | None = None
    ) -> list[bytes]:
        """Return one float32 blob per input text.

        Args:
            texts: Texts to embed.
            on_progress: Optional callback taking two integers. The backends
                in this module call it as ``(completed, total)`` after each
                text has been embedded.

        Returns:
            A list of ``bytes`` objects, one per entry in ``texts``.
        """
        ...
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class LocalEmbeddingProvider:
    """Local embedding provider that prefers fastembed and falls back to hashing vectors.

    The backend is chosen once, at construction time, by ``_build_backend``;
    this class only serialises the backend's float vectors into bytes.
    """

    def __init__(
        self, model_name: str = DEFAULT_EMBEDDING_MODEL, *, dimensions: int | None = None
    ) -> None:
        """Select a backend for ``model_name`` and record its vector length.

        Args:
            model_name: Model identifier handed to ``_build_backend``.
            dimensions: Optional expected vector length, also handed to
                ``_build_backend``.

        Raises:
            RuntimeError: Propagated from ``_build_backend`` when the
                requested dimensions do not match the fastembed model.
        """
        self.model_name = model_name
        backend = _build_backend(model_name, dimensions)
        self._backend = backend
        # The backend is the source of truth for the vector length, whether
        # or not the caller passed an explicit value.
        self.dimensions = backend.dimensions

    def embed_texts(
        self, texts: list[str], *, on_progress: Callable[[int, int], None] | None = None
    ) -> list[bytes]:
        """Embed ``texts`` and pack each vector as little-endian float32 bytes.

        Args:
            texts: Texts to embed.
            on_progress: Optional callback forwarded unchanged to the backend.

        Returns:
            One ``bytes`` object per text, each holding ``self.dimensions``
            packed 4-byte floats.
        """
        vectors = self._backend.embed(texts, on_progress=on_progress)
        # "<" selects little-endian with standard sizes; "f" is a 4-byte float.
        layout = f"<{self.dimensions}f"
        return [struct.pack(layout, *components) for components in vectors]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class _FastEmbedBackend:
    """Backend that delegates embedding to a fastembed ``TextEmbedding`` model."""

    def __init__(self, model_name: str) -> None:
        """Load the model and discover its vector length.

        Args:
            model_name: Passed as ``model_name=`` to the fastembed class.

        Raises:
            RuntimeError: If fastembed could not be imported.
        """
        if FastEmbedTextEmbedding is None:
            raise RuntimeError("fastembed is not installed.")
        model = FastEmbedTextEmbedding(model_name=model_name)
        self._model = model
        # Embed one throwaway string and measure the result to learn the
        # vector length.
        first_vector = next(model.embed(["probe"]))
        self.dimensions = len(first_vector)

    def embed(
        self, texts: list[str], *, on_progress: Callable[[int, int], None] | None = None
    ) -> list[list[float]]:
        """Embed ``texts`` with the loaded model.

        Args:
            texts: Texts to embed.
            on_progress: Optional callback invoked as ``(completed, total)``
                after each vector is collected; ``completed`` starts at 1.

        Returns:
            One list of Python floats per vector yielded by the model.
        """
        expected = len(texts)
        collected: list[list[float]] = []
        for position, raw in enumerate(self._model.embed(texts), start=1):
            # Convert whatever numeric items the model yields to plain floats.
            collected.append([float(component) for component in raw])
            if on_progress is None:
                continue
            on_progress(position, expected)
        return collected
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class _HashingBackend:
    """Dependency-free backend that derives vectors from token hashes.

    Each text is converted by ``_hash_text_to_unit_vector`` into a list of
    ``dimensions`` floats.
    """

    def __init__(self, dimensions: int) -> None:
        """Record the vector length to produce.

        Args:
            dimensions: Number of components in every vector; must be >= 1.

        Raises:
            ValueError: If ``dimensions`` is less than 1. Without this check
                the failure would only surface later, inside ``embed``, as an
                unrelated ``ZeroDivisionError`` or ``IndexError``.
        """
        if dimensions < 1:
            raise ValueError(
                f"Embedding dimensions must be a positive integer, got {dimensions}."
            )
        self.dimensions = dimensions

    def embed(
        self,
        texts: list[str],
        *,
        on_progress: Callable[[int, int], None] | None = None,
    ) -> list[list[float]]:
        """Hash every text into a vector of ``self.dimensions`` floats.

        Args:
            texts: Texts to embed.
            on_progress: Optional callback invoked as ``(completed, total)``
                after each text is processed; ``completed`` starts at 1.

        Returns:
            One vector per input text, in input order.
        """
        results: list[list[float]] = []
        total = len(texts)
        for index, text in enumerate(texts, start=1):
            results.append(_hash_text_to_unit_vector(text, self.dimensions))
            if on_progress is not None:
                on_progress(index, total)
        return results
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _build_backend(model_name: str, dimensions: int | None):
    """Choose the embedding backend for ``model_name``.

    The hashing backend is used when ``model_name`` starts with ``hash://``
    or when fastembed is unavailable; otherwise a fastembed backend is built.

    Args:
        model_name: Model identifier; the ``hash://`` prefix forces hashing.
        dimensions: Optional vector length. For the hashing backend a falsy
            value (``None`` or ``0``) selects 384. For fastembed it is only
            checked against the model's own length.

    Returns:
        A ``_HashingBackend`` or ``_FastEmbedBackend`` instance.

    Raises:
        RuntimeError: If ``dimensions`` is given and differs from the
            fastembed model's vector length.
    """
    hashing_requested = model_name.startswith("hash://")
    if hashing_requested or FastEmbedTextEmbedding is None:
        return _HashingBackend(dimensions or 384)

    backend = _FastEmbedBackend(model_name)
    if dimensions is None or backend.dimensions == dimensions:
        return backend
    raise RuntimeError(
        f"Configured embedding dimensions {dimensions} do not match model dimensions {backend.dimensions}."
    )
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _hash_text_to_unit_vector(text: str, dimensions: int) -> list[float]:
    """Map ``text`` to a deterministic vector of ``dimensions`` floats.

    The text is lower-cased and split with ``TOKEN_PATTERN``. Every token is
    hashed with SHA-256, and the first eight digest bytes are read as two
    4-byte groups. In each group, bytes 0-1 (little-endian) pick the slot,
    the parity of byte 2 picks the sign, and byte 3 scales the weight into
    the range 1.0 to 2.0. The accumulated vector is then divided by its
    Euclidean norm.

    Args:
        text: Input text.
        dimensions: Length of the returned vector.

    Returns:
        The normalised vector, or an all-zero vector when the text yields no
        tokens or every contribution cancels out.
    """
    accumulator = [0.0] * dimensions

    for token in TOKEN_PATTERN.findall(text.lower()):
        digest = hashlib.sha256(token.encode("utf-8")).digest()
        # Two contributions per token, taken from digest bytes 0-3 and 4-7.
        for start in (0, 4):
            low, high, sign_byte, weight_byte = digest[start : start + 4]
            slot = (low | (high << 8)) % dimensions
            magnitude = 1.0 + (weight_byte / 255.0)
            accumulator[slot] += magnitude if sign_byte % 2 == 0 else -magnitude

    length = math.sqrt(sum(component * component for component in accumulator))
    if length == 0.0:
        # Nothing to scale: no tokens were found, or everything cancelled.
        return accumulator
    return [component / length for component in accumulator]
|