offagent 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- offagent/__init__.py +3 -0
- offagent/__main__.py +5 -0
- offagent/adapters/__init__.py +1 -0
- offagent/adapters/docx_adapter.py +1237 -0
- offagent/adapters/embedding_provider.py +132 -0
- offagent/adapters/pptx_adapter.py +940 -0
- offagent/adapters/xlsx_adapter.py +1266 -0
- offagent/app/__init__.py +1 -0
- offagent/app/progress.py +52 -0
- offagent/app/services.py +4267 -0
- offagent/config.py +287 -0
- offagent/domain/__init__.py +1 -0
- offagent/domain/locators.py +444 -0
- offagent/domain/models.py +477 -0
- offagent/domain/text_fragments.py +136 -0
- offagent/errors.py +29 -0
- offagent/indexing/__init__.py +1 -0
- offagent/indexing/store.py +795 -0
- offagent/interfaces/__init__.py +1 -0
- offagent/interfaces/cli.py +438 -0
- offagent/interfaces/cli_output.py +139 -0
- offagent/interfaces/cli_progress.py +120 -0
- offagent/interfaces/mcp.py +1145 -0
- offagent/interfaces/mcp_converters.py +80 -0
- offagent/interfaces/mcp_models.py +923 -0
- offagent/objects/__init__.py +3 -0
- offagent/objects/base.py +26 -0
- offagent/objects/docx_objects.py +951 -0
- offagent/objects/pptx_objects.py +895 -0
- offagent/objects/xlsx_objects.py +962 -0
- offagent/path_policy.py +42 -0
- offagent/storage/__init__.py +1 -0
- offagent/storage/versioning.py +31 -0
- offagent-0.10.0.dist-info/METADATA +546 -0
- offagent-0.10.0.dist-info/RECORD +39 -0
- offagent-0.10.0.dist-info/WHEEL +5 -0
- offagent-0.10.0.dist-info/entry_points.txt +2 -0
- offagent-0.10.0.dist-info/licenses/LICENSE +21 -0
- offagent-0.10.0.dist-info/top_level.txt +1 -0
offagent/config.py
ADDED
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import tomllib
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Mapping
|
|
8
|
+
|
|
9
|
+
from offagent.errors import InvalidArgumentsError
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from dotenv import load_dotenv
|
|
13
|
+
except ModuleNotFoundError: # pragma: no cover - exercised through doctor checks
|
|
14
|
+
load_dotenv = None
|
|
15
|
+
|
|
16
|
+
# Relative path probed by _select_config_path when neither an explicit path
# nor the OFFAGENT_CONFIG environment variable selects a config file.
DEFAULT_CONFIG_PATH = Path("office-agent.toml")
# Built-in defaults, used when neither the config file nor the environment
# provides a value for the corresponding setting.
DEFAULT_INDEX_PATH = Path(".offagent/index.sqlite3")
DEFAULT_EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"
DEFAULT_EMBEDDING_DIMENSIONS = 384
DEFAULT_VECTOR_SEARCH_TOP_K = 20
DEFAULT_HYBRID_KEYWORD_WEIGHT = 0.4
DEFAULT_HYBRID_SEMANTIC_WEIGHT = 0.6
# Names of the environment variables that load_config reads as overrides.
ENV_CONFIG_PATH = "OFFAGENT_CONFIG"
ENV_INDEX_PATH = "OFFAGENT_INDEX_PATH"
# The *_ROOTS variables hold several paths joined with os.pathsep
# (see _split_paths).
ENV_DOCUMENT_ROOTS = "OFFAGENT_DOCUMENT_ROOTS"
ENV_ALLOWED_ROOTS = "OFFAGENT_ALLOWED_ROOTS"
ENV_OUTPUT_DIRECTORY = "OFFAGENT_OUTPUT_DIRECTORY"
ENV_OUTPUT_ROOTS = "OFFAGENT_OUTPUT_ROOTS"
ENV_ALLOW_INPLACE_OVERWRITE = "OFFAGENT_ALLOW_INPLACE_OVERWRITE"
ENV_EMBEDDING_MODEL = "OFFAGENT_EMBEDDING_MODEL"
ENV_EMBEDDING_DIMENSIONS = "OFFAGENT_EMBEDDING_DIMENSIONS"
ENV_VECTOR_SEARCH_TOP_K = "OFFAGENT_VECTOR_SEARCH_TOP_K"
ENV_HYBRID_KEYWORD_WEIGHT = "OFFAGENT_HYBRID_KEYWORD_WEIGHT"
ENV_HYBRID_SEMANTIC_WEIGHT = "OFFAGENT_HYBRID_SEMANTIC_WEIGHT"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(frozen=True)
class AppConfig:
    """Immutable bundle of resolved application settings.

    ``load_config`` in this module builds instances by layering config-file
    values and ``OFFAGENT_*`` environment variables over the defaults
    declared here. How each setting is consumed is decided by callers
    outside this module.
    """

    # Overridable through the config file or OFFAGENT_INDEX_PATH.
    index_path: Path = DEFAULT_INDEX_PATH
    # Overridable through the config file or OFFAGENT_DOCUMENT_ROOTS.
    document_roots: tuple[Path, ...] = ()
    # Overridable through the config file or OFFAGENT_ALLOWED_ROOTS.
    allowed_roots: tuple[Path, ...] = ()
    # None when no output directory has been configured.
    output_directory: Path | None = None
    # load_config falls back to (output_directory,) when no roots are given
    # and an output directory is set.
    output_roots: tuple[Path, ...] = ()
    # NOTE(review): defaults to True; enforcement lives outside this module.
    allow_inplace_overwrite: bool = True
    embedding_model: str = DEFAULT_EMBEDDING_MODEL
    # load_config rejects values below 1 for the two integer settings.
    embedding_dimensions: int = DEFAULT_EMBEDDING_DIMENSIONS
    vector_search_top_k: int = DEFAULT_VECTOR_SEARCH_TOP_K
    # load_config rejects negative values for the two weights.
    hybrid_keyword_weight: float = DEFAULT_HYBRID_KEYWORD_WEIGHT
    hybrid_semantic_weight: float = DEFAULT_HYBRID_SEMANTIC_WEIGHT
    # The config file load_config selected, or None when none was used.
    config_path: Path | None = None
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def load_config(
    config_path: Path | None = None,
    env: Mapping[str, str] | None = None,
) -> AppConfig:
    """Resolve the effective application configuration.

    Settings are layered from lowest to highest precedence: built-in
    defaults, the selected TOML config file, then ``OFFAGENT_*`` environment
    variables.

    Args:
        config_path: Explicit config file to load. When ``None`` the file is
            chosen by ``_select_config_path`` (``OFFAGENT_CONFIG`` first,
            then ``DEFAULT_CONFIG_PATH`` if it exists).
        env: Mapping to read overrides from instead of ``os.environ``.

    Returns:
        A frozen ``AppConfig`` with every path passed through
        ``expanduser()``.

    Raises:
        FileNotFoundError: If an explicitly selected config file is missing.
        InvalidArgumentsError: If a boolean or numeric setting cannot be
            parsed or is below its minimum.
    """
    # python-dotenv is optional; it is only invoked when the import succeeded.
    if load_dotenv is not None:
        load_dotenv()

    env_values = dict(os.environ if env is None else env)
    selected_config_path = _select_config_path(config_path, env_values)

    values: dict[str, object] = {
        "index_path": DEFAULT_INDEX_PATH,
        "document_roots": (),
        "allowed_roots": (),
        "output_directory": None,
        "output_roots": (),
        "allow_inplace_overwrite": True,
        "embedding_model": DEFAULT_EMBEDDING_MODEL,
        "embedding_dimensions": DEFAULT_EMBEDDING_DIMENSIONS,
        "vector_search_top_k": DEFAULT_VECTOR_SEARCH_TOP_K,
        "hybrid_keyword_weight": DEFAULT_HYBRID_KEYWORD_WEIGHT,
        "hybrid_semantic_weight": DEFAULT_HYBRID_SEMANTIC_WEIGHT,
        "config_path": selected_config_path,
    }

    # File values replace the defaults; environment variables are applied
    # afterwards so they win over the file.
    if selected_config_path is not None:
        values.update(_load_file_values(selected_config_path))

    if ENV_INDEX_PATH in env_values:
        values["index_path"] = Path(env_values[ENV_INDEX_PATH]).expanduser()

    if ENV_DOCUMENT_ROOTS in env_values:
        values["document_roots"] = _split_paths(env_values[ENV_DOCUMENT_ROOTS])

    if ENV_ALLOWED_ROOTS in env_values:
        values["allowed_roots"] = _split_paths(env_values[ENV_ALLOWED_ROOTS])

    if ENV_OUTPUT_DIRECTORY in env_values:
        # An empty variable means "no output directory", exactly as the TOML
        # loader treats "". Path("") would otherwise resolve to the current
        # directory and be promoted to an output root below.
        values["output_directory"] = _optional_path(
            env_values[ENV_OUTPUT_DIRECTORY]
        )

    if ENV_OUTPUT_ROOTS in env_values:
        values["output_roots"] = _split_paths(env_values[ENV_OUTPUT_ROOTS])

    if ENV_ALLOW_INPLACE_OVERWRITE in env_values:
        values["allow_inplace_overwrite"] = _parse_bool(
            env_values[ENV_ALLOW_INPLACE_OVERWRITE]
        )

    if ENV_EMBEDDING_MODEL in env_values:
        values["embedding_model"] = env_values[ENV_EMBEDDING_MODEL]

    if ENV_EMBEDDING_DIMENSIONS in env_values:
        values["embedding_dimensions"] = _parse_int(
            env_values[ENV_EMBEDDING_DIMENSIONS],
            ENV_EMBEDDING_DIMENSIONS,
            minimum=1,
        )

    if ENV_VECTOR_SEARCH_TOP_K in env_values:
        values["vector_search_top_k"] = _parse_int(
            env_values[ENV_VECTOR_SEARCH_TOP_K],
            ENV_VECTOR_SEARCH_TOP_K,
            minimum=1,
        )

    if ENV_HYBRID_KEYWORD_WEIGHT in env_values:
        values["hybrid_keyword_weight"] = _parse_float(
            env_values[ENV_HYBRID_KEYWORD_WEIGHT],
            ENV_HYBRID_KEYWORD_WEIGHT,
            minimum=0.0,
        )

    if ENV_HYBRID_SEMANTIC_WEIGHT in env_values:
        values["hybrid_semantic_weight"] = _parse_float(
            env_values[ENV_HYBRID_SEMANTIC_WEIGHT],
            ENV_HYBRID_SEMANTIC_WEIGHT,
            minimum=0.0,
        )

    output_directory = _expand_optional_path(values["output_directory"])
    output_roots = tuple(Path(root).expanduser() for root in values["output_roots"])
    # With no explicit output roots, the output directory is the only root.
    if not output_roots and output_directory is not None:
        output_roots = (output_directory,)

    return AppConfig(
        index_path=Path(values["index_path"]).expanduser(),
        document_roots=tuple(
            Path(root).expanduser() for root in values["document_roots"]
        ),
        allowed_roots=tuple(
            Path(root).expanduser() for root in values["allowed_roots"]
        ),
        output_directory=output_directory,
        output_roots=output_roots,
        allow_inplace_overwrite=bool(values["allow_inplace_overwrite"]),
        embedding_model=str(values["embedding_model"]),
        embedding_dimensions=int(values["embedding_dimensions"]),
        vector_search_top_k=int(values["vector_search_top_k"]),
        hybrid_keyword_weight=float(values["hybrid_keyword_weight"]),
        hybrid_semantic_weight=float(values["hybrid_semantic_weight"]),
        config_path=selected_config_path,
    )
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _select_config_path(
|
|
158
|
+
config_path: Path | None, env: Mapping[str, str]
|
|
159
|
+
) -> Path | None:
|
|
160
|
+
if config_path is not None:
|
|
161
|
+
selected = config_path.expanduser()
|
|
162
|
+
if not selected.exists():
|
|
163
|
+
raise FileNotFoundError(selected)
|
|
164
|
+
return selected
|
|
165
|
+
|
|
166
|
+
if ENV_CONFIG_PATH in env:
|
|
167
|
+
selected = Path(env[ENV_CONFIG_PATH]).expanduser()
|
|
168
|
+
if not selected.exists():
|
|
169
|
+
raise FileNotFoundError(selected)
|
|
170
|
+
return selected
|
|
171
|
+
|
|
172
|
+
if DEFAULT_CONFIG_PATH.exists():
|
|
173
|
+
return DEFAULT_CONFIG_PATH
|
|
174
|
+
|
|
175
|
+
return None
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _load_file_values(config_path: Path) -> dict[str, object]:
    """Read settings from a TOML config file.

    Settings are taken from the ``[offagent]`` table when the document has
    one, otherwise from the top level. Keys missing from the file are
    returned with their built-in defaults, so the result always contains
    every file-configurable setting.

    Args:
        config_path: TOML file to read.

    Returns:
        Mapping of setting name to parsed value; paths are user-expanded.

    Raises:
        InvalidArgumentsError: If a boolean or numeric setting is malformed
            or below its minimum.
    """
    with config_path.open("rb") as handle:
        raw = tomllib.load(handle)

    payload = raw.get("offagent", raw)

    def path_tuple(key: str) -> tuple[Path, ...]:
        """Return the paths configured under ``key`` as an expanded tuple."""
        entries = payload.get(key, ())
        if isinstance(entries, str):
            # A bare string would be iterated character by character; treat
            # it as a single path (an empty string still means "no roots").
            entries = (entries,) if entries else ()
        return tuple(Path(entry).expanduser() for entry in entries)

    overwrite = payload.get("allow_inplace_overwrite", True)
    if isinstance(overwrite, str):
        # bool("false") is True, so a quoted value must be parsed instead of
        # cast, or it could never switch the setting off.
        overwrite = _parse_bool(overwrite)

    return {
        "index_path": Path(payload.get("index_path", DEFAULT_INDEX_PATH)).expanduser(),
        "document_roots": path_tuple("document_roots"),
        "allowed_roots": path_tuple("allowed_roots"),
        "output_directory": _optional_path(payload.get("output_directory")),
        "output_roots": path_tuple("output_roots"),
        "allow_inplace_overwrite": bool(overwrite),
        "embedding_model": str(payload.get("embedding_model", DEFAULT_EMBEDDING_MODEL)),
        "embedding_dimensions": _coerce_int(
            payload.get("embedding_dimensions", DEFAULT_EMBEDDING_DIMENSIONS),
            "embedding_dimensions",
            minimum=1,
        ),
        "vector_search_top_k": _coerce_int(
            payload.get("vector_search_top_k", DEFAULT_VECTOR_SEARCH_TOP_K),
            "vector_search_top_k",
            minimum=1,
        ),
        "hybrid_keyword_weight": _coerce_float(
            payload.get("hybrid_keyword_weight", DEFAULT_HYBRID_KEYWORD_WEIGHT),
            "hybrid_keyword_weight",
            minimum=0.0,
        ),
        "hybrid_semantic_weight": _coerce_float(
            payload.get("hybrid_semantic_weight", DEFAULT_HYBRID_SEMANTIC_WEIGHT),
            "hybrid_semantic_weight",
            minimum=0.0,
        ),
    }
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _split_paths(value: str) -> tuple[Path, ...]:
    """Split an ``os.pathsep``-joined string into user-expanded paths.

    A blank (empty or whitespace-only) string yields an empty tuple, and
    empty segments between separators are dropped.
    """
    if value.strip() == "":
        return ()
    segments = filter(None, value.split(os.pathsep))
    return tuple(Path(segment).expanduser() for segment in segments)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _optional_path(value: object) -> Path | None:
|
|
224
|
+
if value in (None, ""):
|
|
225
|
+
return None
|
|
226
|
+
return Path(str(value)).expanduser()
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _expand_optional_path(value: object) -> Path | None:
|
|
230
|
+
if value is None:
|
|
231
|
+
return None
|
|
232
|
+
return Path(value).expanduser()
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _parse_bool(value: str) -> bool:
    """Interpret a textual flag as a boolean.

    Matching ignores case and surrounding whitespace. ``1``/``true``/``yes``/
    ``on`` map to ``True`` and ``0``/``false``/``no``/``off`` to ``False``.

    Raises:
        InvalidArgumentsError: If the text is not one of the accepted words.
    """
    meanings = {
        **dict.fromkeys(("1", "true", "yes", "on"), True),
        **dict.fromkeys(("0", "false", "no", "off"), False),
    }
    outcome = meanings.get(value.strip().lower())
    if outcome is None:
        raise InvalidArgumentsError(f"Invalid boolean value: {value}")
    return outcome
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def _parse_int(value: str, name: str, *, minimum: int | None = None) -> int:
|
|
245
|
+
try:
|
|
246
|
+
parsed = int(value)
|
|
247
|
+
except ValueError as exc:
|
|
248
|
+
raise InvalidArgumentsError(
|
|
249
|
+
f"Invalid integer value for {name}: {value}"
|
|
250
|
+
) from exc
|
|
251
|
+
if minimum is not None and parsed < minimum:
|
|
252
|
+
raise InvalidArgumentsError(f"{name} must be >= {minimum}")
|
|
253
|
+
return parsed
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def _parse_float(value: str, name: str, *, minimum: float | None = None) -> float:
|
|
257
|
+
try:
|
|
258
|
+
parsed = float(value)
|
|
259
|
+
except ValueError as exc:
|
|
260
|
+
raise InvalidArgumentsError(f"Invalid float value for {name}: {value}") from exc
|
|
261
|
+
if minimum is not None and parsed < minimum:
|
|
262
|
+
raise InvalidArgumentsError(f"{name} must be >= {minimum}")
|
|
263
|
+
return parsed
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _coerce_int(value: object, name: str, *, minimum: int | None = None) -> int:
|
|
267
|
+
if isinstance(value, bool):
|
|
268
|
+
raise InvalidArgumentsError(f"Invalid integer value for {name}: {value}")
|
|
269
|
+
if isinstance(value, int):
|
|
270
|
+
parsed = value
|
|
271
|
+
else:
|
|
272
|
+
parsed = _parse_int(str(value), name, minimum=minimum)
|
|
273
|
+
if minimum is not None and parsed < minimum:
|
|
274
|
+
raise InvalidArgumentsError(f"{name} must be >= {minimum}")
|
|
275
|
+
return parsed
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def _coerce_float(value: object, name: str, *, minimum: float | None = None) -> float:
|
|
279
|
+
if isinstance(value, bool):
|
|
280
|
+
raise InvalidArgumentsError(f"Invalid float value for {name}: {value}")
|
|
281
|
+
if isinstance(value, (int, float)):
|
|
282
|
+
parsed = float(value)
|
|
283
|
+
else:
|
|
284
|
+
parsed = _parse_float(str(value), name, minimum=minimum)
|
|
285
|
+
if minimum is not None and parsed < minimum:
|
|
286
|
+
raise InvalidArgumentsError(f"{name} must be >= {minimum}")
|
|
287
|
+
return parsed
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Shared domain models."""
|