tactus 0.36.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tactus/__init__.py +1 -1
- tactus/adapters/channels/base.py +22 -2
- tactus/adapters/channels/broker.py +1 -0
- tactus/adapters/channels/host.py +3 -1
- tactus/adapters/channels/ipc.py +18 -3
- tactus/adapters/channels/sse.py +2 -0
- tactus/adapters/mcp_manager.py +24 -7
- tactus/backends/http_backend.py +2 -2
- tactus/backends/pytorch_backend.py +2 -2
- tactus/broker/client.py +3 -3
- tactus/broker/server.py +17 -5
- tactus/cli/app.py +212 -57
- tactus/core/compaction.py +17 -0
- tactus/core/context_assembler.py +73 -0
- tactus/core/context_models.py +41 -0
- tactus/core/dsl_stubs.py +560 -20
- tactus/core/exceptions.py +8 -0
- tactus/core/execution_context.py +24 -24
- tactus/core/message_history_manager.py +2 -2
- tactus/core/mocking.py +12 -0
- tactus/core/output_validator.py +6 -6
- tactus/core/registry.py +171 -29
- tactus/core/retrieval.py +317 -0
- tactus/core/retriever_tasks.py +30 -0
- tactus/core/runtime.py +431 -117
- tactus/dspy/agent.py +143 -82
- tactus/dspy/broker_lm.py +13 -7
- tactus/dspy/config.py +23 -4
- tactus/dspy/module.py +12 -1
- tactus/ide/coding_assistant.py +2 -2
- tactus/primitives/handles.py +79 -7
- tactus/primitives/model.py +1 -1
- tactus/primitives/procedure.py +1 -1
- tactus/primitives/state.py +2 -2
- tactus/sandbox/config.py +1 -1
- tactus/sandbox/container_runner.py +13 -6
- tactus/sandbox/entrypoint.py +51 -8
- tactus/sandbox/protocol.py +5 -0
- tactus/stdlib/README.md +10 -1
- tactus/stdlib/biblicus/__init__.py +3 -0
- tactus/stdlib/biblicus/text.py +189 -0
- tactus/stdlib/tac/biblicus/text.tac +32 -0
- tactus/stdlib/tac/tactus/biblicus.spec.tac +179 -0
- tactus/stdlib/tac/tactus/corpora/base.tac +42 -0
- tactus/stdlib/tac/tactus/corpora/filesystem.tac +5 -0
- tactus/stdlib/tac/tactus/retrievers/base.tac +37 -0
- tactus/stdlib/tac/tactus/retrievers/embedding_index_file.tac +6 -0
- tactus/stdlib/tac/tactus/retrievers/embedding_index_inmemory.tac +6 -0
- tactus/stdlib/tac/tactus/retrievers/index.md +137 -0
- tactus/stdlib/tac/tactus/retrievers/init.tac +11 -0
- tactus/stdlib/tac/tactus/retrievers/sqlite_full_text_search.tac +6 -0
- tactus/stdlib/tac/tactus/retrievers/tf_vector.tac +6 -0
- tactus/testing/behave_integration.py +2 -0
- tactus/testing/context.py +10 -6
- tactus/testing/evaluation_runner.py +5 -5
- tactus/testing/steps/builtin.py +2 -2
- tactus/testing/test_runner.py +6 -4
- tactus/utils/asyncio_helpers.py +2 -1
- tactus/validation/semantic_visitor.py +357 -6
- tactus/validation/validator.py +142 -2
- {tactus-0.36.0.dist-info → tactus-0.38.0.dist-info}/METADATA +9 -6
- {tactus-0.36.0.dist-info → tactus-0.38.0.dist-info}/RECORD +65 -47
- {tactus-0.36.0.dist-info → tactus-0.38.0.dist-info}/WHEEL +0 -0
- {tactus-0.36.0.dist-info → tactus-0.38.0.dist-info}/entry_points.txt +0 -0
- {tactus-0.36.0.dist-info → tactus-0.38.0.dist-info}/licenses/LICENSE +0 -0
tactus/core/retrieval.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""Deterministic retrieval utilities for Context packs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
import urllib.request
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Iterable, List
|
|
11
|
+
|
|
12
|
+
import pyarrow.parquet as pq
|
|
13
|
+
|
|
14
|
+
from biblicus.context import ContextPack, ContextPackBlock
|
|
15
|
+
from biblicus.context_engine import ContextRetrieverRequest, retrieve_context_pack
|
|
16
|
+
from biblicus.corpus import Corpus
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Per-split parquet files of the Wikitext-2 "raw" dataset, keyed by split name.
# Each entry holds the filename as published under the Hugging Face dataset
# repository and the expected SHA256 hex digest used by ensure_wikitext2_raw()
# to validate cached downloads.
_WIKITEXT2_FILES = {
    "train": {
        "filename": "train-00000-of-00001.parquet",
        "sha256": "e83889baabc497075506f91975be5fac0d45c5290b6b20582c8cd1e853d0c9f7",
    },
    "validation": {
        "filename": "validation-00000-of-00001.parquet",
        "sha256": "204929b7ff9d6184953f867dedb860e40aa69c078fc1e54b3baaa8fb28511c4c",
    },
    "test": {
        "filename": "test-00000-of-00001.parquet",
        "sha256": "5f1bea067869d04849c0f975a2b29c4ff47d867f484f5010ea5e861eab246d91",
    },
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_wikitext2_cache_dir() -> Path:
    """Return the cache directory for Wikitext-2 raw parquet files."""
    # An explicit environment override wins over the repository default.
    override = os.environ.get("TACTUS_WIKITEXT2_CACHE_DIR")
    if override:
        return Path(override)
    repo_root = Path(__file__).resolve().parents[2]
    return repo_root / "tests" / "fixtures" / "wikitext-2-raw-v1"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def ensure_wikitext2_raw(cache_dir: Path | None = None) -> Path:
    """Ensure the Wikitext-2 raw parquet files are present."""
    if cache_dir is None:
        cache_dir = get_wikitext2_cache_dir()
    cache_dir.mkdir(parents=True, exist_ok=True)
    base_url = (
        "https://huggingface.co/datasets/Salesforce/wikitext/resolve/main/" "wikitext-2-raw-v1"
    )

    for split, meta in _WIKITEXT2_FILES.items():
        filename = meta["filename"]
        checksum = meta["sha256"]
        target = cache_dir / filename
        # A file that is already present and checksum-verified needs no work.
        if target.exists() and _sha256_matches(target, checksum):
            continue
        _download_file(f"{base_url}/{filename}", target)
        # Verify the fresh download; a mismatch indicates a bad or truncated fetch.
        if not _sha256_matches(target, checksum):
            raise RuntimeError(f"Checksum mismatch for {split} parquet file")
    return cache_dir
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def load_wikitext2_texts(split: str, limit: int | None = None) -> List[str]:
    """Load Wikitext-2 raw texts for the given split."""
    if split not in _WIKITEXT2_FILES:
        raise ValueError(f"Unknown Wikitext2 split: {split}")
    # Download/verify on demand, then read only the "text" column.
    parquet_path = ensure_wikitext2_raw() / _WIKITEXT2_FILES[split]["filename"]
    column = pq.read_table(parquet_path, columns=["text"]).column("text")
    # Drop falsy rows (empty strings / nulls).
    texts = [entry for entry in column.to_pylist() if entry]
    return texts if limit is None else texts[:limit]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def retrieve_wikitext2(request: ContextRetrieverRequest) -> ContextPack:
    """
    Retrieve matching passages from Wikitext-2 raw.

    :param request: Context retriever request payload.
    :type request: ContextRetrieverRequest
    :return: Context pack derived from matching passages.
    :rtype: ContextPack
    """
    # Tolerate a missing metadata mapping, consistent with
    # retrieve_biblicus_context_pack which already guards with `or {}`.
    metadata = request.metadata or {}
    split = metadata.get("split", "train")
    maximum_cache_total_items = metadata.get("maximum_cache_total_items")
    maximum_cache_total_characters = metadata.get("maximum_cache_total_characters")
    texts = load_wikitext2_texts(split=split, limit=None)
    if maximum_cache_total_items is not None:
        texts = texts[: int(maximum_cache_total_items)]
    elif maximum_cache_total_characters is not None:
        # Keep a whole-text prefix whose cumulative length fits the budget.
        selected = []
        total_chars = 0
        for text in texts:
            text_length = len(text)
            if total_chars + text_length > int(maximum_cache_total_characters):
                break
            selected.append(text)
            total_chars += text_length
        texts = selected
    ranked = _rank_texts(request.query, texts)
    offset = request.offset
    limit = request.limit

    blocks: List[ContextPackBlock] = []
    remaining_chars = request.maximum_total_characters
    for idx, text in enumerate(ranked[offset : offset + limit], start=1):
        snippet = text.strip()
        if remaining_chars is not None and remaining_chars <= 0:
            break
        if remaining_chars is not None and len(snippet) > remaining_chars:
            # Clamp the slice bound at zero: for remaining_chars < 3 the
            # unclamped `remaining_chars - 3` is NEGATIVE, which slices from
            # the END of the text and blows the character budget.
            snippet = snippet[: max(remaining_chars - 3, 0)].rstrip() + "..."
        if remaining_chars is not None:
            remaining_chars -= len(snippet)
        if not snippet:
            continue
        blocks.append(
            ContextPackBlock(
                evidence_item_id=f"{split}-{offset + idx}",
                text=snippet,
                metadata=None,
            )
        )

    text = "\n\n".join(block.text for block in blocks)
    return ContextPack(text=text, evidence_count=len(blocks), blocks=blocks)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def get_noaa_afd_cache_dir() -> Path:
    """Return the cache directory for NOAA AFD text fixtures."""
    # An explicit environment override wins over the repository default.
    override = os.environ.get("TACTUS_NOAA_AFD_DIR")
    if override:
        return Path(override)
    repo_root = Path(__file__).resolve().parents[2]
    return repo_root / "tests" / "fixtures" / "noaa_afd"
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def load_noaa_afd_texts(wfo: str, limit: int | None = None) -> List[str]:
    """Load NOAA AFD text files for the given WFO code."""
    base_dir = get_noaa_afd_cache_dir() / wfo.upper()
    if not base_dir.exists():
        raise FileNotFoundError(f"No NOAA AFD corpus found for WFO '{wfo}' at {base_dir}")
    # Sort filenames for a deterministic ordering across filesystems.
    texts = [
        txt_path.read_text(encoding="utf-8", errors="replace")
        for txt_path in sorted(base_dir.glob("*.txt"))
    ]
    return texts if limit is None else texts[:limit]
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def retrieve_noaa_afd(request: ContextRetrieverRequest) -> ContextPack:
    """
    Retrieve matching passages from NOAA AFD text fixtures.

    :param request: Context retriever request payload.
    :type request: ContextRetrieverRequest
    :return: Context pack derived from matching passages.
    :rtype: ContextPack
    """
    # Tolerate a missing metadata mapping, consistent with
    # retrieve_biblicus_context_pack which already guards with `or {}`.
    metadata = request.metadata or {}
    wfo = metadata.get("wfo", "MFL")
    maximum_cache_total_items = metadata.get("maximum_cache_total_items")
    maximum_cache_total_characters = metadata.get("maximum_cache_total_characters")
    texts = load_noaa_afd_texts(wfo=wfo, limit=None)
    if maximum_cache_total_items is not None:
        texts = texts[: int(maximum_cache_total_items)]
    elif maximum_cache_total_characters is not None:
        # Keep a whole-text prefix whose cumulative length fits the budget.
        selected = []
        total_chars = 0
        for text in texts:
            text_length = len(text)
            if total_chars + text_length > int(maximum_cache_total_characters):
                break
            selected.append(text)
            total_chars += text_length
        texts = selected

    ranked = _rank_texts(request.query, texts)
    offset = request.offset
    limit = request.limit

    blocks: List[ContextPackBlock] = []
    remaining_chars = request.maximum_total_characters
    for idx, text in enumerate(ranked[offset : offset + limit], start=1):
        snippet = text.strip()
        if remaining_chars is not None and remaining_chars <= 0:
            break
        if remaining_chars is not None and len(snippet) > remaining_chars:
            # Clamp the slice bound at zero: for remaining_chars < 3 the
            # unclamped `remaining_chars - 3` is NEGATIVE, which slices from
            # the END of the text and blows the character budget.
            snippet = snippet[: max(remaining_chars - 3, 0)].rstrip() + "..."
        if remaining_chars is not None:
            remaining_chars -= len(snippet)
        if not snippet:
            continue
        blocks.append(
            ContextPackBlock(
                evidence_item_id=f"{wfo.lower()}-{offset + idx}",
                text=snippet,
                metadata=None,
            )
        )

    text = "\n\n".join(block.text for block in blocks)
    return ContextPack(text=text, evidence_count=len(blocks), blocks=blocks)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def retrieve_biblicus_context_pack(request: ContextRetrieverRequest) -> ContextPack:
    """
    Retrieve a context pack using Biblicus retrievers.

    :param request: Context retriever request payload.
    :type request: ContextRetrieverRequest
    :return: Context pack derived from Biblicus retrieval.
    :rtype: ContextPack
    :raises ValueError: If required metadata is missing.
    """
    metadata = request.metadata or {}

    # Accept both the canonical and the alternate metadata key for each field.
    retriever_id = metadata.get("retriever_id") or metadata.get("retriever_type")
    corpus_root = metadata.get("corpus_root") or metadata.get("root")
    if not retriever_id:
        raise ValueError("Biblicus retrieval requires 'retriever_id' in metadata")
    if not corpus_root:
        raise ValueError("Biblicus retrieval requires 'corpus_root' in metadata")

    max_items = metadata.get(
        "maximum_items_per_source",
        metadata.get("max_items_per_source"),
    )

    return retrieve_context_pack(
        request=request,
        corpus=Corpus.open(corpus_root),
        retriever_id=retriever_id,
        snapshot_id=metadata.get("snapshot_id"),
        configuration_name=metadata.get("configuration_name"),
        configuration=metadata.get("configuration") or {},
        max_items_per_source=max_items,
        include_metadata=bool(metadata.get("include_metadata", False)),
        metadata_fields=metadata.get("metadata_fields"),
    )
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def make_retriever_router(corpus_registry, retriever_registry=None) -> callable:
    """
    Build a retriever dispatcher based on corpus and retriever configuration.

    :param corpus_registry: Corpus registry used to resolve corpus metadata.
    :type corpus_registry: dict[str, Any] or None
    :param retriever_registry: Retriever registry used to resolve retrievers.
    :type retriever_registry: dict[str, Any] or None
    :return: Retriever callable that dispatches by retriever id.
    :rtype: callable
    """

    def _route(request: ContextRetrieverRequest) -> ContextPack:
        # Guard against a missing metadata mapping (consistent with
        # retrieve_biblicus_context_pack); the previous direct attribute
        # access raised AttributeError when metadata was None.
        metadata = request.metadata or {}
        corpus_name = metadata.get("corpus")
        retriever_name = metadata.get("retriever")
        retriever_id = metadata.get("retriever_id") or metadata.get("retriever_type")
        # Fall back to the registered retriever's own config when the request
        # names a retriever but carries no explicit identifier.
        if retriever_id is None and retriever_registry and retriever_name in retriever_registry:
            retriever_spec = retriever_registry[retriever_name]
            retriever_config = getattr(retriever_spec, "config", {})
            if isinstance(retriever_config, dict):
                retriever_id = retriever_config.get("retriever_id") or retriever_config.get(
                    "retriever_type"
                )

        # Built-in fixture-backed retrievers bypass Biblicus entirely.
        if retriever_id == "noaa_afd":
            return retrieve_noaa_afd(request)
        if retriever_id == "wikitext2":
            return retrieve_wikitext2(request)

        if retriever_id is None:
            missing_target = retriever_name or corpus_name or "<unknown>"
            raise ValueError(f"Missing retriever_id for retriever '{missing_target}'")

        return retrieve_biblicus_context_pack(request)

    return _route
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _rank_texts(query: str, texts: Iterable[str]) -> List[str]:
    """
    Rank texts by keyword overlap with the query.

    A text's score is the total number of occurrences of each query term in
    the text (duplicate query terms are counted once per occurrence each).
    Texts with a zero score are dropped unless no text scores at all, in
    which case the input order is returned unchanged.
    """
    from collections import Counter  # stdlib; used only by this helper

    query_terms = _tokenize(query)
    if not query_terms:
        return list(texts)
    scored = []
    for text in texts:
        # One Counter pass per text instead of a full list .count() scan per
        # query term: O(T + Q) rather than O(T * Q), with identical scores.
        term_counts = Counter(_tokenize(text))
        score = sum(term_counts[term] for term in query_terms)
        scored.append((score, text))
    # Stable sort preserves the input order among equal scores.
    scored.sort(key=lambda item: item[0], reverse=True)
    return [text for score, text in scored if score > 0] or list(texts)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _tokenize(text: str) -> List[str]:
|
|
299
|
+
"""Tokenize text to lowercase word tokens."""
|
|
300
|
+
return re.findall(r"[a-zA-Z0-9]+", text.lower())
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _download_file(url: str, target: Path) -> None:
|
|
304
|
+
"""Download a file to the target path."""
|
|
305
|
+
with urllib.request.urlopen(url) as response, target.open("wb") as handle:
|
|
306
|
+
handle.write(response.read())
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _sha256_matches(path: Path, expected: str) -> bool:
|
|
310
|
+
"""Check SHA256 checksum of a file."""
|
|
311
|
+
if not path.exists():
|
|
312
|
+
return False
|
|
313
|
+
hasher = hashlib.sha256()
|
|
314
|
+
with path.open("rb") as handle:
|
|
315
|
+
for chunk in iter(lambda: handle.read(8192), b""):
|
|
316
|
+
hasher.update(chunk)
|
|
317
|
+
return hasher.hexdigest() == expected
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Static metadata for retriever-supported tasks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
# Mapping from retriever identifier to the set of task names that retriever
# supports.  Consumed via supported_retriever_tasks(); every entry currently
# supports exactly the "index" task.
RETRIEVER_TASKS: dict[str, set[str]] = {
    "tf-vector": {"index"},
    "sqlite-full-text-search": {"index"},
    "embedding-index-inmemory": {"index"},
    "embedding-index-file": {"index"},
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def resolve_retriever_id(config: Optional[dict]) -> Optional[str]:
    """Resolve retriever identifier from a retriever config dict."""
    if not isinstance(config, dict):
        return None
    # "retriever_id" takes precedence over "retriever_type"; values that are
    # not strings, or are blank after stripping, are skipped.
    candidates = (config.get(key) for key in ("retriever_id", "retriever_type"))
    return next(
        (value for value in candidates if isinstance(value, str) and value.strip()),
        None,
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def supported_retriever_tasks(retriever_id: Optional[str]) -> set[str]:
    """Return supported task names for the retriever identifier."""
    if retriever_id:
        # Return a fresh copy so callers cannot mutate the registry entry.
        return set(RETRIEVER_TASKS.get(retriever_id, set()))
    # A falsy identifier (None or empty string) supports no tasks.
    return set()
|