tactus 0.37.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tactus/__init__.py +1 -1
- tactus/adapters/channels/base.py +2 -0
- tactus/cli/app.py +212 -57
- tactus/core/compaction.py +17 -0
- tactus/core/context_assembler.py +73 -0
- tactus/core/context_models.py +41 -0
- tactus/core/dsl_stubs.py +557 -17
- tactus/core/exceptions.py +8 -0
- tactus/core/execution_context.py +1 -1
- tactus/core/mocking.py +12 -0
- tactus/core/registry.py +142 -0
- tactus/core/retrieval.py +317 -0
- tactus/core/retriever_tasks.py +30 -0
- tactus/core/runtime.py +388 -74
- tactus/dspy/agent.py +143 -82
- tactus/dspy/config.py +16 -0
- tactus/dspy/module.py +12 -1
- tactus/ide/coding_assistant.py +2 -2
- tactus/primitives/handles.py +79 -7
- tactus/sandbox/config.py +1 -1
- tactus/sandbox/container_runner.py +2 -0
- tactus/sandbox/entrypoint.py +51 -8
- tactus/sandbox/protocol.py +5 -0
- tactus/stdlib/README.md +10 -1
- tactus/stdlib/biblicus/__init__.py +3 -0
- tactus/stdlib/biblicus/text.py +189 -0
- tactus/stdlib/tac/biblicus/text.tac +32 -0
- tactus/stdlib/tac/tactus/biblicus.spec.tac +179 -0
- tactus/stdlib/tac/tactus/corpora/base.tac +42 -0
- tactus/stdlib/tac/tactus/corpora/filesystem.tac +5 -0
- tactus/stdlib/tac/tactus/retrievers/base.tac +37 -0
- tactus/stdlib/tac/tactus/retrievers/embedding_index_file.tac +6 -0
- tactus/stdlib/tac/tactus/retrievers/embedding_index_inmemory.tac +6 -0
- tactus/stdlib/tac/tactus/retrievers/index.md +137 -0
- tactus/stdlib/tac/tactus/retrievers/init.tac +11 -0
- tactus/stdlib/tac/tactus/retrievers/sqlite_full_text_search.tac +6 -0
- tactus/stdlib/tac/tactus/retrievers/tf_vector.tac +6 -0
- tactus/testing/behave_integration.py +2 -0
- tactus/testing/context.py +4 -0
- tactus/validation/semantic_visitor.py +357 -6
- tactus/validation/validator.py +142 -2
- {tactus-0.37.0.dist-info → tactus-0.38.0.dist-info}/METADATA +3 -2
- {tactus-0.37.0.dist-info → tactus-0.38.0.dist-info}/RECORD +46 -28
- {tactus-0.37.0.dist-info → tactus-0.38.0.dist-info}/WHEEL +0 -0
- {tactus-0.37.0.dist-info → tactus-0.38.0.dist-info}/entry_points.txt +0 -0
- {tactus-0.37.0.dist-info → tactus-0.38.0.dist-info}/licenses/LICENSE +0 -0
tactus/core/exceptions.py
CHANGED
|
@@ -11,6 +11,14 @@ class TactusRuntimeError(Exception):
|
|
|
11
11
|
pass
|
|
12
12
|
|
|
13
13
|
|
|
14
|
+
class TaskSelectionRequired(TactusRuntimeError):
    """Signal that several tasks are available and none can be picked by default.

    The candidate task names passed to the constructor are kept on the
    ``tasks`` attribute of the exception instance.
    """

    def __init__(self, tasks: list[str]):
        # The message is fixed; the candidate names travel on ``self.tasks``.
        super().__init__("Multiple tasks available; select one explicitly.")
        self.tasks = tasks
|
|
20
|
+
|
|
21
|
+
|
|
14
22
|
class ProcedureWaitingForHuman(Exception):
|
|
15
23
|
"""
|
|
16
24
|
Raised to exit workflow when waiting for human response.
|
tactus/core/execution_context.py
CHANGED
|
@@ -397,7 +397,7 @@ class BaseExecutionContext(ExecutionContext):
|
|
|
397
397
|
except Exception as exception:
|
|
398
398
|
logger.warning("Failed to emit checkpoint event: %s", exception)
|
|
399
399
|
else:
|
|
400
|
-
logger.
|
|
400
|
+
logger.debug("[CHECKPOINT] No log_handler available to emit checkpoint event")
|
|
401
401
|
|
|
402
402
|
# Persist metadata
|
|
403
403
|
self.storage.save_procedure_metadata(self.procedure_id, self.metadata)
|
tactus/core/mocking.py
CHANGED
|
@@ -13,6 +13,18 @@ import logging
|
|
|
13
13
|
from typing import Any, Optional, Union
|
|
14
14
|
|
|
15
15
|
logger = logging.getLogger(__name__)
|
|
16
|
+
# Module-level slot holding the mock manager shared with stdlib helpers.
# It starts out empty and is only changed through set_current_mock_manager().
_CURRENT_MOCK_MANAGER: Optional["MockManager"] = None


def set_current_mock_manager(manager: Optional["MockManager"]) -> None:
    """Install ``manager`` as the module-wide mock manager.

    Passing ``None`` clears the slot.
    """
    global _CURRENT_MOCK_MANAGER
    _CURRENT_MOCK_MANAGER = manager


def get_current_mock_manager() -> Optional["MockManager"]:
    """Return the module-wide mock manager, or ``None`` when none is installed."""
    return _CURRENT_MOCK_MANAGER
|
|
16
28
|
|
|
17
29
|
|
|
18
30
|
@dataclass
|
tactus/core/registry.py
CHANGED
|
@@ -10,6 +10,13 @@ from typing import Any, Dict, Optional, Union
|
|
|
10
10
|
|
|
11
11
|
from pydantic import BaseModel, Field, ValidationError, ConfigDict
|
|
12
12
|
|
|
13
|
+
from tactus.core.context_models import (
|
|
14
|
+
CompactorDeclaration,
|
|
15
|
+
ContextDeclaration,
|
|
16
|
+
CorpusDeclaration,
|
|
17
|
+
RetrieverDeclaration,
|
|
18
|
+
)
|
|
19
|
+
|
|
13
20
|
logger = logging.getLogger(__name__)
|
|
14
21
|
|
|
15
22
|
|
|
@@ -128,6 +135,18 @@ class AgentMockConfig(BaseModel):
|
|
|
128
135
|
)
|
|
129
136
|
|
|
130
137
|
|
|
138
|
+
class TaskDeclaration(BaseModel):
    """A task declared in the DSL, optionally containing nested child tasks."""

    # Keys beyond the declared fields are accepted rather than rejected.
    model_config = ConfigDict(extra="allow")

    name: str
    # Nested task declarations, keyed by child task name.
    children: dict[str, "TaskDeclaration"] = Field(default_factory=dict)


# ``children`` refers to the class by a string forward reference, so the model
# is rebuilt once the class object exists.
TaskDeclaration.model_rebuild()
|
|
148
|
+
|
|
149
|
+
|
|
131
150
|
class ProcedureRegistry(BaseModel):
|
|
132
151
|
"""Collects all declarations from a .tac file."""
|
|
133
152
|
|
|
@@ -149,6 +168,12 @@ class ProcedureRegistry(BaseModel):
|
|
|
149
168
|
dependencies: dict[str, DependencyDeclaration] = Field(default_factory=dict)
|
|
150
169
|
mocks: dict[str, dict[str, Any]] = Field(default_factory=dict) # Mock configurations
|
|
151
170
|
agent_mocks: dict[str, AgentMockConfig] = Field(default_factory=dict) # Agent mock configs
|
|
171
|
+
contexts: dict[str, ContextDeclaration] = Field(default_factory=dict)
|
|
172
|
+
corpora: dict[str, CorpusDeclaration] = Field(default_factory=dict)
|
|
173
|
+
retrievers: dict[str, RetrieverDeclaration] = Field(default_factory=dict)
|
|
174
|
+
compactors: dict[str, CompactorDeclaration] = Field(default_factory=dict)
|
|
175
|
+
tasks: dict[str, TaskDeclaration] = Field(default_factory=dict)
|
|
176
|
+
include_tasks: list[dict[str, Any]] = Field(default_factory=list)
|
|
152
177
|
|
|
153
178
|
# Message history configuration (aligned with pydantic-ai)
|
|
154
179
|
message_history_config: dict[str, Any] = Field(default_factory=dict)
|
|
@@ -334,6 +359,123 @@ class RegistryBuilder:
|
|
|
334
359
|
except Exception as exception:
|
|
335
360
|
self._add_error(f"Invalid agent mock config for '{agent_name}': {exception}")
|
|
336
361
|
|
|
362
|
+
def register_context(self, name: str, config: dict) -> None:
    """Validate ``config`` as a context declaration and store it under ``name``.

    The given ``name`` overrides any ``"name"`` key already present in
    ``config``. A validation failure is recorded through ``_add_error``
    instead of being raised.
    """
    declaration_fields = dict(config)  # copy so the caller's dict is untouched
    declaration_fields.update(name=name)
    try:
        declaration = ContextDeclaration(**declaration_fields)
    except ValidationError as exception:
        self._add_error(f"Invalid context '{name}': {exception}")
        return
    self.registry.contexts[name] = declaration
|
|
370
|
+
|
|
371
|
+
def register_corpus(self, name: str, config: dict) -> None:
    """Validate ``config`` as a corpus declaration and store it under ``name``.

    A ``"root"`` key is renamed to ``"corpus_root"`` unless ``"corpus_root"``
    is already given. A validation failure is recorded through ``_add_error``
    instead of being raised.
    """
    settings = dict(config)  # copy so the caller's dict is untouched
    if "corpus_root" not in settings and "root" in settings:
        settings["corpus_root"] = settings.pop("root")
    try:
        declaration = CorpusDeclaration(name=name, config=settings)
    except ValidationError as exception:
        self._add_error(f"Invalid corpus '{name}': {exception}")
        return
    self.registry.corpora[name] = declaration
|
|
380
|
+
|
|
381
|
+
def register_retriever(self, name: str, config: dict) -> None:
    """Validate ``config`` as a retriever declaration and store it under ``name``.

    Before validation the configuration is normalised:

    * ``retriever_id`` falls back to ``retriever_type`` when absent;
    * selected query options found under ``configuration.pipeline.query``
      are copied to the top level unless already set there;
    * the ``corpus`` key is removed from the config and passed separately.

    A validation failure is recorded through ``_add_error`` instead of
    being raised.
    """
    # Query-level options that are lifted to the top level of the config.
    promoted_keys = (
        "limit",
        "offset",
        "maximum_total_characters",
        "maximum_items_per_source",
        "max_items_per_source",
        "include_metadata",
        "metadata_fields",
        "join_with",
    )

    settings = dict(config)  # copy so the caller's dict is untouched
    if "retriever_id" not in settings:
        fallback_id = settings.get("retriever_type")
        if fallback_id is not None:
            settings["retriever_id"] = fallback_id

    # Walk configuration -> pipeline -> query, tolerating non-dict values.
    configuration = settings.get("configuration")
    query_settings = None
    if isinstance(configuration, dict):
        pipeline = configuration.get("pipeline", {}) or {}
        if isinstance(pipeline, dict):
            query_settings = pipeline.get("query")

    if isinstance(query_settings, dict):
        for key in promoted_keys:
            if key not in settings and key in query_settings:
                settings[key] = query_settings[key]

    corpus_name = settings.pop("corpus", None)
    try:
        declaration = RetrieverDeclaration(
            name=name,
            corpus=corpus_name,
            config=settings,
        )
    except ValidationError as exception:
        self._add_error(f"Invalid retriever '{name}': {exception}")
        return
    self.registry.retrievers[name] = declaration
|
|
413
|
+
|
|
414
|
+
def register_task(
    self,
    name: str,
    task_config: Optional[dict] = None,
    parent: Optional[str] = None,
) -> None:
    """Register a task, either at the top level or as a child of ``parent``.

    Every problem (empty name, ``:`` in the name, invalid config, unknown
    parent, duplicate name) is recorded through ``_add_error`` and the task
    is not registered.
    """
    if not name:
        self._add_error("Task name is required.")
        return
    if ":" in name:
        self._add_error(f"Task name '{name}' may not contain ':'")
        return

    fields = dict(task_config or {})
    fields["name"] = name  # the explicit name overrides any "name" key in the config
    try:
        task = TaskDeclaration(**fields)
    except ValidationError as exception:
        self._add_error(f"Invalid task '{name}': {exception}")
        return

    # Choose the mapping the task goes into and the label used in
    # duplicate-name errors.
    if parent is None:
        siblings = self.registry.tasks
        qualified_name = name
    else:
        parent_task = self._find_task(parent)
        if parent_task is None:
            self._add_error(f"Parent task '{parent}' not found for '{name}'")
            return
        siblings = parent_task.children
        qualified_name = f"{parent}:{name}"

    if name in siblings:
        self._add_error(f"Duplicate task '{qualified_name}'")
        return
    siblings[name] = task
|
|
455
|
+
|
|
456
|
+
def register_include_tasks(self, path: str, namespace: Optional[str] = None) -> None:
    """Record an IncludeTasks directive for static task discovery.

    The directive is appended to ``registry.include_tasks`` as a dict with
    a ``"path"`` key and, when ``namespace`` is truthy, a ``"namespace"`` key.
    """
    directive = {"path": path, "namespace": namespace} if namespace else {"path": path}
    self.registry.include_tasks.append(directive)
|
|
462
|
+
|
|
463
|
+
def _find_task(self, name: str) -> Optional[TaskDeclaration]:
    """Return the top-level task registered as ``name``, or ``None`` if absent.

    Only ``registry.tasks`` is consulted; nested children are not searched.
    """
    return self.registry.tasks.get(name)
|
|
467
|
+
|
|
468
|
+
def register_compactor(self, name: str, config: dict) -> None:
    """Validate ``config`` as a compactor declaration and store it under ``name``.

    A validation failure is recorded through ``_add_error`` instead of
    being raised.
    """
    settings = dict(config)  # copy so the caller's dict is untouched
    try:
        declaration = CompactorDeclaration(name=name, config=settings)
    except ValidationError as exception:
        self._add_error(f"Invalid compactor '{name}': {exception}")
        return
    self.registry.compactors[name] = declaration
|
|
478
|
+
|
|
337
479
|
def register_specification(self, name: str, scenarios: list) -> None:
|
|
338
480
|
"""Register a BDD specification."""
|
|
339
481
|
try:
|
tactus/core/retrieval.py
ADDED
|
@@ -0,0 +1,317 @@
|
|
|
1
|
+
"""Deterministic retrieval utilities for Context packs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
import os
|
|
7
|
+
import re
|
|
8
|
+
import urllib.request
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Iterable, List
|
|
11
|
+
|
|
12
|
+
import pyarrow.parquet as pq
|
|
13
|
+
|
|
14
|
+
from biblicus.context import ContextPack, ContextPackBlock
|
|
15
|
+
from biblicus.context_engine import ContextRetrieverRequest, retrieve_context_pack
|
|
16
|
+
from biblicus.corpus import Corpus
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# Expected parquet file name and SHA-256 checksum for each Wikitext-2 split.
# Every file name follows the "<split>-00000-of-00001.parquet" pattern.
_WIKITEXT2_FILES = {
    split: {"filename": f"{split}-00000-of-00001.parquet", "sha256": checksum}
    for split, checksum in (
        ("train", "e83889baabc497075506f91975be5fac0d45c5290b6b20582c8cd1e853d0c9f7"),
        ("validation", "204929b7ff9d6184953f867dedb860e40aa69c078fc1e54b3baaa8fb28511c4c"),
        ("test", "5f1bea067869d04849c0f975a2b29c4ff47d867f484f5010ea5e861eab246d91"),
    )
}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def get_wikitext2_cache_dir() -> Path:
    """Locate the directory holding the Wikitext-2 raw parquet files.

    A non-empty ``TACTUS_WIKITEXT2_CACHE_DIR`` environment variable wins.
    Otherwise the path is ``tests/fixtures/wikitext-2-raw-v1`` under the
    directory two levels above this module's own directory.
    """
    override = os.environ.get("TACTUS_WIKITEXT2_CACHE_DIR")
    if not override:
        base = Path(__file__).resolve().parents[2]
        return base.joinpath("tests", "fixtures", "wikitext-2-raw-v1")
    return Path(override)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def ensure_wikitext2_raw(cache_dir: Path | None = None) -> Path:
    """Make sure every Wikitext-2 raw parquet file is present and intact.

    Files that are missing or whose SHA-256 does not match the expected
    checksum are downloaded again and then re-verified.

    :param cache_dir: Directory to populate; defaults to
        :func:`get_wikitext2_cache_dir`.
    :return: The directory that holds the parquet files.
    :raises RuntimeError: If a freshly downloaded file fails its checksum.
    """
    directory = cache_dir or get_wikitext2_cache_dir()
    directory.mkdir(parents=True, exist_ok=True)
    base_url = (
        "https://huggingface.co/datasets/Salesforce/wikitext/resolve/main/wikitext-2-raw-v1"
    )

    for split, meta in _WIKITEXT2_FILES.items():
        filename, checksum = meta["filename"], meta["sha256"]
        destination = directory / filename
        already_valid = destination.exists() and _sha256_matches(destination, checksum)
        if not already_valid:
            _download_file(f"{base_url}/{filename}", destination)
            if not _sha256_matches(destination, checksum):
                raise RuntimeError(f"Checksum mismatch for {split} parquet file")
    return directory
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def load_wikitext2_texts(split: str, limit: int | None = None) -> List[str]:
    """Read the non-empty ``text`` values of one Wikitext-2 raw split.

    :param split: Split name; must be a key of ``_WIKITEXT2_FILES``.
    :param limit: When given, only the first ``limit`` texts are returned.
    :return: Texts in file order, with falsy values dropped.
    :raises ValueError: If ``split`` is not a known split.
    """
    if split not in _WIKITEXT2_FILES:
        raise ValueError(f"Unknown Wikitext2 split: {split}")
    parquet_path = ensure_wikitext2_raw() / _WIKITEXT2_FILES[split]["filename"]
    text_column = pq.read_table(parquet_path, columns=["text"]).column("text")
    texts = list(filter(None, text_column.to_pylist()))
    return texts if limit is None else texts[:limit]
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def retrieve_wikitext2(request: ContextRetrieverRequest) -> ContextPack:
    """
    Retrieve matching passages from Wikitext-2 raw.

    The candidate texts are optionally capped using
    ``maximum_cache_total_items`` (takes precedence) or
    ``maximum_cache_total_characters`` from the request metadata, ranked by
    keyword overlap with the query, paged with ``offset``/``limit`` and
    trimmed so the combined snippet length stays within
    ``maximum_total_characters``.

    :param request: Context retriever request payload. Metadata key
        ``split`` selects the split and defaults to ``"train"``.
    :type request: ContextRetrieverRequest
    :return: Context pack derived from matching passages.
    :rtype: ContextPack
    """
    # Metadata may be absent; treat that as "no options".
    metadata = request.metadata or {}
    split = metadata.get("split", "train")
    maximum_cache_total_items = metadata.get("maximum_cache_total_items")
    maximum_cache_total_characters = metadata.get("maximum_cache_total_characters")

    texts = load_wikitext2_texts(split=split, limit=None)
    if maximum_cache_total_items is not None:
        texts = texts[: int(maximum_cache_total_items)]
    elif maximum_cache_total_characters is not None:
        character_budget = int(maximum_cache_total_characters)
        selected = []
        total_chars = 0
        for text in texts:
            text_length = len(text)
            # Stop at the first text that would overflow the cache budget.
            if total_chars + text_length > character_budget:
                break
            selected.append(text)
            total_chars += text_length
        texts = selected

    ranked = _rank_texts(request.query, texts)
    offset = request.offset
    limit = request.limit

    blocks: List[ContextPackBlock] = []
    remaining_chars = request.maximum_total_characters
    for idx, text in enumerate(ranked[offset : offset + limit], start=1):
        snippet = text.strip()
        if remaining_chars is not None and remaining_chars <= 0:
            break
        if remaining_chars is not None and len(snippet) > remaining_chars:
            if remaining_chars >= 3:
                snippet = snippet[: remaining_chars - 3].rstrip() + "..."
            else:
                # With fewer than three characters left there is no room for
                # an ellipsis; ``remaining_chars - 3`` would be a negative
                # slice index and let the snippet overshoot the budget.
                snippet = snippet[:remaining_chars]
        if remaining_chars is not None:
            remaining_chars -= len(snippet)
        if not snippet:
            continue
        blocks.append(
            ContextPackBlock(
                evidence_item_id=f"{split}-{offset + idx}",
                text=snippet,
                metadata=None,
            )
        )

    pack_text = "\n\n".join(block.text for block in blocks)
    return ContextPack(text=pack_text, evidence_count=len(blocks), blocks=blocks)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def get_noaa_afd_cache_dir() -> Path:
    """Locate the directory holding the NOAA AFD text fixtures.

    A non-empty ``TACTUS_NOAA_AFD_DIR`` environment variable wins. Otherwise
    the path is ``tests/fixtures/noaa_afd`` under the directory two levels
    above this module's own directory.
    """
    override = os.environ.get("TACTUS_NOAA_AFD_DIR")
    if not override:
        base = Path(__file__).resolve().parents[2]
        return base.joinpath("tests", "fixtures", "noaa_afd")
    return Path(override)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def load_noaa_afd_texts(wfo: str, limit: int | None = None) -> List[str]:
    """Read every ``*.txt`` file of one WFO's NOAA AFD corpus.

    :param wfo: WFO code; upper-cased to form the sub-directory name.
    :param limit: When given, only the first ``limit`` texts are returned.
    :return: File contents in sorted path order, decoded as UTF-8 with
        undecodable bytes replaced.
    :raises FileNotFoundError: If the WFO directory does not exist.
    """
    corpus_dir = get_noaa_afd_cache_dir() / wfo.upper()
    if not corpus_dir.exists():
        raise FileNotFoundError(f"No NOAA AFD corpus found for WFO '{wfo}' at {corpus_dir}")
    texts = [
        text_file.read_text(encoding="utf-8", errors="replace")
        for text_file in sorted(corpus_dir.glob("*.txt"))
    ]
    return texts if limit is None else texts[:limit]
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def retrieve_noaa_afd(request: ContextRetrieverRequest) -> ContextPack:
    """
    Retrieve matching passages from NOAA AFD text fixtures.

    The candidate texts are optionally capped using
    ``maximum_cache_total_items`` (takes precedence) or
    ``maximum_cache_total_characters`` from the request metadata, ranked by
    keyword overlap with the query, paged with ``offset``/``limit`` and
    trimmed so the combined snippet length stays within
    ``maximum_total_characters``.

    :param request: Context retriever request payload. Metadata key ``wfo``
        selects the WFO code and defaults to ``"MFL"``.
    :type request: ContextRetrieverRequest
    :return: Context pack derived from matching passages.
    :rtype: ContextPack
    """
    # Metadata may be absent; treat that as "no options".
    metadata = request.metadata or {}
    wfo = metadata.get("wfo", "MFL")
    maximum_cache_total_items = metadata.get("maximum_cache_total_items")
    maximum_cache_total_characters = metadata.get("maximum_cache_total_characters")

    texts = load_noaa_afd_texts(wfo=wfo, limit=None)
    if maximum_cache_total_items is not None:
        texts = texts[: int(maximum_cache_total_items)]
    elif maximum_cache_total_characters is not None:
        character_budget = int(maximum_cache_total_characters)
        selected = []
        total_chars = 0
        for text in texts:
            text_length = len(text)
            # Stop at the first text that would overflow the cache budget.
            if total_chars + text_length > character_budget:
                break
            selected.append(text)
            total_chars += text_length
        texts = selected

    ranked = _rank_texts(request.query, texts)
    offset = request.offset
    limit = request.limit

    blocks: List[ContextPackBlock] = []
    remaining_chars = request.maximum_total_characters
    for idx, text in enumerate(ranked[offset : offset + limit], start=1):
        snippet = text.strip()
        if remaining_chars is not None and remaining_chars <= 0:
            break
        if remaining_chars is not None and len(snippet) > remaining_chars:
            if remaining_chars >= 3:
                snippet = snippet[: remaining_chars - 3].rstrip() + "..."
            else:
                # With fewer than three characters left there is no room for
                # an ellipsis; ``remaining_chars - 3`` would be a negative
                # slice index and let the snippet overshoot the budget.
                snippet = snippet[:remaining_chars]
        if remaining_chars is not None:
            remaining_chars -= len(snippet)
        if not snippet:
            continue
        blocks.append(
            ContextPackBlock(
                evidence_item_id=f"{wfo.lower()}-{offset + idx}",
                text=snippet,
                metadata=None,
            )
        )

    pack_text = "\n\n".join(block.text for block in blocks)
    return ContextPack(text=pack_text, evidence_count=len(blocks), blocks=blocks)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
def retrieve_biblicus_context_pack(request: ContextRetrieverRequest) -> ContextPack:
    """
    Build a context pack by delegating to a Biblicus retriever.

    All options are read from ``request.metadata``: ``retriever_id`` (or
    ``retriever_type``), ``corpus_root`` (or ``root``), plus the optional
    ``snapshot_id``, ``configuration_name``, ``configuration``,
    ``maximum_items_per_source`` (or ``max_items_per_source``),
    ``include_metadata`` and ``metadata_fields``.

    :param request: Context retriever request payload.
    :type request: ContextRetrieverRequest
    :return: Context pack derived from Biblicus retrieval.
    :rtype: ContextPack
    :raises ValueError: If required metadata is missing.
    """
    options = request.metadata or {}

    retriever_id = options.get("retriever_id") or options.get("retriever_type")
    if not retriever_id:
        raise ValueError("Biblicus retrieval requires 'retriever_id' in metadata")

    corpus_root = options.get("corpus_root") or options.get("root")
    if not corpus_root:
        raise ValueError("Biblicus retrieval requires 'corpus_root' in metadata")

    # Collect the optional settings before the corpus is opened.
    retrieval_options = {
        "snapshot_id": options.get("snapshot_id"),
        "configuration_name": options.get("configuration_name"),
        "configuration": options.get("configuration") or {},
        "max_items_per_source": options.get(
            "maximum_items_per_source",
            options.get("max_items_per_source"),
        ),
        "include_metadata": bool(options.get("include_metadata", False)),
        "metadata_fields": options.get("metadata_fields"),
    }

    corpus = Corpus.open(corpus_root)
    return retrieve_context_pack(
        request=request,
        corpus=corpus,
        retriever_id=retriever_id,
        **retrieval_options,
    )
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def make_retriever_router(corpus_registry, retriever_registry=None) -> callable:
    """
    Build a retriever dispatcher based on corpus and retriever configuration.

    The returned callable resolves a retriever id from the request metadata
    (``retriever_id`` or ``retriever_type``), falling back to the config of
    the registry entry named by the ``retriever`` metadata key. The ids
    ``"noaa_afd"`` and ``"wikitext2"`` go to the built-in retrievers; any
    other id is handed to :func:`retrieve_biblicus_context_pack`.

    :param corpus_registry: Corpus registry; accepted but not consulted by
        the router itself.
    :type corpus_registry: dict[str, Any] or None
    :param retriever_registry: Retriever registry used to resolve retrievers.
    :type retriever_registry: dict[str, Any] or None
    :return: Retriever callable that dispatches by retriever id. It raises
        ``ValueError`` when no retriever id can be resolved.
    :rtype: callable
    """

    def _route(request: ContextRetrieverRequest) -> ContextPack:
        # Metadata may be absent; treat that as empty so the request fails
        # with the descriptive ValueError below rather than AttributeError.
        metadata = request.metadata or {}
        corpus_name = metadata.get("corpus")
        retriever_name = metadata.get("retriever")
        retriever_id = metadata.get("retriever_id") or metadata.get("retriever_type")

        if retriever_id is None and retriever_registry and retriever_name in retriever_registry:
            retriever_spec = retriever_registry[retriever_name]
            retriever_config = getattr(retriever_spec, "config", {})
            if isinstance(retriever_config, dict):
                retriever_id = retriever_config.get("retriever_id") or retriever_config.get(
                    "retriever_type"
                )

        if retriever_id == "noaa_afd":
            return retrieve_noaa_afd(request)
        if retriever_id == "wikitext2":
            return retrieve_wikitext2(request)

        if retriever_id is None:
            missing_target = retriever_name or corpus_name or "<unknown>"
            raise ValueError(f"Missing retriever_id for retriever '{missing_target}'")

        return retrieve_biblicus_context_pack(request)

    return _route
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def _rank_texts(query: str, texts: Iterable[str]) -> List[str]:
    """Rank texts by keyword overlap with the query.

    A text's score is the total number of occurrences of the query tokens
    among its own tokens. Texts scoring zero are dropped, and ties keep
    their input order. If the query has no tokens, or no text scores above
    zero, all texts are returned in input order.

    :param query: Free-text query.
    :param texts: Candidate texts; any iterable, including one-shot iterators.
    :return: New list of texts, best match first.
    """
    # Materialise once: ``texts`` may be a generator, and the fallback below
    # needs the full input again after the scoring pass has consumed it.
    candidates = list(texts)
    query_terms = _tokenize(query)
    if not query_terms:
        return candidates

    scored = []
    for text in candidates:
        text_terms = _tokenize(text)
        score = sum(text_terms.count(term) for term in query_terms)
        scored.append((score, text))
    # list.sort is stable, so equally scored texts stay in input order.
    scored.sort(key=lambda item: item[0], reverse=True)
    return [text for score, text in scored if score > 0] or candidates


def _tokenize(text: str) -> List[str]:
    """Split text into lowercase tokens made of ASCII letters and digits."""
    return re.findall(r"[a-zA-Z0-9]+", text.lower())
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def _download_file(url: str, target: Path) -> None:
    """Download ``url`` to ``target``.

    The response is streamed in chunks to a sibling ``<name>.part`` file,
    which is renamed over ``target`` only after the transfer has finished.
    An interrupted download therefore never leaves a truncated file at
    ``target``.

    :param url: Location to fetch; anything ``urllib.request.urlopen`` accepts.
    :param target: Destination path; an existing file is replaced.
    """
    import shutil

    partial = target.with_name(target.name + ".part")
    try:
        with urllib.request.urlopen(url) as response, partial.open("wb") as handle:
            # Stream instead of buffering the whole payload in memory.
            shutil.copyfileobj(response, handle)
        partial.replace(target)
    finally:
        # No-op after a successful rename; removes leftovers after a failure.
        partial.unlink(missing_ok=True)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _sha256_matches(path: Path, expected: str) -> bool:
    """Tell whether the file at ``path`` has the SHA-256 hex digest ``expected``.

    A missing file counts as a mismatch. The file is hashed in 8 KiB chunks.
    """
    if not path.exists():
        return False
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        while True:
            chunk = stream.read(8192)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest() == expected
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Static metadata for retriever-supported tasks."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Optional
|
|
6
|
+
|
|
7
|
+
# Task names supported by each retriever identifier. Every retriever listed
# here supports exactly the "index" task, and each gets its own set object.
RETRIEVER_TASKS: dict[str, set[str]] = {
    retriever_id: {"index"}
    for retriever_id in (
        "tf-vector",
        "sqlite-full-text-search",
        "embedding-index-inmemory",
        "embedding-index-file",
    )
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def resolve_retriever_id(config: Optional[dict]) -> Optional[str]:
    """Pick the retriever identifier out of a retriever config dict.

    ``retriever_id`` is preferred over ``retriever_type``. A value only
    counts when it is a string that is not blank, and it is returned as
    stored, without stripping. Anything other than a dict yields ``None``.
    """
    if not isinstance(config, dict):
        return None
    candidates = (config.get(key) for key in ("retriever_id", "retriever_type"))
    return next(
        (value for value in candidates if isinstance(value, str) and value.strip()),
        None,
    )
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def supported_retriever_tasks(retriever_id: Optional[str]) -> set[str]:
    """Look up the task names a retriever supports.

    The result is always a fresh set. It is empty when ``retriever_id`` is
    falsy or has no entry in ``RETRIEVER_TASKS``.
    """
    if retriever_id:
        return set(RETRIEVER_TASKS.get(retriever_id, ()))
    return set()
|