runsight-core 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- runsight_core/__init__.py +44 -0
- runsight_core/artifacts.py +66 -0
- runsight_core/assertions/__init__.py +25 -0
- runsight_core/assertions/base.py +57 -0
- runsight_core/assertions/deterministic/__init__.py +65 -0
- runsight_core/assertions/deterministic/linguistic.py +129 -0
- runsight_core/assertions/deterministic/performance.py +58 -0
- runsight_core/assertions/deterministic/string.py +212 -0
- runsight_core/assertions/deterministic/structural.py +91 -0
- runsight_core/assertions/registry.py +159 -0
- runsight_core/assertions/scoring.py +42 -0
- runsight_core/blocks/__init__.py +28 -0
- runsight_core/blocks/_helpers.py +45 -0
- runsight_core/blocks/_registry.py +47 -0
- runsight_core/blocks/base.py +120 -0
- runsight_core/blocks/code.py +349 -0
- runsight_core/blocks/dispatch.py +238 -0
- runsight_core/blocks/gate.py +194 -0
- runsight_core/blocks/linear.py +122 -0
- runsight_core/blocks/loop.py +316 -0
- runsight_core/blocks/registry.py +48 -0
- runsight_core/blocks/synthesize.py +134 -0
- runsight_core/blocks/workflow_block.py +418 -0
- runsight_core/budget_enforcement.py +193 -0
- runsight_core/conditions/__init__.py +0 -0
- runsight_core/conditions/engine.py +315 -0
- runsight_core/eval/__init__.py +0 -0
- runsight_core/eval/runner.py +149 -0
- runsight_core/isolation/__init__.py +28 -0
- runsight_core/isolation/credentials.py +42 -0
- runsight_core/isolation/envelope.py +88 -0
- runsight_core/isolation/errors.py +13 -0
- runsight_core/isolation/handlers.py +141 -0
- runsight_core/isolation/harness.py +440 -0
- runsight_core/isolation/ipc.py +152 -0
- runsight_core/isolation/pool.py +21 -0
- runsight_core/isolation/worker.py +384 -0
- runsight_core/isolation/wrapper.py +227 -0
- runsight_core/llm/__init__.py +7 -0
- runsight_core/llm/client.py +125 -0
- runsight_core/llm/model_catalog.py +137 -0
- runsight_core/memory/__init__.py +7 -0
- runsight_core/memory/budget.py +389 -0
- runsight_core/memory/token_counting.py +13 -0
- runsight_core/memory/windowing.py +43 -0
- runsight_core/observer.py +539 -0
- runsight_core/primitives.py +197 -0
- runsight_core/py.typed +0 -0
- runsight_core/runner.py +408 -0
- runsight_core/security.py +71 -0
- runsight_core/state.py +78 -0
- runsight_core/tools/__init__.py +17 -0
- runsight_core/tools/_catalog.py +483 -0
- runsight_core/tools/delegate.py +53 -0
- runsight_core/tools/file_io.py +68 -0
- runsight_core/tools/http.py +68 -0
- runsight_core/workflow.py +867 -0
- runsight_core/yaml/__init__.py +20 -0
- runsight_core/yaml/discovery.py +450 -0
- runsight_core/yaml/parser.py +1032 -0
- runsight_core/yaml/registry.py +131 -0
- runsight_core/yaml/schema.py +526 -0
- runsight_core-0.1.2.dist-info/METADATA +18 -0
- runsight_core-0.1.2.dist-info/RECORD +66 -0
- runsight_core-0.1.2.dist-info/WHEEL +5 -0
- runsight_core-0.1.2.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Runsight Agent OS Core Engine
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .blocks.base import BaseBlock
|
|
6
|
+
from .blocks.code import CodeBlock
|
|
7
|
+
from .blocks.dispatch import DispatchBlock
|
|
8
|
+
from .blocks.gate import GateBlock
|
|
9
|
+
from .blocks.linear import LinearBlock
|
|
10
|
+
from .blocks.loop import CarryContextConfig, LoopBlock, LoopBlockDef
|
|
11
|
+
from .blocks.registry import BlockFactory, BlockRegistry
|
|
12
|
+
from .blocks.synthesize import SynthesizeBlock
|
|
13
|
+
from .blocks.workflow_block import WorkflowBlock
|
|
14
|
+
from .primitives import Soul, Step, Task
|
|
15
|
+
from .runner import ExecutionResult, RunsightTeamRunner
|
|
16
|
+
from .state import BlockResult, WorkflowState
|
|
17
|
+
from .workflow import Workflow
|
|
18
|
+
from .yaml import parse_workflow_yaml
|
|
19
|
+
from .yaml.schema import RetryConfig
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"Soul",
|
|
23
|
+
"Task",
|
|
24
|
+
"Step",
|
|
25
|
+
"RunsightTeamRunner",
|
|
26
|
+
"ExecutionResult",
|
|
27
|
+
"BlockResult",
|
|
28
|
+
"WorkflowState",
|
|
29
|
+
"BaseBlock",
|
|
30
|
+
"LinearBlock",
|
|
31
|
+
"DispatchBlock",
|
|
32
|
+
"SynthesizeBlock",
|
|
33
|
+
"LoopBlock",
|
|
34
|
+
"GateBlock",
|
|
35
|
+
"WorkflowBlock",
|
|
36
|
+
"CodeBlock",
|
|
37
|
+
"BlockRegistry",
|
|
38
|
+
"BlockFactory",
|
|
39
|
+
"Workflow",
|
|
40
|
+
"parse_workflow_yaml",
|
|
41
|
+
"LoopBlockDef",
|
|
42
|
+
"RetryConfig",
|
|
43
|
+
"CarryContextConfig",
|
|
44
|
+
]
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
"""
|
|
2
|
+
ArtifactStore ABC and InMemoryArtifactStore for workflow artifact management.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import Any, Dict, List, Optional
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class ArtifactStore(ABC):
    """Abstract base class for artifact storage backends.

    A store is scoped to a single workflow run: everything written through
    it is namespaced by ``run_id``.  ``write`` returns an opaque reference
    string which ``read`` later resolves back to the stored content.
    """

    def __init__(self, run_id: str) -> None:
        # Identifier of the workflow run this store belongs to.
        self.run_id = run_id

    @abstractmethod
    async def write(
        self, key: str, content: str, *, metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        """Store *content* under *key* and return an opaque reference string."""
        ...

    @abstractmethod
    async def read(self, ref: str) -> str:
        """Resolve a reference previously returned by ``write`` to its content."""
        ...

    @abstractmethod
    async def list_artifacts(self) -> List[Dict[str, Any]]:
        """Return a listing of stored artifacts.

        The in-memory implementation returns dicts with ``key``/``ref``/
        ``metadata`` entries; other backends presumably follow the same
        shape — confirm before relying on extra fields.
        """
        ...

    @abstractmethod
    async def cleanup(self) -> None:
        """Release any resources/content held by the store."""
        ...
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class InMemoryArtifactStore(ArtifactStore):
    """In-memory artifact store using mem://{run_id}/{key} refs."""

    def __init__(self, run_id: str) -> None:
        super().__init__(run_id)
        # Content and per-key metadata live in plain dicts for the lifetime
        # of the store (or until cleanup()).
        self._content: Dict[str, str] = {}
        self._metadata: Dict[str, Optional[Dict[str, Any]]] = {}

    async def write(
        self, key: str, content: str, *, metadata: Optional[Dict[str, Any]] = None
    ) -> str:
        # Last write wins: re-using a key silently overwrites the prior entry.
        self._content[key] = content
        self._metadata[key] = metadata
        return self._ref_for(key)

    async def read(self, ref: str) -> str:
        scheme = f"mem://{self.run_id}/"
        if not ref.startswith(scheme):
            # Refs from another run (or another backend) are unknown here.
            raise KeyError(ref)
        key = ref[len(scheme):]
        try:
            return self._content[key]
        except KeyError:
            # Surface the full ref, not just the key, in the error.
            raise KeyError(ref) from None

    async def list_artifacts(self) -> List[Dict[str, Any]]:
        listing: List[Dict[str, Any]] = []
        for key in self._content:
            listing.append(
                {
                    "key": key,
                    "ref": self._ref_for(key),
                    "metadata": self._metadata[key],
                }
            )
        return listing

    async def cleanup(self) -> None:
        self._metadata.clear()
        self._content.clear()

    def _ref_for(self, key: str) -> str:
        # Single source of truth for the mem:// reference format.
        return f"mem://{self.run_id}/{key}"
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Assertion plugin interface for runsight_core."""
|
|
2
|
+
|
|
3
|
+
from runsight_core.assertions.base import (
|
|
4
|
+
Assertion,
|
|
5
|
+
AssertionContext,
|
|
6
|
+
GradingResult,
|
|
7
|
+
TokenUsage,
|
|
8
|
+
)
|
|
9
|
+
from runsight_core.assertions.registry import (
|
|
10
|
+
register_assertion,
|
|
11
|
+
run_assertion,
|
|
12
|
+
run_assertions,
|
|
13
|
+
)
|
|
14
|
+
from runsight_core.assertions.scoring import AssertionsResult
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"Assertion",
|
|
18
|
+
"AssertionContext",
|
|
19
|
+
"AssertionsResult",
|
|
20
|
+
"GradingResult",
|
|
21
|
+
"TokenUsage",
|
|
22
|
+
"register_assertion",
|
|
23
|
+
"run_assertion",
|
|
24
|
+
"run_assertions",
|
|
25
|
+
]
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Base models for the assertion plugin interface."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any, Protocol, runtime_checkable
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class TokenUsage:
    """Token usage breakdown for an assertion evaluation."""

    # Tokens consumed on the prompt/input side.
    prompt: int = 0
    # Tokens consumed on the completion/output side.
    completion: int = 0
    # Overall total — presumably prompt + completion, but nothing here
    # derives or enforces that; callers set it themselves.
    total: int = 0
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class GradingResult:
    """Result of a single assertion evaluation."""

    # Whether the assertion passed.
    passed: bool
    # Numeric score; the deterministic plugins emit 0.0/1.0 or a metric
    # value in [0, 1] (e.g. BLEU/ROUGE).
    score: float
    # Human-readable explanation of the verdict.
    reason: str
    # Optional auxiliary scores keyed by name.
    named_scores: dict[str, float] = field(default_factory=dict)
    # Token accounting for evaluators that consume tokens; None when
    # not applicable (deterministic plugins leave it unset).
    tokens_used: TokenUsage | None = None
    # Sub-results when this result aggregates several component assertions.
    component_results: list[GradingResult] = field(default_factory=list)
    # The `type` string of the assertion that produced this result, if known.
    assertion_type: str | None = None
    # Free-form extra data attached by the evaluator.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class AssertionContext:
    """Context provided to assertion evaluators."""

    # The output being graded (the same text passed to evaluate()).
    output: str
    # Prompt that produced the output, plus its hash — presumably for
    # caching/correlation; confirm with callers.
    prompt: str
    prompt_hash: str
    # Identity and version of the Soul primitive that ran.
    soul_id: str
    soul_version: str
    # Identity and type of the workflow block that produced the output.
    block_id: str
    block_type: str
    # Cost in USD of producing the output (consumed by the cost assertion).
    cost_usd: float
    # Total token count for the producing call.
    total_tokens: int
    # Wall-clock latency in milliseconds (consumed by the latency assertion).
    latency_ms: float
    # Workflow variables available at evaluation time.
    variables: dict[str, Any]
    # Run / workflow identifiers for correlation.
    run_id: str
    workflow_id: str
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@runtime_checkable
class Assertion(Protocol):
    """Protocol that assertion plugins must satisfy.

    ``@runtime_checkable`` permits ``isinstance(obj, Assertion)`` checks;
    note such checks only verify attribute presence, not signatures.
    """

    # Unique registry key for this assertion kind (e.g. "contains", "bleu").
    type: str

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        """Grade *output* against this assertion using *context*."""
        ...
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Deterministic assertion plugins — registers all 15 types on import."""
|
|
2
|
+
|
|
3
|
+
from runsight_core.assertions.deterministic.linguistic import (
|
|
4
|
+
BleuAssertion,
|
|
5
|
+
LevenshteinAssertion,
|
|
6
|
+
RougeNAssertion,
|
|
7
|
+
)
|
|
8
|
+
from runsight_core.assertions.deterministic.performance import (
|
|
9
|
+
CostAssertion,
|
|
10
|
+
LatencyAssertion,
|
|
11
|
+
)
|
|
12
|
+
from runsight_core.assertions.deterministic.string import (
|
|
13
|
+
ContainsAllAssertion,
|
|
14
|
+
ContainsAnyAssertion,
|
|
15
|
+
ContainsAssertion,
|
|
16
|
+
EqualsAssertion,
|
|
17
|
+
IContainsAssertion,
|
|
18
|
+
RegexAssertion,
|
|
19
|
+
StartsWithAssertion,
|
|
20
|
+
WordCountAssertion,
|
|
21
|
+
)
|
|
22
|
+
from runsight_core.assertions.deterministic.structural import (
|
|
23
|
+
ContainsJsonAssertion,
|
|
24
|
+
IsJsonAssertion,
|
|
25
|
+
)
|
|
26
|
+
from runsight_core.assertions.registry import register_assertion
|
|
27
|
+
|
|
28
|
+
# Every deterministic assertion class shipped with this package.  Each class
# carries a unique `type` string that serves as its registry key.
_ALL_ASSERTIONS: list[type] = [
    EqualsAssertion,
    ContainsAssertion,
    IContainsAssertion,
    ContainsAllAssertion,
    ContainsAnyAssertion,
    StartsWithAssertion,
    RegexAssertion,
    WordCountAssertion,
    IsJsonAssertion,
    ContainsJsonAssertion,
    CostAssertion,
    LatencyAssertion,
    LevenshteinAssertion,
    BleuAssertion,
    RougeNAssertion,
]

# Register every plugin at import time, so importing this subpackage is
# enough to make all 15 deterministic assertion types resolvable by name.
for _cls in _ALL_ASSERTIONS:
    register_assertion(_cls.type, _cls)
|
|
48
|
+
|
|
49
|
+
__all__ = [
|
|
50
|
+
"BleuAssertion",
|
|
51
|
+
"ContainsAllAssertion",
|
|
52
|
+
"ContainsAnyAssertion",
|
|
53
|
+
"ContainsAssertion",
|
|
54
|
+
"ContainsJsonAssertion",
|
|
55
|
+
"CostAssertion",
|
|
56
|
+
"EqualsAssertion",
|
|
57
|
+
"IContainsAssertion",
|
|
58
|
+
"IsJsonAssertion",
|
|
59
|
+
"LatencyAssertion",
|
|
60
|
+
"LevenshteinAssertion",
|
|
61
|
+
"RegexAssertion",
|
|
62
|
+
"RougeNAssertion",
|
|
63
|
+
"StartsWithAssertion",
|
|
64
|
+
"WordCountAssertion",
|
|
65
|
+
]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Deterministic linguistic assertion plugins.
|
|
2
|
+
|
|
3
|
+
Covers: levenshtein, bleu, rouge-n.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import math
|
|
9
|
+
from collections import Counter
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
import editdistance
|
|
13
|
+
from rouge_score import rouge_scorer
|
|
14
|
+
|
|
15
|
+
from runsight_core.assertions.base import AssertionContext, GradingResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class LevenshteinAssertion:
    """Pass when the edit distance to the expected value is within threshold.

    Uses the ``editdistance`` package; default tolerance is 5 edits.
    """

    type = "levenshtein"

    def __init__(self, value: Any = "", threshold: float | None = None) -> None:
        self.value = str(value)
        # Default tolerance of 5 edits when no threshold is configured.
        self.threshold = 5 if threshold is None else threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        distance = editdistance.eval(output, self.value)
        within = distance <= self.threshold
        return GradingResult(
            passed=within,
            score=1.0 if within else 0.0,
            reason=f"Levenshtein distance is {distance} (threshold {self.threshold})",
        )
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class BleuAssertion:
    """BLEU-4 score >= threshold. Inline implementation (no nltk)."""

    type = "bleu"

    def __init__(self, value: Any = "", threshold: float | None = None) -> None:
        self.value = str(value)
        # Default acceptance threshold of 0.5.
        self.threshold = 0.5 if threshold is None else threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        score = _compute_bleu(reference=self.value, candidate=output)
        met = score >= self.threshold
        comparator = ">=" if met else "<"
        return GradingResult(
            passed=met,
            score=score,
            reason=f"BLEU score {score:.4f} {comparator} threshold {self.threshold}",
        )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class RougeNAssertion:
    """ROUGE-N score >= threshold using rouge-score library."""

    type = "rouge-n"

    def __init__(self, value: Any = "", threshold: float | None = None) -> None:
        self.value = str(value)
        # Default acceptance threshold of 0.75.
        self.threshold = 0.75 if threshold is None else threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        # An empty output or empty reference cannot be scored; treat as 0.
        if output and self.value:
            scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=False)
            score = scorer.score(self.value, output)["rouge1"].fmeasure
        else:
            score = 0.0

        met = score >= self.threshold
        comparator = ">=" if met else "<"
        return GradingResult(
            passed=met,
            score=score,
            reason=f"ROUGE-N score {score:.4f} {comparator} threshold {self.threshold}",
        )
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
# ── Inline BLEU-4 implementation ────────────────────────────────────────────
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _get_ngrams(tokens: list[str], n: int) -> Counter[tuple[str, ...]]:
|
|
82
|
+
"""Extract n-grams from a token list."""
|
|
83
|
+
return Counter(tuple(tokens[i : i + n]) for i in range(len(tokens) - n + 1))
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _compute_bleu(reference: str, candidate: str, max_n: int = 4) -> float:
|
|
87
|
+
"""Compute BLEU score with smoothing (method 1: add 1 to numerator/denominator).
|
|
88
|
+
|
|
89
|
+
Ported from promptfoo's BLEU implementation.
|
|
90
|
+
"""
|
|
91
|
+
ref_tokens = reference.lower().split()
|
|
92
|
+
cand_tokens = candidate.lower().split()
|
|
93
|
+
|
|
94
|
+
if not cand_tokens:
|
|
95
|
+
return 0.0
|
|
96
|
+
if not ref_tokens:
|
|
97
|
+
return 0.0
|
|
98
|
+
|
|
99
|
+
# Brevity penalty
|
|
100
|
+
bp = 1.0
|
|
101
|
+
if len(cand_tokens) < len(ref_tokens):
|
|
102
|
+
bp = math.exp(1.0 - len(ref_tokens) / len(cand_tokens))
|
|
103
|
+
|
|
104
|
+
# Modified precision for each n-gram order with smoothing
|
|
105
|
+
log_avg = 0.0
|
|
106
|
+
for n in range(1, max_n + 1):
|
|
107
|
+
ref_ngrams = _get_ngrams(ref_tokens, n)
|
|
108
|
+
cand_ngrams = _get_ngrams(cand_tokens, n)
|
|
109
|
+
|
|
110
|
+
# Clipped counts
|
|
111
|
+
clipped = 0
|
|
112
|
+
total = 0
|
|
113
|
+
for ngram, count in cand_ngrams.items():
|
|
114
|
+
clipped += min(count, ref_ngrams.get(ngram, 0))
|
|
115
|
+
total += count
|
|
116
|
+
|
|
117
|
+
# Smoothing: add 1 to both numerator and denominator when n > 1
|
|
118
|
+
if n == 1:
|
|
119
|
+
if total == 0:
|
|
120
|
+
return 0.0
|
|
121
|
+
precision = clipped / total
|
|
122
|
+
if precision == 0:
|
|
123
|
+
return 0.0
|
|
124
|
+
else:
|
|
125
|
+
precision = (clipped + 1) / (total + 1)
|
|
126
|
+
|
|
127
|
+
log_avg += math.log(precision) / max_n
|
|
128
|
+
|
|
129
|
+
return bp * math.exp(log_avg)
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Deterministic performance assertion plugins.
|
|
2
|
+
|
|
3
|
+
Covers: cost, latency.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from runsight_core.assertions.base import AssertionContext, GradingResult
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class CostAssertion:
    """Check that cost_usd from context is within threshold."""

    type = "cost"

    def __init__(self, value: Any = None, threshold: float | None = None) -> None:
        # NOTE(review): `value` is stored but never consulted by evaluate();
        # only `threshold` drives the check — confirm that is intended.
        self.value = value
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        # A missing threshold defaults to $0.00, i.e. any non-zero cost fails.
        limit = 0.0 if self.threshold is None else self.threshold
        actual = context.cost_usd
        within = actual <= limit
        verb = "is within" if within else "exceeds"
        return GradingResult(
            passed=within,
            score=1.0 if within else 0.0,
            reason=f"Cost ${actual:.4f} {verb} threshold ${limit:.4f}",
        )
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class LatencyAssertion:
    """Check that latency_ms from context is within threshold."""

    type = "latency"

    def __init__(self, value: Any = None, threshold: float | None = None) -> None:
        # NOTE(review): `value` is stored but never consulted by evaluate();
        # only `threshold` drives the check — confirm that is intended.
        self.value = value
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        # A missing threshold defaults to 0ms, i.e. any non-zero latency fails.
        limit = 0.0 if self.threshold is None else self.threshold
        actual = context.latency_ms
        within = actual <= limit
        verb = "is within" if within else "exceeds"
        return GradingResult(
            passed=within,
            score=1.0 if within else 0.0,
            reason=f"Latency {actual:.1f}ms {verb} threshold {limit:.1f}ms",
        )
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""Deterministic string assertion plugins.
|
|
2
|
+
|
|
3
|
+
Covers: equals, contains, icontains, contains-all, contains-any,
|
|
4
|
+
starts-with, regex, word-count.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import re
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
from runsight_core.assertions.base import AssertionContext, GradingResult
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class EqualsAssertion:
    """Exact string match or JSON deep-equal."""

    type = "equals"

    def __init__(self, value: Any = "", threshold: float | None = None) -> None:
        self.value = value
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        expected = str(self.value)

        # Structural comparison first: when BOTH sides parse as JSON, deep
        # equality decides the verdict and the string fallback is skipped.
        try:
            parsed_output = json.loads(output)
            parsed_expected = json.loads(expected)
        except (json.JSONDecodeError, TypeError):
            pass
        else:
            if parsed_output == parsed_expected:
                return GradingResult(
                    passed=True, score=1.0, reason="Output matches expected value (JSON deep-equal)"
                )
            return GradingResult(
                passed=False,
                score=0.0,
                reason=f"JSON values differ: expected {expected!r}, got {output!r}",
            )

        # Plain exact-string comparison for non-JSON content.
        if output == expected:
            return GradingResult(
                passed=True, score=1.0, reason="Output exactly matches expected value"
            )
        return GradingResult(
            passed=False, score=0.0, reason=f"Expected {expected!r}, got {output!r}"
        )
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class ContainsAssertion:
    """Case-sensitive substring check."""

    type = "contains"

    def __init__(self, value: Any = "", threshold: float | None = None) -> None:
        self.value = str(value)
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        found = self.value in output
        if found:
            reason = f"Output contains {self.value!r}"
        else:
            reason = f"Output does not contain {self.value!r}"
        return GradingResult(passed=found, score=1.0 if found else 0.0, reason=reason)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class IContainsAssertion:
    """Case-insensitive substring check."""

    type = "icontains"

    def __init__(self, value: Any = "", threshold: float | None = None) -> None:
        self.value = str(value)
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        # Both sides are lowercased for the comparison; the reason message
        # keeps the configured value's original casing.
        found = self.value.lower() in output.lower()
        if found:
            reason = f"Output contains {self.value!r} (case-insensitive)"
        else:
            reason = f"Output does not contain {self.value!r} (case-insensitive)"
        return GradingResult(passed=found, score=1.0 if found else 0.0, reason=reason)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class ContainsAllAssertion:
    """All items in value list must be substrings."""

    type = "contains-all"

    def __init__(self, value: Any = None, threshold: float | None = None) -> None:
        self.value: list[str] = value if value is not None else []
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        # Collect the items (in their original form) whose string rendering
        # is absent from the output.  An empty value list passes vacuously.
        missing: list = []
        for item in self.value:
            if str(item) not in output:
                missing.append(item)

        if missing:
            return GradingResult(passed=False, score=0.0, reason=f"Output missing: {missing!r}")
        return GradingResult(
            passed=True, score=1.0, reason="Output contains all required substrings"
        )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class ContainsAnyAssertion:
    """At least one item in value list must be a substring."""

    type = "contains-any"

    def __init__(self, value: Any = None, threshold: float | None = None) -> None:
        self.value: list[str] = value if value is not None else []
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        # Guard: an empty candidate list can never match.
        if not self.value:
            return GradingResult(passed=False, score=0.0, reason="No candidates provided to match")

        # First match wins; the reason reports the matching item as-is.
        for candidate in self.value:
            if str(candidate) in output:
                return GradingResult(
                    passed=True, score=1.0, reason=f"Output contains {candidate!r}"
                )

        return GradingResult(
            passed=False, score=0.0, reason=f"Output does not contain any of {self.value!r}"
        )
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class StartsWithAssertion:
    """String prefix check."""

    type = "starts-with"

    def __init__(self, value: Any = "", threshold: float | None = None) -> None:
        self.value = str(value)
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        ok = output.startswith(self.value)
        reason = (
            f"Output starts with {self.value!r}"
            if ok
            else f"Output does not start with {self.value!r}"
        )
        return GradingResult(passed=ok, score=1.0 if ok else 0.0, reason=reason)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class RegexAssertion:
    """Regex search match."""

    type = "regex"

    def __init__(self, value: Any = "", threshold: float | None = None) -> None:
        self.value = str(value)
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        # Only the search itself can raise re.error (bad pattern); keep the
        # try body minimal.
        try:
            matched = re.search(self.value, output) is not None
        except re.error as exc:
            return GradingResult(passed=False, score=0.0, reason=f"Invalid regex pattern: {exc}")

        if matched:
            return GradingResult(
                passed=True, score=1.0, reason=f"Output matches pattern {self.value!r}"
            )
        return GradingResult(
            passed=False, score=0.0, reason=f"Output does not match pattern {self.value!r}"
        )
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class WordCountAssertion:
    """Word count check: exact int or {min, max} range."""

    type = "word-count"

    def __init__(self, value: Any = None, threshold: float | None = None) -> None:
        self.value = value
        self.threshold = threshold

    def evaluate(self, output: str, context: AssertionContext) -> GradingResult:
        # Words are whitespace-delimited tokens.
        count = len(output.split())

        # Exact-count mode: value is a single integer.
        if isinstance(self.value, int):
            if count == self.value:
                return GradingResult(
                    passed=True, score=1.0, reason=f"Word count is {count} (expected {self.value})"
                )
            return GradingResult(
                passed=False, score=0.0, reason=f"Word count is {count}, expected {self.value}"
            )

        # Range mode: value is a dict with optional "min"/"max" bounds.
        if isinstance(self.value, dict):
            lo = self.value.get("min")
            hi = self.value.get("max")

            # NOTE(review): configuration errors raise out of evaluate()
            # instead of returning a failed GradingResult — callers must
            # be prepared for that.
            if lo is not None and hi is not None and lo > hi:
                raise ValueError(f"min ({lo}) is greater than max ({hi})")

            if lo is not None and count < lo:
                return GradingResult(
                    passed=False, score=0.0, reason=f"Word count {count} is below minimum {lo}"
                )
            if hi is not None and count > hi:
                return GradingResult(
                    passed=False, score=0.0, reason=f"Word count {count} exceeds maximum {hi}"
                )
            return GradingResult(
                passed=True, score=1.0, reason=f"Word count {count} is within range"
            )

        raise TypeError(f"Invalid value type for word-count: {type(self.value).__name__}")
|