tightloop 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loop/__init__.py +40 -0
- loop/approval/__init__.py +87 -0
- loop/blueprints/__init__.py +3 -0
- loop/blueprints/testfix.py +117 -0
- loop/context/__init__.py +144 -0
- loop/core/__init__.py +0 -0
- loop/core/engine.py +515 -0
- loop/core/result.py +64 -0
- loop/core/state.py +143 -0
- loop/exit/__init__.py +60 -0
- loop/llm/__init__.py +70 -0
- loop/llm/anthropic.py +45 -0
- loop/llm/openai.py +55 -0
- loop/policy/__init__.py +96 -0
- loop/pricing.py +47 -0
- loop/progress/__init__.py +72 -0
- loop/tools/__init__.py +220 -0
- loop/trace/__init__.py +81 -0
- tightloop-0.1.0.dist-info/METADATA +439 -0
- tightloop-0.1.0.dist-info/RECORD +21 -0
- tightloop-0.1.0.dist-info/WHEEL +4 -0
loop/core/state.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""Explicit, serializable loop state."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any, Literal
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel, Field
|
|
10
|
+
|
|
11
|
+
# Version of the persisted state layout; State.load refuses files whose
# "schema_version" differs from this value.
SCHEMA_VERSION = 1
# Engine release string.
# NOTE(review): presumably what ArtifactStamp.engine_version records — confirm in the engine.
ENGINE_VERSION = "0.1.0"
# Default cap used by excerpt(): text longer than this many characters is cut
# and suffixed with a truncation marker.
INLINE_CAP = 16 * 1024 # max inline chars per record; large payloads live in the trace
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class StateError(Exception):
    """Base error for unreadable, corrupted, or unsupported loop state files."""
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SchemaChangedError(StateError):
    """The tool schemas differ from those recorded when the state was saved.

    Resuming such a state requires allow_schema_change.
    """
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ArtifactDriftError(StateError):
    """Stored context artifacts come from a different engine/summarizer version."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def digest(text: str) -> str:
    """Return a short fingerprint of *text*: the first 16 hex chars of its SHA-256."""
    hasher = hashlib.sha256()
    hasher.update(text.encode())
    full_hex = hasher.hexdigest()
    return full_hex[:16]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def excerpt(text: str, cap: int = INLINE_CAP) -> str:
    """Return *text* unchanged if it fits in *cap* characters, else a truncated copy.

    The truncated form keeps the first *cap* characters and appends a marker
    line stating how many characters were dropped and the digest of the full
    text, so the result is longer than *cap* by the length of that marker.
    """
    overflow = len(text) - cap
    if overflow > 0:
        marker = f"\n...[truncated {overflow} chars, digest={digest(text)}]"
        return text[:cap] + marker
    return text
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class ArtifactStamp(BaseModel):
    # Provenance record carried by every ContextArtifact (its `stamp` field).
    # NOTE(review): presumably compared on resume to raise ArtifactDriftError — confirm in the engine.
    engine_version: str
    model_id: str
    prompt_hash: str
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class ContextArtifact(BaseModel):
    # A stored piece of derived context, stamped with its provenance.
    kind: Literal["summary", "fact"]  # only these two artifact kinds are accepted
    iteration: int | None = None  # optional; NOTE(review): presumably the producing iteration index — confirm
    content: str
    stamp: ArtifactStamp
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class MetricSnapshot(BaseModel):
    # One measurement of the goal metric.
    value: float  # ProgressEngine subtracts consecutive values to get the metric delta
    regression: bool = False  # when set, ProgressEngine reports the trend as "regressing"
    detail: dict[str, Any] = Field(default_factory=dict)  # free-form extra data
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class ActionRecord(BaseModel):
    # Record of a single tool invocation within an iteration.
    tool: str
    args_excerpt: str  # NOTE(review): presumably produced by excerpt() — confirm in the engine
    status: Literal["ok", "error", "aborted"]
    result_excerpt: str  # NOTE(review): presumably produced by excerpt() — confirm in the engine
    duration_s: float
    fingerprint: str  # ProgressEngine compares fingerprint sets across iterations to detect repetition
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class IterationRecord(BaseModel):
    # Everything recorded about one loop iteration.
    index: int
    observation: str
    plan_text: str = ""
    actions: list[ActionRecord] = Field(default_factory=list)
    metric: MetricSnapshot | None = None  # None when no goal metric was measured
    repetition: bool = False  # overwritten by ProgressEngine.evaluate
    plan_invalid: bool = False  # read by ProgressEngine.evaluate to drive the streak counters
    input_tokens: int = 0
    output_tokens: int = 0
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class Metrics(BaseModel):
    # Running usage totals for a loop run.
    input_tokens: int = 0
    output_tokens: int = 0
    llm_calls: int = 0
    elapsed_s: float = 0.0
    cost_usd: float | None = None  # None means no USD figure is available

    @property
    def total_tokens(self) -> int:
        """Combined input and output token count."""
        return sum((self.input_tokens, self.output_tokens))
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
class PendingApproval(BaseModel):
    # An action waiting for human approval; stored on State.pending_approval.
    token: str
    tool: str
    args: dict[str, Any]
    reason: str
    action_hash: str
    state_version: int  # NOTE(review): presumably State.state_version when the request was made — confirm
    created_at: float  # NOTE(review): presumably a Unix timestamp in seconds — confirm
    ttl_s: float  # NOTE(review): presumably the approval lifetime in seconds — confirm
    plan_text: str = ""
    precondition_metric: MetricSnapshot | None = None
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
class State(BaseModel):
    # Complete serializable state of one loop run. save() and load() persist it
    # as a JSON envelope {"integrity": <sha256 hex>, "state": <payload>}.
    schema_version: int = SCHEMA_VERSION
    goal: str
    config: dict[str, Any] = Field(default_factory=dict)
    tool_schema_hash: str = ""
    iterations: list[IterationRecord] = Field(default_factory=list)
    artifacts: list[ContextArtifact] = Field(default_factory=list)
    pinned_facts: list[str] = Field(default_factory=list)
    failed_approaches: list[str] = Field(default_factory=list)
    metrics: Metrics = Field(default_factory=Metrics)
    no_progress_streak: int = 0
    plan_invalid_streak: int = 0
    state_version: int = 0
    pending_approval: PendingApproval | None = None

    def save(self, path: str | Path) -> None:
        """Write the state to *path* as an integrity-wrapped JSON document.

        The integrity value is the SHA-256 of the payload serialized with
        sorted keys. The document is written to a sibling ``<name>.tmp`` file
        and then moved over *path*, so an interrupted save does not leave a
        truncated state file behind.

        Raises:
            OSError: if the file cannot be written or moved into place.
        """
        payload = self.model_dump(mode="json")
        body = json.dumps(payload, sort_keys=True)
        wrapper = {
            "integrity": hashlib.sha256(body.encode()).hexdigest(),
            "state": payload,
        }
        target = Path(path)
        scratch = target.with_name(target.name + ".tmp")
        try:
            scratch.write_text(json.dumps(wrapper, indent=2), encoding="utf-8")
            scratch.replace(target)
        except OSError:
            # Best-effort cleanup of the partial temp file; the original
            # failure is the error worth reporting.
            try:
                scratch.unlink(missing_ok=True)
            except OSError:
                pass
            raise

    @classmethod
    def load(cls, path: str | Path) -> "State":
        """Read, verify, and validate a state file written by ``save``.

        Raises:
            StateError: if the file is unreadable, is not valid UTF-8 JSON, is
                not a state envelope, fails the integrity check, or carries an
                unsupported ``schema_version``.
        """
        try:
            wrapper = json.loads(Path(path).read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            raise StateError(f"cannot read state file {path}: {e}") from e
        # Valid JSON that is not an object (a list, a number, ...) is not a state file.
        payload = wrapper.get("state") if isinstance(wrapper, dict) else None
        if not isinstance(payload, dict):
            raise StateError(f"{path} is not a Loop state file")
        # Re-serialize exactly as save() did so the hashes are comparable.
        body = json.dumps(payload, sort_keys=True)
        if hashlib.sha256(body.encode()).hexdigest() != wrapper.get("integrity"):
            raise StateError(f"integrity check failed for {path}")
        if payload.get("schema_version") != SCHEMA_VERSION:
            raise StateError(
                f"state schema_version {payload.get('schema_version')} != supported {SCHEMA_VERSION}"
            )
        return cls.model_validate(payload)
|
loop/exit/__init__.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Declarative exit conditions. Evaluated after each iteration;
|
|
2
|
+
first matching exit wins. The always-on ceilings (max_iterations, token_limit,
|
|
3
|
+
wall-clock) are engine constructor args — these are additional conditions."""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from typing import Callable
|
|
7
|
+
|
|
8
|
+
from ..core.result import LoopStatus
|
|
9
|
+
from ..core.state import State
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class ExitCondition:
    """Base class for declarative exit conditions."""

    def evaluate(self, state: State) -> tuple[LoopStatus, str] | None:
        """Return ``(status, reason)`` to end the loop, or ``None`` to keep going.

        Subclasses must override this; the base implementation always raises.
        """
        raise NotImplementedError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class _Lambda(ExitCondition):
    """Adapter that turns a plain callable into an ExitCondition."""

    def __init__(self, fn: Callable[[State], tuple[LoopStatus, str] | None]):
        # Kept on the instance and invoked unchanged by evaluate().
        self.fn = fn

    def evaluate(self, state: State) -> tuple[LoopStatus, str] | None:
        """Delegate to the wrapped callable and return its verdict as-is."""
        verdict = self.fn(state)
        return verdict
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Exit:
    """Factory namespace for the built-in exit conditions.

    Each factory returns an ExitCondition whose ``evaluate`` yields a
    ``(LoopStatus, reason)`` pair when its threshold is met and ``None``
    otherwise.
    """

    @staticmethod
    def success(predicate: Callable[[State], bool], reason: str = "goal achieved") -> ExitCondition:
        """Exit with SUCCESS as soon as ``predicate(state)`` is truthy."""

        def _check(state: State) -> tuple[LoopStatus, str] | None:
            if predicate(state):
                return (LoopStatus.SUCCESS, reason)
            return None

        return _Lambda(_check)

    @staticmethod
    def max_iterations(n: int = 20) -> ExitCondition:
        """Exit with BUDGET_EXHAUSTED once at least *n* iterations are recorded."""

        def _check(state: State) -> tuple[LoopStatus, str] | None:
            if len(state.iterations) >= n:
                return (LoopStatus.BUDGET_EXHAUSTED, f"max_iterations {n} reached")
            return None

        return _Lambda(_check)

    @staticmethod
    def token_limit(n: int) -> ExitCondition:
        """Exit with BUDGET_EXHAUSTED once total tokens reach *n*."""

        def _check(state: State) -> tuple[LoopStatus, str] | None:
            if state.metrics.total_tokens >= n:
                return (LoopStatus.BUDGET_EXHAUSTED, f"token_limit {n} reached")
            return None

        return _Lambda(_check)

    @staticmethod
    def cost_limit(usd: float) -> ExitCondition:
        """Exit with BUDGET_EXHAUSTED once the recorded USD cost reaches *usd*.

        Never fires while ``state.metrics.cost_usd`` is ``None``.
        """

        def _check(state: State) -> tuple[LoopStatus, str] | None:
            if state.metrics.cost_usd is not None and state.metrics.cost_usd >= usd:
                return (LoopStatus.BUDGET_EXHAUSTED, f"cost_limit ${usd:.2f} reached")
            return None

        return _Lambda(_check)

    @staticmethod
    def stagnation(n: int = 3) -> ExitCondition:
        """Exit with NO_PROGRESS once the no-progress streak reaches *n*."""

        def _check(state: State) -> tuple[LoopStatus, str] | None:
            if state.no_progress_streak >= n:
                return (LoopStatus.NO_PROGRESS, f"stagnation: {state.no_progress_streak} flat iterations")
            return None

        return _Lambda(_check)
|
loop/llm/__init__.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""LLMClient protocol + canonical ToolCall normalization.
|
|
2
|
+
|
|
3
|
+
Provider responses are normalized into LLMResponse/ToolCallReq at this
|
|
4
|
+
boundary, so the engine handles hallucinated or malformed tool calls uniformly
|
|
5
|
+
regardless of provider leniency.
|
|
6
|
+
|
|
7
|
+
Timeout defaults are recommended values, not SLA-backed (see README).
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import time
|
|
12
|
+
from typing import Any, Callable
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
|
|
16
|
+
# Default timeout in seconds; the Anthropic and OpenAI adapters pass it to
# their SDK client constructors unless the caller overrides timeout_s.
DEFAULT_TIMEOUT_S = 120.0
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class ToolCallReq(BaseModel):
    # Provider-independent form of one tool call requested by the model.
    id: str = ""  # provider-assigned call id; empty when none was given
    name: str
    # Parsed arguments. The OpenAI adapter stores unparseable arguments under
    # the single key "__malformed__" instead of raising.
    args: dict[str, Any] = Field(default_factory=dict)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class LLMResponse(BaseModel):
    # Provider-independent form of one model response.
    text: str = ""
    tool_calls: list[ToolCallReq] = Field(default_factory=list)
    input_tokens: int = 0  # adapters fill these from the provider's usage report
    output_tokens: int = 0
    model_id: str = ""
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class LLMClient:
    """Base adapter interface; concrete clients override ``complete``."""

    # Class-level defaults; adapters overwrite both in their constructors.
    model_id: str = "unknown"
    timeout_s: float = DEFAULT_TIMEOUT_S

    def complete(
        self,
        messages: list[dict[str, str]],
        tool_schemas: list[dict[str, Any]],
        max_tokens: int,
    ) -> LLMResponse:
        """Send *messages* and *tool_schemas* to the model and return its response.

        The base implementation always raises NotImplementedError.
        """
        raise NotImplementedError
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class CallableLLM(LLMClient):
    """Adapter around any ``fn(messages, tool_schemas) -> LLMResponse``.

    Handy for tests and for calling raw provider APIs directly.
    """

    def __init__(self, fn: Callable[[list[dict], list[dict]], LLMResponse], model_id: str = "callable"):
        self.model_id = model_id
        self.fn = fn

    def complete(self, messages, tool_schemas, max_tokens):
        """Call the wrapped function; *max_tokens* is accepted but not forwarded."""
        wrapped = self.fn
        return wrapped(messages, tool_schemas)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def complete_with_retry(
    client: LLMClient,
    messages: list[dict[str, str]],
    tool_schemas: list[dict[str, Any]],
    max_tokens: int,
    retries: int = 1,
    backoff_s: float = 1.0,
) -> LLMResponse:
    """Call ``client.complete`` and retry failed calls with exponential backoff.

    With the defaults this is one retry after a one-second pause.

    Args:
        client: adapter whose ``complete`` is invoked.
        messages: chat messages, forwarded unchanged.
        tool_schemas: tool schemas, forwarded unchanged.
        max_tokens: completion budget, forwarded unchanged.
        retries: how many extra attempts to make after the first failure;
            values below zero are treated as zero.
        backoff_s: pause before the first retry, doubled before each later
            one; zero or less disables the pause.

    Returns:
        The first successful response.

    Raises:
        NotImplementedError: immediately, without retrying, because an adapter
            that does not implement ``complete`` cannot succeed on a retry.
        Exception: whatever the final attempt raised.
    """
    attempts_left = max(0, retries)
    delay = backoff_s
    while True:
        try:
            return client.complete(messages, tool_schemas, max_tokens)
        except NotImplementedError:
            raise
        except Exception:
            if attempts_left <= 0:
                raise
            attempts_left -= 1
            if delay > 0:
                time.sleep(delay)
            delay *= 2
|
loop/llm/anthropic.py
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Anthropic adapter. Requires the `anthropic` extra. Default timeout: 120s (not SLA-backed)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from . import DEFAULT_TIMEOUT_S, LLMClient, LLMResponse, ToolCallReq
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AnthropicLLM(LLMClient):
    """LLMClient adapter for the Anthropic Messages API.

    Importing the ``anthropic`` package is deferred to construction so the
    dependency stays optional.
    """

    def __init__(self, model: str = "claude-sonnet-4-6", timeout_s: float = DEFAULT_TIMEOUT_S, **client_kwargs: Any):
        """Create the SDK client.

        Args:
            model: model id sent with every request.
            timeout_s: timeout handed to the SDK client.
            **client_kwargs: forwarded unchanged to ``anthropic.Anthropic``.
        """
        import anthropic  # lazy: optional dependency

        self.model_id = model
        self.timeout_s = timeout_s
        self._client = anthropic.Anthropic(timeout=timeout_s, **client_kwargs)

    def complete(self, messages, tool_schemas, max_tokens):
        """Send one request and normalize the reply into an LLMResponse.

        System-role messages are joined with blank lines into the separate
        ``system`` parameter; all other messages are sent as the conversation.
        Text blocks are joined with newlines, ``tool_use`` blocks become
        ToolCallReq entries, and other block types are ignored.
        """
        system = "\n\n".join(m["content"] for m in messages if m["role"] == "system")
        convo = [m for m in messages if m["role"] != "system"]
        request: dict[str, Any] = {
            "model": self.model_id,
            "messages": convo,
            "max_tokens": max_tokens,
        }
        # Leave optional parameters out when there is nothing to send: an
        # explicit None is forwarded as a null value rather than treated as
        # "not given".
        if system:
            request["system"] = system
        if tool_schemas:
            request["tools"] = [
                {"name": s["name"], "description": s.get("description", ""), "input_schema": s["input_schema"]}
                for s in tool_schemas
            ]
        resp = self._client.messages.create(**request)

        text_parts: list[str] = []
        calls: list[ToolCallReq] = []
        for block in resp.content:
            if block.type == "text":
                text_parts.append(block.text)
            elif block.type == "tool_use":
                # Non-dict input is normalized to an empty argument dict.
                args = block.input if isinstance(block.input, dict) else {}
                calls.append(ToolCallReq(id=block.id, name=block.name, args=args))
        return LLMResponse(
            text="\n".join(text_parts),
            tool_calls=calls,
            input_tokens=resp.usage.input_tokens,
            output_tokens=resp.usage.output_tokens,
            model_id=self.model_id,
        )
|
loop/llm/openai.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""OpenAI adapter. Requires the `openai` extra. Default timeout: 120s (not SLA-backed)."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
from . import DEFAULT_TIMEOUT_S, LLMClient, LLMResponse, ToolCallReq
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class OpenAILLM(LLMClient):
    """LLMClient adapter for the OpenAI Chat Completions API.

    Importing the ``openai`` package is deferred to construction so the
    dependency stays optional.
    """

    def __init__(self, model: str = "gpt-4o", timeout_s: float = DEFAULT_TIMEOUT_S, **client_kwargs: Any):
        """Create the SDK client.

        Args:
            model: model id sent with every request.
            timeout_s: timeout handed to the SDK client.
            **client_kwargs: forwarded unchanged to ``openai.OpenAI``.
        """
        import openai  # lazy: optional dependency

        self.model_id = model
        self.timeout_s = timeout_s
        self._client = openai.OpenAI(timeout=timeout_s, **client_kwargs)

    def complete(self, messages, tool_schemas, max_tokens):
        """Send one request and normalize the first choice into an LLMResponse.

        Tool-call arguments arrive as a JSON string. Anything that does not
        decode to a JSON object is passed on as ``{"__malformed__": <raw>}``
        rather than raising here.
        """
        request: dict[str, Any] = {
            "model": self.model_id,
            "messages": messages,
            "max_tokens": max_tokens,
        }
        # Leave ``tools`` out when there are none: an explicit None is
        # forwarded as a null value rather than treated as "not given".
        if tool_schemas:
            request["tools"] = [
                {
                    "type": "function",
                    "function": {
                        "name": s["name"],
                        "description": s.get("description", ""),
                        "parameters": s["input_schema"],
                    },
                }
                for s in tool_schemas
            ]
        resp = self._client.chat.completions.create(**request)
        choice = resp.choices[0].message
        calls: list[ToolCallReq] = []
        for tc in choice.tool_calls or []:
            raw_args = tc.function.arguments
            try:
                args = json.loads(raw_args)
            except (json.JSONDecodeError, TypeError):
                # TypeError: the arguments were not a string at all (e.g. None).
                args = None
            if not isinstance(args, dict):
                # Keep the raw payload under a marker key so the call still
                # reaches the engine, which can report a structured error back
                # to the model.
                args = {"__malformed__": raw_args}
            calls.append(ToolCallReq(id=tc.id, name=tc.function.name, args=args))
        usage = resp.usage
        return LLMResponse(
            text=choice.content or "",
            tool_calls=calls,
            input_tokens=usage.prompt_tokens if usage else 0,
            output_tokens=usage.completion_tokens if usage else 0,
            model_id=self.model_id,
        )
|
loop/policy/__init__.py
ADDED
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Composable policies.
|
|
2
|
+
|
|
3
|
+
Precedence note: hard ceilings (iterations/tokens/wall-clock) are enforced by
|
|
4
|
+
the engine itself before every action and before granting any approval —
|
|
5
|
+
policies layer on top of those guarantees, they don't replace them.
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from enum import Enum
|
|
10
|
+
from typing import Any, Callable, Iterable
|
|
11
|
+
|
|
12
|
+
from pydantic import BaseModel
|
|
13
|
+
|
|
14
|
+
from ..core.result import LoopStatus
|
|
15
|
+
from ..core.state import State
|
|
16
|
+
from ..pricing import DEFAULT_PRICING, estimate_cost
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DecisionKind(str, Enum):
    # Outcomes a policy hook can return; the str mixin makes members plain strings too.
    CONTINUE = "CONTINUE"
    STOP = "STOP"
    PAUSE = "PAUSE" # request human approval
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Decision(BaseModel):
    # Result of a policy hook; defaults to "continue" with no reason.
    kind: DecisionKind = DecisionKind.CONTINUE
    reason: str = ""
    # Final loop status; the built-in policies set it only together with kind=STOP.
    status: LoopStatus | None = None # for STOP
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Shared "no objection" decision (kind=CONTINUE, empty reason) returned by the
# policy hooks in this module. It is a single instance, so treat it as read-only.
CONTINUE = Decision()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class Policy:
    """Base policy: both hooks let the loop continue unless overridden."""

    def before_iteration(self, state: State) -> Decision:
        """Hook taking the loop state; the default never intervenes."""
        return CONTINUE

    def before_action(self, state: State, tool_name: str, args: dict[str, Any]) -> Decision:
        """Hook taking a tool name and its arguments; the default never intervenes."""
        return CONTINUE
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
class NoProgress(Policy):
    """Stop after `window` consecutive flagged iterations with zero goal-metric delta.

    The streak itself is maintained by the ProgressEngine; this policy only
    reads ``state.no_progress_streak``.
    """

    def __init__(self, window: int = 3):
        self.window = window

    def before_iteration(self, state: State) -> Decision:
        """Return a NO_PROGRESS stop once the streak reaches the window."""
        streak = state.no_progress_streak
        stalled = streak >= self.window
        if not stalled:
            return CONTINUE
        explanation = (
            f"{streak} consecutive iterations with repeated/invalid "
            "actions and zero goal-metric delta"
        )
        return Decision(
            kind=DecisionKind.STOP,
            status=LoopStatus.NO_PROGRESS,
            reason=explanation,
        )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class CostLimit(Policy):
    """Stop the loop once the estimated USD cost reaches a limit.

    The estimate is recomputed from the token totals with ``estimate_cost``;
    if the model has no rates in the pricing table the policy never fires.
    """

    def __init__(self, usd: float, model_id: str, pricing: dict | None = None):
        self.usd = usd
        self.model_id = model_id
        # A missing or empty table falls back to the built-in defaults.
        self.pricing = pricing or DEFAULT_PRICING

    def before_iteration(self, state: State) -> Decision:
        """Return a BUDGET_EXHAUSTED stop when the estimate is at or over the limit."""
        usage = state.metrics
        cost = estimate_cost(usage.input_tokens, usage.output_tokens, self.model_id, self.pricing)
        over_budget = cost is not None and cost >= self.usd
        if not over_budget:
            return CONTINUE
        return Decision(
            kind=DecisionKind.STOP,
            status=LoopStatus.BUDGET_EXHAUSTED,
            reason=f"estimated cost ${cost:.2f} >= limit ${self.usd:.2f} (tokens are authoritative)",
        )
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
class RequireApproval(Policy):
    """Pause for human approval when an action matches.

    The matcher is either a callable ``(tool_name, args) -> bool``, an
    iterable of tool names, or a single tool name given as a string.
    """

    def __init__(self, matcher: Iterable[str] | Callable[[str, dict], bool], reason: str = "requires approval"):
        """Build the matching predicate.

        Args:
            matcher: callable predicate, iterable of tool names, or one tool name.
            reason: text attached to the PAUSE decision.
        """
        if callable(matcher):
            self._match = matcher
        else:
            # A bare string is itself an iterable of characters. Treat it as
            # one tool name, otherwise "deploy" would become {"d", "e", ...}
            # and the approval gate would silently never match.
            names = frozenset([matcher] if isinstance(matcher, str) else matcher)
            self._match = lambda name, args: name in names
        self.reason = reason

    def before_action(self, state: State, tool_name: str, args: dict[str, Any]) -> Decision:
        """Return a PAUSE decision when the action matches, else CONTINUE."""
        if self._match(tool_name, args):
            return Decision(kind=DecisionKind.PAUSE, reason=self.reason)
        return CONTINUE
|
loop/pricing.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""USD cost estimation. Tokens are the authoritative unit; USD is a convenience
|
|
2
|
+
derived from a pricing table that carries an as-of date.
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import warnings
|
|
7
|
+
from datetime import date, datetime
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
# A pricing table whose "as_of" date is more than this many days old is
# treated as stale by check_staleness.
STALENESS_DAYS = 90

# Built-in pricing table. "as_of" is a YYYY-MM-DD date string parsed by
# check_staleness; "models" maps a model id to its "input"/"output" rates,
# which estimate_cost divides by one million tokens.
DEFAULT_PRICING: dict[str, Any] = {
    "as_of": "2026-01-15",
    "models": {
        # USD per million tokens
        "claude-sonnet-4-6": {"input": 3.00, "output": 15.00},
        "claude-haiku-4-5-20251001": {"input": 1.00, "output": 5.00},
        "claude-opus-4-8": {"input": 15.00, "output": 75.00},
        "gpt-4o": {"input": 2.50, "output": 10.00},
    },
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class PricingStalenessError(Exception):
    """Raised by check_staleness for a stale pricing table when behavior is "refuse"."""
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def check_staleness(pricing: dict[str, Any], behavior: str = "warn", today: date | None = None) -> bool:
    """Decide whether USD cost accounting may use *pricing*.

    Args:
        pricing: table with an ``"as_of"`` date string in YYYY-MM-DD form.
        behavior: what to do when the table is stale. ``"refuse"`` raises,
            ``"token-only"`` returns False, and any other value warns and
            returns True.
        today: reference date; defaults to the current date.

    Returns:
        True to proceed in USD, False for the token-only fallback.

    Raises:
        PricingStalenessError: when the table is stale and behavior is "refuse".
    """
    stamped = datetime.strptime(pricing["as_of"], "%Y-%m-%d").date()
    reference_day = today or date.today()
    age = (reference_day - stamped).days
    if age <= STALENESS_DAYS:
        return True

    stale_note = f"pricing table as_of={pricing['as_of']} is {age} days old (> {STALENESS_DAYS})"
    if behavior == "refuse":
        raise PricingStalenessError(f"{stale_note}; refusing USD cost accounting")
    proceed_in_usd = behavior != "token-only"
    if proceed_in_usd:
        # stacklevel=2 attributes the warning to the caller of check_staleness.
        warnings.warn(f"{stale_note}; USD figures are estimates — tokens remain authoritative", stacklevel=2)
    return proceed_in_usd
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def estimate_cost(input_tokens: int, output_tokens: int, model_id: str, pricing: dict[str, Any]) -> float | None:
    """Estimate the USD cost of a token usage from a per-million-token rate table.

    Returns None when *pricing* has no (or empty) rates for *model_id*.
    """
    model_rates = pricing["models"].get(model_id)
    if model_rates:
        input_part = input_tokens * model_rates["input"]
        output_part = output_tokens * model_rates["output"]
        return (input_part + output_part) / 1_000_000
    return None
|
|
loop/progress/__init__.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Progress engine: raw signals, no fake-precision score.
|
|
2
|
+
|
|
3
|
+
Signals: blueprint goal metric (regression-aware), repetition detection via
|
|
4
|
+
per-tool fingerprints (advisory), LLM self-assessment (trace-only, never gates
|
|
5
|
+
exits in v1 — by design).
|
|
6
|
+
"""
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pydantic import BaseModel
|
|
10
|
+
|
|
11
|
+
from ..core.state import IterationRecord, MetricSnapshot, State
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class GoalMetric:
    """Blueprint-supplied goal metric.

    ``measure`` produces a snapshot and ``is_success`` decides whether that
    snapshot counts as SUCCESS.
    """

    def measure(self, observation: str, state: State) -> MetricSnapshot:
        """Measure the metric; the base implementation always raises."""
        raise NotImplementedError

    def is_success(self, snapshot: MetricSnapshot) -> bool:
        """Report whether *snapshot* meets the goal; the base class never does."""
        return False
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class ProgressReport(BaseModel):
    # Result of ProgressEngine.evaluate for one iteration.
    trend: str # improving | flat | regressing | unknown
    repetition: bool  # same non-empty fingerprint set as the previous iteration
    no_progress_streak: int  # value of state.no_progress_streak after the update
    metric_delta: float | None = None  # None when either iteration lacks a metric
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ProgressEngine:
    """Derives progress signals for an iteration and maintains the streak counters."""

    def evaluate(self, state: State, iteration: IterationRecord) -> ProgressReport:
        """Compare *iteration* with the last one in ``state.iterations``.

        Side effects: sets ``iteration.repetition`` and updates
        ``state.no_progress_streak`` and ``state.plan_invalid_streak``.
        The no-progress streak grows only when the iteration is flagged
        (repetition or invalid plan) and the metric delta is zero or unknown;
        otherwise it is reset to zero.
        """
        previous = state.iterations[-1] if state.iterations else None

        # Repetition: same non-empty fingerprint set as the previous iteration.
        current_prints = {action.fingerprint for action in iteration.actions}
        previous_prints = {action.fingerprint for action in previous.actions} if previous else set()
        iteration.repetition = bool(current_prints) and current_prints == previous_prints

        # A delta exists only when both iterations carry a metric.
        metric_delta: float | None = None
        if iteration.metric and previous and previous.metric:
            metric_delta = iteration.metric.value - previous.metric.value

        # An explicit regression flag wins over the sign of the delta.
        if iteration.metric and iteration.metric.regression:
            trend = "regressing"
        elif metric_delta is None:
            trend = "unknown"
        elif metric_delta > 0:
            trend = "improving"
        elif metric_delta < 0:
            trend = "regressing"
        else:
            trend = "flat"

        no_movement = metric_delta is None or metric_delta == 0
        is_flagged = iteration.repetition or iteration.plan_invalid
        state.no_progress_streak = state.no_progress_streak + 1 if (is_flagged and no_movement) else 0
        state.plan_invalid_streak = state.plan_invalid_streak + 1 if iteration.plan_invalid else 0

        return ProgressReport(
            trend=trend,
            repetition=iteration.repetition,
            no_progress_streak=state.no_progress_streak,
            metric_delta=metric_delta,
        )
|