agent-control-evaluator-budget 7.7.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,90 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+ MANIFEST
23
+
24
+ # Virtual environments
25
+ venv/
26
+ env/
27
+ ENV/
28
+ .venv
29
+
30
+ # UV
31
+ .uv/
32
+ uv.lock
33
+
34
+ # IDEs
35
+ .vscode/
36
+ .idea/
37
+ *.swp
38
+ *.swo
39
+ *~
40
+ .DS_Store
41
+ coverage-*.xml
42
+
43
+ # Testing
44
+ .pytest_cache/
45
+ .coverage
46
+ coverage-*.xml
47
+ htmlcov/
48
+ .tox/
49
+ .mypy_cache/
50
+ .ruff_cache/
51
+
52
+ # Playwright
53
+ playwright-report/
54
+ playwright/.cache/
55
+ test-results/
56
+
57
+ # Environment variables
58
+ .env
59
+ .env.local
60
+ .env.*.local
61
+
62
+ # Logs
63
+ *.log
64
+ logs/
65
+
66
+ # Database
67
+ *.db
68
+ *.sqlite3
69
+ server/openapi.json
70
+ server/.generated/
71
+
72
+ # Temporary files
73
+ tmp/
74
+ temp/
75
+ *.tmp
76
+
77
+ # OS
78
+ .DS_Store
79
+ Thumbs.db
80
+
81
+ # Intellij
82
+ *.iml
83
+
84
+ ## CLAUDE
85
+ .claude
86
+
87
+ # Local notes
88
+ rearch_plan.md
89
+
90
+ node_modules
@@ -0,0 +1,28 @@
1
+ .PHONY: help test lint lint-fix typecheck check build
2
+
3
+ help:
4
+ @echo "Agent Control Evaluator - Budget - Makefile commands"
5
+ @echo ""
6
+ @echo " make test - run pytest"
7
+ @echo " make lint - run ruff check"
8
+ @echo " make lint-fix - run ruff check --fix"
9
+ @echo " make typecheck - run mypy"
10
+ @echo " make check - run lint, typecheck, and test"
11
+ @echo " make build - build package"
12
+
13
+ test:
14
+ uv run --with pytest --with pytest-asyncio --with pytest-cov pytest tests --cov=src --cov-report=xml:../../../coverage-evaluators-budget.xml -q
15
+
16
+ lint:
17
+ uv run --with ruff ruff check --config ../../../pyproject.toml src/
18
+
19
+ lint-fix:
20
+ uv run --with ruff ruff check --config ../../../pyproject.toml --fix src/
21
+
22
+ typecheck:
23
+ uv run --with mypy mypy --config-file ../../../pyproject.toml src/
24
+
25
+ check: lint typecheck test
26
+
27
+ build:
28
+ uv build
@@ -0,0 +1,164 @@
1
+ Metadata-Version: 2.4
2
+ Name: agent-control-evaluator-budget
3
+ Version: 7.7.0
4
+ Summary: Budget evaluator for agent-control -- cumulative LLM cost and token tracking
5
+ Author: Agent Control Team
6
+ License: Apache-2.0
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: agent-control-evaluators>=7.7.0
9
+ Requires-Dist: agent-control-models>=7.7.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: mypy>=1.8.0; extra == 'dev'
12
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == 'dev'
13
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
14
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
15
+ Description-Content-Type: text/markdown
16
+
17
+ # agent-control-evaluator-budget
18
+
19
+ Budget evaluator for agent-control that tracks cumulative LLM token and cost usage per scope and time window.
20
+
21
+ ## Install
22
+
23
+ ```bash
24
+ pip install "agent-control-evaluators[budget]"
25
+ ```
26
+
27
+ Alternatively, install this package directly:
28
+
29
+ ```bash
30
+ pip install agent-control-evaluator-budget
31
+ ```
32
+
33
+ For local development:
34
+
35
+ ```bash
36
+ uv pip install -e evaluators/contrib/budget
37
+ ```
38
+
39
+ ## Quickstart
40
+
41
+ ```python
42
+ from agent_control_evaluator_budget.budget import (
43
+ BudgetEvaluatorConfig,
44
+ BudgetLimitRule,
45
+ ModelPricing,
46
+ )
47
+
48
+ config = BudgetEvaluatorConfig(
49
+ budget_id="support-daily",
50
+ limits=[
51
+ BudgetLimitRule(
52
+ scope={"agent": "support"},
53
+ group_by="user_id",
54
+ window_seconds=86_400,
55
+ limit=500,
56
+ limit_unit="usd_cents",
57
+ ),
58
+ BudgetLimitRule(
59
+ scope={"agent": "support"},
60
+ group_by="user_id",
61
+ window_seconds=86_400,
62
+ limit=50_000,
63
+ limit_unit="tokens",
64
+ ),
65
+ ],
66
+ pricing={
67
+ "gpt-4.1-mini": ModelPricing(input_per_1k=0.04, output_per_1k=0.16),
68
+ },
69
+ model_path="model",
70
+ metadata_paths={
71
+ "agent": "metadata.agent",
72
+ "user_id": "metadata.user_id",
73
+ },
74
+ unknown_model_behavior="block",
75
+ )
76
+ ```
77
+
78
+ The evaluator reads token usage from standard fields such as `usage.input_tokens` and `usage.output_tokens`. Configure `token_path` only when your event shape uses a custom location.
79
+
80
+ ## Scope and group_by
81
+
82
+ Each `BudgetLimitRule` has a static `scope` and an optional `group_by` field.
83
+
84
+ `scope` filters which events a rule applies to. A rule with `scope={"agent": "support"}` only applies when extracted metadata contains `agent="support"`. An empty scope is global.
85
+
86
+ `group_by` creates independent buckets per extracted metadata value. The common per-user pattern is:
87
+
88
+ ```python
89
+ BudgetLimitRule(
90
+ scope={"agent": "support"},
91
+ group_by="user_id",
92
+ window_seconds=86_400,
93
+ limit=500,
94
+ limit_unit="usd_cents",
95
+ )
96
+ ```
97
+
98
+ With `metadata_paths={"user_id": "metadata.user_id"}`, each user gets a separate daily budget inside the support scope.
99
+
100
+ ## Budget pools
101
+
102
+ `budget_id` identifies the accumulated budget pool.
103
+
104
+ Evaluators with the same `budget_id` share accumulated spend and token totals across all evaluator instances. Each evaluator still evaluates using its own configured rules -- the shared state is the bucket (the rolling sum), not the rule set. Evaluators with different `budget_id` values are fully isolated.
105
+
106
+ Use stable names such as `support-daily`, `billing-global`, or `tenant-acme-monthly`. Avoid generating a new `budget_id` per request unless each request should have an isolated budget.
107
+
108
+ ## Pricing
109
+
110
+ `ModelPricing` stores cost rates in cents per 1K tokens:
111
+
112
+ ```python
113
+ ModelPricing(input_per_1k=0.04, output_per_1k=0.16)
114
+ ```
115
+
116
+ `input_per_1k` is applied to input tokens. `output_per_1k` is applied to output tokens. For example, 10,000 input tokens at `input_per_1k=0.04` cost 0.4 cents.
117
+
118
+ Pricing and `model_path` are required when any rule uses `limit_unit="usd_cents"`. Token-only rules can omit both. If an event uses a model that is not in the pricing table and a cost rule exists, `unknown_model_behavior="block"` fails closed. Use `"warn"` to log a warning and treat the cost as 0.
119
+
120
+ ## Dual Ceiling Pattern
121
+
122
+ Use two evaluators when cost and token ceilings need independent control records or different `budget_id` pools:
123
+
124
+ ```python
125
+ cost_config = BudgetEvaluatorConfig(
126
+ budget_id="support-cost-daily",
127
+ limits=[
128
+ BudgetLimitRule(
129
+ scope={"agent": "support"},
130
+ group_by="user_id",
131
+ window_seconds=86_400,
132
+ limit=500,
133
+ limit_unit="usd_cents",
134
+ )
135
+ ],
136
+ pricing={
137
+ "gpt-4.1-mini": ModelPricing(input_per_1k=0.04, output_per_1k=0.16),
138
+ },
139
+ model_path="model",
140
+ metadata_paths={"agent": "metadata.agent", "user_id": "metadata.user_id"},
141
+ )
142
+
143
+ token_config = BudgetEvaluatorConfig(
144
+ budget_id="support-token-daily",
145
+ limits=[
146
+ BudgetLimitRule(
147
+ scope={"agent": "support"},
148
+ group_by="user_id",
149
+ window_seconds=86_400,
150
+ limit=50_000,
151
+ limit_unit="tokens",
152
+ )
153
+ ],
154
+ metadata_paths={"agent": "metadata.agent", "user_id": "metadata.user_id"},
155
+ )
156
+ ```
157
+
158
+ This pattern lets cost and token budgets reset, alert, and roll out independently. A single evaluator can also contain both rules when one shared pool and one control result are sufficient.
159
+
160
+ ## Limitations
161
+
162
+ `InMemoryBudgetStore` is single-process only. State is lost on restart and is not shared across workers or pods.
163
+
164
+ Use a distributed store for production deployments that run multiple processes, multiple workers, or multiple pods.
@@ -0,0 +1,148 @@
1
+ # agent-control-evaluator-budget
2
+
3
+ Budget evaluator for agent-control that tracks cumulative LLM token and cost usage per scope and time window.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install "agent-control-evaluators[budget]"
9
+ ```
10
+
11
+ Alternatively, install this package directly:
12
+
13
+ ```bash
14
+ pip install agent-control-evaluator-budget
15
+ ```
16
+
17
+ For local development:
18
+
19
+ ```bash
20
+ uv pip install -e evaluators/contrib/budget
21
+ ```
22
+
23
+ ## Quickstart
24
+
25
+ ```python
26
+ from agent_control_evaluator_budget.budget import (
27
+ BudgetEvaluatorConfig,
28
+ BudgetLimitRule,
29
+ ModelPricing,
30
+ )
31
+
32
+ config = BudgetEvaluatorConfig(
33
+ budget_id="support-daily",
34
+ limits=[
35
+ BudgetLimitRule(
36
+ scope={"agent": "support"},
37
+ group_by="user_id",
38
+ window_seconds=86_400,
39
+ limit=500,
40
+ limit_unit="usd_cents",
41
+ ),
42
+ BudgetLimitRule(
43
+ scope={"agent": "support"},
44
+ group_by="user_id",
45
+ window_seconds=86_400,
46
+ limit=50_000,
47
+ limit_unit="tokens",
48
+ ),
49
+ ],
50
+ pricing={
51
+ "gpt-4.1-mini": ModelPricing(input_per_1k=0.04, output_per_1k=0.16),
52
+ },
53
+ model_path="model",
54
+ metadata_paths={
55
+ "agent": "metadata.agent",
56
+ "user_id": "metadata.user_id",
57
+ },
58
+ unknown_model_behavior="block",
59
+ )
60
+ ```
61
+
62
+ The evaluator reads token usage from standard fields such as `usage.input_tokens` and `usage.output_tokens`. Configure `token_path` only when your event shape uses a custom location.
63
+
64
+ ## Scope and group_by
65
+
66
+ Each `BudgetLimitRule` has a static `scope` and an optional `group_by` field.
67
+
68
+ `scope` filters which events a rule applies to. A rule with `scope={"agent": "support"}` only applies when extracted metadata contains `agent="support"`. An empty scope is global.
69
+
70
+ `group_by` creates independent buckets per extracted metadata value. The common per-user pattern is:
71
+
72
+ ```python
73
+ BudgetLimitRule(
74
+ scope={"agent": "support"},
75
+ group_by="user_id",
76
+ window_seconds=86_400,
77
+ limit=500,
78
+ limit_unit="usd_cents",
79
+ )
80
+ ```
81
+
82
+ With `metadata_paths={"user_id": "metadata.user_id"}`, each user gets a separate daily budget inside the support scope.
83
+
84
+ ## Budget pools
85
+
86
+ `budget_id` identifies the accumulated budget pool.
87
+
88
+ Evaluators with the same `budget_id` share accumulated spend and token totals across all evaluator instances. Each evaluator still evaluates using its own configured rules -- the shared state is the bucket (the rolling sum), not the rule set. Evaluators with different `budget_id` values are fully isolated.
89
+
90
+ Use stable names such as `support-daily`, `billing-global`, or `tenant-acme-monthly`. Avoid generating a new `budget_id` per request unless each request should have an isolated budget.
91
+
92
+ ## Pricing
93
+
94
+ `ModelPricing` stores cost rates in cents per 1K tokens:
95
+
96
+ ```python
97
+ ModelPricing(input_per_1k=0.04, output_per_1k=0.16)
98
+ ```
99
+
100
+ `input_per_1k` is applied to input tokens. `output_per_1k` is applied to output tokens. For example, 10,000 input tokens at `input_per_1k=0.04` cost 0.4 cents.
101
+
102
+ Pricing and `model_path` are required when any rule uses `limit_unit="usd_cents"`. Token-only rules can omit both. If an event uses a model that is not in the pricing table and a cost rule exists, `unknown_model_behavior="block"` fails closed. Use `"warn"` to log a warning and treat the cost as 0.
103
+
104
+ ## Dual Ceiling Pattern
105
+
106
+ Use two evaluators when cost and token ceilings need independent control records or different `budget_id` pools:
107
+
108
+ ```python
109
+ cost_config = BudgetEvaluatorConfig(
110
+ budget_id="support-cost-daily",
111
+ limits=[
112
+ BudgetLimitRule(
113
+ scope={"agent": "support"},
114
+ group_by="user_id",
115
+ window_seconds=86_400,
116
+ limit=500,
117
+ limit_unit="usd_cents",
118
+ )
119
+ ],
120
+ pricing={
121
+ "gpt-4.1-mini": ModelPricing(input_per_1k=0.04, output_per_1k=0.16),
122
+ },
123
+ model_path="model",
124
+ metadata_paths={"agent": "metadata.agent", "user_id": "metadata.user_id"},
125
+ )
126
+
127
+ token_config = BudgetEvaluatorConfig(
128
+ budget_id="support-token-daily",
129
+ limits=[
130
+ BudgetLimitRule(
131
+ scope={"agent": "support"},
132
+ group_by="user_id",
133
+ window_seconds=86_400,
134
+ limit=50_000,
135
+ limit_unit="tokens",
136
+ )
137
+ ],
138
+ metadata_paths={"agent": "metadata.agent", "user_id": "metadata.user_id"},
139
+ )
140
+ ```
141
+
142
+ This pattern lets cost and token budgets reset, alert, and roll out independently. A single evaluator can also contain both rules when one shared pool and one control result are sufficient.
143
+
144
+ ## Limitations
145
+
146
+ `InMemoryBudgetStore` is single-process only. State is lost on restart and is not shared across workers or pods.
147
+
148
+ Use a distributed store for production deployments that run multiple processes, multiple workers, or multiple pods.
@@ -0,0 +1,47 @@
1
+ [project]
2
+ name = "agent-control-evaluator-budget"
3
+ version = "7.7.0"
4
+ description = "Budget evaluator for agent-control -- cumulative LLM cost and token tracking"
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ license = { text = "Apache-2.0" }
8
+ authors = [{ name = "Agent Control Team" }]
9
+ dependencies = [
10
+ "agent-control-evaluators>=7.7.0",
11
+ "agent-control-models>=7.7.0",
12
+ ]
13
+
14
+ [project.optional-dependencies]
15
+ dev = [
16
+ "pytest>=8.0.0",
17
+ "pytest-asyncio>=0.23.0",
18
+ "ruff>=0.1.0",
19
+ "mypy>=1.8.0",
20
+ ]
21
+
22
+ [project.entry-points."agent_control.evaluators"]
23
+ budget = "agent_control_evaluator_budget.budget:BudgetEvaluator"
24
+
25
+ [build-system]
26
+ requires = ["hatchling"]
27
+ build-backend = "hatchling.build"
28
+
29
+ [tool.hatch.build.targets.wheel]
30
+ packages = ["src/agent_control_evaluator_budget"]
31
+
32
+ [tool.ruff]
33
+ line-length = 100
34
+ target-version = "py312"
35
+
36
+ [tool.ruff.lint]
37
+ select = ["E", "F", "I"]
38
+
39
+ [tool.uv.sources]
40
+ agent-control-evaluators = { path = "../../builtin", editable = true }
41
+ agent-control-models = { path = "../../../models", editable = true }
42
+
43
+ [dependency-groups]
44
+ dev = [
45
+ "pytest>=9.0.2",
46
+ "pytest-asyncio>=1.3.0",
47
+ ]
@@ -0,0 +1,24 @@
1
+ """Budget evaluator for per-agent LLM cost and token tracking."""
2
+
3
+ from agent_control_evaluator_budget.budget.config import (
4
+ BudgetEvaluatorConfig,
5
+ BudgetLimitRule,
6
+ ModelPricing,
7
+ )
8
+ from agent_control_evaluator_budget.budget.evaluator import BudgetEvaluator
9
+ from agent_control_evaluator_budget.budget.memory_store import InMemoryBudgetStore
10
+ from agent_control_evaluator_budget.budget.store import BudgetSnapshot, BudgetStore
11
+
12
+ # Note: clear_budget_stores is a testing utility and is intentionally not
13
+ # re-exported here. Import it directly from the evaluator submodule in tests:
14
+ # from agent_control_evaluator_budget.budget.evaluator import clear_budget_stores
15
+
16
+ __all__ = [
17
+ "BudgetEvaluator",
18
+ "BudgetEvaluatorConfig",
19
+ "BudgetLimitRule",
20
+ "BudgetSnapshot",
21
+ "BudgetStore",
22
+ "InMemoryBudgetStore",
23
+ "ModelPricing",
24
+ ]
@@ -0,0 +1,120 @@
1
+ """Configuration for the budget evaluator."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ from agent_control_evaluators._base import EvaluatorConfig
8
+ from pydantic import Field, field_validator, model_validator
9
+
10
# ---------------------------------------------------------------------------
# Convenience durations for ``window_seconds`` (every value is in seconds)
# ---------------------------------------------------------------------------

WINDOW_HOURLY = 60 * 60
WINDOW_DAILY = 24 * WINDOW_HOURLY
WINDOW_WEEKLY = 7 * WINDOW_DAILY
WINDOW_MONTHLY = 30 * WINDOW_DAILY  # fixed 30-day span, not a calendar month
18
+
19
+
20
class ModelPricing(EvaluatorConfig):
    """Per-model token pricing, expressed in cents per 1K tokens.

    Attributes:
        input_per_1k: Rate in cents applied to every 1,000 input tokens.
        output_per_1k: Rate in cents applied to every 1,000 output tokens.

    Both rates default to 0.0. They must be finite and non-negative:
    a negative rate would price usage as a negative cost, and NaN or
    infinity would make any cost computed from it meaningless.
    """

    # ``ge=0.0`` rejects negative rates (and NaN, which fails the comparison);
    # ``allow_inf_nan=False`` additionally rejects +/-inf.
    input_per_1k: float = Field(default=0.0, ge=0.0, allow_inf_nan=False)
    output_per_1k: float = Field(default=0.0, ge=0.0, allow_inf_nan=False)
25
+
26
+
27
class BudgetLimitRule(EvaluatorConfig):
    """One ceiling on usage for a given scope and time window.

    A rule pairs a set of scope dimensions with a time window and a
    maximum. Several rules may apply to the same step; the evaluator
    checks every one of them and triggers on the first breach.

    Attributes:
        scope: Static dimensions that must all match for the rule to
            apply. An empty dict makes the rule global.
            Examples:
                {"agent": "summarizer"} -- per-agent limit
                {"agent": "summarizer", "channel": "slack"} -- agent+channel limit
        group_by: Name of a dimension to bucket by. When set, each
            distinct value of that dimension (e.g. each "user_id")
            gets its own independent budget. None means a single
            shared/global limit.
        window_seconds: Length of the accumulation window in seconds.
            None means cumulative usage that never resets. See the
            WINDOW_* constants for common values.
        limit: Maximum usage allowed within the window, measured in
            ``limit_unit``.
        limit_unit: Unit of ``limit``. "usd_cents" checks spend;
            "tokens" checks input + output tokens.
    """

    scope: dict[str, str] = Field(default_factory=dict)
    group_by: str | None = None
    window_seconds: int | None = None
    limit: int
    limit_unit: Literal["usd_cents", "tokens"] = "usd_cents"

    @field_validator("limit")
    @classmethod
    def validate_limit(cls, v: int) -> int:
        """Accept only strictly positive limits."""
        if v > 0:
            return v
        raise ValueError("limit must be a positive integer")

    @field_validator("window_seconds")
    @classmethod
    def validate_window_seconds(cls, v: int | None) -> int | None:
        """Accept None (no window) or a strictly positive number of seconds."""
        if v is None or v > 0:
            return v
        raise ValueError("window_seconds must be positive")
69
+
70
+
71
class BudgetEvaluatorConfig(EvaluatorConfig):
    """Settings for the budget evaluator.

    Attributes:
        limits: Budget limit rules; at least one is required and each
            is checked independently.
        budget_id: Identifier of the budget pool. Evaluators with the
            same budget_id share accumulated spend; different
            budget_id values are fully isolated.
        unknown_model_behavior: Handling of a model that is missing
            from the pricing table while a cost-based rule exists.
            "block" fails closed; "warn" logs a warning and treats the
            cost as 0.
        pricing: Optional table mapping model name to ModelPricing.
            Required when any rule uses limit_unit="usd_cents". Used
            to derive cost (in cents, the unit of the ModelPricing
            rates) from token counts and the model name.
        token_path: Dot-notation path to the token usage in step data
            (e.g. "usage.total_tokens"). When None, standard fields
            are consulted (input_tokens, output_tokens, total_tokens,
            usage).
        model_path: Dot-notation path to the model name used for the
            pricing lookup. Required when any rule uses
            limit_unit="usd_cents".
        metadata_paths: Maps a metadata field name to its dot-notation
            path in step data. Supplies the scope dimensions
            (channel, user_id, etc).
    """

    limits: list[BudgetLimitRule] = Field(min_length=1)
    budget_id: str = Field(
        default="default",
        description=(
            "Unique budget pool identifier. Same budget_id shares accumulated spend. "
            "Different budget_id is fully isolated."
        ),
    )
    unknown_model_behavior: Literal["block", "warn"] = Field(
        default="block",
        description=(
            "What to do when a model is not found in the pricing table and a cost-based "
            "rule exists. block=fail closed, warn=log warning and treat cost as 0."
        ),
    )
    pricing: dict[str, ModelPricing] | None = None
    token_path: str | None = None
    model_path: str | None = None
    metadata_paths: dict[str, str] = Field(default_factory=dict)

    @model_validator(mode="after")
    def require_pricing_for_cost_rules(self) -> BudgetEvaluatorConfig:
        """Ensure cost-based rules come with a pricing table and a model path.

        Raises:
            ValueError: If any rule uses limit_unit="usd_cents" while
                ``pricing`` is None, or while ``model_path`` is None,
                empty, or whitespace-only.
        """
        # Token-only configurations need neither pricing nor model_path.
        if all(rule.limit_unit != "usd_cents" for rule in self.limits):
            return self

        if self.pricing is None:
            raise ValueError('pricing is required when any rule uses limit_unit="usd_cents"')

        # A missing path and a blank path are treated the same way.
        model_path = self.model_path or ""
        if not model_path.strip():
            raise ValueError('model_path is required when any rule uses limit_unit="usd_cents"')

        return self