mixpilot 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mixpilot-0.1.0/PKG-INFO +121 -0
- mixpilot-0.1.0/README.md +103 -0
- mixpilot-0.1.0/pyproject.toml +28 -0
- mixpilot-0.1.0/setup.cfg +4 -0
- mixpilot-0.1.0/src/mixpilot/__init__.py +18 -0
- mixpilot-0.1.0/src/mixpilot/agent/__init__.py +3 -0
- mixpilot-0.1.0/src/mixpilot/agent/client.py +43 -0
- mixpilot-0.1.0/src/mixpilot/agent/groq_client.py +118 -0
- mixpilot-0.1.0/src/mixpilot/agent/loop.py +110 -0
- mixpilot-0.1.0/src/mixpilot/safety/__init__.py +2 -0
- mixpilot-0.1.0/src/mixpilot/safety/approval.py +41 -0
- mixpilot-0.1.0/src/mixpilot/tools/__init__.py +3 -0
- mixpilot-0.1.0/src/mixpilot/tools/adstock.py +78 -0
- mixpilot-0.1.0/src/mixpilot/tools/attribution.py +157 -0
- mixpilot-0.1.0/src/mixpilot/tools/base.py +107 -0
- mixpilot-0.1.0/src/mixpilot/tools/budget.py +99 -0
- mixpilot-0.1.0/src/mixpilot/tools/dataquality.py +102 -0
- mixpilot-0.1.0/src/mixpilot/tools/registry.py +57 -0
- mixpilot-0.1.0/src/mixpilot/tools/saturation.py +96 -0
- mixpilot-0.1.0/src/mixpilot.egg-info/PKG-INFO +121 -0
- mixpilot-0.1.0/src/mixpilot.egg-info/SOURCES.txt +23 -0
- mixpilot-0.1.0/src/mixpilot.egg-info/dependency_links.txt +1 -0
- mixpilot-0.1.0/src/mixpilot.egg-info/requires.txt +9 -0
- mixpilot-0.1.0/src/mixpilot.egg-info/top_level.txt +1 -0
- mixpilot-0.1.0/tests/test_tools.py +100 -0
mixpilot-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mixpilot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An agentic harness for marketing measurement: adstock, saturation, attribution, and budget allocation as typed agent tools with an eval suite.
|
|
5
|
+
Author: Mohit Luthra
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mohit-luthra/mixpilot
|
|
8
|
+
Keywords: marketing-mix-modeling,mmm,attribution,agents,llm,adstock,saturation
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: numpy>=1.24
|
|
12
|
+
Requires-Dist: scipy>=1.10
|
|
13
|
+
Requires-Dist: pydantic>=2.0
|
|
14
|
+
Provides-Extra: agent
|
|
15
|
+
Requires-Dist: anthropic>=0.39; extra == "agent"
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
18
|
+
|
|
19
|
+
# MixPilot
|
|
20
|
+
|
|
21
|
+
**An agentic harness for marketing measurement.**
|
|
22
|
+
|
|
23
|
+
Everyone is building general-purpose agents. MixPilot is the opposite bet: a small,
|
|
24
|
+
real agent runtime whose entire action space is the marketing-science toolkit —
|
|
25
|
+
adstock, saturation, multi-touch attribution, budget optimisation, and a
|
|
26
|
+
marketing-aware data-quality audit — wired as typed tools with structured results,
|
|
27
|
+
an approval gate, and an eval suite that scores the agent's *method-selection
|
|
28
|
+
judgment*.
|
|
29
|
+
|
|
30
|
+
The thesis: in a domain agent, the tools are the product. A model can sound
|
|
31
|
+
confident about marketing data; the hard part is knowing which method the data can
|
|
32
|
+
actually support — and refusing the ones it can't. That judgment is what MixPilot
|
|
33
|
+
encodes and tests.
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install mixpilot # tools only
|
|
37
|
+
pip install "mixpilot[agent]" # + the Anthropic-backed agent loop
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## What's inside
|
|
41
|
+
|
|
42
|
+
| Component | What it does | The lesson |
|
|
43
|
+
|---|---|---|
|
|
44
|
+
| `audit_data_quality` | nulls, negative spend, zero-variance, **multicollinearity** | The audit decides whether any later number is trustworthy at all |
|
|
45
|
+
| `adstock_transform` | geometric carryover with half-life | Carryover before saturation, always |
|
|
46
|
+
| `fit_saturation_curve` | Hill curve (β, α, γ) + R² | A low-R² curve is a warning, not a result |
|
|
47
|
+
| `run_attribution_model` | last-touch / linear / **Markov removal-effect** | Match the model to the data shape, never overreach |
|
|
48
|
+
| `allocate_budget` | constrained optimisation over saturation curves | Move money toward unsaturated marginal return |
|
|
49
|
+
| `ToolResult` contract | summary + next_actions + recovery_hint on every call | A tool result is the next observation, not a log line |
|
|
50
|
+
| Approval gate | mutating actions clear a policy gate outside the prompt | Safety lives in code, not prose |
|
|
51
|
+
| Agent loop | turn budget + loop detection + stop conditions | The loop is a control system, not a while-tool-calls toy |
|
|
52
|
+
| Eval suite | data shape → required/forbidden method | Test the judgment, not the prose |
|
|
53
|
+
|
|
54
|
+
## The judgment, made concrete
|
|
55
|
+
|
|
56
|
+
Same data, two attribution models:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from mixpilot.tools.attribution import run_attribution_model
|
|
60
|
+
|
|
61
|
+
paths = [{"path": ["Social", "Search"], "converted": True}] * 40
|
|
62
|
+
paths += [{"path": ["Social"], "converted": False}] * 20
|
|
63
|
+
|
|
64
|
+
run_attribution_model(paths, "last_touch").artifacts["share"]
|
|
65
|
+
# {'Search': 1.0, 'Social': 0.0} <- the assist is thrown away
|
|
66
|
+
run_attribution_model(paths, "markov").artifacts["share"]
|
|
67
|
+
# {'Search': 0.5, 'Social': 0.5} <- both channels are necessary
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Last-touch hands Social nothing. Markov's removal effect sees that every conversion
|
|
71
|
+
needed Social first, and splits the credit. That gap — assist value — is the entire
|
|
72
|
+
reason multi-touch attribution exists.
|
|
73
|
+
|
|
74
|
+
## Running the agent
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from mixpilot import Agent
|
|
78
|
+
from mixpilot.agent.client import AnthropicClient
|
|
79
|
+
|
|
80
|
+
agent = Agent(AnthropicClient()) # needs ANTHROPIC_API_KEY
|
|
81
|
+
result = agent.run(
|
|
82
|
+
"Here are 12 weeks of spend and sales for one channel ... "
|
|
83
|
+
"what's the saturation point and should we spend more?"
|
|
84
|
+
)
|
|
85
|
+
print(result.final_text)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
The loop injects the client, so it runs against a real model in production and a
|
|
89
|
+
scripted client in evals — no network needed to test the harness.
|
|
90
|
+
|
|
91
|
+
## Evals: scoring method selection
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
python evals/run_evals.py # offline, deterministic
|
|
95
|
+
python evals/run_evals.py --live # against a real model
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Each case pairs a data shape with the methods it does and does not support. A case
|
|
99
|
+
passes only if the agent uses every required tool and avoids every forbidden one —
|
|
100
|
+
e.g. it must **not** run Markov attribution on single-touch data, and must audit
|
|
101
|
+
before claiming per-channel effects on collinear spend.
|
|
102
|
+
|
|
103
|
+
## Tests
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
python -m pytest -q # 10 passing — domain math + harness contracts
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
The tests never call a model. They prove the adstock conserves mass, the Hill fit
|
|
110
|
+
recovers known parameters, the Markov removal effect credits assists, the optimiser
|
|
111
|
+
moves budget toward unsaturated channels, the audit catches multicollinearity, and
|
|
112
|
+
the approval gate blocks mutating actions. Harness reliability that has nothing to do
|
|
113
|
+
with model intelligence.
|
|
114
|
+
|
|
115
|
+
## Scope
|
|
116
|
+
|
|
117
|
+
This is deliberately small. No context compaction, no MCP, no subagents — those are
|
|
118
|
+
solved problems in general harnesses. The point here is the opposite: how far you get
|
|
119
|
+
when the action space is a real domain and the tools enforce the judgment.
|
|
120
|
+
|
|
121
|
+
MIT licensed.
|
mixpilot-0.1.0/README.md
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# MixPilot
|
|
2
|
+
|
|
3
|
+
**An agentic harness for marketing measurement.**
|
|
4
|
+
|
|
5
|
+
Everyone is building general-purpose agents. MixPilot is the opposite bet: a small,
|
|
6
|
+
real agent runtime whose entire action space is the marketing-science toolkit —
|
|
7
|
+
adstock, saturation, multi-touch attribution, budget optimisation, and a
|
|
8
|
+
marketing-aware data-quality audit — wired as typed tools with structured results,
|
|
9
|
+
an approval gate, and an eval suite that scores the agent's *method-selection
|
|
10
|
+
judgment*.
|
|
11
|
+
|
|
12
|
+
The thesis: in a domain agent, the tools are the product. A model can sound
|
|
13
|
+
confident about marketing data; the hard part is knowing which method the data can
|
|
14
|
+
actually support — and refusing the ones it can't. That judgment is what MixPilot
|
|
15
|
+
encodes and tests.
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install mixpilot # tools only
|
|
19
|
+
pip install "mixpilot[agent]" # + the Anthropic-backed agent loop
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
## What's inside
|
|
23
|
+
|
|
24
|
+
| Component | What it does | The lesson |
|
|
25
|
+
|---|---|---|
|
|
26
|
+
| `audit_data_quality` | nulls, negative spend, zero-variance, **multicollinearity** | The audit decides whether any later number is trustworthy at all |
|
|
27
|
+
| `adstock_transform` | geometric carryover with half-life | Carryover before saturation, always |
|
|
28
|
+
| `fit_saturation_curve` | Hill curve (β, α, γ) + R² | A low-R² curve is a warning, not a result |
|
|
29
|
+
| `run_attribution_model` | last-touch / linear / **Markov removal-effect** | Match the model to the data shape, never overreach |
|
|
30
|
+
| `allocate_budget` | constrained optimisation over saturation curves | Move money toward unsaturated marginal return |
|
|
31
|
+
| `ToolResult` contract | summary + next_actions + recovery_hint on every call | A tool result is the next observation, not a log line |
|
|
32
|
+
| Approval gate | mutating actions clear a policy gate outside the prompt | Safety lives in code, not prose |
|
|
33
|
+
| Agent loop | turn budget + loop detection + stop conditions | The loop is a control system, not a while-tool-calls toy |
|
|
34
|
+
| Eval suite | data shape → required/forbidden method | Test the judgment, not the prose |
|
|
35
|
+
|
|
36
|
+
## The judgment, made concrete
|
|
37
|
+
|
|
38
|
+
Same data, two attribution models:
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from mixpilot.tools.attribution import run_attribution_model
|
|
42
|
+
|
|
43
|
+
paths = [{"path": ["Social", "Search"], "converted": True}] * 40
|
|
44
|
+
paths += [{"path": ["Social"], "converted": False}] * 20
|
|
45
|
+
|
|
46
|
+
run_attribution_model(paths, "last_touch").artifacts["share"]
|
|
47
|
+
# {'Search': 1.0, 'Social': 0.0} <- the assist is thrown away
|
|
48
|
+
run_attribution_model(paths, "markov").artifacts["share"]
|
|
49
|
+
# {'Search': 0.5, 'Social': 0.5} <- both channels are necessary
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
Last-touch hands Social nothing. Markov's removal effect sees that every conversion
|
|
53
|
+
needed Social first, and splits the credit. That gap — assist value — is the entire
|
|
54
|
+
reason multi-touch attribution exists.
|
|
55
|
+
|
|
56
|
+
## Running the agent
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from mixpilot import Agent
|
|
60
|
+
from mixpilot.agent.client import AnthropicClient
|
|
61
|
+
|
|
62
|
+
agent = Agent(AnthropicClient()) # needs ANTHROPIC_API_KEY
|
|
63
|
+
result = agent.run(
|
|
64
|
+
"Here are 12 weeks of spend and sales for one channel ... "
|
|
65
|
+
"what's the saturation point and should we spend more?"
|
|
66
|
+
)
|
|
67
|
+
print(result.final_text)
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
The loop injects the client, so it runs against a real model in production and a
|
|
71
|
+
scripted client in evals — no network needed to test the harness.
|
|
72
|
+
|
|
73
|
+
## Evals: scoring method selection
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
python evals/run_evals.py # offline, deterministic
|
|
77
|
+
python evals/run_evals.py --live # against a real model
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Each case pairs a data shape with the methods it does and does not support. A case
|
|
81
|
+
passes only if the agent uses every required tool and avoids every forbidden one —
|
|
82
|
+
e.g. it must **not** run Markov attribution on single-touch data, and must audit
|
|
83
|
+
before claiming per-channel effects on collinear spend.
|
|
84
|
+
|
|
85
|
+
## Tests
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
python -m pytest -q # 10 passing — domain math + harness contracts
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
The tests never call a model. They prove the adstock conserves mass, the Hill fit
|
|
92
|
+
recovers known parameters, the Markov removal effect credits assists, the optimiser
|
|
93
|
+
moves budget toward unsaturated channels, the audit catches multicollinearity, and
|
|
94
|
+
the approval gate blocks mutating actions. Harness reliability that has nothing to do
|
|
95
|
+
with model intelligence.
|
|
96
|
+
|
|
97
|
+
## Scope
|
|
98
|
+
|
|
99
|
+
This is deliberately small. No context compaction, no MCP, no subagents — those are
|
|
100
|
+
solved problems in general harnesses. The point here is the opposite: how far you get
|
|
101
|
+
when the action space is a real domain and the tools enforce the judgment.
|
|
102
|
+
|
|
103
|
+
MIT licensed.
|
|
mixpilot-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "mixpilot"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "An agentic harness for marketing measurement: adstock, saturation, attribution, and budget allocation as typed agent tools with an eval suite."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Mohit Luthra" }]
|
|
13
|
+
keywords = ["marketing-mix-modeling", "mmm", "attribution", "agents", "llm", "adstock", "saturation"]
|
|
14
|
+
dependencies = ["numpy>=1.24", "scipy>=1.10", "pydantic>=2.0"]
|
|
15
|
+
|
|
16
|
+
[project.optional-dependencies]
|
|
17
|
+
agent = ["anthropic>=0.39"]
|
|
18
|
+
dev = ["pytest>=7.0"]
|
|
19
|
+
|
|
20
|
+
[project.urls]
|
|
21
|
+
Homepage = "https://github.com/mohit-luthra/mixpilot"
|
|
22
|
+
|
|
23
|
+
[tool.setuptools.packages.find]
|
|
24
|
+
where = ["src"]
|
|
25
|
+
|
|
26
|
+
[tool.pytest.ini_options]
|
|
27
|
+
pythonpath = ["src"]
|
|
28
|
+
testpaths = ["tests"]
|
mixpilot-0.1.0/setup.cfg
ADDED
|
mixpilot-0.1.0/src/mixpilot/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""MixPilot: an agentic harness for marketing measurement.
|
|
2
|
+
|
|
3
|
+
A small, real agent runtime whose action space is the marketing-science toolkit --
|
|
4
|
+
adstock, saturation, attribution, budget allocation, and data-quality auditing --
|
|
5
|
+
with structured tool results, an approval gate, and an eval suite that scores the
|
|
6
|
+
agent's method-selection judgment.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .agent.loop import Agent, SYSTEM_PROMPT
|
|
10
|
+
from .tools.registry import Registry, BUILTIN_TOOLS
|
|
11
|
+
from .tools.base import Tool, ToolResult
|
|
12
|
+
from .safety.approval import ApprovalGate, Policy
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0"
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Agent", "SYSTEM_PROMPT", "Registry", "BUILTIN_TOOLS",
|
|
17
|
+
"Tool", "ToolResult", "ApprovalGate", "Policy",
|
|
18
|
+
]
|
|
mixpilot-0.1.0/src/mixpilot/agent/client.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Thin Anthropic adapter.
|
|
2
|
+
|
|
3
|
+
Normalises the Messages API into the dict shape the loop expects:
|
|
4
|
+
create(...) -> {"content": [ {type, ...}, ... ]}
|
|
5
|
+
|
|
6
|
+
Kept behind an interface so tests/evals can inject a scripted client with the same
|
|
7
|
+
.create() signature and never touch the network.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AnthropicClient:
    """Adapter exposing Anthropic's Messages API through a single ``create()``.

    ``create()`` returns ``{"content": [...]}`` where every item is a plain
    dict describing either a ``text`` or a ``tool_use`` block.
    """

    def __init__(self, api_key: str | None = None, max_tokens: int = 1500):
        """Build the underlying SDK client.

        Args:
            api_key: Explicit API key. When falsy, the ``ANTHROPIC_API_KEY``
                environment variable is looked up instead.
            max_tokens: ``max_tokens`` value sent with every request.

        Raises:
            ImportError: If the ``anthropic`` package cannot be imported.
        """
        # Imported here rather than at module level, so importing this module
        # does not itself require the SDK.
        try:
            import anthropic
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "Install the anthropic SDK: pip install anthropic"
            ) from exc
        resolved_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        self._client = anthropic.Anthropic(api_key=resolved_key)
        self.max_tokens = max_tokens

    def create(self, model, system, tools, messages):
        """Send one Messages request and return its blocks as plain dicts.

        Args:
            model: Model identifier forwarded to the SDK.
            system: System prompt forwarded to the SDK.
            tools: Tool schemas forwarded to the SDK.
            messages: Conversation history forwarded to the SDK.

        Returns:
            ``{"content": [...]}`` holding one dict per ``text`` or
            ``tool_use`` block of the response, in response order. Blocks of
            any other type are left out.
        """
        response = self._client.messages.create(
            model=model,
            max_tokens=self.max_tokens,
            system=system,
            tools=tools,
            messages=messages,
        )
        blocks = []
        for item in response.content:
            kind = item.type
            if kind == "text":
                blocks.append({"type": "text", "text": item.text})
            elif kind == "tool_use":
                blocks.append({
                    "type": "tool_use",
                    "id": item.id,
                    "name": item.name,
                    "input": item.input,
                })
        return {"content": blocks}
|
|
mixpilot-0.1.0/src/mixpilot/agent/groq_client.py
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Groq adapter for the MixPilot agent loop.
|
|
2
|
+
|
|
3
|
+
Groq's API follows OpenAI conventions, which differ from Anthropic's tool-use shape.
|
|
4
|
+
Rather than teach the loop two dialects, we normalise at the edge: this adapter
|
|
5
|
+
exposes the same .create(model, system, tools, messages) interface the loop uses,
|
|
6
|
+
translating Anthropic-style tools/messages INTO OpenAI/Groq format on the way in,
|
|
7
|
+
and translating Groq's response BACK into Anthropic-style content blocks on the way
|
|
8
|
+
out. The loop never knows which provider it is talking to.
|
|
9
|
+
|
|
10
|
+
The three translation functions are module-level and pure, so they are unit-tested
|
|
11
|
+
without any network call (see tests/test_groq_translation.py).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def to_openai_tools(anthropic_tools: list[dict]) -> list[dict]:
    """Convert Anthropic tool specs into OpenAI-style function tools.

    Args:
        anthropic_tools: Dicts with a required ``name`` and optional
            ``description`` / ``input_schema`` keys.

    Returns:
        One ``{"type": "function", "function": {...}}`` dict per input spec,
        in the same order. A missing description becomes ``""`` and a missing
        schema becomes an empty object schema.
    """
    converted: list[dict] = []
    for spec in anthropic_tools:
        function = {
            "name": spec["name"],
            "description": spec.get("description", ""),
            "parameters": spec.get(
                "input_schema", {"type": "object", "properties": {}}
            ),
        }
        converted.append({"type": "function", "function": function})
    return converted
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def to_openai_messages(system: str, messages: list[dict]) -> list[dict]:
    """Translate Anthropic-shaped history into OpenAI chat messages.

    Args:
        system: System prompt; emitted as the first message.
        messages: History entries with ``role`` and ``content``, where
            ``content`` is either a string or a list of content-block dicts.

    Returns:
        A flat list of chat messages. String content is passed through
        unchanged. An assistant block list collapses into one message whose
        text blocks are joined with a space (``None`` when there is no text)
        and whose ``tool_use`` blocks become ``tool_calls``. For any other
        role, each ``tool_result`` block becomes a ``tool`` message and every
        other block becomes a ``user`` message carrying its ``text``.
    """
    converted: list[dict] = [{"role": "system", "content": system}]

    for entry in messages:
        role = entry["role"]
        body = entry["content"]

        if isinstance(body, str):
            converted.append({"role": role, "content": body})
        elif role == "assistant":
            texts = [b["text"] for b in body if b.get("type") == "text"]
            calls = [
                {
                    "id": b["id"],
                    "type": "function",
                    "function": {
                        "name": b["name"],
                        # The OpenAI format carries arguments as a JSON string.
                        "arguments": json.dumps(b.get("input", {})),
                    },
                }
                for b in body
                if b.get("type") == "tool_use"
            ]
            assistant_msg = {
                "role": "assistant",
                "content": " ".join(texts) or None,
            }
            if calls:
                assistant_msg["tool_calls"] = calls
            converted.append(assistant_msg)
        else:
            # Non-assistant turn carrying tool_result (and possibly other) blocks.
            for b in body:
                if b.get("type") == "tool_result":
                    converted.append({
                        "role": "tool",
                        "tool_call_id": b["tool_use_id"],
                        "content": b["content"],
                    })
                else:
                    converted.append(
                        {"role": "user", "content": b.get("text", "")}
                    )

    return converted
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def from_groq_response(message) -> dict:
    """Convert a Groq/OpenAI response message into Anthropic-style blocks.

    Args:
        message: Response message object. Its ``content`` and ``tool_calls``
            attributes are read if present; each tool call must expose
            ``id``, ``function.name`` and ``function.arguments``.

    Returns:
        ``{"content": [...]}`` with an optional leading ``text`` block
        followed by one ``tool_use`` block per tool call. When the message
        has neither text nor tool calls, a single empty ``text`` block is
        returned so the list is never empty.
    """
    blocks: list[dict] = []

    text = getattr(message, "content", None)
    if text:
        blocks.append({"type": "text", "text": text})

    for call in getattr(message, "tool_calls", None) or []:
        try:
            args = json.loads(call.function.arguments or "{}")
        except json.JSONDecodeError:
            # Best effort: malformed argument JSON degrades to "no arguments".
            args = {}
        # Valid JSON is not necessarily an object ("null", "[]", "3", ...).
        # The agent loop expands `input` with ** when calling the tool, so
        # anything but a dict would raise there; treat it like malformed JSON.
        if not isinstance(args, dict):
            args = {}
        blocks.append({
            "type": "tool_use",
            "id": call.id,
            "name": call.function.name,
            "input": args,
        })

    if not blocks:
        blocks.append({"type": "text", "text": ""})
    return {"content": blocks}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class GroqClient:
    """Groq-backed client exposing the same ``create()`` call as AnthropicClient."""

    def __init__(self, api_key: str | None = None,
                 model: str = "llama-3.3-70b-versatile", max_tokens: int = 1500):
        """Build the underlying Groq SDK client.

        Args:
            api_key: Explicit API key. When falsy, the ``GROQ_API_KEY``
                environment variable is looked up instead.
            model: Model used by ``create()`` when it is given a falsy model.
            max_tokens: ``max_tokens`` value sent with every request.

        Raises:
            ImportError: If the ``groq`` package cannot be imported.
        """
        try:
            from groq import Groq
        except ImportError as exc:  # pragma: no cover
            raise ImportError("Install the Groq SDK: pip install groq") from exc
        resolved_key = api_key or os.environ.get("GROQ_API_KEY")
        self._client = Groq(api_key=resolved_key)
        self.default_model = model
        self.max_tokens = max_tokens

    def create(self, model, system, tools, messages):
        """Run one chat completion and return Anthropic-style content blocks.

        The Anthropic-shaped ``tools`` and ``messages`` are translated to the
        OpenAI/Groq format on the way in, and the first choice of the
        response is translated back on the way out.
        """
        # NOTE(review): Agent in this package passes its own `model` string on
        # every call, so `default_model` only applies when a caller passes a
        # falsy model -- confirm callers configure a Groq model name.
        request = {
            "model": model or self.default_model,
            "max_tokens": self.max_tokens,
            "messages": to_openai_messages(system, messages),
            "tools": to_openai_tools(tools),
            "tool_choice": "auto",
        }
        completion = self._client.chat.completions.create(**request)
        first_choice = completion.choices[0]
        return from_groq_response(first_choice.message)
|
|
mixpilot-0.1.0/src/mixpilot/agent/loop.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""The agent loop: a control system, not a while-tool-calls toy.
|
|
2
|
+
|
|
3
|
+
It owns the marketing-analyst system prompt, exposes the registry's tools to the
|
|
4
|
+
model, executes tool calls through the registry pipeline, feeds structured results
|
|
5
|
+
back as observations, and enforces stop conditions: a max-turn budget and a loop
|
|
6
|
+
detector that halts when the model repeats the same tool call with no progress.
|
|
7
|
+
|
|
8
|
+
The model client (Anthropic, Groq, or a scripted stub) is injected, so the loop is testable without a live model
|
|
9
|
+
(see tests/ and evals/ which drive it with a scripted client).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
|
|
17
|
+
from ..tools.registry import Registry
|
|
18
|
+
|
|
19
|
+
SYSTEM_PROMPT = """You are a senior marketing-measurement analyst running inside a \
|
|
20
|
+
tool-using agent. Your job is to turn raw marketing data and messy business \
|
|
21
|
+
questions into defensible measurement conclusions.
|
|
22
|
+
|
|
23
|
+
Operating principles:
|
|
24
|
+
- ALWAYS audit data quality before modelling. If high-severity issues exist, say so \
|
|
25
|
+
and stop rather than producing a confident-but-wrong answer.
|
|
26
|
+
- Choose the analysis that matches the DATA SHAPE, not the fanciest one. Do not run \
|
|
27
|
+
Markov attribution on single-touch paths. Do not trust a saturation curve with low R2.
|
|
28
|
+
- Adstock before saturation; saturation before budget allocation.
|
|
29
|
+
- State assumptions and the limits of each conclusion. Extrapolating beyond observed \
|
|
30
|
+
spend ranges is unreliable; say so.
|
|
31
|
+
- When you have reached a defensible conclusion, stop calling tools and give a final \
|
|
32
|
+
answer with the business implication, not just the numbers.
|
|
33
|
+
|
|
34
|
+
The judgment layer -- knowing which question to ask and which method the data can \
|
|
35
|
+
actually support -- is the point. Tools are how you act on that judgment."""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class Turn:
    """One transcript entry recorded by the agent loop."""

    role: str  # Agent.run records "assistant" and "user" turns
    content: object  # content blocks: model output, or the tool_result list sent back
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class RunResult:
    """Outcome of a single ``Agent.run`` call."""

    final_text: str  # final answer text, or the max-turns notice
    turns: list[Turn] = field(default_factory=list)  # transcript in order of occurrence
    tool_calls: list[dict] = field(default_factory=list)  # {"name", "input", "success"} per executed call
    stop_reason: str = ""  # Agent.run sets "final_answer", "loop_detected" or "max_turns"
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class Agent:
    """Tool-using agent loop with a turn budget and repeated-call detection."""

    def __init__(self, client, registry: Registry | None = None,
                 model: str = "claude-sonnet-4-6", max_turns: int = 12):
        """Store the loop's collaborators and limits.

        Args:
            client: Object exposing ``create(model, system, tools, messages)``
                that returns ``{"content": [...blocks]}``.
            registry: Tool registry; a fresh ``Registry()`` is used when the
                argument is falsy.
            model: Model identifier passed to the client on every turn.
            max_turns: Maximum number of model calls per ``run``.
        """
        self.client = client
        self.registry = registry or Registry()
        self.model = model
        self.max_turns = max_turns

    def run(self, goal: str) -> RunResult:
        """Drive the model/tool loop for ``goal`` until a stop condition hits.

        Stop conditions, reported in ``RunResult.stop_reason``:
            * ``"final_answer"`` -- the model replied without any tool call.
            * ``"loop_detected"`` -- the last three tool-call signatures
              (name plus sorted JSON input) are identical; the repeated call
              is not executed.
            * ``"max_turns"`` -- the turn budget ran out.

        Args:
            goal: The user's request; sent as the first user message.

        Returns:
            The transcript, the executed tool calls, the final text and the
            stop reason.
        """
        result = RunResult(final_text="", stop_reason="")
        history = [{"role": "user", "content": goal}]
        signatures: list[str] = []

        for _turn in range(self.max_turns):
            reply = self.client.create(
                model=self.model,
                system=SYSTEM_PROMPT,
                tools=self.registry.schemas(),
                messages=history,
            )
            blocks = reply["content"]
            history.append({"role": "assistant", "content": blocks})
            result.turns.append(Turn("assistant", blocks))

            # Split the reply into requested tool calls and free text.
            requested = []
            text_parts = []
            for block in blocks:
                kind = block.get("type")
                if kind == "tool_use":
                    requested.append(block)
                elif kind == "text":
                    text_parts.append(block["text"])
            text = " ".join(text_parts)

            if not requested:
                result.final_text = text.strip()
                result.stop_reason = "final_answer"
                return result

            observations = []
            for call in requested:
                tool_name = call["name"]
                tool_args = call.get("input", {})

                # Loop detection: identical signature three times running.
                signature = tool_name + json.dumps(tool_args, sort_keys=True)
                signatures.append(signature)
                if signatures[-3:] == [signature] * 3:
                    result.final_text = text.strip()
                    result.stop_reason = "loop_detected"
                    return result

                outcome = self.registry.call(tool_name, **tool_args)
                result.tool_calls.append({
                    "name": tool_name,
                    "input": tool_args,
                    "success": outcome.success,
                })
                # The structured result itself is what the model observes next.
                payload = outcome.model_dump(exclude_none=True)
                observations.append({
                    "type": "tool_result",
                    "tool_use_id": call["id"],
                    "content": json.dumps(payload),
                    "is_error": not outcome.success,
                })

            history.append({"role": "user", "content": observations})
            result.turns.append(Turn("user", observations))

        result.stop_reason = "max_turns"
        result.final_text = "Reached max turns without a final answer."
        return result
|
|
mixpilot-0.1.0/src/mixpilot/safety/approval.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Approval policy: safety enforced in code, not in prose.
|
|
2
|
+
|
|
3
|
+
Mutating actions (writing a report, exporting a file) must clear a policy gate that
|
|
4
|
+
lives outside the model. The model cannot talk its way past it.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from enum import Enum
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Policy(str, Enum):
    """Approval policies for mutating actions.

    Members are also ``str`` instances, so each compares equal to its plain
    string value (``Policy.NEVER == "never"``).
    """

    ON_REQUEST = "on_request"  # ask a human callback before any mutating action
    AUTO = "auto"              # allow mutating actions automatically
    NEVER = "never"            # block all mutating actions


@dataclass
class Decision:
    """Result of an approval check."""

    approved: bool  # True when the action may proceed
    reason: str = ""  # short explanation of how the decision was reached


class ApprovalGate:
    """Policy gate that decides whether a mutating action may run."""

    def __init__(self, policy: Policy = Policy.ON_REQUEST, ask=None):
        """Configure the gate.

        Args:
            policy: A ``Policy`` member or its plain string value.
            ask: Optional ``callable(name, args) -> bool`` consulted under
                the on-request policy.
        """
        self.policy = policy
        self.ask = ask  # optional callable(name, args) -> bool

    def check(self, name: str, args: dict) -> Decision:
        """Decide whether the action ``name`` with ``args`` is approved.

        Args:
            name: Name of the action being requested.
            args: Arguments of the action; passed to the callback unchanged.

        Returns:
            An approved ``Decision`` under the auto policy and a denied one
            under the never policy. Otherwise the callback's truthiness
            decides; a missing callback or a callback that raises yields a
            denial, so failures never approve an action.
        """
        # Compare by equality, not identity: Policy is a str enum, so a policy
        # supplied as the plain string "never"/"auto" (e.g. read from config)
        # must select the same branch as the enum member. With `is`, such a
        # string fell through to the on-request branch, where a callback could
        # approve an action under a "never" policy.
        if self.policy == Policy.AUTO:
            return Decision(True, "auto policy")
        if self.policy == Policy.NEVER:
            return Decision(False, "never policy blocks mutating actions")

        # ON_REQUEST (and any unrecognised policy value).
        if self.ask is None:
            return Decision(False, "no approval callback configured")
        try:
            return Decision(bool(self.ask(name, args)), "human callback")
        except Exception as exc:  # noqa: BLE001 -- fail closed on any callback error
            return Decision(False, f"approval callback error: {exc}")
|