mixpilot 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mixpilot/__init__.py +18 -0
- mixpilot/agent/__init__.py +3 -0
- mixpilot/agent/client.py +43 -0
- mixpilot/agent/groq_client.py +118 -0
- mixpilot/agent/loop.py +110 -0
- mixpilot/safety/__init__.py +2 -0
- mixpilot/safety/approval.py +41 -0
- mixpilot/tools/__init__.py +3 -0
- mixpilot/tools/adstock.py +78 -0
- mixpilot/tools/attribution.py +157 -0
- mixpilot/tools/base.py +107 -0
- mixpilot/tools/budget.py +99 -0
- mixpilot/tools/dataquality.py +102 -0
- mixpilot/tools/registry.py +57 -0
- mixpilot/tools/saturation.py +96 -0
- mixpilot-0.1.0.dist-info/METADATA +121 -0
- mixpilot-0.1.0.dist-info/RECORD +19 -0
- mixpilot-0.1.0.dist-info/WHEEL +5 -0
- mixpilot-0.1.0.dist-info/top_level.txt +1 -0
mixpilot/__init__.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""MixPilot: an agentic harness for marketing measurement.
|
|
2
|
+
|
|
3
|
+
A small, real agent runtime whose action space is the marketing-science toolkit --
|
|
4
|
+
adstock, saturation, attribution, budget allocation, and data-quality auditing --
|
|
5
|
+
with structured tool results, an approval gate, and an eval suite that scores the
|
|
6
|
+
agent's method-selection judgment.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .agent.loop import Agent, SYSTEM_PROMPT
|
|
10
|
+
from .tools.registry import Registry, BUILTIN_TOOLS
|
|
11
|
+
from .tools.base import Tool, ToolResult
|
|
12
|
+
from .safety.approval import ApprovalGate, Policy
|
|
13
|
+
|
|
14
|
+
__version__ = "0.1.0"
|
|
15
|
+
__all__ = [
|
|
16
|
+
"Agent", "SYSTEM_PROMPT", "Registry", "BUILTIN_TOOLS",
|
|
17
|
+
"Tool", "ToolResult", "ApprovalGate", "Policy",
|
|
18
|
+
]
|
mixpilot/agent/client.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Thin Anthropic adapter.
|
|
2
|
+
|
|
3
|
+
Normalises the Messages API into the dict shape the loop expects:
|
|
4
|
+
create(...) -> {"content": [ {type, ...}, ... ]}
|
|
5
|
+
|
|
6
|
+
Kept behind an interface so tests/evals can inject a scripted client with the same
|
|
7
|
+
.create() signature and never touch the network.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class AnthropicClient:
    """Adapter exposing the Anthropic Messages API through ``create()``.

    Responses are flattened to ``{"content": [...]}`` where every entry is a
    plain dict describing either a ``text`` or a ``tool_use`` block.
    """

    def __init__(self, api_key: str | None = None, max_tokens: int = 1500):
        """Build the underlying SDK client.

        Args:
            api_key: Explicit key; when falsy, ``ANTHROPIC_API_KEY`` is read
                from the environment instead.
            max_tokens: Value sent as ``max_tokens`` on every request.

        Raises:
            ImportError: If the ``anthropic`` package is not installed.
        """
        # Imported lazily so the SDK is only needed when this adapter is used.
        try:
            import anthropic
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "Install the anthropic SDK: pip install anthropic"
            ) from exc
        resolved_key = api_key or os.environ.get("ANTHROPIC_API_KEY")
        self._client = anthropic.Anthropic(api_key=resolved_key)
        self.max_tokens = max_tokens

    def create(self, model, system, tools, messages):
        """Send one Messages request and return ``{"content": [blocks]}``.

        Only ``text`` and ``tool_use`` blocks are carried over; any other
        block type in the response is dropped.
        """
        reply = self._client.messages.create(
            model=model,
            max_tokens=self.max_tokens,
            system=system,
            tools=tools,
            messages=messages,
        )
        blocks = []
        for part in reply.content:
            if part.type == "text":
                blocks.append({"type": "text", "text": part.text})
                continue
            if part.type == "tool_use":
                blocks.append({
                    "type": "tool_use",
                    "id": part.id,
                    "name": part.name,
                    "input": part.input,
                })
        return {"content": blocks}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
"""Groq adapter for the MixPilot agent loop.
|
|
2
|
+
|
|
3
|
+
Groq's API follows OpenAI conventions, which differ from Anthropic's tool-use shape.
|
|
4
|
+
Rather than teach the loop two dialects, we normalise at the edge: this adapter
|
|
5
|
+
exposes the same .create(model, system, tools, messages) interface the loop uses,
|
|
6
|
+
translating Anthropic-style tools/messages INTO OpenAI/Groq format on the way in,
|
|
7
|
+
and translating Groq's response BACK into Anthropic-style content blocks on the way
|
|
8
|
+
out. The loop never knows which provider it is talking to.
|
|
9
|
+
|
|
10
|
+
The three translation functions are module-level and pure, so they are unit-tested
|
|
11
|
+
without any network call (see tests/test_groq_translation.py).
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import json
|
|
17
|
+
import os
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def to_openai_tools(anthropic_tools: list[dict]) -> list[dict]:
    """Convert Anthropic tool specs into OpenAI-style function tools.

    Each input dict must carry ``name``; ``description`` defaults to ``""`` and
    ``input_schema`` defaults to an empty object schema.
    """
    converted: list[dict] = []
    for spec in anthropic_tools:
        function_def = {
            "name": spec["name"],
            "description": spec.get("description", ""),
            "parameters": spec.get(
                "input_schema", {"type": "object", "properties": {}}
            ),
        }
        converted.append({"type": "function", "function": function_def})
    return converted
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def to_openai_messages(system: str, messages: list[dict]) -> list[dict]:
    """Convert Anthropic-shaped history into OpenAI chat messages.

    The result always starts with a ``system`` message. String content is
    passed through unchanged. Assistant block lists become one assistant
    message (text joined with spaces, plus ``tool_calls``). Any other block
    list yields one message per block: ``tool`` for ``tool_result`` blocks,
    ``user`` for everything else.
    """
    converted: list[dict] = [{"role": "system", "content": system}]

    for message in messages:
        role = message["role"]
        body = message["content"]

        # Plain-text turns need no translation.
        if isinstance(body, str):
            converted.append({"role": role, "content": body})
            continue

        if role != "assistant":
            # User turn carrying tool_result blocks (and possibly text).
            for blk in body:
                if blk.get("type") == "tool_result":
                    converted.append({
                        "role": "tool",
                        "tool_call_id": blk["tool_use_id"],
                        "content": blk["content"],
                    })
                else:
                    converted.append(
                        {"role": "user", "content": blk.get("text", "")}
                    )
            continue

        texts = []
        calls = []
        for blk in body:
            kind = blk.get("type")
            if kind == "text":
                texts.append(blk["text"])
            elif kind == "tool_use":
                # OpenAI expects the arguments as a JSON-encoded string.
                calls.append({
                    "id": blk["id"],
                    "type": "function",
                    "function": {
                        "name": blk["name"],
                        "arguments": json.dumps(blk.get("input", {})),
                    },
                })

        # Content is None (not "") when the assistant produced no text.
        entry: dict[str, Any] = {
            "role": "assistant",
            "content": " ".join(texts) or None,
        }
        if calls:
            entry["tool_calls"] = calls
        converted.append(entry)

    return converted
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def from_groq_response(message) -> dict:
    """Convert a Groq/OpenAI response message into ``{"content": [...blocks]}``.

    Args:
        message: Object with optional ``content`` (text) and ``tool_calls``
            attributes; each tool call exposes ``id``, ``function.name`` and
            ``function.arguments`` (a JSON string).

    Returns:
        A dict whose ``content`` list holds ``text`` and ``tool_use`` blocks.
        It is never empty: a lone empty text block is used as a fallback.
    """
    content: list[dict] = []
    text = getattr(message, "content", None)
    if text:
        content.append({"type": "text", "text": text})
    for tc in getattr(message, "tool_calls", None) or []:
        try:
            args = json.loads(tc.function.arguments or "{}")
        except json.JSONDecodeError:
            args = {}
        # Valid JSON is not necessarily an object ("null", "[]", "3"). The
        # loop splats `input` as keyword arguments, so it must be a dict.
        if not isinstance(args, dict):
            args = {}
        content.append({"type": "tool_use", "id": tc.id,
                        "name": tc.function.name, "input": args})
    if not content:
        content.append({"type": "text", "text": ""})
    return {"content": content}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class GroqClient:
    """Drop-in replacement for AnthropicClient, backed by Groq."""

    def __init__(self, api_key: str | None = None,
                 model: str = "llama-3.3-70b-versatile", max_tokens: int = 1500):
        """Build the Groq SDK client.

        Args:
            api_key: Explicit key; when falsy, ``GROQ_API_KEY`` is read from
                the environment instead.
            model: Model used when ``create()`` is called with a falsy model.
            max_tokens: Value sent as ``max_tokens`` on every request.

        Raises:
            ImportError: If the ``groq`` package is not installed.
        """
        try:
            from groq import Groq
        except ImportError as exc:  # pragma: no cover
            raise ImportError("Install the Groq SDK: pip install groq") from exc
        self._client = Groq(api_key=api_key or os.environ.get("GROQ_API_KEY"))
        self.default_model = model
        self.max_tokens = max_tokens

    def create(self, model, system, tools, messages):
        """Run one chat completion and return Anthropic-style content blocks.

        ``tools`` and ``messages`` arrive in Anthropic shape and are translated
        on the way in; the first choice of the response is translated back.
        """
        request: dict[str, Any] = {
            "model": model or self.default_model,
            "max_tokens": self.max_tokens,
            "messages": to_openai_messages(system, messages),
        }
        # OpenAI-style endpoints reject an empty `tools` array (and a
        # `tool_choice` without tools), so send both only when tools exist.
        openai_tools = to_openai_tools(tools or [])
        if openai_tools:
            request["tools"] = openai_tools
            request["tool_choice"] = "auto"
        resp = self._client.chat.completions.create(**request)
        return from_groq_response(resp.choices[0].message)
|
mixpilot/agent/loop.py
ADDED
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
"""The agent loop: a control system, not a while-tool-calls toy.
|
|
2
|
+
|
|
3
|
+
It owns the marketing-analyst system prompt, exposes the registry's tools to the
|
|
4
|
+
model, executes tool calls through the registry pipeline, feeds structured results
|
|
5
|
+
back as observations, and enforces stop conditions: a max-turn budget and a loop
|
|
6
|
+
detector that halts when the model repeats the same tool call with no progress.
|
|
7
|
+
|
|
8
|
+
The Anthropic client is injected, so the loop is testable without a live model
|
|
9
|
+
(see tests/ and evals/ which drive it with a scripted client).
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
|
|
17
|
+
from ..tools.registry import Registry
|
|
18
|
+
|
|
19
|
+
# System prompt sent with every model call made by Agent.run. Built from
# adjacent literals; each "\n" marks a real line break in the prompt.
SYSTEM_PROMPT = (
    "You are a senior marketing-measurement analyst running inside a "
    "tool-using agent. Your job is to turn raw marketing data and messy business "
    "questions into defensible measurement conclusions.\n"
    "\n"
    "Operating principles:\n"
    "- ALWAYS audit data quality before modelling. If high-severity issues exist, say so "
    "and stop rather than producing a confident-but-wrong answer.\n"
    "- Choose the analysis that matches the DATA SHAPE, not the fanciest one. Do not run "
    "Markov attribution on single-touch paths. Do not trust a saturation curve with low R2.\n"
    "- Adstock before saturation; saturation before budget allocation.\n"
    "- State assumptions and the limits of each conclusion. Extrapolating beyond observed "
    "spend ranges is unreliable; say so.\n"
    "- When you have reached a defensible conclusion, stop calling tools and give a final "
    "answer with the business implication, not just the numbers.\n"
    "\n"
    "The judgment layer -- knowing which question to ask and which method the data can "
    "actually support -- is the point. Tools are how you act on that judgment."
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class Turn:
    """One transcript entry recorded by the agent loop."""

    # "assistant" for model replies, "user" for tool-result turns.
    role: str
    # Content blocks exactly as exchanged with the client.
    content: object
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class RunResult:
    """Outcome of a single ``Agent.run`` call."""

    # Final text handed back to the caller (may be empty).
    final_text: str
    # Full transcript, in order.
    turns: list[Turn] = field(default_factory=list)
    # One {"name", "input", "success"} record per executed tool call.
    tool_calls: list[dict] = field(default_factory=list)
    # "final_answer", "loop_detected" or "max_turns" once the run ends.
    stop_reason: str = ""
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class Agent:
    """Tool-using loop with a turn budget and a repeated-call detector."""

    def __init__(self, client, registry: Registry | None = None,
                 model: str = "claude-sonnet-4-6", max_turns: int = 12):
        """Store the collaborators.

        Args:
            client: Object exposing ``create(model, system, tools, messages)``
                and returning ``{"content": [blocks]}``.
            registry: Tool registry; a default ``Registry()`` is built when
                none (or a falsy one) is given.
            model: Model name forwarded to the client.
            max_turns: Maximum number of model calls per run.
        """
        self.client = client
        self.registry = registry or Registry()
        self.model = model
        self.max_turns = max_turns

    def run(self, goal: str) -> RunResult:
        """Drive the model toward ``goal`` until it answers or a stop rule fires.

        Stop reasons: ``final_answer`` (reply without tool calls),
        ``loop_detected`` (same tool call signature three times in a row) and
        ``max_turns`` (turn budget exhausted).
        """
        history = [{"role": "user", "content": goal}]
        outcome = RunResult(final_text="", stop_reason="")
        seen: list[str] = []

        for _turn in range(self.max_turns):
            reply = self.client.create(
                model=self.model,
                system=SYSTEM_PROMPT,
                tools=self.registry.schemas(),
                messages=history,
            )
            blocks = reply["content"]
            history.append({"role": "assistant", "content": blocks})
            outcome.turns.append(Turn("assistant", blocks))

            requested = [blk for blk in blocks if blk.get("type") == "tool_use"]
            spoken = " ".join(
                blk["text"] for blk in blocks if blk.get("type") == "text"
            )

            # A reply without tool calls is the final answer.
            if not requested:
                outcome.final_text = spoken.strip()
                outcome.stop_reason = "final_answer"
                return outcome

            observations = []
            for call in requested:
                name = call["name"]
                args = call.get("input", {})

                # Loop detection: identical signature 3x in a row -> stop
                # before executing the third repeat.
                fingerprint = name + json.dumps(args, sort_keys=True)
                seen.append(fingerprint)
                if seen[-3:] == [fingerprint] * 3:
                    outcome.final_text = spoken.strip()
                    outcome.stop_reason = "loop_detected"
                    return outcome

                res = self.registry.call(name, **args)
                outcome.tool_calls.append(
                    {"name": name, "input": args, "success": res.success}
                )
                # The observation handed back to the model is the structured
                # result, serialised as JSON.
                observations.append({
                    "type": "tool_result",
                    "tool_use_id": call["id"],
                    "content": json.dumps(res.model_dump(exclude_none=True)),
                    "is_error": not res.success,
                })

            history.append({"role": "user", "content": observations})
            outcome.turns.append(Turn("user", observations))

        outcome.stop_reason = "max_turns"
        outcome.final_text = "Reached max turns without a final answer."
        return outcome
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Approval policy: safety enforced in code, not in prose.
|
|
2
|
+
|
|
3
|
+
Mutating actions (writing a report, exporting a file) must clear a policy gate that
|
|
4
|
+
lives outside the model. The model cannot talk its way past it.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from enum import Enum
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Policy(str, Enum):
    """How the approval gate treats mutating actions."""

    # Ask a human callback before any mutating action.
    ON_REQUEST = "on_request"
    # Allow mutating actions automatically.
    AUTO = "auto"
    # Block all mutating actions.
    NEVER = "never"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class Decision:
    """Verdict returned by ``ApprovalGate.check``."""

    # True when the action may proceed.
    approved: bool
    # Short explanation of how the verdict was reached.
    reason: str = ""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class ApprovalGate:
    """Policy gate consulted before a mutating action runs."""

    def __init__(self, policy: Policy = Policy.ON_REQUEST, ask=None):
        """Store the policy and the optional ``ask(name, args) -> bool`` callback."""
        self.policy = policy
        self.ask = ask

    def check(self, name: str, args: dict) -> Decision:
        """Decide whether the action ``name`` with ``args`` may proceed.

        AUTO approves and NEVER denies without consulting the callback. Any
        other policy defers to ``ask``; a missing callback or one that raises
        results in a denial.
        """
        policy = self.policy
        if policy is Policy.AUTO:
            return Decision(True, "auto policy")
        if policy is Policy.NEVER:
            return Decision(False, "never policy blocks mutating actions")

        # ON_REQUEST: fail closed when nobody can be asked.
        callback = self.ask
        if callback is None:
            return Decision(False, "no approval callback configured")
        try:
            verdict = bool(callback(name, args))
        except Exception as exc:  # noqa: BLE001
            return Decision(False, f"approval callback error: {exc}")
        return Decision(verdict, "human callback")
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Adstock: the carryover transform at the heart of MMM.
|
|
2
|
+
|
|
3
|
+
Geometric (exponential) adstock with an optional max-lag window:
|
|
4
|
+
|
|
5
|
+
a[t] = sum_{l=0..L} w[l] * x[t-l], w[l] = decay**l (normalised to sum 1)
|
|
6
|
+
|
|
7
|
+
Carryover models the fact that media exposure today keeps driving response for
|
|
8
|
+
several periods. `decay` in [0, 1) is the fraction of effect retained each period.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
|
|
15
|
+
from .base import Tool, ToolResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def adstock_transform(
    spend: list[float],
    decay: float = 0.5,
    max_lag: int = 8,
    normalise: bool = True,
) -> ToolResult:
    """Apply geometric adstock to one channel's spend series.

    Args:
        spend: Non-empty flat sequence of per-period values.
        decay: Retention rate in ``[0, 1)``; lag ``l`` is weighted ``decay**l``.
        max_lag: Non-negative whole number of lagged periods in the kernel.
        normalise: When true, kernel weights are rescaled to sum to 1.

    Returns:
        A successful ``ToolResult`` whose artifacts hold the adstocked series,
        the kernel weights and the implied half-life, or a failed result
        describing the invalid argument.
    """
    x = np.asarray(spend, dtype=float)
    if x.ndim != 1 or x.size == 0:
        return ToolResult.fail(
            "spend must be a non-empty 1-D list of period spend values.",
            recovery_hint="Pass spend as a flat list, e.g. [100, 120, 0, 90].",
        )
    if not 0.0 <= decay < 1.0:
        return ToolResult.fail(
            f"decay={decay} is out of range; must be in [0, 1).",
            recovery_hint="Typical TV decay ~0.6-0.8, digital ~0.1-0.4. Use a value < 1.",
        )

    # A negative max_lag yields an empty kernel (np.convolve then raises) and
    # a fractional one silently changes the window, so reject both up front.
    try:
        lag = int(max_lag)
    except (TypeError, ValueError, OverflowError):
        lag = -1
    if lag < 0 or lag != max_lag:
        return ToolResult.fail(
            f"max_lag={max_lag} is invalid; must be a non-negative integer.",
            recovery_hint="Pass a whole number of periods >= 0, e.g. max_lag=8.",
        )

    weights = decay ** np.arange(lag + 1)
    if normalise:
        weights = weights / weights.sum()

    # Full convolution truncated to the input length keeps the output causal.
    out = np.convolve(x, weights)[: x.size]
    # Periods until the weight halves; defined as 0 when nothing carries over.
    half_life = np.log(0.5) / np.log(decay) if decay > 0 else 0.0

    return ToolResult.ok(
        output=f"Adstocked series (first 5): {np.round(out[:5], 2).tolist()}",
        summary=(
            f"Applied geometric adstock (decay={decay}, max_lag={max_lag}). "
            f"Effective half-life ~{half_life:.1f} periods."
        ),
        artifacts={
            "adstocked": np.round(out, 4).tolist(),
            "weights": np.round(weights, 4).tolist(),
            "half_life_periods": round(float(half_life), 2),
        },
        next_actions=[
            "Fit a saturation curve on the adstocked series before regressing on sales.",
        ],
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
# Registry entry exposing adstock_transform to the agent. Parameters flagged
# "optional" are left out of the schema's required list.
TOOL = Tool(
    name="adstock_transform",
    description=(
        "Apply geometric adstock (media carryover) to a spend series. Use this "
        "BEFORE saturation and before regressing media on sales. Returns the "
        "adstocked series and the implied half-life."
    ),
    parameters={
        "spend": {
            "type": "array",
            "items": {"type": "number"},
            "description": "Per-period spend for one channel.",
        },
        "decay": {
            "type": "number",
            "description": "Carryover rate in [0,1). Higher = longer carryover.",
            "optional": True,
        },
        "max_lag": {
            "type": "integer",
            "description": "Max carryover window in periods.",
            "optional": True,
        },
        "normalise": {
            "type": "boolean",
            "description": "Normalise weights to sum to 1.",
            "optional": True,
        },
    },
    fn=adstock_transform,
)
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Multi-touch attribution.
|
|
2
|
+
|
|
3
|
+
Three models, in increasing order of sophistication and data demands:
|
|
4
|
+
|
|
5
|
+
- last_touch: 100% credit to the final channel before conversion. Cheap, biased,
|
|
6
|
+
but all you can do with last-click data.
|
|
7
|
+
- linear: equal credit across touchpoints.
|
|
8
|
+
- markov: data-driven credit via the *removal effect* of each channel in an
|
|
9
|
+
absorbing Markov chain. Credit(c) is proportional to the drop in
|
|
10
|
+
total conversion probability when channel c is removed from the graph.
|
|
11
|
+
|
|
12
|
+
The Markov model is the defensible one when you have full multi-touch path data; it
|
|
13
|
+
captures assist value that last-touch throws away. It needs real path variety to be
|
|
14
|
+
stable -- the eval suite checks exactly that.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
|
|
21
|
+
from .base import Tool, ToolResult
|
|
22
|
+
|
|
23
|
+
START, CONV, NULL = "(start)", "(conversion)", "(null)"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _build_chain(paths, channels):
    """Count transitions and absorptions for the absorbing Markov chain.

    ``paths`` is an iterable of ``(channel_sequence, converted)`` pairs.
    Returns ``(states, idx, trans, to_conv, to_null)``: the state list
    (start first), the state-to-index map, the state-to-state count matrix
    and the per-state counts of journeys ending in conversion / no conversion.
    """
    states = [START] + channels
    idx = {state: pos for pos, state in enumerate(states)}
    size = len(states)
    trans = np.zeros((size, size))
    to_conv = np.zeros(size)
    to_null = np.zeros(size)

    for path, converted in paths:
        # Every journey begins at the synthetic start state; unknown channels
        # are skipped.
        visited = [START]
        visited.extend(ch for ch in path if ch in idx)
        for src, dst in zip(visited, visited[1:]):
            trans[idx[src], idx[dst]] += 1
        # The last visited state absorbs into conversion or null.
        sink = to_conv if converted else to_null
        sink[idx[visited[-1]]] += 1

    return states, idx, trans, to_conv, to_null
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _conv_prob(states, idx, trans, to_conv, to_null, removed=None):
    """Return the probability of absorbing into conversion from the start state.

    When ``removed`` names a known state, every transition INTO it is rerouted
    to the null absorbing state and the state itself is emptied, so journeys
    that relied on it count as non-conversions (removal effect, not path
    compression). The input arrays are never modified. Returns 0.0 when the
    fundamental matrix cannot be inverted.
    """
    size = len(states)
    # Work on copies so repeated removal runs share the same base counts.
    moves = trans.copy()
    wins = to_conv.copy()
    losses = to_null.copy()

    if removed is not None and removed in idx:
        gone = idx[removed]
        losses += moves[:, gone]   # inbound mass becomes a non-conversion
        moves[:, gone] = 0         # channel is now unreachable
        moves[gone, :] = 0
        wins[gone] = 0
        losses[gone] = 0

    outflow = moves.sum(1) + wins + losses
    outflow[outflow == 0] = 1.0    # avoid 0/0 for states with no traffic
    transient = moves / outflow[:, None]
    absorb_conv = wins / outflow

    try:
        fundamental = np.linalg.inv(np.eye(size) - transient)
    except np.linalg.LinAlgError:
        return 0.0
    return float((fundamental @ absorb_conv)[idx[START]])
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def run_attribution_model(
    paths: list[dict],
    model: str = "markov",
) -> ToolResult:
    """Assign conversion credit to channels from journey paths.

    Args:
        paths: Entries shaped like ``{"path": [channels...], "converted": bool}``
            (``"channels"`` is accepted as an alias for ``"path"``).
        model: ``"last_touch"``, ``"linear"`` or ``"markov"``.

    Returns:
        A successful ``ToolResult`` with per-channel ``credit`` and ``share``
        artifacts, or a failed result for empty/malformed input, an unknown
        model, or a Markov chain with zero base conversion probability.
    """
    if not paths:
        return ToolResult.fail("paths is empty.", recovery_hint="Provide conversion paths.")
    norm = []
    for p in paths:
        seq = (p.get("path") or p.get("channels")) if isinstance(p, dict) else None
        # Require a real list/tuple: a bare string would otherwise be split
        # into per-character "channels" by list(seq).
        if not seq or not isinstance(seq, (list, tuple)):
            return ToolResult.fail(
                "Each path needs a 'path' list of channels.",
                recovery_hint="Format: {'path': ['Search','Social'], 'converted': true}.",
            )
        norm.append((list(seq), bool(p.get("converted", False))))

    channels = sorted({c for seq, _ in norm for c in seq})
    n_conv = sum(1 for _, c in norm if c)

    if model == "last_touch":
        # All credit goes to the final touch of each converting path.
        credit = {c: 0.0 for c in channels}
        for seq, conv in norm:
            if conv:
                credit[seq[-1]] += 1
    elif model == "linear":
        # Each converting path splits one unit of credit evenly.
        credit = {c: 0.0 for c in channels}
        for seq, conv in norm:
            if conv and seq:
                for c in seq:
                    credit[c] += 1 / len(seq)
    elif model == "markov":
        states, idx, trans, to_conv, to_null = _build_chain(norm, channels)
        base = _conv_prob(states, idx, trans, to_conv, to_null)
        if base <= 0:
            return ToolResult.fail(
                "Base conversion probability is zero; Markov attribution is undefined.",
                recovery_hint="No converting paths, or paths too sparse. Use last_touch.",
            )
        # Removal effect: drop in conversion probability without the channel.
        removal = {}
        for c in channels:
            dropped = _conv_prob(states, idx, trans, to_conv, to_null, removed=c)
            removal[c] = max(base - dropped, 0.0)
        total = sum(removal.values()) or 1.0
        # Scale removal effects so credits add up to the conversion count.
        credit = {c: removal[c] / total * n_conv for c in channels}
    else:
        return ToolResult.fail(
            f"Unknown model '{model}'.",
            recovery_hint="Choose one of: last_touch, linear, markov.",
        )

    total = sum(credit.values()) or 1.0
    share = {c: round(credit[c] / total, 4) for c in credit}
    ranked = sorted(share.items(), key=lambda kv: -kv[1])

    return ToolResult.ok(
        output="Credit share: " + ", ".join(f"{c}={s:.1%}" for c, s in ranked),
        summary=(
            f"{model} attribution over {len(norm)} paths ({n_conv} conversions). "
            f"Top channel: {ranked[0][0]} ({ranked[0][1]:.1%})."
        ),
        artifacts={"model": model, "credit": {c: round(v, 3) for c, v in credit.items()},
                   "share": share},
        next_actions=[
            "If using markov, sanity-check against last_touch to quantify assist value.",
        ],
    )
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
# Registry entry exposing run_attribution_model to the agent. "model" is
# flagged optional, so only "paths" ends up in the schema's required list.
TOOL = Tool(
    name="run_attribution_model",
    description=(
        "Assign conversion credit across channels from multi-touch path data. "
        "model='markov' uses removal effect (best with rich path data); "
        "'last_touch' and 'linear' are heuristics for thin data. Pick the model that "
        "matches the data shape -- do not run markov on single-touch paths."
    ),
    parameters={
        "paths": {
            "type": "array",
            "description": "List of {'path': [channels...], 'converted': bool}.",
            "items": {"type": "object"},
        },
        "model": {
            "type": "string",
            "enum": ["markov", "last_touch", "linear"],
            "description": "Attribution model.",
            "optional": True,
        },
    },
    fn=run_attribution_model,
)
|
mixpilot/tools/base.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
"""Tool contract for MixPilot.
|
|
2
|
+
|
|
3
|
+
The central lesson borrowed from harness engineering: a tool result is not a log
|
|
4
|
+
line, it is the next observation in the agent's reasoning loop. Its quality
|
|
5
|
+
directly determines the quality of the next decision. So every tool -- success or
|
|
6
|
+
failure -- returns a structured ToolResult with a plain-language summary, the safe
|
|
7
|
+
next actions, and a recovery hint.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from typing import Any, Callable
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class ToolResult(BaseModel):
    """Structured observation handed to the agent after a tool runs."""

    success: bool
    status: str = "success"
    output: str = ""
    error: str | None = None
    # The four fields that turn a result into a usable observation:
    summary: str | None = None
    artifacts: dict[str, Any] = Field(default_factory=dict)
    next_actions: list[str] = Field(default_factory=list)
    recovery_hint: str | None = None

    @classmethod
    def ok(
        cls,
        output: str,
        summary: str | None = None,
        artifacts: dict[str, Any] | None = None,
        next_actions: list[str] | None = None,
    ) -> "ToolResult":
        """Build a successful result; falsy collections become empty ones."""
        return cls(
            success=True,
            status="success",
            output=output,
            summary=summary,
            artifacts=artifacts if artifacts else {},
            next_actions=next_actions if next_actions else [],
        )

    @classmethod
    def fail(
        cls,
        error: str,
        recovery_hint: str | None = None,
        next_actions: list[str] | None = None,
    ) -> "ToolResult":
        """Build a failed result, filling in generic guidance when none is given."""
        default_hint = (
            "Inspect the inputs, correct the tool arguments, and retry only if the "
            "analysis still makes sense for this data shape."
        )
        default_actions = [
            "Re-check the input data and column assumptions before retrying."
        ]
        return cls(
            success=False,
            status="error",
            error=error,
            recovery_hint=recovery_hint if recovery_hint else default_hint,
            next_actions=next_actions if next_actions else default_actions,
        )
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class Tool(BaseModel):
    """A named, schema-described analysis action the agent can invoke."""

    name: str
    description: str
    # JSON-schema-style parameter spec exposed to the model.
    parameters: dict[str, Any]
    # Whether running this tool mutates state (writes a file, etc).
    mutating: bool = False
    fn: Callable[..., ToolResult]

    model_config = {"arbitrary_types_allowed": True}

    def run(self, **kwargs: Any) -> ToolResult:
        """Call ``fn`` with ``kwargs``; exceptions become failed results.

        A ``TypeError`` is reported as a bad-arguments failure; any other
        exception is reported with its type name.
        """
        try:
            outcome = self.fn(**kwargs)
        except TypeError as exc:
            return ToolResult.fail(
                error=f"Bad arguments for {self.name}: {exc}",
                recovery_hint=(
                    "Check the tool schema; an argument is missing or the "
                    "wrong type."
                ),
            )
        except Exception as exc:  # noqa: BLE001 - surface as a recoverable observation
            return ToolResult.fail(
                error=f"{type(exc).__name__} in {self.name}: {exc}",
            )
        return outcome

    def schema(self) -> dict[str, Any]:
        """Return the Anthropic tool-use schema for this tool.

        Every parameter is required unless its spec sets ``"optional"`` to a
        truthy value.
        """
        required = [
            key
            for key, spec in self.parameters.items()
            if not spec.get("optional", False)
        ]
        input_schema = {
            "type": "object",
            "properties": self.parameters,
            "required": required,
        }
        return {
            "name": self.name,
            "description": self.description,
            "input_schema": input_schema,
        }
|
mixpilot/tools/budget.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Budget allocation under saturation.
|
|
2
|
+
|
|
3
|
+
Given a Hill curve per channel (beta, alpha, gamma) and a total budget, find the
|
|
4
|
+
spend split that maximises total predicted response, subject to the budget
|
|
5
|
+
constraint and optional per-channel caps. This is where saturation curves pay off:
|
|
6
|
+
the optimiser pushes money toward channels with the most unsaturated marginal
|
|
7
|
+
return and pulls it from channels already on the flat part of their curve.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
from scipy.optimize import minimize
|
|
14
|
+
|
|
15
|
+
from .base import Tool, ToolResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _hill(x, beta, alpha, gamma):
|
|
19
|
+
xa = np.power(np.clip(x, 0, None), alpha)
|
|
20
|
+
return beta * xa / (xa + np.power(gamma, alpha) + 1e-12)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def allocate_budget(
    curves: dict,
    total_budget: float,
    caps: dict | None = None,
) -> ToolResult:
    """Split ``total_budget`` across channels to maximise summed Hill response.

    Args:
        curves: Maps channel name to a dict with ``beta``, ``alpha``, ``gamma``.
        total_budget: Positive amount that the allocation must sum to.
        caps: Optional per-channel upper bounds; channels without one are
            capped at ``total_budget``.

    Returns:
        A successful ``ToolResult`` with the allocation, predicted response
        and lift over an even split, or a failed result for invalid input or
        optimiser failure.
    """
    names = list(curves.keys())
    if not names:
        return ToolResult.fail("curves is empty.",
                               recovery_hint="Provide fitted Hill params per channel.")
    if total_budget <= 0:
        return ToolResult.fail("total_budget must be positive.")

    hill_params = []
    for name in names:
        spec = curves[name]
        try:
            triple = (float(spec["beta"]), float(spec["alpha"]), float(spec["gamma"]))
        except (KeyError, TypeError, ValueError):
            return ToolResult.fail(
                f"Channel '{name}' is missing beta/alpha/gamma.",
                recovery_hint="Run fit_saturation_curve per channel first.",
            )
        hill_params.append(triple)

    limits = caps or {}
    count = len(names)
    even_share = total_budget / count

    def objective(spend):
        # SLSQP minimises, so return the negated total response.
        return -sum(_hill(s, *p) for s, p in zip(spend, hill_params))

    solution = minimize(
        objective,
        np.full(count, even_share),  # start from the even split
        method="SLSQP",
        bounds=[(0, limits.get(name, total_budget)) for name in names],
        constraints={"type": "eq", "fun": lambda spend: spend.sum() - total_budget},
        options={"maxiter": 500, "ftol": 1e-8},
    )
    if not solution.success:
        return ToolResult.fail(
            f"Optimiser failed: {solution.message}",
            recovery_hint="Check caps sum >= total_budget and curve params are sane.",
        )

    alloc = {name: round(float(amount), 2) for name, amount in zip(names, solution.x)}
    predicted = round(float(-solution.fun), 2)
    # Baseline: response when every channel receives the same share.
    even_response = sum(_hill(even_share, *p) for p in hill_params)
    lift = predicted - even_response

    ranked = sorted(alloc.items(), key=lambda item: -item[1])
    return ToolResult.ok(
        output="Allocation: " + ", ".join(f"{c}={v:,.0f}" for c, v in ranked),
        summary=(
            f"Optimal split of {total_budget:,.0f} predicts {predicted:,.0f} response, "
            f"~{lift:,.0f} above an even split. Top: {ranked[0][0]}."
        ),
        artifacts={"allocation": alloc, "predicted_response": predicted,
                   "lift_vs_even_split": round(float(lift), 2)},
        next_actions=[
            "Present allocation with marginal-ROI caveat: extrapolating beyond "
            "observed spend ranges is unreliable.",
        ],
    )
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# Registration record for allocate_budget. "caps" is the only parameter
# marked optional; "curves" and "total_budget" carry no optional flag.
TOOL = Tool(
    fn=allocate_budget,
    name="allocate_budget",
    description=(
        "Optimise a media budget split across channels given their fitted "
        "Hill saturation curves. Maximises total predicted response under "
        "the budget constraint and optional per-channel caps."
    ),
    parameters={
        "curves": {
            "type": "object",
            "description": (
                "Map channel -> {beta, alpha, gamma} from fit_saturation_curve."
            ),
        },
        "total_budget": {
            "type": "number",
            "description": "Total budget to allocate.",
        },
        "caps": {
            "type": "object",
            "description": "Optional channel -> max spend.",
            "optional": True,
        },
    },
)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""Data-quality audit tailored to marketing-mix data.
|
|
2
|
+
|
|
3
|
+
Beyond generic null checks, this flags the issues that actually break MMM and
|
|
4
|
+
attribution: zero-variance channels (non-identifiable), multicollinearity between
|
|
5
|
+
channels (unstable coefficients -- the classic MMM trap), negative spend, and date
|
|
6
|
+
length mismatches. The audit returns structured issues with severity so the agent can decide
|
|
7
|
+
whether the downstream analysis is even trustworthy.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
from .base import Tool, ToolResult
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def audit_data_quality(
    channels: dict,
    dates: list | None = None,
    corr_threshold: float = 0.9,
) -> ToolResult:
    """Audit per-channel spend series for problems that undermine MMM fits.

    Checks performed: unequal series lengths, empty series, NaN and infinite
    values, negative values, constant (zero-variance) series, pairwise
    correlation at or above ``corr_threshold`` between equal-length series,
    and a ``dates`` list whose length differs from the first channel's.
    The contents of ``dates`` are not inspected, so calendar gaps are not
    detected here.

    Args:
        channels: Map of channel name -> flat sequence of per-period values.
        dates: Optional labels; only their count is checked.
        corr_threshold: Absolute correlation at or above which a pair of
            channels is reported as multicollinear.

    Returns:
        ``ToolResult.ok`` whose artifacts hold ``issues`` (dicts with
        ``severity``, ``type``, ``detail``), ``verdict`` and ``n_high``, or
        ``ToolResult.fail`` when ``channels`` is empty or a series cannot be
        read as a flat list of numbers.
    """
    if not channels:
        return ToolResult.fail("channels is empty.",
                               recovery_hint="Provide channel -> spend series.")

    names = list(channels.keys())
    arrs = {}
    for c, v in channels.items():
        try:
            a = np.asarray(v, dtype=float)
        except (TypeError, ValueError):
            return ToolResult.fail(
                f"Channel '{c}' contains values that are not numeric.",
                recovery_hint="Provide each channel as a flat list of numbers.",
            )
        if a.ndim != 1:
            return ToolResult.fail(
                f"Channel '{c}' is not a flat list of values.",
                recovery_hint="Provide each channel as a flat list of numbers.",
            )
        arrs[c] = a

    lengths = {c: int(arrs[c].size) for c in names}
    issues = []

    if len(set(lengths.values())) > 1:
        issues.append({"severity": "high", "type": "length_mismatch",
                       "detail": f"Channels have unequal lengths: {lengths}."})

    # Channels that vary and are fully finite; only these can be correlated.
    comparable = []
    for c in names:
        a = arrs[c]
        if a.size == 0:
            issues.append({"severity": "high", "type": "empty_series",
                           "detail": f"{c} has no observations."})
            continue
        n_nan = int(np.isnan(a).sum())
        if n_nan:
            issues.append({"severity": "high", "type": "nulls",
                           "detail": f"{c} contains {n_nan} nulls."})
        n_inf = int(np.isinf(a).sum())
        if n_inf:
            issues.append({"severity": "high", "type": "non_finite",
                           "detail": f"{c} contains {n_inf} infinite values."})
        if (a < 0).any():
            issues.append({"severity": "high", "type": "negative_spend",
                           "detail": f"{c} has negative values."})
        finite = a[np.isfinite(a)]
        # min == max is exact for constant data, whereas a std-based test can
        # miss it (the std of [0.1, 0.1, 0.1] is ~1e-17, not 0).
        if finite.size > 0 and finite.min() == finite.max():
            issues.append({"severity": "medium", "type": "zero_variance",
                           "detail": f"{c} is constant; its effect is non-identifiable."})
        elif finite.size == a.size:
            comparable.append(c)

    # Multicollinearity across channels of equal length. Grouping by length
    # keeps the check independent of which channel happens to be listed first.
    by_length = {}
    for c in comparable:
        by_length.setdefault(lengths[c], []).append(c)
    for group in by_length.values():
        if len(group) < 2:
            continue
        corr = np.corrcoef(np.vstack([arrs[c] for c in group]))
        for i, left in enumerate(group):
            for j in range(i + 1, len(group)):
                if abs(corr[i, j]) >= corr_threshold:
                    issues.append({
                        "severity": "high", "type": "multicollinearity",
                        "detail": f"{left} & {group[j]} correlate at "
                                  f"{corr[i, j]:.2f}; coefficients will be unstable.",
                    })

    if dates is not None and len(dates) != lengths[names[0]]:
        issues.append({"severity": "medium", "type": "date_misalignment",
                       "detail": "dates length does not match channel series length."})

    high = [i for i in issues if i["severity"] == "high"]
    verdict = ("NOT trustworthy without fixes" if high
               else "usable with minor caveats" if issues else "clean")

    return ToolResult.ok(
        output=f"{len(issues)} issue(s) found. Verdict: {verdict}.",
        summary=(
            f"Audited {len(names)} channels: {len(high)} high-severity issue(s). "
            f"Data is {verdict}."
        ),
        artifacts={"issues": issues, "verdict": verdict, "n_high": len(high)},
        next_actions=(
            ["Resolve high-severity issues before modelling."] if high
            else ["Proceed to adstock + saturation."]
        ),
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
# Registration record for audit_data_quality. Only "channels" lacks the
# optional flag; "dates" and "corr_threshold" are both marked optional.
TOOL = Tool(
    fn=audit_data_quality,
    name="audit_data_quality",
    description=(
        "Audit marketing-mix data for issues that break MMM/attribution: "
        "nulls, negative spend, zero-variance (non-identifiable) channels, "
        "and multicollinearity between channels. "
        "Run this FIRST, before any modelling."
    ),
    parameters={
        "channels": {
            "type": "object",
            "description": "Map channel name -> list of per-period spend.",
        },
        "dates": {
            "type": "array",
            "description": "Optional date labels.",
            "items": {"type": "string"},
            "optional": True,
        },
        "corr_threshold": {
            "type": "number",
            "description": (
                "Correlation above which to flag multicollinearity."
            ),
            "optional": True,
        },
    },
)
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Tool registry: the single pipeline every tool call passes through.
|
|
2
|
+
|
|
3
|
+
Borrowed harness lesson: external/tool output is data, not authority, and every
|
|
4
|
+
action should pass through validation and policy before it runs and before its
|
|
5
|
+
result reaches the model. Here that means: schema lookup, approval check for
|
|
6
|
+
mutating tools, execution, and consistent ToolResult wrapping.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .adstock import TOOL as ADSTOCK
|
|
12
|
+
from .attribution import TOOL as ATTRIBUTION
|
|
13
|
+
from .base import Tool, ToolResult
|
|
14
|
+
from .budget import TOOL as BUDGET
|
|
15
|
+
from .dataquality import TOOL as DATAQUALITY
|
|
16
|
+
from .saturation import TOOL as SATURATION
|
|
17
|
+
|
|
18
|
+
BUILTIN_TOOLS = [DATAQUALITY, ADSTOCK, SATURATION, ATTRIBUTION, BUDGET]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Registry:
    """Name-indexed collection of tools with an approval check on mutating calls."""

    def __init__(self, tools: list[Tool] | None = None, approval=None):
        """Register ``tools``, or the built-in set when ``tools`` is ``None``.

        An explicitly empty list produces an empty registry, so callers can
        start from nothing and add tools with ``register``. ``approval`` is
        an object exposing ``check(name, kwargs)``; it is consulted only for
        tools whose ``mutating`` attribute is truthy.
        """
        self._tools: dict[str, Tool] = {}
        self.approval = approval
        # `tools or BUILTIN_TOOLS` would also replace an explicit [] with the
        # built-ins; only the "nothing supplied" case should fall back.
        for t in (BUILTIN_TOOLS if tools is None else tools):
            self.register(t)

    def register(self, tool: Tool) -> None:
        """Add ``tool`` under its name; raise ``ValueError`` if the name is taken."""
        if tool.name in self._tools:
            raise ValueError(f"Duplicate tool name: {tool.name}")
        self._tools[tool.name] = tool

    def get(self, name: str) -> Tool | None:
        """Return the tool registered as ``name``, or ``None`` if absent."""
        return self._tools.get(name)

    def schemas(self) -> list[dict]:
        """Return ``schema()`` of every registered tool, in registration order."""
        return [t.schema() for t in self._tools.values()]

    def names(self) -> list[str]:
        """Return the registered tool names, in registration order."""
        return list(self._tools)

    def call(self, name: str, **kwargs) -> ToolResult:
        """Look up ``name``, apply the approval check, then run the tool.

        Returns a failed ``ToolResult`` when the tool is unknown or when the
        approval object rejects a mutating tool. When no approval object is
        configured, mutating tools run without a check.
        """
        tool = self._tools.get(name)
        if tool is None:
            return ToolResult.fail(
                f"No such tool '{name}'. Available: {', '.join(self._tools)}.",
                recovery_hint="Call one of the listed tools.",
            )
        if tool.mutating and self.approval is not None:
            decision = self.approval.check(name, kwargs)
            if not decision.approved:
                return ToolResult.fail(
                    f"Action '{name}' blocked by approval policy: {decision.reason}",
                    recovery_hint="A human must approve this action, or switch to a "
                                  "read-only analysis tool.",
                )
        return tool.run(**kwargs)
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
"""Saturation: diminishing returns via the Hill function.
|
|
2
|
+
|
|
3
|
+
response(x) = beta * x^alpha / (x^alpha + gamma^alpha)
|
|
4
|
+
|
|
5
|
+
- alpha (shape) controls the S-curve steepness (alpha>1 gives an S, alpha<=1 is
|
|
6
|
+
concave-only).
|
|
7
|
+
- gamma (half-saturation) is the spend level at which half the ceiling is reached.
|
|
8
|
+
- beta is the response ceiling.
|
|
9
|
+
|
|
10
|
+
This is the same family used by Meta's Robyn and most modern MMM stacks. Fitting it
|
|
11
|
+
gives the marginal-return shape you need for budget allocation.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
from scipy.optimize import curve_fit
|
|
18
|
+
|
|
19
|
+
from .base import Tool, ToolResult
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _hill(x, beta, alpha, gamma):
    """Return the Hill saturation response for spend ``x``.

    Computes ``beta * x^alpha / (x^alpha + gamma^alpha)`` after clipping
    negative ``x`` to zero; the 1e-12 in the denominator guards against a
    zero divisor.
    """
    clipped = np.clip(x, 0, None)
    raised = np.power(clipped, alpha)
    denominator = raised + np.power(gamma, alpha) + 1e-12
    return beta * raised / denominator
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def fit_saturation_curve(
    spend: list[float],
    response: list[float],
) -> ToolResult:
    """Fit Hill parameters (beta, alpha, gamma) to paired spend/response data.

    Args:
        spend: Per-period spend values.
        response: Per-period response values, aligned with ``spend``.

    Returns:
        ``ToolResult.ok`` whose artifacts hold ``beta``, ``alpha``, ``gamma``
        and ``r2``, or ``ToolResult.fail`` when the inputs are not numeric,
        are mismatched or shorter than 4 points, contain non-finite values,
        have constant or no positive spend, or when ``curve_fit`` raises.
    """
    try:
        x = np.asarray(spend, dtype=float)
        y = np.asarray(response, dtype=float)
    except (TypeError, ValueError):
        return ToolResult.fail(
            "spend and response must be flat lists of numbers.",
            recovery_hint="Provide paired (spend, response) for the same periods.",
        )
    if x.ndim != 1 or y.ndim != 1:
        return ToolResult.fail(
            "spend and response must be flat lists of numbers.",
            recovery_hint="Provide paired (spend, response) for the same periods.",
        )
    if x.size != y.size or x.size < 4:
        return ToolResult.fail(
            "Need spend and response of equal length with at least 4 points.",
            recovery_hint="Provide paired (spend, response) for the same periods.",
        )
    # Reported explicitly; otherwise curve_fit rejects the data and the
    # failure is mislabelled as non-convergence.
    if not (np.isfinite(x).all() and np.isfinite(y).all()):
        return ToolResult.fail(
            "spend/response contain missing or non-finite values.",
            recovery_hint="Drop or impute the affected periods, then refit.",
        )
    if np.allclose(x.std(), 0):
        return ToolResult.fail(
            "Spend has zero variance; a saturation curve cannot be identified.",
            recovery_hint="This channel has constant spend. You cannot estimate its "
                          "response shape from this data alone.",
            next_actions=["Flag this channel as non-identifiable and exclude it."],
        )

    positive = x[x > 0]
    if positive.size == 0:
        # _hill clips spend at zero, so with no positive spend every
        # prediction is 0 and the median-based gamma guess would be NaN.
        return ToolResult.fail(
            "Spend has no positive values; a saturation curve cannot be identified.",
            recovery_hint="Check the spend series for sign or unit errors.",
        )

    beta0 = float(y.max()) if y.max() > 0 else 1.0
    # Keep the starting gamma inside the lower bound used below.
    gamma0 = max(float(np.median(positive)), 1e-6)
    p0 = [beta0, 1.0, gamma0]
    bounds = ([0, 0.1, 1e-6], [np.inf, 5.0, np.inf])
    try:
        popt, _ = curve_fit(_hill, x, y, p0=p0, bounds=bounds, maxfev=10000)
    except Exception as exc:  # noqa: BLE001
        return ToolResult.fail(
            f"Curve fit did not converge: {exc}",
            recovery_hint="Try adstocking spend first, or check for outliers.",
        )

    beta, alpha, gamma = popt
    pred = _hill(x, *popt)
    ss_res = float(np.sum((y - pred) ** 2))
    # The epsilon avoids division by zero when the response is constant.
    ss_tot = float(np.sum((y - y.mean()) ** 2)) + 1e-12
    r2 = 1 - ss_res / ss_tot
    shape = "S-shaped" if alpha > 1.05 else "concave (pure diminishing returns)"

    return ToolResult.ok(
        output=f"Hill fit: beta={beta:.2f}, alpha={alpha:.2f}, gamma={gamma:.2f}, R2={r2:.3f}",
        summary=(
            f"Saturation is {shape}. Half-saturation spend ~{gamma:.0f}; "
            f"response ceiling ~{beta:.0f}. Fit R2={r2:.3f}."
        ),
        artifacts={
            "beta": round(float(beta), 4),
            "alpha": round(float(alpha), 4),
            "gamma": round(float(gamma), 4),
            "r2": round(r2, 4),
        },
        next_actions=[
            "Feed these curve params into allocate_budget to optimise spend split.",
            "If R2 is low, revisit adstock decay before trusting the curve.",
        ],
    )
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Registration record for fit_saturation_curve. Both parameters are numeric
# arrays and neither is marked optional.
TOOL = Tool(
    fn=fit_saturation_curve,
    name="fit_saturation_curve",
    description=(
        "Fit a Hill saturation curve (diminishing returns) to paired "
        "spend/response data for one channel. Returns ceiling (beta), "
        "shape (alpha), and half-saturation point (gamma). "
        "Run on adstocked spend for best results."
    ),
    parameters={
        "spend": {
            "type": "array",
            "items": {"type": "number"},
            "description": "Per-period spend (ideally adstocked).",
        },
        "response": {
            "type": "array",
            "items": {"type": "number"},
            "description": "Per-period response/sales attributable to spend.",
        },
    },
)
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: mixpilot
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An agentic harness for marketing measurement: adstock, saturation, attribution, and budget allocation as typed agent tools with an eval suite.
|
|
5
|
+
Author: Mohit Luthra
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/mohit-luthra/mixpilot
|
|
8
|
+
Keywords: marketing-mix-modeling,mmm,attribution,agents,llm,adstock,saturation
|
|
9
|
+
Requires-Python: >=3.10
|
|
10
|
+
Description-Content-Type: text/markdown
|
|
11
|
+
Requires-Dist: numpy>=1.24
|
|
12
|
+
Requires-Dist: scipy>=1.10
|
|
13
|
+
Requires-Dist: pydantic>=2.0
|
|
14
|
+
Provides-Extra: agent
|
|
15
|
+
Requires-Dist: anthropic>=0.39; extra == "agent"
|
|
16
|
+
Provides-Extra: dev
|
|
17
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
18
|
+
|
|
19
|
+
# MixPilot
|
|
20
|
+
|
|
21
|
+
**An agentic harness for marketing measurement.**
|
|
22
|
+
|
|
23
|
+
Everyone is building general-purpose agents. MixPilot is the opposite bet: a small,
|
|
24
|
+
real agent runtime whose entire action space is the marketing-science toolkit —
|
|
25
|
+
adstock, saturation, multi-touch attribution, budget optimisation, and a
|
|
26
|
+
marketing-aware data-quality audit — wired as typed tools with structured results,
|
|
27
|
+
an approval gate, and an eval suite that scores the agent's *method-selection
|
|
28
|
+
judgment*.
|
|
29
|
+
|
|
30
|
+
The thesis: in a domain agent, the tools are the product. A model can sound
|
|
31
|
+
confident about marketing data; the hard part is knowing which method the data can
|
|
32
|
+
actually support — and refusing the ones it can't. That judgment is what MixPilot
|
|
33
|
+
encodes and tests.
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install mixpilot # tools only
|
|
37
|
+
pip install "mixpilot[agent]" # + the Anthropic-backed agent loop
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## What's inside
|
|
41
|
+
|
|
42
|
+
| Component | What it does | The lesson |
|
|
43
|
+
|---|---|---|
|
|
44
|
+
| `audit_data_quality` | nulls, negative spend, zero-variance, **multicollinearity** | The audit decides whether any later number is trustworthy at all |
|
|
45
|
+
| `adstock_transform` | geometric carryover with half-life | Carryover before saturation, always |
|
|
46
|
+
| `fit_saturation_curve` | Hill curve (β, α, γ) + R² | A low-R² curve is a warning, not a result |
|
|
47
|
+
| `run_attribution_model` | last-touch / linear / **Markov removal-effect** | Match the model to the data shape, never overreach |
|
|
48
|
+
| `allocate_budget` | constrained optimisation over saturation curves | Move money toward unsaturated marginal return |
|
|
49
|
+
| `ToolResult` contract | summary + next_actions + recovery_hint on every call | A tool result is the next observation, not a log line |
|
|
50
|
+
| Approval gate | mutating actions clear a policy gate outside the prompt | Safety lives in code, not prose |
|
|
51
|
+
| Agent loop | turn budget + loop detection + stop conditions | The loop is a control system, not a while-tool-calls toy |
|
|
52
|
+
| Eval suite | data shape → required/forbidden method | Test the judgment, not the prose |
|
|
53
|
+
|
|
54
|
+
## The judgment, made concrete
|
|
55
|
+
|
|
56
|
+
Same data, two attribution models:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from mixpilot.tools.attribution import run_attribution_model
|
|
60
|
+
|
|
61
|
+
paths = [{"path": ["Social", "Search"], "converted": True}] * 40
|
|
62
|
+
paths += [{"path": ["Social"], "converted": False}] * 20
|
|
63
|
+
|
|
64
|
+
run_attribution_model(paths, "last_touch").artifacts["share"]
|
|
65
|
+
# {'Search': 1.0, 'Social': 0.0} <- the assist is thrown away
|
|
66
|
+
run_attribution_model(paths, "markov").artifacts["share"]
|
|
67
|
+
# {'Search': 0.5, 'Social': 0.5} <- both channels are necessary
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
Last-touch hands Social nothing. Markov's removal effect sees that every conversion
|
|
71
|
+
needed Social first, and splits the credit. That gap — assist value — is the entire
|
|
72
|
+
reason multi-touch attribution exists.
|
|
73
|
+
|
|
74
|
+
## Running the agent
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from mixpilot import Agent
|
|
78
|
+
from mixpilot.agent.client import AnthropicClient
|
|
79
|
+
|
|
80
|
+
agent = Agent(AnthropicClient()) # needs ANTHROPIC_API_KEY
|
|
81
|
+
result = agent.run(
|
|
82
|
+
"Here are 12 weeks of spend and sales for one channel ... "
|
|
83
|
+
"what's the saturation point and should we spend more?"
|
|
84
|
+
)
|
|
85
|
+
print(result.final_text)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
The loop injects the client, so it runs against a real model in production and a
|
|
89
|
+
scripted client in evals — no network needed to test the harness.
|
|
90
|
+
|
|
91
|
+
## Evals: scoring method selection
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
python evals/run_evals.py # offline, deterministic
|
|
95
|
+
python evals/run_evals.py --live # against a real model
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Each case pairs a data shape with the methods it does and does not support. A case
|
|
99
|
+
passes only if the agent uses every required tool and avoids every forbidden one —
|
|
100
|
+
e.g. it must **not** run Markov attribution on single-touch data, and must audit
|
|
101
|
+
before claiming per-channel effects on collinear spend.
|
|
102
|
+
|
|
103
|
+
## Tests
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
python -m pytest -q # 10 passing — domain math + harness contracts
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
The tests never call a model. They prove the adstock conserves mass, the Hill fit
|
|
110
|
+
recovers known parameters, the Markov removal effect credits assists, the optimiser
|
|
111
|
+
moves budget toward unsaturated channels, the audit catches multicollinearity, and
|
|
112
|
+
the approval gate blocks mutating actions. Harness reliability that has nothing to do
|
|
113
|
+
with model intelligence.
|
|
114
|
+
|
|
115
|
+
## Scope
|
|
116
|
+
|
|
117
|
+
This is deliberately small. No context compaction, no MCP, no subagents — those are
|
|
118
|
+
solved problems in general harnesses. The point here is the opposite: how far you get
|
|
119
|
+
when the action space is a real domain and the tools enforce the judgment.
|
|
120
|
+
|
|
121
|
+
MIT licensed.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
mixpilot/__init__.py,sha256=q5wz41OcwOhBIgAWMJe8aeyyTmQFpspouaO1ON8m38M,683
|
|
2
|
+
mixpilot/agent/__init__.py,sha256=3C_OCAb3bqBvpM6usMAB497w-WM5P6X9Bl6gUEKvJJE,155
|
|
3
|
+
mixpilot/agent/client.py,sha256=lO-2ZzQ3SncoPHrSB7-C8J5eXGwr9gvX-R04qmQmmEU,1446
|
|
4
|
+
mixpilot/agent/groq_client.py,sha256=WRu2VwoYUE1N-XoiMJg_EDp-kykiORZYjlY2mRKpN6U,4596
|
|
5
|
+
mixpilot/agent/loop.py,sha256=k7fYaOwdUWjInTr4C7Kk9DgDDviL5uaXgG9doLFUM9s,4536
|
|
6
|
+
mixpilot/safety/__init__.py,sha256=HmOWG7fLnSYp9EdLTl4VuWOpbX4EsZl6019f6zs_5Ko,102
|
|
7
|
+
mixpilot/safety/approval.py,sha256=AvH060ADUO4HC_MgOe8DE9zDy9DO99TKa-yvO20fRck,1383
|
|
8
|
+
mixpilot/tools/__init__.py,sha256=roZIer8m05a1bFKvQe3y7YRT6CIfYh08Ttubt-6_5A8,143
|
|
9
|
+
mixpilot/tools/adstock.py,sha256=WwVTqqh6D6r5bVwOHjNv5Nz0yt93U94DGg_iF5N3yBA,2810
|
|
10
|
+
mixpilot/tools/attribution.py,sha256=mZfilJVnv_Abz6pxlBWwJjzTZuGZT0Gc9Mg_HuoEkkM,5982
|
|
11
|
+
mixpilot/tools/base.py,sha256=38FlB5azzFt02UPNGisoST7Qs-muNfOe0U3D1KQb5U0,3462
|
|
12
|
+
mixpilot/tools/budget.py,sha256=_18l72HESeZiFWwK8ztG33RaweyaJcyPfbdOFemI6VE,3764
|
|
13
|
+
mixpilot/tools/dataquality.py,sha256=lfDygb1MDHDNM2klISopGoyk2wRYO8mjIryvY0AbjVI,4242
|
|
14
|
+
mixpilot/tools/registry.py,sha256=LhGEVR650EO0P15k-E2Ne76JHwPg4wfjnQNx71C7N9M,2166
|
|
15
|
+
mixpilot/tools/saturation.py,sha256=ld3cKmY6KRiUNAJ6TZN7GILucvWCruu7s1VgNfJqVeI,3612
|
|
16
|
+
mixpilot-0.1.0.dist-info/METADATA,sha256=cWYGfZi54oBDdGy-M7UG6TQ9MZnxNAEZxtC3woobkQE,5234
|
|
17
|
+
mixpilot-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
18
|
+
mixpilot-0.1.0.dist-info/top_level.txt,sha256=6BA2FqJnYiBkdycD1u8D8B0tXKLpv3QR2CryuzmFV8Y,9
|
|
19
|
+
mixpilot-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
mixpilot
|