spooled-ai 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spooled_ai-0.4.0/LICENSE +19 -0
- spooled_ai-0.4.0/MANIFEST.in +28 -0
- spooled_ai-0.4.0/PKG-INFO +247 -0
- spooled_ai-0.4.0/README.md +183 -0
- spooled_ai-0.4.0/cli/__init__.py +5 -0
- spooled_ai-0.4.0/cli/commands/__init__.py +53 -0
- spooled_ai-0.4.0/cli/commands/analyze.py +340 -0
- spooled_ai-0.4.0/cli/commands/attest.py +321 -0
- spooled_ai-0.4.0/cli/commands/baseline.py +660 -0
- spooled_ai-0.4.0/cli/commands/ci.py +3374 -0
- spooled_ai-0.4.0/cli/commands/ci_display.py +338 -0
- spooled_ai-0.4.0/cli/commands/ci_engine.py +895 -0
- spooled_ai-0.4.0/cli/commands/demo.py +264 -0
- spooled_ai-0.4.0/cli/commands/diff.py +114 -0
- spooled_ai-0.4.0/cli/commands/doctor.py +407 -0
- spooled_ai-0.4.0/cli/commands/fingerprint.py +231 -0
- spooled_ai-0.4.0/cli/commands/fleet.py +539 -0
- spooled_ai-0.4.0/cli/commands/ingest.py +352 -0
- spooled_ai-0.4.0/cli/commands/init.py +313 -0
- spooled_ai-0.4.0/cli/commands/keys.py +263 -0
- spooled_ai-0.4.0/cli/commands/list.py +116 -0
- spooled_ai-0.4.0/cli/commands/new_agent.py +252 -0
- spooled_ai-0.4.0/cli/commands/policy.py +165 -0
- spooled_ai-0.4.0/cli/commands/pull.py +185 -0
- spooled_ai-0.4.0/cli/commands/replay.py +123 -0
- spooled_ai-0.4.0/cli/commands/session.py +310 -0
- spooled_ai-0.4.0/cli/commands/traces.py +138 -0
- spooled_ai-0.4.0/cli/commands/verify.py +392 -0
- spooled_ai-0.4.0/cli/commands/view.py +258 -0
- spooled_ai-0.4.0/cli/commands/watch.py +185 -0
- spooled_ai-0.4.0/cli/diff.py +510 -0
- spooled_ai-0.4.0/cli/main.py +198 -0
- spooled_ai-0.4.0/cli/replay_generator.py +221 -0
- spooled_ai-0.4.0/cli/report.py +514 -0
- spooled_ai-0.4.0/cli/report_pr.py +424 -0
- spooled_ai-0.4.0/cli/templates/__init__.py +3 -0
- spooled_ai-0.4.0/cli/templates/demo_agent.py +252 -0
- spooled_ai-0.4.0/cli/templates/demo_variant.py +220 -0
- spooled_ai-0.4.0/cli/templates/github_workflow.py +185 -0
- spooled_ai-0.4.0/cli/templates/policy_templates.py +150 -0
- spooled_ai-0.4.0/cli/templates/sample_policy.py +7 -0
- spooled_ai-0.4.0/cli/utils.py +51 -0
- spooled_ai-0.4.0/pyproject.toml +224 -0
- spooled_ai-0.4.0/setup.cfg +4 -0
- spooled_ai-0.4.0/spooled/__init__.py +264 -0
- spooled_ai-0.4.0/spooled/_context.py +27 -0
- spooled_ai-0.4.0/spooled/attest.py +267 -0
- spooled_ai-0.4.0/spooled/attestation.py +269 -0
- spooled_ai-0.4.0/spooled/baseline.py +1394 -0
- spooled_ai-0.4.0/spooled/ci_history_sync.py +185 -0
- spooled_ai-0.4.0/spooled/circuit_breaker.py +115 -0
- spooled_ai-0.4.0/spooled/decorators.py +343 -0
- spooled_ai-0.4.0/spooled/exporters/__init__.py +30 -0
- spooled_ai-0.4.0/spooled/exporters/_semconv.py +74 -0
- spooled_ai-0.4.0/spooled/exporters/otel.py +423 -0
- spooled_ai-0.4.0/spooled/exporters/webhook.py +159 -0
- spooled_ai-0.4.0/spooled/fingerprint.py +830 -0
- spooled_ai-0.4.0/spooled/hash_utils.py +176 -0
- spooled_ai-0.4.0/spooled/hooks/__init__.py +89 -0
- spooled_ai-0.4.0/spooled/hooks/_privacy.py +185 -0
- spooled_ai-0.4.0/spooled/hooks/aiohttp_hook.py +161 -0
- spooled_ai-0.4.0/spooled/hooks/anthropic_hook.py +546 -0
- spooled_ai-0.4.0/spooled/hooks/bedrock_hook.py +920 -0
- spooled_ai-0.4.0/spooled/hooks/httpx_hook.py +218 -0
- spooled_ai-0.4.0/spooled/hooks/openai_hook.py +699 -0
- spooled_ai-0.4.0/spooled/hooks/requests_hook.py +171 -0
- spooled_ai-0.4.0/spooled/integrations/__init__.py +18 -0
- spooled_ai-0.4.0/spooled/integrations/autogen.py +161 -0
- spooled_ai-0.4.0/spooled/integrations/crewai.py +222 -0
- spooled_ai-0.4.0/spooled/integrations/langchain.py +200 -0
- spooled_ai-0.4.0/spooled/integrations/langgraph.py +352 -0
- spooled_ai-0.4.0/spooled/integrations/llamaindex.py +116 -0
- spooled_ai-0.4.0/spooled/licensing.py +267 -0
- spooled_ai-0.4.0/spooled/metrics.py +98 -0
- spooled_ai-0.4.0/spooled/models.py +569 -0
- spooled_ai-0.4.0/spooled/policy.py +874 -0
- spooled_ai-0.4.0/spooled/processors/__init__.py +24 -0
- spooled_ai-0.4.0/spooled/processors/_converter.py +264 -0
- spooled_ai-0.4.0/spooled/processors/otel.py +355 -0
- spooled_ai-0.4.0/spooled/py.typed +0 -0
- spooled_ai-0.4.0/spooled/recorder.py +1134 -0
- spooled_ai-0.4.0/spooled/redaction.py +337 -0
- spooled_ai-0.4.0/spooled/scoring.py +287 -0
- spooled_ai-0.4.0/spooled/signals.py +1391 -0
- spooled_ai-0.4.0/spooled/storage.py +214 -0
- spooled_ai-0.4.0/spooled/tool_signature.py +125 -0
- spooled_ai-0.4.0/spooled/usage.py +214 -0
- spooled_ai-0.4.0/spooled/utils.py +111 -0
- spooled_ai-0.4.0/spooled/wrappers/__init__.py +18 -0
- spooled_ai-0.4.0/spooled/wrappers/anthropic_wrapper.py +108 -0
- spooled_ai-0.4.0/spooled/wrappers/openai_wrapper.py +137 -0
- spooled_ai-0.4.0/spooled_ai.egg-info/PKG-INFO +247 -0
- spooled_ai-0.4.0/spooled_ai.egg-info/SOURCES.txt +95 -0
- spooled_ai-0.4.0/spooled_ai.egg-info/dependency_links.txt +1 -0
- spooled_ai-0.4.0/spooled_ai.egg-info/entry_points.txt +3 -0
- spooled_ai-0.4.0/spooled_ai.egg-info/requires.txt +44 -0
- spooled_ai-0.4.0/spooled_ai.egg-info/top_level.txt +2 -0
spooled_ai-0.4.0/LICENSE
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
Copyright (c) 2026 Spooled AI. All rights reserved.
|
|
2
|
+
|
|
3
|
+
This software and associated documentation files (the "Software") are the
|
|
4
|
+
proprietary property of Spooled AI. You may use the Software only in
|
|
5
|
+
accordance with the terms of your agreement with Spooled AI or, in the
|
|
6
|
+
absence of such agreement, subject to the following conditions:
|
|
7
|
+
|
|
8
|
+
1. You may install and use the Software for your internal business purposes.
|
|
9
|
+
2. You may not redistribute, sublicense, sell, or otherwise make the Software
|
|
10
|
+
available to third parties, in whole or in part.
|
|
11
|
+
3. You may not modify, reverse-engineer, decompile, or create derivative
|
|
12
|
+
works based on the Software.
|
|
13
|
+
|
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
15
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
16
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
17
|
+
SPOOLED AI BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
|
|
18
|
+
AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
|
19
|
+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Include only what belongs in the pip package
|
|
2
|
+
include LICENSE
|
|
3
|
+
include README.md
|
|
4
|
+
include pyproject.toml
|
|
5
|
+
|
|
6
|
+
# SDK and CLI source
|
|
7
|
+
recursive-include spooled *.py
|
|
8
|
+
include spooled/py.typed
|
|
9
|
+
recursive-include cli *.py
|
|
10
|
+
|
|
11
|
+
# Exclude everything else
|
|
12
|
+
prune action
|
|
13
|
+
prune backend
|
|
14
|
+
prune tests
|
|
15
|
+
prune tools
|
|
16
|
+
prune docs
|
|
17
|
+
prune examples
|
|
18
|
+
prune schemas
|
|
19
|
+
prune .github
|
|
20
|
+
prune cdk.out
|
|
21
|
+
prune htmlcov
|
|
22
|
+
|
|
23
|
+
# Exclude dev/config files
|
|
24
|
+
exclude .env
|
|
25
|
+
exclude .env.*
|
|
26
|
+
exclude .pre-commit-config.yaml
|
|
27
|
+
exclude Makefile
|
|
28
|
+
exclude setup.py
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spooled-ai
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: CI for AI agents - behavioral fingerprinting and drift detection
|
|
5
|
+
Author: Spooled Team
|
|
6
|
+
License: Proprietary
|
|
7
|
+
Project-URL: Homepage, https://spooled.ai
|
|
8
|
+
Project-URL: Documentation, https://spooled.ai/docs
|
|
9
|
+
Project-URL: Changelog, https://spooled.ai/docs/changelog
|
|
10
|
+
Keywords: ai,agents,tracing,debugging,replay,ci,behavioral-testing,llm
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: Other/Proprietary License
|
|
14
|
+
Classifier: Programming Language :: Python :: 3
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Classifier: Topic :: Software Development :: Testing
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Description-Content-Type: text/markdown
|
|
24
|
+
License-File: LICENSE
|
|
25
|
+
Requires-Dist: pydantic<3.0.0,>=2.0.0
|
|
26
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
27
|
+
Requires-Dist: pyyaml>=6.0.0
|
|
28
|
+
Requires-Dist: structlog>=23.0.0
|
|
29
|
+
Requires-Dist: typer>=0.9.0
|
|
30
|
+
Requires-Dist: rich>=13.0.0
|
|
31
|
+
Requires-Dist: httpx>=0.24.0
|
|
32
|
+
Requires-Dist: requests>=2.31.0
|
|
33
|
+
Provides-Extra: dev
|
|
34
|
+
Requires-Dist: pytest>=7.4.0; extra == "dev"
|
|
35
|
+
Requires-Dist: pytest-cov>=4.1.0; extra == "dev"
|
|
36
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
37
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
38
|
+
Requires-Dist: mypy>=1.5.0; extra == "dev"
|
|
39
|
+
Requires-Dist: pre-commit>=3.5.0; extra == "dev"
|
|
40
|
+
Requires-Dist: types-requests; extra == "dev"
|
|
41
|
+
Requires-Dist: types-python-dateutil; extra == "dev"
|
|
42
|
+
Provides-Extra: cli
|
|
43
|
+
Provides-Extra: backend
|
|
44
|
+
Requires-Dist: aws-cdk-lib>=2.100.0; extra == "backend"
|
|
45
|
+
Requires-Dist: constructs>=10.0.0; extra == "backend"
|
|
46
|
+
Requires-Dist: boto3>=1.28.0; extra == "backend"
|
|
47
|
+
Provides-Extra: tools
|
|
48
|
+
Requires-Dist: langchain>=0.3.0; extra == "tools"
|
|
49
|
+
Requires-Dist: langchain-community>=0.3.0; extra == "tools"
|
|
50
|
+
Requires-Dist: langchain-openai>=0.2.0; extra == "tools"
|
|
51
|
+
Requires-Dist: langgraph>=0.2.0; extra == "tools"
|
|
52
|
+
Requires-Dist: crewai>=0.80.0; extra == "tools"
|
|
53
|
+
Requires-Dist: llama-index>=0.10.0; extra == "tools"
|
|
54
|
+
Requires-Dist: pyautogen>=0.2.0; extra == "tools"
|
|
55
|
+
Requires-Dist: aiohttp>=3.9.0; extra == "tools"
|
|
56
|
+
Requires-Dist: boto3>=1.28.0; extra == "tools"
|
|
57
|
+
Provides-Extra: metrics
|
|
58
|
+
Requires-Dist: prometheus-client>=0.20.0; extra == "metrics"
|
|
59
|
+
Provides-Extra: otel
|
|
60
|
+
Requires-Dist: opentelemetry-api>=1.20.0; extra == "otel"
|
|
61
|
+
Requires-Dist: opentelemetry-sdk>=1.20.0; extra == "otel"
|
|
62
|
+
Requires-Dist: opentelemetry-exporter-otlp-proto-http>=1.20.0; extra == "otel"
|
|
63
|
+
Dynamic: license-file
|
|
64
|
+
|
|
65
|
+
# Spooled — Behavioral CI for AI Agents
|
|
66
|
+
|
|
67
|
+
> **The diff for agent behavior.**
|
|
68
|
+
> Capture what your agent does, detect when it changes, gate the PR.
|
|
69
|
+
|
|
70
|
+
AI agents are non-deterministic. The same code, prompt, and model produce different tool-calling behavior on every run. A one-word prompt edit can silently drop a compliance check. A model upgrade can change which tools get called. A KB refresh can alter the agent's decision path. Unit tests pass. Eval suites pass. Nobody notices until production.
|
|
71
|
+
|
|
72
|
+
Spooled catches it on the PR.
|
|
73
|
+
|
|
74
|
+
## What It Does
|
|
75
|
+
|
|
76
|
+
**Capture** — wraps your LLM client and records the structural fingerprint of every agent run: which tools were called, in what order, how many times. Content-blind by architecture — prompts, customer data, and AI responses never leave your infrastructure.
|
|
77
|
+
|
|
78
|
+
**Compare** — diffs the current run against a committed baseline. Shows exactly what changed: tools added, tools removed, sequence reordered, token usage shifted.
|
|
79
|
+
|
|
80
|
+
**Gate** — posts a PR comment with a behavioral score. Blocks the merge if the policy says so. Resolution instructions included.
|
|
81
|
+
|
|
82
|
+
## Install
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pip install spooled-ai
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Quick Start
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
import spooled
|
|
92
|
+
from spooled.wrappers import wrap_openai
|
|
93
|
+
from openai import OpenAI
|
|
94
|
+
|
|
95
|
+
spooled.init(agent_id="my_agent")
|
|
96
|
+
client = wrap_openai(OpenAI())
|
|
97
|
+
|
|
98
|
+
response = client.chat.completions.create(
|
|
99
|
+
model="gpt-4o",
|
|
100
|
+
messages=[{"role": "user", "content": "Analyze this deal"}],
|
|
101
|
+
tools=MY_TOOLS,
|
|
102
|
+
)
|
|
103
|
+
|
|
104
|
+
spooled.shutdown()
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
That's it. Every tool call is captured. The trace is saved to `.spooled/traces/`. The hash chain signs every interaction at capture time.
|
|
108
|
+
|
|
109
|
+
## CI Integration
|
|
110
|
+
|
|
111
|
+
```yaml
|
|
112
|
+
# .github/workflows/spooled.yml
|
|
113
|
+
- name: Generate traces
|
|
114
|
+
run: python ci_runner.py
|
|
115
|
+
env:
|
|
116
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
117
|
+
|
|
118
|
+
- name: Spooled behavioral check
|
|
119
|
+
run: |
|
|
120
|
+
pip install spooled-ai
|
|
121
|
+
spooled ci compare .spooled/traces/*.jsonl \
|
|
122
|
+
--baseline .github/baselines \
|
|
123
|
+
--policy spooled-policy.yml \
|
|
124
|
+
--enable-blocking
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
The action compares traces against committed baselines and posts a PR comment:
|
|
128
|
+
|
|
129
|
+
```
|
|
130
|
+
❌ Spooled Behavioral CI: FAIL
|
|
131
|
+
Spooled Score: 59/100 (D) 🔴
|
|
132
|
+
|
|
133
|
+
| Agent | Status | Score | Tokens |
|
|
134
|
+
|------------|---------------------|-------|---------------|
|
|
135
|
+
| deal_agent | ⚠️ Variant · Tooling change | 59 | 🔻 14198 (-32%) |
|
|
136
|
+
|
|
137
|
+
🔧 Tool Changes:
|
|
138
|
+
➖ sanctions_screening removed
|
|
139
|
+
➖ ip_patent_search removed
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
## What Spooled Catches
|
|
143
|
+
|
|
144
|
+
Validated across 50 real-world test scenarios with ~316 OpenAI API calls:
|
|
145
|
+
|
|
146
|
+
| Change type | Example | Unit tests | Spooled |
|
|
147
|
+
|-------------|---------|:----------:|:-------:|
|
|
148
|
+
| Prompt tweak | "Be concise" drops compliance tools | ✅ Pass | **VARIANT** |
|
|
149
|
+
| Model swap | gpt-4o drops sanctions screening | ✅ Pass | **VARIANT** |
|
|
150
|
+
| Tool deprecation | Agent proceeds on sanctioned entity without sanctions data | ✅ Pass | **VARIANT** |
|
|
151
|
+
| KB refresh | Fraud tickets lose customer response | ✅ Pass | **VARIANT** |
|
|
152
|
+
| Schema migration | Field rename stops international detection | ✅ Pass | **VARIANT** |
|
|
153
|
+
| Prompt reordering | Same words, sections reordered | ✅ Pass | **VARIANT** |
|
|
154
|
+
| Tool description edit | Better docs change model decisions | ✅ Pass | **VARIANT** |
|
|
155
|
+
| Upstream degradation | Retry paths appear in fingerprint | ✅ Pass | **VARIANT** |
|
|
156
|
+
|
|
157
|
+
## Content-Blind Architecture
|
|
158
|
+
|
|
159
|
+
Spooled never captures prompts, customer data, or AI responses. Only structural metadata: tool names, call sequence, token counts, timing. This is enforced in code — content is stripped before the trace reaches disk.
|
|
160
|
+
|
|
161
|
+
Verified: we injected SSNs, credit cards, API keys, and email addresses into tool outputs. Scanned the trace file. **Zero PII found.** Structural data (tool names, model, usage) fully preserved.
|
|
162
|
+
|
|
163
|
+
This opens regulated markets (healthcare, finance, government) where competitors who capture content cannot operate.
|
|
164
|
+
|
|
165
|
+
## Multi-Agent Support
|
|
166
|
+
|
|
167
|
+
```python
|
|
168
|
+
# Supervisor starts
|
|
169
|
+
spooled.init(agent_id="supervisor")
|
|
170
|
+
# ... supervisor work ...
|
|
171
|
+
|
|
172
|
+
# Child inherits parent linkage automatically
|
|
173
|
+
spooled.init(agent_id="worker") # auto-detects parent, inherits session
|
|
174
|
+
# ... worker work ...
|
|
175
|
+
spooled.shutdown() # stops worker only, supervisor stays active
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Each agent gets its own trace, its own fingerprint, its own baseline. Concurrent execution is safe — tested with 3 agents in parallel threads, zero cross-contamination.
|
|
179
|
+
|
|
180
|
+
## Policy Rules
|
|
181
|
+
|
|
182
|
+
```yaml
|
|
183
|
+
# spooled-policy.yml
|
|
184
|
+
name: "Production gate"
|
|
185
|
+
enabled: true
|
|
186
|
+
block_merges: true
|
|
187
|
+
rules:
|
|
188
|
+
- name: "Block behavioral variants"
|
|
189
|
+
fail_if:
|
|
190
|
+
on_variant: true # Block structural changes
|
|
191
|
+
on_new_behavior: true # Block new intent patterns
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
Separate `on_variant` (structural change to a known intent) from `on_new_behavior` (entirely new intent bucket). Gate each independently.
|
|
195
|
+
|
|
196
|
+
## Actionable Findings
|
|
197
|
+
|
|
198
|
+
From the 50-scenario assessment:
|
|
199
|
+
|
|
200
|
+
- **Set `seed=42`** on OpenAI tool-calling agents — achieves 100% fingerprint stability on gpt-4o-mini, 75% on gpt-4o
|
|
201
|
+
- **Guardrails need "non-negotiable" language** — medium-strength prompts erode 100% under conversational pressure; strong prompts with explicit override-rejection hold 100%
|
|
202
|
+
- **Run 20+ inputs** before shipping prompt changes — 10-input samples produce noise; 20+ reveals the real distribution shift
|
|
203
|
+
|
|
204
|
+
## Supported Libraries
|
|
205
|
+
|
|
206
|
+
**LLM Providers (explicit wrappers):**
|
|
207
|
+
- OpenAI (sync/async, streaming)
|
|
208
|
+
- Anthropic (sync/async, streaming)
|
|
209
|
+
|
|
210
|
+
**HTTP & Cloud (auto-instrumented via hooks):**
|
|
211
|
+
- AWS Bedrock
|
|
212
|
+
- requests, httpx, aiohttp
|
|
213
|
+
|
|
214
|
+
**Frameworks (callback handlers):**
|
|
215
|
+
- LangChain, LlamaIndex, AutoGen, CrewAI, LangGraph
|
|
216
|
+
|
|
217
|
+
## Commands
|
|
218
|
+
|
|
219
|
+
```bash
|
|
220
|
+
spooled verify trace <run_id> # Hash chain integrity check
|
|
221
|
+
spooled diff traces <a> <b> # Structural comparison
|
|
222
|
+
spooled ci compare <trace> --baseline <dir> # CI comparison
|
|
223
|
+
spooled ci update-baseline --from dir # Generate baselines
|
|
224
|
+
spooled analyze --agent-id my_agent # Analyze trace patterns
|
|
225
|
+
spooled policy init # Create a policy file
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
## How It Works
|
|
229
|
+
|
|
230
|
+
1. `wrap_openai()` intercepts `chat.completions.create()` calls
|
|
231
|
+
2. Each tool call is recorded with a SHA-256 hash of the previous interaction (hash chain)
|
|
232
|
+
3. Content is stripped to structural metadata before saving (privacy by architecture)
|
|
233
|
+
4. A `structural_hash` of the saved data enables post-save tampering detection
|
|
234
|
+
5. The trace is saved as append-only JSONL in `.spooled/traces/`
|
|
235
|
+
6. `spooled ci compare` extracts the fingerprint (tool graph) and diffs against the baseline
|
|
236
|
+
7. Policy rules determine PASS/FAIL; the PR comment shows what changed
|
|
237
|
+
|
|
238
|
+
## Documentation
|
|
239
|
+
|
|
240
|
+
- [CI Integration Guide](docs/CI_INTEGRATION.md)
|
|
241
|
+
- [Architecture & Trust Model](docs/ARCHITECTURE.md)
|
|
242
|
+
- [Data Schema](docs/DATA_SCHEMA.md)
|
|
243
|
+
- [Deployment Guide](docs/DEPLOYMENT.md)
|
|
244
|
+
|
|
245
|
+
## License
|
|
246
|
+
|
|
247
|
+
Proprietary — see the `LICENSE` file for terms.
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
# Spooled — Behavioral CI for AI Agents
|
|
2
|
+
|
|
3
|
+
> **The diff for agent behavior.**
|
|
4
|
+
> Capture what your agent does, detect when it changes, gate the PR.
|
|
5
|
+
|
|
6
|
+
AI agents are non-deterministic. The same code, prompt, and model produce different tool-calling behavior on every run. A one-word prompt edit can silently drop a compliance check. A model upgrade can change which tools get called. A KB refresh can alter the agent's decision path. Unit tests pass. Eval suites pass. Nobody notices until production.
|
|
7
|
+
|
|
8
|
+
Spooled catches it on the PR.
|
|
9
|
+
|
|
10
|
+
## What It Does
|
|
11
|
+
|
|
12
|
+
**Capture** — wraps your LLM client and records the structural fingerprint of every agent run: which tools were called, in what order, how many times. Content-blind by architecture — prompts, customer data, and AI responses never leave your infrastructure.
|
|
13
|
+
|
|
14
|
+
**Compare** — diffs the current run against a committed baseline. Shows exactly what changed: tools added, tools removed, sequence reordered, token usage shifted.
|
|
15
|
+
|
|
16
|
+
**Gate** — posts a PR comment with a behavioral score. Blocks the merge if the policy says so. Resolution instructions included.
|
|
17
|
+
|
|
18
|
+
## Install
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
pip install spooled-ai
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## Quick Start
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import spooled
|
|
28
|
+
from spooled.wrappers import wrap_openai
|
|
29
|
+
from openai import OpenAI
|
|
30
|
+
|
|
31
|
+
spooled.init(agent_id="my_agent")
|
|
32
|
+
client = wrap_openai(OpenAI())
|
|
33
|
+
|
|
34
|
+
response = client.chat.completions.create(
|
|
35
|
+
model="gpt-4o",
|
|
36
|
+
messages=[{"role": "user", "content": "Analyze this deal"}],
|
|
37
|
+
tools=MY_TOOLS,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
spooled.shutdown()
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
That's it. Every tool call is captured. The trace is saved to `.spooled/traces/`. The hash chain signs every interaction at capture time.
|
|
44
|
+
|
|
45
|
+
## CI Integration
|
|
46
|
+
|
|
47
|
+
```yaml
|
|
48
|
+
# .github/workflows/spooled.yml
|
|
49
|
+
- name: Generate traces
|
|
50
|
+
run: python ci_runner.py
|
|
51
|
+
env:
|
|
52
|
+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
|
53
|
+
|
|
54
|
+
- name: Spooled behavioral check
|
|
55
|
+
run: |
|
|
56
|
+
pip install spooled-ai
|
|
57
|
+
spooled ci compare .spooled/traces/*.jsonl \
|
|
58
|
+
--baseline .github/baselines \
|
|
59
|
+
--policy spooled-policy.yml \
|
|
60
|
+
--enable-blocking
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
The action compares traces against committed baselines and posts a PR comment:
|
|
64
|
+
|
|
65
|
+
```
|
|
66
|
+
❌ Spooled Behavioral CI: FAIL
|
|
67
|
+
Spooled Score: 59/100 (D) 🔴
|
|
68
|
+
|
|
69
|
+
| Agent | Status | Score | Tokens |
|
|
70
|
+
|------------|---------------------|-------|---------------|
|
|
71
|
+
| deal_agent | ⚠️ Variant · Tooling change | 59 | 🔻 14198 (-32%) |
|
|
72
|
+
|
|
73
|
+
🔧 Tool Changes:
|
|
74
|
+
➖ sanctions_screening removed
|
|
75
|
+
➖ ip_patent_search removed
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## What Spooled Catches
|
|
79
|
+
|
|
80
|
+
Validated across 50 real-world test scenarios with ~316 OpenAI API calls:
|
|
81
|
+
|
|
82
|
+
| Change type | Example | Unit tests | Spooled |
|
|
83
|
+
|-------------|---------|:----------:|:-------:|
|
|
84
|
+
| Prompt tweak | "Be concise" drops compliance tools | ✅ Pass | **VARIANT** |
|
|
85
|
+
| Model swap | gpt-4o drops sanctions screening | ✅ Pass | **VARIANT** |
|
|
86
|
+
| Tool deprecation | Agent proceeds on sanctioned entity without sanctions data | ✅ Pass | **VARIANT** |
|
|
87
|
+
| KB refresh | Fraud tickets lose customer response | ✅ Pass | **VARIANT** |
|
|
88
|
+
| Schema migration | Field rename stops international detection | ✅ Pass | **VARIANT** |
|
|
89
|
+
| Prompt reordering | Same words, sections reordered | ✅ Pass | **VARIANT** |
|
|
90
|
+
| Tool description edit | Better docs change model decisions | ✅ Pass | **VARIANT** |
|
|
91
|
+
| Upstream degradation | Retry paths appear in fingerprint | ✅ Pass | **VARIANT** |
|
|
92
|
+
|
|
93
|
+
## Content-Blind Architecture
|
|
94
|
+
|
|
95
|
+
Spooled never captures prompts, customer data, or AI responses. Only structural metadata: tool names, call sequence, token counts, timing. This is enforced in code — content is stripped before the trace reaches disk.
|
|
96
|
+
|
|
97
|
+
Verified: we injected SSNs, credit cards, API keys, and email addresses into tool outputs. Scanned the trace file. **Zero PII found.** Structural data (tool names, model, usage) fully preserved.
|
|
98
|
+
|
|
99
|
+
This opens regulated markets (healthcare, finance, government) where competitors who capture content cannot operate.
|
|
100
|
+
|
|
101
|
+
## Multi-Agent Support
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
# Supervisor starts
|
|
105
|
+
spooled.init(agent_id="supervisor")
|
|
106
|
+
# ... supervisor work ...
|
|
107
|
+
|
|
108
|
+
# Child inherits parent linkage automatically
|
|
109
|
+
spooled.init(agent_id="worker") # auto-detects parent, inherits session
|
|
110
|
+
# ... worker work ...
|
|
111
|
+
spooled.shutdown() # stops worker only, supervisor stays active
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Each agent gets its own trace, its own fingerprint, its own baseline. Concurrent execution is safe — tested with 3 agents in parallel threads, zero cross-contamination.
|
|
115
|
+
|
|
116
|
+
## Policy Rules
|
|
117
|
+
|
|
118
|
+
```yaml
|
|
119
|
+
# spooled-policy.yml
|
|
120
|
+
name: "Production gate"
|
|
121
|
+
enabled: true
|
|
122
|
+
block_merges: true
|
|
123
|
+
rules:
|
|
124
|
+
- name: "Block behavioral variants"
|
|
125
|
+
fail_if:
|
|
126
|
+
on_variant: true # Block structural changes
|
|
127
|
+
on_new_behavior: true # Block new intent patterns
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
Separate `on_variant` (structural change to a known intent) from `on_new_behavior` (entirely new intent bucket). Gate each independently.
|
|
131
|
+
|
|
132
|
+
## Actionable Findings
|
|
133
|
+
|
|
134
|
+
From the 50-scenario assessment:
|
|
135
|
+
|
|
136
|
+
- **Set `seed=42`** on OpenAI tool-calling agents — achieves 100% fingerprint stability on gpt-4o-mini, 75% on gpt-4o
|
|
137
|
+
- **Guardrails need "non-negotiable" language** — medium-strength prompts erode 100% under conversational pressure; strong prompts with explicit override-rejection hold 100%
|
|
138
|
+
- **Run 20+ inputs** before shipping prompt changes — 10-input samples produce noise; 20+ reveals the real distribution shift
|
|
139
|
+
|
|
140
|
+
## Supported Libraries
|
|
141
|
+
|
|
142
|
+
**LLM Providers (explicit wrappers):**
|
|
143
|
+
- OpenAI (sync/async, streaming)
|
|
144
|
+
- Anthropic (sync/async, streaming)
|
|
145
|
+
|
|
146
|
+
**HTTP & Cloud (auto-instrumented via hooks):**
|
|
147
|
+
- AWS Bedrock
|
|
148
|
+
- requests, httpx, aiohttp
|
|
149
|
+
|
|
150
|
+
**Frameworks (callback handlers):**
|
|
151
|
+
- LangChain, LlamaIndex, AutoGen, CrewAI, LangGraph
|
|
152
|
+
|
|
153
|
+
## Commands
|
|
154
|
+
|
|
155
|
+
```bash
|
|
156
|
+
spooled verify trace <run_id> # Hash chain integrity check
|
|
157
|
+
spooled diff traces <a> <b> # Structural comparison
|
|
158
|
+
spooled ci compare <trace> --baseline <dir> # CI comparison
|
|
159
|
+
spooled ci update-baseline --from dir # Generate baselines
|
|
160
|
+
spooled analyze --agent-id my_agent # Analyze trace patterns
|
|
161
|
+
spooled policy init # Create a policy file
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
## How It Works
|
|
165
|
+
|
|
166
|
+
1. `wrap_openai()` intercepts `chat.completions.create()` calls
|
|
167
|
+
2. Each tool call is recorded with a SHA-256 hash of the previous interaction (hash chain)
|
|
168
|
+
3. Content is stripped to structural metadata before saving (privacy by architecture)
|
|
169
|
+
4. A `structural_hash` of the saved data enables post-save tampering detection
|
|
170
|
+
5. The trace is saved as append-only JSONL in `.spooled/traces/`
|
|
171
|
+
6. `spooled ci compare` extracts the fingerprint (tool graph) and diffs against the baseline
|
|
172
|
+
7. Policy rules determine PASS/FAIL; the PR comment shows what changed
|
|
173
|
+
|
|
174
|
+
## Documentation
|
|
175
|
+
|
|
176
|
+
- [CI Integration Guide](docs/CI_INTEGRATION.md)
|
|
177
|
+
- [Architecture & Trust Model](docs/ARCHITECTURE.md)
|
|
178
|
+
- [Data Schema](docs/DATA_SCHEMA.md)
|
|
179
|
+
- [Deployment Guide](docs/DEPLOYMENT.md)
|
|
180
|
+
|
|
181
|
+
## License
|
|
182
|
+
|
|
183
|
+
Proprietary — see the `LICENSE` file for terms.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""CLI command modules."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from cli.commands import (
|
|
6
|
+
analyze,
|
|
7
|
+
attest,
|
|
8
|
+
baseline,
|
|
9
|
+
ci,
|
|
10
|
+
demo,
|
|
11
|
+
diff,
|
|
12
|
+
doctor,
|
|
13
|
+
fingerprint,
|
|
14
|
+
fleet,
|
|
15
|
+
ingest,
|
|
16
|
+
init,
|
|
17
|
+
keys,
|
|
18
|
+
list,
|
|
19
|
+
new_agent,
|
|
20
|
+
policy,
|
|
21
|
+
pull,
|
|
22
|
+
replay,
|
|
23
|
+
session,
|
|
24
|
+
traces,
|
|
25
|
+
verify,
|
|
26
|
+
view,
|
|
27
|
+
watch,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
__all__ = [
|
|
31
|
+
"pull",
|
|
32
|
+
"replay",
|
|
33
|
+
"diff",
|
|
34
|
+
"view",
|
|
35
|
+
"list",
|
|
36
|
+
"init",
|
|
37
|
+
"verify",
|
|
38
|
+
"baseline",
|
|
39
|
+
"fingerprint",
|
|
40
|
+
"ci",
|
|
41
|
+
"fleet",
|
|
42
|
+
"policy",
|
|
43
|
+
"keys",
|
|
44
|
+
"session",
|
|
45
|
+
"doctor",
|
|
46
|
+
"attest",
|
|
47
|
+
"traces",
|
|
48
|
+
"new_agent",
|
|
49
|
+
"demo",
|
|
50
|
+
"watch",
|
|
51
|
+
"ingest",
|
|
52
|
+
"analyze",
|
|
53
|
+
]
|