caliber-trust 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- caliber_trust-0.1.0/LICENSE +21 -0
- caliber_trust-0.1.0/PKG-INFO +211 -0
- caliber_trust-0.1.0/README.md +185 -0
- caliber_trust-0.1.0/caliber/__init__.py +12 -0
- caliber_trust-0.1.0/caliber/card.py +342 -0
- caliber_trust-0.1.0/caliber/cli.py +230 -0
- caliber_trust-0.1.0/caliber/commitment.py +93 -0
- caliber_trust-0.1.0/caliber/importer.py +101 -0
- caliber_trust-0.1.0/caliber/mcp_server.py +193 -0
- caliber_trust-0.1.0/caliber/py.typed +0 -0
- caliber_trust-0.1.0/caliber/storage.py +70 -0
- caliber_trust-0.1.0/caliber/tracker.py +253 -0
- caliber_trust-0.1.0/caliber/trajectory.py +198 -0
- caliber_trust-0.1.0/caliber_trust.egg-info/PKG-INFO +211 -0
- caliber_trust-0.1.0/caliber_trust.egg-info/SOURCES.txt +26 -0
- caliber_trust-0.1.0/caliber_trust.egg-info/dependency_links.txt +1 -0
- caliber_trust-0.1.0/caliber_trust.egg-info/entry_points.txt +2 -0
- caliber_trust-0.1.0/caliber_trust.egg-info/requires.txt +4 -0
- caliber_trust-0.1.0/caliber_trust.egg-info/top_level.txt +1 -0
- caliber_trust-0.1.0/pyproject.toml +43 -0
- caliber_trust-0.1.0/setup.cfg +4 -0
- caliber_trust-0.1.0/tests/test_card.py +267 -0
- caliber_trust-0.1.0/tests/test_commitment.py +84 -0
- caliber_trust-0.1.0/tests/test_importer.py +113 -0
- caliber_trust-0.1.0/tests/test_mcp_server.py +113 -0
- caliber_trust-0.1.0/tests/test_storage.py +88 -0
- caliber_trust-0.1.0/tests/test_tracker.py +161 -0
- caliber_trust-0.1.0/tests/test_trajectory.py +94 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Satish Patil
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: caliber-trust
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Trust protocol for AI agents. Prove capability through calibration, not claims.
|
|
5
|
+
Author-email: Satish Patil <satishpatil@proton.me>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/SatishoBananamoto/caliber
|
|
8
|
+
Project-URL: Repository, https://github.com/SatishoBananamoto/caliber
|
|
9
|
+
Keywords: ai,agents,trust,calibration,a2a,mcp
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.9
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: click>=8.0
|
|
23
|
+
Provides-Extra: mcp
|
|
24
|
+
Requires-Dist: mcp>=1.0; extra == "mcp"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# caliber
|
|
28
|
+
|
|
29
|
+
Trust protocol for AI agents. Prove capability through calibration, not claims.
|
|
30
|
+
|
|
31
|
+
## The Problem
|
|
32
|
+
|
|
33
|
+
Every agent registry — Google's A2A, Microsoft's Entra, Salesforce's MuleSoft — faces the same problem: agents describe what they *can* do, not how *well* they do it. Agent Cards are LinkedIn profiles with no work history.
|
|
34
|
+
|
|
35
|
+
When Agent A asks Agent B for help, there's no way to know if B is actually good at the task. B says it can review code. Can it? With what accuracy? Is it overconfident? Does it know its own blind spots?
|
|
36
|
+
|
|
37
|
+
## The Solution
|
|
38
|
+
|
|
39
|
+
caliber tracks predictions with confidence levels and generates **Trust Cards** — machine-readable credentials that prove an agent's calibration through accumulated evidence.
|
|
40
|
+
|
|
41
|
+
A Trust Card answers:
|
|
42
|
+
- **Overall:** How accurate is this agent?
|
|
43
|
+
- **By confidence:** When it says "80% sure," is it right 80% of the time?
|
|
44
|
+
- **By domain:** Where is it strong? Where is it weak?
|
|
45
|
+
- **Danger zones:** Confidence ranges where the agent is systematically overconfident.
|
|
46
|
+
|
|
47
|
+
## Quick Start
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
pip install caliber-trust
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
### Python API
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
from caliber import TrustTracker
|
|
57
|
+
|
|
58
|
+
tracker = TrustTracker("my-code-reviewer", store_path="./trust-data")
|
|
59
|
+
|
|
60
|
+
# Record a prediction before checking
|
|
61
|
+
pid = tracker.predict(
|
|
62
|
+
claim="this function has a SQL injection vulnerability",
|
|
63
|
+
confidence=0.85,
|
|
64
|
+
domain="security"
|
|
65
|
+
)
|
|
66
|
+
|
|
67
|
+
# After verifying
|
|
68
|
+
tracker.verify(pid, correct=True, notes="Found in line 42")
|
|
69
|
+
|
|
70
|
+
# Generate a Trust Card
|
|
71
|
+
card = tracker.generate_card()
|
|
72
|
+
print(card.summary())
|
|
73
|
+
print(card.to_json()) # Machine-readable
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
### CLI
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# Make a prediction
|
|
80
|
+
caliber -a my-agent predict "this endpoint returns JSON" -c 90 -d api
|
|
81
|
+
|
|
82
|
+
# Verify it
|
|
83
|
+
caliber -a my-agent verify <prediction-id> --correct
|
|
84
|
+
|
|
85
|
+
# Generate Trust Card
|
|
86
|
+
caliber -a my-agent card
|
|
87
|
+
caliber -a my-agent card --json
|
|
88
|
+
|
|
89
|
+
# Quick progress check
|
|
90
|
+
caliber -a my-agent summary
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Try It Now
|
|
94
|
+
|
|
95
|
+
Make 3 predictions about your codebase before checking:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
caliber predict "src/ has more than 10 Python files" -c 70 -d codebase
|
|
99
|
+
caliber predict "package.json has a test script" -c 85 -d codebase
|
|
100
|
+
caliber predict "the main module uses asyncio" -c 60 -d architecture
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Then verify each one:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
caliber verify <id1> --correct # or --incorrect
|
|
107
|
+
caliber verify <id2> --correct
|
|
108
|
+
caliber verify <id3> --incorrect
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
After 3 predictions: `caliber summary`. After 20: `caliber card`.
|
|
112
|
+
|
|
113
|
+
## Trust Card Format
|
|
114
|
+
|
|
115
|
+
```json
|
|
116
|
+
{
|
|
117
|
+
"trust_version": "0.1",
|
|
118
|
+
"agent_name": "my-code-reviewer",
|
|
119
|
+
"generated": "2026-03-26T00:00:00Z",
|
|
120
|
+
"calibration": {
|
|
121
|
+
"total_predictions": 77,
|
|
122
|
+
"total_verified": 77,
|
|
123
|
+
"overall_accuracy": 0.766,
|
|
124
|
+
"mean_confidence": 0.708,
|
|
125
|
+
"mean_calibration_gap": -0.058,
|
|
126
|
+
"confidence_buckets": {
|
|
127
|
+
"50-59": {"predictions": 4, "correct": 2, "accuracy": 0.5, "calibration_gap": 0.045, "insufficient_data": true},
|
|
128
|
+
"60-69": {"predictions": 25, "correct": 16, "accuracy": 0.64, "calibration_gap": 0.005, "significant": false},
|
|
129
|
+
"70-79": {"predictions": 29, "correct": 24, "accuracy": 0.828, "calibration_gap": -0.083, "significant": false},
|
|
130
|
+
"80-89": {"predictions": 18, "correct": 16, "accuracy": 0.889, "calibration_gap": -0.044, "significant": false},
|
|
131
|
+
"90-99": {"predictions": 1, "correct": 1, "accuracy": 1.0, "calibration_gap": -0.055, "insufficient_data": true}
|
|
132
|
+
},
|
|
133
|
+
"domains": {
|
|
134
|
+
"architecture": {"predictions": 21, "accuracy": 0.81},
|
|
135
|
+
"behavior": {"predictions": 25, "accuracy": 0.64},
|
|
136
|
+
"codebase": {"predictions": 20, "accuracy": 0.75}
|
|
137
|
+
},
|
|
138
|
+
"strength_zones": ["50-59"]
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
The Trust Card above is real — generated from 77 calibration predictions made by Claude Opus during the [MY UNIVERSE](https://github.com/SatishoBananamoto/my-universe) project.
|
|
144
|
+
|
|
145
|
+
**What the numbers reveal:** This agent is well-calibrated overall. Buckets with enough data include a `significant` field (binomial test, p<0.05); buckets with too few predictions are flagged `insufficient_data` instead. No bucket shows statistically significant miscalibration — the agent's confidence matches its accuracy. Behavior predictions (64%) are its weakest domain.
|
|
146
|
+
|
|
147
|
+
## Key Concepts
|
|
148
|
+
|
|
149
|
+
### Confidence Buckets
|
|
150
|
+
|
|
151
|
+
The core insight: overall accuracy is meaningless without calibration. An agent that's "75% accurate" could be perfectly calibrated (right 75% of the time at 75% confidence) or dangerously miscalibrated (right 50% of the time while claiming 90% confidence).
|
|
152
|
+
|
|
153
|
+
Confidence buckets break accuracy down by confidence level, revealing where the agent knows its limits and where it doesn't.
|
|
154
|
+
|
|
155
|
+
### Calibration Gap
|
|
156
|
+
|
|
157
|
+
The difference between stated confidence (the expected accuracy) and actual accuracy, computed as confidence − accuracy, for each confidence bucket:
|
|
158
|
+
- **Positive gap** = overconfident (accuracy < confidence)
|
|
159
|
+
- **Negative gap** = underconfident (accuracy > confidence)
|
|
160
|
+
- **Near zero** = well-calibrated
|
|
161
|
+
|
|
162
|
+
### Danger Zones
|
|
163
|
+
|
|
164
|
+
Confidence ranges where the calibration gap exceeds 10 percentage points with at least 3 data points. These are the ranges where the agent's self-assessment is unreliable.
|
|
165
|
+
|
|
166
|
+
## Origin
|
|
167
|
+
|
|
168
|
+
caliber emerged from [MY UNIVERSE](https://github.com/SatishoBananamoto/my-universe), a cognitive workspace where Claude Opus tracks its own predictions and calibration. 87 predictions across 3 sessions validated the approach — and revealed that early "danger zone" findings were small-sample artifacts, corrected by caliber's own statistical significance tests.
|
|
169
|
+
|
|
170
|
+
The thesis: if calibration tracking works for self-improvement, it works for trust between agents. caliber includes the statistical honesty features because we learned the hard way that small samples lie.
|
|
171
|
+
|
|
172
|
+
## Roadmap
|
|
173
|
+
|
|
174
|
+
- **v0.1** (current): Core tracker, CLI, MCP server, Trust Card generation with statistical significance tests
|
|
175
|
+
- **v0.2**: Trust Card verification (detect fabricated/gamed cards), trajectory support
|
|
176
|
+
- **v0.3**: A2A Agent Card extension, commitment scheme (prediction anchoring)
|
|
177
|
+
- **v1.0**: Signed cards, trust registry, cross-agent trust queries
|
|
178
|
+
|
|
179
|
+
## MCP Server
|
|
180
|
+
|
|
181
|
+
For AI agents that want to track calibration natively:
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
python -m caliber.mcp_server
|
|
185
|
+
```
|
|
186
|
+
|
|
187
|
+
Or add to `.mcp.json`:
|
|
188
|
+
|
|
189
|
+
```json
|
|
190
|
+
{
|
|
191
|
+
"mcpServers": {
|
|
192
|
+
"caliber": {
|
|
193
|
+
"command": "python3",
|
|
194
|
+
"args": ["-m", "caliber.mcp_server"],
|
|
195
|
+
"cwd": "/path/to/caliber"
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
Tools: `caliber_predict`, `caliber_verify`, `caliber_card`, `caliber_summary`, `caliber_list`.
|
|
202
|
+
|
|
203
|
+
The prediction log doubles as a decision audit trail — observability as a side effect of calibration.
|
|
204
|
+
|
|
205
|
+
## Statistical Honesty
|
|
206
|
+
|
|
207
|
+
Trust Cards include per-bucket significance tests (binomial, p<0.05) and flag insufficient data (<5 predictions per bucket). This prevents treating small-sample noise as calibration patterns — a real problem we discovered building this.
|
|
208
|
+
|
|
209
|
+
## License
|
|
210
|
+
|
|
211
|
+
MIT
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
# caliber
|
|
2
|
+
|
|
3
|
+
Trust protocol for AI agents. Prove capability through calibration, not claims.
|
|
4
|
+
|
|
5
|
+
## The Problem
|
|
6
|
+
|
|
7
|
+
Every agent registry — Google's A2A, Microsoft's Entra, Salesforce's MuleSoft — faces the same problem: agents describe what they *can* do, not how *well* they do it. Agent Cards are LinkedIn profiles with no work history.
|
|
8
|
+
|
|
9
|
+
When Agent A asks Agent B for help, there's no way to know if B is actually good at the task. B says it can review code. Can it? With what accuracy? Is it overconfident? Does it know its own blind spots?
|
|
10
|
+
|
|
11
|
+
## The Solution
|
|
12
|
+
|
|
13
|
+
caliber tracks predictions with confidence levels and generates **Trust Cards** — machine-readable credentials that prove an agent's calibration through accumulated evidence.
|
|
14
|
+
|
|
15
|
+
A Trust Card answers:
|
|
16
|
+
- **Overall:** How accurate is this agent?
|
|
17
|
+
- **By confidence:** When it says "80% sure," is it right 80% of the time?
|
|
18
|
+
- **By domain:** Where is it strong? Where is it weak?
|
|
19
|
+
- **Danger zones:** Confidence ranges where the agent is systematically overconfident.
|
|
20
|
+
|
|
21
|
+
## Quick Start
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pip install caliber-trust
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
### Python API
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
from caliber import TrustTracker
|
|
31
|
+
|
|
32
|
+
tracker = TrustTracker("my-code-reviewer", store_path="./trust-data")
|
|
33
|
+
|
|
34
|
+
# Record a prediction before checking
|
|
35
|
+
pid = tracker.predict(
|
|
36
|
+
claim="this function has a SQL injection vulnerability",
|
|
37
|
+
confidence=0.85,
|
|
38
|
+
domain="security"
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# After verifying
|
|
42
|
+
tracker.verify(pid, correct=True, notes="Found in line 42")
|
|
43
|
+
|
|
44
|
+
# Generate a Trust Card
|
|
45
|
+
card = tracker.generate_card()
|
|
46
|
+
print(card.summary())
|
|
47
|
+
print(card.to_json()) # Machine-readable
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
### CLI
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Make a prediction
|
|
54
|
+
caliber -a my-agent predict "this endpoint returns JSON" -c 90 -d api
|
|
55
|
+
|
|
56
|
+
# Verify it
|
|
57
|
+
caliber -a my-agent verify <prediction-id> --correct
|
|
58
|
+
|
|
59
|
+
# Generate Trust Card
|
|
60
|
+
caliber -a my-agent card
|
|
61
|
+
caliber -a my-agent card --json
|
|
62
|
+
|
|
63
|
+
# Quick progress check
|
|
64
|
+
caliber -a my-agent summary
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Try It Now
|
|
68
|
+
|
|
69
|
+
Make 3 predictions about your codebase before checking:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
caliber predict "src/ has more than 10 Python files" -c 70 -d codebase
|
|
73
|
+
caliber predict "package.json has a test script" -c 85 -d codebase
|
|
74
|
+
caliber predict "the main module uses asyncio" -c 60 -d architecture
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Then verify each one:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
caliber verify <id1> --correct # or --incorrect
|
|
81
|
+
caliber verify <id2> --correct
|
|
82
|
+
caliber verify <id3> --incorrect
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
After 3 predictions: `caliber summary`. After 20: `caliber card`.
|
|
86
|
+
|
|
87
|
+
## Trust Card Format
|
|
88
|
+
|
|
89
|
+
```json
|
|
90
|
+
{
|
|
91
|
+
"trust_version": "0.1",
|
|
92
|
+
"agent_name": "my-code-reviewer",
|
|
93
|
+
"generated": "2026-03-26T00:00:00Z",
|
|
94
|
+
"calibration": {
|
|
95
|
+
"total_predictions": 77,
|
|
96
|
+
"total_verified": 77,
|
|
97
|
+
"overall_accuracy": 0.766,
|
|
98
|
+
"mean_confidence": 0.708,
|
|
99
|
+
"mean_calibration_gap": -0.058,
|
|
100
|
+
"confidence_buckets": {
|
|
101
|
+
"50-59": {"predictions": 4, "correct": 2, "accuracy": 0.5, "calibration_gap": 0.045, "insufficient_data": true},
|
|
102
|
+
"60-69": {"predictions": 25, "correct": 16, "accuracy": 0.64, "calibration_gap": 0.005, "significant": false},
|
|
103
|
+
"70-79": {"predictions": 29, "correct": 24, "accuracy": 0.828, "calibration_gap": -0.083, "significant": false},
|
|
104
|
+
"80-89": {"predictions": 18, "correct": 16, "accuracy": 0.889, "calibration_gap": -0.044, "significant": false},
|
|
105
|
+
"90-99": {"predictions": 1, "correct": 1, "accuracy": 1.0, "calibration_gap": -0.055, "insufficient_data": true}
|
|
106
|
+
},
|
|
107
|
+
"domains": {
|
|
108
|
+
"architecture": {"predictions": 21, "accuracy": 0.81},
|
|
109
|
+
"behavior": {"predictions": 25, "accuracy": 0.64},
|
|
110
|
+
"codebase": {"predictions": 20, "accuracy": 0.75}
|
|
111
|
+
},
|
|
112
|
+
"strength_zones": ["50-59"]
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
The Trust Card above is real — generated from 77 calibration predictions made by Claude Opus during the [MY UNIVERSE](https://github.com/SatishoBananamoto/my-universe) project.
|
|
118
|
+
|
|
119
|
+
**What the numbers reveal:** This agent is well-calibrated overall. Buckets with enough data include a `significant` field (binomial test, p<0.05); buckets with too few predictions are flagged `insufficient_data` instead. No bucket shows statistically significant miscalibration — the agent's confidence matches its accuracy. Behavior predictions (64%) are its weakest domain.
|
|
120
|
+
|
|
121
|
+
## Key Concepts
|
|
122
|
+
|
|
123
|
+
### Confidence Buckets
|
|
124
|
+
|
|
125
|
+
The core insight: overall accuracy is meaningless without calibration. An agent that's "75% accurate" could be perfectly calibrated (right 75% of the time at 75% confidence) or dangerously miscalibrated (right 50% of the time while claiming 90% confidence).
|
|
126
|
+
|
|
127
|
+
Confidence buckets break accuracy down by confidence level, revealing where the agent knows its limits and where it doesn't.
|
|
128
|
+
|
|
129
|
+
### Calibration Gap
|
|
130
|
+
|
|
131
|
+
The difference between stated confidence (the expected accuracy) and actual accuracy, computed as confidence − accuracy, for each confidence bucket:
|
|
132
|
+
- **Positive gap** = overconfident (accuracy < confidence)
|
|
133
|
+
- **Negative gap** = underconfident (accuracy > confidence)
|
|
134
|
+
- **Near zero** = well-calibrated
|
|
135
|
+
|
|
136
|
+
### Danger Zones
|
|
137
|
+
|
|
138
|
+
Confidence ranges where the calibration gap exceeds 10 percentage points with at least 3 data points. These are the ranges where the agent's self-assessment is unreliable.
|
|
139
|
+
|
|
140
|
+
## Origin
|
|
141
|
+
|
|
142
|
+
caliber emerged from [MY UNIVERSE](https://github.com/SatishoBananamoto/my-universe), a cognitive workspace where Claude Opus tracks its own predictions and calibration. 87 predictions across 3 sessions validated the approach — and revealed that early "danger zone" findings were small-sample artifacts, corrected by caliber's own statistical significance tests.
|
|
143
|
+
|
|
144
|
+
The thesis: if calibration tracking works for self-improvement, it works for trust between agents. caliber includes the statistical honesty features because we learned the hard way that small samples lie.
|
|
145
|
+
|
|
146
|
+
## Roadmap
|
|
147
|
+
|
|
148
|
+
- **v0.1** (current): Core tracker, CLI, MCP server, Trust Card generation with statistical significance tests
|
|
149
|
+
- **v0.2**: Trust Card verification (detect fabricated/gamed cards), trajectory support
|
|
150
|
+
- **v0.3**: A2A Agent Card extension, commitment scheme (prediction anchoring)
|
|
151
|
+
- **v1.0**: Signed cards, trust registry, cross-agent trust queries
|
|
152
|
+
|
|
153
|
+
## MCP Server
|
|
154
|
+
|
|
155
|
+
For AI agents that want to track calibration natively:
|
|
156
|
+
|
|
157
|
+
```bash
|
|
158
|
+
python -m caliber.mcp_server
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
Or add to `.mcp.json`:
|
|
162
|
+
|
|
163
|
+
```json
|
|
164
|
+
{
|
|
165
|
+
"mcpServers": {
|
|
166
|
+
"caliber": {
|
|
167
|
+
"command": "python3",
|
|
168
|
+
"args": ["-m", "caliber.mcp_server"],
|
|
169
|
+
"cwd": "/path/to/caliber"
|
|
170
|
+
}
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Tools: `caliber_predict`, `caliber_verify`, `caliber_card`, `caliber_summary`, `caliber_list`.
|
|
176
|
+
|
|
177
|
+
The prediction log doubles as a decision audit trail — observability as a side effect of calibration.
|
|
178
|
+
|
|
179
|
+
## Statistical Honesty
|
|
180
|
+
|
|
181
|
+
Trust Cards include per-bucket significance tests (binomial, p<0.05) and flag insufficient data (<5 predictions per bucket). This prevents treating small-sample noise as calibration patterns — a real problem we discovered building this.
|
|
182
|
+
|
|
183
|
+
## License
|
|
184
|
+
|
|
185
|
+
MIT
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""caliber — Trust protocol for AI agents.
|
|
2
|
+
|
|
3
|
+
Prove capability through calibration, not claims.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
__version__ = "0.1.0"
|
|
7
|
+
|
|
8
|
+
from caliber.tracker import TrustTracker, Prediction
|
|
9
|
+
from caliber.card import TrustCard
|
|
10
|
+
from caliber.trajectory import Trajectory
|
|
11
|
+
|
|
12
|
+
__all__ = ["TrustTracker", "Prediction", "TrustCard", "Trajectory"]
|