dprovenancekit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dprovenancekit-0.1.0/LICENSE +29 -0
- dprovenancekit-0.1.0/PKG-INFO +345 -0
- dprovenancekit-0.1.0/README.md +310 -0
- dprovenancekit-0.1.0/pyproject.toml +55 -0
- dprovenancekit-0.1.0/setup.cfg +4 -0
- dprovenancekit-0.1.0/src/dprovenancekit/__init__.py +212 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_config.py +182 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_contract.py +73 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_engine.py +111 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_evidence.py +103 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_findings.py +69 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_interpreter.py +220 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_matcher.py +57 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_meta.py +81 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_models.py +270 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_narrative.py +73 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_render.py +99 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_semantics.py +44 -0
- dprovenancekit-0.1.0/src/dprovenancekit/alignment_snapshot.py +61 -0
- dprovenancekit-0.1.0/src/dprovenancekit/anomaly.py +72 -0
- dprovenancekit-0.1.0/src/dprovenancekit/benchmark.py +764 -0
- dprovenancekit-0.1.0/src/dprovenancekit/circuit_breaker.py +69 -0
- dprovenancekit-0.1.0/src/dprovenancekit/cli.py +196 -0
- dprovenancekit-0.1.0/src/dprovenancekit/cloud_store.py +391 -0
- dprovenancekit-0.1.0/src/dprovenancekit/config.py +27 -0
- dprovenancekit-0.1.0/src/dprovenancekit/context.py +39 -0
- dprovenancekit-0.1.0/src/dprovenancekit/corpus.py +402 -0
- dprovenancekit-0.1.0/src/dprovenancekit/diff.py +117 -0
- dprovenancekit-0.1.0/src/dprovenancekit/drop_stats.py +94 -0
- dprovenancekit-0.1.0/src/dprovenancekit/edge.py +23 -0
- dprovenancekit-0.1.0/src/dprovenancekit/event.py +148 -0
- dprovenancekit-0.1.0/src/dprovenancekit/graph.py +41 -0
- dprovenancekit-0.1.0/src/dprovenancekit/instrument.py +389 -0
- dprovenancekit-0.1.0/src/dprovenancekit/integrations/__init__.py +16 -0
- dprovenancekit-0.1.0/src/dprovenancekit/integrations/langchain.py +650 -0
- dprovenancekit-0.1.0/src/dprovenancekit/integrations/openai_agents.py +455 -0
- dprovenancekit-0.1.0/src/dprovenancekit/kit.py +126 -0
- dprovenancekit-0.1.0/src/dprovenancekit/live_engine.py +86 -0
- dprovenancekit-0.1.0/src/dprovenancekit/perturbation.py +58 -0
- dprovenancekit-0.1.0/src/dprovenancekit/priority.py +34 -0
- dprovenancekit-0.1.0/src/dprovenancekit/py.typed +0 -0
- dprovenancekit-0.1.0/src/dprovenancekit/query.py +371 -0
- dprovenancekit-0.1.0/src/dprovenancekit/raw_store.py +100 -0
- dprovenancekit-0.1.0/src/dprovenancekit/render_hints.py +21 -0
- dprovenancekit-0.1.0/src/dprovenancekit/replay.py +244 -0
- dprovenancekit-0.1.0/src/dprovenancekit/snapshot_diff.py +279 -0
- dprovenancekit-0.1.0/src/dprovenancekit/sqlite_store.py +573 -0
- dprovenancekit-0.1.0/src/dprovenancekit/store.py +262 -0
- dprovenancekit-0.1.0/src/dprovenancekit/testing.py +277 -0
- dprovenancekit-0.1.0/src/dprovenancekit/verification.py +231 -0
- dprovenancekit-0.1.0/src/dprovenancekit/viewmodel.py +112 -0
- dprovenancekit-0.1.0/src/dprovenancekit/write_buffer.py +236 -0
- dprovenancekit-0.1.0/src/dprovenancekit.egg-info/PKG-INFO +345 -0
- dprovenancekit-0.1.0/src/dprovenancekit.egg-info/SOURCES.txt +83 -0
- dprovenancekit-0.1.0/src/dprovenancekit.egg-info/dependency_links.txt +1 -0
- dprovenancekit-0.1.0/src/dprovenancekit.egg-info/entry_points.txt +2 -0
- dprovenancekit-0.1.0/src/dprovenancekit.egg-info/requires.txt +9 -0
- dprovenancekit-0.1.0/src/dprovenancekit.egg-info/top_level.txt +1 -0
- dprovenancekit-0.1.0/tests/test_alignment_engine.py +176 -0
- dprovenancekit-0.1.0/tests/test_benchmark_runner.py +139 -0
- dprovenancekit-0.1.0/tests/test_cloud_chaos.py +157 -0
- dprovenancekit-0.1.0/tests/test_cloud_store.py +78 -0
- dprovenancekit-0.1.0/tests/test_conformance.py +187 -0
- dprovenancekit-0.1.0/tests/test_corpus.py +85 -0
- dprovenancekit-0.1.0/tests/test_diff_engine.py +125 -0
- dprovenancekit-0.1.0/tests/test_example_regression.py +19 -0
- dprovenancekit-0.1.0/tests/test_explainability_auditor.py +132 -0
- dprovenancekit-0.1.0/tests/test_identity_stability.py +70 -0
- dprovenancekit-0.1.0/tests/test_in_memory_store.py +98 -0
- dprovenancekit-0.1.0/tests/test_instrument.py +365 -0
- dprovenancekit-0.1.0/tests/test_integration_langchain.py +371 -0
- dprovenancekit-0.1.0/tests/test_integration_openai_agents.py +421 -0
- dprovenancekit-0.1.0/tests/test_query_parity.py +81 -0
- dprovenancekit-0.1.0/tests/test_raw_store_roundtrip.py +43 -0
- dprovenancekit-0.1.0/tests/test_regression_gate.py +283 -0
- dprovenancekit-0.1.0/tests/test_replay_engine.py +112 -0
- dprovenancekit-0.1.0/tests/test_snapshot_diff.py +222 -0
- dprovenancekit-0.1.0/tests/test_span_tree.py +61 -0
- dprovenancekit-0.1.0/tests/test_sqlite_encode_drop.py +69 -0
- dprovenancekit-0.1.0/tests/test_sqlite_get_run.py +46 -0
- dprovenancekit-0.1.0/tests/test_sqlite_insert_failure_drop.py +94 -0
- dprovenancekit-0.1.0/tests/test_sqlite_stress.py +113 -0
- dprovenancekit-0.1.0/tests/test_stability_evaluation.py +90 -0
- dprovenancekit-0.1.0/tests/test_trace_graph.py +165 -0
- dprovenancekit-0.1.0/tests/test_write_buffer.py +104 -0
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
Business Source License 1.1
|
|
2
|
+
|
|
3
|
+
Parameters
|
|
4
|
+
Licensor: Danny Kissel
|
|
5
|
+
Licensed Work: DProvenanceKit
|
|
6
|
+
Additional Use Grant: You may use the Licensed Work for production purposes, provided that you do not offer it as a commercial service.
|
|
7
|
+
Change Date: 2030-06-16
|
|
8
|
+
Change License: Version 2.0 or later of the Apache License
|
|
9
|
+
|
|
10
|
+
Terms
|
|
11
|
+
|
|
12
|
+
The Licensor hereby grants you the right to copy, modify, create derivative works, redistribute, and make non-production use of the Licensed Work. The Licensor may make an Additional Use Grant, above, permitting limited production use.
|
|
13
|
+
|
|
14
|
+
Effective on the Change Date, or the fourth anniversary of the first publicly available distribution of a specific version of the Licensed Work under this License, whichever comes first, the Licensor hereby grants you rights under the terms of the Change License, and the rights granted in the paragraph above terminate.
|
|
15
|
+
|
|
16
|
+
If your use of the Licensed Work does not comply with the requirements currently in effect as described in this License, you must purchase a commercial license from the Licensor, its affiliated entities, or authorized resellers, or you must refrain from using the Licensed Work.
|
|
17
|
+
|
|
18
|
+
All copies of the original and modified Licensed Work, and derivative works of the Licensed Work, are subject to this License. This License applies separately for each version of the Licensed Work and the Change Date may vary for each version of the Licensed Work released by Licensor.
|
|
19
|
+
|
|
20
|
+
You must conspicuously display this License on each original or modified copy of the Licensed Work. If you receive the Licensed Work in original or modified form from a third party, the terms and conditions set forth in this License apply to your use of that work.
|
|
21
|
+
|
|
22
|
+
Any use of the Licensed Work in violation of this License will automatically terminate your rights under this License for the current and all other versions of the Licensed Work.
|
|
23
|
+
|
|
24
|
+
This License does not grant you any right in any trademark or logo of Licensor or its affiliates (provided that you may use a trademark or logo of Licensor as expressly required by this License).
|
|
25
|
+
|
|
26
|
+
TO THE EXTENT PERMITTED BY APPLICABLE LAW, THE LICENSED WORK IS PROVIDED ON AN "AS IS" BASIS. LICENSOR HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS OR IMPLIED, INCLUDING (WITHOUT LIMITATION) WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, AND TITLE.
|
|
27
|
+
|
|
28
|
+
License text copyright (c) 2017 MariaDB Corporation Ab, All Rights Reserved.
|
|
29
|
+
"Business Source License" is a trademark of MariaDB Corporation Ab.
|
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dprovenancekit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reasoning observability and regression testing for AI systems — a Python port of DProvenanceKit.
|
|
5
|
+
Author: DProvenanceKit
|
|
6
|
+
License: BSL-1.1
|
|
7
|
+
Project-URL: Homepage, https://github.com/Therealdk8890/DProvenanceKitPython
|
|
8
|
+
Project-URL: Repository, https://github.com/Therealdk8890/DProvenanceKitPython
|
|
9
|
+
Project-URL: Issues, https://github.com/Therealdk8890/DProvenanceKitPython/issues
|
|
10
|
+
Project-URL: Swift original, https://github.com/Therealdk8890/DProvenanceKit
|
|
11
|
+
Keywords: observability,ai,agents,tracing,provenance,regression
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Intended Audience :: Developers
|
|
19
|
+
Classifier: Topic :: Software Development :: Debuggers
|
|
20
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
21
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
22
|
+
Classifier: License :: Other/Proprietary License
|
|
23
|
+
Classifier: Operating System :: OS Independent
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
License-File: LICENSE
|
|
28
|
+
Provides-Extra: dev
|
|
29
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
30
|
+
Provides-Extra: langchain
|
|
31
|
+
Requires-Dist: langchain-core>=0.2; extra == "langchain"
|
|
32
|
+
Provides-Extra: openai-agents
|
|
33
|
+
Requires-Dist: openai-agents>=0.1; extra == "openai-agents"
|
|
34
|
+
Dynamic: license-file
|
|
35
|
+
|
|
36
|
+
# DProvenanceKit (Python)
|
|
37
|
+
|
|
38
|
+
**Reasoning observability and regression testing for AI systems — a Python port of the Swift [DProvenanceKit](https://github.com/Therealdk8890/DProvenanceKit).**
|
|
39
|
+
|
|
40
|
+
When an agent's reasoning drifts between runs, DProvenanceKit turns each execution into a queryable, diffable trace so you can see *what changed and why* — not just *what happened*.
|
|
41
|
+
|
|
42
|
+
> Run → Record → Query → Diff → Detect Regressions
|
|
43
|
+
|
|
44
|
+
This is a faithful, dependency-free port of the Swift library to Python. It keeps the same architecture and guarantees — synchronous non-blocking recording, priority-aware backpressure, one query language over two backends held at parity, structural diffing, formally-modeled semantic alignment, and by-tier drop accounting so load-shedding is never silent.
|
|
45
|
+
|
|
46
|
+
The original Swift package is unchanged; this is a parallel implementation.
|
|
47
|
+
|
|
48
|
+
---
|
|
49
|
+
|
|
50
|
+
## Why a Python port
|
|
51
|
+
|
|
52
|
+
The Swift library targets Apple-platform and on-device AI. This port brings the same reasoning-layer observability to Python codebases — agent frameworks, LLM workflows, tool-using models — with **zero third-party dependencies** (it uses only the standard library: `sqlite3`, `contextvars`, `threading`, `json`, `hashlib`, `uuid`, `urllib`).
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Install
|
|
57
|
+
|
|
58
|
+
From PyPI (released builds):
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
pip install dprovenancekit
|
|
62
|
+
pip install "dprovenancekit[langchain]" # + LangChain adapter
|
|
63
|
+
pip install "dprovenancekit[openai-agents]" # + OpenAI Agents adapter
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
From a checkout (development):
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install -e ".[dev]"
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Requires Python 3.9+; the core has **zero third-party dependencies**. Releasing is documented
|
|
73
|
+
in [RELEASING.md](RELEASING.md).
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## 5-minute demo
|
|
78
|
+
|
|
79
|
+
### 1. Define your events
|
|
80
|
+
|
|
81
|
+
An event is any frozen dataclass that subclasses `TraceableEvent` and exposes a stable `type_identifier` and a `priority`:
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from dataclasses import dataclass
|
|
85
|
+
from dprovenancekit import TraceableEvent, TracePriority
|
|
86
|
+
|
|
87
|
+
@dataclass(frozen=True)
|
|
88
|
+
class MyAIDecision(TraceableEvent):
|
|
89
|
+
kind: str # "promptGenerated" | "documentEvaluated" | "conflictDetected" | "finalDecisionMade"
|
|
90
|
+
token_count: int = 0
|
|
91
|
+
document_id: str = ""
|
|
92
|
+
score: float = 0.0
|
|
93
|
+
reason: str = ""
|
|
94
|
+
approved: bool = False
|
|
95
|
+
|
|
96
|
+
@property
|
|
97
|
+
def type_identifier(self) -> str:
|
|
98
|
+
return self.kind
|
|
99
|
+
|
|
100
|
+
@property
|
|
101
|
+
def priority(self) -> TracePriority:
|
|
102
|
+
if self.kind == "finalDecisionMade":
|
|
103
|
+
return TracePriority.CRITICAL
|
|
104
|
+
if self.kind == "conflictDetected":
|
|
105
|
+
return TracePriority.DIAGNOSTIC
|
|
106
|
+
return TracePriority.TELEMETRY
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### 2. Record an execution run
|
|
110
|
+
|
|
111
|
+
`record(...)` is synchronous and never blocks — it touches only an in-memory buffer. Ambient run / engine / span context propagates through `contextvars`, so nested scopes attribute events correctly with no plumbing.
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from dprovenancekit import DProvenanceKit, InMemoryTraceStore
|
|
115
|
+
|
|
116
|
+
kit = DProvenanceKit(MyAIDecision)
|
|
117
|
+
store = InMemoryTraceStore()
|
|
118
|
+
|
|
119
|
+
with kit.run(context_id="demo_case", store=store):
|
|
120
|
+
kit.record(MyAIDecision(kind="documentEvaluated", document_id="DocA", score=0.95))
|
|
121
|
+
kit.record(MyAIDecision(kind="conflictDetected", reason="timeline_inconsistency"))
|
|
122
|
+
kit.record(MyAIDecision(kind="finalDecisionMade", approved=False))
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
### 3. Query reasoning patterns
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from dprovenancekit import TraceQueryDSL
|
|
129
|
+
|
|
130
|
+
suspicious = store.query_runs(
|
|
131
|
+
TraceQueryDSL()
|
|
132
|
+
.requiring_step("conflictDetected")
|
|
133
|
+
.missing_step("documentEvaluated")
|
|
134
|
+
)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
The query above finds runs where a conflict was reported but no document was ever evaluated. The same DSL compiles to SQL for `SQLiteTraceStore` and is evaluated in memory for `InMemoryTraceStore` — the two backends are held in lockstep by a parity test suite.
|
|
138
|
+
|
|
139
|
+
### 4. Diff runs
|
|
140
|
+
|
|
141
|
+
```python
|
|
142
|
+
from dprovenancekit import TraceDiffEngine
|
|
143
|
+
|
|
144
|
+
diff = TraceDiffEngine().diff(base=run_a, comparison=run_b)
|
|
145
|
+
print(diff.changes) # structural steps that appeared, disappeared, or moved
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
### 5. Semantic alignment
|
|
149
|
+
|
|
150
|
+
`TraceAlignmentEngine` decides whether two executions are behaviorally equivalent within a formally-defined semantic model, even when payloads vary slightly:
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from dprovenancekit import (
|
|
154
|
+
AlignmentConfiguration, AlignmentProfile, AnyEquivalenceEvaluator, TraceAlignmentEngine,
|
|
155
|
+
)
|
|
156
|
+
|
|
157
|
+
config = AlignmentConfiguration(
|
|
158
|
+
profile=AlignmentProfile.strict_audit_v1,
|
|
159
|
+
equivalence_evaluator=AnyEquivalenceEvaluator(
|
|
160
|
+
evaluator_identifier="MyAIDecision_Semantic",
|
|
161
|
+
evaluator=lambda a, b: 1.0 if a == b else 0.0,
|
|
162
|
+
),
|
|
163
|
+
)
|
|
164
|
+
result = TraceAlignmentEngine(config).align(base=run_a, comparison=run_b)
|
|
165
|
+
print(result.regression_risk.level)
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
### 6. Detect regressions automatically
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
from dprovenancekit import AnomalyDetector, AnomalyRule, TraceQueryDSL
|
|
172
|
+
|
|
173
|
+
class UnverifiedConflictRule(AnomalyRule):
|
|
174
|
+
@property
|
|
175
|
+
def name(self): return "unverified_conflict"
|
|
176
|
+
@property
|
|
177
|
+
def anomaly_query(self):
|
|
178
|
+
return TraceQueryDSL().requiring_step("conflictDetected").missing_step("documentEvaluated")
|
|
179
|
+
def describe(self, run): return "Conflict detected with no supporting evaluation"
|
|
180
|
+
|
|
181
|
+
anomalies = AnomalyDetector(store).detect_anomalies([UnverifiedConflictRule()])
|
|
182
|
+
```
|
|
183
|
+
|
|
184
|
+
---
|
|
185
|
+
|
|
186
|
+
## Benchmark corpus
|
|
187
|
+
|
|
188
|
+
The library ships the same validation corpus as the Swift version. The headless CLI runs it through the real benchmark runner:
|
|
189
|
+
|
|
190
|
+
```bash
|
|
191
|
+
dprovenancekit evaluate # precision/recall/F1 over the standard + adversarial corpora
|
|
192
|
+
dprovenancekit diagnose # causal ranking of failure modes
|
|
193
|
+
dprovenancekit stability # determinism boundary: isolated vs perturbed F1 variance
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
The standard corpus scores **Precision 1.000 / Recall 1.000 / F1 1.000** across 8 scenarios (reordering, semantic evolution, noise injection, branch collapse, …), matching the Swift implementation.
|
|
197
|
+
|
|
198
|
+
---
|
|
199
|
+
|
|
200
|
+
## What's included
|
|
201
|
+
|
|
202
|
+
| Component | Module |
|
|
203
|
+
| --- | --- |
|
|
204
|
+
| Event model, priority tiers, drop accounting | `event`, `priority`, `drop_stats` |
|
|
205
|
+
| Recording API + ambient context | `kit`, `context` |
|
|
206
|
+
| Stores (in-memory, WAL SQLite, raw read, cloud) | `store`, `sqlite_store`, `raw_store`, `cloud_store` |
|
|
207
|
+
| Priority-aware write buffer | `write_buffer` |
|
|
208
|
+
| Query DSL + two backends (AST eval + SQL compiler) | `query` |
|
|
209
|
+
| Live querying + anomaly detection | `live_engine`, `anomaly` |
|
|
210
|
+
| Structural diff + span-aware snapshot diff | `diff`, `snapshot_diff` |
|
|
211
|
+
| Deterministic replay | `replay` |
|
|
212
|
+
| Semantic alignment engine + evidence + verification | `alignment_*`, `verification` |
|
|
213
|
+
| Benchmark harness, failure diagnoser, corpus | `benchmark`, `corpus` |
|
|
214
|
+
| Pure view models for a trace viewer | `viewmodel` |
|
|
215
|
+
| Framework adapters (LangChain / LangGraph) | `integrations.langchain` |
|
|
216
|
+
| Framework adapters (OpenAI Agents SDK) | `integrations.openai_agents` |
|
|
217
|
+
| Regression-gate test helper | `testing` |
|
|
218
|
+
| Framework-agnostic instrumentation (decorators) | `instrument` |
|
|
219
|
+
|
|
220
|
+
The SwiftUI `DProvenanceUI` target is intentionally **not** ported (it is Apple-platform UI); its pure value-model layer (`SpanViewModel`, flattening) is ported in `viewmodel`.
|
|
221
|
+
|
|
222
|
+
---
|
|
223
|
+
|
|
224
|
+
## Cross-language conformance
|
|
225
|
+
|
|
226
|
+
Keeping the Swift and Python SDKs behaviorally equivalent is enforced, not hoped for. [`conformance/`](conformance/) holds **Trace Specification v1** — a language-neutral contract plus frozen golden vectors that pin the run fingerprint, the alignment profile hash, canonical payload encoding, query semantics, and alignment verdicts.
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
python -m pytest tests/test_conformance.py # the Python SDK's claim of conformance
|
|
230
|
+
python conformance/generate_vectors.py # intentionally re-freeze the contract
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
The committed `conformance/vectors/*.json` are the contract: any SDK — Swift today, Rust or TypeScript later — proves equivalence by reproducing the same files. See [`conformance/TRACE_SPEC_v1.md`](conformance/TRACE_SPEC_v1.md).
|
|
234
|
+
|
|
235
|
+
---
|
|
236
|
+
|
|
237
|
+
## Integrations
|
|
238
|
+
|
|
239
|
+
Framework adapters live in `dprovenancekit.integrations` and are the only parts of the package with third-party dependencies — the core stays pure standard library, and nothing imports an adapter unless you do.
|
|
240
|
+
|
|
241
|
+
### LangChain / LangGraph
|
|
242
|
+
|
|
243
|
+
```bash
|
|
244
|
+
pip install "dprovenancekit[langchain]"
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
```python
|
|
248
|
+
from dprovenancekit import SQLiteTraceStore
|
|
249
|
+
from dprovenancekit.integrations.langchain import DProvenanceTracer, LangChainTraceEvent
|
|
250
|
+
|
|
251
|
+
store = SQLiteTraceStore(LangChainTraceEvent, "traces.sqlite")
|
|
252
|
+
tracer = DProvenanceTracer(store)
|
|
253
|
+
|
|
254
|
+
with tracer.trace(context_id="customer-42") as cb:
|
|
255
|
+
answer = chain.invoke(question, config={"callbacks": [cb]})
|
|
256
|
+
|
|
257
|
+
# The run is now recorded — query it, diff it against a known-good run, or
|
|
258
|
+
# compare run fingerprints to detect when the agent took a different path.
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
[`DProvenanceCallbackHandler`](src/dprovenancekit/integrations/langchain.py) translates LangChain's callback stream into a trace: each `on_llm_start` / `on_tool_start` / `on_retriever_start` / `on_chain_start` (and its completion) becomes a typed event in execution order, LangChain's `run_id`/`parent_run_id` become the trace's **span tree**, the active model/tool/retriever becomes the **engine**, and (by default) lifecycle **provenance edges** are emitted (`DERIVED_FROM` start→completion, `INFORMED` parent→child). Because events flow through the same recording path as hand-written ones, the whole toolkit applies: a run's **fingerprint** is the structural identity of the agent's execution path, so two runs that diverge (a tool called in a different order, a retrieval step skipped) produce different fingerprints — a cheap regression signal. Options: `capture_payloads` (prompt/completion/IO previews), `link_lifecycle` (edges), `record_chains` (LCEL/LangGraph chain noise).
|
|
262
|
+
|
|
263
|
+
### OpenAI Agents SDK
|
|
264
|
+
|
|
265
|
+
```bash
|
|
266
|
+
pip install "dprovenancekit[openai-agents]"
|
|
267
|
+
```
|
|
268
|
+
|
|
269
|
+
```python
|
|
270
|
+
from dprovenancekit import SQLiteTraceStore
|
|
271
|
+
from dprovenancekit.integrations.openai_agents import register, OpenAIAgentsTraceEvent
|
|
272
|
+
|
|
273
|
+
store = SQLiteTraceStore(OpenAIAgentsTraceEvent, "traces.sqlite")
|
|
274
|
+
register(store) # registers a global tracing processor
|
|
275
|
+
|
|
276
|
+
# ... run your agents normally; each run is recorded ...
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
[`DProvenanceTracingProcessor`](src/dprovenancekit/integrations/openai_agents.py) implements the SDK's `TracingProcessor`: each agent run becomes a trace-run (`context_id` = the trace name), and every span start/end becomes a typed event — `agent.start`, `generation.end`, `function.start`, `guardrail.error`, … — in execution order. The span's `span_id`/`parent_id` become the **span tree**, the active agent/tool/model becomes the **engine**, errors and triggered guardrails are recorded at `CRITICAL`, and lifecycle **provenance edges** are emitted (same `DERIVED_FROM`/`INFORMED` model). One registered processor captures every run; the same `fingerprint`/diff/align tooling then applies.
|
|
280
|
+
|
|
281
|
+
---
|
|
282
|
+
|
|
283
|
+
## Regression gate
|
|
284
|
+
|
|
285
|
+
`dprovenancekit.testing` turns "did my agent regress?" into one assertion you can drop into any test or CI step. Give it a *golden* run (known-good) and a *candidate* run (what your current code produced); it aligns them and fails with a readable diagnostic if the candidate diverged.
|
|
286
|
+
|
|
287
|
+
```python
|
|
288
|
+
from dprovenancekit.testing import assert_no_regression
|
|
289
|
+
|
|
290
|
+
assert_no_regression(golden=golden_run, candidate=candidate_run)
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
The gate is strict by default: any removed, added, or changed (ambiguous) step fails, and a removed CRITICAL step is additionally a HIGH-severity regression. Loosen it with `max_regression_level` (gate only on severity) or `allow_divergent_steps` (tolerate benign per-step changes), or pass a custom `evaluator` to define what "equivalent" means (e.g. ignore volatile fields like token counts). `RegressionGate(...).check(...)` returns a `RegressionReport` instead of raising, for richer assertions. Detecting *reordered* steps requires a span-aware profile (`AlignmentProfile.developer_debug_v1`); the default linear profile treats a pure reorder as still-matching. The gate complements `AlignmentSnapshotValidator` (an exact output-hash snapshot): it works on two runs and reasons about regression severity.
|
|
294
|
+
|
|
295
|
+
---
|
|
296
|
+
|
|
297
|
+
## Example: regression testing
|
|
298
|
+
|
|
299
|
+
[`examples/regression_testing.py`](examples/regression_testing.py) is the end-to-end story in ~150 readable lines: record a **golden** run of a fact-checking agent (retrieve → verify → decide), then catch a later run that skips its verification step — via both the fast **fingerprint** check and the detailed **alignment** verdict (which flags the dropped `claimVerified` step as a HIGH regression).
|
|
300
|
+
|
|
301
|
+
```bash
|
|
302
|
+
python examples/regression_testing.py
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
It self-asserts its verdicts, so it doubles as an executable test of the headline use case.
|
|
306
|
+
|
|
307
|
+
---
|
|
308
|
+
|
|
309
|
+
## Instrumenting plain code (no framework)
|
|
310
|
+
|
|
311
|
+
Not using a framework? Instrument a hand-written agent loop directly — no event type to define, zero dependencies (ships in core as `dprovenancekit.instrument`):
|
|
312
|
+
|
|
313
|
+
```python
|
|
314
|
+
from dprovenancekit import InMemoryTraceStore, traced, traced_run, record_event
|
|
315
|
+
|
|
316
|
+
@traced
|
|
317
|
+
def search(query): ...
|
|
318
|
+
|
|
319
|
+
@traced
|
|
320
|
+
def answer(question, sources): ...
|
|
321
|
+
|
|
322
|
+
store = InMemoryTraceStore()
|
|
323
|
+
with traced_run(store, context_id="ticket-42"):
|
|
324
|
+
sources = search(question)
|
|
325
|
+
record_event("plan.chosen", {"strategy": "rag"})
|
|
326
|
+
reply = answer(question, sources)
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
`@traced` records a `"<name>.start"` / `".end"` / `".error"` event pair per call in its own **span** (the function name is the **engine**), nests calls in the span tree, and emits the same `DERIVED_FROM` / `INFORMED` provenance edges as the framework adapters. `record_event(...)` drops an ad-hoc event (a decision, a chosen branch). Plain functions, `async def`, generators, and async generators are all supported (for a generator, start/end bracket the full iteration). Instrumentation never changes behavior — capture is failure-proof and exceptions pass through unchanged. Outside a `traced_run` the decorators are transparent, so instrumented code is safe to call untraced. The trace it produces is identical in shape to the adapter-produced ones, so fingerprint / diff / align / the regression gate all apply.
|
|
330
|
+
|
|
331
|
+
---
|
|
332
|
+
|
|
333
|
+
## Tests
|
|
334
|
+
|
|
335
|
+
```bash
|
|
336
|
+
python -m pytest
|
|
337
|
+
```
|
|
338
|
+
|
|
339
|
+
167 tests: 80 ported from the Swift suite (query parity, write-buffer backpressure, SQLite stress + drop accounting, alignment, replay, snapshot diff, explainability fidelity, benchmark scoring, cloud chaos, …), 27 cross-language conformance checks against the frozen Trace Specification v1 vectors, 14 LangChain integration tests, 16 OpenAI Agents SDK integration tests, 16 instrumentation-layer tests, 13 regression-gate tests, and the regression-testing example run as a self-asserting test. (The real-framework tests run only when `langchain-core` / `openai-agents` are installed, otherwise skipped.)
|
|
340
|
+
|
|
341
|
+
---
|
|
342
|
+
|
|
343
|
+
## License
|
|
344
|
+
|
|
345
|
+
Distributed under the **Business Source License 1.1**, same as the upstream Swift project. See [LICENSE](LICENSE).
|