spindle-eval 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- spindle_eval-0.1.0/LICENSE +21 -0
- spindle_eval-0.1.0/MANIFEST.in +1 -0
- spindle_eval-0.1.0/PKG-INFO +262 -0
- spindle_eval-0.1.0/README.md +215 -0
- spindle_eval-0.1.0/pyproject.toml +87 -0
- spindle_eval-0.1.0/setup.cfg +4 -0
- spindle_eval-0.1.0/src/spindle_eval/__init__.py +3 -0
- spindle_eval-0.1.0/src/spindle_eval/baselines/__init__.py +5 -0
- spindle_eval-0.1.0/src/spindle_eval/baselines/base.py +21 -0
- spindle_eval-0.1.0/src/spindle_eval/baselines/bm25_baseline.py +42 -0
- spindle_eval-0.1.0/src/spindle_eval/baselines/hybrid_search.py +69 -0
- spindle_eval-0.1.0/src/spindle_eval/baselines/naive_rag.py +36 -0
- spindle_eval-0.1.0/src/spindle_eval/baselines/no_rag.py +22 -0
- spindle_eval-0.1.0/src/spindle_eval/baselines/oracle.py +24 -0
- spindle_eval-0.1.0/src/spindle_eval/ci/__init__.py +1 -0
- spindle_eval-0.1.0/src/spindle_eval/ci/regression.py +76 -0
- spindle_eval-0.1.0/src/spindle_eval/ci/reporter.py +40 -0
- spindle_eval-0.1.0/src/spindle_eval/compat.py +153 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/config.yaml +28 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/evaluation/full.yaml +8 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/evaluation/quick.yaml +6 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/extraction/finetuned.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/extraction/llm.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/extraction/nlp.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/generation/claude.yaml +3 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/generation/gemini.yaml +3 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/generation/gpt4.yaml +3 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/ontology/hybrid.yaml +12 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/ontology/schema_first.yaml +12 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/ontology/schema_free.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/preprocessing/default.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/preprocessing/large_chunks.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/preprocessing/small_chunks.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/retrieval/drift.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/retrieval/global.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/retrieval/hybrid.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/retrieval/local.yaml +4 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/sweep/chunk_size.yaml +11 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/sweep/er_threshold.yaml +10 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/sweep/none.yaml +1 -0
- spindle_eval-0.1.0/src/spindle_eval/conf/sweep/retrieval.yaml +12 -0
- spindle_eval-0.1.0/src/spindle_eval/datasets/__init__.py +15 -0
- spindle_eval-0.1.0/src/spindle_eval/datasets/generator.py +82 -0
- spindle_eval-0.1.0/src/spindle_eval/datasets/golden.py +151 -0
- spindle_eval-0.1.0/src/spindle_eval/datasets/kos_reference.py +32 -0
- spindle_eval-0.1.0/src/spindle_eval/datasets/versioning.py +37 -0
- spindle_eval-0.1.0/src/spindle_eval/events/__init__.py +21 -0
- spindle_eval-0.1.0/src/spindle_eval/events/analysis.py +117 -0
- spindle_eval-0.1.0/src/spindle_eval/events/store.py +118 -0
- spindle_eval-0.1.0/src/spindle_eval/golden_data/gold_kg/annotation_guidelines.md +30 -0
- spindle_eval-0.1.0/src/spindle_eval/golden_data/questions.jsonl +3 -0
- spindle_eval-0.1.0/src/spindle_eval/metrics/__init__.py +8 -0
- spindle_eval-0.1.0/src/spindle_eval/metrics/chunk_metrics.py +30 -0
- spindle_eval-0.1.0/src/spindle_eval/metrics/extraction_metrics.py +101 -0
- spindle_eval-0.1.0/src/spindle_eval/metrics/graph_metrics.py +218 -0
- spindle_eval-0.1.0/src/spindle_eval/metrics/kos_loader.py +42 -0
- spindle_eval-0.1.0/src/spindle_eval/metrics/kos_metrics.py +367 -0
- spindle_eval-0.1.0/src/spindle_eval/metrics/provenance_metrics.py +14 -0
- spindle_eval-0.1.0/src/spindle_eval/metrics/ragas_scorers.py +49 -0
- spindle_eval-0.1.0/src/spindle_eval/metrics/statistical.py +147 -0
- spindle_eval-0.1.0/src/spindle_eval/mocks.py +227 -0
- spindle_eval-0.1.0/src/spindle_eval/pipeline.py +120 -0
- spindle_eval-0.1.0/src/spindle_eval/production/__init__.py +1 -0
- spindle_eval-0.1.0/src/spindle_eval/production/feedback_loop.py +53 -0
- spindle_eval-0.1.0/src/spindle_eval/production/staleness.py +39 -0
- spindle_eval-0.1.0/src/spindle_eval/protocols.py +183 -0
- spindle_eval-0.1.0/src/spindle_eval/runner.py +333 -0
- spindle_eval-0.1.0/src/spindle_eval/tracking/__init__.py +39 -0
- spindle_eval-0.1.0/src/spindle_eval/tracking/composite_tracker.py +53 -0
- spindle_eval-0.1.0/src/spindle_eval/tracking/file_tracker.py +95 -0
- spindle_eval-0.1.0/src/spindle_eval/tracking/langfuse_integration.py +39 -0
- spindle_eval-0.1.0/src/spindle_eval/tracking/mlflow_tracker.py +106 -0
- spindle_eval-0.1.0/src/spindle_eval/tracking/noop_tracker.py +44 -0
- spindle_eval-0.1.0/src/spindle_eval.egg-info/PKG-INFO +262 -0
- spindle_eval-0.1.0/src/spindle_eval.egg-info/SOURCES.txt +87 -0
- spindle_eval-0.1.0/src/spindle_eval.egg-info/dependency_links.txt +1 -0
- spindle_eval-0.1.0/src/spindle_eval.egg-info/entry_points.txt +2 -0
- spindle_eval-0.1.0/src/spindle_eval.egg-info/requires.txt +26 -0
- spindle_eval-0.1.0/src/spindle_eval.egg-info/top_level.txt +1 -0
- spindle_eval-0.1.0/tests/test_compat.py +89 -0
- spindle_eval-0.1.0/tests/test_datasets_and_ci.py +179 -0
- spindle_eval-0.1.0/tests/test_events.py +550 -0
- spindle_eval-0.1.0/tests/test_kos_metrics.py +432 -0
- spindle_eval-0.1.0/tests/test_metrics.py +55 -0
- spindle_eval-0.1.0/tests/test_mlflow_tracker.py +57 -0
- spindle_eval-0.1.0/tests/test_pipeline.py +171 -0
- spindle_eval-0.1.0/tests/test_protocols.py +161 -0
- spindle_eval-0.1.0/tests/test_runner.py +147 -0
- spindle_eval-0.1.0/tests/test_tracking.py +156 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Daniel Wood
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
include LICENSE README.md
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: spindle-eval
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pipeline-agnostic evaluation and observability for knowledge graph, RAG, and KOS pipelines
|
|
5
|
+
Author: Spindle Team
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/danielkentwood/spindle-eval
|
|
8
|
+
Project-URL: Documentation, https://github.com/danielkentwood/spindle-eval#readme
|
|
9
|
+
Project-URL: Repository, https://github.com/danielkentwood/spindle-eval
|
|
10
|
+
Project-URL: Changelog, https://github.com/danielkentwood/spindle-eval/releases
|
|
11
|
+
Keywords: graph-rag,evaluation,mlflow,experimentation,rag,knowledge-graph,hydra
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: mlflow>=3.0
|
|
23
|
+
Requires-Dist: hydra-core>=1.3
|
|
24
|
+
Requires-Dist: omegaconf>=2.3
|
|
25
|
+
Requires-Dist: ragas>=0.2
|
|
26
|
+
Requires-Dist: optuna<3.0,>=2.10
|
|
27
|
+
Requires-Dist: hydra-optuna-sweeper>=1.2
|
|
28
|
+
Requires-Dist: langfuse>=2.0
|
|
29
|
+
Requires-Dist: opentelemetry-sdk>=1.20
|
|
30
|
+
Requires-Dist: opentelemetry-exporter-otlp>=1.20
|
|
31
|
+
Requires-Dist: scipy>=1.10
|
|
32
|
+
Requires-Dist: numpy>=1.24
|
|
33
|
+
Requires-Dist: scikit-learn>=1.3
|
|
34
|
+
Requires-Dist: rank-bm25>=0.2
|
|
35
|
+
Requires-Dist: sentence-transformers>=2.2
|
|
36
|
+
Requires-Dist: rdflib>=7.0
|
|
37
|
+
Requires-Dist: pyshacl>=0.25
|
|
38
|
+
Provides-Extra: dev
|
|
39
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
40
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
41
|
+
Requires-Dist: ruff>=0.1; extra == "dev"
|
|
42
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
43
|
+
Requires-Dist: twine>=5.0; extra == "dev"
|
|
44
|
+
Provides-Extra: spindle
|
|
45
|
+
Requires-Dist: spindle>=0.1.0; extra == "spindle"
|
|
46
|
+
Dynamic: license-file
|
|
47
|
+
|
|
48
|
+
# spindle-eval
|
|
49
|
+
|
|
50
|
+
Pipeline-agnostic evaluation and observability framework for knowledge graph, RAG, and KOS pipelines. spindle-eval wraps any pipeline defined as a sequence of `Stage` objects with structured experiment tracking, automated metrics, parameter sweeps, quality gates, baseline comparisons, and CI/CD regression detection.
|
|
51
|
+
|
|
52
|
+
Originally built for [spindle](https://github.com/danielkentwood/spindle) (a Graph RAG pipeline), spindle-eval is designed to evaluate **any** pipeline — full end-to-end systems, individual stages, or partial subsets.
|
|
53
|
+
|
|
54
|
+
## Why spindle-eval?
|
|
55
|
+
|
|
56
|
+
Multi-stage pipelines have many interacting parameters. Tuning them requires more than ad-hoc scripts. spindle-eval provides:
|
|
57
|
+
|
|
58
|
+
- **Stage-gated evaluation** — each stage must meet quality thresholds before downstream stages run, enforcing upstream-first optimization
|
|
59
|
+
- **Pipeline-agnostic execution** — define stages with the `Stage` protocol, wire them with `StageDef`, run them with `PipelineExecutor`
|
|
60
|
+
- **Composable configs** — Hydra config groups for every pipeline aspect, enabling single runs or multi-dimensional parameter sweeps
|
|
61
|
+
- **Multiple tracking backends** — MLflow for experiments, file-based for CI, composite for multi-backend, no-op for benchmarking
|
|
62
|
+
- **Structured events** — thread-safe event store with duration analysis, token tracking, and error filtering
|
|
63
|
+
- **KOS metrics** — intrinsic quality metrics for SKOS taxonomies and OWL ontologies (taxonomy depth, label quality, SHACL conformance, etc.)
|
|
64
|
+
- **Automated regression detection** — CI compares metrics against baselines with bootstrap confidence intervals
|
|
65
|
+
- **Golden dataset management** — versioned evaluation datasets with a question-type taxonomy and extensible reference fields for extraction and KOS evaluation
|
|
66
|
+
|
|
67
|
+
## Architecture overview
|
|
68
|
+
|
|
69
|
+
```
|
|
70
|
+
┌─────────────────────────────┐
|
|
71
|
+
│ Hydra Configuration │
|
|
72
|
+
│ (composable YAML per stage) │
|
|
73
|
+
└──────────────┬──────────────┘
|
|
74
|
+
│
|
|
75
|
+
┌──────────────▼──────────────┐
|
|
76
|
+
│ spindle-eval runner │
|
|
77
|
+
│ (discovery + orchestration) │
|
|
78
|
+
└──────────────┬──────────────┘
|
|
79
|
+
│
|
|
80
|
+
┌──────────────▼──────────────┐
|
|
81
|
+
│ PipelineExecutor │
|
|
82
|
+
│ (stage wiring, metrics, │
|
|
83
|
+
│ gates, event logging) │
|
|
84
|
+
└──────────────┬──────────────┘
|
|
85
|
+
│
|
|
86
|
+
┌────────────┬───────────┼───────────┬────────────┐
|
|
87
|
+
▼ ▼ ▼ ▼ ▼
|
|
88
|
+
Stage 1 Stage 2 Stage 3 Stage N Metric fns
|
|
89
|
+
(any) (any) (any) (any) (attached)
|
|
90
|
+
│ │ │ │ │
|
|
91
|
+
└────────────┴───────────┴───────────┴────────────┘
|
|
92
|
+
│
|
|
93
|
+
┌──────────────▼──────────────┐
|
|
94
|
+
│ Tracker backends │
|
|
95
|
+
├──────────┬─────────┬────────┤
|
|
96
|
+
▼ ▼ ▼ ▼
|
|
97
|
+
MLflow File Langfuse No-op
|
|
98
|
+
(experiments) (JSON) (traces) (benchmarks)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
## Installation
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
pip install spindle-eval
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
For co-development with a pipeline package (editable install):
|
|
108
|
+
|
|
109
|
+
```bash
|
|
110
|
+
pip install -e ".[dev]"
|
|
111
|
+
pip install -e /path/to/your-pipeline
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Quick start
|
|
115
|
+
|
|
116
|
+
### Full pipeline evaluation
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Single evaluation run
|
|
120
|
+
python -m spindle_eval.runner retrieval=hybrid generation=claude evaluation=quick
|
|
121
|
+
|
|
122
|
+
# Parameter sweep
|
|
123
|
+
python -m spindle_eval.runner --multirun \
|
|
124
|
+
preprocessing.chunk_size=256,512,1024 \
|
|
125
|
+
retrieval.top_k=5,10,20
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
### Evaluate a single stage
|
|
129
|
+
|
|
130
|
+
```python
|
|
131
|
+
from spindle_eval.pipeline import PipelineExecutor
|
|
132
|
+
from spindle_eval.protocols import StageDef, StageResult
|
|
133
|
+
from spindle_eval.tracking import create_tracker
|
|
134
|
+
from spindle_eval.metrics.chunk_metrics import boundary_coherence, size_distribution
|
|
135
|
+
|
|
136
|
+
class MyChunker:
|
|
137
|
+
name = "chunking"
|
|
138
|
+
def run(self, inputs, cfg):
|
|
139
|
+
chunks = do_chunking(cfg)
|
|
140
|
+
return StageResult(outputs={"chunks": chunks})
|
|
141
|
+
|
|
142
|
+
tracker = create_tracker("file", output_dir="./results")
|
|
143
|
+
stages = [
|
|
144
|
+
StageDef(
|
|
145
|
+
name="chunking",
|
|
146
|
+
stage=MyChunker(),
|
|
147
|
+
metrics=[boundary_coherence, size_distribution],
|
|
148
|
+
),
|
|
149
|
+
]
|
|
150
|
+
result = PipelineExecutor(tracker).execute(stages, cfg)
|
|
151
|
+
tracker.end_run()
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Evaluate a KOS builder
|
|
155
|
+
|
|
156
|
+
```python
|
|
157
|
+
from spindle_eval.metrics.kos_metrics import taxonomy_depth, label_quality, orphan_concept_ratio
|
|
158
|
+
|
|
159
|
+
stages = [
|
|
160
|
+
StageDef(
|
|
161
|
+
name="taxonomy",
|
|
162
|
+
stage=MyTaxonomyBuilder(),
|
|
163
|
+
input_keys={"chunks": "preprocessing.chunks"},
|
|
164
|
+
metrics=[taxonomy_depth, label_quality, orphan_concept_ratio],
|
|
165
|
+
gate=lambda m: m.get("orphan_concept_ratio", 1.0) < 0.3,
|
|
166
|
+
),
|
|
167
|
+
]
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
## Configuration
|
|
171
|
+
|
|
172
|
+
Hydra config groups live in `spindle_eval/conf/` (packaged with the install) and compose together:
|
|
173
|
+
|
|
174
|
+
| Group | Options | Controls |
|
|
175
|
+
|---|---|---|
|
|
176
|
+
| `preprocessing` | `default`, `small_chunks`, `large_chunks` | Chunking strategy and size |
|
|
177
|
+
| `ontology` | `schema_first`, `schema_free`, `hybrid` | Entity/relation schema discovery |
|
|
178
|
+
| `extraction` | `llm`, `nlp`, `finetuned` | Triple extraction method |
|
|
179
|
+
| `retrieval` | `hybrid`, `local`, `global`, `drift` | Graph retrieval strategy |
|
|
180
|
+
| `generation` | `gpt4`, `claude`, `gemini` | LLM for answer generation |
|
|
181
|
+
| `evaluation` | `quick`, `full` | Number of evaluation examples |
|
|
182
|
+
| `sweep` | `none`, `er_threshold`, `retrieval`, `chunk_size` | Predefined sweep dimensions |
|
|
183
|
+
|
|
184
|
+
Pipeline packages can register additional config groups via Hydra's `SearchPathPlugin`. See [docs/hydra-config-conventions.md](docs/hydra-config-conventions.md).
|
|
185
|
+
|
|
186
|
+
## Metrics
|
|
187
|
+
|
|
188
|
+
### RAG quality (via Ragas)
|
|
189
|
+
Faithfulness, context recall, context precision, answer correctness, answer relevancy.
|
|
190
|
+
|
|
191
|
+
### Graph quality
|
|
192
|
+
Connectivity, modularity, B-CUBED clustering, CEAF entity alignment, subgraph completeness.
|
|
193
|
+
|
|
194
|
+
### Extraction quality
|
|
195
|
+
Triple extraction precision, recall, and F1 — with configurable stage gates.
|
|
196
|
+
|
|
197
|
+
### KOS quality
|
|
198
|
+
Taxonomy depth/breadth, label quality, definition completeness, thesaurus connectivity, orphan ratio, axiom density, SHACL conformance. See [docs/kos-evaluation-guide.md](docs/kos-evaluation-guide.md).
|
|
199
|
+
|
|
200
|
+
### Chunk and provenance quality
|
|
201
|
+
Boundary coherence, size distribution, evidence span coverage.
|
|
202
|
+
|
|
203
|
+
### Statistical rigor
|
|
204
|
+
Bootstrap confidence intervals for all metrics, used for regression detection in CI.
|
|
205
|
+
|
|
206
|
+
## Tracking backends
|
|
207
|
+
|
|
208
|
+
| Backend | Class | Use case |
|
|
209
|
+
|---------|-------|----------|
|
|
210
|
+
| MLflow | `MLflowTracker` | Production experiment tracking |
|
|
211
|
+
| File | `FileTracker` | Local development, CI |
|
|
212
|
+
| Langfuse | Via OpenTelemetry | Trace-level debugging |
|
|
213
|
+
| No-op | `NoOpTracker` | Benchmarking, unit tests |
|
|
214
|
+
| Composite | `CompositeTracker` | Fan out to multiple backends |
|
|
215
|
+
|
|
216
|
+
```python
|
|
217
|
+
from spindle_eval.tracking import create_tracker
|
|
218
|
+
|
|
219
|
+
tracker = create_tracker("mlflow")
|
|
220
|
+
tracker = create_tracker("file", output_dir="./results")
|
|
221
|
+
tracker = create_tracker("noop")
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
## Documentation
|
|
225
|
+
|
|
226
|
+
| Guide | Audience |
|
|
227
|
+
|-------|----------|
|
|
228
|
+
| [Spindle Developer Guide](docs/spindle-developer-guide.md) | Pipeline developers integrating with spindle-eval |
|
|
229
|
+
| [Custom Pipeline Guide](docs/custom-pipeline-guide.md) | Developers building non-spindle pipelines |
|
|
230
|
+
| [KOS Evaluation Guide](docs/kos-evaluation-guide.md) | Developers evaluating SKOS/OWL knowledge structures |
|
|
231
|
+
| [Hydra Config Conventions](docs/hydra-config-conventions.md) | Config authors and sweep designers |
|
|
232
|
+
| [Tracking Setup](docs/tracking_setup.md) | Setting up MLflow/Langfuse (GKE or local Docker) |
|
|
233
|
+
| [PyPI Publishing](docs/pypi-publish.md) | Building and uploading releases to PyPI |
|
|
234
|
+
|
|
235
|
+
## Requirements
|
|
236
|
+
|
|
237
|
+
- Python 3.10+
|
|
238
|
+
- Pipeline package (optional — mocks are used if it is unavailable, controlled via `runner.allow_mock_fallback`)
|
|
239
|
+
|
|
240
|
+
## Project structure
|
|
241
|
+
|
|
242
|
+
```
|
|
243
|
+
spindle-eval/
|
|
244
|
+
├── src/spindle_eval/
|
|
245
|
+
│ ├── runner.py # Hydra entrypoint, pipeline discovery
|
|
246
|
+
│ ├── pipeline.py # PipelineExecutor (stage wiring, metrics, gates)
|
|
247
|
+
│ ├── protocols.py # Stage, StageDef, StageResult, Tracker protocols
|
|
248
|
+
│ ├── compat.py # Legacy component dict → StageDef adapter
|
|
249
|
+
│ ├── mocks.py # Mock Stage implementations for testing
|
|
250
|
+
│ ├── metrics/ # Ragas, graph, extraction, KOS, chunk, provenance
|
|
251
|
+
│ ├── tracking/ # MLflow, file, noop, composite trackers
|
|
252
|
+
│ ├── events/ # Event store, duration/token/error analysis
|
|
253
|
+
│ ├── datasets/ # Golden dataset loading, KOS reference extraction
|
|
254
|
+
│ ├── baselines/ # Baseline runner implementations
|
|
255
|
+
│ ├── ci/ # Regression detection, PR report generation
|
|
256
|
+
│ ├── production/ # Feedback loops, staleness monitoring
|
|
257
|
+
│ ├── conf/ # Hydra config groups (packaged for pip install)
|
|
258
|
+
│ └── golden_data/ # Default evaluation datasets (JSONL)
|
|
259
|
+
├── docs/ # Developer guides
|
|
260
|
+
├── baselines/ # Baseline metric snapshots
|
|
261
|
+
└── tests/
|
|
262
|
+
```
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
# spindle-eval
|
|
2
|
+
|
|
3
|
+
Pipeline-agnostic evaluation and observability framework for knowledge graph, RAG, and KOS pipelines. spindle-eval wraps any pipeline defined as a sequence of `Stage` objects with structured experiment tracking, automated metrics, parameter sweeps, quality gates, baseline comparisons, and CI/CD regression detection.
|
|
4
|
+
|
|
5
|
+
Originally built for [spindle](https://github.com/danielkentwood/spindle) (a Graph RAG pipeline), spindle-eval is designed to evaluate **any** pipeline — full end-to-end systems, individual stages, or partial subsets.
|
|
6
|
+
|
|
7
|
+
## Why spindle-eval?
|
|
8
|
+
|
|
9
|
+
Multi-stage pipelines have many interacting parameters. Tuning them requires more than ad-hoc scripts. spindle-eval provides:
|
|
10
|
+
|
|
11
|
+
- **Stage-gated evaluation** — each stage must meet quality thresholds before downstream stages run, enforcing upstream-first optimization
|
|
12
|
+
- **Pipeline-agnostic execution** — define stages with the `Stage` protocol, wire them with `StageDef`, run them with `PipelineExecutor`
|
|
13
|
+
- **Composable configs** — Hydra config groups for every pipeline aspect, enabling single runs or multi-dimensional parameter sweeps
|
|
14
|
+
- **Multiple tracking backends** — MLflow for experiments, file-based for CI, composite for multi-backend, no-op for benchmarking
|
|
15
|
+
- **Structured events** — thread-safe event store with duration analysis, token tracking, and error filtering
|
|
16
|
+
- **KOS metrics** — intrinsic quality metrics for SKOS taxonomies and OWL ontologies (taxonomy depth, label quality, SHACL conformance, etc.)
|
|
17
|
+
- **Automated regression detection** — CI compares metrics against baselines with bootstrap confidence intervals
|
|
18
|
+
- **Golden dataset management** — versioned evaluation datasets with a question-type taxonomy and extensible reference fields for extraction and KOS evaluation
|
|
19
|
+
|
|
20
|
+
## Architecture overview
|
|
21
|
+
|
|
22
|
+
```
|
|
23
|
+
┌─────────────────────────────┐
|
|
24
|
+
│ Hydra Configuration │
|
|
25
|
+
│ (composable YAML per stage) │
|
|
26
|
+
└──────────────┬──────────────┘
|
|
27
|
+
│
|
|
28
|
+
┌──────────────▼──────────────┐
|
|
29
|
+
│ spindle-eval runner │
|
|
30
|
+
│ (discovery + orchestration) │
|
|
31
|
+
└──────────────┬──────────────┘
|
|
32
|
+
│
|
|
33
|
+
┌──────────────▼──────────────┐
|
|
34
|
+
│ PipelineExecutor │
|
|
35
|
+
│ (stage wiring, metrics, │
|
|
36
|
+
│ gates, event logging) │
|
|
37
|
+
└──────────────┬──────────────┘
|
|
38
|
+
│
|
|
39
|
+
┌────────────┬───────────┼───────────┬────────────┐
|
|
40
|
+
▼ ▼ ▼ ▼ ▼
|
|
41
|
+
Stage 1 Stage 2 Stage 3 Stage N Metric fns
|
|
42
|
+
(any) (any) (any) (any) (attached)
|
|
43
|
+
│ │ │ │ │
|
|
44
|
+
└────────────┴───────────┴───────────┴────────────┘
|
|
45
|
+
│
|
|
46
|
+
┌──────────────▼──────────────┐
|
|
47
|
+
│ Tracker backends │
|
|
48
|
+
├──────────┬─────────┬────────┤
|
|
49
|
+
▼ ▼ ▼ ▼
|
|
50
|
+
MLflow File Langfuse No-op
|
|
51
|
+
(experiments) (JSON) (traces) (benchmarks)
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Installation
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
pip install spindle-eval
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
For co-development with a pipeline package (editable install):
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install -e ".[dev]"
|
|
64
|
+
pip install -e /path/to/your-pipeline
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Quick start
|
|
68
|
+
|
|
69
|
+
### Full pipeline evaluation
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Single evaluation run
|
|
73
|
+
python -m spindle_eval.runner retrieval=hybrid generation=claude evaluation=quick
|
|
74
|
+
|
|
75
|
+
# Parameter sweep
|
|
76
|
+
python -m spindle_eval.runner --multirun \
|
|
77
|
+
preprocessing.chunk_size=256,512,1024 \
|
|
78
|
+
retrieval.top_k=5,10,20
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### Evaluate a single stage
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
from spindle_eval.pipeline import PipelineExecutor
|
|
85
|
+
from spindle_eval.protocols import StageDef, StageResult
|
|
86
|
+
from spindle_eval.tracking import create_tracker
|
|
87
|
+
from spindle_eval.metrics.chunk_metrics import boundary_coherence, size_distribution
|
|
88
|
+
|
|
89
|
+
class MyChunker:
|
|
90
|
+
name = "chunking"
|
|
91
|
+
def run(self, inputs, cfg):
|
|
92
|
+
chunks = do_chunking(cfg)
|
|
93
|
+
return StageResult(outputs={"chunks": chunks})
|
|
94
|
+
|
|
95
|
+
tracker = create_tracker("file", output_dir="./results")
|
|
96
|
+
stages = [
|
|
97
|
+
StageDef(
|
|
98
|
+
name="chunking",
|
|
99
|
+
stage=MyChunker(),
|
|
100
|
+
metrics=[boundary_coherence, size_distribution],
|
|
101
|
+
),
|
|
102
|
+
]
|
|
103
|
+
result = PipelineExecutor(tracker).execute(stages, cfg)  # cfg: your run configuration (e.g. a Hydra/OmegaConf config), supplied by the caller
|
|
104
|
+
tracker.end_run()
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### Evaluate a KOS builder
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
from spindle_eval.metrics.kos_metrics import taxonomy_depth, label_quality, orphan_concept_ratio
|
|
111
|
+
|
|
112
|
+
stages = [
|
|
113
|
+
StageDef(
|
|
114
|
+
name="taxonomy",
|
|
115
|
+
stage=MyTaxonomyBuilder(),
|
|
116
|
+
input_keys={"chunks": "preprocessing.chunks"},
|
|
117
|
+
metrics=[taxonomy_depth, label_quality, orphan_concept_ratio],
|
|
118
|
+
gate=lambda m: m.get("orphan_concept_ratio", 1.0) < 0.3,
|
|
119
|
+
),
|
|
120
|
+
]
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Configuration
|
|
124
|
+
|
|
125
|
+
Hydra config groups live in `spindle_eval/conf/` (packaged with the install) and compose together:
|
|
126
|
+
|
|
127
|
+
| Group | Options | Controls |
|
|
128
|
+
|---|---|---|
|
|
129
|
+
| `preprocessing` | `default`, `small_chunks`, `large_chunks` | Chunking strategy and size |
|
|
130
|
+
| `ontology` | `schema_first`, `schema_free`, `hybrid` | Entity/relation schema discovery |
|
|
131
|
+
| `extraction` | `llm`, `nlp`, `finetuned` | Triple extraction method |
|
|
132
|
+
| `retrieval` | `hybrid`, `local`, `global`, `drift` | Graph retrieval strategy |
|
|
133
|
+
| `generation` | `gpt4`, `claude`, `gemini` | LLM for answer generation |
|
|
134
|
+
| `evaluation` | `quick`, `full` | Number of evaluation examples |
|
|
135
|
+
| `sweep` | `none`, `er_threshold`, `retrieval`, `chunk_size` | Predefined sweep dimensions |
|
|
136
|
+
|
|
137
|
+
Pipeline packages can register additional config groups via Hydra's `SearchPathPlugin`. See [docs/hydra-config-conventions.md](docs/hydra-config-conventions.md).
|
|
138
|
+
|
|
139
|
+
## Metrics
|
|
140
|
+
|
|
141
|
+
### RAG quality (via Ragas)
|
|
142
|
+
Faithfulness, context recall, context precision, answer correctness, answer relevancy.
|
|
143
|
+
|
|
144
|
+
### Graph quality
|
|
145
|
+
Connectivity, modularity, B-CUBED clustering, CEAF entity alignment, subgraph completeness.
|
|
146
|
+
|
|
147
|
+
### Extraction quality
|
|
148
|
+
Triple extraction precision, recall, and F1 — with configurable stage gates.
|
|
149
|
+
|
|
150
|
+
### KOS quality
|
|
151
|
+
Taxonomy depth/breadth, label quality, definition completeness, thesaurus connectivity, orphan ratio, axiom density, SHACL conformance. See [docs/kos-evaluation-guide.md](docs/kos-evaluation-guide.md).
|
|
152
|
+
|
|
153
|
+
### Chunk and provenance quality
|
|
154
|
+
Boundary coherence, size distribution, evidence span coverage.
|
|
155
|
+
|
|
156
|
+
### Statistical rigor
|
|
157
|
+
Bootstrap confidence intervals for all metrics, used for regression detection in CI.
|
|
158
|
+
|
|
159
|
+
## Tracking backends
|
|
160
|
+
|
|
161
|
+
| Backend | Class | Use case |
|
|
162
|
+
|---------|-------|----------|
|
|
163
|
+
| MLflow | `MLflowTracker` | Production experiment tracking |
|
|
164
|
+
| File | `FileTracker` | Local development, CI |
|
|
165
|
+
| Langfuse | Via OpenTelemetry | Trace-level debugging |
|
|
166
|
+
| No-op | `NoOpTracker` | Benchmarking, unit tests |
|
|
167
|
+
| Composite | `CompositeTracker` | Fan out to multiple backends |
|
|
168
|
+
|
|
169
|
+
```python
|
|
170
|
+
from spindle_eval.tracking import create_tracker
|
|
171
|
+
|
|
172
|
+
tracker = create_tracker("mlflow")
|
|
173
|
+
tracker = create_tracker("file", output_dir="./results")
|
|
174
|
+
tracker = create_tracker("noop")
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Documentation
|
|
178
|
+
|
|
179
|
+
| Guide | Audience |
|
|
180
|
+
|-------|----------|
|
|
181
|
+
| [Spindle Developer Guide](docs/spindle-developer-guide.md) | Pipeline developers integrating with spindle-eval |
|
|
182
|
+
| [Custom Pipeline Guide](docs/custom-pipeline-guide.md) | Developers building non-spindle pipelines |
|
|
183
|
+
| [KOS Evaluation Guide](docs/kos-evaluation-guide.md) | Developers evaluating SKOS/OWL knowledge structures |
|
|
184
|
+
| [Hydra Config Conventions](docs/hydra-config-conventions.md) | Config authors and sweep designers |
|
|
185
|
+
| [Tracking Setup](docs/tracking_setup.md) | Setting up MLflow/Langfuse (GKE or local Docker) |
|
|
186
|
+
| [PyPI Publishing](docs/pypi-publish.md) | Building and uploading releases to PyPI |
|
|
187
|
+
|
|
188
|
+
## Requirements
|
|
189
|
+
|
|
190
|
+
- Python 3.10+
|
|
191
|
+
- Pipeline package (optional — mocks are used if it is unavailable, controlled via `runner.allow_mock_fallback`)
|
|
192
|
+
|
|
193
|
+
## Project structure
|
|
194
|
+
|
|
195
|
+
```
|
|
196
|
+
spindle-eval/
|
|
197
|
+
├── src/spindle_eval/
|
|
198
|
+
│ ├── runner.py # Hydra entrypoint, pipeline discovery
|
|
199
|
+
│ ├── pipeline.py # PipelineExecutor (stage wiring, metrics, gates)
|
|
200
|
+
│ ├── protocols.py # Stage, StageDef, StageResult, Tracker protocols
|
|
201
|
+
│ ├── compat.py # Legacy component dict → StageDef adapter
|
|
202
|
+
│ ├── mocks.py # Mock Stage implementations for testing
|
|
203
|
+
│ ├── metrics/ # Ragas, graph, extraction, KOS, chunk, provenance
|
|
204
|
+
│ ├── tracking/ # MLflow, file, noop, composite trackers
|
|
205
|
+
│ ├── events/ # Event store, duration/token/error analysis
|
|
206
|
+
│ ├── datasets/ # Golden dataset loading, KOS reference extraction
|
|
207
|
+
│ ├── baselines/ # Baseline runner implementations
|
|
208
|
+
│ ├── ci/ # Regression detection, PR report generation
|
|
209
|
+
│ ├── production/ # Feedback loops, staleness monitoring
|
|
210
|
+
│ ├── conf/ # Hydra config groups (packaged for pip install)
|
|
211
|
+
│ └── golden_data/ # Default evaluation datasets (JSONL)
|
|
212
|
+
├── docs/ # Developer guides
|
|
213
|
+
├── baselines/ # Baseline metric snapshots
|
|
214
|
+
└── tests/
|
|
215
|
+
```
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "spindle-eval"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Pipeline-agnostic evaluation and observability for knowledge graph, RAG, and KOS pipelines"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
license = {text = "MIT"}
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
|
+
authors = [{name = "Spindle Team"}]
|
|
13
|
+
keywords = [
|
|
14
|
+
"graph-rag",
|
|
15
|
+
"evaluation",
|
|
16
|
+
"mlflow",
|
|
17
|
+
"experimentation",
|
|
18
|
+
"rag",
|
|
19
|
+
"knowledge-graph",
|
|
20
|
+
"hydra",
|
|
21
|
+
]
|
|
22
|
+
classifiers = [
|
|
23
|
+
"Development Status :: 3 - Alpha",
|
|
24
|
+
"Intended Audience :: Developers",
|
|
25
|
+
"License :: OSI Approved :: MIT License",
|
|
26
|
+
"Programming Language :: Python :: 3",
|
|
27
|
+
"Programming Language :: Python :: 3.10",
|
|
28
|
+
"Programming Language :: Python :: 3.11",
|
|
29
|
+
"Programming Language :: Python :: 3.12",
|
|
30
|
+
]
|
|
31
|
+
dependencies = [
|
|
32
|
+
"mlflow>=3.0",
|
|
33
|
+
"hydra-core>=1.3",
|
|
34
|
+
"omegaconf>=2.3",
|
|
35
|
+
"ragas>=0.2",
|
|
36
|
+
"optuna>=2.10,<3.0",
|
|
37
|
+
"hydra-optuna-sweeper>=1.2",
|
|
38
|
+
"langfuse>=2.0",
|
|
39
|
+
"opentelemetry-sdk>=1.20",
|
|
40
|
+
"opentelemetry-exporter-otlp>=1.20",
|
|
41
|
+
"scipy>=1.10",
|
|
42
|
+
"numpy>=1.24",
|
|
43
|
+
"scikit-learn>=1.3",
|
|
44
|
+
"rank-bm25>=0.2",
|
|
45
|
+
"sentence-transformers>=2.2",
|
|
46
|
+
"rdflib>=7.0",
|
|
47
|
+
"pyshacl>=0.25",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[project.optional-dependencies]
|
|
51
|
+
dev = [
|
|
52
|
+
"pytest>=7.0",
|
|
53
|
+
"pytest-cov>=4.0",
|
|
54
|
+
"ruff>=0.1",
|
|
55
|
+
"build>=1.0",
|
|
56
|
+
"twine>=5.0",
|
|
57
|
+
]
|
|
58
|
+
spindle = [
|
|
59
|
+
"spindle>=0.1.0",
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
[project.urls]
|
|
63
|
+
Homepage = "https://github.com/danielkentwood/spindle-eval"
|
|
64
|
+
Documentation = "https://github.com/danielkentwood/spindle-eval#readme"
|
|
65
|
+
Repository = "https://github.com/danielkentwood/spindle-eval"
|
|
66
|
+
Changelog = "https://github.com/danielkentwood/spindle-eval/releases"
|
|
67
|
+
|
|
68
|
+
[project.scripts]
|
|
69
|
+
spindle-eval = "spindle_eval.runner:main"
|
|
70
|
+
|
|
71
|
+
[tool.setuptools.packages.find]
|
|
72
|
+
where = ["src"]
|
|
73
|
+
|
|
74
|
+
[tool.setuptools.package-data]
|
|
75
|
+
spindle_eval = ["conf/**/*.yaml", "golden_data/**/*.jsonl", "golden_data/**/*.md"]
|
|
76
|
+
|
|
77
|
+
[tool.ruff]
|
|
78
|
+
line-length = 88
|
|
79
|
+
target-version = "py310"
|
|
80
|
+
|
|
81
|
+
[tool.ruff.lint]
|
|
82
|
+
select = ["E", "F", "I", "N", "W", "UP"]
|
|
83
|
+
ignore = []
|
|
84
|
+
|
|
85
|
+
[tool.pytest.ini_options]
|
|
86
|
+
testpaths = ["tests"]
|
|
87
|
+
addopts = "-v --tb=short"
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Baseline protocol and shared result structures."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any, Protocol
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class BaselineResult:
    """Outcome of a single baseline run for one question.

    Groups the generated answer with the context passages that were supplied
    alongside it and a free-form metadata mapping.
    """

    # Text produced by the baseline for the question.
    answer: str
    # Context passages associated with the answer.
    contexts: list[str]
    # Arbitrary extra details about the run; every instance gets its own dict.
    metadata: dict[str, Any] = field(default_factory=dict)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BaselineRunner(Protocol):
    """Structural interface shared by every baseline system.

    Any object exposing a ``name`` attribute and a compatible ``run`` method
    satisfies this protocol; inheriting from it is not required.
    """

    # Short identifier of the baseline.
    name: str

    def run(self, question: str, **kwargs: Any) -> BaselineResult:
        """Answer *question* and return the result as a ``BaselineResult``.

        Extra keyword arguments are implementation-specific options.
        """
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""BM25 lexical retrieval baseline."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any, Callable
|
|
6
|
+
|
|
7
|
+
from rank_bm25 import BM25Okapi
|
|
8
|
+
|
|
9
|
+
from spindle_eval.baselines.base import BaselineResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BM25Baseline:
    """BM25 lexical-retrieval baseline.

    Documents are tokenized by lower-casing and splitting on whitespace, ranked
    against the question with ``BM25Okapi``, and the best ``top_k`` documents
    are handed to the LLM callable as context.
    """

    name = "bm25"

    def __init__(
        self,
        corpus: list[str],
        llm_callable: Callable[[str], str],
        top_k: int = 5,
    ) -> None:
        """Index *corpus* for retrieval.

        Args:
            corpus: Documents to retrieve from. May be empty, in which case
                no context is ever retrieved.
            llm_callable: Function mapping a prompt string to an answer string.
            top_k: Default number of documents to pass as context.
        """
        # Snapshot the corpus so later mutation by the caller cannot put the
        # documents out of step with the scores computed from the index.
        self._corpus = list(corpus)
        self._tokenized = [doc.lower().split() for doc in self._corpus]
        # BM25Okapi divides by the corpus size while building its index, so an
        # empty corpus is left without an index instead of crashing here.
        self._bm25 = BM25Okapi(self._tokenized) if self._tokenized else None
        self._llm_callable = llm_callable
        self._top_k = top_k

    def run(self, question: str, **kwargs: Any) -> BaselineResult:
        """Retrieve context for *question* and ask the LLM to answer it.

        Args:
            question: The question to answer.
            **kwargs: Optional ``top_k`` overriding the value given at
                construction time; other keys are ignored.

        Returns:
            A ``BaselineResult`` with the LLM answer, the retrieved documents
            (highest score first) and metadata naming the baseline and the
            ``top_k`` actually applied.
        """
        # A negative limit would make the slice below drop documents from the
        # end of the ranking rather than return none, so clamp it to zero.
        top_k = max(int(kwargs.get("top_k", self._top_k)), 0)
        if self._bm25 is None:
            contexts: list[str] = []
        else:
            scores = self._bm25.get_scores(question.lower().split())
            ranked = sorted(
                zip(self._corpus, scores, strict=False),
                key=lambda item: item[1],
                reverse=True,
            )
            contexts = [doc for doc, _ in ranked[:top_k]]
        prompt = f"Question: {question}\n\nContext:\n" + "\n".join(contexts)
        answer = self._llm_callable(prompt)
        return BaselineResult(
            answer=answer,
            contexts=contexts,
            metadata={"baseline": self.name, "top_k": top_k},
        )
|