spindle-eval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. spindle_eval-0.1.0/LICENSE +21 -0
  2. spindle_eval-0.1.0/MANIFEST.in +1 -0
  3. spindle_eval-0.1.0/PKG-INFO +262 -0
  4. spindle_eval-0.1.0/README.md +215 -0
  5. spindle_eval-0.1.0/pyproject.toml +87 -0
  6. spindle_eval-0.1.0/setup.cfg +4 -0
  7. spindle_eval-0.1.0/src/spindle_eval/__init__.py +3 -0
  8. spindle_eval-0.1.0/src/spindle_eval/baselines/__init__.py +5 -0
  9. spindle_eval-0.1.0/src/spindle_eval/baselines/base.py +21 -0
  10. spindle_eval-0.1.0/src/spindle_eval/baselines/bm25_baseline.py +42 -0
  11. spindle_eval-0.1.0/src/spindle_eval/baselines/hybrid_search.py +69 -0
  12. spindle_eval-0.1.0/src/spindle_eval/baselines/naive_rag.py +36 -0
  13. spindle_eval-0.1.0/src/spindle_eval/baselines/no_rag.py +22 -0
  14. spindle_eval-0.1.0/src/spindle_eval/baselines/oracle.py +24 -0
  15. spindle_eval-0.1.0/src/spindle_eval/ci/__init__.py +1 -0
  16. spindle_eval-0.1.0/src/spindle_eval/ci/regression.py +76 -0
  17. spindle_eval-0.1.0/src/spindle_eval/ci/reporter.py +40 -0
  18. spindle_eval-0.1.0/src/spindle_eval/compat.py +153 -0
  19. spindle_eval-0.1.0/src/spindle_eval/conf/config.yaml +28 -0
  20. spindle_eval-0.1.0/src/spindle_eval/conf/evaluation/full.yaml +8 -0
  21. spindle_eval-0.1.0/src/spindle_eval/conf/evaluation/quick.yaml +6 -0
  22. spindle_eval-0.1.0/src/spindle_eval/conf/extraction/finetuned.yaml +4 -0
  23. spindle_eval-0.1.0/src/spindle_eval/conf/extraction/llm.yaml +4 -0
  24. spindle_eval-0.1.0/src/spindle_eval/conf/extraction/nlp.yaml +4 -0
  25. spindle_eval-0.1.0/src/spindle_eval/conf/generation/claude.yaml +3 -0
  26. spindle_eval-0.1.0/src/spindle_eval/conf/generation/gemini.yaml +3 -0
  27. spindle_eval-0.1.0/src/spindle_eval/conf/generation/gpt4.yaml +3 -0
  28. spindle_eval-0.1.0/src/spindle_eval/conf/ontology/hybrid.yaml +12 -0
  29. spindle_eval-0.1.0/src/spindle_eval/conf/ontology/schema_first.yaml +12 -0
  30. spindle_eval-0.1.0/src/spindle_eval/conf/ontology/schema_free.yaml +4 -0
  31. spindle_eval-0.1.0/src/spindle_eval/conf/preprocessing/default.yaml +4 -0
  32. spindle_eval-0.1.0/src/spindle_eval/conf/preprocessing/large_chunks.yaml +4 -0
  33. spindle_eval-0.1.0/src/spindle_eval/conf/preprocessing/small_chunks.yaml +4 -0
  34. spindle_eval-0.1.0/src/spindle_eval/conf/retrieval/drift.yaml +4 -0
  35. spindle_eval-0.1.0/src/spindle_eval/conf/retrieval/global.yaml +4 -0
  36. spindle_eval-0.1.0/src/spindle_eval/conf/retrieval/hybrid.yaml +4 -0
  37. spindle_eval-0.1.0/src/spindle_eval/conf/retrieval/local.yaml +4 -0
  38. spindle_eval-0.1.0/src/spindle_eval/conf/sweep/chunk_size.yaml +11 -0
  39. spindle_eval-0.1.0/src/spindle_eval/conf/sweep/er_threshold.yaml +10 -0
  40. spindle_eval-0.1.0/src/spindle_eval/conf/sweep/none.yaml +1 -0
  41. spindle_eval-0.1.0/src/spindle_eval/conf/sweep/retrieval.yaml +12 -0
  42. spindle_eval-0.1.0/src/spindle_eval/datasets/__init__.py +15 -0
  43. spindle_eval-0.1.0/src/spindle_eval/datasets/generator.py +82 -0
  44. spindle_eval-0.1.0/src/spindle_eval/datasets/golden.py +151 -0
  45. spindle_eval-0.1.0/src/spindle_eval/datasets/kos_reference.py +32 -0
  46. spindle_eval-0.1.0/src/spindle_eval/datasets/versioning.py +37 -0
  47. spindle_eval-0.1.0/src/spindle_eval/events/__init__.py +21 -0
  48. spindle_eval-0.1.0/src/spindle_eval/events/analysis.py +117 -0
  49. spindle_eval-0.1.0/src/spindle_eval/events/store.py +118 -0
  50. spindle_eval-0.1.0/src/spindle_eval/golden_data/gold_kg/annotation_guidelines.md +30 -0
  51. spindle_eval-0.1.0/src/spindle_eval/golden_data/questions.jsonl +3 -0
  52. spindle_eval-0.1.0/src/spindle_eval/metrics/__init__.py +8 -0
  53. spindle_eval-0.1.0/src/spindle_eval/metrics/chunk_metrics.py +30 -0
  54. spindle_eval-0.1.0/src/spindle_eval/metrics/extraction_metrics.py +101 -0
  55. spindle_eval-0.1.0/src/spindle_eval/metrics/graph_metrics.py +218 -0
  56. spindle_eval-0.1.0/src/spindle_eval/metrics/kos_loader.py +42 -0
  57. spindle_eval-0.1.0/src/spindle_eval/metrics/kos_metrics.py +367 -0
  58. spindle_eval-0.1.0/src/spindle_eval/metrics/provenance_metrics.py +14 -0
  59. spindle_eval-0.1.0/src/spindle_eval/metrics/ragas_scorers.py +49 -0
  60. spindle_eval-0.1.0/src/spindle_eval/metrics/statistical.py +147 -0
  61. spindle_eval-0.1.0/src/spindle_eval/mocks.py +227 -0
  62. spindle_eval-0.1.0/src/spindle_eval/pipeline.py +120 -0
  63. spindle_eval-0.1.0/src/spindle_eval/production/__init__.py +1 -0
  64. spindle_eval-0.1.0/src/spindle_eval/production/feedback_loop.py +53 -0
  65. spindle_eval-0.1.0/src/spindle_eval/production/staleness.py +39 -0
  66. spindle_eval-0.1.0/src/spindle_eval/protocols.py +183 -0
  67. spindle_eval-0.1.0/src/spindle_eval/runner.py +333 -0
  68. spindle_eval-0.1.0/src/spindle_eval/tracking/__init__.py +39 -0
  69. spindle_eval-0.1.0/src/spindle_eval/tracking/composite_tracker.py +53 -0
  70. spindle_eval-0.1.0/src/spindle_eval/tracking/file_tracker.py +95 -0
  71. spindle_eval-0.1.0/src/spindle_eval/tracking/langfuse_integration.py +39 -0
  72. spindle_eval-0.1.0/src/spindle_eval/tracking/mlflow_tracker.py +106 -0
  73. spindle_eval-0.1.0/src/spindle_eval/tracking/noop_tracker.py +44 -0
  74. spindle_eval-0.1.0/src/spindle_eval.egg-info/PKG-INFO +262 -0
  75. spindle_eval-0.1.0/src/spindle_eval.egg-info/SOURCES.txt +87 -0
  76. spindle_eval-0.1.0/src/spindle_eval.egg-info/dependency_links.txt +1 -0
  77. spindle_eval-0.1.0/src/spindle_eval.egg-info/entry_points.txt +2 -0
  78. spindle_eval-0.1.0/src/spindle_eval.egg-info/requires.txt +26 -0
  79. spindle_eval-0.1.0/src/spindle_eval.egg-info/top_level.txt +1 -0
  80. spindle_eval-0.1.0/tests/test_compat.py +89 -0
  81. spindle_eval-0.1.0/tests/test_datasets_and_ci.py +179 -0
  82. spindle_eval-0.1.0/tests/test_events.py +550 -0
  83. spindle_eval-0.1.0/tests/test_kos_metrics.py +432 -0
  84. spindle_eval-0.1.0/tests/test_metrics.py +55 -0
  85. spindle_eval-0.1.0/tests/test_mlflow_tracker.py +57 -0
  86. spindle_eval-0.1.0/tests/test_pipeline.py +171 -0
  87. spindle_eval-0.1.0/tests/test_protocols.py +161 -0
  88. spindle_eval-0.1.0/tests/test_runner.py +147 -0
  89. spindle_eval-0.1.0/tests/test_tracking.py +156 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Daniel Wood
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ include LICENSE README.md
@@ -0,0 +1,262 @@
1
+ Metadata-Version: 2.4
2
+ Name: spindle-eval
3
+ Version: 0.1.0
4
+ Summary: Pipeline-agnostic evaluation and observability for knowledge graph, RAG, and KOS pipelines
5
+ Author: Spindle Team
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/danielkentwood/spindle-eval
8
+ Project-URL: Documentation, https://github.com/danielkentwood/spindle-eval#readme
9
+ Project-URL: Repository, https://github.com/danielkentwood/spindle-eval
10
+ Project-URL: Changelog, https://github.com/danielkentwood/spindle-eval/releases
11
+ Keywords: graph-rag,evaluation,mlflow,experimentation,rag,knowledge-graph,hydra
12
+ Classifier: Development Status :: 3 - Alpha
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.10
17
+ Classifier: Programming Language :: Python :: 3.11
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Requires-Python: >=3.10
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: mlflow>=3.0
23
+ Requires-Dist: hydra-core>=1.3
24
+ Requires-Dist: omegaconf>=2.3
25
+ Requires-Dist: ragas>=0.2
26
+ Requires-Dist: optuna<3.0,>=2.10
27
+ Requires-Dist: hydra-optuna-sweeper>=1.2
28
+ Requires-Dist: langfuse>=2.0
29
+ Requires-Dist: opentelemetry-sdk>=1.20
30
+ Requires-Dist: opentelemetry-exporter-otlp>=1.20
31
+ Requires-Dist: scipy>=1.10
32
+ Requires-Dist: numpy>=1.24
33
+ Requires-Dist: scikit-learn>=1.3
34
+ Requires-Dist: rank-bm25>=0.2
35
+ Requires-Dist: sentence-transformers>=2.2
36
+ Requires-Dist: rdflib>=7.0
37
+ Requires-Dist: pyshacl>=0.25
38
+ Provides-Extra: dev
39
+ Requires-Dist: pytest>=7.0; extra == "dev"
40
+ Requires-Dist: pytest-cov>=4.0; extra == "dev"
41
+ Requires-Dist: ruff>=0.1; extra == "dev"
42
+ Requires-Dist: build>=1.0; extra == "dev"
43
+ Requires-Dist: twine>=5.0; extra == "dev"
44
+ Provides-Extra: spindle
45
+ Requires-Dist: spindle>=0.1.0; extra == "spindle"
46
+ Dynamic: license-file
47
+
48
+ # spindle-eval
49
+
50
+ Pipeline-agnostic evaluation and observability framework for knowledge graph, RAG, and KOS pipelines. spindle-eval wraps any pipeline defined as a sequence of `Stage` objects with structured experiment tracking, automated metrics, parameter sweeps, quality gates, baseline comparisons, and CI/CD regression detection.
51
+
52
+ Originally built for [spindle](https://github.com/danielkentwood/spindle) (a Graph RAG pipeline), spindle-eval is designed to evaluate **any** pipeline — full end-to-end systems, individual stages, or partial subsets.
53
+
54
+ ## Why spindle-eval?
55
+
56
+ Multi-stage pipelines have many interacting parameters. Tuning them requires more than ad-hoc scripts. spindle-eval provides:
57
+
58
+ - **Stage-gated evaluation** — each stage must meet quality thresholds before downstream stages run, enforcing upstream-first optimization
59
+ - **Pipeline-agnostic execution** — define stages with the `Stage` protocol, wire them with `StageDef`, run them with `PipelineExecutor`
60
+ - **Composable configs** — Hydra config groups for every pipeline aspect, enabling single runs or multi-dimensional parameter sweeps
61
+ - **Multiple tracking backends** — MLflow for experiments, file-based for CI, composite for multi-backend, no-op for benchmarking
62
+ - **Structured events** — thread-safe event store with duration analysis, token tracking, and error filtering
63
+ - **KOS metrics** — intrinsic quality metrics for SKOS taxonomies and OWL ontologies (taxonomy depth, label quality, SHACL conformance, etc.)
64
+ - **Automated regression detection** — CI compares metrics against baselines with bootstrap confidence intervals
65
+ - **Golden dataset management** — versioned evaluation datasets with a question-type taxonomy and extensible reference fields for extraction and KOS evaluation
66
+
67
+ ## Architecture overview
68
+
69
+ ```
70
+ ┌─────────────────────────────┐
71
+ │ Hydra Configuration │
72
+ │ (composable YAML per stage) │
73
+ └──────────────┬──────────────┘
74
+
75
+ ┌──────────────▼──────────────┐
76
+ │ spindle-eval runner │
77
+ │ (discovery + orchestration) │
78
+ └──────────────┬──────────────┘
79
+
80
+ ┌──────────────▼──────────────┐
81
+ │ PipelineExecutor │
82
+ │ (stage wiring, metrics, │
83
+ │ gates, event logging) │
84
+ └──────────────┬──────────────┘
85
+
86
+ ┌────────────┬───────────┼───────────┬────────────┐
87
+ ▼ ▼ ▼ ▼ ▼
88
+ Stage 1 Stage 2 Stage 3 Stage N Metric fns
89
+ (any) (any) (any) (any) (attached)
90
+ │ │ │ │ │
91
+ └────────────┴───────────┴───────────┴────────────┘
92
+
93
+ ┌──────────────▼──────────────┐
94
+ │ Tracker backends │
95
+ ├──────────┬─────────┬────────┤
96
+ ▼ ▼ ▼ ▼
97
+ MLflow File Langfuse No-op
98
+ (experiments) (JSON) (traces) (benchmarks)
99
+ ```
100
+
101
+ ## Installation
102
+
103
+ ```bash
104
+ pip install spindle-eval
105
+ ```
106
+
107
+ For co-development with a pipeline package (editable install):
108
+
109
+ ```bash
110
+ pip install -e ".[dev]"
111
+ pip install -e /path/to/your-pipeline
112
+ ```
113
+
114
+ ## Quick start
115
+
116
+ ### Full pipeline evaluation
117
+
118
+ ```bash
119
+ # Single evaluation run
120
+ python -m spindle_eval.runner retrieval=hybrid generation=claude evaluation=quick
121
+
122
+ # Parameter sweep
123
+ python -m spindle_eval.runner --multirun \
124
+ preprocessing.chunk_size=256,512,1024 \
125
+ retrieval.top_k=5,10,20
126
+ ```
127
+
128
+ ### Evaluate a single stage
129
+
130
+ ```python
131
+ from spindle_eval.pipeline import PipelineExecutor
132
+ from spindle_eval.protocols import StageDef, StageResult
133
+ from spindle_eval.tracking import create_tracker
134
+ from spindle_eval.metrics.chunk_metrics import boundary_coherence, size_distribution
135
+
136
+ class MyChunker:
137
+ name = "chunking"
138
+ def run(self, inputs, cfg):
139
+ chunks = do_chunking(cfg)
140
+ return StageResult(outputs={"chunks": chunks})
141
+
142
+ tracker = create_tracker("file", output_dir="./results")
143
+ stages = [
144
+ StageDef(
145
+ name="chunking",
146
+ stage=MyChunker(),
147
+ metrics=[boundary_coherence, size_distribution],
148
+ ),
149
+ ]
150
+ result = PipelineExecutor(tracker).execute(stages, cfg)
151
+ tracker.end_run()
152
+ ```
153
+
154
+ ### Evaluate a KOS builder
155
+
156
+ ```python
157
+ from spindle_eval.metrics.kos_metrics import taxonomy_depth, label_quality, orphan_concept_ratio
158
+
159
+ stages = [
160
+ StageDef(
161
+ name="taxonomy",
162
+ stage=MyTaxonomyBuilder(),
163
+ input_keys={"chunks": "preprocessing.chunks"},
164
+ metrics=[taxonomy_depth, label_quality, orphan_concept_ratio],
165
+ gate=lambda m: m.get("orphan_concept_ratio", 1.0) < 0.3,
166
+ ),
167
+ ]
168
+ ```
169
+
170
+ ## Configuration
171
+
172
+ Hydra config groups live in `spindle_eval/conf/` (packaged with the install) and compose together:
173
+
174
+ | Group | Options | Controls |
175
+ |---|---|---|
176
+ | `preprocessing` | `default`, `small_chunks`, `large_chunks` | Chunking strategy and size |
177
+ | `ontology` | `schema_first`, `schema_free`, `hybrid` | Entity/relation schema discovery |
178
+ | `extraction` | `llm`, `nlp`, `finetuned` | Triple extraction method |
179
+ | `retrieval` | `hybrid`, `local`, `global`, `drift` | Graph retrieval strategy |
180
+ | `generation` | `gpt4`, `claude`, `gemini` | LLM for answer generation |
181
+ | `evaluation` | `quick`, `full` | Number of evaluation examples |
182
+ | `sweep` | `none`, `er_threshold`, `retrieval`, `chunk_size` | Predefined sweep dimensions |
183
+
184
+ Pipeline packages can register additional config groups via Hydra's `SearchPathPlugin`. See [docs/hydra-config-conventions.md](docs/hydra-config-conventions.md).
185
+
186
+ ## Metrics
187
+
188
+ ### RAG quality (via Ragas)
189
+ Faithfulness, context recall, context precision, answer correctness, answer relevancy.
190
+
191
+ ### Graph quality
192
+ Connectivity, modularity, B-CUBED clustering, CEAF entity alignment, subgraph completeness.
193
+
194
+ ### Extraction quality
195
+ Triple extraction precision, recall, and F1 — with configurable stage gates.
196
+
197
+ ### KOS quality
198
+ Taxonomy depth/breadth, label quality, definition completeness, thesaurus connectivity, orphan ratio, axiom density, SHACL conformance. See [docs/kos-evaluation-guide.md](docs/kos-evaluation-guide.md).
199
+
200
+ ### Chunk and provenance quality
201
+ Boundary coherence, size distribution, evidence span coverage.
202
+
203
+ ### Statistical rigor
204
+ Bootstrap confidence intervals for all metrics, used for regression detection in CI.
205
+
206
+ ## Tracking backends
207
+
208
+ | Backend | Class | Use case |
209
+ |---------|-------|----------|
210
+ | MLflow | `MLflowTracker` | Production experiment tracking |
211
+ | File | `FileTracker` | Local development, CI |
212
+ | Langfuse | Via OpenTelemetry | Trace-level debugging |
213
+ | No-op | `NoOpTracker` | Benchmarking, unit tests |
214
+ | Composite | `CompositeTracker` | Fan out to multiple backends |
215
+
216
+ ```python
217
+ from spindle_eval.tracking import create_tracker
218
+
219
+ tracker = create_tracker("mlflow")
220
+ tracker = create_tracker("file", output_dir="./results")
221
+ tracker = create_tracker("noop")
222
+ ```
223
+
224
+ ## Documentation
225
+
226
+ | Guide | Audience |
227
+ |-------|----------|
228
+ | [Spindle Developer Guide](docs/spindle-developer-guide.md) | Pipeline developers integrating with spindle-eval |
229
+ | [Custom Pipeline Guide](docs/custom-pipeline-guide.md) | Developers building non-spindle pipelines |
230
+ | [KOS Evaluation Guide](docs/kos-evaluation-guide.md) | Developers evaluating SKOS/OWL knowledge structures |
231
+ | [Hydra Config Conventions](docs/hydra-config-conventions.md) | Config authors and sweep designers |
232
+ | [Tracking Setup](docs/tracking_setup.md) | Setting up MLflow/Langfuse (GKE or local Docker) |
233
+ | [PyPI Publishing](docs/pypi-publish.md) | Building and uploading releases to PyPI |
234
+
235
+ ## Requirements
236
+
237
+ - Python 3.10+
238
+ - Pipeline package (optional — mocks used if unavailable, controlled via `runner.allow_mock_fallback`)
239
+
240
+ ## Project structure
241
+
242
+ ```
243
+ spindle-eval/
244
+ ├── src/spindle_eval/
245
+ │ ├── runner.py # Hydra entrypoint, pipeline discovery
246
+ │ ├── pipeline.py # PipelineExecutor (stage wiring, metrics, gates)
247
+ │ ├── protocols.py # Stage, StageDef, StageResult, Tracker protocols
248
+ │ ├── compat.py # Legacy component dict → StageDef adapter
249
+ │ ├── mocks.py # Mock Stage implementations for testing
250
+ │ ├── metrics/ # Ragas, graph, extraction, KOS, chunk, provenance
251
+ │ ├── tracking/ # MLflow, file, noop, composite trackers
252
+ │ ├── events/ # Event store, duration/token/error analysis
253
+ │ ├── datasets/ # Golden dataset loading, KOS reference extraction
254
+ │ ├── baselines/ # Baseline runner implementations
255
+ │ ├── ci/ # Regression detection, PR report generation
256
+ │ └── production/ # Feedback loops, staleness monitoring
257
+ │ ├── conf/ # Hydra config groups (packaged for pip install)
258
+ │ └── golden_data/ # Default evaluation datasets (JSONL)
259
+ ├── docs/ # Developer guides
260
+ ├── baselines/ # Baseline metric snapshots
261
+ └── tests/
262
+ ```
@@ -0,0 +1,215 @@
1
+ # spindle-eval
2
+
3
+ Pipeline-agnostic evaluation and observability framework for knowledge graph, RAG, and KOS pipelines. spindle-eval wraps any pipeline defined as a sequence of `Stage` objects with structured experiment tracking, automated metrics, parameter sweeps, quality gates, baseline comparisons, and CI/CD regression detection.
4
+
5
+ Originally built for [spindle](https://github.com/danielkentwood/spindle) (a Graph RAG pipeline), spindle-eval is designed to evaluate **any** pipeline — full end-to-end systems, individual stages, or partial subsets.
6
+
7
+ ## Why spindle-eval?
8
+
9
+ Multi-stage pipelines have many interacting parameters. Tuning them requires more than ad-hoc scripts. spindle-eval provides:
10
+
11
+ - **Stage-gated evaluation** — each stage must meet quality thresholds before downstream stages run, enforcing upstream-first optimization
12
+ - **Pipeline-agnostic execution** — define stages with the `Stage` protocol, wire them with `StageDef`, run them with `PipelineExecutor`
13
+ - **Composable configs** — Hydra config groups for every pipeline aspect, enabling single runs or multi-dimensional parameter sweeps
14
+ - **Multiple tracking backends** — MLflow for experiments, file-based for CI, composite for multi-backend, no-op for benchmarking
15
+ - **Structured events** — thread-safe event store with duration analysis, token tracking, and error filtering
16
+ - **KOS metrics** — intrinsic quality metrics for SKOS taxonomies and OWL ontologies (taxonomy depth, label quality, SHACL conformance, etc.)
17
+ - **Automated regression detection** — CI compares metrics against baselines with bootstrap confidence intervals
18
+ - **Golden dataset management** — versioned evaluation datasets with a question-type taxonomy and extensible reference fields for extraction and KOS evaluation
19
+
20
+ ## Architecture overview
21
+
22
+ ```
23
+ ┌─────────────────────────────┐
24
+ │ Hydra Configuration │
25
+ │ (composable YAML per stage) │
26
+ └──────────────┬──────────────┘
27
+
28
+ ┌──────────────▼──────────────┐
29
+ │ spindle-eval runner │
30
+ │ (discovery + orchestration) │
31
+ └──────────────┬──────────────┘
32
+
33
+ ┌──────────────▼──────────────┐
34
+ │ PipelineExecutor │
35
+ │ (stage wiring, metrics, │
36
+ │ gates, event logging) │
37
+ └──────────────┬──────────────┘
38
+
39
+ ┌────────────┬───────────┼───────────┬────────────┐
40
+ ▼ ▼ ▼ ▼ ▼
41
+ Stage 1 Stage 2 Stage 3 Stage N Metric fns
42
+ (any) (any) (any) (any) (attached)
43
+ │ │ │ │ │
44
+ └────────────┴───────────┴───────────┴────────────┘
45
+
46
+ ┌──────────────▼──────────────┐
47
+ │ Tracker backends │
48
+ ├──────────┬─────────┬────────┤
49
+ ▼ ▼ ▼ ▼
50
+ MLflow File Langfuse No-op
51
+ (experiments) (JSON) (traces) (benchmarks)
52
+ ```
53
+
54
+ ## Installation
55
+
56
+ ```bash
57
+ pip install spindle-eval
58
+ ```
59
+
60
+ For co-development with a pipeline package (editable install):
61
+
62
+ ```bash
63
+ pip install -e ".[dev]"
64
+ pip install -e /path/to/your-pipeline
65
+ ```
66
+
67
+ ## Quick start
68
+
69
+ ### Full pipeline evaluation
70
+
71
+ ```bash
72
+ # Single evaluation run
73
+ python -m spindle_eval.runner retrieval=hybrid generation=claude evaluation=quick
74
+
75
+ # Parameter sweep
76
+ python -m spindle_eval.runner --multirun \
77
+ preprocessing.chunk_size=256,512,1024 \
78
+ retrieval.top_k=5,10,20
79
+ ```
80
+
81
+ ### Evaluate a single stage
82
+
83
+ ```python
84
+ from spindle_eval.pipeline import PipelineExecutor
85
+ from spindle_eval.protocols import StageDef, StageResult
86
+ from spindle_eval.tracking import create_tracker
87
+ from spindle_eval.metrics.chunk_metrics import boundary_coherence, size_distribution
88
+
89
+ class MyChunker:
90
+ name = "chunking"
91
+ def run(self, inputs, cfg):
92
+ chunks = do_chunking(cfg)
93
+ return StageResult(outputs={"chunks": chunks})
94
+
95
+ tracker = create_tracker("file", output_dir="./results")
96
+ stages = [
97
+ StageDef(
98
+ name="chunking",
99
+ stage=MyChunker(),
100
+ metrics=[boundary_coherence, size_distribution],
101
+ ),
102
+ ]
103
+ result = PipelineExecutor(tracker).execute(stages, cfg)
104
+ tracker.end_run()
105
+ ```
106
+
107
+ ### Evaluate a KOS builder
108
+
109
+ ```python
110
+ from spindle_eval.metrics.kos_metrics import taxonomy_depth, label_quality, orphan_concept_ratio
111
+
112
+ stages = [
113
+ StageDef(
114
+ name="taxonomy",
115
+ stage=MyTaxonomyBuilder(),
116
+ input_keys={"chunks": "preprocessing.chunks"},
117
+ metrics=[taxonomy_depth, label_quality, orphan_concept_ratio],
118
+ gate=lambda m: m.get("orphan_concept_ratio", 1.0) < 0.3,
119
+ ),
120
+ ]
121
+ ```
122
+
123
+ ## Configuration
124
+
125
+ Hydra config groups live in `spindle_eval/conf/` (packaged with the install) and compose together:
126
+
127
+ | Group | Options | Controls |
128
+ |---|---|---|
129
+ | `preprocessing` | `default`, `small_chunks`, `large_chunks` | Chunking strategy and size |
130
+ | `ontology` | `schema_first`, `schema_free`, `hybrid` | Entity/relation schema discovery |
131
+ | `extraction` | `llm`, `nlp`, `finetuned` | Triple extraction method |
132
+ | `retrieval` | `hybrid`, `local`, `global`, `drift` | Graph retrieval strategy |
133
+ | `generation` | `gpt4`, `claude`, `gemini` | LLM for answer generation |
134
+ | `evaluation` | `quick`, `full` | Number of evaluation examples |
135
+ | `sweep` | `none`, `er_threshold`, `retrieval`, `chunk_size` | Predefined sweep dimensions |
136
+
137
+ Pipeline packages can register additional config groups via Hydra's `SearchPathPlugin`. See [docs/hydra-config-conventions.md](docs/hydra-config-conventions.md).
138
+
139
+ ## Metrics
140
+
141
+ ### RAG quality (via Ragas)
142
+ Faithfulness, context recall, context precision, answer correctness, answer relevancy.
143
+
144
+ ### Graph quality
145
+ Connectivity, modularity, B-CUBED clustering, CEAF entity alignment, subgraph completeness.
146
+
147
+ ### Extraction quality
148
+ Triple extraction precision, recall, and F1 — with configurable stage gates.
149
+
150
+ ### KOS quality
151
+ Taxonomy depth/breadth, label quality, definition completeness, thesaurus connectivity, orphan ratio, axiom density, SHACL conformance. See [docs/kos-evaluation-guide.md](docs/kos-evaluation-guide.md).
152
+
153
+ ### Chunk and provenance quality
154
+ Boundary coherence, size distribution, evidence span coverage.
155
+
156
+ ### Statistical rigor
157
+ Bootstrap confidence intervals for all metrics, used for regression detection in CI.
158
+
159
+ ## Tracking backends
160
+
161
+ | Backend | Class | Use case |
162
+ |---------|-------|----------|
163
+ | MLflow | `MLflowTracker` | Production experiment tracking |
164
+ | File | `FileTracker` | Local development, CI |
165
+ | Langfuse | Via OpenTelemetry | Trace-level debugging |
166
+ | No-op | `NoOpTracker` | Benchmarking, unit tests |
167
+ | Composite | `CompositeTracker` | Fan out to multiple backends |
168
+
169
+ ```python
170
+ from spindle_eval.tracking import create_tracker
171
+
172
+ tracker = create_tracker("mlflow")
173
+ tracker = create_tracker("file", output_dir="./results")
174
+ tracker = create_tracker("noop")
175
+ ```
176
+
177
+ ## Documentation
178
+
179
+ | Guide | Audience |
180
+ |-------|----------|
181
+ | [Spindle Developer Guide](docs/spindle-developer-guide.md) | Pipeline developers integrating with spindle-eval |
182
+ | [Custom Pipeline Guide](docs/custom-pipeline-guide.md) | Developers building non-spindle pipelines |
183
+ | [KOS Evaluation Guide](docs/kos-evaluation-guide.md) | Developers evaluating SKOS/OWL knowledge structures |
184
+ | [Hydra Config Conventions](docs/hydra-config-conventions.md) | Config authors and sweep designers |
185
+ | [Tracking Setup](docs/tracking_setup.md) | Setting up MLflow/Langfuse (GKE or local Docker) |
186
+ | [PyPI Publishing](docs/pypi-publish.md) | Building and uploading releases to PyPI |
187
+
188
+ ## Requirements
189
+
190
+ - Python 3.10+
191
+ - Pipeline package (optional — mocks used if unavailable, controlled via `runner.allow_mock_fallback`)
192
+
193
+ ## Project structure
194
+
195
+ ```
196
+ spindle-eval/
197
+ ├── src/spindle_eval/
198
+ │ ├── runner.py # Hydra entrypoint, pipeline discovery
199
+ │ ├── pipeline.py # PipelineExecutor (stage wiring, metrics, gates)
200
+ │ ├── protocols.py # Stage, StageDef, StageResult, Tracker protocols
201
+ │ ├── compat.py # Legacy component dict → StageDef adapter
202
+ │ ├── mocks.py # Mock Stage implementations for testing
203
+ │ ├── metrics/ # Ragas, graph, extraction, KOS, chunk, provenance
204
+ │ ├── tracking/ # MLflow, file, noop, composite trackers
205
+ │ ├── events/ # Event store, duration/token/error analysis
206
+ │ ├── datasets/ # Golden dataset loading, KOS reference extraction
207
+ │ ├── baselines/ # Baseline runner implementations
208
+ │ ├── ci/ # Regression detection, PR report generation
209
+ │   ├── production/        # Feedback loops, staleness monitoring
210
+ │ ├── conf/ # Hydra config groups (packaged for pip install)
211
+ │ └── golden_data/ # Default evaluation datasets (JSONL)
212
+ ├── docs/ # Developer guides
213
+ ├── baselines/ # Baseline metric snapshots
214
+ └── tests/
215
+ ```
@@ -0,0 +1,87 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "spindle-eval"
7
+ version = "0.1.0"
8
+ description = "Pipeline-agnostic evaluation and observability for knowledge graph, RAG, and KOS pipelines"
9
+ readme = "README.md"
10
+ license = {text = "MIT"}
11
+ requires-python = ">=3.10"
12
+ authors = [{name = "Spindle Team"}]
13
+ keywords = [
14
+ "graph-rag",
15
+ "evaluation",
16
+ "mlflow",
17
+ "experimentation",
18
+ "rag",
19
+ "knowledge-graph",
20
+ "hydra",
21
+ ]
22
+ classifiers = [
23
+ "Development Status :: 3 - Alpha",
24
+ "Intended Audience :: Developers",
25
+ "License :: OSI Approved :: MIT License",
26
+ "Programming Language :: Python :: 3",
27
+ "Programming Language :: Python :: 3.10",
28
+ "Programming Language :: Python :: 3.11",
29
+ "Programming Language :: Python :: 3.12",
30
+ ]
31
+ dependencies = [
32
+ "mlflow>=3.0",
33
+ "hydra-core>=1.3",
34
+ "omegaconf>=2.3",
35
+ "ragas>=0.2",
36
+ "optuna>=2.10,<3.0",
37
+ "hydra-optuna-sweeper>=1.2",
38
+ "langfuse>=2.0",
39
+ "opentelemetry-sdk>=1.20",
40
+ "opentelemetry-exporter-otlp>=1.20",
41
+ "scipy>=1.10",
42
+ "numpy>=1.24",
43
+ "scikit-learn>=1.3",
44
+ "rank-bm25>=0.2",
45
+ "sentence-transformers>=2.2",
46
+ "rdflib>=7.0",
47
+ "pyshacl>=0.25",
48
+ ]
49
+
50
+ [project.optional-dependencies]
51
+ dev = [
52
+ "pytest>=7.0",
53
+ "pytest-cov>=4.0",
54
+ "ruff>=0.1",
55
+ "build>=1.0",
56
+ "twine>=5.0",
57
+ ]
58
+ spindle = [
59
+ "spindle>=0.1.0",
60
+ ]
61
+
62
+ [project.urls]
63
+ Homepage = "https://github.com/danielkentwood/spindle-eval"
64
+ Documentation = "https://github.com/danielkentwood/spindle-eval#readme"
65
+ Repository = "https://github.com/danielkentwood/spindle-eval"
66
+ Changelog = "https://github.com/danielkentwood/spindle-eval/releases"
67
+
68
+ [project.scripts]
69
+ spindle-eval = "spindle_eval.runner:main"
70
+
71
+ [tool.setuptools.packages.find]
72
+ where = ["src"]
73
+
74
+ [tool.setuptools.package-data]
75
+ spindle_eval = ["conf/**/*.yaml", "golden_data/**/*.jsonl", "golden_data/**/*.md"]
76
+
77
+ [tool.ruff]
78
+ line-length = 88
79
+ target-version = "py310"
80
+
81
+ [tool.ruff.lint]
82
+ select = ["E", "F", "I", "N", "W", "UP"]
83
+ ignore = []
84
+
85
+ [tool.pytest.ini_options]
86
+ testpaths = ["tests"]
87
+ addopts = "-v --tb=short"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,3 @@
"""spindle-eval: experimentation infrastructure for Graph RAG pipelines."""

# Single source of truth for the package version; keep in sync with
# the `version` field in pyproject.toml when cutting a release.
__version__ = "0.1.0"
@@ -0,0 +1,5 @@
"""Baseline implementations for fair Graph RAG comparisons.

Re-exports the shared baseline protocol and result dataclass so callers
can import them from the package root.
"""

from spindle_eval.baselines.base import BaselineResult, BaselineRunner

__all__ = ["BaselineRunner", "BaselineResult"]
@@ -0,0 +1,21 @@
"""Baseline protocol and shared result structures."""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Any, Protocol


@dataclass
class BaselineResult:
    """Outcome of a single baseline invocation.

    Attributes:
        answer: The generated answer text.
        contexts: Retrieved context passages that supported the answer.
        metadata: Free-form extras (e.g. baseline name, retrieval params).
            A fresh dict per instance via default_factory, so results never
            share mutable state.
    """

    answer: str
    contexts: list[str]
    metadata: dict[str, Any] = field(default_factory=dict)


class BaselineRunner(Protocol):
    """Structural interface every baseline system implements.

    Any object with a ``name`` attribute and a matching ``run`` method
    satisfies this protocol — no inheritance required.
    """

    name: str

    def run(self, question: str, **kwargs: Any) -> BaselineResult:
        """Answer *question*, returning the result with its contexts."""
        ...
@@ -0,0 +1,42 @@
"""BM25 lexical retrieval baseline."""

from __future__ import annotations

from typing import Any, Callable

from rank_bm25 import BM25Okapi

from spindle_eval.baselines.base import BaselineResult


class BM25Baseline:
    """Answer questions via BM25-ranked passages fed to an LLM callable."""

    name = "bm25"

    def __init__(
        self,
        corpus: list[str],
        llm_callable: Callable[[str], str],
        top_k: int = 5,
    ) -> None:
        """Index *corpus* with BM25Okapi once, up front.

        Args:
            corpus: Candidate passages to retrieve from.
            llm_callable: Takes a prompt string, returns the answer string.
            top_k: Default number of passages included in the prompt.
        """
        self._corpus = corpus
        # Whitespace tokenization after lowercasing — matches how queries
        # are tokenized in run(), which BM25 scoring requires.
        self._tokenized = [document.lower().split() for document in corpus]
        self._bm25 = BM25Okapi(self._tokenized)
        self._llm_callable = llm_callable
        self._top_k = top_k

    def run(self, question: str, **kwargs: Any) -> BaselineResult:
        """Retrieve the top-k passages for *question* and generate an answer.

        Accepts an optional ``top_k`` keyword override; any other kwargs
        are ignored.
        """
        k = int(kwargs.get("top_k", self._top_k))
        scores = self._bm25.get_scores(question.lower().split())
        # Stable sort of indices on descending score: ties keep original
        # corpus order, identical to sorting (doc, score) pairs directly.
        order = sorted(
            range(len(self._corpus)),
            key=lambda idx: scores[idx],
            reverse=True,
        )
        contexts = [self._corpus[idx] for idx in order[:k]]
        prompt = f"Question: {question}\n\nContext:\n" + "\n".join(contexts)
        answer = self._llm_callable(prompt)
        return BaselineResult(
            answer=answer,
            contexts=contexts,
            metadata={"baseline": self.name, "top_k": k},
        )