ragradar-evaluate 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. ragradar_evaluate-0.1.0/.gitignore +38 -0
  2. ragradar_evaluate-0.1.0/PKG-INFO +162 -0
  3. ragradar_evaluate-0.1.0/README.md +135 -0
  4. ragradar_evaluate-0.1.0/pyproject.toml +51 -0
  5. ragradar_evaluate-0.1.0/src/ragradar_evaluate/__init__.py +43 -0
  6. ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/__init__.py +0 -0
  7. ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/builder.py +117 -0
  8. ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/checker.py +79 -0
  9. ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/exporter.py +43 -0
  10. ragradar_evaluate-0.1.0/src/ragradar_evaluate/benchmark/seeder.py +89 -0
  11. ragradar_evaluate-0.1.0/src/ragradar_evaluate/cli.py +390 -0
  12. ragradar_evaluate-0.1.0/src/ragradar_evaluate/facade.py +535 -0
  13. ragradar_evaluate-0.1.0/src/ragradar_evaluate/layers/__init__.py +0 -0
  14. ragradar_evaluate-0.1.0/src/ragradar_evaluate/layers/input_quality.py +327 -0
  15. ragradar_evaluate-0.1.0/src/ragradar_evaluate/layers/output_quality.py +83 -0
  16. ragradar_evaluate-0.1.0/src/ragradar_evaluate/policy/__init__.py +0 -0
  17. ragradar_evaluate-0.1.0/src/ragradar_evaluate/policy/persistence.py +29 -0
  18. ragradar_evaluate-0.1.0/src/ragradar_evaluate/policy/risk.py +47 -0
  19. ragradar_evaluate-0.1.0/src/ragradar_evaluate/policy/schema.py +40 -0
  20. ragradar_evaluate-0.1.0/tests/conftest.py +206 -0
  21. ragradar_evaluate-0.1.0/tests/test_benchmark.py +228 -0
  22. ragradar_evaluate-0.1.0/tests/test_cli.py +201 -0
  23. ragradar_evaluate-0.1.0/tests/test_facade.py +315 -0
  24. ragradar_evaluate-0.1.0/tests/test_input_quality.py +269 -0
  25. ragradar_evaluate-0.1.0/tests/test_lazy_imports.py +21 -0
  26. ragradar_evaluate-0.1.0/tests/test_policy.py +39 -0
  27. ragradar_evaluate-0.1.0/tests/test_risk.py +48 -0
  28. ragradar_evaluate-0.1.0/tests/test_user_tasks.py +136 -0
@@ -0,0 +1,38 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.egg-info/
5
+ .venv/
6
+ dist/
7
+ build/
8
+ *.so
9
+
10
+ # uv
11
+ .uv/
12
+ uv.lock
13
+
14
+ # ragradar runtime — never commit user run data
15
+ .ragradar/
16
+
17
+ # environment
18
+ .env
19
+ *.env
20
+ .env.*
21
+
22
+ # IDE
23
+ .vscode/
24
+ .idea/
25
+ *.swp
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
30
+
31
+ # test output
32
+ .pytest_cache/
33
+ htmlcov/
34
+ .coverage
35
+
36
+ # example output
37
+ examples/rag_pipeline/output/
38
+ .claude/
@@ -0,0 +1,162 @@
1
+ Metadata-Version: 2.4
2
+ Name: ragradar-evaluate
3
+ Version: 0.1.0
4
+ Summary: Evaluation layer for ragradar observability system
5
+ Project-URL: Homepage, https://github.com/pleokarthik/RAGRadar
6
+ Project-URL: Repository, https://github.com/pleokarthik/RAGRadar
7
+ Project-URL: Issues, https://github.com/pleokarthik/RAGRadar/issues
8
+ Author-email: Leo Karthik Paramasivan <pleokarthik@gmail.com>
9
+ License-Expression: MIT
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Requires-Python: >=3.11
17
+ Requires-Dist: click>=8.0
18
+ Requires-Dist: ragas>=0.2
19
+ Requires-Dist: ragradar-core<0.2.0,>=0.1.0
20
+ Requires-Dist: rich>=13.0
21
+ Requires-Dist: scipy>=1.13
22
+ Provides-Extra: ollama
23
+ Requires-Dist: ollama>=0.2; extra == 'ollama'
24
+ Provides-Extra: transformers
25
+ Requires-Dist: sentence-transformers>=3.0; extra == 'transformers'
26
+ Description-Content-Type: text/markdown
27
+
28
+ # ragradar-evaluate
29
+
30
+ Scores captured runs. Two tasks, one discovery helper:
31
+
32
+ | Task | Call | Cost |
33
+ |---|---|---|
34
+ | "Is this run healthy?" | `check(run_id)` | free — deterministic, no LLM, instant |
35
+ | "Score it fully" | `evaluate(run_id)` | free input metrics + LLM-judged output metrics |
36
+ | "What can be scored?" | `available_metrics()` | free |
37
+
38
+ ```
39
+ pip install ragradar-evaluate
40
+ ```
41
+
42
+ ## Is this run healthy? — check()
43
+
44
+ Call before paying for an LLM; put it in CI.
45
+
46
+ ```python
47
+ import ragradar_capture
48
+ from ragradar_evaluate import check
49
+
50
+ run_id = ragradar_capture.capture(
51
+ "what is RRF?", "RRF fuses rankings.",
52
+ chunks=[{"chunk_id": "c1", "source_doc_id": "d1",
53
+ "content": "RRF combines rankings.", "token_count": 10,
54
+ "rerank_score": 0.9}],
55
+ )
56
+ result = check(run_id)
57
+
58
+ print(result.verdict) # "ok" | "warn" | "fail"
59
+ print(result.problems) # ["duplicate chunks: ratio 0.50 exceeds 0.20", ...]
60
+ print(result.risk_score) # 0.0-1.0, None if input quality couldn't be assessed
61
+ print(result.factors) # per-factor {value, threshold, status}
62
+ print(result.thresholds) # "learned" | "policy" — which standards were applied
63
+ ```
64
+
65
+ `check()` compares all free input metrics against the **current
66
+ standards**: once at least 10 evaluated runs exist for the pipeline it
67
+ uses thresholds learned from your own history (and says so via
68
+ `thresholds == "learned"`); before that it falls back to the policy
69
+ defaults. A run captured without chunks gets a `warn` verdict explaining
70
+ the missing data — never an exception.
71
+
72
+ ## Score it fully — evaluate()
73
+
74
+ ```python
75
+ import ragradar_capture
76
+ from ragradar_evaluate import evaluate
77
+
78
+ run_id = ragradar_capture.capture(
79
+ "what is RRF?", "RRF fuses rankings.",
80
+ chunks=[{"chunk_id": "c1", "source_doc_id": "d1",
81
+ "content": "RRF combines rankings.", "token_count": 10,
82
+ "rerank_score": 0.9}],
83
+ )
84
+
85
+ # Complete eval: every metric applicable to the record.
86
+ result = evaluate(run_id)
87
+
88
+ # One atomic metric — nothing else is computed:
89
+ result = evaluate(run_id, metrics=["duplicates"], save=False)
90
+
91
+ # A chosen subset:
92
+ result = evaluate(run_id, metrics=["relevance", "faithfulness"])
93
+ ```
94
+
95
+ `target` can be an sNrN string (what `ragradar_capture.capture()` returns), a
96
+ committed `Capture` object, or a bare `RunRecord` (then pass
97
+ `save=False` — there's no run row to write to).
98
+
99
+ ### EvalResult
100
+
101
+ | Field | Meaning |
102
+ |---|---|
103
+ | `metrics` | per-metric results: a dict of values for input families, a float for RAGAS metrics |
104
+ | `skipped` | metric → reason: `"not requested"`, `"missing data: ..."`, or `"requires ground_truth"` |
105
+ | `errors` | metric → error string; RAGAS-not-installed and RAGAS runtime failures land here identically — `evaluate()` never raises for judge failures |
106
+ | `policy_violations` | policy thresholds breached by the computed values |
107
+ | `risk_score` | `None` when input metrics weren't computed; `0.0` only ever means "computed, no risk" |
108
+ | `run_id` / `saved` | identity and whether scores were persisted |
109
+
110
+ `save=True` (default) persists via the one store path; `ragradar explain
111
+ <run_id>` then shows the scores alongside its analysis.
112
+
113
+ ## available_metrics()
114
+
115
+ | Metric | Layer | Cost | Requires |
116
+ |---|---|---|---|
117
+ | `relevance` | input | free | chunks |
118
+ | `duplicates` | input | free | chunks |
119
+ | `truncation` | input | free | chunks |
120
+ | `token_efficiency` | input | free | chunks |
121
+ | `coherence` | input | free | chunks |
122
+ | `faithfulness` | output | llm | chunks, response |
123
+ | `answer_relevancy` | output | llm | chunks, response |
124
+ | `context_precision` | output | llm | chunks, response |
125
+ | `context_recall` | output | llm | chunks, response, ground_truth |
126
+
127
+ Output metrics are RAGAS LLM-as-judge calls — they cost money and need a
128
+ configured judge. **To stay free-only**, use `check()`, or select input
129
+ metrics explicitly: `evaluate(run_id, metrics=["relevance",
130
+ "duplicates", "truncation", "token_efficiency", "coherence"])`.
131
+
132
+ ## Policy system
133
+
134
+ Human-set thresholds encoding known failure modes; active from day one
135
+ and the fallback standard for `check()`. Stored per pipeline.
136
+
137
+ ```bash
138
+ ragradar-evaluate policy show
139
+ ragradar-evaluate policy set max_duplicate_ratio 0.1
140
+ ragradar-evaluate policy reset
141
+ ```
142
+
143
+ Programmatic override: `evaluate(run_id, policy=InputQualityPolicy(...))`.
144
+
145
+ ## Benchmark lifecycle (CLI)
146
+
147
+ Learned thresholds accumulate as you evaluate real runs — `check()`
148
+ picks them up automatically at 10+ evaluated runs. The CLI exposes the
149
+ machinery for inspection:
150
+
151
+ ```bash
152
+ ragradar-evaluate run s2r3 # evaluate one run (both layers)
153
+ ragradar-evaluate run s2r3 --input-only # free metrics only
154
+ ragradar-evaluate run --session s2 # evaluate a whole session
155
+ ragradar-evaluate benchmark show # current learned thresholds
156
+ ragradar-evaluate benchmark build # rebuild from evaluated history
157
+ ragradar-evaluate benchmark check s2r3 # factor-by-factor threshold check
158
+ ragradar-evaluate benchmark export # RAGAS-compatible JSONL dataset
159
+ ```
160
+
161
+ `--input-only --output-only` together is an error (it would compute
162
+ nothing).
@@ -0,0 +1,135 @@
1
+ # ragradar-evaluate
2
+
3
+ Scores captured runs. Two tasks, one discovery helper:
4
+
5
+ | Task | Call | Cost |
6
+ |---|---|---|
7
+ | "Is this run healthy?" | `check(run_id)` | free — deterministic, no LLM, instant |
8
+ | "Score it fully" | `evaluate(run_id)` | free input metrics + LLM-judged output metrics |
9
+ | "What can be scored?" | `available_metrics()` | free |
10
+
11
+ ```
12
+ pip install ragradar-evaluate
13
+ ```
14
+
15
+ ## Is this run healthy? — check()
16
+
17
+ Call before paying for an LLM; put it in CI.
18
+
19
+ ```python
20
+ import ragradar_capture
21
+ from ragradar_evaluate import check
22
+
23
+ run_id = ragradar_capture.capture(
24
+ "what is RRF?", "RRF fuses rankings.",
25
+ chunks=[{"chunk_id": "c1", "source_doc_id": "d1",
26
+ "content": "RRF combines rankings.", "token_count": 10,
27
+ "rerank_score": 0.9}],
28
+ )
29
+ result = check(run_id)
30
+
31
+ print(result.verdict) # "ok" | "warn" | "fail"
32
+ print(result.problems) # ["duplicate chunks: ratio 0.50 exceeds 0.20", ...]
33
+ print(result.risk_score) # 0.0-1.0, None if input quality couldn't be assessed
34
+ print(result.factors) # per-factor {value, threshold, status}
35
+ print(result.thresholds) # "learned" | "policy" — which standards were applied
36
+ ```
37
+
38
+ `check()` compares all free input metrics against the **current
39
+ standards**: once at least 10 evaluated runs exist for the pipeline it
40
+ uses thresholds learned from your own history (and says so via
41
+ `thresholds == "learned"`); before that it falls back to the policy
42
+ defaults. A run captured without chunks gets a `warn` verdict explaining
43
+ the missing data — never an exception.
44
+
45
+ ## Score it fully — evaluate()
46
+
47
+ ```python
48
+ import ragradar_capture
49
+ from ragradar_evaluate import evaluate
50
+
51
+ run_id = ragradar_capture.capture(
52
+ "what is RRF?", "RRF fuses rankings.",
53
+ chunks=[{"chunk_id": "c1", "source_doc_id": "d1",
54
+ "content": "RRF combines rankings.", "token_count": 10,
55
+ "rerank_score": 0.9}],
56
+ )
57
+
58
+ # Complete eval: every metric applicable to the record.
59
+ result = evaluate(run_id)
60
+
61
+ # One atomic metric — nothing else is computed:
62
+ result = evaluate(run_id, metrics=["duplicates"], save=False)
63
+
64
+ # A chosen subset:
65
+ result = evaluate(run_id, metrics=["relevance", "faithfulness"])
66
+ ```
67
+
68
+ `target` can be an sNrN string (what `ragradar_capture.capture()` returns), a
69
+ committed `Capture` object, or a bare `RunRecord` (then pass
70
+ `save=False` — there's no run row to write to).
71
+
72
+ ### EvalResult
73
+
74
+ | Field | Meaning |
75
+ |---|---|
76
+ | `metrics` | per-metric results: a dict of values for input families, a float for RAGAS metrics |
77
+ | `skipped` | metric → reason: `"not requested"`, `"missing data: ..."`, or `"requires ground_truth"` |
78
+ | `errors` | metric → error string; RAGAS-not-installed and RAGAS runtime failures land here identically — `evaluate()` never raises for judge failures |
79
+ | `policy_violations` | policy thresholds breached by the computed values |
80
+ | `risk_score` | `None` when input metrics weren't computed; `0.0` only ever means "computed, no risk" |
81
+ | `run_id` / `saved` | identity and whether scores were persisted |
82
+
83
+ `save=True` (default) persists via the one store path; `ragradar explain
84
+ <run_id>` then shows the scores alongside its analysis.
85
+
86
+ ## available_metrics()
87
+
88
+ | Metric | Layer | Cost | Requires |
89
+ |---|---|---|---|
90
+ | `relevance` | input | free | chunks |
91
+ | `duplicates` | input | free | chunks |
92
+ | `truncation` | input | free | chunks |
93
+ | `token_efficiency` | input | free | chunks |
94
+ | `coherence` | input | free | chunks |
95
+ | `faithfulness` | output | llm | chunks, response |
96
+ | `answer_relevancy` | output | llm | chunks, response |
97
+ | `context_precision` | output | llm | chunks, response |
98
+ | `context_recall` | output | llm | chunks, response, ground_truth |
99
+
100
+ Output metrics are RAGAS LLM-as-judge calls — they cost money and need a
101
+ configured judge. **To stay free-only**, use `check()`, or select input
102
+ metrics explicitly: `evaluate(run_id, metrics=["relevance",
103
+ "duplicates", "truncation", "token_efficiency", "coherence"])`.
104
+
105
+ ## Policy system
106
+
107
+ Human-set thresholds encoding known failure modes; active from day one
108
+ and the fallback standard for `check()`. Stored per pipeline.
109
+
110
+ ```bash
111
+ ragradar-evaluate policy show
112
+ ragradar-evaluate policy set max_duplicate_ratio 0.1
113
+ ragradar-evaluate policy reset
114
+ ```
115
+
116
+ Programmatic override: `evaluate(run_id, policy=InputQualityPolicy(...))`.
117
+
118
+ ## Benchmark lifecycle (CLI)
119
+
120
+ Learned thresholds accumulate as you evaluate real runs — `check()`
121
+ picks them up automatically at 10+ evaluated runs. The CLI exposes the
122
+ machinery for inspection:
123
+
124
+ ```bash
125
+ ragradar-evaluate run s2r3 # evaluate one run (both layers)
126
+ ragradar-evaluate run s2r3 --input-only # free metrics only
127
+ ragradar-evaluate run --session s2 # evaluate a whole session
128
+ ragradar-evaluate benchmark show # current learned thresholds
129
+ ragradar-evaluate benchmark build # rebuild from evaluated history
130
+ ragradar-evaluate benchmark check s2r3 # factor-by-factor threshold check
131
+ ragradar-evaluate benchmark export # RAGAS-compatible JSONL dataset
132
+ ```
133
+
134
+ `--input-only --output-only` together is an error (it would compute
135
+ nothing).
@@ -0,0 +1,51 @@
1
+ [project]
2
+ name = "ragradar-evaluate"
3
+ version = "0.1.0"
4
+ description = "Evaluation layer for ragradar observability system"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = "MIT"
8
+ authors = [
9
+ { name = "Leo Karthik Paramasivan", email = "pleokarthik@gmail.com" },
10
+ ]
11
+ classifiers = [
12
+ "Development Status :: 3 - Alpha",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Programming Language :: Python :: 3.11",
15
+ "Programming Language :: Python :: 3.12",
16
+ "Intended Audience :: Developers",
17
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
18
+ ]
19
+ dependencies = [
20
+ "ragradar-core>=0.1.0,<0.2.0",
21
+ "ragas>=0.2",
22
+ "scipy>=1.13",
23
+ "rich>=13.0",
24
+ "click>=8.0",
25
+ ]
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/pleokarthik/RAGRadar"
29
+ Repository = "https://github.com/pleokarthik/RAGRadar"
30
+ Issues = "https://github.com/pleokarthik/RAGRadar/issues"
31
+
32
+ [project.optional-dependencies]
33
+ ollama = [
34
+ "ollama>=0.2",
35
+ ]
36
+ transformers = [
37
+ "sentence-transformers>=3.0",
38
+ ]
39
+
40
+ [project.scripts]
41
+ ragradar-evaluate = "ragradar_evaluate.cli:main"
42
+
43
+ [tool.uv.sources]
44
+ ragradar-core = { workspace = true }
45
+
46
+ [build-system]
47
+ requires = ["hatchling"]
48
+ build-backend = "hatchling.build"
49
+
50
+ [tool.hatch.build.targets.wheel]
51
+ packages = ["src/ragradar_evaluate"]
@@ -0,0 +1,43 @@
1
+ from ragradar_core.schema import (
2
+ CacheEvent,
3
+ ChunkRecord,
4
+ RunRecord,
5
+ TokenBudget,
6
+ TokenUsage,
7
+ ToolCallRecord,
8
+ Turn,
9
+ )
10
+
11
+ from ragradar_evaluate.facade import (
12
+ CheckResult,
13
+ EvalResult,
14
+ MetricInfo,
15
+ available_metrics,
16
+ check,
17
+ evaluate,
18
+ )
19
+ from ragradar_evaluate.policy.schema import InputQualityPolicy
20
+
21
+ # NOTE: benchmark machinery (seeding, building, checking, exporting) is
22
+ # internal — the CLI's `benchmark` commands drive it, and check() consults
23
+ # learned thresholds automatically. It is deliberately absent here.
24
+
25
+ __all__ = [
26
+ # User tasks
27
+ "check",
28
+ "evaluate",
29
+ "available_metrics",
30
+ # Result / config types
31
+ "CheckResult",
32
+ "EvalResult",
33
+ "MetricInfo",
34
+ "InputQualityPolicy",
35
+ # Re-exported schema dataclasses so users need only one import.
36
+ "ChunkRecord",
37
+ "TokenBudget",
38
+ "TokenUsage",
39
+ "Turn",
40
+ "CacheEvent",
41
+ "ToolCallRecord",
42
+ "RunRecord",
43
+ ]
@@ -0,0 +1,117 @@
1
+ import json
2
+
3
+ from ragradar_core import store
4
+ from scipy import stats
5
+
6
+ INPUT_FACTORS = [
7
+ "duplicate_ratio",
8
+ "top_chunk_score",
9
+ "high_score_truncations",
10
+ "token_headroom_pct",
11
+ "source_domain_count",
12
+ "low_score_chunk_ratio",
13
+ "mean_relevance",
14
+ "truncated_count",
15
+ "score_variance",
16
+ ]
17
+
18
+ RAGAS_METRICS = ["faithfulness", "answer_relevancy"]
19
+
20
+
21
+ def _suggest_threshold(values: list[float], ragas_scores: list[float]) -> float:
22
+ sorted_vals = sorted(set(values))
23
+ if len(sorted_vals) < 2:
24
+ return sorted_vals[0] if sorted_vals else 0.0
25
+
26
+ best_threshold = sorted_vals[0]
27
+ best_diff = 0.0
28
+
29
+ for i in range(len(sorted_vals) - 1):
30
+ threshold = (sorted_vals[i] + sorted_vals[i + 1]) / 2
31
+ below = [r for v, r in zip(values, ragas_scores) if v <= threshold]
32
+ above = [r for v, r in zip(values, ragas_scores) if v > threshold]
33
+
34
+ if not below or not above:
35
+ continue
36
+
37
+ diff = abs(sum(above) / len(above) - sum(below) / len(below))
38
+ if diff > best_diff:
39
+ best_diff = diff
40
+ best_threshold = threshold
41
+
42
+ return round(best_threshold, 4)
43
+
44
+
45
def build(pipeline: str | None = None) -> dict:
    """Rebuild the benchmark entries for a pipeline from its evaluated runs.

    For every name in ``INPUT_FACTORS`` the function gathers the runs that
    have both that factor and at least one of ``RAGAS_METRICS``, computes the
    Pearson correlation of the factor against each metric, asks
    ``_suggest_threshold`` for a cut-off, and hands all resulting entries to
    ``store.write_benchmark_entries_batch`` in one call.

    A metric that a run did not record is left out of that metric's sample
    set. It is never substituted with ``0.0``, which would read as a genuine
    worst-case score and skew both the correlation and the threshold.

    Args:
        pipeline: Pipeline name passed to ``store.get_all_evaluated_runs``;
            entries are written under ``"__default"`` when it is falsy.

    Returns:
        A dict with ``run_count``, ``pipeline`` and ``factors``; each factor
        maps to its ``<metric>_correlation`` values (``None`` when not
        computable), ``suggested_threshold`` and ``sample_count``.

    Raises:
        ValueError: If fewer than 10 evaluated runs are available.
    """
    runs = store.get_all_evaluated_runs(pipeline)

    if len(runs) < 10:
        raise ValueError(f"Need at least 10 evaluated runs to build benchmark, found {len(runs)}.")

    parsed = [json.loads(r["eval_scores"]) for r in runs]

    pipeline_key = pipeline or "__default"
    min_samples = 3  # floor for both the per-factor and the per-metric sample sets
    factors_result = {}
    batch_entries: list[tuple] = []

    for factor in INPUT_FACTORS:
        factor_values: list[float] = []
        # metric -> (factor values, metric scores), aligned index by index and
        # containing only the runs where that metric was actually recorded.
        samples: dict[str, tuple[list[float], list[float]]] = {
            m: ([], []) for m in RAGAS_METRICS
        }

        for eval_data in parsed:
            input_data = eval_data.get("input") or {}
            output_data = eval_data.get("output") or {}

            fval = input_data.get(factor)
            if fval is None:
                continue

            scores = {
                m: float(output_data[m])
                for m in RAGAS_METRICS
                if output_data.get(m) is not None
            }
            if not scores:
                continue

            fval = float(fval)
            factor_values.append(fval)
            for m, score in scores.items():
                xs, ys = samples[m]
                xs.append(fval)
                ys.append(score)

        if len(factor_values) < min_samples:
            continue

        correlations: dict[str, float | None] = {}
        for m in RAGAS_METRICS:
            xs, ys = samples[m]
            # Pearson is undefined when either side is constant.
            if len(xs) >= min_samples and len(set(xs)) > 1 and len(set(ys)) > 1:
                corr, _ = stats.pearsonr(xs, ys)
                # float(): scipy stubs expose corr as numpy.float64 (_T_co), not float;
                # the cast is the narrowing workaround — no type: ignore needed.
                correlations[f"{m}_correlation"] = round(float(corr), 4)
            else:
                correlations[f"{m}_correlation"] = None

        valid_corrs = [v for v in correlations.values() if v is not None]
        primary_corr = max(valid_corrs, key=abs) if valid_corrs else 0.0

        # Use the metric with the most samples for the threshold search; max()
        # keeps the first maximal item, so ties resolve in RAGAS_METRICS order
        # and the first metric is used whenever every run recorded it.
        primary_metric = max(RAGAS_METRICS, key=lambda m: len(samples[m][0]))
        primary_x, primary_y = samples[primary_metric]
        suggested = _suggest_threshold(primary_x, primary_y)

        batch_entries.append((pipeline_key, factor, suggested, primary_corr, len(factor_values)))

        factors_result[factor] = {
            **correlations,
            "suggested_threshold": suggested,
            "sample_count": len(factor_values),
        }

    store.write_benchmark_entries_batch(batch_entries)

    return {
        "run_count": len(runs),
        "pipeline": pipeline,
        "factors": factors_result,
    }
@@ -0,0 +1,79 @@
1
+ import json
2
+
3
+ from ragradar_core import store
4
+ from ragradar_core.schema import RunRecord
5
+
6
+ from ragradar_evaluate.layers import input_quality
7
+ from ragradar_evaluate.policy.persistence import load_policy
8
+
9
+
10
def check(
    session_id: int,
    run_seq: int,
    pipeline: str | None = None,
) -> dict:
    """Compare one stored run's input factors against the benchmark thresholds.

    The run is loaded from the store, its input quality is scored under the
    pipeline's policy, and each checked factor is marked ``"fail"`` when it
    lies on the bad side of its benchmark threshold. A factor with no value
    or no threshold is reported as ``"ok"``.

    Args:
        session_id: Session number of the run.
        run_seq: Sequence number of the run within the session.
        pipeline: Pipeline to check against; falls back to the run's own
            pipeline and then to ``"__default"``.

    Returns:
        A dict with ``run_id``, ``risk_score``, ``benchmark_available``,
        per-factor ``factors`` and the ``overall`` verdict
        (``"ok"`` / ``"warn"`` / ``"fail"``).

    Raises:
        ValueError: If the run does not exist in the store.
    """
    run_row = store.get_run(session_id, run_seq)
    if run_row is None:
        raise ValueError(f"Run s{session_id}r{run_seq} not found.")

    record = RunRecord.from_json(json.loads(run_row["run_data"]))
    pipeline = pipeline or run_row["pipeline"] or "__default"
    policy = load_policy(pipeline)

    input_scores = input_quality.score_input_quality(record, policy)
    benchmark = store.get_benchmark(pipeline)
    thresholds_by_factor = {entry["factor"]: entry for entry in benchmark}

    # factor -> which side of the threshold counts as a failure
    directions = {
        "duplicate_ratio": "higher_bad",
        "top_chunk_score": "lower_bad",
        "high_score_truncations": "higher_bad",
        "token_headroom_pct": "lower_bad",
        "source_domain_count": "higher_bad",
        "low_score_chunk_ratio": "higher_bad",
    }

    factors = {}
    for factor, direction in directions.items():
        value = input_scores.get(factor) if input_scores else None
        entry = thresholds_by_factor.get(factor)
        threshold = entry["threshold"] if entry else None

        if value is None or threshold is None:
            breached = False
        elif direction == "lower_bad":
            breached = value < threshold
        else:
            breached = value > threshold

        factors[factor] = {
            "value": value,
            "benchmark_threshold": threshold,
            "status": "fail" if breached else "ok",
        }

    fail_count = sum(1 for result in factors.values() if result["status"] == "fail")

    # risk_score is None when the run was never evaluated or its input
    # metrics could not be computed (0.0 strictly means "computed, no
    # risk") — unknown risk never counts toward the verdict.
    eval_data = store.get_eval_scores(session_id, run_seq)
    risk = eval_data.get("risk_score") if eval_data else None

    high_risk = risk is not None and risk > 0.7
    if high_risk or fail_count >= 3:
        overall = "fail"
    elif fail_count >= 1:
        overall = "warn"
    else:
        overall = "ok"

    return {
        "run_id": f"s{session_id}r{run_seq}",
        "risk_score": risk,
        "benchmark_available": len(benchmark) > 0,
        "factors": factors,
        "overall": overall,
    }
@@ -0,0 +1,43 @@
1
+ import json
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+
5
+ from ragradar_core import store
6
+
7
+
8
def export(pipeline: str | None = None, output_path: Path | None = None) -> Path:
    """Write the evaluated runs of a pipeline as a JSONL dataset.

    Runs whose pipeline name ends in ``"__seeded"`` and runs without chunks
    or without a response are left out. Each remaining run becomes one JSON
    object per line with ``question``, ``answer``, ``contexts``,
    ``ground_truth``, ``run_id``, ``pipeline`` and ``evaluated_at``.

    All entries are built before the output file is opened, so a malformed
    record raises without leaving a truncated export on disk.

    Args:
        pipeline: Pipeline name passed to ``store.get_all_evaluated_runs``.
        output_path: Destination file. When omitted, a timestamped file is
            created in the ``exports`` directory under the ragradar directory.

    Returns:
        The path of the written file.

    Raises:
        KeyError: If a selected run lacks ``query`` or a chunk lacks
            ``content``.
    """
    runs = store.get_all_evaluated_runs(pipeline)

    entries = []
    for row in runs:
        if row["pipeline"] and row["pipeline"].endswith("__seeded"):
            continue
        run_data = json.loads(row["run_data"])
        if not run_data.get("chunks") or not run_data.get("response"):
            continue
        entries.append(
            {
                "question": run_data["query"],
                "answer": run_data["response"],
                "contexts": [c["content"] for c in run_data.get("chunks", [])],
                # NOTE(review): assumes a captured ground truth, if any, is
                # stored under "ground_truth" — confirm against the RunRecord
                # JSON layout. Falls back to None, the previous fixed value.
                "ground_truth": run_data.get("ground_truth"),
                "run_id": f"s{row['session_id']}r{row['run_seq']}",
                "pipeline": row["pipeline"],
                "evaluated_at": row["evaluated_at"],
            }
        )

    if output_path is None:
        pipe_name = pipeline or "all"
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        exports_dir = store._ragradar_dir() / "exports"
        exports_dir.mkdir(parents=True, exist_ok=True)
        output_path = exports_dir / f"{pipe_name}_ragas_{timestamp}.jsonl"
    else:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(entry) + "\n" for entry in entries)

    return output_path