ragradar-evaluate 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,43 @@
1
+ from ragradar_core.schema import (
2
+ CacheEvent,
3
+ ChunkRecord,
4
+ RunRecord,
5
+ TokenBudget,
6
+ TokenUsage,
7
+ ToolCallRecord,
8
+ Turn,
9
+ )
10
+
11
+ from ragradar_evaluate.facade import (
12
+ CheckResult,
13
+ EvalResult,
14
+ MetricInfo,
15
+ available_metrics,
16
+ check,
17
+ evaluate,
18
+ )
19
+ from ragradar_evaluate.policy.schema import InputQualityPolicy
20
+
21
+ # NOTE: benchmark machinery (seeding, building, checking, exporting) is
22
+ # internal — the CLI's `benchmark` commands drive it, and check() consults
23
+ # learned thresholds automatically. It is deliberately absent here.
24
+
25
+ __all__ = [
26
+ # User tasks
27
+ "check",
28
+ "evaluate",
29
+ "available_metrics",
30
+ # Result / config types
31
+ "CheckResult",
32
+ "EvalResult",
33
+ "MetricInfo",
34
+ "InputQualityPolicy",
35
+ # Re-exported schema dataclasses so users need only one import.
36
+ "ChunkRecord",
37
+ "TokenBudget",
38
+ "TokenUsage",
39
+ "Turn",
40
+ "CacheEvent",
41
+ "ToolCallRecord",
42
+ "RunRecord",
43
+ ]
File without changes
@@ -0,0 +1,117 @@
1
+ import json
2
+
3
+ from ragradar_core import store
4
+ from scipy import stats
5
+
6
# Input-quality factor names looked up under each run's "input" eval scores
# when the benchmark is built. Order determines the order of the output.
INPUT_FACTORS = [
    "duplicate_ratio",
    "top_chunk_score",
    "high_score_truncations",
    "token_headroom_pct",
    "source_domain_count",
    "low_score_chunk_ratio",
    "mean_relevance",
    "truncated_count",
    "score_variance",
]

# Metric keys looked up under each run's "output" eval scores. The first entry
# is build()'s primary metric for suggested thresholds.
RAGAS_METRICS = [
    "faithfulness",
    "answer_relevancy",
]
19
+
20
+
21
+ def _suggest_threshold(values: list[float], ragas_scores: list[float]) -> float:
22
+ sorted_vals = sorted(set(values))
23
+ if len(sorted_vals) < 2:
24
+ return sorted_vals[0] if sorted_vals else 0.0
25
+
26
+ best_threshold = sorted_vals[0]
27
+ best_diff = 0.0
28
+
29
+ for i in range(len(sorted_vals) - 1):
30
+ threshold = (sorted_vals[i] + sorted_vals[i + 1]) / 2
31
+ below = [r for v, r in zip(values, ragas_scores) if v <= threshold]
32
+ above = [r for v, r in zip(values, ragas_scores) if v > threshold]
33
+
34
+ if not below or not above:
35
+ continue
36
+
37
+ diff = abs(sum(above) / len(above) - sum(below) / len(below))
38
+ if diff > best_diff:
39
+ best_diff = diff
40
+ best_threshold = threshold
41
+
42
+ return round(best_threshold, 4)
43
+
44
+
45
def build(pipeline: str | None = None) -> dict:
    """Build and persist the input-factor benchmark from evaluated runs.

    For every name in ``INPUT_FACTORS`` this collects the runs that report
    that factor together with at least one ``RAGAS_METRICS`` score, computes
    the Pearson correlation between the factor and each metric, derives a
    suggested threshold, and writes one entry per factor through
    ``store.write_benchmark_entries_batch``.

    A metric that is missing from a run is left out of that metric's
    statistics instead of being counted as a score of 0.0, so partially
    scored runs cannot drag a correlation or threshold towards zero.

    Args:
        pipeline: Pipeline name passed to ``store.get_all_evaluated_runs``.
            ``None`` is stored under the ``"__default"`` key.

    Returns:
        A dict with ``run_count`` (number of runs fetched), ``pipeline`` (the
        argument as given) and ``factors``: per factor, one
        ``"<metric>_correlation"`` value per metric (``None`` when it cannot
        be computed), ``suggested_threshold`` and ``sample_count``.

    Raises:
        ValueError: If fewer than 10 evaluated runs are available.
    """
    # Minimum number of observations needed before a statistic is reported.
    min_samples = 3

    runs = store.get_all_evaluated_runs(pipeline)

    if len(runs) < 10:
        raise ValueError(f"Need at least 10 evaluated runs to build benchmark, found {len(runs)}.")

    parsed = [json.loads(r["eval_scores"]) for r in runs]

    pipeline_key = pipeline or "__default"
    factors_result: dict[str, dict] = {}
    batch_entries: list[tuple] = []

    for factor in INPUT_FACTORS:
        # Every run that has this factor and at least one metric score.
        factor_values: list[float] = []
        # Per metric: (factor values, scores) restricted to the runs that
        # actually report that metric, kept index-aligned.
        paired: dict[str, tuple[list[float], list[float]]] = {
            m: ([], []) for m in RAGAS_METRICS
        }

        for eval_data in parsed:
            input_data = eval_data.get("input") or {}
            output_data = eval_data.get("output") or {}

            fval = input_data.get(factor)
            if fval is None:
                continue

            scores = {m: output_data.get(m) for m in RAGAS_METRICS}
            if all(score is None for score in scores.values()):
                continue

            x = float(fval)
            factor_values.append(x)
            for m, score in scores.items():
                if score is not None:
                    paired[m][0].append(x)
                    paired[m][1].append(float(score))

        if len(factor_values) < min_samples:
            continue

        correlations: dict[str, float | None] = {}
        for m in RAGAS_METRICS:
            xs, ys = paired[m]
            # Pearson is undefined when either series is constant.
            if len(xs) >= min_samples and len(set(xs)) > 1 and len(set(ys)) > 1:
                corr, _ = stats.pearsonr(xs, ys)
                # float(): scipy stubs expose corr as numpy.float64 (_T_co), not float;
                # the cast is the narrowing workaround — no type: ignore needed.
                correlations[f"{m}_correlation"] = round(float(corr), 4)
            else:
                correlations[f"{m}_correlation"] = None

        valid_corrs = [v for v in correlations.values() if v is not None]
        primary_corr = max(valid_corrs, key=abs) if valid_corrs else 0.0

        # Thresholds come from the first metric (in RAGAS_METRICS order) that
        # has any scores for this factor; normally that is RAGAS_METRICS[0].
        threshold_xs, threshold_ys = next(
            (paired[m] for m in RAGAS_METRICS if paired[m][0]),
            ([], []),
        )
        suggested = _suggest_threshold(threshold_xs, threshold_ys)

        batch_entries.append((pipeline_key, factor, suggested, primary_corr, len(factor_values)))

        factors_result[factor] = {
            **correlations,
            "suggested_threshold": suggested,
            "sample_count": len(factor_values),
        }

    store.write_benchmark_entries_batch(batch_entries)

    return {
        "run_count": len(runs),
        "pipeline": pipeline,
        "factors": factors_result,
    }
@@ -0,0 +1,79 @@
1
+ import json
2
+
3
+ from ragradar_core import store
4
+ from ragradar_core.schema import RunRecord
5
+
6
+ from ragradar_evaluate.layers import input_quality
7
+ from ragradar_evaluate.policy.persistence import load_policy
8
+
9
+
10
def check(
    session_id: int,
    run_seq: int,
    pipeline: str | None = None,
) -> dict:
    """Compare one stored run's input-quality scores with benchmark thresholds.

    Args:
        session_id: Session component of the run identifier.
        run_seq: Sequence component of the run identifier.
        pipeline: Pipeline whose policy and benchmark are used. Falls back to
            the pipeline recorded on the run, then to ``"__default"``.

    Returns:
        A dict with ``run_id``, ``risk_score``, ``benchmark_available``,
        per-factor ``factors`` (value, benchmark threshold, status) and an
        ``overall`` verdict of ``"ok"``, ``"warn"`` or ``"fail"``.

    Raises:
        ValueError: If the run does not exist in the store.
    """
    run_row = store.get_run(session_id, run_seq)
    if run_row is None:
        raise ValueError(f"Run s{session_id}r{run_seq} not found.")

    record = RunRecord.from_json(json.loads(run_row["run_data"]))
    pipeline = pipeline or run_row["pipeline"] or "__default"
    policy = load_policy(pipeline)

    input_scores = input_quality.score_input_quality(record, policy)
    benchmark = store.get_benchmark(pipeline)
    rows_by_factor = {row["factor"]: row for row in benchmark}

    # Factor name -> True when a value *below* the threshold is the failure,
    # False when a value *above* it is.
    lower_is_bad = {
        "duplicate_ratio": False,
        "top_chunk_score": True,
        "high_score_truncations": False,
        "token_headroom_pct": True,
        "source_domain_count": False,
        "low_score_chunk_ratio": False,
    }

    factors = {}
    fail_count = 0

    for factor, fails_when_low in lower_is_bad.items():
        value = input_scores.get(factor) if input_scores else None
        row = rows_by_factor.get(factor)
        threshold = row["threshold"] if row else None

        # Without both a value and a threshold there is nothing to compare,
        # and the factor is reported as "ok".
        if value is None or threshold is None:
            failed = False
        elif fails_when_low:
            failed = value < threshold
        else:
            failed = value > threshold

        if failed:
            status = "fail"
            fail_count += 1
        else:
            status = "ok"

        factors[factor] = {
            "value": value,
            "benchmark_threshold": threshold,
            "status": status,
        }

    # risk_score is None when the run was never evaluated or its input
    # metrics could not be computed (0.0 strictly means "computed, no
    # risk") — unknown risk never counts toward the verdict.
    eval_data = store.get_eval_scores(session_id, run_seq)
    risk = eval_data.get("risk_score") if eval_data else None

    high_risk = risk is not None and risk > 0.7
    if high_risk or fail_count >= 3:
        overall = "fail"
    elif fail_count >= 1:
        overall = "warn"
    else:
        overall = "ok"

    return {
        "run_id": f"s{session_id}r{run_seq}",
        "risk_score": risk,
        "benchmark_available": len(benchmark) > 0,
        "factors": factors,
        "overall": overall,
    }
@@ -0,0 +1,43 @@
1
+ import json
2
+ from datetime import datetime
3
+ from pathlib import Path
4
+
5
+ from ragradar_core import store
6
+
7
+
8
def export(pipeline: str | None = None, output_path: Path | None = None) -> Path:
    """Write evaluated runs to a JSON Lines file, one record per line.

    Runs whose pipeline name ends with ``"__seeded"`` are skipped, as are runs
    without chunks or without a response. Each line holds ``question``,
    ``answer``, ``contexts``, ``ground_truth`` (always ``None``), ``run_id``,
    ``pipeline`` and ``evaluated_at``.

    All records are built and serialised before any directory or file is
    touched, so a malformed run raises without leaving a partially written
    export behind.

    Args:
        pipeline: Pipeline name passed to ``store.get_all_evaluated_runs``.
        output_path: Destination file. When ``None``, a timestamped file is
            created in the ``exports`` directory under the store's directory.

    Returns:
        The path the export was written to.

    Raises:
        KeyError: If an exported run lacks ``"query"`` or a chunk lacks
            ``"content"``.
    """
    runs = store.get_all_evaluated_runs(pipeline)

    lines: list[str] = []
    for row in runs:
        row_pipeline = row["pipeline"]
        if row_pipeline and row_pipeline.endswith("__seeded"):
            continue

        run_data = json.loads(row["run_data"])
        chunks = run_data.get("chunks")
        if not chunks or not run_data.get("response"):
            continue

        entry = {
            "question": run_data["query"],
            "answer": run_data["response"],
            "contexts": [chunk["content"] for chunk in chunks],
            "ground_truth": None,
            "run_id": f"s{row['session_id']}r{row['run_seq']}",
            "pipeline": row_pipeline,
            "evaluated_at": row["evaluated_at"],
        }
        lines.append(json.dumps(entry) + "\n")

    if output_path is None:
        pipe_name = pipeline or "all"
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        exports_dir = store._ragradar_dir() / "exports"
        exports_dir.mkdir(parents=True, exist_ok=True)
        output_path = exports_dir / f"{pipe_name}_ragas_{timestamp}.jsonl"
    else:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(lines)

    return output_path
@@ -0,0 +1,89 @@
1
+ import random
2
+
3
+ from ragradar_core import store
4
+ from ragradar_core.schema import ChunkRecord, RunRecord, TokenBudget, Turn
5
+
6
+
7
def seed(pipeline: str, count: int = 20) -> int:
    """Generate synthetic run records as day-zero baseline.

    Half known-good, half known-bad (the extra record is a bad one when
    ``count`` is odd). Seeded runs do NOT have RAGAS scores --
    they serve as input quality baseline only.

    Args:
        pipeline: Base pipeline name; records are stored under
            ``"<pipeline>__seeded"``.
        count: Total number of records to generate.

    Returns:
        The number of records actually generated and handed to the store.
        This equals ``count`` for non-negative values and is 0 otherwise.
    """
    seeded_pipeline = f"{pipeline}__seeded"
    half = count // 2

    records = [_good_record(i) for i in range(half)]
    records += [_bad_record(i) for i in range(count - half)]

    session_id = store.get_or_create_session(seeded_pipeline)
    start_seq = store.next_run_seq(session_id)
    store.write_runs_batch(session_id, start_seq, records, seeded_pipeline)

    # Report what was written rather than what was asked for, so a
    # non-positive request cannot be reported as seeded runs.
    return len(records)
24
+
25
+
26
def _good_record(idx: int) -> RunRecord:
    """Build one synthetic "known-good" run for seeding.

    The run has four untruncated, cache-hit chunks on the "hybrid" retrieval
    path, spread over three source documents, with randomised retrieval
    scores in [0.85, 0.95] and rerank scores in [0.90, 0.98]. Its token
    budget has 696 tokens of headroom and the same two-turn history before
    and after.
    """

    def context_turns() -> list[Turn]:
        # Fresh Turn objects on every call so the two histories share nothing.
        return [
            Turn(role="user", content="context question", tokens=5),
            Turn(role="assistant", content="context answer", tokens=10),
        ]

    chunks = []
    for variant in range(4):
        # Draw order (retrieval, then rerank) matters for reproducibility
        # under a fixed random seed.
        retrieval = 0.85 + random.uniform(0, 0.1)
        rerank = 0.90 + random.uniform(0, 0.08)
        chunks.append(
            ChunkRecord(
                chunk_id=f"seed_c{idx}_{variant}",
                source_doc_id=f"seed_doc_{variant % 3}",
                content=f"Synthetic high-quality chunk content for topic {idx}, variant {variant}.",
                token_count=150,
                retrieval_score=retrieval,
                rerank_score=rerank,
                retrieval_path="hybrid",
                truncated=False,
                cache_hit=True,
            )
        )

    budget = TokenBudget(
        total_limit=4096,
        chunks_allocated=2400,
        history_allocated=400,
        system_allocated=600,
        headroom=696,
    )

    return RunRecord(
        query=f"Synthetic good query {idx}: what is the best practice?",
        response=f"Synthetic good response {idx}: comprehensive answer.",
        chunks=chunks,
        token_budget=budget,
        history_pre=context_turns(),
        history_post=context_turns(),
    )
61
+
62
+
63
def _bad_record(idx: int) -> RunRecord:
    """Build one synthetic "known-bad" run for seeding.

    The run has six cache-miss chunks on the "bm25" retrieval path, each from
    a different source document, with every other chunk truncated and
    randomised retrieval scores in [0.3, 0.45] and rerank scores in
    [0.25, 0.40]. Its token budget leaves 96 tokens of headroom and no
    history is supplied.
    """
    chunks = []
    for variant in range(6):
        # Draw order (retrieval, then rerank) matters for reproducibility
        # under a fixed random seed.
        retrieval = 0.3 + random.uniform(0, 0.15)
        rerank = 0.25 + random.uniform(0, 0.15)
        chunks.append(
            ChunkRecord(
                chunk_id=f"seed_bad_c{idx}_{variant}",
                source_doc_id=f"seed_doc_{variant % 8}",
                content=f"Low quality chunk {idx}_{variant}.",
                token_count=200,
                retrieval_score=retrieval,
                rerank_score=rerank,
                retrieval_path="bm25",
                truncated=(variant % 2 == 0),
                cache_hit=False,
            )
        )

    budget = TokenBudget(
        total_limit=4096,
        chunks_allocated=3800,
        history_allocated=100,
        system_allocated=100,
        headroom=96,
    )

    return RunRecord(
        query=f"Synthetic bad query {idx}: vague unclear question?",
        response=f"Synthetic bad response {idx}: incomplete.",
        chunks=chunks,
        token_budget=budget,
    )
+ )