ragradar-evaluate 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragradar_evaluate/__init__.py +43 -0
- ragradar_evaluate/benchmark/__init__.py +0 -0
- ragradar_evaluate/benchmark/builder.py +117 -0
- ragradar_evaluate/benchmark/checker.py +79 -0
- ragradar_evaluate/benchmark/exporter.py +43 -0
- ragradar_evaluate/benchmark/seeder.py +89 -0
- ragradar_evaluate/cli.py +390 -0
- ragradar_evaluate/facade.py +535 -0
- ragradar_evaluate/layers/__init__.py +0 -0
- ragradar_evaluate/layers/input_quality.py +327 -0
- ragradar_evaluate/layers/output_quality.py +83 -0
- ragradar_evaluate/policy/__init__.py +0 -0
- ragradar_evaluate/policy/persistence.py +29 -0
- ragradar_evaluate/policy/risk.py +47 -0
- ragradar_evaluate/policy/schema.py +40 -0
- ragradar_evaluate-0.1.0.dist-info/METADATA +162 -0
- ragradar_evaluate-0.1.0.dist-info/RECORD +19 -0
- ragradar_evaluate-0.1.0.dist-info/WHEEL +4 -0
- ragradar_evaluate-0.1.0.dist-info/entry_points.txt +2 -0
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from ragradar_core.schema import (
|
|
2
|
+
CacheEvent,
|
|
3
|
+
ChunkRecord,
|
|
4
|
+
RunRecord,
|
|
5
|
+
TokenBudget,
|
|
6
|
+
TokenUsage,
|
|
7
|
+
ToolCallRecord,
|
|
8
|
+
Turn,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
from ragradar_evaluate.facade import (
|
|
12
|
+
CheckResult,
|
|
13
|
+
EvalResult,
|
|
14
|
+
MetricInfo,
|
|
15
|
+
available_metrics,
|
|
16
|
+
check,
|
|
17
|
+
evaluate,
|
|
18
|
+
)
|
|
19
|
+
from ragradar_evaluate.policy.schema import InputQualityPolicy
|
|
20
|
+
|
|
21
|
+
# NOTE: benchmark machinery (seeding, building, checking, exporting) is
|
|
22
|
+
# internal — the CLI's `benchmark` commands drive it, and check() consults
|
|
23
|
+
# learned thresholds automatically. It is deliberately absent here.
|
|
24
|
+
|
|
25
|
+
# Public API of the package: the names bound by `from ragradar_evaluate import *`.
# Every entry is imported at the top of this module.
__all__ = [
    # User tasks
    "check",
    "evaluate",
    "available_metrics",
    # Result / config types
    "CheckResult",
    "EvalResult",
    "MetricInfo",
    "InputQualityPolicy",
    # Re-exported schema dataclasses so users need only one import.
    "ChunkRecord",
    "TokenBudget",
    "TokenUsage",
    "Turn",
    "CacheEvent",
    "ToolCallRecord",
    "RunRecord",
]
|
|
File without changes
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from ragradar_core import store
|
|
4
|
+
from scipy import stats
|
|
5
|
+
|
|
6
|
+
# Input-quality factor names. build() looks each one up under the "input"
# key of a run's decoded eval_scores and learns one threshold per factor.
INPUT_FACTORS = [
    "duplicate_ratio",
    "top_chunk_score",
    "high_score_truncations",
    "token_headroom_pct",
    "source_domain_count",
    "low_score_chunk_ratio",
    "mean_relevance",
    "truncated_count",
    "score_variance",
]

# Output metric names. build() reads them from the "output" key of a run's
# decoded eval_scores and correlates every input factor against each of them.
RAGAS_METRICS = ["faithfulness", "answer_relevancy"]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _suggest_threshold(values: list[float], ragas_scores: list[float]) -> float:
|
|
22
|
+
sorted_vals = sorted(set(values))
|
|
23
|
+
if len(sorted_vals) < 2:
|
|
24
|
+
return sorted_vals[0] if sorted_vals else 0.0
|
|
25
|
+
|
|
26
|
+
best_threshold = sorted_vals[0]
|
|
27
|
+
best_diff = 0.0
|
|
28
|
+
|
|
29
|
+
for i in range(len(sorted_vals) - 1):
|
|
30
|
+
threshold = (sorted_vals[i] + sorted_vals[i + 1]) / 2
|
|
31
|
+
below = [r for v, r in zip(values, ragas_scores) if v <= threshold]
|
|
32
|
+
above = [r for v, r in zip(values, ragas_scores) if v > threshold]
|
|
33
|
+
|
|
34
|
+
if not below or not above:
|
|
35
|
+
continue
|
|
36
|
+
|
|
37
|
+
diff = abs(sum(above) / len(above) - sum(below) / len(below))
|
|
38
|
+
if diff > best_diff:
|
|
39
|
+
best_diff = diff
|
|
40
|
+
best_threshold = threshold
|
|
41
|
+
|
|
42
|
+
return round(best_threshold, 4)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def build(pipeline: str | None = None) -> dict:
    """Learn per-factor benchmark thresholds from evaluated runs and store them.

    For every name in ``INPUT_FACTORS`` the function gathers the runs that
    carry both that factor and at least one ``RAGAS_METRICS`` score, computes
    the Pearson correlation of the factor against each metric, and asks
    ``_suggest_threshold`` for a cut-point. Factors with fewer than three
    usable runs are skipped.

    A metric missing from a run is left out of that metric's sample rather
    than being counted as a score of 0.0, so partially evaluated runs do not
    drag the correlation or the threshold towards zero.

    Args:
        pipeline: Pipeline to build for, passed to the store query. Entries
            are stored under ``"__default"`` when it is falsy.

    Returns:
        A dict with ``run_count``, ``pipeline`` and ``factors``. ``factors``
        maps each factor name to its ``<metric>_correlation`` values (``None``
        when not computable), ``suggested_threshold`` and ``sample_count``.

    Raises:
        ValueError: If fewer than 10 evaluated runs are available.
    """
    runs = store.get_all_evaluated_runs(pipeline)

    if len(runs) < 10:
        raise ValueError(f"Need at least 10 evaluated runs to build benchmark, found {len(runs)}.")

    # Decode every run's scores once; each factor below re-reads this list.
    parsed = [json.loads(r["eval_scores"]) for r in runs]

    pipeline_key = pipeline or "__default"
    min_samples = 3
    factors_result: dict[str, dict] = {}
    batch_entries: list[tuple] = []

    for factor in INPUT_FACTORS:
        sample_count = 0
        # metric -> [(factor value, metric score)], only for scores that exist.
        paired: dict[str, list[tuple[float, float]]] = {m: [] for m in RAGAS_METRICS}

        for eval_data in parsed:
            input_data = eval_data.get("input") or {}
            output_data = eval_data.get("output") or {}

            fval = input_data.get(factor)
            if fval is None:
                continue

            scores = {m: output_data.get(m) for m in RAGAS_METRICS}
            if all(score is None for score in scores.values()):
                continue

            factor_value = float(fval)
            sample_count += 1
            for m, score in scores.items():
                if score is not None:
                    paired[m].append((factor_value, float(score)))

        if sample_count < min_samples:
            continue

        correlations: dict[str, float | None] = {}
        for m in RAGAS_METRICS:
            xs = [x for x, _ in paired[m]]
            ys = [y for _, y in paired[m]]
            # Pearson's r is undefined when either series is constant.
            if len(xs) >= min_samples and len(set(xs)) > 1 and len(set(ys)) > 1:
                corr, _ = stats.pearsonr(xs, ys)
                # float(): scipy stubs expose corr as numpy.float64 (_T_co), not float;
                # the cast is the narrowing workaround — no type: ignore needed.
                correlations[f"{m}_correlation"] = round(float(corr), 4)
            else:
                correlations[f"{m}_correlation"] = None

        valid_corrs = [v for v in correlations.values() if v is not None]
        primary_corr = max(valid_corrs, key=abs) if valid_corrs else 0.0

        # The threshold search uses the first metric in RAGAS_METRICS that has
        # any scores for this factor.
        threshold_pairs = next((paired[m] for m in RAGAS_METRICS if paired[m]), [])
        suggested = _suggest_threshold(
            [x for x, _ in threshold_pairs],
            [y for _, y in threshold_pairs],
        )

        batch_entries.append((pipeline_key, factor, suggested, primary_corr, sample_count))

        factors_result[factor] = {
            **correlations,
            "suggested_threshold": suggested,
            "sample_count": sample_count,
        }

    store.write_benchmark_entries_batch(batch_entries)

    return {
        "run_count": len(runs),
        "pipeline": pipeline,
        "factors": factors_result,
    }
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import json
|
|
2
|
+
|
|
3
|
+
from ragradar_core import store
|
|
4
|
+
from ragradar_core.schema import RunRecord
|
|
5
|
+
|
|
6
|
+
from ragradar_evaluate.layers import input_quality
|
|
7
|
+
from ragradar_evaluate.policy.persistence import load_policy
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def check(
    session_id: int,
    run_seq: int,
    pipeline: str | None = None,
) -> dict:
    """Compare one stored run's input-quality factors with the benchmark.

    Args:
        session_id: Session the run belongs to.
        run_seq: Sequence number of the run within that session.
        pipeline: Pipeline whose policy and benchmark to use. Falls back to
            the pipeline stored on the run, then to ``"__default"``.

    Returns:
        A dict with ``run_id``, ``risk_score``, ``benchmark_available``,
        ``factors`` (per-factor ``value`` / ``benchmark_threshold`` /
        ``status``) and ``overall`` (``"ok"``, ``"warn"`` or ``"fail"``).

    Raises:
        ValueError: If the run does not exist in the store.
    """
    run_row = store.get_run(session_id, run_seq)
    if run_row is None:
        raise ValueError(f"Run s{session_id}r{run_seq} not found.")

    record = RunRecord.from_json(json.loads(run_row["run_data"]))
    pipeline = pipeline or run_row["pipeline"] or "__default"
    policy = load_policy(pipeline)

    input_scores = input_quality.score_input_quality(record, policy)
    benchmark = store.get_benchmark(pipeline)
    thresholds_by_factor = {entry["factor"]: entry for entry in benchmark}

    # Which side of the threshold counts as a failure, per factor.
    directions = {
        "duplicate_ratio": "higher_bad",
        "top_chunk_score": "lower_bad",
        "high_score_truncations": "higher_bad",
        "token_headroom_pct": "lower_bad",
        "source_domain_count": "higher_bad",
        "low_score_chunk_ratio": "higher_bad",
    }

    factors = {}
    for name, direction in directions.items():
        observed = input_scores.get(name) if input_scores else None
        entry = thresholds_by_factor.get(name)
        limit = entry["threshold"] if entry else None

        # A factor with no value or no learned threshold cannot fail.
        if observed is None or limit is None:
            breached = False
        elif direction == "lower_bad":
            breached = observed < limit
        else:
            breached = observed > limit

        factors[name] = {
            "value": observed,
            "benchmark_threshold": limit,
            "status": "fail" if breached else "ok",
        }

    fail_count = sum(1 for result in factors.values() if result["status"] == "fail")

    # risk_score stays None when the run was never evaluated or its input
    # metrics could not be computed (0.0 strictly means "computed, no
    # risk"), so unknown risk never counts toward the verdict.
    eval_data = store.get_eval_scores(session_id, run_seq)
    risk = eval_data.get("risk_score") if eval_data else None

    high_risk = risk is not None and risk > 0.7
    if high_risk or fail_count >= 3:
        overall = "fail"
    elif fail_count >= 1:
        overall = "warn"
    else:
        overall = "ok"

    return {
        "run_id": f"s{session_id}r{run_seq}",
        "risk_score": risk,
        "benchmark_available": len(benchmark) > 0,
        "factors": factors,
        "overall": overall,
    }
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from datetime import datetime
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from ragradar_core import store
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def export(pipeline: str | None = None, output_path: Path | None = None) -> Path:
    """Write evaluated runs to a JSON Lines file and return its path.

    Runs whose pipeline name ends in ``"__seeded"`` are skipped, as are runs
    with no chunks or no response. Each written line carries ``question``,
    ``answer``, ``contexts``, ``ground_truth`` (always ``None``), ``run_id``,
    ``pipeline`` and ``evaluated_at``.

    Args:
        pipeline: Pipeline to export, passed to the store query.
        output_path: Destination file. When omitted, a timestamped file is
            created in the ``exports`` folder of the ragradar directory.

    Returns:
        The path of the file that was written.
    """
    selected = []
    for row in store.get_all_evaluated_runs(pipeline):
        pipeline_name = row["pipeline"]
        if pipeline_name and pipeline_name.endswith("__seeded"):
            continue
        run_data = json.loads(row["run_data"])
        if not (run_data.get("chunks") and run_data.get("response")):
            continue
        selected.append((row, run_data))

    if output_path is not None:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
    else:
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        label = pipeline or "all"
        target_dir = store._ragradar_dir() / "exports"
        target_dir.mkdir(parents=True, exist_ok=True)
        output_path = target_dir / f"{label}_ragas_{stamp}.jsonl"

    with open(output_path, "w", encoding="utf-8") as handle:
        for row, run_data in selected:
            contexts = [chunk["content"] for chunk in run_data.get("chunks", [])]
            line = json.dumps(
                {
                    "question": run_data["query"],
                    "answer": run_data["response"],
                    "contexts": contexts,
                    "ground_truth": None,
                    "run_id": f"s{row['session_id']}r{row['run_seq']}",
                    "pipeline": row["pipeline"],
                    "evaluated_at": row["evaluated_at"],
                }
            )
            handle.write(line + "\n")

    return output_path
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import random
|
|
2
|
+
|
|
3
|
+
from ragradar_core import store
|
|
4
|
+
from ragradar_core.schema import ChunkRecord, RunRecord, TokenBudget, Turn
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def seed(pipeline: str, count: int = 20) -> int:
    """Generate synthetic run records as day-zero baseline.

    Half known-good, half known-bad (the extra record of an odd ``count`` is
    a bad one). Seeded runs do NOT have RAGAS scores -- they serve as input
    quality baseline only. Records are stored under the pipeline name
    ``f"{pipeline}__seeded"``.

    Args:
        pipeline: Base pipeline name to seed for.
        count: Total number of synthetic runs to generate.

    Returns:
        The number of records actually written. This equals ``count`` for any
        non-negative ``count``, and is 0 for a negative one, where nothing is
        generated.
    """
    seeded_pipeline = f"{pipeline}__seeded"
    half = count // 2

    records = [_good_record(i) for i in range(half)]
    records += [_bad_record(i) for i in range(count - half)]

    session_id = store.get_or_create_session(seeded_pipeline)
    start_seq = store.next_run_seq(session_id)
    store.write_runs_batch(session_id, start_seq, records, seeded_pipeline)

    # Report what was written, not what was asked for: a negative count
    # produces no records and must not be reported as a negative total.
    return len(records)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _good_record(idx: int) -> RunRecord:
    """Build one synthetic "known-good" run.

    The run has four untruncated, cache-hit chunks spread over three source
    documents, retrieval scores in 0.85-0.95 and rerank scores in 0.90-0.98,
    a 4096-token budget with 696 tokens of headroom, and a two-turn history
    that is the same before and after.
    """

    def context_turns() -> list:
        # Fresh Turn objects per call so pre/post history never share instances.
        return [
            Turn(role="user", content="context question", tokens=5),
            Turn(role="assistant", content="context answer", tokens=10),
        ]

    chunks = []
    for j in range(4):
        # Draw order (retrieval first, then rerank) is kept stable so a
        # seeded RNG reproduces the same records.
        retrieval = 0.85 + random.uniform(0, 0.1)
        rerank = 0.90 + random.uniform(0, 0.08)
        chunks.append(
            ChunkRecord(
                chunk_id=f"seed_c{idx}_{j}",
                source_doc_id=f"seed_doc_{j % 3}",
                content=f"Synthetic high-quality chunk content for topic {idx}, variant {j}.",
                token_count=150,
                retrieval_score=retrieval,
                rerank_score=rerank,
                retrieval_path="hybrid",
                truncated=False,
                cache_hit=True,
            )
        )

    budget = TokenBudget(
        total_limit=4096,
        chunks_allocated=2400,
        history_allocated=400,
        system_allocated=600,
        headroom=696,
    )
    before = context_turns()
    after = context_turns()

    return RunRecord(
        query=f"Synthetic good query {idx}: what is the best practice?",
        response=f"Synthetic good response {idx}: comprehensive answer.",
        chunks=chunks,
        token_budget=budget,
        history_pre=before,
        history_post=after,
    )
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _bad_record(idx: int) -> RunRecord:
    """Build one synthetic "known-bad" run.

    The run has six cache-miss chunks, each from a different source document,
    retrieval scores in 0.30-0.45 and rerank scores in 0.25-0.40, every
    even-numbered chunk truncated, and a 4096-token budget with only 96
    tokens of headroom. No history is supplied.
    """
    chunks = []
    for j in range(6):
        # Draw order (retrieval first, then rerank) is kept stable so a
        # seeded RNG reproduces the same records.
        retrieval = 0.3 + random.uniform(0, 0.15)
        rerank = 0.25 + random.uniform(0, 0.15)
        was_truncated = j % 2 == 0
        chunks.append(
            ChunkRecord(
                chunk_id=f"seed_bad_c{idx}_{j}",
                source_doc_id=f"seed_doc_{j % 8}",
                content=f"Low quality chunk {idx}_{j}.",
                token_count=200,
                retrieval_score=retrieval,
                rerank_score=rerank,
                retrieval_path="bm25",
                truncated=was_truncated,
                cache_hit=False,
            )
        )

    budget = TokenBudget(
        total_limit=4096,
        chunks_allocated=3800,
        history_allocated=100,
        system_allocated=100,
        headroom=96,
    )

    return RunRecord(
        query=f"Synthetic bad query {idx}: vague unclear question?",
        response=f"Synthetic bad response {idx}: incomplete.",
        chunks=chunks,
        token_budget=budget,
    )
|