tesserakit-evals 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tesserakit_evals-0.4.0/.gitignore +8 -0
- tesserakit_evals-0.4.0/PKG-INFO +24 -0
- tesserakit_evals-0.4.0/README.md +3 -0
- tesserakit_evals-0.4.0/pyproject.toml +40 -0
- tesserakit_evals-0.4.0/src/tessera_evals/__init__.py +3 -0
- tesserakit_evals-0.4.0/src/tessera_evals/adapters.py +131 -0
- tesserakit_evals-0.4.0/src/tessera_evals/cli.py +93 -0
- tesserakit_evals-0.4.0/src/tessera_evals/compiler.py +519 -0
- tesserakit_evals-0.4.0/src/tessera_evals/from_prompts.py +130 -0
- tesserakit_evals-0.4.0/src/tessera_evals/pack.py +40 -0
- tesserakit_evals-0.4.0/src/tessera_evals/rubrics.py +93 -0
- tesserakit_evals-0.4.0/src/tessera_evals/schema.py +17 -0
- tesserakit_evals-0.4.0/tests/test_adapters.py +117 -0
- tesserakit_evals-0.4.0/tests/test_column_heuristics_v2.py +106 -0
- tesserakit_evals-0.4.0/tests/test_compile_pack.py +138 -0
- tesserakit_evals-0.4.0/tests/test_from_prompts.py +102 -0
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tesserakit-evals
|
|
3
|
+
Version: 0.4.0
|
|
4
|
+
Summary: Eval job pack for Tessera: compile messy data into eval-ready assets.
|
|
5
|
+
Project-URL: Homepage, https://github.com/ShaileshRawat1403/tessera
|
|
6
|
+
Project-URL: Repository, https://github.com/ShaileshRawat1403/tessera
|
|
7
|
+
Project-URL: Issues, https://github.com/ShaileshRawat1403/tessera/issues
|
|
8
|
+
Author: Shailesh Rawat
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Requires-Python: >=3.10
|
|
14
|
+
Requires-Dist: pydantic>=2.7
|
|
15
|
+
Requires-Dist: rich>=13.7
|
|
16
|
+
Requires-Dist: tesserakit-core>=0.1.0
|
|
17
|
+
Requires-Dist: typer>=0.12
|
|
18
|
+
Provides-Extra: dev
|
|
19
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# tessera-evals
|
|
23
|
+
|
|
24
|
+
First Tessera job pack. It compiles messy CSV records into an eval pack with dataset, golden candidates, rubric, and quality reports.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling>=1.25"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tesserakit-evals"
|
|
7
|
+
version = "0.4.0"
|
|
8
|
+
description = "Eval job pack for Tessera: compile messy data into eval-ready assets."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [{ name = "Shailesh Rawat" }]
|
|
12
|
+
dependencies = [
|
|
13
|
+
"tesserakit-core>=0.1.0",
|
|
14
|
+
"typer>=0.12",
|
|
15
|
+
"rich>=13.7",
|
|
16
|
+
"pydantic>=2.7",
|
|
17
|
+
]
|
|
18
|
+
classifiers = [
|
|
19
|
+
"Development Status :: 3 - Alpha",
|
|
20
|
+
"Environment :: Console",
|
|
21
|
+
"Intended Audience :: Developers",
|
|
22
|
+
"Programming Language :: Python :: 3",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[project.urls]
|
|
26
|
+
Homepage = "https://github.com/ShaileshRawat1403/tessera"
|
|
27
|
+
Repository = "https://github.com/ShaileshRawat1403/tessera"
|
|
28
|
+
Issues = "https://github.com/ShaileshRawat1403/tessera/issues"
|
|
29
|
+
|
|
30
|
+
[project.optional-dependencies]
|
|
31
|
+
dev = ["pytest>=8.0"]
|
|
32
|
+
|
|
33
|
+
[project.entry-points."tessera.commands"]
|
|
34
|
+
evals = "tessera_evals.cli:register"
|
|
35
|
+
|
|
36
|
+
[project.entry-points."tessera.jobpacks"]
|
|
37
|
+
evals = "tessera_evals.pack:create_pack"
|
|
38
|
+
|
|
39
|
+
[tool.hatch.build.targets.wheel]
|
|
40
|
+
packages = ["src/tessera_evals"]
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Export canonical eval records to framework-native interchange files.
|
|
2
|
+
|
|
3
|
+
Tessera stays framework-independent: it emits each target's documented file
|
|
4
|
+
format rather than importing the framework. A canonical ``EvalRecord`` maps to:
|
|
5
|
+
|
|
6
|
+
input <- input.user_message
|
|
7
|
+
expected <- expected.reference_answer
|
|
8
|
+
context <- context.source_text (optional)
|
|
9
|
+
|
|
10
|
+
Targets:
|
|
11
|
+
deepeval -> goldens JSON (list of {input, expected_output, context})
|
|
12
|
+
ragas -> JSONL of {question, ground_truth, contexts}
|
|
13
|
+
openai-evals -> JSONL of {input: [chat messages], ideal}
|
|
14
|
+
langsmith -> JSONL of {inputs, outputs}
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import json
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any, Callable
|
|
22
|
+
|
|
23
|
+
from tessera_evals.schema import EvalRecord
|
|
24
|
+
|
|
25
|
+
# Names of the supported export targets; `export_all` writes one file per entry, in this order.
TARGETS = ("deepeval", "ragas", "openai-evals", "langsmith")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _input_text(rec: EvalRecord) -> str:
    """Return the record's ``input["user_message"]`` as stripped text.

    A missing key or an explicit ``None`` value yields ``""``. Without the
    ``None`` check, ``str(None)`` would leak the literal text ``"None"`` into
    every exported file.
    """
    value = rec.input.get("user_message")
    return "" if value is None else str(value).strip()
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _expected_text(rec: EvalRecord) -> str:
    """Return the record's ``expected["reference_answer"]`` as stripped text.

    A missing key or an explicit ``None`` value yields ``""``. Without the
    ``None`` check, ``str(None)`` would produce the truthy text ``"None"``,
    which callers relying on an empty-string fallback would treat as a real
    answer.
    """
    value = rec.expected.get("reference_answer")
    return "" if value is None else str(value).strip()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _context_list(rec: EvalRecord) -> list[str]:
    """Return the record's ``context["source_text"]`` as a zero- or one-item list.

    A missing key, an explicit ``None`` value, or text that is blank after
    stripping all yield ``[]``. Without the ``None`` check, ``str(None)``
    would produce the bogus context ``["None"]``.
    """
    value = rec.context.get("source_text")
    text = "" if value is None else str(value).strip()
    return [text] if text else []
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def to_deepeval(records: list[EvalRecord]) -> str:
    """Render records as a DeepEval goldens document.

    The result is pretty-printed JSON of the form ``{"goldens": [...]}``
    followed by a newline. An empty expected answer and an empty context list
    are both emitted as ``null``.
    """
    goldens = []
    for rec in records:
        golden: dict[str, Any] = {"input": _input_text(rec)}
        golden["expected_output"] = _expected_text(rec) or None
        golden["context"] = _context_list(rec) or None
        golden["additional_metadata"] = {
            "id": rec.id,
            "task_type": rec.task_type,
            "review_status": rec.expected.get("review_status", ""),
        }
        goldens.append(golden)

    document = json.dumps({"goldens": goldens}, ensure_ascii=False, indent=2)
    return document + "\n"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def to_ragas(records: list[EvalRecord]) -> str:
    """Render records as a RAGAS dataset in JSONL form.

    Each line is an object with ``question``, ``ground_truth`` and
    ``contexts``. Every line ends with a newline; no records gives ``""``.
    """
    rows = (
        {
            "question": _input_text(rec),
            "ground_truth": _expected_text(rec),
            "contexts": _context_list(rec),
        }
        for rec in records
    )
    return "".join(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def to_openai_evals(records: list[EvalRecord]) -> str:
    """Render records as OpenAI Evals ``samples.jsonl`` content.

    Each line is ``{"input": [<single user chat message>], "ideal": <text>}``.
    Every line ends with a newline; no records gives ``""``.
    """
    rendered = [
        json.dumps(
            {
                "input": [{"role": "user", "content": _input_text(rec)}],
                "ideal": _expected_text(rec),
            },
            ensure_ascii=False,
        )
        for rec in records
    ]
    if not rendered:
        return ""
    return "\n".join(rendered) + "\n"
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def to_langsmith(records: list[EvalRecord]) -> str:
    """Render records as LangSmith dataset examples in JSONL form.

    Each line holds ``inputs``, ``outputs`` and a ``metadata`` object carrying
    the record id and task type. Every line ends with a newline; no records
    gives ``""``.
    """

    def _as_line(rec: EvalRecord) -> str:
        # One JSON object per record, key order: inputs, outputs, metadata.
        return json.dumps(
            {
                "inputs": {"input": _input_text(rec)},
                "outputs": {"expected": _expected_text(rec)},
                "metadata": {"id": rec.id, "task_type": rec.task_type},
            },
            ensure_ascii=False,
        )

    return "".join(_as_line(rec) + "\n" for rec in records)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# Export target name -> (renderer function, file name the rendered text is written to).
_ADAPTERS: dict[str, tuple[Callable[[list[EvalRecord]], str], str]] = dict(
    [
        ("deepeval", (to_deepeval, "deepeval_goldens.json")),
        ("ragas", (to_ragas, "ragas_dataset.jsonl")),
        ("openai-evals", (to_openai_evals, "openai_evals_samples.jsonl")),
        ("langsmith", (to_langsmith, "langsmith_examples.jsonl")),
    ]
)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def load_dataset(path: Path) -> list[EvalRecord]:
    """Parse a canonical ``dataset.jsonl`` file into ``EvalRecord`` objects.

    The file is read as UTF-8. Each line is stripped, blank lines are skipped,
    and every remaining line is validated as one JSON-encoded record.
    """
    content = path.read_text(encoding="utf-8")
    stripped_rows = (raw.strip() for raw in content.splitlines())
    return [EvalRecord.model_validate_json(row) for row in stripped_rows if row]
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def export(records: list[EvalRecord], target: str, output_dir: Path) -> Path:
    """Render ``records`` for one target and write the file under ``output_dir``.

    The directory is created if needed (including parents). Returns the path
    of the written file.

    Raises:
        ValueError: if ``target`` is not a registered adapter name.
    """
    adapter = _ADAPTERS.get(target)
    if adapter is None:
        raise ValueError(f"unknown target '{target}'; choose from {', '.join(TARGETS)}")

    render, filename = adapter
    output_dir.mkdir(parents=True, exist_ok=True)
    destination = output_dir / filename
    destination.write_text(render(records), encoding="utf-8")
    return destination
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def export_all(records: list[EvalRecord], output_dir: Path) -> list[Path]:
    """Write the interchange file for every name in ``TARGETS``; return the paths in that order."""
    written: list[Path] = []
    for name in TARGETS:
        written.append(export(records, name, output_dir))
    return written
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
import typer
|
|
6
|
+
from rich.console import Console
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
|
|
9
|
+
from tessera_core.models import RunContext
|
|
10
|
+
|
|
11
|
+
from tessera_evals.pack import EvalsPack
|
|
12
|
+
|
|
13
|
+
# Shared Rich console that the commands in this module print their tables and messages to.
console = Console()
|
|
14
|
+
# Typer sub-application that the `compile` and `export` commands below are attached to.
evals_app = typer.Typer(help="Compile messy data into eval-ready assets.")
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@evals_app.command("compile")
def compile_cmd(
    input: Path = typer.Option(..., "--input", "-i", exists=True, readable=True, help="Input CSV path, or a prompts examples.jsonl / prompt-pack directory with --from-prompts."),
    task: str = typer.Option(..., "--task", help="Task type, for example customer_support, rag_qa, classification."),
    output: Path = typer.Option(Path("eval_pack"), "--output", "-o", help="Output directory."),
    input_column: str | None = typer.Option(None, "--input-column", help="Override input/question column."),
    expected_column: str | None = typer.Option(None, "--expected-column", help="Override expected/golden-answer column."),
    context_column: str | None = typer.Option(None, "--context-column", help="Override context/source column."),
    from_prompts: bool = typer.Option(False, "--from-prompts", help="Treat input as a prompts-pack examples.jsonl (or directory) instead of a CSV."),
    enrich: bool = typer.Option(False, "--enrich", help="LLM-enriched rubric (not available in v0.1)."),
) -> None:
    """Create dataset, golden candidates, rubric, and quality reports from a CSV or a prompts examples.jsonl."""
    # --enrich is accepted but only triggers a notice; the run proceeds unchanged.
    if enrich:
        console.print(
            "[yellow]LLM enrichment is not available in v0.1. Using deterministic rubric templates.[/yellow]"
        )

    ctx = RunContext(job_name="evals", output_dir=output)

    # The "source" key is only present when --from-prompts is given.
    options = {
        "task_type": task,
        "input_column": input_column,
        "expected_column": expected_column,
        "context_column": context_column,
        **({"source": "prompts"} if from_prompts else {}),
    }

    artifacts = EvalsPack().run(input_path=input, ctx=ctx, options=options)

    # First table: one row per artifact the pack produced.
    artifact_table = Table(title="Eval Pack Created")
    for heading in ("Artifact", "Path", "Kind"):
        artifact_table.add_column(heading)
    for artifact in artifacts:
        artifact_table.add_row(artifact.name, str(artifact.path), artifact.kind)
    console.print(artifact_table)

    # Second table: run id plus the counts read from the run context metadata.
    run_table = Table(title="Run Summary")
    for heading in ("Metric", "Value"):
        run_table.add_column(heading)
    run_table.add_row("run_id", ctx.run_id)
    for label, key in (("records", "record_count"), ("findings", "finding_count")):
        run_table.add_row(label, str(ctx.metadata.get(key, 0)))
    console.print(run_table)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@evals_app.command("export")
def export_cmd(
    input: Path = typer.Option(..., "--input", "-i", exists=True, readable=True, help="A canonical dataset.jsonl (the output of `evals compile`)."),
    target: str = typer.Option("all", "--target", help="deepeval | ragas | openai-evals | langsmith | all."),
    output: Path = typer.Option(Path("eval_export"), "--output", "-o", help="Output directory."),
) -> None:
    """Export a canonical eval dataset to framework-native interchange files."""
    # Imported lazily so the adapters module is only loaded when this command runs.
    from tessera_evals.adapters import TARGETS, export, export_all, load_dataset

    # Validate --target before touching the dataset: a typo should fail
    # immediately with exit code 2 rather than after reading and parsing the
    # whole input file (or being masked by a parse error in it).
    target = target.lower()
    if target != "all" and target not in TARGETS:
        console.print(f"[red]Unknown target '{target}'. Choose from: {', '.join(TARGETS)}, all.[/red]")
        raise typer.Exit(code=2)

    records = load_dataset(input)
    if target == "all":
        paths = export_all(records, output)
    else:
        paths = [export(records, target, output)]

    # Every target file contains the full record set, so each row reports the same count.
    table = Table(title="Eval Export")
    table.add_column("Target file")
    table.add_column("Records")
    for p in paths:
        table.add_row(str(p), str(len(records)))
    console.print(table)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def register(root_app: typer.Typer) -> None:
    """Attach the evals sub-application to ``root_app`` under the name ``evals``."""
    root_app.add_typer(evals_app, name="evals")
|