tesserakit-evals 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,8 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.pyc
4
+ dist/
5
+ build/
6
+ *.egg-info/
7
+ out/
8
+ .DS_Store
@@ -0,0 +1,24 @@
1
+ Metadata-Version: 2.4
2
+ Name: tesserakit-evals
3
+ Version: 0.4.0
4
+ Summary: Eval job pack for Tessera: compile messy data into eval-ready assets.
5
+ Project-URL: Homepage, https://github.com/ShaileshRawat1403/tessera
6
+ Project-URL: Repository, https://github.com/ShaileshRawat1403/tessera
7
+ Project-URL: Issues, https://github.com/ShaileshRawat1403/tessera/issues
8
+ Author: Shailesh Rawat
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Environment :: Console
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Requires-Python: >=3.10
14
+ Requires-Dist: pydantic>=2.7
15
+ Requires-Dist: rich>=13.7
16
+ Requires-Dist: tesserakit-core>=0.1.0
17
+ Requires-Dist: typer>=0.12
18
+ Provides-Extra: dev
19
+ Requires-Dist: pytest>=8.0; extra == 'dev'
20
+ Description-Content-Type: text/markdown
21
+
22
+ # tessera-evals
23
+
24
+ First Tessera job pack. It compiles messy CSV records into an eval pack with dataset, golden candidates, rubric, and quality reports.
@@ -0,0 +1,3 @@
1
+ # tessera-evals
2
+
3
+ First Tessera job pack. It compiles messy CSV records into an eval pack with dataset, golden candidates, rubric, and quality reports.
@@ -0,0 +1,40 @@
1
+ [build-system]
2
+ requires = ["hatchling>=1.25"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "tesserakit-evals"
7
+ version = "0.4.0"
8
+ description = "Eval job pack for Tessera: compile messy data into eval-ready assets."
9
+ readme = "README.md"
10
+ requires-python = ">=3.10"
11
+ authors = [{ name = "Shailesh Rawat" }]
12
+ dependencies = [
13
+ "tesserakit-core>=0.1.0",
14
+ "typer>=0.12",
15
+ "rich>=13.7",
16
+ "pydantic>=2.7",
17
+ ]
18
+ classifiers = [
19
+ "Development Status :: 3 - Alpha",
20
+ "Environment :: Console",
21
+ "Intended Audience :: Developers",
22
+ "Programming Language :: Python :: 3",
23
+ ]
24
+
25
+ [project.urls]
26
+ Homepage = "https://github.com/ShaileshRawat1403/tessera"
27
+ Repository = "https://github.com/ShaileshRawat1403/tessera"
28
+ Issues = "https://github.com/ShaileshRawat1403/tessera/issues"
29
+
30
+ [project.optional-dependencies]
31
+ dev = ["pytest>=8.0"]
32
+
33
+ [project.entry-points."tessera.commands"]
34
+ evals = "tessera_evals.cli:register"
35
+
36
+ [project.entry-points."tessera.jobpacks"]
37
+ evals = "tessera_evals.pack:create_pack"
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["src/tessera_evals"]
@@ -0,0 +1,3 @@
1
+ """Eval job pack for Tessera."""
2
+
3
# Must match ``version`` in pyproject.toml; this release is published as 0.4.0.
__version__ = "0.4.0"
@@ -0,0 +1,131 @@
1
+ """Export canonical eval records to framework-native interchange files.
2
+
3
+ Tessera stays framework-independent: it emits each target's documented file
4
+ format rather than importing the framework. A canonical ``EvalRecord`` maps to:
5
+
6
+ input <- input.user_message
7
+ expected <- expected.reference_answer
8
+ context <- context.source_text (optional)
9
+
10
+ Targets:
11
+ deepeval -> goldens JSON ({"goldens": [{input, expected_output, context, additional_metadata}]})
12
+ ragas -> JSONL of {question, ground_truth, contexts}
13
+ openai-evals -> JSONL of {input: [chat messages], ideal}
14
+ langsmith -> JSONL of {inputs, outputs}
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ from pathlib import Path
21
+ from typing import Any, Callable
22
+
23
+ from tessera_evals.schema import EvalRecord
24
+
25
# Names of the supported export targets, in the order ``export_all`` writes them.
TARGETS = (
    "deepeval",
    "ragas",
    "openai-evals",
    "langsmith",
)
26
+
27
+
28
def _input_text(rec: EvalRecord) -> str:
    """Return the record's ``input["user_message"]`` as stripped text.

    A missing key or an explicit ``None`` value both yield ``""``. Without the
    ``None`` check, ``str(None)`` would leak the literal text ``"None"`` into
    every exported file.
    """
    value = rec.input.get("user_message")
    if value is None:
        return ""
    return str(value).strip()
30
+
31
+
32
def _expected_text(rec: EvalRecord) -> str:
    """Return the record's ``expected["reference_answer"]`` as stripped text.

    A missing key or an explicit ``None`` value both yield ``""``, so callers
    that treat an empty answer as "no reference" are not fooled by the literal
    text ``"None"`` that ``str(None)`` would produce.
    """
    value = rec.expected.get("reference_answer")
    if value is None:
        return ""
    return str(value).strip()
34
+
35
+
36
def _context_list(rec: EvalRecord) -> list[str]:
    """Return the record's ``context["source_text"]`` as a zero- or one-item list.

    The text is stripped; a missing, ``None`` or blank value yields ``[]``.
    ``None`` is checked explicitly so it is not turned into ``["None"]``.
    """
    value = rec.context.get("source_text")
    text = "" if value is None else str(value).strip()
    return [text] if text else []
39
+
40
+
41
def to_deepeval(records: list[EvalRecord]) -> str:
    """Render records as a DeepEval goldens JSON document.

    The result is a pretty-printed (2-space indent) JSON object of the form
    ``{"goldens": [...]}`` followed by a trailing newline. Each golden carries
    ``input``, ``expected_output``, ``context`` and ``additional_metadata``
    (``id``, ``task_type``, ``review_status``). An empty expected answer or an
    empty context list is emitted as ``null``.
    """
    # NOTE(review): goldens are wrapped in a {"goldens": ...} object rather than
    # emitted as a bare list -- confirm this is the shape DeepEval's loader expects.
    goldens: list[dict[str, Any]] = []
    for rec in records:
        expected = _expected_text(rec)
        contexts = _context_list(rec)
        metadata = {
            "id": rec.id,
            "task_type": rec.task_type,
            "review_status": rec.expected.get("review_status", ""),
        }
        goldens.append(
            {
                "input": _input_text(rec),
                "expected_output": expected or None,
                "context": contexts or None,
                "additional_metadata": metadata,
            }
        )
    payload = {"goldens": goldens}
    return json.dumps(payload, ensure_ascii=False, indent=2) + "\n"
57
+
58
+
59
def to_ragas(records: list[EvalRecord]) -> str:
    """Render records as a RAGAS dataset in JSONL form.

    Each record becomes one line with ``question``, ``ground_truth`` and
    ``contexts``. Every line ends with a newline; no records yields ``""``.
    """
    rows = [
        json.dumps(
            {
                "question": _input_text(rec),
                "ground_truth": _expected_text(rec),
                "contexts": _context_list(rec),
            },
            ensure_ascii=False,
        )
        for rec in records
    ]
    # Terminating every row is the same as joining on "\n" and appending one
    # final "\n" when there is at least one row.
    return "".join(f"{row}\n" for row in rows)
74
+
75
+
76
def to_openai_evals(records: list[EvalRecord]) -> str:
    """Render records as OpenAI Evals ``samples.jsonl`` content.

    Each line is ``{"input": [<single user chat message>], "ideal": <answer>}``.
    Every line ends with a newline; no records yields ``""``.
    """
    rows: list[str] = []
    for rec in records:
        user_message = {"role": "user", "content": _input_text(rec)}
        sample: dict[str, Any] = {
            "input": [user_message],
            "ideal": _expected_text(rec),
        }
        rows.append(json.dumps(sample, ensure_ascii=False))
    if not rows:
        return ""
    return "\n".join(rows) + "\n"
86
+
87
+
88
def to_langsmith(records: list[EvalRecord]) -> str:
    """Render records as LangSmith dataset examples in JSONL form.

    Each line holds ``inputs`` (``{"input": ...}``), ``outputs``
    (``{"expected": ...}``) and ``metadata`` (``id``, ``task_type``).
    Every line ends with a newline; no records yields ``""``.
    """

    def _as_line(rec: EvalRecord) -> str:
        # Serialise one record to its single-line JSON form.
        return json.dumps(
            {
                "inputs": {"input": _input_text(rec)},
                "outputs": {"expected": _expected_text(rec)},
                "metadata": {"id": rec.id, "task_type": rec.task_type},
            },
            ensure_ascii=False,
        )

    rows = [_as_line(rec) for rec in records]
    trailer = "\n" if rows else ""
    return "\n".join(rows) + trailer
99
+
100
+
101
# Dispatch table: target name -> (render function, output file name).
# ``export`` looks targets up here; the keys mirror ``TARGETS``.
_ADAPTERS: dict[str, tuple[Callable[[list[EvalRecord]], str], str]] = {
    name: (render, filename)
    for name, render, filename in (
        ("deepeval", to_deepeval, "deepeval_goldens.json"),
        ("ragas", to_ragas, "ragas_dataset.jsonl"),
        ("openai-evals", to_openai_evals, "openai_evals_samples.jsonl"),
        ("langsmith", to_langsmith, "langsmith_examples.jsonl"),
    )
}
107
+
108
+
109
def load_dataset(path: Path) -> list[EvalRecord]:
    """Read a canonical dataset.jsonl (the evals pack's output) into EvalRecords.

    The file is iterated line by line so records are split on real newlines
    only. ``str.splitlines()`` also breaks on U+2028, U+2029 and U+0085, which
    JSON written with ``ensure_ascii=False`` leaves unescaped inside string
    values; a record containing one would be cut in half and fail validation.

    Blank lines are skipped. Errors raised by
    ``EvalRecord.model_validate_json`` or by opening/decoding the file
    propagate to the caller unchanged.
    """
    records: list[EvalRecord] = []
    with path.open(encoding="utf-8") as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if line:
                records.append(EvalRecord.model_validate_json(line))
    return records
117
+
118
+
119
def export(records: list[EvalRecord], target: str, output_dir: Path) -> Path:
    """Render ``records`` for one target and write the file into ``output_dir``.

    ``output_dir`` is created (with parents) if needed and the file is written
    as UTF-8 under the target's fixed file name.

    Returns the path of the written file.
    Raises ``ValueError`` when ``target`` is not a known adapter name.
    """
    adapter = _ADAPTERS.get(target)
    if adapter is None:
        raise ValueError(f"unknown target '{target}'; choose from {', '.join(TARGETS)}")

    render, filename = adapter
    output_dir.mkdir(parents=True, exist_ok=True)
    destination = output_dir / filename
    destination.write_text(render(records), encoding="utf-8")
    return destination
128
+
129
+
130
def export_all(records: list[EvalRecord], output_dir: Path) -> list[Path]:
    """Export ``records`` once per entry in ``TARGETS``; return the written paths in that order."""
    written: list[Path] = []
    for target in TARGETS:
        written.append(export(records, target, output_dir))
    return written
@@ -0,0 +1,93 @@
1
+ from __future__ import annotations
2
+
3
+ from pathlib import Path
4
+
5
+ import typer
6
+ from rich.console import Console
7
+ from rich.table import Table
8
+
9
+ from tessera_core.models import RunContext
10
+
11
+ from tessera_evals.pack import EvalsPack
12
+
13
# Shared Rich console that the commands in this module print their tables and messages to.
console = Console()
# Typer sub-application holding the evals commands; ``register`` mounts it on a root app.
evals_app = typer.Typer(help="Compile messy data into eval-ready assets.")
15
+
16
+
17
@evals_app.command("compile")
def compile_cmd(
    input: Path = typer.Option(..., "--input", "-i", exists=True, readable=True, help="Input CSV path, or a prompts examples.jsonl / prompt-pack directory with --from-prompts."),
    task: str = typer.Option(..., "--task", help="Task type, for example customer_support, rag_qa, classification."),
    output: Path = typer.Option(Path("eval_pack"), "--output", "-o", help="Output directory."),
    input_column: str | None = typer.Option(None, "--input-column", help="Override input/question column."),
    expected_column: str | None = typer.Option(None, "--expected-column", help="Override expected/golden-answer column."),
    context_column: str | None = typer.Option(None, "--context-column", help="Override context/source column."),
    from_prompts: bool = typer.Option(False, "--from-prompts", help="Treat input as a prompts-pack examples.jsonl (or directory) instead of a CSV."),
    enrich: bool = typer.Option(False, "--enrich", help="LLM-enriched rubric (not available in v0.1)."),
) -> None:
    """Create dataset, golden candidates, rubric, and quality reports from a CSV or a prompts examples.jsonl."""
    # Typer shows the docstring above as the command's help text, so it stays verbatim.

    # --enrich is accepted but only triggers a notice; the run proceeds unchanged.
    if enrich:
        console.print(
            "[yellow]LLM enrichment is not available in v0.1. Using deterministic rubric templates.[/yellow]"
        )

    ctx = RunContext(job_name="evals", output_dir=output)

    # Column overrides are passed through as-is (None when not given).
    options = dict(
        task_type=task,
        input_column=input_column,
        expected_column=expected_column,
        context_column=context_column,
    )
    if from_prompts:
        options["source"] = "prompts"

    artifacts = EvalsPack().run(input_path=input, ctx=ctx, options=options)

    # One row per artifact returned by the pack.
    artifact_table = Table("Artifact", "Path", "Kind", title="Eval Pack Created")
    for artifact in artifacts:
        artifact_table.add_row(artifact.name, str(artifact.path), artifact.kind)
    console.print(artifact_table)

    # Run-level figures; the counts are read from ctx.metadata and default to 0.
    summary_rows = (
        ("run_id", ctx.run_id),
        ("records", str(ctx.metadata.get("record_count", 0))),
        ("findings", str(ctx.metadata.get("finding_count", 0))),
    )
    summary_table = Table("Metric", "Value", title="Run Summary")
    for metric, value in summary_rows:
        summary_table.add_row(metric, value)
    console.print(summary_table)
62
+
63
+
64
@evals_app.command("export")
def export_cmd(
    input: Path = typer.Option(..., "--input", "-i", exists=True, readable=True, help="A canonical dataset.jsonl (the output of `evals compile`)."),
    target: str = typer.Option("all", "--target", help="deepeval | ragas | openai-evals | langsmith | all."),
    output: Path = typer.Option(Path("eval_export"), "--output", "-o", help="Output directory."),
) -> None:
    """Export a canonical eval dataset to framework-native interchange files."""
    # Typer shows the docstring above as the command's help text, so it stays verbatim.
    # Imported lazily, as in the original, so the adapters load only for this command.
    from tessera_evals.adapters import TARGETS, export, export_all, load_dataset

    # Validate the target BEFORE touching the dataset: a mistyped --target then
    # fails immediately with exit code 2 instead of first reading and parsing
    # the whole input file (and possibly surfacing an unrelated parse error).
    target = target.lower()
    if target != "all" and target not in TARGETS:
        console.print(f"[red]Unknown target '{target}'. Choose from: {', '.join(TARGETS)}, all.[/red]")
        raise typer.Exit(code=2)

    records = load_dataset(input)

    if target == "all":
        paths = export_all(records, output)
    else:
        paths = [export(records, target, output)]

    # Every written file contains the full record set, so the count repeats per row.
    table = Table(title="Eval Export")
    table.add_column("Target file")
    table.add_column("Records")
    for p in paths:
        table.add_row(str(p), str(len(records)))
    console.print(table)
90
+
91
+
92
def register(root_app: typer.Typer) -> None:
    """Mount the evals command group (``evals_app``) on ``root_app`` under the name ``evals``."""
    root_app.add_typer(evals_app, name="evals")