evaldeck 0.1.0__tar.gz → 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {evaldeck-0.1.0 → evaldeck-0.1.1}/.claude/settings.local.json +4 -1
- {evaldeck-0.1.0 → evaldeck-0.1.1}/PKG-INFO +1 -1
- {evaldeck-0.1.0 → evaldeck-0.1.1}/pyproject.toml +1 -1
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/evaluator.py +8 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/graders/__init__.py +4 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/graders/code.py +73 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/test_case.py +2 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/.devcontainer/Dockerfile +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/.devcontainer/devcontainer.json +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/.github/workflows/ci.yaml +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/.github/workflows/docs.yaml +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/.github/workflows/publish.yaml +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/.gitignore +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/.pre-commit-config.yaml +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/CONTRIBUTING.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/LICENSE +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/README.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/config.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/evalcase.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/evaluation-result.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/evaluator.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/grade-result.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/graders/base.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/graders/code.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/graders/llm.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/index.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/metrics.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/step.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/trace.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/architecture.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/evaluation-workflow.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/grading-strategies.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/index.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/traces.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/adding-graders.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/adding-integrations.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/adding-metrics.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/code-standards.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/index.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/setup.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/basic-usage.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/index.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/langchain-agent.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/llm-judge.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/tool-calls.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/getting-started/first-evaluation.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/getting-started/index.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/getting-started/installation.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/getting-started/quickstart.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/includes/abbreviations.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/index.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/stylesheets/extra.css +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/ci-cd.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/cli.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/configuration.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/graders/code-based.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/graders/custom.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/graders/index.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/graders/llm-based.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/index.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/integrations/index.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/integrations/manual.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/integrations/opentelemetry.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/metrics.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/test-cases.md +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/examples/basic_usage.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/examples/langchain_react_agent.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/mkdocs.yml +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/__init__.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/cli.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/config.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/graders/base.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/graders/llm.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/integrations/__init__.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/integrations/opentelemetry.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/metrics/__init__.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/metrics/base.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/metrics/builtin.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/results.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/trace.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/__init__.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/conftest.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/test_evaluator.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/test_graders.py +0 -0
- {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/test_trace.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: evaldeck
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.1.1
|
|
4
4
|
Summary: The evaluation framework for AI agents. Pytest for agents.
|
|
5
5
|
Project-URL: Homepage, https://github.com/tantra-run/evaldeck-py
|
|
6
6
|
Project-URL: Documentation, https://tantra-run.github.io/evaldeck-py/
|
|
@@ -12,7 +12,9 @@ from evaldeck.graders import (
|
|
|
12
12
|
BaseGrader,
|
|
13
13
|
ContainsGrader,
|
|
14
14
|
LLMGrader,
|
|
15
|
+
MaxLLMCallsGrader,
|
|
15
16
|
MaxStepsGrader,
|
|
17
|
+
MaxToolCallsGrader,
|
|
16
18
|
TaskCompletedGrader,
|
|
17
19
|
ToolCalledGrader,
|
|
18
20
|
ToolNotCalledGrader,
|
|
@@ -124,6 +126,12 @@ class Evaluator:
|
|
|
124
126
|
if expected.max_steps is not None:
|
|
125
127
|
graders.append(MaxStepsGrader())
|
|
126
128
|
|
|
129
|
+
if expected.max_tool_calls is not None:
|
|
130
|
+
graders.append(MaxToolCallsGrader())
|
|
131
|
+
|
|
132
|
+
if expected.max_llm_calls is not None:
|
|
133
|
+
graders.append(MaxLLMCallsGrader())
|
|
134
|
+
|
|
127
135
|
if expected.task_completed is not None:
|
|
128
136
|
graders.append(TaskCompletedGrader())
|
|
129
137
|
|
|
@@ -5,7 +5,9 @@ from evaldeck.graders.code import (
|
|
|
5
5
|
ContainsGrader,
|
|
6
6
|
CustomGrader,
|
|
7
7
|
EqualsGrader,
|
|
8
|
+
MaxLLMCallsGrader,
|
|
8
9
|
MaxStepsGrader,
|
|
10
|
+
MaxToolCallsGrader,
|
|
9
11
|
NotContainsGrader,
|
|
10
12
|
RegexGrader,
|
|
11
13
|
TaskCompletedGrader,
|
|
@@ -28,6 +30,8 @@ __all__ = [
|
|
|
28
30
|
"ToolNotCalledGrader",
|
|
29
31
|
"ToolOrderGrader",
|
|
30
32
|
"MaxStepsGrader",
|
|
33
|
+
"MaxToolCallsGrader",
|
|
34
|
+
"MaxLLMCallsGrader",
|
|
31
35
|
"TaskCompletedGrader",
|
|
32
36
|
"CustomGrader",
|
|
33
37
|
# Model-based
|
|
@@ -341,6 +341,79 @@ class MaxStepsGrader(BaseGrader):
|
|
|
341
341
|
)
|
|
342
342
|
|
|
343
343
|
|
|
344
|
+
class MaxToolCallsGrader(BaseGrader):
|
|
345
|
+
"""Check that agent completed within maximum tool calls.
|
|
346
|
+
|
|
347
|
+
Unlike max_steps which counts all trace steps (including internal
|
|
348
|
+
framework steps captured by OTel), this only counts actual tool calls.
|
|
349
|
+
"""
|
|
350
|
+
|
|
351
|
+
name = "max_tool_calls"
|
|
352
|
+
|
|
353
|
+
def __init__(self, max_tool_calls: int | None = None) -> None:
|
|
354
|
+
self.max_tool_calls = max_tool_calls
|
|
355
|
+
|
|
356
|
+
def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
|
|
357
|
+
"""Check tool call count."""
|
|
358
|
+
max_tool_calls = self.max_tool_calls
|
|
359
|
+
if max_tool_calls is None:
|
|
360
|
+
max_tool_calls = test_case.expected.max_tool_calls
|
|
361
|
+
|
|
362
|
+
if max_tool_calls is None:
|
|
363
|
+
return GradeResult.passed_result(self.name, "No max tool calls defined")
|
|
364
|
+
|
|
365
|
+
actual = len(trace.tool_calls)
|
|
366
|
+
|
|
367
|
+
if actual <= max_tool_calls:
|
|
368
|
+
return GradeResult.passed_result(
|
|
369
|
+
self.name,
|
|
370
|
+
f"Made {actual} tool calls (max: {max_tool_calls})",
|
|
371
|
+
)
|
|
372
|
+
|
|
373
|
+
return GradeResult.failed_result(
|
|
374
|
+
self.name,
|
|
375
|
+
f"Too many tool calls: {actual} > {max_tool_calls}",
|
|
376
|
+
expected=max_tool_calls,
|
|
377
|
+
actual=actual,
|
|
378
|
+
)
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
class MaxLLMCallsGrader(BaseGrader):
|
|
382
|
+
"""Check that agent completed within maximum LLM calls.
|
|
383
|
+
|
|
384
|
+
Counts only LLM call steps, not internal framework steps.
|
|
385
|
+
"""
|
|
386
|
+
|
|
387
|
+
name = "max_llm_calls"
|
|
388
|
+
|
|
389
|
+
def __init__(self, max_llm_calls: int | None = None) -> None:
|
|
390
|
+
self.max_llm_calls = max_llm_calls
|
|
391
|
+
|
|
392
|
+
def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
|
|
393
|
+
"""Check LLM call count."""
|
|
394
|
+
max_llm_calls = self.max_llm_calls
|
|
395
|
+
if max_llm_calls is None:
|
|
396
|
+
max_llm_calls = test_case.expected.max_llm_calls
|
|
397
|
+
|
|
398
|
+
if max_llm_calls is None:
|
|
399
|
+
return GradeResult.passed_result(self.name, "No max LLM calls defined")
|
|
400
|
+
|
|
401
|
+
actual = len(trace.llm_calls)
|
|
402
|
+
|
|
403
|
+
if actual <= max_llm_calls:
|
|
404
|
+
return GradeResult.passed_result(
|
|
405
|
+
self.name,
|
|
406
|
+
f"Made {actual} LLM calls (max: {max_llm_calls})",
|
|
407
|
+
)
|
|
408
|
+
|
|
409
|
+
return GradeResult.failed_result(
|
|
410
|
+
self.name,
|
|
411
|
+
f"Too many LLM calls: {actual} > {max_llm_calls}",
|
|
412
|
+
expected=max_llm_calls,
|
|
413
|
+
actual=actual,
|
|
414
|
+
)
|
|
415
|
+
|
|
416
|
+
|
|
344
417
|
class TaskCompletedGrader(BaseGrader):
|
|
345
418
|
"""Check if the agent completed the task (based on trace status)."""
|
|
346
419
|
|
|
@@ -26,6 +26,8 @@ class ExpectedBehavior(BaseModel):
|
|
|
26
26
|
# Execution expectations
|
|
27
27
|
max_steps: int | None = None
|
|
28
28
|
min_steps: int | None = None
|
|
29
|
+
max_tool_calls: int | None = None
|
|
30
|
+
max_llm_calls: int | None = None
|
|
29
31
|
task_completed: bool | None = None
|
|
30
32
|
|
|
31
33
|
# Custom assertions (for code-based graders)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|