evaldeck 0.1.0__tar.gz → 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. {evaldeck-0.1.0 → evaldeck-0.1.1}/.claude/settings.local.json +4 -1
  2. {evaldeck-0.1.0 → evaldeck-0.1.1}/PKG-INFO +1 -1
  3. {evaldeck-0.1.0 → evaldeck-0.1.1}/pyproject.toml +1 -1
  4. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/evaluator.py +8 -0
  5. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/graders/__init__.py +4 -0
  6. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/graders/code.py +73 -0
  7. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/test_case.py +2 -0
  8. {evaldeck-0.1.0 → evaldeck-0.1.1}/.devcontainer/Dockerfile +0 -0
  9. {evaldeck-0.1.0 → evaldeck-0.1.1}/.devcontainer/devcontainer.json +0 -0
  10. {evaldeck-0.1.0 → evaldeck-0.1.1}/.github/workflows/ci.yaml +0 -0
  11. {evaldeck-0.1.0 → evaldeck-0.1.1}/.github/workflows/docs.yaml +0 -0
  12. {evaldeck-0.1.0 → evaldeck-0.1.1}/.github/workflows/publish.yaml +0 -0
  13. {evaldeck-0.1.0 → evaldeck-0.1.1}/.gitignore +0 -0
  14. {evaldeck-0.1.0 → evaldeck-0.1.1}/.pre-commit-config.yaml +0 -0
  15. {evaldeck-0.1.0 → evaldeck-0.1.1}/CONTRIBUTING.md +0 -0
  16. {evaldeck-0.1.0 → evaldeck-0.1.1}/LICENSE +0 -0
  17. {evaldeck-0.1.0 → evaldeck-0.1.1}/README.md +0 -0
  18. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/config.md +0 -0
  19. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/evalcase.md +0 -0
  20. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/evaluation-result.md +0 -0
  21. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/evaluator.md +0 -0
  22. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/grade-result.md +0 -0
  23. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/graders/base.md +0 -0
  24. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/graders/code.md +0 -0
  25. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/graders/llm.md +0 -0
  26. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/index.md +0 -0
  27. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/metrics.md +0 -0
  28. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/step.md +0 -0
  29. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/api/trace.md +0 -0
  30. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/architecture.md +0 -0
  31. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/evaluation-workflow.md +0 -0
  32. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/grading-strategies.md +0 -0
  33. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/index.md +0 -0
  34. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/concepts/traces.md +0 -0
  35. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/adding-graders.md +0 -0
  36. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/adding-integrations.md +0 -0
  37. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/adding-metrics.md +0 -0
  38. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/code-standards.md +0 -0
  39. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/index.md +0 -0
  40. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/contributing/setup.md +0 -0
  41. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/basic-usage.md +0 -0
  42. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/index.md +0 -0
  43. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/langchain-agent.md +0 -0
  44. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/llm-judge.md +0 -0
  45. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/examples/tool-calls.md +0 -0
  46. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/getting-started/first-evaluation.md +0 -0
  47. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/getting-started/index.md +0 -0
  48. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/getting-started/installation.md +0 -0
  49. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/getting-started/quickstart.md +0 -0
  50. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/includes/abbreviations.md +0 -0
  51. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/index.md +0 -0
  52. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/stylesheets/extra.css +0 -0
  53. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/ci-cd.md +0 -0
  54. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/cli.md +0 -0
  55. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/configuration.md +0 -0
  56. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/graders/code-based.md +0 -0
  57. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/graders/custom.md +0 -0
  58. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/graders/index.md +0 -0
  59. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/graders/llm-based.md +0 -0
  60. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/index.md +0 -0
  61. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/integrations/index.md +0 -0
  62. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/integrations/manual.md +0 -0
  63. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/integrations/opentelemetry.md +0 -0
  64. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/metrics.md +0 -0
  65. {evaldeck-0.1.0 → evaldeck-0.1.1}/docs/user-guide/test-cases.md +0 -0
  66. {evaldeck-0.1.0 → evaldeck-0.1.1}/examples/basic_usage.py +0 -0
  67. {evaldeck-0.1.0 → evaldeck-0.1.1}/examples/langchain_react_agent.py +0 -0
  68. {evaldeck-0.1.0 → evaldeck-0.1.1}/mkdocs.yml +0 -0
  69. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/__init__.py +0 -0
  70. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/cli.py +0 -0
  71. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/config.py +0 -0
  72. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/graders/base.py +0 -0
  73. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/graders/llm.py +0 -0
  74. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/integrations/__init__.py +0 -0
  75. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/integrations/opentelemetry.py +0 -0
  76. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/metrics/__init__.py +0 -0
  77. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/metrics/base.py +0 -0
  78. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/metrics/builtin.py +0 -0
  79. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/results.py +0 -0
  80. {evaldeck-0.1.0 → evaldeck-0.1.1}/src/evaldeck/trace.py +0 -0
  81. {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/__init__.py +0 -0
  82. {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/conftest.py +0 -0
  83. {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/test_evaluator.py +0 -0
  84. {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/test_graders.py +0 -0
  85. {evaldeck-0.1.0 → evaldeck-0.1.1}/tests/test_trace.py +0 -0
@@ -25,7 +25,10 @@
25
25
  "Bash(git branch:*)",
26
26
  "Bash(git remote add:*)",
27
27
  "Bash(git push:*)",
28
- "Bash(ruff check:*)"
28
+ "Bash(ruff check:*)",
29
+ "Bash(source:*)",
30
+ "Bash(./venv/bin/python -m pytest:*)",
31
+ "Bash(uv run pytest:*)"
29
32
  ]
30
33
  }
31
34
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: evaldeck
3
- Version: 0.1.0
3
+ Version: 0.1.1
4
4
  Summary: The evaluation framework for AI agents. Pytest for agents.
5
5
  Project-URL: Homepage, https://github.com/tantra-run/evaldeck-py
6
6
  Project-URL: Documentation, https://tantra-run.github.io/evaldeck-py/
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "evaldeck"
7
- version = "0.1.0"
7
+ version = "0.1.1"
8
8
  description = "The evaluation framework for AI agents. Pytest for agents."
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -12,7 +12,9 @@ from evaldeck.graders import (
12
12
  BaseGrader,
13
13
  ContainsGrader,
14
14
  LLMGrader,
15
+ MaxLLMCallsGrader,
15
16
  MaxStepsGrader,
17
+ MaxToolCallsGrader,
16
18
  TaskCompletedGrader,
17
19
  ToolCalledGrader,
18
20
  ToolNotCalledGrader,
@@ -124,6 +126,12 @@ class Evaluator:
124
126
  if expected.max_steps is not None:
125
127
  graders.append(MaxStepsGrader())
126
128
 
129
+ if expected.max_tool_calls is not None:
130
+ graders.append(MaxToolCallsGrader())
131
+
132
+ if expected.max_llm_calls is not None:
133
+ graders.append(MaxLLMCallsGrader())
134
+
127
135
  if expected.task_completed is not None:
128
136
  graders.append(TaskCompletedGrader())
129
137
 
@@ -5,7 +5,9 @@ from evaldeck.graders.code import (
5
5
  ContainsGrader,
6
6
  CustomGrader,
7
7
  EqualsGrader,
8
+ MaxLLMCallsGrader,
8
9
  MaxStepsGrader,
10
+ MaxToolCallsGrader,
9
11
  NotContainsGrader,
10
12
  RegexGrader,
11
13
  TaskCompletedGrader,
@@ -28,6 +30,8 @@ __all__ = [
28
30
  "ToolNotCalledGrader",
29
31
  "ToolOrderGrader",
30
32
  "MaxStepsGrader",
33
+ "MaxToolCallsGrader",
34
+ "MaxLLMCallsGrader",
31
35
  "TaskCompletedGrader",
32
36
  "CustomGrader",
33
37
  # Model-based
@@ -341,6 +341,79 @@ class MaxStepsGrader(BaseGrader):
341
341
  )
342
342
 
343
343
 
344
+ class MaxToolCallsGrader(BaseGrader):
345
+ """Check that agent completed within maximum tool calls.
346
+
347
+ Unlike max_steps which counts all trace steps (including internal
348
+ framework steps captured by OTel), this only counts actual tool calls.
349
+ """
350
+
351
+ name = "max_tool_calls"
352
+
353
+ def __init__(self, max_tool_calls: int | None = None) -> None:
354
+ self.max_tool_calls = max_tool_calls
355
+
356
+ def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
357
+ """Check tool call count."""
358
+ max_tool_calls = self.max_tool_calls
359
+ if max_tool_calls is None:
360
+ max_tool_calls = test_case.expected.max_tool_calls
361
+
362
+ if max_tool_calls is None:
363
+ return GradeResult.passed_result(self.name, "No max tool calls defined")
364
+
365
+ actual = len(trace.tool_calls)
366
+
367
+ if actual <= max_tool_calls:
368
+ return GradeResult.passed_result(
369
+ self.name,
370
+ f"Made {actual} tool calls (max: {max_tool_calls})",
371
+ )
372
+
373
+ return GradeResult.failed_result(
374
+ self.name,
375
+ f"Too many tool calls: {actual} > {max_tool_calls}",
376
+ expected=max_tool_calls,
377
+ actual=actual,
378
+ )
379
+
380
+
381
+ class MaxLLMCallsGrader(BaseGrader):
382
+ """Check that agent completed within maximum LLM calls.
383
+
384
+ Counts only LLM call steps, not internal framework steps.
385
+ """
386
+
387
+ name = "max_llm_calls"
388
+
389
+ def __init__(self, max_llm_calls: int | None = None) -> None:
390
+ self.max_llm_calls = max_llm_calls
391
+
392
+ def grade(self, trace: Trace, test_case: EvalCase) -> GradeResult:
393
+ """Check LLM call count."""
394
+ max_llm_calls = self.max_llm_calls
395
+ if max_llm_calls is None:
396
+ max_llm_calls = test_case.expected.max_llm_calls
397
+
398
+ if max_llm_calls is None:
399
+ return GradeResult.passed_result(self.name, "No max LLM calls defined")
400
+
401
+ actual = len(trace.llm_calls)
402
+
403
+ if actual <= max_llm_calls:
404
+ return GradeResult.passed_result(
405
+ self.name,
406
+ f"Made {actual} LLM calls (max: {max_llm_calls})",
407
+ )
408
+
409
+ return GradeResult.failed_result(
410
+ self.name,
411
+ f"Too many LLM calls: {actual} > {max_llm_calls}",
412
+ expected=max_llm_calls,
413
+ actual=actual,
414
+ )
415
+
416
+
344
417
  class TaskCompletedGrader(BaseGrader):
345
418
  """Check if the agent completed the task (based on trace status)."""
346
419
 
@@ -26,6 +26,8 @@ class ExpectedBehavior(BaseModel):
26
26
  # Execution expectations
27
27
  max_steps: int | None = None
28
28
  min_steps: int | None = None
29
+ max_tool_calls: int | None = None
30
+ max_llm_calls: int | None = None
29
31
  task_completed: bool | None = None
30
32
 
31
33
  # Custom assertions (for code-based graders)
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes