judgeval 0.12.0__tar.gz → 0.13.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.12.0 → judgeval-0.13.1}/.pre-commit-config.yaml +3 -3
- {judgeval-0.12.0 → judgeval-0.13.1}/PKG-INFO +1 -7
- judgeval-0.13.1/assets/brand/company.jpg +0 -0
- judgeval-0.13.1/assets/brand/company_banner.jpg +0 -0
- judgeval-0.13.1/assets/brand/darkmode.svg +7 -0
- judgeval-0.13.1/assets/brand/full_logo.png +0 -0
- judgeval-0.13.1/assets/brand/icon.png +0 -0
- judgeval-0.13.1/assets/brand/lightmode.svg +7 -0
- judgeval-0.13.1/assets/brand/white_background.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/pyproject.toml +6 -9
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/__init__.py +2 -2
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/api/api_types.py +81 -12
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/cli.py +2 -1
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/constants.py +0 -6
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/evaluation_run.py +2 -5
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/judgment_types.py +97 -12
- judgeval-0.13.1/src/judgeval/data/trace.py +121 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/dataset/__init__.py +72 -23
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/env.py +5 -20
- judgeval-0.13.1/src/judgeval/integrations/langgraph/__init__.py +13 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/api_scorer.py +7 -12
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -8
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -8
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -12
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +22 -33
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/score.py +1 -1
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/utils.py +1 -4
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/__init__.py +175 -156
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/exporters/__init__.py +4 -1
- judgeval-0.13.1/src/judgeval/tracer/keys.py +57 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/llm/__init__.py +0 -1
- judgeval-0.13.1/src/judgeval/tracer/llm/anthropic/__init__.py +20 -0
- judgeval-0.13.1/src/judgeval/tracer/llm/google/__init__.py +21 -0
- judgeval-0.13.1/src/judgeval/tracer/llm/groq/__init__.py +20 -0
- judgeval-0.13.1/src/judgeval/tracer/llm/openai/__init__.py +32 -0
- judgeval-0.13.1/src/judgeval/tracer/llm/providers.py +63 -0
- judgeval-0.13.1/src/judgeval/tracer/llm/together/__init__.py +20 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/managers.py +23 -48
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/processors/__init__.py +36 -75
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/utils.py +1 -2
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/utils/file_utils.py +0 -2
- judgeval-0.13.1/src/judgeval/utils/meta.py +27 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/utils/testing.py +0 -14
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/utils/version_check.py +2 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/version.py +1 -1
- {judgeval-0.12.0 → judgeval-0.13.1}/uv.lock +316 -34
- judgeval-0.12.0/src/judgeval/data/trace.py +0 -14
- judgeval-0.12.0/src/judgeval/integrations/langgraph/__init__.py +0 -789
- judgeval-0.12.0/src/judgeval/tracer/keys.py +0 -67
- judgeval-0.12.0/src/judgeval/tracer/llm/providers.py +0 -114
- judgeval-0.12.0/src/judgeval/utils/meta.py +0 -14
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/ISSUE_TEMPLATE/config.yml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/pull_request_template.md +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/blocked-pr.yaml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/claude-code-review.yml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/claude.yml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/lint.yaml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/merge-branch-check.yaml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/mypy.yaml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/pre-commit-autoupdate.yaml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/release.yaml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.github/workflows/validate-branch.yaml +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/.gitignore +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/LICENSE.md +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/README.md +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/agent.gif +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/agent_trace_example.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/data.gif +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/dataset_clustering_screenshot.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/dataset_clustering_screenshot_dm.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/datasets_preview_screenshot.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/document.gif +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/error_analysis_dashboard.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/errors.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/experiments_dashboard_screenshot.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/experiments_page.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/experiments_pagev2.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/logo-dark.svg +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/logo-light.svg +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/monitoring_screenshot.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/new_darkmode.svg +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/new_lightmode.svg +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/online_eval.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/product_shot.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/test.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/tests.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/trace.gif +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/trace_demo.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/trace_screenshot.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/assets/trace_screenshot_old.png +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/pytest.ini +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/scripts/api_generator.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/scripts/openapi_transform.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/scripts/update_types.sh +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/api/__init__.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/example.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/result.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/scripts/openapi_transform.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/evaluation/__init__.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/exceptions.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/logger.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/agent_scorer.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/example_scorer.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/constants.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/exporters/s3.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/exporters/store.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/exporters/utils.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/tracer/local_eval_queue.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/trainer/__init__.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/trainer/config.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/trainer/console.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/trainer/trainable_model.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/trainer/trainer.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/utils/async_utils.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/utils/decorators.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/utils/guards.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/utils/serialize.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/utils/url.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/warnings.py +0 -0
- {judgeval-0.12.0 → judgeval-0.13.1}/update_version.py +0 -0
{judgeval-0.12.0 → judgeval-0.13.1}/.pre-commit-config.yaml
@@ -1,11 +1,11 @@
 repos:
   - repo: https://github.com/astral-sh/uv-pre-commit
-    rev: 0.8.
+    rev: 0.8.19
     hooks:
       - id: uv-lock

   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.13.
+    rev: v0.13.1
     hooks:
       - id: ruff
         name: ruff (linter)
@@ -14,7 +14,7 @@ repos:
         name: ruff (formatter)

   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.
+    rev: v1.18.2
     hooks:
       - id: mypy
         language: system
{judgeval-0.12.0 → judgeval-0.13.1}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.
+Version: 0.13.1
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -17,14 +17,8 @@ Requires-Dist: httpx>=0.28.1
 Requires-Dist: litellm<1.75.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.36.0
 Requires-Dist: opentelemetry-sdk>=1.36.0
-Requires-Dist: opentelemetry-semantic-conventions>=0.57b0
 Requires-Dist: orjson>=3.9.0
 Requires-Dist: typer>=0.9.0
-Provides-Extra: langchain
-Requires-Dist: langchain-anthropic; extra == 'langchain'
-Requires-Dist: langchain-core; extra == 'langchain'
-Requires-Dist: langchain-huggingface; extra == 'langchain'
-Requires-Dist: langchain-openai; extra == 'langchain'
 Provides-Extra: s3
 Requires-Dist: boto3>=1.40.11; extra == 's3'
 Provides-Extra: trainer
judgeval-0.13.1/assets/brand/company.jpg (new, binary file)
judgeval-0.13.1/assets/brand/company_banner.jpg (new, binary file)
judgeval-0.13.1/assets/brand/darkmode.svg (new file)
@@ -0,0 +1,7 @@
+<svg width="544" height="91" viewBox="0 0 544 91" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M31.2246 18H39.5512V51.3061L31.2246 59.6327V18Z" fill="#FF4B2E"/>
+<path d="M0 59.6328H31.2245L21.8571 69.0002H0V59.6328Z" fill="#FF4B2E"/>
+<path d="M52.041 18H43.7145V51.3061L52.041 59.6327V18Z" fill="#FF4B2E"/>
+<path d="M83.2656 59.6328H52.0411L62.4493 69.0002H83.2656V59.6328Z" fill="#FF4B2E"/>
+<path d="M111.45 61.3V54.37H116.63V59.55L121.39 64.24H133.36L137.35 60.32V20H142.67V62L135.67 69H119.15L111.45 61.3ZM147.896 62.56V34.14H153.076V60.95L156.576 64.38H163.576L172.256 55.7V34.14H177.436V69H172.396V61.58L164.976 69H154.336L147.896 62.56ZM182.363 62.56V40.58L188.803 34.14H202.243L207.983 39.18V19.02H213.163V69H208.123V62.63L201.753 69H188.803L182.363 62.56ZM200.633 64.38L207.983 57.03V44.64L201.263 38.76H191.043L187.543 42.19V60.95L191.043 64.38H200.633ZM248.869 34.14V77.89L242.499 84.26H225.209L219.819 78.87V74.6H224.999V77.19L227.449 79.64H240.189L243.689 76.21V63.19L237.249 69H224.509L218.069 62.56V40.58L224.509 34.14H237.739L243.829 40.23V34.14H248.869ZM243.689 46.11L236.339 38.76H226.749L223.249 42.19V60.95L226.749 64.38H236.409L243.689 57.59V46.11ZM254.474 34.14H259.514V40.86L266.234 34.14H274.564L280.024 39.6L285.484 34.14H296.474L302.914 40.58V69H297.734V42.19L294.234 38.76H286.534L281.634 43.66V69H276.594V42.19L273.094 38.76H267.214L259.654 46.32V69H254.474V34.14ZM307.458 62.56V40.58L313.898 34.14H331.468L337.978 40.58V53.11H312.638V60.95L316.138 64.38H329.228L332.728 60.95V58.29H337.908V62.56L331.468 69H313.898L307.458 62.56ZM332.798 48.63V42.19L329.298 38.76H316.138L312.638 42.19V48.63H332.798ZM342.496 34.14H347.536V41.56L354.956 34.14H365.666L372.106 40.58V69H366.926V42.19L363.426 38.76H356.356L347.676 47.44V69H342.496V34.14ZM379.848 62.56V38.69H373.548V34.14H379.988V22.8H385.028V34.14H395.948V38.69H385.028V60.95L388.528 64.45H395.948V69H386.288L379.848 62.56ZM411.613 20H416.933V64.31H441.853V69H411.613V20ZM442.227 63.26V54.37L447.967 48.7H466.587V42.05L463.087 38.62H451.187L447.687 42.05V44.92H442.507V40.58L448.947 34.14H465.257L471.697 40.58V69H466.727V62.84L460.287 69H447.967L442.227 63.26ZM459.237 64.52L466.587 57.45V53.18H450.207L447.407 55.91V61.79L450.207 64.52H459.237ZM476.932 62.56V19.02H482.112V40.93L488.902 34.14H501.152L507.592 40.58V62.56L501.152 69H483.372L476.932 62.56ZM498.912 64.38L502.412 60.95V42.19L498.912 38.76H490.372L482.112 47.02V60.95L485.612 64.38H498.912ZM510.751 63.26V58.92H515.931V61.79L518.731 64.52H531.611L534.411 61.79V56.4L531.611 53.6H516.561L511.031 48.07V39.88L516.771 34.14H533.151L538.891 39.88V44.22H533.711V41.35L530.911 38.62H519.011L516.211 41.35V46.46L519.011 49.26H533.851L539.591 55V63.26L533.851 69H516.491L510.751 63.26Z" fill="#F4F4F5"/>
+</svg>
judgeval-0.13.1/assets/brand/full_logo.png (new, binary file)
judgeval-0.13.1/assets/brand/icon.png (new, binary file)
judgeval-0.13.1/assets/brand/lightmode.svg (new file)
@@ -0,0 +1,7 @@
+<svg width="544" height="91" viewBox="0 0 544 91" fill="none" xmlns="http://www.w3.org/2000/svg">
+<path d="M31.2246 18H39.5512V51.3061L31.2246 59.6327V18Z" fill="#FF4B2E"/>
+<path d="M0 59.6328H31.2245L21.8571 69.0002H0V59.6328Z" fill="#FF4B2E"/>
+<path d="M52.041 18H43.7145V51.3061L52.041 59.6327V18Z" fill="#FF4B2E"/>
+<path d="M83.2656 59.6328H52.0411L62.4493 69.0002H83.2656V59.6328Z" fill="#FF4B2E"/>
+<path d="M111.45 61.3V54.37H116.63V59.55L121.39 64.24H133.36L137.35 60.32V20H142.67V62L135.67 69H119.15L111.45 61.3ZM147.896 62.56V34.14H153.076V60.95L156.576 64.38H163.576L172.256 55.7V34.14H177.436V69H172.396V61.58L164.976 69H154.336L147.896 62.56ZM182.363 62.56V40.58L188.803 34.14H202.243L207.983 39.18V19.02H213.163V69H208.123V62.63L201.753 69H188.803L182.363 62.56ZM200.633 64.38L207.983 57.03V44.64L201.263 38.76H191.043L187.543 42.19V60.95L191.043 64.38H200.633ZM248.869 34.14V77.89L242.499 84.26H225.209L219.819 78.87V74.6H224.999V77.19L227.449 79.64H240.189L243.689 76.21V63.19L237.249 69H224.509L218.069 62.56V40.58L224.509 34.14H237.739L243.829 40.23V34.14H248.869ZM243.689 46.11L236.339 38.76H226.749L223.249 42.19V60.95L226.749 64.38H236.409L243.689 57.59V46.11ZM254.474 34.14H259.514V40.86L266.234 34.14H274.564L280.024 39.6L285.484 34.14H296.474L302.914 40.58V69H297.734V42.19L294.234 38.76H286.534L281.634 43.66V69H276.594V42.19L273.094 38.76H267.214L259.654 46.32V69H254.474V34.14ZM307.458 62.56V40.58L313.898 34.14H331.468L337.978 40.58V53.11H312.638V60.95L316.138 64.38H329.228L332.728 60.95V58.29H337.908V62.56L331.468 69H313.898L307.458 62.56ZM332.798 48.63V42.19L329.298 38.76H316.138L312.638 42.19V48.63H332.798ZM342.496 34.14H347.536V41.56L354.956 34.14H365.666L372.106 40.58V69H366.926V42.19L363.426 38.76H356.356L347.676 47.44V69H342.496V34.14ZM379.848 62.56V38.69H373.548V34.14H379.988V22.8H385.028V34.14H395.948V38.69H385.028V60.95L388.528 64.45H395.948V69H386.288L379.848 62.56ZM411.613 20H416.933V64.31H441.853V69H411.613V20ZM442.227 63.26V54.37L447.967 48.7H466.587V42.05L463.087 38.62H451.187L447.687 42.05V44.92H442.507V40.58L448.947 34.14H465.257L471.697 40.58V69H466.727V62.84L460.287 69H447.967L442.227 63.26ZM459.237 64.52L466.587 57.45V53.18H450.207L447.407 55.91V61.79L450.207 64.52H459.237ZM476.932 62.56V19.02H482.112V40.93L488.902 34.14H501.152L507.592 40.58V62.56L501.152 69H483.372L476.932 62.56ZM498.912 64.38L502.412 60.95V42.19L498.912 38.76H490.372L482.112 47.02V60.95L485.612 64.38H498.912ZM510.751 63.26V58.92H515.931V61.79L518.731 64.52H531.611L534.411 61.79V56.4L531.611 53.6H516.561L511.031 48.07V39.88L516.771 34.14H533.151L538.891 39.88V44.22H533.711V41.35L530.911 38.62H519.011L516.211 41.35V46.46L519.011 49.26H533.851L539.591 55V63.26L533.851 69H516.491L510.751 63.26Z" fill="black"/>
+</svg>
judgeval-0.13.1/assets/brand/white_background.png (new, binary file)
{judgeval-0.12.0 → judgeval-0.13.1}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.
+version = "0.13.1"
 authors = [
     { name = "Andrew Li", email = "andrew@judgmentlabs.ai" },
     { name = "Alex Shan", email = "alex@judgmentlabs.ai" },
@@ -19,10 +19,9 @@ license-files = ["LICENSE.md"]
 dependencies = [
     "dotenv",
     "httpx>=0.28.1",
-    "litellm<1.75.0",
+    "litellm<1.75.0", # https://github.com/BerriAI/litellm/issues/13081
     "opentelemetry-exporter-otlp>=1.36.0",
     "opentelemetry-sdk>=1.36.0",
-    "opentelemetry-semantic-conventions>=0.57b0",
     "orjson>=3.9.0",
     "click<8.2.0",
     "typer>=0.9.0",
@@ -45,12 +44,6 @@ packages = ["src/judgeval"]
 include = ["/src/judgeval", "/src/judgeval/**/*.py"]

 [project.optional-dependencies]
-langchain = [
-    "langchain-huggingface",
-    "langchain-openai",
-    "langchain-anthropic",
-    "langchain-core",
-]
 s3 = ["boto3>=1.40.11"]
 trainer = ["fireworks-ai>=0.19.18"]

@@ -76,6 +69,10 @@ dev = [
     "types-tqdm>=4.67.0.20250809",
     "pytest-asyncio>=1.1.0",
     "pytest-xdist>=3.8.0",
+    "langchain-openai>=0.3.23",
+    "langchain-tavily>=0.2.11",
+    "streamlit>=1.49.1",
+    "langchain-community>=0.3.29",
 ]

{judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/__init__.py
@@ -10,7 +10,7 @@ from judgeval.scorers import ExampleAPIScorerConfig
 from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.data.example import Example
 from judgeval.logger import judgeval_logger
-from judgeval.env import JUDGMENT_API_KEY,
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
 from judgeval.utils.meta import SingletonMeta
 from judgeval.exceptions import JudgmentRuntimeError, JudgmentTestError
 from judgeval.api import JudgmentSyncClient
@@ -42,7 +42,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         scorers: Sequence[Union[ExampleAPIScorerConfig, ExampleScorer]],
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
-        model: str =
+        model: Optional[str] = None,
         assert_test: bool = False,
     ) -> List[ScoringResult]:
         try:
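Since `model` now defaults to `None`, callers can leave it off entirely. A minimal sketch, assuming the parameters above belong to `JudgmentClient.run_evaluation` and that the method also accepts `examples` (both outside this hunk), and that `FaithfulnessScorer` is one of the package's API scorers:

from judgeval import JudgmentClient
from judgeval.data.example import Example
from judgeval.scorers import FaithfulnessScorer  # assumed export; any ExampleAPIScorerConfig works

client = JudgmentClient()  # SingletonMeta: repeated constructions return one shared instance
results = client.run_evaluation(  # assumed method name
    examples=[Example(input="What is 2 + 2?", actual_output="4")],
    scorers=[FaithfulnessScorer()],
    project_name="default_project",
    eval_run_name="default_eval_run",
    # model omitted: now Optional[str] = None rather than a required argument
)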
{judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/api/api_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-09-
+# timestamp: 2025-09-24T18:25:18+00:00

 from __future__ import annotations
 from typing import Any, Dict, List, Literal, Optional, TypedDict, Union
@@ -52,8 +52,8 @@ class SavePromptScorerRequest(TypedDict):
     name: str
     prompt: str
     threshold: float
-
-    is_trace: NotRequired[
+    model: NotRequired[str]
+    is_trace: NotRequired[bool]


 class SavePromptScorerResponse(TypedDict):
@@ -117,6 +117,7 @@ class ScorerConfig(TypedDict):
     score_type: str
     name: NotRequired[Optional[str]]
     threshold: NotRequired[float]
+    model: NotRequired[Optional[str]]
     strict_mode: NotRequired[bool]
     required_params: NotRequired[List[str]]
     kwargs: NotRequired[Optional[Dict[str, Any]]]
@@ -141,7 +142,7 @@ class PromptScorer(TypedDict):
     name: str
     prompt: str
     threshold: float
-
+    model: NotRequired[str]
     created_at: NotRequired[Optional[str]]
     updated_at: NotRequired[Optional[str]]
     is_trace: NotRequired[Optional[bool]]
@@ -189,13 +190,28 @@ class OtelTraceSpan(TypedDict):
     state_before: NotRequired[Optional[Dict[str, Any]]]


+class OtelSpanListItemScores(TypedDict):
+    success: bool
+    score: float
+    reason: NotRequired[Optional[str]]
+    name: str
+
+
+class OtelSpanDetailScores(TypedDict):
+    success: bool
+    score: float
+    reason: NotRequired[Optional[str]]
+    name: str
+    data: NotRequired[Optional[Dict[str, Any]]]
+
+
 class ExampleEvaluationRun(TypedDict):
     id: NotRequired[str]
     project_name: str
     eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
-    model: str
+    model: NotRequired[Optional[str]]
     created_at: NotRequired[str]
     examples: List[Example]
     trace_span_id: NotRequired[Optional[str]]
@@ -212,7 +228,7 @@ class TraceEvaluationRun(TypedDict):
     eval_name: str
     custom_scorers: NotRequired[List[BaseScorer]]
     judgment_scorers: NotRequired[List[ScorerConfig]]
-    model: str
+    model: NotRequired[Optional[str]]
     created_at: NotRequired[str]
     trace_and_span_ids: List[TraceAndSpanId]
     is_offline: NotRequired[bool]
@@ -224,12 +240,6 @@ class DatasetInsertExamples(TypedDict):
     project_name: str


-class DatasetReturn(TypedDict):
-    name: str
-    project_name: str
-    examples: NotRequired[Optional[List[Example]]]
-
-
 class DatasetInfo(TypedDict):
     dataset_id: str
     name: str
@@ -261,6 +271,65 @@ class ScoringResult(TypedDict):
     evaluation_cost: NotRequired[Optional[float]]


+class OtelTraceListItem(TypedDict):
+    organization_id: str
+    project_id: str
+    trace_id: str
+    timestamp: str
+    duration: NotRequired[Optional[int]]
+    has_notification: NotRequired[Optional[bool]]
+    tags: NotRequired[Optional[List[str]]]
+    experiment_run_id: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    cumulative_llm_cost: NotRequired[Optional[float]]
+    error: NotRequired[Optional[Dict[str, Any]]]
+    scores: NotRequired[List[OtelSpanListItemScores]]
+    customer_id: NotRequired[Optional[str]]
+    input_preview: NotRequired[Optional[str]]
+    output_preview: NotRequired[Optional[str]]
+    annotation_count: NotRequired[int]
+    span_id: str
+    rule_id: NotRequired[Optional[str]]
+
+
+class OtelSpanDetail(TypedDict):
+    organization_id: str
+    project_id: str
+    timestamp: str
+    trace_id: str
+    span_id: str
+    parent_span_id: NotRequired[Optional[str]]
+    trace_state: NotRequired[Optional[str]]
+    span_name: NotRequired[Optional[str]]
+    span_kind: NotRequired[Optional[str]]
+    service_name: NotRequired[Optional[str]]
+    resource_attributes: NotRequired[Optional[Dict[str, Any]]]
+    span_attributes: NotRequired[Optional[Dict[str, Any]]]
+    duration: NotRequired[Optional[int]]
+    status_code: NotRequired[Optional[str]]
+    status_message: NotRequired[Optional[str]]
+    events: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+    links: NotRequired[Optional[Union[List[Dict[str, Any]], Dict[str, Any]]]]
+    llm_cost: NotRequired[Optional[float]]
+    prompt_tokens: NotRequired[Optional[int]]
+    completion_tokens: NotRequired[Optional[int]]
+    scores: NotRequired[Optional[List[OtelSpanDetailScores]]]
+
+
 class EvalResults(TypedDict):
     results: List[ScoringResult]
     run: Union[ExampleEvaluationRun, TraceEvaluationRun]
+
+
+class DatasetTraceWithSpans(TypedDict):
+    dataset_id: str
+    trace_detail: OtelTraceListItem
+    spans: List[OtelSpanDetail]
+
+
+class DatasetReturn(TypedDict):
+    name: str
+    project_name: str
+    dataset_kind: DatasetKind
+    examples: NotRequired[Optional[List[Example]]]
+    traces: NotRequired[Optional[List[DatasetTraceWithSpans]]]
{judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/cli.py
@@ -5,6 +5,7 @@ from pathlib import Path
 from dotenv import load_dotenv
 from judgeval.logger import judgeval_logger
 from judgeval import JudgmentClient
+from judgeval.version import get_version

 load_dotenv()

@@ -56,7 +57,7 @@ def upload_scorer(
 @app.command()
 def version():
     """Show version info"""
-    judgeval_logger.info("
+    judgeval_logger.info(f"Judgeval CLI v{get_version()}")


 if __name__ == "__main__":
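The string the reworked `version` command logs can be reproduced directly from the library; a minimal sketch:

from judgeval.version import get_version

# Same message the CLI now emits via judgeval_logger.info(...)
print(f"Judgeval CLI v{get_version()}")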
{judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/constants.py
@@ -24,7 +24,6 @@ class APIScorerType(str, Enum):

     @classmethod
     def __missing__(cls, value: str) -> APIScorerType:
-        # Handle case-insensitive lookup
         for member in cls:
             if member.value == value.lower():
                 return member
@@ -32,11 +31,6 @@ class APIScorerType(str, Enum):
         raise ValueError(f"Invalid scorer type: {value}")


-UNBOUNDED_SCORERS: Set[APIScorerType] = (
-    set()
-)  # scorers whose scores are not bounded between 0-1
-
-
 LITELLM_SUPPORTED_MODELS: Set[str] = set(litellm.model_list)

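The surviving `__missing__` body implements case-insensitive lookup by retrying with the lowercased value. For value construction, Python's Enum routes misses through the `_missing_` hook; a self-contained sketch of the same idea using that hook (the member here is a stand-in, not a real APIScorerType member):

from enum import Enum

class ScorerType(str, Enum):
    FAITHFULNESS = "faithfulness"  # illustrative member

    @classmethod
    def _missing_(cls, value):
        # Retry with a lowercased value, mirroring the __missing__ body above.
        for member in cls:
            if member.value == value.lower():
                return member
        raise ValueError(f"Invalid scorer type: {value}")

assert ScorerType("FAITHFULNESS") is ScorerType.FAITHFULNESS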
{judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/evaluation_run.py
@@ -23,7 +23,7 @@ class EvaluationRun(BaseModel):
     scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
         default_factory=list
     )
-    model: str
+    model: Optional[str] = None

     def __init__(
         self,
@@ -77,11 +77,8 @@ class EvaluationRun(BaseModel):

     @field_validator("model")
     def validate_model(cls, v, values):
-        if not v:
-            raise ValueError("Model cannot be empty.")
-
         # Check if model is string or list of strings
-        if isinstance(v, str):
+        if v is not None and isinstance(v, str):
             if v not in ACCEPTABLE_MODELS:
                 raise ValueError(
                     f"Model name {v} not recognized. Please select a valid model name.)"
{judgeval-0.12.0 → judgeval-0.13.1}/src/judgeval/data/judgment_types.py
@@ -1,6 +1,6 @@
 # generated by datamodel-codegen:
 # filename: .openapi.json
-# timestamp: 2025-09-
+# timestamp: 2025-09-24T18:25:17+00:00

 from __future__ import annotations
 from typing import Annotated, Any, Dict, List, Optional, Union
@@ -54,8 +54,8 @@ class SavePromptScorerRequest(BaseModel):
     name: Annotated[str, Field(title="Name")]
     prompt: Annotated[str, Field(title="Prompt")]
     threshold: Annotated[float, Field(title="Threshold")]
-
-    is_trace: Annotated[Optional[bool], Field(title="Is Trace")] =
+    model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+    is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False


 class SavePromptScorerResponse(BaseModel):
@@ -125,6 +125,7 @@ class ScorerConfig(BaseModel):
     score_type: Annotated[str, Field(title="Score Type")]
     name: Annotated[Optional[str], Field(title="Name")] = None
     threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+    model: Annotated[Optional[str], Field(title="Model")] = None
     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
     required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = []
     kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
@@ -154,7 +155,7 @@ class PromptScorer(BaseModel):
     name: Annotated[str, Field(title="Name")]
     prompt: Annotated[str, Field(title="Prompt")]
     threshold: Annotated[float, Field(title="Threshold")]
-
+    model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
     created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
     updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
@@ -212,6 +213,21 @@ class OtelTraceSpan(BaseModel):
     )


+class OtelSpanListItemScores(BaseModel):
+    success: Annotated[bool, Field(title="Success")]
+    score: Annotated[float, Field(title="Score")]
+    reason: Annotated[Optional[str], Field(title="Reason")] = None
+    name: Annotated[str, Field(title="Name")]
+
+
+class OtelSpanDetailScores(BaseModel):
+    success: Annotated[bool, Field(title="Success")]
+    score: Annotated[float, Field(title="Score")]
+    reason: Annotated[Optional[str], Field(title="Reason")] = None
+    name: Annotated[str, Field(title="Name")]
+    data: Annotated[Optional[Dict[str, Any]], Field(title="Data")] = None
+
+
 class ExampleEvaluationRun(BaseModel):
     id: Annotated[Optional[str], Field(title="Id")] = None
     project_name: Annotated[str, Field(title="Project Name")]
@@ -222,7 +238,7 @@ class ExampleEvaluationRun(BaseModel):
     judgment_scorers: Annotated[
         Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
     ] = []
-    model: Annotated[str, Field(title="Model")]
+    model: Annotated[Optional[str], Field(title="Model")] = None
     created_at: Annotated[Optional[str], Field(title="Created At")] = None
     examples: Annotated[List[Example], Field(title="Examples")]
     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
@@ -243,7 +259,7 @@ class TraceEvaluationRun(BaseModel):
     judgment_scorers: Annotated[
         Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
     ] = []
-    model: Annotated[str, Field(title="Model")]
+    model: Annotated[Optional[str], Field(title="Model")] = None
     created_at: Annotated[Optional[str], Field(title="Created At")] = None
     trace_and_span_ids: Annotated[
         List[TraceAndSpanId], Field(title="Trace And Span Ids")
@@ -257,12 +273,6 @@ class DatasetInsertExamples(BaseModel):
     project_name: Annotated[str, Field(title="Project Name")]


-class DatasetReturn(BaseModel):
-    name: Annotated[str, Field(title="Name")]
-    project_name: Annotated[str, Field(title="Project Name")]
-    examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
-
-
 class DatasetInfo(BaseModel):
     dataset_id: Annotated[str, Field(title="Dataset Id")]
     name: Annotated[str, Field(title="Name")]
@@ -296,6 +306,81 @@ class ScoringResult(BaseModel):
     evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None


+class OtelTraceListItem(BaseModel):
+    organization_id: Annotated[str, Field(title="Organization Id")]
+    project_id: Annotated[str, Field(title="Project Id")]
+    trace_id: Annotated[str, Field(title="Trace Id")]
+    timestamp: Annotated[str, Field(title="Timestamp")]
+    duration: Annotated[Optional[int], Field(title="Duration")] = None
+    has_notification: Annotated[Optional[bool], Field(title="Has Notification")] = None
+    tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
+    experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
+    span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+    cumulative_llm_cost: Annotated[
+        Optional[float], Field(title="Cumulative Llm Cost")
+    ] = None
+    error: Annotated[Optional[Dict[str, Any]], Field(title="Error")] = None
+    scores: Annotated[
+        Optional[List[OtelSpanListItemScores]], Field(title="Scores")
+    ] = []
+    customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+    input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
+    output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
+    annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
+    span_id: Annotated[str, Field(title="Span Id")]
+    rule_id: Annotated[Optional[str], Field(title="Rule Id")] = None
+
+
+class OtelSpanDetail(BaseModel):
+    organization_id: Annotated[str, Field(title="Organization Id")]
+    project_id: Annotated[str, Field(title="Project Id")]
+    timestamp: Annotated[str, Field(title="Timestamp")]
+    trace_id: Annotated[str, Field(title="Trace Id")]
+    span_id: Annotated[str, Field(title="Span Id")]
+    parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+    trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+    span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+    span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+    service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+    resource_attributes: Annotated[
+        Optional[Dict[str, Any]], Field(title="Resource Attributes")
+    ] = None
+    span_attributes: Annotated[
+        Optional[Dict[str, Any]], Field(title="Span Attributes")
+    ] = None
+    duration: Annotated[Optional[int], Field(title="Duration")] = None
+    status_code: Annotated[Optional[str], Field(title="Status Code")] = None
+    status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+    events: Annotated[
+        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Events")
+    ] = None
+    links: Annotated[
+        Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
+    ] = None
+    llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+    prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
+    completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
+    scores: Annotated[Optional[List[OtelSpanDetailScores]], Field(title="Scores")] = (
+        None
+    )
+
+
 class EvalResults(BaseModel):
     results: Annotated[List[ScoringResult], Field(title="Results")]
     run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
+
+
+class DatasetTraceWithSpans(BaseModel):
+    dataset_id: Annotated[str, Field(title="Dataset Id")]
+    trace_detail: OtelTraceListItem
+    spans: Annotated[List[OtelSpanDetail], Field(title="Spans")]
+
+
+class DatasetReturn(BaseModel):
+    name: Annotated[str, Field(title="Name")]
+    project_name: Annotated[str, Field(title="Project Name")]
+    dataset_kind: DatasetKind
+    examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
+    traces: Annotated[Optional[List[DatasetTraceWithSpans]], Field(title="Traces")] = (
+        None
+    )
judgeval-0.13.1/src/judgeval/data/trace.py (new file)
@@ -0,0 +1,121 @@
+from typing import Optional, List, Dict, Any
+from pydantic import BaseModel
+from .judgment_types import (
+    OtelSpanDetailScores,
+    OtelSpanDetail,
+    OtelTraceListItem,
+)
+
+
+class TraceUsage(BaseModel):
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    cache_creation_input_tokens: Optional[int] = None
+    cache_read_input_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
+    prompt_tokens_cost_usd: Optional[float] = None
+    completion_tokens_cost_usd: Optional[float] = None
+    total_cost_usd: Optional[float] = None
+    model_name: Optional[str] = None
+
+
+class TraceScore(OtelSpanDetailScores):
+    """Score information for a trace or span."""
+
+    pass
+
+
+class TraceRule(BaseModel):
+    """Rule that was triggered for a trace."""
+
+    rule_id: str
+    rule_name: str
+
+
+class TraceSpan(OtelSpanDetail):
+    """Individual span within a trace with complete telemetry data."""
+
+    @classmethod
+    def from_otel_span_detail(cls, span_detail: OtelSpanDetail) -> "TraceSpan":
+        """Create TraceSpan from OtelSpanDetail, converting scores to TraceScore."""
+        data = span_detail.model_dump()
+
+        if "scores" in data and data["scores"]:
+            data["scores"] = [TraceScore(**score) for score in data["scores"]]
+
+        return cls(**data)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert TraceSpan to dictionary."""
+        return self.model_dump(exclude_none=True)
+
+
+class Trace(OtelTraceListItem):
+    """Complete trace with metadata and all associated spans."""
+
+    spans: List[TraceSpan] = []
+    rules: Optional[List[TraceRule]] = []
+
+    @classmethod
+    def from_dataset_trace_with_spans(cls, dataset_trace: Any) -> "Trace":
+        """Create Trace from DatasetTraceWithSpans (handles both API and judgment types)."""
+
+        if hasattr(dataset_trace, "trace_detail"):
+            trace_detail = dataset_trace.trace_detail
+            spans_data = dataset_trace.spans
+        else:
+            trace_detail = dataset_trace.get("trace_detail", {})
+            spans_data = dataset_trace.get("spans", [])
+
+        if hasattr(trace_detail, "model_dump"):
+            trace_data = trace_detail.model_dump()
+        elif isinstance(trace_detail, dict):
+            trace_data = trace_detail.copy()
+        else:
+            trace_data = dict(trace_detail)
+
+        spans = []
+        for span in spans_data:
+            if hasattr(span, "model_dump"):
+                spans.append(TraceSpan.from_otel_span_detail(span))
+            else:
+                # Handle dict spans
+                span_data = dict(span) if not isinstance(span, dict) else span.copy()
+                if "scores" in span_data and span_data["scores"]:
+                    span_data["scores"] = [
+                        TraceScore(**score)
+                        if isinstance(score, dict)
+                        else TraceScore(**score.model_dump())
+                        for score in span_data["scores"]
+                    ]
+                spans.append(TraceSpan(**span_data))
+
+        rules = []
+        if "rule_id" in trace_data and trace_data["rule_id"]:
+            rules = [
+                TraceRule(
+                    rule_id=trace_data["rule_id"],
+                    rule_name=f"Rule {trace_data['rule_id']}",
+                )
+            ]
+
+        trace_data.pop("scores", [])
+        trace_data.pop("rule_id", None)
+        trace = cls(**trace_data)
+
+        trace.spans = spans
+        trace.rules = rules
+
+        return trace
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert Trace to dictionary."""
+        return self.model_dump(exclude_none=True)
+
+    def __len__(self) -> int:
+        """Return the number of spans in the trace."""
+        return len(self.spans)
+
+    def __iter__(self):
+        """Iterate over spans in the trace."""
+        return iter(self.spans)
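Because `from_dataset_trace_with_spans` accepts either pydantic objects or plain dicts, a raw payload hydrates directly; a small usage sketch with illustrative IDs:

from judgeval.data.trace import Trace

payload = {
    "trace_detail": {
        "organization_id": "org_123",
        "project_id": "proj_456",
        "trace_id": "tr_789",
        "span_id": "sp_001",
        "timestamp": "2025-09-24T18:25:18+00:00",
    },
    "spans": [
        {
            "organization_id": "org_123",
            "project_id": "proj_456",
            "trace_id": "tr_789",
            "span_id": "sp_001",
            "timestamp": "2025-09-24T18:25:18+00:00",
            "scores": [{"success": True, "score": 1.0, "name": "faithfulness"}],
        }
    ],
}

trace = Trace.from_dataset_trace_with_spans(payload)
print(len(trace))   # 1, via __len__ (counts spans)
for span in trace:  # __iter__ yields TraceSpan objects
    print(span.span_id, span.scores)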