judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
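A file-level comparison like the listing above can be reproduced offline from the two wheel archives. The sketch below is a minimal illustration, not the tool that generated this page: it assumes both wheels have already been downloaded locally (for example with `pip download judgeval==0.1.0 --no-deps`), and it only reports members that were added or deleted; per-file line deltas for modified files would additionally require something like difflib.unified_diff.

import zipfile

# Assumed local paths; download the wheels first.
OLD_WHEEL = "judgeval-0.1.0-py3-none-any.whl"
NEW_WHEEL = "judgeval-0.23.0-py3-none-any.whl"


def file_lines(wheel_path):
    """Return {archive member -> line count}; non-text members count as 0."""
    counts = {}
    with zipfile.ZipFile(wheel_path) as zf:
        for name in zf.namelist():
            data = zf.read(name)
            try:
                counts[name] = len(data.decode("utf-8").splitlines())
            except UnicodeDecodeError:
                counts[name] = 0
    return counts


old, new = file_lines(OLD_WHEEL), file_lines(NEW_WHEEL)
for name in sorted(set(old) | set(new)):
    if name not in old:
        print(f"{name} +{new[name]} -0 (added)")
    elif name not in new:
        print(f"{name} +0 -{old[name]} (deleted)")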
judgeval/run_evaluation.py
DELETED
@@ -1,684 +0,0 @@
-import asyncio
-import concurrent.futures
-import time
-import json
-import sys
-import threading
-from typing import List, Dict, Union, Optional, Callable, Tuple, Any
-from rich import print as rprint
-
-from judgeval.data import ScorerData, ScoringResult, Example, Trace
-from judgeval.scorers import BaseScorer, APIScorerConfig
-from judgeval.scorers.score import a_execute_scoring
-from judgeval.common.api import JudgmentApiClient
-from judgeval.constants import (
-    MAX_CONCURRENT_EVALUATIONS,
-)
-from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.common.api.api import JudgmentAPIException
-from judgeval.common.logger import judgeval_logger
-from judgeval.evaluation_run import EvaluationRun
-from judgeval.data.trace_run import TraceRun
-from judgeval.common.tracer import Tracer
-from langchain_core.callbacks import BaseCallbackHandler
-
-
-def safe_run_async(coro):
-    """
-    Safely run an async coroutine whether or not there's already an event loop running.
-
-    Args:
-        coro: The coroutine to run
-
-    Returns:
-        The result of the coroutine
-    """
-    try:
-        # Try to get the running loop
-        asyncio.get_running_loop()
-        # If we get here, there's already a loop running
-        # Run in a separate thread to avoid "asyncio.run() cannot be called from a running event loop"
-        with concurrent.futures.ThreadPoolExecutor() as executor:
-            future = executor.submit(asyncio.run, coro)
-            return future.result()
-    except RuntimeError:
-        # No event loop is running, safe to use asyncio.run()
-        return asyncio.run(coro)
-
-
-def send_to_rabbitmq(evaluation_run: EvaluationRun) -> Dict[str, Any]:
-    """
-    Sends an evaluation run to the RabbitMQ evaluation queue.
-    """
-    if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
-        raise ValueError("API key and organization ID are required")
-    if not evaluation_run.eval_name or not evaluation_run.project_name:
-        raise ValueError("Eval name and project name are required")
-    api_client = JudgmentApiClient(
-        evaluation_run.judgment_api_key, evaluation_run.organization_id
-    )
-    return api_client.add_to_evaluation_queue(
-        evaluation_run.eval_name, evaluation_run.project_name
-    )
-
-
-def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
-    """
-    Executes an evaluation of a list of `Example`s using one or more `JudgmentScorer`s via the Judgment API.
-
-    Args:
-        evaluation_run (EvaluationRun): The evaluation run object containing the examples, scorers, and metadata
-
-    Returns:
-        List[Dict]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult`
-        object.
-    """
-
-    try:
-        # submit API request to execute evals
-        if not evaluation_run.judgment_api_key or not evaluation_run.organization_id:
-            raise ValueError("API key and organization ID are required")
-        api_client = JudgmentApiClient(
-            evaluation_run.judgment_api_key, evaluation_run.organization_id
-        )
-        return api_client.run_evaluation(evaluation_run.model_dump())
-    except Exception as e:
-        judgeval_logger.error(f"Error: {e}")
-
-        details = "No details provided"
-        if isinstance(e, JudgmentAPIException):
-            details = e.response_json.get("detail", "No details provided")
-
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-
-
-def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
-    """
-    Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
-    """
-
-    try:
-        # submit API request to execute evals
-        if not judgment_api_key or not trace_run.organization_id:
-            raise ValueError("API key and organization ID are required")
-        api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
-        return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
-    except Exception as e:
-        judgeval_logger.error(f"Error: {e}")
-
-        details = "An unknown error occurred."
-        if isinstance(e, JudgmentAPIException):
-            details = e.response_json.get("detail", "An unknown error occurred.")
-
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-
-
-def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
-    """
-    Checks if any `ScoringResult` objects are missing `scorers_data`.
-
-    If any are missing, logs an error and returns the results.
-    """
-    for i, result in enumerate(results):
-        if not result.scorers_data:
-            judgeval_logger.error(
-                f"Scorer data is missing for example {i}. "
-                "This is usually caused when the example does not contain "
-                "the fields required by the scorer. "
-                "Check that your example contains the fields required by the scorers. "
-                "TODO add docs link here for reference."
-            )
-    return results
-
-
-def check_experiment_type(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    is_trace: bool,
-) -> None:
-    """
-    Checks if the current experiment, if one exists, has the same type (examples of traces)
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-    try:
-        api_client.check_experiment_type(eval_name, project_name, is_trace)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 422:
-            judgeval_logger.error(f"{e.response_json}")
-            raise ValueError(f"{e.response_json}")
-        else:
-            raise e
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
-def check_eval_run_name_exists(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> None:
-    """
-    Checks if an evaluation run name already exists for a given project.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-
-    Raises:
-        ValueError: If the evaluation run name already exists
-        JudgmentAPIError: If there's an API error during the check
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        api_client.check_eval_run_name_exists(eval_name, project_name)
-    except JudgmentAPIException as e:
-        if e.response.status_code == 409:
-            error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-            judgeval_logger.error(error_str)
-            raise ValueError(error_str)
-        else:
-            raise e
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-        raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
-def log_evaluation_results(
-    scoring_results: List[ScoringResult],
-    run: Union[EvaluationRun, TraceRun],
-    judgment_api_key: str,
-) -> str:
-    """
-    Logs evaluation results to the Judgment API database.
-
-    Args:
-        merged_results (List[ScoringResult]): The results to log
-        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
-        judgment_api_key (str): The API key for the Judgment API
-
-    Raises:
-        JudgmentAPIError: If there's an API error during logging
-        ValueError: If there's a validation error with the results
-    """
-    try:
-        if not judgment_api_key or not run.organization_id:
-            raise ValueError("API key and organization ID are required")
-
-        api_client = JudgmentApiClient(judgment_api_key, run.organization_id)
-        response = api_client.log_evaluation_results(
-            scoring_results,
-            run.model_dump(warnings=False),
-        )
-        url = response.get("ui_results_url")
-        return url
-
-    except Exception as e:
-        judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
-        raise JudgmentAPIError(
-            f"Request failed while saving evaluation results to DB: {str(e)}"
-        )
-
-
-def check_examples(
-    examples: List[Example], scorers: List[Union[APIScorerConfig, BaseScorer]]
-) -> None:
-    """
-    Checks if the example contains the necessary parameters for the scorer.
-    """
-    prompt_user = False
-    for scorer in scorers:
-        for example in examples:
-            missing_params = []
-            for param in scorer.required_params:
-                if getattr(example, param.value) is None:
-                    missing_params.append(f"{param.value}")
-            if missing_params:
-                rprint(
-                    f"[yellow]⚠️ WARNING:[/yellow] Example is missing required parameters for scorer [bold]{scorer.score_type.value}[/bold]"
-                )
-                rprint(f"Missing parameters: {', '.join(missing_params)}")
-                rprint(f"Example: {json.dumps(example.model_dump(), indent=2)}")
-                rprint("-" * 40)
-                prompt_user = True
-
-    if prompt_user:
-        user_input = input("Do you want to continue? (y/n)")
-        if user_input.lower() != "y":
-            sys.exit(0)
-        else:
-            rprint("[green]Continuing...[/green]")
-
-
-def run_trace_eval(
-    trace_run: TraceRun,
-    judgment_api_key: str,
-    override: bool = False,
-    function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer, BaseCallbackHandler]] = None,
-    examples: Optional[List[Example]] = None,
-) -> List[ScoringResult]:
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not trace_run.append:
-        check_eval_run_name_exists(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-        )
-
-    if trace_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples or traces)
-        check_experiment_type(
-            trace_run.eval_name,
-            trace_run.project_name,
-            judgment_api_key,
-            trace_run.organization_id,
-            True,
-        )
-    if function and tracer and examples is not None:
-        new_traces: List[Trace] = []
-
-        # Handle case where tracer is actually a callback handler
-        actual_tracer = tracer
-        if hasattr(tracer, "tracer") and hasattr(tracer.tracer, "traces"):
-            # This is a callback handler, get the underlying tracer
-            actual_tracer = tracer.tracer
-
-        actual_tracer.offline_mode = True
-        actual_tracer.traces = []
-        judgeval_logger.info("Running agent function: ")
-        for example in examples:
-            if example.input:
-                if isinstance(example.input, str):
-                    function(example.input)
-                elif isinstance(example.input, dict):
-                    function(**example.input)
-                else:
-                    raise ValueError(
-                        f"Input must be string or dict, got {type(example.input)}"
-                    )
-            else:
-                function()
-
-        for i, trace in enumerate(actual_tracer.traces):
-            # We set the root-level trace span with the expected tools of the Trace
-            trace = Trace(**trace)
-            trace.trace_spans[0].expected_tools = examples[i].expected_tools
-            new_traces.append(trace)
-        trace_run.traces = new_traces
-        actual_tracer.traces = []
-
-    # Execute evaluation using Judgment API
-    try:  # execute an EvaluationRun with just JudgmentScorers
-        judgeval_logger.info("Executing Trace Evaluation... ")
-        response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
-        scoring_results = [
-            ScoringResult(**result) for result in response_data["results"]
-        ]
-    except JudgmentAPIError as e:
-        raise JudgmentAPIError(
-            f"An error occurred while executing the Judgment API request: {str(e)}"
-        )
-    except ValueError as e:
-        raise ValueError(
-            f"Please check your TraceRun object, one or more fields are invalid: {str(e)}"
-        )
-
-    # Convert the response data to `ScoringResult` objects
-    # TODO: allow for custom scorer on traces
-
-    url = log_evaluation_results(
-        response_data["agent_results"], trace_run, judgment_api_key
-    )
-    rprint(
-        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-    )
-    return scoring_results
-
-
-async def get_evaluation_status(
-    eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
-) -> Dict:
-    """
-    Gets the status of an async evaluation run.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-
-    Returns:
-        Dict: Status information including:
-            - status: 'pending', 'running', 'completed', or 'failed'
-            - results: List of ScoringResult objects if completed
-            - error: Error message if failed
-    """
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    try:
-        return api_client.get_evaluation_status(eval_name, project_name)
-    except Exception as e:
-        raise JudgmentAPIError(
-            f"An error occurred while checking evaluation status: {str(e)}"
-        )
-
-
-def retrieve_counts(result: Dict):
-    scorer_data_count = 0
-    for example in result.get("examples", []):
-        for scorer in example.get("scorer_data", []):
-            scorer_data_count += 1
-    return scorer_data_count
-
-
-def _poll_evaluation_until_complete(
-    eval_name: str,
-    project_name: str,
-    judgment_api_key: str,
-    organization_id: str,
-    expected_scorer_data_count: int,
-    poll_interval_seconds: float = 5,
-    max_failures: int = 5,
-    max_poll_count: int = 24,  # This should be equivalent to 120 seconds
-) -> Tuple[List[ScoringResult], str]:
-    """
-    Polls until the evaluation is complete and returns the results.
-
-    Args:
-        eval_name (str): Name of the evaluation run
-        project_name (str): Name of the project
-        judgment_api_key (str): API key for authentication
-        organization_id (str): Organization ID for the evaluation
-        poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
-        original_examples (List[Example], optional): The original examples sent for evaluation.
-            If provided, will match results with original examples.
-
-    Returns:
-        List[ScoringResult]: The evaluation results
-    """
-    poll_count = 0
-    exception_count = 0
-    api_client = JudgmentApiClient(judgment_api_key, organization_id)
-    while poll_count < max_poll_count:
-        poll_count += 1
-        try:
-            # Check status
-            status_response = api_client.get_evaluation_status(eval_name, project_name)
-
-            if status_response.get("status") != "completed":
-                time.sleep(poll_interval_seconds)
-                continue
-
-            results_response = api_client.fetch_evaluation_results(
-                project_name, eval_name
-            )
-            url = results_response.get("ui_results_url")
-
-            if results_response.get("examples") is None:
-                time.sleep(poll_interval_seconds)
-                continue
-
-            examples_data = results_response.get("examples", [])
-            scoring_results = []
-            scorer_data_count = 0
-
-            for example_data in examples_data:
-                scorer_data_list = []
-                for raw_scorer_data in example_data.get("scorer_data", []):
-                    scorer_data = ScorerData(**raw_scorer_data)
-                    scorer_data_list.append(scorer_data)
-                    scorer_data_count += 1
-
-                example = Example(**example_data)
-
-                success = all(scorer_data.success for scorer_data in scorer_data_list)
-                scoring_result = ScoringResult(
-                    success=success,
-                    scorers_data=scorer_data_list,
-                    data_object=example,
-                )
-                scoring_results.append(scoring_result)
-
-            if scorer_data_count != expected_scorer_data_count:
-                time.sleep(poll_interval_seconds)
-                continue
-
-            return scoring_results, url
-        except Exception as e:
-            exception_count += 1
-            if isinstance(e, JudgmentAPIError):
-                raise
-
-            judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
-            if exception_count > max_failures:
-                raise JudgmentAPIError(
-                    f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
-                )

-            time.sleep(poll_interval_seconds)
-
-    raise JudgmentAPIError(
-        f"Error checking evaluation status after {poll_count} attempts"
-    )
-
-
-def progress_logger(stop_event, msg="Working...", interval=5):
-    start = time.time()
-    while not stop_event.is_set():
-        elapsed = int(time.time() - start)
-        judgeval_logger.info(f"{msg} ({elapsed} sec)")
-        stop_event.wait(interval)
-
-
-def run_eval(
-    evaluation_run: EvaluationRun,
-    judgment_api_key: str,
-    override: bool = False,
-) -> List[ScoringResult]:
-    """
-    Executes an evaluation of `Example`s using one or more `Scorer`s
-
-    Args:
-        evaluation_run (EvaluationRun): Stores example and evaluation together for running
-        override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.
-
-    Returns:
-        List[ScoringResult]: A list of ScoringResult objects
-    """
-
-    # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-    if not override and not evaluation_run.append:
-        check_eval_run_name_exists(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-        )
-
-    if evaluation_run.append:
-        # Check that the current experiment, if one exists, has the same type (examples of traces)
-        check_experiment_type(
-            evaluation_run.eval_name,
-            evaluation_run.project_name,
-            judgment_api_key,
-            evaluation_run.organization_id,
-            False,
-        )
-
-    # Set example IDs if not already set
-    for idx, example in enumerate(evaluation_run.examples):
-        example.example_index = idx  # Set numeric index
-
-    judgment_scorers: List[APIScorerConfig] = []
-    local_scorers: List[BaseScorer] = []
-    for scorer in evaluation_run.scorers:
-        if isinstance(scorer, APIScorerConfig):
-            judgment_scorers.append(scorer)
-        else:
-            local_scorers.append(scorer)
-
-    results: List[ScoringResult] = []
-    url = ""
-
-    if len(local_scorers) > 0 and len(judgment_scorers) > 0:
-        error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
-        judgeval_logger.error(error_msg)
-        raise ValueError(error_msg)
-
-    if len(judgment_scorers) > 0:
-        check_examples(evaluation_run.examples, judgment_scorers)
-        stop_event = threading.Event()
-        t = threading.Thread(
-            target=progress_logger, args=(stop_event, "Running evaluation...")
-        )
-        t.start()
-        try:
-            api_client = JudgmentApiClient(
-                judgment_api_key, evaluation_run.organization_id
-            )
-            response = api_client.add_to_evaluation_queue(
-                evaluation_run.model_dump(warnings=False)
-            )
-
-            if not response.get("success", False):
-                error_message = response.error
-                judgeval_logger.error(
-                    f"Error adding evaluation to queue: {error_message}"
-                )
-                raise JudgmentAPIError(error_message)
-
-            old_scorer_data_count = 0
-            if evaluation_run.append:
-                try:
-                    results_response = api_client.fetch_evaluation_results(
-                        evaluation_run.project_name, evaluation_run.eval_name
-                    )
-                    old_scorer_data_count = retrieve_counts(results_response)
-                except Exception:
-                    # This usually means the user did append = True but the eval run name doesn't exist yet
-                    pass
-
-            results, url = _poll_evaluation_until_complete(
-                eval_name=evaluation_run.eval_name,
-                project_name=evaluation_run.project_name,
-                judgment_api_key=judgment_api_key,
-                organization_id=evaluation_run.organization_id,
-                expected_scorer_data_count=(
-                    len(evaluation_run.scorers) * len(evaluation_run.examples)
-                )
-                + old_scorer_data_count,
-            )
-        finally:
-            stop_event.set()
-            t.join()
-
-    if len(local_scorers) > 0:
-        results = safe_run_async(
-            a_execute_scoring(
-                evaluation_run.examples,
-                local_scorers,
-                model=evaluation_run.model,
-                throttle_value=0,
-                max_concurrent=MAX_CONCURRENT_EVALUATIONS,
-            )
-        )
-
-        send_results = [
-            scoring_result.model_dump(warnings=False) for scoring_result in results
-        ]
-
-        url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
-    rprint(
-        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-    )
-    return results
-
-
-def assert_test(scoring_results: List[ScoringResult]) -> None:
-    """
-    Collects all failed scorers from the scoring results.
-
-    Args:
-        ScoringResults (List[ScoringResult]): List of scoring results to check
-
-    Returns:
-        None. Raises exceptions for any failed test cases.
-    """
-    failed_cases: List[ScorerData] = []
-
-    for result in scoring_results:
-        if not result.success:
-            # Create a test case context with all relevant fields
-            test_case: Dict = {"failed_scorers": []}
-            if result.scorers_data:
-                # If the result was not successful, check each scorer_data
-                for scorer_data in result.scorers_data:
-                    if not scorer_data.success:
-                        if scorer_data.name == "Tool Order":
-                            # Remove threshold, evaluation model for Tool Order scorer
-                            scorer_data.threshold = None
-                            scorer_data.evaluation_model = None
-                        test_case["failed_scorers"].append(scorer_data)
-            failed_cases.append(test_case)
-
-    if failed_cases:
-        error_msg = "The following test cases failed: \n"
-        for fail_case in failed_cases:
-            for fail_scorer in fail_case["failed_scorers"]:
-                error_msg += (
-                    f"\nScorer Name: {fail_scorer.name}\n"
-                    f"Threshold: {fail_scorer.threshold}\n"
-                    f"Success: {fail_scorer.success}\n"
-                    f"Score: {fail_scorer.score}\n"
-                    f"Reason: {fail_scorer.reason}\n"
-                    f"Strict Mode: {fail_scorer.strict_mode}\n"
-                    f"Evaluation Model: {fail_scorer.evaluation_model}\n"
-                    f"Error: {fail_scorer.error}\n"
-                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
-                )
-                error_msg += "-" * 100
-
-    total_tests = len(scoring_results)
-    failed_tests = len(failed_cases)
-    passed_tests = total_tests - failed_tests
-
-    # Print summary with colors
-    rprint("\n" + "=" * 80)
-    if failed_tests == 0:
-        rprint(
-            f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
-        )
-    else:
-        rprint(
-            f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
-        )
-    rprint("=" * 80 + "\n")
-
-    # Print individual test cases
-    for i, result in enumerate(scoring_results):
-        test_num = i + 1
-        if result.success:
-            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
-        else:
-            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
-            if result.scorers_data:
-                for scorer_data in result.scorers_data:
-                    if not scorer_data.success:
-                        rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
-                        rprint(f" [red] Score: {scorer_data.score}[/red]")
-                        rprint(f" [red] Reason: {scorer_data.reason}[/red]")
-                        if scorer_data.error:
-                            rprint(f" [red] Error: {scorer_data.error}[/red]")
-            rprint(" " + "-" * 40)
-
-    rprint("\n" + "=" * 80)
-    if failed_tests > 0:
-        raise AssertionError(failed_cases)
judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py
DELETED
@@ -1,14 +0,0 @@
-"""
-`judgeval` answer relevancy scorer
-
-TODO add link to docs page for this scorer
-
-"""
-
-# Internal imports
-from judgeval.scorers.api_scorer import APIScorerConfig
-from judgeval.constants import APIScorerType
-
-
-class DerailmentScorer(APIScorerConfig):
-    score_type: APIScorerType = APIScorerType.DERAILMENT