judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/evaluation/__init__.py
ADDED

```diff
@@ -0,0 +1,346 @@
+from __future__ import annotations
+
+import asyncio
+import concurrent.futures
+import time
+import threading
+from typing import Any, List, Tuple, TYPE_CHECKING
+from rich import print as rprint
+
+from judgeval.data import ScorerData, ScoringResult
+from judgeval.scorers.score import a_execute_scoring
+from judgeval.api import JudgmentSyncClient
+from judgeval.env import (
+    JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
+)
+from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError
+from judgeval.logger import judgeval_logger
+
+from judgeval.env import JUDGMENT_API_KEY, JUDGMENT_ORG_ID
+
+if TYPE_CHECKING:
+    from judgeval.data.evaluation_run import ExampleEvaluationRun
+
+
+def safe_run_async(coro):
+    """
+    Safely run an async coroutine whether or not there's already an event loop running.
+
+    Args:
+        coro: The coroutine to run
+
+    Returns:
+        The result of the coroutine
+    """
+    try:
+        # Try to get the running loop
+        asyncio.get_running_loop()
+        # If we get here, there's already a loop running
+        # Run in a separate thread to avoid "asyncio.run() cannot be called from a running event loop"
+        with concurrent.futures.ThreadPoolExecutor() as executor:
+            future = executor.submit(asyncio.run, coro)
+            return future.result()
+    except RuntimeError:
+        # No event loop is running, safe to use asyncio.run()
+        return asyncio.run(coro)
+
+
+def log_evaluation_results(
+    scoring_results: List[Any],
+    run: ExampleEvaluationRun,
+) -> str:
+    """
+    Logs evaluation results to the Judgment API database.
+
+    Args:
+        merged_results (List[ScoringResult]): The results to log
+        evaluation_run (EvaluationRun): The evaluation run containing project info and API key
+        judgment_api_key (str): The API key for the Judgment API
+
+    Raises:
+        JudgmentAPIError: If there's an API error during logging
+        ValueError: If there's a validation error with the results
+    """
+    try:
+        if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+            raise ValueError("API key and organization ID are required")
+
+        api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+        response = api_client.log_eval_results(
+            {
+                "results": scoring_results,  # type: ignore
+                "run": run.model_dump(warnings=False),  # type: ignore
+            }
+        )
+        url = response.get("ui_results_url")
+        assert url is not None
+        return url
+
+    except Exception as e:
+        judgeval_logger.error(f"Failed to save evaluation results to DB: {str(e)}")
+        raise JudgmentRuntimeError(
+            f"Request failed while saving evaluation results to DB: {str(e)}"
+        )
+
+
+def _poll_evaluation_until_complete(
+    evaluation_run: ExampleEvaluationRun,
+    expected_examples_count: int,
+    poll_interval_seconds: float = 5,
+    max_failures: int = 5,
+    max_poll_count: int = 60,  # This should be equivalent to 5 minutes
+) -> Tuple[List[ScoringResult], str]:
+    """
+    Polls until the evaluation is complete and returns the results.
+
+    Args:
+        eval_name (str): Name of the evaluation run
+        project_name (str): Name of the project
+        judgment_api_key (str): API key for authentication
+        organization_id (str): Organization ID for the evaluation
+        poll_interval_seconds (int, optional): Time between status checks in seconds. Defaults to 5.
+        original_examples (List[Example], optional): The original examples sent for evaluation.
+            If provided, will match results with original examples.
+
+    Returns:
+        List[ScoringResult]: The evaluation results
+    """
+    project_name = evaluation_run.project_name
+    experiment_run_id = evaluation_run.id
+
+    if not project_name or not experiment_run_id:
+        raise ValueError("Project name and experiment run ID are required")
+
+    poll_count = 0
+    exception_count = 0
+    if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+        raise ValueError("Judgment API key and organization ID are required")
+    api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+    while poll_count < max_poll_count:
+        poll_count += 1
+        try:
+            # Check status
+            results_response = api_client.fetch_experiment_run(
+                {
+                    "experiment_run_id": experiment_run_id,
+                    "project_name": project_name,
+                }
+            )
+
+            example_scorer_pairings = results_response.get("results", []) or []
+            if len(example_scorer_pairings) != expected_examples_count:
+                time.sleep(poll_interval_seconds)
+                continue
+
+            url = results_response.get("ui_results_url")
+
+            scoring_result_list = []
+            for res in example_scorer_pairings:
+                example = res.get("data", {}).copy()
+                example["example_id"] = res.get("example_id")
+                scoring_result = ScoringResult(
+                    scorers_data=res.get("scorers", []),
+                    success=all(
+                        t.get("success", False) for t in res.get("scorers", [])
+                    ),
+                    data_object=example,
+                )
+                scoring_result_list.append(scoring_result)
+
+            assert url is not None
+            return scoring_result_list, url
+        except Exception as e:
+            exception_count += 1
+            if isinstance(e, JudgmentAPIError):
+                raise
+
+            judgeval_logger.error(f"Error checking evaluation status: {str(e)}")
+            if exception_count > max_failures:
+                raise JudgmentRuntimeError(
+                    f"Error checking evaluation status after {poll_count} attempts: {str(e)}"
+                )
+
+        time.sleep(poll_interval_seconds)
+
+    raise JudgmentRuntimeError(
+        f"Error checking evaluation status after {poll_count} attempts"
+    )
+
+
+def progress_logger(stop_event, msg="Working...", interval=5):
+    start = time.time()
+    while not stop_event.is_set():
+        elapsed = int(time.time() - start)
+        judgeval_logger.info(f"{msg} ({elapsed} sec)")
+        stop_event.wait(interval)
+
+
+def run_eval(
+    evaluation_run: ExampleEvaluationRun,
+) -> List[ScoringResult]:
+    """
+    Executes an evaluation of `Example`s using one or more `Scorer`s
+
+    Args:
+        evaluation_run (ExampleEvaluationRun): Stores example and evaluation together for running
+
+    Returns:
+        List[ScoringResult]: A list of ScoringResult objects
+    """
+    # Check that every example has the same keys
+    keys = evaluation_run.examples[0].get_fields().keys()
+    for example in evaluation_run.examples:
+        current_keys = example.get_fields().keys()
+        if current_keys != keys:
+            raise ValueError(
+                f"All examples must have the same keys: {current_keys} != {keys}"
+            )
+
+    results: List[ScoringResult] = []
+    url = ""
+
+    if (
+        len(evaluation_run.custom_scorers) > 0
+        and len(evaluation_run.judgment_scorers) > 0
+    ):
+        error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
+        judgeval_logger.error(error_msg)
+        raise ValueError(error_msg)
+
+    e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+    if evaluation_run.judgment_scorers or e2b_scorers:
+        if evaluation_run.judgment_scorers and e2b_scorers:
+            error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        if len(e2b_scorers) > 1:
+            error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+            judgeval_logger.error(error_msg)
+            raise ValueError(error_msg)
+
+        stop_event = threading.Event()
+        t = threading.Thread(
+            target=progress_logger, args=(stop_event, "Running evaluation...")
+        )
+        t.start()
+        try:
+            if not JUDGMENT_API_KEY or not JUDGMENT_ORG_ID:
+                raise ValueError("Judgment API key and organization ID are required")
+            api_client = JudgmentSyncClient(JUDGMENT_API_KEY, JUDGMENT_ORG_ID)
+            response = api_client.add_to_run_eval_queue_examples(
+                evaluation_run.model_dump(warnings=False)  # type: ignore
+            )
+
+            if not response.get("success", False):
+                error_message = response.error
+                judgeval_logger.error(
+                    f"Error adding evaluation to queue: {error_message}"
+                )
+                raise JudgmentRuntimeError(error_message)
+
+            results, url = _poll_evaluation_until_complete(
+                evaluation_run=evaluation_run,
+                expected_examples_count=len(evaluation_run.examples),
+            )
+        finally:
+            stop_event.set()
+            t.join()
+    else:
+        results = safe_run_async(
+            a_execute_scoring(
+                evaluation_run.examples,
+                evaluation_run.custom_scorers,
+                model=evaluation_run.model,
+                throttle_value=0,
+                max_concurrent=JUDGMENT_MAX_CONCURRENT_EVALUATIONS,
+            )
+        )
+
+    send_results = [
+        scoring_result.model_dump(warnings=False) for scoring_result in results
+    ]
+    url = log_evaluation_results(send_results, evaluation_run)
+    rprint(
+        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+    )
+    return results
+
+
+def assert_test(scoring_results: List[ScoringResult]) -> None:
+    """
+    Collects all failed scorers from the scoring results.
+
+    Args:
+        ScoringResults (List[ScoringResult]): List of scoring results to check
+
+    Returns:
+        None. Raises exceptions for any failed test cases.
+    """
+    failed_cases: List[List[ScorerData]] = []
+
+    for result in scoring_results:
+        if not result.success:
+            # Create a test case context with all relevant fields
+            test_case: List[ScorerData] = []
+            if result.scorers_data:
+                # If the result was not successful, check each scorer_data
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        test_case.append(scorer_data)
+            failed_cases.append(test_case)
+
+    if failed_cases:
+        error_msg = "The following test cases failed: \n"
+        for fail_case in failed_cases:
+            for fail_scorer in fail_case:
+                error_msg += (
+                    f"\nScorer Name: {fail_scorer.name}\n"
+                    f"Threshold: {fail_scorer.threshold}\n"
+                    f"Success: {fail_scorer.success}\n"
+                    f"Score: {fail_scorer.score}\n"
+                    f"Reason: {fail_scorer.reason}\n"
+                    f"Strict Mode: {fail_scorer.strict_mode}\n"
+                    f"Evaluation Model: {fail_scorer.evaluation_model}\n"
+                    f"Error: {fail_scorer.error}\n"
+                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
+                )
+            error_msg += "-" * 100
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    # Print summary with colors
+    rprint("\n" + "=" * 80)
+    if failed_tests == 0:
+        rprint(
+            f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
+        )
+    else:
+        rprint(
+            f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
+        )
+    rprint("=" * 80 + "\n")
+
+    # Print individual test cases
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f" [red] Score: {scorer_data.score}[/red]")
+                        rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f" [red] Error: {scorer_data.error}[/red]")
+        rprint(" " + "-" * 40)
+
+    rprint("\n" + "=" * 80)
+    if failed_tests > 0:
+        raise AssertionError(failed_cases)
```
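For orientation, here is a minimal usage sketch of the `run_eval` / `assert_test` entry points added above. The `ExampleEvaluationRun` constructor keywords are inferred from the attributes `run_eval` reads (`examples`, `custom_scorers`, `judgment_scorers`, `model`, `project_name`) and are not confirmed by this diff; `my_examples` and `my_local_scorer` are placeholders.

```python
# Illustrative sketch only: the real ExampleEvaluationRun constructor may differ.
from judgeval.data.evaluation_run import ExampleEvaluationRun
from judgeval.evaluation import run_eval, assert_test

run = ExampleEvaluationRun(
    project_name="demo-project",       # used when polling fetch_experiment_run
    examples=my_examples,              # placeholder: Example objects; all must expose the same get_fields() keys
    custom_scorers=[my_local_scorer],  # placeholder: local scorers take the a_execute_scoring path
    model="gpt-4.1",                   # placeholder model name
)

results = run_eval(run)  # queues remotely for Judgment/hosted scorers, otherwise scores locally
assert_test(results)     # raises AssertionError listing every failed scorer
```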
judgeval/exceptions.py
ADDED
```diff
@@ -0,0 +1,28 @@
+from __future__ import annotations
+
+from httpx import HTTPError, Response
+from typing import Optional
+
+
+class JudgmentAPIError(HTTPError):
+    status_code: int
+    detail: str
+    response: Optional[Response]
+
+    def __init__(self, status_code: int, detail: str, response: Optional[Response]):
+        self.status_code = status_code
+        self.detail = detail
+        self.response = response
+        super().__init__(f"{status_code}: {detail}")
+
+
+class JudgmentTestError(Exception): ...
+
+
+class JudgmentRuntimeError(RuntimeError): ...
+
+
+class InvalidJudgeModelError(Exception): ...
+
+
+__all__ = ("JudgmentAPIError", "JudgmentRuntimeError", "InvalidJudgeModelError")
```
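A short, hedged example of how calling code might handle these new exception types; `client` and `payload` are placeholders, and `fetch_experiment_run` is the client method used by the evaluation module above.

```python
from judgeval.exceptions import JudgmentAPIError, JudgmentRuntimeError

def fetch_with_reporting(client, payload):
    """Hypothetical helper: surface API failures with their status code and detail."""
    try:
        return client.fetch_experiment_run(payload)
    except JudgmentAPIError as e:
        # status_code and detail are set in JudgmentAPIError.__init__ above
        print(f"Judgment API error {e.status_code}: {e.detail}")
        raise JudgmentRuntimeError(f"fetch failed: {e.detail}") from e
```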
judgeval/integrations/langgraph/__init__.py
ADDED

```diff
@@ -0,0 +1,13 @@
+from __future__ import annotations
+
+from abc import ABC
+import os
+
+
+class Langgraph(ABC):
+    @staticmethod
+    def initialize(otel_only: bool = True):
+        os.environ["LANGSMITH_OTEL_ENABLED"] = "true"
+        os.environ["LANGSMITH_TRACING"] = "true"
+        if otel_only:
+            os.environ["LANGSMITH_OTEL_ONLY"] = "true"
```
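Since `Langgraph.initialize()` only sets LangSmith environment variables, a usage sketch is straightforward; it should be called before any LangGraph/LangSmith code reads those variables.

```python
import os

from judgeval.integrations.langgraph import Langgraph

Langgraph.initialize()  # otel_only defaults to True
assert os.environ["LANGSMITH_OTEL_ENABLED"] == "true"
assert os.environ["LANGSMITH_TRACING"] == "true"
assert os.environ["LANGSMITH_OTEL_ONLY"] == "true"  # not set when otel_only=False
```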
judgeval/integrations/openlit/__init__.py
ADDED

```diff
@@ -0,0 +1,51 @@
+from abc import ABC
+from judgeval.tracer import Tracer
+from judgeval.logger import judgeval_logger
+from judgeval.utils.url import url_for
+from judgeval.utils.project import _resolve_project_id
+
+
+try:
+    import openlit  # type: ignore
+except ImportError:
+    raise ImportError(
+        "Openlit is not installed and required for the openlit integration. Please install it with `pip install openlit`."
+    )
+
+
+class Openlit(ABC):
+    @staticmethod
+    def initialize(
+        **kwargs,
+    ):
+        tracer = Tracer.get_instance()
+        if not tracer or not tracer._initialized:
+            raise ValueError(
+                "Openlit must be initialized after the tracer has been initialized. Please create the Tracer instance first before initializing Openlit."
+            )
+
+        api_key = tracer.api_key
+        organization_id = tracer.organization_id
+        project_name = tracer.project_name
+
+        project_id = _resolve_project_id(project_name, api_key, organization_id)
+        if not project_id:
+            judgeval_logger.warning(
+                f"Project {project_name} not found. Please create it first at https://app.judgmentlabs.ai/org/{organization_id}/projects."
+            )
+            return
+
+        openlit.init(
+            service_name=project_name,
+            otlp_endpoint=url_for("/otel"),
+            otlp_headers={
+                "Authorization": f"Bearer {api_key}",
+                "X-Organization-Id": organization_id,
+                "X-Project-Id": project_id,
+            },
+            tracer=tracer.get_tracer(),
+            **kwargs,
+        )
+
+
+__all__ = ["Openlit"]
```
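A sketch of the expected call order under stated assumptions: the `Tracer` constructor keywords below are not shown in this diff and are inferred from the attributes `Openlit.initialize` reads from the tracer (`api_key`, `organization_id`, `project_name`); the credentials are placeholders.

```python
from judgeval.tracer import Tracer
from judgeval.integrations.openlit import Openlit

# Assumed constructor keywords; initialize() raises if no initialized Tracer exists.
tracer = Tracer(
    project_name="demo-project",  # must already exist, or initialize() logs a warning and returns
    api_key="sk-judgment-...",    # placeholder credential
    organization_id="org_...",    # placeholder org id
)

Openlit.initialize()  # any extra kwargs are forwarded to openlit.init(...)
```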
judgeval/judges/__init__.py
CHANGED
```diff
@@ -1,6 +1,6 @@
 from judgeval.judges.base_judge import JudgevalJudge
 from judgeval.judges.litellm_judge import LiteLLMJudge
 from judgeval.judges.together_judge import TogetherJudge
-from judgeval.judges.mixture_of_judges import MixtureOfJudges
 
-
+
+__all__ = ["JudgevalJudge", "LiteLLMJudge", "TogetherJudge"]
```
judgeval/judges/litellm_judge.py
CHANGED
```diff
@@ -1,20 +1,77 @@
 import pydantic
-from typing import List, Union, Mapping
+from typing import Dict, List, Union, Mapping, Any
 
+from judgeval.constants import ACCEPTABLE_MODELS
 from judgeval.judges import JudgevalJudge
-from judgeval.
-
-
-
-
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+
+try:
+    import litellm
+except ImportError:
+    raise ImportError(
+        "Litellm is not installed and required for the litellm judge. Please install it with `pip install litellm`."
+    )
+
+
+def fetch_litellm_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if response_format is not None:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+            response_format=response_format,
+        )
+    else:
+        response = litellm.completion(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from litellm")
+    return content
+
+
+async def afetch_litellm_api_response(
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[Dict[str, Any], None] = None,
+) -> str:
+    if not messages:
+        raise ValueError("Messages cannot be empty")
+
+    if model not in ACCEPTABLE_MODELS:
+        raise ValueError(
+            f"Model {model} is not in the list of supported models: {ACCEPTABLE_MODELS}."
+        )
+
+    if response_format is not None:
+        response = await litellm.acompletion(
+            model=model, messages=messages, response_format=response_format
+        )
+    else:
+        response = await litellm.acompletion(
+            model=model,
+            messages=messages,
+        )
+
+    content = response.choices[0].message.content  # type: ignore[attr-defined]
+    if content is None:
+        raise ValueError("Received empty response from litellm")
+    return content
+
 
 BASE_CONVERSATION = [
     {"role": "system", "content": "You are a helpful assistant."},
-]
+]
 
 
 class LiteLLMJudge(JudgevalJudge):
-    def __init__(self, model: str =
+    def __init__(self, model: str = JUDGMENT_DEFAULT_GPT_MODEL, **kwargs):
         self.model = model
         self.kwargs = kwargs
         super().__init__(model_name=model)
@@ -22,19 +79,21 @@ class LiteLLMJudge(JudgevalJudge):
     def generate(
         self,
         input: Union[str, List[Mapping[str, str]]],
-        schema: pydantic.BaseModel = None,
+        schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             return fetch_litellm_api_response(
-                model=self.model, messages=convo, response_format=
+                model=self.model, messages=convo, response_format=response_format
             )
         elif isinstance(input, list):
+            messages = [dict(msg) for msg in input]
             return fetch_litellm_api_response(
-                model=self.model, messages=
+                model=self.model, messages=messages, response_format=response_format
             )
         else:
-            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError(
                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
             )
@@ -42,21 +101,23 @@ class LiteLLMJudge(JudgevalJudge):
     async def a_generate(
        self,
        input: Union[str, List[Mapping[str, str]]],
-        schema: pydantic.BaseModel = None,
+        schema: Union[pydantic.BaseModel, None] = None,
     ) -> str:
+        response_format = schema.model_json_schema() if schema else None
+
         if isinstance(input, str):
             convo = BASE_CONVERSATION + [{"role": "user", "content": input}]
             response = await afetch_litellm_api_response(
-                model=self.model, messages=convo, response_format=
+                model=self.model, messages=convo, response_format=response_format
             )
             return response
         elif isinstance(input, list):
+            messages = [dict(msg) for msg in input]
             response = await afetch_litellm_api_response(
-                model=self.model, messages=
+                model=self.model, messages=messages, response_format=response_format
             )
             return response
         else:
-            judgeval_logger.error(f"Invalid input type received: {type(input)}")
             raise TypeError(
                 f"Input must be a string or a list of dictionaries. Input type of: {type(input)}"
             )
```