judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
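Two entries above are straight renames (`judgeval/common/logger.py → judgeval/logger.py`, `judgeval/version_check.py → judgeval/utils/version_check.py`), and the removed `judgeval/common/*` package is replaced by flat top-level modules (`judgeval/api`, `judgeval/exceptions`, `judgeval/env`, `judgeval/logger`). A minimal import-migration sketch, inferred from those moves and from the import changes visible in the diffs below; the old-path comments are assumptions, so verify against your pinned version:

# Hypothetical 0.1.0 -> 0.23.0 import updates, inferred from the file moves above
# and the new-style imports that appear in the diffs below.
from judgeval.logger import judgeval_logger        # previously under judgeval.common.logger
from judgeval.exceptions import JudgmentAPIError   # previously under judgeval.common
from judgeval.api import JudgmentSyncClient        # replaces the judgeval.common.api client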
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py
CHANGED

@@ -1,28 +1,51 @@
-from judgeval.scorers.api_scorer import
+from judgeval.scorers.api_scorer import (
+    APIScorerConfig,
+    ExampleAPIScorerConfig,
+    TraceAPIScorerConfig,
+)
 from judgeval.constants import APIScorerType
-from typing import
-from judgeval.
+from typing import Dict, Any, Optional
+from judgeval.api import JudgmentSyncClient
+from judgeval.exceptions import JudgmentAPIError
 import os
-from judgeval.
+from judgeval.logger import judgeval_logger
+from abc import ABC
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
+from copy import copy
+from judgeval.utils.decorators.dont_throw import dont_throw


 def push_prompt_scorer(
     name: str,
     prompt: str,
-
+    threshold: float,
+    options: Optional[Dict[str, float]] = None,
+    model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+    description: Optional[str] = None,
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+    is_trace: bool = False,
 ) -> str:
-    client =
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        r = client.save_scorer(
-
-
-
-
-
-
-
+        r = client.save_scorer(
+            payload={
+                "name": name,
+                "prompt": prompt,
+                "threshold": threshold,
+                "options": options,
+                "model": model,
+                "description": description,
+                "is_trace": is_trace,
+            }
+        )
+    except JudgmentAPIError as e:
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to save prompt scorer: {e.detail}",
+            response=e.response,
+        )
+    return r["scorer_response"]["name"]


 def fetch_prompt_scorer(
@@ -30,19 +53,26 @@ def fetch_prompt_scorer(
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ):
-    client =
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-
-
-
-        return scorer_config
-    except JudgmentAPIException as e:
-        if e.status_code == 500:
+        fetched_scorers = client.fetch_scorers({"names": [name]})
+        if len(fetched_scorers["scorers"]) == 0:
+            judgeval_logger.error(f"Prompt scorer '{name}' not found")
             raise JudgmentAPIError(
-
+                status_code=404,
+                detail=f"Prompt scorer '{name}' not found",
+                response=None,  # type: ignore
             )
+        else:
+            scorer_config = fetched_scorers["scorers"][0]
+            scorer_config.pop("created_at")
+            scorer_config.pop("updated_at")
+            return scorer_config
+    except JudgmentAPIError as e:
         raise JudgmentAPIError(
-
+            status_code=e.status_code,
+            detail=f"Failed to fetch prompt scorer '{name}': {e.detail}",
+            response=e.response,
         )


@@ -51,33 +81,33 @@ def scorer_exists(
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
 ):
-    client =
+    client = JudgmentSyncClient(judgment_api_key, organization_id)
     try:
-        return client.scorer_exists(name)["exists"]
-    except
+        return client.scorer_exists({"name": name})["exists"]
+    except JudgmentAPIError as e:
         if e.status_code == 500:
             raise JudgmentAPIError(
-
+                status_code=e.status_code,
+                detail=f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.detail}",
+                response=e.response,
             )
-        raise JudgmentAPIError(
-
-
-
-
-    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
-    1. a system role that may involve the Example object
-    2. options for scores on the example
+        raise JudgmentAPIError(
+            status_code=e.status_code,
+            detail=f"Failed to check if scorer exists: {e.detail}",
+            response=e.response,
+        )

-    and uses a judge to execute the evaluation from the system role and classify into one of the options
-    """

+class BasePromptScorer(ABC, APIScorerConfig):
+    score_type: APIScorerType
     prompt: str
-    options:
-
+    options: Optional[Dict[str, float]] = None
+    description: Optional[str] = None
     judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
     organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""

     @classmethod
+    @dont_throw
     def get(
         cls,
         name: str,
@@ -85,10 +115,24 @@ class PromptScorer(APIScorerConfig):
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
+        if scorer_config["is_trace"] != issubclass(cls, TracePromptScorer):
+            raise JudgmentAPIError(
+                status_code=400,
+                detail=f"Scorer with name {name} is not a {cls.__name__}",
+                response=None,  # type: ignore
+            )
+        if issubclass(cls, TracePromptScorer):
+            score_type = APIScorerType.TRACE_PROMPT_SCORER
+        else:
+            score_type = APIScorerType.PROMPT_SCORER
         return cls(
+            score_type=score_type,
             name=name,
             prompt=scorer_config["prompt"],
-
+            threshold=scorer_config["threshold"],
+            options=scorer_config.get("options"),
+            model=scorer_config.get("model"),
+            description=scorer_config.get("description"),
             judgment_api_key=judgment_api_key,
             organization_id=organization_id,
         )
@@ -98,32 +142,51 @@ class PromptScorer(APIScorerConfig):
         cls,
         name: str,
         prompt: str,
-
+        threshold: float = 0.5,
+        options: Optional[Dict[str, float]] = None,
+        model: str = JUDGMENT_DEFAULT_GPT_MODEL,
+        description: Optional[str] = None,
         judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
         organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
     ):
         if not scorer_exists(name, judgment_api_key, organization_id):
-
+            if issubclass(cls, TracePromptScorer):
+                is_trace = True
+                score_type = APIScorerType.TRACE_PROMPT_SCORER
+            else:
+                is_trace = False
+                score_type = APIScorerType.PROMPT_SCORER
+            push_prompt_scorer(
+                name,
+                prompt,
+                threshold,
+                options,
+                model,
+                description,
+                judgment_api_key,
+                organization_id,
+                is_trace,
+            )
+            judgeval_logger.info(f"Successfully created PromptScorer: {name}")
             return cls(
+                score_type=score_type,
                 name=name,
                 prompt=prompt,
+                threshold=threshold,
                 options=options,
+                model=model,
+                description=description,
                 judgment_api_key=judgment_api_key,
                 organization_id=organization_id,
             )
         else:
             raise JudgmentAPIError(
-
+                status_code=400,
+                detail=f"Scorer with name {name} already exists. Either use the existing scorer with the get() method or use a new name.",
+                response=None,  # type: ignore
             )

     # Setter functions. Each setter function pushes the scorer to the DB.
-    def set_name(self, name: str):
-        """
-        Updates the name of the scorer.
-        """
-        self.name = name
-        self.push_prompt_scorer()
-
     def set_threshold(self, threshold: float):
         """
         Updates the threshold of the scorer.
@@ -140,16 +203,31 @@ class PromptScorer(APIScorerConfig):
         """
         self.prompt = prompt
         self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated prompt for {self.name}")

-    def
+    def set_model(self, model: str):
         """
-        Updates the
+        Updates the model of the scorer.
+        """
+        self.model = model
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated model for {self.name}")

-
-
+    def set_options(self, options: Optional[Dict[str, float]]):
+        """
+        Updates the options of the scorer.
         """
         self.options = options
         self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated options for {self.name}")
+
+    def set_description(self, description: Optional[str]):
+        """
+        Updates the description of the scorer.
+        """
+        self.description = description
+        self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully updated description for {self.name}")

     def append_to_prompt(self, prompt_addition: str):
         """
@@ -157,21 +235,40 @@ class PromptScorer(APIScorerConfig):
         """
         self.prompt += prompt_addition
         self.push_prompt_scorer()
+        judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

     # Getters
-    def
+    def get_threshold(self) -> float:
+        """
+        Returns the threshold of the scorer.
+        """
+        return self.threshold
+
+    def get_prompt(self) -> str:
         """
         Returns the prompt of the scorer.
         """
         return self.prompt

-    def
+    def get_model(self) -> str:
+        """
+        Returns the model of the scorer.
+        """
+        return self.model
+
+    def get_options(self) -> Dict[str, float] | None:
         """
         Returns the options of the scorer.
         """
-        return self.options
+        return copy(self.options) if self.options is not None else None
+
+    def get_description(self) -> str | None:
+        """
+        Returns the description of the scorer.
+        """
+        return self.description

-    def get_name(self) -> str
+    def get_name(self) -> str:
         """
         Returns the name of the scorer.
         """
@@ -183,8 +280,11 @@ class PromptScorer(APIScorerConfig):
         """
         return {
             "name": self.name,
+            "model": self.model,
             "prompt": self.prompt,
+            "threshold": self.threshold,
             "options": self.options,
+            "description": self.description,
         }

     def push_prompt_scorer(self):
@@ -194,13 +294,17 @@ class PromptScorer(APIScorerConfig):
         push_prompt_scorer(
             self.name,
             self.prompt,
+            self.threshold,
             self.options,
+            self.model,
+            self.description,
             self.judgment_api_key,
             self.organization_id,
+            isinstance(self, TracePromptScorer),
         )

     def __str__(self):
-        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+        return f"PromptScorer(name={self.name}, model={self.model}, prompt={self.prompt}, threshold={self.threshold}, options={self.options}, description={self.description})"

     def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
         base = super().model_dump(*args, **kwargs)
@@ -213,3 +317,11 @@ class PromptScorer(APIScorerConfig):
             k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
         }
         return base
+
+
+class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
+    pass
+
+
+class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
+    pass
judgeval/scorers/score.py
CHANGED
@@ -1,5 +1,5 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `
+Infrastructure for executing evaluations of `Example`s using one or more `ExampleScorer`s.
 """

 import asyncio
@@ -13,61 +13,67 @@ from judgeval.data import (
     generate_scoring_result,
     create_scorer_data,
 )
-from judgeval.scorers import
+from judgeval.scorers.example_scorer import ExampleScorer
 from judgeval.scorers.utils import clone_scorers
-from judgeval.
+from judgeval.logger import judgeval_logger
 from judgeval.judges import JudgevalJudge
+from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL


 async def safe_a_score_example(
-    scorer:
+    scorer: ExampleScorer,
     example: Example,
 ):
     """
     Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `
+    "Safely" scores an `Example` using a `ExampleScorer` by gracefully handling any exceptions that may occur.

     Args:
-        scorer (
+        scorer (ExampleScorer): The `ExampleScorer` to use for scoring the example.
         example (Example): The `Example` to be scored.
-
-        ignore_errors (bool): Whether to ignore errors during the evaluation.
-            If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `BaseScorer` and the `success` attribute will be set to False.
-
-        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
     """
     try:
-
+        score = await scorer.a_score_example(example)
+        if score is None:
+            raise Exception("a_score_example need to return a score")
+        elif score < 0:
+            judgeval_logger.warning("score cannot be less than 0 , setting to 0")
+            score = 0
+        elif score > 1:
+            judgeval_logger.warning("score cannot be greater than 1 , setting to 1")
+            score = 1
+        else:
+            scorer.score = score
         scorer.success = scorer.success_check()
     except Exception as e:
         judgeval_logger.error(f"Error during scoring: {str(e)}")
         scorer.error = str(e)
         scorer.success = False
+        scorer.score = 0
     return


 async def a_execute_scoring(
     examples: List[Example],
-    scorers: List[
-    model: Optional[Union[str, List[str], JudgevalJudge]] =
+    scorers: List[ExampleScorer],
+    model: Optional[Union[str, List[str], JudgevalJudge]] = JUDGMENT_DEFAULT_GPT_MODEL,
     ignore_errors: bool = False,
     throttle_value: int = 0,
     max_concurrent: int = 100,
+    show_progress: bool = True,
 ) -> List[ScoringResult]:
     """
-    Executes evaluations of `Example`s asynchronously using one or more `
-    Each `Example` will be evaluated by all of the `
+    Executes evaluations of `Example`s asynchronously using one or more `ExampleScorer`s.
+    Each `Example` will be evaluated by all of the `ExampleScorer`s in the `scorers` list.

     Args:
         examples (List[Example]): A list of `Example` objects to be evaluated.
-        scorers (List[
+        scorers (List[ExampleScorer]): A list of `ExampleScorer` objects to evaluate the examples.
         model (Union[str, List[str], JudgevalJudge]): The model to use for evaluation.
         ignore_errors (bool): Whether to ignore errors during evaluation.
         throttle_value (int): The amount of time to wait between starting each task.
         max_concurrent (int): The maximum number of concurrent tasks.
-
-        _use_bar_indicator (bool): Whether to use a progress bar indicator.
+        show_progress (bool): Whether to show the progress bar indicator.

     Returns:
         List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
@@ -82,33 +88,50 @@ async def a_execute_scoring(
             except Exception as e:
                 judgeval_logger.error(f"Error executing function: {e}")
                 if kwargs.get("ignore_errors", False):
-                    # Simply return None when ignoring errors, as expected by the test
                     return None
-                # If we're not ignoring errors, propagate the exception
                 raise

-    # Add model to scorers
     for scorer in scorers:
-        if not scorer.model:
+        if not scorer.model and isinstance(model, str):
             scorer._add_model(model)

-    scoring_results: List[ScoringResult] = [None for _ in examples]
+    scoring_results: List[Optional[ScoringResult]] = [None for _ in examples]
     tasks = []
-
-
-
-
-
-
-
-
+
+    if show_progress:
+        with tqdm_asyncio(
+            desc=f"Evaluating {len(examples)} example(s) in parallel",
+            unit="Example",
+            total=len(examples),
+            bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
+        ) as pbar:
+            for i, ex in enumerate(examples):
+                if isinstance(ex, Example):
+                    if len(scorers) == 0:
+                        pbar.update(1)
+                        continue
+
+                    cloned_scorers = clone_scorers(scorers)  # type: ignore
+                    task = execute_with_semaphore(
+                        func=a_eval_examples_helper,
+                        scorers=cloned_scorers,
+                        example=ex,
+                        scoring_results=scoring_results,
+                        score_index=i,
+                        ignore_errors=ignore_errors,
+                        pbar=pbar,
+                    )
+                    tasks.append(asyncio.create_task(task))
+
+                await asyncio.sleep(throttle_value)
+            await asyncio.gather(*tasks)
+    else:
         for i, ex in enumerate(examples):
             if isinstance(ex, Example):
                 if len(scorers) == 0:
-                    pbar.update(1)
                     continue

-                cloned_scorers = clone_scorers(scorers)
+                cloned_scorers = clone_scorers(scorers)  # type: ignore
                 task = execute_with_semaphore(
                     func=a_eval_examples_helper,
                     scorers=cloned_scorers,
@@ -116,19 +139,19 @@ async def a_execute_scoring(
                     scoring_results=scoring_results,
                     score_index=i,
                     ignore_errors=ignore_errors,
-                    pbar=
+                    pbar=None,
                 )
                 tasks.append(asyncio.create_task(task))

             await asyncio.sleep(throttle_value)
         await asyncio.gather(*tasks)
-    return scoring_results
+    return [result for result in scoring_results if result is not None]


 async def a_eval_examples_helper(
-    scorers: List[
+    scorers: List[ExampleScorer],
     example: Example,
-    scoring_results: List[ScoringResult],
+    scoring_results: List[Optional[ScoringResult]],
     score_index: int,
     ignore_errors: bool,
     pbar: Optional[tqdm_asyncio] = None,
@@ -137,7 +160,7 @@ async def a_eval_examples_helper(
     Evaluate a single example asynchronously using a list of scorers.

     Args:
-        scorers (List[
+        scorers (List[ExampleScorer]): List of ExampleScorer objects to evaluate the example.
         example (Example): The example to be evaluated.
         scoring_results (List[ScoringResult]): List to store the scoring results.
         score_index (int): Index at which the result should be stored in scoring_results.
@@ -147,24 +170,18 @@ async def a_eval_examples_helper(
         None
     """

-    # scoring the Example
     scoring_start_time = time.perf_counter()

     tasks = [safe_a_score_example(scorer, example) for scorer in scorers]

     await asyncio.gather(*tasks)

-    # Now that all the scoring functions of each scorer have executed, we collect
-    # the results and update the ScoringResult with the scorer data
     success = True
     scorer_data_list = []
     for scorer in scorers:
-        # At this point, the scorer has been executed and already contains data.
         if getattr(scorer, "skipped", False):
             continue
-        scorer_data = create_scorer_data(
-            scorer
-        )  # Fetch scorer data from completed scorer evaluation
+        scorer_data = create_scorer_data(scorer)
         for s in scorer_data:
             success = success and s.success
         scorer_data_list.extend(scorer_data)