judgeval 0.0.51__py3-none-any.whl → 0.0.53__py3-none-any.whl
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- judgeval/common/logger.py +46 -199
- judgeval/common/s3_storage.py +2 -6
- judgeval/common/tracer.py +182 -262
- judgeval/common/utils.py +16 -36
- judgeval/constants.py +14 -20
- judgeval/data/__init__.py +0 -2
- judgeval/data/datasets/dataset.py +6 -10
- judgeval/data/datasets/eval_dataset_client.py +25 -27
- judgeval/data/example.py +5 -138
- judgeval/data/judgment_types.py +214 -0
- judgeval/data/result.py +7 -25
- judgeval/data/scorer_data.py +28 -40
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/tool.py +3 -54
- judgeval/data/trace.py +31 -50
- judgeval/data/trace_run.py +3 -3
- judgeval/evaluation_run.py +16 -23
- judgeval/integrations/langgraph.py +11 -12
- judgeval/judges/litellm_judge.py +3 -6
- judgeval/judges/mixture_of_judges.py +8 -25
- judgeval/judges/together_judge.py +3 -6
- judgeval/judgment_client.py +22 -24
- judgeval/rules.py +7 -19
- judgeval/run_evaluation.py +79 -242
- judgeval/scorers/__init__.py +4 -20
- judgeval/scorers/agent_scorer.py +21 -0
- judgeval/scorers/api_scorer.py +28 -38
- judgeval/scorers/base_scorer.py +98 -0
- judgeval/scorers/example_scorer.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -24
- judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +16 -68
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +4 -12
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -17
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +18 -14
- judgeval/scorers/score.py +45 -330
- judgeval/scorers/utils.py +6 -88
- judgeval/utils/file_utils.py +4 -6
- judgeval/version_check.py +3 -2
- {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/METADATA +3 -2
- judgeval-0.0.53.dist-info/RECORD +65 -0
- judgeval/data/custom_example.py +0 -19
- judgeval/scorers/judgeval_scorer.py +0 -177
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
- judgeval/scorers/prompt_scorer.py +0 -296
- judgeval-0.0.51.dist-info/RECORD +0 -69
- {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/WHEEL +0 -0
- {judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/prompt_scorer.py DELETED
@@ -1,296 +0,0 @@
-"""
-Code that implements a prompt-based scorer for evaluating examples.
-
-The PromptScorer class is a base class that can be used to create custom scoring metrics using LLM prompts.
-To implement a subclass of PromptScorer, you need to implement the following methods:
-- build_measure_prompt(): builds the conversation prompt that is sent to the LLM judge
-- build_schema(): defines the expected response schema from the LLM
-- process_response(): parses the response from the LLM judge
-- success_check(): determines whether the evaluation was successful
-
-The core idea of PromptScorer is to provide a flexible way to create custom scoring metrics
-by leveraging LLM judges to evaluate examples. The scorer constructs a prompt, sends it to
-the judge, and parses the structured response to determine a score.
-
-For example, the SentimentScorer subclass uses PromptScorer to detect negative sentiment in responses
-by prompting an LLM to rate the negativity on a 1-5 scale and provide a reason for the rating.
-
-The PromptScorer supports both synchronous and asynchronous evaluation modes, includes optional
-reason fields in responses, and can operate in strict mode with higher thresholds.
-
-NOTE: When implementing build_measure_prompt and build_schema:
-- The prompt should guide the LLM to generate a response matching your schema
-- The schema should include "score" and optionally "reason" fields
-- The score field type and range should match your scoring criteria
-- The reason field provides explanatory context for the score
-"""
-
-from abc import abstractmethod
-from typing import List, Optional, Tuple, Any
-from pydantic import BaseModel, Field
-
-from judgeval.data import Example
-from judgeval.data.example import ExampleParams
-from judgeval.scorers import JudgevalScorer
-from judgeval.scorers.utils import (
-    scorer_progress_meter,
-    parse_response_json,
-    get_or_create_event_loop,
-    create_verbose_logs,
-)
-from judgeval.judges import JudgevalJudge
-
-
-class ReasonScore(BaseModel):
-    reason: str
-    score: float
-
-
-class PromptScorer(JudgevalScorer, BaseModel):
-    name: str
-    score_type: str
-    threshold: float = Field(default=0.5)
-    using_native_model: bool = Field(default=True)
-    model: Optional[JudgevalJudge] = Field(default=None)
-    skipped: bool = Field(default=False)
-    # DO NOT SET THESE FIELDS MANUALLY, THEY ARE SET BY THE SCORE_EXAMPLE METHOD
-    _response: Optional[dict] = None
-    _result: Optional[float] = None
-
-    def __init__(
-        self,
-        name: str,
-        threshold: float = 0.5,
-        include_reason: bool = True,
-        async_mode: bool = True,
-        strict_mode: bool = False,
-        verbose_mode: bool = False,
-        required_params: Optional[List[ExampleParams]] = None,
-    ):
-        # Initialize BaseModel first
-        BaseModel.__init__(
-            self,
-            name=name,
-            score_type=name,
-            threshold=1 if strict_mode else threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-        )
-        # Then initialize JudgevalScorer
-        JudgevalScorer.__init__(
-            self,
-            score_type=name,
-            threshold=1 if strict_mode else threshold,
-            include_reason=include_reason,
-            async_mode=async_mode,
-            strict_mode=strict_mode,
-            verbose_mode=verbose_mode,
-            required_params=required_params,
-        )
-
-    def score_example(
-        self, example: Example, _show_indicator: bool = True
-    ) -> float | None:
-        """
-        Synchronous method for scoring an example using the prompt criteria.
-        """
-        with scorer_progress_meter(self, display_meter=_show_indicator):
-            if self.async_mode:
-                loop = get_or_create_event_loop()
-                loop.run_until_complete(
-                    self.a_score_example(example, _show_indicator=False)
-                )
-                return self._result
-            else:
-                result, reason = self.evaluate(example)
-                self.reason = reason
-                self._result = result
-                self.verbose_logs = create_verbose_logs(
-                    self,
-                    steps=[
-                        f"Results: {self._result}\nReason: {self.reason}",
-                    ],
-                )
-                return result
-
-    async def a_score_example(
-        self,
-        example: Example,
-        _show_indicator: bool = True,
-    ) -> float:
-        """
-        Async method for scoring an example using the prompt criteria.
-        """
-        with scorer_progress_meter(self, display_meter=_show_indicator):
-            result, reason = await self.a_evaluate(example)
-            self.reason = reason
-            self._result = result
-            self.verbose_logs = create_verbose_logs(
-                self,
-                steps=[
-                    f"Results: {self._result}\nReason: {self.reason}",
-                ],
-            )
-            return result
-
-    def evaluate(self, example: Example) -> Tuple[Any, str]:
-        """
-        Synchronous helper method for evaluating an example using the prompt criteria.
-
-        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
-        for evaluation. The result is then parsed as JSON and returned.
-
-        NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
-        """
-        prompt = self._build_measure_prompt(example)
-        if self.using_native_model and self.model:
-            res = self.model.generate(prompt)
-            response = parse_response_json(res, self)
-            result, reason = self._process_response(response)
-            return result, reason
-        else:
-            raise NotImplementedError(
-                "Non-native judge models are not supported in synchronous mode yet."
-            )
-
-    async def a_evaluate(self, example: Example) -> Tuple[Any, str]:
-        """
-        Asynchronous helper method for evaluating an example using the prompt criteria.
-
-        Builds a custom prompt using `build_measure_prompt` and sends it to the judge model
-        for evaluation. The result is then parsed as JSON and returned.
-
-        NOTE: It is assumed that the model response will be JSON and contain a "score" and "reason" field.
-        """
-        judge_prompt = self._build_measure_prompt(example)
-        schema = self._build_schema()
-        prompt = self._enforce_prompt_format(judge_prompt=judge_prompt, schema=schema)
-        if self.using_native_model and self.model:
-            res = await self.model.a_generate(prompt)
-            response = parse_response_json(res, self)
-            self._response = response
-
-            result, reason = self._process_response(response)
-            self.score = result
-            self.reason = reason
-            self._response = response
-            return result, reason
-        else:
-            raise NotImplementedError(
-                "Non-native judge models are not supported in async mode yet."
-            )
-
-    # TODO: can we make this take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
-    @abstractmethod
-    def _build_measure_prompt(self, example: Example) -> List[dict]:
-        # builds the prompt that is sent to the model inside of the `score_example()` method
-        # returns either a string prompt or a conversation prompt of the form [{"role": "system", "content": "..."}, ...]
-
-        """
-        This function creates the prompt that the judge model uses to evaluate examples.
-
-        The prompt is typically a set of instructions that the judge model uses to evaluate the example.
-
-        This function returns a conversation prompt of the form
-        [{"role": "system", "content": "..."}, {"role": "user", "content": "..."}]
-
-        A basic version of implementing this function could be as follows:
-        SYSTEM_ROLE = ...
-        return [
-            {"role": "system", "content": SYSTEM_ROLE},
-            {"role": "user", "content": f"Response: {example.actual_output}\n\nYour judgment: "}
-        ]
-        """
-        pass
-
-    # TODO: does this need to take *args and **kwargs? How does that work with a_evaluate() since we'd have to pass the same args
-    @abstractmethod
-    def _build_schema(self) -> dict:
-        """
-        This function returns a dictionary that represents the schema of the JSON response that the judge model should return.
-
-        The keys of the dictionary are the expected keys in the response, and the values are the types of the corresponding values.
-
-        Example: If you want to have the judge model return a score and a reason, you would write:
-        return {"score": int, "reason": str}
-        """
-        pass
-
-    def _enforce_prompt_format(self, judge_prompt: List[dict], schema: dict):
-        """
-        Formats the final prompt to the judge model.
-
-        This function takes a list of dictionaries (`judge_prompt`) and a schema dictionary (`schema`),
-        and appends a schema enforcement prompt to the content of the first dictionary in the list, which is assumed to be the system prompt.
-        The schema enforcement prompt instructs the judge model to provide its response in a specific JSON format.
-
-        Args:
-            judge_prompt (List[dict]): A list of dictionaries representing the judge prompt.
-                Each dictionary should contain a "content" key.
-            schema (dict): A dictionary representing the schema. The keys are the expected keys in the response,
-                and the values are the types of the corresponding values.
-
-        Returns:
-            List[dict]: The modified judge prompt with the schema enforcement prompt appended to the content
-                of the first dictionary.
-
-        Raises:
-            TypeError: If `judge_prompt` is not a list of dictionaries.
-
-        Example:
-            judge_prompt = [{"content": "Please evaluate the following:"}]
-            schema = {"score": int, "comments": str}
-            formatted_prompt = format_measure_prompt(judge_prompt, schema)
-            # formatted_prompt[0]["content"] will include the schema enforcement prompt
-        """
-        SCHEMA_ENFORCEMENT_PROMPT = (
-            "\n\nPlease provide your response in the following JSON format: {"
-        )
-        if isinstance(judge_prompt, list) and all(
-            isinstance(item, dict) for item in judge_prompt
-        ):
-            # create formatting string for schema enforcement
-            # schema is a map between key and type of the value
-            for key, key_type in schema.items():
-                SCHEMA_ENFORCEMENT_PROMPT += f'"{key}": <{key}> ({key_type.__name__}), '
-            SCHEMA_ENFORCEMENT_PROMPT = (
-                SCHEMA_ENFORCEMENT_PROMPT[:-2] + "}"
-            )  # remove trailing comma and space
-            judge_prompt[0]["content"] += SCHEMA_ENFORCEMENT_PROMPT
-            return judge_prompt
-        else:
-            raise TypeError(
-                f"Prompt must be a list of dictionaries. Got {type(judge_prompt)} instead."
-            )
-
-    @abstractmethod
-    def _process_response(self, response: dict):
-        """
-        Customizable method for processing the response from the judge model.
-
-        You can add any additional logic to parse the JSON response here and return the result and reason for decision.
-
-        If you don't need a reason for the decision, you can simply return (score, None).
-
-        Example:
-            score = response["score"]
-            reason = response["reason"]
-            return score, reason
-        """
-        pass
-
-    @abstractmethod
-    def _success_check(self, **kwargs) -> bool:
-        """
-        Determines whether or not the PromptScorer should consider the evaluation of a single example successful.
-        """
-        pass
-
-    @property
-    def __name__(self):
-        return self.name
-
-    class Config:
-        arbitrary_types_allowed = True
judgeval-0.0.51.dist-info/RECORD DELETED
@@ -1,69 +0,0 @@
-judgeval/__init__.py,sha256=HM1M8hmqRum6G554QKkXhB4DF4f5eh_xtYo0Kf-t3kw,332
-judgeval/clients.py,sha256=JnB8n90GyXiYaGmSEYaA67mdJSnr3SIrzArao7NGebw,980
-judgeval/constants.py,sha256=IwW428u2VxThczHiL6ZnRwrIzb6QwOE4kdKonktVFYA,6032
-judgeval/evaluation_run.py,sha256=9fYFWJ2ZXtnNcRqxLjzKkZHAba2xi_f1uzOXDJ37Pgw,3233
-judgeval/judgment_client.py,sha256=RGqjw6Q50DOaTPa5SfCzSSGjsm7zlkZ6N7LOvewCxVU,21510
-judgeval/rules.py,sha256=TKI1K_Wlo3GDoSCztGcDoTioVKpvfG6zVkONyou8v5c,20465
-judgeval/run_evaluation.py,sha256=JohxsU5EajwPgBhBGt_wTrNSGdVIbSJmMAR5ffCSg7c,51478
-judgeval/version_check.py,sha256=FlKE8AQGwu50d3kdWSiBZYVW9sicnFInCZjakKt37w4,1003
-judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
-judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
-judgeval/common/logger.py,sha256=_nNV4waaMB4NkjwAG0kYZ3cfBe19BY6b2vsCdKd2YR4,6112
-judgeval/common/s3_storage.py,sha256=ukylTrBZ2QuT8BGbOY7D738RvHFAzVaPwmuWQ4R5xkE,3986
-judgeval/common/tracer.py,sha256=7vvPY632z4ExsqIuNRjfpJfa6CpJKohz8kvBiSwbjFE,129624
-judgeval/common/utils.py,sha256=p8C_BM0nNcIiVHTBH4BqsR106RNUlZ9wM0SxWY4IozE,35543
-judgeval/data/__init__.py,sha256=Nuy_F6oll5c5qLOF2gGFWFYyXeOgXSh7R4Vm2kMiXDM,531
-judgeval/data/custom_example.py,sha256=o4baSEeyNhS-k9PiOJdN4NfBFBGJMvhnU5RBvVRFRd8,734
-judgeval/data/example.py,sha256=8wsqBJ98Nw7IaVNXZmUoz3UuQUWkBbnHI6t4_1pqmr8,7234
-judgeval/data/result.py,sha256=4TfBPukRpF2iaF14zEU1RP-wHxsPWrX8PaXYnhxN8MM,3132
-judgeval/data/scorer_data.py,sha256=FnePIXS-4oNqrM2Eo97-hL3g3ZKFIvEKLdkl0CnpHEI,3283
-judgeval/data/tool.py,sha256=QMYJO8kyhGum8iiXxZZ_9pGcxcqp7Fjp0R0sh6i_9rU,1915
-judgeval/data/trace.py,sha256=tn1ctv99UI_vG_1UmFlzvt2L20mviUSwbvVs8ow8X-o,5797
-judgeval/data/trace_run.py,sha256=NMUkf5bxMW_jWXxZ-JI8-gOKSASldS7oAMH4MH4oSYE,1841
-judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
-judgeval/data/datasets/dataset.py,sha256=VDHQpOUoWUfaPmCeolKP-hhSzQcCHq1muRg3EtLRpf0,12933
-judgeval/data/datasets/eval_dataset_client.py,sha256=93Pxb3aCgDwvi263N0CgugApIwKbHbPSfuz7j0IhHSY,12880
-judgeval/integrations/langgraph.py,sha256=3fKMOhAjuDdH_q3F9OlW2T_fx_vzBg2Sz4WP4WFvBuw,35909
-judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
-judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
-judgeval/judges/litellm_judge.py,sha256=pHKdNkhdBMlrok3ZMTWaomGX6DKFXYV9zHqvCL7_2jo,2653
-judgeval/judges/mixture_of_judges.py,sha256=jcE3H47bVMdqzYRuxa0LD8wudF1kxkRujEpbVV-rkcM,15913
-judgeval/judges/together_judge.py,sha256=DZKlsij2ikmDiYbLZKWm8oqDRNNuvCBiGM0JcycwqWM,2424
-judgeval/judges/utils.py,sha256=0CF9qtIUQUL3-W-qTGpmTjZbkUUBAM6TslDsrCHnTBU,2725
-judgeval/scorers/__init__.py,sha256=VKPveyGCv5Rc0YtuT7iAxSv-M5EuikqAVeaGNnYMuWE,1340
-judgeval/scorers/api_scorer.py,sha256=2LNqcwIMerb37WooGD-hw5WIVLcTXnxWxzwZ0h9CXq0,2795
-judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
-judgeval/scorers/judgeval_scorer.py,sha256=VoiAQdJzgoiVyFYS9gLEGtQwfQY6tUBoWBBDyGBfo-Q,7321
-judgeval/scorers/prompt_scorer.py,sha256=w0tW76J956smL4D8PsOHswjwYFb8W08E_0E9ad5_aQ8,12124
-judgeval/scorers/score.py,sha256=_mKQuoZHEqrF9PaydPtzWN3zjE6PeKYETw_1UryzJ3s,19399
-judgeval/scorers/utils.py,sha256=UKssYyqsJ_hckeqa1aGcXLLxiScRDzYilyuT1RqkVyo,6853
-judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=mmGIBCWN2WByjSUn9o5-xmHV2W-fDNyRofNsEpSuqyQ,2248
-judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=xY7vY4uIfncEiCksGu5SFT8dUjzkY9suNgyvipQ1avU,712
-judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=t2ClO5nL6rM_atKV9YFgOCrQEPI_KjNqs1tyF3WqQig,659
-judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py,sha256=USeIQ1nozvQhMIrRLpST3nqNOekOFW5XJG4NSP7w0RI,4430
-judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=H4K_NIMabYd_OPlMz3CNNMIM3vYk7PunTXygMnyp6sc,1240
-judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py,sha256=QldMhW7k16jPPiHQAeLH-2VilPTuNHVi6OMsWvWnycE,771
-judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py,sha256=GDxEljGD4E-8j6t9DpV2cve0gcKZiUYHn2bfyXChbu0,759
-judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py,sha256=4E6Sa1aaI1k9PvA8afzNwIdrBCxv4UOqMtmfnLlWeWs,826
-judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py,sha256=jiKi8EfwP_yuOwHhYStbIUQIn2LPwJEbkh8PQeOoDTs,475
-judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=guG37tQm8m4Gs1bwYS1eaNau-RJYwteb1hwYQ0YMIbk,1357
-judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=6iK6Da0FWoyDe_OH7UMnc4gpnByNqfIx6BW8nUbvlC0,693
-judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py,sha256=RrGgBMgwVPpxb9cHm-yXQBgoh6CHUm_GkFYGSp-KcUc,693
-judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=VbvEEawOZ1XA3SWS986cbR2m3Clyliv21nzHe9GrQxo,687
-judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=nk4_lpq2eIe6v8GtBm2g6O1CLCg5sP7-wspye6qNuXE,679
-judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=9gKX32g9dAvFdHXzQmR-CFabBPLIZHu8aCnICK3t7j8,1066
-judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=Wz5wtpqeXMdK8oRXRKnWqow4s1mmqGFQqHK42wo6cNQ,648
-judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=wzgprwQ3hcsc9itHG0DkcXyNnvVVd-s1UpNyZxw49Sw,590
-judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=462fR2m-67FR2TdHu6cCNZLRkIT_yTAOrMeb-1AuQe8,576
-judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
-judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
-judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=gloLzThkFsr8sHQargDAH8XaDrlF6OCuc_69hyNslFU,2589
-judgeval/tracer/__init__.py,sha256=wkuXtOGDCrwgPPXlh_sSJmvGuWaAMHyNzk1TzB5f9aI,148
-judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
-judgeval/utils/file_utils.py,sha256=M6a_BPRGMwEFBPdF_Tbcbbk4YldHcOhuoU9oRlmninE,1858
-judgeval/utils/requests.py,sha256=rbmZTaiyWI8t2YUkhk11SIe3dF7j2j25L1BuFp_1PII,770
-judgeval-0.0.51.dist-info/METADATA,sha256=m0Ucp1rrC75X0YRNrWzP93igteOJ6OZ-4WQ-R_mdFIE,54765
-judgeval-0.0.51.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.51.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.51.dist-info/RECORD,,
{judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/WHEEL: file without changes
{judgeval-0.0.51.dist-info → judgeval-0.0.53.dist-info}/licenses/LICENSE.md: file without changes