judgeval 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/tracer.py +183 -41
- judgeval/constants.py +14 -3
- judgeval/evaluation_run.py +2 -1
- judgeval/judges/utils.py +14 -2
- judgeval/judgment_client.py +46 -1
- judgeval/scorers/judgeval_scorer.py +8 -8
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +3 -1
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +6 -3
- judgeval/scorers/prompt_scorer.py +2 -2
- judgeval/scorers/score.py +11 -11
- judgeval/scorers/utils.py +3 -3
- judgeval/tracer/__init__.py +3 -0
- {judgeval-0.0.10.dist-info → judgeval-0.0.11.dist-info}/METADATA +5 -4
- {judgeval-0.0.10.dist-info → judgeval-0.0.11.dist-info}/RECORD +16 -15
- {judgeval-0.0.10.dist-info → judgeval-0.0.11.dist-info}/WHEEL +0 -0
- {judgeval-0.0.10.dist-info → judgeval-0.0.11.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/tracer.py
CHANGED
@@ -8,8 +8,20 @@ import functools
 import requests
 import uuid
 from contextlib import contextmanager
-from typing import …
-…
+from typing import (
+    Optional,
+    Any,
+    List,
+    Literal,
+    Tuple,
+    Generator,
+    TypeAlias,
+    Union
+)
+from dataclasses import (
+    dataclass,
+    field
+)
 from datetime import datetime
 from openai import OpenAI
 from together import Together
@@ -21,18 +33,26 @@ import json
 import warnings
 from pydantic import BaseModel
 from http import HTTPStatus
-from rich import print as rprint
 
-…
+import pika
+import os
+
+from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE, JUDGMENT_TRACES_DELETE_API_URL
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
-from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+
+from rich import print as rprint
+
 from judgeval.data.result import ScoringResult
+from judgeval.evaluation_run import EvaluationRun
 
 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
 TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
 SpanType = Literal['span', 'tool', 'llm', 'evaluation']
+
+
 @dataclass
 class TraceEntry:
     """Represents a single trace entry with its visual representation.
@@ -54,7 +74,7 @@ class TraceEntry:
     # Use field() for mutable defaults to avoid shared state issues
     inputs: dict = field(default_factory=dict)
     span_type: SpanType = "span"
-…
+    evaluation_runs: List[Optional[EvaluationRun]] = field(default=None)
 
     def print_entry(self):
         indent = "  " * self.depth
@@ -67,7 +87,8 @@ class TraceEntry:
        elif self.type == "input":
            print(f"{indent}Input: {self.inputs}")
        elif self.type == "evaluation":
-…
+            for evaluation_run in self.evaluation_runs:
+                print(f"{indent}Evaluation: {evaluation_run.model_dump()}")
 
     def _serialize_inputs(self) -> dict:
         """Helper method to serialize input data safely.
@@ -114,7 +135,7 @@ class TraceEntry:
            "duration": self.duration,
            "output": self._serialize_output(),
            "inputs": self._serialize_inputs(),
-            "…
+            "evaluation_runs": [evaluation_run.model_dump() for evaluation_run in self.evaluation_runs] if self.evaluation_runs else [],
            "span_type": self.span_type
        }
 
@@ -155,6 +176,106 @@ class TraceEntry:
            return self.output
        except (TypeError, OverflowError, ValueError):
            return safe_stringify(self.output, self.function)
+
+
+class TraceManagerClient:
+    """
+    Client for handling trace endpoints with the Judgment API
+
+
+    Operations include:
+    - Fetching a trace by id
+    - Saving a trace
+    - Deleting a trace
+    """
+    def __init__(self, judgment_api_key: str):
+        self.judgment_api_key = judgment_api_key
+
+    def fetch_trace(self, trace_id: str):
+        """
+        Fetch a trace by its id
+        """
+        response = requests.post(
+            JUDGMENT_TRACES_FETCH_API_URL,
+            json={
+                "trace_id": trace_id,
+                "judgment_api_key": self.judgment_api_key,
+            },
+            headers={
+                "Content-Type": "application/json",
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to fetch traces: {response.text}")
+
+        return response.json()
+
+    def save_trace(self, trace_data: dict, empty_save: bool):
+        """
+        Saves a trace to the database
+
+        Args:
+            trace_data: The trace data to save
+            empty_save: Whether to save an empty trace
+        NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
+        """
+        response = requests.post(
+            JUDGMENT_TRACES_SAVE_API_URL,
+            json=trace_data,
+            headers={
+                "Content-Type": "application/json",
+            }
+        )
+
+        if response.status_code == HTTPStatus.BAD_REQUEST:
+            raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
+        elif response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to save trace data: {response.text}")
+
+        if not empty_save and "ui_results_url" in response.json():
+            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
+    def delete_trace(self, trace_id: str):
+        """
+        Delete a trace from the database.
+        """
+        response = requests.delete(
+            JUDGMENT_TRACES_DELETE_API_URL,
+            json={
+                "judgment_api_key": self.judgment_api_key,
+                "trace_ids": [trace_id],
+            },
+            headers={
+                "Content-Type": "application/json",
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to delete trace: {response.text}")
+
+        return response.json()
+
+    def delete_traces(self, trace_ids: List[str]):
+        """
+        Delete a batch of traces from the database.
+        """
+        response = requests.delete(
+            JUDGMENT_TRACES_DELETE_API_URL,
+            json={
+                "judgment_api_key": self.judgment_api_key,
+                "trace_ids": trace_ids,
+            },
+            headers={
+                "Content-Type": "application/json",
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to delete trace: {response.text}")
+
+        return response.json()
+
 
 class TraceClient:
     """Client for managing a single trace context"""
@@ -169,6 +290,7 @@ class TraceClient:
        self.span_type = None
        self._current_span: Optional[TraceEntry] = None
        self.overwrite = overwrite
+        self.trace_manager_client = TraceManagerClient(tracer.api_key)  # Manages DB operations for trace data
 
    @contextmanager
    def span(self, name: str, span_type: SpanType = "span"):
@@ -185,6 +307,7 @@ class TraceClient:
            span_type=span_type
        ))
 
+        # Increment nested depth and set current span
        self.tracer.depth += 1
        prev_span = self._current_span
        self._current_span = name
@@ -207,7 +330,7 @@ class TraceClient:
        ))
        self._current_span = prev_span
 
-…
+    def async_evaluate(
        self,
        scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
        input: Optional[str] = None,
@@ -233,25 +356,40 @@ class TraceClient:
            additional_metadata=additional_metadata,
            trace_id=self.trace_id
        )
-…
-…
-…scorers…
-…
-…
+
+        try:
+            # Load appropriate implementations for all scorers
+            loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                scorer.load_implementation(use_judgment=True) if isinstance(scorer, ScorerWrapper) else scorer
+                for scorer in scorers
+            ]
+        except Exception as e:
+            raise ValueError(f"Failed to load scorers: {str(e)}")
+
+        eval_run = EvaluationRun(
            log_results=log_results,
            project_name=self.project_name,
-…
-…f"{self.name.capitalize()}-"…
+            eval_name=f"{self.name.capitalize()}-"
                      f"{self._current_span}-"
-                      f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]"
-…
+                      f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]",
+            examples=[example],
+            scorers=loaded_scorers,
+            model=model,
+            metadata={},
+            judgment_api_key=self.tracer.api_key,
            override=self.overwrite
        )
 
-        self.…
+        self.add_eval_run(eval_run, start_time)  # Pass start_time to record_evaluation
 
-    def …
-    """…
+    def add_eval_run(self, eval_run: EvaluationRun, start_time: float):
+        """
+        Add evaluation run data to the trace
+
+        Args:
+            eval_run (EvaluationRun): The evaluation run to add to the trace
+            start_time (float): The start time of the evaluation run
+        """
        if self._current_span:
            duration = time.time() - start_time  # Calculate duration from start_time
 
@@ -261,7 +399,7 @@ class TraceClient:
                depth=self.tracer.depth,
                message=f"Evaluation results for {self._current_span}",
                timestamp=time.time(),
-…
+                evaluation_runs=[eval_run],
                duration=duration,
                span_type="evaluation"
            ))
@@ -342,7 +480,7 @@ class TraceClient:
                    "timestamp": entry["timestamp"],
                    "inputs": None,
                    "output": None,
-                    "…
+                    "evaluation_runs": [],
                    "span_type": entry.get("span_type", "span")
                }
                active_functions.append(function)
@@ -365,8 +503,8 @@ class TraceClient:
            if entry["type"] == "output" and entry["output"]:
                current_entry["output"] = entry["output"]
 
-            if entry["type"] == "evaluation" and entry["…
-                current_entry["…
+            if entry["type"] == "evaluation" and entry["evaluation_runs"]:
+                current_entry["evaluation_runs"] = entry["evaluation_runs"]
 
        # Sort by timestamp
        condensed.sort(key=lambda x: x["timestamp"])
@@ -418,26 +556,30 @@ class TraceClient:
            "empty_save": empty_save,
            "overwrite": overwrite
        }
-…
-        # Save trace data by making POST request to API
-        response = requests.post(
-            JUDGMENT_TRACES_SAVE_API_URL,
-            json=trace_data,
-            headers={
-                "Content-Type": "application/json",
-            }
-        )
-…
-        if response.status_code == HTTPStatus.BAD_REQUEST:
-            raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
-        elif response.status_code != HTTPStatus.OK:
-            raise ValueError(f"Failed to save trace data: {response.text}")
 
-        if not empty_save…
-…
+        if not empty_save:
+            connection = pika.BlockingConnection(
+                pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
+            channel = connection.channel()
+
+            channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
+
+            channel.basic_publish(
+                exchange='',
+                routing_key=RABBITMQ_QUEUE,
+                body=json.dumps(trace_data),
+                properties=pika.BasicProperties(
+                    delivery_mode=pika.DeliveryMode.Transient  # Changed from Persistent to Transient
+                ))
+            connection.close()
 
+        self.trace_manager_client.save_trace(trace_data, empty_save)
+
        return self.trace_id, trace_data
 
+    def delete(self):
+        return self.trace_manager_client.delete_trace(self.trace_id)
+
 class Tracer:
    _instance = None
 
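The tracer now delegates all HTTP trace operations to the new `TraceManagerClient`, and `TraceClient.save()` publishes non-empty traces to RabbitMQ before handing the HTTP save off to it. A minimal sketch of driving the new client directly, based on the methods added above (the environment-variable name and trace ids are placeholders):

```python
import os

from judgeval.common.tracer import TraceManagerClient

# Placeholder credentials; in practice the Tracer passes its own API key through.
client = TraceManagerClient(judgment_api_key=os.environ["JUDGMENT_API_KEY"])

trace = client.fetch_trace(trace_id="<trace-id>")          # POST to /traces/fetch/
print(trace)

client.delete_traces(["<trace-id>", "<other-trace-id>"])   # DELETE to /traces/delete/
```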
judgeval/constants.py
CHANGED
@@ -32,16 +32,25 @@ class APIScorer(str, Enum):
        return member
 
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
-…
+# API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
 JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
+JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
+JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
+JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
 
-…
+# RabbitMQ
+RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
+RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
+RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
+
+# Models
 TOGETHER_SUPPORTED_MODELS = {
    "QWEN": "Qwen/Qwen2-72B-Instruct",
    "LLAMA3_70B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
@@ -51,7 +60,9 @@ TOGETHER_SUPPORTED_MODELS = {
    "MISTRAL_8x7B_INSTRUCT": "mistralai/Mixtral-8x7B-Instruct-v0.1",
 }
 
-…
+JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+
+ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS.keys()) | JUDGMENT_SUPPORTED_MODELS
 
 ## System settings
 MAX_WORKER_THREADS = 10
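The new settings are resolved with `os.getenv` when the module is imported, so they can be redirected per environment without code changes. One detail worth noting: the `RABBITMQ_PORT` fallback is the integer `5672`, but any value supplied through the environment arrives as a string. A small illustration (the localhost override is just an example):

```python
import os

# Overrides must be in place before judgeval.constants is first imported,
# since the module reads the environment at import time.
os.environ["RABBITMQ_HOST"] = "localhost"  # example override
os.environ["RABBITMQ_PORT"] = "5672"       # environment values are always strings

from judgeval import constants

print(constants.RABBITMQ_HOST)         # localhost
print(type(constants.RABBITMQ_PORT))   # <class 'str'> when set via the environment
```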
judgeval/evaluation_run.py
CHANGED
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
        project_name (str): The name of the project the evaluation results belong to
        eval_name (str): A name for this evaluation run
        examples (List[Example]): The examples to evaluate
-        scorers (List[Union[JudgmentScorer, …
+        scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
        model (str): The model used as a judge when using LLM as a Judge
        aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
@@ -33,6 +33,7 @@ class EvaluationRun(BaseModel):
    metadata: Optional[Dict[str, Any]] = None
    # API Key will be "" until user calls client.run_eval(), then API Key will be set
    judgment_api_key: Optional[str] = ""
+    override: Optional[bool] = False
 
    def model_dump(self, **kwargs):
        data = super().model_dump(**kwargs)
judgeval/judges/utils.py
CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, Union, Tuple, List
 
 from judgeval.common.exceptions import InvalidJudgeModelError
 from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
-from judgeval.constants import TOGETHER_SUPPORTED_MODELS
+from judgeval.constants import TOGETHER_SUPPORTED_MODELS, JUDGMENT_SUPPORTED_MODELS, ACCEPTABLE_MODELS
 
 LITELLM_SUPPORTED_MODELS = set(litellm.model_list)
 
@@ -33,7 +33,13 @@ def create_judge(
    # Either string or List[str]
    if isinstance(model, list):
        for m in model:
-            if m…
+            if m in JUDGMENT_SUPPORTED_MODELS:
+                raise NotImplementedError(
+                    """Judgment models are not yet supported for local scoring.
+                    Please either set the `use_judgment` flag to True or use
+                    non-Judgment models."""
+                )
+            if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
                raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
        return MixtureOfJudges(models=model), True
    # If model is a string, check that it corresponds to a valid model
@@ -41,5 +47,11 @@ def create_judge(
        return LiteLLMJudge(model=model), True
    if model in TOGETHER_SUPPORTED_MODELS:
        return TogetherJudge(model=model), True
+    if model in JUDGMENT_SUPPORTED_MODELS:
+        raise NotImplementedError(
+            """Judgment models are not yet supported for local scoring.
+            Please either set the `use_judgment` flag to True or use
+            non-Judgment models."""
+        )
    else:
        raise InvalidJudgeModelError(f"Invalid judge model chosen: {model}")
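`create_judge` still returns a `(judge, using_native_model)` tuple for LiteLLM and Together models, but it now short-circuits with `NotImplementedError` for the Judgment-hosted `osiris-*` models. A hedged sketch of that behaviour; it assumes `model` is the first positional argument and that the chosen LiteLLM model name is present in `litellm.model_list`:

```python
from judgeval.judges.utils import create_judge
from judgeval.common.exceptions import InvalidJudgeModelError

# A LiteLLM-supported model string yields a LiteLLMJudge.
judge, using_native_model = create_judge("gpt-4o")

# Judgment-hosted models are rejected for local scoring as of 0.0.11.
try:
    create_judge("osiris-large")
except NotImplementedError as err:
    print(err)  # points the caller at the `use_judgment` flag instead

# Unknown names still raise InvalidJudgeModelError.
try:
    create_judge("not-a-real-model")
except InvalidJudgeModelError as err:
    print(err)
```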
judgeval/judgment_client.py
CHANGED
@@ -23,7 +23,7 @@ from judgeval.run_evaluation import (
    assert_test
 )
 from judgeval.judges import JudgevalJudge
-from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
+from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
 
@@ -194,6 +194,51 @@ class JudgmentClient:
        eval_run_result[0]["id"] = result_id
        eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
        return eval_run_result
+
+    def delete_eval(self, project_name: str, eval_run_name: str) -> bool:
+        """
+        Deletes an evaluation from the server by project and run name.
+
+        Args:
+            project_name (str): Name of the project
+            eval_run_name (str): Name of the evaluation run
+
+        Returns:
+            bool: Whether the evaluation was successfully deleted
+        """
+        eval_run_request_body = EvalRunRequestBody(project_name=project_name,
+                                                   eval_name=eval_run_name,
+                                                   judgment_api_key=self.judgment_api_key)
+        response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
+                                   json=eval_run_request_body.model_dump(),
+                                   headers={
+                                       "Content-Type": "application/json",
+                                   })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting eval results: {response.json()}")
+        return response.json()
+
+    def delete_project_evals(self, project_name: str) -> bool:
+        """
+        Deletes all evaluations from the server for a given project.
+
+        Args:
+            project_name (str): Name of the project
+
+        Returns:
+            bool: Whether the evaluations were successfully deleted
+        """
+        response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+                                   json={
+                                       "project_name": project_name,
+                                       "judgment_api_key": self.judgment_api_key
+                                   },
+                                   headers={
+                                       "Content-Type": "application/json",
+                                   })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting eval results: {response.json()}")
+        return response.json()
 
    def _validate_api_key(self):
        """
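Both new helpers send `DELETE` requests to the eval-deletion endpoints added in `constants.py` and raise `ValueError` on any non-OK response. A minimal usage sketch; the constructor keyword and the project/run names are assumptions for illustration:

```python
import os

from judgeval.judgment_client import JudgmentClient

# Assumed constructor keyword; the client just needs a valid Judgment API key.
client = JudgmentClient(judgment_api_key=os.environ["JUDGMENT_API_KEY"])

# Delete one evaluation run by project + run name...
client.delete_eval(project_name="my-project", eval_run_name="run-2025-01-01")

# ...or every evaluation stored for the project.
client.delete_project_evals(project_name="my-project")
```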
judgeval/scorers/judgeval_scorer.py
CHANGED
@@ -1,5 +1,5 @@
 """
-…
+Judgeval Scorer class
 
 Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
 To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
@@ -57,12 +57,12 @@ class JudgevalScorer:
        verbose_logs: Optional[str] = None,
        additional_metadata: Optional[Dict] = None
    ):
-        debug(f"Initializing …
+        debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
        if not 0 <= threshold <= 1:
            raise ValueError("Threshold must be between 0 and 1")
        if strict_mode:
            warning("Strict mode enabled - scoring will be more rigorous")
-        info(f"…
+        info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
        self.score_type = score_type
        self.threshold = threshold
        self.score = score
@@ -81,7 +81,7 @@ class JudgevalScorer:
 
    def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
        """
-        Adds the evaluation model to the …
+        Adds the evaluation model to the JudgevalScorer instance
 
        This method is used at eval time
        """
@@ -116,10 +116,10 @@ class JudgevalScorer:
        raise NotImplementedError("You must implement the `passes` method in your custom scorer")
 
    def __str__(self):
-        debug("Converting …
+        debug("Converting JudgevalScorer instance to string representation")
        if self.error:
-            warning(f"…
-        info(f"…
+            warning(f"JudgevalScorer contains error: {self.error}")
+        info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
        attributes = {
            "score_type": self.score_type,
            "threshold": self.threshold,
@@ -137,4 +137,4 @@ class JudgevalScorer:
            "verbose_logs": self.verbose_logs,
            "additional_metadata": self.additional_metadata,
        }
-        return f"…
+        return f"JudgevalScorer({attributes})"
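The module docstring above spells out the custom-scorer contract: subclass `JudgevalScorer` and implement `score_example`, `a_score_example`, and `success_check`. A bare-bones sketch under that contract; the word-count rule is invented purely for illustration, and constructor arguments beyond `score_type` and `threshold` are left at their defaults:

```python
from judgeval.scorers import JudgevalScorer


class WordBudgetScorer(JudgevalScorer):
    """Toy scorer: passes when the output stays under a word budget."""

    def __init__(self, threshold: float = 0.5, max_words: int = 100):
        super().__init__(score_type="Word Budget", threshold=threshold)
        self.max_words = max_words

    def score_example(self, example, *args, **kwargs):
        # Invented rule: 1.0 when within budget, 0.0 otherwise.
        self.score = 1.0 if len(example.actual_output.split()) <= self.max_words else 0.0
        self.success = self.score >= self.threshold
        return self.score

    async def a_score_example(self, example, *args, **kwargs):
        return self.score_example(example, *args, **kwargs)

    def success_check(self) -> bool:
        return bool(self.success)
```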
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py
CHANGED
@@ -2,7 +2,7 @@
 Code for the local implementation of the Faithfulness metric.
 """
 from typing import List, Optional, Union
-…
+from pprint import pprint
 from judgeval.constants import APIScorer
 from judgeval.data import (
    Example,
@@ -114,11 +114,13 @@ class FaithfulnessScorer(JudgevalScorer):
        ):
            self.claims = await self._a_generate_claims(example.actual_output)
 
+
            if self.additional_metadata is None:
                self.additional_metadata = {}
            self.additional_metadata["claims"] = self.claims
 
            self.verdicts = await self._a_generate_verdicts(example.retrieval_context)
+
            self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts]  # Add verdicts generated to metadata
 
            self.score = self._calculate_score()
judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py
CHANGED
@@ -129,10 +129,13 @@ JSON:
 def create_verdicts(claims, retrieval_context):
    return f"""==== TASK INSTRUCTIONS ====
 You will be provided with a list of claims from an LLM's output text, accompanied by the retrieval documents that the LLM used to generate the output.
-…
-…
+I'm pretty sure that many of the claims are factually contradictory to the retrieval context, but I want you to double check that I'm right.
+For each claim, choose one of ("yes", "no", or "idk") to represent whether the claim is correct based on the retrieval context.
+YOU SHOULD be very scrutinous--if any part of the claim is contradicted by the retrieval context, you should choose "no". Think really hard about finding the contradictions, since they can be subtle!
+
+Choose 'no' if the retrieval context CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
 Claims made using vague, suggestive, or speculative language such as 'may have', 'possibility due to', do NOT count as a contradiction.
-Claims that are …
+Claims that are fuzzy based on lack of information MUST BE ANSWERED with 'idk'.
 
 ==== FORMATTING YOUR ANSWER ====
 Please return your answer in JSON format, with the 'verdicts' key as a list of JSON objects. Each JSON object should have 2 fields: 'verdict' and 'reason'.
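Per the formatting section of this prompt, the judge must answer with a JSON object whose `verdicts` list holds one `verdict`/`reason` pair per claim, with `verdict` restricted to "yes", "no", or "idk". A sketch of a conforming response and how it could be consumed (the claim contents are invented):

```python
import json

# Invented example of a reply that satisfies the prompt's JSON contract.
llm_response = """
{
    "verdicts": [
        {"verdict": "no", "reason": "The claim dates the report to 2020, but the retrieval context says 2021."},
        {"verdict": "idk", "reason": "The retrieval context never mentions the author's affiliation."}
    ]
}
"""

verdicts = json.loads(llm_response)["verdicts"]
print([v["verdict"] for v in verdicts])  # ['no', 'idk']
```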
judgeval/scorers/prompt_scorer.py
CHANGED
@@ -72,7 +72,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
            strict_mode=strict_mode,
            verbose_mode=verbose_mode,
        )
-        # Then initialize …
+        # Then initialize JudgevalScorer
        JudgevalScorer.__init__(
            self,
            score_type=name,
@@ -309,7 +309,7 @@ class ClassifierScorer(PromptScorer):
            strict_mode=strict_mode,
            verbose_mode=verbose_mode,
        )
-        # Then initialize …
+        # Then initialize JudgevalScorer
        JudgevalScorer.__init__(
            self,
            score_type=name,
judgeval/scorers/score.py
CHANGED
@@ -1,5 +1,5 @@
 """
-Infrastructure for executing evaluations of `Example`s using one or more `…
+Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
 """
 
 
@@ -30,15 +30,15 @@ async def safe_a_score_example(
 ):
    """
    Scoring task function when not using a progress indicator!
-    "Safely" scores an `Example` using a `…
+    "Safely" scores an `Example` using a `JudgevalScorer` by gracefully handling any exceptions that may occur.
 
    Args:
-        scorer (…
+        scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
        example (Example): The `Example` to be scored.
 
        ignore_errors (bool): Whether to ignore errors during the evaluation.
            If set to false, any error will be raised and stop the evaluation.
-            If set to true, the error will be stored in the `error` attribute of the `…
+            If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.
 
        skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
    """
@@ -102,12 +102,12 @@ async def score_task(
    skip_on_missing_params: bool = True,
 ):
    """
-    Task function for asynchronously measuring a given example using a …
+    Task function for asynchronously measuring a given example using a JudgevalScorer.
 
    Args:
        task_id (int): The ID of the task being measured.
        progress (Progress): An instance of the Progress class to track task progress.
-        scorer (…
+        scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
        example (Example): The example to be scored.
        ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
        skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
@@ -189,10 +189,10 @@ async def score_with_indicator(
    show_indicator: bool,
 ):
    """
-    Scores an example using a list of …
+    Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.
 
    Args:
-        scorers (List[…
+        scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
        example (Example): The example to be scored.
        ignore_errors (bool): If True, errors during scoring will be ignored.
        skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
@@ -253,8 +253,8 @@ async def a_execute_scoring(
    _use_bar_indicator: bool = True,
 ) -> List[ScoringResult]:
    """
-    Executes evaluations of `Example`s asynchronously using one or more `…
-    Each `Example` will be evaluated by all of the `…
+    Executes evaluations of `Example`s asynchronously using one or more `JudgevalScorer`s.
+    Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.
 
    Args:
        examples (List[Example]): A list of `Example` objects to be evaluated.
@@ -379,7 +379,7 @@ async def a_eval_examples_helper(
    Evaluate a single example asynchronously using a list of scorers.
 
    Args:
-        scorers (List[…
+        scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
        example (Example): The example to be evaluated.
        scoring_results (List[ScoringResult]): List to store the scoring results.
        score_index (int): Index at which the result should be stored in scoring_results.
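The docstrings above describe the error-handling contract for scoring: with `ignore_errors` enabled, a failing scorer records the exception on its `error` attribute and flips `success` to False rather than aborting the run; otherwise the exception propagates. A generic sketch of that pattern, not the library's actual implementation:

```python
async def safe_score(scorer, example, ignore_errors: bool = True) -> None:
    """Score `example` with `scorer`, swallowing failures when asked to."""
    try:
        await scorer.a_score_example(example)
    except Exception as exc:
        if not ignore_errors:
            raise                # stop the whole evaluation run
        scorer.error = str(exc)  # record the failure on the scorer...
        scorer.success = False   # ...and mark this example as unsuccessful
```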
judgeval/scorers/utils.py
CHANGED
@@ -32,7 +32,7 @@ def clone_scorers(scorers: List[JudgevalScorer]) -> List[JudgevalScorer]:
        valid_args = {key: args[key] for key in valid_params if key in args}
 
        cloned_scorer = scorer_class(**valid_args)
-        # kinda hacky, but in case the class inheriting from …
+        # kinda hacky, but in case the class inheriting from JudgevalScorer doesn't have `model` in its __init__,
        # we need to explicitly include it here so that we can add the judge model to the cloned scorer
        cloned_scorer._add_model(model=args.get("model"))
        cloned_scorers.append(cloned_scorer)
@@ -91,7 +91,7 @@ def parse_response_json(llm_response: str, scorer: Optional[JudgevalScorer] = No
 
    Args:
        llm_response (str): The response from an LLM.
-        scorer (…
+        scorer (JudgevalScorer, optional): The scorer object to forward errors to (if any).
    """
    start = llm_response.find("{")  # opening bracket
    end = llm_response.rfind("}") + 1  # closing bracket
@@ -129,7 +129,7 @@ def create_verbose_logs(metric: JudgevalScorer, steps: List[str]) -> str:
    Creates verbose logs for a scorer object.
 
    Args:
-        metric (…
+        metric (JudgevalScorer): The scorer object.
        steps (List[str]): The steps to be included in the verbose logs.
 
    Returns:
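As the two lines shown in the hunk above indicate, `parse_response_json` locates the JSON payload by slicing from the first `{` to the last `}` of the raw LLM reply. A standalone sketch of that extraction step (the surrounding chatter is invented):

```python
import json

llm_response = 'Sure, here is the JSON: {"verdict": "yes", "reason": "consistent with the context"} Hope that helps!'

start = llm_response.find("{")      # index of the first opening brace
end = llm_response.rfind("}") + 1   # one past the last closing brace
payload = json.loads(llm_response[start:end])

print(payload["verdict"])  # yes
```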
{judgeval-0.0.10.dist-info → judgeval-0.0.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.10
+Version: 0.0.11
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
-Requires-Dist: langfuse==2.50.3
 Requires-Dist: litellm
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
 Requires-Dist: pandas
-Requires-Dist: patronus
 Requires-Dist: pika
 Requires-Dist: python-dotenv==1.0.1
 Requires-Dist: requests
@@ -25,11 +23,14 @@ Requires-Dist: supabase
 Requires-Dist: together
 Requires-Dist: uvicorn
 Provides-Extra: dev
+Requires-Dist: langfuse==2.50.3; extra == 'dev'
+Requires-Dist: patronus; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
 Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
 Requires-Dist: pytest>=8.3.4; extra == 'dev'
+Requires-Dist: tavily-python; extra == 'dev'
 Description-Content-Type: text/markdown
 
 # judgeval
 
-Judgeval is …
+Judgeval is an open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.
{judgeval-0.0.10.dist-info → judgeval-0.0.11.dist-info}/RECORD
CHANGED
@@ -1,13 +1,13 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=…
-judgeval/evaluation_run.py,sha256=…
-judgeval/judgment_client.py,sha256=…
+judgeval/constants.py,sha256=oL3kWHg9CzQJiTInDTgJgxRhF3fgylhvEVP360UqG8A,2654
+judgeval/evaluation_run.py,sha256=ev-IbL34SwRv8lwB4KHfYag1jYo6b049R8mmwNBqmnM,5923
+judgeval/judgment_client.py,sha256=thmSXi2essIlmd_j5SjlBw9_8qJJp6N3djoWdLaMrj0,13770
 judgeval/run_evaluation.py,sha256=YOQ6s9RuUrXPTgoYexf7r6Hl1QKIMSTdvHl9kw-ZMzw,20103
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=…
+judgeval/common/tracer.py,sha256=wp-oGl8rdAe3_UXcvrEKFg7V6Vnvrnz9y_RVVgYOjCY,29934
 judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
@@ -24,15 +24,15 @@ judgeval/judges/base_judge.py,sha256=qhYSFxE21WajYNaT4X-qwWGtpo_tqzBzdqbszSheSD8
 judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
 judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
 judgeval/judges/together_judge.py,sha256=x3jf-tq77QPXHeeoF739f69hE_0VceXD9FHLrVFdGVA,2275
-judgeval/judges/utils.py,sha256=…
+judgeval/judges/utils.py,sha256=sYxSJq5cI9LtyJaxurcW9IwngALC9Ty8F_Mb8gz81nE,2732
 judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
 judgeval/scorers/api_scorer.py,sha256=88kCWr6IetLFn3ziTPG-lwDWvMhFUC6xfINU1MJBoho,2125
 judgeval/scorers/base_scorer.py,sha256=mbOReG88fWaqCnC8F0u5QepRlzgVkuOz89KEKYxrmMc,1794
 judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
-judgeval/scorers/judgeval_scorer.py,sha256=…
-judgeval/scorers/prompt_scorer.py,sha256=…
-judgeval/scorers/score.py,sha256=…
-judgeval/scorers/utils.py,sha256=…
+judgeval/scorers/judgeval_scorer.py,sha256=T9fkJwFVYMzW88TFr-RWg-Fqmp-cdrA8bLFymqMzOa8,6291
+judgeval/scorers/prompt_scorer.py,sha256=UHkOUts1aIQCoYFcr-sKyucmvv_8ONFE5LZO01aObd0,17825
+judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
+judgeval/scorers/utils.py,sha256=X7lBI0LRBnBR8KUU-Fvont2Wq31t5p6zOTWGebWIcAU,6832
 judgeval/scorers/judgeval_scorers/__init__.py,sha256=D12jJAKTcfmz8fDBkYeOmdzZMZsURuODIJ5p7Nk1lWE,5189
 judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=zFwH2TC5AFlpDRfVKc6GN4YTtnmeyALl-JRLoZD_Jco,1284
 judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=690G5askjE8dcbKPGvCF6JxAEM9QJUqb-3K-D6lI6oM,463
@@ -65,8 +65,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__i
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=BtVgE7z-9PHfFRcvn96aEG5mXVcWBweVyty934hZdiU,8915
 judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py,sha256=6EHBfxWvhur9z14l8zCw5Z4Hb2uRo9Yv7qIhTRT7-aM,4591
 judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py,sha256=NbkSqPwxgF4T8KsvuIWhVyRwdOlo7mNHMFuRStTFnvk,154
-judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=…
-judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=…
+judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=fSxIn1uRvwCf7u4cOK4XrcPdS7OPzAWL9xt1pxujosY,11368
+judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=vNLjF4NKZJSV4VNenHzoAUB2xVZz6tt_5AzryKmOVrI,11690
 judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py,sha256=fZk3UQxI9Nljf5qjCRLRkF0D-AERFHElI9cC83_cgV8,158
 judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py,sha256=orCrEe1IH4NE7m-AkKMX0EHbysTuAwIqfohcQaU7XxQ,9670
 judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py,sha256=BkEu7Q_jIVdcdZSq37tMjitZFzACd8-iBTDDXfGbZig,4346
@@ -77,7 +77,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
 judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=CBuE6oCxMzTdJoXFt_YPWBte88kedEQ9t3g52ZRztGY,21086
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
-judgeval…
-judgeval-0.0.…
-judgeval-0.0.…
-judgeval-0.0.…
+judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
+judgeval-0.0.11.dist-info/METADATA,sha256=WH8aPpUNCwE1Zr21qJ0H0WEVB_i_dilyLSbw9e5nXZo,1283
+judgeval-0.0.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.11.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.11.dist-info/RECORD,,
{judgeval-0.0.10.dist-info → judgeval-0.0.11.dist-info}/WHEEL
File without changes
{judgeval-0.0.10.dist-info → judgeval-0.0.11.dist-info}/licenses/LICENSE.md
File without changes