judgeval 0.0.10__py3-none-any.whl → 0.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -8,8 +8,20 @@ import functools
  import requests
  import uuid
  from contextlib import contextmanager
- from typing import Optional, Any, List, Literal, Tuple, Generator, TypeAlias, Union
- from dataclasses import dataclass, field
+ from typing import (
+     Optional,
+     Any,
+     List,
+     Literal,
+     Tuple,
+     Generator,
+     TypeAlias,
+     Union
+ )
+ from dataclasses import (
+     dataclass,
+     field
+ )
  from datetime import datetime
  from openai import OpenAI
  from together import Together
@@ -21,18 +33,26 @@ import json
  import warnings
  from pydantic import BaseModel
  from http import HTTPStatus
- from rich import print as rprint

- from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL
+ import pika
+ import os
+
+ from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE, JUDGMENT_TRACES_DELETE_API_URL
  from judgeval.judgment_client import JudgmentClient
  from judgeval.data import Example
- from judgeval.scorers import APIJudgmentScorer, JudgevalScorer
+ from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
+
+ from rich import print as rprint
+
  from judgeval.data.result import ScoringResult
+ from judgeval.evaluation_run import EvaluationRun

  # Define type aliases for better code readability and maintainability
  ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic] # Supported API clients
  TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation'] # Valid trace entry types
  SpanType = Literal['span', 'tool', 'llm', 'evaluation']
+
+
  @dataclass
  class TraceEntry:
      """Represents a single trace entry with its visual representation.
@@ -54,7 +74,7 @@ class TraceEntry:
      # Use field() for mutable defaults to avoid shared state issues
      inputs: dict = field(default_factory=dict)
      span_type: SpanType = "span"
-     evaluation_result: Optional[List[ScoringResult]] = field(default=None)
+     evaluation_runs: List[Optional[EvaluationRun]] = field(default=None)

      def print_entry(self):
          indent = " " * self.depth
@@ -67,7 +87,8 @@ class TraceEntry:
          elif self.type == "input":
              print(f"{indent}Input: {self.inputs}")
          elif self.type == "evaluation":
-             print(f"{indent}Evaluation: {self.evaluation_result} ({self.duration:.3f}s)")
+             for evaluation_run in self.evaluation_runs:
+                 print(f"{indent}Evaluation: {evaluation_run.model_dump()}")

      def _serialize_inputs(self) -> dict:
          """Helper method to serialize input data safely.
@@ -114,7 +135,7 @@ class TraceEntry:
              "duration": self.duration,
              "output": self._serialize_output(),
              "inputs": self._serialize_inputs(),
-             "evaluation_result": [result.to_dict() for result in self.evaluation_result] if self.evaluation_result else None,
+             "evaluation_runs": [evaluation_run.model_dump() for evaluation_run in self.evaluation_runs] if self.evaluation_runs else [],
              "span_type": self.span_type
          }

@@ -155,6 +176,106 @@ class TraceEntry:
              return self.output
          except (TypeError, OverflowError, ValueError):
              return safe_stringify(self.output, self.function)
+
+
+ class TraceManagerClient:
+     """
+     Client for handling trace endpoints with the Judgment API
+
+
+     Operations include:
+     - Fetching a trace by id
+     - Saving a trace
+     - Deleting a trace
+     """
+     def __init__(self, judgment_api_key: str):
+         self.judgment_api_key = judgment_api_key
+
+     def fetch_trace(self, trace_id: str):
+         """
+         Fetch a trace by its id
+         """
+         response = requests.post(
+             JUDGMENT_TRACES_FETCH_API_URL,
+             json={
+                 "trace_id": trace_id,
+                 "judgment_api_key": self.judgment_api_key,
+             },
+             headers={
+                 "Content-Type": "application/json",
+             }
+         )
+
+         if response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to fetch traces: {response.text}")
+
+         return response.json()
+
+     def save_trace(self, trace_data: dict, empty_save: bool):
+         """
+         Saves a trace to the database
+
+         Args:
+             trace_data: The trace data to save
+             empty_save: Whether to save an empty trace
+             NOTE we save empty traces in order to properly handle async operations; we need something in the DB to associate the async results with
+         """
+         response = requests.post(
+             JUDGMENT_TRACES_SAVE_API_URL,
+             json=trace_data,
+             headers={
+                 "Content-Type": "application/json",
+             }
+         )
+
+         if response.status_code == HTTPStatus.BAD_REQUEST:
+             raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
+         elif response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to save trace data: {response.text}")
+
+         if not empty_save and "ui_results_url" in response.json():
+             rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
+     def delete_trace(self, trace_id: str):
+         """
+         Delete a trace from the database.
+         """
+         response = requests.delete(
+             JUDGMENT_TRACES_DELETE_API_URL,
+             json={
+                 "judgment_api_key": self.judgment_api_key,
+                 "trace_ids": [trace_id],
+             },
+             headers={
+                 "Content-Type": "application/json",
+             }
+         )
+
+         if response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to delete trace: {response.text}")
+
+         return response.json()
+
+     def delete_traces(self, trace_ids: List[str]):
+         """
+         Delete a batch of traces from the database.
+         """
+         response = requests.delete(
+             JUDGMENT_TRACES_DELETE_API_URL,
+             json={
+                 "judgment_api_key": self.judgment_api_key,
+                 "trace_ids": trace_ids,
+             },
+             headers={
+                 "Content-Type": "application/json",
+             }
+         )
+
+         if response.status_code != HTTPStatus.OK:
+             raise ValueError(f"Failed to delete trace: {response.text}")
+
+         return response.json()
+

  class TraceClient:
      """Client for managing a single trace context"""
@@ -169,6 +290,7 @@ class TraceClient:
          self.span_type = None
          self._current_span: Optional[TraceEntry] = None
          self.overwrite = overwrite
+         self.trace_manager_client = TraceManagerClient(tracer.api_key) # Manages DB operations for trace data

      @contextmanager
      def span(self, name: str, span_type: SpanType = "span"):
@@ -185,6 +307,7 @@ class TraceClient:
              span_type=span_type
          ))

+         # Increment nested depth and set current span
          self.tracer.depth += 1
          prev_span = self._current_span
          self._current_span = name
@@ -207,7 +330,7 @@
          ))
          self._current_span = prev_span

-     async def async_evaluate(
+     def async_evaluate(
          self,
          scorers: List[Union[APIJudgmentScorer, JudgevalScorer]],
          input: Optional[str] = None,
@@ -233,25 +356,40 @@
              additional_metadata=additional_metadata,
              trace_id=self.trace_id
          )
-         scoring_results = self.client.run_evaluation(
-             examples=[example],
-             scorers=scorers,
-             model=model,
-             metadata={},
+
+         try:
+             # Load appropriate implementations for all scorers
+             loaded_scorers: List[Union[JudgevalScorer, APIJudgmentScorer]] = [
+                 scorer.load_implementation(use_judgment=True) if isinstance(scorer, ScorerWrapper) else scorer
+                 for scorer in scorers
+             ]
+         except Exception as e:
+             raise ValueError(f"Failed to load scorers: {str(e)}")
+
+         eval_run = EvaluationRun(
              log_results=log_results,
             project_name=self.project_name,
-             eval_run_name=(
-                 f"{self.name.capitalize()}-"
+             eval_name=f"{self.name.capitalize()}-"
                  f"{self._current_span}-"
-                 f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]"
-             ),
+                 f"[{','.join(scorer.load_implementation().score_type.capitalize() for scorer in scorers)}]",
+             examples=[example],
+             scorers=loaded_scorers,
+             model=model,
+             metadata={},
+             judgment_api_key=self.tracer.api_key,
             override=self.overwrite
         )

-         self.record_evaluation(scoring_results, start_time) # Pass start_time to record_evaluation
+         self.add_eval_run(eval_run, start_time) # Pass start_time to record_evaluation

-     def record_evaluation(self, results: List[ScoringResult], start_time: float):
-         """Record evaluation results for the current span"""
+     def add_eval_run(self, eval_run: EvaluationRun, start_time: float):
+         """
+         Add evaluation run data to the trace
+
+         Args:
+             eval_run (EvaluationRun): The evaluation run to add to the trace
+             start_time (float): The start time of the evaluation run
+         """
          if self._current_span:
              duration = time.time() - start_time # Calculate duration from start_time
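
For reference, the scorer-loading step above can be read as a standalone helper. This is an illustrative sketch only (the helper itself is not part of judgeval); it mirrors the list comprehension in async_evaluate, where ScorerWrapper entries are resolved through load_implementation(use_judgment=True) and concrete scorers pass through unchanged:

from typing import List, Union

from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper


def load_scorers(scorers: list) -> List[Union[APIJudgmentScorer, JudgevalScorer]]:
    """Resolve ScorerWrapper entries to concrete scorer implementations."""
    loaded = []
    for scorer in scorers:
        if isinstance(scorer, ScorerWrapper):
            # Same call async_evaluate makes: defer to the Judgment-hosted implementation.
            loaded.append(scorer.load_implementation(use_judgment=True))
        else:
            loaded.append(scorer)
    return loaded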

@@ -261,7 +399,7 @@ class TraceClient:
                  depth=self.tracer.depth,
                  message=f"Evaluation results for {self._current_span}",
                  timestamp=time.time(),
-                 evaluation_result=results,
+                 evaluation_runs=[eval_run],
                  duration=duration,
                  span_type="evaluation"
              ))
@@ -342,7 +480,7 @@ class TraceClient:
                  "timestamp": entry["timestamp"],
                  "inputs": None,
                  "output": None,
-                 "evaluation_result": None,
+                 "evaluation_runs": [],
                  "span_type": entry.get("span_type", "span")
              }
              active_functions.append(function)
@@ -365,8 +503,8 @@
              if entry["type"] == "output" and entry["output"]:
                  current_entry["output"] = entry["output"]

-             if entry["type"] == "evaluation" and entry["evaluation_result"]:
-                 current_entry["evaluation_result"] = entry["evaluation_result"]
+             if entry["type"] == "evaluation" and entry["evaluation_runs"]:
+                 current_entry["evaluation_runs"] = entry["evaluation_runs"]

          # Sort by timestamp
          condensed.sort(key=lambda x: x["timestamp"])
@@ -418,26 +556,30 @@ class TraceClient:
              "empty_save": empty_save,
              "overwrite": overwrite
          }
-
-         # Save trace data by making POST request to API
-         response = requests.post(
-             JUDGMENT_TRACES_SAVE_API_URL,
-             json=trace_data,
-             headers={
-                 "Content-Type": "application/json",
-             }
-         )
-
-         if response.status_code == HTTPStatus.BAD_REQUEST:
-             raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
-         elif response.status_code != HTTPStatus.OK:
-             raise ValueError(f"Failed to save trace data: {response.text}")

-         if not empty_save and "ui_results_url" in response.json():
-             rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+         if not empty_save:
+             connection = pika.BlockingConnection(
+                 pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
+             channel = connection.channel()
+
+             channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
+
+             channel.basic_publish(
+                 exchange='',
+                 routing_key=RABBITMQ_QUEUE,
+                 body=json.dumps(trace_data),
+                 properties=pika.BasicProperties(
+                     delivery_mode=pika.DeliveryMode.Transient # Changed from Persistent to Transient
+                 ))
+             connection.close()

+         self.trace_manager_client.save_trace(trace_data, empty_save)
+
          return self.trace_id, trace_data

+     def delete(self):
+         return self.trace_manager_client.delete_trace(self.trace_id)
+
  class Tracer:
      _instance = None
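
Taken together, TraceClient.save() now hands the trace payload to RabbitMQ (when the save is not an empty one) and then persists it through the new TraceManagerClient. Two hedged sketches follow; neither is part of the package. First, direct use of TraceManagerClient, whose constructor and methods are defined earlier in this file (the API key and trace ids are placeholders):

import os

from judgeval.common.tracer import TraceManagerClient

# Placeholder key read from the environment.
manager = TraceManagerClient(judgment_api_key=os.environ["JUDGMENT_API_KEY"])

trace = manager.fetch_trace(trace_id="11111111-1111-1111-1111-111111111111")  # returns the stored trace JSON
manager.delete_trace(trace_id="11111111-1111-1111-1111-111111111111")
manager.delete_traces(trace_ids=["id-1", "id-2"])  # batch variant

Second, a minimal pika consumer for the queue the tracer publishes to. It assumes a reachable broker at the configured host and port, and that each message body is the json.dumps(trace_data) payload shown in save():

import json

import pika

from judgeval.constants import RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE


def handle_trace(ch, method, properties, body):
    # The producer publishes json.dumps(trace_data); decode and inspect it here.
    trace_data = json.loads(body)
    print("received trace payload with keys:", sorted(trace_data))


connection = pika.BlockingConnection(
    pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
channel = connection.channel()
channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)  # mirrors the producer's declaration
channel.basic_consume(queue=RABBITMQ_QUEUE, on_message_callback=handle_trace, auto_ack=True)
channel.start_consuming()
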
judgeval/constants.py CHANGED
@@ -32,16 +32,25 @@ class APIScorer(str, Enum):
          return member

  ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
- ## API URLs
+ # API URLs
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
  JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
  JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
  JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
  JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
  JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
+ JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
+ JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+ JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
+ JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"

- ## Models
+ # RabbitMQ
+ RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
+ RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
+ RABBITMQ_QUEUE = os.getenv("RABBITMQ_QUEUE", "task_queue")
+
+ # Models
  TOGETHER_SUPPORTED_MODELS = {
      "QWEN": "Qwen/Qwen2-72B-Instruct",
      "LLAMA3_70B_INSTRUCT_TURBO": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
@@ -51,7 +60,9 @@ TOGETHER_SUPPORTED_MODELS = {
      "MISTRAL_8x7B_INSTRUCT": "mistralai/Mixtral-8x7B-Instruct-v0.1",
  }

- ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS.keys())
+ JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+
+ ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS.keys()) | JUDGMENT_SUPPORTED_MODELS

  ## System settings
  MAX_WORKER_THREADS = 10
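
A short, hedged sketch of how these constants behave (not part of the package): they are read once at import time via os.getenv, so overrides must be set before judgeval is first imported, and values supplied through the environment arrive as strings. The Judgment-hosted models now count as acceptable:

import os

# Point the RabbitMQ settings at a local broker (placeholder values); this must run
# before judgeval.constants is imported for the overrides to take effect.
os.environ["RABBITMQ_HOST"] = "localhost"
os.environ["RABBITMQ_PORT"] = "5672"          # note: environment values are strings
os.environ["RABBITMQ_QUEUE"] = "task_queue"

from judgeval.constants import RABBITMQ_HOST, ACCEPTABLE_MODELS

print(RABBITMQ_HOST)                        # "localhost"
print("osiris-large" in ACCEPTABLE_MODELS)  # True, via JUDGMENT_SUPPORTED_MODELS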
@@ -15,7 +15,7 @@ class EvaluationRun(BaseModel):
          project_name (str): The name of the project the evaluation results belong to
          eval_name (str): A name for this evaluation run
          examples (List[Example]): The examples to evaluate
-         scorers (List[Union[JudgmentScorer, CustomScorer]]): A list of scorers to use for evaluation
+         scorers (List[Union[JudgmentScorer, JudgevalScorer]]): A list of scorers to use for evaluation
          model (str): The model used as a judge when using LLM as a Judge
          aggregator (Optional[str]): The aggregator to use for evaluation if using Mixture of Judges
          metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
@@ -33,6 +33,7 @@ class EvaluationRun(BaseModel):
      metadata: Optional[Dict[str, Any]] = None
      # API Key will be "" until user calls client.run_eval(), then API Key will be set
      judgment_api_key: Optional[str] = ""
+     override: Optional[bool] = False

      def model_dump(self, **kwargs):
          data = super().model_dump(**kwargs)
judgeval/judges/utils.py CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, Union, Tuple, List

  from judgeval.common.exceptions import InvalidJudgeModelError
  from judgeval.judges import JudgevalJudge, LiteLLMJudge, TogetherJudge, MixtureOfJudges
- from judgeval.constants import TOGETHER_SUPPORTED_MODELS
+ from judgeval.constants import TOGETHER_SUPPORTED_MODELS, JUDGMENT_SUPPORTED_MODELS, ACCEPTABLE_MODELS
  LITELLM_SUPPORTED_MODELS = set(litellm.model_list)


@@ -33,7 +33,13 @@ def create_judge(
      # Either string or List[str]
      if isinstance(model, list):
          for m in model:
-             if m not in TOGETHER_SUPPORTED_MODELS and m not in LITELLM_SUPPORTED_MODELS:
+             if m in JUDGMENT_SUPPORTED_MODELS:
+                 raise NotImplementedError(
+                     """Judgment models are not yet supported for local scoring.
+                     Please either set the `use_judgment` flag to True or use
+                     non-Judgment models."""
+                 )
+             if m not in LITELLM_SUPPORTED_MODELS and m not in TOGETHER_SUPPORTED_MODELS:
                  raise InvalidJudgeModelError(f"Invalid judge model chosen: {m}")
          return MixtureOfJudges(models=model), True
      # If model is a string, check that it corresponds to a valid model
@@ -41,5 +47,11 @@
          return LiteLLMJudge(model=model), True
      if model in TOGETHER_SUPPORTED_MODELS:
          return TogetherJudge(model=model), True
+     if model in JUDGMENT_SUPPORTED_MODELS:
+         raise NotImplementedError(
+             """Judgment models are not yet supported for local scoring.
+             Please either set the `use_judgment` flag to True or use
+             non-Judgment models."""
+         )
      else:
          raise InvalidJudgeModelError(f"Invalid judge model chosen: {model}")
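
A hedged sketch of the new behavior (it assumes create_judge can be called with just a model argument and that the import path matches the file above):

from judgeval.judges.utils import create_judge
from judgeval.common.exceptions import InvalidJudgeModelError

try:
    create_judge(model="osiris-large")  # Judgment-hosted model
except NotImplementedError:
    print("Judgment models are only scored through the Judgment API (use_judgment=True)")

try:
    create_judge(model="definitely-not-a-model")
except InvalidJudgeModelError as err:
    print(err)  # "Invalid judge model chosen: ..."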
@@ -23,7 +23,7 @@ from judgeval.run_evaluation import (
      assert_test
  )
  from judgeval.judges import JudgevalJudge
- from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL
+ from judgeval.constants import JUDGMENT_EVAL_FETCH_API_URL, JUDGMENT_EVAL_DELETE_API_URL, JUDGMENT_EVAL_DELETE_PROJECT_API_URL
  from judgeval.common.exceptions import JudgmentAPIError
  from pydantic import BaseModel

@@ -194,6 +194,51 @@ class JudgmentClient:
          eval_run_result[0]["id"] = result_id
          eval_run_result[0]["results"] = [ScoringResult(**filtered_result)]
          return eval_run_result
+
+     def delete_eval(self, project_name: str, eval_run_name: str) -> bool:
+         """
+         Deletes an evaluation from the server by project and run name.
+
+         Args:
+             project_name (str): Name of the project
+             eval_run_name (str): Name of the evaluation run
+
+         Returns:
+             bool: Whether the evaluation was successfully deleted
+         """
+         eval_run_request_body = EvalRunRequestBody(project_name=project_name,
+                                                    eval_name=eval_run_name,
+                                                    judgment_api_key=self.judgment_api_key)
+         response = requests.delete(JUDGMENT_EVAL_DELETE_API_URL,
+                                    json=eval_run_request_body.model_dump(),
+                                    headers={
+                                        "Content-Type": "application/json",
+                                    })
+         if response.status_code != requests.codes.ok:
+             raise ValueError(f"Error deleting eval results: {response.json()}")
+         return response.json()
+
+     def delete_project_evals(self, project_name: str) -> bool:
+         """
+         Deletes all evaluations from the server for a given project.
+
+         Args:
+             project_name (str): Name of the project
+
+         Returns:
+             bool: Whether the evaluations were successfully deleted
+         """
+         response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+                                    json={
+                                        "project_name": project_name,
+                                        "judgment_api_key": self.judgment_api_key
+                                    },
+                                    headers={
+                                        "Content-Type": "application/json",
+                                    })
+         if response.status_code != requests.codes.ok:
+             raise ValueError(f"Error deleting eval results: {response.json()}")
+         return response.json()
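
A hedged usage sketch for the two new deletion helpers (not part of the diff; how JudgmentClient is constructed is assumed here, an API key as in earlier releases, and the project and run names are placeholders):

from judgeval.judgment_client import JudgmentClient

client = JudgmentClient(judgment_api_key="YOUR_JUDGMENT_API_KEY")  # assumed constructor

# Remove a single evaluation run, identified by project + run name.
client.delete_eval(project_name="demo-project", eval_run_name="demo-run")

# Or remove every evaluation stored under a project.
client.delete_project_evals(project_name="demo-project")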

      def _validate_api_key(self):
          """
@@ -1,5 +1,5 @@
  """
- Custom Scorer class
+ Judgeval Scorer class

  Enables client to create custom scorers that do not fall under any of the ready-made Judgment scorers.
  To create a custom scorer, extend this class and implement the `score_example`, `a_score_example`, and `success_check` methods.
@@ -57,12 +57,12 @@ class JudgevalScorer:
          verbose_logs: Optional[str] = None,
          additional_metadata: Optional[Dict] = None
      ):
-         debug(f"Initializing CustomScorer with score_type={score_type}, threshold={threshold}")
+         debug(f"Initializing JudgevalScorer with score_type={score_type}, threshold={threshold}")
          if not 0 <= threshold <= 1:
              raise ValueError("Threshold must be between 0 and 1")
          if strict_mode:
              warning("Strict mode enabled - scoring will be more rigorous")
-         info(f"CustomScorer initialized with evaluation_model: {evaluation_model}")
+         info(f"JudgevalScorer initialized with evaluation_model: {evaluation_model}")
          self.score_type = score_type
          self.threshold = threshold
          self.score = score
@@ -81,7 +81,7 @@ class JudgevalScorer:

      def _add_model(self, model: Optional[Union[str, List[str], JudgevalJudge]] = None):
          """
-         Adds the evaluation model to the CustomScorer instance
+         Adds the evaluation model to the JudgevalScorer instance

          This method is used at eval time
          """
@@ -116,10 +116,10 @@ class JudgevalScorer:
          raise NotImplementedError("You must implement the `passes` method in your custom scorer")

      def __str__(self):
-         debug("Converting CustomScorer instance to string representation")
+         debug("Converting JudgevalScorer instance to string representation")
          if self.error:
-             warning(f"CustomScorer contains error: {self.error}")
-         info(f"CustomScorer status - success: {self.success}, score: {self.score}")
+             warning(f"JudgevalScorer contains error: {self.error}")
+         info(f"JudgevalScorer status - success: {self.success}, score: {self.score}")
          attributes = {
              "score_type": self.score_type,
              "threshold": self.threshold,
@@ -137,4 +137,4 @@ class JudgevalScorer:
              "verbose_logs": self.verbose_logs,
              "additional_metadata": self.additional_metadata,
          }
-         return f"CustomScorer({attributes})"
+         return f"JudgevalScorer({attributes})"
@@ -2,7 +2,7 @@
  Code for the local implementation of the Faithfulness metric.
  """
  from typing import List, Optional, Union
-
+ from pprint import pprint
  from judgeval.constants import APIScorer
  from judgeval.data import (
      Example,
@@ -114,11 +114,13 @@ class FaithfulnessScorer(JudgevalScorer):
          ):
              self.claims = await self._a_generate_claims(example.actual_output)

+
              if self.additional_metadata is None:
                  self.additional_metadata = {}
              self.additional_metadata["claims"] = self.claims

              self.verdicts = await self._a_generate_verdicts(example.retrieval_context)
+
              self.additional_metadata["verdicts"] = [v.model_dump() for v in self.verdicts] # Add verdicts generated to metadata

              self.score = self._calculate_score()
@@ -129,10 +129,13 @@ JSON:
      def create_verdicts(claims, retrieval_context):
          return f"""==== TASK INSTRUCTIONS ====
  You will be provided with a list of claims from an LLM's output text, accompanied by the retrieval documents that the LLM used to generate the output.
- Your task is to determine whether EACH claim is factually consistent with the retrieval context ("yes", "no", or "idk").
- ONLY choose 'no' if the retrieval context DIRECTLY CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
+ I'm pretty sure that many of the claims are factually contradictory to the retrieval context, but I want you to double check that I'm right.
+ For each claim, choose one of ("yes", "no", or "idk") to represent whether the claim is correct based on the retrieval context.
+ YOU SHOULD be very scrutinous--if any part of the claim is contradicted by the retrieval context, you should choose "no". Think really hard about finding the contradictions, since they can be subtle!
+
+ Choose 'no' if the retrieval context CONTRADICTS the claims. YOU SHOULD NEVER USE YOUR PRIOR KNOWLEDGE IN YOUR JUDGMENT.
  Claims made using vague, suggestive, or speculative language such as 'may have', 'possibility due to', do NOT count as a contradiction.
- Claims that are not justified by the retrieval context due to a lack of information MUST BE ANSWERED with 'idk'.
+ Claims that are fuzzy based on lack of information MUST BE ANSWERED with 'idk'.

  ==== FORMATTING YOUR ANSWER ====
  Please return your answer in JSON format, with the 'verdicts' key as a list of JSON objects. Each JSON object should have 2 fields: 'verdict' and 'reason'.
@@ -72,7 +72,7 @@ class PromptScorer(JudgevalScorer, BaseModel):
              strict_mode=strict_mode,
              verbose_mode=verbose_mode,
          )
-         # Then initialize CustomScorer
+         # Then initialize JudgevalScorer
          JudgevalScorer.__init__(
              self,
              score_type=name,
@@ -309,7 +309,7 @@ class ClassifierScorer(PromptScorer):
              strict_mode=strict_mode,
              verbose_mode=verbose_mode,
          )
-         # Then initialize CustomScorer
+         # Then initialize JudgevalScorer
          JudgevalScorer.__init__(
              self,
              score_type=name,
judgeval/scorers/score.py CHANGED
@@ -1,5 +1,5 @@
  """
- Infrastructure for executing evaluations of `Example`s using one or more `CustomScorer`s.
+ Infrastructure for executing evaluations of `Example`s using one or more `JudgevalScorer`s.
  """


@@ -30,15 +30,15 @@ async def safe_a_score_example(
  ):
      """
      Scoring task function when not using a progress indicator!
-     "Safely" scores an `Example` using a `CustomScorer` by gracefully handling any exceptions that may occur.
+     "Safely" scores an `Example` using a `JudgevalScorer` by gracefully handling any exceptions that may occur.

      Args:
-         scorer (CustomScorer): The `CustomScorer` to use for scoring the example.
+         scorer (JudgevalScorer): The `JudgevalScorer` to use for scoring the example.
          example (Example): The `Example` to be scored.

          ignore_errors (bool): Whether to ignore errors during the evaluation.
              If set to false, any error will be raised and stop the evaluation.
-             If set to true, the error will be stored in the `error` attribute of the `CustomScorer` and the `success` attribute will be set to False.
+             If set to true, the error will be stored in the `error` attribute of the `JudgevalScorer` and the `success` attribute will be set to False.

          skip_on_missing_params (bool): Whether to skip the test case if required parameters are missing.
      """
@@ -102,12 +102,12 @@ async def score_task(
      skip_on_missing_params: bool = True,
  ):
      """
-     Task function for asynchronously measuring a given example using a custom scorer.
+     Task function for asynchronously measuring a given example using a JudgevalScorer.

      Args:
          task_id (int): The ID of the task being measured.
          progress (Progress): An instance of the Progress class to track task progress.
-         scorer (CustomScorer): An instance of the CustomScorer class used to score the example.
+         scorer (JudgevalScorer): An instance of the JudgevalScorer class used to score the example.
          example (Example): The example to be scored.
          ignore_errors (bool, optional): Whether to ignore errors during scoring. Defaults to True.
          skip_on_missing_params (bool, optional): Whether to skip scoring if there are missing parameters. Defaults to True.
@@ -189,10 +189,10 @@ async def score_with_indicator(
      show_indicator: bool,
  ):
      """
-     Scores an example using a list of custom scorers, optionally displaying a progress indicator.
+     Scores an example using a list of JudgevalScorers, optionally displaying a progress indicator.

      Args:
-         scorers (List[CustomScorer]): A list of custom scorer objects to evaluate the example.
+         scorers (List[JudgevalScorer]): A list of JudgevalScorer objects to evaluate the example.
          example (Example): The example to be scored.
          ignore_errors (bool): If True, errors during scoring will be ignored.
          skip_on_missing_params (bool): If True, scoring will be skipped if required parameters are missing.
@@ -253,8 +253,8 @@ async def a_execute_scoring(
      _use_bar_indicator: bool = True,
  ) -> List[ScoringResult]:
      """
-     Executes evaluations of `Example`s asynchronously using one or more `CustomScorer`s.
-     Each `Example` will be evaluated by all of the `CustomScorer`s in the `scorers` list.
+     Executes evaluations of `Example`s asynchronously using one or more `JudgevalScorer`s.
+     Each `Example` will be evaluated by all of the `JudgevalScorer`s in the `scorers` list.

      Args:
          examples (List[Example]): A list of `Example` objects to be evaluated.
@@ -379,7 +379,7 @@ async def a_eval_examples_helper(
      Evaluate a single example asynchronously using a list of scorers.

      Args:
-         scorers (List[CustomScorer]): List of CustomScorer objects to evaluate the example.
+         scorers (List[JudgevalScorer]): List of JudgevalScorer objects to evaluate the example.
          example (Example): The example to be evaluated.
          scoring_results (List[ScoringResult]): List to store the scoring results.
          score_index (int): Index at which the result should be stored in scoring_results.
judgeval/scorers/utils.py CHANGED
@@ -32,7 +32,7 @@ def clone_scorers(scorers: List[JudgevalScorer]) -> List[JudgevalScorer]:
          valid_args = {key: args[key] for key in valid_params if key in args}

          cloned_scorer = scorer_class(**valid_args)
-         # kinda hacky, but in case the class inheriting from CustomScorer doesn't have `model` in its __init__,
+         # kinda hacky, but in case the class inheriting from JudgevalScorer doesn't have `model` in its __init__,
          # we need to explicitly include it here so that we can add the judge model to the cloned scorer
          cloned_scorer._add_model(model=args.get("model"))
          cloned_scorers.append(cloned_scorer)
@@ -91,7 +91,7 @@ def parse_response_json(llm_response: str, scorer: Optional[JudgevalScorer] = No

      Args:
          llm_response (str): The response from an LLM.
-         scorer (CustomScorer, optional): The scorer object to forward errors to (if any).
+         scorer (JudgevalScorer, optional): The scorer object to forward errors to (if any).
      """
      start = llm_response.find("{") # opening bracket
      end = llm_response.rfind("}") + 1 # closing bracket
@@ -129,7 +129,7 @@ def create_verbose_logs(metric: JudgevalScorer, steps: List[str]) -> str:
      Creates verbose logs for a scorer object.

      Args:
-         metric (CustomScorer): The scorer object.
+         metric (JudgevalScorer): The scorer object.
          steps (List[str]): The steps to be included in the verbose logs.

      Returns:
@@ -0,0 +1,3 @@
+ from judgeval.common.tracer import Tracer, wrap, TraceClient, TraceManagerClient
+
+ __all__ = ["Tracer", "wrap", "TraceClient", "TraceManagerClient"]
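
The new judgeval/tracer package is a thin re-export of the tracing primitives, so both import paths resolve to the same objects. A small sketch:

from judgeval.tracer import Tracer, TraceManagerClient
from judgeval.common.tracer import Tracer as CommonTracer

# The package-level names are re-exports, not copies.
assert Tracer is CommonTracer
print(TraceManagerClient.__module__)  # "judgeval.common.tracer"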
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.10
+ Version: 0.0.11
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,12 +12,10 @@ Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.11
  Requires-Dist: anthropic
  Requires-Dist: fastapi
- Requires-Dist: langfuse==2.50.3
  Requires-Dist: litellm
  Requires-Dist: nest-asyncio
  Requires-Dist: openai
  Requires-Dist: pandas
- Requires-Dist: patronus
  Requires-Dist: pika
  Requires-Dist: python-dotenv==1.0.1
  Requires-Dist: requests
@@ -25,11 +23,14 @@ Requires-Dist: supabase
  Requires-Dist: together
  Requires-Dist: uvicorn
  Provides-Extra: dev
+ Requires-Dist: langfuse==2.50.3; extra == 'dev'
+ Requires-Dist: patronus; extra == 'dev'
  Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
  Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
  Requires-Dist: pytest>=8.3.4; extra == 'dev'
+ Requires-Dist: tavily-python; extra == 'dev'
  Description-Content-Type: text/markdown

  # judgeval

- Judgeval is a open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.
+ Judgeval is an open-source evaluation framework for multi-agent LLM workflows, for both real-time and offline evaluations.
@@ -1,13 +1,13 @@
  judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
  judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
- judgeval/constants.py,sha256=qwWc3EOpXSn9SHq5rylkHhnzH5WldedqSMCToa7vgZk,2040
- judgeval/evaluation_run.py,sha256=KcIS7mDR_9XEdqYrJXFcrLz5IDMof34HcD5VtjZgV8w,5884
- judgeval/judgment_client.py,sha256=jMeayUI-Z-GX4mVMVC9t5f7ENKLQ8dOepScYu5Yytf0,11777
+ judgeval/constants.py,sha256=oL3kWHg9CzQJiTInDTgJgxRhF3fgylhvEVP360UqG8A,2654
+ judgeval/evaluation_run.py,sha256=ev-IbL34SwRv8lwB4KHfYag1jYo6b049R8mmwNBqmnM,5923
+ judgeval/judgment_client.py,sha256=thmSXi2essIlmd_j5SjlBw9_8qJJp6N3djoWdLaMrj0,13770
  judgeval/run_evaluation.py,sha256=YOQ6s9RuUrXPTgoYexf7r6Hl1QKIMSTdvHl9kw-ZMzw,20103
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
  judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
- judgeval/common/tracer.py,sha256=k5g9ZLeM-fLdV_q9NpodN8gW4nLTIXsbxeTaXVjm9jk,25658
+ judgeval/common/tracer.py,sha256=wp-oGl8rdAe3_UXcvrEKFg7V6Vnvrnz9y_RVVgYOjCY,29934
  judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
  judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
  judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
@@ -24,15 +24,15 @@ judgeval/judges/base_judge.py,sha256=qhYSFxE21WajYNaT4X-qwWGtpo_tqzBzdqbszSheSD8
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
  judgeval/judges/mixture_of_judges.py,sha256=OuGWCuXyqe7s_Y74ij90TJFRfHU-VAFyJVVrwBM0RO0,15532
  judgeval/judges/together_judge.py,sha256=x3jf-tq77QPXHeeoF739f69hE_0VceXD9FHLrVFdGVA,2275
- judgeval/judges/utils.py,sha256=YUvivcGV1OKLPMJ9N6aTvhA0r_zzJ2NXriPguiiaVaY,2110
+ judgeval/judges/utils.py,sha256=sYxSJq5cI9LtyJaxurcW9IwngALC9Ty8F_Mb8gz81nE,2732
  judgeval/scorers/__init__.py,sha256=XcDdLn_s16rSQob0896oj4JXTA8-Xfl271TUEBj6Oew,998
  judgeval/scorers/api_scorer.py,sha256=88kCWr6IetLFn3ziTPG-lwDWvMhFUC6xfINU1MJBoho,2125
  judgeval/scorers/base_scorer.py,sha256=mbOReG88fWaqCnC8F0u5QepRlzgVkuOz89KEKYxrmMc,1794
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
- judgeval/scorers/judgeval_scorer.py,sha256=14SZ3sBZtGNM3BCegKebkNad9LTs5Tyhs0kD6l3wLAA,6275
- judgeval/scorers/prompt_scorer.py,sha256=bUv8eZNy1XGVM1gNMt33dgIVX6zj63bGAV6O0o0c7yg,17821
- judgeval/scorers/score.py,sha256=zJKG21h9Njyj2vS36CAFK2wlbOcHSKgrLgHV5_25KKw,18630
- judgeval/scorers/utils.py,sha256=dtueaJm8e3Ph3wj1vC-srzadgK_CoIlOefdvMQ-cwK8,6826
+ judgeval/scorers/judgeval_scorer.py,sha256=T9fkJwFVYMzW88TFr-RWg-Fqmp-cdrA8bLFymqMzOa8,6291
+ judgeval/scorers/prompt_scorer.py,sha256=UHkOUts1aIQCoYFcr-sKyucmvv_8ONFE5LZO01aObd0,17825
+ judgeval/scorers/score.py,sha256=GALVmeApP1Cyih2vY93zRaU6RShtW4jJDG47Pm6yfnw,18657
+ judgeval/scorers/utils.py,sha256=X7lBI0LRBnBR8KUU-Fvont2Wq31t5p6zOTWGebWIcAU,6832
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=D12jJAKTcfmz8fDBkYeOmdzZMZsURuODIJ5p7Nk1lWE,5189
  judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=zFwH2TC5AFlpDRfVKc6GN4YTtnmeyALl-JRLoZD_Jco,1284
  judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=690G5askjE8dcbKPGvCF6JxAEM9QJUqb-3K-D6lI6oM,463
@@ -65,8 +65,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__i
  judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py,sha256=BtVgE7z-9PHfFRcvn96aEG5mXVcWBweVyty934hZdiU,8915
  judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py,sha256=6EHBfxWvhur9z14l8zCw5Z4Hb2uRo9Yv7qIhTRT7-aM,4591
  judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py,sha256=NbkSqPwxgF4T8KsvuIWhVyRwdOlo7mNHMFuRStTFnvk,154
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=4XqdcdgHg3evrg-IQwXmUHEyee1lZUjXRNEiQSvdpmQ,11341
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=oxmCsouh5ExUMmlSuCDolpYR2y9c-yKth6PHrdsCH_g,11387
+ judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py,sha256=fSxIn1uRvwCf7u4cOK4XrcPdS7OPzAWL9xt1pxujosY,11368
+ judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py,sha256=vNLjF4NKZJSV4VNenHzoAUB2xVZz6tt_5AzryKmOVrI,11690
  judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py,sha256=fZk3UQxI9Nljf5qjCRLRkF0D-AERFHElI9cC83_cgV8,158
  judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py,sha256=orCrEe1IH4NE7m-AkKMX0EHbysTuAwIqfohcQaU7XxQ,9670
  judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py,sha256=BkEu7Q_jIVdcdZSq37tMjitZFzACd8-iBTDDXfGbZig,4346
@@ -77,7 +77,8 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py
  judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py,sha256=CBuE6oCxMzTdJoXFt_YPWBte88kedEQ9t3g52ZRztGY,21086
  judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
  judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
- judgeval-0.0.10.dist-info/METADATA,sha256=i9jeAPs3jY5hAHAdE_rlen4qJdEk0eAqQ0BOzMie97I,1205
- judgeval-0.0.10.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- judgeval-0.0.10.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
- judgeval-0.0.10.dist-info/RECORD,,
+ judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
+ judgeval-0.0.11.dist-info/METADATA,sha256=WH8aPpUNCwE1Zr21qJ0H0WEVB_i_dilyLSbw9e5nXZo,1283
+ judgeval-0.0.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ judgeval-0.0.11.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+ judgeval-0.0.11.dist-info/RECORD,,