judgeval 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -1,10 +1,12 @@
+ from __future__ import annotations
+
  import asyncio
  import concurrent.futures
  import time
  import orjson
  import sys
  import threading
- from typing import List, Dict, Union, Optional, Callable, Tuple, Any
+ from typing import List, Dict, Union, Optional, Callable, Tuple, Any, TYPE_CHECKING
  from rich import print as rprint

  from judgeval.data import ScorerData, ScoringResult, Example, Trace
@@ -17,10 +19,13 @@ from judgeval.constants import (
  from judgeval.common.exceptions import JudgmentAPIError
  from judgeval.common.api.api import JudgmentAPIException
  from judgeval.common.logger import judgeval_logger
- from judgeval.evaluation_run import EvaluationRun
- from judgeval.data.trace_run import TraceRun
- from judgeval.common.tracer import Tracer
- from judgeval.integrations.langgraph import JudgevalCallbackHandler
+
+
+ if TYPE_CHECKING:
+     from judgeval.common.tracer import Tracer
+     from judgeval.data.trace_run import TraceRun
+     from judgeval.data.evaluation_run import EvaluationRun
+     from judgeval.integrations.langgraph import JudgevalCallbackHandler


  def safe_run_async(coro):
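Note: the imports moved under TYPE_CHECKING above are seen only by static type checkers; combined with `from __future__ import annotations`, the annotations that reference them are never evaluated at runtime. A generic sketch of this pattern (module and class names below are illustrative, not from judgeval):

    from __future__ import annotations
    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Visible to mypy/pyright only; never imported at runtime, which avoids
        # circular or heavyweight imports.
        from some_heavy_module import HeavyClient

    def make_client(config: dict) -> HeavyClient:
        # Deferred runtime import inside the function body.
        from some_heavy_module import HeavyClient
        return HeavyClient(**config)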
@@ -135,80 +140,6 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
      return results


- def check_experiment_type(
-     eval_name: str,
-     project_name: str,
-     judgment_api_key: str,
-     organization_id: str,
-     is_trace: bool,
- ) -> None:
-     """
-     Checks if the current experiment, if one exists, has the same type (examples of traces)
-     """
-     api_client = JudgmentApiClient(judgment_api_key, organization_id)
-
-     try:
-         api_client.check_experiment_type(eval_name, project_name, is_trace)
-     except JudgmentAPIException as e:
-         if e.response.status_code == 422:
-             judgeval_logger.error(f"{e.response_json}")
-             raise ValueError(f"{e.response_json}")
-         else:
-             raise e
-     except Exception as e:
-         judgeval_logger.error(f"Failed to check if experiment type exists: {str(e)}")
-         raise JudgmentAPIError(f"Failed to check if experiment type exists: {str(e)}")
-
-
- def check_eval_run_name_exists(
-     eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
- ) -> None:
-     """
-     Checks if an evaluation run name already exists for a given project.
-
-     Args:
-         eval_name (str): Name of the evaluation run
-         project_name (str): Name of the project
-         judgment_api_key (str): API key for authentication
-
-     Raises:
-         ValueError: If the evaluation run name already exists
-         JudgmentAPIError: If there's an API error during the check
-     """
-     api_client = JudgmentApiClient(judgment_api_key, organization_id)
-     try:
-         api_client.check_eval_run_name_exists(eval_name, project_name)
-     except JudgmentAPIException as e:
-         if e.response.status_code == 409:
-             error_str = f"Eval run name '{eval_name}' already exists for this project. Please choose a different name, set the `override` flag to true, or set the `append` flag to true. See https://docs.judgmentlabs.ai/sdk-reference/judgment-client#override for more information."
-             judgeval_logger.error(error_str)
-             raise ValueError(error_str)
-         else:
-             raise e
-
-     except Exception as e:
-         judgeval_logger.error(f"Failed to check if eval run name exists: {str(e)}")
-         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
-
-
- def check_example_keys(
-     keys: List[str],
-     eval_name: str,
-     project_name: str,
-     judgment_api_key: str,
-     organization_id: str,
- ) -> None:
-     """
-     Checks if the current experiment (if one exists) has the same keys for example
-     """
-     api_client = JudgmentApiClient(judgment_api_key, organization_id)
-     try:
-         api_client.check_example_keys(keys, eval_name, project_name)
-     except Exception as e:
-         judgeval_logger.error(f"Failed to check if example keys match: {str(e)}")
-         raise JudgmentAPIError(f"Failed to check if example keys match: {str(e)}")
-
-
  def log_evaluation_results(
      scoring_results: List[ScoringResult],
      run: Union[EvaluationRun, TraceRun],
@@ -280,29 +211,10 @@ def check_examples(
  def run_trace_eval(
      trace_run: TraceRun,
      judgment_api_key: str,
-     override: bool = False,
      function: Optional[Callable] = None,
-     tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
+     tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
      examples: Optional[List[Example]] = None,
  ) -> List[ScoringResult]:
-     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-     if not override and not trace_run.append:
-         check_eval_run_name_exists(
-             trace_run.eval_name,
-             trace_run.project_name,
-             judgment_api_key,
-             trace_run.organization_id,
-         )
-
-     if trace_run.append:
-         # Check that the current experiment, if one exists, has the same type (examples or traces)
-         check_experiment_type(
-             trace_run.eval_name,
-             trace_run.project_name,
-             judgment_api_key,
-             trace_run.organization_id,
-             True,
-         )
      if function and tracer and examples is not None:
          new_traces: List[Trace] = []

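In 0.6.0 the client no longer takes an `override` flag or performs the eval-run-name and experiment-type pre-checks before submitting a trace evaluation. A minimal sketch of the new call shape (all variable names below are hypothetical):

    results = run_trace_eval(
        trace_run=my_trace_run,            # TraceRun describing the experiment
        judgment_api_key=JUDGMENT_API_KEY,
        function=my_agent_fn,              # optional: function to re-run over `examples`
        tracer=my_tracer,                  # Tracer or JudgevalCallbackHandler instance
        examples=my_examples,              # optional: List[Example] passed to `function`
    )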
@@ -371,43 +283,8 @@ def run_trace_eval(
      return scoring_results


- async def get_evaluation_status(
-     eval_name: str, project_name: str, judgment_api_key: str, organization_id: str
- ) -> Dict:
-     """
-     Gets the status of an async evaluation run.
-
-     Args:
-         eval_name (str): Name of the evaluation run
-         project_name (str): Name of the project
-         judgment_api_key (str): API key for authentication
-         organization_id (str): Organization ID for the evaluation
-
-     Returns:
-         Dict: Status information including:
-             - status: 'pending', 'running', 'completed', or 'failed'
-             - results: List of ScoringResult objects if completed
-             - error: Error message if failed
-     """
-     api_client = JudgmentApiClient(judgment_api_key, organization_id)
-     try:
-         return api_client.get_evaluation_status(eval_name, project_name)
-     except Exception as e:
-         raise JudgmentAPIError(
-             f"An error occurred while checking evaluation status: {str(e)}"
-         )
-
-
- def retrieve_counts(result: Dict):
-     scorer_data_count = 0
-     for example in result.get("examples", []):
-         for scorer in example.get("scorer_data", []):
-             scorer_data_count += 1
-     return scorer_data_count
-
-
  def _poll_evaluation_until_complete(
-     eval_name: str,
+     experiment_run_id: str,
      project_name: str,
      judgment_api_key: str,
      organization_id: str,
@@ -438,14 +315,16 @@ def _poll_evaluation_until_complete(
          poll_count += 1
          try:
              # Check status
-             status_response = api_client.get_evaluation_status(eval_name, project_name)
+             status_response = api_client.get_evaluation_status(
+                 experiment_run_id, project_name
+             )

              if status_response.get("status") != "completed":
                  time.sleep(poll_interval_seconds)
                  continue

              results_response = api_client.fetch_evaluation_results(
-                 project_name, eval_name
+                 experiment_run_id, project_name
              )
              url = results_response.get("ui_results_url")

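A condensed sketch of the polling flow above, which now identifies the run by its experiment_run_id instead of its eval_name (the helper name and simplified loop are illustrative; the real function also tracks poll counts, errors, and expected scorer-data counts):

    import time

    def wait_for_results(api_client, experiment_run_id: str, project_name: str,
                         poll_interval_seconds: float = 5.0) -> dict:
        # Poll until the server reports the run as completed, then fetch results.
        while True:
            status = api_client.get_evaluation_status(experiment_run_id, project_name)
            if status.get("status") == "completed":
                break
            time.sleep(poll_interval_seconds)
        return api_client.fetch_evaluation_results(experiment_run_id, project_name)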
@@ -508,14 +387,12 @@ def progress_logger(stop_event, msg="Working...", interval=5):
  def run_eval(
      evaluation_run: EvaluationRun,
      judgment_api_key: str,
-     override: bool = False,
  ) -> List[ScoringResult]:
      """
      Executes an evaluation of `Example`s using one or more `Scorer`s

      Args:
          evaluation_run (EvaluationRun): Stores example and evaluation together for running
-         override (bool, optional): Whether to override existing evaluation run with same name. Defaults to False.

      Returns:
          List[ScoringResult]: A list of ScoringResult objects
@@ -529,52 +406,31 @@ def run_eval(
              f"All examples must have the same keys: {current_keys} != {keys}"
          )

-     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
-     if not override and not evaluation_run.append:
-         check_eval_run_name_exists(
-             evaluation_run.eval_name,
-             evaluation_run.project_name,
-             judgment_api_key,
-             evaluation_run.organization_id,
-         )
-
-     if evaluation_run.append:
-         # Check that the current experiment, if one exists, has the same type (examples of traces)
-         check_experiment_type(
-             evaluation_run.eval_name,
-             evaluation_run.project_name,
-             judgment_api_key,
-             evaluation_run.organization_id,
-             False,
-         )
-
-     # Ensure that current experiment (if one exists) has the same keys for example
-     check_example_keys(
-         keys=list(keys),
-         eval_name=evaluation_run.eval_name,
-         project_name=evaluation_run.project_name,
-         judgment_api_key=judgment_api_key,
-         organization_id=evaluation_run.organization_id,
-     )
-
-     judgment_scorers: List[APIScorerConfig] = []
-     local_scorers: List[BaseScorer] = []
-     for scorer in evaluation_run.scorers:
-         if isinstance(scorer, APIScorerConfig):
-             judgment_scorers.append(scorer)
-         else:
-             local_scorers.append(scorer)
-
      results: List[ScoringResult] = []
      url = ""

-     if len(local_scorers) > 0 and len(judgment_scorers) > 0:
+     if (
+         len(evaluation_run.custom_scorers) > 0
+         and len(evaluation_run.judgment_scorers) > 0
+     ):
          error_msg = "We currently do not support running both local and Judgment API scorers at the same time. Please run your evaluation with either local scorers or Judgment API scorers, but not both."
          judgeval_logger.error(error_msg)
          raise ValueError(error_msg)

-     if len(judgment_scorers) > 0:
-         check_examples(evaluation_run.examples, judgment_scorers)
+     e2b_scorers = [cs for cs in evaluation_run.custom_scorers if cs.server_hosted]
+
+     if evaluation_run.judgment_scorers or e2b_scorers:
+         if evaluation_run.judgment_scorers and e2b_scorers:
+             error_msg = "We currently do not support running both hosted custom scorers and Judgment API scorers at the same time. Please run your evaluation with one or the other, but not both."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         if len(e2b_scorers) > 1:
+             error_msg = "We currently do not support running multiple hosted custom scorers at the same time."
+             judgeval_logger.error(error_msg)
+             raise ValueError(error_msg)
+
+         check_examples(evaluation_run.examples, evaluation_run.judgment_scorers)
          stop_event = threading.Event()
          t = threading.Thread(
              target=progress_logger, args=(stop_event, "Running evaluation...")
@@ -595,36 +451,26 @@ def run_eval(
                  )
                  raise JudgmentAPIError(error_message)

-             old_scorer_data_count = 0
-             if evaluation_run.append:
-                 try:
-                     results_response = api_client.fetch_evaluation_results(
-                         evaluation_run.project_name, evaluation_run.eval_name
-                     )
-                     old_scorer_data_count = retrieve_counts(results_response)
-                 except Exception:
-                     # This usually means the user did append = True but the eval run name doesn't exist yet
-                     pass
-
+             num_scorers = (
+                 len(evaluation_run.judgment_scorers)
+                 if evaluation_run.judgment_scorers
+                 else sum(1 for cs in evaluation_run.custom_scorers if cs.server_hosted)
+             )
              results, url = _poll_evaluation_until_complete(
-                 eval_name=evaluation_run.eval_name,
+                 experiment_run_id=evaluation_run.id,
                  project_name=evaluation_run.project_name,
                  judgment_api_key=judgment_api_key,
                  organization_id=evaluation_run.organization_id,
-                 expected_scorer_data_count=(
-                     len(evaluation_run.scorers) * len(evaluation_run.examples)
-                 )
-                 + old_scorer_data_count,
+                 expected_scorer_data_count=(num_scorers * len(evaluation_run.examples)),
              )
          finally:
              stop_event.set()
              t.join()
-
-     if len(local_scorers) > 0:
+     else:
          results = safe_run_async(
              a_execute_scoring(
                  evaluation_run.examples,
-                 local_scorers,
+                 evaluation_run.custom_scorers,
                  model=evaluation_run.model,
                  throttle_value=0,
                  max_concurrent=MAX_CONCURRENT_EVALUATIONS,
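The scorer routing in 0.6.0's run_eval can be summarized by the following condensed sketch (a hypothetical helper mirroring the branches shown above, not code from the package): Judgment API scorers and server-hosted custom scorers are submitted to the server and polled for results, while purely local custom scorers run in-process.

    def route_scorers(custom_scorers, judgment_scorers) -> str:
        # Mirrors the validation order in run_eval above (condensed).
        hosted = [cs for cs in custom_scorers if cs.server_hosted]
        if custom_scorers and judgment_scorers:
            raise ValueError("local and Judgment API scorers cannot be mixed in one run")
        if len(hosted) > 1:
            raise ValueError("only one server-hosted custom scorer is supported per run")
        if judgment_scorers or hosted:
            return "api"     # submitted to the server, then polled until complete
        return "local"       # scored in-process via a_execute_scoring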
@@ -26,6 +26,7 @@ class BaseScorer(BaseModel):
      name: Optional[str] = (
          None # name of your scorer (Faithfulness, PromptScorer-randomslug)
      )
+     class_name: Optional[str] = None # The name of the class of the scorer
      score: Optional[float] = None # The float score of the scorer run on the test case
      score_breakdown: Optional[Dict] = None
      reason: Optional[str] = ""
@@ -39,24 +40,22 @@ class BaseScorer(BaseModel):
      error: Optional[str] = None # The error message if the scorer failed
      additional_metadata: Optional[Dict] = None # Additional metadata for the scorer
      user: Optional[str] = None # The user ID of the scorer
+     server_hosted: bool = False # Whether the scorer is enabled for e2b

-     @model_validator(mode="before")
+     @model_validator(mode="after")
      @classmethod
-     def enforce_strict_threshold(cls, data: dict):
-         if data.get("strict_mode"):
-             data["threshold"] = 1.0
+     def enforce_strict_threshold(cls, data: "BaseScorer"):
+         if data.strict_mode:
+             data.threshold = 1.0
          return data

      @model_validator(mode="after")
      @classmethod
      def default_name(cls, m: "BaseScorer") -> "BaseScorer":
+         # Always set class_name to the string name of the class
+         m.class_name = m.__class__.__name__
          if not m.name:
-             # Try to use the class name if it exists and is not empty
-             class_name = getattr(m, "__class__", None)
-             if class_name and getattr(m.__class__, "__name__", None):
-                 m.name = m.__class__.__name__
-             else:
-                 m.name = m.score_type
+             m.name = m.class_name
          return m

      def _add_model(self, model: str):
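A minimal sketch of how the new after-validators behave, assuming a hypothetical subclass (the field names come from BaseScorer as shown above; the exact set of required fields may differ):

    from judgeval.scorers import BaseScorer

    class MyLengthScorer(BaseScorer):
        score_type: str = "length_check"   # hypothetical score type

    s = MyLengthScorer(threshold=0.7, strict_mode=True)
    print(s.class_name)     # "MyLengthScorer" -- filled in by the after-validator
    print(s.name)           # defaults to class_name when no explicit name is given
    print(s.threshold)      # 1.0 -- strict_mode now overrides the threshold after validation
    print(s.server_hosted)  # False unless the scorer is marked as server-hosted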
@@ -11,13 +11,14 @@ from judgeval.common.logger import judgeval_logger
  def push_prompt_scorer(
      name: str,
      prompt: str,
+     threshold: float,
      options: Optional[Dict[str, float]] = None,
      judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
      organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
  ) -> str:
      client = JudgmentApiClient(judgment_api_key, organization_id)
      try:
-         r = client.save_scorer(name, prompt, options)
+         r = client.save_scorer(name, prompt, threshold, options)
      except JudgmentAPIException as e:
          if e.status_code == 500:
              raise JudgmentAPIError(
@@ -90,6 +91,7 @@ class PromptScorer(APIScorerConfig):
          return cls(
              name=name,
              prompt=scorer_config["prompt"],
+             threshold=scorer_config["threshold"],
              options=scorer_config.get("options"),
              judgment_api_key=judgment_api_key,
              organization_id=organization_id,
@@ -100,16 +102,20 @@ class PromptScorer(APIScorerConfig):
          cls,
          name: str,
          prompt: str,
+         threshold: Optional[float] = 0.5,
          options: Optional[Dict[str, float]] = None,
          judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
          organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
      ):
          if not scorer_exists(name, judgment_api_key, organization_id):
-             push_prompt_scorer(name, prompt, options, judgment_api_key, organization_id)
+             push_prompt_scorer(
+                 name, prompt, threshold, options, judgment_api_key, organization_id
+             )
          judgeval_logger.info(f"Successfully created PromptScorer: {name}")
          return cls(
              name=name,
              prompt=prompt,
+             threshold=threshold,
              options=options,
              judgment_api_key=judgment_api_key,
              organization_id=organization_id,
@@ -158,6 +164,12 @@ class PromptScorer(APIScorerConfig):
          judgeval_logger.info(f"Successfully appended to prompt for {self.name}")

      # Getters
+     def get_threshold(self) -> float | None:
+         """
+         Returns the threshold of the scorer.
+         """
+         return self.threshold
+
      def get_prompt(self) -> str | None:
          """
          Returns the prompt of the scorer.
@@ -183,6 +195,7 @@ class PromptScorer(APIScorerConfig):
          return {
              "name": self.name,
              "prompt": self.prompt,
+             "threshold": self.threshold,
              "options": self.options,
          }

@@ -193,13 +206,14 @@ class PromptScorer(APIScorerConfig):
          push_prompt_scorer(
              self.name,
              self.prompt,
+             self.threshold,
              self.options,
              self.judgment_api_key,
              self.organization_id,
          )

      def __str__(self):
-         return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+         return f"PromptScorer(name={self.name}, prompt={self.prompt}, threshold={self.threshold}, options={self.options})"

      def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
          base = super().model_dump(*args, **kwargs)
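A minimal sketch of registering a prompt scorer with the new threshold argument. The module path in the import and the scorer name, prompt, and options are assumptions; push_prompt_scorer's parameter order follows the signature shown above.

    import os
    from judgeval.scorers.prompt_scorer import push_prompt_scorer  # import path assumed

    scorer_id = push_prompt_scorer(
        name="helpfulness-check",                    # hypothetical scorer name
        prompt="Rate how helpful the response is.",  # hypothetical judge prompt
        threshold=0.8,                               # new in 0.6.0, passed through to save_scorer
        options={"helpful": 1.0, "unhelpful": 0.0},  # optional label-to-score mapping
        judgment_api_key=os.getenv("JUDGMENT_API_KEY", ""),
        organization_id=os.getenv("JUDGMENT_ORG_ID", ""),
    )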
judgeval/scorers/score.py CHANGED
@@ -17,6 +17,7 @@ from judgeval.scorers import BaseScorer
  from judgeval.scorers.utils import clone_scorers
  from judgeval.common.logger import judgeval_logger
  from judgeval.judges import JudgevalJudge
+ from judgeval.constants import DEFAULT_GPT_MODEL


  async def safe_a_score_example(
@@ -55,10 +56,11 @@ async def safe_a_score_example(
  async def a_execute_scoring(
      examples: List[Example],
      scorers: List[BaseScorer],
-     model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
+     model: Optional[Union[str, List[str], JudgevalJudge]] = DEFAULT_GPT_MODEL,
      ignore_errors: bool = False,
      throttle_value: int = 0,
      max_concurrent: int = 100,
+     show_progress: bool = True,
  ) -> List[ScoringResult]:
      """
      Executes evaluations of `Example`s asynchronously using one or more `BaseScorer`s.
@@ -71,8 +73,7 @@ async def a_execute_scoring(
          ignore_errors (bool): Whether to ignore errors during evaluation.
          throttle_value (int): The amount of time to wait between starting each task.
          max_concurrent (int): The maximum number of concurrent tasks.
-
-         _use_bar_indicator (bool): Whether to use a progress bar indicator.
+         show_progress (bool): Whether to show the progress bar indicator.

      Returns:
          List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
@@ -101,16 +102,37 @@ async def a_execute_scoring(
      tasks = []
      cloned_scorers: List[BaseScorer]

-     with tqdm_asyncio(
-         desc=f"Evaluating {len(examples)} example(s) in parallel",
-         unit="Example",
-         total=len(examples),
-         bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
-     ) as pbar:
+     if show_progress:
+         with tqdm_asyncio(
+             desc=f"Evaluating {len(examples)} example(s) in parallel",
+             unit="Example",
+             total=len(examples),
+             bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
+         ) as pbar:
+             for i, ex in enumerate(examples):
+                 if isinstance(ex, Example):
+                     if len(scorers) == 0:
+                         pbar.update(1)
+                         continue
+
+                     cloned_scorers = clone_scorers(scorers)
+                     task = execute_with_semaphore(
+                         func=a_eval_examples_helper,
+                         scorers=cloned_scorers,
+                         example=ex,
+                         scoring_results=scoring_results,
+                         score_index=i,
+                         ignore_errors=ignore_errors,
+                         pbar=pbar,
+                     )
+                     tasks.append(asyncio.create_task(task))
+
+                 await asyncio.sleep(throttle_value)
+             await asyncio.gather(*tasks)
+     else:
          for i, ex in enumerate(examples):
              if isinstance(ex, Example):
                  if len(scorers) == 0:
-                     pbar.update(1)
                      continue

                  cloned_scorers = clone_scorers(scorers)
@@ -121,7 +143,7 @@ async def a_execute_scoring(
                      scoring_results=scoring_results,
                      score_index=i,
                      ignore_errors=ignore_errors,
-                     pbar=pbar,
+                     pbar=None,
                  )
                  tasks.append(asyncio.create_task(task))

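A minimal usage sketch of the new show_progress flag (the example and scorer lists are hypothetical; the module path follows the judgeval/scorers/score.py header below):

    import asyncio
    from judgeval.scorers.score import a_execute_scoring

    results = asyncio.run(
        a_execute_scoring(
            examples,              # hypothetical List[Example]
            scorers,               # hypothetical List[BaseScorer] of local scorers
            show_progress=False,   # new in 0.6.0: suppress the tqdm progress bar
        )
    )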
@@ -0,0 +1,36 @@
+ """Async utilities for judgeval."""
+
+ import asyncio
+ import concurrent.futures
+ from typing import Awaitable, TypeVar
+
+
+ # Generic type variable for coroutine return type
+ T = TypeVar("T")
+
+
+ def safe_run_async(coro: Awaitable[T]) -> T: # type: ignore[type-var]
+     """Safely execute an async *coro* from synchronous code.
+
+     This helper handles two common situations:
+
+     1. **No running event loop** – Simply delegates to ``asyncio.run``.
+     2. **Existing running loop** – Executes the coroutine in a separate
+        thread so that we don't attempt to nest event loops (which would raise
+        ``RuntimeError``).
+
+     Args:
+         coro: The coroutine to execute.
+
+     Returns:
+         The result returned by *coro*.
+     """
+
+     try:
+         asyncio.get_running_loop()
+     except RuntimeError:
+         return asyncio.run(coro)
+
+     with concurrent.futures.ThreadPoolExecutor() as executor:
+         future = executor.submit(lambda: asyncio.run(coro))
+         return future.result()
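A minimal usage sketch for the new helper; the module path is not shown in the diff, so the import below is an assumption:

    from judgeval.utils.async_utils import safe_run_async  # import path assumed

    async def fetch_score() -> float:
        return 0.92

    # With no event loop running, this simply delegates to asyncio.run(...).
    print(safe_run_async(fetch_score()))

    # Called from code that already sits inside a running loop (for example a
    # Jupyter cell), the same call runs asyncio.run(...) on a worker thread
    # instead of trying to nest event loops.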
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.4.0
+ Version: 0.6.0
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -10,27 +10,26 @@ License-File: LICENSE.md
  Classifier: Operating System :: OS Independent
  Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.11
- Requires-Dist: anthropic
  Requires-Dist: boto3
- Requires-Dist: datamodel-code-generator>=0.31.1
- Requires-Dist: google-genai
- Requires-Dist: groq>=0.30.0
+ Requires-Dist: click<8.2.0
  Requires-Dist: langchain-anthropic
  Requires-Dist: langchain-core
  Requires-Dist: langchain-huggingface
  Requires-Dist: langchain-openai
  Requires-Dist: litellm>=1.61.15
- Requires-Dist: matplotlib>=3.10.3
- Requires-Dist: nest-asyncio
- Requires-Dist: openai
+ Requires-Dist: nest-asyncio>=1.6.0
  Requires-Dist: opentelemetry-api>=1.34.1
  Requires-Dist: opentelemetry-sdk>=1.34.1
  Requires-Dist: orjson>=3.9.0
- Requires-Dist: pandas
- Requires-Dist: python-dotenv==1.0.1
- Requires-Dist: python-slugify>=8.0.4
+ Requires-Dist: python-dotenv
  Requires-Dist: requests
- Requires-Dist: together
+ Requires-Dist: rich
+ Requires-Dist: typer>=0.9.0
+ Provides-Extra: langchain
+ Requires-Dist: langchain-anthropic; extra == 'langchain'
+ Requires-Dist: langchain-core; extra == 'langchain'
+ Requires-Dist: langchain-huggingface; extra == 'langchain'
+ Requires-Dist: langchain-openai; extra == 'langchain'
  Description-Content-Type: text/markdown

  <div align="center">