judgeval 0.3.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -0
- judgeval/clients.py +2 -1
- judgeval/common/api/api.py +4 -18
- judgeval/common/api/constants.py +1 -1
- judgeval/common/api/json_encoder.py +242 -0
- judgeval/common/tracer/core.py +498 -215
- judgeval/common/tracer/providers.py +119 -0
- judgeval/common/tracer/span_transformer.py +14 -25
- judgeval/constants.py +1 -0
- judgeval/data/judgment_types.py +2 -1
- judgeval/data/trace.py +5 -122
- judgeval/data/trace_run.py +2 -1
- judgeval/dataset.py +2 -0
- judgeval/evaluation_run.py +6 -2
- judgeval/judges/litellm_judge.py +2 -1
- judgeval/judges/mixture_of_judges.py +2 -1
- judgeval/judges/utils.py +2 -1
- judgeval/judgment_client.py +11 -6
- judgeval/local_eval_queue.py +192 -0
- judgeval/run_evaluation.py +11 -6
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +18 -19
- judgeval/scorers/score.py +34 -11
- judgeval/utils/async_utils.py +36 -0
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/METADATA +9 -12
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/RECORD +27 -23
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/WHEEL +0 -0
- {judgeval-0.3.2.dist-info → judgeval-0.5.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,192 @@
|
|
1
|
+
"""Local evaluation queue for batching custom scorer evaluations.
|
2
|
+
|
3
|
+
This module provides a simple in-memory queue for EvaluationRun objects that contain
|
4
|
+
only local (BaseScorer) scorers. Useful for batching evaluations and processing them
|
5
|
+
either synchronously or in a background thread.
|
6
|
+
"""
|
7
|
+
|
8
|
+
import queue
|
9
|
+
import threading
|
10
|
+
from typing import Callable, List, Optional
|
11
|
+
import time
|
12
|
+
|
13
|
+
from judgeval.common.logger import judgeval_logger
|
14
|
+
from judgeval.constants import MAX_CONCURRENT_EVALUATIONS
|
15
|
+
from judgeval.data import ScoringResult
|
16
|
+
from judgeval.evaluation_run import EvaluationRun
|
17
|
+
from judgeval.utils.async_utils import safe_run_async
|
18
|
+
from judgeval.scorers import BaseScorer
|
19
|
+
from judgeval.scorers.score import a_execute_scoring
|
20
|
+
|
21
|
+
|
22
|
+
class LocalEvaluationQueue:
    """Lightweight in-memory queue for local evaluation runs.

    Only supports EvaluationRuns with local scorers (BaseScorer instances).
    API scorers (APIScorerConfig) are not supported as they have their own queue.

    Runs are stored in a ``queue.Queue``. ``None`` is reserved as an internal
    shutdown sentinel that ``stop_workers`` puts on the queue to wake idle
    workers; it is never handed to ``_process_run`` or to a callback.
    """

    def __init__(
        self, max_concurrent: int = MAX_CONCURRENT_EVALUATIONS, num_workers: int = 4
    ):
        """Create an empty queue.

        Args:
            max_concurrent: Total concurrency budget; it is divided evenly
                between the workers when a run is scored.
            num_workers: Number of background threads started by
                ``start_workers``.

        Raises:
            ValueError: If ``num_workers`` is not a positive integer.
        """
        if num_workers <= 0:
            raise ValueError("num_workers must be a positive integer.")
        self._queue: queue.Queue[Optional[EvaluationRun]] = queue.Queue()
        self._max_concurrent = max_concurrent
        self._num_workers = num_workers  # Number of worker threads
        self._worker_threads: List[threading.Thread] = []
        # Stop signal for the *current* generation of workers. It is replaced
        # (not cleared) by stop_workers so old workers never see it reset.
        self._shutdown_event = threading.Event()

    def enqueue(self, evaluation_run: EvaluationRun) -> None:
        """Add evaluation run to the queue."""
        self._queue.put(evaluation_run)

    def _process_run(self, evaluation_run: EvaluationRun) -> List[ScoringResult]:
        """Execute evaluation run locally and return results.

        Raises:
            ValueError: If the run contains no BaseScorer instances.
        """
        local_scorers = [s for s in evaluation_run.scorers if isinstance(s, BaseScorer)]

        if not local_scorers:
            raise ValueError(
                "LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
                "Found only APIScorerConfig instances."
            )

        # Integer division yields 0 when max_concurrent < num_workers; clamp to
        # 1 so every run is allowed at least one concurrent scoring task.
        # NOTE(review): a limit of 0 presumably admits no task at all - confirm
        # against a_execute_scoring.
        per_worker_limit = max(1, self._max_concurrent // self._num_workers)

        return safe_run_async(
            a_execute_scoring(
                evaluation_run.examples,
                local_scorers,
                model=evaluation_run.model,
                throttle_value=0,
                max_concurrent=per_worker_limit,
                show_progress=False,
            )
        )

    def run_all(
        self,
        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
    ) -> None:
        """Process all queued runs synchronously.

        Stops when the queue is empty or when a shutdown sentinel is found.
        Exceptions raised by scoring or by ``callback`` propagate to the
        caller; the failing run is still marked as done so that
        ``wait_for_completion`` does not hang on it.

        Args:
            callback: Optional function called after each run with (run, results).
        """
        while True:
            try:
                # Non-blocking: a background worker may take the last item
                # between an emptiness check and a blocking get().
                run = self._queue.get_nowait()
            except queue.Empty:
                return

            try:
                if run is None:  # Sentinel for worker shutdown
                    # Leave it in place for the workers it was meant for.
                    self._queue.put(None)
                    return
                results = self._process_run(run)
                if callback:
                    callback(run, results)
            finally:
                # Balance every successful get(), including the sentinel and
                # runs that raised, to keep the unfinished-task count accurate.
                self._queue.task_done()

    def start_workers(
        self,
        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
    ) -> List[threading.Thread]:
        """Start multiple background threads to process runs in parallel.

        Args:
            callback: Optional function called after each run with (run, results).

        Returns:
            List of started worker threads (a snapshot; later calls to
            ``stop_workers`` do not modify the returned list).
        """
        # Bind the event for this generation of workers. stop_workers swaps in
        # a fresh event afterwards, so a worker that outlives the join timeout
        # still observes its own (set) event and exits after its current run.
        stop_event = self._shutdown_event

        def _worker(worker_id: int) -> None:
            while not stop_event.is_set():
                try:
                    # Use timeout so workers can check shutdown event periodically
                    run = self._queue.get(timeout=1.0)
                except queue.Empty:
                    # Timeout - check shutdown event and continue
                    continue

                if run is None:  # Sentinel to stop worker
                    # Put sentinel back for other workers
                    self._queue.put(None)
                    self._queue.task_done()
                    break

                try:
                    results = self._process_run(run)
                    if callback:
                        callback(run, results)
                except Exception as exc:
                    judgeval_logger.error(
                        f"Worker {worker_id} error processing {run.eval_name}: {exc}"
                    )
                    # Continue processing other runs instead of shutting down all workers
                finally:
                    self._queue.task_done()

        # Start worker threads
        for i in range(self._num_workers):
            thread = threading.Thread(target=_worker, args=(i,), daemon=True)
            thread.start()
            self._worker_threads.append(thread)

        return list(self._worker_threads)

    def start_worker(
        self,
        callback: Optional[Callable[[EvaluationRun, List[ScoringResult]], None]] = None,
    ) -> Optional[threading.Thread]:
        """Start a single background thread to process runs (backward compatibility).

        Delegates to ``start_workers``, so ``num_workers`` threads are started.

        Args:
            callback: Optional function called after each run with (run, results).

        Returns:
            The first worker thread, or None if no threads were started.
        """
        threads = self.start_workers(callback)
        return threads[0] if threads else None

    def wait_for_completion(self, timeout: Optional[float] = None) -> bool:
        """Wait for all queued tasks to complete.

        Nothing is processed by this call itself; something else (background
        workers or ``run_all``) must be consuming the queue.

        Args:
            timeout: Maximum time to wait in seconds. None means wait indefinitely.

        Returns:
            True if all tasks completed, False if timeout occurred.
        """
        if timeout is None:
            self._queue.join()
            return True

        # Monotonic clock: immune to wall-clock adjustments while waiting.
        deadline = time.monotonic() + timeout
        while not self._queue.empty() or self._queue.unfinished_tasks > 0:
            if time.monotonic() > deadline:
                return False
            time.sleep(0.1)
        return True

    def _discard_sentinels(self) -> None:
        """Remove leftover shutdown sentinels while keeping pending runs.

        Pending runs are re-queued before any ``task_done`` call so the
        unfinished-task count never drops below its true value mid-purge.
        """
        drained: List[Optional[EvaluationRun]] = []
        while True:
            try:
                drained.append(self._queue.get_nowait())
            except queue.Empty:
                break

        for item in drained:
            if item is not None:
                self._queue.put(item)

        # One task_done per get(); net effect is -1 per sentinel, 0 per run.
        for _ in drained:
            self._queue.task_done()

    def stop_workers(self) -> None:
        """Signal all background workers to stop after current tasks complete.

        Runs still waiting in the queue are left there. Afterwards the queue
        holds no sentinels, so workers can be started again and ``run_all`` /
        ``wait_for_completion`` behave normally.
        """
        if not self._worker_threads:
            return

        # Signal shutdown
        self._shutdown_event.set()

        # Send sentinel to wake up any blocking workers
        for _ in range(len(self._worker_threads)):
            self._queue.put(None)

        # Wait for all workers to finish with timeout
        for thread in self._worker_threads:
            if thread.is_alive():
                thread.join(timeout=5.0)
                if thread.is_alive():
                    judgeval_logger.warning(
                        f"Worker thread {thread.name} did not shut down gracefully"
                    )

        self._worker_threads.clear()
        # Give the next generation a fresh event instead of clearing the old
        # one: a worker still busy past the join timeout keeps seeing its own
        # event as set and exits once its current run finishes.
        self._shutdown_event = threading.Event()
        # Workers re-queue the sentinel they consume, so sentinels are still
        # in the queue here; drop them so they cannot stop future consumers.
        self._discard_sentinels()
|
judgeval/run_evaluation.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
1
3
|
import asyncio
|
2
4
|
import concurrent.futures
|
3
5
|
import time
|
4
6
|
import orjson
|
5
7
|
import sys
|
6
8
|
import threading
|
7
|
-
from typing import List, Dict, Union, Optional, Callable, Tuple, Any
|
9
|
+
from typing import List, Dict, Union, Optional, Callable, Tuple, Any, TYPE_CHECKING
|
8
10
|
from rich import print as rprint
|
9
11
|
|
10
12
|
from judgeval.data import ScorerData, ScoringResult, Example, Trace
|
@@ -17,10 +19,13 @@ from judgeval.constants import (
|
|
17
19
|
from judgeval.common.exceptions import JudgmentAPIError
|
18
20
|
from judgeval.common.api.api import JudgmentAPIException
|
19
21
|
from judgeval.common.logger import judgeval_logger
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
from judgeval.
|
22
|
+
|
23
|
+
|
24
|
+
if TYPE_CHECKING:
|
25
|
+
from judgeval.common.tracer import Tracer
|
26
|
+
from judgeval.data.trace_run import TraceRun
|
27
|
+
from judgeval.evaluation_run import EvaluationRun
|
28
|
+
from judgeval.integrations.langgraph import JudgevalCallbackHandler
|
24
29
|
|
25
30
|
|
26
31
|
def safe_run_async(coro):
|
@@ -282,7 +287,7 @@ def run_trace_eval(
|
|
282
287
|
judgment_api_key: str,
|
283
288
|
override: bool = False,
|
284
289
|
function: Optional[Callable] = None,
|
285
|
-
tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
|
290
|
+
tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
|
286
291
|
examples: Optional[List[Example]] = None,
|
287
292
|
) -> List[ScoringResult]:
|
288
293
|
# Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
|
@@ -1,27 +1,29 @@
|
|
1
1
|
from judgeval.scorers.api_scorer import APIScorerConfig
|
2
2
|
from judgeval.constants import APIScorerType
|
3
|
-
from typing import
|
3
|
+
from typing import Dict, Any, Optional
|
4
4
|
from judgeval.common.api import JudgmentApiClient, JudgmentAPIException
|
5
5
|
import os
|
6
6
|
from judgeval.common.exceptions import JudgmentAPIError
|
7
|
+
from copy import copy
|
8
|
+
from judgeval.common.logger import judgeval_logger
|
7
9
|
|
8
10
|
|
9
11
|
def push_prompt_scorer(
|
10
12
|
name: str,
|
11
13
|
prompt: str,
|
12
|
-
options:
|
14
|
+
options: Optional[Dict[str, float]] = None,
|
13
15
|
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
|
14
16
|
organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
|
15
17
|
) -> str:
|
16
18
|
client = JudgmentApiClient(judgment_api_key, organization_id)
|
17
19
|
try:
|
18
|
-
r = client.save_scorer(name, prompt,
|
20
|
+
r = client.save_scorer(name, prompt, options)
|
19
21
|
except JudgmentAPIException as e:
|
20
22
|
if e.status_code == 500:
|
21
23
|
raise JudgmentAPIError(
|
22
24
|
f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.error_detail}"
|
23
25
|
)
|
24
|
-
raise JudgmentAPIError(f"Failed to save
|
26
|
+
raise JudgmentAPIError(f"Failed to save prompt scorer: {e.error_detail}")
|
25
27
|
return r["name"]
|
26
28
|
|
27
29
|
|
@@ -32,7 +34,7 @@ def fetch_prompt_scorer(
|
|
32
34
|
):
|
33
35
|
client = JudgmentApiClient(judgment_api_key, organization_id)
|
34
36
|
try:
|
35
|
-
scorer_config = client.fetch_scorer(name)
|
37
|
+
scorer_config = client.fetch_scorer(name)["scorer"]
|
36
38
|
scorer_config.pop("created_at")
|
37
39
|
scorer_config.pop("updated_at")
|
38
40
|
return scorer_config
|
@@ -42,7 +44,7 @@ def fetch_prompt_scorer(
|
|
42
44
|
f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.error_detail}"
|
43
45
|
)
|
44
46
|
raise JudgmentAPIError(
|
45
|
-
f"Failed to fetch
|
47
|
+
f"Failed to fetch prompt scorer '{name}': {e.error_detail}"
|
46
48
|
)
|
47
49
|
|
48
50
|
|
@@ -72,7 +74,7 @@ class PromptScorer(APIScorerConfig):
|
|
72
74
|
"""
|
73
75
|
|
74
76
|
prompt: str
|
75
|
-
options:
|
77
|
+
options: Optional[Dict[str, float]] = None
|
76
78
|
score_type: APIScorerType = APIScorerType.PROMPT_SCORER
|
77
79
|
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
|
78
80
|
organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
|
@@ -88,7 +90,7 @@ class PromptScorer(APIScorerConfig):
|
|
88
90
|
return cls(
|
89
91
|
name=name,
|
90
92
|
prompt=scorer_config["prompt"],
|
91
|
-
options=scorer_config
|
93
|
+
options=scorer_config.get("options"),
|
92
94
|
judgment_api_key=judgment_api_key,
|
93
95
|
organization_id=organization_id,
|
94
96
|
)
|
@@ -98,12 +100,13 @@ class PromptScorer(APIScorerConfig):
|
|
98
100
|
cls,
|
99
101
|
name: str,
|
100
102
|
prompt: str,
|
101
|
-
options:
|
103
|
+
options: Optional[Dict[str, float]] = None,
|
102
104
|
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
|
103
105
|
organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
|
104
106
|
):
|
105
107
|
if not scorer_exists(name, judgment_api_key, organization_id):
|
106
108
|
push_prompt_scorer(name, prompt, options, judgment_api_key, organization_id)
|
109
|
+
judgeval_logger.info(f"Successfully created PromptScorer: {name}")
|
107
110
|
return cls(
|
108
111
|
name=name,
|
109
112
|
prompt=prompt,
|
@@ -117,13 +120,6 @@ class PromptScorer(APIScorerConfig):
|
|
117
120
|
)
|
118
121
|
|
119
122
|
# Setter functions. Each setter function pushes the scorer to the DB.
|
120
|
-
def set_name(self, name: str):
|
121
|
-
"""
|
122
|
-
Updates the name of the scorer.
|
123
|
-
"""
|
124
|
-
self.name = name
|
125
|
-
self.push_prompt_scorer()
|
126
|
-
|
127
123
|
def set_threshold(self, threshold: float):
|
128
124
|
"""
|
129
125
|
Updates the threshold of the scorer.
|
@@ -140,8 +136,9 @@ class PromptScorer(APIScorerConfig):
|
|
140
136
|
"""
|
141
137
|
self.prompt = prompt
|
142
138
|
self.push_prompt_scorer()
|
139
|
+
judgeval_logger.info(f"Successfully updated prompt for {self.name}")
|
143
140
|
|
144
|
-
def set_options(self, options:
|
141
|
+
def set_options(self, options: Dict[str, float]):
|
145
142
|
"""
|
146
143
|
Updates the options with the new options.
|
147
144
|
|
@@ -150,6 +147,7 @@ class PromptScorer(APIScorerConfig):
|
|
150
147
|
"""
|
151
148
|
self.options = options
|
152
149
|
self.push_prompt_scorer()
|
150
|
+
judgeval_logger.info(f"Successfully updated options for {self.name}")
|
153
151
|
|
154
152
|
def append_to_prompt(self, prompt_addition: str):
|
155
153
|
"""
|
@@ -157,6 +155,7 @@ class PromptScorer(APIScorerConfig):
|
|
157
155
|
"""
|
158
156
|
self.prompt += prompt_addition
|
159
157
|
self.push_prompt_scorer()
|
158
|
+
judgeval_logger.info(f"Successfully appended to prompt for {self.name}")
|
160
159
|
|
161
160
|
# Getters
|
162
161
|
def get_prompt(self) -> str | None:
|
@@ -165,11 +164,11 @@ class PromptScorer(APIScorerConfig):
|
|
165
164
|
"""
|
166
165
|
return self.prompt
|
167
166
|
|
168
|
-
def get_options(self) ->
|
167
|
+
def get_options(self) -> Dict[str, float] | None:
|
169
168
|
"""
|
170
169
|
Returns the options of the scorer.
|
171
170
|
"""
|
172
|
-
return self.options
|
171
|
+
return copy(self.options) if self.options is not None else None
|
173
172
|
|
174
173
|
def get_name(self) -> str | None:
|
175
174
|
"""
|
judgeval/scorers/score.py
CHANGED
@@ -17,6 +17,7 @@ from judgeval.scorers import BaseScorer
|
|
17
17
|
from judgeval.scorers.utils import clone_scorers
|
18
18
|
from judgeval.common.logger import judgeval_logger
|
19
19
|
from judgeval.judges import JudgevalJudge
|
20
|
+
from judgeval.constants import DEFAULT_GPT_MODEL
|
20
21
|
|
21
22
|
|
22
23
|
async def safe_a_score_example(
|
@@ -48,16 +49,18 @@ async def safe_a_score_example(
|
|
48
49
|
judgeval_logger.error(f"Error during scoring: {str(e)}")
|
49
50
|
scorer.error = str(e)
|
50
51
|
scorer.success = False
|
52
|
+
scorer.score = 0
|
51
53
|
return
|
52
54
|
|
53
55
|
|
54
56
|
async def a_execute_scoring(
|
55
57
|
examples: List[Example],
|
56
58
|
scorers: List[BaseScorer],
|
57
|
-
model: Optional[Union[str, List[str], JudgevalJudge]] =
|
59
|
+
model: Optional[Union[str, List[str], JudgevalJudge]] = DEFAULT_GPT_MODEL,
|
58
60
|
ignore_errors: bool = False,
|
59
61
|
throttle_value: int = 0,
|
60
62
|
max_concurrent: int = 100,
|
63
|
+
show_progress: bool = True,
|
61
64
|
) -> List[ScoringResult]:
|
62
65
|
"""
|
63
66
|
Executes evaluations of `Example`s asynchronously using one or more `BaseScorer`s.
|
@@ -70,8 +73,7 @@ async def a_execute_scoring(
|
|
70
73
|
ignore_errors (bool): Whether to ignore errors during evaluation.
|
71
74
|
throttle_value (int): The amount of time to wait between starting each task.
|
72
75
|
max_concurrent (int): The maximum number of concurrent tasks.
|
73
|
-
|
74
|
-
_use_bar_indicator (bool): Whether to use a progress bar indicator.
|
76
|
+
show_progress (bool): Whether to show the progress bar indicator.
|
75
77
|
|
76
78
|
Returns:
|
77
79
|
List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
|
@@ -100,16 +102,37 @@ async def a_execute_scoring(
|
|
100
102
|
tasks = []
|
101
103
|
cloned_scorers: List[BaseScorer]
|
102
104
|
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
105
|
+
if show_progress:
|
106
|
+
with tqdm_asyncio(
|
107
|
+
desc=f"Evaluating {len(examples)} example(s) in parallel",
|
108
|
+
unit="Example",
|
109
|
+
total=len(examples),
|
110
|
+
bar_format="{desc}: |{bar}|{percentage:3.0f}% ({n_fmt}/{total_fmt}) [Time Taken: {elapsed}, {rate_fmt}{postfix}]",
|
111
|
+
) as pbar:
|
112
|
+
for i, ex in enumerate(examples):
|
113
|
+
if isinstance(ex, Example):
|
114
|
+
if len(scorers) == 0:
|
115
|
+
pbar.update(1)
|
116
|
+
continue
|
117
|
+
|
118
|
+
cloned_scorers = clone_scorers(scorers)
|
119
|
+
task = execute_with_semaphore(
|
120
|
+
func=a_eval_examples_helper,
|
121
|
+
scorers=cloned_scorers,
|
122
|
+
example=ex,
|
123
|
+
scoring_results=scoring_results,
|
124
|
+
score_index=i,
|
125
|
+
ignore_errors=ignore_errors,
|
126
|
+
pbar=pbar,
|
127
|
+
)
|
128
|
+
tasks.append(asyncio.create_task(task))
|
129
|
+
|
130
|
+
await asyncio.sleep(throttle_value)
|
131
|
+
await asyncio.gather(*tasks)
|
132
|
+
else:
|
109
133
|
for i, ex in enumerate(examples):
|
110
134
|
if isinstance(ex, Example):
|
111
135
|
if len(scorers) == 0:
|
112
|
-
pbar.update(1)
|
113
136
|
continue
|
114
137
|
|
115
138
|
cloned_scorers = clone_scorers(scorers)
|
@@ -120,7 +143,7 @@ async def a_execute_scoring(
|
|
120
143
|
scoring_results=scoring_results,
|
121
144
|
score_index=i,
|
122
145
|
ignore_errors=ignore_errors,
|
123
|
-
pbar=
|
146
|
+
pbar=None,
|
124
147
|
)
|
125
148
|
tasks.append(asyncio.create_task(task))
|
126
149
|
|
@@ -0,0 +1,36 @@
|
|
1
|
+
"""Async utilities for judgeval."""
|
2
|
+
|
3
|
+
import asyncio
|
4
|
+
import concurrent.futures
|
5
|
+
from typing import Awaitable, TypeVar
|
6
|
+
|
7
|
+
|
8
|
+
# Type of the value the awaited coroutine eventually produces
T = TypeVar("T")


def safe_run_async(coro: Awaitable[T]) -> T:  # type: ignore[type-var]
    """Run *coro* to completion from synchronous code and return its result.

    Two cases are covered:

    1. **No event loop is running in this thread** - the coroutine is handed
       straight to ``asyncio.run``.
    2. **An event loop is already running in this thread** - the coroutine is
       run via ``asyncio.run`` on a helper thread, because calling
       ``asyncio.run`` under a running loop raises ``RuntimeError``. The
       calling thread blocks until the helper thread finishes.

    Args:
        coro: The coroutine to execute.

    Returns:
        The result returned by *coro*.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # get_running_loop() raises when this thread has no running loop.
        return asyncio.run(coro)
    else:
        pool = concurrent.futures.ThreadPoolExecutor()
        try:
            pending = pool.submit(asyncio.run, coro)
            return pending.result()
        finally:
            # Same cleanup the executor's context manager performs on exit.
            pool.shutdown(wait=True)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: judgeval
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.5.0
|
4
4
|
Summary: Judgeval Package
|
5
5
|
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
|
6
6
|
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
|
@@ -10,27 +10,24 @@ License-File: LICENSE.md
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
11
11
|
Classifier: Programming Language :: Python :: 3
|
12
12
|
Requires-Python: >=3.11
|
13
|
-
Requires-Dist: anthropic
|
14
13
|
Requires-Dist: boto3
|
15
|
-
Requires-Dist: datamodel-code-generator>=0.31.1
|
16
|
-
Requires-Dist: google-genai
|
17
|
-
Requires-Dist: groq>=0.30.0
|
18
14
|
Requires-Dist: langchain-anthropic
|
19
15
|
Requires-Dist: langchain-core
|
20
16
|
Requires-Dist: langchain-huggingface
|
21
17
|
Requires-Dist: langchain-openai
|
22
18
|
Requires-Dist: litellm>=1.61.15
|
23
|
-
Requires-Dist:
|
24
|
-
Requires-Dist: nest-asyncio
|
25
|
-
Requires-Dist: openai
|
19
|
+
Requires-Dist: nest-asyncio>=1.6.0
|
26
20
|
Requires-Dist: opentelemetry-api>=1.34.1
|
27
21
|
Requires-Dist: opentelemetry-sdk>=1.34.1
|
28
22
|
Requires-Dist: orjson>=3.9.0
|
29
|
-
Requires-Dist:
|
30
|
-
Requires-Dist: python-dotenv==1.0.1
|
31
|
-
Requires-Dist: python-slugify>=8.0.4
|
23
|
+
Requires-Dist: python-dotenv
|
32
24
|
Requires-Dist: requests
|
33
|
-
Requires-Dist:
|
25
|
+
Requires-Dist: rich
|
26
|
+
Provides-Extra: langchain
|
27
|
+
Requires-Dist: langchain-anthropic; extra == 'langchain'
|
28
|
+
Requires-Dist: langchain-core; extra == 'langchain'
|
29
|
+
Requires-Dist: langchain-huggingface; extra == 'langchain'
|
30
|
+
Requires-Dist: langchain-openai; extra == 'langchain'
|
34
31
|
Description-Content-Type: text/markdown
|
35
32
|
|
36
33
|
<div align="center">
|
@@ -1,53 +1,56 @@
|
|
1
|
-
judgeval/__init__.py,sha256=
|
2
|
-
judgeval/clients.py,sha256=
|
3
|
-
judgeval/constants.py,sha256=
|
4
|
-
judgeval/dataset.py,sha256=
|
5
|
-
judgeval/evaluation_run.py,sha256=
|
6
|
-
judgeval/judgment_client.py,sha256=
|
1
|
+
judgeval/__init__.py,sha256=5Lm1JMYFREJGN_8X-Wpruu_ovwGLJ08gCzNAt-u-pQE,419
|
2
|
+
judgeval/clients.py,sha256=HHul68PV1om0dxsVZZu90TtCiy5zaqAwph16jXTQzQo,989
|
3
|
+
judgeval/constants.py,sha256=UNoTLHgbpZHRInPM2ZaI3m0XokPkee5ILlg20reqhzo,4180
|
4
|
+
judgeval/dataset.py,sha256=vOrDKam2I-K1WcVF5IBkQruCDvXTc8PRaFm4-dV0lXs,6220
|
5
|
+
judgeval/evaluation_run.py,sha256=FJpnc1sGncmAOAnEUO0n2vNXjlycljGqBdV99qPT5og,3087
|
6
|
+
judgeval/judgment_client.py,sha256=tGhENRb2YVIe2WUlcssC8DuEijeUC7Ajj_rh_Dh7bzA,11878
|
7
|
+
judgeval/local_eval_queue.py,sha256=fAI0_OlvCr-WOCQWw18C4JIRJHKYzlyGzsGUm8LcsYE,7076
|
7
8
|
judgeval/rules.py,sha256=CoQjqmP8daEXewMkplmA-7urubDtweOr5O6z8klVwLI,20031
|
8
|
-
judgeval/run_evaluation.py,sha256=
|
9
|
+
judgeval/run_evaluation.py,sha256=4kcaw3R_akhxqutGFGTaBS2pqD-3d0ET7zMDL1_7HK4,27741
|
9
10
|
judgeval/version_check.py,sha256=FoLEtpCjDw2HuDQdpw5yT29UtwumSc6ZZN6AV_c9Mnw,1057
|
10
11
|
judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
|
11
12
|
judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
|
12
13
|
judgeval/common/logger.py,sha256=514eFLYWS_UL8VY-zAR2ePUlpQe4rbYlleLASFllLE4,1511
|
13
14
|
judgeval/common/utils.py,sha256=oxGDRVWOICKWeyGgsoc36_yAyHSYF4XtH842Mkznwis,34739
|
14
15
|
judgeval/common/api/__init__.py,sha256=-E7lpZz1fG8puR_aYUMfPmQ-Vyhd0bgzoaU5EhIuFjQ,114
|
15
|
-
judgeval/common/api/api.py,sha256=
|
16
|
-
judgeval/common/api/constants.py,sha256=
|
16
|
+
judgeval/common/api/api.py,sha256=uuLH6veC0LewfZ1IFiiUi5_OV7zTa7xTIK9LRlLoufc,13743
|
17
|
+
judgeval/common/api/constants.py,sha256=DXej0m8HEhb871SdiR8t_o4fzeMoQjHYqb_X0Plj8wY,4577
|
18
|
+
judgeval/common/api/json_encoder.py,sha256=XsScZe9hZP56yuxQ-3Ox6K8DcbjWxc2Yq7FcLF9qkUE,5852
|
17
19
|
judgeval/common/storage/__init__.py,sha256=a-PI7OL-ydyzugGUKmJKRBASnK-Q-gs82L9K9rSyJP8,90
|
18
20
|
judgeval/common/storage/s3_storage.py,sha256=0-bNKheqJJyBZ92KGrzQtd1zocIRWBlfn_58L4a-Ay0,3719
|
19
21
|
judgeval/common/tracer/__init__.py,sha256=tJCJsmVmrL89Phv88gNCJ-j0ITPez6lh8vhMAAlLNSc,795
|
20
22
|
judgeval/common/tracer/constants.py,sha256=yu5y8gMe5yb1AaBkPtAH-BNwIaAR3NwYCRoSf45wp5U,621
|
21
|
-
judgeval/common/tracer/core.py,sha256=
|
23
|
+
judgeval/common/tracer/core.py,sha256=rI7P0CaarP5FLQZmOGWpOJkjdf6WUgSds6i_QF04J3M,85071
|
22
24
|
judgeval/common/tracer/otel_exporter.py,sha256=kZLlOQ6afQE4dmb9H1wgU4P3H5PG1D_zKyvnpWcT5Ak,3899
|
23
25
|
judgeval/common/tracer/otel_span_processor.py,sha256=W7SM62KnxJ48vC9WllIHRKaLlvxkCwqYoT4KqZLfGNs,6497
|
26
|
+
judgeval/common/tracer/providers.py,sha256=3c3YOtKuoBjlTL0rc2HAGnUpppqvsyzrN5H6EKCqEi0,2733
|
24
27
|
judgeval/common/tracer/span_processor.py,sha256=eFjTgSWSkM6BWE94CrvgafDg_WkxLsFL_MafwBG-p9M,1145
|
25
|
-
judgeval/common/tracer/span_transformer.py,sha256=
|
28
|
+
judgeval/common/tracer/span_transformer.py,sha256=mUmfUYjEekUEOXAZMmH0WEF94ge05EBi5ftSc-T91zQ,7314
|
26
29
|
judgeval/common/tracer/trace_manager.py,sha256=ltiXcWC-68DRc8uSa28qHiWRSIBf6NpYOPkZYooR8tg,3086
|
27
30
|
judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
|
28
31
|
judgeval/data/example.py,sha256=kRskIgsjwcvv2Y8jaPwV-PND7zlmMbFsvRVQ_b7SZY0,914
|
29
|
-
judgeval/data/judgment_types.py,sha256=
|
32
|
+
judgeval/data/judgment_types.py,sha256=1DTpCnIdDM93Rozu9Dr812Q5K3lZfawMcWbPG2ofbxM,8407
|
30
33
|
judgeval/data/result.py,sha256=OtSnBUrdQpjyAqxXRLTW3wC9v9lOm_GqzL14ccRQxrg,2124
|
31
34
|
judgeval/data/scorer_data.py,sha256=5QBHtvOIWOq0Rn9_uPJzAMRYMlWxMB-rXnG_6kV4Z4Y,2955
|
32
35
|
judgeval/data/tool.py,sha256=iWQSdy5uNbIeACu3gQy1DC2oGYxRVYNfkkczWdQMAiA,99
|
33
|
-
judgeval/data/trace.py,sha256=
|
34
|
-
judgeval/data/trace_run.py,sha256=
|
36
|
+
judgeval/data/trace.py,sha256=LG-IZksynC1VgfUBuBfIIfR1DT9Bn-sY4vIj6Rc9K6Q,2791
|
37
|
+
judgeval/data/trace_run.py,sha256=ZCAzktgOSUPD0p1XQj8qGcF-DdsdQFNZM2dtY0aKGbE,1657
|
35
38
|
judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
|
36
39
|
judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
|
37
40
|
judgeval/integrations/langgraph.py,sha256=kJXLsgBY7DgsUTZyVQ47deDgHm887brFHfyIbuyerGw,29986
|
38
41
|
judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
|
39
42
|
judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
|
40
|
-
judgeval/judges/litellm_judge.py,sha256=
|
41
|
-
judgeval/judges/mixture_of_judges.py,sha256=
|
43
|
+
judgeval/judges/litellm_judge.py,sha256=K9yCGOmozt7sYO0u8CHWyZNi8mXnSR3pPkP8yVsvuRc,2561
|
44
|
+
judgeval/judges/mixture_of_judges.py,sha256=iTNjTX4Le1nCwGRm9qfMCv1lQjgqoIw3OE0teiLubwo,14946
|
42
45
|
judgeval/judges/together_judge.py,sha256=5FADUhs6-FN1ZVV_1D3-8_gu9mPbZiG0PYTpme41SfM,2336
|
43
|
-
judgeval/judges/utils.py,sha256=
|
46
|
+
judgeval/judges/utils.py,sha256=_t6oYN9q63wyP7D4jI8X0bNmvVw7OfaE7uMTYDVS14E,2782
|
44
47
|
judgeval/scorers/__init__.py,sha256=4H_cinTQ4EogZv59YEV-3U9EOTLppNwgAPTi1-jI9Fw,746
|
45
48
|
judgeval/scorers/agent_scorer.py,sha256=TjwD_YglSywr3EowEojiCyg5qDgCRa5LRGc5nFdmIBc,703
|
46
49
|
judgeval/scorers/api_scorer.py,sha256=xlhqkeMUBFxl8daSXOTWOYwZjBAz7o6b4sVD5f8cIHw,2523
|
47
50
|
judgeval/scorers/base_scorer.py,sha256=eDfQk8N8TQfM1ayJDWr0NTdSQxcbk9-VZHd0Igb9EbI,2878
|
48
51
|
judgeval/scorers/example_scorer.py,sha256=2n45y3LMV1Q-ARyXLHqvVWETlnY1DqS7OLzPu9IBGz8,716
|
49
52
|
judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
|
50
|
-
judgeval/scorers/score.py,sha256=
|
53
|
+
judgeval/scorers/score.py,sha256=SWyoqOOvyLpLy39tLyb_Q94sdh9r_IuDv6YNREw52lg,7546
|
51
54
|
judgeval/scorers/utils.py,sha256=HQOYTJtNnsi_aPfMssePAaBbXpAv7LXgwUlWlDFuN2g,3965
|
52
55
|
judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
53
56
|
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=GX4KkwPR2p-c0Y5mZingJa8EUfjAbMGhrmRBDBunOGw,1484
|
@@ -58,14 +61,15 @@ judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py,sha256=NABO_iBd
|
|
58
61
|
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=ps51bTgQsD9xGYsk1v9bx0WxQMqywSllCE9_xlJkLd8,531
|
59
62
|
judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=SnFLvU4FGsMeUVUp0SGHSy_6wgfwr_vHPGnZx5YJl_Q,691
|
60
63
|
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=aQzu-TiGqG74JDQ927evv5yGmnZw2AOolyHvlIhiUbI,683
|
61
|
-
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=
|
64
|
+
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=nx73DeoVkSqJTP1hYxMsJobG9HVWgMDN5-xFOXt_8Ts,7348
|
62
65
|
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=Mcp1CjMNyOax9UkvoRdSyUYdO2Os1-Nko43y89m2Luo,594
|
63
66
|
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=Z2FLGBC7m_CLx-CMgXVuTvYvN0vY5yOcWA0ImBkeBfY,787
|
64
67
|
judgeval/tracer/__init__.py,sha256=wkuXtOGDCrwgPPXlh_sSJmvGuWaAMHyNzk1TzB5f9aI,148
|
65
68
|
judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
|
69
|
+
judgeval/utils/async_utils.py,sha256=uNx1SopEc0quSjc8GBQqyba0SmCMAzv2NKIq6xYwttc,989
|
66
70
|
judgeval/utils/file_utils.py,sha256=PWHRs8dUr8iDwpglSSk4Yjd7C6ZhDzUaO-jV3m7riHM,1987
|
67
71
|
judgeval/utils/requests.py,sha256=K3gUKrwL6TvwYKVYO5OeLWdUHn9NiUPmnIXhZEiEaHU,1534
|
68
|
-
judgeval-0.
|
69
|
-
judgeval-0.
|
70
|
-
judgeval-0.
|
71
|
-
judgeval-0.
|
72
|
+
judgeval-0.5.0.dist-info/METADATA,sha256=wwnunL-UcNKbB7D5t-UnOM_x3DVghU2BBPAVxa0tNfo,10348
|
73
|
+
judgeval-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
74
|
+
judgeval-0.5.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
|
75
|
+
judgeval-0.5.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|