judgeval 0.6.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/cli.py +1 -1
- judgeval/common/api/constants.py +1 -1
- judgeval/common/tracer/core.py +171 -1
- judgeval/common/tracer/trace_manager.py +6 -1
- judgeval/common/trainer/__init__.py +5 -0
- judgeval/common/trainer/config.py +125 -0
- judgeval/common/trainer/console.py +151 -0
- judgeval/common/trainer/trainable_model.py +238 -0
- judgeval/common/trainer/trainer.py +301 -0
- judgeval/judgment_client.py +4 -104
- judgeval/run_evaluation.py +10 -107
- {judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/METADATA +8 -47
- {judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/RECORD +16 -11
- {judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/WHEEL +0 -0
- {judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/judgment_client.py
CHANGED
@@ -7,12 +7,11 @@ import os
 import importlib.util
 from pathlib import Path
 from uuid import uuid4
-from typing import Optional, List, Dict,
+from typing import Optional, List, Dict, Union
 
 from judgeval.data import (
     ScoringResult,
     Example,
-    Trace,
 )
 from judgeval.scorers import (
     APIScorerConfig,
@@ -22,19 +21,14 @@ from judgeval.data.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
-    run_trace_eval,
 )
-from judgeval.data.trace_run import TraceRun
 from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
-from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
 from judgeval.common.logger import judgeval_logger
 
 
-if TYPE_CHECKING:
-    from judgeval.integrations.langgraph import JudgevalCallbackHandler
 from judgeval.constants import DEFAULT_GPT_MODEL
 
 
@@ -86,47 +80,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         else:
             judgeval_logger.info("Successfully initialized JudgmentClient!")
 
-    def run_trace_evaluation(
-        self,
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        examples: Optional[List[Example]] = None,
-        function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
-        traces: Optional[List[Trace]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_trace",
-        model: Optional[str] = DEFAULT_GPT_MODEL,
-    ) -> List[ScoringResult]:
-        try:
-            if examples and not function:
-                raise ValueError("Cannot pass in examples without a function")
-
-            if traces and function:
-                raise ValueError("Cannot pass in traces and function")
-
-            if examples and traces:
-                raise ValueError("Cannot pass in both examples and traces")
-
-            trace_run = TraceRun(
-                project_name=project_name,
-                eval_name=eval_run_name,
-                traces=traces,
-                scorers=scorers,
-                model=model,
-                organization_id=self.organization_id,
-                tools=tools,
-            )
-            return run_trace_eval(
-                trace_run, self.judgment_api_key, function, tracer, examples
-            )
-        except ValueError as e:
-            raise ValueError(
-                f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
-            )
-        except Exception as e:
-            raise Exception(f"An unexpected error occurred during evaluation: {str(e)}")
-
     def run_evaluation(
         self,
         examples: List[Example],
@@ -134,6 +87,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         model: Optional[str] = DEFAULT_GPT_MODEL,
         project_name: str = "default_project",
         eval_run_name: str = "default_eval_run",
+        show_url: bool = True,
     ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
@@ -161,6 +115,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             return run_eval(
                 eval,
                 self.judgment_api_key,
+                show_url=show_url,
             )
         except ValueError as e:
             raise ValueError(
@@ -217,57 +172,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         )
         assert_test(results)
 
-    def assert_trace_test(
-        self,
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        examples: Optional[List[Example]] = None,
-        function: Optional[Callable] = None,
-        tracer: Optional[Union[Tracer, JudgevalCallbackHandler]] = None,
-        traces: Optional[List[Trace]] = None,
-        tools: Optional[List[Dict[str, Any]]] = None,
-        model: Optional[str] = DEFAULT_GPT_MODEL,
-        project_name: str = "default_test",
-        eval_run_name: str = str(uuid4()),
-    ) -> None:
-        """
-        Asserts a test by running the evaluation and checking the results for success
-
-        Args:
-            examples (List[Example]): The examples to evaluate.
-            scorers (List[Union[APIScorerConfig, BaseScorer]]): A list of scorers to use for evaluation
-            model (str): The model used as a judge when using LLM as a Judge
-            project_name (str): The name of the project the evaluation results belong to
-            eval_run_name (str): A name for this evaluation run
-            function (Optional[Callable]): A function to use for evaluation
-            tracer (Optional[Union[Tracer, BaseCallbackHandler]]): A tracer to use for evaluation
-            tools (Optional[List[Dict[str, Any]]]): A list of tools to use for evaluation
-        """
-
-        # Check for enable_param_checking and tools
-        for scorer in scorers:
-            if hasattr(scorer, "kwargs") and scorer.kwargs is not None:
-                if scorer.kwargs.get("enable_param_checking") is True:
-                    if not tools:
-                        raise ValueError(
-                            f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
-                        )
-
-        results: List[ScoringResult]
-
-        results = self.run_trace_evaluation(
-            examples=examples,
-            traces=traces,
-            scorers=scorers,
-            model=model,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            function=function,
-            tracer=tracer,
-            tools=tools,
-        )
-
-        assert_test(results)
-
     def _extract_scorer_name(self, scorer_file_path: str) -> str:
         """Extract scorer name from the scorer file by importing it."""
         try:
@@ -301,7 +205,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             judgeval_logger.warning(f"Could not extract scorer name: {e}")
             return Path(scorer_file_path).stem
 
-    def
+    def upload_custom_scorer(
         self,
         scorer_file_path: str,
         requirements_file_path: Optional[str] = None,
@@ -342,10 +246,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         with open(requirements_file_path, "r") as f:
             requirements_text = f.read()
 
-        # Upload to backend
-        judgeval_logger.info(
-            f"Uploading custom scorer: {unique_name}, this can take a couple of minutes..."
-        )
        try:
            response = self.api_client.upload_custom_scorer(
                scorer_name=unique_name,
judgeval/run_evaluation.py
CHANGED
@@ -6,10 +6,10 @@ import time
 import orjson
 import sys
 import threading
-from typing import List, Dict, Union,
+from typing import List, Dict, Union, Tuple, Any, TYPE_CHECKING
 from rich import print as rprint
 
-from judgeval.data import ScorerData, ScoringResult, Example
+from judgeval.data import ScorerData, ScoringResult, Example
 from judgeval.scorers import BaseScorer, APIScorerConfig
 from judgeval.scorers.score import a_execute_scoring
 from judgeval.common.api import JudgmentApiClient
@@ -22,10 +22,7 @@ from judgeval.common.logger import judgeval_logger
 
 
 if TYPE_CHECKING:
-    from judgeval.common.tracer import Tracer
-    from judgeval.data.trace_run import TraceRun
     from judgeval.data.evaluation_run import EvaluationRun
-    from judgeval.integrations.langgraph import JudgevalCallbackHandler
 
 
 def safe_run_async(coro):
@@ -99,29 +96,6 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> Dict:
     )
 
 
-def execute_api_trace_eval(trace_run: TraceRun, judgment_api_key: str) -> Dict:
-    """
-    Executes an evaluation of a list of `Trace`s using one or more `JudgmentScorer`s via the Judgment API.
-    """
-
-    try:
-        # submit API request to execute evals
-        if not judgment_api_key or not trace_run.organization_id:
-            raise ValueError("API key and organization ID are required")
-        api_client = JudgmentApiClient(judgment_api_key, trace_run.organization_id)
-        return api_client.run_trace_evaluation(trace_run.model_dump(warnings=False))
-    except Exception as e:
-        judgeval_logger.error(f"Error: {e}")
-
-        details = "An unknown error occurred."
-        if isinstance(e, JudgmentAPIException):
-            details = e.response_json.get("detail", "An unknown error occurred.")
-
-        raise JudgmentAPIError(
-            "An error occurred while executing the Judgment API request: " + details
-        )
-
-
 def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResult]:
     """
     Checks if any `ScoringResult` objects are missing `scorers_data`.
@@ -142,7 +116,7 @@ def check_missing_scorer_data(results: List[ScoringResult]) -> List[ScoringResul
 
 def log_evaluation_results(
     scoring_results: List[ScoringResult],
-    run:
+    run: EvaluationRun,
     judgment_api_key: str,
 ) -> str:
     """
@@ -208,81 +182,6 @@ def check_examples(
     rprint("[green]Continuing...[/green]")
 
 
-def run_trace_eval(
-    trace_run: TraceRun,
-    judgment_api_key: str,
-    function: Optional[Callable] = None,
-    tracer: Optional[Union[Tracer, "JudgevalCallbackHandler"]] = None,
-    examples: Optional[List[Example]] = None,
-) -> List[ScoringResult]:
-    if function and tracer and examples is not None:
-        new_traces: List[Trace] = []
-
-        # Handle case where tracer is actually a callback handler
-        actual_tracer = tracer
-        if hasattr(tracer, "tracer") and hasattr(tracer.tracer, "traces"):
-            # This is a callback handler, get the underlying tracer
-            actual_tracer = tracer.tracer
-
-        if trace_run.project_name != actual_tracer.project_name:
-            raise ValueError(
-                f"Project name mismatch between run_trace_eval and tracer. "
-                f"Trace run: {trace_run.project_name}, "
-                f"Tracer: {actual_tracer.project_name}"
-            )
-
-        actual_tracer.offline_mode = True
-        actual_tracer.traces = []
-        judgeval_logger.info("Running agent function: ")
-        for example in examples:
-            if example.input:
-                if isinstance(example.input, str):
-                    function(example.input)
-                elif isinstance(example.input, dict):
-                    function(**example.input)
-                else:
-                    raise ValueError(
-                        f"Input must be string or dict, got {type(example.input)}"
-                    )
-            else:
-                function()
-
-        for i, trace in enumerate(actual_tracer.traces):
-            # We set the root-level trace span with the expected tools of the Trace
-            trace = Trace(**trace)
-            trace.trace_spans[0].expected_tools = examples[i].expected_tools
-            new_traces.append(trace)
-        trace_run.traces = new_traces
-        actual_tracer.traces = []
-
-    # Execute evaluation using Judgment API
-    try:  # execute an EvaluationRun with just JudgmentScorers
-        judgeval_logger.info("Executing Trace Evaluation... ")
-        response_data: Dict = execute_api_trace_eval(trace_run, judgment_api_key)
-        scoring_results = [
-            ScoringResult(**result) for result in response_data["results"]
-        ]
-    except JudgmentAPIError as e:
-        raise JudgmentAPIError(
-            f"An error occurred while executing the Judgment API request: {str(e)}"
-        )
-    except ValueError as e:
-        raise ValueError(
-            f"Please check your TraceRun object, one or more fields are invalid: {str(e)}"
-        )
-
-    # Convert the response data to `ScoringResult` objects
-    # TODO: allow for custom scorer on traces
-
-    url = log_evaluation_results(
-        response_data["agent_results"], trace_run, judgment_api_key
-    )
-    rprint(
-        f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
-    )
-    return scoring_results
-
-
 def _poll_evaluation_until_complete(
     experiment_run_id: str,
     project_name: str,
@@ -387,12 +286,15 @@ def progress_logger(stop_event, msg="Working...", interval=5):
 def run_eval(
     evaluation_run: EvaluationRun,
     judgment_api_key: str,
+    show_url: bool = True,
 ) -> List[ScoringResult]:
     """
     Executes an evaluation of `Example`s using one or more `Scorer`s
 
     Args:
         evaluation_run (EvaluationRun): Stores example and evaluation together for running
+        judgment_api_key (str): API key for authentication
+        show_url (bool): Whether to display the evaluation results URL. Defaults to True.
 
     Returns:
         List[ScoringResult]: A list of ScoringResult objects
@@ -481,9 +383,10 @@ def run_eval(
         scoring_result.model_dump(warnings=False) for scoring_result in results
     ]
     url = log_evaluation_results(send_results, evaluation_run, judgment_api_key)
-
-
-
+    if show_url:
+        rprint(
+            f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+        )
     return results
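`run_eval` itself now accepts the flag directly, for callers that construct an `EvaluationRun` by hand. A sketch under the assumption that `EvaluationRun` takes fields analogous to the removed `TraceRun(...)` constructor above (`project_name`, `eval_name`, `scorers`, `model`, `organization_id`); the field names here are inferred from this diff, not confirmed against `judgeval/data/evaluation_run.py`:

```python
import os

from judgeval.data import Example
from judgeval.data.evaluation_run import EvaluationRun
from judgeval.run_evaluation import run_eval
from judgeval.scorers import AnswerRelevancyScorer  # illustrative scorer

# Field names inferred from the TraceRun(...) call removed above;
# check judgeval/data/evaluation_run.py for the actual model.
run = EvaluationRun(
    project_name="my_project",
    eval_name="nightly_eval",
    examples=[Example(input="2+2?", actual_output="4")],
    scorers=[AnswerRelevancyScorer(threshold=0.5)],
    model="gpt-4.1",
    organization_id=os.environ["JUDGMENT_ORG_ID"],
)

results = run_eval(
    run,
    os.environ["JUDGMENT_API_KEY"],
    show_url=False,  # new in 0.7.0: skip the rich "View Results" banner
)
```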
{judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.
+Version: 0.7.0
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: boto3
 Requires-Dist: click<8.2.0
+Requires-Dist: fireworks-ai>=0.19.18
 Requires-Dist: langchain-anthropic
 Requires-Dist: langchain-core
 Requires-Dist: langchain-huggingface
@@ -39,7 +40,7 @@ Description-Content-Type: text/markdown
 
 <br>
 <div style="font-size: 1.5em;">
-    Enable self-learning agents with
+    Enable self-learning agents with environment data and evals.
 </div>
 
 ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
@@ -56,11 +57,11 @@ We're hiring! Join us in our mission to enable self-learning agents by providing
 
 </div>
 
-Judgeval offers **open-source tooling** for
+Judgeval offers **open-source tooling** for evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
 
 ## 🎬 See Judgeval in Action
 
-**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval captures all environment responses across all agent tool calls for monitoring. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
 
 <table style="width: 100%; max-width: 800px; table-layout: fixed;">
   <tr>
@@ -69,8 +70,8 @@ Judgeval offers **open-source tooling** for tracing and evaluating autonomous, s
       <br><strong>🤖 Agents Running</strong>
     </td>
     <td align="center" style="padding: 8px; width: 50%;">
-      <img src="assets/trace.gif" alt="
-      <br><strong>📊
+      <img src="assets/trace.gif" alt="Capturing Environment Data Demo" style="width: 100%; max-width: 350px; height: auto;" />
+      <br><strong>📊 Capturing Environment Data </strong>
     </td>
   </tr>
   <tr>
@@ -111,54 +112,14 @@ export JUDGMENT_ORG_ID=...
 
 **If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
 
-## 🏁 Quickstarts
-
-### 🛰️ Tracing
-
-Create a file named `agent.py` with the following code:
-
-```python
-from judgeval.tracer import Tracer, wrap
-from openai import OpenAI
-
-client = wrap(OpenAI())  # tracks all LLM calls
-judgment = Tracer(project_name="my_project")
-
-@judgment.observe(span_type="tool")
-def format_question(question: str) -> str:
-    # dummy tool
-    return f"Question : {question}"
-
-@judgment.observe(span_type="function")
-def run_agent(prompt: str) -> str:
-    task = format_question(prompt)
-    response = client.chat.completions.create(
-        model="gpt-4.1",
-        messages=[{"role": "user", "content": task}]
-    )
-    return response.choices[0].message.content
-
-run_agent("What is the capital of the United States?")
-```
-You'll see your trace exported to the Judgment Platform:
-
-<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
-
-
-[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
-
-
-<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
-
 
 ## ✨ Features
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
 | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
 | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export
+| <h3>📊 Datasets</h3>Export environment interactions and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
{judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
 judgeval/__init__.py,sha256=5Lm1JMYFREJGN_8X-Wpruu_ovwGLJ08gCzNAt-u-pQE,419
-judgeval/cli.py,sha256=
+judgeval/cli.py,sha256=WTFTJKQ6LZI7K9o9KnCfTzsTEJnKfPuSURUpRFLiHp8,1756
 judgeval/clients.py,sha256=HHul68PV1om0dxsVZZu90TtCiy5zaqAwph16jXTQzQo,989
 judgeval/constants.py,sha256=UNoTLHgbpZHRInPM2ZaI3m0XokPkee5ILlg20reqhzo,4180
 judgeval/dataset.py,sha256=vOrDKam2I-K1WcVF5IBkQruCDvXTc8PRaFm4-dV0lXs,6220
-judgeval/judgment_client.py,sha256
+judgeval/judgment_client.py,sha256=KxQP-EmhZUJOIFM2Zf_OJbxrgDpN1dRwxo4iVI9zLdA,9390
 judgeval/local_eval_queue.py,sha256=GmlXeZt7bfAJe1hPUjDg_irth4RkNqL2Zdi7VzboBzI,6984
 judgeval/rules.py,sha256=CoQjqmP8daEXewMkplmA-7urubDtweOr5O6z8klVwLI,20031
-judgeval/run_evaluation.py,sha256=
+judgeval/run_evaluation.py,sha256=ETAP7srohMBAsRqvxHQHKsR5zt3Rzns_kNM_2ulxVdU,18084
 judgeval/version_check.py,sha256=FoLEtpCjDw2HuDQdpw5yT29UtwumSc6ZZN6AV_c9Mnw,1057
 judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
 judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
@@ -14,19 +14,24 @@ judgeval/common/logger.py,sha256=514eFLYWS_UL8VY-zAR2ePUlpQe4rbYlleLASFllLE4,151
 judgeval/common/utils.py,sha256=oxGDRVWOICKWeyGgsoc36_yAyHSYF4XtH842Mkznwis,34739
 judgeval/common/api/__init__.py,sha256=-E7lpZz1fG8puR_aYUMfPmQ-Vyhd0bgzoaU5EhIuFjQ,114
 judgeval/common/api/api.py,sha256=fWtMNln0o1wOhJ9wangWpyY_j3WF7P3at_LYPJEicP0,13670
-judgeval/common/api/constants.py,sha256=
+judgeval/common/api/constants.py,sha256=N6rQZqMhFv2U8tOw-6pMH0uV7aGT9m8sw57ZkfDW97c,4689
 judgeval/common/api/json_encoder.py,sha256=QQgCe2FBmW1uWKx8yvuhr4U7_b4D0sG97GZtXHKnBdk,5881
 judgeval/common/storage/__init__.py,sha256=a-PI7OL-ydyzugGUKmJKRBASnK-Q-gs82L9K9rSyJP8,90
 judgeval/common/storage/s3_storage.py,sha256=0-bNKheqJJyBZ92KGrzQtd1zocIRWBlfn_58L4a-Ay0,3719
 judgeval/common/tracer/__init__.py,sha256=tJCJsmVmrL89Phv88gNCJ-j0ITPez6lh8vhMAAlLNSc,795
 judgeval/common/tracer/constants.py,sha256=yu5y8gMe5yb1AaBkPtAH-BNwIaAR3NwYCRoSf45wp5U,621
-judgeval/common/tracer/core.py,sha256=
+judgeval/common/tracer/core.py,sha256=Vhh2LRgLdxa_yxUfMunv7l83tksuztm7F_oSwD92EXs,91681
 judgeval/common/tracer/otel_exporter.py,sha256=kZLlOQ6afQE4dmb9H1wgU4P3H5PG1D_zKyvnpWcT5Ak,3899
 judgeval/common/tracer/otel_span_processor.py,sha256=BD-FKXaZft5_3zqy1Qe_tpkudVOLop9AGhBjZUgp-Z8,6502
 judgeval/common/tracer/providers.py,sha256=3c3YOtKuoBjlTL0rc2HAGnUpppqvsyzrN5H6EKCqEi0,2733
 judgeval/common/tracer/span_processor.py,sha256=1NQxNSVWcb8qCFLmslSVMnaWdkOZmiFJnxeeN0i6vnU,1150
 judgeval/common/tracer/span_transformer.py,sha256=cfzz6RpTCOG9Io9knNlwtAW34p3wyK-u8jSNMu24p1w,7382
-judgeval/common/tracer/trace_manager.py,sha256=
+judgeval/common/tracer/trace_manager.py,sha256=FAlkTNomb_TzSSnF7DnmP5nImBgHaA_SFNW1INzE1aI,3178
+judgeval/common/trainer/__init__.py,sha256=fkaBjtAynh1GZbvK2xbNTjuLFSDpPzj7u4Chf4vZsfs,209
+judgeval/common/trainer/config.py,sha256=kaWz0ni4ijtXpu8SF2jLEnw5yA2HqaUbvjiyqEnSrXE,4195
+judgeval/common/trainer/console.py,sha256=sZCoJqI6ZRArbJpxl3ZwNb9taYoEkgCpz9PF4IUbGjE,4818
+judgeval/common/trainer/trainable_model.py,sha256=tnhFH2Mp5hVht3utHVFPs2BxKoBQgRJrAzgzE5IfKEU,8842
+judgeval/common/trainer/trainer.py,sha256=dE-sOU26dNaWxPaN88XuN3f3XCizdHrRPNylrspCWQc,11815
 judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
 judgeval/data/evaluation_run.py,sha256=IirmYZ1_9N99eep7DDuoyshwjmpNK9bQCxCWXnnhhuI,4053
 judgeval/data/example.py,sha256=kRskIgsjwcvv2Y8jaPwV-PND7zlmMbFsvRVQ_b7SZY0,914
@@ -70,8 +75,8 @@ judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
 judgeval/utils/async_utils.py,sha256=uNx1SopEc0quSjc8GBQqyba0SmCMAzv2NKIq6xYwttc,989
 judgeval/utils/file_utils.py,sha256=PWHRs8dUr8iDwpglSSk4Yjd7C6ZhDzUaO-jV3m7riHM,1987
 judgeval/utils/requests.py,sha256=K3gUKrwL6TvwYKVYO5OeLWdUHn9NiUPmnIXhZEiEaHU,1534
-judgeval-0.
-judgeval-0.
-judgeval-0.
-judgeval-0.
-judgeval-0.
+judgeval-0.7.0.dist-info/METADATA,sha256=WvjnS9cY6RvmrLdtpJbNJN3AssRmIWp61dYr2ZUn0Bo,8877
+judgeval-0.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.7.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
+judgeval-0.7.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.7.0.dist-info/RECORD,,
{judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/WHEEL
File without changes
{judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/entry_points.txt
File without changes
{judgeval-0.6.0.dist-info → judgeval-0.7.0.dist-info}/licenses/LICENSE.md
File without changes