judgeval 0.0.55__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. judgeval/common/api/__init__.py +3 -0
  2. judgeval/common/api/api.py +352 -0
  3. judgeval/common/api/constants.py +165 -0
  4. judgeval/common/storage/__init__.py +6 -0
  5. judgeval/common/tracer/__init__.py +31 -0
  6. judgeval/common/tracer/constants.py +22 -0
  7. judgeval/common/tracer/core.py +1916 -0
  8. judgeval/common/tracer/otel_exporter.py +108 -0
  9. judgeval/common/tracer/otel_span_processor.py +234 -0
  10. judgeval/common/tracer/span_processor.py +37 -0
  11. judgeval/common/tracer/span_transformer.py +211 -0
  12. judgeval/common/tracer/trace_manager.py +92 -0
  13. judgeval/common/utils.py +2 -2
  14. judgeval/constants.py +3 -30
  15. judgeval/data/datasets/eval_dataset_client.py +29 -156
  16. judgeval/data/judgment_types.py +4 -12
  17. judgeval/data/result.py +1 -1
  18. judgeval/data/scorer_data.py +2 -2
  19. judgeval/data/scripts/openapi_transform.py +1 -1
  20. judgeval/data/trace.py +66 -1
  21. judgeval/data/trace_run.py +0 -3
  22. judgeval/evaluation_run.py +0 -2
  23. judgeval/integrations/langgraph.py +43 -164
  24. judgeval/judgment_client.py +17 -211
  25. judgeval/run_evaluation.py +209 -611
  26. judgeval/scorers/__init__.py +2 -6
  27. judgeval/scorers/base_scorer.py +4 -23
  28. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
  30. judgeval/scorers/score.py +2 -1
  31. judgeval/scorers/utils.py +1 -13
  32. judgeval/utils/requests.py +21 -0
  33. judgeval-0.1.0.dist-info/METADATA +202 -0
  34. {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/RECORD +37 -29
  35. judgeval/common/tracer.py +0 -3215
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
  37. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  38. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  39. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
  40. judgeval-0.0.55.dist-info/METADATA +0 -1384
  41. /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
  42. {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/WHEEL +0 -0
  43. {judgeval-0.0.55.dist-info → judgeval-0.1.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/__init__.py CHANGED
@@ -9,23 +9,19 @@ from judgeval.scorers.judgeval_scorers.api_scorers import (
     InstructionAdherenceScorer,
     DerailmentScorer,
     ToolOrderScorer,
-    ClassifierScorer,
+    PromptScorer,
     ToolDependencyScorer,
 )
-from judgeval.scorers.judgeval_scorers.classifiers import (
-    Text2SQLScorer,
-)
 
 __all__ = [
     "APIScorerConfig",
     "BaseScorer",
-    "ClassifierScorer",
+    "PromptScorer",
     "ExecutionOrderScorer",
     "HallucinationScorer",
     "FaithfulnessScorer",
     "AnswerRelevancyScorer",
     "AnswerCorrectnessScorer",
-    "Text2SQLScorer",
     "InstructionAdherenceScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
judgeval/scorers/base_scorer.py CHANGED
@@ -10,7 +10,6 @@ from pydantic import BaseModel
 from judgeval.judges.utils import create_judge
 from typing import Any
 from pydantic import model_validator, Field
-from judgeval.common.logger import judgeval_logger
 
 
 class BaseScorer(BaseModel):
@@ -32,10 +31,10 @@ class BaseScorer(BaseModel):
     reason: Optional[str] = ""
     using_native_model: Optional[bool] = None  # Whether the model is a native model
     success: Optional[bool] = None  # Whether the test case passed or failed
-    model: Optional[Any] = Field(
+    model: Optional[str] = None  # The name of the model used to evaluate the test case
+    model_client: Optional[Any] = Field(
         default=None, exclude=True
     )  # The model used to evaluate the test case
-    evaluation_model: Optional[str] = None  # The model used to evaluate the test case
     strict_mode: bool = False  # Whether to run the scorer in strict mode
     error: Optional[str] = None  # The error message if the scorer failed
     additional_metadata: Optional[Dict] = None  # Additional metadata for the scorer
@@ -66,8 +65,8 @@
 
         This method is used at eval time
         """
-        self.model, self.using_native_model = create_judge(model)
-        self.evaluation_model = self.model.get_model_name()
+        self.model_client, self.using_native_model = create_judge(model)
+        self.model = self.model_client.get_model_name() or model
 
     def success_check(self) -> bool:
         """
@@ -78,21 +77,3 @@
         if self.score is None:
             return False
         return self.score >= self.threshold
-
-    def __str__(self):
-        if self.error:
-            judgeval_logger.warning(f"BaseScorer contains error: {self.error}")
-        attributes = {
-            "score_type": self.score_type,
-            "threshold": self.threshold,
-            "score": self.score,
-            "score_breakdown": self.score_breakdown,
-            "reason": self.reason,
-            "success": self.success,
-            "model": self.model,
-            "evaluation_model": self.evaluation_model,
-            "strict_mode": self.strict_mode,
-            "error": self.error,
-            "additional_metadata": self.additional_metadata,
-        }
-        return f"BaseScorer({attributes})"
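The `model` / `model_client` split above follows a common Pydantic pattern: keep the serializable model name as a plain string and exclude the live client object from dumps. A standalone sketch of that pattern (illustration only, not judgeval code; field names are borrowed from the diff):

```python
from typing import Any, Optional
from pydantic import BaseModel, ConfigDict, Field


class ScorerFieldsSketch(BaseModel):
    model_config = ConfigDict(protected_namespaces=())  # allow a field named model_client

    model: Optional[str] = None  # model name, kept when serialized
    model_client: Optional[Any] = Field(default=None, exclude=True)  # judge client, never serialized


sketch = ScorerFieldsSketch(model="gpt-4.1", model_client=object())
print(sketch.model_dump())  # {'model': 'gpt-4.1'} -- the client object is dropped
```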
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py CHANGED
@@ -20,8 +20,8 @@ from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
     DerailmentScorer,
 )
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
-from judgeval.scorers.judgeval_scorers.api_scorers.classifier_scorer import (
-    ClassifierScorer,
+from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
+    PromptScorer,
 )
 from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
     ToolDependencyScorer,
@@ -42,6 +42,6 @@ __all__ = [
     "GroundednessScorer",
     "DerailmentScorer",
     "ToolOrderScorer",
-    "ClassifierScorer",
+    "PromptScorer",
     "ToolDependencyScorer",
 ]
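Both the top-level `judgeval.scorers` exports and the `api_scorers` subpackage now expose `PromptScorer` where `ClassifierScorer` used to be, and `Text2SQLScorer` does not appear to have a direct replacement in this diff. A minimal migration sketch (import paths taken from the hunks above):

```python
# Before (0.0.55):
# from judgeval.scorers import ClassifierScorer, Text2SQLScorer

# After (0.1.0):
from judgeval.scorers import PromptScorer
```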
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py ADDED
@@ -0,0 +1,215 @@
+from judgeval.scorers.api_scorer import APIScorerConfig
+from judgeval.constants import APIScorerType
+from typing import Mapping, Dict, Any
+from judgeval.common.api import JudgmentApiClient, JudgmentAPIException
+import os
+from judgeval.common.exceptions import JudgmentAPIError
+
+
+def push_prompt_scorer(
+    name: str,
+    prompt: str,
+    options: Mapping[str, float],
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+) -> str:
+    client = JudgmentApiClient(judgment_api_key, organization_id)
+    try:
+        r = client.save_scorer(name, prompt, dict(options))
+    except JudgmentAPIException as e:
+        if e.status_code == 500:
+            raise JudgmentAPIError(
+                f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.error_detail}"
+            )
+        raise JudgmentAPIError(f"Failed to save classifier scorer: {e.error_detail}")
+    return r["name"]
+
+
+def fetch_prompt_scorer(
+    name: str,
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+):
+    client = JudgmentApiClient(judgment_api_key, organization_id)
+    try:
+        scorer_config = client.fetch_scorer(name)
+        scorer_config.pop("created_at")
+        scorer_config.pop("updated_at")
+        return scorer_config
+    except JudgmentAPIException as e:
+        if e.status_code == 500:
+            raise JudgmentAPIError(
+                f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.error_detail}"
+            )
+        raise JudgmentAPIError(
+            f"Failed to fetch classifier scorer '{name}': {e.error_detail}"
+        )
+
+
+def scorer_exists(
+    name: str,
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+):
+    client = JudgmentApiClient(judgment_api_key, organization_id)
+    try:
+        return client.scorer_exists(name)["exists"]
+    except JudgmentAPIException as e:
+        if e.status_code == 500:
+            raise JudgmentAPIError(
+                f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {e.error_detail}"
+            )
+        raise JudgmentAPIError(f"Failed to check if scorer exists: {e.error_detail}")
+
+
+class PromptScorer(APIScorerConfig):
+    """
+    In the Judgment backend, this scorer is implemented as a PromptScorer that takes
+    1. a system role that may involve the Example object
+    2. options for scores on the example
+
+    and uses a judge to execute the evaluation from the system role and classify into one of the options
+    """
+
+    prompt: str
+    options: Mapping[str, float]
+    score_type: APIScorerType = APIScorerType.PROMPT_SCORER
+    judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
+    organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
+
+    @classmethod
+    def get(
+        cls,
+        name: str,
+        judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+    ):
+        scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
+        return cls(
+            name=name,
+            prompt=scorer_config["prompt"],
+            options=scorer_config["options"],
+            judgment_api_key=judgment_api_key,
+            organization_id=organization_id,
+        )
+
+    @classmethod
+    def create(
+        cls,
+        name: str,
+        prompt: str,
+        options: Mapping[str, float],
+        judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
+        organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
+    ):
+        if not scorer_exists(name, judgment_api_key, organization_id):
+            push_prompt_scorer(name, prompt, options, judgment_api_key, organization_id)
+            return cls(
+                name=name,
+                prompt=prompt,
+                options=options,
+                judgment_api_key=judgment_api_key,
+                organization_id=organization_id,
+            )
+        else:
+            raise JudgmentAPIError(
+                f"Scorer with name {name} already exists. Either use the existing scorer with the get() method or use a new name."
+            )
+
+    # Setter functions. Each setter function pushes the scorer to the DB.
+    def set_name(self, name: str):
+        """
+        Updates the name of the scorer.
+        """
+        self.name = name
+        self.push_prompt_scorer()
+
+    def set_threshold(self, threshold: float):
+        """
+        Updates the threshold of the scorer.
+        """
+        self.threshold = threshold
+        self.push_prompt_scorer()
+
+    def set_prompt(self, prompt: str):
+        """
+        Updates the prompt with the new prompt.
+
+        Sample prompt:
+        "Did the chatbot answer the user's question in a kind way?"
+        """
+        self.prompt = prompt
+        self.push_prompt_scorer()
+
+    def set_options(self, options: Mapping[str, float]):
+        """
+        Updates the options with the new options.
+
+        Sample options:
+        {"yes": 1, "no": 0}
+        """
+        self.options = options
+        self.push_prompt_scorer()
+
+    def append_to_prompt(self, prompt_addition: str):
+        """
+        Appends a string to the prompt.
+        """
+        self.prompt += prompt_addition
+        self.push_prompt_scorer()
+
+    # Getters
+    def get_prompt(self) -> str | None:
+        """
+        Returns the prompt of the scorer.
+        """
+        return self.prompt
+
+    def get_options(self) -> Mapping[str, float] | None:
+        """
+        Returns the options of the scorer.
+        """
+        return self.options
+
+    def get_name(self) -> str | None:
+        """
+        Returns the name of the scorer.
+        """
+        return self.name
+
+    def get_config(self) -> dict:
+        """
+        Returns a dictionary with all the fields in the scorer.
+        """
+        return {
+            "name": self.name,
+            "prompt": self.prompt,
+            "options": self.options,
+        }
+
+    def push_prompt_scorer(self):
+        """
+        Pushes the scorer to the DB.
+        """
+        push_prompt_scorer(
+            self.name,
+            self.prompt,
+            self.options,
+            self.judgment_api_key,
+            self.organization_id,
+        )
+
+    def __str__(self):
+        return f"PromptScorer(name={self.name}, prompt={self.prompt}, options={self.options})"
+
+    def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
+        base = super().model_dump(*args, **kwargs)
+        base_fields = set(APIScorerConfig.model_fields.keys())
+        all_fields = set(self.__class__.model_fields.keys())
+
+        extra_fields = all_fields - base_fields - {"kwargs"}
+
+        base["kwargs"] = {
+            k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
+        }
+        return base
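A hypothetical usage sketch of the `PromptScorer` API defined above (the scorer name, prompt, and options are made-up examples; `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` must be set, since `create`, `get`, and the setters all call the Judgment API):

```python
from judgeval.scorers import PromptScorer

# First run: register the scorer (raises JudgmentAPIError if the name is already taken).
scorer = PromptScorer.create(
    name="kindness-check",
    prompt="Did the chatbot answer the user's question in a kind way?",
    options={"yes": 1.0, "no": 0.0},
)

# Later runs: fetch the saved scorer by name instead of recreating it.
scorer = PromptScorer.get(name="kindness-check")
scorer.append_to_prompt(" Answer strictly yes or no.")  # setters push the update back to the DB
```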
judgeval/scorers/score.py CHANGED
@@ -89,7 +89,8 @@ async def a_execute_scoring(
 
     # Add model to scorers
     for scorer in scorers:
-        scorer._add_model(model)
+        if not scorer.model:
+            scorer._add_model(model)
 
     scoring_results: List[ScoringResult] = [None for _ in examples]
     tasks = []
judgeval/scorers/utils.py CHANGED
@@ -4,7 +4,6 @@ Util functions for Scorer objects
 
 import asyncio
 import nest_asyncio
-import inspect
 import json
 import re
 from typing import List, Optional
@@ -20,18 +19,7 @@ def clone_scorers(scorers: List[BaseScorer]) -> List[BaseScorer]:
     """
     cloned_scorers = []
     for s in scorers:
-        scorer_class = type(s)
-        args = vars(s)
-
-        signature = inspect.signature(scorer_class.__init__)
-        valid_params = signature.parameters.keys()
-        valid_args = {key: args[key] for key in valid_params if key in args}
-
-        cloned_scorer = scorer_class(**valid_args)
-        # kinda hacky, but in case the class inheriting from BaseScorer doesn't have `model` in its __init__,
-        # we need to explicitly include it here so that we can add the judge model to the cloned scorer
-        cloned_scorer._add_model(model=args.get("model"))
-        cloned_scorers.append(cloned_scorer)
+        cloned_scorers.append(s.model_copy(deep=True))
     return cloned_scorers
 
 
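The rewritten `clone_scorers` relies on Pydantic's built-in deep copy instead of re-invoking each scorer's `__init__`. A standalone sketch of the behavior it depends on (illustration only, not judgeval code):

```python
from typing import Dict, Optional
from pydantic import BaseModel


class ScorerCloneSketch(BaseModel):
    threshold: float = 0.5
    additional_metadata: Optional[Dict] = None


original = ScorerCloneSketch(additional_metadata={"run": 1})
clone = original.model_copy(deep=True)  # duplicates nested/mutable fields too
clone.additional_metadata["run"] = 2
print(original.additional_metadata)  # {'run': 1} -- the original scorer is untouched
```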
judgeval/utils/requests.py CHANGED
@@ -10,9 +10,13 @@ class RetrySession(requests_original.Session):
         retries=3,
         backoff_factor=0.5,
         status_forcelist=[HTTPStatus.BAD_GATEWAY, HTTPStatus.SERVICE_UNAVAILABLE],
+        default_timeout=(10, 60),  # (connect_timeout, read_timeout)
     ):
         super().__init__()
 
+        # Store default timeout
+        self.default_timeout = default_timeout
+
         retry_strategy = Retry(
             total=retries,
             read=retries,
@@ -25,5 +29,22 @@
         self.mount("http://", adapter)
         self.mount("https://", adapter)
 
+    def request(self, method, url, timeout=None, **kwargs):
+        """
+        Override request method to add default timeout if not specified.
+
+        Args:
+            method: HTTP method
+            url: Request URL
+            timeout: Timeout value. If None, uses default_timeout.
+                Can be a float (total timeout) or tuple (connect, read).
+            **kwargs: Other request arguments
+        """
+        # Use default timeout if none specified
+        if timeout is None:
+            timeout = self.default_timeout
+
+        return super().request(method, url, timeout=timeout, **kwargs)
+
 
 requests = RetrySession()
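From a caller's perspective, the new default timeout applies to every request made through the module-level session, while an explicit per-call value still takes precedence. A hypothetical sketch (the URL is a placeholder):

```python
from judgeval.utils.requests import requests

resp = requests.get("https://example.com/health")             # uses the (10, 60) default timeout
resp = requests.get("https://example.com/health", timeout=5)  # explicit per-call timeout wins
```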
judgeval-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,202 @@
+Metadata-Version: 2.4
+Name: judgeval
+Version: 0.1.0
+Summary: Judgeval Package
+Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
+Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
+Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
+License-Expression: Apache-2.0
+License-File: LICENSE.md
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.11
+Requires-Dist: anthropic
+Requires-Dist: boto3
+Requires-Dist: datamodel-code-generator>=0.31.1
+Requires-Dist: google-genai
+Requires-Dist: langchain-anthropic
+Requires-Dist: langchain-core
+Requires-Dist: langchain-huggingface
+Requires-Dist: langchain-openai
+Requires-Dist: litellm>=1.61.15
+Requires-Dist: matplotlib>=3.10.3
+Requires-Dist: nest-asyncio
+Requires-Dist: openai
+Requires-Dist: pandas
+Requires-Dist: python-dotenv==1.0.1
+Requires-Dist: python-slugify>=8.0.4
+Requires-Dist: requests
+Requires-Dist: together
+Description-Content-Type: text/markdown
+
+<div align="center">
+
+<img src="assets/new_lightmode.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
+<img src="assets/new_darkmode.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
+
+<br>
+<div style="font-size: 1.5em;">
+Enable self-learning agents with traces, evals, and environment data.
+</div>
+
+## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
+
+[Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
+
+We're hiring! Join us in our mission to enable self-learning agents by providing the data and signals needed for monitoring and post-training.
+
+[![X](https://img.shields.io/badge/-X/Twitter-000?logo=x&logoColor=white)](https://x.com/JudgmentLabs)
+[![LinkedIn](https://custom-icon-badges.demolab.com/badge/LinkedIn%20-0A66C2?logo=linkedin-white&logoColor=fff)](https://www.linkedin.com/company/judgmentlabs)
+[![Discord](https://img.shields.io/badge/-Discord-5865F2?logo=discord&logoColor=white)](https://discord.gg/tGVFf8UBUY)
+
+<img src="assets/product_shot.png" alt="Judgment Platform" width="800" />
+
+</div>
+
+Judgeval offers **open-source tooling** for tracing and evaluating autonomous, stateful agents. It **provides runtime data from agent-environment interactions** for continuous learning and self-improvement.
+
+## 🎬 See Judgeval in Action
+
+**[Multi-Agent System](https://github.com/JudgmentLabs/judgment-cookbook/tree/main/cookbooks/agents/multi-agent) with complete observability:** (1) A multi-agent system spawns agents to research topics on the internet. (2) With just **3 lines of code**, Judgeval traces every input/output + environment response across all agent tool calls for debugging. (3) After completion, (4) export all interaction data to enable further environment-specific learning and optimization.
+
+<table style="width: 100%; max-width: 800px; table-layout: fixed;">
+<tr>
+<td align="center" style="padding: 8px; width: 50%;">
+<img src="assets/agent.gif" alt="Agent Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>🤖 Agents Running</strong>
+</td>
+<td align="center" style="padding: 8px; width: 50%;">
+<img src="assets/trace.gif" alt="Trace Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📊 Real-time Tracing</strong>
+</td>
+</tr>
+<tr>
+<td align="center" style="padding: 8px; width: 50%;">
+<img src="assets/document.gif" alt="Agent Completed Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>✅ Agents Completed Running</strong>
+</td>
+<td align="center" style="padding: 8px; width: 50%;">
+<img src="assets/data.gif" alt="Data Export Demo" style="width: 100%; max-width: 350px; height: auto;" />
+<br><strong>📤 Exporting Agent Environment Data</strong>
+</td>
+</tr>
+
+</table>
+
+## 📋 Table of Contents
+- [🛠️ Installation](#️-installation)
+- [🏁 Quickstarts](#-quickstarts)
+- [✨ Features](#-features)
+- [🏢 Self-Hosting](#-self-hosting)
+- [📚 Cookbooks](#-cookbooks)
+- [💻 Development with Cursor](#-development-with-cursor)
+
+## 🛠️ Installation
+
+Get started with Judgeval by installing our SDK using pip:
+
+```bash
+pip install judgeval
+```
+
+Ensure you have your `JUDGMENT_API_KEY` and `JUDGMENT_ORG_ID` environment variables set to connect to the [Judgment Platform](https://app.judgmentlabs.ai/).
+
+```bash
+export JUDGMENT_API_KEY=...
+export JUDGMENT_ORG_ID=...
+```
+
+**If you don't have keys, [create an account](https://app.judgmentlabs.ai/register) on the platform!**
+
+## 🏁 Quickstarts
+
+### 🛰️ Tracing
+
+Create a file named `agent.py` with the following code:
+
+```python
+from judgeval.tracer import Tracer, wrap
+from openai import OpenAI
+
+client = wrap(OpenAI())  # tracks all LLM calls
+judgment = Tracer(project_name="my_project")
+
+@judgment.observe(span_type="tool")
+def format_question(question: str) -> str:
+    # dummy tool
+    return f"Question : {question}"
+
+@judgment.observe(span_type="function")
+def run_agent(prompt: str) -> str:
+    task = format_question(prompt)
+    response = client.chat.completions.create(
+        model="gpt-4.1",
+        messages=[{"role": "user", "content": task}]
+    )
+    return response.choices[0].message.content
+
+run_agent("What is the capital of the United States?")
+```
+You'll see your trace exported to the Judgment Platform:
+
+<p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
+
+
+[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
+
+
+<!-- Created by https://github.com/ekalinin/github-markdown-toc -->
+
+
+## ✨ Features
+
+|  |  |
+|:---|:---:|
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+
+## 🏢 Self-Hosting
+
+Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
+
+### Key Features
+* Deploy Judgment on your own AWS account
+* Store data in your own Supabase instance
+* Access Judgment through your own custom domain
+
+### Getting Started
+1. Check out our [self-hosting documentation](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) for detailed setup instructions, along with how your self-hosted instance can be accessed
+2. Use the [Judgment CLI](https://docs.judgmentlabs.ai/documentation/developer-tools/judgment-cli/installation) to deploy your self-hosted environment
+3. After your self-hosted instance is setup, make sure the `JUDGMENT_API_URL` environmental variable is set to your self-hosted backend endpoint
+
+## 📚 Cookbooks
+
+Have your own? We're happy to feature it if you create a PR or message us on [Discord](https://discord.gg/tGVFf8UBUY).
+
+You can access our repo of cookbooks [here](https://github.com/JudgmentLabs/judgment-cookbook).
+
+## 💻 Development with Cursor
+Building agents and LLM workflows in Cursor works best when your coding assistant has the proper context about Judgment integration. The Cursor rules file contains the key information needed for your assistant to implement Judgment features effectively.
+
+Refer to the official [documentation](https://docs.judgmentlabs.ai/documentation/developer-tools/cursor/cursor-rules) for access to the rules file and more information on integrating this rules file with your codebase.
+
+## ⭐ Star Us on GitHub
+
+If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the repository.
+
+## ❤️ Contributors
+
+There are many ways to contribute to Judgeval:
+
+- Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
+- Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
+- Speaking or writing about Judgment and letting us know!
+
+<!-- Contributors collage -->
+[![Contributors](https://contributors-img.web.app/image?repo=JudgmentLabs/judgeval)](https://github.com/JudgmentLabs/judgeval/graphs/contributors)
+
+---
+
+Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).