judgeval 0.0.35__py3-none-any.whl → 0.0.37__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/scorers/score.py CHANGED
@@ -243,7 +243,7 @@ async def score_with_indicator(
243
243
  async def a_execute_scoring(
244
244
  examples: Union[List[Example], List[CustomExample]],
245
245
  scorers: List[JudgevalScorer],
246
- model: Optional[Union[str, List[str], JudgevalJudge]] = None,
246
+ model: Optional[Union[str, List[str], JudgevalJudge]] = "gpt-4.1",
247
247
  ignore_errors: bool = True,
248
248
  skip_on_missing_params: bool = True,
249
249
  show_indicator: bool = True,
@@ -271,6 +271,7 @@ async def a_execute_scoring(
271
271
  Returns:
272
272
  List[ScoringResult]: A list of `ScoringResult` objects containing the evaluation results.
273
273
  """
274
+
274
275
  semaphore = asyncio.Semaphore(max_concurrent)
275
276
 
276
277
  async def execute_with_semaphore(func: Callable, *args, **kwargs):
@@ -0,0 +1,57 @@
1
+ import yaml
2
+ from judgeval.common.logger import (
3
+ debug,
4
+ info,
5
+ error,
6
+ example_logging_context
7
+ )
8
+
9
+ from judgeval.data import Example
10
+
11
+
12
+ def add_from_yaml(file_path: str) -> None:
13
+ debug(f"Loading dataset from YAML file: {file_path}")
14
+ """
15
+ Adds examples from a YAML file.
16
+
17
+ The format of the YAML file is expected to be a dictionary with one key: "examples".
18
+ The value of the key is a list of dictionaries, where each dictionary represents an example.
19
+
20
+ The YAML file is expected to have the following format:
21
+ examples:
22
+ - input: "test input"
23
+ actual_output: "test output"
24
+ expected_output: "expected output"
25
+ context:
26
+ - "context1"
27
+ - "context2"
28
+ retrieval_context:
29
+ - "retrieval1"
30
+ additional_metadata:
31
+ key: "value"
32
+ tools_called:
33
+ - "tool1"
34
+ expected_tools:
35
+ - {tool_name: "tool1", parameters: {"query": "test query 1"}}
36
+ - {tool_name: "tool2", parameters: {"query": "test query 2"}}
37
+ name: "test example"
38
+ example_id: null
39
+ timestamp: "20241230_160117"
40
+ trace_id: "123"
41
+ """
42
+ try:
43
+ with open(file_path, "r") as file:
44
+ payload = yaml.safe_load(file)
45
+ if payload is None:
46
+ raise ValueError("The YAML file is empty.")
47
+ examples = payload.get("examples", [])
48
+ except FileNotFoundError:
49
+ error(f"YAML file not found: {file_path}")
50
+ raise FileNotFoundError(f"The file {file_path} was not found.")
51
+ except yaml.YAMLError:
52
+ error(f"Invalid YAML file: {file_path}")
53
+ raise ValueError(f"The file {file_path} is not a valid YAML file.")
54
+
55
+ info(f"Added {len(examples)} examples from YAML")
56
+ new_examples = [Example(**e) for e in examples]
57
+ return new_examples
@@ -0,0 +1,214 @@
1
+ Metadata-Version: 2.4
2
+ Name: judgeval
3
+ Version: 0.0.37
4
+ Summary: Judgeval Package
5
+ Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
+ Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
7
+ Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
8
+ License-Expression: Apache-2.0
9
+ License-File: LICENSE.md
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3
12
+ Requires-Python: >=3.11
13
+ Requires-Dist: anthropic
14
+ Requires-Dist: boto3
15
+ Requires-Dist: google-genai
16
+ Requires-Dist: langchain-anthropic
17
+ Requires-Dist: langchain-core
18
+ Requires-Dist: langchain-huggingface
19
+ Requires-Dist: langchain-openai
20
+ Requires-Dist: litellm==1.38.12
21
+ Requires-Dist: nest-asyncio
22
+ Requires-Dist: openai
23
+ Requires-Dist: pandas
24
+ Requires-Dist: python-dotenv==1.0.1
25
+ Requires-Dist: requests
26
+ Requires-Dist: together
27
+ Description-Content-Type: text/markdown
28
+
29
+ <div align="center">
30
+
31
+ <img src="assets/logo-light.svg#gh-light-mode-only" alt="Judgment Logo" width="400" />
32
+ <img src="assets/logo-dark.svg#gh-dark-mode-only" alt="Judgment Logo" width="400" />
33
+
34
+ **Build monitoring & evaluation pipelines for complex agents**
35
+
36
+ [Website](https://www.judgmentlabs.ai/) • [Twitter/X](https://x.com/JudgmentLabs) • [LinkedIn](https://www.linkedin.com/company/judgmentlabs) • [Documentation](https://judgment.mintlify.app/getting_started) • [Demos](https://www.youtube.com/@AlexShan-j3o)
37
+
38
+ </div>
39
+
40
+ ## 🚀 What is Judgeval?
41
+
42
+ Judgeval is an open-source tool for testing, monitoring, and optimizing AI agents. Judgeval is created and maintained by [Judgment Labs](https://judgmentlabs.ai/).
43
+
44
+
45
+ **🔍 Tracing**
46
+ * Automatic agent tracing for common agent frameworks and SDKs (LangGraph, OpenAI, Anthropic, etc.)
47
+ * Track input/output, latency, cost, token usage at every step
48
+ * Function tracing with `@judgment.observe` decorator
49
+
50
+ **🧪 Evals**
51
+ * Plug-and-measure 15+ metrics, including:
52
+ * Tool call accuracy
53
+ * Hallucinations
54
+ * Instruction adherence
55
+ * Retrieval context recall
56
+
57
+ Our metric implementations are research-backed by Stanford and Berkeley AI labs. Check out our [research](https://judgmentlabs.ai/research)!
58
+ * Build custom evaluators that seamlessly connect with our infrastructure!
59
+ * Use our evals for:
60
+ * ⚠️ Unit-testing your agent
61
+ * 🔬 Experimentally testing new prompts and models
62
+ * 🛡️ Online evaluations to guardrail your agent's actions and responses
63
+
64
+ **📊 Datasets**
65
+ * Export trace data to datasets hosted on Judgment's Platform and export to JSON, Parquet, S3, etc.
66
+ * Run evals on datasets as unit-tests or to A/B test agent configs
67
+
68
+ **💡 Insights**
69
+ * Error clustering groups agent failures to uncover failure patterns and speed up root cause analysis
70
+ * Trace agent failures to their exact source. Judgment's Osiris agent localizes errors to specific agent components, enabling precise, targeted fixes.
71
+
72
+
73
+ ## 🛠️ Installation
74
+
75
+ Get started with Judgeval by installing our SDK using pip:
76
+
77
+ ```bash
78
+ pip install judgeval
79
+ ```
80
+
81
+ Ensure you have your `JUDGMENT_API_KEY` environment variable set to connect to the [Judgment platform](https://app.judgmentlabs.ai/). If you don't have a key, create an account on the platform!
82
+
83
+ ## 🏁 Get Started
84
+
85
+ Here's how you can quickly start using Judgeval:
86
+
87
+ ### 🛰️ Tracing
88
+
89
+ Track your agent execution with full observability with just a few lines of code.
90
+ Create a file named `traces.py` with the following code:
91
+
92
+ ```python
93
+ from judgeval.common.tracer import Tracer, wrap
94
+ from openai import OpenAI
95
+
96
+ client = wrap(OpenAI())
97
+ judgment = Tracer(project_name="my_project")
98
+
99
+ @judgment.observe(span_type="tool")
100
+ def my_tool():
101
+ return "What's the capital of the U.S.?"
102
+
103
+ @judgment.observe(span_type="function")
104
+ def main():
105
+ task_input = my_tool()
106
+ res = client.chat.completions.create(
107
+ model="gpt-4.1",
108
+ messages=[{"role": "user", "content": f"{task_input}"}]
109
+ )
110
+ return res.choices[0].message.content
111
+
112
+ main()
113
+ ```
114
+
115
+ [Click here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation.
116
+
117
+ ### 📝 Offline Evaluations
118
+
119
+ You can evaluate your agent's execution to measure quality metrics such as hallucination.
120
+ Create a file named `evaluate.py` with the following code:
121
+
122
+ ```python evaluate.py
123
+ from judgeval import JudgmentClient
124
+ from judgeval.data import Example
125
+ from judgeval.scorers import FaithfulnessScorer
126
+
127
+ client = JudgmentClient()
128
+
129
+ example = Example(
130
+ input="What if these shoes don't fit?",
131
+ actual_output="We offer a 30-day full refund at no extra cost.",
132
+ retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
133
+ )
134
+
135
+ scorer = FaithfulnessScorer(threshold=0.5)
136
+ results = client.run_evaluation(
137
+ examples=[example],
138
+ scorers=[scorer],
139
+ model="gpt-4.1",
140
+ )
141
+ print(results)
142
+ ```
143
+
144
+ [Click here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation.
145
+
146
+ ### 📡 Online Evaluations
147
+
148
+ Apply performance monitoring to measure the quality of your systems in production, not just on traces.
149
+
150
+ Using the same `traces.py` file we created earlier, modify `main` function:
151
+
152
+ ```python
153
+ from judgeval.common.tracer import Tracer, wrap
154
+ from judgeval.scorers import AnswerRelevancyScorer
155
+ from openai import OpenAI
156
+
157
+ client = wrap(OpenAI())
158
+ judgment = Tracer(project_name="my_project")
159
+
160
+ @judgment.observe(span_type="tool")
161
+ def my_tool():
162
+ return "Hello world!"
163
+
164
+ @judgment.observe(span_type="function")
165
+ def main():
166
+ task_input = my_tool()
167
+ res = client.chat.completions.create(
168
+ model="gpt-4.1",
169
+ messages=[{"role": "user", "content": f"{task_input}"}]
170
+ ).choices[0].message.content
171
+
172
+ judgment.get_current_trace().async_evaluate(
173
+ scorers=[AnswerRelevancyScorer(threshold=0.5)],
174
+ input=task_input,
175
+ actual_output=res,
176
+ model="gpt-4.1"
177
+ )
178
+ print("Online evaluation submitted.")
179
+ return res
180
+
181
+ main()
182
+ ```
183
+
184
+ [Click here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation.
185
+
186
+ ## 🏢 Self-Hosting
187
+
188
+ Run Judgment on your own infrastructure: we provide comprehensive self-hosting capabilities that give you full control over the backend and data plane that Judgeval interfaces with.
189
+
190
+ ### Key Features
191
+ * Deploy Judgment on your own AWS account
192
+ * Store data in your own Supabase instance
193
+ * Access Judgment through your own custom domain
194
+
195
+ ### Getting Started
196
+ 1. Check out our [self-hosting documentation](https://judgment.mintlify.app/self_hosting/get_started) for detailed setup instructions, along with how your self-hosted instance can be accessed
197
+ 2. Use the [Judgment CLI](https://github.com/JudgmentLabs/judgment-cli) to deploy your self-hosted environment
198
+ 3. After your self-hosted instance is set up, make sure the `JUDGMENT_API_URL` environment variable is set to your self-hosted backend endpoint
199
+
200
+ ## ⭐ Star Us on GitHub
201
+
202
+ If you find Judgeval useful, please consider giving us a star on GitHub! Your support helps us grow our community and continue improving the product.
203
+
204
+ ## 🤝 Contributing
205
+
206
+ There are many ways to contribute to Judgeval:
207
+
208
+ - Submit [bug reports](https://github.com/JudgmentLabs/judgeval/issues) and [feature requests](https://github.com/JudgmentLabs/judgeval/issues)
209
+ - Review the documentation and submit [Pull Requests](https://github.com/JudgmentLabs/judgeval/pulls) to improve it
210
+ - Speak or write about Judgment and let us know!
211
+
212
+ ## Documentation and Demos
213
+
214
+ For more detailed documentation, please check out our [developer docs](https://judgment.mintlify.app/getting_started) and some of our [demo videos](https://www.youtube.com/@AlexShan-j3o) for reference!
@@ -1,43 +1,44 @@
1
1
  judgeval/__init__.py,sha256=x9HWt4waJwJMAqTuJSg2MezF9Zg-macEjeU-ajbly-8,330
2
2
  judgeval/clients.py,sha256=6VQmEqmfCngUdS2MuPBIpHvtDFqOENm8-_BmMvjLyRQ,944
3
- judgeval/constants.py,sha256=_XmVAkebMyGrDvvanAVlMgVd4p6MLHdEVsTQFI0kz1k,5411
4
- judgeval/evaluation_run.py,sha256=WGzx-Ug2qhSmunFo8NrmSstBRsOUc5KpKq0Lc51rqsM,6739
5
- judgeval/judgment_client.py,sha256=brRYmphZR-2IUre9kdOhfse1mYDilcIqUzzH21ROAdk,22208
3
+ judgeval/constants.py,sha256=KDHkZbzk-nr9uP-jsWUwpfaQSf4YkVfbO1o8w93-GME,5707
4
+ judgeval/evaluation_run.py,sha256=MnlDSCXXi1vhTNTYC1XgPAl2BVG_ivNeFzIyfaw4Dho,6761
5
+ judgeval/judgment_client.py,sha256=SiqazgyKkGsCVZ7J6XeL3Fvc51Oz7TM7yBgSfguJ0wQ,23625
6
6
  judgeval/rules.py,sha256=jkh1cXXcUf8oRY7xJUZfcQBYWn_rjUW4GvrhRt15PeU,20265
7
- judgeval/run_evaluation.py,sha256=elMpFHahyeukKKa09fmJM3c_afwJ00mbZRqm18l5f00,28481
7
+ judgeval/run_evaluation.py,sha256=66ppcGpCc08WrK46gWMJjktGzAg5alAWzRb9ncv9DTM,34555
8
8
  judgeval/version_check.py,sha256=bvJEidB7rAeXozoUbN9Yb97QOR_s2hgvpvj74jJ5HlY,943
9
9
  judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
10
10
  judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
11
11
  judgeval/common/logger.py,sha256=KO75wWXCxhUHUMvLaTU31ZzOk6tkZBa7heQ7y0f-zFE,6062
12
12
  judgeval/common/s3_storage.py,sha256=W8wq9S7qJZdqdBR4sk3aEZ4K3-pz40DOoolOJrWs9Vo,3768
13
- judgeval/common/tracer.py,sha256=YsObK8VQXp1DDbU9xncU8NjuY-JUI54BqmG4olezrZc,92507
14
- judgeval/common/utils.py,sha256=LUQV5JfDr6wj7xHAJoNq-gofNZ6mjXbeKrGKzBME1KM,33533
15
- judgeval/data/__init__.py,sha256=xuKx_KCVHGp6CXvQuVmKl3v7pJp-qDaz0NccKxwjtO0,481
13
+ judgeval/common/tracer.py,sha256=5Pt49vSVK2zxh6p9nMW87Ju1eN3-M_mziSF_mDucXkA,87211
14
+ judgeval/common/utils.py,sha256=oYjUgW8eL1qkzBe_tOz1WbCB6LbHWYvZN38XyXof8Eo,34086
15
+ judgeval/data/__init__.py,sha256=-i7cuSBHrSTMf3UiIbFXwp56y15QJ7pmQeQK1yprhqM,561
16
16
  judgeval/data/custom_example.py,sha256=QRBqiRiZS8UgVeTRHY0r1Jzm6yAYsyg6zmHxQGxdiQs,739
17
- judgeval/data/example.py,sha256=cJrmPGLel_P2sy1UaRvuVSAi35EnA9XMR11Lhp4aDLo,5930
17
+ judgeval/data/example.py,sha256=EVniiTpaut2wlTS1u3MxB983odCRLSa9RJ74iAsR0wg,6929
18
18
  judgeval/data/result.py,sha256=Gb9tiSDsk1amXgh0cFG6JmlW_BMKxS2kuTwNA0rrHjA,3184
19
19
  judgeval/data/scorer_data.py,sha256=JVlaTx1EP2jw2gh3Vgx1CSEsvIFABAN26IquKyxwiJQ,3273
20
- judgeval/data/sequence.py,sha256=FmKVdzQP5VTujRCHDWk097MKRR-rJgbsdrxyCKee6tA,1994
21
- judgeval/data/sequence_run.py,sha256=RmYjfWKMWg-pcF5PLeiWfrhuDkjDZi5VEmAIEXN3Ib0,2104
20
+ judgeval/data/sequence.py,sha256=vat_N0Acr64yfu97AfsaVPNrv2LqBiWCuYIoQWuxYwo,2074
21
+ judgeval/data/sequence_run.py,sha256=XPa-MvwRK6ABKQtpMdmGHnoyL1KrgzQUjqItpLDc8U0,2213
22
+ judgeval/data/trace.py,sha256=aRNwtJGebsm5MerVlZ3HKzviNAMpzydyUs88rs-BZ5Q,4899
22
23
  judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
23
- judgeval/data/datasets/dataset.py,sha256=dhLo30hvpmmOK2R6O5wDs_neawUJ4lS8bb4S42SufNQ,13034
24
- judgeval/data/datasets/eval_dataset_client.py,sha256=xjj66BO9Es9IxXqzQe1RT_e0kpeKlt7OrhRoSuj4KHM,15085
25
- judgeval/integrations/langgraph.py,sha256=J-cQfFP52TjJewdSTe-fcsUC4HDvjNbXoxmbmF0SgiE,11743
24
+ judgeval/data/datasets/dataset.py,sha256=1o-SMG96yYkSCxqUTItKKfeFFLhQXrE2cKPYEvlNeGw,13044
25
+ judgeval/data/datasets/eval_dataset_client.py,sha256=uirHpkpLOfygXIz0xKAGTPx1qjbBTzdLFQK6yyoZduU,17544
26
+ judgeval/integrations/langgraph.py,sha256=nQ6KAi9giirnWmD35i4CfoFYKzajh5ElvKh4t6Yasgs,122617
26
27
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
27
28
  judgeval/judges/base_judge.py,sha256=ch_S7uBB7lyv44Lf1d7mIGFpveOO58zOkkpImKgd9_4,994
28
29
  judgeval/judges/litellm_judge.py,sha256=EIL58Teptv8DzZUO3yP2RDQCDq-aoBB6HPZzPdK6KTg,2424
29
30
  judgeval/judges/mixture_of_judges.py,sha256=IJoi4Twk8ze1CJWVEp69k6TSqTCTGrmVYQ0qdffer60,15549
30
31
  judgeval/judges/together_judge.py,sha256=l00hhPerAZXg3oYBd8cyMtWsOTNt_0FIqoxhKJKQe3k,2302
31
32
  judgeval/judges/utils.py,sha256=9lvUxziGV86ISvVFxYBWc09TWFyAQgUTyPf_a9mD5Rs,2686
32
- judgeval/scorers/__init__.py,sha256=Mk-mWUt_gNpJqY_WIEuQynD6fxc34fWSRSuobMSrj94,1238
33
+ judgeval/scorers/__init__.py,sha256=-4GLkYiLKI_BxpoIfgadCFEUfqJcBWZLAtfrInjZT0Q,1282
33
34
  judgeval/scorers/api_scorer.py,sha256=NQ_CrrUPhSUk1k2Q8rKpCG_TU2FT32sFEqvb-Yi54B0,2688
34
35
  judgeval/scorers/exceptions.py,sha256=eGW5CuJgZ5YJBFrE4FHDSF651PO1dKAZ379mJ8gOsfo,178
35
36
  judgeval/scorers/judgeval_scorer.py,sha256=79-JJurqHP-qTaWNWInx4SjvQYwXc9lvfPPNgwsh2yA,6773
36
37
  judgeval/scorers/prompt_scorer.py,sha256=PaAs2qRolw1P3_I061Xvk9qzvF4O-JR8g_39RqXnHcM,17728
37
- judgeval/scorers/score.py,sha256=r9QiT4-LIvivcJ6XxByrbswKSO8eQTtAD1UlXT_lcmo,18741
38
+ judgeval/scorers/score.py,sha256=m9luk5ZLeUCual5CpI-9ZR9nqR3eC9wJLVT87SFPN6g,18747
38
39
  judgeval/scorers/utils.py,sha256=iHQVTlIANbmCTXz9kTeSdOytgUZ_T74Re61ajqsk_WQ,6827
39
40
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
40
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=_sDUBxSG536KGqXNi6dFpaYKghjEAadxBxaaxV9HuuE,1764
41
+ judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=QhHKpl6kNEXxuwriSEwQ5gIIxb7NeHZ1H_7SAZhQiQk,1872
41
42
  judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=Fnd9CVIOZ73sWEWymsU5eBrrZqPFjMZ0BKpeW-PDyTg,711
42
43
  judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=oETeN9K0HSIRdL2SDqn82Vskpwh5SlKnZvs5VDm2OBU,658
43
44
  judgeval/scorers/judgeval_scorers/api_scorers/comparison.py,sha256=kuzf9OWvpY38yYSwlBgneLkUZwJNM4FQqvbS66keA90,1249
@@ -52,12 +53,14 @@ judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py,sha256=k5gDOki-8K
52
53
  judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=XnSGEkQfwVqaqnHEGMCsxNiHVzrsrej48uDbLoWc8CQ,678
53
54
  judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py,sha256=mMKEuR87_yanEuZJ5YSGFMHDD_oLVZ6-rQuciFaDOMA,1095
54
55
  judgeval/scorers/judgeval_scorers/api_scorers/summarization.py,sha256=QmWB8bVbDYHY5FcF0rYZE_3c2XXgMLRmR6aXJWfdMC4,655
56
+ judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=yBd5KU4wBTHFMi1B6D8hRdPOYYQl4uD7Z-xCW4yk5-E,427
55
57
  judgeval/scorers/judgeval_scorers/classifiers/__init__.py,sha256=Qt81W5ZCwMvBAne0LfQDb8xvg5iOG1vEYP7WizgwAZo,67
56
58
  judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py,sha256=8iTzMvou1Dr8pybul6lZHKjc9Ye2-0_racRGYkhEdTY,74
57
59
  judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py,sha256=ly72Z7s_c8NID6-nQnuW8qEGEW2MqdvpJ-5WfXzbAQg,2579
58
60
  judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
59
61
  judgeval/utils/alerts.py,sha256=O19Xj7DA0YVjl8PWiuH4zfdZeu3yiLVvHfY8ah2wG0g,2759
60
- judgeval-0.0.35.dist-info/METADATA,sha256=oAaDqpJCCZxUBOoVPTFbSjZgZ5xJMpGTxjngoJqmTO8,6126
61
- judgeval-0.0.35.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
62
- judgeval-0.0.35.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
63
- judgeval-0.0.35.dist-info/RECORD,,
62
+ judgeval/utils/data_utils.py,sha256=pB4GBWi8XoM2zSR2NlLXH5kqcQ029BVhDxaVKkdmiBY,1860
63
+ judgeval-0.0.37.dist-info/METADATA,sha256=0XKc4BJUpG8qnB3a9afhh8l7H_C19ritSJv95ogifXw,7742
64
+ judgeval-0.0.37.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
65
+ judgeval-0.0.37.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
66
+ judgeval-0.0.37.dist-info/RECORD,,
@@ -1,170 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: judgeval
3
- Version: 0.0.35
4
- Summary: Judgeval Package
5
- Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
- Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
7
- Author-email: Andrew Li <andrew@judgmentlabs.ai>, Alex Shan <alex@judgmentlabs.ai>, Joseph Camyre <joseph@judgmentlabs.ai>
8
- License-Expression: Apache-2.0
9
- License-File: LICENSE.md
10
- Classifier: Operating System :: OS Independent
11
- Classifier: Programming Language :: Python :: 3
12
- Requires-Python: >=3.11
13
- Requires-Dist: anthropic
14
- Requires-Dist: boto3==1.38.3
15
- Requires-Dist: fastapi
16
- Requires-Dist: google-genai
17
- Requires-Dist: langchain
18
- Requires-Dist: langchain-anthropic
19
- Requires-Dist: langchain-core
20
- Requires-Dist: langchain-huggingface
21
- Requires-Dist: langchain-openai
22
- Requires-Dist: litellm==1.38.12
23
- Requires-Dist: nest-asyncio
24
- Requires-Dist: openai
25
- Requires-Dist: openpyxl
26
- Requires-Dist: pandas
27
- Requires-Dist: pika
28
- Requires-Dist: python-dotenv==1.0.1
29
- Requires-Dist: requests
30
- Requires-Dist: supabase
31
- Requires-Dist: together
32
- Requires-Dist: uvicorn
33
- Provides-Extra: dev
34
- Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
35
- Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
36
- Requires-Dist: pytest>=8.3.4; extra == 'dev'
37
- Requires-Dist: tavily-python; extra == 'dev'
38
- Description-Content-Type: text/markdown
39
-
40
- # Judgeval SDK
41
-
42
- Judgeval is an open-source framework for building evaluation pipelines for multi-step agent workflows, supporting both real-time and experimental evaluation setups. To learn more about Judgment or sign up for free, visit our [website](https://www.judgmentlabs.ai/) or check out our [developer docs](https://judgment.mintlify.app/getting_started).
43
-
44
- ## Features
45
-
46
- - **Development and Production Evaluation Layer**: Offers a robust evaluation layer for multi-step agent applications, including unit-testing and performance monitoring.
47
- - **Plug-and-Evaluate**: Integrate LLM systems with 10+ research-backed metrics, including:
48
- - Hallucination detection
49
- - RAG retriever quality
50
- - And more
51
- - **Custom Evaluation Pipelines**: Construct powerful custom evaluation pipelines tailored for your LLM systems.
52
- - **Monitoring in Production**: Utilize state-of-the-art real-time evaluation foundation models to monitor LLM systems effectively.
53
-
54
- ## Installation
55
-
56
- ```bash
57
- pip install judgeval
58
- ```
59
-
60
- ## Quickstart: Evaluations
61
-
62
- You can evaluate your workflow execution data to measure quality metrics such as hallucination.
63
-
64
- Create a file named `evaluate.py` with the following code:
65
-
66
- ```python
67
- from judgeval import JudgmentClient
68
- from judgeval.data import Example
69
- from judgeval.scorers import FaithfulnessScorer
70
-
71
- client = JudgmentClient()
72
-
73
- example = Example(
74
- input="What if these shoes don't fit?",
75
- actual_output="We offer a 30-day full refund at no extra cost.",
76
- retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
77
- )
78
-
79
- scorer = FaithfulnessScorer(threshold=0.5)
80
- results = client.run_evaluation(
81
- examples=[example],
82
- scorers=[scorer],
83
- model="gpt-4o",
84
- )
85
- print(results)
86
- ```
87
- Click [here](https://judgment.mintlify.app/getting_started#create-your-first-experiment) for a more detailed explanation
88
-
89
- ## Quickstart: Traces
90
-
91
- Track your workflow execution for full observability with just a few lines of code.
92
-
93
- Create a file named `traces.py` with the following code:
94
-
95
- ```python
96
- from judgeval.common.tracer import Tracer, wrap
97
- from openai import OpenAI
98
-
99
- # Basic initialization
100
- client = wrap(OpenAI())
101
- judgment = Tracer(project_name="my_project")
102
-
103
- # Or with S3 storage enabled
104
- # NOTE: Make sure AWS creds correspond to an account with write access to the specified S3 bucket
105
- judgment = Tracer(
106
- project_name="my_project",
107
- use_s3=True,
108
- s3_bucket_name="my-traces-bucket", # Bucket created automatically if it doesn't exist
109
- s3_aws_access_key_id="your-access-key", # Optional: defaults to AWS_ACCESS_KEY_ID env var
110
- s3_aws_secret_access_key="your-secret-key", # Optional: defaults to AWS_SECRET_ACCESS_KEY env var
111
- s3_region_name="us-west-1" # Optional: defaults to AWS_REGION env var or "us-west-1"
112
- )
113
-
114
- @judgment.observe(span_type="tool")
115
- def my_tool():
116
- return "Hello world!"
117
-
118
- @judgment.observe(span_type="function")
119
- def main():
120
- task_input = my_tool()
121
- res = client.chat.completions.create(
122
- model="gpt-4o",
123
- messages=[{"role": "user", "content": f"{task_input}"}]
124
- )
125
- return res.choices[0].message.content
126
- ```
127
- Click [here](https://judgment.mintlify.app/getting_started#create-your-first-trace) for a more detailed explanation
128
-
129
- ## Quickstart: Online Evaluations
130
-
131
- Apply performance monitoring to measure the quality of your systems in production, not just on historical data.
132
-
133
- Using the same traces.py file we created earlier:
134
-
135
- ```python
136
- from judgeval.common.tracer import Tracer, wrap
137
- from judgeval.scorers import AnswerRelevancyScorer
138
- from openai import OpenAI
139
-
140
- client = wrap(OpenAI())
141
- judgment = Tracer(project_name="my_project")
142
-
143
- @judgment.observe(span_type="tool")
144
- def my_tool():
145
- return "Hello world!"
146
-
147
- @judgment.observe(span_type="function")
148
- def main():
149
- task_input = my_tool()
150
- res = client.chat.completions.create(
151
- model="gpt-4o",
152
- messages=[{"role": "user", "content": f"{task_input}"}]
153
- ).choices[0].message.content
154
-
155
- judgment.get_current_trace().async_evaluate(
156
- scorers=[AnswerRelevancyScorer(threshold=0.5)],
157
- input=task_input,
158
- actual_output=res,
159
- model="gpt-4o"
160
- )
161
-
162
- return res
163
- ```
164
- Click [here](https://judgment.mintlify.app/getting_started#create-your-first-online-evaluation) for a more detailed explanation
165
-
166
- ## Documentation and Demos
167
-
168
- For more detailed documentation, please check out our [docs](https://judgment.mintlify.app/getting_started) and some of our [demo videos](https://www.youtube.com/@AlexShan-j3o) for reference!
169
-
170
- ##