judgeval 0.1.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.1.0
3
+ Version: 0.3.0
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -14,6 +14,7 @@ Requires-Dist: anthropic
14
14
  Requires-Dist: boto3
15
15
  Requires-Dist: datamodel-code-generator>=0.31.1
16
16
  Requires-Dist: google-genai
17
+ Requires-Dist: groq>=0.30.0
17
18
  Requires-Dist: langchain-anthropic
18
19
  Requires-Dist: langchain-core
19
20
  Requires-Dist: langchain-huggingface
@@ -22,6 +23,9 @@ Requires-Dist: litellm>=1.61.15
22
23
  Requires-Dist: matplotlib>=3.10.3
23
24
  Requires-Dist: nest-asyncio
24
25
  Requires-Dist: openai
26
+ Requires-Dist: opentelemetry-api>=1.34.1
27
+ Requires-Dist: opentelemetry-sdk>=1.34.1
28
+ Requires-Dist: orjson>=3.9.0
25
29
  Requires-Dist: pandas
26
30
  Requires-Dist: python-dotenv==1.0.1
27
31
  Requires-Dist: python-slugify>=8.0.4
@@ -39,7 +43,7 @@ Description-Content-Type: text/markdown
39
43
  Enable self-learning agents with traces, evals, and environment data.
40
44
  </div>
41
45
 
42
- ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
46
+ ## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
43
47
 
44
48
  [Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
45
49
 
@@ -139,7 +143,7 @@ run_agent("What is the capital of the United States?")
139
143
  ```
140
144
  You'll see your trace exported to the Judgment Platform:
141
145
 
142
- <p align="center"><img src="assets/trace_demo.png" alt="Judgment Platform Trace Example" width="800" /></p>
146
+ <p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
143
147
 
144
148
 
145
149
  [Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
@@ -152,9 +156,9 @@ You'll see your trace exported to the Judgment Platform:
152
156
 
153
157
  | | |
154
158
  |:---|:---:|
155
- | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
156
- | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
157
- | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
159
+ | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
160
+ | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
161
+ | <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
158
162
  | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
159
163
 
160
164
  ## 🏢 Self-Hosting
@@ -1,47 +1,45 @@
1
1
  judgeval/__init__.py,sha256=HM1M8hmqRum6G554QKkXhB4DF4f5eh_xtYo0Kf-t3kw,332
2
2
  judgeval/clients.py,sha256=JnB8n90GyXiYaGmSEYaA67mdJSnr3SIrzArao7NGebw,980
3
3
  judgeval/constants.py,sha256=rfl4gW9_4irxgamjTC-jvDj2ATSUrjEu0UAHZ4pLLtY,4081
4
- judgeval/evaluation_run.py,sha256=PZeoKS7JCsO2gzdo8jeq8786yn01Ccrq0xuCNUu9CPo,2797
5
- judgeval/judgment_client.py,sha256=tUgKS2sV8QZUxjdh3mP2PSBnC7Bci1e8ur8muvrgzBM,14011
4
+ judgeval/dataset.py,sha256=rjV54XNTslNNtf-Uu2ndDIh602ZwSCFhPg2NuckDJ-w,6081
5
+ judgeval/evaluation_run.py,sha256=edNpO444Fwt2ykWsflIzlYdDJUlUfbpXHHQSKfFS4y0,2876
6
+ judgeval/judgment_client.py,sha256=vPoxbmxAlhbG5rXXqxWjMbyEqOI044BaQanr1fev2CE,11723
6
7
  judgeval/rules.py,sha256=CoQjqmP8daEXewMkplmA-7urubDtweOr5O6z8klVwLI,20031
7
- judgeval/run_evaluation.py,sha256=U-aZyhSryjqzJl5jInc91uY8jIyiY596S6JJO3fH6AI,26105
8
+ judgeval/run_evaluation.py,sha256=hOKDdymH8VPQ8LPXJOVku41nwDDYLSCtHQIlIzFNYaE,27654
8
9
  judgeval/version_check.py,sha256=FoLEtpCjDw2HuDQdpw5yT29UtwumSc6ZZN6AV_c9Mnw,1057
9
10
  judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
10
11
  judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
11
12
  judgeval/common/logger.py,sha256=514eFLYWS_UL8VY-zAR2ePUlpQe4rbYlleLASFllLE4,1511
12
- judgeval/common/utils.py,sha256=GhCEv8i_7JK4DJeUlMmibqEUy9ZVHxJAlFO_CriAzg4,34323
13
+ judgeval/common/utils.py,sha256=oxGDRVWOICKWeyGgsoc36_yAyHSYF4XtH842Mkznwis,34739
13
14
  judgeval/common/api/__init__.py,sha256=-E7lpZz1fG8puR_aYUMfPmQ-Vyhd0bgzoaU5EhIuFjQ,114
14
- judgeval/common/api/api.py,sha256=BGtAGGRDqxs8DrA0ye8BPZ6KBsgJ2C0Dca4vvA55d6g,13049
15
- judgeval/common/api/constants.py,sha256=azA0eyz4q33SWS795NHhaKDKNmVHBWAAGe5_sk37nDU,4297
15
+ judgeval/common/api/api.py,sha256=wty02HYANeOYlM8fHOLc33ux5bu9Ieq7iRqCr-UP0ng,14157
16
+ judgeval/common/api/constants.py,sha256=9B7y-oaqAspWZLHEf8wkfWf0OzpDyu3AKD1sfq4JsB0,4566
16
17
  judgeval/common/storage/__init__.py,sha256=a-PI7OL-ydyzugGUKmJKRBASnK-Q-gs82L9K9rSyJP8,90
17
- judgeval/common/storage/s3_storage.py,sha256=UvAKGSa0S1BnNprzDKHMAfyT-8zlMAOM5kCrXcVN0HE,3743
18
+ judgeval/common/storage/s3_storage.py,sha256=0-bNKheqJJyBZ92KGrzQtd1zocIRWBlfn_58L4a-Ay0,3719
18
19
  judgeval/common/tracer/__init__.py,sha256=tJCJsmVmrL89Phv88gNCJ-j0ITPez6lh8vhMAAlLNSc,795
19
20
  judgeval/common/tracer/constants.py,sha256=yu5y8gMe5yb1AaBkPtAH-BNwIaAR3NwYCRoSf45wp5U,621
20
- judgeval/common/tracer/core.py,sha256=Ij-KDD3dVXHK_6NPk-VbTH_Mo8GZq5h4Zl5ii5QMjnE,72403
21
+ judgeval/common/tracer/core.py,sha256=blHEh61CE5kZLYCgyRF4kU6dVzi_Ko6DrnBpw2-jByI,73973
21
22
  judgeval/common/tracer/otel_exporter.py,sha256=kZLlOQ6afQE4dmb9H1wgU4P3H5PG1D_zKyvnpWcT5Ak,3899
22
- judgeval/common/tracer/otel_span_processor.py,sha256=3cMETvrNlwrTkS_XDdTNRhjVw_6TdgnojpQhDK9sbOs,7484
23
+ judgeval/common/tracer/otel_span_processor.py,sha256=W7SM62KnxJ48vC9WllIHRKaLlvxkCwqYoT4KqZLfGNs,6497
23
24
  judgeval/common/tracer/span_processor.py,sha256=eFjTgSWSkM6BWE94CrvgafDg_WkxLsFL_MafwBG-p9M,1145
24
- judgeval/common/tracer/span_transformer.py,sha256=YIHEmr35o6_uX931JbD1PFIcLIWTVumWrJ198Ys391k,7544
25
+ judgeval/common/tracer/span_transformer.py,sha256=nCnwRC52OKfYRFnsOwGdPaqb_U17yn5S_9jfhv1GaLM,7803
25
26
  judgeval/common/tracer/trace_manager.py,sha256=7KLWBrz5GE_138DHL_eRjhx4-LNfXKz1q_XIDfg6nw8,2992
26
27
  judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
27
- judgeval/data/example.py,sha256=6xtPTwWUsZ0HdErU-g954nCv64fsbnS1I5xuEvs14EA,2027
28
- judgeval/data/judgment_types.py,sha256=s1oea01AEBQBdqQntXhTbMiuDGAxvs2iGoxrR2uLnUw,9538
29
- judgeval/data/result.py,sha256=hHKiMMEl9Qv3EvK5UH8Y5YDu8VyvrHzNqlKatrq4UUY,2450
28
+ judgeval/data/example.py,sha256=kRskIgsjwcvv2Y8jaPwV-PND7zlmMbFsvRVQ_b7SZY0,914
29
+ judgeval/data/judgment_types.py,sha256=7dox20cUlAd4gvDN3iaSsRff-J6p-mxT9dwuhE4Ztr0,8411
30
+ judgeval/data/result.py,sha256=OtSnBUrdQpjyAqxXRLTW3wC9v9lOm_GqzL14ccRQxrg,2124
30
31
  judgeval/data/scorer_data.py,sha256=5QBHtvOIWOq0Rn9_uPJzAMRYMlWxMB-rXnG_6kV4Z4Y,2955
31
32
  judgeval/data/tool.py,sha256=iWQSdy5uNbIeACu3gQy1DC2oGYxRVYNfkkczWdQMAiA,99
32
- judgeval/data/trace.py,sha256=_cyCsyg2gwG7lyyv186xo4OvGH2QlJDuyIg-qh-CZNA,6994
33
+ judgeval/data/trace.py,sha256=tDOuYFPUssQInjsmwyxcXq-W3IB29Vq340VzqafuKJc,6942
33
34
  judgeval/data/trace_run.py,sha256=c6pRSv09Vj016hxM49I3kMftCwWg8hhkfT_1kBXluSI,1600
34
- judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
35
- judgeval/data/datasets/dataset.py,sha256=dDmTYSBRj4YEUhgYOebAcDm4N14nj3tcCqHj9y2Z1z0,12725
36
- judgeval/data/datasets/eval_dataset_client.py,sha256=8tiuwRC3oebc19KY-5b99Cxj0qq6ADW1NMDd1R1RhLc,7258
37
35
  judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
38
- judgeval/data/scripts/openapi_transform.py,sha256=Rye-fErFtENAq3KKBKRUVR_oJdjYZtNzKRBKFkYS0XQ,3857
36
+ judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
39
37
  judgeval/integrations/langgraph.py,sha256=kJXLsgBY7DgsUTZyVQ47deDgHm887brFHfyIbuyerGw,29986
40
38
  judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
41
39
  judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
42
- judgeval/judges/litellm_judge.py,sha256=LX4_KXb1Jp8IXif3vvOiKfRYH7ZkbQLs9AtWPGmj544,2483
43
- judgeval/judges/mixture_of_judges.py,sha256=wcHwLi9zU0uwKMqRVhcPdjiYKgWflX4dpUbU2kS9yg0,14825
44
- judgeval/judges/together_judge.py,sha256=r5k8ZcC6lnsFttGHhrocFtmglx2Cb3G-4ORKAeK-Nmw,2253
40
+ judgeval/judges/litellm_judge.py,sha256=yt6QvwKMmxZcrUtjbn3EiO5aVg7CHM2YZkBCSQLS8jk,2509
41
+ judgeval/judges/mixture_of_judges.py,sha256=cecQ8mRmz2-dDoZl2MGsrhZICkpIvRovGPK3su0kc8s,14889
42
+ judgeval/judges/together_judge.py,sha256=e2tr0bODZEkW45RQNW3eE8Z8XUIc8VUTw-cuwT6eYbw,2293
45
43
  judgeval/judges/utils.py,sha256=0CF9qtIUQUL3-W-qTGpmTjZbkUUBAM6TslDsrCHnTBU,2725
46
44
  judgeval/scorers/__init__.py,sha256=4H_cinTQ4EogZv59YEV-3U9EOTLppNwgAPTi1-jI9Fw,746
47
45
  judgeval/scorers/agent_scorer.py,sha256=TjwD_YglSywr3EowEojiCyg5qDgCRa5LRGc5nFdmIBc,703
@@ -49,8 +47,8 @@ judgeval/scorers/api_scorer.py,sha256=xlhqkeMUBFxl8daSXOTWOYwZjBAz7o6b4sVD5f8cIH
49
47
  judgeval/scorers/base_scorer.py,sha256=eDfQk8N8TQfM1ayJDWr0NTdSQxcbk9-VZHd0Igb9EbI,2878
50
48
  judgeval/scorers/example_scorer.py,sha256=2n45y3LMV1Q-ARyXLHqvVWETlnY1DqS7OLzPu9IBGz8,716
51
49
  judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
52
- judgeval/scorers/score.py,sha256=t9prkpDapcOAyuOXtDHMmwrqVGW0C_Hvx1UIEGyafmI,6610
53
- judgeval/scorers/utils.py,sha256=WM7mTCQSa2Z_rJ-0Iv9dhuBmtkTfV0pFN7XEhxHdzsM,3959
50
+ judgeval/scorers/score.py,sha256=2-M_AmOjIQR2c0qvuB4WIIQD-7zSNdzsWC8ttqltw2g,6601
51
+ judgeval/scorers/utils.py,sha256=HQOYTJtNnsi_aPfMssePAaBbXpAv7LXgwUlWlDFuN2g,3965
54
52
  judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
55
53
  judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=GX4KkwPR2p-c0Y5mZingJa8EUfjAbMGhrmRBDBunOGw,1484
56
54
  judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=zJsU0VrUmRhY9qav48c6jTyDqUwI3JzhV9ajtlJCe0M,544
@@ -65,9 +63,9 @@ judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=Mcp1CjMN
65
63
  judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=Z2FLGBC7m_CLx-CMgXVuTvYvN0vY5yOcWA0ImBkeBfY,787
66
64
  judgeval/tracer/__init__.py,sha256=wkuXtOGDCrwgPPXlh_sSJmvGuWaAMHyNzk1TzB5f9aI,148
67
65
  judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
68
- judgeval/utils/file_utils.py,sha256=wIEn8kjM0WrP216RGU_yhZhFOMWIS5ckigyHbzFSOMk,1774
66
+ judgeval/utils/file_utils.py,sha256=PWHRs8dUr8iDwpglSSk4Yjd7C6ZhDzUaO-jV3m7riHM,1987
69
67
  judgeval/utils/requests.py,sha256=K3gUKrwL6TvwYKVYO5OeLWdUHn9NiUPmnIXhZEiEaHU,1534
70
- judgeval-0.1.0.dist-info/METADATA,sha256=B1v_50ikBR0fiojJY97deNf_VvEZn8fQq9qrxBi38ig,10188
71
- judgeval-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
72
- judgeval-0.1.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
73
- judgeval-0.1.0.dist-info/RECORD,,
68
+ judgeval-0.3.0.dist-info/METADATA,sha256=rENldJCo8cVAjLgiYzmYu1RGByxAmXX63WKLc6owrMo,10348
69
+ judgeval-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
70
+ judgeval-0.3.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
71
+ judgeval-0.3.0.dist-info/RECORD,,
@@ -1,4 +0,0 @@
1
- from judgeval.data.datasets.dataset import EvalDataset
2
- from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
3
-
4
- __all__ = ["EvalDataset", "EvalDatasetClient"]
@@ -1,341 +0,0 @@
1
- import ast
2
- import csv
3
- import datetime
4
- import json
5
- import os
6
- import yaml
7
- from dataclasses import dataclass, field
8
- from typing import List, Union, Literal, Optional
9
-
10
- from judgeval.data import Example, Trace
11
- from judgeval.common.logger import judgeval_logger
12
- from judgeval.utils.file_utils import get_examples_from_yaml
13
-
14
-
15
- @dataclass
16
- class EvalDataset:
17
- examples: List[Example]
18
- traces: List[Trace]
19
- _alias: Union[str, None] = field(default=None)
20
- _id: Union[str, None] = field(default=None)
21
- judgment_api_key: str = field(default="")
22
- organization_id: str = field(default="")
23
-
24
- def __init__(
25
- self,
26
- judgment_api_key: str = os.getenv("JUDGMENT_API_KEY", ""),
27
- organization_id: str = os.getenv("JUDGMENT_ORG_ID", ""),
28
- examples: Optional[List[Example]] = None,
29
- traces: Optional[List[Trace]] = None,
30
- ):
31
- if not judgment_api_key:
32
- judgeval_logger.error("No judgment_api_key provided")
33
- self.examples = examples or []
34
- self.traces = traces or []
35
- self._alias = None
36
- self._id = None
37
- self.judgment_api_key = judgment_api_key
38
- self.organization_id = organization_id
39
-
40
- def add_from_json(self, file_path: str) -> None:
41
- """
42
- Adds examples from a JSON file.
43
-
44
- The format of the JSON file is expected to be a dictionary with one key: "examples".
45
- The value of the key is a list of dictionaries, where each dictionary represents an example.
46
-
47
- The JSON file is expected to have the following format:
48
- {
49
- "examples": [
50
- {
51
- "input": "test input",
52
- "actual_output": "test output",
53
- "expected_output": "expected output",
54
- "context": [
55
- "context1",
56
- "context2"
57
- ],
58
- "retrieval_context": [
59
- "retrieval1"
60
- ],
61
- "additional_metadata": {
62
- "key": "value"
63
- },
64
- "tools_called": [
65
- "tool1"
66
- ],
67
- "expected_tools": [
68
- "tool1",
69
- "tool2"
70
- ],
71
- "name": "test example",
72
- "example_id": null,
73
- "timestamp": "20241230_160117",
74
- "trace_id": "123"
75
- }
76
- ]
77
- }
78
- """
79
- try:
80
- with open(file_path, "r") as file:
81
- payload = json.load(file)
82
- examples = payload.get("examples", [])
83
- except FileNotFoundError:
84
- judgeval_logger.error(f"JSON file not found: {file_path}")
85
- raise FileNotFoundError(f"The file {file_path} was not found.")
86
- except json.JSONDecodeError:
87
- judgeval_logger.error(f"Invalid JSON file: {file_path}")
88
- raise ValueError(f"The file {file_path} is not a valid JSON file.")
89
-
90
- new_examples = [Example(**e) for e in examples]
91
- for e in new_examples:
92
- self.add_example(e)
93
-
94
- def add_from_csv(
95
- self,
96
- file_path: str,
97
- header_mapping: dict,
98
- primary_delimiter: str = ",",
99
- secondary_delimiter: str = ";",
100
- ) -> None:
101
- """
102
- Add Examples from a CSV file.
103
-
104
- Args:
105
- file_path (str): Path to the CSV file
106
- header_mapping (dict): Dictionary mapping Example headers to custom headers
107
- primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
108
- secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
109
- """
110
- try:
111
- import pandas as pd
112
- except ModuleNotFoundError:
113
- raise ModuleNotFoundError(
114
- "Please install pandas to use this method. 'pip install pandas'"
115
- )
116
-
117
- # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
118
- df = pd.read_csv(file_path, dtype={"trace_id": str}, sep=primary_delimiter)
119
- """
120
- The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
121
- Available headers for Example objects are as follows:
122
-
123
- "input", "actual_output", "expected_output", "context", \
124
- "retrieval_context", "additional_metadata", "tools_called", \
125
- "expected_tools", "name", "comments", "source_file", "example", \
126
- "trace_id"
127
-
128
- We want to collect the examples separately which can
129
- be determined by the "example" column. If the value is True, then it is an
130
- example, and we expect the `input` and `actual_output` fields to be non-null.
131
-
132
- We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
133
- This can be adjusted using the `secondary_delimiter` parameter.
134
- """
135
- examples = []
136
-
137
- def process_csv_row(value, header):
138
- """
139
- Maps a singular value in the CSV file to the appropriate type based on the header.
140
- If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
141
- """
142
- # check that the CSV value is not null for entry
143
- null_replacement = dict() if header == "additional_metadata" else None
144
- if pd.isna(value) or value == "":
145
- return null_replacement
146
- try:
147
- value = (
148
- ast.literal_eval(value)
149
- if header == "additional_metadata"
150
- else str(value)
151
- )
152
- except (ValueError, SyntaxError):
153
- value = str(value)
154
- if header in [
155
- "context",
156
- "retrieval_context",
157
- "tools_called",
158
- "expected_tools",
159
- ]:
160
- # attempt to split the value by the secondary delimiter
161
- value = value.split(secondary_delimiter)
162
-
163
- return value
164
-
165
- for _, row in df.iterrows():
166
- data = {
167
- header: process_csv_row(row[header_mapping[header]], header)
168
- for header in header_mapping
169
- }
170
- if "example" in header_mapping and row[header_mapping["example"]]:
171
- if "name" in header_mapping:
172
- data["name"] = (
173
- row[header_mapping["name"]]
174
- if pd.notna(row[header_mapping["name"]])
175
- else None
176
- )
177
- # every Example has `input` and `actual_output` fields
178
- if data["input"] is not None and data["actual_output"] is not None:
179
- e = Example(**data)
180
- examples.append(e)
181
- else:
182
- raise ValueError(
183
- "Every example must have an 'input' and 'actual_output' field."
184
- )
185
-
186
- for e in examples:
187
- self.add_example(e)
188
-
189
- def add_from_yaml(self, file_path: str) -> None:
190
- """
191
- Adds examples from a YAML file.
192
-
193
- The format of the YAML file is expected to be a dictionary with one key: "examples".
194
- The value of the key is a list of dictionaries, where each dictionary represents an example.
195
-
196
- The YAML file is expected to have the following format:
197
- examples:
198
- - input: "test input"
199
- actual_output: "test output"
200
- expected_output: "expected output"
201
- context:
202
- - "context1"
203
- - "context2"
204
- retrieval_context:
205
- - "retrieval1"
206
- additional_metadata:
207
- key: "value"
208
- tools_called:
209
- - "tool1"
210
- expected_tools:
211
- - "tool1"
212
- - "tool2"
213
- name: "test example"
214
- example_id: null
215
- timestamp: "20241230_160117"
216
- trace_id: "123"
217
- """
218
- examples = get_examples_from_yaml(file_path)
219
-
220
- for e in examples:
221
- self.add_example(e)
222
-
223
- def add_example(self, e: Example) -> None:
224
- self.examples.append(e)
225
- # TODO if we need to add rank, then we need to do it here
226
-
227
- def add_trace(self, t: Trace) -> None:
228
- self.traces.append(t)
229
-
230
- def save_as(
231
- self,
232
- file_type: Literal["json", "csv", "yaml"],
233
- dir_path: str,
234
- save_name: str | None = None,
235
- ) -> None:
236
- """
237
- Saves the dataset as a file. Save only the examples.
238
-
239
- Args:
240
- file_type (Literal["json", "csv"]): The file type to save the dataset as.
241
- dir_path (str): The directory path to save the file to.
242
- save_name (str, optional): The name of the file to save. Defaults to None.
243
- """
244
- if not os.path.exists(dir_path):
245
- os.makedirs(dir_path)
246
- file_name = (
247
- datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
248
- if save_name is None
249
- else save_name
250
- )
251
- complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
252
- if file_type == "json":
253
- with open(complete_path, "w") as file:
254
- json.dump(
255
- {
256
- "examples": [e.to_dict() for e in self.examples],
257
- },
258
- file,
259
- indent=4,
260
- )
261
- elif file_type == "csv":
262
- with open(complete_path, "w", newline="") as file:
263
- writer = csv.writer(file)
264
- writer.writerow(
265
- [
266
- "input",
267
- "actual_output",
268
- "expected_output",
269
- "context",
270
- "retrieval_context",
271
- "additional_metadata",
272
- "tools_called",
273
- "expected_tools",
274
- "name",
275
- "comments",
276
- "source_file",
277
- "example",
278
- "trace_id",
279
- ]
280
- )
281
- for e in self.examples:
282
- writer.writerow(
283
- [
284
- e.input,
285
- e.actual_output,
286
- e.expected_output,
287
- ";".join(e.context),
288
- ";".join(e.retrieval_context),
289
- e.additional_metadata,
290
- ";".join(e.tools_called),
291
- ";".join(e.expected_tools),
292
- e.name,
293
- None, # Example does not have comments
294
- None, # Example does not have source file
295
- True, # Adding an Example
296
- ]
297
- )
298
-
299
- elif file_type == "yaml":
300
- with open(complete_path, "w") as file:
301
- yaml_data = {
302
- "examples": [
303
- {
304
- "input": e.input,
305
- "actual_output": e.actual_output,
306
- "expected_output": e.expected_output,
307
- "context": e.context,
308
- "retrieval_context": e.retrieval_context,
309
- "additional_metadata": e.additional_metadata,
310
- "tools_called": e.tools_called,
311
- "expected_tools": e.expected_tools,
312
- "name": e.name,
313
- "comments": None, # Example does not have comments
314
- "source_file": None, # Example does not have source file
315
- "example": True, # Adding an Example
316
- }
317
- for e in self.examples
318
- ],
319
- }
320
- yaml.dump(yaml_data, file, default_flow_style=False)
321
- else:
322
- ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
323
- raise TypeError(
324
- f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
325
- )
326
-
327
- def __iter__(self):
328
- return iter(self.examples)
329
-
330
- def __len__(self):
331
- return len(self.examples)
332
-
333
- def __str__(self):
334
- return (
335
- f"{self.__class__.__name__}("
336
- f"examples={self.examples}, "
337
- f"traces={self.traces}, "
338
- f"_alias={self._alias}, "
339
- f"_id={self._id}"
340
- f")"
341
- )