judgeval-0.2.0-py3-none-any.whl → judgeval-0.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/api.py +38 -7
- judgeval/common/api/constants.py +9 -1
- judgeval/common/storage/s3_storage.py +2 -3
- judgeval/common/tracer/core.py +66 -32
- judgeval/common/tracer/otel_span_processor.py +4 -50
- judgeval/common/tracer/span_transformer.py +16 -10
- judgeval/common/utils.py +46 -38
- judgeval/constants.py +2 -0
- judgeval/data/example.py +9 -37
- judgeval/data/judgment_types.py +23 -45
- judgeval/data/result.py +8 -14
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +3 -4
- judgeval/dataset.py +192 -0
- judgeval/evaluation_run.py +1 -0
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +6 -6
- judgeval/judges/together_judge.py +6 -3
- judgeval/judgment_client.py +9 -71
- judgeval/run_evaluation.py +41 -9
- judgeval/scorers/score.py +11 -7
- judgeval/scorers/utils.py +3 -3
- judgeval/utils/file_utils.py +40 -25
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/METADATA +10 -6
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/RECORD +27 -29
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/WHEEL +0 -0
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: judgeval
|
3
|
-
Version: 0.2.0
|
3
|
+
Version: 0.3.1
|
4
4
|
Summary: Judgeval Package
|
5
5
|
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
|
6
6
|
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
|
@@ -14,6 +14,7 @@ Requires-Dist: anthropic
|
|
14
14
|
Requires-Dist: boto3
|
15
15
|
Requires-Dist: datamodel-code-generator>=0.31.1
|
16
16
|
Requires-Dist: google-genai
|
17
|
+
Requires-Dist: groq>=0.30.0
|
17
18
|
Requires-Dist: langchain-anthropic
|
18
19
|
Requires-Dist: langchain-core
|
19
20
|
Requires-Dist: langchain-huggingface
|
@@ -22,6 +23,9 @@ Requires-Dist: litellm>=1.61.15
|
|
22
23
|
Requires-Dist: matplotlib>=3.10.3
|
23
24
|
Requires-Dist: nest-asyncio
|
24
25
|
Requires-Dist: openai
|
26
|
+
Requires-Dist: opentelemetry-api>=1.34.1
|
27
|
+
Requires-Dist: opentelemetry-sdk>=1.34.1
|
28
|
+
Requires-Dist: orjson>=3.9.0
|
25
29
|
Requires-Dist: pandas
|
26
30
|
Requires-Dist: python-dotenv==1.0.1
|
27
31
|
Requires-Dist: python-slugify>=8.0.4
|
@@ -39,7 +43,7 @@ Description-Content-Type: text/markdown
|
|
39
43
|
Enable self-learning agents with traces, evals, and environment data.
|
40
44
|
</div>
|
41
45
|
|
42
|
-
## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started)
|
46
|
+
## [Docs](https://docs.judgmentlabs.ai/) • [Judgment Cloud](https://app.judgmentlabs.ai/register) • [Self-Host](https://docs.judgmentlabs.ai/documentation/self-hosting/get-started) • [Landing Page](https://judgmentlabs.ai/)
|
43
47
|
|
44
48
|
[Demo](https://www.youtube.com/watch?v=1S4LixpVbcc) • [Bug Reports](https://github.com/JudgmentLabs/judgeval/issues) • [Changelog](https://docs.judgmentlabs.ai/changelog/2025-04-21)
|
45
49
|
|
@@ -139,7 +143,7 @@ run_agent("What is the capital of the United States?")
|
|
139
143
|
```
|
140
144
|
You'll see your trace exported to the Judgment Platform:
|
141
145
|
|
142
|
-
<p align="center"><img src="assets/
|
146
|
+
<p align="center"><img src="assets/online_eval.png" alt="Judgment Platform Trace Example" width="1500" /></p>
|
143
147
|
|
144
148
|
|
145
149
|
[Click here](https://docs.judgmentlabs.ai/documentation/tracing/introduction) for a more detailed explanation.
|
@@ -152,9 +156,9 @@ You'll see your trace exported to the Judgment Platform:
|
|
152
156
|
|
153
157
|
| | |
|
154
158
|
|:---|:---:|
|
155
|
-
| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/
|
156
|
-
| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/
|
157
|
-
| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/
|
159
|
+
| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/agent_trace_example.png" alt="Tracing visualization" width="1200"/></p> |
|
160
|
+
| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/test.png" alt="Evaluation metrics" width="800"/></p> |
|
161
|
+
| <h3>📡 Monitoring</h3>Get Slack alerts for agent failures in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/errors.png" alt="Monitoring Dashboard" width="1200"/></p> |
|
158
162
|
| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
|
159
163
|
|
160
164
|
## 🏢 Self-Hosting
|
@@ -1,47 +1,45 @@
|
|
1
1
|
judgeval/__init__.py,sha256=HM1M8hmqRum6G554QKkXhB4DF4f5eh_xtYo0Kf-t3kw,332
|
2
2
|
judgeval/clients.py,sha256=JnB8n90GyXiYaGmSEYaA67mdJSnr3SIrzArao7NGebw,980
|
3
|
-
judgeval/constants.py,sha256=
|
4
|
-
judgeval/
|
5
|
-
judgeval/
|
3
|
+
judgeval/constants.py,sha256=hWed25HwGUJy-tePbtoUZ0_Zg0X_MkAH84KiH-OHHFI,4150
|
4
|
+
judgeval/dataset.py,sha256=rjV54XNTslNNtf-Uu2ndDIh602ZwSCFhPg2NuckDJ-w,6081
|
5
|
+
judgeval/evaluation_run.py,sha256=edNpO444Fwt2ykWsflIzlYdDJUlUfbpXHHQSKfFS4y0,2876
|
6
|
+
judgeval/judgment_client.py,sha256=vPoxbmxAlhbG5rXXqxWjMbyEqOI044BaQanr1fev2CE,11723
|
6
7
|
judgeval/rules.py,sha256=CoQjqmP8daEXewMkplmA-7urubDtweOr5O6z8klVwLI,20031
|
7
|
-
judgeval/run_evaluation.py,sha256=
|
8
|
+
judgeval/run_evaluation.py,sha256=7J6FHhWhB-IDPMSOcWkrjTpSNm2v3s_KBq8Np3y2pys,27652
|
8
9
|
judgeval/version_check.py,sha256=FoLEtpCjDw2HuDQdpw5yT29UtwumSc6ZZN6AV_c9Mnw,1057
|
9
10
|
judgeval/common/__init__.py,sha256=KH-QJyWtQ60R6yFIBDYS3WGRiNpEu1guynpxivZvpBQ,309
|
10
11
|
judgeval/common/exceptions.py,sha256=OkgDznu2wpBQZMXiZarLJYNk1HIcC8qYW7VypDC3Ook,556
|
11
12
|
judgeval/common/logger.py,sha256=514eFLYWS_UL8VY-zAR2ePUlpQe4rbYlleLASFllLE4,1511
|
12
|
-
judgeval/common/utils.py,sha256=
|
13
|
+
judgeval/common/utils.py,sha256=oxGDRVWOICKWeyGgsoc36_yAyHSYF4XtH842Mkznwis,34739
|
13
14
|
judgeval/common/api/__init__.py,sha256=-E7lpZz1fG8puR_aYUMfPmQ-Vyhd0bgzoaU5EhIuFjQ,114
|
14
|
-
judgeval/common/api/api.py,sha256=
|
15
|
-
judgeval/common/api/constants.py,sha256=
|
15
|
+
judgeval/common/api/api.py,sha256=wty02HYANeOYlM8fHOLc33ux5bu9Ieq7iRqCr-UP0ng,14157
|
16
|
+
judgeval/common/api/constants.py,sha256=vAW94pbyTS6rv1TKpt7z6xxMJvTaAxFiy1D4kzuLHeg,4567
|
16
17
|
judgeval/common/storage/__init__.py,sha256=a-PI7OL-ydyzugGUKmJKRBASnK-Q-gs82L9K9rSyJP8,90
|
17
|
-
judgeval/common/storage/s3_storage.py,sha256=
|
18
|
+
judgeval/common/storage/s3_storage.py,sha256=0-bNKheqJJyBZ92KGrzQtd1zocIRWBlfn_58L4a-Ay0,3719
|
18
19
|
judgeval/common/tracer/__init__.py,sha256=tJCJsmVmrL89Phv88gNCJ-j0ITPez6lh8vhMAAlLNSc,795
|
19
20
|
judgeval/common/tracer/constants.py,sha256=yu5y8gMe5yb1AaBkPtAH-BNwIaAR3NwYCRoSf45wp5U,621
|
20
|
-
judgeval/common/tracer/core.py,sha256=
|
21
|
+
judgeval/common/tracer/core.py,sha256=6a67h8WfI4T5YV4TXqZqAAbOPptA0yaIV38pe7Urf_0,73813
|
21
22
|
judgeval/common/tracer/otel_exporter.py,sha256=kZLlOQ6afQE4dmb9H1wgU4P3H5PG1D_zKyvnpWcT5Ak,3899
|
22
|
-
judgeval/common/tracer/otel_span_processor.py,sha256=
|
23
|
+
judgeval/common/tracer/otel_span_processor.py,sha256=W7SM62KnxJ48vC9WllIHRKaLlvxkCwqYoT4KqZLfGNs,6497
|
23
24
|
judgeval/common/tracer/span_processor.py,sha256=eFjTgSWSkM6BWE94CrvgafDg_WkxLsFL_MafwBG-p9M,1145
|
24
|
-
judgeval/common/tracer/span_transformer.py,sha256=
|
25
|
+
judgeval/common/tracer/span_transformer.py,sha256=nCnwRC52OKfYRFnsOwGdPaqb_U17yn5S_9jfhv1GaLM,7803
|
25
26
|
judgeval/common/tracer/trace_manager.py,sha256=7KLWBrz5GE_138DHL_eRjhx4-LNfXKz1q_XIDfg6nw8,2992
|
26
27
|
judgeval/data/__init__.py,sha256=1QagDcSQtfnJ632t9Dnq8d7XjAqhmY4mInOWt8qH9tM,455
|
27
|
-
judgeval/data/example.py,sha256=
|
28
|
-
judgeval/data/judgment_types.py,sha256=
|
29
|
-
judgeval/data/result.py,sha256=
|
28
|
+
judgeval/data/example.py,sha256=kRskIgsjwcvv2Y8jaPwV-PND7zlmMbFsvRVQ_b7SZY0,914
|
29
|
+
judgeval/data/judgment_types.py,sha256=KE1HrFLfSxiu1zutaiZ7B7La9PGXIAsoWpo_5iy645c,8336
|
30
|
+
judgeval/data/result.py,sha256=OtSnBUrdQpjyAqxXRLTW3wC9v9lOm_GqzL14ccRQxrg,2124
|
30
31
|
judgeval/data/scorer_data.py,sha256=5QBHtvOIWOq0Rn9_uPJzAMRYMlWxMB-rXnG_6kV4Z4Y,2955
|
31
32
|
judgeval/data/tool.py,sha256=iWQSdy5uNbIeACu3gQy1DC2oGYxRVYNfkkczWdQMAiA,99
|
32
|
-
judgeval/data/trace.py,sha256=
|
33
|
+
judgeval/data/trace.py,sha256=tDOuYFPUssQInjsmwyxcXq-W3IB29Vq340VzqafuKJc,6942
|
33
34
|
judgeval/data/trace_run.py,sha256=c6pRSv09Vj016hxM49I3kMftCwWg8hhkfT_1kBXluSI,1600
|
34
|
-
judgeval/data/datasets/__init__.py,sha256=IdNKhQv9yYZ_op0rdBacrFaFVmiiYQ3JTzXzxOTsEVQ,176
|
35
|
-
judgeval/data/datasets/dataset.py,sha256=dDmTYSBRj4YEUhgYOebAcDm4N14nj3tcCqHj9y2Z1z0,12725
|
36
|
-
judgeval/data/datasets/eval_dataset_client.py,sha256=8tiuwRC3oebc19KY-5b99Cxj0qq6ADW1NMDd1R1RhLc,7258
|
37
35
|
judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
|
38
|
-
judgeval/data/scripts/openapi_transform.py,sha256=
|
36
|
+
judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
|
39
37
|
judgeval/integrations/langgraph.py,sha256=kJXLsgBY7DgsUTZyVQ47deDgHm887brFHfyIbuyerGw,29986
|
40
38
|
judgeval/judges/__init__.py,sha256=6X7VSwrwsdxGBNxCyapVRWGghhKOy3MVxFNMQ62kCXM,308
|
41
39
|
judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
|
42
|
-
judgeval/judges/litellm_judge.py,sha256=
|
43
|
-
judgeval/judges/mixture_of_judges.py,sha256=
|
44
|
-
judgeval/judges/together_judge.py,sha256=
|
40
|
+
judgeval/judges/litellm_judge.py,sha256=yt6QvwKMmxZcrUtjbn3EiO5aVg7CHM2YZkBCSQLS8jk,2509
|
41
|
+
judgeval/judges/mixture_of_judges.py,sha256=cecQ8mRmz2-dDoZl2MGsrhZICkpIvRovGPK3su0kc8s,14889
|
42
|
+
judgeval/judges/together_judge.py,sha256=5FADUhs6-FN1ZVV_1D3-8_gu9mPbZiG0PYTpme41SfM,2336
|
45
43
|
judgeval/judges/utils.py,sha256=0CF9qtIUQUL3-W-qTGpmTjZbkUUBAM6TslDsrCHnTBU,2725
|
46
44
|
judgeval/scorers/__init__.py,sha256=4H_cinTQ4EogZv59YEV-3U9EOTLppNwgAPTi1-jI9Fw,746
|
47
45
|
judgeval/scorers/agent_scorer.py,sha256=TjwD_YglSywr3EowEojiCyg5qDgCRa5LRGc5nFdmIBc,703
|
@@ -49,8 +47,8 @@ judgeval/scorers/api_scorer.py,sha256=xlhqkeMUBFxl8daSXOTWOYwZjBAz7o6b4sVD5f8cIH
|
|
49
47
|
judgeval/scorers/base_scorer.py,sha256=eDfQk8N8TQfM1ayJDWr0NTdSQxcbk9-VZHd0Igb9EbI,2878
|
50
48
|
judgeval/scorers/example_scorer.py,sha256=2n45y3LMV1Q-ARyXLHqvVWETlnY1DqS7OLzPu9IBGz8,716
|
51
49
|
judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
|
52
|
-
judgeval/scorers/score.py,sha256=
|
53
|
-
judgeval/scorers/utils.py,sha256=
|
50
|
+
judgeval/scorers/score.py,sha256=2-M_AmOjIQR2c0qvuB4WIIQD-7zSNdzsWC8ttqltw2g,6601
|
51
|
+
judgeval/scorers/utils.py,sha256=HQOYTJtNnsi_aPfMssePAaBbXpAv7LXgwUlWlDFuN2g,3965
|
54
52
|
judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
55
53
|
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=GX4KkwPR2p-c0Y5mZingJa8EUfjAbMGhrmRBDBunOGw,1484
|
56
54
|
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=zJsU0VrUmRhY9qav48c6jTyDqUwI3JzhV9ajtlJCe0M,544
|
@@ -65,9 +63,9 @@ judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=Mcp1CjMN
|
|
65
63
|
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=Z2FLGBC7m_CLx-CMgXVuTvYvN0vY5yOcWA0ImBkeBfY,787
|
66
64
|
judgeval/tracer/__init__.py,sha256=wkuXtOGDCrwgPPXlh_sSJmvGuWaAMHyNzk1TzB5f9aI,148
|
67
65
|
judgeval/utils/alerts.py,sha256=3w_AjQrgfmOZvfqCridW8WAnHVxHHXokX9jNzVFyGjA,3297
|
68
|
-
judgeval/utils/file_utils.py,sha256=
|
66
|
+
judgeval/utils/file_utils.py,sha256=PWHRs8dUr8iDwpglSSk4Yjd7C6ZhDzUaO-jV3m7riHM,1987
|
69
67
|
judgeval/utils/requests.py,sha256=K3gUKrwL6TvwYKVYO5OeLWdUHn9NiUPmnIXhZEiEaHU,1534
|
70
|
-
judgeval-0.
|
71
|
-
judgeval-0.
|
72
|
-
judgeval-0.
|
73
|
-
judgeval-0.
|
68
|
+
judgeval-0.3.1.dist-info/METADATA,sha256=rMctXqjJ8pY2MEfeXMA9Ot_8GQiZUDZwErzCZ6rommQ,10348
|
69
|
+
judgeval-0.3.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
70
|
+
judgeval-0.3.1.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
|
71
|
+
judgeval-0.3.1.dist-info/RECORD,,
|
@@ -1,341 +0,0 @@
|
|
1
|
-
import ast
|
2
|
-
import csv
|
3
|
-
import datetime
|
4
|
-
import json
|
5
|
-
import os
|
6
|
-
import yaml
|
7
|
-
from dataclasses import dataclass, field
|
8
|
-
from typing import List, Union, Literal, Optional
|
9
|
-
|
10
|
-
from judgeval.data import Example, Trace
|
11
|
-
from judgeval.common.logger import judgeval_logger
|
12
|
-
from judgeval.utils.file_utils import get_examples_from_yaml
|
13
|
-
|
14
|
-
|
15
|
-
@dataclass
|
16
|
-
class EvalDataset:
|
17
|
-
examples: List[Example]
|
18
|
-
traces: List[Trace]
|
19
|
-
_alias: Union[str, None] = field(default=None)
|
20
|
-
_id: Union[str, None] = field(default=None)
|
21
|
-
judgment_api_key: str = field(default="")
|
22
|
-
organization_id: str = field(default="")
|
23
|
-
|
24
|
-
def __init__(
|
25
|
-
self,
|
26
|
-
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY", ""),
|
27
|
-
organization_id: str = os.getenv("JUDGMENT_ORG_ID", ""),
|
28
|
-
examples: Optional[List[Example]] = None,
|
29
|
-
traces: Optional[List[Trace]] = None,
|
30
|
-
):
|
31
|
-
if not judgment_api_key:
|
32
|
-
judgeval_logger.error("No judgment_api_key provided")
|
33
|
-
self.examples = examples or []
|
34
|
-
self.traces = traces or []
|
35
|
-
self._alias = None
|
36
|
-
self._id = None
|
37
|
-
self.judgment_api_key = judgment_api_key
|
38
|
-
self.organization_id = organization_id
|
39
|
-
|
40
|
-
def add_from_json(self, file_path: str) -> None:
|
41
|
-
"""
|
42
|
-
Adds examples from a JSON file.
|
43
|
-
|
44
|
-
The format of the JSON file is expected to be a dictionary with one key: "examples".
|
45
|
-
The value of the key is a list of dictionaries, where each dictionary represents an example.
|
46
|
-
|
47
|
-
The JSON file is expected to have the following format:
|
48
|
-
{
|
49
|
-
"examples": [
|
50
|
-
{
|
51
|
-
"input": "test input",
|
52
|
-
"actual_output": "test output",
|
53
|
-
"expected_output": "expected output",
|
54
|
-
"context": [
|
55
|
-
"context1",
|
56
|
-
"context2"
|
57
|
-
],
|
58
|
-
"retrieval_context": [
|
59
|
-
"retrieval1"
|
60
|
-
],
|
61
|
-
"additional_metadata": {
|
62
|
-
"key": "value"
|
63
|
-
},
|
64
|
-
"tools_called": [
|
65
|
-
"tool1"
|
66
|
-
],
|
67
|
-
"expected_tools": [
|
68
|
-
"tool1",
|
69
|
-
"tool2"
|
70
|
-
],
|
71
|
-
"name": "test example",
|
72
|
-
"example_id": null,
|
73
|
-
"timestamp": "20241230_160117",
|
74
|
-
"trace_id": "123"
|
75
|
-
}
|
76
|
-
]
|
77
|
-
}
|
78
|
-
"""
|
79
|
-
try:
|
80
|
-
with open(file_path, "r") as file:
|
81
|
-
payload = json.load(file)
|
82
|
-
examples = payload.get("examples", [])
|
83
|
-
except FileNotFoundError:
|
84
|
-
judgeval_logger.error(f"JSON file not found: {file_path}")
|
85
|
-
raise FileNotFoundError(f"The file {file_path} was not found.")
|
86
|
-
except json.JSONDecodeError:
|
87
|
-
judgeval_logger.error(f"Invalid JSON file: {file_path}")
|
88
|
-
raise ValueError(f"The file {file_path} is not a valid JSON file.")
|
89
|
-
|
90
|
-
new_examples = [Example(**e) for e in examples]
|
91
|
-
for e in new_examples:
|
92
|
-
self.add_example(e)
|
93
|
-
|
94
|
-
def add_from_csv(
|
95
|
-
self,
|
96
|
-
file_path: str,
|
97
|
-
header_mapping: dict,
|
98
|
-
primary_delimiter: str = ",",
|
99
|
-
secondary_delimiter: str = ";",
|
100
|
-
) -> None:
|
101
|
-
"""
|
102
|
-
Add Examples from a CSV file.
|
103
|
-
|
104
|
-
Args:
|
105
|
-
file_path (str): Path to the CSV file
|
106
|
-
header_mapping (dict): Dictionary mapping Example headers to custom headers
|
107
|
-
primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
|
108
|
-
secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
|
109
|
-
"""
|
110
|
-
try:
|
111
|
-
import pandas as pd
|
112
|
-
except ModuleNotFoundError:
|
113
|
-
raise ModuleNotFoundError(
|
114
|
-
"Please install pandas to use this method. 'pip install pandas'"
|
115
|
-
)
|
116
|
-
|
117
|
-
# Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
|
118
|
-
df = pd.read_csv(file_path, dtype={"trace_id": str}, sep=primary_delimiter)
|
119
|
-
"""
|
120
|
-
The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
|
121
|
-
Available headers for Example objects are as follows:
|
122
|
-
|
123
|
-
"input", "actual_output", "expected_output", "context", \
|
124
|
-
"retrieval_context", "additional_metadata", "tools_called", \
|
125
|
-
"expected_tools", "name", "comments", "source_file", "example", \
|
126
|
-
"trace_id"
|
127
|
-
|
128
|
-
We want to collect the examples separately which can
|
129
|
-
be determined by the "example" column. If the value is True, then it is an
|
130
|
-
example, and we expect the `input` and `actual_output` fields to be non-null.
|
131
|
-
|
132
|
-
We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
|
133
|
-
This can be adjusted using the `secondary_delimiter` parameter.
|
134
|
-
"""
|
135
|
-
examples = []
|
136
|
-
|
137
|
-
def process_csv_row(value, header):
|
138
|
-
"""
|
139
|
-
Maps a singular value in the CSV file to the appropriate type based on the header.
|
140
|
-
If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
|
141
|
-
"""
|
142
|
-
# check that the CSV value is not null for entry
|
143
|
-
null_replacement = dict() if header == "additional_metadata" else None
|
144
|
-
if pd.isna(value) or value == "":
|
145
|
-
return null_replacement
|
146
|
-
try:
|
147
|
-
value = (
|
148
|
-
ast.literal_eval(value)
|
149
|
-
if header == "additional_metadata"
|
150
|
-
else str(value)
|
151
|
-
)
|
152
|
-
except (ValueError, SyntaxError):
|
153
|
-
value = str(value)
|
154
|
-
if header in [
|
155
|
-
"context",
|
156
|
-
"retrieval_context",
|
157
|
-
"tools_called",
|
158
|
-
"expected_tools",
|
159
|
-
]:
|
160
|
-
# attempt to split the value by the secondary delimiter
|
161
|
-
value = value.split(secondary_delimiter)
|
162
|
-
|
163
|
-
return value
|
164
|
-
|
165
|
-
for _, row in df.iterrows():
|
166
|
-
data = {
|
167
|
-
header: process_csv_row(row[header_mapping[header]], header)
|
168
|
-
for header in header_mapping
|
169
|
-
}
|
170
|
-
if "example" in header_mapping and row[header_mapping["example"]]:
|
171
|
-
if "name" in header_mapping:
|
172
|
-
data["name"] = (
|
173
|
-
row[header_mapping["name"]]
|
174
|
-
if pd.notna(row[header_mapping["name"]])
|
175
|
-
else None
|
176
|
-
)
|
177
|
-
# every Example has `input` and `actual_output` fields
|
178
|
-
if data["input"] is not None and data["actual_output"] is not None:
|
179
|
-
e = Example(**data)
|
180
|
-
examples.append(e)
|
181
|
-
else:
|
182
|
-
raise ValueError(
|
183
|
-
"Every example must have an 'input' and 'actual_output' field."
|
184
|
-
)
|
185
|
-
|
186
|
-
for e in examples:
|
187
|
-
self.add_example(e)
|
188
|
-
|
189
|
-
def add_from_yaml(self, file_path: str) -> None:
|
190
|
-
"""
|
191
|
-
Adds examples from a YAML file.
|
192
|
-
|
193
|
-
The format of the YAML file is expected to be a dictionary with one key: "examples".
|
194
|
-
The value of the key is a list of dictionaries, where each dictionary represents an example.
|
195
|
-
|
196
|
-
The YAML file is expected to have the following format:
|
197
|
-
examples:
|
198
|
-
- input: "test input"
|
199
|
-
actual_output: "test output"
|
200
|
-
expected_output: "expected output"
|
201
|
-
context:
|
202
|
-
- "context1"
|
203
|
-
- "context2"
|
204
|
-
retrieval_context:
|
205
|
-
- "retrieval1"
|
206
|
-
additional_metadata:
|
207
|
-
key: "value"
|
208
|
-
tools_called:
|
209
|
-
- "tool1"
|
210
|
-
expected_tools:
|
211
|
-
- "tool1"
|
212
|
-
- "tool2"
|
213
|
-
name: "test example"
|
214
|
-
example_id: null
|
215
|
-
timestamp: "20241230_160117"
|
216
|
-
trace_id: "123"
|
217
|
-
"""
|
218
|
-
examples = get_examples_from_yaml(file_path)
|
219
|
-
|
220
|
-
for e in examples:
|
221
|
-
self.add_example(e)
|
222
|
-
|
223
|
-
def add_example(self, e: Example) -> None:
|
224
|
-
self.examples.append(e)
|
225
|
-
# TODO if we need to add rank, then we need to do it here
|
226
|
-
|
227
|
-
def add_trace(self, t: Trace) -> None:
|
228
|
-
self.traces.append(t)
|
229
|
-
|
230
|
-
def save_as(
|
231
|
-
self,
|
232
|
-
file_type: Literal["json", "csv", "yaml"],
|
233
|
-
dir_path: str,
|
234
|
-
save_name: str | None = None,
|
235
|
-
) -> None:
|
236
|
-
"""
|
237
|
-
Saves the dataset as a file. Save only the examples.
|
238
|
-
|
239
|
-
Args:
|
240
|
-
file_type (Literal["json", "csv"]): The file type to save the dataset as.
|
241
|
-
dir_path (str): The directory path to save the file to.
|
242
|
-
save_name (str, optional): The name of the file to save. Defaults to None.
|
243
|
-
"""
|
244
|
-
if not os.path.exists(dir_path):
|
245
|
-
os.makedirs(dir_path)
|
246
|
-
file_name = (
|
247
|
-
datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
248
|
-
if save_name is None
|
249
|
-
else save_name
|
250
|
-
)
|
251
|
-
complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
|
252
|
-
if file_type == "json":
|
253
|
-
with open(complete_path, "w") as file:
|
254
|
-
json.dump(
|
255
|
-
{
|
256
|
-
"examples": [e.to_dict() for e in self.examples],
|
257
|
-
},
|
258
|
-
file,
|
259
|
-
indent=4,
|
260
|
-
)
|
261
|
-
elif file_type == "csv":
|
262
|
-
with open(complete_path, "w", newline="") as file:
|
263
|
-
writer = csv.writer(file)
|
264
|
-
writer.writerow(
|
265
|
-
[
|
266
|
-
"input",
|
267
|
-
"actual_output",
|
268
|
-
"expected_output",
|
269
|
-
"context",
|
270
|
-
"retrieval_context",
|
271
|
-
"additional_metadata",
|
272
|
-
"tools_called",
|
273
|
-
"expected_tools",
|
274
|
-
"name",
|
275
|
-
"comments",
|
276
|
-
"source_file",
|
277
|
-
"example",
|
278
|
-
"trace_id",
|
279
|
-
]
|
280
|
-
)
|
281
|
-
for e in self.examples:
|
282
|
-
writer.writerow(
|
283
|
-
[
|
284
|
-
e.input,
|
285
|
-
e.actual_output,
|
286
|
-
e.expected_output,
|
287
|
-
";".join(e.context),
|
288
|
-
";".join(e.retrieval_context),
|
289
|
-
e.additional_metadata,
|
290
|
-
";".join(e.tools_called),
|
291
|
-
";".join(e.expected_tools),
|
292
|
-
e.name,
|
293
|
-
None, # Example does not have comments
|
294
|
-
None, # Example does not have source file
|
295
|
-
True, # Adding an Example
|
296
|
-
]
|
297
|
-
)
|
298
|
-
|
299
|
-
elif file_type == "yaml":
|
300
|
-
with open(complete_path, "w") as file:
|
301
|
-
yaml_data = {
|
302
|
-
"examples": [
|
303
|
-
{
|
304
|
-
"input": e.input,
|
305
|
-
"actual_output": e.actual_output,
|
306
|
-
"expected_output": e.expected_output,
|
307
|
-
"context": e.context,
|
308
|
-
"retrieval_context": e.retrieval_context,
|
309
|
-
"additional_metadata": e.additional_metadata,
|
310
|
-
"tools_called": e.tools_called,
|
311
|
-
"expected_tools": e.expected_tools,
|
312
|
-
"name": e.name,
|
313
|
-
"comments": None, # Example does not have comments
|
314
|
-
"source_file": None, # Example does not have source file
|
315
|
-
"example": True, # Adding an Example
|
316
|
-
}
|
317
|
-
for e in self.examples
|
318
|
-
],
|
319
|
-
}
|
320
|
-
yaml.dump(yaml_data, file, default_flow_style=False)
|
321
|
-
else:
|
322
|
-
ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
|
323
|
-
raise TypeError(
|
324
|
-
f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
|
325
|
-
)
|
326
|
-
|
327
|
-
def __iter__(self):
|
328
|
-
return iter(self.examples)
|
329
|
-
|
330
|
-
def __len__(self):
|
331
|
-
return len(self.examples)
|
332
|
-
|
333
|
-
def __str__(self):
|
334
|
-
return (
|
335
|
-
f"{self.__class__.__name__}("
|
336
|
-
f"examples={self.examples}, "
|
337
|
-
f"traces={self.traces}, "
|
338
|
-
f"_alias={self._alias}, "
|
339
|
-
f"_id={self._id}"
|
340
|
-
f")"
|
341
|
-
)
|