judgeval 0.0.52__tar.gz → 0.0.54__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. judgeval-0.0.54/.github/ISSUE_TEMPLATE/bug_report.md +41 -0
  2. judgeval-0.0.54/.github/ISSUE_TEMPLATE/feature_request.md +43 -0
  3. judgeval-0.0.54/.github/pull_request_template.md +23 -0
  4. {judgeval-0.0.52 → judgeval-0.0.54}/.gitignore +6 -1
  5. {judgeval-0.0.52 → judgeval-0.0.54}/PKG-INFO +6 -5
  6. {judgeval-0.0.52 → judgeval-0.0.54}/README.md +4 -4
  7. judgeval-0.0.54/assets/agent.gif +0 -0
  8. judgeval-0.0.54/assets/data.gif +0 -0
  9. judgeval-0.0.54/assets/document.gif +0 -0
  10. judgeval-0.0.54/assets/trace.gif +0 -0
  11. {judgeval-0.0.52 → judgeval-0.0.54}/pyproject.toml +2 -1
  12. judgeval-0.0.54/src/judgeval/common/logger.py +60 -0
  13. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/common/s3_storage.py +2 -6
  14. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/common/tracer.py +182 -262
  15. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/common/utils.py +16 -36
  16. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/constants.py +14 -20
  17. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/data/__init__.py +0 -2
  18. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/data/datasets/dataset.py +6 -10
  19. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/data/datasets/eval_dataset_client.py +25 -27
  20. judgeval-0.0.54/src/judgeval/data/example.py +61 -0
  21. judgeval-0.0.54/src/judgeval/data/judgment_types.py +214 -0
  22. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/data/result.py +7 -25
  23. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/data/scorer_data.py +28 -40
  24. judgeval-0.0.54/src/judgeval/data/scripts/fix_default_factory.py +23 -0
  25. judgeval-0.0.54/src/judgeval/data/scripts/openapi_transform.py +123 -0
  26. judgeval-0.0.54/src/judgeval/data/tool.py +5 -0
  27. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/data/trace.py +31 -50
  28. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/data/trace_run.py +3 -3
  29. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/evaluation_run.py +16 -23
  30. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/integrations/langgraph.py +11 -12
  31. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/judges/litellm_judge.py +3 -6
  32. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/judges/mixture_of_judges.py +8 -25
  33. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/judges/together_judge.py +3 -6
  34. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/judgment_client.py +22 -24
  35. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/rules.py +7 -19
  36. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/run_evaluation.py +79 -242
  37. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/__init__.py +4 -20
  38. judgeval-0.0.54/src/judgeval/scorers/agent_scorer.py +21 -0
  39. judgeval-0.0.54/src/judgeval/scorers/api_scorer.py +70 -0
  40. judgeval-0.0.54/src/judgeval/scorers/base_scorer.py +98 -0
  41. judgeval-0.0.54/src/judgeval/scorers/example_scorer.py +19 -0
  42. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -20
  43. judgeval-0.0.54/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +21 -0
  44. judgeval-0.0.54/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -0
  45. judgeval-0.0.54/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +73 -0
  46. judgeval-0.0.54/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +14 -0
  47. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +4 -4
  48. judgeval-0.0.54/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +21 -0
  49. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +4 -4
  50. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +4 -4
  51. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +4 -4
  52. judgeval-0.0.54/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +27 -0
  53. judgeval-0.0.54/src/judgeval/scorers/score.py +180 -0
  54. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/utils.py +6 -88
  55. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/utils/file_utils.py +4 -6
  56. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/version_check.py +3 -2
  57. judgeval-0.0.54/src/update_types.sh +14 -0
  58. {judgeval-0.0.52 → judgeval-0.0.54}/uv.lock +901 -1221
  59. judgeval-0.0.52/.github/pull_request_template.md +0 -13
  60. judgeval-0.0.52/assets/agent.gif +0 -0
  61. judgeval-0.0.52/assets/data.gif +0 -0
  62. judgeval-0.0.52/assets/document.gif +0 -0
  63. judgeval-0.0.52/assets/trace.gif +0 -0
  64. judgeval-0.0.52/src/judgeval/common/logger.py +0 -213
  65. judgeval-0.0.52/src/judgeval/data/custom_example.py +0 -19
  66. judgeval-0.0.52/src/judgeval/data/example.py +0 -194
  67. judgeval-0.0.52/src/judgeval/data/tool.py +0 -56
  68. judgeval-0.0.52/src/judgeval/scorers/api_scorer.py +0 -80
  69. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorer.py +0 -177
  70. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -28
  71. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -27
  72. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -125
  73. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -45
  74. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -29
  75. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -29
  76. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -32
  77. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -22
  78. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -28
  79. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -28
  80. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -38
  81. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -27
  82. judgeval-0.0.52/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -23
  83. judgeval-0.0.52/src/judgeval/scorers/prompt_scorer.py +0 -296
  84. judgeval-0.0.52/src/judgeval/scorers/score.py +0 -465
  85. {judgeval-0.0.52 → judgeval-0.0.54}/.github/workflows/blocked-pr.yaml +0 -0
  86. {judgeval-0.0.52 → judgeval-0.0.54}/.github/workflows/ci.yaml +0 -0
  87. {judgeval-0.0.52 → judgeval-0.0.54}/.github/workflows/lint.yaml +0 -0
  88. {judgeval-0.0.52 → judgeval-0.0.54}/.github/workflows/merge-branch-check.yaml +0 -0
  89. {judgeval-0.0.52 → judgeval-0.0.54}/.github/workflows/release.yaml +0 -0
  90. {judgeval-0.0.52 → judgeval-0.0.54}/.github/workflows/validate-branch.yaml +0 -0
  91. {judgeval-0.0.52 → judgeval-0.0.54}/.pre-commit-config.yaml +0 -0
  92. {judgeval-0.0.52 → judgeval-0.0.54}/LICENSE.md +0 -0
  93. {judgeval-0.0.52 → judgeval-0.0.54}/assets/Screenshot 2025-05-17 at 8.14.27 PM.png +0 -0
  94. {judgeval-0.0.52 → judgeval-0.0.54}/assets/dataset_clustering_screenshot.png +0 -0
  95. {judgeval-0.0.52 → judgeval-0.0.54}/assets/dataset_clustering_screenshot_dm.png +0 -0
  96. {judgeval-0.0.52 → judgeval-0.0.54}/assets/datasets_preview_screenshot.png +0 -0
  97. {judgeval-0.0.52 → judgeval-0.0.54}/assets/error_analysis_dashboard.png +0 -0
  98. {judgeval-0.0.52 → judgeval-0.0.54}/assets/experiments_dashboard_screenshot.png +0 -0
  99. {judgeval-0.0.52 → judgeval-0.0.54}/assets/experiments_page.png +0 -0
  100. {judgeval-0.0.52 → judgeval-0.0.54}/assets/experiments_pagev2.png +0 -0
  101. {judgeval-0.0.52 → judgeval-0.0.54}/assets/logo-dark.svg +0 -0
  102. {judgeval-0.0.52 → judgeval-0.0.54}/assets/logo-light.svg +0 -0
  103. {judgeval-0.0.52 → judgeval-0.0.54}/assets/monitoring_screenshot.png +0 -0
  104. {judgeval-0.0.52 → judgeval-0.0.54}/assets/new_darkmode.svg +0 -0
  105. {judgeval-0.0.52 → judgeval-0.0.54}/assets/new_lightmode.svg +0 -0
  106. {judgeval-0.0.52 → judgeval-0.0.54}/assets/product_shot.png +0 -0
  107. {judgeval-0.0.52 → judgeval-0.0.54}/assets/trace_demo.png +0 -0
  108. {judgeval-0.0.52 → judgeval-0.0.54}/assets/trace_screenshot.png +0 -0
  109. {judgeval-0.0.52 → judgeval-0.0.54}/assets/trace_screenshot_old.png +0 -0
  110. {judgeval-0.0.52 → judgeval-0.0.54}/pytest.ini +0 -0
  111. {judgeval-0.0.52 → judgeval-0.0.54}/src/.coveragerc +0 -0
  112. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/__init__.py +0 -0
  113. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/clients.py +0 -0
  114. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/common/__init__.py +0 -0
  115. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/common/exceptions.py +0 -0
  116. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/data/datasets/__init__.py +0 -0
  117. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/judges/__init__.py +0 -0
  118. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/judges/base_judge.py +0 -0
  119. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/judges/utils.py +0 -0
  120. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/exceptions.py +0 -0
  121. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  122. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  123. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  124. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  125. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/tracer/__init__.py +0 -0
  126. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/utils/alerts.py +0 -0
  127. {judgeval-0.0.52 → judgeval-0.0.54}/src/judgeval/utils/requests.py +0 -0
  128. {judgeval-0.0.52 → judgeval-0.0.54}/update_version.py +0 -0
@@ -0,0 +1,41 @@
+ ---
+ name: Bug report
+ about: Create a report to help us improve Judgeval
+ title: "[BUG]"
+ labels: potential bug
+
+ ---
+
+ ## Describe the bug
+ A clear and concise description of what the bug is.
+
+ ## To Reproduce
+ Steps to reproduce the behavior:
+ 1. Go to '...'
+ 2. Click on '....'
+ 3. Scroll down to '....'
+ 4. See error
+
+ ## Expected behavior
+ A clear and concise description of what you expected to happen.
+
+ ## Screenshots
+ If applicable, add screenshots to help explain your problem.
+
+ ## Environment (please complete the following information):
+ - OS: [e.g. MacOS, Linux, Windows]
+ - Browser (if website issue): [e.g. Chrome, Safari, Firefox]
+ - Browser Version (if website issue): [e.g. 22]
+ - SDK Version: [e.g. 1.2.3]
+ - Programming Language/Runtime (if SDK issue): [e.g. Python 3.11, Python 3.12, etc.]
+ - Package Manager (if SDK issue): [e.g. uv, pip, pipenv]
+
+ ## Additional context
+ Add any other context about the problem here.
+
+ ## Are you interested to contribute a fix for this bug?
+ If this is a confirmed bug, the Judgment community is happy to support with guidance and review via [Discord](https://discord.com/invite/tGVFf8UBUY).
+
+ - [ ] Yes
+ - [ ] No
+
@@ -0,0 +1,43 @@
+ ---
+ name: Feature Request
+ about: Suggest an idea for Judgeval
+ title: "[FEATURE]"
+ labels: feature-request
+
+ ---
+
+ ## Is your feature request related to a problem? Please describe.
+ A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
+
+ ## Describe the solution you'd like
+ A clear and concise description of what you want to happen.
+
+ ## Describe alternatives you've considered
+ A clear and concise description of any alternative solutions or features you've considered.
+
+ ## Which component(s) does this affect?
+ - [ ] SDK (open for community contributions)
+ - [ ] Website (internal development only)
+ - [ ] Documentation (open for community contributions)
+ - [ ] Not sure
+
+ ## Use case and impact
+ Describe your specific use case and how this feature would benefit you or other users. Include:
+ - How often would you use this feature?
+ - How many users might benefit from this?
+ - Is this blocking your current implementation?
+
+ ## Proposed API/Interface (if applicable)
+ If you have ideas about how this feature should be exposed (API methods, UI elements, etc.), please describe them here.
+
+ ## Additional context
+ Add any other context, screenshots, code examples, or links to related issues/discussions about the feature request here.
+
+ ## Are you interested in contributing this feature?
+ The Judgment community is happy to provide guidance and review for contributions via [Discord](https://discord.com/invite/tGVFf8UBUY).
+
+ - [ ] Yes, I'd like to implement this
+ - [ ] Yes, I'd like to help with design/planning
+ - [ ] No, but I'd be happy to test it
+ - [ ] No
+
@@ -0,0 +1,23 @@
+ ## 📝 Summary
+
+ <!-- Add your list of changes, make it a list to improve the PR reviewers' experience. Ie:
+ - [ ] 1. Remove duplicate filter table
+ - [ ] 2. Reenabled filtering on new ExperimentRunsTableClient component, reapplied filtering changes
+ - [ ] 3. Added only search and filter when enter is pressed or apply filter is pressed
+ - [ ] 4. Error message for applying incomplete filters
+ - [ ] 5. Deletion should now work again for table
+ - [ ] 6. Comparison should now work again for table
+ -->
+ - [ ] 1. ...
+
+ ## 🎥 Demo of Changes
+
+ <!-- Add a short 1-3 minute video describing/demoing the changes -->
+
+ ## ✅ Checklist
+
+ - [ ] Tagged Linear ticket in PR title. Ie. PR Title (JUD-XXXX)
+ - [ ] Video demo of changes
+ - [ ] Reviewers assigned
+ - [ ] Docs updated ([if necessary](https://github.com/JudgmentLabs/docs))
+ - [ ] Cookbooks updated ([if necessary](https://github.com/JudgmentLabs/judgment-cookbook))
@@ -110,4 +110,9 @@ test-results.xml

  # Logs
  ./logs
- demo
+ demo
+
+ # OpenAPI json file
+ src/judgeval/data/openapi_new.json
+
+ CLAUDE.md
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: judgeval
- Version: 0.0.52
+ Version: 0.0.54
  Summary: Judgeval Package
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,6 +12,7 @@ Classifier: Programming Language :: Python :: 3
  Requires-Python: >=3.11
  Requires-Dist: anthropic
  Requires-Dist: boto3
+ Requires-Dist: datamodel-code-generator>=0.31.1
  Requires-Dist: google-genai
  Requires-Dist: langchain-anthropic
  Requires-Dist: langchain-core
@@ -150,10 +151,10 @@ You'll see your trace exported to the Judgment Platform:

  | | |
  |:---|:---:|
- | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time. Export data per individual trace for detailed analysis.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
- | <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
- | <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
- | <h3>📊 Datasets</h3>Export comprehensive agent-environment interaction data or import external testcases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+ | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+ | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+ | <h3>📡 Monitoring</h3>Get Slack alerts when your agent fails in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+ | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

  ## 🏢 Self-Hosting

@@ -121,10 +121,10 @@ You'll see your trace exported to the Judgment Platform:

  | | |
  |:---|:---:|
- | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time. Export data per individual trace for detailed analysis.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
- | <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
- | <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
- | <h3>📊 Datasets</h3>Export comprehensive agent-environment interaction data or import external testcases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+ | <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+ | <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+ | <h3>📡 Monitoring</h3>Get Slack alerts when your agent fails in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+ | <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |

  ## 🏢 Self-Hosting

Binary file
Binary file
Binary file
Binary file
@@ -1,6 +1,6 @@
  [project]
  name = "judgeval"
- version = "0.0.52"
+ version = "0.0.54"
  authors = [
  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
  { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -31,6 +31,7 @@ dependencies = [
  "google-genai",
  "boto3",
  "matplotlib>=3.10.3",
+ "datamodel-code-generator>=0.31.1",
  ]

  [project.urls]
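The new datamodel-code-generator dependency (also added to PKG-INFO above) backs the new src/update_types.sh, src/judgeval/data/scripts/openapi_transform.py, and generated src/judgeval/data/judgment_types.py files in this release. A minimal sketch of what that codegen step likely looks like, assuming a transformed spec named openapi.json (the input file name is illustrative; the actual pipeline lives in update_types.sh):

import subprocess

# Hypothetical invocation of the datamodel-codegen CLI. The input spec
# name is an assumption; the output path matches the generated module
# that appears in this diff (src/judgeval/data/judgment_types.py).
subprocess.run(
    [
        "datamodel-codegen",
        "--input", "openapi.json",
        "--input-file-type", "openapi",
        "--output", "src/judgeval/data/judgment_types.py",
    ],
    check=True,
)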
@@ -0,0 +1,60 @@
+ # logger.py
+
+ import logging
+ import sys
+ import os
+
+ # ANSI escape sequences
+ RESET = "\033[0m"
+ RED = "\033[31m"
+ YELLOW = "\033[33m"
+ BLUE = "\033[34m"
+ GRAY = "\033[90m"
+
+
+ class ColorFormatter(logging.Formatter):
+     """
+     Wrap the final formatted log record in ANSI color codes based on level.
+     """
+
+     COLORS = {
+         logging.DEBUG: GRAY,
+         logging.INFO: GRAY,
+         logging.WARNING: YELLOW,
+         logging.ERROR: RED,
+         logging.CRITICAL: RED,
+     }
+
+     def __init__(self, fmt=None, datefmt=None, use_color=True):
+         super().__init__(fmt=fmt, datefmt=datefmt)
+         self.use_color = use_color and sys.stdout.isatty()
+
+     def format(self, record):
+         message = super().format(record)
+         if self.use_color:
+             color = self.COLORS.get(record.levelno, "")
+             if color:
+                 message = f"{color}{message}{RESET}"
+         return message
+
+
+ def _setup_judgeval_logger():
+     use_color = sys.stdout.isatty() and os.getenv("NO_COLOR") is None
+     handler = logging.StreamHandler(sys.stdout)
+     handler.setLevel(logging.DEBUG)
+     handler.setFormatter(
+         ColorFormatter(
+             fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+             datefmt="%Y-%m-%d %H:%M:%S",
+             use_color=use_color,
+         )
+     )
+
+     logger = logging.getLogger("judgeval")
+     logger.setLevel(logging.DEBUG)
+     logger.addHandler(handler)
+     return logger
+
+
+ # Global logger you can import elsewhere
+ judgeval_logger = _setup_judgeval_logger()
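This module replaces the 213-line logger removed from 0.0.52 (file 64 in the list above) with a single preconfigured judgeval_logger. The s3_storage.py hunks below show call sites migrating from the old info/warning helpers to it. A minimal usage sketch; the import path comes from the diff, the log messages are illustrative:

from judgeval.common.logger import judgeval_logger

# DEBUG/INFO render gray, WARNING yellow, ERROR/CRITICAL red per the
# COLORS map above; color is skipped when stdout is not a TTY or the
# NO_COLOR environment variable is set.
judgeval_logger.info("starting evaluation run")
judgeval_logger.warning("bucket already exists, skipping creation")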
@@ -4,7 +4,7 @@ import boto3
  from typing import Optional
  from datetime import datetime, UTC
  from botocore.exceptions import ClientError
- from judgeval.common.logger import warning, info
+ from judgeval.common.logger import judgeval_logger


  class S3Storage:
@@ -42,7 +42,6 @@ class S3Storage:
  error_code = e.response["Error"]["Code"]
  if error_code == "404":
  # Bucket doesn't exist, create it
- info(f"Bucket {self.bucket_name} doesn't exist, creating it ...")
  try:
  self.s3_client.create_bucket(
  Bucket=self.bucket_name,
@@ -52,14 +51,13 @@ class S3Storage:
  ) if self.s3_client.meta.region_name != "us-east-1" else self.s3_client.create_bucket(
  Bucket=self.bucket_name
  )
- info(f"Created S3 bucket: {self.bucket_name}")
  except ClientError as create_error:
  if (
  create_error.response["Error"]["Code"]
  == "BucketAlreadyOwnedByYou"
  ):
  # Bucket was just created by another process
- warning(
+ judgeval_logger.warning(
  f"Bucket {self.bucket_name} was just created by another process"
  )
  pass
@@ -90,8 +88,6 @@ class S3Storage:
  # Convert trace data to JSON string
  trace_json = json.dumps(trace_data)

- # Upload to S3
- info(f"Uploading trace to S3 at key {s3_key}, in bucket {self.bucket_name} ...")
  self.s3_client.put_object(
  Bucket=self.bucket_name,
  Key=s3_key,