judgeval 0.0.53__tar.gz → 0.0.54__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
Files changed (102)
  1. {judgeval-0.0.53 → judgeval-0.0.54}/PKG-INFO +5 -5
  2. {judgeval-0.0.53 → judgeval-0.0.54}/README.md +4 -4
  3. {judgeval-0.0.53 → judgeval-0.0.54}/pyproject.toml +1 -1
  4. {judgeval-0.0.53 → judgeval-0.0.54}/.github/ISSUE_TEMPLATE/bug_report.md +0 -0
  5. {judgeval-0.0.53 → judgeval-0.0.54}/.github/ISSUE_TEMPLATE/feature_request.md +0 -0
  6. {judgeval-0.0.53 → judgeval-0.0.54}/.github/pull_request_template.md +0 -0
  7. {judgeval-0.0.53 → judgeval-0.0.54}/.github/workflows/blocked-pr.yaml +0 -0
  8. {judgeval-0.0.53 → judgeval-0.0.54}/.github/workflows/ci.yaml +0 -0
  9. {judgeval-0.0.53 → judgeval-0.0.54}/.github/workflows/lint.yaml +0 -0
  10. {judgeval-0.0.53 → judgeval-0.0.54}/.github/workflows/merge-branch-check.yaml +0 -0
  11. {judgeval-0.0.53 → judgeval-0.0.54}/.github/workflows/release.yaml +0 -0
  12. {judgeval-0.0.53 → judgeval-0.0.54}/.github/workflows/validate-branch.yaml +0 -0
  13. {judgeval-0.0.53 → judgeval-0.0.54}/.gitignore +0 -0
  14. {judgeval-0.0.53 → judgeval-0.0.54}/.pre-commit-config.yaml +0 -0
  15. {judgeval-0.0.53 → judgeval-0.0.54}/LICENSE.md +0 -0
  16. {judgeval-0.0.53 → judgeval-0.0.54}/"assets/Screenshot 2025-05-17 at 8.14.27\342\200\257PM.png" +0 -0
  17. {judgeval-0.0.53 → judgeval-0.0.54}/assets/agent.gif +0 -0
  18. {judgeval-0.0.53 → judgeval-0.0.54}/assets/data.gif +0 -0
  19. {judgeval-0.0.53 → judgeval-0.0.54}/assets/dataset_clustering_screenshot.png +0 -0
  20. {judgeval-0.0.53 → judgeval-0.0.54}/assets/dataset_clustering_screenshot_dm.png +0 -0
  21. {judgeval-0.0.53 → judgeval-0.0.54}/assets/datasets_preview_screenshot.png +0 -0
  22. {judgeval-0.0.53 → judgeval-0.0.54}/assets/document.gif +0 -0
  23. {judgeval-0.0.53 → judgeval-0.0.54}/assets/error_analysis_dashboard.png +0 -0
  24. {judgeval-0.0.53 → judgeval-0.0.54}/assets/experiments_dashboard_screenshot.png +0 -0
  25. {judgeval-0.0.53 → judgeval-0.0.54}/assets/experiments_page.png +0 -0
  26. {judgeval-0.0.53 → judgeval-0.0.54}/assets/experiments_pagev2.png +0 -0
  27. {judgeval-0.0.53 → judgeval-0.0.54}/assets/logo-dark.svg +0 -0
  28. {judgeval-0.0.53 → judgeval-0.0.54}/assets/logo-light.svg +0 -0
  29. {judgeval-0.0.53 → judgeval-0.0.54}/assets/monitoring_screenshot.png +0 -0
  30. {judgeval-0.0.53 → judgeval-0.0.54}/assets/new_darkmode.svg +0 -0
  31. {judgeval-0.0.53 → judgeval-0.0.54}/assets/new_lightmode.svg +0 -0
  32. {judgeval-0.0.53 → judgeval-0.0.54}/assets/product_shot.png +0 -0
  33. {judgeval-0.0.53 → judgeval-0.0.54}/assets/trace.gif +0 -0
  34. {judgeval-0.0.53 → judgeval-0.0.54}/assets/trace_demo.png +0 -0
  35. {judgeval-0.0.53 → judgeval-0.0.54}/assets/trace_screenshot.png +0 -0
  36. {judgeval-0.0.53 → judgeval-0.0.54}/assets/trace_screenshot_old.png +0 -0
  37. {judgeval-0.0.53 → judgeval-0.0.54}/pytest.ini +0 -0
  38. {judgeval-0.0.53 → judgeval-0.0.54}/src/.coveragerc +0 -0
  39. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/__init__.py +0 -0
  40. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/clients.py +0 -0
  41. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/common/__init__.py +0 -0
  42. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/common/exceptions.py +0 -0
  43. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/common/logger.py +0 -0
  44. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/common/s3_storage.py +0 -0
  45. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/common/tracer.py +0 -0
  46. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/common/utils.py +0 -0
  47. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/constants.py +0 -0
  48. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/__init__.py +0 -0
  49. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/datasets/__init__.py +0 -0
  50. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/datasets/dataset.py +0 -0
  51. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/datasets/eval_dataset_client.py +0 -0
  52. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/example.py +0 -0
  53. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/judgment_types.py +0 -0
  54. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/result.py +0 -0
  55. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/scorer_data.py +0 -0
  56. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/scripts/fix_default_factory.py +0 -0
  57. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/scripts/openapi_transform.py +0 -0
  58. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/tool.py +0 -0
  59. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/trace.py +0 -0
  60. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/data/trace_run.py +0 -0
  61. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/evaluation_run.py +0 -0
  62. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/integrations/langgraph.py +0 -0
  63. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/judges/__init__.py +0 -0
  64. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/judges/base_judge.py +0 -0
  65. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/judges/litellm_judge.py +0 -0
  66. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/judges/mixture_of_judges.py +0 -0
  67. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/judges/together_judge.py +0 -0
  68. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/judges/utils.py +0 -0
  69. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/judgment_client.py +0 -0
  70. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/rules.py +0 -0
  71. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/run_evaluation.py +0 -0
  72. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/__init__.py +0 -0
  73. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/agent_scorer.py +0 -0
  74. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/api_scorer.py +0 -0
  75. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/base_scorer.py +0 -0
  76. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/example_scorer.py +0 -0
  77. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/exceptions.py +0 -0
  78. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  79. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  80. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  81. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  82. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -0
  83. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  84. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  85. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  86. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  87. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  88. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -0
  89. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -0
  90. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  91. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  92. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  93. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/score.py +0 -0
  94. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/scorers/utils.py +0 -0
  95. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/tracer/__init__.py +0 -0
  96. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/utils/alerts.py +0 -0
  97. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/utils/file_utils.py +0 -0
  98. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/utils/requests.py +0 -0
  99. {judgeval-0.0.53 → judgeval-0.0.54}/src/judgeval/version_check.py +0 -0
  100. {judgeval-0.0.53 → judgeval-0.0.54}/src/update_types.sh +0 -0
  101. {judgeval-0.0.53 → judgeval-0.0.54}/update_version.py +0 -0
  102. {judgeval-0.0.53 → judgeval-0.0.54}/uv.lock +0 -0
{judgeval-0.0.53 → judgeval-0.0.54}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.53
+Version: 0.0.54
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -151,10 +151,10 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time. Export data per individual trace for detailed analysis.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export comprehensive agent-environment interaction data or import external testcases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts when your agent fails in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
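For context on the Tracing row above: the tracer ships in src/judgeval/common/tracer.py (unchanged in this release). A minimal sketch of how it is typically wired up, based on judgeval's documented quickstart and assuming the Tracer/wrap/observe API those modules expose; exact names and defaults may differ in 0.0.54:

```python
# Minimal tracing sketch (illustrative; based on judgeval's README quickstart).
# Assumes JUDGMENT_API_KEY and JUDGMENT_ORG_ID are set in the environment.
from judgeval.common.tracer import Tracer, wrap
from openai import OpenAI

judgment = Tracer(project_name="my_agent")  # "my_agent" is a placeholder project name
client = wrap(OpenAI())  # calls through the wrapped client are recorded as LLM spans

@judgment.observe(span_type="function")  # captures inputs/outputs, latency, and cost
def answer(question: str) -> str:
    resp = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": question}],
    )
    return resp.choices[0].message.content
```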
{judgeval-0.0.53 → judgeval-0.0.54}/README.md

@@ -121,10 +121,10 @@ You'll see your trace exported to the Judgment Platform:
 
 | | |
 |:---|:---:|
-| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic): **tracking inputs/outputs, agent tool calls, latency, and cost** at every step.<br><br>Online evals can be applied to traces to measure quality on production data in real-time. Export data per individual trace for detailed analysis.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
-| <h3>🧪 Evals</h3>Evals are the key to regression testing for agents. Judgeval provides 15+ research-backed metrics including tool call accuracy, hallucinations, instruction adherence, and retrieval context recall.<br><br>Judgeval supports LLM-as-a-judge, manual labeling, and custom evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 Experimental prompt testing<br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
-| <h3>📡 Monitoring</h3>Track all your agent metrics in production. **Catch production regressions early.**<br><br>Configure alerts to trigger automated actions when metric thresholds are exceeded (add agent trace to review queue/dataset, Slack notification, etc.).<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
-| <h3>📊 Datasets</h3>Export comprehensive agent-environment interaction data or import external testcases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
+| <h3>🔍 Tracing</h3>Automatic agent tracing integrated with common frameworks (LangGraph, OpenAI, Anthropic). **Tracks inputs/outputs, agent tool calls, latency, cost, and custom metadata** at every step.<br><br>**Useful for:**<br>• 🐛 Debugging agent runs <br>• 📋 Collecting agent environment data <br>• 🔬 Pinpointing performance bottlenecks| <p align="center"><img src="assets/trace_screenshot.png" alt="Tracing visualization" width="1200"/></p> |
+| <h3>🧪 Evals</h3>Build custom evaluators on top of your agents. Judgeval supports LLM-as-a-judge, manual labeling, and code-based evaluators that connect with our metric-tracking infrastructure. <br><br>**Useful for:**<br>• ⚠️ Unit-testing <br>• 🔬 A/B testing <br>• 🛡️ Online guardrails | <p align="center"><img src="assets/experiments_page.png" alt="Evaluation metrics" width="800"/></p> |
+| <h3>📡 Monitoring</h3>Get Slack alerts when your agent fails in production. Add custom hooks to address production regressions.<br><br> **Useful for:** <br>• 📉 Identifying degradation early <br>• 📈 Visualizing performance trends across agent versions and time | <p align="center"><img src="assets/error_analysis_dashboard.png" alt="Monitoring Dashboard" width="1200"/></p> |
+| <h3>📊 Datasets</h3>Export traces and test cases to datasets for scaled analysis and optimization. Move datasets to/from Parquet, S3, etc. <br><br>Run evals on datasets as unit tests or to A/B test different agent configurations, enabling continuous learning from production interactions. <br><br> **Useful for:**<br>• 🗃️ Agent environment interaction data for optimization<br>• 🔄 Scaled analysis for A/B tests | <p align="center"><img src="assets/datasets_preview_screenshot.png" alt="Dataset management" width="1200"/></p> |
 
 ## 🏢 Self-Hosting
 
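For context on the Evals rows: the scorer pipeline lives under src/judgeval/scorers/ (also unchanged here). A minimal sketch adapted from the package's documented quickstart, using the FaithfulnessScorer from the file list above; treat parameter names as indicative rather than authoritative:

```python
# Minimal evaluation sketch (illustrative; adapted from judgeval's quickstart).
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import FaithfulnessScorer

client = JudgmentClient()  # reads JUDGMENT_API_KEY from the environment

# One test case: did the output stay faithful to the retrieved context?
example = Example(
    input="What if these shoes don't fit?",
    actual_output="We offer a 30-day full refund at no extra cost.",
    retrieval_context=["All customers are eligible for a 30 day full refund at no extra cost."],
)

results = client.run_evaluation(
    examples=[example],
    scorers=[FaithfulnessScorer(threshold=0.5)],  # pass/fail cutoff on the 0-1 score
    model="gpt-4.1",  # LLM used as the judge
)
print(results)
```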
{judgeval-0.0.53 → judgeval-0.0.54}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.0.53"
+version = "0.0.54"
 authors = [
   { name="Andrew Li", email="andrew@judgmentlabs.ai" },
   { name="Alex Shan", email="alex@judgmentlabs.ai" },
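
The only functional change in this release is the version bump itself; the PKG-INFO and README diffs are copy edits. Consumers can pick it up with a plain pin update, e.g. `pip install --upgrade judgeval==0.0.54`.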