judgeval 0.0.20__tar.gz → 0.0.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.20 → judgeval-0.0.21}/PKG-INFO +7 -3
- {judgeval-0.0.20 → judgeval-0.0.21}/Pipfile +0 -2
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/integration/langgraph.mdx +2 -1
- {judgeval-0.0.20 → judgeval-0.0.21}/pyproject.toml +7 -3
- judgeval-0.0.21/src/demo/cookbooks/test.py +152 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/tracer.py +41 -2
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/constants.py +1 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judgment_client.py +20 -3
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/run_evaluation.py +62 -8
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/api_scorer.py +3 -1
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -2
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +10 -2
- judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +28 -0
- judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +28 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +10 -3
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +9 -2
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +9 -2
- judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +27 -0
- judgeval-0.0.20/src/demo/cookbooks/JNPR_Mist/test.py +0 -21
- judgeval-0.0.20/src/demo/cookbooks/anime_chatbot_agent/animeChatBot.py +0 -443
- judgeval-0.0.20/src/demo/cookbooks/ci_testing/ci_testing.py +0 -201
- judgeval-0.0.20/src/demo/cookbooks/ci_testing/travel_response.txt +0 -52
- judgeval-0.0.20/src/demo/cookbooks/custom_scorers/competitor_mentions.py +0 -41
- judgeval-0.0.20/src/demo/cookbooks/custom_scorers/text2sql.py +0 -205
- judgeval-0.0.20/src/demo/cookbooks/jpmorgan/demo.ipynb +0 -211
- judgeval-0.0.20/src/demo/cookbooks/jpmorgan/demo.py +0 -262
- judgeval-0.0.20/src/demo/cookbooks/jpmorgan/vectordbdocs.py +0 -174
- judgeval-0.0.20/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -781
- judgeval-0.0.20/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
- judgeval-0.0.20/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -1
- judgeval-0.0.20/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -1375
- judgeval-0.0.20/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -20
- judgeval-0.0.20/src/demo/cookbooks/langgraph_basic/agent.ipynb +0 -107
- judgeval-0.0.20/src/demo/cookbooks/langgraph_basic/agent.py +0 -109
- judgeval-0.0.20/src/demo/cookbooks/linkd/text2sql.py +0 -14
- judgeval-0.0.20/src/demo/cookbooks/new_bot/basic_bot.py +0 -106
- judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/agent.py +0 -167
- judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -73
- judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/tools.py +0 -16
- judgeval-0.0.20/src/demo/cookbooks/rules_alerts/rules_bot.py +0 -132
- judgeval-0.0.20/src/demo/cookbooks/rules_alerts/rules_demo.py +0 -351
- judgeval-0.0.20/src/demo/cookbooks/rules_alerts/utils_helper.py +0 -78
- judgeval-0.0.20/src/demo/customer_use/jnpr/mist/demo.py +0 -131
- judgeval-0.0.20/src/demo/customer_use/jnpr/mist/test.yaml +0 -11
- judgeval-0.0.20/src/demo/customer_use/jnpr/srikar_demo.py +0 -0
- judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- {judgeval-0.0.20 → judgeval-0.0.21}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/.gitignore +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/LICENSE.md +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/Pipfile.lock +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/README.md +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/README.md +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/api_reference/judgment_client.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/api_reference/trace.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/development.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/comparison.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/execution_order.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/faithfulness.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/hallucination.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/json_correctness.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/scorers/summarization.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/favicon.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/getting_started.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/basic_trace_example.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/online_eval_fault.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/images/trace_ss.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/judgment/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/logo/light.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/mint.json +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/monitoring/tracing.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/pytest.ini +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/api_example.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/datasets/dataset.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/datasets/eval_dataset_client.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/datasets/utils.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/ground_truth.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/rules.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/utils/alerts.py +0 -0
{judgeval-0.0.20 → judgeval-0.0.21}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.20
+Version: 0.0.21
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,9 +12,15 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
+Requires-Dist: langchain
+Requires-Dist: langchain-anthropic
+Requires-Dist: langchain-core
+Requires-Dist: langchain-huggingface
+Requires-Dist: langchain-openai
 Requires-Dist: litellm
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: openpyxl
 Requires-Dist: pandas
 Requires-Dist: pika
 Requires-Dist: python-dotenv==1.0.1
@@ -23,8 +29,6 @@ Requires-Dist: supabase
 Requires-Dist: together
 Requires-Dist: uvicorn
 Provides-Extra: dev
-Requires-Dist: langfuse==2.50.3; extra == 'dev'
-Requires-Dist: patronus; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
 Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
 Requires-Dist: pytest>=8.3.4; extra == 'dev'
{judgeval-0.0.20 → judgeval-0.0.21}/docs/integration/langgraph.mdx
RENAMED
@@ -14,10 +14,11 @@ graph_builder = StateGraph(State)
 # YOUR LANGGRAPH WORKFLOW
 
 handler = JudgevalCallbackHandler(judgment.get_current_trace())
+set_global_handler(handler)
 
 result = graph.invoke({
     "messages": [HumanMessage(content=prompt)]
-}
+})
 
 ```
 
{judgeval-0.0.20 → judgeval-0.0.21}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.0.20"
+version = "0.0.21"
 authors = [
   { name="Andrew Li", email="andrew@judgmentlabs.ai" },
   { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -28,6 +28,12 @@ dependencies = [
     "anthropic",
     "nest-asyncio",
     "pika",
+    "openpyxl",
+    "langchain",
+    "langchain-huggingface",
+    "langchain-openai",
+    "langchain-anthropic",
+    "langchain-core",
 ]
 
 [project.optional-dependencies]
@@ -35,8 +41,6 @@ dev = [
     "pytest>=8.3.4",
     "pytest-asyncio>=0.25.0",
     "pytest-mock>=3.14.0",
-    "langfuse==2.50.3",
-    "patronus",
     "tavily-python"
 ]
 
judgeval-0.0.21/src/demo/cookbooks/test.py
@@ -0,0 +1,152 @@
+from judgeval.data import Example
+from judgeval.data.datasets import EvalDataset
+from judgeval.scorers import AnswerRelevancyScorer
+from judgeval import JudgmentClient
+
+
+def create_sample_dataset():
+    # Define sample inputs
+    inputs = [
+        # Highly relevant Q/A pairs
+        "Who founded Microsoft?",
+        "What is the capital of France?",
+        "How does photosynthesis work?",
+        "What are the benefits of exercise?",
+        "Explain quantum computing in simple terms.",
+
+        # Somewhat relevant Q/A pairs
+        "What is machine learning?",
+        "How do electric cars work?",
+        "What causes climate change?",
+        "How does the human digestive system function?",
+        "What is blockchain technology?",
+
+        # Minimally relevant Q/A pairs
+        "What are the main programming languages?",
+        "How do I bake a chocolate cake?",
+        "What is the history of the Roman Empire?",
+        "How do vaccines work?",
+        "What are black holes?",
+
+        # Not relevant Q/A pairs
+        "What is the best smartphone to buy?",
+        "How tall is Mount Everest?",
+        "Who wrote Romeo and Juliet?",
+        "What is the population of Tokyo?",
+        "How do I change a flat tire?"
+    ]
+
+    # Define corresponding outputs
+    actual_outputs = [
+        # Highly relevant answers
+        "Bill Gates and Paul Allen founded Microsoft in 1975.",
+        "The capital of France is Paris, known for the Eiffel Tower and Louvre Museum.",
+        "Photosynthesis is the process where plants convert sunlight, water, and carbon dioxide into glucose and oxygen.",
+        "Regular exercise improves cardiovascular health, builds muscle strength, reduces stress, and helps maintain a healthy weight.",
+        "Quantum computing uses quantum bits or qubits that can exist in multiple states simultaneously, allowing for potentially faster computation of certain problems compared to classical computers.",
+
+        # Somewhat relevant answers (partial or tangential information)
+        "Machine learning involves statistical techniques, but it's primarily about natural language processing and computer vision applications in modern businesses.",
+        "Electric cars use batteries, though the most important aspect is their impact on reducing traffic congestion in urban areas.",
+        "Climate change is related to weather patterns, but it's mainly caused by volcanic eruptions and natural planetary cycles.",
+        "The digestive system breaks down food, but the most interesting part is how it connects to brain function and mental health.",
+        "Blockchain is a distributed ledger technology, though its primary purpose is to replace traditional banking systems entirely.",
+
+        # Minimally relevant answers (mostly off-topic but with slight connection)
+        "Programming languages include Python and JavaScript, but the real question is whether AI will replace programmers in the next decade.",
+        "Chocolate cakes require flour and sugar, but I'd recommend focusing on gluten-free alternatives since they're healthier.",
+        "The Roman Empire lasted for centuries, but modern Italy's political system is more relevant to understand today's European politics.",
+        "Vaccines stimulate immune responses, but the pharmaceutical industry's profit motives are what you should really be concerned about.",
+        "Black holes are regions of spacetime, but the conspiracy theories about what NASA isn't telling us are far more interesting.",
+
+        # Not relevant answers (completely off-topic)
+        "The migration patterns of monarch butterflies are fascinating examples of evolutionary adaptation.",
+        "The Great Wall of China was built over multiple dynasties and stretches over 13,000 miles.",
+        "Photosynthesis is how plants convert sunlight into energy, producing oxygen as a byproduct.",
+        "The human genome contains approximately 3 billion base pairs of DNA.",
+        "The Pythagorean theorem states that in a right-angled triangle, the square of the hypotenuse equals the sum of squares of the other two sides."
+    ]
+
+    # Create Example objects from inputs and outputs
+    examples = []
+    for i in range(len(inputs)):
+        examples.append(Example(
+            input=inputs[i],
+            actual_output=actual_outputs[i]
+        ))
+
+    return EvalDataset(examples=examples)
+
+
+def save_dataset(client, dataset, alias):
+    """Save the dataset to Judgment API with the given alias"""
+    client.push_dataset(alias=alias, dataset=dataset)
+    print(f"Dataset saved with alias: {alias}")
+
+
+def run_evaluation(client, dataset_alias, model="gpt-4o", project_name="jnpr_mist_demo_project", eval_run_name="jnpr_demo_eval_run"):
+    """Pull a dataset and run an evaluation on it"""
+    # Pull the dataset from Judgment API
+    eval_dataset = client.pull_dataset(alias=dataset_alias)
+
+    # Run the evaluation
+    results = client.evaluate_dataset(
+        dataset=eval_dataset,
+        scorers=[AnswerRelevancyScorer(threshold=0.8)],
+        model=model,
+        eval_run_name=eval_run_name,
+        project_name=project_name,
+    )
+
+    return results
+
+
+def run_assertion_test(client, dataset_alias, model="gpt-4o", project_name="jnpr_mist_demo_project", eval_run_name="jnpr_demo_assertion_run"):
+    """Pull a dataset and run assertion tests on its examples"""
+    # Pull the dataset from Judgment API
+    eval_dataset = client.pull_dataset(alias=dataset_alias)
+
+    # Extract examples from the dataset
+    examples = eval_dataset.examples
+
+    # Run assertion tests on each example
+    # Run assertion test on all examples at once
+    client.assert_test(
+        examples=examples,
+        scorers=[AnswerRelevancyScorer(threshold=0.8)],
+        model=model,
+        project_name=project_name,
+        eval_run_name=eval_run_name
+    )
+
+
+def main():
+    client = JudgmentClient()
+
+    # Uncomment to create and save a new dataset
+    # dataset = create_sample_dataset()
+    # save_dataset(client, dataset, "jnpr_demo_dataset")
+
+    # # Run evaluation on the saved dataset
+    # results = run_evaluation(
+    #     client,
+    #     dataset_alias="jnpr_demo_dataset",
+    #     model="gpt-4o",
+    #     project_name="jnpr_mist_demo_project",
+    #     eval_run_name="jnpr_demo_eval"
+    # )
+
+    # Run assertion test on the saved dataset
+    results = run_assertion_test(
+        client,
+        dataset_alias="jnpr_demo_dataset",
+        model="gpt-4o",
+        project_name="jnpr_mist_demo_project",
+        eval_run_name="jnpr_demo_assertion"
+    )
+    return results
+
+
+if __name__ == "__main__":
+    results = main()
+    print(results)
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/common/tracer.py
RENAMED
@@ -10,6 +10,7 @@ import os
 import time
 import uuid
 import warnings
+from contextvars import ContextVar
 from contextlib import contextmanager
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -37,6 +38,7 @@ from judgeval.constants import (
     RABBITMQ_PORT,
     RABBITMQ_QUEUE,
     JUDGMENT_TRACES_DELETE_API_URL,
+    JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
 )
 from judgeval.judgment_client import JudgmentClient
@@ -54,7 +56,7 @@ from langchain_core.utils.function_calling import convert_to_openai_tool
 from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
 from langchain_core.agents import AgentAction, AgentFinish
 from langchain_core.outputs import LLMResult
-
+from langchain_core.tracers.context import register_configure_hook
 from langchain_core.messages.ai import AIMessage
 from langchain_core.messages.tool import ToolMessage
 from langchain_core.messages.base import BaseMessage
@@ -251,7 +253,8 @@ class TraceManagerClient:
             raise ValueError(f"Failed to save trace data: {response.text}")
 
         if not empty_save and "ui_results_url" in response.json():
-
+            pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
+            rprint(pretty_str)
 
     def delete_trace(self, trace_id: str):
         """
@@ -294,6 +297,27 @@ class TraceManagerClient:
             raise ValueError(f"Failed to delete trace: {response.text}")
 
         return response.json()
+
+    def delete_project(self, project_name: str):
+        """
+        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+        """
+        response = requests.delete(
+            JUDGMENT_PROJECT_DELETE_API_URL,
+            json={
+                "project_name": project_name,
+            },
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to delete traces: {response.text}")
+
+        return response.json()
 
 
 class TraceClient:
@@ -1152,3 +1176,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             'args': str(messages),
             'kwargs': kwargs
         })
+
+judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
+    "judgeval_callback_handler", default=None
+)
+
+def set_global_handler(handler: JudgevalCallbackHandler):
+    judgeval_callback_handler_var.set(handler)
+
+def clear_global_handler():
+    judgeval_callback_handler_var.set(None)
+
+register_configure_hook(
+    context_var=judgeval_callback_handler_var,
+    inheritable=True,
+)
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/constants.py
RENAMED
@@ -48,6 +48,7 @@ JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/judgment_client.py
RENAMED
@@ -27,7 +27,8 @@ from judgeval.judges import JudgevalJudge
 from judgeval.constants import (
     JUDGMENT_EVAL_FETCH_API_URL,
     JUDGMENT_EVAL_DELETE_API_URL,
-    JUDGMENT_EVAL_DELETE_PROJECT_API_URL
+    JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
+    JUDGMENT_PROJECT_DELETE_API_URL
 )
 from judgeval.common.exceptions import JudgmentAPIError
 from pydantic import BaseModel
@@ -156,7 +157,7 @@ class JudgmentClient:
         metadata: Optional[Dict[str, Any]] = None,
         project_name: str = "",
         eval_run_name: str = "",
-        log_results: bool =
+        log_results: bool = True,
         use_judgment: bool = True,
         rules: Optional[List[Rule]] = None
     ) -> List[ScoringResult]:
@@ -362,7 +363,6 @@ class JudgmentClient:
         response = requests.delete(JUDGMENT_EVAL_DELETE_PROJECT_API_URL,
             json={
                 "project_name": project_name,
-                "judgment_api_key": self.judgment_api_key,
             },
             headers={
                 "Content-Type": "application/json",
@@ -372,6 +372,23 @@ class JudgmentClient:
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
         return response.json()
+
+    def delete_project(self, project_name: str) -> bool:
+        """
+        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+        """
+        response = requests.delete(JUDGMENT_PROJECT_DELETE_API_URL,
+            json={
+                "project_name": project_name,
+            },
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            })
+        if response.status_code != requests.codes.ok:
+            raise ValueError(f"Error deleting project: {response.json()}")
+        return response.json()
 
     def _validate_api_key(self):
         """
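The new `JudgmentClient.delete_project` mirrors the `TraceManagerClient.delete_project` added in tracer.py. A minimal usage sketch, with a placeholder project name:

```python
from judgeval import JudgmentClient

client = JudgmentClient()  # constructed with no arguments, as in the demo script above

# Deletes the project and, per the docstring above, all evaluations and traces under it;
# raises ValueError if the API responds with a non-OK status.
client.delete_project(project_name="jnpr_mist_demo_project")  # placeholder name
```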
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/run_evaluation.py
RENAMED
@@ -1,12 +1,17 @@
 import asyncio
 import requests
-
+import time
+import sys
+import itertools
+import threading
+from typing import List, Dict, Any
 from datetime import datetime
 from rich import print as rprint
 
 from judgeval.data import (
     ScorerData,
-    ScoringResult
+    ScoringResult,
+    Example
 )
 from judgeval.scorers import (
     JudgevalScorer,
@@ -14,7 +19,6 @@ from judgeval.scorers import (
     ClassifierScorer
 )
 from judgeval.scorers.score import a_execute_scoring
-
 from judgeval.constants import (
     ROOT_API,
     JUDGMENT_EVAL_API_URL,
@@ -185,7 +189,7 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
         raise JudgmentAPIError(f"Failed to check if eval run name exists: {str(e)}")
 
 
-def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) ->
+def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run: EvaluationRun) -> str:
     """
     Logs evaluation results to the Judgment API database.
 
@@ -220,7 +224,9 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
             raise JudgmentAPIError(error_message)
 
         if "ui_results_url" in res.json():
-
+            url = res.json()['ui_results_url']
+            pretty_str = f"\n🔍 You can view your evaluation results here: [rgb(106,0,255)][link={url}]View Results[/link]\n"
+            return pretty_str
 
     except requests.exceptions.RequestException as e:
         error(f"Request failed while saving evaluation results to DB: {str(e)}")
@@ -229,6 +235,51 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
         error(f"Failed to save evaluation results to DB: {str(e)}")
         raise ValueError(f"Failed to save evaluation results to DB: {str(e)}")
 
+def run_with_spinner(message: str, func, *args, **kwargs) -> Any:
+    """Run a function with a spinner in the terminal."""
+    spinner = itertools.cycle(['|', '/', '-', '\\'])
+
+    def display_spinner():
+        while not stop_spinner_event.is_set():
+            sys.stdout.write(f'\r{message}{next(spinner)}')
+            sys.stdout.flush()
+            time.sleep(0.1)
+
+    stop_spinner_event = threading.Event()
+    spinner_thread = threading.Thread(target=display_spinner)
+    spinner_thread.start()
+
+    try:
+        result = func(*args, **kwargs)
+    except Exception as e:
+        error(f"An error occurred: {str(e)}")
+        stop_spinner_event.set()
+        spinner_thread.join()
+        raise e
+    finally:
+        stop_spinner_event.set()
+        spinner_thread.join()
+
+        sys.stdout.write('\r' + ' ' * (len(message) + 1) + '\r')
+        sys.stdout.flush()
+
+    return result
+
+def check_examples(examples: List[Example], scorers: List[APIJudgmentScorer]) -> None:
+    """
+    Checks if the example contains the necessary parameters for the scorer.
+    """
+    for scorer in scorers:
+        if isinstance(scorer, APIJudgmentScorer):
+            for example in examples:
+                missing_params = []
+                for param in scorer.required_params:
+                    if getattr(example, param.value) is None:
+                        missing_params.append(f"'{param.value}'")
+                if missing_params:
+                    # We do this because we want to inform users that an example is missing parameters for a scorer
+                    # Example ID (usually random UUID) does not provide any helpful information for the user but printing the entire example is overdoing it
+                    print(f"WARNING: Example {example.example_id} is missing the following parameters: {missing_params} for scorer {scorer.score_type.value}")
 
 
 def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[ScoringResult]:
@@ -253,7 +304,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     Returns:
         List[ScoringResult]: The results of the evaluation. Each result is a dictionary containing the fields of a `ScoringResult` object.
     """
-
+
     # Call endpoint to check to see if eval run name exists (if we DON'T want to override and DO want to log results)
     if not override and evaluation_run.log_results:
         check_eval_run_name_exists(
@@ -306,6 +357,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
 
     # Execute evaluation using Judgment API
     if judgment_scorers:
+        check_examples(evaluation_run.examples, evaluation_run.scorers)
         info("Starting API evaluation")
        debug(f"Creating API evaluation run with {len(judgment_scorers)} scorers")
         try: # execute an EvaluationRun with just JudgmentScorers
@@ -323,7 +375,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
                 rules=evaluation_run.rules
             )
             debug("Sending request to Judgment API")
-            response_data: List[Dict] =
+            response_data: List[Dict] = run_with_spinner("Running Evaluation: ", execute_api_eval, api_evaluation_run)
             info(f"Received {len(response_data['results'])} results from API")
         except JudgmentAPIError as e:
             error(f"An error occurred while executing the Judgment API request: {str(e)}")
@@ -352,6 +404,7 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
             api_results.append(ScoringResult(**filtered_result))
     # Run local evals
     if local_scorers: # List[JudgevalScorer]
+        # We should be removing local scorers soon
         info("Starting local evaluation")
         for example in evaluation_run.examples:
             with example_logging_context(example.timestamp, example.example_id):
@@ -389,7 +442,8 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     # )
 
     if evaluation_run.log_results:
-        log_evaluation_results
+        pretty_str = run_with_spinner("Logging Results: ", log_evaluation_results, merged_results, evaluation_run)
+        rprint(pretty_str)
 
     for i, result in enumerate(merged_results):
         if not result.scorers_data: # none of the scorers could be executed on this example
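The new `run_with_spinner` is a generic wrapper: it animates a spinner on stdout in a background thread while the wrapped call blocks, then clears the line and returns the call's result (re-raising any exception). A small sketch with a hypothetical slow function standing in for the API call:

```python
import time

# Assumed importable from the module shown in this diff (judgeval/run_evaluation.py)
from judgeval.run_evaluation import run_with_spinner

def slow_job(n: int) -> int:
    """Hypothetical stand-in for a blocking call such as execute_api_eval."""
    time.sleep(2)
    return n * 2

# Shows "Working: " followed by a rotating |/-\ character until slow_job returns,
# then clears the spinner line and hands back the result.
value = run_with_spinner("Working: ", slow_job, 21)
print(value)  # 42
```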
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/api_scorer.py
RENAMED
@@ -5,8 +5,9 @@ Scores `Example`s using ready-made Judgment evaluators.
 """
 
 from pydantic import BaseModel, field_validator
+from typing import List
 from judgeval.common.logger import debug, info, warning, error
-
+from judgeval.data import ExampleParams
 from judgeval.constants import APIScorer, UNBOUNDED_SCORERS
 
 
@@ -20,6 +21,7 @@ class APIJudgmentScorer(BaseModel):
     """
     score_type: APIScorer
     threshold: float
+    required_params: List[ExampleParams] = []  # List of the required parameters on examples for the scorer
 
     @field_validator('threshold')
     def validate_threshold(cls, v, info):
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py
RENAMED
@@ -8,11 +8,19 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class AnswerCorrectnessScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.ANSWER_CORRECTNESS,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py
RENAMED
@@ -8,11 +8,18 @@ TODO add link to docs page for this scorer
 # Internal imports
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
-
+from judgeval.data import ExampleParams
 
 class AnswerRelevancyScorer(APIJudgmentScorer):
     def __init__(self, threshold: float):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.ANSWER_RELEVANCY,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+            ]
+        )
 
     @property
     def __name__(self):
{judgeval-0.0.20 → judgeval-0.0.21}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
RENAMED
@@ -9,12 +9,20 @@ TODO add link to docs page for this scorer
 from judgeval.scorers.api_scorer import APIJudgmentScorer
 from judgeval.constants import APIScorer
 from typing import Optional, Dict
-
+from judgeval.data import ExampleParams
 class ComparisonScorer(APIJudgmentScorer):
     kwargs: Optional[Dict] = None
 
     def __init__(self, threshold: float, criteria: str, description: str):
-        super().__init__(
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.COMPARISON,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
         self.kwargs = {"criteria": criteria, "description": description}
 
     @property
judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py
@@ -0,0 +1,28 @@
+"""
+`judgeval` contextual precision scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from judgeval.data import ExampleParams
+
+class ContextualPrecisionScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_PRECISION,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+                ExampleParams.EXPECTED_OUTPUT,
+            ]
+        )
+
+    @property
+    def __name__(self):
+        return "Contextual Precision"
judgeval-0.0.21/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py
@@ -0,0 +1,28 @@
+"""
+`judgeval` contextual recall scorer
+
+TODO add link to docs page for this scorer
+
+"""
+
+# Internal imports
+from judgeval.scorers.api_scorer import APIJudgmentScorer
+from judgeval.constants import APIScorer
+from judgeval.data import ExampleParams
+
+
+class ContextualRecallScorer(APIJudgmentScorer):
+    def __init__(self, threshold: float):
+        super().__init__(
+            threshold=threshold,
+            score_type=APIScorer.CONTEXTUAL_RECALL,
+            required_params=[
+                ExampleParams.INPUT,
+                ExampleParams.ACTUAL_OUTPUT,
+                ExampleParams.EXPECTED_OUTPUT,
+                ExampleParams.RETRIEVAL_CONTEXT,
+            ]
+        )
+    @property
+    def __name__(self):
+        return "Contextual Recall"
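Tying the pieces together, a hedged sketch of how the new contextual scorers and the `required_params` checking might be exercised. The `expected_output` and `retrieval_context` keyword arguments on `Example`, and the re-export of `ContextualRecallScorer` from `judgeval.scorers`, are assumptions inferred from the `ExampleParams` values above and from how `AnswerRelevancyScorer` is imported in the demo script:

```python
from judgeval import JudgmentClient
from judgeval.data import Example
from judgeval.scorers import ContextualRecallScorer  # assumed re-export, as with AnswerRelevancyScorer

# Fields mirror the ExampleParams the scorer declares; leaving one of them None
# should now surface a check_examples() warning during run_eval instead of failing silently.
example = Example(
    input="What is the capital of France?",
    actual_output="The capital of France is Paris.",
    expected_output="Paris",
    retrieval_context=["Paris is the capital and largest city of France."],
)

client = JudgmentClient()
client.assert_test(
    examples=[example],
    scorers=[ContextualRecallScorer(threshold=0.7)],
    model="gpt-4o",
    project_name="contextual_recall_demo",   # hypothetical project / run names
    eval_run_name="contextual_recall_run",
)
```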