judgeval 0.0.20__tar.gz → 0.0.22__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.20 → judgeval-0.0.22}/PKG-INFO +7 -3
- {judgeval-0.0.20 → judgeval-0.0.22}/Pipfile +0 -2
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/integration/langgraph.mdx +2 -1
- {judgeval-0.0.20 → judgeval-0.0.22}/pyproject.toml +7 -3
- judgeval-0.0.22/src/demo/custom_example_demo/qodo_example.py +39 -0
- judgeval-0.0.22/src/demo/custom_example_demo/test.py +16 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/tracer.py +41 -2
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/constants.py +1 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/__init__.py +2 -3
- judgeval-0.0.22/src/judgeval/data/custom_example.py +98 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/datasets/dataset.py +17 -124
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/datasets/eval_dataset_client.py +5 -11
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judgment_client.py +23 -7
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/run_evaluation.py +62 -8
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/api_scorer.py +3 -1
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +9 -2
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +10 -2
- judgeval-0.0.22/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +28 -0
- judgeval-0.0.22/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +28 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +10 -3
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +10 -2
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +9 -2
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +9 -2
- judgeval-0.0.22/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +27 -0
- judgeval-0.0.22/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval-0.0.20/src/demo/cookbooks/anime_chatbot_agent/animeChatBot.py +0 -443
- judgeval-0.0.20/src/demo/cookbooks/ci_testing/ci_testing.py +0 -201
- judgeval-0.0.20/src/demo/cookbooks/ci_testing/travel_response.txt +0 -52
- judgeval-0.0.20/src/demo/cookbooks/custom_scorers/competitor_mentions.py +0 -41
- judgeval-0.0.20/src/demo/cookbooks/custom_scorers/text2sql.py +0 -205
- judgeval-0.0.20/src/demo/cookbooks/jpmorgan/demo.ipynb +0 -211
- judgeval-0.0.20/src/demo/cookbooks/jpmorgan/demo.py +0 -262
- judgeval-0.0.20/src/demo/cookbooks/jpmorgan/vectordbdocs.py +0 -174
- judgeval-0.0.20/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -781
- judgeval-0.0.20/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
- judgeval-0.0.20/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -1
- judgeval-0.0.20/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -1375
- judgeval-0.0.20/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -20
- judgeval-0.0.20/src/demo/cookbooks/langgraph_basic/agent.ipynb +0 -107
- judgeval-0.0.20/src/demo/cookbooks/langgraph_basic/agent.py +0 -109
- judgeval-0.0.20/src/demo/cookbooks/new_bot/basic_bot.py +0 -106
- judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/agent.py +0 -167
- judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -73
- judgeval-0.0.20/src/demo/cookbooks/openai_travel_agent/tools.py +0 -16
- judgeval-0.0.20/src/demo/cookbooks/rules_alerts/rules_bot.py +0 -132
- judgeval-0.0.20/src/demo/cookbooks/rules_alerts/rules_demo.py +0 -351
- judgeval-0.0.20/src/demo/cookbooks/rules_alerts/utils_helper.py +0 -78
- judgeval-0.0.20/src/demo/customer_use/jnpr/mist/demo.py +0 -131
- judgeval-0.0.20/src/demo/customer_use/jnpr/mist/test.yaml +0 -11
- judgeval-0.0.20/src/judgeval/data/datasets/utils.py +0 -73
- judgeval-0.0.20/src/judgeval/data/ground_truth.py +0 -54
- judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- {judgeval-0.0.20 → judgeval-0.0.22}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/.gitignore +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/LICENSE.md +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/Pipfile.lock +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/README.md +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/README.md +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/api_reference/judgment_client.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/api_reference/trace.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/development.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/comparison.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/execution_order.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/faithfulness.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/hallucination.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/json_correctness.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/scorers/summarization.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/favicon.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/getting_started.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/basic_trace_example.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/online_eval_fault.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/images/trace_ss.png +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/judgment/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/logo/light.svg +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/mint.json +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/monitoring/tracing.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/pytest.ini +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/demo/cookbooks/JNPR_Mist/test.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/demo/cookbooks/linkd/text2sql.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/api_example.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/datasets/__init__.py +0 -0
- /judgeval-0.0.20/src/demo/customer_use/jnpr/srikar_demo.py → /judgeval-0.0.22/src/judgeval/data/datasets/utils.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/example.py +0 -0
- /judgeval-0.0.20/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py → /judgeval-0.0.22/src/judgeval/data/ground_truth.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/rules.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/execution_order/execution_order.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/utils/alerts.py +0 -0
{judgeval-0.0.20 → judgeval-0.0.22}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.20
+Version: 0.0.22
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -12,9 +12,15 @@ Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.11
 Requires-Dist: anthropic
 Requires-Dist: fastapi
+Requires-Dist: langchain
+Requires-Dist: langchain-anthropic
+Requires-Dist: langchain-core
+Requires-Dist: langchain-huggingface
+Requires-Dist: langchain-openai
 Requires-Dist: litellm
 Requires-Dist: nest-asyncio
 Requires-Dist: openai
+Requires-Dist: openpyxl
 Requires-Dist: pandas
 Requires-Dist: pika
 Requires-Dist: python-dotenv==1.0.1
@@ -23,8 +29,6 @@ Requires-Dist: supabase
 Requires-Dist: together
 Requires-Dist: uvicorn
 Provides-Extra: dev
-Requires-Dist: langfuse==2.50.3; extra == 'dev'
-Requires-Dist: patronus; extra == 'dev'
 Requires-Dist: pytest-asyncio>=0.25.0; extra == 'dev'
 Requires-Dist: pytest-mock>=3.14.0; extra == 'dev'
 Requires-Dist: pytest>=8.3.4; extra == 'dev'
{judgeval-0.0.20 → judgeval-0.0.22}/docs/integration/langgraph.mdx

@@ -14,10 +14,11 @@ graph_builder = StateGraph(State)
 # YOUR LANGGRAPH WORKFLOW
 
 handler = JudgevalCallbackHandler(judgment.get_current_trace())
+set_global_handler(handler)
 
 result = graph.invoke({
     "messages": [HumanMessage(content=prompt)]
-}
+})
 
 ```
 
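The new `set_global_handler` call in the docs pairs with helpers added to `tracer.py` later in this diff. Below is a minimal sketch of how the registration might look in user code; it assumes the helpers are imported from `judgeval.common.tracer` (inferred from the file layout, not shown in the docs) and that `judgment`, `graph`, and `prompt` are already set up as in the surrounding LangGraph example.

```python
# Hedged sketch: import path assumed from src/judgeval/common/tracer.py;
# `judgment` (a judgeval Tracer), `graph`, and `prompt` come from the docs snippet above.
from langchain_core.messages import HumanMessage
from judgeval.common.tracer import (
    JudgevalCallbackHandler,
    set_global_handler,
    clear_global_handler,
)

handler = JudgevalCallbackHandler(judgment.get_current_trace())

set_global_handler(handler)    # new in 0.0.22: stores the handler in a ContextVar
try:
    # LangChain's register_configure_hook (see the tracer.py diff below) attaches the
    # handler to runs automatically, so no callbacks kwarg is needed here.
    result = graph.invoke({"messages": [HumanMessage(content=prompt)]})
finally:
    clear_global_handler()     # reset the ContextVar once the traced run finishes
```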
{judgeval-0.0.20 → judgeval-0.0.22}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "judgeval"
-version = "0.0.20"
+version = "0.0.22"
 authors = [
     { name="Andrew Li", email="andrew@judgmentlabs.ai" },
     { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -28,6 +28,12 @@ dependencies = [
     "anthropic",
     "nest-asyncio",
     "pika",
+    "openpyxl",
+    "langchain",
+    "langchain-huggingface",
+    "langchain-openai",
+    "langchain-anthropic",
+    "langchain-core",
 ]
 
 [project.optional-dependencies]
@@ -35,8 +41,6 @@ dev = [
     "pytest>=8.3.4",
     "pytest-asyncio>=0.25.0",
     "pytest-mock>=3.14.0",
-    "langfuse==2.50.3",
-    "patronus",
     "tavily-python"
 ]
 
judgeval-0.0.22/src/demo/custom_example_demo/qodo_example.py

@@ -0,0 +1,39 @@
+from judgeval.data import CustomExample
+from pydantic import field_validator
+
+class QodoExample(CustomExample):
+    code: str
+    original_code: str
+
+    def __init__(self, **data):
+        super().__init__(**data)
+
+    @field_validator('code', 'original_code', mode='before')
+    @classmethod
+    def validate_code(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Code must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    def to_dict(self):
+        return {
+            "code": self.code,
+            "original_code": self.original_code,
+            **super().to_dict()
+        }
+
+    def model_dump(self, **kwargs):
+        """
+        Custom serialization that handles special cases for fields that might fail standard serialization.
+        """
+        data = super().model_dump(**kwargs)
+
+        # Do any additional serialization here
+        data["code"] = self.code
+        data["original_code"] = self.original_code
+
+        return data
+
+
+
+
judgeval-0.0.22/src/demo/custom_example_demo/test.py

@@ -0,0 +1,16 @@
+from judgeval.data import CustomExample
+from judgeval import JudgmentClient
+from qodo_example import QodoExample
+
+custom_example = CustomExample(
+    code="print('Hello, world!')",
+    original_code="print('Hello, world!')",
+)
+
+qodo_example = QodoExample(
+    code="print('Hello, world!')",
+    original_code="print('Hello, world!')",
+)
+
+print(f"{custom_example=}")
+print(f"{qodo_example=}")
{judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/common/tracer.py

@@ -10,6 +10,7 @@ import os
 import time
 import uuid
 import warnings
+from contextvars import ContextVar
 from contextlib import contextmanager
 from collections import defaultdict
 from dataclasses import dataclass, field
@@ -37,6 +38,7 @@ from judgeval.constants import (
     RABBITMQ_PORT,
     RABBITMQ_QUEUE,
     JUDGMENT_TRACES_DELETE_API_URL,
+    JUDGMENT_PROJECT_DELETE_API_URL,
     JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
 )
 from judgeval.judgment_client import JudgmentClient
@@ -54,7 +56,7 @@ from langchain_core.utils.function_calling import convert_to_openai_tool
 from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
 from langchain_core.agents import AgentAction, AgentFinish
 from langchain_core.outputs import LLMResult
-
+from langchain_core.tracers.context import register_configure_hook
 from langchain_core.messages.ai import AIMessage
 from langchain_core.messages.tool import ToolMessage
 from langchain_core.messages.base import BaseMessage
@@ -251,7 +253,8 @@ class TraceManagerClient:
             raise ValueError(f"Failed to save trace data: {response.text}")
 
         if not empty_save and "ui_results_url" in response.json():
-
+            pretty_str = f"\n🔍 You can view your trace data here: [rgb(106,0,255)][link={response.json()['ui_results_url']}]View Trace[/link]\n"
+            rprint(pretty_str)
 
     def delete_trace(self, trace_id: str):
         """
@@ -294,6 +297,27 @@ class TraceManagerClient:
             raise ValueError(f"Failed to delete trace: {response.text}")
 
         return response.json()
+
+    def delete_project(self, project_name: str):
+        """
+        Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
+        """
+        response = requests.delete(
+            JUDGMENT_PROJECT_DELETE_API_URL,
+            json={
+                "project_name": project_name,
+            },
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+                "X-Organization-Id": self.organization_id
+            }
+        )
+
+        if response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to delete traces: {response.text}")
+
+        return response.json()
 
 
 class TraceClient:
@@ -1152,3 +1176,18 @@ class JudgevalCallbackHandler(BaseCallbackHandler):
             'args': str(messages),
             'kwargs': kwargs
         })
+
+judgeval_callback_handler_var: ContextVar[Optional[JudgevalCallbackHandler]] = ContextVar(
+    "judgeval_callback_handler", default=None
+)
+
+def set_global_handler(handler: JudgevalCallbackHandler):
+    judgeval_callback_handler_var.set(handler)
+
+def clear_global_handler():
+    judgeval_callback_handler_var.set(None)
+
+register_configure_hook(
+    context_var=judgeval_callback_handler_var,
+    inheritable=True,
+)
{judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/constants.py

@@ -48,6 +48,7 @@ JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
 JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
+JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete/"
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
{judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/__init__.py

@@ -2,8 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
-from judgeval.data.ground_truth import GroundTruthExample
-
+from judgeval.data.custom_example import CustomExample
 __all__ = [
     "Example",
     "ExampleParams",
@@ -13,5 +12,5 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
-    "GroundTruthExample",
+    "CustomExample",
 ]
judgeval-0.0.22/src/judgeval/data/custom_example.py

@@ -0,0 +1,98 @@
+from pydantic import BaseModel, Field, field_validator
+from typing import Optional, Dict, Any
+from uuid import uuid4
+from datetime import datetime
+import json
+import warnings
+
+# Brainstorming what are the requirements for the fields?
+class CustomExample(BaseModel):
+    name: Optional[str] = None
+    additional_metadata: Optional[Dict[str, Any]] = None
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
+    timestamp: Optional[str] = None
+    trace_id: Optional[str] = None
+
+    model_config = {
+        "extra": "allow",  # Allow extra fields with any types
+    }
+
+    def __init__(self, **data):
+        if 'example_id' not in data:
+            data['example_id'] = str(uuid4())
+        # Set timestamp if not provided
+        if 'timestamp' not in data:
+            data['timestamp'] = datetime.now().isoformat()
+        super().__init__(**data)
+
+    @field_validator('additional_metadata', mode='before')
+    @classmethod
+    def validate_additional_metadata(cls, v):
+        if v is not None and not isinstance(v, dict):
+            raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('example_index', mode='before')
+    @classmethod
+    def validate_example_index(cls, v):
+        if v is not None and not isinstance(v, int):
+            raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('timestamp', mode='before')
+    @classmethod
+    def validate_timestamp(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('trace_id', mode='before')
+    @classmethod
+    def validate_trace_id(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    def to_dict(self):
+        return self.model_dump()
+
+    def __str__(self):
+        return str(self.model_dump())
+
+    def model_dump(self, **kwargs):
+        """
+        Custom serialization that handles special cases for fields that might fail standard serialization.
+        """
+        data = super().model_dump(**kwargs)
+
+        # Get all fields including custom ones
+        all_fields = self.__dict__
+
+        for field_name, value in all_fields.items():
+            try:
+                # Check if the field has its own serialization method
+                if hasattr(value, 'to_dict'):
+                    data[field_name] = value.to_dict()
+                elif hasattr(value, 'model_dump'):
+                    data[field_name] = value.model_dump()
+                # Field is already in data from super().model_dump()
+                elif field_name in data:
+                    continue
+                else:
+                    # Try standard JSON serialization
+                    json.dumps(value)
+                    data[field_name] = value
+            except (TypeError, OverflowError, ValueError):
+                # Handle non-serializable objects
+                try:
+                    # Try converting to string
+                    data[field_name] = str(value)
+                except Exception as _:
+                    # If all else fails, store as None and optionally warn
+                    warnings.warn(f"Could not serialize field {field_name}, setting to None")
+                    data[field_name] = None
+
+        return data
+
+
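`CustomExample` accepts arbitrary extra fields (`model_config = {"extra": "allow"}`) and auto-fills `example_id` and `timestamp` when they are omitted, which is how the `QodoExample` demo above can pass `code`/`original_code`. A small usage sketch; the field names `question` and `expected_answer` are arbitrary illustrations, not part of the model.

```python
# Illustrative sketch only: extra field names are made up; CustomExample is the
# new class from judgeval.data.custom_example shown above.
from judgeval.data import CustomExample

example = CustomExample(
    name="rag-case-1",
    additional_metadata={"source": "unit-test"},
    question="What is Judgeval?",        # extra field, accepted because extra="allow"
    expected_answer="An evaluation SDK",  # another extra field
)

dumped = example.model_dump()  # to_dict() delegates to this
print(dumped["example_id"])    # auto-generated UUID (filled in __init__ when omitted)
print(dumped["timestamp"])     # ISO timestamp, also auto-filled
print(dumped["question"])      # extra fields are carried through serialization
```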
{judgeval-0.0.20 → judgeval-0.0.22}/src/judgeval/data/datasets/dataset.py

@@ -7,12 +7,11 @@ import yaml
 from dataclasses import dataclass, field
 from typing import List, Union, Literal
 
-from judgeval.data import Example, GroundTruthExample
+from judgeval.data import Example
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
 class EvalDataset:
-    ground_truths: List[GroundTruthExample]
     examples: List[Example]
     _alias: Union[str, None] = field(default=None)
     _id: Union[str, None] = field(default=None)
@@ -21,13 +20,11 @@ class EvalDataset:
     def __init__(self,
                  judgment_api_key: str = os.getenv("JUDGMENT_API_KEY"),
                  organization_id: str = os.getenv("JUDGMENT_ORG_ID"),
-                 ground_truths: List[GroundTruthExample] = [],
                  examples: List[Example] = [],
                  ):
-        debug(f"Initializing EvalDataset with {len(
+        debug(f"Initializing EvalDataset with {len(examples)} examples")
         if not judgment_api_key:
             warning("No judgment_api_key provided")
-        self.ground_truths = ground_truths
         self.examples = examples
         self._alias = None
         self._id = None
@@ -37,38 +34,13 @@ class EvalDataset:
     def add_from_json(self, file_path: str) -> None:
         debug(f"Loading dataset from JSON file: {file_path}")
         """
-        Adds examples
+        Adds examples from a JSON file.
 
-        The format of the JSON file is expected to be a dictionary with
-        The value of
+        The format of the JSON file is expected to be a dictionary with one key: "examples".
+        The value of the key is a list of dictionaries, where each dictionary represents an example.
 
         The JSON file is expected to have the following format:
         {
-            "ground_truths": [
-                {
-                    "input": "test input",
-                    "actual_output": null,
-                    "expected_output": "expected output",
-                    "context": [
-                        "context1"
-                    ],
-                    "retrieval_context": [
-                        "retrieval1"
-                    ],
-                    "additional_metadata": {
-                        "key": "value"
-                    },
-                    "comments": "test comment",
-                    "tools_called": [
-                        "tool1"
-                    ],
-                    "expected_tools": [
-                        "tool1"
-                    ],
-                    "source_file": "test.py",
-                    "trace_id": "094121"
-                }
-            ],
             "examples": [
                 {
                     "input": "test input",
@@ -103,7 +75,6 @@ class EvalDataset:
             with open(file_path, "r") as file:
                 payload = json.load(file)
                 examples = payload.get("examples", [])
-                ground_truths = payload.get("ground_truths", [])
         except FileNotFoundError:
             error(f"JSON file not found: {file_path}")
             raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -111,21 +82,17 @@ class EvalDataset:
             error(f"Invalid JSON file: {file_path}")
             raise ValueError(f"The file {file_path} is not a valid JSON file.")
 
-        info(f"Added {len(examples)} examples
+        info(f"Added {len(examples)} examples from JSON")
         new_examples = [Example(**e) for e in examples]
         for e in new_examples:
             self.add_example(e)
-
-        new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
-        for g in new_ground_truths:
-            self.add_ground_truth(g)
 
     def add_from_csv(
         self,
         file_path: str,
     ) -> None:
         """
-        Add Examples
+        Add Examples from a CSV file.
         """
         try:
             import pandas as pd
@@ -144,14 +111,14 @@ class EvalDataset:
            "expected_tools", "name", "comments", "source_file", "example", \
            "trace_id"
 
-           We want to collect the examples
+           We want to collect the examples separately which can
            be determined by the "example" column. If the value is True, then it is an
-           example
+           example
 
            We also assume that if there are multiple retrieval contexts or contexts, they are separated by semicolons.
            This can be adjusted using the `context_delimiter` and `retrieval_context_delimiter` parameters.
            """
-            examples
+            examples = []
 
            for _, row in df.iterrows():
                data = {
@@ -174,49 +141,20 @@ class EvalDataset:
                    examples.append(e)
                else:
                    raise ValueError("Every example must have an 'input' and 'actual_output' field.")
-
-                # GroundTruthExample has `comments` and `source_file` fields
-                data["comments"] = row["comments"] if pd.notna(row["comments"]) else None
-                data["source_file"] = row["source_file"] if pd.notna(row["source_file"]) else None
-                # every GroundTruthExample has `input` field
-                if data["input"] is not None:
-                    g = GroundTruthExample(**data)
-                    ground_truths.append(g)
-                else:
-                    raise ValueError("Every ground truth must have an 'input' field.")
+
 
            for e in examples:
                self.add_example(e)
 
-            for g in ground_truths:
-                self.add_ground_truth(g)
-
    def add_from_yaml(self, file_path: str) -> None:
        debug(f"Loading dataset from YAML file: {file_path}")
        """
-        Adds examples
+        Adds examples from a YAML file.
 
-        The format of the YAML file is expected to be a dictionary with
-        The value of
+        The format of the YAML file is expected to be a dictionary with one key: "examples".
+        The value of the key is a list of dictionaries, where each dictionary represents an example.
 
        The YAML file is expected to have the following format:
-        ground_truths:
-        - input: "test input"
-          actual_output: null
-          expected_output: "expected output"
-          context:
-          - "context1"
-          retrieval_context:
-          - "retrieval1"
-          additional_metadata:
-            key: "value"
-          comments: "test comment"
-          tools_called:
-          - "tool1"
-          expected_tools:
-          - "tool1"
-          source_file: "test.py"
-          trace_id: "094121"
        examples:
        - input: "test input"
          actual_output: "test output"
@@ -244,7 +182,6 @@ class EvalDataset:
            if payload is None:
                raise ValueError("The YAML file is empty.")
            examples = payload.get("examples", [])
-            ground_truths = payload.get("ground_truths", [])
        except FileNotFoundError:
            error(f"YAML file not found: {file_path}")
            raise FileNotFoundError(f"The file {file_path} was not found.")
@@ -252,25 +189,18 @@ class EvalDataset:
            error(f"Invalid YAML file: {file_path}")
            raise ValueError(f"The file {file_path} is not a valid YAML file.")
 
-        info(f"Added {len(examples)} examples
+        info(f"Added {len(examples)} examples from YAML")
        new_examples = [Example(**e) for e in examples]
        for e in new_examples:
            self.add_example(e)
 
-        new_ground_truths = [GroundTruthExample(**g) for g in ground_truths]
-        for g in new_ground_truths:
-            self.add_ground_truth(g)
-
    def add_example(self, e: Example) -> None:
        self.examples = self.examples + [e]
        # TODO if we need to add rank, then we need to do it here
-
-    def add_ground_truth(self, g: GroundTruthExample) -> None:
-        self.ground_truths = self.ground_truths + [g]
 
    def save_as(self, file_type: Literal["json", "csv", "yaml"], dir_path: str, save_name: str = None) -> None:
        """
-        Saves the dataset as a file. Save
+        Saves the dataset as a file. Save only the examples.
 
        Args:
            file_type (Literal["json", "csv"]): The file type to save the dataset as.
@@ -285,7 +215,6 @@ class EvalDataset:
            with open(complete_path, "w") as file:
                json.dump(
                    {
-                        "ground_truths": [g.to_dict() for g in self.ground_truths],
                        "examples": [e.to_dict() for e in self.examples],
                    },
                    file,
@@ -319,24 +248,7 @@ class EvalDataset:
                    ]
                )
 
-
-                writer.writerow(
-                    [
-                        g.input,
-                        g.actual_output,
-                        g.expected_output,
-                        ";".join(g.context),
-                        ";".join(g.retrieval_context),
-                        g.additional_metadata,
-                        ";".join(g.tools_called),
-                        ";".join(g.expected_tools),
-                        None, # GroundTruthExample does not have name
-                        g.comments,
-                        g.source_file,
-                        False, # Adding a GroundTruthExample, not an Example
-                        g.trace_id
-                    ]
-                )
+
 
        elif file_type == "yaml":
            with open(complete_path, "w") as file:
                yaml_data = {
@@ -358,24 +270,6 @@ class EvalDataset:
                        }
                        for e in self.examples
                    ],
-                    "ground_truths": [
-                        {
-                            "input": g.input,
-                            "actual_output": g.actual_output,
-                            "expected_output": g.expected_output,
-                            "context": g.context,
-                            "retrieval_context": g.retrieval_context,
-                            "additional_metadata": g.additional_metadata,
-                            "tools_called": g.tools_called,
-                            "expected_tools": g.expected_tools,
-                            "name": None, # GroundTruthExample does not have name
-                            "comments": g.comments,
-                            "source_file": g.source_file,
-                            "example": False, # Adding a GroundTruthExample, not an Example
-                            "trace_id": g.trace_id
-                        }
-                        for g in self.ground_truths
-                    ]
                }
                yaml.dump(yaml_data, file, default_flow_style=False)
            else:
@@ -391,7 +285,6 @@ class EvalDataset:
    def __str__(self):
        return (
            f"{self.__class__.__name__}("
-            f"ground_truths={self.ground_truths}, "
            f"examples={self.examples}, "
            f"_alias={self._alias}, "
            f"_id={self._id}"