judgeval 0.0.6__tar.gz → 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {judgeval-0.0.6 → judgeval-0.0.7}/PKG-INFO +3 -1
- {judgeval-0.0.6 → judgeval-0.0.7}/pyproject.toml +3 -1
- judgeval-0.0.6/src/test.txt +0 -51
- judgeval-0.0.6/test.txt +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/.gitignore +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/LICENSE.md +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/Pipfile +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/Pipfile.lock +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/README.md +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/README.md +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/development.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/faithfulness.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/hallucination.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/json_correctness.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/summarization.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/favicon.svg +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/getting_started.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/images/trace_screenshot.png +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/introduction.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/judgment/introduction.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/logo/light.svg +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/mint.json +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/monitoring/tracing.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/pytest.ini +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/openai_travel_agent/agent.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/common/tracer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/constants.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/data/api_example.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/data/datasets/dataset.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/data/datasets/ground_truth.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/data/datasets/utils.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/judgment_client.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/run_evaluation.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/base_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: judgeval
|
3
|
-
Version: 0.0.6
|
3
|
+
Version: 0.0.7
|
4
4
|
Summary: Judgeval Package
|
5
5
|
Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
|
6
6
|
Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
|
@@ -14,9 +14,11 @@ Requires-Dist: anthropic
|
|
14
14
|
Requires-Dist: fastapi
|
15
15
|
Requires-Dist: langfuse==2.50.3
|
16
16
|
Requires-Dist: litellm
|
17
|
+
Requires-Dist: nest-asyncio
|
17
18
|
Requires-Dist: openai
|
18
19
|
Requires-Dist: pandas
|
19
20
|
Requires-Dist: patronus
|
21
|
+
Requires-Dist: pika
|
20
22
|
Requires-Dist: python-dotenv==1.0.1
|
21
23
|
Requires-Dist: requests
|
22
24
|
Requires-Dist: supabase
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "judgeval"
|
3
|
-
version = "0.0.6"
|
3
|
+
version = "0.0.7"
|
4
4
|
authors = [
|
5
5
|
{ name="Andrew Li", email="andrew@judgmentlabs.ai" },
|
6
6
|
{ name="Alex Shan", email="alex@judgmentlabs.ai" },
|
@@ -28,6 +28,8 @@ dependencies = [
|
|
28
28
|
"together",
|
29
29
|
"anthropic",
|
30
30
|
"patronus",
|
31
|
+
"nest-asyncio",
|
32
|
+
"pika",
|
31
33
|
]
|
32
34
|
|
33
35
|
[project.optional-dependencies]
|
judgeval-0.0.6/src/test.txt
DELETED
@@ -1,51 +0,0 @@
|
|
1
|
-
Successfully initialized JudgmentClient, welcome back user!
|
2
|
-
Client initialized successfully
|
3
|
-
****************************************
|
4
|
-
Testing dataset creation, pushing, and pulling
|
5
|
-
⠸ Pushing 'test_dataset_5' to Judgment... Done!)
|
6
|
-
⠸ Pulling 'test_dataset_5' from Judgment... Done!)
|
7
|
-
EvalDataset(ground_truths=[], examples=[Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='c4df51db-72d4-461b-ba86-655f15148b69', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='6d01d187-1f53-4e98-bf61-22f9af1c6adc', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='dc422251-de24-43ed-b41e-351481c3e25a', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='e34d09a8-667b-4bdd-a497-8070569e2294', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='0ecafcd0-6677-4c83-a980-ac2315eaedd6', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='1b3a7df4-8e39-48d2-afa4-153d3f03a864', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='c3864e19-cb3b-4d45-9878-9c5a1b1657e2', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='89ea1759-2ba7-4302-9229-c84724df8413', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='9eaf57d8-1acf-4f42-b4ee-7ecd765b8004', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f213abe2-3e77-4c28-8b7d-d86e17f6bd75', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='ce125a22-52cc-4223-a491-4d8f242a8200', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='a518c2f0-6c09-4cfd-a080-f96f5e73514f', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='2808e88c-a0f1-4fc0-aad3-1412c4c016f9', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='385580d9-dab2-4ced-9731-de52582f1de1', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f3b16b04-6ef5-4a98-865a-a7c3622eed2c', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='280cf554-ef8c-451d-84c0-84fd8279dd9b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='a03f4af9-6089-4f47-8cf9-0e15b2d10612', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='e634bd99-003c-4f66-acf7-2d9c3f762559', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='9beb19cc-fc79-4033-b676-73049cdee9bd', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='197d375b-3ea3-46d2-b1b9-04d818502a40', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='53e87725-46c9-4de1-b7dd-e9ceed371aec', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='5a590228-aa3e-4b9a-a9ba-eb1e7bc8bc63', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='cacd7863-ed15-4e41-9ef9-43e6662f4008', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='64ffdd28-91cb-40c4-ad7c-7fb54dd2ad78', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='4bde5f0b-ee70-4206-b4e0-7269cb82056d', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='c3b6ab4f-bca3-4381-9f48-0283aaba4949', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='a9b205c7-5530-45b4-85aa-4b06bd4f80c4', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b0fd70f2-1dc0-4b21-9d21-c92b9bd0c0af', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d06b1555-428f-4f19-8db1-09d76600add7', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='3ad40653-4dba-4e89-969e-c9cebdfec0b9', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f9fd7ac8-1fcd-4f7b-802f-1b5c839ef066', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='137c9d30-66be-4447-ba26-5b5b8950d0c0', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='3e4f550b-b4ae-43ca-a23b-755fd7f908de', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='fd6fb983-fd38-42ae-a256-6d4400fce0f0', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='0c41364d-2741-4e43-889f-df8b1c0e7ea5', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='73d36dd2-1bcc-4c25-9be1-16fbd6e1ecb4', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='95fcac29-5c78-420a-bc7c-43465e3a0bb5', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='1c0e7058-1291-4074-b53c-c6fb621e8c20', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='6e7fff45-1c2d-474b-8dd3-5084e134d81b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d49628aa-1447-4507-8258-26f28c39b731', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='741c7a45-e17a-4483-bd96-10550ee4258a', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f804341e-8f46-49a9-8fcb-293540a76d1d', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b60185d9-d6b9-4bdc-84bc-e0c6ced6ede9', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d95bd53f-856e-4150-8626-700a3e54d123', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d03661f3-86ae-456f-b072-105ee8fa7a83', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='944a4fbe-4c28-4951-bad2-4ad39fc9d484', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='418d4793-9070-45fe-bf39-fc5d5aa83a4b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='073b935d-f7a4-4719-8f04-f0ee599ef8c0', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='ed299a25-ae42-4fa6-8692-c89f29453fa9', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='5309635c-70a1-43b6-beb5-8b7f0c04ce4b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='50fbbf0a-164e-4bda-ada8-e777ccc14c6a', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='83f8e19a-d8db-4d54-9d44-07472149cd30', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='617fcf2b-8fa7-48a5-80d6-fb2af444fd18', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='48feda48-ada8-41b0-ba72-b9c9287be7c3', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='3f1618f2-f10f-4d19-b3a4-ff03a6553bf7', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='69274744-f2ec-41f2-acf7-6cf689d7e50b', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='9317bf9e-0573-4506-ac10-25056c22d17a', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8ad15f4e-2bdc-4caf-9c96-520ec3a12e8b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='9c7386ac-ffef-4455-8892-530d57e75afd', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='49c7bd9f-38b1-4818-b06f-b9cb2b7685ed', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='20f0e5ef-056e-44a2-b479-4445e72fa68c', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='faa08bc1-522c-4dd9-a515-0e7b75869db6', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8871e84f-4f6e-4716-ae42-c267feb6e4dd', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8ba0a8ba-deab-4439-af32-aa99f4152a87', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='487437d4-2486-4497-9382-8d785111ef19', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b6a43089-8818-4184-8743-b1ea49b52495', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='90caa6f9-700a-4fd8-a36a-83815fb877a6', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='32e54a20-e664-4ebc-b11a-ccc1480c3c31', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='ff439af1-4467-43d7-8b91-a05905b6653f', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='1f3e6f13-b94f-4718-ba1e-5832b39dd55b', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='c747542d-5cd4-447c-b046-4c5ec032d801', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d8f0f44e-464d-4a5a-89f6-24a89cb1d5a1', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8f825ef0-2407-47fb-a556-31ee40cb728c', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b6aa1919-da2f-463b-aa69-d5f09cc8cb84', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='18e847c9-af9e-486b-be8a-457d56464d64', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b802b615-1ca6-4669-b2e3-e5957f37a8c2', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='29042ec2-13a3-4444-9dcb-74b9c300b20d', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='47ff1050-d317-43b0-a61b-6e7ac235ff37', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='34d509e2-485a-4417-a2e6-9e82e68df01f', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f95a78a0-1f7b-41ab-bd56-204b73327f1d', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='53526611-cb49-427f-9682-a6e9b1c94258', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='e1f88792-5f4e-4f9a-8a26-214168d9ad19', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='42e1663d-5a26-41f4-b7e8-4b4556dc32f4', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8bcb2d90-3e1f-4934-b763-e62e5c08e617', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b3d4f5db-943c-4a40-a129-228c0bd0be98', timestamp='20250205_100333', trace_id=None)], _alias=None, _id=None)
|
8
|
-
Dataset creation, pushing, and pulling successful
|
9
|
-
****************************************
|
10
|
-
Testing evaluation run
|
11
|
-
|
12
|
-
🔍 You can view your evaluation results here:
|
13
|
-
https://app.judgmentlabs.ai/app/evalrun?project_name=OutreachWorkflow&eval_run_name=ColdEmailGenerator-Improve-BasePrompt
|
14
|
-
|
15
|
-
Evaluation results for ColdEmailGenerator-Improve-BasePrompt from database: [{'id': '7a2cac61-ff30-44c2-a3e9-8bdeefb1a519', 'results': [ScoringResult(success=False, scorers_data=[{'name': 'Faithfulness', 'error': None, 'score': 1.0, 'reason': 'The score is 1.00 because there are no contradictions, indicating that the actual output is completely faithful to the retrieval context.', 'success': True, 'threshold': 0.5, 'strict_mode': False, 'verbose_logs': 'Claims:\n[{\'claim\': \'GreenEnergy Solutions team received a 2023 sustainability award.\', \'quote\': \'Dear GreenEnergy Solutions team,\\\\n\\\\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\'}, {\'claim\': \'GreenEnergy Solutions has developed solar panel technology with 30% higher efficiency.\', \'quote\': \'Dear GreenEnergy Solutions team,\\\\n\\\\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\'}, {\'claim\': "Alex is interested in discussing support for GreenEnergy Solutions\' European expansion plans.", \'quote\': "I\'d love to discuss how we could support your European expansion plans.\\\\n\\\\nBest regards,\\\\nAlex"}] \n \nVerdicts:\n[FaithfulnessVerdict(verdict=\'yes\', reason="The claim that GreenEnergy Solutions team received a 2023 sustainability award is supported by the retrieval context. Quote: \'GreenEnergy Solutions won 2023 sustainability award.\'"), FaithfulnessVerdict(verdict=\'yes\', reason="The claim that GreenEnergy Solutions has developed solar panel technology with 30% higher efficiency is supported by the retrieval context. 
Quote: \'New solar technology 30% more efficient.\'"), FaithfulnessVerdict(verdict=\'idk\', reason="The retrieval context does not mention anything about Alex or his interest in discussing support for GreenEnergy Solutions\' European expansion plans.")]', 'evaluation_cost': None, 'evaluation_model': 'QWEN', 'additional_metadata': {'claims': [{'claim': 'GreenEnergy Solutions team received a 2023 sustainability award.', 'quote': 'Dear GreenEnergy Solutions team,\\n\\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.'}, {'claim': 'GreenEnergy Solutions has developed solar panel technology with 30% higher efficiency.', 'quote': 'Dear GreenEnergy Solutions team,\\n\\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.'}, {'claim': "Alex is interested in discussing support for GreenEnergy Solutions' European expansion plans.", 'quote': "I'd love to discuss how we could support your European expansion plans.\\n\\nBest regards,\\nAlex"}], 'verdicts': [{'reason': "The claim that GreenEnergy Solutions team received a 2023 sustainability award is supported by the retrieval context. Quote: 'GreenEnergy Solutions won 2023 sustainability award.'", 'verdict': 'yes'}, {'reason': "The claim that GreenEnergy Solutions has developed solar panel technology with 30% higher efficiency is supported by the retrieval context. 
Quote: 'New solar technology 30% more efficient.'", 'verdict': 'yes'}, {'reason': "The retrieval context does not mention anything about Alex or his interest in discussing support for GreenEnergy Solutions' European expansion plans.", 'verdict': 'idk'}]}}, {'name': 'Hallucination', 'error': None, 'score': 1.0, 'reason': 'The score is 1.00 because the actual output diverges entirely from the context, failing to address any aspect of business development activities, strategies, or initiatives as required.', 'success': False, 'threshold': 0.5, 'strict_mode': False, 'verbose_logs': 'Verdicts:\n[{\'verdict\': \'no\', \'reason\': "The actual output does not agree with the context as it is not related to \'Business Development\'. The context requires information about business development activities, strategies, or initiatives, which are not present in the actual output."}]', 'evaluation_cost': None, 'evaluation_model': 'QWEN', 'additional_metadata': None}], input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.", actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex", expected_output='A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans', context=['Business Development'], retrieval_context=['GreenEnergy Solutions won 2023 sustainability award', 'New solar technology 30% more efficient', 'Planning European market expansion'], trace_id=None, example_id=None, eval_run_name=None)]}]
|
16
|
-
Evaluation run successful
|
17
|
-
****************************************
|
18
|
-
Testing assert test
|
19
|
-
Assert test successful
|
20
|
-
****************************************
|
21
|
-
Testing JSON scorer
|
22
|
-
|
23
|
-
🔍 You can view your evaluation results here: https://app.judgmentlabs.ai/app/evalrun?project_name=test_project&eval_run_name=test_json_scorer
|
24
|
-
|
25
|
-
[ScoringResult(success=True, scorers_data=[ScorerData(name='JSON Correctness', threshold=0.5, success=True, score=1.0, reason=None, strict_mode=False, evaluation_model='QWEN', error=None, evaluation_cost=None, verbose_logs='LLM outputed Json:\n{"tool": "authentication"}', additional_metadata=None)], input="What if these shoes don't fit?", actual_output='{"tool": "authentication"}', expected_output=None, context=None, retrieval_context=['All customers are eligible for a 30 day full refund at no extra cost.'], trace_id='2231abe3-e7e0-4909-8ab7-b4ab60b645c6', example_id=None, eval_run_name=None), ScoringResult(success=False, scorers_data=[ScorerData(name='JSON Correctness', threshold=0.5, success=False, score=0.0, reason=None, strict_mode=False, evaluation_model='QWEN', error=None, evaluation_cost=None, verbose_logs="LLM outputed Json:\nYou can reset your password by clicking on 'Forgot Password' at the login screen.", additional_metadata=None)], input='How do I reset my password?', actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", context=['User Account'], retrieval_context=['Password reset instructions'], trace_id=None, example_id=None, eval_run_name=None)]
|
26
|
-
JSON scorer test successful
|
27
|
-
****************************************
|
28
|
-
Testing evaluation run override
|
29
|
-
|
30
|
-
🔍 You can view your evaluation results here:
|
31
|
-
https://app.judgmentlabs.ai/app/evalrun?project_name=test_eval_run_naming_collisions&eval_run_name=Cs06MuXToDeR
|
32
|
-
|
33
|
-
|
34
|
-
🔍 You can view your evaluation results here:
|
35
|
-
https://app.judgmentlabs.ai/app/evalrun?project_name=test_eval_run_naming_collisions&eval_run_name=Cs06MuXToDeR
|
36
|
-
|
37
|
-
Successfully caught expected error: Please check your EvaluationRun object, one or more fields are invalid:
|
38
|
-
Evaluation run name 'Cs06MuXToDeR' already exists for this project
|
39
|
-
Evaluation run override successful
|
40
|
-
****************************************
|
41
|
-
Testing dataset evaluation
|
42
|
-
[ScoringResult(success=True, scorers_data=[ScorerData(name='Faithfulness', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because there are no contradictions, indicating that the actual output is completely faithful to the retrieval context.', strict_mode=False, evaluation_model='QWEN', error=None, evaluation_cost=None, verbose_logs='Claims:\n[{\'claim\': \'A 30-day full refund is offered.\', \'quote\': \'We offer a 30-day full refund at no extra cost.\'}, {\'claim\': \'The 30-day full refund comes at no extra cost.\', \'quote\': \'We offer a 30-day full refund at no extra cost.\'}] \n \nVerdicts:\n[FaithfulnessVerdict(verdict=\'yes\', reason="The claim that a 30-day full refund is offered is supported by the retrieval context. Quote: \'All customers are eligible for a 30 day full refund at no extra cost.\'"), FaithfulnessVerdict(verdict=\'yes\', reason="The claim that the 30-day full refund comes at no extra cost is supported by the retrieval context. Quote: \'All customers are eligible for a 30 day full refund at no extra cost.\'")]', additional_metadata={'claims': [{'claim': 'A 30-day full refund is offered.', 'quote': 'We offer a 30-day full refund at no extra cost.'}, {'claim': 'The 30-day full refund comes at no extra cost.', 'quote': 'We offer a 30-day full refund at no extra cost.'}], 'verdicts': [{'verdict': 'yes', 'reason': "The claim that a 30-day full refund is offered is supported by the retrieval context. Quote: 'All customers are eligible for a 30 day full refund at no extra cost.'"}, {'verdict': 'yes', 'reason': "The claim that the 30-day full refund comes at no extra cost is supported by the retrieval context. 
Quote: 'All customers are eligible for a 30 day full refund at no extra cost.'"}]})], input="What if these shoes don't fit?", actual_output='We offer a 30-day full refund at no extra cost.', expected_output=None, context=None, retrieval_context=['All customers are eligible for a 30 day full refund at no extra cost.'], trace_id='2231abe3-e7e0-4909-8ab7-b4ab60b645c6', example_id=None, eval_run_name=None), ScoringResult(success=True, scorers_data=[ScorerData(name='Faithfulness', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because there are no contradictions, indicating that the actual output is completely faithful to the retrieval context.', strict_mode=False, evaluation_model='QWEN', error=None, evaluation_cost=None, verbose_logs='Claims:\n[{\'claim\': \'You can reset your password.\', \'quote\': "You can reset your password by clicking on \'Forgot Password\' at the login screen."}, {\'claim\': "The \'Forgot Password\' option is available at the login screen.", \'quote\': "You can reset your password by clicking on \'Forgot Password\' at the login screen."}, {\'claim\': "Clicking on \'Forgot Password\' allows you to reset your password.", \'quote\': "You can reset your password by clicking on \'Forgot Password\' at the login screen."}] \n \nVerdicts:\n[FaithfulnessVerdict(verdict=\'idk\', reason=\'The retrieval context does not provide any information about the ability to reset a password.\'), FaithfulnessVerdict(verdict=\'idk\', reason="The retrieval context does not mention whether the \'Forgot Password\' option is available at the login screen."), FaithfulnessVerdict(verdict=\'idk\', reason="The retrieval context does not specify if clicking on \'Forgot Password\' allows you to reset your password.")]', additional_metadata={'claims': [{'claim': 'You can reset your password.', 'quote': "You can reset your password by clicking on 'Forgot Password' at the login screen."}, {'claim': "The 'Forgot Password' option is available at the login 
screen.", 'quote': "You can reset your password by clicking on 'Forgot Password' at the login screen."}, {'claim': "Clicking on 'Forgot Password' allows you to reset your password.", 'quote': "You can reset your password by clicking on 'Forgot Password' at the login screen."}], 'verdicts': [{'verdict': 'idk', 'reason': 'The retrieval context does not provide any information about the ability to reset a password.'}, {'verdict': 'idk', 'reason': "The retrieval context does not mention whether the 'Forgot Password' option is available at the login screen."}, {'verdict': 'idk', 'reason': "The retrieval context does not specify if clicking on 'Forgot Password' allows you to reset your password."}]})], input='How do I reset my password?', actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", context=['User Account'], retrieval_context=['Password reset instructions'], trace_id=None, example_id=None, eval_run_name=None)]
|
43
|
-
Dataset evaluation successful
|
44
|
-
****************************************
|
45
|
-
Testing classifier scorer
|
46
|
-
|
47
|
-
🔍 You can view your evaluation results here: https://app.judgmentlabs.ai/app/evalrun?project_name=ToneScorerTest&eval_run_name=ToneScorerTest
|
48
|
-
|
49
|
-
Classifier scorer test successful
|
50
|
-
****************************************
|
51
|
-
Testing custom judge
|
judgeval-0.0.6/test.txt
DELETED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb
RENAMED
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb
RENAMED
File without changes
|
{judgeval-0.0.6 → judgeval-0.0.7}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py
RENAMED
File without changes
|
{judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py
RENAMED
File without changes
|
File without changes
|
{judgeval-0.0.6 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|