judgeval 0.0.5__tar.gz → 0.0.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141)
  1. {judgeval-0.0.5 → judgeval-0.0.7}/PKG-INFO +3 -1
  2. {judgeval-0.0.5 → judgeval-0.0.7}/pyproject.toml +3 -1
  3. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/clients.py +10 -1
  4. judgeval-0.0.5/src/test.txt +0 -51
  5. judgeval-0.0.5/test.txt +0 -0
  6. {judgeval-0.0.5 → judgeval-0.0.7}/.github/workflows/ci.yaml +0 -0
  7. {judgeval-0.0.5 → judgeval-0.0.7}/.gitignore +0 -0
  8. {judgeval-0.0.5 → judgeval-0.0.7}/LICENSE.md +0 -0
  9. {judgeval-0.0.5 → judgeval-0.0.7}/Pipfile +0 -0
  10. {judgeval-0.0.5 → judgeval-0.0.7}/Pipfile.lock +0 -0
  11. {judgeval-0.0.5 → judgeval-0.0.7}/README.md +0 -0
  12. {judgeval-0.0.5 → judgeval-0.0.7}/docs/README.md +0 -0
  13. {judgeval-0.0.5 → judgeval-0.0.7}/docs/development.mdx +0 -0
  14. {judgeval-0.0.5 → judgeval-0.0.7}/docs/essentials/code.mdx +0 -0
  15. {judgeval-0.0.5 → judgeval-0.0.7}/docs/essentials/images.mdx +0 -0
  16. {judgeval-0.0.5 → judgeval-0.0.7}/docs/essentials/markdown.mdx +0 -0
  17. {judgeval-0.0.5 → judgeval-0.0.7}/docs/essentials/navigation.mdx +0 -0
  18. {judgeval-0.0.5 → judgeval-0.0.7}/docs/essentials/reusable-snippets.mdx +0 -0
  19. {judgeval-0.0.5 → judgeval-0.0.7}/docs/essentials/settings.mdx +0 -0
  20. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/data_datasets.mdx +0 -0
  21. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/data_examples.mdx +0 -0
  22. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/introduction.mdx +0 -0
  23. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/judges.mdx +0 -0
  24. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
  25. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
  26. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  27. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
  28. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
  29. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
  30. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  31. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/faithfulness.mdx +0 -0
  32. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/hallucination.mdx +0 -0
  33. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/introduction.mdx +0 -0
  34. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/json_correctness.mdx +0 -0
  35. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/summarization.mdx +0 -0
  36. {judgeval-0.0.5 → judgeval-0.0.7}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
  37. {judgeval-0.0.5 → judgeval-0.0.7}/docs/favicon.svg +0 -0
  38. {judgeval-0.0.5 → judgeval-0.0.7}/docs/getting_started.mdx +0 -0
  39. {judgeval-0.0.5 → judgeval-0.0.7}/docs/images/checks-passed.png +0 -0
  40. {judgeval-0.0.5 → judgeval-0.0.7}/docs/images/create_aggressive_scorer.png +0 -0
  41. {judgeval-0.0.5 → judgeval-0.0.7}/docs/images/create_scorer.png +0 -0
  42. {judgeval-0.0.5 → judgeval-0.0.7}/docs/images/evaluation_diagram.png +0 -0
  43. {judgeval-0.0.5 → judgeval-0.0.7}/docs/images/hero-dark.svg +0 -0
  44. {judgeval-0.0.5 → judgeval-0.0.7}/docs/images/hero-light.svg +0 -0
  45. {judgeval-0.0.5 → judgeval-0.0.7}/docs/images/trace_screenshot.png +0 -0
  46. {judgeval-0.0.5 → judgeval-0.0.7}/docs/introduction.mdx +0 -0
  47. {judgeval-0.0.5 → judgeval-0.0.7}/docs/judgment/introduction.mdx +0 -0
  48. {judgeval-0.0.5 → judgeval-0.0.7}/docs/logo/dark.svg +0 -0
  49. {judgeval-0.0.5 → judgeval-0.0.7}/docs/logo/light.svg +0 -0
  50. {judgeval-0.0.5 → judgeval-0.0.7}/docs/mint.json +0 -0
  51. {judgeval-0.0.5 → judgeval-0.0.7}/docs/monitoring/introduction.mdx +0 -0
  52. {judgeval-0.0.5 → judgeval-0.0.7}/docs/monitoring/production_insights.mdx +0 -0
  53. {judgeval-0.0.5 → judgeval-0.0.7}/docs/monitoring/tracing.mdx +0 -0
  54. {judgeval-0.0.5 → judgeval-0.0.7}/docs/notebooks/create_dataset.ipynb +0 -0
  55. {judgeval-0.0.5 → judgeval-0.0.7}/docs/notebooks/create_scorer.ipynb +0 -0
  56. {judgeval-0.0.5 → judgeval-0.0.7}/docs/notebooks/demo.ipynb +0 -0
  57. {judgeval-0.0.5 → judgeval-0.0.7}/docs/notebooks/prompt_scorer.ipynb +0 -0
  58. {judgeval-0.0.5 → judgeval-0.0.7}/docs/notebooks/quickstart.ipynb +0 -0
  59. {judgeval-0.0.5 → judgeval-0.0.7}/docs/quickstart.mdx +0 -0
  60. {judgeval-0.0.5 → judgeval-0.0.7}/docs/snippets/snippet-intro.mdx +0 -0
  61. {judgeval-0.0.5 → judgeval-0.0.7}/pytest.ini +0 -0
  62. {judgeval-0.0.5 → judgeval-0.0.7}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -0
  63. {judgeval-0.0.5 → judgeval-0.0.7}/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
  64. {judgeval-0.0.5 → judgeval-0.0.7}/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -0
  65. {judgeval-0.0.5 → judgeval-0.0.7}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -0
  66. {judgeval-0.0.5 → judgeval-0.0.7}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -0
  67. {judgeval-0.0.5 → judgeval-0.0.7}/src/demo/cookbooks/openai_travel_agent/agent.py +0 -0
  68. {judgeval-0.0.5 → judgeval-0.0.7}/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -0
  69. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/__init__.py +0 -0
  70. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/common/__init__.py +0 -0
  71. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/common/exceptions.py +0 -0
  72. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/common/logger.py +0 -0
  73. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/common/tracer.py +0 -0
  74. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/common/utils.py +0 -0
  75. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/constants.py +0 -0
  76. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/data/__init__.py +0 -0
  77. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/data/api_example.py +0 -0
  78. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/data/datasets/__init__.py +0 -0
  79. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/data/datasets/dataset.py +0 -0
  80. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/data/datasets/ground_truth.py +0 -0
  81. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/data/datasets/utils.py +0 -0
  82. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/data/example.py +0 -0
  83. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/data/result.py +0 -0
  84. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/data/scorer_data.py +0 -0
  85. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/evaluation_run.py +0 -0
  86. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/judges/__init__.py +0 -0
  87. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/judges/base_judge.py +0 -0
  88. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/judges/litellm_judge.py +0 -0
  89. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/judges/mixture_of_judges.py +0 -0
  90. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/judges/together_judge.py +0 -0
  91. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/judges/utils.py +0 -0
  92. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/judgment_client.py +0 -0
  93. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/run_evaluation.py +0 -0
  94. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/__init__.py +0 -0
  95. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/api_scorer.py +0 -0
  96. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/base_scorer.py +0 -0
  97. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/exceptions.py +0 -0
  98. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorer.py +0 -0
  99. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  100. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  101. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  102. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  103. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  104. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  105. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  106. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  107. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  108. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  109. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  110. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -0
  111. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
  112. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
  113. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
  114. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
  115. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
  116. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
  117. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
  118. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
  119. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
  120. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
  121. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
  122. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
  123. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
  124. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
  125. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
  126. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
  127. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
  128. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
  129. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
  130. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
  131. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
  132. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
  133. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
  134. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
  135. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
  136. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
  137. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -0
  138. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -0
  139. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/prompt_scorer.py +0 -0
  140. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/score.py +0 -0
  141. {judgeval-0.0.5 → judgeval-0.0.7}/src/judgeval/scorers/utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.5
3
+ Version: 0.0.7
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -14,9 +14,11 @@ Requires-Dist: anthropic
14
14
  Requires-Dist: fastapi
15
15
  Requires-Dist: langfuse==2.50.3
16
16
  Requires-Dist: litellm
17
+ Requires-Dist: nest-asyncio
17
18
  Requires-Dist: openai
18
19
  Requires-Dist: pandas
19
20
  Requires-Dist: patronus
21
+ Requires-Dist: pika
20
22
  Requires-Dist: python-dotenv==1.0.1
21
23
  Requires-Dist: requests
22
24
  Requires-Dist: supabase
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "judgeval"
3
- version = "0.0.5"
3
+ version = "0.0.7"
4
4
  authors = [
5
5
  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
6
6
  { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -28,6 +28,8 @@ dependencies = [
28
28
  "together",
29
29
  "anthropic",
30
30
  "patronus",
31
+ "nest-asyncio",
32
+ "pika",
31
33
  ]
32
34
 
33
35
  [project.optional-dependencies]
@@ -9,13 +9,22 @@ PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
9
9
  load_dotenv(dotenv_path=PATH_TO_DOTENV)
10
10
 
11
11
  # Initialize required clients
12
- client = OpenAI()
13
12
  langfuse = Langfuse(
14
13
  secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
15
14
  public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
16
15
  host=os.getenv("LANGFUSE_HOST"),
17
16
  )
18
17
 
18
+ # Initialize optional OpenAI client
19
+ client: Optional['OpenAI'] = None
20
+ if os.getenv("OPENAI_API_KEY"):
21
+ try:
22
+ from openai import OpenAI
23
+ client = OpenAI()
24
+ except ImportError:
25
+ # openai package not installed
26
+ pass
27
+
19
28
  # Initialize optional Together clients
20
29
  together_client: Optional['Together'] = None
21
30
  async_together_client: Optional['AsyncTogether'] = None
@@ -1,51 +0,0 @@
1
- Successfully initialized JudgmentClient, welcome back user!
2
- Client initialized successfully
3
- ****************************************
4
- Testing dataset creation, pushing, and pulling
5
- ⠸ Pushing 'test_dataset_5' to Judgment... Done!)
6
- ⠸ Pulling 'test_dataset_5' from Judgment... Done!)
7
- EvalDataset(ground_truths=[], examples=[Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='c4df51db-72d4-461b-ba86-655f15148b69', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='6d01d187-1f53-4e98-bf61-22f9af1c6adc', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='dc422251-de24-43ed-b41e-351481c3e25a', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='e34d09a8-667b-4bdd-a497-8070569e2294', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='0ecafcd0-6677-4c83-a980-ac2315eaedd6', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='1b3a7df4-8e39-48d2-afa4-153d3f03a864', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='c3864e19-cb3b-4d45-9878-9c5a1b1657e2', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='89ea1759-2ba7-4302-9229-c84724df8413', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='9eaf57d8-1acf-4f42-b4ee-7ecd765b8004', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f213abe2-3e77-4c28-8b7d-d86e17f6bd75', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='ce125a22-52cc-4223-a491-4d8f242a8200', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='a518c2f0-6c09-4cfd-a080-f96f5e73514f', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='2808e88c-a0f1-4fc0-aad3-1412c4c016f9', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='385580d9-dab2-4ced-9731-de52582f1de1', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f3b16b04-6ef5-4a98-865a-a7c3622eed2c', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='280cf554-ef8c-451d-84c0-84fd8279dd9b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='a03f4af9-6089-4f47-8cf9-0e15b2d10612', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='e634bd99-003c-4f66-acf7-2d9c3f762559', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='9beb19cc-fc79-4033-b676-73049cdee9bd', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='197d375b-3ea3-46d2-b1b9-04d818502a40', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='53e87725-46c9-4de1-b7dd-e9ceed371aec', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='5a590228-aa3e-4b9a-a9ba-eb1e7bc8bc63', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='cacd7863-ed15-4e41-9ef9-43e6662f4008', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='64ffdd28-91cb-40c4-ad7c-7fb54dd2ad78', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='4bde5f0b-ee70-4206-b4e0-7269cb82056d', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='c3b6ab4f-bca3-4381-9f48-0283aaba4949', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='a9b205c7-5530-45b4-85aa-4b06bd4f80c4', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b0fd70f2-1dc0-4b21-9d21-c92b9bd0c0af', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d06b1555-428f-4f19-8db1-09d76600add7', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='3ad40653-4dba-4e89-969e-c9cebdfec0b9', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f9fd7ac8-1fcd-4f7b-802f-1b5c839ef066', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='137c9d30-66be-4447-ba26-5b5b8950d0c0', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='3e4f550b-b4ae-43ca-a23b-755fd7f908de', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='fd6fb983-fd38-42ae-a256-6d4400fce0f0', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='0c41364d-2741-4e43-889f-df8b1c0e7ea5', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='73d36dd2-1bcc-4c25-9be1-16fbd6e1ecb4', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='95fcac29-5c78-420a-bc7c-43465e3a0bb5', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='1c0e7058-1291-4074-b53c-c6fb621e8c20', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='6e7fff45-1c2d-474b-8dd3-5084e134d81b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d49628aa-1447-4507-8258-26f28c39b731', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='741c7a45-e17a-4483-bd96-10550ee4258a', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f804341e-8f46-49a9-8fcb-293540a76d1d', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b60185d9-d6b9-4bdc-84bc-e0c6ced6ede9', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d95bd53f-856e-4150-8626-700a3e54d123', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d03661f3-86ae-456f-b072-105ee8fa7a83', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='944a4fbe-4c28-4951-bad2-4ad39fc9d484', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='418d4793-9070-45fe-bf39-fc5d5aa83a4b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='073b935d-f7a4-4719-8f04-f0ee599ef8c0', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='ed299a25-ae42-4fa6-8692-c89f29453fa9', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='5309635c-70a1-43b6-beb5-8b7f0c04ce4b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='50fbbf0a-164e-4bda-ada8-e777ccc14c6a', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='83f8e19a-d8db-4d54-9d44-07472149cd30', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='617fcf2b-8fa7-48a5-80d6-fb2af444fd18', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='48feda48-ada8-41b0-ba72-b9c9287be7c3', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='3f1618f2-f10f-4d19-b3a4-ff03a6553bf7', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='69274744-f2ec-41f2-acf7-6cf689d7e50b', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='9317bf9e-0573-4506-ac10-25056c22d17a', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8ad15f4e-2bdc-4caf-9c96-520ec3a12e8b', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='9c7386ac-ffef-4455-8892-530d57e75afd', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='49c7bd9f-38b1-4818-b06f-b9cb2b7685ed', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='20f0e5ef-056e-44a2-b479-4445e72fa68c', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='faa08bc1-522c-4dd9-a515-0e7b75869db6', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8871e84f-4f6e-4716-ae42-c267feb6e4dd', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8ba0a8ba-deab-4439-af32-aa99f4152a87', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='487437d4-2486-4497-9382-8d785111ef19', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b6a43089-8818-4184-8743-b1ea49b52495', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='90caa6f9-700a-4fd8-a36a-83815fb877a6', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='32e54a20-e664-4ebc-b11a-ccc1480c3c31', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='ff439af1-4467-43d7-8b91-a05905b6653f', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='1f3e6f13-b94f-4718-ba1e-5832b39dd55b', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='c747542d-5cd4-447c-b046-4c5ec032d801', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='d8f0f44e-464d-4a5a-89f6-24a89cb1d5a1', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8f825ef0-2407-47fb-a556-31ee40cb728c', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b6aa1919-da2f-463b-aa69-d5f09cc8cb84', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='18e847c9-af9e-486b-be8a-457d56464d64', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b802b615-1ca6-4669-b2e3-e5957f37a8c2', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='29042ec2-13a3-4444-9dcb-74b9c300b20d', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='47ff1050-d317-43b0-a61b-6e7ac235ff37', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='34d509e2-485a-4417-a2e6-9e82e68df01f', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='f95a78a0-1f7b-41ab-bd56-204b73327f1d', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='53526611-cb49-427f-9682-a6e9b1c94258', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='e1f88792-5f4e-4f9a-8a26-214168d9ad19', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='42e1663d-5a26-41f4-b7e8-4b4556dc32f4', timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='8bcb2d90-3e1f-4934-b763-e62e5c08e617', 
timestamp='20250205_100333', trace_id=None), Example(input='input 1', actual_output='output 1', expected_output=None, context=None, retrieval_context=None, additional_metadata=None, tools_called=None, expected_tools=None, name=None, example_id='b3d4f5db-943c-4a40-a129-228c0bd0be98', timestamp='20250205_100333', trace_id=None)], _alias=None, _id=None)
8
- Dataset creation, pushing, and pulling successful
9
- ****************************************
10
- Testing evaluation run
11
-
12
- 🔍 You can view your evaluation results here:
13
- https://app.judgmentlabs.ai/app/evalrun?project_name=OutreachWorkflow&eval_run_name=ColdEmailGenerator-Improve-BasePrompt
14
-
15
- Evaluation results for ColdEmailGenerator-Improve-BasePrompt from database: [{'id': '7a2cac61-ff30-44c2-a3e9-8bdeefb1a519', 'results': [ScoringResult(success=False, scorers_data=[{'name': 'Faithfulness', 'error': None, 'score': 1.0, 'reason': 'The score is 1.00 because there are no contradictions, indicating that the actual output is completely faithful to the retrieval context.', 'success': True, 'threshold': 0.5, 'strict_mode': False, 'verbose_logs': 'Claims:\n[{\'claim\': \'GreenEnergy Solutions team received a 2023 sustainability award.\', \'quote\': \'Dear GreenEnergy Solutions team,\\\\n\\\\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\'}, {\'claim\': \'GreenEnergy Solutions has developed solar panel technology with 30% higher efficiency.\', \'quote\': \'Dear GreenEnergy Solutions team,\\\\n\\\\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\'}, {\'claim\': "Alex is interested in discussing support for GreenEnergy Solutions\' European expansion plans.", \'quote\': "I\'d love to discuss how we could support your European expansion plans.\\\\n\\\\nBest regards,\\\\nAlex"}] \n \nVerdicts:\n[FaithfulnessVerdict(verdict=\'yes\', reason="The claim that GreenEnergy Solutions team received a 2023 sustainability award is supported by the retrieval context. Quote: \'GreenEnergy Solutions won 2023 sustainability award.\'"), FaithfulnessVerdict(verdict=\'yes\', reason="The claim that GreenEnergy Solutions has developed solar panel technology with 30% higher efficiency is supported by the retrieval context. 
Quote: \'New solar technology 30% more efficient.\'"), FaithfulnessVerdict(verdict=\'idk\', reason="The retrieval context does not mention anything about Alex or his interest in discussing support for GreenEnergy Solutions\' European expansion plans.")]', 'evaluation_cost': None, 'evaluation_model': 'QWEN', 'additional_metadata': {'claims': [{'claim': 'GreenEnergy Solutions team received a 2023 sustainability award.', 'quote': 'Dear GreenEnergy Solutions team,\\n\\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.'}, {'claim': 'GreenEnergy Solutions has developed solar panel technology with 30% higher efficiency.', 'quote': 'Dear GreenEnergy Solutions team,\\n\\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.'}, {'claim': "Alex is interested in discussing support for GreenEnergy Solutions' European expansion plans.", 'quote': "I'd love to discuss how we could support your European expansion plans.\\n\\nBest regards,\\nAlex"}], 'verdicts': [{'reason': "The claim that GreenEnergy Solutions team received a 2023 sustainability award is supported by the retrieval context. Quote: 'GreenEnergy Solutions won 2023 sustainability award.'", 'verdict': 'yes'}, {'reason': "The claim that GreenEnergy Solutions has developed solar panel technology with 30% higher efficiency is supported by the retrieval context. 
Quote: 'New solar technology 30% more efficient.'", 'verdict': 'yes'}, {'reason': "The retrieval context does not mention anything about Alex or his interest in discussing support for GreenEnergy Solutions' European expansion plans.", 'verdict': 'idk'}]}}, {'name': 'Hallucination', 'error': None, 'score': 1.0, 'reason': 'The score is 1.00 because the actual output diverges entirely from the context, failing to address any aspect of business development activities, strategies, or initiatives as required.', 'success': False, 'threshold': 0.5, 'strict_mode': False, 'verbose_logs': 'Verdicts:\n[{\'verdict\': \'no\', \'reason\': "The actual output does not agree with the context as it is not related to \'Business Development\'. The context requires information about business development activities, strategies, or initiatives, which are not present in the actual output."}]', 'evaluation_cost': None, 'evaluation_model': 'QWEN', 'additional_metadata': None}], input="Generate a cold outreach email for GreenEnergy Solutions. Facts: They're developing solar panel technology that's 30% more efficient. They're looking to expand into the European market. They won a sustainability award in 2023.", actual_output="Dear GreenEnergy Solutions team,\n\nCongratulations on your 2023 sustainability award! Your innovative solar panel technology with 30% higher efficiency is exactly what the European market needs right now.\n\nI'd love to discuss how we could support your European expansion plans.\n\nBest regards,\nAlex", expected_output='A professional cold email mentioning the sustainability award, solar technology innovation, and European expansion plans', context=['Business Development'], retrieval_context=['GreenEnergy Solutions won 2023 sustainability award', 'New solar technology 30% more efficient', 'Planning European market expansion'], trace_id=None, example_id=None, eval_run_name=None)]}]
16
- Evaluation run successful
17
- ****************************************
18
- Testing assert test
19
- Assert test successful
20
- ****************************************
21
- Testing JSON scorer
22
-
23
- 🔍 You can view your evaluation results here: https://app.judgmentlabs.ai/app/evalrun?project_name=test_project&eval_run_name=test_json_scorer
24
-
25
- [ScoringResult(success=True, scorers_data=[ScorerData(name='JSON Correctness', threshold=0.5, success=True, score=1.0, reason=None, strict_mode=False, evaluation_model='QWEN', error=None, evaluation_cost=None, verbose_logs='LLM outputed Json:\n{"tool": "authentication"}', additional_metadata=None)], input="What if these shoes don't fit?", actual_output='{"tool": "authentication"}', expected_output=None, context=None, retrieval_context=['All customers are eligible for a 30 day full refund at no extra cost.'], trace_id='2231abe3-e7e0-4909-8ab7-b4ab60b645c6', example_id=None, eval_run_name=None), ScoringResult(success=False, scorers_data=[ScorerData(name='JSON Correctness', threshold=0.5, success=False, score=0.0, reason=None, strict_mode=False, evaluation_model='QWEN', error=None, evaluation_cost=None, verbose_logs="LLM outputed Json:\nYou can reset your password by clicking on 'Forgot Password' at the login screen.", additional_metadata=None)], input='How do I reset my password?', actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", context=['User Account'], retrieval_context=['Password reset instructions'], trace_id=None, example_id=None, eval_run_name=None)]
26
- JSON scorer test successful
27
- ****************************************
28
- Testing evaluation run override
29
-
30
- 🔍 You can view your evaluation results here:
31
- https://app.judgmentlabs.ai/app/evalrun?project_name=test_eval_run_naming_collisions&eval_run_name=Cs06MuXToDeR
32
-
33
-
34
- 🔍 You can view your evaluation results here:
35
- https://app.judgmentlabs.ai/app/evalrun?project_name=test_eval_run_naming_collisions&eval_run_name=Cs06MuXToDeR
36
-
37
- Successfully caught expected error: Please check your EvaluationRun object, one or more fields are invalid:
38
- Evaluation run name 'Cs06MuXToDeR' already exists for this project
39
- Evaluation run override successful
40
- ****************************************
41
- Testing dataset evaluation
42
- [ScoringResult(success=True, scorers_data=[ScorerData(name='Faithfulness', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because there are no contradictions, indicating that the actual output is completely faithful to the retrieval context.', strict_mode=False, evaluation_model='QWEN', error=None, evaluation_cost=None, verbose_logs='Claims:\n[{\'claim\': \'A 30-day full refund is offered.\', \'quote\': \'We offer a 30-day full refund at no extra cost.\'}, {\'claim\': \'The 30-day full refund comes at no extra cost.\', \'quote\': \'We offer a 30-day full refund at no extra cost.\'}] \n \nVerdicts:\n[FaithfulnessVerdict(verdict=\'yes\', reason="The claim that a 30-day full refund is offered is supported by the retrieval context. Quote: \'All customers are eligible for a 30 day full refund at no extra cost.\'"), FaithfulnessVerdict(verdict=\'yes\', reason="The claim that the 30-day full refund comes at no extra cost is supported by the retrieval context. Quote: \'All customers are eligible for a 30 day full refund at no extra cost.\'")]', additional_metadata={'claims': [{'claim': 'A 30-day full refund is offered.', 'quote': 'We offer a 30-day full refund at no extra cost.'}, {'claim': 'The 30-day full refund comes at no extra cost.', 'quote': 'We offer a 30-day full refund at no extra cost.'}], 'verdicts': [{'verdict': 'yes', 'reason': "The claim that a 30-day full refund is offered is supported by the retrieval context. Quote: 'All customers are eligible for a 30 day full refund at no extra cost.'"}, {'verdict': 'yes', 'reason': "The claim that the 30-day full refund comes at no extra cost is supported by the retrieval context. 
Quote: 'All customers are eligible for a 30 day full refund at no extra cost.'"}]})], input="What if these shoes don't fit?", actual_output='We offer a 30-day full refund at no extra cost.', expected_output=None, context=None, retrieval_context=['All customers are eligible for a 30 day full refund at no extra cost.'], trace_id='2231abe3-e7e0-4909-8ab7-b4ab60b645c6', example_id=None, eval_run_name=None), ScoringResult(success=True, scorers_data=[ScorerData(name='Faithfulness', threshold=0.5, success=True, score=1.0, reason='The score is 1.00 because there are no contradictions, indicating that the actual output is completely faithful to the retrieval context.', strict_mode=False, evaluation_model='QWEN', error=None, evaluation_cost=None, verbose_logs='Claims:\n[{\'claim\': \'You can reset your password.\', \'quote\': "You can reset your password by clicking on \'Forgot Password\' at the login screen."}, {\'claim\': "The \'Forgot Password\' option is available at the login screen.", \'quote\': "You can reset your password by clicking on \'Forgot Password\' at the login screen."}, {\'claim\': "Clicking on \'Forgot Password\' allows you to reset your password.", \'quote\': "You can reset your password by clicking on \'Forgot Password\' at the login screen."}] \n \nVerdicts:\n[FaithfulnessVerdict(verdict=\'idk\', reason=\'The retrieval context does not provide any information about the ability to reset a password.\'), FaithfulnessVerdict(verdict=\'idk\', reason="The retrieval context does not mention whether the \'Forgot Password\' option is available at the login screen."), FaithfulnessVerdict(verdict=\'idk\', reason="The retrieval context does not specify if clicking on \'Forgot Password\' allows you to reset your password.")]', additional_metadata={'claims': [{'claim': 'You can reset your password.', 'quote': "You can reset your password by clicking on 'Forgot Password' at the login screen."}, {'claim': "The 'Forgot Password' option is available at the login 
screen.", 'quote': "You can reset your password by clicking on 'Forgot Password' at the login screen."}, {'claim': "Clicking on 'Forgot Password' allows you to reset your password.", 'quote': "You can reset your password by clicking on 'Forgot Password' at the login screen."}], 'verdicts': [{'verdict': 'idk', 'reason': 'The retrieval context does not provide any information about the ability to reset a password.'}, {'verdict': 'idk', 'reason': "The retrieval context does not mention whether the 'Forgot Password' option is available at the login screen."}, {'verdict': 'idk', 'reason': "The retrieval context does not specify if clicking on 'Forgot Password' allows you to reset your password."}]})], input='How do I reset my password?', actual_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", expected_output="You can reset your password by clicking on 'Forgot Password' at the login screen.", context=['User Account'], retrieval_context=['Password reset instructions'], trace_id=None, example_id=None, eval_run_name=None)]
43
- Dataset evaluation successful
44
- ****************************************
45
- Testing classifier scorer
46
-
47
- 🔍 You can view your evaluation results here: https://app.judgmentlabs.ai/app/evalrun?project_name=ToneScorerTest&eval_run_name=ToneScorerTest
48
-
49
- Classifier scorer test successful
50
- ****************************************
51
- Testing custom judge
judgeval-0.0.5/test.txt DELETED
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes