judgeval 0.0.11__tar.gz → 0.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. {judgeval-0.0.11 → judgeval-0.0.12}/PKG-INFO +1 -1
  2. {judgeval-0.0.11 → judgeval-0.0.12}/pyproject.toml +1 -1
  3. judgeval-0.0.12/src/demo/cookbooks/new_bot/basic_bot.py +106 -0
  4. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/openai_travel_agent/agent.py +27 -11
  5. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/openai_travel_agent/tools.py +1 -1
  6. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/customer_use/cstone/faithfulness_testing.py +4 -4
  7. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/common/tracer.py +62 -26
  8. {judgeval-0.0.11 → judgeval-0.0.12}/.github/workflows/ci.yaml +0 -0
  9. {judgeval-0.0.11 → judgeval-0.0.12}/.gitignore +0 -0
  10. {judgeval-0.0.11 → judgeval-0.0.12}/LICENSE.md +0 -0
  11. {judgeval-0.0.11 → judgeval-0.0.12}/Pipfile +0 -0
  12. {judgeval-0.0.11 → judgeval-0.0.12}/Pipfile.lock +0 -0
  13. {judgeval-0.0.11 → judgeval-0.0.12}/README.md +0 -0
  14. {judgeval-0.0.11 → judgeval-0.0.12}/docs/README.md +0 -0
  15. {judgeval-0.0.11 → judgeval-0.0.12}/docs/api_reference/judgment_client.mdx +0 -0
  16. {judgeval-0.0.11 → judgeval-0.0.12}/docs/api_reference/trace.mdx +0 -0
  17. {judgeval-0.0.11 → judgeval-0.0.12}/docs/development.mdx +0 -0
  18. {judgeval-0.0.11 → judgeval-0.0.12}/docs/essentials/code.mdx +0 -0
  19. {judgeval-0.0.11 → judgeval-0.0.12}/docs/essentials/images.mdx +0 -0
  20. {judgeval-0.0.11 → judgeval-0.0.12}/docs/essentials/markdown.mdx +0 -0
  21. {judgeval-0.0.11 → judgeval-0.0.12}/docs/essentials/navigation.mdx +0 -0
  22. {judgeval-0.0.11 → judgeval-0.0.12}/docs/essentials/reusable-snippets.mdx +0 -0
  23. {judgeval-0.0.11 → judgeval-0.0.12}/docs/essentials/settings.mdx +0 -0
  24. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/data_datasets.mdx +0 -0
  25. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/data_examples.mdx +0 -0
  26. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/introduction.mdx +0 -0
  27. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/judges.mdx +0 -0
  28. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/answer_correctness.mdx +0 -0
  29. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/answer_relevancy.mdx +0 -0
  30. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  31. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/contextual_precision.mdx +0 -0
  32. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/contextual_recall.mdx +0 -0
  33. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/contextual_relevancy.mdx +0 -0
  34. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  35. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/faithfulness.mdx +0 -0
  36. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/hallucination.mdx +0 -0
  37. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/introduction.mdx +0 -0
  38. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/json_correctness.mdx +0 -0
  39. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/summarization.mdx +0 -0
  40. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/scorers/tool_correctness.mdx +0 -0
  41. {judgeval-0.0.11 → judgeval-0.0.12}/docs/evaluation/unit_testing.mdx +0 -0
  42. {judgeval-0.0.11 → judgeval-0.0.12}/docs/favicon.svg +0 -0
  43. {judgeval-0.0.11 → judgeval-0.0.12}/docs/getting_started.mdx +0 -0
  44. {judgeval-0.0.11 → judgeval-0.0.12}/docs/images/basic_trace_example.png +0 -0
  45. {judgeval-0.0.11 → judgeval-0.0.12}/docs/images/checks-passed.png +0 -0
  46. {judgeval-0.0.11 → judgeval-0.0.12}/docs/images/create_aggressive_scorer.png +0 -0
  47. {judgeval-0.0.11 → judgeval-0.0.12}/docs/images/create_scorer.png +0 -0
  48. {judgeval-0.0.11 → judgeval-0.0.12}/docs/images/evaluation_diagram.png +0 -0
  49. {judgeval-0.0.11 → judgeval-0.0.12}/docs/images/hero-dark.svg +0 -0
  50. {judgeval-0.0.11 → judgeval-0.0.12}/docs/images/hero-light.svg +0 -0
  51. {judgeval-0.0.11 → judgeval-0.0.12}/docs/images/trace_screenshot.png +0 -0
  52. {judgeval-0.0.11 → judgeval-0.0.12}/docs/introduction.mdx +0 -0
  53. {judgeval-0.0.11 → judgeval-0.0.12}/docs/judgment/introduction.mdx +0 -0
  54. {judgeval-0.0.11 → judgeval-0.0.12}/docs/logo/dark.svg +0 -0
  55. {judgeval-0.0.11 → judgeval-0.0.12}/docs/logo/light.svg +0 -0
  56. {judgeval-0.0.11 → judgeval-0.0.12}/docs/mint.json +0 -0
  57. {judgeval-0.0.11 → judgeval-0.0.12}/docs/monitoring/introduction.mdx +0 -0
  58. {judgeval-0.0.11 → judgeval-0.0.12}/docs/monitoring/production_insights.mdx +0 -0
  59. {judgeval-0.0.11 → judgeval-0.0.12}/docs/monitoring/tracing.mdx +0 -0
  60. {judgeval-0.0.11 → judgeval-0.0.12}/docs/notebooks/create_dataset.ipynb +0 -0
  61. {judgeval-0.0.11 → judgeval-0.0.12}/docs/notebooks/create_scorer.ipynb +0 -0
  62. {judgeval-0.0.11 → judgeval-0.0.12}/docs/notebooks/demo.ipynb +0 -0
  63. {judgeval-0.0.11 → judgeval-0.0.12}/docs/notebooks/prompt_scorer.ipynb +0 -0
  64. {judgeval-0.0.11 → judgeval-0.0.12}/docs/notebooks/quickstart.ipynb +0 -0
  65. {judgeval-0.0.11 → judgeval-0.0.12}/docs/quickstart.mdx +0 -0
  66. {judgeval-0.0.11 → judgeval-0.0.12}/docs/snippets/snippet-intro.mdx +0 -0
  67. {judgeval-0.0.11 → judgeval-0.0.12}/pytest.ini +0 -0
  68. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/ci_testing/ci_testing.py +0 -0
  69. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/ci_testing/travel_response.txt +0 -0
  70. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/custom_scorers/competitor_mentions.py +0 -0
  71. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/custom_scorers/text2sql.py +0 -0
  72. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/langchain_basic_rag/basic_agentic_rag.ipynb +0 -0
  73. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/langchain_basic_rag/tesla_q3.pdf +0 -0
  74. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/langchain_sales/example_product_price_id_mapping.json +0 -0
  75. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/langchain_sales/sales_agent_with_context.ipynb +0 -0
  76. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/langchain_sales/sample_product_catalog.txt +0 -0
  77. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/cookbooks/openai_travel_agent/populate_db.py +0 -0
  78. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/customer_use/cstone/basic_test.py +0 -0
  79. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/customer_use/cstone/cstone_data.csv +0 -0
  80. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/customer_use/cstone/data.csv +0 -0
  81. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/customer_use/cstone/galen_data.csv +0 -0
  82. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/customer_use/cstone/playground.py +0 -0
  83. {judgeval-0.0.11 → judgeval-0.0.12}/src/demo/customer_use/cstone/results.csv +0 -0
  84. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/__init__.py +0 -0
  85. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/clients.py +0 -0
  86. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/common/__init__.py +0 -0
  87. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/common/exceptions.py +0 -0
  88. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/common/logger.py +0 -0
  89. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/common/utils.py +0 -0
  90. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/constants.py +0 -0
  91. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/__init__.py +0 -0
  92. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/api_example.py +0 -0
  93. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/datasets/__init__.py +0 -0
  94. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/datasets/dataset.py +0 -0
  95. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/datasets/eval_dataset_client.py +0 -0
  96. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/datasets/ground_truth.py +0 -0
  97. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/datasets/utils.py +0 -0
  98. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/example.py +0 -0
  99. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/result.py +0 -0
  100. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/data/scorer_data.py +0 -0
  101. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/evaluation_run.py +0 -0
  102. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/judges/__init__.py +0 -0
  103. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/judges/base_judge.py +0 -0
  104. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/judges/litellm_judge.py +0 -0
  105. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/judges/mixture_of_judges.py +0 -0
  106. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/judges/together_judge.py +0 -0
  107. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/judges/utils.py +0 -0
  108. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/judgment_client.py +0 -0
  109. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/run_evaluation.py +0 -0
  110. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/__init__.py +0 -0
  111. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/api_scorer.py +0 -0
  112. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/base_scorer.py +0 -0
  113. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/exceptions.py +0 -0
  114. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorer.py +0 -0
  115. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  116. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  117. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  118. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  119. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  120. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  121. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  122. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  123. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  124. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  125. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  126. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -0
  127. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  128. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  129. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  130. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -0
  131. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -0
  132. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -0
  133. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -0
  134. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -0
  135. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -0
  136. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -0
  137. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -0
  138. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -0
  139. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -0
  140. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -0
  141. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -0
  142. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -0
  143. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -0
  144. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -0
  145. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -0
  146. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -0
  147. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -0
  148. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -0
  149. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -0
  150. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -0
  151. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -0
  152. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -0
  153. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -0
  154. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -0
  155. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -0
  156. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -0
  157. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -0
  158. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -0
  159. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/prompt_scorer.py +0 -0
  160. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/score.py +0 -0
  161. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/scorers/utils.py +0 -0
  162. {judgeval-0.0.11 → judgeval-0.0.12}/src/judgeval/tracer/__init__.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.11
3
+ Version: 0.0.12
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "judgeval"
3
- version = "0.0.11"
3
+ version = "0.0.12"
4
4
  authors = [
5
5
  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
6
6
  { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -0,0 +1,106 @@
1
+ import os
2
+ import asyncio
3
+ from typing import Dict, List
4
+ from openai import OpenAI
5
+ from uuid import uuid4
6
+ from dotenv import load_dotenv
7
+
8
+ from judgeval.tracer import Tracer, wrap
9
+ from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer
10
+
11
+ # Initialize clients
12
+ load_dotenv()
13
+ judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="restaurant_bot")
14
+ client = wrap(OpenAI())
15
+
16
+ @judgment.observe(span_type="Research")
17
+ async def search_restaurants(cuisine: str, location: str = "nearby") -> List[Dict]:
18
+ """Search for restaurants matching the cuisine type."""
19
+ # Simulate API call to restaurant database
20
+ prompt = f"Find 3 popular {cuisine} restaurants {location}. Return ONLY a JSON array of objects with 'name', 'rating', and 'price_range' fields. No other text."
21
+
22
+ response = client.chat.completions.create(
23
+ model="gpt-4",
24
+ messages=[
25
+ {"role": "system", "content": """You are a restaurant search expert.
26
+ Return ONLY valid JSON arrays containing restaurant objects.
27
+ Example format: [{"name": "Restaurant Name", "rating": 4.5, "price_range": "$$"}]
28
+ Do not include any other text or explanations."""},
29
+ {"role": "user", "content": prompt}
30
+ ]
31
+ )
32
+
33
+ try:
34
+ import json
35
+ return json.loads(response.choices[0].message.content)
36
+ except json.JSONDecodeError as e:
37
+ print(f"Error parsing JSON response: {response.choices[0].message.content}")
38
+ return [{"name": "Error fetching restaurants", "rating": 0, "price_range": "N/A"}]
39
+
40
+ @judgment.observe(span_type="Research")
41
+ async def get_menu_highlights(restaurant_name: str) -> List[str]:
42
+ """Get popular menu items for a restaurant."""
43
+ prompt = f"What are 3 must-try dishes at {restaurant_name}?"
44
+
45
+ response = client.chat.completions.create(
46
+ model="gpt-4",
47
+ messages=[
48
+ {"role": "system", "content": "You are a food critic. List only the dish names."},
49
+ {"role": "user", "content": prompt}
50
+ ]
51
+ )
52
+
53
+ judgment.get_current_trace().async_evaluate(
54
+ scorers=[AnswerRelevancyScorer(threshold=0.5)],
55
+ input=prompt,
56
+ actual_output=response.choices[0].message.content,
57
+ model="gpt-4",
58
+ )
59
+
60
+ return response.choices[0].message.content.split("\n")
61
+
62
+ @judgment.observe(span_type="function")
63
+ async def generate_recommendation(cuisine: str, restaurants: List[Dict], menu_items: Dict[str, List[str]]) -> str:
64
+ """Generate a natural language recommendation."""
65
+ context = f"""
66
+ Cuisine: {cuisine}
67
+ Restaurants: {restaurants}
68
+ Popular Items: {menu_items}
69
+ """
70
+
71
+ response = client.chat.completions.create(
72
+ model="gpt-4",
73
+ messages=[
74
+ {"role": "system", "content": "You are a helpful food recommendation bot. Provide a natural recommendation based on the data."},
75
+ {"role": "user", "content": context}
76
+ ]
77
+ )
78
+ return response.choices[0].message.content
79
+
80
+ @judgment.observe(span_type="Research")
81
+ async def get_food_recommendations(cuisine: str) -> str:
82
+ """Main function to get restaurant recommendations."""
83
+ # Search for restaurants
84
+ restaurants = await search_restaurants(cuisine)
85
+
86
+ # Get menu highlights for each restaurant
87
+ menu_items = {}
88
+ for restaurant in restaurants:
89
+ menu_items[restaurant['name']] = await get_menu_highlights(restaurant['name'])
90
+
91
+ # Generate final recommendation
92
+ recommendation = await generate_recommendation(cuisine, restaurants, menu_items)
93
+ judgment.get_current_trace().async_evaluate(
94
+ scorers=[AnswerRelevancyScorer(threshold=0.5), FaithfulnessScorer(threshold=1.0)],
95
+ input=f"Create a recommendation for a restaurant and dishes based on the desired cuisine: {cuisine}",
96
+ actual_output=recommendation,
97
+ retrieval_context=[str(restaurants), str(menu_items)],
98
+ model="gpt-4",
99
+ )
100
+ return recommendation
101
+
102
+ if __name__ == "__main__":
103
+ cuisine = input("What kind of food would you like to eat? ")
104
+ recommendation = asyncio.run(get_food_recommendations(cuisine))
105
+ print("\nHere are my recommendations:\n")
106
+ print(recommendation)
@@ -10,9 +10,11 @@ from chromadb.utils import embedding_functions
10
10
  from judgeval.common.tracer import Tracer, wrap
11
11
  from demo.cookbooks.openai_travel_agent.populate_db import destinations_data
12
12
  from demo.cookbooks.openai_travel_agent.tools import search_tavily
13
+ from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer
14
+
13
15
 
14
16
  client = wrap(openai.Client(api_key=os.getenv("OPENAI_API_KEY")))
15
- judgment = Tracer()
17
+ judgment = Tracer(api_key=os.getenv("JUDGMENT_API_KEY"), project_name="travel_agent_demo")
16
18
 
17
19
  def populate_vector_db(collection, destinations_data):
18
20
  """
@@ -45,6 +47,12 @@ async def get_flights(destination):
45
47
  """Search for flights to the destination."""
46
48
  prompt = f"Flights to {destination} from major cities"
47
49
  flights_search = search_tavily(prompt)
50
+ judgment.get_current_trace().async_evaluate(
51
+ scorers=[AnswerRelevancyScorer(threshold=0.5)],
52
+ input=prompt,
53
+ actual_output=flights_search,
54
+ model="gpt-4",
55
+ )
48
56
  return flights_search
49
57
 
50
58
  @judgment.observe(span_type="tool")
@@ -52,6 +60,12 @@ async def get_weather(destination, start_date, end_date):
52
60
  """Search for weather information."""
53
61
  prompt = f"Weather forecast for {destination} from {start_date} to {end_date}"
54
62
  weather_search = search_tavily(prompt)
63
+ judgment.get_current_trace().async_evaluate(
64
+ scorers=[AnswerRelevancyScorer(threshold=0.5)],
65
+ input=prompt,
66
+ actual_output=weather_search,
67
+ model="gpt-4",
68
+ )
55
69
  return weather_search
56
70
 
57
71
  def initialize_vector_db():
@@ -125,21 +139,23 @@ async def create_travel_plan(destination, start_date, end_date, research_data):
125
139
  {"role": "user", "content": prompt}
126
140
  ]
127
141
  ).choices[0].message.content
142
+
143
+ judgment.get_current_trace().async_evaluate(
144
+ scorers=[FaithfulnessScorer(threshold=0.5)],
145
+ input=prompt,
146
+ actual_output=response,
147
+ retrieval_context=[str(vector_db_context), str(research_data)],
148
+ model="gpt-4",
149
+ )
128
150
 
129
151
  return response
130
152
 
131
-
153
+ @judgment.observe(span_type="Main Function", overwrite=True)
132
154
  async def generate_itinerary(destination, start_date, end_date):
133
155
  """Main function to generate a travel itinerary."""
134
-
135
- with judgment.trace(
136
- f"generate_itinerary_demo_{uuid4()}",
137
- project_name="travel_agent_demo"
138
- ) as trace:
139
- research_data = await research_destination(destination, start_date, end_date)
140
- res = await create_travel_plan(destination, start_date, end_date, research_data)
141
- trace.save()
142
- return res
156
+ research_data = await research_destination(destination, start_date, end_date)
157
+ res = await create_travel_plan(destination, start_date, end_date, research_data)
158
+ return res
143
159
 
144
160
 
145
161
  if __name__ == "__main__":
@@ -5,7 +5,7 @@ from tavily import TavilyClient
5
5
 
6
6
  from judgeval.common.tracer import Tracer
7
7
 
8
- judgment = Tracer()
8
+ judgment = Tracer(project_name="travel_agent_demo")
9
9
 
10
10
  @judgment.observe(span_type="search_tool")
11
11
  def search_tavily(query):
@@ -53,10 +53,10 @@ def run_judgment_evaluation(examples: List[Example]):
53
53
  scorer = FaithfulnessScorer(threshold=1.0)
54
54
 
55
55
  output = client.run_evaluation(
56
- model="osiris-mini",
56
+ model="osiris-large",
57
57
  examples=examples,
58
58
  scorers=[scorer],
59
- eval_run_name="cstone-basic-test-osiris-mini-2",
59
+ eval_run_name="cstone-basic-test-osiris-large-1",
60
60
  project_name="cstone_faithfulness_testing",
61
61
  override=True,
62
62
  )
@@ -66,7 +66,7 @@ def run_judgment_evaluation(examples: List[Example]):
66
66
  score = result.scorers_data[0].score
67
67
  scores.append(score)
68
68
 
69
- return [score < 0.95 for score in scores]
69
+ return [score < 1 for score in scores]
70
70
 
71
71
  def run_patronus_evaluation(examples: List[Example]):
72
72
  """
@@ -94,7 +94,7 @@ def run_patronus_evaluation(examples: List[Example]):
94
94
 
95
95
  print(f"patronus scores: {scores}")
96
96
 
97
- return [score < 0.95 for score in scores]
97
+ return [score < 0.9 for score in scores]
98
98
 
99
99
  def evaluate_predictions(predictions):
100
100
  """Calculate metrics comparing predictions to gold labels"""
@@ -557,7 +557,8 @@ class TraceClient:
557
557
  "overwrite": overwrite
558
558
  }
559
559
 
560
- if not empty_save:
560
+ # Execute asynchronous evaluation in the background
561
+ if not empty_save: # Only send to RabbitMQ if the trace is not empty
561
562
  connection = pika.BlockingConnection(
562
563
  pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
563
564
  channel = connection.channel()
@@ -588,23 +589,31 @@ class Tracer:
588
589
  cls._instance = super(Tracer, cls).__new__(cls)
589
590
  return cls._instance
590
591
 
591
- def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY")):
592
+ def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY"), project_name: str = "default_project"):
592
593
  if not hasattr(self, 'initialized'):
593
-
594
594
  if not api_key:
595
595
  raise ValueError("Tracer must be configured with a Judgment API key")
596
596
 
597
597
  self.api_key: str = api_key
598
+ self.project_name: str = project_name
598
599
  self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
599
600
  self.depth: int = 0
600
601
  self._current_trace: Optional[str] = None
601
602
  self.initialized: bool = True
603
+ elif hasattr(self, 'project_name') and self.project_name != project_name:
604
+ warnings.warn(
605
+ f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
606
+ f"project_name='{self.project_name}'. Due to the singleton pattern, the original project_name will be used. "
607
+ "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
608
+ RuntimeWarning
609
+ )
602
610
 
603
611
  @contextmanager
604
- def trace(self, name: str, project_name: str = "default_project", overwrite: bool = False) -> Generator[TraceClient, None, None]:
612
+ def trace(self, name: str, project_name: str = None, overwrite: bool = False) -> Generator[TraceClient, None, None]:
605
613
  """Start a new trace context using a context manager"""
606
614
  trace_id = str(uuid.uuid4())
607
- trace = TraceClient(self, trace_id, name, project_name=project_name, overwrite=overwrite)
615
+ project = project_name if project_name is not None else self.project_name
616
+ trace = TraceClient(self, trace_id, name, project_name=project, overwrite=overwrite)
608
617
  prev_trace = self._current_trace
609
618
  self._current_trace = trace
610
619
 
@@ -623,28 +632,40 @@ class Tracer:
623
632
  """
624
633
  return self._current_trace
625
634
 
626
- def observe(self, func=None, *, name=None, span_type: SpanType = "span"):
635
+ def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False):
627
636
  """
628
637
  Decorator to trace function execution with detailed entry/exit information.
629
638
 
630
639
  Args:
631
- func: The function to trace
632
- name: Optional custom name for the function
633
- span_type: The type of span to use for this observation (default: "span")
640
+ func: The function to decorate
641
+ name: Optional custom name for the span (defaults to function name)
642
+ span_type: Type of span (default "span")
643
+ project_name: Optional project name override
644
+ overwrite: Whether to overwrite existing traces
634
645
  """
635
646
  if func is None:
636
- return lambda f: self.observe(f, name=name, span_type=span_type)
647
+ return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)
648
+
649
+ # Use provided name or fall back to function name
650
+ span_name = name or func.__name__
637
651
 
638
652
  if asyncio.iscoroutinefunction(func):
639
653
  @functools.wraps(func)
640
654
  async def async_wrapper(*args, **kwargs):
655
+ # If there's already a trace, use it. Otherwise create a new one
641
656
  if self._current_trace:
642
- span_name = name or func.__name__
643
-
644
- with self._current_trace.span(span_name, span_type=span_type) as span:
645
- # Set the span type
646
- span.span_type = span_type
647
-
657
+ trace = self._current_trace
658
+ else:
659
+ trace_id = str(uuid.uuid4())
660
+ trace_name = str(uuid.uuid4())
661
+ project = project_name if project_name is not None else self.project_name
662
+ trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
663
+ self._current_trace = trace
664
+ # Only save empty trace for the root call
665
+ trace.save(empty_save=True, overwrite=overwrite)
666
+
667
+ try:
668
+ with trace.span(span_name, span_type=span_type) as span:
648
669
  # Record inputs
649
670
  span.record_input({
650
671
  'args': list(args),
@@ -658,19 +679,30 @@ class Tracer:
658
679
  span.record_output(result)
659
680
 
660
681
  return result
661
-
662
- return await func(*args, **kwargs)
682
+ finally:
683
+ # Only save and cleanup if this is the root observe call
684
+ if self.depth == 0:
685
+ trace.save(empty_save=False, overwrite=overwrite)
686
+ self._current_trace = None
687
+
663
688
  return async_wrapper
664
689
  else:
665
690
  @functools.wraps(func)
666
691
  def wrapper(*args, **kwargs):
692
+ # If there's already a trace, use it. Otherwise create a new one
667
693
  if self._current_trace:
668
- span_name = name or func.__name__
669
-
670
- with self._current_trace.span(span_name, span_type=span_type) as span:
671
- # Set the span type
672
- span.span_type = span_type
673
-
694
+ trace = self._current_trace
695
+ else:
696
+ trace_id = str(uuid.uuid4())
697
+ trace_name = str(uuid.uuid4())
698
+ project = project_name if project_name is not None else self.project_name
699
+ trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
700
+ self._current_trace = trace
701
+ # Only save empty trace for the root call
702
+ trace.save(empty_save=True, overwrite=overwrite)
703
+
704
+ try:
705
+ with trace.span(span_name, span_type=span_type) as span:
674
706
  # Record inputs
675
707
  span.record_input({
676
708
  'args': list(args),
@@ -684,8 +716,12 @@ class Tracer:
684
716
  span.record_output(result)
685
717
 
686
718
  return result
687
-
688
- return func(*args, **kwargs)
719
+ finally:
720
+ # Only save and cleanup if this is the root observe call
721
+ if self.depth == 0:
722
+ trace.save(empty_save=False, overwrite=overwrite)
723
+ self._current_trace = None
724
+
689
725
  return wrapper
690
726
 
691
727
  def wrap(client: Any) -> Any:
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes