judgeval 0.0.33__tar.gz → 0.0.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. judgeval-0.0.35/.github/pull_request_template.md +31 -0
  2. {judgeval-0.0.33 → judgeval-0.0.35}/PKG-INFO +2 -1
  3. {judgeval-0.0.33 → judgeval-0.0.35}/pyproject.toml +3 -2
  4. judgeval-0.0.35/src/demo/demo.py +54 -0
  5. judgeval-0.0.35/src/demo/demo2.py +144 -0
  6. judgeval-0.0.35/src/test.py +143 -0
  7. judgeval-0.0.33/src/demo/demo.py +0 -50
  8. {judgeval-0.0.33 → judgeval-0.0.35}/.github/workflows/ci.yaml +0 -0
  9. {judgeval-0.0.33 → judgeval-0.0.35}/.gitignore +0 -0
  10. {judgeval-0.0.33 → judgeval-0.0.35}/LICENSE.md +0 -0
  11. {judgeval-0.0.33 → judgeval-0.0.35}/Pipfile +0 -0
  12. {judgeval-0.0.33 → judgeval-0.0.35}/Pipfile.lock +0 -0
  13. {judgeval-0.0.33 → judgeval-0.0.35}/README.md +0 -0
  14. {judgeval-0.0.33 → judgeval-0.0.35}/docs/README.md +0 -0
  15. {judgeval-0.0.33 → judgeval-0.0.35}/docs/alerts/notifications.mdx +0 -0
  16. {judgeval-0.0.33 → judgeval-0.0.35}/docs/alerts/platform_notifications.mdx +0 -0
  17. {judgeval-0.0.33 → judgeval-0.0.35}/docs/alerts/rules.mdx +0 -0
  18. {judgeval-0.0.33 → judgeval-0.0.35}/docs/api_reference/judgment_client.mdx +0 -0
  19. {judgeval-0.0.33 → judgeval-0.0.35}/docs/api_reference/trace.mdx +0 -0
  20. {judgeval-0.0.33 → judgeval-0.0.35}/docs/changelog/2025-04-21.mdx +0 -0
  21. {judgeval-0.0.33 → judgeval-0.0.35}/docs/clustering/clustering.mdx +0 -0
  22. {judgeval-0.0.33 → judgeval-0.0.35}/docs/development.mdx +0 -0
  23. {judgeval-0.0.33 → judgeval-0.0.35}/docs/essentials/code.mdx +0 -0
  24. {judgeval-0.0.33 → judgeval-0.0.35}/docs/essentials/images.mdx +0 -0
  25. {judgeval-0.0.33 → judgeval-0.0.35}/docs/essentials/markdown.mdx +0 -0
  26. {judgeval-0.0.33 → judgeval-0.0.35}/docs/essentials/navigation.mdx +0 -0
  27. {judgeval-0.0.33 → judgeval-0.0.35}/docs/essentials/reusable-snippets.mdx +0 -0
  28. {judgeval-0.0.33 → judgeval-0.0.35}/docs/essentials/settings.mdx +0 -0
  29. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/data_datasets.mdx +0 -0
  30. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/data_examples.mdx +0 -0
  31. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/data_sequences.mdx +0 -0
  32. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/introduction.mdx +0 -0
  33. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/judges.mdx +0 -0
  34. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/agent/derailment.mdx +0 -0
  35. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
  36. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
  37. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/answer_correctness.mdx +0 -0
  38. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/answer_relevancy.mdx +0 -0
  39. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/comparison.mdx +0 -0
  40. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/contextual_precision.mdx +0 -0
  41. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/contextual_recall.mdx +0 -0
  42. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/contextual_relevancy.mdx +0 -0
  43. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/execution_order.mdx +0 -0
  44. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/faithfulness.mdx +0 -0
  45. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/groundedness.mdx +0 -0
  46. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/json_correctness.mdx +0 -0
  47. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/default/summarization.mdx +0 -0
  48. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/scorers/introduction.mdx +0 -0
  49. {judgeval-0.0.33 → judgeval-0.0.35}/docs/evaluation/unit_testing.mdx +0 -0
  50. {judgeval-0.0.33 → judgeval-0.0.35}/docs/favicon.svg +0 -0
  51. {judgeval-0.0.33 → judgeval-0.0.35}/docs/getting_started.mdx +0 -0
  52. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/annotation_queue_ui.png +0 -0
  53. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/basic_trace_example.png +0 -0
  54. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/checks-passed.png +0 -0
  55. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/cluster.png +0 -0
  56. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/cluster_button.png +0 -0
  57. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/create_aggressive_scorer.png +0 -0
  58. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/create_scorer.png +0 -0
  59. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/dashboard_annotation_queue_button.png +0 -0
  60. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/evaluation_diagram.png +0 -0
  61. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/hero-dark.svg +0 -0
  62. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/hero-light.svg +0 -0
  63. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/notifications_page.png +0 -0
  64. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/online_eval_fault.png +0 -0
  65. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/reports_modal.png +0 -0
  66. {judgeval-0.0.33 → judgeval-0.0.35}/docs/images/trace_ss.png +0 -0
  67. {judgeval-0.0.33 → judgeval-0.0.35}/docs/integration/langgraph.mdx +0 -0
  68. {judgeval-0.0.33 → judgeval-0.0.35}/docs/introduction.mdx +0 -0
  69. {judgeval-0.0.33 → judgeval-0.0.35}/docs/logo/dark.svg +0 -0
  70. {judgeval-0.0.33 → judgeval-0.0.35}/docs/logo/light.svg +0 -0
  71. {judgeval-0.0.33 → judgeval-0.0.35}/docs/mcp_server/mcp_server.mdx +0 -0
  72. {judgeval-0.0.33 → judgeval-0.0.35}/docs/mint.json +0 -0
  73. {judgeval-0.0.33 → judgeval-0.0.35}/docs/monitoring/annotations.mdx +0 -0
  74. {judgeval-0.0.33 → judgeval-0.0.35}/docs/monitoring/introduction.mdx +0 -0
  75. {judgeval-0.0.33 → judgeval-0.0.35}/docs/monitoring/production_insights.mdx +0 -0
  76. {judgeval-0.0.33 → judgeval-0.0.35}/docs/monitoring/tracing.mdx +0 -0
  77. {judgeval-0.0.33 → judgeval-0.0.35}/docs/notebooks/create_dataset.ipynb +0 -0
  78. {judgeval-0.0.33 → judgeval-0.0.35}/docs/notebooks/create_scorer.ipynb +0 -0
  79. {judgeval-0.0.33 → judgeval-0.0.35}/docs/notebooks/demo.ipynb +0 -0
  80. {judgeval-0.0.33 → judgeval-0.0.35}/docs/notebooks/prompt_scorer.ipynb +0 -0
  81. {judgeval-0.0.33 → judgeval-0.0.35}/docs/notebooks/quickstart.ipynb +0 -0
  82. {judgeval-0.0.33 → judgeval-0.0.35}/docs/quickstart.mdx +0 -0
  83. {judgeval-0.0.33 → judgeval-0.0.35}/docs/snippets/snippet-intro.mdx +0 -0
  84. {judgeval-0.0.33 → judgeval-0.0.35}/pytest.ini +0 -0
  85. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/custom_scorer/main.py +0 -0
  86. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/custom_scorer/scorer.py +0 -0
  87. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/dataset.py +0 -0
  88. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/new_bot/basic_bot.py +0 -0
  89. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/simple_trace.py +0 -0
  90. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/simplified_tracing/example_complex_async.py +0 -0
  91. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/streaming_anthropic_demo.py +0 -0
  92. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/streaming_openai_demo.py +0 -0
  93. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/test.py +0 -0
  94. {judgeval-0.0.33 → judgeval-0.0.35}/src/demo/travel_agent.py +0 -0
  95. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/__init__.py +0 -0
  96. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/clients.py +0 -0
  97. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/common/__init__.py +0 -0
  98. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/common/exceptions.py +0 -0
  99. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/common/logger.py +0 -0
  100. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/common/s3_storage.py +0 -0
  101. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/common/tracer.py +0 -0
  102. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/common/utils.py +0 -0
  103. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/constants.py +0 -0
  104. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/__init__.py +0 -0
  105. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/custom_example.py +0 -0
  106. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/datasets/__init__.py +0 -0
  107. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/datasets/dataset.py +0 -0
  108. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/datasets/eval_dataset_client.py +0 -0
  109. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/example.py +0 -0
  110. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/result.py +0 -0
  111. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/scorer_data.py +0 -0
  112. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/sequence.py +0 -0
  113. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/data/sequence_run.py +0 -0
  114. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/evaluation_run.py +0 -0
  115. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/integrations/langgraph.py +0 -0
  116. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/judges/__init__.py +0 -0
  117. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/judges/base_judge.py +0 -0
  118. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/judges/litellm_judge.py +0 -0
  119. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/judges/mixture_of_judges.py +0 -0
  120. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/judges/together_judge.py +0 -0
  121. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/judges/utils.py +0 -0
  122. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/judgment_client.py +0 -0
  123. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/rules.py +0 -0
  124. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/run_evaluation.py +0 -0
  125. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/__init__.py +0 -0
  126. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/api_scorer.py +0 -0
  127. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/exceptions.py +0 -0
  128. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorer.py +0 -0
  129. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
  130. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
  131. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
  132. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
  133. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
  134. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
  135. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
  136. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
  137. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
  138. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
  139. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
  140. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
  141. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
  142. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
  143. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
  144. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
  145. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
  146. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
  147. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
  148. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/prompt_scorer.py +0 -0
  149. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/score.py +0 -0
  150. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/scorers/utils.py +0 -0
  151. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/tracer/__init__.py +0 -0
  152. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/utils/alerts.py +0 -0
  153. {judgeval-0.0.33 → judgeval-0.0.35}/src/judgeval/version_check.py +0 -0
@@ -0,0 +1,31 @@
1
+ ## 📝 Summary
2
+
3
+ <!-- Provide a brief description of the changes introduced by this PR -->
4
+
5
+ ## 🎯 Purpose
6
+
7
+ <!-- Explain the motivation behind these changes. Why are they necessary? -->
8
+
9
+ ## 🎥 Demo of Changes
10
+
11
+ <!-- Add a short 1-3 minute video describing/demoing the changes -->
12
+
13
+ ## 🧪 Testing
14
+
15
+ <!-- Describe how the changes were tested (unit/manual) -->
16
+
17
+ ## ✅ Checklist
18
+
19
+ - [ ] Self-review
20
+ - [ ] Video demo of changes
21
+ - [ ] Unit Tests and CI/CD tests are passing
22
+ - [ ] Reviewers assigned
23
+
24
+
25
+ ## 📌 Linear Issue
26
+
27
+ <!-- Reference to associated Linear ticket, e.g., ABC-123 -->
28
+
29
+ ## ✏️ Additional Notes
30
+
31
+ <!-- Any additional information that doesn't fit into the other sections -->
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: judgeval
3
- Version: 0.0.33
3
+ Version: 0.0.35
4
4
  Summary: Judgeval Package
5
5
  Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
6
6
  Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
@@ -11,6 +11,7 @@ Classifier: Operating System :: OS Independent
11
11
  Classifier: Programming Language :: Python :: 3
12
12
  Requires-Python: >=3.11
13
13
  Requires-Dist: anthropic
14
+ Requires-Dist: boto3==1.38.3
14
15
  Requires-Dist: fastapi
15
16
  Requires-Dist: google-genai
16
17
  Requires-Dist: langchain
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "judgeval"
3
- version = "0.0.33"
3
+ version = "0.0.35"
4
4
  authors = [
5
5
  { name="Andrew Li", email="andrew@judgmentlabs.ai" },
6
6
  { name="Alex Shan", email="alex@judgmentlabs.ai" },
@@ -34,7 +34,8 @@ dependencies = [
34
34
  "langchain-openai",
35
35
  "langchain-anthropic",
36
36
  "langchain-core",
37
- "google-genai"
37
+ "google-genai",
38
+ "boto3==1.38.3"
38
39
  ]
39
40
 
40
41
  [project.optional-dependencies]
@@ -0,0 +1,54 @@
1
+ from judgeval import JudgmentClient
2
+ from judgeval.data import Example, Sequence
3
+ from judgeval.scorers import DerailmentScorer
4
+
5
+ client = JudgmentClient()
6
+
7
+ dataset = client.pull_dataset("test-dataset", "default_project")
8
+
9
+ print(dataset)
10
+
11
+ # airlines_example = Example(
12
+ # input="Which airlines fly to Tokyo?",
13
+ # actual_output="Japan Airlines, All Nippon Airways, and Chinese Airlines offer direct flights."
14
+ # )
15
+ # weather_example = Example(
16
+ # input="What is the weather like in Japan?",
17
+ # actual_output="It's cloudy with a high of 75°F and a low of 60°F in Japan."
18
+ # )
19
+ # airline_sequence = Sequence(
20
+ # name="Flight Details",
21
+ # items=[airlines_example, weather_example],
22
+ # )
23
+
24
+ # # Level 1: Top-level sequence
25
+ # top_example1 = Example(
26
+ # input="I want to plan a trip to Tokyok.",
27
+ # actual_output="That sounds great! When are you planning to go?"
28
+ # )
29
+ # top_example2 = Example(
30
+ # input="Can you book a flight for me and anything else I need to know?",
31
+ # actual_output="Sure, I'll help you with flights. hotels. and transportation."
32
+ # )
33
+ # top_level_sequence = Sequence(
34
+ # name="Travel Planning",
35
+ # items=[top_example1, top_example2, airline_sequence],
36
+ # )
37
+
38
+ # other_sequence = Sequence(
39
+ # name="Other",
40
+ # items=[Example(
41
+ # input="What is the weather like in Tokyo?",
42
+ # actual_output="It's cloudy with a high of 75°F and a low of 60°F in Tokyo."
43
+ # )]
44
+ # )
45
+
46
+ # results = client.run_sequence_evaluation(
47
+ # eval_run_name="sequence-run1",
48
+ # project_name="jnpr-demo-sequence",
49
+ # scorers=[DerailmentScorer(threshold=1)],
50
+ # sequences=[top_level_sequence, other_sequence],
51
+ # model="gpt-4o",
52
+ # log_results=True,
53
+ # override=True,
54
+ # )
@@ -0,0 +1,144 @@
1
+ import os
2
+ import asyncio
3
+ from openai import OpenAI, AsyncOpenAI
4
+ from dotenv import load_dotenv
5
+ from judgeval.common.tracer import Tracer, wrap
6
+ from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer, GroundednessScorer
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
+ # Initialize OpenAI client and Judgment tracer
12
+ client = wrap(OpenAI())
13
+ async_client = wrap(AsyncOpenAI())
14
+ judgment = Tracer(project_name="music-bot-demo")
15
+
16
+ @judgment.observe(span_type="tool")
17
+ async def search_tavily(query):
18
+ """Search for information using Tavily."""
19
+ from tavily import TavilyClient
20
+
21
+ tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
22
+ search_result = tavily_client.search(
23
+ query=query,
24
+ search_depth="advanced",
25
+ max_results=5
26
+ )
27
+
28
+ return search_result
29
+
30
+ @judgment.observe(span_type="function")
31
+ async def ask_user_preferences():
32
+ """Ask the user a series of questions about their music preferences."""
33
+ questions = [
34
+ "What are some of your favorite artists or bands?",
35
+ "What genres of music do you enjoy the most?",
36
+ "Do you have any favorite songs currently?",
37
+ "Are there any moods or themes you're looking for in new music?",
38
+ "Do you prefer newer releases or classic songs?"
39
+ ]
40
+
41
+ preferences = {}
42
+ for question in questions:
43
+ print(f"\n{question}")
44
+ answer = input("> ")
45
+ preferences[question] = answer
46
+
47
+ return preferences
48
+
49
+ @judgment.observe(span_type="function")
50
+ async def search_music_recommendations(preferences):
51
+ """Search for music recommendations based on user preferences."""
52
+ # Construct search queries based on preferences
53
+ search_results = {}
54
+
55
+ # Search for artist recommendations
56
+ if preferences.get("What are some of your favorite artists or bands?"):
57
+ artists_query = f"Music similar to {preferences['What are some of your favorite artists or bands?']}"
58
+ search_results["artist_based"] = await search_tavily(artists_query)
59
+
60
+ # Search for genre recommendations
61
+ if preferences.get("What genres of music do you enjoy the most?"):
62
+ genre_query = f"Best {preferences['What genres of music do you enjoy the most?']} songs"
63
+ search_results["genre_based"] = await search_tavily(genre_query)
64
+
65
+ # Search for mood-based recommendations
66
+ if preferences.get("Are there any moods or themes you're looking for in new music?"):
67
+ mood_query = f"""{preferences["Are there any moods or themes you're looking for in new music?"]} music recommendations"""
68
+ search_results["mood_based"] = await search_tavily(mood_query)
69
+
70
+ return search_results
71
+
72
+ @judgment.observe(span_type="function")
73
+ async def generate_recommendations(preferences, search_results):
74
+ """Generate personalized music recommendations using the search results."""
75
+ # Prepare context from search results
76
+ context = ""
77
+ for category, results in search_results.items():
78
+ context += f"\n{category.replace('_', ' ').title()} Search Results:\n"
79
+ for result in results.get("results", []):
80
+ context += f"- {result.get('title')}: {result.get('content')[:200]}...\n"
81
+
82
+ # Create a prompt for the LLM
83
+ prompt = f"""
84
+ Suggest 5-7 songs they could enjoy. Be creative and suggest whatever feels right. You should only recommend songs that are from the user's favorite artists/bands.
85
+ For each song, include the artist name, song title, and a brief explanation of why they might like it.
86
+
87
+ User Preferences:
88
+ {preferences}
89
+
90
+ Search Results:
91
+ {context}
92
+
93
+ Provide recommendations in a clear, organized format. Focus on specific songs rather than just artists.
94
+ """
95
+
96
+
97
+ # Generate recommendations using OpenAI
98
+ response = await async_client.chat.completions.create(
99
+ model="gpt-4o-mini",
100
+ messages=[
101
+ {"role": "system", "content": "You are a music recommendation expert with deep knowledge of various genres, artists, and songs. Your goal is to suggest songs that match the user's preferences; recommend songs from their favorite artists/bands."},
102
+ {"role": "user", "content": prompt}
103
+ ]
104
+ )
105
+
106
+ recommendations = response.choices[0].message.content
107
+
108
+ # Evaluate the recommendations
109
+ judgment.get_current_trace().async_evaluate(
110
+ scorers=[
111
+ AnswerRelevancyScorer(threshold=1.0),
112
+ GroundednessScorer(threshold=1.0)
113
+ ],
114
+ input=prompt,
115
+ actual_output=recommendations,
116
+ retrieval_context=[str(search_results)],
117
+ model="gpt-4o"
118
+ )
119
+
120
+ return recommendations
121
+
122
+ @judgment.observe(span_type="Main Function")
123
+ async def music_recommendation_bot():
124
+ """Main function to run the music recommendation bot."""
125
+ print("🎵 Welcome to the Music Recommendation Bot! 🎵")
126
+ print("I'll ask you a few questions to understand your music taste, then suggest some songs you might enjoy.")
127
+
128
+ # Get user preferences
129
+ preferences = await ask_user_preferences()
130
+
131
+ print("\nSearching for music recommendations based on your preferences...")
132
+ search_results = await search_music_recommendations(preferences)
133
+
134
+ print("\nGenerating personalized recommendations...")
135
+ recommendations = await generate_recommendations(preferences, search_results)
136
+
137
+ print("\n🎧 Your Personalized Music Recommendations 🎧")
138
+ print(recommendations)
139
+
140
+ return recommendations
141
+
142
+ if __name__ == "__main__":
143
+ asyncio.run(music_recommendation_bot())
144
+
@@ -0,0 +1,143 @@
1
+ import os
2
+ import asyncio
3
+ from openai import OpenAI
4
+ from dotenv import load_dotenv
5
+ from judgeval.common.tracer import Tracer, wrap
6
+ from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer, GroundednessScorer
7
+
8
+ # Load environment variables
9
+ load_dotenv()
10
+
11
+ # Initialize OpenAI client and Judgment tracer
12
+ client = wrap(OpenAI())
13
+ judgment = Tracer(project_name="music-bot-demo")
14
+
15
+ @judgment.observe(span_type="tool")
16
+ async def search_tavily(query):
17
+ """Search for information using Tavily."""
18
+ from tavily import TavilyClient
19
+
20
+ tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
21
+ search_result = tavily_client.search(
22
+ query=query,
23
+ search_depth="advanced",
24
+ max_results=5
25
+ )
26
+
27
+ return search_result
28
+
29
+ @judgment.observe(span_type="function")
30
+ async def ask_user_preferences():
31
+ """Ask the user a series of questions about their music preferences."""
32
+ questions = [
33
+ "What are some of your favorite artists or bands?",
34
+ "What genres of music do you enjoy the most?",
35
+ "Do you have any favorite songs currently?",
36
+ "Are there any moods or themes you're looking for in new music?",
37
+ "Do you prefer newer releases or classic songs?"
38
+ ]
39
+
40
+ preferences = {}
41
+ for question in questions:
42
+ print(f"\n{question}")
43
+ answer = input("> ")
44
+ preferences[question] = answer
45
+
46
+ return preferences
47
+
48
+ @judgment.observe(span_type="function")
49
+ async def search_music_recommendations(preferences):
50
+ """Search for music recommendations based on user preferences."""
51
+ # Construct search queries based on preferences
52
+ search_results = {}
53
+
54
+ # Search for artist recommendations
55
+ if preferences.get("What are some of your favorite artists or bands?"):
56
+ artists_query = f"Music similar to {preferences['What are some of your favorite artists or bands?']}"
57
+ search_results["artist_based"] = await search_tavily(artists_query)
58
+
59
+ # Search for genre recommendations
60
+ if preferences.get("What genres of music do you enjoy the most?"):
61
+ genre_query = f"Best {preferences['What genres of music do you enjoy the most?']} songs"
62
+ search_results["genre_based"] = await search_tavily(genre_query)
63
+
64
+ # Search for mood-based recommendations
65
+ if preferences.get("Are there any moods or themes you're looking for in new music?"):
66
+ mood_query = f"""{preferences["Are there any moods or themes you're looking for in new music?"]} music recommendations"""
67
+ search_results["mood_based"] = await search_tavily(mood_query)
68
+
69
+ return search_results
70
+
71
+ @judgment.observe(span_type="function")
72
+ async def generate_recommendations(preferences, search_results):
73
+ """Generate personalized music recommendations using the search results."""
74
+ # Prepare context from search results
75
+ context = ""
76
+ for category, results in search_results.items():
77
+ context += f"\n{category.replace('_', ' ').title()} Search Results:\n"
78
+ for result in results.get("results", []):
79
+ context += f"- {result.get('title')}: {result.get('content')[:200]}...\n"
80
+
81
+ # Create a prompt for the LLM
82
+ prompt = f"""
83
+ Suggest 5-7 songs they could enjoy. Be creative and suggest whatever feels right. You should only recommend songs that are from the user's favorite artists/bands.
84
+ For each song, include the artist name, song title, and a brief explanation of why they might like it.
85
+
86
+ User Preferences:
87
+ {preferences}
88
+
89
+ Search Results:
90
+ {context}
91
+
92
+ Provide recommendations in a clear, organized format. Focus on specific songs rather than just artists.
93
+ """
94
+
95
+
96
+ # Generate recommendations using OpenAI
97
+ response = client.chat.completions.create(
98
+ model="gpt-4o-mini",
99
+ messages=[
100
+ {"role": "system", "content": "You are a music recommendation expert with deep knowledge of various genres, artists, and songs. Your goal is to suggest songs that match the user's preferences; recommend songs from their favorite artists/bands."},
101
+ {"role": "user", "content": prompt}
102
+ ]
103
+ )
104
+
105
+ recommendations = response.choices[0].message.content
106
+
107
+ # Evaluate the recommendations
108
+ judgment.get_current_trace().async_evaluate(
109
+ scorers=[
110
+ AnswerRelevancyScorer(threshold=1.0),
111
+ GroundednessScorer(threshold=1.0)
112
+ ],
113
+ input=prompt,
114
+ actual_output=recommendations,
115
+ retrieval_context=[str(search_results)],
116
+ model="gpt-4o"
117
+ )
118
+
119
+ return recommendations
120
+
121
+ @judgment.observe(span_type="Main Function")
122
+ async def music_recommendation_bot():
123
+ """Main function to run the music recommendation bot."""
124
+ print("🎵 Welcome to the Music Recommendation Bot! 🎵")
125
+ print("I'll ask you a few questions to understand your music taste, then suggest some songs you might enjoy.")
126
+
127
+ # Get user preferences
128
+ preferences = await ask_user_preferences()
129
+
130
+ print("\nSearching for music recommendations based on your preferences...")
131
+ search_results = await search_music_recommendations(preferences)
132
+
133
+ print("\nGenerating personalized recommendations...")
134
+ recommendations = await generate_recommendations(preferences, search_results)
135
+
136
+ print("\n🎧 Your Personalized Music Recommendations 🎧")
137
+ print(recommendations)
138
+
139
+ return recommendations
140
+
141
+ if __name__ == "__main__":
142
+ asyncio.run(music_recommendation_bot())
143
+
@@ -1,50 +0,0 @@
1
- from judgeval import JudgmentClient
2
- from judgeval.data import Example, Sequence
3
- from judgeval.scorers import DerailmentScorer
4
-
5
- client = JudgmentClient()
6
-
7
- airlines_example = Example(
8
- input="Which airlines fly to Tokyo?",
9
- actual_output="Japan Airlines, All Nippon Airways, and Chinese Airlines offer direct flights."
10
- )
11
- weather_example = Example(
12
- input="What is the weather like in Japan?",
13
- actual_output="It's cloudy with a high of 75°F and a low of 60°F in Japan."
14
- )
15
- airline_sequence = Sequence(
16
- name="Flight Details",
17
- items=[airlines_example, weather_example],
18
- )
19
-
20
- # Level 1: Top-level sequence
21
- top_example1 = Example(
22
- input="I want to plan a trip to Tokyok.",
23
- actual_output="That sounds great! When are you planning to go?"
24
- )
25
- top_example2 = Example(
26
- input="Can you book a flight for me and anything else I need to know?",
27
- actual_output="Sure, I'll help you with flights. hotels. and transportation."
28
- )
29
- top_level_sequence = Sequence(
30
- name="Travel Planning",
31
- items=[top_example1, top_example2, airline_sequence],
32
- )
33
-
34
- other_sequence = Sequence(
35
- name="Other",
36
- items=[Example(
37
- input="What is the weather like in Tokyo?",
38
- actual_output="It's cloudy with a high of 75°F and a low of 60°F in Tokyo."
39
- )]
40
- )
41
-
42
- results = client.run_sequence_evaluation(
43
- eval_run_name="sequence-run1",
44
- project_name="jnpr-demo-sequence",
45
- scorers=[DerailmentScorer(threshold=1)],
46
- sequences=[top_level_sequence, other_sequence],
47
- model="gpt-4o",
48
- log_results=True,
49
- override=True,
50
- )
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes