judgeval 0.0.33__tar.gz → 0.0.34__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval-0.0.34/.github/pull_request_template.md +31 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/PKG-INFO +1 -1
- {judgeval-0.0.33 → judgeval-0.0.34}/pyproject.toml +1 -1
- judgeval-0.0.34/src/demo/demo.py +54 -0
- judgeval-0.0.34/src/demo/demo2.py +144 -0
- judgeval-0.0.34/src/test.py +143 -0
- judgeval-0.0.33/src/demo/demo.py +0 -50
- {judgeval-0.0.33 → judgeval-0.0.34}/.github/workflows/ci.yaml +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/.gitignore +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/LICENSE.md +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/Pipfile +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/Pipfile.lock +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/README.md +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/README.md +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/alerts/notifications.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/alerts/platform_notifications.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/alerts/rules.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/api_reference/judgment_client.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/api_reference/trace.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/changelog/2025-04-21.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/clustering/clustering.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/development.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/essentials/code.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/essentials/images.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/essentials/markdown.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/essentials/navigation.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/essentials/reusable-snippets.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/essentials/settings.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/data_datasets.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/data_examples.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/data_sequences.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/introduction.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/judges.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/agent/derailment.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/classifier_scorer.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/custom_scorers.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/answer_correctness.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/answer_relevancy.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/comparison.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/contextual_precision.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/contextual_recall.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/contextual_relevancy.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/execution_order.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/faithfulness.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/groundedness.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/json_correctness.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/summarization.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/introduction.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/unit_testing.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/favicon.svg +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/getting_started.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/annotation_queue_ui.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/basic_trace_example.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/checks-passed.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/cluster.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/cluster_button.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/create_aggressive_scorer.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/create_scorer.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/dashboard_annotation_queue_button.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/evaluation_diagram.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/hero-dark.svg +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/hero-light.svg +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/notifications_page.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/online_eval_fault.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/reports_modal.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/images/trace_ss.png +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/integration/langgraph.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/introduction.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/logo/dark.svg +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/logo/light.svg +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/mcp_server/mcp_server.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/mint.json +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/monitoring/annotations.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/monitoring/introduction.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/monitoring/production_insights.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/monitoring/tracing.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/notebooks/create_dataset.ipynb +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/notebooks/create_scorer.ipynb +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/notebooks/demo.ipynb +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/notebooks/prompt_scorer.ipynb +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/notebooks/quickstart.ipynb +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/quickstart.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/docs/snippets/snippet-intro.mdx +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/pytest.ini +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/custom_scorer/main.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/custom_scorer/scorer.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/dataset.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/new_bot/basic_bot.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/simple_trace.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/simplified_tracing/example_complex_async.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/streaming_anthropic_demo.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/streaming_openai_demo.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/test.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/demo/travel_agent.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/clients.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/common/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/common/exceptions.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/common/logger.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/common/s3_storage.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/common/tracer.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/common/utils.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/constants.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/custom_example.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/datasets/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/datasets/dataset.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/datasets/eval_dataset_client.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/example.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/result.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/scorer_data.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/sequence.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/data/sequence_run.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/evaluation_run.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/integrations/langgraph.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/judges/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/judges/base_judge.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/judges/litellm_judge.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/judges/mixture_of_judges.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/judges/together_judge.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/judges/utils.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/judgment_client.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/rules.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/run_evaluation.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/api_scorer.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/exceptions.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorer.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/prompt_scorer.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/score.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/utils.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/tracer/__init__.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/utils/alerts.py +0 -0
- {judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/version_check.py +0 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
## 📝 Summary
|
2
|
+
|
3
|
+
<!-- Provide a brief description of the changes introduced by this PR -->
|
4
|
+
|
5
|
+
## 🎯 Purpose
|
6
|
+
|
7
|
+
<!-- Explain the motivation behind these changes. Why are they necessary? -->
|
8
|
+
|
9
|
+
## 🎥 Demo of Changes
|
10
|
+
|
11
|
+
<!-- Add a short 1-3 minute video describing/demoing the changes -->
|
12
|
+
|
13
|
+
## 🧪 Testing
|
14
|
+
|
15
|
+
<!-- Describe how the changes were tested (unit/manual) -->
|
16
|
+
|
17
|
+
## ✅ Checklist
|
18
|
+
|
19
|
+
- [ ] Self-review
|
20
|
+
- [ ] Video demo of changes
|
21
|
+
- [ ] Unit Tests and CI/CD tests are passing
|
22
|
+
- [ ] Reviewers assigned
|
23
|
+
|
24
|
+
|
25
|
+
## 📌 Linear Issue
|
26
|
+
|
27
|
+
<!-- Reference to associated Linear ticket, e.g., ABC-123 -->
|
28
|
+
|
29
|
+
## ✏️ Additional Notes
|
30
|
+
|
31
|
+
<!-- Any additional information that doesn't fit into the other sections -->
|
@@ -0,0 +1,54 @@
|
|
1
|
+
from judgeval import JudgmentClient
|
2
|
+
from judgeval.data import Example, Sequence
|
3
|
+
from judgeval.scorers import DerailmentScorer
|
4
|
+
|
5
|
+
# Build the Judgment client with its default constructor arguments.
client = JudgmentClient()

# Pull the dataset named "test-dataset" from the "default_project" project
# and print whatever object the client hands back.
dataset = client.pull_dataset("test-dataset", "default_project")
print(dataset)
|
10
|
+
|
11
|
+
# airlines_example = Example(
|
12
|
+
# input="Which airlines fly to Tokyo?",
|
13
|
+
# actual_output="Japan Airlines, All Nippon Airways, and Chinese Airlines offer direct flights."
|
14
|
+
# )
|
15
|
+
# weather_example = Example(
|
16
|
+
# input="What is the weather like in Japan?",
|
17
|
+
# actual_output="It's cloudy with a high of 75°F and a low of 60°F in Japan."
|
18
|
+
# )
|
19
|
+
# airline_sequence = Sequence(
|
20
|
+
# name="Flight Details",
|
21
|
+
# items=[airlines_example, weather_example],
|
22
|
+
# )
|
23
|
+
|
24
|
+
# # Level 1: Top-level sequence
|
25
|
+
# top_example1 = Example(
|
26
|
+
# input="I want to plan a trip to Tokyo.",
|
27
|
+
# actual_output="That sounds great! When are you planning to go?"
|
28
|
+
# )
|
29
|
+
# top_example2 = Example(
|
30
|
+
# input="Can you book a flight for me and anything else I need to know?",
|
31
|
+
# actual_output="Sure, I'll help you with flights, hotels, and transportation."
|
32
|
+
# )
|
33
|
+
# top_level_sequence = Sequence(
|
34
|
+
# name="Travel Planning",
|
35
|
+
# items=[top_example1, top_example2, airline_sequence],
|
36
|
+
# )
|
37
|
+
|
38
|
+
# other_sequence = Sequence(
|
39
|
+
# name="Other",
|
40
|
+
# items=[Example(
|
41
|
+
# input="What is the weather like in Tokyo?",
|
42
|
+
# actual_output="It's cloudy with a high of 75°F and a low of 60°F in Tokyo."
|
43
|
+
# )]
|
44
|
+
# )
|
45
|
+
|
46
|
+
# results = client.run_sequence_evaluation(
|
47
|
+
# eval_run_name="sequence-run1",
|
48
|
+
# project_name="jnpr-demo-sequence",
|
49
|
+
# scorers=[DerailmentScorer(threshold=1)],
|
50
|
+
# sequences=[top_level_sequence, other_sequence],
|
51
|
+
# model="gpt-4o",
|
52
|
+
# log_results=True,
|
53
|
+
# override=True,
|
54
|
+
# )
|
@@ -0,0 +1,144 @@
|
|
1
|
+
import os
|
2
|
+
import asyncio
|
3
|
+
from openai import OpenAI, AsyncOpenAI
|
4
|
+
from dotenv import load_dotenv
|
5
|
+
from judgeval.common.tracer import Tracer, wrap
|
6
|
+
from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer, GroundednessScorer
|
7
|
+
|
8
|
+
# Load environment variables
|
9
|
+
load_dotenv()
|
10
|
+
|
11
|
+
# Initialize OpenAI client and Judgment tracer
|
12
|
+
client = wrap(OpenAI())
|
13
|
+
async_client = wrap(AsyncOpenAI())
|
14
|
+
judgment = Tracer(project_name="music-bot-demo")
|
15
|
+
|
16
|
+
@judgment.observe(span_type="tool")
async def search_tavily(query):
    """Run an advanced Tavily search for ``query`` and return the raw response.

    Args:
        query: Search string forwarded to ``TavilyClient.search``.

    Returns:
        The object returned by ``TavilyClient.search`` (at most five results
        are requested).
    """
    # Local import: tavily is only needed once a search is actually issued.
    from tavily import TavilyClient

    tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

    # TavilyClient.search is a synchronous call. Running it directly inside
    # this coroutine would stall the event loop for the whole request, so it
    # is pushed onto a worker thread and awaited instead.
    search_result = await asyncio.to_thread(
        tavily_client.search,
        query=query,
        search_depth="advanced",
        max_results=5,
    )

    return search_result
|
29
|
+
|
30
|
+
@judgment.observe(span_type="function")
async def ask_user_preferences():
    """Prompt on stdin for each music-taste question and collect the replies.

    Returns:
        dict keyed by the exact question text, with the typed reply as value.
    """
    prompts = (
        "What are some of your favorite artists or bands?",
        "What genres of music do you enjoy the most?",
        "Do you have any favorite songs currently?",
        "Are there any moods or themes you're looking for in new music?",
        "Do you prefer newer releases or classic songs?",
    )

    replies = {}
    for prompt_text in prompts:
        # Blank line before each question, then read the reply after "> ".
        print(f"\n{prompt_text}")
        replies[prompt_text] = input("> ")

    return replies
|
48
|
+
|
49
|
+
@judgment.observe(span_type="function")
async def search_music_recommendations(preferences):
    """Issue one web search per answered preference (artists, genres, mood).

    Args:
        preferences: Mapping of question text -> user answer.

    Returns:
        dict with any of the keys "artist_based", "genre_based" and
        "mood_based"; a key is present only when the matching answer is
        non-empty. Values are whatever ``search_tavily`` returns.
    """
    found = {}

    # Artists the user already likes -> look for similar music.
    favorite_artists = preferences.get("What are some of your favorite artists or bands?")
    if favorite_artists:
        found["artist_based"] = await search_tavily(f"Music similar to {favorite_artists}")

    # Preferred genres -> look for the best songs in them.
    favorite_genres = preferences.get("What genres of music do you enjoy the most?")
    if favorite_genres:
        found["genre_based"] = await search_tavily(f"Best {favorite_genres} songs")

    # Requested mood/theme -> look for matching recommendations.
    wanted_mood = preferences.get("Are there any moods or themes you're looking for in new music?")
    if wanted_mood:
        found["mood_based"] = await search_tavily(f"{wanted_mood} music recommendations")

    return found
|
71
|
+
|
72
|
+
@judgment.observe(span_type="function")
async def generate_recommendations(preferences, search_results):
    """Ask the chat model for song suggestions and queue an evaluation of them.

    Args:
        preferences: Mapping of question -> answer; interpolated into the
            prompt via its ``str()`` form.
        search_results: Mapping of category name -> search response. The
            "results" list of each response (if present) is condensed into
            the prompt context.

    Returns:
        The message content of the first completion choice.
    """
    # Collect the context pieces in a list and join once, rather than growing
    # a string with repeated +=.
    context_parts = []
    for category, results in search_results.items():
        context_parts.append(f"\n{category.replace('_', ' ').title()} Search Results:\n")
        for result in results.get("results", []):
            # A hit may have no "content" (missing key or None); fall back to
            # an empty string so the slice cannot raise TypeError.
            snippet = (result.get("content") or "")[:200]
            context_parts.append(f"- {result.get('title')}: {snippet}...\n")
    context = "".join(context_parts)

    # NOTE(review): the leading whitespace inside this literal is assumed to be
    # the usual 4-space body indent -- confirm against the original file.
    prompt = f"""
    Suggest 5-7 songs they could enjoy. Be creative and suggest whatever feels right. You should only recommend songs that are from the user's favorite artists/bands.
    For each song, include the artist name, song title, and a brief explanation of why they might like it.

    User Preferences:
    {preferences}

    Search Results:
    {context}

    Provide recommendations in a clear, organized format. Focus on specific songs rather than just artists.
    """

    # Generate recommendations with the async OpenAI client.
    response = await async_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a music recommendation expert with deep knowledge of various genres, artists, and songs. Your goal is to suggest songs that match the user's preferences; recommend songs from their favorite artists/bands."},
            {"role": "user", "content": prompt}
        ]
    )

    recommendations = response.choices[0].message.content

    # Score the answer on the current trace, using the raw search results as
    # retrieval context.
    judgment.get_current_trace().async_evaluate(
        scorers=[
            AnswerRelevancyScorer(threshold=1.0),
            GroundednessScorer(threshold=1.0)
        ],
        input=prompt,
        actual_output=recommendations,
        retrieval_context=[str(search_results)],
        model="gpt-4o"
    )

    return recommendations
|
121
|
+
|
122
|
+
@judgment.observe(span_type="Main Function")
async def music_recommendation_bot():
    """Drive the demo end to end: interview, search, recommend, print.

    Returns:
        The recommendation text produced by ``generate_recommendations``.
    """
    print("🎵 Welcome to the Music Recommendation Bot! 🎵")
    print("I'll ask you a few questions to understand your music taste, then suggest some songs you might enjoy.")

    # Step 1: interview the user.
    taste = await ask_user_preferences()

    # Step 2: gather web results for the answers given.
    print("\nSearching for music recommendations based on your preferences...")
    web_hits = await search_music_recommendations(taste)

    # Step 3: turn answers + results into suggestions.
    print("\nGenerating personalized recommendations...")
    suggestions = await generate_recommendations(taste, web_hits)

    print("\n🎧 Your Personalized Music Recommendations 🎧")
    print(suggestions)

    return suggestions


if __name__ == "__main__":
    # Script entry point: run the bot on a fresh event loop.
    asyncio.run(music_recommendation_bot())
|
144
|
+
|
@@ -0,0 +1,143 @@
|
|
1
|
+
import os
|
2
|
+
import asyncio
|
3
|
+
from openai import OpenAI
|
4
|
+
from dotenv import load_dotenv
|
5
|
+
from judgeval.common.tracer import Tracer, wrap
|
6
|
+
from judgeval.scorers import AnswerRelevancyScorer, FaithfulnessScorer, GroundednessScorer
|
7
|
+
|
8
|
+
# Load environment variables
|
9
|
+
load_dotenv()
|
10
|
+
|
11
|
+
# Initialize OpenAI client and Judgment tracer
|
12
|
+
client = wrap(OpenAI())
|
13
|
+
judgment = Tracer(project_name="music-bot-demo")
|
14
|
+
|
15
|
+
@judgment.observe(span_type="tool")
async def search_tavily(query):
    """Query Tavily in "advanced" depth and hand back its response unchanged.

    Args:
        query: Search string passed through to ``TavilyClient.search``.

    Returns:
        The response object from ``TavilyClient.search``, limited to five
        results by the request.
    """
    # Function-scope import: tavily is only required when a search runs.
    from tavily import TavilyClient

    tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))

    # The Tavily call is synchronous; awaiting it on a worker thread keeps the
    # event loop responsive instead of blocking it for the whole request.
    search_result = await asyncio.to_thread(
        tavily_client.search,
        query=query,
        search_depth="advanced",
        max_results=5,
    )

    return search_result
|
28
|
+
|
29
|
+
@judgment.observe(span_type="function")
async def ask_user_preferences():
    """Walk the user through a fixed questionnaire about their music taste.

    Returns:
        dict whose keys are the question strings and whose values are the
        answers read from stdin.
    """
    questionnaire = (
        "What are some of your favorite artists or bands?",
        "What genres of music do you enjoy the most?",
        "Do you have any favorite songs currently?",
        "Are there any moods or themes you're looking for in new music?",
        "Do you prefer newer releases or classic songs?",
    )

    collected = {}
    for item in questionnaire:
        # Print the question on its own line, then read the reply.
        print(f"\n{item}")
        collected[item] = input("> ")

    return collected
|
47
|
+
|
48
|
+
@judgment.observe(span_type="function")
async def search_music_recommendations(preferences):
    """Turn the answered preferences into web searches and gather the results.

    Args:
        preferences: Mapping of question text -> user answer.

    Returns:
        dict that may contain "artist_based", "genre_based" and "mood_based";
        each key appears only if the corresponding answer is truthy, and maps
        to the value returned by ``search_tavily``.
    """
    hits = {}

    # Favourite artists/bands.
    artists_answer = preferences.get("What are some of your favorite artists or bands?")
    if artists_answer:
        hits["artist_based"] = await search_tavily(f"Music similar to {artists_answer}")

    # Favourite genres.
    genres_answer = preferences.get("What genres of music do you enjoy the most?")
    if genres_answer:
        hits["genre_based"] = await search_tavily(f"Best {genres_answer} songs")

    # Mood or theme.
    mood_answer = preferences.get("Are there any moods or themes you're looking for in new music?")
    if mood_answer:
        hits["mood_based"] = await search_tavily(f"{mood_answer} music recommendations")

    return hits
|
70
|
+
|
71
|
+
@judgment.observe(span_type="function")
async def generate_recommendations(preferences, search_results):
    """Build a prompt from preferences and search hits, query the model, and evaluate.

    Args:
        preferences: Mapping of question -> answer; embedded in the prompt via
            its ``str()`` form.
        search_results: Mapping of category name -> search response whose
            optional "results" list is summarised into the prompt context.

    Returns:
        The message content of the first completion choice.
    """
    # Accumulate context fragments and join once instead of repeated +=.
    fragments = []
    for category, results in search_results.items():
        fragments.append(f"\n{category.replace('_', ' ').title()} Search Results:\n")
        for result in results.get("results", []):
            # "content" can be absent or None; use "" so slicing never raises
            # TypeError on such a hit.
            excerpt = (result.get("content") or "")[:200]
            fragments.append(f"- {result.get('title')}: {excerpt}...\n")
    context = "".join(fragments)

    # NOTE(review): the leading whitespace inside this literal is assumed to be
    # the usual 4-space body indent -- confirm against the original file.
    prompt = f"""
    Suggest 5-7 songs they could enjoy. Be creative and suggest whatever feels right. You should only recommend songs that are from the user's favorite artists/bands.
    For each song, include the artist name, song title, and a brief explanation of why they might like it.

    User Preferences:
    {preferences}

    Search Results:
    {context}

    Provide recommendations in a clear, organized format. Focus on specific songs rather than just artists.
    """

    # NOTE(review): this is the synchronous OpenAI client, so the request
    # blocks the event loop while it runs -- consider an async client here.
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a music recommendation expert with deep knowledge of various genres, artists, and songs. Your goal is to suggest songs that match the user's preferences; recommend songs from their favorite artists/bands."},
            {"role": "user", "content": prompt}
        ]
    )

    recommendations = response.choices[0].message.content

    # Score the answer on the current trace, with the raw search results as
    # retrieval context.
    judgment.get_current_trace().async_evaluate(
        scorers=[
            AnswerRelevancyScorer(threshold=1.0),
            GroundednessScorer(threshold=1.0)
        ],
        input=prompt,
        actual_output=recommendations,
        retrieval_context=[str(search_results)],
        model="gpt-4o"
    )

    return recommendations
|
120
|
+
|
121
|
+
@judgment.observe(span_type="Main Function")
async def music_recommendation_bot():
    """Run the whole bot: greet, collect answers, search, generate, display.

    Returns:
        The recommendation text returned by ``generate_recommendations``.
    """
    print("🎵 Welcome to the Music Recommendation Bot! 🎵")
    print("I'll ask you a few questions to understand your music taste, then suggest some songs you might enjoy.")

    # Collect the user's answers first; everything else depends on them.
    user_answers = await ask_user_preferences()

    print("\nSearching for music recommendations based on your preferences...")
    lookup_results = await search_music_recommendations(user_answers)

    print("\nGenerating personalized recommendations...")
    final_text = await generate_recommendations(user_answers, lookup_results)

    print("\n🎧 Your Personalized Music Recommendations 🎧")
    print(final_text)

    return final_text


if __name__ == "__main__":
    # Script entry point: run the bot on a fresh event loop.
    asyncio.run(music_recommendation_bot())
|
143
|
+
|
judgeval-0.0.33/src/demo/demo.py
DELETED
@@ -1,50 +0,0 @@
|
|
1
|
-
from judgeval import JudgmentClient
|
2
|
-
from judgeval.data import Example, Sequence
|
3
|
-
from judgeval.scorers import DerailmentScorer
|
4
|
-
|
5
|
-
client = JudgmentClient()
|
6
|
-
|
7
|
-
airlines_example = Example(
|
8
|
-
input="Which airlines fly to Tokyo?",
|
9
|
-
actual_output="Japan Airlines, All Nippon Airways, and Chinese Airlines offer direct flights."
|
10
|
-
)
|
11
|
-
weather_example = Example(
|
12
|
-
input="What is the weather like in Japan?",
|
13
|
-
actual_output="It's cloudy with a high of 75°F and a low of 60°F in Japan."
|
14
|
-
)
|
15
|
-
airline_sequence = Sequence(
|
16
|
-
name="Flight Details",
|
17
|
-
items=[airlines_example, weather_example],
|
18
|
-
)
|
19
|
-
|
20
|
-
# Level 1: Top-level sequence
|
21
|
-
top_example1 = Example(
|
22
|
-
input="I want to plan a trip to Tokyok.",
|
23
|
-
actual_output="That sounds great! When are you planning to go?"
|
24
|
-
)
|
25
|
-
top_example2 = Example(
|
26
|
-
input="Can you book a flight for me and anything else I need to know?",
|
27
|
-
actual_output="Sure, I'll help you with flights. hotels. and transportation."
|
28
|
-
)
|
29
|
-
top_level_sequence = Sequence(
|
30
|
-
name="Travel Planning",
|
31
|
-
items=[top_example1, top_example2, airline_sequence],
|
32
|
-
)
|
33
|
-
|
34
|
-
other_sequence = Sequence(
|
35
|
-
name="Other",
|
36
|
-
items=[Example(
|
37
|
-
input="What is the weather like in Tokyo?",
|
38
|
-
actual_output="It's cloudy with a high of 75°F and a low of 60°F in Tokyo."
|
39
|
-
)]
|
40
|
-
)
|
41
|
-
|
42
|
-
results = client.run_sequence_evaluation(
|
43
|
-
eval_run_name="sequence-run1",
|
44
|
-
project_name="jnpr-demo-sequence",
|
45
|
-
scorers=[DerailmentScorer(threshold=1)],
|
46
|
-
sequences=[top_level_sequence, other_sequence],
|
47
|
-
model="gpt-4o",
|
48
|
-
log_results=True,
|
49
|
-
override=True,
|
50
|
-
)
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/contextual_precision.mdx
RENAMED
File without changes
|
File without changes
|
{judgeval-0.0.33 → judgeval-0.0.34}/docs/evaluation/scorers/default/contextual_relevancy.mdx
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/api_scorers/comparison.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{judgeval-0.0.33 → judgeval-0.0.34}/src/judgeval/scorers/judgeval_scorers/classifiers/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|