@ai-sdk-tool/eval 0.1.7 → 1.0.0-canary.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -14,7 +14,7 @@ It allows developers to:
14
14
 
15
15
  ## Core Concepts
16
16
 
17
- - **Benchmark (`LanguageModelV2Benchmark`)**: A standardized interface for creating an evaluation task. It has a `run` method that takes a `LanguageModel` and returns a `BenchmarkResult`.
17
+ - **Benchmark (`LanguageModelV3Benchmark`)**: A standardized interface for creating an evaluation task. It has a `run` method that takes a `LanguageModel` and returns a `BenchmarkResult`.
18
18
  - **`evaluate` function**: The core function that runs a set of benchmarks against one or more models and provides a report on the results.
19
19
  - **Reporter**: Formats the evaluation results into different outputs, such as a human-readable console report or a machine-readable JSON object.
20
20
 
@@ -77,20 +77,20 @@ cd examples/eval-core && pnpm dlx tsx src/json-generation.ts
77
77
 
78
78
  ## Creating a Custom Benchmark
79
79
 
80
- You can easily create your own benchmark by implementing the `LanguageModelV2Benchmark` interface. This is useful for testing model performance on tasks specific to your application.
80
+ You can easily create your own benchmark by implementing the `LanguageModelV3Benchmark` interface. This is useful for testing model performance on tasks specific to your application.
81
81
 
82
82
  **Example: A custom benchmark to test politeness.**
83
83
 
84
84
  ```typescript
85
85
  import {
86
- LanguageModelV2Benchmark,
86
+ LanguageModelV3Benchmark,
87
87
  BenchmarkResult,
88
88
  EvaluateOptions,
89
89
  } from "@ai-sdk-tool/eval";
90
90
  import { LanguageModel, generateText } from "ai";
91
91
 
92
92
  // Define the benchmark object
93
- export const politenessBenchmark: LanguageModelV2Benchmark = {
93
+ export const politenessBenchmark: LanguageModelV3Benchmark = {
94
94
  name: "politeness-check",
95
95
  version: "1.0.0",
96
96
  description: "Checks if the model's response is polite.",
@@ -197,4 +197,4 @@
197
197
  {"id": "multiple_196", "question": [[{"role": "user", "content": "What are the names of proteins found in the plasma membrane?"}]], "function": [{"name": "locate_tallest_mountains", "description": "Find the tallest mountains within a specified radius of a location.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The city from which to calculate distance."}, "radius": {"type": "float", "description": "The radius within which to find mountains, measured in kilometers."}, "amount": {"type": "integer", "description": "The number of mountains to find, listed from tallest to smallest."}}, "required": ["location", "radius", "amount"]}}, {"name": "calculate_electric_field", "description": "Calculate the electric field produced by a charge at a certain distance.", "parameters": {"type": "dict", "properties": {"charge": {"type": "float", "description": "Charge in coulombs producing the electric field."}, "distance": {"type": "float", "description": "Distance from the charge in meters where the field is being measured."}, "permitivity": {"type": "float", "description": "Permitivity of the space where field is being calculated, default is for vacuum."}}, "required": ["charge", "distance"]}}, {"name": "calculate_genotype_frequency", "description": "Calculate the frequency of homozygous dominant genotype based on the allele frequency using Hardy Weinberg Principle.", "parameters": {"type": "dict", "properties": {"allele_frequency": {"type": "float", "description": "The frequency of the dominant allele in the population."}, "genotype": {"type": "string", "description": "The genotype which frequency is needed, default is homozygous dominant. ", "enum": ["AA", "Aa", "aa"]}}, "required": ["allele_frequency", "genotype"]}}, {"name": "cellbio.get_proteins", "description": "Get the list of proteins in a specific cell compartment.", "parameters": {"type": "dict", "properties": {"cell_compartment": {"type": "string", "description": "The specific cell compartment."}, "include_description": {"type": "boolean", "description": "Set true if you want a brief description of each protein.", "default": "false"}}, "required": ["cell_compartment"]}}]}
198
198
  {"id": "multiple_197", "question": [[{"role": "user", "content": "Find the type of gene mutation based on SNP (Single Nucleotide Polymorphism) ID rs6034464."}]], "function": [{"name": "create_player_profile", "description": "Create a new player profile with character name, class and starting level.", "parameters": {"type": "dict", "properties": {"player_name": {"type": "string", "description": "The desired name of the player."}, "_class": {"type": "string", "description": "The character class for the player"}, "starting_level": {"type": "integer", "description": "The starting level for the player", "default": 1}}, "required": ["player_name", "_class"]}}, {"name": "walmart.purchase", "description": "Retrieve information of items from Walmart including stock availability.", "parameters": {"type": "dict", "properties": {"loc": {"type": "string", "description": "Location of the nearest Walmart."}, "product_list": {"type": "array", "items": {"type": "string"}, "description": "Items to be purchased listed in an array."}, "pack_size": {"type": "array", "items": {"type": "integer"}, "description": "Size of the product pack if applicable. The size of the array should be equal to product_list. Default is an empty array"}}, "required": ["loc", "product_list"]}}, {"name": "mutation_type.find", "description": "Finds the type of a genetic mutation based on its SNP (Single Nucleotide Polymorphism) ID.", "parameters": {"type": "dict", "properties": {"snp_id": {"type": "string", "description": "The ID of the Single Nucleotide Polymorphism (SNP) mutation."}, "species": {"type": "string", "description": "Species in which the SNP occurs, default is 'Homo sapiens' (Humans)."}}, "required": ["snp_id"]}}, {"name": "find_restaurants", "description": "Locate nearby restaurants based on location and food preferences.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The specific location or area."}, "food_type": {"type": "string", "description": "The type of food preferred."}, "number": {"type": "integer", "description": "Number of results to return."}, "dietary_requirements": {"type": "array", "items": {"type": "string"}, "description": "Special dietary requirements, e.g. vegan, gluten-free.", "default": "None"}}, "required": ["location", "food_type", "number"]}}]}
199
199
  {"id": "multiple_198", "question": [[{"role": "user", "content": "What is the genotype frequency of AA genotype in a population, given that allele frequency of A is 0.3?"}]], "function": [{"name": "calculate_genotype_frequency", "description": "Calculate the frequency of homozygous dominant genotype based on the allele frequency using Hardy Weinberg Principle.", "parameters": {"type": "dict", "properties": {"allele_frequency": {"type": "float", "description": "The frequency of the dominant allele in the population."}, "genotype": {"type": "string", "description": "The genotype which frequency is needed, default is homozygous dominant. ", "enum": ["AA", "Aa", "aa"]}}, "required": ["allele_frequency", "genotype"]}}, {"name": "hotel_booking", "description": "Books a hotel room for a specific date range.", "parameters": {"type": "dict", "properties": {"hotel_name": {"type": "string", "description": "The name of the hotel."}, "location": {"type": "string", "description": "The city and state, e.g. New York, NY."}, "start_date": {"type": "string", "description": "The start date of the reservation. Use format 'YYYY-MM-DD'."}, "end_date": {"type": "string", "description": "The end date of the reservation. Use format 'YYYY-MM-DD'."}, "rooms": {"type": "integer", "default": 1, "description": "The number of rooms to reserve."}}, "required": ["hotel_name", "location", "start_date", "end_date"]}}, {"name": "get_highest_scoring_player", "description": "Retrieve the highest scoring player in a specific game and season.", "parameters": {"type": "dict", "properties": {"game": {"type": "string", "description": "The game in which you want to find the highest scoring player."}, "season": {"type": "string", "description": "The season during which the high score was achieved."}, "region": {"type": "string", "description": "The geographical region in which the game is being played (Optional). Default is 'all'"}}, "required": ["game", "season"]}}, {"name": "science_history.get_invention", "description": "Retrieve the inventor and year of invention based on the invention's name.", "parameters": {"type": "dict", "properties": {"invention_name": {"type": "string", "description": "The name of the invention."}, "want_year": {"type": "boolean", "default": false, "description": "Return the year of invention if set to true."}}, "required": ["invention_name", "want_year"]}}]}
200
- {"id": "multiple_199", "question": [[{"role": "user", "content": "Predict the growth of forest in Yellowstone for the next 5 years including human impact."}]], "function": [{"name": "forest_growth_forecast", "description": "Predicts the forest growth over the next N years based on current trends.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The location where you want to predict forest growth."}, "years": {"type": "integer", "description": "The number of years for the forecast."}, "include_human_impact": {"type": "boolean", "description": "Whether or not to include the impact of human activities in the forecast. If not provided, defaults to false."}}, "required": ["location", "years"]}}, {"name": "db_fetch_records", "description": "Fetch records from a specified database table based on certain conditions.", "parameters": {"type": "dict", "properties": {"database_name": {"type": "string", "description": "The name of the database."}, "table_name": {"type": "string", "description": "The name of the table from which records need to be fetched."}, "conditions": {"type": "dict", "properties": {"department": {"type": "string", "description": "The name of the department of students."}, "school": {"type": "string", "description": "The name of the school students are enrolled in."}}, "description": "The conditions based on which records are to be fetched."}, "fetch_limit": {"type": "integer", "description": "Limits the number of records to be fetched. If left empty, it fetches all records. (Optional) Default is 0."}}, "required": ["database_name", "table_name", "conditions"]}}]}
200
+ {"id": "multiple_199", "question": [[{"role": "user", "content": "Predict the growth of forest in Yellowstone for the next 5 years including human impact."}]], "function": [{"name": "forest_growth_forecast", "description": "Predicts the forest growth over the next N years based on current trends.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The location where you want to predict forest growth."}, "years": {"type": "integer", "description": "The number of years for the forecast."}, "include_human_impact": {"type": "boolean", "description": "Whether or not to include the impact of human activities in the forecast. If not provided, defaults to false."}}, "required": ["location", "years"]}}, {"name": "db_fetch_records", "description": "Fetch records from a specified database table based on certain conditions.", "parameters": {"type": "dict", "properties": {"database_name": {"type": "string", "description": "The name of the database."}, "table_name": {"type": "string", "description": "The name of the table from which records need to be fetched."}, "conditions": {"type": "dict", "properties": {"department": {"type": "string", "description": "The name of the department of students."}, "school": {"type": "string", "description": "The name of the school students are enrolled in."}}, "description": "The conditions based on which records are to be fetched."}, "fetch_limit": {"type": "integer", "description": "Limits the number of records to be fetched. If left empty, it fetches all records. (Optional) Default is 0."}}, "required": ["database_name", "table_name", "conditions"]}}]}
@@ -197,4 +197,4 @@
197
197
  {"id": "multiple_196", "ground_truth": [{"cellbio.get_proteins": {"cell_compartment": ["plasma membrane"], "include_description": ["", false]}}]}
198
198
  {"id": "multiple_197", "ground_truth": [{"mutation_type.find": {"snp_id": ["rs6034464"], "species": ["", "Homo sapiens"]}}]}
199
199
  {"id": "multiple_198", "ground_truth": [{"calculate_genotype_frequency": {"allele_frequency": [0.3], "genotype": ["AA"]}}]}
200
- {"id": "multiple_199", "ground_truth": [{"forest_growth_forecast": {"location": ["Yellowstone", "yellowstone"], "years": [5], "include_human_impact": [true]}}]}
200
+ {"id": "multiple_199", "ground_truth": [{"forest_growth_forecast": {"location": ["Yellowstone", "yellowstone"], "years": [5], "include_human_impact": [true]}}]}
@@ -197,4 +197,4 @@
197
197
  {"id": "parallel_196", "question": [[{"role": "user", "content": "Can you please retrieve the details of two lawsuits for me? The first one has a case number of '12345' and was filed in the 'New York Supreme Court'. I would also like to know the verdict details for this case. The second lawsuit has a case number '67890' and was filed in the 'Los Angeles Superior Court'. I do not need the verdict details for this case."}]], "function": [{"name": "get_lawsuit_details", "description": "Retrieve details of a lawsuit based on its case number and court location.", "parameters": {"type": "dict", "properties": {"case_number": {"type": "string", "description": "Case number of the lawsuit."}, "court_location": {"type": "string", "description": "The location of the court where the lawsuit was filed."}, "with_verdict": {"type": "boolean", "description": "Flag to include verdict details if available. Default is False"}}, "required": ["case_number", "court_location"]}}]}
198
198
  {"id": "parallel_197", "question": [[{"role": "user", "content": "\"Can you provide me with the details of the lawsuit case with the case number '12345ABC', which was initiated in the year 2018 and filed in the New York court jurisdiction? Also, can you retrieve the same information for another lawsuit case with the case number '67890XYZ', initiated in the year 2019 and filed in the California court jurisdiction?\""}]], "function": [{"name": "lawsuit_info", "description": "Retrieves details of a lawsuit given a case number", "parameters": {"type": "dict", "properties": {"case_number": {"type": "string", "description": "The unique identifier of the lawsuit case"}, "year": {"type": "integer", "description": "The year in which the lawsuit case was initiated", "optional": true, "default": 2000}, "location": {"type": "string", "description": "The location or court jurisdiction where the case was filed.", "optional": true, "default": "New York"}}, "required": ["case_number"]}}]}
199
199
  {"id": "parallel_198", "question": [[{"role": "user", "content": "Can you use the lawsuit_search function to retrieve all lawsuits involving the entity \"Google\" from the county of \"Santa Clara\" and then do the same for the entity \"Facebook\" in the county of \"San Mateo\", both in the state of California?"}]], "function": [{"name": "lawsuit_search", "description": "Retrieve all lawsuits involving a particular entity from specified jurisdiction.", "parameters": {"type": "dict", "properties": {"entity": {"type": "string", "description": "The entity involved in lawsuits."}, "county": {"type": "string", "description": "The jurisdiction for the lawsuit search."}, "state": {"type": "string", "description": "The state for the lawsuit search. Default is California."}}, "required": ["entity", "county"]}}]}
200
- {"id": "parallel_199", "question": [[{"role": "user", "content": "What is the current temperature and humidity in New York, Los Angeles, London and Tokyo, if I want to include both temperature and humidity in the results?"}]], "function": [{"name": "get_current_weather", "description": "Retrieves the current temperature and humidity for a specific location.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The location to get the weather for."}, "include_temperature": {"type": "boolean", "description": "Whether to include the temperature in the result. Default is true."}, "include_humidity": {"type": "boolean", "description": "Whether to include the humidity in the result. Default is true."}}, "required": ["location"]}}]}
200
+ {"id": "parallel_199", "question": [[{"role": "user", "content": "What is the current temperature and humidity in New York, Los Angeles, London and Tokyo, if I want to include both temperature and humidity in the results?"}]], "function": [{"name": "get_current_weather", "description": "Retrieves the current temperature and humidity for a specific location.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The location to get the weather for."}, "include_temperature": {"type": "boolean", "description": "Whether to include the temperature in the result. Default is true."}, "include_humidity": {"type": "boolean", "description": "Whether to include the humidity in the result. Default is true."}}, "required": ["location"]}}]}
@@ -197,4 +197,4 @@
197
197
  {"id": "parallel_multiple_196", "question": [[{"role": "user", "content": "\"Could you please help me with some information? First, I would like to know the amount of calories in the 'Chicken Alfredo' recipe from the 'AllRecipes' website for dinner. Second, I am interested in the current stock prices of 'Apple', 'Microsoft', and 'Tesla'. Lastly, I want to know the FIFA ranking of the 'Brazil' men's soccer team in 2018.\""}]], "function": [{"name": "get_stock_price", "description": "Retrieves the current stock price of the specified companies", "parameters": {"type": "dict", "properties": {"company_names": {"type": "array", "items": {"type": "string"}, "description": "The list of companies for which to retrieve the stock price."}}, "required": ["company_names"]}}, {"name": "get_team_ranking", "description": "Retrieve the FIFA ranking of a specific soccer team for a certain year.", "parameters": {"type": "dict", "properties": {"team_name": {"type": "string", "description": "The name of the soccer team."}, "year": {"type": "integer", "description": "The year for which the ranking is to be retrieved."}, "gender": {"type": "string", "description": "The gender of the team. It can be either 'men' or 'women'. Default is 'men'."}}, "required": ["team_name", "year"]}}, {"name": "recipe_info.get_calories", "description": "Retrieve the amount of calories from a specific recipe in a food website.", "parameters": {"type": "dict", "properties": {"website": {"type": "string", "description": "The food website that has the recipe."}, "recipe": {"type": "string", "description": "Name of the recipe."}, "optional_meal_time": {"type": "string", "description": "Specific meal time of the day for the recipe (optional, could be 'Breakfast', 'Lunch', 'Dinner'). Default is 'Dinner'"}}, "required": ["website", "recipe"]}}]}
198
198
  {"id": "parallel_multiple_197", "question": [[{"role": "user", "content": "\"Could you help me plan a dinner party? I need to find a Vegetarian recipe that uses potatoes, carrots, and onions and serves 4 people. Also, I'm hosting this party in New York and I would like to know the detailed weather forecast for the next 12 hours, including precipitation details. Lastly, my friend is joining from Tokyo and I need to know the time difference between New York and Tokyo to schedule the party at a convenient time for both of us.\""}]], "function": [{"name": "recipe_search", "description": "Search for a recipe given dietary restriction, ingredients, and number of servings.", "parameters": {"type": "dict", "properties": {"dietary_restriction": {"type": "string", "description": "The dietary restriction, e.g., 'Vegetarian'."}, "ingredients": {"type": "array", "items": {"type": "string"}, "description": "The list of ingredients."}, "servings": {"type": "integer", "description": "The number of servings the recipe should make"}}, "required": ["dietary_restriction", "ingredients", "servings"]}}, {"name": "get_time_difference", "description": "Get the time difference between two places.", "parameters": {"type": "dict", "properties": {"place1": {"type": "string", "description": "The first place for time difference."}, "place2": {"type": "string", "description": "The second place for time difference."}}, "required": ["place1", "place2"]}}, {"name": "detailed_weather_forecast", "description": "Retrieve a detailed weather forecast for a specific location and duration including optional precipitation details.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The city that you want to get the weather for."}, "duration": {"type": "integer", "description": "Duration in hours for the detailed forecast."}, "include_precipitation": {"type": "boolean", "description": "Whether to include precipitation data in the forecast. Default is false."}}, "required": ["location", "duration"]}}]}
199
199
  {"id": "parallel_multiple_198", "question": [[{"role": "user", "content": "\"Can you first find me a vegan, main course recipe that can be prepared within 30 minutes? After that, could you please retrieve the details of the scientific discovery of Gravity using the most accepted method? Once done, I would also like to know about the discovery of the Higgs Boson particle using the same method. Lastly, could you find me a gluten-free dessert recipe that can be prepared within 45 minutes?\""}]], "function": [{"name": "find_recipe", "description": "Find a recipe based on the dietary restrictions, recipe type, and time constraints.", "parameters": {"type": "dict", "properties": {"dietary_restrictions": {"type": "string", "description": "Dietary restrictions e.g. vegan, vegetarian, gluten free, dairy free."}, "recipe_type": {"type": "string", "description": "Type of the recipe. E.g. dessert, main course, breakfast."}, "time": {"type": "integer", "description": "Time limit in minutes to prep the meal."}}, "required": ["dietary_restrictions", "recipe_type", "time"]}}, {"name": "science_history.get_discovery_details", "description": "Retrieve the details of a scientific discovery based on the discovery name.", "parameters": {"type": "dict", "properties": {"discovery": {"type": "string", "description": "The name of the discovery, e.g. Gravity"}, "method_used": {"type": "string", "description": "The method used for the discovery, default value is 'default' which gives the most accepted method."}}, "required": ["discovery"]}}]}
200
- {"id": "parallel_multiple_199", "question": [[{"role": "user", "content": "\"Can you help me with two things? First, I am currently in New York and it's 2pm here. I have a meeting scheduled with a client in London and another one in Tokyo. I need to know what time it will be in both these cities when it's 2pm in New York. Second, I am considering switching to solar energy for my home in California and I want to understand the potential greenhouse gas emissions I could save. I plan to use it for 12 months. Can you calculate the emission savings for me?\""}]], "function": [{"name": "timezone.convert", "description": "Convert time from one time zone to another.", "parameters": {"type": "dict", "properties": {"time": {"type": "string", "description": "The local time you want to convert, e.g. 3pm"}, "from_timezone": {"type": "string", "description": "The time zone you want to convert from."}, "to_timezone": {"type": "string", "description": "The time zone you want to convert to."}}, "required": ["time", "from_timezone", "to_timezone"]}}, {"name": "calculate_emission_savings", "description": "Calculate potential greenhouse gas emissions saved by switching to renewable energy sources.", "parameters": {"type": "dict", "properties": {"energy_type": {"type": "string", "description": "Type of the renewable energy source."}, "usage_duration": {"type": "integer", "description": "Usage duration in months."}, "region": {"type": "string", "description": "The region where you use energy. Default is 'global'."}}, "required": ["energy_type", "usage_duration"]}}]}
200
+ {"id": "parallel_multiple_199", "question": [[{"role": "user", "content": "\"Can you help me with two things? First, I am currently in New York and it's 2pm here. I have a meeting scheduled with a client in London and another one in Tokyo. I need to know what time it will be in both these cities when it's 2pm in New York. Second, I am considering switching to solar energy for my home in California and I want to understand the potential greenhouse gas emissions I could save. I plan to use it for 12 months. Can you calculate the emission savings for me?\""}]], "function": [{"name": "timezone.convert", "description": "Convert time from one time zone to another.", "parameters": {"type": "dict", "properties": {"time": {"type": "string", "description": "The local time you want to convert, e.g. 3pm"}, "from_timezone": {"type": "string", "description": "The time zone you want to convert from."}, "to_timezone": {"type": "string", "description": "The time zone you want to convert to."}}, "required": ["time", "from_timezone", "to_timezone"]}}, {"name": "calculate_emission_savings", "description": "Calculate potential greenhouse gas emissions saved by switching to renewable energy sources.", "parameters": {"type": "dict", "properties": {"energy_type": {"type": "string", "description": "Type of the renewable energy source."}, "usage_duration": {"type": "integer", "description": "Usage duration in months."}, "region": {"type": "string", "description": "The region where you use energy. Default is 'global'."}}, "required": ["energy_type", "usage_duration"]}}]}
@@ -197,4 +197,4 @@
197
197
  {"id": "parallel_multiple_196", "ground_truth": [{"recipe_info.get_calories": {"website": ["AllRecipes"], "recipe": ["Chicken Alfredo"], "optional_meal_time": ["Dinner", ""]}}, {"get_stock_price": {"company_names": [["Apple", "Microsoft", "Tesla"]]}}, {"get_team_ranking": {"team_name": ["Brazil"], "year": [2018], "gender": ["men", ""]}}]}
198
198
  {"id": "parallel_multiple_197", "ground_truth": [{"recipe_search": {"dietary_restriction": ["Vegetarian"], "ingredients": [["potatoes", "carrots", "onions"]], "servings": [4]}}, {"detailed_weather_forecast": {"location": ["New York", "NY"], "duration": [12], "include_precipitation": [true]}}, {"get_time_difference": {"place1": ["New York", "NY"], "place2": ["Tokyo"]}}]}
199
199
  {"id": "parallel_multiple_198", "ground_truth": [{"find_recipe": {"dietary_restrictions": ["vegan"], "recipe_type": ["main course"], "time": [30]}}, {"science_history.get_discovery_details": {"discovery": ["Gravity"], "method_used": ["default", ""]}}, {"science_history.get_discovery_details": {"discovery": ["Higgs Boson", "Higgs Boson particle"], "method_used": ["default", ""]}}, {"find_recipe": {"dietary_restrictions": ["gluten free"], "recipe_type": ["dessert"], "time": [45]}}]}
200
- {"id": "parallel_multiple_199", "ground_truth": [{"timezone.convert": {"time": ["2pm"], "from_timezone": ["New York", "NY", "America/New_York"], "to_timezone": ["London", "Europe/London"]}}, {"timezone.convert": {"time": ["2pm"], "from_timezone": ["New York", "NY", "America/New_York"], "to_timezone": ["Tokyo", "Asia/Tokyo"]}}, {"calculate_emission_savings": {"energy_type": ["solar"], "usage_duration": [12], "region": ["California", "CA"]}}]}
200
+ {"id": "parallel_multiple_199", "ground_truth": [{"timezone.convert": {"time": ["2pm"], "from_timezone": ["New York", "NY", "America/New_York"], "to_timezone": ["London", "Europe/London"]}}, {"timezone.convert": {"time": ["2pm"], "from_timezone": ["New York", "NY", "America/New_York"], "to_timezone": ["Tokyo", "Asia/Tokyo"]}}, {"calculate_emission_savings": {"energy_type": ["solar"], "usage_duration": [12], "region": ["California", "CA"]}}]}
@@ -197,4 +197,4 @@
197
197
  {"id": "parallel_196", "ground_truth": [{"get_lawsuit_details": {"case_number": ["12345"], "court_location": ["New York Supreme Court", "NY Supreme Court"], "with_verdict": [true]}}, {"get_lawsuit_details": {"case_number": ["67890"], "court_location": ["Los Angeles Superior Court", "LA Superior Court"], "with_verdict": [false, ""]}}]}
198
198
  {"id": "parallel_197", "ground_truth": [{"lawsuit_info": {"case_number": ["12345ABC"], "year": [2018], "location": ["New York", "New York, NY", "NY", ""]}}, {"lawsuit_info": {"case_number": ["67890XYZ"], "year": [2019], "location": ["California", "CA"]}}]}
199
199
  {"id": "parallel_198", "ground_truth": [{"lawsuit_search": {"entity": ["Google"], "county": ["Santa Clara"], "state": ["California", "CA", ""]}}, {"lawsuit_search": {"entity": ["Facebook"], "county": ["San Mateo"], "state": ["California", "CA", ""]}}]}
200
- {"id": "parallel_199", "ground_truth": [{"get_current_weather": {"location": ["New York", "New York, NY", "New York City", "NYC"], "include_temperature": [true, ""], "include_humidity": [true, ""]}}, {"get_current_weather": {"location": ["Los Angeles", "Los Angeles, CA", "LA"], "include_temperature": [true, ""], "include_humidity": [true, ""]}}, {"get_current_weather": {"location": ["London"], "include_temperature": [true, ""], "include_humidity": [true, ""]}}, {"get_current_weather": {"location": ["Tokyo"], "include_temperature": [true, ""], "include_humidity": [true, ""]}}]}
200
+ {"id": "parallel_199", "ground_truth": [{"get_current_weather": {"location": ["New York", "New York, NY", "New York City", "NYC"], "include_temperature": [true, ""], "include_humidity": [true, ""]}}, {"get_current_weather": {"location": ["Los Angeles", "Los Angeles, CA", "LA"], "include_temperature": [true, ""], "include_humidity": [true, ""]}}, {"get_current_weather": {"location": ["London"], "include_temperature": [true, ""], "include_humidity": [true, ""]}}, {"get_current_weather": {"location": ["Tokyo"], "include_temperature": [true, ""], "include_humidity": [true, ""]}}]}
@@ -397,4 +397,4 @@
397
397
  {"id": "simple_396", "question": [[{"role": "user", "content": "Find a hospital within 5 km radius around Denver, Colorado with pediatrics department."}]], "function": [{"name": "hospital.locate", "description": "Locate nearby hospitals based on location and radius. Options to include specific departments are available.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The city and state, e.g. Denver, CO"}, "radius": {"type": "integer", "description": "The radius within which you want to find the hospital in kms."}, "department": {"type": "string", "description": "Specific department within the hospital. Default is 'General Medicine'.", "enum": ["General Medicine", "Emergency", "Pediatrics", "Cardiology", "Orthopedics"]}}, "required": ["location", "radius"]}}]}
398
398
  {"id": "simple_397", "question": [[{"role": "user", "content": "Find the distance between New York and Boston, accounting for terrain."}]], "function": [{"name": "distance_calculator.calculate", "description": "Calculate the distance between two locations, considering terrain.", "parameters": {"type": "dict", "properties": {"origin": {"type": "string", "description": "Starting location of the distance measurement."}, "destination": {"type": "string", "description": "Destination location of the distance measurement."}, "consider_terrain": {"type": "boolean", "description": "Whether to account for terrain in distance calculation, defaults to false."}}, "required": ["origin", "destination"]}}]}
399
399
  {"id": "simple_398", "question": [[{"role": "user", "content": "What are the opening hours of the Metropolitan Museum of Art on Saturday?"}]], "function": [{"name": "get_museum_hours", "description": "Retrieve opening hours of a specified museum for the specified day.", "parameters": {"type": "dict", "properties": {"museum_name": {"type": "string", "description": "The name of the museum."}, "day": {"type": "string", "description": "Day of the week.", "enum": ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]}}, "required": ["museum_name", "day"]}}]}
400
- {"id": "simple_399", "question": [[{"role": "user", "content": "Find me the best Italian restaurants in New York City with average customer ratings of more than 4 and accepts credit cards."}]], "function": [{"name": "restaurant_search", "description": "Locates top rated restaurants based on specific criteria such as type of cuisine, ratings, and facilities.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The city and state, e.g. New York City, NY"}, "cuisine": {"type": "string", "description": "Preferred type of cuisine e.g., Italian, Indian, American, etc."}, "rating": {"type": "integer", "description": "Minimum average customer rating out of 5"}, "accepts_credit_cards": {"type": "boolean", "description": "If the restaurant should accept credit cards."}}, "required": ["location", "cuisine", "rating", "accepts_credit_cards"]}}]}
400
+ {"id": "simple_399", "question": [[{"role": "user", "content": "Find me the best Italian restaurants in New York City with average customer ratings of more than 4 and accepts credit cards."}]], "function": [{"name": "restaurant_search", "description": "Locates top rated restaurants based on specific criteria such as type of cuisine, ratings, and facilities.", "parameters": {"type": "dict", "properties": {"location": {"type": "string", "description": "The city and state, e.g. New York City, NY"}, "cuisine": {"type": "string", "description": "Preferred type of cuisine e.g., Italian, Indian, American, etc."}, "rating": {"type": "integer", "description": "Minimum average customer rating out of 5"}, "accepts_credit_cards": {"type": "boolean", "description": "If the restaurant should accept credit cards."}}, "required": ["location", "cuisine", "rating", "accepts_credit_cards"]}}]}
@@ -397,4 +397,4 @@
397
397
  {"id": "simple_396", "ground_truth": [{"hospital.locate": {"location": ["Denver, Colorado", "Denver, CO"], "radius": [5], "department": ["Pediatrics"]}}]}
398
398
  {"id": "simple_397", "ground_truth": [{"distance_calculator.calculate": {"origin": ["New York", "New York City", "New York City, NY", "New York, NY", "NYC"], "destination": ["Boston"], "consider_terrain": [true]}}]}
399
399
  {"id": "simple_398", "ground_truth": [{"get_museum_hours": {"museum_name": ["Metropolitan Museum of Art", "The Met"], "day": ["Saturday"]}}]}
400
- {"id": "simple_399", "ground_truth": [{"restaurant_search": {"location": ["New York City", "New York City, NY", "NYC"], "cuisine": ["Italian"], "rating": [4], "accepts_credit_cards": [true]}}]}
400
+ {"id": "simple_399", "ground_truth": [{"restaurant_search": {"location": ["New York City", "New York City, NY", "NYC"], "cuisine": ["Italian"], "rating": [4], "accepts_credit_cards": [true]}}]}