PyPI - python-flexeval - Versions diffs - 0.1.5__py3-none-any.whl - Mend

python-flexeval 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

flexeval/__init__.py +11 -0
flexeval/__main__.py +11 -0
flexeval/classes/__init__.py +15 -0
flexeval/classes/base.py +32 -0
flexeval/classes/dataset.py +82 -0
flexeval/classes/eval_runner.py +158 -0
flexeval/classes/eval_set_run.py +32 -0
flexeval/classes/message.py +183 -0
flexeval/classes/metric.py +55 -0
flexeval/classes/thread.py +79 -0
flexeval/classes/tool_call.py +51 -0
flexeval/classes/turn.py +206 -0
flexeval/cli.py +104 -0
flexeval/completions.py +147 -0
flexeval/compute_metrics.py +788 -0
flexeval/config.yaml +23 -0
flexeval/configuration/__init__.py +1 -0
flexeval/configuration/completion_functions.py +231 -0
flexeval/configuration/evals.yaml +864 -0
flexeval/configuration/function_metrics.py +650 -0
flexeval/configuration/rubric_metrics.yaml +194 -0
flexeval/data_loader.py +513 -0
flexeval/db_utils.py +38 -0
flexeval/dependency_graph.py +234 -0
flexeval/eval_schema.json +256 -0
flexeval/function_types.py +173 -0
flexeval/helpers.py +52 -0
flexeval/io/__init__.py +1 -0
flexeval/io/parsers/yaml_parser.py +69 -0
flexeval/log_utils.py +34 -0
flexeval/metrics/__init__.py +8 -0
flexeval/metrics/access.py +28 -0
flexeval/metrics/save.py +39 -0
flexeval/rubric.py +62 -0
flexeval/run_utils.py +65 -0
flexeval/runner.py +132 -0
flexeval/schema/__init__.py +11 -0
flexeval/schema/config_schema.py +46 -0
flexeval/schema/eval_schema.py +163 -0
flexeval/schema/evalrun_schema.py +97 -0
flexeval/schema/rubric_schema.py +40 -0
flexeval/schema/schema_utils.py +26 -0
python_flexeval-0.1.5.dist-info/METADATA +118 -0
python_flexeval-0.1.5.dist-info/RECORD +47 -0
python_flexeval-0.1.5.dist-info/WHEEL +4 -0
python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0

flexeval/data_loader.py ADDED Viewed

@@ -0,0 +1,513 @@
+"""Dataset loading functions. Maybe should move to :mod:`~flexeval.io`."""
+import json
+import logging
+import pathlib
+import random as rd
+import sqlite3
+from langchain.load.dump import dumps
+from langgraph.checkpoint.serde.jsonplus import JsonPlusSerializer
+from flexeval.classes.dataset import Dataset
+from flexeval.classes.message import Message
+from flexeval.classes.thread import Thread
+from flexeval.classes.tool_call import ToolCall
+from flexeval.classes.turn import Turn
+logger = logging.getLogger(__name__)
+def load_jsonl(
+    dataset: Dataset,
+    filename: str | pathlib.Path,
+    max_n_conversation_threads: int | None = None,
+    nb_evaluations_per_thread: int | None = 1,
+):
+    with open(filename, "r") as infile:
+        contents = infile.read()  # will be a big string
+        all_lines = contents.splitlines()
+        # Each row is a single row of the jsonl file
+        # That means it has 'input' as a key, and a list of dictionaries as values
+        # per line
+        if max_n_conversation_threads is None:
+            max_n_conversation_threads = len(all_lines)
+        if max_n_conversation_threads <= len(all_lines):
+            selected_thread_ids = rd.sample(
+                list(range(len(all_lines))), max_n_conversation_threads
+            )
+        else:
+            logger.debug(
+                f"You requested up to '{max_n_conversation_threads}' conversations but only '{len(all_lines)}' are present in Jsonl dataset at '{filename}'."
+            )
+            selected_thread_ids = list(range(len(all_lines)))
+        ### should duplicate the select threads nb_evaluations_per_thread times
+        if nb_evaluations_per_thread is None:
+            nb_evaluations_per_thread = 1
+        for thread_id, thread in enumerate(all_lines):
+            for thread_eval_run_id in range(
+                max(1, nb_evaluations_per_thread)
+            ):  # duplicate stored threads for averaged evaluation results
+                if thread_id in selected_thread_ids:
+                    thread_object = Thread.create(
+                        evalsetrun=dataset.evalsetrun,
+                        dataset=dataset,
+                        jsonl_thread_id=thread_id,
+                        eval_run_thread_id=str(thread_id)
+                        + "_"
+                        + str(thread_eval_run_id),
+                    )
+                    # Context
+                    context = []
+                    thread_input = json.loads(thread)["input"]
+                    # Get system prompt used in the thread - assuming only 1
+                    for message in thread_input:
+                        if message["role"] == "system":
+                            system_prompt = message["content"]
+                            break
+                    else:
+                        system_prompt = None
+                    if system_prompt is not None:
+                        # Add the system prompt as context
+                        context.append({"role": "system", "content": system_prompt})
+                    # Create messages
+                    index_in_thread = 0
+                    for message in thread_input:
+                        role = message.get("role", None)
+                        if role != "system":
+                            # System message shouldn't be added as a separate message
+                            system_prompt_for_this_message = ""
+                            if role != "user":
+                                system_prompt_for_this_message = system_prompt
+                            Message.create(
+                                evalsetrun=dataset.evalsetrun,
+                                dataset=dataset,
+                                thread=thread_object,
+                                index_in_thread=index_in_thread,
+                                role=role,
+                                content=message.get("content", None),
+                                context=json.dumps(context),
+                                metadata=message.get("metadata", None),
+                                is_flexeval_completion=False,
+                                system_prompt=system_prompt_for_this_message,
+                            )
+                            # Update context
+                            context.append(
+                                {"role": role, "content": message.get("content", None)}
+                            )
+                            index_in_thread += 1
+                    add_turns(thread_object)
+    # TODO - should we add ToolCall here? Is there a standard way to represent them in jsonl?
+def load_langgraph_sqlite(
+    dataset: Dataset,
+    filename: str,
+    max_n_conversation_threads: int | None = None,
+    nb_evaluations_per_thread: int | None = 1,
+):
+    serializer = JsonPlusSerializer()
+    with sqlite3.connect(filename) as conn:
+        # Set the row factory to sqlite3.Row
+        # allowing us to reference columns by name instead of index
+        conn.row_factory = sqlite3.Row
+        # Create a cursor object
+        cursor = conn.cursor()
+        verify_checkpoints_table_exists(cursor)
+        # Sync database
+        query = "PRAGMA wal_checkpoint(FULL);"
+        cursor.execute(query)
+        # Make threads (aka conversations)
+        query = "select distinct thread_id from checkpoints"
+        cursor.execute(query)
+        thread_ids = cursor.fetchall()
+        nb_threads = len(thread_ids)
+        if max_n_conversation_threads is None:
+            max_n_conversation_threads = nb_threads
+        if max_n_conversation_threads <= nb_threads:
+            selected_thread_ids = rd.sample(thread_ids, max_n_conversation_threads)
+        else:
+            logger.debug(
+                f"You requested up to '{max_n_conversation_threads}' conversations but only '{nb_threads}' are present in Sqlite dataset at '{filename}'."
+            )
+            selected_thread_ids = thread_ids
+        logger.debug(" DEBUG DUPLICATE SELECT THREAD IDS\n", selected_thread_ids[0])
+        for thread_eval_run_id in range(
+            max(1, nb_evaluations_per_thread)
+        ):  # duplicate stored threads for averaged evaluation results
+            for thread_id in selected_thread_ids:
+                thread = Thread.create(
+                    evalsetrun=dataset.evalsetrun,
+                    dataset=dataset,
+                    langgraph_thread_id=thread_id[0],
+                    eval_run_thread_id=str(thread_id[0])
+                    + "_"
+                    + str(thread_eval_run_id),
+                )
+                # Create messages
+                query = f"select * from checkpoints where thread_id = '{thread.langgraph_thread_id}'"
+                cursor.execute(query)
+                completion_list = cursor.fetchall()
+                # context has to be reset at the start of every thread
+                context = []
+                # tool call variables
+                tool_calls_dict = {}
+                tool_responses_dict = {}
+                tool_addional_kwargs_dict = {}
+                # system prompt reset for every thread
+                system_prompt = None
+                for completion_row in completion_list:
+                    # checkpoint is full state history
+                    checkpoint = serializer.loads_typed(
+                        (completion_row["type"], completion_row["checkpoint"])
+                    )
+                    # metadata is the state update for that row
+                    metadata = json.loads(completion_row["metadata"])
+                    # IDs from langgraph
+                    if metadata.get("writes") is None:
+                        continue
+                    else:
+                        # Goal here is to create a data structure for EACH write/update
+                        # that can be used to construct a Message object
+                        # LangGraph stores info in 'writes' in the checkpoints.metadata column
+                        # but the format is a bit different between human and machine input
+                        # The resulting data structure should have
+                        # key (str) -- graph 'node' that produced the message (or 'human')
+                        # value (list) -- list of 'message' data structures with id, kwargs, etc
+                        # {
+                        #   'node_name':{
+                        #      "messages":[
+                        #          {
+                        #             'id': "XYZ"
+                        #             'kwargs':{
+                        #                 "content": 'text of the message',
+                        #                 "additional_kwargs": {}
+                        #           },
+                        #        }
+                        #       ]
+                        #
+                        #   }
+                        # }
+                        # user input condition
+                        if metadata.get("source") == "input":
+                            # NOTE: I think with the updated logging of HumanMessage with langgraph, we don't need this case
+                            update_dict = {}
+                            # this will be a dictionary we can add to
+                            # key is 'input', as in human input
+                            update_dict["input"] = {"messages": []}
+                            # print("metadata keys:", metadata["writes"].keys())
+                            # the very first message in input in a thread seems to include
+                            # the system prompt, not a message that was sent by the user.
+                            # the system promptdoesn't seem to be set anywhere else, so
+                            # using that as the system prompt for the thread.
+                            messagecount = 0
+                            for msg in metadata["writes"]["__start__"]["messages"]:
+                                if messagecount == 0 and metadata["step"] == -1:
+                                    system_prompt = msg["kwargs"]["content"]
+                                    messagecount += 1
+                                else:
+                                    message = {}
+                                    message["id"] = [
+                                        "HumanMessage"
+                                    ]  # LangGraph has a list here
+                                    message["kwargs"] = {}
+                                    message["kwargs"]["content"] = msg
+                                    message["kwargs"]["type"] = "human"
+                                    update_dict["input"]["messages"].append(message)
+                            # will be used below
+                            role = "user"
+                        # machine input condition
+                        elif metadata.get("source") == "loop":
+                            # This already has a list of messages with kwargs, etc
+                            update_dict = metadata.get("writes")
+                            # I think 'system_prompt' is empty by default and not stored here unless
+                            # it's included in the LangGraph state
+                            checkpoint_system_prompt = checkpoint.get(
+                                "channel_values", {}
+                            ).get("system_prompt")
+                            if checkpoint_system_prompt is not None:
+                                system_prompt = checkpoint_system_prompt
+                            role = "assistant"
+                        else:
+                            raise Exception(
+                                f"Unhandled input condition! Source not 'loop' or 'input'. Metadata: {metadata}"
+                            )
+                        # Add system prompt as first thing in context if not already present
+                        if len(context) == 0:
+                            context.append({"role": "system", "content": system_prompt})
+                        # iterate through nodes - there is probably only 1
+                        for node, value in update_dict.items():
+                            # iterate through list of message updates
+                            if "messages" in value:
+                                if isinstance(value["messages"], dict):
+                                    # Make this a list to iterate through - 4 Feb 2025 - used to be a list previously
+                                    messagelist = [value["messages"]]
+                                else:
+                                    messagelist = value["messages"]
+                                index_in_thread = 0
+                                for message in messagelist:
+                                    if role == "user":
+                                        content = (
+                                            message.get("kwargs", {})
+                                            .get("content", {})
+                                            .get("kwargs", {})
+                                            .get("content", None)
+                                        )
+                                    elif role == "assistant":
+                                        content = message.get("kwargs", {}).get(
+                                            "content", None
+                                        )
+                                    else:
+                                        raise Exception(
+                                            "`role` should be either user or assistant."
+                                        )
+                                    Message.create(
+                                        evalsetrun=dataset.evalsetrun,
+                                        dataset=dataset,
+                                        thread=thread,
+                                        index_in_thread=index_in_thread,
+                                        role=role,
+                                        content=content,
+                                        context=json.dumps(context),
+                                        is_flexeval_completion=False,
+                                        system_prompt=system_prompt,
+                                        # language model stats
+                                        tool_calls=json.dumps(
+                                            message.get("kwargs", {}).get(
+                                                "tool_calls", []
+                                            )
+                                        ),
+                                        tool_call_ids=[
+                                            tc["id"]
+                                            for tc in message.get("kwargs", {}).get(
+                                                "tool_calls", []
+                                            )
+                                        ],
+                                        n_tool_calls=len(
+                                            message.get("kwargs", {}).get(
+                                                "tool_calls", []
+                                            )
+                                        ),
+                                        prompt_tokens=message.get("kwargs", {})
+                                        .get("response_metadata", {})
+                                        .get("token_usage", {})
+                                        .get("prompt_tokens"),
+                                        completion_tokens=message.get("kwargs", {})
+                                        .get("response_metadata", {})
+                                        .get("token_usage", {})
+                                        .get("completion_tokens"),
+                                        model_name=message.get("kwargs", {})
+                                        .get("response_metadata", {})
+                                        .get("model_name"),
+                                        # langgraph metadata
+                                        langgraph_ts=checkpoint.get("ts"),
+                                        langgraph_step=metadata.get("step"),
+                                        langgraph_thread_id=completion_row["thread_id"],
+                                        langgraph_checkpoint_id=completion_row[
+                                            "checkpoint_id"
+                                        ],
+                                        langgraph_parent_checkpoint_id=completion_row[
+                                            "parent_checkpoint_id"
+                                        ],
+                                        langgraph_checkpoint=dumps(
+                                            checkpoint
+                                        ),  # Have to re-dump this because of the de-serialization#completion_row["checkpoint"],
+                                        langgraph_metadata=completion_row["metadata"],
+                                        langgraph_node=node,
+                                        langgraph_message_type=message["id"][-1],
+                                        langgraph_type=message.get("kwargs", {}).get(
+                                            "type"
+                                        ),
+                                        # special property of state
+                                        langchain_print=message.get("kwargs", {})
+                                        .get("additional_kwargs", {})
+                                        .get("print", False),
+                                    )
+                                    # update the context for the next Message
+                                    context.append(
+                                        {
+                                            "role": role,
+                                            "content": content,
+                                            "langgraph_role": message["id"][-1],
+                                        }
+                                    )
+                                    # record tool call info so we can match them up later
+                                    if message.get("kwargs", {}).get("type") == "tool":
+                                        # this should have a mapping between tool_call_id and the RESPONSE to to the tool call
+                                        tool_responses_dict[
+                                            message.get("kwargs", {}).get(
+                                                "tool_call_id"
+                                            )
+                                        ] = message.get("kwargs", {}).get("content", "")
+                                    else:
+                                        for tool_call in message.get("kwargs", {}).get(
+                                            "tool_calls", []
+                                        ):
+                                            # this should have all the info about the tool calls, including additional_kwargs
+                                            # but NOT their responses
+                                            tool_calls_dict[tool_call["id"]] = tool_call
+                                            tool_addional_kwargs_dict[
+                                                tool_call["id"]
+                                            ] = message.get("kwargs", {}).get(
+                                                "additional_kwargs", {}
+                                            )
+                                    index_in_thread += 1
+                # Add turns to each message
+                # Need to do this before dealing with tool calls, since we
+                # associated turns with tool calls via messages during the .create() method
+                add_turns(thread)
+                ## Match up tool calls and make an object for each match
+                for tool_call_id, tool_call_vals in tool_calls_dict.items():
+                    if tool_call_id not in tool_responses_dict:
+                        raise ValueError(
+                            f"Found a tool call without a tool response! id='{tool_call_id}'"
+                        )
+                    # get matching message - should now be accessible through thread now?
+                    matching_message = [
+                        m for m in thread.messages if tool_call_id in m.tool_call_ids
+                    ][0]
+                    ToolCall.create(
+                        evalsetrun=dataset.evalsetrun,
+                        dataset=dataset,
+                        thread=thread,
+                        turn=matching_message.turn,
+                        message=matching_message,
+                        function_name=tool_call_vals.get("name"),
+                        args=json.dumps(tool_call_vals.get("args")),
+                        additional_kwargs=json.dumps(
+                            tool_addional_kwargs_dict.get(tool_call_id)
+                        ),
+                        tool_call_id=tool_call_id,
+                        response_content=tool_responses_dict.get(tool_call_id),
+                    )
+                ## Add system prompt if available?
+def add_turns(thread: Thread):
+    # Add turn labels
+    # Step 1 - add placeholder_turn_id to each message
+    message_roles = []
+    for message in thread.messages:
+        message_roles.append({"id": message.id, "role": message.role})
+    message_placeholder_ids, turn_dict = get_turns(thread=thread)
+    # Step 2 - Create turns, plus a mapping between the placeholder ids and the created ids
+    turns = {}
+    index_in_thread = 0
+    for placeholder_turn_id, role in turn_dict.items():  # turns.items():
+        t = Turn.create(
+            evalsetrun=thread.evalsetrun,
+            dataset=thread.dataset,
+            thread=thread,
+            index_in_thread=index_in_thread,
+            role=role,
+        )
+        # map placeholder id to turn object
+        turns[placeholder_turn_id] = t
+        index_in_thread += 1
+    # Step 3 - add placeholder ids to messages
+    # Can use zip since entries in message_list correspond to thread.messages
+    # NOTE: ANR: I don't follow how the message_list was supposed to work below.
+    for ml, message in zip(message_placeholder_ids, thread.messages):
+        # Is this going to work? No idea
+        message.turn = turns[ml]
+        # message.is_final_turn_in_input = ml.get("is_final_turn_in_input", False)
+        message.save()
+def verify_checkpoints_table_exists(cursor):
+    # double check that the 'checkpoints' table exists
+    cursor.execute(
+        """
+        SELECT name FROM sqlite_master
+        WHERE type='table' AND name='checkpoints'
+        """
+    )
+    result = cursor.fetchone()
+    # Assert that the result is not None, meaning the table exists
+    assert result is not None, "Table 'checkpoints' does not exist in the database."
+def get_turns(thread: Thread):
+    """We're defining a turn as a list of 1 or more consequtive outputs
+    by the same role, where the role is either 'user', or 'assistant/tool'.
+    In other words, we would parse as follows:
+    TURN 1 - user
+    TURN 2 - assistant
+    TURN 3 - user
+    TURN 4 - assistant
+    TURN 4 - tool
+    TURN 4 - assistant
+    TURN 5 - user
+    """
+    # these are all treated as belonging to the same 'turn'
+    machine_labels = ["assistant", "ai", "tool"]
+    turn_id = 1
+    previous_role = ""
+    # TODO: Make a message list here, store the placeholder ids, and update to the real turn ids; save at end
+    message_placeholder_ids = []
+    for turnentry_id, entry in enumerate(thread.messages):  # enumerate(input_list):
+        current_role = entry.role  # entry.get("role", None)
+        # entry["role"] = current_role
+        # if your role matches a previous, don't increment turn_id
+        if (current_role in machine_labels and previous_role in machine_labels) or (
+            current_role not in machine_labels and previous_role not in machine_labels
+        ):
+            pass  # TODO: clean up the condition to avoid the empty if
+            # previous_role = current_role
+            # entry["placholder_turn_id"] = turn_id
+        else:
+            turn_id += 1
+            # entry["placholder_turn_id"] = turn_id
+            # previous_role = current_role
+        # entry.turn_id = turn_id
+        message_placeholder_ids.append(turn_id)
+        previous_role = current_role
+        # entry.save()
+    # NOTE: ANR seems like this could be optimized - e.g., set all
+    # to false, then do a select query for just the ones where turn_id column is turn_id. That would also
+    # reduce the number of saves to the database.
+    # label final entry
+    # ANR: moved up the turn_id_roles bit here to avoid iterating twice
+    turn_id_roles = {}
+    for message_placehold_id, entry in zip(
+        message_placeholder_ids, thread.messages
+    ):  # input_list:
+        turn_id_roles[message_placehold_id] = entry.role
+        entry.is_final_turn_in_input = message_placehold_id == turn_id
+        entry.save()  # Could optimize this to avoid saving twice
+    return message_placeholder_ids, turn_id_roles

flexeval/db_utils.py ADDED Viewed

@@ -0,0 +1,38 @@
+"""Peewee database utilities."""
+import peewee as pw
+from flexeval.classes import base as classes_base
+from flexeval.classes.dataset import Dataset
+from flexeval.classes.eval_set_run import EvalSetRun
+from flexeval.classes.message import Message
+from flexeval.classes.metric import Metric
+from flexeval.classes.thread import Thread
+from flexeval.classes.tool_call import ToolCall
+from flexeval.classes.turn import Turn
+DATABASE_TABLES = [EvalSetRun, Dataset, Thread, Turn, Message, ToolCall, Metric]
+def initialize_database(database_path: str, clear_tables: bool = False):
+    classes_base.database.init(database_path)
+    # classes_base.database.start()
+    if clear_tables:
+        classes_base.database.drop_tables(DATABASE_TABLES)
+    classes_base.database.create_tables(DATABASE_TABLES)
+def bind_to_database(database_path: str) -> pw.Database:
+    """Utility function for binding to a FlexEval database so that ORM functionality can be used.
+    See: https://docs.peewee-orm.com/en/latest/peewee/database.html#setting-the-database-at-run-time
+    Returns:
+        pw.Database: The new database created for the models to bind to.
+    """
+    new_database = classes_base.create_sqlite_database(database_path)
+    new_database.bind(DATABASE_TABLES)
+    # Verify the binding worked by checking one of the models
+    assert classes_base.BaseModel._meta.database == new_database
+    return new_database