python-flexeval 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/PKG-INFO +1 -1
- python_flexeval-0.3.0/src/flexeval/__about__.py +1 -0
- python_flexeval-0.3.0/src/flexeval/classes/jsonview.py +107 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/message.py +5 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/thread.py +4 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/data_loader.py +26 -11
- python_flexeval-0.3.0/tests/data/simple_metadata.jsonl +2 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_data_loader.py +33 -0
- python_flexeval-0.2.0/src/flexeval/__about__.py +0 -1
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/.env-example +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/.github/dependabot.yml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/.github/workflows/deploy-to-pypi.yml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/.github/workflows/github-pages.yml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/.github/workflows/validate.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/.gitignore +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/.pre-commit-config.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/.python-version +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/.vscode/settings.json +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/CITATION.bib +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/CITATION.cff +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/CLAUDE.md +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/DEVELOPMENT.md +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/Dockerfile +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/EDM_2024_FlexEval.pdf +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/LICENSE +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/Makefile +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/README.md +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/data/metabase/.gitkeep +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docker-compose.yml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/_static/flexeval_banner.svg +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/_static/flexeval_favicon.svg +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/_static/flexeval_logo.png +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/_static/flexeval_logo2.png +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/_templates/footer.html +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/api.rst +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/conf.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/getting_started.rst +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/index.rst +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/sphinxext/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/sphinxext/github.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/user_guide/abstractions.rst +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/user_guide/cli.rst +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/user_guide/index.rst +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/user_guide/logging.rst +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/user_guide/motivation.md +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/user_guide/rubric_guide.md +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/vignettes.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/docs/vignettes.rst +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/example_project/example_specific_rubrics.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/logs/.gitkeep +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/make.bat +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/pyproject.toml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/ruff.toml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/__main__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/base.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/dataset.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/eval_runner.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/eval_set_run.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/metric.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/tool_call.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/classes/turn.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/cli.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/completions.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/compute_metrics.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/config.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/configuration/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/configuration/completion_functions.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/configuration/evals.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/configuration/function_metrics.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/configuration/rubric_metrics.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/db_utils.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/dependency_graph.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/eval_schema.json +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/function_types.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/helpers.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/io/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/io/parsers/yaml_parser.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/log_utils.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/metrics/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/metrics/access.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/metrics/save.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/rubric.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/run_utils.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/runner.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/schema/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/schema/config_schema.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/schema/eval_schema.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/schema/evalrun_schema.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/schema/rubric_schema.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/schema/schema_utils.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/metabase/Dockerfile +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/data/multiturn.jsonl +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/data/plot-convos.jsonl +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/data/simple.jsonl +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/data/simple_nosystem.jsonl +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/integration/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/integration/config-tests.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/integration/data/multiturn.jsonl +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/integration/data/plot-convos.jsonl +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/integration/data/simple.jsonl +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/integration/evals.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/integration/functional_tests.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/integration/langgraph_data.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/resources/function_metric.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/resources/functional_config.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/resources/functional_evals.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/resources/test_config.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/resources/test_dataset.jsonl +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/resources/test_evals.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/resources/test_rubric_metrics.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/resources/unittest.env +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/__init__.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/io/test_yaml_parser.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/mixins.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_completions.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_compute_metrics.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_db_utils.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_dependency_graph.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_eval_runner.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_function_metrics.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_function_types.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_functional.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_rubric.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/tests/unit/test_schema.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/uv.lock +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/vignettes/.gitignore +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/vignettes/basic.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/vignettes/basic_cli.md +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/vignettes/basic_rubric.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/vignettes/conversations.jsonl +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/vignettes/custom_functions.py +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/vignettes/custom_rubric.md +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/vignettes/custom_rubrics.yaml +0 -0
- {python_flexeval-0.2.0 → python_flexeval-0.3.0}/vignettes/eval_run.yaml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: python-flexeval
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
|
|
5
5
|
Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
|
|
6
6
|
Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.3.0"
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import json
|
|
2
|
+
from collections import UserDict
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class JsonViewDict(UserDict):
|
|
6
|
+
"""Dictionary that syncs changes back to the model field."""
|
|
7
|
+
|
|
8
|
+
def __init__(
|
|
9
|
+
self,
|
|
10
|
+
model_instance,
|
|
11
|
+
text_field_attr_name,
|
|
12
|
+
json_dumps_fn=json.dumps,
|
|
13
|
+
json_loads_fn=json.loads,
|
|
14
|
+
):
|
|
15
|
+
self.model_instance = model_instance
|
|
16
|
+
self.text_field_attr_name = text_field_attr_name
|
|
17
|
+
self.json_dumps_fn = json_dumps_fn
|
|
18
|
+
self.json_loads_fn = json_loads_fn
|
|
19
|
+
|
|
20
|
+
text_value = getattr(model_instance, text_field_attr_name)
|
|
21
|
+
initial_data = self.json_loads_fn(text_value)
|
|
22
|
+
super().__init__(initial_data)
|
|
23
|
+
|
|
24
|
+
def _sync_to_model(self):
|
|
25
|
+
"""Sync the current data back to the model field."""
|
|
26
|
+
json_str = self.json_dumps_fn(self.data)  # serialize the dict to JSON text (dumps, not loads) before writing it back to the model field
|
|
27
|
+
setattr(self.model_instance, self.text_field_attr_name, json_str)
|
|
28
|
+
|
|
29
|
+
# Override mutating methods to trigger sync
|
|
30
|
+
def __setitem__(self, key, value):
|
|
31
|
+
super().__setitem__(key, value)
|
|
32
|
+
self._sync_to_model()
|
|
33
|
+
|
|
34
|
+
def __delitem__(self, key):
|
|
35
|
+
super().__delitem__(key)
|
|
36
|
+
self._sync_to_model()
|
|
37
|
+
|
|
38
|
+
def clear(self):
|
|
39
|
+
super().clear()
|
|
40
|
+
self._sync_to_model()
|
|
41
|
+
|
|
42
|
+
def pop(self, key, *args):
|
|
43
|
+
result = super().pop(key, *args)
|
|
44
|
+
self._sync_to_model()
|
|
45
|
+
return result
|
|
46
|
+
|
|
47
|
+
def popitem(self):
|
|
48
|
+
result = super().popitem()
|
|
49
|
+
self._sync_to_model()
|
|
50
|
+
return result
|
|
51
|
+
|
|
52
|
+
def setdefault(self, key, default=None):
|
|
53
|
+
result = super().setdefault(key, default)
|
|
54
|
+
self._sync_to_model()
|
|
55
|
+
return result
|
|
56
|
+
|
|
57
|
+
def update(self, *args, **kwargs):
|
|
58
|
+
super().update(*args, **kwargs)
|
|
59
|
+
self._sync_to_model()
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class JsonView:
|
|
63
|
+
"""Descriptor that provides dict-like access to a JSON text field.
|
|
64
|
+
|
|
65
|
+
Example:
|
|
66
|
+
class SomeModel(pw.Model):
|
|
67
|
+
some_field = pw.TextField(default="{}")
|
|
68
|
+
some_field_dict = JsonView(text_field_attr_name="some_field")
|
|
69
|
+
|
|
70
|
+
m = SomeModel()
|
|
71
|
+
m.some_field_dict["chosen_mistake"] = "whatever"
|
|
72
|
+
"""
|
|
73
|
+
|
|
74
|
+
def __init__(self, text_field_attr_name):
|
|
75
|
+
self.text_field_attr_name = text_field_attr_name
|
|
76
|
+
self.attr_name = None
|
|
77
|
+
|
|
78
|
+
def __set_name__(self, owner, name):
|
|
79
|
+
"""Called when the descriptor is assigned to a class attribute."""
|
|
80
|
+
self.attr_name = f"_{name}_dict"
|
|
81
|
+
|
|
82
|
+
def __get__(self, instance, owner):
|
|
83
|
+
if instance is None:
|
|
84
|
+
return self
|
|
85
|
+
|
|
86
|
+
# Check if we already have a cached JsonViewDict
|
|
87
|
+
if not hasattr(instance, self.attr_name):
|
|
88
|
+
if not hasattr(instance, self.text_field_attr_name):
|
|
89
|
+
raise ValueError(
|
|
90
|
+
f"Failed to link this JsonView to field '{self.text_field_attr_name}' because it doesn't exist on this model instance."
|
|
91
|
+
)
|
|
92
|
+
# Cache a new JsonViewDict
|
|
93
|
+
json_dict = JsonViewDict(instance, self.text_field_attr_name)
|
|
94
|
+
setattr(instance, self.attr_name, json_dict)
|
|
95
|
+
|
|
96
|
+
return getattr(instance, self.attr_name)
|
|
97
|
+
|
|
98
|
+
def __set__(self, instance, value):
|
|
99
|
+
"""Allow setting the entire dict."""
|
|
100
|
+
if isinstance(value, dict):
|
|
101
|
+
json_dict = JsonViewDict(instance, self.text_field_attr_name)
|
|
102
|
+
json_dict.update(value)
|
|
103
|
+
setattr(instance, self.attr_name, json_dict)
|
|
104
|
+
else:
|
|
105
|
+
raise ValueError(
|
|
106
|
+
f"This JsonView must be a dictionary to set linked field '{self.text_field_attr_name}' correctly."
|
|
107
|
+
)
|
|
@@ -10,6 +10,7 @@ from flexeval.classes.dataset import Dataset
|
|
|
10
10
|
from flexeval.classes.eval_set_run import EvalSetRun
|
|
11
11
|
from flexeval.classes.thread import Thread
|
|
12
12
|
from flexeval.classes.turn import Turn
|
|
13
|
+
from flexeval.classes.jsonview import JsonView
|
|
13
14
|
from flexeval.configuration import completion_functions
|
|
14
15
|
|
|
15
16
|
logger = logging.getLogger(__name__)
|
|
@@ -34,6 +35,10 @@ class Message(BaseModel):
|
|
|
34
35
|
content = pw.TextField()
|
|
35
36
|
context = pw.TextField(null=True) # Previous messages
|
|
36
37
|
|
|
38
|
+
# metadata
|
|
39
|
+
metadata = pw.TextField(default="{}", null=False)
|
|
40
|
+
metadata_dict = JsonView("metadata")
|
|
41
|
+
|
|
37
42
|
# helpers
|
|
38
43
|
system_prompt = pw.TextField(null=True)
|
|
39
44
|
is_flexeval_completion = pw.BooleanField(null=True)
|
|
@@ -3,6 +3,7 @@ import peewee as pw
|
|
|
3
3
|
from flexeval.classes.base import BaseModel
|
|
4
4
|
from flexeval.classes.dataset import Dataset
|
|
5
5
|
from flexeval.classes.eval_set_run import EvalSetRun
|
|
6
|
+
from flexeval.classes.jsonview import JsonView
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class Thread(BaseModel):
|
|
@@ -20,6 +21,9 @@ class Thread(BaseModel):
|
|
|
20
21
|
|
|
21
22
|
system_prompt = pw.TextField(null=True)
|
|
22
23
|
|
|
24
|
+
metadata = pw.TextField(default="{}", null=False)
|
|
25
|
+
metadata_dict = JsonView("metadata")
|
|
26
|
+
|
|
23
27
|
def __init__(self, **kwargs):
|
|
24
28
|
super().__init__(**kwargs)
|
|
25
29
|
self.metrics_to_evaluate = []
|
|
@@ -54,18 +54,13 @@ def load_jsonl(
|
|
|
54
54
|
max(1, nb_evaluations_per_thread)
|
|
55
55
|
): # duplicate stored threads for averaged evaluation results
|
|
56
56
|
if thread_id in selected_thread_ids:
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
eval_run_thread_id=str(thread_id)
|
|
62
|
-
+ "_"
|
|
63
|
-
+ str(thread_eval_run_id),
|
|
64
|
-
)
|
|
57
|
+
thread_json = json.loads(thread)
|
|
58
|
+
# extract any metadata
|
|
59
|
+
thread_metadata = thread_json.copy()
|
|
60
|
+
del thread_metadata["input"]
|
|
65
61
|
|
|
66
|
-
# Context
|
|
67
62
|
context = []
|
|
68
|
-
thread_input = json.loads(thread)["input"]
|
|
63
|
+
thread_input = thread_json["input"]
|
|
69
64
|
|
|
70
65
|
# Get system prompt used in the thread - assuming only 1
|
|
71
66
|
for message in thread_input:
|
|
@@ -78,15 +73,35 @@ def load_jsonl(
|
|
|
78
73
|
# Add the system prompt as context
|
|
79
74
|
context.append({"role": "system", "content": system_prompt})
|
|
80
75
|
|
|
76
|
+
thread_object: Thread = Thread.create(
|
|
77
|
+
evalsetrun=dataset.evalsetrun,
|
|
78
|
+
dataset=dataset,
|
|
79
|
+
jsonl_thread_id=thread_id,
|
|
80
|
+
eval_run_thread_id=str(thread_id)
|
|
81
|
+
+ "_"
|
|
82
|
+
+ str(thread_eval_run_id),
|
|
83
|
+
system_prompt=system_prompt,
|
|
84
|
+
metadata=json.dumps(thread_metadata),
|
|
85
|
+
)
|
|
86
|
+
|
|
81
87
|
# Create messages
|
|
82
88
|
index_in_thread = 0
|
|
83
89
|
for message in thread_input:
|
|
90
|
+
if not isinstance(message, dict):
|
|
91
|
+
raise ValueError(
|
|
92
|
+
f"Can't load unknown object type; expected dict. Check JSONL format: {message}"
|
|
93
|
+
)
|
|
84
94
|
role = message.get("role", None)
|
|
85
95
|
if role != "system":
|
|
86
96
|
# System message shouldn't be added as a separate message
|
|
87
97
|
system_prompt_for_this_message = ""
|
|
88
98
|
if role != "user":
|
|
89
99
|
system_prompt_for_this_message = system_prompt
|
|
100
|
+
message_metadata = message.copy()
|
|
101
|
+
if "content" in message_metadata:
|
|
102
|
+
del message_metadata["content"]
|
|
103
|
+
if "role" in message_metadata:
|
|
104
|
+
del message_metadata["role"]
|
|
90
105
|
Message.create(
|
|
91
106
|
evalsetrun=dataset.evalsetrun,
|
|
92
107
|
dataset=dataset,
|
|
@@ -95,9 +110,9 @@ def load_jsonl(
|
|
|
95
110
|
role=role,
|
|
96
111
|
content=message.get("content", None),
|
|
97
112
|
context=json.dumps(context),
|
|
98
|
-
metadata=message.get("metadata", None),
|
|
99
113
|
is_flexeval_completion=False,
|
|
100
114
|
system_prompt=system_prompt_for_this_message,
|
|
115
|
+
metadata=json.dumps(message_metadata),
|
|
101
116
|
)
|
|
102
117
|
# Update context
|
|
103
118
|
context.append(
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
{"key_1": "value_1", "key_2": {"nested_key": "nested_value"}, "input":[{ "role": "system", "content": "my system prompt" }, {"role":"user", "content": "I need help.", "index": 0}, {"role":"assistant", "content": "Help with what?", "index": 1}, {"role":"user", "content": "My homework.", "index": 2}]}
|
|
2
|
+
{"input": [{ "role": "system", "content": "my system prompt" }, {"role": "user", "content": "Hi, Nice to meet you!"}, {"role": "assistant", "content": "Nice to meet you, too! How can I help you today?"}, {"role": "user", "content": "How do I find cube roots by hand?"}]}
|
|
@@ -59,6 +59,39 @@ class TestDataLoader(mixins.DotenvMixin, unittest.TestCase):
|
|
|
59
59
|
# This is redundant, but just in case:
|
|
60
60
|
data_loader.load_jsonl(dataset=dataset, filename=dataset_filepath)
|
|
61
61
|
|
|
62
|
+
def test_load_jsonl_metadata(self):
|
|
63
|
+
"""Tests the inclusion of metadata in JSONL files."""
|
|
64
|
+
datasets = [
|
|
65
|
+
"tests/data/simple_metadata.jsonl",
|
|
66
|
+
]
|
|
67
|
+
evalsetrun = EvalSetRun.create(
|
|
68
|
+
dataset_files=json.dumps(datasets),
|
|
69
|
+
metrics="",
|
|
70
|
+
metrics_graph_ordered_list="",
|
|
71
|
+
do_completion=False,
|
|
72
|
+
)
|
|
73
|
+
for dataset_filepath in evalsetrun.get_datasets():
|
|
74
|
+
dataset = Dataset.create(
|
|
75
|
+
evalsetrun=evalsetrun,
|
|
76
|
+
filename=dataset_filepath,
|
|
77
|
+
)
|
|
78
|
+
dataset.load_data()
|
|
79
|
+
# This is redundant, but just in case:
|
|
80
|
+
data_loader.load_jsonl(dataset=dataset, filename=dataset_filepath)
|
|
81
|
+
|
|
82
|
+
for thread in dataset.threads:
|
|
83
|
+
metadata = json.loads(thread.metadata)
|
|
84
|
+
assert "key_1" in metadata and metadata["key_1"] == "value_1"
|
|
85
|
+
assert (
|
|
86
|
+
"key_2" in metadata
|
|
87
|
+
and metadata["key_2"]["nested_key"] == "nested_value"
|
|
88
|
+
)
|
|
89
|
+
assert "input" not in metadata
|
|
90
|
+
for i, message in enumerate(thread.messages):
|
|
91
|
+
metadata = json.loads(message.metadata)
|
|
92
|
+
assert metadata["index"] == i
|
|
93
|
+
break
|
|
94
|
+
|
|
62
95
|
|
|
63
96
|
class State(TypedDict):
|
|
64
97
|
# TODO move this to some kind of langgraph utility file
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.0"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python_flexeval-0.2.0 → python_flexeval-0.3.0}/example_project/example_specific_rubrics.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/configuration/completion_functions.py
RENAMED
|
File without changes
|
|
File without changes
|
{python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/configuration/function_metrics.py
RENAMED
|
File without changes
|
{python_flexeval-0.2.0 → python_flexeval-0.3.0}/src/flexeval/configuration/rubric_metrics.yaml
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|