python-flexeval 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flexeval/__about__.py +1 -1
- flexeval/classes/dataset.py +12 -72
- flexeval/classes/eval_set_run.py +18 -7
- flexeval/classes/jsonview.py +10 -5
- flexeval/classes/message.py +11 -5
- flexeval/classes/metric.py +0 -8
- flexeval/classes/thread.py +0 -2
- flexeval/classes/tool_call.py +0 -2
- flexeval/classes/turn.py +7 -5
- flexeval/completions.py +8 -5
- flexeval/compute_metrics.py +45 -32
- flexeval/configuration/evals.yaml +2 -25
- flexeval/data_loader.py +219 -317
- flexeval/db_utils.py +11 -2
- flexeval/dependency_graph.py +3 -3
- flexeval/eval_schema.json +0 -18
- flexeval/function_types.py +2 -13
- flexeval/metrics/save.py +12 -8
- flexeval/run_utils.py +163 -17
- flexeval/runner.py +6 -14
- flexeval/schema/config_schema.py +12 -0
- flexeval/schema/eval_schema.py +3 -0
- flexeval/schema/evalrun_schema.py +41 -10
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/METADATA +3 -3
- python_flexeval-0.4.0.dist-info/RECORD +49 -0
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/WHEEL +1 -1
- python_flexeval-0.3.0.dist-info/RECORD +0 -49
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/entry_points.txt +0 -0
- {python_flexeval-0.3.0.dist-info → python_flexeval-0.4.0.dist-info}/licenses/LICENSE +0 -0
flexeval/dependency_graph.py
CHANGED
|
@@ -115,9 +115,9 @@ def get_parent_metrics(all_metrics: dict, child: dict) -> tuple[list, list]:
|
|
|
115
115
|
"""metrics_graph_ordered_list will be a list of metrics in order in which they should be run
|
|
116
116
|
|
|
117
117
|
This function takes the eval represented by "child" and finds ALL evals in "all_metrics"
|
|
118
|
-
that
|
|
118
|
+
that qualify as the child's immediate parent
|
|
119
119
|
|
|
120
|
-
An eval can qualify as a parent by having a matching name, type,
|
|
120
|
+
An eval can qualify as a parent by having a matching name, type, etc.
|
|
121
121
|
At this point, we won't have enough information to decide whether the child should be run
|
|
122
122
|
(since the child might have additional requirements on the output of the parent)
|
|
123
123
|
but this is enough to tell us that the child should be run AFTER the parent.
|
|
@@ -145,7 +145,7 @@ def get_parent_metrics(all_metrics: dict, child: dict) -> tuple[list, list]:
|
|
|
145
145
|
|
|
146
146
|
# if the conditionals are listed in the depends_on entry but don't match...
|
|
147
147
|
# Only check conditionals that are explicitly specified (not None) in the requirement
|
|
148
|
-
conditionals = ["metric_level", "
|
|
148
|
+
conditionals = ["metric_level", "name", "kwargs"]
|
|
149
149
|
for conditional in conditionals:
|
|
150
150
|
if (
|
|
151
151
|
conditional in requirement
|
flexeval/eval_schema.json
CHANGED
|
@@ -76,10 +76,6 @@
|
|
|
76
76
|
"description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
|
|
77
77
|
"additionalProperties": true
|
|
78
78
|
},
|
|
79
|
-
"context_only": {
|
|
80
|
-
"type": "boolean",
|
|
81
|
-
"description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
|
|
82
|
-
},
|
|
83
79
|
"last_turn_only": {
|
|
84
80
|
"type": "boolean",
|
|
85
81
|
"description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."
|
|
@@ -108,11 +104,6 @@
|
|
|
108
104
|
"description": "What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
|
|
109
105
|
"default": "Turn"
|
|
110
106
|
},
|
|
111
|
-
"context_only": {
|
|
112
|
-
"type": "boolean",
|
|
113
|
-
"description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
|
|
114
|
-
"default": false
|
|
115
|
-
},
|
|
116
107
|
"last_instance_only": {
|
|
117
108
|
"type": "boolean",
|
|
118
109
|
"description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
|
|
@@ -143,11 +134,6 @@
|
|
|
143
134
|
"description": "What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
|
|
144
135
|
"default": "Turn"
|
|
145
136
|
},
|
|
146
|
-
"context_only": {
|
|
147
|
-
"type": "boolean",
|
|
148
|
-
"description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
|
|
149
|
-
"default": false
|
|
150
|
-
},
|
|
151
137
|
"last_instance_only": {
|
|
152
138
|
"type": "boolean",
|
|
153
139
|
"description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
|
|
@@ -174,10 +160,6 @@
|
|
|
174
160
|
"description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
|
|
175
161
|
"additionalProperties": true
|
|
176
162
|
},
|
|
177
|
-
"context_only": {
|
|
178
|
-
"type": "boolean",
|
|
179
|
-
"description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
|
|
180
|
-
},
|
|
181
163
|
"last_turn_only": {
|
|
182
164
|
"type": "boolean",
|
|
183
165
|
"description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."
|
flexeval/function_types.py
CHANGED
|
@@ -102,7 +102,6 @@ def get_function_input(
|
|
|
102
102
|
metric_function: Callable,
|
|
103
103
|
metric_level: eval_schema.MetricLevel,
|
|
104
104
|
input_object: AnyFunctionObjectInput,
|
|
105
|
-
context_only: bool,
|
|
106
105
|
) -> AnyFunctionObjectInput | str | dict | list:
|
|
107
106
|
"""Coerce input_object to a type accepted by metric_function at this metric_level.
|
|
108
107
|
|
|
@@ -110,7 +109,6 @@ def get_function_input(
|
|
|
110
109
|
metric_function (Callable): Function to invoke with the returned input.
|
|
111
110
|
metric_level (eval_schema.MetricLevel): The metric level at which metric_function is being invoked.
|
|
112
111
|
input_object (AnyFunctionObjectInput): The input_object to be coerced, or passed as-is if accepted by metric_function.
|
|
113
|
-
context_only (bool): Determines how strings and lists are converted. See schema documentation.
|
|
114
112
|
|
|
115
113
|
Raises:
|
|
116
114
|
ValueError: If the function accepts at least one declared type, but
|
|
@@ -137,22 +135,13 @@ def get_function_input(
|
|
|
137
135
|
elif dict in accepted_parameter_types and metric_level == "ToolCall":
|
|
138
136
|
return input_object.get_dict_representation()
|
|
139
137
|
elif list in accepted_parameter_types and metric_level in ["Turn", "Thread"]:
|
|
140
|
-
|
|
141
|
-
return input_object.get_context()
|
|
142
|
-
else:
|
|
143
|
-
# this is on a single turn - pass in the parsed list
|
|
144
|
-
return input_object.get_content()
|
|
138
|
+
return input_object.get_content()
|
|
145
139
|
elif str in accepted_parameter_types:
|
|
146
140
|
if metric_level == "ToolCall":
|
|
147
141
|
raise ValueError(
|
|
148
142
|
"Functions that accept strings can't be used for tool calls. Accept a dict (or a flexeval.classes.tool_call.ToolCall) instead."
|
|
149
143
|
)
|
|
150
|
-
|
|
151
|
-
# join together all previous turns
|
|
152
|
-
return join_all_contents_to_string(input_object.get_context())
|
|
153
|
-
else:
|
|
154
|
-
# current turn only
|
|
155
|
-
return join_all_contents_to_string(input_object.get_content())
|
|
144
|
+
return join_all_contents_to_string(input_object.get_content())
|
|
156
145
|
else:
|
|
157
146
|
# the function accepts at least one declared type, but either:
|
|
158
147
|
# - it's a type we don't support at all e.g. set
|
flexeval/metrics/save.py
CHANGED
|
@@ -1,25 +1,30 @@
|
|
|
1
1
|
import json
|
|
2
2
|
from typing import Iterable
|
|
3
3
|
|
|
4
|
+
from flexeval.classes.dataset import Dataset
|
|
5
|
+
from flexeval.classes.eval_set_run import EvalSetRun
|
|
4
6
|
from flexeval.classes.metric import Metric
|
|
5
7
|
|
|
6
8
|
|
|
7
|
-
def save_metrics(
|
|
9
|
+
def save_metrics(
|
|
10
|
+
metrics: Iterable[Metric], evalsetrun: EvalSetRun, datasets: list[Dataset]
|
|
11
|
+
):
|
|
12
|
+
# Build a mapping from dataset id to dataset for quick lookup
|
|
13
|
+
dataset_by_id = {d.id: d for d in datasets}
|
|
8
14
|
for metric in metrics:
|
|
9
15
|
# TODO - speed this up somehow
|
|
10
16
|
thread = metric.get("thread")
|
|
11
17
|
if thread is None:
|
|
12
18
|
thread = metric[metric["metric_level"].lower()].thread
|
|
19
|
+
# Determine the dataset from the metric's object
|
|
20
|
+
metric_object = metric[metric["metric_level"].lower()]
|
|
21
|
+
dataset = dataset_by_id.get(metric_object.dataset_id)
|
|
13
22
|
Metric.create(
|
|
14
23
|
message=metric.get("message", None),
|
|
15
24
|
turn=metric.get("turn", None),
|
|
16
25
|
toolcall=metric.get("toolcall", None),
|
|
17
|
-
evalsetrun=
|
|
18
|
-
|
|
19
|
-
].evalsetrun, # metric["turn"].evalsetrun,
|
|
20
|
-
dataset=metric[
|
|
21
|
-
metric["metric_level"].lower()
|
|
22
|
-
].dataset, # metric["turn"].dataset,
|
|
26
|
+
evalsetrun=evalsetrun,
|
|
27
|
+
dataset=dataset,
|
|
23
28
|
thread=thread,
|
|
24
29
|
evaluation_name=metric["evaluation_name"],
|
|
25
30
|
evaluation_type=metric["evaluation_type"],
|
|
@@ -28,7 +33,6 @@ def save_metrics(metrics: Iterable[Metric]):
|
|
|
28
33
|
metric_level=metric["metric_level"],
|
|
29
34
|
kwargs=metric["kwargs"],
|
|
30
35
|
depends_on=json.dumps(metric["depends_on"]),
|
|
31
|
-
context_only=metric.get("context_only", False),
|
|
32
36
|
source=metric["source"],
|
|
33
37
|
rubric_prompt=metric.get("rubric_prompt", None),
|
|
34
38
|
rubric_completion=metric.get("rubric_completion", None),
|
flexeval/run_utils.py
CHANGED
|
@@ -6,7 +6,9 @@ import logging
|
|
|
6
6
|
from flexeval import rubric
|
|
7
7
|
from flexeval.classes.dataset import Dataset
|
|
8
8
|
from flexeval.classes.eval_runner import EvalRunner
|
|
9
|
-
from flexeval.classes.eval_set_run import EvalSetRun
|
|
9
|
+
from flexeval.classes.eval_set_run import EvalSetRun, EvalSetRunDatasets
|
|
10
|
+
from flexeval.schema import evalrun_schema
|
|
11
|
+
from flexeval import data_loader
|
|
10
12
|
|
|
11
13
|
logger = logging.getLogger(__name__)
|
|
12
14
|
|
|
@@ -16,17 +18,11 @@ def build_eval_set_run(runner: EvalRunner) -> EvalSetRun:
|
|
|
16
18
|
|
|
17
19
|
# TODO this code uses a model_name that does not appear in the Eval schema; should look into this
|
|
18
20
|
model_name = json.dumps(None)
|
|
19
|
-
# model_name = json.dumps(
|
|
20
|
-
# runner.eval.get("completion_llm", {}).get("model_name", None)
|
|
21
|
-
# )
|
|
22
21
|
evalsetrun = EvalSetRun.create(
|
|
23
22
|
name=runner.evalrun.eval.name,
|
|
24
23
|
notes=runner.evalrun.eval.notes,
|
|
25
24
|
metrics=runner.evalrun.eval.metrics.model_dump_json(),
|
|
26
25
|
metrics_graph_ordered_list=json.dumps(runner.metrics_graph_ordered_list),
|
|
27
|
-
dataset_files=json.dumps(
|
|
28
|
-
[str(data_source.path) for data_source in runner.evalrun.data_sources]
|
|
29
|
-
),
|
|
30
26
|
do_completion=runner.evalrun.eval.do_completion,
|
|
31
27
|
completion_llm=(
|
|
32
28
|
runner.evalrun.eval.completion_llm.model_dump_json()
|
|
@@ -51,15 +47,165 @@ def build_eval_set_run(runner: EvalRunner) -> EvalSetRun:
|
|
|
51
47
|
return evalsetrun
|
|
52
48
|
|
|
53
49
|
|
|
54
|
-
def
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
50
|
+
def find_dataset_by_name(name: str) -> Dataset | None:
|
|
51
|
+
"""Return the loaded Dataset with this name, or None if no such dataset exists.
|
|
52
|
+
|
|
53
|
+
If a Dataset with this name exists but is not marked is_loaded (the remnant
|
|
54
|
+
of a crashed prior load), it is treated as stale: cleaned up via
|
|
55
|
+
:func:`_cleanup_stale_dataset` and None is returned, so the caller can
|
|
56
|
+
proceed as if no dataset existed.
|
|
57
|
+
|
|
58
|
+
Raises:
|
|
59
|
+
ValueError: If more than one Dataset has this name, or if a stale
|
|
60
|
+
unloaded Dataset has derived rows (metrics or eval-run links) that
|
|
61
|
+
suggest a genuine integrity problem — see _cleanup_stale_dataset.
|
|
62
|
+
"""
|
|
63
|
+
# LIMIT 2: we only need to know 0, 1, or >1
|
|
64
|
+
results = list(Dataset.select().where(Dataset.name == name).limit(2))
|
|
65
|
+
if len(results) == 0:
|
|
66
|
+
return None
|
|
67
|
+
if len(results) > 1:
|
|
68
|
+
raise ValueError(f"Multiple datasets with name '{name}'.")
|
|
69
|
+
dataset = results[0]
|
|
70
|
+
if not dataset.is_loaded:
|
|
71
|
+
_cleanup_stale_dataset(dataset)
|
|
72
|
+
return None
|
|
73
|
+
return dataset
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _cleanup_stale_dataset(dataset: Dataset) -> None:
|
|
77
|
+
"""Delete a partially-loaded Dataset and its child rows.
|
|
78
|
+
|
|
79
|
+
A Dataset with ``is_loaded=False`` is the remnant of a prior load that
|
|
80
|
+
crashed between the Dataset row being committed and the final
|
|
81
|
+
``is_loaded=True`` save — its Thread/Turn/Message/ToolCall rows (if any)
|
|
82
|
+
are partial and unusable.
|
|
83
|
+
|
|
84
|
+
Derived rows (Metric, EvalSetRunDatasets) should never exist for an
|
|
85
|
+
unloaded Dataset — they're only created after a successful load. If they
|
|
86
|
+
do, something bypassed the normal flow and we refuse to touch it.
|
|
87
|
+
"""
|
|
88
|
+
if dataset.metrics_list.exists() or dataset.evalsetrun_links.exists():
|
|
89
|
+
raise ValueError(
|
|
90
|
+
f"Dataset '{dataset.name}' (ID={dataset.id}) has is_loaded=False but "
|
|
91
|
+
"has metrics or eval-run links — refusing to clean up (possible integrity error)."
|
|
62
92
|
)
|
|
63
|
-
|
|
64
|
-
|
|
93
|
+
counts = {
|
|
94
|
+
"threads": dataset.threads.count(),
|
|
95
|
+
"turns": dataset.turns.count(),
|
|
96
|
+
"messages": dataset.messages.count(),
|
|
97
|
+
"toolcalls": dataset.toolcalls.count(),
|
|
98
|
+
}
|
|
99
|
+
logger.warning(
|
|
100
|
+
f"Dropping unloaded dataset '{dataset.name}' (ID={dataset.id}); "
|
|
101
|
+
f"partial rows from a prior failed load: {counts}. Reloading from scratch."
|
|
102
|
+
)
|
|
103
|
+
dataset.delete_instance(recursive=True)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def create_dataset(data_source: evalrun_schema.DataSource) -> Dataset:
|
|
107
|
+
dataset = Dataset.create(
|
|
108
|
+
datasource_type=type(data_source).__name__,
|
|
109
|
+
name=data_source.name,
|
|
110
|
+
notes=data_source.notes,
|
|
111
|
+
)
|
|
112
|
+
return dataset
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def load_datasets(
|
|
116
|
+
evalrun: evalrun_schema.EvalRun,
|
|
117
|
+
) -> list[Dataset]:
|
|
118
|
+
datasets = []
|
|
119
|
+
config = evalrun.config
|
|
120
|
+
for data_source in evalrun.data_sources:
|
|
121
|
+
datasource_type = type(data_source).__name__
|
|
122
|
+
|
|
123
|
+
# Auto-name unnamed IterableDataSources so same-instance reuse works
|
|
124
|
+
if (
|
|
125
|
+
isinstance(data_source, evalrun_schema.IterableDataSource)
|
|
126
|
+
and not data_source.name
|
|
127
|
+
):
|
|
128
|
+
data_source.name = f"_iterable_{id(data_source)}"
|
|
129
|
+
|
|
130
|
+
# 1. Validate naming constraints
|
|
131
|
+
if config.raise_on_unnamed_dataset and (
|
|
132
|
+
data_source.name is None or data_source.name.strip() == ""
|
|
133
|
+
):
|
|
134
|
+
raise ValueError(
|
|
135
|
+
f"Configuration requires named datasets, but a {datasource_type} was unnamed."
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
# 2. Look up existing dataset by name (if named)
|
|
139
|
+
existing_dataset = None
|
|
140
|
+
if data_source.name:
|
|
141
|
+
existing_dataset = find_dataset_by_name(data_source.name)
|
|
142
|
+
|
|
143
|
+
# 3. Dispatch by DataSource type
|
|
144
|
+
if isinstance(data_source, evalrun_schema.NamedDataSource):
|
|
145
|
+
# NamedDataSource MUST match an existing dataset
|
|
146
|
+
if existing_dataset is None:
|
|
147
|
+
raise ValueError(
|
|
148
|
+
f"NamedDataSource requires an existing dataset with name '{data_source.name}', but none was found."
|
|
149
|
+
)
|
|
150
|
+
dataset = existing_dataset
|
|
151
|
+
|
|
152
|
+
elif isinstance(
|
|
153
|
+
data_source,
|
|
154
|
+
(evalrun_schema.FileDataSource, evalrun_schema.IterableDataSource),
|
|
155
|
+
):
|
|
156
|
+
# Reuse if configured and existing dataset matches (checked first, takes priority)
|
|
157
|
+
if config.reuse_dataset_by_name and existing_dataset is not None:
|
|
158
|
+
if existing_dataset.datasource_type != datasource_type:
|
|
159
|
+
logger.warning(
|
|
160
|
+
f"Reusing dataset '{existing_dataset.name}' (ID={existing_dataset.id}) "
|
|
161
|
+
f"but datasource type differs: existing={existing_dataset.datasource_type}, new={datasource_type}."
|
|
162
|
+
)
|
|
163
|
+
logger.info(
|
|
164
|
+
f"Reusing existing dataset '{existing_dataset.name}' (ID={existing_dataset.id})."
|
|
165
|
+
)
|
|
166
|
+
dataset = existing_dataset
|
|
167
|
+
else:
|
|
168
|
+
# Check for duplicate name conflict (only when not reusing)
|
|
169
|
+
if (
|
|
170
|
+
config.raise_on_duplicate_dataset_name
|
|
171
|
+
and existing_dataset is not None
|
|
172
|
+
):
|
|
173
|
+
raise ValueError(
|
|
174
|
+
f"Configuration requires unique dataset names, but '{data_source.name}' already exists (ID={existing_dataset.id})."
|
|
175
|
+
)
|
|
176
|
+
# Create and load new dataset
|
|
177
|
+
dataset = create_dataset(data_source)
|
|
178
|
+
if isinstance(data_source, evalrun_schema.IterableDataSource):
|
|
179
|
+
data_loader.load_iterable(dataset, data_source.contents)
|
|
180
|
+
elif isinstance(data_source, evalrun_schema.FileDataSource):
|
|
181
|
+
data_loader.load_file(
|
|
182
|
+
dataset,
|
|
183
|
+
data_source,
|
|
184
|
+
max_n_conversation_threads=config.max_n_conversation_threads,
|
|
185
|
+
nb_evaluations_per_thread=config.nb_evaluations_per_thread,
|
|
186
|
+
)
|
|
187
|
+
dataset.metadata_dict["imported_path"] = str(data_source.path)
|
|
188
|
+
dataset.metadata_dict["imported_format"] = data_source.format.value
|
|
189
|
+
dataset.is_loaded = True
|
|
190
|
+
dataset.save()
|
|
191
|
+
else:
|
|
192
|
+
raise ValueError(f"Unsupported DataSource type: {datasource_type}")
|
|
193
|
+
|
|
194
|
+
datasets.append(dataset)
|
|
195
|
+
return datasets
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def set_datasets_for_evalsetrun(datasets: list[Dataset], evalsetrun: EvalSetRun):
|
|
199
|
+
for dataset in datasets:
|
|
200
|
+
EvalSetRunDatasets.create(
|
|
201
|
+
evalsetrun=evalsetrun,
|
|
202
|
+
dataset=dataset,
|
|
65
203
|
)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def build_evalsetrun_datasets(
|
|
207
|
+
evalrun: evalrun_schema.EvalRun, evalsetrun: EvalSetRun
|
|
208
|
+
) -> list[Dataset]:
|
|
209
|
+
datasets = load_datasets(evalrun)
|
|
210
|
+
set_datasets_for_evalsetrun(datasets, evalsetrun)
|
|
211
|
+
return datasets
|
flexeval/runner.py
CHANGED
|
@@ -86,26 +86,18 @@ def run(eval_run: EvalRun) -> EvalRunner:
|
|
|
86
86
|
rd.seed(rd_seed)
|
|
87
87
|
runner.logger.info(f"Set random seed to '{rd_seed}'.")
|
|
88
88
|
|
|
89
|
-
run_utils.
|
|
90
|
-
except Exception:
|
|
91
|
-
runner.logger.exception(
|
|
92
|
-
"An error occurred creating dataset metadata.", exc_info=True
|
|
93
|
-
)
|
|
94
|
-
|
|
95
|
-
try:
|
|
96
|
-
runner.logger.info("Parsing data files")
|
|
97
|
-
for dataset in evalsetrun.datasets:
|
|
98
|
-
runner.logger.debug(f"Loading data from '{dataset.filename}'.")
|
|
99
|
-
dataset.load_data()
|
|
89
|
+
datasets = run_utils.build_evalsetrun_datasets(runner.evalrun, evalsetrun)
|
|
100
90
|
except Exception:
|
|
101
91
|
runner.logger.exception("An error occurred loading data.", exc_info=True)
|
|
92
|
+
runner.shutdown_logging()
|
|
93
|
+
raise
|
|
102
94
|
|
|
103
95
|
# Do completions, if necessary
|
|
104
96
|
try:
|
|
105
97
|
if evalsetrun.do_completion:
|
|
106
98
|
# We do this by creating new turns
|
|
107
99
|
runner.logger.info("Generating completions")
|
|
108
|
-
completions.get_completions(eval_run, evalsetrun)
|
|
100
|
+
completions.get_completions(eval_run, evalsetrun, datasets)
|
|
109
101
|
except Exception:
|
|
110
102
|
runner.logger.exception(
|
|
111
103
|
"An error occurred generating completions.", exc_info=True
|
|
@@ -118,9 +110,9 @@ def run(eval_run: EvalRun) -> EvalRunner:
|
|
|
118
110
|
################# Compute Metrics ###################
|
|
119
111
|
#######################################################
|
|
120
112
|
try:
|
|
121
|
-
metrics = compute_metrics.compute_metrics(eval_run, evalsetrun)
|
|
113
|
+
metrics = compute_metrics.compute_metrics(eval_run, evalsetrun, datasets)
|
|
122
114
|
runner.logger.info(f"Saving '{len(metrics)}' metrics to database.")
|
|
123
|
-
flexeval.metrics.save.save_metrics(metrics)
|
|
115
|
+
flexeval.metrics.save.save_metrics(metrics, evalsetrun, datasets)
|
|
124
116
|
except Exception:
|
|
125
117
|
runner.logger.exception("An error occurred computing metrics.", exc_info=True)
|
|
126
118
|
if eval_run.config.raise_on_metric_error:
|
flexeval/schema/config_schema.py
CHANGED
|
@@ -44,3 +44,15 @@ class Config(BaseModel):
|
|
|
44
44
|
False,
|
|
45
45
|
description="If False (default), no exception will be thrown if a metric function raises an exception.",
|
|
46
46
|
)
|
|
47
|
+
raise_on_duplicate_dataset_name: bool = Field(
|
|
48
|
+
False,
|
|
49
|
+
description="If True, throw an exception if two datasets would be created with the same name. Ignored when reuse_dataset_by_name is True.",
|
|
50
|
+
)
|
|
51
|
+
raise_on_unnamed_dataset: bool = Field(
|
|
52
|
+
False,
|
|
53
|
+
description="If True, throw an exception if any dataset is unnamed.",
|
|
54
|
+
)
|
|
55
|
+
reuse_dataset_by_name: bool = Field(
|
|
56
|
+
True,
|
|
57
|
+
description="If True (default), reuse a previously loaded dataset with the same name instead of creating a new one. This avoids redundant data loading and prevents iterator-based data sources from being consumed twice.",
|
|
58
|
+
)
|
flexeval/schema/eval_schema.py
CHANGED
|
@@ -1,39 +1,70 @@
|
|
|
1
1
|
"""The top-level :class:`~flexeval.schema.evalrun_schema.EvalRun` schema and associated sub-schema."""
|
|
2
2
|
|
|
3
|
+
import enum
|
|
3
4
|
from pathlib import Path
|
|
4
|
-
from typing import Annotated, Callable, Iterable, Literal
|
|
5
|
+
from typing import Annotated, Callable, Iterable, Literal, Union
|
|
5
6
|
|
|
6
7
|
from annotated_types import Len
|
|
7
|
-
from pydantic import BaseModel, Field, FilePath
|
|
8
|
+
from pydantic import BaseModel, Discriminator, Field, FilePath, Tag
|
|
8
9
|
|
|
9
10
|
from flexeval.configuration import function_metrics
|
|
10
11
|
from flexeval.schema import config_schema, eval_schema, rubric_schema, schema_utils
|
|
11
12
|
|
|
12
13
|
|
|
13
14
|
class DataSource(BaseModel):
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
name: str | None = Field(
|
|
17
|
-
|
|
15
|
+
"""Represents a source of data that can be used in evaluations."""
|
|
16
|
+
|
|
17
|
+
name: str | None = Field(
|
|
18
|
+
None, description="Used as metadata. No uniqueness requirement."
|
|
19
|
+
)
|
|
20
|
+
notes: str | None = Field(
|
|
21
|
+
None, description="Used as metadata; put whatever you want here."
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class NamedDataSource(DataSource):
|
|
26
|
+
"""Look up a previously loaded DataSource by name. Must have a unique name."""
|
|
27
|
+
|
|
28
|
+
type: Literal["named"] = "named"
|
|
29
|
+
name: str = Field(description="The name to match on.")
|
|
18
30
|
|
|
19
31
|
|
|
20
32
|
class IterableDataSource(DataSource):
|
|
21
|
-
"""
|
|
33
|
+
"""Iterable of data items."""
|
|
22
34
|
|
|
35
|
+
type: Literal["iterable"] = "iterable"
|
|
23
36
|
contents: Iterable = Field(
|
|
24
37
|
default_factory=list,
|
|
25
|
-
description="Iterable of data items,
|
|
38
|
+
description="Iterable of data items. For now, each item must be a dictionary with role and content keys.",
|
|
26
39
|
)
|
|
27
40
|
|
|
28
41
|
|
|
42
|
+
class FileFormatEnum(str, enum.Enum):
|
|
43
|
+
jsonl = "jsonl"
|
|
44
|
+
langgraph_sqlite = "langgraph_sqlite"
|
|
45
|
+
|
|
46
|
+
|
|
29
47
|
class FileDataSource(DataSource):
|
|
30
48
|
"""File to be used as a data source."""
|
|
31
49
|
|
|
50
|
+
type: Literal["file"] = "file"
|
|
32
51
|
# TODO in the future, we could use cloudpathlib to support cloud paths
|
|
33
52
|
path: FilePath = Field(
|
|
34
53
|
description="Absolute or relative path to data file. Each file must be in jsonl format, with one conversation per line."
|
|
35
54
|
)
|
|
36
|
-
format:
|
|
55
|
+
format: FileFormatEnum = Field(
|
|
56
|
+
FileFormatEnum.jsonl, description="Format of the data file. Default: JSONL"
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
DataSourceType = Annotated[
|
|
61
|
+
Union[
|
|
62
|
+
Annotated[NamedDataSource, Tag("named")],
|
|
63
|
+
Annotated[FileDataSource, Tag("file")],
|
|
64
|
+
Annotated[IterableDataSource, Tag("iterable")],
|
|
65
|
+
],
|
|
66
|
+
Discriminator("type"),
|
|
67
|
+
]
|
|
37
68
|
|
|
38
69
|
|
|
39
70
|
class FunctionsCollection(BaseModel):
|
|
@@ -68,7 +99,7 @@ class EvalRun(BaseModel):
|
|
|
68
99
|
|
|
69
100
|
Read more in the :ref:`user_guide`."""
|
|
70
101
|
|
|
71
|
-
data_sources: Annotated[list[
|
|
102
|
+
data_sources: Annotated[list[DataSourceType], Len(min_length=1)] = Field(
|
|
72
103
|
description="List of data sources.",
|
|
73
104
|
)
|
|
74
105
|
database_path: Path = Field(
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: python-flexeval
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.4.0
|
|
4
4
|
Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
|
|
5
5
|
Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
|
|
6
6
|
Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
|
|
@@ -21,8 +21,8 @@ Requires-Dist: flatten-json>=0.1.14
|
|
|
21
21
|
Requires-Dist: jsonschema>=4.23.0
|
|
22
22
|
Requires-Dist: langchain-openai>=0.3.8
|
|
23
23
|
Requires-Dist: langchain>=0.3.20
|
|
24
|
-
Requires-Dist: langgraph-checkpoint-sqlite>=
|
|
25
|
-
Requires-Dist: langgraph>=0.
|
|
24
|
+
Requires-Dist: langgraph-checkpoint-sqlite>=3.0.0
|
|
25
|
+
Requires-Dist: langgraph>=1.0.0
|
|
26
26
|
Requires-Dist: litellm>=1.74.3
|
|
27
27
|
Requires-Dist: msgpack>=1.1.0
|
|
28
28
|
Requires-Dist: networkx>=3.4.2
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
flexeval/__about__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
|
|
2
|
+
flexeval/__init__.py,sha256=UXI_xdSxnGAK2plDODBbPF3df-N7E9YJ418QHK7XN-Q,391
|
|
3
|
+
flexeval/__main__.py,sha256=c9NQqsea3e-_6b736gBeIO3O_zdXQ1wtY3-Scj5NiPg,126
|
|
4
|
+
flexeval/cli.py,sha256=RwtRk121OivbLQyYpYxJ7PugPIYQ8J4qXHFN2SxxPy4,2985
|
|
5
|
+
flexeval/completions.py,sha256=8PwpWXawARiSngeE2bRzTRXmPyXmxUjPKNFv4zCuAzE,5731
|
|
6
|
+
flexeval/compute_metrics.py,sha256=SNhPpe5ol7Cqr2kjaBdeTIWIYqVlGjd9ZVDl9Qq90y0,37636
|
|
7
|
+
flexeval/config.yaml,sha256=dpkFdW0rKf7StGoVeIGaCNw0n0yOfYWig0xmIfsDdbg,530
|
|
8
|
+
flexeval/data_loader.py,sha256=jptI0tG2YYk40xNYiZzfSqcqmYzw9pIBt_rFtpw3T4o,17099
|
|
9
|
+
flexeval/db_utils.py,sha256=xz97uZbUMQaTyGoR-7lKrMDs8SGdHy09SCvfCkxB36A,1687
|
|
10
|
+
flexeval/dependency_graph.py,sha256=dUQp0WQ9G2FskorUMLYOKFQ9_JwIrMR_DpVrqh4n0xg,10515
|
|
11
|
+
flexeval/eval_schema.json,sha256=pAS3vPLBEyH3Yjglos6aB0aNMTEUFbV-3Rf6wuSVtR4,11881
|
|
12
|
+
flexeval/function_types.py,sha256=rz8AcsHJOkFfAEEocN3HX5EgEh70Oze5dSlNMdaihVU,6420
|
|
13
|
+
flexeval/helpers.py,sha256=gX-6Hx4_wOiqbfY8c8_kL3XbkdV8mpEjPmaAe44lOSk,1605
|
|
14
|
+
flexeval/log_utils.py,sha256=E3RloPQZbtd8sEIg7mfN5fAku-TeNGqWy03SmwRllIE,923
|
|
15
|
+
flexeval/rubric.py,sha256=UwtJOxIxFJcQVrDXXuCA3tF_FFTcvLPqo2F9lq8gPcM,2167
|
|
16
|
+
flexeval/run_utils.py,sha256=z9ISQlthcLUUsGiIyaGHI1IwICBy_JLN_Efg4TNv8Mk,8536
|
|
17
|
+
flexeval/runner.py,sha256=RuQYQgafD0p4qlVK-IxDRKBJPic40YJItrJg8-M9Shw,4110
|
|
18
|
+
flexeval/classes/__init__.py,sha256=fywDMYX8W-nXFKRXolzn-RWd_7tiJr6FlouQJvYSoyE,347
|
|
19
|
+
flexeval/classes/base.py,sha256=xxkTa8joPe39CFwveeTPW56LW-x7rsi5oBAIxrvM5iI,944
|
|
20
|
+
flexeval/classes/dataset.py,sha256=10t4_1Jyg9rYe00VqOYt_biQYnSZrKbJ3nEIRMCF4O8,656
|
|
21
|
+
flexeval/classes/eval_runner.py,sha256=ZvCpyaD7lorDK_mYJSZqQbvI6FfLbIWRFHNarWTAMQU,6270
|
|
22
|
+
flexeval/classes/eval_set_run.py,sha256=n15zMu-KANEDc2K3sqs-KEI12bWpkhSrF0EkEAiBPV4,1449
|
|
23
|
+
flexeval/classes/jsonview.py,sha256=9HQfEY7BH9D58EnR4N9R5oMQsCMjJxsMcPHdzOBLj2w,3773
|
|
24
|
+
flexeval/classes/message.py,sha256=fiW0JhXKt5IiLw7zA4XVKjpY1rObVGvoBtUTXjOXWhs,7741
|
|
25
|
+
flexeval/classes/metric.py,sha256=yXwRx8ECsEYXKg24r0Y0e8B81XGZma1xOcYp4Zi86pM,2109
|
|
26
|
+
flexeval/classes/thread.py,sha256=3gwiLwe3xP0atzsyCG3SKd2G3QtY21vk1Gif4p9ZwI8,2802
|
|
27
|
+
flexeval/classes/tool_call.py,sha256=qBWTAjEKl35Za4BU-sVRPuTxkgVPRTcefQBynUjGEqI,1626
|
|
28
|
+
flexeval/classes/turn.py,sha256=eN_8mPDJa5x4bGbuiDrEPFbGE6Cs9F4vGoJO17ZaSMI,8771
|
|
29
|
+
flexeval/configuration/__init__.py,sha256=wP_gpYyaEp5DxCSH8-4KHchH07JMZZOk8eCFMfd5LBw,75
|
|
30
|
+
flexeval/configuration/completion_functions.py,sha256=-N0iFAfcYcm35S78M3ES4MBkLXpDeEfy2Qq1ORHGBXE,7491
|
|
31
|
+
flexeval/configuration/evals.yaml,sha256=2cApBbwSQr3C4pil0yfZJRkeWviVwaHH13tLmZNoRaI,21924
|
|
32
|
+
flexeval/configuration/function_metrics.py,sha256=SGCxCAfG5NfKop-d3_uJgF83nPrlfHAhd-TU0GpEPFY,22427
|
|
33
|
+
flexeval/configuration/rubric_metrics.yaml,sha256=JfE6gPj4LtM2v0b5-Zge3NwM17YgJEBZXzTVn9UL7zk,9424
|
|
34
|
+
flexeval/io/__init__.py,sha256=MqdgcPzkFpSnOEz-e2GNNd8XOI_DbyNjIP8AT5eqUqI,101
|
|
35
|
+
flexeval/io/parsers/yaml_parser.py,sha256=2yE6j_RM_YG5nkNUWZckrymh61n28AG46lqnPSlWitk,1818
|
|
36
|
+
flexeval/metrics/__init__.py,sha256=qrgUhTXzezAOoABhck3hMVN-c2Bwn7CTg-e_P2w7PlA,134
|
|
37
|
+
flexeval/metrics/access.py,sha256=mP89IUNTWpHguMEdjjh_deMxdiyClb61hg3k7Jcus-o,1299
|
|
38
|
+
flexeval/metrics/save.py,sha256=nquTUmcUuiCkj5VY0vFonEflo4ZHZN-Xbc_Lvy2AC2k,1837
|
|
39
|
+
flexeval/schema/__init__.py,sha256=4OA6Q7Dguz-uaulwoRsrtaoReFmyNsKqyi_CvfDV4-c,379
|
|
40
|
+
flexeval/schema/config_schema.py,sha256=cH2iE-bj-8Rs1-CEUP-xVn1S0r2wtRmI6kWqfQ4M_Y4,2272
|
|
41
|
+
flexeval/schema/eval_schema.py,sha256=8idEhxogqzUPwojBcfyNIH8yGWX74oa0NhUB3vabwlc,6651
|
|
42
|
+
flexeval/schema/evalrun_schema.py,sha256=nF3GCNlzxhJvu-V2h4-RkX5xWhBA9mQIR_ofR3T6de0,4315
|
|
43
|
+
flexeval/schema/rubric_schema.py,sha256=uxcf7MHWKW3EmABUnWeCinGUP6LBjskiq7zkEPHmAvU,1615
|
|
44
|
+
flexeval/schema/schema_utils.py,sha256=Fg1foqRA-9X-hl_vqIF3bpYdE51hNEgdw739Q-s3iQc,698
|
|
45
|
+
python_flexeval-0.4.0.dist-info/METADATA,sha256=isVFK5bnXc7iBmzvrALqo7OYOYi653_UZOjY_TXBkqE,5599
|
|
46
|
+
python_flexeval-0.4.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
47
|
+
python_flexeval-0.4.0.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
|
|
48
|
+
python_flexeval-0.4.0.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
|
|
49
|
+
python_flexeval-0.4.0.dist-info/RECORD,,
|
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
flexeval/__about__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
|
|
2
|
-
flexeval/__init__.py,sha256=UXI_xdSxnGAK2plDODBbPF3df-N7E9YJ418QHK7XN-Q,391
|
|
3
|
-
flexeval/__main__.py,sha256=c9NQqsea3e-_6b736gBeIO3O_zdXQ1wtY3-Scj5NiPg,126
|
|
4
|
-
flexeval/cli.py,sha256=RwtRk121OivbLQyYpYxJ7PugPIYQ8J4qXHFN2SxxPy4,2985
|
|
5
|
-
flexeval/completions.py,sha256=pi_tYK4m3vKSqAC1ym9Jc3e4srcQSXfx-mX4qI5qisQ,5686
|
|
6
|
-
flexeval/compute_metrics.py,sha256=4X6XFk0qUKcaCDllNeJreuhlnDHmfRPlsf0f8fWFOxA,37277
|
|
7
|
-
flexeval/config.yaml,sha256=dpkFdW0rKf7StGoVeIGaCNw0n0yOfYWig0xmIfsDdbg,530
|
|
8
|
-
flexeval/data_loader.py,sha256=UP-HWqh5o_euqT2GvTbUYmA-yJcbTKtmug4w63w2CbA,26153
|
|
9
|
-
flexeval/db_utils.py,sha256=2jgqexLCAqShvgPrImZz12UkMZtfERhP8iXjratXYok,1612
|
|
10
|
-
flexeval/dependency_graph.py,sha256=SaG9gjkw2Q0NykqQWs4JzPkv5sMj2aXXmhjJ7yRkV4Q,10539
|
|
11
|
-
flexeval/eval_schema.json,sha256=BQetj8O0_4rorj3Mpqk-sj_SCaRkGMrvBUcxhuw6zLE,13111
|
|
12
|
-
flexeval/function_types.py,sha256=eH8NadQRw7XAOXAOKWYN6b7urjr57J5WzdiVyzh0Wb4,6898
|
|
13
|
-
flexeval/helpers.py,sha256=gX-6Hx4_wOiqbfY8c8_kL3XbkdV8mpEjPmaAe44lOSk,1605
|
|
14
|
-
flexeval/log_utils.py,sha256=E3RloPQZbtd8sEIg7mfN5fAku-TeNGqWy03SmwRllIE,923
|
|
15
|
-
flexeval/rubric.py,sha256=UwtJOxIxFJcQVrDXXuCA3tF_FFTcvLPqo2F9lq8gPcM,2167
|
|
16
|
-
flexeval/run_utils.py,sha256=cNFVRsFNYY9gpzbIUc-H4Gk7TWC64GXsYowQHoG7ZVU,2597
|
|
17
|
-
flexeval/runner.py,sha256=X6ZfjfwIM3ymN_kHfRt_JSKPxpDxs_MWQPrvWhl2L7I,4340
|
|
18
|
-
flexeval/classes/__init__.py,sha256=fywDMYX8W-nXFKRXolzn-RWd_7tiJr6FlouQJvYSoyE,347
|
|
19
|
-
flexeval/classes/base.py,sha256=xxkTa8joPe39CFwveeTPW56LW-x7rsi5oBAIxrvM5iI,944
|
|
20
|
-
flexeval/classes/dataset.py,sha256=Y_EdEIuhx526SSvkqk2tFBzkOgBkVY-5FeraYMtU5lo,2913
|
|
21
|
-
flexeval/classes/eval_runner.py,sha256=ZvCpyaD7lorDK_mYJSZqQbvI6FfLbIWRFHNarWTAMQU,6270
|
|
22
|
-
flexeval/classes/eval_set_run.py,sha256=fq_wBOaxuq7dLxiZIw76WGIwhRBNbQWDUhpiK0wDG_A,1116
|
|
23
|
-
flexeval/classes/jsonview.py,sha256=3XJTh46ODfqdNbrXYDEV6kRO8KbeiHJo5pb4aJrbHRY,3459
|
|
24
|
-
flexeval/classes/message.py,sha256=gDejDfaHGQKgS_CpJqjPAVzpiRD2JddKo17Yi1wVeiw,7676
|
|
25
|
-
flexeval/classes/metric.py,sha256=d8l39_QwnQDmTJvy9TIulU4p0jqD7ldMUi4m5zfK2Es,2806
|
|
26
|
-
flexeval/classes/thread.py,sha256=cFQu3Mwzk8-Def8xccB8F6zKv64Srvhz5n83yLELvKo,2922
|
|
27
|
-
flexeval/classes/tool_call.py,sha256=CteT2Hajor0PlHEEn7apfZux5_mremSIDrQmZ0iB7K0,1748
|
|
28
|
-
flexeval/classes/turn.py,sha256=kLmgnYQ-4a8sydzGK1HTQRyUDXZIedmt_NFR3shLJFE,8635
|
|
29
|
-
flexeval/configuration/__init__.py,sha256=wP_gpYyaEp5DxCSH8-4KHchH07JMZZOk8eCFMfd5LBw,75
|
|
30
|
-
flexeval/configuration/completion_functions.py,sha256=-N0iFAfcYcm35S78M3ES4MBkLXpDeEfy2Qq1ORHGBXE,7491
|
|
31
|
-
flexeval/configuration/evals.yaml,sha256=3mbD3gEccTDotm8kj4doYTujqRD_PkGhCVhjQaSEqSs,22651
|
|
32
|
-
flexeval/configuration/function_metrics.py,sha256=SGCxCAfG5NfKop-d3_uJgF83nPrlfHAhd-TU0GpEPFY,22427
|
|
33
|
-
flexeval/configuration/rubric_metrics.yaml,sha256=JfE6gPj4LtM2v0b5-Zge3NwM17YgJEBZXzTVn9UL7zk,9424
|
|
34
|
-
flexeval/io/__init__.py,sha256=MqdgcPzkFpSnOEz-e2GNNd8XOI_DbyNjIP8AT5eqUqI,101
|
|
35
|
-
flexeval/io/parsers/yaml_parser.py,sha256=2yE6j_RM_YG5nkNUWZckrymh61n28AG46lqnPSlWitk,1818
|
|
36
|
-
flexeval/metrics/__init__.py,sha256=qrgUhTXzezAOoABhck3hMVN-c2Bwn7CTg-e_P2w7PlA,134
|
|
37
|
-
flexeval/metrics/access.py,sha256=mP89IUNTWpHguMEdjjh_deMxdiyClb61hg3k7Jcus-o,1299
|
|
38
|
-
flexeval/metrics/save.py,sha256=8x9ifRiHtQT7_WeMP0XmYK1zfourXMnHkGZy_iR0Xcc,1643
|
|
39
|
-
flexeval/schema/__init__.py,sha256=4OA6Q7Dguz-uaulwoRsrtaoReFmyNsKqyi_CvfDV4-c,379
|
|
40
|
-
flexeval/schema/config_schema.py,sha256=LkmtiOLfPsX1u_6Ey6gFbRr8tQwxqcuLcyf-xYcBf9o,1619
|
|
41
|
-
flexeval/schema/eval_schema.py,sha256=iHMbanW4Ef_sp51KiaZKeP3Dn4Z6pWCGa7N2SPvsFK0,6607
|
|
42
|
-
flexeval/schema/evalrun_schema.py,sha256=M7JY01DhlLzwZc2jJTIeGPs9vt6TFMPir51MFhtRllA,3526
|
|
43
|
-
flexeval/schema/rubric_schema.py,sha256=uxcf7MHWKW3EmABUnWeCinGUP6LBjskiq7zkEPHmAvU,1615
|
|
44
|
-
flexeval/schema/schema_utils.py,sha256=Fg1foqRA-9X-hl_vqIF3bpYdE51hNEgdw739Q-s3iQc,698
|
|
45
|
-
python_flexeval-0.3.0.dist-info/METADATA,sha256=xBbeZrF4aEdl94pg-L2P_Di6cxtxA3aZnu6fxFjUf-8,5599
|
|
46
|
-
python_flexeval-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
|
47
|
-
python_flexeval-0.3.0.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
|
|
48
|
-
python_flexeval-0.3.0.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
|
|
49
|
-
python_flexeval-0.3.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|