python-flexeval 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -115,9 +115,9 @@ def get_parent_metrics(all_metrics: dict, child: dict) -> tuple[list, list]:
115
115
  """metrics_graph_ordered_list will be a list of metrics in order in which they should be run
116
116
 
117
117
  This function takes the eval represented by "child" and finds ALL evals in "all_metrics"
118
- that quality as the child's immediate parent
118
+ that qualify as the child's immediate parent
119
119
 
120
- An eval can qualify as a parent by having a matching name, type, context_only
120
+ An eval can qualify as a parent by having a matching name, type, etc.
121
121
  At this point, we won't have enough information to decide whether the child should be run
122
122
  (since the child might have additional requirements on the output of the parent)
123
123
  but this is enough to tell us that the child should be run AFTER the parent.
@@ -145,7 +145,7 @@ def get_parent_metrics(all_metrics: dict, child: dict) -> tuple[list, list]:
145
145
 
146
146
  # if the conditionals are listed in the depends_on entry but don't match...
147
147
  # Only check conditionals that are explicitly specified (not None) in the requirement
148
- conditionals = ["metric_level", "context_only", "name", "kwargs"]
148
+ conditionals = ["metric_level", "name", "kwargs"]
149
149
  for conditional in conditionals:
150
150
  if (
151
151
  conditional in requirement
flexeval/eval_schema.json CHANGED
@@ -76,10 +76,6 @@
76
76
  "description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
77
77
  "additionalProperties": true
78
78
  },
79
- "context_only": {
80
- "type": "boolean",
81
- "description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
82
- },
83
79
  "last_turn_only": {
84
80
  "type": "boolean",
85
81
  "description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."
@@ -108,11 +104,6 @@
108
104
  "description": "What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
109
105
  "default": "Turn"
110
106
  },
111
- "context_only": {
112
- "type": "boolean",
113
- "description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
114
- "default": false
115
- },
116
107
  "last_instance_only": {
117
108
  "type": "boolean",
118
109
  "description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
@@ -143,11 +134,6 @@
143
134
  "description": "What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
144
135
  "default": "Turn"
145
136
  },
146
- "context_only": {
147
- "type": "boolean",
148
- "description": "If true, only the context (that is, the previous messages) will be evaluated, not the current object. Cannot be done with only thread",
149
- "default": false
150
- },
151
137
  "last_instance_only": {
152
138
  "type": "boolean",
153
139
  "description": "If true, the object will only be evaluated if it's the last instance (i.e., turn or message depending on metric_level) in an existing conversation, or if it's a new completion.",
@@ -174,10 +160,6 @@
174
160
  "description": "The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
175
161
  "additionalProperties": true
176
162
  },
177
- "context_only": {
178
- "type": "boolean",
179
- "description": "The context_only value for the dependency. If provided, used to match which evaluation this dependency is for."
180
- },
181
163
  "last_turn_only": {
182
164
  "type": "boolean",
183
165
  "description": "The last_turn_only value for the dependency. If provided, used to match which evaluation this dependency is for."
@@ -102,7 +102,6 @@ def get_function_input(
102
102
  metric_function: Callable,
103
103
  metric_level: eval_schema.MetricLevel,
104
104
  input_object: AnyFunctionObjectInput,
105
- context_only: bool,
106
105
  ) -> AnyFunctionObjectInput | str | dict | list:
107
106
  """Coerce input_object to a type accepted by metric_function at this metric_level.
108
107
 
@@ -110,7 +109,6 @@ def get_function_input(
110
109
  metric_function (Callable): Function to invoke with the returned input.
111
110
  metric_level (eval_schema.MetricLevel): The metric level at which metric_function is being invoked.
112
111
  input_object (AnyFunctionObjectInput): The input_object to be coerced, or passed as-is if accepted by metric_function.
113
- context_only (bool): Determines how strings and lists are converted. See schema documentation.
114
112
 
115
113
  Raises:
116
114
  ValueError: If the function accepts at least one declared type, but
@@ -137,22 +135,13 @@ def get_function_input(
137
135
  elif dict in accepted_parameter_types and metric_level == "ToolCall":
138
136
  return input_object.get_dict_representation()
139
137
  elif list in accepted_parameter_types and metric_level in ["Turn", "Thread"]:
140
- if context_only:
141
- return input_object.get_context()
142
- else:
143
- # this is on a single turn - pass in the parsed list
144
- return input_object.get_content()
138
+ return input_object.get_content()
145
139
  elif str in accepted_parameter_types:
146
140
  if metric_level == "ToolCall":
147
141
  raise ValueError(
148
142
  "Functions that accept strings can't be used for tool calls. Accept a dict (or a flexeval.classes.tool_call.ToolCall) instead."
149
143
  )
150
- if context_only:
151
- # join together all previous turns
152
- return join_all_contents_to_string(input_object.get_context())
153
- else:
154
- # current turn only
155
- return join_all_contents_to_string(input_object.get_content())
144
+ return join_all_contents_to_string(input_object.get_content())
156
145
  else:
157
146
  # the function accepts at least one declared type, but either:
158
147
  # - it's a type we don't support at all e.g. set
flexeval/metrics/save.py CHANGED
@@ -1,25 +1,30 @@
1
1
  import json
2
2
  from typing import Iterable
3
3
 
4
+ from flexeval.classes.dataset import Dataset
5
+ from flexeval.classes.eval_set_run import EvalSetRun
4
6
  from flexeval.classes.metric import Metric
5
7
 
6
8
 
7
- def save_metrics(metrics: Iterable[Metric]):
9
+ def save_metrics(
10
+ metrics: Iterable[Metric], evalsetrun: EvalSetRun, datasets: list[Dataset]
11
+ ):
12
+ # Build a mapping from dataset id to dataset for quick lookup
13
+ dataset_by_id = {d.id: d for d in datasets}
8
14
  for metric in metrics:
9
15
  # TODO - speed this up somehow
10
16
  thread = metric.get("thread")
11
17
  if thread is None:
12
18
  thread = metric[metric["metric_level"].lower()].thread
19
+ # Determine the dataset from the metric's object
20
+ metric_object = metric[metric["metric_level"].lower()]
21
+ dataset = dataset_by_id.get(metric_object.dataset_id)
13
22
  Metric.create(
14
23
  message=metric.get("message", None),
15
24
  turn=metric.get("turn", None),
16
25
  toolcall=metric.get("toolcall", None),
17
- evalsetrun=metric[
18
- metric["metric_level"].lower()
19
- ].evalsetrun, # metric["turn"].evalsetrun,
20
- dataset=metric[
21
- metric["metric_level"].lower()
22
- ].dataset, # metric["turn"].dataset,
26
+ evalsetrun=evalsetrun,
27
+ dataset=dataset,
23
28
  thread=thread,
24
29
  evaluation_name=metric["evaluation_name"],
25
30
  evaluation_type=metric["evaluation_type"],
@@ -28,7 +33,6 @@ def save_metrics(metrics: Iterable[Metric]):
28
33
  metric_level=metric["metric_level"],
29
34
  kwargs=metric["kwargs"],
30
35
  depends_on=json.dumps(metric["depends_on"]),
31
- context_only=metric.get("context_only", False),
32
36
  source=metric["source"],
33
37
  rubric_prompt=metric.get("rubric_prompt", None),
34
38
  rubric_completion=metric.get("rubric_completion", None),
flexeval/run_utils.py CHANGED
@@ -6,7 +6,9 @@ import logging
6
6
  from flexeval import rubric
7
7
  from flexeval.classes.dataset import Dataset
8
8
  from flexeval.classes.eval_runner import EvalRunner
9
- from flexeval.classes.eval_set_run import EvalSetRun
9
+ from flexeval.classes.eval_set_run import EvalSetRun, EvalSetRunDatasets
10
+ from flexeval.schema import evalrun_schema
11
+ from flexeval import data_loader
10
12
 
11
13
  logger = logging.getLogger(__name__)
12
14
 
@@ -16,17 +18,11 @@ def build_eval_set_run(runner: EvalRunner) -> EvalSetRun:
16
18
 
17
19
  # TODO this code uses a model_name that does not appear in the Eval schema; should look into this
18
20
  model_name = json.dumps(None)
19
- # model_name = json.dumps(
20
- # runner.eval.get("completion_llm", {}).get("model_name", None)
21
- # )
22
21
  evalsetrun = EvalSetRun.create(
23
22
  name=runner.evalrun.eval.name,
24
23
  notes=runner.evalrun.eval.notes,
25
24
  metrics=runner.evalrun.eval.metrics.model_dump_json(),
26
25
  metrics_graph_ordered_list=json.dumps(runner.metrics_graph_ordered_list),
27
- dataset_files=json.dumps(
28
- [str(data_source.path) for data_source in runner.evalrun.data_sources]
29
- ),
30
26
  do_completion=runner.evalrun.eval.do_completion,
31
27
  completion_llm=(
32
28
  runner.evalrun.eval.completion_llm.model_dump_json()
@@ -51,15 +47,165 @@ def build_eval_set_run(runner: EvalRunner) -> EvalSetRun:
51
47
  return evalsetrun
52
48
 
53
49
 
54
- def build_datasets(runner: EvalRunner, evalsetrun: EvalSetRun):
55
- for filename in evalsetrun.get_datasets():
56
- # these will automatically be saved as a property of evalsetrun
57
- Dataset.create(
58
- evalsetrun=evalsetrun,
59
- filename=filename,
60
- max_n_conversation_threads=runner.evalrun.config.max_n_conversation_threads,
61
- nb_evaluations_per_thread=runner.evalrun.config.nb_evaluations_per_thread,
50
+ def find_dataset_by_name(name: str) -> Dataset | None:
51
+ """Return the loaded Dataset with this name, or None if no such dataset exists.
52
+
53
+ If a Dataset with this name exists but is not marked is_loaded (the remnant
54
+ of a crashed prior load), it is treated as stale: cleaned up via
55
+ :func:`_cleanup_stale_dataset` and None is returned, so the caller can
56
+ proceed as if no dataset existed.
57
+
58
+ Raises:
59
+ ValueError: If more than one Dataset has this name, or if a stale
60
+ unloaded Dataset has derived rows (metrics or eval-run links) that
61
+ suggest a genuine integrity problem — see _cleanup_stale_dataset.
62
+ """
63
+ # LIMIT 2: we only need to know 0, 1, or >1
64
+ results = list(Dataset.select().where(Dataset.name == name).limit(2))
65
+ if len(results) == 0:
66
+ return None
67
+ if len(results) > 1:
68
+ raise ValueError(f"Multiple datasets with name '{name}'.")
69
+ dataset = results[0]
70
+ if not dataset.is_loaded:
71
+ _cleanup_stale_dataset(dataset)
72
+ return None
73
+ return dataset
74
+
75
+
76
+ def _cleanup_stale_dataset(dataset: Dataset) -> None:
77
+ """Delete a partially-loaded Dataset and its child rows.
78
+
79
+ A Dataset with ``is_loaded=False`` is the remnant of a prior load that
80
+ crashed between the Dataset row being committed and the final
81
+ ``is_loaded=True`` save — its Thread/Turn/Message/ToolCall rows (if any)
82
+ are partial and unusable.
83
+
84
+ Derived rows (Metric, EvalSetRunDatasets) should never exist for an
85
+ unloaded Dataset — they're only created after a successful load. If they
86
+ do, something bypassed the normal flow and we refuse to touch it.
87
+ """
88
+ if dataset.metrics_list.exists() or dataset.evalsetrun_links.exists():
89
+ raise ValueError(
90
+ f"Dataset '{dataset.name}' (ID={dataset.id}) has is_loaded=False but "
91
+ "has metrics or eval-run links — refusing to clean up (possible integrity error)."
62
92
  )
63
- runner.logger.info(
64
- f"Created dataset from '{filename}'. Max number of conversation threads: '{runner.evalrun.config.max_n_conversation_threads}' - Nb of evaluations per thread: '{runner.evalrun.config.nb_evaluations_per_thread}'"
93
+ counts = {
94
+ "threads": dataset.threads.count(),
95
+ "turns": dataset.turns.count(),
96
+ "messages": dataset.messages.count(),
97
+ "toolcalls": dataset.toolcalls.count(),
98
+ }
99
+ logger.warning(
100
+ f"Dropping unloaded dataset '{dataset.name}' (ID={dataset.id}); "
101
+ f"partial rows from a prior failed load: {counts}. Reloading from scratch."
102
+ )
103
+ dataset.delete_instance(recursive=True)
104
+
105
+
106
+ def create_dataset(data_source: evalrun_schema.DataSource) -> Dataset:
107
+ dataset = Dataset.create(
108
+ datasource_type=type(data_source).__name__,
109
+ name=data_source.name,
110
+ notes=data_source.notes,
111
+ )
112
+ return dataset
113
+
114
+
115
+ def load_datasets(
116
+ evalrun: evalrun_schema.EvalRun,
117
+ ) -> list[Dataset]:
118
+ datasets = []
119
+ config = evalrun.config
120
+ for data_source in evalrun.data_sources:
121
+ datasource_type = type(data_source).__name__
122
+
123
+ # Auto-name unnamed IterableDataSources so same-instance reuse works
124
+ if (
125
+ isinstance(data_source, evalrun_schema.IterableDataSource)
126
+ and not data_source.name
127
+ ):
128
+ data_source.name = f"_iterable_{id(data_source)}"
129
+
130
+ # 1. Validate naming constraints
131
+ if config.raise_on_unnamed_dataset and (
132
+ data_source.name is None or data_source.name.strip() == ""
133
+ ):
134
+ raise ValueError(
135
+ f"Configuration requires named datasets, but a {datasource_type} was unnamed."
136
+ )
137
+
138
+ # 2. Look up existing dataset by name (if named)
139
+ existing_dataset = None
140
+ if data_source.name:
141
+ existing_dataset = find_dataset_by_name(data_source.name)
142
+
143
+ # 3. Dispatch by DataSource type
144
+ if isinstance(data_source, evalrun_schema.NamedDataSource):
145
+ # NamedDataSource MUST match an existing dataset
146
+ if existing_dataset is None:
147
+ raise ValueError(
148
+ f"NamedDataSource requires an existing dataset with name '{data_source.name}', but none was found."
149
+ )
150
+ dataset = existing_dataset
151
+
152
+ elif isinstance(
153
+ data_source,
154
+ (evalrun_schema.FileDataSource, evalrun_schema.IterableDataSource),
155
+ ):
156
+ # Reuse if configured and existing dataset matches (checked first, takes priority)
157
+ if config.reuse_dataset_by_name and existing_dataset is not None:
158
+ if existing_dataset.datasource_type != datasource_type:
159
+ logger.warning(
160
+ f"Reusing dataset '{existing_dataset.name}' (ID={existing_dataset.id}) "
161
+ f"but datasource type differs: existing={existing_dataset.datasource_type}, new={datasource_type}."
162
+ )
163
+ logger.info(
164
+ f"Reusing existing dataset '{existing_dataset.name}' (ID={existing_dataset.id})."
165
+ )
166
+ dataset = existing_dataset
167
+ else:
168
+ # Check for duplicate name conflict (only when not reusing)
169
+ if (
170
+ config.raise_on_duplicate_dataset_name
171
+ and existing_dataset is not None
172
+ ):
173
+ raise ValueError(
174
+ f"Configuration requires unique dataset names, but '{data_source.name}' already exists (ID={existing_dataset.id})."
175
+ )
176
+ # Create and load new dataset
177
+ dataset = create_dataset(data_source)
178
+ if isinstance(data_source, evalrun_schema.IterableDataSource):
179
+ data_loader.load_iterable(dataset, data_source.contents)
180
+ elif isinstance(data_source, evalrun_schema.FileDataSource):
181
+ data_loader.load_file(
182
+ dataset,
183
+ data_source,
184
+ max_n_conversation_threads=config.max_n_conversation_threads,
185
+ nb_evaluations_per_thread=config.nb_evaluations_per_thread,
186
+ )
187
+ dataset.metadata_dict["imported_path"] = str(data_source.path)
188
+ dataset.metadata_dict["imported_format"] = data_source.format.value
189
+ dataset.is_loaded = True
190
+ dataset.save()
191
+ else:
192
+ raise ValueError(f"Unsupported DataSource type: {datasource_type}")
193
+
194
+ datasets.append(dataset)
195
+ return datasets
196
+
197
+
198
+ def set_datasets_for_evalsetrun(datasets: list[Dataset], evalsetrun: EvalSetRun):
199
+ for dataset in datasets:
200
+ EvalSetRunDatasets.create(
201
+ evalsetrun=evalsetrun,
202
+ dataset=dataset,
65
203
  )
204
+
205
+
206
+ def build_evalsetrun_datasets(
207
+ evalrun: evalrun_schema.EvalRun, evalsetrun: EvalSetRun
208
+ ) -> list[Dataset]:
209
+ datasets = load_datasets(evalrun)
210
+ set_datasets_for_evalsetrun(datasets, evalsetrun)
211
+ return datasets
flexeval/runner.py CHANGED
@@ -86,26 +86,18 @@ def run(eval_run: EvalRun) -> EvalRunner:
86
86
  rd.seed(rd_seed)
87
87
  runner.logger.info(f"Set random seed to '{rd_seed}'.")
88
88
 
89
- run_utils.build_datasets(runner, evalsetrun)
90
- except Exception:
91
- runner.logger.exception(
92
- "An error occurred creating dataset metadata.", exc_info=True
93
- )
94
-
95
- try:
96
- runner.logger.info("Parsing data files")
97
- for dataset in evalsetrun.datasets:
98
- runner.logger.debug(f"Loading data from '{dataset.filename}'.")
99
- dataset.load_data()
89
+ datasets = run_utils.build_evalsetrun_datasets(runner.evalrun, evalsetrun)
100
90
  except Exception:
101
91
  runner.logger.exception("An error occurred loading data.", exc_info=True)
92
+ runner.shutdown_logging()
93
+ raise
102
94
 
103
95
  # Do completions, if necessary
104
96
  try:
105
97
  if evalsetrun.do_completion:
106
98
  # We do this by creating new turns
107
99
  runner.logger.info("Generating completions")
108
- completions.get_completions(eval_run, evalsetrun)
100
+ completions.get_completions(eval_run, evalsetrun, datasets)
109
101
  except Exception:
110
102
  runner.logger.exception(
111
103
  "An error occurred generating completions.", exc_info=True
@@ -118,9 +110,9 @@ def run(eval_run: EvalRun) -> EvalRunner:
118
110
  ################# Compute Metrics ###################
119
111
  #######################################################
120
112
  try:
121
- metrics = compute_metrics.compute_metrics(eval_run, evalsetrun)
113
+ metrics = compute_metrics.compute_metrics(eval_run, evalsetrun, datasets)
122
114
  runner.logger.info(f"Saving '{len(metrics)}' metrics to database.")
123
- flexeval.metrics.save.save_metrics(metrics)
115
+ flexeval.metrics.save.save_metrics(metrics, evalsetrun, datasets)
124
116
  except Exception:
125
117
  runner.logger.exception("An error occurred computing metrics.", exc_info=True)
126
118
  if eval_run.config.raise_on_metric_error:
@@ -44,3 +44,15 @@ class Config(BaseModel):
44
44
  False,
45
45
  description="If False (default), no exception will be thrown if a metric function raises an exception.",
46
46
  )
47
+ raise_on_duplicate_dataset_name: bool = Field(
48
+ False,
49
+ description="If True, throw an exception if two datasets would be created with the same name. Ignored when reuse_dataset_by_name is True.",
50
+ )
51
+ raise_on_unnamed_dataset: bool = Field(
52
+ False,
53
+ description="If True, throw an exception if any dataset is unnamed.",
54
+ )
55
+ reuse_dataset_by_name: bool = Field(
56
+ True,
57
+ description="If True (default), reuse a previously loaded dataset with the same name instead of creating a new one. This avoids redundant data loading and prevents iterator-based data sources from being consumed twice.",
58
+ )
@@ -60,6 +60,9 @@ class DependsOnItem(BaseModel):
60
60
  class MetricItem(BaseModel):
61
61
  "Defines a metric."
62
62
 
63
+ class Config:
64
+ extra = "forbid"
65
+
63
66
  name: str = Field(
64
67
  ...,
65
68
  description="The function to call or name of rubric to use to compute this metric.",
@@ -1,39 +1,70 @@
1
1
  """The top-level :class:`~flexeval.schema.evalrun_schema.EvalRun` schema and associated sub-schema."""
2
2
 
3
+ import enum
3
4
  from pathlib import Path
4
- from typing import Annotated, Callable, Iterable, Literal
5
+ from typing import Annotated, Callable, Iterable, Literal, Union
5
6
 
6
7
  from annotated_types import Len
7
- from pydantic import BaseModel, Field, FilePath
8
+ from pydantic import BaseModel, Discriminator, Field, FilePath, Tag
8
9
 
9
10
  from flexeval.configuration import function_metrics
10
11
  from flexeval.schema import config_schema, eval_schema, rubric_schema, schema_utils
11
12
 
12
13
 
13
14
  class DataSource(BaseModel):
14
- # TODO support more generic DataSource interface
15
- # for now, we need to use FileDataSource because we path the JSONL paths along
16
- name: str | None = Field(None, description="")
17
- notes: str | None = Field(None, description="")
15
+ """Represents a source of data that can be used in evaluations."""
16
+
17
+ name: str | None = Field(
18
+ None, description="Used as metadata. No uniqueness requirement."
19
+ )
20
+ notes: str | None = Field(
21
+ None, description="Used as metadata; put whatever you want here."
22
+ )
23
+
24
+
25
+ class NamedDataSource(DataSource):
26
+ """Look up a previously loaded DataSource by name. Must have a unique name."""
27
+
28
+ type: Literal["named"] = "named"
29
+ name: str = Field(description="The name to match on.")
18
30
 
19
31
 
20
32
  class IterableDataSource(DataSource):
21
- """Not yet implemented."""
33
+ """Iterable of data items."""
22
34
 
35
+ type: Literal["iterable"] = "iterable"
23
36
  contents: Iterable = Field(
24
37
  default_factory=list,
25
- description="Iterable of data items, presumably in the jsonl format (for now).",
38
+ description="Iterable of data items. For now, each item must be a dictionary with role and content keys.",
26
39
  )
27
40
 
28
41
 
42
+ class FileFormatEnum(str, enum.Enum):
43
+ jsonl = "jsonl"
44
+ langgraph_sqlite = "langgraph_sqlite"
45
+
46
+
29
47
  class FileDataSource(DataSource):
30
48
  """File to be used as a data source."""
31
49
 
50
+ type: Literal["file"] = "file"
32
51
  # TODO in the future, we could use cloudpathlib to support cloud paths
33
52
  path: FilePath = Field(
34
53
  description="Absolute or relative path to data file. Each file must be in jsonl format, with one conversation per line."
35
54
  )
36
- format: Literal["jsonl"] = Field("jsonl", description="Format of the data file.")
55
+ format: FileFormatEnum = Field(
56
+ FileFormatEnum.jsonl, description="Format of the data file. Default: JSONL"
57
+ )
58
+
59
+
60
+ DataSourceType = Annotated[
61
+ Union[
62
+ Annotated[NamedDataSource, Tag("named")],
63
+ Annotated[FileDataSource, Tag("file")],
64
+ Annotated[IterableDataSource, Tag("iterable")],
65
+ ],
66
+ Discriminator("type"),
67
+ ]
37
68
 
38
69
 
39
70
  class FunctionsCollection(BaseModel):
@@ -68,7 +99,7 @@ class EvalRun(BaseModel):
68
99
 
69
100
  Read more in the :ref:`user_guide`."""
70
101
 
71
- data_sources: Annotated[list[FileDataSource], Len(min_length=1)] = Field(
102
+ data_sources: Annotated[list[DataSourceType], Len(min_length=1)] = Field(
72
103
  description="List of data sources.",
73
104
  )
74
105
  database_path: Path = Field(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-flexeval
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
5
5
  Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
6
6
  Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
@@ -21,8 +21,8 @@ Requires-Dist: flatten-json>=0.1.14
21
21
  Requires-Dist: jsonschema>=4.23.0
22
22
  Requires-Dist: langchain-openai>=0.3.8
23
23
  Requires-Dist: langchain>=0.3.20
24
- Requires-Dist: langgraph-checkpoint-sqlite>=2.0.6
25
- Requires-Dist: langgraph>=0.3.6
24
+ Requires-Dist: langgraph-checkpoint-sqlite>=3.0.0
25
+ Requires-Dist: langgraph>=1.0.0
26
26
  Requires-Dist: litellm>=1.74.3
27
27
  Requires-Dist: msgpack>=1.1.0
28
28
  Requires-Dist: networkx>=3.4.2
@@ -0,0 +1,49 @@
1
+ flexeval/__about__.py,sha256=42STGor_9nKYXumfeV5tiyD_M8VdcddX7CEexmibPBk,22
2
+ flexeval/__init__.py,sha256=UXI_xdSxnGAK2plDODBbPF3df-N7E9YJ418QHK7XN-Q,391
3
+ flexeval/__main__.py,sha256=c9NQqsea3e-_6b736gBeIO3O_zdXQ1wtY3-Scj5NiPg,126
4
+ flexeval/cli.py,sha256=RwtRk121OivbLQyYpYxJ7PugPIYQ8J4qXHFN2SxxPy4,2985
5
+ flexeval/completions.py,sha256=8PwpWXawARiSngeE2bRzTRXmPyXmxUjPKNFv4zCuAzE,5731
6
+ flexeval/compute_metrics.py,sha256=SNhPpe5ol7Cqr2kjaBdeTIWIYqVlGjd9ZVDl9Qq90y0,37636
7
+ flexeval/config.yaml,sha256=dpkFdW0rKf7StGoVeIGaCNw0n0yOfYWig0xmIfsDdbg,530
8
+ flexeval/data_loader.py,sha256=jptI0tG2YYk40xNYiZzfSqcqmYzw9pIBt_rFtpw3T4o,17099
9
+ flexeval/db_utils.py,sha256=xz97uZbUMQaTyGoR-7lKrMDs8SGdHy09SCvfCkxB36A,1687
10
+ flexeval/dependency_graph.py,sha256=dUQp0WQ9G2FskorUMLYOKFQ9_JwIrMR_DpVrqh4n0xg,10515
11
+ flexeval/eval_schema.json,sha256=pAS3vPLBEyH3Yjglos6aB0aNMTEUFbV-3Rf6wuSVtR4,11881
12
+ flexeval/function_types.py,sha256=rz8AcsHJOkFfAEEocN3HX5EgEh70Oze5dSlNMdaihVU,6420
13
+ flexeval/helpers.py,sha256=gX-6Hx4_wOiqbfY8c8_kL3XbkdV8mpEjPmaAe44lOSk,1605
14
+ flexeval/log_utils.py,sha256=E3RloPQZbtd8sEIg7mfN5fAku-TeNGqWy03SmwRllIE,923
15
+ flexeval/rubric.py,sha256=UwtJOxIxFJcQVrDXXuCA3tF_FFTcvLPqo2F9lq8gPcM,2167
16
+ flexeval/run_utils.py,sha256=z9ISQlthcLUUsGiIyaGHI1IwICBy_JLN_Efg4TNv8Mk,8536
17
+ flexeval/runner.py,sha256=RuQYQgafD0p4qlVK-IxDRKBJPic40YJItrJg8-M9Shw,4110
18
+ flexeval/classes/__init__.py,sha256=fywDMYX8W-nXFKRXolzn-RWd_7tiJr6FlouQJvYSoyE,347
19
+ flexeval/classes/base.py,sha256=xxkTa8joPe39CFwveeTPW56LW-x7rsi5oBAIxrvM5iI,944
20
+ flexeval/classes/dataset.py,sha256=10t4_1Jyg9rYe00VqOYt_biQYnSZrKbJ3nEIRMCF4O8,656
21
+ flexeval/classes/eval_runner.py,sha256=ZvCpyaD7lorDK_mYJSZqQbvI6FfLbIWRFHNarWTAMQU,6270
22
+ flexeval/classes/eval_set_run.py,sha256=n15zMu-KANEDc2K3sqs-KEI12bWpkhSrF0EkEAiBPV4,1449
23
+ flexeval/classes/jsonview.py,sha256=9HQfEY7BH9D58EnR4N9R5oMQsCMjJxsMcPHdzOBLj2w,3773
24
+ flexeval/classes/message.py,sha256=fiW0JhXKt5IiLw7zA4XVKjpY1rObVGvoBtUTXjOXWhs,7741
25
+ flexeval/classes/metric.py,sha256=yXwRx8ECsEYXKg24r0Y0e8B81XGZma1xOcYp4Zi86pM,2109
26
+ flexeval/classes/thread.py,sha256=3gwiLwe3xP0atzsyCG3SKd2G3QtY21vk1Gif4p9ZwI8,2802
27
+ flexeval/classes/tool_call.py,sha256=qBWTAjEKl35Za4BU-sVRPuTxkgVPRTcefQBynUjGEqI,1626
28
+ flexeval/classes/turn.py,sha256=eN_8mPDJa5x4bGbuiDrEPFbGE6Cs9F4vGoJO17ZaSMI,8771
29
+ flexeval/configuration/__init__.py,sha256=wP_gpYyaEp5DxCSH8-4KHchH07JMZZOk8eCFMfd5LBw,75
30
+ flexeval/configuration/completion_functions.py,sha256=-N0iFAfcYcm35S78M3ES4MBkLXpDeEfy2Qq1ORHGBXE,7491
31
+ flexeval/configuration/evals.yaml,sha256=2cApBbwSQr3C4pil0yfZJRkeWviVwaHH13tLmZNoRaI,21924
32
+ flexeval/configuration/function_metrics.py,sha256=SGCxCAfG5NfKop-d3_uJgF83nPrlfHAhd-TU0GpEPFY,22427
33
+ flexeval/configuration/rubric_metrics.yaml,sha256=JfE6gPj4LtM2v0b5-Zge3NwM17YgJEBZXzTVn9UL7zk,9424
34
+ flexeval/io/__init__.py,sha256=MqdgcPzkFpSnOEz-e2GNNd8XOI_DbyNjIP8AT5eqUqI,101
35
+ flexeval/io/parsers/yaml_parser.py,sha256=2yE6j_RM_YG5nkNUWZckrymh61n28AG46lqnPSlWitk,1818
36
+ flexeval/metrics/__init__.py,sha256=qrgUhTXzezAOoABhck3hMVN-c2Bwn7CTg-e_P2w7PlA,134
37
+ flexeval/metrics/access.py,sha256=mP89IUNTWpHguMEdjjh_deMxdiyClb61hg3k7Jcus-o,1299
38
+ flexeval/metrics/save.py,sha256=nquTUmcUuiCkj5VY0vFonEflo4ZHZN-Xbc_Lvy2AC2k,1837
39
+ flexeval/schema/__init__.py,sha256=4OA6Q7Dguz-uaulwoRsrtaoReFmyNsKqyi_CvfDV4-c,379
40
+ flexeval/schema/config_schema.py,sha256=cH2iE-bj-8Rs1-CEUP-xVn1S0r2wtRmI6kWqfQ4M_Y4,2272
41
+ flexeval/schema/eval_schema.py,sha256=8idEhxogqzUPwojBcfyNIH8yGWX74oa0NhUB3vabwlc,6651
42
+ flexeval/schema/evalrun_schema.py,sha256=nF3GCNlzxhJvu-V2h4-RkX5xWhBA9mQIR_ofR3T6de0,4315
43
+ flexeval/schema/rubric_schema.py,sha256=uxcf7MHWKW3EmABUnWeCinGUP6LBjskiq7zkEPHmAvU,1615
44
+ flexeval/schema/schema_utils.py,sha256=Fg1foqRA-9X-hl_vqIF3bpYdE51hNEgdw739Q-s3iQc,698
45
+ python_flexeval-0.4.0.dist-info/METADATA,sha256=isVFK5bnXc7iBmzvrALqo7OYOYi653_UZOjY_TXBkqE,5599
46
+ python_flexeval-0.4.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
47
+ python_flexeval-0.4.0.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
48
+ python_flexeval-0.4.0.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
49
+ python_flexeval-0.4.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.27.0
2
+ Generator: hatchling 1.29.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
@@ -1,49 +0,0 @@
1
- flexeval/__about__.py,sha256=VrXpHDu3erkzwl_WXrqINBm9xWkcyUy53IQOj042dOs,22
2
- flexeval/__init__.py,sha256=UXI_xdSxnGAK2plDODBbPF3df-N7E9YJ418QHK7XN-Q,391
3
- flexeval/__main__.py,sha256=c9NQqsea3e-_6b736gBeIO3O_zdXQ1wtY3-Scj5NiPg,126
4
- flexeval/cli.py,sha256=RwtRk121OivbLQyYpYxJ7PugPIYQ8J4qXHFN2SxxPy4,2985
5
- flexeval/completions.py,sha256=pi_tYK4m3vKSqAC1ym9Jc3e4srcQSXfx-mX4qI5qisQ,5686
6
- flexeval/compute_metrics.py,sha256=4X6XFk0qUKcaCDllNeJreuhlnDHmfRPlsf0f8fWFOxA,37277
7
- flexeval/config.yaml,sha256=dpkFdW0rKf7StGoVeIGaCNw0n0yOfYWig0xmIfsDdbg,530
8
- flexeval/data_loader.py,sha256=UP-HWqh5o_euqT2GvTbUYmA-yJcbTKtmug4w63w2CbA,26153
9
- flexeval/db_utils.py,sha256=2jgqexLCAqShvgPrImZz12UkMZtfERhP8iXjratXYok,1612
10
- flexeval/dependency_graph.py,sha256=SaG9gjkw2Q0NykqQWs4JzPkv5sMj2aXXmhjJ7yRkV4Q,10539
11
- flexeval/eval_schema.json,sha256=BQetj8O0_4rorj3Mpqk-sj_SCaRkGMrvBUcxhuw6zLE,13111
12
- flexeval/function_types.py,sha256=eH8NadQRw7XAOXAOKWYN6b7urjr57J5WzdiVyzh0Wb4,6898
13
- flexeval/helpers.py,sha256=gX-6Hx4_wOiqbfY8c8_kL3XbkdV8mpEjPmaAe44lOSk,1605
14
- flexeval/log_utils.py,sha256=E3RloPQZbtd8sEIg7mfN5fAku-TeNGqWy03SmwRllIE,923
15
- flexeval/rubric.py,sha256=UwtJOxIxFJcQVrDXXuCA3tF_FFTcvLPqo2F9lq8gPcM,2167
16
- flexeval/run_utils.py,sha256=cNFVRsFNYY9gpzbIUc-H4Gk7TWC64GXsYowQHoG7ZVU,2597
17
- flexeval/runner.py,sha256=X6ZfjfwIM3ymN_kHfRt_JSKPxpDxs_MWQPrvWhl2L7I,4340
18
- flexeval/classes/__init__.py,sha256=fywDMYX8W-nXFKRXolzn-RWd_7tiJr6FlouQJvYSoyE,347
19
- flexeval/classes/base.py,sha256=xxkTa8joPe39CFwveeTPW56LW-x7rsi5oBAIxrvM5iI,944
20
- flexeval/classes/dataset.py,sha256=Y_EdEIuhx526SSvkqk2tFBzkOgBkVY-5FeraYMtU5lo,2913
21
- flexeval/classes/eval_runner.py,sha256=ZvCpyaD7lorDK_mYJSZqQbvI6FfLbIWRFHNarWTAMQU,6270
22
- flexeval/classes/eval_set_run.py,sha256=fq_wBOaxuq7dLxiZIw76WGIwhRBNbQWDUhpiK0wDG_A,1116
23
- flexeval/classes/jsonview.py,sha256=3XJTh46ODfqdNbrXYDEV6kRO8KbeiHJo5pb4aJrbHRY,3459
24
- flexeval/classes/message.py,sha256=gDejDfaHGQKgS_CpJqjPAVzpiRD2JddKo17Yi1wVeiw,7676
25
- flexeval/classes/metric.py,sha256=d8l39_QwnQDmTJvy9TIulU4p0jqD7ldMUi4m5zfK2Es,2806
26
- flexeval/classes/thread.py,sha256=cFQu3Mwzk8-Def8xccB8F6zKv64Srvhz5n83yLELvKo,2922
27
- flexeval/classes/tool_call.py,sha256=CteT2Hajor0PlHEEn7apfZux5_mremSIDrQmZ0iB7K0,1748
28
- flexeval/classes/turn.py,sha256=kLmgnYQ-4a8sydzGK1HTQRyUDXZIedmt_NFR3shLJFE,8635
29
- flexeval/configuration/__init__.py,sha256=wP_gpYyaEp5DxCSH8-4KHchH07JMZZOk8eCFMfd5LBw,75
30
- flexeval/configuration/completion_functions.py,sha256=-N0iFAfcYcm35S78M3ES4MBkLXpDeEfy2Qq1ORHGBXE,7491
31
- flexeval/configuration/evals.yaml,sha256=3mbD3gEccTDotm8kj4doYTujqRD_PkGhCVhjQaSEqSs,22651
32
- flexeval/configuration/function_metrics.py,sha256=SGCxCAfG5NfKop-d3_uJgF83nPrlfHAhd-TU0GpEPFY,22427
33
- flexeval/configuration/rubric_metrics.yaml,sha256=JfE6gPj4LtM2v0b5-Zge3NwM17YgJEBZXzTVn9UL7zk,9424
34
- flexeval/io/__init__.py,sha256=MqdgcPzkFpSnOEz-e2GNNd8XOI_DbyNjIP8AT5eqUqI,101
35
- flexeval/io/parsers/yaml_parser.py,sha256=2yE6j_RM_YG5nkNUWZckrymh61n28AG46lqnPSlWitk,1818
36
- flexeval/metrics/__init__.py,sha256=qrgUhTXzezAOoABhck3hMVN-c2Bwn7CTg-e_P2w7PlA,134
37
- flexeval/metrics/access.py,sha256=mP89IUNTWpHguMEdjjh_deMxdiyClb61hg3k7Jcus-o,1299
38
- flexeval/metrics/save.py,sha256=8x9ifRiHtQT7_WeMP0XmYK1zfourXMnHkGZy_iR0Xcc,1643
39
- flexeval/schema/__init__.py,sha256=4OA6Q7Dguz-uaulwoRsrtaoReFmyNsKqyi_CvfDV4-c,379
40
- flexeval/schema/config_schema.py,sha256=LkmtiOLfPsX1u_6Ey6gFbRr8tQwxqcuLcyf-xYcBf9o,1619
41
- flexeval/schema/eval_schema.py,sha256=iHMbanW4Ef_sp51KiaZKeP3Dn4Z6pWCGa7N2SPvsFK0,6607
42
- flexeval/schema/evalrun_schema.py,sha256=M7JY01DhlLzwZc2jJTIeGPs9vt6TFMPir51MFhtRllA,3526
43
- flexeval/schema/rubric_schema.py,sha256=uxcf7MHWKW3EmABUnWeCinGUP6LBjskiq7zkEPHmAvU,1615
44
- flexeval/schema/schema_utils.py,sha256=Fg1foqRA-9X-hl_vqIF3bpYdE51hNEgdw739Q-s3iQc,698
45
- python_flexeval-0.3.0.dist-info/METADATA,sha256=xBbeZrF4aEdl94pg-L2P_Di6cxtxA3aZnu6fxFjUf-8,5599
46
- python_flexeval-0.3.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
47
- python_flexeval-0.3.0.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
48
- python_flexeval-0.3.0.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
49
- python_flexeval-0.3.0.dist-info/RECORD,,