python-flexeval 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. flexeval/__init__.py +11 -0
  2. flexeval/__main__.py +11 -0
  3. flexeval/classes/__init__.py +15 -0
  4. flexeval/classes/base.py +32 -0
  5. flexeval/classes/dataset.py +82 -0
  6. flexeval/classes/eval_runner.py +158 -0
  7. flexeval/classes/eval_set_run.py +32 -0
  8. flexeval/classes/message.py +183 -0
  9. flexeval/classes/metric.py +55 -0
  10. flexeval/classes/thread.py +79 -0
  11. flexeval/classes/tool_call.py +51 -0
  12. flexeval/classes/turn.py +206 -0
  13. flexeval/cli.py +104 -0
  14. flexeval/completions.py +147 -0
  15. flexeval/compute_metrics.py +788 -0
  16. flexeval/config.yaml +23 -0
  17. flexeval/configuration/__init__.py +1 -0
  18. flexeval/configuration/completion_functions.py +231 -0
  19. flexeval/configuration/evals.yaml +864 -0
  20. flexeval/configuration/function_metrics.py +650 -0
  21. flexeval/configuration/rubric_metrics.yaml +194 -0
  22. flexeval/data_loader.py +513 -0
  23. flexeval/db_utils.py +38 -0
  24. flexeval/dependency_graph.py +234 -0
  25. flexeval/eval_schema.json +256 -0
  26. flexeval/function_types.py +173 -0
  27. flexeval/helpers.py +52 -0
  28. flexeval/io/__init__.py +1 -0
  29. flexeval/io/parsers/yaml_parser.py +69 -0
  30. flexeval/log_utils.py +34 -0
  31. flexeval/metrics/__init__.py +8 -0
  32. flexeval/metrics/access.py +28 -0
  33. flexeval/metrics/save.py +39 -0
  34. flexeval/rubric.py +62 -0
  35. flexeval/run_utils.py +65 -0
  36. flexeval/runner.py +132 -0
  37. flexeval/schema/__init__.py +11 -0
  38. flexeval/schema/config_schema.py +46 -0
  39. flexeval/schema/eval_schema.py +163 -0
  40. flexeval/schema/evalrun_schema.py +97 -0
  41. flexeval/schema/rubric_schema.py +40 -0
  42. flexeval/schema/schema_utils.py +26 -0
  43. python_flexeval-0.1.5.dist-info/METADATA +118 -0
  44. python_flexeval-0.1.5.dist-info/RECORD +47 -0
  45. python_flexeval-0.1.5.dist-info/WHEEL +4 -0
  46. python_flexeval-0.1.5.dist-info/entry_points.txt +2 -0
  47. python_flexeval-0.1.5.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,163 @@
1
+ # originally generated by datamodel-codegen:
2
+ # filename: src/flexeval/eval_schema.json
3
+ # timestamp: 2025-05-19T21:42:39+00:00
4
+
5
+ from __future__ import annotations
6
+
7
+ import sys
8
+ from typing import Any, Dict, List, Literal, Optional
9
+
10
+ from pydantic import BaseModel, Field
11
+
12
+ from flexeval.schema import schema_utils
13
+
14
# Runtime list of the metric granularity names accepted by the schema.
# Keep in sync with ``MetricLevel`` below, which repeats the same values.
VALID_METRIC_LEVELS = ["Message", "Turn", "Thread", "ToolCall"]
# Static ``Literal`` type with the same four values, used in field annotations.
MetricLevel = Literal["Message", "Turn", "Thread", "ToolCall"]
16
+
17
+
18
class DependsOnItem(BaseModel):
    # Describes one prerequisite of a metric: which other function/rubric
    # metric it refers to, and the value range that metric must fall in for
    # the dependency to be considered satisfied.

    # Pydantic v2 configuration. The class-based ``Config`` form is deprecated
    # in Pydantic v2; a plain dict is accepted here because ``ConfigDict`` is a
    # TypedDict. Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    name: Optional[str] = Field(
        None, description="Name of the dependency function or rubric."
    )
    type: Optional[Literal["function", "rubric"]] = Field(
        None,
        description="One of 'function' or 'rubric' indicating the type of the dependency.",
    )
    kwargs: Optional[Dict[str, Any]] = Field(
        None,
        description="The keyword arguments for the dependency. If provided, used to match which evaluation this dependency is for, so must match the keyword args given for some evaluation.",
    )
    metric_name: Optional[str] = Field(
        None,
        description="Name of the metric dependency. This may be different than function_name if the metric function returns a key/value pair - in which case, this will match the key.",
    )
    metric_level: Optional[MetricLevel] = Field(
        None,
        description="The level of the metric to depend on, which must be equal to or 'greater' than the dependent metric's level. e.g. a Turn can depend on a Thread metric, but not the reverse.",
    )
    # Only zero or negative offsets are allowed (``le=0``), and ``strict=True``
    # disables type coercion so the value must already be an int.
    relative_object_position: int = Field(
        0,
        le=0,
        strict=True,
        description="The position of the object within the Thread. If 0 (default), this is the metric value for the current object. If -1, this is the metric value for the most recent object before this one.",
    )
    # TODO we could consider adding an absolute_object_position
    # The defaults below span the full finite float range, so by default no
    # finite value is excluded by the bounds.
    metric_min_value: Optional[float] = Field(
        -sys.float_info.max,
        description="Minimum value of the dependency to consider it as satisfied.",
    )
    metric_max_value: Optional[float] = Field(
        sys.float_info.max,
        description="Maximum value of the dependency to consider it as satisfied.",
    )
56
+
57
+
58
class MetricItem(BaseModel):
    # Shared base for function- and rubric-based metric entries: the metric's
    # name, its dependencies, and the granularity it is computed at.

    # Required: no default is supplied.
    name: str = Field(
        description="The function to call or name of rubric to use to compute this metric.",
    )
    # A fresh empty list is created per instance when omitted.
    depends_on: Optional[List[DependsOnItem]] = Field(
        description="List of dependencies that must be satisfied for this metric to be computed.",
        default_factory=list,
    )
    # TODO why is metric_level optional? Should likely make it required
    metric_level: Optional[MetricLevel] = Field(
        default="Turn",
        description="What level of granularity (ToolCall, Message, Turn, or Thread) this rubric should be applied to",
    )
72
+
73
+
74
class FunctionItem(MetricItem):
    # Metric computed by calling a named function with optional keyword
    # arguments.

    # ``schema_utils.OptionalDict`` is used as the field type; a new empty
    # dict is created per instance when the field is omitted.
    kwargs: schema_utils.OptionalDict = Field(
        description="Keyword arguments for the function. Each key must correspond to an argument in the function. Extra keys will cause an error.",
        default_factory=dict,
    )
    # TODO add the ability to provide a function source: Path | FunctionsCollection | schema_utils.ModuleType
80
+
81
+
82
class RubricItem(MetricItem):
    # Metric computed by applying a named rubric.

    # TODO is RubricItem.kwargs actually used?
    # A new empty dict is created per instance when the field is omitted.
    kwargs: Optional[Dict[str, Any]] = Field(
        description="Keyword arguments for the rubric evaluation.",
        default_factory=dict,
    )
    # TODO add the ability to provide a rubric source: Path | RubricsCollection
89
+
90
+
91
class Metrics(BaseModel):
    """Defines the metrics to be evaluated."""

    # Both lists are optional and default to ``None`` (not an empty list).
    function: Optional[List[FunctionItem]] = Field(
        default=None,
        description="List of function-based metrics to be evaluated.",
    )
    rubric: Optional[List[RubricItem]] = Field(
        default=None,
        description="List of rubrics to be evaluated.",
    )
100
+
101
+
102
class CompletionLlm(BaseModel):
    # Names the completion function used to generate new completions and the
    # keyword arguments forwarded to it.

    # Pydantic v2 configuration. The class-based ``Config`` form is deprecated
    # in Pydantic v2; a plain dict is accepted here because ``ConfigDict`` is a
    # TypedDict. Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    function_name: str = Field(
        ...,
        description="Completion function defined in `completion_functions.py` or available in the global namespace.",
    )
    include_system_prompt: bool = True
    # A new empty dict is created per instance when the field is omitted.
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
115
+
116
+
117
class GraderLlm(BaseModel):
    # Names the completion function used to grade rubrics and the keyword
    # arguments forwarded to it.

    # Pydantic v2 configuration. The class-based ``Config`` form is deprecated
    # in Pydantic v2; a plain dict is accepted here because ``ConfigDict`` is a
    # TypedDict. Unknown keys are rejected.
    model_config = {"extra": "forbid"}

    function_name: str = Field(
        ...,
        description="Function defined in `completion_functions.py`. We're not really completing a conversation, but we ARE asking an LLM to provide a response to an input - in this case, the rubric.",
    )
    # A new empty dict is created per instance when the field is omitted.
    kwargs: Dict[str, Any] = Field(
        default_factory=dict,
        description="Additional arguments that will be passed to the completion function. Must correspond to arguments in the named function.",
    )
129
+
130
+
131
class Eval(BaseModel):
    """Defines the evaluation that should be executed.

    The key fields are :attr:`metrics` and :attr:`grader_llm`.
    """

    # Pydantic v2 configuration. The class-based ``Config`` form is deprecated
    # in Pydantic v2; a plain dict is accepted here because ``ConfigDict`` is a
    # TypedDict.
    # TODO don't permit additional fields in Eval
    model_config = {"extra": "allow"}

    do_completion: bool = Field(
        False,
        description="Flag to determine if completions should be done in each thread. Set to 'true' if you are testing a new API and want to evaluate the API responses. Set to 'false' (default) if you are evaluating past conversations and do not need to generate new completions.",
    )
    name: Optional[str] = Field(
        None,
        description="Name of the test suite. Used as metadata only. Does not need to match the key of the entry in the evals.yaml file.",
    )
    notes: str = Field(
        "",
        description="Additional notes regarding the configuration. Used as metadata only.",
    )
    # An empty ``Metrics`` instance is created when the field is omitted.
    metrics: Metrics = Field(
        default_factory=Metrics, description="Metrics to use in the evaluation."
    )
    # The description names the actual field, ``do_completion`` (singular).
    completion_llm: Optional[CompletionLlm] = Field(
        None,
        description="Specification of the LLM or API used to perform new completions. Must be defined if `do_completion: true` is set.",
    )
    grader_llm: Optional[GraderLlm] = Field(
        None,
        description="Specification of the LLM or API used to grade rubrics. Must be defined if any rubric_metrics are specified.",
    )
@@ -0,0 +1,97 @@
1
+ """The top-level :class:`~flexeval.schema.evalrun_schema.EvalRun` schema and associated sub-schema."""
2
+
3
+ from pathlib import Path
4
+ from typing import Annotated, Callable, Iterable, Literal
5
+
6
+ from annotated_types import Len
7
+ from pydantic import BaseModel, Field, FilePath
8
+
9
+ from flexeval.configuration import function_metrics
10
+ from flexeval.schema import config_schema, eval_schema, rubric_schema, schema_utils
11
+
12
+
13
class DataSource(BaseModel):
    # Common metadata (optional name and notes) shared by data sources.
    # TODO support more generic DataSource interface
    # for now, we need to use FileDataSource because we pass the JSONL paths along
    name: str | None = Field(default=None, description="")
    notes: str | None = Field(default=None, description="")
18
+
19
+
20
class IterableDataSource(DataSource):
    """Not yet implemented."""

    # A new empty list is created per instance when the field is omitted.
    contents: Iterable = Field(
        description="Iterable of data items, presumably in the jsonl format (for now).",
        default_factory=list,
    )
27
+
28
+
29
class FileDataSource(DataSource):
    """File to be used as a data source."""

    # TODO in the future, we could use cloudpathlib to support cloud paths
    # Required field; typed as pydantic's ``FilePath``.
    path: FilePath = Field(
        description="Absolute or relative path to data file. Each file must be in jsonl format, with one conversation per line."
    )
    # "jsonl" is the only value the ``Literal`` annotation admits.
    format: Literal["jsonl"] = Field(
        default="jsonl", description="Format of the data file."
    )
37
+
38
+
39
class FunctionsCollection(BaseModel):
    # Raw docstring: the reST "\s" plural suffix is not a valid Python escape
    # and triggers a SyntaxWarning on Python 3.12+ in a non-raw string. The
    # runtime docstring text is unchanged.
    r"""Collection of functions that can be used as :class:`~flexeval.schema.eval_schema.FunctionItem`\s."""

    # A new empty list is created per instance when the field is omitted.
    functions: list[Callable] = Field(
        default_factory=list,
        description="Callables that can be used as functions for evaluation.",
    )
46
+
47
+
48
def get_default_rubrics() -> list[Path | rubric_schema.RubricsCollection]:
    """Utility function to retrieve the default rubric collection."""
    # Imported at call time rather than at module import time.
    # NOTE(review): presumably to avoid a circular import - confirm.
    from flexeval import rubric as rubric_module

    default_collection = rubric_module.get_default_rubric_collection()
    return [default_collection]
53
+
54
+
55
def get_default_function_metrics() -> (
    list[Path | FunctionsCollection | schema_utils.ModuleType]
):
    """Utility function to retrieve the default function collection."""
    # The default is a one-element list holding the ``function_metrics``
    # module object itself; a new list is built on every call.
    default_sources = [function_metrics]
    return default_sources
60
+
61
+
62
class EvalRun(BaseModel):
    """EvalRun defines the schema that FlexEval expects.

    At a minimum, you need to provide a set of input data sources and an :class:`~flexeval.schema.eval_schema.Eval`.

    You can evaluate an EvalRun using :func:`~flexeval.runner.run`.

    Read more in the :ref:`user_guide`."""

    # Required; the ``Len(min_length=1)`` constraint rejects an empty list.
    data_sources: Annotated[list[FileDataSource], Len(min_length=1)] = Field(
        description="List of data sources.",
    )
    database_path: Path = Field(
        default=Path("flexeval/results/results.db"),
        description="Output database path.",
    )
    # Required: no default is supplied.
    eval: eval_schema.Eval = Field(
        description="The evaluation to apply to the data sources."
    )
    config: config_schema.Config = Field(
        description="Configuration details.",
        default_factory=config_schema.Config,
    )
    # Defaults are produced by the module-level helper functions, which are
    # called each time the field is omitted.
    rubric_paths: list[Path | rubric_schema.RubricsCollection] = Field(
        description="Additional sources for rubrics. If a Path, should be a YAML file in the expected format.",
        default_factory=get_default_rubrics,
    )
    function_modules: list[
        FilePath | FunctionsCollection | schema_utils.ModuleType
    ] = Field(
        description="Additional sources for functions.",
        default_factory=get_default_function_metrics,
    )
    add_default_functions: bool = Field(
        default=True,
        description="If the default functions at :mod:`flexeval.configuration.function_metrics` should be made available.",
    )
@@ -0,0 +1,40 @@
1
+ from pydantic import BaseModel, Field, field_validator
2
+
3
+
4
class Rubric(BaseModel):
    # A single rubric: the prompt text plus a mapping from choice label to
    # numeric score, with optional name and notes.

    prompt: str = Field(description="Prompt for the rubric.")
    choice_scores: dict[str, int | float] = Field(
        default_factory=dict, description="Choices."
    )
    name: str | None = Field(None, description="Optional name of the rubric.")
    notes: str | None = Field(None, description="Optional notes.")

    @field_validator("prompt")
    @classmethod
    def is_rubric_prompt_valid(cls, prompt: str) -> str:
        """Reject prompts that combine mutually exclusive placeholders.

        Raises:
            ValueError: if the prompt contains both ``{conversation}`` and
                ``{context}``, or both ``{completion}`` and ``{content}``.
        """
        if "{conversation}" in prompt and "{context}" in prompt:
            raise ValueError(
                "Your rubric should not have both {conversation} and {context}. Please check the README file for more information about how to write FlexEval rubrics."
            )

        if "{completion}" in prompt and "{content}" in prompt:
            raise ValueError(
                "Your rubric should not have both {content} and {completion}. Please check the README file for more information about how to write FlexEval rubrics."
            )
        return prompt

    @field_validator("choice_scores")
    @classmethod
    def check_non_empty(cls, v):
        """Reject an explicitly provided empty ``choice_scores`` mapping.

        Raises:
            ValueError: if ``v`` is empty.
        """
        # NOTE(review): Pydantic does not run field validators on default
        # values unless ``validate_default=True``, so omitting
        # ``choice_scores`` entirely is not rejected here - confirm whether
        # that is intended.
        if not v:
            # The check only enforces non-emptiness, so the message says
            # "at least one" rather than claiming a minimum of two.
            raise ValueError("Must provide at least one choice score.")
        return v
32
+
33
+
34
class RubricsCollection(BaseModel):
    # Raw docstring: the reST "\s" plural suffix is not a valid Python escape
    # and triggers a SyntaxWarning on Python 3.12+ in a non-raw string. The
    # runtime docstring text is unchanged.
    r"""Collection of rubrics that can be used as :class:`~flexeval.schema.eval_schema.RubricItem`\s."""

    # A new empty dict is created per instance when the field is omitted.
    rubrics: dict[str, Rubric] = Field(
        default_factory=dict,
        description="Mapping of rubric names to Rubrics. The rubric names are used for matching metrics to specific rubrics.",
    )
@@ -0,0 +1,26 @@
1
+ import types
2
+ from typing import Annotated, Any
3
+
4
+ from pydantic import BeforeValidator, PlainSerializer, PlainValidator
5
+
6
+
7
def validate_python_module(value: Any) -> Any:
    """Return ``value`` unchanged if it is a module object.

    Raises:
        ValueError: if ``value`` is not an instance of ``types.ModuleType``.
    """
    if isinstance(value, types.ModuleType):
        return value
    raise ValueError(f"Expected a module, got a '{type(value)}'.")
11
+
12
+
13
# Annotated alias for fields that hold a Python module object: validation is
# delegated entirely to ``validate_python_module`` and serialization emits the
# module's ``__name__`` as a string.
ModuleType = Annotated[
    types.ModuleType,
    PlainValidator(validate_python_module),
    PlainSerializer(lambda x: str(x.__name__)),
]
18
+
19
+
20
def convert_none_or_empty_string_to_dict(value: Any):
    """Map ``None`` and blank strings to an empty dict; pass anything else through.

    A string counts as blank when it is empty after ``strip()``.
    """
    if value is None:
        return {}
    if isinstance(value, str) and not value.strip():
        return {}
    return value
24
+
25
+
26
# ``dict`` annotation whose raw input is first passed through
# ``convert_none_or_empty_string_to_dict`` via a "before" validator.
OptionalDict = Annotated[dict, BeforeValidator(convert_none_or_empty_string_to_dict)]
@@ -0,0 +1,118 @@
1
+ Metadata-Version: 2.4
2
+ Name: python-flexeval
3
+ Version: 0.1.5
4
+ Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
5
+ Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
6
+ Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
7
+ Project-URL: Issues, https://github.com/DigitalHarborFoundation/FlexEval/issues
8
+ Author: S. Thomas Christie, Zachary Levonian, Baptiste Moreau-Pernet, Anna Rafferty, Terry Yu Tian
9
+ License-Expression: MIT
10
+ License-File: LICENSE
11
+ Keywords: conversation,education,evaluation,large language models,learning engineering
12
+ Classifier: Intended Audience :: Education
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Natural Language :: English
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Topic :: Scientific/Engineering
19
+ Requires-Python: >=3.10
20
+ Requires-Dist: flatten-json>=0.1.14
21
+ Requires-Dist: jsonschema>=4.23.0
22
+ Requires-Dist: langchain-openai>=0.3.8
23
+ Requires-Dist: langchain>=0.3.20
24
+ Requires-Dist: langgraph-checkpoint-sqlite>=2.0.6
25
+ Requires-Dist: langgraph>=0.3.6
26
+ Requires-Dist: litellm>=1.74.3
27
+ Requires-Dist: msgpack>=1.1.0
28
+ Requires-Dist: networkx>=3.4.2
29
+ Requires-Dist: openai>=1.66.0
30
+ Requires-Dist: pandas>=2.2.3
31
+ Requires-Dist: peewee>=3.17.9
32
+ Requires-Dist: pydantic>=2.10.6
33
+ Requires-Dist: python-dotenv>=1.0.1
34
+ Requires-Dist: pyyaml>=6.0.2
35
+ Requires-Dist: requests>=2.32.3
36
+ Requires-Dist: sympy>=1.13.3
37
+ Requires-Dist: textstat>=0.7.5
38
+ Requires-Dist: typer>=0.16.0
39
+ Description-Content-Type: text/markdown
40
+
41
+ # FlexEval LLM Evals
42
+
43
+ [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12729993.svg)](https://doi.org/10.5281/zenodo.12729993)
44
+ [![License](https://img.shields.io/github/license/DigitalHarborFoundation/FlexEval)](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/LICENSE)
45
+
46
+ ![FlexEval banner](/docs/_static/flexeval_banner.svg)
47
+
48
+ FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
49
+
50
+ **Documentation:** <https://digitalharborfoundation.github.io/FlexEval>
51
+
52
+ Additional details about FlexEval can be found [in our paper](https://doi.org/10.5281/zenodo.12729993) at the _Educational Data Mining_ 2024 conference.
53
+
54
+ ## Usage
55
+
56
+ Basic usage:
57
+
58
+ ```python
59
+ import flexeval
60
+ from flexeval.schema import Eval, EvalRun, FileDataSource, Metrics, FunctionItem, Config
61
+
62
+ data_sources = [FileDataSource(path="vignettes/conversations.jsonl")]
63
+ eval = Eval(metrics=Metrics(function=[FunctionItem(name="flesch_reading_ease")]))
64
+ config = Config(clear_tables=True)
65
+ eval_run = EvalRun(
66
+ data_sources=data_sources,
67
+ database_path="eval_results.db",
68
+ eval=eval,
69
+ config=config,
70
+ )
71
+ flexeval.run(eval_run)
72
+ ```
73
+
74
+ This example computes [Flesch reading ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease) for every turn in a list of conversations provided in JSONL format. The metric values are stored in an SQLite database called `eval_results.db`.
75
+
76
+ See additional usage examples in the [vignettes](/vignettes).
77
+
78
+ ## Installation
79
+
80
+ FlexEval is on PyPI as [`python-flexeval`](https://pypi.org/p/python-flexeval). See the [Installation](https://digitalharborfoundation.github.io/FlexEval/getting_started.html#Installation) section in the [Getting Started](https://digitalharborfoundation.github.io/FlexEval/getting_started.html) guide.
81
+
82
+ Using `pip`:
83
+
84
+ ```bash
85
+ pip install python-flexeval
86
+ ```
87
+
88
+ ## Basic functionality
89
+
90
+ FlexEval is designed to be "batteries included" for many basic use cases. It supports the following out-of-the-box:
91
+
92
+ - scoring historical conversations - useful for monitoring live systems.
93
+ - scoring LLMs:
94
+ - locally hosted and served via an endpoint using something like [LM Studio](https://lmstudio.ai)
95
+ - LLMs exposed through a REST endpoint and reachable via a network call
96
+ - any OpenAI LLM
97
+ - a set of useful rubrics
98
+ - a set of useful Python functions
99
+
100
+ Evaluation results are saved in an SQLite database. See the [Metric Analysis](/vignettes/metric_analysis.ipynb) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
101
+
102
+
103
+ Read more in the [Getting Started](https://digitalharborfoundation.github.io/FlexEval/getting_started.html) guide.
104
+
105
+ ## Cite this work
106
+
107
+ If this work is useful to you, please cite [our EDM 2024 paper](https://educationaldatamining.org/edm2024/proceedings/2024.EDM-posters.107/2024.EDM-posters.107.pdf):
108
+
109
+ >S. Thomas Christie, Baptiste Moreau-Pernet, Yu Tian, & John Whitmer. (2024). FlexEval: a customizable tool for chatbot performance evaluation and dialogue analysis. _Proceedings of the 17th International Conference on Educational Data Mining_, 903-908. Atlanta, Georgia, USA, July 2024. <https://doi.org/10.5281/zenodo.12729993>
110
+
111
+ ## Development
112
+
113
+ Pull requests are welcome. Feel free to contribute:
114
+ - New rubrics or functions
115
+ - Bug fixes
116
+ - New features
117
+
118
+ See [DEVELOPMENT.md](DEVELOPMENT.md).
@@ -0,0 +1,47 @@
1
+ flexeval/__init__.py,sha256=FIVIg06yxMU_RHPpx22QtX94hnS8Ce7gCjOcQ2pECMc,337
2
+ flexeval/__main__.py,sha256=c9NQqsea3e-_6b736gBeIO3O_zdXQ1wtY3-Scj5NiPg,126
3
+ flexeval/cli.py,sha256=RwtRk121OivbLQyYpYxJ7PugPIYQ8J4qXHFN2SxxPy4,2985
4
+ flexeval/completions.py,sha256=pi_tYK4m3vKSqAC1ym9Jc3e4srcQSXfx-mX4qI5qisQ,5686
5
+ flexeval/compute_metrics.py,sha256=elQZvuh2jyateWzwIPm8RLHASq-XqFMinEIA0rlMkj8,37277
6
+ flexeval/config.yaml,sha256=dpkFdW0rKf7StGoVeIGaCNw0n0yOfYWig0xmIfsDdbg,530
7
+ flexeval/data_loader.py,sha256=EKc6wdpQuhrB2ai2U_fQxojzt1RR716ELisiZXpfu58,25311
8
+ flexeval/db_utils.py,sha256=FKekqWAZ0oQbYNvw0bxuzHcZxlSsKKJkUhyfod-pMLg,1412
9
+ flexeval/dependency_graph.py,sha256=SaG9gjkw2Q0NykqQWs4JzPkv5sMj2aXXmhjJ7yRkV4Q,10539
10
+ flexeval/eval_schema.json,sha256=BQetj8O0_4rorj3Mpqk-sj_SCaRkGMrvBUcxhuw6zLE,13111
11
+ flexeval/function_types.py,sha256=eH8NadQRw7XAOXAOKWYN6b7urjr57J5WzdiVyzh0Wb4,6898
12
+ flexeval/helpers.py,sha256=gX-6Hx4_wOiqbfY8c8_kL3XbkdV8mpEjPmaAe44lOSk,1605
13
+ flexeval/log_utils.py,sha256=E3RloPQZbtd8sEIg7mfN5fAku-TeNGqWy03SmwRllIE,923
14
+ flexeval/rubric.py,sha256=UwtJOxIxFJcQVrDXXuCA3tF_FFTcvLPqo2F9lq8gPcM,2167
15
+ flexeval/run_utils.py,sha256=cNFVRsFNYY9gpzbIUc-H4Gk7TWC64GXsYowQHoG7ZVU,2597
16
+ flexeval/runner.py,sha256=X6ZfjfwIM3ymN_kHfRt_JSKPxpDxs_MWQPrvWhl2L7I,4340
17
+ flexeval/classes/__init__.py,sha256=fywDMYX8W-nXFKRXolzn-RWd_7tiJr6FlouQJvYSoyE,347
18
+ flexeval/classes/base.py,sha256=xxkTa8joPe39CFwveeTPW56LW-x7rsi5oBAIxrvM5iI,944
19
+ flexeval/classes/dataset.py,sha256=Y_EdEIuhx526SSvkqk2tFBzkOgBkVY-5FeraYMtU5lo,2913
20
+ flexeval/classes/eval_runner.py,sha256=-jkPlKhTWX0FpUDrzCaUIlIIlKsSAmDy06T4I1aB3Ds,6269
21
+ flexeval/classes/eval_set_run.py,sha256=fq_wBOaxuq7dLxiZIw76WGIwhRBNbQWDUhpiK0wDG_A,1116
22
+ flexeval/classes/message.py,sha256=zuDm_v1gmK49Fw5m-HTWiqndrI_xtLotlXD8nhRDDTg,7518
23
+ flexeval/classes/metric.py,sha256=d8l39_QwnQDmTJvy9TIulU4p0jqD7ldMUi4m5zfK2Es,2806
24
+ flexeval/classes/thread.py,sha256=LchsK9mmrY4K-zSTMAAmywlzPVwnpZ7rOHqBGPIlda8,2779
25
+ flexeval/classes/tool_call.py,sha256=CteT2Hajor0PlHEEn7apfZux5_mremSIDrQmZ0iB7K0,1748
26
+ flexeval/classes/turn.py,sha256=kLmgnYQ-4a8sydzGK1HTQRyUDXZIedmt_NFR3shLJFE,8635
27
+ flexeval/configuration/__init__.py,sha256=wP_gpYyaEp5DxCSH8-4KHchH07JMZZOk8eCFMfd5LBw,75
28
+ flexeval/configuration/completion_functions.py,sha256=-N0iFAfcYcm35S78M3ES4MBkLXpDeEfy2Qq1ORHGBXE,7491
29
+ flexeval/configuration/evals.yaml,sha256=3mbD3gEccTDotm8kj4doYTujqRD_PkGhCVhjQaSEqSs,22651
30
+ flexeval/configuration/function_metrics.py,sha256=UqCCl_xoG6kH6jRset0m1FQoAfUrqt9bqipxAshN5_A,22419
31
+ flexeval/configuration/rubric_metrics.yaml,sha256=JfE6gPj4LtM2v0b5-Zge3NwM17YgJEBZXzTVn9UL7zk,9424
32
+ flexeval/io/__init__.py,sha256=MqdgcPzkFpSnOEz-e2GNNd8XOI_DbyNjIP8AT5eqUqI,101
33
+ flexeval/io/parsers/yaml_parser.py,sha256=2yE6j_RM_YG5nkNUWZckrymh61n28AG46lqnPSlWitk,1818
34
+ flexeval/metrics/__init__.py,sha256=zBg-thOos5X1-YUH70PkdMqFnPdsrTM0Bt3fIjhfxDM,131
35
+ flexeval/metrics/access.py,sha256=U-IhG_dhC8HZ9BMnBKHiEvHretUuAnzuUWJ288XuPiA,681
36
+ flexeval/metrics/save.py,sha256=8x9ifRiHtQT7_WeMP0XmYK1zfourXMnHkGZy_iR0Xcc,1643
37
+ flexeval/schema/__init__.py,sha256=4OA6Q7Dguz-uaulwoRsrtaoReFmyNsKqyi_CvfDV4-c,379
38
+ flexeval/schema/config_schema.py,sha256=LkmtiOLfPsX1u_6Ey6gFbRr8tQwxqcuLcyf-xYcBf9o,1619
39
+ flexeval/schema/eval_schema.py,sha256=95kCkiGS67TfpVUfUaBdBMoKIpUJoY1beUgLWwg5Ljk,6373
40
+ flexeval/schema/evalrun_schema.py,sha256=LE6RmNHeRJIRye68xUMOaknWMNLcugfnQoUEkeP1JRs,3526
41
+ flexeval/schema/rubric_schema.py,sha256=9DaqU-Av6XMig7iIy3EObLhEkhjtYIxeCqpovKLYfYw,1615
42
+ flexeval/schema/schema_utils.py,sha256=Fg1foqRA-9X-hl_vqIF3bpYdE51hNEgdw739Q-s3iQc,698
43
+ python_flexeval-0.1.5.dist-info/METADATA,sha256=LPvBmYMMKpyxgStPchWxj1fhBYoNbbdb7-UgQX2b4CY,5095
44
+ python_flexeval-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
45
+ python_flexeval-0.1.5.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
46
+ python_flexeval-0.1.5.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
47
+ python_flexeval-0.1.5.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.27.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ flexeval = flexeval.__main__:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Digital Harbor Foundation
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.