python-flexeval 0.1.5__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flexeval/__about__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = "0.2.0"
flexeval/__init__.py CHANGED
@@ -4,8 +4,10 @@ This top-level import exposes the :func:`~flexeval.runner.run` method."""
4
4
 
5
5
  from flexeval import metrics
6
6
  from flexeval.runner import run
7
+ from .__about__ import __version__
7
8
 
8
9
  __all__ = [
9
10
  "metrics",
10
11
  "run",
12
+ "__version__",
11
13
  ]
@@ -111,7 +111,7 @@ class EvalRunner:
111
111
 
112
112
  def load_evaluation_settings(self):
113
113
  """This function parses our eval suite and puts it in the data structure we'll need
114
- for easy use at run-time
114
+ for easy use at run-time.
115
115
  """
116
116
  # if the current eval has a 'config' entry, overwrite configuration options with its entries
117
117
  if (
@@ -42,7 +42,7 @@ class ObjectMetric:
42
42
 
43
43
 
44
44
  class MetricGraphBuilder:
45
- """Builds :class:`networkx.DiGraph`\s of :class:`~flexeval.compute_metrics.ObjectMetric` instances that reflect any computational dependencies between them."""
45
+ """Builds :class:`networkx.DiGraph` s of :class:`~flexeval.compute_metrics.ObjectMetric` instances that reflect any computational dependencies between them."""
46
46
 
47
47
  def __init__(self):
48
48
  # key: tuple(metric_level, metric_id, object_id)
@@ -122,8 +122,8 @@ def is_role(object: Union[Turn, Message], role: str) -> dict:
122
122
  and 0 otherwise.
123
123
 
124
124
  Args:
125
- object: the Turn or Message
126
- role: a string with the desired role to check against
125
+ object: the Turn or Message
126
+ role: a string with the desired role to check against
127
127
  """
128
128
  return {role: int(object.role == role)}
129
129
 
flexeval/db_utils.py CHANGED
@@ -14,6 +14,11 @@ from flexeval.classes.turn import Turn
14
14
  DATABASE_TABLES = [EvalSetRun, Dataset, Thread, Turn, Message, ToolCall, Metric]
15
15
 
16
16
 
17
+ def ensure_database(database_path: str):
18
+ if not classes_base.database.is_connection_usable():
19
+ initialize_database(database_path)
20
+
21
+
17
22
  def initialize_database(database_path: str, clear_tables: bool = False):
18
23
  classes_base.database.init(database_path)
19
24
  # classes_base.database.start()
@@ -34,5 +39,7 @@ def bind_to_database(database_path: str) -> pw.Database:
34
39
  new_database = classes_base.create_sqlite_database(database_path)
35
40
  new_database.bind(DATABASE_TABLES)
36
41
  # Verify the binding worked by checking one of the models
37
- assert classes_base.BaseModel._meta.database == new_database
42
+ assert classes_base.BaseModel._meta.database == new_database, (
43
+ f"Binding to '{database_path}' failed."
44
+ )
38
45
  return new_database
@@ -1,4 +1,4 @@
1
- """Utility functions for accessing metrics."""
1
+ """Utility functions for working with metrics."""
2
2
 
3
3
  from flexeval.metrics import access, save
4
4
 
@@ -1,6 +1,8 @@
1
+ """Utility functions for accessing metrics."""
2
+
1
3
  from collections import Counter
2
4
 
3
- from flexeval.classes import metric
5
+ from flexeval.classes import metric, message, turn, thread
4
6
 
5
7
 
6
8
  def count_dict_values(lst: list[dict]) -> dict[str, Counter]:
@@ -21,8 +23,28 @@ def count_dict_values(lst: list[dict]) -> dict[str, Counter]:
21
23
  return counts
22
24
 
23
25
 
24
- def get_all_metrics() -> list:
26
+ def get_all_metrics() -> list[dict]:
25
27
  results = []
26
28
  for m in metric.Metric.select():
27
29
  results.append(m.__data__.copy())
28
30
  return results
31
+
32
+
33
+ def get_first_user_message_for_threads(thread_ids: set) -> list[dict]:
34
+ """Get the first user message in each thread.
35
+
36
+ Args:
37
+ thread_ids (set): The set of thread IDs to retrieve messages for.
38
+
39
+ Returns:
40
+ list[dict]: An iterable of messages.
41
+ """
42
+ return (
43
+ message.Message.select()
44
+ .join(thread.Thread)
45
+ .where(thread.Thread.id.in_(thread_ids))
46
+ .where(message.Message.role == "user")
47
+ .join(turn.Turn)
48
+ .where(turn.Turn.index_in_thread == 0)
49
+ .dicts()
50
+ )
@@ -16,6 +16,8 @@ MetricLevel = Literal["Message", "Turn", "Thread", "ToolCall"]
16
16
 
17
17
 
18
18
  class DependsOnItem(BaseModel):
19
+ """Defines a metric dependency."""
20
+
19
21
  class Config:
20
22
  extra = "forbid"
21
23
 
@@ -56,6 +58,8 @@ class DependsOnItem(BaseModel):
56
58
 
57
59
 
58
60
  class MetricItem(BaseModel):
61
+ "Defines a metric."
62
+
59
63
  name: str = Field(
60
64
  ...,
61
65
  description="The function to call or name of rubric to use to compute this metric.",
@@ -72,6 +76,8 @@ class MetricItem(BaseModel):
72
76
 
73
77
 
74
78
  class FunctionItem(MetricItem):
79
+ """Defines a metric computed from a Python function."""
80
+
75
81
  kwargs: schema_utils.OptionalDict = Field(
76
82
  default_factory=dict,
77
83
  description="Keyword arguments for the function. Each key must correspond to an argument in the function. Extra keys will cause an error.",
@@ -80,6 +86,8 @@ class FunctionItem(MetricItem):
80
86
 
81
87
 
82
88
  class RubricItem(MetricItem):
89
+ """Defines a metric computed from a rubric."""
90
+
83
91
  # TODO is RubricItem.kwargs actually used?
84
92
  kwargs: Optional[Dict[str, Any]] = Field(
85
93
  default_factory=dict,
@@ -115,6 +123,8 @@ class CompletionLlm(BaseModel):
115
123
 
116
124
 
117
125
  class GraderLlm(BaseModel):
126
+ """Defines the LLM used for evaluating rubrics."""
127
+
118
128
  class Config:
119
129
  extra = "forbid"
120
130
 
@@ -37,7 +37,7 @@ class FileDataSource(DataSource):
37
37
 
38
38
 
39
39
  class FunctionsCollection(BaseModel):
40
- """Collection of functions that can be used as :class:`~flexeval.schema.eval_schema.FunctionItem`\s."""
40
+ """Collection of functions that can be used as :class:`~flexeval.schema.eval_schema.FunctionItem` s."""
41
41
 
42
42
  functions: list[Callable] = Field(
43
43
  default_factory=list,
@@ -32,7 +32,7 @@ class Rubric(BaseModel):
32
32
 
33
33
 
34
34
  class RubricsCollection(BaseModel):
35
- """Collection of rubrics that can be used as :class:`~flexeval.schema.eval_schema.RubricItem`\s."""
35
+ """Collection of rubrics that can be used as :class:`~flexeval.schema.eval_schema.RubricItem` s."""
36
36
 
37
37
  rubrics: dict[str, Rubric] = Field(
38
38
  default_factory=dict,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-flexeval
3
- Version: 0.1.5
3
+ Version: 0.2.0
4
4
  Summary: FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
5
5
  Project-URL: Homepage, https://digitalharborfoundation.github.io/FlexEval/
6
6
  Project-URL: GitHub, https://github.com/DigitalHarborFoundation/FlexEval
@@ -40,10 +40,12 @@ Description-Content-Type: text/markdown
40
40
 
41
41
  # FlexEval LLM Evals
42
42
 
43
+ [![PyPi](https://img.shields.io/pypi/v/python-flexeval)](https://pypi.org/project/python-flexeval/)
43
44
  [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.12729993.svg)](https://doi.org/10.5281/zenodo.12729993)
44
45
  [![License](https://img.shields.io/github/license/DigitalHarborFoundation/FlexEval)](https://github.com/DigitalHarborFoundation/FlexEval/blob/main/LICENSE)
46
+ [![GitHub issues](https://img.shields.io/badge/issue_tracking-github-blue.svg)](https://github.com/DigitalHarborFoundation/FlexEval/issues)
45
47
 
46
- ![FlexEval banner](/docs/_static/flexeval_banner.svg)
48
+ ![FlexEval banner](https://raw.githubusercontent.com/DigitalHarborFoundation/FlexEval/refs/heads/main/docs/_static/flexeval_banner.svg)
47
49
 
48
50
  FlexEval is a tool for designing custom metrics, completion functions, and LLM-graded rubrics for evaluating the behavior of LLM-powered systems.
49
51
 
@@ -73,7 +75,7 @@ flexeval.run(eval_run)
73
75
 
74
76
  This example computes [Flesch reading ease](https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease) for every turn in a list of conversations provided in JSONL format. The metric values are stored in an SQLite database called `eval_results.db`.
75
77
 
76
- See additional usage examples in the [vignettes](/vignettes).
78
+ See additional usage examples in the [vignettes](https://github.com/DigitalHarborFoundation/FlexEval/tree/main/vignettes).
77
79
 
78
80
  ## Installation
79
81
 
@@ -97,7 +99,7 @@ FlexEval is designed to be "batteries included" for many basic use cases. It sup
97
99
  - a set of useful rubrics
98
100
  - a set of useful Python functions
99
101
 
100
- Evaluation results are saved in an SQLite database. See the [Metric Analysis](/vignettes/metric_analysis.ipynb) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
102
+ Evaluation results are saved in an SQLite database. See the [Metric Analysis](https://digitalharborfoundation.github.io/FlexEval/generated/vignettes/metric_analysis.html) vignette for a sample analysis demonstrating the structure and utility of the data saved by FlexEval.
101
103
 
102
104
 
103
105
  Read more in the [Getting Started](https://digitalharborfoundation.github.io/FlexEval/getting_started.html) guide.
@@ -115,4 +117,4 @@ Pull requests are welcome. Feel free to contribute:
115
117
  - Bug fixes
116
118
  - New features
117
119
 
118
- See [DEVELOPMENT.md](DEVELOPMENT.md).
120
+ See [DEVELOPMENT.md](https://github.com/DigitalHarborFoundation/FlexEval/tree/main/DEVELOPMENT.md).
@@ -1,11 +1,12 @@
1
- flexeval/__init__.py,sha256=FIVIg06yxMU_RHPpx22QtX94hnS8Ce7gCjOcQ2pECMc,337
1
+ flexeval/__about__.py,sha256=Zn1KFblwuFHiDRdRAiRnDBRkbPttWh44jKa5zG2ov0E,22
2
+ flexeval/__init__.py,sha256=UXI_xdSxnGAK2plDODBbPF3df-N7E9YJ418QHK7XN-Q,391
2
3
  flexeval/__main__.py,sha256=c9NQqsea3e-_6b736gBeIO3O_zdXQ1wtY3-Scj5NiPg,126
3
4
  flexeval/cli.py,sha256=RwtRk121OivbLQyYpYxJ7PugPIYQ8J4qXHFN2SxxPy4,2985
4
5
  flexeval/completions.py,sha256=pi_tYK4m3vKSqAC1ym9Jc3e4srcQSXfx-mX4qI5qisQ,5686
5
- flexeval/compute_metrics.py,sha256=elQZvuh2jyateWzwIPm8RLHASq-XqFMinEIA0rlMkj8,37277
6
+ flexeval/compute_metrics.py,sha256=4X6XFk0qUKcaCDllNeJreuhlnDHmfRPlsf0f8fWFOxA,37277
6
7
  flexeval/config.yaml,sha256=dpkFdW0rKf7StGoVeIGaCNw0n0yOfYWig0xmIfsDdbg,530
7
8
  flexeval/data_loader.py,sha256=EKc6wdpQuhrB2ai2U_fQxojzt1RR716ELisiZXpfu58,25311
8
- flexeval/db_utils.py,sha256=FKekqWAZ0oQbYNvw0bxuzHcZxlSsKKJkUhyfod-pMLg,1412
9
+ flexeval/db_utils.py,sha256=2jgqexLCAqShvgPrImZz12UkMZtfERhP8iXjratXYok,1612
9
10
  flexeval/dependency_graph.py,sha256=SaG9gjkw2Q0NykqQWs4JzPkv5sMj2aXXmhjJ7yRkV4Q,10539
10
11
  flexeval/eval_schema.json,sha256=BQetj8O0_4rorj3Mpqk-sj_SCaRkGMrvBUcxhuw6zLE,13111
11
12
  flexeval/function_types.py,sha256=eH8NadQRw7XAOXAOKWYN6b7urjr57J5WzdiVyzh0Wb4,6898
@@ -17,7 +18,7 @@ flexeval/runner.py,sha256=X6ZfjfwIM3ymN_kHfRt_JSKPxpDxs_MWQPrvWhl2L7I,4340
17
18
  flexeval/classes/__init__.py,sha256=fywDMYX8W-nXFKRXolzn-RWd_7tiJr6FlouQJvYSoyE,347
18
19
  flexeval/classes/base.py,sha256=xxkTa8joPe39CFwveeTPW56LW-x7rsi5oBAIxrvM5iI,944
19
20
  flexeval/classes/dataset.py,sha256=Y_EdEIuhx526SSvkqk2tFBzkOgBkVY-5FeraYMtU5lo,2913
20
- flexeval/classes/eval_runner.py,sha256=-jkPlKhTWX0FpUDrzCaUIlIIlKsSAmDy06T4I1aB3Ds,6269
21
+ flexeval/classes/eval_runner.py,sha256=ZvCpyaD7lorDK_mYJSZqQbvI6FfLbIWRFHNarWTAMQU,6270
21
22
  flexeval/classes/eval_set_run.py,sha256=fq_wBOaxuq7dLxiZIw76WGIwhRBNbQWDUhpiK0wDG_A,1116
22
23
  flexeval/classes/message.py,sha256=zuDm_v1gmK49Fw5m-HTWiqndrI_xtLotlXD8nhRDDTg,7518
23
24
  flexeval/classes/metric.py,sha256=d8l39_QwnQDmTJvy9TIulU4p0jqD7ldMUi4m5zfK2Es,2806
@@ -27,21 +28,21 @@ flexeval/classes/turn.py,sha256=kLmgnYQ-4a8sydzGK1HTQRyUDXZIedmt_NFR3shLJFE,8635
27
28
  flexeval/configuration/__init__.py,sha256=wP_gpYyaEp5DxCSH8-4KHchH07JMZZOk8eCFMfd5LBw,75
28
29
  flexeval/configuration/completion_functions.py,sha256=-N0iFAfcYcm35S78M3ES4MBkLXpDeEfy2Qq1ORHGBXE,7491
29
30
  flexeval/configuration/evals.yaml,sha256=3mbD3gEccTDotm8kj4doYTujqRD_PkGhCVhjQaSEqSs,22651
30
- flexeval/configuration/function_metrics.py,sha256=UqCCl_xoG6kH6jRset0m1FQoAfUrqt9bqipxAshN5_A,22419
31
+ flexeval/configuration/function_metrics.py,sha256=SGCxCAfG5NfKop-d3_uJgF83nPrlfHAhd-TU0GpEPFY,22427
31
32
  flexeval/configuration/rubric_metrics.yaml,sha256=JfE6gPj4LtM2v0b5-Zge3NwM17YgJEBZXzTVn9UL7zk,9424
32
33
  flexeval/io/__init__.py,sha256=MqdgcPzkFpSnOEz-e2GNNd8XOI_DbyNjIP8AT5eqUqI,101
33
34
  flexeval/io/parsers/yaml_parser.py,sha256=2yE6j_RM_YG5nkNUWZckrymh61n28AG46lqnPSlWitk,1818
34
- flexeval/metrics/__init__.py,sha256=zBg-thOos5X1-YUH70PkdMqFnPdsrTM0Bt3fIjhfxDM,131
35
- flexeval/metrics/access.py,sha256=U-IhG_dhC8HZ9BMnBKHiEvHretUuAnzuUWJ288XuPiA,681
35
+ flexeval/metrics/__init__.py,sha256=qrgUhTXzezAOoABhck3hMVN-c2Bwn7CTg-e_P2w7PlA,134
36
+ flexeval/metrics/access.py,sha256=mP89IUNTWpHguMEdjjh_deMxdiyClb61hg3k7Jcus-o,1299
36
37
  flexeval/metrics/save.py,sha256=8x9ifRiHtQT7_WeMP0XmYK1zfourXMnHkGZy_iR0Xcc,1643
37
38
  flexeval/schema/__init__.py,sha256=4OA6Q7Dguz-uaulwoRsrtaoReFmyNsKqyi_CvfDV4-c,379
38
39
  flexeval/schema/config_schema.py,sha256=LkmtiOLfPsX1u_6Ey6gFbRr8tQwxqcuLcyf-xYcBf9o,1619
39
- flexeval/schema/eval_schema.py,sha256=95kCkiGS67TfpVUfUaBdBMoKIpUJoY1beUgLWwg5Ljk,6373
40
- flexeval/schema/evalrun_schema.py,sha256=LE6RmNHeRJIRye68xUMOaknWMNLcugfnQoUEkeP1JRs,3526
41
- flexeval/schema/rubric_schema.py,sha256=9DaqU-Av6XMig7iIy3EObLhEkhjtYIxeCqpovKLYfYw,1615
40
+ flexeval/schema/eval_schema.py,sha256=iHMbanW4Ef_sp51KiaZKeP3Dn4Z6pWCGa7N2SPvsFK0,6607
41
+ flexeval/schema/evalrun_schema.py,sha256=M7JY01DhlLzwZc2jJTIeGPs9vt6TFMPir51MFhtRllA,3526
42
+ flexeval/schema/rubric_schema.py,sha256=uxcf7MHWKW3EmABUnWeCinGUP6LBjskiq7zkEPHmAvU,1615
42
43
  flexeval/schema/schema_utils.py,sha256=Fg1foqRA-9X-hl_vqIF3bpYdE51hNEgdw739Q-s3iQc,698
43
- python_flexeval-0.1.5.dist-info/METADATA,sha256=LPvBmYMMKpyxgStPchWxj1fhBYoNbbdb7-UgQX2b4CY,5095
44
- python_flexeval-0.1.5.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
45
- python_flexeval-0.1.5.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
46
- python_flexeval-0.1.5.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
47
- python_flexeval-0.1.5.dist-info/RECORD,,
44
+ python_flexeval-0.2.0.dist-info/METADATA,sha256=bEifn06Ok5-8YllS4uYxBN2KNuZvf7vJg8b_GarkttU,5599
45
+ python_flexeval-0.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
46
+ python_flexeval-0.2.0.dist-info/entry_points.txt,sha256=wSyluqXhrX3xySVYAtM-Kv23p4OauKQCSBuNNfzEGtI,52
47
+ python_flexeval-0.2.0.dist-info/licenses/LICENSE,sha256=OlAu_c13gw6-fJ9UdhZBMeNr5STLrnWG_0Hv0SCXtu4,1082
48
+ python_flexeval-0.2.0.dist-info/RECORD,,