langfun 0.0.2.dev20240429__py3-none-any.whl → 0.1.2.dev202501150804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +20 -2
- langfun/core/__init__.py +16 -5
- langfun/core/agentic/__init__.py +30 -0
- langfun/core/agentic/action.py +854 -0
- langfun/core/agentic/action_eval.py +150 -0
- langfun/core/agentic/action_eval_test.py +109 -0
- langfun/core/agentic/action_test.py +136 -0
- langfun/core/coding/python/__init__.py +5 -11
- langfun/core/coding/python/correction.py +37 -21
- langfun/core/coding/python/correction_test.py +29 -3
- langfun/core/coding/python/execution.py +40 -216
- langfun/core/coding/python/execution_test.py +29 -89
- langfun/core/coding/python/generation.py +21 -11
- langfun/core/coding/python/generation_test.py +2 -2
- langfun/core/coding/python/parsing.py +108 -193
- langfun/core/coding/python/parsing_test.py +2 -105
- langfun/core/component.py +63 -2
- langfun/core/component_test.py +53 -0
- langfun/core/concurrent.py +414 -117
- langfun/core/concurrent_test.py +111 -24
- langfun/core/console.py +17 -5
- langfun/core/console_test.py +17 -0
- langfun/core/eval/__init__.py +16 -1
- langfun/core/eval/base.py +622 -174
- langfun/core/eval/base_test.py +200 -54
- langfun/core/eval/matching.py +63 -76
- langfun/core/eval/matching_test.py +17 -8
- langfun/core/eval/patching.py +130 -0
- langfun/core/eval/patching_test.py +170 -0
- langfun/core/eval/scoring.py +26 -26
- langfun/core/eval/scoring_test.py +19 -2
- langfun/core/eval/v2/__init__.py +42 -0
- langfun/core/eval/v2/checkpointing.py +380 -0
- langfun/core/eval/v2/checkpointing_test.py +228 -0
- langfun/core/eval/v2/eval_test_helper.py +136 -0
- langfun/core/eval/v2/evaluation.py +725 -0
- langfun/core/eval/v2/evaluation_test.py +180 -0
- langfun/core/eval/v2/example.py +305 -0
- langfun/core/eval/v2/example_test.py +128 -0
- langfun/core/eval/v2/experiment.py +1048 -0
- langfun/core/eval/v2/experiment_test.py +433 -0
- langfun/core/eval/v2/metric_values.py +156 -0
- langfun/core/eval/v2/metric_values_test.py +80 -0
- langfun/core/eval/v2/metrics.py +357 -0
- langfun/core/eval/v2/metrics_test.py +203 -0
- langfun/core/eval/v2/progress.py +348 -0
- langfun/core/eval/v2/progress_test.py +82 -0
- langfun/core/eval/v2/progress_tracking.py +210 -0
- langfun/core/eval/v2/progress_tracking_test.py +66 -0
- langfun/core/eval/v2/reporting.py +270 -0
- langfun/core/eval/v2/reporting_test.py +158 -0
- langfun/core/eval/v2/runners.py +488 -0
- langfun/core/eval/v2/runners_test.py +334 -0
- langfun/core/langfunc.py +4 -17
- langfun/core/langfunc_test.py +22 -6
- langfun/core/language_model.py +577 -39
- langfun/core/language_model_test.py +470 -56
- langfun/core/llms/__init__.py +87 -16
- langfun/core/llms/anthropic.py +312 -87
- langfun/core/llms/anthropic_test.py +71 -3
- langfun/core/llms/cache/base.py +21 -2
- langfun/core/llms/cache/in_memory.py +13 -0
- langfun/core/llms/cache/in_memory_test.py +53 -2
- langfun/core/llms/compositional.py +101 -0
- langfun/core/llms/compositional_test.py +73 -0
- langfun/core/llms/deepseek.py +117 -0
- langfun/core/llms/deepseek_test.py +61 -0
- langfun/core/llms/fake.py +11 -7
- langfun/core/llms/fake_test.py +14 -0
- langfun/core/llms/gemini.py +507 -0
- langfun/core/llms/gemini_test.py +195 -0
- langfun/core/llms/google_genai.py +62 -218
- langfun/core/llms/google_genai_test.py +9 -202
- langfun/core/llms/groq.py +160 -144
- langfun/core/llms/groq_test.py +31 -137
- langfun/core/llms/llama_cpp.py +15 -42
- langfun/core/llms/llama_cpp_test.py +4 -30
- langfun/core/llms/openai.py +395 -203
- langfun/core/llms/openai_compatible.py +179 -0
- langfun/core/llms/openai_compatible_test.py +495 -0
- langfun/core/llms/openai_test.py +30 -395
- langfun/core/llms/rest.py +113 -0
- langfun/core/llms/rest_test.py +111 -0
- langfun/core/llms/vertexai.py +192 -0
- langfun/core/llms/vertexai_test.py +52 -0
- langfun/core/logging.py +284 -0
- langfun/core/logging_test.py +125 -0
- langfun/core/message.py +319 -9
- langfun/core/message_test.py +190 -13
- langfun/core/modalities/__init__.py +6 -2
- langfun/core/modalities/audio.py +30 -0
- langfun/core/modalities/audio_test.py +63 -0
- langfun/core/modalities/image.py +39 -20
- langfun/core/modalities/image_test.py +52 -9
- langfun/core/modalities/mime.py +206 -29
- langfun/core/modalities/mime_test.py +90 -9
- langfun/core/modalities/ms_office.py +117 -0
- langfun/core/modalities/ms_office_test.py +389 -0
- langfun/core/modalities/pdf.py +22 -0
- langfun/core/modalities/pdf_test.py +57 -0
- langfun/core/modalities/video.py +9 -26
- langfun/core/modalities/video_test.py +3 -3
- langfun/core/modality.py +26 -3
- langfun/core/modality_test.py +2 -2
- langfun/core/sampling.py +11 -11
- langfun/core/structured/__init__.py +12 -16
- langfun/core/structured/completion.py +32 -5
- langfun/core/structured/completion_test.py +7 -6
- langfun/core/structured/description.py +2 -2
- langfun/core/structured/description_test.py +3 -3
- langfun/core/structured/function_generation.py +60 -27
- langfun/core/structured/function_generation_test.py +72 -2
- langfun/core/structured/mapping.py +97 -47
- langfun/core/structured/mapping_test.py +90 -2
- langfun/core/structured/parsing.py +33 -21
- langfun/core/structured/parsing_test.py +53 -9
- langfun/core/structured/querying.py +746 -0
- langfun/core/structured/{prompting_test.py → querying_test.py} +469 -51
- langfun/core/structured/schema.py +204 -97
- langfun/core/structured/schema_generation.py +1 -1
- langfun/core/structured/schema_test.py +130 -29
- langfun/core/structured/scoring.py +125 -19
- langfun/core/structured/scoring_test.py +30 -0
- langfun/core/structured/tokenization.py +64 -0
- langfun/core/structured/tokenization_test.py +48 -0
- langfun/core/template.py +115 -1
- langfun/core/template_test.py +71 -1
- langfun/core/templates/conversation.py +9 -0
- langfun/core/templates/conversation_test.py +4 -3
- langfun/core/templates/selfplay_test.py +10 -2
- langfun-0.1.2.dev202501150804.dist-info/METADATA +225 -0
- langfun-0.1.2.dev202501150804.dist-info/RECORD +153 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501150804.dist-info}/WHEEL +1 -1
- langfun/core/coding/python/errors.py +0 -108
- langfun/core/coding/python/errors_test.py +0 -99
- langfun/core/coding/python/permissions.py +0 -90
- langfun/core/coding/python/permissions_test.py +0 -86
- langfun/core/structured/prompting.py +0 -238
- langfun/core/text_formatting.py +0 -162
- langfun/core/text_formatting_test.py +0 -47
- langfun-0.0.2.dev20240429.dist-info/METADATA +0 -100
- langfun-0.0.2.dev20240429.dist-info/RECORD +0 -108
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501150804.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501150804.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1048 @@
# Copyright 2024 The Langfun Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation experiment."""

import abc
import datetime
import functools
import hashlib
import inspect
import os
import re
from typing import Annotated, Any, Callable, Literal, Optional

import langfun.core as lf
from langfun.core.eval.v2 import example as example_lib
from langfun.core.eval.v2 import progress as progress_lib
import pyglove as pg


class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
  """Evaluation Experiment.

  # Experiment Structure

  An evaluation experiment is structured as a tree of evaluation tasks, where
  each task is represented as a node in the tree. Leaf tasks are instances of
  `Evaluation` with concrete hyper-parameter values. Nodes such as `Suite` and
  `Evaluation` that utilize `pg.oneof` are non-leaf tasks, as they represent
  multiple configurations. Leaf tasks can be retrieved using the property
  `leaf_nodes`, while non-leaf tasks can be retrieved using the property
  `nonleaf_nodes`. An experiment without any leaf tasks is considered empty.

  For example:

    ```
    Suite(
        MyEvaluation1(
            lm=pg.oneof([lm1, lm2]),
        ),
        Suite(
            MyEvaluation2(
                lm=lm1,
            ),
            MyEvaluation3(
                lm=lm2,
            ),
        )
    )
    ```

  In this example:

  - The two `Suite` nodes and the `MyEvaluation1` node (with `pg.oneof`) are
    non-leaf nodes, as they contain leaf tasks.
  - There are four leaf nodes: two under `MyEvaluation1`, corresponding to
    `MyEvaluation1` instances with `lm1` and `lm2` as hyper-parameters
    respectively, plus the `MyEvaluation2` and `MyEvaluation3` objects, which
    already have concrete hyper-parameter values.

  # Running an Experiment

  To run an experiment, users can call `Experiment.run`. This will execute the
  experiment using a specified `Runner` instance (e.g., 'parallel' or
  'sequential'). Progress and results will be periodically written to HTML
  files. Users can also assign an ID to each run, which identifies the output
  directory of that run.

  By default, the experiment will resume from the latest run under the root
  directory (using the ID 'latest'). Users can specify 'new' to start a fresh
  run, or provide a specific run ID (typically in the format
  `%Y%m%d_<number>`). Additionally, when initiating a new run, users may
  specify a `warm_start_from` directory to restore the experiment's state
  from a previous run.

  Examples:

    ```
    root_dir = '/path/to/experiment/root'

    # Resume the latest experiment run, or start a new run if none exists.
    experiment.run(root_dir)

    # Equivalent to:
    experiment.run(root_dir, 'latest')

    # Start a new, clean run.
    experiment.run(root_dir, 'new')

    # Start a new run with a warm start from another run located at
    # '/path/to/another/run' (e.g. /my_experiment/run_20241031_1).
    experiment.run(root_dir, 'new', warm_start_from='/path/to/another/run')

    # Resume run '20241031_1', re-running failed examples and recomputing
    # metrics as needed.
    experiment.run(root_dir, '20241031_1')

    # Reprocess the previous run located in 'run_20241031_1'.
    experiment.run(root_dir, '20241031_1', reprocess=True)
    ```

  # Experiment Registration and Lookup

  Experiments can be registered by setting a class-level `NAME` attribute.
  Users can then retrieve a registered experiment using
  `Experiment.find(name)`.

  For example:

    ```
    class MyEval(lf.eval.v2.Evaluation):
      NAME = 'my_eval'

    class MyEvalVariation1(MyEval):
      NAME = 'my_eval/gemini'
      lm = pg.oneof([lf.llms.GeminiPro(), lf.llms.GeminiFlash(), ...])

    class MyEvalVariation2(MyEval):
      NAME = 'my_eval/openai'
      lm = pg.oneof([lf.llms.Gpt4o(), lf.llms.Gpt4Turbo(), ...])

    # Run all experiments whose names match '.*/gemini'.
    experiment = Experiment.find('.*/gemini')
    experiment.run()

    # Run all experiments whose names match 'my_eval.*'.
    experiment = Experiment.find('my_eval.*')
    experiment.run()
    ```

  # Checkpointing

  Experiments support checkpointing, which is enabled by default. It allows
  users to resume their experiments from a saved state. When an experiment
  runs, it creates a new directory for that run and saves the current state
  to a checkpoint file. If the experiment is interrupted or fails, users can
  resume it by specifying the `id` or `warm_start_from` argument (shown above)
  to seamlessly continue from the previously saved state without starting
  over.

  # Monitoring and Reporting

  Evaluations can take considerable time to complete, so Langfun provides
  several tools to monitor progress. Progress bars display the status of each
  evaluation: HTML-based progress bars update in real time within Colab
  notebooks, while text-based progress bars appear in the terminal using tqdm.

  Additionally, Langfun generates HTML files at regular intervals to provide
  progress updates and detailed evaluation results. These files are saved in
  the evaluation's output directory, organized as follows:

    <root_dir>                 # Root directory of the experiment.
    |_ <run_id>                # Root directory of the current run.
       |_ summary.html         # Summary of the run. Updated every 60 seconds.
       |_ <experiment_cls>     # Directory of a particular experiment type.
          |_ <experiment_hash> # Directory of a particular experiment config.
             |_ index.html     # Experiment report. Updated every 60 seconds.
             |_ 1.html         # Detailed evaluation output of example 1.
             |_ 2.html         # Detailed evaluation output of example 2.
             |_ ...

  # Experiment Plugins

  Experiments can be extended with plugins, which can listen to the events of
  experiment execution and produce additional outputs. For example, a plugin
  can be added to an experiment to generate additional metrics or to save
  additional data to a database. More details will be added in the future.
  """

  #
  # Class-level functionalities.
  #

  # A globally unique string serving as a well-known name for an experiment,
  # which can be retrieved by `Experiment.find(name)`. If None, the experiment
  # does not have a well-known name, thus users need to create the experiment
  # by constructing it explicitly.
  NAME = None

  # Global registry for experiment classes, keyed by NAME.
  _NAME_TO_CLASS = {}

  def __init_subclass__(cls):
    super().__init_subclass__()

    if inspect.isabstract(cls):
      return

    if cls.NAME is not None:
      cls._NAME_TO_CLASS[cls.NAME] = cls

  @classmethod
  def find(cls, pattern: str) -> 'Experiment':
    """Finds an experiment by global name.

    Args:
      pattern: A regular expression to match the global names of registered
        experiments.

    Returns:
      An experiment object. If multiple experiments are found, a
      `Suite` of matched experiments will be returned. If no experiment is
      found, an empty `Suite` will be returned.
    """
    if pattern in cls._NAME_TO_CLASS:
      return cls._NAME_TO_CLASS[pattern]()
    regex = re.compile(pattern)
    selected = []
    for cls_name, exp_cls in cls._NAME_TO_CLASS.items():
      if regex.match(cls_name):
        selected.append(exp_cls())
    return selected[0] if len(selected) == 1 else Suite(selected)

  #
  # Instance-level functionalities.
  #

  progress: Annotated[
      progress_lib.Progress,
      'The progress of the experiment.'
  ] = progress_lib.Progress()

  usage_summary: Annotated[
      lf.UsageSummary,
      'The usage summary of the experiment.'
  ] = lf.UsageSummary()

  plugins: Annotated[
      list['Plugin'],
      (
          'Plugins for the current experiment, which can listen to the '
          'events of experiment execution and produce additional outputs.'
      )
  ] = []

  def _on_bound(self):
    super()._on_bound()
    self.__dict__.pop('hash', None)
    self.__dict__.pop('dir', None)
    self._reset()

  #
  # Identity of an experiment.
  #

  @property
  def id(self) -> str:
    """Returns the ID for this evaluation."""
    return f'{self.__class__.__name__}@{self.hash}'

  def definition(self, hide_default_values: bool = True) -> str:
    """Returns the definition of the experiment."""
    return self.format(
        compact=False,
        hide_default_values=hide_default_values,
        use_inferred=True,
        exclude_keys=('progress', 'usage_summary')
    )

  @functools.cached_property
  def hash(self) -> str:
    """An 8-character MD5 hash prefix computed from the experiment identity."""
    identity = self.format(
        compact=True, hide_default_values=True, use_inferred=True,
        exclude_keys=('plugins', 'progress', 'usage_summary')
    )
    return hashlib.md5(identity.encode()).hexdigest()[:8]

  @classmethod
  def link(cls, path: str) -> str:
    return f'file://{path}'

  #
  # Hierarchy of an experiment tree.
  #

  @property
  @abc.abstractmethod
  def children(self) -> list['Experiment']:
    """Returns the child experiments."""

  @property
  @abc.abstractmethod
  def is_leaf(self) -> bool:
    """Returns whether the experiment is a leaf node."""

  def empty(self) -> bool:
    """Returns whether the experiment is empty."""
    return not self.leaf_nodes

  @functools.cached_property
  def nodes(self) -> list['Experiment']:
    """Returns all the experiment nodes in the subtree (including self)."""
    nodes = [self]
    for child in self.children:
      nodes.extend(child.nodes)
    return nodes

  @functools.cached_property
  def leaf_nodes(self) -> list['Experiment']:
    """Returns the leaf nodes.

    The leaf nodes of an experiment are evaluable objects that have
    materialized hyper-parameters.
    """
    if self.is_leaf:
      return [self]

    nodes = []
    for child in self.children:
      nodes.extend(child.leaf_nodes)
    return nodes

  @functools.cached_property
  def nonleaf_nodes(self) -> list['Experiment']:
    """Returns the non-leaf nodes."""
    if self.is_leaf:
      return []
    nodes = [self]
    for child in self.children:
      nodes.extend(child.nonleaf_nodes)
    return nodes

  @functools.cached_property
  def parent(self) -> Optional['Experiment']:
    """Returns the parent experiment."""
    parent = self.sym_parent
    while parent is not None and not isinstance(parent, Experiment):
      parent = parent.sym_parent
    return parent

  def get(self, evaluation_id: str) -> Optional['Experiment']:
    """Returns the leaf experiment with the given ID."""
    for leaf in self.leaf_nodes:
      if leaf.id == evaluation_id:
        return leaf
    return None

  #
  # Mutable states during evaluation.
  #

  def reset(self) -> None:
    """Resets the experiment for a new run."""
    self.progress.reset()
    self.rebind(
        usage_summary=lf.UsageSummary(),
        skip_notification=True,
        raise_on_no_change=False
    )
    if self.is_leaf:
      self._reset()
    else:
      for child in self.children:
        child.reset()

  def _reset(self) -> None:
    """Subclass could override."""

  #
  # Helper methods for running the evaluation without explicitly creating the
  # runner.
  #

  def run(
      self,
      root_dir: str,
      id: str | None = None,  # pylint: disable=redefined-builtin
      *,
      runner: str = 'parallel',
      warm_start_from: str | None = None,
      filter: Callable[['Experiment'], bool] | None = None,  # pylint: disable=redefined-builtin
      example_ids: list[int] | None = None,
      raise_if_has_error: bool = False,
      reprocess: bool | list[int] = False,
      generate_example_html: Literal['new', 'all', 'no'] | list[int] = 'new',
      process_timeout: int | None = None,
      use_cache: Literal['global', 'per_dataset', 'no'] = 'per_dataset',
      note: str | None = None,
      tags: list[str] | None = None,
      plugins: list['Plugin'] | None = None,
      **kwargs
  ) -> 'Run':
    """Runs the experiment.

    Examples:
      # Start a new run under root_dir.
      experiment.run(root_dir, 'new')

      # Continue the latest experiment run.
      experiment.run(root_dir, 'latest')

      # Continue the latest experiment run or start a new run if it does not
      # exist.
      experiment.run(root_dir)

      # Start a new run and warm start from another run's directory
      # '/path/to/another/run_20241031_1/'.
      experiment.run(
          root_dir, 'new',
          warm_start_from='/path/to/another/run_20241031_1/'
      )

      # Reprocess previous run under sub-dir 'run_20241031_1'.
      experiment.run(root_dir, '20241031_1', reprocess=True)

    Args:
      root_dir: The root of the output directory of the experiment.
      id: The ID of the current run. It can be None, a special keyword
        'latest' or 'new', or a datetime string in format `%Y%m%d_<number>`
        (e.g. 20241031_1).
        If None, it will use the latest run ID under the root directory, or
        create a new run based on the current time if no previous run exists.
        If 'latest', it will use the latest run ID under the root directory.
        If 'new', it will create a new run ID based on the current time.
      runner: The runner to use. If None, it will use the default runner for
        the experiment.
      warm_start_from: The directory of a previous run to warm start from. If
        None, it will continue the experiment identified by `id` from where
        it left off. Otherwise, it will create a new experiment run by
        warm-starting from that directory.
      filter: A filter function to decide whether an experiment should be run
        or not.
      example_ids: The example IDs to run. If None, it will run all examples.
      raise_if_has_error: If True, it will raise an error if any example
        fails. Otherwise, it will continue and report the error in the
        output.
      reprocess: A boolean or a list of example IDs. If boolean, it indicates
        whether all the examples to be evaluated will be reprocessed,
        meaning that existing checkpoints will be ignored. If a list of
        example IDs, it indicates that only the specified examples will be
        reprocessed.
      generate_example_html: Among 'new', 'all', 'no' or a list of example
        IDs.
        If 'new', generate HTML files for all newly processed examples, and
        keep/copy existing HTML files for unchanged examples.
        If 'all', generate HTML files for all examples.
        If 'no', do not generate HTML files for any examples.
        If a list of example IDs, generate HTML files for the specified
        examples.
      process_timeout: The timeout in seconds for each process. If None, it
        will use the default timeout for the runner.
      use_cache: Whether to use LLM cache for the experiment.
        If 'global', it will use a global cache shared by all experiments.
        If 'per_dataset', it will use a cache dedicated to each dataset.
        If 'no', it will not use any cache.
      note: The note for the current run.
      tags: The tags for the current run.
      plugins: Runner plugins to use.
      **kwargs: Additional kwargs to pass to the runner.

    Returns:
      The current run.
    """
    if plugins is not None:
      kwargs['plugins'] = plugins
    runner = Runner.create(
        runner,
        current_run=Run(
            root_dir=root_dir,
            experiment=pg.Ref(self),
            id=RunId.from_id(id, root_dir),
            warm_start_from=warm_start_from,
            filter=filter,
            example_ids=example_ids,
            raise_if_has_error=raise_if_has_error,
            reprocess=reprocess,
            generate_example_html=generate_example_html,
            use_cache=use_cache,
            process_timeout=process_timeout,
            note=note,
            tags=tags or [],
        ),
        **kwargs
    )
    runner.run()
    return runner.current_run

  def run_preconfigured(
      self,
      root_dir: str | None = None,
      id: str | None = None,  # pylint: disable=redefined-builtin
      **kwargs
  ) -> 'Run':
    """Runs the experiment with pre-configured kwargs from `cls.RUN_ARGS`.

    This helper method allows users to configure run arguments as a part of
    the class.

    Args:
      root_dir: Root directory of the experiment.
      id: ID of the current run.
      **kwargs: Keyword arguments to override `RUN_ARGS`.

    Returns:
      The current run.
    """
    run_config = getattr(self, 'RUN_ARGS', {})
    run_config.update(kwargs)
    if root_dir is not None:
      run_config['root_dir'] = root_dir
    if id is not None:
      run_config['id'] = id
    return self.run(**run_config)

  #
  # HTML views.
  #

  def output_link(
      self,
      run: Optional['Run'], relative_path: str
  ) -> str | None:
    """Returns the output path of the experiment."""
    if run is None:
      return None
    return self.link(run.output_path_for(self, relative_path))

  def _html_tree_view_summary_title(
      self,
      current_run: Optional['Run'] = None,
      interactive: bool = True,
  ):
    title, link, dir_link = self.id, None, None
    if current_run is not None:
      dir_link = self.output_link(current_run, '')
      if self.is_leaf:
        link = self.output_link(current_run, 'index.html')
      elif self.parent is None:
        title = str(current_run.id)
        link = self.output_link(current_run, 'summary.html')
    return pg.Html.element(
        'div',
        [
            # Experiment ID.
            pg.views.html.controls.Label(
                title,
                link=link,
                tooltip=pg.format(  # pytype: disable=wrong-arg-types
                    self,
                    verbose=False,
                    use_inferred=True,
                    hide_default_values=True,
                    exclude_keys=(
                        'root_dir', 'plugins', 'progress', 'usage_summary'
                    ),
                ),
                css_classes=['experiment-name'],
            ),
            # Experiment directory (if root or leaf).
            pg.views.html.controls.Label(  # pylint: disable=g-long-ternary
                '[dir]',
                link=dir_link,
                css_classes=['experiment-dir'],
            ) if dir_link is not None else None,
            # Progress bar.
            self.progress.to_html(
                extra_flags=dict(interactive=interactive),
            ),
            # Usage summary.
            self.usage_summary.to_html(
                extra_flags=dict(as_badge=True, interactive=interactive)
            ),
        ],
        css_classes=['experiment-summary']
    )

  def _html_tree_view_summary(
      self,
      *,
      view,
      name: str | None = None,
      extra_flags: dict[str, Any] | None = None,
      **kwargs
  ):
    extra_flags = extra_flags or {}
    if not extra_flags.get('card_view', True):
      return None

    kwargs.pop('title', None)
    kwargs.pop('enable_key_tooltip', None)
    kwargs.pop('enable_summary_tooltip', None)
    return view.summary(
        self,
        name=name if self.is_leaf else None,
        title=self._html_tree_view_summary_title(
            extra_flags.get('current_run', None),
            extra_flags.get('interactive', True)
        ),
        enable_key_tooltip=False,
        enable_summary_tooltip=False,
        **kwargs
    )

  def _html_tree_view_content(
      self,
      *,
      view,
      collapse_level: int | None = 1,
      extra_flags: dict[str, Any],
      **kwargs):
    return pg.Html.element(
        'div',
        [
            c.to_html(
                collapse_level=view.get_collapse_level(
                    (collapse_level, -1), 0
                ),
                name=f'#{i + 1}',
                extra_flags=extra_flags,
                **view.get_passthrough_kwargs(**kwargs)
            )
            for i, c in enumerate(self.children)
        ],
    )

  def _html_tree_view_css_styles(self) -> list[str]:
    return super()._html_tree_view_css_styles() + [
        """
        .experiment-summary {
          display: inline-block;
          font-weight: normal;
        }
        .experiment-name {
          font-weight: bold;
        }
        .experiment-dir.label {
          color: revert;
          margin-left: 0px;
          padding: 2px;
        }
        .usage-summary-badge {
          margin-left: 10px;
        }
        body {
          font: normal 16px "Roboto","Noto",sans-serif;
        }
        """
    ]

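
# A minimal illustrative sketch of how `run_preconfigured` above picks up run
# arguments from a class-level `RUN_ARGS` dict. `MyEval` and its argument
# values are hypothetical; `Evaluation` refers to the subclass defined in
# `langfun/core/eval/v2/evaluation.py`:
#
#   class MyEval(Evaluation):
#     RUN_ARGS = dict(
#         runner='sequential',
#         use_cache='no',
#     )
#
#   # Equivalent to:
#   # MyEval().run('/tmp/my_eval', runner='sequential', use_cache='no')
#   MyEval().run_preconfigured(root_dir='/tmp/my_eval')
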
@pg.use_init_args(['children'])
class Suite(Experiment):
  """A suite of evaluations."""

  children: Annotated[
      list[Experiment], 'A list of child experiments.'
  ] = []

  @property
  def is_leaf(self) -> bool:
    """Returns whether the task is a leaf."""
    return False

class RunId(pg.Object):
  """Structured representation of an experiment run ID."""

  date: datetime.date
  number: int

  _REGEX = re.compile(r'^(\d{8})_(\d+)$')

  def dirname(self, root_dir: str | None = None) -> str:
    """Returns the directory name of the run ID."""
    dir_name = f'run_{self}'
    if root_dir is None:
      return dir_name
    return os.path.join(root_dir, dir_name)

  def __str__(self) -> str:
    """Returns the string representation of the run ID."""
    return f'{self.date.strftime("%Y%m%d")}_{self.number}'

  def __lt__(self, other: 'RunId') -> bool:
    """Returns whether the run ID is less than the other."""
    return self.date < other.date or (
        self.date == other.date and self.number < other.number
    )

  def __le__(self, other: 'RunId') -> bool:
    """Returns whether the run ID is less than or equal to the other."""
    return self == other or self < other

  def __gt__(self, other: 'RunId') -> bool:
    """Returns whether the run ID is greater than the other."""
    return other < self

  def __ge__(self, other: 'RunId') -> bool:
    """Returns whether the run ID is greater than or equal to the other."""
    return self == other or self > other

  def next(self) -> 'RunId':
    """Returns the next run ID."""
    return RunId(
        date=self.date,
        number=self.number + 1,
    )

  @classmethod
  def from_dirname(cls, dirname: str) -> Optional['RunId']:
    """Creates a run ID from the directory name."""
    if not dirname.startswith('run_'):
      return None
    run_id_str = dirname.removeprefix('run_')
    if cls.is_valid(run_id_str):
      return cls.from_id(run_id_str)
    return None

  @classmethod
  def is_valid(cls, run_id: str) -> bool:
    """Returns whether the run ID is valid."""
    return run_id in ('latest', 'new') or bool(cls._REGEX.match(run_id))

  @classmethod
  def from_id(
      cls,
      run_id: str | None,
      root_dir: str | None = None
  ) -> 'RunId':
    """Creates a run ID from the string representation."""
    if run_id is not None and not cls.is_valid(run_id):
      raise ValueError(
          '`run_id` must be one of `latest`, `new` or a '
          'datetime string in format `%Y%m%d_<number>` (e.g. 20240101_1). '
          f'Encountered: {run_id!r}.'
      )
    if run_id in (None, 'latest', 'new'):
      if root_dir is None:
        raise ValueError(
            '`root_dir` must be provided for `latest` or `new` run ID.'
        )
      if run_id == 'latest':
        run_id = cls.get_latest(root_dir)
        if run_id is None:
          raise ValueError(
              'There are no previous runs under the root directory: '
              f'{root_dir}. Consider running the experiment using `new` as '
              'id.'
          )
        return run_id
      if run_id == 'new':
        return cls.new(root_dir)
      return cls.get_latest(root_dir) or cls.new()

    assert run_id is not None
    date_str, number_str = run_id.split('_')
    return cls(
        date=datetime.datetime.strptime(date_str, '%Y%m%d').date(),
        number=int(number_str),
    )

  @classmethod
  def get_latest(cls, root_dir: str) -> Optional['RunId']:
    """Returns the latest run ID under the root directory."""
    if not pg.io.isdir(root_dir):
      return None
    run_ids = [
        RunId.from_dirname(dirname)
        for dirname in pg.io.listdir(root_dir)
    ]
    run_ids = [run_id for run_id in run_ids if run_id is not None]
    if not run_ids:
      return None
    return max(run_ids)

  @classmethod
  def new(cls, root_dir: str | None = None) -> 'RunId':
    """Creates a new run ID."""
    latest = None if root_dir is None else cls.get_latest(root_dir)
    if latest is not None and latest.date == datetime.date.today():
      return latest.next()
    return cls(
        date=datetime.date.today(),
        number=1,
    )

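
# Illustrative `RunId` usage (values in comments are hypothetical, assuming
# the latest run under '/path/to/root' is 'run_20241031_1'):
#
#   rid = RunId.from_id('20241031_1')      # RunId(date=2024-10-31, number=1)
#   str(rid)                               # -> '20241031_1'
#   rid.dirname('/path/to/root')           # -> '/path/to/root/run_20241031_1'
#   rid.next()                             # -> RunId for '20241031_2'
#   RunId.from_id('new', '/path/to/root')  # -> '20241031_2' if run the same
#                                          #    day; otherwise <today>_1
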
class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
  """A run of an experiment."""

  root_dir: Annotated[
      str,
      'The root of the output directory of the experiment.'
  ]

  id: Annotated[
      RunId,
      'The ID of the current run.'
  ]

  experiment: Annotated[
      Experiment,
      'The root experiment to run.'
  ]

  warm_start_from: Annotated[
      str | None,
      'The directory for a previous run to warm start from.'
  ] = None

  example_ids: Annotated[
      list[int] | None,
      'The example IDs to run. If None, it will run all examples.'
  ] = None

  raise_if_has_error: Annotated[
      bool,
      'If True, it will raise an error if any example fails.'
  ] = False

  note: Annotated[
      str | None,
      'The user note for the current run.'
  ] = None

  tags: Annotated[
      list[str],
      'The user tags for the current run.'
  ] = []

  reprocess: Annotated[
      bool | list[int],
      (
          'If True, it will reprocess all examples under the current '
          'run directory. If a list of integers, examples of the given IDs '
          'will be reprocessed.'
      )
  ] = False

  generate_example_html: Annotated[
      Literal['new', 'all', 'no'] | list[int],
      (
          'If "new", generate HTML files for all newly processed examples, '
          'and keep/copy existing HTML files for unchanged examples. '
          'If "all", generate HTML files for all examples. '
          'If "no", do not generate HTML files for any examples. '
          'If a list of example IDs, generate HTML files for the specified '
          'examples.'
      )
  ] = 'new'

  filter: Annotated[
      Callable[[Experiment], bool] | None,
      'A filter to decide whether a leaf experiment should be run or not.'
  ] = None

  process_timeout: Annotated[
      int | None,
      'Timeout for each evaluation example.'
  ] = None

  use_cache: Annotated[
      Literal['global', 'per_dataset', 'no'],
      (
          'The cache policy for the runner. If `global`, the runner will use '
          'the cache for all evaluations. If `per_dataset`, the runner will '
          'use the cache for each evaluation. If `no`, the runner will not '
          'use the cache.'
      )
  ] = 'per_dataset'

  @property
  def output_root(self) -> str:
    """Returns the root output directory of the run."""
    return self.id.dirname(self.root_dir)

  @property
  def input_root(self) -> str:
    """Returns the root input directory of the run."""
    return self.warm_start_from if self.warm_start_from else self.output_root

  def output_dir(self, experiment: Experiment) -> str:
    """Returns the output directory of the experiment."""
    if experiment.is_leaf:
      return os.path.join(self.output_root, experiment.id.replace('@', '/'))
    return self.output_root

  def input_dir(self, experiment: Experiment) -> str:
    """Returns the input directory of the experiment."""
    if experiment.is_leaf:
      return os.path.join(self.input_root, experiment.id.replace('@', '/'))
    return self.input_root

  def input_path_for(self, experiment: Experiment, relative_path: str) -> str:
    """Returns the input path for the experiment."""
    return os.path.join(self.input_dir(experiment), relative_path)

  def output_path_for(self, experiment: Experiment, relative_path: str) -> str:
    """Returns the output path for the experiment."""
    return os.path.join(self.output_dir(experiment), relative_path)

  def examples_to_evaluate(self, experiment: Experiment) -> set[int]:
    """Returns the example IDs to evaluate."""
    if not experiment.is_leaf:
      return set()
    return set(
        self.example_ids if self.example_ids else
        range(1, experiment.num_examples + 1)
    )

  def examples_to_reprocess(self, experiment: Experiment) -> set[int]:
    """Returns the example IDs to reprocess per request."""
    if not self.reprocess:
      return set()
    reprocess_ids = self.examples_to_evaluate(experiment)
    if isinstance(self.reprocess, list):
      reprocess_ids &= set(self.reprocess)
    return reprocess_ids

  def examples_to_load(self, experiment: Experiment) -> set[int]:
    """Returns the example IDs to load from checkpoint files."""
    load_ids = self.examples_to_evaluate(experiment)
    if isinstance(self.generate_example_html, list):
      load_ids |= set(self.generate_example_html)
    load_ids -= self.examples_to_reprocess(experiment)
    return load_ids

  def examples_to_load_metadata(self, experiment: Experiment) -> set[int]:
    """Returns the example IDs to load the metadata."""
    load_metadata_ids = set()
    if isinstance(self.generate_example_html, list):
      load_metadata_ids = set(self.generate_example_html)
    elif self.generate_example_html == 'all':
      load_metadata_ids = self.examples_to_evaluate(experiment)
    load_metadata_ids -= self.examples_to_reprocess(experiment)
    return load_metadata_ids

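
# A worked example of the set arithmetic in the `examples_to_*` methods
# above, assuming a leaf experiment with 10 examples, `example_ids=None`,
# `reprocess=[3, 5]` and `generate_example_html=[2]`:
#
#   examples_to_evaluate      -> {1, 2, ..., 10}
#   examples_to_reprocess     -> {1, ..., 10} & {3, 5}         == {3, 5}
#   examples_to_load          -> ({1, ..., 10} | {2}) - {3, 5}
#                                == {1, 2, 4, 6, 7, 8, 9, 10}
#   examples_to_load_metadata -> {2} - {3, 5}                  == {2}
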
class Runner(pg.Object):
  """Interface for experiment runners."""

  # Class-level variable for registering the runner.
  NAME = None

  _REGISTRY = {}

  current_run: Annotated[
      Run,
      'The current run.'
  ]

  plugins: Annotated[
      list['Plugin'],
      'The plugins for the runner.'
  ] = []

  def __init_subclass__(cls):
    super().__init_subclass__()
    if inspect.isabstract(cls):
      return
    if cls.NAME is None:
      raise ValueError(
          'Runner class must define a NAME constant, '
          'which is used to register and create the runner.'
      )
    cls._REGISTRY[cls.NAME] = cls

  @abc.abstractmethod
  def run(self) -> None:
    """Runs an evaluation task."""

  @classmethod
  def create(cls, runner: str, **kwargs) -> 'Runner':
    """Creates a runner instance by its registered NAME and kwargs."""
    return cls._REGISTRY[runner](**kwargs)

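
# A sketch of how a concrete runner registers itself through the NAME
# constant, so it can be selected with `experiment.run(..., runner='my_runner')`.
# `MyRunner` is hypothetical and its body is elided:
#
#   class MyRunner(Runner):
#     NAME = 'my_runner'
#
#     def run(self) -> None:
#       ...  # Evaluate self.current_run.experiment.leaf_nodes here.
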
class Plugin(lf.Component):
  """Base class for experiment plugins."""

  def on_run_start(
      self,
      runner: Runner,
      root: Experiment
  ) -> None:
    """Called when a runner is started."""

  def on_run_complete(
      self,
      runner: Runner,
      root: Experiment
  ) -> None:
    """Called when a runner is complete."""

  def on_run_abort(
      self,
      runner: Runner,
      root: Experiment,
      error: BaseException,
  ) -> None:
    """Called when a runner is aborted."""

  def on_experiment_start(
      self,
      runner: Runner,
      experiment: Experiment
  ) -> None:
    """Called when an evaluation is started."""

  def on_experiment_skipped(
      self,
      runner: Runner,
      experiment: Experiment
  ) -> None:
    """Called when an experiment (both leaf and non-leaf) is skipped."""

  def on_experiment_complete(
      self,
      runner: Runner,
      experiment: Experiment
  ) -> None:
    """Called when an experiment (both leaf and non-leaf) is complete."""

  def on_experiment_abort(
      self,
      runner: Runner,
      experiment: Experiment,
      error: BaseException,
  ) -> None:
    """Called when an experiment (both leaf and non-leaf) is aborted."""

  def on_example_start(
      self,
      runner: Runner,
      experiment: Experiment,
      example: example_lib.Example
  ) -> None:
    """Called when an example is about to be evaluated."""

  def on_example_complete(
      self,
      runner: Runner,
      experiment: Experiment,
      example: example_lib.Example
  ) -> None:
    """Called when an example is evaluated."""
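
# A minimal sketch of a custom plugin built on the callbacks above. It only
# uses names defined in this module and simply tallies completed examples;
# the class name and behavior are illustrative assumptions:
class ExampleCountingPlugin(Plugin):
  """Illustrative plugin that counts completed examples."""

  def _on_bound(self):
    super()._on_bound()
    self._completed = 0

  def on_example_complete(
      self,
      runner: Runner,
      experiment: Experiment,
      example: example_lib.Example
  ) -> None:
    """Tallies each evaluated example."""
    del runner, experiment, example  # Unused in this sketch.
    self._completed += 1


# Usage sketch: experiment.run(root_dir, plugins=[ExampleCountingPlugin()])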