langfun 0.0.2.dev20240429__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- langfun/__init__.py +20 -2
- langfun/core/__init__.py +16 -5
- langfun/core/agentic/__init__.py +30 -0
- langfun/core/agentic/action.py +854 -0
- langfun/core/agentic/action_eval.py +150 -0
- langfun/core/agentic/action_eval_test.py +109 -0
- langfun/core/agentic/action_test.py +136 -0
- langfun/core/coding/python/__init__.py +5 -11
- langfun/core/coding/python/correction.py +37 -21
- langfun/core/coding/python/correction_test.py +29 -3
- langfun/core/coding/python/execution.py +40 -216
- langfun/core/coding/python/execution_test.py +29 -89
- langfun/core/coding/python/generation.py +21 -11
- langfun/core/coding/python/generation_test.py +2 -2
- langfun/core/coding/python/parsing.py +108 -193
- langfun/core/coding/python/parsing_test.py +2 -105
- langfun/core/component.py +63 -2
- langfun/core/component_test.py +53 -0
- langfun/core/concurrent.py +414 -117
- langfun/core/concurrent_test.py +111 -24
- langfun/core/console.py +18 -5
- langfun/core/console_test.py +17 -0
- langfun/core/eval/__init__.py +16 -1
- langfun/core/eval/base.py +622 -174
- langfun/core/eval/base_test.py +200 -54
- langfun/core/eval/matching.py +63 -76
- langfun/core/eval/matching_test.py +17 -8
- langfun/core/eval/patching.py +130 -0
- langfun/core/eval/patching_test.py +170 -0
- langfun/core/eval/scoring.py +26 -26
- langfun/core/eval/scoring_test.py +19 -2
- langfun/core/eval/v2/__init__.py +42 -0
- langfun/core/eval/v2/checkpointing.py +380 -0
- langfun/core/eval/v2/checkpointing_test.py +228 -0
- langfun/core/eval/v2/eval_test_helper.py +136 -0
- langfun/core/eval/v2/evaluation.py +725 -0
- langfun/core/eval/v2/evaluation_test.py +180 -0
- langfun/core/eval/v2/example.py +305 -0
- langfun/core/eval/v2/example_test.py +128 -0
- langfun/core/eval/v2/experiment.py +1048 -0
- langfun/core/eval/v2/experiment_test.py +433 -0
- langfun/core/eval/v2/metric_values.py +156 -0
- langfun/core/eval/v2/metric_values_test.py +80 -0
- langfun/core/eval/v2/metrics.py +357 -0
- langfun/core/eval/v2/metrics_test.py +203 -0
- langfun/core/eval/v2/progress.py +348 -0
- langfun/core/eval/v2/progress_test.py +82 -0
- langfun/core/eval/v2/progress_tracking.py +210 -0
- langfun/core/eval/v2/progress_tracking_test.py +66 -0
- langfun/core/eval/v2/reporting.py +270 -0
- langfun/core/eval/v2/reporting_test.py +158 -0
- langfun/core/eval/v2/runners.py +488 -0
- langfun/core/eval/v2/runners_test.py +334 -0
- langfun/core/langfunc.py +4 -17
- langfun/core/langfunc_test.py +22 -6
- langfun/core/language_model.py +577 -39
- langfun/core/language_model_test.py +470 -56
- langfun/core/llms/__init__.py +87 -16
- langfun/core/llms/anthropic.py +312 -87
- langfun/core/llms/anthropic_test.py +71 -3
- langfun/core/llms/cache/base.py +21 -2
- langfun/core/llms/cache/in_memory.py +13 -0
- langfun/core/llms/cache/in_memory_test.py +53 -2
- langfun/core/llms/compositional.py +101 -0
- langfun/core/llms/compositional_test.py +73 -0
- langfun/core/llms/deepseek.py +117 -0
- langfun/core/llms/deepseek_test.py +61 -0
- langfun/core/llms/fake.py +11 -7
- langfun/core/llms/fake_test.py +14 -0
- langfun/core/llms/gemini.py +507 -0
- langfun/core/llms/gemini_test.py +195 -0
- langfun/core/llms/google_genai.py +62 -218
- langfun/core/llms/google_genai_test.py +9 -202
- langfun/core/llms/groq.py +160 -144
- langfun/core/llms/groq_test.py +31 -137
- langfun/core/llms/llama_cpp.py +15 -42
- langfun/core/llms/llama_cpp_test.py +4 -30
- langfun/core/llms/openai.py +395 -203
- langfun/core/llms/openai_compatible.py +179 -0
- langfun/core/llms/openai_compatible_test.py +495 -0
- langfun/core/llms/openai_test.py +30 -395
- langfun/core/llms/rest.py +113 -0
- langfun/core/llms/rest_test.py +111 -0
- langfun/core/llms/vertexai.py +192 -0
- langfun/core/llms/vertexai_test.py +52 -0
- langfun/core/logging.py +284 -0
- langfun/core/logging_test.py +125 -0
- langfun/core/message.py +319 -9
- langfun/core/message_test.py +190 -13
- langfun/core/modalities/__init__.py +6 -2
- langfun/core/modalities/audio.py +30 -0
- langfun/core/modalities/audio_test.py +63 -0
- langfun/core/modalities/image.py +39 -20
- langfun/core/modalities/image_test.py +52 -9
- langfun/core/modalities/mime.py +206 -29
- langfun/core/modalities/mime_test.py +90 -9
- langfun/core/modalities/ms_office.py +117 -0
- langfun/core/modalities/ms_office_test.py +389 -0
- langfun/core/modalities/pdf.py +22 -0
- langfun/core/modalities/pdf_test.py +57 -0
- langfun/core/modalities/video.py +9 -26
- langfun/core/modalities/video_test.py +3 -3
- langfun/core/modality.py +26 -3
- langfun/core/modality_test.py +2 -2
- langfun/core/sampling.py +11 -11
- langfun/core/structured/__init__.py +12 -16
- langfun/core/structured/completion.py +32 -5
- langfun/core/structured/completion_test.py +7 -6
- langfun/core/structured/description.py +2 -2
- langfun/core/structured/description_test.py +3 -3
- langfun/core/structured/function_generation.py +60 -27
- langfun/core/structured/function_generation_test.py +72 -2
- langfun/core/structured/mapping.py +97 -47
- langfun/core/structured/mapping_test.py +90 -2
- langfun/core/structured/parsing.py +33 -21
- langfun/core/structured/parsing_test.py +53 -9
- langfun/core/structured/querying.py +746 -0
- langfun/core/structured/{prompting_test.py → querying_test.py} +469 -51
- langfun/core/structured/schema.py +204 -97
- langfun/core/structured/schema_generation.py +1 -1
- langfun/core/structured/schema_test.py +130 -29
- langfun/core/structured/scoring.py +125 -19
- langfun/core/structured/scoring_test.py +30 -0
- langfun/core/structured/tokenization.py +64 -0
- langfun/core/structured/tokenization_test.py +48 -0
- langfun/core/template.py +115 -1
- langfun/core/template_test.py +71 -1
- langfun/core/templates/conversation.py +9 -0
- langfun/core/templates/conversation_test.py +4 -3
- langfun/core/templates/selfplay_test.py +10 -2
- langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
- langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
- langfun/core/coding/python/errors.py +0 -108
- langfun/core/coding/python/errors_test.py +0 -99
- langfun/core/coding/python/permissions.py +0 -90
- langfun/core/coding/python/permissions_test.py +0 -86
- langfun/core/structured/prompting.py +0 -238
- langfun/core/text_formatting.py +0 -162
- langfun/core/text_formatting_test.py +0 -47
- langfun-0.0.2.dev20240429.dist-info/METADATA +0 -100
- langfun-0.0.2.dev20240429.dist-info/RECORD +0 -108
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/runners.py (new file)
@@ -0,0 +1,488 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluation experiment runners."""
+import abc
+import collections
+import concurrent.futures
+import random
+import threading
+import time
+import traceback
+from typing import Any, Annotated, Callable, Iterator
+
+from langfun import core as lf
+from langfun.core.eval.v2 import checkpointing
+from langfun.core.eval.v2 import evaluation as evaluation_lib
+from langfun.core.eval.v2 import example as example_lib
+from langfun.core.eval.v2 import experiment as experiment_lib
+from langfun.core.eval.v2 import progress_tracking
+from langfun.core.eval.v2 import reporting
+from langfun.core.llms.cache import in_memory
+import pyglove as pg
+
+Runner = experiment_lib.Runner
+Example = example_lib.Example
+Evaluation = evaluation_lib.Evaluation
+Experiment = experiment_lib.Experiment
+Plugin = experiment_lib.Plugin
+
+
+_RUN_MANIFEST = 'run.json'
+
+
+class RunnerBase(Runner):
+  """A simple runner that runs evaluations and their examples sequentially."""
+
+  tqdm: Annotated[
+      bool,
+      (
+          'If True, force using tqdm for progress update. Otherwise, determine '
+          'it automatically based on the running environment (console vs. '
+          'notebook)'
+      )
+  ] = False
+
+  plugins = [
+      checkpointing.BulkCheckpointer(),
+      reporting.HtmlReporter(),
+  ]
+
+  def _on_bound(self):
+    super()._on_bound()
+
+    # Install the tqdm plugin if needed.
+    with pg.notify_on_change(False):
+      self.plugins.append(progress_tracking.progress_tracker(self.tqdm))
+
+    self._io_pool_lock = threading.Lock()
+    self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
+    # TODO(daiyip): render background errors.
+    self._background_last_error = None
+
+  def background_run(self, func: Callable[..., Any], *args, **kwargs) -> None:
+    """Runs the function with the IO pool."""
+    def _background_run(*args, **kwargs):
+      try:
+        func(*args, **kwargs)
+      except Exception as e:  # pylint: disable=broad-except
+        self._background_last_error = e
+
+    with self._io_pool_lock:
+      if self._io_pool is not None:
+        self._io_pool.submit(_background_run, *args, **kwargs)
+
+  def _all_plugins(self, experiment: Experiment) -> Iterator[Plugin]:
+    """Returns all plugins for the experiment."""
+    for plugin in self.plugins:
+      yield plugin
+    for plugin in experiment.plugins:
+      yield plugin
+
+  #
+  # IO operations for saving running files.
+  #
+
+  def _save_run_manifest(self) -> None:
+    def _save():
+      pg.symbolic.deref(self.current_run.clone(), recursive=True).save(
+          self.current_run.output_path_for(
+              self.current_run.experiment, _RUN_MANIFEST
+          ),
+          hide_default_values=True
+      )
+    self.background_run(_save)
+
+  def on_run_start(self) -> None:
+    """Called when a runner is started."""
+    self._save_run_manifest()
+
+    for plugin in self._all_plugins(self.current_run.experiment):
+      plugin.on_run_start(self, self.current_run.experiment)
+
+  def on_run_complete(self) -> None:
+    """Called when a runner is complete."""
+    for plugin in self._all_plugins(self.current_run.experiment):
+      plugin.on_run_complete(self, self.current_run.experiment)
+
+  def on_run_abort(self, error: Exception) -> None:
+    """Called when a runner is aborted."""
+    for plugin in self._all_plugins(self.current_run.experiment):
+      plugin.on_run_abort(self, self.current_run.experiment, error)
+
+  def on_experiment_start(self, experiment: Experiment) -> None:
+    """Called when an evaluation is started."""
+    # Start the progress of the evaluation.
+    num_examples_to_evaluate = 0
+    if experiment.is_leaf:
+      assert isinstance(experiment, Evaluation)
+      num_examples_to_evaluate = (
+          len(self.current_run.example_ids)
+          if self.current_run.example_ids else experiment.num_examples
+      )
+      experiment.progress.start(total=num_examples_to_evaluate)
+    else:
+      experiment.progress.start(total=len(experiment.leaf_nodes))
+
+    # Notify the plugins of the experiment start.
+    for plugin in self._all_plugins(experiment):
+      plugin.on_experiment_start(self, experiment)
+
+    if experiment.is_leaf:
+      experiment.info(
+          f'Starting evaluation {experiment.id!r} with '
+          f'{num_examples_to_evaluate} examples to evaluate.'
+      )
+
+  def on_experiment_skipped(self, experiment: Experiment) -> None:
+    """Called when an evaluation is skipped."""
+    # Skip event will only be triggered for leaf evaluations.
+    assert experiment.is_leaf
+    experiment.progress.start(total=1)
+    experiment.progress.increment_skipped(1)
+
+    # Notify the plugins of the experiment skip.
+    for plugin in self._all_plugins(experiment):
+      plugin.on_experiment_skipped(self, experiment)
+
+    # Only leaf evaluations will trigger the complete notification of the
+    # ancestors.
+    self._update_ancestor_progresses(experiment)
+
+  def on_experiment_complete(self, experiment: Experiment) -> None:
+    """Called when an evaluation is complete."""
+    progress = experiment.progress
+    progress.stop()
+
+    # Notify the plugins of the experiment complete.
+    for plugin in self._all_plugins(experiment):
+      plugin.on_experiment_complete(self, experiment)
+
+    # Only leaf evaluations will trigger the complete notification of the
+    # ancestors.
+    if experiment.is_leaf:
+      self._update_ancestor_progresses(experiment)
+      self._log_experiment_completion(experiment)
+
+  def _log_experiment_completion(self, experiment: Experiment):
+    example_ids = (
+        self.current_run.example_ids if self.current_run.example_ids else
+        list(range(1, experiment.num_examples + 1))
+    )
+    num_from_checkpoint, num_processed = 0, 0
+    for example_id in example_ids:
+      example = experiment.state.get(example_id)
+      if example.newly_processed:
+        num_processed += 1
+      else:
+        num_from_checkpoint += 1
+    experiment.info(
+        f'{experiment.id} completed with {num_from_checkpoint + num_processed} '
+        f'examples evaluated ({num_from_checkpoint} from checkpoint, '
+        f'{num_processed} newly processed).'
+    )
+
+  def on_experiment_abort(
+      self, experiment: Experiment, error: BaseException) -> None:
+    """Called when an evaluation is complete."""
+    assert experiment.is_leaf
+    experiment.fatal(f'{error}\n\n{traceback.format_exc()}')
+
+    # Notify the plugins of the experiment abort.
+    for plugin in self._all_plugins(experiment):
+      plugin.on_experiment_abort(self, experiment, error)
+
+  def _update_ancestor_progresses(self, experiment: Experiment):
+    """Updates the progresses of the parent nodes of the experiment."""
+    parent = experiment.parent
+    progress = experiment.progress
+    while parent is not None:
+      parent_progress = parent.progress
+      if progress.is_failed:
+        parent_progress.increment_failed()
+      elif progress.is_skipped:
+        parent_progress.increment_skipped()
+      else:
+        # A evaluation could be considered as done if it has processed all the
+        # examples specified by `example_ids`.
+        assert progress.is_completed
+        parent_progress.increment_processed()
+
+      if parent_progress.is_completed:
+        self.on_experiment_complete(parent)
+      elif parent_progress.is_skipped:
+        self.on_experiment_skipped(parent)
+      parent = parent.parent
+
+  def on_example_start(
+      self,
+      experiment: Experiment,
+      example: Example
+  ) -> None:
+    """Called when an evaluation example is started."""
+    for plugin in self._all_plugins(experiment):
+      plugin.on_example_start(self, experiment, example)
+
+  def on_example_complete(
+      self,
+      experiment: Experiment,
+      example: Example
+  ) -> None:
+    """Called when an evaluation example is complete."""
+    if example.newly_processed:
+      if example.error is None:
+        experiment.progress.increment_processed()
+      else:
+        experiment.progress.increment_failed()
+    else:
+      experiment.progress.increment_skipped()
+
+    experiment.usage_summary.merge(example.usage_summary)
+    experiment.progress.update_execution_summary(example.execution_status)
+
+    parent = experiment.parent
+    while parent is not None:
+      parent.usage_summary.merge(example.usage_summary)
+      parent = parent.parent
+
+    for plugin in self._all_plugins(experiment):
+      plugin.on_example_complete(self, experiment, example)
+
+  def run(self) -> None:
+    """Runs the experiment."""
+    # Resets the experiment before getting start.
+    for node in self.current_run.experiment.nodes:
+      node.reset()
+
+    # Start the run.
+    self.on_run_start()
+    cache = None
+
+    try:
+      # Start the non-leaf nodes.
+      for node in self.current_run.experiment.nonleaf_nodes:
+        self.on_experiment_start(node)
+
+      # Skip evaluations if needed.
+      if self.current_run.filter is not None:
+        targets = []
+        for evaluation in self.current_run.experiment.leaf_nodes:
+          if self.current_run.filter(evaluation):
+            targets.append(evaluation)
+          else:
+            self.on_experiment_skipped(evaluation)
+      else:
+        targets = self.current_run.experiment.leaf_nodes
+
+      # Prepare the global cache if needed.
+      global_settings = {}
+      if self.current_run.use_cache == 'global':
+        cache = self._load_or_create_cache(self.current_run.experiment)
+        global_settings['cache'] = cache
+
+      # Evaluate the leaf evaluations if not skipped.
+      with lf.use_settings(**global_settings):
+        self._run(targets)
+
+      self.on_run_complete()
+    except Exception as e:  # pylint: disable=broad-except
+      self.on_run_abort(e)
+      raise e
+    finally:
+      if cache is not None:
+        self.background_run(cache.save)
+
+      # Wait for the background tasks to finish.
+      with self._io_pool_lock:
+        self._io_pool, io_pool = None, self._io_pool
+      io_pool.shutdown(wait=True)
+
+  @abc.abstractmethod
+  def _run(self, evaluations: list[Evaluation]) -> None:
+    """Runs multiple evaluations."""
+
+  def run_evaluation(self, evaluation: Evaluation) -> None:
+    """Runs the evaluation."""
+    try:
+      self.on_experiment_start(evaluation)
+
+      per_evaluation_settings = {}
+      cache = None
+      if self.current_run.use_cache == 'per_dataset':
+        cache = self._load_or_create_cache(evaluation)
+        per_evaluation_settings['cache'] = cache
+
+      with lf.use_settings(**per_evaluation_settings):
+        if self.current_run.example_ids is None:
+          items = (
+              Example(id=i + 1, input=ex) for i, ex in enumerate(
+                  evaluation.example_inputs)
+          )
+        else:
+          items = (
+              Example(
+                  id=example_id,
+                  input=evaluation.example_input_by_id(example_id)
+              ) for example_id in self.current_run.example_ids
+          )
+        self._evaluate_items(evaluation, items)
+
+      if cache:
+        self.background_run(cache.save)
+      self.on_experiment_complete(evaluation)
+    except BaseException as e:  # pylint: disable=broad-except
+      self.on_experiment_abort(evaluation, e)
+      raise e
+
+  @abc.abstractmethod
+  def _evaluate_items(
+      self, evaluation: Evaluation, items: Iterator[Example]
+  ) -> None:
+    """Evaluates the items of an evaluation."""
+
+  def evaluate_item(
+      self,
+      evaluation: Evaluation,
+      item: Example
+  ) -> Example:
+    """Runs the evaluation example."""
+    self.on_example_start(evaluation, item)
+    item = evaluation.evaluate(
+        item, raise_if_has_error=self.current_run.raise_if_has_error
+    )
+    self.on_example_complete(evaluation, item)
+    return item
+
+  def _load_or_create_cache(self, experiment: Experiment) -> lf.LMCache | None:
+    """Loads or creates the cache."""
+    return in_memory.InMemory(
+        self.current_run.output_path_for(experiment, 'cache.json')
+    )
+
+
+class SequentialRunner(RunnerBase):
+  """Sequential runner.
+
+  Sequential runner runs all evaluations and their examples in sequence,
+  as well as the background tasks, it allows the developer to catch all
+  exceptions thrown from the background tasks, making it easier to debug.
+  """
+
+  NAME = 'sequential'
+
+  def background_run(
+      self, func: Callable[..., Any], *args: Any, **kwargs: Any
+  ) -> None:
+    """Runs the function with the IO pool."""
+    func(*args, **kwargs)
+
+  def _run(self, evaluations: list[Evaluation]) -> None:
+    """Runs the experiment in sequence."""
+    for e in evaluations:
+      self.run_evaluation(e)
+
+  def _evaluate_items(
+      self, evaluation: Evaluation, items: Iterator[Example]
+  ) -> None:
+    """Runs the evaluation items in sequence."""
+    for item in items:
+      self.evaluate_item(evaluation, item)
+
+
+class DebugRunner(SequentialRunner):
+  """Debug runner."""
+
+  NAME = 'debug'
+
+  # Do not use the checkpointer for debug runner.
+  plugins = []
+
+  def _on_bound(self):
+    super()._on_bound()
+    if self.current_run.example_ids is None:
+      self.current_run.rebind(example_ids=[1], skip_notification=True)
+    self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
+
+  def _save_run_manifest(self) -> None:
+    """Do nothing to avoid overriden existing runs."""
+
+
+class ParallelRunner(RunnerBase):
+  """Parallel runner."""
+
+  NAME = 'parallel'
+
+  timeout: Annotated[
+      int | None,
+      'Timeout for each evaluation example.'
+  ] = None
+
+  concurrent_startup_delay: Annotated[
+      tuple[int, int] | None,
+      (
+          'A range of seconds to delay the initial evaluation of each thread '
+          'in the thread pool, helping to prevent a burst in LLM QPS at '
+          'startup. If set to None, no delay will be applied.'
+      )
+  ] = None

+  def _run(self, evaluations: list[Evaluation]) -> None:
+    """Runs the evaluations in parallel."""
+    def _run_group(evaluation_group: list[Evaluation]):
+      for e in evaluation_group:
+        self.run_evaluation(e)
+
+    # Run evaluations in parallel groupped by resource key.
+    groups: dict[str, list[Evaluation]] = collections.defaultdict(list)
+    for e in evaluations:
+      resource_ids = e.resource_ids()
+      if not resource_ids:
+        group_id = e.id
+      else:
+        # TODO(daiyip): support group that requires multiple resources.
+        group_id = resource_ids.pop()
+      groups[group_id].append(e)
+
+    for _, _, _ in lf.concurrent_map(
+        _run_group,
+        groups.values(),
+        max_workers=max(64, len(groups)),
+        timeout=self.timeout,
+        silence_on_errors=None,
+    ):
+      pass
+
+  def _evaluate_items(
+      self, evaluation: Evaluation, items: Iterator[Example]
+  ) -> None:
+    """Override run items to run in parallel."""
+    if self.concurrent_startup_delay is not None:
+      thread_delayed = {}
+      def _evaluate_item(item: Example):
+        thread_id = threading.current_thread().ident
+        if thread_id not in thread_delayed:
+          thread_delayed[thread_id] = True
+          time.sleep(random.randint(*self.concurrent_startup_delay))
+        return self.evaluate_item(evaluation, item)
+    else:
+      def _evaluate_item(item: Example):
+        return self.evaluate_item(evaluation, item)
+
+    for _, _, _ in lf.concurrent_map(
+        _evaluate_item,
+        items,
+        max_workers=evaluation.max_workers,
+        timeout=self.timeout,
+        silence_on_errors=None,
+    ):
+      pass
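The added `runners.py` defines the runner hierarchy for eval v2: `RunnerBase` wires plugins, progress propagation, caching, and background IO, while `SequentialRunner`, `DebugRunner`, and `ParallelRunner` decide how evaluations and their examples are scheduled by overriding the two abstract hooks `_run` and `_evaluate_items`. As a rough illustration of that extension point, here is a minimal sketch of a hypothetical custom runner. The class name `ShuffledRunner` and its shuffling behavior are invented for illustration, and whether setting `NAME` alone is enough to register the runner for lookup is an assumption; only the overridden hooks and the `run_evaluation` / `evaluate_item` helpers they call come from the code above.

```python
# Minimal sketch of a hypothetical custom runner built on the RunnerBase
# interface from runners.py above. ShuffledRunner is illustrative only.
import random

from langfun.core.eval.v2 import runners


class ShuffledRunner(runners.RunnerBase):
  """Runs evaluations one by one, visiting examples in random order."""

  # Assumption: built-in runners expose a NAME ('sequential', 'debug',
  # 'parallel'); we mirror that convention for selection by name.
  NAME = 'shuffled'

  def _run(self, evaluations):
    # Same scheduling as SequentialRunner: one evaluation after another.
    for evaluation in evaluations:
      self.run_evaluation(evaluation)

  def _evaluate_items(self, evaluation, items):
    # Materialize the generator of Examples, then evaluate them in a
    # randomized order via the evaluate_item() helper from RunnerBase.
    items = list(items)
    random.shuffle(items)
    for item in items:
      self.evaluate_item(evaluation, item)
```

Judging from the `NAME` constants, the built-in runners are addressed by name; `DebugRunner` additionally drops the checkpointing/reporting plugins, limits the run to a single example id, and forces `raise_if_has_error`, which makes it the natural choice when stepping through one example.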