langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512040805__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +1 -1
- langfun/core/__init__.py +7 -1
- langfun/core/agentic/__init__.py +8 -1
- langfun/core/agentic/action.py +740 -112
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +189 -24
- langfun/core/async_support.py +104 -5
- langfun/core/async_support_test.py +23 -0
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +9 -2
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +11 -2
- langfun/core/data/conversion/gemini_test.py +48 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +48 -44
- langfun/core/eval/base_test.py +5 -5
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +2 -0
- langfun/core/eval/v2/checkpointing.py +76 -7
- langfun/core/eval/v2/checkpointing_test.py +9 -2
- langfun/core/eval/v2/config_saver.py +37 -0
- langfun/core/eval/v2/config_saver_test.py +36 -0
- langfun/core/eval/v2/eval_test_helper.py +104 -3
- langfun/core/eval/v2/evaluation.py +92 -17
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +50 -40
- langfun/core/eval/v2/example_test.py +16 -8
- langfun/core/eval/v2/experiment.py +84 -15
- langfun/core/eval/v2/experiment_test.py +19 -0
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +31 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking.py +13 -5
- langfun/core/eval/v2/progress_tracking_test.py +9 -1
- langfun/core/eval/v2/reporting.py +90 -71
- langfun/core/eval/v2/reporting_test.py +24 -6
- langfun/core/eval/v2/runners/__init__.py +30 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +72 -180
- langfun/core/eval/v2/runners/beam.py +354 -0
- langfun/core/eval/v2/runners/beam_test.py +153 -0
- langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
- langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +76 -0
- langfun/core/eval/v2/runners/parallel.py +243 -0
- langfun/core/eval/v2/runners/parallel_test.py +182 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +169 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +7 -5
- langfun/core/language_model.py +189 -36
- langfun/core/language_model_test.py +54 -3
- langfun/core/llms/__init__.py +12 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +64 -12
- langfun/core/llms/gemini_test.py +110 -0
- langfun/core/llms/google_genai.py +34 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +120 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +16 -1
- langfun/core/llms/vertexai.py +58 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/__init__.py +10 -0
- langfun/core/mcp/client.py +177 -0
- langfun/core/mcp/client_test.py +71 -0
- langfun/core/mcp/session.py +241 -0
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/testing/simple_mcp_client.py +33 -0
- langfun/core/mcp/testing/simple_mcp_server.py +33 -0
- langfun/core/mcp/tool.py +254 -0
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +73 -3
- langfun/core/modalities/image_test.py +116 -0
- langfun/core/modalities/mime.py +64 -3
- langfun/core/modalities/mime_test.py +11 -0
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +230 -154
- langfun/core/structured/querying_test.py +69 -33
- langfun/core/structured/schema/__init__.py +49 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +175 -50
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +43 -0
- langfun/env/base_environment.py +827 -0
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +304 -0
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +842 -0
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +14 -0
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +472 -0
- langfun/env/event_handlers/event_logger_test.py +304 -0
- langfun/env/event_handlers/metric_writer.py +726 -0
- langfun/env/event_handlers/metric_writer_test.py +214 -0
- langfun/env/interface.py +1640 -0
- langfun/env/interface_test.py +153 -0
- langfun/env/load_balancers.py +59 -0
- langfun/env/load_balancers_test.py +141 -0
- langfun/env/test_utils.py +507 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/METADATA +7 -3
- langfun-0.1.2.dev202512040805.dist-info/RECORD +217 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/{runners.py → runners/base.py}

```diff
@@ -11,18 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+"""Base experiment runner."""
+
 import abc
-import collections
 import concurrent.futures
 import random
 import threading
-import time
 import traceback
-from typing import Any, Annotated, Callable, Iterator
+from typing import Any, Annotated, Callable, Iterator, Literal
 
 from langfun import core as lf
 from langfun.core.eval.v2 import checkpointing
+from langfun.core.eval.v2 import config_saver
 from langfun.core.eval.v2 import evaluation as evaluation_lib
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import experiment as experiment_lib
@@ -38,35 +38,57 @@ Experiment = experiment_lib.Experiment
 Plugin = experiment_lib.Plugin
 
 
-_RUN_MANIFEST = 'run.json'
-
-
 class RunnerBase(Runner):
-  """
+  """Base class for runners with plugin support and IO pooling.
+
+  `RunnerBase` provides the basic runner functionalities such as plugin
+  integration for checkpointing, reporting and progress tracking.
+  It also manages a thread pool for background IO operations.
+  Subclasses should implement `_run` and `_evaluate_items` for different
+  execution strategies.
+  """
 
-
-
+  progress_tracker: Annotated[
+      Literal['tqdm', 'html', 'auto', None],
       (
-          'If
-          '
-          '
+          'If `tqdm`, force using tqdm for progress update. '
+          'If `html`, force using html for progress update. '
+          'If `auto`, determine it automatically based on the running '
+          'environment (console vs. notebook)'
+          'If `none`, disable progress update.'
       )
-  ] =
+  ] = 'auto'
 
   plugins = [
       checkpointing.BulkCheckpointer(),
       reporting.HtmlReporter(),
+      config_saver.RunConfigSaver(),
   ]
 
+  max_background_threads: Annotated[
+      int,
+      'Max number of background threads for IO operations.'
+  ] = 128
+
   def _on_bound(self):
     super()._on_bound()
 
     # Install the tqdm plugin if needed.
-
-
+    if self.progress_tracker is not None:
+      with pg.notify_on_change(False):
+        self.plugins.append(
+            progress_tracking.progress_tracker(self.progress_tracker)
+        )
+
+    if self.max_background_threads > 0:
+      self._io_pool_lock = threading.Lock()
+      self._io_pool = concurrent.futures.ThreadPoolExecutor(
+          max_workers=self.max_background_threads
+      )
+    else:
+      self._io_pool_lock = None
+      self._io_pool = None
 
-    self._io_pool_lock = threading.Lock()
-    self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
     # TODO(daiyip): render background errors.
     self._background_last_error = None
 
@@ -78,9 +100,12 @@ class RunnerBase(Runner):
       except Exception as e:  # pylint: disable=broad-except
         self._background_last_error = e
 
-
-
-    self._io_pool
+    if self.max_background_threads > 0:
+      with self._io_pool_lock:
+        if self._io_pool is not None:
+          self._io_pool.submit(_background_run, *args, **kwargs)
+    else:
+      _background_run(*args, **kwargs)
 
   def _all_plugins(self, experiment: Experiment) -> Iterator[Plugin]:
     """Returns all plugins for the experiment."""
```
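The hunks above replace the old fixed 16-worker IO pool with a configurable one: when `max_background_threads > 0`, `background_run` submits work to a lock-guarded `ThreadPoolExecutor`; when it is 0, the callable runs inline in the caller's thread, so IO errors surface immediately. A standalone sketch of that pool-or-inline pattern, using only the standard library (the class and method names below are illustrative, not from langfun):

```python
# Sketch of the pool-or-inline pattern used by RunnerBase above; not package code.
import concurrent.futures
import threading
from typing import Any, Callable, Optional


class BackgroundIO:
  """Runs IO callables in a bounded thread pool, or inline when disabled."""

  def __init__(self, max_threads: int = 128) -> None:
    self._lock = threading.Lock()
    self._pool: Optional[concurrent.futures.ThreadPoolExecutor] = (
        concurrent.futures.ThreadPoolExecutor(max_workers=max_threads)
        if max_threads > 0 else None
    )

  def run(self, func: Callable[..., Any], *args: Any, **kwargs: Any) -> None:
    """Submits `func` to the pool, or calls it inline when no pool exists."""
    with self._lock:
      if self._pool is not None:
        self._pool.submit(func, *args, **kwargs)
        return
    func(*args, **kwargs)  # Inline: exceptions propagate to the caller.

  def shutdown(self) -> None:
    """Detaches the pool under the lock, then waits for pending IO."""
    with self._lock:
      pool, self._pool = self._pool, None
    if pool is not None:
      pool.shutdown(wait=True)
```

Setting `max_background_threads` to 0 gives the same fail-fast behavior that the old `SequentialRunner.background_run` override (removed later in this diff) achieved by calling the function directly.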
```diff
@@ -89,24 +114,8 @@ class RunnerBase(Runner):
     for plugin in experiment.plugins:
       yield plugin
 
-  #
-  # IO operations for saving running files.
-  #
-
-  def _save_run_manifest(self) -> None:
-    def _save():
-      pg.symbolic.deref(self.current_run.clone(), recursive=True).save(
-          self.current_run.output_path_for(
-              self.current_run.experiment, _RUN_MANIFEST
-          ),
-          hide_default_values=True
-      )
-    self.background_run(_save)
-
   def on_run_start(self) -> None:
     """Called when a runner is started."""
-    self._save_run_manifest()
-
     for plugin in self._all_plugins(self.current_run.experiment):
       plugin.on_run_start(self, self.current_run.experiment)
 
@@ -126,9 +135,8 @@ class RunnerBase(Runner):
     num_examples_to_evaluate = 0
     if experiment.is_leaf:
       assert isinstance(experiment, Evaluation)
-      num_examples_to_evaluate = (
-
-          if self.current_run.example_ids else experiment.num_examples
+      num_examples_to_evaluate = len(
+          self.current_run.examples_to_evaluate(experiment)
       )
       experiment.progress.start(total=num_examples_to_evaluate)
     else:
@@ -139,6 +147,7 @@
       plugin.on_experiment_start(self, experiment)
 
     if experiment.is_leaf:
+      pg.io.mkdirs(self.current_run.output_dir(experiment))
       experiment.info(
           f'Starting evaluation {experiment.id!r} with '
           f'{num_examples_to_evaluate} examples to evaluate.'
@@ -180,10 +189,7 @@
     self._log_experiment_completion(experiment)
 
   def _log_experiment_completion(self, experiment: Experiment):
-    example_ids = (
-        self.current_run.example_ids if self.current_run.example_ids else
-        list(range(1, experiment.num_examples + 1))
-    )
+    example_ids = sorted(self.current_run.examples_to_evaluate(experiment))
     num_from_checkpoint, num_processed = 0, 0
     for example_id in example_ids:
       status = experiment.state.get_status(example_id)
@@ -220,7 +226,7 @@
     else:
       # A evaluation could be considered as done if it has processed all the
      # examples specified by `example_ids`.
-      assert progress.is_completed
+      assert progress.is_completed, progress
      parent_progress.increment_processed()
 
     if parent_progress.is_completed:
@@ -235,6 +241,8 @@
       example: Example
   ) -> None:
     """Called when an evaluation example is started."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=True)
     for plugin in self._all_plugins(experiment):
       plugin.on_example_start(self, experiment, example)
     experiment.info(f'Starting to evaluate example {example.id}.')
@@ -245,6 +253,8 @@
       example: Example
   ) -> None:
     """Called when an evaluation example is complete."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=False)
     if example.newly_processed:
       if example.error is None:
         experiment.progress.increment_processed()
@@ -256,7 +266,7 @@
         experiment.progress.increment_failed()
         experiment.error(
             (
-                f'Failed to evaluate example {example.id} in'
+                f'Failed to evaluate example {example.id} in '
                 f'{example.elapse:.2f} seconds.'
             ),
             error=example.error
@@ -316,7 +326,7 @@
         self._run(targets)
 
       self.on_run_complete()
-    except
+    except BaseException as e:  # pylint: disable=broad-except
       self.on_run_abort(e)
       raise e
     finally:
@@ -324,9 +334,10 @@
         self.background_run(cache.save)
 
       # Wait for the background tasks to finish.
-
-
-
+      if self.max_background_threads > 0:
+        with self._io_pool_lock:
+          self._io_pool, io_pool = None, self._io_pool
+        io_pool.shutdown(wait=True)
 
   @abc.abstractmethod
   def _run(self, evaluations: list[Evaluation]) -> None:
@@ -335,6 +346,7 @@
   def run_evaluation(self, evaluation: Evaluation) -> None:
     """Runs the evaluation."""
     try:
+      evaluation.setup()
       self.on_experiment_start(evaluation)
 
       per_evaluation_settings = {}
@@ -344,18 +356,14 @@
         per_evaluation_settings['cache'] = cache
 
       with lf.use_settings(**per_evaluation_settings):
-
-
-
-
-
-
-
-
-                id=example_id,
-                input=evaluation.example_input_by_id(example_id)
-            ) for example_id in self.current_run.example_ids
-        )
+        items = (
+            Example(
+                id=example_id,
+                input=evaluation.example_input_by_id(example_id)
+            ) for example_id in sorted(
+                self.current_run.examples_to_evaluate(evaluation)
+            )
+        )
         if self.current_run.shuffle_inputs:
           items = list(items)
           random.shuffle(items)
@@ -367,6 +375,8 @@
     except BaseException as e:  # pylint: disable=broad-except
       self.on_experiment_abort(evaluation, e)
       raise e
+    finally:
+      evaluation.teardown()
 
   @abc.abstractmethod
   def _evaluate_items(
@@ -394,121 +404,3 @@
     return in_memory.InMemory(
         self.current_run.output_path_for(experiment, 'cache.json')
     )
-
-
-class SequentialRunner(RunnerBase):
-  """Sequential runner.
-
-  Sequential runner runs all evaluations and their examples in sequence,
-  as well as the background tasks, it allows the developer to catch all
-  exceptions thrown from the background tasks, making it easier to debug.
-  """
-
-  NAME = 'sequential'
-
-  def background_run(
-      self, func: Callable[..., Any], *args: Any, **kwargs: Any
-  ) -> None:
-    """Runs the function with the IO pool."""
-    func(*args, **kwargs)
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the experiment in sequence."""
-    for e in evaluations:
-      self.run_evaluation(e)
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Runs the evaluation items in sequence."""
-    for item in items:
-      self.evaluate_item(evaluation, item)
-
-
-class DebugRunner(SequentialRunner):
-  """Debug runner."""
-
-  NAME = 'debug'
-
-  # Do not use the checkpointer for debug runner.
-  plugins = []
-
-  def _on_bound(self):
-    super()._on_bound()
-    if self.current_run.example_ids is None:
-      self.current_run.rebind(example_ids=[1], skip_notification=True)
-    self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
-
-  def _save_run_manifest(self) -> None:
-    """Do nothing to avoid overriden existing runs."""
-
-
-class ParallelRunner(RunnerBase):
-  """Parallel runner."""
-
-  NAME = 'parallel'
-
-  timeout: Annotated[
-      int | None,
-      'Timeout for each evaluation example.'
-  ] = None
-
-  concurrent_startup_delay: Annotated[
-      tuple[int, int] | None,
-      (
-          'A range of seconds to delay the initial evaluation of each thread '
-          'in the thread pool, helping to prevent a burst in LLM QPS at '
-          'startup. If set to None, no delay will be applied.'
-      )
-  ] = None
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the evaluations in parallel."""
-    def _run_group(evaluation_group: list[Evaluation]):
-      for e in evaluation_group:
-        self.run_evaluation(e)
-
-    # Run evaluations in parallel groupped by resource key.
-    groups: dict[str, list[Evaluation]] = collections.defaultdict(list)
-    for e in evaluations:
-      resource_ids = e.resource_ids()
-      if not resource_ids:
-        group_id = e.id
-      else:
-        # TODO(daiyip): support group that requires multiple resources.
-        group_id = resource_ids.pop()
-      groups[group_id].append(e)
-
-    for _, _, _ in lf.concurrent_map(
-        _run_group,
-        groups.values(),
-        max_workers=max(64, len(groups)),
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Override run items to run in parallel."""
-    if self.concurrent_startup_delay is not None:
-      thread_delayed = {}
-      def _evaluate_item(item: Example):
-        thread_id = threading.current_thread().ident
-        if thread_id not in thread_delayed:
-          thread_delayed[thread_id] = True
-          time.sleep(random.randint(*self.concurrent_startup_delay))
-        return self.evaluate_item(evaluation, item)
-    else:
-      def _evaluate_item(item: Example):
-        return self.evaluate_item(evaluation, item)
-
-    for _, _, _ in lf.concurrent_map(
-        _evaluate_item,
-        items,
-        max_workers=evaluation.max_workers,
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
```