langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512150805__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- langfun/__init__.py +1 -1
- langfun/core/__init__.py +7 -1
- langfun/core/agentic/__init__.py +8 -1
- langfun/core/agentic/action.py +740 -112
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +189 -24
- langfun/core/async_support.py +104 -5
- langfun/core/async_support_test.py +23 -0
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +9 -2
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +11 -2
- langfun/core/data/conversion/gemini_test.py +48 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +48 -44
- langfun/core/eval/base_test.py +5 -5
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +3 -0
- langfun/core/eval/v2/checkpointing.py +148 -46
- langfun/core/eval/v2/checkpointing_test.py +9 -2
- langfun/core/eval/v2/config_saver.py +37 -0
- langfun/core/eval/v2/config_saver_test.py +36 -0
- langfun/core/eval/v2/eval_test_helper.py +104 -3
- langfun/core/eval/v2/evaluation.py +102 -19
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +50 -40
- langfun/core/eval/v2/example_test.py +16 -8
- langfun/core/eval/v2/experiment.py +95 -20
- langfun/core/eval/v2/experiment_test.py +19 -0
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +31 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking.py +13 -5
- langfun/core/eval/v2/progress_tracking_test.py +9 -1
- langfun/core/eval/v2/reporting.py +88 -71
- langfun/core/eval/v2/reporting_test.py +24 -6
- langfun/core/eval/v2/runners/__init__.py +30 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +73 -180
- langfun/core/eval/v2/runners/beam.py +354 -0
- langfun/core/eval/v2/runners/beam_test.py +153 -0
- langfun/core/eval/v2/runners/ckpt_monitor.py +350 -0
- langfun/core/eval/v2/runners/ckpt_monitor_test.py +213 -0
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +76 -0
- langfun/core/eval/v2/runners/parallel.py +243 -0
- langfun/core/eval/v2/runners/parallel_test.py +182 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +169 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +7 -5
- langfun/core/language_model.py +189 -36
- langfun/core/language_model_test.py +54 -3
- langfun/core/llms/__init__.py +14 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +90 -12
- langfun/core/llms/gemini_test.py +110 -0
- langfun/core/llms/google_genai.py +52 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +120 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +16 -1
- langfun/core/llms/vertexai.py +78 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/__init__.py +10 -0
- langfun/core/mcp/client.py +177 -0
- langfun/core/mcp/client_test.py +71 -0
- langfun/core/mcp/session.py +241 -0
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/testing/simple_mcp_client.py +33 -0
- langfun/core/mcp/testing/simple_mcp_server.py +33 -0
- langfun/core/mcp/tool.py +254 -0
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +73 -3
- langfun/core/modalities/image_test.py +116 -0
- langfun/core/modalities/mime.py +78 -4
- langfun/core/modalities/mime_test.py +59 -0
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +230 -154
- langfun/core/structured/querying_test.py +69 -33
- langfun/core/structured/schema/__init__.py +49 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +175 -50
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +43 -0
- langfun/env/base_environment.py +827 -0
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +304 -0
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +842 -0
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +14 -0
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +472 -0
- langfun/env/event_handlers/event_logger_test.py +304 -0
- langfun/env/event_handlers/metric_writer.py +726 -0
- langfun/env/event_handlers/metric_writer_test.py +214 -0
- langfun/env/interface.py +1640 -0
- langfun/env/interface_test.py +153 -0
- langfun/env/load_balancers.py +59 -0
- langfun/env/load_balancers_test.py +141 -0
- langfun/env/test_utils.py +507 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/METADATA +7 -3
- langfun-0.1.2.dev202512150805.dist-info/RECORD +217 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/top_level.txt +0 -0
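
The listing above shows several structural moves: `runners.py` becomes a `runners/` package, `structured/schema.py` becomes a `schema/` package, and new `langfun/core/mcp` and `langfun/env` packages appear. The import sketch below is illustrative only: the module paths are inferred from the file listing, and what each package re-exports from its `__init__.py` is not visible in this diff, so only modules are imported.

```python
# Illustrative only; module paths taken from the file listing above.
from langfun.core.eval.v2 import runners          # was runners.py, now a package
from langfun.core.eval.v2.runners import base     # runners/base.py (diffed below)
from langfun.core.structured import schema        # was schema.py, now a package
from langfun.core.structured.schema import python  # schema/python.py
from langfun.core import mcp                       # new: MCP client/session/tool support
import langfun.env                                 # new: sandboxed environment package
```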
langfun/core/eval/v2/{runners.py → runners/base.py}

@@ -11,18 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+"""Base experiment runner."""
+
 import abc
-import collections
 import concurrent.futures
 import random
 import threading
-import time
 import traceback
-from typing import Any, Annotated, Callable, Iterator
+from typing import Any, Annotated, Callable, Iterator, Literal
 
 from langfun import core as lf
 from langfun.core.eval.v2 import checkpointing
+from langfun.core.eval.v2 import config_saver
 from langfun.core.eval.v2 import evaluation as evaluation_lib
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import experiment as experiment_lib
@@ -38,35 +38,57 @@ Experiment = experiment_lib.Experiment
 Plugin = experiment_lib.Plugin
 
 
-_RUN_MANIFEST = 'run.json'
-
-
 class RunnerBase(Runner):
-  """
+  """Base class for runners with plugin support and IO pooling.
+
+  `RunnerBase` provides the basic runner functionalities such as plugin
+  integration for checkpointing, reporting and progress tracking.
+  It also manages a thread pool for background IO operations.
+  Subclasses should implement `_run` and `_evaluate_items` for different
+  execution strategies.
+  """
 
-
-
+  progress_tracker: Annotated[
+      Literal['tqdm', 'html', 'auto', None],
       (
-          'If
-          '
-          '
+          'If `tqdm`, force using tqdm for progress update. '
+          'If `html`, force using html for progress update. '
+          'If `auto`, determine it automatically based on the running '
+          'environment (console vs. notebook)'
+          'If `none`, disable progress update.'
       )
-  ] =
+  ] = 'auto'
 
   plugins = [
       checkpointing.BulkCheckpointer(),
       reporting.HtmlReporter(),
+      config_saver.RunConfigSaver(),
   ]
 
+  max_background_threads: Annotated[
+      int,
+      'Max number of background threads for IO operations.'
+  ] = 128
+
   def _on_bound(self):
     super()._on_bound()
 
     # Install the tqdm plugin if needed.
-
-
+    if self.progress_tracker is not None:
+      with pg.notify_on_change(False):
+        self.plugins.append(
+            progress_tracking.progress_tracker(self.progress_tracker)
+        )
+
+    if self.max_background_threads > 0:
+      self._io_pool_lock = threading.Lock()
+      self._io_pool = concurrent.futures.ThreadPoolExecutor(
+          max_workers=self.max_background_threads
+      )
+    else:
+      self._io_pool_lock = None
+      self._io_pool = None
 
-    self._io_pool_lock = threading.Lock()
-    self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
     # TODO(daiyip): render background errors.
     self._background_last_error = None
 
@@ -78,9 +100,12 @@ class RunnerBase(Runner):
       except Exception as e:  # pylint: disable=broad-except
         self._background_last_error = e
 
-
-
-    self._io_pool
+    if self.max_background_threads > 0:
+      with self._io_pool_lock:
+        if self._io_pool is not None:
+          self._io_pool.submit(_background_run, *args, **kwargs)
+    else:
+      _background_run(*args, **kwargs)
 
   def _all_plugins(self, experiment: Experiment) -> Iterator[Plugin]:
     """Returns all plugins for the experiment."""
@@ -89,24 +114,8 @@ class RunnerBase(Runner):
     for plugin in experiment.plugins:
       yield plugin
 
-  #
-  # IO operations for saving running files.
-  #
-
-  def _save_run_manifest(self) -> None:
-    def _save():
-      pg.symbolic.deref(self.current_run.clone(), recursive=True).save(
-          self.current_run.output_path_for(
-              self.current_run.experiment, _RUN_MANIFEST
-          ),
-          hide_default_values=True
-      )
-    self.background_run(_save)
-
   def on_run_start(self) -> None:
     """Called when a runner is started."""
-    self._save_run_manifest()
-
     for plugin in self._all_plugins(self.current_run.experiment):
       plugin.on_run_start(self, self.current_run.experiment)
 
@@ -126,11 +135,11 @@ class RunnerBase(Runner):
     num_examples_to_evaluate = 0
     if experiment.is_leaf:
       assert isinstance(experiment, Evaluation)
-      num_examples_to_evaluate = (
-
-          if self.current_run.example_ids else experiment.num_examples
+      num_examples_to_evaluate = len(
+          self.current_run.examples_to_evaluate(experiment)
       )
       experiment.progress.start(total=num_examples_to_evaluate)
+      pg.io.mkdirs(self.current_run.output_dir(experiment))
     else:
       experiment.progress.start(total=len(experiment.leaf_nodes))
 
@@ -139,6 +148,7 @@ class RunnerBase(Runner):
       plugin.on_experiment_start(self, experiment)
 
     if experiment.is_leaf:
+      pg.io.mkdirs(self.current_run.output_dir(experiment))
       experiment.info(
           f'Starting evaluation {experiment.id!r} with '
           f'{num_examples_to_evaluate} examples to evaluate.'
@@ -180,10 +190,7 @@ class RunnerBase(Runner):
     self._log_experiment_completion(experiment)
 
   def _log_experiment_completion(self, experiment: Experiment):
-    example_ids = (
-        self.current_run.example_ids if self.current_run.example_ids else
-        list(range(1, experiment.num_examples + 1))
-    )
+    example_ids = sorted(self.current_run.examples_to_evaluate(experiment))
     num_from_checkpoint, num_processed = 0, 0
     for example_id in example_ids:
       status = experiment.state.get_status(example_id)
@@ -220,7 +227,7 @@ class RunnerBase(Runner):
       else:
         # A evaluation could be considered as done if it has processed all the
         # examples specified by `example_ids`.
-        assert progress.is_completed
+        assert progress.is_completed, progress
         parent_progress.increment_processed()
 
       if parent_progress.is_completed:
@@ -235,6 +242,8 @@ class RunnerBase(Runner):
       example: Example
   ) -> None:
     """Called when an evaluation example is started."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=True)
     for plugin in self._all_plugins(experiment):
       plugin.on_example_start(self, experiment, example)
     experiment.info(f'Starting to evaluate example {example.id}.')
@@ -245,6 +254,8 @@ class RunnerBase(Runner):
       example: Example
   ) -> None:
     """Called when an evaluation example is complete."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=False)
     if example.newly_processed:
       if example.error is None:
         experiment.progress.increment_processed()
@@ -256,7 +267,7 @@ class RunnerBase(Runner):
        experiment.progress.increment_failed()
        experiment.error(
            (
-                f'Failed to evaluate example {example.id} in'
+                f'Failed to evaluate example {example.id} in '
                f'{example.elapse:.2f} seconds.'
            ),
            error=example.error
@@ -316,7 +327,7 @@ class RunnerBase(Runner):
       self._run(targets)
 
       self.on_run_complete()
-    except
+    except BaseException as e:  # pylint: disable=broad-except
       self.on_run_abort(e)
       raise e
     finally:
@@ -324,9 +335,10 @@ class RunnerBase(Runner):
          self.background_run(cache.save)
 
      # Wait for the background tasks to finish.
-
-
-
+      if self.max_background_threads > 0:
+        with self._io_pool_lock:
+          self._io_pool, io_pool = None, self._io_pool
+        io_pool.shutdown(wait=True)
 
   @abc.abstractmethod
   def _run(self, evaluations: list[Evaluation]) -> None:
@@ -335,6 +347,7 @@ class RunnerBase(Runner):
   def run_evaluation(self, evaluation: Evaluation) -> None:
     """Runs the evaluation."""
    try:
+      evaluation.setup()
      self.on_experiment_start(evaluation)
 
      per_evaluation_settings = {}
@@ -344,18 +357,14 @@ class RunnerBase(Runner):
        per_evaluation_settings['cache'] = cache
 
      with lf.use_settings(**per_evaluation_settings):
-
-
-
-
-
-
-
-
-              id=example_id,
-              input=evaluation.example_input_by_id(example_id)
-          ) for example_id in self.current_run.example_ids
-      )
+        items = (
+            Example(
+                id=example_id,
+                input=evaluation.example_input_by_id(example_id)
+            ) for example_id in sorted(
+                self.current_run.examples_to_evaluate(evaluation)
+            )
+        )
        if self.current_run.shuffle_inputs:
          items = list(items)
          random.shuffle(items)
@@ -367,6 +376,8 @@ class RunnerBase(Runner):
    except BaseException as e:  # pylint: disable=broad-except
      self.on_experiment_abort(evaluation, e)
      raise e
+    finally:
+      evaluation.teardown()
 
   @abc.abstractmethod
   def _evaluate_items(
@@ -394,121 +405,3 @@ class RunnerBase(Runner):
     return in_memory.InMemory(
         self.current_run.output_path_for(experiment, 'cache.json')
     )
-
-
-class SequentialRunner(RunnerBase):
-  """Sequential runner.
-
-  Sequential runner runs all evaluations and their examples in sequence,
-  as well as the background tasks, it allows the developer to catch all
-  exceptions thrown from the background tasks, making it easier to debug.
-  """
-
-  NAME = 'sequential'
-
-  def background_run(
-      self, func: Callable[..., Any], *args: Any, **kwargs: Any
-  ) -> None:
-    """Runs the function with the IO pool."""
-    func(*args, **kwargs)
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the experiment in sequence."""
-    for e in evaluations:
-      self.run_evaluation(e)
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Runs the evaluation items in sequence."""
-    for item in items:
-      self.evaluate_item(evaluation, item)
-
-
-class DebugRunner(SequentialRunner):
-  """Debug runner."""
-
-  NAME = 'debug'
-
-  # Do not use the checkpointer for debug runner.
-  plugins = []
-
-  def _on_bound(self):
-    super()._on_bound()
-    if self.current_run.example_ids is None:
-      self.current_run.rebind(example_ids=[1], skip_notification=True)
-    self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
-
-  def _save_run_manifest(self) -> None:
-    """Do nothing to avoid overriden existing runs."""
-
-
-class ParallelRunner(RunnerBase):
-  """Parallel runner."""
-
-  NAME = 'parallel'
-
-  timeout: Annotated[
-      int | None,
-      'Timeout for each evaluation example.'
-  ] = None
-
-  concurrent_startup_delay: Annotated[
-      tuple[int, int] | None,
-      (
-          'A range of seconds to delay the initial evaluation of each thread '
-          'in the thread pool, helping to prevent a burst in LLM QPS at '
-          'startup. If set to None, no delay will be applied.'
-      )
-  ] = None
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the evaluations in parallel."""
-    def _run_group(evaluation_group: list[Evaluation]):
-      for e in evaluation_group:
-        self.run_evaluation(e)
-
-    # Run evaluations in parallel groupped by resource key.
-    groups: dict[str, list[Evaluation]] = collections.defaultdict(list)
-    for e in evaluations:
-      resource_ids = e.resource_ids()
-      if not resource_ids:
-        group_id = e.id
-      else:
-        # TODO(daiyip): support group that requires multiple resources.
-        group_id = resource_ids.pop()
-      groups[group_id].append(e)
-
-    for _, _, _ in lf.concurrent_map(
-        _run_group,
-        groups.values(),
-        max_workers=max(64, len(groups)),
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Override run items to run in parallel."""
-    if self.concurrent_startup_delay is not None:
-      thread_delayed = {}
-      def _evaluate_item(item: Example):
-        thread_id = threading.current_thread().ident
-        if thread_id not in thread_delayed:
-          thread_delayed[thread_id] = True
-          time.sleep(random.randint(*self.concurrent_startup_delay))
-        return self.evaluate_item(evaluation, item)
-    else:
-      def _evaluate_item(item: Example):
-        return self.evaluate_item(evaluation, item)
-
-    for _, _, _ in lf.concurrent_map(
-        _evaluate_item,
-        items,
-        max_workers=evaluation.max_workers,
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass