langfun 0.1.2.dev202510230805__py3-none-any.whl → 0.1.2.dev202511270805__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry flags this version of langfun as possibly problematic.
- langfun/core/__init__.py +2 -0
- langfun/core/agentic/__init__.py +4 -1
- langfun/core/agentic/action.py +447 -29
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +149 -21
- langfun/core/async_support.py +32 -3
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +1 -0
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +9 -2
- langfun/core/data/conversion/gemini_test.py +12 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +47 -43
- langfun/core/eval/base_test.py +5 -5
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +1 -0
- langfun/core/eval/v2/checkpointing.py +64 -6
- langfun/core/eval/v2/checkpointing_test.py +9 -2
- langfun/core/eval/v2/eval_test_helper.py +103 -2
- langfun/core/eval/v2/evaluation.py +91 -16
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +50 -40
- langfun/core/eval/v2/example_test.py +16 -8
- langfun/core/eval/v2/experiment.py +74 -8
- langfun/core/eval/v2/experiment_test.py +19 -0
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +30 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking.py +12 -3
- langfun/core/eval/v2/progress_tracking_test.py +6 -1
- langfun/core/eval/v2/reporting.py +90 -71
- langfun/core/eval/v2/reporting_test.py +24 -6
- langfun/core/eval/v2/runners/__init__.py +30 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +59 -142
- langfun/core/eval/v2/runners/beam.py +341 -0
- langfun/core/eval/v2/runners/beam_test.py +131 -0
- langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
- langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +76 -0
- langfun/core/eval/v2/runners/parallel.py +100 -0
- langfun/core/eval/v2/runners/parallel_test.py +95 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +172 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +7 -5
- langfun/core/language_model.py +141 -21
- langfun/core/language_model_test.py +54 -3
- langfun/core/llms/__init__.py +9 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +55 -17
- langfun/core/llms/gemini_test.py +84 -0
- langfun/core/llms/google_genai.py +34 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +36 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +12 -1
- langfun/core/llms/vertexai.py +58 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/client.py +77 -22
- langfun/core/mcp/client_test.py +8 -35
- langfun/core/mcp/session.py +94 -29
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/tool.py +151 -22
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +19 -1
- langfun/core/modalities/mime.py +64 -3
- langfun/core/modalities/mime_test.py +11 -0
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +215 -142
- langfun/core/structured/querying_test.py +65 -29
- langfun/core/structured/schema/__init__.py +49 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +174 -49
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +8 -2
- langfun/env/base_environment.py +320 -128
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +92 -15
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +84 -361
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +1 -1
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +95 -98
- langfun/env/event_handlers/event_logger_test.py +21 -21
- langfun/env/event_handlers/metric_writer.py +225 -140
- langfun/env/event_handlers/metric_writer_test.py +23 -6
- langfun/env/interface.py +854 -40
- langfun/env/interface_test.py +112 -2
- langfun/env/load_balancers_test.py +23 -2
- langfun/env/test_utils.py +126 -84
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/METADATA +1 -1
- langfun-0.1.2.dev202511270805.dist-info/RECORD +215 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun/env/base_test.py +0 -1481
- langfun/env/event_handlers/base.py +0 -350
- langfun-0.1.2.dev202510230805.dist-info/RECORD +0 -195
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/reporting.py

@@ -32,8 +32,97 @@ _SUMMARY_FILE = 'summary.html'
 _EVALULATION_DETAIL_FILE = 'index.html'
 
 
+class ExampleHtmlGenerator(experiment_lib.Plugin):
+  """Plugin for generating HTML views for each evaluation example."""
+
+  def on_example_complete(
+      self, runner: Runner, experiment: Experiment, example: Example
+  ):
+    self._save_example_html(runner, experiment, example)
+
+  def _save_example_html(
+      self, runner: Runner, experiment: Experiment, example: Example
+  ) -> None:
+    """Saves the example in HTML format."""
+    current_run = runner.current_run
+    def _generate():
+      try:
+        with pg.timeit() as t:
+          html = example.to_html(
+              collapse_level=None,
+              enable_summary_tooltip=False,
+              extra_flags=dict(
+                  # For properly rendering the next link.
+                  num_examples=getattr(experiment, 'num_examples', None)
+              ),
+          )
+          html.save(
+              runner.current_run.output_path_for(
+                  experiment, f'{example.id}.html'
+              )
+          )
+        experiment.info(
+            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to generate \'{example.id}.html\'. '
+            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+        )
+        raise e
+
+    def _copy():
+      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
+      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
+
+      if src_file == dest_file:
+        return
+
+      if not pg.io.path_exists(src_file):
+        experiment.warning(
+            f'Skip copying \'{example.id}.html\' as '
+            f'{src_file!r} does not exist.'
+        )
+        return
+
+      try:
+        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
+          content = src.read()
+          with pg.io.open(dest_file, 'w') as dest:
+            dest.write(content)
+        experiment.info(
+            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
+        )
+        raise e
+
+    generate_example_html = current_run.generate_example_html
+    if (generate_example_html == 'all'
+        or (generate_example_html == 'new' and example.newly_processed)
+        or (isinstance(generate_example_html, list)
+            and example.id in generate_example_html)):
+      op = _generate
+    else:
+      op = _copy
+    runner.background_run(op)
+
+
 class HtmlReporter(experiment_lib.Plugin):
-  """Plugin for periodically generating HTML reports for the experiment."""
+  """Plugin for periodically generating HTML reports for the experiment.
+
+  The `HtmlReporter` plugin generates several HTML files during an experiment
+  run:
+  - A `summary.html` at the root of the run directory, summarizing all
+    evaluations in the experiment.
+  - An `index.html` for each leaf evaluation, detailing the evaluation
+    definition, metrics, and logs.
+
+  These reports are updated periodically in the background during the run,
+  allowing users to monitor progress in near real-time.
+  """
 
   summary_interval: Annotated[
       int,
@@ -127,7 +216,6 @@ class HtmlReporter(experiment_lib.Plugin):
   def on_example_complete(
       self, runner: Runner, experiment: Experiment, example: Example
   ):
-    self._save_example_html(runner, experiment, example)
     self._maybe_update_experiment_html(runner, experiment)
     self._maybe_update_summary(runner)
 
@@ -197,72 +285,3 @@ class HtmlReporter(experiment_lib.Plugin):
       runner.background_run(_save)
     else:
       _save()
-
-  def _save_example_html(
-      self, runner: Runner, experiment: Experiment, example: Example
-  ) -> None:
-    """Saves the example in HTML format."""
-    current_run = runner.current_run
-    def _generate():
-      try:
-        with pg.timeit() as t:
-          html = example.to_html(
-              collapse_level=None,
-              enable_summary_tooltip=False,
-              extra_flags=dict(
-                  # For properly rendering the next link.
-                  num_examples=getattr(experiment, 'num_examples', None)
-              ),
-          )
-          html.save(
-              runner.current_run.output_path_for(
-                  experiment, f'{example.id}.html'
-              )
-          )
-        experiment.info(
-            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
-        )
-      except BaseException as e:  # pylint: disable=broad-except
-        experiment.error(
-            f'Failed to generate \'{example.id}.html\'. '
-            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
-        )
-        raise e
-
-    def _copy():
-      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
-      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
-
-      if src_file == dest_file:
-        return
-
-      if not pg.io.path_exists(src_file):
-        experiment.warning(
-            f'Skip copying \'{example.id}.html\' as '
-            f'{src_file!r} does not exist.'
-        )
-        return
-
-      try:
-        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
-          content = src.read()
-          with pg.io.open(dest_file, 'w') as dest:
-            dest.write(content)
-        experiment.info(
-            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
-        )
-      except BaseException as e:  # pylint: disable=broad-except
-        experiment.error(
-            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
-        )
-        raise e
-
-    generate_example_html = current_run.generate_example_html
-    if (generate_example_html == 'all'
-        or (generate_example_html == 'new' and example.newly_processed)
-        or (isinstance(generate_example_html, list)
-            and example.id in generate_example_html)):
-      op = _generate
-    else:
-      op = _copy
-    runner.background_run(op)
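
Net effect of the reporting.py changes: per-example HTML generation moves out of `HtmlReporter` into the dedicated `ExampleHtmlGenerator` plugin, so runs that only need the summary and index pages no longer pay for per-example rendering. A minimal wiring sketch, mirroring the test changes below (`experiment` and `root_dir` stand in for whatever experiment object and output directory you already have):

    from langfun.core.eval.v2 import checkpointing, reporting

    # Both plugins must now be passed explicitly; HtmlReporter alone no
    # longer writes the per-example pages.
    run = experiment.run(
        root_dir,
        'new',
        plugins=[
            checkpointing.BulkCheckpointer('checkpoint.jsonl'),
            reporting.HtmlReporter(),          # summary.html + per-eval index.html
            reporting.ExampleHtmlGenerator(),  # <example.id>.html per example
        ],
    )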
langfun/core/eval/v2/reporting_test.py

@@ -29,7 +29,16 @@ class ReportingTest(unittest.TestCase):
     experiment = eval_test_helper.test_experiment()
     checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
     reporter = reporting.HtmlReporter()
-
+    self.assertFalse(reporter.is_per_example())
+
+    example_html_generator = reporting.ExampleHtmlGenerator()
+    self.assertTrue(example_html_generator.is_per_example())
+
+    run = experiment.run(
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator]
+    )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
     )
@@ -52,8 +61,10 @@ class ReportingTest(unittest.TestCase):
     root_dir = os.path.join(tempfile.mkdtemp(), 'test_reporting2')
     experiment = eval_test_helper.test_experiment()
     run = experiment.run(
-        root_dir,
-
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator],
+        warm_start_from=run.output_root,
     )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
@@ -105,7 +116,12 @@ class ReportingTest(unittest.TestCase):
               .test_experiment_with_example_html_generation_error())
     checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
     reporter = reporting.HtmlReporter()
-
+    example_html_generator = reporting.ExampleHtmlGenerator()
+    run = experiment.run(
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator]
+    )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
     )
@@ -132,8 +148,10 @@ class ReportingTest(unittest.TestCase):
     experiment = (eval_test_helper
                   .test_experiment_with_example_html_generation_error())
     run = experiment.run(
-        root_dir,
-
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator],
+        warm_start_from=run.output_root,
     )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
langfun/core/eval/v2/runners/__init__.py (new file)

@@ -0,0 +1,30 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Langfun evaluation runners."""
+
+# pylint: disable=g-importing-member
+from langfun.core.eval.v2.runners.base import RunnerBase
+from langfun.core.eval.v2.runners.beam import BeamRunner
+from langfun.core.eval.v2.runners.debug import DebugRunner
+from langfun.core.eval.v2.runners.parallel import ParallelRunner
+from langfun.core.eval.v2.runners.sequential import SequentialRunner
+# pylint: enable=g-importing-member
+
+__all__ = [
+    'RunnerBase',
+    'BeamRunner',
+    'DebugRunner',
+    'ParallelRunner',
+    'SequentialRunner',
+]
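
Because the new package re-exports every runner symbol in `__all__`, imports that went through the `runners` module path keep resolving after the split; only deep references to the deleted flat `runners.py` file change meaning. A quick sketch of both forms, which should behave identically after this release:

    # Attribute access through the package, as before the split:
    from langfun.core.eval.v2 import runners
    runner_cls = runners.ParallelRunner

    # Direct import, now served by the package-level re-export:
    from langfun.core.eval.v2.runners import SequentialRunner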
langfun/core/eval/v2/runners/base.py (renamed from langfun/core/eval/v2/runners.py)

@@ -11,15 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+"""Base experiment runner."""
+
 import abc
-import collections
 import concurrent.futures
 import random
 import threading
-import time
 import traceback
-from typing import Any, Annotated, Callable, Iterator
+from typing import Any, Annotated, Callable, Iterator, Literal
 
 from langfun import core as lf
 from langfun.core.eval.v2 import checkpointing
@@ -42,31 +41,55 @@ _RUN_MANIFEST = 'run.json'
 
 
 class RunnerBase(Runner):
-  """
+  """Base class for runners with plugin support and IO pooling.
+
+  `RunnerBase` provides the basic runner functionalities such as plugin
+  integration for checkpointing, reporting and progress tracking.
+  It also manages a thread pool for background IO operations.
+  Subclasses should implement `_run` and `_evaluate_items` for different
+  execution strategies.
+  """
 
-
-
+  progress_tracker: Annotated[
+      Literal['tqdm', 'html', 'auto', None],
       (
-          'If
-          '
-          '
+          'If `tqdm`, force using tqdm for progress update. '
+          'If `html`, force using html for progress update. '
+          'If `auto`, determine it automatically based on the running '
+          'environment (console vs. notebook)'
+          'If `none`, disable progress update.'
       )
-  ] =
+  ] = 'auto'
 
   plugins = [
       checkpointing.BulkCheckpointer(),
       reporting.HtmlReporter(),
   ]
 
+  max_background_threads: Annotated[
+      int,
+      'Max number of background threads for IO operations.'
+  ] = 128
+
   def _on_bound(self):
     super()._on_bound()
 
     # Install the tqdm plugin if needed.
-
-
+    if self.progress_tracker is not None:
+      with pg.notify_on_change(False):
+        self.plugins.append(
+            progress_tracking.progress_tracker(self.progress_tracker)
+        )
+
+    if self.max_background_threads > 0:
+      self._io_pool_lock = threading.Lock()
+      self._io_pool = concurrent.futures.ThreadPoolExecutor(
+          max_workers=self.max_background_threads
+      )
+    else:
+      self._io_pool_lock = None
+      self._io_pool = None
 
-    self._io_pool_lock = threading.Lock()
-    self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
     # TODO(daiyip): render background errors.
     self._background_last_error = None
 
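
`progress_tracker` and `max_background_threads` are symbolic fields on `RunnerBase`, so every concrete runner inherits them. A hedged sketch of overriding them in a subclass (the class name is hypothetical, and this assumes the usual pyglove pattern of overriding field defaults via class attributes, as langfun runners do elsewhere in this file):

    from langfun.core.eval.v2 import runners

    class QuietRunner(runners.SequentialRunner):
      # Hypothetical subclass: no progress UI, and with zero background
      # threads background_run() executes inline, so IO errors surface
      # synchronously, which is convenient in unit tests.
      progress_tracker = None
      max_background_threads = 0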
@@ -78,9 +101,12 @@ class RunnerBase(Runner):
       except Exception as e:  # pylint: disable=broad-except
         self._background_last_error = e
 
-
-
-    self._io_pool
+    if self.max_background_threads > 0:
+      with self._io_pool_lock:
+        if self._io_pool is not None:
+          self._io_pool.submit(_background_run, *args, **kwargs)
+    else:
+      _background_run(*args, **kwargs)
 
   def _all_plugins(self, experiment: Experiment) -> Iterator[Plugin]:
     """Returns all plugins for the experiment."""
@@ -139,6 +165,7 @@ class RunnerBase(Runner):
       plugin.on_experiment_start(self, experiment)
 
     if experiment.is_leaf:
+      pg.io.mkdirs(self.current_run.output_dir(experiment))
       experiment.info(
           f'Starting evaluation {experiment.id!r} with '
           f'{num_examples_to_evaluate} examples to evaluate.'
@@ -220,7 +247,7 @@ class RunnerBase(Runner):
     else:
       # A evaluation could be considered as done if it has processed all the
       # examples specified by `example_ids`.
-      assert progress.is_completed
+      assert progress.is_completed, progress
       parent_progress.increment_processed()
 
       if parent_progress.is_completed:
@@ -235,6 +262,8 @@ class RunnerBase(Runner):
       example: Example
   ) -> None:
     """Called when an evaluation example is started."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=True)
    for plugin in self._all_plugins(experiment):
      plugin.on_example_start(self, experiment, example)
    experiment.info(f'Starting to evaluate example {example.id}.')
@@ -245,6 +274,8 @@ class RunnerBase(Runner):
       example: Example
   ) -> None:
     """Called when an evaluation example is complete."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=False)
    if example.newly_processed:
      if example.error is None:
        experiment.progress.increment_processed()
@@ -256,7 +287,7 @@ class RunnerBase(Runner):
         experiment.progress.increment_failed()
         experiment.error(
             (
-                f'Failed to evaluate example {example.id} in'
+                f'Failed to evaluate example {example.id} in '
                 f'{example.elapse:.2f} seconds.'
             ),
             error=example.error
@@ -316,7 +347,7 @@ class RunnerBase(Runner):
       self._run(targets)
 
       self.on_run_complete()
-    except
+    except BaseException as e:  # pylint: disable=broad-except
       self.on_run_abort(e)
       raise e
     finally:
@@ -324,9 +355,10 @@ class RunnerBase(Runner):
         self.background_run(cache.save)
 
       # Wait for the background tasks to finish.
-
-
-
+      if self.max_background_threads > 0:
+        with self._io_pool_lock:
+          self._io_pool, io_pool = None, self._io_pool
+        io_pool.shutdown(wait=True)
 
   @abc.abstractmethod
   def _run(self, evaluations: list[Evaluation]) -> None:
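
The shutdown sequence above detaches the pool under the lock and only then blocks on `shutdown(wait=True)`, so a racing `background_run` either submits before the swap or finds the pool set to `None` and drops the task; nothing can submit to a pool that is already draining. The same idiom in isolation, as a generic sketch (names are illustrative, not langfun APIs):

    import concurrent.futures
    import threading

    _lock = threading.Lock()
    _pool = concurrent.futures.ThreadPoolExecutor(max_workers=4)

    def submit_safely(fn, *args):
      # Mirrors background_run: submit only while the pool is alive;
      # tasks arriving after retirement are silently dropped.
      with _lock:
        if _pool is not None:
          _pool.submit(fn, *args)

    def retire_pool():
      # Mirrors the hunk above: detach under the lock, drain outside it,
      # so submitters never block on a long shutdown.
      global _pool
      with _lock:
        _pool, old = None, _pool
      old.shutdown(wait=True)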
@@ -335,6 +367,7 @@ class RunnerBase(Runner):
   def run_evaluation(self, evaluation: Evaluation) -> None:
     """Runs the evaluation."""
     try:
+      evaluation.setup()
       self.on_experiment_start(evaluation)
 
       per_evaluation_settings = {}
@@ -367,6 +400,8 @@ class RunnerBase(Runner):
     except BaseException as e:  # pylint: disable=broad-except
       self.on_experiment_abort(evaluation, e)
       raise e
+    finally:
+      evaluation.teardown()
 
   @abc.abstractmethod
   def _evaluate_items(
@@ -394,121 +429,3 @@ class RunnerBase(Runner):
     return in_memory.InMemory(
         self.current_run.output_path_for(experiment, 'cache.json')
     )
-
-
-class SequentialRunner(RunnerBase):
-  """Sequential runner.
-
-  Sequential runner runs all evaluations and their examples in sequence,
-  as well as the background tasks, it allows the developer to catch all
-  exceptions thrown from the background tasks, making it easier to debug.
-  """
-
-  NAME = 'sequential'
-
-  def background_run(
-      self, func: Callable[..., Any], *args: Any, **kwargs: Any
-  ) -> None:
-    """Runs the function with the IO pool."""
-    func(*args, **kwargs)
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the experiment in sequence."""
-    for e in evaluations:
-      self.run_evaluation(e)
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Runs the evaluation items in sequence."""
-    for item in items:
-      self.evaluate_item(evaluation, item)
-
-
-class DebugRunner(SequentialRunner):
-  """Debug runner."""
-
-  NAME = 'debug'
-
-  # Do not use the checkpointer for debug runner.
-  plugins = []
-
-  def _on_bound(self):
-    super()._on_bound()
-    if self.current_run.example_ids is None:
-      self.current_run.rebind(example_ids=[1], skip_notification=True)
-    self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
-
-  def _save_run_manifest(self) -> None:
-    """Do nothing to avoid overriden existing runs."""
-
-
-class ParallelRunner(RunnerBase):
-  """Parallel runner."""
-
-  NAME = 'parallel'
-
-  timeout: Annotated[
-      int | None,
-      'Timeout for each evaluation example.'
-  ] = None
-
-  concurrent_startup_delay: Annotated[
-      tuple[int, int] | None,
-      (
-          'A range of seconds to delay the initial evaluation of each thread '
-          'in the thread pool, helping to prevent a burst in LLM QPS at '
-          'startup. If set to None, no delay will be applied.'
-      )
-  ] = None
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the evaluations in parallel."""
-    def _run_group(evaluation_group: list[Evaluation]):
-      for e in evaluation_group:
-        self.run_evaluation(e)
-
-    # Run evaluations in parallel groupped by resource key.
-    groups: dict[str, list[Evaluation]] = collections.defaultdict(list)
-    for e in evaluations:
-      resource_ids = e.resource_ids()
-      if not resource_ids:
-        group_id = e.id
-      else:
-        # TODO(daiyip): support group that requires multiple resources.
-        group_id = resource_ids.pop()
-      groups[group_id].append(e)
-
-    for _, _, _ in lf.concurrent_map(
-        _run_group,
-        groups.values(),
-        max_workers=max(64, len(groups)),
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Override run items to run in parallel."""
-    if self.concurrent_startup_delay is not None:
-      thread_delayed = {}
-      def _evaluate_item(item: Example):
-        thread_id = threading.current_thread().ident
-        if thread_id not in thread_delayed:
-          thread_delayed[thread_id] = True
-          time.sleep(random.randint(*self.concurrent_startup_delay))
-        return self.evaluate_item(evaluation, item)
-    else:
-      def _evaluate_item(item: Example):
-        return self.evaluate_item(evaluation, item)
-
-    for _, _, _ in lf.concurrent_map(
-        _evaluate_item,
-        items,
-        max_workers=evaluation.max_workers,
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass