langfun 0.1.2.dev202510230805__py3-none-any.whl → 0.1.2.dev202511160804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/core/__init__.py +1 -0
- langfun/core/agentic/action.py +107 -12
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +25 -0
- langfun/core/async_support.py +32 -3
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +1 -0
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +9 -2
- langfun/core/data/conversion/gemini_test.py +12 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +47 -43
- langfun/core/eval/base_test.py +4 -4
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +1 -0
- langfun/core/eval/v2/checkpointing.py +39 -5
- langfun/core/eval/v2/checkpointing_test.py +1 -1
- langfun/core/eval/v2/eval_test_helper.py +96 -0
- langfun/core/eval/v2/evaluation.py +87 -15
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +45 -39
- langfun/core/eval/v2/example_test.py +3 -3
- langfun/core/eval/v2/experiment.py +51 -8
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +30 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking_test.py +3 -0
- langfun/core/eval/v2/reporting.py +90 -71
- langfun/core/eval/v2/reporting_test.py +20 -6
- langfun/core/eval/v2/runners/__init__.py +26 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +22 -124
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +79 -0
- langfun/core/eval/v2/runners/parallel.py +100 -0
- langfun/core/eval/v2/runners/parallel_test.py +98 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +175 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +6 -4
- langfun/core/language_model.py +103 -16
- langfun/core/language_model_test.py +9 -3
- langfun/core/llms/__init__.py +7 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +14 -9
- langfun/core/llms/google_genai.py +29 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +36 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +12 -1
- langfun/core/llms/vertexai.py +51 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/client.py +77 -22
- langfun/core/mcp/client_test.py +8 -35
- langfun/core/mcp/session.py +94 -29
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/tool.py +151 -22
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +19 -1
- langfun/core/modalities/mime.py +62 -3
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +215 -142
- langfun/core/structured/querying_test.py +65 -29
- langfun/core/structured/schema/__init__.py +48 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +174 -49
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +8 -2
- langfun/env/base_environment.py +320 -128
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +92 -15
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +84 -361
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +1 -1
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +95 -98
- langfun/env/event_handlers/event_logger_test.py +21 -21
- langfun/env/event_handlers/metric_writer.py +225 -140
- langfun/env/event_handlers/metric_writer_test.py +23 -6
- langfun/env/interface.py +854 -40
- langfun/env/interface_test.py +112 -2
- langfun/env/load_balancers_test.py +23 -2
- langfun/env/test_utils.py +126 -84
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/METADATA +1 -1
- langfun-0.1.2.dev202511160804.dist-info/RECORD +211 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun/env/base_test.py +0 -1481
- langfun/env/event_handlers/base.py +0 -350
- langfun-0.1.2.dev202510230805.dist-info/RECORD +0 -195
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/reporting.py

@@ -32,8 +32,97 @@ _SUMMARY_FILE = 'summary.html'
 _EVALULATION_DETAIL_FILE = 'index.html'
 
 
+class ExampleHtmlGenerator(experiment_lib.Plugin):
+  """Plugin for generating HTML views for each evaluation example."""
+
+  def on_example_complete(
+      self, runner: Runner, experiment: Experiment, example: Example
+  ):
+    self._save_example_html(runner, experiment, example)
+
+  def _save_example_html(
+      self, runner: Runner, experiment: Experiment, example: Example
+  ) -> None:
+    """Saves the example in HTML format."""
+    current_run = runner.current_run
+    def _generate():
+      try:
+        with pg.timeit() as t:
+          html = example.to_html(
+              collapse_level=None,
+              enable_summary_tooltip=False,
+              extra_flags=dict(
+                  # For properly rendering the next link.
+                  num_examples=getattr(experiment, 'num_examples', None)
+              ),
+          )
+          html.save(
+              runner.current_run.output_path_for(
+                  experiment, f'{example.id}.html'
+              )
+          )
+        experiment.info(
+            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to generate \'{example.id}.html\'. '
+            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+        )
+        raise e
+
+    def _copy():
+      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
+      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
+
+      if src_file == dest_file:
+        return
+
+      if not pg.io.path_exists(src_file):
+        experiment.warning(
+            f'Skip copying \'{example.id}.html\' as '
+            f'{src_file!r} does not exist.'
+        )
+        return
+
+      try:
+        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
+          content = src.read()
+          with pg.io.open(dest_file, 'w') as dest:
+            dest.write(content)
+        experiment.info(
+            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
+        )
+        raise e
+
+    generate_example_html = current_run.generate_example_html
+    if (generate_example_html == 'all'
+        or (generate_example_html == 'new' and example.newly_processed)
+        or (isinstance(generate_example_html, list)
+            and example.id in generate_example_html)):
+      op = _generate
+    else:
+      op = _copy
+    runner.background_run(op)
+
+
 class HtmlReporter(experiment_lib.Plugin):
-  """Plugin for periodically generating HTML reports for the experiment."""
+  """Plugin for periodically generating HTML reports for the experiment.
+
+  The `HtmlReporter` plugin generates several HTML files during an experiment
+  run:
+  - A `summary.html` at the root of the run directory, summarizing all
+    evaluations in the experiment.
+  - An `index.html` for each leaf evaluation, detailing the evaluation
+    definition, metrics, and logs.
+
+  These reports are updated periodically in the background during the run,
+  allowing users to monitor progress in near real-time.
+  """
 
   summary_interval: Annotated[
       int,
@@ -127,7 +216,6 @@ class HtmlReporter(experiment_lib.Plugin):
   def on_example_complete(
       self, runner: Runner, experiment: Experiment, example: Example
   ):
-    self._save_example_html(runner, experiment, example)
     self._maybe_update_experiment_html(runner, experiment)
     self._maybe_update_summary(runner)
 
@@ -197,72 +285,3 @@ class HtmlReporter(experiment_lib.Plugin):
       runner.background_run(_save)
     else:
       _save()
-
-  def _save_example_html(
-      self, runner: Runner, experiment: Experiment, example: Example
-  ) -> None:
-    """Saves the example in HTML format."""
-    current_run = runner.current_run
-    def _generate():
-      try:
-        with pg.timeit() as t:
-          html = example.to_html(
-              collapse_level=None,
-              enable_summary_tooltip=False,
-              extra_flags=dict(
-                  # For properly rendering the next link.
-                  num_examples=getattr(experiment, 'num_examples', None)
-              ),
-          )
-          html.save(
-              runner.current_run.output_path_for(
-                  experiment, f'{example.id}.html'
-              )
-          )
-        experiment.info(
-            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
-        )
-      except BaseException as e:  # pylint: disable=broad-except
-        experiment.error(
-            f'Failed to generate \'{example.id}.html\'. '
-            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
-        )
-        raise e
-
-    def _copy():
-      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
-      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
-
-      if src_file == dest_file:
-        return
-
-      if not pg.io.path_exists(src_file):
-        experiment.warning(
-            f'Skip copying \'{example.id}.html\' as '
-            f'{src_file!r} does not exist.'
-        )
-        return
-
-      try:
-        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
-          content = src.read()
-          with pg.io.open(dest_file, 'w') as dest:
-            dest.write(content)
-        experiment.info(
-            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
-        )
-      except BaseException as e:  # pylint: disable=broad-except
-        experiment.error(
-            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
-        )
-        raise e
-
-    generate_example_html = current_run.generate_example_html
-    if (generate_example_html == 'all'
-        or (generate_example_html == 'new' and example.newly_processed)
-        or (isinstance(generate_example_html, list)
-            and example.id in generate_example_html)):
-      op = _generate
-    else:
-      op = _copy
-    runner.background_run(op)
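Per-example HTML generation now lives in its own plugin rather than inside HtmlReporter. The following sketch mirrors the updated tests below: `HtmlReporter` keeps producing `summary.html` and per-evaluation `index.html`, while `ExampleHtmlGenerator` must be added explicitly to get the per-example `<id>.html` pages. `eval_test_helper.test_experiment()` is the test fixture used in this diff; substitute a real experiment in practice.

import os
import tempfile

from langfun.core.eval.v2 import checkpointing
from langfun.core.eval.v2 import eval_test_helper  # test fixture
from langfun.core.eval.v2 import reporting

experiment = eval_test_helper.test_experiment()
run = experiment.run(
    os.path.join(tempfile.mkdtemp(), 'my_run'),
    'new',
    plugins=[
        checkpointing.BulkCheckpointer('checkpoint.jsonl'),
        reporting.HtmlReporter(),            # summary.html + per-evaluation index.html
        reporting.ExampleHtmlGenerator(),    # <id>.html for each example
    ],
)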
langfun/core/eval/v2/reporting_test.py

@@ -29,7 +29,12 @@ class ReportingTest(unittest.TestCase):
     experiment = eval_test_helper.test_experiment()
     checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
     reporter = reporting.HtmlReporter()
-    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    example_html_generator = reporting.ExampleHtmlGenerator()
+    run = experiment.run(
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator]
+    )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
     )
@@ -52,8 +57,10 @@ class ReportingTest(unittest.TestCase):
     root_dir = os.path.join(tempfile.mkdtemp(), 'test_reporting2')
     experiment = eval_test_helper.test_experiment()
     run = experiment.run(
-        root_dir,
-        'new', plugins=[checkpointer, reporter], warm_start_from=run.output_root,
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator],
+        warm_start_from=run.output_root,
     )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
@@ -105,7 +112,12 @@ class ReportingTest(unittest.TestCase):
         .test_experiment_with_example_html_generation_error())
     checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
     reporter = reporting.HtmlReporter()
-    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    example_html_generator = reporting.ExampleHtmlGenerator()
+    run = experiment.run(
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator]
+    )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
     )
@@ -132,8 +144,10 @@ class ReportingTest(unittest.TestCase):
     experiment = (eval_test_helper
                   .test_experiment_with_example_html_generation_error())
     run = experiment.run(
-        root_dir,
-        'new', plugins=[checkpointer, reporter], warm_start_from=run.output_root,
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator],
+        warm_start_from=run.output_root,
     )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
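The `_generate`/`_copy` branching in reporting.py above keys off `current_run.generate_example_html` ('all', 'new', or an explicit list of example ids). Continuing the sketch from earlier, a warm-started second run along the lines of these tests; the copy-vs-regenerate behavior described in the comment is an assumption read off that branching, not a documented guarantee:

run2 = experiment.run(
    os.path.join(tempfile.mkdtemp(), 'my_run2'),
    'new',
    plugins=[
        checkpointing.BulkCheckpointer('checkpoint.jsonl'),
        reporting.HtmlReporter(),
        reporting.ExampleHtmlGenerator(),
    ],
    # Reuse artifacts from the previous run. With generate_example_html='new',
    # examples that are not newly processed are copied from the prior run's
    # output by ExampleHtmlGenerator._copy instead of being re-rendered.
    warm_start_from=run.output_root,
)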
langfun/core/eval/v2/runners/__init__.py (new file)

@@ -0,0 +1,26 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Langfun evaluation runners."""
+
+from langfun.core.eval.v2.runners.base import RunnerBase
+from langfun.core.eval.v2.runners.debug import DebugRunner
+from langfun.core.eval.v2.runners.parallel import ParallelRunner
+from langfun.core.eval.v2.runners.sequential import SequentialRunner
+
+__all__ = [
+    'RunnerBase',
+    'DebugRunner',
+    'ParallelRunner',
+    'SequentialRunner',
+]
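Because `runners/__init__.py` re-exports the runner classes, code that imported them from the old single-module path should keep resolving; only deep imports need the new submodule paths. A quick sanity check, assuming the re-exports shown above:

from langfun.core.eval.v2 import runners
from langfun.core.eval.v2.runners.parallel import ParallelRunner

# The package-level name and the submodule name refer to the same class.
assert runners.ParallelRunner is ParallelRunner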
langfun/core/eval/v2/{runners.py → runners/base.py}

@@ -11,13 +11,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+"""Base experiment runner."""
+
 import abc
-import collections
 import concurrent.futures
 import random
 import threading
-import time
 import traceback
 from typing import Any, Annotated, Callable, Iterator
 
@@ -42,7 +41,14 @@ _RUN_MANIFEST = 'run.json'
 
 
 class RunnerBase(Runner):
-  """
+  """Base class for runners with plugin support and IO pooling.
+
+  `RunnerBase` provides the basic runner functionalities such as plugin
+  integration for checkpointing, reporting and progress tracking.
+  It also manages a thread pool for background IO operations.
+  Subclasses should implement `_run` and `_evaluate_items` for different
+  execution strategies.
+  """
 
   tqdm: Annotated[
       bool,
@@ -58,6 +64,11 @@ class RunnerBase(Runner):
       reporting.HtmlReporter(),
   ]
 
+  max_background_threads: Annotated[
+      int,
+      'Max number of background threads for IO operations.'
+  ] = 128
+
   def _on_bound(self):
     super()._on_bound()
 
@@ -66,7 +77,9 @@ class RunnerBase(Runner):
     self.plugins.append(progress_tracking.progress_tracker(self.tqdm))
 
     self._io_pool_lock = threading.Lock()
-    self._io_pool = concurrent.futures.ThreadPoolExecutor(
+    self._io_pool = concurrent.futures.ThreadPoolExecutor(
+        max_workers=self.max_background_threads
+    )
     # TODO(daiyip): render background errors.
     self._background_last_error = None
 
@@ -220,7 +233,7 @@ class RunnerBase(Runner):
     else:
       # A evaluation could be considered as done if it has processed all the
      # examples specified by `example_ids`.
-      assert progress.is_completed
+      assert progress.is_completed, progress
       parent_progress.increment_processed()
 
     if parent_progress.is_completed:
@@ -335,6 +348,7 @@ class RunnerBase(Runner):
   def run_evaluation(self, evaluation: Evaluation) -> None:
     """Runs the evaluation."""
     try:
+      evaluation.setup()
       self.on_experiment_start(evaluation)
 
       per_evaluation_settings = {}
@@ -367,6 +381,8 @@ class RunnerBase(Runner):
     except BaseException as e:  # pylint: disable=broad-except
       self.on_experiment_abort(evaluation, e)
       raise e
+    finally:
+      evaluation.teardown()
 
   @abc.abstractmethod
   def _evaluate_items(
@@ -394,121 +410,3 @@ class RunnerBase(Runner):
     return in_memory.InMemory(
         self.current_run.output_path_for(experiment, 'cache.json')
     )
-
-
-class SequentialRunner(RunnerBase):
-  """Sequential runner.
-
-  Sequential runner runs all evaluations and their examples in sequence,
-  as well as the background tasks, it allows the developer to catch all
-  exceptions thrown from the background tasks, making it easier to debug.
-  """
-
-  NAME = 'sequential'
-
-  def background_run(
-      self, func: Callable[..., Any], *args: Any, **kwargs: Any
-  ) -> None:
-    """Runs the function with the IO pool."""
-    func(*args, **kwargs)
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the experiment in sequence."""
-    for e in evaluations:
-      self.run_evaluation(e)
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Runs the evaluation items in sequence."""
-    for item in items:
-      self.evaluate_item(evaluation, item)
-
-
-class DebugRunner(SequentialRunner):
-  """Debug runner."""
-
-  NAME = 'debug'
-
-  # Do not use the checkpointer for debug runner.
-  plugins = []
-
-  def _on_bound(self):
-    super()._on_bound()
-    if self.current_run.example_ids is None:
-      self.current_run.rebind(example_ids=[1], skip_notification=True)
-    self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
-
-  def _save_run_manifest(self) -> None:
-    """Do nothing to avoid overriden existing runs."""
-
-
-class ParallelRunner(RunnerBase):
-  """Parallel runner."""
-
-  NAME = 'parallel'
-
-  timeout: Annotated[
-      int | None,
-      'Timeout for each evaluation example.'
-  ] = None
-
-  concurrent_startup_delay: Annotated[
-      tuple[int, int] | None,
-      (
-          'A range of seconds to delay the initial evaluation of each thread '
-          'in the thread pool, helping to prevent a burst in LLM QPS at '
-          'startup. If set to None, no delay will be applied.'
-      )
-  ] = None
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the evaluations in parallel."""
-    def _run_group(evaluation_group: list[Evaluation]):
-      for e in evaluation_group:
-        self.run_evaluation(e)
-
-    # Run evaluations in parallel groupped by resource key.
-    groups: dict[str, list[Evaluation]] = collections.defaultdict(list)
-    for e in evaluations:
-      resource_ids = e.resource_ids()
-      if not resource_ids:
-        group_id = e.id
-      else:
-        # TODO(daiyip): support group that requires multiple resources.
-        group_id = resource_ids.pop()
-      groups[group_id].append(e)
-
-    for _, _, _ in lf.concurrent_map(
-        _run_group,
-        groups.values(),
-        max_workers=max(64, len(groups)),
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Override run items to run in parallel."""
-    if self.concurrent_startup_delay is not None:
-      thread_delayed = {}
-      def _evaluate_item(item: Example):
-        thread_id = threading.current_thread().ident
-        if thread_id not in thread_delayed:
-          thread_delayed[thread_id] = True
-          time.sleep(random.randint(*self.concurrent_startup_delay))
-        return self.evaluate_item(evaluation, item)
-    else:
-      def _evaluate_item(item: Example):
-        return self.evaluate_item(evaluation, item)
-
-    for _, _, _ in lf.concurrent_map(
-        _evaluate_item,
-        items,
-        max_workers=evaluation.max_workers,
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
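The new `RunnerBase` docstring spells out the extension contract: subclasses supply `_run` (how evaluations are scheduled) and `_evaluate_items` (how examples within one evaluation are executed), while `run_evaluation` now wraps each evaluation with `setup()`/`teardown()` and the plugin callbacks. A hypothetical subclass sketch; `ReversedRunner` and its `NAME` are illustrative, not part of langfun:

from typing import Iterator

from langfun.core.eval.v2.runners import base


class ReversedRunner(base.RunnerBase):
  """Sequential runner that visits examples in reverse order (illustrative)."""

  NAME = 'reversed'  # Registry key, following the 'sequential'/'parallel' pattern.

  def _run(self, evaluations: list[base.Evaluation]) -> None:
    for e in evaluations:
      # run_evaluation() handles evaluation.setup()/teardown() and plugin events.
      self.run_evaluation(e)

  def _evaluate_items(
      self, evaluation: base.Evaluation, items: Iterator[base.Example]
  ) -> None:
    for item in reversed(list(items)):
      self.evaluate_item(evaluation, item)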
langfun/core/eval/v2/runners/debug.py (new file)

@@ -0,0 +1,40 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Debug runner."""
+
+from langfun.core.eval.v2.runners import sequential
+
+
+class DebugRunner(sequential.SequentialRunner):
+  """A runner for debugging evaluations.
+
+  The debug runner is a sequential runner that only runs the first example
+  of each evaluation, with `raise_if_has_error` enabled. This is useful for
+  quickly identifying issues in evaluation logic during development.
+  Checkpointers are disabled for this runner.
+  """
+
+  NAME = 'debug'
+
+  # Do not use the checkpointer for debug runner.
+  plugins = []
+
+  def _on_bound(self):
+    super()._on_bound()
+    if self.current_run.example_ids is None:
+      self.current_run.rebind(example_ids=[1], skip_notification=True)
+    self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
+
+  def _save_run_manifest(self) -> None:
+    """Do nothing to avoid overriden existing runs."""
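Selecting the debug runner by its `NAME`, as the new debug_test.py below does: only the first example of each evaluation runs, errors raise immediately, and no `run.json` manifest is written. `eval_test_helper.test_experiment()` is again the test fixture; use a real experiment in practice.

import os
import tempfile

from langfun.core.eval.v2 import eval_test_helper  # test fixture

exp = eval_test_helper.test_experiment()
root_dir = os.path.join(tempfile.mkdtemp(), 'debug_run')
run = exp.run(root_dir, runner='debug')  # first example only; errors propagate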
langfun/core/eval/v2/runners/debug_test.py (new file)

@@ -0,0 +1,79 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for debug runner."""
+import os
+import tempfile
+from typing import Any
+import unittest
+
+from langfun.core.eval.v2 import eval_test_helper
+from langfun.core.eval.v2.runners import debug  # pylint: disable=unused-import
+
+import pyglove as pg
+
+
+class RunnerTest(unittest.TestCase):
+
+  def assert_same_list(self, actual: list[Any], expected: list[Any]):
+    self.assertEqual(len(actual), len(expected))
+    for i, (x, y) in enumerate(zip(actual, expected)):
+      if x is not y:
+        print(i, pg.diff(x, y))
+      self.assertIs(x, y)
+
+
+class DebugRunnerTest(RunnerTest):
+
+  def test_debug_runner(self):
+    plugin = eval_test_helper.TestPlugin()
+    exp = eval_test_helper.test_experiment()
+    root_dir = os.path.join(tempfile.mkdtemp(), 'test_debug_runner')
+    run = exp.run(root_dir, runner='debug', plugins=[plugin])
+
+    self.assertIsNotNone(plugin.start_time)
+    self.assertIsNotNone(plugin.complete_time)
+    self.assertGreater(plugin.complete_time, plugin.start_time)
+
+    self.assertEqual(
+        len(plugin.started_experiments), len(exp.nodes)
+    )
+    self.assertEqual(
+        len(plugin.completed_experiments), len(exp.nodes)
+    )
+    self.assertEqual(
+        len(plugin.started_example_ids), 6 * 1
+    )
+    self.assertEqual(
+        len(plugin.completed_example_ids), 6 * 1
+    )
+    self.assert_same_list(plugin.skipped_experiments, [])
+    self.assertFalse(
+        pg.io.path_exists(os.path.join(run.output_root, 'run.json'))
+    )
+
+    for node in exp.nodes:
+      self.assertTrue(node.progress.is_started)
+      self.assertTrue(node.progress.is_completed)
+      if node.is_leaf:
+        self.assertEqual(node.progress.num_skipped, 0)
+        self.assertEqual(node.progress.num_completed, 1)
+        self.assertEqual(node.progress.num_failed, 0)
+      else:
+        self.assertEqual(node.progress.num_skipped, 0)
+        self.assertEqual(node.progress.num_failed, 0)
+        self.assertEqual(node.progress.num_processed, node.progress.num_total)
+
+
+if __name__ == '__main__':
+  unittest.main()
langfun/core/eval/v2/runners/parallel.py (new file)

@@ -0,0 +1,100 @@
+# Copyright 2025 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parallel runner."""
+
+import collections
+import random
+import threading
+import time
+
+from typing import Annotated, Iterator
+import langfun.core as lf
+from langfun.core.eval.v2.runners import base
+
+
+class ParallelRunner(base.RunnerBase):
+  """A runner that executes evaluations and examples in parallel.
+
+  The parallel runner groups evaluations by their required resources
+  (e.g., specific LLMs) and runs evaluations that do not share resources in
+  parallel. Within each evaluation, examples are also processed in parallel
+  using threads, up to `Evaluation.max_workers`.
+  """
+
+  NAME = 'parallel'
+
+  timeout: Annotated[
+      int | None,
+      'Timeout for each evaluation example.'
+  ] = None
+
+  concurrent_startup_delay: Annotated[
+      tuple[int, int] | None,
+      (
+          'A range of seconds to delay the initial evaluation of each thread '
+          'in the thread pool, helping to prevent a burst in LLM QPS at '
+          'startup. If set to None, no delay will be applied.'
+      )
+  ] = None
+
+  def _run(self, evaluations: list[base.Evaluation]) -> None:
+    """Runs the evaluations in parallel."""
+    def _run_group(evaluation_group: list[base.Evaluation]):
+      for e in evaluation_group:
+        self.run_evaluation(e)
+
+    # Run evaluations in parallel groupped by resource key.
+    groups: dict[str, list[base.Evaluation]] = collections.defaultdict(list)
+    for e in evaluations:
+      resource_ids = e.resource_ids()
+      if not resource_ids:
+        group_id = e.id
+      else:
+        # TODO(daiyip): support group that requires multiple resources.
+        group_id = resource_ids.pop()
+      groups[group_id].append(e)
+
+    for _, _, _ in lf.concurrent_map(
+        _run_group,
+        groups.values(),
+        max_workers=max(64, len(groups)),
+        timeout=self.timeout,
+        silence_on_errors=None,
+    ):
+      pass
+
+  def _evaluate_items(
+      self, evaluation: base.Evaluation, items: Iterator[base.Example]
+  ) -> None:
+    """Override run items to run in parallel."""
+    if self.concurrent_startup_delay is not None:
+      thread_delayed = {}
+      def _evaluate_item(item: base.Example):
+        thread_id = threading.current_thread().ident
+        if thread_id not in thread_delayed:
+          thread_delayed[thread_id] = True
+          time.sleep(random.randint(*self.concurrent_startup_delay))
+        return self.evaluate_item(evaluation, item)
+    else:
+      def _evaluate_item(item: base.Example):
+        return self.evaluate_item(evaluation, item)
+
+    for _, _, _ in lf.concurrent_map(
+        _evaluate_item,
+        items,
+        max_workers=evaluation.max_workers,
+        timeout=self.timeout,
+        silence_on_errors=None,
+    ):
+      pass
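A standalone sketch of the grouping policy in `ParallelRunner._run`: evaluations sharing a resource id (for example, the same LLM) land in one group and run sequentially within it, while distinct groups run in parallel, so concurrency never doubles up on a shared resource. `Eval` below is a toy stand-in, not the langfun `Evaluation` class:

import collections


class Eval:
  """Toy stand-in exposing just the two members the grouping logic reads."""

  def __init__(self, eid: str, resources: set[str]):
    self.id = eid
    self._resources = resources

  def resource_ids(self) -> set[str]:
    return set(self._resources)


def group_by_resource(evaluations: list[Eval]) -> dict[str, list[Eval]]:
  groups = collections.defaultdict(list)
  for e in evaluations:
    resource_ids = e.resource_ids()
    # Evaluations without resources get their own group (keyed by id);
    # otherwise one resource id is picked as the group key.
    group_id = e.id if not resource_ids else resource_ids.pop()
    groups[group_id].append(e)
  return groups


groups = group_by_resource([
    Eval('a', {'gpt-4o'}), Eval('b', {'gpt-4o'}), Eval('c', set()),
])
assert len(groups) == 2  # {'gpt-4o': [a, b], 'c': [c]}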