langfun 0.1.2.dev202510230805__py3-none-any.whl → 0.1.2.dev202511270805__py3-none-any.whl

Files changed (155)
  1. langfun/core/__init__.py +2 -0
  2. langfun/core/agentic/__init__.py +4 -1
  3. langfun/core/agentic/action.py +447 -29
  4. langfun/core/agentic/action_eval.py +9 -2
  5. langfun/core/agentic/action_test.py +149 -21
  6. langfun/core/async_support.py +32 -3
  7. langfun/core/coding/python/correction.py +19 -9
  8. langfun/core/coding/python/execution.py +14 -12
  9. langfun/core/coding/python/generation.py +21 -16
  10. langfun/core/coding/python/sandboxing.py +23 -3
  11. langfun/core/component.py +42 -3
  12. langfun/core/concurrent.py +70 -6
  13. langfun/core/concurrent_test.py +1 -0
  14. langfun/core/console.py +1 -1
  15. langfun/core/data/conversion/anthropic.py +12 -3
  16. langfun/core/data/conversion/anthropic_test.py +8 -6
  17. langfun/core/data/conversion/gemini.py +9 -2
  18. langfun/core/data/conversion/gemini_test.py +12 -9
  19. langfun/core/data/conversion/openai.py +145 -31
  20. langfun/core/data/conversion/openai_test.py +161 -17
  21. langfun/core/eval/base.py +47 -43
  22. langfun/core/eval/base_test.py +5 -5
  23. langfun/core/eval/matching.py +5 -2
  24. langfun/core/eval/patching.py +3 -3
  25. langfun/core/eval/scoring.py +4 -3
  26. langfun/core/eval/v2/__init__.py +1 -0
  27. langfun/core/eval/v2/checkpointing.py +64 -6
  28. langfun/core/eval/v2/checkpointing_test.py +9 -2
  29. langfun/core/eval/v2/eval_test_helper.py +103 -2
  30. langfun/core/eval/v2/evaluation.py +91 -16
  31. langfun/core/eval/v2/evaluation_test.py +9 -3
  32. langfun/core/eval/v2/example.py +50 -40
  33. langfun/core/eval/v2/example_test.py +16 -8
  34. langfun/core/eval/v2/experiment.py +74 -8
  35. langfun/core/eval/v2/experiment_test.py +19 -0
  36. langfun/core/eval/v2/metric_values.py +31 -3
  37. langfun/core/eval/v2/metric_values_test.py +32 -0
  38. langfun/core/eval/v2/metrics.py +157 -44
  39. langfun/core/eval/v2/metrics_test.py +39 -18
  40. langfun/core/eval/v2/progress.py +30 -1
  41. langfun/core/eval/v2/progress_test.py +27 -0
  42. langfun/core/eval/v2/progress_tracking.py +12 -3
  43. langfun/core/eval/v2/progress_tracking_test.py +6 -1
  44. langfun/core/eval/v2/reporting.py +90 -71
  45. langfun/core/eval/v2/reporting_test.py +24 -6
  46. langfun/core/eval/v2/runners/__init__.py +30 -0
  47. langfun/core/eval/v2/{runners.py → runners/base.py} +59 -142
  48. langfun/core/eval/v2/runners/beam.py +341 -0
  49. langfun/core/eval/v2/runners/beam_test.py +131 -0
  50. langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
  51. langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
  52. langfun/core/eval/v2/runners/debug.py +40 -0
  53. langfun/core/eval/v2/runners/debug_test.py +76 -0
  54. langfun/core/eval/v2/runners/parallel.py +100 -0
  55. langfun/core/eval/v2/runners/parallel_test.py +95 -0
  56. langfun/core/eval/v2/runners/sequential.py +47 -0
  57. langfun/core/eval/v2/runners/sequential_test.py +172 -0
  58. langfun/core/langfunc.py +45 -130
  59. langfun/core/langfunc_test.py +7 -5
  60. langfun/core/language_model.py +141 -21
  61. langfun/core/language_model_test.py +54 -3
  62. langfun/core/llms/__init__.py +9 -1
  63. langfun/core/llms/anthropic.py +157 -2
  64. langfun/core/llms/azure_openai.py +29 -17
  65. langfun/core/llms/cache/base.py +25 -3
  66. langfun/core/llms/cache/in_memory.py +48 -7
  67. langfun/core/llms/cache/in_memory_test.py +14 -4
  68. langfun/core/llms/compositional.py +25 -1
  69. langfun/core/llms/deepseek.py +30 -2
  70. langfun/core/llms/fake.py +32 -1
  71. langfun/core/llms/gemini.py +55 -17
  72. langfun/core/llms/gemini_test.py +84 -0
  73. langfun/core/llms/google_genai.py +34 -1
  74. langfun/core/llms/groq.py +28 -3
  75. langfun/core/llms/llama_cpp.py +23 -4
  76. langfun/core/llms/openai.py +36 -3
  77. langfun/core/llms/openai_compatible.py +148 -27
  78. langfun/core/llms/openai_compatible_test.py +207 -20
  79. langfun/core/llms/openai_test.py +0 -2
  80. langfun/core/llms/rest.py +12 -1
  81. langfun/core/llms/vertexai.py +58 -8
  82. langfun/core/logging.py +1 -1
  83. langfun/core/mcp/client.py +77 -22
  84. langfun/core/mcp/client_test.py +8 -35
  85. langfun/core/mcp/session.py +94 -29
  86. langfun/core/mcp/session_test.py +54 -0
  87. langfun/core/mcp/tool.py +151 -22
  88. langfun/core/mcp/tool_test.py +197 -0
  89. langfun/core/memory.py +1 -0
  90. langfun/core/message.py +160 -55
  91. langfun/core/message_test.py +65 -81
  92. langfun/core/modalities/__init__.py +8 -0
  93. langfun/core/modalities/audio.py +21 -1
  94. langfun/core/modalities/image.py +19 -1
  95. langfun/core/modalities/mime.py +64 -3
  96. langfun/core/modalities/mime_test.py +11 -0
  97. langfun/core/modalities/pdf.py +19 -1
  98. langfun/core/modalities/video.py +21 -1
  99. langfun/core/modality.py +167 -29
  100. langfun/core/modality_test.py +42 -12
  101. langfun/core/natural_language.py +1 -1
  102. langfun/core/sampling.py +4 -4
  103. langfun/core/sampling_test.py +20 -4
  104. langfun/core/structured/__init__.py +2 -24
  105. langfun/core/structured/completion.py +34 -44
  106. langfun/core/structured/completion_test.py +23 -43
  107. langfun/core/structured/description.py +54 -50
  108. langfun/core/structured/function_generation.py +29 -12
  109. langfun/core/structured/mapping.py +81 -37
  110. langfun/core/structured/parsing.py +95 -79
  111. langfun/core/structured/parsing_test.py +0 -3
  112. langfun/core/structured/querying.py +215 -142
  113. langfun/core/structured/querying_test.py +65 -29
  114. langfun/core/structured/schema/__init__.py +49 -0
  115. langfun/core/structured/schema/base.py +664 -0
  116. langfun/core/structured/schema/base_test.py +531 -0
  117. langfun/core/structured/schema/json.py +174 -0
  118. langfun/core/structured/schema/json_test.py +121 -0
  119. langfun/core/structured/schema/python.py +316 -0
  120. langfun/core/structured/schema/python_test.py +410 -0
  121. langfun/core/structured/schema_generation.py +33 -14
  122. langfun/core/structured/scoring.py +47 -36
  123. langfun/core/structured/tokenization.py +26 -11
  124. langfun/core/subscription.py +2 -2
  125. langfun/core/template.py +174 -49
  126. langfun/core/template_test.py +123 -17
  127. langfun/env/__init__.py +8 -2
  128. langfun/env/base_environment.py +320 -128
  129. langfun/env/base_environment_test.py +473 -0
  130. langfun/env/base_feature.py +92 -15
  131. langfun/env/base_feature_test.py +228 -0
  132. langfun/env/base_sandbox.py +84 -361
  133. langfun/env/base_sandbox_test.py +1235 -0
  134. langfun/env/event_handlers/__init__.py +1 -1
  135. langfun/env/event_handlers/chain.py +233 -0
  136. langfun/env/event_handlers/chain_test.py +253 -0
  137. langfun/env/event_handlers/event_logger.py +95 -98
  138. langfun/env/event_handlers/event_logger_test.py +21 -21
  139. langfun/env/event_handlers/metric_writer.py +225 -140
  140. langfun/env/event_handlers/metric_writer_test.py +23 -6
  141. langfun/env/interface.py +854 -40
  142. langfun/env/interface_test.py +112 -2
  143. langfun/env/load_balancers_test.py +23 -2
  144. langfun/env/test_utils.py +126 -84
  145. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/METADATA +1 -1
  146. langfun-0.1.2.dev202511270805.dist-info/RECORD +215 -0
  147. langfun/core/eval/v2/runners_test.py +0 -343
  148. langfun/core/structured/schema.py +0 -987
  149. langfun/core/structured/schema_test.py +0 -982
  150. langfun/env/base_test.py +0 -1481
  151. langfun/env/event_handlers/base.py +0 -350
  152. langfun-0.1.2.dev202510230805.dist-info/RECORD +0 -195
  153. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/WHEEL +0 -0
  154. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/licenses/LICENSE +0 -0
  155. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/reporting.py

@@ -32,8 +32,97 @@ _SUMMARY_FILE = 'summary.html'
 _EVALULATION_DETAIL_FILE = 'index.html'
 
 
+class ExampleHtmlGenerator(experiment_lib.Plugin):
+  """Plugin for generating HTML views for each evaluation example."""
+
+  def on_example_complete(
+      self, runner: Runner, experiment: Experiment, example: Example
+  ):
+    self._save_example_html(runner, experiment, example)
+
+  def _save_example_html(
+      self, runner: Runner, experiment: Experiment, example: Example
+  ) -> None:
+    """Saves the example in HTML format."""
+    current_run = runner.current_run
+    def _generate():
+      try:
+        with pg.timeit() as t:
+          html = example.to_html(
+              collapse_level=None,
+              enable_summary_tooltip=False,
+              extra_flags=dict(
+                  # For properly rendering the next link.
+                  num_examples=getattr(experiment, 'num_examples', None)
+              ),
+          )
+          html.save(
+              runner.current_run.output_path_for(
+                  experiment, f'{example.id}.html'
+              )
+          )
+        experiment.info(
+            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to generate \'{example.id}.html\'. '
+            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+        )
+        raise e
+
+    def _copy():
+      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
+      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
+
+      if src_file == dest_file:
+        return
+
+      if not pg.io.path_exists(src_file):
+        experiment.warning(
+            f'Skip copying \'{example.id}.html\' as '
+            f'{src_file!r} does not exist.'
+        )
+        return
+
+      try:
+        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
+          content = src.read()
+          with pg.io.open(dest_file, 'w') as dest:
+            dest.write(content)
+        experiment.info(
+            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
+        )
+        raise e
+
+    generate_example_html = current_run.generate_example_html
+    if (generate_example_html == 'all'
+        or (generate_example_html == 'new' and example.newly_processed)
+        or (isinstance(generate_example_html, list)
+            and example.id in generate_example_html)):
+      op = _generate
+    else:
+      op = _copy
+    runner.background_run(op)
+
+
 class HtmlReporter(experiment_lib.Plugin):
-  """Plugin for periodically generating HTML reports for the experiment."""
+  """Plugin for periodically generating HTML reports for the experiment.
+
+  The `HtmlReporter` plugin generates several HTML files during an experiment
+  run:
+  - A `summary.html` at the root of the run directory, summarizing all
+    evaluations in the experiment.
+  - An `index.html` for each leaf evaluation, detailing the evaluation
+    definition, metrics, and logs.
+
+  These reports are updated periodically in the background during the run,
+  allowing users to monitor progress in near real-time.
+  """
 
   summary_interval: Annotated[
       int,
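
The generate-vs-copy dispatch at the end of `_save_example_html` keys off the run's `generate_example_html` option. A standalone sketch of that rule, assuming the option is `'all'`, `'new'`, or an explicit list of example IDs as the code above suggests (`should_generate` is a hypothetical helper, not langfun API):

from typing import Literal, Sequence, Union

GenerateExampleHtml = Union[Literal['all', 'new'], Sequence[int]]

def should_generate(
    option: GenerateExampleHtml, example_id: int, newly_processed: bool
) -> bool:
  """True: regenerate the example's HTML; False: copy it from the prior run."""
  if option == 'all':
    return True
  if option == 'new':
    return newly_processed
  return example_id in option  # explicit list of example IDs

assert should_generate('all', 3, newly_processed=False)
assert not should_generate('new', 3, newly_processed=False)
assert should_generate([1, 3], 3, newly_processed=False)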
@@ -127,7 +216,6 @@ class HtmlReporter(experiment_lib.Plugin):
   def on_example_complete(
       self, runner: Runner, experiment: Experiment, example: Example
   ):
-    self._save_example_html(runner, experiment, example)
     self._maybe_update_experiment_html(runner, experiment)
     self._maybe_update_summary(runner)
 
@@ -197,72 +285,3 @@ class HtmlReporter(experiment_lib.Plugin):
       runner.background_run(_save)
     else:
       _save()
-
-  def _save_example_html(
-      self, runner: Runner, experiment: Experiment, example: Example
-  ) -> None:
-    """Saves the example in HTML format."""
-    current_run = runner.current_run
-    def _generate():
-      try:
-        with pg.timeit() as t:
-          html = example.to_html(
-              collapse_level=None,
-              enable_summary_tooltip=False,
-              extra_flags=dict(
-                  # For properly rendering the next link.
-                  num_examples=getattr(experiment, 'num_examples', None)
-              ),
-          )
-          html.save(
-              runner.current_run.output_path_for(
-                  experiment, f'{example.id}.html'
-              )
-          )
-        experiment.info(
-            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
-        )
-      except BaseException as e:  # pylint: disable=broad-except
-        experiment.error(
-            f'Failed to generate \'{example.id}.html\'. '
-            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
-        )
-        raise e
-
-    def _copy():
-      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
-      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
-
-      if src_file == dest_file:
-        return
-
-      if not pg.io.path_exists(src_file):
-        experiment.warning(
-            f'Skip copying \'{example.id}.html\' as '
-            f'{src_file!r} does not exist.'
-        )
-        return
-
-      try:
-        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
-          content = src.read()
-          with pg.io.open(dest_file, 'w') as dest:
-            dest.write(content)
-        experiment.info(
-            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
-        )
-      except BaseException as e:  # pylint: disable=broad-except
-        experiment.error(
-            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
-        )
-        raise e
-
-    generate_example_html = current_run.generate_example_html
-    if (generate_example_html == 'all'
-        or (generate_example_html == 'new' and example.newly_processed)
-        or (isinstance(generate_example_html, list)
-            and example.id in generate_example_html)):
-      op = _generate
-    else:
-      op = _copy
-    runner.background_run(op)
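
Taken together, the three reporting.py hunks move per-example HTML generation out of `HtmlReporter` and into the dedicated `ExampleHtmlGenerator` plugin, making per-example pages opt-in. A sketch of the resulting wiring, with the `experiment.run(...)` shape taken from the test diff below (illustrative, not a complete API reference):

from langfun.core.eval.v2 import checkpointing, reporting

plugins = [
    checkpointing.BulkCheckpointer('checkpoint.jsonl'),  # example checkpoints
    reporting.HtmlReporter(),          # summary.html + per-evaluation index.html
    reporting.ExampleHtmlGenerator(),  # <example.id>.html, one per example
]
# run = experiment.run(root_dir, 'new', plugins=plugins)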
langfun/core/eval/v2/reporting_test.py

@@ -29,7 +29,16 @@ class ReportingTest(unittest.TestCase):
     experiment = eval_test_helper.test_experiment()
     checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
     reporter = reporting.HtmlReporter()
-    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    self.assertFalse(reporter.is_per_example())
+
+    example_html_generator = reporting.ExampleHtmlGenerator()
+    self.assertTrue(example_html_generator.is_per_example())
+
+    run = experiment.run(
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator]
+    )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
     )
@@ -52,8 +61,10 @@
     root_dir = os.path.join(tempfile.mkdtemp(), 'test_reporting2')
     experiment = eval_test_helper.test_experiment()
     run = experiment.run(
-        root_dir, 'new', plugins=[checkpointer, reporter],
-        warm_start_from=run.output_root
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator],
+        warm_start_from=run.output_root,
     )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
@@ -105,7 +116,12 @@
                   .test_experiment_with_example_html_generation_error())
     checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
     reporter = reporting.HtmlReporter()
-    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    example_html_generator = reporting.ExampleHtmlGenerator()
+    run = experiment.run(
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator]
+    )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
     )
@@ -132,8 +148,10 @@
     experiment = (eval_test_helper
                   .test_experiment_with_example_html_generation_error())
     run = experiment.run(
-        root_dir, 'new', plugins=[checkpointer, reporter],
-        warm_start_from=run.output_root
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator],
+        warm_start_from=run.output_root,
     )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
langfun/core/eval/v2/runners/__init__.py (new file)

@@ -0,0 +1,30 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Langfun evaluation runners."""
+
+# pylint: disable=g-importing-member
+from langfun.core.eval.v2.runners.base import RunnerBase
+from langfun.core.eval.v2.runners.beam import BeamRunner
+from langfun.core.eval.v2.runners.debug import DebugRunner
+from langfun.core.eval.v2.runners.parallel import ParallelRunner
+from langfun.core.eval.v2.runners.sequential import SequentialRunner
+# pylint: enable=g-importing-member
+
+__all__ = [
+    'RunnerBase',
+    'BeamRunner',
+    'DebugRunner',
+    'ParallelRunner',
+    'SequentialRunner',
+]
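
With this `__init__.py`, the former single-module `runners.py` becomes a `runners/` package whose public classes are re-exported at the package root, so imports of the following form keep resolving (module paths as declared above; `BeamRunner` is new in this release):

from langfun.core.eval.v2.runners import ParallelRunner, SequentialRunner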
langfun/core/eval/v2/{runners.py → runners/base.py}

@@ -11,15 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Evaluation experiment runners."""
+"""Base experiment runner."""
+
 import abc
-import collections
 import concurrent.futures
 import random
 import threading
-import time
 import traceback
-from typing import Any, Annotated, Callable, Iterator
+from typing import Any, Annotated, Callable, Iterator, Literal
 
 from langfun import core as lf
 from langfun.core.eval.v2 import checkpointing
@@ -42,31 +41,55 @@ _RUN_MANIFEST = 'run.json'
 
 
 class RunnerBase(Runner):
-  """A simple runner that runs evaluations and their examples sequentially."""
+  """Base class for runners with plugin support and IO pooling.
+
+  `RunnerBase` provides the basic runner functionalities such as plugin
+  integration for checkpointing, reporting and progress tracking.
+  It also manages a thread pool for background IO operations.
+  Subclasses should implement `_run` and `_evaluate_items` for different
+  execution strategies.
+  """
 
-  tqdm: Annotated[
-      bool,
+  progress_tracker: Annotated[
+      Literal['tqdm', 'html', 'auto', None],
       (
-          'If True, force using tqdm for progress update. Otherwise, determine '
-          'it automatically based on the running environment (console vs. '
-          'notebook)'
+          'If `tqdm`, force using tqdm for progress update. '
+          'If `html`, force using html for progress update. '
+          'If `auto`, determine it automatically based on the running '
+          'environment (console vs. notebook)'
+          'If `none`, disable progress update.'
       )
-  ] = False
+  ] = 'auto'
 
   plugins = [
       checkpointing.BulkCheckpointer(),
       reporting.HtmlReporter(),
   ]
 
+  max_background_threads: Annotated[
+      int,
+      'Max number of background threads for IO operations.'
+  ] = 128
+
   def _on_bound(self):
     super()._on_bound()
 
     # Install the tqdm plugin if needed.
-    with pg.notify_on_change(False):
-      self.plugins.append(progress_tracking.progress_tracker(self.tqdm))
+    if self.progress_tracker is not None:
+      with pg.notify_on_change(False):
+        self.plugins.append(
+            progress_tracking.progress_tracker(self.progress_tracker)
+        )
+
+    if self.max_background_threads > 0:
+      self._io_pool_lock = threading.Lock()
+      self._io_pool = concurrent.futures.ThreadPoolExecutor(
+          max_workers=self.max_background_threads
+      )
+    else:
+      self._io_pool_lock = None
+      self._io_pool = None
 
-    self._io_pool_lock = threading.Lock()
-    self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
     # TODO(daiyip): render background errors.
     self._background_last_error = None
 
@@ -78,9 +101,12 @@ class RunnerBase(Runner):
       except Exception as e:  # pylint: disable=broad-except
         self._background_last_error = e
 
-    with self._io_pool_lock:
-      if self._io_pool is not None:
-        self._io_pool.submit(_background_run, *args, **kwargs)
+    if self.max_background_threads > 0:
+      with self._io_pool_lock:
+        if self._io_pool is not None:
+          self._io_pool.submit(_background_run, *args, **kwargs)
+    else:
+      _background_run(*args, **kwargs)
 
   def _all_plugins(self, experiment: Experiment) -> Iterator[Plugin]:
     """Returns all plugins for the experiment."""
@@ -139,6 +165,7 @@ class RunnerBase(Runner):
       plugin.on_experiment_start(self, experiment)
 
     if experiment.is_leaf:
+      pg.io.mkdirs(self.current_run.output_dir(experiment))
       experiment.info(
           f'Starting evaluation {experiment.id!r} with '
           f'{num_examples_to_evaluate} examples to evaluate.'
@@ -220,7 +247,7 @@ class RunnerBase(Runner):
       else:
         # A evaluation could be considered as done if it has processed all the
        # examples specified by `example_ids`.
-        assert progress.is_completed
+        assert progress.is_completed, progress
       parent_progress.increment_processed()
 
       if parent_progress.is_completed:
@@ -235,6 +262,8 @@
       example: Example
   ) -> None:
     """Called when an evaluation example is started."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=True)
     for plugin in self._all_plugins(experiment):
       plugin.on_example_start(self, experiment, example)
     experiment.info(f'Starting to evaluate example {example.id}.')
@@ -245,6 +274,8 @@
       example: Example
   ) -> None:
     """Called when an evaluation example is complete."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=False)
     if example.newly_processed:
       if example.error is None:
         experiment.progress.increment_processed()
@@ -256,7 +287,7 @@ class RunnerBase(Runner):
         experiment.progress.increment_failed()
         experiment.error(
             (
-                f'Failed to evaluate example {example.id} in'
+                f'Failed to evaluate example {example.id} in '
                 f'{example.elapse:.2f} seconds.'
             ),
             error=example.error
@@ -316,7 +347,7 @@ class RunnerBase(Runner):
         self._run(targets)
 
       self.on_run_complete()
-    except Exception as e:  # pylint: disable=broad-except
+    except BaseException as e:  # pylint: disable=broad-except
       self.on_run_abort(e)
       raise e
    finally:
@@ -324,9 +355,10 @@
         self.background_run(cache.save)
 
       # Wait for the background tasks to finish.
-      with self._io_pool_lock:
-        self._io_pool, io_pool = None, self._io_pool
-      io_pool.shutdown(wait=True)
+      if self.max_background_threads > 0:
+        with self._io_pool_lock:
+          self._io_pool, io_pool = None, self._io_pool
+        io_pool.shutdown(wait=True)
 
   @abc.abstractmethod
   def _run(self, evaluations: list[Evaluation]) -> None:
@@ -335,6 +367,7 @@
   def run_evaluation(self, evaluation: Evaluation) -> None:
     """Runs the evaluation."""
     try:
+      evaluation.setup()
       self.on_experiment_start(evaluation)
 
       per_evaluation_settings = {}
@@ -367,6 +400,8 @@
     except BaseException as e:  # pylint: disable=broad-except
       self.on_experiment_abort(evaluation, e)
       raise e
+    finally:
+      evaluation.teardown()
 
   @abc.abstractmethod
   def _evaluate_items(
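
Two small but load-bearing lifecycle additions in the last two hunks: `evaluation.setup()` now runs before any plugin callback, and `evaluation.teardown()` is guaranteed by a `finally` clause even when the run aborts. The contract, sketched with a stand-in class (not langfun's `Evaluation`):

class FakeEvaluation:
  """Stand-in exposing the two lifecycle hooks added by this diff."""

  def setup(self) -> None:
    print('setup')     # acquire resources before any plugin callback fires

  def teardown(self) -> None:
    print('teardown')  # must run even when evaluation raised


def run_evaluation(evaluation: FakeEvaluation) -> None:
  try:
    evaluation.setup()
    raise RuntimeError('abort mid-run')  # simulate a failing run
  finally:
    evaluation.teardown()  # still executes before the error propagates


try:
  run_evaluation(FakeEvaluation())
except RuntimeError:
  pass  # prints 'setup' then 'teardown'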
@@ -394,121 +429,3 @@
     return in_memory.InMemory(
         self.current_run.output_path_for(experiment, 'cache.json')
     )
-
-
-class SequentialRunner(RunnerBase):
-  """Sequential runner.
-
-  Sequential runner runs all evaluations and their examples in sequence,
-  as well as the background tasks, it allows the developer to catch all
-  exceptions thrown from the background tasks, making it easier to debug.
-  """
-
-  NAME = 'sequential'
-
-  def background_run(
-      self, func: Callable[..., Any], *args: Any, **kwargs: Any
-  ) -> None:
-    """Runs the function with the IO pool."""
-    func(*args, **kwargs)
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the experiment in sequence."""
-    for e in evaluations:
-      self.run_evaluation(e)
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Runs the evaluation items in sequence."""
-    for item in items:
-      self.evaluate_item(evaluation, item)
-
-
-class DebugRunner(SequentialRunner):
-  """Debug runner."""
-
-  NAME = 'debug'
-
-  # Do not use the checkpointer for debug runner.
-  plugins = []
-
-  def _on_bound(self):
-    super()._on_bound()
-    if self.current_run.example_ids is None:
-      self.current_run.rebind(example_ids=[1], skip_notification=True)
-    self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
-
-  def _save_run_manifest(self) -> None:
-    """Do nothing to avoid overriden existing runs."""
-
-
-class ParallelRunner(RunnerBase):
-  """Parallel runner."""
-
-  NAME = 'parallel'
-
-  timeout: Annotated[
-      int | None,
-      'Timeout for each evaluation example.'
-  ] = None
-
-  concurrent_startup_delay: Annotated[
-      tuple[int, int] | None,
-      (
-          'A range of seconds to delay the initial evaluation of each thread '
-          'in the thread pool, helping to prevent a burst in LLM QPS at '
-          'startup. If set to None, no delay will be applied.'
-      )
-  ] = None
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the evaluations in parallel."""
-    def _run_group(evaluation_group: list[Evaluation]):
-      for e in evaluation_group:
-        self.run_evaluation(e)
-
-    # Run evaluations in parallel groupped by resource key.
-    groups: dict[str, list[Evaluation]] = collections.defaultdict(list)
-    for e in evaluations:
-      resource_ids = e.resource_ids()
-      if not resource_ids:
-        group_id = e.id
-      else:
-        # TODO(daiyip): support group that requires multiple resources.
-        group_id = resource_ids.pop()
-      groups[group_id].append(e)
-
-    for _, _, _ in lf.concurrent_map(
-        _run_group,
-        groups.values(),
-        max_workers=max(64, len(groups)),
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Override run items to run in parallel."""
-    if self.concurrent_startup_delay is not None:
-      thread_delayed = {}
-      def _evaluate_item(item: Example):
-        thread_id = threading.current_thread().ident
-        if thread_id not in thread_delayed:
-          thread_delayed[thread_id] = True
-          time.sleep(random.randint(*self.concurrent_startup_delay))
-        return self.evaluate_item(evaluation, item)
-    else:
-      def _evaluate_item(item: Example):
-        return self.evaluate_item(evaluation, item)
-
-    for _, _, _ in lf.concurrent_map(
-        _evaluate_item,
-        items,
-        max_workers=evaluation.max_workers,
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
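
Note that the three runner classes removed above are relocated, not deleted: per the file list, `SequentialRunner`, `DebugRunner`, and `ParallelRunner` reappear in `runners/sequential.py`, `runners/debug.py`, and `runners/parallel.py` respectively, alongside the brand-new `runners/beam.py` and `runners/ckpt_monitor.py`.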