langfun 0.0.2.dev20240429__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. langfun/__init__.py +20 -2
  2. langfun/core/__init__.py +16 -5
  3. langfun/core/agentic/__init__.py +30 -0
  4. langfun/core/agentic/action.py +854 -0
  5. langfun/core/agentic/action_eval.py +150 -0
  6. langfun/core/agentic/action_eval_test.py +109 -0
  7. langfun/core/agentic/action_test.py +136 -0
  8. langfun/core/coding/python/__init__.py +5 -11
  9. langfun/core/coding/python/correction.py +37 -21
  10. langfun/core/coding/python/correction_test.py +29 -3
  11. langfun/core/coding/python/execution.py +40 -216
  12. langfun/core/coding/python/execution_test.py +29 -89
  13. langfun/core/coding/python/generation.py +21 -11
  14. langfun/core/coding/python/generation_test.py +2 -2
  15. langfun/core/coding/python/parsing.py +108 -193
  16. langfun/core/coding/python/parsing_test.py +2 -105
  17. langfun/core/component.py +63 -2
  18. langfun/core/component_test.py +53 -0
  19. langfun/core/concurrent.py +414 -117
  20. langfun/core/concurrent_test.py +111 -24
  21. langfun/core/console.py +18 -5
  22. langfun/core/console_test.py +17 -0
  23. langfun/core/eval/__init__.py +16 -1
  24. langfun/core/eval/base.py +622 -174
  25. langfun/core/eval/base_test.py +200 -54
  26. langfun/core/eval/matching.py +63 -76
  27. langfun/core/eval/matching_test.py +17 -8
  28. langfun/core/eval/patching.py +130 -0
  29. langfun/core/eval/patching_test.py +170 -0
  30. langfun/core/eval/scoring.py +26 -26
  31. langfun/core/eval/scoring_test.py +19 -2
  32. langfun/core/eval/v2/__init__.py +42 -0
  33. langfun/core/eval/v2/checkpointing.py +380 -0
  34. langfun/core/eval/v2/checkpointing_test.py +228 -0
  35. langfun/core/eval/v2/eval_test_helper.py +136 -0
  36. langfun/core/eval/v2/evaluation.py +725 -0
  37. langfun/core/eval/v2/evaluation_test.py +180 -0
  38. langfun/core/eval/v2/example.py +305 -0
  39. langfun/core/eval/v2/example_test.py +128 -0
  40. langfun/core/eval/v2/experiment.py +1048 -0
  41. langfun/core/eval/v2/experiment_test.py +433 -0
  42. langfun/core/eval/v2/metric_values.py +156 -0
  43. langfun/core/eval/v2/metric_values_test.py +80 -0
  44. langfun/core/eval/v2/metrics.py +357 -0
  45. langfun/core/eval/v2/metrics_test.py +203 -0
  46. langfun/core/eval/v2/progress.py +348 -0
  47. langfun/core/eval/v2/progress_test.py +82 -0
  48. langfun/core/eval/v2/progress_tracking.py +210 -0
  49. langfun/core/eval/v2/progress_tracking_test.py +66 -0
  50. langfun/core/eval/v2/reporting.py +270 -0
  51. langfun/core/eval/v2/reporting_test.py +158 -0
  52. langfun/core/eval/v2/runners.py +488 -0
  53. langfun/core/eval/v2/runners_test.py +334 -0
  54. langfun/core/langfunc.py +4 -17
  55. langfun/core/langfunc_test.py +22 -6
  56. langfun/core/language_model.py +577 -39
  57. langfun/core/language_model_test.py +470 -56
  58. langfun/core/llms/__init__.py +87 -16
  59. langfun/core/llms/anthropic.py +312 -87
  60. langfun/core/llms/anthropic_test.py +71 -3
  61. langfun/core/llms/cache/base.py +21 -2
  62. langfun/core/llms/cache/in_memory.py +13 -0
  63. langfun/core/llms/cache/in_memory_test.py +53 -2
  64. langfun/core/llms/compositional.py +101 -0
  65. langfun/core/llms/compositional_test.py +73 -0
  66. langfun/core/llms/deepseek.py +117 -0
  67. langfun/core/llms/deepseek_test.py +61 -0
  68. langfun/core/llms/fake.py +11 -7
  69. langfun/core/llms/fake_test.py +14 -0
  70. langfun/core/llms/gemini.py +507 -0
  71. langfun/core/llms/gemini_test.py +195 -0
  72. langfun/core/llms/google_genai.py +62 -218
  73. langfun/core/llms/google_genai_test.py +9 -202
  74. langfun/core/llms/groq.py +160 -144
  75. langfun/core/llms/groq_test.py +31 -137
  76. langfun/core/llms/llama_cpp.py +15 -42
  77. langfun/core/llms/llama_cpp_test.py +4 -30
  78. langfun/core/llms/openai.py +395 -203
  79. langfun/core/llms/openai_compatible.py +179 -0
  80. langfun/core/llms/openai_compatible_test.py +495 -0
  81. langfun/core/llms/openai_test.py +30 -395
  82. langfun/core/llms/rest.py +113 -0
  83. langfun/core/llms/rest_test.py +111 -0
  84. langfun/core/llms/vertexai.py +192 -0
  85. langfun/core/llms/vertexai_test.py +52 -0
  86. langfun/core/logging.py +284 -0
  87. langfun/core/logging_test.py +125 -0
  88. langfun/core/message.py +319 -9
  89. langfun/core/message_test.py +190 -13
  90. langfun/core/modalities/__init__.py +6 -2
  91. langfun/core/modalities/audio.py +30 -0
  92. langfun/core/modalities/audio_test.py +63 -0
  93. langfun/core/modalities/image.py +39 -20
  94. langfun/core/modalities/image_test.py +52 -9
  95. langfun/core/modalities/mime.py +206 -29
  96. langfun/core/modalities/mime_test.py +90 -9
  97. langfun/core/modalities/ms_office.py +117 -0
  98. langfun/core/modalities/ms_office_test.py +389 -0
  99. langfun/core/modalities/pdf.py +22 -0
  100. langfun/core/modalities/pdf_test.py +57 -0
  101. langfun/core/modalities/video.py +9 -26
  102. langfun/core/modalities/video_test.py +3 -3
  103. langfun/core/modality.py +26 -3
  104. langfun/core/modality_test.py +2 -2
  105. langfun/core/sampling.py +11 -11
  106. langfun/core/structured/__init__.py +12 -16
  107. langfun/core/structured/completion.py +32 -5
  108. langfun/core/structured/completion_test.py +7 -6
  109. langfun/core/structured/description.py +2 -2
  110. langfun/core/structured/description_test.py +3 -3
  111. langfun/core/structured/function_generation.py +60 -27
  112. langfun/core/structured/function_generation_test.py +72 -2
  113. langfun/core/structured/mapping.py +97 -47
  114. langfun/core/structured/mapping_test.py +90 -2
  115. langfun/core/structured/parsing.py +33 -21
  116. langfun/core/structured/parsing_test.py +53 -9
  117. langfun/core/structured/querying.py +746 -0
  118. langfun/core/structured/{prompting_test.py → querying_test.py} +469 -51
  119. langfun/core/structured/schema.py +204 -97
  120. langfun/core/structured/schema_generation.py +1 -1
  121. langfun/core/structured/schema_test.py +130 -29
  122. langfun/core/structured/scoring.py +125 -19
  123. langfun/core/structured/scoring_test.py +30 -0
  124. langfun/core/structured/tokenization.py +64 -0
  125. langfun/core/structured/tokenization_test.py +48 -0
  126. langfun/core/template.py +115 -1
  127. langfun/core/template_test.py +71 -1
  128. langfun/core/templates/conversation.py +9 -0
  129. langfun/core/templates/conversation_test.py +4 -3
  130. langfun/core/templates/selfplay_test.py +10 -2
  131. langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
  132. langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
  133. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
  134. langfun/core/coding/python/errors.py +0 -108
  135. langfun/core/coding/python/errors_test.py +0 -99
  136. langfun/core/coding/python/permissions.py +0 -90
  137. langfun/core/coding/python/permissions_test.py +0 -86
  138. langfun/core/structured/prompting.py +0 -238
  139. langfun/core/text_formatting.py +0 -162
  140. langfun/core/text_formatting_test.py +0 -47
  141. langfun-0.0.2.dev20240429.dist-info/METADATA +0 -100
  142. langfun-0.0.2.dev20240429.dist-info/RECORD +0 -108
  143. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
  144. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/runners.py (new file)
@@ -0,0 +1,488 @@
+ # Copyright 2024 The Langfun Authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Evaluation experiment runners."""
+ import abc
+ import collections
+ import concurrent.futures
+ import random
+ import threading
+ import time
+ import traceback
+ from typing import Any, Annotated, Callable, Iterator
+
+ from langfun import core as lf
+ from langfun.core.eval.v2 import checkpointing
+ from langfun.core.eval.v2 import evaluation as evaluation_lib
+ from langfun.core.eval.v2 import example as example_lib
+ from langfun.core.eval.v2 import experiment as experiment_lib
+ from langfun.core.eval.v2 import progress_tracking
+ from langfun.core.eval.v2 import reporting
+ from langfun.core.llms.cache import in_memory
+ import pyglove as pg
+
+ Runner = experiment_lib.Runner
+ Example = example_lib.Example
+ Evaluation = evaluation_lib.Evaluation
+ Experiment = experiment_lib.Experiment
+ Plugin = experiment_lib.Plugin
+
+
+ _RUN_MANIFEST = 'run.json'
+
+
+ class RunnerBase(Runner):
+ """A simple runner that runs evaluations and their examples sequentially."""
46
+
+   tqdm: Annotated[
+       bool,
+       (
+           'If True, force using tqdm for progress update. Otherwise, determine '
+           'it automatically based on the running environment (console vs. '
+           'notebook)'
+       )
+   ] = False
+
+   plugins = [
+       checkpointing.BulkCheckpointer(),
+       reporting.HtmlReporter(),
+   ]
+
+   def _on_bound(self):
+     super()._on_bound()
+
+     # Install the tqdm plugin if needed.
+     with pg.notify_on_change(False):
+       self.plugins.append(progress_tracking.progress_tracker(self.tqdm))
+
+     self._io_pool_lock = threading.Lock()
+     self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
+     # TODO(daiyip): render background errors.
+     self._background_last_error = None
+
+   def background_run(self, func: Callable[..., Any], *args, **kwargs) -> None:
+     """Runs the function with the IO pool."""
+     def _background_run(*args, **kwargs):
+       try:
+         func(*args, **kwargs)
+       except Exception as e:  # pylint: disable=broad-except
+         self._background_last_error = e
+
+     with self._io_pool_lock:
+       if self._io_pool is not None:
+         self._io_pool.submit(_background_run, *args, **kwargs)
+
+   def _all_plugins(self, experiment: Experiment) -> Iterator[Plugin]:
+     """Returns all plugins for the experiment."""
+     for plugin in self.plugins:
+       yield plugin
+     for plugin in experiment.plugins:
+       yield plugin
+
+   #
+   # IO operations for saving running files.
+   #
+
+   def _save_run_manifest(self) -> None:
+     def _save():
+       pg.symbolic.deref(self.current_run.clone(), recursive=True).save(
+           self.current_run.output_path_for(
+               self.current_run.experiment, _RUN_MANIFEST
+           ),
+           hide_default_values=True
+       )
+     self.background_run(_save)
+
+   def on_run_start(self) -> None:
+     """Called when a runner is started."""
+     self._save_run_manifest()
+
+     for plugin in self._all_plugins(self.current_run.experiment):
+       plugin.on_run_start(self, self.current_run.experiment)
+
+   def on_run_complete(self) -> None:
+     """Called when a runner is complete."""
+     for plugin in self._all_plugins(self.current_run.experiment):
+       plugin.on_run_complete(self, self.current_run.experiment)
+
+   def on_run_abort(self, error: Exception) -> None:
+     """Called when a runner is aborted."""
+     for plugin in self._all_plugins(self.current_run.experiment):
+       plugin.on_run_abort(self, self.current_run.experiment, error)
+
+   def on_experiment_start(self, experiment: Experiment) -> None:
+     """Called when an evaluation is started."""
+     # Start the progress of the evaluation.
+     num_examples_to_evaluate = 0
+     if experiment.is_leaf:
+       assert isinstance(experiment, Evaluation)
+       num_examples_to_evaluate = (
+           len(self.current_run.example_ids)
+           if self.current_run.example_ids else experiment.num_examples
+       )
+       experiment.progress.start(total=num_examples_to_evaluate)
+     else:
+       experiment.progress.start(total=len(experiment.leaf_nodes))
+
+     # Notify the plugins of the experiment start.
+     for plugin in self._all_plugins(experiment):
+       plugin.on_experiment_start(self, experiment)
+
+     if experiment.is_leaf:
+       experiment.info(
+           f'Starting evaluation {experiment.id!r} with '
+           f'{num_examples_to_evaluate} examples to evaluate.'
+       )
+
+   def on_experiment_skipped(self, experiment: Experiment) -> None:
+     """Called when an evaluation is skipped."""
+     # Skip event will only be triggered for leaf evaluations.
+     assert experiment.is_leaf
+     experiment.progress.start(total=1)
+     experiment.progress.increment_skipped(1)
+
+     # Notify the plugins of the experiment skip.
+     for plugin in self._all_plugins(experiment):
+       plugin.on_experiment_skipped(self, experiment)
+
+     # Only leaf evaluations will trigger the complete notification of the
+     # ancestors.
+     self._update_ancestor_progresses(experiment)
+
+   def on_experiment_complete(self, experiment: Experiment) -> None:
+     """Called when an evaluation is complete."""
+     progress = experiment.progress
+     progress.stop()
+
+     # Notify the plugins of the experiment complete.
+     for plugin in self._all_plugins(experiment):
+       plugin.on_experiment_complete(self, experiment)
+
+     # Only leaf evaluations will trigger the complete notification of the
+     # ancestors.
+     if experiment.is_leaf:
+       self._update_ancestor_progresses(experiment)
+       self._log_experiment_completion(experiment)
+
+   def _log_experiment_completion(self, experiment: Experiment):
+     example_ids = (
+         self.current_run.example_ids if self.current_run.example_ids else
+         list(range(1, experiment.num_examples + 1))
+     )
+     num_from_checkpoint, num_processed = 0, 0
+     for example_id in example_ids:
+       example = experiment.state.get(example_id)
+       if example.newly_processed:
+         num_processed += 1
+       else:
+         num_from_checkpoint += 1
+     experiment.info(
+         f'{experiment.id} completed with {num_from_checkpoint + num_processed} '
+         f'examples evaluated ({num_from_checkpoint} from checkpoint, '
+         f'{num_processed} newly processed).'
+     )
+
+   def on_experiment_abort(
+       self, experiment: Experiment, error: BaseException) -> None:
+ """Called when an evaluation is complete."""
198
+     assert experiment.is_leaf
+     experiment.fatal(f'{error}\n\n{traceback.format_exc()}')
+
+     # Notify the plugins of the experiment abort.
+     for plugin in self._all_plugins(experiment):
+       plugin.on_experiment_abort(self, experiment, error)
+
+   def _update_ancestor_progresses(self, experiment: Experiment):
+     """Updates the progresses of the parent nodes of the experiment."""
+     parent = experiment.parent
+     progress = experiment.progress
+     while parent is not None:
+       parent_progress = parent.progress
+       if progress.is_failed:
+         parent_progress.increment_failed()
+       elif progress.is_skipped:
+         parent_progress.increment_skipped()
+       else:
+         # An evaluation is considered done once it has processed all the
+         # examples specified by `example_ids`.
+         assert progress.is_completed
+         parent_progress.increment_processed()
+
+       if parent_progress.is_completed:
+         self.on_experiment_complete(parent)
+       elif parent_progress.is_skipped:
+         self.on_experiment_skipped(parent)
+       parent = parent.parent
+
+   def on_example_start(
+       self,
+       experiment: Experiment,
+       example: Example
+   ) -> None:
+     """Called when an evaluation example is started."""
+     for plugin in self._all_plugins(experiment):
+       plugin.on_example_start(self, experiment, example)
+
+   def on_example_complete(
+       self,
+       experiment: Experiment,
+       example: Example
+   ) -> None:
+     """Called when an evaluation example is complete."""
+     if example.newly_processed:
+       if example.error is None:
+         experiment.progress.increment_processed()
+       else:
+         experiment.progress.increment_failed()
+     else:
+       experiment.progress.increment_skipped()
+
+     experiment.usage_summary.merge(example.usage_summary)
+     experiment.progress.update_execution_summary(example.execution_status)
+
+     parent = experiment.parent
+     while parent is not None:
+       parent.usage_summary.merge(example.usage_summary)
+       parent = parent.parent
+
+     for plugin in self._all_plugins(experiment):
+       plugin.on_example_complete(self, experiment, example)
+
+   def run(self) -> None:
+     """Runs the experiment."""
+     # Reset the experiment before starting.
+     for node in self.current_run.experiment.nodes:
+       node.reset()
+
+     # Start the run.
+     self.on_run_start()
+     cache = None
+
+     try:
+       # Start the non-leaf nodes.
+       for node in self.current_run.experiment.nonleaf_nodes:
+         self.on_experiment_start(node)
+
+       # Skip evaluations if needed.
+       if self.current_run.filter is not None:
+         targets = []
+         for evaluation in self.current_run.experiment.leaf_nodes:
+           if self.current_run.filter(evaluation):
+             targets.append(evaluation)
+           else:
+             self.on_experiment_skipped(evaluation)
+       else:
+         targets = self.current_run.experiment.leaf_nodes
+
+       # Prepare the global cache if needed.
+       global_settings = {}
+       if self.current_run.use_cache == 'global':
+         cache = self._load_or_create_cache(self.current_run.experiment)
+         global_settings['cache'] = cache
+
+       # Evaluate the leaf evaluations if not skipped.
+       with lf.use_settings(**global_settings):
+         self._run(targets)
+
+       self.on_run_complete()
+     except Exception as e:  # pylint: disable=broad-except
+       self.on_run_abort(e)
+       raise e
+     finally:
+       if cache is not None:
+         self.background_run(cache.save)
+
+       # Wait for the background tasks to finish.
+       with self._io_pool_lock:
+         self._io_pool, io_pool = None, self._io_pool
+       io_pool.shutdown(wait=True)
+
+   @abc.abstractmethod
+   def _run(self, evaluations: list[Evaluation]) -> None:
+     """Runs multiple evaluations."""
+
+   def run_evaluation(self, evaluation: Evaluation) -> None:
+     """Runs the evaluation."""
+     try:
+       self.on_experiment_start(evaluation)
+
+       per_evaluation_settings = {}
+       cache = None
+       if self.current_run.use_cache == 'per_dataset':
+         cache = self._load_or_create_cache(evaluation)
+         per_evaluation_settings['cache'] = cache
+
+       with lf.use_settings(**per_evaluation_settings):
+         if self.current_run.example_ids is None:
+           items = (
+               Example(id=i + 1, input=ex) for i, ex in enumerate(
+                   evaluation.example_inputs)
+           )
+         else:
+           items = (
+               Example(
+                   id=example_id,
+                   input=evaluation.example_input_by_id(example_id)
+               ) for example_id in self.current_run.example_ids
+           )
+         self._evaluate_items(evaluation, items)
+
+       if cache:
+         self.background_run(cache.save)
+       self.on_experiment_complete(evaluation)
+     except BaseException as e:  # pylint: disable=broad-except
+       self.on_experiment_abort(evaluation, e)
+       raise e
+
+   @abc.abstractmethod
+   def _evaluate_items(
+       self, evaluation: Evaluation, items: Iterator[Example]
+   ) -> None:
+     """Evaluates the items of an evaluation."""
+
+   def evaluate_item(
+       self,
+       evaluation: Evaluation,
+       item: Example
+   ) -> Example:
+     """Runs the evaluation example."""
+     self.on_example_start(evaluation, item)
+     item = evaluation.evaluate(
+         item, raise_if_has_error=self.current_run.raise_if_has_error
+     )
+     self.on_example_complete(evaluation, item)
+     return item
+
+   def _load_or_create_cache(self, experiment: Experiment) -> lf.LMCache | None:
+     """Loads or creates the cache."""
+     return in_memory.InMemory(
+         self.current_run.output_path_for(experiment, 'cache.json')
+     )
+
+
+ class SequentialRunner(RunnerBase):
+ """Sequential runner.
375
+
376
+ Sequential runner runs all evaluations and their examples in sequence,
377
+ as well as the background tasks, it allows the developer to catch all
378
+ exceptions thrown from the background tasks, making it easier to debug.
379
+ """
380
+
+   NAME = 'sequential'
+
+   def background_run(
+       self, func: Callable[..., Any], *args: Any, **kwargs: Any
+   ) -> None:
+ """Runs the function with the IO pool."""
387
+     func(*args, **kwargs)
+
+   def _run(self, evaluations: list[Evaluation]) -> None:
+     """Runs the experiment in sequence."""
+     for e in evaluations:
+       self.run_evaluation(e)
+
+   def _evaluate_items(
+       self, evaluation: Evaluation, items: Iterator[Example]
+   ) -> None:
+     """Runs the evaluation items in sequence."""
+     for item in items:
+       self.evaluate_item(evaluation, item)
+
+
+ class DebugRunner(SequentialRunner):
+   """Debug runner."""
+
+   NAME = 'debug'
+
+   # Do not use the checkpointer for debug runner.
+   plugins = []
+
+   def _on_bound(self):
+     super()._on_bound()
+     if self.current_run.example_ids is None:
+       self.current_run.rebind(example_ids=[1], skip_notification=True)
+     self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
+
+   def _save_run_manifest(self) -> None:
+ """Do nothing to avoid overriden existing runs."""
418
+
+
+ class ParallelRunner(RunnerBase):
+   """Parallel runner."""
+
+   NAME = 'parallel'
+
+   timeout: Annotated[
+       int | None,
+       'Timeout for each evaluation example.'
+   ] = None
+
+   concurrent_startup_delay: Annotated[
+       tuple[int, int] | None,
+       (
+           'A range of seconds to delay the initial evaluation of each thread '
+           'in the thread pool, helping to prevent a burst in LLM QPS at '
+           'startup. If set to None, no delay will be applied.'
+       )
+   ] = None
+
+   def _run(self, evaluations: list[Evaluation]) -> None:
+     """Runs the evaluations in parallel."""
+     def _run_group(evaluation_group: list[Evaluation]):
+       for e in evaluation_group:
+         self.run_evaluation(e)
+
+     # Run evaluations in parallel, grouped by resource key.
+     groups: dict[str, list[Evaluation]] = collections.defaultdict(list)
+     for e in evaluations:
+       resource_ids = e.resource_ids()
+       if not resource_ids:
+         group_id = e.id
+       else:
+         # TODO(daiyip): support group that requires multiple resources.
+         group_id = resource_ids.pop()
+       groups[group_id].append(e)
+
+     for _, _, _ in lf.concurrent_map(
+         _run_group,
+         groups.values(),
+         max_workers=max(64, len(groups)),
+         timeout=self.timeout,
+         silence_on_errors=None,
+     ):
+       pass
+
+   def _evaluate_items(
+       self, evaluation: Evaluation, items: Iterator[Example]
+   ) -> None:
+     """Runs the evaluation items in parallel."""
+     if self.concurrent_startup_delay is not None:
+       thread_delayed = {}
+       def _evaluate_item(item: Example):
+         thread_id = threading.current_thread().ident
+         if thread_id not in thread_delayed:
+           thread_delayed[thread_id] = True
+           time.sleep(random.randint(*self.concurrent_startup_delay))
+         return self.evaluate_item(evaluation, item)
+     else:
+       def _evaluate_item(item: Example):
+         return self.evaluate_item(evaluation, item)
+
+     for _, _, _ in lf.concurrent_map(
+         _evaluate_item,
+         items,
+         max_workers=evaluation.max_workers,
+         timeout=self.timeout,
+         silence_on_errors=None,
+     ):
+       pass
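
Two concurrency idioms carry most of the design in the new runners.py. First, RunnerBase.background_run submits IO work (manifest and cache saves) to a 16-thread pool and records, rather than raises, the last background error; RunnerBase.run then nulls out the pool under a lock before draining it, so a late background_run is dropped instead of being submitted to a closed executor. Second, ParallelRunner._evaluate_items applies a one-time per-thread startup delay so a freshly started thread pool does not burst LLM QPS. A minimal standalone sketch of both idioms follows; the names BackgroundIO and delayed_first_call are illustrative, not part of the package.

import concurrent.futures
import random
import threading
import time


class BackgroundIO:
  """Fire-and-forget IO with error capture, mirroring RunnerBase.background_run."""

  def __init__(self, max_workers: int = 16):
    self._lock = threading.Lock()
    self._pool = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
    self.last_error: Exception | None = None

  def submit(self, func, *args, **kwargs) -> None:
    def _run():
      try:
        func(*args, **kwargs)
      except Exception as e:  # Record instead of raising in a worker thread.
        self.last_error = e
    with self._lock:
      if self._pool is not None:  # Reject submissions once shutdown has begun.
        self._pool.submit(_run)

  def shutdown(self) -> None:
    # Hand off the pool under the lock, then drain it outside the lock:
    # the same sequence used at the end of RunnerBase.run.
    with self._lock:
      self._pool, pool = None, self._pool
    pool.shutdown(wait=True)


def delayed_first_call(fn, delay_range=(0, 5)):
  """Per-thread startup jitter, mirroring ParallelRunner._evaluate_items."""
  thread_delayed = {}

  def wrapper(*args, **kwargs):
    thread_id = threading.current_thread().ident
    if thread_id not in thread_delayed:
      thread_delayed[thread_id] = True
      time.sleep(random.randint(*delay_range))  # Delay only the first call.
    return fn(*args, **kwargs)

  return wrapper

The shutdown hand-off is the subtle part: because the pool reference is swapped to None while the lock is held, any submit racing with shutdown either lands before the hand-off or is silently rejected, never submitted to an already-closing executor.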