langfun 0.1.2.dev202510200805__py3-none-any.whl → 0.1.2.dev202511160804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of langfun might be problematic; consult the package registry's advisory for more details.

Files changed (146)
  1. langfun/core/__init__.py +1 -0
  2. langfun/core/agentic/action.py +107 -12
  3. langfun/core/agentic/action_eval.py +9 -2
  4. langfun/core/agentic/action_test.py +25 -0
  5. langfun/core/async_support.py +32 -3
  6. langfun/core/coding/python/correction.py +19 -9
  7. langfun/core/coding/python/execution.py +14 -12
  8. langfun/core/coding/python/generation.py +21 -16
  9. langfun/core/coding/python/sandboxing.py +23 -3
  10. langfun/core/component.py +42 -3
  11. langfun/core/concurrent.py +70 -6
  12. langfun/core/concurrent_test.py +1 -0
  13. langfun/core/console.py +1 -1
  14. langfun/core/data/conversion/anthropic.py +12 -3
  15. langfun/core/data/conversion/anthropic_test.py +8 -6
  16. langfun/core/data/conversion/gemini.py +9 -2
  17. langfun/core/data/conversion/gemini_test.py +12 -9
  18. langfun/core/data/conversion/openai.py +145 -31
  19. langfun/core/data/conversion/openai_test.py +161 -17
  20. langfun/core/eval/base.py +48 -44
  21. langfun/core/eval/base_test.py +4 -4
  22. langfun/core/eval/matching.py +5 -2
  23. langfun/core/eval/patching.py +3 -3
  24. langfun/core/eval/scoring.py +4 -3
  25. langfun/core/eval/v2/__init__.py +1 -0
  26. langfun/core/eval/v2/checkpointing.py +39 -5
  27. langfun/core/eval/v2/checkpointing_test.py +1 -1
  28. langfun/core/eval/v2/eval_test_helper.py +97 -1
  29. langfun/core/eval/v2/evaluation.py +88 -16
  30. langfun/core/eval/v2/evaluation_test.py +9 -3
  31. langfun/core/eval/v2/example.py +45 -39
  32. langfun/core/eval/v2/example_test.py +3 -3
  33. langfun/core/eval/v2/experiment.py +51 -8
  34. langfun/core/eval/v2/metric_values.py +31 -3
  35. langfun/core/eval/v2/metric_values_test.py +32 -0
  36. langfun/core/eval/v2/metrics.py +157 -44
  37. langfun/core/eval/v2/metrics_test.py +39 -18
  38. langfun/core/eval/v2/progress.py +30 -1
  39. langfun/core/eval/v2/progress_test.py +27 -0
  40. langfun/core/eval/v2/progress_tracking_test.py +3 -0
  41. langfun/core/eval/v2/reporting.py +90 -71
  42. langfun/core/eval/v2/reporting_test.py +20 -6
  43. langfun/core/eval/v2/runners/__init__.py +26 -0
  44. langfun/core/eval/v2/{runners.py → runners/base.py} +22 -124
  45. langfun/core/eval/v2/runners/debug.py +40 -0
  46. langfun/core/eval/v2/runners/debug_test.py +79 -0
  47. langfun/core/eval/v2/runners/parallel.py +100 -0
  48. langfun/core/eval/v2/runners/parallel_test.py +98 -0
  49. langfun/core/eval/v2/runners/sequential.py +47 -0
  50. langfun/core/eval/v2/runners/sequential_test.py +175 -0
  51. langfun/core/langfunc.py +45 -130
  52. langfun/core/langfunc_test.py +6 -4
  53. langfun/core/language_model.py +103 -16
  54. langfun/core/language_model_test.py +9 -3
  55. langfun/core/llms/__init__.py +7 -1
  56. langfun/core/llms/anthropic.py +157 -2
  57. langfun/core/llms/azure_openai.py +29 -17
  58. langfun/core/llms/cache/base.py +25 -3
  59. langfun/core/llms/cache/in_memory.py +48 -7
  60. langfun/core/llms/cache/in_memory_test.py +14 -4
  61. langfun/core/llms/compositional.py +25 -1
  62. langfun/core/llms/deepseek.py +30 -2
  63. langfun/core/llms/fake.py +32 -1
  64. langfun/core/llms/gemini.py +14 -9
  65. langfun/core/llms/google_genai.py +29 -1
  66. langfun/core/llms/groq.py +28 -3
  67. langfun/core/llms/llama_cpp.py +23 -4
  68. langfun/core/llms/openai.py +36 -3
  69. langfun/core/llms/openai_compatible.py +148 -27
  70. langfun/core/llms/openai_compatible_test.py +207 -20
  71. langfun/core/llms/openai_test.py +0 -2
  72. langfun/core/llms/rest.py +12 -1
  73. langfun/core/llms/vertexai.py +51 -8
  74. langfun/core/logging.py +1 -1
  75. langfun/core/mcp/client.py +77 -22
  76. langfun/core/mcp/client_test.py +8 -35
  77. langfun/core/mcp/session.py +94 -29
  78. langfun/core/mcp/session_test.py +54 -0
  79. langfun/core/mcp/tool.py +151 -22
  80. langfun/core/mcp/tool_test.py +197 -0
  81. langfun/core/memory.py +1 -0
  82. langfun/core/message.py +160 -55
  83. langfun/core/message_test.py +65 -81
  84. langfun/core/modalities/__init__.py +8 -0
  85. langfun/core/modalities/audio.py +21 -1
  86. langfun/core/modalities/image.py +19 -1
  87. langfun/core/modalities/mime.py +62 -3
  88. langfun/core/modalities/pdf.py +19 -1
  89. langfun/core/modalities/video.py +21 -1
  90. langfun/core/modality.py +167 -29
  91. langfun/core/modality_test.py +42 -12
  92. langfun/core/natural_language.py +1 -1
  93. langfun/core/sampling.py +4 -4
  94. langfun/core/sampling_test.py +20 -4
  95. langfun/core/structured/__init__.py +2 -24
  96. langfun/core/structured/completion.py +34 -44
  97. langfun/core/structured/completion_test.py +23 -43
  98. langfun/core/structured/description.py +54 -50
  99. langfun/core/structured/function_generation.py +29 -12
  100. langfun/core/structured/mapping.py +81 -37
  101. langfun/core/structured/parsing.py +95 -79
  102. langfun/core/structured/parsing_test.py +0 -3
  103. langfun/core/structured/querying.py +215 -142
  104. langfun/core/structured/querying_test.py +65 -29
  105. langfun/core/structured/schema/__init__.py +48 -0
  106. langfun/core/structured/schema/base.py +664 -0
  107. langfun/core/structured/schema/base_test.py +531 -0
  108. langfun/core/structured/schema/json.py +174 -0
  109. langfun/core/structured/schema/json_test.py +121 -0
  110. langfun/core/structured/schema/python.py +316 -0
  111. langfun/core/structured/schema/python_test.py +410 -0
  112. langfun/core/structured/schema_generation.py +33 -14
  113. langfun/core/structured/scoring.py +47 -36
  114. langfun/core/structured/tokenization.py +26 -11
  115. langfun/core/subscription.py +2 -2
  116. langfun/core/template.py +175 -50
  117. langfun/core/template_test.py +123 -17
  118. langfun/env/__init__.py +8 -2
  119. langfun/env/base_environment.py +320 -128
  120. langfun/env/base_environment_test.py +473 -0
  121. langfun/env/base_feature.py +92 -15
  122. langfun/env/base_feature_test.py +228 -0
  123. langfun/env/base_sandbox.py +84 -361
  124. langfun/env/base_sandbox_test.py +1235 -0
  125. langfun/env/event_handlers/__init__.py +1 -1
  126. langfun/env/event_handlers/chain.py +233 -0
  127. langfun/env/event_handlers/chain_test.py +253 -0
  128. langfun/env/event_handlers/event_logger.py +95 -98
  129. langfun/env/event_handlers/event_logger_test.py +21 -21
  130. langfun/env/event_handlers/metric_writer.py +225 -140
  131. langfun/env/event_handlers/metric_writer_test.py +23 -6
  132. langfun/env/interface.py +854 -40
  133. langfun/env/interface_test.py +112 -2
  134. langfun/env/load_balancers_test.py +23 -2
  135. langfun/env/test_utils.py +126 -84
  136. {langfun-0.1.2.dev202510200805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/METADATA +1 -1
  137. langfun-0.1.2.dev202511160804.dist-info/RECORD +211 -0
  138. langfun/core/eval/v2/runners_test.py +0 -343
  139. langfun/core/structured/schema.py +0 -987
  140. langfun/core/structured/schema_test.py +0 -982
  141. langfun/env/base_test.py +0 -1481
  142. langfun/env/event_handlers/base.py +0 -350
  143. langfun-0.1.2.dev202510200805.dist-info/RECORD +0 -195
  144. {langfun-0.1.2.dev202510200805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/WHEEL +0 -0
  145. {langfun-0.1.2.dev202510200805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/licenses/LICENSE +0 -0
  146. {langfun-0.1.2.dev202510200805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/top_level.txt +0 -0
@@ -77,6 +77,33 @@ class ProgressTest(unittest.TestCase):
77
77
  self.assertTrue(p.is_stopped)
78
78
  self.assertIsNotNone(p.stop_time_str)
79
79
 
80
+ def test_merge_from(self):
81
+ p1 = Progress()
82
+ p1.start(10)
83
+ p1.increment_processed()
84
+ p1.increment_failed()
85
+ p1.stop()
86
+
87
+ p2 = Progress()
88
+ p2.start(10)
89
+ p2.increment_skipped()
90
+ p2.stop()
91
+
92
+ with pg.allow_writable_accessors(True):
93
+ p1.start_time = 2.0
94
+ p1.stop_time = 4.0
95
+ p2.start_time = 1.0
96
+ p2.stop_time = 5.0
97
+
98
+ p1.merge_from(p2)
99
+ self.assertEqual(p1.num_total, 10)
100
+ self.assertEqual(p1.num_processed, 1)
101
+ self.assertEqual(p1.num_failed, 1)
102
+ self.assertEqual(p1.num_skipped, 1)
103
+ self.assertEqual(p1.num_completed, 3)
104
+ self.assertEqual(p1.start_time, 1.0)
105
+ self.assertEqual(p1.stop_time, 5.0)
106
+
80
107
 
81
108
  if __name__ == '__main__':
82
109
  unittest.main()
@@ -18,6 +18,7 @@ import sys
18
18
  import tempfile
19
19
  import unittest
20
20
 
21
+ from langfun.core import concurrent as lf_concurrent
21
22
  from langfun.core import console as lf_console
22
23
  from langfun.core.eval.v2 import eval_test_helper
23
24
  from langfun.core.eval.v2 import progress_tracking # pylint: disable=unused-import
@@ -51,6 +52,7 @@ class TqdmProgressTrackerTest(unittest.TestCase):
51
52
  with contextlib.redirect_stderr(string_io):
52
53
  _ = experiment.run(root_dir, 'new', plugins=[])
53
54
  sys.stderr.flush()
55
+ lf_concurrent.ProgressBar.refresh()
54
56
  self.assertIn('All: 100%', string_io.getvalue())
55
57
 
56
58
  def test_with_example_ids(self):
@@ -62,6 +64,7 @@ class TqdmProgressTrackerTest(unittest.TestCase):
62
64
  with contextlib.redirect_stderr(string_io):
63
65
  _ = experiment.run(root_dir, 'new', example_ids=[1], plugins=[])
64
66
  sys.stderr.flush()
67
+ lf_concurrent.ProgressBar.refresh()
65
68
  self.assertIn('All: 100%', string_io.getvalue())
66
69
 
67
70
 
@@ -32,8 +32,97 @@ _SUMMARY_FILE = 'summary.html'
32
32
  _EVALULATION_DETAIL_FILE = 'index.html'
33
33
 
34
34
 
35
+ class ExampleHtmlGenerator(experiment_lib.Plugin):
36
+ """Plugin for generating HTML views for each evaluation example."""
37
+
38
+ def on_example_complete(
39
+ self, runner: Runner, experiment: Experiment, example: Example
40
+ ):
41
+ self._save_example_html(runner, experiment, example)
42
+
43
+ def _save_example_html(
44
+ self, runner: Runner, experiment: Experiment, example: Example
45
+ ) -> None:
46
+ """Saves the example in HTML format."""
47
+ current_run = runner.current_run
48
+ def _generate():
49
+ try:
50
+ with pg.timeit() as t:
51
+ html = example.to_html(
52
+ collapse_level=None,
53
+ enable_summary_tooltip=False,
54
+ extra_flags=dict(
55
+ # For properly rendering the next link.
56
+ num_examples=getattr(experiment, 'num_examples', None)
57
+ ),
58
+ )
59
+ html.save(
60
+ runner.current_run.output_path_for(
61
+ experiment, f'{example.id}.html'
62
+ )
63
+ )
64
+ experiment.info(
65
+ f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
66
+ )
67
+ except BaseException as e: # pylint: disable=broad-except
68
+ experiment.error(
69
+ f'Failed to generate \'{example.id}.html\'. '
70
+ f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
71
+ )
72
+ raise e
73
+
74
+ def _copy():
75
+ src_file = current_run.input_path_for(experiment, f'{example.id}.html')
76
+ dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
77
+
78
+ if src_file == dest_file:
79
+ return
80
+
81
+ if not pg.io.path_exists(src_file):
82
+ experiment.warning(
83
+ f'Skip copying \'{example.id}.html\' as '
84
+ f'{src_file!r} does not exist.'
85
+ )
86
+ return
87
+
88
+ try:
89
+ with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
90
+ content = src.read()
91
+ with pg.io.open(dest_file, 'w') as dest:
92
+ dest.write(content)
93
+ experiment.info(
94
+ f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
95
+ )
96
+ except BaseException as e: # pylint: disable=broad-except
97
+ experiment.error(
98
+ f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
99
+ )
100
+ raise e
101
+
102
+ generate_example_html = current_run.generate_example_html
103
+ if (generate_example_html == 'all'
104
+ or (generate_example_html == 'new' and example.newly_processed)
105
+ or (isinstance(generate_example_html, list)
106
+ and example.id in generate_example_html)):
107
+ op = _generate
108
+ else:
109
+ op = _copy
110
+ runner.background_run(op)
111
+
112
+
35
113
  class HtmlReporter(experiment_lib.Plugin):
36
- """Plugin for periodically generating HTML reports for the experiment."""
114
+ """Plugin for periodically generating HTML reports for the experiment.
115
+
116
+ The `HtmlReporter` plugin generates several HTML files during an experiment
117
+ run:
118
+ - A `summary.html` at the root of the run directory, summarizing all
119
+ evaluations in the experiment.
120
+ - An `index.html` for each leaf evaluation, detailing the evaluation
121
+ definition, metrics, and logs.
122
+
123
+ These reports are updated periodically in the background during the run,
124
+ allowing users to monitor progress in near real-time.
125
+ """
37
126
 
38
127
  summary_interval: Annotated[
39
128
  int,
@@ -127,7 +216,6 @@ class HtmlReporter(experiment_lib.Plugin):
127
216
  def on_example_complete(
128
217
  self, runner: Runner, experiment: Experiment, example: Example
129
218
  ):
130
- self._save_example_html(runner, experiment, example)
131
219
  self._maybe_update_experiment_html(runner, experiment)
132
220
  self._maybe_update_summary(runner)
133
221
 
@@ -197,72 +285,3 @@ class HtmlReporter(experiment_lib.Plugin):
197
285
  runner.background_run(_save)
198
286
  else:
199
287
  _save()
200
-
201
- def _save_example_html(
202
- self, runner: Runner, experiment: Experiment, example: Example
203
- ) -> None:
204
- """Saves the example in HTML format."""
205
- current_run = runner.current_run
206
- def _generate():
207
- try:
208
- with pg.timeit() as t:
209
- html = example.to_html(
210
- collapse_level=None,
211
- enable_summary_tooltip=False,
212
- extra_flags=dict(
213
- # For properly rendering the next link.
214
- num_examples=getattr(experiment, 'num_examples', None)
215
- ),
216
- )
217
- html.save(
218
- runner.current_run.output_path_for(
219
- experiment, f'{example.id}.html'
220
- )
221
- )
222
- experiment.info(
223
- f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
224
- )
225
- except BaseException as e: # pylint: disable=broad-except
226
- experiment.error(
227
- f'Failed to generate \'{example.id}.html\'. '
228
- f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
229
- )
230
- raise e
231
-
232
- def _copy():
233
- src_file = current_run.input_path_for(experiment, f'{example.id}.html')
234
- dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
235
-
236
- if src_file == dest_file:
237
- return
238
-
239
- if not pg.io.path_exists(src_file):
240
- experiment.warning(
241
- f'Skip copying \'{example.id}.html\' as '
242
- f'{src_file!r} does not exist.'
243
- )
244
- return
245
-
246
- try:
247
- with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
248
- content = src.read()
249
- with pg.io.open(dest_file, 'w') as dest:
250
- dest.write(content)
251
- experiment.info(
252
- f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
253
- )
254
- except BaseException as e: # pylint: disable=broad-except
255
- experiment.error(
256
- f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
257
- )
258
- raise e
259
-
260
- generate_example_html = current_run.generate_example_html
261
- if (generate_example_html == 'all'
262
- or (generate_example_html == 'new' and example.newly_processed)
263
- or (isinstance(generate_example_html, list)
264
- and example.id in generate_example_html)):
265
- op = _generate
266
- else:
267
- op = _copy
268
- runner.background_run(op)
@@ -29,7 +29,12 @@ class ReportingTest(unittest.TestCase):
29
29
  experiment = eval_test_helper.test_experiment()
30
30
  checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
31
31
  reporter = reporting.HtmlReporter()
32
- run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
32
+ example_html_generator = reporting.ExampleHtmlGenerator()
33
+ run = experiment.run(
34
+ root_dir,
35
+ 'new',
36
+ plugins=[checkpointer, reporter, example_html_generator]
37
+ )
33
38
  self.assertTrue(
34
39
  pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
35
40
  )
@@ -52,8 +57,10 @@ class ReportingTest(unittest.TestCase):
52
57
  root_dir = os.path.join(tempfile.mkdtemp(), 'test_reporting2')
53
58
  experiment = eval_test_helper.test_experiment()
54
59
  run = experiment.run(
55
- root_dir, 'new', plugins=[checkpointer, reporter],
56
- warm_start_from=run.output_root
60
+ root_dir,
61
+ 'new',
62
+ plugins=[checkpointer, reporter, example_html_generator],
63
+ warm_start_from=run.output_root,
57
64
  )
58
65
  self.assertTrue(
59
66
  pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
@@ -105,7 +112,12 @@ class ReportingTest(unittest.TestCase):
105
112
  .test_experiment_with_example_html_generation_error())
106
113
  checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
107
114
  reporter = reporting.HtmlReporter()
108
- run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
115
+ example_html_generator = reporting.ExampleHtmlGenerator()
116
+ run = experiment.run(
117
+ root_dir,
118
+ 'new',
119
+ plugins=[checkpointer, reporter, example_html_generator]
120
+ )
109
121
  self.assertTrue(
110
122
  pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
111
123
  )
@@ -132,8 +144,10 @@ class ReportingTest(unittest.TestCase):
132
144
  experiment = (eval_test_helper
133
145
  .test_experiment_with_example_html_generation_error())
134
146
  run = experiment.run(
135
- root_dir, 'new', plugins=[checkpointer, reporter],
136
- warm_start_from=run.output_root
147
+ root_dir,
148
+ 'new',
149
+ plugins=[checkpointer, reporter, example_html_generator],
150
+ warm_start_from=run.output_root,
137
151
  )
138
152
  self.assertTrue(
139
153
  pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
@@ -0,0 +1,26 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Langfun evaluation runners."""
15
+
16
+ from langfun.core.eval.v2.runners.base import RunnerBase
17
+ from langfun.core.eval.v2.runners.debug import DebugRunner
18
+ from langfun.core.eval.v2.runners.parallel import ParallelRunner
19
+ from langfun.core.eval.v2.runners.sequential import SequentialRunner
20
+
21
+ __all__ = [
22
+ 'RunnerBase',
23
+ 'DebugRunner',
24
+ 'ParallelRunner',
25
+ 'SequentialRunner',
26
+ ]
@@ -11,13 +11,12 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- """Evaluation experiment runners."""
14
+ """Base experiment runner."""
15
+
15
16
  import abc
16
- import collections
17
17
  import concurrent.futures
18
18
  import random
19
19
  import threading
20
- import time
21
20
  import traceback
22
21
  from typing import Any, Annotated, Callable, Iterator
23
22
 
@@ -42,7 +41,14 @@ _RUN_MANIFEST = 'run.json'
42
41
 
43
42
 
44
43
  class RunnerBase(Runner):
45
- """A simple runner that runs evaluations and their examples sequentially."""
44
+ """Base class for runners with plugin support and IO pooling.
45
+
46
+ `RunnerBase` provides the basic runner functionalities such as plugin
47
+ integration for checkpointing, reporting and progress tracking.
48
+ It also manages a thread pool for background IO operations.
49
+ Subclasses should implement `_run` and `_evaluate_items` for different
50
+ execution strategies.
51
+ """
46
52
 
47
53
  tqdm: Annotated[
48
54
  bool,
@@ -58,6 +64,11 @@ class RunnerBase(Runner):
58
64
  reporting.HtmlReporter(),
59
65
  ]
60
66
 
67
+ max_background_threads: Annotated[
68
+ int,
69
+ 'Max number of background threads for IO operations.'
70
+ ] = 128
71
+
61
72
  def _on_bound(self):
62
73
  super()._on_bound()
63
74
 
@@ -66,7 +77,9 @@ class RunnerBase(Runner):
66
77
  self.plugins.append(progress_tracking.progress_tracker(self.tqdm))
67
78
 
68
79
  self._io_pool_lock = threading.Lock()
69
- self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
80
+ self._io_pool = concurrent.futures.ThreadPoolExecutor(
81
+ max_workers=self.max_background_threads
82
+ )
70
83
  # TODO(daiyip): render background errors.
71
84
  self._background_last_error = None
72
85
 
@@ -220,7 +233,7 @@ class RunnerBase(Runner):
220
233
  else:
221
234
  # A evaluation could be considered as done if it has processed all the
222
235
  # examples specified by `example_ids`.
223
- assert progress.is_completed
236
+ assert progress.is_completed, progress
224
237
  parent_progress.increment_processed()
225
238
 
226
239
  if parent_progress.is_completed:
@@ -335,6 +348,7 @@ class RunnerBase(Runner):
335
348
  def run_evaluation(self, evaluation: Evaluation) -> None:
336
349
  """Runs the evaluation."""
337
350
  try:
351
+ evaluation.setup()
338
352
  self.on_experiment_start(evaluation)
339
353
 
340
354
  per_evaluation_settings = {}
@@ -367,6 +381,8 @@ class RunnerBase(Runner):
367
381
  except BaseException as e: # pylint: disable=broad-except
368
382
  self.on_experiment_abort(evaluation, e)
369
383
  raise e
384
+ finally:
385
+ evaluation.teardown()
370
386
 
371
387
  @abc.abstractmethod
372
388
  def _evaluate_items(
@@ -394,121 +410,3 @@ class RunnerBase(Runner):
394
410
  return in_memory.InMemory(
395
411
  self.current_run.output_path_for(experiment, 'cache.json')
396
412
  )
397
-
398
-
399
- class SequentialRunner(RunnerBase):
400
- """Sequential runner.
401
-
402
- Sequential runner runs all evaluations and their examples in sequence,
403
- as well as the background tasks, it allows the developer to catch all
404
- exceptions thrown from the background tasks, making it easier to debug.
405
- """
406
-
407
- NAME = 'sequential'
408
-
409
- def background_run(
410
- self, func: Callable[..., Any], *args: Any, **kwargs: Any
411
- ) -> None:
412
- """Runs the function with the IO pool."""
413
- func(*args, **kwargs)
414
-
415
- def _run(self, evaluations: list[Evaluation]) -> None:
416
- """Runs the experiment in sequence."""
417
- for e in evaluations:
418
- self.run_evaluation(e)
419
-
420
- def _evaluate_items(
421
- self, evaluation: Evaluation, items: Iterator[Example]
422
- ) -> None:
423
- """Runs the evaluation items in sequence."""
424
- for item in items:
425
- self.evaluate_item(evaluation, item)
426
-
427
-
428
- class DebugRunner(SequentialRunner):
429
- """Debug runner."""
430
-
431
- NAME = 'debug'
432
-
433
- # Do not use the checkpointer for debug runner.
434
- plugins = []
435
-
436
- def _on_bound(self):
437
- super()._on_bound()
438
- if self.current_run.example_ids is None:
439
- self.current_run.rebind(example_ids=[1], skip_notification=True)
440
- self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
441
-
442
- def _save_run_manifest(self) -> None:
443
- """Do nothing to avoid overriden existing runs."""
444
-
445
-
446
- class ParallelRunner(RunnerBase):
447
- """Parallel runner."""
448
-
449
- NAME = 'parallel'
450
-
451
- timeout: Annotated[
452
- int | None,
453
- 'Timeout for each evaluation example.'
454
- ] = None
455
-
456
- concurrent_startup_delay: Annotated[
457
- tuple[int, int] | None,
458
- (
459
- 'A range of seconds to delay the initial evaluation of each thread '
460
- 'in the thread pool, helping to prevent a burst in LLM QPS at '
461
- 'startup. If set to None, no delay will be applied.'
462
- )
463
- ] = None
464
-
465
- def _run(self, evaluations: list[Evaluation]) -> None:
466
- """Runs the evaluations in parallel."""
467
- def _run_group(evaluation_group: list[Evaluation]):
468
- for e in evaluation_group:
469
- self.run_evaluation(e)
470
-
471
- # Run evaluations in parallel groupped by resource key.
472
- groups: dict[str, list[Evaluation]] = collections.defaultdict(list)
473
- for e in evaluations:
474
- resource_ids = e.resource_ids()
475
- if not resource_ids:
476
- group_id = e.id
477
- else:
478
- # TODO(daiyip): support group that requires multiple resources.
479
- group_id = resource_ids.pop()
480
- groups[group_id].append(e)
481
-
482
- for _, _, _ in lf.concurrent_map(
483
- _run_group,
484
- groups.values(),
485
- max_workers=max(64, len(groups)),
486
- timeout=self.timeout,
487
- silence_on_errors=None,
488
- ):
489
- pass
490
-
491
- def _evaluate_items(
492
- self, evaluation: Evaluation, items: Iterator[Example]
493
- ) -> None:
494
- """Override run items to run in parallel."""
495
- if self.concurrent_startup_delay is not None:
496
- thread_delayed = {}
497
- def _evaluate_item(item: Example):
498
- thread_id = threading.current_thread().ident
499
- if thread_id not in thread_delayed:
500
- thread_delayed[thread_id] = True
501
- time.sleep(random.randint(*self.concurrent_startup_delay))
502
- return self.evaluate_item(evaluation, item)
503
- else:
504
- def _evaluate_item(item: Example):
505
- return self.evaluate_item(evaluation, item)
506
-
507
- for _, _, _ in lf.concurrent_map(
508
- _evaluate_item,
509
- items,
510
- max_workers=evaluation.max_workers,
511
- timeout=self.timeout,
512
- silence_on_errors=None,
513
- ):
514
- pass
@@ -0,0 +1,40 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Debug runner."""
15
+
16
+ from langfun.core.eval.v2.runners import sequential
17
+
18
+
19
+ class DebugRunner(sequential.SequentialRunner):
20
+ """A runner for debugging evaluations.
21
+
22
+ The debug runner is a sequential runner that only runs the first example
23
+ of each evaluation, with `raise_if_has_error` enabled. This is useful for
24
+ quickly identifying issues in evaluation logic during development.
25
+ Checkpointers are disabled for this runner.
26
+ """
27
+
28
+ NAME = 'debug'
29
+
30
+ # Do not use the checkpointer for debug runner.
31
+ plugins = []
32
+
33
+ def _on_bound(self):
34
+ super()._on_bound()
35
+ if self.current_run.example_ids is None:
36
+ self.current_run.rebind(example_ids=[1], skip_notification=True)
37
+ self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
38
+
39
+ def _save_run_manifest(self) -> None:
40
+ """Do nothing to avoid overriden existing runs."""
@@ -0,0 +1,79 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Tests for debug runner."""
15
+ import os
16
+ import tempfile
17
+ from typing import Any
18
+ import unittest
19
+
20
+ from langfun.core.eval.v2 import eval_test_helper
21
+ from langfun.core.eval.v2.runners import debug # pylint: disable=unused-import
22
+
23
+ import pyglove as pg
24
+
25
+
26
+ class RunnerTest(unittest.TestCase):
27
+
28
+ def assert_same_list(self, actual: list[Any], expected: list[Any]):
29
+ self.assertEqual(len(actual), len(expected))
30
+ for i, (x, y) in enumerate(zip(actual, expected)):
31
+ if x is not y:
32
+ print(i, pg.diff(x, y))
33
+ self.assertIs(x, y)
34
+
35
+
36
+ class DebugRunnerTest(RunnerTest):
37
+
38
+ def test_debug_runner(self):
39
+ plugin = eval_test_helper.TestPlugin()
40
+ exp = eval_test_helper.test_experiment()
41
+ root_dir = os.path.join(tempfile.mkdtemp(), 'test_debug_runner')
42
+ run = exp.run(root_dir, runner='debug', plugins=[plugin])
43
+
44
+ self.assertIsNotNone(plugin.start_time)
45
+ self.assertIsNotNone(plugin.complete_time)
46
+ self.assertGreater(plugin.complete_time, plugin.start_time)
47
+
48
+ self.assertEqual(
49
+ len(plugin.started_experiments), len(exp.nodes)
50
+ )
51
+ self.assertEqual(
52
+ len(plugin.completed_experiments), len(exp.nodes)
53
+ )
54
+ self.assertEqual(
55
+ len(plugin.started_example_ids), 6 * 1
56
+ )
57
+ self.assertEqual(
58
+ len(plugin.completed_example_ids), 6 * 1
59
+ )
60
+ self.assert_same_list(plugin.skipped_experiments, [])
61
+ self.assertFalse(
62
+ pg.io.path_exists(os.path.join(run.output_root, 'run.json'))
63
+ )
64
+
65
+ for node in exp.nodes:
66
+ self.assertTrue(node.progress.is_started)
67
+ self.assertTrue(node.progress.is_completed)
68
+ if node.is_leaf:
69
+ self.assertEqual(node.progress.num_skipped, 0)
70
+ self.assertEqual(node.progress.num_completed, 1)
71
+ self.assertEqual(node.progress.num_failed, 0)
72
+ else:
73
+ self.assertEqual(node.progress.num_skipped, 0)
74
+ self.assertEqual(node.progress.num_failed, 0)
75
+ self.assertEqual(node.progress.num_processed, node.progress.num_total)
76
+
77
+
78
+ if __name__ == '__main__':
79
+ unittest.main()