langfun 0.1.2.dev202412190804__py3-none-any.whl → 0.1.2.dev202412210804__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
--- a/langfun/core/eval/v2/checkpointing.py
+++ b/langfun/core/eval/v2/checkpointing.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Checkpointing evaluation runs."""
 import threading
+import traceback

 import langfun.core as lf
 from langfun.core.eval.v2 import example as example_lib
@@ -27,6 +28,21 @@ Runner = experiment_lib.Runner
 class Checkpointer(experiment_lib.Plugin):
   """Base class for checkpointing evaluation examples."""

+  def on_experiment_start(self, experiment: Experiment):
+    if experiment.state.evaluated_examples:
+      experiment.info(
+          'Loaded %d examples from checkpoint files. Example IDs: %s' %
+          (
+              len(experiment.state.evaluated_examples),
+              list(sorted(experiment.state.evaluated_examples.keys()))
+          ),
+      )
+    else:
+      experiment.info(
+          'No previous evaluated examples are loaded. '
+          f'Experiment {experiment.id} starts from scratch.'
+      )
+

 class PerExampleCheckpointer(Checkpointer):
   """Checkpointer that saves each example to a separate file."""
@@ -50,6 +66,10 @@ class PerExampleCheckpointer(Checkpointer):

     # For refresh runs, we don't want to load the previous state.
     if not runner.current_run.refresh:
+      if runner.current_run.input_root != runner.current_run.output_root:
+        experiment.info(
+            f'Warm starting from directory: {runner.current_run.input_root}.'
+        )
       def _load_state(ckpt_file):
         experiment.load_state(ckpt_file)

@@ -68,10 +88,11 @@ class PerExampleCheckpointer(Checkpointer):
           _load_state, ckpt_files, max_workers=64,
       ):
         if error is not None:
-          pg.logging.warning(
-              'Failed to load checkpoint file %s: %s. Skipping the file.',
-              ckpt_file, error
+          experiment.warning(
+              f'Failed to load checkpoint file {ckpt_file}: {error}. '
+              'Skipping the file.'
           )
+    super().on_experiment_start(experiment)

   def on_example_complete(
       self,
@@ -80,7 +101,11 @@ class PerExampleCheckpointer(Checkpointer):
       example: Example,
   ) -> None:
     """Saves the example to the checkpoint file."""
-    if not example.has_error:
+    if example.has_error:
+      experiment.warning(
+          f'Example {example.id} has error. Skipping checkpointing.'
+      )
+    else:
       def save_state(example: Example):
         writer = SequenceWriter(
             runner.current_run.output_path_for(
@@ -91,8 +116,18 @@ class PerExampleCheckpointer(Checkpointer):
                 )
             )
         )
-        writer.add(example)
-        writer.close()
+        try:
+          writer.add(example)
+          writer.close()
+          experiment.info(
+              f'Example {example.id} is saved to {writer.path}.',
+          )
+        except BaseException as e:  # pylint: disable=broad-except
+          experiment.error(
+              f'Failed to save example {example.id} to {writer.path}. '
+              f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+          )
+          raise e
       runner.background_run(save_state, example)

   def _file_prefix_and_ext(self, filename: str) -> tuple[str, str]:
@@ -150,6 +185,10 @@ class BulkCheckpointer(Checkpointer):
       return
     # For refresh runs, we don't want to load the previous state.
     if not runner.current_run.refresh:
+      if runner.current_run.input_root != runner.current_run.output_root:
+        experiment.info(
+            f'Warm starting from directory: {runner.current_run.input_root}.'
+        )
       experiment.load_state(
           runner.current_run.input_path_for(
               experiment, self.checkpoint_filename
@@ -164,6 +203,7 @@
     with self._lock:
       if self._sequence_writer is not None:
         self._sequence_writer[experiment.id] = sequence_writer
+    super().on_experiment_start(experiment)

   def on_experiment_complete(
       self,
@@ -178,8 +218,12 @@ class BulkCheckpointer(Checkpointer):
     if self._sequence_writer is not None:
       # Make sure the writer is closed without delay so the file will be
       # available immediately.
-      self._sequence_writer[experiment.id].close()
-      del self._sequence_writer[experiment.id]
+      writer = self._sequence_writer.pop(experiment.id)
+      writer.close()
+      experiment.info(
+          f'{len(experiment.state.evaluated_examples)} examples are '
+          f'checkpointed to {writer.path}.'
+      )

   def on_example_complete(
       self,
@@ -189,8 +233,22 @@ class BulkCheckpointer(Checkpointer):
   ) -> None:
     """Saves the example to the checkpoint file."""
     assert experiment.id in self._sequence_writer
-    if not example.has_error:
-      runner.background_run(self._sequence_writer[experiment.id].add, example)
+    if example.has_error:
+      experiment.warning(
+          f'Example {example.id} has error. Skipping checkpointing.'
+      )
+    else:
+      def _save_example(example: Example):
+        writer = self._sequence_writer[experiment.id]
+        try:
+          writer.add(example)
+        except BaseException as e:  # pylint: disable=broad-except
+          experiment.error(
+              f'Failed to save example {example.id} to {writer.path}. '
+              f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+          )
+          raise e
+      runner.background_run(_save_example, example)


 class SequenceWriter:
@@ -198,8 +256,13 @@ class SequenceWriter:

   def __init__(self, path: str):
     self._lock = threading.Lock()
+    self._path = path
     self._sequence_writer = pg.io.open_sequence(path, 'w')

+  @property
+  def path(self) -> str:
+    return self._path
+
   def add(self, example: Example):
     example_blob = pg.to_json_str(
         example,
--- a/langfun/core/eval/v2/checkpointing_test.py
+++ b/langfun/core/eval/v2/checkpointing_test.py
@@ -16,9 +16,9 @@ import tempfile
 import unittest

 from langfun.core.eval.v2 import checkpointing
+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
-from langfun.core.eval.v2 import test_helper
 import pyglove as pg

 Example = example_lib.Example
@@ -56,7 +56,7 @@ class PerExampleCheckpointerTest(unittest.TestCase):

   def test_checkpointing(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'per_example_checkpointer')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     checkpoint_filename = 'checkpoint.jsonl'
     checkpointer = checkpointing.PerExampleCheckpointer(checkpoint_filename)
     run = experiment.run(
@@ -89,7 +89,7 @@ class BulkCheckpointerTest(unittest.TestCase):

   def test_checkpointing(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_bulk_checkpointer')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     checkpoint_filename = 'checkpoint.jsonl'
     checkpointer = checkpointing.BulkCheckpointer(checkpoint_filename)
     run = experiment.run(
--- a/langfun/core/eval/v2/evaluation.py
+++ b/langfun/core/eval/v2/evaluation.py
@@ -285,36 +285,43 @@ class Evaluation(experiment_lib.Experiment):
   # Evaluation-level logging.
   #

-  def _log(self, level: lf.logging.LogLevel, message: str, **kwargs):
+  def _log(self, log_func, level: lf.logging.LogLevel, message: str, **kwargs):
+    # Write to external logging system.
+    log_message = f'{self.id}: {message}'
+    if kwargs:
+      log_message = f'{log_message} (metadata: {kwargs!r})'
+    log_func(log_message)
+
+    # Add to experiment log history.
+    log_entry = lf.logging.LogEntry(
+        level=level,
+        time=datetime.datetime.now(),
+        message=message,
+        metadata=kwargs,
+    )
     with self._log_lock:
-      self._log_entries.append(
-          lf.logging.LogEntry(
-              level=level,
-              time=datetime.datetime.now(),
-              message=message,
-              metadata=kwargs,
-          )
-      )
+      self._log_entries.append(log_entry)

   def debug(self, message: str, **kwargs):
     """Logs a debug message to the session."""
-    self._log('debug', message, **kwargs)
+    self._log(pg.logging.debug, 'debug', message, **kwargs)

   def info(self, message: str, **kwargs):
     """Logs an info message to the session."""
-    self._log('info', message, **kwargs)
+    self._log(pg.logging.info, 'info', message, **kwargs)

   def warning(self, message: str, **kwargs):
     """Logs a warning message to the session."""
-    self._log('warning', message, **kwargs)
+    self._log(pg.logging.warning, 'warning', message, **kwargs)

   def error(self, message: str, **kwargs):
     """Logs an error message to the session."""
-    self._log('error', message, **kwargs)
+    self._log(pg.logging.error, 'error', message, **kwargs)

   def fatal(self, message: str, **kwargs):
     """Logs a fatal message to the session."""
-    self._log('fatal', message, **kwargs)
+    # We use error level for fatal message, which does not trigger assertion.
+    self._log(pg.logging.error, 'fatal', message, **kwargs)

   #
   # HTML views.
--- a/langfun/core/eval/v2/evaluation_test.py
+++ b/langfun/core/eval/v2/evaluation_test.py
@@ -15,12 +15,11 @@ import os
 import tempfile
 import unittest

+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import evaluation as evaluation_lib
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import experiment as experiment_lib

-from langfun.core.eval.v2 import test_helper
-
 import pyglove as pg

 Example = example_lib.Example
@@ -32,17 +31,23 @@ Run = experiment_lib.Run
 class EvaluationTest(unittest.TestCase):

   def test_hyper_evaluation(self):
-    exp = test_helper.TestEvaluation(
-        lm=test_helper.TestLLM(offset=pg.oneof(range(3)))
+    exp = eval_test_helper.TestEvaluation(
+        lm=eval_test_helper.TestLLM(offset=pg.oneof(range(3)))
     )
     self.assertFalse(exp.is_leaf)
     self.assertTrue(
         pg.eq(
             exp.children,
             [
-                test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=0)),
-                test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=1)),
-                test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=2)),
+                eval_test_helper.TestEvaluation(
+                    lm=eval_test_helper.TestLLM(offset=0)
+                ),
+                eval_test_helper.TestEvaluation(
+                    lm=eval_test_helper.TestLLM(offset=1)
+                ),
+                eval_test_helper.TestEvaluation(
+                    lm=eval_test_helper.TestLLM(offset=2)
+                ),
             ]
         )
     )
@@ -57,19 +62,21 @@ class EvaluationTest(unittest.TestCase):
     )

   def test_input(self):
-    exp = test_helper.TestEvaluation()
+    exp = eval_test_helper.TestEvaluation()
     self.assertEqual(exp.num_examples, 10)
-    exp = test_helper.TestEvaluation(inputs=test_helper.test_inputs(None))
+    exp = eval_test_helper.TestEvaluation(
+        inputs=eval_test_helper.test_inputs(None)
+    )
     self.assertEqual(exp.num_examples, 20)
     @pg.functor
     def my_inputs():
       yield pg.Dict(x=1, y=2)
       yield pg.Dict(x=3, y=4)
-    exp = test_helper.TestEvaluation(inputs=my_inputs())
+    exp = eval_test_helper.TestEvaluation(inputs=my_inputs())
     self.assertEqual(exp.num_examples, 2)

   def test_evaluate(self):
-    exp = test_helper.TestEvaluation()
+    exp = eval_test_helper.TestEvaluation()
     example = exp.evaluate(Example(id=3))
     self.assertIs(exp.state.get(3), example)
     self.assertTrue(example.newly_processed)
@@ -85,7 +92,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertIsNotNone(example.start_time)
     self.assertIsNotNone(example.end_time)

-    exp = test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=1))
+    exp = eval_test_helper.TestEvaluation(lm=eval_test_helper.TestLLM(offset=1))
     example = exp.evaluate(3)
     self.assertTrue(example.newly_processed)
     self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
@@ -109,7 +116,7 @@ class EvaluationTest(unittest.TestCase):
     pg.io.mkdirs(eval_dir, exist_ok=True)
     state_file = os.path.join(eval_dir, 'state.jsonl')
     with pg.io.open_sequence(state_file, 'w') as f:
-      exp = test_helper.TestEvaluation()
+      exp = eval_test_helper.TestEvaluation()
       example = exp.evaluate(3)
       self.assertTrue(example.newly_processed)
       self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
@@ -132,7 +139,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(example.usage_summary.uncached.total.num_requests, 0)

   def test_html_view(self):
-    exp = test_helper.TestEvaluation()
+    exp = eval_test_helper.TestEvaluation()
     exp.debug('debug message')
     exp.info('info message')
     exp.warning('warning message', x=1)
--- a/langfun/core/eval/v2/experiment.py
+++ b/langfun/core/eval/v2/experiment.py
@@ -959,6 +959,14 @@ class Plugin(lf.Component):
   ) -> None:
     """Called when an experiment (both leaf and non-leaf) is complete."""

+  def on_experiment_abort(
+      self,
+      runner: Runner,
+      experiment: Experiment,
+      error: BaseException,
+  ) -> None:
+    """Called when an experiment (both leaf and non-leaf) is aborted."""
+
   def on_example_start(
       self,
       runner: Runner,
--- a/langfun/core/eval/v2/progress_tracking_test.py
+++ b/langfun/core/eval/v2/progress_tracking_test.py
@@ -18,9 +18,9 @@ import tempfile
 import unittest

 from langfun.core import console as lf_console
+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import progress_tracking  # pylint: disable=unused-import
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
-from langfun.core.eval.v2 import test_helper
 import pyglove as pg


@@ -35,7 +35,7 @@ class HtmlProgressTrackerTest(unittest.TestCase):
         display=display
     )
     root_dir = os.path.join(tempfile.gettempdir(), 'test_html_progress_tracker')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     _ = experiment.run(root_dir, 'new', plugins=[])
     self.assertIsInstance(result['view'], pg.Html)
     lf_console._notebook = None
@@ -45,7 +45,7 @@ class TqdmProgressTrackerTest(unittest.TestCase):

   def test_basic(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_tqdm_progress_tracker')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     string_io = io.StringIO()
     with contextlib.redirect_stderr(string_io):
       _ = experiment.run(root_dir, 'new', plugins=[])
@@ -55,7 +55,7 @@
     root_dir = os.path.join(
         tempfile.gettempdir(), 'test_tqdm_progress_tracker_with_example_ids'
     )
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     string_io = io.StringIO()
     with contextlib.redirect_stderr(string_io):
       _ = experiment.run(root_dir, 'new', example_ids=[1], plugins=[])
--- a/langfun/core/eval/v2/reporting.py
+++ b/langfun/core/eval/v2/reporting.py
@@ -14,6 +14,7 @@
 """Reporting evaluation results."""

 import time
+import traceback
 from typing import Annotated

 from langfun.core.eval.v2 import example as example_lib
@@ -61,6 +62,14 @@ class HtmlReporter(experiment_lib.Plugin):
   ) -> None:
     self._maybe_update_summary(runner, force=True)

+  def on_run_abort(
+      self,
+      runner: Runner,
+      root: Experiment,
+      error: BaseException
+  ) -> None:
+    self._maybe_update_summary(runner, force=True)
+
   def on_experiment_start(
       self,
       runner: Runner,
@@ -75,6 +84,16 @@
     if experiment.is_leaf:
       self._maybe_update_experiment_html(runner, experiment, force=True)

+  def on_experiment_abort(
+      self,
+      runner: Runner,
+      experiment: Experiment,
+      error: BaseException
+  ) -> None:
+    del error
+    assert experiment.is_leaf
+    self._maybe_update_experiment_html(runner, experiment, force=True)
+
   def on_example_complete(
       self, runner: Runner, experiment: Experiment, example: Example
   ):
@@ -103,19 +122,26 @@
       self, runner: Runner, experiment: Experiment, force: bool = False
   ) -> None:
     def _save():
-      html = experiment.to_html(
-          collapse_level=None,
-          extra_flags=dict(
-              current_run=runner.current_run,
-              interactive=False,
-              card_view=False,
-          ),
-      )
-      html.save(
-          runner.current_run.output_path_for(
-              experiment, _EVALULATION_DETAIL_FILE
-          )
+      index_html_path = runner.current_run.output_path_for(
+          experiment, _EVALULATION_DETAIL_FILE
       )
+      try:
+        html = experiment.to_html(
+            collapse_level=None,
+            extra_flags=dict(
+                current_run=runner.current_run,
+                interactive=False,
+                card_view=False,
+            ),
+        )
+        html.save(index_html_path)
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to save HTML {index_html_path!r}. '
+            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+        )
+        raise e
+
     if force or (
         time.time() - self._last_experiment_report_time[experiment.id]
         > self.experiment_report_interval
@@ -128,17 +154,24 @@
   ) -> None:
     """Saves the example."""
     def _save():
-      html = example.to_html(
-          collapse_level=None,
-          enable_summary_tooltip=False,
-          extra_flags=dict(
-              # For properly rendering the next link.
-              num_examples=getattr(experiment, 'num_examples', None)
-          ),
-      )
-      html.save(
-          runner.current_run.output_path_for(
-              experiment, f'{example.id}.html'
-          )
-      )
+      try:
+        html = example.to_html(
+            collapse_level=None,
+            enable_summary_tooltip=False,
+            extra_flags=dict(
+                # For properly rendering the next link.
+                num_examples=getattr(experiment, 'num_examples', None)
+            ),
+        )
+        html.save(
+            runner.current_run.output_path_for(
+                experiment, f'{example.id}.html'
+            )
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to save HTML {example.id}.html. '
+            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+        )
+        raise e
     runner.background_run(_save)
--- a/langfun/core/eval/v2/reporting_test.py
+++ b/langfun/core/eval/v2/reporting_test.py
@@ -15,9 +15,9 @@ import os
 import tempfile
 import unittest

+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import reporting
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
-from langfun.core.eval.v2 import test_helper
 import pyglove as pg


@@ -25,7 +25,7 @@ class ReportingTest(unittest.TestCase):

   def test_reporting(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_reporting')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     reporter = reporting.HtmlReporter()
     run = experiment.run(root_dir, 'new', plugins=[reporter])
     pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
--- a/langfun/core/eval/v2/runners.py
+++ b/langfun/core/eval/v2/runners.py
@@ -18,6 +18,7 @@ import concurrent.futures
 import random
 import threading
 import time
+import traceback
 from typing import Any, Annotated, Callable, Iterator

 from langfun import core as lf
@@ -64,6 +65,7 @@ class RunnerBase(Runner):
     with pg.notify_on_change(False):
       self.plugins.append(progress_tracking.progress_tracker(self.tqdm))

+    self._io_pool_lock = threading.Lock()
     self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
     # TODO(daiyip): render background errors.
     self._background_last_error = None
@@ -75,7 +77,10 @@
         func(*args, **kwargs)
       except Exception as e:  # pylint: disable=broad-except
         self._background_last_error = e
-    self._io_pool.submit(_background_run, *args, **kwargs)
+
+    with self._io_pool_lock:
+      if self._io_pool is not None:
+        self._io_pool.submit(_background_run, *args, **kwargs)

   def _all_plugins(self, experiment: Experiment) -> Iterator[Plugin]:
     """Returns all plugins for the experiment."""
@@ -120,9 +125,14 @@
     # Start the progress of the evaluation.
     if experiment.is_leaf:
       assert isinstance(experiment, Evaluation)
-      experiment.progress.start(
-          total=(len(self.current_run.example_ids)
-                 if self.current_run.example_ids else experiment.num_examples)
+      num_examples_to_evaluate = (
+          len(self.current_run.example_ids)
+          if self.current_run.example_ids else experiment.num_examples
+      )
+      experiment.progress.start(total=num_examples_to_evaluate)
+      experiment.info(
+          'Starting evaluation %s with %d examples to evaluate.'
+          % (experiment.id, num_examples_to_evaluate)
       )
     else:
       experiment.progress.start(total=len(experiment.leaf_nodes))
@@ -144,8 +154,7 @@

     # Only leaf evaluations will trigger the complete notification of the
     # ancestors.
-    if experiment.is_leaf:
-      self._update_ancestor_progresses(experiment)
+    self._update_ancestor_progresses(experiment)

   def on_experiment_complete(self, experiment: Experiment) -> None:
     """Called when an evaluation is complete."""
@@ -160,6 +169,35 @@
     # ancestors.
     if experiment.is_leaf:
       self._update_ancestor_progresses(experiment)
+      self._log_experiment_completion(experiment)
+
+  def _log_experiment_completion(self, experiment: Experiment):
+    example_ids = (
+        self.current_run.example_ids if self.current_run.example_ids else
+        list(range(1, experiment.num_examples + 1))
+    )
+    num_from_checkpoint, num_processed = 0, 0
+    for example_id in example_ids:
+      example = experiment.state.get(example_id)
+      if example.newly_processed:
+        num_processed += 1
+      else:
+        num_from_checkpoint += 1
+    experiment.info(
+        f'{experiment.id} completed with {num_from_checkpoint + num_processed} '
+        f'examples evaluated ({num_from_checkpoint} from checkpoint, '
+        f'{num_processed} newly processed).'
+    )
+
+  def on_experiment_abort(
+      self, experiment: Experiment, error: BaseException) -> None:
+    """Called when an evaluation is aborted."""
+    assert experiment.is_leaf
+    experiment.fatal(f'{error}\n\n{traceback.format_exc()}')
+
+    # Notify the plugins of the experiment abort.
+    for plugin in self._all_plugins(experiment):
+      plugin.on_experiment_abort(self, experiment, error)

   def _update_ancestor_progresses(self, experiment: Experiment):
     """Updates the progresses of the parent nodes of the experiment."""
@@ -262,7 +300,9 @@
       self.background_run(cache.save)

     # Wait for the background tasks to finish.
-    self._io_pool.shutdown(wait=True)
+    with self._io_pool_lock:
+      self._io_pool, io_pool = None, self._io_pool
+    io_pool.shutdown(wait=True)

   @abc.abstractmethod
   def _run(self, evaluations: list[Evaluation]) -> None:
@@ -270,31 +310,36 @@
   def run_evaluation(self, evaluation: Evaluation) -> None:
     """Runs the evaluation."""
-    self.on_experiment_start(evaluation)
-
-    per_evaluation_settings = {}
-    cache = None
-    if self.current_run.use_cache == 'per_dataset':
-      cache = self._load_or_create_cache(evaluation)
-      per_evaluation_settings['cache'] = cache
-
-    with lf.use_settings(**per_evaluation_settings):
-      if self.current_run.example_ids is None:
-        items = (
-            Example(id=i + 1, input=ex) for i, ex in enumerate(
-                evaluation.example_inputs)
-        )
-      else:
-        items = (
-            Example(
-                id=example_id, input=evaluation.example_input_by_id(example_id)
-            ) for example_id in self.current_run.example_ids
-        )
-      self._evaluate_items(evaluation, items)
-
-    if cache:
-      self.background_run(cache.save)
-    self.on_experiment_complete(evaluation)
+    try:
+      self.on_experiment_start(evaluation)
+
+      per_evaluation_settings = {}
+      cache = None
+      if self.current_run.use_cache == 'per_dataset':
+        cache = self._load_or_create_cache(evaluation)
+        per_evaluation_settings['cache'] = cache
+
+      with lf.use_settings(**per_evaluation_settings):
+        if self.current_run.example_ids is None:
+          items = (
+              Example(id=i + 1, input=ex) for i, ex in enumerate(
+                  evaluation.example_inputs)
+          )
+        else:
+          items = (
+              Example(
+                  id=example_id,
+                  input=evaluation.example_input_by_id(example_id)
+              ) for example_id in self.current_run.example_ids
+          )
+        self._evaluate_items(evaluation, items)
+
+      if cache:
+        self.background_run(cache.save)
+      self.on_experiment_complete(evaluation)
+    except BaseException as e:  # pylint: disable=broad-except
+      self.on_experiment_abort(evaluation, e)
+      raise e

   @abc.abstractmethod
   def _evaluate_items(
@@ -410,9 +455,7 @@ class ParallelRunner(RunnerBase):
         groups.values(),
         max_workers=max(64, len(groups)),
         timeout=self.timeout,
-        silence_on_errors=(
-            None if self.current_run.raise_if_has_error else BaseException
-        )
+        silence_on_errors=None,
     ):
       pass

@@ -437,8 +480,6 @@
         items,
         max_workers=evaluation.max_workers,
         timeout=self.timeout,
-        silence_on_errors=(
-            None if self.current_run.raise_if_has_error else BaseException
-        )
+        silence_on_errors=None,
     ):
       pass
--- a/langfun/core/eval/v2/runners_test.py
+++ b/langfun/core/eval/v2/runners_test.py
@@ -18,10 +18,11 @@ import time
 from typing import Any
 import unittest

+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import experiment as experiment_lib
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
-from langfun.core.eval.v2 import test_helper
+
 import pyglove as pg


@@ -101,7 +102,7 @@

   def test_basic(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(tempfile.gettempdir(), 'test_sequential_runner')
     run = exp.run(root_dir, runner='sequential', plugins=[plugin])

@@ -143,7 +144,7 @@

   def test_raise_if_has_error(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_raise_if_has_error')
-    exp = test_helper.TestEvaluation()
+    exp = eval_test_helper.TestEvaluation()
     with self.assertRaisesRegex(ValueError, 'x should not be 5'):
       exp.run(
           root_dir, runner='sequential', plugins=[], raise_if_has_error=True
@@ -154,7 +155,7 @@

   def test_example_ids(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_example_ids')
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     plugin = TestPlugin()
     _ = exp.run(
         root_dir, runner='sequential', plugins=[plugin], example_ids=[5, 7, 9]
@@ -164,7 +165,7 @@

   def test_filter(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(tempfile.gettempdir(), 'test_filter')

     _ = exp.run(
@@ -193,7 +194,7 @@
         ) for i in range(num_examples)
     ]

-    exp = test_helper.TestEvaluation(
+    exp = eval_test_helper.TestEvaluation(
         inputs=test_inputs(num_examples=pg.oneof([2, 4]))
     )
     # Global cache.
@@ -234,7 +235,7 @@ class ParallelRunnerTest(RunnerTest):

   def test_parallel_runner(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(tempfile.gettempdir(), 'test_parallel_runner')
     run = exp.run(root_dir, runner='parallel', plugins=[plugin])

@@ -274,7 +275,7 @@

   def test_concurrent_startup_delay(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(
         tempfile.gettempdir(), 'test_concurrent_startup_delay'
     )
@@ -290,7 +291,7 @@ class DebugRunnerTest(RunnerTest):

   def test_debug_runner(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(tempfile.gettempdir(), 'test_debug_runner')
     run = exp.run(root_dir, runner='debug', plugins=[plugin])

--- a/langfun/core/structured/querying.py
+++ b/langfun/core/structured/querying.py
@@ -583,7 +583,16 @@ class QueryInvocation(pg.Object, pg.views.HtmlTreeView.Extension):

   @functools.cached_property
   def output(self) -> Any:
-    return query_output(self.lm_response, self.schema)
+    """The output of `lf.query`. If it failed, returns the `MappingError`."""
+    try:
+      return query_output(self.lm_response, self.schema)
+    except mapping.MappingError as e:
+      return e
+
+  @property
+  def has_error(self) -> bool:
+    """Returns True if the query failed to generate a valid output."""
+    return isinstance(self.output, BaseException)

   @property
   def elapse(self) -> float:
--- a/langfun/core/structured/querying_test.py
+++ b/langfun/core/structured/querying_test.py
@@ -1051,6 +1051,16 @@ class QueryStructureJsonTest(unittest.TestCase):

 class QueryInvocationTest(unittest.TestCase):

+  def test_basics(self):
+    lm = fake.StaticSequence([
+        'Activity(description="hi"',
+    ])
+    with querying.track_queries() as queries:
+      querying.query('foo', Activity, default=None, lm=lm)
+
+    self.assertTrue(queries[0].has_error)
+    self.assertIsInstance(queries[0].output, mapping.MappingError)
+
   def test_to_html(self):
     lm = fake.StaticSequence([
         'Activity(description="hi")',
--- a/langfun-0.1.2.dev202412190804.dist-info/METADATA
+++ b/langfun-0.1.2.dev202412210804.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: langfun
-Version: 0.1.2.dev202412190804
+Version: 0.1.2.dev202412210804
 Summary: Langfun: Language as Functions.
 Home-page: https://github.com/google/langfun
 Author: Langfun Authors
--- a/langfun-0.1.2.dev202412190804.dist-info/RECORD
+++ b/langfun-0.1.2.dev202412210804.dist-info/RECORD
@@ -58,13 +58,14 @@ langfun/core/eval/patching_test.py,sha256=8kCd54Egjju22FMgtJuxEsrXkW8ifs-UUBHtrC
 langfun/core/eval/scoring.py,sha256=B69IsIxiPs1xZcOBFIhZF70YmDue2Siik-CPL2bh33s,6254
 langfun/core/eval/scoring_test.py,sha256=O8olHbrUEg60gMxwOkWzKBJZpZoUlmVnBANX5Se2SXM,4546
 langfun/core/eval/v2/__init__.py,sha256=qoa6zKdFXOFyCX6vay6OdgPf1eUhYGoHYAxe35qECGk,1628
-langfun/core/eval/v2/checkpointing.py,sha256=8vxH3AfIBS8dxA0IiOZBUxAHXIx5m2tSWSSumDLpzp8,6546
-langfun/core/eval/v2/checkpointing_test.py,sha256=dAERKQTW_PM1B0oUauB0YVQkMEI-cgJq0q-wAVlGYpU,4383
-langfun/core/eval/v2/evaluation.py,sha256=7PC-npbEQjwwv0pWbv8vGi_OkzZ7QpJrEpYoixFBlno,21429
-langfun/core/eval/v2/evaluation_test.py,sha256=ld8oBOjsfN-LNLL2eViSTu17wAq90GcsfURXX6oVlFo,6014
+langfun/core/eval/v2/checkpointing.py,sha256=zr2hxOjm6Hdq71sYTsbQtL_CwQOWr-Ir9T5TPUnhqMI,8741
+langfun/core/eval/v2/checkpointing_test.py,sha256=Imy96lwDkvmtj-1YFpP2DZukMOoYqpPov2J_MsQKxxI,4398
+langfun/core/eval/v2/eval_test_helper.py,sha256=pDpZTBnWRR5xjJv3Uy3NWEzArqlL8FTMOgeR4C53F5M,2348
+langfun/core/eval/v2/evaluation.py,sha256=NFBGAWw2BtW7H0zcoZhfWtz59Psra84eshJm73uAFwg,21807
+langfun/core/eval/v2/evaluation_test.py,sha256=GmV1TiqX1V15st2qpcGWooM5hudomQVjW5kajovGDvE,6231
 langfun/core/eval/v2/example.py,sha256=fURrvdNmMsVMqoEErcsmLmC6Xq3ny16dYsnLH8HVlcY,9626
 langfun/core/eval/v2/example_test.py,sha256=WcJmU7IQQXvjFia63mokySC4CqxzVL9Wso1sC5F0YK8,3032
-langfun/core/eval/v2/experiment.py,sha256=0JBGckJ93aqSdffpJPDVPy_I5T2BXscghTxiglHzJWo,29556
+langfun/core/eval/v2/experiment.py,sha256=xfk4aNZ3dH46y0lWSS_fC7JpfJCG77Z5qsakV4gHcOs,29762
 langfun/core/eval/v2/experiment_test.py,sha256=zSMHYqC9cA0k61U71pCSYTAJ6yK2_b6Dml5btc-bKzQ,9133
 langfun/core/eval/v2/metric_values.py,sha256=_B905bC-jxrYPLSEcP2M8MaHZOVMz_bVrUw8YC4arCE,4660
 langfun/core/eval/v2/metric_values_test.py,sha256=ab2oF_HsIwrSy459108ggyjgefHSPn8UVILR4dRwx14,2634
@@ -73,12 +74,11 @@ langfun/core/eval/v2/metrics_test.py,sha256=p4FzLJsE8XAzAQuyP9hfEf9YeKWZ__PO_ue8
 langfun/core/eval/v2/progress.py,sha256=azZgssQgNdv3IgjKEaQBuGI5ucFDNbdi02P4z_nQ8GE,10292
 langfun/core/eval/v2/progress_test.py,sha256=YU7VHzmy5knPZwj9vpBN3rQQH2tukj9eKHkuBCI62h8,2540
 langfun/core/eval/v2/progress_tracking.py,sha256=l9fEkz4oP5McpZzf72Ua7PYm3lAWtRru7gRWNf8H0ms,6083
-langfun/core/eval/v2/progress_tracking_test.py,sha256=iO-DslCJWncU7-27XaMKxDeKrsGbwdk_tKfoRk3KboE,2271
-langfun/core/eval/v2/reporting.py,sha256=TGkli1IDwqfqsCJ_WslOMGk_24JDg7oRRTGXlAJlWpc,4361
-langfun/core/eval/v2/reporting_test.py,sha256=JxffbUPWInUyLjo-AQVFrllga884Mdfm05R86FtxSss,1482
-langfun/core/eval/v2/runners.py,sha256=nh5qIAkdlY1MohDfiPkFcCY_By1SN0A1SOqmaShGziM,14339
-langfun/core/eval/v2/runners_test.py,sha256=UeiUNygux_U6iGVG18rhp68ZE4hoWeoT6XsXvSjxNQg,11620
-langfun/core/eval/v2/test_helper.py,sha256=pDpZTBnWRR5xjJv3Uy3NWEzArqlL8FTMOgeR4C53F5M,2348
+langfun/core/eval/v2/progress_tracking_test.py,sha256=fouMVJkFJqHjbhQJngGLGCmA9x3n0dU4USI2dY163mg,2291
+langfun/core/eval/v2/reporting.py,sha256=vsh45GLVnA7GMU-8cvNYOt4Nb7mEwvcguhO-BSXSzTE,5358
+langfun/core/eval/v2/reporting_test.py,sha256=4nobW6pcaatiZh8u4xciexciaiZNDlDoJci157Wp_RI,1492
+langfun/core/eval/v2/runners.py,sha256=t6_yHAJ4HWufK4wvh_OntKcok2KquA5ARIHIk1vvEwc,15870
+langfun/core/eval/v2/runners_test.py,sha256=A37fKK2MvAVTiShsg_laluJzJ9AuAQn52k7HPbfD0Ks,11666
 langfun/core/llms/__init__.py,sha256=lWXKjGHv66ShG7AE_Bc4QM7SDTxJdfoQMn3PF0lr0sU,6461
 langfun/core/llms/anthropic.py,sha256=afKZmdiLcosS_UEBlB8WKyf1K-zeXgwtPAx6ofg2Gww,13989
 langfun/core/llms/anthropic_test.py,sha256=-2U4kc_pgBM7wqxu8RuxzyHPGww1EAWqKUvN4PW8Btw,8058
@@ -129,8 +129,8 @@ langfun/core/structured/mapping.py,sha256=vLKH79UT-j0qkQdvqlQBO7SkXXuM-yr2Idm8_H
 langfun/core/structured/mapping_test.py,sha256=bHm2ZCXBITq_G8Lvw_olFHeUUc4s_lGXZm9v9JhoPB4,9630
 langfun/core/structured/parsing.py,sha256=MGvI7ypXlwfzr5XB8_TFU9Ei0_5reYqkWkv64eAy0EA,12015
 langfun/core/structured/parsing_test.py,sha256=kNPrhpdPY3iWhUld0TFYU-Zgn44wC0d6YuQ9XdVbQ8o,22346
-langfun/core/structured/querying.py,sha256=sXGhYtiEBac8iOkYOErGXyX8SAHSB1gg69WePhOyGxE,22759
-langfun/core/structured/querying_test.py,sha256=M9Apg83KjQUjT42K9LheBEr74DX3Inwd0YmCanA71kc,31738
+langfun/core/structured/querying.py,sha256=nqvsfMS_KLv5EvO0_VAGEHwY4pHy4S0CvJmeV0HBXlM,23066
+langfun/core/structured/querying_test.py,sha256=YlC4s9LVChfhGZzaXGW1UYlcBnAjNOunu4SLl5_p7PQ,32054
 langfun/core/structured/schema.py,sha256=0VUPSfX1JEQ0xu8WvEymCKK_WSGwBNA-rQD2hATErmU,27912
 langfun/core/structured/schema_generation.py,sha256=U3nRQsqmMZg_qIVDh2fiY3K4JLfsAL1LcKzIFP1iXFg,5316
 langfun/core/structured/schema_generation_test.py,sha256=RM9s71kMNg2jTePwInkiW9fK1ACN37eyPeF8OII-0zw,2950
@@ -148,8 +148,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
 langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
 langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
 langfun/core/templates/selfplay_test.py,sha256=Ot__1P1M8oJfoTp-M9-PQ6HUXqZKyMwvZ5f7yQ3yfyM,2326
-langfun-0.1.2.dev202412190804.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
-langfun-0.1.2.dev202412190804.dist-info/METADATA,sha256=Zr8TfOnhdo83h3aGRNRWXTrJ54h7Sh7E-7Lj95iJVDw,8281
-langfun-0.1.2.dev202412190804.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-langfun-0.1.2.dev202412190804.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
-langfun-0.1.2.dev202412190804.dist-info/RECORD,,
+langfun-0.1.2.dev202412210804.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+langfun-0.1.2.dev202412210804.dist-info/METADATA,sha256=u3a7ssXSuTdAJuhvtQtGyroed6k5r9HeobDPozhHjJ0,8281
+langfun-0.1.2.dev202412210804.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+langfun-0.1.2.dev202412210804.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
+langfun-0.1.2.dev202412210804.dist-info/RECORD,,