langfun 0.1.2.dev202501020804__py3-none-any.whl → 0.1.2.dev202501040804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -105,8 +105,8 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
     # metrics as needed.
     experiment.run(root_dir, '20241031_1')
 
-    # Refresh the previous run located in 'run_20241031_1'.
-    experiment.run(root_dir, '20241031_1', refresh=True)
+    # Reprocess the previous run located in 'run_20241031_1'.
+    experiment.run(root_dir, '20241031_1', reprocess=True)
     ```
 
   # Experiment Registration and Lookup
@@ -380,7 +380,8 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
       filter: Callable[['Experiment'], bool] | None = None,  # pylint: disable=redefined-builtin
       example_ids: list[int] | None = None,
       raise_if_has_error: bool = False,
-      refresh: bool = False,
+      reprocess: bool | list[int] = False,
+      generate_example_html: Literal['new', 'all', 'no'] | list[int] = 'new',
       process_timeout: int | None = None,
       use_cache: Literal['global', 'per_dataset', 'no'] = 'per_dataset',
       note: str | None = None,
@@ -391,22 +392,25 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
     """Runs the experiment.
 
     Examples:
-      # Start a new run.
-      experiment.run('new')
+      # Start a new run under root_dir.
+      experiment.run(root_dir, 'new')
 
       # Continue the latest experiment run.
-      experiment.run('latest')
+      experiment.run(root_dir, 'latest')
 
       # Continue the latest experiment run or start a new run if it does not
       # exist.
-      experiment.run()
+      experiment.run(root_dir)
 
-      # Start a new run and warm start from a previous run under sub-dir
-      # 'run_20241031_1'.
-      experiment.run('new', warm_start_from='20241031_1')
+      # Start a new run and warm start from another run's directory
+      # '/path/to/another/run_20241031_1/'.
+      experiment.run(
+          root_dir, 'new',
+          warm_start_from='/path/to/another/run_20241031_1/'
+      )
 
-      # Refresh previous run under sub-dir 'run_20241031_1'.
-      experiment.run('20241031_1', refresh=True)
+      # Reprocess previous run under sub-dir 'run_20241031_1'.
+      experiment.run(root_dir, '20241031_1', reprocess=True)
 
     Args:
       root_dir: The root of the output directory of the experiment.
@@ -426,8 +430,18 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
       example_ids: The example IDs to run. If None, it will run all examples.
       raise_if_has_error: If True, it will raise an error if any example fails.
         Otherwise, it will continue and report the error in the output.
-      refresh: Whether to refresh the experiment. If True, it will delete the
-        data under the current experiment run directory and start a new run.
+      reprocess: A boolean or a list of example IDs. If a boolean, it
+        indicates whether all the examples to be evaluated will be
+        reprocessed, meaning that existing checkpoints will be ignored. If a
+        list of example IDs, only the specified examples will be reprocessed.
+      generate_example_html: Either 'new', 'all', 'no', or a list of example
+        IDs. If 'new', generate HTML files for all newly processed examples,
+        and keep/copy existing HTML files for unchanged examples.
+        If 'all', generate HTML files for all examples.
+        If 'no', do not generate HTML files for any examples.
+        If a list of example IDs, generate HTML files for the specified
+        examples.
       process_timeout: The timeout in seconds for each process. If None, it
         will use the default timeout for the runner.
       use_cache: Whether to use LLM cache for the experiment.
@@ -454,7 +468,8 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
         filter=filter,
         example_ids=example_ids,
         raise_if_has_error=raise_if_has_error,
-        refresh=refresh,
+        reprocess=reprocess,
+        generate_example_html=generate_example_html,
         use_cache=use_cache,
         process_timeout=process_timeout,
         note=note,
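Taken together, the two new parameters compose naturally: `reprocess` controls which checkpoints are ignored, while `generate_example_html` controls which HTML reports are rebuilt. A minimal sketch of a call using both, where `experiment` and `root_dir` are placeholders rather than part of this diff:

```python
# Illustrative sketch of the updated API; `experiment` and `root_dir`
# are assumed to exist (e.g. an Evaluation/Suite and an output directory).
root_dir = '/tmp/my_eval'

# Re-run only examples 1 and 3, regenerating HTML just for them.
experiment.run(
    root_dir, '20241031_1',
    reprocess=[1, 3],
    generate_example_html=[1, 3],
)
```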
@@ -815,14 +830,27 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
       'The user tags for the current run.'
   ] = []
 
-  refresh: Annotated[
-      bool,
+  reprocess: Annotated[
+      bool | list[int],
       (
-          'If True, it will delete the data under the current '
-          'run directory and start a new run.'
+          'If True, it will reprocess all examples under the current '
+          'run directory. If a list of integers, examples with the given '
+          'IDs will be reprocessed.'
       )
   ] = False
 
+  generate_example_html: Annotated[
+      Literal['new', 'all', 'no'] | list[int],
+      (
+          'If "new", generate HTML files for all newly processed examples, '
+          'and keep/copy existing HTML files for unchanged examples. '
+          'If "all", generate HTML files for all examples. '
+          'If "no", do not generate HTML files for any examples. '
+          'If a list of example IDs, generate HTML files for the specified '
+          'examples.'
+      )
+  ] = 'new'
+
   filter: Annotated[
       Callable[[Experiment], bool] | None,
       'A filter to decide whether a leaf experiment should be run or not.'
@@ -873,6 +901,42 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
     """Returns the output path for the experiment."""
     return os.path.join(self.output_dir(experiment), relative_path)
 
+  def examples_to_evaluate(self, experiment: Experiment) -> set[int]:
+    """Returns the example IDs to evaluate."""
+    if not experiment.is_leaf:
+      return set()
+    return set(
+        self.example_ids if self.example_ids else
+        range(1, experiment.num_examples + 1)
+    )
+
+  def examples_to_reprocess(self, experiment: Experiment) -> set[int]:
+    """Returns the example IDs to reprocess per request."""
+    if not self.reprocess:
+      return set()
+    reprocess_ids = self.examples_to_evaluate(experiment)
+    if isinstance(self.reprocess, list):
+      reprocess_ids &= set(self.reprocess)
+    return reprocess_ids
+
+  def examples_to_load(self, experiment: Experiment) -> set[int]:
+    """Returns the example IDs to load from checkpoint files."""
+    load_ids = self.examples_to_evaluate(experiment)
+    if isinstance(self.generate_example_html, list):
+      load_ids |= set(self.generate_example_html)
+    load_ids -= self.examples_to_reprocess(experiment)
+    return load_ids
+
+  def examples_to_load_metadata(self, experiment: Experiment) -> set[int]:
+    """Returns the example IDs whose metadata should be loaded."""
+    load_metadata_ids = set()
+    if isinstance(self.generate_example_html, list):
+      load_metadata_ids = set(self.generate_example_html)
+    elif self.generate_example_html == 'all':
+      load_metadata_ids = self.examples_to_evaluate(experiment)
+    load_metadata_ids -= self.examples_to_reprocess(experiment)
+    return load_metadata_ids
+
 
 class Runner(pg.Object):
   """Interface for experiment runner."""
@@ -31,10 +31,10 @@ Runner = experiment_lib.Runner
 
 
 @pg.functor()
-def sample_inputs():
+def sample_inputs(num_examples: int = 1):
   return [
       pg.Dict(x=1)
-  ]
+  ] * num_examples
 
 
 class MyEvaluation(Evaluation):
@@ -208,7 +208,7 @@ class RunIdTest(unittest.TestCase):
 
 class RunTest(unittest.TestCase):
 
-  def test_basic(self):
+  def test_input_output_paths(self):
     run = Run(
         '/root',
         RunId.from_id('20241102_0'),
@@ -270,6 +270,124 @@ class RunTest(unittest.TestCase):
         )
     )
 
+  def test_examples_start_from_scratch(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+    )
+    root = run.experiment
+    self.assertEqual(run.examples_to_evaluate(root), set())
+    self.assertEqual(run.examples_to_reprocess(root), set())
+    self.assertEqual(run.examples_to_load(root), set())
+    self.assertEqual(run.examples_to_load_metadata(root), set())
+
+    exp = root.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set(range(1, 11)))
+    self.assertEqual(run.examples_to_reprocess(exp), set())
+    self.assertEqual(run.examples_to_load(exp), set(range(1, 11)))
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_example_ids(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5]
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set())
+    self.assertEqual(run.examples_to_load(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_reprocess_all(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=True
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_load(exp), set())
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_reprocess_some(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=[1],
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1]))
+    self.assertEqual(run.examples_to_load(exp), set([3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_generate_example_html_all(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=[1],
+        generate_example_html='all',
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1]))
+    self.assertEqual(run.examples_to_load(exp), set([3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set([3, 5]))
+
+  def test_examples_with_generate_example_html_new(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=[1],
+        generate_example_html='new',
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1]))
+    self.assertEqual(run.examples_to_load(exp), set([3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_generate_example_html_some(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=[1],
+        generate_example_html=[1, 2, 3],
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1]))
+    self.assertEqual(run.examples_to_load(exp), set([2, 3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set([2, 3]))
+
 
 class RunnerTest(unittest.TestCase):
 
@@ -172,11 +172,11 @@ class HtmlReporter(experiment_lib.Plugin):
         )
         html.save(index_html_path)
         experiment.info(
-            f'Generated HTML {index_html_path!r} in {t.elapse:.2f} seconds.',
+            f'Updated {index_html_path!r} in {t.elapse:.2f} seconds.',
         )
       except BaseException as e:  # pylint: disable=broad-except
         experiment.error(
-            f'Failed to save HTML {index_html_path!r}. '
+            f'Failed to generate {index_html_path!r}. '
             f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
         )
         raise e
@@ -194,26 +194,68 @@ class HtmlReporter(experiment_lib.Plugin):
   def _save_example_html(
       self, runner: Runner, experiment: Experiment, example: Example
   ) -> None:
-    """Saves the example."""
-    def _save():
+    """Saves the example in HTML format."""
+    current_run = runner.current_run
+    def _generate():
       try:
-        html = example.to_html(
-            collapse_level=None,
-            enable_summary_tooltip=False,
-            extra_flags=dict(
-                # For properly rendering the next link.
-                num_examples=getattr(experiment, 'num_examples', None)
-            ),
-        )
-        html.save(
-            runner.current_run.output_path_for(
-                experiment, f'{example.id}.html'
-            )
+        with pg.timeit() as t:
+          html = example.to_html(
+              collapse_level=None,
+              enable_summary_tooltip=False,
+              extra_flags=dict(
+                  # For properly rendering the next link.
+                  num_examples=getattr(experiment, 'num_examples', None)
+              ),
+          )
+          html.save(
+              runner.current_run.output_path_for(
+                  experiment, f'{example.id}.html'
+              )
+          )
+        experiment.info(
+            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds.'
         )
       except BaseException as e:  # pylint: disable=broad-except
         experiment.error(
-            f'Failed to save HTML {example.id}.html. '
+            f'Failed to generate \'{example.id}.html\'. '
            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
         )
         raise e
-    runner.background_run(_save)
+
+    def _copy():
+      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
+      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
+
+      if src_file == dest_file:
+        return
+
+      if not pg.io.path_exists(src_file):
+        experiment.warning(
+            f'Skip copying \'{example.id}.html\' as '
+            f'{src_file!r} does not exist.'
+        )
+        return
+
+      try:
+        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
+          content = src.read()
+          with pg.io.open(dest_file, 'w') as dest:
+            dest.write(content)
+        experiment.info(
+            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
+        )
+        raise e
+
+    generate_example_html = current_run.generate_example_html
+    if (generate_example_html == 'all'
+        or (generate_example_html == 'new' and example.newly_processed)
+        or (isinstance(generate_example_html, list)
+            and example.id in generate_example_html)):
+      op = _generate
+    else:
+      op = _copy
+    runner.background_run(op)
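The dispatch at the end of this hunk can be read as a single predicate: generate when the setting asks for it, otherwise fall back to copying a previously generated file. A hedged standalone restatement (the function name is illustrative, not part of the library):

```python
# Illustrative restatement of the generate-vs-copy decision above.
def should_generate(generate_example_html, example_id, newly_processed) -> bool:
  if generate_example_html == 'all':
    return True
  if generate_example_html == 'new':
    return newly_processed
  if isinstance(generate_example_html, list):
    return example_id in generate_example_html
  return False  # 'no': do not generate; the copy path may still run.

assert should_generate('new', 3, newly_processed=True)
assert not should_generate('no', 3, newly_processed=True)
```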
@@ -15,6 +15,7 @@ import os
 import tempfile
 import unittest
 
+from langfun.core.eval.v2 import checkpointing
 from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import reporting
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
@@ -26,15 +27,131 @@ class ReportingTest(unittest.TestCase):
   def test_reporting(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_reporting')
     experiment = eval_test_helper.test_experiment()
+    checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
+    reporter = reporting.HtmlReporter()
+    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    self.assertTrue(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
+    for leaf in experiment.leaf_nodes:
+      self.assertTrue(
+          pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
+      )
+      for i in range(leaf.num_examples):
+        self.assertTrue(
+            pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        )
+      found_generation_log = False
+      for log_entry in leaf._log_entries:
+        if 'generated in' in log_entry.message:
+          found_generation_log = True
+          break
+      self.assertTrue(found_generation_log)
+
+    # Test warm start.
+    root_dir = os.path.join(tempfile.gettempdir(), 'test_reporting2')
+    experiment = eval_test_helper.test_experiment()
+    run = experiment.run(
+        root_dir, 'new', plugins=[checkpointer, reporter],
+        warm_start_from=run.output_root
+    )
+    self.assertTrue(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
+    for leaf in experiment.leaf_nodes:
+      self.assertTrue(
+          pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
+      )
+      for i in range(leaf.num_examples):
+        self.assertTrue(
+            pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        )
+      found_copy_log = False
+      for log_entry in leaf._log_entries:
+        if 'copied in' in log_entry.message:
+          found_copy_log = True
+          break
+      self.assertTrue(found_copy_log)
+
+  def test_index_html_generation_error(self):
+    root_dir = os.path.join(
+        tempfile.gettempdir(),
+        'test_reporting_with_index_html_generation_error'
+    )
+    experiment = (eval_test_helper
+                  .test_experiment_with_index_html_generation_error())
     reporter = reporting.HtmlReporter()
     run = experiment.run(root_dir, 'new', plugins=[reporter])
-    pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    self.assertFalse(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
+    for leaf in experiment.leaf_nodes:
+      self.assertFalse(
+          pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
+      )
+    found_error_log = False
+    for log_entry in experiment._log_entries:
+      if log_entry.message.startswith('Failed to generate'):
+        found_error_log = True
+        break
+    self.assertTrue(found_error_log)
+
+  def test_example_html_generation_error(self):
+    root_dir = os.path.join(
+        tempfile.gettempdir(),
+        'test_reporting_with_example_html_generation_error'
+    )
+    experiment = (eval_test_helper
+                  .test_experiment_with_example_html_generation_error())
+    checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
+    reporter = reporting.HtmlReporter()
+    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    self.assertTrue(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
+    for leaf in experiment.leaf_nodes:
+      self.assertTrue(
+          pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
+      )
+      for i in range(leaf.num_examples):
+        self.assertFalse(
+            pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        )
+    found_error_log = False
+    for log_entry in experiment._log_entries:
+      if log_entry.message.startswith('Failed to generate'):
+        found_error_log = True
+        break
+    self.assertTrue(found_error_log)
+
+    # Test warm start.
+    root_dir = os.path.join(
+        tempfile.gettempdir(),
+        'test_reporting_with_example_html_generation_error2'
+    )
+    experiment = (eval_test_helper
+                  .test_experiment_with_example_html_generation_error())
+    run = experiment.run(
+        root_dir, 'new', plugins=[checkpointer, reporter],
+        warm_start_from=run.output_root
+    )
+    self.assertTrue(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
     for leaf in experiment.leaf_nodes:
       self.assertTrue(
           pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
       )
       for i in range(leaf.num_examples):
-        pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        self.assertFalse(
+            pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        )
+      found_error_log = False
+      for log_entry in experiment._log_entries:
+        if log_entry.message.startswith('Skip copying'):
+          found_error_log = True
+          break
+      self.assertTrue(found_error_log)
 
 
 if __name__ == '__main__':
@@ -123,6 +123,7 @@ class RunnerBase(Runner):
   def on_experiment_start(self, experiment: Experiment) -> None:
     """Called when an evaluation is started."""
     # Start the progress of the evaluation.
+    num_examples_to_evaluate = 0
     if experiment.is_leaf:
       assert isinstance(experiment, Evaluation)
       num_examples_to_evaluate = (
@@ -130,10 +131,6 @@ class RunnerBase(Runner):
           if self.current_run.example_ids else experiment.num_examples
       )
       experiment.progress.start(total=num_examples_to_evaluate)
-      experiment.info(
-          'Starting evaluation %s with %d examples to evaluate.'
-          % (experiment.id, num_examples_to_evaluate)
-      )
     else:
       experiment.progress.start(total=len(experiment.leaf_nodes))
 
@@ -141,6 +138,12 @@ class RunnerBase(Runner):
     for plugin in self._all_plugins(experiment):
       plugin.on_experiment_start(self, experiment)
 
+    if experiment.is_leaf:
+      experiment.info(
+          f'Starting evaluation {experiment.id!r} with '
+          f'{num_examples_to_evaluate} examples to evaluate.'
+      )
+
   def on_experiment_skipped(self, experiment: Experiment) -> None:
     """Called when an evaluation is skipped."""
     # Skip event will only be triggered for leaf evaluations.
@@ -32,6 +32,7 @@ from langfun.core.llms.rest import REST
 
 # Gemini models.
 from langfun.core.llms.google_genai import GenAI
+from langfun.core.llms.google_genai import GeminiFlash2_0ThinkingExp
 from langfun.core.llms.google_genai import GeminiFlash2_0Exp
 from langfun.core.llms.google_genai import GeminiExp_20241114
 from langfun.core.llms.google_genai import GeminiExp_20241206
@@ -45,6 +46,7 @@ from langfun.core.llms.google_genai import Palm2_IT
 # OpenAI models.
 from langfun.core.llms.openai import OpenAI
 
+from langfun.core.llms.openai import GptO1
 from langfun.core.llms.openai import GptO1Preview
 from langfun.core.llms.openai import GptO1Preview_20240912
 from langfun.core.llms.openai import GptO1Mini
@@ -106,6 +108,7 @@ from langfun.core.llms.anthropic import VertexAIAnthropic
 from langfun.core.llms.anthropic import VertexAIClaude3_5_Sonnet_20241022
 from langfun.core.llms.anthropic import VertexAIClaude3_5_Sonnet_20240620
 from langfun.core.llms.anthropic import VertexAIClaude3_5_Haiku_20241022
+from langfun.core.llms.anthropic import VertexAIClaude3_Opus_20240229
 
 from langfun.core.llms.groq import Groq
 from langfun.core.llms.groq import GroqLlama3_2_3B
@@ -124,6 +127,7 @@ from langfun.core.llms.groq import GroqWhisper_Large_v3Turbo
 from langfun.core.llms.vertexai import VertexAI
 from langfun.core.llms.vertexai import VertexAIGemini2_0
 from langfun.core.llms.vertexai import VertexAIGeminiFlash2_0Exp
+from langfun.core.llms.vertexai import VertexAIGeminiFlash2_0ThinkingExp
 from langfun.core.llms.vertexai import VertexAIGemini1_5
 from langfun.core.llms.vertexai import VertexAIGeminiPro1_5
 from langfun.core.llms.vertexai import VertexAIGeminiPro1_5_001
@@ -67,6 +67,13 @@ SUPPORTED_MODELS_AND_SETTINGS = {
         cost_per_1k_input_tokens=0.001,
         cost_per_1k_output_tokens=0.005,
     ),
+    'claude-3-opus@20240229': pg.Dict(
+        max_tokens=4096,
+        rpm=4000,
+        tpm=400000,
+        cost_per_1k_input_tokens=0.015,
+        cost_per_1k_output_tokens=0.075,
+    ),
     # Anthropic hosted models.
     'claude-3-5-sonnet-20241022': pg.Dict(
         max_tokens=8192,
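At these per-1k-token rates, request cost is linear in token counts. A quick illustrative check (not library code):

```python
# Illustrative cost arithmetic from the claude-3-opus settings above.
input_tokens, output_tokens = 2_000, 1_000
cost = (input_tokens / 1000) * 0.015 + (output_tokens / 1000) * 0.075
print(f'${cost:.3f}')  # $0.105
```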
@@ -461,6 +468,11 @@ class VertexAIAnthropic(Anthropic):
     return request
 
 
+class VertexAIClaude3_Opus_20240229(VertexAIAnthropic):  # pylint: disable=invalid-name
+  """Anthropic's Claude 3 Opus model on VertexAI."""
+  model = 'claude-3-opus@20240229'
+
+
 class VertexAIClaude3_5_Sonnet_20241022(VertexAIAnthropic):  # pylint: disable=invalid-name
   """Anthropic's Claude 3.5 Sonnet model on VertexAI."""
   model = 'claude-3-5-sonnet-v2@20241022'
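A minimal usage sketch for the new class, assuming Vertex AI credentials and project configuration are already in place and that the model is enabled for your project (the prompt and access pattern are illustrative):

```python
# Minimal sketch; assumes Vertex AI credentials/project are configured.
import langfun as lf

lm = lf.llms.VertexAIClaude3_Opus_20240229()
print(lf.query('What is 1 + 1?', int, lm=lm))
```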
@@ -48,6 +48,7 @@ class GenAI(lf.LanguageModel):
 
   model: Annotated[
       Literal[
+          'gemini-2.0-flash-thinking-exp-1219',
          'gemini-2.0-flash-exp',
          'gemini-exp-1206',
          'gemini-exp-1114',
@@ -307,6 +308,16 @@ _GOOGLE_GENAI_MODEL_HUB = _ModelHub()
 #
 # Public Gemini models.
 #
+class GeminiFlash2_0ThinkingExp(GenAI):  # pylint: disable=invalid-name
+  """Gemini 2.0 Flash Thinking Experimental model."""
+
+  model = 'gemini-2.0-flash-thinking-exp-1219'
+  supported_modalities = (
+      vertexai.DOCUMENT_TYPES
+      + vertexai.IMAGE_TYPES
+      + vertexai.AUDIO_TYPES
+      + vertexai.VIDEO_TYPES
+  )
 
 
 class GeminiFlash2_0Exp(GenAI):  # pylint: disable=invalid-name
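A minimal usage sketch for the new thinking model, assuming a Google AI Studio API key (placeholder shown) and that `GenAI` models can be invoked like other langfun language models:

```python
# Minimal sketch; the api_key value is a placeholder, not part of this diff.
import langfun as lf

lm = lf.llms.GeminiFlash2_0ThinkingExp(api_key='<your-api-key>')
print(lm('Think step by step: what is 17 * 24?'))
```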