langfun 0.1.2.dev202501010804__py3-none-any.whl → 0.1.2.dev202501060804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. langfun/core/__init__.py +0 -4
  2. langfun/core/eval/matching.py +2 -2
  3. langfun/core/eval/scoring.py +6 -2
  4. langfun/core/eval/v2/checkpointing.py +106 -72
  5. langfun/core/eval/v2/checkpointing_test.py +108 -3
  6. langfun/core/eval/v2/eval_test_helper.py +56 -0
  7. langfun/core/eval/v2/evaluation.py +25 -4
  8. langfun/core/eval/v2/evaluation_test.py +11 -0
  9. langfun/core/eval/v2/example.py +11 -1
  10. langfun/core/eval/v2/example_test.py +16 -2
  11. langfun/core/eval/v2/experiment.py +83 -19
  12. langfun/core/eval/v2/experiment_test.py +121 -3
  13. langfun/core/eval/v2/reporting.py +67 -20
  14. langfun/core/eval/v2/reporting_test.py +119 -2
  15. langfun/core/eval/v2/runners.py +7 -4
  16. langfun/core/llms/__init__.py +23 -24
  17. langfun/core/llms/anthropic.py +12 -0
  18. langfun/core/llms/cache/in_memory.py +6 -0
  19. langfun/core/llms/cache/in_memory_test.py +5 -0
  20. langfun/core/llms/gemini.py +507 -0
  21. langfun/core/llms/gemini_test.py +195 -0
  22. langfun/core/llms/google_genai.py +46 -310
  23. langfun/core/llms/google_genai_test.py +9 -204
  24. langfun/core/llms/openai.py +23 -37
  25. langfun/core/llms/vertexai.py +28 -348
  26. langfun/core/llms/vertexai_test.py +6 -166
  27. {langfun-0.1.2.dev202501010804.dist-info → langfun-0.1.2.dev202501060804.dist-info}/METADATA +7 -13
  28. {langfun-0.1.2.dev202501010804.dist-info → langfun-0.1.2.dev202501060804.dist-info}/RECORD +31 -31
  29. {langfun-0.1.2.dev202501010804.dist-info → langfun-0.1.2.dev202501060804.dist-info}/WHEEL +1 -1
  30. langfun/core/repr_utils.py +0 -204
  31. langfun/core/repr_utils_test.py +0 -90
  32. {langfun-0.1.2.dev202501010804.dist-info → langfun-0.1.2.dev202501060804.dist-info}/LICENSE +0 -0
  33. {langfun-0.1.2.dev202501010804.dist-info → langfun-0.1.2.dev202501060804.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/experiment.py
@@ -105,8 +105,8 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
     # metrics as needed.
     experiment.run(root_dir, '20241031_1')
 
-    # Refresh the previous run located in 'run_20241031_1'.
-    experiment.run(root_dir, '20241031_1', refresh=True)
+    # Reprocess the previous run located in 'run_20241031_1'.
+    experiment.run(root_dir, '20241031_1', reprocess=True)
     ```
 
   # Experiment Registration and Lookup
@@ -380,7 +380,8 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
       filter: Callable[['Experiment'], bool] | None = None,  # pylint: disable=redefined-builtin
       example_ids: list[int] | None = None,
       raise_if_has_error: bool = False,
-      refresh: bool = False,
+      reprocess: bool | list[int] = False,
+      generate_example_html: Literal['new', 'all', 'no'] | list[int] = 'new',
       process_timeout: int | None = None,
       use_cache: Literal['global', 'per_dataset', 'no'] = 'per_dataset',
       note: str | None = None,
@@ -391,22 +392,25 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
    """Runs the experiment.
 
    Examples:
-      # Start a new run.
-      experiment.run('new')
+      # Start a new run under root_dir.
+      experiment.run(root_dir, 'new')
 
      # Continue the latest experiment run.
-      experiment.run('latest')
+      experiment.run(root_dir, 'latest')
 
      # Continue the latest experiment run or start a new run if it does not
      # exist.
-      experiment.run()
+      experiment.run(root_dir)
 
-      # Start a new run and warm start from a previous run under sub-dir
-      # 'run_20241031_1'.
-      experiment.run('new', warm_start_from='20241031_1')
+      # Start a new run and warm start from another run's directory
+      # '/path/to/another/run_20241031_1/'.
+      experiment.run(
+          root_dir, 'new',
+          warm_start_from='/path/to/another/run_20241031_1/'
+      )
 
-      # Refresh previous run under sub-dir 'run_20241031_1'.
-      experiment.run('20241031_1', refresh=True)
+      # Reprocess previous run under sub-dir 'run_20241031_1'.
+      experiment.run(root_dir, '20241031_1', reprocess=True)
 
    Args:
      root_dir: The root of the output directory of the experiment.
@@ -426,8 +430,18 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
      example_ids: The example IDs to run. If None, it will run all examples.
      raise_if_has_error: If True, it will raise an error if any example fails.
        Otherwise, it will continue and report the error in the output.
-      refresh: Whether to refresh the experiment. If True, it will delete the
-        data under the current experiment run directory and start a new run.
+      reprocess: A boolean or a list of example IDs. If a boolean, it
+        indicates whether all the examples to be evaluated will be
+        reprocessed, meaning that existing checkpoints will be ignored.
+        If a list of example IDs, it indicates that only the specified
+        examples will be reprocessed.
+      generate_example_html: One of 'new', 'all', 'no', or a list of example
+        IDs. If 'new', generate HTML files for all newly processed examples,
+        and keep/copy existing HTML files for unchanged examples.
+        If 'all', generate HTML files for all examples.
+        If 'no', do not generate HTML files for any examples.
+        If a list of example IDs, generate HTML files for the specified
+        examples.
      process_timeout: The timeout in seconds for each process. If None, it
        will use the default timeout for the runner.
      use_cache: Whether to use LLM cache for the experiment.
@@ -454,7 +468,8 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
        filter=filter,
        example_ids=example_ids,
        raise_if_has_error=raise_if_has_error,
-        refresh=refresh,
+        reprocess=reprocess,
+        generate_example_html=generate_example_html,
        use_cache=use_cache,
        process_timeout=process_timeout,
        note=note,
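
The hunks above replace the boolean 'refresh' flag with the more granular 'reprocess' parameter and introduce 'generate_example_html'. A minimal usage sketch of the new signature (the experiment object and root_dir here are illustrative, not part of the diff):

    import langfun as lf  # assuming this langfun version is installed

    experiment = ...           # any lf.eval.v2 Experiment
    root_dir = '/tmp/my_eval'  # illustrative output root

    # Re-run only examples 1 and 3 of the '20241031_1' run, keep the other
    # checkpointed results, and regenerate HTML just for reprocessed examples.
    experiment.run(
        root_dir, '20241031_1',
        reprocess=[1, 3],
        generate_example_html='new',
    )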
@@ -815,14 +830,27 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
      'The user tags for the current run.'
  ] = []
 
-  refresh: Annotated[
-      bool,
+  reprocess: Annotated[
+      bool | list[int],
      (
-          'If True, it will delete the data under the current '
-          'run directory and start a new run.'
+          'If True, it will reprocess all examples under the current '
+          'run directory. If a list of integers, examples of the given IDs '
+          'will be reprocessed.'
      )
  ] = False
 
+  generate_example_html: Annotated[
+      Literal['new', 'all', 'no'] | list[int],
+      (
+          'If "new", generate HTML files for all newly processed examples, '
+          'and keep/copy existing HTML files for unchanged examples. '
+          'If "all", generate HTML files for all examples. '
+          'If "no", do not generate HTML files for any examples. '
+          'If a list of example IDs, generate HTML files for the specified '
+          'examples.'
+      )
+  ] = 'new'
+
  filter: Annotated[
      Callable[[Experiment], bool] | None,
      'A filter to decide whether a leaf experiment should be run or not.'
@@ -873,6 +901,42 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
    """Returns the output path for the experiment."""
    return os.path.join(self.output_dir(experiment), relative_path)
 
+  def examples_to_evaluate(self, experiment: Experiment) -> set[int]:
+    """Returns the example IDs to evaluate."""
+    if not experiment.is_leaf:
+      return set()
+    return set(
+        self.example_ids if self.example_ids else
+        range(1, experiment.num_examples + 1)
+    )
+
+  def examples_to_reprocess(self, experiment: Experiment) -> set[int]:
+    """Returns the example IDs to reprocess per request."""
+    if not self.reprocess:
+      return set()
+    reprocess_ids = self.examples_to_evaluate(experiment)
+    if isinstance(self.reprocess, list):
+      reprocess_ids &= set(self.reprocess)
+    return reprocess_ids
+
+  def examples_to_load(self, experiment: Experiment) -> set[int]:
+    """Returns the example IDs to load from checkpoint files."""
+    load_ids = self.examples_to_evaluate(experiment)
+    if isinstance(self.generate_example_html, list):
+      load_ids |= set(self.generate_example_html)
+    load_ids -= self.examples_to_reprocess(experiment)
+    return load_ids
+
+  def examples_to_load_metadata(self, experiment: Experiment) -> set[int]:
+    """Returns the example IDs to load metadata for."""
+    load_metadata_ids = set()
+    if isinstance(self.generate_example_html, list):
+      load_metadata_ids = set(self.generate_example_html)
+    elif self.generate_example_html == 'all':
+      load_metadata_ids = self.examples_to_evaluate(experiment)
+    load_metadata_ids -= self.examples_to_reprocess(experiment)
+    return load_metadata_ids
+
 
 class Runner(pg.Object):
   """Interface for experiment runner."""
langfun/core/eval/v2/experiment_test.py
@@ -31,10 +31,10 @@ Runner = experiment_lib.Runner
 
 
 @pg.functor()
-def sample_inputs():
+def sample_inputs(num_examples: int = 1):
   return [
       pg.Dict(x=1)
-  ]
+  ] * num_examples
 
 
 class MyEvaluation(Evaluation):
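
Parameterizing the functor lets the test helpers build datasets of arbitrary size, which the new RunTest cases below rely on:

    inputs = sample_inputs(10)  # [pg.Dict(x=1)] * 10, i.e. a 10-example dataset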
@@ -208,7 +208,7 @@ class RunIdTest(unittest.TestCase):
 
 class RunTest(unittest.TestCase):
 
-  def test_basic(self):
+  def test_input_output_paths(self):
    run = Run(
        '/root',
        RunId.from_id('20241102_0'),
@@ -270,6 +270,124 @@ class RunTest(unittest.TestCase):
        )
    )
 
+  def test_examples_start_from_scratch(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+    )
+    root = run.experiment
+    self.assertEqual(run.examples_to_evaluate(root), set())
+    self.assertEqual(run.examples_to_reprocess(root), set())
+    self.assertEqual(run.examples_to_load(root), set())
+    self.assertEqual(run.examples_to_load_metadata(root), set())
+
+    exp = root.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set(range(1, 11)))
+    self.assertEqual(run.examples_to_reprocess(exp), set())
+    self.assertEqual(run.examples_to_load(exp), set(range(1, 11)))
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_example_ids(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5]
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set())
+    self.assertEqual(run.examples_to_load(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_reprocess_all(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=True
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_load(exp), set())
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_reprocess_some(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=[1],
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1]))
+    self.assertEqual(run.examples_to_load(exp), set([3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_generate_example_html_all(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=[1],
+        generate_example_html='all',
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1]))
+    self.assertEqual(run.examples_to_load(exp), set([3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set([3, 5]))
+
+  def test_examples_with_generate_example_html_new(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=[1],
+        generate_example_html='new',
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1]))
+    self.assertEqual(run.examples_to_load(exp), set([3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set())
+
+  def test_examples_with_generate_example_html_some(self):
+    run = Run(
+        '/root',
+        RunId.from_id('20241102_0'),
+        pg.Ref(Suite([
+            MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
+        ])),
+        example_ids=[1, 3, 5],
+        reprocess=[1],
+        generate_example_html=[1, 2, 3],
+    )
+    exp = run.experiment.leaf_nodes[0]
+    self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
+    self.assertEqual(run.examples_to_reprocess(exp), set([1]))
+    self.assertEqual(run.examples_to_load(exp), set([2, 3, 5]))
+    self.assertEqual(run.examples_to_load_metadata(exp), set([2, 3]))
+
 
 class RunnerTest(unittest.TestCase):
 
langfun/core/eval/v2/reporting.py
@@ -51,6 +51,7 @@ class HtmlReporter(experiment_lib.Plugin):
    self._update_thread = None
    self._stop_update = False
    self._stop_update_experiment_ids = set()
+    self._experiment_index_lock = None
 
  def on_run_start(
      self,
@@ -61,6 +62,9 @@ class HtmlReporter(experiment_lib.Plugin):
    self._last_experiment_report_time = {leaf.id: 0 for leaf in root.leaf_nodes}
    self._stop_update = False
    self._stop_update_experiment_ids = set()
+    self._experiment_index_lock = {
+        leaf.id: threading.Lock() for leaf in root.leaf_nodes
+    }
    self._update_thread = threading.Thread(
        target=self._update_thread_func, args=(runner,)
    )
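
The reporter now holds one lock per leaf experiment so the periodic update thread and per-example callbacks cannot write the same index.html concurrently (see the guarded save in the next hunk). A minimal sketch of the pattern (names are illustrative, not the actual HtmlReporter internals):

    import threading

    index_locks = {leaf_id: threading.Lock() for leaf_id in ('exp_1', 'exp_2')}

    def save_index(leaf_id: str, content: str, path: str) -> None:
      # Serialize writers per experiment; unsynchronized writers to the same
      # index.html could otherwise interleave partial file contents.
      with index_locks[leaf_id]:
        with open(path, 'w') as f:
          f.write(content)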
@@ -170,13 +174,14 @@ class HtmlReporter(experiment_lib.Plugin):
              card_view=False,
          ),
      )
-      html.save(index_html_path)
+      with self._experiment_index_lock[experiment.id]:
+        html.save(index_html_path)
      experiment.info(
-          f'Generated HTML {index_html_path!r} in {t.elapse:.2f} seconds.',
+          f'Updated {index_html_path!r} in {t.elapse:.2f} seconds.',
      )
    except BaseException as e:  # pylint: disable=broad-except
      experiment.error(
-          f'Failed to save HTML {index_html_path!r}. '
+          f'Failed to generate {index_html_path!r}. '
          f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
      )
      raise e
@@ -185,35 +190,77 @@ class HtmlReporter(experiment_lib.Plugin):
        time.time() - self._last_experiment_report_time[experiment.id]
        > self.experiment_report_interval
    ):
+      self._last_experiment_report_time[experiment.id] = time.time()
      if background:
        runner.background_run(_save)
      else:
        _save()
-      self._last_experiment_report_time[experiment.id] = time.time()
 
  def _save_example_html(
      self, runner: Runner, experiment: Experiment, example: Example
  ) -> None:
-    """Saves the example."""
-    def _save():
+    """Saves the example in HTML format."""
+    current_run = runner.current_run
+    def _generate():
      try:
-        html = example.to_html(
-            collapse_level=None,
-            enable_summary_tooltip=False,
-            extra_flags=dict(
-                # For properly rendering the next link.
-                num_examples=getattr(experiment, 'num_examples', None)
-            ),
-        )
-        html.save(
-            runner.current_run.output_path_for(
-                experiment, f'{example.id}.html'
-            )
+        with pg.timeit() as t:
+          html = example.to_html(
+              collapse_level=None,
+              enable_summary_tooltip=False,
+              extra_flags=dict(
+                  # For properly rendering the next link.
+                  num_examples=getattr(experiment, 'num_examples', None)
+              ),
+          )
+          html.save(
+              runner.current_run.output_path_for(
+                  experiment, f'{example.id}.html'
+              )
+          )
+        experiment.info(
+            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
        )
      except BaseException as e:  # pylint: disable=broad-except
        experiment.error(
-            f'Failed to save HTML {example.id}.html. '
+            f'Failed to generate \'{example.id}.html\'. '
            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
        )
        raise e
-    runner.background_run(_save)
+
+    def _copy():
+      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
+      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
+
+      if src_file == dest_file:
+        return
+
+      if not pg.io.path_exists(src_file):
+        experiment.warning(
+            f'Skip copying \'{example.id}.html\' as '
+            f'{src_file!r} does not exist.'
+        )
+        return
+
+      try:
+        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
+          content = src.read()
+          with pg.io.open(dest_file, 'w') as dest:
+            dest.write(content)
+        experiment.info(
+            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
+        )
+        raise e
+
+    generate_example_html = current_run.generate_example_html
+    if (generate_example_html == 'all'
+        or (generate_example_html == 'new' and example.newly_processed)
+        or (isinstance(generate_example_html, list)
+            and example.id in generate_example_html)):
+      op = _generate
+    else:
+      op = _copy
+    runner.background_run(op)
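
The dispatch at the end of _save_example_html boils down to a small predicate: regenerate an example's HTML when requested, otherwise fall back to copying the file from the warm-start input directory. A hedged restatement of that logic (a sketch, not the module's actual code):

    def should_generate(generate_example_html, example_id, newly_processed):
      # 'all' regenerates everything; 'new' only newly processed examples;
      # a list regenerates the listed IDs; anything else (e.g. 'no') falls
      # through to the copy path.
      if generate_example_html == 'all':
        return True
      if generate_example_html == 'new':
        return newly_processed
      if isinstance(generate_example_html, list):
        return example_id in generate_example_html
      return False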
langfun/core/eval/v2/reporting_test.py
@@ -15,6 +15,7 @@ import os
 import tempfile
 import unittest
 
+from langfun.core.eval.v2 import checkpointing
 from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import reporting
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
@@ -26,15 +27,131 @@ class ReportingTest(unittest.TestCase):
  def test_reporting(self):
    root_dir = os.path.join(tempfile.gettempdir(), 'test_reporting')
    experiment = eval_test_helper.test_experiment()
+    checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
+    reporter = reporting.HtmlReporter()
+    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    self.assertTrue(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
+    for leaf in experiment.leaf_nodes:
+      self.assertTrue(
+          pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
+      )
+      for i in range(leaf.num_examples):
+        self.assertTrue(
+            pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        )
+      found_generation_log = False
+      for log_entry in leaf._log_entries:
+        if 'generated in' in log_entry.message:
+          found_generation_log = True
+          break
+      self.assertTrue(found_generation_log)
+
+    # Test warm start.
+    root_dir = os.path.join(tempfile.gettempdir(), 'test_reporting2')
+    experiment = eval_test_helper.test_experiment()
+    run = experiment.run(
+        root_dir, 'new', plugins=[checkpointer, reporter],
+        warm_start_from=run.output_root
+    )
+    self.assertTrue(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
+    for leaf in experiment.leaf_nodes:
+      self.assertTrue(
+          pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
+      )
+      for i in range(leaf.num_examples):
+        self.assertTrue(
+            pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        )
+      found_copy_log = False
+      for log_entry in leaf._log_entries:
+        if 'copied in' in log_entry.message:
+          found_copy_log = True
+          break
+      self.assertTrue(found_copy_log)
+
+  def test_index_html_generation_error(self):
+    root_dir = os.path.join(
+        tempfile.gettempdir(),
+        'test_reporting_with_index_html_generation_error'
+    )
+    experiment = (eval_test_helper
+                  .test_experiment_with_index_html_generation_error())
    reporter = reporting.HtmlReporter()
    run = experiment.run(root_dir, 'new', plugins=[reporter])
-    pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    self.assertFalse(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
+    for leaf in experiment.leaf_nodes:
+      self.assertFalse(
+          pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
+      )
+    found_error_log = False
+    for log_entry in experiment._log_entries:
+      if log_entry.message.startswith('Failed to generate'):
+        found_error_log = True
+        break
+    self.assertTrue(found_error_log)
+
+  def test_example_html_generation_error(self):
+    root_dir = os.path.join(
+        tempfile.gettempdir(),
+        'test_reporting_with_example_html_generation_error'
+    )
+    experiment = (eval_test_helper
+                  .test_experiment_with_example_html_generation_error())
+    checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
+    reporter = reporting.HtmlReporter()
+    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    self.assertTrue(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
+    for leaf in experiment.leaf_nodes:
+      self.assertTrue(
+          pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
+      )
+      for i in range(leaf.num_examples):
+        self.assertFalse(
+            pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        )
+    found_error_log = False
+    for log_entry in experiment._log_entries:
+      if log_entry.message.startswith('Failed to generate'):
+        found_error_log = True
+        break
+    self.assertTrue(found_error_log)
+
+    # Test warm start.
+    root_dir = os.path.join(
+        tempfile.gettempdir(),
+        'test_reporting_with_example_html_generation_error2'
+    )
+    experiment = (eval_test_helper
+                  .test_experiment_with_example_html_generation_error())
+    run = experiment.run(
+        root_dir, 'new', plugins=[checkpointer, reporter],
+        warm_start_from=run.output_root
+    )
+    self.assertTrue(
+        pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
+    )
    for leaf in experiment.leaf_nodes:
      self.assertTrue(
          pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
      )
      for i in range(leaf.num_examples):
-        pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        self.assertFalse(
+            pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
+        )
+      found_error_log = False
+      for log_entry in experiment._log_entries:
+        if log_entry.message.startswith('Skip copying'):
+          found_error_log = True
+          break
+      self.assertTrue(found_error_log)
 
 
 if __name__ == '__main__':
langfun/core/eval/v2/runners.py
@@ -123,6 +123,7 @@ class RunnerBase(Runner):
  def on_experiment_start(self, experiment: Experiment) -> None:
    """Called when an evaluation is started."""
    # Start the progress of the evaluation.
+    num_examples_to_evaluate = 0
    if experiment.is_leaf:
      assert isinstance(experiment, Evaluation)
      num_examples_to_evaluate = (
@@ -130,10 +131,6 @@ class RunnerBase(Runner):
          if self.current_run.example_ids else experiment.num_examples
      )
      experiment.progress.start(total=num_examples_to_evaluate)
-      experiment.info(
-          'Starting evaluation %s with %d examples to evaluate.'
-          % (experiment.id, num_examples_to_evaluate)
-      )
    else:
      experiment.progress.start(total=len(experiment.leaf_nodes))
 
@@ -141,6 +138,12 @@ class RunnerBase(Runner):
    for plugin in self._all_plugins(experiment):
      plugin.on_experiment_start(self, experiment)
 
+    if experiment.is_leaf:
+      experiment.info(
+          f'Starting evaluation {experiment.id!r} with '
+          f'{num_examples_to_evaluate} examples to evaluate.'
+      )
+
  def on_experiment_skipped(self, experiment: Experiment) -> None:
    """Called when an evaluation is skipped."""
    # Skip event will only be triggered for leaf evaluations.
langfun/core/llms/__init__.py
@@ -32,19 +32,35 @@ from langfun.core.llms.rest import REST
 
 # Gemini models.
 from langfun.core.llms.google_genai import GenAI
+from langfun.core.llms.google_genai import GeminiFlash2_0ThinkingExp_20241219
 from langfun.core.llms.google_genai import GeminiFlash2_0Exp
-from langfun.core.llms.google_genai import GeminiExp_20241114
 from langfun.core.llms.google_genai import GeminiExp_20241206
-from langfun.core.llms.google_genai import GeminiFlash1_5
+from langfun.core.llms.google_genai import GeminiExp_20241114
 from langfun.core.llms.google_genai import GeminiPro1_5
-from langfun.core.llms.google_genai import GeminiPro
-from langfun.core.llms.google_genai import GeminiProVision
-from langfun.core.llms.google_genai import Palm2
-from langfun.core.llms.google_genai import Palm2_IT
+from langfun.core.llms.google_genai import GeminiPro1_5_002
+from langfun.core.llms.google_genai import GeminiPro1_5_001
+from langfun.core.llms.google_genai import GeminiFlash1_5
+from langfun.core.llms.google_genai import GeminiFlash1_5_002
+from langfun.core.llms.google_genai import GeminiFlash1_5_001
+from langfun.core.llms.google_genai import GeminiPro1
+
+from langfun.core.llms.vertexai import VertexAI
+from langfun.core.llms.vertexai import VertexAIGeminiFlash2_0ThinkingExp_20241219
+from langfun.core.llms.vertexai import VertexAIGeminiFlash2_0Exp
+from langfun.core.llms.vertexai import VertexAIGeminiExp_20241206
+from langfun.core.llms.vertexai import VertexAIGeminiExp_20241114
+from langfun.core.llms.vertexai import VertexAIGeminiPro1_5
+from langfun.core.llms.vertexai import VertexAIGeminiPro1_5_002
+from langfun.core.llms.vertexai import VertexAIGeminiPro1_5_001
+from langfun.core.llms.vertexai import VertexAIGeminiFlash1_5
+from langfun.core.llms.vertexai import VertexAIGeminiFlash1_5_002
+from langfun.core.llms.vertexai import VertexAIGeminiFlash1_5_001
+from langfun.core.llms.vertexai import VertexAIGeminiPro1
 
 # OpenAI models.
 from langfun.core.llms.openai import OpenAI
 
+from langfun.core.llms.openai import GptO1
 from langfun.core.llms.openai import GptO1Preview
 from langfun.core.llms.openai import GptO1Preview_20240912
 from langfun.core.llms.openai import GptO1Mini
@@ -106,6 +122,7 @@ from langfun.core.llms.anthropic import VertexAIAnthropic
 from langfun.core.llms.anthropic import VertexAIClaude3_5_Sonnet_20241022
 from langfun.core.llms.anthropic import VertexAIClaude3_5_Sonnet_20240620
 from langfun.core.llms.anthropic import VertexAIClaude3_5_Haiku_20241022
+from langfun.core.llms.anthropic import VertexAIClaude3_Opus_20240229
 
 from langfun.core.llms.groq import Groq
 from langfun.core.llms.groq import GroqLlama3_2_3B
@@ -121,24 +138,6 @@ from langfun.core.llms.groq import GroqGemma_7B_IT
 from langfun.core.llms.groq import GroqWhisper_Large_v3
 from langfun.core.llms.groq import GroqWhisper_Large_v3Turbo
 
-from langfun.core.llms.vertexai import VertexAI
-from langfun.core.llms.vertexai import VertexAIGemini2_0
-from langfun.core.llms.vertexai import VertexAIGeminiFlash2_0Exp
-from langfun.core.llms.vertexai import VertexAIGemini1_5
-from langfun.core.llms.vertexai import VertexAIGeminiPro1_5
-from langfun.core.llms.vertexai import VertexAIGeminiPro1_5_001
-from langfun.core.llms.vertexai import VertexAIGeminiPro1_5_002
-from langfun.core.llms.vertexai import VertexAIGeminiPro1_5_0514
-from langfun.core.llms.vertexai import VertexAIGeminiPro1_5_0409
-from langfun.core.llms.vertexai import VertexAIGeminiFlash1_5
-from langfun.core.llms.vertexai import VertexAIGeminiFlash1_5_001
-from langfun.core.llms.vertexai import VertexAIGeminiFlash1_5_002
-from langfun.core.llms.vertexai import VertexAIGeminiFlash1_5_0514
-from langfun.core.llms.vertexai import VertexAIGeminiPro1
-from langfun.core.llms.vertexai import VertexAIGeminiPro1Vision
-from langfun.core.llms.vertexai import VertexAIEndpoint
-
-
 # LLaMA C++ models.
 from langfun.core.llms.llama_cpp import LlamaCppRemote
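
Per the file stats above (gemini.py +507; google_genai.py and vertexai.py each shrinking by roughly 300 lines), both Gemini front-ends now appear to share the new gemini.py implementation, and the same models are re-exported under both APIs. A hedged usage sketch (credential and project setup assumed):

    import langfun as lf

    lm = lf.llms.GeminiFlash2_0Exp()            # via the Google GenAI API
    # lm = lf.llms.VertexAIGeminiFlash2_0Exp()  # same model via Vertex AI
    print(lf.query('Compute 1 + 1.', int, lm=lm))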