langfun 0.1.2.dev202501020804__py3-none-any.whl → 0.1.2.dev202501040804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/core/__init__.py +0 -4
- langfun/core/eval/matching.py +2 -2
- langfun/core/eval/scoring.py +6 -2
- langfun/core/eval/v2/checkpointing.py +106 -72
- langfun/core/eval/v2/checkpointing_test.py +108 -3
- langfun/core/eval/v2/eval_test_helper.py +56 -0
- langfun/core/eval/v2/evaluation.py +25 -4
- langfun/core/eval/v2/evaluation_test.py +11 -0
- langfun/core/eval/v2/example.py +11 -1
- langfun/core/eval/v2/example_test.py +16 -2
- langfun/core/eval/v2/experiment.py +83 -19
- langfun/core/eval/v2/experiment_test.py +121 -3
- langfun/core/eval/v2/reporting.py +60 -18
- langfun/core/eval/v2/reporting_test.py +119 -2
- langfun/core/eval/v2/runners.py +7 -4
- langfun/core/llms/__init__.py +4 -0
- langfun/core/llms/anthropic.py +12 -0
- langfun/core/llms/google_genai.py +11 -0
- langfun/core/llms/openai.py +23 -37
- langfun/core/llms/vertexai.py +13 -1
- {langfun-0.1.2.dev202501020804.dist-info → langfun-0.1.2.dev202501040804.dist-info}/METADATA +1 -1
- {langfun-0.1.2.dev202501020804.dist-info → langfun-0.1.2.dev202501040804.dist-info}/RECORD +25 -27
- langfun/core/repr_utils.py +0 -204
- langfun/core/repr_utils_test.py +0 -90
- {langfun-0.1.2.dev202501020804.dist-info → langfun-0.1.2.dev202501040804.dist-info}/LICENSE +0 -0
- {langfun-0.1.2.dev202501020804.dist-info → langfun-0.1.2.dev202501040804.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202501020804.dist-info → langfun-0.1.2.dev202501040804.dist-info}/top_level.txt +0 -0
@@ -105,8 +105,8 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
|
|
105
105
|
# metrics as needed.
|
106
106
|
experiment.run(root_dir, '20241031_1')
|
107
107
|
|
108
|
-
#
|
109
|
-
experiment.run(root_dir, '20241031_1',
|
108
|
+
# Reprocess the previous run located in 'run_20241031_1'.
|
109
|
+
experiment.run(root_dir, '20241031_1', reprocess=True)
|
110
110
|
```
|
111
111
|
|
112
112
|
# Experiment Registration and Lookup
|
@@ -380,7 +380,8 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
|
|
380
380
|
filter: Callable[['Experiment'], bool] | None = None, # pylint: disable=redefined-builtin
|
381
381
|
example_ids: list[int] | None = None,
|
382
382
|
raise_if_has_error: bool = False,
|
383
|
-
|
383
|
+
reprocess: bool | list[int] = False,
|
384
|
+
generate_example_html: Literal['new', 'all', 'no'] | list[int] = 'new',
|
384
385
|
process_timeout: int | None = None,
|
385
386
|
use_cache: Literal['global', 'per_dataset', 'no'] = 'per_dataset',
|
386
387
|
note: str | None = None,
|
@@ -391,22 +392,25 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
|
|
391
392
|
"""Runs the experiment.
|
392
393
|
|
393
394
|
Examples:
|
394
|
-
# Start a new run.
|
395
|
-
experiment.run('new')
|
395
|
+
# Start a new run under root_dir.
|
396
|
+
experiment.run(root_dir, 'new')
|
396
397
|
|
397
398
|
# Continue the latest experiment run.
|
398
|
-
experiment.run('latest')
|
399
|
+
experiment.run(root_dir, 'latest')
|
399
400
|
|
400
401
|
# Continue the latest experiment run or start a new run if it does not
|
401
402
|
# exist.
|
402
|
-
experiment.run()
|
403
|
+
experiment.run(root_dir)
|
403
404
|
|
404
|
-
# Start a new run and warm start from
|
405
|
-
# 'run_20241031_1'.
|
406
|
-
experiment.run(
|
405
|
+
# Start a new run and warm start from another run's directory
|
406
|
+
# '/path/to/another/run_20241031_1/'.
|
407
|
+
experiment.run(
|
408
|
+
root_dir, 'new',
|
409
|
+
warm_start_from='/path/to/another/run_20241031_1/'
|
410
|
+
)
|
407
411
|
|
408
|
-
#
|
409
|
-
experiment.run('20241031_1',
|
412
|
+
# Reprocess previous run under sub-dir 'run_20241031_1'.
|
413
|
+
experiment.run(root_dir, '20241031_1', reprocess=True)
|
410
414
|
|
411
415
|
Args:
|
412
416
|
root_dir: The root of the output directory of the experiment.
|
@@ -426,8 +430,18 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
|
|
426
430
|
example_ids: The example IDs to run. If None, it will run all examples.
|
427
431
|
raise_if_has_error: If True, it will raise an error if any example fails.
|
428
432
|
Otherwise, it will continue and report the error in the output.
|
429
|
-
|
430
|
-
|
433
|
+
reprocess: A boolean or a list of example IDs. If boolean, it indicates
|
434
|
+
whether all the examples to be evaluated will be reprocessed,
|
435
|
+
meaning that existing checkpoints will be ignored. If a list of
|
436
|
+
example IDs, it indicates that only the specified examples will be
|
437
|
+
reprocessed.
|
438
|
+
generate_example_html: Among 'new', 'all', 'no' or a list of example IDs.
|
439
|
+
If 'new', generate HTML files for all newly processed examples, and
|
440
|
+
keep/copy existing HTML files for unchanged examples.
|
441
|
+
If 'all', generate HTML files for all examples.
|
442
|
+
If 'no', do not generate HTML files for any examples.
|
443
|
+
If a list of example IDs, generate HTML files for the specified
|
444
|
+
examples.
|
431
445
|
process_timeout: The timeout in seconds for each process. If None, it
|
432
446
|
will use the default timeout for the runner.
|
433
447
|
use_cache: Whether to use LLM cache for the experiment.
|
@@ -454,7 +468,8 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
|
|
454
468
|
filter=filter,
|
455
469
|
example_ids=example_ids,
|
456
470
|
raise_if_has_error=raise_if_has_error,
|
457
|
-
|
471
|
+
reprocess=reprocess,
|
472
|
+
generate_example_html=generate_example_html,
|
458
473
|
use_cache=use_cache,
|
459
474
|
process_timeout=process_timeout,
|
460
475
|
note=note,
|
@@ -815,14 +830,27 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
|
|
815
830
|
'The user tags for the current run.'
|
816
831
|
] = []
|
817
832
|
|
818
|
-
|
819
|
-
bool,
|
833
|
+
reprocess: Annotated[
|
834
|
+
bool | list[int],
|
820
835
|
(
|
821
|
-
'If True, it will
|
822
|
-
'run directory
|
836
|
+
'If True, it will reprocess all examples under the current '
|
837
|
+
'run directory. If a list of integers, examples of the given IDS '
|
838
|
+
'will be reprocessed.'
|
823
839
|
)
|
824
840
|
] = False
|
825
841
|
|
842
|
+
generate_example_html: Annotated[
|
843
|
+
Literal['new', 'all', 'no'] | list[int],
|
844
|
+
(
|
845
|
+
'If "new", generate HTML files for all newly processed examples, '
|
846
|
+
'and keep/copy existing HTML files for unchanged examples. '
|
847
|
+
'If "all", generate HTML files for all examples. '
|
848
|
+
'If "no", do not generate HTML files for any examples. '
|
849
|
+
'If a list of example IDs, generate HTML files for the specified '
|
850
|
+
'examples.'
|
851
|
+
)
|
852
|
+
] = 'new'
|
853
|
+
|
826
854
|
filter: Annotated[
|
827
855
|
Callable[[Experiment], bool] | None,
|
828
856
|
'A filter to decide whether a leaf experiment should be run or not.'
|
@@ -873,6 +901,42 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
|
|
873
901
|
"""Returns the output path for the experiment."""
|
874
902
|
return os.path.join(self.output_dir(experiment), relative_path)
|
875
903
|
|
904
|
+
def examples_to_evaluate(self, experiment: Experiment) -> set[int]:
|
905
|
+
"""Returns the example IDs to evaluate."""
|
906
|
+
if not experiment.is_leaf:
|
907
|
+
return set()
|
908
|
+
return set(
|
909
|
+
self.example_ids if self.example_ids else
|
910
|
+
range(1, experiment.num_examples + 1)
|
911
|
+
)
|
912
|
+
|
913
|
+
def examples_to_reprocess(self, experiment: Experiment) -> set[int]:
|
914
|
+
"""Returns the example IDs to reprocess per request."""
|
915
|
+
if not self.reprocess:
|
916
|
+
return set()
|
917
|
+
reprocess_ids = self.examples_to_evaluate(experiment)
|
918
|
+
if isinstance(self.reprocess, list):
|
919
|
+
reprocess_ids &= set(self.reprocess)
|
920
|
+
return reprocess_ids
|
921
|
+
|
922
|
+
def examples_to_load(self, experiment: Experiment) -> set[int]:
|
923
|
+
"""Returns the example IDs to load from checkpoint files."""
|
924
|
+
load_ids = self.examples_to_evaluate(experiment)
|
925
|
+
if isinstance(self.generate_example_html, list):
|
926
|
+
load_ids |= set(self.generate_example_html)
|
927
|
+
load_ids -= self.examples_to_reprocess(experiment)
|
928
|
+
return load_ids
|
929
|
+
|
930
|
+
def examples_to_load_metadata(self, experiment: Experiment) -> set[int]:
|
931
|
+
"""Returns the example IDs to load the metadata."""
|
932
|
+
load_metadata_ids = set()
|
933
|
+
if isinstance(self.generate_example_html, list):
|
934
|
+
load_metadata_ids = set(self.generate_example_html)
|
935
|
+
elif self.generate_example_html == 'all':
|
936
|
+
load_metadata_ids = self.examples_to_evaluate(experiment)
|
937
|
+
load_metadata_ids -= self.examples_to_reprocess(experiment)
|
938
|
+
return load_metadata_ids
|
939
|
+
|
876
940
|
|
877
941
|
class Runner(pg.Object):
|
878
942
|
"""Interface for experiment runner."""
|
@@ -31,10 +31,10 @@ Runner = experiment_lib.Runner
|
|
31
31
|
|
32
32
|
|
33
33
|
@pg.functor()
|
34
|
-
def sample_inputs():
|
34
|
+
def sample_inputs(num_examples: int = 1):
|
35
35
|
return [
|
36
36
|
pg.Dict(x=1)
|
37
|
-
]
|
37
|
+
] * num_examples
|
38
38
|
|
39
39
|
|
40
40
|
class MyEvaluation(Evaluation):
|
@@ -208,7 +208,7 @@ class RunIdTest(unittest.TestCase):
|
|
208
208
|
|
209
209
|
class RunTest(unittest.TestCase):
|
210
210
|
|
211
|
-
def
|
211
|
+
def test_input_output_paths(self):
|
212
212
|
run = Run(
|
213
213
|
'/root',
|
214
214
|
RunId.from_id('20241102_0'),
|
@@ -270,6 +270,124 @@ class RunTest(unittest.TestCase):
|
|
270
270
|
)
|
271
271
|
)
|
272
272
|
|
273
|
+
def test_examples_start_from_scratch(self):
|
274
|
+
run = Run(
|
275
|
+
'/root',
|
276
|
+
RunId.from_id('20241102_0'),
|
277
|
+
pg.Ref(Suite([
|
278
|
+
MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
|
279
|
+
])),
|
280
|
+
)
|
281
|
+
root = run.experiment
|
282
|
+
self.assertEqual(run.examples_to_evaluate(root), set())
|
283
|
+
self.assertEqual(run.examples_to_reprocess(root), set())
|
284
|
+
self.assertEqual(run.examples_to_load(root), set())
|
285
|
+
self.assertEqual(run.examples_to_load_metadata(root), set())
|
286
|
+
|
287
|
+
exp = root.leaf_nodes[0]
|
288
|
+
self.assertEqual(run.examples_to_evaluate(exp), set(range(1, 11)))
|
289
|
+
self.assertEqual(run.examples_to_reprocess(exp), set())
|
290
|
+
self.assertEqual(run.examples_to_load(exp), set(range(1, 11)))
|
291
|
+
self.assertEqual(run.examples_to_load_metadata(exp), set())
|
292
|
+
|
293
|
+
def test_examples_with_example_ids(self):
|
294
|
+
run = Run(
|
295
|
+
'/root',
|
296
|
+
RunId.from_id('20241102_0'),
|
297
|
+
pg.Ref(Suite([
|
298
|
+
MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
|
299
|
+
])),
|
300
|
+
example_ids=[1, 3, 5]
|
301
|
+
)
|
302
|
+
exp = run.experiment.leaf_nodes[0]
|
303
|
+
self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
|
304
|
+
self.assertEqual(run.examples_to_reprocess(exp), set())
|
305
|
+
self.assertEqual(run.examples_to_load(exp), set([1, 3, 5]))
|
306
|
+
self.assertEqual(run.examples_to_load_metadata(exp), set())
|
307
|
+
|
308
|
+
def test_examples_with_reprocess_all(self):
|
309
|
+
run = Run(
|
310
|
+
'/root',
|
311
|
+
RunId.from_id('20241102_0'),
|
312
|
+
pg.Ref(Suite([
|
313
|
+
MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
|
314
|
+
])),
|
315
|
+
example_ids=[1, 3, 5],
|
316
|
+
reprocess=True
|
317
|
+
)
|
318
|
+
exp = run.experiment.leaf_nodes[0]
|
319
|
+
self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
|
320
|
+
self.assertEqual(run.examples_to_reprocess(exp), set([1, 3, 5]))
|
321
|
+
self.assertEqual(run.examples_to_load(exp), set())
|
322
|
+
self.assertEqual(run.examples_to_load_metadata(exp), set())
|
323
|
+
|
324
|
+
def test_examples_with_reprocess_some(self):
|
325
|
+
run = Run(
|
326
|
+
'/root',
|
327
|
+
RunId.from_id('20241102_0'),
|
328
|
+
pg.Ref(Suite([
|
329
|
+
MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
|
330
|
+
])),
|
331
|
+
example_ids=[1, 3, 5],
|
332
|
+
reprocess=[1],
|
333
|
+
)
|
334
|
+
exp = run.experiment.leaf_nodes[0]
|
335
|
+
self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
|
336
|
+
self.assertEqual(run.examples_to_reprocess(exp), set([1]))
|
337
|
+
self.assertEqual(run.examples_to_load(exp), set([3, 5]))
|
338
|
+
self.assertEqual(run.examples_to_load_metadata(exp), set())
|
339
|
+
|
340
|
+
def test_examples_with_generate_example_html_all(self):
|
341
|
+
run = Run(
|
342
|
+
'/root',
|
343
|
+
RunId.from_id('20241102_0'),
|
344
|
+
pg.Ref(Suite([
|
345
|
+
MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
|
346
|
+
])),
|
347
|
+
example_ids=[1, 3, 5],
|
348
|
+
reprocess=[1],
|
349
|
+
generate_example_html='all',
|
350
|
+
)
|
351
|
+
exp = run.experiment.leaf_nodes[0]
|
352
|
+
self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
|
353
|
+
self.assertEqual(run.examples_to_reprocess(exp), set([1]))
|
354
|
+
self.assertEqual(run.examples_to_load(exp), set([3, 5]))
|
355
|
+
self.assertEqual(run.examples_to_load_metadata(exp), set([3, 5]))
|
356
|
+
|
357
|
+
def test_examples_with_generate_example_html_new(self):
|
358
|
+
run = Run(
|
359
|
+
'/root',
|
360
|
+
RunId.from_id('20241102_0'),
|
361
|
+
pg.Ref(Suite([
|
362
|
+
MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
|
363
|
+
])),
|
364
|
+
example_ids=[1, 3, 5],
|
365
|
+
reprocess=[1],
|
366
|
+
generate_example_html='new',
|
367
|
+
)
|
368
|
+
exp = run.experiment.leaf_nodes[0]
|
369
|
+
self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
|
370
|
+
self.assertEqual(run.examples_to_reprocess(exp), set([1]))
|
371
|
+
self.assertEqual(run.examples_to_load(exp), set([3, 5]))
|
372
|
+
self.assertEqual(run.examples_to_load_metadata(exp), set())
|
373
|
+
|
374
|
+
def test_examples_with_generate_example_html_some(self):
|
375
|
+
run = Run(
|
376
|
+
'/root',
|
377
|
+
RunId.from_id('20241102_0'),
|
378
|
+
pg.Ref(Suite([
|
379
|
+
MyEvaluation(replica_id=0, inputs=sample_inputs(10)),
|
380
|
+
])),
|
381
|
+
example_ids=[1, 3, 5],
|
382
|
+
reprocess=[1],
|
383
|
+
generate_example_html=[1, 2, 3],
|
384
|
+
)
|
385
|
+
exp = run.experiment.leaf_nodes[0]
|
386
|
+
self.assertEqual(run.examples_to_evaluate(exp), set([1, 3, 5]))
|
387
|
+
self.assertEqual(run.examples_to_reprocess(exp), set([1]))
|
388
|
+
self.assertEqual(run.examples_to_load(exp), set([2, 3, 5]))
|
389
|
+
self.assertEqual(run.examples_to_load_metadata(exp), set([2, 3]))
|
390
|
+
|
273
391
|
|
274
392
|
class RunnerTest(unittest.TestCase):
|
275
393
|
|
@@ -172,11 +172,11 @@ class HtmlReporter(experiment_lib.Plugin):
|
|
172
172
|
)
|
173
173
|
html.save(index_html_path)
|
174
174
|
experiment.info(
|
175
|
-
f'
|
175
|
+
f'Updated {index_html_path!r} in {t.elapse:.2f} seconds.',
|
176
176
|
)
|
177
177
|
except BaseException as e: # pylint: disable=broad-except
|
178
178
|
experiment.error(
|
179
|
-
f'Failed to
|
179
|
+
f'Failed to generate {index_html_path!r}. '
|
180
180
|
f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
|
181
181
|
)
|
182
182
|
raise e
|
@@ -194,26 +194,68 @@ class HtmlReporter(experiment_lib.Plugin):
|
|
194
194
|
def _save_example_html(
|
195
195
|
self, runner: Runner, experiment: Experiment, example: Example
|
196
196
|
) -> None:
|
197
|
-
"""Saves the example."""
|
198
|
-
|
197
|
+
"""Saves the example in HTML format."""
|
198
|
+
current_run = runner.current_run
|
199
|
+
def _generate():
|
199
200
|
try:
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
201
|
+
with pg.timeit() as t:
|
202
|
+
html = example.to_html(
|
203
|
+
collapse_level=None,
|
204
|
+
enable_summary_tooltip=False,
|
205
|
+
extra_flags=dict(
|
206
|
+
# For properly rendering the next link.
|
207
|
+
num_examples=getattr(experiment, 'num_examples', None)
|
208
|
+
),
|
209
|
+
)
|
210
|
+
html.save(
|
211
|
+
runner.current_run.output_path_for(
|
212
|
+
experiment, f'{example.id}.html'
|
213
|
+
)
|
214
|
+
)
|
215
|
+
experiment.info(
|
216
|
+
f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
|
212
217
|
)
|
213
218
|
except BaseException as e: # pylint: disable=broad-except
|
214
219
|
experiment.error(
|
215
|
-
f'Failed to
|
220
|
+
f'Failed to generate \'{example.id}.html\'. '
|
216
221
|
f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
|
217
222
|
)
|
218
223
|
raise e
|
219
|
-
|
224
|
+
|
225
|
+
def _copy():
|
226
|
+
src_file = current_run.input_path_for(experiment, f'{example.id}.html')
|
227
|
+
dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
|
228
|
+
|
229
|
+
if src_file == dest_file:
|
230
|
+
return
|
231
|
+
|
232
|
+
if not pg.io.path_exists(src_file):
|
233
|
+
experiment.warning(
|
234
|
+
f'Skip copying \'{example.id}.html\' as '
|
235
|
+
f'{src_file!r} does not exist.'
|
236
|
+
)
|
237
|
+
return
|
238
|
+
|
239
|
+
try:
|
240
|
+
with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
|
241
|
+
content = src.read()
|
242
|
+
with pg.io.open(dest_file, 'w') as dest:
|
243
|
+
dest.write(content)
|
244
|
+
experiment.info(
|
245
|
+
f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
|
246
|
+
)
|
247
|
+
except BaseException as e: # pylint: disable=broad-except
|
248
|
+
experiment.error(
|
249
|
+
f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
|
250
|
+
)
|
251
|
+
raise e
|
252
|
+
|
253
|
+
generate_example_html = current_run.generate_example_html
|
254
|
+
if (generate_example_html == 'all'
|
255
|
+
or (generate_example_html == 'new' and example.newly_processed)
|
256
|
+
or (isinstance(generate_example_html, list)
|
257
|
+
and example.id in generate_example_html)):
|
258
|
+
op = _generate
|
259
|
+
else:
|
260
|
+
op = _copy
|
261
|
+
runner.background_run(op)
|
@@ -15,6 +15,7 @@ import os
|
|
15
15
|
import tempfile
|
16
16
|
import unittest
|
17
17
|
|
18
|
+
from langfun.core.eval.v2 import checkpointing
|
18
19
|
from langfun.core.eval.v2 import eval_test_helper
|
19
20
|
from langfun.core.eval.v2 import reporting
|
20
21
|
from langfun.core.eval.v2 import runners as runners_lib # pylint: disable=unused-import
|
@@ -26,15 +27,131 @@ class ReportingTest(unittest.TestCase):
|
|
26
27
|
def test_reporting(self):
|
27
28
|
root_dir = os.path.join(tempfile.gettempdir(), 'test_reporting')
|
28
29
|
experiment = eval_test_helper.test_experiment()
|
30
|
+
checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
|
31
|
+
reporter = reporting.HtmlReporter()
|
32
|
+
run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
|
33
|
+
self.assertTrue(
|
34
|
+
pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
|
35
|
+
)
|
36
|
+
for leaf in experiment.leaf_nodes:
|
37
|
+
self.assertTrue(
|
38
|
+
pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
|
39
|
+
)
|
40
|
+
for i in range(leaf.num_examples):
|
41
|
+
self.assertTrue(
|
42
|
+
pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
|
43
|
+
)
|
44
|
+
found_generation_log = False
|
45
|
+
for log_entry in leaf._log_entries:
|
46
|
+
if 'generated in' in log_entry.message:
|
47
|
+
found_generation_log = True
|
48
|
+
break
|
49
|
+
self.assertTrue(found_generation_log)
|
50
|
+
|
51
|
+
# Test warm start.
|
52
|
+
root_dir = os.path.join(tempfile.gettempdir(), 'test_reporting2')
|
53
|
+
experiment = eval_test_helper.test_experiment()
|
54
|
+
run = experiment.run(
|
55
|
+
root_dir, 'new', plugins=[checkpointer, reporter],
|
56
|
+
warm_start_from=run.output_root
|
57
|
+
)
|
58
|
+
self.assertTrue(
|
59
|
+
pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
|
60
|
+
)
|
61
|
+
for leaf in experiment.leaf_nodes:
|
62
|
+
self.assertTrue(
|
63
|
+
pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
|
64
|
+
)
|
65
|
+
for i in range(leaf.num_examples):
|
66
|
+
self.assertTrue(
|
67
|
+
pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
|
68
|
+
)
|
69
|
+
found_copy_log = False
|
70
|
+
for log_entry in leaf._log_entries:
|
71
|
+
if 'copied in' in log_entry.message:
|
72
|
+
found_copy_log = True
|
73
|
+
break
|
74
|
+
self.assertTrue(found_copy_log)
|
75
|
+
|
76
|
+
def test_index_html_generation_error(self):
|
77
|
+
root_dir = os.path.join(
|
78
|
+
tempfile.gettempdir(),
|
79
|
+
'test_reporting_with_index_html_generation_error'
|
80
|
+
)
|
81
|
+
experiment = (eval_test_helper
|
82
|
+
.test_experiment_with_index_html_generation_error())
|
29
83
|
reporter = reporting.HtmlReporter()
|
30
84
|
run = experiment.run(root_dir, 'new', plugins=[reporter])
|
31
|
-
|
85
|
+
self.assertFalse(
|
86
|
+
pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
|
87
|
+
)
|
88
|
+
for leaf in experiment.leaf_nodes:
|
89
|
+
self.assertFalse(
|
90
|
+
pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
|
91
|
+
)
|
92
|
+
found_error_log = False
|
93
|
+
for log_entry in experiment._log_entries:
|
94
|
+
if log_entry.message.startswith('Failed to generate'):
|
95
|
+
found_error_log = True
|
96
|
+
break
|
97
|
+
self.assertTrue(found_error_log)
|
98
|
+
|
99
|
+
def test_example_html_generation_error(self):
|
100
|
+
root_dir = os.path.join(
|
101
|
+
tempfile.gettempdir(),
|
102
|
+
'test_reporting_with_example_html_generation_error'
|
103
|
+
)
|
104
|
+
experiment = (eval_test_helper
|
105
|
+
.test_experiment_with_example_html_generation_error())
|
106
|
+
checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
|
107
|
+
reporter = reporting.HtmlReporter()
|
108
|
+
run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
|
109
|
+
self.assertTrue(
|
110
|
+
pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
|
111
|
+
)
|
112
|
+
for leaf in experiment.leaf_nodes:
|
113
|
+
self.assertTrue(
|
114
|
+
pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
|
115
|
+
)
|
116
|
+
for i in range(leaf.num_examples):
|
117
|
+
self.assertFalse(
|
118
|
+
pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
|
119
|
+
)
|
120
|
+
found_error_log = False
|
121
|
+
for log_entry in experiment._log_entries:
|
122
|
+
if log_entry.message.startswith('Failed to generate'):
|
123
|
+
found_error_log = True
|
124
|
+
break
|
125
|
+
self.assertTrue(found_error_log)
|
126
|
+
|
127
|
+
# Test warm start.
|
128
|
+
root_dir = os.path.join(
|
129
|
+
tempfile.gettempdir(),
|
130
|
+
'test_reporting_with_example_html_generation_error2'
|
131
|
+
)
|
132
|
+
experiment = (eval_test_helper
|
133
|
+
.test_experiment_with_example_html_generation_error())
|
134
|
+
run = experiment.run(
|
135
|
+
root_dir, 'new', plugins=[checkpointer, reporter],
|
136
|
+
warm_start_from=run.output_root
|
137
|
+
)
|
138
|
+
self.assertTrue(
|
139
|
+
pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
|
140
|
+
)
|
32
141
|
for leaf in experiment.leaf_nodes:
|
33
142
|
self.assertTrue(
|
34
143
|
pg.io.path_exists(run.output_path_for(leaf, 'index.html'))
|
35
144
|
)
|
36
145
|
for i in range(leaf.num_examples):
|
37
|
-
|
146
|
+
self.assertFalse(
|
147
|
+
pg.io.path_exists(run.output_path_for(leaf, f'{i + 1}.html'))
|
148
|
+
)
|
149
|
+
found_error_log = False
|
150
|
+
for log_entry in experiment._log_entries:
|
151
|
+
if log_entry.message.startswith('Skip copying'):
|
152
|
+
found_error_log = True
|
153
|
+
break
|
154
|
+
self.assertTrue(found_error_log)
|
38
155
|
|
39
156
|
|
40
157
|
if __name__ == '__main__':
|
langfun/core/eval/v2/runners.py
CHANGED
@@ -123,6 +123,7 @@ class RunnerBase(Runner):
|
|
123
123
|
def on_experiment_start(self, experiment: Experiment) -> None:
|
124
124
|
"""Called when an evaluation is started."""
|
125
125
|
# Start the progress of the evaluation.
|
126
|
+
num_examples_to_evaluate = 0
|
126
127
|
if experiment.is_leaf:
|
127
128
|
assert isinstance(experiment, Evaluation)
|
128
129
|
num_examples_to_evaluate = (
|
@@ -130,10 +131,6 @@ class RunnerBase(Runner):
|
|
130
131
|
if self.current_run.example_ids else experiment.num_examples
|
131
132
|
)
|
132
133
|
experiment.progress.start(total=num_examples_to_evaluate)
|
133
|
-
experiment.info(
|
134
|
-
'Starting evaluation %s with %d examples to evaluate.'
|
135
|
-
% (experiment.id, num_examples_to_evaluate)
|
136
|
-
)
|
137
134
|
else:
|
138
135
|
experiment.progress.start(total=len(experiment.leaf_nodes))
|
139
136
|
|
@@ -141,6 +138,12 @@ class RunnerBase(Runner):
|
|
141
138
|
for plugin in self._all_plugins(experiment):
|
142
139
|
plugin.on_experiment_start(self, experiment)
|
143
140
|
|
141
|
+
if experiment.is_leaf:
|
142
|
+
experiment.info(
|
143
|
+
f'Starting evaluation {experiment.id!r} with '
|
144
|
+
f'{num_examples_to_evaluate} examples to evaluate.'
|
145
|
+
)
|
146
|
+
|
144
147
|
def on_experiment_skipped(self, experiment: Experiment) -> None:
|
145
148
|
"""Called when an evaluation is skipped."""
|
146
149
|
# Skip event will only be triggered for leaf evaluations.
|
langfun/core/llms/__init__.py
CHANGED
@@ -32,6 +32,7 @@ from langfun.core.llms.rest import REST
|
|
32
32
|
|
33
33
|
# Gemini models.
|
34
34
|
from langfun.core.llms.google_genai import GenAI
|
35
|
+
from langfun.core.llms.google_genai import GeminiFlash2_0ThinkingExp
|
35
36
|
from langfun.core.llms.google_genai import GeminiFlash2_0Exp
|
36
37
|
from langfun.core.llms.google_genai import GeminiExp_20241114
|
37
38
|
from langfun.core.llms.google_genai import GeminiExp_20241206
|
@@ -45,6 +46,7 @@ from langfun.core.llms.google_genai import Palm2_IT
|
|
45
46
|
# OpenAI models.
|
46
47
|
from langfun.core.llms.openai import OpenAI
|
47
48
|
|
49
|
+
from langfun.core.llms.openai import GptO1
|
48
50
|
from langfun.core.llms.openai import GptO1Preview
|
49
51
|
from langfun.core.llms.openai import GptO1Preview_20240912
|
50
52
|
from langfun.core.llms.openai import GptO1Mini
|
@@ -106,6 +108,7 @@ from langfun.core.llms.anthropic import VertexAIAnthropic
|
|
106
108
|
from langfun.core.llms.anthropic import VertexAIClaude3_5_Sonnet_20241022
|
107
109
|
from langfun.core.llms.anthropic import VertexAIClaude3_5_Sonnet_20240620
|
108
110
|
from langfun.core.llms.anthropic import VertexAIClaude3_5_Haiku_20241022
|
111
|
+
from langfun.core.llms.anthropic import VertexAIClaude3_Opus_20240229
|
109
112
|
|
110
113
|
from langfun.core.llms.groq import Groq
|
111
114
|
from langfun.core.llms.groq import GroqLlama3_2_3B
|
@@ -124,6 +127,7 @@ from langfun.core.llms.groq import GroqWhisper_Large_v3Turbo
|
|
124
127
|
from langfun.core.llms.vertexai import VertexAI
|
125
128
|
from langfun.core.llms.vertexai import VertexAIGemini2_0
|
126
129
|
from langfun.core.llms.vertexai import VertexAIGeminiFlash2_0Exp
|
130
|
+
from langfun.core.llms.vertexai import VertexAIGeminiFlash2_0ThinkingExp
|
127
131
|
from langfun.core.llms.vertexai import VertexAIGemini1_5
|
128
132
|
from langfun.core.llms.vertexai import VertexAIGeminiPro1_5
|
129
133
|
from langfun.core.llms.vertexai import VertexAIGeminiPro1_5_001
|
langfun/core/llms/anthropic.py
CHANGED
@@ -67,6 +67,13 @@ SUPPORTED_MODELS_AND_SETTINGS = {
|
|
67
67
|
cost_per_1k_input_tokens=0.001,
|
68
68
|
cost_per_1k_output_tokens=0.005,
|
69
69
|
),
|
70
|
+
'claude-3-opus@20240229': pg.Dict(
|
71
|
+
max_tokens=4096,
|
72
|
+
rpm=4000,
|
73
|
+
tpm=400000,
|
74
|
+
cost_per_1k_input_tokens=0.015,
|
75
|
+
cost_per_1k_output_tokens=0.075,
|
76
|
+
),
|
70
77
|
# Anthropic hosted models.
|
71
78
|
'claude-3-5-sonnet-20241022': pg.Dict(
|
72
79
|
max_tokens=8192,
|
@@ -461,6 +468,11 @@ class VertexAIAnthropic(Anthropic):
|
|
461
468
|
return request
|
462
469
|
|
463
470
|
|
471
|
+
class VertexAIClaude3_Opus_20240229(VertexAIAnthropic): # pylint: disable=invalid-name
|
472
|
+
"""Anthropic's Claude 3 Opus model on VertexAI."""
|
473
|
+
model = 'claude-3-opus@20240229'
|
474
|
+
|
475
|
+
|
464
476
|
class VertexAIClaude3_5_Sonnet_20241022(VertexAIAnthropic): # pylint: disable=invalid-name
|
465
477
|
"""Anthropic's Claude 3.5 Sonnet model on VertexAI."""
|
466
478
|
model = 'claude-3-5-sonnet-v2@20241022'
|
@@ -48,6 +48,7 @@ class GenAI(lf.LanguageModel):
|
|
48
48
|
|
49
49
|
model: Annotated[
|
50
50
|
Literal[
|
51
|
+
'gemini-2.0-flash-thinking-exp-1219',
|
51
52
|
'gemini-2.0-flash-exp',
|
52
53
|
'gemini-exp-1206',
|
53
54
|
'gemini-exp-1114',
|
@@ -307,6 +308,16 @@ _GOOGLE_GENAI_MODEL_HUB = _ModelHub()
|
|
307
308
|
#
|
308
309
|
# Public Gemini models.
|
309
310
|
#
|
311
|
+
class GeminiFlash2_0ThinkingExp(GenAI): # pylint: disable=invalid-name
|
312
|
+
"""Gemini 2.0 Flash Thinking Experimental model."""
|
313
|
+
|
314
|
+
model = 'gemini-2.0-flash-thinking-exp-1219'
|
315
|
+
supported_modalities = (
|
316
|
+
vertexai.DOCUMENT_TYPES
|
317
|
+
+ vertexai.IMAGE_TYPES
|
318
|
+
+ vertexai.AUDIO_TYPES
|
319
|
+
+ vertexai.VIDEO_TYPES
|
320
|
+
)
|
310
321
|
|
311
322
|
|
312
323
|
class GeminiFlash2_0Exp(GenAI): # pylint: disable=invalid-name
|