langfun 0.0.2.dev20240429__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +20 -2
- langfun/core/__init__.py +16 -5
- langfun/core/agentic/__init__.py +30 -0
- langfun/core/agentic/action.py +854 -0
- langfun/core/agentic/action_eval.py +150 -0
- langfun/core/agentic/action_eval_test.py +109 -0
- langfun/core/agentic/action_test.py +136 -0
- langfun/core/coding/python/__init__.py +5 -11
- langfun/core/coding/python/correction.py +37 -21
- langfun/core/coding/python/correction_test.py +29 -3
- langfun/core/coding/python/execution.py +40 -216
- langfun/core/coding/python/execution_test.py +29 -89
- langfun/core/coding/python/generation.py +21 -11
- langfun/core/coding/python/generation_test.py +2 -2
- langfun/core/coding/python/parsing.py +108 -193
- langfun/core/coding/python/parsing_test.py +2 -105
- langfun/core/component.py +63 -2
- langfun/core/component_test.py +53 -0
- langfun/core/concurrent.py +414 -117
- langfun/core/concurrent_test.py +111 -24
- langfun/core/console.py +18 -5
- langfun/core/console_test.py +17 -0
- langfun/core/eval/__init__.py +16 -1
- langfun/core/eval/base.py +622 -174
- langfun/core/eval/base_test.py +200 -54
- langfun/core/eval/matching.py +63 -76
- langfun/core/eval/matching_test.py +17 -8
- langfun/core/eval/patching.py +130 -0
- langfun/core/eval/patching_test.py +170 -0
- langfun/core/eval/scoring.py +26 -26
- langfun/core/eval/scoring_test.py +19 -2
- langfun/core/eval/v2/__init__.py +42 -0
- langfun/core/eval/v2/checkpointing.py +380 -0
- langfun/core/eval/v2/checkpointing_test.py +228 -0
- langfun/core/eval/v2/eval_test_helper.py +136 -0
- langfun/core/eval/v2/evaluation.py +725 -0
- langfun/core/eval/v2/evaluation_test.py +180 -0
- langfun/core/eval/v2/example.py +305 -0
- langfun/core/eval/v2/example_test.py +128 -0
- langfun/core/eval/v2/experiment.py +1048 -0
- langfun/core/eval/v2/experiment_test.py +433 -0
- langfun/core/eval/v2/metric_values.py +156 -0
- langfun/core/eval/v2/metric_values_test.py +80 -0
- langfun/core/eval/v2/metrics.py +357 -0
- langfun/core/eval/v2/metrics_test.py +203 -0
- langfun/core/eval/v2/progress.py +348 -0
- langfun/core/eval/v2/progress_test.py +82 -0
- langfun/core/eval/v2/progress_tracking.py +210 -0
- langfun/core/eval/v2/progress_tracking_test.py +66 -0
- langfun/core/eval/v2/reporting.py +270 -0
- langfun/core/eval/v2/reporting_test.py +158 -0
- langfun/core/eval/v2/runners.py +488 -0
- langfun/core/eval/v2/runners_test.py +334 -0
- langfun/core/langfunc.py +4 -17
- langfun/core/langfunc_test.py +22 -6
- langfun/core/language_model.py +577 -39
- langfun/core/language_model_test.py +470 -56
- langfun/core/llms/__init__.py +87 -16
- langfun/core/llms/anthropic.py +312 -87
- langfun/core/llms/anthropic_test.py +71 -3
- langfun/core/llms/cache/base.py +21 -2
- langfun/core/llms/cache/in_memory.py +13 -0
- langfun/core/llms/cache/in_memory_test.py +53 -2
- langfun/core/llms/compositional.py +101 -0
- langfun/core/llms/compositional_test.py +73 -0
- langfun/core/llms/deepseek.py +117 -0
- langfun/core/llms/deepseek_test.py +61 -0
- langfun/core/llms/fake.py +11 -7
- langfun/core/llms/fake_test.py +14 -0
- langfun/core/llms/gemini.py +507 -0
- langfun/core/llms/gemini_test.py +195 -0
- langfun/core/llms/google_genai.py +62 -218
- langfun/core/llms/google_genai_test.py +9 -202
- langfun/core/llms/groq.py +160 -144
- langfun/core/llms/groq_test.py +31 -137
- langfun/core/llms/llama_cpp.py +15 -42
- langfun/core/llms/llama_cpp_test.py +4 -30
- langfun/core/llms/openai.py +395 -203
- langfun/core/llms/openai_compatible.py +179 -0
- langfun/core/llms/openai_compatible_test.py +495 -0
- langfun/core/llms/openai_test.py +30 -395
- langfun/core/llms/rest.py +113 -0
- langfun/core/llms/rest_test.py +111 -0
- langfun/core/llms/vertexai.py +192 -0
- langfun/core/llms/vertexai_test.py +52 -0
- langfun/core/logging.py +284 -0
- langfun/core/logging_test.py +125 -0
- langfun/core/message.py +319 -9
- langfun/core/message_test.py +190 -13
- langfun/core/modalities/__init__.py +6 -2
- langfun/core/modalities/audio.py +30 -0
- langfun/core/modalities/audio_test.py +63 -0
- langfun/core/modalities/image.py +39 -20
- langfun/core/modalities/image_test.py +52 -9
- langfun/core/modalities/mime.py +206 -29
- langfun/core/modalities/mime_test.py +90 -9
- langfun/core/modalities/ms_office.py +117 -0
- langfun/core/modalities/ms_office_test.py +389 -0
- langfun/core/modalities/pdf.py +22 -0
- langfun/core/modalities/pdf_test.py +57 -0
- langfun/core/modalities/video.py +9 -26
- langfun/core/modalities/video_test.py +3 -3
- langfun/core/modality.py +26 -3
- langfun/core/modality_test.py +2 -2
- langfun/core/sampling.py +11 -11
- langfun/core/structured/__init__.py +12 -16
- langfun/core/structured/completion.py +32 -5
- langfun/core/structured/completion_test.py +7 -6
- langfun/core/structured/description.py +2 -2
- langfun/core/structured/description_test.py +3 -3
- langfun/core/structured/function_generation.py +60 -27
- langfun/core/structured/function_generation_test.py +72 -2
- langfun/core/structured/mapping.py +97 -47
- langfun/core/structured/mapping_test.py +90 -2
- langfun/core/structured/parsing.py +33 -21
- langfun/core/structured/parsing_test.py +53 -9
- langfun/core/structured/querying.py +746 -0
- langfun/core/structured/{prompting_test.py → querying_test.py} +469 -51
- langfun/core/structured/schema.py +204 -97
- langfun/core/structured/schema_generation.py +1 -1
- langfun/core/structured/schema_test.py +130 -29
- langfun/core/structured/scoring.py +125 -19
- langfun/core/structured/scoring_test.py +30 -0
- langfun/core/structured/tokenization.py +64 -0
- langfun/core/structured/tokenization_test.py +48 -0
- langfun/core/template.py +115 -1
- langfun/core/template_test.py +71 -1
- langfun/core/templates/conversation.py +9 -0
- langfun/core/templates/conversation_test.py +4 -3
- langfun/core/templates/selfplay_test.py +10 -2
- langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
- langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
- langfun/core/coding/python/errors.py +0 -108
- langfun/core/coding/python/errors_test.py +0 -99
- langfun/core/coding/python/permissions.py +0 -90
- langfun/core/coding/python/permissions_test.py +0 -86
- langfun/core/structured/prompting.py +0 -238
- langfun/core/text_formatting.py +0 -162
- langfun/core/text_formatting_test.py +0 -47
- langfun-0.0.2.dev20240429.dist-info/METADATA +0 -100
- langfun-0.0.2.dev20240429.dist-info/RECORD +0 -108
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py (CHANGED)

@@ -24,10 +24,9 @@ import os
 import re
 import threading
 import time
+import types
 from typing import Annotated, Any, Callable, Iterator, Literal, Optional, Sequence, Type, Union

-from absl import app
-from absl import flags
 import langfun.core as lf
 import langfun.core.coding as lf_coding
 from langfun.core.llms.cache import in_memory
@@ -40,7 +39,8 @@ class Evaluable(lf.Component):

 EXPERIMENT_JSON = 'experiment.json'
 RESULT_JSON = 'result.json'
-
+OOP_FAILURES_JSON = 'oop_failures.json'
+NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
 INDEX_HTML = 'index.html'
 SUMMARY_HTML = 'summary.html'

@@ -215,6 +215,7 @@ class Evaluable(lf.Component):
 summary: bool = True,
 pivot_field: str = 'lm',
 from_root: bool = True,
+timeout: int | None = None,
 **kwargs,
 ) -> Union['Summary', pg.Dict]:
 """Run the evaluation, which fills and returns the result."""
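The `timeout` argument added to `run()` above is threaded through `_run()` into `lf.concurrent_map` in the later hunks, giving each example its own time budget. A minimal usage sketch, assuming `MyEvaluation` is a user-defined `lf.eval.Evaluation` subclass (hypothetical; its LM, prompt and inputs are configured elsewhere):

```python
# Hypothetical sketch: `MyEvaluation` is not part of langfun; it stands in for
# any concrete lf.eval.Evaluation subclass defined by the user.
experiment = MyEvaluation()

# The new per-example timeout (in seconds) is forwarded to lf.concurrent_map
# by the code path added in this diff.
result = experiment.run(timeout=60)
```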
@@ -242,7 +243,7 @@
 ):
 if show_progress:
 lf.concurrent.ProgressBar.update(
-progress_bar,
+progress_bar, status='LOADING SAVED RESULTS...', color='yellow'
 )
 if self.try_load_result():
 run_status = 'CACHED'
@@ -265,13 +266,14 @@
 verbose=verbose,
 progress_bar=progress_bar,
 label=label,
+timeout=timeout,
 **kwargs,
 )

 if should_save:
 if show_progress:
 lf.concurrent.ProgressBar.update(
-progress_bar,
+progress_bar, status='SAVING RESULTS...', color='yellow'
 )

 # Save evaluation results.
@@ -284,7 +286,7 @@
 if show_progress:
 lf.concurrent.ProgressBar.update(
 progress_bar,
-
+status=self._completion_status(run_status),
 color='green',
 )
 else:
@@ -340,7 +342,7 @@
 f'[#{leaf.index} - {leaf.node.id}]',
 total=leaf.node.num_examples if leaf.enabled else 0,
 color='cyan' if leaf.enabled else 'yellow',
-
+status=None if leaf.enabled else 'SKIPPED.')

 # Run leaf groups in parallel.
 try:
@@ -354,17 +356,17 @@
 # Save results for non-leaf nodes.
 lf.concurrent.ProgressBar.update(
 overview_bar,
-
+status='SAVING RESULTS...',
 color='yellow')

 for node in self.nonleaf_nodes:
-node._result = {c.id: c.result for c in node.
+node._result = {c.id: c.result for c in node.leaf_nodes} # pylint: disable=protected-access
 if should_save:
 node.save(result=False, report=False)

 if should_save and summary:
 lf.concurrent.ProgressBar.update(
-overview_bar,
+overview_bar, status='FINALIZING SUMMARY...'
 )

 summary.save(os.path.join(self.root_dir, Evaluable.SUMMARY_HTML))
@@ -378,7 +380,7 @@
 # Signal all task completed by making the bar green.
 lf.concurrent.ProgressBar.update(
 overview_bar,
-
+status='COMPLETED',
 color='green')

 finally:
@@ -398,6 +400,7 @@
 verbose: bool,
 progress_bar: int | None,
 label: str | None,
+timeout: int | None = None,
 **kwargs,
 ) -> None:
 """Run the evaluate and fill `self.result`. Subclass to implement."""
@@ -528,37 +531,14 @@
 self._render_message(self.dryrun_output, s)

 def _render_message(self, message: lf.Message, s: io.StringIO) -> None:
-
-
-
-
-
-
-text_color = 'black'
-
-s.write(
-f'<div style="color: {text_color}; white-space: pre-wrap;'
-'padding: 10px; border: 1px solid; margin-top: 10px">'
-)
-s.write(m.get('formatted_text', m.text))
-if m.result is not None:
-s.write(
-'<div style="color: magenta; white-space: pre-wrap;'
-'padding: 10px; border: 1px solid; margin: 10px">'
-)
-s.write(pg.format(m.result))
-s.write('</div>')
-if 'usage' in m.metadata:
-s.write(
-'<div style="background-color: #EEEEEE; color: black; '
-'white-space: pre-wrap; padding: 10px; border: 0px solid; '
-'margin: 10px">'
-f'prompt: {m.usage.prompt_tokens} tokens, '
-f'response: {m.usage.completion_tokens} tokens, '
-f'total: {m.usage.total_tokens} tokens'
-'</div>'
+s.write(
+message.to_html_str(
+extra_flags=dict(
+include_message_metadata=False,
+source_tag=['lm-input', 'lm-response'],
+)
 )
-
+)

 @classmethod
 def from_dir(
@@ -598,7 +578,6 @@ class _LeafNode:
 @pg.use_init_args(['children'])
 class Suite(Evaluable):
 """Evaluation suite."""
-
 children: Annotated[list[Evaluable], 'Child evaluation sets or suites.']

 # Use empty ID as suite is just a container of child evaluations.
@@ -753,10 +732,12 @@ class Evaluation(Evaluable):

 # Constants.
 CACHE_JSON = 'cache.json'
-
+OOP_FAILURES_HTML = 'oop_failures.html'
+NON_OOP_FAILURES_HTML = 'non_oop_failures.html'

 @functools.cached_property
 def hash(self) -> str:
+"""Returns the semantic-based hash of the evaluation."""
 if self.is_deterministic:
 identity = pg.format(self._identifiers(), compact=True)
 else:
@@ -805,6 +786,10 @@ class Evaluation(Evaluable):
 """Returns the complete rate."""
 return self.num_completed / self.num_examples

+#
+# Properties on failures.
+#
+
 @property
 def failures(self) -> list[tuple[Any, Exception]]:
 """Returns the failed examples and their errors."""
@@ -815,6 +800,15 @@ class Evaluation(Evaluable):
 """Returns the number of failed examples."""
 return len(self.failures)

+@functools.cached_property
+def failure_breakdown(self) -> dict[str, int]:
+"""Returns the breakdown of failures."""
+breakdown = collections.defaultdict(int)
+for _, error in self.failures:
+breakdown[_error_key(error)] += 1
+sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
+return pg.Dict({x[0]: x[1] for x in sorted_items})
+
 @property
 def failure_rate(self) -> float:
 """Returns the failure rate in range [0, 1]."""
@@ -822,6 +816,46 @@ class Evaluation(Evaluable):
 return 0.0
 return self.num_failures / self.num_completed

+@functools.cached_property
+def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
+"""Returns the OOP failures."""
+return [item for item in self.failures
+if isinstance(item[1], lf_structured.MappingError)]
+
+@property
+def num_oop_failures(self) -> int:
+"""Returns the number of OOP failures."""
+return len(self.oop_failures)
+
+@property
+def oop_failure_rate(self) -> float:
+"""Returns the OOP failure rate in range [0, 1]."""
+if self.num_completed == 0:
+return 0.0
+return self.num_oop_failures / self.num_completed
+
+@functools.cached_property
+def non_oop_failures(self) -> list[tuple[Any, Exception]]:
+"""Returns the OOP failures."""
+return [item for item in self.failures
+if not isinstance(item[1], lf_structured.MappingError)]
+
+@property
+def num_non_oop_failures(self) -> int:
+"""Returns the number of non-OOP failures."""
+return len(self.non_oop_failures)
+
+@property
+def non_oop_failure_rate(self) -> float:
+"""Returns the non-OOP failure rate in range [0, 1]."""
+if self.num_completed == 0:
+return 0.0
+return self.num_non_oop_failures / self.num_completed
+
+#
+# Properties on usage.
+#
+
 @property
 def has_usage(self) -> bool:
 """Returns True if token usage is enabled."""
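The properties above split `failures` into two buckets: OOP failures, whose error is an `lf_structured.MappingError` (the LM response could not be mapped to the requested structure), and non-OOP failures, which are any other exceptions raised while processing an example. A small sketch that paraphrases the two cached properties (not part of the package):

```python
import langfun as lf

def split_failures(failures):
  # `failures` is a list of (example, error) tuples, as held by Evaluation.failures.
  oop = [f for f in failures if isinstance(f[1], lf.structured.MappingError)]
  non_oop = [f for f in failures if not isinstance(f[1], lf.structured.MappingError)]
  return oop, non_oop
```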
@@ -905,7 +939,7 @@ class Evaluation(Evaluable):

 fields = list(cls.__schema__.values())
 fields.insert(0, (self.completion_prompt_field, pg.typing.Str()))
-
+cls.update_schema(fields, extend=False)

 def _maybe_adjust_examples_for_completion(
 self,
@@ -976,13 +1010,22 @@ class Evaluation(Evaluable):
 self._total_prompt_tokens = 0
 self._total_completion_tokens = 0
 self._num_usages = 0
+self.__dict__.pop('oop_failures', None)
+self.__dict__.pop('non_oop_failures', None)
+
+@property
+def oop_failures_link(self) -> str | None:
+"""Returns the link to the OOP failures page."""
+if self.dir is None:
+return None
+return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))

 @property
-def
-"""Returns the link to
+def non_oop_failures_link(self) -> str | None:
+"""Returns the link to then non-OOP failures page."""
 if self.dir is None:
 return None
-return self.link(os.path.join(self.dir, Evaluation.
+return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))

 def _dryrun(
 self,
@@ -992,11 +1035,11 @@ class Evaluation(Evaluable):
 verbose: bool,
 **kwargs,
 ) -> None:
-# Set the example for dryrun.
-example = example or self.examples[0]
-
 # We make a copy to avoid pollute the state of current object.
 copy: Evaluation = self.clone()
+
+# Set the example for dryrun.
+example = example or copy.examples[0]
 copy.__dict__['examples'] = [example]

 # We set the symbolic parent of the cloned to access contextual information
@@ -1011,23 +1054,36 @@ class Evaluation(Evaluable):
 color='green',
 )

-
-output_message = copy.process(example, **(self.additional_args or {}))
-if self.schema is None:
-output = output_message.text
-else:
-output = output_message.result
+error, output_message = None, None

-
+try:
+with lf.use_settings(debug=debug):
+output_message = copy.process(example, **(self.additional_args or {}))
+self.process_output(example, output_message)
+
+if self.schema is None:
+output = output_message.text
+else:
+output = output_message.result
+
+if verbose:
+lf.console.write('')
+lf.console.write(
+str(output),
+title='OUTPUT',
+color='blue',
+)
+except lf_structured.MappingError as e:
 lf.console.write('')
 lf.console.write(
-str(
-title='
-color='
+str(e),
+title='ERROR',
+color='red',
 )
+error = e

-copy.audit(example, output_message,
-result = copy.
+copy.audit(1, example, output_message, error, dryrun=True)
+result = copy.finalize()

 if verbose:
 lf.console.write('')
@@ -1048,9 +1104,13 @@ class Evaluation(Evaluable):
 verbose: bool,
 progress_bar: int | None,
 label: str | None,
+timeout: int | None = None,
 **kwargs,
 ) -> None:
 # Setup examples.
+# Reset examples so it could be read from the input functor.
+self.__dict__.pop('examples', None)
+
 if end is None:
 end = len(self.examples)
 examples = self.examples[start:end]
@@ -1059,20 +1119,24 @@ class Evaluation(Evaluable):
 with lf.use_settings(debug=debug, cache=self.cache):
 self._reset()

-def _process(
+def _process(idx_and_example: Any):
 # NOTE(daiyip): set the `input` symbol of the globals to None, so LLM
 # generated code with calls to `input` will raise an error, thus not
 # blocking the evaluation.
+_, example = idx_and_example
 with lf_coding.context(input=None):
-
+output_message = self.process(example, **(self.additional_args or {}))
+self.process_output(example, output_message)
+return output_message

 try:
-for example, message, error in lf.concurrent_map(
+for (idx, example), message, error in lf.concurrent_map(
 _process,
-examples,
+enumerate(examples),
 max_workers=self.max_workers,
 show_progress=progress_bar or False,
 status_fn=self._status,
+timeout=timeout,
 ):
 if error is not None:
 message = (
@@ -1080,14 +1144,14 @@ class Evaluation(Evaluable):
 if isinstance(error, lf_structured.MappingError)
 else None
 )
-self.audit(example, message, error)
+self.audit(idx + 1, example, message, error)
 finally:
 # Save cache upon completion or interruption.
 if self.dir and self.cache:
 self.cache.save()

 # Summarize result.
-self._result = self.
+self._result = self.finalize()
 if verbose:
 lf.console.write(
 str(self.result),
@@ -1101,7 +1165,7 @@ class Evaluation(Evaluable):

 def process(self, example: Any, **kwargs) -> lf.Message:
 """Process an example and returns its output."""
-prompt = self.prompt
+prompt = lf.Template.from_value(self.prompt, example=example)
 if self.method == 'call':
 return lf_structured.call(
 prompt,
@@ -1129,7 +1193,9 @@ class Evaluation(Evaluable):
 else:
 assert self.method == 'complete', self.method
 assert isinstance(self.schema.spec, pg.typing.Object), self.schema
-
+# TODO(daiyip): Currently multi-modal inputs within the prompt for
+# completion is not supported.
+input_value = self.schema.spec.cls.partial(prompt.render().text)
 return lf_structured.complete(
 input_value,
 lm=self.lm,
@@ -1140,16 +1206,48 @@ class Evaluation(Evaluable):
 **kwargs,
 )

+def process_output(self, example: Any, output: lf.Message) -> None:
+"""Process the output for an example.
+
+Subclasses can override this method to generate and attach additional
+metadata for debugging purpose. For example, draw bounding boxes on the
+input image based on LLM predicted boxes and attach to output_message's
+metadata.
+
+Example:
+
+class BoundingBoxEval(lf.eval.Matching):
+...
+def process_output(example, output):
+output.metadata.image_with_bbox = draw_bboxes(
+example.image, output.result)
+
+Args:
+example: User input.
+output: LLM's output message. Users could attach additional
+information to the message, which will be shown in debugging
+"""
+del example, output
+
 def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
+status = {'Model': self.lm.model_id}
+status.update(self._eval_status(progress))
+
+if progress.last_error is not None:
+status['LastError'] = progress.last_error_str()
+if progress.timeit_summary:
+status['TimeIt'] = progress.timeit_summary_str()
+return status
+
+def _eval_status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
 return {
-'
-
-progress.success_rate * 100,
+'Succeeded': '%s (%d/%d)' % (
+self._format_rate(progress.success_rate),
 progress.succeeded,
 progress.completed,
 ),
-'Failed':
-progress.failure_rate
+'Failed': '%s (%d/%d)' % (
+self._format_rate(progress.failure_rate),
 progress.failed,
 progress.completed,
 ),
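The `process_output` hook added above runs on each LM output before auditing, so subclasses can attach extra metadata for the debugging pages. A hedged sketch that expands the docstring's own example (`draw_bboxes` and the rest of `BoundingBoxEval` are hypothetical user code):

```python
import langfun as lf

class BoundingBoxEval(lf.eval.Matching):
  # prompt, schema_fn, inputs, answer extraction, etc. omitted (hypothetical).

  def process_output(self, example, output):
    # Attach a rendered image to the output message so it shows up alongside
    # the example in the failure/report HTML; `draw_bboxes` is user-supplied.
    output.metadata.image_with_bbox = draw_bboxes(example.image, output.result)
```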
@@ -1159,22 +1257,21 @@ class Evaluation(Evaluable):
 assert self.result is not None
 m = self.result.metrics
 return (
-
-f' Failures=%.{self.report_precision}f%% (%d/%d)'
+'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
 % (
 run_status,
-(1 - m.failure_rate)
+self._format_rate(1 - m.failure_rate),
 m.total - m.failures,
 m.total,
-m.failure_rate
+self._format_rate(m.failure_rate),
 m.failures,
 m.total,
 )
 )

-def
-"""
-if self.cache:
+def finalize(self) -> pg.Dict:
+"""Finalizes the evaluation result."""
+if self.cache is not None:
 cache_stats = dict(
 use_cache=True,
 num_queries=self.cache.stats.num_queries,
@@ -1201,7 +1298,7 @@ class Evaluation(Evaluable):
 id=self.id,
 dir=self.dir,
 model=self.lm.model_id,
-prompt_template=
+prompt_template=pg.decolor(str(self.prompt)),
 method=self.method,
 schema_fn=str(self.schema_fn),
 ),
@@ -1210,38 +1307,47 @@ class Evaluation(Evaluable):
 total=self.num_completed,
 failures=self.num_failures,
 failure_rate=self.failure_rate,
+oop_failures=self.num_oop_failures,
+oop_failure_rate=self.oop_failure_rate,
+non_oop_failures=self.num_non_oop_failures,
+non_oop_failure_rate=self.non_oop_failure_rate,
+failure_breakdown=self.failure_breakdown,
 ),
 usage=usage,
 )
 return result

-def
+def summary_card(self) -> str:
+"""Returns summary card in HTML."""
 s = io.StringIO()
 definition = _html_repr(self, compact=False, escape=True)
 s.write('<div><table><tr><td>')
+self._render_link(
+s,
+definition,
+self.hash,
+'',
+lambda: self.link(self.dir),
+)
 if self.result is None:
 s.write(
-f'<a target="_blank" title="{definition}" '
-f'href="{self.link(self.dir)}">{self.hash}</a>'
 '</td></tr><tr><td>'
 '<span style="color: gray">(IN-PROGRESS...)</span>'
 )
 else:
-
-
-
-
-)
-self._render_metric(s)
+if self.dir:
+s.write(f' [<a href="{self.link(self.dir)}">dir</a>]')
+s.write('</td></tr><tr><td>')
+self._render_summary_metrics(s)

 # Summarize average usage.
-if self.result.usage
-self.
+if self.result.usage:
+self._render_summary_usage(s)

 s.write('</td></tr></table></div>')
 return s.getvalue()

-def
+def _render_summary_usage(self, s: io.StringIO) -> None:
 """Renders usage in HTML."""
 usage = self.result.usage
 total = usage.total_prompt_tokens + usage.total_completion_tokens
@@ -1255,22 +1361,79 @@ class Evaluation(Evaluable):
 f'" style="color:gray">({total} tokens)</a>'
 )

-def
+def _render_link(self,
+s: io.StringIO,
+title: str,
+text: str,
+style: str,
+url_fn: Callable[[], str]) -> None:
+"""Renders a link in HTML."""
+s.write(
+f'<a target="_blank" title="{title}" style="{style}"'
+)
+if self.dir:
+s.write(f' href="{url_fn()}"')
+s.write(f'>{text}</a>')
+
+def _render_summary_metrics(self, s: io.StringIO) -> None:
 """Renders metrics in HTML."""
 assert self.result is not None
 m = self.result.metrics
-
-
-
-
-
-
-
-
+
+# OOP failures.
+oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
+if m.oop_failures:
+oop_failure_title += '&#013;'
+for name, count in m.failure_breakdown.items():
+if name.startswith('MappingError'):
+oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+name.removeprefix('MappingError.'),
+self._format_rate(count / m.total),
+count,
+m.total,
+)
+
+extra_style = ''
+if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
+extra_style = ';font-weight:bold'
+self._render_link(
+s,
+oop_failure_title,
+self._format_rate(m.oop_failure_rate),
+f'color:magenta{extra_style}',
+lambda: self.oop_failures_link,
 )
+s.write(' | ')
+
+# Non-OOP failures.
+non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
+if m.non_oop_failures:
+non_oop_failure_title += '&#013;'
+for name, count in m.failure_breakdown.items():
+if not name.startswith('MappingError'):
+non_oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+name,
+self._format_rate(count / m.total),
+count,
+m.total,
+)
+
+extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
+self._render_link(
+s,
+non_oop_failure_title,
+self._format_rate(m.non_oop_failure_rate),
+f'color:red{extra_style}',
+lambda: self.non_oop_failures_link,
+)
+
+def _format_rate(self, rate: float) -> str:
+"""Formats a rate."""
+return f'%.{self.report_precision}f%% ' % (rate * 100)

 def audit(
 self,
+example_idx: int,
 example: Any,
 message: lf.Message | None,
 error: Exception | None = None,
@@ -1279,6 +1442,7 @@ class Evaluation(Evaluable):
 """Audits the example against the output. Subclasses should override.

 Args:
+example_idx: 1-based index of the example in its dataset.
 example: The input object.
 message: The entire message returned by the LM, which could be used to
 trace the LM input, response and parsed structure. If error is raised
@@ -1287,13 +1451,19 @@ class Evaluation(Evaluable):
 dryrun: Whether or not audition takes place during dryrun.
 """
 if error is not None:
-self._failures.append((example,
+self._failures.append((example, error))
+
+# Invalid cache of num_oop_failures.
+self.__dict__.pop('oop_failures', None)
+self.__dict__.pop('non_oop_failures', None)
+self.__dict__.pop('failure_breakdown', None)
+
 if isinstance(error, lf_structured.MappingError):
 message = error.lm_response
 else:
 assert message is not None
 output = message.text if self.schema is None else message.result
-self.audit_processed(example, output, message, dryrun=dryrun)
+self.audit_processed(example_idx, example, output, message, dryrun=dryrun)

 # Audit usage.
 if message is not None:
@@ -1301,14 +1471,17 @@ class Evaluation(Evaluable):
 self._num_completed += 1

 def audit_usage(self, message: lf.Message, dryrun: bool = False) -> None:
+del dryrun
 for m in message.trace():
-
-
-self.
+usage = m.metadata.get('usage', None)
+if usage:
+self._total_prompt_tokens += usage.prompt_tokens
+self._total_completion_tokens += usage.completion_tokens
 self._num_usages += 1

 def audit_processed(
-self, example: Any, output: Any, message: lf.Message,
+self, example_idx: int, example: Any, output: Any, message: lf.Message,
+dryrun: bool = False
 ) -> None:
 """Audits a successfully processed example. Subclass should override."""

@@ -1333,16 +1506,26 @@ class Evaluation(Evaluable):
 # Save failures.
 pg.save(
 [
-pg.Dict(
-
-)
-for input, error in self.failures
+pg.Dict(input=input, error=_format_error(error))
+for input, error in self.oop_failures
 ],
-os.path.join(self.dir, Evaluation.
+os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
 )
 pg.save(
-self._html([self._render_result, self.
-os.path.join(self.dir, Evaluation.
+self._html([self._render_result, self._render_oop_failures]),
+os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
+file_format='txt',
+)
+pg.save(
+[
+pg.Dict(input=input, error=_format_error(error))
+for input, error in self.non_oop_failures
+],
+os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
+)
+pg.save(
+self._html([self._render_result, self._render_non_oop_failures]),
+os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
 file_format='txt',
 )

@@ -1355,9 +1538,10 @@ class Evaluation(Evaluable):
 '<td>Schema</td>'
 '<td>Additional Args</td>'
 )
-if self.result.usage
+if self.result.usage:
 s.write('<td>Usage</td>')
-s.write('<td>Failures</td>')
+s.write('<td>OOP Failures</td>')
+s.write('<td>Non-OOP Failures</td>')

 def _render_result_row(self, s: io.StringIO) -> None:
 s.write(
@@ -1383,18 +1567,31 @@ class Evaluation(Evaluable):
 f'{_html_repr(self.additional_args, compact=False)}</td>'
 )
 # Usage.
-if self.result.usage
+if self.result.usage:
 s.write('<td>')
-self.
+self._render_summary_usage(s)
 s.write('</td>')

-#
+# OOP failures.
 s.write(
-'<td><span style="color:
+'<td><span style="color:magenta">%s</span>%s</td>'
 % (
-
+self._format_rate(self.oop_failure_rate),
 '<a href="%s">(%d/%d)</a>'
-% (self.
+% (self.oop_failures_link,
+self.num_oop_failures,
+self.num_completed),
+)
+)
+# Non-OOP failures.
+s.write(
+'<td><span style="color:red">%s</span>%s</td>'
+% (
+self._format_rate(self.non_oop_failure_rate),
+'<a href="%s">(%d/%d)</a>'
+% (self.non_oop_failures_link,
+self.num_non_oop_failures,
+self.num_completed),
 )
 )

@@ -1408,31 +1605,99 @@ class Evaluation(Evaluable):
 else:
 return 'cyan'

-def
+def _render_oop_failures(self, s: io.StringIO) -> None:
+self._render_failures(s, '^MappingError.*', error_color='magenta')
+
+def _render_non_oop_failures(self, s: io.StringIO) -> None:
+self._render_failures(s, '^(?!MappingError).*', error_color='red')
+
+def _render_failures(
+self, s: io.StringIO, error_regex: str, error_color: str) -> None:
 """Formats the failed cases into html."""
+# Failure summary.
 s.write(
-'<h2>
+'<h2> Error Summary </h2>'
 '<div style="white-space:pre">\n'
 '<table style="border:1px solid">'
-'<tr class="header"><td>
+'<tr class="header"><td>Error type</td><td>Stats</td></tr>'
+)
+error_regex = re.compile(error_regex)
+if self.result.metrics.failure_breakdown:
+for name, count in self.result.metrics.failure_breakdown.items():
+if not error_regex.match(name):
+continue
+
+link = f'<a href="#{name}">{name}</a>'
+error_rate = self._format_rate(count / self.result.metrics.total)
+stats = (f'<span style="color:{error_color}">{error_rate} '
+f'({count}/{self.result.metrics.total})</span>')
+s.write(f'<tr><td>{link}</td><td>{stats})</td></tr>')
+s.write(
+'</table></div>'
+'<h2> Failed Cases </h2>'
+'<div style="white-space:pre">'
 )
+# Failure details by error type.
+failures_by_error = collections.defaultdict(list)
+for example, error in self.failures:
+error_name = _error_key(error)
+if error_regex.match(error_name):
+failures_by_error[error_name].append((example, error))
+
+for error_key, failures in failures_by_error.items():
+s.write(
+f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
+f'(count={len(failures)})</h3>'
+'<table style="border:1px solid">'
+'<tr class="header"><td>No.</td><td>Input</td>'
+'<td>LM invocation</td><td>Error</td></tr>'
+)
+for i, (example, error) in enumerate(failures):
+lm_response = None
+if isinstance(error, lf.structured.MappingError):
+lm_response = error.lm_response
+error = error.cause
+
+bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
+s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
+s.write('<td style="color:green;white-space:pre-wrap">')
+s.write(pg.format(example, verbose=False))
+s.write('</td><td>')
+if lm_response is not None:
+self._render_message(lm_response, s)
+s.write(f'</td><td style="color:{error_color};white-space:pre">')
+s.write(_format_error(error))
+s.write('</td></tr>')
+s.write('</table>')
+s.write('</div>')

-
-
-
-
-
-error_str = lf.text_formatting.decolored(str(error))
-s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
-s.write('</tr>')
-s.write('</table></div>')
+@classmethod
+def visualize(cls, evaluations: list['Evaluation']) -> str | None:
+"""Visualize the a list of evaluations of this task in HTML."""
+del evaluations
+return None


 @pg.functor()
-def inputs_from(path: str | list[str]) -> list[Any]:
+def inputs_from(path: str | list[str], **kwargs) -> list[Any]:
 """A functor that returns a list of user-defined objects as eval inputs."""
 if isinstance(path, str):
-
+if path.endswith('.json'):
+return pg.load(path)
+elif path.endswith('.jsonl'):
+return list(iter(pg.open_jsonl(path)))
+elif path.endswith('.csv'):
+import pandas as pd # pylint: disable=g-import-not-at-top
+dataset_df = pd.read_csv(path, **kwargs)
+dataset = []
+for i in range(dataset_df.shape[0]):
+row = {}
+for col in dataset_df.columns:
+row[col] = dataset_df.iloc[i][col]
+dataset.append(row)
+return dataset
+else:
+raise ValueError(f'Unsupported file format: {path}')
 examples = []
 for p in path:
 examples.extend(pg.load(p))
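The reworked `inputs_from` functor above now dispatches on file extension: `.json` is loaded with `pg.load`, `.jsonl` line by line via `pg.open_jsonl`, and `.csv` through pandas, with each row turned into a dict keyed by column name and extra keyword arguments forwarded to `pd.read_csv`. A hedged usage sketch (file paths are placeholders, and the `lf.eval.inputs_from` spelling assumes the functor is re-exported from the eval package):

```python
import langfun as lf

# Each call builds a functor; the examples are materialized when the
# evaluation reads its inputs.
json_inputs = lf.eval.inputs_from('examples.json')          # a JSON list of objects
jsonl_inputs = lf.eval.inputs_from('examples.jsonl')        # one object per line
csv_inputs = lf.eval.inputs_from('examples.csv', sep='\t')  # extra kwargs go to pd.read_csv
```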
@@ -1578,7 +1843,7 @@ class Summary(pg.Object):
 if e is None:
 s.write('<span style="color: gray">N/A<span>')
 else:
-s.write(e.
+s.write(e.summary_card())
 s.write('</td>')
 s.write('</tr>')
 s.write('</table>')
@@ -1653,13 +1918,22 @@ class Summary(pg.Object):
 s.write('<html><body>')
 for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
 table_id = task.__name__.lower()
+evaluations = self.select(task=task).evaluations
+table = Summary.Table.from_evaluations(evaluations, pivot_field)
 s.write('<div>')
-s.write(
-
-
-table = Summary.Table.from_evaluations(
-self.select(task=task).evaluations, pivot_field
+s.write(
+f'<a id="{table_id}" href="#{table_id}">'
+f'<h2>{task.__name__}</h2></a>'
 )
+
+# Allow users to plugin visualization code (e.g. matplot) in the summary
+# page.
+visual_part = task.visualize(evaluations)
+if visual_part:
+s.write(visual_part)
+
+s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
+s.write('<hr/>')
 s.write(table.html())
 s.write('</div>')
 s.write('</body></html>')
@@ -1685,6 +1959,7 @@ class Summary(pg.Object):
 experiment=entry,
 dir=entry.dir,
 metrics=entry.result.metrics if entry.result else None,
+usage=entry.result.usage if entry.result else None,
 )
 )
 task_results[task.__name__] = results
@@ -1833,6 +2108,20 @@ class Summary(pg.Object):
 return result.join()


+def _format_error(error: Exception):
+"""Formats an error into a string."""
+return (f'({error.__class__.__name__}) ' + pg.decolor(str(error)))
+
+
+def _error_key(error: Exception) -> str:
+"""Returns the key for an error."""
+error_names = []
+while error is not None:
+error_names.append(error.__class__.__name__)
+error = getattr(error, 'cause', None)
+return '.'.join(error_names)
+
+
 def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
 """Formats prompt in HTML."""
 if type(value) is lf.Template: # pylint: disable=unidiomatic-typecheck
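These two helpers drive the new failure reporting: `_error_key` walks an error's `cause` chain and joins the class names with dots, which is what `failure_breakdown` uses as keys and what `_render_oop_failures` matches with `^MappingError.*`. A standalone re-implementation sketch of the key logic; the `MappingError` class below is a stand-in defined here for illustration, not langfun's:

```python
def error_key(error):
  # Same walk as _error_key above: follow `.cause` and join the class names.
  names = []
  while error is not None:
    names.append(error.__class__.__name__)
    error = getattr(error, 'cause', None)
  return '.'.join(names)

class MappingError(Exception):  # illustrative stand-in with a `cause` attribute
  def __init__(self, cause):
    super().__init__(str(cause))
    self.cause = cause

print(error_key(MappingError(ValueError('bad value'))))  # -> MappingError.ValueError
print(error_key(TimeoutError('too slow')))               # -> TimeoutError
```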
@@ -1909,41 +2198,200 @@ def monitor_async(
 )


-
-
+#
+# Named evaluations and experiments support.
+#

-Args:
-target: An Langfun evaluable object.
-"""
-flags.DEFINE_string(
-'root_dir', None, 'Root directory for running the evaluation.'
-)

-
-
-)
+class _NamedEvaluationRegistry:
+"""Named evaluation registry."""

-
-
-)
+def __init__(self):
+self._registry = {}

-
-
-
-
-)
+def names(self) -> list[str]:
+"""Returns all registered names."""
+return sorted(self._registry.keys())
+
+def get(self, name: str) -> list[Type[Evaluable]]:
+"""Gets an evaluation by name."""
+matches = []
+if name in self._registry:
+matches.append(self._registry[name])
+else:
+regex = re.compile(name)
+for key, cls in self._registry.items():
+if regex.match(key):
+matches.append(cls)
+return matches
+
+def register(
+self,
+name: str,
+experiment_cls: Type[Evaluable],
+):
+"""Register an experiment class."""
+self._registry[name] = experiment_cls

-FLAGS = flags.FLAGS # pylint: disable=invalid-name

-
-if len(argv) > 1:
-raise app.UsageError('Too many command-line arguments.')
+_eval_registry = _NamedEvaluationRegistry()

-
-
-
-
+
+def registered_names() -> list[str]:
+"""Returns all registered names."""
+return _eval_registry.names()
+
+
+def get_evaluations(evaluation: str | Evaluable) -> list[Evaluable]:
+"""Gets an evaluation experiment by name."""
+if isinstance(evaluation, str):
+return [e() for e in _eval_registry.get(evaluation)]
+return [evaluation]
+
+
+def register(name: str):
+"""Decorator to create a named evaluation class."""
+
+def _register(func_or_cls: Type[Evaluation] | types.FunctionType):
+if inspect.isfunction(func_or_cls):
+e = func_or_cls()
+if not isinstance(e, Evaluable):
+raise TypeError(
+f'The return value of `{func_or_cls}` should be an instance of '
+'`lf.eval.Evaluable` subclass.'
+)
+
+class GeneratedSuite(Suite):
+# NOTE(daiyip): Delay serialization key registration for generated
+# class.
+auto_register = False
+children = e.children if isinstance(e, Suite) else [e]
+
+cls = GeneratedSuite
+cls.__name__ = func_or_cls.__name__
+cls.__doc__ = func_or_cls.__doc__
+cls.__qualname__ = func_or_cls.__qualname__
+cls.__module__ = getattr(func_or_cls, '__module__', 'wrapper')
+cls.register_for_deserialization(cls.__type_name__)
+
+elif issubclass(func_or_cls, Evaluable):
+cls = func_or_cls
 else:
-
+raise ValueError(f'Unsupported type: {type(func_or_cls)}')
+
+_eval_registry.register(name, cls)
+return cls
+
+return _register
+
+
+def get(
+root_dir: str,
+evaluations: list[str | Evaluable],
+filter: Union[ # pylint: disable=redefined-builtin
+str, # Regex to filter evaluation based on ID.
+Callable[[Evaluable], bool], # Custom filter function.
+None # No filtering (Default).
+] = None, # pylint: disable=bad-whitespace
+patches: list[Union[
+str, # String-based PyGlove patcher.
+pg.patching.Patcher, # PyGlove patcher object.
+Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+]] | None = None, # pylint: disable=bad-whitespace
+) -> Suite:
+"""Gets a suite from a list of patched evaluations.
+
+Args:
+root_dir: The root directory of the experiment.
+evaluations: A list of evaluations to be included in the suite.
+filter: A regular expression (str) for selecting sub-experiments of matched
+IDs, or a filter function to filter the evaluations.
+patches: A list of patches to be applied to the suite. Each element can be
+a string (for string-based patcher), a `pg.patching.Patcher` object, or
+a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+details.
+
+Returns:
+A suite of selected `lf.eval.Evaluation` objects.
+"""
+matches = []
+for e in evaluations:
+matches.extend(get_evaluations(e))
+
+if not matches:
+raise ValueError('No evaluations found.')
+
+suite = Suite(matches, root_dir=root_dir)
+if patches:
+suite = pg.patch(suite, patches)
+
+if isinstance(filter, str):
+regex = re.compile(filter)
+filter = lambda x: bool(regex.match(x.id))
+
+if filter:
+suite = Suite(
+[leaf for leaf in suite.leaf_nodes if filter(leaf)], root_dir=root_dir)
+return suite
+
+
+def run(
+root_dir: str,
+evaluations: list[str | Evaluable],
+filter: Union[ # pylint: disable=redefined-builtin
+str, # Regex to filter evaluation based on ID.
+Callable[[Evaluable], bool], # Custom filter function.
+None # No filtering (Default).
+] = None, # pylint: disable=bad-whitespace
+patches: list[Union[
+str, # String-based PyGlove patcher.
+pg.patching.Patcher, # PyGlove patcher object.
+Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+]] | None = None, # pylint: disable=bad-whitespace
+mode: Literal['run', 'rerun', 'dryrun', 'noop'] = 'run',
+debug: bool = False,
+print_definition: bool = False,
+**kwargs,
+) -> Suite:
+"""Run selected evaluations with patching.
+
+Args:
+root_dir: The root directory of the experiment.
+evaluations: A list of evaluations to be included in the suite.
+filter: A regular expression (str) for selecting sub-experiments of matched
+IDs, or a filter function to filter the evaluations.
+patches: A list of patches to be applied to the suite. Each element can be
+a string (for string-based patcher), a `pg.patching.Patcher` object, or
+a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+details.
+mode: The mode to run the suite. "run" to run the suite, with reusing
+existing results if available; "rerun" to rerun all evaluations even if
+there are existing results; "dryrun" to dryrun the suite; and "noop"
+to do nothing.
+debug: Whether to run in debug mode.
+print_definition: Whether to print the experiment definition.
+**kwargs: Additional arguments to be passed to dryrun/run the suite.
+
+Returns:
+A suite of selected `lf.eval.Evaluation` objects.
+"""
+suite = get(root_dir, evaluations, patches=patches, filter=filter)
+if print_definition:
+lf.console.write(
+pg.format(
+suite,
+compact=False,
+verbose=False,
+hide_default_values=True,
+python_format=True,
+),
+title='[EXPERIMENT DEFINITION]',
+color='blue',
+)

-
+if mode == 'run':
+rerun = mode == 'rerun'
+suite.run(debug=debug, rerun=rerun, **kwargs)
+elif mode == 'dryrun':
+suite.dryrun(debug=debug, **kwargs)
+return suite
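Together, the registry plus `register`, `get` and `run` above add a named-evaluation workflow: decorate an evaluation class (or a function returning one) with a name, then select registrations by exact name or regex and run them under a root directory. A hedged sketch, assuming these symbols are re-exported as `lf.eval.register` / `lf.eval.run` (the task name, directory and `MyTaskEvaluation` are placeholders):

```python
import langfun as lf

@lf.eval.register('my_task/basic')
def my_task():
  # Any function returning an lf.eval.Evaluable can be registered; per the
  # code above it is wrapped into a generated Suite under this name.
  return MyTaskEvaluation()  # hypothetical Evaluation subclass

# Select registrations by exact name or regex and run them; 'dryrun' and
# 'noop' modes are also accepted, and 'run' reuses existing results.
lf.eval.run(
    root_dir='/tmp/my_evals',
    evaluations=['my_task/.*'],
    mode='run',
)
```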