langfun 0.0.2.dev20240330__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +22 -2
- langfun/core/__init__.py +17 -5
- langfun/core/agentic/__init__.py +30 -0
- langfun/core/agentic/action.py +854 -0
- langfun/core/agentic/action_eval.py +150 -0
- langfun/core/agentic/action_eval_test.py +109 -0
- langfun/core/agentic/action_test.py +136 -0
- langfun/core/coding/python/__init__.py +5 -11
- langfun/core/coding/python/correction.py +37 -28
- langfun/core/coding/python/correction_test.py +29 -3
- langfun/core/coding/python/execution.py +40 -216
- langfun/core/coding/python/execution_test.py +29 -89
- langfun/core/coding/python/generation.py +21 -11
- langfun/core/coding/python/generation_test.py +2 -2
- langfun/core/coding/python/parsing.py +108 -193
- langfun/core/coding/python/parsing_test.py +2 -105
- langfun/core/component.py +69 -2
- langfun/core/component_test.py +54 -0
- langfun/core/concurrent.py +414 -117
- langfun/core/concurrent_test.py +111 -24
- langfun/core/console.py +18 -5
- langfun/core/console_test.py +17 -0
- langfun/core/eval/__init__.py +17 -0
- langfun/core/eval/base.py +767 -140
- langfun/core/eval/base_test.py +238 -53
- langfun/core/eval/matching.py +80 -76
- langfun/core/eval/matching_test.py +19 -9
- langfun/core/eval/patching.py +130 -0
- langfun/core/eval/patching_test.py +170 -0
- langfun/core/eval/scoring.py +37 -28
- langfun/core/eval/scoring_test.py +21 -3
- langfun/core/eval/v2/__init__.py +42 -0
- langfun/core/eval/v2/checkpointing.py +380 -0
- langfun/core/eval/v2/checkpointing_test.py +228 -0
- langfun/core/eval/v2/eval_test_helper.py +136 -0
- langfun/core/eval/v2/evaluation.py +725 -0
- langfun/core/eval/v2/evaluation_test.py +180 -0
- langfun/core/eval/v2/example.py +305 -0
- langfun/core/eval/v2/example_test.py +128 -0
- langfun/core/eval/v2/experiment.py +1048 -0
- langfun/core/eval/v2/experiment_test.py +433 -0
- langfun/core/eval/v2/metric_values.py +156 -0
- langfun/core/eval/v2/metric_values_test.py +80 -0
- langfun/core/eval/v2/metrics.py +357 -0
- langfun/core/eval/v2/metrics_test.py +203 -0
- langfun/core/eval/v2/progress.py +348 -0
- langfun/core/eval/v2/progress_test.py +82 -0
- langfun/core/eval/v2/progress_tracking.py +210 -0
- langfun/core/eval/v2/progress_tracking_test.py +66 -0
- langfun/core/eval/v2/reporting.py +270 -0
- langfun/core/eval/v2/reporting_test.py +158 -0
- langfun/core/eval/v2/runners.py +488 -0
- langfun/core/eval/v2/runners_test.py +334 -0
- langfun/core/langfunc.py +3 -21
- langfun/core/langfunc_test.py +26 -8
- langfun/core/language_model.py +686 -48
- langfun/core/language_model_test.py +681 -44
- langfun/core/llms/__init__.py +100 -12
- langfun/core/llms/anthropic.py +488 -0
- langfun/core/llms/anthropic_test.py +235 -0
- langfun/core/llms/cache/base.py +21 -2
- langfun/core/llms/cache/in_memory.py +13 -0
- langfun/core/llms/cache/in_memory_test.py +88 -28
- langfun/core/llms/compositional.py +101 -0
- langfun/core/llms/compositional_test.py +73 -0
- langfun/core/llms/deepseek.py +117 -0
- langfun/core/llms/deepseek_test.py +61 -0
- langfun/core/llms/fake.py +39 -26
- langfun/core/llms/fake_test.py +136 -11
- langfun/core/llms/gemini.py +507 -0
- langfun/core/llms/gemini_test.py +195 -0
- langfun/core/llms/google_genai.py +62 -218
- langfun/core/llms/google_genai_test.py +9 -197
- langfun/core/llms/groq.py +276 -0
- langfun/core/llms/groq_test.py +64 -0
- langfun/core/llms/llama_cpp.py +15 -40
- langfun/core/llms/llama_cpp_test.py +4 -30
- langfun/core/llms/openai.py +436 -226
- langfun/core/llms/openai_compatible.py +179 -0
- langfun/core/llms/openai_compatible_test.py +495 -0
- langfun/core/llms/openai_test.py +35 -174
- langfun/core/llms/rest.py +113 -0
- langfun/core/llms/rest_test.py +111 -0
- langfun/core/llms/vertexai.py +192 -0
- langfun/core/llms/vertexai_test.py +52 -0
- langfun/core/logging.py +284 -0
- langfun/core/logging_test.py +125 -0
- langfun/core/message.py +319 -9
- langfun/core/message_test.py +190 -13
- langfun/core/modalities/__init__.py +6 -2
- langfun/core/modalities/audio.py +30 -0
- langfun/core/modalities/audio_test.py +63 -0
- langfun/core/modalities/image.py +39 -20
- langfun/core/modalities/image_test.py +52 -9
- langfun/core/modalities/mime.py +206 -29
- langfun/core/modalities/mime_test.py +90 -9
- langfun/core/modalities/ms_office.py +117 -0
- langfun/core/modalities/ms_office_test.py +389 -0
- langfun/core/modalities/pdf.py +22 -0
- langfun/core/modalities/pdf_test.py +57 -0
- langfun/core/modalities/video.py +9 -23
- langfun/core/modalities/video_test.py +3 -3
- langfun/core/modality.py +26 -3
- langfun/core/modality_test.py +2 -2
- langfun/core/sampling.py +11 -11
- langfun/core/structured/__init__.py +15 -16
- langfun/core/structured/completion.py +32 -5
- langfun/core/structured/completion_test.py +9 -8
- langfun/core/structured/description.py +2 -2
- langfun/core/structured/description_test.py +3 -3
- langfun/core/structured/function_generation.py +278 -0
- langfun/core/structured/function_generation_test.py +399 -0
- langfun/core/structured/mapping.py +150 -46
- langfun/core/structured/mapping_test.py +105 -0
- langfun/core/structured/parsing.py +33 -21
- langfun/core/structured/parsing_test.py +71 -22
- langfun/core/structured/querying.py +746 -0
- langfun/core/structured/{prompting_test.py → querying_test.py} +545 -60
- langfun/core/structured/schema.py +208 -99
- langfun/core/structured/schema_generation.py +1 -1
- langfun/core/structured/schema_generation_test.py +2 -2
- langfun/core/structured/schema_test.py +133 -34
- langfun/core/structured/scoring.py +125 -19
- langfun/core/structured/scoring_test.py +30 -0
- langfun/core/structured/tokenization.py +64 -0
- langfun/core/structured/tokenization_test.py +48 -0
- langfun/core/template.py +240 -11
- langfun/core/template_test.py +146 -1
- langfun/core/templates/conversation.py +9 -0
- langfun/core/templates/conversation_test.py +4 -3
- langfun/core/templates/selfplay_test.py +14 -2
- langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
- langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
- {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
- langfun/core/coding/python/errors.py +0 -108
- langfun/core/coding/python/errors_test.py +0 -99
- langfun/core/coding/python/permissions.py +0 -90
- langfun/core/coding/python/permissions_test.py +0 -86
- langfun/core/structured/prompting.py +0 -217
- langfun/core/text_formatting.py +0 -162
- langfun/core/text_formatting_test.py +0 -47
- langfun-0.0.2.dev20240330.dist-info/METADATA +0 -99
- langfun-0.0.2.dev20240330.dist-info/RECORD +0 -102
- {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py  (CHANGED)
```diff
@@ -24,6 +24,7 @@ import os
 import re
 import threading
 import time
+import types
 from typing import Annotated, Any, Callable, Iterator, Literal, Optional, Sequence, Type, Union

 import langfun.core as lf
@@ -38,7 +39,8 @@ class Evaluable(lf.Component):

   EXPERIMENT_JSON = 'experiment.json'
   RESULT_JSON = 'result.json'
-
+  OOP_FAILURES_JSON = 'oop_failures.json'
+  NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
   INDEX_HTML = 'index.html'
   SUMMARY_HTML = 'summary.html'

@@ -213,6 +215,7 @@ class Evaluable(lf.Component):
       summary: bool = True,
       pivot_field: str = 'lm',
       from_root: bool = True,
+      timeout: int | None = None,
       **kwargs,
   ) -> Union['Summary', pg.Dict]:
     """Run the evaluation, which fills and returns the result."""
@@ -240,7 +243,7 @@ class Evaluable(lf.Component):
     ):
       if show_progress:
         lf.concurrent.ProgressBar.update(
-            progress_bar,
+            progress_bar, status='LOADING SAVED RESULTS...', color='yellow'
         )
       if self.try_load_result():
         run_status = 'CACHED'
@@ -263,13 +266,14 @@ class Evaluable(lf.Component):
           verbose=verbose,
           progress_bar=progress_bar,
           label=label,
+          timeout=timeout,
           **kwargs,
       )

       if should_save:
         if show_progress:
           lf.concurrent.ProgressBar.update(
-              progress_bar,
+              progress_bar, status='SAVING RESULTS...', color='yellow'
           )

         # Save evaluation results.
@@ -282,7 +286,7 @@ class Evaluable(lf.Component):
       if show_progress:
         lf.concurrent.ProgressBar.update(
             progress_bar,
-
+            status=self._completion_status(run_status),
             color='green',
         )
     else:
@@ -338,7 +342,7 @@ class Evaluable(lf.Component):
           f'[#{leaf.index} - {leaf.node.id}]',
           total=leaf.node.num_examples if leaf.enabled else 0,
           color='cyan' if leaf.enabled else 'yellow',
-
+          status=None if leaf.enabled else 'SKIPPED.')

     # Run leaf groups in parallel.
     try:
@@ -352,17 +356,17 @@ class Evaluable(lf.Component):
       # Save results for non-leaf nodes.
       lf.concurrent.ProgressBar.update(
           overview_bar,
-
+          status='SAVING RESULTS...',
           color='yellow')

       for node in self.nonleaf_nodes:
-        node._result = {c.id: c.result for c in node.
+        node._result = {c.id: c.result for c in node.leaf_nodes} # pylint: disable=protected-access
         if should_save:
           node.save(result=False, report=False)

       if should_save and summary:
         lf.concurrent.ProgressBar.update(
-            overview_bar,
+            overview_bar, status='FINALIZING SUMMARY...'
         )

         summary.save(os.path.join(self.root_dir, Evaluable.SUMMARY_HTML))
@@ -376,7 +380,7 @@ class Evaluable(lf.Component):
       # Signal all task completed by making the bar green.
       lf.concurrent.ProgressBar.update(
           overview_bar,
-
+          status='COMPLETED',
           color='green')

     finally:
@@ -396,6 +400,7 @@ class Evaluable(lf.Component):
       verbose: bool,
       progress_bar: int | None,
       label: str | None,
+      timeout: int | None = None,
       **kwargs,
   ) -> None:
     """Run the evaluate and fill `self.result`. Subclass to implement."""
@@ -526,27 +531,14 @@ class Evaluable(lf.Component):
     self._render_message(self.dryrun_output, s)

   def _render_message(self, message: lf.Message, s: io.StringIO) -> None:
-
-
-
-
-
-
-    text_color = 'black'
-
-    s.write(
-        f'<div style="color: {text_color}; white-space: pre-wrap;'
-        'padding: 10px; border: 1px solid; margin-top: 10px">'
-    )
-    s.write(m.text)
-    if m.result is not None:
-      s.write(
-          '<div style="color: magenta; white-space: pre-wrap;'
-          'padding: 10px; border: 1px solid; margin: 10px">'
+    s.write(
+        message.to_html_str(
+            extra_flags=dict(
+                include_message_metadata=False,
+                source_tag=['lm-input', 'lm-response'],
+            )
         )
-
-      s.write('</div>')
-      s.write('</div>')
+    )

   @classmethod
   def from_dir(
@@ -586,7 +578,6 @@ class _LeafNode:
 @pg.use_init_args(['children'])
 class Suite(Evaluable):
   """Evaluation suite."""
-
   children: Annotated[list[Evaluable], 'Child evaluation sets or suites.']

   # Use empty ID as suite is just a container of child evaluations.
@@ -741,10 +732,12 @@ class Evaluation(Evaluable):

   # Constants.
   CACHE_JSON = 'cache.json'
-
+  OOP_FAILURES_HTML = 'oop_failures.html'
+  NON_OOP_FAILURES_HTML = 'non_oop_failures.html'

   @functools.cached_property
   def hash(self) -> str:
+    """Returns the semantic-based hash of the evaluation."""
     if self.is_deterministic:
       identity = pg.format(self._identifiers(), compact=True)
     else:
@@ -793,6 +786,10 @@ class Evaluation(Evaluable):
     """Returns the complete rate."""
     return self.num_completed / self.num_examples

+  #
+  # Properties on failures.
+  #
+
   @property
   def failures(self) -> list[tuple[Any, Exception]]:
     """Returns the failed examples and their errors."""
@@ -803,6 +800,15 @@ class Evaluation(Evaluable):
     """Returns the number of failed examples."""
     return len(self.failures)

+  @functools.cached_property
+  def failure_breakdown(self) -> dict[str, int]:
+    """Returns the breakdown of failures."""
+    breakdown = collections.defaultdict(int)
+    for _, error in self.failures:
+      breakdown[_error_key(error)] += 1
+    sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
+    return pg.Dict({x[0]: x[1] for x in sorted_items})
+
   @property
   def failure_rate(self) -> float:
     """Returns the failure rate in range [0, 1]."""
@@ -810,17 +816,76 @@ class Evaluation(Evaluable):
       return 0.0
     return self.num_failures / self.num_completed

+  @functools.cached_property
+  def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
+    """Returns the OOP failures."""
+    return [item for item in self.failures
+            if isinstance(item[1], lf_structured.MappingError)]
+
+  @property
+  def num_oop_failures(self) -> int:
+    """Returns the number of OOP failures."""
+    return len(self.oop_failures)
+
+  @property
+  def oop_failure_rate(self) -> float:
+    """Returns the OOP failure rate in range [0, 1]."""
+    if self.num_completed == 0:
+      return 0.0
+    return self.num_oop_failures / self.num_completed
+
+  @functools.cached_property
+  def non_oop_failures(self) -> list[tuple[Any, Exception]]:
+    """Returns the OOP failures."""
+    return [item for item in self.failures
+            if not isinstance(item[1], lf_structured.MappingError)]
+
+  @property
+  def num_non_oop_failures(self) -> int:
+    """Returns the number of non-OOP failures."""
+    return len(self.non_oop_failures)
+
+  @property
+  def non_oop_failure_rate(self) -> float:
+    """Returns the non-OOP failure rate in range [0, 1]."""
+    if self.num_completed == 0:
+      return 0.0
+    return self.num_non_oop_failures / self.num_completed
+
+  #
+  # Properties on usage.
+  #
+
+  @property
+  def has_usage(self) -> bool:
+    """Returns True if token usage is enabled."""
+    return self._num_usages > 0
+
+  @property
+  def average_prompt_tokens(self) -> int:
+    """Returns the average prompt tokens."""
+    if not self.has_usage:
+      return 0
+    return self._total_prompt_tokens // self._num_usages
+
+  @property
+  def average_completion_tokens(self) -> int:
+    """Returns the average completion tokens."""
+    if not self.has_usage:
+      return 0
+    return self._total_completion_tokens // self._num_usages
+
+  @property
+  def average_total_tokens(self) -> int:
+    """Returns the average total tokens."""
+    return self.average_prompt_tokens + self.average_completion_tokens
+
   @functools.cached_property
   def schema(self) -> lf_structured.Schema | None:
     """Schema."""
     if self.schema_fn is None:
       return None

-    kwargs = {}
-    # Allow schema to be a function based on current evaluation.
-    if 'evaluation' in self.schema_fn.__signature__.arg_names:
-      kwargs['evaluation'] = self
-
     schema = self._call_schema_fn()
     fewshot_examples = None
     if isinstance(schema, tuple):
@@ -861,7 +926,11 @@ class Evaluation(Evaluable):
           'Encountered: {annotation!r}.'
       )
     self._maybe_adjust_schema_for_completion(annotation)
-
+    schema = lf_structured.Schema.from_value(annotation)
+    # NOTE(daiyip): add references to the dependent classes of the returned type
+    # to prevent unused subclasses get garbage collected by Python.
+    setattr(schema, '__dependencies__', schema.class_dependencies())
+    return schema

   def _maybe_adjust_schema_for_completion(self, cls):
     if (self.completion_prompt_field is None
@@ -870,7 +939,7 @@ class Evaluation(Evaluable):

     fields = list(cls.__schema__.values())
     fields.insert(0, (self.completion_prompt_field, pg.typing.Str()))
-
+    cls.update_schema(fields, extend=False)

   def _maybe_adjust_examples_for_completion(
       self,
@@ -938,12 +1007,25 @@ class Evaluation(Evaluable):
     self._failures = []
     self._num_completed = 0

+    self._total_prompt_tokens = 0
+    self._total_completion_tokens = 0
+    self._num_usages = 0
+    self.__dict__.pop('oop_failures', None)
+    self.__dict__.pop('non_oop_failures', None)
+
   @property
-  def
-    """Returns the link to the failures page."""
+  def oop_failures_link(self) -> str | None:
+    """Returns the link to the OOP failures page."""
     if self.dir is None:
       return None
-    return self.link(os.path.join(self.dir, Evaluation.
+    return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))
+
+  @property
+  def non_oop_failures_link(self) -> str | None:
+    """Returns the link to then non-OOP failures page."""
+    if self.dir is None:
+      return None
+    return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))

   def _dryrun(
       self,
@@ -953,11 +1035,11 @@ class Evaluation(Evaluable):
       verbose: bool,
       **kwargs,
   ) -> None:
-    # Set the example for dryrun.
-    example = example or self.examples[0]
-
     # We make a copy to avoid pollute the state of current object.
-    copy = self.clone()
+    copy: Evaluation = self.clone()
+
+    # Set the example for dryrun.
+    example = example or copy.examples[0]
     copy.__dict__['examples'] = [example]

     # We set the symbolic parent of the cloned to access contextual information
@@ -972,24 +1054,37 @@ class Evaluation(Evaluable):
         color='green',
     )

-
-    output_message = copy.process(example, **(self.additional_args or {}))
-    if self.schema is None:
-      output = output_message.text
-    else:
-      output = output_message.result
+    error, output_message = None, None

-
+    try:
+      with lf.use_settings(debug=debug):
+        output_message = copy.process(example, **(self.additional_args or {}))
+        self.process_output(example, output_message)
+
+      if self.schema is None:
+        output = output_message.text
+      else:
+        output = output_message.result
+
+      if verbose:
+        lf.console.write('')
+        lf.console.write(
+            str(output),
+            title='OUTPUT',
+            color='blue',
+        )
+    except lf_structured.MappingError as e:
       lf.console.write('')
       lf.console.write(
-          str(
-          title='
-          color='
+          str(e),
+          title='ERROR',
+          color='red',
       )
+      error = e
+
+    copy.audit(1, example, output_message, error, dryrun=True)
+    result = copy.finalize()

-    # Audit the result.
-    copy.audit(example, output, output_message)
-    result = copy.summarize()
     if verbose:
       lf.console.write('')
       lf.console.write(
@@ -1009,9 +1104,13 @@ class Evaluation(Evaluable):
       verbose: bool,
       progress_bar: int | None,
       label: str | None,
+      timeout: int | None = None,
       **kwargs,
   ) -> None:
     # Setup examples.
+    # Reset examples so it could be read from the input functor.
+    self.__dict__.pop('examples', None)
+
     if end is None:
       end = len(self.examples)
     examples = self.examples[start:end]
@@ -1020,34 +1119,39 @@ class Evaluation(Evaluable):
     with lf.use_settings(debug=debug, cache=self.cache):
       self._reset()

-      def _process(
+      def _process(idx_and_example: Any):
         # NOTE(daiyip): set the `input` symbol of the globals to None, so LLM
         # generated code with calls to `input` will raise an error, thus not
         # blocking the evaluation.
+        _, example = idx_and_example
         with lf_coding.context(input=None):
-
+          output_message = self.process(example, **(self.additional_args or {}))
+          self.process_output(example, output_message)
+          return output_message

       try:
-        for example, message, error in lf.concurrent_map(
+        for (idx, example), message, error in lf.concurrent_map(
             _process,
-            examples,
+            enumerate(examples),
             max_workers=self.max_workers,
             show_progress=progress_bar or False,
            status_fn=self._status,
+            timeout=timeout,
        ):
          if error is not None:
-
-
-
-
-
+            message = (
+                error.lm_response
+                if isinstance(error, lf_structured.MappingError)
+                else None
+            )
+          self.audit(idx + 1, example, message, error)
      finally:
        # Save cache upon completion or interruption.
        if self.dir and self.cache:
          self.cache.save()

    # Summarize result.
-    self._result = self.
+    self._result = self.finalize()
    if verbose:
      lf.console.write(
          str(self.result),
@@ -1061,7 +1165,7 @@ class Evaluation(Evaluable):

   def process(self, example: Any, **kwargs) -> lf.Message:
     """Process an example and returns its output."""
-    prompt = self.prompt
+    prompt = lf.Template.from_value(self.prompt, example=example)
     if self.method == 'call':
       return lf_structured.call(
           prompt,
@@ -1089,7 +1193,9 @@ class Evaluation(Evaluable):
     else:
       assert self.method == 'complete', self.method
       assert isinstance(self.schema.spec, pg.typing.Object), self.schema
-
+      # TODO(daiyip): Currently multi-modal inputs within the prompt for
+      # completion is not supported.
+      input_value = self.schema.spec.cls.partial(prompt.render().text)
       return lf_structured.complete(
           input_value,
           lm=self.lm,
@@ -1100,16 +1206,48 @@ class Evaluation(Evaluable):
           **kwargs,
       )

+  def process_output(self, example: Any, output: lf.Message) -> None:
+    """Process the output for an example.
+
+    Subclasses can override this method to generate and attach additional
+    metadata for debugging purpose. For example, draw bounding boxes on the
+    input image based on LLM predicted boxes and attach to output_message's
+    metadata.
+
+    Example:
+
+      class BoundingBoxEval(lf.eval.Matching):
+        ...
+        def process_output(example, output):
+          output.metadata.image_with_bbox = draw_bboxes(
+              example.image, output.result)
+
+    Args:
+      example: User input.
+      output: LLM's output message. Users could attach additional
+        information to the message, which will be shown in debugging
+    """
+    del example, output
+
   def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
+    status = {'Model': self.lm.model_id}
+    status.update(self._eval_status(progress))
+
+    if progress.last_error is not None:
+      status['LastError'] = progress.last_error_str()
+    if progress.timeit_summary:
+      status['TimeIt'] = progress.timeit_summary_str()
+    return status
+
+  def _eval_status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
     return {
-        '
-
-            progress.success_rate * 100,
+        'Succeeded': '%s (%d/%d)' % (
+            self._format_rate(progress.success_rate),
             progress.succeeded,
             progress.completed,
         ),
-        'Failed':
-            progress.failure_rate
+        'Failed': '%s (%d/%d)' % (
+            self._format_rate(progress.failure_rate),
             progress.failed,
             progress.completed,
         ),
@@ -1119,22 +1257,21 @@ class Evaluation(Evaluable):
     assert self.result is not None
     m = self.result.metrics
     return (
-
-        f' Failures=%.{self.report_precision}f%% (%d/%d)'
+        'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
         % (
             run_status,
-            (1 - m.failure_rate)
+            self._format_rate(1 - m.failure_rate),
             m.total - m.failures,
             m.total,
-            m.failure_rate
+            self._format_rate(m.failure_rate),
             m.failures,
             m.total,
         )
     )

-  def
-    """
-    if self.cache:
+  def finalize(self) -> pg.Dict:
+    """Finalizes the evaluation result."""
+    if self.cache is not None:
       cache_stats = dict(
           use_cache=True,
           num_queries=self.cache.stats.num_queries,
@@ -1143,12 +1280,25 @@ class Evaluation(Evaluable):
       )
     else:
       cache_stats = dict(use_cache=False)
+
+    if self.has_usage:
+      usage = pg.Dict(
+          total_prompt_tokens=self._total_prompt_tokens,
+          total_completion_tokens=self._total_completion_tokens,
+          num_usages=self._num_usages,
+          average_prompt_tokens=self.average_prompt_tokens,
+          average_completion_tokens=self.average_completion_tokens,
+          average_total_tokens=self.average_total_tokens,
+      )
+    else:
+      usage = None
+
     result = pg.Dict(
         experiment_setup=pg.Dict(
             id=self.id,
             dir=self.dir,
             model=self.lm.model_id,
-            prompt_template=
+            prompt_template=pg.decolor(str(self.prompt)),
             method=self.method,
             schema_fn=str(self.schema_fn),
         ),
@@ -1157,56 +1307,183 @@ class Evaluation(Evaluable):
             total=self.num_completed,
             failures=self.num_failures,
             failure_rate=self.failure_rate,
+            oop_failures=self.num_oop_failures,
+            oop_failure_rate=self.oop_failure_rate,
+            non_oop_failures=self.num_non_oop_failures,
+            non_oop_failure_rate=self.non_oop_failure_rate,
+            failure_breakdown=self.failure_breakdown,
         ),
+        usage=usage,
     )
     return result

-  def
+  def summary_card(self) -> str:
+    """Returns summary card in HTML."""
     s = io.StringIO()
     definition = _html_repr(self, compact=False, escape=True)
     s.write('<div><table><tr><td>')
+    self._render_link(
+        s,
+        definition,
+        self.hash,
+        '',
+        lambda: self.link(self.dir),
+    )
     if self.result is None:
       s.write(
-          f'<a target="_blank" title="{definition}" '
-          f'href="{self.link(self.dir)}">{self.hash}</a>'
           '</td></tr><tr><td>'
           '<span style="color: gray">(IN-PROGRESS...)</span>'
       )
     else:
-
-
-
-
-
-
+      if self.dir:
+        s.write(f' [<a href="{self.link(self.dir)}">dir</a>]')
+      s.write('</td></tr><tr><td>')
+      self._render_summary_metrics(s)
+
+      # Summarize average usage.
+      if self.result.usage:
+        self._render_summary_usage(s)
+
     s.write('</td></tr></table></div>')
     return s.getvalue()

-  def
+  def _render_summary_usage(self, s: io.StringIO) -> None:
+    """Renders usage in HTML."""
+    usage = self.result.usage
+    total = usage.total_prompt_tokens + usage.total_completion_tokens
+    s.write(
+        ' <a title="'
+        f'# of usages: {usage.num_usages}&#10;'
+        f'total prompt: {usage.total_prompt_tokens}&#10;'
+        f'total response: {usage.total_completion_tokens}&#10;'
+        f'avg prompt: {usage.average_prompt_tokens}&#10;'
+        f'avg response: {usage.average_completion_tokens}'
+        f'" style="color:gray">({total} tokens)</a>'
+    )
+
+  def _render_link(self,
+                   s: io.StringIO,
+                   title: str,
+                   text: str,
+                   style: str,
+                   url_fn: Callable[[], str]) -> None:
+    """Renders a link in HTML."""
+    s.write(
+        f'<a target="_blank" title="{title}" style="{style}"'
+    )
+    if self.dir:
+      s.write(f' href="{url_fn()}"')
+    s.write(f'>{text}</a>')
+
+  def _render_summary_metrics(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
     m = self.result.metrics
-
-
-
-
-
-
-
-
+
+    # OOP failures.
+    oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
+    if m.oop_failures:
+      oop_failure_title += '&#10;'
+      for name, count in m.failure_breakdown.items():
+        if name.startswith('MappingError'):
+          oop_failure_title += '&#10;%s: %s (%d/%d)' % (
+              name.removeprefix('MappingError.'),
+              self._format_rate(count / m.total),
+              count,
+              m.total,
+          )
+
+    extra_style = ''
+    if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
+      extra_style = ';font-weight:bold'
+    self._render_link(
+        s,
+        oop_failure_title,
+        self._format_rate(m.oop_failure_rate),
+        f'color:magenta{extra_style}',
+        lambda: self.oop_failures_link,
+    )
+    s.write(' | ')
+
+    # Non-OOP failures.
+    non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
+    if m.non_oop_failures:
+      non_oop_failure_title += '&#10;'
+      for name, count in m.failure_breakdown.items():
+        if not name.startswith('MappingError'):
+          non_oop_failure_title += '&#10;%s: %s (%d/%d)' % (
+              name,
+              self._format_rate(count / m.total),
+              count,
+              m.total,
+          )
+
+    extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
+    self._render_link(
+        s,
+        non_oop_failure_title,
+        self._format_rate(m.non_oop_failure_rate),
+        f'color:red{extra_style}',
+        lambda: self.non_oop_failures_link,
     )

-  def
+  def _format_rate(self, rate: float) -> str:
+    """Formats a rate."""
+    return f'%.{self.report_precision}f%% ' % (rate * 100)
+
+  def audit(
+      self,
+      example_idx: int,
+      example: Any,
+      message: lf.Message | None,
+      error: Exception | None = None,
+      dryrun: bool = False,
+  ) -> None:
     """Audits the example against the output. Subclasses should override.

     Args:
+      example_idx: 1-based index of the example in its dataset.
       example: The input object.
-      output: The output from LM. For `lf.call`, if `schema_fn` is not provided,
-        it will be the raw LM response string. Otherwise it will be the
-        structured output from the LM.
       message: The entire message returned by the LM, which could be used to
-        trace the LM input, response and parsed structure.
+        trace the LM input, response and parsed structure. If error is raised
+        before LLM could return a response, None will be its value.
+      error: The exception during processing the example.
+      dryrun: Whether or not audition takes place during dryrun.
     """
+    if error is not None:
+      self._failures.append((example, error))
+
+      # Invalid cache of num_oop_failures.
+      self.__dict__.pop('oop_failures', None)
+      self.__dict__.pop('non_oop_failures', None)
+      self.__dict__.pop('failure_breakdown', None)
+
+      if isinstance(error, lf_structured.MappingError):
+        message = error.lm_response
+    else:
+      assert message is not None
+      output = message.text if self.schema is None else message.result
+      self.audit_processed(example_idx, example, output, message, dryrun=dryrun)
+
+    # Audit usage.
+    if message is not None:
+      self.audit_usage(message, dryrun=dryrun)
+    self._num_completed += 1
+
+  def audit_usage(self, message: lf.Message, dryrun: bool = False) -> None:
+    del dryrun
+    for m in message.trace():
+      usage = m.metadata.get('usage', None)
+      if usage:
+        self._total_prompt_tokens += usage.prompt_tokens
+        self._total_completion_tokens += usage.completion_tokens
+        self._num_usages += 1
+
+  def audit_processed(
+      self, example_idx: int, example: Any, output: Any, message: lf.Message,
+      dryrun: bool = False
+  ) -> None:
+    """Audits a successfully processed example. Subclass should override."""

   def save(
       self, definition: bool = True, result: bool = True, report: bool = True
@@ -1229,16 +1506,26 @@ class Evaluation(Evaluable):
     # Save failures.
     pg.save(
         [
-            pg.Dict(
-
-            )
-            for input, error in self.failures
+            pg.Dict(input=input, error=_format_error(error))
+            for input, error in self.oop_failures
         ],
-        os.path.join(self.dir, Evaluation.
+        os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
     )
     pg.save(
-        self._html([self._render_result, self.
-        os.path.join(self.dir, Evaluation.
+        self._html([self._render_result, self._render_oop_failures]),
+        os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
+        file_format='txt',
+    )
+    pg.save(
+        [
+            pg.Dict(input=input, error=_format_error(error))
+            for input, error in self.non_oop_failures
+        ],
+        os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
+    )
+    pg.save(
+        self._html([self._render_result, self._render_non_oop_failures]),
+        os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
         file_format='txt',
     )

@@ -1250,8 +1537,11 @@ class Evaluation(Evaluable):
         '<td>Prompt</td>'
         '<td>Schema</td>'
         '<td>Additional Args</td>'
-        '<td>Failures</td>'
     )
+    if self.result.usage:
+      s.write('<td>Usage</td>')
+    s.write('<td>OOP Failures</td>')
+    s.write('<td>Non-OOP Failures</td>')

   def _render_result_row(self, s: io.StringIO) -> None:
     s.write(
@@ -1276,13 +1566,32 @@ class Evaluation(Evaluable):
         '<td style="color:purple" '
         f'{_html_repr(self.additional_args, compact=False)}</td>'
     )
-    #
+    # Usage.
+    if self.result.usage:
+      s.write('<td>')
+      self._render_summary_usage(s)
+      s.write('</td>')
+
+    # OOP failures.
+    s.write(
+        '<td><span style="color:magenta">%s</span>%s</td>'
+        % (
+            self._format_rate(self.oop_failure_rate),
+            '<a href="%s">(%d/%d)</a>'
+            % (self.oop_failures_link,
+               self.num_oop_failures,
+               self.num_completed),
+        )
+    )
+    # Non-OOP failures.
     s.write(
-        '<td><span style="color:
+        '<td><span style="color:red">%s</span>%s</td>'
         % (
-
+            self._format_rate(self.non_oop_failure_rate),
            '<a href="%s">(%d/%d)</a>'
-            % (self.
+            % (self.non_oop_failures_link,
+               self.num_non_oop_failures,
+               self.num_completed),
        )
    )

@@ -1296,31 +1605,99 @@ class Evaluation(Evaluable):
     else:
       return 'cyan'

-  def
+  def _render_oop_failures(self, s: io.StringIO) -> None:
+    self._render_failures(s, '^MappingError.*', error_color='magenta')
+
+  def _render_non_oop_failures(self, s: io.StringIO) -> None:
+    self._render_failures(s, '^(?!MappingError).*', error_color='red')
+
+  def _render_failures(
+      self, s: io.StringIO, error_regex: str, error_color: str) -> None:
     """Formats the failed cases into html."""
+    # Failure summary.
     s.write(
-        '<h2>
+        '<h2> Error Summary </h2>'
         '<div style="white-space:pre">\n'
         '<table style="border:1px solid">'
-        '<tr class="header"><td>
+        '<tr class="header"><td>Error type</td><td>Stats</td></tr>'
     )
+    error_regex = re.compile(error_regex)
+    if self.result.metrics.failure_breakdown:
+      for name, count in self.result.metrics.failure_breakdown.items():
+        if not error_regex.match(name):
+          continue
+
+        link = f'<a href="#{name}">{name}</a>'
+        error_rate = self._format_rate(count / self.result.metrics.total)
+        stats = (f'<span style="color:{error_color}">{error_rate} '
+                 f'({count}/{self.result.metrics.total})</span>')
+        s.write(f'<tr><td>{link}</td><td>{stats})</td></tr>')
+    s.write(
+        '</table></div>'
+        '<h2> Failed Cases </h2>'
+        '<div style="white-space:pre">'
+    )
+    # Failure details by error type.
+    failures_by_error = collections.defaultdict(list)
+    for example, error in self.failures:
+      error_name = _error_key(error)
+      if error_regex.match(error_name):
+        failures_by_error[error_name].append((example, error))
+
+    for error_key, failures in failures_by_error.items():
+      s.write(
+          f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
+          f'(count={len(failures)})</h3>'
+          '<table style="border:1px solid">'
+          '<tr class="header"><td>No.</td><td>Input</td>'
+          '<td>LM invocation</td><td>Error</td></tr>'
+      )
+      for i, (example, error) in enumerate(failures):
+        lm_response = None
+        if isinstance(error, lf.structured.MappingError):
+          lm_response = error.lm_response
+          error = error.cause
+
+        bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
+        s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
+        s.write('<td style="color:green;white-space:pre-wrap">')
+        s.write(pg.format(example, verbose=False))
+        s.write('</td><td>')
+        if lm_response is not None:
+          self._render_message(lm_response, s)
+        s.write(f'</td><td style="color:{error_color};white-space:pre">')
+        s.write(_format_error(error))
+        s.write('</td></tr>')
+      s.write('</table>')
+    s.write('</div>')

-
-
-
-
-
-      error_str = lf.text_formatting.decolored(str(error))
-      s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
-      s.write('</tr>')
-    s.write('</table></div>')
+  @classmethod
+  def visualize(cls, evaluations: list['Evaluation']) -> str | None:
+    """Visualize the a list of evaluations of this task in HTML."""
+    del evaluations
+    return None


 @pg.functor()
-def inputs_from(path: str | list[str]) -> list[Any]:
+def inputs_from(path: str | list[str], **kwargs) -> list[Any]:
   """A functor that returns a list of user-defined objects as eval inputs."""
   if isinstance(path, str):
-
+    if path.endswith('.json'):
+      return pg.load(path)
+    elif path.endswith('.jsonl'):
+      return list(iter(pg.open_jsonl(path)))
+    elif path.endswith('.csv'):
+      import pandas as pd # pylint: disable=g-import-not-at-top
+      dataset_df = pd.read_csv(path, **kwargs)
+      dataset = []
+      for i in range(dataset_df.shape[0]):
+        row = {}
+        for col in dataset_df.columns:
+          row[col] = dataset_df.iloc[i][col]
+        dataset.append(row)
+      return dataset
+    else:
+      raise ValueError(f'Unsupported file format: {path}')
   examples = []
   for p in path:
     examples.extend(pg.load(p))
@@ -1374,8 +1751,8 @@ class Summary(pg.Object):
           Type[lf.LanguageModel],
           tuple[lf.LanguageModel | Type[lf.LanguageModel], ...],
       ] = lf.LanguageModel,
-      method: Union[str, tuple[str], None] = None,
-      schema_fn: Union[pg.Functor, tuple[pg.Functor], None] = None,
+      method: Union[str, tuple[str, ...], None] = None,
+      schema_fn: Union[pg.Functor, tuple[pg.Functor, ...], None] = None,
       completed: bool | None = None,
       pivot_field: str | None = None,
   ) -> 'Summary':
@@ -1466,7 +1843,7 @@ class Summary(pg.Object):
         if e is None:
           s.write('<span style="color: gray">N/A<span>')
         else:
-          s.write(e.
+          s.write(e.summary_card())
         s.write('</td>')
       s.write('</tr>')
     s.write('</table>')
@@ -1541,13 +1918,22 @@ class Summary(pg.Object):
     s.write('<html><body>')
     for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
       table_id = task.__name__.lower()
+      evaluations = self.select(task=task).evaluations
+      table = Summary.Table.from_evaluations(evaluations, pivot_field)
       s.write('<div>')
-      s.write(
-
-
-      table = Summary.Table.from_evaluations(
-          self.select(task=task).evaluations, pivot_field
+      s.write(
+          f'<a id="{table_id}" href="#{table_id}">'
+          f'<h2>{task.__name__}</h2></a>'
       )
+
+      # Allow users to plugin visualization code (e.g. matplot) in the summary
+      # page.
+      visual_part = task.visualize(evaluations)
+      if visual_part:
+        s.write(visual_part)
+
+      s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
+      s.write('<hr/>')
       s.write(table.html())
       s.write('</div>')
     s.write('</body></html>')
@@ -1556,8 +1942,36 @@ class Summary(pg.Object):
   def _repr_html_(self) -> str:
     return self.html()

+  def json(
+      self,
+  ) -> dict[
+      str, # Task name
+      list[pg.Dict], # List of pg.Dict with `experiment` and `metrics`.
+  ]:
+    """Returns the JSON representation of the summary."""
+    task_results = {}
+    for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
+      results = []
+      for entry in self.select(task=task).evaluations:
+        results.append(
+            pg.Dict(
+                id=entry.id,
+                experiment=entry,
+                dir=entry.dir,
+                metrics=entry.result.metrics if entry.result else None,
+                usage=entry.result.usage if entry.result else None,
+            )
+        )
+      task_results[task.__name__] = results
+    return task_results
+
   def save(self, file: str, pivot_field: str | None = None) -> None:
     pg.save(self.html(pivot_field), file, file_format='txt')
+    if file.endswith('.html'):
+      json_file = file.replace('.html', '.json')
+    else:
+      json_file = os.path.join(file, '.json')
+    pg.save(self.json(), json_file)

   @classmethod
   def from_dirs(
@@ -1694,6 +2108,20 @@ class Summary(pg.Object):
   return result.join()


+def _format_error(error: Exception):
+  """Formats an error into a string."""
+  return (f'({error.__class__.__name__}) ' + pg.decolor(str(error)))
+
+
+def _error_key(error: Exception) -> str:
+  """Returns the key for an error."""
+  error_names = []
+  while error is not None:
+    error_names.append(error.__class__.__name__)
+    error = getattr(error, 'cause', None)
+  return '.'.join(error_names)
+
+
 def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
   """Formats prompt in HTML."""
   if type(value) is lf.Template: # pylint: disable=unidiomatic-typecheck
@@ -1768,3 +2196,202 @@ def monitor_async(
       scan_interval=scan_interval,
       refresh_when_stop=refresh_when_stop,
   )
+
+
+#
+# Named evaluations and experiments support.
+#
+
+
+class _NamedEvaluationRegistry:
+  """Named evaluation registry."""
+
+  def __init__(self):
+    self._registry = {}
+
+  def names(self) -> list[str]:
+    """Returns all registered names."""
+    return sorted(self._registry.keys())
+
+  def get(self, name: str) -> list[Type[Evaluable]]:
+    """Gets an evaluation by name."""
+    matches = []
+    if name in self._registry:
+      matches.append(self._registry[name])
+    else:
+      regex = re.compile(name)
+      for key, cls in self._registry.items():
+        if regex.match(key):
+          matches.append(cls)
+    return matches
+
+  def register(
+      self,
+      name: str,
+      experiment_cls: Type[Evaluable],
+  ):
+    """Register an experiment class."""
+    self._registry[name] = experiment_cls
+
+
+_eval_registry = _NamedEvaluationRegistry()
+
+
+def registered_names() -> list[str]:
+  """Returns all registered names."""
+  return _eval_registry.names()
+
+
+def get_evaluations(evaluation: str | Evaluable) -> list[Evaluable]:
+  """Gets an evaluation experiment by name."""
+  if isinstance(evaluation, str):
+    return [e() for e in _eval_registry.get(evaluation)]
+  return [evaluation]
+
+
+def register(name: str):
+  """Decorator to create a named evaluation class."""
+
+  def _register(func_or_cls: Type[Evaluation] | types.FunctionType):
+    if inspect.isfunction(func_or_cls):
+      e = func_or_cls()
+      if not isinstance(e, Evaluable):
+        raise TypeError(
+            f'The return value of `{func_or_cls}` should be an instance of '
+            '`lf.eval.Evaluable` subclass.'
+        )
+
+      class GeneratedSuite(Suite):
+        # NOTE(daiyip): Delay serialization key registration for generated
+        # class.
+        auto_register = False
+        children = e.children if isinstance(e, Suite) else [e]
+
+      cls = GeneratedSuite
+      cls.__name__ = func_or_cls.__name__
+      cls.__doc__ = func_or_cls.__doc__
+      cls.__qualname__ = func_or_cls.__qualname__
+      cls.__module__ = getattr(func_or_cls, '__module__', 'wrapper')
+      cls.register_for_deserialization(cls.__type_name__)
+
+    elif issubclass(func_or_cls, Evaluable):
+      cls = func_or_cls
+    else:
+      raise ValueError(f'Unsupported type: {type(func_or_cls)}')
+
+    _eval_registry.register(name, cls)
+    return cls
+
+  return _register
+
+
+def get(
+    root_dir: str,
+    evaluations: list[str | Evaluable],
+    filter: Union[ # pylint: disable=redefined-builtin
+        str, # Regex to filter evaluation based on ID.
+        Callable[[Evaluable], bool], # Custom filter function.
+        None # No filtering (Default).
+    ] = None, # pylint: disable=bad-whitespace
+    patches: list[Union[
+        str, # String-based PyGlove patcher.
+        pg.patching.Patcher, # PyGlove patcher object.
+        Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+    ]] | None = None, # pylint: disable=bad-whitespace
+) -> Suite:
+  """Gets a suite from a list of patched evaluations.
+
+  Args:
+    root_dir: The root directory of the experiment.
+    evaluations: A list of evaluations to be included in the suite.
+    filter: A regular expression (str) for selecting sub-experiments of matched
+      IDs, or a filter function to filter the evaluations.
+    patches: A list of patches to be applied to the suite. Each element can be
+      a string (for string-based patcher), a `pg.patching.Patcher` object, or
+      a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+      details.
+
+  Returns:
+    A suite of selected `lf.eval.Evaluation` objects.
+  """
+  matches = []
+  for e in evaluations:
+    matches.extend(get_evaluations(e))
+
+  if not matches:
+    raise ValueError('No evaluations found.')
+
+  suite = Suite(matches, root_dir=root_dir)
+  if patches:
+    suite = pg.patch(suite, patches)
+
+  if isinstance(filter, str):
+    regex = re.compile(filter)
+    filter = lambda x: bool(regex.match(x.id))
+
+  if filter:
+    suite = Suite(
+        [leaf for leaf in suite.leaf_nodes if filter(leaf)], root_dir=root_dir)
+  return suite
+
+
+def run(
+    root_dir: str,
+    evaluations: list[str | Evaluable],
+    filter: Union[ # pylint: disable=redefined-builtin
+        str, # Regex to filter evaluation based on ID.
+        Callable[[Evaluable], bool], # Custom filter function.
+        None # No filtering (Default).
+    ] = None, # pylint: disable=bad-whitespace
+    patches: list[Union[
+        str, # String-based PyGlove patcher.
+        pg.patching.Patcher, # PyGlove patcher object.
+        Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+    ]] | None = None, # pylint: disable=bad-whitespace
+    mode: Literal['run', 'rerun', 'dryrun', 'noop'] = 'run',
+    debug: bool = False,
+    print_definition: bool = False,
+    **kwargs,
+) -> Suite:
+  """Run selected evaluations with patching.
+
+  Args:
+    root_dir: The root directory of the experiment.
+    evaluations: A list of evaluations to be included in the suite.
+    filter: A regular expression (str) for selecting sub-experiments of matched
+      IDs, or a filter function to filter the evaluations.
+    patches: A list of patches to be applied to the suite. Each element can be
+      a string (for string-based patcher), a `pg.patching.Patcher` object, or
+      a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+      details.
+    mode: The mode to run the suite. "run" to run the suite, with reusing
+      existing results if available; "rerun" to rerun all evaluations even if
+      there are existing results; "dryrun" to dryrun the suite; and "noop"
+      to do nothing.
+    debug: Whether to run in debug mode.
+    print_definition: Whether to print the experiment definition.
+    **kwargs: Additional arguments to be passed to dryrun/run the suite.
+
+  Returns:
+    A suite of selected `lf.eval.Evaluation` objects.
+  """
+  suite = get(root_dir, evaluations, patches=patches, filter=filter)
+  if print_definition:
+    lf.console.write(
+        pg.format(
+            suite,
+            compact=False,
+            verbose=False,
+            hide_default_values=True,
+            python_format=True,
+        ),
+        title='[EXPERIMENT DEFINITION]',
+        color='blue',
+    )
+
+  if mode == 'run':
+    rerun = mode == 'rerun'
+    suite.run(debug=debug, rerun=rerun, **kwargs)
+  elif mode == 'dryrun':
+    suite.dryrun(debug=debug, **kwargs)
+  return suite
```