langfun 0.0.2.dev20240428__tar.gz → 0.0.2.dev20240430__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/PKG-INFO +1 -1
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/eval/base.py +310 -73
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/eval/base_test.py +96 -45
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/eval/matching.py +22 -21
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/eval/matching_test.py +23 -2
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/eval/scoring.py +4 -4
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/eval/scoring_test.py +19 -2
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/openai.py +1 -1
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/openai_test.py +2 -1
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun.egg-info/PKG-INFO +1 -1
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/LICENSE +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/README.md +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/correction.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/correction_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/errors.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/errors_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/execution.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/execution_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/generation.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/generation_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/parsing.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/parsing_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/permissions.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/permissions_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/component.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/component_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/concurrent.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/concurrent_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/console.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/console_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/eval/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/langfunc.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/langfunc_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/language_model.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/language_model_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/anthropic.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/anthropic_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/cache/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/cache/base.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/cache/in_memory.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/cache/in_memory_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/fake.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/fake_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/google_genai.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/google_genai_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/groq.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/groq_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/llama_cpp.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/llms/llama_cpp_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/memories/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/memories/conversation_history.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/memories/conversation_history_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/memory.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/message.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/message_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/modalities/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/modalities/image.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/modalities/image_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/modalities/mime.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/modalities/mime_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/modalities/video.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/modalities/video_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/modality.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/modality_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/natural_language.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/natural_language_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/sampling.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/sampling_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/completion.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/completion_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/description.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/description_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/function_generation.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/function_generation_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/mapping.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/mapping_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/parsing.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/parsing_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/prompting.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/prompting_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/schema.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/schema_generation.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/schema_generation_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/schema_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/scoring.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/structured/scoring_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/subscription.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/subscription_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/template.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/template_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/templates/__init__.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/templates/completion.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/templates/completion_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/templates/conversation.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/templates/conversation_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/templates/demonstration.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/templates/demonstration_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/templates/selfplay.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/templates/selfplay_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/text_formatting.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun/core/text_formatting_test.py +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun.egg-info/SOURCES.txt +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun.egg-info/dependency_links.txt +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun.egg-info/requires.txt +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/langfun.egg-info/top_level.txt +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/setup.cfg +0 -0
- {langfun-0.0.2.dev20240428 → langfun-0.0.2.dev20240430}/setup.py +0 -0
@@ -18,6 +18,7 @@ import collections
|
|
18
18
|
import dataclasses
|
19
19
|
import functools
|
20
20
|
import hashlib
|
21
|
+
import html
|
21
22
|
import inspect
|
22
23
|
import io
|
23
24
|
import os
|
@@ -40,7 +41,8 @@ class Evaluable(lf.Component):
|
|
40
41
|
|
41
42
|
EXPERIMENT_JSON = 'experiment.json'
|
42
43
|
RESULT_JSON = 'result.json'
|
43
|
-
|
44
|
+
OOP_FAILURES_JSON = 'oop_failures.json'
|
45
|
+
NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
|
44
46
|
INDEX_HTML = 'index.html'
|
45
47
|
SUMMARY_HTML = 'summary.html'
|
46
48
|
|
@@ -358,7 +360,7 @@ class Evaluable(lf.Component):
|
|
358
360
|
color='yellow')
|
359
361
|
|
360
362
|
for node in self.nonleaf_nodes:
|
361
|
-
node._result = {c.id: c.result for c in node.
|
363
|
+
node._result = {c.id: c.result for c in node.leaf_nodes} # pylint: disable=protected-access
|
362
364
|
if should_save:
|
363
365
|
node.save(result=False, report=False)
|
364
366
|
|
@@ -540,13 +542,13 @@ class Evaluable(lf.Component):
|
|
540
542
|
f'<div style="color: {text_color}; white-space: pre-wrap;'
|
541
543
|
'padding: 10px; border: 1px solid; margin-top: 10px">'
|
542
544
|
)
|
543
|
-
s.write(m.get('formatted_text', m.text))
|
545
|
+
s.write(html.escape(m.get('formatted_text', m.text)))
|
544
546
|
if m.result is not None:
|
545
547
|
s.write(
|
546
548
|
'<div style="color: magenta; white-space: pre-wrap;'
|
547
549
|
'padding: 10px; border: 1px solid; margin: 10px">'
|
548
550
|
)
|
549
|
-
s.write(pg.format(m.result))
|
551
|
+
s.write(html.escape(pg.format(m.result)))
|
550
552
|
s.write('</div>')
|
551
553
|
if 'usage' in m.metadata:
|
552
554
|
s.write(
|
@@ -753,10 +755,12 @@ class Evaluation(Evaluable):
|
|
753
755
|
|
754
756
|
# Constants.
|
755
757
|
CACHE_JSON = 'cache.json'
|
756
|
-
|
758
|
+
OOP_FAILURES_HTML = 'oop_failures.html'
|
759
|
+
NON_OOP_FAILURES_HTML = 'non_oop_failures.html'
|
757
760
|
|
758
761
|
@functools.cached_property
|
759
762
|
def hash(self) -> str:
|
763
|
+
"""Returns the semantic-based hash of the evaluation."""
|
760
764
|
if self.is_deterministic:
|
761
765
|
identity = pg.format(self._identifiers(), compact=True)
|
762
766
|
else:
|
@@ -805,6 +809,10 @@ class Evaluation(Evaluable):
|
|
805
809
|
"""Returns the complete rate."""
|
806
810
|
return self.num_completed / self.num_examples
|
807
811
|
|
812
|
+
#
|
813
|
+
# Properties on failures.
|
814
|
+
#
|
815
|
+
|
808
816
|
@property
|
809
817
|
def failures(self) -> list[tuple[Any, Exception]]:
|
810
818
|
"""Returns the failed examples and their errors."""
|
@@ -815,6 +823,15 @@ class Evaluation(Evaluable):
|
|
815
823
|
"""Returns the number of failed examples."""
|
816
824
|
return len(self.failures)
|
817
825
|
|
826
|
+
@functools.cached_property
|
827
|
+
def failure_breakdown(self) -> dict[str, int]:
|
828
|
+
"""Returns the breakdown of failures."""
|
829
|
+
breakdown = collections.defaultdict(int)
|
830
|
+
for _, error in self.failures:
|
831
|
+
breakdown[_error_key(error)] += 1
|
832
|
+
sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
|
833
|
+
return pg.Dict({x[0]: x[1] for x in sorted_items})
|
834
|
+
|
818
835
|
@property
|
819
836
|
def failure_rate(self) -> float:
|
820
837
|
"""Returns the failure rate in range [0, 1]."""
|
@@ -822,6 +839,46 @@ class Evaluation(Evaluable):
|
|
822
839
|
return 0.0
|
823
840
|
return self.num_failures / self.num_completed
|
824
841
|
|
842
|
+
@functools.cached_property
|
843
|
+
def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
|
844
|
+
"""Returns the OOP failures."""
|
845
|
+
return [item for item in self.failures
|
846
|
+
if isinstance(item[1], lf_structured.MappingError)]
|
847
|
+
|
848
|
+
@property
|
849
|
+
def num_oop_failures(self) -> int:
|
850
|
+
"""Returns the number of OOP failures."""
|
851
|
+
return len(self.oop_failures)
|
852
|
+
|
853
|
+
@property
|
854
|
+
def oop_failure_rate(self) -> float:
|
855
|
+
"""Returns the OOP failure rate in range [0, 1]."""
|
856
|
+
if self.num_completed == 0:
|
857
|
+
return 0.0
|
858
|
+
return self.num_oop_failures / self.num_completed
|
859
|
+
|
860
|
+
@functools.cached_property
|
861
|
+
def non_oop_failures(self) -> list[tuple[Any, Exception]]:
|
862
|
+
"""Returns the OOP failures."""
|
863
|
+
return [item for item in self.failures
|
864
|
+
if not isinstance(item[1], lf_structured.MappingError)]
|
865
|
+
|
866
|
+
@property
|
867
|
+
def num_non_oop_failures(self) -> int:
|
868
|
+
"""Returns the number of non-OOP failures."""
|
869
|
+
return len(self.non_oop_failures)
|
870
|
+
|
871
|
+
@property
|
872
|
+
def non_oop_failure_rate(self) -> float:
|
873
|
+
"""Returns the non-OOP failure rate in range [0, 1]."""
|
874
|
+
if self.num_completed == 0:
|
875
|
+
return 0.0
|
876
|
+
return self.num_non_oop_failures / self.num_completed
|
877
|
+
|
878
|
+
#
|
879
|
+
# Properties on usage.
|
880
|
+
#
|
881
|
+
|
825
882
|
@property
|
826
883
|
def has_usage(self) -> bool:
|
827
884
|
"""Returns True if token usage is enabled."""
|
@@ -976,13 +1033,22 @@ class Evaluation(Evaluable):
|
|
976
1033
|
self._total_prompt_tokens = 0
|
977
1034
|
self._total_completion_tokens = 0
|
978
1035
|
self._num_usages = 0
|
1036
|
+
self.__dict__.pop('oop_failures', None)
|
1037
|
+
self.__dict__.pop('non_oop_failures', None)
|
979
1038
|
|
980
1039
|
@property
|
981
|
-
def
|
982
|
-
"""Returns the link to the failures page."""
|
1040
|
+
def oop_failures_link(self) -> str | None:
|
1041
|
+
"""Returns the link to the OOP failures page."""
|
983
1042
|
if self.dir is None:
|
984
1043
|
return None
|
985
|
-
return self.link(os.path.join(self.dir, Evaluation.
|
1044
|
+
return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))
|
1045
|
+
|
1046
|
+
@property
|
1047
|
+
def non_oop_failures_link(self) -> str | None:
|
1048
|
+
"""Returns the link to then non-OOP failures page."""
|
1049
|
+
if self.dir is None:
|
1050
|
+
return None
|
1051
|
+
return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))
|
986
1052
|
|
987
1053
|
def _dryrun(
|
988
1054
|
self,
|
@@ -1011,23 +1077,34 @@ class Evaluation(Evaluable):
|
|
1011
1077
|
color='green',
|
1012
1078
|
)
|
1013
1079
|
|
1014
|
-
|
1015
|
-
output_message = copy.process(example, **(self.additional_args or {}))
|
1016
|
-
if self.schema is None:
|
1017
|
-
output = output_message.text
|
1018
|
-
else:
|
1019
|
-
output = output_message.result
|
1080
|
+
error, output_message = None, None
|
1020
1081
|
|
1021
|
-
|
1082
|
+
try:
|
1083
|
+
with lf.use_settings(debug=debug):
|
1084
|
+
output_message = copy.process(example, **(self.additional_args or {}))
|
1085
|
+
if self.schema is None:
|
1086
|
+
output = output_message.text
|
1087
|
+
else:
|
1088
|
+
output = output_message.result
|
1089
|
+
|
1090
|
+
if verbose:
|
1091
|
+
lf.console.write('')
|
1092
|
+
lf.console.write(
|
1093
|
+
str(output),
|
1094
|
+
title='OUTPUT',
|
1095
|
+
color='blue',
|
1096
|
+
)
|
1097
|
+
except lf_structured.MappingError as e:
|
1022
1098
|
lf.console.write('')
|
1023
1099
|
lf.console.write(
|
1024
|
-
str(
|
1025
|
-
title='
|
1026
|
-
color='
|
1100
|
+
str(e),
|
1101
|
+
title='ERROR',
|
1102
|
+
color='red',
|
1027
1103
|
)
|
1104
|
+
error = e
|
1028
1105
|
|
1029
|
-
copy.audit(example, output_message,
|
1030
|
-
result = copy.
|
1106
|
+
copy.audit(example, output_message, error, dryrun=True)
|
1107
|
+
result = copy.finalize()
|
1031
1108
|
|
1032
1109
|
if verbose:
|
1033
1110
|
lf.console.write('')
|
@@ -1087,7 +1164,7 @@ class Evaluation(Evaluable):
|
|
1087
1164
|
self.cache.save()
|
1088
1165
|
|
1089
1166
|
# Summarize result.
|
1090
|
-
self._result = self.
|
1167
|
+
self._result = self.finalize()
|
1091
1168
|
if verbose:
|
1092
1169
|
lf.console.write(
|
1093
1170
|
str(self.result),
|
@@ -1143,13 +1220,13 @@ class Evaluation(Evaluable):
|
|
1143
1220
|
def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
|
1144
1221
|
return {
|
1145
1222
|
'Model': self.lm.model_id,
|
1146
|
-
'Succeeded':
|
1147
|
-
progress.success_rate
|
1223
|
+
'Succeeded': '%s (%d/%d)' % (
|
1224
|
+
self._format_rate(progress.success_rate),
|
1148
1225
|
progress.succeeded,
|
1149
1226
|
progress.completed,
|
1150
1227
|
),
|
1151
|
-
'Failed':
|
1152
|
-
progress.failure_rate
|
1228
|
+
'Failed': '%s (%d/%d)' % (
|
1229
|
+
self._format_rate(progress.failure_rate),
|
1153
1230
|
progress.failed,
|
1154
1231
|
progress.completed,
|
1155
1232
|
),
|
@@ -1159,21 +1236,20 @@ class Evaluation(Evaluable):
|
|
1159
1236
|
assert self.result is not None
|
1160
1237
|
m = self.result.metrics
|
1161
1238
|
return (
|
1162
|
-
|
1163
|
-
f' Failures=%.{self.report_precision}f%% (%d/%d)'
|
1239
|
+
'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
|
1164
1240
|
% (
|
1165
1241
|
run_status,
|
1166
|
-
(1 - m.failure_rate)
|
1242
|
+
self._format_rate(1 - m.failure_rate),
|
1167
1243
|
m.total - m.failures,
|
1168
1244
|
m.total,
|
1169
|
-
m.failure_rate
|
1245
|
+
self._format_rate(m.failure_rate),
|
1170
1246
|
m.failures,
|
1171
1247
|
m.total,
|
1172
1248
|
)
|
1173
1249
|
)
|
1174
1250
|
|
1175
|
-
def
|
1176
|
-
"""
|
1251
|
+
def finalize(self) -> pg.Dict:
|
1252
|
+
"""Finalizes the evaluation result."""
|
1177
1253
|
if self.cache:
|
1178
1254
|
cache_stats = dict(
|
1179
1255
|
use_cache=True,
|
@@ -1210,12 +1286,18 @@ class Evaluation(Evaluable):
|
|
1210
1286
|
total=self.num_completed,
|
1211
1287
|
failures=self.num_failures,
|
1212
1288
|
failure_rate=self.failure_rate,
|
1289
|
+
oop_failures=self.num_oop_failures,
|
1290
|
+
oop_failure_rate=self.oop_failure_rate,
|
1291
|
+
non_oop_failures=self.num_non_oop_failures,
|
1292
|
+
non_oop_failure_rate=self.non_oop_failure_rate,
|
1293
|
+
failure_breakdown=self.failure_breakdown,
|
1213
1294
|
),
|
1214
1295
|
usage=usage,
|
1215
1296
|
)
|
1216
1297
|
return result
|
1217
1298
|
|
1218
|
-
def
|
1299
|
+
def summary_card(self) -> str:
|
1300
|
+
"""Returns summary card in HTML."""
|
1219
1301
|
s = io.StringIO()
|
1220
1302
|
definition = _html_repr(self, compact=False, escape=True)
|
1221
1303
|
s.write('<div><table><tr><td>')
|
@@ -1230,18 +1312,19 @@ class Evaluation(Evaluable):
|
|
1230
1312
|
s.write(
|
1231
1313
|
f'<a target="_blank" title="{definition}" '
|
1232
1314
|
f'href="{self.index_link}">{self.hash}</a>'
|
1315
|
+
f' [<a href="{self.link(self.dir)}">dir</a>]'
|
1233
1316
|
'</td></tr><tr><td>'
|
1234
1317
|
)
|
1235
|
-
self.
|
1318
|
+
self._render_summary_metrics(s)
|
1236
1319
|
|
1237
1320
|
# Summarize average usage.
|
1238
1321
|
if self.result.usage is not None:
|
1239
|
-
self.
|
1322
|
+
self._render_summary_usage(s)
|
1240
1323
|
|
1241
1324
|
s.write('</td></tr></table></div>')
|
1242
1325
|
return s.getvalue()
|
1243
1326
|
|
1244
|
-
def
|
1327
|
+
def _render_summary_usage(self, s: io.StringIO) -> None:
|
1245
1328
|
"""Renders usage in HTML."""
|
1246
1329
|
usage = self.result.usage
|
1247
1330
|
total = usage.total_prompt_tokens + usage.total_completion_tokens
|
@@ -1255,20 +1338,66 @@ class Evaluation(Evaluable):
|
|
1255
1338
|
f'" style="color:gray">({total} tokens)</a>'
|
1256
1339
|
)
|
1257
1340
|
|
1258
|
-
def
|
1341
|
+
def _render_summary_metrics(self, s: io.StringIO) -> None:
|
1259
1342
|
"""Renders metrics in HTML."""
|
1260
1343
|
assert self.result is not None
|
1261
1344
|
m = self.result.metrics
|
1345
|
+
|
1346
|
+
# OOP failures.
|
1347
|
+
oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
|
1348
|
+
if m.oop_failures:
|
1349
|
+
oop_failure_title += '
'
|
1350
|
+
for name, count in m.failure_breakdown.items():
|
1351
|
+
if name.startswith('MappingError'):
|
1352
|
+
oop_failure_title += '
%s: %s (%d/%d)' % (
|
1353
|
+
name.removeprefix('MappingError.'),
|
1354
|
+
self._format_rate(count / m.total),
|
1355
|
+
count,
|
1356
|
+
m.total,
|
1357
|
+
)
|
1358
|
+
|
1359
|
+
extra_style = ''
|
1360
|
+
if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
|
1361
|
+
extra_style = ';font-weight:bold'
|
1262
1362
|
s.write(
|
1263
|
-
'<a title="
|
1363
|
+
'<a title="%s" href="%s" style="color:magenta%s">%s</a>'
|
1264
1364
|
% (
|
1265
|
-
|
1266
|
-
|
1267
|
-
|
1268
|
-
|
1365
|
+
oop_failure_title,
|
1366
|
+
self.oop_failures_link,
|
1367
|
+
extra_style,
|
1368
|
+
self._format_rate(m.oop_failure_rate),
|
1369
|
+
)
|
1370
|
+
)
|
1371
|
+
s.write(' | ')
|
1372
|
+
|
1373
|
+
# Non-OOP failures.
|
1374
|
+
non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
|
1375
|
+
if m.non_oop_failures:
|
1376
|
+
non_oop_failure_title += '
'
|
1377
|
+
for name, count in m.failure_breakdown.items():
|
1378
|
+
if not name.startswith('MappingError'):
|
1379
|
+
non_oop_failure_title += '
%s: %s (%d/%d)' % (
|
1380
|
+
name,
|
1381
|
+
self._format_rate(count / m.total),
|
1382
|
+
count,
|
1383
|
+
m.total,
|
1384
|
+
)
|
1385
|
+
|
1386
|
+
extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
|
1387
|
+
s.write(
|
1388
|
+
'<a title="%s" href="%s" style="color:red%s">%s</a>'
|
1389
|
+
% (
|
1390
|
+
non_oop_failure_title,
|
1391
|
+
self.non_oop_failures_link,
|
1392
|
+
extra_style,
|
1393
|
+
self._format_rate(m.non_oop_failure_rate),
|
1269
1394
|
)
|
1270
1395
|
)
|
1271
1396
|
|
1397
|
+
def _format_rate(self, rate: float) -> str:
|
1398
|
+
"""Formats a rate."""
|
1399
|
+
return f'%.{self.report_precision}f%% ' % (rate * 100)
|
1400
|
+
|
1272
1401
|
def audit(
|
1273
1402
|
self,
|
1274
1403
|
example: Any,
|
@@ -1287,7 +1416,13 @@ class Evaluation(Evaluable):
|
|
1287
1416
|
dryrun: Whether or not auditing takes place during dryrun.
|
1288
1417
|
"""
|
1289
1418
|
if error is not None:
|
1290
|
-
self._failures.append((example,
|
1419
|
+
self._failures.append((example, error))
|
1420
|
+
|
1421
|
+
# Invalidate the cached `oop_failures`, `non_oop_failures` and `failure_breakdown`.
|
1422
|
+
self.__dict__.pop('oop_failures', None)
|
1423
|
+
self.__dict__.pop('non_oop_failures', None)
|
1424
|
+
self.__dict__.pop('failure_breakdown', None)
|
1425
|
+
|
1291
1426
|
if isinstance(error, lf_structured.MappingError):
|
1292
1427
|
message = error.lm_response
|
1293
1428
|
else:
|
@@ -1333,16 +1468,26 @@ class Evaluation(Evaluable):
|
|
1333
1468
|
# Save failures.
|
1334
1469
|
pg.save(
|
1335
1470
|
[
|
1336
|
-
pg.Dict(
|
1337
|
-
|
1338
|
-
|
1339
|
-
|
1471
|
+
pg.Dict(input=input, error=_format_error(error))
|
1472
|
+
for input, error in self.oop_failures
|
1473
|
+
],
|
1474
|
+
os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
|
1475
|
+
)
|
1476
|
+
pg.save(
|
1477
|
+
self._html([self._render_result, self._render_oop_failures]),
|
1478
|
+
os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
|
1479
|
+
file_format='txt',
|
1480
|
+
)
|
1481
|
+
pg.save(
|
1482
|
+
[
|
1483
|
+
pg.Dict(input=input, error=_format_error(error))
|
1484
|
+
for input, error in self.non_oop_failures
|
1340
1485
|
],
|
1341
|
-
os.path.join(self.dir, Evaluation.
|
1486
|
+
os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
|
1342
1487
|
)
|
1343
1488
|
pg.save(
|
1344
|
-
self._html([self._render_result, self.
|
1345
|
-
os.path.join(self.dir, Evaluation.
|
1489
|
+
self._html([self._render_result, self._render_non_oop_failures]),
|
1490
|
+
os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
|
1346
1491
|
file_format='txt',
|
1347
1492
|
)
|
1348
1493
|
|
@@ -1357,7 +1502,8 @@ class Evaluation(Evaluable):
|
|
1357
1502
|
)
|
1358
1503
|
if self.result.usage is not None:
|
1359
1504
|
s.write('<td>Usage</td>')
|
1360
|
-
s.write('<td>Failures</td>')
|
1505
|
+
s.write('<td>OOP Failures</td>')
|
1506
|
+
s.write('<td>Non-OOP Failures</td>')
|
1361
1507
|
|
1362
1508
|
def _render_result_row(self, s: io.StringIO) -> None:
|
1363
1509
|
s.write(
|
@@ -1385,16 +1531,29 @@ class Evaluation(Evaluable):
|
|
1385
1531
|
# Usage.
|
1386
1532
|
if self.result.usage is not None:
|
1387
1533
|
s.write('<td>')
|
1388
|
-
self.
|
1534
|
+
self._render_summary_usage(s)
|
1389
1535
|
s.write('</td>')
|
1390
1536
|
|
1391
|
-
#
|
1537
|
+
# OOP failures.
|
1538
|
+
s.write(
|
1539
|
+
'<td><span style="color:magenta">%s</span>%s</td>'
|
1540
|
+
% (
|
1541
|
+
self._format_rate(self.oop_failure_rate),
|
1542
|
+
'<a href="%s">(%d/%d)</a>'
|
1543
|
+
% (self.oop_failures_link,
|
1544
|
+
self.num_oop_failures,
|
1545
|
+
self.num_completed),
|
1546
|
+
)
|
1547
|
+
)
|
1548
|
+
# Non-OOP failures.
|
1392
1549
|
s.write(
|
1393
|
-
'<td><span style="color:
|
1550
|
+
'<td><span style="color:red">%s</span>%s</td>'
|
1394
1551
|
% (
|
1395
|
-
|
1552
|
+
self._format_rate(self.non_oop_failure_rate),
|
1396
1553
|
'<a href="%s">(%d/%d)</a>'
|
1397
|
-
% (self.
|
1554
|
+
% (self.non_oop_failures_link,
|
1555
|
+
self.num_non_oop_failures,
|
1556
|
+
self.num_completed),
|
1398
1557
|
)
|
1399
1558
|
)
|
1400
1559
|
|
@@ -1408,24 +1567,77 @@ class Evaluation(Evaluable):
|
|
1408
1567
|
else:
|
1409
1568
|
return 'cyan'
|
1410
1569
|
|
1411
|
-
def
|
1570
|
+
def _render_oop_failures(self, s: io.StringIO) -> None:
|
1571
|
+
self._render_failures(s, '^MappingError.*', error_color='magenta')
|
1572
|
+
|
1573
|
+
def _render_non_oop_failures(self, s: io.StringIO) -> None:
|
1574
|
+
self._render_failures(s, '^(?!MappingError).*', error_color='red')
|
1575
|
+
|
1576
|
+
def _render_failures(
|
1577
|
+
self, s: io.StringIO, error_regex: str, error_color: str) -> None:
|
1412
1578
|
"""Formats the failed cases into html."""
|
1579
|
+
# Failure summary.
|
1413
1580
|
s.write(
|
1414
|
-
'<h2>
|
1581
|
+
'<h2> Error Summary </h2>'
|
1415
1582
|
'<div style="white-space:pre">\n'
|
1416
1583
|
'<table style="border:1px solid">'
|
1417
|
-
'<tr class="header"><td>
|
1584
|
+
'<tr class="header"><td>Error type</td><td>Stats</td></tr>'
|
1418
1585
|
)
|
1586
|
+
error_regex = re.compile(error_regex)
|
1587
|
+
if self.result.metrics.failure_breakdown:
|
1588
|
+
for name, count in self.result.metrics.failure_breakdown.items():
|
1589
|
+
if not error_regex.match(name):
|
1590
|
+
continue
|
1591
|
+
|
1592
|
+
link = f'<a href="#{name}">{name}</a>'
|
1593
|
+
error_rate = self._format_rate(count / self.result.metrics.total)
|
1594
|
+
stats = (f'<span style="color:{error_color}">{error_rate} '
|
1595
|
+
f'({count}/{self.result.metrics.total})</span>')
|
1596
|
+
s.write(f'<tr><td>{link}</td><td>{stats})</td></tr>')
|
1597
|
+
s.write(
|
1598
|
+
'</table></div>'
|
1599
|
+
'<h2> Failed Cases </h2>'
|
1600
|
+
'<div style="white-space:pre">'
|
1601
|
+
)
|
1602
|
+
# Failure details by error type.
|
1603
|
+
failures_by_error = collections.defaultdict(list)
|
1604
|
+
for example, error in self.failures:
|
1605
|
+
error_name = _error_key(error)
|
1606
|
+
if error_regex.match(error_name):
|
1607
|
+
failures_by_error[error_name].append((example, error))
|
1608
|
+
|
1609
|
+
for error_key, failures in failures_by_error.items():
|
1610
|
+
s.write(
|
1611
|
+
f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
|
1612
|
+
f'(count={len(failures)})</h3>'
|
1613
|
+
'<table style="border:1px solid">'
|
1614
|
+
'<tr class="header"><td>No.</td><td>Input</td>'
|
1615
|
+
'<td>LM invocation</td><td>Error</td></tr>'
|
1616
|
+
)
|
1617
|
+
for i, (example, error) in enumerate(failures):
|
1618
|
+
lm_response = None
|
1619
|
+
if isinstance(error, lf.structured.MappingError):
|
1620
|
+
lm_response = error.lm_response
|
1621
|
+
error = error.cause
|
1622
|
+
|
1623
|
+
bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
|
1624
|
+
s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
|
1625
|
+
s.write('<td style="color:green;white-space:pre-wrap">')
|
1626
|
+
s.write(pg.format(example, verbose=False))
|
1627
|
+
s.write('</td><td>')
|
1628
|
+
if lm_response is not None:
|
1629
|
+
self._render_message(lm_response, s)
|
1630
|
+
s.write(f'</td><td style="color:{error_color};white-space:pre">')
|
1631
|
+
s.write(_format_error(error))
|
1632
|
+
s.write('</td></tr>')
|
1633
|
+
s.write('</table>')
|
1634
|
+
s.write('</div>')
|
1419
1635
|
|
1420
|
-
|
1421
|
-
|
1422
|
-
|
1423
|
-
|
1424
|
-
|
1425
|
-
error_str = lf.text_formatting.decolored(str(error))
|
1426
|
-
s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
|
1427
|
-
s.write('</tr>')
|
1428
|
-
s.write('</table></div>')
|
1636
|
+
@classmethod
|
1637
|
+
def visualize(cls, evaluations: list['Evaluation']) -> str | None:
|
1638
|
+
"""Visualize the a list of evaluations of this task in HTML."""
|
1639
|
+
del evaluations
|
1640
|
+
return None
|
1429
1641
|
|
1430
1642
|
|
1431
1643
|
@pg.functor()
|
@@ -1578,7 +1790,7 @@ class Summary(pg.Object):
|
|
1578
1790
|
if e is None:
|
1579
1791
|
s.write('<span style="color: gray">N/A<span>')
|
1580
1792
|
else:
|
1581
|
-
s.write(e.
|
1793
|
+
s.write(e.summary_card())
|
1582
1794
|
s.write('</td>')
|
1583
1795
|
s.write('</tr>')
|
1584
1796
|
s.write('</table>')
|
@@ -1653,13 +1865,22 @@ class Summary(pg.Object):
|
|
1653
1865
|
s.write('<html><body>')
|
1654
1866
|
for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
|
1655
1867
|
table_id = task.__name__.lower()
|
1868
|
+
evaluations = self.select(task=task).evaluations
|
1869
|
+
table = Summary.Table.from_evaluations(evaluations, pivot_field)
|
1656
1870
|
s.write('<div>')
|
1657
|
-
s.write(
|
1658
|
-
|
1659
|
-
|
1660
|
-
table = Summary.Table.from_evaluations(
|
1661
|
-
self.select(task=task).evaluations, pivot_field
|
1871
|
+
s.write(
|
1872
|
+
f'<a id="{table_id}" href="#{table_id}">'
|
1873
|
+
f'<h2>{task.__name__}</h2></a>'
|
1662
1874
|
)
|
1875
|
+
|
1876
|
+
# Allow users to plugin visualization code (e.g. matplot) in the summary
|
1877
|
+
# page.
|
1878
|
+
visual_part = task.visualize(evaluations)
|
1879
|
+
if visual_part:
|
1880
|
+
s.write(visual_part)
|
1881
|
+
|
1882
|
+
s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
|
1883
|
+
s.write('<hr/>')
|
1663
1884
|
s.write(table.html())
|
1664
1885
|
s.write('</div>')
|
1665
1886
|
s.write('</body></html>')
|
@@ -1685,6 +1906,7 @@ class Summary(pg.Object):
|
|
1685
1906
|
experiment=entry,
|
1686
1907
|
dir=entry.dir,
|
1687
1908
|
metrics=entry.result.metrics if entry.result else None,
|
1909
|
+
usage=entry.result.usage if entry.result else None,
|
1688
1910
|
)
|
1689
1911
|
)
|
1690
1912
|
task_results[task.__name__] = results
|
@@ -1833,6 +2055,21 @@ class Summary(pg.Object):
|
|
1833
2055
|
return result.join()
|
1834
2056
|
|
1835
2057
|
|
2058
|
+
def _format_error(error: Exception):
|
2059
|
+
"""Formats an error into a string."""
|
2060
|
+
return (f'({error.__class__.__name__}) '
|
2061
|
+
+ lf.text_formatting.decolored(str(error)))
|
2062
|
+
|
2063
|
+
|
2064
|
+
def _error_key(error: Exception) -> str:
|
2065
|
+
"""Returns the key for an error."""
|
2066
|
+
error_names = []
|
2067
|
+
while error is not None:
|
2068
|
+
error_names.append(error.__class__.__name__)
|
2069
|
+
error = getattr(error, 'cause', None)
|
2070
|
+
return '.'.join(error_names)
|
2071
|
+
|
2072
|
+
|
1836
2073
|
def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
|
1837
2074
|
"""Formats prompt in HTML."""
|
1838
2075
|
if type(value) is lf.Template: # pylint: disable=unidiomatic-typecheck
|