langfun 0.0.2.dev20240429__tar.gz → 0.0.2.dev20240430__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/PKG-INFO +1 -1
  2. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/eval/base.py +310 -73
  3. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/eval/base_test.py +96 -45
  4. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/eval/matching.py +22 -21
  5. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/eval/matching_test.py +23 -2
  6. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/eval/scoring.py +4 -4
  7. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/eval/scoring_test.py +19 -2
  8. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/openai.py +1 -1
  9. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/openai_test.py +2 -1
  10. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun.egg-info/PKG-INFO +1 -1
  11. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/LICENSE +0 -0
  12. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/README.md +0 -0
  13. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/__init__.py +0 -0
  14. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/__init__.py +0 -0
  15. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/__init__.py +0 -0
  16. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/__init__.py +0 -0
  17. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/correction.py +0 -0
  18. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/correction_test.py +0 -0
  19. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/errors.py +0 -0
  20. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/errors_test.py +0 -0
  21. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/execution.py +0 -0
  22. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/execution_test.py +0 -0
  23. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/generation.py +0 -0
  24. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/generation_test.py +0 -0
  25. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/parsing.py +0 -0
  26. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/parsing_test.py +0 -0
  27. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/permissions.py +0 -0
  28. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/coding/python/permissions_test.py +0 -0
  29. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/component.py +0 -0
  30. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/component_test.py +0 -0
  31. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/concurrent.py +0 -0
  32. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/concurrent_test.py +0 -0
  33. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/console.py +0 -0
  34. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/console_test.py +0 -0
  35. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/eval/__init__.py +0 -0
  36. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/langfunc.py +0 -0
  37. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/langfunc_test.py +0 -0
  38. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/language_model.py +0 -0
  39. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/language_model_test.py +0 -0
  40. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/__init__.py +0 -0
  41. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/anthropic.py +0 -0
  42. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/anthropic_test.py +0 -0
  43. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/cache/__init__.py +0 -0
  44. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/cache/base.py +0 -0
  45. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/cache/in_memory.py +0 -0
  46. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/cache/in_memory_test.py +0 -0
  47. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/fake.py +0 -0
  48. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/fake_test.py +0 -0
  49. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/google_genai.py +0 -0
  50. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/google_genai_test.py +0 -0
  51. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/groq.py +0 -0
  52. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/groq_test.py +0 -0
  53. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/llama_cpp.py +0 -0
  54. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/llms/llama_cpp_test.py +0 -0
  55. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/memories/__init__.py +0 -0
  56. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/memories/conversation_history.py +0 -0
  57. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/memories/conversation_history_test.py +0 -0
  58. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/memory.py +0 -0
  59. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/message.py +0 -0
  60. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/message_test.py +0 -0
  61. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/modalities/__init__.py +0 -0
  62. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/modalities/image.py +0 -0
  63. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/modalities/image_test.py +0 -0
  64. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/modalities/mime.py +0 -0
  65. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/modalities/mime_test.py +0 -0
  66. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/modalities/video.py +0 -0
  67. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/modalities/video_test.py +0 -0
  68. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/modality.py +0 -0
  69. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/modality_test.py +0 -0
  70. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/natural_language.py +0 -0
  71. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/natural_language_test.py +0 -0
  72. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/sampling.py +0 -0
  73. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/sampling_test.py +0 -0
  74. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/__init__.py +0 -0
  75. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/completion.py +0 -0
  76. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/completion_test.py +0 -0
  77. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/description.py +0 -0
  78. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/description_test.py +0 -0
  79. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/function_generation.py +0 -0
  80. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/function_generation_test.py +0 -0
  81. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/mapping.py +0 -0
  82. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/mapping_test.py +0 -0
  83. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/parsing.py +0 -0
  84. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/parsing_test.py +0 -0
  85. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/prompting.py +0 -0
  86. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/prompting_test.py +0 -0
  87. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/schema.py +0 -0
  88. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/schema_generation.py +0 -0
  89. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/schema_generation_test.py +0 -0
  90. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/schema_test.py +0 -0
  91. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/scoring.py +0 -0
  92. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/structured/scoring_test.py +0 -0
  93. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/subscription.py +0 -0
  94. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/subscription_test.py +0 -0
  95. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/template.py +0 -0
  96. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/template_test.py +0 -0
  97. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/templates/__init__.py +0 -0
  98. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/templates/completion.py +0 -0
  99. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/templates/completion_test.py +0 -0
  100. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/templates/conversation.py +0 -0
  101. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/templates/conversation_test.py +0 -0
  102. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/templates/demonstration.py +0 -0
  103. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/templates/demonstration_test.py +0 -0
  104. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/templates/selfplay.py +0 -0
  105. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/templates/selfplay_test.py +0 -0
  106. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/text_formatting.py +0 -0
  107. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun/core/text_formatting_test.py +0 -0
  108. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun.egg-info/SOURCES.txt +0 -0
  109. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun.egg-info/dependency_links.txt +0 -0
  110. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun.egg-info/requires.txt +0 -0
  111. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/langfun.egg-info/top_level.txt +0 -0
  112. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/setup.cfg +0 -0
  113. {langfun-0.0.2.dev20240429 → langfun-0.0.2.dev20240430}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: langfun
3
- Version: 0.0.2.dev20240429
3
+ Version: 0.0.2.dev20240430
4
4
  Summary: Langfun: Language as Functions.
5
5
  Home-page: https://github.com/google/langfun
6
6
  Author: Langfun Authors
@@ -18,6 +18,7 @@ import collections
18
18
  import dataclasses
19
19
  import functools
20
20
  import hashlib
21
+ import html
21
22
  import inspect
22
23
  import io
23
24
  import os
@@ -40,7 +41,8 @@ class Evaluable(lf.Component):
40
41
 
41
42
  EXPERIMENT_JSON = 'experiment.json'
42
43
  RESULT_JSON = 'result.json'
43
- FAILURES_JSON = 'failures.json'
44
+ OOP_FAILURES_JSON = 'oop_failures.json'
45
+ NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
44
46
  INDEX_HTML = 'index.html'
45
47
  SUMMARY_HTML = 'summary.html'
46
48
 
@@ -358,7 +360,7 @@ class Evaluable(lf.Component):
358
360
  color='yellow')
359
361
 
360
362
  for node in self.nonleaf_nodes:
361
- node._result = {c.id: c.result for c in node.children} # pylint: disable=protected-access
363
+ node._result = {c.id: c.result for c in node.leaf_nodes} # pylint: disable=protected-access
362
364
  if should_save:
363
365
  node.save(result=False, report=False)
364
366
 
@@ -540,13 +542,13 @@ class Evaluable(lf.Component):
540
542
  f'<div style="color: {text_color}; white-space: pre-wrap;'
541
543
  'padding: 10px; border: 1px solid; margin-top: 10px">'
542
544
  )
543
- s.write(m.get('formatted_text', m.text))
545
+ s.write(html.escape(m.get('formatted_text', m.text)))
544
546
  if m.result is not None:
545
547
  s.write(
546
548
  '<div style="color: magenta; white-space: pre-wrap;'
547
549
  'padding: 10px; border: 1px solid; margin: 10px">'
548
550
  )
549
- s.write(pg.format(m.result))
551
+ s.write(html.escape(pg.format(m.result)))
550
552
  s.write('</div>')
551
553
  if 'usage' in m.metadata:
552
554
  s.write(
@@ -753,10 +755,12 @@ class Evaluation(Evaluable):
753
755
 
754
756
  # Constants.
755
757
  CACHE_JSON = 'cache.json'
756
- FAILURES_HTML = 'failures.html'
758
+ OOP_FAILURES_HTML = 'oop_failures.html'
759
+ NON_OOP_FAILURES_HTML = 'non_oop_failures.html'
757
760
 
758
761
  @functools.cached_property
759
762
  def hash(self) -> str:
763
+ """Returns the semantic-based hash of the evaluation."""
760
764
  if self.is_deterministic:
761
765
  identity = pg.format(self._identifiers(), compact=True)
762
766
  else:
@@ -805,6 +809,10 @@ class Evaluation(Evaluable):
805
809
  """Returns the complete rate."""
806
810
  return self.num_completed / self.num_examples
807
811
 
812
+ #
813
+ # Properties on failures.
814
+ #
815
+
808
816
  @property
809
817
  def failures(self) -> list[tuple[Any, Exception]]:
810
818
  """Returns the failed examples and their errors."""
@@ -815,6 +823,15 @@ class Evaluation(Evaluable):
815
823
  """Returns the number of failed examples."""
816
824
  return len(self.failures)
817
825
 
826
+ @functools.cached_property
827
+ def failure_breakdown(self) -> dict[str, int]:
828
+ """Returns the breakdown of failures."""
829
+ breakdown = collections.defaultdict(int)
830
+ for _, error in self.failures:
831
+ breakdown[_error_key(error)] += 1
832
+ sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
833
+ return pg.Dict({x[0]: x[1] for x in sorted_items})
834
+
818
835
  @property
819
836
  def failure_rate(self) -> float:
820
837
  """Returns the failure rate in range [0, 1]."""
@@ -822,6 +839,46 @@ class Evaluation(Evaluable):
822
839
  return 0.0
823
840
  return self.num_failures / self.num_completed
824
841
 
842
+ @functools.cached_property
843
+ def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
844
+ """Returns the OOP failures."""
845
+ return [item for item in self.failures
846
+ if isinstance(item[1], lf_structured.MappingError)]
847
+
848
+ @property
849
+ def num_oop_failures(self) -> int:
850
+ """Returns the number of OOP failures."""
851
+ return len(self.oop_failures)
852
+
853
+ @property
854
+ def oop_failure_rate(self) -> float:
855
+ """Returns the OOP failure rate in range [0, 1]."""
856
+ if self.num_completed == 0:
857
+ return 0.0
858
+ return self.num_oop_failures / self.num_completed
859
+
860
+ @functools.cached_property
861
+ def non_oop_failures(self) -> list[tuple[Any, Exception]]:
862
+ """Returns the non-OOP failures."""
863
+ return [item for item in self.failures
864
+ if not isinstance(item[1], lf_structured.MappingError)]
865
+
866
+ @property
867
+ def num_non_oop_failures(self) -> int:
868
+ """Returns the number of non-OOP failures."""
869
+ return len(self.non_oop_failures)
870
+
871
+ @property
872
+ def non_oop_failure_rate(self) -> float:
873
+ """Returns the non-OOP failure rate in range [0, 1]."""
874
+ if self.num_completed == 0:
875
+ return 0.0
876
+ return self.num_non_oop_failures / self.num_completed
877
+
878
+ #
879
+ # Properties on usage.
880
+ #
881
+
825
882
  @property
826
883
  def has_usage(self) -> bool:
827
884
  """Returns True if token usage is enabled."""
@@ -976,13 +1033,22 @@ class Evaluation(Evaluable):
976
1033
  self._total_prompt_tokens = 0
977
1034
  self._total_completion_tokens = 0
978
1035
  self._num_usages = 0
1036
+ self.__dict__.pop('oop_failures', None)
1037
+ self.__dict__.pop('non_oop_failures', None)
979
1038
 
980
1039
  @property
981
- def failures_link(self) -> str | None:
982
- """Returns the link to the failures page."""
1040
+ def oop_failures_link(self) -> str | None:
1041
+ """Returns the link to the OOP failures page."""
983
1042
  if self.dir is None:
984
1043
  return None
985
- return self.link(os.path.join(self.dir, Evaluation.FAILURES_HTML))
1044
+ return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))
1045
+
1046
+ @property
1047
+ def non_oop_failures_link(self) -> str | None:
1048
+ """Returns the link to the non-OOP failures page."""
1049
+ if self.dir is None:
1050
+ return None
1051
+ return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))
986
1052
 
987
1053
  def _dryrun(
988
1054
  self,
@@ -1011,23 +1077,34 @@ class Evaluation(Evaluable):
1011
1077
  color='green',
1012
1078
  )
1013
1079
 
1014
- with lf.use_settings(debug=debug):
1015
- output_message = copy.process(example, **(self.additional_args or {}))
1016
- if self.schema is None:
1017
- output = output_message.text
1018
- else:
1019
- output = output_message.result
1080
+ error, output_message = None, None
1020
1081
 
1021
- if verbose:
1082
+ try:
1083
+ with lf.use_settings(debug=debug):
1084
+ output_message = copy.process(example, **(self.additional_args or {}))
1085
+ if self.schema is None:
1086
+ output = output_message.text
1087
+ else:
1088
+ output = output_message.result
1089
+
1090
+ if verbose:
1091
+ lf.console.write('')
1092
+ lf.console.write(
1093
+ str(output),
1094
+ title='OUTPUT',
1095
+ color='blue',
1096
+ )
1097
+ except lf_structured.MappingError as e:
1022
1098
  lf.console.write('')
1023
1099
  lf.console.write(
1024
- str(output),
1025
- title='OUTPUT',
1026
- color='blue',
1100
+ str(e),
1101
+ title='ERROR',
1102
+ color='red',
1027
1103
  )
1104
+ error = e
1028
1105
 
1029
- copy.audit(example, output_message, None, dryrun=True)
1030
- result = copy.summarize()
1106
+ copy.audit(example, output_message, error, dryrun=True)
1107
+ result = copy.finalize()
1031
1108
 
1032
1109
  if verbose:
1033
1110
  lf.console.write('')
@@ -1087,7 +1164,7 @@ class Evaluation(Evaluable):
1087
1164
  self.cache.save()
1088
1165
 
1089
1166
  # Summarize result.
1090
- self._result = self.summarize()
1167
+ self._result = self.finalize()
1091
1168
  if verbose:
1092
1169
  lf.console.write(
1093
1170
  str(self.result),
@@ -1143,13 +1220,13 @@ class Evaluation(Evaluable):
1143
1220
  def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
1144
1221
  return {
1145
1222
  'Model': self.lm.model_id,
1146
- 'Succeeded': f'%.{self.report_precision}f%% (%d/%d)' % (
1147
- progress.success_rate * 100,
1223
+ 'Succeeded': '%s (%d/%d)' % (
1224
+ self._format_rate(progress.success_rate),
1148
1225
  progress.succeeded,
1149
1226
  progress.completed,
1150
1227
  ),
1151
- 'Failed': f'%.{self.report_precision}f%% (%d/%d)' % (
1152
- progress.failure_rate * 100,
1228
+ 'Failed': '%s (%d/%d)' % (
1229
+ self._format_rate(progress.failure_rate),
1153
1230
  progress.failed,
1154
1231
  progress.completed,
1155
1232
  ),
@@ -1159,21 +1236,20 @@ class Evaluation(Evaluable):
1159
1236
  assert self.result is not None
1160
1237
  m = self.result.metrics
1161
1238
  return (
1162
- f'COMPLETED(%s): Successes=%.{self.report_precision}f%% (%d/%d)'
1163
- f' Failures=%.{self.report_precision}f%% (%d/%d)'
1239
+ 'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
1164
1240
  % (
1165
1241
  run_status,
1166
- (1 - m.failure_rate) * 100,
1242
+ self._format_rate(1 - m.failure_rate),
1167
1243
  m.total - m.failures,
1168
1244
  m.total,
1169
- m.failure_rate * 100,
1245
+ self._format_rate(m.failure_rate),
1170
1246
  m.failures,
1171
1247
  m.total,
1172
1248
  )
1173
1249
  )
1174
1250
 
1175
- def summarize(self) -> pg.Dict:
1176
- """Summarizes the evaluation result."""
1251
+ def finalize(self) -> pg.Dict:
1252
+ """Finalizes the evaluation result."""
1177
1253
  if self.cache:
1178
1254
  cache_stats = dict(
1179
1255
  use_cache=True,
@@ -1210,12 +1286,18 @@ class Evaluation(Evaluable):
1210
1286
  total=self.num_completed,
1211
1287
  failures=self.num_failures,
1212
1288
  failure_rate=self.failure_rate,
1289
+ oop_failures=self.num_oop_failures,
1290
+ oop_failure_rate=self.oop_failure_rate,
1291
+ non_oop_failures=self.num_non_oop_failures,
1292
+ non_oop_failure_rate=self.non_oop_failure_rate,
1293
+ failure_breakdown=self.failure_breakdown,
1213
1294
  ),
1214
1295
  usage=usage,
1215
1296
  )
1216
1297
  return result
1217
1298
 
1218
- def summarize_html(self) -> str:
1299
+ def summary_card(self) -> str:
1300
+ """Returns summary card in HTML."""
1219
1301
  s = io.StringIO()
1220
1302
  definition = _html_repr(self, compact=False, escape=True)
1221
1303
  s.write('<div><table><tr><td>')
@@ -1230,18 +1312,19 @@ class Evaluation(Evaluable):
1230
1312
  s.write(
1231
1313
  f'<a target="_blank" title="{definition}" '
1232
1314
  f'href="{self.index_link}">{self.hash}</a>'
1315
+ f' &nbsp;[<a href="{self.link(self.dir)}">dir</a>]'
1233
1316
  '</td></tr><tr><td>'
1234
1317
  )
1235
- self._render_metric(s)
1318
+ self._render_summary_metrics(s)
1236
1319
 
1237
1320
  # Summarize average usage.
1238
1321
  if self.result.usage is not None:
1239
- self._render_usage(s)
1322
+ self._render_summary_usage(s)
1240
1323
 
1241
1324
  s.write('</td></tr></table></div>')
1242
1325
  return s.getvalue()
1243
1326
 
1244
- def _render_usage(self, s: io.StringIO) -> None:
1327
+ def _render_summary_usage(self, s: io.StringIO) -> None:
1245
1328
  """Renders usage in HTML."""
1246
1329
  usage = self.result.usage
1247
1330
  total = usage.total_prompt_tokens + usage.total_completion_tokens
@@ -1255,20 +1338,66 @@ class Evaluation(Evaluable):
1255
1338
  f'" style="color:gray">({total} tokens)</a>'
1256
1339
  )
1257
1340
 
1258
- def _render_metric(self, s: io.StringIO) -> None:
1341
+ def _render_summary_metrics(self, s: io.StringIO) -> None:
1259
1342
  """Renders metrics in HTML."""
1260
1343
  assert self.result is not None
1261
1344
  m = self.result.metrics
1345
+
1346
+ # OOP failures.
1347
+ oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
1348
+ if m.oop_failures:
1349
+ oop_failure_title += '&#013;'
1350
+ for name, count in m.failure_breakdown.items():
1351
+ if name.startswith('MappingError'):
1352
+ oop_failure_title += '&#013;%s: %s (%d/%d)' % (
1353
+ name.removeprefix('MappingError.'),
1354
+ self._format_rate(count / m.total),
1355
+ count,
1356
+ m.total,
1357
+ )
1358
+
1359
+ extra_style = ''
1360
+ if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
1361
+ extra_style = ';font-weight:bold'
1262
1362
  s.write(
1263
- '<a title="Failures (%d/%d)" href="%s" style="color:red">%s</a>'
1363
+ '<a title="%s" href="%s" style="color:magenta%s">%s</a>'
1264
1364
  % (
1265
- m.failures,
1266
- m.total,
1267
- self.failures_link,
1268
- f'%.{self.report_precision}f%% ' % (m.failure_rate * 100),
1365
+ oop_failure_title,
1366
+ self.oop_failures_link,
1367
+ extra_style,
1368
+ self._format_rate(m.oop_failure_rate),
1369
+ )
1370
+ )
1371
+ s.write(' | ')
1372
+
1373
+ # Non-OOP failures.
1374
+ non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
1375
+ if m.non_oop_failures:
1376
+ non_oop_failure_title += '&#013;'
1377
+ for name, count in m.failure_breakdown.items():
1378
+ if not name.startswith('MappingError'):
1379
+ non_oop_failure_title += '&#013;%s: %s (%d/%d)' % (
1380
+ name,
1381
+ self._format_rate(count / m.total),
1382
+ count,
1383
+ m.total,
1384
+ )
1385
+
1386
+ extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
1387
+ s.write(
1388
+ '<a title="%s" href="%s" style="color:red%s">%s</a>'
1389
+ % (
1390
+ non_oop_failure_title,
1391
+ self.non_oop_failures_link,
1392
+ extra_style,
1393
+ self._format_rate(m.non_oop_failure_rate),
1269
1394
  )
1270
1395
  )
1271
1396
 
1397
+ def _format_rate(self, rate: float) -> str:
1398
+ """Formats a rate."""
1399
+ return f'%.{self.report_precision}f%% ' % (rate * 100)
1400
+
1272
1401
  def audit(
1273
1402
  self,
1274
1403
  example: Any,
@@ -1287,7 +1416,13 @@ class Evaluation(Evaluable):
1287
1416
  dryrun: Whether or not audition takes place during dryrun.
1288
1417
  """
1289
1418
  if error is not None:
1290
- self._failures.append((example, str(error)))
1419
+ self._failures.append((example, error))
1420
+
1421
+ # Invalidate the cached failure properties.
1422
+ self.__dict__.pop('oop_failures', None)
1423
+ self.__dict__.pop('non_oop_failures', None)
1424
+ self.__dict__.pop('failure_breakdown', None)
1425
+
1291
1426
  if isinstance(error, lf_structured.MappingError):
1292
1427
  message = error.lm_response
1293
1428
  else:
@@ -1333,16 +1468,26 @@ class Evaluation(Evaluable):
1333
1468
  # Save failures.
1334
1469
  pg.save(
1335
1470
  [
1336
- pg.Dict(
1337
- input=input, error=lf.text_formatting.decolored(str(error))
1338
- )
1339
- for input, error in self.failures
1471
+ pg.Dict(input=input, error=_format_error(error))
1472
+ for input, error in self.oop_failures
1473
+ ],
1474
+ os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
1475
+ )
1476
+ pg.save(
1477
+ self._html([self._render_result, self._render_oop_failures]),
1478
+ os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
1479
+ file_format='txt',
1480
+ )
1481
+ pg.save(
1482
+ [
1483
+ pg.Dict(input=input, error=_format_error(error))
1484
+ for input, error in self.non_oop_failures
1340
1485
  ],
1341
- os.path.join(self.dir, Evaluation.FAILURES_JSON),
1486
+ os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
1342
1487
  )
1343
1488
  pg.save(
1344
- self._html([self._render_result, self._render_failures]),
1345
- os.path.join(self.dir, Evaluation.FAILURES_HTML),
1489
+ self._html([self._render_result, self._render_non_oop_failures]),
1490
+ os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
1346
1491
  file_format='txt',
1347
1492
  )
1348
1493
 
@@ -1357,7 +1502,8 @@ class Evaluation(Evaluable):
1357
1502
  )
1358
1503
  if self.result.usage is not None:
1359
1504
  s.write('<td>Usage</td>')
1360
- s.write('<td>Failures</td>')
1505
+ s.write('<td>OOP Failures</td>')
1506
+ s.write('<td>Non-OOP Failures</td>')
1361
1507
 
1362
1508
  def _render_result_row(self, s: io.StringIO) -> None:
1363
1509
  s.write(
@@ -1385,16 +1531,29 @@ class Evaluation(Evaluable):
1385
1531
  # Usage.
1386
1532
  if self.result.usage is not None:
1387
1533
  s.write('<td>')
1388
- self._render_usage(s)
1534
+ self._render_summary_usage(s)
1389
1535
  s.write('</td>')
1390
1536
 
1391
- # Failures.
1537
+ # OOP failures.
1538
+ s.write(
1539
+ '<td><span style="color:magenta">%s</span>%s</td>'
1540
+ % (
1541
+ self._format_rate(self.oop_failure_rate),
1542
+ '<a href="%s">(%d/%d)</a>'
1543
+ % (self.oop_failures_link,
1544
+ self.num_oop_failures,
1545
+ self.num_completed),
1546
+ )
1547
+ )
1548
+ # Non-OOP failures.
1392
1549
  s.write(
1393
- '<td><span style="color:orange">%s</span>%s</td>'
1550
+ '<td><span style="color:red">%s</span>%s</td>'
1394
1551
  % (
1395
- f'%.{self.report_precision}f%%' % (self.failure_rate * 100),
1552
+ self._format_rate(self.non_oop_failure_rate),
1396
1553
  '<a href="%s">(%d/%d)</a>'
1397
- % (self.failures_link, self.num_failures, self.num_completed),
1554
+ % (self.non_oop_failures_link,
1555
+ self.num_non_oop_failures,
1556
+ self.num_completed),
1398
1557
  )
1399
1558
  )
1400
1559
 
@@ -1408,24 +1567,77 @@ class Evaluation(Evaluable):
1408
1567
  else:
1409
1568
  return 'cyan'
1410
1569
 
1411
- def _render_failures(self, s: io.StringIO) -> None:
1570
+ def _render_oop_failures(self, s: io.StringIO) -> None:
1571
+ self._render_failures(s, '^MappingError.*', error_color='magenta')
1572
+
1573
+ def _render_non_oop_failures(self, s: io.StringIO) -> None:
1574
+ self._render_failures(s, '^(?!MappingError).*', error_color='red')
1575
+
1576
+ def _render_failures(
1577
+ self, s: io.StringIO, error_regex: str, error_color: str) -> None:
1412
1578
  """Formats the failed cases into html."""
1579
+ # Failure summary.
1413
1580
  s.write(
1414
- '<h2> Failed Cases </h2>'
1581
+ '<h2> Error Summary </h2>'
1415
1582
  '<div style="white-space:pre">\n'
1416
1583
  '<table style="border:1px solid">'
1417
- '<tr class="header"><td>No.</td><td>Input</td><td>Error</td></tr>'
1584
+ '<tr class="header"><td>Error type</td><td>Stats</td></tr>'
1418
1585
  )
1586
+ error_regex = re.compile(error_regex)
1587
+ if self.result.metrics.failure_breakdown:
1588
+ for name, count in self.result.metrics.failure_breakdown.items():
1589
+ if not error_regex.match(name):
1590
+ continue
1591
+
1592
+ link = f'<a href="#{name}">{name}</a>'
1593
+ error_rate = self._format_rate(count / self.result.metrics.total)
1594
+ stats = (f'<span style="color:{error_color}">{error_rate} '
1595
+ f'({count}/{self.result.metrics.total})</span>')
1596
+ s.write(f'<tr><td>{link}</td><td>{stats})</td></tr>')
1597
+ s.write(
1598
+ '</table></div>'
1599
+ '<h2> Failed Cases </h2>'
1600
+ '<div style="white-space:pre">'
1601
+ )
1602
+ # Failure details by error type.
1603
+ failures_by_error = collections.defaultdict(list)
1604
+ for example, error in self.failures:
1605
+ error_name = _error_key(error)
1606
+ if error_regex.match(error_name):
1607
+ failures_by_error[error_name].append((example, error))
1608
+
1609
+ for error_key, failures in failures_by_error.items():
1610
+ s.write(
1611
+ f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
1612
+ f'(count={len(failures)})</h3>'
1613
+ '<table style="border:1px solid">'
1614
+ '<tr class="header"><td>No.</td><td>Input</td>'
1615
+ '<td>LM invocation</td><td>Error</td></tr>'
1616
+ )
1617
+ for i, (example, error) in enumerate(failures):
1618
+ lm_response = None
1619
+ if isinstance(error, lf.structured.MappingError):
1620
+ lm_response = error.lm_response
1621
+ error = error.cause
1622
+
1623
+ bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
1624
+ s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
1625
+ s.write('<td style="color:green;white-space:pre-wrap">')
1626
+ s.write(pg.format(example, verbose=False))
1627
+ s.write('</td><td>')
1628
+ if lm_response is not None:
1629
+ self._render_message(lm_response, s)
1630
+ s.write(f'</td><td style="color:{error_color};white-space:pre">')
1631
+ s.write(_format_error(error))
1632
+ s.write('</td></tr>')
1633
+ s.write('</table>')
1634
+ s.write('</div>')
1419
1635
 
1420
- for i, (example, error) in enumerate(self.failures):
1421
- bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
1422
- s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
1423
- input_str = pg.format(example, verbose=False)
1424
- s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
1425
- error_str = lf.text_formatting.decolored(str(error))
1426
- s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
1427
- s.write('</tr>')
1428
- s.write('</table></div>')
1636
+ @classmethod
1637
+ def visualize(cls, evaluations: list['Evaluation']) -> str | None:
1638
+ """Visualize a list of evaluations of this task in HTML."""
1639
+ del evaluations
1640
+ return None
1429
1641
 
1430
1642
 
1431
1643
  @pg.functor()
@@ -1578,7 +1790,7 @@ class Summary(pg.Object):
1578
1790
  if e is None:
1579
1791
  s.write('<span style="color: gray">N/A<span>')
1580
1792
  else:
1581
- s.write(e.summarize_html())
1793
+ s.write(e.summary_card())
1582
1794
  s.write('</td>')
1583
1795
  s.write('</tr>')
1584
1796
  s.write('</table>')
@@ -1653,13 +1865,22 @@ class Summary(pg.Object):
1653
1865
  s.write('<html><body>')
1654
1866
  for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
1655
1867
  table_id = task.__name__.lower()
1868
+ evaluations = self.select(task=task).evaluations
1869
+ table = Summary.Table.from_evaluations(evaluations, pivot_field)
1656
1870
  s.write('<div>')
1657
- s.write(f'<a id="{table_id}"')
1658
- s.write(f'<h2><a href="#{table_id}">{task.__name__}</a></h2>')
1659
- s.write('</a>')
1660
- table = Summary.Table.from_evaluations(
1661
- self.select(task=task).evaluations, pivot_field
1871
+ s.write(
1872
+ f'<a id="{table_id}" href="#{table_id}">'
1873
+ f'<h2>{task.__name__}</h2></a>'
1662
1874
  )
1875
+
1876
+ # Allow users to plugin visualization code (e.g. matplot) in the summary
1877
+ # page.
1878
+ visual_part = task.visualize(evaluations)
1879
+ if visual_part:
1880
+ s.write(visual_part)
1881
+
1882
+ s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
1883
+ s.write('<hr/>')
1663
1884
  s.write(table.html())
1664
1885
  s.write('</div>')
1665
1886
  s.write('</body></html>')
@@ -1685,6 +1906,7 @@ class Summary(pg.Object):
1685
1906
  experiment=entry,
1686
1907
  dir=entry.dir,
1687
1908
  metrics=entry.result.metrics if entry.result else None,
1909
+ usage=entry.result.usage if entry.result else None,
1688
1910
  )
1689
1911
  )
1690
1912
  task_results[task.__name__] = results
@@ -1833,6 +2055,21 @@ class Summary(pg.Object):
1833
2055
  return result.join()
1834
2056
 
1835
2057
 
2058
+ def _format_error(error: Exception):
2059
+ """Formats an error into a string."""
2060
+ return (f'({error.__class__.__name__}) '
2061
+ + lf.text_formatting.decolored(str(error)))
2062
+
2063
+
2064
+ def _error_key(error: Exception) -> str:
2065
+ """Returns the key for an error."""
2066
+ error_names = []
2067
+ while error is not None:
2068
+ error_names.append(error.__class__.__name__)
2069
+ error = getattr(error, 'cause', None)
2070
+ return '.'.join(error_names)
2071
+
2072
+
1836
2073
  def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
1837
2074
  """Formats prompt in HTML."""
1838
2075
  if type(value) is lf.Template: # pylint: disable=unidiomatic-typecheck