langfun 0.0.2.dev20240429__py3-none-any.whl → 0.1.2.dev202501150804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. langfun/__init__.py +20 -2
  2. langfun/core/__init__.py +16 -5
  3. langfun/core/agentic/__init__.py +30 -0
  4. langfun/core/agentic/action.py +854 -0
  5. langfun/core/agentic/action_eval.py +150 -0
  6. langfun/core/agentic/action_eval_test.py +109 -0
  7. langfun/core/agentic/action_test.py +136 -0
  8. langfun/core/coding/python/__init__.py +5 -11
  9. langfun/core/coding/python/correction.py +37 -21
  10. langfun/core/coding/python/correction_test.py +29 -3
  11. langfun/core/coding/python/execution.py +40 -216
  12. langfun/core/coding/python/execution_test.py +29 -89
  13. langfun/core/coding/python/generation.py +21 -11
  14. langfun/core/coding/python/generation_test.py +2 -2
  15. langfun/core/coding/python/parsing.py +108 -193
  16. langfun/core/coding/python/parsing_test.py +2 -105
  17. langfun/core/component.py +63 -2
  18. langfun/core/component_test.py +53 -0
  19. langfun/core/concurrent.py +414 -117
  20. langfun/core/concurrent_test.py +111 -24
  21. langfun/core/console.py +17 -5
  22. langfun/core/console_test.py +17 -0
  23. langfun/core/eval/__init__.py +16 -1
  24. langfun/core/eval/base.py +622 -174
  25. langfun/core/eval/base_test.py +200 -54
  26. langfun/core/eval/matching.py +63 -76
  27. langfun/core/eval/matching_test.py +17 -8
  28. langfun/core/eval/patching.py +130 -0
  29. langfun/core/eval/patching_test.py +170 -0
  30. langfun/core/eval/scoring.py +26 -26
  31. langfun/core/eval/scoring_test.py +19 -2
  32. langfun/core/eval/v2/__init__.py +42 -0
  33. langfun/core/eval/v2/checkpointing.py +380 -0
  34. langfun/core/eval/v2/checkpointing_test.py +228 -0
  35. langfun/core/eval/v2/eval_test_helper.py +136 -0
  36. langfun/core/eval/v2/evaluation.py +725 -0
  37. langfun/core/eval/v2/evaluation_test.py +180 -0
  38. langfun/core/eval/v2/example.py +305 -0
  39. langfun/core/eval/v2/example_test.py +128 -0
  40. langfun/core/eval/v2/experiment.py +1048 -0
  41. langfun/core/eval/v2/experiment_test.py +433 -0
  42. langfun/core/eval/v2/metric_values.py +156 -0
  43. langfun/core/eval/v2/metric_values_test.py +80 -0
  44. langfun/core/eval/v2/metrics.py +357 -0
  45. langfun/core/eval/v2/metrics_test.py +203 -0
  46. langfun/core/eval/v2/progress.py +348 -0
  47. langfun/core/eval/v2/progress_test.py +82 -0
  48. langfun/core/eval/v2/progress_tracking.py +210 -0
  49. langfun/core/eval/v2/progress_tracking_test.py +66 -0
  50. langfun/core/eval/v2/reporting.py +270 -0
  51. langfun/core/eval/v2/reporting_test.py +158 -0
  52. langfun/core/eval/v2/runners.py +488 -0
  53. langfun/core/eval/v2/runners_test.py +334 -0
  54. langfun/core/langfunc.py +4 -17
  55. langfun/core/langfunc_test.py +22 -6
  56. langfun/core/language_model.py +577 -39
  57. langfun/core/language_model_test.py +470 -56
  58. langfun/core/llms/__init__.py +87 -16
  59. langfun/core/llms/anthropic.py +312 -87
  60. langfun/core/llms/anthropic_test.py +71 -3
  61. langfun/core/llms/cache/base.py +21 -2
  62. langfun/core/llms/cache/in_memory.py +13 -0
  63. langfun/core/llms/cache/in_memory_test.py +53 -2
  64. langfun/core/llms/compositional.py +101 -0
  65. langfun/core/llms/compositional_test.py +73 -0
  66. langfun/core/llms/deepseek.py +117 -0
  67. langfun/core/llms/deepseek_test.py +61 -0
  68. langfun/core/llms/fake.py +11 -7
  69. langfun/core/llms/fake_test.py +14 -0
  70. langfun/core/llms/gemini.py +507 -0
  71. langfun/core/llms/gemini_test.py +195 -0
  72. langfun/core/llms/google_genai.py +62 -218
  73. langfun/core/llms/google_genai_test.py +9 -202
  74. langfun/core/llms/groq.py +160 -144
  75. langfun/core/llms/groq_test.py +31 -137
  76. langfun/core/llms/llama_cpp.py +15 -42
  77. langfun/core/llms/llama_cpp_test.py +4 -30
  78. langfun/core/llms/openai.py +395 -203
  79. langfun/core/llms/openai_compatible.py +179 -0
  80. langfun/core/llms/openai_compatible_test.py +495 -0
  81. langfun/core/llms/openai_test.py +30 -395
  82. langfun/core/llms/rest.py +113 -0
  83. langfun/core/llms/rest_test.py +111 -0
  84. langfun/core/llms/vertexai.py +192 -0
  85. langfun/core/llms/vertexai_test.py +52 -0
  86. langfun/core/logging.py +284 -0
  87. langfun/core/logging_test.py +125 -0
  88. langfun/core/message.py +319 -9
  89. langfun/core/message_test.py +190 -13
  90. langfun/core/modalities/__init__.py +6 -2
  91. langfun/core/modalities/audio.py +30 -0
  92. langfun/core/modalities/audio_test.py +63 -0
  93. langfun/core/modalities/image.py +39 -20
  94. langfun/core/modalities/image_test.py +52 -9
  95. langfun/core/modalities/mime.py +206 -29
  96. langfun/core/modalities/mime_test.py +90 -9
  97. langfun/core/modalities/ms_office.py +117 -0
  98. langfun/core/modalities/ms_office_test.py +389 -0
  99. langfun/core/modalities/pdf.py +22 -0
  100. langfun/core/modalities/pdf_test.py +57 -0
  101. langfun/core/modalities/video.py +9 -26
  102. langfun/core/modalities/video_test.py +3 -3
  103. langfun/core/modality.py +26 -3
  104. langfun/core/modality_test.py +2 -2
  105. langfun/core/sampling.py +11 -11
  106. langfun/core/structured/__init__.py +12 -16
  107. langfun/core/structured/completion.py +32 -5
  108. langfun/core/structured/completion_test.py +7 -6
  109. langfun/core/structured/description.py +2 -2
  110. langfun/core/structured/description_test.py +3 -3
  111. langfun/core/structured/function_generation.py +60 -27
  112. langfun/core/structured/function_generation_test.py +72 -2
  113. langfun/core/structured/mapping.py +97 -47
  114. langfun/core/structured/mapping_test.py +90 -2
  115. langfun/core/structured/parsing.py +33 -21
  116. langfun/core/structured/parsing_test.py +53 -9
  117. langfun/core/structured/querying.py +746 -0
  118. langfun/core/structured/{prompting_test.py → querying_test.py} +469 -51
  119. langfun/core/structured/schema.py +204 -97
  120. langfun/core/structured/schema_generation.py +1 -1
  121. langfun/core/structured/schema_test.py +130 -29
  122. langfun/core/structured/scoring.py +125 -19
  123. langfun/core/structured/scoring_test.py +30 -0
  124. langfun/core/structured/tokenization.py +64 -0
  125. langfun/core/structured/tokenization_test.py +48 -0
  126. langfun/core/template.py +115 -1
  127. langfun/core/template_test.py +71 -1
  128. langfun/core/templates/conversation.py +9 -0
  129. langfun/core/templates/conversation_test.py +4 -3
  130. langfun/core/templates/selfplay_test.py +10 -2
  131. langfun-0.1.2.dev202501150804.dist-info/METADATA +225 -0
  132. langfun-0.1.2.dev202501150804.dist-info/RECORD +153 -0
  133. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501150804.dist-info}/WHEEL +1 -1
  134. langfun/core/coding/python/errors.py +0 -108
  135. langfun/core/coding/python/errors_test.py +0 -99
  136. langfun/core/coding/python/permissions.py +0 -90
  137. langfun/core/coding/python/permissions_test.py +0 -86
  138. langfun/core/structured/prompting.py +0 -238
  139. langfun/core/text_formatting.py +0 -162
  140. langfun/core/text_formatting_test.py +0 -47
  141. langfun-0.0.2.dev20240429.dist-info/METADATA +0 -100
  142. langfun-0.0.2.dev20240429.dist-info/RECORD +0 -108
  143. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501150804.dist-info}/LICENSE +0 -0
  144. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501150804.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py CHANGED
@@ -24,10 +24,9 @@ import os
  import re
  import threading
  import time
+ import types
  from typing import Annotated, Any, Callable, Iterator, Literal, Optional, Sequence, Type, Union

- from absl import app
- from absl import flags
  import langfun.core as lf
  import langfun.core.coding as lf_coding
  from langfun.core.llms.cache import in_memory
@@ -40,7 +39,8 @@ class Evaluable(lf.Component):

  EXPERIMENT_JSON = 'experiment.json'
  RESULT_JSON = 'result.json'
- FAILURES_JSON = 'failures.json'
+ OOP_FAILURES_JSON = 'oop_failures.json'
+ NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
  INDEX_HTML = 'index.html'
  SUMMARY_HTML = 'summary.html'

@@ -215,6 +215,7 @@ class Evaluable(lf.Component):
  summary: bool = True,
  pivot_field: str = 'lm',
  from_root: bool = True,
+ timeout: int | None = None,
  **kwargs,
  ) -> Union['Summary', pg.Dict]:
  """Run the evaluation, which fills and returns the result."""
@@ -242,7 +243,7 @@ class Evaluable(lf.Component):
  ):
  if show_progress:
  lf.concurrent.ProgressBar.update(
- progress_bar, postfix='LOADING SAVED RESULTS...', color='yellow'
+ progress_bar, status='LOADING SAVED RESULTS...', color='yellow'
  )
  if self.try_load_result():
  run_status = 'CACHED'
@@ -265,13 +266,14 @@ class Evaluable(lf.Component):
  verbose=verbose,
  progress_bar=progress_bar,
  label=label,
+ timeout=timeout,
  **kwargs,
  )

  if should_save:
  if show_progress:
  lf.concurrent.ProgressBar.update(
- progress_bar, postfix='SAVING RESULTS...', color='yellow'
+ progress_bar, status='SAVING RESULTS...', color='yellow'
  )

  # Save evaluation results.
@@ -284,7 +286,7 @@ class Evaluable(lf.Component):
  if show_progress:
  lf.concurrent.ProgressBar.update(
  progress_bar,
- postfix=self._completion_status(run_status),
+ status=self._completion_status(run_status),
  color='green',
  )
  else:
@@ -340,7 +342,7 @@ class Evaluable(lf.Component):
  f'[#{leaf.index} - {leaf.node.id}]',
  total=leaf.node.num_examples if leaf.enabled else 0,
  color='cyan' if leaf.enabled else 'yellow',
- postfix=None if leaf.enabled else 'SKIPPED.')
+ status=None if leaf.enabled else 'SKIPPED.')

  # Run leaf groups in parallel.
  try:
@@ -354,17 +356,17 @@ class Evaluable(lf.Component):
  # Save results for non-leaf nodes.
  lf.concurrent.ProgressBar.update(
  overview_bar,
- postfix='SAVING RESULTS...',
+ status='SAVING RESULTS...',
  color='yellow')

  for node in self.nonleaf_nodes:
- node._result = {c.id: c.result for c in node.children} # pylint: disable=protected-access
+ node._result = {c.id: c.result for c in node.leaf_nodes} # pylint: disable=protected-access
  if should_save:
  node.save(result=False, report=False)

  if should_save and summary:
  lf.concurrent.ProgressBar.update(
- overview_bar, postfix='FINALIZING SUMMARY...'
+ overview_bar, status='FINALIZING SUMMARY...'
  )

  summary.save(os.path.join(self.root_dir, Evaluable.SUMMARY_HTML))
@@ -378,7 +380,7 @@ class Evaluable(lf.Component):
  # Signal all task completed by making the bar green.
  lf.concurrent.ProgressBar.update(
  overview_bar,
- postfix='COMPLETED',
+ status='COMPLETED',
  color='green')

  finally:
@@ -398,6 +400,7 @@ class Evaluable(lf.Component):
  verbose: bool,
  progress_bar: int | None,
  label: str | None,
+ timeout: int | None = None,
  **kwargs,
  ) -> None:
  """Run the evaluate and fill `self.result`. Subclass to implement."""
@@ -528,37 +531,14 @@ class Evaluable(lf.Component):
  self._render_message(self.dryrun_output, s)

  def _render_message(self, message: lf.Message, s: io.StringIO) -> None:
- for m in message.trace():
- if 'lm-input' in m.tags:
- text_color = 'green'
- elif 'lm-response' in m.tags:
- text_color = 'blue'
- else:
- text_color = 'black'
-
- s.write(
- f'<div style="color: {text_color}; white-space: pre-wrap;'
- 'padding: 10px; border: 1px solid; margin-top: 10px">'
- )
- s.write(m.get('formatted_text', m.text))
- if m.result is not None:
- s.write(
- '<div style="color: magenta; white-space: pre-wrap;'
- 'padding: 10px; border: 1px solid; margin: 10px">'
- )
- s.write(pg.format(m.result))
- s.write('</div>')
- if 'usage' in m.metadata:
- s.write(
- '<div style="background-color: #EEEEEE; color: black; '
- 'white-space: pre-wrap; padding: 10px; border: 0px solid; '
- 'margin: 10px">'
- f'prompt: {m.usage.prompt_tokens} tokens, '
- f'response: {m.usage.completion_tokens} tokens, '
- f'total: {m.usage.total_tokens} tokens'
- '</div>'
+ s.write(
+ message.to_html_str(
+ extra_flags=dict(
+ include_message_metadata=False,
+ source_tag=['lm-input', 'lm-response'],
+ )
  )
- s.write('</div>')
+ )

  @classmethod
  def from_dir(
@@ -598,7 +578,6 @@ class _LeafNode:
  @pg.use_init_args(['children'])
  class Suite(Evaluable):
  """Evaluation suite."""
-
  children: Annotated[list[Evaluable], 'Child evaluation sets or suites.']

  # Use empty ID as suite is just a container of child evaluations.
@@ -753,10 +732,12 @@ class Evaluation(Evaluable):

  # Constants.
  CACHE_JSON = 'cache.json'
- FAILURES_HTML = 'failures.html'
+ OOP_FAILURES_HTML = 'oop_failures.html'
+ NON_OOP_FAILURES_HTML = 'non_oop_failures.html'

  @functools.cached_property
  def hash(self) -> str:
+ """Returns the semantic-based hash of the evaluation."""
  if self.is_deterministic:
  identity = pg.format(self._identifiers(), compact=True)
  else:
@@ -805,6 +786,10 @@ class Evaluation(Evaluable):
  """Returns the complete rate."""
  return self.num_completed / self.num_examples

+ #
+ # Properties on failures.
+ #
+
  @property
  def failures(self) -> list[tuple[Any, Exception]]:
  """Returns the failed examples and their errors."""
@@ -815,6 +800,15 @@ class Evaluation(Evaluable):
  """Returns the number of failed examples."""
  return len(self.failures)

+ @functools.cached_property
+ def failure_breakdown(self) -> dict[str, int]:
+ """Returns the breakdown of failures."""
+ breakdown = collections.defaultdict(int)
+ for _, error in self.failures:
+ breakdown[_error_key(error)] += 1
+ sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
+ return pg.Dict({x[0]: x[1] for x in sorted_items})
+
  @property
  def failure_rate(self) -> float:
  """Returns the failure rate in range [0, 1]."""
@@ -822,6 +816,46 @@ class Evaluation(Evaluable):
  return 0.0
  return self.num_failures / self.num_completed

+ @functools.cached_property
+ def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
+ """Returns the OOP failures."""
+ return [item for item in self.failures
+ if isinstance(item[1], lf_structured.MappingError)]
+
+ @property
+ def num_oop_failures(self) -> int:
+ """Returns the number of OOP failures."""
+ return len(self.oop_failures)
+
+ @property
+ def oop_failure_rate(self) -> float:
+ """Returns the OOP failure rate in range [0, 1]."""
+ if self.num_completed == 0:
+ return 0.0
+ return self.num_oop_failures / self.num_completed
+
+ @functools.cached_property
+ def non_oop_failures(self) -> list[tuple[Any, Exception]]:
+ """Returns the OOP failures."""
+ return [item for item in self.failures
+ if not isinstance(item[1], lf_structured.MappingError)]
+
+ @property
+ def num_non_oop_failures(self) -> int:
+ """Returns the number of non-OOP failures."""
+ return len(self.non_oop_failures)
+
+ @property
+ def non_oop_failure_rate(self) -> float:
+ """Returns the non-OOP failure rate in range [0, 1]."""
+ if self.num_completed == 0:
+ return 0.0
+ return self.num_non_oop_failures / self.num_completed
+
+ #
+ # Properties on usage.
+ #
+
  @property
  def has_usage(self) -> bool:
  """Returns True if token usage is enabled."""
@@ -905,7 +939,7 @@ class Evaluation(Evaluable):

  fields = list(cls.__schema__.values())
  fields.insert(0, (self.completion_prompt_field, pg.typing.Str()))
- pg.symbolic.update_schema(cls, fields, extend=False)
+ cls.update_schema(fields, extend=False)

  def _maybe_adjust_examples_for_completion(
  self,
@@ -976,13 +1010,22 @@ class Evaluation(Evaluable):
  self._total_prompt_tokens = 0
  self._total_completion_tokens = 0
  self._num_usages = 0
+ self.__dict__.pop('oop_failures', None)
+ self.__dict__.pop('non_oop_failures', None)
+
+ @property
+ def oop_failures_link(self) -> str | None:
+ """Returns the link to the OOP failures page."""
+ if self.dir is None:
+ return None
+ return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))

  @property
- def failures_link(self) -> str | None:
- """Returns the link to the failures page."""
+ def non_oop_failures_link(self) -> str | None:
+ """Returns the link to then non-OOP failures page."""
  if self.dir is None:
  return None
- return self.link(os.path.join(self.dir, Evaluation.FAILURES_HTML))
+ return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))

  def _dryrun(
  self,
@@ -992,11 +1035,11 @@ class Evaluation(Evaluable):
  verbose: bool,
  **kwargs,
  ) -> None:
- # Set the example for dryrun.
- example = example or self.examples[0]
-
  # We make a copy to avoid pollute the state of current object.
  copy: Evaluation = self.clone()
+
+ # Set the example for dryrun.
+ example = example or copy.examples[0]
  copy.__dict__['examples'] = [example]

  # We set the symbolic parent of the cloned to access contextual information
@@ -1011,23 +1054,36 @@ class Evaluation(Evaluable):
  color='green',
  )

- with lf.use_settings(debug=debug):
- output_message = copy.process(example, **(self.additional_args or {}))
- if self.schema is None:
- output = output_message.text
- else:
- output = output_message.result
+ error, output_message = None, None

- if verbose:
+ try:
+ with lf.use_settings(debug=debug):
+ output_message = copy.process(example, **(self.additional_args or {}))
+ self.process_output(example, output_message)
+
+ if self.schema is None:
+ output = output_message.text
+ else:
+ output = output_message.result
+
+ if verbose:
+ lf.console.write('')
+ lf.console.write(
+ str(output),
+ title='OUTPUT',
+ color='blue',
+ )
+ except lf_structured.MappingError as e:
  lf.console.write('')
  lf.console.write(
- str(output),
- title='OUTPUT',
- color='blue',
+ str(e),
+ title='ERROR',
+ color='red',
  )
+ error = e

- copy.audit(example, output_message, None, dryrun=True)
- result = copy.summarize()
+ copy.audit(1, example, output_message, error, dryrun=True)
+ result = copy.finalize()

  if verbose:
  lf.console.write('')
@@ -1048,9 +1104,13 @@ class Evaluation(Evaluable):
  verbose: bool,
  progress_bar: int | None,
  label: str | None,
+ timeout: int | None = None,
  **kwargs,
  ) -> None:
  # Setup examples.
+ # Reset examples so it could be read from the input functor.
+ self.__dict__.pop('examples', None)
+
  if end is None:
  end = len(self.examples)
  examples = self.examples[start:end]
@@ -1059,20 +1119,24 @@ class Evaluation(Evaluable):
  with lf.use_settings(debug=debug, cache=self.cache):
  self._reset()

- def _process(example: Any):
+ def _process(idx_and_example: Any):
  # NOTE(daiyip): set the `input` symbol of the globals to None, so LLM
  # generated code with calls to `input` will raise an error, thus not
  # blocking the evaluation.
+ _, example = idx_and_example
  with lf_coding.context(input=None):
- return self.process(example, **(self.additional_args or {}))
+ output_message = self.process(example, **(self.additional_args or {}))
+ self.process_output(example, output_message)
+ return output_message

  try:
- for example, message, error in lf.concurrent_map(
+ for (idx, example), message, error in lf.concurrent_map(
  _process,
- examples,
+ enumerate(examples),
  max_workers=self.max_workers,
  show_progress=progress_bar or False,
  status_fn=self._status,
+ timeout=timeout,
  ):
  if error is not None:
  message = (
@@ -1080,14 +1144,14 @@ class Evaluation(Evaluable):
  if isinstance(error, lf_structured.MappingError)
  else None
  )
- self.audit(example, message, error)
+ self.audit(idx + 1, example, message, error)
  finally:
  # Save cache upon completion or interruption.
  if self.dir and self.cache:
  self.cache.save()

  # Summarize result.
- self._result = self.summarize()
+ self._result = self.finalize()
  if verbose:
  lf.console.write(
  str(self.result),
@@ -1101,7 +1165,7 @@ class Evaluation(Evaluable):

  def process(self, example: Any, **kwargs) -> lf.Message:
  """Process an example and returns its output."""
- prompt = self.prompt.render(example=example).text
+ prompt = lf.Template.from_value(self.prompt, example=example)
  if self.method == 'call':
  return lf_structured.call(
  prompt,
@@ -1129,7 +1193,9 @@ class Evaluation(Evaluable):
  else:
  assert self.method == 'complete', self.method
  assert isinstance(self.schema.spec, pg.typing.Object), self.schema
- input_value = self.schema.spec.cls.partial(prompt)
+ # TODO(daiyip): Currently multi-modal inputs within the prompt for
+ # completion is not supported.
+ input_value = self.schema.spec.cls.partial(prompt.render().text)
  return lf_structured.complete(
  input_value,
  lm=self.lm,
@@ -1140,16 +1206,48 @@ class Evaluation(Evaluable):
  **kwargs,
  )

+ def process_output(self, example: Any, output: lf.Message) -> None:
+ """Process the output for an example.
+
+ Subclasses can override this method to generate and attach additional
+ metadata for debugging purpose. For example, draw bounding boxes on the
+ input image based on LLM predicted boxes and attach to output_message's
+ metadata.
+
+ Example:
+
+ class BoundingBoxEval(lf.eval.Matching):
+ ...
+ def process_output(example, output):
+ output.metadata.image_with_bbox = draw_bboxes(
+ example.image, output.result)
+
+ Args:
+ example: User input.
+ output: LLM's output message. Users could attach additional
+ information to the message, which will be shown in debugging
+ """
+ del example, output
+
  def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
+ status = {'Model': self.lm.model_id}
+ status.update(self._eval_status(progress))
+
+ if progress.last_error is not None:
+ status['LastError'] = progress.last_error_str()
+ if progress.timeit_summary:
+ status['TimeIt'] = progress.timeit_summary_str()
+ return status
+
+ def _eval_status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
  return {
- 'Model': self.lm.model_id,
- 'Succeeded': f'%.{self.report_precision}f%% (%d/%d)' % (
- progress.success_rate * 100,
+ 'Succeeded': '%s (%d/%d)' % (
+ self._format_rate(progress.success_rate),
  progress.succeeded,
  progress.completed,
  ),
- 'Failed': f'%.{self.report_precision}f%% (%d/%d)' % (
- progress.failure_rate * 100,
+ 'Failed': '%s (%d/%d)' % (
+ self._format_rate(progress.failure_rate),
  progress.failed,
  progress.completed,
  ),
@@ -1159,22 +1257,21 @@ class Evaluation(Evaluable):
  assert self.result is not None
  m = self.result.metrics
  return (
- f'COMPLETED(%s): Successes=%.{self.report_precision}f%% (%d/%d)'
- f' Failures=%.{self.report_precision}f%% (%d/%d)'
+ 'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
  % (
  run_status,
- (1 - m.failure_rate) * 100,
+ self._format_rate(1 - m.failure_rate),
  m.total - m.failures,
  m.total,
- m.failure_rate * 100,
+ self._format_rate(m.failure_rate),
  m.failures,
  m.total,
  )
  )

- def summarize(self) -> pg.Dict:
- """Summarizes the evaluation result."""
- if self.cache:
+ def finalize(self) -> pg.Dict:
+ """Finalizes the evaluation result."""
+ if self.cache is not None:
  cache_stats = dict(
  use_cache=True,
  num_queries=self.cache.stats.num_queries,
@@ -1201,7 +1298,7 @@ class Evaluation(Evaluable):
  id=self.id,
  dir=self.dir,
  model=self.lm.model_id,
- prompt_template=lf.text_formatting.decolored(str(self.prompt)),
+ prompt_template=pg.decolor(str(self.prompt)),
  method=self.method,
  schema_fn=str(self.schema_fn),
  ),
@@ -1210,38 +1307,47 @@ class Evaluation(Evaluable):
  total=self.num_completed,
  failures=self.num_failures,
  failure_rate=self.failure_rate,
+ oop_failures=self.num_oop_failures,
+ oop_failure_rate=self.oop_failure_rate,
+ non_oop_failures=self.num_non_oop_failures,
+ non_oop_failure_rate=self.non_oop_failure_rate,
+ failure_breakdown=self.failure_breakdown,
  ),
  usage=usage,
  )
  return result

- def summarize_html(self) -> str:
+ def summary_card(self) -> str:
+ """Returns summary card in HTML."""
  s = io.StringIO()
  definition = _html_repr(self, compact=False, escape=True)
  s.write('<div><table><tr><td>')
+ self._render_link(
+ s,
+ definition,
+ self.hash,
+ '',
+ lambda: self.link(self.dir),
+ )
  if self.result is None:
  s.write(
- f'<a target="_blank" title="{definition}" '
- f'href="{self.link(self.dir)}">{self.hash}</a>'
  '</td></tr><tr><td>'
  '<span style="color: gray">(IN-PROGRESS...)</span>'
  )
  else:
- s.write(
- f'<a target="_blank" title="{definition}" '
- f'href="{self.index_link}">{self.hash}</a>'
- '</td></tr><tr><td>'
- )
- self._render_metric(s)
+ if self.dir:
+ s.write(f' &nbsp;[<a href="{self.link(self.dir)}">dir</a>]')
+ s.write('</td></tr><tr><td>')
+ self._render_summary_metrics(s)

  # Summarize average usage.
- if self.result.usage is not None:
- self._render_usage(s)
+ if self.result.usage:
+ self._render_summary_usage(s)

  s.write('</td></tr></table></div>')
  return s.getvalue()

- def _render_usage(self, s: io.StringIO) -> None:
+ def _render_summary_usage(self, s: io.StringIO) -> None:
  """Renders usage in HTML."""
  usage = self.result.usage
  total = usage.total_prompt_tokens + usage.total_completion_tokens
@@ -1255,22 +1361,79 @@ class Evaluation(Evaluable):
  f'" style="color:gray">({total} tokens)</a>'
  )

- def _render_metric(self, s: io.StringIO) -> None:
+ def _render_link(self,
+ s: io.StringIO,
+ title: str,
+ text: str,
+ style: str,
+ url_fn: Callable[[], str]) -> None:
+ """Renders a link in HTML."""
+ s.write(
+ f'<a target="_blank" title="{title}" style="{style}"'
+ )
+ if self.dir:
+ s.write(f' href="{url_fn()}"')
+ s.write(f'>{text}</a>')
+
+ def _render_summary_metrics(self, s: io.StringIO) -> None:
  """Renders metrics in HTML."""
  assert self.result is not None
  m = self.result.metrics
- s.write(
- '<a title="Failures (%d/%d)" href="%s" style="color:red">%s</a>'
- % (
- m.failures,
- m.total,
- self.failures_link,
- f'%.{self.report_precision}f%% ' % (m.failure_rate * 100),
- )
+
+ # OOP failures.
+ oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
+ if m.oop_failures:
+ oop_failure_title += '&#013;'
+ for name, count in m.failure_breakdown.items():
+ if name.startswith('MappingError'):
+ oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+ name.removeprefix('MappingError.'),
+ self._format_rate(count / m.total),
+ count,
+ m.total,
+ )
+
+ extra_style = ''
+ if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
+ extra_style = ';font-weight:bold'
+ self._render_link(
+ s,
+ oop_failure_title,
+ self._format_rate(m.oop_failure_rate),
+ f'color:magenta{extra_style}',
+ lambda: self.oop_failures_link,
  )
+ s.write(' | ')
+
+ # Non-OOP failures.
+ non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
+ if m.non_oop_failures:
+ non_oop_failure_title += '&#013;'
+ for name, count in m.failure_breakdown.items():
+ if not name.startswith('MappingError'):
+ non_oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+ name,
+ self._format_rate(count / m.total),
+ count,
+ m.total,
+ )
+
+ extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
+ self._render_link(
+ s,
+ non_oop_failure_title,
+ self._format_rate(m.non_oop_failure_rate),
+ f'color:red{extra_style}',
+ lambda: self.non_oop_failures_link,
+ )
+
+ def _format_rate(self, rate: float) -> str:
+ """Formats a rate."""
+ return f'%.{self.report_precision}f%% ' % (rate * 100)

  def audit(
  self,
+ example_idx: int,
  example: Any,
  message: lf.Message | None,
  error: Exception | None = None,
@@ -1279,6 +1442,7 @@ class Evaluation(Evaluable):
  """Audits the example against the output. Subclasses should override.

  Args:
+ example_idx: 1-based index of the example in its dataset.
  example: The input object.
  message: The entire message returned by the LM, which could be used to
  trace the LM input, response and parsed structure. If error is raised
@@ -1287,13 +1451,19 @@ class Evaluation(Evaluable):
  dryrun: Whether or not audition takes place during dryrun.
  """
  if error is not None:
- self._failures.append((example, str(error)))
+ self._failures.append((example, error))
+
+ # Invalid cache of num_oop_failures.
+ self.__dict__.pop('oop_failures', None)
+ self.__dict__.pop('non_oop_failures', None)
+ self.__dict__.pop('failure_breakdown', None)
+
  if isinstance(error, lf_structured.MappingError):
  message = error.lm_response
  else:
  assert message is not None
  output = message.text if self.schema is None else message.result
- self.audit_processed(example, output, message, dryrun=dryrun)
+ self.audit_processed(example_idx, example, output, message, dryrun=dryrun)

  # Audit usage.
  if message is not None:
@@ -1301,14 +1471,17 @@ class Evaluation(Evaluable):
  self._num_completed += 1

  def audit_usage(self, message: lf.Message, dryrun: bool = False) -> None:
+ del dryrun
  for m in message.trace():
- if 'usage' in m.metadata:
- self._total_prompt_tokens += m.usage.prompt_tokens
- self._total_completion_tokens += m.usage.completion_tokens
+ usage = m.metadata.get('usage', None)
+ if usage:
+ self._total_prompt_tokens += usage.prompt_tokens
+ self._total_completion_tokens += usage.completion_tokens
  self._num_usages += 1

  def audit_processed(
- self, example: Any, output: Any, message: lf.Message, dryrun: bool = False
+ self, example_idx: int, example: Any, output: Any, message: lf.Message,
+ dryrun: bool = False
  ) -> None:
  """Audits a successfully processed example. Subclass should override."""

@@ -1333,16 +1506,26 @@ class Evaluation(Evaluable):
  # Save failures.
  pg.save(
  [
- pg.Dict(
- input=input, error=lf.text_formatting.decolored(str(error))
- )
- for input, error in self.failures
+ pg.Dict(input=input, error=_format_error(error))
+ for input, error in self.oop_failures
  ],
- os.path.join(self.dir, Evaluation.FAILURES_JSON),
+ os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
  )
  pg.save(
- self._html([self._render_result, self._render_failures]),
- os.path.join(self.dir, Evaluation.FAILURES_HTML),
+ self._html([self._render_result, self._render_oop_failures]),
+ os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
+ file_format='txt',
+ )
+ pg.save(
+ [
+ pg.Dict(input=input, error=_format_error(error))
+ for input, error in self.non_oop_failures
+ ],
+ os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
+ )
+ pg.save(
+ self._html([self._render_result, self._render_non_oop_failures]),
+ os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
  file_format='txt',
  )

@@ -1355,9 +1538,10 @@ class Evaluation(Evaluable):
  '<td>Schema</td>'
  '<td>Additional Args</td>'
  )
- if self.result.usage is not None:
+ if self.result.usage:
  s.write('<td>Usage</td>')
- s.write('<td>Failures</td>')
+ s.write('<td>OOP Failures</td>')
+ s.write('<td>Non-OOP Failures</td>')

  def _render_result_row(self, s: io.StringIO) -> None:
  s.write(
@@ -1383,18 +1567,31 @@ class Evaluation(Evaluable):
  f'{_html_repr(self.additional_args, compact=False)}</td>'
  )
  # Usage.
- if self.result.usage is not None:
+ if self.result.usage:
  s.write('<td>')
- self._render_usage(s)
+ self._render_summary_usage(s)
  s.write('</td>')

- # Failures.
+ # OOP failures.
  s.write(
- '<td><span style="color:orange">%s</span>%s</td>'
+ '<td><span style="color:magenta">%s</span>%s</td>'
  % (
- f'%.{self.report_precision}f%%' % (self.failure_rate * 100),
+ self._format_rate(self.oop_failure_rate),
  '<a href="%s">(%d/%d)</a>'
- % (self.failures_link, self.num_failures, self.num_completed),
+ % (self.oop_failures_link,
+ self.num_oop_failures,
+ self.num_completed),
+ )
+ )
+ # Non-OOP failures.
+ s.write(
+ '<td><span style="color:red">%s</span>%s</td>'
+ % (
+ self._format_rate(self.non_oop_failure_rate),
+ '<a href="%s">(%d/%d)</a>'
+ % (self.non_oop_failures_link,
+ self.num_non_oop_failures,
+ self.num_completed),
  )
  )

@@ -1408,31 +1605,99 @@ class Evaluation(Evaluable):
  else:
  return 'cyan'

- def _render_failures(self, s: io.StringIO) -> None:
+ def _render_oop_failures(self, s: io.StringIO) -> None:
+ self._render_failures(s, '^MappingError.*', error_color='magenta')
+
+ def _render_non_oop_failures(self, s: io.StringIO) -> None:
+ self._render_failures(s, '^(?!MappingError).*', error_color='red')
+
+ def _render_failures(
+ self, s: io.StringIO, error_regex: str, error_color: str) -> None:
  """Formats the failed cases into html."""
+ # Failure summary.
  s.write(
- '<h2> Failed Cases </h2>'
+ '<h2> Error Summary </h2>'
  '<div style="white-space:pre">\n'
  '<table style="border:1px solid">'
- '<tr class="header"><td>No.</td><td>Input</td><td>Error</td></tr>'
+ '<tr class="header"><td>Error type</td><td>Stats</td></tr>'
+ )
+ error_regex = re.compile(error_regex)
+ if self.result.metrics.failure_breakdown:
+ for name, count in self.result.metrics.failure_breakdown.items():
+ if not error_regex.match(name):
+ continue
+
+ link = f'<a href="#{name}">{name}</a>'
+ error_rate = self._format_rate(count / self.result.metrics.total)
+ stats = (f'<span style="color:{error_color}">{error_rate} '
+ f'({count}/{self.result.metrics.total})</span>')
+ s.write(f'<tr><td>{link}</td><td>{stats})</td></tr>')
+ s.write(
+ '</table></div>'
+ '<h2> Failed Cases </h2>'
+ '<div style="white-space:pre">'
  )
+ # Failure details by error type.
+ failures_by_error = collections.defaultdict(list)
+ for example, error in self.failures:
+ error_name = _error_key(error)
+ if error_regex.match(error_name):
+ failures_by_error[error_name].append((example, error))
+
+ for error_key, failures in failures_by_error.items():
+ s.write(
+ f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
+ f'(count={len(failures)})</h3>'
+ '<table style="border:1px solid">'
+ '<tr class="header"><td>No.</td><td>Input</td>'
+ '<td>LM invocation</td><td>Error</td></tr>'
+ )
+ for i, (example, error) in enumerate(failures):
+ lm_response = None
+ if isinstance(error, lf.structured.MappingError):
+ lm_response = error.lm_response
+ error = error.cause
+
+ bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
+ s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
+ s.write('<td style="color:green;white-space:pre-wrap">')
+ s.write(pg.format(example, verbose=False))
+ s.write('</td><td>')
+ if lm_response is not None:
+ self._render_message(lm_response, s)
+ s.write(f'</td><td style="color:{error_color};white-space:pre">')
+ s.write(_format_error(error))
+ s.write('</td></tr>')
+ s.write('</table>')
+ s.write('</div>')

- for i, (example, error) in enumerate(self.failures):
- bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
- s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
- input_str = pg.format(example, verbose=False)
- s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
- error_str = lf.text_formatting.decolored(str(error))
- s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
- s.write('</tr>')
- s.write('</table></div>')
+ @classmethod
+ def visualize(cls, evaluations: list['Evaluation']) -> str | None:
+ """Visualize the a list of evaluations of this task in HTML."""
+ del evaluations
+ return None


  @pg.functor()
- def inputs_from(path: str | list[str]) -> list[Any]:
+ def inputs_from(path: str | list[str], **kwargs) -> list[Any]:
  """A functor that returns a list of user-defined objects as eval inputs."""
  if isinstance(path, str):
- return pg.load(path)
+ if path.endswith('.json'):
+ return pg.load(path)
+ elif path.endswith('.jsonl'):
+ return list(iter(pg.open_jsonl(path)))
+ elif path.endswith('.csv'):
+ import pandas as pd # pylint: disable=g-import-not-at-top
+ dataset_df = pd.read_csv(path, **kwargs)
+ dataset = []
+ for i in range(dataset_df.shape[0]):
+ row = {}
+ for col in dataset_df.columns:
+ row[col] = dataset_df.iloc[i][col]
+ dataset.append(row)
+ return dataset
+ else:
+ raise ValueError(f'Unsupported file format: {path}')
  examples = []
  for p in path:
  examples.extend(pg.load(p))
@@ -1578,7 +1843,7 @@ class Summary(pg.Object):
  if e is None:
  s.write('<span style="color: gray">N/A<span>')
  else:
- s.write(e.summarize_html())
+ s.write(e.summary_card())
  s.write('</td>')
  s.write('</tr>')
  s.write('</table>')
@@ -1653,13 +1918,22 @@ class Summary(pg.Object):
  s.write('<html><body>')
  for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
  table_id = task.__name__.lower()
+ evaluations = self.select(task=task).evaluations
+ table = Summary.Table.from_evaluations(evaluations, pivot_field)
  s.write('<div>')
- s.write(f'<a id="{table_id}"')
- s.write(f'<h2><a href="#{table_id}">{task.__name__}</a></h2>')
- s.write('</a>')
- table = Summary.Table.from_evaluations(
- self.select(task=task).evaluations, pivot_field
+ s.write(
+ f'<a id="{table_id}" href="#{table_id}">'
+ f'<h2>{task.__name__}</h2></a>'
  )
+
+ # Allow users to plugin visualization code (e.g. matplot) in the summary
+ # page.
+ visual_part = task.visualize(evaluations)
+ if visual_part:
+ s.write(visual_part)
+
+ s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
+ s.write('<hr/>')
  s.write(table.html())
  s.write('</div>')
  s.write('</body></html>')
@@ -1685,6 +1959,7 @@ class Summary(pg.Object):
  experiment=entry,
  dir=entry.dir,
  metrics=entry.result.metrics if entry.result else None,
+ usage=entry.result.usage if entry.result else None,
  )
  )
  task_results[task.__name__] = results
@@ -1833,6 +2108,20 @@ class Summary(pg.Object):
  return result.join()


+ def _format_error(error: Exception):
+ """Formats an error into a string."""
+ return (f'({error.__class__.__name__}) ' + pg.decolor(str(error)))
+
+
+ def _error_key(error: Exception) -> str:
+ """Returns the key for an error."""
+ error_names = []
+ while error is not None:
+ error_names.append(error.__class__.__name__)
+ error = getattr(error, 'cause', None)
+ return '.'.join(error_names)
+
+
  def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
  """Formats prompt in HTML."""
  if type(value) is lf.Template: # pylint: disable=unidiomatic-typecheck
@@ -1909,41 +2198,200 @@ def monitor_async(
  )


- def app_run(target: Evaluable):
- """Runs the target evaluation as an absl app.
+ #
+ # Named evaluations and experiments support.
+ #

- Args:
- target: An Langfun evaluable object.
- """
- flags.DEFINE_string(
- 'root_dir', None, 'Root directory for running the evaluation.'
- )

- flags.DEFINE_bool(
- 'dryrun', False, 'If True, dryrun the experiment instead of running it.'
- )
+ class _NamedEvaluationRegistry:
+ """Named evaluation registry."""

- flags.DEFINE_bool(
- 'debug', False, 'If True, output prompt and response to the console.'
- )
+ def __init__(self):
+ self._registry = {}

- flags.DEFINE_bool(
- 'rerun',
- False,
- 'If True, rerun the experiment even a cached result is found.',
- )
+ def names(self) -> list[str]:
+ """Returns all registered names."""
+ return sorted(self._registry.keys())
+
+ def get(self, name: str) -> list[Type[Evaluable]]:
+ """Gets an evaluation by name."""
+ matches = []
+ if name in self._registry:
+ matches.append(self._registry[name])
+ else:
+ regex = re.compile(name)
+ for key, cls in self._registry.items():
+ if regex.match(key):
+ matches.append(cls)
+ return matches
+
+ def register(
+ self,
+ name: str,
+ experiment_cls: Type[Evaluable],
+ ):
+ """Register an experiment class."""
+ self._registry[name] = experiment_cls

- FLAGS = flags.FLAGS # pylint: disable=invalid-name

- def _main(argv):
- if len(argv) > 1:
- raise app.UsageError('Too many command-line arguments.')
+ _eval_registry = _NamedEvaluationRegistry()

- if FLAGS.root_dir:
- target.rebind(root_dir=FLAGS.root_dir, raise_on_no_change=False)
- if FLAGS.dryrun:
- target.dryrun(debug=FLAGS.debug)
+
+ def registered_names() -> list[str]:
+ """Returns all registered names."""
+ return _eval_registry.names()
+
+
+ def get_evaluations(evaluation: str | Evaluable) -> list[Evaluable]:
+ """Gets an evaluation experiment by name."""
+ if isinstance(evaluation, str):
+ return [e() for e in _eval_registry.get(evaluation)]
+ return [evaluation]
+
+
+ def register(name: str):
+ """Decorator to create a named evaluation class."""
+
+ def _register(func_or_cls: Type[Evaluation] | types.FunctionType):
+ if inspect.isfunction(func_or_cls):
+ e = func_or_cls()
+ if not isinstance(e, Evaluable):
+ raise TypeError(
+ f'The return value of `{func_or_cls}` should be an instance of '
+ '`lf.eval.Evaluable` subclass.'
+ )
+
+ class GeneratedSuite(Suite):
+ # NOTE(daiyip): Delay serialization key registration for generated
+ # class.
+ auto_register = False
+ children = e.children if isinstance(e, Suite) else [e]
+
+ cls = GeneratedSuite
+ cls.__name__ = func_or_cls.__name__
+ cls.__doc__ = func_or_cls.__doc__
+ cls.__qualname__ = func_or_cls.__qualname__
+ cls.__module__ = getattr(func_or_cls, '__module__', 'wrapper')
+ cls.register_for_deserialization(cls.__type_name__)
+
+ elif issubclass(func_or_cls, Evaluable):
+ cls = func_or_cls
  else:
- target.run(debug=FLAGS.debug, rerun=FLAGS.rerun)
+ raise ValueError(f'Unsupported type: {type(func_or_cls)}')
+
+ _eval_registry.register(name, cls)
+ return cls
+
+ return _register
+
+
+ def get(
+ root_dir: str,
+ evaluations: list[str | Evaluable],
+ filter: Union[ # pylint: disable=redefined-builtin
+ str, # Regex to filter evaluation based on ID.
+ Callable[[Evaluable], bool], # Custom filter function.
+ None # No filtering (Default).
+ ] = None, # pylint: disable=bad-whitespace
+ patches: list[Union[
+ str, # String-based PyGlove patcher.
+ pg.patching.Patcher, # PyGlove patcher object.
+ Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+ ]] | None = None, # pylint: disable=bad-whitespace
+ ) -> Suite:
+ """Gets a suite from a list of patched evaluations.
+
+ Args:
+ root_dir: The root directory of the experiment.
+ evaluations: A list of evaluations to be included in the suite.
+ filter: A regular expression (str) for selecting sub-experiments of matched
+ IDs, or a filter function to filter the evaluations.
+ patches: A list of patches to be applied to the suite. Each element can be
+ a string (for string-based patcher), a `pg.patching.Patcher` object, or
+ a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+ details.
+
+ Returns:
+ A suite of selected `lf.eval.Evaluation` objects.
+ """
+ matches = []
+ for e in evaluations:
+ matches.extend(get_evaluations(e))
+
+ if not matches:
+ raise ValueError('No evaluations found.')
+
+ suite = Suite(matches, root_dir=root_dir)
+ if patches:
+ suite = pg.patch(suite, patches)
+
+ if isinstance(filter, str):
+ regex = re.compile(filter)
+ filter = lambda x: bool(regex.match(x.id))
+
+ if filter:
+ suite = Suite(
+ [leaf for leaf in suite.leaf_nodes if filter(leaf)], root_dir=root_dir)
+ return suite
+
+
+ def run(
+ root_dir: str,
+ evaluations: list[str | Evaluable],
+ filter: Union[ # pylint: disable=redefined-builtin
+ str, # Regex to filter evaluation based on ID.
+ Callable[[Evaluable], bool], # Custom filter function.
+ None # No filtering (Default).
+ ] = None, # pylint: disable=bad-whitespace
+ patches: list[Union[
+ str, # String-based PyGlove patcher.
+ pg.patching.Patcher, # PyGlove patcher object.
+ Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+ ]] | None = None, # pylint: disable=bad-whitespace
+ mode: Literal['run', 'rerun', 'dryrun', 'noop'] = 'run',
+ debug: bool = False,
+ print_definition: bool = False,
+ **kwargs,
+ ) -> Suite:
+ """Run selected evaluations with patching.
+
+ Args:
+ root_dir: The root directory of the experiment.
+ evaluations: A list of evaluations to be included in the suite.
+ filter: A regular expression (str) for selecting sub-experiments of matched
+ IDs, or a filter function to filter the evaluations.
+ patches: A list of patches to be applied to the suite. Each element can be
+ a string (for string-based patcher), a `pg.patching.Patcher` object, or
+ a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+ details.
+ mode: The mode to run the suite. "run" to run the suite, with reusing
+ existing results if available; "rerun" to rerun all evaluations even if
+ there are existing results; "dryrun" to dryrun the suite; and "noop"
+ to do nothing.
+ debug: Whether to run in debug mode.
+ print_definition: Whether to print the experiment definition.
+ **kwargs: Additional arguments to be passed to dryrun/run the suite.
+
+ Returns:
+ A suite of selected `lf.eval.Evaluation` objects.
+ """
+ suite = get(root_dir, evaluations, patches=patches, filter=filter)
+ if print_definition:
+ lf.console.write(
+ pg.format(
+ suite,
+ compact=False,
+ verbose=False,
+ hide_default_values=True,
+ python_format=True,
+ ),
+ title='[EXPERIMENT DEFINITION]',
+ color='blue',
+ )

- app.run(_main)
+ if mode == 'run':
+ rerun = mode == 'rerun'
+ suite.run(debug=debug, rerun=rerun, **kwargs)
+ elif mode == 'dryrun':
+ suite.dryrun(debug=debug, **kwargs)
+ return suite
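
For context on the `app_run` removal above: evaluations are now registered by name and launched through the module-level `run` added in this file. A minimal sketch, assuming `register`, `run` and `Evaluation` are re-exported under `lf.eval` (the `langfun/core/eval/__init__.py` changes are not shown in this diff), and using a hypothetical `Baseline` class and output path:

import langfun as lf

# `register` accepts either an Evaluable subclass or a function returning an
# Evaluable instance; here a subclass is registered under a name. Field
# definitions (inputs, prompt, method, schema_fn, lm, ...) are omitted, so this
# is illustration only, not a runnable evaluation.
@lf.eval.register('my_task/baseline')
class Baseline(lf.eval.Evaluation):
  ...

# Select registered evaluations by name (names are also matched as regular
# expressions against the registry) and run them under a root directory.
# `mode` is one of 'run', 'rerun', 'dryrun' or 'noop'.
lf.eval.run(
    '/tmp/my_eval_root',
    ['my_task/.*'],
    mode='run',
)

Compared with the removed `app_run`, rerun/dryrun selection moves from absl command-line flags into the `mode` argument, and the output directory is passed explicitly as `root_dir`.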