langfun 0.0.2.dev20240330__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145)
  1. langfun/__init__.py +22 -2
  2. langfun/core/__init__.py +17 -5
  3. langfun/core/agentic/__init__.py +30 -0
  4. langfun/core/agentic/action.py +854 -0
  5. langfun/core/agentic/action_eval.py +150 -0
  6. langfun/core/agentic/action_eval_test.py +109 -0
  7. langfun/core/agentic/action_test.py +136 -0
  8. langfun/core/coding/python/__init__.py +5 -11
  9. langfun/core/coding/python/correction.py +37 -28
  10. langfun/core/coding/python/correction_test.py +29 -3
  11. langfun/core/coding/python/execution.py +40 -216
  12. langfun/core/coding/python/execution_test.py +29 -89
  13. langfun/core/coding/python/generation.py +21 -11
  14. langfun/core/coding/python/generation_test.py +2 -2
  15. langfun/core/coding/python/parsing.py +108 -193
  16. langfun/core/coding/python/parsing_test.py +2 -105
  17. langfun/core/component.py +69 -2
  18. langfun/core/component_test.py +54 -0
  19. langfun/core/concurrent.py +414 -117
  20. langfun/core/concurrent_test.py +111 -24
  21. langfun/core/console.py +18 -5
  22. langfun/core/console_test.py +17 -0
  23. langfun/core/eval/__init__.py +17 -0
  24. langfun/core/eval/base.py +767 -140
  25. langfun/core/eval/base_test.py +238 -53
  26. langfun/core/eval/matching.py +80 -76
  27. langfun/core/eval/matching_test.py +19 -9
  28. langfun/core/eval/patching.py +130 -0
  29. langfun/core/eval/patching_test.py +170 -0
  30. langfun/core/eval/scoring.py +37 -28
  31. langfun/core/eval/scoring_test.py +21 -3
  32. langfun/core/eval/v2/__init__.py +42 -0
  33. langfun/core/eval/v2/checkpointing.py +380 -0
  34. langfun/core/eval/v2/checkpointing_test.py +228 -0
  35. langfun/core/eval/v2/eval_test_helper.py +136 -0
  36. langfun/core/eval/v2/evaluation.py +725 -0
  37. langfun/core/eval/v2/evaluation_test.py +180 -0
  38. langfun/core/eval/v2/example.py +305 -0
  39. langfun/core/eval/v2/example_test.py +128 -0
  40. langfun/core/eval/v2/experiment.py +1048 -0
  41. langfun/core/eval/v2/experiment_test.py +433 -0
  42. langfun/core/eval/v2/metric_values.py +156 -0
  43. langfun/core/eval/v2/metric_values_test.py +80 -0
  44. langfun/core/eval/v2/metrics.py +357 -0
  45. langfun/core/eval/v2/metrics_test.py +203 -0
  46. langfun/core/eval/v2/progress.py +348 -0
  47. langfun/core/eval/v2/progress_test.py +82 -0
  48. langfun/core/eval/v2/progress_tracking.py +210 -0
  49. langfun/core/eval/v2/progress_tracking_test.py +66 -0
  50. langfun/core/eval/v2/reporting.py +270 -0
  51. langfun/core/eval/v2/reporting_test.py +158 -0
  52. langfun/core/eval/v2/runners.py +488 -0
  53. langfun/core/eval/v2/runners_test.py +334 -0
  54. langfun/core/langfunc.py +3 -21
  55. langfun/core/langfunc_test.py +26 -8
  56. langfun/core/language_model.py +686 -48
  57. langfun/core/language_model_test.py +681 -44
  58. langfun/core/llms/__init__.py +100 -12
  59. langfun/core/llms/anthropic.py +488 -0
  60. langfun/core/llms/anthropic_test.py +235 -0
  61. langfun/core/llms/cache/base.py +21 -2
  62. langfun/core/llms/cache/in_memory.py +13 -0
  63. langfun/core/llms/cache/in_memory_test.py +88 -28
  64. langfun/core/llms/compositional.py +101 -0
  65. langfun/core/llms/compositional_test.py +73 -0
  66. langfun/core/llms/deepseek.py +117 -0
  67. langfun/core/llms/deepseek_test.py +61 -0
  68. langfun/core/llms/fake.py +39 -26
  69. langfun/core/llms/fake_test.py +136 -11
  70. langfun/core/llms/gemini.py +507 -0
  71. langfun/core/llms/gemini_test.py +195 -0
  72. langfun/core/llms/google_genai.py +62 -218
  73. langfun/core/llms/google_genai_test.py +9 -197
  74. langfun/core/llms/groq.py +276 -0
  75. langfun/core/llms/groq_test.py +64 -0
  76. langfun/core/llms/llama_cpp.py +15 -40
  77. langfun/core/llms/llama_cpp_test.py +4 -30
  78. langfun/core/llms/openai.py +436 -226
  79. langfun/core/llms/openai_compatible.py +179 -0
  80. langfun/core/llms/openai_compatible_test.py +495 -0
  81. langfun/core/llms/openai_test.py +35 -174
  82. langfun/core/llms/rest.py +113 -0
  83. langfun/core/llms/rest_test.py +111 -0
  84. langfun/core/llms/vertexai.py +192 -0
  85. langfun/core/llms/vertexai_test.py +52 -0
  86. langfun/core/logging.py +284 -0
  87. langfun/core/logging_test.py +125 -0
  88. langfun/core/message.py +319 -9
  89. langfun/core/message_test.py +190 -13
  90. langfun/core/modalities/__init__.py +6 -2
  91. langfun/core/modalities/audio.py +30 -0
  92. langfun/core/modalities/audio_test.py +63 -0
  93. langfun/core/modalities/image.py +39 -20
  94. langfun/core/modalities/image_test.py +52 -9
  95. langfun/core/modalities/mime.py +206 -29
  96. langfun/core/modalities/mime_test.py +90 -9
  97. langfun/core/modalities/ms_office.py +117 -0
  98. langfun/core/modalities/ms_office_test.py +389 -0
  99. langfun/core/modalities/pdf.py +22 -0
  100. langfun/core/modalities/pdf_test.py +57 -0
  101. langfun/core/modalities/video.py +9 -23
  102. langfun/core/modalities/video_test.py +3 -3
  103. langfun/core/modality.py +26 -3
  104. langfun/core/modality_test.py +2 -2
  105. langfun/core/sampling.py +11 -11
  106. langfun/core/structured/__init__.py +15 -16
  107. langfun/core/structured/completion.py +32 -5
  108. langfun/core/structured/completion_test.py +9 -8
  109. langfun/core/structured/description.py +2 -2
  110. langfun/core/structured/description_test.py +3 -3
  111. langfun/core/structured/function_generation.py +278 -0
  112. langfun/core/structured/function_generation_test.py +399 -0
  113. langfun/core/structured/mapping.py +150 -46
  114. langfun/core/structured/mapping_test.py +105 -0
  115. langfun/core/structured/parsing.py +33 -21
  116. langfun/core/structured/parsing_test.py +71 -22
  117. langfun/core/structured/querying.py +746 -0
  118. langfun/core/structured/{prompting_test.py → querying_test.py} +545 -60
  119. langfun/core/structured/schema.py +208 -99
  120. langfun/core/structured/schema_generation.py +1 -1
  121. langfun/core/structured/schema_generation_test.py +2 -2
  122. langfun/core/structured/schema_test.py +133 -34
  123. langfun/core/structured/scoring.py +125 -19
  124. langfun/core/structured/scoring_test.py +30 -0
  125. langfun/core/structured/tokenization.py +64 -0
  126. langfun/core/structured/tokenization_test.py +48 -0
  127. langfun/core/template.py +240 -11
  128. langfun/core/template_test.py +146 -1
  129. langfun/core/templates/conversation.py +9 -0
  130. langfun/core/templates/conversation_test.py +4 -3
  131. langfun/core/templates/selfplay_test.py +14 -2
  132. langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
  133. langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
  134. {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
  135. langfun/core/coding/python/errors.py +0 -108
  136. langfun/core/coding/python/errors_test.py +0 -99
  137. langfun/core/coding/python/permissions.py +0 -90
  138. langfun/core/coding/python/permissions_test.py +0 -86
  139. langfun/core/structured/prompting.py +0 -217
  140. langfun/core/text_formatting.py +0 -162
  141. langfun/core/text_formatting_test.py +0 -47
  142. langfun-0.0.2.dev20240330.dist-info/METADATA +0 -99
  143. langfun-0.0.2.dev20240330.dist-info/RECORD +0 -102
  144. {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
  145. {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py CHANGED
@@ -24,6 +24,7 @@ import os
  import re
  import threading
  import time
+ import types
  from typing import Annotated, Any, Callable, Iterator, Literal, Optional, Sequence, Type, Union

  import langfun.core as lf
@@ -38,7 +39,8 @@ class Evaluable(lf.Component):

  EXPERIMENT_JSON = 'experiment.json'
  RESULT_JSON = 'result.json'
- FAILURES_JSON = 'failures.json'
+ OOP_FAILURES_JSON = 'oop_failures.json'
+ NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
  INDEX_HTML = 'index.html'
  SUMMARY_HTML = 'summary.html'

@@ -213,6 +215,7 @@ class Evaluable(lf.Component):
  summary: bool = True,
  pivot_field: str = 'lm',
  from_root: bool = True,
+ timeout: int | None = None,
  **kwargs,
  ) -> Union['Summary', pg.Dict]:
  """Run the evaluation, which fills and returns the result."""
@@ -240,7 +243,7 @@ class Evaluable(lf.Component):
  ):
  if show_progress:
  lf.concurrent.ProgressBar.update(
- progress_bar, postfix='LOADING SAVED RESULTS...', color='yellow'
+ progress_bar, status='LOADING SAVED RESULTS...', color='yellow'
  )
  if self.try_load_result():
  run_status = 'CACHED'
@@ -263,13 +266,14 @@ class Evaluable(lf.Component):
  verbose=verbose,
  progress_bar=progress_bar,
  label=label,
+ timeout=timeout,
  **kwargs,
  )

  if should_save:
  if show_progress:
  lf.concurrent.ProgressBar.update(
- progress_bar, postfix='SAVING RESULTS...', color='yellow'
+ progress_bar, status='SAVING RESULTS...', color='yellow'
  )

  # Save evaluation results.
@@ -282,7 +286,7 @@ class Evaluable(lf.Component):
  if show_progress:
  lf.concurrent.ProgressBar.update(
  progress_bar,
- postfix=self._completion_status(run_status),
+ status=self._completion_status(run_status),
  color='green',
  )
  else:
@@ -338,7 +342,7 @@ class Evaluable(lf.Component):
  f'[#{leaf.index} - {leaf.node.id}]',
  total=leaf.node.num_examples if leaf.enabled else 0,
  color='cyan' if leaf.enabled else 'yellow',
- postfix=None if leaf.enabled else 'SKIPPED.')
+ status=None if leaf.enabled else 'SKIPPED.')

  # Run leaf groups in parallel.
  try:
@@ -352,17 +356,17 @@ class Evaluable(lf.Component):
  # Save results for non-leaf nodes.
  lf.concurrent.ProgressBar.update(
  overview_bar,
- postfix='SAVING RESULTS...',
+ status='SAVING RESULTS...',
  color='yellow')

  for node in self.nonleaf_nodes:
- node._result = {c.id: c.result for c in node.children} # pylint: disable=protected-access
+ node._result = {c.id: c.result for c in node.leaf_nodes} # pylint: disable=protected-access
  if should_save:
  node.save(result=False, report=False)

  if should_save and summary:
  lf.concurrent.ProgressBar.update(
- overview_bar, postfix='FINALIZING SUMMARY...'
+ overview_bar, status='FINALIZING SUMMARY...'
  )

  summary.save(os.path.join(self.root_dir, Evaluable.SUMMARY_HTML))
@@ -376,7 +380,7 @@ class Evaluable(lf.Component):
  # Signal all task completed by making the bar green.
  lf.concurrent.ProgressBar.update(
  overview_bar,
- postfix='COMPLETED',
+ status='COMPLETED',
  color='green')

  finally:
@@ -396,6 +400,7 @@ class Evaluable(lf.Component):
  verbose: bool,
  progress_bar: int | None,
  label: str | None,
+ timeout: int | None = None,
  **kwargs,
  ) -> None:
  """Run the evaluate and fill `self.result`. Subclass to implement."""
@@ -526,27 +531,14 @@ class Evaluable(lf.Component):
  self._render_message(self.dryrun_output, s)

  def _render_message(self, message: lf.Message, s: io.StringIO) -> None:
- for m in message.trace():
- if 'lm-input' in m.tags:
- text_color = 'green'
- elif 'lm-response' in m.tags:
- text_color = 'blue'
- else:
- text_color = 'black'
-
- s.write(
- f'<div style="color: {text_color}; white-space: pre-wrap;'
- 'padding: 10px; border: 1px solid; margin-top: 10px">'
- )
- s.write(m.text)
- if m.result is not None:
- s.write(
- '<div style="color: magenta; white-space: pre-wrap;'
- 'padding: 10px; border: 1px solid; margin: 10px">'
+ s.write(
+ message.to_html_str(
+ extra_flags=dict(
+ include_message_metadata=False,
+ source_tag=['lm-input', 'lm-response'],
+ )
  )
- s.write(pg.format(m.result))
- s.write('</div>')
- s.write('</div>')
+ )

  @classmethod
  def from_dir(
@@ -586,7 +578,6 @@ class _LeafNode:
  @pg.use_init_args(['children'])
  class Suite(Evaluable):
  """Evaluation suite."""
-
  children: Annotated[list[Evaluable], 'Child evaluation sets or suites.']

  # Use empty ID as suite is just a container of child evaluations.
@@ -741,10 +732,12 @@ class Evaluation(Evaluable):

  # Constants.
  CACHE_JSON = 'cache.json'
- FAILURES_HTML = 'failures.html'
+ OOP_FAILURES_HTML = 'oop_failures.html'
+ NON_OOP_FAILURES_HTML = 'non_oop_failures.html'

  @functools.cached_property
  def hash(self) -> str:
+ """Returns the semantic-based hash of the evaluation."""
  if self.is_deterministic:
  identity = pg.format(self._identifiers(), compact=True)
  else:
@@ -793,6 +786,10 @@ class Evaluation(Evaluable):
  """Returns the complete rate."""
  return self.num_completed / self.num_examples

+ #
+ # Properties on failures.
+ #
+
  @property
  def failures(self) -> list[tuple[Any, Exception]]:
  """Returns the failed examples and their errors."""
@@ -803,6 +800,15 @@ class Evaluation(Evaluable):
  """Returns the number of failed examples."""
  return len(self.failures)

+ @functools.cached_property
+ def failure_breakdown(self) -> dict[str, int]:
+ """Returns the breakdown of failures."""
+ breakdown = collections.defaultdict(int)
+ for _, error in self.failures:
+ breakdown[_error_key(error)] += 1
+ sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
+ return pg.Dict({x[0]: x[1] for x in sorted_items})
+
  @property
  def failure_rate(self) -> float:
  """Returns the failure rate in range [0, 1]."""
@@ -810,17 +816,76 @@ class Evaluation(Evaluable):
  return 0.0
  return self.num_failures / self.num_completed

+ @functools.cached_property
+ def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
+ """Returns the OOP failures."""
+ return [item for item in self.failures
+ if isinstance(item[1], lf_structured.MappingError)]
+
+ @property
+ def num_oop_failures(self) -> int:
+ """Returns the number of OOP failures."""
+ return len(self.oop_failures)
+
+ @property
+ def oop_failure_rate(self) -> float:
+ """Returns the OOP failure rate in range [0, 1]."""
+ if self.num_completed == 0:
+ return 0.0
+ return self.num_oop_failures / self.num_completed
+
+ @functools.cached_property
+ def non_oop_failures(self) -> list[tuple[Any, Exception]]:
+ """Returns the OOP failures."""
+ return [item for item in self.failures
+ if not isinstance(item[1], lf_structured.MappingError)]
+
+ @property
+ def num_non_oop_failures(self) -> int:
+ """Returns the number of non-OOP failures."""
+ return len(self.non_oop_failures)
+
+ @property
+ def non_oop_failure_rate(self) -> float:
+ """Returns the non-OOP failure rate in range [0, 1]."""
+ if self.num_completed == 0:
+ return 0.0
+ return self.num_non_oop_failures / self.num_completed
+
+ #
+ # Properties on usage.
+ #
+
+ @property
+ def has_usage(self) -> bool:
+ """Returns True if token usage is enabled."""
+ return self._num_usages > 0
+
+ @property
+ def average_prompt_tokens(self) -> int:
+ """Returns the average prompt tokens."""
+ if not self.has_usage:
+ return 0
+ return self._total_prompt_tokens // self._num_usages
+
+ @property
+ def average_completion_tokens(self) -> int:
+ """Returns the average completion tokens."""
+ if not self.has_usage:
+ return 0
+ return self._total_completion_tokens // self._num_usages
+
+ @property
+ def average_total_tokens(self) -> int:
+ """Returns the average total tokens."""
+ return self.average_prompt_tokens + self.average_completion_tokens
+
  @functools.cached_property
  def schema(self) -> lf_structured.Schema | None:
  """Schema."""
  if self.schema_fn is None:
  return None

- kwargs = {}
- # Allow schema to be a function based on current evaluation.
- if 'evaluation' in self.schema_fn.__signature__.arg_names:
- kwargs['evaluation'] = self
-
  schema = self._call_schema_fn()
  fewshot_examples = None
  if isinstance(schema, tuple):
@@ -861,7 +926,11 @@ class Evaluation(Evaluable):
  'Encountered: {annotation!r}.'
  )
  self._maybe_adjust_schema_for_completion(annotation)
- return lf_structured.Schema.from_value(annotation)
+ schema = lf_structured.Schema.from_value(annotation)
+ # NOTE(daiyip): add references to the dependent classes of the returned type
+ # to prevent unused subclasses get garbage collected by Python.
+ setattr(schema, '__dependencies__', schema.class_dependencies())
+ return schema

  def _maybe_adjust_schema_for_completion(self, cls):
  if (self.completion_prompt_field is None
@@ -870,7 +939,7 @@ class Evaluation(Evaluable):

  fields = list(cls.__schema__.values())
  fields.insert(0, (self.completion_prompt_field, pg.typing.Str()))
- pg.symbolic.update_schema(cls, fields, extend=False)
+ cls.update_schema(fields, extend=False)

  def _maybe_adjust_examples_for_completion(
  self,
@@ -938,12 +1007,25 @@ class Evaluation(Evaluable):
  self._failures = []
  self._num_completed = 0

+ self._total_prompt_tokens = 0
+ self._total_completion_tokens = 0
+ self._num_usages = 0
+ self.__dict__.pop('oop_failures', None)
+ self.__dict__.pop('non_oop_failures', None)
+
  @property
- def failures_link(self) -> str | None:
- """Returns the link to the failures page."""
+ def oop_failures_link(self) -> str | None:
+ """Returns the link to the OOP failures page."""
  if self.dir is None:
  return None
- return self.link(os.path.join(self.dir, Evaluation.FAILURES_HTML))
+ return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))
+
+ @property
+ def non_oop_failures_link(self) -> str | None:
+ """Returns the link to then non-OOP failures page."""
+ if self.dir is None:
+ return None
+ return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))

  def _dryrun(
  self,
@@ -953,11 +1035,11 @@ class Evaluation(Evaluable):
  verbose: bool,
  **kwargs,
  ) -> None:
- # Set the example for dryrun.
- example = example or self.examples[0]
-
  # We make a copy to avoid pollute the state of current object.
- copy = self.clone()
+ copy: Evaluation = self.clone()
+
+ # Set the example for dryrun.
+ example = example or copy.examples[0]
  copy.__dict__['examples'] = [example]

  # We set the symbolic parent of the cloned to access contextual information
@@ -972,24 +1054,37 @@ class Evaluation(Evaluable):
  color='green',
  )

- with lf.use_settings(debug=debug):
- output_message = copy.process(example, **(self.additional_args or {}))
- if self.schema is None:
- output = output_message.text
- else:
- output = output_message.result
+ error, output_message = None, None

- if verbose:
+ try:
+ with lf.use_settings(debug=debug):
+ output_message = copy.process(example, **(self.additional_args or {}))
+ self.process_output(example, output_message)
+
+ if self.schema is None:
+ output = output_message.text
+ else:
+ output = output_message.result
+
+ if verbose:
+ lf.console.write('')
+ lf.console.write(
+ str(output),
+ title='OUTPUT',
+ color='blue',
+ )
+ except lf_structured.MappingError as e:
  lf.console.write('')
  lf.console.write(
- str(output),
- title='OUTPUT',
- color='blue',
+ str(e),
+ title='ERROR',
+ color='red',
  )
+ error = e
+
+ copy.audit(1, example, output_message, error, dryrun=True)
+ result = copy.finalize()

- # Audit the result.
- copy.audit(example, output, output_message)
- result = copy.summarize()
  if verbose:
  lf.console.write('')
  lf.console.write(
@@ -1009,9 +1104,13 @@ class Evaluation(Evaluable):
  verbose: bool,
  progress_bar: int | None,
  label: str | None,
+ timeout: int | None = None,
  **kwargs,
  ) -> None:
  # Setup examples.
+ # Reset examples so it could be read from the input functor.
+ self.__dict__.pop('examples', None)
+
  if end is None:
  end = len(self.examples)
  examples = self.examples[start:end]
@@ -1020,34 +1119,39 @@ class Evaluation(Evaluable):
  with lf.use_settings(debug=debug, cache=self.cache):
  self._reset()

- def _process(example: Any):
+ def _process(idx_and_example: Any):
  # NOTE(daiyip): set the `input` symbol of the globals to None, so LLM
  # generated code with calls to `input` will raise an error, thus not
  # blocking the evaluation.
+ _, example = idx_and_example
  with lf_coding.context(input=None):
- return self.process(example, **(self.additional_args or {}))
+ output_message = self.process(example, **(self.additional_args or {}))
+ self.process_output(example, output_message)
+ return output_message

  try:
- for example, message, error in lf.concurrent_map(
+ for (idx, example), message, error in lf.concurrent_map(
  _process,
- examples,
+ enumerate(examples),
  max_workers=self.max_workers,
  show_progress=progress_bar or False,
  status_fn=self._status,
+ timeout=timeout,
  ):
  if error is not None:
- self._failures.append((example, str(error)))
- else:
- output = message.text if self.schema is None else message.result
- self.audit(example, output, message)
- self._num_completed += 1
+ message = (
+ error.lm_response
+ if isinstance(error, lf_structured.MappingError)
+ else None
+ )
+ self.audit(idx + 1, example, message, error)
  finally:
  # Save cache upon completion or interruption.
  if self.dir and self.cache:
  self.cache.save()

  # Summarize result.
- self._result = self.summarize()
+ self._result = self.finalize()
  if verbose:
  lf.console.write(
  str(self.result),
@@ -1061,7 +1165,7 @@ class Evaluation(Evaluable):

  def process(self, example: Any, **kwargs) -> lf.Message:
  """Process an example and returns its output."""
- prompt = self.prompt.render(example=example).text
+ prompt = lf.Template.from_value(self.prompt, example=example)
  if self.method == 'call':
  return lf_structured.call(
  prompt,
@@ -1089,7 +1193,9 @@ class Evaluation(Evaluable):
  else:
  assert self.method == 'complete', self.method
  assert isinstance(self.schema.spec, pg.typing.Object), self.schema
- input_value = self.schema.spec.cls.partial(prompt)
+ # TODO(daiyip): Currently multi-modal inputs within the prompt for
+ # completion is not supported.
+ input_value = self.schema.spec.cls.partial(prompt.render().text)
  return lf_structured.complete(
  input_value,
  lm=self.lm,
@@ -1100,16 +1206,48 @@ class Evaluation(Evaluable):
  **kwargs,
  )

+ def process_output(self, example: Any, output: lf.Message) -> None:
+ """Process the output for an example.
+
+ Subclasses can override this method to generate and attach additional
+ metadata for debugging purpose. For example, draw bounding boxes on the
+ input image based on LLM predicted boxes and attach to output_message's
+ metadata.
+
+ Example:
+
+ class BoundingBoxEval(lf.eval.Matching):
+ ...
+ def process_output(example, output):
+ output.metadata.image_with_bbox = draw_bboxes(
+ example.image, output.result)
+
+ Args:
+ example: User input.
+ output: LLM's output message. Users could attach additional
+ information to the message, which will be shown in debugging
+ """
+ del example, output
+
  def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
+ status = {'Model': self.lm.model_id}
+ status.update(self._eval_status(progress))
+
+ if progress.last_error is not None:
+ status['LastError'] = progress.last_error_str()
+ if progress.timeit_summary:
+ status['TimeIt'] = progress.timeit_summary_str()
+ return status
+
+ def _eval_status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
  return {
- 'Model': self.lm.model_id,
- 'Succeeded': f'%.{self.report_precision}f%% (%d/%d)' % (
- progress.success_rate * 100,
+ 'Succeeded': '%s (%d/%d)' % (
+ self._format_rate(progress.success_rate),
  progress.succeeded,
  progress.completed,
  ),
- 'Failed': f'%.{self.report_precision}f%% (%d/%d)' % (
- progress.failure_rate * 100,
+ 'Failed': '%s (%d/%d)' % (
+ self._format_rate(progress.failure_rate),
  progress.failed,
  progress.completed,
  ),
@@ -1119,22 +1257,21 @@ class Evaluation(Evaluable):
  assert self.result is not None
  m = self.result.metrics
  return (
- f'COMPLETED(%s): Successes=%.{self.report_precision}f%% (%d/%d)'
- f' Failures=%.{self.report_precision}f%% (%d/%d)'
+ 'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
  % (
  run_status,
- (1 - m.failure_rate) * 100,
+ self._format_rate(1 - m.failure_rate),
  m.total - m.failures,
  m.total,
- m.failure_rate * 100,
+ self._format_rate(m.failure_rate),
  m.failures,
  m.total,
  )
  )

- def summarize(self) -> pg.Dict:
- """Summarizes the evaluation result."""
- if self.cache:
+ def finalize(self) -> pg.Dict:
+ """Finalizes the evaluation result."""
+ if self.cache is not None:
  cache_stats = dict(
  use_cache=True,
  num_queries=self.cache.stats.num_queries,
@@ -1143,12 +1280,25 @@ class Evaluation(Evaluable):
  )
  else:
  cache_stats = dict(use_cache=False)
+
+ if self.has_usage:
+ usage = pg.Dict(
+ total_prompt_tokens=self._total_prompt_tokens,
+ total_completion_tokens=self._total_completion_tokens,
+ num_usages=self._num_usages,
+ average_prompt_tokens=self.average_prompt_tokens,
+ average_completion_tokens=self.average_completion_tokens,
+ average_total_tokens=self.average_total_tokens,
+ )
+ else:
+ usage = None
+
  result = pg.Dict(
  experiment_setup=pg.Dict(
  id=self.id,
  dir=self.dir,
  model=self.lm.model_id,
- prompt_template=lf.text_formatting.decolored(str(self.prompt)),
+ prompt_template=pg.decolor(str(self.prompt)),
  method=self.method,
  schema_fn=str(self.schema_fn),
  ),
@@ -1157,56 +1307,183 @@ class Evaluation(Evaluable):
  total=self.num_completed,
  failures=self.num_failures,
  failure_rate=self.failure_rate,
+ oop_failures=self.num_oop_failures,
+ oop_failure_rate=self.oop_failure_rate,
+ non_oop_failures=self.num_non_oop_failures,
+ non_oop_failure_rate=self.non_oop_failure_rate,
+ failure_breakdown=self.failure_breakdown,
  ),
+ usage=usage,
  )
  return result

- def summarize_html(self) -> str:
+ def summary_card(self) -> str:
+ """Returns summary card in HTML."""
  s = io.StringIO()
  definition = _html_repr(self, compact=False, escape=True)
  s.write('<div><table><tr><td>')
+ self._render_link(
+ s,
+ definition,
+ self.hash,
+ '',
+ lambda: self.link(self.dir),
+ )
  if self.result is None:
  s.write(
- f'<a target="_blank" title="{definition}" '
- f'href="{self.link(self.dir)}">{self.hash}</a>'
  '</td></tr><tr><td>'
  '<span style="color: gray">(IN-PROGRESS...)</span>'
  )
  else:
- s.write(
- f'<a target="_blank" title="{definition}" '
- f'href="{self.index_link}">{self.hash}</a>'
- '</td></tr><tr><td>'
- )
- self._render_metric(s)
+ if self.dir:
+ s.write(f' &nbsp;[<a href="{self.link(self.dir)}">dir</a>]')
+ s.write('</td></tr><tr><td>')
+ self._render_summary_metrics(s)
+
+ # Summarize average usage.
+ if self.result.usage:
+ self._render_summary_usage(s)
+
  s.write('</td></tr></table></div>')
  return s.getvalue()

- def _render_metric(self, s: io.StringIO) -> None:
+ def _render_summary_usage(self, s: io.StringIO) -> None:
+ """Renders usage in HTML."""
+ usage = self.result.usage
+ total = usage.total_prompt_tokens + usage.total_completion_tokens
+ s.write(
+ '&nbsp;<a title="'
+ f'# of usages: {usage.num_usages}&#013;'
+ f'total prompt: {usage.total_prompt_tokens}&#013;'
+ f'total response: {usage.total_completion_tokens}&#013;'
+ f'avg prompt: {usage.average_prompt_tokens}&#013;'
+ f'avg response: {usage.average_completion_tokens}'
+ f'" style="color:gray">({total} tokens)</a>'
+ )
+
+ def _render_link(self,
+ s: io.StringIO,
+ title: str,
+ text: str,
+ style: str,
+ url_fn: Callable[[], str]) -> None:
+ """Renders a link in HTML."""
+ s.write(
+ f'<a target="_blank" title="{title}" style="{style}"'
+ )
+ if self.dir:
+ s.write(f' href="{url_fn()}"')
+ s.write(f'>{text}</a>')
+
+ def _render_summary_metrics(self, s: io.StringIO) -> None:
  """Renders metrics in HTML."""
  assert self.result is not None
  m = self.result.metrics
- s.write(
- '<a title="Failures (%d/%d)" href="%s" style="color:red">%s</a>'
- % (
- m.failures,
- m.total,
- self.failures_link,
- f'%.{self.report_precision}f%% ' % (m.failure_rate * 100),
- )
+
+ # OOP failures.
+ oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
+ if m.oop_failures:
+ oop_failure_title += '&#013;'
+ for name, count in m.failure_breakdown.items():
+ if name.startswith('MappingError'):
+ oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+ name.removeprefix('MappingError.'),
+ self._format_rate(count / m.total),
+ count,
+ m.total,
+ )
+
+ extra_style = ''
+ if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
+ extra_style = ';font-weight:bold'
+ self._render_link(
+ s,
+ oop_failure_title,
+ self._format_rate(m.oop_failure_rate),
+ f'color:magenta{extra_style}',
+ lambda: self.oop_failures_link,
+ )
+ s.write(' | ')
+
+ # Non-OOP failures.
+ non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
+ if m.non_oop_failures:
+ non_oop_failure_title += '&#013;'
+ for name, count in m.failure_breakdown.items():
+ if not name.startswith('MappingError'):
+ non_oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+ name,
+ self._format_rate(count / m.total),
+ count,
+ m.total,
+ )
+
+ extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
+ self._render_link(
+ s,
+ non_oop_failure_title,
+ self._format_rate(m.non_oop_failure_rate),
+ f'color:red{extra_style}',
+ lambda: self.non_oop_failures_link,
  )

- def audit(self, example: Any, output: Any, message: lf.Message) -> None:
+ def _format_rate(self, rate: float) -> str:
+ """Formats a rate."""
+ return f'%.{self.report_precision}f%% ' % (rate * 100)
+
+ def audit(
+ self,
+ example_idx: int,
+ example: Any,
+ message: lf.Message | None,
+ error: Exception | None = None,
+ dryrun: bool = False,
+ ) -> None:
  """Audits the example against the output. Subclasses should override.

  Args:
+ example_idx: 1-based index of the example in its dataset.
  example: The input object.
- output: The output from LM. For `lf.call`, if `schema_fn` is not provided,
- it will be the raw LM response string. Otherwise it will be the
- structured output from the LM.
  message: The entire message returned by the LM, which could be used to
- trace the LM input, response and parsed structure.
+ trace the LM input, response and parsed structure. If error is raised
+ before LLM could return a response, None will be its value.
+ error: The exception during processing the example.
+ dryrun: Whether or not audition takes place during dryrun.
  """
+ if error is not None:
+ self._failures.append((example, error))
+
+ # Invalid cache of num_oop_failures.
+ self.__dict__.pop('oop_failures', None)
+ self.__dict__.pop('non_oop_failures', None)
+ self.__dict__.pop('failure_breakdown', None)
+
+ if isinstance(error, lf_structured.MappingError):
+ message = error.lm_response
+ else:
+ assert message is not None
+ output = message.text if self.schema is None else message.result
+ self.audit_processed(example_idx, example, output, message, dryrun=dryrun)
+
+ # Audit usage.
+ if message is not None:
+ self.audit_usage(message, dryrun=dryrun)
+ self._num_completed += 1
+
+ def audit_usage(self, message: lf.Message, dryrun: bool = False) -> None:
+ del dryrun
+ for m in message.trace():
+ usage = m.metadata.get('usage', None)
+ if usage:
+ self._total_prompt_tokens += usage.prompt_tokens
+ self._total_completion_tokens += usage.completion_tokens
+ self._num_usages += 1
+
+ def audit_processed(
+ self, example_idx: int, example: Any, output: Any, message: lf.Message,
+ dryrun: bool = False
+ ) -> None:
+ """Audits a successfully processed example. Subclass should override."""

  def save(
  self, definition: bool = True, result: bool = True, report: bool = True
@@ -1229,16 +1506,26 @@ class Evaluation(Evaluable):
  # Save failures.
  pg.save(
  [
- pg.Dict(
- input=input, error=lf.text_formatting.decolored(str(error))
- )
- for input, error in self.failures
+ pg.Dict(input=input, error=_format_error(error))
+ for input, error in self.oop_failures
  ],
- os.path.join(self.dir, Evaluation.FAILURES_JSON),
+ os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
  )
  pg.save(
- self._html([self._render_result, self._render_failures]),
- os.path.join(self.dir, Evaluation.FAILURES_HTML),
+ self._html([self._render_result, self._render_oop_failures]),
+ os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
+ file_format='txt',
+ )
+ pg.save(
+ [
+ pg.Dict(input=input, error=_format_error(error))
+ for input, error in self.non_oop_failures
+ ],
+ os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
+ )
+ pg.save(
+ self._html([self._render_result, self._render_non_oop_failures]),
+ os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
  file_format='txt',
  )

@@ -1250,8 +1537,11 @@ class Evaluation(Evaluable):
  '<td>Prompt</td>'
  '<td>Schema</td>'
  '<td>Additional Args</td>'
- '<td>Failures</td>'
  )
+ if self.result.usage:
+ s.write('<td>Usage</td>')
+ s.write('<td>OOP Failures</td>')
+ s.write('<td>Non-OOP Failures</td>')

  def _render_result_row(self, s: io.StringIO) -> None:
  s.write(
@@ -1276,13 +1566,32 @@ class Evaluation(Evaluable):
  '<td style="color:purple" '
  f'{_html_repr(self.additional_args, compact=False)}</td>'
  )
- # Failures.
+ # Usage.
+ if self.result.usage:
+ s.write('<td>')
+ self._render_summary_usage(s)
+ s.write('</td>')
+
+ # OOP failures.
+ s.write(
+ '<td><span style="color:magenta">%s</span>%s</td>'
+ % (
+ self._format_rate(self.oop_failure_rate),
+ '<a href="%s">(%d/%d)</a>'
+ % (self.oop_failures_link,
+ self.num_oop_failures,
+ self.num_completed),
+ )
+ )
+ # Non-OOP failures.
  s.write(
- '<td><span style="color:orange">%s</span>%s</td>'
+ '<td><span style="color:red">%s</span>%s</td>'
  % (
- f'%.{self.report_precision}f%%' % (self.failure_rate * 100),
+ self._format_rate(self.non_oop_failure_rate),
  '<a href="%s">(%d/%d)</a>'
- % (self.failures_link, self.num_failures, self.num_completed),
+ % (self.non_oop_failures_link,
+ self.num_non_oop_failures,
+ self.num_completed),
  )
  )

@@ -1296,31 +1605,99 @@ class Evaluation(Evaluable):
  else:
  return 'cyan'

- def _render_failures(self, s: io.StringIO) -> None:
+ def _render_oop_failures(self, s: io.StringIO) -> None:
+ self._render_failures(s, '^MappingError.*', error_color='magenta')
+
+ def _render_non_oop_failures(self, s: io.StringIO) -> None:
+ self._render_failures(s, '^(?!MappingError).*', error_color='red')
+
+ def _render_failures(
+ self, s: io.StringIO, error_regex: str, error_color: str) -> None:
  """Formats the failed cases into html."""
+ # Failure summary.
  s.write(
- '<h2> Failed Cases </h2>'
+ '<h2> Error Summary </h2>'
  '<div style="white-space:pre">\n'
  '<table style="border:1px solid">'
- '<tr class="header"><td>No.</td><td>Input</td><td>Error</td></tr>'
+ '<tr class="header"><td>Error type</td><td>Stats</td></tr>'
  )
+ error_regex = re.compile(error_regex)
+ if self.result.metrics.failure_breakdown:
+ for name, count in self.result.metrics.failure_breakdown.items():
+ if not error_regex.match(name):
+ continue
+
+ link = f'<a href="#{name}">{name}</a>'
+ error_rate = self._format_rate(count / self.result.metrics.total)
+ stats = (f'<span style="color:{error_color}">{error_rate} '
+ f'({count}/{self.result.metrics.total})</span>')
+ s.write(f'<tr><td>{link}</td><td>{stats})</td></tr>')
+ s.write(
+ '</table></div>'
+ '<h2> Failed Cases </h2>'
+ '<div style="white-space:pre">'
+ )
+ # Failure details by error type.
+ failures_by_error = collections.defaultdict(list)
+ for example, error in self.failures:
+ error_name = _error_key(error)
+ if error_regex.match(error_name):
+ failures_by_error[error_name].append((example, error))
+
+ for error_key, failures in failures_by_error.items():
+ s.write(
+ f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
+ f'(count={len(failures)})</h3>'
+ '<table style="border:1px solid">'
+ '<tr class="header"><td>No.</td><td>Input</td>'
+ '<td>LM invocation</td><td>Error</td></tr>'
+ )
+ for i, (example, error) in enumerate(failures):
+ lm_response = None
+ if isinstance(error, lf.structured.MappingError):
+ lm_response = error.lm_response
+ error = error.cause
+
+ bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
+ s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
+ s.write('<td style="color:green;white-space:pre-wrap">')
+ s.write(pg.format(example, verbose=False))
+ s.write('</td><td>')
+ if lm_response is not None:
+ self._render_message(lm_response, s)
+ s.write(f'</td><td style="color:{error_color};white-space:pre">')
+ s.write(_format_error(error))
+ s.write('</td></tr>')
+ s.write('</table>')
+ s.write('</div>')

- for i, (example, error) in enumerate(self.failures):
- bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
- s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
- input_str = pg.format(example, verbose=False)
- s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
- error_str = lf.text_formatting.decolored(str(error))
- s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
- s.write('</tr>')
- s.write('</table></div>')
+ @classmethod
+ def visualize(cls, evaluations: list['Evaluation']) -> str | None:
+ """Visualize the a list of evaluations of this task in HTML."""
+ del evaluations
+ return None


  @pg.functor()
- def inputs_from(path: str | list[str]) -> list[Any]:
+ def inputs_from(path: str | list[str], **kwargs) -> list[Any]:
  """A functor that returns a list of user-defined objects as eval inputs."""
  if isinstance(path, str):
- return pg.load(path)
+ if path.endswith('.json'):
+ return pg.load(path)
+ elif path.endswith('.jsonl'):
+ return list(iter(pg.open_jsonl(path)))
+ elif path.endswith('.csv'):
+ import pandas as pd # pylint: disable=g-import-not-at-top
+ dataset_df = pd.read_csv(path, **kwargs)
+ dataset = []
+ for i in range(dataset_df.shape[0]):
+ row = {}
+ for col in dataset_df.columns:
+ row[col] = dataset_df.iloc[i][col]
+ dataset.append(row)
+ return dataset
+ else:
+ raise ValueError(f'Unsupported file format: {path}')
  examples = []
  for p in path:
  examples.extend(pg.load(p))
@@ -1374,8 +1751,8 @@ class Summary(pg.Object):
  Type[lf.LanguageModel],
  tuple[lf.LanguageModel | Type[lf.LanguageModel], ...],
  ] = lf.LanguageModel,
- method: Union[str, tuple[str], None] = None,
- schema_fn: Union[pg.Functor, tuple[pg.Functor], None] = None,
+ method: Union[str, tuple[str, ...], None] = None,
+ schema_fn: Union[pg.Functor, tuple[pg.Functor, ...], None] = None,
  completed: bool | None = None,
  pivot_field: str | None = None,
  ) -> 'Summary':
@@ -1466,7 +1843,7 @@ class Summary(pg.Object):
  if e is None:
  s.write('<span style="color: gray">N/A<span>')
  else:
- s.write(e.summarize_html())
+ s.write(e.summary_card())
  s.write('</td>')
  s.write('</tr>')
  s.write('</table>')
@@ -1541,13 +1918,22 @@ class Summary(pg.Object):
  s.write('<html><body>')
  for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
  table_id = task.__name__.lower()
+ evaluations = self.select(task=task).evaluations
+ table = Summary.Table.from_evaluations(evaluations, pivot_field)
  s.write('<div>')
- s.write(f'<a id="{table_id}"')
- s.write(f'<h2><a href="#{table_id}">{task.__name__}</a></h2>')
- s.write('</a>')
- table = Summary.Table.from_evaluations(
- self.select(task=task).evaluations, pivot_field
+ s.write(
+ f'<a id="{table_id}" href="#{table_id}">'
+ f'<h2>{task.__name__}</h2></a>'
  )
+
+ # Allow users to plugin visualization code (e.g. matplot) in the summary
+ # page.
+ visual_part = task.visualize(evaluations)
+ if visual_part:
+ s.write(visual_part)
+
+ s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
+ s.write('<hr/>')
  s.write(table.html())
  s.write('</div>')
  s.write('</body></html>')
@@ -1556,8 +1942,36 @@ class Summary(pg.Object):
  def _repr_html_(self) -> str:
  return self.html()

+ def json(
+ self,
+ ) -> dict[
+ str, # Task name
+ list[pg.Dict], # List of pg.Dict with `experiment` and `metrics`.
+ ]:
+ """Returns the JSON representation of the summary."""
+ task_results = {}
+ for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
+ results = []
+ for entry in self.select(task=task).evaluations:
+ results.append(
+ pg.Dict(
+ id=entry.id,
+ experiment=entry,
+ dir=entry.dir,
+ metrics=entry.result.metrics if entry.result else None,
+ usage=entry.result.usage if entry.result else None,
+ )
+ )
+ task_results[task.__name__] = results
+ return task_results
+
  def save(self, file: str, pivot_field: str | None = None) -> None:
  pg.save(self.html(pivot_field), file, file_format='txt')
+ if file.endswith('.html'):
+ json_file = file.replace('.html', '.json')
+ else:
+ json_file = os.path.join(file, '.json')
+ pg.save(self.json(), json_file)

  @classmethod
  def from_dirs(
@@ -1694,6 +2108,20 @@ class Summary(pg.Object):
  return result.join()


+ def _format_error(error: Exception):
+ """Formats an error into a string."""
+ return (f'({error.__class__.__name__}) ' + pg.decolor(str(error)))
+
+
+ def _error_key(error: Exception) -> str:
+ """Returns the key for an error."""
+ error_names = []
+ while error is not None:
+ error_names.append(error.__class__.__name__)
+ error = getattr(error, 'cause', None)
+ return '.'.join(error_names)
+
+
  def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
  """Formats prompt in HTML."""
  if type(value) is lf.Template: # pylint: disable=unidiomatic-typecheck
@@ -1768,3 +2196,202 @@ def monitor_async(
  scan_interval=scan_interval,
  refresh_when_stop=refresh_when_stop,
  )
+
+
+ #
+ # Named evaluations and experiments support.
+ #
+
+
+ class _NamedEvaluationRegistry:
+ """Named evaluation registry."""
+
+ def __init__(self):
+ self._registry = {}
+
+ def names(self) -> list[str]:
+ """Returns all registered names."""
+ return sorted(self._registry.keys())
+
+ def get(self, name: str) -> list[Type[Evaluable]]:
+ """Gets an evaluation by name."""
+ matches = []
+ if name in self._registry:
+ matches.append(self._registry[name])
+ else:
+ regex = re.compile(name)
+ for key, cls in self._registry.items():
+ if regex.match(key):
+ matches.append(cls)
+ return matches
+
+ def register(
+ self,
+ name: str,
+ experiment_cls: Type[Evaluable],
+ ):
+ """Register an experiment class."""
+ self._registry[name] = experiment_cls
+
+
+ _eval_registry = _NamedEvaluationRegistry()
+
+
+ def registered_names() -> list[str]:
+ """Returns all registered names."""
+ return _eval_registry.names()
+
+
+ def get_evaluations(evaluation: str | Evaluable) -> list[Evaluable]:
+ """Gets an evaluation experiment by name."""
+ if isinstance(evaluation, str):
+ return [e() for e in _eval_registry.get(evaluation)]
+ return [evaluation]
+
+
+ def register(name: str):
+ """Decorator to create a named evaluation class."""
+
+ def _register(func_or_cls: Type[Evaluation] | types.FunctionType):
+ if inspect.isfunction(func_or_cls):
+ e = func_or_cls()
+ if not isinstance(e, Evaluable):
+ raise TypeError(
+ f'The return value of `{func_or_cls}` should be an instance of '
+ '`lf.eval.Evaluable` subclass.'
+ )
+
+ class GeneratedSuite(Suite):
+ # NOTE(daiyip): Delay serialization key registration for generated
+ # class.
+ auto_register = False
+ children = e.children if isinstance(e, Suite) else [e]
+
+ cls = GeneratedSuite
+ cls.__name__ = func_or_cls.__name__
+ cls.__doc__ = func_or_cls.__doc__
+ cls.__qualname__ = func_or_cls.__qualname__
+ cls.__module__ = getattr(func_or_cls, '__module__', 'wrapper')
+ cls.register_for_deserialization(cls.__type_name__)
+
+ elif issubclass(func_or_cls, Evaluable):
+ cls = func_or_cls
+ else:
+ raise ValueError(f'Unsupported type: {type(func_or_cls)}')
+
+ _eval_registry.register(name, cls)
+ return cls
+
+ return _register
+
+
+ def get(
+ root_dir: str,
+ evaluations: list[str | Evaluable],
+ filter: Union[ # pylint: disable=redefined-builtin
+ str, # Regex to filter evaluation based on ID.
+ Callable[[Evaluable], bool], # Custom filter function.
+ None # No filtering (Default).
+ ] = None, # pylint: disable=bad-whitespace
+ patches: list[Union[
+ str, # String-based PyGlove patcher.
+ pg.patching.Patcher, # PyGlove patcher object.
+ Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+ ]] | None = None, # pylint: disable=bad-whitespace
+ ) -> Suite:
+ """Gets a suite from a list of patched evaluations.
+
+ Args:
+ root_dir: The root directory of the experiment.
+ evaluations: A list of evaluations to be included in the suite.
+ filter: A regular expression (str) for selecting sub-experiments of matched
+ IDs, or a filter function to filter the evaluations.
+ patches: A list of patches to be applied to the suite. Each element can be
+ a string (for string-based patcher), a `pg.patching.Patcher` object, or
+ a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+ details.
+
+ Returns:
+ A suite of selected `lf.eval.Evaluation` objects.
+ """
+ matches = []
+ for e in evaluations:
+ matches.extend(get_evaluations(e))
+
+ if not matches:
+ raise ValueError('No evaluations found.')
+
+ suite = Suite(matches, root_dir=root_dir)
+ if patches:
+ suite = pg.patch(suite, patches)
+
+ if isinstance(filter, str):
+ regex = re.compile(filter)
+ filter = lambda x: bool(regex.match(x.id))
+
+ if filter:
+ suite = Suite(
+ [leaf for leaf in suite.leaf_nodes if filter(leaf)], root_dir=root_dir)
+ return suite
+
+
+ def run(
+ root_dir: str,
+ evaluations: list[str | Evaluable],
+ filter: Union[ # pylint: disable=redefined-builtin
+ str, # Regex to filter evaluation based on ID.
+ Callable[[Evaluable], bool], # Custom filter function.
+ None # No filtering (Default).
+ ] = None, # pylint: disable=bad-whitespace
+ patches: list[Union[
+ str, # String-based PyGlove patcher.
+ pg.patching.Patcher, # PyGlove patcher object.
+ Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+ ]] | None = None, # pylint: disable=bad-whitespace
+ mode: Literal['run', 'rerun', 'dryrun', 'noop'] = 'run',
+ debug: bool = False,
+ print_definition: bool = False,
+ **kwargs,
+ ) -> Suite:
+ """Run selected evaluations with patching.
+
+ Args:
+ root_dir: The root directory of the experiment.
+ evaluations: A list of evaluations to be included in the suite.
+ filter: A regular expression (str) for selecting sub-experiments of matched
+ IDs, or a filter function to filter the evaluations.
+ patches: A list of patches to be applied to the suite. Each element can be
+ a string (for string-based patcher), a `pg.patching.Patcher` object, or
+ a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+ details.
+ mode: The mode to run the suite. "run" to run the suite, with reusing
+ existing results if available; "rerun" to rerun all evaluations even if
+ there are existing results; "dryrun" to dryrun the suite; and "noop"
+ to do nothing.
+ debug: Whether to run in debug mode.
+ print_definition: Whether to print the experiment definition.
+ **kwargs: Additional arguments to be passed to dryrun/run the suite.
+
+ Returns:
+ A suite of selected `lf.eval.Evaluation` objects.
+ """
+ suite = get(root_dir, evaluations, patches=patches, filter=filter)
+ if print_definition:
+ lf.console.write(
+ pg.format(
+ suite,
+ compact=False,
+ verbose=False,
+ hide_default_values=True,
+ python_format=True,
+ ),
+ title='[EXPERIMENT DEFINITION]',
+ color='blue',
+ )
+
+ if mode == 'run':
+ rerun = mode == 'rerun'
+ suite.run(debug=debug, rerun=rerun, **kwargs)
+ elif mode == 'dryrun':
+ suite.dryrun(debug=debug, **kwargs)
+ return suite
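
The tail of this diff introduces a named-evaluation registry (`register`, `registered_names`, `get_evaluations`, `get`, `run`). For orientation, the sketch below shows how that API might be used. It is not part of the diff: it assumes these symbols are re-exported as `lf.eval.register` / `lf.eval.run` / `lf.eval.inputs_from` in the new version, and `MyEval`, its `inputs` field, and the data file name are hypothetical.

import langfun as lf

@lf.eval.register('my_task/basic')
def my_task():
  # Per the diff, the decorated function must return an `lf.eval.Evaluable`;
  # it is wrapped into a generated Suite and stored under the given name.
  return MyEval(  # hypothetical Evaluation subclass defined by the user
      inputs=lf.eval.inputs_from('examples.jsonl'),  # .json/.jsonl/.csv now accepted
      lm=lf.llms.Gpt4(),
  )

# Run every registered evaluation whose name matches the regex. Per the new
# `run()`, 'run' reuses existing results, 'rerun' recomputes them, and
# 'dryrun' processes a single example for inspection.
lf.eval.run(
    root_dir='/tmp/my_task_eval',
    evaluations=['my_task/.*'],
    mode='run',
)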