langfun 0.0.2.dev20240330__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +22 -2
- langfun/core/__init__.py +17 -5
- langfun/core/agentic/__init__.py +30 -0
- langfun/core/agentic/action.py +854 -0
- langfun/core/agentic/action_eval.py +150 -0
- langfun/core/agentic/action_eval_test.py +109 -0
- langfun/core/agentic/action_test.py +136 -0
- langfun/core/coding/python/__init__.py +5 -11
- langfun/core/coding/python/correction.py +37 -28
- langfun/core/coding/python/correction_test.py +29 -3
- langfun/core/coding/python/execution.py +40 -216
- langfun/core/coding/python/execution_test.py +29 -89
- langfun/core/coding/python/generation.py +21 -11
- langfun/core/coding/python/generation_test.py +2 -2
- langfun/core/coding/python/parsing.py +108 -193
- langfun/core/coding/python/parsing_test.py +2 -105
- langfun/core/component.py +69 -2
- langfun/core/component_test.py +54 -0
- langfun/core/concurrent.py +414 -117
- langfun/core/concurrent_test.py +111 -24
- langfun/core/console.py +18 -5
- langfun/core/console_test.py +17 -0
- langfun/core/eval/__init__.py +17 -0
- langfun/core/eval/base.py +767 -140
- langfun/core/eval/base_test.py +238 -53
- langfun/core/eval/matching.py +80 -76
- langfun/core/eval/matching_test.py +19 -9
- langfun/core/eval/patching.py +130 -0
- langfun/core/eval/patching_test.py +170 -0
- langfun/core/eval/scoring.py +37 -28
- langfun/core/eval/scoring_test.py +21 -3
- langfun/core/eval/v2/__init__.py +42 -0
- langfun/core/eval/v2/checkpointing.py +380 -0
- langfun/core/eval/v2/checkpointing_test.py +228 -0
- langfun/core/eval/v2/eval_test_helper.py +136 -0
- langfun/core/eval/v2/evaluation.py +725 -0
- langfun/core/eval/v2/evaluation_test.py +180 -0
- langfun/core/eval/v2/example.py +305 -0
- langfun/core/eval/v2/example_test.py +128 -0
- langfun/core/eval/v2/experiment.py +1048 -0
- langfun/core/eval/v2/experiment_test.py +433 -0
- langfun/core/eval/v2/metric_values.py +156 -0
- langfun/core/eval/v2/metric_values_test.py +80 -0
- langfun/core/eval/v2/metrics.py +357 -0
- langfun/core/eval/v2/metrics_test.py +203 -0
- langfun/core/eval/v2/progress.py +348 -0
- langfun/core/eval/v2/progress_test.py +82 -0
- langfun/core/eval/v2/progress_tracking.py +210 -0
- langfun/core/eval/v2/progress_tracking_test.py +66 -0
- langfun/core/eval/v2/reporting.py +270 -0
- langfun/core/eval/v2/reporting_test.py +158 -0
- langfun/core/eval/v2/runners.py +488 -0
- langfun/core/eval/v2/runners_test.py +334 -0
- langfun/core/langfunc.py +3 -21
- langfun/core/langfunc_test.py +26 -8
- langfun/core/language_model.py +686 -48
- langfun/core/language_model_test.py +681 -44
- langfun/core/llms/__init__.py +100 -12
- langfun/core/llms/anthropic.py +488 -0
- langfun/core/llms/anthropic_test.py +235 -0
- langfun/core/llms/cache/base.py +21 -2
- langfun/core/llms/cache/in_memory.py +13 -0
- langfun/core/llms/cache/in_memory_test.py +88 -28
- langfun/core/llms/compositional.py +101 -0
- langfun/core/llms/compositional_test.py +73 -0
- langfun/core/llms/deepseek.py +117 -0
- langfun/core/llms/deepseek_test.py +61 -0
- langfun/core/llms/fake.py +39 -26
- langfun/core/llms/fake_test.py +136 -11
- langfun/core/llms/gemini.py +507 -0
- langfun/core/llms/gemini_test.py +195 -0
- langfun/core/llms/google_genai.py +62 -218
- langfun/core/llms/google_genai_test.py +9 -197
- langfun/core/llms/groq.py +276 -0
- langfun/core/llms/groq_test.py +64 -0
- langfun/core/llms/llama_cpp.py +15 -40
- langfun/core/llms/llama_cpp_test.py +4 -30
- langfun/core/llms/openai.py +436 -226
- langfun/core/llms/openai_compatible.py +179 -0
- langfun/core/llms/openai_compatible_test.py +495 -0
- langfun/core/llms/openai_test.py +35 -174
- langfun/core/llms/rest.py +113 -0
- langfun/core/llms/rest_test.py +111 -0
- langfun/core/llms/vertexai.py +192 -0
- langfun/core/llms/vertexai_test.py +52 -0
- langfun/core/logging.py +284 -0
- langfun/core/logging_test.py +125 -0
- langfun/core/message.py +319 -9
- langfun/core/message_test.py +190 -13
- langfun/core/modalities/__init__.py +6 -2
- langfun/core/modalities/audio.py +30 -0
- langfun/core/modalities/audio_test.py +63 -0
- langfun/core/modalities/image.py +39 -20
- langfun/core/modalities/image_test.py +52 -9
- langfun/core/modalities/mime.py +206 -29
- langfun/core/modalities/mime_test.py +90 -9
- langfun/core/modalities/ms_office.py +117 -0
- langfun/core/modalities/ms_office_test.py +389 -0
- langfun/core/modalities/pdf.py +22 -0
- langfun/core/modalities/pdf_test.py +57 -0
- langfun/core/modalities/video.py +9 -23
- langfun/core/modalities/video_test.py +3 -3
- langfun/core/modality.py +26 -3
- langfun/core/modality_test.py +2 -2
- langfun/core/sampling.py +11 -11
- langfun/core/structured/__init__.py +15 -16
- langfun/core/structured/completion.py +32 -5
- langfun/core/structured/completion_test.py +9 -8
- langfun/core/structured/description.py +2 -2
- langfun/core/structured/description_test.py +3 -3
- langfun/core/structured/function_generation.py +278 -0
- langfun/core/structured/function_generation_test.py +399 -0
- langfun/core/structured/mapping.py +150 -46
- langfun/core/structured/mapping_test.py +105 -0
- langfun/core/structured/parsing.py +33 -21
- langfun/core/structured/parsing_test.py +71 -22
- langfun/core/structured/querying.py +746 -0
- langfun/core/structured/{prompting_test.py → querying_test.py} +545 -60
- langfun/core/structured/schema.py +208 -99
- langfun/core/structured/schema_generation.py +1 -1
- langfun/core/structured/schema_generation_test.py +2 -2
- langfun/core/structured/schema_test.py +133 -34
- langfun/core/structured/scoring.py +125 -19
- langfun/core/structured/scoring_test.py +30 -0
- langfun/core/structured/tokenization.py +64 -0
- langfun/core/structured/tokenization_test.py +48 -0
- langfun/core/template.py +240 -11
- langfun/core/template_test.py +146 -1
- langfun/core/templates/conversation.py +9 -0
- langfun/core/templates/conversation_test.py +4 -3
- langfun/core/templates/selfplay_test.py +14 -2
- langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
- langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
- {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
- langfun/core/coding/python/errors.py +0 -108
- langfun/core/coding/python/errors_test.py +0 -99
- langfun/core/coding/python/permissions.py +0 -90
- langfun/core/coding/python/permissions_test.py +0 -86
- langfun/core/structured/prompting.py +0 -217
- langfun/core/text_formatting.py +0 -162
- langfun/core/text_formatting_test.py +0 -47
- langfun-0.0.2.dev20240330.dist-info/METADATA +0 -99
- langfun-0.0.2.dev20240330.dist-info/RECORD +0 -102
- {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py  (CHANGED)
```diff
@@ -24,6 +24,7 @@ import os
 import re
 import threading
 import time
+import types
 from typing import Annotated, Any, Callable, Iterator, Literal, Optional, Sequence, Type, Union

 import langfun.core as lf
@@ -38,7 +39,8 @@ class Evaluable(lf.Component):

   EXPERIMENT_JSON = 'experiment.json'
   RESULT_JSON = 'result.json'
-
+  OOP_FAILURES_JSON = 'oop_failures.json'
+  NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
   INDEX_HTML = 'index.html'
   SUMMARY_HTML = 'summary.html'

@@ -213,6 +215,7 @@ class Evaluable(lf.Component):
       summary: bool = True,
       pivot_field: str = 'lm',
       from_root: bool = True,
+      timeout: int | None = None,
       **kwargs,
   ) -> Union['Summary', pg.Dict]:
     """Run the evaluation, which fills and returns the result."""
@@ -240,7 +243,7 @@ class Evaluable(lf.Component):
     ):
       if show_progress:
         lf.concurrent.ProgressBar.update(
-            progress_bar,
+            progress_bar, status='LOADING SAVED RESULTS...', color='yellow'
         )
       if self.try_load_result():
         run_status = 'CACHED'
@@ -263,13 +266,14 @@ class Evaluable(lf.Component):
           verbose=verbose,
           progress_bar=progress_bar,
           label=label,
+          timeout=timeout,
           **kwargs,
       )

       if should_save:
         if show_progress:
           lf.concurrent.ProgressBar.update(
-              progress_bar,
+              progress_bar, status='SAVING RESULTS...', color='yellow'
           )

         # Save evaluation results.
@@ -282,7 +286,7 @@ class Evaluable(lf.Component):
       if show_progress:
         lf.concurrent.ProgressBar.update(
             progress_bar,
-
+            status=self._completion_status(run_status),
             color='green',
         )
     else:
@@ -338,7 +342,7 @@ class Evaluable(lf.Component):
           f'[#{leaf.index} - {leaf.node.id}]',
           total=leaf.node.num_examples if leaf.enabled else 0,
           color='cyan' if leaf.enabled else 'yellow',
-
+          status=None if leaf.enabled else 'SKIPPED.')

     # Run leaf groups in parallel.
     try:
@@ -352,17 +356,17 @@ class Evaluable(lf.Component):
       # Save results for non-leaf nodes.
       lf.concurrent.ProgressBar.update(
           overview_bar,
-
+          status='SAVING RESULTS...',
           color='yellow')

       for node in self.nonleaf_nodes:
-        node._result = {c.id: c.result for c in node.
+        node._result = {c.id: c.result for c in node.leaf_nodes} # pylint: disable=protected-access
         if should_save:
           node.save(result=False, report=False)

       if should_save and summary:
         lf.concurrent.ProgressBar.update(
-            overview_bar,
+            overview_bar, status='FINALIZING SUMMARY...'
         )

         summary.save(os.path.join(self.root_dir, Evaluable.SUMMARY_HTML))
@@ -376,7 +380,7 @@ class Evaluable(lf.Component):
       # Signal all task completed by making the bar green.
       lf.concurrent.ProgressBar.update(
           overview_bar,
-
+          status='COMPLETED',
           color='green')

     finally:
@@ -396,6 +400,7 @@ class Evaluable(lf.Component):
       verbose: bool,
       progress_bar: int | None,
       label: str | None,
+      timeout: int | None = None,
       **kwargs,
   ) -> None:
     """Run the evaluate and fill `self.result`. Subclass to implement."""
@@ -526,27 +531,14 @@ class Evaluable(lf.Component):
     self._render_message(self.dryrun_output, s)

   def _render_message(self, message: lf.Message, s: io.StringIO) -> None:
-
-
-
-
-
-
-    text_color = 'black'
-
-    s.write(
-        f'<div style="color: {text_color}; white-space: pre-wrap;'
-        'padding: 10px; border: 1px solid; margin-top: 10px">'
-    )
-    s.write(m.text)
-    if m.result is not None:
-      s.write(
-          '<div style="color: magenta; white-space: pre-wrap;'
-          'padding: 10px; border: 1px solid; margin: 10px">'
+    s.write(
+        message.to_html_str(
+            extra_flags=dict(
+                include_message_metadata=False,
+                source_tag=['lm-input', 'lm-response'],
+            )
         )
-
-      s.write('</div>')
-      s.write('</div>')
+    )

   @classmethod
   def from_dir(
@@ -586,7 +578,6 @@ class _LeafNode:
 @pg.use_init_args(['children'])
 class Suite(Evaluable):
   """Evaluation suite."""
-
   children: Annotated[list[Evaluable], 'Child evaluation sets or suites.']

   # Use empty ID as suite is just a container of child evaluations.
@@ -741,10 +732,12 @@ class Evaluation(Evaluable):

   # Constants.
   CACHE_JSON = 'cache.json'
-
+  OOP_FAILURES_HTML = 'oop_failures.html'
+  NON_OOP_FAILURES_HTML = 'non_oop_failures.html'

   @functools.cached_property
   def hash(self) -> str:
+    """Returns the semantic-based hash of the evaluation."""
     if self.is_deterministic:
       identity = pg.format(self._identifiers(), compact=True)
     else:
@@ -793,6 +786,10 @@ class Evaluation(Evaluable):
     """Returns the complete rate."""
     return self.num_completed / self.num_examples

+  #
+  # Properties on failures.
+  #
+
   @property
   def failures(self) -> list[tuple[Any, Exception]]:
     """Returns the failed examples and their errors."""
@@ -803,6 +800,15 @@ class Evaluation(Evaluable):
     """Returns the number of failed examples."""
     return len(self.failures)

+  @functools.cached_property
+  def failure_breakdown(self) -> dict[str, int]:
+    """Returns the breakdown of failures."""
+    breakdown = collections.defaultdict(int)
+    for _, error in self.failures:
+      breakdown[_error_key(error)] += 1
+    sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
+    return pg.Dict({x[0]: x[1] for x in sorted_items})
+
   @property
   def failure_rate(self) -> float:
     """Returns the failure rate in range [0, 1]."""
@@ -810,17 +816,76 @@ class Evaluation(Evaluable):
       return 0.0
     return self.num_failures / self.num_completed

+  @functools.cached_property
+  def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
+    """Returns the OOP failures."""
+    return [item for item in self.failures
+            if isinstance(item[1], lf_structured.MappingError)]
+
+  @property
+  def num_oop_failures(self) -> int:
+    """Returns the number of OOP failures."""
+    return len(self.oop_failures)
+
+  @property
+  def oop_failure_rate(self) -> float:
+    """Returns the OOP failure rate in range [0, 1]."""
+    if self.num_completed == 0:
+      return 0.0
+    return self.num_oop_failures / self.num_completed
+
+  @functools.cached_property
+  def non_oop_failures(self) -> list[tuple[Any, Exception]]:
+    """Returns the OOP failures."""
+    return [item for item in self.failures
+            if not isinstance(item[1], lf_structured.MappingError)]
+
+  @property
+  def num_non_oop_failures(self) -> int:
+    """Returns the number of non-OOP failures."""
+    return len(self.non_oop_failures)
+
+  @property
+  def non_oop_failure_rate(self) -> float:
+    """Returns the non-OOP failure rate in range [0, 1]."""
+    if self.num_completed == 0:
+      return 0.0
+    return self.num_non_oop_failures / self.num_completed
+
+  #
+  # Properties on usage.
+  #
+
+  @property
+  def has_usage(self) -> bool:
+    """Returns True if token usage is enabled."""
+    return self._num_usages > 0
+
+  @property
+  def average_prompt_tokens(self) -> int:
+    """Returns the average prompt tokens."""
+    if not self.has_usage:
+      return 0
+    return self._total_prompt_tokens // self._num_usages
+
+  @property
+  def average_completion_tokens(self) -> int:
+    """Returns the average completion tokens."""
+    if not self.has_usage:
+      return 0
+    return self._total_completion_tokens // self._num_usages
+
+  @property
+  def average_total_tokens(self) -> int:
+    """Returns the average total tokens."""
+    return self.average_prompt_tokens + self.average_completion_tokens
+
   @functools.cached_property
   def schema(self) -> lf_structured.Schema | None:
     """Schema."""
     if self.schema_fn is None:
       return None

-    kwargs = {}
-    # Allow schema to be a function based on current evaluation.
-    if 'evaluation' in self.schema_fn.__signature__.arg_names:
-      kwargs['evaluation'] = self
-
     schema = self._call_schema_fn()
     fewshot_examples = None
     if isinstance(schema, tuple):
@@ -861,7 +926,11 @@ class Evaluation(Evaluable):
           'Encountered: {annotation!r}.'
       )
     self._maybe_adjust_schema_for_completion(annotation)
-
+    schema = lf_structured.Schema.from_value(annotation)
+    # NOTE(daiyip): add references to the dependent classes of the returned type
+    # to prevent unused subclasses get garbage collected by Python.
+    setattr(schema, '__dependencies__', schema.class_dependencies())
+    return schema

   def _maybe_adjust_schema_for_completion(self, cls):
     if (self.completion_prompt_field is None
@@ -870,7 +939,7 @@ class Evaluation(Evaluable):

     fields = list(cls.__schema__.values())
     fields.insert(0, (self.completion_prompt_field, pg.typing.Str()))
-
+    cls.update_schema(fields, extend=False)

   def _maybe_adjust_examples_for_completion(
       self,
@@ -938,12 +1007,25 @@ class Evaluation(Evaluable):
     self._failures = []
     self._num_completed = 0

+    self._total_prompt_tokens = 0
+    self._total_completion_tokens = 0
+    self._num_usages = 0
+    self.__dict__.pop('oop_failures', None)
+    self.__dict__.pop('non_oop_failures', None)
+
   @property
-  def
-    """Returns the link to the failures page."""
+  def oop_failures_link(self) -> str | None:
+    """Returns the link to the OOP failures page."""
     if self.dir is None:
       return None
-    return self.link(os.path.join(self.dir, Evaluation.
+    return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))
+
+  @property
+  def non_oop_failures_link(self) -> str | None:
+    """Returns the link to then non-OOP failures page."""
+    if self.dir is None:
+      return None
+    return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))

   def _dryrun(
       self,
@@ -953,11 +1035,11 @@ class Evaluation(Evaluable):
       verbose: bool,
       **kwargs,
   ) -> None:
-    # Set the example for dryrun.
-    example = example or self.examples[0]
-
     # We make a copy to avoid pollute the state of current object.
-    copy = self.clone()
+    copy: Evaluation = self.clone()
+
+    # Set the example for dryrun.
+    example = example or copy.examples[0]
     copy.__dict__['examples'] = [example]

     # We set the symbolic parent of the cloned to access contextual information
@@ -972,24 +1054,37 @@ class Evaluation(Evaluable):
         color='green',
     )

-
-    output_message = copy.process(example, **(self.additional_args or {}))
-    if self.schema is None:
-      output = output_message.text
-    else:
-      output = output_message.result
+    error, output_message = None, None

-
+    try:
+      with lf.use_settings(debug=debug):
+        output_message = copy.process(example, **(self.additional_args or {}))
+        self.process_output(example, output_message)
+
+      if self.schema is None:
+        output = output_message.text
+      else:
+        output = output_message.result
+
+      if verbose:
+        lf.console.write('')
+        lf.console.write(
+            str(output),
+            title='OUTPUT',
+            color='blue',
+        )
+    except lf_structured.MappingError as e:
       lf.console.write('')
       lf.console.write(
-          str(
-          title='
-          color='
+          str(e),
+          title='ERROR',
+          color='red',
       )
+      error = e
+
+    copy.audit(1, example, output_message, error, dryrun=True)
+    result = copy.finalize()

-    # Audit the result.
-    copy.audit(example, output, output_message)
-    result = copy.summarize()
     if verbose:
       lf.console.write('')
       lf.console.write(
@@ -1009,9 +1104,13 @@ class Evaluation(Evaluable):
       verbose: bool,
       progress_bar: int | None,
       label: str | None,
+      timeout: int | None = None,
       **kwargs,
   ) -> None:
     # Setup examples.
+    # Reset examples so it could be read from the input functor.
+    self.__dict__.pop('examples', None)
+
     if end is None:
       end = len(self.examples)
     examples = self.examples[start:end]
@@ -1020,34 +1119,39 @@ class Evaluation(Evaluable):
     with lf.use_settings(debug=debug, cache=self.cache):
       self._reset()

-      def _process(
+      def _process(idx_and_example: Any):
         # NOTE(daiyip): set the `input` symbol of the globals to None, so LLM
         # generated code with calls to `input` will raise an error, thus not
         # blocking the evaluation.
+        _, example = idx_and_example
         with lf_coding.context(input=None):
-
+          output_message = self.process(example, **(self.additional_args or {}))
+          self.process_output(example, output_message)
+          return output_message

       try:
-        for example, message, error in lf.concurrent_map(
+        for (idx, example), message, error in lf.concurrent_map(
             _process,
-            examples,
+            enumerate(examples),
             max_workers=self.max_workers,
             show_progress=progress_bar or False,
            status_fn=self._status,
+            timeout=timeout,
        ):
          if error is not None:
-
-
-
-
-
+            message = (
+                error.lm_response
+                if isinstance(error, lf_structured.MappingError)
+                else None
+            )
+          self.audit(idx + 1, example, message, error)
      finally:
        # Save cache upon completion or interruption.
        if self.dir and self.cache:
          self.cache.save()

    # Summarize result.
-    self._result = self.
+    self._result = self.finalize()
    if verbose:
      lf.console.write(
          str(self.result),
@@ -1061,7 +1165,7 @@ class Evaluation(Evaluable):

   def process(self, example: Any, **kwargs) -> lf.Message:
     """Process an example and returns its output."""
-    prompt = self.prompt
+    prompt = lf.Template.from_value(self.prompt, example=example)
     if self.method == 'call':
       return lf_structured.call(
           prompt,
@@ -1089,7 +1193,9 @@ class Evaluation(Evaluable):
     else:
       assert self.method == 'complete', self.method
       assert isinstance(self.schema.spec, pg.typing.Object), self.schema
-
+      # TODO(daiyip): Currently multi-modal inputs within the prompt for
+      # completion is not supported.
+      input_value = self.schema.spec.cls.partial(prompt.render().text)
       return lf_structured.complete(
           input_value,
           lm=self.lm,
@@ -1100,16 +1206,48 @@ class Evaluation(Evaluable):
           **kwargs,
       )

+  def process_output(self, example: Any, output: lf.Message) -> None:
+    """Process the output for an example.
+
+    Subclasses can override this method to generate and attach additional
+    metadata for debugging purpose. For example, draw bounding boxes on the
+    input image based on LLM predicted boxes and attach to output_message's
+    metadata.
+
+    Example:
+
+      class BoundingBoxEval(lf.eval.Matching):
+        ...
+        def process_output(example, output):
+          output.metadata.image_with_bbox = draw_bboxes(
+              example.image, output.result)
+
+    Args:
+      example: User input.
+      output: LLM's output message. Users could attach additional
+        information to the message, which will be shown in debugging
+    """
+    del example, output
+
   def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
+    status = {'Model': self.lm.model_id}
+    status.update(self._eval_status(progress))
+
+    if progress.last_error is not None:
+      status['LastError'] = progress.last_error_str()
+    if progress.timeit_summary:
+      status['TimeIt'] = progress.timeit_summary_str()
+    return status
+
+  def _eval_status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
     return {
-        '
-
-            progress.success_rate * 100,
+        'Succeeded': '%s (%d/%d)' % (
+            self._format_rate(progress.success_rate),
             progress.succeeded,
             progress.completed,
         ),
-        'Failed':
-            progress.failure_rate
+        'Failed': '%s (%d/%d)' % (
+            self._format_rate(progress.failure_rate),
             progress.failed,
             progress.completed,
         ),
@@ -1119,22 +1257,21 @@ class Evaluation(Evaluable):
     assert self.result is not None
     m = self.result.metrics
     return (
-
-        f' Failures=%.{self.report_precision}f%% (%d/%d)'
+        'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
         % (
             run_status,
-            (1 - m.failure_rate)
+            self._format_rate(1 - m.failure_rate),
             m.total - m.failures,
             m.total,
-            m.failure_rate
+            self._format_rate(m.failure_rate),
             m.failures,
             m.total,
         )
     )

-  def
-    """
-    if self.cache:
+  def finalize(self) -> pg.Dict:
+    """Finalizes the evaluation result."""
+    if self.cache is not None:
       cache_stats = dict(
           use_cache=True,
           num_queries=self.cache.stats.num_queries,
@@ -1143,12 +1280,25 @@ class Evaluation(Evaluable):
       )
     else:
       cache_stats = dict(use_cache=False)
+
+    if self.has_usage:
+      usage = pg.Dict(
+          total_prompt_tokens=self._total_prompt_tokens,
+          total_completion_tokens=self._total_completion_tokens,
+          num_usages=self._num_usages,
+          average_prompt_tokens=self.average_prompt_tokens,
+          average_completion_tokens=self.average_completion_tokens,
+          average_total_tokens=self.average_total_tokens,
+      )
+    else:
+      usage = None
+
     result = pg.Dict(
         experiment_setup=pg.Dict(
             id=self.id,
             dir=self.dir,
             model=self.lm.model_id,
-            prompt_template=
+            prompt_template=pg.decolor(str(self.prompt)),
             method=self.method,
             schema_fn=str(self.schema_fn),
         ),
@@ -1157,56 +1307,183 @@ class Evaluation(Evaluable):
             total=self.num_completed,
             failures=self.num_failures,
             failure_rate=self.failure_rate,
+            oop_failures=self.num_oop_failures,
+            oop_failure_rate=self.oop_failure_rate,
+            non_oop_failures=self.num_non_oop_failures,
+            non_oop_failure_rate=self.non_oop_failure_rate,
+            failure_breakdown=self.failure_breakdown,
         ),
+        usage=usage,
     )
     return result

-  def
+  def summary_card(self) -> str:
+    """Returns summary card in HTML."""
     s = io.StringIO()
     definition = _html_repr(self, compact=False, escape=True)
     s.write('<div><table><tr><td>')
+    self._render_link(
+        s,
+        definition,
+        self.hash,
+        '',
+        lambda: self.link(self.dir),
+    )
     if self.result is None:
       s.write(
-          f'<a target="_blank" title="{definition}" '
-          f'href="{self.link(self.dir)}">{self.hash}</a>'
           '</td></tr><tr><td>'
           '<span style="color: gray">(IN-PROGRESS...)</span>'
       )
     else:
-
-
-
-
-
-
+      if self.dir:
+        s.write(f' [<a href="{self.link(self.dir)}">dir</a>]')
+      s.write('</td></tr><tr><td>')
+      self._render_summary_metrics(s)
+
+      # Summarize average usage.
+      if self.result.usage:
+        self._render_summary_usage(s)
+
     s.write('</td></tr></table></div>')
     return s.getvalue()

-  def
+  def _render_summary_usage(self, s: io.StringIO) -> None:
+    """Renders usage in HTML."""
+    usage = self.result.usage
+    total = usage.total_prompt_tokens + usage.total_completion_tokens
+    s.write(
+        ' <a title="'
+        f'# of usages: {usage.num_usages}&#10;'
+        f'total prompt: {usage.total_prompt_tokens}&#10;'
+        f'total response: {usage.total_completion_tokens}&#10;'
+        f'avg prompt: {usage.average_prompt_tokens}&#10;'
+        f'avg response: {usage.average_completion_tokens}'
+        f'" style="color:gray">({total} tokens)</a>'
+    )
+
+  def _render_link(self,
+                   s: io.StringIO,
+                   title: str,
+                   text: str,
+                   style: str,
+                   url_fn: Callable[[], str]) -> None:
+    """Renders a link in HTML."""
+    s.write(
+        f'<a target="_blank" title="{title}" style="{style}"'
+    )
+    if self.dir:
+      s.write(f' href="{url_fn()}"')
+    s.write(f'>{text}</a>')
+
+  def _render_summary_metrics(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
     m = self.result.metrics
-
-
-
-
-
-
-
-
+
+    # OOP failures.
+    oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
+    if m.oop_failures:
+      oop_failure_title += '&#10;'
+      for name, count in m.failure_breakdown.items():
+        if name.startswith('MappingError'):
+          oop_failure_title += '&#10;%s: %s (%d/%d)' % (
+              name.removeprefix('MappingError.'),
+              self._format_rate(count / m.total),
+              count,
+              m.total,
+          )
+
+    extra_style = ''
+    if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
+      extra_style = ';font-weight:bold'
+    self._render_link(
+        s,
+        oop_failure_title,
+        self._format_rate(m.oop_failure_rate),
+        f'color:magenta{extra_style}',
+        lambda: self.oop_failures_link,
+    )
+    s.write(' | ')
+
+    # Non-OOP failures.
+    non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
+    if m.non_oop_failures:
+      non_oop_failure_title += '&#10;'
+      for name, count in m.failure_breakdown.items():
+        if not name.startswith('MappingError'):
+          non_oop_failure_title += '&#10;%s: %s (%d/%d)' % (
+              name,
+              self._format_rate(count / m.total),
+              count,
+              m.total,
+          )
+
+    extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
+    self._render_link(
+        s,
+        non_oop_failure_title,
+        self._format_rate(m.non_oop_failure_rate),
+        f'color:red{extra_style}',
+        lambda: self.non_oop_failures_link,
     )

-  def
+  def _format_rate(self, rate: float) -> str:
+    """Formats a rate."""
+    return f'%.{self.report_precision}f%% ' % (rate * 100)
+
+  def audit(
+      self,
+      example_idx: int,
+      example: Any,
+      message: lf.Message | None,
+      error: Exception | None = None,
+      dryrun: bool = False,
+  ) -> None:
     """Audits the example against the output. Subclasses should override.

     Args:
+      example_idx: 1-based index of the example in its dataset.
       example: The input object.
-      output: The output from LM. For `lf.call`, if `schema_fn` is not provided,
-        it will be the raw LM response string. Otherwise it will be the
-        structured output from the LM.
       message: The entire message returned by the LM, which could be used to
-        trace the LM input, response and parsed structure.
+        trace the LM input, response and parsed structure. If error is raised
+        before LLM could return a response, None will be its value.
+      error: The exception during processing the example.
+      dryrun: Whether or not audition takes place during dryrun.
     """
+    if error is not None:
+      self._failures.append((example, error))
+
+      # Invalid cache of num_oop_failures.
+      self.__dict__.pop('oop_failures', None)
+      self.__dict__.pop('non_oop_failures', None)
+      self.__dict__.pop('failure_breakdown', None)
+
+      if isinstance(error, lf_structured.MappingError):
+        message = error.lm_response
+    else:
+      assert message is not None
+      output = message.text if self.schema is None else message.result
+      self.audit_processed(example_idx, example, output, message, dryrun=dryrun)
+
+    # Audit usage.
+    if message is not None:
+      self.audit_usage(message, dryrun=dryrun)
+    self._num_completed += 1
+
+  def audit_usage(self, message: lf.Message, dryrun: bool = False) -> None:
+    del dryrun
+    for m in message.trace():
+      usage = m.metadata.get('usage', None)
+      if usage:
+        self._total_prompt_tokens += usage.prompt_tokens
+        self._total_completion_tokens += usage.completion_tokens
+        self._num_usages += 1
+
+  def audit_processed(
+      self, example_idx: int, example: Any, output: Any, message: lf.Message,
+      dryrun: bool = False
+  ) -> None:
+    """Audits a successfully processed example. Subclass should override."""

   def save(
       self, definition: bool = True, result: bool = True, report: bool = True
@@ -1229,16 +1506,26 @@ class Evaluation(Evaluable):
     # Save failures.
     pg.save(
         [
-            pg.Dict(
-
-            )
-            for input, error in self.failures
+            pg.Dict(input=input, error=_format_error(error))
+            for input, error in self.oop_failures
         ],
-        os.path.join(self.dir, Evaluation.
+        os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
     )
     pg.save(
-        self._html([self._render_result, self.
-        os.path.join(self.dir, Evaluation.
+        self._html([self._render_result, self._render_oop_failures]),
+        os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
+        file_format='txt',
+    )
+    pg.save(
+        [
+            pg.Dict(input=input, error=_format_error(error))
+            for input, error in self.non_oop_failures
+        ],
+        os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
+    )
+    pg.save(
+        self._html([self._render_result, self._render_non_oop_failures]),
+        os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
         file_format='txt',
     )

@@ -1250,8 +1537,11 @@ class Evaluation(Evaluable):
         '<td>Prompt</td>'
         '<td>Schema</td>'
         '<td>Additional Args</td>'
-        '<td>Failures</td>'
     )
+    if self.result.usage:
+      s.write('<td>Usage</td>')
+    s.write('<td>OOP Failures</td>')
+    s.write('<td>Non-OOP Failures</td>')

   def _render_result_row(self, s: io.StringIO) -> None:
     s.write(
@@ -1276,13 +1566,32 @@ class Evaluation(Evaluable):
         '<td style="color:purple" '
         f'{_html_repr(self.additional_args, compact=False)}</td>'
     )
-    #
+    # Usage.
+    if self.result.usage:
+      s.write('<td>')
+      self._render_summary_usage(s)
+      s.write('</td>')
+
+    # OOP failures.
+    s.write(
+        '<td><span style="color:magenta">%s</span>%s</td>'
+        % (
+            self._format_rate(self.oop_failure_rate),
+            '<a href="%s">(%d/%d)</a>'
+            % (self.oop_failures_link,
+               self.num_oop_failures,
+               self.num_completed),
+        )
+    )
+    # Non-OOP failures.
     s.write(
-        '<td><span style="color:
+        '<td><span style="color:red">%s</span>%s</td>'
         % (
-
+            self._format_rate(self.non_oop_failure_rate),
            '<a href="%s">(%d/%d)</a>'
-            % (self.
+            % (self.non_oop_failures_link,
+               self.num_non_oop_failures,
+               self.num_completed),
        )
    )

@@ -1296,31 +1605,99 @@ class Evaluation(Evaluable):
     else:
       return 'cyan'

-  def
+  def _render_oop_failures(self, s: io.StringIO) -> None:
+    self._render_failures(s, '^MappingError.*', error_color='magenta')
+
+  def _render_non_oop_failures(self, s: io.StringIO) -> None:
+    self._render_failures(s, '^(?!MappingError).*', error_color='red')
+
+  def _render_failures(
+      self, s: io.StringIO, error_regex: str, error_color: str) -> None:
     """Formats the failed cases into html."""
+    # Failure summary.
     s.write(
-        '<h2>
+        '<h2> Error Summary </h2>'
         '<div style="white-space:pre">\n'
         '<table style="border:1px solid">'
-        '<tr class="header"><td>
+        '<tr class="header"><td>Error type</td><td>Stats</td></tr>'
     )
+    error_regex = re.compile(error_regex)
+    if self.result.metrics.failure_breakdown:
+      for name, count in self.result.metrics.failure_breakdown.items():
+        if not error_regex.match(name):
+          continue
+
+        link = f'<a href="#{name}">{name}</a>'
+        error_rate = self._format_rate(count / self.result.metrics.total)
+        stats = (f'<span style="color:{error_color}">{error_rate} '
+                 f'({count}/{self.result.metrics.total})</span>')
+        s.write(f'<tr><td>{link}</td><td>{stats})</td></tr>')
+    s.write(
+        '</table></div>'
+        '<h2> Failed Cases </h2>'
+        '<div style="white-space:pre">'
+    )
+    # Failure details by error type.
+    failures_by_error = collections.defaultdict(list)
+    for example, error in self.failures:
+      error_name = _error_key(error)
+      if error_regex.match(error_name):
+        failures_by_error[error_name].append((example, error))
+
+    for error_key, failures in failures_by_error.items():
+      s.write(
+          f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
+          f'(count={len(failures)})</h3>'
+          '<table style="border:1px solid">'
+          '<tr class="header"><td>No.</td><td>Input</td>'
+          '<td>LM invocation</td><td>Error</td></tr>'
+      )
+      for i, (example, error) in enumerate(failures):
+        lm_response = None
+        if isinstance(error, lf.structured.MappingError):
+          lm_response = error.lm_response
+          error = error.cause
+
+        bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
+        s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
+        s.write('<td style="color:green;white-space:pre-wrap">')
+        s.write(pg.format(example, verbose=False))
+        s.write('</td><td>')
+        if lm_response is not None:
+          self._render_message(lm_response, s)
+        s.write(f'</td><td style="color:{error_color};white-space:pre">')
+        s.write(_format_error(error))
+        s.write('</td></tr>')
+      s.write('</table>')
+    s.write('</div>')

-
-
-
-
-
-      error_str = lf.text_formatting.decolored(str(error))
-      s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
-      s.write('</tr>')
-    s.write('</table></div>')
+  @classmethod
+  def visualize(cls, evaluations: list['Evaluation']) -> str | None:
+    """Visualize the a list of evaluations of this task in HTML."""
+    del evaluations
+    return None


 @pg.functor()
-def inputs_from(path: str | list[str]) -> list[Any]:
+def inputs_from(path: str | list[str], **kwargs) -> list[Any]:
   """A functor that returns a list of user-defined objects as eval inputs."""
   if isinstance(path, str):
-
+    if path.endswith('.json'):
+      return pg.load(path)
+    elif path.endswith('.jsonl'):
+      return list(iter(pg.open_jsonl(path)))
+    elif path.endswith('.csv'):
+      import pandas as pd # pylint: disable=g-import-not-at-top
+      dataset_df = pd.read_csv(path, **kwargs)
+      dataset = []
+      for i in range(dataset_df.shape[0]):
+        row = {}
+        for col in dataset_df.columns:
+          row[col] = dataset_df.iloc[i][col]
+        dataset.append(row)
+      return dataset
+    else:
+      raise ValueError(f'Unsupported file format: {path}')
   examples = []
   for p in path:
     examples.extend(pg.load(p))
@@ -1374,8 +1751,8 @@ class Summary(pg.Object):
           Type[lf.LanguageModel],
           tuple[lf.LanguageModel | Type[lf.LanguageModel], ...],
       ] = lf.LanguageModel,
-      method: Union[str, tuple[str], None] = None,
-      schema_fn: Union[pg.Functor, tuple[pg.Functor], None] = None,
+      method: Union[str, tuple[str, ...], None] = None,
+      schema_fn: Union[pg.Functor, tuple[pg.Functor, ...], None] = None,
       completed: bool | None = None,
       pivot_field: str | None = None,
   ) -> 'Summary':
@@ -1466,7 +1843,7 @@ class Summary(pg.Object):
         if e is None:
           s.write('<span style="color: gray">N/A<span>')
         else:
-          s.write(e.
+          s.write(e.summary_card())
         s.write('</td>')
       s.write('</tr>')
     s.write('</table>')
@@ -1541,13 +1918,22 @@ class Summary(pg.Object):
     s.write('<html><body>')
     for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
       table_id = task.__name__.lower()
+      evaluations = self.select(task=task).evaluations
+      table = Summary.Table.from_evaluations(evaluations, pivot_field)
       s.write('<div>')
-      s.write(
-
-
-      table = Summary.Table.from_evaluations(
-          self.select(task=task).evaluations, pivot_field
+      s.write(
+          f'<a id="{table_id}" href="#{table_id}">'
+          f'<h2>{task.__name__}</h2></a>'
       )
+
+      # Allow users to plugin visualization code (e.g. matplot) in the summary
+      # page.
+      visual_part = task.visualize(evaluations)
+      if visual_part:
+        s.write(visual_part)
+
+      s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
+      s.write('<hr/>')
       s.write(table.html())
       s.write('</div>')
     s.write('</body></html>')
@@ -1556,8 +1942,36 @@ class Summary(pg.Object):
   def _repr_html_(self) -> str:
     return self.html()

+  def json(
+      self,
+  ) -> dict[
+      str, # Task name
+      list[pg.Dict], # List of pg.Dict with `experiment` and `metrics`.
+  ]:
+    """Returns the JSON representation of the summary."""
+    task_results = {}
+    for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
+      results = []
+      for entry in self.select(task=task).evaluations:
+        results.append(
+            pg.Dict(
+                id=entry.id,
+                experiment=entry,
+                dir=entry.dir,
+                metrics=entry.result.metrics if entry.result else None,
+                usage=entry.result.usage if entry.result else None,
+            )
+        )
+      task_results[task.__name__] = results
+    return task_results
+
   def save(self, file: str, pivot_field: str | None = None) -> None:
     pg.save(self.html(pivot_field), file, file_format='txt')
+    if file.endswith('.html'):
+      json_file = file.replace('.html', '.json')
+    else:
+      json_file = os.path.join(file, '.json')
+    pg.save(self.json(), json_file)

   @classmethod
   def from_dirs(
@@ -1694,6 +2108,20 @@ class Summary(pg.Object):
   return result.join()


+def _format_error(error: Exception):
+  """Formats an error into a string."""
+  return (f'({error.__class__.__name__}) ' + pg.decolor(str(error)))
+
+
+def _error_key(error: Exception) -> str:
+  """Returns the key for an error."""
+  error_names = []
+  while error is not None:
+    error_names.append(error.__class__.__name__)
+    error = getattr(error, 'cause', None)
+  return '.'.join(error_names)
+
+
 def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
   """Formats prompt in HTML."""
   if type(value) is lf.Template: # pylint: disable=unidiomatic-typecheck
@@ -1768,3 +2196,202 @@ def monitor_async(
       scan_interval=scan_interval,
       refresh_when_stop=refresh_when_stop,
   )
+
+
+#
+# Named evaluations and experiments support.
+#
+
+
+class _NamedEvaluationRegistry:
+  """Named evaluation registry."""
+
+  def __init__(self):
+    self._registry = {}
+
+  def names(self) -> list[str]:
+    """Returns all registered names."""
+    return sorted(self._registry.keys())
+
+  def get(self, name: str) -> list[Type[Evaluable]]:
+    """Gets an evaluation by name."""
+    matches = []
+    if name in self._registry:
+      matches.append(self._registry[name])
+    else:
+      regex = re.compile(name)
+      for key, cls in self._registry.items():
+        if regex.match(key):
+          matches.append(cls)
+    return matches
+
+  def register(
+      self,
+      name: str,
+      experiment_cls: Type[Evaluable],
+  ):
+    """Register an experiment class."""
+    self._registry[name] = experiment_cls
+
+
+_eval_registry = _NamedEvaluationRegistry()
+
+
+def registered_names() -> list[str]:
+  """Returns all registered names."""
+  return _eval_registry.names()
+
+
+def get_evaluations(evaluation: str | Evaluable) -> list[Evaluable]:
+  """Gets an evaluation experiment by name."""
+  if isinstance(evaluation, str):
+    return [e() for e in _eval_registry.get(evaluation)]
+  return [evaluation]
+
+
+def register(name: str):
+  """Decorator to create a named evaluation class."""
+
+  def _register(func_or_cls: Type[Evaluation] | types.FunctionType):
+    if inspect.isfunction(func_or_cls):
+      e = func_or_cls()
+      if not isinstance(e, Evaluable):
+        raise TypeError(
+            f'The return value of `{func_or_cls}` should be an instance of '
+            '`lf.eval.Evaluable` subclass.'
+        )
+
+      class GeneratedSuite(Suite):
+        # NOTE(daiyip): Delay serialization key registration for generated
+        # class.
+        auto_register = False
+        children = e.children if isinstance(e, Suite) else [e]
+
+      cls = GeneratedSuite
+      cls.__name__ = func_or_cls.__name__
+      cls.__doc__ = func_or_cls.__doc__
+      cls.__qualname__ = func_or_cls.__qualname__
+      cls.__module__ = getattr(func_or_cls, '__module__', 'wrapper')
+      cls.register_for_deserialization(cls.__type_name__)
+
+    elif issubclass(func_or_cls, Evaluable):
+      cls = func_or_cls
+    else:
+      raise ValueError(f'Unsupported type: {type(func_or_cls)}')
+
+    _eval_registry.register(name, cls)
+    return cls
+
+  return _register
+
+
+def get(
+    root_dir: str,
+    evaluations: list[str | Evaluable],
+    filter: Union[ # pylint: disable=redefined-builtin
+        str, # Regex to filter evaluation based on ID.
+        Callable[[Evaluable], bool], # Custom filter function.
+        None # No filtering (Default).
+    ] = None, # pylint: disable=bad-whitespace
+    patches: list[Union[
+        str, # String-based PyGlove patcher.
+        pg.patching.Patcher, # PyGlove patcher object.
+        Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+    ]] | None = None, # pylint: disable=bad-whitespace
+) -> Suite:
+  """Gets a suite from a list of patched evaluations.
+
+  Args:
+    root_dir: The root directory of the experiment.
+    evaluations: A list of evaluations to be included in the suite.
+    filter: A regular expression (str) for selecting sub-experiments of matched
+      IDs, or a filter function to filter the evaluations.
+    patches: A list of patches to be applied to the suite. Each element can be
+      a string (for string-based patcher), a `pg.patching.Patcher` object, or
+      a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+      details.
+
+  Returns:
+    A suite of selected `lf.eval.Evaluation` objects.
+  """
+  matches = []
+  for e in evaluations:
+    matches.extend(get_evaluations(e))
+
+  if not matches:
+    raise ValueError('No evaluations found.')
+
+  suite = Suite(matches, root_dir=root_dir)
+  if patches:
+    suite = pg.patch(suite, patches)
+
+  if isinstance(filter, str):
+    regex = re.compile(filter)
+    filter = lambda x: bool(regex.match(x.id))
+
+  if filter:
+    suite = Suite(
+        [leaf for leaf in suite.leaf_nodes if filter(leaf)], root_dir=root_dir)
+  return suite
+
+
+def run(
+    root_dir: str,
+    evaluations: list[str | Evaluable],
+    filter: Union[ # pylint: disable=redefined-builtin
+        str, # Regex to filter evaluation based on ID.
+        Callable[[Evaluable], bool], # Custom filter function.
+        None # No filtering (Default).
+    ] = None, # pylint: disable=bad-whitespace
+    patches: list[Union[
+        str, # String-based PyGlove patcher.
+        pg.patching.Patcher, # PyGlove patcher object.
+        Callable[[pg.KeyPath, Any, Any], Any], # PyGlove rebind function.
+    ]] | None = None, # pylint: disable=bad-whitespace
+    mode: Literal['run', 'rerun', 'dryrun', 'noop'] = 'run',
+    debug: bool = False,
+    print_definition: bool = False,
+    **kwargs,
+) -> Suite:
+  """Run selected evaluations with patching.
+
+  Args:
+    root_dir: The root directory of the experiment.
+    evaluations: A list of evaluations to be included in the suite.
+    filter: A regular expression (str) for selecting sub-experiments of matched
+      IDs, or a filter function to filter the evaluations.
+    patches: A list of patches to be applied to the suite. Each element can be
+      a string (for string-based patcher), a `pg.patching.Patcher` object, or
+      a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
+      details.
+    mode: The mode to run the suite. "run" to run the suite, with reusing
+      existing results if available; "rerun" to rerun all evaluations even if
+      there are existing results; "dryrun" to dryrun the suite; and "noop"
+      to do nothing.
+    debug: Whether to run in debug mode.
+    print_definition: Whether to print the experiment definition.
+    **kwargs: Additional arguments to be passed to dryrun/run the suite.
+
+  Returns:
+    A suite of selected `lf.eval.Evaluation` objects.
+  """
+  suite = get(root_dir, evaluations, patches=patches, filter=filter)
+  if print_definition:
+    lf.console.write(
+        pg.format(
+            suite,
+            compact=False,
+            verbose=False,
+            hide_default_values=True,
+            python_format=True,
+        ),
+        title='[EXPERIMENT DEFINITION]',
+        color='blue',
+    )
+
+  if mode == 'run':
+    rerun = mode == 'rerun'
+    suite.run(debug=debug, rerun=rerun, **kwargs)
+  elif mode == 'dryrun':
+    suite.dryrun(debug=debug, **kwargs)
+  return suite
```