langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512040805__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +1 -1
- langfun/core/__init__.py +7 -1
- langfun/core/agentic/__init__.py +8 -1
- langfun/core/agentic/action.py +740 -112
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +189 -24
- langfun/core/async_support.py +104 -5
- langfun/core/async_support_test.py +23 -0
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +9 -2
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +11 -2
- langfun/core/data/conversion/gemini_test.py +48 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +48 -44
- langfun/core/eval/base_test.py +5 -5
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +2 -0
- langfun/core/eval/v2/checkpointing.py +76 -7
- langfun/core/eval/v2/checkpointing_test.py +9 -2
- langfun/core/eval/v2/config_saver.py +37 -0
- langfun/core/eval/v2/config_saver_test.py +36 -0
- langfun/core/eval/v2/eval_test_helper.py +104 -3
- langfun/core/eval/v2/evaluation.py +92 -17
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +50 -40
- langfun/core/eval/v2/example_test.py +16 -8
- langfun/core/eval/v2/experiment.py +84 -15
- langfun/core/eval/v2/experiment_test.py +19 -0
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +31 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking.py +13 -5
- langfun/core/eval/v2/progress_tracking_test.py +9 -1
- langfun/core/eval/v2/reporting.py +90 -71
- langfun/core/eval/v2/reporting_test.py +24 -6
- langfun/core/eval/v2/runners/__init__.py +30 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +72 -180
- langfun/core/eval/v2/runners/beam.py +354 -0
- langfun/core/eval/v2/runners/beam_test.py +153 -0
- langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
- langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +76 -0
- langfun/core/eval/v2/runners/parallel.py +243 -0
- langfun/core/eval/v2/runners/parallel_test.py +182 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +169 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +7 -5
- langfun/core/language_model.py +189 -36
- langfun/core/language_model_test.py +54 -3
- langfun/core/llms/__init__.py +12 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +64 -12
- langfun/core/llms/gemini_test.py +110 -0
- langfun/core/llms/google_genai.py +34 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +120 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +16 -1
- langfun/core/llms/vertexai.py +58 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/__init__.py +10 -0
- langfun/core/mcp/client.py +177 -0
- langfun/core/mcp/client_test.py +71 -0
- langfun/core/mcp/session.py +241 -0
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/testing/simple_mcp_client.py +33 -0
- langfun/core/mcp/testing/simple_mcp_server.py +33 -0
- langfun/core/mcp/tool.py +254 -0
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +73 -3
- langfun/core/modalities/image_test.py +116 -0
- langfun/core/modalities/mime.py +64 -3
- langfun/core/modalities/mime_test.py +11 -0
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +230 -154
- langfun/core/structured/querying_test.py +69 -33
- langfun/core/structured/schema/__init__.py +49 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +175 -50
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +43 -0
- langfun/env/base_environment.py +827 -0
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +304 -0
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +842 -0
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +14 -0
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +472 -0
- langfun/env/event_handlers/event_logger_test.py +304 -0
- langfun/env/event_handlers/metric_writer.py +726 -0
- langfun/env/event_handlers/metric_writer_test.py +214 -0
- langfun/env/interface.py +1640 -0
- langfun/env/interface_test.py +153 -0
- langfun/env/load_balancers.py +59 -0
- langfun/env/load_balancers_test.py +141 -0
- langfun/env/test_utils.py +507 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/METADATA +7 -3
- langfun-0.1.2.dev202512040805.dist-info/RECORD +217 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/experiment.py
CHANGED

@@ -139,10 +139,10 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
 
   # Checkpointing
 
-  Experiments support checkpointing, which is enabled by default. It allows
+  Experiments support checkpointing, which is enabled by default. It allows
   users to resume their experiments from a saved state. When an experiment runs,
-  it creates a new directory for that run and saves
-
+  it creates a new directory for that run and saves its progress to checkpoint
+  files. If the experiment is interrupted or fails, users can resume
   it by specifying the 'id' or 'warm_start_from' argument (shown above) to
   seamlessly continue from previously saved state without starting over.
 
@@ -169,7 +169,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
 
   # Experiment Plugins
 
-
+  Experiments can be extended by plugins. Plugins can listen to the events of
   experiment execution and produce additional outputs. For example, a plugin
   can be added to an experiment to generate additional metrics or to save
   additional data to a database. More details will be added in the future.
@@ -657,7 +657,30 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
 
 @pg.use_init_args(['children'])
 class Suite(Experiment):
-  """A suite of evaluations.
+  """A suite of evaluations.
+
+  `lf.eval.Suite` groups multiple `lf.eval.Evaluation` or other `Suite`
+  objects into a single experiment, allowing them to be run, managed, and
+  reported together.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+
+  suite = lf.eval.Suite([
+      MyEval(lm=lf.llms.Gpt4()),
+      MyEval(lm=lf.llms.Gemini()),
+      lf.eval.Suite([
+          AnotherEval(lm=lf.llms.Gpt4()),
+          AnotherEval(lm=lf.llms.Gemini())
+      ])
+  ])
+
+  # Run all evaluations in the suite
+  run_info = suite.run('/path/to/my/suite_run')
+  ```
+  """
 
   children: Annotated[
       list[Experiment], 'A list of child experiments.'
@@ -791,7 +814,14 @@ class RunId(pg.Object):
 
 
 class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
-  """
+  """Represents a single run of an experiment.
+
+  A `Run` object holds all the configurations for executing an experiment,
+  such as the experiment definition, input/output directories, and flags
+  controlling the execution behavior (e.g., error handling, checkpointing).
+  It also provides utility methods for accessing run-specific paths and
+  filtering examples for evaluation.
+  """
 
   root_dir: Annotated[
       str,
@@ -818,10 +848,10 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
   ] = None
 
   example_ids: Annotated[
-      list[int] | None,
+      list[int] | Callable[[Experiment], list[int]] | None,
       (
-          'The example IDs to run.
-          '
+          'The example IDs to run. Or a callable for determining the examples '
+          'to run based on the experiment. If None, it will run all examples. '
      )
   ] = None
 
@@ -937,10 +967,13 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
     """Returns the example IDs to evaluate."""
     if not experiment.is_leaf:
       return set()
-
-
-
-
+    if self.example_ids is None:
+      return set(range(1, experiment.num_examples + 1))
+    elif isinstance(self.example_ids, Callable):
+      return set(self.example_ids(experiment))
+    else:
+      assert isinstance(self.example_ids, list), self.example_ids
+      return set(self.example_ids)
 
   def examples_to_reprocess(self, experiment: Experiment) -> set[int]:
     """Returns the example IDs to reprocess per request."""
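
The callable form of `example_ids` makes example selection dynamic per leaf experiment. A minimal sketch under stated assumptions: `first_ten` is a hypothetical selector, `my_eval` stands in for an existing `lf.eval.v2` evaluation, and passing `example_ids` through `run()` is assumed from the `Run` fields above rather than confirmed by this diff.

```python
# Hypothetical selector: evaluate only the first ten examples of each leaf
# experiment (IDs are 1-based, matching the range(1, num_examples + 1)
# default shown above).
def first_ten(experiment) -> list[int]:
  return list(range(1, min(10, experiment.num_examples) + 1))

run_info = my_eval.run('/tmp/my_eval_run', example_ids=first_ten)
```
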
@@ -971,7 +1004,13 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
 
 
 class Runner(pg.Object):
-  """Interface for experiment runner.
+  """Interface for experiment runner.
+
+  A runner is responsible for executing the evaluations within an experiment
+  based on the configuration specified in a `Run` object. Different runners
+  can implement different execution strategies, such as sequential or parallel
+  processing of examples and evaluations.
+  """
 
   # Class-level variable for registering the runner.
   NAME = None
@@ -1010,7 +1049,37 @@ class Runner(pg.Object):
 
 
 class Plugin(lf.Component):
-  """Base class for experiment plugins.
+  """Base class for experiment plugins.
+
+  Plugins provide a mechanism to extend the behavior of an experiment run
+  by hooking into various events during the lifecycle of experiment and
+  example execution, such as `on_run_start`, `on_experiment_complete`,
+  `on_example_start`, etc. They can be used for custom logging, monitoring,
+  or result processing.
+  """
+
+  @classmethod
+  def is_per_example(cls) -> bool:
+    """Returns whether the plugin is per example only.
+
+    Per-example plugins can be installed on individual workers when examples
+    are evaluated by multiple processes in parallel.
+    """
+
+    def same_code(method1, method2):
+      return method1.__code__ == method2.__code__
+    return all(
+        same_code(method1, method2)
+        for method1, method2 in [
+            (Plugin.on_run_start, cls.on_run_start),
+            (Plugin.on_run_complete, cls.on_run_complete),
+            (Plugin.on_run_abort, cls.on_run_abort),
+            (Plugin.on_experiment_start, cls.on_experiment_start),
+            (Plugin.on_experiment_skipped, cls.on_experiment_skipped),
+            (Plugin.on_experiment_complete, cls.on_experiment_complete),
+            (Plugin.on_experiment_abort, cls.on_experiment_abort),
+        ]
+    )
 
   def on_run_start(
       self,
langfun/core/eval/v2/experiment_test.py
CHANGED

@@ -433,5 +433,24 @@ class RunnerTest(unittest.TestCase):
     pass
 
 
+class PluginTest(unittest.TestCase):
+
+  def test_per_example_only(self):
+
+    class PerExamplePlugin(experiment_lib.Plugin):
+
+      def on_example_complete(self, runner, experiment, example):
+        print('on_example_complete')
+
+    self.assertTrue(PerExamplePlugin.is_per_example())
+
+    class NonPerExamplePlugin(experiment_lib.Plugin):
+
+      def on_experiment_complete(self, runner, experiment):
+        print('on_example_complete')
+
+    self.assertFalse(NonPerExamplePlugin.is_per_example())
+
+
 if __name__ == '__main__':
   unittest.main()
langfun/core/eval/v2/metric_values.py
CHANGED

@@ -20,7 +20,15 @@ import pyglove as pg
 
 
 class MetricValue(pg.Object):
-  """Base class for metric values.
+  """Base class for metric values.
+
+  `MetricValue` is the base class for representing aggregated metric values
+  in an evaluation. It accumulates data points from individual examples,
+  each consisting of a value and an optional weight, associated with an example
+  ID. Subclasses must implement `reduce` method to compute a single float value
+  from accumulated data points, and `scalar_repr` to provide a string
+  representation of the reduced value.
+  """
 
   class DataPoint(pg.Object):
     """A data point for a metric value."""
@@ -88,6 +96,14 @@ class MetricValue(pg.Object):
     self.increment_total()
     return self
 
+  def merge_from(self, other: 'MetricValue') -> 'MetricValue':
+    """Merges the values from another metric value."""
+    self._weighted_sum += other._weighted_sum  # pylint: disable=protected-access
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      self.data_points.extend(other.data_points)
+    self.increment_total(other.total)
+    return self
+
   def __gt__(self, other: Union['MetricValue', float]) -> bool:
     if isinstance(other, self.__class__):
       return float(self) > float(other)
@@ -133,7 +149,13 @@ class MetricValue(pg.Object):
 
 
 class Rate(MetricValue):
-  """
+  """Metric value representing a rate in range [0, 1].
+
+  `Rate` is used for metrics that compute a rate, such as accuracy or error
+  rate. The final value is computed as the weighted sum of accumulated values
+  divided by the total number of examples. It's displayed as a percentage
+  (e.g., 90.0%).
+  """
 
   def reduce(self) -> float:
     return self._weighted_sum / self.total
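
A small sketch of the accumulation model described above, using the `add(example_id, value, weight, increment_total=...)` call signature visible in the metric_values tests later in this diff; treat it as illustrative rather than canonical.

```python
from langfun.core.eval.v2 import metric_values

rate = metric_values.Rate()
rate.add(1, 1.0, 1.0, increment_total=True)  # example 1 counts as a hit
rate.add(2, 0.0, 1.0, increment_total=True)  # example 2 counts as a miss
assert rate.total == 2
assert float(rate) == 0.5  # weighted sum / total; rendered as a percentage
```
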
@@ -145,7 +167,13 @@ class Rate(MetricValue):
 
 
 class Average(MetricValue):
-  """
+  """Metric value representing an average of accumulated values.
+
+  `Average` is used for metrics that compute an average score across examples
+  (e.g., average quality score). The final value is computed as the weighted
+  sum of accumulated values divided by the number of data points.
+  It's displayed as a float with 3 decimal places (e.g., 4.750).
+  """
 
   def reduce(self) -> float:
     if not self.data_points:
langfun/core/eval/v2/metric_values_test.py
CHANGED

@@ -51,6 +51,22 @@ class RateTest(unittest.TestCase):
     self.assertEqual(rate.total, 0)
     self.assertTrue(math.isnan(float(rate)))
 
+  def test_merge_from(self):
+    rate1 = metric_values.Rate()
+    rate1.add(1, 1.0, 1.0, increment_total=True)
+    rate2 = metric_values.Rate()
+    rate2.add(2, 0.0, 1.0, increment_total=True)
+    rate1.merge_from(rate2)
+    self.assertEqual(rate1.total, 2)
+    self.assertEqual(float(rate1), 0.5)
+    self.assertEqual(
+        rate1.data_points,
+        [
+            metric_values.MetricValue.DataPoint(1, 1.0, 1.0),
+            metric_values.MetricValue.DataPoint(2, 0.0, 1.0),
+        ],
+    )
+
 
 class AverageTest(unittest.TestCase):
 
@@ -75,6 +91,22 @@ class AverageTest(unittest.TestCase):
     average.reset()
     self.assertEqual(average.total, 0)
 
+  def test_merge_from(self):
+    avg1 = metric_values.Average()
+    avg1.add(1, 1.0, 0.5, increment_total=True)
+    avg2 = metric_values.Average()
+    avg2.add(2, 0.0, 1.0, increment_total=True)
+    avg1.merge_from(avg2)
+    self.assertEqual(avg1.total, 2)
+    self.assertEqual(float(avg1), 0.25)
+    self.assertEqual(
+        avg1.data_points,
+        [
+            metric_values.MetricValue.DataPoint(1, 1.0, 0.5),
+            metric_values.MetricValue.DataPoint(2, 0.0, 1.0),
+        ],
+    )
+
 
 if __name__ == '__main__':
   unittest.main()
langfun/core/eval/v2/metrics.py
CHANGED

@@ -29,7 +29,15 @@ Average = metric_values.Average
 
 
 class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
-  """Interface for an evaluation metric.
+  """Interface for an evaluation metric.
+
+  A metric is used to evaluate the quality of the outputs produced by an
+  evaluation. It works by auditing each processed example via its `audit`
+  method, which in turn calls the user-overridable `_audit` method to perform
+  metric-specific logic and update metric values. Metrics can compute multiple
+  values (e.g., precision, recall, F1 score) which are exposed via the
+  `values` method.
+  """
 
   name: Annotated[
       str,
@@ -44,24 +52,43 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
     self._label_group = None
     self._lock = threading.Lock()
 
-  def
-
-
-
-
-    with
-      for v in self.values():
-        v.increment_total()
+  def update(
+      self,
+      example: example_lib.Example,
+      force_recompute: bool = False
+  ) -> dict[str, Any]:
+    """Updates metric values with a processed example.
 
-
+    Args:
+      example: The processed example.
+      force_recompute: Whether to force recompute the metric metadata even if
+        they are already present.
 
-
-
+    Returns:
+      A dict of metric metadata.
+    """
+    if (force_recompute
+        or example.metric_metadata is None
+        or self.name not in example.metric_metadata):
+      metadata = self.compute_metric_metadata(example)
+    else:
+      metadata = example.metric_metadata[self.name]
+    self.update_metric_values(example.id, metadata)
+    self._update_view()
+    return metadata
 
   @abc.abstractmethod
-  def
+  def compute_metric_metadata(
+      self, example: example_lib.Example
+  ) -> dict[str, Any]:
     """Subclasses should override this method to implement the metric logic."""
 
+  @abc.abstractmethod
+  def update_metric_values(
+      self, example_id: int, metric_metadata: dict[str, Any]
+  ) -> None:
+    """Update metric values based on metric metadata."""
+
   @abc.abstractmethod
   def values(self) -> list[metric_values.MetricValue]:
     """Returns all the values computed by this metric."""
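
A brief usage sketch of the new `update` entry point shown above; `metric` and `example` stand in for an existing `Metric` subclass instance and a processed `Example`.

```python
# Per the logic above: metadata is read from example.metric_metadata[metric.name]
# when present, otherwise recomputed via compute_metric_metadata(); either way
# it is folded into the metric values via update_metric_values().
metadata = metric.update(example)
# Force recomputation even if cached metadata exists for this metric.
metadata = metric.update(example, force_recompute=True)
```
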
@@ -71,6 +98,12 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
     for v in self.values():
       v.reset()
 
+  def merge_from(self, other: 'Metric') -> 'Metric':
+    """Merges the values from another metric."""
+    for v1, v2 in zip(self.values(), other.values()):
+      v1.merge_from(v2)
+    return self
+
   def _update_view(self):
     """Refreshes the metric values."""
     if self._label_group is None:
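
`Metric.merge_from` pairs naturally with runs that evaluate shards in separate processes (e.g. the new beam and parallel runners listed in the file summary). A hedged sketch, where `shard_metrics` is a stand-in list of same-typed metrics gathered from workers, not a langfun API:

```python
merged = shard_metrics[0]
for other in shard_metrics[1:]:
  merged.merge_from(other)  # pairs values() element-wise, as defined above
print([float(v) for v in merged.values()])
```
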
@@ -169,7 +202,15 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
 
 
 class MetricBase(Metric):
-  """Base class for common metrics.
+  """Base class for common metrics.
+
+  `MetricBase` provides common functionalities for metrics, such as automatic
+  error counting based on whether an example has an error during evaluation.
+  It distinguishes between Object-Oriented Programming (OOP) errors
+  (e.g. `MappingError` during structured output generation) and other errors.
+  Subclasses should implement `_audit_processed` for metric computation on
+  successfully processed examples.
+  """
 
   oop_errors: Rate | None = Rate()
   non_oop_errors: Rate | None = Rate()
@@ -183,27 +224,67 @@ class MetricBase(Metric):
     super().reset()
     self._error_breakdown = collections.defaultdict(list)
 
-  def
-
+  def compute_metric_metadata(
+      self, example: example_lib.Example
+  ) -> dict[str, Any]:
+    """Computes the metric metadata for the example."""
     if example.error is None:
-      return self.
+      return self._compute_metric_metadata(example)
+    return self._compute_metric_metadata_with_processing_error(example)
+
+  def update_metric_values(
+      self,
+      example_id: int,
+      metric_metadata: dict[str, Any]
+  ) -> None:
+    """Collects the metric metadata."""
+    # NOTE(daiyip): the metric values are being updated concurrently, so we
+    # uses a lock to avoid race condition. We might consider relaxing the lock
+    # later if metric auditing becomes a bottleneck.
+    with self._lock:
+      for v in self.values():
+        v.increment_total()
+
+      if 'error' in metric_metadata:
+        self._update_metric_values_with_processing_error(
+            example_id, metric_metadata
+        )
       else:
-
+        self._update_metric_values(example_id, metric_metadata)
+
+  @abc.abstractmethod
+  def _compute_metric_metadata(
+      self,
+      example: example_lib.Example
+  ) -> dict[str, Any]:
+    """Computes the metric metadata for the example."""
 
-  def
+  def _compute_metric_metadata_with_processing_error(
+      self,
+      example: example_lib.Example
+  ) -> dict[str, Any]:
     """Audits the evaluation example after processing."""
     assert example.error is not None
-
-    if tag.startswith('MappingError'):
-      self.oop_errors.add(example.id, 1)
-    else:
-      self.non_oop_errors.add(example.id, 1)
-    self._error_breakdown[tag].append(example.id)
-    return dict(error=tag)
+    return dict(error=example.error.tag)
 
   @abc.abstractmethod
-  def
-    """
+  def _update_metric_values(self, metadata: dict[str, Any]) -> None:
+    """Update metric values based metric metadata."""
+
+  def _update_metric_values_with_processing_error(
+      self,
+      example_id: int,
+      metric_metadata: dict[str, Any]
+  ) -> None:
+    """Updates metric values with processing error."""
+    error_tag = metric_metadata.get('error')
+    assert error_tag is not None, (example_id, metric_metadata)
+    self._error_breakdown[error_tag].append(example_id)
+    if error_tag.startswith('MappingError'):
+      self.oop_errors.add(example_id, 1)
+    else:
+      self.non_oop_errors.add(example_id, 1)
+    self._error_breakdown[error_tag].append(example_id)
 
   def _oop_errors_breakdown(self) -> str | None:
     """Returns the OOP error breakdown as a string."""
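
A hypothetical metric built on the `MetricBase` contract above: subclasses supply `_compute_metric_metadata` and `_update_metric_values`, while error accounting is inherited. The `NonEmpty` metric itself is illustrative only; the field declaration style and `values()` contents mirror the `Match` metric below.

```python
from langfun.core.eval.v2 import metrics


class NonEmpty(metrics.MetricBase):
  """Rate of examples whose output is non-empty (illustrative)."""

  name = 'non_empty'
  non_empty: metrics.Rate = metrics.Rate()

  def _compute_metric_metadata(self, example):
    # Called when no metadata for this metric is cached on the example.
    return dict(non_empty=bool(str(example.output).strip()))

  def _update_metric_values(self, example_id, metadata):
    # Folds the per-example metadata into the aggregate Rate.
    self.non_empty.add(example_id, 1.0 if metadata['non_empty'] else 0.0)

  def values(self):
    return [self.non_empty, self.oop_errors, self.non_oop_errors]
```
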
@@ -229,7 +310,13 @@ class MetricBase(Metric):
 
 
 class Match(MetricBase):
-  """Metric for matching outputs against
+  """Metric for matching outputs against ground truth.
+
+  This metric computes match and mismatch rates by comparing the output of
+  an example with its ground truth. By default, it looks for a `groundtruth`
+  attribute in `example.input` for comparison. Users can customize this behavior
+  by subclassing `Match` and overriding the `match` method.
+  """
 
   name = 'match'
   matches: Rate = Rate()
@@ -257,20 +344,30 @@ class Match(MetricBase):
     )
     return pg.eq(output, groundtruth)
 
-  def
-
+  def _compute_metric_metadata(
+      self, example: example_lib.Example
+  ) -> dict[str, Any]:
+    """Computes the metric metadata for the example."""
     metadata = {}
-
-    if isinstance(
-
-
-
-      metadata['match'] = True
-    else:
-      self.mismatches.add(example.id, 1)
-      metadata['mismatch'] = True
+    is_correct = self.match(example.input, example.output)
+    if isinstance(is_correct, tuple):
+      is_correct, metadata = is_correct
+
+    metadata['is_correct'] = is_correct
     return metadata
 
+  def _update_metric_values(
+      self, example_id: int, metadata: dict[str, Any]
+  ) -> None:
+    """Update metric values based metric metadata."""
+    is_correct = metadata.get('is_correct')
+    assert is_correct is not None, (example_id, metadata)
+    if is_correct:
+      self.matches.add(example_id, 1)
+    else:
+      assert not is_correct
+      self.mismatches.add(example_id, 1)
+
   def values(self) -> list[metric_values.MetricValue]:
     """Returns all the values computed by this metric."""
     return [
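
Per the `Match` docstring and the tuple handling in `_compute_metric_metadata` above, `match` may return either a bool or a `(bool, metadata)` tuple. A hypothetical override (parameter names are positional stand-ins; the default implementation compares against `example.input.groundtruth`):

```python
from langfun.core.eval.v2 import metrics


class CaseInsensitiveMatch(metrics.Match):
  """Illustrative: compare output and groundtruth case-insensitively."""

  def match(self, example_input, output):
    expected = example_input.groundtruth
    return (
        str(output).lower() == str(expected).lower(),
        dict(expected=expected),
    )
```
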
@@ -302,7 +399,14 @@ class Match(MetricBase):
 
 
 class Score(MetricBase):
-  """Base class for scoring.
+  """Base class for scoring metrics.
+
+  `Score` is a base class for metrics that assign a numerical score to each
+  example's output (e.g., evaluating quality on a scale of 1-5).
+  It automatically computes the average score across all examples.
+  Subclasses must implement the `score` method to define how an example
+  should be scored.
+  """
 
   name = 'score'
   average_score: Average = Average()
@@ -322,16 +426,25 @@ class Score(MetricBase):
       A float score. Or a tuple of (score, metadata).
     """
 
-  def
-
+  def _compute_metric_metadata(
+      self, example: example_lib.Example
+  ) -> dict[str, Any]:
+    """Computes the metric metadata for the example."""
     metadata = {}
     score = self.score(example.input, example.output)
     if isinstance(score, tuple):
       score, metadata = score
-    self.average_score.add(example.id, score)
     metadata['score'] = score
     return metadata
 
+  def _update_metric_values(
+      self, example_id: int, metadata: dict[str, Any]
+  ) -> None:
+    """Update metric values based metric metadata."""
+    score = metadata.get('score')
+    assert score is not None, (example_id, metadata)
+    self.average_score.add(example_id, score)
+
   def values(self) -> list[metric_values.MetricValue]:
     """Returns all the values computed by this metric."""
     return [