langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512150805__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to a supported registry, and is provided for informational purposes only.
- langfun/__init__.py +1 -1
- langfun/core/__init__.py +7 -1
- langfun/core/agentic/__init__.py +8 -1
- langfun/core/agentic/action.py +740 -112
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +189 -24
- langfun/core/async_support.py +104 -5
- langfun/core/async_support_test.py +23 -0
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +9 -2
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +11 -2
- langfun/core/data/conversion/gemini_test.py +48 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +48 -44
- langfun/core/eval/base_test.py +5 -5
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +3 -0
- langfun/core/eval/v2/checkpointing.py +148 -46
- langfun/core/eval/v2/checkpointing_test.py +9 -2
- langfun/core/eval/v2/config_saver.py +37 -0
- langfun/core/eval/v2/config_saver_test.py +36 -0
- langfun/core/eval/v2/eval_test_helper.py +104 -3
- langfun/core/eval/v2/evaluation.py +102 -19
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +50 -40
- langfun/core/eval/v2/example_test.py +16 -8
- langfun/core/eval/v2/experiment.py +95 -20
- langfun/core/eval/v2/experiment_test.py +19 -0
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +31 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking.py +13 -5
- langfun/core/eval/v2/progress_tracking_test.py +9 -1
- langfun/core/eval/v2/reporting.py +88 -71
- langfun/core/eval/v2/reporting_test.py +24 -6
- langfun/core/eval/v2/runners/__init__.py +30 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +73 -180
- langfun/core/eval/v2/runners/beam.py +354 -0
- langfun/core/eval/v2/runners/beam_test.py +153 -0
- langfun/core/eval/v2/runners/ckpt_monitor.py +350 -0
- langfun/core/eval/v2/runners/ckpt_monitor_test.py +213 -0
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +76 -0
- langfun/core/eval/v2/runners/parallel.py +243 -0
- langfun/core/eval/v2/runners/parallel_test.py +182 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +169 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +7 -5
- langfun/core/language_model.py +189 -36
- langfun/core/language_model_test.py +54 -3
- langfun/core/llms/__init__.py +14 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +90 -12
- langfun/core/llms/gemini_test.py +110 -0
- langfun/core/llms/google_genai.py +52 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +120 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +16 -1
- langfun/core/llms/vertexai.py +78 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/__init__.py +10 -0
- langfun/core/mcp/client.py +177 -0
- langfun/core/mcp/client_test.py +71 -0
- langfun/core/mcp/session.py +241 -0
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/testing/simple_mcp_client.py +33 -0
- langfun/core/mcp/testing/simple_mcp_server.py +33 -0
- langfun/core/mcp/tool.py +254 -0
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +73 -3
- langfun/core/modalities/image_test.py +116 -0
- langfun/core/modalities/mime.py +78 -4
- langfun/core/modalities/mime_test.py +59 -0
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +230 -154
- langfun/core/structured/querying_test.py +69 -33
- langfun/core/structured/schema/__init__.py +49 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +175 -50
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +43 -0
- langfun/env/base_environment.py +827 -0
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +304 -0
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +842 -0
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +14 -0
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +472 -0
- langfun/env/event_handlers/event_logger_test.py +304 -0
- langfun/env/event_handlers/metric_writer.py +726 -0
- langfun/env/event_handlers/metric_writer_test.py +214 -0
- langfun/env/interface.py +1640 -0
- langfun/env/interface_test.py +153 -0
- langfun/env/load_balancers.py +59 -0
- langfun/env/load_balancers_test.py +141 -0
- langfun/env/test_utils.py +507 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/METADATA +7 -3
- langfun-0.1.2.dev202512150805.dist-info/RECORD +217 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/metric_values.py
CHANGED

@@ -20,7 +20,15 @@ import pyglove as pg


 class MetricValue(pg.Object):
-  """Base class for metric values.
+  """Base class for metric values.
+
+  `MetricValue` is the base class for representing aggregated metric values
+  in an evaluation. It accumulates data points from individual examples,
+  each consisting of a value and an optional weight, associated with an example
+  ID. Subclasses must implement `reduce` method to compute a single float value
+  from accumulated data points, and `scalar_repr` to provide a string
+  representation of the reduced value.
+  """

   class DataPoint(pg.Object):
     """A data point for a metric value."""

@@ -88,6 +96,14 @@ class MetricValue(pg.Object):
     self.increment_total()
     return self

+  def merge_from(self, other: 'MetricValue') -> 'MetricValue':
+    """Merges the values from another metric value."""
+    self._weighted_sum += other._weighted_sum  # pylint: disable=protected-access
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      self.data_points.extend(other.data_points)
+    self.increment_total(other.total)
+    return self
+
   def __gt__(self, other: Union['MetricValue', float]) -> bool:
     if isinstance(other, self.__class__):
       return float(self) > float(other)

@@ -133,7 +149,13 @@ class MetricValue(pg.Object):


 class Rate(MetricValue):
-  """
+  """Metric value representing a rate in range [0, 1].
+
+  `Rate` is used for metrics that compute a rate, such as accuracy or error
+  rate. The final value is computed as the weighted sum of accumulated values
+  divided by the total number of examples. It's displayed as a percentage
+  (e.g., 90.0%).
+  """

   def reduce(self) -> float:
     return self._weighted_sum / self.total

@@ -145,7 +167,13 @@ class Rate(MetricValue):


 class Average(MetricValue):
-  """
+  """Metric value representing an average of accumulated values.
+
+  `Average` is used for metrics that compute an average score across examples
+  (e.g., average quality score). The final value is computed as the weighted
+  sum of accumulated values divided by the number of data points.
+  It's displayed as a float with 3 decimal places (e.g., 4.750).
+  """

   def reduce(self) -> float:
     if not self.data_points:

langfun/core/eval/v2/metric_values_test.py
CHANGED

@@ -51,6 +51,22 @@ class RateTest(unittest.TestCase):
     self.assertEqual(rate.total, 0)
     self.assertTrue(math.isnan(float(rate)))

+  def test_merge_from(self):
+    rate1 = metric_values.Rate()
+    rate1.add(1, 1.0, 1.0, increment_total=True)
+    rate2 = metric_values.Rate()
+    rate2.add(2, 0.0, 1.0, increment_total=True)
+    rate1.merge_from(rate2)
+    self.assertEqual(rate1.total, 2)
+    self.assertEqual(float(rate1), 0.5)
+    self.assertEqual(
+        rate1.data_points,
+        [
+            metric_values.MetricValue.DataPoint(1, 1.0, 1.0),
+            metric_values.MetricValue.DataPoint(2, 0.0, 1.0),
+        ],
+    )
+

 class AverageTest(unittest.TestCase):

@@ -75,6 +91,22 @@ class AverageTest(unittest.TestCase):
     average.reset()
     self.assertEqual(average.total, 0)

+  def test_merge_from(self):
+    avg1 = metric_values.Average()
+    avg1.add(1, 1.0, 0.5, increment_total=True)
+    avg2 = metric_values.Average()
+    avg2.add(2, 0.0, 1.0, increment_total=True)
+    avg1.merge_from(avg2)
+    self.assertEqual(avg1.total, 2)
+    self.assertEqual(float(avg1), 0.25)
+    self.assertEqual(
+        avg1.data_points,
+        [
+            metric_values.MetricValue.DataPoint(1, 1.0, 0.5),
+            metric_values.MetricValue.DataPoint(2, 0.0, 1.0),
+        ],
+    )
+

 if __name__ == '__main__':
   unittest.main()
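The new `merge_from` API above lets partial aggregates be combined, for example when metric values are computed on separate evaluation shards. A minimal sketch of that pattern, assuming only the `add(example_id, value, weight, increment_total=...)` and `merge_from(...)` calls exercised in the tests above; the shard split itself is illustrative:

```python
# Sketch: folding per-shard Rate values into one aggregate. Only the
# add()/merge_from() calls shown in the tests above are assumed; the
# shard partitioning is illustrative.
from langfun.core.eval.v2 import metric_values

shard_results = [
    [(1, 1.0), (2, 0.0)],  # (example_id, value) pairs from shard 1.
    [(3, 1.0), (4, 1.0)],  # ... and from shard 2.
]

shard_rates = []
for shard in shard_results:
  rate = metric_values.Rate()
  for example_id, value in shard:
    rate.add(example_id, value, 1.0, increment_total=True)
  shard_rates.append(rate)

# Fold all shards into the first one.
merged = shard_rates[0]
for other in shard_rates[1:]:
  merged.merge_from(other)

print(float(merged))  # 0.75: weighted sum (3.0) / total examples (4).
```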
langfun/core/eval/v2/metrics.py
CHANGED
@@ -29,7 +29,15 @@ Average = metric_values.Average


 class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
-  """Interface for an evaluation metric.
+  """Interface for an evaluation metric.
+
+  A metric is used to evaluate the quality of the outputs produced by an
+  evaluation. It works by auditing each processed example via its `audit`
+  method, which in turn calls the user-overridable `_audit` method to perform
+  metric-specific logic and update metric values. Metrics can compute multiple
+  values (e.g., precision, recall, F1 score) which are exposed via the
+  `values` method.
+  """

   name: Annotated[
       str,

@@ -44,24 +52,43 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
     self._label_group = None
     self._lock = threading.Lock()

-  def
-
-
-
-
-    with
-      for v in self.values():
-        v.increment_total()
+  def update(
+      self,
+      example: example_lib.Example,
+      force_recompute: bool = False
+  ) -> dict[str, Any]:
+    """Updates metric values with a processed example.

-
+    Args:
+      example: The processed example.
+      force_recompute: Whether to force recompute the metric metadata even if
+        they are already present.

-
-
+    Returns:
+      A dict of metric metadata.
+    """
+    if (force_recompute
+        or example.metric_metadata is None
+        or self.name not in example.metric_metadata):
+      metadata = self.compute_metric_metadata(example)
+    else:
+      metadata = example.metric_metadata[self.name]
+    self.update_metric_values(example.id, metadata)
+    self._update_view()
+    return metadata

   @abc.abstractmethod
-  def
+  def compute_metric_metadata(
+      self, example: example_lib.Example
+  ) -> dict[str, Any]:
     """Subclasses should override this method to implement the metric logic."""

+  @abc.abstractmethod
+  def update_metric_values(
+      self, example_id: int, metric_metadata: dict[str, Any]
+  ) -> None:
+    """Update metric values based on metric metadata."""
+
   @abc.abstractmethod
   def values(self) -> list[metric_values.MetricValue]:
     """Returns all the values computed by this metric."""

@@ -71,6 +98,12 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
     for v in self.values():
       v.reset()

+  def merge_from(self, other: 'Metric') -> 'Metric':
+    """Merges the values from another metric."""
+    for v1, v2 in zip(self.values(), other.values()):
+      v1.merge_from(v2)
+    return self
+
   def _update_view(self):
     """Refreshes the metric values."""
     if self._label_group is None:

@@ -169,7 +202,15 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):


 class MetricBase(Metric):
-  """Base class for common metrics.
+  """Base class for common metrics.
+
+  `MetricBase` provides common functionalities for metrics, such as automatic
+  error counting based on whether an example has an error during evaluation.
+  It distinguishes between Object-Oriented Programming (OOP) errors
+  (e.g. `MappingError` during structured output generation) and other errors.
+  Subclasses should implement `_audit_processed` for metric computation on
+  successfully processed examples.
+  """

   oop_errors: Rate | None = Rate()
   non_oop_errors: Rate | None = Rate()

@@ -183,27 +224,67 @@ class MetricBase(Metric):
     super().reset()
     self._error_breakdown = collections.defaultdict(list)

-  def
-
+  def compute_metric_metadata(
+      self, example: example_lib.Example
+  ) -> dict[str, Any]:
+    """Computes the metric metadata for the example."""
     if example.error is None:
-      return self.
+      return self._compute_metric_metadata(example)
+    return self._compute_metric_metadata_with_processing_error(example)
+
+  def update_metric_values(
+      self,
+      example_id: int,
+      metric_metadata: dict[str, Any]
+  ) -> None:
+    """Collects the metric metadata."""
+    # NOTE(daiyip): the metric values are being updated concurrently, so we
+    # uses a lock to avoid race condition. We might consider relaxing the lock
+    # later if metric auditing becomes a bottleneck.
+    with self._lock:
+      for v in self.values():
+        v.increment_total()
+
+      if 'error' in metric_metadata:
+        self._update_metric_values_with_processing_error(
+            example_id, metric_metadata
+        )
       else:
-
+        self._update_metric_values(example_id, metric_metadata)
+
+  @abc.abstractmethod
+  def _compute_metric_metadata(
+      self,
+      example: example_lib.Example
+  ) -> dict[str, Any]:
+    """Computes the metric metadata for the example."""

-  def
+  def _compute_metric_metadata_with_processing_error(
+      self,
+      example: example_lib.Example
+  ) -> dict[str, Any]:
     """Audits the evaluation example after processing."""
     assert example.error is not None
-
-    if tag.startswith('MappingError'):
-      self.oop_errors.add(example.id, 1)
-    else:
-      self.non_oop_errors.add(example.id, 1)
-    self._error_breakdown[tag].append(example.id)
-    return dict(error=tag)
+    return dict(error=example.error.tag)

   @abc.abstractmethod
-  def
-  """
+  def _update_metric_values(self, metadata: dict[str, Any]) -> None:
+    """Update metric values based metric metadata."""
+
+  def _update_metric_values_with_processing_error(
+      self,
+      example_id: int,
+      metric_metadata: dict[str, Any]
+  ) -> None:
+    """Updates metric values with processing error."""
+    error_tag = metric_metadata.get('error')
+    assert error_tag is not None, (example_id, metric_metadata)
+    self._error_breakdown[error_tag].append(example_id)
+    if error_tag.startswith('MappingError'):
+      self.oop_errors.add(example_id, 1)
+    else:
+      self.non_oop_errors.add(example_id, 1)
+    self._error_breakdown[error_tag].append(example_id)

   def _oop_errors_breakdown(self) -> str | None:
     """Returns the OOP error breakdown as a string."""

@@ -229,7 +310,13 @@ class MetricBase(Metric):


 class Match(MetricBase):
-  """Metric for matching outputs against
+  """Metric for matching outputs against ground truth.
+
+  This metric computes match and mismatch rates by comparing the output of
+  an example with its ground truth. By default, it looks for a `groundtruth`
+  attribute in `example.input` for comparison. Users can customize this behavior
+  by subclassing `Match` and overriding the `match` method.
+  """

   name = 'match'
   matches: Rate = Rate()

@@ -257,20 +344,30 @@ class Match(MetricBase):
     )
     return pg.eq(output, groundtruth)

-  def
-
+  def _compute_metric_metadata(
+      self, example: example_lib.Example
+  ) -> dict[str, Any]:
+    """Computes the metric metadata for the example."""
     metadata = {}
-
-    if isinstance(
-
-
-
-      metadata['match'] = True
-    else:
-      self.mismatches.add(example.id, 1)
-      metadata['mismatch'] = True
+    is_correct = self.match(example.input, example.output)
+    if isinstance(is_correct, tuple):
+      is_correct, metadata = is_correct
+
+    metadata['is_correct'] = is_correct
     return metadata

+  def _update_metric_values(
+      self, example_id: int, metadata: dict[str, Any]
+  ) -> None:
+    """Update metric values based metric metadata."""
+    is_correct = metadata.get('is_correct')
+    assert is_correct is not None, (example_id, metadata)
+    if is_correct:
+      self.matches.add(example_id, 1)
+    else:
+      assert not is_correct
+      self.mismatches.add(example_id, 1)
+
   def values(self) -> list[metric_values.MetricValue]:
     """Returns all the values computed by this metric."""
     return [

@@ -302,7 +399,14 @@ class Match(MetricBase):


 class Score(MetricBase):
-  """Base class for scoring.
+  """Base class for scoring metrics.
+
+  `Score` is a base class for metrics that assign a numerical score to each
+  example's output (e.g., evaluating quality on a scale of 1-5).
+  It automatically computes the average score across all examples.
+  Subclasses must implement the `score` method to define how an example
+  should be scored.
+  """

   name = 'score'
   average_score: Average = Average()

@@ -322,16 +426,25 @@ class Score(MetricBase):
       A float score. Or a tuple of (score, metadata).
     """

-  def
-
+  def _compute_metric_metadata(
+      self, example: example_lib.Example
+  ) -> dict[str, Any]:
+    """Computes the metric metadata for the example."""
     metadata = {}
     score = self.score(example.input, example.output)
     if isinstance(score, tuple):
       score, metadata = score
-    self.average_score.add(example.id, score)
     metadata['score'] = score
     return metadata

+  def _update_metric_values(
+      self, example_id: int, metadata: dict[str, Any]
+  ) -> None:
+    """Update metric values based metric metadata."""
+    score = metadata.get('score')
+    assert score is not None, (example_id, metadata)
+    self.average_score.add(example_id, score)
+
   def values(self) -> list[metric_values.MetricValue]:
     """Returns all the values computed by this metric."""
     return [
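The refactor above splits the old per-example audit path in two: `compute_metric_metadata` derives a metadata dict for one example, and `update_metric_values` folds that metadata into the aggregated values under a lock, with `update()` reusing metadata already stored on the example unless `force_recompute=True`. User-defined metrics still only override `match` or `score`. A minimal sketch of a custom `Score` under this API; the `budget` input field and the length-based scoring rule are hypothetical:

```python
# Sketch: a custom scoring metric on top of the refactored Metric API.
# score() follows the contract shown above (a float, or a (float, metadata)
# tuple); the `budget` field is hypothetical.
import pyglove as pg
from langfun.core.eval.v2 import example as example_lib
from langfun.core.eval.v2 import metrics


class LengthScore(metrics.Score):
  """Scores an output by its length relative to a per-example budget."""

  def score(self, example_input, output):
    budget = example_input.budget
    value = min(len(str(output)) / budget, 1.0)
    # The extra dict is merged into the example's metric metadata.
    return value, dict(budget=budget)


m = LengthScore()
metadata = m.update(
    example_lib.Example(id=1, input=pg.Dict(budget=10), output='hello')
)
print(metadata)                # {'budget': 10, 'score': 0.5}
print(float(m.average_score))  # 0.5
```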
langfun/core/eval/v2/metrics_test.py
CHANGED

@@ -25,15 +25,22 @@ class MatchTest(unittest.TestCase):
   def test_basic(self):
     m = metrics.Match()  # pylint: disable=invalid-name
     self.assertEqual(
-        m.
-        dict(
+        m.update(Example(id=1, input=pg.Dict(groundtruth=1), output=1)),
+        dict(is_correct=True)
     )
     self.assertEqual(
-        m.
-
+        m.update(
+            Example(
+                id=2,
+                input=pg.Dict(groundtruth=1),
+                output=2,
+                metric_metadata=dict(match=dict(is_correct=False, x=1))
+            )
+        ),
+        dict(is_correct=False, x=1)
     )
     self.assertEqual(
-        m.
+        m.update(
         Example(
             id=3,
             input=pg.Dict(groundtruth=1),

@@ -47,7 +54,7 @@ class MatchTest(unittest.TestCase):
         dict(error='ValueError')
     )
     self.assertEqual(
-        m.
+        m.update(
         Example(
             id=3,
             input=pg.Dict(groundtruth=1),

@@ -80,7 +87,7 @@ class MatchTest(unittest.TestCase):
   def test_bad_case(self):
     m = metrics.Match()  # pylint: disable=invalid-name
     with self.assertRaisesRegex(ValueError, '`groundtruth` is not present'):
-      m.
+      m.update(Example(id=1, input=pg.Dict(x=1), output=1))

   def test_custom_metadata(self):

@@ -90,22 +97,36 @@ class MatchTest(unittest.TestCase):

     m = MyMatch()  # pylint: disable=invalid-name
     self.assertEqual(
-        m.
-        dict(
+        m.update(Example(id=1, input=pg.Dict(x=1), output=1)),
+        dict(is_correct=True, x=1)
     )
     self.assertEqual(m.matches, 1.0)

   def test_html_view(self):
     m = metrics.Match()  # pylint: disable=invalid-name
-    m.
+    m.update(Example(id=1, input=pg.Dict(groundtruth=1), output=1))
     self.assertIn(
         '100.0%',
         m.to_html().content,
     )
     with pg.views.html.controls.HtmlControl.track_scripts() as scripts:
-      m.
+      m.update(Example(id=2, input=pg.Dict(groundtruth=1), output=2))
     self.assertEqual(len(scripts), 12)

+  def test_merge_from(self):
+    m1 = metrics.Match()
+    m1.update(Example(id=1, input=pg.Dict(groundtruth=1), output=1))
+    m2 = metrics.Match()
+    m2.update(Example(id=2, input=pg.Dict(groundtruth=1), output=2))
+    m1.merge_from(m2)
+    self.assertEqual(m1.matches, 0.5)
+    self.assertEqual(m1.mismatches, 0.5)
+    self.assertEqual(m1.oop_errors, 0.0)
+    self.assertEqual(m1.non_oop_errors, 0.0)
+    self.assertEqual(m1.matches.total, 2)
+    self.assertEqual(len(m1.matches.data_points), 1)
+    self.assertEqual(len(m1.mismatches.data_points), 1)
+

 class ScoreTest(unittest.TestCase):

@@ -118,15 +139,15 @@ class ScoreTest(unittest.TestCase):

     m = MyScore()  # pylint: disable=invalid-name
     self.assertEqual(
-        m.
+        m.update(Example(id=1, input=pg.Dict(x=1), output=1)),
         dict(score=1 * 1)
     )
     self.assertEqual(
-        m.
+        m.update(Example(id=2, input=pg.Dict(x=2), output=2)),
         dict(score=2 * 2)
     )
     self.assertEqual(
-        m.
+        m.update(
         Example(
             id=3,
             input=pg.Dict(x=1),

@@ -140,7 +161,7 @@ class ScoreTest(unittest.TestCase):
         dict(error='ValueError')
     )
     self.assertEqual(
-        m.
+        m.update(
         Example(
             id=3,
             input=pg.Dict(x=1),

@@ -176,7 +197,7 @@ class ScoreTest(unittest.TestCase):

     m = MyScore()  # pylint: disable=invalid-name
     self.assertEqual(
-        m.
+        m.update(Example(id=1, input=pg.Dict(x=1), output=1)),
         dict(score=1 * 1, x=1)
     )
     self.assertEqual(m.average_score, 1.0)

@@ -189,13 +210,13 @@ class ScoreTest(unittest.TestCase):
         return example_input.x * output

     m = MyScore()  # pylint: disable=invalid-name
-    m.
+    m.update(Example(id=1, input=pg.Dict(x=1), output=2))
     self.assertIn(
         '2.000',
         m.to_html().content,
     )
     with pg.views.html.controls.HtmlControl.track_scripts() as scripts:
-      m.
+      m.update(Example(id=2, input=pg.Dict(x=1), output=2))
     self.assertEqual(len(scripts), 9)

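As the `Match` docstring above notes, the comparison can be customized by overriding `match`, which (like `score`) may return either a bool or a `(bool, metadata)` tuple; the confirmed call site is `self.match(example.input, example.output)` in the metrics.py hunk above. A minimal sketch of that customization point; the `expected` field and the case-insensitive rule are hypothetical:

```python
# Sketch: customizing Match by overriding match(). The `expected` input
# field and the case-insensitive comparison are hypothetical.
import pyglove as pg
from langfun.core.eval.v2 import example as example_lib
from langfun.core.eval.v2 import metrics


class CaseInsensitiveMatch(metrics.Match):
  """Matches string outputs against `expected`, ignoring case."""

  def match(self, example_input, output):
    expected = example_input.expected
    is_correct = str(output).lower() == str(expected).lower()
    # The second element is merged into the example's metric metadata.
    return is_correct, dict(expected=expected)


m = CaseInsensitiveMatch()
m.update(
    example_lib.Example(id=1, input=pg.Dict(expected='Paris'), output='paris')
)
print(float(m.matches))  # 1.0
```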
langfun/core/eval/v2/progress.py
CHANGED
@@ -21,7 +21,15 @@ import pyglove as pg


 class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
-  """
+  """Represents and tracks the progress of an evaluation.
+
+  The `Progress` class maintains counts of processed, failed, and skipped
+  items in an evaluation, along with timing information (start time, stop time,
+  duration) and an execution summary. It provides properties to check the
+  status of the evaluation (e.g., `is_started`, `is_completed`) and methods
+  to update progress as items are evaluated.
+  It also supports HTML rendering as a progress bar for visualization.
+  """

   num_total: Annotated[
       int | None,

@@ -84,6 +92,7 @@ class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
         stop_time=None,
         execution_summary=pg.object_utils.TimeIt.StatusSummary(),
     )
+    self._progress_bar = None

   @property
   def num_completed(self) -> int:

@@ -216,6 +225,27 @@ class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
     """Overrides nondefault values so volatile values are not included."""
     return dict()

+  def merge_from(self, other: 'Progress') -> None:
+    """Merges the progress from another progress."""
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      if other.start_time is not None and (
+          self.start_time is None or self.start_time > other.start_time):
+        self.start_time = other.start_time
+
+      if other.stop_time is not None and (
+          self.stop_time is None or self.stop_time < other.stop_time):
+        self.stop_time = other.stop_time
+
+      if other.num_total is not None:
+        if self.num_total is None:
+          self.num_total = other.num_total
+        else:
+          assert self.num_total == other.num_total, (self, other)
+      self.num_processed += other.num_processed
+      self.num_failed += other.num_failed
+      self.num_skipped += other.num_skipped
+      self.execution_summary.aggregate(other.execution_summary.breakdown)
+
   #
   # HTML view.
   #
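Paired with `merge_from`, a single `Progress` view can be assembled from per-worker progress objects of the same run: counters are summed, the earliest start time and latest stop time win, and `num_total` must agree. A minimal sketch with an illustrative two-shard split; the API calls mirror the `test_merge_from` case in progress_test.py below:

```python
# Sketch: aggregating the progress of two workers that evaluate disjoint
# halves of the same 10-example run. The shard split is illustrative; the
# calls mirror test_merge_from in progress_test.py below.
from langfun.core.eval.v2.progress import Progress


def run_shard(num_items: int, num_failures: int) -> Progress:
  p = Progress()
  p.start(10)  # Total for the whole run, shared by both shards.
  for i in range(num_items):
    if i < num_failures:
      p.increment_failed()
    else:
      p.increment_processed()
  p.stop()
  return p


overall = run_shard(5, 0)
overall.merge_from(run_shard(5, 1))
print(overall.num_total)      # 10
print(overall.num_processed)  # 9
print(overall.num_failed)     # 1
print(overall.num_completed)  # 10
```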
langfun/core/eval/v2/progress_test.py
CHANGED

@@ -77,6 +77,33 @@ class ProgressTest(unittest.TestCase):
     self.assertTrue(p.is_stopped)
     self.assertIsNotNone(p.stop_time_str)

+  def test_merge_from(self):
+    p1 = Progress()
+    p1.start(10)
+    p1.increment_processed()
+    p1.increment_failed()
+    p1.stop()
+
+    p2 = Progress()
+    p2.start(10)
+    p2.increment_skipped()
+    p2.stop()
+
+    with pg.allow_writable_accessors(True):
+      p1.start_time = 2.0
+      p1.stop_time = 4.0
+      p2.start_time = 1.0
+      p2.stop_time = 5.0
+
+    p1.merge_from(p2)
+    self.assertEqual(p1.num_total, 10)
+    self.assertEqual(p1.num_processed, 1)
+    self.assertEqual(p1.num_failed, 1)
+    self.assertEqual(p1.num_skipped, 1)
+    self.assertEqual(p1.num_completed, 3)
+    self.assertEqual(p1.start_time, 1.0)
+    self.assertEqual(p1.stop_time, 5.0)
+

 if __name__ == '__main__':
   unittest.main()
langfun/core/eval/v2/progress_tracking.py
CHANGED

@@ -14,6 +14,7 @@
 """Tracking evaluation run progress."""

 import os
+from typing import Literal
 import langfun.core as lf
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import experiment as experiment_lib

@@ -24,16 +25,24 @@ Experiment = experiment_lib.Experiment
 Example = example_lib.Example


-def progress_tracker(
+def progress_tracker(
+    tracker_type: Literal['tqdm', 'html', 'auto'] = 'auto'
+) -> experiment_lib.Plugin:
   """Creates a progress tracker as a plugin.

   Args:
-
+    tracker_type: The type of progress tracker to use.
+      If `tqdm`, force using tqdm for progress update.
+      If `html`, force using html for progress update.
+      If `auto`, determine it automatically based on the running
+      environment (console vs. notebook)

   Returns:
     The progress tracker plugin.
   """
-  if tqdm or
+  if tracker_type == 'tqdm' or (
+      tracker_type == 'auto' and not lf.console.under_notebook()
+  ):
     return _TqdmProgressTracker()
   else:
     return _HtmlProgressTracker()

@@ -88,8 +97,7 @@ class _TqdmProgressTracker(experiment_lib.Plugin):
     self._leaf_progresses = {
         leaf.id: lf.concurrent.ProgressBar.install(
             label=f'[#{i + 1} - {leaf.id}]',
-            total=
-            if runner.current_run.example_ids else leaf.num_examples),
+            total=len(runner.current_run.examples_to_evaluate(leaf)),
             color='cyan',
             status=None
         )