langfun 0.1.2.dev202509020804__py3-none-any.whl → 0.1.2.dev202511110805__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (133)
  1. langfun/__init__.py +1 -1
  2. langfun/core/__init__.py +6 -1
  3. langfun/core/agentic/__init__.py +4 -0
  4. langfun/core/agentic/action.py +412 -103
  5. langfun/core/agentic/action_eval.py +9 -2
  6. langfun/core/agentic/action_test.py +68 -6
  7. langfun/core/async_support.py +104 -5
  8. langfun/core/async_support_test.py +23 -0
  9. langfun/core/coding/python/correction.py +19 -9
  10. langfun/core/coding/python/execution.py +14 -12
  11. langfun/core/coding/python/generation.py +21 -16
  12. langfun/core/coding/python/sandboxing.py +23 -3
  13. langfun/core/component.py +42 -3
  14. langfun/core/concurrent.py +70 -6
  15. langfun/core/concurrent_test.py +9 -2
  16. langfun/core/console.py +1 -1
  17. langfun/core/data/conversion/anthropic.py +12 -3
  18. langfun/core/data/conversion/anthropic_test.py +8 -6
  19. langfun/core/data/conversion/gemini.py +9 -2
  20. langfun/core/data/conversion/gemini_test.py +12 -9
  21. langfun/core/data/conversion/openai.py +145 -31
  22. langfun/core/data/conversion/openai_test.py +161 -17
  23. langfun/core/eval/base.py +47 -43
  24. langfun/core/eval/base_test.py +4 -4
  25. langfun/core/eval/matching.py +5 -2
  26. langfun/core/eval/patching.py +3 -3
  27. langfun/core/eval/scoring.py +4 -3
  28. langfun/core/eval/v2/__init__.py +1 -0
  29. langfun/core/eval/v2/checkpointing.py +30 -4
  30. langfun/core/eval/v2/eval_test_helper.py +1 -1
  31. langfun/core/eval/v2/evaluation.py +60 -14
  32. langfun/core/eval/v2/example.py +22 -11
  33. langfun/core/eval/v2/experiment.py +51 -8
  34. langfun/core/eval/v2/metric_values.py +31 -3
  35. langfun/core/eval/v2/metric_values_test.py +32 -0
  36. langfun/core/eval/v2/metrics.py +39 -4
  37. langfun/core/eval/v2/metrics_test.py +14 -0
  38. langfun/core/eval/v2/progress.py +30 -1
  39. langfun/core/eval/v2/progress_test.py +27 -0
  40. langfun/core/eval/v2/progress_tracking_test.py +6 -0
  41. langfun/core/eval/v2/reporting.py +90 -71
  42. langfun/core/eval/v2/reporting_test.py +20 -6
  43. langfun/core/eval/v2/runners.py +27 -7
  44. langfun/core/eval/v2/runners_test.py +3 -0
  45. langfun/core/langfunc.py +45 -130
  46. langfun/core/langfunc_test.py +6 -4
  47. langfun/core/language_model.py +151 -31
  48. langfun/core/language_model_test.py +9 -3
  49. langfun/core/llms/__init__.py +12 -1
  50. langfun/core/llms/anthropic.py +157 -2
  51. langfun/core/llms/azure_openai.py +29 -17
  52. langfun/core/llms/cache/base.py +25 -3
  53. langfun/core/llms/cache/in_memory.py +48 -7
  54. langfun/core/llms/cache/in_memory_test.py +14 -4
  55. langfun/core/llms/compositional.py +25 -1
  56. langfun/core/llms/deepseek.py +30 -2
  57. langfun/core/llms/fake.py +39 -1
  58. langfun/core/llms/fake_test.py +9 -0
  59. langfun/core/llms/gemini.py +43 -7
  60. langfun/core/llms/google_genai.py +34 -1
  61. langfun/core/llms/groq.py +28 -3
  62. langfun/core/llms/llama_cpp.py +23 -4
  63. langfun/core/llms/openai.py +93 -3
  64. langfun/core/llms/openai_compatible.py +148 -27
  65. langfun/core/llms/openai_compatible_test.py +207 -20
  66. langfun/core/llms/openai_test.py +0 -2
  67. langfun/core/llms/rest.py +16 -1
  68. langfun/core/llms/vertexai.py +59 -8
  69. langfun/core/logging.py +1 -1
  70. langfun/core/mcp/__init__.py +10 -0
  71. langfun/core/mcp/client.py +177 -0
  72. langfun/core/mcp/client_test.py +71 -0
  73. langfun/core/mcp/session.py +241 -0
  74. langfun/core/mcp/session_test.py +54 -0
  75. langfun/core/mcp/testing/simple_mcp_client.py +33 -0
  76. langfun/core/mcp/testing/simple_mcp_server.py +33 -0
  77. langfun/core/mcp/tool.py +256 -0
  78. langfun/core/mcp/tool_test.py +197 -0
  79. langfun/core/memory.py +1 -0
  80. langfun/core/message.py +160 -55
  81. langfun/core/message_test.py +65 -81
  82. langfun/core/modalities/__init__.py +8 -0
  83. langfun/core/modalities/audio.py +21 -1
  84. langfun/core/modalities/image.py +19 -1
  85. langfun/core/modalities/mime.py +62 -3
  86. langfun/core/modalities/pdf.py +19 -1
  87. langfun/core/modalities/video.py +21 -1
  88. langfun/core/modality.py +167 -29
  89. langfun/core/modality_test.py +42 -12
  90. langfun/core/natural_language.py +1 -1
  91. langfun/core/sampling.py +4 -4
  92. langfun/core/sampling_test.py +20 -4
  93. langfun/core/structured/completion.py +34 -44
  94. langfun/core/structured/completion_test.py +23 -43
  95. langfun/core/structured/description.py +54 -50
  96. langfun/core/structured/function_generation.py +29 -12
  97. langfun/core/structured/mapping.py +74 -28
  98. langfun/core/structured/parsing.py +90 -74
  99. langfun/core/structured/parsing_test.py +0 -3
  100. langfun/core/structured/querying.py +242 -156
  101. langfun/core/structured/querying_test.py +95 -64
  102. langfun/core/structured/schema.py +70 -10
  103. langfun/core/structured/schema_generation.py +33 -14
  104. langfun/core/structured/scoring.py +45 -34
  105. langfun/core/structured/tokenization.py +24 -9
  106. langfun/core/subscription.py +2 -2
  107. langfun/core/template.py +175 -50
  108. langfun/core/template_test.py +123 -17
  109. langfun/env/__init__.py +43 -0
  110. langfun/env/base_environment.py +827 -0
  111. langfun/env/base_environment_test.py +473 -0
  112. langfun/env/base_feature.py +304 -0
  113. langfun/env/base_feature_test.py +228 -0
  114. langfun/env/base_sandbox.py +842 -0
  115. langfun/env/base_sandbox_test.py +1235 -0
  116. langfun/env/event_handlers/__init__.py +14 -0
  117. langfun/env/event_handlers/chain.py +233 -0
  118. langfun/env/event_handlers/chain_test.py +253 -0
  119. langfun/env/event_handlers/event_logger.py +472 -0
  120. langfun/env/event_handlers/event_logger_test.py +304 -0
  121. langfun/env/event_handlers/metric_writer.py +726 -0
  122. langfun/env/event_handlers/metric_writer_test.py +214 -0
  123. langfun/env/interface.py +1640 -0
  124. langfun/env/interface_test.py +151 -0
  125. langfun/env/load_balancers.py +59 -0
  126. langfun/env/load_balancers_test.py +139 -0
  127. langfun/env/test_utils.py +497 -0
  128. {langfun-0.1.2.dev202509020804.dist-info → langfun-0.1.2.dev202511110805.dist-info}/METADATA +7 -3
  129. langfun-0.1.2.dev202511110805.dist-info/RECORD +200 -0
  130. langfun-0.1.2.dev202509020804.dist-info/RECORD +0 -172
  131. {langfun-0.1.2.dev202509020804.dist-info → langfun-0.1.2.dev202511110805.dist-info}/WHEEL +0 -0
  132. {langfun-0.1.2.dev202509020804.dist-info → langfun-0.1.2.dev202511110805.dist-info}/licenses/LICENSE +0 -0
  133. {langfun-0.1.2.dev202509020804.dist-info → langfun-0.1.2.dev202511110805.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/__init__.py:

@@ -38,6 +38,7 @@ from langfun.core.eval.v2 import runners
 from langfun.core.eval.v2.checkpointing import BulkCheckpointer
 from langfun.core.eval.v2.checkpointing import PerExampleCheckpointer
 from langfun.core.eval.v2.reporting import HtmlReporter
+from langfun.core.eval.v2.reporting import ExampleHtmlGenerator
 
 
 # pylint: enable=g-bad-import-order
langfun/core/eval/v2/checkpointing.py:

@@ -29,7 +29,17 @@ Runner = experiment_lib.Runner
 
 
 class Checkpointer(experiment_lib.Plugin):
-  """Base class for checkpointing evaluation examples."""
+  """Base class for checkpointing evaluation examples.
+
+  `Checkpointer` is a plugin that saves the state of processed examples
+  incrementally during an experiment run, allowing the experiment to be resumed
+  later. When an experiment starts, the checkpointer loads any previously saved
+  examples from an earlier run (or a warm-start run) into `experiment.state`,
+  so the runner can skip processing them again.
+  Subclasses should implement `_list_checkpoint_filenames` to identify
+  checkpoint files to load, and `_save_example` to save a newly processed
+  example.
+  """
 
   checkpoint_filename: Annotated[
       str,
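The new docstring spells out the subclassing contract for checkpointers. Below is a hedged sketch of such a subclass: only the method names `_list_checkpoint_filenames` / `_save_example` and the `checkpoint_filename` field come from the diff, while the parameter lists and bodies are illustrative assumptions.

```python
# Hedged sketch of a Checkpointer subclass; not the actual langfun API.
from langfun.core.eval.v2 import checkpointing


class SingleFileCheckpointer(checkpointing.Checkpointer):
  """Illustrative checkpointer that reloads from one well-known file."""

  def _list_checkpoint_filenames(self, *args, **kwargs):
    # Tell the base class which checkpoint files to load on (re)start.
    return [self.checkpoint_filename]

  def _save_example(self, *args, **kwargs):
    # Persist one newly processed example so a resumed run can skip it.
    raise NotImplementedError('Sketch only; see checkpointing.py for the real hooks.')
```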
@@ -170,7 +180,12 @@ class Checkpointer(experiment_lib.Plugin):
 
 
 class PerExampleCheckpointer(Checkpointer):
-  """Checkpointer that saves each example to a separate file."""
+  """Checkpointer that saves each example to a separate file.
+
+  This checkpointer saves each processed example to its own checkpoint file,
+  named using the pattern `<checkpoint_filename_prefix>_<example_id>.<ext>`.
+  For example, `checkpoint_1.bagz`, `checkpoint_2.bagz`, etc.
+  """
 
   def _on_bound(self):
     super()._on_bound()
@@ -235,7 +250,13 @@ class PerExampleCheckpointer(Checkpointer):
 
 
 class BulkCheckpointer(Checkpointer):
-  """Checkpointer that saves all examples to a single file."""
+  """Checkpointer that saves all examples of an evaluation to a single file.
+
+  This checkpointer appends newly processed examples of an evaluation to a
+  single sequence file (e.g., `checkpoint.bagz`). This is often more efficient
+  than `PerExampleCheckpointer` when dealing with a large number of examples
+  or when file system overhead is a concern.
+  """
 
   def _on_bound(self):
     super()._on_bound()
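Either checkpointer can also be selected explicitly for a run. The sketch below reuses the `plugins=` keyword that the test hunks at the end of this diff pass to `run()`; `MyEval` stands in for any `lf.eval.Evaluation` subclass (such as the one in the evaluation.py docstring further down), and the bare constructors are assumed to be sufficient.

```python
# Hedged usage sketch: choosing a checkpointing strategy via plugins=.
import langfun as lf
from langfun.core.eval.v2 import checkpointing

experiment = MyEval(lm=lf.llms.Gpt4())  # MyEval: placeholder Evaluation subclass.
run_info = experiment.run(
    '/tmp/my_eval',   # Root directory for run outputs.
    'new',            # Start a fresh run, as in the test hunks below.
    plugins=[checkpointing.PerExampleCheckpointer()],  # Or BulkCheckpointer().
)
```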
@@ -341,7 +362,12 @@ class BulkCheckpointer(Checkpointer):
 
 
 class SequenceWriter:
-  """Thread safe sequence writer."""
+  """A thread-safe writer for sequence files (e.g., Bagz).
+
+  `SequenceWriter` wraps a `pg.io.SequenceWriter` to provide thread-safe
+  `add` and `close` operations, ensuring that examples can be written
+  concurrently from multiple threads without corrupting the sequence file.
+  """
 
   def __init__(self, path: str):
     self._lock = threading.Lock()
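The docstring describes a wrap-with-a-lock pattern. A minimal, generic sketch of that pattern follows; it guards a plain text file rather than a `pg.io.SequenceWriter` and is not langfun's implementation.

```python
# Generic sketch of the locking pattern described above.
import threading


class LockedLineWriter:
  """Serializes add() and close() calls from multiple threads."""

  def __init__(self, path: str):
    self._lock = threading.Lock()
    self._file = open(path, 'w')

  def add(self, record: str) -> None:
    with self._lock:  # Only one thread writes at a time.
      self._file.write(record + '\n')

  def close(self) -> None:
    with self._lock:
      if not self._file.closed:
        self._file.close()
```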
langfun/core/eval/v2/eval_test_helper.py:

@@ -75,7 +75,7 @@ class TestEvaluation(Evaluation):
 
 class BadJsonConvertible(pg.Object):
 
-  def to_json(self, *args, **kwargs):
+  def sym_jsonify(self, *args, **kwargs):
     raise ValueError('Cannot convert to JSON.')
 
 
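The test helper now overrides `sym_jsonify` instead of `to_json`, since `sym_jsonify` is the hook PyGlove's symbolic objects go through when they are serialized, so raising there reliably simulates an unserializable example. A small stand-alone sketch of the same idea (the `Point` class is hypothetical):

```python
# Hypothetical pg.Object showing where sym_jsonify sits in serialization.
import pyglove as pg


class Point(pg.Object):
  x: int = 0
  y: int = 0

  def sym_jsonify(self, **kwargs):
    # Invoked when the object is serialized, e.g. via pg.to_json_str().
    raise ValueError('Cannot convert to JSON.')


try:
  pg.to_json_str(Point(x=1, y=2))
except ValueError as e:
  print(e)  # -> Cannot convert to JSON.
```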
langfun/core/eval/v2/evaluation.py:

@@ -32,17 +32,63 @@ import pyglove as pg
 
 
 class Evaluation(experiment_lib.Experiment):
-  """Evaluation.
-
-  An evaluation can be a leaf node or a container of other evaluations,
-  depending on whether the current evaluation object is configured with
-  any `pg.oneof`.
-
-  For example, `MyEval(lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini1_5Pro()]))`
-  is a container of two sub-experiments, one for each LLM. In such case, the
-  evaluation object with `pg.oneof` is called a hyper evaluation, which
-  represents a search space of evaluations, and each sub-evaluation is called
-  a leaf evaluation, which will perform the actual evaluation.
+  """Base class for Langfun evaluations.
+
+  `lf.eval.Evaluation` is the base class for defining evaluation tasks in
+  Langfun. Users typically subclass it to implement custom evaluation logic by
+  overriding `inputs` and `process` methods.
+
+  An `Evaluation` object encapsulates:
+
+  * **`inputs`**: A callable that returns an iterable of input examples to be
+    processed. This is usually provided by implementing an `inputs(self)`
+    method in the subclass, which yields input items for evaluation one by
+    one.
+  * **`process(self, example)`**: An abstract method that processes one
+    example and returns the output, or a tuple of (output, metadata).
+    The output will be used for computing metrics.
+  * **`metrics`**: A list of metrics (e.g., `lf.metrics.Accuracy`) to compute
+    based on the outputs from `process`. Some metrics may require users to
+    implement a `ground_truth(self, example)` method in the subclass to
+    compute metrics against ground truth.
+  * **Hyperparameters**: Any other attributes of the class serve as
+    hyperparameters for the evaluation (e.g., the language model to use).
+
+  **Running Evaluations:**
+
+  Evaluations are executed via `lf.eval.Suite` or by calling the `.run()`
+  method on an `Evaluation` instance, which returns a `Run` object
+  containing the evaluation run information and results. If an evaluation
+  contains sweeable parameters (using `pg.oneof`), `.run()` will expand it
+  into multiple evaluation sub-tasks -- one for each combination of
+  hyperparameters -- all managed within the same `Run`.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+  import pyglove as pg
+
+  class MyEval(lf.eval.Evaluation):
+    lm: lf.LanguageModel
+    prompt: str = '1 + 1 = '
+
+    def inputs(self):
+      yield 2
+
+    def process(self, example: lf.eval.Example):
+      return int(lf.query(self.prompt, lm=self.lm))
+
+    def ground_truth(self, example: lf.eval.Example) -> int:
+      return example.input
+
+  # Run evaluation using two different LMs
+  evaluation = MyEval(
+      lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini()]),
+      metrics=[lf.metrics.Accuracy()]
+  )
+  run_info = evaluation.run()
+  ```
   """
 
   inputs: Annotated[
@@ -137,7 +183,7 @@ class Evaluation(experiment_lib.Experiment):
 
     Args:
       example: An example object to process. `example.input` is an object
-        returned from `Evaluable.inputs`.
+        yielded from `inputs()` method.
 
     Returns:
       A processed output. Or a tuple of (output, metadata).
@@ -287,7 +333,7 @@
      A unique string representing the resource required.
    """
    return {
-        v.resource_id for _, v in self.sym_init_args.items()
+        v.resource_id for _, v in self.sym_init_args.sym_items()
        if isinstance(v, lf.LanguageModel)
    }
 
@@ -760,7 +806,7 @@
 
 
 class EvaluationState:
-  """Evaluation state."""
+  """In-memory state of an evaluation."""
 
   class ExampleStatus(pg.Object):
     """Example state."""

langfun/core/eval/v2/example.py:

@@ -22,19 +22,30 @@ import pyglove as pg
 
 @dataclasses.dataclass
 class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
-  """An item for the evaluation.
+  """An example for evaluation.
+
+  An evaluation example contains the input and output of an evaluation task,
+  as well as metadata about the evaluation process, such as execution time,
+  LLM usage, and metric results.
 
   Attributes:
-    id: The 1-based ID of the item in the evaluation set.
-    input: An element returned from the `Evaluable.inputs` functor.
-    output: The output of the `process` method. If `pg.MISSING_VALUE`, it has
-      not been processed yet.
-    metadata: The metadata of the item produced by the `process` method.
-    metric_metadata: The dictionary returned from `Metric.audit`.
-    start_time: The start time of the evaluation item.
-    end_time: The end time of the evaluation item.
-    usage_summary: The summary of LLM usages of the evaluation item.
-    execution_status: The timeit status of the evaluation item.
+    id: The 1-based ID of the example in the evaluation set.
+    input: An element returned from the `Evaluable.inputs` functor, which serves
+      as the input for `lf.Evaluable.process`.
+    output: The output of `lf.Evaluable.process` method. If `pg.MISSING_VALUE`,
+      it indicates the example has not been processed yet.
+    error: The error encountered during `lf.Evaluable.process`. If None, it
+      indicates the process was successful.
+    metadata: The metadata of the example produced by `lf.Evaluable.process`.
+    metric_metadata: The dictionary returned from `Metric.audit`, which contains
+      metadata about metric computation for this example.
+    newly_processed: Whether this example is processed in the current run. If
+      False, it indicates the example was loaded from a checkpoint from previous
+      runs.
+    start_time: The start time of processing this example.
+    end_time: The end time of processing this example.
+    usage_summary: The summary of LLM usages for processing this example.
+    execution_status: The timeit status of processing this example.
   """
   id: int
   input: Any = pg.MISSING_VALUE
langfun/core/eval/v2/experiment.py:

@@ -139,10 +139,10 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
 
   # Checkpointing
 
-  Experiments support checkpointing, which is enabled by default. It allows
+  Experiments support checkpointing, which is enabled by default. It allows
   users to resume their experiments from a saved state. When an experiment runs,
-  it creates a new directory for that run and saves the current state to a
-  checkpoint file. If the experiment is interrupted or fails, users can resume
+  it creates a new directory for that run and saves its progress to checkpoint
+  files. If the experiment is interrupted or fails, users can resume
   it by specifying the 'id' or 'warm_start_from' argument (shown above) to
   seamlessly continue from previously saved state without starting over.
 
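The checkpointing paragraph above points at the 'id' and 'warm_start_from' arguments for resuming. A hedged sketch of that flow follows; only those two argument names come from the docstring, while the run id, paths, and `MyEval` are placeholders.

```python
# Hedged sketch of resuming an interrupted run; values are placeholders.
import langfun as lf

root_dir = '/tmp/my_eval'
experiment = MyEval(lm=lf.llms.Gpt4())  # MyEval: placeholder Evaluation subclass.

experiment.run(root_dir, 'new')            # Fresh run in a new run directory.
experiment.run(root_dir, id='20250101_1')  # Resume that run from its checkpoints.
experiment.run(                            # New run that warm-starts from the
    root_dir, 'new',                       # checkpoints of an earlier run.
    warm_start_from='/tmp/my_eval/run_20250101_1',
)
```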
@@ -169,7 +169,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
 
   # Experiment Plugins
 
-  Experiment can be extended by plugins. Plugins can listen to the events of
+  Experiments can be extended by plugins. Plugins can listen to the events of
   experiment execution and produce additional outputs. For example, a plugin
   can be added to an experiment to generate additional metrics or to save
   additional data to a database. More details will be added in the future.
@@ -657,7 +657,30 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
 
 @pg.use_init_args(['children'])
 class Suite(Experiment):
-  """A suite of evaluations."""
+  """A suite of evaluations.
+
+  `lf.eval.Suite` groups multiple `lf.eval.Evaluation` or other `Suite`
+  objects into a single experiment, allowing them to be run, managed, and
+  reported together.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+
+  suite = lf.eval.Suite([
+      MyEval(lm=lf.llms.Gpt4()),
+      MyEval(lm=lf.llms.Gemini()),
+      lf.eval.Suite([
+          AnotherEval(lm=lf.llms.Gpt4()),
+          AnotherEval(lm=lf.llms.Gemini())
+      ])
+  ])
+
+  # Run all evaluations in the suite
+  run_info = suite.run('/path/to/my/suite_run')
+  ```
+  """
 
   children: Annotated[
       list[Experiment], 'A list of child experiments.'
@@ -791,7 +814,14 @@ class RunId(pg.Object):
 
 
 class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
-  """A run of an experiment."""
+  """Represents a single run of an experiment.
+
+  A `Run` object holds all the configurations for executing an experiment,
+  such as the experiment definition, input/output directories, and flags
+  controlling the execution behavior (e.g., error handling, checkpointing).
+  It also provides utility methods for accessing run-specific paths and
+  filtering examples for evaluation.
+  """
 
   root_dir: Annotated[
       str,
@@ -971,7 +1001,13 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
 
 
 class Runner(pg.Object):
-  """Interface for experiment runner."""
+  """Interface for experiment runner.
+
+  A runner is responsible for executing the evaluations within an experiment
+  based on the configuration specified in a `Run` object. Different runners
+  can implement different execution strategies, such as sequential or parallel
+  processing of examples and evaluations.
+  """
 
   # Class-level variable for registering the runner.
   NAME = None
@@ -1010,7 +1046,14 @@ class Runner(pg.Object):
 
 
 class Plugin(lf.Component):
-  """Base class for experiment plugins."""
+  """Base class for experiment plugins.
+
+  Plugins provide a mechanism to extend the behavior of an experiment run
+  by hooking into various events during the lifecycle of experiment and
+  example execution, such as `on_run_start`, `on_experiment_complete`,
+  `on_example_start`, etc. They can be used for custom logging, monitoring,
+  or result processing.
+  """
 
   def on_run_start(
       self,
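A minimal plugin sketch based on the event names listed in the docstring; the parameter lists (`runner`, `run`, `experiment`, `example`) and the `on_example_complete` hook name are assumptions for illustration, not checked against experiment.py.

```python
# Hedged sketch of a custom experiment plugin.
from langfun.core.eval.v2 import experiment as experiment_lib


class LoggingPlugin(experiment_lib.Plugin):
  """Logs when a run starts and whenever an example finishes."""

  def on_run_start(self, runner, run):
    print(f'Run started under {run.root_dir!r}')

  def on_example_complete(self, runner, experiment, example):
    print(f'Example {example.id} finished.')


# Passed to a run the same way the tests in this diff do:
#   experiment.run(root_dir, 'new', plugins=[LoggingPlugin()])
```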
langfun/core/eval/v2/metric_values.py:

@@ -20,7 +20,15 @@ import pyglove as pg
 
 
 class MetricValue(pg.Object):
-  """Base class for metric values."""
+  """Base class for metric values.
+
+  `MetricValue` is the base class for representing aggregated metric values
+  in an evaluation. It accumulates data points from individual examples,
+  each consisting of a value and an optional weight, associated with an example
+  ID. Subclasses must implement `reduce` method to compute a single float value
+  from accumulated data points, and `scalar_repr` to provide a string
+  representation of the reduced value.
+  """
 
   class DataPoint(pg.Object):
     """A data point for a metric value."""
@@ -88,6 +96,14 @@ class MetricValue(pg.Object):
     self.increment_total()
     return self
 
+  def merge_from(self, other: 'MetricValue') -> 'MetricValue':
+    """Merges the values from another metric value."""
+    self._weighted_sum += other._weighted_sum  # pylint: disable=protected-access
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      self.data_points.extend(other.data_points)
+    self.increment_total(other.total)
+    return self
+
   def __gt__(self, other: Union['MetricValue', float]) -> bool:
     if isinstance(other, self.__class__):
       return float(self) > float(other)
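The new `merge_from` makes it possible to fold together metric values that were accumulated separately (for example, by parallel workers). The usage sketch below mirrors the new unit tests further down in this diff and uses only the `add` / `merge_from` / `total` APIs visible there.

```python
# Two Rate values accumulated independently, then merged into one.
from langfun.core.eval.v2 import metric_values

shard_a = metric_values.Rate()
shard_a.add(1, 1.0, 1.0, increment_total=True)  # Example 1 counted as a hit.
shard_b = metric_values.Rate()
shard_b.add(2, 0.0, 1.0, increment_total=True)  # Example 2 counted as a miss.

shard_a.merge_from(shard_b)
assert shard_a.total == 2
assert float(shard_a) == 0.5  # (1.0 + 0.0) / 2
```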
@@ -133,7 +149,13 @@ class MetricValue(pg.Object):
 
 
 class Rate(MetricValue):
-  """Representing a rate in range [0, 1]."""
+  """Metric value representing a rate in range [0, 1].
+
+  `Rate` is used for metrics that compute a rate, such as accuracy or error
+  rate. The final value is computed as the weighted sum of accumulated values
+  divided by the total number of examples. It's displayed as a percentage
+  (e.g., 90.0%).
+  """
 
   def reduce(self) -> float:
     return self._weighted_sum / self.total
@@ -145,7 +167,13 @@ class Rate(MetricValue):
 
 
 class Average(MetricValue):
-  """Average of a aggregated values."""
+  """Metric value representing an average of accumulated values.
+
+  `Average` is used for metrics that compute an average score across examples
+  (e.g., average quality score). The final value is computed as the weighted
+  sum of accumulated values divided by the number of data points.
+  It's displayed as a float with 3 decimal places (e.g., 4.750).
+  """
 
   def reduce(self) -> float:
     if not self.data_points:
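The practical difference between the two reductions is the denominator: `Rate` divides the weighted sum by `total`, which `increment_total` can bump even when no data point is added, while `Average` divides by the number of data points actually accumulated. A small illustration, assuming the weighted-sum semantics the docstrings and the tests below describe:

```python
# Rate vs. Average reduction, using only add()/increment_total()/float().
from langfun.core.eval.v2 import metric_values

rate = metric_values.Rate()
rate.add(1, 1.0, 1.0, increment_total=True)
rate.add(2, 1.0, 1.0, increment_total=True)
rate.increment_total()            # A third example with no data point added.
assert float(rate) == 2.0 / 3     # Weighted sum / total.

avg = metric_values.Average()
avg.add(1, 4.0, 1.0, increment_total=True)
avg.add(2, 5.0, 1.0, increment_total=True)
assert float(avg) == 4.5          # Weighted sum / number of data points.
```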
langfun/core/eval/v2/metric_values_test.py:

@@ -51,6 +51,22 @@ class RateTest(unittest.TestCase):
     self.assertEqual(rate.total, 0)
     self.assertTrue(math.isnan(float(rate)))
 
+  def test_merge_from(self):
+    rate1 = metric_values.Rate()
+    rate1.add(1, 1.0, 1.0, increment_total=True)
+    rate2 = metric_values.Rate()
+    rate2.add(2, 0.0, 1.0, increment_total=True)
+    rate1.merge_from(rate2)
+    self.assertEqual(rate1.total, 2)
+    self.assertEqual(float(rate1), 0.5)
+    self.assertEqual(
+        rate1.data_points,
+        [
+            metric_values.MetricValue.DataPoint(1, 1.0, 1.0),
+            metric_values.MetricValue.DataPoint(2, 0.0, 1.0),
+        ],
+    )
+
 
 class AverageTest(unittest.TestCase):
 
@@ -75,6 +91,22 @@ class AverageTest(unittest.TestCase):
     average.reset()
     self.assertEqual(average.total, 0)
 
+  def test_merge_from(self):
+    avg1 = metric_values.Average()
+    avg1.add(1, 1.0, 0.5, increment_total=True)
+    avg2 = metric_values.Average()
+    avg2.add(2, 0.0, 1.0, increment_total=True)
+    avg1.merge_from(avg2)
+    self.assertEqual(avg1.total, 2)
+    self.assertEqual(float(avg1), 0.25)
+    self.assertEqual(
+        avg1.data_points,
+        [
+            metric_values.MetricValue.DataPoint(1, 1.0, 0.5),
+            metric_values.MetricValue.DataPoint(2, 0.0, 1.0),
+        ],
+    )
+
 
 if __name__ == '__main__':
   unittest.main()
langfun/core/eval/v2/metrics.py:

@@ -29,7 +29,15 @@ Average = metric_values.Average
 
 
 class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
-  """Interface for an evaluation metric."""
+  """Interface for an evaluation metric.
+
+  A metric is used to evaluate the quality of the outputs produced by an
+  evaluation. It works by auditing each processed example via its `audit`
+  method, which in turn calls the user-overridable `_audit` method to perform
+  metric-specific logic and update metric values. Metrics can compute multiple
+  values (e.g., precision, recall, F1 score) which are exposed via the
+  `values` method.
+  """
 
   name: Annotated[
       str,
@@ -71,6 +79,12 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
     for v in self.values():
       v.reset()
 
+  def merge_from(self, other: 'Metric') -> 'Metric':
+    """Merges the values from another metric."""
+    for v1, v2 in zip(self.values(), other.values()):
+      v1.merge_from(v2)
+    return self
+
   def _update_view(self):
     """Refreshes the metric values."""
     if self._label_group is None:
@@ -169,7 +183,15 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
 
 
 class MetricBase(Metric):
-  """Base class for common metrics."""
+  """Base class for common metrics.
+
+  `MetricBase` provides common functionalities for metrics, such as automatic
+  error counting based on whether an example has an error during evaluation.
+  It distinguishes between Object-Oriented Programming (OOP) errors
+  (e.g. `MappingError` during structured output generation) and other errors.
+  Subclasses should implement `_audit_processed` for metric computation on
+  successfully processed examples.
+  """
 
   oop_errors: Rate | None = Rate()
   non_oop_errors: Rate | None = Rate()
@@ -229,7 +251,13 @@ class MetricBase(Metric):
 
 
 class Match(MetricBase):
-  """Metric for matching outputs against groundtruth."""
+  """Metric for matching outputs against ground truth.
+
+  This metric computes match and mismatch rates by comparing the output of
+  an example with its ground truth. By default, it looks for a `groundtruth`
+  attribute in `example.input` for comparison. Users can customize this behavior
+  by subclassing `Match` and overriding the `match` method.
+  """
 
   name = 'match'
   matches: Rate = Rate()
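As the docstring notes, `Match` reads a `groundtruth` attribute off `example.input` by default, which is also the shape the new `MatchTest` below uses (`pg.Dict(groundtruth=...)`). The sketch shows that contract from the evaluation side; `MyQAEval` itself is hypothetical.

```python
# Hypothetical evaluation whose inputs carry a `groundtruth` field, so the
# default Match metric can audit outputs without any custom match() override.
import langfun as lf
import pyglove as pg


class MyQAEval(lf.eval.Evaluation):
  lm: lf.LanguageModel

  def inputs(self):
    # Each input exposes `groundtruth`, which Match reads from example.input.
    yield pg.Dict(question='1 + 1 = ?', groundtruth=2)

  def process(self, example):
    return int(lf.query(example.input.question, int, lm=self.lm))


# e.g. MyQAEval(lm=..., metrics=[Match()]) with Match imported from
# langfun.core.eval.v2.metrics; the exact public alias is not shown in this diff.
```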
@@ -302,7 +330,14 @@ class Match(MetricBase):
 
 
 class Score(MetricBase):
-  """Base class for scoring."""
+  """Base class for scoring metrics.
+
+  `Score` is a base class for metrics that assign a numerical score to each
+  example's output (e.g., evaluating quality on a scale of 1-5).
+  It automatically computes the average score across all examples.
+  Subclasses must implement the `score` method to define how an example
+  should be scored.
+  """
 
   name = 'score'
   average_score: Average = Average()
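A hedged sketch of a `Score` subclass; the docstring only states that subclasses implement `score`, so the parameter list used below is an assumption.

```python
# Hedged sketch: a scoring metric that rates shorter answers higher.
from langfun.core.eval.v2 import metrics


class BrevityScore(metrics.Score):
  """Scores outputs on a 0..1 scale, favoring brevity."""

  def score(self, example, output) -> float:
    # Assumed signature; adapt to the real hook in metrics.py if it differs.
    return 1.0 / (1.0 + len(str(output)))
```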
langfun/core/eval/v2/metrics_test.py:

@@ -106,6 +106,20 @@ class MatchTest(unittest.TestCase):
     m.audit(Example(id=2, input=pg.Dict(groundtruth=1), output=2))
     self.assertEqual(len(scripts), 12)
 
+  def test_merge_from(self):
+    m1 = metrics.Match()
+    m1.audit(Example(id=1, input=pg.Dict(groundtruth=1), output=1))
+    m2 = metrics.Match()
+    m2.audit(Example(id=2, input=pg.Dict(groundtruth=1), output=2))
+    m1.merge_from(m2)
+    self.assertEqual(m1.matches, 0.5)
+    self.assertEqual(m1.mismatches, 0.5)
+    self.assertEqual(m1.oop_errors, 0.0)
+    self.assertEqual(m1.non_oop_errors, 0.0)
+    self.assertEqual(m1.matches.total, 2)
+    self.assertEqual(len(m1.matches.data_points), 1)
+    self.assertEqual(len(m1.mismatches.data_points), 1)
+
 
 class ScoreTest(unittest.TestCase):
 

langfun/core/eval/v2/progress.py:

@@ -21,7 +21,15 @@ import pyglove as pg
 
 
 class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
-  """Evaluation progress."""
+  """Represents and tracks the progress of an evaluation.
+
+  The `Progress` class maintains counts of processed, failed, and skipped
+  items in an evaluation, along with timing information (start time, stop time,
+  duration) and an execution summary. It provides properties to check the
+  status of the evaluation (e.g., `is_started`, `is_completed`) and methods
+  to update progress as items are evaluated.
+  It also supports HTML rendering as a progress bar for visualization.
+  """
 
   num_total: Annotated[
      int | None,
@@ -216,6 +224,27 @@ class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
     """Overrides nondefault values so volatile values are not included."""
     return dict()
 
+  def merge_from(self, other: 'Progress') -> None:
+    """Merges the progress from another progress."""
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      if other.start_time is not None and (
+          self.start_time is None or self.start_time > other.start_time):
+        self.start_time = other.start_time
+
+      if other.stop_time is not None and (
+          self.stop_time is None or self.stop_time < other.stop_time):
+        self.stop_time = other.stop_time
+
+      if other.num_total is not None:
+        if self.num_total is None:
+          self.num_total = other.num_total
+        else:
+          assert self.num_total == other.num_total, (self, other)
+      self.num_processed += other.num_processed
+      self.num_failed += other.num_failed
+      self.num_skipped += other.num_skipped
+      self.execution_summary.aggregate(other.execution_summary.breakdown)
+
   #
   # HTML view.
   #
langfun/core/eval/v2/progress_test.py:

@@ -77,6 +77,33 @@ class ProgressTest(unittest.TestCase):
     self.assertTrue(p.is_stopped)
     self.assertIsNotNone(p.stop_time_str)
 
+  def test_merge_from(self):
+    p1 = Progress()
+    p1.start(10)
+    p1.increment_processed()
+    p1.increment_failed()
+    p1.stop()
+
+    p2 = Progress()
+    p2.start(10)
+    p2.increment_skipped()
+    p2.stop()
+
+    with pg.allow_writable_accessors(True):
+      p1.start_time = 2.0
+      p1.stop_time = 4.0
+      p2.start_time = 1.0
+      p2.stop_time = 5.0
+
+    p1.merge_from(p2)
+    self.assertEqual(p1.num_total, 10)
+    self.assertEqual(p1.num_processed, 1)
+    self.assertEqual(p1.num_failed, 1)
+    self.assertEqual(p1.num_skipped, 1)
+    self.assertEqual(p1.num_completed, 3)
+    self.assertEqual(p1.start_time, 1.0)
+    self.assertEqual(p1.stop_time, 5.0)
+
 
 if __name__ == '__main__':
   unittest.main()
langfun/core/eval/v2/progress_tracking_test.py:

@@ -14,9 +14,11 @@
 import contextlib
 import io
 import os
+import sys
 import tempfile
 import unittest
 
+from langfun.core import concurrent as lf_concurrent
 from langfun.core import console as lf_console
 from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import progress_tracking  # pylint: disable=unused-import
@@ -49,6 +51,8 @@ class TqdmProgressTrackerTest(unittest.TestCase):
     string_io = io.StringIO()
     with contextlib.redirect_stderr(string_io):
       _ = experiment.run(root_dir, 'new', plugins=[])
+      sys.stderr.flush()
+      lf_concurrent.ProgressBar.refresh()
     self.assertIn('All: 100%', string_io.getvalue())
 
   def test_with_example_ids(self):
@@ -59,6 +63,8 @@ class TqdmProgressTrackerTest(unittest.TestCase):
     string_io = io.StringIO()
     with contextlib.redirect_stderr(string_io):
       _ = experiment.run(root_dir, 'new', example_ids=[1], plugins=[])
+      sys.stderr.flush()
+      lf_concurrent.ProgressBar.refresh()
     self.assertIn('All: 100%', string_io.getvalue())
 