PyPI - langfun - Versions diffs - 0.1.2.dev202510200805__py3-none-any.whl → 0.1.2.dev202511160804__py3-none-any.whl - Mend

langfun 0.1.2.dev202510200805__py3-none-any.whl → 0.1.2.dev202511160804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of langfun might be problematic. Click here for more details.

Files changed (146) hide show

langfun/core/__init__.py +1 -0
langfun/core/agentic/action.py +107 -12
langfun/core/agentic/action_eval.py +9 -2
langfun/core/agentic/action_test.py +25 -0
langfun/core/async_support.py +32 -3
langfun/core/coding/python/correction.py +19 -9
langfun/core/coding/python/execution.py +14 -12
langfun/core/coding/python/generation.py +21 -16
langfun/core/coding/python/sandboxing.py +23 -3
langfun/core/component.py +42 -3
langfun/core/concurrent.py +70 -6
langfun/core/concurrent_test.py +1 -0
langfun/core/console.py +1 -1
langfun/core/data/conversion/anthropic.py +12 -3
langfun/core/data/conversion/anthropic_test.py +8 -6
langfun/core/data/conversion/gemini.py +9 -2
langfun/core/data/conversion/gemini_test.py +12 -9
langfun/core/data/conversion/openai.py +145 -31
langfun/core/data/conversion/openai_test.py +161 -17
langfun/core/eval/base.py +48 -44
langfun/core/eval/base_test.py +4 -4
langfun/core/eval/matching.py +5 -2
langfun/core/eval/patching.py +3 -3
langfun/core/eval/scoring.py +4 -3
langfun/core/eval/v2/__init__.py +1 -0
langfun/core/eval/v2/checkpointing.py +39 -5
langfun/core/eval/v2/checkpointing_test.py +1 -1
langfun/core/eval/v2/eval_test_helper.py +97 -1
langfun/core/eval/v2/evaluation.py +88 -16
langfun/core/eval/v2/evaluation_test.py +9 -3
langfun/core/eval/v2/example.py +45 -39
langfun/core/eval/v2/example_test.py +3 -3
langfun/core/eval/v2/experiment.py +51 -8
langfun/core/eval/v2/metric_values.py +31 -3
langfun/core/eval/v2/metric_values_test.py +32 -0
langfun/core/eval/v2/metrics.py +157 -44
langfun/core/eval/v2/metrics_test.py +39 -18
langfun/core/eval/v2/progress.py +30 -1
langfun/core/eval/v2/progress_test.py +27 -0
langfun/core/eval/v2/progress_tracking_test.py +3 -0
langfun/core/eval/v2/reporting.py +90 -71
langfun/core/eval/v2/reporting_test.py +20 -6
langfun/core/eval/v2/runners/__init__.py +26 -0
langfun/core/eval/v2/{runners.py → runners/base.py} +22 -124
langfun/core/eval/v2/runners/debug.py +40 -0
langfun/core/eval/v2/runners/debug_test.py +79 -0
langfun/core/eval/v2/runners/parallel.py +100 -0
langfun/core/eval/v2/runners/parallel_test.py +98 -0
langfun/core/eval/v2/runners/sequential.py +47 -0
langfun/core/eval/v2/runners/sequential_test.py +175 -0
langfun/core/langfunc.py +45 -130
langfun/core/langfunc_test.py +6 -4
langfun/core/language_model.py +103 -16
langfun/core/language_model_test.py +9 -3
langfun/core/llms/__init__.py +7 -1
langfun/core/llms/anthropic.py +157 -2
langfun/core/llms/azure_openai.py +29 -17
langfun/core/llms/cache/base.py +25 -3
langfun/core/llms/cache/in_memory.py +48 -7
langfun/core/llms/cache/in_memory_test.py +14 -4
langfun/core/llms/compositional.py +25 -1
langfun/core/llms/deepseek.py +30 -2
langfun/core/llms/fake.py +32 -1
langfun/core/llms/gemini.py +14 -9
langfun/core/llms/google_genai.py +29 -1
langfun/core/llms/groq.py +28 -3
langfun/core/llms/llama_cpp.py +23 -4
langfun/core/llms/openai.py +36 -3
langfun/core/llms/openai_compatible.py +148 -27
langfun/core/llms/openai_compatible_test.py +207 -20
langfun/core/llms/openai_test.py +0 -2
langfun/core/llms/rest.py +12 -1
langfun/core/llms/vertexai.py +51 -8
langfun/core/logging.py +1 -1
langfun/core/mcp/client.py +77 -22
langfun/core/mcp/client_test.py +8 -35
langfun/core/mcp/session.py +94 -29
langfun/core/mcp/session_test.py +54 -0
langfun/core/mcp/tool.py +151 -22
langfun/core/mcp/tool_test.py +197 -0
langfun/core/memory.py +1 -0
langfun/core/message.py +160 -55
langfun/core/message_test.py +65 -81
langfun/core/modalities/__init__.py +8 -0
langfun/core/modalities/audio.py +21 -1
langfun/core/modalities/image.py +19 -1
langfun/core/modalities/mime.py +62 -3
langfun/core/modalities/pdf.py +19 -1
langfun/core/modalities/video.py +21 -1
langfun/core/modality.py +167 -29
langfun/core/modality_test.py +42 -12
langfun/core/natural_language.py +1 -1
langfun/core/sampling.py +4 -4
langfun/core/sampling_test.py +20 -4
langfun/core/structured/__init__.py +2 -24
langfun/core/structured/completion.py +34 -44
langfun/core/structured/completion_test.py +23 -43
langfun/core/structured/description.py +54 -50
langfun/core/structured/function_generation.py +29 -12
langfun/core/structured/mapping.py +81 -37
langfun/core/structured/parsing.py +95 -79
langfun/core/structured/parsing_test.py +0 -3
langfun/core/structured/querying.py +215 -142
langfun/core/structured/querying_test.py +65 -29
langfun/core/structured/schema/__init__.py +48 -0
langfun/core/structured/schema/base.py +664 -0
langfun/core/structured/schema/base_test.py +531 -0
langfun/core/structured/schema/json.py +174 -0
langfun/core/structured/schema/json_test.py +121 -0
langfun/core/structured/schema/python.py +316 -0
langfun/core/structured/schema/python_test.py +410 -0
langfun/core/structured/schema_generation.py +33 -14
langfun/core/structured/scoring.py +47 -36
langfun/core/structured/tokenization.py +26 -11
langfun/core/subscription.py +2 -2
langfun/core/template.py +175 -50
langfun/core/template_test.py +123 -17
langfun/env/__init__.py +8 -2
langfun/env/base_environment.py +320 -128
langfun/env/base_environment_test.py +473 -0
langfun/env/base_feature.py +92 -15
langfun/env/base_feature_test.py +228 -0
langfun/env/base_sandbox.py +84 -361
langfun/env/base_sandbox_test.py +1235 -0
langfun/env/event_handlers/__init__.py +1 -1
langfun/env/event_handlers/chain.py +233 -0
langfun/env/event_handlers/chain_test.py +253 -0
langfun/env/event_handlers/event_logger.py +95 -98
langfun/env/event_handlers/event_logger_test.py +21 -21
langfun/env/event_handlers/metric_writer.py +225 -140
langfun/env/event_handlers/metric_writer_test.py +23 -6
langfun/env/interface.py +854 -40
langfun/env/interface_test.py +112 -2
langfun/env/load_balancers_test.py +23 -2
langfun/env/test_utils.py +126 -84
{langfun-0.1.2.dev202510200805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/METADATA +1 -1
langfun-0.1.2.dev202511160804.dist-info/RECORD +211 -0
langfun/core/eval/v2/runners_test.py +0 -343
langfun/core/structured/schema.py +0 -987
langfun/core/structured/schema_test.py +0 -982
langfun/env/base_test.py +0 -1481
langfun/env/event_handlers/base.py +0 -350
langfun-0.1.2.dev202510200805.dist-info/RECORD +0 -195
{langfun-0.1.2.dev202510200805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/WHEEL +0 -0
{langfun-0.1.2.dev202510200805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/licenses/LICENSE +0 -0
{langfun-0.1.2.dev202510200805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/top_level.txt +0 -0

langfun/core/eval/scoring.py CHANGED Viewed

@@ -41,18 +41,19 @@ class Scoring(base.Evaluation):
   @property
   def score_rate(self) -> float:
-    """Returns the score rate."""
+    """Returns the rate of scored examples among the completed ones."""
     if self.num_completed == 0:
       return 0.0
     return self.num_scored / self.num_completed
   @property
   def scored_link(self) -> str:
-    """Returns the matches page."""
+    """Returns the scored examples page."""
     return self.link(os.path.join(self.dir, Scoring.SCORED_HTML))
   @property
   def avg_score(self) -> float:
+    """Returns the average score of scored examples."""
     if self.num_scored == 0:
       return 0
     return sum([i[2] for i in self._scored]) / self.num_scored
@@ -181,7 +182,7 @@ class Scoring(base.Evaluation):
     super()._render_summary_metrics(s)
   def _render_scored(self, s: io.StringIO) -> None:
-    """Formats the matched cases into html."""
+    """Formats the scored cases into html."""
     s.write('<h2> Scored </h2>')
     s.write('<div style="white-space:pre">\n')
     s.write(

langfun/core/eval/v2/__init__.py CHANGED Viewed

@@ -38,6 +38,7 @@ from langfun.core.eval.v2 import runners
 from langfun.core.eval.v2.checkpointing import BulkCheckpointer
 from langfun.core.eval.v2.checkpointing import PerExampleCheckpointer
 from langfun.core.eval.v2.reporting import HtmlReporter
+from langfun.core.eval.v2.reporting import ExampleHtmlGenerator
 # pylint: enable=g-bad-import-order

langfun/core/eval/v2/checkpointing.py CHANGED Viewed

@@ -29,13 +29,28 @@ Runner = experiment_lib.Runner
 class Checkpointer(experiment_lib.Plugin):
-  """Base class for checkpointing evaluation examples."""
+  """Base class for checkpointing evaluation examples.
+  `Checkpointer` is a plugin that saves the state of processed examples
+  incrementally during an experiment run, allowing the experiment to be resumed
+  later. When an experiment starts, the checkpointer loads any previously saved
+  examples from an earlier run (or a warm-start run) into `experiment.state`,
+  so the runner can skip processing them again.
+  Subclasses should implement `_list_checkpoint_filenames` to identify
+  checkpoint files to load, and `_save_example` to save a newly processed
+  example.
+  """
   checkpoint_filename: Annotated[
       str,
       'Checkpoint file pattern.'
   ] = 'checkpoint.bagz'
+  max_ckpt_loading_threads: Annotated[
+      int,
+      'Max number of workers for loading checkpoint files at startup.'
+  ] = 128
   def on_experiment_start(
       self,
       runner: Runner,
@@ -149,7 +164,10 @@ class Checkpointer(experiment_lib.Plugin):
     _ = list(
         lf.concurrent_map(
-            _load_state, ckpt_files, max_workers=16, silence_on_errors=None
+            _load_state,
+            ckpt_files,
+            max_workers=self.max_ckpt_loading_threads,
+            silence_on_errors=None
         )
     )
@@ -170,7 +188,12 @@ class Checkpointer(experiment_lib.Plugin):
 class PerExampleCheckpointer(Checkpointer):
-  """Checkpointer that saves each example to a separate file."""
+  """Checkpointer that saves each example to a separate file.
+  This checkpointer saves each processed example to its own checkpoint file,
+  named using the pattern `<checkpoint_filename_prefix>_<example_id>.<ext>`.
+  For example, `checkpoint_1.bagz`, `checkpoint_2.bagz`, etc.
+  """
   def _on_bound(self):
     super()._on_bound()
@@ -235,7 +258,13 @@ class PerExampleCheckpointer(Checkpointer):
 class BulkCheckpointer(Checkpointer):
-  """Checkpointer that saves all examples to a single file."""
+  """Checkpointer that saves all examples of an evaluation to a single file.
+  This checkpointer appends newly processed examples of an evaluation to a
+  single sequence file (e.g., `checkpoint.bagz`). This is often more efficient
+  than `PerExampleCheckpointer` when dealing with a large number of examples
+  or when file system overhead is a concern.
+  """
   def _on_bound(self):
     super()._on_bound()
@@ -341,7 +370,12 @@ class BulkCheckpointer(Checkpointer):
 class SequenceWriter:
-  """Thread safe sequence writer."""
+  """A thread-safe writer for sequence files (e.g., Bagz).
+  `SequenceWriter` wraps a `pg.io.SequenceWriter` to provide thread-safe
+  `add` and `close` operations, ensuring that examples can be written
+  concurrently from multiple threads without corrupting the sequence file.
+  """
   def __init__(self, path: str):
     self._lock = threading.Lock()

langfun/core/eval/v2/checkpointing_test.py CHANGED Viewed

@@ -65,7 +65,7 @@ class ExampleCollector(experiment_lib.Plugin):
     return self._examples
   def on_example_complete(
-      self, runner: runners_lib.Runner,
+      self, runner: experiment_lib.Runner,
       experiment: experiment_lib.Experiment,
       example: example_lib.Example,
   ):

langfun/core/eval/v2/eval_test_helper.py CHANGED Viewed

@@ -13,6 +13,9 @@
 # limitations under the License.
 """Helper classes and functions for evaluation tests."""
+import threading
+import time
 from langfun.core import language_model
 from langfun.core import llms
 from langfun.core import message as message_lib
@@ -47,6 +50,8 @@ class TestLLM(llms.Fake):
   offset: int = 0
+  __test__ = False
   def _response_from(self, prompt: message_lib.Message) -> message_lib.Message:
     return message_lib.AIMessage(
         str(prompt.metadata.x + prompt.metadata.y + self.offset)
@@ -63,6 +68,8 @@ class TestEvaluation(Evaluation):
   metrics = [metrics_lib.Match()]
   lm: language_model.LanguageModel = TestLLM()
+  __test__ = False
   def process(self, example):
     v = example.input
     if v.x == 5:
@@ -75,7 +82,7 @@ class TestEvaluation(Evaluation):
 class BadJsonConvertible(pg.Object):
-  def to_json(self, *args, **kwargs):
+  def sym_jsonify(self, *args, **kwargs):
     raise ValueError('Cannot convert to JSON.')
@@ -84,6 +91,8 @@ class TestEvaluationWithExampleCheckpointingError(TestEvaluation):
   inputs = test_inputs()
   metrics = [metrics_lib.Match()]
+  __test__ = False
   def process(self, example):
     return 1, dict(
         x=BadJsonConvertible()
@@ -101,6 +110,8 @@ class TestEvaluationWithExampleHtmlGenerationError(Evaluation):
   inputs = test_inputs()
   metrics = [metrics_lib.Match()]
+  __test__ = False
   def process(self, example):
     return 1, dict(
         x=BadHtmlConvertible()
@@ -110,6 +121,8 @@ class TestEvaluationWithExampleHtmlGenerationError(Evaluation):
 class TestEvaluationWithIndexHtmlGenerationError(TestEvaluation):
   """Test evaluation class with bad index HTML generation."""
+  __test__ = False
   def _html_tree_view(self, *args, **kwargs):
     raise ValueError('Cannot render HTML.')
@@ -135,3 +148,86 @@ def test_experiment_with_example_html_generation_error():
 def test_experiment_with_index_html_generation_error():
   """Returns a test experiment with bad index HTML."""
   return TestEvaluationWithIndexHtmlGenerationError()
+class TestPlugin(experiment_lib.Plugin):
+  """Plugin for testing."""
+  started_experiments: list[experiment_lib.Experiment] = []
+  completed_experiments: list[experiment_lib.Experiment] = []
+  skipped_experiments: list[experiment_lib.Experiment] = []
+  started_example_ids: list[int] = []
+  completed_example_ids: list[int] = []
+  start_time: float | None = None
+  complete_time: float | None = None
+  __test__ = False
+  def _on_bound(self):
+    super()._on_bound()
+    self._lock = threading.Lock()
+  def on_run_start(
+      self,
+      runner: experiment_lib.Runner,
+      root: experiment_lib.Experiment
+  ) -> None:
+    del root
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      self.start_time = time.time()
+  def on_run_complete(
+      self,
+      runner: experiment_lib.Runner,
+      root: experiment_lib.Experiment
+  ) -> None:
+    del root
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      self.complete_time = time.time()
+  def on_experiment_start(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment
+  ) -> None:
+    del runner
+    with pg.notify_on_change(False), self._lock:
+      self.started_experiments.append(pg.Ref(experiment))
+  def on_experiment_skipped(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment
+  ) -> None:
+    del runner
+    with pg.notify_on_change(False), self._lock:
+      self.skipped_experiments.append(pg.Ref(experiment))
+  def on_experiment_complete(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment
+  ) -> None:
+    del runner
+    with pg.notify_on_change(False), self._lock:
+      self.completed_experiments.append(pg.Ref(experiment))
+  def on_example_start(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment,
+      example: Example
+  ) -> None:
+    del runner, experiment
+    with pg.notify_on_change(False), self._lock:
+      self.started_example_ids.append(example.id)
+  def on_example_complete(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment,
+      example: Example
+  ) -> None:
+    del runner, experiment
+    with pg.notify_on_change(False), self._lock:
+      self.completed_example_ids.append(example.id)

langfun/core/eval/v2/evaluation.py CHANGED Viewed

@@ -32,17 +32,63 @@ import pyglove as pg
 class Evaluation(experiment_lib.Experiment):
-  """Evaluation.
-  An evaluation can be a leaf node or a container of other evaluations,
-  depending on whether the current evaluation object is configured with
-  any `pg.oneof`.
-  For example, `MyEval(lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini1_5Pro()]))`
-  is a container of two sub-experiments, one for each LLM. In such case, the
-  evaluation object with `pg.oneof` is called a hyper evaluation, which
-  represents a search space of evaluations, and each sub-evaluation is called
-  a leaf evaluation, which will perform the actual evaluation.
+  """Base class for Langfun evaluations.
+  `lf.eval.Evaluation` is the base class for defining evaluation tasks in
+  Langfun. Users typically subclass it to implement custom evaluation logic by
+  overriding `inputs` and `process` methods.
+  An `Evaluation` object encapsulates:
+  *   **`inputs`**: A callable that returns an iterable of input examples to be
+      processed. This is usually provided by implementing an `inputs(self)`
+      method in the subclass, which yields input items for evaluation one by
+      one.
+  *   **`process(self, example)`**: An abstract method that processes one
+      example and returns the output, or a tuple of (output, metadata).
+      The output will be used for computing metrics.
+  *   **`metrics`**: A list of metrics (e.g., `lf.metrics.Accuracy`) to compute
+      based on the outputs from `process`. Some metrics may require users to
+      implement a `ground_truth(self, example)` method in the subclass to
+      compute metrics against ground truth.
+  *   **Hyperparameters**: Any other attributes of the class serve as
+      hyperparameters for the evaluation (e.g., the language model to use).
+  **Running Evaluations:**
+  Evaluations are executed via `lf.eval.Suite` or by calling the `.run()`
+  method on an `Evaluation` instance, which returns a `Run` object
+  containing the evaluation run information and results. If an evaluation
+  contains sweepable parameters (using `pg.oneof`), `.run()` will expand it
+  into multiple evaluation sub-tasks -- one for each combination of
+  hyperparameters -- all managed within the same `Run`.
+  **Example:**
+  ```python
+  import langfun as lf
+  import pyglove as pg
+  class MyEval(lf.eval.Evaluation):
+    lm: lf.LanguageModel
+    prompt: str = '1 + 1 = '
+    def inputs(self):
+      yield 2
+    def process(self, example: lf.eval.Example):
+      return int(lf.query(self.prompt, lm=self.lm))
+    def ground_truth(self, example: lf.eval.Example) -> int:
+      return example.input
+  # Run evaluation using two different LMs
+  evaluation = MyEval(
+      lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini()]),
+      metrics=[lf.metrics.Accuracy()]
+  )
+  run_info = evaluation.run()
+  ```
   """
   inputs: Annotated[
@@ -126,6 +172,20 @@ class Evaluation(experiment_lib.Experiment):
   # Evaluation logics.
   #
+  def setup(self) -> None:
+    """Sets up resources required by the evaluation.
+    Subclasses should always call the super().setup() method to ensure the
+    proper initialization of the evaluation.
+    """
+  def teardown(self) -> None:
+    """Tears down resources used by the evaluation.
+    Subclasses should always call the super().teardown() method to ensure the
+    proper cleanup of the evaluation.
+    """
   @abc.abstractmethod
   def process(
       self,
@@ -137,7 +197,7 @@ class Evaluation(experiment_lib.Experiment):
     Args:
       example: An example object to process. `example.input` is an object
-        returned from `Evaluable.inputs`.
+        yielded from `inputs()` method.
     Returns:
       A processed output. Or a tuple of (output, metadata).
@@ -150,6 +210,7 @@ class Evaluation(experiment_lib.Experiment):
       example: example_lib.Example | int,
       raise_if_has_error: bool = False,
       reevaluate_upon_previous_errors: bool = True,
+      force_recompute_metrics: bool = False
   ) -> example_lib.Example:
     """Evaluates a single example input.
@@ -158,6 +219,8 @@ class Evaluation(experiment_lib.Experiment):
       raise_if_has_error: Whether to raise an error if the example has error.
       reevaluate_upon_previous_errors: Whether to reevaluate the example if
         the previous checkpointed run has error.
+      force_recompute_metrics: If True, force recompute the metrics even if
+        metric metadata is already present from previous checkpoint.
     Returns:
       The evaluated example with the output and metric metadata populated.
@@ -206,6 +269,7 @@ class Evaluation(experiment_lib.Experiment):
         # Use the output and metadata obtained from the previous processing.
         example.output = checkpointed.output
         example.metadata = checkpointed.metadata
+        example.metric_metadata = checkpointed.metric_metadata
         example.error = checkpointed.error
         example.newly_processed = False
         example.execution_status = checkpointed.execution_status
@@ -225,8 +289,16 @@ class Evaluation(experiment_lib.Experiment):
         self.info(f'Starting metric computation for example {example.id}.')
         metric_metadata = {}
         for metric in self.metrics:
-          metric_metadata.update(metric.audit(example))
-        example.metric_metadata = metric_metadata
+          metric_metadata[metric.name] = metric.update(
+              example, force_recompute=force_recompute_metrics
+          )
+        if example.metric_metadata is None:
+          example.metric_metadata = metric_metadata
+        else:
+          # Accumulate the metric metadata as there might be existing metadata
+          # from previous metric computation runs.
+          example.metric_metadata.update(metric_metadata)
         self.info(f'Completed metric computation for example {example.id}.')
     # For previously processed examples, we keep the execution status for the
@@ -287,7 +359,7 @@ class Evaluation(experiment_lib.Experiment):
       A unique string representing the resource required.
     """
     return {
-        v.resource_id for _, v in self.sym_init_args.items()
+        v.resource_id for _, v in self.sym_init_args.sym_items()
         if isinstance(v, lf.LanguageModel)
     }
@@ -760,7 +832,7 @@ class Evaluation(experiment_lib.Experiment):
 class EvaluationState:
-  """Evaluation state."""
+  """In-memory state of an evaluation."""
   class ExampleStatus(pg.Object):
     """Example state."""

langfun/core/eval/v2/evaluation_test.py CHANGED Viewed

@@ -88,7 +88,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(example.output, 6)
     self.assertIsNone(example.error)
     self.assertEqual(example.metadata, {})
-    self.assertEqual(example.metric_metadata, dict(match=True))
+    self.assertEqual(example.metric_metadata, dict(match=dict(is_correct=True)))
     self.assertIsNotNone(example.usage_summary)
     self.assertGreater(example.usage_summary.total.total_tokens, 0)
     self.assertEqual(example.usage_summary.total.num_requests, 1)
@@ -103,7 +103,10 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(example.output, 7)
     self.assertIsNone(example.error)
     self.assertEqual(example.metadata, {})
-    self.assertEqual(example.metric_metadata, dict(mismatch=True))
+    self.assertEqual(
+        example.metric_metadata,
+        dict(match=dict(is_correct=False))
+    )
     with self.assertRaisesRegex(ValueError, 'x should not be 5'):
       _ = exp.evaluate(6, raise_if_has_error=True)
@@ -113,7 +116,10 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(pg.MISSING_VALUE, example.output)
     self.assertEqual(example.error.tag, 'ValueError')
     self.assertEqual(example.metadata, {})
-    self.assertEqual(example.metric_metadata, dict(error='ValueError'))
+    self.assertEqual(
+        example.metric_metadata,
+        dict(match=dict(error='ValueError'))
+    )
   def test_evaluate_withstate(self):
     eval_dir = os.path.join(tempfile.mkdtemp(), 'test_eval')

langfun/core/eval/v2/example.py CHANGED Viewed

@@ -22,19 +22,30 @@ import pyglove as pg
 @dataclasses.dataclass
 class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
-  """An item for the evaluation.
+  """An example for evaluation.
+  An evaluation example contains the input and output of an evaluation task,
+  as well as metadata about the evaluation process, such as execution time,
+  LLM usage, and metric results.
   Attributes:
-    id: The 1-based ID of the item in the evaluation set.
-    input: An element returned from the `Evaluable.inputs` functor.
-    output: The output of the `process` method. If `pg.MISSING_VALUE`, it has
-      not been processed yet.
-    metadata: The metadata of the item produced by the `process` method.
-    metric_metadata: The dictionary returned from `Metric.audit`.
-    start_time: The start time of the evaluation item.
-    end_time: The end time of the evaluation item.
-    usage_summary: The summary of LLM usages of the evaluation item.
-    execution_status: The timeit status of the evaluation item.
+    id: The 1-based ID of the example in the evaluation set.
+    input: An element returned from the `Evaluable.inputs` functor, which serves
+      as the input for `lf.Evaluable.process`.
+    output: The output of `lf.Evaluable.process` method. If `pg.MISSING_VALUE`,
+      it indicates the example has not been processed yet.
+    error: The error raised from `lf.Evaluable.process`. If None, it
+      indicates the process was successful.
+    metadata: The metadata of the example produced by `lf.Evaluable.process`.
+    metric_metadata: The dictionary returned from `Metric.audit`, which contains
+      metadata about metric computation for this example.
+    newly_processed: Whether this example is processed in the current run. If
+      False, it indicates the example was loaded from a checkpoint from previous
+      runs.
+    start_time: The start time of processing this example.
+    end_time: The end time of processing this example.
+    usage_summary: The summary of LLM usages for processing this example.
+    execution_status: The timeit status of processing this example.
   """
   id: int
   input: Any = pg.MISSING_VALUE
@@ -49,14 +60,6 @@ class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
   usage_summary: lf.UsageSummary | None = None
   execution_status: dict[str, pg.utils.TimeIt.Status] | None = None
-  def __post_init__(self):
-    if self.execution_status is not None:
-      for status in self.execution_status.values():
-        if status.has_error:
-          assert isinstance(status.error, pg.ErrorInfo)
-          self.error = status.error
-          break
   @property
   def is_processed(self) -> bool:
     """Returns whether the item has been processed."""
@@ -182,15 +185,23 @@ class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
     extra_flags = extra_flags or {}
     num_examples = extra_flags.get('num_examples', None)
-    def _metric_metadata_badge(key, value):
-      if isinstance(value, bool) and bool:
-        text = key
-      else:
-        text = f'{key}:{value}'
-      return pg.views.html.controls.Badge(
-          text,
-          css_classes=[pg.utils.camel_to_snake(key, '-')],
-      )
+    def _metric_label_group(metric_metadata: dict[str, Any] | None):
+      """Renders a label group for metric metadata."""
+      badges = []
+      if metric_metadata:
+        for metric_name, metadata in metric_metadata.items():
+          assert isinstance(metadata, dict), (metric_name, metadata)
+          for k, v in metadata.items():
+            css_class = k
+            if isinstance(v, bool):
+              css_class += '_true' if v else '_false'
+            badge = pg.views.html.controls.Badge(
+                f'{k}:{v}',
+                tooltip=f'{metric_name}: {k}',
+                css_classes=[css_class],
+            )
+            badges.append(badge)
+      return pg.views.html.controls.LabelGroup(badges)
     def _render_header():
       return pg.Html.element(
@@ -229,12 +240,7 @@ class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
                           extra_flags=dict(as_badge=True)
                       ) if self.usage_summary is not None else None,
                       # Metric metadata.
-                      pg.views.html.controls.LabelGroup(
-                          [   # pylint: disable=g-long-ternary
-                              _metric_metadata_badge(k, v)
-                              for k, v in self.metric_metadata.items()
-                          ] if self.metric_metadata else []
-                      ),
+                      _metric_label_group(self.metric_metadata)
                   ],
                   css_classes=['example-container'],
               )
@@ -305,18 +311,18 @@ class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
           color: black;
         }
         /* Badge styles. */
-        .eval-example .badge.match {
+        .eval-example .badge.is_correct_true {
           color: green;
           background-color: #dcefbe;
         }
+        .eval-example .badge.is_correct_false {
+          color: orange;
+          background-color: #ffefc4;
+        }
         .eval-example .badge.error {
           color: red;
           background-color: #fdcccc;
         }
-        .eval-example .badge.mismatch {
-          color: orange;
-          background-color: #ffefc4;
-        }
         .eval-example .badge.score {
           color: blue;
           background-color: #c4dced;

langfun/core/eval/v2/example_test.py CHANGED Viewed

@@ -32,9 +32,9 @@ class ExampleTest(unittest.TestCase):
             name='evaluation', elapse=1.0, error=error
         )
     })
-    self.assertEqual(ex.error, error)
+    self.assertIsNone(ex.error)
     self.assertFalse(ex.is_processed)
-    self.assertTrue(ex.has_error)
+    self.assertFalse(ex.has_error)
     self.assertEqual(ex.elapse, 1.0)
     ex = Example(id=2, output=1)
@@ -116,7 +116,7 @@ class ExampleTest(unittest.TestCase):
         input=pg.Dict(a=1, b=2),
         output=3,
         metadata=dict(sum=3),
-        metric_metadata=dict(match=True),
+        metric_metadata=dict(match=dict(match=True)),
     )
     self.assertNotIn(
         'next',

langfun 0.1.2.dev202510200805__py3-none-any.whl → 0.1.2.dev202511160804__py3-none-any.whl

Potentially problematic release.

langfun 0.1.2.dev202510200805__py3-none-any.whl → 0.1.2.dev202511160804__py3-none-any.whl