langfun 0.1.2.dev202412190804__py3-none-any.whl → 0.1.2.dev202412210804__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
--- a/langfun/core/eval/v2/checkpointing.py
+++ b/langfun/core/eval/v2/checkpointing.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 """Checkpointing evaluation runs."""
 import threading
+import traceback

 import langfun.core as lf
 from langfun.core.eval.v2 import example as example_lib
@@ -27,6 +28,21 @@ Runner = experiment_lib.Runner
 class Checkpointer(experiment_lib.Plugin):
   """Base class for checkpointing evaluation examples."""

+  def on_experiment_start(self, experiment: Experiment):
+    if experiment.state.evaluated_examples:
+      experiment.info(
+          'Loaded %d examples from checkpoint files. Example IDs: %s' %
+          (
+              len(experiment.state.evaluated_examples),
+              list(sorted(experiment.state.evaluated_examples.keys()))
+          ),
+      )
+    else:
+      experiment.info(
+          'No previous evaluated examples are loaded. '
+          f'Experiment {experiment.id} starts from scratch.'
+      )
+

 class PerExampleCheckpointer(Checkpointer):
   """Checkpointer that saves each example to a separate file."""
@@ -50,6 +66,10 @@ class PerExampleCheckpointer(Checkpointer):

     # For refresh runs, we don't want to load the previous state.
     if not runner.current_run.refresh:
+      if runner.current_run.input_root != runner.current_run.output_root:
+        experiment.info(
+            f'Warm starting from directory: {runner.current_run.input_root}.'
+        )
       def _load_state(ckpt_file):
         experiment.load_state(ckpt_file)

@@ -68,10 +88,11 @@ class PerExampleCheckpointer(Checkpointer):
           _load_state, ckpt_files, max_workers=64,
       ):
         if error is not None:
-          pg.logging.warning(
-              'Failed to load checkpoint file %s: %s. Skipping the file.',
-              ckpt_file, error
+          experiment.warning(
+              f'Failed to load checkpoint file {ckpt_file}: {error}. '
+              'Skipping the file.'
           )
+    super().on_experiment_start(experiment)

   def on_example_complete(
       self,
@@ -80,7 +101,11 @@ class PerExampleCheckpointer(Checkpointer):
       example: Example,
   ) -> None:
     """Saves the example to the checkpoint file."""
-    if not example.has_error:
+    if example.has_error:
+      experiment.warning(
+          f'Example {example.id} has error. Skipping checkpointing.'
+      )
+    else:
       def save_state(example: Example):
         writer = SequenceWriter(
             runner.current_run.output_path_for(
@@ -91,8 +116,18 @@ class PerExampleCheckpointer(Checkpointer):
                 )
             )
         )
-        writer.add(example)
-        writer.close()
+        try:
+          writer.add(example)
+          writer.close()
+          experiment.info(
+              f'Example {example.id} is saved to {writer.path}.',
+          )
+        except BaseException as e:  # pylint: disable=broad-except
+          experiment.error(
+              f'Failed to save example {example.id} to {writer.path}. '
+              f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+          )
+          raise e
       runner.background_run(save_state, example)

   def _file_prefix_and_ext(self, filename: str) -> tuple[str, str]:
@@ -150,6 +185,10 @@ class BulkCheckpointer(Checkpointer):
       return
     # For refresh runs, we don't want to load the previous state.
     if not runner.current_run.refresh:
+      if runner.current_run.input_root != runner.current_run.output_root:
+        experiment.info(
+            f'Warm starting from directory: {runner.current_run.input_root}.'
+        )
       experiment.load_state(
           runner.current_run.input_path_for(
               experiment, self.checkpoint_filename
@@ -164,6 +203,7 @@
     with self._lock:
       if self._sequence_writer is not None:
         self._sequence_writer[experiment.id] = sequence_writer
+    super().on_experiment_start(experiment)

   def on_experiment_complete(
       self,
@@ -178,8 +218,12 @@ class BulkCheckpointer(Checkpointer):
     if self._sequence_writer is not None:
       # Make sure the writer is closed without delay so the file will be
       # available immediately.
-      self._sequence_writer[experiment.id].close()
-      del self._sequence_writer[experiment.id]
+      writer = self._sequence_writer.pop(experiment.id)
+      writer.close()
+      experiment.info(
+          f'{len(experiment.state.evaluated_examples)} examples are '
+          f'checkpointed to {writer.path}.'
+      )

   def on_example_complete(
       self,
@@ -189,8 +233,22 @@ class BulkCheckpointer(Checkpointer):
   ) -> None:
     """Saves the example to the checkpoint file."""
     assert experiment.id in self._sequence_writer
-    if not example.has_error:
-      runner.background_run(self._sequence_writer[experiment.id].add, example)
+    if example.has_error:
+      experiment.warning(
+          f'Example {example.id} has error. Skipping checkpointing.'
+      )
+    else:
+      def _save_example(example: Example):
+        writer = self._sequence_writer[experiment.id]
+        try:
+          writer.add(example)
+        except BaseException as e:  # pylint: disable=broad-except
+          experiment.error(
+              f'Failed to save example {example.id} to {writer.path}. '
+              f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+          )
+          raise e
+      runner.background_run(_save_example, example)


 class SequenceWriter:
@@ -198,8 +256,13 @@ class SequenceWriter:

   def __init__(self, path: str):
     self._lock = threading.Lock()
+    self._path = path
     self._sequence_writer = pg.io.open_sequence(path, 'w')

+  @property
+  def path(self) -> str:
+    return self._path
+
   def add(self, example: Example):
     example_blob = pg.to_json_str(
         example,
--- a/langfun/core/eval/v2/checkpointing_test.py
+++ b/langfun/core/eval/v2/checkpointing_test.py
@@ -16,9 +16,9 @@ import tempfile
 import unittest

 from langfun.core.eval.v2 import checkpointing
+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
-from langfun.core.eval.v2 import test_helper
 import pyglove as pg

 Example = example_lib.Example
@@ -56,7 +56,7 @@ class PerExampleCheckpointerTest(unittest.TestCase):

   def test_checkpointing(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'per_example_checkpointer')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     checkpoint_filename = 'checkpoint.jsonl'
     checkpointer = checkpointing.PerExampleCheckpointer(checkpoint_filename)
     run = experiment.run(
@@ -89,7 +89,7 @@ class BulkCheckpointerTest(unittest.TestCase):

   def test_checkpointing(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_bulk_checkpointer')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     checkpoint_filename = 'checkpoint.jsonl'
     checkpointer = checkpointing.BulkCheckpointer(checkpoint_filename)
     run = experiment.run(
--- a/langfun/core/eval/v2/evaluation.py
+++ b/langfun/core/eval/v2/evaluation.py
@@ -285,36 +285,43 @@ class Evaluation(experiment_lib.Experiment):
   # Evaluation-level logging.
   #

-  def _log(self, level: lf.logging.LogLevel, message: str, **kwargs):
+  def _log(self, log_func, level: lf.logging.LogLevel, message: str, **kwargs):
+    # Write to external logging system.
+    log_message = f'{self.id}: {message}'
+    if kwargs:
+      log_message = f'{log_message} (metadata: {kwargs!r})'
+    log_func(log_message)
+
+    # Add to experiment log history.
+    log_entry = lf.logging.LogEntry(
+        level=level,
+        time=datetime.datetime.now(),
+        message=message,
+        metadata=kwargs,
+    )
     with self._log_lock:
-      self._log_entries.append(
-          lf.logging.LogEntry(
-              level=level,
-              time=datetime.datetime.now(),
-              message=message,
-              metadata=kwargs,
-          )
-      )
+      self._log_entries.append(log_entry)

   def debug(self, message: str, **kwargs):
     """Logs a debug message to the session."""
-    self._log('debug', message, **kwargs)
+    self._log(pg.logging.debug, 'debug', message, **kwargs)

   def info(self, message: str, **kwargs):
     """Logs an info message to the session."""
-    self._log('info', message, **kwargs)
+    self._log(pg.logging.info, 'info', message, **kwargs)

   def warning(self, message: str, **kwargs):
     """Logs a warning message to the session."""
-    self._log('warning', message, **kwargs)
+    self._log(pg.logging.warning, 'warning', message, **kwargs)

   def error(self, message: str, **kwargs):
     """Logs an error message to the session."""
-    self._log('error', message, **kwargs)
+    self._log(pg.logging.error, 'error', message, **kwargs)

   def fatal(self, message: str, **kwargs):
     """Logs a fatal message to the session."""
-    self._log('fatal', message, **kwargs)
+    # We use error level for fatal message, which does not trigger assertion.
+    self._log(pg.logging.error, 'fatal', message, **kwargs)

   #
   # HTML views.
--- a/langfun/core/eval/v2/evaluation_test.py
+++ b/langfun/core/eval/v2/evaluation_test.py
@@ -15,12 +15,11 @@ import os
 import tempfile
 import unittest

+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import evaluation as evaluation_lib
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import experiment as experiment_lib

-from langfun.core.eval.v2 import test_helper
-
 import pyglove as pg

 Example = example_lib.Example
@@ -32,17 +31,23 @@ Run = experiment_lib.Run
 class EvaluationTest(unittest.TestCase):

   def test_hyper_evaluation(self):
-    exp = test_helper.TestEvaluation(
-        lm=test_helper.TestLLM(offset=pg.oneof(range(3)))
+    exp = eval_test_helper.TestEvaluation(
+        lm=eval_test_helper.TestLLM(offset=pg.oneof(range(3)))
     )
     self.assertFalse(exp.is_leaf)
     self.assertTrue(
         pg.eq(
             exp.children,
             [
-                test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=0)),
-                test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=1)),
-                test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=2)),
+                eval_test_helper.TestEvaluation(
+                    lm=eval_test_helper.TestLLM(offset=0)
+                ),
+                eval_test_helper.TestEvaluation(
+                    lm=eval_test_helper.TestLLM(offset=1)
+                ),
+                eval_test_helper.TestEvaluation(
+                    lm=eval_test_helper.TestLLM(offset=2)
+                ),
             ]
         )
     )
@@ -57,19 +62,21 @@ class EvaluationTest(unittest.TestCase):
     )

   def test_input(self):
-    exp = test_helper.TestEvaluation()
+    exp = eval_test_helper.TestEvaluation()
     self.assertEqual(exp.num_examples, 10)
-    exp = test_helper.TestEvaluation(inputs=test_helper.test_inputs(None))
+    exp = eval_test_helper.TestEvaluation(
+        inputs=eval_test_helper.test_inputs(None)
+    )
     self.assertEqual(exp.num_examples, 20)
     @pg.functor
     def my_inputs():
       yield pg.Dict(x=1, y=2)
       yield pg.Dict(x=3, y=4)
-    exp = test_helper.TestEvaluation(inputs=my_inputs())
+    exp = eval_test_helper.TestEvaluation(inputs=my_inputs())
     self.assertEqual(exp.num_examples, 2)

   def test_evaluate(self):
-    exp = test_helper.TestEvaluation()
+    exp = eval_test_helper.TestEvaluation()
     example = exp.evaluate(Example(id=3))
     self.assertIs(exp.state.get(3), example)
     self.assertTrue(example.newly_processed)
@@ -85,7 +92,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertIsNotNone(example.start_time)
     self.assertIsNotNone(example.end_time)

-    exp = test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=1))
+    exp = eval_test_helper.TestEvaluation(lm=eval_test_helper.TestLLM(offset=1))
     example = exp.evaluate(3)
     self.assertTrue(example.newly_processed)
     self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
@@ -109,7 +116,7 @@ class EvaluationTest(unittest.TestCase):
     pg.io.mkdirs(eval_dir, exist_ok=True)
     state_file = os.path.join(eval_dir, 'state.jsonl')
     with pg.io.open_sequence(state_file, 'w') as f:
-      exp = test_helper.TestEvaluation()
+      exp = eval_test_helper.TestEvaluation()
       example = exp.evaluate(3)
       self.assertTrue(example.newly_processed)
       self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
@@ -132,7 +139,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(example.usage_summary.uncached.total.num_requests, 0)

   def test_html_view(self):
-    exp = test_helper.TestEvaluation()
+    exp = eval_test_helper.TestEvaluation()
     exp.debug('debug message')
     exp.info('info message')
     exp.warning('warning message', x=1)
--- a/langfun/core/eval/v2/experiment.py
+++ b/langfun/core/eval/v2/experiment.py
@@ -959,6 +959,14 @@ class Plugin(lf.Component):
   ) -> None:
     """Called when an experiment (both leaf and non-leaf) is complete."""

+  def on_experiment_abort(
+      self,
+      runner: Runner,
+      experiment: Experiment,
+      error: BaseException,
+  ) -> None:
+    """Called when an experiment (both leaf and non-leaf) is aborted."""
+
   def on_example_start(
       self,
       runner: Runner,
--- a/langfun/core/eval/v2/progress_tracking_test.py
+++ b/langfun/core/eval/v2/progress_tracking_test.py
@@ -18,9 +18,9 @@ import tempfile
 import unittest

 from langfun.core import console as lf_console
+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import progress_tracking  # pylint: disable=unused-import
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
-from langfun.core.eval.v2 import test_helper
 import pyglove as pg


@@ -35,7 +35,7 @@ class HtmlProgressTrackerTest(unittest.TestCase):
         display=display
     )
     root_dir = os.path.join(tempfile.gettempdir(), 'test_html_progress_tracker')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     _ = experiment.run(root_dir, 'new', plugins=[])
     self.assertIsInstance(result['view'], pg.Html)
     lf_console._notebook = None
@@ -45,7 +45,7 @@ class TqdmProgressTrackerTest(unittest.TestCase):

   def test_basic(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_tqdm_progress_tracker')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     string_io = io.StringIO()
     with contextlib.redirect_stderr(string_io):
       _ = experiment.run(root_dir, 'new', plugins=[])
@@ -55,7 +55,7 @@
     root_dir = os.path.join(
         tempfile.gettempdir(), 'test_tqdm_progress_tracker_with_example_ids'
     )
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     string_io = io.StringIO()
     with contextlib.redirect_stderr(string_io):
       _ = experiment.run(root_dir, 'new', example_ids=[1], plugins=[])
--- a/langfun/core/eval/v2/reporting.py
+++ b/langfun/core/eval/v2/reporting.py
@@ -14,6 +14,7 @@
 """Reporting evaluation results."""

 import time
+import traceback
 from typing import Annotated

 from langfun.core.eval.v2 import example as example_lib
@@ -61,6 +62,14 @@ class HtmlReporter(experiment_lib.Plugin):
   ) -> None:
     self._maybe_update_summary(runner, force=True)

+  def on_run_abort(
+      self,
+      runner: Runner,
+      root: Experiment,
+      error: BaseException
+  ) -> None:
+    self._maybe_update_summary(runner, force=True)
+
   def on_experiment_start(
       self,
       runner: Runner,
@@ -75,6 +84,16 @@
     if experiment.is_leaf:
       self._maybe_update_experiment_html(runner, experiment, force=True)

+  def on_experiment_abort(
+      self,
+      runner: Runner,
+      experiment: Experiment,
+      error: BaseException
+  ) -> None:
+    del error
+    assert experiment.is_leaf
+    self._maybe_update_experiment_html(runner, experiment, force=True)
+
   def on_example_complete(
       self, runner: Runner, experiment: Experiment, example: Example
   ):
@@ -103,19 +122,26 @@
       self, runner: Runner, experiment: Experiment, force: bool = False
   ) -> None:
     def _save():
-      html = experiment.to_html(
-          collapse_level=None,
-          extra_flags=dict(
-              current_run=runner.current_run,
-              interactive=False,
-              card_view=False,
-          ),
-      )
-      html.save(
-          runner.current_run.output_path_for(
-              experiment, _EVALULATION_DETAIL_FILE
-          )
+      index_html_path = runner.current_run.output_path_for(
+          experiment, _EVALULATION_DETAIL_FILE
       )
+      try:
+        html = experiment.to_html(
+            collapse_level=None,
+            extra_flags=dict(
+                current_run=runner.current_run,
+                interactive=False,
+                card_view=False,
+            ),
+        )
+        html.save(index_html_path)
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to save HTML {index_html_path!r}. '
+            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+        )
+        raise e
+
     if force or (
         time.time() - self._last_experiment_report_time[experiment.id]
         > self.experiment_report_interval
@@ -128,17 +154,24 @@
   ) -> None:
     """Saves the example."""
     def _save():
-      html = example.to_html(
-          collapse_level=None,
-          enable_summary_tooltip=False,
-          extra_flags=dict(
-              # For properly rendering the next link.
-              num_examples=getattr(experiment, 'num_examples', None)
-          ),
-      )
-      html.save(
-          runner.current_run.output_path_for(
-              experiment, f'{example.id}.html'
-          )
-      )
+      try:
+        html = example.to_html(
+            collapse_level=None,
+            enable_summary_tooltip=False,
+            extra_flags=dict(
+                # For properly rendering the next link.
+                num_examples=getattr(experiment, 'num_examples', None)
+            ),
+        )
+        html.save(
+            runner.current_run.output_path_for(
+                experiment, f'{example.id}.html'
+            )
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to save HTML {example.id}.html. '
+            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+        )
+        raise e
     runner.background_run(_save)
--- a/langfun/core/eval/v2/reporting_test.py
+++ b/langfun/core/eval/v2/reporting_test.py
@@ -15,9 +15,9 @@ import os
 import tempfile
 import unittest

+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import reporting
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
-from langfun.core.eval.v2 import test_helper
 import pyglove as pg


@@ -25,7 +25,7 @@ class ReportingTest(unittest.TestCase):

   def test_reporting(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_reporting')
-    experiment = test_helper.test_experiment()
+    experiment = eval_test_helper.test_experiment()
     reporter = reporting.HtmlReporter()
     run = experiment.run(root_dir, 'new', plugins=[reporter])
     pg.io.path_exists(run.output_path_for(experiment, 'summary.html'))
--- a/langfun/core/eval/v2/runners.py
+++ b/langfun/core/eval/v2/runners.py
@@ -18,6 +18,7 @@ import concurrent.futures
 import random
 import threading
 import time
+import traceback
 from typing import Any, Annotated, Callable, Iterator

 from langfun import core as lf
@@ -64,6 +65,7 @@ class RunnerBase(Runner):
     with pg.notify_on_change(False):
       self.plugins.append(progress_tracking.progress_tracker(self.tqdm))

+    self._io_pool_lock = threading.Lock()
     self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
     # TODO(daiyip): render background errors.
     self._background_last_error = None
@@ -75,7 +77,10 @@
         func(*args, **kwargs)
       except Exception as e:  # pylint: disable=broad-except
         self._background_last_error = e
-    self._io_pool.submit(_background_run, *args, **kwargs)
+
+    with self._io_pool_lock:
+      if self._io_pool is not None:
+        self._io_pool.submit(_background_run, *args, **kwargs)

   def _all_plugins(self, experiment: Experiment) -> Iterator[Plugin]:
     """Returns all plugins for the experiment."""
@@ -120,9 +125,14 @@
     # Start the progress of the evaluation.
     if experiment.is_leaf:
       assert isinstance(experiment, Evaluation)
-      experiment.progress.start(
-          total=(len(self.current_run.example_ids)
-                 if self.current_run.example_ids else experiment.num_examples)
+      num_examples_to_evaluate = (
+          len(self.current_run.example_ids)
+          if self.current_run.example_ids else experiment.num_examples
+      )
+      experiment.progress.start(total=num_examples_to_evaluate)
+      experiment.info(
+          'Starting evaluation %s with %d examples to evaluate.'
+          % (experiment.id, num_examples_to_evaluate)
       )
     else:
       experiment.progress.start(total=len(experiment.leaf_nodes))
@@ -144,8 +154,7 @@

     # Only leaf evaluations will trigger the complete notification of the
     # ancestors.
-    if experiment.is_leaf:
-      self._update_ancestor_progresses(experiment)
+    self._update_ancestor_progresses(experiment)

   def on_experiment_complete(self, experiment: Experiment) -> None:
     """Called when an evaluation is complete."""
@@ -160,6 +169,35 @@
     # ancestors.
     if experiment.is_leaf:
       self._update_ancestor_progresses(experiment)
+      self._log_experiment_completion(experiment)
+
+  def _log_experiment_completion(self, experiment: Experiment):
+    example_ids = (
+        self.current_run.example_ids if self.current_run.example_ids else
+        list(range(1, experiment.num_examples + 1))
+    )
+    num_from_checkpoint, num_processed = 0, 0
+    for example_id in example_ids:
+      example = experiment.state.get(example_id)
+      if example.newly_processed:
+        num_processed += 1
+      else:
+        num_from_checkpoint += 1
+    experiment.info(
+        f'{experiment.id} completed with {num_from_checkpoint + num_processed} '
+        f'examples evaluated ({num_from_checkpoint} from checkpoint, '
+        f'{num_processed} newly processed).'
+    )
+
+  def on_experiment_abort(
+      self, experiment: Experiment, error: BaseException) -> None:
+    """Called when an evaluation is aborted."""
+    assert experiment.is_leaf
+    experiment.fatal(f'{error}\n\n{traceback.format_exc()}')
+
+    # Notify the plugins of the experiment abort.
+    for plugin in self._all_plugins(experiment):
+      plugin.on_experiment_abort(self, experiment, error)

   def _update_ancestor_progresses(self, experiment: Experiment):
     """Updates the progresses of the parent nodes of the experiment."""
@@ -262,7 +300,9 @@
       self.background_run(cache.save)

     # Wait for the background tasks to finish.
-    self._io_pool.shutdown(wait=True)
+    with self._io_pool_lock:
+      self._io_pool, io_pool = None, self._io_pool
+    io_pool.shutdown(wait=True)

   @abc.abstractmethod
   def _run(self, evaluations: list[Evaluation]) -> None:
@@ -270,31 +310,36 @@
   def run_evaluation(self, evaluation: Evaluation) -> None:
     """Runs the evaluation."""
-    self.on_experiment_start(evaluation)
-
-    per_evaluation_settings = {}
-    cache = None
-    if self.current_run.use_cache == 'per_dataset':
-      cache = self._load_or_create_cache(evaluation)
-      per_evaluation_settings['cache'] = cache
-
-    with lf.use_settings(**per_evaluation_settings):
-      if self.current_run.example_ids is None:
-        items = (
-            Example(id=i + 1, input=ex) for i, ex in enumerate(
-                evaluation.example_inputs)
-        )
-      else:
-        items = (
-            Example(
-                id=example_id, input=evaluation.example_input_by_id(example_id)
-            ) for example_id in self.current_run.example_ids
-        )
-      self._evaluate_items(evaluation, items)
-
-    if cache:
-      self.background_run(cache.save)
-    self.on_experiment_complete(evaluation)
+    try:
+      self.on_experiment_start(evaluation)
+
+      per_evaluation_settings = {}
+      cache = None
+      if self.current_run.use_cache == 'per_dataset':
+        cache = self._load_or_create_cache(evaluation)
+        per_evaluation_settings['cache'] = cache
+
+      with lf.use_settings(**per_evaluation_settings):
+        if self.current_run.example_ids is None:
+          items = (
+              Example(id=i + 1, input=ex) for i, ex in enumerate(
+                  evaluation.example_inputs)
+          )
+        else:
+          items = (
+              Example(
+                  id=example_id,
+                  input=evaluation.example_input_by_id(example_id)
+              ) for example_id in self.current_run.example_ids
+          )
+        self._evaluate_items(evaluation, items)
+
+      if cache:
+        self.background_run(cache.save)
+      self.on_experiment_complete(evaluation)
+    except BaseException as e:  # pylint: disable=broad-except
+      self.on_experiment_abort(evaluation, e)
+      raise e

   @abc.abstractmethod
   def _evaluate_items(
@@ -410,9 +455,7 @@ class ParallelRunner(RunnerBase):
         groups.values(),
         max_workers=max(64, len(groups)),
         timeout=self.timeout,
-        silence_on_errors=(
-            None if self.current_run.raise_if_has_error else BaseException
-        )
+        silence_on_errors=None,
     ):
       pass

@@ -437,8 +480,6 @@
         items,
         max_workers=evaluation.max_workers,
         timeout=self.timeout,
-        silence_on_errors=(
-            None if self.current_run.raise_if_has_error else BaseException
-        )
+        silence_on_errors=None,
     ):
       pass
--- a/langfun/core/eval/v2/runners_test.py
+++ b/langfun/core/eval/v2/runners_test.py
@@ -18,10 +18,11 @@ import time
 from typing import Any
 import unittest

+from langfun.core.eval.v2 import eval_test_helper
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import experiment as experiment_lib
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
-from langfun.core.eval.v2 import test_helper
+
 import pyglove as pg


@@ -101,7 +102,7 @@

   def test_basic(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(tempfile.gettempdir(), 'test_sequential_runner')
     run = exp.run(root_dir, runner='sequential', plugins=[plugin])

@@ -143,7 +144,7 @@

   def test_raise_if_has_error(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_raise_if_has_error')
-    exp = test_helper.TestEvaluation()
+    exp = eval_test_helper.TestEvaluation()
     with self.assertRaisesRegex(ValueError, 'x should not be 5'):
       exp.run(
           root_dir, runner='sequential', plugins=[], raise_if_has_error=True
@@ -154,7 +155,7 @@

   def test_example_ids(self):
     root_dir = os.path.join(tempfile.gettempdir(), 'test_example_ids')
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     plugin = TestPlugin()
     _ = exp.run(
         root_dir, runner='sequential', plugins=[plugin], example_ids=[5, 7, 9]
@@ -164,7 +165,7 @@

   def test_filter(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(tempfile.gettempdir(), 'test_filter')

     _ = exp.run(
@@ -193,7 +194,7 @@
         ) for i in range(num_examples)
     ]

-    exp = test_helper.TestEvaluation(
+    exp = eval_test_helper.TestEvaluation(
         inputs=test_inputs(num_examples=pg.oneof([2, 4]))
     )
     # Global cache.
@@ -234,7 +235,7 @@ class ParallelRunnerTest(RunnerTest):

   def test_parallel_runner(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(tempfile.gettempdir(), 'test_parallel_runner')
     run = exp.run(root_dir, runner='parallel', plugins=[plugin])

@@ -274,7 +275,7 @@

   def test_concurrent_startup_delay(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(
         tempfile.gettempdir(), 'test_concurrent_startup_delay'
     )
@@ -290,7 +291,7 @@ class DebugRunnerTest(RunnerTest):

   def test_debug_runner(self):
     plugin = TestPlugin()
-    exp = test_helper.test_experiment()
+    exp = eval_test_helper.test_experiment()
     root_dir = os.path.join(tempfile.gettempdir(), 'test_debug_runner')
     run = exp.run(root_dir, runner='debug', plugins=[plugin])

--- a/langfun/core/structured/querying.py
+++ b/langfun/core/structured/querying.py
@@ -583,7 +583,16 @@ class QueryInvocation(pg.Object, pg.views.HtmlTreeView.Extension):

   @functools.cached_property
   def output(self) -> Any:
-    return query_output(self.lm_response, self.schema)
+    """The output of `lf.query`. If it failed, returns the `MappingError`."""
+    try:
+      return query_output(self.lm_response, self.schema)
+    except mapping.MappingError as e:
+      return e
+
+  @property
+  def has_error(self) -> bool:
+    """Returns True if the query failed to generate a valid output."""
+    return isinstance(self.output, BaseException)

   @property
   def elapse(self) -> float:
--- a/langfun/core/structured/querying_test.py
+++ b/langfun/core/structured/querying_test.py
@@ -1051,6 +1051,16 @@ class QueryStructureJsonTest(unittest.TestCase):

 class QueryInvocationTest(unittest.TestCase):

+  def test_basics(self):
+    lm = fake.StaticSequence([
+        'Activity(description="hi"',
+    ])
+    with querying.track_queries() as queries:
+      querying.query('foo', Activity, default=None, lm=lm)
+
+    self.assertTrue(queries[0].has_error)
+    self.assertIsInstance(queries[0].output, mapping.MappingError)
+
   def test_to_html(self):
     lm = fake.StaticSequence([
         'Activity(description="hi")',
--- a/langfun-0.1.2.dev202412190804.dist-info/METADATA
+++ b/langfun-0.1.2.dev202412210804.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: langfun
-Version: 0.1.2.dev202412190804
+Version: 0.1.2.dev202412210804
 Summary: Langfun: Language as Functions.
 Home-page: https://github.com/google/langfun
 Author: Langfun Authors
--- a/langfun-0.1.2.dev202412190804.dist-info/RECORD
+++ b/langfun-0.1.2.dev202412210804.dist-info/RECORD
@@ -58,13 +58,14 @@ langfun/core/eval/patching_test.py,sha256=8kCd54Egjju22FMgtJuxEsrXkW8ifs-UUBHtrC
 langfun/core/eval/scoring.py,sha256=B69IsIxiPs1xZcOBFIhZF70YmDue2Siik-CPL2bh33s,6254
 langfun/core/eval/scoring_test.py,sha256=O8olHbrUEg60gMxwOkWzKBJZpZoUlmVnBANX5Se2SXM,4546
 langfun/core/eval/v2/__init__.py,sha256=qoa6zKdFXOFyCX6vay6OdgPf1eUhYGoHYAxe35qECGk,1628
-langfun/core/eval/v2/checkpointing.py,sha256=8vxH3AfIBS8dxA0IiOZBUxAHXIx5m2tSWSSumDLpzp8,6546
-langfun/core/eval/v2/checkpointing_test.py,sha256=dAERKQTW_PM1B0oUauB0YVQkMEI-cgJq0q-wAVlGYpU,4383
-langfun/core/eval/v2/evaluation.py,sha256=7PC-npbEQjwwv0pWbv8vGi_OkzZ7QpJrEpYoixFBlno,21429
-langfun/core/eval/v2/evaluation_test.py,sha256=ld8oBOjsfN-LNLL2eViSTu17wAq90GcsfURXX6oVlFo,6014
+langfun/core/eval/v2/checkpointing.py,sha256=zr2hxOjm6Hdq71sYTsbQtL_CwQOWr-Ir9T5TPUnhqMI,8741
+langfun/core/eval/v2/checkpointing_test.py,sha256=Imy96lwDkvmtj-1YFpP2DZukMOoYqpPov2J_MsQKxxI,4398
+langfun/core/eval/v2/eval_test_helper.py,sha256=pDpZTBnWRR5xjJv3Uy3NWEzArqlL8FTMOgeR4C53F5M,2348
+langfun/core/eval/v2/evaluation.py,sha256=NFBGAWw2BtW7H0zcoZhfWtz59Psra84eshJm73uAFwg,21807
+langfun/core/eval/v2/evaluation_test.py,sha256=GmV1TiqX1V15st2qpcGWooM5hudomQVjW5kajovGDvE,6231
 langfun/core/eval/v2/example.py,sha256=fURrvdNmMsVMqoEErcsmLmC6Xq3ny16dYsnLH8HVlcY,9626
 langfun/core/eval/v2/example_test.py,sha256=WcJmU7IQQXvjFia63mokySC4CqxzVL9Wso1sC5F0YK8,3032
-langfun/core/eval/v2/experiment.py,sha256=0JBGckJ93aqSdffpJPDVPy_I5T2BXscghTxiglHzJWo,29556
+langfun/core/eval/v2/experiment.py,sha256=xfk4aNZ3dH46y0lWSS_fC7JpfJCG77Z5qsakV4gHcOs,29762
 langfun/core/eval/v2/experiment_test.py,sha256=zSMHYqC9cA0k61U71pCSYTAJ6yK2_b6Dml5btc-bKzQ,9133
 langfun/core/eval/v2/metric_values.py,sha256=_B905bC-jxrYPLSEcP2M8MaHZOVMz_bVrUw8YC4arCE,4660
 langfun/core/eval/v2/metric_values_test.py,sha256=ab2oF_HsIwrSy459108ggyjgefHSPn8UVILR4dRwx14,2634
@@ -73,12 +74,11 @@ langfun/core/eval/v2/metrics_test.py,sha256=p4FzLJsE8XAzAQuyP9hfEf9YeKWZ__PO_ue8
 langfun/core/eval/v2/progress.py,sha256=azZgssQgNdv3IgjKEaQBuGI5ucFDNbdi02P4z_nQ8GE,10292
 langfun/core/eval/v2/progress_test.py,sha256=YU7VHzmy5knPZwj9vpBN3rQQH2tukj9eKHkuBCI62h8,2540
 langfun/core/eval/v2/progress_tracking.py,sha256=l9fEkz4oP5McpZzf72Ua7PYm3lAWtRru7gRWNf8H0ms,6083
-langfun/core/eval/v2/progress_tracking_test.py,sha256=iO-DslCJWncU7-27XaMKxDeKrsGbwdk_tKfoRk3KboE,2271
-langfun/core/eval/v2/reporting.py,sha256=TGkli1IDwqfqsCJ_WslOMGk_24JDg7oRRTGXlAJlWpc,4361
-langfun/core/eval/v2/reporting_test.py,sha256=JxffbUPWInUyLjo-AQVFrllga884Mdfm05R86FtxSss,1482
-langfun/core/eval/v2/runners.py,sha256=nh5qIAkdlY1MohDfiPkFcCY_By1SN0A1SOqmaShGziM,14339
-langfun/core/eval/v2/runners_test.py,sha256=UeiUNygux_U6iGVG18rhp68ZE4hoWeoT6XsXvSjxNQg,11620
-langfun/core/eval/v2/test_helper.py,sha256=pDpZTBnWRR5xjJv3Uy3NWEzArqlL8FTMOgeR4C53F5M,2348
+langfun/core/eval/v2/progress_tracking_test.py,sha256=fouMVJkFJqHjbhQJngGLGCmA9x3n0dU4USI2dY163mg,2291
+langfun/core/eval/v2/reporting.py,sha256=vsh45GLVnA7GMU-8cvNYOt4Nb7mEwvcguhO-BSXSzTE,5358
+langfun/core/eval/v2/reporting_test.py,sha256=4nobW6pcaatiZh8u4xciexciaiZNDlDoJci157Wp_RI,1492
+langfun/core/eval/v2/runners.py,sha256=t6_yHAJ4HWufK4wvh_OntKcok2KquA5ARIHIk1vvEwc,15870
+langfun/core/eval/v2/runners_test.py,sha256=A37fKK2MvAVTiShsg_laluJzJ9AuAQn52k7HPbfD0Ks,11666
 langfun/core/llms/__init__.py,sha256=lWXKjGHv66ShG7AE_Bc4QM7SDTxJdfoQMn3PF0lr0sU,6461
 langfun/core/llms/anthropic.py,sha256=afKZmdiLcosS_UEBlB8WKyf1K-zeXgwtPAx6ofg2Gww,13989
 langfun/core/llms/anthropic_test.py,sha256=-2U4kc_pgBM7wqxu8RuxzyHPGww1EAWqKUvN4PW8Btw,8058
@@ -129,8 +129,8 @@ langfun/core/structured/mapping.py,sha256=vLKH79UT-j0qkQdvqlQBO7SkXXuM-yr2Idm8_H
 langfun/core/structured/mapping_test.py,sha256=bHm2ZCXBITq_G8Lvw_olFHeUUc4s_lGXZm9v9JhoPB4,9630
 langfun/core/structured/parsing.py,sha256=MGvI7ypXlwfzr5XB8_TFU9Ei0_5reYqkWkv64eAy0EA,12015
 langfun/core/structured/parsing_test.py,sha256=kNPrhpdPY3iWhUld0TFYU-Zgn44wC0d6YuQ9XdVbQ8o,22346
-langfun/core/structured/querying.py,sha256=sXGhYtiEBac8iOkYOErGXyX8SAHSB1gg69WePhOyGxE,22759
-langfun/core/structured/querying_test.py,sha256=M9Apg83KjQUjT42K9LheBEr74DX3Inwd0YmCanA71kc,31738
+langfun/core/structured/querying.py,sha256=nqvsfMS_KLv5EvO0_VAGEHwY4pHy4S0CvJmeV0HBXlM,23066
+langfun/core/structured/querying_test.py,sha256=YlC4s9LVChfhGZzaXGW1UYlcBnAjNOunu4SLl5_p7PQ,32054
 langfun/core/structured/schema.py,sha256=0VUPSfX1JEQ0xu8WvEymCKK_WSGwBNA-rQD2hATErmU,27912
 langfun/core/structured/schema_generation.py,sha256=U3nRQsqmMZg_qIVDh2fiY3K4JLfsAL1LcKzIFP1iXFg,5316
 langfun/core/structured/schema_generation_test.py,sha256=RM9s71kMNg2jTePwInkiW9fK1ACN37eyPeF8OII-0zw,2950
@@ -148,8 +148,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
 langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
 langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
 langfun/core/templates/selfplay_test.py,sha256=Ot__1P1M8oJfoTp-M9-PQ6HUXqZKyMwvZ5f7yQ3yfyM,2326
-langfun-0.1.2.dev202412190804.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
-langfun-0.1.2.dev202412190804.dist-info/METADATA,sha256=Zr8TfOnhdo83h3aGRNRWXTrJ54h7Sh7E-7Lj95iJVDw,8281
-langfun-0.1.2.dev202412190804.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
-langfun-0.1.2.dev202412190804.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
-langfun-0.1.2.dev202412190804.dist-info/RECORD,,
+langfun-0.1.2.dev202412210804.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+langfun-0.1.2.dev202412210804.dist-info/METADATA,sha256=u3a7ssXSuTdAJuhvtQtGyroed6k5r9HeobDPozhHjJ0,8281
+langfun-0.1.2.dev202412210804.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+langfun-0.1.2.dev202412210804.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
+langfun-0.1.2.dev202412210804.dist-info/RECORD,,