langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512040805__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. langfun/__init__.py +1 -1
  2. langfun/core/__init__.py +7 -1
  3. langfun/core/agentic/__init__.py +8 -1
  4. langfun/core/agentic/action.py +740 -112
  5. langfun/core/agentic/action_eval.py +9 -2
  6. langfun/core/agentic/action_test.py +189 -24
  7. langfun/core/async_support.py +104 -5
  8. langfun/core/async_support_test.py +23 -0
  9. langfun/core/coding/python/correction.py +19 -9
  10. langfun/core/coding/python/execution.py +14 -12
  11. langfun/core/coding/python/generation.py +21 -16
  12. langfun/core/coding/python/sandboxing.py +23 -3
  13. langfun/core/component.py +42 -3
  14. langfun/core/concurrent.py +70 -6
  15. langfun/core/concurrent_test.py +9 -2
  16. langfun/core/console.py +1 -1
  17. langfun/core/data/conversion/anthropic.py +12 -3
  18. langfun/core/data/conversion/anthropic_test.py +8 -6
  19. langfun/core/data/conversion/gemini.py +11 -2
  20. langfun/core/data/conversion/gemini_test.py +48 -9
  21. langfun/core/data/conversion/openai.py +145 -31
  22. langfun/core/data/conversion/openai_test.py +161 -17
  23. langfun/core/eval/base.py +48 -44
  24. langfun/core/eval/base_test.py +5 -5
  25. langfun/core/eval/matching.py +5 -2
  26. langfun/core/eval/patching.py +3 -3
  27. langfun/core/eval/scoring.py +4 -3
  28. langfun/core/eval/v2/__init__.py +2 -0
  29. langfun/core/eval/v2/checkpointing.py +76 -7
  30. langfun/core/eval/v2/checkpointing_test.py +9 -2
  31. langfun/core/eval/v2/config_saver.py +37 -0
  32. langfun/core/eval/v2/config_saver_test.py +36 -0
  33. langfun/core/eval/v2/eval_test_helper.py +104 -3
  34. langfun/core/eval/v2/evaluation.py +92 -17
  35. langfun/core/eval/v2/evaluation_test.py +9 -3
  36. langfun/core/eval/v2/example.py +50 -40
  37. langfun/core/eval/v2/example_test.py +16 -8
  38. langfun/core/eval/v2/experiment.py +84 -15
  39. langfun/core/eval/v2/experiment_test.py +19 -0
  40. langfun/core/eval/v2/metric_values.py +31 -3
  41. langfun/core/eval/v2/metric_values_test.py +32 -0
  42. langfun/core/eval/v2/metrics.py +157 -44
  43. langfun/core/eval/v2/metrics_test.py +39 -18
  44. langfun/core/eval/v2/progress.py +31 -1
  45. langfun/core/eval/v2/progress_test.py +27 -0
  46. langfun/core/eval/v2/progress_tracking.py +13 -5
  47. langfun/core/eval/v2/progress_tracking_test.py +9 -1
  48. langfun/core/eval/v2/reporting.py +90 -71
  49. langfun/core/eval/v2/reporting_test.py +24 -6
  50. langfun/core/eval/v2/runners/__init__.py +30 -0
  51. langfun/core/eval/v2/{runners.py → runners/base.py} +72 -180
  52. langfun/core/eval/v2/runners/beam.py +354 -0
  53. langfun/core/eval/v2/runners/beam_test.py +153 -0
  54. langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
  55. langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
  56. langfun/core/eval/v2/runners/debug.py +40 -0
  57. langfun/core/eval/v2/runners/debug_test.py +76 -0
  58. langfun/core/eval/v2/runners/parallel.py +243 -0
  59. langfun/core/eval/v2/runners/parallel_test.py +182 -0
  60. langfun/core/eval/v2/runners/sequential.py +47 -0
  61. langfun/core/eval/v2/runners/sequential_test.py +169 -0
  62. langfun/core/langfunc.py +45 -130
  63. langfun/core/langfunc_test.py +7 -5
  64. langfun/core/language_model.py +189 -36
  65. langfun/core/language_model_test.py +54 -3
  66. langfun/core/llms/__init__.py +12 -1
  67. langfun/core/llms/anthropic.py +157 -2
  68. langfun/core/llms/azure_openai.py +29 -17
  69. langfun/core/llms/cache/base.py +25 -3
  70. langfun/core/llms/cache/in_memory.py +48 -7
  71. langfun/core/llms/cache/in_memory_test.py +14 -4
  72. langfun/core/llms/compositional.py +25 -1
  73. langfun/core/llms/deepseek.py +30 -2
  74. langfun/core/llms/fake.py +32 -1
  75. langfun/core/llms/gemini.py +64 -12
  76. langfun/core/llms/gemini_test.py +110 -0
  77. langfun/core/llms/google_genai.py +34 -1
  78. langfun/core/llms/groq.py +28 -3
  79. langfun/core/llms/llama_cpp.py +23 -4
  80. langfun/core/llms/openai.py +120 -3
  81. langfun/core/llms/openai_compatible.py +148 -27
  82. langfun/core/llms/openai_compatible_test.py +207 -20
  83. langfun/core/llms/openai_test.py +0 -2
  84. langfun/core/llms/rest.py +16 -1
  85. langfun/core/llms/vertexai.py +58 -8
  86. langfun/core/logging.py +1 -1
  87. langfun/core/mcp/__init__.py +10 -0
  88. langfun/core/mcp/client.py +177 -0
  89. langfun/core/mcp/client_test.py +71 -0
  90. langfun/core/mcp/session.py +241 -0
  91. langfun/core/mcp/session_test.py +54 -0
  92. langfun/core/mcp/testing/simple_mcp_client.py +33 -0
  93. langfun/core/mcp/testing/simple_mcp_server.py +33 -0
  94. langfun/core/mcp/tool.py +254 -0
  95. langfun/core/mcp/tool_test.py +197 -0
  96. langfun/core/memory.py +1 -0
  97. langfun/core/message.py +160 -55
  98. langfun/core/message_test.py +65 -81
  99. langfun/core/modalities/__init__.py +8 -0
  100. langfun/core/modalities/audio.py +21 -1
  101. langfun/core/modalities/image.py +73 -3
  102. langfun/core/modalities/image_test.py +116 -0
  103. langfun/core/modalities/mime.py +64 -3
  104. langfun/core/modalities/mime_test.py +11 -0
  105. langfun/core/modalities/pdf.py +19 -1
  106. langfun/core/modalities/video.py +21 -1
  107. langfun/core/modality.py +167 -29
  108. langfun/core/modality_test.py +42 -12
  109. langfun/core/natural_language.py +1 -1
  110. langfun/core/sampling.py +4 -4
  111. langfun/core/sampling_test.py +20 -4
  112. langfun/core/structured/__init__.py +2 -24
  113. langfun/core/structured/completion.py +34 -44
  114. langfun/core/structured/completion_test.py +23 -43
  115. langfun/core/structured/description.py +54 -50
  116. langfun/core/structured/function_generation.py +29 -12
  117. langfun/core/structured/mapping.py +81 -37
  118. langfun/core/structured/parsing.py +95 -79
  119. langfun/core/structured/parsing_test.py +0 -3
  120. langfun/core/structured/querying.py +230 -154
  121. langfun/core/structured/querying_test.py +69 -33
  122. langfun/core/structured/schema/__init__.py +49 -0
  123. langfun/core/structured/schema/base.py +664 -0
  124. langfun/core/structured/schema/base_test.py +531 -0
  125. langfun/core/structured/schema/json.py +174 -0
  126. langfun/core/structured/schema/json_test.py +121 -0
  127. langfun/core/structured/schema/python.py +316 -0
  128. langfun/core/structured/schema/python_test.py +410 -0
  129. langfun/core/structured/schema_generation.py +33 -14
  130. langfun/core/structured/scoring.py +47 -36
  131. langfun/core/structured/tokenization.py +26 -11
  132. langfun/core/subscription.py +2 -2
  133. langfun/core/template.py +175 -50
  134. langfun/core/template_test.py +123 -17
  135. langfun/env/__init__.py +43 -0
  136. langfun/env/base_environment.py +827 -0
  137. langfun/env/base_environment_test.py +473 -0
  138. langfun/env/base_feature.py +304 -0
  139. langfun/env/base_feature_test.py +228 -0
  140. langfun/env/base_sandbox.py +842 -0
  141. langfun/env/base_sandbox_test.py +1235 -0
  142. langfun/env/event_handlers/__init__.py +14 -0
  143. langfun/env/event_handlers/chain.py +233 -0
  144. langfun/env/event_handlers/chain_test.py +253 -0
  145. langfun/env/event_handlers/event_logger.py +472 -0
  146. langfun/env/event_handlers/event_logger_test.py +304 -0
  147. langfun/env/event_handlers/metric_writer.py +726 -0
  148. langfun/env/event_handlers/metric_writer_test.py +214 -0
  149. langfun/env/interface.py +1640 -0
  150. langfun/env/interface_test.py +153 -0
  151. langfun/env/load_balancers.py +59 -0
  152. langfun/env/load_balancers_test.py +141 -0
  153. langfun/env/test_utils.py +507 -0
  154. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/METADATA +7 -3
  155. langfun-0.1.2.dev202512040805.dist-info/RECORD +217 -0
  156. langfun/core/eval/v2/runners_test.py +0 -343
  157. langfun/core/structured/schema.py +0 -987
  158. langfun/core/structured/schema_test.py +0 -982
  159. langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
  160. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/WHEEL +0 -0
  161. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/licenses/LICENSE +0 -0
  162. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/metrics_test.py
@@ -25,15 +25,22 @@ class MatchTest(unittest.TestCase):
   def test_basic(self):
     m = metrics.Match()  # pylint: disable=invalid-name
     self.assertEqual(
-        m.audit(Example(id=1, input=pg.Dict(groundtruth=1), output=1)),
-        dict(match=True)
+        m.update(Example(id=1, input=pg.Dict(groundtruth=1), output=1)),
+        dict(is_correct=True)
     )
     self.assertEqual(
-        m.audit(Example(id=2, input=pg.Dict(groundtruth=1), output=2)),
-        dict(mismatch=True)
+        m.update(
+            Example(
+                id=2,
+                input=pg.Dict(groundtruth=1),
+                output=2,
+                metric_metadata=dict(match=dict(is_correct=False, x=1))
+            )
+        ),
+        dict(is_correct=False, x=1)
     )
     self.assertEqual(
-        m.audit(
+        m.update(
             Example(
                 id=3,
                 input=pg.Dict(groundtruth=1),
@@ -47,7 +54,7 @@ class MatchTest(unittest.TestCase):
         dict(error='ValueError')
     )
     self.assertEqual(
-        m.audit(
+        m.update(
             Example(
                 id=3,
                 input=pg.Dict(groundtruth=1),
@@ -80,7 +87,7 @@ class MatchTest(unittest.TestCase):
   def test_bad_case(self):
     m = metrics.Match()  # pylint: disable=invalid-name
     with self.assertRaisesRegex(ValueError, '`groundtruth` is not present'):
-      m.audit(Example(id=1, input=pg.Dict(x=1), output=1))
+      m.update(Example(id=1, input=pg.Dict(x=1), output=1))

   def test_custom_metadata(self):

@@ -90,22 +97,36 @@ class MatchTest(unittest.TestCase):

     m = MyMatch()  # pylint: disable=invalid-name
     self.assertEqual(
-        m.audit(Example(id=1, input=pg.Dict(x=1), output=1)),
-        dict(match=True, x=1)
+        m.update(Example(id=1, input=pg.Dict(x=1), output=1)),
+        dict(is_correct=True, x=1)
     )
     self.assertEqual(m.matches, 1.0)

   def test_html_view(self):
     m = metrics.Match()  # pylint: disable=invalid-name
-    m.audit(Example(id=1, input=pg.Dict(groundtruth=1), output=1))
+    m.update(Example(id=1, input=pg.Dict(groundtruth=1), output=1))
     self.assertIn(
         '100.0%',
         m.to_html().content,
     )
     with pg.views.html.controls.HtmlControl.track_scripts() as scripts:
-      m.audit(Example(id=2, input=pg.Dict(groundtruth=1), output=2))
+      m.update(Example(id=2, input=pg.Dict(groundtruth=1), output=2))
     self.assertEqual(len(scripts), 12)

+  def test_merge_from(self):
+    m1 = metrics.Match()
+    m1.update(Example(id=1, input=pg.Dict(groundtruth=1), output=1))
+    m2 = metrics.Match()
+    m2.update(Example(id=2, input=pg.Dict(groundtruth=1), output=2))
+    m1.merge_from(m2)
+    self.assertEqual(m1.matches, 0.5)
+    self.assertEqual(m1.mismatches, 0.5)
+    self.assertEqual(m1.oop_errors, 0.0)
+    self.assertEqual(m1.non_oop_errors, 0.0)
+    self.assertEqual(m1.matches.total, 2)
+    self.assertEqual(len(m1.matches.data_points), 1)
+    self.assertEqual(len(m1.mismatches.data_points), 1)
+

 class ScoreTest(unittest.TestCase):

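Note: taken together, the changes above document the metric API migration in this release: `Metric.audit()` is renamed to `Metric.update()`, per-example metadata keys move from `match`/`mismatch` to `is_correct`, previously computed metadata can be replayed through `Example.metric_metadata`, and per-shard metrics can be combined with `merge_from()`. A minimal sketch of the new surface, assuming the same imports the tests use:

import pyglove as pg
from langfun.core.eval.v2 import example as example_lib
from langfun.core.eval.v2 import metrics

Example = example_lib.Example

m = metrics.Match()
# update() replaces audit() and returns the per-example metric metadata.
assert m.update(
    Example(id=1, input=pg.Dict(groundtruth=1), output=1)
) == dict(is_correct=True)

# An example that already carries metadata (e.g. restored from a checkpoint)
# is replayed rather than re-judged; metadata is keyed by metric name.
ex = Example(
    id=2, input=pg.Dict(groundtruth=1), output=2,
    metric_metadata=dict(match=dict(is_correct=False, x=1)),
)
assert m.update(ex) == dict(is_correct=False, x=1)

# Metrics computed on separate shards can be folded into a global view.
other = metrics.Match()
other.update(Example(id=3, input=pg.Dict(groundtruth=1), output=1))
m.merge_from(other)
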
@@ -118,15 +139,15 @@ class ScoreTest(unittest.TestCase):

     m = MyScore()  # pylint: disable=invalid-name
     self.assertEqual(
-        m.audit(Example(id=1, input=pg.Dict(x=1), output=1)),
+        m.update(Example(id=1, input=pg.Dict(x=1), output=1)),
         dict(score=1 * 1)
     )
     self.assertEqual(
-        m.audit(Example(id=2, input=pg.Dict(x=2), output=2)),
+        m.update(Example(id=2, input=pg.Dict(x=2), output=2)),
         dict(score=2 * 2)
     )
     self.assertEqual(
-        m.audit(
+        m.update(
             Example(
                 id=3,
                 input=pg.Dict(x=1),
@@ -140,7 +161,7 @@ class ScoreTest(unittest.TestCase):
         dict(error='ValueError')
     )
     self.assertEqual(
-        m.audit(
+        m.update(
             Example(
                 id=3,
                 input=pg.Dict(x=1),
@@ -176,7 +197,7 @@ class ScoreTest(unittest.TestCase):

     m = MyScore()  # pylint: disable=invalid-name
     self.assertEqual(
-        m.audit(Example(id=1, input=pg.Dict(x=1), output=1)),
+        m.update(Example(id=1, input=pg.Dict(x=1), output=1)),
         dict(score=1 * 1, x=1)
     )
     self.assertEqual(m.average_score, 1.0)
@@ -189,13 +210,13 @@ class ScoreTest(unittest.TestCase):
         return example_input.x * output

     m = MyScore()  # pylint: disable=invalid-name
-    m.audit(Example(id=1, input=pg.Dict(x=1), output=2))
+    m.update(Example(id=1, input=pg.Dict(x=1), output=2))
     self.assertIn(
         '2.000',
         m.to_html().content,
     )
     with pg.views.html.controls.HtmlControl.track_scripts() as scripts:
-      m.audit(Example(id=2, input=pg.Dict(x=1), output=2))
+      m.update(Example(id=2, input=pg.Dict(x=1), output=2))
     self.assertEqual(len(scripts), 9)


langfun/core/eval/v2/progress.py
@@ -21,7 +21,15 @@ import pyglove as pg


 class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
-  """Evaluation progress."""
+  """Represents and tracks the progress of an evaluation.
+
+  The `Progress` class maintains counts of processed, failed, and skipped
+  items in an evaluation, along with timing information (start time, stop time,
+  duration) and an execution summary. It provides properties to check the
+  status of the evaluation (e.g., `is_started`, `is_completed`) and methods
+  to update progress as items are evaluated.
+  It also supports HTML rendering as a progress bar for visualization.
+  """

   num_total: Annotated[
       int | None,
@@ -84,6 +92,7 @@ class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
         stop_time=None,
         execution_summary=pg.object_utils.TimeIt.StatusSummary(),
     )
+    self._progress_bar = None

   @property
   def num_completed(self) -> int:
@@ -216,6 +225,27 @@ class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
     """Overrides nondefault values so volatile values are not included."""
     return dict()

+  def merge_from(self, other: 'Progress') -> None:
+    """Merges the progress from another progress."""
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      if other.start_time is not None and (
+          self.start_time is None or self.start_time > other.start_time):
+        self.start_time = other.start_time
+
+      if other.stop_time is not None and (
+          self.stop_time is None or self.stop_time < other.stop_time):
+        self.stop_time = other.stop_time
+
+      if other.num_total is not None:
+        if self.num_total is None:
+          self.num_total = other.num_total
+        else:
+          assert self.num_total == other.num_total, (self, other)
+      self.num_processed += other.num_processed
+      self.num_failed += other.num_failed
+      self.num_skipped += other.num_skipped
+      self.execution_summary.aggregate(other.execution_summary.breakdown)
+
   #
   # HTML view.
   #
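Note: the merge semantics above (earliest start time, latest stop time, summed counters, with equal `num_total` asserted) make `merge_from` suitable for aggregating per-shard progress into one global view, e.g. when a run is split across workers. A sketch, assuming `Progress` is importable from the module shown above:

from langfun.core.eval.v2.progress import Progress  # assumed import path


def combined(shard_progresses: list[Progress]) -> Progress:
  # Fold per-shard progress into a single global Progress object.
  total = Progress()
  for p in shard_progresses:
    total.merge_from(p)
  return total
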
langfun/core/eval/v2/progress_test.py
@@ -77,6 +77,33 @@ class ProgressTest(unittest.TestCase):
     self.assertTrue(p.is_stopped)
     self.assertIsNotNone(p.stop_time_str)

+  def test_merge_from(self):
+    p1 = Progress()
+    p1.start(10)
+    p1.increment_processed()
+    p1.increment_failed()
+    p1.stop()
+
+    p2 = Progress()
+    p2.start(10)
+    p2.increment_skipped()
+    p2.stop()
+
+    with pg.allow_writable_accessors(True):
+      p1.start_time = 2.0
+      p1.stop_time = 4.0
+      p2.start_time = 1.0
+      p2.stop_time = 5.0
+
+    p1.merge_from(p2)
+    self.assertEqual(p1.num_total, 10)
+    self.assertEqual(p1.num_processed, 1)
+    self.assertEqual(p1.num_failed, 1)
+    self.assertEqual(p1.num_skipped, 1)
+    self.assertEqual(p1.num_completed, 3)
+    self.assertEqual(p1.start_time, 1.0)
+    self.assertEqual(p1.stop_time, 5.0)
+

 if __name__ == '__main__':
   unittest.main()
langfun/core/eval/v2/progress_tracking.py
@@ -14,6 +14,7 @@
 """Tracking evaluation run progress."""

 import os
+from typing import Literal
 import langfun.core as lf
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import experiment as experiment_lib
@@ -24,16 +25,24 @@ Experiment = experiment_lib.Experiment
 Example = example_lib.Example


-def progress_tracker(tqdm: bool = False) -> experiment_lib.Plugin:
+def progress_tracker(
+    tracker_type: Literal['tqdm', 'html', 'auto'] = 'auto'
+) -> experiment_lib.Plugin:
   """Creates a progress tracker as a plugin.

   Args:
-    tqdm: If True, force using tqdm for progress update.
+    tracker_type: The type of progress tracker to use.
+      If `tqdm`, force using tqdm for progress update.
+      If `html`, force using html for progress update.
+      If `auto`, determine it automatically based on the running
+      environment (console vs. notebook)

   Returns:
     The progress tracker plugin.
   """
-  if tqdm or not lf.console.under_notebook():
+  if tracker_type == 'tqdm' or (
+      tracker_type == 'auto' and not lf.console.under_notebook()
+  ):
     return _TqdmProgressTracker()
   else:
     return _HtmlProgressTracker()
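Note: callers that previously forced tqdm with `progress_tracker(tqdm=True)` now select the tracker by name. A usage sketch, assuming an `experiment` and `root_dir` as used in this package's tests:

from langfun.core.eval.v2 import progress_tracking

# Force a tqdm progress bar even when running inside a notebook.
plugin = progress_tracking.progress_tracker(tracker_type='tqdm')
experiment.run(root_dir, 'new', plugins=[plugin])
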
@@ -88,8 +97,7 @@ class _TqdmProgressTracker(experiment_lib.Plugin):
     self._leaf_progresses = {
         leaf.id: lf.concurrent.ProgressBar.install(
            label=f'[#{i + 1} - {leaf.id}]',
-            total=(len(runner.current_run.example_ids)
-                   if runner.current_run.example_ids else leaf.num_examples),
+            total=len(runner.current_run.examples_to_evaluate(leaf)),
            color='cyan',
            status=None
        )
langfun/core/eval/v2/progress_tracking_test.py
@@ -14,12 +14,14 @@
 import contextlib
 import io
 import os
+import sys
 import tempfile
 import unittest

+from langfun.core import concurrent as lf_concurrent
 from langfun.core import console as lf_console
 from langfun.core.eval.v2 import eval_test_helper
-from langfun.core.eval.v2 import progress_tracking  # pylint: disable=unused-import
+from langfun.core.eval.v2 import progress_tracking
 from langfun.core.eval.v2 import runners as runners_lib  # pylint: disable=unused-import
 import pyglove as pg

@@ -31,6 +33,7 @@ class HtmlProgressTrackerTest(unittest.TestCase):
     def display(x):
       result['view'] = x.to_html()

+    self.assertFalse(progress_tracking._HtmlProgressTracker.is_per_example())
     lf_console._notebook = pg.Dict(
         display=display
     )
@@ -44,11 +47,14 @@
 class TqdmProgressTrackerTest(unittest.TestCase):

   def test_basic(self):
+    self.assertFalse(progress_tracking._TqdmProgressTracker.is_per_example())
     root_dir = os.path.join(tempfile.mkdtemp(), 'test_tqdm_progress_tracker')
     experiment = eval_test_helper.test_experiment()
     string_io = io.StringIO()
     with contextlib.redirect_stderr(string_io):
       _ = experiment.run(root_dir, 'new', plugins=[])
+      sys.stderr.flush()
+      lf_concurrent.ProgressBar.refresh()
     self.assertIn('All: 100%', string_io.getvalue())

   def test_with_example_ids(self):
@@ -59,6 +65,8 @@ class TqdmProgressTrackerTest(unittest.TestCase):
     string_io = io.StringIO()
     with contextlib.redirect_stderr(string_io):
       _ = experiment.run(root_dir, 'new', example_ids=[1], plugins=[])
+      sys.stderr.flush()
+      lf_concurrent.ProgressBar.refresh()
     self.assertIn('All: 100%', string_io.getvalue())


langfun/core/eval/v2/reporting.py
@@ -32,8 +32,97 @@ _SUMMARY_FILE = 'summary.html'
 _EVALULATION_DETAIL_FILE = 'index.html'


+class ExampleHtmlGenerator(experiment_lib.Plugin):
+  """Plugin for generating HTML views for each evaluation example."""
+
+  def on_example_complete(
+      self, runner: Runner, experiment: Experiment, example: Example
+  ):
+    self._save_example_html(runner, experiment, example)
+
+  def _save_example_html(
+      self, runner: Runner, experiment: Experiment, example: Example
+  ) -> None:
+    """Saves the example in HTML format."""
+    current_run = runner.current_run
+    def _generate():
+      try:
+        with pg.timeit() as t:
+          html = example.to_html(
+              collapse_level=None,
+              enable_summary_tooltip=False,
+              extra_flags=dict(
+                  # For properly rendering the next link.
+                  num_examples=getattr(experiment, 'num_examples', None)
+              ),
+          )
+          html.save(
+              runner.current_run.output_path_for(
+                  experiment, f'{example.id}.html'
+              )
+          )
+        experiment.info(
+            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to generate \'{example.id}.html\'. '
+            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
+        )
+        raise e
+
+    def _copy():
+      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
+      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
+
+      if src_file == dest_file:
+        return
+
+      if not pg.io.path_exists(src_file):
+        experiment.warning(
+            f'Skip copying \'{example.id}.html\' as '
+            f'{src_file!r} does not exist.'
+        )
+        return
+
+      try:
+        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
+          content = src.read()
+          with pg.io.open(dest_file, 'w') as dest:
+            dest.write(content)
+        experiment.info(
+            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
+        )
+      except BaseException as e:  # pylint: disable=broad-except
+        experiment.error(
+            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
+        )
+        raise e
+
+    generate_example_html = current_run.generate_example_html
+    if (generate_example_html == 'all'
+        or (generate_example_html == 'new' and example.newly_processed)
+        or (isinstance(generate_example_html, list)
+            and example.id in generate_example_html)):
+      op = _generate
+    else:
+      op = _copy
+    runner.background_run(op)
+
+
 class HtmlReporter(experiment_lib.Plugin):
-  """Plugin for periodically generating HTML reports for the experiment."""
+  """Plugin for periodically generating HTML reports for the experiment.
+
+  The `HtmlReporter` plugin generates several HTML files during an experiment
+  run:
+  - A `summary.html` at the root of the run directory, summarizing all
+    evaluations in the experiment.
+  - An `index.html` for each leaf evaluation, detailing the evaluation
+    definition, metrics, and logs.
+
+  These reports are updated periodically in the background during the run,
+  allowing users to monitor progress in near real-time.
+  """

   summary_interval: Annotated[
       int,
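Note: per-example HTML generation moves out of `HtmlReporter` into the standalone `ExampleHtmlGenerator` plugin (its call is removed from `on_example_complete` in the next hunk). Runs that still want per-example HTML pages must now register both plugins, as `reporting_test.py` does below; a sketch assuming `experiment` and `root_dir` as in those tests:

from langfun.core.eval.v2 import reporting

run = experiment.run(
    root_dir,
    'new',
    plugins=[reporting.HtmlReporter(), reporting.ExampleHtmlGenerator()],
)
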
@@ -127,7 +216,6 @@ class HtmlReporter(experiment_lib.Plugin):
   def on_example_complete(
       self, runner: Runner, experiment: Experiment, example: Example
   ):
-    self._save_example_html(runner, experiment, example)
     self._maybe_update_experiment_html(runner, experiment)
     self._maybe_update_summary(runner)

@@ -197,72 +285,3 @@ class HtmlReporter(experiment_lib.Plugin):
       runner.background_run(_save)
     else:
       _save()
-
-  def _save_example_html(
-      self, runner: Runner, experiment: Experiment, example: Example
-  ) -> None:
-    """Saves the example in HTML format."""
-    current_run = runner.current_run
-    def _generate():
-      try:
-        with pg.timeit() as t:
-          html = example.to_html(
-              collapse_level=None,
-              enable_summary_tooltip=False,
-              extra_flags=dict(
-                  # For properly rendering the next link.
-                  num_examples=getattr(experiment, 'num_examples', None)
-              ),
-          )
-          html.save(
-              runner.current_run.output_path_for(
-                  experiment, f'{example.id}.html'
-              )
-          )
-        experiment.info(
-            f'\'{example.id}.html\' generated in {t.elapse:.2f} seconds. '
-        )
-      except BaseException as e:  # pylint: disable=broad-except
-        experiment.error(
-            f'Failed to generate \'{example.id}.html\'. '
-            f'Error: {e}, Stacktrace: \n{traceback.format_exc()}.',
-        )
-        raise e
-
-    def _copy():
-      src_file = current_run.input_path_for(experiment, f'{example.id}.html')
-      dest_file = current_run.output_path_for(experiment, f'{example.id}.html')
-
-      if src_file == dest_file:
-        return
-
-      if not pg.io.path_exists(src_file):
-        experiment.warning(
-            f'Skip copying \'{example.id}.html\' as '
-            f'{src_file!r} does not exist.'
-        )
-        return
-
-      try:
-        with pg.timeit() as t, pg.io.open(src_file, 'r') as src:
-          content = src.read()
-          with pg.io.open(dest_file, 'w') as dest:
-            dest.write(content)
-        experiment.info(
-            f'\'{example.id}.html\' copied in {t.elapse:.2f} seconds.'
-        )
-      except BaseException as e:  # pylint: disable=broad-except
-        experiment.error(
-            f'Failed to copy {src_file!r} to {dest_file!r}. Error: {e}.'
-        )
-        raise e
-
-    generate_example_html = current_run.generate_example_html
-    if (generate_example_html == 'all'
-        or (generate_example_html == 'new' and example.newly_processed)
-        or (isinstance(generate_example_html, list)
-            and example.id in generate_example_html)):
-      op = _generate
-    else:
-      op = _copy
-    runner.background_run(op)
langfun/core/eval/v2/reporting_test.py
@@ -29,7 +29,16 @@ class ReportingTest(unittest.TestCase):
     experiment = eval_test_helper.test_experiment()
     checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
     reporter = reporting.HtmlReporter()
-    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    self.assertFalse(reporter.is_per_example())
+
+    example_html_generator = reporting.ExampleHtmlGenerator()
+    self.assertTrue(example_html_generator.is_per_example())
+
+    run = experiment.run(
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator]
+    )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
     )
@@ -52,8 +61,10 @@ class ReportingTest(unittest.TestCase):
     root_dir = os.path.join(tempfile.mkdtemp(), 'test_reporting2')
     experiment = eval_test_helper.test_experiment()
     run = experiment.run(
-        root_dir, 'new', plugins=[checkpointer, reporter],
-        warm_start_from=run.output_root
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator],
+        warm_start_from=run.output_root,
     )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
@@ -105,7 +116,12 @@ class ReportingTest(unittest.TestCase):
                   .test_experiment_with_example_html_generation_error())
     checkpointer = checkpointing.BulkCheckpointer('checkpoint.jsonl')
     reporter = reporting.HtmlReporter()
-    run = experiment.run(root_dir, 'new', plugins=[checkpointer, reporter])
+    example_html_generator = reporting.ExampleHtmlGenerator()
+    run = experiment.run(
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator]
+    )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
     )
@@ -132,8 +148,10 @@ class ReportingTest(unittest.TestCase):
     experiment = (eval_test_helper
                   .test_experiment_with_example_html_generation_error())
     run = experiment.run(
-        root_dir, 'new', plugins=[checkpointer, reporter],
-        warm_start_from=run.output_root
+        root_dir,
+        'new',
+        plugins=[checkpointer, reporter, example_html_generator],
+        warm_start_from=run.output_root,
     )
     self.assertTrue(
         pg.io.path_exists(os.path.join(run.output_root, 'summary.html'))
langfun/core/eval/v2/runners/__init__.py (new file)
@@ -0,0 +1,30 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Langfun evaluation runners."""
+
+# pylint: disable=g-importing-member
+from langfun.core.eval.v2.runners.base import RunnerBase
+from langfun.core.eval.v2.runners.beam import BeamRunner
+from langfun.core.eval.v2.runners.debug import DebugRunner
+from langfun.core.eval.v2.runners.parallel import ParallelRunner
+from langfun.core.eval.v2.runners.sequential import SequentialRunner
+# pylint: enable=g-importing-member
+
+__all__ = [
+    'RunnerBase',
+    'BeamRunner',
+    'DebugRunner',
+    'ParallelRunner',
+    'SequentialRunner',
+]
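Note: `runners.py` becomes the `runners` package (entry 51 in the file list), with each runner in its own module plus the new Beam and checkpoint-monitor runners. The re-exports above keep package-level access working; a sketch of the assumed import surface:

from langfun.core.eval.v2 import runners

# Runner implementations are re-exported at the package level.
assert runners.ParallelRunner is not None
assert runners.SequentialRunner is not None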