langfun 0.1.2.dev202510230805__py3-none-any.whl → 0.1.2.dev202511270805__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: the registry has flagged this version of langfun as possibly problematic; see the package's registry page for details.

Files changed (155)
  1. langfun/core/__init__.py +2 -0
  2. langfun/core/agentic/__init__.py +4 -1
  3. langfun/core/agentic/action.py +447 -29
  4. langfun/core/agentic/action_eval.py +9 -2
  5. langfun/core/agentic/action_test.py +149 -21
  6. langfun/core/async_support.py +32 -3
  7. langfun/core/coding/python/correction.py +19 -9
  8. langfun/core/coding/python/execution.py +14 -12
  9. langfun/core/coding/python/generation.py +21 -16
  10. langfun/core/coding/python/sandboxing.py +23 -3
  11. langfun/core/component.py +42 -3
  12. langfun/core/concurrent.py +70 -6
  13. langfun/core/concurrent_test.py +1 -0
  14. langfun/core/console.py +1 -1
  15. langfun/core/data/conversion/anthropic.py +12 -3
  16. langfun/core/data/conversion/anthropic_test.py +8 -6
  17. langfun/core/data/conversion/gemini.py +9 -2
  18. langfun/core/data/conversion/gemini_test.py +12 -9
  19. langfun/core/data/conversion/openai.py +145 -31
  20. langfun/core/data/conversion/openai_test.py +161 -17
  21. langfun/core/eval/base.py +47 -43
  22. langfun/core/eval/base_test.py +5 -5
  23. langfun/core/eval/matching.py +5 -2
  24. langfun/core/eval/patching.py +3 -3
  25. langfun/core/eval/scoring.py +4 -3
  26. langfun/core/eval/v2/__init__.py +1 -0
  27. langfun/core/eval/v2/checkpointing.py +64 -6
  28. langfun/core/eval/v2/checkpointing_test.py +9 -2
  29. langfun/core/eval/v2/eval_test_helper.py +103 -2
  30. langfun/core/eval/v2/evaluation.py +91 -16
  31. langfun/core/eval/v2/evaluation_test.py +9 -3
  32. langfun/core/eval/v2/example.py +50 -40
  33. langfun/core/eval/v2/example_test.py +16 -8
  34. langfun/core/eval/v2/experiment.py +74 -8
  35. langfun/core/eval/v2/experiment_test.py +19 -0
  36. langfun/core/eval/v2/metric_values.py +31 -3
  37. langfun/core/eval/v2/metric_values_test.py +32 -0
  38. langfun/core/eval/v2/metrics.py +157 -44
  39. langfun/core/eval/v2/metrics_test.py +39 -18
  40. langfun/core/eval/v2/progress.py +30 -1
  41. langfun/core/eval/v2/progress_test.py +27 -0
  42. langfun/core/eval/v2/progress_tracking.py +12 -3
  43. langfun/core/eval/v2/progress_tracking_test.py +6 -1
  44. langfun/core/eval/v2/reporting.py +90 -71
  45. langfun/core/eval/v2/reporting_test.py +24 -6
  46. langfun/core/eval/v2/runners/__init__.py +30 -0
  47. langfun/core/eval/v2/{runners.py → runners/base.py} +59 -142
  48. langfun/core/eval/v2/runners/beam.py +341 -0
  49. langfun/core/eval/v2/runners/beam_test.py +131 -0
  50. langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
  51. langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
  52. langfun/core/eval/v2/runners/debug.py +40 -0
  53. langfun/core/eval/v2/runners/debug_test.py +76 -0
  54. langfun/core/eval/v2/runners/parallel.py +100 -0
  55. langfun/core/eval/v2/runners/parallel_test.py +95 -0
  56. langfun/core/eval/v2/runners/sequential.py +47 -0
  57. langfun/core/eval/v2/runners/sequential_test.py +172 -0
  58. langfun/core/langfunc.py +45 -130
  59. langfun/core/langfunc_test.py +7 -5
  60. langfun/core/language_model.py +141 -21
  61. langfun/core/language_model_test.py +54 -3
  62. langfun/core/llms/__init__.py +9 -1
  63. langfun/core/llms/anthropic.py +157 -2
  64. langfun/core/llms/azure_openai.py +29 -17
  65. langfun/core/llms/cache/base.py +25 -3
  66. langfun/core/llms/cache/in_memory.py +48 -7
  67. langfun/core/llms/cache/in_memory_test.py +14 -4
  68. langfun/core/llms/compositional.py +25 -1
  69. langfun/core/llms/deepseek.py +30 -2
  70. langfun/core/llms/fake.py +32 -1
  71. langfun/core/llms/gemini.py +55 -17
  72. langfun/core/llms/gemini_test.py +84 -0
  73. langfun/core/llms/google_genai.py +34 -1
  74. langfun/core/llms/groq.py +28 -3
  75. langfun/core/llms/llama_cpp.py +23 -4
  76. langfun/core/llms/openai.py +36 -3
  77. langfun/core/llms/openai_compatible.py +148 -27
  78. langfun/core/llms/openai_compatible_test.py +207 -20
  79. langfun/core/llms/openai_test.py +0 -2
  80. langfun/core/llms/rest.py +12 -1
  81. langfun/core/llms/vertexai.py +58 -8
  82. langfun/core/logging.py +1 -1
  83. langfun/core/mcp/client.py +77 -22
  84. langfun/core/mcp/client_test.py +8 -35
  85. langfun/core/mcp/session.py +94 -29
  86. langfun/core/mcp/session_test.py +54 -0
  87. langfun/core/mcp/tool.py +151 -22
  88. langfun/core/mcp/tool_test.py +197 -0
  89. langfun/core/memory.py +1 -0
  90. langfun/core/message.py +160 -55
  91. langfun/core/message_test.py +65 -81
  92. langfun/core/modalities/__init__.py +8 -0
  93. langfun/core/modalities/audio.py +21 -1
  94. langfun/core/modalities/image.py +19 -1
  95. langfun/core/modalities/mime.py +64 -3
  96. langfun/core/modalities/mime_test.py +11 -0
  97. langfun/core/modalities/pdf.py +19 -1
  98. langfun/core/modalities/video.py +21 -1
  99. langfun/core/modality.py +167 -29
  100. langfun/core/modality_test.py +42 -12
  101. langfun/core/natural_language.py +1 -1
  102. langfun/core/sampling.py +4 -4
  103. langfun/core/sampling_test.py +20 -4
  104. langfun/core/structured/__init__.py +2 -24
  105. langfun/core/structured/completion.py +34 -44
  106. langfun/core/structured/completion_test.py +23 -43
  107. langfun/core/structured/description.py +54 -50
  108. langfun/core/structured/function_generation.py +29 -12
  109. langfun/core/structured/mapping.py +81 -37
  110. langfun/core/structured/parsing.py +95 -79
  111. langfun/core/structured/parsing_test.py +0 -3
  112. langfun/core/structured/querying.py +215 -142
  113. langfun/core/structured/querying_test.py +65 -29
  114. langfun/core/structured/schema/__init__.py +49 -0
  115. langfun/core/structured/schema/base.py +664 -0
  116. langfun/core/structured/schema/base_test.py +531 -0
  117. langfun/core/structured/schema/json.py +174 -0
  118. langfun/core/structured/schema/json_test.py +121 -0
  119. langfun/core/structured/schema/python.py +316 -0
  120. langfun/core/structured/schema/python_test.py +410 -0
  121. langfun/core/structured/schema_generation.py +33 -14
  122. langfun/core/structured/scoring.py +47 -36
  123. langfun/core/structured/tokenization.py +26 -11
  124. langfun/core/subscription.py +2 -2
  125. langfun/core/template.py +174 -49
  126. langfun/core/template_test.py +123 -17
  127. langfun/env/__init__.py +8 -2
  128. langfun/env/base_environment.py +320 -128
  129. langfun/env/base_environment_test.py +473 -0
  130. langfun/env/base_feature.py +92 -15
  131. langfun/env/base_feature_test.py +228 -0
  132. langfun/env/base_sandbox.py +84 -361
  133. langfun/env/base_sandbox_test.py +1235 -0
  134. langfun/env/event_handlers/__init__.py +1 -1
  135. langfun/env/event_handlers/chain.py +233 -0
  136. langfun/env/event_handlers/chain_test.py +253 -0
  137. langfun/env/event_handlers/event_logger.py +95 -98
  138. langfun/env/event_handlers/event_logger_test.py +21 -21
  139. langfun/env/event_handlers/metric_writer.py +225 -140
  140. langfun/env/event_handlers/metric_writer_test.py +23 -6
  141. langfun/env/interface.py +854 -40
  142. langfun/env/interface_test.py +112 -2
  143. langfun/env/load_balancers_test.py +23 -2
  144. langfun/env/test_utils.py +126 -84
  145. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/METADATA +1 -1
  146. langfun-0.1.2.dev202511270805.dist-info/RECORD +215 -0
  147. langfun/core/eval/v2/runners_test.py +0 -343
  148. langfun/core/structured/schema.py +0 -987
  149. langfun/core/structured/schema_test.py +0 -982
  150. langfun/env/base_test.py +0 -1481
  151. langfun/env/event_handlers/base.py +0 -350
  152. langfun-0.1.2.dev202510230805.dist-info/RECORD +0 -195
  153. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/WHEEL +0 -0
  154. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/licenses/LICENSE +0 -0
  155. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/top_level.txt +0 -0

langfun/core/eval/v2/runners/ckpt_monitor.py
@@ -0,0 +1,294 @@
+# Copyright 2025 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Checkpoint aggregator for Langfun evaluations."""
+
+import concurrent.futures
+import dataclasses
+import os
+import threading
+import time
+from typing import Annotated, Iterator
+
+from langfun.core.eval.v2 import evaluation as evaluation_lib
+from langfun.core.eval.v2 import example as example_lib
+from langfun.core.eval.v2 import reporting
+from langfun.core.eval.v2.runners import base
+
+import pyglove as pg
+
+
+class CheckpointMonitor(base.RunnerBase):
+  """Runner for monitoring checkpoint files generated by other runners.
+
+  Currently, the checkpoint monitor only supports aggregating per-example
+  checkpoint files.
+  """
+
+  NAME = 'checkpoint_monitor'
+
+  plugins = [
+      reporting.HtmlReporter(),
+  ]
+
+  checkpoint_pattern: Annotated[
+      str, 'The glob pattern of the checkpoint files to monitor.'
+  ] = 'checkpoint_*.bagz'
+
+  monitor_inprogress_files: Annotated[
+      bool,
+      'If True, monitor in-progress files to aggregate.'
+  ] = False
+
+  poll_interval: Annotated[
+      int,
+      'The interval in seconds to poll for new checkpoint files.'
+  ] = 5
+
+  max_aggregation_threads: Annotated[
+      int,
+      'The maximum number of threads to aggregate checkpoints.'
+  ] = 128
+
+  @dataclasses.dataclass
+  class _AggregationEntry:
+    evaluation: evaluation_lib.Evaluation
+    output_dir: str
+    inprogress_file_pattern: str | None
+    ckpt_file_pattern: str
+    example_ids_inprogress: set[int]
+    example_ids_to_be_aggregated: set[int]
+    example_ids_being_aggregated: set[int]
+    completion_lock: threading.Lock
+    is_completed: bool = False
+
+  def _on_bound(self):
+    super()._on_bound()
+    self._monitor_thread = None
+    self._aggregation_entries = []
+    self._aggregator_pool = None
+    self._error = None
+
+  def start(self):
+    # Reset the experiment state before getting started.
+    self.current_run.experiment.reset()
+
+    # Signal the start of the run.
+    self.on_run_start()
+
+    # Start the non-leaf nodes.
+    for node in self.current_run.experiment.nonleaf_nodes:
+      self.on_experiment_start(node)
+
+    for evaluation in self.current_run.experiment.leaf_nodes:
+      # This is not precise, but we at least notify example start.
+      if not self.current_run.filter or self.current_run.filter(evaluation):
+        self.on_experiment_start(evaluation)
+
+        # Signal the start of the examples if we are not monitoring
+        # in-progress files.
+        if not self.monitor_inprogress_files:
+          for example_id in self.current_run.examples_to_evaluate(evaluation):
+            self._mark_example_started(evaluation, example_id)
+
+        # Create the aggregation entries for polling.
+        output_dir = self.current_run.output_dir(evaluation)
+        self._aggregation_entries.append(
+            self._AggregationEntry(
+                evaluation=evaluation,
+                output_dir=output_dir,
+                ckpt_file_pattern=os.path.join(
+                    output_dir, self.checkpoint_pattern
+                ),
+                inprogress_file_pattern=os.path.join(
+                    output_dir, '*.inprogress'
+                ) if self.monitor_inprogress_files else None,
+                example_ids_to_be_aggregated=(
+                    self.current_run.examples_to_evaluate(evaluation)
+                ),
+                example_ids_inprogress=set(),
+                example_ids_being_aggregated=set(),
+                completion_lock=threading.Lock(),
+                is_completed=False,
+            )
+        )
+      else:
+        self.on_experiment_skipped(evaluation)
+
+    self._aggregator_pool = concurrent.futures.ThreadPoolExecutor(
+        max_workers=self.max_aggregation_threads
+    )
+    self._monitor_thread = threading.Thread(target=self._monitor_loop)
+    self._monitor_thread.start()
+
+  def join(self):
+    if self._monitor_thread:
+      self._monitor_thread.join()
+    if self._error is not None:
+      raise self._error
+
+  def run(self):
+    self.start()
+    self.join()
+
+  def _monitor_loop(self):
+    while not self._error and any(
+        not e.is_completed for e in self._aggregation_entries
+    ):
+      for entry in self._aggregation_entries:
+        if not entry.example_ids_to_be_aggregated:
+          continue
+
+        # Signal example processing.
+        if self.monitor_inprogress_files:
+          inprogress_files = pg.io.glob(entry.inprogress_file_pattern)
+          for inprogress_file in inprogress_files:
+            example_id = int(
+                os.path.basename(inprogress_file).split('.')[0]
+            )
+            if example_id not in entry.example_ids_inprogress:
+              self._mark_example_started(entry.evaluation, example_id)
+              entry.example_ids_inprogress.add(example_id)
+
+        for filepath in pg.io.glob(entry.ckpt_file_pattern):
+          example_id = int(
+              os.path.basename(filepath).split('.')[0].split('_')[-1]
+          )
+          if example_id in entry.example_ids_to_be_aggregated:
+            # Remove example ID from the set to avoid duplicate processing.
+            entry.example_ids_to_be_aggregated.remove(example_id)
+            entry.example_ids_being_aggregated.add(example_id)
+
+            # It could be that the example has been processed before, but the
+            # inprogress file was removed. In this case, we should signal the
+            # example has started before completing it.
+            if example_id not in entry.example_ids_inprogress:
+              self._mark_example_started(entry.evaluation, example_id)
+              entry.example_ids_inprogress.add(example_id)
+
+            self._aggregator_pool.submit(
+                self._aggregate, entry, filepath, example_id
+            )
+            pg.logging.info(
+                '[%s] Aggregating example %d from %s...',
+                entry.evaluation.id,
+                example_id,
+                filepath,
+            )
+      time.sleep(self.poll_interval)
+
+    if self._error is None:
+      self.on_run_complete()
+    else:
+      self.on_run_abort(self._error)
+
+  def _aggregate(
+      self,
+      entry: _AggregationEntry,
+      ckpt_filepath: str,
+      example_id: int
+  ):
+    """Aggregates an example from a checkpoint file."""
+    try:
+      loaded_examples = entry.evaluation.state.load(
+          ckpt_filepath,
+          example_input_by_id=entry.evaluation.example_input_by_id,
+          # Example metadata may be expensive to load, and is not used by
+          # metric aggregation. Thus we do not load example metadata.
+          load_example_metadata=False
+      )
+      assert len(loaded_examples) >= 1, loaded_examples
+      # Occasionally the per-example checkpoint file may contain the same
+      # example processed multiple times. We only need to aggregate the last
+      # example.
+      example = loaded_examples[-1]
+    except BaseException as e:  # pylint: disable=broad-except
+      error_info = pg.ErrorInfo.from_exception(e)
+      pg.logging.error(
+          '[%s] Failed to aggregate example %d: %s',
+          entry.evaluation.id,
+          example_id,
+          error_info
+      )
+      example = example_lib.Example(
+          id=example_id,
+          input=entry.evaluation.example_input_by_id(example_id),
+          error=error_info,
+      )
+
+    # This will skip processing but still allow metrics to be collected.
+    # `process` will never be called for evaluation, thus we do not
+    # need to setup/teardown evaluation.
+    example = entry.evaluation.evaluate(
+        example, reevaluate_upon_previous_errors=False
+    )
+    example.newly_processed = True
+    pg.logging.info(
+        '[%s] Successfully aggregated example %d from %s.',
+        entry.evaluation.id,
+        example_id,
+        ckpt_filepath,
+    )
+
+    try:
+      self.on_example_complete(entry.evaluation, example)
+    except BaseException as e:  # pylint: disable=broad-except
+      # Plugin failures should be raised to the user.
+      self._error = e
+
+    entry.example_ids_being_aggregated.remove(example_id)
+
+    # Remove the in-progress file to indicate that the example has been
+    # processed.
+    try:
+      pg.io.rm(os.path.join(entry.output_dir, f'{example_id}.inprogress'))
+    except FileNotFoundError:
+      pass
+
+    if (not self._error
+        and not entry.example_ids_to_be_aggregated
+        and not entry.example_ids_being_aggregated):
+      with entry.completion_lock:
+        if not entry.is_completed:
+          entry.is_completed = True
+          try:
+            self.on_experiment_complete(entry.evaluation)
+          except BaseException as e:  # pylint: disable=broad-except
+            # Plugin failures should be raised to the user.
+            self._error = e
+
+  def _mark_example_started(
+      self,
+      evaluation: evaluation_lib.Evaluation,
+      example_id: int
+  ) -> None:
+    """Marks an example as started."""
+    example = example_lib.Example(
+        id=example_id, input=evaluation.example_input_by_id(example_id),
+    )
+    example.start_time = time.time()
+    self.on_example_start(evaluation, example)
+
+    # We update the evaluation state with the in-progress status so the
+    # evaluation HTML can show remotely in-progress examples.
+    evaluation.state.update(example, in_progress=True)
+
+  def _run(self, evaluations: list[evaluation_lib.Evaluation]):
+    raise NotImplementedError('Not needed in checkpoint monitor.')
+
+  def _evaluate_items(
+      self,
+      evaluation: evaluation_lib.Evaluation,
+      items: Iterator[example_lib.Example]
+  ) -> None:
+    raise NotImplementedError('Not needed in checkpoint monitor.')
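
For orientation, the monitor is wired up through the same runner interface as everything else: an ordinary runner writes per-example checkpoint files via checkpointing.PerExampleCheckpointer, and a CheckpointMonitor constructed over the same run polls the output directory and aggregates the results. A minimal sketch, mirroring the usage exercised by ckpt_monitor_test.py below (the eval_test_helper experiment is a test fixture, used here purely for illustration):

    import os
    import tempfile

    from langfun.core.eval.v2 import checkpointing
    from langfun.core.eval.v2 import eval_test_helper
    from langfun.core.eval.v2.runners import ckpt_monitor
    from langfun.core.eval.v2.runners import sequential  # registers the 'sequential' runner

    # Produce per-example checkpoint files with an ordinary runner.
    exp = eval_test_helper.test_experiment()
    run = exp.run(
        os.path.join(tempfile.mkdtemp(), 'demo'),
        runner='sequential',
        plugins=[
            checkpointing.PerExampleCheckpointer(
                checkpoint_filename='checkpoint.jsonl'
            )
        ],
        use_cache='no',
    )

    # Aggregate the checkpoint files; run() blocks until all expected
    # examples are aggregated, then reports via the default HtmlReporter.
    ckpt_monitor.CheckpointMonitor(
        run,
        checkpoint_pattern='checkpoint_*.jsonl',
    ).run()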

langfun/core/eval/v2/runners/ckpt_monitor_test.py
@@ -0,0 +1,162 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import tempfile
+import unittest
+
+from langfun.core.eval.v2 import checkpointing
+from langfun.core.eval.v2 import eval_test_helper
+from langfun.core.eval.v2 import example as example_lib
+from langfun.core.eval.v2 import experiment as experiment_lib
+from langfun.core.eval.v2.runners import ckpt_monitor
+from langfun.core.eval.v2.runners import sequential  # pylint: disable=unused-import
+import pyglove as pg
+
+
+class CheckpointMonitorTest(unittest.TestCase):
+
+  def setUp(self):
+    super().setUp()
+    self.test_dir = tempfile.mkdtemp()
+
+  def test_aggregate(self):
+    exp = eval_test_helper.test_experiment()
+    root_dir = os.path.join(self.test_dir, 'test_aggregate')
+    run = exp.run(
+        root_dir,
+        runner='sequential',
+        progress_tracker=None,
+        plugins=[
+            checkpointing.PerExampleCheckpointer(
+                checkpoint_filename='checkpoint.jsonl'
+            )
+        ],
+        use_cache='no',
+    )
+    # Try to corrupt one of the checkpoint files.
+    pg.io.writefile(
+        run.output_path_for(exp.leaf_nodes[0], 'checkpoint_1.jsonl'),
+        'bad ckpt'
+    )
+    plugin = eval_test_helper.TestPlugin()
+    monitor = ckpt_monitor.CheckpointMonitor(
+        run,
+        plugins=[plugin],
+        checkpoint_pattern='checkpoint_*.jsonl',
+        monitor_inprogress_files=True,
+    )
+    monitor.run()
+
+    # Assert that the in-progress files are created and not removed.
+    for entry in monitor._aggregation_entries:
+      self.assertEqual(len(entry.example_ids_inprogress), 10)
+
+    # 6 leaf nodes + 1 suite + 1 hyper.
+    self.assertEqual(len(plugin.started_experiments), 6 + 2)
+    self.assertEqual(len(plugin.completed_experiments), 6 + 2)
+    self.assertEqual(len(plugin.started_example_ids), 10 * 6)
+    self.assertEqual(len(plugin.completed_example_ids), 10 * 6)
+    for e in exp.leaf_nodes:
+      self.assertEqual(e.progress.num_completed, 10)
+
+  def test_aggregate_with_filter(self):
+    exp = eval_test_helper.test_experiment()
+    root_dir = os.path.join(self.test_dir, 'test_aggregate_with_filter')
+
+    node_to_skip = exp.leaf_nodes[2]
+    # Run experiment to generate checkpoint files for all examples.
+    run = exp.run(
+        root_dir,
+        runner='sequential',
+        filter=lambda e: e.id != node_to_skip.id,
+        progress_tracker=None,
+        plugins=[
+            checkpointing.PerExampleCheckpointer(
+                checkpoint_filename='checkpoint.jsonl'
+            )
+        ],
+        use_cache='no',
+    )
+    plugin = eval_test_helper.TestPlugin()
+    monitor = ckpt_monitor.CheckpointMonitor(
+        run,
+        plugins=[plugin],
+        checkpoint_pattern='checkpoint_*.jsonl',
+    )
+    monitor.run()
+
+    # Assert that on_experiment_skipped was called for the filtered node.
+    self.assertEqual(len(plugin.skipped_experiments), 1)
+    self.assertEqual(plugin.skipped_experiments[0].id, node_to_skip.id)
+
+    # Assert that the skipped node was not started.
+    started_ids = [e.id for e in plugin.started_experiments]
+    self.assertNotIn(node_to_skip.id, started_ids)
+
+  def test_plugin_raise(self):
+
+    class TestPlugin(eval_test_helper.TestPlugin):
+      simulate_raise_on_example_complete: bool = False
+      simulate_raise_on_experiment_complete: bool = False
+
+      def on_example_complete(
+          self,
+          runner: experiment_lib.Runner,
+          experiment: experiment_lib.Experiment,
+          example: example_lib.Example
+      ):
+        if self.simulate_raise_on_example_complete:
+          raise ValueError('example complete error')
+
+      def on_experiment_complete(
+          self,
+          runner: experiment_lib.Runner,
+          experiment: experiment_lib.Experiment
+      ):
+        if self.simulate_raise_on_experiment_complete:
+          raise ValueError('experiment complete error')
+
+    exp = eval_test_helper.test_evaluation()
+    root_dir = os.path.join(self.test_dir, 'test_plugin_raise')
+
+    # Run experiment to generate checkpoint files for all examples.
+    run = exp.run(
+        root_dir,
+        runner='sequential',
+        progress_tracker=None,
+        plugins=[
+            checkpointing.PerExampleCheckpointer(
+                checkpoint_filename='checkpoint.jsonl'
+            )
+        ],
+        use_cache='no',
+    )
+
+    with self.assertRaisesRegex(ValueError, 'example complete error'):
+      ckpt_monitor.CheckpointMonitor(
+          run,
+          plugins=[TestPlugin(simulate_raise_on_example_complete=True)],
+          checkpoint_pattern='checkpoint_*.jsonl',
+      ).run()
+
+    with self.assertRaisesRegex(ValueError, 'experiment complete error'):
+      ckpt_monitor.CheckpointMonitor(
+          run,
+          plugins=[TestPlugin(simulate_raise_on_experiment_complete=True)],
+          checkpoint_pattern='checkpoint_*.jsonl',
+      ).run()
+
+
+if __name__ == '__main__':
+  unittest.main()

langfun/core/eval/v2/runners/debug.py
@@ -0,0 +1,40 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Debug runner."""
+
+from langfun.core.eval.v2.runners import sequential
+
+
+class DebugRunner(sequential.SequentialRunner):
+  """A runner for debugging evaluations.
+
+  The debug runner is a sequential runner that only runs the first example
+  of each evaluation, with `raise_if_has_error` enabled. This is useful for
+  quickly identifying issues in evaluation logic during development.
+  Checkpointers are disabled for this runner.
+  """
+
+  NAME = 'debug'
+
+  # Do not use the checkpointer for the debug runner.
+  plugins = []
+
+  def _on_bound(self):
+    super()._on_bound()
+    if self.current_run.example_ids is None:
+      self.current_run.rebind(example_ids=[1], skip_notification=True)
+    self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
+
+  def _save_run_manifest(self) -> None:
+    """Does nothing, to avoid overriding existing runs."""

langfun/core/eval/v2/runners/debug_test.py
@@ -0,0 +1,76 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for debug runner."""
+import os
+import tempfile
+from typing import Any
+import unittest
+
+from langfun.core.eval.v2 import eval_test_helper
+from langfun.core.eval.v2.runners import debug  # pylint: disable=unused-import
+
+import pyglove as pg
+
+
+class DebugRunnerTest(unittest.TestCase):
+
+  def assert_same_list(self, actual: list[Any], expected: list[Any]):
+    self.assertEqual(len(actual), len(expected))
+    for i, (x, y) in enumerate(zip(actual, expected)):
+      if x is not y:
+        print(i, pg.diff(x, y))
+      self.assertIs(x, y)
+
+  def test_debug_runner(self):
+    plugin = eval_test_helper.TestPlugin()
+    exp = eval_test_helper.test_experiment()
+    root_dir = os.path.join(tempfile.mkdtemp(), 'test_debug_runner')
+    run = exp.run(root_dir, runner='debug', plugins=[plugin])
+
+    self.assertIsNotNone(plugin.start_time)
+    self.assertIsNotNone(plugin.complete_time)
+    self.assertGreater(plugin.complete_time, plugin.start_time)
+
+    self.assertEqual(
+        len(plugin.started_experiments), len(exp.nodes)
+    )
+    self.assertEqual(
+        len(plugin.completed_experiments), len(exp.nodes)
+    )
+    self.assertEqual(
+        len(plugin.started_example_ids), 6 * 1
+    )
+    self.assertEqual(
+        len(plugin.completed_example_ids), 6 * 1
+    )
+    self.assert_same_list(plugin.skipped_experiments, [])
+    self.assertFalse(
+        pg.io.path_exists(os.path.join(run.output_root, 'run.json'))
+    )
+
+    for node in exp.nodes:
+      self.assertTrue(node.progress.is_started)
+      self.assertTrue(node.progress.is_completed)
+      if node.is_leaf:
+        self.assertEqual(node.progress.num_skipped, 0)
+        self.assertEqual(node.progress.num_completed, 1)
+        self.assertEqual(node.progress.num_failed, 0)
+      else:
+        self.assertEqual(node.progress.num_skipped, 0)
+        self.assertEqual(node.progress.num_failed, 0)
+        self.assertEqual(node.progress.num_processed, node.progress.num_total)
+
+
+if __name__ == '__main__':
+  unittest.main()

langfun/core/eval/v2/runners/parallel.py
@@ -0,0 +1,100 @@
+# Copyright 2025 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Parallel runner."""
+
+import collections
+import random
+import threading
+import time
+
+from typing import Annotated, Iterator
+import langfun.core as lf
+from langfun.core.eval.v2.runners import base
+
+
+class ParallelRunner(base.RunnerBase):
+  """A runner that executes evaluations and examples in parallel.
+
+  The parallel runner groups evaluations by their required resources
+  (e.g., specific LLMs) and runs evaluations that do not share resources in
+  parallel. Within each evaluation, examples are also processed in parallel
+  using threads, up to `Evaluation.max_workers`.
+  """
+
+  NAME = 'parallel'
+
+  timeout: Annotated[
+      int | None,
+      'Timeout for each evaluation example.'
+  ] = None
+
+  concurrent_startup_delay: Annotated[
+      tuple[int, int] | None,
+      (
+          'A range of seconds to delay the initial evaluation of each thread '
+          'in the thread pool, helping to prevent a burst in LLM QPS at '
+          'startup. If set to None, no delay will be applied.'
+      )
+  ] = None
+
+  def _run(self, evaluations: list[base.Evaluation]) -> None:
+    """Runs the evaluations in parallel."""
+    def _run_group(evaluation_group: list[base.Evaluation]):
+      for e in evaluation_group:
+        self.run_evaluation(e)
+
+    # Run evaluations in parallel, grouped by resource key.
+    groups: dict[str, list[base.Evaluation]] = collections.defaultdict(list)
+    for e in evaluations:
+      resource_ids = e.resource_ids()
+      if not resource_ids:
+        group_id = e.id
+      else:
+        # TODO(daiyip): support group that requires multiple resources.
+        group_id = resource_ids.pop()
+      groups[group_id].append(e)
+
+    for _, _, _ in lf.concurrent_map(
+        _run_group,
+        groups.values(),
+        max_workers=max(64, len(groups)),
+        timeout=self.timeout,
+        silence_on_errors=None,
+    ):
+      pass
+
+  def _evaluate_items(
+      self, evaluation: base.Evaluation, items: Iterator[base.Example]
+  ) -> None:
+    """Overrides item evaluation to run items in parallel."""
+    if self.concurrent_startup_delay is not None:
+      thread_delayed = {}
+      def _evaluate_item(item: base.Example):
+        thread_id = threading.current_thread().ident
+        if thread_id not in thread_delayed:
+          thread_delayed[thread_id] = True
+          time.sleep(random.randint(*self.concurrent_startup_delay))
+        return self.evaluate_item(evaluation, item)
+    else:
+      def _evaluate_item(item: base.Example):
+        return self.evaluate_item(evaluation, item)
+
+    for _, _, _ in lf.concurrent_map(
+        _evaluate_item,
+        items,
+        max_workers=evaluation.max_workers,
+        timeout=self.timeout,
+        silence_on_errors=None,
+    ):
+      pass
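
Like the other runners, `ParallelRunner` is selectable by its registered `NAME`. A minimal sketch, assuming the same `exp.run(...)` entry point exercised by the tests above (the eval_test_helper experiment is again a test fixture, used for illustration; parallel_test.py in the files list covers this runner but is not shown in this diff):

    import os
    import tempfile

    from langfun.core.eval.v2 import eval_test_helper
    from langfun.core.eval.v2.runners import parallel  # registers the 'parallel' runner

    exp = eval_test_helper.test_experiment()
    # Evaluations that do not share a resource (e.g., the same LLM) run in
    # parallel groups; within each evaluation, examples fan out across up
    # to `Evaluation.max_workers` threads.
    run = exp.run(
        os.path.join(tempfile.mkdtemp(), 'parallel_run'), runner='parallel'
    )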