langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512040805__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +1 -1
- langfun/core/__init__.py +7 -1
- langfun/core/agentic/__init__.py +8 -1
- langfun/core/agentic/action.py +740 -112
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +189 -24
- langfun/core/async_support.py +104 -5
- langfun/core/async_support_test.py +23 -0
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +9 -2
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +11 -2
- langfun/core/data/conversion/gemini_test.py +48 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +48 -44
- langfun/core/eval/base_test.py +5 -5
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +2 -0
- langfun/core/eval/v2/checkpointing.py +76 -7
- langfun/core/eval/v2/checkpointing_test.py +9 -2
- langfun/core/eval/v2/config_saver.py +37 -0
- langfun/core/eval/v2/config_saver_test.py +36 -0
- langfun/core/eval/v2/eval_test_helper.py +104 -3
- langfun/core/eval/v2/evaluation.py +92 -17
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +50 -40
- langfun/core/eval/v2/example_test.py +16 -8
- langfun/core/eval/v2/experiment.py +84 -15
- langfun/core/eval/v2/experiment_test.py +19 -0
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +31 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking.py +13 -5
- langfun/core/eval/v2/progress_tracking_test.py +9 -1
- langfun/core/eval/v2/reporting.py +90 -71
- langfun/core/eval/v2/reporting_test.py +24 -6
- langfun/core/eval/v2/runners/__init__.py +30 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +72 -180
- langfun/core/eval/v2/runners/beam.py +354 -0
- langfun/core/eval/v2/runners/beam_test.py +153 -0
- langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
- langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +76 -0
- langfun/core/eval/v2/runners/parallel.py +243 -0
- langfun/core/eval/v2/runners/parallel_test.py +182 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +169 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +7 -5
- langfun/core/language_model.py +189 -36
- langfun/core/language_model_test.py +54 -3
- langfun/core/llms/__init__.py +12 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +64 -12
- langfun/core/llms/gemini_test.py +110 -0
- langfun/core/llms/google_genai.py +34 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +120 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +16 -1
- langfun/core/llms/vertexai.py +58 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/__init__.py +10 -0
- langfun/core/mcp/client.py +177 -0
- langfun/core/mcp/client_test.py +71 -0
- langfun/core/mcp/session.py +241 -0
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/testing/simple_mcp_client.py +33 -0
- langfun/core/mcp/testing/simple_mcp_server.py +33 -0
- langfun/core/mcp/tool.py +254 -0
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +73 -3
- langfun/core/modalities/image_test.py +116 -0
- langfun/core/modalities/mime.py +64 -3
- langfun/core/modalities/mime_test.py +11 -0
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +230 -154
- langfun/core/structured/querying_test.py +69 -33
- langfun/core/structured/schema/__init__.py +49 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +175 -50
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +43 -0
- langfun/env/base_environment.py +827 -0
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +304 -0
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +842 -0
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +14 -0
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +472 -0
- langfun/env/event_handlers/event_logger_test.py +304 -0
- langfun/env/event_handlers/metric_writer.py +726 -0
- langfun/env/event_handlers/metric_writer_test.py +214 -0
- langfun/env/interface.py +1640 -0
- langfun/env/interface_test.py +153 -0
- langfun/env/load_balancers.py +59 -0
- langfun/env/load_balancers_test.py +141 -0
- langfun/env/test_utils.py +507 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/METADATA +7 -3
- langfun-0.1.2.dev202512040805.dist-info/RECORD +217 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/{runners.py → runners/base.py}

```diff
@@ -11,18 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+"""Base experiment runner."""
+
 import abc
-import collections
 import concurrent.futures
 import random
 import threading
-import time
 import traceback
-from typing import Any, Annotated, Callable, Iterator
+from typing import Any, Annotated, Callable, Iterator, Literal
 
 from langfun import core as lf
 from langfun.core.eval.v2 import checkpointing
+from langfun.core.eval.v2 import config_saver
 from langfun.core.eval.v2 import evaluation as evaluation_lib
 from langfun.core.eval.v2 import example as example_lib
 from langfun.core.eval.v2 import experiment as experiment_lib
@@ -38,35 +38,57 @@ Experiment = experiment_lib.Experiment
 Plugin = experiment_lib.Plugin
 
 
-_RUN_MANIFEST = 'run.json'
-
-
 class RunnerBase(Runner):
-  """
+  """Base class for runners with plugin support and IO pooling.
+
+  `RunnerBase` provides the basic runner functionalities such as plugin
+  integration for checkpointing, reporting and progress tracking.
+  It also manages a thread pool for background IO operations.
+  Subclasses should implement `_run` and `_evaluate_items` for different
+  execution strategies.
+  """
 
-
-
+  progress_tracker: Annotated[
+      Literal['tqdm', 'html', 'auto', None],
       (
-          'If
-          '
-          '
+          'If `tqdm`, force using tqdm for progress update. '
+          'If `html`, force using html for progress update. '
+          'If `auto`, determine it automatically based on the running '
+          'environment (console vs. notebook)'
+          'If `none`, disable progress update.'
       )
-  ] =
+  ] = 'auto'
 
   plugins = [
       checkpointing.BulkCheckpointer(),
       reporting.HtmlReporter(),
+      config_saver.RunConfigSaver(),
   ]
 
+  max_background_threads: Annotated[
+      int,
+      'Max number of background threads for IO operations.'
+  ] = 128
+
   def _on_bound(self):
     super()._on_bound()
 
     # Install the tqdm plugin if needed.
-
-
+    if self.progress_tracker is not None:
+      with pg.notify_on_change(False):
+        self.plugins.append(
+            progress_tracking.progress_tracker(self.progress_tracker)
+        )
+
+    if self.max_background_threads > 0:
+      self._io_pool_lock = threading.Lock()
+      self._io_pool = concurrent.futures.ThreadPoolExecutor(
+          max_workers=self.max_background_threads
+      )
+    else:
+      self._io_pool_lock = None
+      self._io_pool = None
 
-    self._io_pool_lock = threading.Lock()
-    self._io_pool = concurrent.futures.ThreadPoolExecutor(max_workers=16)
     # TODO(daiyip): render background errors.
     self._background_last_error = None
 
@@ -78,9 +100,12 @@ class RunnerBase(Runner):
       except Exception as e:  # pylint: disable=broad-except
         self._background_last_error = e
 
-
-
-    self._io_pool
+    if self.max_background_threads > 0:
+      with self._io_pool_lock:
+        if self._io_pool is not None:
+          self._io_pool.submit(_background_run, *args, **kwargs)
+    else:
+      _background_run(*args, **kwargs)
 
   def _all_plugins(self, experiment: Experiment) -> Iterator[Plugin]:
     """Returns all plugins for the experiment."""
```
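The hunks above replace the old fixed 16-worker IO pool with a configurable one: when `max_background_threads > 0`, `background_run` submits work to a lock-guarded `ThreadPoolExecutor`; when it is 0, the callable runs inline in the caller's thread, so IO errors surface immediately. A standalone sketch of that pool-or-inline pattern, using only the standard library (the class and method names below are illustrative, not from langfun):

```python
# Sketch of the pool-or-inline pattern used by RunnerBase above; not package code.
import concurrent.futures
import threading
from typing import Any, Callable, Optional


class BackgroundIO:
  """Runs IO callables in a bounded thread pool, or inline when disabled."""

  def __init__(self, max_threads: int = 128) -> None:
    self._lock = threading.Lock()
    self._pool: Optional[concurrent.futures.ThreadPoolExecutor] = (
        concurrent.futures.ThreadPoolExecutor(max_workers=max_threads)
        if max_threads > 0 else None
    )

  def run(self, func: Callable[..., Any], *args: Any, **kwargs: Any) -> None:
    """Submits `func` to the pool, or calls it inline when no pool exists."""
    with self._lock:
      if self._pool is not None:
        self._pool.submit(func, *args, **kwargs)
        return
    func(*args, **kwargs)  # Inline: exceptions propagate to the caller.

  def shutdown(self) -> None:
    """Detaches the pool under the lock, then waits for pending IO."""
    with self._lock:
      pool, self._pool = self._pool, None
    if pool is not None:
      pool.shutdown(wait=True)
```

Setting `max_background_threads` to 0 gives the same fail-fast behavior that the old `SequentialRunner.background_run` override (removed later in this diff) achieved by calling the function directly.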
```diff
@@ -89,24 +114,8 @@ class RunnerBase(Runner):
     for plugin in experiment.plugins:
       yield plugin
 
-  #
-  # IO operations for saving running files.
-  #
-
-  def _save_run_manifest(self) -> None:
-    def _save():
-      pg.symbolic.deref(self.current_run.clone(), recursive=True).save(
-          self.current_run.output_path_for(
-              self.current_run.experiment, _RUN_MANIFEST
-          ),
-          hide_default_values=True
-      )
-    self.background_run(_save)
-
   def on_run_start(self) -> None:
     """Called when a runner is started."""
-    self._save_run_manifest()
-
     for plugin in self._all_plugins(self.current_run.experiment):
       plugin.on_run_start(self, self.current_run.experiment)
 
@@ -126,9 +135,8 @@ class RunnerBase(Runner):
     num_examples_to_evaluate = 0
     if experiment.is_leaf:
       assert isinstance(experiment, Evaluation)
-      num_examples_to_evaluate = (
-
-          if self.current_run.example_ids else experiment.num_examples
+      num_examples_to_evaluate = len(
+          self.current_run.examples_to_evaluate(experiment)
       )
       experiment.progress.start(total=num_examples_to_evaluate)
     else:
@@ -139,6 +147,7 @@
       plugin.on_experiment_start(self, experiment)
 
     if experiment.is_leaf:
+      pg.io.mkdirs(self.current_run.output_dir(experiment))
       experiment.info(
           f'Starting evaluation {experiment.id!r} with '
           f'{num_examples_to_evaluate} examples to evaluate.'
@@ -180,10 +189,7 @@
     self._log_experiment_completion(experiment)
 
   def _log_experiment_completion(self, experiment: Experiment):
-    example_ids = (
-        self.current_run.example_ids if self.current_run.example_ids else
-        list(range(1, experiment.num_examples + 1))
-    )
+    example_ids = sorted(self.current_run.examples_to_evaluate(experiment))
     num_from_checkpoint, num_processed = 0, 0
     for example_id in example_ids:
       status = experiment.state.get_status(example_id)
@@ -220,7 +226,7 @@
     else:
       # A evaluation could be considered as done if it has processed all the
      # examples specified by `example_ids`.
-      assert progress.is_completed
+      assert progress.is_completed, progress
      parent_progress.increment_processed()
 
     if parent_progress.is_completed:
@@ -235,6 +241,8 @@
       example: Example
   ) -> None:
     """Called when an evaluation example is started."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=True)
     for plugin in self._all_plugins(experiment):
       plugin.on_example_start(self, experiment, example)
     experiment.info(f'Starting to evaluate example {example.id}.')
@@ -245,6 +253,8 @@
       example: Example
   ) -> None:
     """Called when an evaluation example is complete."""
+    assert isinstance(experiment, Evaluation), experiment
+    experiment.state.update(example, in_progress=False)
     if example.newly_processed:
       if example.error is None:
         experiment.progress.increment_processed()
@@ -256,7 +266,7 @@
         experiment.progress.increment_failed()
         experiment.error(
             (
-                f'Failed to evaluate example {example.id} in'
+                f'Failed to evaluate example {example.id} in '
                 f'{example.elapse:.2f} seconds.'
             ),
             error=example.error
@@ -316,7 +326,7 @@
         self._run(targets)
 
       self.on_run_complete()
-    except
+    except BaseException as e:  # pylint: disable=broad-except
       self.on_run_abort(e)
       raise e
     finally:
@@ -324,9 +334,10 @@
         self.background_run(cache.save)
 
       # Wait for the background tasks to finish.
-
-
-
+      if self.max_background_threads > 0:
+        with self._io_pool_lock:
+          self._io_pool, io_pool = None, self._io_pool
+        io_pool.shutdown(wait=True)
 
   @abc.abstractmethod
   def _run(self, evaluations: list[Evaluation]) -> None:
@@ -335,6 +346,7 @@
   def run_evaluation(self, evaluation: Evaluation) -> None:
     """Runs the evaluation."""
     try:
+      evaluation.setup()
       self.on_experiment_start(evaluation)
 
       per_evaluation_settings = {}
@@ -344,18 +356,14 @@
         per_evaluation_settings['cache'] = cache
 
       with lf.use_settings(**per_evaluation_settings):
-
-
-
-
-
-
-
-
-                id=example_id,
-                input=evaluation.example_input_by_id(example_id)
-            ) for example_id in self.current_run.example_ids
-        )
+        items = (
+            Example(
+                id=example_id,
+                input=evaluation.example_input_by_id(example_id)
+            ) for example_id in sorted(
+                self.current_run.examples_to_evaluate(evaluation)
+            )
+        )
         if self.current_run.shuffle_inputs:
           items = list(items)
           random.shuffle(items)
@@ -367,6 +375,8 @@
     except BaseException as e:  # pylint: disable=broad-except
       self.on_experiment_abort(evaluation, e)
       raise e
+    finally:
+      evaluation.teardown()
 
   @abc.abstractmethod
   def _evaluate_items(
@@ -394,121 +404,3 @@
     return in_memory.InMemory(
         self.current_run.output_path_for(experiment, 'cache.json')
     )
-
-
-class SequentialRunner(RunnerBase):
-  """Sequential runner.
-
-  Sequential runner runs all evaluations and their examples in sequence,
-  as well as the background tasks, it allows the developer to catch all
-  exceptions thrown from the background tasks, making it easier to debug.
-  """
-
-  NAME = 'sequential'
-
-  def background_run(
-      self, func: Callable[..., Any], *args: Any, **kwargs: Any
-  ) -> None:
-    """Runs the function with the IO pool."""
-    func(*args, **kwargs)
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the experiment in sequence."""
-    for e in evaluations:
-      self.run_evaluation(e)
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Runs the evaluation items in sequence."""
-    for item in items:
-      self.evaluate_item(evaluation, item)
-
-
-class DebugRunner(SequentialRunner):
-  """Debug runner."""
-
-  NAME = 'debug'
-
-  # Do not use the checkpointer for debug runner.
-  plugins = []
-
-  def _on_bound(self):
-    super()._on_bound()
-    if self.current_run.example_ids is None:
-      self.current_run.rebind(example_ids=[1], skip_notification=True)
-    self.current_run.rebind(raise_if_has_error=True, skip_notification=True)
-
-  def _save_run_manifest(self) -> None:
-    """Do nothing to avoid overriden existing runs."""
-
-
-class ParallelRunner(RunnerBase):
-  """Parallel runner."""
-
-  NAME = 'parallel'
-
-  timeout: Annotated[
-      int | None,
-      'Timeout for each evaluation example.'
-  ] = None
-
-  concurrent_startup_delay: Annotated[
-      tuple[int, int] | None,
-      (
-          'A range of seconds to delay the initial evaluation of each thread '
-          'in the thread pool, helping to prevent a burst in LLM QPS at '
-          'startup. If set to None, no delay will be applied.'
-      )
-  ] = None
-
-  def _run(self, evaluations: list[Evaluation]) -> None:
-    """Runs the evaluations in parallel."""
-    def _run_group(evaluation_group: list[Evaluation]):
-      for e in evaluation_group:
-        self.run_evaluation(e)
-
-    # Run evaluations in parallel groupped by resource key.
-    groups: dict[str, list[Evaluation]] = collections.defaultdict(list)
-    for e in evaluations:
-      resource_ids = e.resource_ids()
-      if not resource_ids:
-        group_id = e.id
-      else:
-        # TODO(daiyip): support group that requires multiple resources.
-        group_id = resource_ids.pop()
-      groups[group_id].append(e)
-
-    for _, _, _ in lf.concurrent_map(
-        _run_group,
-        groups.values(),
-        max_workers=max(64, len(groups)),
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
-
-  def _evaluate_items(
-      self, evaluation: Evaluation, items: Iterator[Example]
-  ) -> None:
-    """Override run items to run in parallel."""
-    if self.concurrent_startup_delay is not None:
-      thread_delayed = {}
-      def _evaluate_item(item: Example):
-        thread_id = threading.current_thread().ident
-        if thread_id not in thread_delayed:
-          thread_delayed[thread_id] = True
-          time.sleep(random.randint(*self.concurrent_startup_delay))
-        return self.evaluate_item(evaluation, item)
-    else:
-      def _evaluate_item(item: Example):
-        return self.evaluate_item(evaluation, item)
-
-    for _, _, _ in lf.concurrent_map(
-        _evaluate_item,
-        items,
-        max_workers=evaluation.max_workers,
-        timeout=self.timeout,
-        silence_on_errors=None,
-    ):
-      pass
```