langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512040805__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +1 -1
- langfun/core/__init__.py +7 -1
- langfun/core/agentic/__init__.py +8 -1
- langfun/core/agentic/action.py +740 -112
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +189 -24
- langfun/core/async_support.py +104 -5
- langfun/core/async_support_test.py +23 -0
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +9 -2
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +11 -2
- langfun/core/data/conversion/gemini_test.py +48 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +48 -44
- langfun/core/eval/base_test.py +5 -5
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +2 -0
- langfun/core/eval/v2/checkpointing.py +76 -7
- langfun/core/eval/v2/checkpointing_test.py +9 -2
- langfun/core/eval/v2/config_saver.py +37 -0
- langfun/core/eval/v2/config_saver_test.py +36 -0
- langfun/core/eval/v2/eval_test_helper.py +104 -3
- langfun/core/eval/v2/evaluation.py +92 -17
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +50 -40
- langfun/core/eval/v2/example_test.py +16 -8
- langfun/core/eval/v2/experiment.py +84 -15
- langfun/core/eval/v2/experiment_test.py +19 -0
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +31 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking.py +13 -5
- langfun/core/eval/v2/progress_tracking_test.py +9 -1
- langfun/core/eval/v2/reporting.py +90 -71
- langfun/core/eval/v2/reporting_test.py +24 -6
- langfun/core/eval/v2/runners/__init__.py +30 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +72 -180
- langfun/core/eval/v2/runners/beam.py +354 -0
- langfun/core/eval/v2/runners/beam_test.py +153 -0
- langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
- langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +76 -0
- langfun/core/eval/v2/runners/parallel.py +243 -0
- langfun/core/eval/v2/runners/parallel_test.py +182 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +169 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +7 -5
- langfun/core/language_model.py +189 -36
- langfun/core/language_model_test.py +54 -3
- langfun/core/llms/__init__.py +12 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +64 -12
- langfun/core/llms/gemini_test.py +110 -0
- langfun/core/llms/google_genai.py +34 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +120 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +16 -1
- langfun/core/llms/vertexai.py +58 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/__init__.py +10 -0
- langfun/core/mcp/client.py +177 -0
- langfun/core/mcp/client_test.py +71 -0
- langfun/core/mcp/session.py +241 -0
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/testing/simple_mcp_client.py +33 -0
- langfun/core/mcp/testing/simple_mcp_server.py +33 -0
- langfun/core/mcp/tool.py +254 -0
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +73 -3
- langfun/core/modalities/image_test.py +116 -0
- langfun/core/modalities/mime.py +64 -3
- langfun/core/modalities/mime_test.py +11 -0
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +230 -154
- langfun/core/structured/querying_test.py +69 -33
- langfun/core/structured/schema/__init__.py +49 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +175 -50
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +43 -0
- langfun/env/base_environment.py +827 -0
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +304 -0
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +842 -0
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +14 -0
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +472 -0
- langfun/env/event_handlers/event_logger_test.py +304 -0
- langfun/env/event_handlers/metric_writer.py +726 -0
- langfun/env/event_handlers/metric_writer_test.py +214 -0
- langfun/env/interface.py +1640 -0
- langfun/env/interface_test.py +153 -0
- langfun/env/load_balancers.py +59 -0
- langfun/env/load_balancers_test.py +141 -0
- langfun/env/test_utils.py +507 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/METADATA +7 -3
- langfun-0.1.2.dev202512040805.dist-info/RECORD +217 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py
CHANGED
@@ -59,18 +59,20 @@ class Evaluable(lf.Component):
   @property
   @abc.abstractmethod
   def id(self) -> str:
-    """Returns the ID of
+    """Returns the ID of this evaluable node.

     Returns:
-
-
-
-
+      A string as the ID of this evaluable node.
+      If an evaluable node acts as a container for other evaluable nodes
+      (e.g. `lf.Suite`), its ID could be empty.
+      Leaf evaluable nodes (e.g. `lf.Evaluation`) must have unique IDs
+      under the same container, as their IDs will be used as the directory
+      name for saving their results.
     """

   @property
   def dir(self) -> str | None:
-    """Returns the directory for saving results
+    """Returns the directory for saving results."""
     if self.root_dir is None:
       return None
     return os.path.join(self.root_dir, self.id)
@@ -82,18 +84,18 @@ class Evaluable(lf.Component):

   @property
   def index_link(self) -> str | None:
-    """Returns the index page."""
+    """Returns the link to the index page."""
     if self.dir is None:
       return None
     return self.link(os.path.join(self.dir, Evaluable.INDEX_HTML))

   def summary(self, pivot_field: str = 'lm') -> 'Summary':
-    """Returns a summary for all child evaluations
+    """Returns a summary for all child evaluations."""
     return Summary([pg.Ref(x) for x in self.leaf_nodes], pivot_field)

   @property
   def summary_link(self) -> str | None:
-    """Returns the summary page."""
+    """Returns the link to the summary page."""
     if self.root_dir is None:
       return None
     return self.link(os.path.join(self.root_dir, Evaluable.SUMMARY_HTML))
@@ -177,6 +179,7 @@ class Evaluable(lf.Component):

   @property
   def is_leaf(self) -> bool:
+    """Returns whether this node is a leaf node."""
     return isinstance(self, Evaluation) and not self.children

   @functools.cached_property
@@ -404,7 +407,7 @@ class Evaluable(lf.Component):
       timeout: int | None = None,
       **kwargs,
   ) -> None:
-    """Run the
+    """Run the evaluation and fill `self.result`. Subclass to implement."""

   @abc.abstractmethod
   def _completion_status(self, run_status: str) -> str:
@@ -545,6 +548,7 @@ class Evaluable(lf.Component):
   def from_dir(
       cls, maybe_dir: str, load_result: bool = True
   ) -> Optional['Evaluable']:
+    """Loads an evaluable object from a directory."""
     exp_json = os.path.join(maybe_dir, Evaluable.EXPERIMENT_JSON)
     if not pg.io.path_exists(exp_json):
       return None
@@ -558,7 +562,7 @@ class Evaluable(lf.Component):
     return experiment

   def try_load_result(self) -> bool:
-    """Try
+    """Try loads result from file if it's not loaded."""
     if self.result is None:
       result_json = os.path.join(self.dir, Evaluable.RESULT_JSON)
       if pg.io.path_exists(result_json):
@@ -595,7 +599,7 @@ class Suite(Evaluable):
   def _on_bound(self):
     super()._on_bound()
     overrides = {
-        k: v for k, v in self.sym_init_args.
+        k: v for k, v in self.sym_init_args.sym_items()
        if k not in ('id', 'children')
     }
     for child in self.children:
@@ -604,6 +608,7 @@ class Suite(Evaluable):

   @functools.cached_property
   def hash(self) -> str:
+    """Returns the hash of this suite."""
     return hashlib.md5(
         ' '.join(sorted([c.hash for c in self.children])).encode()
     ).hexdigest()[:8]
@@ -619,14 +624,14 @@


 class Evaluation(Evaluable):
-  """Base class for evaluation
+  """Base class for evaluation sets."""

   inputs: pg.typing.Annotated[
       pg.typing.Functor(),
       (
           'A functor that returns a list of user-defined objects as the input '
-          'examples. It
-          '`lf.eval.inputs_from(path)`, from a Python
+          'examples. It can be inputs loaded from a JSON file via '
+          '`lf.eval.inputs_from(path)`, from a Python-coded list via '
           '`lf.eval.as_inputs(values)` or a user-defined functor that '
           'generates input objects at runtime.'
       ),
@@ -648,12 +653,12 @@ class Evaluation(Evaluable):
       pg.typing.Functor().noneable(),
       (
           'A functor that returns a type annotation that will be converted to '
-          '`lf.Schema`, or a tuple of (annotation,
+          '`lf.Schema`, or a tuple of (annotation, few-shot examples). '
           'For "call" method, it could be None, indicating that the raw '
-          'response from the LM will be used as the output, and the
-          'examples will be used for parsing. For "query" and "complete"
-          'must be provided, and the
-          'for prompting. Here
+          'response from the LM will be used as the output, and the few-shot '
+          'examples will be used for parsing. For "query" and "complete" '
+          'methods, it must be provided, and the few-shot examples will be '
+          'used directly for prompting. Here is example code on how the '
           'functors should be defined:'
           + inspect.cleandoc("""
              ```
@@ -693,7 +698,7 @@ class Evaluation(Evaluable):
   completion_prompt_field: Annotated[
       str | None,
       (
-          'A
+          'A string field that will be automatically added to the class of the '
           'input object for `lf.complete`. If None, no field will be added to '
          'the class, instead the prompt will be passed as the first argument '
          'of the input object to complete. Applicable only when `method` is '
@@ -738,7 +743,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def hash(self) -> str:
-    """Returns the
+    """Returns the semantics-based hash of the evaluation."""
     if self.is_deterministic:
       identity = pg.format(self._identifiers(), compact=True)
     else:
@@ -784,7 +789,7 @@ class Evaluation(Evaluable):

   @property
   def complete_rate(self) -> float:
-    """Returns the
+    """Returns the completion rate of examples."""
     return self.num_completed / self.num_examples

   #
@@ -837,7 +842,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def non_oop_failures(self) -> list[tuple[Any, Exception]]:
-    """Returns the OOP failures."""
+    """Returns the non-OOP failures."""
     return [item for item in self.failures
             if not isinstance(item[1], lf_structured.MappingError)]

@@ -883,7 +888,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def schema(self) -> lf_structured.Schema | None:
-    """
+    """Returns the schema for parsing LLM response."""
     if self.schema_fn is None:
       return None

@@ -897,7 +902,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def fewshot_examples(self) -> list[lf.structured.MappingExample] | None:
-    """
+    """Returns the few-shot examples for prompting or parsing."""
     if self.schema_fn is None:
       return None

@@ -973,7 +978,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def children(self) -> list['Evaluation']:
-    """Returns
+    """Returns child evaluations if this evaluation has a parameter space."""
     if self.is_deterministic:
       return []
     children = []
@@ -1023,7 +1028,7 @@ class Evaluation(Evaluable):

   @property
   def non_oop_failures_link(self) -> str | None:
-    """Returns the link to
+    """Returns the link to the non-OOP failures page."""
     if self.dir is None:
       return None
     return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))
@@ -1208,10 +1213,10 @@ class Evaluation(Evaluable):
     )

   def process_output(self, example: Any, output: lf.Message) -> None:
-    """
+    """Processes the output for an example.

     Subclasses can override this method to generate and attach additional
-    metadata for debugging
+    metadata for debugging purposes. For example, draw bounding boxes on the
     input image based on LLM predicted boxes and attach to output_message's
     metadata.

@@ -1219,8 +1224,8 @@ class Evaluation(Evaluable):

       class BoundingBoxEval(lf.eval.Matching):
         ...
-        def process_output(example, output):
-          output.metadata.image_with_bbox =
+        def process_output(self, example, output):
+          output.metadata.image_with_bbox = draw_bounding_box(
             example.image, output.result)

     Args:
@@ -1449,7 +1454,7 @@ class Evaluation(Evaluable):
         trace the LM input, response and parsed structure. If error is raised
         before LLM could return a response, None will be its value.
       error: The exception during processing the example.
-      dryrun: Whether or not
+      dryrun: Whether or not auditing takes place during dryrun.
     """
     if error is not None:
       self._failures.append((example, error))
@@ -1557,7 +1562,7 @@ class Evaluation(Evaluable):
         f'style="color:darkgray">{_html_repr(self.prompt)}</td>'
     )
     # Schema.
-    schema_title = self.schema.
+    schema_title = self.schema.schema_repr('python') if self.schema else None
     s.write(
         '<td style="color:purple" '
         f'title="{schema_title}">'
@@ -1674,7 +1679,7 @@ class Evaluation(Evaluable):

   @classmethod
   def visualize(cls, evaluations: list['Evaluation']) -> str | None:
-    """Visualize
+    """Visualize a list of evaluations of this task in HTML."""
     del evaluations
     return None

@@ -1810,7 +1815,7 @@ class Summary(pg.Object):
   )

   class Table(pg.Object):
-    """A pivot table for
+    """A pivot table for viewing evaluations."""

     class Row(pg.Object):
       descriptor: dict[str, Any]
@@ -2013,12 +2018,12 @@ class Summary(pg.Object):
       return self._context.completed

     def stop(self) -> 'Summary':
-      """
+      """Signals and waits for the monitor thread to stop."""
       self._context.stopping = True
       return self.join()

     def join(self) -> 'Summary':
-      """Waits the monitor thread to complete."""
+      """Waits for the monitor thread to complete."""
       self._thread.join()
       summary = self.summary
       assert summary is not None
@@ -2035,7 +2040,7 @@ class Summary(pg.Object):
       scan_interval: int = 60,
       refresh_when_stop: bool = True,
   ) -> MonitorResult:
-    """
+    """Monitors one or more root directories and save summary periodically."""
     context = pg.Dict(stopping=False, completed=False, summary=None)

     def _monitor():
@@ -2187,7 +2192,7 @@ def monitor_async(
     scan_interval: int = 60,
     refresh_when_stop: bool = True,
 ) -> Summary.MonitorResult:
-  """
+  """Asynchronously monitors one or more root directories for summary."""
   return Summary.monitor_async(
       root_dir,
       save_as,
@@ -2365,10 +2370,9 @@ def run(
       a string (for string-based patcher), a `pg.patching.Patcher` object, or
       a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
       details.
-    mode: The mode to run the suite
-
-
-      to do nothing.
+    mode: The mode to run the suite: "run" to run with reuse of existing
+      results, "rerun" to force re-evaluation, "dryrun" for a dry run, and
+      "noop" to do nothing.
     debug: Whether to run in debug mode.
     print_definition: Whether to print the experiment definition.
     **kwargs: Additional arguments to be passed to dryrun/run the suite.
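For readers skimming the `process_output` docstring change above, here is a minimal sketch of the override pattern it describes. The `draw_bounding_box` helper and the `image`/`result` fields are illustrative assumptions for the sketch, not langfun APIs.

```python
import langfun as lf


def draw_bounding_box(image, boxes):
  # Hypothetical helper: render predicted boxes onto the image for debugging.
  del boxes  # A real implementation would draw the boxes onto the image.
  return image


class BoundingBoxEval(lf.eval.Matching):
  """Sketch only; required members such as inputs, prompt, schema_fn,
  groundtruth and answer are omitted."""

  def process_output(self, example, output):
    # Attach a debug rendering to the output message's metadata so it is
    # available when auditing or reporting the example.
    output.metadata.image_with_bbox = draw_bounding_box(
        example.image, output.result)
```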
langfun/core/eval/base_test.py
CHANGED
@@ -101,7 +101,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(s.dir, os.path.join(s.root_dir, s.id))
     self.assertEqual(s.hash, s.clone().hash)
     # Test persistent hash.
-    self.assertEqual(s.hash, '
+    self.assertEqual(s.hash, '4dfe486a')
     self.assertEqual(
         s.hash, s.clone(override={'max_workers': 2, 'lm.timeout': 20}).hash
     )
@@ -211,7 +211,7 @@ class EvaluationTest(unittest.TestCase):
         s.result,
         dict(
             experiment_setup=dict(
-                id='Evaluation@
+                id='Evaluation@e028b6e6',
                 dir=s.dir,
                 model='StaticSequence',
                 prompt_template='{{example.question}}',
@@ -269,7 +269,7 @@ class EvaluationTest(unittest.TestCase):
         s.root_dir, base.Evaluation.SUMMARY_HTML.replace('.html', '.json')
     )
     self.assertTrue(os.path.exists(summary_json))
-    summary = pg.load(summary_json,
+    summary = pg.load(summary_json, convert_unknown=True)
     self.assertIn('Evaluation', summary)
     self.assertEqual(len(summary['Evaluation']), 1)
     self.assertIsNotNone(summary['Evaluation'][0].experiment)
@@ -376,7 +376,7 @@ class EvaluationTest(unittest.TestCase):
         s.children[0].dir, os.path.join(s.root_dir, s.children[0].id)
     )
     # Test persistent hash.
-    self.assertEqual(s.hash, '
+    self.assertEqual(s.hash, 'fa8f5419')

     summary = s.run(verbose=True)
     self.assertEqual(len(summary.evaluations), 2)
@@ -526,7 +526,7 @@ class SuiteTest(unittest.TestCase):
         lm=lm
     )
     # Test for persistent hash.
-    self.assertEqual(s.hash, '
+    self.assertEqual(s.hash, 'ec3901b8')
     s.run()
     expected = {
         s.children[0].id: dict(
langfun/core/eval/matching.py
CHANGED
@@ -38,7 +38,7 @@ class Matching(base.Evaluation):

   @abc.abstractmethod
   def answer(self, output: Any, example: Any) -> Any:
-    """Returns the answer from the
+    """Returns the answer from the structured output."""

   @property
   def matches(self) -> list[tuple[int, Any, Any, lf.Message]]:
@@ -52,6 +52,7 @@

   @property
   def match_rate(self) -> float:
+    """Returns the match rate."""
     if self.num_completed == 0:
       return 0.0
     return self.num_matches / self.num_completed
@@ -68,17 +69,19 @@

   @property
   def mismatch_rate(self) -> float:
+    """Returns the mismatch rate."""
     if self.num_completed == 0:
       return 0.0
     return self.num_mismatches / self.num_completed

   @property
   def matches_link(self) -> str:
-    """Returns the matches page."""
+    """Returns the link to the matches page."""
     return self.link(os.path.join(self.dir, Matching.MATCHES_HTML))

   @property
   def mismatches_link(self) -> str:
+    """Returns the link to the mismatches page."""
     return self.link(os.path.join(self.dir, Matching.MISMATCHES_HTML))

   def _reset(self) -> None:
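As a hypothetical illustration of the abstract `answer` method documented above: a matching evaluation returns the value to compare against the groundtruth, from which `num_matches`, `match_rate` and `mismatch_rate` are derived. The `Solution` type and its `final_answer` field below are assumptions for the sketch, not part of this diff.

```python
import langfun as lf
import pyglove as pg


class Solution(pg.Object):
  # Hypothetical structured output type produced by the LM for each example.
  final_answer: int


class MyMathEval(lf.eval.Matching):
  """Sketch only; inputs, prompt, schema_fn and the groundtruth side are omitted."""

  def answer(self, output, example):
    # `output` is the parsed structured result for `example`; the returned
    # value is what gets compared when computing match_rate.
    del example  # Unused in this sketch.
    return output.final_answer
```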
langfun/core/eval/patching.py
CHANGED
@@ -114,17 +114,17 @@ def model_by_name(name: str) -> lf.LanguageModel:

 @pg.patcher(auto_typing=True)
 def lm(unused_eval, models: list[str]):
-  """
+  """Patches the LM used for benchmarking."""
   return patch_lm(pg.oneof([model_by_name(name) for name in models]))


 @pg.patcher(auto_typing=True)
 def temperature(unused_eval, value: float):
-  """
+  """Patches the temperature used for benchmarking."""
   return patch_member(lf.LMSamplingOptions, "temperature", value)


 @pg.patcher(auto_typing=True)
 def max_tokens(unused_eval, value: int | None):
-  """
+  """Patches the max_tokens used for benchmarking."""
   return patch_member(lf.LMSamplingOptions, "max_tokens", value)
langfun/core/eval/scoring.py
CHANGED
@@ -41,18 +41,19 @@ class Scoring(base.Evaluation):

   @property
   def score_rate(self) -> float:
-    """Returns the
+    """Returns the rate of scored examples among the completed ones."""
     if self.num_completed == 0:
       return 0.0
     return self.num_scored / self.num_completed

   @property
   def scored_link(self) -> str:
-    """Returns the
+    """Returns the scored examples page."""
     return self.link(os.path.join(self.dir, Scoring.SCORED_HTML))

   @property
   def avg_score(self) -> float:
+    """Returns the average score of scored examples."""
     if self.num_scored == 0:
       return 0
     return sum([i[2] for i in self._scored]) / self.num_scored
@@ -181,7 +182,7 @@
     super()._render_summary_metrics(s)

   def _render_scored(self, s: io.StringIO) -> None:
-    """Formats the
+    """Formats the scored cases into html."""
     s.write('<h2> Scored </h2>')
     s.write('<div style="white-space:pre">\n')
     s.write(
langfun/core/eval/v2/__init__.py
CHANGED
@@ -35,9 +35,11 @@ from langfun.core.eval.v2.experiment import Runner
 from langfun.core.eval.v2 import runners

 # Plugins
+from langfun.core.eval.v2.config_saver import RunConfigSaver
 from langfun.core.eval.v2.checkpointing import BulkCheckpointer
 from langfun.core.eval.v2.checkpointing import PerExampleCheckpointer
 from langfun.core.eval.v2.reporting import HtmlReporter
+from langfun.core.eval.v2.reporting import ExampleHtmlGenerator


 # pylint: enable=g-bad-import-order
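The newly exported plugins are attached to an experiment run via the `plugins` argument, the same way the checkpointing test further below wires in its checkpointer. A minimal sketch; the experiment object itself is assumed to be defined elsewhere, and the no-argument constructors rely on the defaults shown in this diff.

```python
import tempfile
from langfun.core.eval import v2 as lf_eval_v2

plugins = [
    lf_eval_v2.RunConfigSaver(),    # saves the run config as run.json
    lf_eval_v2.BulkCheckpointer(),  # appends processed examples to one checkpoint file
    lf_eval_v2.HtmlReporter(),      # writes HTML reports for the run
]

# `my_experiment` stands in for an lf_eval_v2.Experiment defined elsewhere:
# run = my_experiment.run(
#     tempfile.mkdtemp(), 'new', runner='sequential', plugins=plugins)
```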
langfun/core/eval/v2/checkpointing.py
CHANGED
@@ -13,6 +13,8 @@
 # limitations under the License.
 """Checkpointing evaluation runs."""
 import abc
+import datetime
+import os
 import re
 import threading
 import traceback
@@ -29,12 +31,32 @@ Runner = experiment_lib.Runner


 class Checkpointer(experiment_lib.Plugin):
-  """Base class for checkpointing evaluation examples.
+  """Base class for checkpointing evaluation examples.
+
+  `Checkpointer` is a plugin that saves the state of processed examples
+  incrementally during an experiment run, allowing the experiment to be resumed
+  later. When an experiment starts, the checkpointer loads any previously saved
+  examples from an earlier run (or a warm-start run) into `experiment.state`,
+  so the runner can skip processing them again.
+  Subclasses should implement `_list_checkpoint_filenames` to identify
+  checkpoint files to load, and `_save_example` to save a newly processed
+  example.
+  """

   checkpoint_filename: Annotated[
       str,
       'Checkpoint file pattern.'
-  ] = 'checkpoint.
+  ] = 'checkpoint.jsonl'
+
+  enable_inprogress_file: Annotated[
+      bool,
+      'If True, write file "<example_id>.inprogress" when example gets started.'
+  ] = True
+
+  max_ckpt_loading_threads: Annotated[
+      int,
+      'Max number of workers for loading checkpoint files at startup.'
+  ] = 128

   def on_experiment_start(
       self,
@@ -75,6 +97,24 @@ class Checkpointer(experiment_lib.Plugin):
           f'scratch. Example IDs: {example_ids_to_evaluate}.'
       )

+  def on_example_start(
+      self,
+      runner: Runner,
+      experiment: Experiment,
+      example: Example,
+  ) -> None:
+    """Saves the example to the checkpoint file."""
+    if self.enable_inprogress_file:
+      def _save_inprogress_file(example: Example):
+        inprogress_file = runner.current_run.output_path_for(
+            experiment, f'{example.id}.inprogress'
+        )
+        pg.io.writefile(
+            inprogress_file,
+            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        )
+      runner.background_run(_save_inprogress_file, example)
+
   def on_example_complete(
       self,
       runner: Runner,
@@ -149,7 +189,10 @@ class Checkpointer(experiment_lib.Plugin):

     _ = list(
         lf.concurrent_map(
-            _load_state,
+            _load_state,
+            ckpt_files,
+            max_workers=self.max_ckpt_loading_threads,
+            silence_on_errors=None
         )
     )

@@ -170,7 +213,12 @@ class Checkpointer(experiment_lib.Plugin):


 class PerExampleCheckpointer(Checkpointer):
-  """Checkpointer that saves each example to a separate file.
+  """Checkpointer that saves each example to a separate file.
+
+  This checkpointer saves each processed example to its own checkpoint file,
+  named using the pattern `<checkpoint_filename_prefix>_<example_id>.<ext>`.
+  For example, `checkpoint_1.bagz`, `checkpoint_2.bagz`, etc.
+  """

   def _on_bound(self):
     super()._on_bound()
@@ -235,7 +283,13 @@ class PerExampleCheckpointer(Checkpointer):


 class BulkCheckpointer(Checkpointer):
-  """Checkpointer that saves all examples to a single file.
+  """Checkpointer that saves all examples of an evaluation to a single file.
+
+  This checkpointer appends newly processed examples of an evaluation to a
+  single sequence file (e.g., `checkpoint.bagz`). This is often more efficient
+  than `PerExampleCheckpointer` when dealing with a large number of examples
+  or when file system overhead is a concern.
+  """

   def _on_bound(self):
     super()._on_bound()
@@ -341,12 +395,26 @@ class BulkCheckpointer(Checkpointer):


 class SequenceWriter:
-  """
+  """A thread-safe writer for sequence files (e.g., Bagz) with atomic write.
+
+  `SequenceWriter` wraps a `pg.io.SequenceWriter` to provide thread-safe
+  `add` and `close` operations, ensuring that examples can be written
+  concurrently from multiple threads without corrupting the sequence file.
+  It writes to a temporary file and renames it to target path on `close` to
+  achieve atomic write. If the target path exists, new examples are appended
+  to existing content.
+  """

   def __init__(self, path: str):
     self._lock = threading.Lock()
     self._path = path
-
+    basename = os.path.basename(path)
+    self._tmp_path = os.path.join(
+        os.path.dirname(path), f'tmp.{basename}'
+    )
+    if pg.io.path_exists(self._path):
+      pg.io.copy(self._path, self._tmp_path)
+    self._sequence_writer = pg.io.open_sequence(self._tmp_path, 'a')

   @property
   def path(self) -> str:
@@ -371,6 +439,7 @@ class SequenceWriter:
       return
     self._sequence_writer.close()
     self._sequence_writer = None
+    pg.io.rename(self._tmp_path, self._path)

   def __del__(self):
     self.close()
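A minimal sketch of configuring the new checkpointer options introduced above (the concrete values are illustrative):

```python
from langfun.core.eval import v2 as lf_eval_v2

checkpointer = lf_eval_v2.PerExampleCheckpointer(
    checkpoint_filename='checkpoint.jsonl',
    # Write '<example_id>.inprogress' when an example starts, making it easy
    # to spot examples that started but never completed.
    enable_inprogress_file=True,
    # Bound the worker threads used to load prior checkpoint files at startup.
    max_ckpt_loading_threads=64,
)
# The checkpointer is then passed to an experiment run via plugins=[checkpointer].
```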
langfun/core/eval/v2/checkpointing_test.py
CHANGED
@@ -65,7 +65,7 @@ class ExampleCollector(experiment_lib.Plugin):
     return self._examples

   def on_example_complete(
-      self, runner:
+      self, runner: experiment_lib.Runner,
       experiment: experiment_lib.Experiment,
       example: example_lib.Example,
   ):
@@ -90,7 +90,10 @@ class PerExampleCheckpointerTest(CheckpointerTest):
     root_dir = os.path.join(tempfile.mkdtemp(), 'per_example_checkpointer')
     experiment = eval_test_helper.test_experiment()
     checkpoint_filename = 'checkpoint.jsonl'
-    checkpointer = checkpointing.PerExampleCheckpointer(
+    checkpointer = checkpointing.PerExampleCheckpointer(
+        checkpoint_filename,
+        enable_inprogress_file=True
+    )
     collector = ExampleCollector()
     run = experiment.run(
         root_dir, 'new', runner='sequential', plugins=[checkpointer, collector]
@@ -102,6 +105,10 @@ class PerExampleCheckpointerTest(CheckpointerTest):
       example = collector.examples[i + 1]
       ckpt = run.output_path_for(leaf, f'checkpoint_{example.id}.jsonl')
       self.assertTrue(pg.io.path_exists(ckpt))
+      inprogress_file = run.output_path_for(
+          leaf, f'{example.id}.inprogress'
+      )
+      self.assertTrue(pg.io.path_exists(inprogress_file))
       with pg.io.open_sequence(ckpt) as f:
         examples_from_ckpt = list(iter(f))
       # `eval_test_helper.test_experiment` has two TestEvaluation with
langfun/core/eval/v2/config_saver.py
ADDED
@@ -0,0 +1,37 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Config saver plugins."""
+
+import os
+from langfun.core.eval.v2 import experiment as experiment_lib
+
+
+class RunConfigSaver(experiment_lib.Plugin):
+  """Saves the current run."""
+
+  def on_run_start(
+      self,
+      runner: experiment_lib.Runner,
+      root: experiment_lib.Experiment
+  ) -> None:
+    del root  # Unused.
+    self._save_run_config(runner)
+
+  def _save_run_config(self, runner: experiment_lib.Runner) -> None:
+    def _save():
+      runner.current_run.save(
+          os.path.join(runner.current_run.output_root, 'run.json'),
+          hide_default_values=True,
+      )
+    runner.background_run(_save)