PyPI - langfun - Versions diffs - 0.1.2.dev202411100803__py3-none-any.whl → 0.1.2.dev202411120804__py3-none-any.whl - Mend

langfun 0.1.2.dev202411100803__py3-none-any.whl → 0.1.2.dev202411120804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35) hide show

langfun/core/console.py +10 -2
langfun/core/console_test.py +17 -0
langfun/core/eval/__init__.py +2 -0
langfun/core/eval/v2/__init__.py +34 -0
langfun/core/eval/v2/checkpointing.py +130 -0
langfun/core/eval/v2/checkpointing_test.py +89 -0
langfun/core/eval/v2/evaluation.py +615 -0
langfun/core/eval/v2/evaluation_test.py +143 -0
langfun/core/eval/v2/example.py +286 -0
langfun/core/eval/v2/example_test.py +92 -0
langfun/core/eval/v2/experiment.py +949 -0
langfun/core/eval/v2/experiment_test.py +304 -0
langfun/core/eval/v2/metric_values.py +156 -0
langfun/core/eval/v2/metric_values_test.py +80 -0
langfun/core/eval/v2/metrics.py +357 -0
langfun/core/eval/v2/metrics_test.py +203 -0
langfun/core/eval/v2/progress.py +348 -0
langfun/core/eval/v2/progress_test.py +82 -0
langfun/core/eval/v2/progress_tracking.py +209 -0
langfun/core/eval/v2/progress_tracking_test.py +56 -0
langfun/core/eval/v2/reporting.py +144 -0
langfun/core/eval/v2/reporting_test.py +41 -0
langfun/core/eval/v2/runners.py +417 -0
langfun/core/eval/v2/runners_test.py +311 -0
langfun/core/eval/v2/test_helper.py +78 -0
langfun/core/language_model.py +122 -11
langfun/core/language_model_test.py +97 -4
langfun/core/llms/__init__.py +3 -0
langfun/core/llms/compositional.py +101 -0
langfun/core/llms/compositional_test.py +73 -0
{langfun-0.1.2.dev202411100803.dist-info → langfun-0.1.2.dev202411120804.dist-info}/METADATA +1 -1
{langfun-0.1.2.dev202411100803.dist-info → langfun-0.1.2.dev202411120804.dist-info}/RECORD +35 -11
{langfun-0.1.2.dev202411100803.dist-info → langfun-0.1.2.dev202411120804.dist-info}/WHEEL +1 -1
{langfun-0.1.2.dev202411100803.dist-info → langfun-0.1.2.dev202411120804.dist-info}/LICENSE +0 -0
{langfun-0.1.2.dev202411100803.dist-info → langfun-0.1.2.dev202411120804.dist-info}/top_level.txt +0 -0

langfun/core/eval/v2/evaluation_test.py ADDED Viewed

@@ -0,0 +1,143 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import tempfile
+import unittest
+from langfun.core.eval.v2 import evaluation as evaluation_lib
+from langfun.core.eval.v2 import example as example_lib
+from langfun.core.eval.v2 import experiment as experiment_lib
+from langfun.core.eval.v2 import test_helper
+import pyglove as pg
+# Short module-level aliases for the library types used by the tests below.
+Example = example_lib.Example
+Evaluation = evaluation_lib.Evaluation
+RunId = experiment_lib.RunId
+Run = experiment_lib.Run
+class EvaluationTest(unittest.TestCase):
+  """Tests for evaluations, driven through `test_helper.TestEvaluation`."""
+  def test_hyper_evaluation(self):
+    """A `pg.oneof` LM offset expands into one leaf child per candidate."""
+    exp = test_helper.TestEvaluation(
+        lm=test_helper.TestLLM(offset=pg.oneof(range(3)))
+    )
+    # The hyper evaluation itself is not a leaf; its children are the concrete
+    # evaluations for offsets 0, 1 and 2.
+    self.assertFalse(exp.is_leaf)
+    self.assertTrue(
+        pg.eq(
+            exp.children,
+            [
+                test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=0)),
+                test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=1)),
+                test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=2)),
+            ]
+        )
+    )
+    self.assertEqual(exp.children[0].num_examples, 10)
+    self.assertEqual(
+        [c.is_leaf for c in exp.children],
+        [True] * len(exp.children)
+    )
+    # Each leaf reports the resource ID of its own LM variant.
+    self.assertEqual(
+        [r.resource_ids() for r in exp.leaf_nodes],
+        [set(['test_llm:0']), set(['test_llm:1']), set(['test_llm:2'])]
+    )
+  def test_evaluate(self):
+    """Covers a matching output, a mismatching output and an erroring example."""
+    # Default test evaluation: example 3 produces the groundtruth (a match).
+    exp = test_helper.TestEvaluation()
+    example = exp.evaluate(Example(id=3))
+    self.assertTrue(example.newly_processed)
+    self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
+    self.assertEqual(example.output, 6)
+    self.assertIsNone(example.error)
+    self.assertEqual(example.metadata, {})
+    self.assertEqual(example.metric_metadata, dict(match=True))
+    self.assertIsNotNone(example.usage_summary)
+    self.assertGreater(example.usage_summary.total.total_tokens, 0)
+    self.assertEqual(example.usage_summary.total.num_requests, 1)
+    self.assertIsNotNone(example.execution_status)
+    self.assertIsNotNone(example.start_time)
+    self.assertIsNotNone(example.end_time)
+    # With `offset=1` the output for the same example is expected to be 7,
+    # which is audited as a mismatch. `evaluate` is called with a bare ID here.
+    exp = test_helper.TestEvaluation(lm=test_helper.TestLLM(offset=1))
+    example = exp.evaluate(3)
+    self.assertTrue(example.newly_processed)
+    self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
+    self.assertEqual(example.output, 7)
+    self.assertIsNone(example.error)
+    self.assertEqual(example.metadata, {})
+    self.assertEqual(example.metric_metadata, dict(mismatch=True))
+    # Example 6 (x=5) fails: the error is raised when `raise_if_has_error` is
+    # set, and is otherwise captured on the returned example.
+    with self.assertRaisesRegex(ValueError, 'x should not be 5'):
+      _ = exp.evaluate(6, raise_if_has_error=True)
+    example = exp.evaluate(6)
+    self.assertTrue(example.newly_processed)
+    self.assertEqual(example.input, pg.Dict(x=5, y=25, groundtruth=30))
+    self.assertEqual(pg.MISSING_VALUE, example.output)
+    self.assertEqual(example.error.tag, 'ValueError')
+    self.assertEqual(example.metadata, {})
+    self.assertEqual(example.metric_metadata, dict(error='ValueError'))
+  def test_evaluate_with_state(self):
+    """Re-evaluating an example loaded from a state file is not newly processed."""
+    # NOTE(review): uses a fixed 'test_eval' directory under the system temp
+    # dir, which is shared across runs and is not cleaned up by this test.
+    eval_dir = os.path.join(tempfile.gettempdir(), 'test_eval')
+    pg.io.mkdirs(eval_dir, exist_ok=True)
+    state_file = os.path.join(eval_dir, 'state.jsonl')
+    # Evaluate one example and write it to the state file.
+    with pg.io.open_sequence(state_file, 'w') as f:
+      exp = test_helper.TestEvaluation()
+      example = exp.evaluate(3)
+      self.assertTrue(example.newly_processed)
+      self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
+      self.assertEqual(example.output, 6)
+      self.assertEqual(len(exp._state.evaluated_examples), 1)
+      f.add(pg.to_json_str(example))
+    # Resetting drops the evaluated examples; loading the file restores them.
+    exp.reset()
+    self.assertEqual(len(exp._state.evaluated_examples), 0)
+    exp.load_state(state_file)
+    self.assertEqual(len(exp._state.evaluated_examples), 1)
+    # The restored example is returned as already processed, and all of its
+    # usage is reported as cached rather than uncached.
+    example = exp.evaluate(3)
+    self.assertFalse(example.newly_processed)
+    self.assertEqual(example.input, pg.Dict(x=2, y=4, groundtruth=6))
+    self.assertEqual(example.output, 6)
+    self.assertGreater(example.usage_summary.total.total_tokens, 0)
+    self.assertGreater(example.usage_summary.cached.total.total_tokens, 0)
+    self.assertEqual(example.usage_summary.cached.total.num_requests, 1)
+    self.assertEqual(example.usage_summary.uncached.total.total_tokens, 0)
+    self.assertEqual(example.usage_summary.uncached.total.num_requests, 0)
+  def test_html_view(self):
+    """The experiment ID appears in both the card view and the full view."""
+    exp = test_helper.TestEvaluation()
+    # Card view without a current run.
+    self.assertIn(
+        exp.id,
+        exp.to_html(extra_flags=dict(card_view=True, current_run=None)).content
+    )
+    # Full view with a current run that references the experiment.
+    self.assertIn(
+        exp.id,
+        exp.to_html(
+            extra_flags=dict(
+                card_view=False,
+                current_run=Run(
+                    root_dir='/tmp/test_run',
+                    id=RunId.from_id('20241031_1'),
+                    experiment=pg.Ref(exp),
+                )
+            )
+        ).content
+    )
+# Run the tests when this file is executed as a script.
+if __name__ == '__main__':
+  unittest.main()

langfun/core/eval/v2/example.py ADDED Viewed

@@ -0,0 +1,286 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Base classes for Langfun evaluation."""
+import dataclasses
+import inspect
+from typing import Any, Callable
+import langfun.core as lf
+import pyglove as pg
+@dataclasses.dataclass
+class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
+  """An item for the evaluation.
+  Attributes:
+    id: The 1-based ID of the item in the evaluation set.
+    input: An element returned from the `Evaluable.inputs` functor.
+    output: The output of the `process` method. If `pg.MISSING_VALUE`, it has
+      not been processed yet.
+    metadata: The metadata of the item produced by the `process` method.
+    metric_metadata: The dictionary returned from `Metric.audit`.
+    start_time: The start time of the evaluation item.
+    end_time: The end time of the evaluation item.
+    usage_summary: The summary of LLM usages of the evaluation item.
+    execution_status: The timeit status of the evaluation item.
+  """
+  id: int
+  input: Any = pg.MISSING_VALUE
+  output: Any = pg.MISSING_VALUE
+  error: pg.object_utils.ErrorInfo | None = None
+  metadata: dict[str, Any] = dataclasses.field(default_factory=dict)
+  metric_metadata: dict[str, Any] | None = None
+  # Execution information.
+  newly_processed: bool = True
+  start_time: float | None = None
+  end_time: float | None = None
+  usage_summary: lf.UsageSummary | None = None
+  execution_status: dict[str, pg.object_utils.TimeIt.Status] | None = None
+  def __post_init__(self):
+    if self.execution_status is not None:
+      for status in self.execution_status.values():
+        if status.has_error:
+          self.error = status.error
+          break
+  @property
+  def is_processed(self) -> bool:
+    """Returns whether the item has been processed."""
+    return pg.MISSING_VALUE != self.output
+  @property
+  def has_error(self) -> bool:
+    """Returns whether the item has an error."""
+    return self.error is not None
+  @property
+  def elapse(self) -> float | None:
+    """Returns the elapse time of the item."""
+    if self.execution_status is not None:
+      return self.execution_status['evaluate'].elapse
+    return None
+  def to_json(self, **kwargs) -> dict[str, Any]:
+    """Returns the JSON representation of the item."""
+    return self.to_json_dict(
+        fields=dict(
+            id=(self.id, None),
+            # NOTE(daiyip): We do not write `input` to JSON as it will be
+            # loaded from the input functor. This allows us to support
+            # non-serializable examples.
+            output=(self.output, pg.MISSING_VALUE),
+            error=(self.error, None),
+            metadata=(self.metadata, {}),
+            metric_metadata=(self.metric_metadata, None),
+            start_time=(self.start_time, None),
+            end_time=(self.end_time, None),
+            usage_summary=(self.usage_summary, None),
+            execution_status=(self.execution_status, None),
+        ),
+        exclude_default=True,
+        **kwargs,
+    )
+  @classmethod
+  def from_json(
+      cls,
+      json_value: dict[str, Any],
+      *,
+      example_input_by_id: Callable[[int], Any],
+      **kwargs
+  ) -> 'Example':
+    """Creates an example from the JSON representation."""
+    example_id = json_value.get('id')
+    example_input = example_input_by_id(example_id)
+    json_value['input'] = example_input
+    # NOTE(daiyip): We need to load the types of the examples into the
+    # deserialization context, otherwise the deserialization will fail if the
+    # types are not registered.
+    def example_class_defs(example) -> list[type[Any]]:
+      referred_types = set()
+      def _visit(k, v, p):
+        del k, p
+        if inspect.isclass(v):
+          referred_types.add(v)
+        elif isinstance(v, pg.Object):
+          referred_types.add(v.__class__)
+        return pg.TraverseAction.ENTER
+      pg.traverse(example, _visit)
+      return list(referred_types)
+    with pg.JSONConvertible.load_types_for_deserialization(
+        *example_class_defs(example_input)
+    ):
+      return cls(
+          **{k: pg.from_json(v, **kwargs) for k, v in json_value.items()}
+      )
+  #
+  # HTML rendering.
+  #
+  def _html_tree_view_content(
+      self,
+      *,
+      view: pg.views.HtmlTreeView,
+      root_path: pg.KeyPath | None = None,
+      extra_flags: dict[str, Any] | None = None,
+      **kwargs
+  ):
+    root_path = root_path or pg.KeyPath()
+    extra_flags = extra_flags or {}
+    num_examples = extra_flags.get('num_examples', None)
+    def _metric_metadata_badge(key, value):
+      if isinstance(value, bool) and bool:
+        text = key
+      else:
+        text = f'{key}:{value}'
+      return pg.views.html.controls.Badge(
+          text,
+          css_classes=[pg.object_utils.camel_to_snake(key, '-')],
+      )
+    def _render_header():
+      return pg.Html.element(
+          'div',
+          [
+              pg.Html.element(
+                  'div',
+                  [
+                      # Previous button.
+                      pg.views.html.controls.Label(   # pylint: disable=g-long-ternary
+                          '◀',
+                          link=f'{self.id - 1}.html',
+                          css_classes=['previous'],
+                      ) if self.id > 1 else None,
+                      # Current example ID.
+                      pg.views.html.controls.Label(
+                          f'#{self.id}',
+                          css_classes=['example-id'],
+                      ),
+                      # Next button.
+                      pg.views.html.controls.Label(   # pylint: disable=g-long-ternary
+                          '▶',
+                          link=f'{self.id + 1}.html',
+                          css_classes=['next'],
+                      ) if (num_examples is None
+                            or self.id < num_examples) else None,
+                  ]
+              ),
+              pg.Html.element(
+                  'div',
+                  [
+                      # Usage summary.
+                      pg.view(  # pylint: disable=g-long-ternary
+                          self.usage_summary,
+                          extra_flags=dict(as_badge=True)
+                      ) if self.usage_summary is not None else None,
+                      # Metric metadata.
+                      pg.views.html.controls.LabelGroup(
+                          [   # pylint: disable=g-long-ternary
+                              _metric_metadata_badge(k, v)
+                              for k, v in self.metric_metadata.items()
+                          ] if self.metric_metadata else []
+                      ),
+                  ],
+                  css_classes=['example-container'],
+              )
+          ]
+      )
+    def _render_content():
+      def _tab(label, key):
+        field = getattr(self, key)
+        if pg.MISSING_VALUE == field or not field:
+          return None
+        return pg.views.html.controls.Tab(
+            label=label,
+            content=view.render(
+                field,
+                root_path=root_path + key,
+                **view.get_passthrough_kwargs(**kwargs),
+            ),
+        )
+      tabs = [
+          _tab('Input', 'input'),
+          _tab('Output', 'output'),
+          _tab('Output Metadata', 'metadata'),
+          _tab('Error', 'error'),
+      ]
+      return pg.views.html.controls.TabControl(
+          [tab for tab in tabs if tab is not None]
+      )
+    return pg.Html.element(
+        'div',
+        [
+            _render_header(),
+            _render_content(),
+        ],
+        css_classes=['eval-example']
+    )
+  def _html_tree_view_summary(self, *, view, **kwargs):
+    return None
+  @classmethod
+  def _html_tree_view_css_styles(cls) -> list[str]:
+    return super()._html_tree_view_css_styles() + [
+        """
+        .example-container {
+          display: block;
+          padding: 10px;
+        }
+        .example-id {
+          font-weight: bold;
+          font-size: 40px;
+          margin: 0 10px;
+          vertical-align: middle;
+        }
+        a.previous, a.next {
+          text-decoration: none;
+          vertical-align: middle;
+          display: inline-block;
+          padding: 8px 8px;
+          color: #DDD;
+        }
+        a.previous:hover, a.next:hover {
+          background-color: #ddd;
+          color: black;
+        }
+        /* Badge styles. */
+        .eval-example .badge.match {
+          color: green;
+          background-color: #dcefbe;
+        }
+        .eval-example .badge.error {
+          color: red;
+          background-color: #fdcccc;
+        }
+        .eval-example .badge.mismatch {
+          color: orange;
+          background-color: #ffefc4;
+        }
+        .eval-example .badge.score {
+          color: blue;
+          background-color: #c4dced;
+        }
+        """
+    ]

langfun/core/eval/v2/example_test.py ADDED Viewed

@@ -0,0 +1,92 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from langfun.core.eval.v2 import example as example_lib
+import pyglove as pg
+# Short alias for the class under test.
+Example = example_lib.Example
+class ExampleTest(unittest.TestCase):
+  """Tests for `Example`: basic properties, JSON round trip and HTML view."""
+  def test_basic(self):
+    """An error in `execution_status` surfaces on the example's `error`."""
+    error = pg.object_utils.ErrorInfo(
+        tag='ValueError',
+        description='Bad input',
+        stacktrace='...',
+    )
+    # No `error` is passed explicitly; it is expected to be taken from the
+    # errored 'evaluate' status.
+    ex = Example(id=1, execution_status={
+        'evaluate': pg.object_utils.TimeIt.Status(
+            name='evaluation', elapse=1.0, error=error
+        )
+    })
+    self.assertEqual(ex.error, error)
+    self.assertFalse(ex.is_processed)
+    self.assertTrue(ex.has_error)
+    self.assertEqual(ex.elapse, 1.0)
+    # An example with an output but no execution status is processed, has no
+    # error and no elapse time.
+    ex = Example(id=2, output=1)
+    self.assertTrue(ex.is_processed)
+    self.assertFalse(ex.has_error)
+    self.assertIsNone(ex.elapse)
+  def test_json_conversion(self):
+    """An example round-trips through JSON when its input is supplied by ID."""
+    def input_func():
+      # The classes are defined locally, so they are only reachable through
+      # the returned input.
+      class A(pg.Object):
+        x: int
+      class B(pg.Object):
+        x: int = 1
+        y: int = 2
+      return [
+          pg.Dict(
+              a=A,
+              b=B
+          )
+      ]
+    inputs = input_func()
+    ex = Example(
+        id=1,
+        input=inputs[0],
+        output=inputs[0].a(1),
+        metadata=dict(b=inputs[0].b())
+    )
+    json_str = pg.to_json_str(ex)
+    # Example IDs are 1-based, hence the `i - 1` index into `inputs`.
+    self.assertEqual(
+        pg.from_json_str(
+            json_str,
+            example_input_by_id=lambda i: inputs[i - 1]
+        ),
+        ex
+    )
+  def test_html_view(self):
+    """No 'next' appears in the HTML when the example is the last one."""
+    ex = Example(
+        id=1,
+        input=pg.Dict(a=1, b=2),
+        output=3,
+        metadata=dict(sum=3),
+        metric_metadata=dict(match=True),
+    )
+    # With `num_examples=1`, example 1 is the last one.
+    self.assertNotIn(
+        'next',
+        ex.to_html(extra_flags=dict(num_examples=1)).content,
+    )
+# Run the tests when this file is executed as a script.
+if __name__ == '__main__':
+  unittest.main()

langfun 0.1.2.dev202411100803__py3-none-any.whl → 0.1.2.dev202411120804__py3-none-any.whl

langfun 0.1.2.dev202411100803__py3-none-any.whl → 0.1.2.dev202411120804__py3-none-any.whl