langfun 0.0.2.dev20240429__py3-none-any.whl → 0.0.2.dev20240511__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (37)
  1. langfun/__init__.py +5 -0
  2. langfun/core/eval/__init__.py +14 -1
  3. langfun/core/eval/base.py +503 -112
  4. langfun/core/eval/base_test.py +185 -53
  5. langfun/core/eval/matching.py +22 -21
  6. langfun/core/eval/matching_test.py +23 -2
  7. langfun/core/eval/patching.py +130 -0
  8. langfun/core/eval/patching_test.py +170 -0
  9. langfun/core/eval/scoring.py +4 -4
  10. langfun/core/eval/scoring_test.py +19 -2
  11. langfun/core/langfunc.py +1 -17
  12. langfun/core/langfunc_test.py +4 -0
  13. langfun/core/language_model.py +6 -0
  14. langfun/core/llms/__init__.py +8 -0
  15. langfun/core/llms/fake.py +6 -6
  16. langfun/core/llms/google_genai.py +8 -0
  17. langfun/core/llms/openai.py +3 -2
  18. langfun/core/llms/openai_test.py +2 -1
  19. langfun/core/llms/vertexai.py +291 -0
  20. langfun/core/llms/vertexai_test.py +233 -0
  21. langfun/core/modalities/image.py +1 -3
  22. langfun/core/modalities/mime.py +6 -0
  23. langfun/core/modalities/video.py +1 -3
  24. langfun/core/structured/__init__.py +2 -0
  25. langfun/core/structured/mapping.py +5 -1
  26. langfun/core/structured/prompting.py +39 -11
  27. langfun/core/structured/prompting_test.py +43 -0
  28. langfun/core/structured/schema.py +34 -4
  29. langfun/core/structured/schema_test.py +32 -1
  30. langfun/core/structured/scoring.py +4 -1
  31. langfun/core/structured/scoring_test.py +6 -0
  32. langfun/core/template.py +22 -1
  33. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/METADATA +2 -2
  34. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/RECORD +37 -33
  35. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/LICENSE +0 -0
  36. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/WHEEL +0 -0
  37. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/top_level.txt +0 -0
langfun/core/eval/base_test.py

@@ -220,7 +220,18 @@ class EvaluationTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
+ metrics=dict(
+ total=2,
+ failures=1,
+ failure_rate=0.5,
+ oop_failures=1,
+ oop_failure_rate=0.5,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ }
+ ),
  usage=dict(
  total_prompt_tokens=774,
  total_completion_tokens=25,
@@ -235,12 +246,20 @@ class EvaluationTest(unittest.TestCase):
  os.path.exists(os.path.join(s.dir, base.Evaluation.EXPERIMENT_JSON)))
  self.assertTrue(
  os.path.exists(os.path.join(s.dir, base.Evaluation.RESULT_JSON)))
+ self.assertTrue(
+ os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_JSON)))
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_JSON)))
  self.assertTrue(
  os.path.exists(os.path.join(s.dir, base.Evaluation.CACHE_JSON)))
  self.assertTrue(
  os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
  self.assertTrue(
- os.path.exists(os.path.join(s.dir, base.Evaluation.FAILURES_HTML)))
+ os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))
  self.assertTrue(
  os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
  )
@@ -274,7 +293,10 @@ class EvaluationTest(unittest.TestCase):
  self.assertFalse(
  os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
  self.assertFalse(
- os.path.exists(os.path.join(s.dir, base.Evaluation.FAILURES_HTML)))
+ os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+ self.assertFalse(
+ os.path.exists(
+ os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))

  def test_load(self):
  lm = fake.StaticResponse('Solution(final_answer=2)')
@@ -312,7 +334,16 @@ class EvaluationTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=0, failure_rate=0.0),
+ metrics=dict(
+ total=2,
+ failures=0,
+ failure_rate=0.0,
+ oop_failures=0,
+ oop_failure_rate=0.0,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={},
+ ),
  usage=s.children[1].result.usage,
  ),
  },
@@ -363,7 +394,18 @@ class EvaluationTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
+ metrics=dict(
+ total=2,
+ failures=1,
+ failure_rate=0.5,
+ oop_failures=1,
+ oop_failure_rate=0.5,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ }
+ ),
  usage=s.children[0].result.usage,
  ),
  s.children[1].id: dict(
@@ -378,7 +420,18 @@ class EvaluationTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
+ metrics=dict(
+ total=2,
+ failures=1,
+ failure_rate=0.5,
+ oop_failures=1,
+ oop_failure_rate=0.5,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ }
+ ),
  usage=s.children[1].result.usage,
  ),
  },
@@ -475,7 +528,7 @@ class SuiteTest(unittest.TestCase):
  self.assertEqual(s.hash, '26e6cc25')
  s.run()
  expected = {
- s.children[0].id: dict(
+ 'Evaluation@0fade07d': dict(
  experiment_setup=dict(
  id=s.children[0].id,
  dir=s.children[0].dir,
@@ -487,48 +540,46 @@ class SuiteTest(unittest.TestCase):
  cache_stats=dict(
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
+ metrics=dict(
+ total=2,
+ failures=1,
+ failure_rate=0.5,
+ oop_failures=1,
+ oop_failure_rate=0.5,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ }
+ ),
  usage=s.children[0].result.usage,
  ),
- s.children[1].id: {
- s.children[1]
- .children[0]
- .id: dict(
- experiment_setup=dict(
- id=s.children[1].children[0].id,
- dir=s.children[1].children[0].dir,
- model='StaticSequence',
- prompt_template='{{example.question}}',
- method='call',
- schema_fn='answer_schema()',
- ),
- cache_stats=dict(
- use_cache=True, num_queries=4, num_hits=1, num_updates=3
- ),
- metrics=dict(total=2, failures=2, failure_rate=1.0),
- usage=s.children[1].children[0].result.usage,
+ 'Evaluation@ae86c703': dict(
+ experiment_setup=dict(
+ id=s.children[1].children[0].id,
+ dir=s.children[1].children[0].dir,
+ model='StaticSequence',
+ prompt_template='{{example.question}}',
+ method='call',
+ schema_fn='answer_schema()',
  ),
- s.children[1]
- .children[2]
- .id: dict(
- experiment_setup=dict(
- id=s.children[1].children[2].id,
- dir=s.children[1].children[2].dir,
- model='StaticSequence',
- prompt_template='{{example.question}}',
- method='query',
- schema_fn='answer_schema()',
- ),
- cache_stats=dict(
- use_cache=True,
- num_queries=2,
- num_hits=0,
- num_updates=2,
- ),
- metrics=dict(total=2, failures=1, failure_rate=0.5),
- usage=s.children[1].children[2].result.usage,
+ cache_stats=dict(
+ use_cache=True, num_queries=4, num_hits=1, num_updates=3
  ),
- },
+ metrics=dict(
+ total=2,
+ failures=2,
+ failure_rate=1.0,
+ oop_failures=2,
+ oop_failure_rate=1.0,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 2
+ }
+ ),
+ usage=s.children[1].children[0].result.usage,
+ ),
  }
  self.assertEqual(s.result, expected)

@@ -698,16 +749,97 @@ class SummaryTest(unittest.TestCase):
  self.assertTrue(pg.io.path_exists(summary_file))


- class AppRunTest(unittest.TestCase):
+ class NamedEvaluationTest(unittest.TestCase):

- def test_app_run(self):
- lm = fake.StaticSequence(['two', 'Solution(final_answer=2)'])
- try:
- base.app_run(
- eval_set('app_run_test', 'query', schema_fn=answer_schema(), lm=lm)
+ def test_named_eval_class(self):
+
+ @base.register('named_eval/class_test')
+ class MyEval(base.Evaluation):
+ inputs = base.as_inputs([
+ pg.Dict(question='Compute 1 + 1'),
+ ])
+ method = 'query'
+ prompt = pg.oneof([
+ lf.Template('{{example.question}}'),
+ lf.Template('Hello {{example.question}}'),
+ ])
+ schema_fn = answer_schema()
+
+ evaluation = base.get_evaluation('named_eval/class_test')
+ self.assertIsInstance(evaluation, MyEval)
+ self.assertIsNone(evaluation.dir)
+ self.assertIsNone(evaluation.root_dir)
+ self.assertIn('named_eval/class_test', base.registered_names())
+
+ with self.assertRaisesRegex(ValueError, 'Unsupported type.*'):
+ @base.register('named_eval/bad_class')
+ class Foo: # pylint: disable=unused-variable
+ pass
+
+ def test_named_eval_functor(self):
+
+ @base.register('named_eval/functor_test')
+ def my_eval():
+ return base.Evaluation(
+ inputs=base.as_inputs([
+ pg.Dict(question='Compute 1 + 1'),
+ ]),
+ method='query',
+ prompt=pg.oneof([
+ lf.Template('{{example.question}}'),
+ lf.Template('Hello {{example.question}}'),
+ ]),
+ schema_fn=answer_schema(),
  )
- except SystemExit:
- pass
+
+ self.assertTrue(issubclass(my_eval, base.Evaluable))
+ evaluation = base.get_evaluation('named_eval/functor_test')
+ self.assertIn('named_eval/functor_test', base.registered_names())
+ self.assertIsInstance(evaluation, my_eval)
+ self.assertIsNone(evaluation.root_dir, None)
+
+ with self.assertRaisesRegex(ValueError, 'Evaluation .* not found'):
+ base.get_evaluation('named_eval/non_existent')
+
+ with self.assertRaisesRegex(TypeError, 'The return value .*'):
+ @base.register('named_eval/bad_return_type')
+ def bad_eval(): # pylint: disable=unused-variable
+ return 1
+
+ def test_run(self):
+ @base.register('test/run')
+ def test_run(): # pylint: disable=unused-variable
+ lm = fake.StaticResponse('Solution(final_answer=2)')
+ return eval_set('run_test', 'query', schema_fn=answer_schema(), lm=lm)
+
+ e = base.run(
+ tempfile.gettempdir(),
+ ['test/run'],
+ id_regex='run_test.*',
+ mode='dryrun',
+ print_definition=True,
+ )
+ self.assertEqual(
+ e.leaf_nodes[0].dir,
+ os.path.join(tempfile.gettempdir(), e.leaf_nodes[0].id),
+ )
+ self.assertTrue(
+ pg.eq(
+ e.leaf_nodes[0].lm, fake.StaticResponse('Solution(final_answer=2)')
+ )
+ )
+
+ @pg.patcher()
+ def bad_lm(unused_eval): # pylint: disable=unused-variable
+ return dict(lm=fake.StaticResponse('efg'))
+
+ e = base.run(
+ tempfile.gettempdir(),
+ [test_run()],
+ filter='Evaluation.*',
+ patches=['bad_lm']
+ )
+ self.assertTrue(pg.eq(e.leaf_nodes[0].lm, fake.StaticResponse('efg')))


  if __name__ == '__main__':
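
The hunks above replace the old AppRunTest with a NamedEvaluationTest that exercises a new registry for named evaluations (base.register, base.get_evaluation, base.registered_names) and a new base.run entry point. As a rough illustration of how that API appears to be used, here is a minimal sketch; the registration name 'my_bench/arithmetic' and the schema functor my_solution_schema are illustrative stand-ins (the tests rely on an answer_schema() helper that is not shown in this diff):

import tempfile

import langfun.core as lf
from langfun.core.eval import base
from langfun.core.llms import fake
import pyglove as pg


@pg.functor()
def my_solution_schema():
  # Hypothetical stand-in for the tests' answer_schema() helper.
  class Solution(pg.Object):
    final_answer: int
  return Solution


@base.register('my_bench/arithmetic')  # Name is illustrative.
def arithmetic_eval():
  # Build an evaluation the same way the functor-based test above does.
  return base.Evaluation(
      inputs=base.as_inputs([pg.Dict(question='Compute 1 + 1')]),
      method='query',
      prompt=lf.Template('{{example.question}}'),
      schema_fn=my_solution_schema(),
      lm=fake.StaticResponse('Solution(final_answer=2)'),
  )


# Dry-run the registered evaluation by name, mirroring
# NamedEvaluationTest.test_run above.
experiment = base.run(
    tempfile.gettempdir(), ['my_bench/arithmetic'], mode='dryrun')
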
langfun/core/eval/matching.py

@@ -119,18 +119,18 @@ class Matching(base.Evaluation):
  del progress
  return {
  'Model': self.lm.model_id,
- 'Matches': f'%.{self.report_precision}f%% (%d/%d)' % (
- self.match_rate * 100,
+ 'Matches': '%s (%d/%d)' % (
+ self._format_rate(self.match_rate),
  self.num_matches,
  self.num_completed,
  ),
- 'Mismatches': f'%.{self.report_precision}f%% (%d/%d)' % (
- self.mismatch_rate * 100,
+ 'Mismatches': '%s (%d/%d)' % (
+ self._format_rate(self.mismatch_rate),
  self.num_mismatches,
  self.num_completed,
  ),
- 'Failed': f'%.{self.report_precision}f%% (%d/%d)' % (
- self.failure_rate * 100,
+ 'Failed': '%s (%d/%d)' % (
+ self._format_rate(self.failure_rate),
  self.num_failures,
  self.num_completed,
  ),
@@ -140,24 +140,25 @@ class Matching(base.Evaluation):
  assert self.result is not None
  m = self.result.metrics
  return (
- f'COMPLETED(%s): Matches=%.{self.report_precision}f%% (%d/%d)'
- f' Mismatches=%.{self.report_precision}f%% (%d/%d)'
- f' Failures=%.{self.report_precision}f%% (%d/%d)'
+ 'COMPLETED(%s):'
+ ' Matches=%s (%d/%d)'
+ ' Mismatches=%s (%d/%d)'
+ ' Failures=%s (%d/%d)'
  ) % (
  run_status,
- m.match_rate * 100,
+ self._format_rate(m.match_rate),
  m.num_matches,
  m.total,
- m.mismatch_rate * 100,
+ self._format_rate(m.mismatch_rate),
  m.num_mismatches,
  m.total,
- m.failure_rate * 100,
+ self._format_rate(m.failure_rate),
  m.failures,
  m.total,
  )

- def summarize(self) -> pg.Dict:
- result = super().summarize()
+ def finalize(self) -> pg.Dict:
+ result = super().finalize()
  result.metrics.update(
  num_matches=self.num_matches,
  match_rate=self.match_rate,
@@ -218,9 +219,9 @@ class Matching(base.Evaluation):
  def _render_result_row(self, s: io.StringIO):
  super()._render_result_row(s)
  s.write(
- '<td><span style="color:red">%s</span>%s</td>'
+ '<td><span style="color:orange">%s</span>%s</td>'
  % (
- f'%.{self.report_precision}f%% ' % (self.mismatch_rate * 100),
+ self._format_rate(self.mismatch_rate),
  '<a href="%s">(%d/%d)</a>'
  % (self.mismatches_link, self.num_mismatches, self.num_completed),
  )
@@ -228,13 +229,13 @@ class Matching(base.Evaluation):
  s.write(
  '<td><span style="color:green">%s</span>%s</td>'
  % (
- f'%.{self.report_precision}f%% ' % (self.match_rate * 100),
+ self._format_rate(self.match_rate),
  '<a href="%s">(%d/%d)</a>'
  % (self.matches_link, self.num_matches, self.num_completed),
  )
  )

- def _render_metric(self, s: io.StringIO) -> None:
+ def _render_summary_metrics(self, s: io.StringIO) -> None:
  """Renders metrics in HTML."""
  assert self.result is not None
  m = self.result.metrics
@@ -244,7 +245,7 @@ class Matching(base.Evaluation):
  m.num_matches,
  m.total,
  self.matches_link,
- f'%.{self.report_precision}f%% ' % (m.match_rate * 100),
+ self._format_rate(m.match_rate),
  )
  )
  s.write(' | ')
@@ -254,11 +255,11 @@ class Matching(base.Evaluation):
  m.num_mismatches,
  m.total,
  self.mismatches_link,
- f'%.{self.report_precision}f%% ' % (m.mismatch_rate * 100),
+ self._format_rate(m.mismatch_rate),
  )
  )
  s.write(' | ')
- super()._render_metric(s)
+ super()._render_summary_metrics(s)

  def _render_matches(self, s: io.StringIO) -> None:
  """Formats the matched cases into html."""
langfun/core/eval/matching_test.py

@@ -120,6 +120,13 @@ class MatchingTest(unittest.TestCase):
  total=4,
  failures=1,
  failure_rate=0.25,
+ oop_failures=1,
+ oop_failure_rate=0.25,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={
+ 'MappingError.SchemaError.TypeError': 1
+ },
  num_matches=2,
  match_rate=0.5,
  num_mismatches=1,
@@ -160,7 +167,14 @@ class MatchingTest(unittest.TestCase):
  self.assertTrue(
  os.path.exists(
  os.path.join(
- s.dir, matching.Matching.FAILURES_JSON
+ s.dir, matching.Matching.OOP_FAILURES_JSON
+ )
+ )
+ )
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(
+ s.dir, matching.Matching.NON_OOP_FAILURES_JSON
  )
  )
  )
@@ -175,7 +189,14 @@ class MatchingTest(unittest.TestCase):
  self.assertTrue(
  os.path.exists(
  os.path.join(
- s.dir, matching.Matching.FAILURES_HTML
+ s.dir, matching.Matching.OOP_FAILURES_HTML
+ )
+ )
+ )
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(
+ s.dir, matching.Matching.NON_OOP_FAILURES_HTML
  )
  )
  )
langfun/core/eval/patching.py (new file)

@@ -0,0 +1,130 @@
+ # Copyright 2024 The Langfun Authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Experiment patching for Langfun evaluations."""
+
+ import inspect
+ from typing import Union
+ import langfun.core as lf
+ from langfun.core import llms as lf_llms
+ from langfun.core.eval import base
+ import pyglove as pg
+
+
+ #
+ # Program-based patchers.
+ #
+
+
+ def patch_member(cls, key, value, parent_key: str | None = None):
+ """Patches a member of a class."""
+
+ def _rebind_fn(k, v, p):
+ if (
+ isinstance(p, cls)
+ and k.key == key
+ and (parent_key is None or (p and p.sym_path.key == parent_key))
+ ):
+ if inspect.isfunction(value):
+ return value(k, v, p)
+ return value
+ return v
+
+ return _rebind_fn
+
+
+ def patch_lm(lm: Union[lf.LanguageModel, pg.hyper.OneOf]): # pylint: disable=redefined-outer-name
+ """Patches the LLM of evaluations."""
+ return patch_member(base.Evaluable, "lm", lm)
+
+
+ def patch_parsing_lm(lm: Union[lf.LanguageModel, pg.hyper.OneOf]): # pylint: disable=redefined-outer-name
+ """Patches the parsing LLM of evaluations."""
+ return patch_member(base.Evaluable, "parsing_lm", lm)
+
+
+ def patch_schema_fn(schema_fn: Union[pg.Functor, pg.hyper.OneOf]):
+ """Patches the schema_fn of evaluations."""
+ return patch_member(base.Evaluable, "schema_fn", schema_fn)
+
+
+ def patch_prompt(prompt: Union[str, lf.Template, pg.hyper.OneOf]):
+ """Patches the prompt of evaluations."""
+ return patch_member(base.Evaluable, "prompt", prompt)
+
+
+ def patch_inputs(inputs: Union[pg.Functor, pg.hyper.OneOf]):
+ """Patches the inputs used in evaluations."""
+ return patch_member(base.Evaluable, "inputs", inputs)
+
+
+ def patch_additional_args(**kwargs):
+ """Patches additional_args."""
+
+ def value_fn(k, unused_v, p):
+ # We infer the symbolic value for the old args, as it might be a
+ # contextual attribute referring to its containing object.
+ old_args = p.sym_inferred(k.key)
+ if old_args:
+ old_args = dict(old_args)
+ old_args.update(kwargs)
+ return old_args
+ return kwargs
+
+ return patch_member(base.Evaluable, "additional_args", value_fn)
+
+
+ #
+ # String-based patching.
+ #
+
+ _NAMED_MODELS = {
+ # GPT models.
+ "gpt35turbo": lf_llms.Gpt35Turbo,
+ "gpt35turbo16k": lf_llms.Gpt35Turbo16K,
+ "gpt4": lf_llms.Gpt4,
+ "gpt4turbo": lf_llms.Gpt4Turbo,
+ # Anthropic models.
+ "haiku": lf_llms.Claude3Haiku,
+ "claude3haiku": lf_llms.Claude3Haiku,
+ "opus": lf_llms.Claude3Opus,
+ "claude3opus": lf_llms.Claude3Opus,
+ "sonnet": lf_llms.Claude3Sonnet,
+ "claude3sonnet": lf_llms.Claude3Opus,
+ }
+
+
+ def model_by_name(name: str) -> lf.LanguageModel:
+ """Gets model by name."""
+ name = name.strip().lower()
+ if name in _NAMED_MODELS:
+ return _NAMED_MODELS[name]()
+ raise ValueError(f"Unknown model name: {name}")
+
+
+ @pg.patcher(auto_typing=True)
+ def lm(unused_eval, models: list[str]):
+ """Patch the LM used for benchmarking."""
+ return patch_lm(pg.oneof([model_by_name(name) for name in models]))
+
+
+ @pg.patcher(auto_typing=True)
+ def temperature(unused_eval, value: float):
+ """Patch the temperature used for benchmarking."""
+ return patch_member(lf.LMSamplingOptions, "temperature", value)
+
+
+ @pg.patcher(auto_typing=True)
+ def max_tokens(unused_eval, value: int | None):
+ """Patch the temperature used for benchmarking."""
+ return patch_member(lf.LMSamplingOptions, "max_tokens", value)
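
The new patching.py module builds rebind functions (via patch_member) that can be applied to an experiment before it runs; in the base_test.py hunk above, a pg.patcher-decorated function is passed to base.run by name through patches=['bad_lm']. The following is a minimal sketch of that same pattern, not taken from the diff: the patcher name cheap_lm and the target 'my_bench/arithmetic' (from the earlier sketch) are illustrative choices.

import tempfile

from langfun.core.eval import base
from langfun.core.llms import fake
import pyglove as pg


@pg.patcher()
def cheap_lm(unused_eval):
  # Override the LM of every evaluation with a canned response,
  # the same way bad_lm does in the test above.
  return dict(lm=fake.StaticResponse('Solution(final_answer=2)'))


# Registered patchers can be referenced by name when launching a run.
experiment = base.run(
    tempfile.gettempdir(),
    ['my_bench/arithmetic'],  # Hypothetical registration from the earlier sketch.
    patches=['cheap_lm'],
    mode='dryrun',
)
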