langfun 0.0.2.dev20240319__py3-none-any.whl → 0.0.2.dev20240429__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in that public registry.
Files changed (52)
  1. langfun/__init__.py +2 -0
  2. langfun/core/__init__.py +1 -0
  3. langfun/core/coding/python/correction.py +0 -7
  4. langfun/core/component.py +6 -0
  5. langfun/core/component_test.py +1 -0
  6. langfun/core/eval/__init__.py +2 -0
  7. langfun/core/eval/base.py +240 -37
  8. langfun/core/eval/base_test.py +52 -18
  9. langfun/core/eval/matching.py +26 -9
  10. langfun/core/eval/matching_test.py +3 -4
  11. langfun/core/eval/scoring.py +15 -6
  12. langfun/core/eval/scoring_test.py +2 -2
  13. langfun/core/langfunc.py +0 -5
  14. langfun/core/langfunc_test.py +6 -4
  15. langfun/core/language_model.py +124 -24
  16. langfun/core/language_model_test.py +249 -26
  17. langfun/core/llms/__init__.py +24 -5
  18. langfun/core/llms/anthropic.py +263 -0
  19. langfun/core/llms/anthropic_test.py +167 -0
  20. langfun/core/llms/cache/in_memory_test.py +37 -28
  21. langfun/core/llms/fake.py +31 -22
  22. langfun/core/llms/fake_test.py +122 -11
  23. langfun/core/llms/{gemini.py → google_genai.py} +117 -15
  24. langfun/core/llms/{gemini_test.py → google_genai_test.py} +83 -15
  25. langfun/core/llms/groq.py +260 -0
  26. langfun/core/llms/groq_test.py +170 -0
  27. langfun/core/llms/llama_cpp.py +3 -1
  28. langfun/core/llms/openai.py +97 -79
  29. langfun/core/llms/openai_test.py +285 -59
  30. langfun/core/modalities/video.py +5 -2
  31. langfun/core/structured/__init__.py +3 -0
  32. langfun/core/structured/completion_test.py +2 -2
  33. langfun/core/structured/function_generation.py +245 -0
  34. langfun/core/structured/function_generation_test.py +329 -0
  35. langfun/core/structured/mapping.py +59 -3
  36. langfun/core/structured/mapping_test.py +17 -0
  37. langfun/core/structured/parsing.py +2 -1
  38. langfun/core/structured/parsing_test.py +18 -13
  39. langfun/core/structured/prompting.py +27 -6
  40. langfun/core/structured/prompting_test.py +79 -12
  41. langfun/core/structured/schema.py +25 -22
  42. langfun/core/structured/schema_generation.py +2 -3
  43. langfun/core/structured/schema_generation_test.py +2 -2
  44. langfun/core/structured/schema_test.py +42 -27
  45. langfun/core/template.py +125 -10
  46. langfun/core/template_test.py +75 -0
  47. langfun/core/templates/selfplay_test.py +6 -2
  48. {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/METADATA +3 -2
  49. {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/RECORD +52 -46
  50. {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/LICENSE +0 -0
  51. {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/WHEEL +0 -0
  52. {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/top_level.txt +0 -0
langfun/core/eval/base_test.py CHANGED
@@ -70,8 +70,7 @@ def eval_set(
  """Creates an evaluation object for testing."""
  tmp_dir = tempfile.gettempdir()
  return cls(
- id=eval_id,
- root_dir=tmp_dir,
+ root_dir=os.path.join(tmp_dir, eval_id),
  inputs=base.as_inputs([
  pg.Dict(question='Compute 1 + 1'),
  pg.Dict(question='Compute 1 + 2'),
@@ -102,7 +101,7 @@ class EvaluationTest(unittest.TestCase):
  self.assertEqual(s.dir, os.path.join(s.root_dir, s.id))
  self.assertEqual(s.hash, s.clone().hash)
  # Test persistent hash.
- self.assertEqual(s.hash, 'abc7c29a')
+ self.assertEqual(s.hash, 'ae86c703')
  self.assertEqual(
  s.hash, s.clone(override={'max_workers': 2, 'lm.timeout': 20}).hash
  )
@@ -195,6 +194,7 @@ class EvaluationTest(unittest.TestCase):
  cache_seed=0,
  score=1.0,
  logprobs=None,
+ usage=lf.LMSamplingUsage(387, 24, 411),
  tags=['lm-response', 'lm-output', 'transformed'],
  ),
  )
@@ -210,7 +210,7 @@ class EvaluationTest(unittest.TestCase):
  s.result,
  dict(
  experiment_setup=dict(
- id='run_test',
+ id='Evaluation@0fade07d',
  dir=s.dir,
  model='StaticSequence',
  prompt_template='{{example.question}}',
@@ -221,6 +221,14 @@ class EvaluationTest(unittest.TestCase):
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
  metrics=dict(total=2, failures=1, failure_rate=0.5),
+ usage=dict(
+ total_prompt_tokens=774,
+ total_completion_tokens=25,
+ num_usages=2,
+ average_prompt_tokens=387,
+ average_completion_tokens=12,
+ average_total_tokens=399,
+ ),
  ),
  )
  self.assertTrue(
@@ -229,13 +237,23 @@ class EvaluationTest(unittest.TestCase):
  os.path.exists(os.path.join(s.dir, base.Evaluation.RESULT_JSON)))
  self.assertTrue(
  os.path.exists(os.path.join(s.dir, base.Evaluation.CACHE_JSON)))
- self.assertTrue(
- os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
- )
  self.assertTrue(
  os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
  self.assertTrue(
  os.path.exists(os.path.join(s.dir, base.Evaluation.FAILURES_HTML)))
+ self.assertTrue(
+ os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
+ )
+ # Check summary JSON.
+ summary_json = os.path.join(
+ s.root_dir, base.Evaluation.SUMMARY_HTML.replace('.html', '.json')
+ )
+ self.assertTrue(os.path.exists(summary_json))
+ summary = pg.load(summary_json, force_dict=True)
+ self.assertIn('Evaluation', summary)
+ self.assertEqual(len(summary['Evaluation']), 1)
+ self.assertIsNotNone(summary['Evaluation'][0].experiment)
+ self.assertIsNotNone(summary['Evaluation'][0].metrics)

  def test_run_wihtout_save(self):
  lm = fake.StaticSequence([
@@ -275,8 +293,11 @@ class EvaluationTest(unittest.TestCase):
  s = eval_set(
  'run_filter_test', pg.oneof(['call', 'query']),
  schema_fn=answer_schema(), lm=lm)
+ result = s.run(
+ filter=lambda x: x.method == 'query', dryrun=True, summary=False
+ )
  self.assertEqual(
- s.run(filter=lambda x: x.method == 'query', dryrun=True, summary=False),
+ result,
  {
  s.children[0].id: None,
  s.children[1].id: dict(
@@ -292,7 +313,8 @@ class EvaluationTest(unittest.TestCase):
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
  metrics=dict(total=2, failures=0, failure_rate=0.0),
- )
+ usage=s.children[1].result.usage,
+ ),
  },
  )

@@ -302,7 +324,6 @@ class EvaluationTest(unittest.TestCase):
  '3',
  ])
  s = base.Evaluation(
- id='search_space_test',
  root_dir=tempfile.gettempdir(),
  inputs=base.as_inputs([
  pg.Dict(question='Compute 1 + 1'),
@@ -323,11 +344,10 @@ class EvaluationTest(unittest.TestCase):
  s.children[0].dir, os.path.join(s.root_dir, s.children[0].id)
  )
  # Test persistent hash.
- self.assertEqual(s.hash, 'ca7f722b')
+ self.assertEqual(s.hash, 'b66a4e88')

  summary = s.run(verbose=True)
  self.assertEqual(len(summary.evaluations), 2)
-
  self.assertEqual(
  s.result,
  {
@@ -344,6 +364,7 @@ class EvaluationTest(unittest.TestCase):
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
  metrics=dict(total=2, failures=1, failure_rate=0.5),
+ usage=s.children[0].result.usage,
  ),
  s.children[1].id: dict(
  experiment_setup=dict(
@@ -358,6 +379,7 @@ class EvaluationTest(unittest.TestCase):
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
  metrics=dict(total=2, failures=1, failure_rate=0.5),
+ usage=s.children[1].result.usage,
  ),
  },
  )
@@ -439,7 +461,6 @@ class SuiteTest(unittest.TestCase):
  '3',
  ] * 5)
  s = base.Suite(
- 'suite_run_test',
  [
  eval_set('run_test_1', 'query', schema_fn=answer_schema()),
  # A suite of search space. Two of the sub-experiments are identical,
@@ -451,7 +472,7 @@ class SuiteTest(unittest.TestCase):
  lm=lm
  )
  # Test for persistent hash.
- self.assertEqual(s.hash, '7285e52b')
+ self.assertEqual(s.hash, '26e6cc25')
  s.run()
  expected = {
  s.children[0].id: dict(
@@ -467,6 +488,7 @@ class SuiteTest(unittest.TestCase):
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
  ),
  metrics=dict(total=2, failures=1, failure_rate=0.5),
+ usage=s.children[0].result.usage,
  ),
  s.children[1].id: {
  s.children[1]
@@ -484,6 +506,7 @@ class SuiteTest(unittest.TestCase):
  use_cache=True, num_queries=4, num_hits=1, num_updates=3
  ),
  metrics=dict(total=2, failures=2, failure_rate=1.0),
+ usage=s.children[1].children[0].result.usage,
  ),
  s.children[1]
  .children[2]
@@ -503,6 +526,7 @@ class SuiteTest(unittest.TestCase):
  num_updates=2,
  ),
  metrics=dict(total=2, failures=1, failure_rate=0.5),
+ usage=s.children[1].children[2].result.usage,
  ),
  },
  }
@@ -548,7 +572,6 @@ class SummaryTest(unittest.TestCase):
  def _eval_set(self, root_dir):
  return base.Suite(id='select_test', children=[
  TaskA(
- id='task_a',
  inputs=base.as_inputs([
  pg.Dict(question='Compute 1 + 1'),
  ]),
@@ -569,7 +592,6 @@ class SummaryTest(unittest.TestCase):
  max_workers=1,
  ),
  TaskB(
- id='task_b',
  inputs=base.as_inputs([
  pg.Dict(question='Compute 1 + 1'),
  ]),
@@ -650,10 +672,10 @@ class SummaryTest(unittest.TestCase):
  len(base.Summary.from_dirs(root_dir)), 2 * 2 * 2 * 2 + 2 * 1 * 1 * 2
  )
  self.assertEqual(
- len(base.Summary.from_dirs(root_dir, 'task_b')), 2 * 1 * 1 * 2
+ len(base.Summary.from_dirs(root_dir, 'TaskB')), 2 * 1 * 1 * 2
  )
  self.assertEqual(
- len(base.Summary.from_dirs(root_dir, ('task_a'))), 2 * 2 * 2 * 2
+ len(base.Summary.from_dirs(root_dir, ('TaskA'))), 2 * 2 * 2 * 2
  )

  def test_monitor(self):
@@ -676,5 +698,17 @@ class SummaryTest(unittest.TestCase):
  self.assertTrue(pg.io.path_exists(summary_file))


+ class AppRunTest(unittest.TestCase):
+
+ def test_app_run(self):
+ lm = fake.StaticSequence(['two', 'Solution(final_answer=2)'])
+ try:
+ base.app_run(
+ eval_set('app_run_test', 'query', schema_fn=answer_schema(), lm=lm)
+ )
+ except SystemExit:
+ pass
+
+
  if __name__ == '__main__':
  unittest.main()
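The new usage block in the expected results above shows how per-call LM usage is rolled up per evaluation: totals are summed over all queries, and the averages appear to use integer division (774 prompt tokens over 2 queries gives 387, 25 completion tokens gives 12, 799 total tokens gives 399). A minimal sketch of that aggregation follows; the Usage dataclass and summarize_usage helper are illustrative only (not langfun's implementation), and the second call's usage is inferred from the totals.

```python
from dataclasses import dataclass


@dataclass
class Usage:
  # Field order here follows lf.LMSamplingUsage(387, 24, 411) above,
  # read as (prompt, completion, total).
  prompt_tokens: int
  completion_tokens: int
  total_tokens: int


def summarize_usage(usages):
  """Aggregates per-call usage the way the expected test results suggest."""
  n = len(usages)
  total_prompt = sum(u.prompt_tokens for u in usages)
  total_completion = sum(u.completion_tokens for u in usages)
  total = sum(u.total_tokens for u in usages)
  return dict(
      total_prompt_tokens=total_prompt,
      total_completion_tokens=total_completion,
      num_usages=n,
      average_prompt_tokens=total_prompt // n,
      average_completion_tokens=total_completion // n,
      average_total_tokens=total // n,
  )


# Two queries, as in the test above; the second usage (387, 1, 388) is inferred
# from the totals 774 / 25 / 799.
print(summarize_usage([Usage(387, 24, 411), Usage(387, 1, 388)]))
# {'total_prompt_tokens': 774, 'total_completion_tokens': 25, 'num_usages': 2,
#  'average_prompt_tokens': 387, 'average_completion_tokens': 12,
#  'average_total_tokens': 399}
```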
langfun/core/eval/matching.py CHANGED
@@ -86,9 +86,26 @@ class Matching(base.Evaluation):
  self._matches = []
  self._mismatches = []

- def audit(self, example: Any, output: Any, message: lf.Message) -> None:
+ def audit_processed(
+ self, example: Any, output: Any, message: lf.Message, dryrun: bool = False
+ ) -> None:
  groundtruth = self.groundtruth(example)
  answer = self.answer(output, example)
+
+ if dryrun:
+ lf.console.write('')
+ lf.console.write(
+ str(groundtruth),
+ title='GROUDTRUTH',
+ color='green',
+ )
+ lf.console.write('')
+ lf.console.write(
+ str(answer),
+ title='ANSWER',
+ color='blue',
+ )
+
  if self.match(answer, groundtruth):
  self._matches.append((example, output, message))
  else:
@@ -155,19 +172,16 @@ class Matching(base.Evaluation):
  super().save(definition, result, report)

  if result:
-
- def force_dict(v):
- return pg.object_utils.json_conversion.strip_types(pg.to_json(v))
-
  # Save matches.
  pg.save(
  [
- # We force the output to be dict as its type may be defined
- # within functors which could be deserialized.
- pg.Dict(input=input, output=force_dict(output))
+ pg.Dict(input=input, output=output)
  for input, output, _ in self.matches
  ],
  os.path.join(self.dir, Matching.MATCHES_JSON),
+ # We force the input and output to be dict so it does not depend on
+ # the downstream to serialize.
+ force_dict=True,
  )

  # Save mismatches.
@@ -175,10 +189,13 @@ class Matching(base.Evaluation):
  [
  # We force the output to be dict as its type may be defined
  # within functors which could be deserialized.
- pg.Dict(input=input, output=force_dict(output))
+ pg.Dict(input=input, output=output)
  for input, output, _ in self.mismatches
  ],
  os.path.join(self.dir, Matching.MISMATCHES_JSON),
+ # We force the input and output to be dict so it does not depend on
+ # the downstream to serialize.
+ force_dict=True,
  )

  if report:
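The save-path change above drops the local force_dict helper (which called pg.object_utils.json_conversion.strip_types) in favor of pg.save(..., force_dict=True), so matches and mismatches are written as plain JSON dicts that can be read back without importing the classes that produced the outputs. A rough sketch of that round trip under this assumption follows; the Answer class is invented here for illustration, while pg.save and pg.load with force_dict=True are used as they appear in this diff.

```python
import pyglove as pg


class Answer(pg.Object):
  # Illustrative output type; in practice it may be defined inside a functor
  # and therefore not importable by downstream readers of the JSON file.
  final_answer: int


records = [
    pg.Dict(input=pg.Dict(question='Compute 1 + 1'), output=Answer(final_answer=2)),
]

# force_dict=True strips symbolic type information at save time, so the file
# holds plain dicts/lists instead of typed object payloads.
pg.save(records, '/tmp/matches.json', force_dict=True)

# Reading back does not require the Answer class to be on the import path.
loaded = pg.load('/tmp/matches.json', force_dict=True)
print(loaded[0]['output']['final_answer'])  # 2
```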
langfun/core/eval/matching_test.py CHANGED
@@ -65,10 +65,8 @@ def eval_set(
  use_cache: bool = True,
  ):
  """Creates an evaluation object for testing."""
- tmp_dir = tempfile.gettempdir()
  return MyTask(
- id=eval_id,
- root_dir=tmp_dir,
+ root_dir=os.path.join(tempfile.gettempdir(), eval_id),
  inputs=base.as_inputs([
  pg.Dict(question='Compute 1 + 1', groundtruth=2),
  pg.Dict(question='Compute 1 + 2', groundtruth=3),
@@ -105,7 +103,7 @@ class MatchingTest(unittest.TestCase):
  s.result,
  dict(
  experiment_setup=dict(
- id='match_run_test',
+ id='MyTask@739a174b',
  dir=s.dir,
  model='StaticSequence',
  prompt_template='{{example.question}}',
@@ -127,6 +125,7 @@ class MatchingTest(unittest.TestCase):
  num_mismatches=1,
  mismatch_rate=0.25,
  ),
+ usage=s.result.usage,
  ),
  )
  self.assertTrue(
langfun/core/eval/scoring.py CHANGED
@@ -61,8 +61,18 @@ class Scoring(base.Evaluation):
  super()._reset()
  self._scored = []

- def audit(self, example: Any, output: Any, message: lf.Message) -> None:
+ def audit_processed(
+ self, example: Any, output: Any, message: lf.Message, dryrun: bool = False
+ ) -> None:
  score = self.score(example, output)
+
+ if dryrun:
+ lf.console.write('')
+ lf.console.write(
+ str(score),
+ title='SCORE',
+ color='blue',
+ )
  self._scored.append((example, output, score, message))

  @abc.abstractmethod
@@ -118,19 +128,18 @@ class Scoring(base.Evaluation):
  super().save(definition, result, report)

  if result:
-
- def force_dict(v):
- return pg.object_utils.json_conversion.strip_types(pg.to_json(v))
-
  # Save scored.
  pg.save(
  [
  # We force the output to be dict as its type may be defined
  # within functors which could be deserialized.
- pg.Dict(input=input, output=force_dict(output), score=score)
+ pg.Dict(input=input, output=output, score=score)
  for input, output, score, _ in self.scored
  ],
  os.path.join(self.dir, Scoring.SCORED_JSON),
+ # We force the input and output to be dict so it does not depend on
+ # the downstream to serialize.
+ force_dict=True,
  )

  if report:
langfun/core/eval/scoring_test.py CHANGED
@@ -43,7 +43,6 @@ def constrained_by_upperbound(upper_bound: int):


  class ConstraintFollowing(scoring.Scoring):
- id = 'constraint_following'
  inputs = constrained_by_upperbound(1)
  prompt = '{{example}}'
  method = 'query'
@@ -82,7 +81,7 @@ class ScoringTest(unittest.TestCase):
  s.result,
  dict(
  experiment_setup=dict(
- id='constraint_following',
+ id='ConstraintFollowing@5c88a5eb',
  dir=s.dir,
  model='StaticSequence',
  prompt_template='{{example}}',
@@ -103,6 +102,7 @@ class ScoringTest(unittest.TestCase):
  score_rate=1.0,
  avg_score=0.5,
  ),
+ usage=s.result.usage,
  ),
  )
  self.assertTrue(
langfun/core/langfunc.py CHANGED
@@ -261,7 +261,6 @@ class LangFunc(
  if lm_input is None:
  lm_input = self.render(**kwargs)

- lm_input.tag(message_lib.Message.TAG_LM_INPUT)
  if skip_lm:
  return lm_input

@@ -270,10 +269,6 @@ class LangFunc(
  # Send rendered text to LM.
  lm_output = self.lm(lm_input, cache_seed=cache_seed)

- # Track the input as the source of the output.
- lm_output.source = lm_input
- lm_output.tag(message_lib.Message.TAG_LM_RESPONSE)
-
  # Transform the output message.
  lm_output = self.transform_output(lm_output)
  lm_output.tag(message_lib.Message.TAG_LM_OUTPUT)
langfun/core/langfunc_test.py CHANGED
@@ -82,7 +82,9 @@ class LangFuncCallTest(unittest.TestCase):
  self.assertEqual(i.tags, ['rendered'])

  r = l()
- self.assertEqual(r, message.AIMessage('Hello!!!', score=0.0, logprobs=None))
+ self.assertEqual(
+ r, message.AIMessage('Hello!!!', score=0.0, logprobs=None, usage=None)
+ )
  self.assertEqual(r.tags, ['lm-response', 'lm-output'])
  self.assertEqual(r.source, message.UserMessage('Hello'))
  self.assertEqual(r.source.tags, ['rendered', 'lm-input'])
@@ -92,8 +94,8 @@ class LangFuncCallTest(unittest.TestCase):
  self.assertEqual(
  repr(l),
  "LangFunc(template_str='Hello', clean=True,"
- ' lm=ExcitedEchoer(sampling_options=LMSamplingOptions(temperature=0.0,'
- ' max_tokens=1024, n=1, top_k=40, top_p=None, stop=None,'
+ ' lm=ExcitedEchoer(sampling_options=LMSamplingOptions(temperature=None,'
+ ' max_tokens=None, n=1, top_k=40, top_p=None, stop=None,'
  ' random_seed=None, logprobs=False, top_logprobs=None), cache=None,'
  ' max_concurrency=None, timeout=120.0, max_attempts=5,'
  ' retry_interval=(5, 60), exponential_backoff=True, debug=False))',
@@ -106,7 +108,7 @@ class LangFuncCallTest(unittest.TestCase):
  self.assertEqual(l.render(), 'Hello')
  r = l()
  self.assertEqual(
- r, message.AIMessage('Hello!!!', score=0.0, logprobs=None)
+ r, message.AIMessage('Hello!!!', score=0.0, logprobs=None, usage=None)
  )
  self.assertEqual(r.tags, ['lm-response', 'lm-output'])
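The updated repr above also records a defaults change: LMSamplingOptions now leaves temperature and max_tokens as None instead of 0.0 and 1024, so unset values can fall through to whatever each model treats as its own default. A small sketch of overriding them explicitly follows; it assumes LMSamplingOptions is exposed on the top-level langfun package, and the field names are taken from the repr in the test above.

```python
import langfun as lf

# Unset fields stay None after this change; override only what you need.
options = lf.LMSamplingOptions(temperature=0.7, max_tokens=256)
print(options.temperature, options.max_tokens, options.n)  # 0.7 256 1
```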