langfun 0.0.2.dev20240330__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl

This diff reflects the changes between two publicly released versions of this package, as they appear in the public registry.
Files changed (145)
  1. langfun/__init__.py +22 -2
  2. langfun/core/__init__.py +17 -5
  3. langfun/core/agentic/__init__.py +30 -0
  4. langfun/core/agentic/action.py +854 -0
  5. langfun/core/agentic/action_eval.py +150 -0
  6. langfun/core/agentic/action_eval_test.py +109 -0
  7. langfun/core/agentic/action_test.py +136 -0
  8. langfun/core/coding/python/__init__.py +5 -11
  9. langfun/core/coding/python/correction.py +37 -28
  10. langfun/core/coding/python/correction_test.py +29 -3
  11. langfun/core/coding/python/execution.py +40 -216
  12. langfun/core/coding/python/execution_test.py +29 -89
  13. langfun/core/coding/python/generation.py +21 -11
  14. langfun/core/coding/python/generation_test.py +2 -2
  15. langfun/core/coding/python/parsing.py +108 -193
  16. langfun/core/coding/python/parsing_test.py +2 -105
  17. langfun/core/component.py +69 -2
  18. langfun/core/component_test.py +54 -0
  19. langfun/core/concurrent.py +414 -117
  20. langfun/core/concurrent_test.py +111 -24
  21. langfun/core/console.py +18 -5
  22. langfun/core/console_test.py +17 -0
  23. langfun/core/eval/__init__.py +17 -0
  24. langfun/core/eval/base.py +767 -140
  25. langfun/core/eval/base_test.py +238 -53
  26. langfun/core/eval/matching.py +80 -76
  27. langfun/core/eval/matching_test.py +19 -9
  28. langfun/core/eval/patching.py +130 -0
  29. langfun/core/eval/patching_test.py +170 -0
  30. langfun/core/eval/scoring.py +37 -28
  31. langfun/core/eval/scoring_test.py +21 -3
  32. langfun/core/eval/v2/__init__.py +42 -0
  33. langfun/core/eval/v2/checkpointing.py +380 -0
  34. langfun/core/eval/v2/checkpointing_test.py +228 -0
  35. langfun/core/eval/v2/eval_test_helper.py +136 -0
  36. langfun/core/eval/v2/evaluation.py +725 -0
  37. langfun/core/eval/v2/evaluation_test.py +180 -0
  38. langfun/core/eval/v2/example.py +305 -0
  39. langfun/core/eval/v2/example_test.py +128 -0
  40. langfun/core/eval/v2/experiment.py +1048 -0
  41. langfun/core/eval/v2/experiment_test.py +433 -0
  42. langfun/core/eval/v2/metric_values.py +156 -0
  43. langfun/core/eval/v2/metric_values_test.py +80 -0
  44. langfun/core/eval/v2/metrics.py +357 -0
  45. langfun/core/eval/v2/metrics_test.py +203 -0
  46. langfun/core/eval/v2/progress.py +348 -0
  47. langfun/core/eval/v2/progress_test.py +82 -0
  48. langfun/core/eval/v2/progress_tracking.py +210 -0
  49. langfun/core/eval/v2/progress_tracking_test.py +66 -0
  50. langfun/core/eval/v2/reporting.py +270 -0
  51. langfun/core/eval/v2/reporting_test.py +158 -0
  52. langfun/core/eval/v2/runners.py +488 -0
  53. langfun/core/eval/v2/runners_test.py +334 -0
  54. langfun/core/langfunc.py +3 -21
  55. langfun/core/langfunc_test.py +26 -8
  56. langfun/core/language_model.py +686 -48
  57. langfun/core/language_model_test.py +681 -44
  58. langfun/core/llms/__init__.py +100 -12
  59. langfun/core/llms/anthropic.py +488 -0
  60. langfun/core/llms/anthropic_test.py +235 -0
  61. langfun/core/llms/cache/base.py +21 -2
  62. langfun/core/llms/cache/in_memory.py +13 -0
  63. langfun/core/llms/cache/in_memory_test.py +88 -28
  64. langfun/core/llms/compositional.py +101 -0
  65. langfun/core/llms/compositional_test.py +73 -0
  66. langfun/core/llms/deepseek.py +117 -0
  67. langfun/core/llms/deepseek_test.py +61 -0
  68. langfun/core/llms/fake.py +39 -26
  69. langfun/core/llms/fake_test.py +136 -11
  70. langfun/core/llms/gemini.py +507 -0
  71. langfun/core/llms/gemini_test.py +195 -0
  72. langfun/core/llms/google_genai.py +62 -218
  73. langfun/core/llms/google_genai_test.py +9 -197
  74. langfun/core/llms/groq.py +276 -0
  75. langfun/core/llms/groq_test.py +64 -0
  76. langfun/core/llms/llama_cpp.py +15 -40
  77. langfun/core/llms/llama_cpp_test.py +4 -30
  78. langfun/core/llms/openai.py +436 -226
  79. langfun/core/llms/openai_compatible.py +179 -0
  80. langfun/core/llms/openai_compatible_test.py +495 -0
  81. langfun/core/llms/openai_test.py +35 -174
  82. langfun/core/llms/rest.py +113 -0
  83. langfun/core/llms/rest_test.py +111 -0
  84. langfun/core/llms/vertexai.py +192 -0
  85. langfun/core/llms/vertexai_test.py +52 -0
  86. langfun/core/logging.py +284 -0
  87. langfun/core/logging_test.py +125 -0
  88. langfun/core/message.py +319 -9
  89. langfun/core/message_test.py +190 -13
  90. langfun/core/modalities/__init__.py +6 -2
  91. langfun/core/modalities/audio.py +30 -0
  92. langfun/core/modalities/audio_test.py +63 -0
  93. langfun/core/modalities/image.py +39 -20
  94. langfun/core/modalities/image_test.py +52 -9
  95. langfun/core/modalities/mime.py +206 -29
  96. langfun/core/modalities/mime_test.py +90 -9
  97. langfun/core/modalities/ms_office.py +117 -0
  98. langfun/core/modalities/ms_office_test.py +389 -0
  99. langfun/core/modalities/pdf.py +22 -0
  100. langfun/core/modalities/pdf_test.py +57 -0
  101. langfun/core/modalities/video.py +9 -23
  102. langfun/core/modalities/video_test.py +3 -3
  103. langfun/core/modality.py +26 -3
  104. langfun/core/modality_test.py +2 -2
  105. langfun/core/sampling.py +11 -11
  106. langfun/core/structured/__init__.py +15 -16
  107. langfun/core/structured/completion.py +32 -5
  108. langfun/core/structured/completion_test.py +9 -8
  109. langfun/core/structured/description.py +2 -2
  110. langfun/core/structured/description_test.py +3 -3
  111. langfun/core/structured/function_generation.py +278 -0
  112. langfun/core/structured/function_generation_test.py +399 -0
  113. langfun/core/structured/mapping.py +150 -46
  114. langfun/core/structured/mapping_test.py +105 -0
  115. langfun/core/structured/parsing.py +33 -21
  116. langfun/core/structured/parsing_test.py +71 -22
  117. langfun/core/structured/querying.py +746 -0
  118. langfun/core/structured/{prompting_test.py → querying_test.py} +545 -60
  119. langfun/core/structured/schema.py +208 -99
  120. langfun/core/structured/schema_generation.py +1 -1
  121. langfun/core/structured/schema_generation_test.py +2 -2
  122. langfun/core/structured/schema_test.py +133 -34
  123. langfun/core/structured/scoring.py +125 -19
  124. langfun/core/structured/scoring_test.py +30 -0
  125. langfun/core/structured/tokenization.py +64 -0
  126. langfun/core/structured/tokenization_test.py +48 -0
  127. langfun/core/template.py +240 -11
  128. langfun/core/template_test.py +146 -1
  129. langfun/core/templates/conversation.py +9 -0
  130. langfun/core/templates/conversation_test.py +4 -3
  131. langfun/core/templates/selfplay_test.py +14 -2
  132. langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
  133. langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
  134. {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
  135. langfun/core/coding/python/errors.py +0 -108
  136. langfun/core/coding/python/errors_test.py +0 -99
  137. langfun/core/coding/python/permissions.py +0 -90
  138. langfun/core/coding/python/permissions_test.py +0 -86
  139. langfun/core/structured/prompting.py +0 -217
  140. langfun/core/text_formatting.py +0 -162
  141. langfun/core/text_formatting_test.py +0 -47
  142. langfun-0.0.2.dev20240330.dist-info/METADATA +0 -99
  143. langfun-0.0.2.dev20240330.dist-info/RECORD +0 -102
  144. {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
  145. {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
@@ -101,7 +101,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(s.dir, os.path.join(s.root_dir, s.id))
     self.assertEqual(s.hash, s.clone().hash)
     # Test persistent hash.
-    self.assertEqual(s.hash, 'abc7c29a')
+    self.assertEqual(s.hash, 'ae86c703')
     self.assertEqual(
         s.hash, s.clone(override={'max_workers': 2, 'lm.timeout': 20}).hash
     )
@@ -194,6 +194,8 @@ class EvaluationTest(unittest.TestCase):
             cache_seed=0,
             score=1.0,
             logprobs=None,
+            is_cached=False,
+            usage=lf.LMSamplingUsage(387, 24, 411),
             tags=['lm-response', 'lm-output', 'transformed'],
         ),
     )
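
Note on the two added fields: each LM response in the evaluation now carries cache status (`is_cached`) and token accounting. A minimal sketch of what the assertion above implies, assuming `lf.LMSamplingUsage`'s positional fields are prompt, completion, and total tokens (387 + 24 = 411):

import langfun as lf

# Assumed field order, inferred from the values above: 387 + 24 = 411.
usage = lf.LMSamplingUsage(387, 24, 411)
assert usage.prompt_tokens + usage.completion_tokens == usage.total_tokens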
@@ -209,7 +211,7 @@ class EvaluationTest(unittest.TestCase):
         s.result,
         dict(
             experiment_setup=dict(
-                id='Evaluation@17915dc6',
+                id='Evaluation@0fade07d',
                 dir=s.dir,
                 model='StaticSequence',
                 prompt_template='{{example.question}}',
@@ -219,7 +221,26 @@ class EvaluationTest(unittest.TestCase):
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(total=2, failures=1, failure_rate=0.5),
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
+            usage=dict(
+                total_prompt_tokens=774,
+                total_completion_tokens=25,
+                num_usages=2,
+                average_prompt_tokens=387,
+                average_completion_tokens=12,
+                average_total_tokens=399,
+            ),
         ),
     )
     self.assertTrue(
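
The expanded `metrics` now distinguish OOP failures (errors raised while mapping LM output to the requested schema, e.g. `MappingError.SchemaError.TypeError`) from non-OOP failures, and add a per-error-type `failure_breakdown`; the result also gains an aggregate `usage` block. A hedged sketch of reading these fields off a completed evaluation `s` like the one above:

# Field names are taken directly from the expected result above.
m = s.result.metrics
print(m.failure_rate, m.oop_failure_rate, m.non_oop_failure_rate)
print(dict(m.failure_breakdown))  # e.g. {'MappingError.SchemaError.TypeError': 1}

u = s.result.usage
# Averages are per-usage: 774 prompt tokens over 2 usages -> 387.
assert u.average_prompt_tokens == u.total_prompt_tokens // u.num_usages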
@@ -227,14 +248,32 @@ class EvaluationTest(unittest.TestCase):
     self.assertTrue(
         os.path.exists(os.path.join(s.dir, base.Evaluation.RESULT_JSON)))
     self.assertTrue(
-        os.path.exists(os.path.join(s.dir, base.Evaluation.CACHE_JSON)))
+        os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_JSON)))
     self.assertTrue(
-        os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
-    )
+        os.path.exists(
+            os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_JSON)))
+    self.assertTrue(
+        os.path.exists(os.path.join(s.dir, base.Evaluation.CACHE_JSON)))
     self.assertTrue(
         os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
     self.assertTrue(
-        os.path.exists(os.path.join(s.dir, base.Evaluation.FAILURES_HTML)))
+        os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))
+    self.assertTrue(
+        os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
+    )
+    # Check summary JSON.
+    summary_json = os.path.join(
+        s.root_dir, base.Evaluation.SUMMARY_HTML.replace('.html', '.json')
+    )
+    self.assertTrue(os.path.exists(summary_json))
+    summary = pg.load(summary_json, auto_dict=True)
+    self.assertIn('Evaluation', summary)
+    self.assertEqual(len(summary['Evaluation']), 1)
+    self.assertIsNotNone(summary['Evaluation'][0].experiment)
+    self.assertIsNotNone(summary['Evaluation'][0].metrics)
 
   def test_run_wihtout_save(self):
     lm = fake.StaticSequence([
@@ -255,7 +294,10 @@ class EvaluationTest(unittest.TestCase):
     self.assertFalse(
         os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
     self.assertFalse(
-        os.path.exists(os.path.join(s.dir, base.Evaluation.FAILURES_HTML)))
+        os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+    self.assertFalse(
+        os.path.exists(
+            os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))
 
   def test_load(self):
     lm = fake.StaticResponse('Solution(final_answer=2)')
@@ -274,8 +316,11 @@ class EvaluationTest(unittest.TestCase):
     s = eval_set(
         'run_filter_test', pg.oneof(['call', 'query']),
         schema_fn=answer_schema(), lm=lm)
+    result = s.run(
+        filter=lambda x: x.method == 'query', dryrun=True, summary=False
+    )
     self.assertEqual(
-        s.run(filter=lambda x: x.method == 'query', dryrun=True, summary=False),
+        result,
         {
             s.children[0].id: None,
             s.children[1].id: dict(
@@ -290,8 +335,18 @@ class EvaluationTest(unittest.TestCase):
                 cache_stats=dict(
                     use_cache=True, num_queries=2, num_hits=0, num_updates=2
                 ),
-                metrics=dict(total=2, failures=0, failure_rate=0.0),
-            )
+                metrics=dict(
+                    total=2,
+                    failures=0,
+                    failure_rate=0.0,
+                    oop_failures=0,
+                    oop_failure_rate=0.0,
+                    non_oop_failures=0,
+                    non_oop_failure_rate=0.0,
+                    failure_breakdown={},
+                ),
+                usage=s.children[1].result.usage,
+            ),
         },
     )
 
@@ -321,11 +376,10 @@ class EvaluationTest(unittest.TestCase):
         s.children[0].dir, os.path.join(s.root_dir, s.children[0].id)
     )
     # Test persistent hash.
-    self.assertEqual(s.hash, 'ca7f722b')
+    self.assertEqual(s.hash, 'b66a4e88')
 
     summary = s.run(verbose=True)
     self.assertEqual(len(summary.evaluations), 2)
-
     self.assertEqual(
         s.result,
         {
@@ -341,7 +395,19 @@ class EvaluationTest(unittest.TestCase):
                 cache_stats=dict(
                     use_cache=True, num_queries=2, num_hits=0, num_updates=2
                 ),
-                metrics=dict(total=2, failures=1, failure_rate=0.5),
+                metrics=dict(
+                    total=2,
+                    failures=1,
+                    failure_rate=0.5,
+                    oop_failures=1,
+                    oop_failure_rate=0.5,
+                    non_oop_failures=0,
+                    non_oop_failure_rate=0.0,
+                    failure_breakdown={
+                        'MappingError.SchemaError.TypeError': 1
+                    }
+                ),
+                usage=s.children[0].result.usage,
             ),
             s.children[1].id: dict(
                 experiment_setup=dict(
@@ -355,7 +421,19 @@ class EvaluationTest(unittest.TestCase):
                 cache_stats=dict(
                     use_cache=True, num_queries=2, num_hits=0, num_updates=2
                 ),
-                metrics=dict(total=2, failures=1, failure_rate=0.5),
+                metrics=dict(
+                    total=2,
+                    failures=1,
+                    failure_rate=0.5,
+                    oop_failures=1,
+                    oop_failure_rate=0.5,
+                    non_oop_failures=0,
+                    non_oop_failure_rate=0.0,
+                    failure_breakdown={
+                        'MappingError.SchemaError.TypeError': 1
+                    }
+                ),
+                usage=s.children[1].result.usage,
             ),
         },
     )
@@ -448,10 +526,10 @@ class SuiteTest(unittest.TestCase):
         lm=lm
     )
     # Test for persistent hash.
-    self.assertEqual(s.hash, '7285e52b')
+    self.assertEqual(s.hash, '26e6cc25')
     s.run()
     expected = {
-        s.children[0].id: dict(
+        'Evaluation@0fade07d': dict(
             experiment_setup=dict(
                 id=s.children[0].id,
                 dir=s.children[0].dir,
@@ -463,45 +541,46 @@ class SuiteTest(unittest.TestCase):
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(total=2, failures=1, failure_rate=0.5),
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
+            usage=s.children[0].result.usage,
         ),
-        s.children[1].id: {
-            s.children[1]
-            .children[0]
-            .id: dict(
-                experiment_setup=dict(
-                    id=s.children[1].children[0].id,
-                    dir=s.children[1].children[0].dir,
-                    model='StaticSequence',
-                    prompt_template='{{example.question}}',
-                    method='call',
-                    schema_fn='answer_schema()',
-                ),
-                cache_stats=dict(
-                    use_cache=True, num_queries=4, num_hits=1, num_updates=3
-                ),
-                metrics=dict(total=2, failures=2, failure_rate=1.0),
+        'Evaluation@ae86c703': dict(
+            experiment_setup=dict(
+                id=s.children[1].children[0].id,
+                dir=s.children[1].children[0].dir,
+                model='StaticSequence',
+                prompt_template='{{example.question}}',
+                method='call',
+                schema_fn='answer_schema()',
             ),
-            s.children[1]
-            .children[2]
-            .id: dict(
-                experiment_setup=dict(
-                    id=s.children[1].children[2].id,
-                    dir=s.children[1].children[2].dir,
-                    model='StaticSequence',
-                    prompt_template='{{example.question}}',
-                    method='query',
-                    schema_fn='answer_schema()',
-                ),
-                cache_stats=dict(
-                    use_cache=True,
-                    num_queries=2,
-                    num_hits=0,
-                    num_updates=2,
-                ),
-                metrics=dict(total=2, failures=1, failure_rate=0.5),
+            cache_stats=dict(
+                use_cache=True, num_queries=4, num_hits=0, num_updates=4
             ),
-        },
+            metrics=dict(
+                total=2,
+                failures=2,
+                failure_rate=1.0,
+                oop_failures=2,
+                oop_failure_rate=1.0,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 2
+                }
+            ),
+            usage=s.children[1].children[0].result.usage,
+        ),
     }
     self.assertEqual(s.result, expected)
 
@@ -520,6 +599,14 @@ class InputsFrom(unittest.TestCase):
     pg.save([1, 2, 3], path)
     self.assertEqual(base.inputs_from(path)(), [1, 2, 3])
 
+    path = os.path.join(tmp_dir, 'input_file.jsonl')
+    with pg.open_jsonl(path, 'w') as f:
+      f.add(pg.Dict(x=1))
+      f.add(dict(y=2))
+    self.assertEqual(
+        base.inputs_from(path)(), [pg.Dict(x=1), dict(y=2)]
+    )
+
   def test_inputs_from_multiple_files(self):
     tmp_dir = tempfile.gettempdir()
     path1 = os.path.join(tmp_dir, 'input_file1.json')
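
`inputs_from` now accepts `.jsonl` files in addition to JSON, as the added test above shows. A minimal sketch mirroring it:

import os
import tempfile
import pyglove as pg
from langfun.core.eval import base

path = os.path.join(tempfile.gettempdir(), 'examples.jsonl')
with pg.open_jsonl(path, 'w') as f:
  f.add(pg.Dict(question='Compute 1 + 1'))  # one JSON record per line

examples = base.inputs_from(path)()  # inputs_from returns a functor; call it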
@@ -671,5 +758,103 @@ class SummaryTest(unittest.TestCase):
     self.assertTrue(pg.io.path_exists(summary_file))
 
 
+class NamedEvaluationTest(unittest.TestCase):
+
+  def test_named_eval_class(self):
+
+    @base.register('named_eval/class_test')
+    class MyEval(base.Evaluation):
+      inputs = base.as_inputs([
+          pg.Dict(question='Compute 1 + 1'),
+      ])
+      method = 'query'
+      prompt = pg.oneof([
+          lf.Template('{{example.question}}'),
+          lf.Template('Hello {{example.question}}'),
+      ])
+      schema_fn = answer_schema()
+
+    [evaluation] = base.get_evaluations('named_eval/class_test')
+    self.assertIsInstance(evaluation, MyEval)
+    self.assertIsNone(evaluation.dir)
+    self.assertIsNone(evaluation.root_dir)
+    self.assertIn('named_eval/class_test', base.registered_names())
+
+    with self.assertRaisesRegex(ValueError, 'Unsupported type.*'):
+      @base.register('named_eval/bad_class')
+      class Foo:  # pylint: disable=unused-variable
+        pass
+
+  def test_named_eval_functor(self):
+
+    @base.register('named_eval/functor_test')
+    def my_eval():
+      return base.Evaluation(
+          inputs=base.as_inputs([
+              pg.Dict(question='Compute 1 + 1'),
+          ]),
+          method='query',
+          prompt=pg.oneof([
+              lf.Template('{{example.question}}'),
+              lf.Template('Hello {{example.question}}'),
+          ]),
+          schema_fn=answer_schema(),
+      )
+
+    self.assertTrue(issubclass(my_eval, base.Evaluable))
+    [evaluation] = base.get_evaluations('named_eval/functor_test')
+    self.assertIn('named_eval/functor_test', base.registered_names())
+    self.assertIsInstance(evaluation, my_eval)
+    self.assertIsNone(evaluation.root_dir, None)
+
+    self.assertTrue(
+        pg.eq(base.get_evaluations('named_eval/functor.*'), [evaluation])
+    )
+    self.assertEqual(base.get_evaluations('named_eval/non_existent'), [])
+
+    with self.assertRaisesRegex(TypeError, 'The return value .*'):
+      @base.register('named_eval/bad_return_type')
+      def bad_eval():  # pylint: disable=unused-variable
+        return 1
+
+  def test_run(self):
+    @base.register('test/run')
+    def test_run():  # pylint: disable=unused-variable
+      lm = fake.StaticResponse('Solution(final_answer=2)')
+      return eval_set('run_test', 'query', schema_fn=answer_schema(), lm=lm)
+
+    e = base.run(
+        tempfile.gettempdir(),
+        ['test/run'],
+        id_regex='run_test.*',
+        mode='dryrun',
+        print_definition=True,
+    )
+    self.assertEqual(
+        e.leaf_nodes[0].dir,
+        os.path.join(tempfile.gettempdir(), e.leaf_nodes[0].id),
+    )
+    self.assertTrue(
+        pg.eq(
+            e.leaf_nodes[0].lm, fake.StaticResponse('Solution(final_answer=2)')
+        )
+    )
+
+    @pg.patcher()
+    def bad_lm(unused_eval):  # pylint: disable=unused-variable
+      return dict(lm=fake.StaticResponse('efg'))
+
+    e = base.run(
+        tempfile.gettempdir(),
+        [test_run()],
+        filter='Evaluation.*',
+        patches=['bad_lm']
+    )
+    self.assertTrue(pg.eq(e.leaf_nodes[0].lm, fake.StaticResponse('efg')))
+
+    with self.assertRaisesRegex(ValueError, 'No evaluations found'):
+      base.run(tempfile.gettempdir(), ['test/non_existent'])
+
+
 if __name__ == '__main__':
   unittest.main()
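
The new `NamedEvaluationTest` documents a registry-based workflow: decorate an `Evaluation` subclass or a factory function with `@base.register(name)`, then look it up with `base.get_evaluations` or run it by name via `base.run`. A hedged sketch of the pattern (the registration name is illustrative, and `answer_schema()` is a helper defined in this test module):

import langfun as lf
import pyglove as pg
from langfun.core.eval import base

@base.register('my_project/arithmetic')  # illustrative name
def arithmetic_eval():
  return base.Evaluation(
      inputs=base.as_inputs([pg.Dict(question='Compute 1 + 1')]),
      method='query',
      prompt=lf.Template('{{example.question}}'),
      schema_fn=answer_schema(),  # test-module helper, as in the tests above
  )

assert 'my_project/arithmetic' in base.registered_names()
[evaluation] = base.get_evaluations('my_project/arithmetic')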
@@ -41,8 +41,8 @@ class Matching(base.Evaluation):
     """Returns the answer from the structure output."""
 
   @property
-  def matches(self) -> list[tuple[Any, Any, lf.Message]]:
-    """Returns the matches examples, outputs and the output messages."""
+  def matches(self) -> list[tuple[int, Any, Any, lf.Message]]:
+    """Returns the matches IDs, examples, outputs and the output messages."""
     return self._matches
 
   @property
@@ -57,7 +57,7 @@ class Matching(base.Evaluation):
     return self.num_matches / self.num_completed
 
   @property
-  def mismatches(self) -> list[tuple[Any, Any, lf.Message]]:
+  def mismatches(self) -> list[tuple[int, Any, Any, lf.Message]]:
     """Returns the mismatches examples, outputs and output messages."""
     return self._mismatches
 
@@ -86,34 +86,51 @@ class Matching(base.Evaluation):
     self._matches = []
     self._mismatches = []
 
-  def audit(self, example: Any, output: Any, message: lf.Message) -> None:
+  def audit_processed(
+      self, example_idx: int, example: Any, output: Any, message: lf.Message,
+      dryrun: bool = False
+  ) -> None:
     groundtruth = self.groundtruth(example)
     answer = self.answer(output, example)
+
+    if dryrun:
+      lf.console.write('')
+      lf.console.write(
+          str(groundtruth),
+          title='GROUDTRUTH',
+          color='green',
+      )
+      lf.console.write('')
+      lf.console.write(
+          str(answer),
+          title='ANSWER',
+          color='blue',
+      )
+
     if self.match(answer, groundtruth):
-      self._matches.append((example, output, message))
+      self._matches.append((example_idx, example, output, message))
     else:
-      self._mismatches.append((example, output, message))
+      self._mismatches.append((example_idx, example, output, message))
 
   def match(self, answer: Any, groundtruth: Any) -> bool:
     """Matches answer against the groundtruth. Subclasses can override."""
     return pg.eq(answer, groundtruth)
 
-  def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
+  def _eval_status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
     del progress
     return {
-        'Model': self.lm.model_id,
-        'Matches': f'%.{self.report_precision}f%% (%d/%d)' % (
-            self.match_rate * 100,
+        'Matches': '%s (%d/%d)' % (
+            self._format_rate(self.match_rate),
            self.num_matches,
            self.num_completed,
        ),
-        'Mismatches': f'%.{self.report_precision}f%% (%d/%d)' % (
-            self.mismatch_rate * 100,
+        'Mismatches': '%s (%d/%d)' % (
+            self._format_rate(self.mismatch_rate),
            self.num_mismatches,
            self.num_completed,
        ),
-        'Failed': f'%.{self.report_precision}f%% (%d/%d)' % (
-            self.failure_rate * 100,
+        'Failed': '%s (%d/%d)' % (
+            self._format_rate(self.failure_rate),
            self.num_failures,
            self.num_completed,
        ),
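
Since `matches` and `mismatches` now lead each tuple with the example index, downstream consumers need a four-way unpack. A short sketch of the new shape, where `matching_eval` stands for any completed `Matching` evaluation:

# Tuples are now (example_idx, example, output, message).
for example_idx, example, output, message in matching_eval.matches:
  print(example_idx, message.text)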
@@ -123,24 +140,25 @@ class Matching(base.Evaluation):
     assert self.result is not None
     m = self.result.metrics
     return (
-        f'COMPLETED(%s): Matches=%.{self.report_precision}f%% (%d/%d)'
-        f' Mismatches=%.{self.report_precision}f%% (%d/%d)'
-        f' Failures=%.{self.report_precision}f%% (%d/%d)'
+        'COMPLETED(%s):'
+        ' Matches=%s (%d/%d)'
+        ' Mismatches=%s (%d/%d)'
+        ' Failures=%s (%d/%d)'
     ) % (
         run_status,
-        m.match_rate * 100,
+        self._format_rate(m.match_rate),
         m.num_matches,
         m.total,
-        m.mismatch_rate * 100,
+        self._format_rate(m.mismatch_rate),
         m.num_mismatches,
         m.total,
-        m.failure_rate * 100,
+        self._format_rate(m.failure_rate),
         m.failures,
         m.total,
     )
 
-  def summarize(self) -> pg.Dict:
-    result = super().summarize()
+  def finalize(self) -> pg.Dict:
+    result = super().finalize()
     result.metrics.update(
         num_matches=self.num_matches,
         match_rate=self.match_rate,
@@ -154,33 +172,6 @@ class Matching(base.Evaluation):
   ) -> None:
     super().save(definition, result, report)
 
-    if result:
-
-      def force_dict(v):
-        return pg.object_utils.json_conversion.strip_types(pg.to_json(v))
-
-      # Save matches.
-      pg.save(
-          [
-              # We force the output to be dict as its type may be defined
-              # within functors which could be deserialized.
-              pg.Dict(input=input, output=force_dict(output))
-              for input, output, _ in self.matches
-          ],
-          os.path.join(self.dir, Matching.MATCHES_JSON),
-      )
-
-      # Save mismatches.
-      pg.save(
-          [
-              # We force the output to be dict as its type may be defined
-              # within functors which could be deserialized.
-              pg.Dict(input=input, output=force_dict(output))
-              for input, output, _ in self.mismatches
-          ],
-          os.path.join(self.dir, Matching.MISMATCHES_JSON),
-      )
-
     if report:
       pg.save(
           self._html([self._render_result, self._render_matches]),
@@ -201,9 +192,9 @@ class Matching(base.Evaluation):
   def _render_result_row(self, s: io.StringIO):
     super()._render_result_row(s)
     s.write(
-        '<td><span style="color:red">%s</span>%s</td>'
+        '<td><span style="color:orange">%s</span>%s</td>'
         % (
-            f'%.{self.report_precision}f%% ' % (self.mismatch_rate * 100),
+            self._format_rate(self.mismatch_rate),
             '<a href="%s">(%d/%d)</a>'
             % (self.mismatches_link, self.num_mismatches, self.num_completed),
         )
@@ -211,37 +202,33 @@ class Matching(base.Evaluation):
     s.write(
         '<td><span style="color:green">%s</span>%s</td>'
         % (
-            f'%.{self.report_precision}f%% ' % (self.match_rate * 100),
+            self._format_rate(self.match_rate),
             '<a href="%s">(%d/%d)</a>'
             % (self.matches_link, self.num_matches, self.num_completed),
         )
     )
 
-  def _render_metric(self, s: io.StringIO) -> None:
+  def _render_summary_metrics(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
     m = self.result.metrics
-    s.write(
-        '<a title="Matches (%d/%d)" href="%s" style="color:green">%s</a>'
-        % (
-            m.num_matches,
-            m.total,
-            self.matches_link,
-            f'%.{self.report_precision}f%% ' % (m.match_rate * 100),
-        )
+    self._render_link(
+        s,
+        'Matches (%d/%d)' % (m.num_matches, m.total),
+        self._format_rate(m.match_rate),
+        'color:green',
+        lambda: self.matches_link,
     )
     s.write(' | ')
-    s.write(
-        '<a title="Mismatches (%d/%d)" href="%s" style="color:orange">%s</a>'
-        % (
-            m.num_mismatches,
-            m.total,
-            self.mismatches_link,
-            f'%.{self.report_precision}f%% ' % (m.mismatch_rate * 100),
-        )
+    self._render_link(
+        s,
+        'Mismatches (%d/%d)' % (m.num_mismatches, m.total),
+        self._format_rate(m.mismatch_rate),
+        'color:orange',
+        lambda: self.mismatches_link,
     )
     s.write(' | ')
-    super()._render_metric(s)
+    super()._render_summary_metrics(s)
 
   def _render_matches(self, s: io.StringIO) -> None:
     """Formats the matched cases into html."""
@@ -254,12 +241,29 @@ class Matching(base.Evaluation):
         '<td>Prompt/Response Chain</td>'
         '</tr>'
     )
-    for i, (example, output, message) in enumerate(self.matches):
+    def _maybe_html(v, root_indent: int):
+      del root_indent
+      if hasattr(v, '_repr_html_'):
+        return v._repr_html_()  # pylint: disable=protected-access
+      # Fall back to the default format.
+      return None
+
+    for i, (_, example, output, message) in enumerate(self.matches):
       bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
       s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
-      input_str = pg.format(example, verbose=False)
+      input_str = pg.Html.escape(
+          pg.format(
+              example, verbose=False, max_bytes_len=32,
+              custom_format=_maybe_html
+          )
+      )
       s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
-      output_str = pg.format(output, verbose=False)
+      output_str = pg.Html.escape(
+          pg.format(
+              output, verbose=False, max_bytes_len=32,
+              custom_format=_maybe_html
+          )
+      )
       s.write(f'<td style="color:blue;white-space:pre-wrap">{output_str}</td>')
       s.write('<td>')
       self._render_message(message, s)
@@ -279,12 +283,12 @@ class Matching(base.Evaluation):
         '</tr>'
     )
 
-    for i, (example, output, message) in enumerate(self.mismatches):
+    for i, (_, example, output, message) in enumerate(self.mismatches):
       bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
       s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
-      input_str = pg.format(example, verbose=False)
+      input_str = pg.format(example, verbose=False, max_bytes_len=32)
       s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
-      output_str = pg.format(output, verbose=False)
+      output_str = pg.format(output, verbose=False, max_bytes_len=32)
       s.write(
           f'<td style="color:magenta;white-space:pre-wrap">{output_str}</td>'
       )
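
Both renderers now clamp long byte values with `pg.format(..., max_bytes_len=32)`, and the match renderer additionally escapes the formatted text with `pg.Html.escape` before embedding it in table cells. A small sketch of that combination (the example value is hypothetical):

import pyglove as pg

example = pg.Dict(name='demo', payload=b'\x00' * 1024)  # hypothetical value
text = pg.format(example, verbose=False, max_bytes_len=32)  # long bytes elided
safe = pg.Html.escape(text)  # escaped before writing into the report HTML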