langfun 0.0.2.dev20240330__py3-none-any.whl → 0.0.2.dev20240429__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (49)
  1. langfun/__init__.py +2 -0
  2. langfun/core/__init__.py +1 -0
  3. langfun/core/coding/python/correction.py +0 -7
  4. langfun/core/component.py +6 -0
  5. langfun/core/component_test.py +1 -0
  6. langfun/core/eval/__init__.py +2 -0
  7. langfun/core/eval/base.py +202 -23
  8. langfun/core/eval/base_test.py +49 -10
  9. langfun/core/eval/matching.py +26 -9
  10. langfun/core/eval/matching_test.py +2 -1
  11. langfun/core/eval/scoring.py +15 -6
  12. langfun/core/eval/scoring_test.py +2 -1
  13. langfun/core/langfunc.py +0 -5
  14. langfun/core/langfunc_test.py +6 -4
  15. langfun/core/language_model.py +124 -24
  16. langfun/core/language_model_test.py +249 -26
  17. langfun/core/llms/__init__.py +19 -2
  18. langfun/core/llms/anthropic.py +263 -0
  19. langfun/core/llms/anthropic_test.py +167 -0
  20. langfun/core/llms/cache/in_memory_test.py +37 -28
  21. langfun/core/llms/fake.py +31 -22
  22. langfun/core/llms/fake_test.py +122 -11
  23. langfun/core/llms/google_genai_test.py +8 -3
  24. langfun/core/llms/groq.py +260 -0
  25. langfun/core/llms/groq_test.py +170 -0
  26. langfun/core/llms/llama_cpp.py +3 -1
  27. langfun/core/llms/openai.py +97 -79
  28. langfun/core/llms/openai_test.py +285 -59
  29. langfun/core/modalities/video.py +5 -2
  30. langfun/core/structured/__init__.py +3 -0
  31. langfun/core/structured/completion_test.py +2 -2
  32. langfun/core/structured/function_generation.py +245 -0
  33. langfun/core/structured/function_generation_test.py +329 -0
  34. langfun/core/structured/mapping.py +56 -2
  35. langfun/core/structured/mapping_test.py +17 -0
  36. langfun/core/structured/parsing_test.py +18 -13
  37. langfun/core/structured/prompting.py +27 -6
  38. langfun/core/structured/prompting_test.py +79 -12
  39. langfun/core/structured/schema.py +4 -2
  40. langfun/core/structured/schema_generation_test.py +2 -2
  41. langfun/core/structured/schema_test.py +4 -6
  42. langfun/core/template.py +125 -10
  43. langfun/core/template_test.py +75 -0
  44. langfun/core/templates/selfplay_test.py +6 -2
  45. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/METADATA +3 -2
  46. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/RECORD +49 -43
  47. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/LICENSE +0 -0
  48. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/WHEEL +0 -0
  49. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/top_level.txt +0 -0
langfun/__init__.py CHANGED
@@ -34,6 +34,7 @@ score = structured.score
34
34
  generate_class = structured.generate_class
35
35
 
36
36
  source_form = structured.source_form
37
+ function_gen = structured.function_gen
37
38
 
38
39
  from langfun.core import eval # pylint: disable=redefined-builtin
39
40
  from langfun.core import templates
@@ -54,6 +55,7 @@ Video = modalities.Video
54
55
  PDF = modalities.PDF
55
56
 
56
57
  # Error types.
58
+ MappingError = structured.MappingError
57
59
  SchemaError = structured.SchemaError
58
60
  JsonError = structured.JsonError
59
61
  CodeError = coding.CodeError
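The two new top-level aliases above (`function_gen` and `MappingError`) make the function-generation helper and the structured-mapping error reachable directly from `langfun`. A minimal sketch of catching the newly exposed error; the prompt and fake response are illustrative, not taken from the package:

import langfun as lf
from langfun.core.llms import fake

# Hypothetical example: a fake LM whose reply cannot be parsed into `int`,
# so the structured query raises the newly exposed lf.MappingError.
lm = fake.StaticResponse('not a number')
try:
  lf.query('Give me a number between 1 and 10.', int, lm=lm)
except lf.MappingError as e:
  print(e.lm_response.text)  # The raw LM response attached to the error.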
langfun/core/__init__.py CHANGED
@@ -99,6 +99,7 @@ from langfun.core.modality import ModalityRef
99
99
  from langfun.core.language_model import LanguageModel
100
100
  from langfun.core.language_model import LMSample
101
101
  from langfun.core.language_model import LMSamplingOptions
102
+ from langfun.core.language_model import LMSamplingUsage
102
103
  from langfun.core.language_model import LMSamplingResult
103
104
  from langfun.core.language_model import LMScoringResult
104
105
  from langfun.core.language_model import LMCache
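`LMSamplingUsage` is now part of the public `langfun.core` surface. A quick sketch, with field names taken from the usage metadata read elsewhere in this diff:

import langfun.core as lf

# Construct a usage record the way the updated tests do (prompt, completion,
# total token counts).
usage = lf.LMSamplingUsage(
    prompt_tokens=387, completion_tokens=24, total_tokens=411)
assert usage.total_tokens == usage.prompt_tokens + usage.completion_tokens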
langfun/core/coding/python/correction.py CHANGED
@@ -12,7 +12,6 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  """Python code error correction."""
15
- import re
16
15
  from typing import Any
17
16
  import langfun.core as lf
18
17
  from langfun.core.coding.python import errors
@@ -31,11 +30,6 @@ class CorrectedCode(pg.Object):
31
30
  corrected_code: str
32
31
 
33
32
 
34
- def remove_docstrings(code):
35
- pattern = re.compile(r"(def .+?:\s*?)('''|\"\"\")((.|\s)*?)(\2)", re.DOTALL)
36
- return pattern.sub(r"\1", code)
37
-
38
-
39
33
  def run_with_correction(
40
34
  code: str,
41
35
  error: str | None = None,
@@ -86,7 +80,6 @@ def run_with_correction(
86
80
  # pytype: enable=import-error
87
81
  # pylint: enable=g-import-not-at-top
88
82
 
89
- code = remove_docstrings(code)
90
83
  if max_attempts == 0:
91
84
  result = execution.run(
92
85
  code,
langfun/core/component.py CHANGED
@@ -210,6 +210,12 @@ def get_contextual_override(var_name: str) -> ContextualOverride | None:
210
210
  return _get_scoped_value(_global_tls, _CONTEXT_OVERRIDES, var_name)
211
211
 
212
212
 
213
+ def all_contextual_values() -> dict[str, Any]:
214
+ """Returns all contextual values provided from `lf.context` in scope."""
215
+ overrides = getattr(_global_tls, _CONTEXT_OVERRIDES, {})
216
+ return {k: v.value for k, v in overrides.items()}
217
+
218
+
213
219
  @contextlib.contextmanager
214
220
  def _contextual_scope(
215
221
  tls: threading.local, tls_key, **variables
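A minimal sketch of the new `all_contextual_values` helper, mirroring the test hunk below; the module-level import path is an assumption based on where the function is defined:

from langfun.core import component

with component.context(x=3, y=3, z=3):
  # Returns every value provided through the enclosing `context` scope.
  assert component.all_contextual_values() == dict(x=3, y=3, z=3)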
langfun/core/component_test.py CHANGED
@@ -84,6 +84,7 @@ class ComponentContextTest(unittest.TestCase):
84
84
  lf.get_contextual_override('y'),
85
85
  lf.ContextualOverride(3, cascade=False, override_attrs=False),
86
86
  )
87
+ self.assertEqual(lf.all_contextual_values(), dict(x=3, y=3, z=3))
87
88
 
88
89
  # Member attributes take precedence over `lf.context`.
89
90
  self.assertEqual(a1.x, 1)
langfun/core/eval/__init__.py CHANGED
@@ -16,6 +16,8 @@
16
16
  # pylint: disable=g-importing-member
17
17
  # pylint: disable=g-bad-import-order
18
18
 
19
+ from langfun.core.eval.base import app_run
20
+
19
21
  from langfun.core.eval.base import Evaluable
20
22
  from langfun.core.eval.base import Evaluation
21
23
  from langfun.core.eval.base import Suite
langfun/core/eval/base.py CHANGED
@@ -26,6 +26,8 @@ import threading
26
26
  import time
27
27
  from typing import Annotated, Any, Callable, Iterator, Literal, Optional, Sequence, Type, Union
28
28
 
29
+ from absl import app
30
+ from absl import flags
29
31
  import langfun.core as lf
30
32
  import langfun.core.coding as lf_coding
31
33
  from langfun.core.llms.cache import in_memory
@@ -538,7 +540,7 @@ class Evaluable(lf.Component):
538
540
  f'<div style="color: {text_color}; white-space: pre-wrap;'
539
541
  'padding: 10px; border: 1px solid; margin-top: 10px">'
540
542
  )
541
- s.write(m.text)
543
+ s.write(m.get('formatted_text', m.text))
542
544
  if m.result is not None:
543
545
  s.write(
544
546
  '<div style="color: magenta; white-space: pre-wrap;'
@@ -546,6 +548,16 @@ class Evaluable(lf.Component):
546
548
  )
547
549
  s.write(pg.format(m.result))
548
550
  s.write('</div>')
551
+ if 'usage' in m.metadata:
552
+ s.write(
553
+ '<div style="background-color: #EEEEEE; color: black; '
554
+ 'white-space: pre-wrap; padding: 10px; border: 0px solid; '
555
+ 'margin: 10px">'
556
+ f'prompt: {m.usage.prompt_tokens} tokens, '
557
+ f'response: {m.usage.completion_tokens} tokens, '
558
+ f'total: {m.usage.total_tokens} tokens'
559
+ '</div>'
560
+ )
549
561
  s.write('</div>')
550
562
 
551
563
  @classmethod
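The renderer above reads the new per-message `usage` metadata. A sketch of inspecting it directly on a response message; the fake backend is used for illustration, real backends report actual token counts:

from langfun.core.llms import fake

lm = fake.StaticResponse('hi')
message = lm('hello')
if 'usage' in message.metadata:  # Populated by backends that report usage.
  usage = message.usage
  print(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)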
@@ -810,17 +822,36 @@ class Evaluation(Evaluable):
810
822
  return 0.0
811
823
  return self.num_failures / self.num_completed
812
824
 
825
+ @property
826
+ def has_usage(self) -> bool:
827
+ """Returns True if token usage is enabled."""
828
+ return self._num_usages > 0
829
+
830
+ @property
831
+ def average_prompt_tokens(self) -> int:
832
+ """Returns the average prompt tokens."""
833
+ if not self.has_usage:
834
+ return 0
835
+ return self._total_prompt_tokens // self._num_usages
836
+
837
+ @property
838
+ def average_completion_tokens(self) -> int:
839
+ """Returns the average completion tokens."""
840
+ if not self.has_usage:
841
+ return 0
842
+ return self._total_completion_tokens // self._num_usages
843
+
844
+ @property
845
+ def average_total_tokens(self) -> int:
846
+ """Returns the average total tokens."""
847
+ return self.average_prompt_tokens + self.average_completion_tokens
848
+
813
849
  @functools.cached_property
814
850
  def schema(self) -> lf_structured.Schema | None:
815
851
  """Schema."""
816
852
  if self.schema_fn is None:
817
853
  return None
818
854
 
819
- kwargs = {}
820
- # Allow schema to be a function based on current evaluation.
821
- if 'evaluation' in self.schema_fn.__signature__.arg_names:
822
- kwargs['evaluation'] = self
823
-
824
855
  schema = self._call_schema_fn()
825
856
  fewshot_examples = None
826
857
  if isinstance(schema, tuple):
@@ -861,7 +892,11 @@ class Evaluation(Evaluable):
861
892
  'Encountered: {annotation!r}.'
862
893
  )
863
894
  self._maybe_adjust_schema_for_completion(annotation)
864
- return lf_structured.Schema.from_value(annotation)
895
+ schema = lf_structured.Schema.from_value(annotation)
896
+ # NOTE(daiyip): add references to the dependent classes of the returned type
897
+ # to prevent unused subclasses get garbage collected by Python.
898
+ setattr(schema, '__dependencies__', schema.class_dependencies())
899
+ return schema
865
900
 
866
901
  def _maybe_adjust_schema_for_completion(self, cls):
867
902
  if (self.completion_prompt_field is None
@@ -938,6 +973,10 @@ class Evaluation(Evaluable):
938
973
  self._failures = []
939
974
  self._num_completed = 0
940
975
 
976
+ self._total_prompt_tokens = 0
977
+ self._total_completion_tokens = 0
978
+ self._num_usages = 0
979
+
941
980
  @property
942
981
  def failures_link(self) -> str | None:
943
982
  """Returns the link to the failures page."""
@@ -957,7 +996,7 @@ class Evaluation(Evaluable):
957
996
  example = example or self.examples[0]
958
997
 
959
998
  # We make a copy to avoid pollute the state of current object.
960
- copy = self.clone()
999
+ copy: Evaluation = self.clone()
961
1000
  copy.__dict__['examples'] = [example]
962
1001
 
963
1002
  # We set the symbolic parent of the cloned to access contextual information
@@ -987,9 +1026,9 @@ class Evaluation(Evaluable):
987
1026
  color='blue',
988
1027
  )
989
1028
 
990
- # Audit the result.
991
- copy.audit(example, output, output_message)
1029
+ copy.audit(example, output_message, None, dryrun=True)
992
1030
  result = copy.summarize()
1031
+
993
1032
  if verbose:
994
1033
  lf.console.write('')
995
1034
  lf.console.write(
@@ -1036,11 +1075,12 @@ class Evaluation(Evaluable):
1036
1075
  status_fn=self._status,
1037
1076
  ):
1038
1077
  if error is not None:
1039
- self._failures.append((example, str(error)))
1040
- else:
1041
- output = message.text if self.schema is None else message.result
1042
- self.audit(example, output, message)
1043
- self._num_completed += 1
1078
+ message = (
1079
+ error.lm_response
1080
+ if isinstance(error, lf_structured.MappingError)
1081
+ else None
1082
+ )
1083
+ self.audit(example, message, error)
1044
1084
  finally:
1045
1085
  # Save cache upon completion or interruption.
1046
1086
  if self.dir and self.cache:
@@ -1143,6 +1183,19 @@ class Evaluation(Evaluable):
1143
1183
  )
1144
1184
  else:
1145
1185
  cache_stats = dict(use_cache=False)
1186
+
1187
+ if self.has_usage:
1188
+ usage = pg.Dict(
1189
+ total_prompt_tokens=self._total_prompt_tokens,
1190
+ total_completion_tokens=self._total_completion_tokens,
1191
+ num_usages=self._num_usages,
1192
+ average_prompt_tokens=self.average_prompt_tokens,
1193
+ average_completion_tokens=self.average_completion_tokens,
1194
+ average_total_tokens=self.average_total_tokens,
1195
+ )
1196
+ else:
1197
+ usage = None
1198
+
1146
1199
  result = pg.Dict(
1147
1200
  experiment_setup=pg.Dict(
1148
1201
  id=self.id,
@@ -1158,6 +1211,7 @@ class Evaluation(Evaluable):
1158
1211
  failures=self.num_failures,
1159
1212
  failure_rate=self.failure_rate,
1160
1213
  ),
1214
+ usage=usage,
1161
1215
  )
1162
1216
  return result
1163
1217
 
@@ -1179,9 +1233,28 @@ class Evaluation(Evaluable):
1179
1233
  '</td></tr><tr><td>'
1180
1234
  )
1181
1235
  self._render_metric(s)
1236
+
1237
+ # Summarize average usage.
1238
+ if self.result.usage is not None:
1239
+ self._render_usage(s)
1240
+
1182
1241
  s.write('</td></tr></table></div>')
1183
1242
  return s.getvalue()
1184
1243
 
1244
+ def _render_usage(self, s: io.StringIO) -> None:
1245
+ """Renders usage in HTML."""
1246
+ usage = self.result.usage
1247
+ total = usage.total_prompt_tokens + usage.total_completion_tokens
1248
+ s.write(
1249
+ '&nbsp;<a title="'
1250
+ f'# of usages: {usage.num_usages}&#013;'
1251
+ f'total prompt: {usage.total_prompt_tokens}&#013;'
1252
+ f'total response: {usage.total_completion_tokens}&#013;'
1253
+ f'avg prompt: {usage.average_prompt_tokens}&#013;'
1254
+ f'avg response: {usage.average_completion_tokens}'
1255
+ f'" style="color:gray">({total} tokens)</a>'
1256
+ )
1257
+
1185
1258
  def _render_metric(self, s: io.StringIO) -> None:
1186
1259
  """Renders metrics in HTML."""
1187
1260
  assert self.result is not None
@@ -1196,17 +1269,48 @@ class Evaluation(Evaluable):
1196
1269
  )
1197
1270
  )
1198
1271
 
1199
- def audit(self, example: Any, output: Any, message: lf.Message) -> None:
1272
+ def audit(
1273
+ self,
1274
+ example: Any,
1275
+ message: lf.Message | None,
1276
+ error: Exception | None = None,
1277
+ dryrun: bool = False,
1278
+ ) -> None:
1200
1279
  """Audits the example against the output. Subclasses should override.
1201
1280
 
1202
1281
  Args:
1203
1282
  example: The input object.
1204
- output: The output from LM. For `lf.call`, if `schema_fn` is not provided,
1205
- it will be the raw LM response string. Otherwise it will be the
1206
- structured output from the LM.
1207
1283
  message: The entire message returned by the LM, which could be used to
1208
- trace the LM input, response and parsed structure.
1284
+ trace the LM input, response and parsed structure. If error is raised
1285
+ before LLM could return a response, None will be its value.
1286
+ error: The exception during processing the example.
1287
+ dryrun: Whether or not audition takes place during dryrun.
1209
1288
  """
1289
+ if error is not None:
1290
+ self._failures.append((example, str(error)))
1291
+ if isinstance(error, lf_structured.MappingError):
1292
+ message = error.lm_response
1293
+ else:
1294
+ assert message is not None
1295
+ output = message.text if self.schema is None else message.result
1296
+ self.audit_processed(example, output, message, dryrun=dryrun)
1297
+
1298
+ # Audit usage.
1299
+ if message is not None:
1300
+ self.audit_usage(message, dryrun=dryrun)
1301
+ self._num_completed += 1
1302
+
1303
+ def audit_usage(self, message: lf.Message, dryrun: bool = False) -> None:
1304
+ for m in message.trace():
1305
+ if 'usage' in m.metadata:
1306
+ self._total_prompt_tokens += m.usage.prompt_tokens
1307
+ self._total_completion_tokens += m.usage.completion_tokens
1308
+ self._num_usages += 1
1309
+
1310
+ def audit_processed(
1311
+ self, example: Any, output: Any, message: lf.Message, dryrun: bool = False
1312
+ ) -> None:
1313
+ """Audits a successfully processed example. Subclass should override."""
1210
1314
 
1211
1315
  def save(
1212
1316
  self, definition: bool = True, result: bool = True, report: bool = True
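For subclasses, the practical effect of the reworked audit API is that per-example success handling moves into `audit_processed`, while `audit` itself now also receives the message, the error (if any) and a `dryrun` flag. A hedged sketch; the class and its bookkeeping are hypothetical:

import langfun.core as lf
from langfun.core.eval import base


class MyEval(base.Evaluation):  # Other required fields omitted in this sketch.

  def audit_processed(
      self, example, output, message: lf.Message, dryrun: bool = False
  ) -> None:
    del message, dryrun
    # Compare `output` against the expectation derived from `example` here.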
@@ -1250,8 +1354,10 @@ class Evaluation(Evaluable):
1250
1354
  '<td>Prompt</td>'
1251
1355
  '<td>Schema</td>'
1252
1356
  '<td>Additional Args</td>'
1253
- '<td>Failures</td>'
1254
1357
  )
1358
+ if self.result.usage is not None:
1359
+ s.write('<td>Usage</td>')
1360
+ s.write('<td>Failures</td>')
1255
1361
 
1256
1362
  def _render_result_row(self, s: io.StringIO) -> None:
1257
1363
  s.write(
@@ -1276,6 +1382,12 @@ class Evaluation(Evaluable):
1276
1382
  '<td style="color:purple" '
1277
1383
  f'{_html_repr(self.additional_args, compact=False)}</td>'
1278
1384
  )
1385
+ # Usage.
1386
+ if self.result.usage is not None:
1387
+ s.write('<td>')
1388
+ self._render_usage(s)
1389
+ s.write('</td>')
1390
+
1279
1391
  # Failures.
1280
1392
  s.write(
1281
1393
  '<td><span style="color:orange">%s</span>%s</td>'
@@ -1374,8 +1486,8 @@ class Summary(pg.Object):
1374
1486
  Type[lf.LanguageModel],
1375
1487
  tuple[lf.LanguageModel | Type[lf.LanguageModel], ...],
1376
1488
  ] = lf.LanguageModel,
1377
- method: Union[str, tuple[str], None] = None,
1378
- schema_fn: Union[pg.Functor, tuple[pg.Functor], None] = None,
1489
+ method: Union[str, tuple[str, ...], None] = None,
1490
+ schema_fn: Union[pg.Functor, tuple[pg.Functor, ...], None] = None,
1379
1491
  completed: bool | None = None,
1380
1492
  pivot_field: str | None = None,
1381
1493
  ) -> 'Summary':
@@ -1556,8 +1668,35 @@ class Summary(pg.Object):
1556
1668
  def _repr_html_(self) -> str:
1557
1669
  return self.html()
1558
1670
 
1671
+ def json(
1672
+ self,
1673
+ ) -> dict[
1674
+ str, # Task name
1675
+ list[pg.Dict], # List of pg.Dict with `experiment` and `metrics`.
1676
+ ]:
1677
+ """Returns the JSON representation of the summary."""
1678
+ task_results = {}
1679
+ for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
1680
+ results = []
1681
+ for entry in self.select(task=task).evaluations:
1682
+ results.append(
1683
+ pg.Dict(
1684
+ id=entry.id,
1685
+ experiment=entry,
1686
+ dir=entry.dir,
1687
+ metrics=entry.result.metrics if entry.result else None,
1688
+ )
1689
+ )
1690
+ task_results[task.__name__] = results
1691
+ return task_results
1692
+
1559
1693
  def save(self, file: str, pivot_field: str | None = None) -> None:
1560
1694
  pg.save(self.html(pivot_field), file, file_format='txt')
1695
+ if file.endswith('.html'):
1696
+ json_file = file.replace('.html', '.json')
1697
+ else:
1698
+ json_file = os.path.join(file, '.json')
1699
+ pg.save(self.json(), json_file)
1561
1700
 
1562
1701
  @classmethod
1563
1702
  def from_dirs(
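`Summary.save` now writes a machine-readable JSON file next to the HTML report. A sketch of consuming it; the path is hypothetical and the access pattern follows the updated base_test:

import pyglove as pg

summary = pg.load('/tmp/my_evals/summary.json', force_dict=True)
for task_name, entries in summary.items():
  for entry in entries:
    print(task_name, entry.id, entry.dir, entry.metrics)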
@@ -1768,3 +1907,43 @@ def monitor_async(
1768
1907
  scan_interval=scan_interval,
1769
1908
  refresh_when_stop=refresh_when_stop,
1770
1909
  )
1910
+
1911
+
1912
+ def app_run(target: Evaluable):
1913
+ """Runs the target evaluation as an absl app.
1914
+
1915
+ Args:
1916
+ target: An Langfun evaluable object.
1917
+ """
1918
+ flags.DEFINE_string(
1919
+ 'root_dir', None, 'Root directory for running the evaluation.'
1920
+ )
1921
+
1922
+ flags.DEFINE_bool(
1923
+ 'dryrun', False, 'If True, dryrun the experiment instead of running it.'
1924
+ )
1925
+
1926
+ flags.DEFINE_bool(
1927
+ 'debug', False, 'If True, output prompt and response to the console.'
1928
+ )
1929
+
1930
+ flags.DEFINE_bool(
1931
+ 'rerun',
1932
+ False,
1933
+ 'If True, rerun the experiment even a cached result is found.',
1934
+ )
1935
+
1936
+ FLAGS = flags.FLAGS # pylint: disable=invalid-name
1937
+
1938
+ def _main(argv):
1939
+ if len(argv) > 1:
1940
+ raise app.UsageError('Too many command-line arguments.')
1941
+
1942
+ if FLAGS.root_dir:
1943
+ target.rebind(root_dir=FLAGS.root_dir, raise_on_no_change=False)
1944
+ if FLAGS.dryrun:
1945
+ target.dryrun(debug=FLAGS.debug)
1946
+ else:
1947
+ target.run(debug=FLAGS.debug, rerun=FLAGS.rerun)
1948
+
1949
+ app.run(_main)
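A minimal sketch of wiring the new `app_run` entry point into a script; `my_experiment` is a hypothetical factory, and the flags are the ones defined above (`--root_dir`, `--dryrun`, `--debug`, `--rerun`):

# run_eval.py
from langfun.core import eval as lf_eval


def my_experiment() -> lf_eval.Evaluable:
  raise NotImplementedError('Construct your Suite/Evaluation here.')


if __name__ == '__main__':
  lf_eval.app_run(my_experiment())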
langfun/core/eval/base_test.py CHANGED
@@ -101,7 +101,7 @@ class EvaluationTest(unittest.TestCase):
101
101
  self.assertEqual(s.dir, os.path.join(s.root_dir, s.id))
102
102
  self.assertEqual(s.hash, s.clone().hash)
103
103
  # Test persistent hash.
104
- self.assertEqual(s.hash, 'abc7c29a')
104
+ self.assertEqual(s.hash, 'ae86c703')
105
105
  self.assertEqual(
106
106
  s.hash, s.clone(override={'max_workers': 2, 'lm.timeout': 20}).hash
107
107
  )
@@ -194,6 +194,7 @@ class EvaluationTest(unittest.TestCase):
194
194
  cache_seed=0,
195
195
  score=1.0,
196
196
  logprobs=None,
197
+ usage=lf.LMSamplingUsage(387, 24, 411),
197
198
  tags=['lm-response', 'lm-output', 'transformed'],
198
199
  ),
199
200
  )
@@ -209,7 +210,7 @@ class EvaluationTest(unittest.TestCase):
209
210
  s.result,
210
211
  dict(
211
212
  experiment_setup=dict(
212
- id='Evaluation@17915dc6',
213
+ id='Evaluation@0fade07d',
213
214
  dir=s.dir,
214
215
  model='StaticSequence',
215
216
  prompt_template='{{example.question}}',
@@ -220,6 +221,14 @@ class EvaluationTest(unittest.TestCase):
220
221
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
221
222
  ),
222
223
  metrics=dict(total=2, failures=1, failure_rate=0.5),
224
+ usage=dict(
225
+ total_prompt_tokens=774,
226
+ total_completion_tokens=25,
227
+ num_usages=2,
228
+ average_prompt_tokens=387,
229
+ average_completion_tokens=12,
230
+ average_total_tokens=399,
231
+ ),
223
232
  ),
224
233
  )
225
234
  self.assertTrue(
@@ -228,13 +237,23 @@ class EvaluationTest(unittest.TestCase):
228
237
  os.path.exists(os.path.join(s.dir, base.Evaluation.RESULT_JSON)))
229
238
  self.assertTrue(
230
239
  os.path.exists(os.path.join(s.dir, base.Evaluation.CACHE_JSON)))
231
- self.assertTrue(
232
- os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
233
- )
234
240
  self.assertTrue(
235
241
  os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
236
242
  self.assertTrue(
237
243
  os.path.exists(os.path.join(s.dir, base.Evaluation.FAILURES_HTML)))
244
+ self.assertTrue(
245
+ os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
246
+ )
247
+ # Check summary JSON.
248
+ summary_json = os.path.join(
249
+ s.root_dir, base.Evaluation.SUMMARY_HTML.replace('.html', '.json')
250
+ )
251
+ self.assertTrue(os.path.exists(summary_json))
252
+ summary = pg.load(summary_json, force_dict=True)
253
+ self.assertIn('Evaluation', summary)
254
+ self.assertEqual(len(summary['Evaluation']), 1)
255
+ self.assertIsNotNone(summary['Evaluation'][0].experiment)
256
+ self.assertIsNotNone(summary['Evaluation'][0].metrics)
238
257
 
239
258
  def test_run_wihtout_save(self):
240
259
  lm = fake.StaticSequence([
@@ -274,8 +293,11 @@ class EvaluationTest(unittest.TestCase):
274
293
  s = eval_set(
275
294
  'run_filter_test', pg.oneof(['call', 'query']),
276
295
  schema_fn=answer_schema(), lm=lm)
296
+ result = s.run(
297
+ filter=lambda x: x.method == 'query', dryrun=True, summary=False
298
+ )
277
299
  self.assertEqual(
278
- s.run(filter=lambda x: x.method == 'query', dryrun=True, summary=False),
300
+ result,
279
301
  {
280
302
  s.children[0].id: None,
281
303
  s.children[1].id: dict(
@@ -291,7 +313,8 @@ class EvaluationTest(unittest.TestCase):
291
313
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
292
314
  ),
293
315
  metrics=dict(total=2, failures=0, failure_rate=0.0),
294
- )
316
+ usage=s.children[1].result.usage,
317
+ ),
295
318
  },
296
319
  )
297
320
 
@@ -321,11 +344,10 @@ class EvaluationTest(unittest.TestCase):
321
344
  s.children[0].dir, os.path.join(s.root_dir, s.children[0].id)
322
345
  )
323
346
  # Test persistent hash.
324
- self.assertEqual(s.hash, 'ca7f722b')
347
+ self.assertEqual(s.hash, 'b66a4e88')
325
348
 
326
349
  summary = s.run(verbose=True)
327
350
  self.assertEqual(len(summary.evaluations), 2)
328
-
329
351
  self.assertEqual(
330
352
  s.result,
331
353
  {
@@ -342,6 +364,7 @@ class EvaluationTest(unittest.TestCase):
342
364
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
343
365
  ),
344
366
  metrics=dict(total=2, failures=1, failure_rate=0.5),
367
+ usage=s.children[0].result.usage,
345
368
  ),
346
369
  s.children[1].id: dict(
347
370
  experiment_setup=dict(
@@ -356,6 +379,7 @@ class EvaluationTest(unittest.TestCase):
356
379
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
357
380
  ),
358
381
  metrics=dict(total=2, failures=1, failure_rate=0.5),
382
+ usage=s.children[1].result.usage,
359
383
  ),
360
384
  },
361
385
  )
@@ -448,7 +472,7 @@ class SuiteTest(unittest.TestCase):
448
472
  lm=lm
449
473
  )
450
474
  # Test for persistent hash.
451
- self.assertEqual(s.hash, '7285e52b')
475
+ self.assertEqual(s.hash, '26e6cc25')
452
476
  s.run()
453
477
  expected = {
454
478
  s.children[0].id: dict(
@@ -464,6 +488,7 @@ class SuiteTest(unittest.TestCase):
464
488
  use_cache=True, num_queries=2, num_hits=0, num_updates=2
465
489
  ),
466
490
  metrics=dict(total=2, failures=1, failure_rate=0.5),
491
+ usage=s.children[0].result.usage,
467
492
  ),
468
493
  s.children[1].id: {
469
494
  s.children[1]
@@ -481,6 +506,7 @@ class SuiteTest(unittest.TestCase):
481
506
  use_cache=True, num_queries=4, num_hits=1, num_updates=3
482
507
  ),
483
508
  metrics=dict(total=2, failures=2, failure_rate=1.0),
509
+ usage=s.children[1].children[0].result.usage,
484
510
  ),
485
511
  s.children[1]
486
512
  .children[2]
@@ -500,6 +526,7 @@ class SuiteTest(unittest.TestCase):
500
526
  num_updates=2,
501
527
  ),
502
528
  metrics=dict(total=2, failures=1, failure_rate=0.5),
529
+ usage=s.children[1].children[2].result.usage,
503
530
  ),
504
531
  },
505
532
  }
@@ -671,5 +698,17 @@ class SummaryTest(unittest.TestCase):
671
698
  self.assertTrue(pg.io.path_exists(summary_file))
672
699
 
673
700
 
701
+ class AppRunTest(unittest.TestCase):
702
+
703
+ def test_app_run(self):
704
+ lm = fake.StaticSequence(['two', 'Solution(final_answer=2)'])
705
+ try:
706
+ base.app_run(
707
+ eval_set('app_run_test', 'query', schema_fn=answer_schema(), lm=lm)
708
+ )
709
+ except SystemExit:
710
+ pass
711
+
712
+
674
713
  if __name__ == '__main__':
675
714
  unittest.main()
langfun/core/eval/matching.py CHANGED
@@ -86,9 +86,26 @@ class Matching(base.Evaluation):
86
86
  self._matches = []
87
87
  self._mismatches = []
88
88
 
89
- def audit(self, example: Any, output: Any, message: lf.Message) -> None:
89
+ def audit_processed(
90
+ self, example: Any, output: Any, message: lf.Message, dryrun: bool = False
91
+ ) -> None:
90
92
  groundtruth = self.groundtruth(example)
91
93
  answer = self.answer(output, example)
94
+
95
+ if dryrun:
96
+ lf.console.write('')
97
+ lf.console.write(
98
+ str(groundtruth),
99
+ title='GROUDTRUTH',
100
+ color='green',
101
+ )
102
+ lf.console.write('')
103
+ lf.console.write(
104
+ str(answer),
105
+ title='ANSWER',
106
+ color='blue',
107
+ )
108
+
92
109
  if self.match(answer, groundtruth):
93
110
  self._matches.append((example, output, message))
94
111
  else:
@@ -155,19 +172,16 @@ class Matching(base.Evaluation):
155
172
  super().save(definition, result, report)
156
173
 
157
174
  if result:
158
-
159
- def force_dict(v):
160
- return pg.object_utils.json_conversion.strip_types(pg.to_json(v))
161
-
162
175
  # Save matches.
163
176
  pg.save(
164
177
  [
165
- # We force the output to be dict as its type may be defined
166
- # within functors which could be deserialized.
167
- pg.Dict(input=input, output=force_dict(output))
178
+ pg.Dict(input=input, output=output)
168
179
  for input, output, _ in self.matches
169
180
  ],
170
181
  os.path.join(self.dir, Matching.MATCHES_JSON),
182
+ # We force the input and output to be dict so it does not depend on
183
+ # the downstream to serialize.
184
+ force_dict=True,
171
185
  )
172
186
 
173
187
  # Save mismatches.
@@ -175,10 +189,13 @@ class Matching(base.Evaluation):
175
189
  [
176
190
  # We force the output to be dict as its type may be defined
177
191
  # within functors which could be deserialized.
178
- pg.Dict(input=input, output=force_dict(output))
192
+ pg.Dict(input=input, output=output)
179
193
  for input, output, _ in self.mismatches
180
194
  ],
181
195
  os.path.join(self.dir, Matching.MISMATCHES_JSON),
196
+ # We force the input and output to be dict so it does not depend on
197
+ # the downstream to serialize.
198
+ force_dict=True,
182
199
  )
183
200
 
184
201
  if report: