langfun 0.0.2.dev20240429__py3-none-any.whl → 0.0.2.dev20240430__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents as they appear in their public registries.
- langfun/core/eval/base.py +310 -73
- langfun/core/eval/base_test.py +96 -45
- langfun/core/eval/matching.py +22 -21
- langfun/core/eval/matching_test.py +23 -2
- langfun/core/eval/scoring.py +4 -4
- langfun/core/eval/scoring_test.py +19 -2
- langfun/core/llms/openai.py +1 -1
- langfun/core/llms/openai_test.py +2 -1
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/METADATA +1 -1
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/RECORD +13 -13
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/WHEEL +0 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py
CHANGED
@@ -18,6 +18,7 @@ import collections
 import dataclasses
 import functools
 import hashlib
+import html
 import inspect
 import io
 import os
@@ -40,7 +41,8 @@ class Evaluable(lf.Component):

   EXPERIMENT_JSON = 'experiment.json'
   RESULT_JSON = 'result.json'
-
+  OOP_FAILURES_JSON = 'oop_failures.json'
+  NON_OOP_FAILURES_JSON = 'non_oop_failures.json'
   INDEX_HTML = 'index.html'
   SUMMARY_HTML = 'summary.html'

@@ -358,7 +360,7 @@
             color='yellow')

     for node in self.nonleaf_nodes:
-      node._result = {c.id: c.result for c in node.
+      node._result = {c.id: c.result for c in node.leaf_nodes}  # pylint: disable=protected-access
       if should_save:
         node.save(result=False, report=False)

@@ -540,13 +542,13 @@
           f'<div style="color: {text_color}; white-space: pre-wrap;'
           'padding: 10px; border: 1px solid; margin-top: 10px">'
       )
-      s.write(m.get('formatted_text', m.text))
+      s.write(html.escape(m.get('formatted_text', m.text)))
       if m.result is not None:
         s.write(
             '<div style="color: magenta; white-space: pre-wrap;'
             'padding: 10px; border: 1px solid; margin: 10px">'
         )
-        s.write(pg.format(m.result))
+        s.write(html.escape(pg.format(m.result)))
         s.write('</div>')
       if 'usage' in m.metadata:
         s.write(
@@ -753,10 +755,12 @@ class Evaluation(Evaluable):

   # Constants.
   CACHE_JSON = 'cache.json'
-
+  OOP_FAILURES_HTML = 'oop_failures.html'
+  NON_OOP_FAILURES_HTML = 'non_oop_failures.html'

   @functools.cached_property
   def hash(self) -> str:
+    """Returns the semantic-based hash of the evaluation."""
     if self.is_deterministic:
       identity = pg.format(self._identifiers(), compact=True)
     else:
@@ -805,6 +809,10 @@
     """Returns the complete rate."""
     return self.num_completed / self.num_examples

+  #
+  # Properties on failures.
+  #
+
   @property
   def failures(self) -> list[tuple[Any, Exception]]:
     """Returns the failed examples and their errors."""
@@ -815,6 +823,15 @@
     """Returns the number of failed examples."""
     return len(self.failures)

+  @functools.cached_property
+  def failure_breakdown(self) -> dict[str, int]:
+    """Returns the breakdown of failures."""
+    breakdown = collections.defaultdict(int)
+    for _, error in self.failures:
+      breakdown[_error_key(error)] += 1
+    sorted_items = sorted(breakdown.items(), key=lambda x: x[1], reverse=True)
+    return pg.Dict({x[0]: x[1] for x in sorted_items})
+
   @property
   def failure_rate(self) -> float:
     """Returns the failure rate in range [0, 1]."""
@@ -822,6 +839,46 @@
       return 0.0
     return self.num_failures / self.num_completed

+  @functools.cached_property
+  def oop_failures(self) -> list[tuple[Any, lf_structured.MappingError]]:
+    """Returns the OOP failures."""
+    return [item for item in self.failures
+            if isinstance(item[1], lf_structured.MappingError)]
+
+  @property
+  def num_oop_failures(self) -> int:
+    """Returns the number of OOP failures."""
+    return len(self.oop_failures)
+
+  @property
+  def oop_failure_rate(self) -> float:
+    """Returns the OOP failure rate in range [0, 1]."""
+    if self.num_completed == 0:
+      return 0.0
+    return self.num_oop_failures / self.num_completed
+
+  @functools.cached_property
+  def non_oop_failures(self) -> list[tuple[Any, Exception]]:
+    """Returns the non-OOP failures."""
+    return [item for item in self.failures
+            if not isinstance(item[1], lf_structured.MappingError)]
+
+  @property
+  def num_non_oop_failures(self) -> int:
+    """Returns the number of non-OOP failures."""
+    return len(self.non_oop_failures)
+
+  @property
+  def non_oop_failure_rate(self) -> float:
+    """Returns the non-OOP failure rate in range [0, 1]."""
+    if self.num_completed == 0:
+      return 0.0
+    return self.num_non_oop_failures / self.num_completed
+
+  #
+  # Properties on usage.
+  #
+
   @property
   def has_usage(self) -> bool:
     """Returns True if token usage is enabled."""
@@ -976,13 +1033,22 @@
     self._total_prompt_tokens = 0
     self._total_completion_tokens = 0
     self._num_usages = 0
+    self.__dict__.pop('oop_failures', None)
+    self.__dict__.pop('non_oop_failures', None)

   @property
-  def
-    """Returns the link to the failures page."""
+  def oop_failures_link(self) -> str | None:
+    """Returns the link to the OOP failures page."""
     if self.dir is None:
       return None
-    return self.link(os.path.join(self.dir, Evaluation.
+    return self.link(os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML))
+
+  @property
+  def non_oop_failures_link(self) -> str | None:
+    """Returns the link to the non-OOP failures page."""
+    if self.dir is None:
+      return None
+    return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))

   def _dryrun(
       self,
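A note on the __dict__.pop calls above: functools.cached_property stores its computed value in the instance __dict__ under the property's name, so popping that key is the standard way to force recomputation on next access. A minimal standalone sketch of the pattern (the Stats class below is hypothetical, not from langfun):

import functools


class Stats:
  """Hypothetical example of invalidating a functools.cached_property."""

  def __init__(self):
    self._values = []

  @functools.cached_property
  def total(self) -> int:
    # Computed on first access, then cached in self.__dict__['total'].
    return sum(self._values)

  def add(self, v: int) -> None:
    self._values.append(v)
    # Drop the cached value (if any) so the next access recomputes it.
    self.__dict__.pop('total', None)


s = Stats()
s.add(1)
print(s.total)  # 1
s.add(2)
print(s.total)  # 3, recomputed because add() invalidated the cache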
@@ -1011,23 +1077,34 @@
           color='green',
       )

-
-      output_message = copy.process(example, **(self.additional_args or {}))
-      if self.schema is None:
-        output = output_message.text
-      else:
-        output = output_message.result
+    error, output_message = None, None

-
+    try:
+      with lf.use_settings(debug=debug):
+        output_message = copy.process(example, **(self.additional_args or {}))
+        if self.schema is None:
+          output = output_message.text
+        else:
+          output = output_message.result
+
+      if verbose:
+        lf.console.write('')
+        lf.console.write(
+            str(output),
+            title='OUTPUT',
+            color='blue',
+        )
+    except lf_structured.MappingError as e:
       lf.console.write('')
       lf.console.write(
-          str(
-          title='
-          color='
+          str(e),
+          title='ERROR',
+          color='red',
       )
+      error = e

-    copy.audit(example, output_message,
-    result = copy.
+    copy.audit(example, output_message, error, dryrun=True)
+    result = copy.finalize()

     if verbose:
       lf.console.write('')
@@ -1087,7 +1164,7 @@
       self.cache.save()

     # Summarize result.
-    self._result = self.
+    self._result = self.finalize()
     if verbose:
       lf.console.write(
           str(self.result),
@@ -1143,13 +1220,13 @@
   def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
     return {
         'Model': self.lm.model_id,
-        'Succeeded':
-            progress.success_rate
+        'Succeeded': '%s (%d/%d)' % (
+            self._format_rate(progress.success_rate),
             progress.succeeded,
             progress.completed,
         ),
-        'Failed':
-            progress.failure_rate
+        'Failed': '%s (%d/%d)' % (
+            self._format_rate(progress.failure_rate),
             progress.failed,
             progress.completed,
         ),
@@ -1159,21 +1236,20 @@
     assert self.result is not None
     m = self.result.metrics
     return (
-
-        f'  Failures=%.{self.report_precision}f%% (%d/%d)'
+        'COMPLETED(%s): Successes=%s(%d/%d) Failures=%s (%d/%d)'
         % (
             run_status,
-            (1 - m.failure_rate)
+            self._format_rate(1 - m.failure_rate),
             m.total - m.failures,
             m.total,
-            m.failure_rate
+            self._format_rate(m.failure_rate),
             m.failures,
             m.total,
         )
     )

-  def
-    """
+  def finalize(self) -> pg.Dict:
+    """Finalizes the evaluation result."""
     if self.cache:
       cache_stats = dict(
           use_cache=True,
@@ -1210,12 +1286,18 @@
             total=self.num_completed,
             failures=self.num_failures,
             failure_rate=self.failure_rate,
+            oop_failures=self.num_oop_failures,
+            oop_failure_rate=self.oop_failure_rate,
+            non_oop_failures=self.num_non_oop_failures,
+            non_oop_failure_rate=self.non_oop_failure_rate,
+            failure_breakdown=self.failure_breakdown,
         ),
         usage=usage,
     )
     return result

-  def
+  def summary_card(self) -> str:
+    """Returns summary card in HTML."""
     s = io.StringIO()
     definition = _html_repr(self, compact=False, escape=True)
     s.write('<div><table><tr><td>')
@@ -1230,18 +1312,19 @@
     s.write(
         f'<a target="_blank" title="{definition}" '
         f'href="{self.index_link}">{self.hash}</a>'
+        f' [<a href="{self.link(self.dir)}">dir</a>]'
         '</td></tr><tr><td>'
     )
-    self.
+    self._render_summary_metrics(s)

     # Summarize average usage.
     if self.result.usage is not None:
-      self.
+      self._render_summary_usage(s)

     s.write('</td></tr></table></div>')
     return s.getvalue()

-  def
+  def _render_summary_usage(self, s: io.StringIO) -> None:
     """Renders usage in HTML."""
     usage = self.result.usage
     total = usage.total_prompt_tokens + usage.total_completion_tokens
@@ -1255,20 +1338,66 @@
         f'" style="color:gray">({total} tokens)</a>'
     )

-  def
+  def _render_summary_metrics(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
     m = self.result.metrics
+
+    # OOP failures.
+    oop_failure_title = f'OOP failures ({m.oop_failures}/{m.total})'
+    if m.oop_failures:
+      oop_failure_title += '&#013;'
+      for name, count in m.failure_breakdown.items():
+        if name.startswith('MappingError'):
+          oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+              name.removeprefix('MappingError.'),
+              self._format_rate(count / m.total),
+              count,
+              m.total,
+          )
+
+    extra_style = ''
+    if m.oop_failure_rate > 0.1 and m.oop_failures > 3:
+      extra_style = ';font-weight:bold'
     s.write(
-        '<a title="
+        '<a title="%s" href="%s" style="color:magenta%s">%s</a>'
         % (
-
-
-
-
+            oop_failure_title,
+            self.oop_failures_link,
+            extra_style,
+            self._format_rate(m.oop_failure_rate),
+        )
+    )
+    s.write(' | ')
+
+    # Non-OOP failures.
+    non_oop_failure_title = f'Non-OOP failures ({m.non_oop_failures}/{m.total})'
+    if m.non_oop_failures:
+      non_oop_failure_title += '&#013;'
+      for name, count in m.failure_breakdown.items():
+        if not name.startswith('MappingError'):
+          non_oop_failure_title += '&#013;%s: %s (%d/%d)' % (
+              name,
+              self._format_rate(count / m.total),
+              count,
+              m.total,
+          )
+
+    extra_style = ';font-weight:bold' if m.non_oop_failures > 0 else ''
+    s.write(
+        '<a title="%s" href="%s" style="color:red%s">%s</a>'
+        % (
+            non_oop_failure_title,
+            self.non_oop_failures_link,
+            extra_style,
+            self._format_rate(m.non_oop_failure_rate),
         )
     )

+  def _format_rate(self, rate: float) -> str:
+    """Formats a rate."""
+    return f'%.{self.report_precision}f%% ' % (rate * 100)
+
   def audit(
       self,
       example: Any,
@@ -1287,7 +1416,13 @@
       dryrun: Whether or not audition takes place during dryrun.
     """
     if error is not None:
-      self._failures.append((example,
+      self._failures.append((example, error))
+
+      # Invalidate cached properties derived from failures.
+      self.__dict__.pop('oop_failures', None)
+      self.__dict__.pop('non_oop_failures', None)
+      self.__dict__.pop('failure_breakdown', None)
+
     if isinstance(error, lf_structured.MappingError):
       message = error.lm_response
     else:
@@ -1333,16 +1468,26 @@
     # Save failures.
     pg.save(
         [
-            pg.Dict(
-
-
-
+            pg.Dict(input=input, error=_format_error(error))
+            for input, error in self.oop_failures
+        ],
+        os.path.join(self.dir, Evaluation.OOP_FAILURES_JSON),
+    )
+    pg.save(
+        self._html([self._render_result, self._render_oop_failures]),
+        os.path.join(self.dir, Evaluation.OOP_FAILURES_HTML),
+        file_format='txt',
+    )
+    pg.save(
+        [
+            pg.Dict(input=input, error=_format_error(error))
+            for input, error in self.non_oop_failures
         ],
-        os.path.join(self.dir, Evaluation.
+        os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_JSON),
     )
     pg.save(
-        self._html([self._render_result, self.
-        os.path.join(self.dir, Evaluation.
+        self._html([self._render_result, self._render_non_oop_failures]),
+        os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML),
         file_format='txt',
     )

@@ -1357,7 +1502,8 @@
     )
     if self.result.usage is not None:
       s.write('<td>Usage</td>')
-    s.write('<td>Failures</td>')
+    s.write('<td>OOP Failures</td>')
+    s.write('<td>Non-OOP Failures</td>')

   def _render_result_row(self, s: io.StringIO) -> None:
     s.write(
@@ -1385,16 +1531,29 @@
     # Usage.
     if self.result.usage is not None:
       s.write('<td>')
-      self.
+      self._render_summary_usage(s)
       s.write('</td>')

-    #
+    # OOP failures.
+    s.write(
+        '<td><span style="color:magenta">%s</span>%s</td>'
+        % (
+            self._format_rate(self.oop_failure_rate),
+            '<a href="%s">(%d/%d)</a>'
+            % (self.oop_failures_link,
+               self.num_oop_failures,
+               self.num_completed),
+        )
+    )
+    # Non-OOP failures.
     s.write(
-        '<td><span style="color:
+        '<td><span style="color:red">%s</span>%s</td>'
         % (
-
+            self._format_rate(self.non_oop_failure_rate),
             '<a href="%s">(%d/%d)</a>'
-            % (self.
+            % (self.non_oop_failures_link,
+               self.num_non_oop_failures,
+               self.num_completed),
         )
     )

@@ -1408,24 +1567,77 @@
     else:
       return 'cyan'

-  def
+  def _render_oop_failures(self, s: io.StringIO) -> None:
+    self._render_failures(s, '^MappingError.*', error_color='magenta')
+
+  def _render_non_oop_failures(self, s: io.StringIO) -> None:
+    self._render_failures(s, '^(?!MappingError).*', error_color='red')
+
+  def _render_failures(
+      self, s: io.StringIO, error_regex: str, error_color: str) -> None:
     """Formats the failed cases into html."""
+    # Failure summary.
     s.write(
-        '<h2>
+        '<h2> Error Summary </h2>'
         '<div style="white-space:pre">\n'
         '<table style="border:1px solid">'
-        '<tr class="header"><td>
+        '<tr class="header"><td>Error type</td><td>Stats</td></tr>'
     )
+    error_regex = re.compile(error_regex)
+    if self.result.metrics.failure_breakdown:
+      for name, count in self.result.metrics.failure_breakdown.items():
+        if not error_regex.match(name):
+          continue
+
+        link = f'<a href="#{name}">{name}</a>'
+        error_rate = self._format_rate(count / self.result.metrics.total)
+        stats = (f'<span style="color:{error_color}">{error_rate} '
+                 f'({count}/{self.result.metrics.total})</span>')
+        s.write(f'<tr><td>{link}</td><td>{stats}</td></tr>')
+    s.write(
+        '</table></div>'
+        '<h2> Failed Cases </h2>'
+        '<div style="white-space:pre">'
+    )
+    # Failure details by error type.
+    failures_by_error = collections.defaultdict(list)
+    for example, error in self.failures:
+      error_name = _error_key(error)
+      if error_regex.match(error_name):
+        failures_by_error[error_name].append((example, error))
+
+    for error_key, failures in failures_by_error.items():
+      s.write(
+          f'<h3 id="{error_key}"><a href="#{error_key}">{error_key}</a> '
+          f'(count={len(failures)})</h3>'
+          '<table style="border:1px solid">'
+          '<tr class="header"><td>No.</td><td>Input</td>'
+          '<td>LM invocation</td><td>Error</td></tr>'
+      )
+      for i, (example, error) in enumerate(failures):
+        lm_response = None
+        if isinstance(error, lf.structured.MappingError):
+          lm_response = error.lm_response
+          error = error.cause
+
+        bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
+        s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
+        s.write('<td style="color:green;white-space:pre-wrap">')
+        s.write(pg.format(example, verbose=False))
+        s.write('</td><td>')
+        if lm_response is not None:
+          self._render_message(lm_response, s)
+        s.write(f'</td><td style="color:{error_color};white-space:pre">')
+        s.write(_format_error(error))
+        s.write('</td></tr>')
+      s.write('</table>')
+    s.write('</div>')

-
-
-
-
-
-    error_str = lf.text_formatting.decolored(str(error))
-    s.write(f'<td style="color:red;white-space:pre">{error_str}</td>')
-    s.write('</tr>')
-    s.write('</table></div>')
+  @classmethod
+  def visualize(cls, evaluations: list['Evaluation']) -> str | None:
+    """Visualizes a list of evaluations of this task in HTML."""
+    del evaluations
+    return None


 @pg.functor()
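The split between the two failure pages above is driven entirely by regular expressions over the breakdown keys: '^MappingError.*' selects OOP failures, while the negative lookahead '^(?!MappingError).*' matches every other key. A quick standalone check of that behavior (the keys below are illustrative):

import re

oop = re.compile('^MappingError.*')
non_oop = re.compile('^(?!MappingError).*')

keys = ['MappingError.SchemaError.TypeError', 'ValueError', 'TimeoutError']
print([k for k in keys if oop.match(k)])
# ['MappingError.SchemaError.TypeError']
print([k for k in keys if non_oop.match(k)])
# ['ValueError', 'TimeoutError']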
@@ -1578,7 +1790,7 @@ class Summary(pg.Object):
         if e is None:
           s.write('<span style="color: gray">N/A<span>')
         else:
-          s.write(e.
+          s.write(e.summary_card())
         s.write('</td>')
       s.write('</tr>')
     s.write('</table>')
@@ -1653,13 +1865,22 @@
     s.write('<html><body>')
     for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
       table_id = task.__name__.lower()
+      evaluations = self.select(task=task).evaluations
+      table = Summary.Table.from_evaluations(evaluations, pivot_field)
       s.write('<div>')
-      s.write(
-
-
-      table = Summary.Table.from_evaluations(
-          self.select(task=task).evaluations, pivot_field
+      s.write(
+          f'<a id="{table_id}" href="#{table_id}">'
+          f'<h2>{task.__name__}</h2></a>'
       )
+
+      # Allow users to plug in visualization code (e.g. matplotlib) in the
+      # summary page.
+      visual_part = task.visualize(evaluations)
+      if visual_part:
+        s.write(visual_part)
+
+      s.write(f'<h4 style="color:gray">{len(evaluations)} experiments</h4>')
+      s.write('<hr/>')
       s.write(table.html())
       s.write('</div>')
     s.write('</body></html>')
@@ -1685,6 +1906,7 @@
               experiment=entry,
               dir=entry.dir,
               metrics=entry.result.metrics if entry.result else None,
+              usage=entry.result.usage if entry.result else None,
           )
       )
     task_results[task.__name__] = results
@@ -1833,6 +2055,21 @@
   return result.join()


+def _format_error(error: Exception):
+  """Formats an error into a string."""
+  return (f'({error.__class__.__name__}) '
+          + lf.text_formatting.decolored(str(error)))
+
+
+def _error_key(error: Exception) -> str:
+  """Returns the key for an error."""
+  error_names = []
+  while error is not None:
+    error_names.append(error.__class__.__name__)
+    error = getattr(error, 'cause', None)
+  return '.'.join(error_names)
+
+
 def _html_repr(value: Any, compact: bool = True, escape: bool = False) -> str:
   """Formats prompt in HTML."""
   if type(value) is lf.Template:  # pylint: disable=unidiomatic-typecheck
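The _error_key helper above is what produces breakdown keys like 'MappingError.SchemaError.TypeError' seen in the updated tests: it walks the chain of cause attributes and joins the class names. A self-contained sketch of that behavior (the ChainedError stand-in and the error subclasses below are hypothetical; langfun's real MappingError exposes a cause link the same way):

class ChainedError(Exception):
  """Hypothetical error type exposing a `cause` link, like MappingError."""

  def __init__(self, message: str, cause: Exception | None = None):
    super().__init__(message)
    self.cause = cause


def _error_key(error: Exception) -> str:
  """Mirrors the helper added in base.py."""
  error_names = []
  while error is not None:
    error_names.append(error.__class__.__name__)
    error = getattr(error, 'cause', None)  # stops at errors without a cause
  return '.'.join(error_names)


class MappingError(ChainedError):
  pass


class SchemaError(ChainedError):
  pass


e = MappingError('mapping failed', SchemaError('bad schema', TypeError('x')))
print(_error_key(e))  # MappingError.SchemaError.TypeError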
langfun/core/eval/base_test.py
CHANGED
@@ -220,7 +220,18 @@ class EvaluationTest(unittest.TestCase):
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
             usage=dict(
                 total_prompt_tokens=774,
                 total_completion_tokens=25,
@@ -235,12 +246,20 @@
         os.path.exists(os.path.join(s.dir, base.Evaluation.EXPERIMENT_JSON)))
     self.assertTrue(
         os.path.exists(os.path.join(s.dir, base.Evaluation.RESULT_JSON)))
+    self.assertTrue(
+        os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_JSON)))
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_JSON)))
     self.assertTrue(
         os.path.exists(os.path.join(s.dir, base.Evaluation.CACHE_JSON)))
     self.assertTrue(
         os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
     self.assertTrue(
-        os.path.exists(os.path.join(s.dir, base.Evaluation.
+        os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))
     self.assertTrue(
         os.path.exists(os.path.join(s.root_dir, base.Evaluation.SUMMARY_HTML))
     )
@@ -274,7 +293,10 @@
     self.assertFalse(
         os.path.exists(os.path.join(s.dir, base.Evaluation.INDEX_HTML)))
     self.assertFalse(
-        os.path.exists(os.path.join(s.dir, base.Evaluation.
+        os.path.exists(os.path.join(s.dir, base.Evaluation.OOP_FAILURES_HTML)))
+    self.assertFalse(
+        os.path.exists(
+            os.path.join(s.dir, base.Evaluation.NON_OOP_FAILURES_HTML)))

   def test_load(self):
     lm = fake.StaticResponse('Solution(final_answer=2)')
@@ -312,7 +334,16 @@
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=0,
+                failure_rate=0.0,
+                oop_failures=0,
+                oop_failure_rate=0.0,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={},
+            ),
             usage=s.children[1].result.usage,
         ),
     },
@@ -363,7 +394,18 @@
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
             usage=s.children[0].result.usage,
         ),
         s.children[1].id: dict(
@@ -378,7 +420,18 @@
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
             usage=s.children[1].result.usage,
         ),
     },
@@ -475,7 +528,7 @@ class SuiteTest(unittest.TestCase):
     self.assertEqual(s.hash, '26e6cc25')
     s.run()
     expected = {
-
+        'Evaluation@0fade07d': dict(
             experiment_setup=dict(
                 id=s.children[0].id,
                 dir=s.children[0].dir,
@@ -487,48 +540,46 @@
             cache_stats=dict(
                 use_cache=True, num_queries=2, num_hits=0, num_updates=2
             ),
-            metrics=dict(
+            metrics=dict(
+                total=2,
+                failures=1,
+                failure_rate=0.5,
+                oop_failures=1,
+                oop_failure_rate=0.5,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 1
+                }
+            ),
             usage=s.children[0].result.usage,
         ),
-
-
-
-
-
-
-
-
-                prompt_template='{{example.question}}',
-                method='call',
-                schema_fn='answer_schema()',
-            ),
-            cache_stats=dict(
-                use_cache=True, num_queries=4, num_hits=1, num_updates=3
-            ),
-            metrics=dict(total=2, failures=2, failure_rate=1.0),
-            usage=s.children[1].children[0].result.usage,
+        'Evaluation@ae86c703': dict(
+            experiment_setup=dict(
+                id=s.children[1].children[0].id,
+                dir=s.children[1].children[0].dir,
+                model='StaticSequence',
+                prompt_template='{{example.question}}',
+                method='call',
+                schema_fn='answer_schema()',
             ),
-
-
-        .id: dict(
-            experiment_setup=dict(
-                id=s.children[1].children[2].id,
-                dir=s.children[1].children[2].dir,
-                model='StaticSequence',
-                prompt_template='{{example.question}}',
-                method='query',
-                schema_fn='answer_schema()',
-            ),
-            cache_stats=dict(
-                use_cache=True,
-                num_queries=2,
-                num_hits=0,
-                num_updates=2,
-            ),
-            metrics=dict(total=2, failures=1, failure_rate=0.5),
-            usage=s.children[1].children[2].result.usage,
+            cache_stats=dict(
+                use_cache=True, num_queries=4, num_hits=1, num_updates=3
             ),
-
+            metrics=dict(
+                total=2,
+                failures=2,
+                failure_rate=1.0,
+                oop_failures=2,
+                oop_failure_rate=1.0,
+                non_oop_failures=0,
+                non_oop_failure_rate=0.0,
+                failure_breakdown={
+                    'MappingError.SchemaError.TypeError': 2
+                }
+            ),
+            usage=s.children[1].children[0].result.usage,
+        ),
     }
     self.assertEqual(s.result, expected)
langfun/core/eval/matching.py
CHANGED
@@ -119,18 +119,18 @@ class Matching(base.Evaluation):
     del progress
     return {
         'Model': self.lm.model_id,
-        'Matches':
-            self.match_rate
+        'Matches': '%s (%d/%d)' % (
+            self._format_rate(self.match_rate),
             self.num_matches,
             self.num_completed,
         ),
-        'Mismatches':
-            self.mismatch_rate
+        'Mismatches': '%s (%d/%d)' % (
+            self._format_rate(self.mismatch_rate),
             self.num_mismatches,
             self.num_completed,
         ),
-        'Failed':
-            self.failure_rate
+        'Failed': '%s (%d/%d)' % (
+            self._format_rate(self.failure_rate),
             self.num_failures,
             self.num_completed,
         ),
@@ -140,24 +140,25 @@
     assert self.result is not None
     m = self.result.metrics
     return (
-
-
-
+        'COMPLETED(%s):'
+        ' Matches=%s (%d/%d)'
+        ' Mismatches=%s (%d/%d)'
+        ' Failures=%s (%d/%d)'
     ) % (
         run_status,
-        m.match_rate
+        self._format_rate(m.match_rate),
         m.num_matches,
         m.total,
-        m.mismatch_rate
+        self._format_rate(m.mismatch_rate),
        m.num_mismatches,
        m.total,
-        m.failure_rate
+        self._format_rate(m.failure_rate),
        m.failures,
        m.total,
    )

-  def
-    result = super().
+  def finalize(self) -> pg.Dict:
+    result = super().finalize()
     result.metrics.update(
         num_matches=self.num_matches,
         match_rate=self.match_rate,
@@ -218,9 +219,9 @@
   def _render_result_row(self, s: io.StringIO):
     super()._render_result_row(s)
     s.write(
-        '<td><span style="color:
+        '<td><span style="color:orange">%s</span>%s</td>'
         % (
-
+            self._format_rate(self.mismatch_rate),
             '<a href="%s">(%d/%d)</a>'
             % (self.mismatches_link, self.num_mismatches, self.num_completed),
         )
@@ -228,13 +229,13 @@
     s.write(
         '<td><span style="color:green">%s</span>%s</td>'
         % (
-
+            self._format_rate(self.match_rate),
             '<a href="%s">(%d/%d)</a>'
             % (self.matches_link, self.num_matches, self.num_completed),
         )
     )

-  def
+  def _render_summary_metrics(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
     m = self.result.metrics
@@ -244,7 +245,7 @@
             m.num_matches,
             m.total,
             self.matches_link,
-
+            self._format_rate(m.match_rate),
         )
     )
     s.write(' | ')
@@ -254,11 +255,11 @@
             m.num_mismatches,
             m.total,
             self.mismatches_link,
-
+            self._format_rate(m.mismatch_rate),
         )
     )
     s.write(' | ')
-    super().
+    super()._render_summary_metrics(s)

   def _render_matches(self, s: io.StringIO) -> None:
     """Formats the matched cases into html."""
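All of the percentage displays in this file now route through the _format_rate helper added to base.py, so report_precision controls every rate uniformly. A quick illustration of the formatting, assuming report_precision = 3 (the attribute lives on the evaluation class; the standalone function below is just a sketch of the same shape):

report_precision = 3  # assumed value of self.report_precision


def _format_rate(rate: float) -> str:
  # Same shape as the helper in base.py, including its trailing space.
  return f'%.{report_precision}f%% ' % (rate * 100)


print('Matches: %s (%d/%d)' % (_format_rate(0.5), 2, 4))
# Matches: 50.000%  (2/4)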
langfun/core/eval/matching_test.py
CHANGED
@@ -120,6 +120,13 @@ class MatchingTest(unittest.TestCase):
             total=4,
             failures=1,
             failure_rate=0.25,
+            oop_failures=1,
+            oop_failure_rate=0.25,
+            non_oop_failures=0,
+            non_oop_failure_rate=0.0,
+            failure_breakdown={
+                'MappingError.SchemaError.TypeError': 1
+            },
             num_matches=2,
             match_rate=0.5,
             num_mismatches=1,
@@ -160,7 +167,14 @@
     self.assertTrue(
         os.path.exists(
             os.path.join(
-                s.dir, matching.Matching.
+                s.dir, matching.Matching.OOP_FAILURES_JSON
+            )
+        )
+    )
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(
+                s.dir, matching.Matching.NON_OOP_FAILURES_JSON
             )
         )
     )
@@ -175,7 +189,14 @@
     self.assertTrue(
         os.path.exists(
             os.path.join(
-                s.dir, matching.Matching.
+                s.dir, matching.Matching.OOP_FAILURES_HTML
+            )
+        )
+    )
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(
+                s.dir, matching.Matching.NON_OOP_FAILURES_HTML
             )
         )
     )
langfun/core/eval/scoring.py
CHANGED
@@ -113,8 +113,8 @@ class Scoring(base.Evaluation):
             m.total,
         )

-  def
-    result = super().
+  def finalize(self) -> pg.Dict:
+    result = super().finalize()
     result.metrics.update(
         num_scored=self.num_scored,
         score_rate=self.score_rate,
@@ -168,7 +168,7 @@
         )
     )

-  def
+  def _render_summary_metrics(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
     m = self.result.metrics
@@ -182,7 +182,7 @@
         )
     )
     s.write(' | ')
-    super().
+    super()._render_summary_metrics(s)

   def _render_scored(self, s: io.StringIO) -> None:
     """Formats the matched cases into html."""
langfun/core/eval/scoring_test.py
CHANGED
@@ -98,6 +98,11 @@ class ScoringTest(unittest.TestCase):
             total=2,
             failures=0,
             failure_rate=0.0,
+            oop_failures=0,
+            oop_failure_rate=0.0,
+            non_oop_failures=0,
+            non_oop_failure_rate=0.0,
+            failure_breakdown={},
             num_scored=2,
             score_rate=1.0,
             avg_score=0.5,
@@ -124,7 +129,12 @@
     )
     self.assertTrue(
         os.path.exists(
-            os.path.join(s.dir, scoring.Scoring.
+            os.path.join(s.dir, scoring.Scoring.OOP_FAILURES_JSON)
+        )
+    )
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(s.dir, scoring.Scoring.NON_OOP_FAILURES_JSON)
         )
     )
     self.assertTrue(
@@ -143,7 +153,14 @@
     self.assertTrue(
         os.path.exists(
             os.path.join(
-                s.dir, scoring.Scoring.
+                s.dir, scoring.Scoring.OOP_FAILURES_HTML
+            )
+        )
+    )
+    self.assertTrue(
+        os.path.exists(
+            os.path.join(
+                s.dir, scoring.Scoring.NON_OOP_FAILURES_HTML
             )
         )
     )
langfun/core/llms/openai.py
CHANGED
@@ -234,7 +234,7 @@ class OpenAI(lf.LanguageModel):
         if isinstance(chunk, str):
           item = dict(type='text', text=chunk)
         elif isinstance(chunk, lf_modalities.Image) and chunk.uri:
-          item = dict(type='image_url', image_url=chunk.uri)
+          item = dict(type='image_url', image_url=dict(url=chunk.uri))
         else:
           raise ValueError(f'Unsupported modality object: {chunk!r}.')
         content.append(item)
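For context on the one-line change above: in the OpenAI chat completions vision format, an image_url content part is an object with a url field rather than a bare string, which is what the fix now produces. A sketch of the resulting message payload (the URL and text are illustrative):

# Before the fix, the image part serialized as:
#   {'type': 'image_url', 'image_url': 'https://example.com/cat.png'}
# After the fix it matches the nested shape the API expects:
message = {
    'role': 'user',
    'content': [
        {'type': 'text', 'text': 'What is in this image?'},
        {
            'type': 'image_url',
            'image_url': {'url': 'https://example.com/cat.png'},
        },
    ],
}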
langfun/core/llms/openai_test.py
CHANGED
@@ -66,7 +66,8 @@ def mock_chat_completion_query_vision(messages, *, n=1, **kwargs):
   del kwargs
   choices = []
   urls = [
-      c['image_url']
+      c['image_url']['url']
+      for c in messages[0]['content'] if c['type'] == 'image_url'
   ]
   for k in range(n):
     choices.append(pg.Dict(
{langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/RECORD
CHANGED
@@ -40,12 +40,12 @@ langfun/core/coding/python/parsing_test.py,sha256=9vAWF484kWIm6JZq8NFiMgKUDhXV-d
 langfun/core/coding/python/permissions.py,sha256=1QWGHvzL8MM0Ok_auQ9tURqZHtdOfJaDpBzZ29GUE-c,2544
 langfun/core/coding/python/permissions_test.py,sha256=w5EDb8QxpxgJyZkojyzVWQvDfg366zn99-g__6TbPQ0,2699
 langfun/core/eval/__init__.py,sha256=NSmPe2lxdxFoY4h8VkNyONPAFtOTUpK9WhmZRaqUgiI,1335
-langfun/core/eval/base.py,sha256=
-langfun/core/eval/base_test.py,sha256=
-langfun/core/eval/matching.py,sha256=
-langfun/core/eval/matching_test.py,sha256=
-langfun/core/eval/scoring.py,sha256=
-langfun/core/eval/scoring_test.py,sha256=
+langfun/core/eval/base.py,sha256=ImIdyjh89yWUbFoSI12xzpcSmvB34y8_F0WAcUi-4sg,68405
+langfun/core/eval/base_test.py,sha256=SEo43ftMscpZ5QV6AGaywrA6SobVaG_P7sUbjoBGqg8,24081
+langfun/core/eval/matching.py,sha256=Y4vFoNTQEOwko6IA8l9OZ52-vt52e3VGmcTtvLA67wM,9782
+langfun/core/eval/matching_test.py,sha256=f7iVyXH5KGJBWt4Wp14Bt9J3X59A6Ayfog9MbuFvPew,5532
+langfun/core/eval/scoring.py,sha256=1J7IATo-8FXUR0SBqk9icztHiM0lWkBFcWUo-vUURgQ,6376
+langfun/core/eval/scoring_test.py,sha256=O8olHbrUEg60gMxwOkWzKBJZpZoUlmVnBANX5Se2SXM,4546
 langfun/core/llms/__init__.py,sha256=1bPg1QI8duOZCYINm-jWi094x0JtLmsk4KX60qIC_gs,3245
 langfun/core/llms/anthropic.py,sha256=7W9YdPN3SlAFhAIQlihMkrpo7tTY_4NvD0KIlCrqcsk,8505
 langfun/core/llms/anthropic_test.py,sha256=TMM30myyEhwF99Le4RvJEXOn8RYl0q1FRkt9Q9nl1jk,5540
@@ -57,8 +57,8 @@ langfun/core/llms/groq.py,sha256=NaGItVL_pkOpqPpI4bPGU27xLFRoaeizZ49v2s-4ERs,784
 langfun/core/llms/groq_test.py,sha256=M6GtlrsOvDun_j-sR8cPh4W_moHWZNSTiThu3kuwbbc,5281
 langfun/core/llms/llama_cpp.py,sha256=Y_KkMUf3Xfac49koMUtUslKl3h-HWp3-ntq7Jaa3bdo,2385
 langfun/core/llms/llama_cpp_test.py,sha256=ZxC6defGd_HX9SFRU9U4cJiQnBKundbOrchbXuC1Z2M,1683
-langfun/core/llms/openai.py,sha256=
-langfun/core/llms/openai_test.py,sha256=
+langfun/core/llms/openai.py,sha256=rPwO4qPGEwbB4O7TaQD0spg_PXIfF2ioRI_ilE3Pg6Y,13257
+langfun/core/llms/openai_test.py,sha256=asSA1sVy_7hnXioD_2HTxtSDpVTKBUO_EjZuyHpwbn0,14854
 langfun/core/llms/cache/__init__.py,sha256=QAo3InUMDM_YpteNnVCSejI4zOsnjSMWKJKzkb3VY64,993
 langfun/core/llms/cache/base.py,sha256=cFfYvOIUae842pncqCAsRvqXCk2AnAsRYVx0mcIoAeY,3338
 langfun/core/llms/cache/in_memory.py,sha256=YfFyJEhLs73cUiB0ZfhMxYpdE8Iuxxw-dvMFwGHTSHw,4742
@@ -101,8 +101,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
 langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
 langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
 langfun/core/templates/selfplay_test.py,sha256=DYVrkk7uNKCqJGEHH31HssU2BPuMItU1vJLzfcXIlYg,2156
-langfun-0.0.2.
-langfun-0.0.2.
-langfun-0.0.2.
-langfun-0.0.2.
-langfun-0.0.2.
+langfun-0.0.2.dev20240430.dist-info/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+langfun-0.0.2.dev20240430.dist-info/METADATA,sha256=RpEIB1auHihqOoDrPnFQaYqgpqxFKA9_Z9iuCfPxe5s,3436
+langfun-0.0.2.dev20240430.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
+langfun-0.0.2.dev20240430.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
+langfun-0.0.2.dev20240430.dist-info/RECORD,,
{langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/LICENSE
File without changes
{langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/WHEEL
File without changes
{langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240430.dist-info}/top_level.txt
File without changes