langfun 0.0.2.dev20240319__py3-none-any.whl → 0.0.2.dev20240429__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +2 -0
- langfun/core/__init__.py +1 -0
- langfun/core/coding/python/correction.py +0 -7
- langfun/core/component.py +6 -0
- langfun/core/component_test.py +1 -0
- langfun/core/eval/__init__.py +2 -0
- langfun/core/eval/base.py +240 -37
- langfun/core/eval/base_test.py +52 -18
- langfun/core/eval/matching.py +26 -9
- langfun/core/eval/matching_test.py +3 -4
- langfun/core/eval/scoring.py +15 -6
- langfun/core/eval/scoring_test.py +2 -2
- langfun/core/langfunc.py +0 -5
- langfun/core/langfunc_test.py +6 -4
- langfun/core/language_model.py +124 -24
- langfun/core/language_model_test.py +249 -26
- langfun/core/llms/__init__.py +24 -5
- langfun/core/llms/anthropic.py +263 -0
- langfun/core/llms/anthropic_test.py +167 -0
- langfun/core/llms/cache/in_memory_test.py +37 -28
- langfun/core/llms/fake.py +31 -22
- langfun/core/llms/fake_test.py +122 -11
- langfun/core/llms/{gemini.py → google_genai.py} +117 -15
- langfun/core/llms/{gemini_test.py → google_genai_test.py} +83 -15
- langfun/core/llms/groq.py +260 -0
- langfun/core/llms/groq_test.py +170 -0
- langfun/core/llms/llama_cpp.py +3 -1
- langfun/core/llms/openai.py +97 -79
- langfun/core/llms/openai_test.py +285 -59
- langfun/core/modalities/video.py +5 -2
- langfun/core/structured/__init__.py +3 -0
- langfun/core/structured/completion_test.py +2 -2
- langfun/core/structured/function_generation.py +245 -0
- langfun/core/structured/function_generation_test.py +329 -0
- langfun/core/structured/mapping.py +59 -3
- langfun/core/structured/mapping_test.py +17 -0
- langfun/core/structured/parsing.py +2 -1
- langfun/core/structured/parsing_test.py +18 -13
- langfun/core/structured/prompting.py +27 -6
- langfun/core/structured/prompting_test.py +79 -12
- langfun/core/structured/schema.py +25 -22
- langfun/core/structured/schema_generation.py +2 -3
- langfun/core/structured/schema_generation_test.py +2 -2
- langfun/core/structured/schema_test.py +42 -27
- langfun/core/template.py +125 -10
- langfun/core/template_test.py +75 -0
- langfun/core/templates/selfplay_test.py +6 -2
- {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/METADATA +3 -2
- {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/RECORD +52 -46
- {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/WHEEL +0 -0
- {langfun-0.0.2.dev20240319.dist-info → langfun-0.0.2.dev20240429.dist-info}/top_level.txt +0 -0
langfun/__init__.py
CHANGED
@@ -34,6 +34,7 @@ score = structured.score
 generate_class = structured.generate_class
 
 source_form = structured.source_form
+function_gen = structured.function_gen
 
 from langfun.core import eval  # pylint: disable=redefined-builtin
 from langfun.core import templates
@@ -54,6 +55,7 @@ Video = modalities.Video
 PDF = modalities.PDF
 
 # Error types.
+MappingError = structured.MappingError
 SchemaError = structured.SchemaError
 JsonError = structured.JsonError
 CodeError = coding.CodeError
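Note (illustrative sketch, not part of the diff): with `MappingError` now re-exported at the top level, structured-output failures can be caught directly from the `langfun` package. The model class and prompt below are assumptions for illustration and require a configured API key.

    import langfun as lf

    try:
      # Mapping the LM response to the requested schema (int) may raise
      # lf.MappingError when the response cannot be parsed.
      answer = lf.query('What is 1 + 1?', int, lm=lf.llms.Gpt35Turbo())
      print(answer)
    except lf.MappingError as e:
      print('Structured mapping failed:', e)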
langfun/core/__init__.py
CHANGED
@@ -99,6 +99,7 @@ from langfun.core.modality import ModalityRef
 from langfun.core.language_model import LanguageModel
 from langfun.core.language_model import LMSample
 from langfun.core.language_model import LMSamplingOptions
+from langfun.core.language_model import LMSamplingUsage
 from langfun.core.language_model import LMSamplingResult
 from langfun.core.language_model import LMScoringResult
 from langfun.core.language_model import LMCache
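Note (illustrative sketch, not part of the diff): the newly exported `LMSamplingUsage` describes per-call token counts. Models that report usage attach it to the response message's `usage` metadata, which the evaluation changes below read as `m.usage.prompt_tokens` and friends. The model choice here is an assumption and needs valid credentials; not every backend populates usage.

    import langfun as lf

    lm = lf.llms.GeminiPro()  # any usage-reporting model
    response = lm('Hello!')
    if 'usage' in response.metadata:
      usage = response.usage  # an lf.LMSamplingUsage
      print(usage.prompt_tokens, usage.completion_tokens, usage.total_tokens)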
langfun/core/coding/python/correction.py
CHANGED
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Python code error correction."""
-import re
 from typing import Any
 import langfun.core as lf
 from langfun.core.coding.python import errors
@@ -31,11 +30,6 @@ class CorrectedCode(pg.Object):
   corrected_code: str
 
 
-def remove_docstrings(code):
-  pattern = re.compile(r"(def .+?:\s*?)('''|\"\"\")((.|\s)*?)(\2)", re.DOTALL)
-  return pattern.sub(r"\1", code)
-
-
 def run_with_correction(
     code: str,
     error: str | None = None,
@@ -86,7 +80,6 @@ def run_with_correction(
   # pytype: enable=import-error
   # pylint: enable=g-import-not-at-top
 
-  code = remove_docstrings(code)
   if max_attempts == 0:
     result = execution.run(
         code,
langfun/core/component.py
CHANGED
@@ -210,6 +210,12 @@ def get_contextual_override(var_name: str) -> ContextualOverride | None:
   return _get_scoped_value(_global_tls, _CONTEXT_OVERRIDES, var_name)
 
 
+def all_contextual_values() -> dict[str, Any]:
+  """Returns all contextual values provided from `lf.context` in scope."""
+  overrides = getattr(_global_tls, _CONTEXT_OVERRIDES, {})
+  return {k: v.value for k, v in overrides.items()}
+
+
 @contextlib.contextmanager
 def _contextual_scope(
     tls: threading.local, tls_key, **variables
langfun/core/component_test.py
CHANGED
@@ -84,6 +84,7 @@ class ComponentContextTest(unittest.TestCase):
         lf.get_contextual_override('y'),
         lf.ContextualOverride(3, cascade=False, override_attrs=False),
     )
+    self.assertEqual(lf.all_contextual_values(), dict(x=3, y=3, z=3))
 
     # Member attributes take precedence over `lf.context`.
     self.assertEqual(a1.x, 1)
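Note (illustrative sketch, not part of the diff): the new `all_contextual_values()` helper returns every override currently in scope under `lf.context` as a plain dict, as the test above exercises.

    import langfun as lf

    with lf.context(x=3, y=3, z=3):
      # All contextual overrides visible in this scope, as plain values.
      print(lf.all_contextual_values())  # {'x': 3, 'y': 3, 'z': 3}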
langfun/core/eval/__init__.py
CHANGED
@@ -16,6 +16,8 @@
 # pylint: disable=g-importing-member
 # pylint: disable=g-bad-import-order
 
+from langfun.core.eval.base import app_run
+
 from langfun.core.eval.base import Evaluable
 from langfun.core.eval.base import Evaluation
 from langfun.core.eval.base import Suite
langfun/core/eval/base.py
CHANGED
@@ -26,7 +26,10 @@ import threading
 import time
 from typing import Annotated, Any, Callable, Iterator, Literal, Optional, Sequence, Type, Union
 
+from absl import app
+from absl import flags
 import langfun.core as lf
+import langfun.core.coding as lf_coding
 from langfun.core.llms.cache import in_memory
 import langfun.core.structured as lf_structured
 import pyglove as pg
@@ -41,14 +44,6 @@ class Evaluable(lf.Component):
   INDEX_HTML = 'index.html'
   SUMMARY_HTML = 'summary.html'
 
-  id: Annotated[
-      str,
-      (
-          'The ID of the evaluation, which should be unique across all '
-          'evaluations.'
-      ),
-  ]
-
   root_dir: Annotated[
       str | None,
       (
@@ -61,6 +56,18 @@ class Evaluable(lf.Component):
       int, 'Number of decimals when reporting precision.'
   ] = lf.contextual(default=1)
 
+  @property
+  @abc.abstractmethod
+  def id(self) -> str:
+    """Returns the ID of the task.
+
+    Returns:
+      Evaluation task ID. Different evaluation task should have their unique
+      task IDs, for each task will be stored in sub-directoreis identified by
+      their IDs. For suites, the ID could be an empty string as they will not
+      produce sub-directories
+    """
+
   @property
   def dir(self) -> str | None:
     """Returns the directory for saving results and details."""
@@ -533,7 +540,7 @@ class Evaluable(lf.Component):
           f'<div style="color: {text_color}; white-space: pre-wrap;'
           'padding: 10px; border: 1px solid; margin-top: 10px">'
       )
-      s.write(m.text)
+      s.write(m.get('formatted_text', m.text))
       if m.result is not None:
         s.write(
             '<div style="color: magenta; white-space: pre-wrap;'
@@ -541,6 +548,16 @@ class Evaluable(lf.Component):
         )
         s.write(pg.format(m.result))
         s.write('</div>')
+      if 'usage' in m.metadata:
+        s.write(
+            '<div style="background-color: #EEEEEE; color: black; '
+            'white-space: pre-wrap; padding: 10px; border: 0px solid; '
+            'margin: 10px">'
+            f'prompt: {m.usage.prompt_tokens} tokens, '
+            f'response: {m.usage.completion_tokens} tokens, '
+            f'total: {m.usage.total_tokens} tokens'
+            '</div>'
+        )
       s.write('</div>')
 
   @classmethod
@@ -578,12 +595,15 @@ class _LeafNode:
   progress_bar: int | None = None
 
 
-@pg.use_init_args(['
+@pg.use_init_args(['children'])
 class Suite(Evaluable):
   """Evaluation suite."""
 
   children: Annotated[list[Evaluable], 'Child evaluation sets or suites.']
 
+  # Use empty ID as suite is just a container of child evaluations.
+  id: str = ''
+
   __kwargs__: Annotated[
       Any,
       (
@@ -802,17 +822,36 @@ class Evaluation(Evaluable):
       return 0.0
     return self.num_failures / self.num_completed
 
+  @property
+  def has_usage(self) -> bool:
+    """Returns True if token usage is enabled."""
+    return self._num_usages > 0
+
+  @property
+  def average_prompt_tokens(self) -> int:
+    """Returns the average prompt tokens."""
+    if not self.has_usage:
+      return 0
+    return self._total_prompt_tokens // self._num_usages
+
+  @property
+  def average_completion_tokens(self) -> int:
+    """Returns the average completion tokens."""
+    if not self.has_usage:
+      return 0
+    return self._total_completion_tokens // self._num_usages
+
+  @property
+  def average_total_tokens(self) -> int:
+    """Returns the average total tokens."""
+    return self.average_prompt_tokens + self.average_completion_tokens
+
   @functools.cached_property
   def schema(self) -> lf_structured.Schema | None:
     """Schema."""
     if self.schema_fn is None:
       return None
 
-    kwargs = {}
-    # Allow schema to be a function based on current evaluation.
-    if 'evaluation' in self.schema_fn.__signature__.arg_names:
-      kwargs['evaluation'] = self
-
     schema = self._call_schema_fn()
     fewshot_examples = None
     if isinstance(schema, tuple):
@@ -841,8 +880,10 @@ class Evaluation(Evaluable):
       kwargs['evaluation'] = self
     return self.schema_fn(**kwargs)
 
-  def _formalize_schema(self, annotation) -> lf_structured.Schema:
+  def _formalize_schema(self, annotation) -> lf_structured.Schema | None:
     """Formalizes schema from annotation."""
+    if annotation in (str, None):
+      return None
     if self.method == 'complete':
       if not hasattr(annotation, '__schema__'):
         raise TypeError(
@@ -851,7 +892,11 @@ class Evaluation(Evaluable):
            'Encountered: {annotation!r}.'
        )
       self._maybe_adjust_schema_for_completion(annotation)
-
+    schema = lf_structured.Schema.from_value(annotation)
+    # NOTE(daiyip): add references to the dependent classes of the returned type
+    # to prevent unused subclasses get garbage collected by Python.
+    setattr(schema, '__dependencies__', schema.class_dependencies())
+    return schema
 
   def _maybe_adjust_schema_for_completion(self, cls):
     if (self.completion_prompt_field is None
@@ -883,6 +928,14 @@ class Evaluation(Evaluable):
         completion_examples.append(ex)
     return completion_examples
 
+  @property
+  def id(self) -> str:
+    """Returns the ID of this evaluation."""
+    id_prefix = self.__class__.__name__
+    if not self.is_deterministic:
+      return id_prefix
+    return f'{id_prefix}@{self.hash}'
+
   @functools.cached_property
   def children(self) -> list['Evaluation']:
     """Returns the trials as child evaluations if this evaluation is a space."""
@@ -892,7 +945,6 @@ class Evaluation(Evaluable):
     for i, child in enumerate(pg.iter(self)):
       child.sym_setparent(self)
       child.sym_setpath(self.sym_path + f'children[{i}]')
-      child.rebind(id=f'{self.id}@{child.hash}', skip_notification=True)
       children.append(child)
     return children
 
@@ -921,6 +973,10 @@ class Evaluation(Evaluable):
     self._failures = []
     self._num_completed = 0
 
+    self._total_prompt_tokens = 0
+    self._total_completion_tokens = 0
+    self._num_usages = 0
+
   @property
   def failures_link(self) -> str | None:
     """Returns the link to the failures page."""
@@ -940,7 +996,7 @@ class Evaluation(Evaluable):
     example = example or self.examples[0]
 
     # We make a copy to avoid pollute the state of current object.
-    copy = self.clone()
+    copy: Evaluation = self.clone()
     copy.__dict__['examples'] = [example]
 
     # We set the symbolic parent of the cloned to access contextual information
@@ -970,9 +1026,9 @@ class Evaluation(Evaluable):
           color='blue',
       )
 
-
-    copy.audit(example, output, output_message)
+    copy.audit(example, output_message, None, dryrun=True)
     result = copy.summarize()
+
     if verbose:
       lf.console.write('')
       lf.console.write(
@@ -1004,7 +1060,11 @@ class Evaluation(Evaluable):
     self._reset()
 
     def _process(example: Any):
-      return self.process(example, **(self.additional_args or {}))
+      # NOTE(daiyip): set the `input` symbol of the globals to None, so LLM
+      # generated code with calls to `input` will raise an error, thus not
+      # blocking the evaluation.
+      with lf_coding.context(input=None):
+        return self.process(example, **(self.additional_args or {}))
 
     try:
       for example, message, error in lf.concurrent_map(
@@ -1015,11 +1075,12 @@ class Evaluation(Evaluable):
           status_fn=self._status,
       ):
         if error is not None:
-
-
-
-
-
+          message = (
+              error.lm_response
+              if isinstance(error, lf_structured.MappingError)
+              else None
+          )
+          self.audit(example, message, error)
     finally:
       # Save cache upon completion or interruption.
       if self.dir and self.cache:
@@ -1122,6 +1183,19 @@ class Evaluation(Evaluable):
       )
     else:
       cache_stats = dict(use_cache=False)
+
+    if self.has_usage:
+      usage = pg.Dict(
+          total_prompt_tokens=self._total_prompt_tokens,
+          total_completion_tokens=self._total_completion_tokens,
+          num_usages=self._num_usages,
+          average_prompt_tokens=self.average_prompt_tokens,
+          average_completion_tokens=self.average_completion_tokens,
+          average_total_tokens=self.average_total_tokens,
+      )
+    else:
+      usage = None
+
     result = pg.Dict(
         experiment_setup=pg.Dict(
             id=self.id,
@@ -1137,6 +1211,7 @@ class Evaluation(Evaluable):
             failures=self.num_failures,
             failure_rate=self.failure_rate,
         ),
+        usage=usage,
     )
     return result
 
@@ -1158,9 +1233,28 @@ class Evaluation(Evaluable):
         '</td></tr><tr><td>'
     )
     self._render_metric(s)
+
+    # Summarize average usage.
+    if self.result.usage is not None:
+      self._render_usage(s)
+
     s.write('</td></tr></table></div>')
     return s.getvalue()
 
+  def _render_usage(self, s: io.StringIO) -> None:
+    """Renders usage in HTML."""
+    usage = self.result.usage
+    total = usage.total_prompt_tokens + usage.total_completion_tokens
+    s.write(
+        ' <a title="'
+        f'# of usages: {usage.num_usages}&#13;'
+        f'total prompt: {usage.total_prompt_tokens}&#13;'
+        f'total response: {usage.total_completion_tokens}&#13;'
+        f'avg prompt: {usage.average_prompt_tokens}&#13;'
+        f'avg response: {usage.average_completion_tokens}'
+        f'" style="color:gray">({total} tokens)</a>'
+    )
+
   def _render_metric(self, s: io.StringIO) -> None:
     """Renders metrics in HTML."""
     assert self.result is not None
@@ -1175,17 +1269,48 @@ class Evaluation(Evaluable):
         )
     )
 
-  def audit(
+  def audit(
+      self,
+      example: Any,
+      message: lf.Message | None,
+      error: Exception | None = None,
+      dryrun: bool = False,
+  ) -> None:
     """Audits the example against the output. Subclasses should override.
 
     Args:
       example: The input object.
-      output: The output from LM. For `lf.call`, if `schema_fn` is not provided,
-        it will be the raw LM response string. Otherwise it will be the
-        structured output from the LM.
       message: The entire message returned by the LM, which could be used to
-        trace the LM input, response and parsed structure.
+        trace the LM input, response and parsed structure. If error is raised
+        before LLM could return a response, None will be its value.
+      error: The exception during processing the example.
+      dryrun: Whether or not audition takes place during dryrun.
     """
+    if error is not None:
+      self._failures.append((example, str(error)))
+      if isinstance(error, lf_structured.MappingError):
+        message = error.lm_response
+    else:
+      assert message is not None
+      output = message.text if self.schema is None else message.result
+      self.audit_processed(example, output, message, dryrun=dryrun)
+
+    # Audit usage.
+    if message is not None:
+      self.audit_usage(message, dryrun=dryrun)
+    self._num_completed += 1
+
+  def audit_usage(self, message: lf.Message, dryrun: bool = False) -> None:
+    for m in message.trace():
+      if 'usage' in m.metadata:
+        self._total_prompt_tokens += m.usage.prompt_tokens
+        self._total_completion_tokens += m.usage.completion_tokens
+        self._num_usages += 1
+
+  def audit_processed(
+      self, example: Any, output: Any, message: lf.Message, dryrun: bool = False
+  ) -> None:
+    """Audits a successfully processed example. Subclass should override."""
 
   def save(
       self, definition: bool = True, result: bool = True, report: bool = True
@@ -1229,8 +1354,10 @@ class Evaluation(Evaluable):
         '<td>Prompt</td>'
         '<td>Schema</td>'
         '<td>Additional Args</td>'
-        '<td>Failures</td>'
     )
+    if self.result.usage is not None:
+      s.write('<td>Usage</td>')
+    s.write('<td>Failures</td>')
 
   def _render_result_row(self, s: io.StringIO) -> None:
     s.write(
@@ -1255,6 +1382,12 @@ class Evaluation(Evaluable):
        '<td style="color:purple" '
        f'{_html_repr(self.additional_args, compact=False)}</td>'
     )
+    # Usage.
+    if self.result.usage is not None:
+      s.write('<td>')
+      self._render_usage(s)
+      s.write('</td>')
+
     # Failures.
     s.write(
        '<td><span style="color:orange">%s</span>%s</td>'
@@ -1353,8 +1486,8 @@ class Summary(pg.Object):
           Type[lf.LanguageModel],
           tuple[lf.LanguageModel | Type[lf.LanguageModel], ...],
       ] = lf.LanguageModel,
-      method: Union[str, tuple[str], None] = None,
-      schema_fn: Union[pg.Functor, tuple[pg.Functor], None] = None,
+      method: Union[str, tuple[str, ...], None] = None,
+      schema_fn: Union[pg.Functor, tuple[pg.Functor, ...], None] = None,
      completed: bool | None = None,
      pivot_field: str | None = None,
  ) -> 'Summary':
@@ -1518,9 +1651,12 @@ class Summary(pg.Object):
     pivot_field = pivot_field or self.pivot_field
     s = io.StringIO()
     s.write('<html><body>')
-    for task in self.tasks():
+    for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
+      table_id = task.__name__.lower()
       s.write('<div>')
-      s.write(f'<
+      s.write(f'<a id="{table_id}"')
+      s.write(f'<h2><a href="#{table_id}">{task.__name__}</a></h2>')
+      s.write('</a>')
       table = Summary.Table.from_evaluations(
           self.select(task=task).evaluations, pivot_field
       )
@@ -1532,8 +1668,35 @@ class Summary(pg.Object):
   def _repr_html_(self) -> str:
     return self.html()
 
+  def json(
+      self,
+  ) -> dict[
+      str,            # Task name
+      list[pg.Dict],  # List of pg.Dict with `experiment` and `metrics`.
+  ]:
+    """Returns the JSON representation of the summary."""
+    task_results = {}
+    for task in sorted(self.tasks(), key=lambda cls: cls.__name__):
+      results = []
+      for entry in self.select(task=task).evaluations:
+        results.append(
+            pg.Dict(
+                id=entry.id,
+                experiment=entry,
+                dir=entry.dir,
+                metrics=entry.result.metrics if entry.result else None,
+            )
+        )
+      task_results[task.__name__] = results
+    return task_results
+
   def save(self, file: str, pivot_field: str | None = None) -> None:
     pg.save(self.html(pivot_field), file, file_format='txt')
+    if file.endswith('.html'):
+      json_file = file.replace('.html', '.json')
+    else:
+      json_file = os.path.join(file, '.json')
+    pg.save(self.json(), json_file)
 
   @classmethod
   def from_dirs(
@@ -1744,3 +1907,43 @@ def monitor_async(
       scan_interval=scan_interval,
       refresh_when_stop=refresh_when_stop,
   )
+
+
+def app_run(target: Evaluable):
+  """Runs the target evaluation as an absl app.
+
+  Args:
+    target: An Langfun evaluable object.
+  """
+  flags.DEFINE_string(
+      'root_dir', None, 'Root directory for running the evaluation.'
+  )
+
+  flags.DEFINE_bool(
+      'dryrun', False, 'If True, dryrun the experiment instead of running it.'
+  )
+
+  flags.DEFINE_bool(
+      'debug', False, 'If True, output prompt and response to the console.'
+  )
+
+  flags.DEFINE_bool(
+      'rerun',
+      False,
+      'If True, rerun the experiment even a cached result is found.',
+  )
+
+  FLAGS = flags.FLAGS  # pylint: disable=invalid-name
+
+  def _main(argv):
+    if len(argv) > 1:
+      raise app.UsageError('Too many command-line arguments.')
+
+    if FLAGS.root_dir:
+      target.rebind(root_dir=FLAGS.root_dir, raise_on_no_change=False)
+    if FLAGS.dryrun:
+      target.dryrun(debug=FLAGS.debug)
+    else:
+      target.run(debug=FLAGS.debug, rerun=FLAGS.rerun)
+
+  app.run(_main)
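Note (illustrative sketch, not part of the diff): `app_run` turns an evaluation into an absl entry point that understands the --root_dir, --dryrun, --debug and --rerun flags defined above. `MyEval` is a placeholder for a user-defined `Evaluation` subclass.

    import langfun as lf

    if __name__ == '__main__':
      # Dryruns or runs the target evaluation depending on --dryrun.
      lf.eval.app_run(MyEval())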