langfun 0.0.2.dev20240330__py3-none-any.whl → 0.0.2.dev20240429__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. langfun/__init__.py +2 -0
  2. langfun/core/__init__.py +1 -0
  3. langfun/core/coding/python/correction.py +0 -7
  4. langfun/core/component.py +6 -0
  5. langfun/core/component_test.py +1 -0
  6. langfun/core/eval/__init__.py +2 -0
  7. langfun/core/eval/base.py +202 -23
  8. langfun/core/eval/base_test.py +49 -10
  9. langfun/core/eval/matching.py +26 -9
  10. langfun/core/eval/matching_test.py +2 -1
  11. langfun/core/eval/scoring.py +15 -6
  12. langfun/core/eval/scoring_test.py +2 -1
  13. langfun/core/langfunc.py +0 -5
  14. langfun/core/langfunc_test.py +6 -4
  15. langfun/core/language_model.py +124 -24
  16. langfun/core/language_model_test.py +249 -26
  17. langfun/core/llms/__init__.py +19 -2
  18. langfun/core/llms/anthropic.py +263 -0
  19. langfun/core/llms/anthropic_test.py +167 -0
  20. langfun/core/llms/cache/in_memory_test.py +37 -28
  21. langfun/core/llms/fake.py +31 -22
  22. langfun/core/llms/fake_test.py +122 -11
  23. langfun/core/llms/google_genai_test.py +8 -3
  24. langfun/core/llms/groq.py +260 -0
  25. langfun/core/llms/groq_test.py +170 -0
  26. langfun/core/llms/llama_cpp.py +3 -1
  27. langfun/core/llms/openai.py +97 -79
  28. langfun/core/llms/openai_test.py +285 -59
  29. langfun/core/modalities/video.py +5 -2
  30. langfun/core/structured/__init__.py +3 -0
  31. langfun/core/structured/completion_test.py +2 -2
  32. langfun/core/structured/function_generation.py +245 -0
  33. langfun/core/structured/function_generation_test.py +329 -0
  34. langfun/core/structured/mapping.py +56 -2
  35. langfun/core/structured/mapping_test.py +17 -0
  36. langfun/core/structured/parsing_test.py +18 -13
  37. langfun/core/structured/prompting.py +27 -6
  38. langfun/core/structured/prompting_test.py +79 -12
  39. langfun/core/structured/schema.py +4 -2
  40. langfun/core/structured/schema_generation_test.py +2 -2
  41. langfun/core/structured/schema_test.py +4 -6
  42. langfun/core/template.py +125 -10
  43. langfun/core/template_test.py +75 -0
  44. langfun/core/templates/selfplay_test.py +6 -2
  45. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/METADATA +3 -2
  46. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/RECORD +49 -43
  47. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/LICENSE +0 -0
  48. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/WHEEL +0 -0
  49. {langfun-0.0.2.dev20240330.dist-info → langfun-0.0.2.dev20240429.dist-info}/top_level.txt +0 -0
langfun/core/eval/matching_test.py CHANGED
@@ -103,7 +103,7 @@ class MatchingTest(unittest.TestCase):
         s.result,
         dict(
             experiment_setup=dict(
-                id='MyTask@3d87f97f',
+                id='MyTask@739a174b',
                 dir=s.dir,
                 model='StaticSequence',
                 prompt_template='{{example.question}}',
@@ -125,6 +125,7 @@ class MatchingTest(unittest.TestCase):
                 num_mismatches=1,
                 mismatch_rate=0.25,
             ),
+            usage=s.result.usage,
         ),
     )
     self.assertTrue(
langfun/core/eval/scoring.py CHANGED
@@ -61,8 +61,18 @@ class Scoring(base.Evaluation):
     super()._reset()
     self._scored = []
 
-  def audit(self, example: Any, output: Any, message: lf.Message) -> None:
+  def audit_processed(
+      self, example: Any, output: Any, message: lf.Message, dryrun: bool = False
+  ) -> None:
     score = self.score(example, output)
+
+    if dryrun:
+      lf.console.write('')
+      lf.console.write(
+          str(score),
+          title='SCORE',
+          color='blue',
+      )
     self._scored.append((example, output, score, message))
 
   @abc.abstractmethod
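Note: the hook above is now `audit_processed` and takes a `dryrun` flag that echoes each score to the console. A hedged sketch of a subclass under this API (the class name and the `answer` field are illustrative, and the other Evaluation fields needed to actually run it are omitted):

from typing import Any
from langfun.core.eval import scoring

class AnswerAccuracy(scoring.Scoring):  # illustrative subclass, not part of the package

  def score(self, example: Any, output: Any) -> float:
    # Full credit when the structured output equals the expected answer.
    return 1.0 if output == example.answer else 0.0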
@@ -118,19 +128,18 @@ class Scoring(base.Evaluation):
     super().save(definition, result, report)
 
     if result:
-
-      def force_dict(v):
-        return pg.object_utils.json_conversion.strip_types(pg.to_json(v))
-
       # Save scored.
       pg.save(
           [
               # We force the output to be dict as its type may be defined
               # within functors which could be deserialized.
-              pg.Dict(input=input, output=force_dict(output), score=score)
+              pg.Dict(input=input, output=output, score=score)
              for input, output, score, _ in self.scored
          ],
          os.path.join(self.dir, Scoring.SCORED_JSON),
+          # We force the input and output to be dict so it does not depend on
+          # the downstream to serialize.
+          force_dict=True,
      )
 
     if report:
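Note: a hedged sketch of the serialization change above; the hand-rolled `force_dict(v)` helper is gone and `pg.save` is asked to emit plain dicts directly (the record and path below are made up):

import pyglove as pg

records = [pg.Dict(input='1 + 1 =', output=2, score=1.0)]
pg.save(records, '/tmp/scored.json', force_dict=True)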
langfun/core/eval/scoring_test.py CHANGED
@@ -81,7 +81,7 @@ class ScoringTest(unittest.TestCase):
         s.result,
         dict(
             experiment_setup=dict(
-                id='ConstraintFollowing@9e51bb9e',
+                id='ConstraintFollowing@5c88a5eb',
                 dir=s.dir,
                 model='StaticSequence',
                 prompt_template='{{example}}',
@@ -102,6 +102,7 @@ class ScoringTest(unittest.TestCase):
                 score_rate=1.0,
                 avg_score=0.5,
             ),
+            usage=s.result.usage,
         ),
     )
     self.assertTrue(
langfun/core/langfunc.py CHANGED
@@ -261,7 +261,6 @@ class LangFunc(
     if lm_input is None:
       lm_input = self.render(**kwargs)
 
-    lm_input.tag(message_lib.Message.TAG_LM_INPUT)
     if skip_lm:
       return lm_input
 
@@ -270,10 +269,6 @@ class LangFunc(
     # Send rendered text to LM.
     lm_output = self.lm(lm_input, cache_seed=cache_seed)
 
-    # Track the input as the source of the output.
-    lm_output.source = lm_input
-    lm_output.tag(message_lib.Message.TAG_LM_RESPONSE)
-
     # Transform the output message.
     lm_output = self.transform_output(lm_output)
     lm_output.tag(message_lib.Message.TAG_LM_OUTPUT)
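Note: the tagging and source tracking removed here are not lost; as the language_model.py hunks further below show, LanguageModel.sample() now applies them. A hedged sketch of the observable behavior, using langfun's built-in fake Echo model:

import langfun as lf

l = lf.LangFunc('Hello', lm=lf.llms.Echo())
r = l()
print(r.tags)         # expected: ['lm-response', 'lm-output']
print(r.source.tags)  # expected: ['rendered', 'lm-input']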
langfun/core/langfunc_test.py CHANGED
@@ -82,7 +82,9 @@ class LangFuncCallTest(unittest.TestCase):
     self.assertEqual(i.tags, ['rendered'])
 
     r = l()
-    self.assertEqual(r, message.AIMessage('Hello!!!', score=0.0, logprobs=None))
+    self.assertEqual(
+        r, message.AIMessage('Hello!!!', score=0.0, logprobs=None, usage=None)
+    )
     self.assertEqual(r.tags, ['lm-response', 'lm-output'])
     self.assertEqual(r.source, message.UserMessage('Hello'))
     self.assertEqual(r.source.tags, ['rendered', 'lm-input'])
@@ -92,8 +94,8 @@ class LangFuncCallTest(unittest.TestCase):
     self.assertEqual(
         repr(l),
         "LangFunc(template_str='Hello', clean=True,"
-        ' lm=ExcitedEchoer(sampling_options=LMSamplingOptions(temperature=0.0,'
-        ' max_tokens=1024, n=1, top_k=40, top_p=None, stop=None,'
+        ' lm=ExcitedEchoer(sampling_options=LMSamplingOptions(temperature=None,'
+        ' max_tokens=None, n=1, top_k=40, top_p=None, stop=None,'
         ' random_seed=None, logprobs=False, top_logprobs=None), cache=None,'
         ' max_concurrency=None, timeout=120.0, max_attempts=5,'
         ' retry_interval=(5, 60), exponential_backoff=True, debug=False))',
@@ -106,7 +108,7 @@ class LangFuncCallTest(unittest.TestCase):
     self.assertEqual(l.render(), 'Hello')
     r = l()
     self.assertEqual(
-        r, message.AIMessage('Hello!!!', score=0.0, logprobs=None)
+        r, message.AIMessage('Hello!!!', score=0.0, logprobs=None, usage=None)
     )
     self.assertEqual(r.tags, ['lm-response', 'lm-output'])
 
langfun/core/language_model.py CHANGED
@@ -24,6 +24,9 @@ from langfun.core import console
 from langfun.core import message as message_lib
 import pyglove as pg
 
+TOKENS_PER_REQUEST = 250  # Estimated num tokens for a single request
+DEFAULT_MAX_CONCURRENCY = 1  # Use this as max concurrency if no RPM or TPM data
+
 
 class LMSample(pg.Object):
   """Response candidate."""
@@ -47,6 +50,14 @@ class LMSample(pg.Object):
   ] = None
 
 
+class LMSamplingUsage(pg.Object):
+  """Usage information per completion."""
+
+  prompt_tokens: int
+  completion_tokens: int
+  total_tokens: int
+
+
 class LMSamplingResult(pg.Object):
   """Language model response."""
 
@@ -58,19 +69,34 @@ class LMSamplingResult(pg.Object):
       ),
   ] = []
 
+  usage: Annotated[
+      LMSamplingUsage | None,
+      'Usage information. Currently only OpenAI models are supported.',
+  ] = None
+
 
 class LMSamplingOptions(component.Component):
   """Language model sampling options."""
 
   temperature: Annotated[
-      float,
+      float | None,
       (
           'Model temperature, which is usually between 0 and 1.0. '
-          'OpenAI models have temperature range from 0.0 to 2.0.'
+          'OpenAI models have temperature range from 0.0 to 2.0. '
+          'If None (default), honor the model\'s default behavior. '
       )
-  ] = 0.0
-  max_tokens: Annotated[int, 'Per example max tokens to generate.'] = 1024
+  ] = None
+
+  max_tokens: Annotated[
+      int | None,
+      (
+          'Per example max tokens to generate. '
+          'If None, use the model default.'
+      )
+  ] = None
+
   n: Annotated[int | None, 'Max number of samples to return.'] = 1
+
   top_k: Annotated[
       int | None,
       (
@@ -78,6 +104,7 @@ class LMSamplingOptions(component.Component):
           'Not applicable to OpenAI models.'
       )
   ] = 40
+
   top_p: Annotated[
       float | None,
       (
@@ -86,6 +113,7 @@ class LMSamplingOptions(component.Component):
           '`top_p` but not both.'
       ),
   ] = None
+
   stop: Annotated[
       list[str] | None,
       (
@@ -95,9 +123,11 @@ class LMSamplingOptions(component.Component):
           '`Model:` is reached.'
       ),
   ] = None
+
   random_seed: Annotated[
       int | None, 'A fixed random seed used during model inference.'
   ] = None
+
   logprobs: Annotated[
       bool,
       (
@@ -106,6 +136,7 @@ class LMSamplingOptions(component.Component):
           'in the content of message.'
       ),
   ] = False
+
   top_logprobs: Annotated[
       int | None,
       (
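Note: the upshot of the option changes above is that `temperature` and `max_tokens` no longer default to 0.0/1024; leaving them as None defers to the model's own defaults. A hedged sketch (assuming `lf.LMSamplingOptions` re-exports `langfun.core.language_model.LMSamplingOptions`):

import langfun as lf

opts = lf.LMSamplingOptions()
assert opts.temperature is None and opts.max_tokens is None  # defer to the model

pinned = lf.LMSamplingOptions(temperature=0.7, max_tokens=256)  # explicit values still work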
@@ -315,9 +346,42 @@ class LanguageModel(component.Component):
 
     with component.context(override_attrs=True, **kwargs):
       if self.cache is None:
-        return self._sample(prompts)
+        results = self._sample(prompts)
       else:
-        return self._sample_with_cache_lookup(prompts, cache_seed)
+        results = self._sample_with_cache_lookup(prompts, cache_seed)
+
+      for prompt, result in zip(prompts, results):
+
+        # Tag LM input.
+        prompt.tag(message_lib.Message.TAG_LM_INPUT)
+
+        for sample in result.samples:
+          # Update metadata for response message.
+
+          response = sample.response
+          response.metadata.score = sample.score
+          response.metadata.logprobs = sample.logprobs
+
+          # NOTE(daiyip): Current usage is computed at per-result level,
+          # which is accurate when n=1. For n > 1, we average the usage across
+          # multiple samples.
+          usage = result.usage
+          if len(result.samples) == 1 or usage is None:
+            response.metadata.usage = usage
+          else:
+            n = len(result.samples)
+            response.metadata.usage = LMSamplingUsage(
+                prompt_tokens=usage.prompt_tokens // n,
+                completion_tokens=usage.completion_tokens // n,
+                total_tokens=usage.total_tokens // n,
+            )
+
+          # Track the prompt for corresponding response.
+          response.source = prompt
+
+          # Tag LM response.
+          response.tag(message_lib.Message.TAG_LM_RESPONSE)
+      return results
 
   def _sample_with_cache_lookup(
       self, prompts: list[str | message_lib.Message], cache_seed: int
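Note: with the hunk above, token accounting rides along on every response. A hedged sketch of reading it back (assumes an OpenAI-backed model with credentials configured, since per this release only OpenAI models populate usage):

import langfun as lf

lm = lf.llms.Gpt35Turbo()                # illustrative model choice
[result] = lm.sample(['1 + 1 ='])
response = result.samples[0].response

print(result.usage)                      # LMSamplingUsage(prompt_tokens=..., ...)
print(response.metadata.usage)           # per-response copy, averaged when n > 1
print(response.source.text)              # the prompt message, now tagged 'lm-input'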
@@ -405,12 +469,9 @@ class LanguageModel(component.Component):
       result = self.sample(
           [prompt], sampling_options=sampling_options, cache_seed=cache_seed
       )[0]
-      response = result.samples[0].response
-      logprobs = result.samples[0].logprobs
-      response.set('score', result.samples[0].score)
-      response.metadata.logprobs = logprobs
       elapse = time.time() - request_start
-      self._debug(prompt, response, call_counter, elapse)
+      response = result.samples[0].response
+      self._debug(prompt, response, call_counter, result.usage, elapse)
       return response
 
   def _debug(
@@ -418,35 +479,53 @@ class LanguageModel(component.Component):
       prompt: message_lib.Message,
       response: message_lib.Message,
       call_counter: int,
+      usage: LMSamplingUsage | None,
       elapse: float,
-  ):
+  ) -> None:
     """Outputs debugging information."""
     debug = self.debug
     if isinstance(debug, bool):
       debug = LMDebugMode.ALL if debug else LMDebugMode.NONE
 
     if debug & LMDebugMode.INFO:
-      self._debug_model_info(call_counter)
+      self._debug_model_info(call_counter, usage)
 
     if debug & LMDebugMode.PROMPT:
-      self._debug_prompt(prompt, call_counter)
+      self._debug_prompt(prompt, call_counter, usage)
 
     if debug & LMDebugMode.RESPONSE:
-      self._debug_response(response, call_counter, elapse)
+      self._debug_response(response, call_counter, usage, elapse)
 
-  def _debug_model_info(self, call_counter: int):
+  def _debug_model_info(
+      self, call_counter: int, usage: LMSamplingUsage | None) -> None:
     """Outputs debugging information about the model."""
+    title_suffix = ''
+    if usage and usage.total_tokens != 0:
+      title_suffix = console.colored(
+          f' (total {usage.total_tokens} tokens)', 'red')
+
     console.write(
         self.format(compact=True, use_inferred=True),
-        title=f'[{call_counter}] LM INFO:',
+        title=f'[{call_counter}] LM INFO{title_suffix}:',
         color='magenta',
     )
 
-  def _debug_prompt(self, prompt: message_lib.Message, call_counter: int):
+  def _debug_prompt(
+      self,
+      prompt: message_lib.Message,
+      call_counter: int,
+      usage: LMSamplingUsage | None,
+  ) -> None:
     """Outputs debugging information about the prompt."""
+    title_suffix = ''
+    if usage and usage.prompt_tokens != 0:
+      title_suffix = console.colored(f' ({usage.prompt_tokens} tokens)', 'red')
+
     console.write(
-        prompt,
-        title=f'\n[{call_counter}] PROMPT SENT TO LM:',
+        # We use metadata 'formatted_text' for scenarios where the prompt text
+        # is formatted by the LM.
+        prompt.get('formatted_text', prompt.text),
+        title=f'\n[{call_counter}] PROMPT SENT TO LM{title_suffix}:',
         color='green',
     )
     referred_modalities = prompt.referred_modalities()
@@ -460,12 +539,22 @@ class LanguageModel(component.Component):
     )
 
   def _debug_response(
-      self, response: message_lib.Message, call_counter: int, elapse: float
-  ):
+      self,
+      response: message_lib.Message,
+      call_counter: int,
+      usage: LMSamplingUsage | None,
+      elapse: float
+  ) -> None:
     """Outputs debugging information about the response."""
+    title_suffix = ' ('
+    if usage and usage.completion_tokens != 0:
+      title_suffix += f'{usage.completion_tokens} tokens '
+    title_suffix += f'in {elapse:.2f} seconds)'
+    title_suffix = console.colored(title_suffix, 'red')
+
     console.write(
         str(response) + '\n',
-        title=f'\n[{call_counter}] LM RESPONSE (in {elapse:.2f} seconds):',
+        title=f'\n[{call_counter}] LM RESPONSE{title_suffix}:',
         color='blue',
     )
 
@@ -512,7 +601,7 @@ class LanguageModel(component.Component):
       debug = LMDebugMode.ALL if debug else LMDebugMode.NONE
 
     if debug & LMDebugMode.INFO:
-      self._debug_model_info(call_counter)
+      self._debug_model_info(call_counter, None)
 
     if debug & LMDebugMode.PROMPT:
       console.write(
@@ -548,3 +637,14 @@ class LanguageModel(component.Component):
             f'score: {r.score}',
             color='blue',
         )
+
+  def rate_to_max_concurrency(
+      self, requests_per_min: float = 0, tokens_per_min: float = 0
+  ) -> int:
+    """Converts a rate to a max concurrency."""
+    if tokens_per_min > 0:
+      return max(int(tokens_per_min / TOKENS_PER_REQUEST / 60), 1)
+    elif requests_per_min > 0:
+      return max(int(requests_per_min / 60), 1)  # Max concurrency can't be zero
+    else:
+      return DEFAULT_MAX_CONCURRENCY  # Default of 1
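Note: a hedged sketch of the new rate helper above; the limits are illustrative and Gpt4 merely stands in for any LanguageModel subclass:

import langfun as lf

lm = lf.llms.Gpt4(api_key='<your-key>')

# Suppose the provider allows 300 requests/min and 150,000 tokens/min.
assert lm.rate_to_max_concurrency(requests_per_min=300, tokens_per_min=150_000) == 10
# tokens_per_min takes precedence: 150_000 / TOKENS_PER_REQUEST (250) / 60 -> 10.

assert lm.rate_to_max_concurrency(requests_per_min=300) == 5
# Request rate only: 300 / 60 -> 5 concurrent requests.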