langfun 0.0.2.dev20240429__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their public registries.
- langfun/__init__.py +20 -2
- langfun/core/__init__.py +16 -5
- langfun/core/agentic/__init__.py +30 -0
- langfun/core/agentic/action.py +854 -0
- langfun/core/agentic/action_eval.py +150 -0
- langfun/core/agentic/action_eval_test.py +109 -0
- langfun/core/agentic/action_test.py +136 -0
- langfun/core/coding/python/__init__.py +5 -11
- langfun/core/coding/python/correction.py +37 -21
- langfun/core/coding/python/correction_test.py +29 -3
- langfun/core/coding/python/execution.py +40 -216
- langfun/core/coding/python/execution_test.py +29 -89
- langfun/core/coding/python/generation.py +21 -11
- langfun/core/coding/python/generation_test.py +2 -2
- langfun/core/coding/python/parsing.py +108 -193
- langfun/core/coding/python/parsing_test.py +2 -105
- langfun/core/component.py +63 -2
- langfun/core/component_test.py +53 -0
- langfun/core/concurrent.py +414 -117
- langfun/core/concurrent_test.py +111 -24
- langfun/core/console.py +18 -5
- langfun/core/console_test.py +17 -0
- langfun/core/eval/__init__.py +16 -1
- langfun/core/eval/base.py +622 -174
- langfun/core/eval/base_test.py +200 -54
- langfun/core/eval/matching.py +63 -76
- langfun/core/eval/matching_test.py +17 -8
- langfun/core/eval/patching.py +130 -0
- langfun/core/eval/patching_test.py +170 -0
- langfun/core/eval/scoring.py +26 -26
- langfun/core/eval/scoring_test.py +19 -2
- langfun/core/eval/v2/__init__.py +42 -0
- langfun/core/eval/v2/checkpointing.py +380 -0
- langfun/core/eval/v2/checkpointing_test.py +228 -0
- langfun/core/eval/v2/eval_test_helper.py +136 -0
- langfun/core/eval/v2/evaluation.py +725 -0
- langfun/core/eval/v2/evaluation_test.py +180 -0
- langfun/core/eval/v2/example.py +305 -0
- langfun/core/eval/v2/example_test.py +128 -0
- langfun/core/eval/v2/experiment.py +1048 -0
- langfun/core/eval/v2/experiment_test.py +433 -0
- langfun/core/eval/v2/metric_values.py +156 -0
- langfun/core/eval/v2/metric_values_test.py +80 -0
- langfun/core/eval/v2/metrics.py +357 -0
- langfun/core/eval/v2/metrics_test.py +203 -0
- langfun/core/eval/v2/progress.py +348 -0
- langfun/core/eval/v2/progress_test.py +82 -0
- langfun/core/eval/v2/progress_tracking.py +210 -0
- langfun/core/eval/v2/progress_tracking_test.py +66 -0
- langfun/core/eval/v2/reporting.py +270 -0
- langfun/core/eval/v2/reporting_test.py +158 -0
- langfun/core/eval/v2/runners.py +488 -0
- langfun/core/eval/v2/runners_test.py +334 -0
- langfun/core/langfunc.py +4 -17
- langfun/core/langfunc_test.py +22 -6
- langfun/core/language_model.py +577 -39
- langfun/core/language_model_test.py +470 -56
- langfun/core/llms/__init__.py +87 -16
- langfun/core/llms/anthropic.py +312 -87
- langfun/core/llms/anthropic_test.py +71 -3
- langfun/core/llms/cache/base.py +21 -2
- langfun/core/llms/cache/in_memory.py +13 -0
- langfun/core/llms/cache/in_memory_test.py +53 -2
- langfun/core/llms/compositional.py +101 -0
- langfun/core/llms/compositional_test.py +73 -0
- langfun/core/llms/deepseek.py +117 -0
- langfun/core/llms/deepseek_test.py +61 -0
- langfun/core/llms/fake.py +11 -7
- langfun/core/llms/fake_test.py +14 -0
- langfun/core/llms/gemini.py +507 -0
- langfun/core/llms/gemini_test.py +195 -0
- langfun/core/llms/google_genai.py +62 -218
- langfun/core/llms/google_genai_test.py +9 -202
- langfun/core/llms/groq.py +160 -144
- langfun/core/llms/groq_test.py +31 -137
- langfun/core/llms/llama_cpp.py +15 -42
- langfun/core/llms/llama_cpp_test.py +4 -30
- langfun/core/llms/openai.py +395 -203
- langfun/core/llms/openai_compatible.py +179 -0
- langfun/core/llms/openai_compatible_test.py +495 -0
- langfun/core/llms/openai_test.py +30 -395
- langfun/core/llms/rest.py +113 -0
- langfun/core/llms/rest_test.py +111 -0
- langfun/core/llms/vertexai.py +192 -0
- langfun/core/llms/vertexai_test.py +52 -0
- langfun/core/logging.py +284 -0
- langfun/core/logging_test.py +125 -0
- langfun/core/message.py +319 -9
- langfun/core/message_test.py +190 -13
- langfun/core/modalities/__init__.py +6 -2
- langfun/core/modalities/audio.py +30 -0
- langfun/core/modalities/audio_test.py +63 -0
- langfun/core/modalities/image.py +39 -20
- langfun/core/modalities/image_test.py +52 -9
- langfun/core/modalities/mime.py +206 -29
- langfun/core/modalities/mime_test.py +90 -9
- langfun/core/modalities/ms_office.py +117 -0
- langfun/core/modalities/ms_office_test.py +389 -0
- langfun/core/modalities/pdf.py +22 -0
- langfun/core/modalities/pdf_test.py +57 -0
- langfun/core/modalities/video.py +9 -26
- langfun/core/modalities/video_test.py +3 -3
- langfun/core/modality.py +26 -3
- langfun/core/modality_test.py +2 -2
- langfun/core/sampling.py +11 -11
- langfun/core/structured/__init__.py +12 -16
- langfun/core/structured/completion.py +32 -5
- langfun/core/structured/completion_test.py +7 -6
- langfun/core/structured/description.py +2 -2
- langfun/core/structured/description_test.py +3 -3
- langfun/core/structured/function_generation.py +60 -27
- langfun/core/structured/function_generation_test.py +72 -2
- langfun/core/structured/mapping.py +97 -47
- langfun/core/structured/mapping_test.py +90 -2
- langfun/core/structured/parsing.py +33 -21
- langfun/core/structured/parsing_test.py +53 -9
- langfun/core/structured/querying.py +746 -0
- langfun/core/structured/{prompting_test.py → querying_test.py} +469 -51
- langfun/core/structured/schema.py +204 -97
- langfun/core/structured/schema_generation.py +1 -1
- langfun/core/structured/schema_test.py +130 -29
- langfun/core/structured/scoring.py +125 -19
- langfun/core/structured/scoring_test.py +30 -0
- langfun/core/structured/tokenization.py +64 -0
- langfun/core/structured/tokenization_test.py +48 -0
- langfun/core/template.py +115 -1
- langfun/core/template_test.py +71 -1
- langfun/core/templates/conversation.py +9 -0
- langfun/core/templates/conversation_test.py +4 -3
- langfun/core/templates/selfplay_test.py +10 -2
- langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
- langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
- langfun/core/coding/python/errors.py +0 -108
- langfun/core/coding/python/errors_test.py +0 -99
- langfun/core/coding/python/permissions.py +0 -90
- langfun/core/coding/python/permissions_test.py +0 -86
- langfun/core/structured/prompting.py +0 -238
- langfun/core/text_formatting.py +0 -162
- langfun/core/text_formatting_test.py +0 -47
- langfun-0.0.2.dev20240429.dist-info/METADATA +0 -100
- langfun-0.0.2.dev20240429.dist-info/RECORD +0 -108
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
langfun/core/language_model_test.py (+470 -56)

(Deleted lines whose text the diff view did not preserve appear below as bare `-` markers.)

```diff
@@ -27,48 +27,53 @@ import pyglove as pg
 @pg.use_init_args(['failures_before_attempt'])
 class MockModel(lm_lib.LanguageModel):
   """A mock model that echo back user prompts."""
-
   failures_before_attempt: int = 0
+  name: str = 'MockModel'
 
   def _sample(self,
               prompts: list[message_lib.Message]
               ) -> list[lm_lib.LMSamplingResult]:
     context = pg.Dict(attempt=0)
 
-    def fake_sample(
+    def fake_sample(prompt):
       if context.attempt >= self.failures_before_attempt:
-        return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      context.attempt += 1
+        return lm_lib.LMSamplingResult(
+            [
+                lm_lib.LMSample(  # pylint: disable=g-complex-comprehension
+                    response=prompt.text * self.sampling_options.top_k,
+                    score=self.sampling_options.temperature or -1.0,
+                )
+            ],
+            usage=lm_lib.LMSamplingUsage(
+                prompt_tokens=100,
+                completion_tokens=100,
+                total_tokens=200,
+                estimated_cost=1.0,
+            ),
+        )
+      else:
+        context.attempt += 1
       raise ValueError('Failed to sample prompts.')
 
-
-      fake_sample,
-
-
-
-
+    results = self._parallel_execute_with_currency_control(
+        fake_sample, prompts, retry_on_errors=ValueError
+    )
+    for result in results:
+      result.usage.retry_stats.rebind(
+          total_call_interval=0, skip_notification=True
+      )
+    return results
+
+  @property
+  def model_id(self) -> str:
+    return self.name
 
 
 class MockScoringModel(MockModel):
 
   def _score(
       self,
-      prompt: message_lib.Message,
+      prompt: message_lib.Message | list[message_lib.Message],
       completions: list[message_lib.Message],
       **kwargs
   ) -> list[lm_lib.LMScoringResult]:
```
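Read together, the hunk above establishes the new `_sample` contract for `LanguageModel` subclasses: return one `LMSamplingResult` per prompt, report per-request usage (now including `estimated_cost`), and delegate fan-out plus retry handling to the base-class helper. A minimal sketch of a subclass under that contract, inferred from the `MockModel` changes above rather than from langfun documentation (`lm_lib` and `message_lib` are this test file's import aliases):

```python
# Sketch only: the contract is inferred from the MockModel diff above.
class EchoModel(lm_lib.LanguageModel):
  """Returns each prompt verbatim as a single sample."""

  def _sample(
      self, prompts: list[message_lib.Message]
  ) -> list[lm_lib.LMSamplingResult]:

    def sample_one(prompt: message_lib.Message) -> lm_lib.LMSamplingResult:
      return lm_lib.LMSamplingResult(
          [lm_lib.LMSample(response=prompt.text, score=0.0)],
          usage=lm_lib.LMSamplingUsage(
              prompt_tokens=0,
              completion_tokens=0,
              total_tokens=0,
              estimated_cost=0.0,
          ),
      )

    # The base-class helper (as used by MockModel above) fans out one call
    # per prompt and applies the model's retry policy.
    return self._parallel_execute_with_currency_control(sample_one, prompts)
```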
```diff
@@ -77,6 +82,13 @@ class MockScoringModel(MockModel):
     ]
 
 
+class MockTokenizeModel(MockModel):
+
+  def _tokenize(
+      self, prompt: message_lib.Message) -> list[tuple[str | bytes, int]]:
+    return [(w, i) for i, w in enumerate(prompt.text.split(' '))]
+
+
 class LMSamplingOptionsTest(unittest.TestCase):
   """Tests for LMSamplingOptions."""
 
```
```diff
@@ -105,6 +117,21 @@ class LanguageModelTest(unittest.TestCase):
     self.assertEqual(lm.sampling_options.top_k, 2)
     self.assertEqual(lm.max_attempts, 2)
 
+  def test_subclassing(self):
+
+    class ChildModel(lm_lib.LanguageModel):
+
+      sampling_options = lm_lib.LMSamplingOptions(
+          temperature=0.5, top_k=20
+      )
+
+      def _sample(self, *args, **kwargs):
+        pass
+
+    lm = ChildModel(top_k=10)
+    self.assertEqual(lm.sampling_options.temperature, 0.5)
+    self.assertEqual(lm.sampling_options.top_k, 10)
+
   def test_sample(self):
     lm = MockModel(top_k=1)
     self.assertEqual(
```
```diff
@@ -117,14 +144,15 @@ class LanguageModelTest(unittest.TestCase):
                     'foo',
                     score=-1.0,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=-1.0,
                 logprobs=None,
             )
         ],
-        usage=lm_lib.LMSamplingUsage(100, 100, 200),
+        usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
     ),
     lm_lib.LMSamplingResult(
         [
@@ -133,14 +161,15 @@ class LanguageModelTest(unittest.TestCase):
                     'bar',
                     score=-1.0,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=-1.0,
                 logprobs=None,
             )
         ],
-        usage=lm_lib.LMSamplingUsage(100, 100, 200),
+        usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
     ),
     ],
 )
@@ -158,14 +187,15 @@ class LanguageModelTest(unittest.TestCase):
                     'foo' * 2,
                     score=0.5,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=0.5,
                 logprobs=None,
             ),
         ],
-        usage=lm_lib.LMSamplingUsage(100, 100, 200),
+        usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
     ),
     lm_lib.LMSamplingResult(
         [
@@ -174,7 +204,8 @@ class LanguageModelTest(unittest.TestCase):
                     'bar' * 2,
                     score=0.5,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=0.5,
@@ -182,7 +213,8 @@ class LanguageModelTest(unittest.TestCase):
             ),
         ],
         usage=lm_lib.LMSamplingUsage(
-            prompt_tokens=100, completion_tokens=100, total_tokens=200
+            prompt_tokens=100, completion_tokens=100, total_tokens=200,
+            num_requests=1, estimated_cost=1.0,
         ),
     ),
 ]
@@ -198,14 +230,15 @@ class LanguageModelTest(unittest.TestCase):
                     'foo',
                     score=1.0,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=1.0,
                 logprobs=None,
             ),
         ],
-        usage=lm_lib.LMSamplingUsage(100, 100, 200),
+        usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
     ),
     lm_lib.LMSamplingResult(
         [
@@ -214,7 +247,8 @@ class LanguageModelTest(unittest.TestCase):
                     'bar',
                     score=1.0,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=1.0,
@@ -222,7 +256,8 @@ class LanguageModelTest(unittest.TestCase):
             ),
         ],
         usage=lm_lib.LMSamplingUsage(
-            prompt_tokens=100, completion_tokens=100, total_tokens=200
+            prompt_tokens=100, completion_tokens=100, total_tokens=200,
+            num_requests=1, estimated_cost=1.0,
         ),
     ),
 ]
@@ -237,14 +272,15 @@ class LanguageModelTest(unittest.TestCase):
                     'foo' * 2,
                     score=0.7,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=0.7,
                 logprobs=None,
             ),
         ],
-        usage=lm_lib.LMSamplingUsage(100, 100, 200),
+        usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
     ),
     lm_lib.LMSamplingResult(
         [
@@ -253,7 +289,8 @@ class LanguageModelTest(unittest.TestCase):
                     'bar' * 2,
                     score=0.7,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=0.7,
@@ -261,7 +298,8 @@ class LanguageModelTest(unittest.TestCase):
             ),
         ],
         usage=lm_lib.LMSamplingUsage(
-            prompt_tokens=100, completion_tokens=100, total_tokens=200
+            prompt_tokens=100, completion_tokens=100, total_tokens=200,
+            num_requests=1, estimated_cost=1.0,
         ),
     ),
 ]
@@ -273,7 +311,9 @@ class LanguageModelTest(unittest.TestCase):
     self.assertEqual(response.text, 'foo')
     self.assertEqual(response.score, -1.0)
     self.assertIsNone(response.logprobs)
-    self.assertEqual(
+    self.assertEqual(
+        response.usage, lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0)
+    )
 
     # Test override sampling_options.
     self.assertEqual(
@@ -296,14 +336,17 @@ class LanguageModelTest(unittest.TestCase):
                     cache_seed=0,
                     score=-1.0,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(
+                        100, 100, 200, 1, 1.0
+                    ),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=-1.0,
                 logprobs=None,
             )
         ],
-        usage=lm_lib.LMSamplingUsage(100, 100, 200),
+        usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
     ),
     lm_lib.LMSamplingResult(
         [
@@ -313,14 +356,15 @@ class LanguageModelTest(unittest.TestCase):
                     cache_seed=0,
                     score=-1.0,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=-1.0,
                 logprobs=None,
             )
         ],
-        usage=lm_lib.LMSamplingUsage(100, 100, 200),
+        usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
     ),
     ],
 )
@@ -328,7 +372,9 @@ class LanguageModelTest(unittest.TestCase):
     self.assertEqual(cache.stats.num_hits, 0)
     self.assertEqual(cache.stats.num_updates, 2)
 
-
+    result = lm('foo')
+    self.assertEqual(result, 'foo')
+    self.assertTrue(result.metadata.is_cached)
     self.assertEqual(lm('bar'), 'bar')
     self.assertEqual(cache.stats.num_queries, 4)
     self.assertEqual(cache.stats.num_hits, 2)
@@ -350,14 +396,15 @@ class LanguageModelTest(unittest.TestCase):
                     cache_seed=0,
                     score=1.0,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=1.0,
                 logprobs=None,
             )
         ],
-        usage=lm_lib.LMSamplingUsage(100, 100, 200),
+        usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
     ),
     lm_lib.LMSamplingResult(
         [
@@ -367,14 +414,15 @@ class LanguageModelTest(unittest.TestCase):
                     cache_seed=0,
                     score=1.0,
                     logprobs=None,
-
+                    is_cached=False,
+                    usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
                     tags=[message_lib.Message.TAG_LM_RESPONSE],
                 ),
                 score=1.0,
                 logprobs=None,
             )
         ],
-        usage=lm_lib.LMSamplingUsage(100, 100, 200),
+        usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
     ),
     ],
 )
```
```diff
@@ -400,13 +448,50 @@ class LanguageModelTest(unittest.TestCase):
 
   def test_retry(self):
     lm = MockModel(
-        failures_before_attempt=1, top_k=1,
+        failures_before_attempt=1, top_k=1, max_attempts=2, retry_interval=1
     )
     with self.assertRaisesRegex(
         concurrent.RetryError, 'Calling .* failed after 1 attempts'
     ):
       lm('foo', max_attempts=1)
-
+
+    usage = lm_lib.LMSamplingUsage(
+        prompt_tokens=100,
+        completion_tokens=100,
+        total_tokens=200,
+        num_requests=1,
+        estimated_cost=1.0,
+        retry_stats=lm_lib.RetryStats(
+            num_occurences=1,
+            total_wait_interval=1,
+            errors={'ValueError': 1},
+        ),
+    )
+    out = lm.sample(['foo'])
+    self.assertEqual(
+        # lm.sample(['foo'], max_attempts=2),
+        out,
+        [
+            lm_lib.LMSamplingResult(
+                [
+                    lm_lib.LMSample(
+                        message_lib.AIMessage(
+                            'foo',
+                            score=-1.0,
+                            logprobs=None,
+                            is_cached=False,
+                            usage=usage,
+                            tags=['lm-response'],
+                        ),
+                        score=-1.0,
+                        logprobs=None,
+                    )
+                ],
+                usage=usage,
+                is_cached=False,
+            )
+        ],
+    )
 
   def test_debug(self):
     class Image(modality.Modality):
```
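`test_retry` above also pins down where retry accounting now lives: each `LMSamplingUsage` carries a `RetryStats` value with an occurrence count, accumulated wait time, and a per-error histogram. A small sketch of reading it back, using only fields that appear in the diff (`num_occurences` is spelled as in the source):

```python
# Sketch grounded in test_retry above: first attempt fails, second succeeds.
lm = MockModel(
    failures_before_attempt=1, top_k=1, max_attempts=2, retry_interval=1
)
out = lm.sample(['foo'])

stats = out[0].usage.retry_stats
print(stats.num_occurences)  # 1 retried error
print(stats.errors)          # {'ValueError': 1}
```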
```diff
@@ -418,8 +503,9 @@ class LanguageModelTest(unittest.TestCase):
     with contextlib.redirect_stdout(string_io):
       self.assertEqual(
           lm(message_lib.UserMessage(
-              'hi
-          'hi
+              'hi <<[[image]]>>', image=Image()), debug=True),
+          'hi <<[[image]]>>'
+      )
 
     debug_info = string_io.getvalue()
     self.assertIn('[0] LM INFO', debug_info)
```
```diff
@@ -508,6 +594,17 @@ class LanguageModelTest(unittest.TestCase):
           ],
       )
 
+      self.assertEqual(
+          lm.score(
+              [message_lib.UserMessage('hi {{image}}', image=Image()),
+               message_lib.UserMessage('hi {{image}}', image=Image())],
+              ['1', '2'], debug=debug_mode),
+          [
+              lm_lib.LMScoringResult(score=-0.0),
+              lm_lib.LMScoringResult(score=-1.0),
+          ],
+      )
+
       debug_info = string_io.getvalue()
       expected_included = [
           debug_prints[f]
```
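Combined with the `_score` signature change in the first hunk (`prompt: message_lib.Message | list[message_lib.Message]`), this addition shows that `score()` now accepts a batch of prompts paired one-to-one with completions; the `test_score_with_unmatched_prompt_and_completions` case in the next hunk confirms that mismatched lengths raise `ValueError`. A sketch of the call shape, grounded in these tests:

```python
# Batched scoring: one prompt per completion (per the tests above, the mock
# scorer returns -0.0, -1.0, ... by completion index).
results = MockScoringModel().score(
    ['hi', 'hello'],
    ['1', '2'],
)
assert [r.score for r in results] == [-0.0, -1.0]

# Mismatched lengths are rejected:
# MockScoringModel().score(['hi'], ['1', '2', '3'])  -> ValueError
```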
```diff
@@ -528,10 +625,73 @@ class LanguageModelTest(unittest.TestCase):
       if debug_mode & lm_lib.LMDebugMode.PROMPT:
         self.assertIn('[0] MODALITY OBJECTS SENT TO LM', debug_info)
 
+  def test_score_with_unmatched_prompt_and_completions(self):
+    with self.assertRaises(ValueError):
+      MockScoringModel().score(['hi',], ['1', '2', '3'])
+
   def test_score_with_unsupported_model(self):
     with self.assertRaises(NotImplementedError):
       MockModel().score('hi', ['1', '2'])
 
+  def test_tokenize(self):
+    info_flag = lm_lib.LMDebugMode.INFO
+    prompt_flag = lm_lib.LMDebugMode.PROMPT
+    response_flag = lm_lib.LMDebugMode.RESPONSE
+    debug_prints = {
+        info_flag: 'LM INFO',
+        prompt_flag: 'PROMPT TO TOKENIZE',
+        response_flag: 'TOKENS RETURNED',
+    }
+    debug_modes = [
+        info_flag,
+        prompt_flag,
+        response_flag,
+        info_flag | prompt_flag,
+        info_flag | response_flag,
+        prompt_flag | response_flag,
+        info_flag | prompt_flag | response_flag,
+    ]
+
+    class Image(modality.Modality):
+      def to_bytes(self):
+        return b'fake_image'
+
+    for debug_mode in debug_modes:
+      string_io = io.StringIO()
+      lm = MockTokenizeModel()
+
+      with contextlib.redirect_stdout(string_io):
+        self.assertEqual(
+            lm.tokenize(
+                message_lib.UserMessage('hi <<[[image]]>>', image=Image()),
+                debug=debug_mode),
+            [('hi', 0), ('<<[[image]]>>', 1)],
+        )
+
+      debug_info = string_io.getvalue()
+      expected_included = [
+          debug_prints[f]
+          for f in lm_lib.LMDebugMode
+          if f != lm_lib.LMDebugMode.NONE and f in debug_mode
+      ]
+      expected_excluded = [
+          debug_prints[f]
+          for f in lm_lib.LMDebugMode
+          if f != lm_lib.LMDebugMode.NONE and f not in debug_mode
+      ]
+
+      for expected_include in expected_included:
+        self.assertIn(expected_include, debug_info)
+      for expected_exclude in expected_excluded:
+        self.assertNotIn(expected_exclude, debug_info)
+
+      if debug_mode & lm_lib.LMDebugMode.PROMPT:
+        self.assertIn('[0] MODALITY OBJECTS SENT TO LM', debug_info)
+
+  def test_tokenize_with_unsupported_model(self):
+    with self.assertRaises(NotImplementedError):
+      MockModel().tokenize('hi')
+
   def test_rate_to_max_concurrency_no_rpm_no_tpm(self) -> None:
     lm = MockModel()
     self.assertEqual(
```
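The tokenization tests added above define the new public surface: `LanguageModel.tokenize()` returns `(token, id)` pairs and raises `NotImplementedError` when the subclass does not implement `_tokenize`. A quick usage sketch, reusing the `MockTokenizeModel` defined earlier in this diff:

```python
lm = MockTokenizeModel()
tokens = lm.tokenize(message_lib.UserMessage('hello world'))
assert tokens == [('hello', 0), ('world', 1)]  # whitespace split, per _tokenize

# Models without a _tokenize override reject the call.
try:
  MockModel().tokenize('hi')
except NotImplementedError:
  pass
```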
```diff
@@ -564,6 +724,260 @@ class LanguageModelTest(unittest.TestCase):
     self.assertEqual(lm.rate_to_max_concurrency(requests_per_min=1), 1)
     self.assertEqual(lm.rate_to_max_concurrency(tokens_per_min=1), 1)
 
+  def test_track_usages(self):
+    lm = MockModel(name='model1')
+    lm2 = MockModel(name='model2')
+    with lm_lib.track_usages() as usages1:
+      _ = lm('hi')
+      with lm_lib.track_usages(lm2) as usages2:
+        with lm_lib.track_usages('model1') as usages3:
+          with lm_lib.track_usages('model1', lm2) as usages4:
+            def call_lm(prompt):
+              _ = lm.sample([prompt] * 2)
+            lm2('hi')
+            list(concurrent.concurrent_map(call_lm, ['hi', 'hello']))
+
+    self.assertEqual(usages2.uncached.breakdown, {
+        'model2': lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
+    })
+    self.assertFalse(usages2.cached)
+    self.assertEqual(usages3.uncached.breakdown, {
+        'model1': lm_lib.LMSamplingUsage(100 * 4, 100 * 4, 200 * 4, 4, 4.0),
+    })
+    self.assertFalse(usages3.cached)
+    self.assertEqual(usages4.uncached.breakdown, {
+        'model1': lm_lib.LMSamplingUsage(100 * 4, 100 * 4, 200 * 4, 4, 4.0),
+        'model2': lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
+    })
+    self.assertFalse(usages4.cached)
+    self.assertEqual(usages1.uncached.breakdown, {
+        'model1': lm_lib.LMSamplingUsage(100 * 5, 100 * 5, 200 * 5, 5, 5.0),
+        'model2': lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
+    })
+    self.assertFalse(usages1.cached)
+    self.assertEqual(
+        usages1.total,
+        lm_lib.LMSamplingUsage(100 * 6, 100 * 6, 200 * 6, 6, 6.0),
+    )
+
+    cache = in_memory.InMemory()
+    lm = MockModel(cache=cache, name='model1')
+    with lm_lib.track_usages() as usages1:
+      _ = lm('hi')
+    self.assertEqual(usages1.uncached.breakdown, {
+        'model1': lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
+    })
+    self.assertFalse(usages1.cached)
+    with lm_lib.track_usages() as usages2:
+      _ = lm('hi')
+    self.assertEqual(usages2.cached.breakdown, {
+        'model1': lm_lib.LMSamplingUsage(100, 100, 200, 1, 0.0),
+    })
+    self.assertFalse(usages2.uncached)
+
+
+class LMSamplingUsageTest(unittest.TestCase):
+
+  def test_basics(self):
+    usage = lm_lib.LMSamplingUsage(100, 200, 300, 4, 5.0)
+    self.assertEqual(usage.num_requests, 4)
+    self.assertEqual(usage.prompt_tokens, 100)
+    self.assertEqual(usage.completion_tokens, 200)
+    self.assertEqual(usage.total_tokens, 300)
+    self.assertEqual(usage.estimated_cost, 5.0)
+    self.assertEqual(usage.average_prompt_tokens, 25)
+    self.assertEqual(usage.average_completion_tokens, 50)
+    self.assertEqual(usage.average_total_tokens, 75)
+    self.assertEqual(usage.average_estimated_cost, 1.25)
+
+  def test_add(self):
+    usage1 = lm_lib.LMSamplingUsage(100, 200, 300, 4, 5.0)
+    usage1.rebind(retry_stats=lm_lib.RetryStats(1, 3, 4, {'e1': 1}))
+    usage2 = lm_lib.LMSamplingUsage(100, 200, 300, 4, 5.0)
+    self.assertEqual(usage1 + usage2, usage1 + usage2)
+    self.assertIs(usage1 + None, usage1)
+    self.assertIs(None + usage1, usage1)
+    usage3 = lm_lib.LMSamplingUsage(100, 200, 300, 4, None)
+    usage3.rebind(retry_stats=lm_lib.RetryStats(2, 4, 5, {'e1': 2, 'e2': 3}))
+    self.assertEqual(
+        usage1 + usage3,
+        lm_lib.LMSamplingUsage(
+            200,
+            400,
+            600,
+            8,
+            5.0,
+            retry_stats=lm_lib.RetryStats(3, 7, 9, {'e1': 3, 'e2': 3}),
+        ),
+    )
+    self.assertEqual(
+        usage3 + usage1,
+        lm_lib.LMSamplingUsage(
+            200,
+            400,
+            600,
+            8,
+            5.0,
+            retry_stats=lm_lib.RetryStats(3, 7, 9, {'e1': 3, 'e2': 3}),
+        ),
+    )
+
+  def test_usage_not_available(self):
+    usage_not_available = lm_lib.UsageNotAvailable()
+    self.assertEqual(usage_not_available.prompt_tokens, 0)
+    self.assertEqual(usage_not_available.completion_tokens, 0)
+    self.assertEqual(usage_not_available.total_tokens, 0)
+    self.assertEqual(usage_not_available.average_prompt_tokens, 0)
+    self.assertEqual(usage_not_available.average_completion_tokens, 0)
+    self.assertEqual(usage_not_available.average_total_tokens, 0)
+    self.assertIsNone(usage_not_available.average_estimated_cost)
+    self.assertTrue(usage_not_available)
+    self.assertEqual(
+        usage_not_available + lm_lib.LMSamplingUsage(1, 2, 3, 4, 5.0),
+        lm_lib.UsageNotAvailable(num_requests=5)
+    )
+    self.assertEqual(
+        lm_lib.LMSamplingUsage(1, 2, 3, 4, 5.0) + usage_not_available,
+        lm_lib.UsageNotAvailable(num_requests=5)
+    )
+    self.assertIs(None + usage_not_available, usage_not_available)
+    self.assertIs(usage_not_available + None, usage_not_available)
+
+
+class UsageSummaryTest(unittest.TestCase):
+
+  def test_basics(self):
+    usage_summary = lm_lib.UsageSummary()
+    self.assertFalse(usage_summary.total)
+    self.assertFalse(usage_summary.cached)
+    self.assertFalse(usage_summary.uncached)
+
+    # Add uncached.
+    usage_summary.add(
+        'model1', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), False
+    )
+    self.assertEqual(
+        usage_summary.total, lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0)
+    )
+    self.assertEqual(
+        usage_summary.uncached.total, lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0)
+    )
+    # Add cached.
+    self.assertFalse(usage_summary.cached)
+    usage_summary.add(
+        'model1', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), True
+    )
+    self.assertEqual(
+        usage_summary.total, lm_lib.LMSamplingUsage(2, 4, 6, 2, 5.0)
+    )
+    self.assertEqual(
+        usage_summary.cached.total, lm_lib.LMSamplingUsage(1, 2, 3, 1, 0.0)
+    )
+    # Add UsageNotAvailable.
+    usage_summary.add(
+        'model1', lm_lib.UsageNotAvailable(num_requests=1), False
+    )
+    self.assertEqual(
+        usage_summary.total, lm_lib.UsageNotAvailable(num_requests=3)
+    )
+    self.assertEqual(
+        usage_summary.uncached.total, lm_lib.UsageNotAvailable(num_requests=2)
+    )
+
+  def test_merge(self):
+    usage_summary = lm_lib.UsageSummary()
+    usage_summary.add(
+        'model1', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), False
+    )
+    usage_summary.add(
+        'model2', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), False
+    )
+    usage_summary.add(
+        'model1', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), False
+    )
+    usage_summary2 = lm_lib.UsageSummary()
+    usage_summary2.add(
+        'model1', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), False
+    )
+    usage_summary2.add(
+        'model3', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), False
+    )
+    usage_summary2.merge(usage_summary)
+    self.assertEqual(
+        usage_summary2,
+        lm_lib.UsageSummary(
+            cached=lm_lib.UsageSummary.AggregatedUsage(
+                total=lm_lib.LMSamplingUsage(
+                    prompt_tokens=0,
+                    completion_tokens=0,
+                    total_tokens=0,
+                    num_requests=0,
+                    estimated_cost=0.0,
+                ),
+                breakdown={}
+            ),
+            uncached=lm_lib.UsageSummary.AggregatedUsage(
+                total=lm_lib.LMSamplingUsage(
+                    prompt_tokens=5,
+                    completion_tokens=10,
+                    total_tokens=15,
+                    num_requests=5,
+                    estimated_cost=25.0
+                ),
+                breakdown=dict(
+                    model1=lm_lib.LMSamplingUsage(
+                        prompt_tokens=3,
+                        completion_tokens=6,
+                        total_tokens=9,
+                        num_requests=3,
+                        estimated_cost=15.0
+                    ),
+                    model3=lm_lib.LMSamplingUsage(
+                        prompt_tokens=1,
+                        completion_tokens=2,
+                        total_tokens=3,
+                        num_requests=1,
+                        estimated_cost=5.0
+                    ),
+                    model2=lm_lib.LMSamplingUsage(
+                        prompt_tokens=1,
+                        completion_tokens=2,
+                        total_tokens=3,
+                        num_requests=1,
+                        estimated_cost=5.0
+                    )
+                )
+            )
+        )
+    )
+
+  def test_html_view(self):
+    usage_summary = lm_lib.UsageSummary()
+    usage_summary.add(
+        'model1', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), False
+    )
+    self.assertIn(
+        '5.000',
+        usage_summary.to_html(extra_flags=dict(as_badge=True)).content
+    )
+    usage_summary.add(
+        'model1', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), False
+    )
+    self.assertIn(
+        '10.000',
+        usage_summary.to_html(
+            extra_flags=dict(as_badge=True, interactive=True)
+        ).content
+    )
+    self.assertTrue(
+        usage_summary.to_html().content.startswith('<details open')
+    )
+    with pg.views.html.controls.HtmlControl.track_scripts() as scripts:
+      usage_summary.add(
+          'model2', lm_lib.LMSamplingUsage(1, 2, 3, 1, 5.0), False
+      )
+    self.assertEqual(len(scripts), 4)
+
 
 if __name__ == '__main__':
   unittest.main()
```
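For readers skimming this last hunk, the new usage-tracking API exercised by `test_track_usages` reduces to a small pattern: trackers can be opened for all models, for specific model ids, or for model instances, and they nest; results are split into `cached` and `uncached` aggregates with per-model breakdowns. A minimal sketch using the same `lm_lib` alias and the `MockModel` from this file:

```python
lm = MockModel(name='model1')

with lm_lib.track_usages() as all_usage:           # track every model
  with lm_lib.track_usages('model1') as m1_usage:  # or filter by model id
    lm('hi')

# Per-model breakdown of uncached calls (MockModel reports 100/100/200
# tokens and an estimated cost of 1.0 per request).
print(m1_usage.uncached.breakdown['model1'].total_tokens)  # 200
print(all_usage.total.estimated_cost)                      # 1.0
```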