langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512040805__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. langfun/__init__.py +1 -1
  2. langfun/core/__init__.py +7 -1
  3. langfun/core/agentic/__init__.py +8 -1
  4. langfun/core/agentic/action.py +740 -112
  5. langfun/core/agentic/action_eval.py +9 -2
  6. langfun/core/agentic/action_test.py +189 -24
  7. langfun/core/async_support.py +104 -5
  8. langfun/core/async_support_test.py +23 -0
  9. langfun/core/coding/python/correction.py +19 -9
  10. langfun/core/coding/python/execution.py +14 -12
  11. langfun/core/coding/python/generation.py +21 -16
  12. langfun/core/coding/python/sandboxing.py +23 -3
  13. langfun/core/component.py +42 -3
  14. langfun/core/concurrent.py +70 -6
  15. langfun/core/concurrent_test.py +9 -2
  16. langfun/core/console.py +1 -1
  17. langfun/core/data/conversion/anthropic.py +12 -3
  18. langfun/core/data/conversion/anthropic_test.py +8 -6
  19. langfun/core/data/conversion/gemini.py +11 -2
  20. langfun/core/data/conversion/gemini_test.py +48 -9
  21. langfun/core/data/conversion/openai.py +145 -31
  22. langfun/core/data/conversion/openai_test.py +161 -17
  23. langfun/core/eval/base.py +48 -44
  24. langfun/core/eval/base_test.py +5 -5
  25. langfun/core/eval/matching.py +5 -2
  26. langfun/core/eval/patching.py +3 -3
  27. langfun/core/eval/scoring.py +4 -3
  28. langfun/core/eval/v2/__init__.py +2 -0
  29. langfun/core/eval/v2/checkpointing.py +76 -7
  30. langfun/core/eval/v2/checkpointing_test.py +9 -2
  31. langfun/core/eval/v2/config_saver.py +37 -0
  32. langfun/core/eval/v2/config_saver_test.py +36 -0
  33. langfun/core/eval/v2/eval_test_helper.py +104 -3
  34. langfun/core/eval/v2/evaluation.py +92 -17
  35. langfun/core/eval/v2/evaluation_test.py +9 -3
  36. langfun/core/eval/v2/example.py +50 -40
  37. langfun/core/eval/v2/example_test.py +16 -8
  38. langfun/core/eval/v2/experiment.py +84 -15
  39. langfun/core/eval/v2/experiment_test.py +19 -0
  40. langfun/core/eval/v2/metric_values.py +31 -3
  41. langfun/core/eval/v2/metric_values_test.py +32 -0
  42. langfun/core/eval/v2/metrics.py +157 -44
  43. langfun/core/eval/v2/metrics_test.py +39 -18
  44. langfun/core/eval/v2/progress.py +31 -1
  45. langfun/core/eval/v2/progress_test.py +27 -0
  46. langfun/core/eval/v2/progress_tracking.py +13 -5
  47. langfun/core/eval/v2/progress_tracking_test.py +9 -1
  48. langfun/core/eval/v2/reporting.py +90 -71
  49. langfun/core/eval/v2/reporting_test.py +24 -6
  50. langfun/core/eval/v2/runners/__init__.py +30 -0
  51. langfun/core/eval/v2/{runners.py → runners/base.py} +72 -180
  52. langfun/core/eval/v2/runners/beam.py +354 -0
  53. langfun/core/eval/v2/runners/beam_test.py +153 -0
  54. langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
  55. langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
  56. langfun/core/eval/v2/runners/debug.py +40 -0
  57. langfun/core/eval/v2/runners/debug_test.py +76 -0
  58. langfun/core/eval/v2/runners/parallel.py +243 -0
  59. langfun/core/eval/v2/runners/parallel_test.py +182 -0
  60. langfun/core/eval/v2/runners/sequential.py +47 -0
  61. langfun/core/eval/v2/runners/sequential_test.py +169 -0
  62. langfun/core/langfunc.py +45 -130
  63. langfun/core/langfunc_test.py +7 -5
  64. langfun/core/language_model.py +189 -36
  65. langfun/core/language_model_test.py +54 -3
  66. langfun/core/llms/__init__.py +12 -1
  67. langfun/core/llms/anthropic.py +157 -2
  68. langfun/core/llms/azure_openai.py +29 -17
  69. langfun/core/llms/cache/base.py +25 -3
  70. langfun/core/llms/cache/in_memory.py +48 -7
  71. langfun/core/llms/cache/in_memory_test.py +14 -4
  72. langfun/core/llms/compositional.py +25 -1
  73. langfun/core/llms/deepseek.py +30 -2
  74. langfun/core/llms/fake.py +32 -1
  75. langfun/core/llms/gemini.py +64 -12
  76. langfun/core/llms/gemini_test.py +110 -0
  77. langfun/core/llms/google_genai.py +34 -1
  78. langfun/core/llms/groq.py +28 -3
  79. langfun/core/llms/llama_cpp.py +23 -4
  80. langfun/core/llms/openai.py +120 -3
  81. langfun/core/llms/openai_compatible.py +148 -27
  82. langfun/core/llms/openai_compatible_test.py +207 -20
  83. langfun/core/llms/openai_test.py +0 -2
  84. langfun/core/llms/rest.py +16 -1
  85. langfun/core/llms/vertexai.py +58 -8
  86. langfun/core/logging.py +1 -1
  87. langfun/core/mcp/__init__.py +10 -0
  88. langfun/core/mcp/client.py +177 -0
  89. langfun/core/mcp/client_test.py +71 -0
  90. langfun/core/mcp/session.py +241 -0
  91. langfun/core/mcp/session_test.py +54 -0
  92. langfun/core/mcp/testing/simple_mcp_client.py +33 -0
  93. langfun/core/mcp/testing/simple_mcp_server.py +33 -0
  94. langfun/core/mcp/tool.py +254 -0
  95. langfun/core/mcp/tool_test.py +197 -0
  96. langfun/core/memory.py +1 -0
  97. langfun/core/message.py +160 -55
  98. langfun/core/message_test.py +65 -81
  99. langfun/core/modalities/__init__.py +8 -0
  100. langfun/core/modalities/audio.py +21 -1
  101. langfun/core/modalities/image.py +73 -3
  102. langfun/core/modalities/image_test.py +116 -0
  103. langfun/core/modalities/mime.py +64 -3
  104. langfun/core/modalities/mime_test.py +11 -0
  105. langfun/core/modalities/pdf.py +19 -1
  106. langfun/core/modalities/video.py +21 -1
  107. langfun/core/modality.py +167 -29
  108. langfun/core/modality_test.py +42 -12
  109. langfun/core/natural_language.py +1 -1
  110. langfun/core/sampling.py +4 -4
  111. langfun/core/sampling_test.py +20 -4
  112. langfun/core/structured/__init__.py +2 -24
  113. langfun/core/structured/completion.py +34 -44
  114. langfun/core/structured/completion_test.py +23 -43
  115. langfun/core/structured/description.py +54 -50
  116. langfun/core/structured/function_generation.py +29 -12
  117. langfun/core/structured/mapping.py +81 -37
  118. langfun/core/structured/parsing.py +95 -79
  119. langfun/core/structured/parsing_test.py +0 -3
  120. langfun/core/structured/querying.py +230 -154
  121. langfun/core/structured/querying_test.py +69 -33
  122. langfun/core/structured/schema/__init__.py +49 -0
  123. langfun/core/structured/schema/base.py +664 -0
  124. langfun/core/structured/schema/base_test.py +531 -0
  125. langfun/core/structured/schema/json.py +174 -0
  126. langfun/core/structured/schema/json_test.py +121 -0
  127. langfun/core/structured/schema/python.py +316 -0
  128. langfun/core/structured/schema/python_test.py +410 -0
  129. langfun/core/structured/schema_generation.py +33 -14
  130. langfun/core/structured/scoring.py +47 -36
  131. langfun/core/structured/tokenization.py +26 -11
  132. langfun/core/subscription.py +2 -2
  133. langfun/core/template.py +175 -50
  134. langfun/core/template_test.py +123 -17
  135. langfun/env/__init__.py +43 -0
  136. langfun/env/base_environment.py +827 -0
  137. langfun/env/base_environment_test.py +473 -0
  138. langfun/env/base_feature.py +304 -0
  139. langfun/env/base_feature_test.py +228 -0
  140. langfun/env/base_sandbox.py +842 -0
  141. langfun/env/base_sandbox_test.py +1235 -0
  142. langfun/env/event_handlers/__init__.py +14 -0
  143. langfun/env/event_handlers/chain.py +233 -0
  144. langfun/env/event_handlers/chain_test.py +253 -0
  145. langfun/env/event_handlers/event_logger.py +472 -0
  146. langfun/env/event_handlers/event_logger_test.py +304 -0
  147. langfun/env/event_handlers/metric_writer.py +726 -0
  148. langfun/env/event_handlers/metric_writer_test.py +214 -0
  149. langfun/env/interface.py +1640 -0
  150. langfun/env/interface_test.py +153 -0
  151. langfun/env/load_balancers.py +59 -0
  152. langfun/env/load_balancers_test.py +141 -0
  153. langfun/env/test_utils.py +507 -0
  154. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/METADATA +7 -3
  155. langfun-0.1.2.dev202512040805.dist-info/RECORD +217 -0
  156. langfun/core/eval/v2/runners_test.py +0 -343
  157. langfun/core/structured/schema.py +0 -987
  158. langfun/core/structured/schema_test.py +0 -982
  159. langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
  160. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/WHEEL +0 -0
  161. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/licenses/LICENSE +0 -0
  162. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/top_level.txt +0 -0
langfun/core/langfunc.py CHANGED
@@ -32,146 +32,43 @@ _TLS_LFUN_CALL_STACK = '_langfunc_callstack'
  # NOTE(daiyip): Only the template string belongs to the positional arguments,
  # all others are keyword-only for clarity.
  @pg.use_init_args(['template_str'])
- class LangFunc(
- template_lib.Template,
- ):
- r"""Base class for natural-language driven component.
-
- ``LangFunc`` is a language-driven component that enables users to
- seamlessly interact with Language Models (LLMs) using a blend of natural
- language and code. It empowers users to easily modularize prompt/execution
- logics, compose them, and simplify the creation of Language Model (LLM)-based
- components and applications.
-
- LangFunc can be conceptualized as a string template with embeddable code,
- but it distinguishes itself from traditional template systems in four key
- ways.
-
- Firstly, it enables easy modularization of templates along with the required
- values with OO principles, providing a reusable way for LLM-based content
- generation. For example:
-
- ```
- class FewshotExamples(lf.LangFunc):
- '''Base for fewshot prompt.
-
- {% for example in examples %}
- {{ example }}
- {% endfor %}
- '''
-
- # Usage 1: __init__ time binding.
- assert FewshotPrompt(examples=['foo', 'bar'])() == 'foo\nbar'
-
- # Usage 2: __call__ time binding.
- assert FewshotPrompt()(examples=['foo', 'bar']) == 'foo\nbar'
-
- class ToolDescription(lf.LangFunc):
- '''Tool descriptions.
-
- {% for tool in tools %}
- {{ tool.description }}
- {% endfor %}
- '''
- # We want to constrain tools to be a list of `Tool` objects.
- tools: list[Tool]
-
- # Raises: runtime type checking will fail on [1, 2, 3].
- ToolDescription(tools=[1, 2, 3])
- ```
-
- Secondly, it has the capability to compose multiple LangFuncs together,
- enabling the accomplishment of complex language tasks with maximum reuse.
- It allows users to provide program inputs to all the LangFuncs within a
- composition at the top level, significantly simplifying the process of
- providing context for users. For example:
-
- ```
- class ReAct(lf.LangFunc):
- '''ReAct prompt for tool-use.
-
- {{ preamble }}
- {{ tool_description }}
- {{ tool_examples }}
- {{ user_input }}
- '''
- # Default preamble, which could be overriden from subclass
- # or parsed from the `__init__` argument.
- preamble = 'Please help me on my task based on the following tools.',
-
- react = ReAct(
- tool_description=ToolDescription()
- tool_examples=FewshotExamples(),
- # Partially bind `tools` and `examples`.
- tools=my_tools,
- examples=[t.examples for t in my_tools]
- )
-
- # Late bind `user_input` at __call__ time.
- react(user_input='Help me get a lunch to go, veggie please.' )
- ```
-
- Thirdly, it allows the flexibility to encapsulate complex compositions to
- reusable classes and modify them. For example:
-
- ```
- # The compound decorator converts a function into a LangFunc.
- @lf.compound
- def react_with_tools(preamble, tools: list[Tool]):
- return ReAct(
- preamble=preamble,
- tool_description=ToolDescription()
- tool_examples=FewshotExamples(),
- # Partially bind `tools` and `examples`.
- tools=my_tools,
- examples=[t.examples for t in my_tools]
- )
+ class LangFunc(template_lib.Template):
+ r"""Base class for Language-based functions.
 
- # Actually, the entire chat application is a LangFunc.
- class Chat(lt.LangFunc):
- '''LLM-based Chat application.
+ LangFunc represents a function powered by a language model. It is a subclass
+ of `lf.Template` and can be thought of as a `lf.Template` augmented with an LM
+ and an output transformation. Calling a `lf.LangFunc` is equivalent to calling
+ the LM with the rendered prompt and transforming the output.
 
- llm({{ prompt }})
- '''
+ LangFunc can be directly constructed and used.
 
- chat = Chat(
- llm=Bard24B(),
- prompt=react_with_tools(
- preamble=(
- f'Please help me solve my problem using tools. '
- f'Current time is {{datetime.datetime.now()}}'),
- tools=my_tools))
+ ```python
+ import langfun as lf
 
- chat(user_input='Help me get a lunch to go, veggie please.')
- ```
+ func = lf.LangFunc("Hello, {{name}}!")
+ print(func(name="Gemini", lm=lf.llms.Gemini25Flash()))
+ # Output: Hello, how are you today?
+ ```
 
- Fourthly, LangFunc is built on top of PyGlove symbolic programming power,
- it could be manipulated programmatically, turned into a space for data
- sampling, or even tuned by AutoML. For example:
+ Or it can be subclassed:
 
- ```
- import pyglove as pg
+ ```python
+ import langfun as lf
 
- prompt_space = react_with_tools(
- preamble=pg.oneof([
- 'Help me solve my problem using the following tools:',
- 'Help me with the tools below:',
- ...
- ])
- # Choose any two of the tools for generating data.
- tools=pg.manyof(2, [
- google_search(...),
- doordash(...),
- ...
- ])
+ class Compute(lf.LangFunc):
+ '''Compute a simple arithmetic expression.
 
- for prompt in pg.random_sample(prompt_space):
- print(prompt(user_input='Help me book a conf room please.'))
+ {{expression}} = ?
+ '''
+ expression: str
 
- ```
+ def transform_output(self, lm_output: lf.Message) -> lf.Message:
+ lm_output.metadata.result = float(lm_output.text)
+ return lm_output
 
- For more capabilities on symbolic programming with PyGlove, please checkout
- https://pyglove.readthedocs.io/en/latest/.
+ r = Compute(expression="1 + 1")(lm=lf.llms.Gemini25Flash())
+ print(r.result)
+ # Output: 2.0
 
  Final note: always include these capitalized words if you don't want to treat
  the docstr as the template str: THIS IS NOT A TEMPLATE. So as a result, this
@@ -305,6 +202,24 @@ class LangFunc(
  message_cls: Type[message_lib.Message] = message_lib.UserMessage,
  **kwargs,
  ) -> message_lib.Message:
+ """Renders the template and transforms it as LM input message.
+
+ Args:
+ allow_partial: If True, allows partial rendering, which leaves unresolved
+ variables in place in the output text. Otherwise, raises error when
+ there are unresolved variables.
+ implicit: If True, reuse the rendering output if a parent `lf.Template`
+ is rendering current `lf.Template` multiple times. This is important
+ for making sure all references to the same `lf.Template` within a single
+ top-level rendering would return the same result. If False, every call
+ to `render` will trigger the actual rendering process.
+ message_cls: The message class used for creating the return value.
+ **kwargs: Values for template variables, which override values from
+ member attributes or context.
+
+ Returns:
+ A Message object containing the rendered result.
+ """
  lm_input = super().render(
  allow_partial=allow_partial,
  implicit=implicit,
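A minimal sketch of the behavior the new `render` docstring describes; the template string and variable name are illustrative, and the exact text of a partially rendered output is an assumption:

```python
import langfun as lf

func = lf.LangFunc('Hello, {{name}}!')

# With allow_partial=True, unresolved variables are left in place in the
# output text instead of raising an error (assumed output form).
print(func.render(allow_partial=True))  # e.g. 'Hello, {{name}}!'

# Keyword values passed to render() override member attributes and context.
print(func.render(name='Gemini'))       # 'Hello, Gemini!'
```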
langfun/core/langfunc_test.py CHANGED
@@ -82,7 +82,7 @@ class LangFuncCallTest(unittest.TestCase):
 
  i = l.render()
  self.assertEqual(i, 'Hello')
- self.assertEqual(i, message.UserMessage('Hello'))
+ self.assertEqual(i, message.UserMessage('Hello', __template_input__={}))
  self.assertEqual(i.tags, ['rendered'])
 
  r = l()
@@ -96,7 +96,9 @@ class LangFuncCallTest(unittest.TestCase):
  self.assertEqual(r.tags, ['lm-response', 'lm-output'])
  self.assertEqual(
  r.source,
- message.UserMessage('Hello', metadata=dict(cache_seed=0))
+ message.UserMessage(
+ 'Hello', metadata=dict(cache_seed=0, __template_input__={})
+ )
  )
  self.assertEqual(r.source.tags, ['rendered', 'lm-input'])
 
@@ -107,9 +109,9 @@ class LangFuncCallTest(unittest.TestCase):
  ' lm=ExcitedEchoer(sampling_options=LMSamplingOptions(temperature=None,'
  ' max_tokens=None, n=1, top_k=40, top_p=None, stop=None,'
  ' random_seed=None, logprobs=False, top_logprobs=None,'
- ' max_thinking_tokens=None, reasoning_effort=None), cache=None,'
- ' max_concurrency=None, timeout=120.0, max_attempts=5,'
- ' retry_interval=(5, 60), exponential_backoff=True,'
+ ' max_thinking_tokens=None, thinking_level=None, reasoning_effort=None,'
+ ' extras={}), cache=None, max_concurrency=None, timeout=120.0,'
+ ' max_attempts=5, retry_interval=(5, 60), exponential_backoff=True,'
  ' max_retry_interval=300, debug=False))',
  )
 
langfun/core/language_model.py CHANGED
@@ -53,6 +53,10 @@ class RetryableLMError(LMError):
  """Base class for LLM errors that can be solved by retrying."""
 
 
+ class EmptyGenerationError(RetryableLMError):
+ """Error for empty generaition."""
+
+
  class RateLimitError(RetryableLMError):
  """Error for rate limit reached."""
 
@@ -478,7 +482,7 @@ class UsageNotAvailable(LMSamplingUsage):
 
 
  class LMSamplingResult(pg.Object):
- """Language model response."""
+ """The result from a language model sampling."""
 
  samples: Annotated[
  list[LMSample],
@@ -575,6 +579,14 @@ class LMSamplingOptions(component.Component):
  int | None, 'Number of max thinking tokens.'
  ] = None
 
+ thinking_level: Annotated[
+ Literal['low', 'high'] | None,
+ (
+ 'Thinking level for Gemini models. High is for complex tasks, '
+ 'while low is for faster responses.'
+ ),
+ ] = None
+
 
  reasoning_effort: Annotated[
  Literal['low', 'medium', 'high'] | None,
@@ -584,6 +596,15 @@ class LMSamplingOptions(component.Component):
  ),
  ] = None
 
+ extras: Annotated[
+ dict[str, Any],
+ (
+ 'Extra arguments (e.g. configuration for tool calls) to pass to '
+ 'the model. This is model-specific, please check model '
+ 'implementation to see how to use this.'
+ ),
+ ] = {}
+
  def cache_key(self) -> tuple[Any, ...]:
  """Returns a tuple of current values as cache key."""
  return (
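A sketch of how the new `thinking_level` and `extras` fields might be supplied, assuming `LMSamplingOptions` remains exposed as `lf.LMSamplingOptions`; the `extras` key below is hypothetical, since supported keys are model-specific per the annotation above:

```python
import langfun as lf

# Sampling options can be bound at model construction time, as before.
lm = lf.llms.Gemini25Flash(
    sampling_options=lf.LMSamplingOptions(
        temperature=0.0,
        thinking_level='high',          # 'low' | 'high' | None
        extras={'example_flag': True},  # hypothetical, model-specific key
    )
)
print(lm('1 + 1 ='))
```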
@@ -672,13 +693,91 @@ class LMDebugMode(enum.IntFlag):
 
 
  class LanguageModel(component.Component):
- """Interface of a language model.
-
- Language models are at the center of LLM-based agents. ``LanguageModel``
- is the interface to interact with different language modles.
-
- In langfun, users can use different language models with the same agents,
- allowing fast prototype, as well as side-by-side comparisons.
+ """Interface for language model.
+
+ `lf.LanguageModel` is the cornerstone of Langfun, providing a consistent
+ interface for interacting with various language models, such as those from
+ Google, OpenAI, Anthropic, and more. It abstracts away provider-specific
+ details, allowing users to switch between models seamlessly.
+
+ All language models in Langfun can be accessed via `lf.llms`. For example,
+ `lf.llms.Gpt4()` creates an instance for OpenAI's GPT-4, and
+ `lf.llms.GeminiPro()` creates an instance for Google's Gemini Pro.
+
+ **Key Features:**
+
+ * **Unified API**: Provides `sample`, `score`, and `tokenize` methods
+ across all supported models.
+ * **Sampling**: The `__call__` method and `sample` method allow generating
+ text completions or chat responses.
+ * **Scoring**: The `score` method computes the likelihood of completions
+ given a prompt.
+ * **Tokenization**: The `tokenize` method breaks text into tokens
+ according to the model's tokenizer.
+ * **Caching**: Built-in support for caching LLM requests to save cost and
+ time via the `cache` attribute.
+ * **Concurrency**: Manages concurrency to respect API rate limits via
+ `max_concurrency`.
+ * **Retries**: Automatic retries with exponential backoff for transient
+ errors via `max_attempts` and `retry_interval`.
+
+ **1. Creating a Language Model:**
+ You can create a language model by instantiating its class or by using
+ `lf.LanguageModel.get`:
+
+ ```python
+ # Direct instantiation
+ gpt4 = lf.llms.Gpt4()
+ gemini = lf.llms.GeminiPro()
+
+ # Creation via lf.LanguageModel.get()
+ gpt4 = lf.LanguageModel.get('gpt-4')
+ ```
+
+ **2. Customizing Sampling Options:**
+ Sampling options like `temperature`, `max_tokens`, etc., can be customized
+ at model creation, or overridden at call time or via `lf.context`.
+
+ ```python
+ # Set temperature to 0 at model creation
+ lm = lf.llms.Gpt4(temperature=0.0)
+
+ # Override temperature to 0.5 for a single call
+ response = lm('1 + 1 =', temperature=0.5)
+
+ # Override temperature to 1.0 using lf.context
+ with lf.context(temperature=1.0):
+ response = lm('1 + 1 =')
+ ```
+
+ **3. Sampling:**
+ Use `lm()`, `lm.sample()`, or `lf.query()` to generate text:
+
+ ```python
+ lm = lf.llms.Gpt4()
+ response = lm('1 + 1 =')
+ print(response.text)
+ # Output: 2
+ ```
+
+ **4. Scoring:**
+ Use `lm.score()` to score completions:
+
+ ```python
+ lm = lf.llms.Gpt4()
+ results = lm.score('Weather in SF is', completions=['sunny', 'cloudy'])
+ print(results[0].score)
+ # Output: -1.0
+ ```
+
+ **5. Tokenization:**
+ Use `lm.tokenize()` to get tokens:
+ ```python
+ lm = lf.llms.Gpt4()
+ tokens = lm.tokenize('hello world')
+ print(tokens)
+ # Output: [('hello', 15339), (' world', 1917)]
+ ```
  """
 
  sampling_options: LMSamplingOptions = LMSamplingOptions()
@@ -989,10 +1088,32 @@ class LanguageModel(component.Component):
  prompts = [message_lib.UserMessage.from_value(p) for p in prompts]
 
  with component.context(override_attrs=True, **kwargs):
- if self.cache is None:
- results = self._sample(prompts)
- else:
- results = self._sample_with_cache_lookup(prompts, cache_seed)
+
+ def _sample_with_retry():
+ if self.cache is None:
+ results = self._sample(prompts)
+ else:
+ results = self._sample_with_cache_lookup(prompts, cache_seed)
+
+ for i, result in enumerate(results):
+ for sample in result.samples:
+ if not sample.response.text:
+ if self.cache is not None:
+ self.cache.delete(self, prompts[i], seed=cache_seed)
+ raise EmptyGenerationError(
+ f'Empty generation encountered from model {self.model_id}.'
+ )
+ return results
+
+ retry_fn = concurrent.with_retry(
+ _sample_with_retry,
+ retry_on_errors=EmptyGenerationError,
+ max_attempts=self.max_attempts,
+ retry_interval=self.retry_interval,
+ exponential_backoff=self.exponential_backoff,
+ max_retry_interval=self.max_retry_interval,
+ )
+ results = retry_fn()
 
  for prompt, result in zip(prompts, results):
 
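In effect, an empty response now raises `EmptyGenerationError` (a `RetryableLMError`), evicts any cached entry for that prompt, and is retried under the model's existing retry settings. A sketch of how this surfaces to callers (the model choice is illustrative):

```python
import langfun as lf

# The retry knobs that already exist on LanguageModel now also govern
# retries on empty generations; if every attempt returns empty text, the
# call fails with a RetryError wrapping EmptyGenerationError.
lm = lf.llms.Gemini25Flash(max_attempts=3, retry_interval=(5, 60))
print(lm('1 + 1 ='))
```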
@@ -1001,7 +1122,6 @@ class LanguageModel(component.Component):
 
  for sample in result.samples:
  # Update metadata for response message.
-
  response = sample.response
  response.metadata.score = sample.score
  response.metadata.logprobs = sample.logprobs
@@ -1159,21 +1279,35 @@ class LanguageModel(component.Component):
  ) -> message_lib.Message:
  """Returns the first candidate."""
  prompt = message_lib.UserMessage.from_value(prompt)
- with component.context(override_attrs=True, **kwargs):
- sampling_options = self.sampling_options
- if sampling_options.n != 1:
- sampling_options = sampling_options.clone(override=dict(n=1))
-
- call_counter = self._call_counter
- self._call_counter += 1
- request_start = time.time()
- result = self.sample(
- [prompt], sampling_options=sampling_options, cache_seed=cache_seed
- )[0]
- elapse = time.time() - request_start
- response = result.samples[0].response
- self._debug(prompt, response, call_counter, result.usage, elapse)
- return response
+ start_time = time.time()
+ error_tag = ''
+ try:
+ with component.context(override_attrs=True, **kwargs):
+ sampling_options = self.sampling_options
+ if sampling_options.n != 1:
+ sampling_options = sampling_options.clone(override=dict(n=1))
+
+ call_counter = self._call_counter
+ self._call_counter += 1
+ request_start = time.time()
+ result = self.sample(
+ [prompt], sampling_options=sampling_options, cache_seed=cache_seed
+ )[0]
+ elapse = time.time() - request_start
+ response = result.samples[0].response
+ self._debug(prompt, response, call_counter, result.usage, elapse)
+ return response
+ except BaseException as e:
+ error_tag = pg.ErrorInfo.from_exception(e).tag
+ raise e
+ finally:
+ _METRICS.language_model_calls.increment(
+ model=self.model_id, error=error_tag
+ )
+ _METRICS.language_model_call_duration_ms.record(
+ int((time.time() - start_time) * 1000),
+ model=self.model_id, error=error_tag,
+ )
 
  def _debug(
  self,
@@ -1230,11 +1364,11 @@ class LanguageModel(component.Component):
  title=f'\n[{call_counter}] PROMPT SENT TO LM{title_suffix}:',
  color='green',
  )
- referred_modalities = prompt.referred_modalities()
- if referred_modalities:
+ if prompt.referred_modalities:
  console.write(
  pg.object_utils.kvlist_str(
- [(k, repr(v), None) for k, v in referred_modalities.items()]
+ [(k, repr(v), None)
+ for k, v in prompt.referred_modalities.items()]
  ),
  title=f'\n[{call_counter}] MODALITY OBJECTS SENT TO LM:',
  color='green',
@@ -1320,9 +1454,9 @@ class LanguageModel(component.Component):
  color='green',
  )
  if isinstance(prompt, list):
- referred_modalities_lst = [p.referred_modalities() for p in prompt]
+ referred_modalities_lst = [p.referred_modalities for p in prompt]
  else:
- referred_modalities_lst = [prompt.referred_modalities(),]
+ referred_modalities_lst = [prompt.referred_modalities,]
  if referred_modalities_lst:
  for referred_modalities in referred_modalities_lst:
  console.write(
@@ -1397,7 +1531,7 @@ class LanguageModel(component.Component):
  title=f'\n[{call_counter}] PROMPT TO TOKENIZE:',
  color='green',
  )
- referred_modalities_lst = [prompt.referred_modalities(),]
+ referred_modalities_lst = [prompt.referred_modalities,]
  if referred_modalities_lst:
  for referred_modalities in referred_modalities_lst:
  console.write(
@@ -1425,7 +1559,7 @@ class LanguageModel(component.Component):
  max_requests_per_minute: int | None,
  average_tokens_per_request: int = 250
  ) -> int | None:
- """Estimates max concurrency concurrency based on the rate limits."""
+ """Estimates max concurrency based on the rate limits."""
  # NOTE(daiyip): max concurrency is estimated based on the rate limit.
  # We assume each request has approximately 250 tokens, and each request
  # takes 1 second to complete. This might not be accurate for all models.
@@ -1438,6 +1572,25 @@ class LanguageModel(component.Component):
  return None
 
 
+ class _Metrics:
+ """Metrics for Langfun."""
+
+ def __init__(self):
+ self._metrics = pg.monitoring.metric_collection('/third_party/langfun')
+ self.language_model_calls = self._metrics.get_counter(
+ 'language_model_calls',
+ 'Number of calls to the language model.',
+ parameters={'model': str, 'error': str},
+ )
+ self.language_model_call_duration_ms = self._metrics.get_distribution(
+ 'language_model_call_duration_ms',
+ 'Duration of calls to the language model in milliseconds.',
+ parameters={'model': str, 'error': str},
+ )
+
+ _METRICS = _Metrics()
+
+
  class _ConcurrencyControl:
  """Controls the max concurrent LLM calls for a given model."""
 
@@ -1479,7 +1632,7 @@ class _ConcurrencyControl:
 
 
  class UsageSummary(pg.Object, pg.views.HtmlTreeView.Extension):
- """Usage sumary."""
+ """Usage summary."""
 
  class AggregatedUsage(pg.Object):
  """Aggregated usage."""
langfun/core/language_model_test.py CHANGED
@@ -591,6 +591,51 @@ class LanguageModelTest(unittest.TestCase):
  lm = MockModel(cache=cache, top_k=1)
  self.assertEqual(lm('a'), 'a')
 
+ def test_empty_generation_error(self):
+ class MockModelWithEmptyResponse(MockModel):
+ def _sample(self,
+ prompts: list[message_lib.Message]
+ ) -> list[lm_lib.LMSamplingResult]:
+ return [lm_lib.LMSamplingResult(
+ [lm_lib.LMSample(response='')],
+ usage=lm_lib.LMSamplingUsage(100, 0, 100, 1, 1.0)
+ )]
+ lm = MockModelWithEmptyResponse(max_attempts=1, retry_interval=0)
+ with self.assertRaisesRegex(
+ concurrent.RetryError, 'Empty generation encountered'
+ ):
+ lm('a')
+
+ def test_empty_generation_retry(self):
+ class MockModelWithEmptyThenValid(MockModel):
+ attempt_count: int = 0
+
+ def _sample(
+ self, prompts: list[message_lib.Message]
+ ) -> list[lm_lib.LMSamplingResult]:
+ self.rebind(attempt_count=self.attempt_count + 1)
+ if self.attempt_count == 1:
+ # First attempt returns empty
+ return [
+ lm_lib.LMSamplingResult(
+ [lm_lib.LMSample(response='')],
+ usage=lm_lib.LMSamplingUsage(100, 0, 100, 1, 1.0),
+ )
+ ]
+ else:
+ # Subsequent attempts return valid response
+ return [
+ lm_lib.LMSamplingResult(
+ [lm_lib.LMSample(response='valid response')],
+ usage=lm_lib.LMSamplingUsage(100, 100, 200, 1, 1.0),
+ )
+ ]
+
+ lm = MockModelWithEmptyThenValid(max_attempts=3, retry_interval=0)
+ result = lm('a')
+ self.assertEqual(result.text, 'valid response')
+ self.assertEqual(lm.attempt_count, 2)
+
  def test_estimate_max_concurrency(self):
  self.assertIsNone(lm_lib.LanguageModel.estimate_max_concurrency(None, None))
  self.assertEqual(
@@ -656,11 +701,17 @@ class LanguageModelTest(unittest.TestCase):
 
  string_io = io.StringIO()
  lm = MockModel(sampling_options=lm_lib.LMSamplingOptions(top_k=1))
+ image = Image()
  with contextlib.redirect_stdout(string_io):
  self.assertEqual(
- lm(message_lib.UserMessage(
- 'hi <<[[image]]>>', image=Image()), debug=True),
- 'hi <<[[image]]>>'
+ lm(
+ message_lib.UserMessage(
+ f'hi <<[[{image.id}]]>>',
+ referred_modalities=[image],
+ ),
+ debug=True
+ ),
+ f'hi <<[[{image.id}]]>>'
  )
 
  debug_info = string_io.getvalue()
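The updated debug test reflects the new modality handling: modalities are attached via `referred_modalities` and referenced in the message text by their `id`. A sketch mirroring that usage (the image bytes and prompt are illustrative, and `lf.Image.from_bytes` is assumed to be available as in prior releases):

```python
import langfun as lf

image = lf.Image.from_bytes(b'<png bytes>')  # placeholder payload

# The text refers to the modality by id; the object itself travels in
# `referred_modalities`.
msg = lf.UserMessage(
    f'What is in this image? <<[[{image.id}]]>>',
    referred_modalities=[image],
)
```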
langfun/core/llms/__init__.py CHANGED
@@ -30,7 +30,8 @@ from langfun.core.llms.compositional import RandomChoice
 
  # Base models by request/response protocol.
  from langfun.core.llms.rest import REST
- from langfun.core.llms.openai_compatible import OpenAICompatible
+ from langfun.core.llms.openai_compatible import OpenAIChatCompletionAPI
+ from langfun.core.llms.openai_compatible import OpenAIResponsesAPI
  from langfun.core.llms.gemini import Gemini
  from langfun.core.llms.anthropic import Anthropic
 
@@ -41,6 +42,7 @@ from langfun.core.llms.azure_openai import AzureOpenAI
 
  # Gemini models.
  from langfun.core.llms.google_genai import GenAI
+ from langfun.core.llms.google_genai import Gemini3ProPreview
  from langfun.core.llms.google_genai import Gemini25Pro
  from langfun.core.llms.google_genai import Gemini25Flash
  from langfun.core.llms.google_genai import Gemini25ProPreview_20250605
@@ -89,6 +91,7 @@ from langfun.core.llms.vertexai import VertexAIGemini25ProPreview_20250605
  from langfun.core.llms.vertexai import VertexAIGemini25Pro
  from langfun.core.llms.vertexai import VertexAIGemini25Flash
  from langfun.core.llms.vertexai import VertexAIGemini25FlashImagePreview
+ from langfun.core.llms.vertexai import VertexAIGemini3ProPreview
 
  # For backward compatibility.
  GeminiPro1_5 = Gemini15Pro
@@ -99,6 +102,9 @@ VertexAIGeminiFlash1_5 = VertexAIGemini15Flash
  # OpenAI models.
  from langfun.core.llms.openai import OpenAI
 
+ from langfun.core.llms.openai import Gpt51
+ from langfun.core.llms.openai import Gpt5
+ from langfun.core.llms.openai import Gpt5Mini
  from langfun.core.llms.openai import Gpt41
  from langfun.core.llms.openai import GptO3
  from langfun.core.llms.openai import GptO4Mini
@@ -149,6 +155,9 @@ from langfun.core.llms.openai import Gpt35
 
  # Anthropic models.
 
+ from langfun.core.llms.anthropic import Claude45
+ from langfun.core.llms.anthropic import Claude45Haiku_20251001
+ from langfun.core.llms.anthropic import Claude45Sonnet_20250929
  from langfun.core.llms.anthropic import Claude4
  from langfun.core.llms.anthropic import Claude4Sonnet_20250514
  from langfun.core.llms.anthropic import Claude4Opus_20250514
@@ -166,6 +175,8 @@ from langfun.core.llms.anthropic import Claude3Haiku
  from langfun.core.llms.anthropic import Claude3Haiku_20240307
 
  from langfun.core.llms.vertexai import VertexAIAnthropic
+ from langfun.core.llms.vertexai import VertexAIClaude45Haiku_20251001
+ from langfun.core.llms.vertexai import VertexAIClaude45Sonnet_20250929
  from langfun.core.llms.vertexai import VertexAIClaude4Opus_20250514
  from langfun.core.llms.vertexai import VertexAIClaude4Sonnet_20250514
  from langfun.core.llms.vertexai import VertexAIClaude37Sonnet_20250219
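The newly exported model classes are used like any other `lf.llms` entry; a brief sketch (model choice and prompt are illustrative, and availability depends on your API access):

```python
import langfun as lf

# Any newly exported class can be instantiated the same way,
# e.g. lf.llms.Gpt5(), lf.llms.Claude45Sonnet_20250929(), ...
lm = lf.llms.Gemini3ProPreview()
print(lm('1 + 1 ='))
```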