langfun 0.0.2.dev20240429__py3-none-any.whl → 0.0.2.dev20240511__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37)
  1. langfun/__init__.py +5 -0
  2. langfun/core/eval/__init__.py +14 -1
  3. langfun/core/eval/base.py +503 -112
  4. langfun/core/eval/base_test.py +185 -53
  5. langfun/core/eval/matching.py +22 -21
  6. langfun/core/eval/matching_test.py +23 -2
  7. langfun/core/eval/patching.py +130 -0
  8. langfun/core/eval/patching_test.py +170 -0
  9. langfun/core/eval/scoring.py +4 -4
  10. langfun/core/eval/scoring_test.py +19 -2
  11. langfun/core/langfunc.py +1 -17
  12. langfun/core/langfunc_test.py +4 -0
  13. langfun/core/language_model.py +6 -0
  14. langfun/core/llms/__init__.py +8 -0
  15. langfun/core/llms/fake.py +6 -6
  16. langfun/core/llms/google_genai.py +8 -0
  17. langfun/core/llms/openai.py +3 -2
  18. langfun/core/llms/openai_test.py +2 -1
  19. langfun/core/llms/vertexai.py +291 -0
  20. langfun/core/llms/vertexai_test.py +233 -0
  21. langfun/core/modalities/image.py +1 -3
  22. langfun/core/modalities/mime.py +6 -0
  23. langfun/core/modalities/video.py +1 -3
  24. langfun/core/structured/__init__.py +2 -0
  25. langfun/core/structured/mapping.py +5 -1
  26. langfun/core/structured/prompting.py +39 -11
  27. langfun/core/structured/prompting_test.py +43 -0
  28. langfun/core/structured/schema.py +34 -4
  29. langfun/core/structured/schema_test.py +32 -1
  30. langfun/core/structured/scoring.py +4 -1
  31. langfun/core/structured/scoring_test.py +6 -0
  32. langfun/core/template.py +22 -1
  33. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/METADATA +2 -2
  34. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/RECORD +37 -33
  35. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/LICENSE +0 -0
  36. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/WHEEL +0 -0
  37. {langfun-0.0.2.dev20240429.dist-info → langfun-0.0.2.dev20240511.dist-info}/top_level.txt +0 -0
langfun/core/eval/patching_test.py ADDED
@@ -0,0 +1,170 @@
+ # Copyright 2024 The Langfun Authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Tests for evaluation patching."""
+
+ import unittest
+ from langfun.core import llms as lf_llms
+ from langfun.core.eval import base
+ from langfun.core.eval import patching
+ import pyglove as pg
+
+
+ class PatchingCommonTest(unittest.TestCase):
+
+ def test_patch_member(self):
+ class A(pg.Object):
+ x: int = 1
+
+ class B(pg.Object):
+ a: A
+
+ b = B(A())
+ pg.patch(b, [patching.patch_member(A, 'x', 2)])
+ self.assertEqual(b, B(A(2)))
+
+ def test_patch_args(self):
+ s = base.Suite(
+ [base.Evaluation(inputs=base.as_inputs([1]))],
+ additional_args=dict(x=1, y=2),
+ )
+ pg.patch(s, [patching.patch_additional_args(x=3, z=4)])
+ self.assertTrue(
+ pg.eq(
+ s,
+ base.Suite(
+ [
+ base.Evaluation(
+ inputs=base.as_inputs([1]),
+ additional_args=dict(x=3, y=2, z=4),
+ )
+ ],
+ additional_args=dict(x=3, y=2, z=4),
+ ),
+ )
+ )
+
+ def test_patch_lm(self):
+ s = base.Suite(
+ [base.Evaluation(inputs=base.as_inputs([1]))],
+ lm=lf_llms.Gpt35Turbo(),
+ )
+ pg.patch(
+ s, [patching.patch_lm(pg.oneof([lf_llms.Gpt35Turbo(), lf_llms.Gpt4()]))]
+ )
+ self.assertTrue(
+ pg.eq(
+ s,
+ base.Suite(
+ [
+ base.Evaluation(
+ inputs=base.as_inputs([1]),
+ lm=pg.oneof([lf_llms.Gpt35Turbo(), lf_llms.Gpt4()]),
+ )
+ ],
+ lm=pg.oneof([lf_llms.Gpt35Turbo(), lf_llms.Gpt4()]),
+ ),
+ )
+ )
+
+ def test_patch_parsing_lm(self):
+ s = base.Suite(
+ [base.Evaluation(inputs=base.as_inputs([1]))],
+ lm=lf_llms.Gpt4(),
+ )
+ pg.patch(s, [patching.patch_parsing_lm(lf_llms.Gpt35Turbo())])
+ self.assertTrue(
+ pg.eq(
+ s,
+ base.Suite(
+ [
+ base.Evaluation(
+ inputs=base.as_inputs([1]),
+ lm=lf_llms.Gpt4(),
+ parsing_lm=lf_llms.Gpt35Turbo(),
+ )
+ ],
+ # NOTE(daiyip): Suite does not have `parsing_lm` as one of its
+ # variable keyword fields yet, so patching does not add to it.
+ # This is okay since we only care about the leaf nodes.
+ lm=lf_llms.Gpt4(),
+ ),
+ )
+ )
+
+ def test_patch_prompt(self):
+ e = base.Evaluation(inputs=base.as_inputs([1]))
+ pg.patch(e, [patching.patch_prompt('Q: {{example.question}}')])
+ self.assertTrue(
+ pg.eq(
+ e,
+ base.Evaluation(
+ inputs=base.as_inputs([1]),
+ prompt='Q: {{example.question}}',
+ ),
+ )
+ )
+
+ def test_patch_inputs(self):
+ e = base.Evaluation(inputs=base.as_inputs([1]))
+ pg.patch(e, [patching.patch_inputs(base.as_inputs([2]))])
+ self.assertTrue(
+ pg.eq(
+ e,
+ base.Evaluation(
+ inputs=base.as_inputs([2]),
+ ),
+ )
+ )
+
+ def test_patch_schema_fn(self):
+ @pg.functor()
+ def int_schema():
+ return int
+
+ e = base.Evaluation(inputs=base.as_inputs([1]))
+ pg.patch(e, [patching.patch_schema_fn(int_schema())])
+ self.assertTrue(
+ pg.eq(
+ e,
+ base.Evaluation(
+ inputs=base.as_inputs([1]),
+ schema_fn=int_schema(),
+ ),
+ )
+ )
+
+
+ class StringPatcheTest(unittest.TestCase):
+
+ def test_lm(self):
+ target = pg.patch(
+ base.Evaluation(inputs=base.as_inputs([1])),
+ ['lm?haiku:gpt4', 'max_tokens?1024', 'temperature?0.7'],
+ )
+ self.assertEqual(
+ target.lm,
+ pg.oneof([
+ lf_llms.Claude3Haiku(temperature=0.7, max_tokens=1024),
+ lf_llms.Gpt4(temperature=0.7, max_tokens=1024),
+ ]),
+ )
+ with self.assertRaisesRegex(ValueError, 'Unknown model name'):
+ pg.patch(
+ base.Evaluation(inputs=base.as_inputs([1])),
+ ['lm?gpt2'],
+ )
+
+
+ if __name__ == '__main__':
+ unittest.main()
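
The new langfun.core.eval.patching module exercised above lets evaluations be modified through pg.patch, either with patcher objects or with compact 'name?value' strings. A minimal usage sketch, assuming only the calls exercised by the tests above:

import pyglove as pg
from langfun.core import llms as lf_llms
from langfun.core.eval import base
from langfun.core.eval import patching

# A toy evaluation; base.as_inputs wraps raw examples, as in the tests above.
evaluation = base.Evaluation(inputs=base.as_inputs([1]))

# Object-style patching: swap the LM and merge extra args before running.
pg.patch(evaluation, [
    patching.patch_lm(lf_llms.Gpt4()),
    patching.patch_additional_args(x=3, z=4),
])

# String-style patching: 'lm?haiku:gpt4' expands to a pg.oneof over the named
# models; unknown names (e.g. 'lm?gpt2') raise ValueError, per StringPatcheTest.
pg.patch(evaluation, ['lm?haiku:gpt4', 'max_tokens?1024', 'temperature?0.7'])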
langfun/core/eval/scoring.py CHANGED
@@ -113,8 +113,8 @@ class Scoring(base.Evaluation):
  m.total,
  )

- def summarize(self) -> pg.Dict:
- result = super().summarize()
+ def finalize(self) -> pg.Dict:
+ result = super().finalize()
  result.metrics.update(
  num_scored=self.num_scored,
  score_rate=self.score_rate,
@@ -168,7 +168,7 @@ class Scoring(base.Evaluation):
  )
  )

- def _render_metric(self, s: io.StringIO) -> None:
+ def _render_summary_metrics(self, s: io.StringIO) -> None:
  """Renders metrics in HTML."""
  assert self.result is not None
  m = self.result.metrics
@@ -182,7 +182,7 @@ class Scoring(base.Evaluation):
  )
  )
  s.write(' | ')
- super()._render_metric(s)
+ super()._render_summary_metrics(s)

  def _render_scored(self, s: io.StringIO) -> None:
  """Formats the matched cases into html."""
langfun/core/eval/scoring_test.py CHANGED
@@ -98,6 +98,11 @@ class ScoringTest(unittest.TestCase):
  total=2,
  failures=0,
  failure_rate=0.0,
+ oop_failures=0,
+ oop_failure_rate=0.0,
+ non_oop_failures=0,
+ non_oop_failure_rate=0.0,
+ failure_breakdown={},
  num_scored=2,
  score_rate=1.0,
  avg_score=0.5,
@@ -124,7 +129,12 @@ class ScoringTest(unittest.TestCase):
  )
  self.assertTrue(
  os.path.exists(
- os.path.join(s.dir, scoring.Scoring.FAILURES_JSON)
+ os.path.join(s.dir, scoring.Scoring.OOP_FAILURES_JSON)
+ )
+ )
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(s.dir, scoring.Scoring.NON_OOP_FAILURES_JSON)
  )
  )
  self.assertTrue(
@@ -143,7 +153,14 @@ class ScoringTest(unittest.TestCase):
  self.assertTrue(
  os.path.exists(
  os.path.join(
- s.dir, scoring.Scoring.FAILURES_HTML
+ s.dir, scoring.Scoring.OOP_FAILURES_HTML
+ )
+ )
+ )
+ self.assertTrue(
+ os.path.exists(
+ os.path.join(
+ s.dir, scoring.Scoring.NON_OOP_FAILURES_HTML
  )
  )
  )
langfun/core/langfunc.py CHANGED
@@ -14,7 +14,7 @@
  """LangFunc: Language-based functions."""

  import dataclasses
- from typing import Annotated, Type, Union
+ from typing import Annotated, Type

  from langfun.core import component
  from langfun.core import language_model
@@ -328,22 +328,6 @@ class LangFunc(
  """Transforms the output message before returning from __call__."""
  return lm_output

- @classmethod
- def from_value(
- cls, value: Union[str, template_lib.Template], **kwargs
- ) -> 'LangFunc':
- """Create a LangFunc object from a string or template."""
- if isinstance(value, LangFunc):
- return value
- if isinstance(value, template_lib.Template):
- lfun = LangFunc(value.template_str, **kwargs)
- # So lfun could acccess all attributes from value.
- lfun.sym_setparent(value)
- return lfun
- if isinstance(value, str):
- return LangFunc(template_str=value, **kwargs)
- return LangFunc('{{input}}', input=value, **kwargs)
-

  # Register converter from str to LangFunc, therefore we can always
  # pass strs to attributes that accept LangFunc.
langfun/core/langfunc_test.py CHANGED
@@ -57,6 +57,10 @@ class BasicTest(unittest.TestCase):
  l2 = LangFunc.from_value(l1)
  self.assertIs(l2, l1)

+ l3 = LangFunc.from_value(l1, x=1)
+ self.assertIsNot(l3, l1)
+ self.assertTrue(pg.eq(l3, LangFunc('Hello', x=1)))
+
  c = template_lib.Template(
  '{{x}} + {{l}}',
  x=1,
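
The from_value override removed from LangFunc above appears to be subsumed by the base Template implementation (see langfun/core/template.py, +22 -1, in the file list), and the added test pins down the caller-visible contract. A small sketch of that contract, assuming the usual top-level langfun import:

import langfun as lf

f = lf.LangFunc('Hello')

# Without overrides, from_value returns the same object.
assert lf.LangFunc.from_value(f) is f

# With keyword overrides, a new LangFunc is created (per the new test case).
g = lf.LangFunc.from_value(f, x=1)
assert g is not f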
langfun/core/language_model.py CHANGED
@@ -22,6 +22,7 @@ from langfun.core import component
  from langfun.core import concurrent
  from langfun.core import console
  from langfun.core import message as message_lib
+
  import pyglove as pg

  TOKENS_PER_REQUEST = 250 # Estimated num tokens for a single request
@@ -166,6 +167,11 @@ class LMScoringResult(pg.Object):
  float,
  'The log likelyhood of the requested completion towards the prompt.',
  ]
+ gradients: Annotated[
+ Any | None,
+ '(Optional) gradients from the score method, w.r.t.' +
+ ' prompt.metadata.weights.',
+ ] = None


  class LMCache(pg.Object):
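
The new gradients field on LMScoringResult is optional and defaults to None, so existing constructions keep working. A sketch under the assumption that the float field shown above is named score (its name falls outside this hunk):

from langfun.core import language_model as lm_lib

# Existing usage is unchanged; gradients defaults to None.
r = lm_lib.LMScoringResult(score=-0.42)
assert r.gradients is None

# A scoring backend may now attach gradients w.r.t. prompt.metadata.weights;
# the field is typed Any | None, so any structure is accepted.
r2 = lm_lib.LMScoringResult(score=-0.42, gradients=[0.1, -0.3])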
langfun/core/llms/__init__.py CHANGED
@@ -27,6 +27,7 @@ from langfun.core.llms.fake import StaticSequence
  # Gemini models.
  from langfun.core.llms.google_genai import GenAI
  from langfun.core.llms.google_genai import GeminiPro
+ from langfun.core.llms.google_genai import GeminiPro1_5
  from langfun.core.llms.google_genai import GeminiProVision
  from langfun.core.llms.google_genai import Palm2
  from langfun.core.llms.google_genai import Palm2_IT
@@ -73,6 +74,13 @@ from langfun.core.llms.groq import GroqLlama2_70B
  from langfun.core.llms.groq import GroqMistral_8x7B
  from langfun.core.llms.groq import GroqGemma7B_IT

+ from langfun.core.llms.vertexai import VertexAI
+ from langfun.core.llms.vertexai import VertexAIGeminiPro1_5
+ from langfun.core.llms.vertexai import VertexAIGeminiPro1
+ from langfun.core.llms.vertexai import VertexAIGeminiPro1Vision
+ from langfun.core.llms.vertexai import VertexAIPalm2
+ from langfun.core.llms.vertexai import VertexAIPalm2_32K
+

  # LLaMA C++ models.
  from langfun.core.llms.llama_cpp import LlamaCppRemote
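
The new vertexai module makes the Vertex AI-hosted models importable from langfun.core.llms alongside the GenAI ones. A brief sketch; the constructor arguments are assumptions (Vertex AI typically needs a Cloud project and location), since vertexai.py itself is not shown in this excerpt:

from langfun.core import llms as lf_llms

# Assumed init args; the authoritative signature lives in vertexai.py.
lm = lf_llms.VertexAIGeminiPro1(project='my-project', location='us-central1')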
langfun/core/llms/fake.py CHANGED
@@ -57,12 +57,12 @@ class StaticResponse(Fake):
  """Language model that always gives the same canned response."""

  response: Annotated[
- str,
+ str | lf.Message,
  'A canned response that will be returned regardless of the prompt.'
  ]

  def _response_from(self, prompt: lf.Message) -> lf.Message:
- return lf.AIMessage(self.response)
+ return lf.AIMessage.from_value(self.response)


  @lf.use_init_args(['mapping'])
@@ -70,12 +70,12 @@ class StaticMapping(Fake):
  """A static mapping from prompt to response."""

  mapping: Annotated[
- dict[str, str],
+ dict[str, str | lf.Message],
  'A mapping from prompt to response.'
  ]

  def _response_from(self, prompt: lf.Message) -> lf.Message:
- return lf.AIMessage(self.mapping[prompt])
+ return lf.AIMessage.from_value(self.mapping[prompt])


  @lf.use_init_args(['sequence'])
@@ -83,7 +83,7 @@ class StaticSequence(Fake):
  """A static sequence of responses to use."""

  sequence: Annotated[
- list[str],
+ list[str | lf.Message],
  'A sequence of strings as the response.'
  ]

@@ -92,6 +92,6 @@
  self._pos = 0

  def _response_from(self, prompt: lf.Message) -> lf.Message:
- r = lf.AIMessage(self.sequence[self._pos])
+ r = lf.AIMessage.from_value(self.sequence[self._pos])
  self._pos += 1
  return r
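
The fake LMs above now accept lf.Message values in addition to plain strings and normalize them with lf.AIMessage.from_value, so canned responses can carry metadata. A small sketch; the result metadata key is only illustrative:

import langfun as lf
from langfun.core import llms as lf_llms

# A canned response with metadata, now allowed alongside plain strings.
canned = lf.AIMessage('The answer is 42.', metadata=dict(result=42))
lm = lf_llms.StaticResponse(response=canned)

# Every prompt gets the same canned message back.
output = lf.LangFunc('What is 6 x 7?')(lm=lm)
assert output.text == 'The answer is 42.'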
langfun/core/llms/google_genai.py CHANGED
@@ -34,6 +34,7 @@ class GenAI(lf.LanguageModel):
  'gemini-pro-vision',
  'text-bison-001',
  'chat-bison-001',
+ 'gemini-1.5-pro-latest',
  ],
  'Model name.',
  ]
@@ -262,6 +263,13 @@ _GOOGLE_GENAI_MODEL_HUB = _ModelHub()
  #


+ class GeminiPro1_5(GenAI): # pylint: disable=invalid-name
+ """Gemini Pro latest model."""
+
+ model = 'gemini-1.5-pro-latest'
+ multimodal = True
+
+
  class GeminiPro(GenAI):
  """Gemini Pro model."""

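
gemini-1.5-pro-latest joins the supported model list and is exposed as the multimodal GeminiPro1_5 class. A brief sketch; passing api_key explicitly is an assumption about the GenAI base class, which sits outside this hunk (a key may also come from the environment):

from langfun.core.llms import google_genai

# api_key is assumed to be accepted by the GenAI base class; adjust as needed.
lm = google_genai.GeminiPro1_5(api_key='<your-api-key>')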
langfun/core/llms/openai.py CHANGED
@@ -233,8 +233,9 @@ class OpenAI(lf.LanguageModel):
  for chunk in prompt.chunk():
  if isinstance(chunk, str):
  item = dict(type='text', text=chunk)
- elif isinstance(chunk, lf_modalities.Image) and chunk.uri:
- item = dict(type='image_url', image_url=chunk.uri)
+ elif isinstance(chunk, lf_modalities.Image):
+ uri = chunk.uri or chunk.content_uri
+ item = dict(type='image_url', image_url=dict(url=uri))
  else:
  raise ValueError(f'Unsupported modality object: {chunk!r}.')
  content.append(item)
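
The OpenAI message builder now nests image URLs as image_url={'url': ...} (the chat-completions vision format) and falls back to chunk.content_uri when the image has no external URI. A standalone sketch of the resulting payload shape; the URL is a placeholder:

# Before: item = {'type': 'image_url', 'image_url': 'https://example.com/cat.png'}
# After: the URL is wrapped in a nested object.
item = {
    'type': 'image_url',
    'image_url': {'url': 'https://example.com/cat.png'},  # placeholder URL
}
content = [{'type': 'text', 'text': 'What is in this image?'}, item]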
langfun/core/llms/openai_test.py CHANGED
@@ -66,7 +66,8 @@ def mock_chat_completion_query_vision(messages, *, n=1, **kwargs):
  del kwargs
  choices = []
  urls = [
- c['image_url'] for c in messages[0]['content'] if c['type'] == 'image_url'
+ c['image_url']['url']
+ for c in messages[0]['content'] if c['type'] == 'image_url'
  ]
  for k in range(n):
  choices.append(pg.Dict(