langfun 0.0.2.dev20240330__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (145) hide show
  1. langfun/__init__.py +22 -2
  2. langfun/core/__init__.py +17 -5
  3. langfun/core/agentic/__init__.py +30 -0
  4. langfun/core/agentic/action.py +854 -0
  5. langfun/core/agentic/action_eval.py +150 -0
  6. langfun/core/agentic/action_eval_test.py +109 -0
  7. langfun/core/agentic/action_test.py +136 -0
  8. langfun/core/coding/python/__init__.py +5 -11
  9. langfun/core/coding/python/correction.py +37 -28
  10. langfun/core/coding/python/correction_test.py +29 -3
  11. langfun/core/coding/python/execution.py +40 -216
  12. langfun/core/coding/python/execution_test.py +29 -89
  13. langfun/core/coding/python/generation.py +21 -11
  14. langfun/core/coding/python/generation_test.py +2 -2
  15. langfun/core/coding/python/parsing.py +108 -193
  16. langfun/core/coding/python/parsing_test.py +2 -105
  17. langfun/core/component.py +69 -2
  18. langfun/core/component_test.py +54 -0
  19. langfun/core/concurrent.py +414 -117
  20. langfun/core/concurrent_test.py +111 -24
  21. langfun/core/console.py +18 -5
  22. langfun/core/console_test.py +17 -0
  23. langfun/core/eval/__init__.py +17 -0
  24. langfun/core/eval/base.py +767 -140
  25. langfun/core/eval/base_test.py +238 -53
  26. langfun/core/eval/matching.py +80 -76
  27. langfun/core/eval/matching_test.py +19 -9
  28. langfun/core/eval/patching.py +130 -0
  29. langfun/core/eval/patching_test.py +170 -0
  30. langfun/core/eval/scoring.py +37 -28
  31. langfun/core/eval/scoring_test.py +21 -3
  32. langfun/core/eval/v2/__init__.py +42 -0
  33. langfun/core/eval/v2/checkpointing.py +380 -0
  34. langfun/core/eval/v2/checkpointing_test.py +228 -0
  35. langfun/core/eval/v2/eval_test_helper.py +136 -0
  36. langfun/core/eval/v2/evaluation.py +725 -0
  37. langfun/core/eval/v2/evaluation_test.py +180 -0
  38. langfun/core/eval/v2/example.py +305 -0
  39. langfun/core/eval/v2/example_test.py +128 -0
  40. langfun/core/eval/v2/experiment.py +1048 -0
  41. langfun/core/eval/v2/experiment_test.py +433 -0
  42. langfun/core/eval/v2/metric_values.py +156 -0
  43. langfun/core/eval/v2/metric_values_test.py +80 -0
  44. langfun/core/eval/v2/metrics.py +357 -0
  45. langfun/core/eval/v2/metrics_test.py +203 -0
  46. langfun/core/eval/v2/progress.py +348 -0
  47. langfun/core/eval/v2/progress_test.py +82 -0
  48. langfun/core/eval/v2/progress_tracking.py +210 -0
  49. langfun/core/eval/v2/progress_tracking_test.py +66 -0
  50. langfun/core/eval/v2/reporting.py +270 -0
  51. langfun/core/eval/v2/reporting_test.py +158 -0
  52. langfun/core/eval/v2/runners.py +488 -0
  53. langfun/core/eval/v2/runners_test.py +334 -0
  54. langfun/core/langfunc.py +3 -21
  55. langfun/core/langfunc_test.py +26 -8
  56. langfun/core/language_model.py +686 -48
  57. langfun/core/language_model_test.py +681 -44
  58. langfun/core/llms/__init__.py +100 -12
  59. langfun/core/llms/anthropic.py +488 -0
  60. langfun/core/llms/anthropic_test.py +235 -0
  61. langfun/core/llms/cache/base.py +21 -2
  62. langfun/core/llms/cache/in_memory.py +13 -0
  63. langfun/core/llms/cache/in_memory_test.py +88 -28
  64. langfun/core/llms/compositional.py +101 -0
  65. langfun/core/llms/compositional_test.py +73 -0
  66. langfun/core/llms/deepseek.py +117 -0
  67. langfun/core/llms/deepseek_test.py +61 -0
  68. langfun/core/llms/fake.py +39 -26
  69. langfun/core/llms/fake_test.py +136 -11
  70. langfun/core/llms/gemini.py +507 -0
  71. langfun/core/llms/gemini_test.py +195 -0
  72. langfun/core/llms/google_genai.py +62 -218
  73. langfun/core/llms/google_genai_test.py +9 -197
  74. langfun/core/llms/groq.py +276 -0
  75. langfun/core/llms/groq_test.py +64 -0
  76. langfun/core/llms/llama_cpp.py +15 -40
  77. langfun/core/llms/llama_cpp_test.py +4 -30
  78. langfun/core/llms/openai.py +436 -226
  79. langfun/core/llms/openai_compatible.py +179 -0
  80. langfun/core/llms/openai_compatible_test.py +495 -0
  81. langfun/core/llms/openai_test.py +35 -174
  82. langfun/core/llms/rest.py +113 -0
  83. langfun/core/llms/rest_test.py +111 -0
  84. langfun/core/llms/vertexai.py +192 -0
  85. langfun/core/llms/vertexai_test.py +52 -0
  86. langfun/core/logging.py +284 -0
  87. langfun/core/logging_test.py +125 -0
  88. langfun/core/message.py +319 -9
  89. langfun/core/message_test.py +190 -13
  90. langfun/core/modalities/__init__.py +6 -2
  91. langfun/core/modalities/audio.py +30 -0
  92. langfun/core/modalities/audio_test.py +63 -0
  93. langfun/core/modalities/image.py +39 -20
  94. langfun/core/modalities/image_test.py +52 -9
  95. langfun/core/modalities/mime.py +206 -29
  96. langfun/core/modalities/mime_test.py +90 -9
  97. langfun/core/modalities/ms_office.py +117 -0
  98. langfun/core/modalities/ms_office_test.py +389 -0
  99. langfun/core/modalities/pdf.py +22 -0
  100. langfun/core/modalities/pdf_test.py +57 -0
  101. langfun/core/modalities/video.py +9 -23
  102. langfun/core/modalities/video_test.py +3 -3
  103. langfun/core/modality.py +26 -3
  104. langfun/core/modality_test.py +2 -2
  105. langfun/core/sampling.py +11 -11
  106. langfun/core/structured/__init__.py +15 -16
  107. langfun/core/structured/completion.py +32 -5
  108. langfun/core/structured/completion_test.py +9 -8
  109. langfun/core/structured/description.py +2 -2
  110. langfun/core/structured/description_test.py +3 -3
  111. langfun/core/structured/function_generation.py +278 -0
  112. langfun/core/structured/function_generation_test.py +399 -0
  113. langfun/core/structured/mapping.py +150 -46
  114. langfun/core/structured/mapping_test.py +105 -0
  115. langfun/core/structured/parsing.py +33 -21
  116. langfun/core/structured/parsing_test.py +71 -22
  117. langfun/core/structured/querying.py +746 -0
  118. langfun/core/structured/{prompting_test.py → querying_test.py} +545 -60
  119. langfun/core/structured/schema.py +208 -99
  120. langfun/core/structured/schema_generation.py +1 -1
  121. langfun/core/structured/schema_generation_test.py +2 -2
  122. langfun/core/structured/schema_test.py +133 -34
  123. langfun/core/structured/scoring.py +125 -19
  124. langfun/core/structured/scoring_test.py +30 -0
  125. langfun/core/structured/tokenization.py +64 -0
  126. langfun/core/structured/tokenization_test.py +48 -0
  127. langfun/core/template.py +240 -11
  128. langfun/core/template_test.py +146 -1
  129. langfun/core/templates/conversation.py +9 -0
  130. langfun/core/templates/conversation_test.py +4 -3
  131. langfun/core/templates/selfplay_test.py +14 -2
  132. langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
  133. langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
  134. {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
  135. langfun/core/coding/python/errors.py +0 -108
  136. langfun/core/coding/python/errors_test.py +0 -99
  137. langfun/core/coding/python/permissions.py +0 -90
  138. langfun/core/coding/python/permissions_test.py +0 -86
  139. langfun/core/structured/prompting.py +0 -217
  140. langfun/core/text_formatting.py +0 -162
  141. langfun/core/text_formatting_test.py +0 -47
  142. langfun-0.0.2.dev20240330.dist-info/METADATA +0 -99
  143. langfun-0.0.2.dev20240330.dist-info/RECORD +0 -102
  144. {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
  145. {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
@@ -103,7 +103,7 @@ class MatchingTest(unittest.TestCase):
103
103
  s.result,
104
104
  dict(
105
105
  experiment_setup=dict(
106
- id='MyTask@3d87f97f',
106
+ id='MyTask@739a174b',
107
107
  dir=s.dir,
108
108
  model='StaticSequence',
109
109
  prompt_template='{{example.question}}',
@@ -120,11 +120,19 @@ class MatchingTest(unittest.TestCase):
120
120
  total=4,
121
121
  failures=1,
122
122
  failure_rate=0.25,
123
+ oop_failures=1,
124
+ oop_failure_rate=0.25,
125
+ non_oop_failures=0,
126
+ non_oop_failure_rate=0.0,
127
+ failure_breakdown={
128
+ 'MappingError.SchemaError.TypeError': 1
129
+ },
123
130
  num_matches=2,
124
131
  match_rate=0.5,
125
132
  num_mismatches=1,
126
133
  mismatch_rate=0.25,
127
134
  ),
135
+ usage=s.result.usage,
128
136
  ),
129
137
  )
130
138
  self.assertTrue(
@@ -144,22 +152,17 @@ class MatchingTest(unittest.TestCase):
144
152
  os.path.join(s.dir, matching.Matching.CACHE_JSON)
145
153
  )
146
154
  )
147
- self.assertTrue(
148
- os.path.exists(
149
- os.path.join(s.dir, matching.Matching.MATCHES_JSON)
150
- )
151
- )
152
155
  self.assertTrue(
153
156
  os.path.exists(
154
157
  os.path.join(
155
- s.dir, matching.Matching.MISMATCHES_JSON
158
+ s.dir, matching.Matching.OOP_FAILURES_JSON
156
159
  )
157
160
  )
158
161
  )
159
162
  self.assertTrue(
160
163
  os.path.exists(
161
164
  os.path.join(
162
- s.dir, matching.Matching.FAILURES_JSON
165
+ s.dir, matching.Matching.NON_OOP_FAILURES_JSON
163
166
  )
164
167
  )
165
168
  )
@@ -174,7 +177,14 @@ class MatchingTest(unittest.TestCase):
174
177
  self.assertTrue(
175
178
  os.path.exists(
176
179
  os.path.join(
177
- s.dir, matching.Matching.FAILURES_HTML
180
+ s.dir, matching.Matching.OOP_FAILURES_HTML
181
+ )
182
+ )
183
+ )
184
+ self.assertTrue(
185
+ os.path.exists(
186
+ os.path.join(
187
+ s.dir, matching.Matching.NON_OOP_FAILURES_HTML
178
188
  )
179
189
  )
180
190
  )
@@ -0,0 +1,130 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Experiment patching for Langfun evaluations."""
15
+
16
+ import inspect
17
+ from typing import Union
18
+ import langfun.core as lf
19
+ from langfun.core import llms as lf_llms
20
+ from langfun.core.eval import base
21
+ import pyglove as pg
22
+
23
+
24
+ #
25
+ # Program-based patchers.
26
+ #
27
+
28
+
29
+ def patch_member(cls, key, value, parent_key: str | None = None):
30
+ """Patches a member of a class."""
31
+
32
+ def _rebind_fn(k, v, p):
33
+ if (
34
+ isinstance(p, cls)
35
+ and k.key == key
36
+ and (parent_key is None or (p and p.sym_path.key == parent_key))
37
+ ):
38
+ if inspect.isfunction(value):
39
+ return value(k, v, p)
40
+ return value
41
+ return v
42
+
43
+ return _rebind_fn
44
+
45
+
46
def patch_lm(lm: Union[lf.LanguageModel, pg.hyper.OneOf]):  # pylint: disable=redefined-outer-name
  """Patches the LLM of evaluations.

  Args:
    lm: A language model, or a `pg.hyper.OneOf` over several models, that
      replaces the `lm` member of every `base.Evaluable` in the patched tree.

  Returns:
    A rebind function to be used with `pg.patch`.
  """
  return patch_member(base.Evaluable, "lm", lm)
49
+
50
+
51
def patch_parsing_lm(lm: Union[lf.LanguageModel, pg.hyper.OneOf]):  # pylint: disable=redefined-outer-name
  """Patches the parsing LLM of evaluations.

  Args:
    lm: A language model, or a `pg.hyper.OneOf` over several models, that
      replaces the `parsing_lm` member of every `base.Evaluable` in the
      patched tree.

  Returns:
    A rebind function to be used with `pg.patch`.
  """
  return patch_member(base.Evaluable, "parsing_lm", lm)
54
+
55
+
56
def patch_schema_fn(schema_fn: Union[pg.Functor, pg.hyper.OneOf]):
  """Patches the schema_fn of evaluations.

  Args:
    schema_fn: A functor, or a `pg.hyper.OneOf` over functors, that replaces
      the `schema_fn` member of every `base.Evaluable` in the patched tree.

  Returns:
    A rebind function to be used with `pg.patch`.
  """
  return patch_member(base.Evaluable, "schema_fn", schema_fn)
59
+
60
+
61
def patch_prompt(prompt: Union[str, lf.Template, pg.hyper.OneOf]):
  """Patches the prompt of evaluations.

  Args:
    prompt: A prompt string, a template, or a `pg.hyper.OneOf` over either,
      that replaces the `prompt` member of every `base.Evaluable` in the
      patched tree.

  Returns:
    A rebind function to be used with `pg.patch`.
  """
  return patch_member(base.Evaluable, "prompt", prompt)
64
+
65
+
66
def patch_inputs(inputs: Union[pg.Functor, pg.hyper.OneOf]):
  """Patches the inputs used in evaluations.

  Args:
    inputs: A functor, or a `pg.hyper.OneOf` over functors, that replaces the
      `inputs` member of every `base.Evaluable` in the patched tree.

  Returns:
    A rebind function to be used with `pg.patch`.
  """
  return patch_member(base.Evaluable, "inputs", inputs)
69
+
70
+
71
def patch_additional_args(**kwargs):
  """Patches additional_args.

  Existing args are kept and merged with `kwargs`; on key collision the
  patched value wins.

  Returns:
    A rebind function to be used with `pg.patch`.
  """

  def _merge_args(k, unused_v, p):
    # We infer the symbolic value for the old args, as it might be a
    # contextual attribute referring to its containing object.
    current = p.sym_inferred(k.key)
    if not current:
      return kwargs
    merged = dict(current)
    merged.update(kwargs)
    return merged

  return patch_member(base.Evaluable, "additional_args", _merge_args)
85
+
86
+
87
+ #
88
+ # String-based patching.
89
+ #
90
+
91
# Mapping from short, human-friendly names to language-model classes, used by
# the string-based patchers (e.g. 'lm?haiku:gpt4'). Each alias pair (with and
# without the family prefix) must resolve to the SAME class.
_NAMED_MODELS = {
    # GPT models.
    "gpt35turbo": lf_llms.Gpt35Turbo,
    "gpt35turbo16k": lf_llms.Gpt35Turbo16K,
    "gpt4": lf_llms.Gpt4,
    "gpt4turbo": lf_llms.Gpt4Turbo,
    # Anthropic models.
    "haiku": lf_llms.Claude3Haiku,
    "claude3haiku": lf_llms.Claude3Haiku,
    "opus": lf_llms.Claude3Opus,
    "claude3opus": lf_llms.Claude3Opus,
    "sonnet": lf_llms.Claude3Sonnet,
    # Fix: this alias previously pointed at Claude3Opus (copy-paste error);
    # both Sonnet aliases must map to Claude3Sonnet.
    "claude3sonnet": lf_llms.Claude3Sonnet,
}
105
+
106
+
107
def model_by_name(name: str) -> lf.LanguageModel:
  """Gets model by name.

  Args:
    name: A short model name; matching is case-insensitive and ignores
      surrounding whitespace.

  Returns:
    A freshly constructed language model instance.

  Raises:
    ValueError: If the name is not a known model alias.
  """
  key = name.strip().lower()
  model_cls = _NAMED_MODELS.get(key)
  if model_cls is None:
    raise ValueError(f"Unknown model name: {key}")
  return model_cls()
113
+
114
+
115
@pg.patcher(auto_typing=True)
def lm(unused_eval, models: list[str]):
  """Patch the LM used for benchmarking."""
  # Resolve every alias up front, then fan out into a one-of search space so
  # each named model is benchmarked.
  candidates = [model_by_name(model_name) for model_name in models]
  return patch_lm(pg.oneof(candidates))
119
+
120
+
121
# Enables string-based patching, e.g. `pg.patch(experiment, ['temperature?0.7'])`.
@pg.patcher(auto_typing=True)
def temperature(unused_eval, value: float):
  """Patch the temperature used for benchmarking."""
  # Targets LMSamplingOptions so the patch reaches every LM in the experiment.
  return patch_member(lf.LMSamplingOptions, "temperature", value)
125
+
126
+
127
# Enables string-based patching, e.g. `pg.patch(experiment, ['max_tokens?1024'])`.
@pg.patcher(auto_typing=True)
def max_tokens(unused_eval, value: int | None):
  """Patch the max_tokens used for benchmarking."""
  # Fixed docstring: it previously said "temperature" (copy-paste error).
  # Targets LMSamplingOptions so the patch reaches every LM in the experiment.
  return patch_member(lf.LMSamplingOptions, "max_tokens", value)
@@ -0,0 +1,170 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Tests for evaluation patching."""
15
+
16
+ import unittest
17
+ from langfun.core import llms as lf_llms
18
+ from langfun.core.eval import base
19
+ from langfun.core.eval import patching
20
+ import pyglove as pg
21
+
22
+
23
class PatchingCommonTest(unittest.TestCase):
  """Tests for the program-based patchers in `patching`."""

  def test_patch_member(self):
    # A member is rewritten on instances of the target class anywhere in the
    # symbolic tree, even when nested inside another object.
    class A(pg.Object):
      x: int = 1

    class B(pg.Object):
      a: A

    b = B(A())
    pg.patch(b, [patching.patch_member(A, 'x', 2)])
    self.assertEqual(b, B(A(2)))

  def test_patch_args(self):
    # Patching additional_args merges with existing args: x is overridden,
    # y is preserved, z is added.
    s = base.Suite(
        [base.Evaluation(inputs=base.as_inputs([1]))],
        additional_args=dict(x=1, y=2),
    )
    pg.patch(s, [patching.patch_additional_args(x=3, z=4)])
    self.assertTrue(
        pg.eq(
            s,
            base.Suite(
                [
                    base.Evaluation(
                        inputs=base.as_inputs([1]),
                        additional_args=dict(x=3, y=2, z=4),
                    )
                ],
                additional_args=dict(x=3, y=2, z=4),
            ),
        )
    )

  def test_patch_lm(self):
    # The lm member may be patched with a pg.oneof to create a search space.
    s = base.Suite(
        [base.Evaluation(inputs=base.as_inputs([1]))],
        lm=lf_llms.Gpt35Turbo(),
    )
    pg.patch(
        s, [patching.patch_lm(pg.oneof([lf_llms.Gpt35Turbo(), lf_llms.Gpt4()]))]
    )
    self.assertTrue(
        pg.eq(
            s,
            base.Suite(
                [
                    base.Evaluation(
                        inputs=base.as_inputs([1]),
                        lm=pg.oneof([lf_llms.Gpt35Turbo(), lf_llms.Gpt4()]),
                    )
                ],
                lm=pg.oneof([lf_llms.Gpt35Turbo(), lf_llms.Gpt4()]),
            ),
        )
    )

  def test_patch_parsing_lm(self):
    # parsing_lm is added on leaf evaluations only (see note below).
    s = base.Suite(
        [base.Evaluation(inputs=base.as_inputs([1]))],
        lm=lf_llms.Gpt4(),
    )
    pg.patch(s, [patching.patch_parsing_lm(lf_llms.Gpt35Turbo())])
    self.assertTrue(
        pg.eq(
            s,
            base.Suite(
                [
                    base.Evaluation(
                        inputs=base.as_inputs([1]),
                        lm=lf_llms.Gpt4(),
                        parsing_lm=lf_llms.Gpt35Turbo(),
                    )
                ],
                # NOTE(daiyip): Suite does not have `parsing_lm` as one of its
                # variable keyword fields yet, so patching does not add to it.
                # This is okay since we only care about the leaf nodes.
                lm=lf_llms.Gpt4(),
            ),
        )
    )

  def test_patch_prompt(self):
    e = base.Evaluation(inputs=base.as_inputs([1]))
    pg.patch(e, [patching.patch_prompt('Q: {{example.question}}')])
    self.assertTrue(
        pg.eq(
            e,
            base.Evaluation(
                inputs=base.as_inputs([1]),
                prompt='Q: {{example.question}}',
            ),
        )
    )

  def test_patch_inputs(self):
    e = base.Evaluation(inputs=base.as_inputs([1]))
    pg.patch(e, [patching.patch_inputs(base.as_inputs([2]))])
    self.assertTrue(
        pg.eq(
            e,
            base.Evaluation(
                inputs=base.as_inputs([2]),
            ),
        )
    )

  def test_patch_schema_fn(self):
    # schema_fn accepts a functor value.
    @pg.functor()
    def int_schema():
      return int

    e = base.Evaluation(inputs=base.as_inputs([1]))
    pg.patch(e, [patching.patch_schema_fn(int_schema())])
    self.assertTrue(
        pg.eq(
            e,
            base.Evaluation(
                inputs=base.as_inputs([1]),
                schema_fn=int_schema(),
            ),
        )
    )
146
+
147
+
148
class StringPatcheTest(unittest.TestCase):
  """Tests for string-based patching (e.g. `pg.patch(e, ['lm?haiku:gpt4'])`)."""
  # NOTE(review): the class name looks like a typo of `StringPatcherTest`.
  # It still runs (unittest discovers TestCase subclasses by inheritance),
  # but consider renaming in a follow-up change.

  def test_lm(self):
    # 'lm?haiku:gpt4' fans out into a pg.oneof over the named models, while
    # 'max_tokens?1024' / 'temperature?0.7' patch the shared sampling options.
    target = pg.patch(
        base.Evaluation(inputs=base.as_inputs([1])),
        ['lm?haiku:gpt4', 'max_tokens?1024', 'temperature?0.7'],
    )
    self.assertEqual(
        target.lm,
        pg.oneof([
            lf_llms.Claude3Haiku(temperature=0.7, max_tokens=1024),
            lf_llms.Gpt4(temperature=0.7, max_tokens=1024),
        ]),
    )
    # Unrecognized model names should fail fast with a clear error.
    with self.assertRaisesRegex(ValueError, 'Unknown model name'):
      pg.patch(
          base.Evaluation(inputs=base.as_inputs([1])),
          ['lm?gpt2'],
      )
167
+
168
+
169
# Allows running this test file directly, e.g. `python patching_test.py`.
if __name__ == '__main__':
  unittest.main()
@@ -61,25 +61,36 @@ class Scoring(base.Evaluation):
61
61
  super()._reset()
62
62
  self._scored = []
63
63
 
64
- def audit(self, example: Any, output: Any, message: lf.Message) -> None:
64
+ def audit_processed(
65
+ self, example_idx: int, example: Any, output: Any, message: lf.Message,
66
+ dryrun: bool = False
67
+ ) -> None:
68
+ del example_idx
65
69
  score = self.score(example, output)
70
+
71
+ if dryrun:
72
+ lf.console.write('')
73
+ lf.console.write(
74
+ str(score),
75
+ title='SCORE',
76
+ color='blue',
77
+ )
66
78
  self._scored.append((example, output, score, message))
67
79
 
68
80
  @abc.abstractmethod
69
81
  def score(self, example: Any, output: Any) -> float:
70
82
  """Scores the output against its input example."""
71
83
 
72
- def _status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
84
+ def _eval_status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
73
85
  del progress
74
86
  return {
75
- 'Model': self.lm.model_id,
76
87
  'Average Score': {self.avg_score},
77
- 'Scored': '%.2f%% (%d/%d)' % (
88
+ 'Scored': '%.3f%% (%d/%d)' % (
78
89
  self.score_rate * 100,
79
90
  self.num_scored,
80
91
  self.num_completed,
81
92
  ),
82
- 'Failed': '%.2f%% (%d/%d)' % (
93
+ 'Failed': '%.3f%% (%d/%d)' % (
83
94
  self.failure_rate * 100,
84
95
  self.num_failures,
85
96
  self.num_completed,
@@ -90,8 +101,8 @@ class Scoring(base.Evaluation):
90
101
  assert self.result is not None
91
102
  m = self.result.metrics
92
103
  return (
93
- 'COMPLETED(%s): AvgScore=%f Scored=%.2f%% (%d/%d) '
94
- 'Failures=%.2f%% (%d/%d)'
104
+ 'COMPLETED(%s): AvgScore=%f Scored=%.3f%% (%d/%d) '
105
+ 'Failures=%.3f%% (%d/%d)'
95
106
  ) % (
96
107
  run_status,
97
108
  m.avg_score,
@@ -103,8 +114,8 @@ class Scoring(base.Evaluation):
103
114
  m.total,
104
115
  )
105
116
 
106
- def summarize(self) -> pg.Dict:
107
- result = super().summarize()
117
+ def finalize(self) -> pg.Dict:
118
+ result = super().finalize()
108
119
  result.metrics.update(
109
120
  num_scored=self.num_scored,
110
121
  score_rate=self.score_rate,
@@ -118,16 +129,12 @@ class Scoring(base.Evaluation):
118
129
  super().save(definition, result, report)
119
130
 
120
131
  if result:
121
-
122
- def force_dict(v):
123
- return pg.object_utils.json_conversion.strip_types(pg.to_json(v))
124
-
125
132
  # Save scored.
126
133
  pg.save(
127
134
  [
128
135
  # We force the output to be dict as its type may be defined
129
136
  # within functors which could be deserialized.
130
- pg.Dict(input=input, output=force_dict(output), score=score)
137
+ pg.Dict(input=input, output=output, score=score)
131
138
  for input, output, score, _ in self.scored
132
139
  ],
133
140
  os.path.join(self.dir, Scoring.SCORED_JSON),
@@ -148,32 +155,30 @@ class Scoring(base.Evaluation):
148
155
  def _render_result_row(self, s: io.StringIO):
149
156
  super()._render_result_row(s)
150
157
  s.write(
151
- '<td><span style="color:blue">%.2f</span></td>' % self.avg_score
158
+ '<td><span style="color:blue">%.3f</span></td>' % self.avg_score
152
159
  )
153
160
  s.write(
154
161
  '<td><span style="color:red">%s</span>%s</td>'
155
162
  % (
156
- '%.2f%% ' % (self.score_rate * 100),
163
+ '%.3f%% ' % (self.score_rate * 100),
157
164
  '<a href="%s">(%d/%d)</a>'
158
165
  % (self.scored_link, self.num_scored, self.num_completed),
159
166
  )
160
167
  )
161
168
 
162
- def _render_metric(self, s: io.StringIO) -> None:
169
+ def _render_summary_metrics(self, s: io.StringIO) -> None:
163
170
  """Renders metrics in HTML."""
164
171
  assert self.result is not None
165
172
  m = self.result.metrics
166
- s.write(
167
- '<a title="Average score (%d/%d)" href="%s" style="color:green">%s</a>'
168
- % (
169
- m.num_scored,
170
- m.total,
171
- self.scored_link,
172
- '%.2f%%' % (m.score_rate * 100),
173
- )
173
+ self._render_link(
174
+ s,
175
+ 'Average score (%d/%d)' % (m.num_scored, m.total),
176
+ '%.3f (%.3f%%)' % (m.avg_score, m.score_rate * 100),
177
+ 'color:green',
178
+ lambda: self.scored_link,
174
179
  )
175
180
  s.write(' | ')
176
- super()._render_metric(s)
181
+ super()._render_summary_metrics(s)
177
182
 
178
183
  def _render_scored(self, s: io.StringIO) -> None:
179
184
  """Formats the matched cases into html."""
@@ -189,9 +194,13 @@ class Scoring(base.Evaluation):
189
194
  for i, (example, output, score, message) in enumerate(self.scored):
190
195
  bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
191
196
  s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
192
- input_str = pg.format(example, verbose=False)
197
+ input_str = pg.Html.escape(
198
+ pg.format(example, verbose=False, max_bytes_len=32)
199
+ )
193
200
  s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
194
- output_str = pg.format(output, verbose=False)
201
+ output_str = pg.Html.escape(
202
+ pg.format(output, verbose=False, max_bytes_len=32)
203
+ )
195
204
  s.write(f'<td style="color:blue;white-space:pre-wrap">{output_str}</td>')
196
205
  s.write(f'<td style="color:magenta;white-space:pre-wrap">{score}</td>')
197
206
  s.write('<td>')
@@ -81,7 +81,7 @@ class ScoringTest(unittest.TestCase):
81
81
  s.result,
82
82
  dict(
83
83
  experiment_setup=dict(
84
- id='ConstraintFollowing@9e51bb9e',
84
+ id='ConstraintFollowing@5c88a5eb',
85
85
  dir=s.dir,
86
86
  model='StaticSequence',
87
87
  prompt_template='{{example}}',
@@ -98,10 +98,16 @@ class ScoringTest(unittest.TestCase):
98
98
  total=2,
99
99
  failures=0,
100
100
  failure_rate=0.0,
101
+ oop_failures=0,
102
+ oop_failure_rate=0.0,
103
+ non_oop_failures=0,
104
+ non_oop_failure_rate=0.0,
105
+ failure_breakdown={},
101
106
  num_scored=2,
102
107
  score_rate=1.0,
103
108
  avg_score=0.5,
104
109
  ),
110
+ usage=s.result.usage,
105
111
  ),
106
112
  )
107
113
  self.assertTrue(
@@ -123,7 +129,12 @@ class ScoringTest(unittest.TestCase):
123
129
  )
124
130
  self.assertTrue(
125
131
  os.path.exists(
126
- os.path.join(s.dir, scoring.Scoring.FAILURES_JSON)
132
+ os.path.join(s.dir, scoring.Scoring.OOP_FAILURES_JSON)
133
+ )
134
+ )
135
+ self.assertTrue(
136
+ os.path.exists(
137
+ os.path.join(s.dir, scoring.Scoring.NON_OOP_FAILURES_JSON)
127
138
  )
128
139
  )
129
140
  self.assertTrue(
@@ -142,7 +153,14 @@ class ScoringTest(unittest.TestCase):
142
153
  self.assertTrue(
143
154
  os.path.exists(
144
155
  os.path.join(
145
- s.dir, scoring.Scoring.FAILURES_HTML
156
+ s.dir, scoring.Scoring.OOP_FAILURES_HTML
157
+ )
158
+ )
159
+ )
160
+ self.assertTrue(
161
+ os.path.exists(
162
+ os.path.join(
163
+ s.dir, scoring.Scoring.NON_OOP_FAILURES_HTML
146
164
  )
147
165
  )
148
166
  )
@@ -0,0 +1,42 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """langfun eval framework v2."""
15
+
16
+ # pylint: disable=g-importing-member
17
+ # pylint: disable=g-bad-import-order
18
+ from langfun.core.eval.v2.experiment import Experiment
19
+ from langfun.core.eval.v2.experiment import Suite
20
+ from langfun.core.eval.v2.evaluation import Evaluation
21
+
22
+ from langfun.core.eval.v2.example import Example
23
+ from langfun.core.eval.v2.progress import Progress
24
+
25
+ from langfun.core.eval.v2.metric_values import MetricValue
26
+ from langfun.core.eval.v2.metric_values import Rate
27
+ from langfun.core.eval.v2.metric_values import Average
28
+ from langfun.core.eval.v2.metrics import Metric
29
+ from langfun.core.eval.v2 import metrics
30
+
31
+ from langfun.core.eval.v2.experiment import Plugin
32
+ from langfun.core.eval.v2.experiment import Runner
33
+ from langfun.core.eval.v2 import runners
34
+
35
+ # Plugins
36
+ from langfun.core.eval.v2.checkpointing import BulkCheckpointer
37
+ from langfun.core.eval.v2.checkpointing import PerExampleCheckpointer
38
+ from langfun.core.eval.v2.reporting import HtmlReporter
39
+
40
+
41
+ # pylint: enable=g-bad-import-order
42
+ # pylint: enable=g-importing-member