langfun 0.0.2.dev20240330__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +22 -2
- langfun/core/__init__.py +17 -5
- langfun/core/agentic/__init__.py +30 -0
- langfun/core/agentic/action.py +854 -0
- langfun/core/agentic/action_eval.py +150 -0
- langfun/core/agentic/action_eval_test.py +109 -0
- langfun/core/agentic/action_test.py +136 -0
- langfun/core/coding/python/__init__.py +5 -11
- langfun/core/coding/python/correction.py +37 -28
- langfun/core/coding/python/correction_test.py +29 -3
- langfun/core/coding/python/execution.py +40 -216
- langfun/core/coding/python/execution_test.py +29 -89
- langfun/core/coding/python/generation.py +21 -11
- langfun/core/coding/python/generation_test.py +2 -2
- langfun/core/coding/python/parsing.py +108 -193
- langfun/core/coding/python/parsing_test.py +2 -105
- langfun/core/component.py +69 -2
- langfun/core/component_test.py +54 -0
- langfun/core/concurrent.py +414 -117
- langfun/core/concurrent_test.py +111 -24
- langfun/core/console.py +18 -5
- langfun/core/console_test.py +17 -0
- langfun/core/eval/__init__.py +17 -0
- langfun/core/eval/base.py +767 -140
- langfun/core/eval/base_test.py +238 -53
- langfun/core/eval/matching.py +80 -76
- langfun/core/eval/matching_test.py +19 -9
- langfun/core/eval/patching.py +130 -0
- langfun/core/eval/patching_test.py +170 -0
- langfun/core/eval/scoring.py +37 -28
- langfun/core/eval/scoring_test.py +21 -3
- langfun/core/eval/v2/__init__.py +42 -0
- langfun/core/eval/v2/checkpointing.py +380 -0
- langfun/core/eval/v2/checkpointing_test.py +228 -0
- langfun/core/eval/v2/eval_test_helper.py +136 -0
- langfun/core/eval/v2/evaluation.py +725 -0
- langfun/core/eval/v2/evaluation_test.py +180 -0
- langfun/core/eval/v2/example.py +305 -0
- langfun/core/eval/v2/example_test.py +128 -0
- langfun/core/eval/v2/experiment.py +1048 -0
- langfun/core/eval/v2/experiment_test.py +433 -0
- langfun/core/eval/v2/metric_values.py +156 -0
- langfun/core/eval/v2/metric_values_test.py +80 -0
- langfun/core/eval/v2/metrics.py +357 -0
- langfun/core/eval/v2/metrics_test.py +203 -0
- langfun/core/eval/v2/progress.py +348 -0
- langfun/core/eval/v2/progress_test.py +82 -0
- langfun/core/eval/v2/progress_tracking.py +210 -0
- langfun/core/eval/v2/progress_tracking_test.py +66 -0
- langfun/core/eval/v2/reporting.py +270 -0
- langfun/core/eval/v2/reporting_test.py +158 -0
- langfun/core/eval/v2/runners.py +488 -0
- langfun/core/eval/v2/runners_test.py +334 -0
- langfun/core/langfunc.py +3 -21
- langfun/core/langfunc_test.py +26 -8
- langfun/core/language_model.py +686 -48
- langfun/core/language_model_test.py +681 -44
- langfun/core/llms/__init__.py +100 -12
- langfun/core/llms/anthropic.py +488 -0
- langfun/core/llms/anthropic_test.py +235 -0
- langfun/core/llms/cache/base.py +21 -2
- langfun/core/llms/cache/in_memory.py +13 -0
- langfun/core/llms/cache/in_memory_test.py +88 -28
- langfun/core/llms/compositional.py +101 -0
- langfun/core/llms/compositional_test.py +73 -0
- langfun/core/llms/deepseek.py +117 -0
- langfun/core/llms/deepseek_test.py +61 -0
- langfun/core/llms/fake.py +39 -26
- langfun/core/llms/fake_test.py +136 -11
- langfun/core/llms/gemini.py +507 -0
- langfun/core/llms/gemini_test.py +195 -0
- langfun/core/llms/google_genai.py +62 -218
- langfun/core/llms/google_genai_test.py +9 -197
- langfun/core/llms/groq.py +276 -0
- langfun/core/llms/groq_test.py +64 -0
- langfun/core/llms/llama_cpp.py +15 -40
- langfun/core/llms/llama_cpp_test.py +4 -30
- langfun/core/llms/openai.py +436 -226
- langfun/core/llms/openai_compatible.py +179 -0
- langfun/core/llms/openai_compatible_test.py +495 -0
- langfun/core/llms/openai_test.py +35 -174
- langfun/core/llms/rest.py +113 -0
- langfun/core/llms/rest_test.py +111 -0
- langfun/core/llms/vertexai.py +192 -0
- langfun/core/llms/vertexai_test.py +52 -0
- langfun/core/logging.py +284 -0
- langfun/core/logging_test.py +125 -0
- langfun/core/message.py +319 -9
- langfun/core/message_test.py +190 -13
- langfun/core/modalities/__init__.py +6 -2
- langfun/core/modalities/audio.py +30 -0
- langfun/core/modalities/audio_test.py +63 -0
- langfun/core/modalities/image.py +39 -20
- langfun/core/modalities/image_test.py +52 -9
- langfun/core/modalities/mime.py +206 -29
- langfun/core/modalities/mime_test.py +90 -9
- langfun/core/modalities/ms_office.py +117 -0
- langfun/core/modalities/ms_office_test.py +389 -0
- langfun/core/modalities/pdf.py +22 -0
- langfun/core/modalities/pdf_test.py +57 -0
- langfun/core/modalities/video.py +9 -23
- langfun/core/modalities/video_test.py +3 -3
- langfun/core/modality.py +26 -3
- langfun/core/modality_test.py +2 -2
- langfun/core/sampling.py +11 -11
- langfun/core/structured/__init__.py +15 -16
- langfun/core/structured/completion.py +32 -5
- langfun/core/structured/completion_test.py +9 -8
- langfun/core/structured/description.py +2 -2
- langfun/core/structured/description_test.py +3 -3
- langfun/core/structured/function_generation.py +278 -0
- langfun/core/structured/function_generation_test.py +399 -0
- langfun/core/structured/mapping.py +150 -46
- langfun/core/structured/mapping_test.py +105 -0
- langfun/core/structured/parsing.py +33 -21
- langfun/core/structured/parsing_test.py +71 -22
- langfun/core/structured/querying.py +746 -0
- langfun/core/structured/{prompting_test.py → querying_test.py} +545 -60
- langfun/core/structured/schema.py +208 -99
- langfun/core/structured/schema_generation.py +1 -1
- langfun/core/structured/schema_generation_test.py +2 -2
- langfun/core/structured/schema_test.py +133 -34
- langfun/core/structured/scoring.py +125 -19
- langfun/core/structured/scoring_test.py +30 -0
- langfun/core/structured/tokenization.py +64 -0
- langfun/core/structured/tokenization_test.py +48 -0
- langfun/core/template.py +240 -11
- langfun/core/template_test.py +146 -1
- langfun/core/templates/conversation.py +9 -0
- langfun/core/templates/conversation_test.py +4 -3
- langfun/core/templates/selfplay_test.py +14 -2
- langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
- langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
- {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
- langfun/core/coding/python/errors.py +0 -108
- langfun/core/coding/python/errors_test.py +0 -99
- langfun/core/coding/python/permissions.py +0 -90
- langfun/core/coding/python/permissions_test.py +0 -86
- langfun/core/structured/prompting.py +0 -217
- langfun/core/text_formatting.py +0 -162
- langfun/core/text_formatting_test.py +0 -47
- langfun-0.0.2.dev20240330.dist-info/METADATA +0 -99
- langfun-0.0.2.dev20240330.dist-info/RECORD +0 -102
- {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
- {langfun-0.0.2.dev20240330.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
@@ -103,7 +103,7 @@ class MatchingTest(unittest.TestCase):
|
|
103
103
|
s.result,
|
104
104
|
dict(
|
105
105
|
experiment_setup=dict(
|
106
|
-
id='MyTask@
|
106
|
+
id='MyTask@739a174b',
|
107
107
|
dir=s.dir,
|
108
108
|
model='StaticSequence',
|
109
109
|
prompt_template='{{example.question}}',
|
@@ -120,11 +120,19 @@ class MatchingTest(unittest.TestCase):
|
|
120
120
|
total=4,
|
121
121
|
failures=1,
|
122
122
|
failure_rate=0.25,
|
123
|
+
oop_failures=1,
|
124
|
+
oop_failure_rate=0.25,
|
125
|
+
non_oop_failures=0,
|
126
|
+
non_oop_failure_rate=0.0,
|
127
|
+
failure_breakdown={
|
128
|
+
'MappingError.SchemaError.TypeError': 1
|
129
|
+
},
|
123
130
|
num_matches=2,
|
124
131
|
match_rate=0.5,
|
125
132
|
num_mismatches=1,
|
126
133
|
mismatch_rate=0.25,
|
127
134
|
),
|
135
|
+
usage=s.result.usage,
|
128
136
|
),
|
129
137
|
)
|
130
138
|
self.assertTrue(
|
@@ -144,22 +152,17 @@ class MatchingTest(unittest.TestCase):
|
|
144
152
|
os.path.join(s.dir, matching.Matching.CACHE_JSON)
|
145
153
|
)
|
146
154
|
)
|
147
|
-
self.assertTrue(
|
148
|
-
os.path.exists(
|
149
|
-
os.path.join(s.dir, matching.Matching.MATCHES_JSON)
|
150
|
-
)
|
151
|
-
)
|
152
155
|
self.assertTrue(
|
153
156
|
os.path.exists(
|
154
157
|
os.path.join(
|
155
|
-
s.dir, matching.Matching.
|
158
|
+
s.dir, matching.Matching.OOP_FAILURES_JSON
|
156
159
|
)
|
157
160
|
)
|
158
161
|
)
|
159
162
|
self.assertTrue(
|
160
163
|
os.path.exists(
|
161
164
|
os.path.join(
|
162
|
-
s.dir, matching.Matching.
|
165
|
+
s.dir, matching.Matching.NON_OOP_FAILURES_JSON
|
163
166
|
)
|
164
167
|
)
|
165
168
|
)
|
@@ -174,7 +177,14 @@ class MatchingTest(unittest.TestCase):
|
|
174
177
|
self.assertTrue(
|
175
178
|
os.path.exists(
|
176
179
|
os.path.join(
|
177
|
-
s.dir, matching.Matching.
|
180
|
+
s.dir, matching.Matching.OOP_FAILURES_HTML
|
181
|
+
)
|
182
|
+
)
|
183
|
+
)
|
184
|
+
self.assertTrue(
|
185
|
+
os.path.exists(
|
186
|
+
os.path.join(
|
187
|
+
s.dir, matching.Matching.NON_OOP_FAILURES_HTML
|
178
188
|
)
|
179
189
|
)
|
180
190
|
)
|
@@ -0,0 +1,130 @@
|
|
1
|
+
# Copyright 2024 The Langfun Authors
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
"""Experiment patching for Langfun evaluations."""
|
15
|
+
|
16
|
+
import inspect
|
17
|
+
from typing import Union
|
18
|
+
import langfun.core as lf
|
19
|
+
from langfun.core import llms as lf_llms
|
20
|
+
from langfun.core.eval import base
|
21
|
+
import pyglove as pg
|
22
|
+
|
23
|
+
|
24
|
+
#
|
25
|
+
# Program-based patchers.
|
26
|
+
#
|
27
|
+
|
28
|
+
|
29
|
+
def patch_member(cls, key, value, parent_key: str | None = None):
|
30
|
+
"""Patches a member of a class."""
|
31
|
+
|
32
|
+
def _rebind_fn(k, v, p):
|
33
|
+
if (
|
34
|
+
isinstance(p, cls)
|
35
|
+
and k.key == key
|
36
|
+
and (parent_key is None or (p and p.sym_path.key == parent_key))
|
37
|
+
):
|
38
|
+
if inspect.isfunction(value):
|
39
|
+
return value(k, v, p)
|
40
|
+
return value
|
41
|
+
return v
|
42
|
+
|
43
|
+
return _rebind_fn
|
44
|
+
|
45
|
+
|
46
|
+
def patch_lm(lm: Union[lf.LanguageModel, pg.hyper.OneOf]): # pylint: disable=redefined-outer-name
|
47
|
+
"""Patches the LLM of evaluations."""
|
48
|
+
return patch_member(base.Evaluable, "lm", lm)
|
49
|
+
|
50
|
+
|
51
|
+
def patch_parsing_lm(lm: Union[lf.LanguageModel, pg.hyper.OneOf]): # pylint: disable=redefined-outer-name
|
52
|
+
"""Patches the parsing LLM of evaluations."""
|
53
|
+
return patch_member(base.Evaluable, "parsing_lm", lm)
|
54
|
+
|
55
|
+
|
56
|
+
def patch_schema_fn(schema_fn: Union[pg.Functor, pg.hyper.OneOf]):
|
57
|
+
"""Patches the schema_fn of evaluations."""
|
58
|
+
return patch_member(base.Evaluable, "schema_fn", schema_fn)
|
59
|
+
|
60
|
+
|
61
|
+
def patch_prompt(prompt: Union[str, lf.Template, pg.hyper.OneOf]):
|
62
|
+
"""Patches the prompt of evaluations."""
|
63
|
+
return patch_member(base.Evaluable, "prompt", prompt)
|
64
|
+
|
65
|
+
|
66
|
+
def patch_inputs(inputs: Union[pg.Functor, pg.hyper.OneOf]):
|
67
|
+
"""Patches the inputs used in evaluations."""
|
68
|
+
return patch_member(base.Evaluable, "inputs", inputs)
|
69
|
+
|
70
|
+
|
71
|
+
def patch_additional_args(**kwargs):
|
72
|
+
"""Patches additional_args."""
|
73
|
+
|
74
|
+
def value_fn(k, unused_v, p):
|
75
|
+
# We infer the symbolic value for the old args, as it might be a
|
76
|
+
# contextual attribute referring to its containing object.
|
77
|
+
old_args = p.sym_inferred(k.key)
|
78
|
+
if old_args:
|
79
|
+
old_args = dict(old_args)
|
80
|
+
old_args.update(kwargs)
|
81
|
+
return old_args
|
82
|
+
return kwargs
|
83
|
+
|
84
|
+
return patch_member(base.Evaluable, "additional_args", value_fn)
|
85
|
+
|
86
|
+
|
87
|
+
#
|
88
|
+
# String-based patching.
|
89
|
+
#
|
90
|
+
|
91
|
+
_NAMED_MODELS = {
|
92
|
+
# GPT models.
|
93
|
+
"gpt35turbo": lf_llms.Gpt35Turbo,
|
94
|
+
"gpt35turbo16k": lf_llms.Gpt35Turbo16K,
|
95
|
+
"gpt4": lf_llms.Gpt4,
|
96
|
+
"gpt4turbo": lf_llms.Gpt4Turbo,
|
97
|
+
# Anthropic models.
|
98
|
+
"haiku": lf_llms.Claude3Haiku,
|
99
|
+
"claude3haiku": lf_llms.Claude3Haiku,
|
100
|
+
"opus": lf_llms.Claude3Opus,
|
101
|
+
"claude3opus": lf_llms.Claude3Opus,
|
102
|
+
"sonnet": lf_llms.Claude3Sonnet,
|
103
|
+
"claude3sonnet": lf_llms.Claude3Opus,
|
104
|
+
}
|
105
|
+
|
106
|
+
|
107
|
+
def model_by_name(name: str) -> lf.LanguageModel:
|
108
|
+
"""Gets model by name."""
|
109
|
+
name = name.strip().lower()
|
110
|
+
if name in _NAMED_MODELS:
|
111
|
+
return _NAMED_MODELS[name]()
|
112
|
+
raise ValueError(f"Unknown model name: {name}")
|
113
|
+
|
114
|
+
|
115
|
+
@pg.patcher(auto_typing=True)
|
116
|
+
def lm(unused_eval, models: list[str]):
|
117
|
+
"""Patch the LM used for benchmarking."""
|
118
|
+
return patch_lm(pg.oneof([model_by_name(name) for name in models]))
|
119
|
+
|
120
|
+
|
121
|
+
@pg.patcher(auto_typing=True)
|
122
|
+
def temperature(unused_eval, value: float):
|
123
|
+
"""Patch the temperature used for benchmarking."""
|
124
|
+
return patch_member(lf.LMSamplingOptions, "temperature", value)
|
125
|
+
|
126
|
+
|
127
|
+
@pg.patcher(auto_typing=True)
|
128
|
+
def max_tokens(unused_eval, value: int | None):
|
129
|
+
"""Patch the max_tokens sampling option used for benchmarking."""
|
130
|
+
return patch_member(lf.LMSamplingOptions, "max_tokens", value)
|
@@ -0,0 +1,170 @@
|
|
1
|
+
# Copyright 2024 The Langfun Authors
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
"""Tests for evaluation patching."""
|
15
|
+
|
16
|
+
import unittest
|
17
|
+
from langfun.core import llms as lf_llms
|
18
|
+
from langfun.core.eval import base
|
19
|
+
from langfun.core.eval import patching
|
20
|
+
import pyglove as pg
|
21
|
+
|
22
|
+
|
23
|
+
class PatchingCommonTest(unittest.TestCase):
|
24
|
+
|
25
|
+
def test_patch_member(self):
|
26
|
+
class A(pg.Object):
|
27
|
+
x: int = 1
|
28
|
+
|
29
|
+
class B(pg.Object):
|
30
|
+
a: A
|
31
|
+
|
32
|
+
b = B(A())
|
33
|
+
pg.patch(b, [patching.patch_member(A, 'x', 2)])
|
34
|
+
self.assertEqual(b, B(A(2)))
|
35
|
+
|
36
|
+
def test_patch_args(self):
|
37
|
+
s = base.Suite(
|
38
|
+
[base.Evaluation(inputs=base.as_inputs([1]))],
|
39
|
+
additional_args=dict(x=1, y=2),
|
40
|
+
)
|
41
|
+
pg.patch(s, [patching.patch_additional_args(x=3, z=4)])
|
42
|
+
self.assertTrue(
|
43
|
+
pg.eq(
|
44
|
+
s,
|
45
|
+
base.Suite(
|
46
|
+
[
|
47
|
+
base.Evaluation(
|
48
|
+
inputs=base.as_inputs([1]),
|
49
|
+
additional_args=dict(x=3, y=2, z=4),
|
50
|
+
)
|
51
|
+
],
|
52
|
+
additional_args=dict(x=3, y=2, z=4),
|
53
|
+
),
|
54
|
+
)
|
55
|
+
)
|
56
|
+
|
57
|
+
def test_patch_lm(self):
|
58
|
+
s = base.Suite(
|
59
|
+
[base.Evaluation(inputs=base.as_inputs([1]))],
|
60
|
+
lm=lf_llms.Gpt35Turbo(),
|
61
|
+
)
|
62
|
+
pg.patch(
|
63
|
+
s, [patching.patch_lm(pg.oneof([lf_llms.Gpt35Turbo(), lf_llms.Gpt4()]))]
|
64
|
+
)
|
65
|
+
self.assertTrue(
|
66
|
+
pg.eq(
|
67
|
+
s,
|
68
|
+
base.Suite(
|
69
|
+
[
|
70
|
+
base.Evaluation(
|
71
|
+
inputs=base.as_inputs([1]),
|
72
|
+
lm=pg.oneof([lf_llms.Gpt35Turbo(), lf_llms.Gpt4()]),
|
73
|
+
)
|
74
|
+
],
|
75
|
+
lm=pg.oneof([lf_llms.Gpt35Turbo(), lf_llms.Gpt4()]),
|
76
|
+
),
|
77
|
+
)
|
78
|
+
)
|
79
|
+
|
80
|
+
def test_patch_parsing_lm(self):
|
81
|
+
s = base.Suite(
|
82
|
+
[base.Evaluation(inputs=base.as_inputs([1]))],
|
83
|
+
lm=lf_llms.Gpt4(),
|
84
|
+
)
|
85
|
+
pg.patch(s, [patching.patch_parsing_lm(lf_llms.Gpt35Turbo())])
|
86
|
+
self.assertTrue(
|
87
|
+
pg.eq(
|
88
|
+
s,
|
89
|
+
base.Suite(
|
90
|
+
[
|
91
|
+
base.Evaluation(
|
92
|
+
inputs=base.as_inputs([1]),
|
93
|
+
lm=lf_llms.Gpt4(),
|
94
|
+
parsing_lm=lf_llms.Gpt35Turbo(),
|
95
|
+
)
|
96
|
+
],
|
97
|
+
# NOTE(daiyip): Suite does not have `parsing_lm` as one of its
|
98
|
+
# variable keyword fields yet, so patching does not add to it.
|
99
|
+
# This is okay since we only care about the leaf nodes.
|
100
|
+
lm=lf_llms.Gpt4(),
|
101
|
+
),
|
102
|
+
)
|
103
|
+
)
|
104
|
+
|
105
|
+
def test_patch_prompt(self):
|
106
|
+
e = base.Evaluation(inputs=base.as_inputs([1]))
|
107
|
+
pg.patch(e, [patching.patch_prompt('Q: {{example.question}}')])
|
108
|
+
self.assertTrue(
|
109
|
+
pg.eq(
|
110
|
+
e,
|
111
|
+
base.Evaluation(
|
112
|
+
inputs=base.as_inputs([1]),
|
113
|
+
prompt='Q: {{example.question}}',
|
114
|
+
),
|
115
|
+
)
|
116
|
+
)
|
117
|
+
|
118
|
+
def test_patch_inputs(self):
|
119
|
+
e = base.Evaluation(inputs=base.as_inputs([1]))
|
120
|
+
pg.patch(e, [patching.patch_inputs(base.as_inputs([2]))])
|
121
|
+
self.assertTrue(
|
122
|
+
pg.eq(
|
123
|
+
e,
|
124
|
+
base.Evaluation(
|
125
|
+
inputs=base.as_inputs([2]),
|
126
|
+
),
|
127
|
+
)
|
128
|
+
)
|
129
|
+
|
130
|
+
def test_patch_schema_fn(self):
|
131
|
+
@pg.functor()
|
132
|
+
def int_schema():
|
133
|
+
return int
|
134
|
+
|
135
|
+
e = base.Evaluation(inputs=base.as_inputs([1]))
|
136
|
+
pg.patch(e, [patching.patch_schema_fn(int_schema())])
|
137
|
+
self.assertTrue(
|
138
|
+
pg.eq(
|
139
|
+
e,
|
140
|
+
base.Evaluation(
|
141
|
+
inputs=base.as_inputs([1]),
|
142
|
+
schema_fn=int_schema(),
|
143
|
+
),
|
144
|
+
)
|
145
|
+
)
|
146
|
+
|
147
|
+
|
148
|
+
class StringPatcheTest(unittest.TestCase):
|
149
|
+
|
150
|
+
def test_lm(self):
|
151
|
+
target = pg.patch(
|
152
|
+
base.Evaluation(inputs=base.as_inputs([1])),
|
153
|
+
['lm?haiku:gpt4', 'max_tokens?1024', 'temperature?0.7'],
|
154
|
+
)
|
155
|
+
self.assertEqual(
|
156
|
+
target.lm,
|
157
|
+
pg.oneof([
|
158
|
+
lf_llms.Claude3Haiku(temperature=0.7, max_tokens=1024),
|
159
|
+
lf_llms.Gpt4(temperature=0.7, max_tokens=1024),
|
160
|
+
]),
|
161
|
+
)
|
162
|
+
with self.assertRaisesRegex(ValueError, 'Unknown model name'):
|
163
|
+
pg.patch(
|
164
|
+
base.Evaluation(inputs=base.as_inputs([1])),
|
165
|
+
['lm?gpt2'],
|
166
|
+
)
|
167
|
+
|
168
|
+
|
169
|
+
if __name__ == '__main__':
|
170
|
+
unittest.main()
|
langfun/core/eval/scoring.py
CHANGED
@@ -61,25 +61,36 @@ class Scoring(base.Evaluation):
|
|
61
61
|
super()._reset()
|
62
62
|
self._scored = []
|
63
63
|
|
64
|
-
def
|
64
|
+
def audit_processed(
|
65
|
+
self, example_idx: int, example: Any, output: Any, message: lf.Message,
|
66
|
+
dryrun: bool = False
|
67
|
+
) -> None:
|
68
|
+
del example_idx
|
65
69
|
score = self.score(example, output)
|
70
|
+
|
71
|
+
if dryrun:
|
72
|
+
lf.console.write('')
|
73
|
+
lf.console.write(
|
74
|
+
str(score),
|
75
|
+
title='SCORE',
|
76
|
+
color='blue',
|
77
|
+
)
|
66
78
|
self._scored.append((example, output, score, message))
|
67
79
|
|
68
80
|
@abc.abstractmethod
|
69
81
|
def score(self, example: Any, output: Any) -> float:
|
70
82
|
"""Scores the output against its input example."""
|
71
83
|
|
72
|
-
def
|
84
|
+
def _eval_status(self, progress: lf.concurrent.Progress) -> dict[str, Any]:
|
73
85
|
del progress
|
74
86
|
return {
|
75
|
-
'Model': self.lm.model_id,
|
76
87
|
'Average Score': {self.avg_score},
|
77
|
-
'Scored': '%.
|
88
|
+
'Scored': '%.3f%% (%d/%d)' % (
|
78
89
|
self.score_rate * 100,
|
79
90
|
self.num_scored,
|
80
91
|
self.num_completed,
|
81
92
|
),
|
82
|
-
'Failed': '%.
|
93
|
+
'Failed': '%.3f%% (%d/%d)' % (
|
83
94
|
self.failure_rate * 100,
|
84
95
|
self.num_failures,
|
85
96
|
self.num_completed,
|
@@ -90,8 +101,8 @@ class Scoring(base.Evaluation):
|
|
90
101
|
assert self.result is not None
|
91
102
|
m = self.result.metrics
|
92
103
|
return (
|
93
|
-
'COMPLETED(%s): AvgScore=%f Scored=%.
|
94
|
-
'Failures=%.
|
104
|
+
'COMPLETED(%s): AvgScore=%f Scored=%.3f%% (%d/%d) '
|
105
|
+
'Failures=%.3f%% (%d/%d)'
|
95
106
|
) % (
|
96
107
|
run_status,
|
97
108
|
m.avg_score,
|
@@ -103,8 +114,8 @@ class Scoring(base.Evaluation):
|
|
103
114
|
m.total,
|
104
115
|
)
|
105
116
|
|
106
|
-
def
|
107
|
-
result = super().
|
117
|
+
def finalize(self) -> pg.Dict:
|
118
|
+
result = super().finalize()
|
108
119
|
result.metrics.update(
|
109
120
|
num_scored=self.num_scored,
|
110
121
|
score_rate=self.score_rate,
|
@@ -118,16 +129,12 @@ class Scoring(base.Evaluation):
|
|
118
129
|
super().save(definition, result, report)
|
119
130
|
|
120
131
|
if result:
|
121
|
-
|
122
|
-
def force_dict(v):
|
123
|
-
return pg.object_utils.json_conversion.strip_types(pg.to_json(v))
|
124
|
-
|
125
132
|
# Save scored.
|
126
133
|
pg.save(
|
127
134
|
[
|
128
135
|
# We force the output to be dict as its type may be defined
|
129
136
|
# within functors which could be deserialized.
|
130
|
-
pg.Dict(input=input, output=
|
137
|
+
pg.Dict(input=input, output=output, score=score)
|
131
138
|
for input, output, score, _ in self.scored
|
132
139
|
],
|
133
140
|
os.path.join(self.dir, Scoring.SCORED_JSON),
|
@@ -148,32 +155,30 @@ class Scoring(base.Evaluation):
|
|
148
155
|
def _render_result_row(self, s: io.StringIO):
|
149
156
|
super()._render_result_row(s)
|
150
157
|
s.write(
|
151
|
-
'<td><span style="color:blue">%.
|
158
|
+
'<td><span style="color:blue">%.3f</span></td>' % self.avg_score
|
152
159
|
)
|
153
160
|
s.write(
|
154
161
|
'<td><span style="color:red">%s</span>%s</td>'
|
155
162
|
% (
|
156
|
-
'%.
|
163
|
+
'%.3f%% ' % (self.score_rate * 100),
|
157
164
|
'<a href="%s">(%d/%d)</a>'
|
158
165
|
% (self.scored_link, self.num_scored, self.num_completed),
|
159
166
|
)
|
160
167
|
)
|
161
168
|
|
162
|
-
def
|
169
|
+
def _render_summary_metrics(self, s: io.StringIO) -> None:
|
163
170
|
"""Renders metrics in HTML."""
|
164
171
|
assert self.result is not None
|
165
172
|
m = self.result.metrics
|
166
|
-
|
167
|
-
|
168
|
-
% (
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
'%.2f%%' % (m.score_rate * 100),
|
173
|
-
)
|
173
|
+
self._render_link(
|
174
|
+
s,
|
175
|
+
'Average score (%d/%d)' % (m.num_scored, m.total),
|
176
|
+
'%.3f (%.3f%%)' % (m.avg_score, m.score_rate * 100),
|
177
|
+
'color:green',
|
178
|
+
lambda: self.scored_link,
|
174
179
|
)
|
175
180
|
s.write(' | ')
|
176
|
-
super().
|
181
|
+
super()._render_summary_metrics(s)
|
177
182
|
|
178
183
|
def _render_scored(self, s: io.StringIO) -> None:
|
179
184
|
"""Formats the matched cases into html."""
|
@@ -189,9 +194,13 @@ class Scoring(base.Evaluation):
|
|
189
194
|
for i, (example, output, score, message) in enumerate(self.scored):
|
190
195
|
bgcolor = 'white' if i % 2 == 0 else '#DDDDDD'
|
191
196
|
s.write(f'<tr style="background-color: {bgcolor}"><td>{i + 1}</td>')
|
192
|
-
input_str = pg.
|
197
|
+
input_str = pg.Html.escape(
|
198
|
+
pg.format(example, verbose=False, max_bytes_len=32)
|
199
|
+
)
|
193
200
|
s.write(f'<td style="color:green;white-space:pre-wrap">{input_str}</td>')
|
194
|
-
output_str = pg.
|
201
|
+
output_str = pg.Html.escape(
|
202
|
+
pg.format(output, verbose=False, max_bytes_len=32)
|
203
|
+
)
|
195
204
|
s.write(f'<td style="color:blue;white-space:pre-wrap">{output_str}</td>')
|
196
205
|
s.write(f'<td style="color:magenta;white-space:pre-wrap">{score}</td>')
|
197
206
|
s.write('<td>')
|
@@ -81,7 +81,7 @@ class ScoringTest(unittest.TestCase):
|
|
81
81
|
s.result,
|
82
82
|
dict(
|
83
83
|
experiment_setup=dict(
|
84
|
-
id='ConstraintFollowing@
|
84
|
+
id='ConstraintFollowing@5c88a5eb',
|
85
85
|
dir=s.dir,
|
86
86
|
model='StaticSequence',
|
87
87
|
prompt_template='{{example}}',
|
@@ -98,10 +98,16 @@ class ScoringTest(unittest.TestCase):
|
|
98
98
|
total=2,
|
99
99
|
failures=0,
|
100
100
|
failure_rate=0.0,
|
101
|
+
oop_failures=0,
|
102
|
+
oop_failure_rate=0.0,
|
103
|
+
non_oop_failures=0,
|
104
|
+
non_oop_failure_rate=0.0,
|
105
|
+
failure_breakdown={},
|
101
106
|
num_scored=2,
|
102
107
|
score_rate=1.0,
|
103
108
|
avg_score=0.5,
|
104
109
|
),
|
110
|
+
usage=s.result.usage,
|
105
111
|
),
|
106
112
|
)
|
107
113
|
self.assertTrue(
|
@@ -123,7 +129,12 @@ class ScoringTest(unittest.TestCase):
|
|
123
129
|
)
|
124
130
|
self.assertTrue(
|
125
131
|
os.path.exists(
|
126
|
-
os.path.join(s.dir, scoring.Scoring.
|
132
|
+
os.path.join(s.dir, scoring.Scoring.OOP_FAILURES_JSON)
|
133
|
+
)
|
134
|
+
)
|
135
|
+
self.assertTrue(
|
136
|
+
os.path.exists(
|
137
|
+
os.path.join(s.dir, scoring.Scoring.NON_OOP_FAILURES_JSON)
|
127
138
|
)
|
128
139
|
)
|
129
140
|
self.assertTrue(
|
@@ -142,7 +153,14 @@ class ScoringTest(unittest.TestCase):
|
|
142
153
|
self.assertTrue(
|
143
154
|
os.path.exists(
|
144
155
|
os.path.join(
|
145
|
-
s.dir, scoring.Scoring.
|
156
|
+
s.dir, scoring.Scoring.OOP_FAILURES_HTML
|
157
|
+
)
|
158
|
+
)
|
159
|
+
)
|
160
|
+
self.assertTrue(
|
161
|
+
os.path.exists(
|
162
|
+
os.path.join(
|
163
|
+
s.dir, scoring.Scoring.NON_OOP_FAILURES_HTML
|
146
164
|
)
|
147
165
|
)
|
148
166
|
)
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# Copyright 2024 The Langfun Authors
|
2
|
+
#
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
# you may not use this file except in compliance with the License.
|
5
|
+
# You may obtain a copy of the License at
|
6
|
+
#
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
#
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
# See the License for the specific language governing permissions and
|
13
|
+
# limitations under the License.
|
14
|
+
"""langfun eval framework v2."""
|
15
|
+
|
16
|
+
# pylint: disable=g-importing-member
|
17
|
+
# pylint: disable=g-bad-import-order
|
18
|
+
from langfun.core.eval.v2.experiment import Experiment
|
19
|
+
from langfun.core.eval.v2.experiment import Suite
|
20
|
+
from langfun.core.eval.v2.evaluation import Evaluation
|
21
|
+
|
22
|
+
from langfun.core.eval.v2.example import Example
|
23
|
+
from langfun.core.eval.v2.progress import Progress
|
24
|
+
|
25
|
+
from langfun.core.eval.v2.metric_values import MetricValue
|
26
|
+
from langfun.core.eval.v2.metric_values import Rate
|
27
|
+
from langfun.core.eval.v2.metric_values import Average
|
28
|
+
from langfun.core.eval.v2.metrics import Metric
|
29
|
+
from langfun.core.eval.v2 import metrics
|
30
|
+
|
31
|
+
from langfun.core.eval.v2.experiment import Plugin
|
32
|
+
from langfun.core.eval.v2.experiment import Runner
|
33
|
+
from langfun.core.eval.v2 import runners
|
34
|
+
|
35
|
+
# Plugins
|
36
|
+
from langfun.core.eval.v2.checkpointing import BulkCheckpointer
|
37
|
+
from langfun.core.eval.v2.checkpointing import PerExampleCheckpointer
|
38
|
+
from langfun.core.eval.v2.reporting import HtmlReporter
|
39
|
+
|
40
|
+
|
41
|
+
# pylint: enable=g-bad-import-order
|
42
|
+
# pylint: enable=g-importing-member
|