langfun 0.1.2.dev202510230805__py3-none-any.whl → 0.1.2.dev202511160804__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- langfun/core/__init__.py +1 -0
- langfun/core/agentic/action.py +107 -12
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +25 -0
- langfun/core/async_support.py +32 -3
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +1 -0
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +9 -2
- langfun/core/data/conversion/gemini_test.py +12 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +47 -43
- langfun/core/eval/base_test.py +4 -4
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +1 -0
- langfun/core/eval/v2/checkpointing.py +39 -5
- langfun/core/eval/v2/checkpointing_test.py +1 -1
- langfun/core/eval/v2/eval_test_helper.py +96 -0
- langfun/core/eval/v2/evaluation.py +87 -15
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +45 -39
- langfun/core/eval/v2/example_test.py +3 -3
- langfun/core/eval/v2/experiment.py +51 -8
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +30 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking_test.py +3 -0
- langfun/core/eval/v2/reporting.py +90 -71
- langfun/core/eval/v2/reporting_test.py +20 -6
- langfun/core/eval/v2/runners/__init__.py +26 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +22 -124
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +79 -0
- langfun/core/eval/v2/runners/parallel.py +100 -0
- langfun/core/eval/v2/runners/parallel_test.py +98 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +175 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +6 -4
- langfun/core/language_model.py +103 -16
- langfun/core/language_model_test.py +9 -3
- langfun/core/llms/__init__.py +7 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +14 -9
- langfun/core/llms/google_genai.py +29 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +36 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +12 -1
- langfun/core/llms/vertexai.py +51 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/client.py +77 -22
- langfun/core/mcp/client_test.py +8 -35
- langfun/core/mcp/session.py +94 -29
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/tool.py +151 -22
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +19 -1
- langfun/core/modalities/mime.py +62 -3
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +215 -142
- langfun/core/structured/querying_test.py +65 -29
- langfun/core/structured/schema/__init__.py +48 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +174 -49
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +8 -2
- langfun/env/base_environment.py +320 -128
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +92 -15
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +84 -361
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +1 -1
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +95 -98
- langfun/env/event_handlers/event_logger_test.py +21 -21
- langfun/env/event_handlers/metric_writer.py +225 -140
- langfun/env/event_handlers/metric_writer_test.py +23 -6
- langfun/env/interface.py +854 -40
- langfun/env/interface_test.py +112 -2
- langfun/env/load_balancers_test.py +23 -2
- langfun/env/test_utils.py +126 -84
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/METADATA +1 -1
- langfun-0.1.2.dev202511160804.dist-info/RECORD +211 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun/env/base_test.py +0 -1481
- langfun/env/event_handlers/base.py +0 -350
- langfun-0.1.2.dev202510230805.dist-info/RECORD +0 -195
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/__init__.py
CHANGED
@@ -38,6 +38,7 @@ from langfun.core.eval.v2 import runners
 from langfun.core.eval.v2.checkpointing import BulkCheckpointer
 from langfun.core.eval.v2.checkpointing import PerExampleCheckpointer
 from langfun.core.eval.v2.reporting import HtmlReporter
+from langfun.core.eval.v2.reporting import ExampleHtmlGenerator


 # pylint: enable=g-bad-import-order
langfun/core/eval/v2/checkpointing.py
CHANGED

@@ -29,13 +29,28 @@ Runner = experiment_lib.Runner


 class Checkpointer(experiment_lib.Plugin):
-  """Base class for checkpointing evaluation examples.
+  """Base class for checkpointing evaluation examples.
+
+  `Checkpointer` is a plugin that saves the state of processed examples
+  incrementally during an experiment run, allowing the experiment to be resumed
+  later. When an experiment starts, the checkpointer loads any previously saved
+  examples from an earlier run (or a warm-start run) into `experiment.state`,
+  so the runner can skip processing them again.
+  Subclasses should implement `_list_checkpoint_filenames` to identify
+  checkpoint files to load, and `_save_example` to save a newly processed
+  example.
+  """

   checkpoint_filename: Annotated[
       str,
       'Checkpoint file pattern.'
   ] = 'checkpoint.bagz'

+  max_ckpt_loading_threads: Annotated[
+      int,
+      'Max number of workers for loading checkpoint files at startup.'
+  ] = 128
+
   def on_experiment_start(
       self,
       runner: Runner,
@@ -149,7 +164,10 @@ class Checkpointer(experiment_lib.Plugin):

     _ = list(
         lf.concurrent_map(
-            _load_state,
+            _load_state,
+            ckpt_files,
+            max_workers=self.max_ckpt_loading_threads,
+            silence_on_errors=None
         )
     )

@@ -170,7 +188,12 @@ class Checkpointer(experiment_lib.Plugin):


 class PerExampleCheckpointer(Checkpointer):
-  """Checkpointer that saves each example to a separate file.
+  """Checkpointer that saves each example to a separate file.
+
+  This checkpointer saves each processed example to its own checkpoint file,
+  named using the pattern `<checkpoint_filename_prefix>_<example_id>.<ext>`.
+  For example, `checkpoint_1.bagz`, `checkpoint_2.bagz`, etc.
+  """

   def _on_bound(self):
     super()._on_bound()
@@ -235,7 +258,13 @@ class PerExampleCheckpointer(Checkpointer):


 class BulkCheckpointer(Checkpointer):
-  """Checkpointer that saves all examples to a single file.
+  """Checkpointer that saves all examples of an evaluation to a single file.
+
+  This checkpointer appends newly processed examples of an evaluation to a
+  single sequence file (e.g., `checkpoint.bagz`). This is often more efficient
+  than `PerExampleCheckpointer` when dealing with a large number of examples
+  or when file system overhead is a concern.
+  """

   def _on_bound(self):
     super()._on_bound()
@@ -341,7 +370,12 @@ class BulkCheckpointer(Checkpointer):


 class SequenceWriter:
-  """
+  """A thread-safe writer for sequence files (e.g., Bagz).
+
+  `SequenceWriter` wraps a `pg.io.SequenceWriter` to provide thread-safe
+  `add` and `close` operations, ensuring that examples can be written
+  concurrently from multiple threads without corrupting the sequence file.
+  """

   def __init__(self, path: str):
     self._lock = threading.Lock()
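Not part of the diff: a minimal usage sketch of the checkpointer plugins described above, using only the fields visible in these hunks (`checkpoint_filename` and the new `max_ckpt_loading_threads`). How a checkpointer is attached to an experiment run is not shown here, so that step is omitted.

```python
# Sketch only; constructor fields are the ones shown in the hunks above.
from langfun.core.eval.v2 import checkpointing

# One checkpoint file per example (checkpoint_1.bagz, checkpoint_2.bagz, ...),
# loading existing checkpoints with at most 32 worker threads instead of the
# new default of 128 set by `max_ckpt_loading_threads`.
per_example = checkpointing.PerExampleCheckpointer(
    checkpoint_filename='checkpoint.bagz',
    max_ckpt_loading_threads=32,
)

# All examples of an evaluation appended to a single sequence file.
bulk = checkpointing.BulkCheckpointer(checkpoint_filename='checkpoint.bagz')
```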
@@ -65,7 +65,7 @@ class ExampleCollector(experiment_lib.Plugin):
|
|
|
65
65
|
return self._examples
|
|
66
66
|
|
|
67
67
|
def on_example_complete(
|
|
68
|
-
self, runner:
|
|
68
|
+
self, runner: experiment_lib.Runner,
|
|
69
69
|
experiment: experiment_lib.Experiment,
|
|
70
70
|
example: example_lib.Example,
|
|
71
71
|
):
|
|
langfun/core/eval/v2/eval_test_helper.py
CHANGED

@@ -13,6 +13,9 @@
 # limitations under the License.
 """Helper classes and functions for evaluation tests."""

+import threading
+import time
+
 from langfun.core import language_model
 from langfun.core import llms
 from langfun.core import message as message_lib
@@ -47,6 +50,8 @@ class TestLLM(llms.Fake):

   offset: int = 0

+  __test__ = False
+
   def _response_from(self, prompt: message_lib.Message) -> message_lib.Message:
     return message_lib.AIMessage(
         str(prompt.metadata.x + prompt.metadata.y + self.offset)
@@ -63,6 +68,8 @@ class TestEvaluation(Evaluation):
   metrics = [metrics_lib.Match()]
   lm: language_model.LanguageModel = TestLLM()

+  __test__ = False
+
   def process(self, example):
     v = example.input
     if v.x == 5:
@@ -84,6 +91,8 @@ class TestEvaluationWithExampleCheckpointingError(TestEvaluation):
   inputs = test_inputs()
   metrics = [metrics_lib.Match()]

+  __test__ = False
+
   def process(self, example):
     return 1, dict(
         x=BadJsonConvertible()
@@ -101,6 +110,8 @@ class TestEvaluationWithExampleHtmlGenerationError(Evaluation):
   inputs = test_inputs()
   metrics = [metrics_lib.Match()]

+  __test__ = False
+
   def process(self, example):
     return 1, dict(
         x=BadHtmlConvertible()
@@ -110,6 +121,8 @@ class TestEvaluationWithExampleHtmlGenerationError(Evaluation):
 class TestEvaluationWithIndexHtmlGenerationError(TestEvaluation):
   """Test evaluation class with bad index HTML generation."""

+  __test__ = False
+
   def _html_tree_view(self, *args, **kwargs):
     raise ValueError('Cannot render HTML.')

@@ -135,3 +148,86 @@ def test_experiment_with_example_html_generation_error():
 def test_experiment_with_index_html_generation_error():
   """Returns a test experiment with bad index HTML."""
   return TestEvaluationWithIndexHtmlGenerationError()
+
+
+class TestPlugin(experiment_lib.Plugin):
+  """Plugin for testing."""
+
+  started_experiments: list[experiment_lib.Experiment] = []
+  completed_experiments: list[experiment_lib.Experiment] = []
+  skipped_experiments: list[experiment_lib.Experiment] = []
+  started_example_ids: list[int] = []
+  completed_example_ids: list[int] = []
+  start_time: float | None = None
+  complete_time: float | None = None
+
+  __test__ = False
+
+  def _on_bound(self):
+    super()._on_bound()
+    self._lock = threading.Lock()
+
+  def on_run_start(
+      self,
+      runner: experiment_lib.Runner,
+      root: experiment_lib.Experiment
+  ) -> None:
+    del root
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      self.start_time = time.time()
+
+  def on_run_complete(
+      self,
+      runner: experiment_lib.Runner,
+      root: experiment_lib.Experiment
+  ) -> None:
+    del root
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      self.complete_time = time.time()
+
+  def on_experiment_start(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment
+  ) -> None:
+    del runner
+    with pg.notify_on_change(False), self._lock:
+      self.started_experiments.append(pg.Ref(experiment))
+
+  def on_experiment_skipped(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment
+  ) -> None:
+    del runner
+    with pg.notify_on_change(False), self._lock:
+      self.skipped_experiments.append(pg.Ref(experiment))
+
+  def on_experiment_complete(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment
+  ) -> None:
+    del runner
+    with pg.notify_on_change(False), self._lock:
+      self.completed_experiments.append(pg.Ref(experiment))
+
+  def on_example_start(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment,
+      example: Example
+  ) -> None:
+    del runner, experiment
+    with pg.notify_on_change(False), self._lock:
+      self.started_example_ids.append(example.id)
+
+  def on_example_complete(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment,
+      example: Example
+  ) -> None:
+    del runner, experiment
+    with pg.notify_on_change(False), self._lock:
+      self.completed_example_ids.append(example.id)
langfun/core/eval/v2/evaluation.py
CHANGED

@@ -32,17 +32,63 @@ import pyglove as pg


 class Evaluation(experiment_lib.Experiment):
-  """
-
-
-
-
-
-
-
-
-
-
+  """Base class for Langfun evaluations.
+
+  `lf.eval.Evaluation` is the base class for defining evaluation tasks in
+  Langfun. Users typically subclass it to implement custom evaluation logic by
+  overriding `inputs` and `process` methods.
+
+  An `Evaluation` object encapsulates:
+
+  * **`inputs`**: A callable that returns an iterable of input examples to be
+    processed. This is usually provided by implementing an `inputs(self)`
+    method in the subclass, which yields input items for evaluation one by
+    one.
+  * **`process(self, example)`**: An abstract method that processes one
+    example and returns the output, or a tuple of (output, metadata).
+    The output will be used for computing metrics.
+  * **`metrics`**: A list of metrics (e.g., `lf.metrics.Accuracy`) to compute
+    based on the outputs from `process`. Some metrics may require users to
+    implement a `ground_truth(self, example)` method in the subclass to
+    compute metrics against ground truth.
+  * **Hyperparameters**: Any other attributes of the class serve as
+    hyperparameters for the evaluation (e.g., the language model to use).
+
+  **Running Evaluations:**
+
+  Evaluations are executed via `lf.eval.Suite` or by calling the `.run()`
+  method on an `Evaluation` instance, which returns a `Run` object
+  containing the evaluation run information and results. If an evaluation
+  contains sweeable parameters (using `pg.oneof`), `.run()` will expand it
+  into multiple evaluation sub-tasks -- one for each combination of
+  hyperparameters -- all managed within the same `Run`.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+  import pyglove as pg
+
+  class MyEval(lf.eval.Evaluation):
+    lm: lf.LanguageModel
+    prompt: str = '1 + 1 = '
+
+    def inputs(self):
+      yield 2
+
+    def process(self, example: lf.eval.Example):
+      return int(lf.query(self.prompt, lm=self.lm))
+
+    def ground_truth(self, example: lf.eval.Example) -> int:
+      return example.input
+
+  # Run evaluation using two different LMs
+  evaluation = MyEval(
+      lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini()]),
+      metrics=[lf.metrics.Accuracy()]
+  )
+  run_info = evaluation.run()
+  ```
   """

   inputs: Annotated[
@@ -126,6 +172,20 @@ class Evaluation(experiment_lib.Experiment):
   # Evaluation logics.
   #

+  def setup(self) -> None:
+    """Sets up resources required by the evaluation.
+
+    Subclasses should always call the super().setup() method to ensure the
+    proper initialization of the evaluation.
+    """
+
+  def teardown(self) -> None:
+    """Tears down resources used by the evaluation.
+
+    Subclasses should always call the super().teardown() method to ensure the
+    proper cleanup of the evaluation.
+    """
+
   @abc.abstractmethod
   def process(
       self,
@@ -137,7 +197,7 @@ class Evaluation(experiment_lib.Experiment):

     Args:
       example: An example object to process. `example.input` is an object
-
+        yielded from `inputs()` method.

     Returns:
       A processed output. Or a tuple of (output, metadata).
@@ -150,6 +210,7 @@ class Evaluation(experiment_lib.Experiment):
       example: example_lib.Example | int,
       raise_if_has_error: bool = False,
       reevaluate_upon_previous_errors: bool = True,
+      force_recompute_metrics: bool = False
   ) -> example_lib.Example:
     """Evaluates a single example input.

@@ -158,6 +219,8 @@ class Evaluation(experiment_lib.Experiment):
       raise_if_has_error: Whether to raise an error if the example has error.
       reevaluate_upon_previous_errors: Whether to reevaluate the example if
         the previous checkpointed run has error.
+      force_recompute_metrics: If True, force recompute the metrics even if
+        metric metadata is already present from previous checkpoint.

     Returns:
       The evaluated example with the output and metric metadata populated.
@@ -206,6 +269,7 @@ class Evaluation(experiment_lib.Experiment):
       # Use the output and metadata obtained from the previous processing.
       example.output = checkpointed.output
       example.metadata = checkpointed.metadata
+      example.metric_metadata = checkpointed.metric_metadata
       example.error = checkpointed.error
       example.newly_processed = False
       example.execution_status = checkpointed.execution_status
@@ -225,8 +289,16 @@ class Evaluation(experiment_lib.Experiment):
       self.info(f'Starting metric computation for example {example.id}.')
       metric_metadata = {}
       for metric in self.metrics:
-        metric_metadata.
-
+        metric_metadata[metric.name] = metric.update(
+            example, force_recompute=force_recompute_metrics
+        )
+
+      if example.metric_metadata is None:
+        example.metric_metadata = metric_metadata
+      else:
+        # Accumulate the metric metadata as there might be existing metadata
+        # from previous metric computation runs.
+        example.metric_metadata.update(metric_metadata)
       self.info(f'Completed metric computation for example {example.id}.')

     # For previously processed examples, we keep the execution status for the
@@ -760,7 +832,7 @@ class Evaluation(experiment_lib.Experiment):


 class EvaluationState:
-  """
+  """In-memory state of an evaluation."""

   class ExampleStatus(pg.Object):
     """Example state."""
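Not part of the diff: a hedged sketch exercising the new `force_recompute_metrics` parameter of `Evaluation.evaluate` added above. The `MyEval` class body and the `lf.eval` / `lf.llms` / `lf.metrics` references are taken verbatim from the docstring example in this hunk; only the final call is new, and it follows the updated signature shown above.

```python
# Sketch based on the Evaluation docstring example above; API paths as quoted there.
import langfun as lf

class MyEval(lf.eval.Evaluation):
  lm: lf.LanguageModel
  prompt: str = '1 + 1 = '

  def inputs(self):
    yield 2

  def process(self, example: lf.eval.Example):
    return int(lf.query(self.prompt, lm=self.lm))

  def ground_truth(self, example: lf.eval.Example) -> int:
    return example.input

evaluation = MyEval(lm=lf.llms.Gpt4(), metrics=[lf.metrics.Accuracy()])
# Re-evaluate example #1 and recompute metrics even if metric metadata was
# already restored from a previous checkpoint.
example = evaluation.evaluate(1, force_recompute_metrics=True)
```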
langfun/core/eval/v2/evaluation_test.py
CHANGED

@@ -88,7 +88,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(example.output, 6)
     self.assertIsNone(example.error)
     self.assertEqual(example.metadata, {})
-    self.assertEqual(example.metric_metadata, dict(match=True))
+    self.assertEqual(example.metric_metadata, dict(match=dict(is_correct=True)))
     self.assertIsNotNone(example.usage_summary)
     self.assertGreater(example.usage_summary.total.total_tokens, 0)
     self.assertEqual(example.usage_summary.total.num_requests, 1)
@@ -103,7 +103,10 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(example.output, 7)
     self.assertIsNone(example.error)
     self.assertEqual(example.metadata, {})
-    self.assertEqual(
+    self.assertEqual(
+        example.metric_metadata,
+        dict(match=dict(is_correct=False))
+    )

     with self.assertRaisesRegex(ValueError, 'x should not be 5'):
       _ = exp.evaluate(6, raise_if_has_error=True)
@@ -113,7 +116,10 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(pg.MISSING_VALUE, example.output)
     self.assertEqual(example.error.tag, 'ValueError')
     self.assertEqual(example.metadata, {})
-    self.assertEqual(
+    self.assertEqual(
+        example.metric_metadata,
+        dict(match=dict(error='ValueError'))
+    )

   def test_evaluate_withstate(self):
     eval_dir = os.path.join(tempfile.mkdtemp(), 'test_eval')
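The test updates above capture the shape change of `Example.metric_metadata` in this release: each metric now reports a nested dictionary of fields keyed by metric name instead of a single flat value. A before/after sketch using the values asserted in these tests:

```python
# Values taken from the updated assertions above.
old_shape = {'match': True}                            # previous release
new_shape_correct = {'match': {'is_correct': True}}    # successful match
new_shape_mismatch = {'match': {'is_correct': False}}  # mismatch
new_shape_error = {'match': {'error': 'ValueError'}}   # processing error
```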
langfun/core/eval/v2/example.py
CHANGED
@@ -22,19 +22,30 @@ import pyglove as pg

 @dataclasses.dataclass
 class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
-  """An
+  """An example for evaluation.
+
+  An evaluation example contains the input and output of an evaluation task,
+  as well as metadata about the evaluation process, such as execution time,
+  LLM usage, and metric results.

   Attributes:
-    id: The 1-based ID of the
-    input: An element returned from the `Evaluable.inputs` functor
-
-
-
-
-
-
-
-
+    id: The 1-based ID of the example in the evaluation set.
+    input: An element returned from the `Evaluable.inputs` functor, which serves
+      as the input for `lf.Evaluable.process`.
+    output: The output of `lf.Evaluable.process` method. If `pg.MISSING_VALUE`,
+      it indicates the example has not been processed yet.
+    error: The error raised from `lf.Evaluable.process`. If None, it
+      indicates the process was successful.
+    metadata: The metadata of the example produced by `lf.Evaluable.process`.
+    metric_metadata: The dictionary returned from `Metric.audit`, which contains
+      metadata about metric computation for this example.
+    newly_processed: Whether this example is processed in the current run. If
+      False, it indicates the example was loaded from a checkpoint from previous
+      runs.
+    start_time: The start time of processing this example.
+    end_time: The end time of processing this example.
+    usage_summary: The summary of LLM usages for processing this example.
+    execution_status: The timeit status of processing this example.
   """
   id: int
   input: Any = pg.MISSING_VALUE
@@ -49,14 +60,6 @@ class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
   usage_summary: lf.UsageSummary | None = None
   execution_status: dict[str, pg.utils.TimeIt.Status] | None = None

-  def __post_init__(self):
-    if self.execution_status is not None:
-      for status in self.execution_status.values():
-        if status.has_error:
-          assert isinstance(status.error, pg.ErrorInfo)
-          self.error = status.error
-          break
-
   @property
   def is_processed(self) -> bool:
     """Returns whether the item has been processed."""
@@ -182,15 +185,23 @@ class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
     extra_flags = extra_flags or {}
     num_examples = extra_flags.get('num_examples', None)

-    def
-
-
-
-
-
-
-
-
+    def _metric_label_group(metric_metadata: dict[str, Any] | None):
+      """Renders a label group for metric metadata."""
+      badges = []
+      if metric_metadata:
+        for metric_name, metadata in metric_metadata.items():
+          assert isinstance(metadata, dict), (metric_name, metadata)
+          for k, v in metadata.items():
+            css_class = k
+            if isinstance(v, bool):
+              css_class += '_true' if v else '_false'
+            badge = pg.views.html.controls.Badge(
+                f'{k}:{v}',
+                tooltip=f'{metric_name}: {k}',
+                css_classes=[css_class],
+            )
+            badges.append(badge)
+      return pg.views.html.controls.LabelGroup(badges)

     def _render_header():
       return pg.Html.element(
@@ -229,12 +240,7 @@ class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
                 extra_flags=dict(as_badge=True)
             ) if self.usage_summary is not None else None,
             # Metric metadata.
-
-            [  # pylint: disable=g-long-ternary
-                _metric_metadata_badge(k, v)
-                for k, v in self.metric_metadata.items()
-            ] if self.metric_metadata else []
-            ),
+            _metric_label_group(self.metric_metadata)
         ],
         css_classes=['example-container'],
     )
@@ -305,18 +311,18 @@ class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
           color: black;
         }
         /* Badge styles. */
-        .eval-example .badge.
+        .eval-example .badge.is_correct_true {
          color: green;
          background-color: #dcefbe;
         }
+        .eval-example .badge.is_correct_false {
+          color: orange;
+          background-color: #ffefc4;
+        }
         .eval-example .badge.error {
          color: red;
          background-color: #fdcccc;
         }
-        .eval-example .badge.mismatch {
-          color: orange;
-          background-color: #ffefc4;
-        }
         .eval-example .badge.score {
          color: blue;
          background-color: #c4dced;
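The `_metric_label_group` helper added above turns each metric-metadata field into a badge, and for boolean fields it suffixes the CSS class with `_true`/`_false`, which is what the new `.is_correct_true` and `.is_correct_false` styles target. A standalone sketch of that naming rule (no langfun dependency):

```python
def badge_css_class(field_name: str, value) -> str:
  """Mirrors the CSS-class derivation in `_metric_label_group` above."""
  css_class = field_name
  if isinstance(value, bool):
    css_class += '_true' if value else '_false'
  return css_class

assert badge_css_class('is_correct', True) == 'is_correct_true'
assert badge_css_class('is_correct', False) == 'is_correct_false'
assert badge_css_class('error', 'ValueError') == 'error'
assert badge_css_class('score', 0.75) == 'score'
```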
langfun/core/eval/v2/example_test.py
CHANGED

@@ -32,9 +32,9 @@ class ExampleTest(unittest.TestCase):
             name='evaluation', elapse=1.0, error=error
         )
     })
-    self.
+    self.assertIsNone(ex.error)
     self.assertFalse(ex.is_processed)
-    self.
+    self.assertFalse(ex.has_error)
     self.assertEqual(ex.elapse, 1.0)

     ex = Example(id=2, output=1)
@@ -116,7 +116,7 @@ class ExampleTest(unittest.TestCase):
         input=pg.Dict(a=1, b=2),
         output=3,
         metadata=dict(sum=3),
-        metric_metadata=dict(match=True),
+        metric_metadata=dict(match=dict(match=True)),
     )
     self.assertNotIn(
         'next',
langfun/core/eval/v2/experiment.py
CHANGED

@@ -139,10 +139,10 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):

   # Checkpointing

-  Experiments support checkpointing, which is enabled by default. It allows
+  Experiments support checkpointing, which is enabled by default. It allows
   users to resume their experiments from a saved state. When an experiment runs,
-  it creates a new directory for that run and saves
-
+  it creates a new directory for that run and saves its progress to checkpoint
+  files. If the experiment is interrupted or fails, users can resume
   it by specifying the 'id' or 'warm_start_from' argument (shown above) to
   seamlessly continue from previously saved state without starting over.

@@ -169,7 +169,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):

   # Experiment Plugins

-
+  Experiments can be extended by plugins. Plugins can listen to the events of
   experiment execution and produce additional outputs. For example, a plugin
   can be added to an experiment to generate additional metrics or to save
   additional data to a database. More details will be added in the future.
@@ -657,7 +657,30 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):

 @pg.use_init_args(['children'])
 class Suite(Experiment):
-  """A suite of evaluations.
+  """A suite of evaluations.
+
+  `lf.eval.Suite` groups multiple `lf.eval.Evaluation` or other `Suite`
+  objects into a single experiment, allowing them to be run, managed, and
+  reported together.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+
+  suite = lf.eval.Suite([
+      MyEval(lm=lf.llms.Gpt4()),
+      MyEval(lm=lf.llms.Gemini()),
+      lf.eval.Suite([
+          AnotherEval(lm=lf.llms.Gpt4()),
+          AnotherEval(lm=lf.llms.Gemini())
+      ])
+  ])
+
+  # Run all evaluations in the suite
+  run_info = suite.run('/path/to/my/suite_run')
+  ```
+  """

   children: Annotated[
       list[Experiment], 'A list of child experiments.'
@@ -791,7 +814,14 @@ class RunId(pg.Object):


 class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
-  """
+  """Represents a single run of an experiment.
+
+  A `Run` object holds all the configurations for executing an experiment,
+  such as the experiment definition, input/output directories, and flags
+  controlling the execution behavior (e.g., error handling, checkpointing).
+  It also provides utility methods for accessing run-specific paths and
+  filtering examples for evaluation.
+  """

   root_dir: Annotated[
       str,
@@ -971,7 +1001,13 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):


 class Runner(pg.Object):
-  """Interface for experiment runner.
+  """Interface for experiment runner.
+
+  A runner is responsible for executing the evaluations within an experiment
+  based on the configuration specified in a `Run` object. Different runners
+  can implement different execution strategies, such as sequential or parallel
+  processing of examples and evaluations.
+  """

   # Class-level variable for registering the runner.
   NAME = None
@@ -1010,7 +1046,14 @@ class Runner(pg.Object):


 class Plugin(lf.Component):
-  """Base class for experiment plugins.
+  """Base class for experiment plugins.
+
+  Plugins provide a mechanism to extend the behavior of an experiment run
+  by hooking into various events during the lifecycle of experiment and
+  example execution, such as `on_run_start`, `on_experiment_complete`,
+  `on_example_start`, etc. They can be used for custom logging, monitoring,
+  or result processing.
+  """

   def on_run_start(
       self,