langfun 0.1.2.dev202510230805__py3-none-any.whl → 0.1.2.dev202511160804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (146)
  1. langfun/core/__init__.py +1 -0
  2. langfun/core/agentic/action.py +107 -12
  3. langfun/core/agentic/action_eval.py +9 -2
  4. langfun/core/agentic/action_test.py +25 -0
  5. langfun/core/async_support.py +32 -3
  6. langfun/core/coding/python/correction.py +19 -9
  7. langfun/core/coding/python/execution.py +14 -12
  8. langfun/core/coding/python/generation.py +21 -16
  9. langfun/core/coding/python/sandboxing.py +23 -3
  10. langfun/core/component.py +42 -3
  11. langfun/core/concurrent.py +70 -6
  12. langfun/core/concurrent_test.py +1 -0
  13. langfun/core/console.py +1 -1
  14. langfun/core/data/conversion/anthropic.py +12 -3
  15. langfun/core/data/conversion/anthropic_test.py +8 -6
  16. langfun/core/data/conversion/gemini.py +9 -2
  17. langfun/core/data/conversion/gemini_test.py +12 -9
  18. langfun/core/data/conversion/openai.py +145 -31
  19. langfun/core/data/conversion/openai_test.py +161 -17
  20. langfun/core/eval/base.py +47 -43
  21. langfun/core/eval/base_test.py +4 -4
  22. langfun/core/eval/matching.py +5 -2
  23. langfun/core/eval/patching.py +3 -3
  24. langfun/core/eval/scoring.py +4 -3
  25. langfun/core/eval/v2/__init__.py +1 -0
  26. langfun/core/eval/v2/checkpointing.py +39 -5
  27. langfun/core/eval/v2/checkpointing_test.py +1 -1
  28. langfun/core/eval/v2/eval_test_helper.py +96 -0
  29. langfun/core/eval/v2/evaluation.py +87 -15
  30. langfun/core/eval/v2/evaluation_test.py +9 -3
  31. langfun/core/eval/v2/example.py +45 -39
  32. langfun/core/eval/v2/example_test.py +3 -3
  33. langfun/core/eval/v2/experiment.py +51 -8
  34. langfun/core/eval/v2/metric_values.py +31 -3
  35. langfun/core/eval/v2/metric_values_test.py +32 -0
  36. langfun/core/eval/v2/metrics.py +157 -44
  37. langfun/core/eval/v2/metrics_test.py +39 -18
  38. langfun/core/eval/v2/progress.py +30 -1
  39. langfun/core/eval/v2/progress_test.py +27 -0
  40. langfun/core/eval/v2/progress_tracking_test.py +3 -0
  41. langfun/core/eval/v2/reporting.py +90 -71
  42. langfun/core/eval/v2/reporting_test.py +20 -6
  43. langfun/core/eval/v2/runners/__init__.py +26 -0
  44. langfun/core/eval/v2/{runners.py → runners/base.py} +22 -124
  45. langfun/core/eval/v2/runners/debug.py +40 -0
  46. langfun/core/eval/v2/runners/debug_test.py +79 -0
  47. langfun/core/eval/v2/runners/parallel.py +100 -0
  48. langfun/core/eval/v2/runners/parallel_test.py +98 -0
  49. langfun/core/eval/v2/runners/sequential.py +47 -0
  50. langfun/core/eval/v2/runners/sequential_test.py +175 -0
  51. langfun/core/langfunc.py +45 -130
  52. langfun/core/langfunc_test.py +6 -4
  53. langfun/core/language_model.py +103 -16
  54. langfun/core/language_model_test.py +9 -3
  55. langfun/core/llms/__init__.py +7 -1
  56. langfun/core/llms/anthropic.py +157 -2
  57. langfun/core/llms/azure_openai.py +29 -17
  58. langfun/core/llms/cache/base.py +25 -3
  59. langfun/core/llms/cache/in_memory.py +48 -7
  60. langfun/core/llms/cache/in_memory_test.py +14 -4
  61. langfun/core/llms/compositional.py +25 -1
  62. langfun/core/llms/deepseek.py +30 -2
  63. langfun/core/llms/fake.py +32 -1
  64. langfun/core/llms/gemini.py +14 -9
  65. langfun/core/llms/google_genai.py +29 -1
  66. langfun/core/llms/groq.py +28 -3
  67. langfun/core/llms/llama_cpp.py +23 -4
  68. langfun/core/llms/openai.py +36 -3
  69. langfun/core/llms/openai_compatible.py +148 -27
  70. langfun/core/llms/openai_compatible_test.py +207 -20
  71. langfun/core/llms/openai_test.py +0 -2
  72. langfun/core/llms/rest.py +12 -1
  73. langfun/core/llms/vertexai.py +51 -8
  74. langfun/core/logging.py +1 -1
  75. langfun/core/mcp/client.py +77 -22
  76. langfun/core/mcp/client_test.py +8 -35
  77. langfun/core/mcp/session.py +94 -29
  78. langfun/core/mcp/session_test.py +54 -0
  79. langfun/core/mcp/tool.py +151 -22
  80. langfun/core/mcp/tool_test.py +197 -0
  81. langfun/core/memory.py +1 -0
  82. langfun/core/message.py +160 -55
  83. langfun/core/message_test.py +65 -81
  84. langfun/core/modalities/__init__.py +8 -0
  85. langfun/core/modalities/audio.py +21 -1
  86. langfun/core/modalities/image.py +19 -1
  87. langfun/core/modalities/mime.py +62 -3
  88. langfun/core/modalities/pdf.py +19 -1
  89. langfun/core/modalities/video.py +21 -1
  90. langfun/core/modality.py +167 -29
  91. langfun/core/modality_test.py +42 -12
  92. langfun/core/natural_language.py +1 -1
  93. langfun/core/sampling.py +4 -4
  94. langfun/core/sampling_test.py +20 -4
  95. langfun/core/structured/__init__.py +2 -24
  96. langfun/core/structured/completion.py +34 -44
  97. langfun/core/structured/completion_test.py +23 -43
  98. langfun/core/structured/description.py +54 -50
  99. langfun/core/structured/function_generation.py +29 -12
  100. langfun/core/structured/mapping.py +81 -37
  101. langfun/core/structured/parsing.py +95 -79
  102. langfun/core/structured/parsing_test.py +0 -3
  103. langfun/core/structured/querying.py +215 -142
  104. langfun/core/structured/querying_test.py +65 -29
  105. langfun/core/structured/schema/__init__.py +48 -0
  106. langfun/core/structured/schema/base.py +664 -0
  107. langfun/core/structured/schema/base_test.py +531 -0
  108. langfun/core/structured/schema/json.py +174 -0
  109. langfun/core/structured/schema/json_test.py +121 -0
  110. langfun/core/structured/schema/python.py +316 -0
  111. langfun/core/structured/schema/python_test.py +410 -0
  112. langfun/core/structured/schema_generation.py +33 -14
  113. langfun/core/structured/scoring.py +47 -36
  114. langfun/core/structured/tokenization.py +26 -11
  115. langfun/core/subscription.py +2 -2
  116. langfun/core/template.py +174 -49
  117. langfun/core/template_test.py +123 -17
  118. langfun/env/__init__.py +8 -2
  119. langfun/env/base_environment.py +320 -128
  120. langfun/env/base_environment_test.py +473 -0
  121. langfun/env/base_feature.py +92 -15
  122. langfun/env/base_feature_test.py +228 -0
  123. langfun/env/base_sandbox.py +84 -361
  124. langfun/env/base_sandbox_test.py +1235 -0
  125. langfun/env/event_handlers/__init__.py +1 -1
  126. langfun/env/event_handlers/chain.py +233 -0
  127. langfun/env/event_handlers/chain_test.py +253 -0
  128. langfun/env/event_handlers/event_logger.py +95 -98
  129. langfun/env/event_handlers/event_logger_test.py +21 -21
  130. langfun/env/event_handlers/metric_writer.py +225 -140
  131. langfun/env/event_handlers/metric_writer_test.py +23 -6
  132. langfun/env/interface.py +854 -40
  133. langfun/env/interface_test.py +112 -2
  134. langfun/env/load_balancers_test.py +23 -2
  135. langfun/env/test_utils.py +126 -84
  136. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/METADATA +1 -1
  137. langfun-0.1.2.dev202511160804.dist-info/RECORD +211 -0
  138. langfun/core/eval/v2/runners_test.py +0 -343
  139. langfun/core/structured/schema.py +0 -987
  140. langfun/core/structured/schema_test.py +0 -982
  141. langfun/env/base_test.py +0 -1481
  142. langfun/env/event_handlers/base.py +0 -350
  143. langfun-0.1.2.dev202510230805.dist-info/RECORD +0 -195
  144. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/WHEEL +0 -0
  145. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/licenses/LICENSE +0 -0
  146. {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511160804.dist-info}/top_level.txt +0 -0
@@ -38,6 +38,7 @@ from langfun.core.eval.v2 import runners
 from langfun.core.eval.v2.checkpointing import BulkCheckpointer
 from langfun.core.eval.v2.checkpointing import PerExampleCheckpointer
 from langfun.core.eval.v2.reporting import HtmlReporter
+from langfun.core.eval.v2.reporting import ExampleHtmlGenerator


 # pylint: enable=g-bad-import-order
@@ -29,13 +29,28 @@ Runner = experiment_lib.Runner


 class Checkpointer(experiment_lib.Plugin):
-  """Base class for checkpointing evaluation examples."""
+  """Base class for checkpointing evaluation examples.
+
+  `Checkpointer` is a plugin that saves the state of processed examples
+  incrementally during an experiment run, allowing the experiment to be resumed
+  later. When an experiment starts, the checkpointer loads any previously saved
+  examples from an earlier run (or a warm-start run) into `experiment.state`,
+  so the runner can skip processing them again.
+  Subclasses should implement `_list_checkpoint_filenames` to identify
+  checkpoint files to load, and `_save_example` to save a newly processed
+  example.
+  """

   checkpoint_filename: Annotated[
       str,
       'Checkpoint file pattern.'
   ] = 'checkpoint.bagz'

+  max_ckpt_loading_threads: Annotated[
+      int,
+      'Max number of workers for loading checkpoint files at startup.'
+  ] = 128
+
   def on_experiment_start(
       self,
       runner: Runner,
@@ -149,7 +164,10 @@ class Checkpointer(experiment_lib.Plugin):

     _ = list(
         lf.concurrent_map(
-            _load_state, ckpt_files, max_workers=16, silence_on_errors=None
+            _load_state,
+            ckpt_files,
+            max_workers=self.max_ckpt_loading_threads,
+            silence_on_errors=None
         )
     )

@@ -170,7 +188,12 @@


 class PerExampleCheckpointer(Checkpointer):
-  """Checkpointer that saves each example to a separate file."""
+  """Checkpointer that saves each example to a separate file.
+
+  This checkpointer saves each processed example to its own checkpoint file,
+  named using the pattern `<checkpoint_filename_prefix>_<example_id>.<ext>`.
+  For example, `checkpoint_1.bagz`, `checkpoint_2.bagz`, etc.
+  """

   def _on_bound(self):
     super()._on_bound()
@@ -235,7 +258,13 @@


 class BulkCheckpointer(Checkpointer):
-  """Checkpointer that saves all examples to a single file."""
+  """Checkpointer that saves all examples of an evaluation to a single file.
+
+  This checkpointer appends newly processed examples of an evaluation to a
+  single sequence file (e.g., `checkpoint.bagz`). This is often more efficient
+  than `PerExampleCheckpointer` when dealing with a large number of examples
+  or when file system overhead is a concern.
+  """

   def _on_bound(self):
     super()._on_bound()
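
The checkpointer hunks above introduce `max_ckpt_loading_threads` and describe the per-example versus bulk file layouts. A rough configuration sketch follows; the field names are taken from the docstrings above, but how a checkpointer plugin is attached to a run is not shown in this diff and is an assumption.

```python
# Sketch only: field names come from the docstrings above; the plugin wiring
# into a run is an assumption and may differ in the actual API.
from langfun.core.eval.v2 import checkpointing

# One checkpoint file per processed example: checkpoint_1.bagz, checkpoint_2.bagz, ...
per_example = checkpointing.PerExampleCheckpointer(
    checkpoint_filename='checkpoint.bagz',
)

# All examples of an evaluation appended to a single sequence file, with more
# loader threads when resuming from a run that produced many checkpoint files.
bulk = checkpointing.BulkCheckpointer(
    checkpoint_filename='checkpoint.bagz',
    max_ckpt_loading_threads=256,
)
```
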
@@ -341,7 +370,12 @@


 class SequenceWriter:
-  """Thread safe sequence writer."""
+  """A thread-safe writer for sequence files (e.g., Bagz).
+
+  `SequenceWriter` wraps a `pg.io.SequenceWriter` to provide thread-safe
+  `add` and `close` operations, ensuring that examples can be written
+  concurrently from multiple threads without corrupting the sequence file.
+  """

   def __init__(self, path: str):
     self._lock = threading.Lock()
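
The `SequenceWriter` docstring above describes a lock-guarded wrapper around `pg.io.SequenceWriter`. The locking pattern it names is sketched below with a generic wrapped writer; this is an illustration of the scheme, not langfun's implementation.

```python
import threading


class LockedSequenceWriter:
  """Illustrative lock-guarded writer; not langfun's actual SequenceWriter."""

  def __init__(self, writer):
    # `writer` is any object exposing `add(value)` and `close()`, e.g. the
    # `pg.io.SequenceWriter` mentioned in the docstring above.
    self._lock = threading.Lock()
    self._writer = writer

  def add(self, value) -> None:
    # Serialize writes so concurrent threads never interleave a record.
    with self._lock:
      self._writer.add(value)

  def close(self) -> None:
    # Close exactly once, even if called from multiple threads.
    with self._lock:
      if self._writer is not None:
        self._writer.close()
        self._writer = None
```
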
@@ -65,7 +65,7 @@ class ExampleCollector(experiment_lib.Plugin):
     return self._examples

   def on_example_complete(
-      self, runner: runners_lib.Runner,
+      self, runner: experiment_lib.Runner,
       experiment: experiment_lib.Experiment,
       example: example_lib.Example,
   ):
@@ -13,6 +13,9 @@
 # limitations under the License.
 """Helper classes and functions for evaluation tests."""

+import threading
+import time
+
 from langfun.core import language_model
 from langfun.core import llms
 from langfun.core import message as message_lib
@@ -47,6 +50,8 @@ class TestLLM(llms.Fake):

   offset: int = 0

+  __test__ = False
+
   def _response_from(self, prompt: message_lib.Message) -> message_lib.Message:
     return message_lib.AIMessage(
         str(prompt.metadata.x + prompt.metadata.y + self.offset)
@@ -63,6 +68,8 @@ class TestEvaluation(Evaluation):
   metrics = [metrics_lib.Match()]
   lm: language_model.LanguageModel = TestLLM()

+  __test__ = False
+
   def process(self, example):
     v = example.input
     if v.x == 5:
@@ -84,6 +91,8 @@ class TestEvaluationWithExampleCheckpointingError(TestEvaluation):
   inputs = test_inputs()
   metrics = [metrics_lib.Match()]

+  __test__ = False
+
   def process(self, example):
     return 1, dict(
         x=BadJsonConvertible()
@@ -101,6 +110,8 @@ class TestEvaluationWithExampleHtmlGenerationError(Evaluation):
   inputs = test_inputs()
   metrics = [metrics_lib.Match()]

+  __test__ = False
+
   def process(self, example):
     return 1, dict(
         x=BadHtmlConvertible()
@@ -110,6 +121,8 @@ class TestEvaluationWithIndexHtmlGenerationError(TestEvaluation):
 class TestEvaluationWithIndexHtmlGenerationError(TestEvaluation):
   """Test evaluation class with bad index HTML generation."""

+  __test__ = False
+
   def _html_tree_view(self, *args, **kwargs):
     raise ValueError('Cannot render HTML.')

@@ -135,3 +148,86 @@ def test_experiment_with_example_html_generation_error():
 def test_experiment_with_index_html_generation_error():
   """Returns a test experiment with bad index HTML."""
   return TestEvaluationWithIndexHtmlGenerationError()
+
+
+class TestPlugin(experiment_lib.Plugin):
+  """Plugin for testing."""
+
+  started_experiments: list[experiment_lib.Experiment] = []
+  completed_experiments: list[experiment_lib.Experiment] = []
+  skipped_experiments: list[experiment_lib.Experiment] = []
+  started_example_ids: list[int] = []
+  completed_example_ids: list[int] = []
+  start_time: float | None = None
+  complete_time: float | None = None
+
+  __test__ = False
+
+  def _on_bound(self):
+    super()._on_bound()
+    self._lock = threading.Lock()
+
+  def on_run_start(
+      self,
+      runner: experiment_lib.Runner,
+      root: experiment_lib.Experiment
+  ) -> None:
+    del root
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      self.start_time = time.time()
+
+  def on_run_complete(
+      self,
+      runner: experiment_lib.Runner,
+      root: experiment_lib.Experiment
+  ) -> None:
+    del root
+    with pg.notify_on_change(False), pg.allow_writable_accessors(True):
+      self.complete_time = time.time()
+
+  def on_experiment_start(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment
+  ) -> None:
+    del runner
+    with pg.notify_on_change(False), self._lock:
+      self.started_experiments.append(pg.Ref(experiment))
+
+  def on_experiment_skipped(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment
+  ) -> None:
+    del runner
+    with pg.notify_on_change(False), self._lock:
+      self.skipped_experiments.append(pg.Ref(experiment))
+
+  def on_experiment_complete(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment
+  ) -> None:
+    del runner
+    with pg.notify_on_change(False), self._lock:
+      self.completed_experiments.append(pg.Ref(experiment))
+
+  def on_example_start(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment,
+      example: Example
+  ) -> None:
+    del runner, experiment
+    with pg.notify_on_change(False), self._lock:
+      self.started_example_ids.append(example.id)
+
+  def on_example_complete(
+      self,
+      runner: experiment_lib.Runner,
+      experiment: experiment_lib.Experiment,
+      example: Example
+  ) -> None:
+    del runner, experiment
+    with pg.notify_on_change(False), self._lock:
+      self.completed_example_ids.append(example.id)
@@ -32,17 +32,63 @@ import pyglove as pg


 class Evaluation(experiment_lib.Experiment):
-  """Evaluation.
-
-  An evaluation can be a leaf node or a container of other evaluations,
-  depending on whether the current evaluation object is configured with
-  any `pg.oneof`.
-
-  For example, `MyEval(lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini1_5Pro()]))`
-  is a container of two sub-experiments, one for each LLM. In such case, the
-  evaluation object with `pg.oneof` is called a hyper evaluation, which
-  represents a search space of evaluations, and each sub-evaluation is called
-  a leaf evaluation, which will perform the actual evaluation.
+  """Base class for Langfun evaluations.
+
+  `lf.eval.Evaluation` is the base class for defining evaluation tasks in
+  Langfun. Users typically subclass it to implement custom evaluation logic by
+  overriding `inputs` and `process` methods.
+
+  An `Evaluation` object encapsulates:
+
+  * **`inputs`**: A callable that returns an iterable of input examples to be
+    processed. This is usually provided by implementing an `inputs(self)`
+    method in the subclass, which yields input items for evaluation one by
+    one.
+  * **`process(self, example)`**: An abstract method that processes one
+    example and returns the output, or a tuple of (output, metadata).
+    The output will be used for computing metrics.
+  * **`metrics`**: A list of metrics (e.g., `lf.metrics.Accuracy`) to compute
+    based on the outputs from `process`. Some metrics may require users to
+    implement a `ground_truth(self, example)` method in the subclass to
+    compute metrics against ground truth.
+  * **Hyperparameters**: Any other attributes of the class serve as
+    hyperparameters for the evaluation (e.g., the language model to use).
+
+  **Running Evaluations:**
+
+  Evaluations are executed via `lf.eval.Suite` or by calling the `.run()`
+  method on an `Evaluation` instance, which returns a `Run` object
+  containing the evaluation run information and results. If an evaluation
+  contains sweeable parameters (using `pg.oneof`), `.run()` will expand it
+  into multiple evaluation sub-tasks -- one for each combination of
+  hyperparameters -- all managed within the same `Run`.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+  import pyglove as pg
+
+  class MyEval(lf.eval.Evaluation):
+    lm: lf.LanguageModel
+    prompt: str = '1 + 1 = '
+
+    def inputs(self):
+      yield 2
+
+    def process(self, example: lf.eval.Example):
+      return int(lf.query(self.prompt, lm=self.lm))
+
+    def ground_truth(self, example: lf.eval.Example) -> int:
+      return example.input
+
+  # Run evaluation using two different LMs
+  evaluation = MyEval(
+      lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini()]),
+      metrics=[lf.metrics.Accuracy()]
+  )
+  run_info = evaluation.run()
+  ```
   """

   inputs: Annotated[
@@ -126,6 +172,20 @@ class Evaluation(experiment_lib.Experiment):
   # Evaluation logics.
   #

+  def setup(self) -> None:
+    """Sets up resources required by the evaluation.
+
+    Subclasses should always call the super().setup() method to ensure the
+    proper initialization of the evaluation.
+    """
+
+  def teardown(self) -> None:
+    """Tears down resources used by the evaluation.
+
+    Subclasses should always call the super().teardown() method to ensure the
+    proper cleanup of the evaluation.
+    """
+
   @abc.abstractmethod
   def process(
       self,
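
The new `setup`/`teardown` hooks above require subclasses to chain to the base class. A brief illustrative subclass follows; the scratch-directory resource is made up for the example, and `MyEval` refers to the subclass shown in the `Evaluation` docstring earlier in this diff.

```python
import shutil
import tempfile


class TempDirEval(MyEval):
  """Illustrative only: holds a scratch directory for the whole evaluation."""

  def setup(self) -> None:
    super().setup()  # chain to the base class first, as documented above
    self._workdir = tempfile.mkdtemp()

  def teardown(self) -> None:
    shutil.rmtree(self._workdir, ignore_errors=True)  # release what setup() acquired
    super().teardown()  # then chain to the base class
```
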
@@ -137,7 +197,7 @@ class Evaluation(experiment_lib.Experiment):

     Args:
       example: An example object to process. `example.input` is an object
-        returned from `Evaluable.inputs`.
+        yielded from `inputs()` method.

     Returns:
       A processed output. Or a tuple of (output, metadata).
@@ -150,6 +210,7 @@
       example: example_lib.Example | int,
       raise_if_has_error: bool = False,
       reevaluate_upon_previous_errors: bool = True,
+      force_recompute_metrics: bool = False
   ) -> example_lib.Example:
     """Evaluates a single example input.

@@ -158,6 +219,8 @@
       raise_if_has_error: Whether to raise an error if the example has error.
       reevaluate_upon_previous_errors: Whether to reevaluate the example if
         the previous checkpointed run has error.
+      force_recompute_metrics: If True, force recompute the metrics even if
+        metric metadata is already present from previous checkpoint.

     Returns:
       The evaluated example with the output and metric metadata populated.
@@ -206,6 +269,7 @@
       # Use the output and metadata obtained from the previous processing.
       example.output = checkpointed.output
       example.metadata = checkpointed.metadata
+      example.metric_metadata = checkpointed.metric_metadata
       example.error = checkpointed.error
       example.newly_processed = False
       example.execution_status = checkpointed.execution_status
@@ -225,8 +289,16 @@
     self.info(f'Starting metric computation for example {example.id}.')
     metric_metadata = {}
     for metric in self.metrics:
-      metric_metadata.update(metric.audit(example))
-    example.metric_metadata = metric_metadata
+      metric_metadata[metric.name] = metric.update(
+          example, force_recompute=force_recompute_metrics
+      )
+
+    if example.metric_metadata is None:
+      example.metric_metadata = metric_metadata
+    else:
+      # Accumulate the metric metadata as there might be existing metadata
+      # from previous metric computation runs.
+      example.metric_metadata.update(metric_metadata)
     self.info(f'Completed metric computation for example {example.id}.')

     # For previously processed examples, we keep the execution status for the
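
With this change, `example.metric_metadata` is keyed by metric name and nests each metric's own metadata, which is what the updated tests further down assert. For the built-in `Match` metric the shapes look roughly as follows (values taken from the test hunks below):

```python
# Previous flat form produced by `Metric.audit`:
old_style = dict(match=True)  # or dict(mismatch=True), dict(error='ValueError')

# New form: one nested dict per metric name, merged across metric runs.
matched = dict(match=dict(is_correct=True))
mismatched = dict(match=dict(is_correct=False))
errored = dict(match=dict(error='ValueError'))  # process() raised for this example
```
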
@@ -760,7 +832,7 @@


 class EvaluationState:
-  """Evaluation state."""
+  """In-memory state of an evaluation."""

   class ExampleStatus(pg.Object):
     """Example state."""
@@ -88,7 +88,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(example.output, 6)
     self.assertIsNone(example.error)
     self.assertEqual(example.metadata, {})
-    self.assertEqual(example.metric_metadata, dict(match=True))
+    self.assertEqual(example.metric_metadata, dict(match=dict(is_correct=True)))
     self.assertIsNotNone(example.usage_summary)
     self.assertGreater(example.usage_summary.total.total_tokens, 0)
     self.assertEqual(example.usage_summary.total.num_requests, 1)
@@ -103,7 +103,10 @@
     self.assertEqual(example.output, 7)
     self.assertIsNone(example.error)
     self.assertEqual(example.metadata, {})
-    self.assertEqual(example.metric_metadata, dict(mismatch=True))
+    self.assertEqual(
+        example.metric_metadata,
+        dict(match=dict(is_correct=False))
+    )

     with self.assertRaisesRegex(ValueError, 'x should not be 5'):
       _ = exp.evaluate(6, raise_if_has_error=True)
@@ -113,7 +116,10 @@
     self.assertEqual(pg.MISSING_VALUE, example.output)
     self.assertEqual(example.error.tag, 'ValueError')
     self.assertEqual(example.metadata, {})
-    self.assertEqual(example.metric_metadata, dict(error='ValueError'))
+    self.assertEqual(
+        example.metric_metadata,
+        dict(match=dict(error='ValueError'))
+    )

   def test_evaluate_withstate(self):
     eval_dir = os.path.join(tempfile.mkdtemp(), 'test_eval')
@@ -22,19 +22,30 @@ import pyglove as pg

 @dataclasses.dataclass
 class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
-  """An item for the evaluation.
+  """An example for evaluation.
+
+  An evaluation example contains the input and output of an evaluation task,
+  as well as metadata about the evaluation process, such as execution time,
+  LLM usage, and metric results.

   Attributes:
-    id: The 1-based ID of the item in the evaluation set.
-    input: An element returned from the `Evaluable.inputs` functor.
-    output: The output of the `process` method. If `pg.MISSING_VALUE`, it has
-      not been processed yet.
-    metadata: The metadata of the item produced by the `process` method.
-    metric_metadata: The dictionary returned from `Metric.audit`.
-    start_time: The start time of the evaluation item.
-    end_time: The end time of the evaluation item.
-    usage_summary: The summary of LLM usages of the evaluation item.
-    execution_status: The timeit status of the evaluation item.
+    id: The 1-based ID of the example in the evaluation set.
+    input: An element returned from the `Evaluable.inputs` functor, which serves
+      as the input for `lf.Evaluable.process`.
+    output: The output of `lf.Evaluable.process` method. If `pg.MISSING_VALUE`,
+      it indicates the example has not been processed yet.
+    error: The error raised from `lf.Evaluable.process`. If None, it
+      indicates the process was successful.
+    metadata: The metadata of the example produced by `lf.Evaluable.process`.
+    metric_metadata: The dictionary returned from `Metric.audit`, which contains
+      metadata about metric computation for this example.
+    newly_processed: Whether this example is processed in the current run. If
+      False, it indicates the example was loaded from a checkpoint from previous
+      runs.
+    start_time: The start time of processing this example.
+    end_time: The end time of processing this example.
+    usage_summary: The summary of LLM usages for processing this example.
+    execution_status: The timeit status of processing this example.
   """
   id: int
   input: Any = pg.MISSING_VALUE
@@ -49,14 +60,6 @@ class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
   usage_summary: lf.UsageSummary | None = None
   execution_status: dict[str, pg.utils.TimeIt.Status] | None = None

-  def __post_init__(self):
-    if self.execution_status is not None:
-      for status in self.execution_status.values():
-        if status.has_error:
-          assert isinstance(status.error, pg.ErrorInfo)
-          self.error = status.error
-          break
-
   @property
   def is_processed(self) -> bool:
     """Returns whether the item has been processed."""
@@ -182,15 +185,23 @@
     extra_flags = extra_flags or {}
     num_examples = extra_flags.get('num_examples', None)

-    def _metric_metadata_badge(key, value):
-      if isinstance(value, bool) and bool:
-        text = key
-      else:
-        text = f'{key}:{value}'
-      return pg.views.html.controls.Badge(
-          text,
-          css_classes=[pg.utils.camel_to_snake(key, '-')],
-      )
+    def _metric_label_group(metric_metadata: dict[str, Any] | None):
+      """Renders a label group for metric metadata."""
+      badges = []
+      if metric_metadata:
+        for metric_name, metadata in metric_metadata.items():
+          assert isinstance(metadata, dict), (metric_name, metadata)
+          for k, v in metadata.items():
+            css_class = k
+            if isinstance(v, bool):
+              css_class += '_true' if v else '_false'
+            badge = pg.views.html.controls.Badge(
+                f'{k}:{v}',
+                tooltip=f'{metric_name}: {k}',
+                css_classes=[css_class],
+            )
+            badges.append(badge)
+      return pg.views.html.controls.LabelGroup(badges)

     def _render_header():
       return pg.Html.element(
@@ -229,12 +240,7 @@
               extra_flags=dict(as_badge=True)
           ) if self.usage_summary is not None else None,
           # Metric metadata.
-          pg.views.html.controls.LabelGroup(
-              [  # pylint: disable=g-long-ternary
-                  _metric_metadata_badge(k, v)
-                  for k, v in self.metric_metadata.items()
-              ] if self.metric_metadata else []
-          ),
+          _metric_label_group(self.metric_metadata)
       ],
       css_classes=['example-container'],
   )
@@ -305,18 +311,18 @@
          color: black;
        }
        /* Badge styles. */
-       .eval-example .badge.match {
+       .eval-example .badge.is_correct_true {
          color: green;
          background-color: #dcefbe;
        }
+       .eval-example .badge.is_correct_false {
+         color: orange;
+         background-color: #ffefc4;
+       }
        .eval-example .badge.error {
          color: red;
          background-color: #fdcccc;
        }
-       .eval-example .badge.mismatch {
-         color: orange;
-         background-color: #ffefc4;
-       }
        .eval-example .badge.score {
          color: blue;
          background-color: #c4dced;
@@ -32,9 +32,9 @@ class ExampleTest(unittest.TestCase):
            name='evaluation', elapse=1.0, error=error
        )
    })
-    self.assertEqual(ex.error, error)
+    self.assertIsNone(ex.error)
     self.assertFalse(ex.is_processed)
-    self.assertTrue(ex.has_error)
+    self.assertFalse(ex.has_error)
     self.assertEqual(ex.elapse, 1.0)

     ex = Example(id=2, output=1)
@@ -116,7 +116,7 @@ class ExampleTest(unittest.TestCase):
        input=pg.Dict(a=1, b=2),
        output=3,
        metadata=dict(sum=3),
-       metric_metadata=dict(match=True),
+       metric_metadata=dict(match=dict(match=True)),
    )
    self.assertNotIn(
        'next',
@@ -139,10 +139,10 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):

  # Checkpointing

- Experiments support checkpointing, which is enabled by default. It allows
+ Experiments support checkpointing, which is enabled by default. It allows
  users to resume their experiments from a saved state. When an experiment runs,
- it creates a new directory for that run and saves the current state to a
- checkpoint file. If the experiment is interrupted or fails, users can resume
+ it creates a new directory for that run and saves its progress to checkpoint
+ files. If the experiment is interrupted or fails, users can resume
  it by specifying the 'id' or 'warm_start_from' argument (shown above) to
  seamlessly continue from previously saved state without starting over.

@@ -169,7 +169,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):

  # Experiment Plugins

- Experiment can be extended by plugins. Plugins can listen to the events of
+ Experiments can be extended by plugins. Plugins can listen to the events of
  experiment execution and produce additional outputs. For example, a plugin
  can be added to an experiment to generate additional metrics or to save
  additional data to a database. More details will be added in the future.
@@ -657,7 +657,30 @@

 @pg.use_init_args(['children'])
 class Suite(Experiment):
-  """A suite of evaluations."""
+  """A suite of evaluations.
+
+  `lf.eval.Suite` groups multiple `lf.eval.Evaluation` or other `Suite`
+  objects into a single experiment, allowing them to be run, managed, and
+  reported together.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+
+  suite = lf.eval.Suite([
+      MyEval(lm=lf.llms.Gpt4()),
+      MyEval(lm=lf.llms.Gemini()),
+      lf.eval.Suite([
+          AnotherEval(lm=lf.llms.Gpt4()),
+          AnotherEval(lm=lf.llms.Gemini())
+      ])
+  ])
+
+  # Run all evaluations in the suite
+  run_info = suite.run('/path/to/my/suite_run')
+  ```
+  """

   children: Annotated[
       list[Experiment], 'A list of child experiments.'
@@ -791,7 +814,14 @@ class RunId(pg.Object):


 class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
-  """A run of an experiment."""
+  """Represents a single run of an experiment.
+
+  A `Run` object holds all the configurations for executing an experiment,
+  such as the experiment definition, input/output directories, and flags
+  controlling the execution behavior (e.g., error handling, checkpointing).
+  It also provides utility methods for accessing run-specific paths and
+  filtering examples for evaluation.
+  """

   root_dir: Annotated[
       str,
@@ -971,7 +1001,13 @@


 class Runner(pg.Object):
-  """Interface for experiment runner."""
+  """Interface for experiment runner.
+
+  A runner is responsible for executing the evaluations within an experiment
+  based on the configuration specified in a `Run` object. Different runners
+  can implement different execution strategies, such as sequential or parallel
+  processing of examples and evaluations.
+  """

   # Class-level variable for registering the runner.
   NAME = None
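
Per the file list above, this release splits the runners into a `runners/` package with `debug`, `sequential`, and `parallel` implementations, each registered under its class-level `NAME`. Presumably a run selects its runner by that name; the `runner=` argument below is an assumption, not confirmed by this diff.

```python
# Assumption: runners are selected by their registered NAME; the exact argument
# name on run() may differ in the actual API.
run_info = evaluation.run('/path/to/run_dir', runner='sequential')
```
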
@@ -1010,7 +1046,14 @@


 class Plugin(lf.Component):
-  """Base class for experiment plugins."""
+  """Base class for experiment plugins.
+
+  Plugins provide a mechanism to extend the behavior of an experiment run
+  by hooking into various events during the lifecycle of experiment and
+  example execution, such as `on_run_start`, `on_experiment_complete`,
+  `on_example_start`, etc. They can be used for custom logging, monitoring,
+  or result processing.
+  """

   def on_run_start(
       self,
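
The `Plugin` docstring above lists the lifecycle hooks, and `TestPlugin` earlier in this diff shows their full signatures. A minimal custom plugin along those lines might look like the following sketch (illustrative only, not part of the package):

```python
from langfun.core.eval.v2 import experiment as experiment_lib


class ExampleCounter(experiment_lib.Plugin):
  """Illustrative plugin that counts completed examples across a run."""

  def _on_bound(self):
    super()._on_bound()
    self._completed = 0

  def on_example_complete(self, runner, experiment, example) -> None:
    # Hook signature mirrors `TestPlugin.on_example_complete` shown earlier.
    del runner, experiment, example
    self._completed += 1
```
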