langfun 0.0.2.dev20240429__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. langfun/__init__.py +20 -2
  2. langfun/core/__init__.py +16 -5
  3. langfun/core/agentic/__init__.py +30 -0
  4. langfun/core/agentic/action.py +854 -0
  5. langfun/core/agentic/action_eval.py +150 -0
  6. langfun/core/agentic/action_eval_test.py +109 -0
  7. langfun/core/agentic/action_test.py +136 -0
  8. langfun/core/coding/python/__init__.py +5 -11
  9. langfun/core/coding/python/correction.py +37 -21
  10. langfun/core/coding/python/correction_test.py +29 -3
  11. langfun/core/coding/python/execution.py +40 -216
  12. langfun/core/coding/python/execution_test.py +29 -89
  13. langfun/core/coding/python/generation.py +21 -11
  14. langfun/core/coding/python/generation_test.py +2 -2
  15. langfun/core/coding/python/parsing.py +108 -193
  16. langfun/core/coding/python/parsing_test.py +2 -105
  17. langfun/core/component.py +63 -2
  18. langfun/core/component_test.py +53 -0
  19. langfun/core/concurrent.py +414 -117
  20. langfun/core/concurrent_test.py +111 -24
  21. langfun/core/console.py +18 -5
  22. langfun/core/console_test.py +17 -0
  23. langfun/core/eval/__init__.py +16 -1
  24. langfun/core/eval/base.py +622 -174
  25. langfun/core/eval/base_test.py +200 -54
  26. langfun/core/eval/matching.py +63 -76
  27. langfun/core/eval/matching_test.py +17 -8
  28. langfun/core/eval/patching.py +130 -0
  29. langfun/core/eval/patching_test.py +170 -0
  30. langfun/core/eval/scoring.py +26 -26
  31. langfun/core/eval/scoring_test.py +19 -2
  32. langfun/core/eval/v2/__init__.py +42 -0
  33. langfun/core/eval/v2/checkpointing.py +380 -0
  34. langfun/core/eval/v2/checkpointing_test.py +228 -0
  35. langfun/core/eval/v2/eval_test_helper.py +136 -0
  36. langfun/core/eval/v2/evaluation.py +725 -0
  37. langfun/core/eval/v2/evaluation_test.py +180 -0
  38. langfun/core/eval/v2/example.py +305 -0
  39. langfun/core/eval/v2/example_test.py +128 -0
  40. langfun/core/eval/v2/experiment.py +1048 -0
  41. langfun/core/eval/v2/experiment_test.py +433 -0
  42. langfun/core/eval/v2/metric_values.py +156 -0
  43. langfun/core/eval/v2/metric_values_test.py +80 -0
  44. langfun/core/eval/v2/metrics.py +357 -0
  45. langfun/core/eval/v2/metrics_test.py +203 -0
  46. langfun/core/eval/v2/progress.py +348 -0
  47. langfun/core/eval/v2/progress_test.py +82 -0
  48. langfun/core/eval/v2/progress_tracking.py +210 -0
  49. langfun/core/eval/v2/progress_tracking_test.py +66 -0
  50. langfun/core/eval/v2/reporting.py +270 -0
  51. langfun/core/eval/v2/reporting_test.py +158 -0
  52. langfun/core/eval/v2/runners.py +488 -0
  53. langfun/core/eval/v2/runners_test.py +334 -0
  54. langfun/core/langfunc.py +4 -17
  55. langfun/core/langfunc_test.py +22 -6
  56. langfun/core/language_model.py +577 -39
  57. langfun/core/language_model_test.py +470 -56
  58. langfun/core/llms/__init__.py +87 -16
  59. langfun/core/llms/anthropic.py +312 -87
  60. langfun/core/llms/anthropic_test.py +71 -3
  61. langfun/core/llms/cache/base.py +21 -2
  62. langfun/core/llms/cache/in_memory.py +13 -0
  63. langfun/core/llms/cache/in_memory_test.py +53 -2
  64. langfun/core/llms/compositional.py +101 -0
  65. langfun/core/llms/compositional_test.py +73 -0
  66. langfun/core/llms/deepseek.py +117 -0
  67. langfun/core/llms/deepseek_test.py +61 -0
  68. langfun/core/llms/fake.py +11 -7
  69. langfun/core/llms/fake_test.py +14 -0
  70. langfun/core/llms/gemini.py +507 -0
  71. langfun/core/llms/gemini_test.py +195 -0
  72. langfun/core/llms/google_genai.py +62 -218
  73. langfun/core/llms/google_genai_test.py +9 -202
  74. langfun/core/llms/groq.py +160 -144
  75. langfun/core/llms/groq_test.py +31 -137
  76. langfun/core/llms/llama_cpp.py +15 -42
  77. langfun/core/llms/llama_cpp_test.py +4 -30
  78. langfun/core/llms/openai.py +395 -203
  79. langfun/core/llms/openai_compatible.py +179 -0
  80. langfun/core/llms/openai_compatible_test.py +495 -0
  81. langfun/core/llms/openai_test.py +30 -395
  82. langfun/core/llms/rest.py +113 -0
  83. langfun/core/llms/rest_test.py +111 -0
  84. langfun/core/llms/vertexai.py +192 -0
  85. langfun/core/llms/vertexai_test.py +52 -0
  86. langfun/core/logging.py +284 -0
  87. langfun/core/logging_test.py +125 -0
  88. langfun/core/message.py +319 -9
  89. langfun/core/message_test.py +190 -13
  90. langfun/core/modalities/__init__.py +6 -2
  91. langfun/core/modalities/audio.py +30 -0
  92. langfun/core/modalities/audio_test.py +63 -0
  93. langfun/core/modalities/image.py +39 -20
  94. langfun/core/modalities/image_test.py +52 -9
  95. langfun/core/modalities/mime.py +206 -29
  96. langfun/core/modalities/mime_test.py +90 -9
  97. langfun/core/modalities/ms_office.py +117 -0
  98. langfun/core/modalities/ms_office_test.py +389 -0
  99. langfun/core/modalities/pdf.py +22 -0
  100. langfun/core/modalities/pdf_test.py +57 -0
  101. langfun/core/modalities/video.py +9 -26
  102. langfun/core/modalities/video_test.py +3 -3
  103. langfun/core/modality.py +26 -3
  104. langfun/core/modality_test.py +2 -2
  105. langfun/core/sampling.py +11 -11
  106. langfun/core/structured/__init__.py +12 -16
  107. langfun/core/structured/completion.py +32 -5
  108. langfun/core/structured/completion_test.py +7 -6
  109. langfun/core/structured/description.py +2 -2
  110. langfun/core/structured/description_test.py +3 -3
  111. langfun/core/structured/function_generation.py +60 -27
  112. langfun/core/structured/function_generation_test.py +72 -2
  113. langfun/core/structured/mapping.py +97 -47
  114. langfun/core/structured/mapping_test.py +90 -2
  115. langfun/core/structured/parsing.py +33 -21
  116. langfun/core/structured/parsing_test.py +53 -9
  117. langfun/core/structured/querying.py +746 -0
  118. langfun/core/structured/{prompting_test.py → querying_test.py} +469 -51
  119. langfun/core/structured/schema.py +204 -97
  120. langfun/core/structured/schema_generation.py +1 -1
  121. langfun/core/structured/schema_test.py +130 -29
  122. langfun/core/structured/scoring.py +125 -19
  123. langfun/core/structured/scoring_test.py +30 -0
  124. langfun/core/structured/tokenization.py +64 -0
  125. langfun/core/structured/tokenization_test.py +48 -0
  126. langfun/core/template.py +115 -1
  127. langfun/core/template_test.py +71 -1
  128. langfun/core/templates/conversation.py +9 -0
  129. langfun/core/templates/conversation_test.py +4 -3
  130. langfun/core/templates/selfplay_test.py +10 -2
  131. langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
  132. langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
  133. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
  134. langfun/core/coding/python/errors.py +0 -108
  135. langfun/core/coding/python/errors_test.py +0 -99
  136. langfun/core/coding/python/permissions.py +0 -90
  137. langfun/core/coding/python/permissions_test.py +0 -86
  138. langfun/core/structured/prompting.py +0 -238
  139. langfun/core/text_formatting.py +0 -162
  140. langfun/core/text_formatting_test.py +0 -47
  141. langfun-0.0.2.dev20240429.dist-info/METADATA +0 -100
  142. langfun-0.0.2.dev20240429.dist-info/RECORD +0 -108
  143. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
  144. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/evaluation.py (new file)
@@ -0,0 +1,725 @@
+ # Copyright 2024 The Langfun Authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Base class for Langfun evaluation tasks."""
+
+ import abc
+ import datetime
+ import functools
+ import threading
+ import time
+
+ from typing import Annotated, Any, Callable, Iterable
+ import langfun.core as lf
+ import langfun.core.coding as lf_coding
+
+ from langfun.core.eval.v2 import example as example_lib
+ from langfun.core.eval.v2 import experiment as experiment_lib
+ from langfun.core.eval.v2 import metric_values as metric_values_lib
+ from langfun.core.eval.v2 import metrics as metrics_lib
+
+ import pyglove as pg
+
+
+ class Evaluation(experiment_lib.Experiment):
+   """Evaluation.
+
+   An evaluation can be a leaf node or a container of other evaluations,
+   depending on whether the current evaluation object is configured with
+   any `pg.oneof`.
+
+   For example, `MyEval(lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini1_5Pro()]))`
+   is a container of two sub-experiments, one for each LLM. In such a case, the
+   evaluation object with `pg.oneof` is called a hyper evaluation, which
+   represents a search space of evaluations, and each sub-evaluation is called
+   a leaf evaluation, which will perform the actual evaluation.
+   """
+
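
To make the hyper/leaf distinction above concrete, here is a sketch; `MyEval` and `my_inputs` are illustrative placeholders, and a `Match` metric is assumed to exist in `metrics_lib`:

    # `lm` is a search space over two models, so this is a hyper evaluation.
    hyper = MyEval(
        inputs=my_inputs(),
        metrics=[metrics_lib.Match()],
        lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini1_5Pro()]),
    )
    assert not hyper.is_leaf
    # `children` (defined below) materializes one leaf evaluation per model.
    assert len(hyper.children) == 2
    assert all(child.is_leaf for child in hyper.children)
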
+   inputs: Annotated[
+       pg.Functor,
+       'A functor that returns a list of inputs.'
+   ]
+
+   metrics: Annotated[
+       list[metrics_lib.Metric],
+       'The metrics to be evaluated.'
+   ]
+
+   max_workers: Annotated[
+       int,
+       'The maximum number of workers to use for the evaluation.'
+   ] = 32
+
+   def _on_bound(self):
+     # Invalidate cached properties.
+     self.__dict__.pop('is_leaf', None)
+     self.__dict__.pop('children', None)
+     super()._on_bound()
+     self._log_entries = []
+     self._log_lock = threading.Lock()
+
+   #
+   # Handling evaluation hierarchy (materialized vs. hyper evaluations).
+   #
+
+   @functools.cached_property
+   def is_leaf(self) -> bool:
+     """Returns whether the task is a leaf."""
+     return self.is_deterministic
+
+   @functools.cached_property
+   def children(self) -> list['Evaluation']:
+     """Returns the children tasks."""
+     if self.is_leaf:
+       return []
+     children = []
+     for i, child in enumerate(pg.iter(self)):
+       child.sym_setparent(self)
+       child.sym_setpath(self.sym_path + 'children' + i)
+       children.append(child)
+     return children
+
+   #
+   # Handling evaluation inputs.
+   #
+
+   @functools.cached_property
+   def example_inputs(self) -> Iterable[Any]:
+     """Returns the examples from the inputs."""
+     return self.inputs()
+
+   def example_input_by_id(self, example_id: int) -> Any:
+     """Returns the example from the inputs by ID."""
+     assert example_id <= len(self.example_inputs), example_id
+     return self._example_input_by_id[example_id]
+
+   @functools.cached_property
+   def _example_input_by_id(self) -> dict[int, Any]:
+     """Returns the examples from the inputs by ID."""
+     return {i + 1: v for i, v in enumerate(self.example_inputs)}
+
+   @property
+   def num_examples(self) -> int:
+     """Returns the number of examples from the inputs."""
+     # NOTE(daiyip): setting `num_examples` of the input functor allows fast
+     # retrieval of the number of examples without iterating the whole dataset.
+     num_examples = getattr(self.inputs, 'num_examples', None)
+     if not isinstance(num_examples, int):
+       it = self.example_inputs
+       if hasattr(it, '__len__'):
+         num_examples = len(it)
+       else:
+         num_examples = len(list(it))
+     return num_examples
+
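
As the NOTE in `num_examples` indicates, the inputs functor can carry a `num_examples` attribute so the dataset size is known without iterating it. A sketch, relying on the (assumed) behavior that a pyglove functor exposes its bound arguments as attributes; names are illustrative:

    @pg.functor()
    def my_inputs(num_examples: int = 100):
      # `my_inputs().num_examples` reads the bound argument, letting
      # `Evaluation.num_examples` skip iterating the dataset.
      return [
          pg.Dict(question=f'Compute {i} + {i}.')
          for i in range(num_examples)
      ]
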
+   #
+   # Evaluation logic.
+   #
+
+   @abc.abstractmethod
+   def process(self, example_input: Any) -> Any | tuple[Any, dict[str, Any]]:
+     """Processes a single example from the evaluation set.
+
+     Users should override this method to implement the evaluation logic.
+
+     Args:
+       example_input: An object returned from `Evaluable.inputs`.
+
+     Returns:
+       A processed output, or a tuple of (output, metadata).
+       The output will be used for computing the metrics, and the metadata will
+       be included in the evaluation HTML view.
+     """
+
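
A minimal sketch of a concrete leaf evaluation overriding `process`; the class, prompt, and metadata key are illustrative, not part of this diff:

    class MyEval(Evaluation):
      lm: lf.LanguageModel

      def process(self, example_input):
        response = lf.LangFunc(
            'Answer briefly: {{question}}',
            question=example_input.question,
        )(lm=self.lm)
        # Returning (output, metadata): the output feeds the metrics; the
        # metadata is surfaced in the HTML view.
        return response.text, dict(model=self.lm.model_id)
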
+   def evaluate(
+       self,
+       example: example_lib.Example | int,
+       raise_if_has_error: bool = False,
+   ) -> example_lib.Example:
+     """Evaluates a single example input.
+
+     Args:
+       example: An example ID or an example object with ID.
+       raise_if_has_error: Whether to raise an error if the example has an
+         error.
+
+     Returns:
+       The evaluated example with the output and metric metadata populated.
+     """
+     if isinstance(example, int):
+       example = example_lib.Example(id=example)
+     assert isinstance(example, example_lib.Example), example
+
+     if pg.MISSING_VALUE == example.input:
+       example.input = self.example_input_by_id(example.id)
+
+     cached = self._state.get(example.id)
+
+     with pg.timeit('evaluate') as timeit, lf.track_usages() as usage_summary:
+       if cached is None or cached.has_error:
+         example.start_time = time.time()
+         self._process(example, raise_if_has_error=raise_if_has_error)
+       else:
+         example.start_time = cached.start_time
+
+         # Use cached output and metadata obtained from the previous
+         # processing.
+         example.output = cached.output
+         example.metadata = cached.metadata
+         example.newly_processed = False
+
+         # For previously processed examples, we merge previous usages as
+         # cached, so the usage summary will account for previous usages, but
+         # as cached.
+         assert cached.usage_summary is not None
+         usage_summary.merge(cached.usage_summary, as_cached=True)
+
+       # Recompute the metrics and metadata for the example even if its
+       # processed output and metadata were from the cache.
+       # NOTE(daiyip): It's possible that metrics could use LLMs, so we need
+       # to track the usage of the metrics separately.
+       with pg.timeit('metric'):
+         metric_metadata = {}
+         for metric in self.metrics:
+           metric_metadata.update(metric.audit(example))
+         example.metric_metadata = metric_metadata
+
+     # For previously processed examples, we keep the execution status for the
+     # processing step.
+     execution_status = dict(example.execution_status or {})
+     execution_status.update(timeit.status())
+
+     example.execution_status = execution_status
+     example.usage_summary = usage_summary
+     if example.newly_processed:
+       example.end_time = time.time()
+
+     self._state.update(example)
+     return example
+
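
Note that `evaluate` is incremental with respect to checkpointed state: a previously processed example reuses its cached output and usage (merged as cached), and only the metrics are recomputed. A usage sketch (normally the runner drives this; `my_eval` is hypothetical):

    example = my_eval.evaluate(1)   # Example IDs are 1-based.
    print(example.output)           # The output returned by `process`.
    print(example.metric_metadata)  # Populated by `Metric.audit`.
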
+   def _process(
+       self,
+       example: example_lib.Example,
+       raise_if_has_error: bool = False
+   ) -> None:
+     """Processes a single example."""
+     with (
+         pg.notify_on_change(False),
+         pg.allow_writable_accessors(True),
+         # NOTE(daiyip): set the `input` symbol of the globals to None, so
+         # LLM-generated code with calls to `input` will raise an error, thus
+         # not blocking the evaluation.
+         lf_coding.context(input=None),
+     ):
+       try:
+         with pg.timeit('process'):
+           output = self.process(example.input)
+           if (isinstance(output, tuple)
+               and len(output) == 2
+               and isinstance(output[1], dict)):
+             output, metadata = output
+           else:
+             metadata = {}
+           example.output = output
+           example.metadata = metadata
+       except BaseException as e:  # pylint: disable=broad-except
+         if raise_if_has_error:
+           raise
+         example.error = pg.object_utils.ErrorInfo.from_exception(e)
+
+   #
+   # Handling evaluation scheduling.
+   #
+
+   def resource_ids(self) -> set[str]:
+     """Returns a set of resource IDs required by this evaluation.
+
+     Resource IDs are used by the runner to determine which evaluations can
+     be run in parallel. Evaluations using the same resource key will be run
+     sequentially.
+
+     Returns:
+       A set of unique strings, each representing a required resource.
+     """
+     return {
+         v.resource_id for _, v in self.sym_init_args.items()
+         if isinstance(v, lf.LanguageModel)
+     }
+
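
Since `resource_ids` collects the language models among the evaluation's init args, two leaf evaluations configured with the same model map to the same resource and will be serialized by the runner. A sketch, reusing the hypothetical `MyEval` from above:

    eval_a = MyEval(inputs=my_inputs(), metrics=[metrics_lib.Match()],
                    lm=lf.llms.Gpt4())
    eval_b = eval_a.clone(deep=True)
    # Same model, same resource ID: the runner will not run these in parallel.
    assert eval_a.resource_ids() == eval_b.resource_ids()
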
+   #
+   # Handling evaluation state.
+   #
+
+   @property
+   def state(self) -> 'EvaluationState':
+     """Returns the state of the evaluation."""
+     return self._state
+
+   def load_state(
+       self,
+       state_file: str,
+       *,
+       load_example_metadata: bool = True,
+       filter: Callable[[example_lib.Example], bool] | None = None,  # pylint: disable=redefined-builtin
+       raise_if_not_exist: bool = False
+   ) -> None:
+     """Loads saved state from a sequence IO file."""
+     if pg.io.path_exists(state_file):
+       self._state.load(
+           state_file,
+           example_input_by_id=self.example_input_by_id,
+           load_example_metadata=load_example_metadata,
+           filter=filter,
+       )
+     elif raise_if_not_exist:
+       raise ValueError(f'State file {state_file} does not exist.')
+
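
A sketch of resuming from a prior run's checkpoint (the file path is hypothetical), filtering the restored state down to successful examples:

    my_eval.load_state(
        '/path/to/run/checkpoint.jsonl',
        load_example_metadata=False,
        filter=lambda ex: not ex.has_error,  # Keep only successful examples.
    )
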
+   def _reset(self) -> None:
+     """Resets the state of the evaluation."""
+     super()._reset()
+     if self.is_leaf:
+       # Create a new state for the leaf evaluation.
+       self._state = EvaluationState()
+       for metric in self.metrics:
+         metric.reset()
+
+   #
+   # Evaluation-level logging.
+   #
+
+   def _log(self, log_func, level: lf.logging.LogLevel, message: str, **kwargs):
+     # Write to external logging system.
+     log_message = f'{self.id}: {message}'
+     if kwargs:
+       log_message = f'{log_message} (metadata: {kwargs!r})'
+     log_func(log_message)
+
+     # Add to experiment log history.
+     log_entry = lf.logging.LogEntry(
+         level=level,
+         time=datetime.datetime.now(),
+         message=message,
+         metadata=kwargs,
+     )
+     with self._log_lock:
+       self._log_entries.append(log_entry)
+
+   def debug(self, message: str, **kwargs):
+     """Logs a debug message to the session."""
+     self._log(pg.logging.debug, 'debug', message, **kwargs)
+
+   def info(self, message: str, **kwargs):
+     """Logs an info message to the session."""
+     self._log(pg.logging.info, 'info', message, **kwargs)
+
+   def warning(self, message: str, **kwargs):
+     """Logs a warning message to the session."""
+     self._log(pg.logging.warning, 'warning', message, **kwargs)
+
+   def error(self, message: str, **kwargs):
+     """Logs an error message to the session."""
+     self._log(pg.logging.error, 'error', message, **kwargs)
+
+   def fatal(self, message: str, **kwargs):
+     """Logs a fatal message to the session."""
+     # We use the error level for fatal messages, which does not trigger an
+     # assertion.
+     self._log(pg.logging.error, 'fatal', message, **kwargs)
+
+   #
+   # HTML views.
+   #
+
+   def _html_tree_view_content(
+       self, *, view, extra_flags: dict[str, Any] | None, **kwargs
+   ):
+     if not self.is_leaf:
+       return super()._html_tree_view_content(
+           view=view, extra_flags=extra_flags, **kwargs
+       )
+
+     extra_flags = extra_flags or {}
+     run = extra_flags.pop('current_run', None)
+     if extra_flags.pop('card_view', True):
+       return self._summary_card_view(
+           extra_flags.get('interactive', True), run
+       )
+     assert run is not None
+     return self._details_view(run)
+
+   def _parameter_badge(self, key, value) -> pg.Html.WritableTypes:
+     """Renders a badge for a parameter."""
+     face_value = pg.format(
+         value,
+         compact=True,
+         python_format=True,
+         hide_default_values=True,
+         use_inferred=True
+     )
+     short_text = face_value
+     if len(face_value) > 40:
+       short_text = f'{type(value).__name__}(...)'
+     label = f'{key.split(".")[-1]}: {short_text}'
+     tooltip = f'{key}: {face_value}'
+     return pg.views.html.controls.Badge(
+         text=label,
+         tooltip=tooltip,
+         css_classes=['parameter'],
+         interactive=False,
+     )
+
+   def _summary_card_view(
+       self,
+       interactive: bool = True,
+       run: experiment_lib.Run | None = None,
+   ) -> pg.Html.WritableTypes:
+     """Renders the summary card view of the evaluation."""
+     del run
+     return pg.Html(
+         pg.Html.element(
+             'div',
+             [
+                 pg.views.html.controls.LabelGroup([
+                     self._parameter_badge(k, v)
+                     for k, v in self.non_default_values(
+                         flatten=True
+                     ).items()
+                 ], css_classes=['parameter-group']),
+                 pg.Html.element(
+                     'div',
+                     [
+                         m.to_html(
+                             extra_flags=dict(
+                                 interactive=interactive,
+                             )
+                         )
+                         for m in self.metrics
+                     ],
+                     css_classes=['metric-group'],
+                 ),
+             ],
+             css_classes=['badge-groups'],
+         )
+     )
+
+   def _details_view(
+       self, run: experiment_lib.Run
+   ) -> pg.Html:
+     """Renders the details view of the evaluation."""
+
+     def _title():
+       return pg.Html.element(
+           'div',
+           [
+               pg.views.html.controls.LabelGroup(
+                   [
+                       pg.views.html.controls.Label(
+                           'Summary',
+                           link=run.experiment.output_link(run, 'summary.html'),
+                           css_classes=['summary-link'],
+                       ),
+                       '|',
+                       pg.views.html.controls.Label(
+                           'Directory',
+                           link=self.output_link(run, ''),
+                           css_classes=['dir-link'],
+                       ),
+                   ],
+                   css_classes=['experiment-links'],
+               ),
+               pg.views.html.controls.Label(
+                   self.id,
+                   css_classes=['experiment-id'],
+               ),
+               self.progress.to_html(
+                   extra_flags=dict(interactive=False),
+               ),
+               self.usage_summary.to_html(
+                   extra_flags=dict(as_badge=True, interactive=False),
+               ),
+           ]
+       )
+
+     def _parameter_badges():
+       """Renders the parameter badges of the evaluation."""
+       return pg.views.html.controls.LabelGroup(
+           [
+               self._parameter_badge(k, v)
+               for k, v in self.non_default_values(flatten=True).items()
+           ],
+           css_classes=['parameter-group'],
+       )
+
+     def _definition_tab() -> pg.views.html.controls.Tab:
+       """Renders a tab for the definition of the evaluation."""
+       return pg.views.html.controls.Tab(
+           label='Definition',
+           content=pg.Html.element(
+               'div',
+               [
+                   pg.views.html.controls.Label(
+                       pg.format(
+                           self,
+                           compact=False,
+                           verbose=False,
+                           use_inferred=True,
+                           hide_frozen=True,
+                           exclude_keys=set(['progress', 'usage_summary'])
+                       ),
+                       css_classes=['eval-definition'],
+                   ),
+               ]
+           )
+       )
+
+     def _metric_tab(metric: metrics_lib.Metric) -> pg.views.html.controls.Tab:
+       """Renders a tab for a metric (group)."""
+       return pg.views.html.controls.Tab(
+           label=f'Metric: {metric.name}',
+           content=pg.Html.element(
+               'div',
+               [
+                   metric.to_html(
+                       extra_flags=dict(
+                           interactive=False,
+                       )
+                   ),
+                   pg.views.html.controls.TabControl(
+                       tabs=[
+                           _metric_value_tab(mv)
+                           for mv in metric.values()
+                       ]
+                   )
+               ]
+           )
+       )
+
+     def _metric_value_tab(
+         metric_value: metric_values_lib.MetricValue
+     ) -> pg.views.html.controls.Tab:
+       """Renders the example links for a metric value."""
+       return pg.views.html.controls.Tab(
+           label=metric_value.sym_path.key,
+           content=pg.Html.element(
+               'div',
+               [
+                   pg.views.html.controls.Label(
+                       str(dp.example_id),
+                       link=self.output_link(run, f'{dp.example_id}.html'),
+                       target='example-view',
+                       css_classes=['example-link'],
+                   )
+                   for dp in metric_value.data_points
+               ]
+           )
+       )
+
+     def _logs_tab() -> pg.views.html.controls.Tab:
+       """Renders a tab for the logs of the evaluation."""
+       with self._log_lock:
+         log_history = '\n'.join(str(l) for l in self._log_entries)
+       return pg.views.html.controls.Tab(
+           label='Logs',
+           content=pg.Html.element(
+               'div',
+               [
+                   pg.Html.element(
+                       'textarea',
+                       [pg.Html.escape(log_history)],
+                       readonly=True,
+                       css_classes=['logs-textarea'],
+                   )
+               ]
+           )
+       )
+
+     def _main_tabs() -> pg.Html:
+       return pg.Html.element(
+           'div',
+           [
+               pg.views.html.controls.TabControl(
+                   [
+                       _definition_tab(),
+                   ] + [
+                       _metric_tab(m) for m in self.metrics
+                   ] + [
+                       _logs_tab()
+                   ],
+                   selected=1,
+               )
+           ],
+       )
+
+     return pg.Html.element(
+         'div',
+         [
+             _title(),
+             _parameter_badges(),
+             _main_tabs(),
+             pg.Html.element(
+                 'iframe', [],
+                 name='example-view',
+                 src='./1.html',
+                 title='Example view.',
+                 css_classes=['example-view'],
+             ),
+         ],
+         css_classes=['eval-details'],
+     )
+
+   def _html_tree_view_config(self) -> dict[str, Any]:
+     return dict(
+         css_classes=['eval-card'] if self.is_leaf else None
+     )
+
+   def _html_tree_view_css_styles(self) -> list[str]:
+     return super()._html_tree_view_css_styles() + [
+         """
+         details.eval-card {
+           display: inline-block;
+           border: 0px;
+           box-shadow: rgba(0, 0, 0, 0.16) 0px 1px 4px;
+           margin: 15px;
+         }
+         .eval-card details {
+           border: 0px;
+         }
+         .badge-groups {
+           font-weight: normal;
+           padding: 5px;
+         }
+         .parameter-group {
+           display: inline-grid;
+           grid-template-rows: auto auto;
+           border: 0px;
+           margin-right: 10px;
+         }
+         .parameter.badge {
+           margin: 2px;
+         }
+         .metric-group {
+           display: inline-grid;
+           grid-template-rows: auto auto;
+         }
+         .eval-details .progress-bar > .shade {
+           visibility: hidden;
+           width: 0px;
+           margin: 0px;
+         }
+         .eval-details .progress-label {
+           font-size: 16px;
+           background-color: #eee;
+         }
+         .eval-details .progress-time {
+           font-size: 16px;
+           color: dodgerblue;
+           background-color: #eee;
+           margin-right: 10px;
+         }
+         .eval-details .usage-summary.badge {
+           color: orange;
+           font-size: 16px;
+           background-color: #eee;
+         }
+         .eval-details .experiment-links {
+           display: block;
+           border: 0px;
+           margin: 0px;
+         }
+         .eval-details .tab-control {
+           width: 100%;
+         }
+         .eval-details .tab-button {
+           font-size: large;
+         }
+         .experiment-links .label {
+           color: revert;
+           margin: 0px;
+           padding: 2px;
+         }
+         .eval-details .experiment-id {
+           font-size: 2.0em;
+           font-weight: bold;
+           display: block;
+         }
+         .eval-details .parameter-group {
+           display: inline-block;
+           padding: 5px;
+         }
+         .eval-definition {
+           white-space: pre;
+           background-color: #eee;
+           padding: 15px;
+         }
+         .eval-details .metric-container {
+           display: block;
+           padding: 15px 0px;
+         }
+         .example-link {
+           color: revert;
+         }
+         .example-view {
+           border: 0px;
+           width: 100%;
+           height: 100%;
+         }
+         .logs-textarea {
+           width: 100%;
+           height: 500px;
+           padding: 5px;
+           border: 1px solid #DDD;
+           background-color: #EEE;
+           resize: vertical;
+         }
+         """
+     ]
+
+
+ class EvaluationState:
+   """Evaluation state."""
+
+   def __init__(self):
+     super().__init__()
+     self._evaluated_examples: dict[int, example_lib.Example] = {}
+
+   def load(
+       self,
+       state_file: str,
+       *,
+       example_input_by_id: Callable[[int], Any] | None = None,
+       load_example_metadata: bool | Callable[
+           [example_lib.Example], bool] = True,
+       filter: Callable[[example_lib.Example], bool] | None = None,  # pylint: disable=redefined-builtin
+   ) -> None:
+     """Loads the state from the example sequence file."""
+     with pg.io.sequence.open_sequence(state_file) as f:
+       for record in f:
+         example = pg.from_json_str(
+             record,
+             example_input_by_id=example_input_by_id,
+             load_example_metadata=load_example_metadata
+         )
+         assert isinstance(example, example_lib.Example), example
+         if filter is not None and not filter(example):
+           continue
+         self._evaluated_examples[example.id] = example
+
+   @property
+   def evaluated_examples(self) -> dict[int, example_lib.Example]:
+     """Returns the examples in the state."""
+     return self._evaluated_examples
+
+   def get(self, example_id: int) -> example_lib.Example | None:
+     """Returns the example with the given ID."""
+     return self._evaluated_examples.get(example_id)
+
+   def update(self, example: example_lib.Example) -> None:
+     """Updates the state with the given example."""
+     self._evaluated_examples[example.id] = example