langfun 0.0.2.dev20240429__py3-none-any.whl → 0.1.2.dev202501140804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. langfun/__init__.py +20 -2
  2. langfun/core/__init__.py +16 -5
  3. langfun/core/agentic/__init__.py +30 -0
  4. langfun/core/agentic/action.py +854 -0
  5. langfun/core/agentic/action_eval.py +150 -0
  6. langfun/core/agentic/action_eval_test.py +109 -0
  7. langfun/core/agentic/action_test.py +136 -0
  8. langfun/core/coding/python/__init__.py +5 -11
  9. langfun/core/coding/python/correction.py +37 -21
  10. langfun/core/coding/python/correction_test.py +29 -3
  11. langfun/core/coding/python/execution.py +40 -216
  12. langfun/core/coding/python/execution_test.py +29 -89
  13. langfun/core/coding/python/generation.py +21 -11
  14. langfun/core/coding/python/generation_test.py +2 -2
  15. langfun/core/coding/python/parsing.py +108 -193
  16. langfun/core/coding/python/parsing_test.py +2 -105
  17. langfun/core/component.py +63 -2
  18. langfun/core/component_test.py +53 -0
  19. langfun/core/concurrent.py +414 -117
  20. langfun/core/concurrent_test.py +111 -24
  21. langfun/core/console.py +18 -5
  22. langfun/core/console_test.py +17 -0
  23. langfun/core/eval/__init__.py +16 -1
  24. langfun/core/eval/base.py +622 -174
  25. langfun/core/eval/base_test.py +200 -54
  26. langfun/core/eval/matching.py +63 -76
  27. langfun/core/eval/matching_test.py +17 -8
  28. langfun/core/eval/patching.py +130 -0
  29. langfun/core/eval/patching_test.py +170 -0
  30. langfun/core/eval/scoring.py +26 -26
  31. langfun/core/eval/scoring_test.py +19 -2
  32. langfun/core/eval/v2/__init__.py +42 -0
  33. langfun/core/eval/v2/checkpointing.py +380 -0
  34. langfun/core/eval/v2/checkpointing_test.py +228 -0
  35. langfun/core/eval/v2/eval_test_helper.py +136 -0
  36. langfun/core/eval/v2/evaluation.py +725 -0
  37. langfun/core/eval/v2/evaluation_test.py +180 -0
  38. langfun/core/eval/v2/example.py +305 -0
  39. langfun/core/eval/v2/example_test.py +128 -0
  40. langfun/core/eval/v2/experiment.py +1048 -0
  41. langfun/core/eval/v2/experiment_test.py +433 -0
  42. langfun/core/eval/v2/metric_values.py +156 -0
  43. langfun/core/eval/v2/metric_values_test.py +80 -0
  44. langfun/core/eval/v2/metrics.py +357 -0
  45. langfun/core/eval/v2/metrics_test.py +203 -0
  46. langfun/core/eval/v2/progress.py +348 -0
  47. langfun/core/eval/v2/progress_test.py +82 -0
  48. langfun/core/eval/v2/progress_tracking.py +210 -0
  49. langfun/core/eval/v2/progress_tracking_test.py +66 -0
  50. langfun/core/eval/v2/reporting.py +270 -0
  51. langfun/core/eval/v2/reporting_test.py +158 -0
  52. langfun/core/eval/v2/runners.py +488 -0
  53. langfun/core/eval/v2/runners_test.py +334 -0
  54. langfun/core/langfunc.py +4 -17
  55. langfun/core/langfunc_test.py +22 -6
  56. langfun/core/language_model.py +577 -39
  57. langfun/core/language_model_test.py +470 -56
  58. langfun/core/llms/__init__.py +87 -16
  59. langfun/core/llms/anthropic.py +312 -87
  60. langfun/core/llms/anthropic_test.py +71 -3
  61. langfun/core/llms/cache/base.py +21 -2
  62. langfun/core/llms/cache/in_memory.py +13 -0
  63. langfun/core/llms/cache/in_memory_test.py +53 -2
  64. langfun/core/llms/compositional.py +101 -0
  65. langfun/core/llms/compositional_test.py +73 -0
  66. langfun/core/llms/deepseek.py +117 -0
  67. langfun/core/llms/deepseek_test.py +61 -0
  68. langfun/core/llms/fake.py +11 -7
  69. langfun/core/llms/fake_test.py +14 -0
  70. langfun/core/llms/gemini.py +507 -0
  71. langfun/core/llms/gemini_test.py +195 -0
  72. langfun/core/llms/google_genai.py +62 -218
  73. langfun/core/llms/google_genai_test.py +9 -202
  74. langfun/core/llms/groq.py +160 -144
  75. langfun/core/llms/groq_test.py +31 -137
  76. langfun/core/llms/llama_cpp.py +15 -42
  77. langfun/core/llms/llama_cpp_test.py +4 -30
  78. langfun/core/llms/openai.py +395 -203
  79. langfun/core/llms/openai_compatible.py +179 -0
  80. langfun/core/llms/openai_compatible_test.py +495 -0
  81. langfun/core/llms/openai_test.py +30 -395
  82. langfun/core/llms/rest.py +113 -0
  83. langfun/core/llms/rest_test.py +111 -0
  84. langfun/core/llms/vertexai.py +192 -0
  85. langfun/core/llms/vertexai_test.py +52 -0
  86. langfun/core/logging.py +284 -0
  87. langfun/core/logging_test.py +125 -0
  88. langfun/core/message.py +319 -9
  89. langfun/core/message_test.py +190 -13
  90. langfun/core/modalities/__init__.py +6 -2
  91. langfun/core/modalities/audio.py +30 -0
  92. langfun/core/modalities/audio_test.py +63 -0
  93. langfun/core/modalities/image.py +39 -20
  94. langfun/core/modalities/image_test.py +52 -9
  95. langfun/core/modalities/mime.py +206 -29
  96. langfun/core/modalities/mime_test.py +90 -9
  97. langfun/core/modalities/ms_office.py +117 -0
  98. langfun/core/modalities/ms_office_test.py +389 -0
  99. langfun/core/modalities/pdf.py +22 -0
  100. langfun/core/modalities/pdf_test.py +57 -0
  101. langfun/core/modalities/video.py +9 -26
  102. langfun/core/modalities/video_test.py +3 -3
  103. langfun/core/modality.py +26 -3
  104. langfun/core/modality_test.py +2 -2
  105. langfun/core/sampling.py +11 -11
  106. langfun/core/structured/__init__.py +12 -16
  107. langfun/core/structured/completion.py +32 -5
  108. langfun/core/structured/completion_test.py +7 -6
  109. langfun/core/structured/description.py +2 -2
  110. langfun/core/structured/description_test.py +3 -3
  111. langfun/core/structured/function_generation.py +60 -27
  112. langfun/core/structured/function_generation_test.py +72 -2
  113. langfun/core/structured/mapping.py +97 -47
  114. langfun/core/structured/mapping_test.py +90 -2
  115. langfun/core/structured/parsing.py +33 -21
  116. langfun/core/structured/parsing_test.py +53 -9
  117. langfun/core/structured/querying.py +746 -0
  118. langfun/core/structured/{prompting_test.py → querying_test.py} +469 -51
  119. langfun/core/structured/schema.py +204 -97
  120. langfun/core/structured/schema_generation.py +1 -1
  121. langfun/core/structured/schema_test.py +130 -29
  122. langfun/core/structured/scoring.py +125 -19
  123. langfun/core/structured/scoring_test.py +30 -0
  124. langfun/core/structured/tokenization.py +64 -0
  125. langfun/core/structured/tokenization_test.py +48 -0
  126. langfun/core/template.py +115 -1
  127. langfun/core/template_test.py +71 -1
  128. langfun/core/templates/conversation.py +9 -0
  129. langfun/core/templates/conversation_test.py +4 -3
  130. langfun/core/templates/selfplay_test.py +10 -2
  131. langfun-0.1.2.dev202501140804.dist-info/METADATA +225 -0
  132. langfun-0.1.2.dev202501140804.dist-info/RECORD +153 -0
  133. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/WHEEL +1 -1
  134. langfun/core/coding/python/errors.py +0 -108
  135. langfun/core/coding/python/errors_test.py +0 -99
  136. langfun/core/coding/python/permissions.py +0 -90
  137. langfun/core/coding/python/permissions_test.py +0 -86
  138. langfun/core/structured/prompting.py +0 -238
  139. langfun/core/text_formatting.py +0 -162
  140. langfun/core/text_formatting_test.py +0 -47
  141. langfun-0.0.2.dev20240429.dist-info/METADATA +0 -100
  142. langfun-0.0.2.dev20240429.dist-info/RECORD +0 -108
  143. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/LICENSE +0 -0
  144. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501140804.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/experiment.py (new file)
@@ -0,0 +1,1048 @@
+ # Copyright 2024 The Langfun Authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Evaluation experiment."""
+
+ import abc
+ import datetime
+ import functools
+ import hashlib
+ import inspect
+ import os
+ import re
+ from typing import Annotated, Any, Callable, Literal, Optional
+
+ import langfun.core as lf
+ from langfun.core.eval.v2 import example as example_lib
+ from langfun.core.eval.v2 import progress as progress_lib
+ import pyglove as pg
+
+
+ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
+   """Evaluation Experiment.
+
+   # Experiment Structure
+
+   An evaluation experiment is structured as a tree of evaluation tasks, where
+   each task is represented as a node in the tree. Leaf tasks are instances of
+   `Evaluation` with concrete hyper-parameter values. Nodes such as `Suite` and
+   `Evaluation` that utilize `pg.oneof` are non-leaf tasks, as they represent
+   multiple configurations. Leaf tasks can be retrieved using the property
+   `leaf_nodes`, while non-leaf tasks can be retrieved using the property
+   `nonleaf_nodes`. An experiment without any leaf tasks is considered empty.
+
+   For example:
+
+   ```
+   Suite(
+       MyEvaluation1(
+           lm=pg.oneof([lm1, lm2]),
+       ),
+       Suite(
+           MyEvaluation2(
+               lm=lm1,
+           ),
+           MyEvaluation3(
+               lm=lm2,
+           ),
+       )
+   )
+   ```
+
+   In this example:
+   - The two `Suite` nodes and the `MyEvaluation1` node (with `pg.oneof`) are
+     non-leaf nodes, as they contain leaf tasks.
+   - There are four leaf nodes: two under `MyEvaluation1`, which correspond to
+     `MyEvaluation1` instances with `lm1` and `lm2` as hyper-parameters
+     respectively, and the `MyEvaluation2` and `MyEvaluation3` objects, which
+     are leaf nodes as they have specific hyper-parameter values.
+
+   # Running an Experiment
+
+   To run an experiment, users can call `Experiment.run`. This will execute
+   the experiment using a specified `Runner` instance (e.g. 'parallel' or
+   'sequential'). Progress and results will be periodically written to HTML
+   files. Users can also assign an ID to each run, which will identify the
+   output directory of that run.
+
+   By default, the experiment will resume from the latest run under the root
+   directory (using the ID 'latest'). Users can specify 'new' to start a
+   fresh run or provide a specific run ID (typically in the format
+   %Y%m%d_<number>). Additionally, when initiating a new run, users may
+   specify a `warm_start_from` directory to restore the experiment's state
+   from a previous run.
+
+   Examples:
+
+   ```
+   root_dir = '/path/to/experiment/root'
+
+   # Resume the latest experiment run, or start a new run if none exists.
+   experiment.run(root_dir)
+
+   # Equivalent to:
+   experiment.run(root_dir, 'latest')
+
+   # Start a new, clean run.
+   experiment.run(root_dir, 'new')
+
+   # Start a new run with a warm start from another run located at
+   # '/path/to/another/run' (e.g. /my_experiment/run_20241031_1).
+   experiment.run(root_dir, 'new', warm_start_from='/path/to/another/run')
+
+   # Resume run '20241031_1', re-running failed examples and recomputing
+   # metrics as needed.
+   experiment.run(root_dir, '20241031_1')
+
+   # Reprocess the previous run located in 'run_20241031_1'.
+   experiment.run(root_dir, '20241031_1', reprocess=True)
+   ```
+
+   # Experiment Registration and Lookup
+
+   Experiments can be registered by setting a class-level NAME attribute.
+   Users can then retrieve a registered experiment using
+   `Experiment.find(name)`.
+
+   For example:
+
+   ```
+   class MyEval(lf.eval.v2.Evaluation):
+     NAME = 'my_eval'
+
+   class MyEvalVariation1(MyEval):
+     NAME = 'my_eval/gemini'
+     lm = pg.oneof([lf.llms.GeminiPro(), lf.llms.GeminiFlash(), ...])
+
+   class MyEvalVariation2(MyEval):
+     NAME = 'my_eval/openai'
+     lm = pg.oneof([lf.llms.Gpt4o(), lf.llms.Gpt4Turbo(), ...])
+
+   # Run all experiments with "gemini" in their name.
+   experiment = Experiment.find('.*/gemini')
+   experiment.run()
+
+   # Run all experiments with "my_eval" in their name.
+   experiment = Experiment.find('my_eval.*')
+   experiment.run()
+   ```
+
+   # Checkpointing
+
+   Experiments support checkpointing, which is enabled by default. It allows
+   users to resume their experiments from a saved state. When an experiment
+   runs, it creates a new directory for that run and saves the current state
+   to a checkpoint file. If the experiment is interrupted or fails, users can
+   resume it by specifying the 'id' or 'warm_start_from' argument (shown
+   above) to seamlessly continue from the previously saved state without
+   starting over.
+
+   # Monitoring and Reporting
+
+   Evaluations can take considerable time to complete, so Langfun provides
+   several tools to monitor progress. Progress bars display the status of
+   each evaluation: HTML-based progress bars update in real time within Colab
+   notebooks, while text-based progress bars appear in the terminal using
+   tqdm.
+
+   Additionally, Langfun generates HTML files at regular intervals to provide
+   progress updates and detailed evaluation results. These files are saved in
+   the evaluation's output directory, organized as follows:
+
+   <root_dir>                  # Root directory of the experiment.
+   |_ <run_id>                 # Root directory of the current run.
+      |_ summary.html          # Summary of the run. Updated every 60 seconds.
+      |_ <experiment_cls>      # Directory of a particular experiment type.
+         |_ <experiment_hash>  # Directory of a particular experiment config.
+            |_ index.html      # Experiment report. Updated every 60 seconds.
+            |_ 1.html          # Detailed evaluation output of example 1.
+            |_ 2.html          # Detailed evaluation output of example 2.
+            |_ ...
+
+   # Experiment Plugins
+
+   Experiments can be extended with plugins. Plugins can listen to the events
+   of experiment execution and produce additional outputs. For example, a
+   plugin could be added to an experiment to generate additional metrics or
+   to save additional data to a database. More details will be added in the
+   future.
+   """
+
+   #
+   # Class-level functionalities.
+   #
+
+   # A globally unique str as a well-known name for an experiment, which can
+   # be retrieved by `Experiment.find(name)`. If None, the experiment does
+   # not have a well-known name, thus users need to create the experiment by
+   # constructing it explicitly.
+   NAME = None
+
+   # Global registry for experiment classes with a well-known NAME.
+   _NAME_TO_CLASS = {}
+
+   def __init_subclass__(cls):
+     super().__init_subclass__()
+
+     if inspect.isabstract(cls):
+       return
+
+     if cls.NAME is not None:
+       cls._NAME_TO_CLASS[cls.NAME] = cls
+
+   @classmethod
+   def find(cls, pattern: str) -> 'Experiment':
+     """Finds an experiment by global name.
+
+     Args:
+       pattern: A regular expression to match the global names of registered
+         experiments.
+
+     Returns:
+       An experiment object. If multiple experiments are found, a
+       `Suite` of matched experiments will be returned. If no experiment is
+       found, an empty `Suite` will be returned.
+     """
+     if pattern in cls._NAME_TO_CLASS:
+       return cls._NAME_TO_CLASS[pattern]()
+     regex = re.compile(pattern)
+     selected = []
+     for cls_name, exp_cls in cls._NAME_TO_CLASS.items():
+       if regex.match(cls_name):
+         selected.append(exp_cls())
+     return selected[0] if len(selected) == 1 else Suite(selected)
+
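For readers skimming the diff, the `NAME`/`find` pair above is a small self-registering class pattern. Below is a minimal standalone sketch of the same idea; the `TextEval` and `ImageEval` names are hypothetical and not part of the package:

```python
import re


class Registered:
  """Toy stand-in for the NAME registry in `Experiment` above."""
  NAME = None
  _NAME_TO_CLASS = {}

  def __init_subclass__(cls):
    super().__init_subclass__()
    # Subclasses that set NAME are recorded at class-definition time,
    # mirroring `Experiment.__init_subclass__`.
    if cls.NAME is not None:
      cls._NAME_TO_CLASS[cls.NAME] = cls

  @classmethod
  def find(cls, pattern: str) -> list[type]:
    # Like `Experiment.find`, match registered names with a regex.
    regex = re.compile(pattern)
    return [c for name, c in cls._NAME_TO_CLASS.items() if regex.match(name)]


class TextEval(Registered):   # hypothetical
  NAME = 'text_eval/basic'


class ImageEval(Registered):  # hypothetical
  NAME = 'image_eval/basic'


assert Registered.find('text_eval.*') == [TextEval]
assert len(Registered.find('.*/basic')) == 2
```

The real `find` additionally instantiates the matched classes and wraps multiple matches in a `Suite`; the sketch only returns the matched types.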
+   #
+   # Instance-level functionalities.
+   #
+
+   progress: Annotated[
+       progress_lib.Progress,
+       'The progress of the experiment.'
+   ] = progress_lib.Progress()
+
+   usage_summary: Annotated[
+       lf.UsageSummary,
+       'The usage summary of the experiment.'
+   ] = lf.UsageSummary()
+
+   plugins: Annotated[
+       list['Plugin'],
+       (
+           'Plugins for the current experiment, which can listen to the '
+           'events of experiment execution and produce additional outputs.'
+       )
+   ] = []
+
+   def _on_bound(self):
+     super()._on_bound()
+     self.__dict__.pop('hash', None)
+     self.__dict__.pop('dir', None)
+     self._reset()
+
+   #
+   # Identity of an experiment.
+   #
+
+   @property
+   def id(self) -> str:
+     """Returns the ID for this evaluation."""
+     return f'{self.__class__.__name__}@{self.hash}'
+
+   def definition(self, hide_default_values: bool = True) -> str:
+     """Returns the definition of the experiment."""
+     return self.format(
+         compact=False,
+         hide_default_values=hide_default_values,
+         use_inferred=True,
+         exclude_keys=('progress', 'usage_summary')
+     )
+
+   @functools.cached_property
+   def hash(self) -> str:
+     """An 8-character MD5 hash computed from the experiment identity."""
+     identity = self.format(
+         compact=True, hide_default_values=True, use_inferred=True,
+         exclude_keys=('plugins', 'progress', 'usage_summary')
+     )
+     return hashlib.md5(identity.encode()).hexdigest()[:8]
+
+   @classmethod
+   def link(cls, path: str) -> str:
+     return f'file://{path}'
+
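As a quick illustration of the identity scheme, the `ClassName@<hash>` ID above can be reproduced with `hashlib` alone; the identity string below is a stand-in for the output of `self.format(...)`:

```python
import hashlib

# Stand-in for the compact, default-hiding format of the experiment.
identity = 'MyEvaluation1(lm=GeminiPro())'

# First 8 hex characters of the MD5 digest, as in `Experiment.hash`.
digest = hashlib.md5(identity.encode()).hexdigest()[:8]
experiment_id = f'MyEvaluation1@{digest}'
print(experiment_id)  # e.g. 'MyEvaluation1@<8 hex chars>'
```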
+   #
+   # Hierarchy of an experiment tree.
+   #
+
+   @property
+   @abc.abstractmethod
+   def children(self) -> list['Experiment']:
+     """Returns the child experiments."""
+
+   @property
+   @abc.abstractmethod
+   def is_leaf(self) -> bool:
+     """Returns whether the experiment is a leaf node."""
+
+   def empty(self) -> bool:
+     """Returns whether the experiment is empty."""
+     return not self.leaf_nodes
+
+   @functools.cached_property
+   def nodes(self) -> list['Experiment']:
+     """Returns all the experiment nodes in the subtree (including self)."""
+     nodes = [self]
+     for child in self.children:
+       nodes.extend(child.nodes)
+     return nodes
+
+   @functools.cached_property
+   def leaf_nodes(self) -> list['Experiment']:
+     """Returns the leaf nodes.
+
+     The leaf nodes of an experiment are evaluable objects that have
+     materialized hyper-parameters.
+     """
+     if self.is_leaf:
+       return [self]
+
+     nodes = []
+     for child in self.children:
+       nodes.extend(child.leaf_nodes)
+     return nodes
+
+   @functools.cached_property
+   def nonleaf_nodes(self) -> list['Experiment']:
+     """Returns the non-leaf nodes."""
+     if self.is_leaf:
+       return []
+     nodes = [self]
+     for child in self.children:
+       nodes.extend(child.nonleaf_nodes)
+     return nodes
+
+   @functools.cached_property
+   def parent(self) -> Optional['Experiment']:
+     """Returns the parent experiment."""
+     parent = self.sym_parent
+     while parent is not None and not isinstance(parent, Experiment):
+       parent = parent.sym_parent
+     return parent
+
+   def get(self, evaluation_id: str) -> Optional['Experiment']:
+     """Returns the experiment by ID."""
+     for leaf in self.leaf_nodes:
+       if leaf.id == evaluation_id:
+         return leaf
+     return None
+
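The traversal properties above are plain recursive tree walks. A standalone sketch using a toy node type (not the real `Experiment` class) reproduces the docstring example's count of four leaves:

```python
from dataclasses import dataclass, field


@dataclass
class Node:
  """Toy stand-in for an experiment node; leaf = no children."""
  children: list['Node'] = field(default_factory=list)

  @property
  def is_leaf(self) -> bool:
    return not self.children

  @property
  def leaf_nodes(self) -> list['Node']:
    if self.is_leaf:
      return [self]
    leaves = []
    for child in self.children:
      leaves.extend(child.leaf_nodes)
    return leaves


# Mirrors the docstring example: one evaluation that expands into two
# configurations, plus a nested suite of two concrete evaluations.
eval1 = Node([Node(), Node()])
inner_suite = Node([Node(), Node()])
root = Node([eval1, inner_suite])
assert len(root.leaf_nodes) == 4
```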
+   #
+   # Mutable states during evaluation.
+   #
+
+   def reset(self) -> None:
+     """Resets the experiment for a new run."""
+     self.progress.reset()
+     self.rebind(
+         usage_summary=lf.UsageSummary(),
+         skip_notification=True,
+         raise_on_no_change=False
+     )
+     if self.is_leaf:
+       self._reset()
+     else:
+       for child in self.children:
+         child.reset()
+
+   def _reset(self) -> None:
+     """Subclass could override."""
+
+   #
+   # Helper methods for running the evaluation without explicitly creating
+   # the runner.
+   #
+
+   def run(
+       self,
+       root_dir: str,
+       id: str | None = None,  # pylint: disable=redefined-builtin
+       *,
+       runner: str = 'parallel',
+       warm_start_from: str | None = None,
+       filter: Callable[['Experiment'], bool] | None = None,  # pylint: disable=redefined-builtin
+       example_ids: list[int] | None = None,
+       raise_if_has_error: bool = False,
+       reprocess: bool | list[int] = False,
+       generate_example_html: Literal['new', 'all', 'no'] | list[int] = 'new',
+       process_timeout: int | None = None,
+       use_cache: Literal['global', 'per_dataset', 'no'] = 'per_dataset',
+       note: str | None = None,
+       tags: list[str] | None = None,
+       plugins: list['Plugin'] | None = None,
+       **kwargs
+   ) -> 'Run':
+     """Runs the experiment.
+
+     Examples:
+       # Start a new run under root_dir.
+       experiment.run(root_dir, 'new')
+
+       # Continue the latest experiment run.
+       experiment.run(root_dir, 'latest')
+
+       # Continue the latest experiment run, or start a new run if it does
+       # not exist.
+       experiment.run(root_dir)
+
+       # Start a new run and warm start from another run's directory
+       # '/path/to/another/run_20241031_1/'.
+       experiment.run(
+           root_dir, 'new',
+           warm_start_from='/path/to/another/run_20241031_1/'
+       )
+
+       # Reprocess the previous run under sub-dir 'run_20241031_1'.
+       experiment.run(root_dir, '20241031_1', reprocess=True)
+
+     Args:
+       root_dir: The root of the output directory of the experiment.
+       id: The ID of the current run. It can be None, a special keyword
+         'latest' or 'new', or a datetime string in format `%Y%m%d_<number>`
+         (e.g. 20241031_1).
+         If None, it will use the latest run ID under the root directory, or
+         create a new run based on the current time if no previous run
+         exists.
+         If `latest`, it will use the latest run ID under the root directory.
+         If `new`, it will create a new run ID based on the current time.
+       runner: The name of the runner to use, e.g. 'parallel' (the default)
+         or 'sequential'.
+       warm_start_from: The directory of a previous run to warm start from.
+         If None, it will continue the experiment identified by `id` from
+         where it left off. Otherwise, it will create a new experiment run by
+         warm starting.
+       filter: A filter function to decide whether an experiment should be
+         run or not.
+       example_ids: The example IDs to run. If None, it will run all
+         examples.
+       raise_if_has_error: If True, it will raise an error if any example
+         fails. Otherwise, it will continue and report the error in the
+         output.
+       reprocess: A boolean or a list of example IDs. If a boolean, it
+         indicates whether all the examples to be evaluated will be
+         reprocessed, meaning that existing checkpoints will be ignored. If a
+         list of example IDs, it indicates that only the specified examples
+         will be reprocessed.
+       generate_example_html: Among 'new', 'all', 'no' or a list of example
+         IDs.
+         If 'new', generate HTML files for all newly processed examples, and
+         keep/copy existing HTML files for unchanged examples.
+         If 'all', generate HTML files for all examples.
+         If 'no', do not generate HTML files for any examples.
+         If a list of example IDs, generate HTML files for the specified
+         examples.
+       process_timeout: The timeout in seconds for each process. If None, it
+         will use the default timeout for the runner.
+       use_cache: Whether to use LLM cache for the experiment.
+         If `global`, it will use a global cache shared by all experiments.
+         If `per_dataset`, it will use a cache dedicated to each dataset.
+         If `no`, it will not use any cache.
+       note: The note for the current run.
+       tags: The tags for the current run.
+       plugins: Runner plugins to use.
+       **kwargs: Additional kwargs to pass to the runner.
+
+     Returns:
+       The current run.
+     """
+     if plugins is not None:
+       kwargs['plugins'] = plugins
+     runner = Runner.create(
+         runner,
+         current_run=Run(
+             root_dir=root_dir,
+             experiment=pg.Ref(self),
+             id=RunId.from_id(id, root_dir),
+             warm_start_from=warm_start_from,
+             filter=filter,
+             example_ids=example_ids,
+             raise_if_has_error=raise_if_has_error,
+             reprocess=reprocess,
+             generate_example_html=generate_example_html,
+             use_cache=use_cache,
+             process_timeout=process_timeout,
+             note=note,
+             tags=tags or [],
+         ),
+         **kwargs
+     )
+     runner.run()
+     return runner.current_run
+
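Tying the arguments together, a typical invocation might look like the sketch below. It is illustrative only: `my_experiment` stands for a concrete `Evaluation` instance, and the paths are hypothetical:

```python
# Start a fresh run under /tmp/my_eval, evaluating only examples 1-3 with a
# per-dataset LLM cache. Returns the `Run` object describing the run.
run = my_experiment.run(
    '/tmp/my_eval',            # root_dir
    'new',                     # start a clean run
    runner='parallel',         # the default runner
    example_ids=[1, 2, 3],
    use_cache='per_dataset',
)
print(run.output_root)         # e.g. '/tmp/my_eval/run_20250114_1'
```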
+   def run_preconfigured(
+       self,
+       root_dir: str | None = None,
+       id: str | None = None,  # pylint: disable=redefined-builtin
+       **kwargs
+   ) -> 'Run':
+     """Runs the experiment with pre-configured kwargs from `cls.RUN_ARGS`.
+
+     This helper method allows users to configure run arguments as a part of
+     the class.
+
+     Args:
+       root_dir: The root directory of the experiment.
+       id: The ID of the current run.
+       **kwargs: Keyword arguments to override `RUN_ARGS`.
+
+     Returns:
+       The current run.
+     """
+     run_config = getattr(self, 'RUN_ARGS', {})
+     run_config.update(kwargs)
+     if root_dir is not None:
+       run_config['root_dir'] = root_dir
+     if id is not None:
+       run_config['id'] = id
+     return self.run(**run_config)
+
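The override logic in `run_preconfigured` is just a dict merge in which call-time kwargs win over class-level defaults. A runnable sketch of that merge, outside the class for clarity (the `RUN_ARGS` values here are hypothetical):

```python
# Class-level defaults, as a RUN_ARGS attribute would provide them.
RUN_ARGS = dict(root_dir='/tmp/my_eval', runner='sequential')


def run_preconfigured(**kwargs):
  run_config = dict(RUN_ARGS)  # copy so the defaults stay untouched
  run_config.update(kwargs)    # call-time kwargs take precedence
  return run_config


assert run_preconfigured(runner='parallel') == dict(
    root_dir='/tmp/my_eval', runner='parallel'
)
```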
+   #
+   # HTML views.
+   #
+
+   def output_link(
+       self,
+       run: Optional['Run'], relative_path: str
+   ) -> str | None:
+     """Returns the output link for the experiment."""
+     if run is None:
+       return None
+     return self.link(run.output_path_for(self, relative_path))
+
+   def _html_tree_view_summary_title(
+       self,
+       current_run: Optional['Run'] = None,
+       interactive: bool = True,
+   ):
+     title, link, dir_link = self.id, None, None
+     if current_run is not None:
+       dir_link = self.output_link(current_run, '')
+       if self.is_leaf:
+         link = self.output_link(current_run, 'index.html')
+       elif self.parent is None:
+         title = str(current_run.id)
+         link = self.output_link(current_run, 'summary.html')
+     return pg.Html.element(
+         'div',
+         [
+             # Experiment ID.
+             pg.views.html.controls.Label(
+                 title,
+                 link=link,
+                 tooltip=pg.format(  # pytype: disable=wrong-arg-types
+                     self,
+                     verbose=False,
+                     use_inferred=True,
+                     hide_default_values=True,
+                     exclude_keys=(
+                         'root_dir', 'plugins', 'progress', 'usage_summary'
+                     ),
+                 ),
+                 css_classes=['experiment-name'],
+             ),
+             # Experiment directory (if root or leaf).
+             pg.views.html.controls.Label(  # pylint: disable=g-long-ternary
+                 '[dir]',
+                 link=dir_link,
+                 css_classes=['experiment-dir'],
+             ) if dir_link is not None else None,
+             # Progress bar.
+             self.progress.to_html(
+                 extra_flags=dict(interactive=interactive),
+             ),
+             # Usage summary.
+             self.usage_summary.to_html(
+                 extra_flags=dict(as_badge=True, interactive=interactive)
+             ),
+         ],
+         css_classes=['experiment-summary']
+     )
+
+   def _html_tree_view_summary(
+       self,
+       *,
+       view,
+       name: str | None = None,
+       extra_flags: dict[str, Any] | None = None,
+       **kwargs
+   ):
+     extra_flags = extra_flags or {}
+     if not extra_flags.get('card_view', True):
+       return None
+
+     kwargs.pop('title', None)
+     kwargs.pop('enable_key_tooltip', None)
+     kwargs.pop('enable_summary_tooltip', None)
+     return view.summary(
+         self,
+         name=name if self.is_leaf else None,
+         title=self._html_tree_view_summary_title(
+             extra_flags.get('current_run', None),
+             extra_flags.get('interactive', True)
+         ),
+         enable_key_tooltip=False,
+         enable_summary_tooltip=False,
+         **kwargs
+     )
+
+   def _html_tree_view_content(
+       self,
+       *,
+       view,
+       collapse_level: int | None = 1,
+       extra_flags: dict[str, Any],
+       **kwargs):
+     return pg.Html.element(
+         'div',
+         [
+             c.to_html(
+                 collapse_level=view.get_collapse_level(
+                     (collapse_level, -1), 0
+                 ),
+                 name=f'#{i + 1}',
+                 extra_flags=extra_flags,
+                 **view.get_passthrough_kwargs(**kwargs)
+             )
+             for i, c in enumerate(self.children)
+         ],
+     )
+
+   def _html_tree_view_css_styles(self) -> list[str]:
+     return super()._html_tree_view_css_styles() + [
+         """
+         .experiment-summary {
+           display: inline-block;
+           font-weight: normal;
+         }
+         .experiment-name {
+           font-weight: bold;
+         }
+         .experiment-dir.label {
+           color: revert;
+           margin-left: 0px;
+           padding: 2px;
+         }
+         .usage-summary-badge {
+           margin-left: 10px;
+         }
+         body {
+           font: normal 16px "Roboto","Noto",sans-serif;
+         }
+         """
+     ]
+
+
+ @pg.use_init_args(['children'])
+ class Suite(Experiment):
+   """A suite of evaluations."""
+
+   children: Annotated[
+       list[Experiment], 'A list of child experiments.'
+   ] = []
+
+   @property
+   def is_leaf(self) -> bool:
+     """Returns whether the task is a leaf."""
+     return False
+
+
+ class RunId(pg.Object):
+   """Structured representation of an experiment run ID."""
+   date: datetime.date
+   number: int
+
+   _REGEX = re.compile(r'^(\d{8})_(\d+)$')
+
+   def dirname(self, root_dir: str | None = None) -> str:
+     """Returns the directory name of the run ID."""
+     dir_name = f'run_{self}'
+     if root_dir is None:
+       return dir_name
+     return os.path.join(root_dir, dir_name)
+
+   def __str__(self) -> str:
+     """Returns the string representation of the run ID."""
+     return f'{self.date.strftime("%Y%m%d")}_{self.number}'
+
+   def __lt__(self, other: 'RunId') -> bool:
+     """Returns whether the run ID is less than the other."""
+     return self.date < other.date or (
+         self.date == other.date and self.number < other.number
+     )
+
+   def __le__(self, other: 'RunId') -> bool:
+     """Returns whether the run ID is less than or equal to the other."""
+     return self == other or self < other
+
+   def __gt__(self, other: 'RunId') -> bool:
+     """Returns whether the run ID is greater than the other."""
+     return other < self
+
+   def __ge__(self, other: 'RunId') -> bool:
+     """Returns whether the run ID is greater than or equal to the other."""
+     return self == other or self > other
+
+   def next(self) -> 'RunId':
+     """Returns the next run ID."""
+     return RunId(
+         date=self.date,
+         number=self.number + 1,
+     )
+
+   @classmethod
+   def from_dirname(cls, dirname: str) -> Optional['RunId']:
+     """Creates a run ID from the directory name."""
+     if not dirname.startswith('run_'):
+       return None
+     run_id_str = dirname.removeprefix('run_')
+     if cls.is_valid(run_id_str):
+       return cls.from_id(run_id_str)
+     return None
+
+   @classmethod
+   def is_valid(cls, run_id: str) -> bool:
+     """Returns whether the run ID is valid."""
+     return run_id in ('latest', 'new') or bool(cls._REGEX.match(run_id))
+
+   @classmethod
+   def from_id(
+       cls,
+       run_id: str | None,
+       root_dir: str | None = None
+   ) -> 'RunId':
+     """Creates a run ID from the string representation."""
+     if run_id is not None and not cls.is_valid(run_id):
+       raise ValueError(
+           '`run_id` must be one of `latest`, `new`, or a '
+           'datetime string in format `%Y%m%d_<number>` (e.g. 20240101_1). '
+           f'Encountered: {run_id!r}.'
+       )
+     if run_id in (None, 'latest', 'new'):
+       if root_dir is None:
+         raise ValueError(
+             '`root_dir` must be provided for `latest` or `new` run ID.'
+         )
+       if run_id == 'latest':
+         run_id = cls.get_latest(root_dir)
+         if run_id is None:
+           raise ValueError(
+               f'There are no previous runs under the root directory: '
+               f'{root_dir}. Consider running the experiment using `new` '
+               f'as id.'
+           )
+         return run_id
+       if run_id == 'new':
+         return cls.new(root_dir)
+       return cls.get_latest(root_dir) or cls.new()
+
+     assert run_id is not None
+     date_str, number_str = run_id.split('_')
+     return cls(
+         date=datetime.datetime.strptime(date_str, '%Y%m%d').date(),
+         number=int(number_str),
+     )
+
+   @classmethod
+   def get_latest(cls, root_dir: str) -> Optional['RunId']:
+     """Returns the latest run ID under the root directory."""
+     if not pg.io.isdir(root_dir):
+       return None
+     run_ids = [
+         RunId.from_dirname(dirname)
+         for dirname in pg.io.listdir(root_dir)
+     ]
+     run_ids = [run_id for run_id in run_ids if run_id is not None]
+     if not run_ids:
+       return None
+     return max(run_ids)
+
+   @classmethod
+   def new(cls, root_dir: str | None = None) -> 'RunId':
+     """Creates a new run ID."""
+     latest = None if root_dir is None else cls.get_latest(root_dir)
+     if latest is not None and latest.date == datetime.date.today():
+       return latest.next()
+     return cls(
+         date=datetime.date.today(),
+         number=1,
+     )
+
+
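`RunId` values order by date and then by number, and render as `YYYYMMDD_<number>`. The tuple-based sketch below shows the same ordering and formatting without depending on pyglove:

```python
import datetime

# (date, number) tuples order exactly as RunId.__lt__ does: by date first,
# then by number.
a = (datetime.date(2024, 10, 31), 1)
b = (datetime.date(2024, 10, 31), 2)
c = (datetime.date(2024, 11, 1), 1)
assert a < b < c

# String form, as produced by RunId.__str__ and RunId.dirname.
run_id_str = f'{a[0]:%Y%m%d}_{a[1]}'
assert run_id_str == '20241031_1'
assert f'run_{run_id_str}' == 'run_20241031_1'
```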
+ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
+   """A run of an experiment."""
+
+   root_dir: Annotated[
+       str,
+       'The root of the output directory of the experiment.'
+   ]
+
+   id: Annotated[
+       RunId,
+       'The ID of the current run.'
+   ]
+
+   experiment: Annotated[
+       Experiment,
+       'The root experiment to run.'
+   ]
+
+   warm_start_from: Annotated[
+       str | None,
+       'The directory of a previous run to warm start from.'
+   ] = None
+
+   example_ids: Annotated[
+       list[int] | None,
+       'The example IDs to run. If None, it will run all examples.'
+   ] = None
+
+   raise_if_has_error: Annotated[
+       bool,
+       'If True, it will raise an error if any example fails.'
+   ] = False
+
+   note: Annotated[
+       str | None,
+       'The user note for the current run.'
+   ] = None
+
+   tags: Annotated[
+       list[str],
+       'The user tags for the current run.'
+   ] = []
+
+   reprocess: Annotated[
+       bool | list[int],
+       (
+           'If True, it will reprocess all examples under the current '
+           'run directory. If a list of integers, examples of the given IDs '
+           'will be reprocessed.'
+       )
+   ] = False
+
+   generate_example_html: Annotated[
+       Literal['new', 'all', 'no'] | list[int],
+       (
+           'If "new", generate HTML files for all newly processed examples, '
+           'and keep/copy existing HTML files for unchanged examples. '
+           'If "all", generate HTML files for all examples. '
+           'If "no", do not generate HTML files for any examples. '
+           'If a list of example IDs, generate HTML files for the specified '
+           'examples.'
+       )
+   ] = 'new'
+
+   filter: Annotated[
+       Callable[[Experiment], bool] | None,
+       'A filter to decide whether a leaf experiment should be run or not.'
+   ] = None
+
+   process_timeout: Annotated[
+       int | None,
+       'Timeout for each evaluation example.'
+   ] = None
+
+   use_cache: Annotated[
+       Literal['global', 'per_dataset', 'no'],
+       (
+           'The cache policy for the runner. If `global`, the runner will '
+           'use the cache for all evaluations. If `per_dataset`, the runner '
+           'will use the cache for each evaluation. If `no`, the runner '
+           'will not use the cache.'
+       )
+   ] = 'per_dataset'
+
+   @property
+   def output_root(self) -> str:
+     """Returns the root output directory of the current run."""
+     return self.id.dirname(self.root_dir)
+
+   @property
+   def input_root(self) -> str:
+     """Returns the root input directory of the current run."""
+     return self.warm_start_from if self.warm_start_from else self.output_root
+
+   def output_dir(self, experiment: Experiment) -> str:
+     """Returns the output directory of the experiment."""
+     if experiment.is_leaf:
+       return os.path.join(self.output_root, experiment.id.replace('@', '/'))
+     return self.output_root
+
+   def input_dir(self, experiment: Experiment) -> str:
+     """Returns the input directory of the experiment."""
+     if experiment.is_leaf:
+       return os.path.join(self.input_root, experiment.id.replace('@', '/'))
+     return self.input_root
+
+   def input_path_for(self, experiment: Experiment, relative_path: str) -> str:
+     """Returns the input path for the experiment."""
+     return os.path.join(self.input_dir(experiment), relative_path)
+
+   def output_path_for(self, experiment: Experiment, relative_path: str) -> str:
+     """Returns the output path for the experiment."""
+     return os.path.join(self.output_dir(experiment), relative_path)
+
+   def examples_to_evaluate(self, experiment: Experiment) -> set[int]:
+     """Returns the example IDs to evaluate."""
+     if not experiment.is_leaf:
+       return set()
+     return set(
+         self.example_ids if self.example_ids else
+         range(1, experiment.num_examples + 1)
+     )
+
+   def examples_to_reprocess(self, experiment: Experiment) -> set[int]:
+     """Returns the example IDs to reprocess per request."""
+     if not self.reprocess:
+       return set()
+     reprocess_ids = self.examples_to_evaluate(experiment)
+     if isinstance(self.reprocess, list):
+       reprocess_ids &= set(self.reprocess)
+     return reprocess_ids
+
+   def examples_to_load(self, experiment: Experiment) -> set[int]:
+     """Returns the example IDs to load from checkpoint files."""
+     load_ids = self.examples_to_evaluate(experiment)
+     if isinstance(self.generate_example_html, list):
+       load_ids |= set(self.generate_example_html)
+     load_ids -= self.examples_to_reprocess(experiment)
+     return load_ids
+
+   def examples_to_load_metadata(self, experiment: Experiment) -> set[int]:
+     """Returns the example IDs for which to load metadata."""
+     load_metadata_ids = set()
+     if isinstance(self.generate_example_html, list):
+       load_metadata_ids = set(self.generate_example_html)
+     elif self.generate_example_html == 'all':
+       load_metadata_ids = self.examples_to_evaluate(experiment)
+     load_metadata_ids -= self.examples_to_reprocess(experiment)
+     return load_metadata_ids
+
+
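The three `examples_to_*` methods above are straightforward set algebra. A standalone sketch of `examples_to_load` with concrete IDs:

```python
# Examples selected for evaluation (all IDs, or `example_ids` if given).
examples_to_evaluate = {1, 2, 3, 4, 5}
# Explicit IDs requested for HTML generation, and IDs to reprocess.
generate_example_html = [4, 6]
examples_to_reprocess = {2, 3}

load_ids = set(examples_to_evaluate)
load_ids |= set(generate_example_html)   # also load explicitly requested HTML
load_ids -= examples_to_reprocess        # reprocessed IDs are not loaded
assert load_ids == {1, 4, 5, 6}
```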
+ class Runner(pg.Object):
+   """Interface for experiment runners."""
+
+   # Class-level variable for registering the runner.
+   NAME = None
+
+   _REGISTRY = {}
+
+   current_run: Annotated[
+       Run,
+       'The current run.'
+   ]
+
+   plugins: Annotated[
+       list['Plugin'],
+       'The plugins for the runner.'
+   ] = []
+
+   def __init_subclass__(cls):
+     super().__init_subclass__()
+     if inspect.isabstract(cls):
+       return
+     if cls.NAME is None:
+       raise ValueError(
+           'Runner class must define a NAME constant, '
+           'which is used to identify the runner when creating it.'
+       )
+     cls._REGISTRY[cls.NAME] = cls
+
+   @abc.abstractmethod
+   def run(self) -> None:
+     """Runs an evaluation task."""
+
+   @classmethod
+   def create(cls, runner: str, **kwargs) -> 'Runner':
+     """Creates a runner instance by name and kwargs."""
+     return cls._REGISTRY[runner](**kwargs)
+
+
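The runner registry follows the same self-registration pattern as `Experiment.NAME`. A minimal standalone sketch, with a hypothetical 'debug' runner:

```python
class BaseRunner:
  """Toy stand-in for the `Runner` interface above."""
  NAME = None
  _REGISTRY = {}

  def __init_subclass__(cls):
    super().__init_subclass__()
    # Every concrete subclass must pick a unique NAME to be creatable.
    if cls.NAME is None:
      raise ValueError('Runner class must define a NAME constant.')
    cls._REGISTRY[cls.NAME] = cls

  @classmethod
  def create(cls, name: str, **kwargs) -> 'BaseRunner':
    return cls._REGISTRY[name](**kwargs)


class DebugRunner(BaseRunner):  # hypothetical runner
  NAME = 'debug'

  def run(self) -> None:
    print('running ...')


assert isinstance(BaseRunner.create('debug'), DebugRunner)
```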
+ class Plugin(lf.Component):
+   """Base class for experiment plugins."""
+
+   def on_run_start(
+       self,
+       runner: Runner,
+       root: Experiment
+   ) -> None:
+     """Called when a runner is started."""
+
+   def on_run_complete(
+       self,
+       runner: Runner,
+       root: Experiment
+   ) -> None:
+     """Called when a runner is complete."""
+
+   def on_run_abort(
+       self,
+       runner: Runner,
+       root: Experiment,
+       error: BaseException,
+   ) -> None:
+     """Called when a runner is aborted."""
+
+   def on_experiment_start(
+       self,
+       runner: Runner,
+       experiment: Experiment
+   ) -> None:
+     """Called when an evaluation is started."""
+
+   def on_experiment_skipped(
+       self,
+       runner: Runner,
+       experiment: Experiment
+   ) -> None:
+     """Called when an experiment (both leaf and non-leaf) is skipped."""
+
+   def on_experiment_complete(
+       self,
+       runner: Runner,
+       experiment: Experiment
+   ) -> None:
+     """Called when an experiment (both leaf and non-leaf) is complete."""
+
+   def on_experiment_abort(
+       self,
+       runner: Runner,
+       experiment: Experiment,
+       error: BaseException,
+   ) -> None:
+     """Called when an experiment (both leaf and non-leaf) is aborted."""
+
+   def on_example_start(
+       self,
+       runner: Runner,
+       experiment: Experiment,
+       example: example_lib.Example
+   ) -> None:
+     """Called when an example is about to be evaluated."""
+
+   def on_example_complete(
+       self,
+       runner: Runner,
+       experiment: Experiment,
+       example: example_lib.Example
+   ) -> None:
+     """Called when an example is evaluated."""