langfun 0.1.2.dev202411090804__py3-none-any.whl → 0.1.2.dev202411140804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of langfun might be problematic. Click here for more details.

Files changed (36) hide show
  1. langfun/core/console.py +10 -2
  2. langfun/core/console_test.py +17 -0
  3. langfun/core/eval/__init__.py +2 -0
  4. langfun/core/eval/v2/__init__.py +38 -0
  5. langfun/core/eval/v2/checkpointing.py +135 -0
  6. langfun/core/eval/v2/checkpointing_test.py +89 -0
  7. langfun/core/eval/v2/evaluation.py +627 -0
  8. langfun/core/eval/v2/evaluation_test.py +156 -0
  9. langfun/core/eval/v2/example.py +295 -0
  10. langfun/core/eval/v2/example_test.py +114 -0
  11. langfun/core/eval/v2/experiment.py +949 -0
  12. langfun/core/eval/v2/experiment_test.py +304 -0
  13. langfun/core/eval/v2/metric_values.py +156 -0
  14. langfun/core/eval/v2/metric_values_test.py +80 -0
  15. langfun/core/eval/v2/metrics.py +357 -0
  16. langfun/core/eval/v2/metrics_test.py +203 -0
  17. langfun/core/eval/v2/progress.py +348 -0
  18. langfun/core/eval/v2/progress_test.py +82 -0
  19. langfun/core/eval/v2/progress_tracking.py +209 -0
  20. langfun/core/eval/v2/progress_tracking_test.py +56 -0
  21. langfun/core/eval/v2/reporting.py +144 -0
  22. langfun/core/eval/v2/reporting_test.py +41 -0
  23. langfun/core/eval/v2/runners.py +417 -0
  24. langfun/core/eval/v2/runners_test.py +311 -0
  25. langfun/core/eval/v2/test_helper.py +80 -0
  26. langfun/core/language_model.py +122 -11
  27. langfun/core/language_model_test.py +97 -4
  28. langfun/core/llms/__init__.py +3 -0
  29. langfun/core/llms/compositional.py +101 -0
  30. langfun/core/llms/compositional_test.py +73 -0
  31. langfun/core/llms/vertexai.py +4 -4
  32. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/METADATA +1 -1
  33. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/RECORD +36 -12
  34. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/WHEEL +1 -1
  35. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/LICENSE +0 -0
  36. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,949 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Evaluation experiment."""
15
+
16
+ import abc
17
+ import datetime
18
+ import functools
19
+ import hashlib
20
+ import inspect
21
+ import os
22
+ import re
23
+ from typing import Annotated, Any, Callable, Literal, Optional
24
+
25
+ import langfun.core as lf
26
+ from langfun.core.eval.v2 import example as example_lib
27
+ from langfun.core.eval.v2 import progress as progress_lib
28
+ import pyglove as pg
29
+
30
+
31
+ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
32
+ """Evaluation Experiment.
33
+
34
+ # Experiment Structure.
35
+
36
+ An evaluation experiment is structured as a tree of evaluation tasks, where
37
+ each task is represented as a node in the tree. Leaf tasks are instances of
38
+ `Evaluation` with concrete hyper-parameter values. Nodes such as `Suite` and
39
+ `Evaluation` that utilize `pg.oneof` are non-leaf tasks, as they represent
40
+ multiple configurations. Leaf tasks can be retrieved using property
41
+ `leaf_nodes`, while non-leaf tasks can be retrieved using property
42
+ `nonleaf_nodes`. An experiment without any leaf tasks is considered
43
+ empty.
44
+
45
+ For example:
46
+
47
+ ```
48
+ Suite(
49
+ MyEvaluation1(
50
+ lm=pg.oneof([lm1, lm2]),
51
+ ),
52
+ Suite(
53
+ MyEvaluation2(
54
+ lm=lm1,
55
+ ),
56
+ MyEvaluation3(
57
+ lm=lm2,
58
+ ),
59
+ )
60
+ )
61
+ ```
62
+
63
+ In this example:
64
+ - The two `Suite` nodes and the `MyEvaluation1` node (with pg.oneof) are
65
+ non-leaf nodes, as they contain leaf tasks.
66
+ - There are four leaf nodes. Two leaf nodes under `MyEvaluation1`, which
67
+ correspond to `MyEvaluation1` instances with `lm1` and `lm2` as
68
+ hyper-parameters respectively. The objects of `MyEvaluation2` and
69
+ `MyEvaluation3` are also leaf nodes as they have specific hyper-parameter
70
+ values.
71
+
72
+ # Running an Experiment
73
+
74
+ To run an experiment, users can call `Experiment.run`. This will execute the
75
+ experiment using a specified `Runner` instance (e.g., 'parallel' or
76
+ 'sequential'). Progress and results will be periodically written to HTML
77
+ files. Users can also assign an id to each run, which will identify the output
78
+ directory of that run.
79
+
80
+ By default, the experiment will resume from the latest run under the root
81
+ directory (using the ID 'latest'). Users can specify 'new' to start a fresh
82
+ run or provide a specific run ID (typically in the format %Y%m%d_<number>).
83
+ Additionally, when initiating a new run, users may specify a `warm_start_from`
84
+ ID to restore the experiment’s state from a previous run.
85
+
86
+ Examples:
87
+
88
+ ```
89
+ root_dir = '/path/to/experiment/root'
90
+
91
+ # Resume the latest experiment run, or start a new run if none exists.
92
+ experiment.run(root_dir)
93
+
94
+ # Equivalent to:
95
+ experiment.run(root_dir, 'latest')
96
+
97
+ # Start a new, clean run.
98
+ experiment.run(root_dir, 'new')
99
+
100
+ # Start a new run with a warm start from the previous run located in
101
+ # 'run_20241031_1' of the root directory.
102
+ experiment.run(root_dir, 'new', warm_start_from='20241031_1')
103
+
104
+ # Resume run '20241031_1', re-running failed examples and recomputing
105
+ # metrics as needed.
106
+ experiment.run(root_dir, '20241031_1')
107
+
108
+ # Refresh the previous run located in 'run_20241031_1'.
109
+ experiment.run(root_dir, '20241031_1', refresh=True)
110
+ ```
111
+
112
+ # Experiment Registration and Lookup
113
+
114
+ Experiments can be registered by setting a class-level NAME attribute.
115
+ Users can then retrieve a registered experiment using Experiment.find(name).
116
+
117
+ For example:
118
+
119
+ ```
120
+ class MyEval(lf.eval.v2.Evaluation):
121
+ NAME = 'my_eval'
122
+
123
+ class MyEvalVariation1(MyEval):
124
+ NAME = 'my_eval/gemini'
125
+ lm = pg.oneof([lf.llms.GeminiPro(), lf.llms.GeminiFlash(), ...])
126
+
127
+ class MyEvalVariation2(MyEval):
128
+ NAME = 'my_eval/openai'
129
+ lm = pg.oneof([lf.llms.Gpt4o(), lf.llms.Gpt4Turbo(), ...])
130
+
131
+ # Run all experiments with "gemini" in their name.
132
+ experiment = Experiment.find('.*/gemini')
133
+ experiment.run()
134
+
135
+ # Run all experiments with "my_eval" in their name.
136
+ experiment = Experiment.find('my_eval.*')
137
+ experiment.run()
138
+ ```
139
+
140
+ # Checkpointing
141
+
142
+ Experiments support checkpointing, which is enabled by default. It allows
143
+ users to resume their experiments from a saved state. When an experiment runs,
144
+ it creates a new directory for that run and saves the current state to a
145
+ checkpoint file. If the experiment is interrupted or fails, users can resume
146
+ it by specifying the 'id' or 'warm_start_from' argument (shown above) to
147
+ seamlessly continue from previously saved state without starting over.
148
+
149
+ # Monitoring and Reporting
150
+
151
+ Evaluations can take considerable time to complete, so Langfun provides
152
+ several tools to monitor progress. Progress bars display the status of each
153
+ evaluation: HTML-based progress bars update in real time within Colab
154
+ notebooks, while text-based progress bars appear in the terminal using tqdm.
155
+
156
+ Additionally, Langfun generates HTML files at regular intervals to provide
157
+ progress updates and detailed evaluation results. These files are saved in
158
+ the evaluation's output directory, organized as follows:
159
+
160
+ root_dir> # Root directory of the experiment.
161
+ |_ <run_id> # Root directory of current run.
162
+ |_ summary.html # Summary of the run. Updated every 60 seconds.
163
+ |_ <experiment_cls> # Directory of a particular experiment type.
164
+ |_ <experiment_hash> # Directory of a particular experiment config.
165
+ |_ index.html # Experiment report. Updated every 60 seconds.
166
+ |_ 1.html # Detailed evaluation output of example 1.
167
+ |_ 2.html # Detailed evaluation output of example 2.
168
+ |_ ...
169
+
170
+ # Experiment Plugins
171
+
172
+ Experiment can be extended by plugins. Plugins can listen to the events of
173
+ experiment execution and produce additional outputs. For example, a plugin
174
+ can be added to an experiment to generate additional metrics or to save
175
+ additional data to a database. More details will be added in the future.
176
+ """
177
+
178
  #
  # Class-level functionalities.
  #

  # A globally unique str as a well-known name for an experiment,
  # which can be retrieved by `Experiment.find(name)`. If None, the experiment
  # does not have a well-known name, thus users need to create the experiment
  # by constructing it explicitly.
  NAME = None

  # Global registry mapping a well-known NAME to its experiment class,
  # populated by `__init_subclass__` below.
  _NAME_TO_CLASS = {}
190
+
191
+ def __init_subclass__(cls):
192
+ super().__init_subclass__()
193
+
194
+ if inspect.isabstract(cls):
195
+ return
196
+
197
+ if cls.NAME is not None:
198
+ cls._NAME_TO_CLASS[cls.NAME] = cls
199
+
200
+ @classmethod
201
+ def find(cls, pattern: str) -> 'Experiment':
202
+ """Finds an experiment by global name.
203
+
204
+ Args:
205
+ pattern: A regular expression to match the global names of registered
206
+ experiments.
207
+
208
+ Returns:
209
+ An experiment object. If multiple experiments are found, a
210
+ `Suite` of matched experiments will be returned. If no experiment is
211
+ found, an empty `Suite` will be returned.
212
+ """
213
+ if pattern in cls._NAME_TO_CLASS:
214
+ return cls._NAME_TO_CLASS[pattern]()
215
+ regex = re.compile(pattern)
216
+ selected = []
217
+ for cls_name, exp_cls in cls._NAME_TO_CLASS.items():
218
+ if regex.match(cls_name):
219
+ selected.append(exp_cls())
220
+ return selected[0] if len(selected) == 1 else Suite(selected)
221
+
222
+ #
223
+ # Instance-level functionalities.
224
+ #
225
+
226
  # NOTE: `progress` and `usage_summary` are mutable run-time state; both are
  # excluded from the experiment identity `hash` (see `hash` below).
  progress: Annotated[
      progress_lib.Progress,
      'The progress of the experiment.'
  ] = progress_lib.Progress()

  usage_summary: Annotated[
      lf.UsageSummary,
      'The usage summary of the experiment.'
  ] = lf.UsageSummary()

  plugins: Annotated[
      list['Plugin'],
      (
          'Plugins for current experiment, which can listen to the events '
          'of experiment execution and produce additional outputs.'
      )
  ] = []
243
+
244
+ def _on_bound(self):
245
+ super()._on_bound()
246
+ self.__dict__.pop('hash', None)
247
+ self.__dict__.pop('dir', None)
248
+ self._reset()
249
+
250
+ #
251
+ # Identity of an experiment.
252
+ #
253
+
254
+ @property
255
+ def id(self) -> str:
256
+ """Returns the ID for this evaluaton."""
257
+ return f'{self.__class__.__name__}@{self.hash}'
258
+
259
+ def definition(self, hide_default_values: bool = True) -> str:
260
+ """Returns the definition of the experiment."""
261
+ return self.format(
262
+ compact=False,
263
+ hide_default_values=hide_default_values,
264
+ use_inferred=True,
265
+ exclude_keys=('progress', 'usage_summary')
266
+ )
267
+
268
+ @functools.cached_property
269
+ def hash(self) -> str:
270
+ """A 8-byte MD5 hash computed from experiment identity."""
271
+ identity = self.format(
272
+ compact=True, hide_default_values=True, use_inferred=True,
273
+ exclude_keys=('plugins', 'progress', 'usage_summary')
274
+ )
275
+ return hashlib.md5(identity.encode()).hexdigest()[:8]
276
+
277
+ @classmethod
278
+ def link(cls, path: str) -> str:
279
+ return f'file://{path}'
280
+
281
+ #
282
+ # Hierarchy of an experiment tree.
283
+ #
284
+
285
+ @property
286
+ @abc.abstractmethod
287
+ def children(self) -> list['Experiment']:
288
+ """Returns the child experiments."""
289
+
290
+ @property
291
+ @abc.abstractmethod
292
+ def is_leaf(self) -> bool:
293
+ """Returns whether the experiment is a leaf node."""
294
+
295
+ def empty(self) -> bool:
296
+ """Returns whether the experiment is empty."""
297
+ return not self.leaf_nodes
298
+
299
+ @functools.cached_property
300
+ def nodes(self) -> list['Experiment']:
301
+ """Returns all the experiment nodes in the subtree (including self)."""
302
+ nodes = [self]
303
+ for child in self.children:
304
+ nodes.extend(child.nodes)
305
+ return nodes
306
+
307
+ @functools.cached_property
308
+ def leaf_nodes(self) -> list['Experiment']:
309
+ """Returns the leaf nodes.
310
+
311
+ The leaf-nodes of an experiment are evaluable objects that has materilized
312
+ hyper-parameters.
313
+ """
314
+ if self.is_leaf:
315
+ return [self]
316
+
317
+ nodes = []
318
+ for child in self.children:
319
+ nodes.extend(child.leaf_nodes)
320
+ return nodes
321
+
322
+ @functools.cached_property
323
+ def nonleaf_nodes(self) -> list['Experiment']:
324
+ """Returns the non-leaf nodes."""
325
+ if self.is_leaf:
326
+ return []
327
+ nodes = [self]
328
+ for child in self.children:
329
+ nodes.extend(child.nonleaf_nodes)
330
+ return nodes
331
+
332
+ @functools.cached_property
333
+ def parent(self) -> Optional['Experiment']:
334
+ """Returns the parent experiment."""
335
+ parent = self.sym_parent
336
+ while parent is not None and not isinstance(parent, Experiment):
337
+ parent = parent.sym_parent
338
+ return parent
339
+
340
+ def get(self, evaluation_id: str) -> Optional['Experiment']:
341
+ """Returns the experiment by ID."""
342
+ for leaf in self.leaf_nodes:
343
+ if leaf.id == evaluation_id:
344
+ return leaf
345
+ return None
346
+
347
+ #
348
+ # Mutable states during evaluaton.
349
+ #
350
+
351
+ def reset(self) -> None:
352
+ """Resets the experiment for a new run."""
353
+ self.progress.reset()
354
+ self.rebind(
355
+ usage_summary=lf.UsageSummary(),
356
+ skip_notification=True,
357
+ raise_on_no_change=False
358
+ )
359
+ if self.is_leaf:
360
+ self._reset()
361
+ else:
362
+ for child in self.children:
363
+ child.reset()
364
+
365
+ def _reset(self) -> None:
366
+ """Subclass could override."""
367
+
368
+ #
369
+ # Helper methods for running the evaluation without explicitly creating the
370
+ # runner.
371
+ #
372
+
373
+ def run(
374
+ self,
375
+ root_dir: str,
376
+ id: str | None = None, # pylint: disable=redefined-builtin
377
+ *,
378
+ runner: str = 'parallel',
379
+ warm_start_from: str | None = None,
380
+ filter: Callable[['Experiment'], bool] | None = None, # pylint: disable=redefined-builtin
381
+ example_ids: list[int] | None = None,
382
+ raise_if_has_error: bool = False,
383
+ refresh: bool = False,
384
+ process_timeout: int | None = None,
385
+ use_cache: Literal['global', 'per_dataset', 'no'] = 'per_dataset',
386
+ note: str | None = None,
387
+ tags: list[str] | None = None,
388
+ plugins: list['Plugin'] | None = None,
389
+ **kwargs
390
+ ) -> 'Run':
391
+ """Runs the experiment.
392
+
393
+ Examples:
394
+ # Start a new run.
395
+ experiment.run('new')
396
+
397
+ # Continue the latest experiment run.
398
+ experiment.run('latest')
399
+
400
+ # Continue the latest experiment run or start a new run if it does not
401
+ # exist.
402
+ experiment.run()
403
+
404
+ # Start a new run and warm start from a previous run under sub-dir
405
+ # 'run_20241031_1'.
406
+ experiment.run('new', warm_start_from='20241031_1')
407
+
408
+ # Refresh previous run under sub-dir 'run_20241031_1'.
409
+ experiment.run('20241031_1', refresh=True)
410
+
411
+ Args:
412
+ root_dir: The root of the output directory of the experiment.
413
+ id: The ID of the current run. It can be None, a special keyword 'latest'
414
+ or 'new', or a datetime string in format `%Y%m%d%_%` (e.g. 20241031_1).
415
+ If None, it will use the latest run ID under the root directory or
416
+ create a new run based on the current time if no previous run exists.
417
+ If `latest`, it will use the latest run ID under the root directory.
418
+ If `new`, it will create a new run ID based on the current time.
419
+ runner: The runner to use. If None, it will use the default runner for
420
+ the experiment.
421
+ warm_start_from: The ID of the previous run to warm start from. If None,
422
+ it will continue the experiment identified by `id` from where it left
423
+ off. Otherwise, it will create a new experiment run by warming start.
424
+ filter: A filter function to decide whether an experiment should be run
425
+ or not.
426
+ example_ids: The example IDs to run. If None, it will run all examples.
427
+ raise_if_has_error: If True, it will raise an error if any example fails.
428
+ Otherwise, it will continue and report the error in the output.
429
+ refresh: Whether to refresh the experiment. If True, it will delete the
430
+ data under the current experiment run directory and start a new run.
431
+ process_timeout: The timeout in seconds for each process. If None, it
432
+ will use the default timeout for the runner.
433
+ use_cache: Whether to use LLM cache for the experiment.
434
+ If `global`, it will use a global cache shared by all experiments.
435
+ If `per_dataset`, it will use a cache dedicated for each dataset.
436
+ If `no`, it will not use any cache.
437
+ note: The note for the current run.
438
+ tags: The tags for the current run.
439
+ plugins: Runner plugins to use.
440
+ **kwargs: Additional kwargs to pass to the runner.
441
+
442
+ Returns:
443
+ The current run.
444
+ """
445
+ if plugins is not None:
446
+ kwargs['plugins'] = plugins
447
+ runner = Runner.create(
448
+ runner,
449
+ current_run=Run(
450
+ root_dir=root_dir,
451
+ experiment=pg.Ref(self),
452
+ id=RunId.from_id(id, root_dir),
453
+ warm_start_from=warm_start_from,
454
+ filter=filter,
455
+ example_ids=example_ids,
456
+ raise_if_has_error=raise_if_has_error,
457
+ refresh=refresh,
458
+ use_cache=use_cache,
459
+ process_timeout=process_timeout,
460
+ note=note,
461
+ tags=tags or [],
462
+ ),
463
+ **kwargs
464
+ )
465
+ runner.run()
466
+ return runner.current_run
467
+
468
+ #
469
+ # HTML views.
470
+ #
471
+
472
  def output_link(
      self,
      run: Optional['Run'], relative_path: str
  ) -> str | None:
    """Returns a `file://` link for `relative_path` under `run`'s output dir.

    Returns None when `run` is None.
    """
    if run is None:
      return None
    return self.link(run.output_path_for(self, relative_path))

  def _html_tree_view_summary_title(
      self,
      current_run: Optional['Run'] = None,
      interactive: bool = True,
  ):
    # Renders the summary row of the experiment card: ID label (with link and
    # tooltip), optional [dir] link, progress bar and usage-summary badge.
    title, link, dir_link = self.id, None, None
    if current_run is not None:
      dir_link = self.output_link(current_run, '')
      if self.is_leaf:
        # Leaf: link the title to the per-experiment report page.
        link = self.output_link(current_run, 'index.html')
      elif self.parent is None:
        # Root: show the run ID and link to the run-level summary page.
        title = str(current_run.id)
        link = self.output_link(current_run, 'summary.html')
    return pg.Html.element(
        'div',
        [
            # Experiment ID.
            pg.views.html.controls.Label(
                title,
                link=link,
                tooltip=pg.format(  # pytype: disable=wrong-arg-types
                    self,
                    verbose=False,
                    use_inferred=True,
                    hide_default_values=True,
                    exclude_keys=(
                        'root_dir', 'plugins', 'progress', 'usage_summary'
                    ),
                ),
                css_classes=['experiment-name'],
            ),
            # Experiment directory (if root or leaf).
            pg.views.html.controls.Label(  # pylint: disable=g-long-ternary
                '[dir]',
                link=dir_link,
                css_classes=['experiment-dir'],
            ) if dir_link is not None else None,
            # Progress bar.
            self.progress.to_html(
                extra_flags=dict(interactive=interactive),
            ),
            # Usage summary,
            self.usage_summary.to_html(
                extra_flags=dict(as_badge=True, interactive=interactive)
            ),
        ],
        css_classes=['experiment-summary']
    )
529
+
530
+ def _html_tree_view_summary(
531
+ self,
532
+ *,
533
+ view,
534
+ name: str | None = None,
535
+ extra_flags: dict[str, Any] | None = None,
536
+ **kwargs
537
+ ):
538
+ extra_flags = extra_flags or {}
539
+ if not extra_flags.get('card_view', True):
540
+ return None
541
+
542
+ kwargs.pop('title', None)
543
+ kwargs.pop('enable_key_tooltip', None)
544
+ kwargs.pop('enable_summary_tooltip', None)
545
+ return view.summary(
546
+ self,
547
+ name=name if self.is_leaf else None,
548
+ title=self._html_tree_view_summary_title(
549
+ extra_flags.get('current_run', None),
550
+ extra_flags.get('interactive', True)
551
+ ),
552
+ enable_key_tooltip=False,
553
+ enable_summary_tooltip=False,
554
+ **kwargs
555
+ )
556
+
557
+ def _html_tree_view_content(
558
+ self,
559
+ *,
560
+ view,
561
+ collapse_level: int | None = 1,
562
+ extra_flags: dict[str, Any],
563
+ **kwargs):
564
+ return pg.Html.element(
565
+ 'div',
566
+ [
567
+ c.to_html(
568
+ collapse_level=view.get_collapse_level(
569
+ (collapse_level, -1), 0
570
+ ),
571
+ name=f'#{i + 1}',
572
+ extra_flags=extra_flags,
573
+ **view.get_passthrough_kwargs(**kwargs)
574
+ )
575
+ for i, c in enumerate(self.children)
576
+ ],
577
+ )
578
+
579
  def _html_tree_view_css_styles(self) -> list[str]:
    """Returns extra CSS used by the experiment HTML views."""
    return super()._html_tree_view_css_styles() + [
        """
        .experiment-summary {
          display: inline-block;
          font-weight: normal;
        }
        .experiment-name {
          font-weight: bold;
        }
        .experiment-dir.label {
          color: revert;
          margin-left: 0px;
          padding: 2px;
        }
        .usage-summary-badge {
          margin-left: 10px;
        }
        body {
          font: normal 16px "Roboto","Noto",sans-serif;
        }
        """
    ]
602
+
603
+
604
@pg.use_init_args(['children'])
class Suite(Experiment):
  """A suite of evaluations.

  A `Suite` is a non-leaf experiment node that simply groups its child
  experiments.
  """

  children: Annotated[
      list[Experiment], 'A list of child experiments.'
  ] = []

  @property
  def is_leaf(self) -> bool:
    """Always False: a suite groups children rather than evaluating."""
    return False
616
+
617
+
618
class RunId(pg.Object):
  """Structured representation of an experiment run ID.

  A run ID consists of a calendar date and a 1-based sequence number, and
  renders as `YYYYMMDD_<number>` (e.g. '20241031_1').
  """
  # Calendar date of the run.
  date: datetime.date
  # 1-based sequence number among runs on the same date.
  number: int

  # Matches the concrete string form `YYYYMMDD_<number>`.
  _REGEX = re.compile(r'^(\d{8})_(\d+)$')

  def dirname(self, root_dir: str | None = None) -> str:
    """Returns the directory name of the run ID, optionally under `root_dir`."""
    dir_name = f'run_{self}'
    if root_dir is None:
      return dir_name
    return os.path.join(root_dir, dir_name)

  def __str__(self) -> str:
    """Returns the string representation of the run ID."""
    return f'{self.date.strftime("%Y%m%d")}_{self.number}'

  def __lt__(self, other: 'RunId') -> bool:
    """Returns whether the run ID is less than the other (date, then number)."""
    return self.date < other.date or (
        self.date == other.date and self.number < other.number
    )

  def __le__(self, other: 'RunId') -> bool:
    """Returns whether the run ID is less than or equal to the other."""
    # BUGFIX: this method was previously misspelled `_le__`, which left the
    # `<=` operator undefined on RunId; renamed to the proper dunder.
    return self == other or self < other

  def __gt__(self, other: 'RunId') -> bool:
    """Returns whether the run ID is greater than the other."""
    return other < self

  def __ge__(self, other: 'RunId') -> bool:
    """Returns whether the run ID is greater than or equal to the other."""
    return self == other or self > other

  def next(self) -> 'RunId':
    """Returns the next run ID (same date, incremented number)."""
    return RunId(
        date=self.date,
        number=self.number + 1,
    )

  @classmethod
  def from_dirname(cls, dirname: str) -> Optional['RunId']:
    """Creates a run ID from a `run_<id>` directory name, or None."""
    if not dirname.startswith('run_'):
      return None
    run_id_str = dirname.removeprefix('run_')
    if cls.is_valid(run_id_str):
      return cls.from_id(run_id_str)
    return None

  @classmethod
  def is_valid(cls, run_id: str) -> bool:
    """Returns whether `run_id` is 'latest', 'new' or `YYYYMMDD_<number>`."""
    return run_id in ('latest', 'new') or bool(cls._REGEX.match(run_id))

  @classmethod
  def from_id(
      cls,
      run_id: str | None,
      root_dir: str | None = None
  ) -> 'RunId':
    """Creates a run ID from the string representation.

    Args:
      run_id: None, 'latest', 'new', or a concrete `YYYYMMDD_<number>` string.
      root_dir: Root directory to scan for previous runs; required when
        `run_id` is None, 'latest' or 'new'.

    Returns:
      The resolved run ID.

    Raises:
      ValueError: If `run_id` is malformed, `root_dir` is missing when
        required, or 'latest' is requested but no previous run exists.
    """
    if run_id is not None and not cls.is_valid(run_id):
      raise ValueError(
          '`run_id` must be one of `latest`, `new` and a '
          # BUGFIX: the format previously read `%Y%m%d%_<number>` (stray `%`).
          'datetime string in format `%Y%m%d_<number>` (e.g. 20240101_1). '
          f'Encountered: {run_id!r}.'
      )
    if run_id in (None, 'latest', 'new'):
      if root_dir is None:
        raise ValueError(
            '`root_dir` must be provided for `latest` or `new` run ID.'
        )
      if run_id == 'latest':
        run_id = cls.get_latest(root_dir)
        if run_id is None:
          raise ValueError(
              f'There are no previous runs under the root directory: '
              f'{root_dir}. Consider running the experiment using `new` as id.'
          )
        return run_id
      if run_id == 'new':
        return cls.new(root_dir)
      # `run_id` is None: resume the latest run, or start a new one.
      return cls.get_latest(root_dir) or cls.new()

    assert run_id is not None
    date_str, number_str = run_id.split('_')
    return cls(
        date=datetime.datetime.strptime(date_str, '%Y%m%d').date(),
        number=int(number_str),
    )

  @classmethod
  def get_latest(cls, root_dir: str) -> Optional['RunId']:
    """Returns the latest run ID under the root directory, or None."""
    if not pg.io.isdir(root_dir):
      return None
    run_ids = [
        RunId.from_dirname(dirname)
        for dirname in pg.io.listdir(root_dir)
    ]
    run_ids = [run_id for run_id in run_ids if run_id is not None]
    if not run_ids:
      return None
    return max(run_ids)

  @classmethod
  def new(cls, root_dir: str | None = None) -> 'RunId':
    """Creates a new run ID for today, continuing today's sequence if any."""
    latest = None if root_dir is None else cls.get_latest(root_dir)
    if latest is not None and latest.date == datetime.date.today():
      return latest.next()
    return cls(
        date=datetime.date.today(),
        number=1,
    )
737
+
738
+
739
class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
  """A run of an experiment."""

  root_dir: Annotated[
      str,
      'The root of the output directory of the experiment.'
  ]

  id: Annotated[
      RunId,
      'The ID of the current run.'
  ]

  experiment: Annotated[
      Experiment,
      'The root experiment to run.'
  ]

  warm_start_from: Annotated[
      str | None,
      'The directory for a previous run to warm start from.'
  ] = None

  # BUGFIX: the field doc previously ended with a dangling fragment
  # ("Though ") left over from an unfinished sentence.
  example_ids: Annotated[
      list[int] | None,
      'The example IDs to run. If None, it will run all examples.'
  ] = None

  raise_if_has_error: Annotated[
      bool,
      'If True, it will raise an error if any example fails.'
  ] = False

  note: Annotated[
      str | None,
      'The user note for the current run.'
  ] = None

  tags: Annotated[
      list[str],
      'The user tags for the current run.'
  ] = []

  refresh: Annotated[
      bool,
      (
          'If True, it will delete the data under the current '
          'run directory and start a new run.'
      )
  ] = False

  filter: Annotated[
      Callable[[Experiment], bool] | None,
      'A filter to decide whether a leaf experiment should be run or not.'
  ] = None

  process_timeout: Annotated[
      int | None,
      'Timeout for each evaluation example.'
  ] = None

  use_cache: Annotated[
      Literal['global', 'per_dataset', 'no'],
      (
          'The cache policy for the runner. If `global`, the runner will use '
          'the cache for all evaluations. If `per_dataset`, the runner will '
          'use the cache for each evaluation. If `no`, the runner will not '
          'use the cache.'
      )
  ] = 'per_dataset'

  @property
  def output_root(self) -> str:
    """Returns the output directory of this run (`<root_dir>/run_<id>`)."""
    return self.id.dirname(self.root_dir)

  @property
  def input_root(self) -> str:
    """Returns the input root directory.

    This is the warm-start directory when `warm_start_from` is set;
    otherwise it is the run's own output root.
    """
    return self.warm_start_from if self.warm_start_from else self.output_root

  def output_dir(self, experiment: Experiment) -> str:
    """Returns the output directory of the experiment."""
    if experiment.is_leaf:
      # Leaf dirs are nested as <experiment_cls>/<experiment_hash>.
      return os.path.join(self.output_root, experiment.id.replace('@', '/'))
    return self.output_root

  def input_dir(self, experiment: Experiment) -> str:
    """Returns the input directory of the experiment."""
    if experiment.is_leaf:
      return os.path.join(self.input_root, experiment.id.replace('@', '/'))
    return self.input_root

  def input_path_for(self, experiment: Experiment, relative_path: str) -> str:
    """Returns the input path for the experiment."""
    return os.path.join(self.input_dir(experiment), relative_path)

  def output_path_for(self, experiment: Experiment, relative_path: str) -> str:
    """Returns the output path for the experiment."""
    return os.path.join(self.output_dir(experiment), relative_path)
849
+
850
class Runner(pg.Object):
  """Interface for experiment runner."""

  # Class-level variable for registering the runner.
  NAME = None

  # Registry mapping a runner NAME to its class, populated by
  # `__init_subclass__`.
  _REGISTRY = {}

  current_run: Annotated[
      Run,
      'The current run.'
  ]

  plugins: Annotated[
      list['Plugin'],
      'The plugins for the runner.'
  ] = []

  def __init_subclass__(cls):
    """Auto-registers concrete runner subclasses under their NAME."""
    super().__init_subclass__()
    if inspect.isabstract(cls):
      return
    if cls.NAME is None:
      raise ValueError(
          'Runner class must define a NAME constant. '
          'Please use the same constant in the runner class.'
      )
    cls._REGISTRY[cls.NAME] = cls

  @abc.abstractmethod
  def run(self) -> None:
    """Runs an evaluation task."""

  @classmethod
  def create(cls, runner: str, **kwargs) -> 'Runner':
    """Creates a runner instance by registered name and kwargs."""
    runner_cls = cls._REGISTRY[runner]
    return runner_cls(**kwargs)
887
+
888
+
889
class Plugin(lf.Component):
  """Base class for experiment plugins.

  Subclasses override any subset of the hooks below; every default
  implementation is a no-op.
  """

  def on_run_start(self, runner: Runner, root: Experiment) -> None:
    """Called when a runner is started."""

  def on_run_complete(self, runner: Runner, root: Experiment) -> None:
    """Called when a runner is complete."""

  def on_run_abort(
      self, runner: Runner, root: Experiment, error: BaseException
  ) -> None:
    """Called when a runner is aborted."""

  def on_experiment_start(
      self, runner: Runner, experiment: Experiment
  ) -> None:
    """Called when an evaluation is started."""

  def on_experiment_skipped(
      self, runner: Runner, experiment: Experiment
  ) -> None:
    """Called when an experiment (both leaf and non-leaf) is skipped."""

  def on_experiment_complete(
      self, runner: Runner, experiment: Experiment
  ) -> None:
    """Called when an experiment (both leaf and non-leaf) is complete."""

  def on_example_start(
      self,
      runner: Runner,
      experiment: Experiment,
      example: example_lib.Example
  ) -> None:
    """Called when an example is about to be evaluated."""

  def on_example_complete(
      self,
      runner: Runner,
      experiment: Experiment,
      example: example_lib.Example
  ) -> None:
    """Called when an example is evaluated."""