langfun 0.1.2.dev202411110804__py3-none-any.whl → 0.1.2.dev202411120804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. langfun/core/console.py +10 -2
  2. langfun/core/console_test.py +17 -0
  3. langfun/core/eval/__init__.py +2 -0
  4. langfun/core/eval/v2/__init__.py +34 -0
  5. langfun/core/eval/v2/checkpointing.py +130 -0
  6. langfun/core/eval/v2/checkpointing_test.py +89 -0
  7. langfun/core/eval/v2/evaluation.py +615 -0
  8. langfun/core/eval/v2/evaluation_test.py +143 -0
  9. langfun/core/eval/v2/example.py +286 -0
  10. langfun/core/eval/v2/example_test.py +92 -0
  11. langfun/core/eval/v2/experiment.py +949 -0
  12. langfun/core/eval/v2/experiment_test.py +304 -0
  13. langfun/core/eval/v2/metric_values.py +156 -0
  14. langfun/core/eval/v2/metric_values_test.py +80 -0
  15. langfun/core/eval/v2/metrics.py +357 -0
  16. langfun/core/eval/v2/metrics_test.py +203 -0
  17. langfun/core/eval/v2/progress.py +348 -0
  18. langfun/core/eval/v2/progress_test.py +82 -0
  19. langfun/core/eval/v2/progress_tracking.py +209 -0
  20. langfun/core/eval/v2/progress_tracking_test.py +56 -0
  21. langfun/core/eval/v2/reporting.py +144 -0
  22. langfun/core/eval/v2/reporting_test.py +41 -0
  23. langfun/core/eval/v2/runners.py +417 -0
  24. langfun/core/eval/v2/runners_test.py +311 -0
  25. langfun/core/eval/v2/test_helper.py +78 -0
  26. langfun/core/language_model.py +122 -11
  27. langfun/core/language_model_test.py +97 -4
  28. langfun/core/llms/__init__.py +3 -0
  29. langfun/core/llms/compositional.py +101 -0
  30. langfun/core/llms/compositional_test.py +73 -0
  31. {langfun-0.1.2.dev202411110804.dist-info → langfun-0.1.2.dev202411120804.dist-info}/METADATA +1 -1
  32. {langfun-0.1.2.dev202411110804.dist-info → langfun-0.1.2.dev202411120804.dist-info}/RECORD +35 -11
  33. {langfun-0.1.2.dev202411110804.dist-info → langfun-0.1.2.dev202411120804.dist-info}/WHEEL +1 -1
  34. {langfun-0.1.2.dev202411110804.dist-info → langfun-0.1.2.dev202411120804.dist-info}/LICENSE +0 -0
  35. {langfun-0.1.2.dev202411110804.dist-info → langfun-0.1.2.dev202411120804.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/langfun/core/eval/v2/evaluation.py
@@ -0,0 +1,615 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Base class for Langfun evaluation tasks."""
+
+import abc
+import functools
+import time
+
+from typing import Annotated, Any, Callable, Iterable
+import langfun.core as lf
+import langfun.core.coding as lf_coding
+
+from langfun.core.eval.v2 import example as example_lib
+from langfun.core.eval.v2 import experiment as experiment_lib
+from langfun.core.eval.v2 import metric_values as metric_values_lib
+from langfun.core.eval.v2 import metrics as metrics_lib
+
+import pyglove as pg
+
+
+class Evaluation(experiment_lib.Experiment):
+  """Evaluation.
+
+  An evaluation can be a leaf node or a container of other evaluations,
+  depending on whether the current evaluation object is configured with
+  any `pg.oneof`.
+
+  For example, `MyEval(lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini1_5Pro()]))`
+  is a container of two sub-experiments, one for each LLM. In such a case, the
+  evaluation object with `pg.oneof` is called a hyper evaluation, which
+  represents a search space of evaluations, and each sub-evaluation is called
+  a leaf evaluation, which will perform the actual evaluation.
+  """
+
+  inputs: Annotated[
+      pg.Functor,
+      'A functor that returns a list of inputs.'
+  ]
+
+  metrics: Annotated[
+      list[metrics_lib.Metric],
+      'The metrics to be evaluated.'
+  ]
+
+  max_workers: Annotated[
+      int,
+      'The maximum number of workers to use for the evaluation.'
+  ] = 32
+
+  def _on_bound(self):
+    # Invalidate cached properties.
+    self.__dict__.pop('is_leaf', None)
+    self.__dict__.pop('children', None)
+    super()._on_bound()
+
+  #
+  # Handling evaluation hierarchy (materialized vs. hyper evaluations).
+  #
+
+  @functools.cached_property
+  def is_leaf(self) -> bool:
+    """Returns whether the task is a leaf."""
+    return self.is_deterministic
+
+  @functools.cached_property
+  def children(self) -> list['Evaluation']:
+    """Returns the children tasks."""
+    if self.is_leaf:
+      return []
+    children = []
+    for i, child in enumerate(pg.iter(self)):
+      child.sym_setparent(self)
+      child.sym_setpath(self.sym_path + 'children' + i)
+      children.append(child)
+    return children
+
+  #
+  # Handling evaluation inputs.
+  #
+
+  @functools.cached_property
+  def example_inputs(self) -> Iterable[Any]:
+    """Returns the examples from the inputs."""
+    return self.inputs()
+
+  def example_input_by_id(self, example_id: int) -> Any:
+    """Returns the example from the inputs by ID."""
+    assert example_id <= len(self.example_inputs), example_id
+    return self._example_input_by_id[example_id]
+
+  @functools.cached_property
+  def _example_input_by_id(self) -> dict[int, Any]:
+    """Returns the examples from the inputs by ID."""
+    return {i + 1: v for i, v in enumerate(self.example_inputs)}
+
+  @property
+  def num_examples(self) -> int:
+    """Returns the number of examples from the inputs."""
+    # NOTE(daiyip): setting `num_examples` of the input functor allows fast
+    # retrieval of the number of examples without iterating the whole dataset.
+    return getattr(self.inputs, 'num_examples', len(self.example_inputs))
+
+  #
+  # Evaluation logics.
+  #
+
+  @abc.abstractmethod
+  def process(self, example_input: Any) -> Any | tuple[Any, dict[str, Any]]:
+    """Processes a single example from the evaluation set.
+
+    Users should override this method to implement the evaluation logic.
+
+    Args:
+      example_input: An object returned from `Evaluable.inputs`.
+
+    Returns:
+      A processed output. Or a tuple of (output, metadata).
+      The output will be used for computing the metrics, and the metadata will
+      be included in the evaluation HTML view.
+    """
+
+  def evaluate(
+      self,
+      example: example_lib.Example | int,
+      raise_if_has_error: bool = False,
+  ) -> example_lib.Example:
+    """Evaluates a single example input.
+
+    Args:
+      example: An example ID or an example object with ID.
+      raise_if_has_error: Whether to raise an error if the example has error.
+
+    Returns:
+      The evaluated example with the output and metric metadata populated.
+    """
+    if isinstance(example, int):
+      example = example_lib.Example(id=example)
+    assert isinstance(example, example_lib.Example), example
+
+    if pg.MISSING_VALUE == example.input:
+      example.input = self.example_input_by_id(example.id)
+
+    cached = self._state.get(example.id)
+
+    with pg.timeit('evaluate') as timeit, lf.track_usages() as usage_summary:
+      if cached is None or cached.has_error:
+        example.start_time = time.time()
+        self._process(example, raise_if_has_error=raise_if_has_error)
+      else:
+        example.start_time = cached.start_time
+
+        # Use cached output and metadata obtained from the previous processing.
+        example.output = cached.output
+        example.metadata = cached.metadata
+        example.newly_processed = False
+
+        # For previously processed examples, we merge previous usages as
+        # cached, so the usage summary will account previous usages, but as
+        # cached.
+        assert cached.usage_summary is not None
+        usage_summary.merge(cached.usage_summary, as_cached=True)
+
+      # Recompute the metrics and metadata for the example even if its
+      # processed output and metadata were from the cache.
+      # NOTE(daiyip): It's possible that metrics could use LLMs, so we need to
+      # track the usage of the metrics separately.
+      with pg.timeit('metric'):
+        metric_metadata = {}
+        for metric in self.metrics:
+          metric_metadata.update(metric.audit(example))
+        example.metric_metadata = metric_metadata
+
+    # For previously processed examples, we keep the execution status for the
+    # processing step.
+    execution_status = dict(example.execution_status or {})
+    execution_status.update(timeit.status())
+
+    example.execution_status = execution_status
+    example.usage_summary = usage_summary
+    if example.newly_processed:
+      example.end_time = time.time()
+
+    self._state.update(example)
+    return example
+
+  def _process(
+      self,
+      example: example_lib.Example,
+      raise_if_has_error: bool = False
+  ) -> None:
+    """Processes a single example."""
+    with (
+        pg.notify_on_change(False),
+        pg.allow_writable_accessors(True),
+        # NOTE(daiyip): set the `input` symbol of the globals to None, so
+        # LLM generated code with calls to `input` will raise an error, thus
+        # not blocking the evaluation.
+        lf_coding.context(input=None),
+    ):
+      try:
+        with pg.timeit('process'):
+          output = self.process(example.input)
+          if (isinstance(output, tuple)
+              and len(output) == 2
+              and isinstance(output[1], dict)):
+            output, metadata = output
+          else:
+            metadata = {}
+          example.output = output
+          example.metadata = metadata
+      except BaseException as e:  # pylint: disable=broad-except
+        if raise_if_has_error:
+          raise
+        example.error = pg.object_utils.ErrorInfo.from_exception(e)
+
+  #
+  # Handling evaluation scheduling.
+  #
+
+  def resource_ids(self) -> set[str]:
+    """Returns a set of resource IDs required by this evaluation.
+
+    Resource IDs are used by the runner to determine which evaluations can
+    be run in parallel. Evaluations using the same resource key will be run
+    sequentially.
+
+    Returns:
+      A set of resource IDs representing the resources required.
+    """
+    return {
+        v.resource_id for _, v in self.sym_init_args.items()
+        if isinstance(v, lf.LanguageModel)
+    }
+
+  #
+  # Handling evaluation state.
+  #
+
+  def load_state(
+      self, state_file: str, raise_if_not_exist: bool = False
+  ) -> None:
+    """Loads saved state from a sequence IO file."""
+    if pg.io.path_exists(state_file):
+      self._state.load(state_file, self.example_input_by_id)
+    elif raise_if_not_exist:
+      raise ValueError(f'State file {state_file} does not exist.')
+
+  def _reset(self) -> None:
+    """Resets the state of the evaluation."""
+    super()._reset()
+    if self.is_leaf:
+      # Create a new state for the leaf evaluation.
+      self._state = EvaluationState()
+      for metric in self.metrics:
+        metric.reset()
+
+  #
+  # HTML views.
+  #
+
+  def _html_tree_view_content(
+      self, *, view, extra_flags: dict[str, Any] | None, **kwargs
+  ):
+    if not self.is_leaf:
+      return super()._html_tree_view_content(
+          view=view, extra_flags=extra_flags, **kwargs
+      )
+
+    extra_flags = extra_flags or {}
+    run = extra_flags.pop('current_run', None)
+    if extra_flags.pop('card_view', True):
+      return self._summary_card_view(
+          extra_flags.get('interactive', True), run
+      )
+    assert run is not None
+    return self._details_view(run)
+
+  def _parameter_badge(self, key, value) -> pg.Html.WritableTypes:
+    """Renders a badge for a parameter."""
+    face_value = pg.format(
+        value,
+        compact=True,
+        python_format=True,
+        hide_default_values=True,
+        use_inferred=True
+    )
+    short_text = face_value
+    if len(face_value) > 40:
+      short_text = f'{type(value).__name__}(...)'
+    label = f'{key.split(".")[-1]}: {short_text}'
+    tooltip = f'{key}: {face_value}'
+    return pg.views.html.controls.Badge(
+        text=label,
+        tooltip=tooltip,
+        css_classes=['parameter'],
+        interactive=False,
+    )
+
+  def _summary_card_view(
+      self,
+      interactive: bool = True,
+      run: experiment_lib.Run | None = None,
+  ) -> pg.Html.WritableTypes:
+    """Renders the summary card view of the evaluation."""
+    del run
+    return pg.Html(
+        pg.Html.element(
+            'div',
+            [
+                pg.views.html.controls.LabelGroup([
+                    self._parameter_badge(k, v)
+                    for k, v in self.non_default_values(
+                        flatten=True
+                    ).items()
+                ], css_classes=['parameter-group']),
+                pg.Html.element(
+                    'div',
+                    [
+                        m.to_html(
+                            extra_flags=dict(
+                                interactive=interactive,
+                            )
+                        )
+                        for m in self.metrics
+                    ],
+                    css_classes=['metric-group'],
+                ),
+            ],
+            css_classes=['badge-groups'],
+        )
+    )
+
+  def _details_view(
+      self, run: experiment_lib.Run
+  ) -> pg.Html:
+    """Renders the details view of the evaluation."""
+
+    def _title():
+      return pg.Html.element(
+          'div',
+          [
+              pg.views.html.controls.LabelGroup(
+                  [
+                      pg.views.html.controls.Label(
+                          'Summary',
+                          link=run.experiment.output_link(run, 'summary.html'),
+                          css_classes=['summary-link'],
+                      ),
+                      '|',
+                      pg.views.html.controls.Label(
+                          'Directory',
+                          link=self.output_link(run, ''),
+                          css_classes=['dir-link'],
+                      ),
+                  ],
+                  css_classes=['experiment-links'],
+              ),
+              pg.views.html.controls.Label(
+                  self.id,
+                  css_classes=['experiment-id'],
+              ),
+              self.progress.to_html(
+                  extra_flags=dict(interactive=False),
+              ),
+              self.usage_summary.to_html(
+                  extra_flags=dict(as_badge=True, interactive=False),
+              ),
+          ]
+      )
+
+    def _parameter_badges():
+      """Renders a tab group for a metric (group)."""
+      return pg.views.html.controls.LabelGroup(
+          [
+              self._parameter_badge(k, v)
+              for k, v in self.non_default_values(flatten=True).items()
+          ],
+          css_classes=['parameter-group'],
+      )
+
+    def _definition_tab() -> pg.views.html.controls.Tab:
+      """Renders a tab for the definition of the evaluation."""
+      return pg.views.html.controls.Tab(
+          label='Definition',
+          content=pg.Html.element(
+              'div',
+              [
+                  pg.views.html.controls.Label(
+                      pg.format(
+                          self,
+                          compact=False,
+                          verbose=False,
+                          use_inferred=True,
+                          hide_frozen=True,
+                          exclude_keys=set(['progress', 'usage_summary'])
+                      ),
+                      css_classes=['eval-definition'],
+                  ),
+              ]
+          )
+      )
+
+    def _metric_tab(metric: metrics_lib.Metric) -> pg.views.html.controls.Tab:
+      """Renders a tab for a metric (group)."""
+      return pg.views.html.controls.Tab(
+          label=f'Metric: {metric.name}',
+          content=pg.Html.element(
+              'div',
+              [
+                  metric.to_html(
+                      extra_flags=dict(
+                          interactive=False,
+                      )
+                  ),
+                  pg.views.html.controls.TabControl(
+                      tabs=[
+                          _metric_value_tab(mv)
+                          for mv in metric.values()
+                      ]
+                  )
+              ]
+          )
+      )
+
+    def _metric_value_tab(
+        metric_value: metric_values_lib.MetricValue
+    ) -> pg.views.html.controls.Tab:
+      """Renders the example links for a metric value."""
+      return pg.views.html.controls.Tab(
+          label=metric_value.sym_path.key,
+          content=pg.Html.element(
+              'div',
+              [
+                  pg.views.html.controls.Label(
+                      str(dp.example_id),
+                      link=self.output_link(run, f'{dp.example_id}.html'),
+                      target='example-view',
+                      css_classes=['example-link'],
+                  )
+                  for dp in metric_value.data_points
+              ]
+          )
+      )
+
+    def _main_tabs() -> pg.Html:
+      return pg.Html.element(
+          'div',
+          [
+              pg.views.html.controls.TabControl(
+                  [
+                      _definition_tab(),
+                  ] + [
+                      _metric_tab(m) for m in self.metrics
+                  ],
+                  selected=1,
+              )
+          ],
+      )
+
+    return pg.Html.element(
+        'div',
+        [
+            _title(),
+            _parameter_badges(),
+            _main_tabs(),
+            pg.Html.element(
+                'iframe', [],
+                name='example-view',
+                src='./1.html',
+                title='Example view.',
+                css_classes=['example-view'],
+            ),
+        ],
+        css_classes=['eval-details'],
+    )
+
+  def _html_tree_view_config(self) -> dict[str, Any]:
+    return dict(
+        css_classes=['eval-card'] if self.is_leaf else None
+    )
+
+  def _html_tree_view_css_styles(self) -> list[str]:
+    return super()._html_tree_view_css_styles() + [
+        """
+        details.eval-card {
+          display: inline-block;
+          border: 0px;
+          box-shadow: rgba(0, 0, 0, 0.16) 0px 1px 4px;
+          margin: 15px;
+        }
+        .eval-card details {
+          border: 0px;
+        }
+        .badge-groups {
+          font-weight: normal;
+          padding: 5px;
+        }
+        .parameter-group {
+          display: inline-grid;
+          grid-template-rows: auto auto;
+          border: 0px;
+          margin-right: 10px;
+        }
+        .parameter.badge {
+          margin: 2px;
+        }
+        .metric-group {
+          display: inline-grid;
+          grid-template-rows: auto auto;
+        }
+        .eval-details .progress-bar > .shade {
+          visibility: hidden;
+          width: 0px;
+          margin: 0px;
+        }
+        .eval-details .progress-label {
+          font-size: 16px;
+          background-color: #eee;
+        }
+        .eval-details .progress-time {
+          font-size: 16px;
+          color: dodgerblue;
+          background-color: #eee;
+          margin-right: 10px;
+        }
+        .eval-details .usage-summary.badge {
+          color: orange;
+          font-size: 16px;
+          background-color: #eee;
+        }
+        .eval-details .experiment-links {
+          display: block;
+          border: 0px;
+          margin: 0px;
+        }
+        .eval-details .tab-button {
+          font-size: large;
+        }
+        .experiment-links .label {
+          color: revert;
+          margin: 0px;
+          padding: 2px;
+        }
+        .eval-details .experiment-id {
+          font-size: 2.0em;
+          font-weight: bold;
+          display: block;
+        }
+        .eval-details .parameter-group {
+          display: inline-block;
+          padding: 5px;
+        }
+        .eval-definition {
+          white-space: pre;
+          background-color: #eee;
+          padding: 15px;
+        }
+        .eval-details .metric-container {
+          display: block;
+          padding: 15px 0px;
+        }
+        .example-link {
+          color: revert;
+        }
+        .example-view {
+          border: 0px;
+          width:100%;
+          height:100%;
+        }
+        """
+    ]
+
+
+class EvaluationState:
+  """Evaluation state."""
+
+  def __init__(self):
+    super().__init__()
+    self._evaluated_examples: dict[int, example_lib.Example] = {}
+
+  def load(
+      self, state_file: str, example_input_by_id: Callable[[int], Any]) -> None:
+    """Loads the state from the example sequence file."""
+    with pg.io.sequence.open_sequence(state_file) as f:
+      for record in f:
+        example = pg.from_json_str(
+            record, example_input_by_id=example_input_by_id
+        )
+        assert isinstance(example, example_lib.Example), example
+        self._evaluated_examples[example.id] = example
+
+  def get(self, example_id: int) -> example_lib.Example | None:
+    """Returns the example with the given ID."""
+    return self._evaluated_examples.get(example_id)
+
+  def update(self, example: example_lib.Example) -> None:
+    """Updates the state with the given example."""
+    self._evaluated_examples[example.id] = example
+
+  @property
+  def evaluated_examples(self) -> dict[int, example_lib.Example]:
+    """Returns the examples in the state."""
+    return self._evaluated_examples
+
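
To make the new base class concrete, below is a minimal usage sketch. It is illustrative only and not part of the package diff above: `MyEval`, `my_inputs`, the use of `lf.query`, and the fake models `lf.llms.Echo()` / `lf.llms.StaticResponse()` are assumptions; concrete metric implementations ship in the new metrics.py listed above, and full experiments are normally executed through the runners in runners.py rather than by calling `evaluate()` directly.

import langfun as lf
import pyglove as pg

from langfun.core.eval.v2 import evaluation as evaluation_lib


@pg.functor()
def my_inputs():
  # `inputs` is a pg.Functor that returns the example inputs to evaluate.
  return [
      dict(question='1 + 1 = ?', expected='2'),
      dict(question='2 + 2 = ?', expected='4'),
  ]


class MyEval(evaluation_lib.Evaluation):
  """Hypothetical leaf evaluation (not part of the package)."""

  lm: lf.LanguageModel

  def process(self, example_input):
    # `process` may return the output alone, or (output, metadata): the output
    # feeds the metrics, the metadata shows up in the HTML report.
    answer = lf.query(example_input['question'], lm=self.lm)
    return answer, dict(question=example_input['question'])


# A fully specified evaluation (no `pg.oneof`) is a leaf.
# Metrics are left empty here; a real setup would pass Metric instances
# from langfun.core.eval.v2.metrics.
leaf = MyEval(inputs=my_inputs(), metrics=[], lm=lf.llms.Echo())
assert leaf.is_leaf

# With `pg.oneof`, the same class becomes a hyper evaluation: a search space
# whose `children` are the materialized leaf evaluations, one per candidate.
hyper = MyEval(
    inputs=my_inputs(),
    metrics=[],
    lm=pg.oneof([lf.llms.Echo(), lf.llms.StaticResponse('4')]),
)
assert not hyper.is_leaf and len(hyper.children) == 2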