langfun 0.1.2.dev202411090804__py3-none-any.whl → 0.1.2.dev202411140804__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.

This release of langfun has been flagged as potentially problematic.

Files changed (36)
  1. langfun/core/console.py +10 -2
  2. langfun/core/console_test.py +17 -0
  3. langfun/core/eval/__init__.py +2 -0
  4. langfun/core/eval/v2/__init__.py +38 -0
  5. langfun/core/eval/v2/checkpointing.py +135 -0
  6. langfun/core/eval/v2/checkpointing_test.py +89 -0
  7. langfun/core/eval/v2/evaluation.py +627 -0
  8. langfun/core/eval/v2/evaluation_test.py +156 -0
  9. langfun/core/eval/v2/example.py +295 -0
  10. langfun/core/eval/v2/example_test.py +114 -0
  11. langfun/core/eval/v2/experiment.py +949 -0
  12. langfun/core/eval/v2/experiment_test.py +304 -0
  13. langfun/core/eval/v2/metric_values.py +156 -0
  14. langfun/core/eval/v2/metric_values_test.py +80 -0
  15. langfun/core/eval/v2/metrics.py +357 -0
  16. langfun/core/eval/v2/metrics_test.py +203 -0
  17. langfun/core/eval/v2/progress.py +348 -0
  18. langfun/core/eval/v2/progress_test.py +82 -0
  19. langfun/core/eval/v2/progress_tracking.py +209 -0
  20. langfun/core/eval/v2/progress_tracking_test.py +56 -0
  21. langfun/core/eval/v2/reporting.py +144 -0
  22. langfun/core/eval/v2/reporting_test.py +41 -0
  23. langfun/core/eval/v2/runners.py +417 -0
  24. langfun/core/eval/v2/runners_test.py +311 -0
  25. langfun/core/eval/v2/test_helper.py +80 -0
  26. langfun/core/language_model.py +122 -11
  27. langfun/core/language_model_test.py +97 -4
  28. langfun/core/llms/__init__.py +3 -0
  29. langfun/core/llms/compositional.py +101 -0
  30. langfun/core/llms/compositional_test.py +73 -0
  31. langfun/core/llms/vertexai.py +4 -4
  32. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/METADATA +1 -1
  33. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/RECORD +36 -12
  34. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/WHEEL +1 -1
  35. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/LICENSE +0 -0
  36. {langfun-0.1.2.dev202411090804.dist-info → langfun-0.1.2.dev202411140804.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/evaluation.py (new file)
@@ -0,0 +1,627 @@
+ # Copyright 2024 The Langfun Authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Base class for Langfun evaluation tasks."""
+
+ import abc
+ import functools
+ import time
+
+ from typing import Annotated, Any, Callable, Iterable
+ import langfun.core as lf
+ import langfun.core.coding as lf_coding
+
+ from langfun.core.eval.v2 import example as example_lib
+ from langfun.core.eval.v2 import experiment as experiment_lib
+ from langfun.core.eval.v2 import metric_values as metric_values_lib
+ from langfun.core.eval.v2 import metrics as metrics_lib
+
+ import pyglove as pg
+
+
+ class Evaluation(experiment_lib.Experiment):
+   """Evaluation.
+
+   An evaluation can be a leaf node or a container of other evaluations,
+   depending on whether the current evaluation object is configured with
+   any `pg.oneof`.
+
+   For example, `MyEval(lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini1_5Pro()]))`
+   is a container of two sub-experiments, one for each LLM. In such case, the
+   evaluation object with `pg.oneof` is called a hyper evaluation, which
+   represents a search space of evaluations, and each sub-evaluation is called
+   a leaf evaluation, which will perform the actual evaluation.
+   """
+
+   inputs: Annotated[
+       pg.Functor,
+       'A functor that returns a list of inputs.'
+   ]
+
+   metrics: Annotated[
+       list[metrics_lib.Metric],
+       'The metrics to be evaluated.'
+   ]
+
+   max_workers: Annotated[
+       int,
+       'The maximum number of workers to use for the evaluation.'
+   ] = 32
+
+   def _on_bound(self):
+     # Invalidate cached properties.
+     self.__dict__.pop('is_leaf', None)
+     self.__dict__.pop('children', None)
+     super()._on_bound()
+
+   #
+   # Handling evaluation hierarchy (materialized vs. hyper evaluations).
+   #
+
+   @functools.cached_property
+   def is_leaf(self) -> bool:
+     """Returns whether the task is a leaf."""
+     return self.is_deterministic
+
+   @functools.cached_property
+   def children(self) -> list['Evaluation']:
+     """Returns the children tasks."""
+     if self.is_leaf:
+       return []
+     children = []
+     for i, child in enumerate(pg.iter(self)):
+       child.sym_setparent(self)
+       child.sym_setpath(self.sym_path + 'children' + i)
+       children.append(child)
+     return children
+
+   #
+   # Handling evaluation inputs.
+   #
+
+   @functools.cached_property
+   def example_inputs(self) -> Iterable[Any]:
+     """Returns the examples from the inputs."""
+     return self.inputs()
+
+   def example_input_by_id(self, example_id: int) -> Any:
+     """Returns the example from the inputs by ID."""
+     assert example_id <= len(self.example_inputs), example_id
+     return self._example_input_by_id[example_id]
+
+   @functools.cached_property
+   def _example_input_by_id(self) -> dict[int, Any]:
+     """Returns the examples from the inputs by ID."""
+     return {i + 1: v for i, v in enumerate(self.example_inputs)}
+
+   @property
+   def num_examples(self) -> int:
+     """Returns the number of examples from the inputs."""
+     # NOTE(daiyip): setting `num_examples` of the input functor allows fast
+     # retrieval of number of examples without iterating the whole dataset.
+     num_examples = getattr(self.inputs, 'num_examples', None)
+     if not isinstance(num_examples, int):
+       it = self.example_inputs
+       if hasattr(it, '__len__'):
+         num_examples = len(it)
+       else:
+         num_examples = len(list(it))
+     return num_examples
+
+   #
+   # Evaluation logics.
+   #
+
+   @abc.abstractmethod
+   def process(self, example_input: Any) -> Any | tuple[Any, dict[str, Any]]:
+     """Processes a single example from the evaluation set.
+
+     Users should override this method to implement the evaluation logic.
+
+     Args:
+       example_input: An object returned from `Evaluable.inputs`.
+
+     Returns:
+       A processed output. Or a tuple of (output, metadata).
+       The output will be used for computing the metrics, and the metadata will
+       be included in the evaluation HTML view.
+     """
+
+   def evaluate(
+       self,
+       example: example_lib.Example | int,
+       raise_if_has_error: bool = False,
+   ) -> example_lib.Example:
+     """Evaluates a single example input.
+
+     Args:
+       example: An example ID or an example object with ID.
+       raise_if_has_error: Whether to raise an error if the example has error.
+
+     Returns:
+       The evaluated example with the output and metric metadata populated.
+     """
+     if isinstance(example, int):
+       example = example_lib.Example(id=example)
+     assert isinstance(example, example_lib.Example), example
+
+     if pg.MISSING_VALUE == example.input:
+       example.input = self.example_input_by_id(example.id)
+
+     cached = self._state.get(example.id)
+
+     with pg.timeit('evaluate') as timeit, lf.track_usages() as usage_summary:
+       if cached is None or cached.has_error:
+         example.start_time = time.time()
+         self._process(example, raise_if_has_error=raise_if_has_error)
+       else:
+         example.start_time = cached.start_time
+
+         # Use cached output and metadata obtained from the previous processing.
+         example.output = cached.output
+         example.metadata = cached.metadata
+         example.newly_processed = False
+
+         # For previously processed examples, we merge previous usages as
+         # cached, so the usage summary will account previous usages, but as
+         # cached.
+         assert cached.usage_summary is not None
+         usage_summary.merge(cached.usage_summary, as_cached=True)
+
+       # Recompute the metrics and metadata for the example even its processed
+       # output and metadata were from the cache.
+       # NOTE(daiyip): It's possible that metrics could use LLMs, so we need to
+       # track the usage of the metrics separately.
+       with pg.timeit('metric'):
+         metric_metadata = {}
+         for metric in self.metrics:
+           metric_metadata.update(metric.audit(example))
+         example.metric_metadata = metric_metadata
+
+     # For previously processed examples, we keep the execution status for the
+     # processing step.
+     execution_status = dict(example.execution_status or {})
+     execution_status.update(timeit.status())
+
+     example.execution_status = execution_status
+     example.usage_summary = usage_summary
+     if example.newly_processed:
+       example.end_time = time.time()
+
+     self._state.update(example)
+     return example
+
+   def _process(
+       self,
+       example: example_lib.Example,
+       raise_if_has_error: bool = False
+   ) -> None:
+     """Processes a single example."""
+     with (
+         pg.notify_on_change(False),
+         pg.allow_writable_accessors(True),
+         # NOTE(daiyip): set the `input` symbol of the globals to None, so
+         # LLM generated code with calls to `input` will raise an error, thus
+         # not blocking the evaluation.
+         lf_coding.context(input=None),
+     ):
+       try:
+         with pg.timeit('process'):
+           output = self.process(example.input)
+           if (isinstance(output, tuple)
+               and len(output) == 2
+               and isinstance(output[1], dict)):
+             output, metadata = output
+           else:
+             metadata = {}
+           example.output = output
+           example.metadata = metadata
+       except BaseException as e:  # pylint: disable=broad-except
+         if raise_if_has_error:
+           raise
+         example.error = pg.object_utils.ErrorInfo.from_exception(e)
+
+   #
+   # Handling evaluation scheduling.
+   #
+
+   def resource_ids(self) -> set[str]:
+     """Returns a set of resource IDs required by this evaluation.
+
+     Resource IDs are used to by the runner to determine which evaluations can
+     be run in parallel. Evaluations using the same resource key will be run
+     sequentially.
+
+     Returns:
+       A unique string representing the resource required.
+     """
+     return {
+         v.resource_id for _, v in self.sym_init_args.items()
+         if isinstance(v, lf.LanguageModel)
+     }
+
+   #
+   # Handling evaluation state.
+   #
+
+   @property
+   def state(self) -> 'EvaluationState':
+     """Returns the state of the evaluation."""
+     return self._state
+
+   def load_state(
+       self, state_file: str, raise_if_not_exist: bool = False
+   ) -> None:
+     """Loads saved state from a sequence IO file."""
+     if pg.io.path_exists(state_file):
+       self._state.load(state_file, self.example_input_by_id)
+     elif raise_if_not_exist:
+       raise ValueError(f'State file {state_file} does not exist.')
+
+   def _reset(self) -> None:
+     """Resets the state of the evaluation."""
+     super()._reset()
+     if self.is_leaf:
+       # Create a new state for the leaf evaluation.
+       self._state = EvaluationState()
+       for metric in self.metrics:
+         metric.reset()
+
+   #
+   # HTML views.
+   #
+
+   def _html_tree_view_content(
+       self, *, view, extra_flags: dict[str, Any] | None, **kwargs
+   ):
+     if not self.is_leaf:
+       return super()._html_tree_view_content(
+           view=view, extra_flags=extra_flags, **kwargs
+       )
+
+     extra_flags = extra_flags or {}
+     run = extra_flags.pop('current_run', None)
+     if extra_flags.pop('card_view', True):
+       return self._summary_card_view(
+           extra_flags.get('interactive', True), run
+       )
+     assert run is not None
+     return self._details_view(run)
+
+   def _parameter_badge(self, key, value) -> pg.Html.WritableTypes:
+     """Renders a badge for a parameter."""
+     face_value = pg.format(
+         value,
+         compact=True,
+         python_format=True,
+         hide_default_values=True,
+         use_inferred=True
+     )
+     short_text = face_value
+     if len(face_value) > 40:
+       short_text = f'{type(value).__name__}(...)'
+     label = f'{key.split(".")[-1]}: {short_text}'
+     tooltip = f'{key}: {face_value}'
+     return pg.views.html.controls.Badge(
+         text=label,
+         tooltip=tooltip,
+         css_classes=['parameter'],
+         interactive=False,
+     )
+
+   def _summary_card_view(
+       self,
+       interactive: bool = True,
+       run: experiment_lib.Run | None = None,
+   ) -> pg.Html.WritableTypes:
+     """Renders the summary card view of the evaluation."""
+     del run
+     return pg.Html(
+         pg.Html.element(
+             'div',
+             [
+                 pg.views.html.controls.LabelGroup([
+                     self._parameter_badge(k, v)
+                     for k, v in self.non_default_values(
+                         flatten=True
+                     ).items()
+                 ], css_classes=['parameter-group']),
+                 pg.Html.element(
+                     'div',
+                     [
+                         m.to_html(
+                             extra_flags=dict(
+                                 interactive=interactive,
+                             )
+                         )
+                         for m in self.metrics
+                     ],
+                     css_classes=['metric-group'],
+                 ),
+             ],
+             css_classes=['badge-groups'],
+         )
+     )
+
+   def _details_view(
+       self, run: experiment_lib.Run
+   ) -> pg.Html:
+     """Renders the details view of the evaluation."""
+
+     def _title():
+       return pg.Html.element(
+           'div',
+           [
+               pg.views.html.controls.LabelGroup(
+                   [
+                       pg.views.html.controls.Label(
+                           'Summary',
+                           link=run.experiment.output_link(run, 'summary.html'),
+                           css_classes=['summary-link'],
+                       ),
+                       '|',
+                       pg.views.html.controls.Label(
+                           'Directory',
+                           link=self.output_link(run, ''),
+                           css_classes=['dir-link'],
+                       ),
+                   ],
+                   css_classes=['experiment-links'],
+               ),
+               pg.views.html.controls.Label(
+                   self.id,
+                   css_classes=['experiment-id'],
+               ),
+               self.progress.to_html(
+                   extra_flags=dict(interactive=False),
+               ),
+               self.usage_summary.to_html(
+                   extra_flags=dict(as_badge=True, interactive=False),
+               ),
+           ]
+       )
+
+     def _parameter_badges():
+       """Renders a tab group for a metric (group)."""
+       return pg.views.html.controls.LabelGroup(
+           [
+               self._parameter_badge(k, v)
+               for k, v in self.non_default_values(flatten=True).items()
+           ],
+           css_classes=['parameter-group'],
+       )
+
+     def _definition_tab() -> pg.views.html.controls.Tab:
+       """Renders a tab for the definition of the evaluation."""
+       return pg.views.html.controls.Tab(
+           label='Definition',
+           content=pg.Html.element(
+               'div',
+               [
+                   pg.views.html.controls.Label(
+                       pg.format(
+                           self,
+                           compact=False,
+                           verbose=False,
+                           use_inferred=True,
+                           hide_frozen=True,
+                           exclude_keys=set(['progress', 'usage_summary'])
+                       ),
+                       css_classes=['eval-definition'],
+                   ),
+               ]
+           )
+       )
+
+     def _metric_tab(metric: metrics_lib.Metric) -> pg.views.html.controls.Tab:
+       """Renders a tab for a metric (group)."""
+       return pg.views.html.controls.Tab(
+           label=f'Metric: {metric.name}',
+           content=pg.Html.element(
+               'div',
+               [
+                   metric.to_html(
+                       extra_flags=dict(
+                           interactive=False,
+                       )
+                   ),
+                   pg.views.html.controls.TabControl(
+                       tabs=[
+                           _metric_value_tab(mv)
+                           for mv in metric.values()
+                       ]
+                   )
+               ]
+           )
+       )
+
+     def _metric_value_tab(
+         metric_value: metric_values_lib.MetricValue
+     ) -> pg.views.html.controls.Tab:
+       """Renders the example links for a metric value."""
+       return pg.views.html.controls.Tab(
+           label=metric_value.sym_path.key,
+           content=pg.Html.element(
+               'div',
+               [
+                   pg.views.html.controls.Label(
+                       str(dp.example_id),
+                       link=self.output_link(run, f'{dp.example_id}.html'),
+                       target='example-view',
+                       css_classes=['example-link'],
+                   )
+                   for dp in metric_value.data_points
+               ]
+           )
+       )
+
+     def _main_tabs() -> pg.Html:
+       return pg.Html.element(
+           'div',
+           [
+               pg.views.html.controls.TabControl(
+                   [
+                       _definition_tab(),
+                   ] + [
+                       _metric_tab(m) for m in self.metrics
+                   ],
+                   selected=1,
+               )
+           ],
+       )
+
+     return pg.Html.element(
+         'div',
+         [
+             _title(),
+             _parameter_badges(),
+             _main_tabs(),
+             pg.Html.element(
+                 'iframe', [],
+                 name='example-view',
+                 src='./1.html',
+                 title='Example view.',
+                 css_classes=['example-view'],
+             ),
+         ],
+         css_classes=['eval-details'],
+     )
+
+   def _html_tree_view_config(self) -> dict[str, Any]:
+     return dict(
+         css_classes=['eval-card'] if self.is_leaf else None
+     )
+
+   def _html_tree_view_css_styles(self) -> list[str]:
+     return super()._html_tree_view_css_styles() + [
+         """
+         details.eval-card {
+           display: inline-block;
+           border: 0px;
+           box-shadow: rgba(0, 0, 0, 0.16) 0px 1px 4px;
+           margin: 15px;
+         }
+         .eval-card details {
+           border: 0px;
+         }
+         .badge-groups {
+           font-weight: normal;
+           padding: 5px;
+         }
+         .parameter-group {
+           display: inline-grid;
+           grid-template-rows: auto auto;
+           border: 0px;
+           margin-right: 10px;
+         }
+         .parameter.badge {
+           margin: 2px;
+         }
+         .metric-group {
+           display: inline-grid;
+           grid-template-rows: auto auto;
+         }
+         .eval-details .progress-bar > .shade {
+           visibility: hidden;
+           width: 0px;
+           margin: 0px;
+         }
+         .eval-details .progress-label {
+           font-size: 16px;
+           background-color: #eee;
+         }
+         .eval-details .progress-time {
+           font-size: 16px;
+           color: dodgerblue;
+           background-color: #eee;
+           margin-right: 10px;
+         }
+         .eval-details .usage-summary.badge {
+           color: orange;
+           font-size: 16px;
+           background-color: #eee;
+         }
+         .eval-details .experiment-links {
+           display: block;
+           border: 0px;
+           margin: 0px;
+         }
+         .eval-details .tab-button {
+           font-size: large;
+         }
+         .experiment-links .label {
+           color: revert;
+           margin: 0px;
+           padding: 2px;
+         }
+         .eval-details .experiment-id {
+           font-size: 2.0em;
+           font-weight: bold;
+           display: block;
+         }
+         .eval-details .parameter-group {
+           display: inline-block;
+           padding: 5px;
+         }
+         .eval-definition {
+           white-space: pre;
+           background-color: #eee;
+           padding: 15px;
+         }
+         .eval-details .metric-container {
+           display: block;
+           padding: 15px 0px;
+         }
+         .example-link {
+           color: revert;
+         }
+         .example-view {
+           border: 0px;
+           width:100%;
+           height:100%;
+         }
+         """
+     ]
+
+
+ class EvaluationState:
+   """Evaluation state."""
+
+   def __init__(self):
+     super().__init__()
+     self._evaluated_examples: dict[int, example_lib.Example] = {}
+
+   def load(
+       self, state_file: str, example_input_by_id: Callable[[int], Any]) -> None:
+     """Loads the state from the example sequence file."""
+     with pg.io.sequence.open_sequence(state_file) as f:
+       for record in f:
+         example = pg.from_json_str(
+             record, example_input_by_id=example_input_by_id
+         )
+         assert isinstance(example, example_lib.Example), example
+         self._evaluated_examples[example.id] = example
+
+   def get(self, example_id: int) -> example_lib.Example | None:
+     """Returns the example with the given ID."""
+     return self._evaluated_examples.get(example_id)
+
+   def update(self, example: example_lib.Example) -> None:
+     """Updates the state with the given example."""
+     self._evaluated_examples[example.id] = example
+
+   @property
+   def evaluated_examples(self) -> dict[int, example_lib.Example]:
+     """Returns the examples in the state."""
+     return self._evaluated_examples
+
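
For reviewers trying to gauge how the new eval/v2 API is meant to be used, below is a minimal sketch of a leaf evaluation built only from what this diff shows: an `inputs` functor, a `metrics` list, and an overridden `process`. It is an editorial illustration, not part of the package diff; the `lm` field, the `metrics_lib.Match` class, and all other names introduced here are assumptions that may not match the actual v2 API.

# Hypothetical usage sketch (not part of the diff). Assumes the v2 modules shown
# above; `metrics_lib.Match`, the `lm` field and the helper names are illustrative.
import langfun.core as lf
import pyglove as pg

from langfun.core.eval.v2 import evaluation as evaluation_lib
from langfun.core.eval.v2 import metrics as metrics_lib


@pg.functor()
def arithmetic_inputs(num_items: int = 10):
  # `Evaluation.inputs` expects a functor that returns a list of example inputs.
  return [
      dict(question=f'What is {i} + {i}?', answer=i + i)
      for i in range(num_items)
  ]


class ArithmeticEval(evaluation_lib.Evaluation):
  """Leaf evaluation: asks an LLM arithmetic questions and records the answers."""

  # Assumed field; wrapping the value in pg.oneof([...]) at construction time
  # would turn this into a hyper evaluation, per the class docstring above.
  lm: lf.LanguageModel

  def process(self, example_input):
    # Returning (output, metadata): the output feeds the metrics, the metadata
    # is included in the evaluation HTML view.
    answer = self.lm(example_input['question']).text
    return answer, dict(expected=example_input['answer'])


exp = ArithmeticEval(
    inputs=arithmetic_inputs(),
    metrics=[metrics_lib.Match()],  # Assumed metric class from the new metrics.py.
    lm=lf.llms.Gpt4(),
)
print(exp.evaluate(1))  # Evaluates example ID 1 and returns an Example.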