langfun 0.0.2.dev20240429__py3-none-any.whl → 0.1.2.dev202501150804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. langfun/__init__.py +20 -2
  2. langfun/core/__init__.py +16 -5
  3. langfun/core/agentic/__init__.py +30 -0
  4. langfun/core/agentic/action.py +854 -0
  5. langfun/core/agentic/action_eval.py +150 -0
  6. langfun/core/agentic/action_eval_test.py +109 -0
  7. langfun/core/agentic/action_test.py +136 -0
  8. langfun/core/coding/python/__init__.py +5 -11
  9. langfun/core/coding/python/correction.py +37 -21
  10. langfun/core/coding/python/correction_test.py +29 -3
  11. langfun/core/coding/python/execution.py +40 -216
  12. langfun/core/coding/python/execution_test.py +29 -89
  13. langfun/core/coding/python/generation.py +21 -11
  14. langfun/core/coding/python/generation_test.py +2 -2
  15. langfun/core/coding/python/parsing.py +108 -193
  16. langfun/core/coding/python/parsing_test.py +2 -105
  17. langfun/core/component.py +63 -2
  18. langfun/core/component_test.py +53 -0
  19. langfun/core/concurrent.py +414 -117
  20. langfun/core/concurrent_test.py +111 -24
  21. langfun/core/console.py +17 -5
  22. langfun/core/console_test.py +17 -0
  23. langfun/core/eval/__init__.py +16 -1
  24. langfun/core/eval/base.py +622 -174
  25. langfun/core/eval/base_test.py +200 -54
  26. langfun/core/eval/matching.py +63 -76
  27. langfun/core/eval/matching_test.py +17 -8
  28. langfun/core/eval/patching.py +130 -0
  29. langfun/core/eval/patching_test.py +170 -0
  30. langfun/core/eval/scoring.py +26 -26
  31. langfun/core/eval/scoring_test.py +19 -2
  32. langfun/core/eval/v2/__init__.py +42 -0
  33. langfun/core/eval/v2/checkpointing.py +380 -0
  34. langfun/core/eval/v2/checkpointing_test.py +228 -0
  35. langfun/core/eval/v2/eval_test_helper.py +136 -0
  36. langfun/core/eval/v2/evaluation.py +725 -0
  37. langfun/core/eval/v2/evaluation_test.py +180 -0
  38. langfun/core/eval/v2/example.py +305 -0
  39. langfun/core/eval/v2/example_test.py +128 -0
  40. langfun/core/eval/v2/experiment.py +1048 -0
  41. langfun/core/eval/v2/experiment_test.py +433 -0
  42. langfun/core/eval/v2/metric_values.py +156 -0
  43. langfun/core/eval/v2/metric_values_test.py +80 -0
  44. langfun/core/eval/v2/metrics.py +357 -0
  45. langfun/core/eval/v2/metrics_test.py +203 -0
  46. langfun/core/eval/v2/progress.py +348 -0
  47. langfun/core/eval/v2/progress_test.py +82 -0
  48. langfun/core/eval/v2/progress_tracking.py +210 -0
  49. langfun/core/eval/v2/progress_tracking_test.py +66 -0
  50. langfun/core/eval/v2/reporting.py +270 -0
  51. langfun/core/eval/v2/reporting_test.py +158 -0
  52. langfun/core/eval/v2/runners.py +488 -0
  53. langfun/core/eval/v2/runners_test.py +334 -0
  54. langfun/core/langfunc.py +4 -17
  55. langfun/core/langfunc_test.py +22 -6
  56. langfun/core/language_model.py +577 -39
  57. langfun/core/language_model_test.py +470 -56
  58. langfun/core/llms/__init__.py +87 -16
  59. langfun/core/llms/anthropic.py +312 -87
  60. langfun/core/llms/anthropic_test.py +71 -3
  61. langfun/core/llms/cache/base.py +21 -2
  62. langfun/core/llms/cache/in_memory.py +13 -0
  63. langfun/core/llms/cache/in_memory_test.py +53 -2
  64. langfun/core/llms/compositional.py +101 -0
  65. langfun/core/llms/compositional_test.py +73 -0
  66. langfun/core/llms/deepseek.py +117 -0
  67. langfun/core/llms/deepseek_test.py +61 -0
  68. langfun/core/llms/fake.py +11 -7
  69. langfun/core/llms/fake_test.py +14 -0
  70. langfun/core/llms/gemini.py +507 -0
  71. langfun/core/llms/gemini_test.py +195 -0
  72. langfun/core/llms/google_genai.py +62 -218
  73. langfun/core/llms/google_genai_test.py +9 -202
  74. langfun/core/llms/groq.py +160 -144
  75. langfun/core/llms/groq_test.py +31 -137
  76. langfun/core/llms/llama_cpp.py +15 -42
  77. langfun/core/llms/llama_cpp_test.py +4 -30
  78. langfun/core/llms/openai.py +395 -203
  79. langfun/core/llms/openai_compatible.py +179 -0
  80. langfun/core/llms/openai_compatible_test.py +495 -0
  81. langfun/core/llms/openai_test.py +30 -395
  82. langfun/core/llms/rest.py +113 -0
  83. langfun/core/llms/rest_test.py +111 -0
  84. langfun/core/llms/vertexai.py +192 -0
  85. langfun/core/llms/vertexai_test.py +52 -0
  86. langfun/core/logging.py +284 -0
  87. langfun/core/logging_test.py +125 -0
  88. langfun/core/message.py +319 -9
  89. langfun/core/message_test.py +190 -13
  90. langfun/core/modalities/__init__.py +6 -2
  91. langfun/core/modalities/audio.py +30 -0
  92. langfun/core/modalities/audio_test.py +63 -0
  93. langfun/core/modalities/image.py +39 -20
  94. langfun/core/modalities/image_test.py +52 -9
  95. langfun/core/modalities/mime.py +206 -29
  96. langfun/core/modalities/mime_test.py +90 -9
  97. langfun/core/modalities/ms_office.py +117 -0
  98. langfun/core/modalities/ms_office_test.py +389 -0
  99. langfun/core/modalities/pdf.py +22 -0
  100. langfun/core/modalities/pdf_test.py +57 -0
  101. langfun/core/modalities/video.py +9 -26
  102. langfun/core/modalities/video_test.py +3 -3
  103. langfun/core/modality.py +26 -3
  104. langfun/core/modality_test.py +2 -2
  105. langfun/core/sampling.py +11 -11
  106. langfun/core/structured/__init__.py +12 -16
  107. langfun/core/structured/completion.py +32 -5
  108. langfun/core/structured/completion_test.py +7 -6
  109. langfun/core/structured/description.py +2 -2
  110. langfun/core/structured/description_test.py +3 -3
  111. langfun/core/structured/function_generation.py +60 -27
  112. langfun/core/structured/function_generation_test.py +72 -2
  113. langfun/core/structured/mapping.py +97 -47
  114. langfun/core/structured/mapping_test.py +90 -2
  115. langfun/core/structured/parsing.py +33 -21
  116. langfun/core/structured/parsing_test.py +53 -9
  117. langfun/core/structured/querying.py +746 -0
  118. langfun/core/structured/{prompting_test.py → querying_test.py} +469 -51
  119. langfun/core/structured/schema.py +204 -97
  120. langfun/core/structured/schema_generation.py +1 -1
  121. langfun/core/structured/schema_test.py +130 -29
  122. langfun/core/structured/scoring.py +125 -19
  123. langfun/core/structured/scoring_test.py +30 -0
  124. langfun/core/structured/tokenization.py +64 -0
  125. langfun/core/structured/tokenization_test.py +48 -0
  126. langfun/core/template.py +115 -1
  127. langfun/core/template_test.py +71 -1
  128. langfun/core/templates/conversation.py +9 -0
  129. langfun/core/templates/conversation_test.py +4 -3
  130. langfun/core/templates/selfplay_test.py +10 -2
  131. langfun-0.1.2.dev202501150804.dist-info/METADATA +225 -0
  132. langfun-0.1.2.dev202501150804.dist-info/RECORD +153 -0
  133. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501150804.dist-info}/WHEEL +1 -1
  134. langfun/core/coding/python/errors.py +0 -108
  135. langfun/core/coding/python/errors_test.py +0 -99
  136. langfun/core/coding/python/permissions.py +0 -90
  137. langfun/core/coding/python/permissions_test.py +0 -86
  138. langfun/core/structured/prompting.py +0 -238
  139. langfun/core/text_formatting.py +0 -162
  140. langfun/core/text_formatting_test.py +0 -47
  141. langfun-0.0.2.dev20240429.dist-info/METADATA +0 -100
  142. langfun-0.0.2.dev20240429.dist-info/RECORD +0 -108
  143. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501150804.dist-info}/LICENSE +0 -0
  144. {langfun-0.0.2.dev20240429.dist-info → langfun-0.1.2.dev202501150804.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,180 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import tempfile
16
+ import unittest
17
+
18
+ from langfun.core.eval.v2 import eval_test_helper
19
+ from langfun.core.eval.v2 import evaluation as evaluation_lib
20
+ from langfun.core.eval.v2 import example as example_lib
21
+ from langfun.core.eval.v2 import experiment as experiment_lib
22
+
23
+ import pyglove as pg
24
+
25
+ Example = example_lib.Example
26
+ Evaluation = evaluation_lib.Evaluation
27
+ RunId = experiment_lib.RunId
28
+ Run = experiment_lib.Run
29
+
30
+
31
class EvaluationTest(unittest.TestCase):
  """Tests for `Evaluation`, driven by the fixtures in `eval_test_helper`."""

  def test_hyper_evaluation(self):
    """A `pg.oneof` LLM offset yields one leaf child per candidate value."""
    hyper = eval_test_helper.TestEvaluation(
        lm=eval_test_helper.TestLLM(offset=pg.oneof(range(3)))
    )
    self.assertFalse(hyper.is_leaf)
    self.assertTrue(
        pg.eq(
            hyper.children,
            [
                eval_test_helper.TestEvaluation(
                    lm=eval_test_helper.TestLLM(offset=offset)
                )
                for offset in (0, 1, 2)
            ],
        )
    )
    self.assertEqual(hyper.children[0].num_examples, 10)
    leaf_flags = [child.is_leaf for child in hyper.children]
    self.assertEqual(leaf_flags, [True] * len(hyper.children))
    self.assertEqual(
        [leaf.resource_ids() for leaf in hyper.leaf_nodes],
        [{'test_llm:0'}, {'test_llm:1'}, {'test_llm:2'}],
    )

  def test_input(self):
    """`num_examples` reflects the `inputs` functor bound to the evaluation."""
    # Default inputs of the test evaluation.
    evaluation = eval_test_helper.TestEvaluation()
    self.assertEqual(evaluation.num_examples, 10)

    # Inputs created from the helper with `None` as its argument.
    evaluation = eval_test_helper.TestEvaluation(
        inputs=eval_test_helper.test_inputs(None)
    )
    self.assertEqual(evaluation.num_examples, 20)

    # Inputs produced by a user-defined generator functor.
    @pg.functor
    def my_inputs():
      yield pg.Dict(x=1, y=2)
      yield pg.Dict(x=3, y=4)

    evaluation = eval_test_helper.TestEvaluation(inputs=my_inputs())
    self.assertEqual(evaluation.num_examples, 2)

  def test_evaluate(self):
    """Covers the match, mismatch and error outcomes of `evaluate`."""
    # Matching output: evaluate by passing an `Example` object.
    evaluation = eval_test_helper.TestEvaluation()
    result = evaluation.evaluate(Example(id=3))
    self.assertIs(evaluation.state.get(3), result)
    self.assertTrue(result.newly_processed)
    self.assertEqual(result.input, pg.Dict(x=2, y=4, groundtruth=6))
    self.assertEqual(result.output, 6)
    self.assertIsNone(result.error)
    self.assertEqual(result.metadata, {})
    self.assertEqual(result.metric_metadata, dict(match=True))
    self.assertIsNotNone(result.usage_summary)
    self.assertGreater(result.usage_summary.total.total_tokens, 0)
    self.assertEqual(result.usage_summary.total.num_requests, 1)
    self.assertIsNotNone(result.execution_status)
    self.assertIsNotNone(result.start_time)
    self.assertIsNotNone(result.end_time)

    # Mismatching output: evaluate by passing an example ID.
    evaluation = eval_test_helper.TestEvaluation(
        lm=eval_test_helper.TestLLM(offset=1)
    )
    result = evaluation.evaluate(3)
    self.assertTrue(result.newly_processed)
    self.assertEqual(result.input, pg.Dict(x=2, y=4, groundtruth=6))
    self.assertEqual(result.output, 7)
    self.assertIsNone(result.error)
    self.assertEqual(result.metadata, {})
    self.assertEqual(result.metric_metadata, dict(mismatch=True))

    # Error outcome: raised on request, otherwise recorded on the example.
    with self.assertRaisesRegex(ValueError, 'x should not be 5'):
      evaluation.evaluate(6, raise_if_has_error=True)
    result = evaluation.evaluate(6)
    self.assertTrue(result.newly_processed)
    self.assertEqual(result.input, pg.Dict(x=5, y=25, groundtruth=30))
    self.assertEqual(pg.MISSING_VALUE, result.output)
    self.assertEqual(result.error.tag, 'ValueError')
    self.assertEqual(result.metadata, {})
    self.assertEqual(result.metric_metadata, dict(error='ValueError'))

  def test_evaluate_with_state(self):
    """Examples saved to a state file are reused after `load_state`."""
    eval_dir = os.path.join(tempfile.gettempdir(), 'test_eval')
    pg.io.mkdirs(eval_dir, exist_ok=True)
    state_file = os.path.join(eval_dir, 'state.jsonl')

    # Evaluate one example and persist it to the state file.
    with pg.io.open_sequence(state_file, 'w') as writer:
      evaluation = eval_test_helper.TestEvaluation()
      result = evaluation.evaluate(3)
      self.assertTrue(result.newly_processed)
      self.assertEqual(result.input, pg.Dict(x=2, y=4, groundtruth=6))
      self.assertEqual(result.output, 6)
      self.assertEqual(len(evaluation._state.evaluated_examples), 1)
      writer.add(pg.to_json_str(result))

    # Reload the state and re-evaluate: the stored example is reused.
    evaluation.reset()
    self.assertEqual(len(evaluation._state.evaluated_examples), 0)
    evaluation.load_state(state_file)
    self.assertEqual(len(evaluation._state.evaluated_examples), 1)
    result = evaluation.evaluate(3)
    self.assertFalse(result.newly_processed)
    self.assertEqual(result.input, pg.Dict(x=2, y=4, groundtruth=6))
    self.assertEqual(result.output, 6)
    usage = result.usage_summary
    self.assertGreater(usage.total.total_tokens, 0)
    self.assertGreater(usage.cached.total.total_tokens, 0)
    self.assertEqual(usage.cached.total.num_requests, 1)
    self.assertEqual(usage.uncached.total.total_tokens, 0)
    self.assertEqual(usage.uncached.total.num_requests, 0)

    # Test load_state with filter.
    evaluation.reset()
    self.assertEqual(len(evaluation._state.evaluated_examples), 0)
    evaluation.load_state(state_file, filter=lambda x: x.id == 3)
    self.assertEqual(len(evaluation._state.evaluated_examples), 1)

    evaluation.reset()
    self.assertEqual(len(evaluation._state.evaluated_examples), 0)
    evaluation.load_state(state_file, filter=lambda x: x.id == 1)
    self.assertEqual(len(evaluation._state.evaluated_examples), 0)

  def test_html_view(self):
    """The experiment ID appears in both the card view and the full view."""
    evaluation = eval_test_helper.TestEvaluation()
    evaluation.debug('debug message')
    evaluation.info('info message')
    evaluation.warning('warning message', x=1)
    evaluation.error('error message', x=1)
    evaluation.fatal('fatal message')

    card_flags = dict(card_view=True, current_run=None)
    self.assertIn(
        evaluation.id,
        evaluation.to_html(extra_flags=card_flags).content,
    )

    full_flags = dict(
        card_view=False,
        current_run=Run(
            root_dir='/tmp/test_run',
            id=RunId.from_id('20241031_1'),
            experiment=pg.Ref(evaluation),
        ),
    )
    self.assertIn(
        evaluation.id,
        evaluation.to_html(extra_flags=full_flags).content,
    )
177
+
178
+
179
+ if __name__ == '__main__':
180
+ unittest.main()
@@ -0,0 +1,305 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Base classes for Langfun evaluation."""
15
+
16
+ import dataclasses
17
+ import inspect
18
+ from typing import Any, Callable
19
+ import langfun.core as lf
20
+ import pyglove as pg
21
+
22
+
23
@dataclasses.dataclass
class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
  """An item for the evaluation.

  Attributes:
    id: The 1-based ID of the item in the evaluation set.
    input: An element returned from the `Evaluable.inputs` functor.
    output: The output of the `process` method. If `pg.MISSING_VALUE`, it has
      not been processed yet.
    error: The error associated with the item, or None. When
      `execution_status` is provided at construction time, this is overwritten
      with the error of the first status entry that has one.
    metadata: The metadata of the item produced by the `process` method.
    metric_metadata: The dictionary returned from `Metric.audit`.
    newly_processed: Execution flag that defaults to True. It is not part of
      the JSON representation; presumably callers clear it for examples that
      were restored from saved state (verify against the caller).
    start_time: The start time of the evaluation item.
    end_time: The end time of the evaluation item.
    usage_summary: The summary of LLM usages of the evaluation item.
    execution_status: The timeit status of the evaluation item.
  """
  id: int
  input: Any = pg.MISSING_VALUE
  output: Any = pg.MISSING_VALUE
  error: pg.object_utils.ErrorInfo | None = None
  metadata: dict[str, Any] = dataclasses.field(default_factory=dict)
  metric_metadata: dict[str, Any] | None = None
  # Execution information.
  newly_processed: bool = True
  start_time: float | None = None
  end_time: float | None = None
  usage_summary: lf.UsageSummary | None = None
  execution_status: dict[str, pg.object_utils.TimeIt.Status] | None = None

  def __post_init__(self):
    """Derives `error` from the first failed entry of `execution_status`."""
    if self.execution_status is not None:
      for status in self.execution_status.values():
        if status.has_error:
          self.error = status.error
          break

  @property
  def is_processed(self) -> bool:
    """Returns whether the item has been processed."""
    return pg.MISSING_VALUE != self.output

  @property
  def has_error(self) -> bool:
    """Returns whether the item has an error."""
    return self.error is not None

  @property
  def elapse(self) -> float | None:
    """Returns the elapse time of the item.

    The value is read from the 'evaluate' entry of `execution_status`; None is
    returned when no execution status has been recorded.
    """
    if self.execution_status is not None:
      return self.execution_status['evaluate'].elapse
    return None

  def to_json(self, *, exclude_input: bool = False, **kwargs):
    """Returns the JSON representation of the item.

    Args:
      exclude_input: If True, the input is replaced by `pg.MISSING_VALUE`
        (its default) in the fields handed to the serializer.
      **kwargs: Extra keyword arguments forwarded to `to_json_dict`.

    Returns:
      The value returned by `to_json_dict`. `newly_processed` is not among
      the serialized fields.
    """
    # Each field is passed as a (value, default) pair, with
    # `exclude_default=True`.
    return self.to_json_dict(
        fields=dict(
            id=(self.id, None),
            input=(
                self.input if not exclude_input else pg.MISSING_VALUE,
                pg.MISSING_VALUE
            ),
            output=(self.output, pg.MISSING_VALUE),
            error=(self.error, None),
            metadata=(self.metadata, {}),
            metric_metadata=(self.metric_metadata, None),
            start_time=(self.start_time, None),
            end_time=(self.end_time, None),
            usage_summary=(self.usage_summary, None),
            execution_status=(self.execution_status, None),
        ),
        exclude_default=True,
        **kwargs,
    )

  @classmethod
  def from_json(
      cls,
      json_value: dict[str, Any],
      *,
      example_input_by_id: Callable[[int], Any] | None = None,
      load_example_metadata: bool | Callable[['Example'], bool] = False,
      **kwargs
  ) -> 'Example':
    """Creates an example from the JSON representation.

    Args:
      json_value: A dict of field names to JSON values. It is not modified.
      example_input_by_id: Optional callable that maps an example ID to its
        input. When provided, it is used instead of any 'input' entry in
        `json_value`.
      load_example_metadata: Whether to load the 'metadata' entry. May be a
        callable that receives the example (loaded without metadata) and
        returns whether its metadata should be loaded.
      **kwargs: Extra keyword arguments forwarded to `pg.from_json`.

    Returns:
      The deserialized example. Its metadata is the default empty dict when
      metadata loading is disabled or no metadata was serialized.
    """
    # Work on a shallow copy so that popping keys below does not mutate the
    # dict owned by the caller.
    json_value = dict(json_value)

    example_id = json_value.get('id')
    if example_input_by_id:
      example_input = example_input_by_id(example_id)
    else:
      example_input = json_value.pop('input', pg.MISSING_VALUE)
      if example_input is not pg.MISSING_VALUE:
        example_input = pg.from_json(example_input, **kwargs)
    json_value['input'] = example_input

    # NOTE(daiyip): We need to load the types of the examples into the
    # deserialization context, otherwise the deserialization will fail if the
    # types are not registered.
    def example_class_defs(example) -> list[type[Any]]:
      referred_types = set()
      def _visit(k, v, p):
        del k, p
        if inspect.isclass(v):
          referred_types.add(v)
        elif isinstance(v, pg.Object):
          referred_types.add(v.__class__)
        return pg.TraverseAction.ENTER
      pg.traverse(example, _visit)
      return list(referred_types)

    # We delay loading the metadata until the other parts of the example are
    # loaded. So we could apply the filter to decide whether to load the
    # metadata.
    metadata_dict = json_value.pop('metadata', None)
    with pg.JSONConvertible.load_types_for_deserialization(
        *example_class_defs(example_input)
    ):
      example = cls(
          **{k: pg.from_json(v, **kwargs) for k, v in json_value.items()}
      )
      if callable(load_example_metadata):
        load_example_metadata = load_example_metadata(example)
      # Metadata equal to the default `{}` is passed to the serializer as a
      # default value, so the key may be absent. Keep the default empty dict
      # in that case instead of overwriting it with None.
      if load_example_metadata and metadata_dict is not None:
        example.metadata = pg.from_json(metadata_dict, **kwargs)
    return example

  #
  # HTML rendering.
  #

  def _html_tree_view_content(
      self,
      *,
      view: pg.views.HtmlTreeView,
      root_path: pg.KeyPath | None = None,
      extra_flags: dict[str, Any] | None = None,
      **kwargs
  ):
    """Renders the example as a header plus a tab control of its fields.

    Args:
      view: The HTML tree view used to render the field values.
      root_path: The key path of this example; defaults to an empty path.
      extra_flags: Optional flags. 'num_examples' (if present) is the total
        number of examples and is used to decide whether to show the link to
        the next example.
      **kwargs: Extra keyword arguments, filtered through
        `view.get_passthrough_kwargs` before being passed to `view.render`.

    Returns:
      A 'div' HTML element with CSS class 'eval-example'.
    """
    root_path = root_path or pg.KeyPath()
    extra_flags = extra_flags or {}
    num_examples = extra_flags.get('num_examples', None)

    def _metric_metadata_badge(key, value):
      # A True boolean flag is shown by its key alone; every other value
      # (including False) is shown as 'key:value'. The check must test
      # `value`, not the always-truthy `bool` type.
      if isinstance(value, bool) and value:
        text = key
      else:
        text = f'{key}:{value}'
      return pg.views.html.controls.Badge(
          text,
          css_classes=[pg.object_utils.camel_to_snake(key, '-')],
      )

    def _render_header():
      return pg.Html.element(
          'div',
          [
              pg.Html.element(
                  'div',
                  [
                      # Previous button.
                      pg.views.html.controls.Label(  # pylint: disable=g-long-ternary
                          '◀',
                          link=f'{self.id - 1}.html',
                          css_classes=['previous'],
                      ) if self.id > 1 else None,
                      # Current example ID.
                      pg.views.html.controls.Label(
                          f'#{self.id}',
                          css_classes=['example-id'],
                      ),
                      # Next button.
                      pg.views.html.controls.Label(  # pylint: disable=g-long-ternary
                          '▶',
                          link=f'{self.id + 1}.html',
                          css_classes=['next'],
                      ) if (num_examples is None
                            or self.id < num_examples) else None,

                  ]
              ),
              pg.Html.element(
                  'div',
                  [
                      # Usage summary.
                      pg.view(  # pylint: disable=g-long-ternary
                          self.usage_summary,
                          extra_flags=dict(as_badge=True)
                      ) if self.usage_summary is not None else None,
                      # Metric metadata.
                      pg.views.html.controls.LabelGroup(
                          [  # pylint: disable=g-long-ternary
                              _metric_metadata_badge(k, v)
                              for k, v in self.metric_metadata.items()
                          ] if self.metric_metadata else []
                      ),
                  ],
                  css_classes=['example-container'],
              )
          ]
      )

    def _render_content():
      def _tab(label, key, default):
        # Fields that still hold their default value get no tab.
        field = getattr(self, key)
        if default == field:
          return None
        return pg.views.html.controls.Tab(
            label=label,
            content=view.render(
                field,
                root_path=root_path + key,
                collapse_level=None,
                **view.get_passthrough_kwargs(**kwargs),
            ),
        )
      tabs = [
          _tab('Input', 'input', pg.MISSING_VALUE),
          _tab('Output', 'output', pg.MISSING_VALUE),
          _tab('Output Metadata', 'metadata', {}),
          _tab('Error', 'error', None),
      ]
      tabs = [tab for tab in tabs if tab is not None]
      # The second argument is the index of the last available tab.
      return pg.views.html.controls.TabControl(
          tabs,
          len(tabs) - 1,
      )

    return pg.Html.element(
        'div',
        [
            _render_header(),
            _render_content(),
        ],
        css_classes=['eval-example']
    )

  def _html_tree_view_summary(self, *, view, **kwargs):
    """Returns None: no summary element is provided for an example."""
    return None

  @classmethod
  def _html_tree_view_css_styles(cls) -> list[str]:
    """Returns the inherited CSS styles plus the styles for example views."""
    return super()._html_tree_view_css_styles() + [
        """
        .example-container {
          display: block;
          padding: 10px;
        }
        .example-id {
          font-weight: bold;
          font-size: 40px;
          margin: 0 10px;
          vertical-align: middle;
        }
        a.previous, a.next {
          text-decoration: none;
          vertical-align: middle;
          display: inline-block;
          padding: 8px 8px;
          color: #DDD;
        }
        a.previous:hover, a.next:hover {
          background-color: #ddd;
          color: black;
        }
        /* Badge styles. */
        .eval-example .badge.match {
          color: green;
          background-color: #dcefbe;
        }
        .eval-example .badge.error {
          color: red;
          background-color: #fdcccc;
        }
        .eval-example .badge.mismatch {
          color: orange;
          background-color: #ffefc4;
        }
        .eval-example .badge.score {
          color: blue;
          background-color: #c4dced;
        }
        """
    ]
305
+
@@ -0,0 +1,128 @@
1
+ # Copyright 2024 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import unittest
15
+
16
+ from langfun.core.eval.v2 import example as example_lib
17
+ import pyglove as pg
18
+
19
+ Example = example_lib.Example
20
+
21
+
22
class ExampleTest(unittest.TestCase):
  """Tests for the `Example` dataclass."""

  def test_basic(self):
    """Checks the error, processed and elapse accessors."""
    error = pg.object_utils.ErrorInfo(
        tag='ValueError',
        description='Bad input',
        stacktrace='...',
    )
    failed_status = pg.object_utils.TimeIt.Status(
        name='evaluation', elapse=1.0, error=error
    )
    failed = Example(id=1, execution_status={'evaluate': failed_status})
    self.assertEqual(failed.error, error)
    self.assertFalse(failed.is_processed)
    self.assertTrue(failed.has_error)
    self.assertEqual(failed.elapse, 1.0)

    succeeded = Example(id=2, output=1)
    self.assertTrue(succeeded.is_processed)
    self.assertFalse(succeeded.has_error)
    self.assertIsNone(succeeded.elapse)

  def test_json_conversion(self):
    """Round-trips an example through JSON, with and without its input."""
    def input_func():
      class A(pg.Object):
        x: int

      class B(pg.Object):
        x: int = 1
        y: int = 2

      return [
          pg.Dict(
              a=A,
              b=B
          )
      ]

    inputs = input_func()
    first_input = inputs[0]

    def input_by_id(i):
      # Example IDs are 1-based while `inputs` is 0-based.
      return inputs[i - 1]

    original = Example(
        id=1,
        input=first_input,
        output=first_input.a(1),
        metadata=dict(b=first_input.b())
    )
    # Serialize without input.
    json_str = pg.to_json_str(original, exclude_input=True)

    with_metadata = pg.from_json_str(
        json_str,
        example_input_by_id=input_by_id,
        load_example_metadata=True,
    )
    self.assertEqual(with_metadata, original)

    without_metadata = pg.from_json_str(
        json_str,
        example_input_by_id=input_by_id,
        load_example_metadata=False,
    )
    self.assertEqual(
        without_metadata,
        Example(
            id=1,
            input=first_input,
            output=first_input.a(1),
            metadata={}
        )
    )

    # Drop both local classes from the type registry, then load with
    # `auto_dict=True` and no input lookup.
    type_to_cls = pg.JSONConvertible._TYPE_REGISTRY._type_to_cls_map
    type_to_cls.pop(first_input.a.__type_name__)
    type_to_cls.pop(first_input.b.__type_name__)
    restored = pg.from_json_str(
        json_str, auto_dict=True, load_example_metadata=True
    )
    restored.output.pop('type_name')
    restored.metadata.b.pop('type_name')
    self.assertEqual(
        restored,
        Example(
            id=1,
            output=pg.Dict(x=1),
            metadata=dict(b=pg.Dict(x=1, y=2)),
        )
    )

    # Serialize with input.
    original = Example(id=2, input=pg.Dict(x=1), output=pg.Dict(x=2))
    json_str = pg.to_json_str(original, exclude_input=False)
    self.assertEqual(pg.from_json_str(json_str), original)

  def test_html_view(self):
    """The sole example of a one-example set renders without 'next'."""
    only_example = Example(
        id=1,
        input=pg.Dict(a=1, b=2),
        output=3,
        metadata=dict(sum=3),
        metric_metadata=dict(match=True),
    )
    html = only_example.to_html(extra_flags=dict(num_examples=1)).content
    self.assertNotIn('next', html)
125
+
126
+
127
+ if __name__ == '__main__':
128
+ unittest.main()