langfun 0.1.2.dev202508060805__py3-none-any.whl → 0.1.2.dev202508070805__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of langfun might be problematic. Click here for more details.
- langfun/core/eval/v2/checkpointing.py +1 -5
- langfun/core/eval/v2/checkpointing_test.py +13 -7
- langfun/core/eval/v2/evaluation.py +20 -5
- langfun/core/eval/v2/evaluation_test.py +21 -1
- langfun/core/eval/v2/experiment.py +13 -0
- langfun/core/eval/v2/runners.py +1 -0
- {langfun-0.1.2.dev202508060805.dist-info → langfun-0.1.2.dev202508070805.dist-info}/METADATA +1 -1
- {langfun-0.1.2.dev202508060805.dist-info → langfun-0.1.2.dev202508070805.dist-info}/RECORD +11 -11
- {langfun-0.1.2.dev202508060805.dist-info → langfun-0.1.2.dev202508070805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202508060805.dist-info → langfun-0.1.2.dev202508070805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202508060805.dist-info → langfun-0.1.2.dev202508070805.dist-info}/top_level.txt +0 -0
|
@@ -82,11 +82,7 @@ class Checkpointer(experiment_lib.Plugin):
|
|
|
82
82
|
example: Example,
|
|
83
83
|
) -> None:
|
|
84
84
|
"""Saves the example to the checkpoint file."""
|
|
85
|
-
if example.has_error:
|
|
86
|
-
experiment.warning(
|
|
87
|
-
f'Example {example.id} has error. Skipping checkpointing.'
|
|
88
|
-
)
|
|
89
|
-
elif example.newly_processed:
|
|
85
|
+
if example.newly_processed:
|
|
90
86
|
self._save_example(runner, experiment, example)
|
|
91
87
|
|
|
92
88
|
def _load_experiment(
|
|
@@ -101,12 +101,15 @@ class PerExampleCheckpointerTest(CheckpointerTest):
|
|
|
101
101
|
self.assertIn(i + 1, collector.examples)
|
|
102
102
|
example = collector.examples[i + 1]
|
|
103
103
|
ckpt = run.output_path_for(leaf, f'checkpoint_{example.id}.jsonl')
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
104
|
+
self.assertTrue(pg.io.path_exists(ckpt))
|
|
105
|
+
with pg.io.open_sequence(ckpt) as f:
|
|
106
|
+
examples_from_ckpt = list(iter(f))
|
|
107
|
+
# `eval_test_helper.test_experiment` has two TestEvaluation with
|
|
108
|
+
# llm offset 0, which makes that evaluation run twice.
|
|
109
|
+
if example.has_error and leaf.lm.offset == 0:
|
|
110
|
+
self.assertEqual(len(examples_from_ckpt), 2)
|
|
111
|
+
else:
|
|
112
|
+
self.assertEqual(len(examples_from_ckpt), 1)
|
|
110
113
|
if leaf.id not in num_processed:
|
|
111
114
|
self.assertEqual(leaf.progress.num_skipped, 0)
|
|
112
115
|
num_processed[leaf.id] = leaf.progress.num_processed
|
|
@@ -222,7 +225,10 @@ class BulkCheckpointerTest(CheckpointerTest):
|
|
|
222
225
|
with pg.io.open_sequence(ckpt) as f:
|
|
223
226
|
self.assertEqual(
|
|
224
227
|
len(list(iter(f))),
|
|
225
|
-
|
|
228
|
+
# `eval_test_helper.test_experiment` has two TestEvaluation with
|
|
229
|
+
# llm offset 0, which makes that evaluation run twice.
|
|
230
|
+
leaf.progress.num_completed
|
|
231
|
+
+ (leaf.progress.num_failed if leaf.lm.offset == 0 else 0)
|
|
226
232
|
)
|
|
227
233
|
if leaf.id not in num_processed:
|
|
228
234
|
self.assertEqual(leaf.progress.num_skipped, 0)
|
|
@@ -149,12 +149,15 @@ class Evaluation(experiment_lib.Experiment):
|
|
|
149
149
|
self,
|
|
150
150
|
example: example_lib.Example | int,
|
|
151
151
|
raise_if_has_error: bool = False,
|
|
152
|
+
reevaluate_upon_previous_errors: bool = True,
|
|
152
153
|
) -> example_lib.Example:
|
|
153
154
|
"""Evaluates a single example input.
|
|
154
155
|
|
|
155
156
|
Args:
|
|
156
157
|
example: An example ID or an example object with ID.
|
|
157
158
|
raise_if_has_error: Whether to raise an error if the example has error.
|
|
159
|
+
reevaluate_upon_previous_errors: Whether to reevaluate the example if
|
|
160
|
+
the previous checkpointed run has error.
|
|
158
161
|
|
|
159
162
|
Returns:
|
|
160
163
|
The evaluated example with the output and metric metadata populated.
|
|
@@ -169,7 +172,9 @@ class Evaluation(experiment_lib.Experiment):
|
|
|
169
172
|
checkpointed = self._state.ckpt_example(example.id)
|
|
170
173
|
|
|
171
174
|
with pg.timeit('evaluate') as timeit, lf.track_usages() as usage_summary:
|
|
172
|
-
if checkpointed is None or checkpointed.has_error:
|
|
175
|
+
if checkpointed is None or (
|
|
176
|
+
reevaluate_upon_previous_errors and checkpointed.has_error
|
|
177
|
+
):
|
|
173
178
|
if checkpointed is None:
|
|
174
179
|
self.info(
|
|
175
180
|
f'Example {example.id} is being processed for the first time '
|
|
@@ -184,17 +189,27 @@ class Evaluation(experiment_lib.Experiment):
|
|
|
184
189
|
self._state.update(example, in_progress=True)
|
|
185
190
|
self._process(example, raise_if_has_error=raise_if_has_error)
|
|
186
191
|
else:
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
192
|
+
if checkpointed.has_error:
|
|
193
|
+
self.warning(
|
|
194
|
+
f'Example {example.id} skipped processing despite its checkpoint '
|
|
195
|
+
f'has error: {checkpointed.error}, as '
|
|
196
|
+
'`reevaluate_upon_previous_errors` is set to False.'
|
|
197
|
+
)
|
|
198
|
+
else:
|
|
199
|
+
self.info(
|
|
200
|
+
f'Example {example.id} skipped processing as prior run '
|
|
201
|
+
'is available and error free.'
|
|
202
|
+
)
|
|
191
203
|
example.start_time = checkpointed.start_time
|
|
192
204
|
self._state.update(example, in_progress=True)
|
|
193
205
|
|
|
194
206
|
# Use the output and metadata obtained from the previous processing.
|
|
195
207
|
example.output = checkpointed.output
|
|
196
208
|
example.metadata = checkpointed.metadata
|
|
209
|
+
example.error = checkpointed.error
|
|
197
210
|
example.newly_processed = False
|
|
211
|
+
example.execution_status = checkpointed.execution_status
|
|
212
|
+
example.end_time = checkpointed.end_time
|
|
198
213
|
|
|
199
214
|
# For previously processed examples, we merge previous usages as
|
|
200
215
|
# cached, so the usage summary will account previous usages, but as
|
|
@@ -128,10 +128,17 @@ class EvaluationTest(unittest.TestCase):
|
|
|
128
128
|
self.assertEqual(len(exp.state.evaluation_status), 1)
|
|
129
129
|
f.add(pg.to_json_str(example))
|
|
130
130
|
|
|
131
|
+
example2 = exp.evaluate(6)
|
|
132
|
+
self.assertTrue(example2.newly_processed)
|
|
133
|
+
self.assertEqual(example2.input, pg.Dict(x=5, y=25, groundtruth=30))
|
|
134
|
+
self.assertTrue(example2.has_error)
|
|
135
|
+
self.assertEqual(len(exp.state.evaluation_status), 2)
|
|
136
|
+
f.add(pg.to_json_str(example2))
|
|
137
|
+
|
|
131
138
|
exp.reset()
|
|
132
139
|
self.assertEqual(len(exp.state.ckpt_examples), 0)
|
|
133
140
|
exp.load_state(state_file)
|
|
134
|
-
self.assertEqual(len(exp.state.ckpt_examples), 1)
|
|
141
|
+
self.assertEqual(len(exp.state.ckpt_examples), 2)
|
|
135
142
|
self.assertEqual(len(exp.state.evaluation_status), 0)
|
|
136
143
|
example = exp.evaluate(3)
|
|
137
144
|
self.assertFalse(example.newly_processed)
|
|
@@ -142,6 +149,19 @@ class EvaluationTest(unittest.TestCase):
|
|
|
142
149
|
self.assertEqual(example.usage_summary.cached.total.num_requests, 1)
|
|
143
150
|
self.assertEqual(example.usage_summary.uncached.total.total_tokens, 0)
|
|
144
151
|
self.assertEqual(example.usage_summary.uncached.total.num_requests, 0)
|
|
152
|
+
self.assertIsNotNone(example.start_time)
|
|
153
|
+
self.assertIsNotNone(example.end_time)
|
|
154
|
+
self.assertIn('evaluate.process', example.execution_status)
|
|
155
|
+
self.assertIn('evaluate.metric', example.execution_status)
|
|
156
|
+
|
|
157
|
+
example2 = exp.evaluate(6, reevaluate_upon_previous_errors=False)
|
|
158
|
+
self.assertFalse(example2.newly_processed)
|
|
159
|
+
self.assertEqual(example2.input, pg.Dict(x=5, y=25, groundtruth=30))
|
|
160
|
+
self.assertTrue(example2.has_error)
|
|
161
|
+
self.assertIsNotNone(example2.start_time)
|
|
162
|
+
self.assertIsNotNone(example2.end_time)
|
|
163
|
+
self.assertIn('evaluate.process', example2.execution_status)
|
|
164
|
+
self.assertIn('evaluate.metric', example2.execution_status)
|
|
145
165
|
|
|
146
166
|
# Test load_state with filter.
|
|
147
167
|
exp.reset()
|
|
@@ -384,6 +384,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
|
|
|
384
384
|
example_ids: list[int] | None = None,
|
|
385
385
|
shuffle_inputs: bool = False,
|
|
386
386
|
raise_if_has_error: bool = False,
|
|
387
|
+
reevaluate_upon_previous_errors: bool = True,
|
|
387
388
|
reprocess: bool | list[int] = False,
|
|
388
389
|
generate_example_html: Literal['new', 'all', 'no'] | list[int] = 'new',
|
|
389
390
|
process_timeout: int | None = None,
|
|
@@ -436,6 +437,9 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
|
|
|
436
437
|
Neverthless, the example ID remains unchanged for each example.
|
|
437
438
|
raise_if_has_error: If True, it will raise an error if any example fails.
|
|
438
439
|
Otherwise, it will continue and report the error in the output.
|
|
440
|
+
reevaluate_upon_previous_errors: If True, it will reevaluate the example
|
|
441
|
+
if the previous checkpointed run has error when experiment runs with
|
|
442
|
+
existing checkpoints.
|
|
439
443
|
reprocess: A boolean or a list of example IDs. If boolean, it indicates
|
|
440
444
|
that whether all the examples to be evaluated will be reprocessed,
|
|
441
445
|
meaning that existing checkpoints will be ignored. If a list of
|
|
@@ -475,6 +479,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
|
|
|
475
479
|
example_ids=example_ids,
|
|
476
480
|
shuffle_inputs=shuffle_inputs,
|
|
477
481
|
raise_if_has_error=raise_if_has_error,
|
|
482
|
+
reevaluate_upon_previous_errors=reevaluate_upon_previous_errors,
|
|
478
483
|
reprocess=reprocess,
|
|
479
484
|
generate_example_html=generate_example_html,
|
|
480
485
|
use_cache=use_cache,
|
|
@@ -835,6 +840,14 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
|
|
|
835
840
|
)
|
|
836
841
|
] = False
|
|
837
842
|
|
|
843
|
+
reevaluate_upon_previous_errors: Annotated[
|
|
844
|
+
bool,
|
|
845
|
+
(
|
|
846
|
+
'If True, it will reevaluate the example if the previously '
|
|
847
|
+
'checkpointed run has error. Otherwise, it will skip the example.'
|
|
848
|
+
)
|
|
849
|
+
] = True
|
|
850
|
+
|
|
838
851
|
note: Annotated[
|
|
839
852
|
str | None,
|
|
840
853
|
'The user note for the current run.'
|
langfun/core/eval/v2/runners.py
CHANGED
|
@@ -384,6 +384,7 @@ class RunnerBase(Runner):
|
|
|
384
384
|
item = evaluation.evaluate(
|
|
385
385
|
item,
|
|
386
386
|
raise_if_has_error=self.current_run.raise_if_has_error,
|
|
387
|
+
reevaluate_upon_previous_errors=self.current_run.reevaluate_upon_previous_errors,
|
|
387
388
|
)
|
|
388
389
|
self.on_example_complete(evaluation, item)
|
|
389
390
|
return item
|
|
@@ -69,14 +69,14 @@ langfun/core/eval/patching_test.py,sha256=8kCd54Egjju22FMgtJuxEsrXkW8ifs-UUBHtrC
|
|
|
69
69
|
langfun/core/eval/scoring.py,sha256=_DvnlgI1SdRVaOojao_AkV3pnenfCPOqyhvlg-Sw-5M,6322
|
|
70
70
|
langfun/core/eval/scoring_test.py,sha256=UcBH0R6vAovZ0A4yM22s5cBHL1qVKASubrbu1t8dYBw,4529
|
|
71
71
|
langfun/core/eval/v2/__init__.py,sha256=9lNKJwbvl0lcFblAXYT_OHI8fOubJsTOdSkxEqsP1xU,1726
|
|
72
|
-
langfun/core/eval/v2/checkpointing.py,sha256=
|
|
73
|
-
langfun/core/eval/v2/checkpointing_test.py,sha256=
|
|
72
|
+
langfun/core/eval/v2/checkpointing.py,sha256=0ll04cCH3WAXoU2ocoSE8GaSWVCw9JERcnuvUiHRSBU,11624
|
|
73
|
+
langfun/core/eval/v2/checkpointing_test.py,sha256=NZo-yt3j5bUF-yPVfHGFBVGWZhhpk5xaT-6fsuIQUSs,9571
|
|
74
74
|
langfun/core/eval/v2/eval_test_helper.py,sha256=sKFi_wPYCNmr96WyTduuXY0KnxjFxcJyEhXey-_nGX8,3962
|
|
75
|
-
langfun/core/eval/v2/evaluation.py,sha256=
|
|
76
|
-
langfun/core/eval/v2/evaluation_test.py,sha256=
|
|
75
|
+
langfun/core/eval/v2/evaluation.py,sha256=Oiqjx7rL0Fql6OaYifiXlgZX5tzz1uAs88GydjJPDI8,27722
|
|
76
|
+
langfun/core/eval/v2/evaluation_test.py,sha256=GdB-CexNxQSNZoXqpyGNLdHzq0D9vXqgwAJ13iaAOO0,7878
|
|
77
77
|
langfun/core/eval/v2/example.py,sha256=v1dIz89pccIqujt7utrk0EbqMWM9kBn-2fYGRTKe358,10890
|
|
78
78
|
langfun/core/eval/v2/example_test.py,sha256=wsHQD6te7ghROmxe3Xg_NK4TU0xS2MkNfnpo-H0H8xM,3399
|
|
79
|
-
langfun/core/eval/v2/experiment.py,sha256=
|
|
79
|
+
langfun/core/eval/v2/experiment.py,sha256=Y6a-Qkb4AwWWmpOVGyYiK29feuQ3u2t6SWs5mXF2xjM,33957
|
|
80
80
|
langfun/core/eval/v2/experiment_test.py,sha256=BYrPYfQfU2jDfAlZcHDT0KUaXOnCnyTWyUEKhDoqXfw,13645
|
|
81
81
|
langfun/core/eval/v2/metric_values.py,sha256=_B905bC-jxrYPLSEcP2M8MaHZOVMz_bVrUw8YC4arCE,4660
|
|
82
82
|
langfun/core/eval/v2/metric_values_test.py,sha256=ab2oF_HsIwrSy459108ggyjgefHSPn8UVILR4dRwx14,2634
|
|
@@ -88,7 +88,7 @@ langfun/core/eval/v2/progress_tracking.py,sha256=zNhNPGlnJnHELEfFpbTMCSXFn8d1IJ5
|
|
|
88
88
|
langfun/core/eval/v2/progress_tracking_test.py,sha256=sJhlVfinGsg3Kf2wQ_hT7VMcpQfaI4ZkqyW9ujElkwA,2282
|
|
89
89
|
langfun/core/eval/v2/reporting.py,sha256=yUIPCAMnp7InIzpv1DDWrcLO-75iiOUTpscj7smkfrA,8335
|
|
90
90
|
langfun/core/eval/v2/reporting_test.py,sha256=CMK-vwho8cNRJwlbkCqm_v5fykE7Y3V6SaIOCY0CDyA,5671
|
|
91
|
-
langfun/core/eval/v2/runners.py,sha256=
|
|
91
|
+
langfun/core/eval/v2/runners.py,sha256=bEniZDNu44AQgvqpwLsvBU4V_7WltAe-NPhYgIsLj1E,16848
|
|
92
92
|
langfun/core/eval/v2/runners_test.py,sha256=spjkmqlls_vyERdZMdjv6dhIN9ZfxsDDvIQAWTj2kMk,11954
|
|
93
93
|
langfun/core/llms/__init__.py,sha256=CtxUdXohQ8AQk1DqBT6MBy2zdAoPSggNo00SYrj9-AY,9521
|
|
94
94
|
langfun/core/llms/anthropic.py,sha256=YcQ2VG8iOfXtry_tTpAukmiwXa2hK_9LkpkmXk41Nm0,26226
|
|
@@ -165,8 +165,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
|
|
|
165
165
|
langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
|
|
166
166
|
langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
|
|
167
167
|
langfun/core/templates/selfplay_test.py,sha256=Ot__1P1M8oJfoTp-M9-PQ6HUXqZKyMwvZ5f7yQ3yfyM,2326
|
|
168
|
-
langfun-0.1.2.
|
|
169
|
-
langfun-0.1.2.
|
|
170
|
-
langfun-0.1.2.
|
|
171
|
-
langfun-0.1.2.
|
|
172
|
-
langfun-0.1.2.
|
|
168
|
+
langfun-0.1.2.dev202508070805.dist-info/licenses/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
|
|
169
|
+
langfun-0.1.2.dev202508070805.dist-info/METADATA,sha256=NKc-FcclfdUmowPZSNtHzPwmJZ8DpkdGHN0MiO-LUVM,7380
|
|
170
|
+
langfun-0.1.2.dev202508070805.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
171
|
+
langfun-0.1.2.dev202508070805.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
|
|
172
|
+
langfun-0.1.2.dev202508070805.dist-info/RECORD,,
|
|
File without changes
|
{langfun-0.1.2.dev202508060805.dist-info → langfun-0.1.2.dev202508070805.dist-info}/licenses/LICENSE
RENAMED
|
File without changes
|
{langfun-0.1.2.dev202508060805.dist-info → langfun-0.1.2.dev202508070805.dist-info}/top_level.txt
RENAMED
|
File without changes
|