langfun 0.1.2.dev202508060805__py3-none-any.whl → 0.1.2.dev202508070805__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of langfun might be problematic. Click here for more details.

@@ -82,11 +82,7 @@ class Checkpointer(experiment_lib.Plugin):
82
82
  example: Example,
83
83
  ) -> None:
84
84
  """Saves the example to the checkpoint file."""
85
- if example.has_error:
86
- experiment.warning(
87
- f'Example {example.id} has error. Skipping checkpointing.'
88
- )
89
- elif example.newly_processed:
85
+ if example.newly_processed:
90
86
  self._save_example(runner, experiment, example)
91
87
 
92
88
  def _load_experiment(
@@ -101,12 +101,15 @@ class PerExampleCheckpointerTest(CheckpointerTest):
101
101
  self.assertIn(i + 1, collector.examples)
102
102
  example = collector.examples[i + 1]
103
103
  ckpt = run.output_path_for(leaf, f'checkpoint_{example.id}.jsonl')
104
- if example.has_error:
105
- self.assertFalse(pg.io.path_exists(ckpt))
106
- else:
107
- self.assertTrue(pg.io.path_exists(ckpt))
108
- with pg.io.open_sequence(ckpt) as f:
109
- self.assertEqual(len(list(iter(f))), 1)
104
+ self.assertTrue(pg.io.path_exists(ckpt))
105
+ with pg.io.open_sequence(ckpt) as f:
106
+ examples_from_ckpt = list(iter(f))
107
+ # `eval_test_helper.test_experiment` has two TestEvaluation with
108
+ # llm offset 0, which makes that evaluation run twice.
109
+ if example.has_error and leaf.lm.offset == 0:
110
+ self.assertEqual(len(examples_from_ckpt), 2)
111
+ else:
112
+ self.assertEqual(len(examples_from_ckpt), 1)
110
113
  if leaf.id not in num_processed:
111
114
  self.assertEqual(leaf.progress.num_skipped, 0)
112
115
  num_processed[leaf.id] = leaf.progress.num_processed
@@ -222,7 +225,10 @@ class BulkCheckpointerTest(CheckpointerTest):
222
225
  with pg.io.open_sequence(ckpt) as f:
223
226
  self.assertEqual(
224
227
  len(list(iter(f))),
225
- leaf.progress.num_completed - leaf.progress.num_failed
228
+ # `eval_test_helper.test_experiment` has two TestEvaluation with
229
+ # llm offset 0, which makes that evaluation run twice.
230
+ leaf.progress.num_completed
231
+ + (leaf.progress.num_failed if leaf.lm.offset == 0 else 0)
226
232
  )
227
233
  if leaf.id not in num_processed:
228
234
  self.assertEqual(leaf.progress.num_skipped, 0)
@@ -149,12 +149,15 @@ class Evaluation(experiment_lib.Experiment):
149
149
  self,
150
150
  example: example_lib.Example | int,
151
151
  raise_if_has_error: bool = False,
152
+ reevaluate_upon_previous_errors: bool = True,
152
153
  ) -> example_lib.Example:
153
154
  """Evaluates a single example input.
154
155
 
155
156
  Args:
156
157
  example: An example ID or an example object with ID.
157
158
  raise_if_has_error: Whether to raise an error if the example has error.
159
+ reevaluate_upon_previous_errors: Whether to reevaluate the example if
160
+ the previous checkpointed run has error.
158
161
 
159
162
  Returns:
160
163
  The evaluated example with the output and metric metadata populated.
@@ -169,7 +172,9 @@ class Evaluation(experiment_lib.Experiment):
169
172
  checkpointed = self._state.ckpt_example(example.id)
170
173
 
171
174
  with pg.timeit('evaluate') as timeit, lf.track_usages() as usage_summary:
172
- if checkpointed is None or checkpointed.has_error:
175
+ if checkpointed is None or (
176
+ reevaluate_upon_previous_errors and checkpointed.has_error
177
+ ):
173
178
  if checkpointed is None:
174
179
  self.info(
175
180
  f'Example {example.id} is being processed for the first time '
@@ -184,17 +189,27 @@ class Evaluation(experiment_lib.Experiment):
184
189
  self._state.update(example, in_progress=True)
185
190
  self._process(example, raise_if_has_error=raise_if_has_error)
186
191
  else:
187
- self.info(
188
- f'Example {example.id} skipped processing as prior run '
189
- 'is available and error free.'
190
- )
192
+ if checkpointed.has_error:
193
+ self.warning(
194
+ f'Example {example.id} skipped processing despite its checkpoint '
195
+ f'has error: {checkpointed.error}, as '
196
+ '`reevaluate_upon_previous_errors` is set to False.'
197
+ )
198
+ else:
199
+ self.info(
200
+ f'Example {example.id} skipped processing as prior run '
201
+ 'is available and error free.'
202
+ )
191
203
  example.start_time = checkpointed.start_time
192
204
  self._state.update(example, in_progress=True)
193
205
 
194
206
  # Use the output and metadata obtained from the previous processing.
195
207
  example.output = checkpointed.output
196
208
  example.metadata = checkpointed.metadata
209
+ example.error = checkpointed.error
197
210
  example.newly_processed = False
211
+ example.execution_status = checkpointed.execution_status
212
+ example.end_time = checkpointed.end_time
198
213
 
199
214
  # For previously processed examples, we merge previous usages as
200
215
  # cached, so the usage summary will account previous usages, but as
@@ -128,10 +128,17 @@ class EvaluationTest(unittest.TestCase):
128
128
  self.assertEqual(len(exp.state.evaluation_status), 1)
129
129
  f.add(pg.to_json_str(example))
130
130
 
131
+ example2 = exp.evaluate(6)
132
+ self.assertTrue(example2.newly_processed)
133
+ self.assertEqual(example2.input, pg.Dict(x=5, y=25, groundtruth=30))
134
+ self.assertTrue(example2.has_error)
135
+ self.assertEqual(len(exp.state.evaluation_status), 2)
136
+ f.add(pg.to_json_str(example2))
137
+
131
138
  exp.reset()
132
139
  self.assertEqual(len(exp.state.ckpt_examples), 0)
133
140
  exp.load_state(state_file)
134
- self.assertEqual(len(exp.state.ckpt_examples), 1)
141
+ self.assertEqual(len(exp.state.ckpt_examples), 2)
135
142
  self.assertEqual(len(exp.state.evaluation_status), 0)
136
143
  example = exp.evaluate(3)
137
144
  self.assertFalse(example.newly_processed)
@@ -142,6 +149,19 @@ class EvaluationTest(unittest.TestCase):
142
149
  self.assertEqual(example.usage_summary.cached.total.num_requests, 1)
143
150
  self.assertEqual(example.usage_summary.uncached.total.total_tokens, 0)
144
151
  self.assertEqual(example.usage_summary.uncached.total.num_requests, 0)
152
+ self.assertIsNotNone(example.start_time)
153
+ self.assertIsNotNone(example.end_time)
154
+ self.assertIn('evaluate.process', example.execution_status)
155
+ self.assertIn('evaluate.metric', example.execution_status)
156
+
157
+ example2 = exp.evaluate(6, reevaluate_upon_previous_errors=False)
158
+ self.assertFalse(example2.newly_processed)
159
+ self.assertEqual(example2.input, pg.Dict(x=5, y=25, groundtruth=30))
160
+ self.assertTrue(example2.has_error)
161
+ self.assertIsNotNone(example2.start_time)
162
+ self.assertIsNotNone(example2.end_time)
163
+ self.assertIn('evaluate.process', example2.execution_status)
164
+ self.assertIn('evaluate.metric', example2.execution_status)
145
165
 
146
166
  # Test load_state with filter.
147
167
  exp.reset()
@@ -384,6 +384,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
384
384
  example_ids: list[int] | None = None,
385
385
  shuffle_inputs: bool = False,
386
386
  raise_if_has_error: bool = False,
387
+ reevaluate_upon_previous_errors: bool = True,
387
388
  reprocess: bool | list[int] = False,
388
389
  generate_example_html: Literal['new', 'all', 'no'] | list[int] = 'new',
389
390
  process_timeout: int | None = None,
@@ -436,6 +437,9 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
436
437
  Neverthless, the example ID remains unchanged for each example.
437
438
  raise_if_has_error: If True, it will raise an error if any example fails.
438
439
  Otherwise, it will continue and report the error in the output.
440
+ reevaluate_upon_previous_errors: If True, it will reevaluate the example
441
+ if the previous checkpointed run has error when experiment runs with
442
+ existing checkpoints.
439
443
  reprocess: A boolean or a list of example IDs. If boolean, it indicates
440
444
  that whether all the examples to be evaluated will be reprocessed,
441
445
  meaning that existing checkpoints will be ignored. If a list of
@@ -475,6 +479,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):
475
479
  example_ids=example_ids,
476
480
  shuffle_inputs=shuffle_inputs,
477
481
  raise_if_has_error=raise_if_has_error,
482
+ reevaluate_upon_previous_errors=reevaluate_upon_previous_errors,
478
483
  reprocess=reprocess,
479
484
  generate_example_html=generate_example_html,
480
485
  use_cache=use_cache,
@@ -835,6 +840,14 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
835
840
  )
836
841
  ] = False
837
842
 
843
+ reevaluate_upon_previous_errors: Annotated[
844
+ bool,
845
+ (
846
+ 'If True, it will reevaluate the example if the previously '
847
+ 'checkpointed run has error. Otherwise, it will skip the example.'
848
+ )
849
+ ] = True
850
+
838
851
  note: Annotated[
839
852
  str | None,
840
853
  'The user note for the current run.'
@@ -384,6 +384,7 @@ class RunnerBase(Runner):
384
384
  item = evaluation.evaluate(
385
385
  item,
386
386
  raise_if_has_error=self.current_run.raise_if_has_error,
387
+ reevaluate_upon_previous_errors=self.current_run.reevaluate_upon_previous_errors,
387
388
  )
388
389
  self.on_example_complete(evaluation, item)
389
390
  return item
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: langfun
3
- Version: 0.1.2.dev202508060805
3
+ Version: 0.1.2.dev202508070805
4
4
  Summary: Langfun: Language as Functions.
5
5
  Home-page: https://github.com/google/langfun
6
6
  Author: Langfun Authors
@@ -69,14 +69,14 @@ langfun/core/eval/patching_test.py,sha256=8kCd54Egjju22FMgtJuxEsrXkW8ifs-UUBHtrC
69
69
  langfun/core/eval/scoring.py,sha256=_DvnlgI1SdRVaOojao_AkV3pnenfCPOqyhvlg-Sw-5M,6322
70
70
  langfun/core/eval/scoring_test.py,sha256=UcBH0R6vAovZ0A4yM22s5cBHL1qVKASubrbu1t8dYBw,4529
71
71
  langfun/core/eval/v2/__init__.py,sha256=9lNKJwbvl0lcFblAXYT_OHI8fOubJsTOdSkxEqsP1xU,1726
72
- langfun/core/eval/v2/checkpointing.py,sha256=t47rBfzGZYgIqWW1N1Ak9yQnNtHd-IRbEO0cZjG2VRo,11755
73
- langfun/core/eval/v2/checkpointing_test.py,sha256=cuQ1zom5DMXIebxYW6L3N5XRyhfoEEDrs7XQcAxg8Nc,9164
72
+ langfun/core/eval/v2/checkpointing.py,sha256=0ll04cCH3WAXoU2ocoSE8GaSWVCw9JERcnuvUiHRSBU,11624
73
+ langfun/core/eval/v2/checkpointing_test.py,sha256=NZo-yt3j5bUF-yPVfHGFBVGWZhhpk5xaT-6fsuIQUSs,9571
74
74
  langfun/core/eval/v2/eval_test_helper.py,sha256=sKFi_wPYCNmr96WyTduuXY0KnxjFxcJyEhXey-_nGX8,3962
75
- langfun/core/eval/v2/evaluation.py,sha256=ihT5dljnUkHM97XS9OwE2wOnYC-oYnHYgG5KN1hmiaU,27037
76
- langfun/core/eval/v2/evaluation_test.py,sha256=46bGjNZmd57NXcJSoaC17DO9B74rpVBOVTEln_4W61c,6916
75
+ langfun/core/eval/v2/evaluation.py,sha256=Oiqjx7rL0Fql6OaYifiXlgZX5tzz1uAs88GydjJPDI8,27722
76
+ langfun/core/eval/v2/evaluation_test.py,sha256=GdB-CexNxQSNZoXqpyGNLdHzq0D9vXqgwAJ13iaAOO0,7878
77
77
  langfun/core/eval/v2/example.py,sha256=v1dIz89pccIqujt7utrk0EbqMWM9kBn-2fYGRTKe358,10890
78
78
  langfun/core/eval/v2/example_test.py,sha256=wsHQD6te7ghROmxe3Xg_NK4TU0xS2MkNfnpo-H0H8xM,3399
79
- langfun/core/eval/v2/experiment.py,sha256=fb3RHNOSRftV7ZTBfYVV50iEevqdPwRHCt3mgtLzuFw,33408
79
+ langfun/core/eval/v2/experiment.py,sha256=Y6a-Qkb4AwWWmpOVGyYiK29feuQ3u2t6SWs5mXF2xjM,33957
80
80
  langfun/core/eval/v2/experiment_test.py,sha256=BYrPYfQfU2jDfAlZcHDT0KUaXOnCnyTWyUEKhDoqXfw,13645
81
81
  langfun/core/eval/v2/metric_values.py,sha256=_B905bC-jxrYPLSEcP2M8MaHZOVMz_bVrUw8YC4arCE,4660
82
82
  langfun/core/eval/v2/metric_values_test.py,sha256=ab2oF_HsIwrSy459108ggyjgefHSPn8UVILR4dRwx14,2634
@@ -88,7 +88,7 @@ langfun/core/eval/v2/progress_tracking.py,sha256=zNhNPGlnJnHELEfFpbTMCSXFn8d1IJ5
88
88
  langfun/core/eval/v2/progress_tracking_test.py,sha256=sJhlVfinGsg3Kf2wQ_hT7VMcpQfaI4ZkqyW9ujElkwA,2282
89
89
  langfun/core/eval/v2/reporting.py,sha256=yUIPCAMnp7InIzpv1DDWrcLO-75iiOUTpscj7smkfrA,8335
90
90
  langfun/core/eval/v2/reporting_test.py,sha256=CMK-vwho8cNRJwlbkCqm_v5fykE7Y3V6SaIOCY0CDyA,5671
91
- langfun/core/eval/v2/runners.py,sha256=iqbH4jMtnNMhfuv1eHaxJmk1Vvsrz-sAJJFP8U44-tA,16758
91
+ langfun/core/eval/v2/runners.py,sha256=bEniZDNu44AQgvqpwLsvBU4V_7WltAe-NPhYgIsLj1E,16848
92
92
  langfun/core/eval/v2/runners_test.py,sha256=spjkmqlls_vyERdZMdjv6dhIN9ZfxsDDvIQAWTj2kMk,11954
93
93
  langfun/core/llms/__init__.py,sha256=CtxUdXohQ8AQk1DqBT6MBy2zdAoPSggNo00SYrj9-AY,9521
94
94
  langfun/core/llms/anthropic.py,sha256=YcQ2VG8iOfXtry_tTpAukmiwXa2hK_9LkpkmXk41Nm0,26226
@@ -165,8 +165,8 @@ langfun/core/templates/demonstration.py,sha256=vCrgYubdZM5Umqcgp8NUVGXgr4P_c-fik
165
165
  langfun/core/templates/demonstration_test.py,sha256=SafcDQ0WgI7pw05EmPI2S4v1t3ABKzup8jReCljHeK4,2162
166
166
  langfun/core/templates/selfplay.py,sha256=yhgrJbiYwq47TgzThmHrDQTF4nDrTI09CWGhuQPNv-s,2273
167
167
  langfun/core/templates/selfplay_test.py,sha256=Ot__1P1M8oJfoTp-M9-PQ6HUXqZKyMwvZ5f7yQ3yfyM,2326
168
- langfun-0.1.2.dev202508060805.dist-info/licenses/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
169
- langfun-0.1.2.dev202508060805.dist-info/METADATA,sha256=Ksl1Bg7ntBn1YKRL9H-6pP1FTl-oeJutAv7PVvgjg-g,7380
170
- langfun-0.1.2.dev202508060805.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
- langfun-0.1.2.dev202508060805.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
172
- langfun-0.1.2.dev202508060805.dist-info/RECORD,,
168
+ langfun-0.1.2.dev202508070805.dist-info/licenses/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
169
+ langfun-0.1.2.dev202508070805.dist-info/METADATA,sha256=NKc-FcclfdUmowPZSNtHzPwmJZ8DpkdGHN0MiO-LUVM,7380
170
+ langfun-0.1.2.dev202508070805.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
171
+ langfun-0.1.2.dev202508070805.dist-info/top_level.txt,sha256=RhlEkHxs1qtzmmtWSwYoLVJAc1YrbPtxQ52uh8Z9VvY,8
172
+ langfun-0.1.2.dev202508070805.dist-info/RECORD,,