langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512040805__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. langfun/__init__.py +1 -1
  2. langfun/core/__init__.py +7 -1
  3. langfun/core/agentic/__init__.py +8 -1
  4. langfun/core/agentic/action.py +740 -112
  5. langfun/core/agentic/action_eval.py +9 -2
  6. langfun/core/agentic/action_test.py +189 -24
  7. langfun/core/async_support.py +104 -5
  8. langfun/core/async_support_test.py +23 -0
  9. langfun/core/coding/python/correction.py +19 -9
  10. langfun/core/coding/python/execution.py +14 -12
  11. langfun/core/coding/python/generation.py +21 -16
  12. langfun/core/coding/python/sandboxing.py +23 -3
  13. langfun/core/component.py +42 -3
  14. langfun/core/concurrent.py +70 -6
  15. langfun/core/concurrent_test.py +9 -2
  16. langfun/core/console.py +1 -1
  17. langfun/core/data/conversion/anthropic.py +12 -3
  18. langfun/core/data/conversion/anthropic_test.py +8 -6
  19. langfun/core/data/conversion/gemini.py +11 -2
  20. langfun/core/data/conversion/gemini_test.py +48 -9
  21. langfun/core/data/conversion/openai.py +145 -31
  22. langfun/core/data/conversion/openai_test.py +161 -17
  23. langfun/core/eval/base.py +48 -44
  24. langfun/core/eval/base_test.py +5 -5
  25. langfun/core/eval/matching.py +5 -2
  26. langfun/core/eval/patching.py +3 -3
  27. langfun/core/eval/scoring.py +4 -3
  28. langfun/core/eval/v2/__init__.py +2 -0
  29. langfun/core/eval/v2/checkpointing.py +76 -7
  30. langfun/core/eval/v2/checkpointing_test.py +9 -2
  31. langfun/core/eval/v2/config_saver.py +37 -0
  32. langfun/core/eval/v2/config_saver_test.py +36 -0
  33. langfun/core/eval/v2/eval_test_helper.py +104 -3
  34. langfun/core/eval/v2/evaluation.py +92 -17
  35. langfun/core/eval/v2/evaluation_test.py +9 -3
  36. langfun/core/eval/v2/example.py +50 -40
  37. langfun/core/eval/v2/example_test.py +16 -8
  38. langfun/core/eval/v2/experiment.py +84 -15
  39. langfun/core/eval/v2/experiment_test.py +19 -0
  40. langfun/core/eval/v2/metric_values.py +31 -3
  41. langfun/core/eval/v2/metric_values_test.py +32 -0
  42. langfun/core/eval/v2/metrics.py +157 -44
  43. langfun/core/eval/v2/metrics_test.py +39 -18
  44. langfun/core/eval/v2/progress.py +31 -1
  45. langfun/core/eval/v2/progress_test.py +27 -0
  46. langfun/core/eval/v2/progress_tracking.py +13 -5
  47. langfun/core/eval/v2/progress_tracking_test.py +9 -1
  48. langfun/core/eval/v2/reporting.py +90 -71
  49. langfun/core/eval/v2/reporting_test.py +24 -6
  50. langfun/core/eval/v2/runners/__init__.py +30 -0
  51. langfun/core/eval/v2/{runners.py → runners/base.py} +72 -180
  52. langfun/core/eval/v2/runners/beam.py +354 -0
  53. langfun/core/eval/v2/runners/beam_test.py +153 -0
  54. langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
  55. langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
  56. langfun/core/eval/v2/runners/debug.py +40 -0
  57. langfun/core/eval/v2/runners/debug_test.py +76 -0
  58. langfun/core/eval/v2/runners/parallel.py +243 -0
  59. langfun/core/eval/v2/runners/parallel_test.py +182 -0
  60. langfun/core/eval/v2/runners/sequential.py +47 -0
  61. langfun/core/eval/v2/runners/sequential_test.py +169 -0
  62. langfun/core/langfunc.py +45 -130
  63. langfun/core/langfunc_test.py +7 -5
  64. langfun/core/language_model.py +189 -36
  65. langfun/core/language_model_test.py +54 -3
  66. langfun/core/llms/__init__.py +12 -1
  67. langfun/core/llms/anthropic.py +157 -2
  68. langfun/core/llms/azure_openai.py +29 -17
  69. langfun/core/llms/cache/base.py +25 -3
  70. langfun/core/llms/cache/in_memory.py +48 -7
  71. langfun/core/llms/cache/in_memory_test.py +14 -4
  72. langfun/core/llms/compositional.py +25 -1
  73. langfun/core/llms/deepseek.py +30 -2
  74. langfun/core/llms/fake.py +32 -1
  75. langfun/core/llms/gemini.py +64 -12
  76. langfun/core/llms/gemini_test.py +110 -0
  77. langfun/core/llms/google_genai.py +34 -1
  78. langfun/core/llms/groq.py +28 -3
  79. langfun/core/llms/llama_cpp.py +23 -4
  80. langfun/core/llms/openai.py +120 -3
  81. langfun/core/llms/openai_compatible.py +148 -27
  82. langfun/core/llms/openai_compatible_test.py +207 -20
  83. langfun/core/llms/openai_test.py +0 -2
  84. langfun/core/llms/rest.py +16 -1
  85. langfun/core/llms/vertexai.py +58 -8
  86. langfun/core/logging.py +1 -1
  87. langfun/core/mcp/__init__.py +10 -0
  88. langfun/core/mcp/client.py +177 -0
  89. langfun/core/mcp/client_test.py +71 -0
  90. langfun/core/mcp/session.py +241 -0
  91. langfun/core/mcp/session_test.py +54 -0
  92. langfun/core/mcp/testing/simple_mcp_client.py +33 -0
  93. langfun/core/mcp/testing/simple_mcp_server.py +33 -0
  94. langfun/core/mcp/tool.py +254 -0
  95. langfun/core/mcp/tool_test.py +197 -0
  96. langfun/core/memory.py +1 -0
  97. langfun/core/message.py +160 -55
  98. langfun/core/message_test.py +65 -81
  99. langfun/core/modalities/__init__.py +8 -0
  100. langfun/core/modalities/audio.py +21 -1
  101. langfun/core/modalities/image.py +73 -3
  102. langfun/core/modalities/image_test.py +116 -0
  103. langfun/core/modalities/mime.py +64 -3
  104. langfun/core/modalities/mime_test.py +11 -0
  105. langfun/core/modalities/pdf.py +19 -1
  106. langfun/core/modalities/video.py +21 -1
  107. langfun/core/modality.py +167 -29
  108. langfun/core/modality_test.py +42 -12
  109. langfun/core/natural_language.py +1 -1
  110. langfun/core/sampling.py +4 -4
  111. langfun/core/sampling_test.py +20 -4
  112. langfun/core/structured/__init__.py +2 -24
  113. langfun/core/structured/completion.py +34 -44
  114. langfun/core/structured/completion_test.py +23 -43
  115. langfun/core/structured/description.py +54 -50
  116. langfun/core/structured/function_generation.py +29 -12
  117. langfun/core/structured/mapping.py +81 -37
  118. langfun/core/structured/parsing.py +95 -79
  119. langfun/core/structured/parsing_test.py +0 -3
  120. langfun/core/structured/querying.py +230 -154
  121. langfun/core/structured/querying_test.py +69 -33
  122. langfun/core/structured/schema/__init__.py +49 -0
  123. langfun/core/structured/schema/base.py +664 -0
  124. langfun/core/structured/schema/base_test.py +531 -0
  125. langfun/core/structured/schema/json.py +174 -0
  126. langfun/core/structured/schema/json_test.py +121 -0
  127. langfun/core/structured/schema/python.py +316 -0
  128. langfun/core/structured/schema/python_test.py +410 -0
  129. langfun/core/structured/schema_generation.py +33 -14
  130. langfun/core/structured/scoring.py +47 -36
  131. langfun/core/structured/tokenization.py +26 -11
  132. langfun/core/subscription.py +2 -2
  133. langfun/core/template.py +175 -50
  134. langfun/core/template_test.py +123 -17
  135. langfun/env/__init__.py +43 -0
  136. langfun/env/base_environment.py +827 -0
  137. langfun/env/base_environment_test.py +473 -0
  138. langfun/env/base_feature.py +304 -0
  139. langfun/env/base_feature_test.py +228 -0
  140. langfun/env/base_sandbox.py +842 -0
  141. langfun/env/base_sandbox_test.py +1235 -0
  142. langfun/env/event_handlers/__init__.py +14 -0
  143. langfun/env/event_handlers/chain.py +233 -0
  144. langfun/env/event_handlers/chain_test.py +253 -0
  145. langfun/env/event_handlers/event_logger.py +472 -0
  146. langfun/env/event_handlers/event_logger_test.py +304 -0
  147. langfun/env/event_handlers/metric_writer.py +726 -0
  148. langfun/env/event_handlers/metric_writer_test.py +214 -0
  149. langfun/env/interface.py +1640 -0
  150. langfun/env/interface_test.py +153 -0
  151. langfun/env/load_balancers.py +59 -0
  152. langfun/env/load_balancers_test.py +141 -0
  153. langfun/env/test_utils.py +507 -0
  154. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/METADATA +7 -3
  155. langfun-0.1.2.dev202512040805.dist-info/RECORD +217 -0
  156. langfun/core/eval/v2/runners_test.py +0 -343
  157. langfun/core/structured/schema.py +0 -987
  158. langfun/core/structured/schema_test.py +0 -982
  159. langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
  160. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/WHEEL +0 -0
  161. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/licenses/LICENSE +0 -0
  162. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py CHANGED
@@ -59,18 +59,20 @@ class Evaluable(lf.Component):
  @property
  @abc.abstractmethod
  def id(self) -> str:
- """Returns the ID of the task.
+ """Returns the ID of this evaluable node.

  Returns:
- Evaluation task ID. Different evaluation task should have their unique
- task IDs, for each task will be stored in sub-directoreis identified by
- their IDs. For suites, the ID could be an empty string as they will not
- produce sub-directories
+ A string as the ID of this evaluable node.
+ If an evaluable node acts as a container for other evaluable nodes
+ (e.g. `lf.Suite`), its ID could be empty.
+ Leaf evaluable nodes (e.g. `lf.Evaluation`) must have unique IDs
+ under the same container, as their IDs will be used as the directory
+ name for saving their results.
  """

  @property
  def dir(self) -> str | None:
- """Returns the directory for saving results and details."""
+ """Returns the directory for saving results."""
  if self.root_dir is None:
  return None
  return os.path.join(self.root_dir, self.id)
@@ -82,18 +84,18 @@ class Evaluable(lf.Component):

  @property
  def index_link(self) -> str | None:
- """Returns the index page."""
+ """Returns the link to the index page."""
  if self.dir is None:
  return None
  return self.link(os.path.join(self.dir, Evaluable.INDEX_HTML))

  def summary(self, pivot_field: str = 'lm') -> 'Summary':
- """Returns a summary for all child evaluations.."""
+ """Returns a summary for all child evaluations."""
  return Summary([pg.Ref(x) for x in self.leaf_nodes], pivot_field)

  @property
  def summary_link(self) -> str | None:
- """Returns the summary page."""
+ """Returns the link to the summary page."""
  if self.root_dir is None:
  return None
  return self.link(os.path.join(self.root_dir, Evaluable.SUMMARY_HTML))
@@ -177,6 +179,7 @@ class Evaluable(lf.Component):

  @property
  def is_leaf(self) -> bool:
+ """Returns whether this node is a leaf node."""
  return isinstance(self, Evaluation) and not self.children

  @functools.cached_property
@@ -404,7 +407,7 @@ class Evaluable(lf.Component):
  timeout: int | None = None,
  **kwargs,
  ) -> None:
- """Run the evaluate and fill `self.result`. Subclass to implement."""
+ """Run the evaluation and fill `self.result`. Subclass to implement."""

  @abc.abstractmethod
  def _completion_status(self, run_status: str) -> str:
@@ -545,6 +548,7 @@ class Evaluable(lf.Component):
  def from_dir(
  cls, maybe_dir: str, load_result: bool = True
  ) -> Optional['Evaluable']:
+ """Loads an evaluable object from a directory."""
  exp_json = os.path.join(maybe_dir, Evaluable.EXPERIMENT_JSON)
  if not pg.io.path_exists(exp_json):
  return None
@@ -558,7 +562,7 @@ class Evaluable(lf.Component):
  return experiment

  def try_load_result(self) -> bool:
- """Try load result."""
+ """Try loads result from file if it's not loaded."""
  if self.result is None:
  result_json = os.path.join(self.dir, Evaluable.RESULT_JSON)
  if pg.io.path_exists(result_json):
@@ -595,7 +599,7 @@ class Suite(Evaluable):
  def _on_bound(self):
  super()._on_bound()
  overrides = {
- k: v for k, v in self.sym_init_args.items()
+ k: v for k, v in self.sym_init_args.sym_items()
  if k not in ('id', 'children')
  }
  for child in self.children:
@@ -604,6 +608,7 @@ class Suite(Evaluable):

  @functools.cached_property
  def hash(self) -> str:
+ """Returns the hash of this suite."""
  return hashlib.md5(
  ' '.join(sorted([c.hash for c in self.children])).encode()
  ).hexdigest()[:8]
@@ -619,14 +624,14 @@ class Suite(Evaluable):


  class Evaluation(Evaluable):
- """Base class for evaluation set."""
+ """Base class for evaluation sets."""

  inputs: pg.typing.Annotated[
  pg.typing.Functor(),
  (
  'A functor that returns a list of user-defined objects as the input '
- 'examples. It could be inputs loaded from a JSON file via '
- '`lf.eval.inputs_from(path)`, from a Python coded list via '
+ 'examples. It can be inputs loaded from a JSON file via '
+ '`lf.eval.inputs_from(path)`, from a Python-coded list via '
  '`lf.eval.as_inputs(values)` or a user-defined functor that '
  'generates input objects at runtime.'
  ),
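For context, a minimal sketch of building the `inputs` functor described in the docstring above. The helper names come from the docstring itself; the file path and the example record are hypothetical:

```python
import langfun as lf

# Inputs may be loaded from a JSON file...
inputs = lf.eval.inputs_from('/path/to/examples.json')        # hypothetical path
# ...or built from an in-memory, Python-coded list.
inputs = lf.eval.as_inputs([{'question': 'What is 1 + 1?'}])  # hypothetical record
```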
@@ -648,12 +653,12 @@ class Evaluation(Evaluable):
  pg.typing.Functor().noneable(),
  (
  'A functor that returns a type annotation that will be converted to '
- '`lf.Schema`, or a tuple of (annotation, fewshot examples). '
+ '`lf.Schema`, or a tuple of (annotation, few-shot examples). '
  'For "call" method, it could be None, indicating that the raw '
- 'response from the LM will be used as the output, and the fewshot '
- 'examples will be used for parsing. For "query" and "complete", it '
- 'must be provided, and the fewshot examples will be used directly '
- 'for prompting. Here are the example code on how the '
+ 'response from the LM will be used as the output, and the few-shot '
+ 'examples will be used for parsing. For "query" and "complete" '
+ 'methods, it must be provided, and the few-shot examples will be '
+ 'used directly for prompting. Here is example code on how the '
  'functors should be defined:'
  + inspect.cleandoc("""
  ```
@@ -693,7 +698,7 @@ class Evaluation(Evaluable):
  completion_prompt_field: Annotated[
  str | None,
  (
- 'A str field that will be automatically added to the class of the '
+ 'A string field that will be automatically added to the class of the '
  'input object for `lf.complete`. If None, no field will be added to '
  'the class, instead the prompt will be passed as the first argument '
  'of the input object to complete. Applicable only when `method` is '
@@ -738,7 +743,7 @@ class Evaluation(Evaluable):

  @functools.cached_property
  def hash(self) -> str:
- """Returns the semantic-based hash of the evaluation."""
+ """Returns the semantics-based hash of the evaluation."""
  if self.is_deterministic:
  identity = pg.format(self._identifiers(), compact=True)
  else:
@@ -784,7 +789,7 @@ class Evaluation(Evaluable):

  @property
  def complete_rate(self) -> float:
- """Returns the complete rate."""
+ """Returns the completion rate of examples."""
  return self.num_completed / self.num_examples

  #
@@ -837,7 +842,7 @@ class Evaluation(Evaluable):

  @functools.cached_property
  def non_oop_failures(self) -> list[tuple[Any, Exception]]:
- """Returns the OOP failures."""
+ """Returns the non-OOP failures."""
  return [item for item in self.failures
  if not isinstance(item[1], lf_structured.MappingError)]

@@ -883,7 +888,7 @@ class Evaluation(Evaluable):

  @functools.cached_property
  def schema(self) -> lf_structured.Schema | None:
- """Schema."""
+ """Returns the schema for parsing LLM response."""
  if self.schema_fn is None:
  return None

@@ -897,7 +902,7 @@ class Evaluation(Evaluable):

  @functools.cached_property
  def fewshot_examples(self) -> list[lf.structured.MappingExample] | None:
- """Fewshot examples."""
+ """Returns the few-shot examples for prompting or parsing."""
  if self.schema_fn is None:
  return None

@@ -973,7 +978,7 @@ class Evaluation(Evaluable):

  @functools.cached_property
  def children(self) -> list['Evaluation']:
- """Returns the trials as child evaluations if this evaluation is a space."""
+ """Returns child evaluations if this evaluation has a parameter space."""
  if self.is_deterministic:
  return []
  children = []
@@ -1023,7 +1028,7 @@ class Evaluation(Evaluable):

  @property
  def non_oop_failures_link(self) -> str | None:
- """Returns the link to then non-OOP failures page."""
+ """Returns the link to the non-OOP failures page."""
  if self.dir is None:
  return None
  return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))
@@ -1208,10 +1213,10 @@ class Evaluation(Evaluable):
  )

  def process_output(self, example: Any, output: lf.Message) -> None:
- """Process the output for an example.
+ """Processes the output for an example.

  Subclasses can override this method to generate and attach additional
- metadata for debugging purpose. For example, draw bounding boxes on the
+ metadata for debugging purposes. For example, draw bounding boxes on the
  input image based on LLM predicted boxes and attach to output_message's
  metadata.

@@ -1219,8 +1224,8 @@ class Evaluation(Evaluable):

  class BoundingBoxEval(lf.eval.Matching):
  ...
- def process_output(example, output):
- output.metadata.image_with_bbox = draw_bboxes(
+ def process_output(self, example, output):
+ output.metadata.image_with_bbox = draw_bounding_box(
  example.image, output.result)

  Args:
@@ -1449,7 +1454,7 @@ class Evaluation(Evaluable):
  trace the LM input, response and parsed structure. If error is raised
  before LLM could return a response, None will be its value.
  error: The exception during processing the example.
- dryrun: Whether or not audition takes place during dryrun.
+ dryrun: Whether or not auditing takes place during dryrun.
  """
  if error is not None:
  self._failures.append((example, error))
@@ -1557,7 +1562,7 @@ class Evaluation(Evaluable):
  f'style="color:darkgray">{_html_repr(self.prompt)}</td>'
  )
  # Schema.
- schema_title = self.schema.schema_str('python') if self.schema else None
+ schema_title = self.schema.schema_repr('python') if self.schema else None
  s.write(
  '<td style="color:purple" '
  f'title="{schema_title}">'
@@ -1674,7 +1679,7 @@ class Evaluation(Evaluable):

  @classmethod
  def visualize(cls, evaluations: list['Evaluation']) -> str | None:
- """Visualize the a list of evaluations of this task in HTML."""
+ """Visualize a list of evaluations of this task in HTML."""
  del evaluations
  return None

@@ -1810,7 +1815,7 @@ class Summary(pg.Object):
  )

  class Table(pg.Object):
- """A pivot table for view evaluations."""
+ """A pivot table for viewing evaluations."""

  class Row(pg.Object):
  descriptor: dict[str, Any]
@@ -2013,12 +2018,12 @@ class Summary(pg.Object):
  return self._context.completed

  def stop(self) -> 'Summary':
- """Signal and wait the monitor thread to stop."""
+ """Signals and waits for the monitor thread to stop."""
  self._context.stopping = True
  return self.join()

  def join(self) -> 'Summary':
- """Waits the monitor thread to complete."""
+ """Waits for the monitor thread to complete."""
  self._thread.join()
  summary = self.summary
  assert summary is not None
@@ -2035,7 +2040,7 @@ class Summary(pg.Object):
  scan_interval: int = 60,
  refresh_when_stop: bool = True,
  ) -> MonitorResult:
- """Monitor one or more root directories and save summary in period."""
+ """Monitors one or more root directories and save summary periodically."""
  context = pg.Dict(stopping=False, completed=False, summary=None)

  def _monitor():
@@ -2187,7 +2192,7 @@ def monitor_async(
  scan_interval: int = 60,
  refresh_when_stop: bool = True,
  ) -> Summary.MonitorResult:
- """Asynchronorsly monitor one or more root directories for summary."""
+ """Asynchronously monitors one or more root directories for summary."""
  return Summary.monitor_async(
  root_dir,
  save_as,
@@ -2365,10 +2370,9 @@ def run(
  a string (for string-based patcher), a `pg.patching.Patcher` object, or
  a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
  details.
- mode: The mode to run the suite. "run" to run the suite, with reusing
- existing results if available; "rerun" to rerun all evaluations even if
- there are existing results; "dryrun" to dryrun the suite; and "noop"
- to do nothing.
+ mode: The mode to run the suite: "run" to run with reuse of existing
+ results, "rerun" to force re-evaluation, "dryrun" for a dry run, and
+ "noop" to do nothing.
  debug: Whether to run in debug mode.
  print_definition: Whether to print the experiment definition.
  **kwargs: Additional arguments to be passed to dryrun/run the suite.
langfun/core/eval/base_test.py CHANGED
@@ -101,7 +101,7 @@ class EvaluationTest(unittest.TestCase):
  self.assertEqual(s.dir, os.path.join(s.root_dir, s.id))
  self.assertEqual(s.hash, s.clone().hash)
  # Test persistent hash.
- self.assertEqual(s.hash, 'e43392e4')
+ self.assertEqual(s.hash, '4dfe486a')
  self.assertEqual(
  s.hash, s.clone(override={'max_workers': 2, 'lm.timeout': 20}).hash
  )
@@ -211,7 +211,7 @@ class EvaluationTest(unittest.TestCase):
  s.result,
  dict(
  experiment_setup=dict(
- id='Evaluation@2fbf1b05',
+ id='Evaluation@e028b6e6',
  dir=s.dir,
  model='StaticSequence',
  prompt_template='{{example.question}}',
@@ -269,7 +269,7 @@ class EvaluationTest(unittest.TestCase):
  s.root_dir, base.Evaluation.SUMMARY_HTML.replace('.html', '.json')
  )
  self.assertTrue(os.path.exists(summary_json))
- summary = pg.load(summary_json, auto_dict=True)
+ summary = pg.load(summary_json, convert_unknown=True)
  self.assertIn('Evaluation', summary)
  self.assertEqual(len(summary['Evaluation']), 1)
  self.assertIsNotNone(summary['Evaluation'][0].experiment)
@@ -376,7 +376,7 @@ class EvaluationTest(unittest.TestCase):
  s.children[0].dir, os.path.join(s.root_dir, s.children[0].id)
  )
  # Test persistent hash.
- self.assertEqual(s.hash, 'de23bf31')
+ self.assertEqual(s.hash, 'fa8f5419')

  summary = s.run(verbose=True)
  self.assertEqual(len(summary.evaluations), 2)
@@ -526,7 +526,7 @@ class SuiteTest(unittest.TestCase):
  lm=lm
  )
  # Test for persistent hash.
- self.assertEqual(s.hash, '1c42f93e')
+ self.assertEqual(s.hash, 'ec3901b8')
  s.run()
  expected = {
  s.children[0].id: dict(
langfun/core/eval/matching.py CHANGED
@@ -38,7 +38,7 @@ class Matching(base.Evaluation):

  @abc.abstractmethod
  def answer(self, output: Any, example: Any) -> Any:
- """Returns the answer from the structure output."""
+ """Returns the answer from the structured output."""

  @property
  def matches(self) -> list[tuple[int, Any, Any, lf.Message]]:
@@ -52,6 +52,7 @@ class Matching(base.Evaluation):

  @property
  def match_rate(self) -> float:
+ """Returns the match rate."""
  if self.num_completed == 0:
  return 0.0
  return self.num_matches / self.num_completed
@@ -68,17 +69,19 @@ class Matching(base.Evaluation):

  @property
  def mismatch_rate(self) -> float:
+ """Returns the mismatch rate."""
  if self.num_completed == 0:
  return 0.0
  return self.num_mismatches / self.num_completed

  @property
  def matches_link(self) -> str:
- """Returns the matches page."""
+ """Returns the link to the matches page."""
  return self.link(os.path.join(self.dir, Matching.MATCHES_HTML))

  @property
  def mismatches_link(self) -> str:
+ """Returns the link to the mismatches page."""
  return self.link(os.path.join(self.dir, Matching.MISMATCHES_HTML))

  def _reset(self) -> None:
langfun/core/eval/patching.py CHANGED
@@ -114,17 +114,17 @@ def model_by_name(name: str) -> lf.LanguageModel:

  @pg.patcher(auto_typing=True)
  def lm(unused_eval, models: list[str]):
- """Patch the LM used for benchmarking."""
+ """Patches the LM used for benchmarking."""
  return patch_lm(pg.oneof([model_by_name(name) for name in models]))


  @pg.patcher(auto_typing=True)
  def temperature(unused_eval, value: float):
- """Patch the temperature used for benchmarking."""
+ """Patches the temperature used for benchmarking."""
  return patch_member(lf.LMSamplingOptions, "temperature", value)


  @pg.patcher(auto_typing=True)
  def max_tokens(unused_eval, value: int | None):
- """Patch the temperature used for benchmarking."""
+ """Patches the max_tokens used for benchmarking."""
  return patch_member(lf.LMSamplingOptions, "max_tokens", value)
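For context, new patchers follow the same pattern as the ones above. A hedged sketch of a hypothetical `top_p` patcher, assuming `lf.LMSamplingOptions` exposes a `top_p` field and reusing the module-level `patch_member` helper seen in this diff:

```python
import langfun as lf
import pyglove as pg

@pg.patcher(auto_typing=True)
def top_p(unused_eval, value: float):
  """Patches the top_p used for benchmarking (hypothetical example)."""
  # `patch_member` is the helper defined in patching.py and used above.
  return patch_member(lf.LMSamplingOptions, "top_p", value)
```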
langfun/core/eval/scoring.py CHANGED
@@ -41,18 +41,19 @@ class Scoring(base.Evaluation):

  @property
  def score_rate(self) -> float:
- """Returns the score rate."""
+ """Returns the rate of scored examples among the completed ones."""
  if self.num_completed == 0:
  return 0.0
  return self.num_scored / self.num_completed

  @property
  def scored_link(self) -> str:
- """Returns the matches page."""
+ """Returns the scored examples page."""
  return self.link(os.path.join(self.dir, Scoring.SCORED_HTML))

  @property
  def avg_score(self) -> float:
+ """Returns the average score of scored examples."""
  if self.num_scored == 0:
  return 0
  return sum([i[2] for i in self._scored]) / self.num_scored
@@ -181,7 +182,7 @@ class Scoring(base.Evaluation):
  super()._render_summary_metrics(s)

  def _render_scored(self, s: io.StringIO) -> None:
- """Formats the matched cases into html."""
+ """Formats the scored cases into html."""
  s.write('<h2> Scored </h2>')
  s.write('<div style="white-space:pre">\n')
  s.write(
langfun/core/eval/v2/__init__.py CHANGED
@@ -35,9 +35,11 @@ from langfun.core.eval.v2.experiment import Runner
  from langfun.core.eval.v2 import runners

  # Plugins
+ from langfun.core.eval.v2.config_saver import RunConfigSaver
  from langfun.core.eval.v2.checkpointing import BulkCheckpointer
  from langfun.core.eval.v2.checkpointing import PerExampleCheckpointer
  from langfun.core.eval.v2.reporting import HtmlReporter
+ from langfun.core.eval.v2.reporting import ExampleHtmlGenerator


  # pylint: enable=g-bad-import-order
langfun/core/eval/v2/checkpointing.py CHANGED
@@ -13,6 +13,8 @@
  # limitations under the License.
  """Checkpointing evaluation runs."""
  import abc
+ import datetime
+ import os
  import re
  import threading
  import traceback
@@ -29,12 +31,32 @@ Runner = experiment_lib.Runner


  class Checkpointer(experiment_lib.Plugin):
- """Base class for checkpointing evaluation examples."""
+ """Base class for checkpointing evaluation examples.
+
+ `Checkpointer` is a plugin that saves the state of processed examples
+ incrementally during an experiment run, allowing the experiment to be resumed
+ later. When an experiment starts, the checkpointer loads any previously saved
+ examples from an earlier run (or a warm-start run) into `experiment.state`,
+ so the runner can skip processing them again.
+ Subclasses should implement `_list_checkpoint_filenames` to identify
+ checkpoint files to load, and `_save_example` to save a newly processed
+ example.
+ """

  checkpoint_filename: Annotated[
  str,
  'Checkpoint file pattern.'
- ] = 'checkpoint.bagz'
+ ] = 'checkpoint.jsonl'
+
+ enable_inprogress_file: Annotated[
+ bool,
+ 'If True, write file "<example_id>.inprogress" when example gets started.'
+ ] = True
+
+ max_ckpt_loading_threads: Annotated[
+ int,
+ 'Max number of workers for loading checkpoint files at startup.'
+ ] = 128

  def on_experiment_start(
  self,
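For illustration, a hedged sketch of configuring the new checkpointer options introduced in this hunk (the values are illustrative; the constructor call mirrors the one in the checkpointing test changes further below):

```python
from langfun.core.eval.v2 import checkpointing

# Illustrative configuration; all three fields are introduced in this diff.
checkpointer = checkpointing.PerExampleCheckpointer(
    'checkpoint.jsonl',              # checkpoint file pattern (new default)
    enable_inprogress_file=True,     # write <example_id>.inprogress when an example starts
    max_ckpt_loading_threads=64,     # workers used to load checkpoint files at startup
)
```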
@@ -75,6 +97,24 @@ class Checkpointer(experiment_lib.Plugin):
  f'scratch. Example IDs: {example_ids_to_evaluate}.'
  )

+ def on_example_start(
+ self,
+ runner: Runner,
+ experiment: Experiment,
+ example: Example,
+ ) -> None:
+ """Saves the example to the checkpoint file."""
+ if self.enable_inprogress_file:
+ def _save_inprogress_file(example: Example):
+ inprogress_file = runner.current_run.output_path_for(
+ experiment, f'{example.id}.inprogress'
+ )
+ pg.io.writefile(
+ inprogress_file,
+ datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+ )
+ runner.background_run(_save_inprogress_file, example)
+
  def on_example_complete(
  self,
  runner: Runner,
@@ -149,7 +189,10 @@ class Checkpointer(experiment_lib.Plugin):

  _ = list(
  lf.concurrent_map(
- _load_state, ckpt_files, max_workers=16, silence_on_errors=None
+ _load_state,
+ ckpt_files,
+ max_workers=self.max_ckpt_loading_threads,
+ silence_on_errors=None
  )
  )

@@ -170,7 +213,12 @@ class Checkpointer(experiment_lib.Plugin):


  class PerExampleCheckpointer(Checkpointer):
- """Checkpointer that saves each example to a separate file."""
+ """Checkpointer that saves each example to a separate file.
+
+ This checkpointer saves each processed example to its own checkpoint file,
+ named using the pattern `<checkpoint_filename_prefix>_<example_id>.<ext>`.
+ For example, `checkpoint_1.bagz`, `checkpoint_2.bagz`, etc.
+ """

  def _on_bound(self):
  super()._on_bound()
@@ -235,7 +283,13 @@ class PerExampleCheckpointer(Checkpointer):


  class BulkCheckpointer(Checkpointer):
- """Checkpointer that saves all examples to a single file."""
+ """Checkpointer that saves all examples of an evaluation to a single file.
+
+ This checkpointer appends newly processed examples of an evaluation to a
+ single sequence file (e.g., `checkpoint.bagz`). This is often more efficient
+ than `PerExampleCheckpointer` when dealing with a large number of examples
+ or when file system overhead is a concern.
+ """

  def _on_bound(self):
  super()._on_bound()
@@ -341,12 +395,26 @@ class BulkCheckpointer(Checkpointer):


  class SequenceWriter:
- """Thread safe sequence writer."""
+ """A thread-safe writer for sequence files (e.g., Bagz) with atomic write.
+
+ `SequenceWriter` wraps a `pg.io.SequenceWriter` to provide thread-safe
+ `add` and `close` operations, ensuring that examples can be written
+ concurrently from multiple threads without corrupting the sequence file.
+ It writes to a temporary file and renames it to target path on `close` to
+ achieve atomic write. If the target path exists, new examples are appended
+ to existing content.
+ """

  def __init__(self, path: str):
  self._lock = threading.Lock()
  self._path = path
- self._sequence_writer = pg.io.open_sequence(path, 'a')
+ basename = os.path.basename(path)
+ self._tmp_path = os.path.join(
+ os.path.dirname(path), f'tmp.{basename}'
+ )
+ if pg.io.path_exists(self._path):
+ pg.io.copy(self._path, self._tmp_path)
+ self._sequence_writer = pg.io.open_sequence(self._tmp_path, 'a')

  @property
  def path(self) -> str:
@@ -371,6 +439,7 @@ class SequenceWriter:
  return
  self._sequence_writer.close()
  self._sequence_writer = None
+ pg.io.rename(self._tmp_path, self._path)

  def __del__(self):
  self.close()
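To make the write-to-temp-then-rename behavior concrete, a hedged usage sketch of `SequenceWriter`. The path is hypothetical, and `add` is assumed to accept a processed example as described in the docstring above:

```python
from langfun.core.eval.v2 import checkpointing

writer = checkpointing.SequenceWriter('/tmp/eval_run/checkpoint.jsonl')  # hypothetical path
try:
  # `example` stands in for a processed evaluation example (hypothetical here);
  # `add` is safe to call concurrently from multiple worker threads.
  writer.add(example)
finally:
  # Closing renames the temporary `tmp.checkpoint.jsonl` onto the target path,
  # so readers only ever observe a complete checkpoint file.
  writer.close()
```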
langfun/core/eval/v2/checkpointing_test.py CHANGED
@@ -65,7 +65,7 @@ class ExampleCollector(experiment_lib.Plugin):
  return self._examples

  def on_example_complete(
- self, runner: runners_lib.Runner,
+ self, runner: experiment_lib.Runner,
  experiment: experiment_lib.Experiment,
  example: example_lib.Example,
  ):
@@ -90,7 +90,10 @@ class PerExampleCheckpointerTest(CheckpointerTest):
  root_dir = os.path.join(tempfile.mkdtemp(), 'per_example_checkpointer')
  experiment = eval_test_helper.test_experiment()
  checkpoint_filename = 'checkpoint.jsonl'
- checkpointer = checkpointing.PerExampleCheckpointer(checkpoint_filename)
+ checkpointer = checkpointing.PerExampleCheckpointer(
+ checkpoint_filename,
+ enable_inprogress_file=True
+ )
  collector = ExampleCollector()
  run = experiment.run(
  root_dir, 'new', runner='sequential', plugins=[checkpointer, collector]
@@ -102,6 +105,10 @@ class PerExampleCheckpointerTest(CheckpointerTest):
  example = collector.examples[i + 1]
  ckpt = run.output_path_for(leaf, f'checkpoint_{example.id}.jsonl')
  self.assertTrue(pg.io.path_exists(ckpt))
+ inprogress_file = run.output_path_for(
+ leaf, f'{example.id}.inprogress'
+ )
+ self.assertTrue(pg.io.path_exists(inprogress_file))
  with pg.io.open_sequence(ckpt) as f:
  examples_from_ckpt = list(iter(f))
  # `eval_test_helper.test_experiment` has two TestEvaluation with
langfun/core/eval/v2/config_saver.py ADDED
@@ -0,0 +1,37 @@
+ # Copyright 2024 The Langfun Authors
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Config saver plugins."""
+
+ import os
+ from langfun.core.eval.v2 import experiment as experiment_lib
+
+
+ class RunConfigSaver(experiment_lib.Plugin):
+ """Saves the current run."""
+
+ def on_run_start(
+ self,
+ runner: experiment_lib.Runner,
+ root: experiment_lib.Experiment
+ ) -> None:
+ del root # Unused.
+ self._save_run_config(runner)
+
+ def _save_run_config(self, runner: experiment_lib.Runner) -> None:
+ def _save():
+ runner.current_run.save(
+ os.path.join(runner.current_run.output_root, 'run.json'),
+ hide_default_values=True,
+ )
+ runner.background_run(_save)
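For illustration, a hedged sketch of enabling the new `RunConfigSaver` plugin on an experiment run, mirroring the `experiment.run(...)` call shown in the checkpointing test above. `my_experiment` and the root directory are hypothetical:

```python
from langfun.core.eval.v2 import checkpointing
from langfun.core.eval.v2.config_saver import RunConfigSaver

run = my_experiment.run(            # `my_experiment` is a hypothetical Experiment
    '/tmp/eval_root', 'new', runner='sequential',
    plugins=[
        checkpointing.PerExampleCheckpointer('checkpoint.jsonl'),
        RunConfigSaver(),           # writes run.json under the run's output root
    ],
)
```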