langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512040805__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/__init__.py +1 -1
- langfun/core/__init__.py +7 -1
- langfun/core/agentic/__init__.py +8 -1
- langfun/core/agentic/action.py +740 -112
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +189 -24
- langfun/core/async_support.py +104 -5
- langfun/core/async_support_test.py +23 -0
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +9 -2
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +11 -2
- langfun/core/data/conversion/gemini_test.py +48 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +48 -44
- langfun/core/eval/base_test.py +5 -5
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +2 -0
- langfun/core/eval/v2/checkpointing.py +76 -7
- langfun/core/eval/v2/checkpointing_test.py +9 -2
- langfun/core/eval/v2/config_saver.py +37 -0
- langfun/core/eval/v2/config_saver_test.py +36 -0
- langfun/core/eval/v2/eval_test_helper.py +104 -3
- langfun/core/eval/v2/evaluation.py +92 -17
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +50 -40
- langfun/core/eval/v2/example_test.py +16 -8
- langfun/core/eval/v2/experiment.py +84 -15
- langfun/core/eval/v2/experiment_test.py +19 -0
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +31 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking.py +13 -5
- langfun/core/eval/v2/progress_tracking_test.py +9 -1
- langfun/core/eval/v2/reporting.py +90 -71
- langfun/core/eval/v2/reporting_test.py +24 -6
- langfun/core/eval/v2/runners/__init__.py +30 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +72 -180
- langfun/core/eval/v2/runners/beam.py +354 -0
- langfun/core/eval/v2/runners/beam_test.py +153 -0
- langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
- langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +76 -0
- langfun/core/eval/v2/runners/parallel.py +243 -0
- langfun/core/eval/v2/runners/parallel_test.py +182 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +169 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +7 -5
- langfun/core/language_model.py +189 -36
- langfun/core/language_model_test.py +54 -3
- langfun/core/llms/__init__.py +12 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +64 -12
- langfun/core/llms/gemini_test.py +110 -0
- langfun/core/llms/google_genai.py +34 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +120 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +16 -1
- langfun/core/llms/vertexai.py +58 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/__init__.py +10 -0
- langfun/core/mcp/client.py +177 -0
- langfun/core/mcp/client_test.py +71 -0
- langfun/core/mcp/session.py +241 -0
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/testing/simple_mcp_client.py +33 -0
- langfun/core/mcp/testing/simple_mcp_server.py +33 -0
- langfun/core/mcp/tool.py +254 -0
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +73 -3
- langfun/core/modalities/image_test.py +116 -0
- langfun/core/modalities/mime.py +64 -3
- langfun/core/modalities/mime_test.py +11 -0
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +230 -154
- langfun/core/structured/querying_test.py +69 -33
- langfun/core/structured/schema/__init__.py +49 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +175 -50
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +43 -0
- langfun/env/base_environment.py +827 -0
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +304 -0
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +842 -0
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +14 -0
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +472 -0
- langfun/env/event_handlers/event_logger_test.py +304 -0
- langfun/env/event_handlers/metric_writer.py +726 -0
- langfun/env/event_handlers/metric_writer_test.py +214 -0
- langfun/env/interface.py +1640 -0
- langfun/env/interface_test.py +153 -0
- langfun/env/load_balancers.py +59 -0
- langfun/env/load_balancers_test.py +141 -0
- langfun/env/test_utils.py +507 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/METADATA +7 -3
- langfun-0.1.2.dev202512040805.dist-info/RECORD +217 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512040805.dist-info}/top_level.txt +0 -0
langfun/core/eval/base.py
CHANGED
@@ -59,18 +59,20 @@ class Evaluable(lf.Component):
   @property
   @abc.abstractmethod
   def id(self) -> str:
-    """Returns the ID of
+    """Returns the ID of this evaluable node.

     Returns:
-
-
-
-
+      A string as the ID of this evaluable node.
+      If an evaluable node acts as a container for other evaluable nodes
+      (e.g. `lf.Suite`), its ID could be empty.
+      Leaf evaluable nodes (e.g. `lf.Evaluation`) must have unique IDs
+      under the same container, as their IDs will be used as the directory
+      name for saving their results.
     """

   @property
   def dir(self) -> str | None:
-    """Returns the directory for saving results
+    """Returns the directory for saving results."""
     if self.root_dir is None:
       return None
     return os.path.join(self.root_dir, self.id)
@@ -82,18 +84,18 @@ class Evaluable(lf.Component):

   @property
   def index_link(self) -> str | None:
-    """Returns the index page."""
+    """Returns the link to the index page."""
     if self.dir is None:
       return None
     return self.link(os.path.join(self.dir, Evaluable.INDEX_HTML))

   def summary(self, pivot_field: str = 'lm') -> 'Summary':
-    """Returns a summary for all child evaluations
+    """Returns a summary for all child evaluations."""
     return Summary([pg.Ref(x) for x in self.leaf_nodes], pivot_field)

   @property
   def summary_link(self) -> str | None:
-    """Returns the summary page."""
+    """Returns the link to the summary page."""
     if self.root_dir is None:
       return None
     return self.link(os.path.join(self.root_dir, Evaluable.SUMMARY_HTML))
@@ -177,6 +179,7 @@ class Evaluable(lf.Component):

   @property
   def is_leaf(self) -> bool:
+    """Returns whether this node is a leaf node."""
     return isinstance(self, Evaluation) and not self.children

   @functools.cached_property
@@ -404,7 +407,7 @@ class Evaluable(lf.Component):
       timeout: int | None = None,
       **kwargs,
   ) -> None:
-    """Run the
+    """Run the evaluation and fill `self.result`. Subclass to implement."""

   @abc.abstractmethod
   def _completion_status(self, run_status: str) -> str:
@@ -545,6 +548,7 @@ class Evaluable(lf.Component):
   def from_dir(
       cls, maybe_dir: str, load_result: bool = True
   ) -> Optional['Evaluable']:
+    """Loads an evaluable object from a directory."""
     exp_json = os.path.join(maybe_dir, Evaluable.EXPERIMENT_JSON)
     if not pg.io.path_exists(exp_json):
       return None
@@ -558,7 +562,7 @@ class Evaluable(lf.Component):
     return experiment

   def try_load_result(self) -> bool:
-    """Try
+    """Try loads result from file if it's not loaded."""
     if self.result is None:
       result_json = os.path.join(self.dir, Evaluable.RESULT_JSON)
       if pg.io.path_exists(result_json):
@@ -595,7 +599,7 @@ class Suite(Evaluable):
   def _on_bound(self):
     super()._on_bound()
     overrides = {
-        k: v for k, v in self.sym_init_args.
+        k: v for k, v in self.sym_init_args.sym_items()
        if k not in ('id', 'children')
     }
     for child in self.children:
@@ -604,6 +608,7 @@ class Suite(Evaluable):

   @functools.cached_property
   def hash(self) -> str:
+    """Returns the hash of this suite."""
     return hashlib.md5(
         ' '.join(sorted([c.hash for c in self.children])).encode()
     ).hexdigest()[:8]
@@ -619,14 +624,14 @@


 class Evaluation(Evaluable):
-  """Base class for evaluation
+  """Base class for evaluation sets."""

   inputs: pg.typing.Annotated[
       pg.typing.Functor(),
       (
           'A functor that returns a list of user-defined objects as the input '
-          'examples. It
-          '`lf.eval.inputs_from(path)`, from a Python
+          'examples. It can be inputs loaded from a JSON file via '
+          '`lf.eval.inputs_from(path)`, from a Python-coded list via '
           '`lf.eval.as_inputs(values)` or a user-defined functor that '
           'generates input objects at runtime.'
       ),
@@ -648,12 +653,12 @@ class Evaluation(Evaluable):
       pg.typing.Functor().noneable(),
       (
           'A functor that returns a type annotation that will be converted to '
-          '`lf.Schema`, or a tuple of (annotation,
+          '`lf.Schema`, or a tuple of (annotation, few-shot examples). '
           'For "call" method, it could be None, indicating that the raw '
-          'response from the LM will be used as the output, and the
-          'examples will be used for parsing. For "query" and "complete"
-          'must be provided, and the
-          'for prompting. Here
+          'response from the LM will be used as the output, and the few-shot '
+          'examples will be used for parsing. For "query" and "complete" '
+          'methods, it must be provided, and the few-shot examples will be '
+          'used directly for prompting. Here is example code on how the '
           'functors should be defined:'
           + inspect.cleandoc("""
              ```
@@ -693,7 +698,7 @@ class Evaluation(Evaluable):
   completion_prompt_field: Annotated[
       str | None,
       (
-          'A
+          'A string field that will be automatically added to the class of the '
           'input object for `lf.complete`. If None, no field will be added to '
          'the class, instead the prompt will be passed as the first argument '
          'of the input object to complete. Applicable only when `method` is '
@@ -738,7 +743,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def hash(self) -> str:
-    """Returns the
+    """Returns the semantics-based hash of the evaluation."""
     if self.is_deterministic:
       identity = pg.format(self._identifiers(), compact=True)
     else:
@@ -784,7 +789,7 @@ class Evaluation(Evaluable):

   @property
   def complete_rate(self) -> float:
-    """Returns the
+    """Returns the completion rate of examples."""
     return self.num_completed / self.num_examples

   #
@@ -837,7 +842,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def non_oop_failures(self) -> list[tuple[Any, Exception]]:
-    """Returns the OOP failures."""
+    """Returns the non-OOP failures."""
     return [item for item in self.failures
             if not isinstance(item[1], lf_structured.MappingError)]

@@ -883,7 +888,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def schema(self) -> lf_structured.Schema | None:
-    """
+    """Returns the schema for parsing LLM response."""
     if self.schema_fn is None:
       return None

@@ -897,7 +902,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def fewshot_examples(self) -> list[lf.structured.MappingExample] | None:
-    """
+    """Returns the few-shot examples for prompting or parsing."""
     if self.schema_fn is None:
       return None

@@ -973,7 +978,7 @@ class Evaluation(Evaluable):

   @functools.cached_property
   def children(self) -> list['Evaluation']:
-    """Returns
+    """Returns child evaluations if this evaluation has a parameter space."""
     if self.is_deterministic:
       return []
     children = []
@@ -1023,7 +1028,7 @@ class Evaluation(Evaluable):

   @property
   def non_oop_failures_link(self) -> str | None:
-    """Returns the link to
+    """Returns the link to the non-OOP failures page."""
     if self.dir is None:
       return None
     return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))
@@ -1208,10 +1213,10 @@ class Evaluation(Evaluable):
     )

   def process_output(self, example: Any, output: lf.Message) -> None:
-    """
+    """Processes the output for an example.

     Subclasses can override this method to generate and attach additional
-    metadata for debugging
+    metadata for debugging purposes. For example, draw bounding boxes on the
     input image based on LLM predicted boxes and attach to output_message's
     metadata.

@@ -1219,8 +1224,8 @@ class Evaluation(Evaluable):

       class BoundingBoxEval(lf.eval.Matching):
         ...
-        def process_output(example, output):
-          output.metadata.image_with_bbox =
+        def process_output(self, example, output):
+          output.metadata.image_with_bbox = draw_bounding_box(
             example.image, output.result)

     Args:
@@ -1449,7 +1454,7 @@ class Evaluation(Evaluable):
         trace the LM input, response and parsed structure. If error is raised
         before LLM could return a response, None will be its value.
       error: The exception during processing the example.
-      dryrun: Whether or not
+      dryrun: Whether or not auditing takes place during dryrun.
     """
     if error is not None:
       self._failures.append((example, error))
@@ -1557,7 +1562,7 @@ class Evaluation(Evaluable):
         f'style="color:darkgray">{_html_repr(self.prompt)}</td>'
     )
     # Schema.
-    schema_title = self.schema.
+    schema_title = self.schema.schema_repr('python') if self.schema else None
     s.write(
         '<td style="color:purple" '
         f'title="{schema_title}">'
@@ -1674,7 +1679,7 @@ class Evaluation(Evaluable):

   @classmethod
   def visualize(cls, evaluations: list['Evaluation']) -> str | None:
-    """Visualize
+    """Visualize a list of evaluations of this task in HTML."""
     del evaluations
     return None

@@ -1810,7 +1815,7 @@ class Summary(pg.Object):
   )

   class Table(pg.Object):
-    """A pivot table for
+    """A pivot table for viewing evaluations."""

     class Row(pg.Object):
       descriptor: dict[str, Any]
@@ -2013,12 +2018,12 @@ class Summary(pg.Object):
       return self._context.completed

     def stop(self) -> 'Summary':
-      """
+      """Signals and waits for the monitor thread to stop."""
       self._context.stopping = True
       return self.join()

     def join(self) -> 'Summary':
-      """Waits the monitor thread to complete."""
+      """Waits for the monitor thread to complete."""
       self._thread.join()
       summary = self.summary
       assert summary is not None
@@ -2035,7 +2040,7 @@ class Summary(pg.Object):
       scan_interval: int = 60,
       refresh_when_stop: bool = True,
   ) -> MonitorResult:
-    """
+    """Monitors one or more root directories and save summary periodically."""
     context = pg.Dict(stopping=False, completed=False, summary=None)

     def _monitor():
@@ -2187,7 +2192,7 @@ def monitor_async(
     scan_interval: int = 60,
     refresh_when_stop: bool = True,
 ) -> Summary.MonitorResult:
-  """
+  """Asynchronously monitors one or more root directories for summary."""
   return Summary.monitor_async(
       root_dir,
       save_as,
@@ -2365,10 +2370,9 @@ def run(
       a string (for string-based patcher), a `pg.patching.Patcher` object, or
       a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
       details.
-    mode: The mode to run the suite
-
-
-      to do nothing.
+    mode: The mode to run the suite: "run" to run with reuse of existing
+      results, "rerun" to force re-evaluation, "dryrun" for a dry run, and
+      "noop" to do nothing.
     debug: Whether to run in debug mode.
     print_definition: Whether to print the experiment definition.
     **kwargs: Additional arguments to be passed to dryrun/run the suite.
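For readers skimming the `process_output` docstring change above, here is a minimal sketch of the override pattern it describes. The `draw_bounding_box` helper and the `image`/`result` fields are illustrative assumptions for the sketch, not langfun APIs.

```python
import langfun as lf


def draw_bounding_box(image, boxes):
  # Hypothetical helper: render predicted boxes onto the image for debugging.
  del boxes  # A real implementation would draw the boxes onto the image.
  return image


class BoundingBoxEval(lf.eval.Matching):
  """Sketch only; required members such as inputs, prompt, schema_fn,
  groundtruth and answer are omitted."""

  def process_output(self, example, output):
    # Attach a debug rendering to the output message's metadata so it is
    # available when auditing or reporting the example.
    output.metadata.image_with_bbox = draw_bounding_box(
        example.image, output.result)
```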
langfun/core/eval/base_test.py
CHANGED
@@ -101,7 +101,7 @@ class EvaluationTest(unittest.TestCase):
     self.assertEqual(s.dir, os.path.join(s.root_dir, s.id))
     self.assertEqual(s.hash, s.clone().hash)
     # Test persistent hash.
-    self.assertEqual(s.hash, '
+    self.assertEqual(s.hash, '4dfe486a')
     self.assertEqual(
         s.hash, s.clone(override={'max_workers': 2, 'lm.timeout': 20}).hash
     )
@@ -211,7 +211,7 @@ class EvaluationTest(unittest.TestCase):
         s.result,
         dict(
             experiment_setup=dict(
-                id='Evaluation@
+                id='Evaluation@e028b6e6',
                 dir=s.dir,
                 model='StaticSequence',
                 prompt_template='{{example.question}}',
@@ -269,7 +269,7 @@ class EvaluationTest(unittest.TestCase):
         s.root_dir, base.Evaluation.SUMMARY_HTML.replace('.html', '.json')
     )
     self.assertTrue(os.path.exists(summary_json))
-    summary = pg.load(summary_json,
+    summary = pg.load(summary_json, convert_unknown=True)
     self.assertIn('Evaluation', summary)
     self.assertEqual(len(summary['Evaluation']), 1)
     self.assertIsNotNone(summary['Evaluation'][0].experiment)
@@ -376,7 +376,7 @@ class EvaluationTest(unittest.TestCase):
         s.children[0].dir, os.path.join(s.root_dir, s.children[0].id)
     )
     # Test persistent hash.
-    self.assertEqual(s.hash, '
+    self.assertEqual(s.hash, 'fa8f5419')

     summary = s.run(verbose=True)
     self.assertEqual(len(summary.evaluations), 2)
@@ -526,7 +526,7 @@ class SuiteTest(unittest.TestCase):
         lm=lm
     )
     # Test for persistent hash.
-    self.assertEqual(s.hash, '
+    self.assertEqual(s.hash, 'ec3901b8')
     s.run()
     expected = {
         s.children[0].id: dict(
langfun/core/eval/matching.py
CHANGED
@@ -38,7 +38,7 @@ class Matching(base.Evaluation):

   @abc.abstractmethod
   def answer(self, output: Any, example: Any) -> Any:
-    """Returns the answer from the
+    """Returns the answer from the structured output."""

   @property
   def matches(self) -> list[tuple[int, Any, Any, lf.Message]]:
@@ -52,6 +52,7 @@

   @property
   def match_rate(self) -> float:
+    """Returns the match rate."""
     if self.num_completed == 0:
       return 0.0
     return self.num_matches / self.num_completed
@@ -68,17 +69,19 @@

   @property
   def mismatch_rate(self) -> float:
+    """Returns the mismatch rate."""
     if self.num_completed == 0:
       return 0.0
     return self.num_mismatches / self.num_completed

   @property
   def matches_link(self) -> str:
-    """Returns the matches page."""
+    """Returns the link to the matches page."""
     return self.link(os.path.join(self.dir, Matching.MATCHES_HTML))

   @property
   def mismatches_link(self) -> str:
+    """Returns the link to the mismatches page."""
     return self.link(os.path.join(self.dir, Matching.MISMATCHES_HTML))

   def _reset(self) -> None:
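As a hypothetical illustration of the abstract `answer` method documented above: a matching evaluation returns the value to compare against the groundtruth, from which `num_matches`, `match_rate` and `mismatch_rate` are derived. The `Solution` type and its `final_answer` field below are assumptions for the sketch, not part of this diff.

```python
import langfun as lf
import pyglove as pg


class Solution(pg.Object):
  # Hypothetical structured output type produced by the LM for each example.
  final_answer: int


class MyMathEval(lf.eval.Matching):
  """Sketch only; inputs, prompt, schema_fn and the groundtruth side are omitted."""

  def answer(self, output, example):
    # `output` is the parsed structured result for `example`; the returned
    # value is what gets compared when computing match_rate.
    del example  # Unused in this sketch.
    return output.final_answer
```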
langfun/core/eval/patching.py
CHANGED
@@ -114,17 +114,17 @@ def model_by_name(name: str) -> lf.LanguageModel:

 @pg.patcher(auto_typing=True)
 def lm(unused_eval, models: list[str]):
-  """
+  """Patches the LM used for benchmarking."""
   return patch_lm(pg.oneof([model_by_name(name) for name in models]))


 @pg.patcher(auto_typing=True)
 def temperature(unused_eval, value: float):
-  """
+  """Patches the temperature used for benchmarking."""
   return patch_member(lf.LMSamplingOptions, "temperature", value)


 @pg.patcher(auto_typing=True)
 def max_tokens(unused_eval, value: int | None):
-  """
+  """Patches the max_tokens used for benchmarking."""
   return patch_member(lf.LMSamplingOptions, "max_tokens", value)
langfun/core/eval/scoring.py
CHANGED
@@ -41,18 +41,19 @@ class Scoring(base.Evaluation):

   @property
   def score_rate(self) -> float:
-    """Returns the
+    """Returns the rate of scored examples among the completed ones."""
     if self.num_completed == 0:
       return 0.0
     return self.num_scored / self.num_completed

   @property
   def scored_link(self) -> str:
-    """Returns the
+    """Returns the scored examples page."""
     return self.link(os.path.join(self.dir, Scoring.SCORED_HTML))

   @property
   def avg_score(self) -> float:
+    """Returns the average score of scored examples."""
     if self.num_scored == 0:
       return 0
     return sum([i[2] for i in self._scored]) / self.num_scored
@@ -181,7 +182,7 @@
     super()._render_summary_metrics(s)

   def _render_scored(self, s: io.StringIO) -> None:
-    """Formats the
+    """Formats the scored cases into html."""
     s.write('<h2> Scored </h2>')
     s.write('<div style="white-space:pre">\n')
     s.write(
langfun/core/eval/v2/__init__.py
CHANGED
@@ -35,9 +35,11 @@ from langfun.core.eval.v2.experiment import Runner
 from langfun.core.eval.v2 import runners

 # Plugins
+from langfun.core.eval.v2.config_saver import RunConfigSaver
 from langfun.core.eval.v2.checkpointing import BulkCheckpointer
 from langfun.core.eval.v2.checkpointing import PerExampleCheckpointer
 from langfun.core.eval.v2.reporting import HtmlReporter
+from langfun.core.eval.v2.reporting import ExampleHtmlGenerator


 # pylint: enable=g-bad-import-order
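The newly exported plugins are attached to an experiment run via the `plugins` argument, the same way the checkpointing test further below wires in its checkpointer. A minimal sketch; the experiment object itself is assumed to be defined elsewhere, and the no-argument constructors rely on the defaults shown in this diff.

```python
import tempfile
from langfun.core.eval import v2 as lf_eval_v2

plugins = [
    lf_eval_v2.RunConfigSaver(),    # saves the run config as run.json
    lf_eval_v2.BulkCheckpointer(),  # appends processed examples to one checkpoint file
    lf_eval_v2.HtmlReporter(),      # writes HTML reports for the run
]

# `my_experiment` stands in for an lf_eval_v2.Experiment defined elsewhere:
# run = my_experiment.run(
#     tempfile.mkdtemp(), 'new', runner='sequential', plugins=plugins)
```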
langfun/core/eval/v2/checkpointing.py
CHANGED
@@ -13,6 +13,8 @@
 # limitations under the License.
 """Checkpointing evaluation runs."""
 import abc
+import datetime
+import os
 import re
 import threading
 import traceback
@@ -29,12 +31,32 @@ Runner = experiment_lib.Runner


 class Checkpointer(experiment_lib.Plugin):
-  """Base class for checkpointing evaluation examples.
+  """Base class for checkpointing evaluation examples.
+
+  `Checkpointer` is a plugin that saves the state of processed examples
+  incrementally during an experiment run, allowing the experiment to be resumed
+  later. When an experiment starts, the checkpointer loads any previously saved
+  examples from an earlier run (or a warm-start run) into `experiment.state`,
+  so the runner can skip processing them again.
+  Subclasses should implement `_list_checkpoint_filenames` to identify
+  checkpoint files to load, and `_save_example` to save a newly processed
+  example.
+  """

   checkpoint_filename: Annotated[
       str,
       'Checkpoint file pattern.'
-  ] = 'checkpoint.
+  ] = 'checkpoint.jsonl'
+
+  enable_inprogress_file: Annotated[
+      bool,
+      'If True, write file "<example_id>.inprogress" when example gets started.'
+  ] = True
+
+  max_ckpt_loading_threads: Annotated[
+      int,
+      'Max number of workers for loading checkpoint files at startup.'
+  ] = 128

   def on_experiment_start(
       self,
@@ -75,6 +97,24 @@ class Checkpointer(experiment_lib.Plugin):
           f'scratch. Example IDs: {example_ids_to_evaluate}.'
       )

+  def on_example_start(
+      self,
+      runner: Runner,
+      experiment: Experiment,
+      example: Example,
+  ) -> None:
+    """Saves the example to the checkpoint file."""
+    if self.enable_inprogress_file:
+      def _save_inprogress_file(example: Example):
+        inprogress_file = runner.current_run.output_path_for(
+            experiment, f'{example.id}.inprogress'
+        )
+        pg.io.writefile(
+            inprogress_file,
+            datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        )
+      runner.background_run(_save_inprogress_file, example)
+
   def on_example_complete(
       self,
       runner: Runner,
@@ -149,7 +189,10 @@ class Checkpointer(experiment_lib.Plugin):

     _ = list(
         lf.concurrent_map(
-            _load_state,
+            _load_state,
+            ckpt_files,
+            max_workers=self.max_ckpt_loading_threads,
+            silence_on_errors=None
         )
     )

@@ -170,7 +213,12 @@ class Checkpointer(experiment_lib.Plugin):


 class PerExampleCheckpointer(Checkpointer):
-  """Checkpointer that saves each example to a separate file.
+  """Checkpointer that saves each example to a separate file.
+
+  This checkpointer saves each processed example to its own checkpoint file,
+  named using the pattern `<checkpoint_filename_prefix>_<example_id>.<ext>`.
+  For example, `checkpoint_1.bagz`, `checkpoint_2.bagz`, etc.
+  """

   def _on_bound(self):
     super()._on_bound()
@@ -235,7 +283,13 @@ class PerExampleCheckpointer(Checkpointer):


 class BulkCheckpointer(Checkpointer):
-  """Checkpointer that saves all examples to a single file.
+  """Checkpointer that saves all examples of an evaluation to a single file.
+
+  This checkpointer appends newly processed examples of an evaluation to a
+  single sequence file (e.g., `checkpoint.bagz`). This is often more efficient
+  than `PerExampleCheckpointer` when dealing with a large number of examples
+  or when file system overhead is a concern.
+  """

   def _on_bound(self):
     super()._on_bound()
@@ -341,12 +395,26 @@ class BulkCheckpointer(Checkpointer):


 class SequenceWriter:
-  """
+  """A thread-safe writer for sequence files (e.g., Bagz) with atomic write.
+
+  `SequenceWriter` wraps a `pg.io.SequenceWriter` to provide thread-safe
+  `add` and `close` operations, ensuring that examples can be written
+  concurrently from multiple threads without corrupting the sequence file.
+  It writes to a temporary file and renames it to target path on `close` to
+  achieve atomic write. If the target path exists, new examples are appended
+  to existing content.
+  """

   def __init__(self, path: str):
     self._lock = threading.Lock()
     self._path = path
-
+    basename = os.path.basename(path)
+    self._tmp_path = os.path.join(
+        os.path.dirname(path), f'tmp.{basename}'
+    )
+    if pg.io.path_exists(self._path):
+      pg.io.copy(self._path, self._tmp_path)
+    self._sequence_writer = pg.io.open_sequence(self._tmp_path, 'a')

   @property
   def path(self) -> str:
@@ -371,6 +439,7 @@ class SequenceWriter:
       return
     self._sequence_writer.close()
     self._sequence_writer = None
+    pg.io.rename(self._tmp_path, self._path)

   def __del__(self):
     self.close()
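A minimal sketch of configuring the new checkpointer options introduced above (the concrete values are illustrative):

```python
from langfun.core.eval import v2 as lf_eval_v2

checkpointer = lf_eval_v2.PerExampleCheckpointer(
    checkpoint_filename='checkpoint.jsonl',
    # Write '<example_id>.inprogress' when an example starts, making it easy
    # to spot examples that started but never completed.
    enable_inprogress_file=True,
    # Bound the worker threads used to load prior checkpoint files at startup.
    max_ckpt_loading_threads=64,
)
# The checkpointer is then passed to an experiment run via plugins=[checkpointer].
```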
langfun/core/eval/v2/checkpointing_test.py
CHANGED
@@ -65,7 +65,7 @@ class ExampleCollector(experiment_lib.Plugin):
     return self._examples

   def on_example_complete(
-      self, runner:
+      self, runner: experiment_lib.Runner,
       experiment: experiment_lib.Experiment,
       example: example_lib.Example,
   ):
@@ -90,7 +90,10 @@ class PerExampleCheckpointerTest(CheckpointerTest):
     root_dir = os.path.join(tempfile.mkdtemp(), 'per_example_checkpointer')
     experiment = eval_test_helper.test_experiment()
     checkpoint_filename = 'checkpoint.jsonl'
-    checkpointer = checkpointing.PerExampleCheckpointer(
+    checkpointer = checkpointing.PerExampleCheckpointer(
+        checkpoint_filename,
+        enable_inprogress_file=True
+    )
     collector = ExampleCollector()
     run = experiment.run(
         root_dir, 'new', runner='sequential', plugins=[checkpointer, collector]
@@ -102,6 +105,10 @@ class PerExampleCheckpointerTest(CheckpointerTest):
       example = collector.examples[i + 1]
       ckpt = run.output_path_for(leaf, f'checkpoint_{example.id}.jsonl')
       self.assertTrue(pg.io.path_exists(ckpt))
+      inprogress_file = run.output_path_for(
+          leaf, f'{example.id}.inprogress'
+      )
+      self.assertTrue(pg.io.path_exists(inprogress_file))
       with pg.io.open_sequence(ckpt) as f:
         examples_from_ckpt = list(iter(f))
       # `eval_test_helper.test_experiment` has two TestEvaluation with
langfun/core/eval/v2/config_saver.py
ADDED
@@ -0,0 +1,37 @@
+# Copyright 2024 The Langfun Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Config saver plugins."""
+
+import os
+from langfun.core.eval.v2 import experiment as experiment_lib
+
+
+class RunConfigSaver(experiment_lib.Plugin):
+  """Saves the current run."""
+
+  def on_run_start(
+      self,
+      runner: experiment_lib.Runner,
+      root: experiment_lib.Experiment
+  ) -> None:
+    del root  # Unused.
+    self._save_run_config(runner)
+
+  def _save_run_config(self, runner: experiment_lib.Runner) -> None:
+    def _save():
+      runner.current_run.save(
+          os.path.join(runner.current_run.output_root, 'run.json'),
+          hide_default_values=True,
+      )
+    runner.background_run(_save)