langfun 0.1.2.dev202511030805__py3-none-any.whl → 0.1.2.dev202511050805__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- langfun/core/agentic/action.py +76 -9
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/async_support.py +32 -3
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +10 -3
- langfun/core/data/conversion/gemini.py +9 -2
- langfun/core/data/conversion/openai.py +17 -7
- langfun/core/eval/base.py +46 -42
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/checkpointing.py +30 -4
- langfun/core/eval/v2/evaluation.py +59 -13
- langfun/core/eval/v2/example.py +22 -11
- langfun/core/eval/v2/experiment.py +51 -8
- langfun/core/eval/v2/metric_values.py +23 -3
- langfun/core/eval/v2/metrics.py +33 -4
- langfun/core/eval/v2/progress.py +9 -1
- langfun/core/eval/v2/reporting.py +15 -1
- langfun/core/eval/v2/runners.py +27 -7
- langfun/core/langfunc.py +45 -130
- langfun/core/language_model.py +88 -10
- langfun/core/llms/anthropic.py +27 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +22 -2
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +29 -1
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +9 -1
- langfun/core/llms/google_genai.py +29 -1
- langfun/core/llms/groq.py +27 -2
- langfun/core/llms/llama_cpp.py +22 -3
- langfun/core/llms/openai.py +29 -1
- langfun/core/llms/openai_compatible.py +18 -6
- langfun/core/llms/rest.py +12 -1
- langfun/core/llms/vertexai.py +39 -6
- langfun/core/logging.py +1 -1
- langfun/core/mcp/client.py +77 -22
- langfun/core/mcp/session.py +90 -10
- langfun/core/mcp/tool.py +83 -23
- langfun/core/memory.py +1 -0
- langfun/core/message.py +75 -11
- langfun/core/message_test.py +9 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +19 -1
- langfun/core/modalities/mime.py +54 -4
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +66 -5
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/structured/completion.py +32 -37
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +70 -15
- langfun/core/structured/parsing.py +90 -74
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +201 -130
- langfun/core/structured/schema.py +70 -10
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +45 -34
- langfun/core/structured/tokenization.py +24 -9
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +139 -40
- langfun/core/template_test.py +40 -0
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/METADATA +1 -1
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/RECORD +77 -77
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/top_level.txt +0 -0
langfun/core/eval/v2/evaluation.py
CHANGED

@@ -32,17 +32,63 @@ import pyglove as pg


 class Evaluation(experiment_lib.Experiment):
-  """
-
-
-
-
-
-
-
-
-
+  """Base class for Langfun evaluations.
+
+  `lf.eval.Evaluation` is the base class for defining evaluation tasks in
+  Langfun. Users typically subclass it to implement custom evaluation logic by
+  overriding `inputs` and `process` methods.
+
+  An `Evaluation` object encapsulates:
+
+  * **`inputs`**: A callable that returns an iterable of input examples to be
+    processed. This is usually provided by implementing an `inputs(self)`
+    method in the subclass, which yields input items for evaluation one by
+    one.
+  * **`process(self, example)`**: An abstract method that processes one
+    example and returns the output, or a tuple of (output, metadata).
+    The output will be used for computing metrics.
+  * **`metrics`**: A list of metrics (e.g., `lf.metrics.Accuracy`) to compute
+    based on the outputs from `process`. Some metrics may require users to
+    implement a `ground_truth(self, example)` method in the subclass to
+    compute metrics against ground truth.
+  * **Hyperparameters**: Any other attributes of the class serve as
+    hyperparameters for the evaluation (e.g., the language model to use).
+
+  **Running Evaluations:**
+
+  Evaluations are executed via `lf.eval.Suite` or by calling the `.run()`
+  method on an `Evaluation` instance, which returns a `Run` object
+  containing the evaluation run information and results. If an evaluation
+  contains sweeable parameters (using `pg.oneof`), `.run()` will expand it
+  into multiple evaluation sub-tasks -- one for each combination of
+  hyperparameters -- all managed within the same `Run`.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+  import pyglove as pg
+
+  class MyEval(lf.eval.Evaluation):
+    lm: lf.LanguageModel
+    prompt: str = '1 + 1 = '
+
+    def inputs(self):
+      yield 2
+
+    def process(self, example: lf.eval.Example):
+      return int(lf.query(self.prompt, lm=self.lm))
+
+    def ground_truth(self, example: lf.eval.Example) -> int:
+      return example.input
+
+  # Run evaluation using two different LMs
+  evaluation = MyEval(
+      lm=pg.oneof([lf.llms.Gpt4(), lf.llms.Gemini()]),
+      metrics=[lf.metrics.Accuracy()]
+  )
+  run_info = evaluation.run()
+  ```
   """

   inputs: Annotated[

@@ -137,7 +183,7 @@ class Evaluation(experiment_lib.Experiment):

     Args:
       example: An example object to process. `example.input` is an object
-
+        yielded from `inputs()` method.

     Returns:
       A processed output. Or a tuple of (output, metadata).

@@ -760,7 +806,7 @@ class Evaluation(experiment_lib.Experiment):


 class EvaluationState:
-  """
+  """In-memory state of an evaluation."""

   class ExampleStatus(pg.Object):
     """Example state."""
langfun/core/eval/v2/example.py
CHANGED

@@ -22,19 +22,30 @@ import pyglove as pg


 @dataclasses.dataclass
 class Example(pg.JSONConvertible, pg.views.HtmlTreeView.Extension):
-  """An
+  """An example for evaluation.
+
+  An evaluation example contains the input and output of an evaluation task,
+  as well as metadata about the evaluation process, such as execution time,
+  LLM usage, and metric results.

   Attributes:
-    id: The 1-based ID of the
-    input: An element returned from the `Evaluable.inputs` functor
-
-
-
-
-
-
-
-
+    id: The 1-based ID of the example in the evaluation set.
+    input: An element returned from the `Evaluable.inputs` functor, which serves
+      as the input for `lf.Evaluable.process`.
+    output: The output of `lf.Evaluable.process` method. If `pg.MISSING_VALUE`,
+      it indicates the example has not been processed yet.
+    error: The error encountered during `lf.Evaluable.process`. If None, it
+      indicates the process was successful.
+    metadata: The metadata of the example produced by `lf.Evaluable.process`.
+    metric_metadata: The dictionary returned from `Metric.audit`, which contains
+      metadata about metric computation for this example.
+    newly_processed: Whether this example is processed in the current run. If
+      False, it indicates the example was loaded from a checkpoint from previous
+      runs.
+    start_time: The start time of processing this example.
+    end_time: The end time of processing this example.
+    usage_summary: The summary of LLM usages for processing this example.
+    execution_status: The timeit status of processing this example.
   """
   id: int
   input: Any = pg.MISSING_VALUE
langfun/core/eval/v2/experiment.py
CHANGED

@@ -139,10 +139,10 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):

   # Checkpointing

-  Experiments support checkpointing, which is enabled by default. It allows
+  Experiments support checkpointing, which is enabled by default. It allows
   users to resume their experiments from a saved state. When an experiment runs,
-  it creates a new directory for that run and saves
-
+  it creates a new directory for that run and saves its progress to checkpoint
+  files. If the experiment is interrupted or fails, users can resume
   it by specifying the 'id' or 'warm_start_from' argument (shown above) to
   seamlessly continue from previously saved state without starting over.


@@ -169,7 +169,7 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):

   # Experiment Plugins

-
+  Experiments can be extended by plugins. Plugins can listen to the events of
   experiment execution and produce additional outputs. For example, a plugin
   can be added to an experiment to generate additional metrics or to save
   additional data to a database. More details will be added in the future.

@@ -657,7 +657,30 @@ class Experiment(lf.Component, pg.views.HtmlTreeView.Extension):

 @pg.use_init_args(['children'])
 class Suite(Experiment):
-  """A suite of evaluations.
+  """A suite of evaluations.
+
+  `lf.eval.Suite` groups multiple `lf.eval.Evaluation` or other `Suite`
+  objects into a single experiment, allowing them to be run, managed, and
+  reported together.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+
+  suite = lf.eval.Suite([
+      MyEval(lm=lf.llms.Gpt4()),
+      MyEval(lm=lf.llms.Gemini()),
+      lf.eval.Suite([
+          AnotherEval(lm=lf.llms.Gpt4()),
+          AnotherEval(lm=lf.llms.Gemini())
+      ])
+  ])
+
+  # Run all evaluations in the suite
+  run_info = suite.run('/path/to/my/suite_run')
+  ```
+  """

   children: Annotated[
       list[Experiment], 'A list of child experiments.'

@@ -791,7 +814,14 @@ class RunId(pg.Object):


 class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):
-  """
+  """Represents a single run of an experiment.
+
+  A `Run` object holds all the configurations for executing an experiment,
+  such as the experiment definition, input/output directories, and flags
+  controlling the execution behavior (e.g., error handling, checkpointing).
+  It also provides utility methods for accessing run-specific paths and
+  filtering examples for evaluation.
+  """

   root_dir: Annotated[
       str,

@@ -971,7 +1001,13 @@ class Run(pg.Object, pg.views.html.HtmlTreeView.Extension):


 class Runner(pg.Object):
-  """Interface for experiment runner.
+  """Interface for experiment runner.
+
+  A runner is responsible for executing the evaluations within an experiment
+  based on the configuration specified in a `Run` object. Different runners
+  can implement different execution strategies, such as sequential or parallel
+  processing of examples and evaluations.
+  """

   # Class-level variable for registering the runner.
   NAME = None

@@ -1010,7 +1046,14 @@ class Runner(pg.Object):


 class Plugin(lf.Component):
-  """Base class for experiment plugins.
+  """Base class for experiment plugins.
+
+  Plugins provide a mechanism to extend the behavior of an experiment run
+  by hooking into various events during the lifecycle of experiment and
+  example execution, such as `on_run_start`, `on_experiment_complete`,
+  `on_example_start`, etc. They can be used for custom logging, monitoring,
+  or result processing.
+  """

   def on_run_start(
       self,
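The `Plugin` docstring above names the lifecycle hooks (`on_run_start`, `on_example_start`, `on_experiment_complete`) but this diff truncates their signatures. Below is a minimal, hedged sketch of a logging plugin; the hook parameters are deliberately left open because they are not shown here.

```python
# Illustrative only: the hook names come from the Plugin docstring in this
# diff; the exact hook signatures are assumed, hence *args/**kwargs.
from langfun.core.eval.v2 import experiment as experiment_lib


class LoggingPlugin(experiment_lib.Plugin):
  """Prints a line for a few experiment lifecycle events."""

  def on_run_start(self, *args, **kwargs):
    print('run started')

  def on_example_start(self, *args, **kwargs):
    print('example started')

  def on_experiment_complete(self, *args, **kwargs):
    print('experiment complete')
```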
langfun/core/eval/v2/metric_values.py
CHANGED

@@ -20,7 +20,15 @@ import pyglove as pg


 class MetricValue(pg.Object):
-  """Base class for metric values.
+  """Base class for metric values.
+
+  `MetricValue` is the base class for representing aggregated metric values
+  in an evaluation. It accumulates data points from individual examples,
+  each consisting of a value and an optional weight, associated with an example
+  ID. Subclasses must implement `reduce` method to compute a single float value
+  from accumulated data points, and `scalar_repr` to provide a string
+  representation of the reduced value.
+  """

   class DataPoint(pg.Object):
     """A data point for a metric value."""

@@ -133,7 +141,13 @@ class MetricValue(pg.Object):


 class Rate(MetricValue):
-  """
+  """Metric value representing a rate in range [0, 1].
+
+  `Rate` is used for metrics that compute a rate, such as accuracy or error
+  rate. The final value is computed as the weighted sum of accumulated values
+  divided by the total number of examples. It's displayed as a percentage
+  (e.g., 90.0%).
+  """

   def reduce(self) -> float:
     return self._weighted_sum / self.total

@@ -145,7 +159,13 @@ class Rate(MetricValue):


 class Average(MetricValue):
-  """
+  """Metric value representing an average of accumulated values.
+
+  `Average` is used for metrics that compute an average score across examples
+  (e.g., average quality score). The final value is computed as the weighted
+  sum of accumulated values divided by the number of data points.
+  It's displayed as a float with 3 decimal places (e.g., 4.750).
+  """

   def reduce(self) -> float:
     if not self.data_points:
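Per the `MetricValue` docstring above, custom metric values implement `reduce` and `scalar_repr`. A hedged sketch of such a subclass follows; `data_points` is referenced in the `Average.reduce` context shown above, while the assumption that each data point exposes a `value` field follows the docstring's wording and is not confirmed by this diff.

```python
from langfun.core.eval.v2 import metric_values


class Maximum(metric_values.MetricValue):
  """Reduces accumulated data points to their maximum value (illustrative)."""

  def reduce(self) -> float:
    if not self.data_points:
      return float('nan')
    # Assumes each DataPoint carries a `value` field, per the docstring's
    # "a value and an optional weight" description.
    return max(dp.value for dp in self.data_points)

  def scalar_repr(self) -> str:
    return f'{self.reduce():.3f}'
```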
langfun/core/eval/v2/metrics.py
CHANGED

@@ -29,7 +29,15 @@ Average = metric_values.Average


 class Metric(pg.Object, pg.views.HtmlTreeView.Extension):
-  """Interface for an evaluation metric.
+  """Interface for an evaluation metric.
+
+  A metric is used to evaluate the quality of the outputs produced by an
+  evaluation. It works by auditing each processed example via its `audit`
+  method, which in turn calls the user-overridable `_audit` method to perform
+  metric-specific logic and update metric values. Metrics can compute multiple
+  values (e.g., precision, recall, F1 score) which are exposed via the
+  `values` method.
+  """

   name: Annotated[
       str,

@@ -169,7 +177,15 @@ class Metric(pg.Object, pg.views.HtmlTreeView.Extension):


 class MetricBase(Metric):
-  """Base class for common metrics.
+  """Base class for common metrics.
+
+  `MetricBase` provides common functionalities for metrics, such as automatic
+  error counting based on whether an example has an error during evaluation.
+  It distinguishes between Object-Oriented Programming (OOP) errors
+  (e.g. `MappingError` during structured output generation) and other errors.
+  Subclasses should implement `_audit_processed` for metric computation on
+  successfully processed examples.
+  """

   oop_errors: Rate | None = Rate()
   non_oop_errors: Rate | None = Rate()

@@ -229,7 +245,13 @@ class MetricBase(Metric):


 class Match(MetricBase):
-  """Metric for matching outputs against
+  """Metric for matching outputs against ground truth.
+
+  This metric computes match and mismatch rates by comparing the output of
+  an example with its ground truth. By default, it looks for a `groundtruth`
+  attribute in `example.input` for comparison. Users can customize this behavior
+  by subclassing `Match` and overriding the `match` method.
+  """

   name = 'match'
   matches: Rate = Rate()

@@ -302,7 +324,14 @@ class Match(MetricBase):


 class Score(MetricBase):
-  """Base class for scoring.
+  """Base class for scoring metrics.
+
+  `Score` is a base class for metrics that assign a numerical score to each
+  example's output (e.g., evaluating quality on a scale of 1-5).
+  It automatically computes the average score across all examples.
+  Subclasses must implement the `score` method to define how an example
+  should be scored.
+  """

   name = 'score'
   average_score: Average = Average()
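The `Score` docstring above requires subclasses to implement a `score` method. A hedged sketch follows; the signature used here (receiving the processed example) is an assumption, since the actual signature is not part of this diff.

```python
from langfun.core.eval.v2 import metrics


class BrevityScore(metrics.Score):
  """Illustrative metric that rewards shorter outputs."""

  def score(self, example) -> float:  # Assumed signature.
    # `example.output` is documented in the Example class earlier in this diff.
    return 1.0 / (1.0 + len(str(example.output)))
```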
langfun/core/eval/v2/progress.py
CHANGED

@@ -21,7 +21,15 @@ import pyglove as pg


 class Progress(pg.Object, pg.views.HtmlTreeView.Extension):
-  """
+  """Represents and tracks the progress of an evaluation.
+
+  The `Progress` class maintains counts of processed, failed, and skipped
+  items in an evaluation, along with timing information (start time, stop time,
+  duration) and an execution summary. It provides properties to check the
+  status of the evaluation (e.g., `is_started`, `is_completed`) and methods
+  to update progress as items are evaluated.
+  It also supports HTML rendering as a progress bar for visualization.
+  """

   num_total: Annotated[
       int | None,
langfun/core/eval/v2/reporting.py
CHANGED

@@ -33,7 +33,21 @@ _EVALULATION_DETAIL_FILE = 'index.html'


 class HtmlReporter(experiment_lib.Plugin):
-  """Plugin for periodically generating HTML reports for the experiment.
+  """Plugin for periodically generating HTML reports for the experiment.
+
+  The `HtmlReporter` plugin generates several HTML files during an experiment
+  run:
+  - A `summary.html` at the root of the run directory, summarizing all
+    evaluations in the experiment.
+  - An `index.html` for each leaf evaluation, detailing the evaluation
+    definition, metrics, and logs.
+  - An HTML file for each example (e.g., `1.html`, `2.html`, ...) within
+    each leaf evaluation's directory, showing the input, output, metadata,
+    and any errors for that example.
+
+  These reports are updated periodically in the background during the run,
+  allowing users to monitor progress in near real-time.
+  """

   summary_interval: Annotated[
       int,
langfun/core/eval/v2/runners.py
CHANGED

@@ -42,7 +42,14 @@ _RUN_MANIFEST = 'run.json'


 class RunnerBase(Runner):
-  """
+  """Base class for runners with plugin support and IO pooling.
+
+  `RunnerBase` provides the basic runner functionalities such as plugin
+  integration for checkpointing, reporting and progress tracking.
+  It also manages a thread pool for background IO operations.
+  Subclasses should implement `_run` and `_evaluate_items` for different
+  execution strategies.
+  """

   tqdm: Annotated[
       bool,

@@ -397,11 +404,12 @@ class RunnerBase(Runner):


 class SequentialRunner(RunnerBase):
-  """
+  """A runner that executes evaluations and examples sequentially.

-
-
-  exceptions
+  The sequential runner executes all evaluations and their examples in the
+  calling thread. Background tasks are also run sequentially, which makes it
+  easier to debug as exceptions from background tasks will be raised
+  immediately.
   """

   NAME = 'sequential'

@@ -426,7 +434,13 @@ class SequentialRunner(RunnerBase):


 class DebugRunner(SequentialRunner):
-  """
+  """A runner for debugging evaluations.
+
+  The debug runner is a sequential runner that only runs the first example
+  of each evaluation, with `raise_if_has_error` enabled. This is useful for
+  quickly identifying issues in evaluation logic during development.
+  Checkpointers are disabled for this runner.
+  """

   NAME = 'debug'


@@ -444,7 +458,13 @@ class DebugRunner(SequentialRunner):


 class ParallelRunner(RunnerBase):
-  """
+  """A runner that executes evaluations and examples in parallel.
+
+  The parallel runner groups evaluations by their required resources
+  (e.g., specific LLMs) and runs evaluations that do not share resources in
+  parallel. Within each evaluation, examples are also processed in parallel
+  using threads, up to `Evaluation.max_workers`.
+  """

   NAME = 'parallel'

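The three runners above register under the `NAME` values 'sequential', 'debug' and 'parallel'. A hedged usage sketch: `MyEval` is the class from the `Evaluation` docstring example earlier in this diff, and passing the runner name to `run()` via a `runner=` keyword is an assumption about the calling convention, not something shown here.

```python
import langfun as lf

evaluation = MyEval(
    lm=lf.llms.Gpt4(),
    metrics=[lf.metrics.Accuracy()],
)
# Select the execution strategy by its registered NAME (assumed keyword).
run_info = evaluation.run('/tmp/my_eval_run', runner='parallel')
```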
langfun/core/langfunc.py
CHANGED

@@ -32,146 +32,43 @@ _TLS_LFUN_CALL_STACK = '_langfunc_callstack'
 # NOTE(daiyip): Only the template string belongs to the positional arguments,
 # all others are keyword-only for clarity.
 @pg.use_init_args(['template_str'])
-class LangFunc(
-
-):
-  r"""Base class for natural-language driven component.
-
-  ``LangFunc`` is a language-driven component that enables users to
-  seamlessly interact with Language Models (LLMs) using a blend of natural
-  language and code. It empowers users to easily modularize prompt/execution
-  logics, compose them, and simplify the creation of Language Model (LLM)-based
-  components and applications.
-
-  LangFunc can be conceptualized as a string template with embeddable code,
-  but it distinguishes itself from traditional template systems in four key
-  ways.
-
-  Firstly, it enables easy modularization of templates along with the required
-  values with OO principles, providing a reusable way for LLM-based content
-  generation. For example:
-
-  ```
-  class FewshotExamples(lf.LangFunc):
-    '''Base for fewshot prompt.
-
-    {% for example in examples %}
-    {{ example }}
-    {% endfor %}
-    '''
-
-  # Usage 1: __init__ time binding.
-  assert FewshotPrompt(examples=['foo', 'bar'])() == 'foo\nbar'
-
-  # Usage 2: __call__ time binding.
-  assert FewshotPrompt()(examples=['foo', 'bar']) == 'foo\nbar'
-
-  class ToolDescription(lf.LangFunc):
-    '''Tool descriptions.
-
-    {% for tool in tools %}
-    {{ tool.description }}
-    {% endfor %}
-    '''
-    # We want to constrain tools to be a list of `Tool` objects.
-    tools: list[Tool]
-
-  # Raises: runtime type checking will fail on [1, 2, 3].
-  ToolDescription(tools=[1, 2, 3])
-  ```
-
-  Secondly, it has the capability to compose multiple LangFuncs together,
-  enabling the accomplishment of complex language tasks with maximum reuse.
-  It allows users to provide program inputs to all the LangFuncs within a
-  composition at the top level, significantly simplifying the process of
-  providing context for users. For example:
-
-  ```
-  class ReAct(lf.LangFunc):
-    '''ReAct prompt for tool-use.
-
-    {{ preamble }}
-    {{ tool_description }}
-    {{ tool_examples }}
-    {{ user_input }}
-    '''
-    # Default preamble, which could be overriden from subclass
-    # or parsed from the `__init__` argument.
-    preamble = 'Please help me on my task based on the following tools.',
-
-  react = ReAct(
-      tool_description=ToolDescription()
-      tool_examples=FewshotExamples(),
-      # Partially bind `tools` and `examples`.
-      tools=my_tools,
-      examples=[t.examples for t in my_tools]
-  )
-
-  # Late bind `user_input` at __call__ time.
-  react(user_input='Help me get a lunch to go, veggie please.' )
-  ```
-
-  Thirdly, it allows the flexibility to encapsulate complex compositions to
-  reusable classes and modify them. For example:
-
-  ```
-  # The compound decorator converts a function into a LangFunc.
-  @lf.compound
-  def react_with_tools(preamble, tools: list[Tool]):
-    return ReAct(
-        preamble=preamble,
-        tool_description=ToolDescription()
-        tool_examples=FewshotExamples(),
-        # Partially bind `tools` and `examples`.
-        tools=my_tools,
-        examples=[t.examples for t in my_tools]
-    )
+class LangFunc(template_lib.Template):
+  r"""Base class for Language-based functions.

-
-
-
+  LangFunc represents a function powered by a language model. It is a subclass
+  of `lf.Template` and can be thought of as a `lf.Template` augmented with an LM
+  and an output transformation. Calling a `lf.LangFunc` is equivalent to calling
+  the LM with the rendered prompt and transforming the output.

-
-  '''
+  LangFunc can be directly constructed and used.

-
-
-  prompt=react_with_tools(
-      preamble=(
-          f'Please help me solve my problem using tools. '
-          f'Current time is {{datetime.datetime.now()}}'),
-      tools=my_tools))
+  ```python
+  import langfun as lf

-
-
+  func = lf.LangFunc("Hello, {{name}}!")
+  print(func(name="Gemini", lm=lf.llms.Gemini25Flash()))
+  # Output: Hello, how are you today?
+  ```

-
-  it could be manipulated programmatically, turned into a space for data
-  sampling, or even tuned by AutoML. For example:
+  Or it can be subclassed:

-
-
+  ```python
+  import langfun as lf

-
-
-      'Help me solve my problem using the following tools:',
-      'Help me with the tools below:',
-      ...
-  ])
-  # Choose any two of the tools for generating data.
-  tools=pg.manyof(2, [
-      google_search(...),
-      doordash(...),
-      ...
-  ])
+  class Compute(lf.LangFunc):
+    '''Compute a simple arithmetic expression.

-
-
+    {{expression}} = ?
+    '''
+    expression: str

-
+    def transform_output(self, lm_output: lf.Message) -> lf.Message:
+      lm_output.metadata.result = float(lm_output.text)
+      return lm_output

-
-
+  r = Compute(expression="1 + 1")(lm=lf.llms.Gemini25Flash())
+  print(r.result)
+  # Output: 2.0

   Final note: always include these capitalized words if you don't want to treat
   the docstr as the template str: THIS IS NOT A TEMPLATE. So as a result, this

@@ -305,6 +202,24 @@ class LangFunc(
       message_cls: Type[message_lib.Message] = message_lib.UserMessage,
       **kwargs,
   ) -> message_lib.Message:
+    """Renders the template and transforms it as LM input message.
+
+    Args:
+      allow_partial: If True, allows partial rendering, which leaves unresolved
+        variables in place in the output text. Otherwise, raises error when
+        there are unresolved variables.
+      implicit: If True, reuse the rendering output if a parent `lf.Template`
+        is rendering current `lf.Template` multiple times. This is important
+        for making sure all references to the same `lf.Template` within a single
+        top-level rendering would return the same result. If False, every call
+        to `render` will trigger the actual rendering process.
+      message_cls: The message class used for creating the return value.
+      **kwargs: Values for template variables, which override values from
+        member attributes or context.
+
+    Returns:
+      A Message object containing the rendered result.
+    """
     lm_input = super().render(
         allow_partial=allow_partial,
         implicit=implicit,
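A small sketch built on the `render()` docstring above: `render()` produces the LM input message without calling the LM, and `allow_partial=True` leaves unresolved variables in place instead of raising an error.

```python
import langfun as lf

func = lf.LangFunc('Hello, {{name}}! Today is {{day}}.')

# All variables supplied as keyword args; returns an lf.Message.
print(func.render(name='Gemini', day='Friday').text)

# Partial rendering keeps the unresolved `day` placeholder in the text.
print(func.render(name='Gemini', allow_partial=True).text)
```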