langfun 0.1.2.dev202511030805__py3-none-any.whl → 0.1.2.dev202511050805__py3-none-any.whl
This diff shows the content differences between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of langfun has been flagged as potentially problematic.
- langfun/core/agentic/action.py +76 -9
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/async_support.py +32 -3
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +10 -3
- langfun/core/data/conversion/gemini.py +9 -2
- langfun/core/data/conversion/openai.py +17 -7
- langfun/core/eval/base.py +46 -42
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/checkpointing.py +30 -4
- langfun/core/eval/v2/evaluation.py +59 -13
- langfun/core/eval/v2/example.py +22 -11
- langfun/core/eval/v2/experiment.py +51 -8
- langfun/core/eval/v2/metric_values.py +23 -3
- langfun/core/eval/v2/metrics.py +33 -4
- langfun/core/eval/v2/progress.py +9 -1
- langfun/core/eval/v2/reporting.py +15 -1
- langfun/core/eval/v2/runners.py +27 -7
- langfun/core/langfunc.py +45 -130
- langfun/core/language_model.py +88 -10
- langfun/core/llms/anthropic.py +27 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +22 -2
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +29 -1
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +9 -1
- langfun/core/llms/google_genai.py +29 -1
- langfun/core/llms/groq.py +27 -2
- langfun/core/llms/llama_cpp.py +22 -3
- langfun/core/llms/openai.py +29 -1
- langfun/core/llms/openai_compatible.py +18 -6
- langfun/core/llms/rest.py +12 -1
- langfun/core/llms/vertexai.py +39 -6
- langfun/core/logging.py +1 -1
- langfun/core/mcp/client.py +77 -22
- langfun/core/mcp/session.py +90 -10
- langfun/core/mcp/tool.py +83 -23
- langfun/core/memory.py +1 -0
- langfun/core/message.py +75 -11
- langfun/core/message_test.py +9 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +19 -1
- langfun/core/modalities/mime.py +54 -4
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +66 -5
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/structured/completion.py +32 -37
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +70 -15
- langfun/core/structured/parsing.py +90 -74
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +201 -130
- langfun/core/structured/schema.py +70 -10
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +45 -34
- langfun/core/structured/tokenization.py +24 -9
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +139 -40
- langfun/core/template_test.py +40 -0
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/METADATA +1 -1
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/RECORD +77 -77
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202511030805.dist-info → langfun-0.1.2.dev202511050805.dist-info}/top_level.txt +0 -0
langfun/core/concurrent.py
CHANGED
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+"""Utilities for concurrency in Langfun."""
 
 import abc
 import collections
@@ -97,7 +97,7 @@ class RetryError(RuntimeError):
 
 
 def with_retry(
-    func: Callable[
+    func: Callable[..., Any],
     retry_on_errors: Union[
         Union[Type[BaseException], Tuple[Type[BaseException], str]],
         Sequence[Union[Type[BaseException], Tuple[Type[BaseException], str]]],
@@ -108,10 +108,25 @@ def with_retry(
     max_retry_interval: int = 300,
     seed: int | None = None,
 ) -> Callable[..., Any]:
-  """
+  """Decorator-like function to add retry mechanism to a function.
+
+  Example:
+
+    ```
+    def flaky_function():
+      if random.random() < 0.5:
+        raise ValueError('error')
+      return 1
+
+    reliable_function = lf.with_retry(
+        flaky_function,
+        retry_on_errors=ValueError,
+        max_attempts=3)
+    reliable_function()
+    ```
 
   Args:
-    func:
+    func: The function to add retry mechanism.
     retry_on_errors: A sequence of exception types or tuples of exception type
       and error messages (described in regular expression) as the desired
       exception types to retry.
@@ -128,8 +143,7 @@ def with_retry(
       determined based on current time.
 
   Returns:
-    A function with the same signature of
-    capability.
+    A function with the same signature of `func`, but with retry capability.
   """
 
   def _func(*args, **kwargs):
@@ -179,6 +193,24 @@ def concurrent_execute(
 ) -> list[Any]:
   """Executes a function concurrently under current component context.
 
+  `lf.concurrent_execute` applies a function to each item in an iterable of
+  inputs in parallel and returns a list of results in the same order as the
+  inputs. It is a convenient wrapper around `lf.concurrent_map` for synchronous
+  bulk processing.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+
+  def square(x):
+    return x ** 2
+
+  results = lf.concurrent_execute(square, [1, 2, 3, 4], max_workers=2)
+  print(results)
+  # Output: [1, 4, 9, 16]
+  ```
+
   Args:
     func: A user function.
     parallel_inputs: The inputs for `func` which will be processed in parallel.
@@ -649,6 +681,38 @@ def concurrent_map(
 ) -> Iterator[Any]:
   """Maps inputs to outptus via func concurrently under current context.
 
+  `lf.concurrent_map` applies a function to each item in an iterable of
+  inputs in parallel and yields `(input, output, error)` tuples as they are
+  completed. It supports features like ordered/unordered results, progress
+  bars, timeouts, and automatic retries for transient errors.
+
+  **Example:**
+
+  ```python
+  import langfun as lf
+  import time
+  import random
+
+  def flaky_square(x):
+    time.sleep(random.random())
+    if random.random() < 0.3:
+      raise ValueError("Flaky error")
+    return x ** 2
+
+  # Unordered execution with progress bar and retries
+  for input, output, error in lf.concurrent_map(
+      flaky_square,
+      range(10),
+      max_workers=3,
+      show_progress=True,
+      retry_on_errors=ValueError,
+      max_attempts=3):
+    if error:
+      print(f"Input {input} failed with error: {error}")
+    else:
+      print(f"Input {input} succeeded with output: {output}")
+  ```
+
   Args:
     func: A user function.
     parallel_inputs: The inputs for `func` which will be processed in parallel.
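The `with_retry` signature above also accepts `(ExceptionType, regex)` tuples for `retry_on_errors`, retrying only errors whose message matches the pattern. A minimal sketch of that form (the `fetch` function and its 429 message are hypothetical; `lf.with_retry` and `max_attempts` appear in the diff):

```python
import langfun as lf

def fetch():
  # Hypothetical flaky call whose failure message matters.
  raise TimeoutError('HTTP 429: rate limited')

# Retry only TimeoutErrors whose message matches the regex; other errors
# propagate immediately.
fetch_with_retry = lf.with_retry(
    fetch,
    retry_on_errors=(TimeoutError, '.*429.*'),
    max_attempts=5,
)
```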
langfun/core/console.py
CHANGED
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+"""Utilities for console output and notebook display."""
 
 import sys
 from typing import Any
langfun/core/data/conversion/anthropic.py
CHANGED
@@ -21,7 +21,14 @@ from langfun.core import modalities as lf_modalities
 
 
 class AnthropicMessageConverter(lf.MessageConverter):
-  """Converter
+  """Converter between Langfun messages and Anthropic API message format.
+
+  This converter translates `lf.Message` objects into the JSON format required
+  by the Anthropic API and vice versa. It handles text and modalities like
+  images and PDFs by encoding them in base64 format as expected by Anthropic.
+  An optional `chunk_preprocessor` can be provided to modify or filter
+  chunks before conversion.
+  """
 
   FORMAT_ID = 'anthropic'
 
@@ -30,12 +37,12 @@ class AnthropicMessageConverter(lf.MessageConverter):
       (
           'Chunk preprocessor for Langfun chunk to Anthropic chunk conversion. '
           'It will be applied before each Langfun chunk is converted. '
-          'If returns None, the chunk will be skipped.'
+          'If it returns None, the chunk will be skipped.'
       )
   ] = None
 
   def to_value(self, message: lf.Message) -> dict[str, Any]:
-    """Converts a Langfun message to
+    """Converts a Langfun message to Anthropic API."""
     content = []
     for chunk in message.chunk():
       if self.chunk_preprocessor:
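A hedged usage sketch for `AnthropicMessageConverter` above: the class and its `to_value` method appear in the diff, while the import path (taken from the file list) and the exact output shape are assumptions:

```python
import langfun as lf
from langfun.core.data.conversion import anthropic  # path from the file list

converter = anthropic.AnthropicMessageConverter()
# Convert a Langfun message into an Anthropic-style message dict.
api_message = converter.to_value(lf.UserMessage('Describe this image.'))
# Expected shape (assumption): {'role': 'user', 'content': [{'type': 'text', ...}]}
```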
langfun/core/data/conversion/gemini.py
CHANGED
@@ -21,7 +21,14 @@ from langfun.core import modalities as lf_modalities
 
 
 class GeminiMessageConverter(lf.MessageConverter):
-  """Converter
+  """Converter between Langfun messages and Gemini API message format.
+
+  This converter translates `lf.Message` objects into the JSON format required
+  by the public Gemini API (e.g., via Vertex AI or Google AI Studio) and
+  vice versa. It handles text and modalities like images, extracting thought
+  chunks if present. An optional `chunk_preprocessor` can be provided to
+  modify or filter chunks before conversion.
+  """
 
   FORMAT_ID = 'gemini'
 
@@ -30,7 +37,7 @@ class GeminiMessageConverter(lf.MessageConverter):
       (
           'Chunk preprocessor for Langfun chunk to Gemini chunk conversion. '
           'It will be applied before each Langfun chunk is converted. '
-          'If returns None, the chunk will be skipped.'
+          'If it returns None, the chunk will be skipped.'
       ),
   ] = None
 
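The `chunk_preprocessor` field documented above can drop or rewrite chunks before conversion; returning `None` skips a chunk. A sketch under those assumptions (the filtering predicate is hypothetical):

```python
from langfun.core.data.conversion import gemini  # path from the file list

def drop_empty_text(chunk):
  # Returning None skips the chunk, per the field docstring above.
  if isinstance(chunk, str) and not chunk.strip():
    return None
  return chunk

converter = gemini.GeminiMessageConverter(chunk_preprocessor=drop_empty_text)
```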
langfun/core/data/conversion/openai.py
CHANGED
@@ -20,9 +20,14 @@ from langfun.core import modalities as lf_modalities
 
 
 class OpenAIChatCompletionAPIMessageConverter(lf.MessageConverter):
-  """Converter
-
-
+  """Converter for OpenAI Chat Completion API.
+
+  This converter translates `lf.Message` objects into the JSON format
+  required by the OpenAI Chat Completions API
+  (https://platform.openai.com/docs/api-reference/chat) and vice versa.
+  It handles text and image modalities, mapping Langfun roles to OpenAI
+  roles ('system', 'user', 'assistant'). An optional `chunk_preprocessor`
+  can be provided to modify or filter chunks before conversion.
   """
 
   FORMAT_ID = 'openai_chat_completion_api'
@@ -32,7 +37,7 @@ class OpenAIChatCompletionAPIMessageConverter(lf.MessageConverter):
       (
           'Chunk preprocessor for Langfun chunk to OpenAI chunk conversion. '
           'It will be applied before each Langfun chunk is converted. '
-          'If returns None, the chunk will be skipped.'
+          'If it returns None, the chunk will be skipped.'
       )
   ] = None
 
@@ -159,9 +164,14 @@ lf.Message.from_openai_chat_completion_api_format = (
 class OpenAIResponsesAPIMessageConverter(
     OpenAIChatCompletionAPIMessageConverter
 ):
-  """Converter
-
-
+  """Converter for OpenAI Responses API.
+
+  This converter translates `lf.Message` objects into the JSON format
+  required by the OpenAI Responses API
+  (https://platform.openai.com/docs/api-reference/responses/create),
+  which is used for human-in-the-loop rating, and vice versa.
+  It extends `OpenAIChatCompletionAPIMessageConverter` but uses different
+  type names for content chunks (e.g., 'input_text', 'output_image').
   """
 
   FORMAT_ID = 'openai_responses_api'
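The diff attaches this converter to messages via `lf.Message.from_openai_chat_completion_api_format`. A round-trip sketch under that assumption (the payload shape follows the OpenAI Chat Completions API and is illustrative, not taken from this diff):

```python
import langfun as lf

# Illustrative payload; the exact schema is an assumption based on the
# OpenAI Chat Completions API.
payload = {'role': 'assistant', 'content': [{'type': 'text', 'text': 'Hi!'}]}
msg = lf.Message.from_openai_chat_completion_api_format(payload)
```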
langfun/core/eval/base.py
CHANGED
@@ -59,18 +59,20 @@ class Evaluable(lf.Component):
   @property
   @abc.abstractmethod
   def id(self) -> str:
-    """Returns the ID of
+    """Returns the ID of this evaluable node.
 
     Returns:
-
-
-
-
+      A string as the ID of this evaluable node.
+      If an evaluable node acts as a container for other evaluable nodes
+      (e.g. `lf.Suite`), its ID could be empty.
+      Leaf evaluable nodes (e.g. `lf.Evaluation`) must have unique IDs
+      under the same container, as their IDs will be used as the directory
+      name for saving their results.
     """
 
   @property
   def dir(self) -> str | None:
-    """Returns the directory for saving results
+    """Returns the directory for saving results."""
     if self.root_dir is None:
       return None
     return os.path.join(self.root_dir, self.id)
@@ -82,18 +84,18 @@ class Evaluable(lf.Component):
 
   @property
   def index_link(self) -> str | None:
-    """Returns the index page."""
+    """Returns the link to the index page."""
     if self.dir is None:
       return None
     return self.link(os.path.join(self.dir, Evaluable.INDEX_HTML))
 
   def summary(self, pivot_field: str = 'lm') -> 'Summary':
-    """Returns a summary for all child evaluations
+    """Returns a summary for all child evaluations."""
     return Summary([pg.Ref(x) for x in self.leaf_nodes], pivot_field)
 
   @property
   def summary_link(self) -> str | None:
-    """Returns the summary page."""
+    """Returns the link to the summary page."""
     if self.root_dir is None:
       return None
     return self.link(os.path.join(self.root_dir, Evaluable.SUMMARY_HTML))
@@ -177,6 +179,7 @@ class Evaluable(lf.Component):
 
   @property
   def is_leaf(self) -> bool:
+    """Returns whether this node is a leaf node."""
     return isinstance(self, Evaluation) and not self.children
 
   @functools.cached_property
@@ -404,7 +407,7 @@ class Evaluable(lf.Component):
       timeout: int | None = None,
       **kwargs,
   ) -> None:
-    """Run the
+    """Run the evaluation and fill `self.result`. Subclass to implement."""
 
   @abc.abstractmethod
   def _completion_status(self, run_status: str) -> str:
@@ -545,6 +548,7 @@ class Evaluable(lf.Component):
   def from_dir(
       cls, maybe_dir: str, load_result: bool = True
   ) -> Optional['Evaluable']:
+    """Loads an evaluable object from a directory."""
     exp_json = os.path.join(maybe_dir, Evaluable.EXPERIMENT_JSON)
     if not pg.io.path_exists(exp_json):
       return None
@@ -558,7 +562,7 @@ class Evaluable(lf.Component):
     return experiment
 
   def try_load_result(self) -> bool:
-    """Try
+    """Try loads result from file if it's not loaded."""
     if self.result is None:
       result_json = os.path.join(self.dir, Evaluable.RESULT_JSON)
       if pg.io.path_exists(result_json):
@@ -604,6 +608,7 @@ class Suite(Evaluable):
 
   @functools.cached_property
   def hash(self) -> str:
+    """Returns the hash of this suite."""
     return hashlib.md5(
         ' '.join(sorted([c.hash for c in self.children])).encode()
     ).hexdigest()[:8]
@@ -619,14 +624,14 @@ class Suite(Evaluable):
 
 
 class Evaluation(Evaluable):
-  """Base class for evaluation
+  """Base class for evaluation sets."""
 
   inputs: pg.typing.Annotated[
       pg.typing.Functor(),
       (
           'A functor that returns a list of user-defined objects as the input '
-          'examples. It
-          '`lf.eval.inputs_from(path)`, from a Python
+          'examples. It can be inputs loaded from a JSON file via '
+          '`lf.eval.inputs_from(path)`, from a Python-coded list via '
          '`lf.eval.as_inputs(values)` or a user-defined functor that '
          'generates input objects at runtime.'
      ),
@@ -648,12 +653,12 @@ class Evaluation(Evaluable):
       pg.typing.Functor().noneable(),
       (
           'A functor that returns a type annotation that will be converted to '
-          '`lf.Schema`, or a tuple of (annotation,
+          '`lf.Schema`, or a tuple of (annotation, few-shot examples). '
           'For "call" method, it could be None, indicating that the raw '
-          'response from the LM will be used as the output, and the
-          'examples will be used for parsing. For "query" and "complete"
-          'must be provided, and the
-          'for prompting. Here
+          'response from the LM will be used as the output, and the few-shot '
+          'examples will be used for parsing. For "query" and "complete" '
+          'methods, it must be provided, and the few-shot examples will be '
+          'used directly for prompting. Here is example code on how the '
           'functors should be defined:'
           + inspect.cleandoc("""
              ```
@@ -693,7 +698,7 @@ class Evaluation(Evaluable):
   completion_prompt_field: Annotated[
       str | None,
       (
-          'A
+          'A string field that will be automatically added to the class of the '
          'input object for `lf.complete`. If None, no field will be added to '
          'the class, instead the prompt will be passed as the first argument '
          'of the input object to complete. Applicable only when `method` is '
@@ -738,7 +743,7 @@ class Evaluation(Evaluable):
 
   @functools.cached_property
   def hash(self) -> str:
-    """Returns the
+    """Returns the semantics-based hash of the evaluation."""
     if self.is_deterministic:
       identity = pg.format(self._identifiers(), compact=True)
     else:
@@ -784,7 +789,7 @@ class Evaluation(Evaluable):
 
   @property
   def complete_rate(self) -> float:
-    """Returns the
+    """Returns the completion rate of examples."""
     return self.num_completed / self.num_examples
 
   #
@@ -837,7 +842,7 @@ class Evaluation(Evaluable):
 
   @functools.cached_property
   def non_oop_failures(self) -> list[tuple[Any, Exception]]:
-    """Returns the OOP failures."""
+    """Returns the non-OOP failures."""
     return [item for item in self.failures
             if not isinstance(item[1], lf_structured.MappingError)]
 
@@ -883,7 +888,7 @@ class Evaluation(Evaluable):
 
   @functools.cached_property
   def schema(self) -> lf_structured.Schema | None:
-    """
+    """Returns the schema for parsing LLM response."""
     if self.schema_fn is None:
       return None
 
@@ -897,7 +902,7 @@ class Evaluation(Evaluable):
 
   @functools.cached_property
   def fewshot_examples(self) -> list[lf.structured.MappingExample] | None:
-    """
+    """Returns the few-shot examples for prompting or parsing."""
     if self.schema_fn is None:
       return None
 
@@ -973,7 +978,7 @@ class Evaluation(Evaluable):
 
   @functools.cached_property
   def children(self) -> list['Evaluation']:
-    """Returns
+    """Returns child evaluations if this evaluation has a parameter space."""
     if self.is_deterministic:
       return []
     children = []
@@ -1023,7 +1028,7 @@ class Evaluation(Evaluable):
 
   @property
   def non_oop_failures_link(self) -> str | None:
-    """Returns the link to
+    """Returns the link to the non-OOP failures page."""
     if self.dir is None:
       return None
     return self.link(os.path.join(self.dir, Evaluation.NON_OOP_FAILURES_HTML))
@@ -1208,10 +1213,10 @@ class Evaluation(Evaluable):
     )
 
   def process_output(self, example: Any, output: lf.Message) -> None:
-    """
+    """Processes the output for an example.
 
     Subclasses can override this method to generate and attach additional
-    metadata for debugging
+    metadata for debugging purposes. For example, draw bounding boxes on the
     input image based on LLM predicted boxes and attach to output_message's
     metadata.
 
@@ -1219,8 +1224,8 @@ class Evaluation(Evaluable):
 
       class BoundingBoxEval(lf.eval.Matching):
         ...
-        def process_output(example, output):
-          output.metadata.image_with_bbox =
+        def process_output(self, example, output):
+          output.metadata.image_with_bbox = draw_bounding_box(
               example.image, output.result)
 
     Args:
@@ -1449,7 +1454,7 @@ class Evaluation(Evaluable):
         trace the LM input, response and parsed structure. If error is raised
         before LLM could return a response, None will be its value.
       error: The exception during processing the example.
-      dryrun: Whether or not
+      dryrun: Whether or not auditing takes place during dryrun.
     """
     if error is not None:
       self._failures.append((example, error))
@@ -1674,7 +1679,7 @@ class Evaluation(Evaluable):
 
   @classmethod
   def visualize(cls, evaluations: list['Evaluation']) -> str | None:
-    """Visualize
+    """Visualize a list of evaluations of this task in HTML."""
     del evaluations
     return None
 
@@ -1810,7 +1815,7 @@ class Summary(pg.Object):
     )
 
   class Table(pg.Object):
-    """A pivot table for
+    """A pivot table for viewing evaluations."""
 
     class Row(pg.Object):
       descriptor: dict[str, Any]
@@ -2013,12 +2018,12 @@ class Summary(pg.Object):
       return self._context.completed
 
     def stop(self) -> 'Summary':
-      """
+      """Signals and waits for the monitor thread to stop."""
      self._context.stopping = True
      return self.join()
 
    def join(self) -> 'Summary':
-      """Waits the monitor thread to complete."""
+      """Waits for the monitor thread to complete."""
      self._thread.join()
      summary = self.summary
      assert summary is not None
@@ -2035,7 +2040,7 @@ class Summary(pg.Object):
       scan_interval: int = 60,
       refresh_when_stop: bool = True,
   ) -> MonitorResult:
-    """
+    """Monitors one or more root directories and save summary periodically."""
     context = pg.Dict(stopping=False, completed=False, summary=None)
 
     def _monitor():
@@ -2187,7 +2192,7 @@ def monitor_async(
     scan_interval: int = 60,
     refresh_when_stop: bool = True,
 ) -> Summary.MonitorResult:
-  """
+  """Asynchronously monitors one or more root directories for summary."""
   return Summary.monitor_async(
       root_dir,
       save_as,
@@ -2365,10 +2370,9 @@ def run(
       a string (for string-based patcher), a `pg.patching.Patcher` object, or
       a rebind function (e.g. `pg.rebind`). See `lf.eval.patch_*` for more
       details.
-    mode: The mode to run the suite
-
-
-      to do nothing.
+    mode: The mode to run the suite: "run" to run with reuse of existing
+      results, "rerun" to force re-evaluation, "dryrun" for a dry run, and
+      "noop" to do nothing.
     debug: Whether to run in debug mode.
     print_definition: Whether to print the experiment definition.
    **kwargs: Additional arguments to be passed to dryrun/run the suite.
langfun/core/eval/matching.py
CHANGED
@@ -38,7 +38,7 @@ class Matching(base.Evaluation):
 
   @abc.abstractmethod
   def answer(self, output: Any, example: Any) -> Any:
-    """Returns the answer from the
+    """Returns the answer from the structured output."""
 
   @property
   def matches(self) -> list[tuple[int, Any, Any, lf.Message]]:
@@ -52,6 +52,7 @@ class Matching(base.Evaluation):
 
   @property
   def match_rate(self) -> float:
+    """Returns the match rate."""
     if self.num_completed == 0:
       return 0.0
     return self.num_matches / self.num_completed
@@ -68,17 +69,19 @@ class Matching(base.Evaluation):
 
   @property
   def mismatch_rate(self) -> float:
+    """Returns the mismatch rate."""
     if self.num_completed == 0:
       return 0.0
     return self.num_mismatches / self.num_completed
 
   @property
   def matches_link(self) -> str:
-    """Returns the matches page."""
+    """Returns the link to the matches page."""
     return self.link(os.path.join(self.dir, Matching.MATCHES_HTML))
 
   @property
   def mismatches_link(self) -> str:
+    """Returns the link to the mismatches page."""
     return self.link(os.path.join(self.dir, Matching.MISMATCHES_HTML))
 
   def _reset(self) -> None:
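A hedged subclass sketch for the abstract `answer` method above. The `groundtruth` counterpart and all field names are assumptions, as they do not appear in this diff:

```python
import langfun as lf

class ArithmeticEval(lf.eval.Matching):  # hypothetical evaluation
  ...

  def groundtruth(self, example):
    # Assumption: Matching also exposes a `groundtruth` hook (not shown here).
    return example.expected_answer  # hypothetical field

  def answer(self, output, example):
    # `output` is the structured output object per the signature above.
    return output.final_answer  # hypothetical field
```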
langfun/core/eval/patching.py
CHANGED
@@ -114,17 +114,17 @@ def model_by_name(name: str) -> lf.LanguageModel:
 
 @pg.patcher(auto_typing=True)
 def lm(unused_eval, models: list[str]):
-  """
+  """Patches the LM used for benchmarking."""
   return patch_lm(pg.oneof([model_by_name(name) for name in models]))
 
 
 @pg.patcher(auto_typing=True)
 def temperature(unused_eval, value: float):
-  """
+  """Patches the temperature used for benchmarking."""
   return patch_member(lf.LMSamplingOptions, "temperature", value)
 
 
 @pg.patcher(auto_typing=True)
 def max_tokens(unused_eval, value: int | None):
-  """
+  """Patches the max_tokens used for benchmarking."""
   return patch_member(lf.LMSamplingOptions, "max_tokens", value)
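The patchers above are registered via `@pg.patcher(auto_typing=True)`, which in PyGlove makes them addressable by string, matching the note in base.py's `run` docstring about string-based patchers. A heavily hedged sketch; `pg.patch` and the URI-like string syntax are assumptions about PyGlove's patching API, and `my_evaluation` is a hypothetical evaluation instance:

```python
import pyglove as pg

# Assumption: a registered patcher can be applied by name through pg.patch,
# with `auto_typing` parsing '0.0' into a float.
patched_eval = pg.patch(my_evaluation, ['temperature?value=0.0'])
```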
langfun/core/eval/scoring.py
CHANGED
@@ -41,18 +41,19 @@ class Scoring(base.Evaluation):
 
   @property
   def score_rate(self) -> float:
-    """Returns the
+    """Returns the rate of scored examples among the completed ones."""
     if self.num_completed == 0:
       return 0.0
     return self.num_scored / self.num_completed
 
   @property
   def scored_link(self) -> str:
-    """Returns the
+    """Returns the scored examples page."""
     return self.link(os.path.join(self.dir, Scoring.SCORED_HTML))
 
   @property
   def avg_score(self) -> float:
+    """Returns the average score of scored examples."""
     if self.num_scored == 0:
       return 0
     return sum([i[2] for i in self._scored]) / self.num_scored
@@ -181,7 +182,7 @@ class Scoring(base.Evaluation):
     super()._render_summary_metrics(s)
 
   def _render_scored(self, s: io.StringIO) -> None:
-    """Formats the
+    """Formats the scored cases into html."""
     s.write('<h2> Scored </h2>')
     s.write('<div style="white-space:pre">\n')
     s.write(
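Illustrative arithmetic for the `avg_score` property above; the tuple layout (score at index 2) is inferred from `sum([i[2] for i in self._scored])` in the source:

```python
# Hypothetical scored items shaped as (example_id, output, score).
scored = [(1, 'output-a', 0.8), (2, 'output-b', 0.6)]
avg_score = sum(item[2] for item in scored) / len(scored)
assert abs(avg_score - 0.7) < 1e-9  # (0.8 + 0.6) / 2
```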
langfun/core/eval/v2/checkpointing.py
CHANGED
@@ -29,7 +29,17 @@ Runner = experiment_lib.Runner
 
 
 class Checkpointer(experiment_lib.Plugin):
-  """Base class for checkpointing evaluation examples."""
+  """Base class for checkpointing evaluation examples.
+
+  `Checkpointer` is a plugin that saves the state of processed examples
+  incrementally during an experiment run, allowing the experiment to be resumed
+  later. When an experiment starts, the checkpointer loads any previously saved
+  examples from an earlier run (or a warm-start run) into `experiment.state`,
+  so the runner can skip processing them again.
+  Subclasses should implement `_list_checkpoint_filenames` to identify
+  checkpoint files to load, and `_save_example` to save a newly processed
+  example.
+  """
 
   checkpoint_filename: Annotated[
       str,
@@ -170,7 +180,12 @@ class Checkpointer(experiment_lib.Plugin):
 
 
 class PerExampleCheckpointer(Checkpointer):
-  """Checkpointer that saves each example to a separate file."""
+  """Checkpointer that saves each example to a separate file.
+
+  This checkpointer saves each processed example to its own checkpoint file,
+  named using the pattern `<checkpoint_filename_prefix>_<example_id>.<ext>`.
+  For example, `checkpoint_1.bagz`, `checkpoint_2.bagz`, etc.
+  """
 
   def _on_bound(self):
     super()._on_bound()
@@ -235,7 +250,13 @@ class PerExampleCheckpointer(Checkpointer):
 
 
 class BulkCheckpointer(Checkpointer):
-  """Checkpointer that saves all examples to a single file."""
+  """Checkpointer that saves all examples of an evaluation to a single file.
+
+  This checkpointer appends newly processed examples of an evaluation to a
+  single sequence file (e.g., `checkpoint.bagz`). This is often more efficient
+  than `PerExampleCheckpointer` when dealing with a large number of examples
+  or when file system overhead is a concern.
+  """
 
   def _on_bound(self):
     super()._on_bound()
@@ -341,7 +362,12 @@ class BulkCheckpointer(Checkpointer):
 
 
 class SequenceWriter:
-  """
+  """A thread-safe writer for sequence files (e.g., Bagz).
+
+  `SequenceWriter` wraps a `pg.io.SequenceWriter` to provide thread-safe
+  `add` and `close` operations, ensuring that examples can be written
+  concurrently from multiple threads without corrupting the sequence file.
+  """
 
   def __init__(self, path: str):
     self._lock = threading.Lock()
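A minimal sketch of the thread-safety contract described in the `SequenceWriter` docstring above. The `add` and `close` methods come from the diff; the constructor argument and payload format are assumptions:

```python
import threading
from langfun.core.eval.v2 import checkpointing  # path from the file list above

writer = checkpointing.SequenceWriter('/tmp/checkpoint.bagz')

def save(blob: str):
  writer.add(blob)  # `add` is guarded by an internal lock

threads = [
    threading.Thread(target=save, args=(f'example-{i}',)) for i in range(4)
]
for t in threads:
  t.start()
for t in threads:
  t.join()
writer.close()  # also lock-guarded; flushes and closes the sequence file
```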
|