langfun 0.1.2.dev202510230805__py3-none-any.whl → 0.1.2.dev202511270805__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of langfun might be problematic. Click here for more details.
- langfun/core/__init__.py +2 -0
- langfun/core/agentic/__init__.py +4 -1
- langfun/core/agentic/action.py +447 -29
- langfun/core/agentic/action_eval.py +9 -2
- langfun/core/agentic/action_test.py +149 -21
- langfun/core/async_support.py +32 -3
- langfun/core/coding/python/correction.py +19 -9
- langfun/core/coding/python/execution.py +14 -12
- langfun/core/coding/python/generation.py +21 -16
- langfun/core/coding/python/sandboxing.py +23 -3
- langfun/core/component.py +42 -3
- langfun/core/concurrent.py +70 -6
- langfun/core/concurrent_test.py +1 -0
- langfun/core/console.py +1 -1
- langfun/core/data/conversion/anthropic.py +12 -3
- langfun/core/data/conversion/anthropic_test.py +8 -6
- langfun/core/data/conversion/gemini.py +9 -2
- langfun/core/data/conversion/gemini_test.py +12 -9
- langfun/core/data/conversion/openai.py +145 -31
- langfun/core/data/conversion/openai_test.py +161 -17
- langfun/core/eval/base.py +47 -43
- langfun/core/eval/base_test.py +5 -5
- langfun/core/eval/matching.py +5 -2
- langfun/core/eval/patching.py +3 -3
- langfun/core/eval/scoring.py +4 -3
- langfun/core/eval/v2/__init__.py +1 -0
- langfun/core/eval/v2/checkpointing.py +64 -6
- langfun/core/eval/v2/checkpointing_test.py +9 -2
- langfun/core/eval/v2/eval_test_helper.py +103 -2
- langfun/core/eval/v2/evaluation.py +91 -16
- langfun/core/eval/v2/evaluation_test.py +9 -3
- langfun/core/eval/v2/example.py +50 -40
- langfun/core/eval/v2/example_test.py +16 -8
- langfun/core/eval/v2/experiment.py +74 -8
- langfun/core/eval/v2/experiment_test.py +19 -0
- langfun/core/eval/v2/metric_values.py +31 -3
- langfun/core/eval/v2/metric_values_test.py +32 -0
- langfun/core/eval/v2/metrics.py +157 -44
- langfun/core/eval/v2/metrics_test.py +39 -18
- langfun/core/eval/v2/progress.py +30 -1
- langfun/core/eval/v2/progress_test.py +27 -0
- langfun/core/eval/v2/progress_tracking.py +12 -3
- langfun/core/eval/v2/progress_tracking_test.py +6 -1
- langfun/core/eval/v2/reporting.py +90 -71
- langfun/core/eval/v2/reporting_test.py +24 -6
- langfun/core/eval/v2/runners/__init__.py +30 -0
- langfun/core/eval/v2/{runners.py → runners/base.py} +59 -142
- langfun/core/eval/v2/runners/beam.py +341 -0
- langfun/core/eval/v2/runners/beam_test.py +131 -0
- langfun/core/eval/v2/runners/ckpt_monitor.py +294 -0
- langfun/core/eval/v2/runners/ckpt_monitor_test.py +162 -0
- langfun/core/eval/v2/runners/debug.py +40 -0
- langfun/core/eval/v2/runners/debug_test.py +76 -0
- langfun/core/eval/v2/runners/parallel.py +100 -0
- langfun/core/eval/v2/runners/parallel_test.py +95 -0
- langfun/core/eval/v2/runners/sequential.py +47 -0
- langfun/core/eval/v2/runners/sequential_test.py +172 -0
- langfun/core/langfunc.py +45 -130
- langfun/core/langfunc_test.py +7 -5
- langfun/core/language_model.py +141 -21
- langfun/core/language_model_test.py +54 -3
- langfun/core/llms/__init__.py +9 -1
- langfun/core/llms/anthropic.py +157 -2
- langfun/core/llms/azure_openai.py +29 -17
- langfun/core/llms/cache/base.py +25 -3
- langfun/core/llms/cache/in_memory.py +48 -7
- langfun/core/llms/cache/in_memory_test.py +14 -4
- langfun/core/llms/compositional.py +25 -1
- langfun/core/llms/deepseek.py +30 -2
- langfun/core/llms/fake.py +32 -1
- langfun/core/llms/gemini.py +55 -17
- langfun/core/llms/gemini_test.py +84 -0
- langfun/core/llms/google_genai.py +34 -1
- langfun/core/llms/groq.py +28 -3
- langfun/core/llms/llama_cpp.py +23 -4
- langfun/core/llms/openai.py +36 -3
- langfun/core/llms/openai_compatible.py +148 -27
- langfun/core/llms/openai_compatible_test.py +207 -20
- langfun/core/llms/openai_test.py +0 -2
- langfun/core/llms/rest.py +12 -1
- langfun/core/llms/vertexai.py +58 -8
- langfun/core/logging.py +1 -1
- langfun/core/mcp/client.py +77 -22
- langfun/core/mcp/client_test.py +8 -35
- langfun/core/mcp/session.py +94 -29
- langfun/core/mcp/session_test.py +54 -0
- langfun/core/mcp/tool.py +151 -22
- langfun/core/mcp/tool_test.py +197 -0
- langfun/core/memory.py +1 -0
- langfun/core/message.py +160 -55
- langfun/core/message_test.py +65 -81
- langfun/core/modalities/__init__.py +8 -0
- langfun/core/modalities/audio.py +21 -1
- langfun/core/modalities/image.py +19 -1
- langfun/core/modalities/mime.py +64 -3
- langfun/core/modalities/mime_test.py +11 -0
- langfun/core/modalities/pdf.py +19 -1
- langfun/core/modalities/video.py +21 -1
- langfun/core/modality.py +167 -29
- langfun/core/modality_test.py +42 -12
- langfun/core/natural_language.py +1 -1
- langfun/core/sampling.py +4 -4
- langfun/core/sampling_test.py +20 -4
- langfun/core/structured/__init__.py +2 -24
- langfun/core/structured/completion.py +34 -44
- langfun/core/structured/completion_test.py +23 -43
- langfun/core/structured/description.py +54 -50
- langfun/core/structured/function_generation.py +29 -12
- langfun/core/structured/mapping.py +81 -37
- langfun/core/structured/parsing.py +95 -79
- langfun/core/structured/parsing_test.py +0 -3
- langfun/core/structured/querying.py +215 -142
- langfun/core/structured/querying_test.py +65 -29
- langfun/core/structured/schema/__init__.py +49 -0
- langfun/core/structured/schema/base.py +664 -0
- langfun/core/structured/schema/base_test.py +531 -0
- langfun/core/structured/schema/json.py +174 -0
- langfun/core/structured/schema/json_test.py +121 -0
- langfun/core/structured/schema/python.py +316 -0
- langfun/core/structured/schema/python_test.py +410 -0
- langfun/core/structured/schema_generation.py +33 -14
- langfun/core/structured/scoring.py +47 -36
- langfun/core/structured/tokenization.py +26 -11
- langfun/core/subscription.py +2 -2
- langfun/core/template.py +174 -49
- langfun/core/template_test.py +123 -17
- langfun/env/__init__.py +8 -2
- langfun/env/base_environment.py +320 -128
- langfun/env/base_environment_test.py +473 -0
- langfun/env/base_feature.py +92 -15
- langfun/env/base_feature_test.py +228 -0
- langfun/env/base_sandbox.py +84 -361
- langfun/env/base_sandbox_test.py +1235 -0
- langfun/env/event_handlers/__init__.py +1 -1
- langfun/env/event_handlers/chain.py +233 -0
- langfun/env/event_handlers/chain_test.py +253 -0
- langfun/env/event_handlers/event_logger.py +95 -98
- langfun/env/event_handlers/event_logger_test.py +21 -21
- langfun/env/event_handlers/metric_writer.py +225 -140
- langfun/env/event_handlers/metric_writer_test.py +23 -6
- langfun/env/interface.py +854 -40
- langfun/env/interface_test.py +112 -2
- langfun/env/load_balancers_test.py +23 -2
- langfun/env/test_utils.py +126 -84
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/METADATA +1 -1
- langfun-0.1.2.dev202511270805.dist-info/RECORD +215 -0
- langfun/core/eval/v2/runners_test.py +0 -343
- langfun/core/structured/schema.py +0 -987
- langfun/core/structured/schema_test.py +0 -982
- langfun/env/base_test.py +0 -1481
- langfun/env/event_handlers/base.py +0 -350
- langfun-0.1.2.dev202510230805.dist-info/RECORD +0 -195
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/WHEEL +0 -0
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/licenses/LICENSE +0 -0
- {langfun-0.1.2.dev202510230805.dist-info → langfun-0.1.2.dev202511270805.dist-info}/top_level.txt +0 -0
|
@@ -24,7 +24,14 @@ import pyglove as pg
|
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class ActionEval(lf.eval.v2.Evaluation):
|
|
27
|
-
"""
|
|
27
|
+
"""Evaluation for agentic actions.
|
|
28
|
+
|
|
29
|
+
`ActionEval` is a specialized evaluation class for executing and evaluating
|
|
30
|
+
agentic actions based on provided inputs. Each input example is expected to
|
|
31
|
+
contain an `action` attribute. The `process` method executes the action
|
|
32
|
+
within a dedicated `Session`, captures the final result, and returns it
|
|
33
|
+
along with the session details in the metadata.
|
|
34
|
+
"""
|
|
28
35
|
|
|
29
36
|
action_args: Annotated[
|
|
30
37
|
dict[str, Any],
|
|
@@ -68,7 +75,7 @@ class ExampleView(pg.Object):
|
|
|
68
75
|
class ActionEvalV1(lf_eval.Matching):
|
|
69
76
|
"""Base class for action evaluations.
|
|
70
77
|
|
|
71
|
-
The input function should
|
|
78
|
+
The input function should return a list of pg.Dict, with `action` and
|
|
72
79
|
`groundtruth` fields.
|
|
73
80
|
"""
|
|
74
81
|
# We override the schema and prompt to dummy values since they are not used.
|
|
@@ -52,6 +52,7 @@ class Foo(action_lib.Action):
|
|
|
52
52
|
with session.track_phase('prepare'):
|
|
53
53
|
session.info('Begin Foo', x=1)
|
|
54
54
|
time.sleep(self.simulate_execution_time[0])
|
|
55
|
+
Bar()(session, lm=lm)
|
|
55
56
|
session.query(
|
|
56
57
|
'foo',
|
|
57
58
|
schema=int if self.simulate_query_error else None,
|
|
@@ -65,6 +66,7 @@ class Foo(action_lib.Action):
|
|
|
65
66
|
def _sub_task(i):
|
|
66
67
|
session.add_metadata(**{f'subtask_{i}': i})
|
|
67
68
|
time.sleep(self.simulate_execution_time[2])
|
|
69
|
+
Bar()(session, lm=lm)
|
|
68
70
|
return lf_structured.query(f'subtask_{i}', lm=lm)
|
|
69
71
|
|
|
70
72
|
self._state = []
|
|
@@ -88,6 +90,50 @@ class Foo(action_lib.Action):
|
|
|
88
90
|
lf_structured.query('additional query', lm=lm)
|
|
89
91
|
|
|
90
92
|
|
|
93
|
+
class ExecutionUnitPositionTest(unittest.TestCase):
|
|
94
|
+
|
|
95
|
+
def test_basics(self):
|
|
96
|
+
pos1 = action_lib.ExecutionUnit.Position(None, 0)
|
|
97
|
+
self.assertEqual(repr(pos1), 'Position(0)')
|
|
98
|
+
self.assertEqual(str(pos1), '')
|
|
99
|
+
self.assertIsNone(pos1.parent)
|
|
100
|
+
self.assertEqual(pos1.index, 0)
|
|
101
|
+
self.assertEqual(pos1.indices(), (0,))
|
|
102
|
+
self.assertEqual(pos1, (0,))
|
|
103
|
+
self.assertEqual(pos1, '')
|
|
104
|
+
self.assertEqual(pos1, action_lib.ExecutionUnit.Position(None, 0))
|
|
105
|
+
self.assertNotEqual(pos1, 1)
|
|
106
|
+
self.assertNotEqual(pos1, (1,))
|
|
107
|
+
self.assertNotEqual(pos1, action_lib.ExecutionUnit.Position(None, 1))
|
|
108
|
+
|
|
109
|
+
pos2 = action_lib.ExecutionUnit.Position(pos1, 0)
|
|
110
|
+
self.assertEqual(repr(pos2), 'Position(0, 0)')
|
|
111
|
+
self.assertEqual(str(pos2), '1')
|
|
112
|
+
self.assertEqual(pos2, '1')
|
|
113
|
+
self.assertEqual(pos2.parent, pos1)
|
|
114
|
+
self.assertEqual(pos2.index, 0)
|
|
115
|
+
self.assertEqual(pos2.indices(), (0, 0))
|
|
116
|
+
self.assertNotEqual(pos1, pos2)
|
|
117
|
+
self.assertLess(pos1, pos2)
|
|
118
|
+
self.assertGreater(pos2, pos1)
|
|
119
|
+
self.assertEqual(
|
|
120
|
+
hash(pos2),
|
|
121
|
+
hash(
|
|
122
|
+
action_lib.ExecutionUnit.Position(
|
|
123
|
+
action_lib.ExecutionUnit.Position(None, 0), 0
|
|
124
|
+
)
|
|
125
|
+
)
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
pos3 = action_lib.ExecutionUnit.Position(pos2, 0)
|
|
129
|
+
self.assertEqual(str(pos3), '1.1')
|
|
130
|
+
self.assertEqual(pos3, '1.1')
|
|
131
|
+
self.assertEqual(pos3.parent, pos2)
|
|
132
|
+
self.assertEqual(pos3.index, 0)
|
|
133
|
+
self.assertEqual(pos3.indices(), (0, 0, 0))
|
|
134
|
+
self.assertEqual(pos3.to_str(separator='>'), '1>1')
|
|
135
|
+
|
|
136
|
+
|
|
91
137
|
class ActionInvocationTest(unittest.TestCase):
|
|
92
138
|
|
|
93
139
|
def test_basics(self):
|
|
@@ -108,9 +154,7 @@ class ExecutionTraceTest(unittest.TestCase):
|
|
|
108
154
|
self.assertEqual(execution.id, '')
|
|
109
155
|
|
|
110
156
|
root = action_lib.ActionInvocation(action=action_lib.RootAction())
|
|
111
|
-
action_invocation = action_lib.ActionInvocation(
|
|
112
|
-
action=Foo(1)
|
|
113
|
-
)
|
|
157
|
+
action_invocation = action_lib.ActionInvocation(action=Foo(1))
|
|
114
158
|
root.execution.append(action_invocation)
|
|
115
159
|
self.assertEqual(action_invocation.execution.id, '/a1')
|
|
116
160
|
|
|
@@ -153,6 +197,7 @@ class SessionTest(unittest.TestCase):
|
|
|
153
197
|
|
|
154
198
|
self.assertIsInstance(session.root.action, action_lib.RootAction)
|
|
155
199
|
self.assertIs(session.current_action, session.root)
|
|
200
|
+
self.assertIs(session.metadata, session.root.metadata)
|
|
156
201
|
|
|
157
202
|
#
|
|
158
203
|
# Inspecting the root invocation.
|
|
@@ -175,20 +220,25 @@ class SessionTest(unittest.TestCase):
|
|
|
175
220
|
)
|
|
176
221
|
|
|
177
222
|
# The root space should have one action (foo), no queries, and no logs.
|
|
223
|
+
self.assertEqual(len(root.execution_units), 1)
|
|
178
224
|
self.assertEqual(len(root.actions), 1)
|
|
179
225
|
self.assertEqual(len(root.queries), 0)
|
|
180
226
|
self.assertEqual(len(root.logs), 0)
|
|
181
|
-
#
|
|
182
|
-
self.assertEqual(len(session.all_queries),
|
|
183
|
-
self.assertEqual(len(root.all_queries),
|
|
184
|
-
#
|
|
185
|
-
self.assertEqual(len(session.all_actions),
|
|
186
|
-
self.assertEqual(
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
227
|
+
# 2 queries from Bar, 2 from Foo and 2 * 3 from parallel executions.
|
|
228
|
+
self.assertEqual(len(session.all_queries), 10)
|
|
229
|
+
self.assertEqual(len(root.all_queries), 10)
|
|
230
|
+
# 6 actions: Foo and 2 Bar, and 3 Bar from parallel executions.
|
|
231
|
+
self.assertEqual(len(session.all_actions), 6)
|
|
232
|
+
self.assertEqual(
|
|
233
|
+
[str(a.position) for a in session.all_actions],
|
|
234
|
+
['1', '1.1', '1.2.1.1', '1.2.2.1', '1.2.3.1', '1.3']
|
|
235
|
+
)
|
|
236
|
+
self.assertEqual(len(root.all_actions), 6)
|
|
237
|
+
# 2 logs from Bar, 1 from Foo and 3 from Bar in parallel executions.
|
|
238
|
+
self.assertEqual(len(session.all_logs), 6)
|
|
239
|
+
self.assertEqual(len(root.all_logs), 6)
|
|
190
240
|
self.assertIs(session.usage_summary, root.usage_summary)
|
|
191
|
-
self.assertEqual(root.usage_summary.total.num_requests,
|
|
241
|
+
self.assertEqual(root.usage_summary.total.num_requests, 10)
|
|
192
242
|
|
|
193
243
|
# Inspecting the top-level action (Foo)
|
|
194
244
|
foo_invocation = root.execution[0]
|
|
@@ -200,15 +250,19 @@ class SessionTest(unittest.TestCase):
|
|
|
200
250
|
|
|
201
251
|
# Prepare phase.
|
|
202
252
|
prepare_phase = foo_invocation.execution[0]
|
|
253
|
+
self.assertIsNone(prepare_phase.position)
|
|
203
254
|
self.assertIsInstance(prepare_phase, action_lib.ExecutionTrace)
|
|
204
255
|
self.assertEqual(prepare_phase.id, 'agent@1:/a1/prepare')
|
|
205
|
-
self.assertEqual(len(prepare_phase.items),
|
|
256
|
+
self.assertEqual(len(prepare_phase.items), 3)
|
|
206
257
|
self.assertTrue(prepare_phase.has_started)
|
|
207
258
|
self.assertTrue(prepare_phase.has_stopped)
|
|
208
|
-
self.assertEqual(prepare_phase.usage_summary.total.num_requests,
|
|
259
|
+
self.assertEqual(prepare_phase.usage_summary.total.num_requests, 2)
|
|
209
260
|
self.assertIsInstance(prepare_phase.items[0], lf.logging.LogEntry)
|
|
210
|
-
self.assertIsInstance(prepare_phase.items[1],
|
|
211
|
-
self.
|
|
261
|
+
self.assertIsInstance(prepare_phase.items[1], action_lib.ActionInvocation)
|
|
262
|
+
self.assertIs(prepare_phase.items[1].parent_execution_unit, foo_invocation)
|
|
263
|
+
self.assertEqual(prepare_phase.items[1].id, 'agent@1:/a1/prepare/a1')
|
|
264
|
+
self.assertIsInstance(prepare_phase.items[2], lf_structured.QueryInvocation)
|
|
265
|
+
self.assertEqual(prepare_phase.items[2].id, 'agent@1:/a1/prepare/q1')
|
|
212
266
|
|
|
213
267
|
# Tracked queries.
|
|
214
268
|
query_invocation = foo_invocation.execution[1]
|
|
@@ -230,20 +284,44 @@ class SessionTest(unittest.TestCase):
|
|
|
230
284
|
|
|
231
285
|
# Tracked parallel executions.
|
|
232
286
|
parallel_executions = foo_invocation.execution[2]
|
|
287
|
+
# root (0) > foo (0) > parallel executions (1)
|
|
288
|
+
self.assertEqual(parallel_executions.position, (0, 0, 1))
|
|
233
289
|
self.assertEqual(parallel_executions.id, 'agent@1:/a1/p1')
|
|
234
290
|
self.assertIsInstance(parallel_executions, action_lib.ParallelExecutions)
|
|
291
|
+
self.assertIs(
|
|
292
|
+
parallel_executions.all_actions[0].parent_execution_unit,
|
|
293
|
+
parallel_executions
|
|
294
|
+
)
|
|
295
|
+
self.assertIs(
|
|
296
|
+
parallel_executions.all_actions[0].parent_action,
|
|
297
|
+
foo_invocation
|
|
298
|
+
)
|
|
235
299
|
self.assertEqual(len(parallel_executions), 3)
|
|
236
300
|
self.assertEqual(parallel_executions[0].id, 'agent@1:/a1/p1/b1')
|
|
237
301
|
self.assertEqual(parallel_executions[1].id, 'agent@1:/a1/p1/b2')
|
|
238
302
|
self.assertEqual(parallel_executions[2].id, 'agent@1:/a1/p1/b3')
|
|
303
|
+
self.assertEqual(len(parallel_executions[0].execution_units), 1)
|
|
304
|
+
self.assertEqual(len(parallel_executions[1].execution_units), 1)
|
|
305
|
+
self.assertEqual(len(parallel_executions[2].execution_units), 1)
|
|
239
306
|
self.assertEqual(len(parallel_executions[0].queries), 1)
|
|
307
|
+
self.assertEqual(len(parallel_executions[0].all_queries), 2)
|
|
240
308
|
self.assertEqual(len(parallel_executions[1].queries), 1)
|
|
309
|
+
self.assertEqual(len(parallel_executions[1].all_queries), 2)
|
|
241
310
|
self.assertEqual(len(parallel_executions[2].queries), 1)
|
|
311
|
+
self.assertEqual(len(parallel_executions[2].all_queries), 2)
|
|
312
|
+
self.assertEqual(len(parallel_executions.execution_units), 0)
|
|
313
|
+
self.assertEqual(len(parallel_executions.actions), 0)
|
|
314
|
+
self.assertEqual(len(parallel_executions.queries), 0)
|
|
315
|
+
self.assertEqual(len(parallel_executions.logs), 0)
|
|
316
|
+
self.assertEqual(len(parallel_executions.all_actions), 3)
|
|
317
|
+
self.assertEqual(len(parallel_executions.all_queries), 6)
|
|
318
|
+
self.assertEqual(len(parallel_executions.all_logs), 3)
|
|
242
319
|
|
|
243
320
|
# Invocation to Bar.
|
|
244
321
|
bar_invocation = foo_invocation.execution[3]
|
|
245
322
|
self.assertIs(bar_invocation.parent_action, foo_invocation)
|
|
246
|
-
self.
|
|
323
|
+
self.assertIs(bar_invocation.parent_execution_unit, foo_invocation)
|
|
324
|
+
self.assertEqual(bar_invocation.id, 'agent@1:/a1/a5')
|
|
247
325
|
self.assertIsInstance(bar_invocation, action_lib.ActionInvocation)
|
|
248
326
|
self.assertIsInstance(bar_invocation.action, Bar)
|
|
249
327
|
self.assertEqual(bar_invocation.result, 2)
|
|
@@ -497,26 +575,51 @@ class SessionTest(unittest.TestCase):
|
|
|
497
575
|
super()._on_bound()
|
|
498
576
|
self.progresses = []
|
|
499
577
|
|
|
578
|
+
def on_session_start(self, session):
|
|
579
|
+
session.add_metadata(progresses=pg.Ref(self.progresses))
|
|
580
|
+
|
|
500
581
|
def on_action_progress(self, session, action, title, **kwargs):
|
|
501
582
|
self.progresses.append((action.id, title))
|
|
502
583
|
|
|
503
584
|
handler = MyActionHandler()
|
|
585
|
+
self.assertIs(handler.get(MyActionHandler), handler)
|
|
586
|
+
self.assertIsNone(handler.get(action_lib.SessionLogging))
|
|
587
|
+
|
|
588
|
+
handler_chain = action_lib.SessionEventHandlerChain(
|
|
589
|
+
handlers=[handler, action_lib.SessionLogging()]
|
|
590
|
+
)
|
|
591
|
+
self.assertIs(handler_chain.get(MyActionHandler), handler)
|
|
592
|
+
self.assertIs(
|
|
593
|
+
handler_chain.get(action_lib.SessionLogging),
|
|
594
|
+
handler_chain.handlers[1]
|
|
595
|
+
)
|
|
596
|
+
|
|
504
597
|
session = action_lib.Session(
|
|
505
598
|
id='agent@1',
|
|
506
|
-
event_handler=
|
|
507
|
-
handlers=[handler, action_lib.SessionLogging()]
|
|
508
|
-
)
|
|
599
|
+
event_handler=handler_chain
|
|
509
600
|
)
|
|
510
601
|
bar = Bar()
|
|
511
602
|
with session:
|
|
512
603
|
bar(session, lm=fake.StaticResponse('lm response'))
|
|
513
604
|
session.update_progress('Trajectory completed')
|
|
514
605
|
|
|
606
|
+
self.assertIs(session.metadata['progresses'], handler.progresses)
|
|
515
607
|
self.assertEqual(handler.progresses, [
|
|
516
608
|
('agent@1:/a1', 'Query completed'),
|
|
517
609
|
('agent@1:', 'Trajectory completed'),
|
|
518
610
|
])
|
|
519
611
|
|
|
612
|
+
def test_clone(self):
|
|
613
|
+
event_handler = action_lib.SessionLogging()
|
|
614
|
+
session = action_lib.Session(event_handler=event_handler)
|
|
615
|
+
other = session.clone()
|
|
616
|
+
self.assertIsNot(session, other)
|
|
617
|
+
self.assertIs(other.event_handler, event_handler)
|
|
618
|
+
|
|
619
|
+
other = session.clone(deep=True)
|
|
620
|
+
self.assertIsNot(session, other)
|
|
621
|
+
self.assertIsNot(other.event_handler, session.event_handler)
|
|
622
|
+
|
|
520
623
|
def test_log(self):
|
|
521
624
|
session = action_lib.Session()
|
|
522
625
|
session.debug('hi', x=1, y=2)
|
|
@@ -530,6 +633,31 @@ class SessionTest(unittest.TestCase):
|
|
|
530
633
|
self.assertIn('agent@', session.id)
|
|
531
634
|
self.assertIsInstance(session.as_message(), lf.AIMessage)
|
|
532
635
|
|
|
636
|
+
def test_query_with_track_if(self):
|
|
637
|
+
lm = fake.StaticResponse('lm response')
|
|
638
|
+
session = action_lib.Session()
|
|
639
|
+
|
|
640
|
+
# Render session to trigger javascript updates to the HTML when
|
|
641
|
+
# operating on the session.
|
|
642
|
+
_ = session.to_html()
|
|
643
|
+
with session:
|
|
644
|
+
# This query will succeed.
|
|
645
|
+
session.query(
|
|
646
|
+
'prompt1',
|
|
647
|
+
schema=None,
|
|
648
|
+
lm=lm,
|
|
649
|
+
track_if=lambda q: not q.has_error,
|
|
650
|
+
default=None)
|
|
651
|
+
# This query will fail during parsing.
|
|
652
|
+
session.query(
|
|
653
|
+
'prompt2',
|
|
654
|
+
schema=int,
|
|
655
|
+
lm=lm,
|
|
656
|
+
track_if=lambda q: not q.has_error,
|
|
657
|
+
default=None)
|
|
658
|
+
self.assertEqual(len(session.root.queries), 1)
|
|
659
|
+
self.assertIsNone(session.root.queries[0].error)
|
|
660
|
+
|
|
533
661
|
|
|
534
662
|
if __name__ == '__main__':
|
|
535
663
|
unittest.main()
|
langfun/core/async_support.py
CHANGED
|
@@ -11,7 +11,7 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
"""
|
|
14
|
+
"""Utilities for asynchronous programming in Langfun."""
|
|
15
15
|
|
|
16
16
|
import asyncio
|
|
17
17
|
import contextlib
|
|
@@ -23,7 +23,20 @@ import pyglove as pg
|
|
|
23
23
|
async def invoke_async(
|
|
24
24
|
sync_callable: Callable[..., Any], *args, **kwargs
|
|
25
25
|
) -> Any:
|
|
26
|
-
"""Invokes a callable asynchronously
|
|
26
|
+
"""Invokes a sync callable asynchronously in a separate thread.
|
|
27
|
+
|
|
28
|
+
This is useful for wrapping a sync function into an async function,
|
|
29
|
+
allowing multiple calls of the sync function to run concurrently.
|
|
30
|
+
`lf.context` will be propagated to the thread that runs the sync callable.
|
|
31
|
+
|
|
32
|
+
Args:
|
|
33
|
+
sync_callable: The sync callable to invoke.
|
|
34
|
+
*args: Positional arguments to pass to the callable.
|
|
35
|
+
**kwargs: Keyword arguments to pass to the callable.
|
|
36
|
+
|
|
37
|
+
Returns:
|
|
38
|
+
An awaitable that resolves to the return value of the sync_callable.
|
|
39
|
+
"""
|
|
27
40
|
return await asyncio.to_thread(
|
|
28
41
|
# Enable `lf.context` manager for async calls.
|
|
29
42
|
pg.with_contextual_override(sync_callable), *args, **kwargs
|
|
@@ -35,7 +48,23 @@ def invoke_sync(
|
|
|
35
48
|
*args,
|
|
36
49
|
**kwargs
|
|
37
50
|
) -> Any:
|
|
38
|
-
"""Invokes
|
|
51
|
+
"""Invokes an async callable synchronously.
|
|
52
|
+
|
|
53
|
+
This is useful for calling an async function from a sync context.
|
|
54
|
+
If there is an existing async event loop in the current thread managed by
|
|
55
|
+
`lf.sync_context_manager`, it will be used for running the async callable.
|
|
56
|
+
Otherwise, `anyio.run` will be used to run the async callable in a new
|
|
57
|
+
event loop.
|
|
58
|
+
`lf.context` will be propagated to the async callable.
|
|
59
|
+
|
|
60
|
+
Args:
|
|
61
|
+
async_callable: The async callable to invoke.
|
|
62
|
+
*args: Positional arguments to pass to the callable.
|
|
63
|
+
**kwargs: Keyword arguments to pass to the callable.
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
The return value of the async_callable.
|
|
67
|
+
"""
|
|
39
68
|
async def _invoke():
|
|
40
69
|
return await async_callable(*args, **kwargs)
|
|
41
70
|
invoke_fn = pg.with_contextual_override(_invoke)
|
|
@@ -19,13 +19,23 @@ import pyglove as pg
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
class CodeWithError(pg.Object):
|
|
22
|
-
"""Python code with error.
|
|
22
|
+
"""A structure representing Python code along with an execution error.
|
|
23
|
+
|
|
24
|
+
This is used as input to a language model for error correction, providing
|
|
25
|
+
the model with the code that failed and the error message it produced.
|
|
26
|
+
"""
|
|
23
27
|
|
|
24
28
|
code: str
|
|
25
29
|
error: str
|
|
26
30
|
|
|
27
31
|
|
|
28
32
|
class CorrectedCode(pg.Object):
|
|
33
|
+
"""A structure containing corrected Python code.
|
|
34
|
+
|
|
35
|
+
This is used as the output schema when asking a language model to correct
|
|
36
|
+
code, expecting the model to return the fixed code in the `corrected_code`
|
|
37
|
+
field.
|
|
38
|
+
"""
|
|
29
39
|
corrected_code: str
|
|
30
40
|
|
|
31
41
|
|
|
@@ -49,7 +59,7 @@ def run_with_correction(
|
|
|
49
59
|
code: The source code that may or may not be problematic.
|
|
50
60
|
error: An optional initial error for `code` when it's problematic, usually
|
|
51
61
|
caught from elsewhere when it ran. If None, code will be executed once to
|
|
52
|
-
verify if
|
|
62
|
+
verify if it's good and obtain a feedback error message.
|
|
53
63
|
global_vars: A dict of str to value as the global variables that could be
|
|
54
64
|
accessed within the corrected code.
|
|
55
65
|
lm: Language model to be used. If not specified, it will try to use the `lm`
|
|
@@ -57,15 +67,15 @@ def run_with_correction(
|
|
|
57
67
|
max_attempts: Max number of attempts for the correction.
|
|
58
68
|
sandbox: If True, run code in sandbox; If False, run code in current
|
|
59
69
|
process. If None, run in sandbox first, if the output could not be
|
|
60
|
-
serialized and
|
|
70
|
+
serialized and passed to current process, run the code again in current
|
|
61
71
|
process.
|
|
62
72
|
permission: The permission to run the code.
|
|
63
73
|
timeout: The timeout for running the corrected code. If None, there is no
|
|
64
74
|
timeout. Applicable only when sandbox is set to True.
|
|
65
75
|
returns_code: If True, the return value is a tuple of (result, final code).
|
|
66
76
|
Otherwise the return value is the result only.
|
|
67
|
-
returns_stdout: If True, the stdout (a
|
|
68
|
-
outputs_intermediate: If True, intermediate output will be
|
|
77
|
+
returns_stdout: If True, the stdout (a string) will be returned.
|
|
78
|
+
outputs_intermediate: If True, intermediate output will be output as a
|
|
69
79
|
dict, with the last line's value accessible by key '__result__'. Otherwise
|
|
70
80
|
the value of the last line will be returned.
|
|
71
81
|
|
|
@@ -161,7 +171,7 @@ def correct(
|
|
|
161
171
|
code: The source code that may or may not be problematic.
|
|
162
172
|
error: An optional initial error for `code` when it's problematic, usually
|
|
163
173
|
caught from elsewhere when it ran. If None, code will be executed once to
|
|
164
|
-
verify if
|
|
174
|
+
verify if it's good and obtain a feedback error message.
|
|
165
175
|
global_vars: A dict of str to value as the global variables that could be
|
|
166
176
|
accessed within the corrected code.
|
|
167
177
|
lm: Language model to be used. If not specified, it will try to use the `lm`
|
|
@@ -169,7 +179,7 @@ def correct(
|
|
|
169
179
|
max_attempts: Max number of attempts for the correction.
|
|
170
180
|
sandbox: If True, run code in sandbox; If False, run code in current
|
|
171
181
|
process. If None, run in sandbox first, if the output could not be
|
|
172
|
-
serialized and
|
|
182
|
+
serialized and passed to current process, run the code again in current
|
|
173
183
|
process.
|
|
174
184
|
timeout: The timeout for running the corrected code. If None, there is no
|
|
175
185
|
timeout. Applicable only when sandbox is set to True.
|
|
@@ -193,7 +203,7 @@ def correct(
|
|
|
193
203
|
|
|
194
204
|
|
|
195
205
|
def _error_feedback_str(error: Exception) -> str:
|
|
196
|
-
"""Returns the error
|
|
206
|
+
"""Returns the error string for feedback."""
|
|
197
207
|
if isinstance(error, pg.coding.CodeError):
|
|
198
208
|
return pg.decolor(error.format(include_complete_code=False))
|
|
199
209
|
else:
|
|
@@ -201,7 +211,7 @@ def _error_feedback_str(error: Exception) -> str:
|
|
|
201
211
|
|
|
202
212
|
|
|
203
213
|
def _maybe_custom_validate(result: Any) -> Any:
|
|
204
|
-
"""
|
|
214
|
+
"""Applies custom validation through __validate__ method."""
|
|
205
215
|
if isinstance(result, dict) and "__result__" in result:
|
|
206
216
|
r = result["__result__"]
|
|
207
217
|
else:
|
|
@@ -45,17 +45,17 @@ def evaluate(
|
|
|
45
45
|
global_vars: An optional dict as the globals that could be referenced by the
|
|
46
46
|
code.
|
|
47
47
|
permission: Permission for the Python code to run.
|
|
48
|
-
returns_stdout: If True, the stdout (a
|
|
48
|
+
returns_stdout: If True, the stdout (a string) will be returned.
|
|
49
49
|
outputs_intermediate: Applicable when returns_stdout is False. If True,
|
|
50
|
-
intermediate output will be
|
|
51
|
-
value accessible by key '__result__' and the
|
|
50
|
+
intermediate output will be output as a dict, with the last line's
|
|
51
|
+
value accessible by key '__result__' and the stdout accessible by
|
|
52
52
|
key '__stdout__'. Otherwise the value of the last line will be returned.
|
|
53
53
|
|
|
54
54
|
Returns:
|
|
55
55
|
The value of the last line of the code block. Or a dict of variable
|
|
56
56
|
names of all locals to their evaluated values as the output of the code to
|
|
57
57
|
run. The value for the last line can be accessed by key '__result__'. Or the
|
|
58
|
-
stdout as a
|
|
58
|
+
stdout as a string.
|
|
59
59
|
"""
|
|
60
60
|
return pg.coding.evaluate(
|
|
61
61
|
parsing.clean(code),
|
|
@@ -85,28 +85,30 @@ def run(
|
|
|
85
85
|
|
|
86
86
|
Args:
|
|
87
87
|
code: Python code to run.
|
|
88
|
-
global_vars: An optional dict
|
|
88
|
+
global_vars: An optional dict as the globals that could be referenced by the
|
|
89
|
+
code.
|
|
89
90
|
permission: Permission for the Python code to run.
|
|
90
|
-
returns_stdout: If True, the stdout (a
|
|
91
|
+
returns_stdout: If True, the stdout (a string) will be returned.
|
|
91
92
|
outputs_intermediate: Applicable when returns_stdout is False. If True,
|
|
92
|
-
intermediate output will be
|
|
93
|
-
value accessible by key '__result__' and the
|
|
93
|
+
intermediate output will be output as a dict, with the last line's
|
|
94
|
+
value accessible by key '__result__' and the stdout accessible by
|
|
94
95
|
key '__stdout__'. Otherwise the value of the last line will be returned.
|
|
95
96
|
sandbox: If True, run code in sandbox; If False, run code in current
|
|
96
97
|
process. If None, run in sandbox first, if the output could not be
|
|
97
|
-
serialized and
|
|
98
|
+
serialized and passed to current process, run the code again in current
|
|
98
99
|
process.
|
|
99
|
-
timeout: Execution timeout in seconds. If None, wait the code
|
|
100
|
+
timeout: Execution timeout in seconds. If None, wait for the code to
|
|
101
|
+
complete.
|
|
100
102
|
|
|
101
103
|
Returns:
|
|
102
104
|
The value of the last line of the code block. Or a dict of variable
|
|
103
105
|
names of all locals to their evaluated values as the output of the code to
|
|
104
106
|
run. The value for the last line can be accessed by key '__result__'. Or the
|
|
105
|
-
stdout as a
|
|
107
|
+
stdout as a string.
|
|
106
108
|
|
|
107
109
|
Raises:
|
|
108
110
|
TimeoutError: If the execution time exceeds the timeout.
|
|
109
|
-
Exception:
|
|
111
|
+
Exception: Exceptions that are raised from the code.
|
|
110
112
|
"""
|
|
111
113
|
return pg.coding.run(
|
|
112
114
|
parsing.clean(code),
|
|
@@ -22,9 +22,13 @@ import pyglove as pg
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class PythonCode(pg.Object):
|
|
25
|
-
"""
|
|
25
|
+
"""Represents a piece of Python code that can be executed.
|
|
26
26
|
|
|
27
|
-
|
|
27
|
+
When `PythonCode` is instantiated within a `PythonCode.auto_run()` context,
|
|
28
|
+
it automatically executes the code and returns the result of the last
|
|
29
|
+
expression. Otherwise, it acts as a container for the source code, which
|
|
30
|
+
can be executed by calling the instance. The class also supports automatic
|
|
31
|
+
error correction via `lf.coding.run_with_correction` when called.
|
|
28
32
|
"""
|
|
29
33
|
|
|
30
34
|
source: Annotated[
|
|
@@ -56,7 +60,7 @@ class PythonCode(pg.Object):
|
|
|
56
60
|
Otherwise, auto call will be disabled.
|
|
57
61
|
sandbox: If True, run code in sandbox; If False, run code in current
|
|
58
62
|
process. If None, run in sandbox first; if the output could not be
|
|
59
|
-
serialized and
|
|
63
|
+
serialized and passed to current process, run the code again in current
|
|
60
64
|
process. Applicable when `enabled` is set to True.
|
|
61
65
|
timeout: Timeout in seconds. Applicable when both `enabled` and `sandbox`
|
|
62
66
|
are set to True.
|
|
@@ -98,17 +102,17 @@ class PythonCode(pg.Object):
|
|
|
98
102
|
Args:
|
|
99
103
|
sandbox: If True, run code in sandbox; If False, run code in current
|
|
100
104
|
process. If None, run in sandbox first; if the output could not be
|
|
101
|
-
serialized and
|
|
105
|
+
serialized and passed to current process, run the code again in current
|
|
102
106
|
process.
|
|
103
107
|
timeout: Timeout in seconds. If None, there is no timeout. Applicable when
|
|
104
108
|
sandbox is set to True.
|
|
105
109
|
global_vars: Global variables that could be accessed from the source code.
|
|
106
|
-
returns_stdout: If True, the stdout (a
|
|
110
|
+
returns_stdout: If True, the stdout (a string) will be returned.
|
|
107
111
|
outputs_intermediate: Applicable when returns_stdout is False. If True,
|
|
108
|
-
intermediate output will be
|
|
109
|
-
value accessible by key '__result__' and the
|
|
112
|
+
intermediate output will be output as a dict, with the last line's
|
|
113
|
+
value accessible by key '__result__' and the stdout accessible by
|
|
110
114
|
key '__stdout__'. Otherwise the value of the last line will be returned.
|
|
111
|
-
autofix: Number of attempts to
|
|
115
|
+
autofix: Number of attempts to autofix the generated code. If 0, autofix
|
|
112
116
|
is disabled.
|
|
113
117
|
autofix_lm: Language model to be used. If not specified, it will try to
|
|
114
118
|
use the `lm` under `lf.context`.
|
|
@@ -117,8 +121,8 @@ class PythonCode(pg.Object):
|
|
|
117
121
|
The value of the last expression in the source code. Or a dict of local
|
|
118
122
|
variable names defined in the source code to their values if
|
|
119
123
|
`outputs_intermediate` is set to True. The value for the last line can be
|
|
120
|
-
accessed by key '__result__'. Or the stdout as a
|
|
121
|
-
is set to True.
|
|
124
|
+
accessed by key '__result__'. Or the stdout as a string if
|
|
125
|
+
`returns_stdout` is set to True.
|
|
122
126
|
|
|
123
127
|
Raises:
|
|
124
128
|
TimeoutError: If `sandbox` is True and the timeout has been reached.
|
|
@@ -152,12 +156,12 @@ class PythonCode(pg.Object):
|
|
|
152
156
|
Args:
|
|
153
157
|
sandbox: If True, run code in sandbox; If False, run code in current
|
|
154
158
|
process. If None, run in sandbox first; if the output could not be
|
|
155
|
-
serialized and
|
|
159
|
+
serialized and passed to current process, run the code again in current
|
|
156
160
|
process.
|
|
157
161
|
timeout: Timeout in seconds. If None, there is no timeout. Applicable when
|
|
158
162
|
sandbox is set to True.
|
|
159
163
|
global_vars: Global variables that could be accessed from the source code.
|
|
160
|
-
autofix: Number of attempts to
|
|
164
|
+
autofix: Number of attempts to autofix the generated code. If 0, autofix
|
|
161
165
|
is disabled. Auto-fix is not supported for 'json' protocol.
|
|
162
166
|
autofix_lm: Language model to be used. If not specified, it will try to
|
|
163
167
|
use the `lm` under `lf.context`.
|
|
@@ -182,10 +186,11 @@ class PythonCode(pg.Object):
|
|
|
182
186
|
|
|
183
187
|
|
|
184
188
|
class PythonFunction(pg.Object):
|
|
185
|
-
"""
|
|
189
|
+
"""Represents a Python function defined by source code.
|
|
186
190
|
|
|
187
|
-
|
|
188
|
-
|
|
191
|
+
This class takes Python source code that defines a function and makes it
|
|
192
|
+
callable. The source code is evaluated to create a function object, which
|
|
193
|
+
can then be invoked like a regular Python function.
|
|
189
194
|
"""
|
|
190
195
|
|
|
191
196
|
name: str
|
|
@@ -214,7 +219,7 @@ class PythonFunction(pg.Object):
|
|
|
214
219
|
*args: Positional arguments that will be passed to the implementation.
|
|
215
220
|
sandbox: If True, run code in sandbox; If False, run code in current
|
|
216
221
|
process. If None, run in sandbox first; if the output could not be
|
|
217
|
-
serialized and
|
|
222
|
+
serialized and passed to current process, run the code again in current
|
|
218
223
|
process.
|
|
219
224
|
timeout: Timeout in seconds. If None, there is no timeout. Applicable when
|
|
220
225
|
sandbox is set to True.
|
|
@@ -23,7 +23,14 @@ import pyglove as pg
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
class SandboxOutput(pg.Object):
|
|
26
|
-
"""
|
|
26
|
+
"""A structure containing the output from a sandbox execution.
|
|
27
|
+
|
|
28
|
+
Attributes:
|
|
29
|
+
stdout: The standard output captured during execution.
|
|
30
|
+
stderr: The standard error captured during execution.
|
|
31
|
+
output_files: A dictionary of file names to their byte content for files
|
|
32
|
+
generated during execution.
|
|
33
|
+
"""
|
|
27
34
|
|
|
28
35
|
stdout: Annotated[
|
|
29
36
|
str,
|
|
@@ -42,7 +49,14 @@ class SandboxOutput(pg.Object):
|
|
|
42
49
|
|
|
43
50
|
|
|
44
51
|
class BaseSandbox(pg.Object):
|
|
45
|
-
"""
|
|
52
|
+
"""Base class for Python code sandboxing.
|
|
53
|
+
|
|
54
|
+
A sandbox provides an isolated environment for executing Python code,
|
|
55
|
+
typically with restrictions on file system access, network calls, or other
|
|
56
|
+
potentially harmful operations. This base class defines the interface for
|
|
57
|
+
sandboxes, including methods for running code (`run`), uploading files
|
|
58
|
+
(`upload`), and managing the sandbox lifecycle (`setup`, `cleanup`).
|
|
59
|
+
"""
|
|
46
60
|
|
|
47
61
|
def _on_bound(self):
|
|
48
62
|
super()._on_bound()
|
|
@@ -111,7 +125,13 @@ class BaseSandbox(pg.Object):
|
|
|
111
125
|
|
|
112
126
|
|
|
113
127
|
class MultiProcessingSandbox(BaseSandbox):
|
|
114
|
-
"""
|
|
128
|
+
"""A sandbox implementation using Python's `multiprocessing`.
|
|
129
|
+
|
|
130
|
+
This sandbox executes code in a separate process, providing isolation from
|
|
131
|
+
the main process. It uses a temporary directory for file operations,
|
|
132
|
+
which is cleaned up when the sandbox is closed. It relies on
|
|
133
|
+
`pg.coding.run` with `sandbox=True` for execution.
|
|
134
|
+
"""
|
|
115
135
|
|
|
116
136
|
def _on_bound(self):
|
|
117
137
|
super()._on_bound()
|