langfun 0.1.2.dev202509120804__py3-none-any.whl → 0.1.2.dev202512150805__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. langfun/__init__.py +1 -1
  2. langfun/core/__init__.py +7 -1
  3. langfun/core/agentic/__init__.py +8 -1
  4. langfun/core/agentic/action.py +740 -112
  5. langfun/core/agentic/action_eval.py +9 -2
  6. langfun/core/agentic/action_test.py +189 -24
  7. langfun/core/async_support.py +104 -5
  8. langfun/core/async_support_test.py +23 -0
  9. langfun/core/coding/python/correction.py +19 -9
  10. langfun/core/coding/python/execution.py +14 -12
  11. langfun/core/coding/python/generation.py +21 -16
  12. langfun/core/coding/python/sandboxing.py +23 -3
  13. langfun/core/component.py +42 -3
  14. langfun/core/concurrent.py +70 -6
  15. langfun/core/concurrent_test.py +9 -2
  16. langfun/core/console.py +1 -1
  17. langfun/core/data/conversion/anthropic.py +12 -3
  18. langfun/core/data/conversion/anthropic_test.py +8 -6
  19. langfun/core/data/conversion/gemini.py +11 -2
  20. langfun/core/data/conversion/gemini_test.py +48 -9
  21. langfun/core/data/conversion/openai.py +145 -31
  22. langfun/core/data/conversion/openai_test.py +161 -17
  23. langfun/core/eval/base.py +48 -44
  24. langfun/core/eval/base_test.py +5 -5
  25. langfun/core/eval/matching.py +5 -2
  26. langfun/core/eval/patching.py +3 -3
  27. langfun/core/eval/scoring.py +4 -3
  28. langfun/core/eval/v2/__init__.py +3 -0
  29. langfun/core/eval/v2/checkpointing.py +148 -46
  30. langfun/core/eval/v2/checkpointing_test.py +9 -2
  31. langfun/core/eval/v2/config_saver.py +37 -0
  32. langfun/core/eval/v2/config_saver_test.py +36 -0
  33. langfun/core/eval/v2/eval_test_helper.py +104 -3
  34. langfun/core/eval/v2/evaluation.py +102 -19
  35. langfun/core/eval/v2/evaluation_test.py +9 -3
  36. langfun/core/eval/v2/example.py +50 -40
  37. langfun/core/eval/v2/example_test.py +16 -8
  38. langfun/core/eval/v2/experiment.py +95 -20
  39. langfun/core/eval/v2/experiment_test.py +19 -0
  40. langfun/core/eval/v2/metric_values.py +31 -3
  41. langfun/core/eval/v2/metric_values_test.py +32 -0
  42. langfun/core/eval/v2/metrics.py +157 -44
  43. langfun/core/eval/v2/metrics_test.py +39 -18
  44. langfun/core/eval/v2/progress.py +31 -1
  45. langfun/core/eval/v2/progress_test.py +27 -0
  46. langfun/core/eval/v2/progress_tracking.py +13 -5
  47. langfun/core/eval/v2/progress_tracking_test.py +9 -1
  48. langfun/core/eval/v2/reporting.py +88 -71
  49. langfun/core/eval/v2/reporting_test.py +24 -6
  50. langfun/core/eval/v2/runners/__init__.py +30 -0
  51. langfun/core/eval/v2/{runners.py → runners/base.py} +73 -180
  52. langfun/core/eval/v2/runners/beam.py +354 -0
  53. langfun/core/eval/v2/runners/beam_test.py +153 -0
  54. langfun/core/eval/v2/runners/ckpt_monitor.py +350 -0
  55. langfun/core/eval/v2/runners/ckpt_monitor_test.py +213 -0
  56. langfun/core/eval/v2/runners/debug.py +40 -0
  57. langfun/core/eval/v2/runners/debug_test.py +76 -0
  58. langfun/core/eval/v2/runners/parallel.py +243 -0
  59. langfun/core/eval/v2/runners/parallel_test.py +182 -0
  60. langfun/core/eval/v2/runners/sequential.py +47 -0
  61. langfun/core/eval/v2/runners/sequential_test.py +169 -0
  62. langfun/core/langfunc.py +45 -130
  63. langfun/core/langfunc_test.py +7 -5
  64. langfun/core/language_model.py +189 -36
  65. langfun/core/language_model_test.py +54 -3
  66. langfun/core/llms/__init__.py +14 -1
  67. langfun/core/llms/anthropic.py +157 -2
  68. langfun/core/llms/azure_openai.py +29 -17
  69. langfun/core/llms/cache/base.py +25 -3
  70. langfun/core/llms/cache/in_memory.py +48 -7
  71. langfun/core/llms/cache/in_memory_test.py +14 -4
  72. langfun/core/llms/compositional.py +25 -1
  73. langfun/core/llms/deepseek.py +30 -2
  74. langfun/core/llms/fake.py +32 -1
  75. langfun/core/llms/gemini.py +90 -12
  76. langfun/core/llms/gemini_test.py +110 -0
  77. langfun/core/llms/google_genai.py +52 -1
  78. langfun/core/llms/groq.py +28 -3
  79. langfun/core/llms/llama_cpp.py +23 -4
  80. langfun/core/llms/openai.py +120 -3
  81. langfun/core/llms/openai_compatible.py +148 -27
  82. langfun/core/llms/openai_compatible_test.py +207 -20
  83. langfun/core/llms/openai_test.py +0 -2
  84. langfun/core/llms/rest.py +16 -1
  85. langfun/core/llms/vertexai.py +78 -8
  86. langfun/core/logging.py +1 -1
  87. langfun/core/mcp/__init__.py +10 -0
  88. langfun/core/mcp/client.py +177 -0
  89. langfun/core/mcp/client_test.py +71 -0
  90. langfun/core/mcp/session.py +241 -0
  91. langfun/core/mcp/session_test.py +54 -0
  92. langfun/core/mcp/testing/simple_mcp_client.py +33 -0
  93. langfun/core/mcp/testing/simple_mcp_server.py +33 -0
  94. langfun/core/mcp/tool.py +254 -0
  95. langfun/core/mcp/tool_test.py +197 -0
  96. langfun/core/memory.py +1 -0
  97. langfun/core/message.py +160 -55
  98. langfun/core/message_test.py +65 -81
  99. langfun/core/modalities/__init__.py +8 -0
  100. langfun/core/modalities/audio.py +21 -1
  101. langfun/core/modalities/image.py +73 -3
  102. langfun/core/modalities/image_test.py +116 -0
  103. langfun/core/modalities/mime.py +78 -4
  104. langfun/core/modalities/mime_test.py +59 -0
  105. langfun/core/modalities/pdf.py +19 -1
  106. langfun/core/modalities/video.py +21 -1
  107. langfun/core/modality.py +167 -29
  108. langfun/core/modality_test.py +42 -12
  109. langfun/core/natural_language.py +1 -1
  110. langfun/core/sampling.py +4 -4
  111. langfun/core/sampling_test.py +20 -4
  112. langfun/core/structured/__init__.py +2 -24
  113. langfun/core/structured/completion.py +34 -44
  114. langfun/core/structured/completion_test.py +23 -43
  115. langfun/core/structured/description.py +54 -50
  116. langfun/core/structured/function_generation.py +29 -12
  117. langfun/core/structured/mapping.py +81 -37
  118. langfun/core/structured/parsing.py +95 -79
  119. langfun/core/structured/parsing_test.py +0 -3
  120. langfun/core/structured/querying.py +230 -154
  121. langfun/core/structured/querying_test.py +69 -33
  122. langfun/core/structured/schema/__init__.py +49 -0
  123. langfun/core/structured/schema/base.py +664 -0
  124. langfun/core/structured/schema/base_test.py +531 -0
  125. langfun/core/structured/schema/json.py +174 -0
  126. langfun/core/structured/schema/json_test.py +121 -0
  127. langfun/core/structured/schema/python.py +316 -0
  128. langfun/core/structured/schema/python_test.py +410 -0
  129. langfun/core/structured/schema_generation.py +33 -14
  130. langfun/core/structured/scoring.py +47 -36
  131. langfun/core/structured/tokenization.py +26 -11
  132. langfun/core/subscription.py +2 -2
  133. langfun/core/template.py +175 -50
  134. langfun/core/template_test.py +123 -17
  135. langfun/env/__init__.py +43 -0
  136. langfun/env/base_environment.py +827 -0
  137. langfun/env/base_environment_test.py +473 -0
  138. langfun/env/base_feature.py +304 -0
  139. langfun/env/base_feature_test.py +228 -0
  140. langfun/env/base_sandbox.py +842 -0
  141. langfun/env/base_sandbox_test.py +1235 -0
  142. langfun/env/event_handlers/__init__.py +14 -0
  143. langfun/env/event_handlers/chain.py +233 -0
  144. langfun/env/event_handlers/chain_test.py +253 -0
  145. langfun/env/event_handlers/event_logger.py +472 -0
  146. langfun/env/event_handlers/event_logger_test.py +304 -0
  147. langfun/env/event_handlers/metric_writer.py +726 -0
  148. langfun/env/event_handlers/metric_writer_test.py +214 -0
  149. langfun/env/interface.py +1640 -0
  150. langfun/env/interface_test.py +153 -0
  151. langfun/env/load_balancers.py +59 -0
  152. langfun/env/load_balancers_test.py +141 -0
  153. langfun/env/test_utils.py +507 -0
  154. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/METADATA +7 -3
  155. langfun-0.1.2.dev202512150805.dist-info/RECORD +217 -0
  156. langfun/core/eval/v2/runners_test.py +0 -343
  157. langfun/core/structured/schema.py +0 -987
  158. langfun/core/structured/schema_test.py +0 -982
  159. langfun-0.1.2.dev202509120804.dist-info/RECORD +0 -172
  160. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/WHEEL +0 -0
  161. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/licenses/LICENSE +0 -0
  162. {langfun-0.1.2.dev202509120804.dist-info → langfun-0.1.2.dev202512150805.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,827 @@
1
+ # Copyright 2025 The Langfun Authors
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Common base class for sandbox-based environments.
15
+
16
+ This module provides `BaseEnvironment`, a common base class for sandbox-based
17
+ environments that handles pooling, load balancing, and maintenance.
18
+
19
+ Note that:
20
+ - Environments do not have to inherit from this class, especially if features
21
+ like pooling or load balancing are not needed.
22
+ - `BaseEnvironment` is coupled with `BaseSandbox`.
23
+ """
24
+
25
+ import abc
26
+ import collections
27
+ import functools
28
+ import random
29
+ import re
30
+ import threading
31
+ import time
32
+ from typing import Annotated, Any
33
+ import uuid
34
+
35
+ import langfun.core as lf
36
+ from langfun.env import base_sandbox
37
+ from langfun.env import interface
38
+ from langfun.env import load_balancers
39
+ import pyglove as pg
40
+
41
+
42
class BaseEnvironment(interface.Environment):
  """Common base for sandbox-based environments.

  Declares the configuration shared by sandbox-based environments: the images
  served, sandbox pool sizing, load balancing, sandbox keepalive, outage
  handling, housekeeping cadence and event reporting.
  """

  image_ids: Annotated[
      list[str],
      (
          'A list of static image IDs served by the environment. '
      )
  ]

  supports_dynamic_image_loading: Annotated[
      bool,
      (
          'Whether the environment supports dynamic loading of images which is '
          'not included in the `image_ids`. `image_ids` could coexist with '
          'dynamic image loading, which allows users to specify an image id '
          'that is not included in the `image_ids`.'
      )
  ] = False

  root_dir: Annotated[
      str | None,
      (
          'The root directory for the environment for writing output files. '
          'If None, no output files will be allowed for the sandboxes.'
      )
  ] = None

  pool_size: Annotated[
      int | tuple[int, int] | dict[str, int | tuple[int, int]],
      (
          'The (min_size, max_size) of the sandbox pool. If an integer, it '
          'will be used as both min and max size. If 0, all sandboxes will be '
          'created on demand and shutdown when user session ends. If a dict, '
          'users could configure the pool size based on image IDs. The keys '
          'are regular expressions for image IDs, and the values are '
          '(min_size, max_size) tuples. For dynamic image IDs, min_size will '
          'be ignored while max_size will be honored.'
      )
  ] = (0, 256)

  load_balancer: Annotated[
      load_balancers.LoadBalancer,
      (
          'The load balancer for the environment to acquire sandboxes.'
      )
  ] = load_balancers.RoundRobin()

  sandbox_keepalive_interval: Annotated[
      float | None,
      (
          'The interval in seconds to send keepalive pings to sandboxes. '
          'If None, sandbox keepalive is disabled. Please note that sandbox '
          'keepalive is different from feature housekeeping. Usually sandbox '
          'keepalive and feature housekeeping are different operations.'
      )
  ] = None

  proactive_session_setup: Annotated[
      bool,
      (
          'If True, all sandboxes will perform setup work before a user '
          'session is started. This is useful for features that need to '
          'perform heavy setup work, which could block the user thread for a '
          'long time.'
      )
  ] = True

  event_handler: Annotated[
      interface.EventHandler,
      (
          'User handler for the environment events. '
          'By default, the no-op event handler is used.'
      )
  ] = interface.EventHandler()

  outage_grace_period: Annotated[
      float,
      (
          'The grace period in seconds before the environment is treated '
          'as out of service. When calling `environment.sandbox()`, '
          'wait until the grace period has passed before raising an error.'
      )
  ] = 3600.0

  outage_retry_interval: Annotated[
      float,
      (
          'The retry interval in seconds for environment outage. '
          'When calling `environment.sandbox()`, retry after the interval '
          'if the environment is out of service.'
      )
  ] = 10.0

  housekeep_interval: Annotated[
      float,
      (
          'The interval in seconds for environment housekeeping. It recycles '
          'the dead sandboxes in the pool. This interval is the minimal time '
          'to detect outage while there is no request to obtain new sandboxes. '
          'This is applicable only when the environment enables pooling.'
      )
  ] = 10.0

  pool_operation_max_parallelism: Annotated[
      int,
      (
          'The maximum number of threads for bringing up or shutting down '
          'sandboxes in the pool.'
      )
  ] = 256

  random_seed: Annotated[
      int | None,
      (
          'The random seed for generating session IDs with reproducibility. '
          'If None, no seed will be used.'
      )
  ] = None
166
+
167
def _on_bound(self) -> None:
  """Resets the runtime state and validates the configuration.

  Raises:
    ValueError: If a sandbox-based feature is applicable to none of the
      static `image_ids` (see `_check_feature_requirements`).
  """
  super()._on_bound()

  self._status = self.Status.CREATED
  self._start_time = None

  # Pool of sandboxes keyed by image ID; slots are appended or replaced.
  self._sandbox_pool: dict[str, list[base_sandbox.BaseSandbox]] = (
      collections.defaultdict(list)
  )
  self._next_sandbox_id: dict[str, int] = collections.defaultdict(int)
  # Use the module-level generator unless a seed asks for reproducibility.
  self._random = (
      random if self.random_seed is None else random.Random(self.random_seed)
  )
  self._housekeep_thread = None
  # Initialized here (and reset by `_start`) so that `housekeep_counter` can
  # be read before the environment is started.
  self._housekeep_counter = 0
  self._offline_start_time = None
  self._non_sandbox_based_features_with_setup_called = set()

  # Check image IDs and feature requirements.
  self._check_image_ids()
  self._check_feature_requirements()
187
+
188
def _check_image_ids(self) -> None:
  """Validates the configured image IDs.

  The base implementation accepts everything; subclasses may override it.
  """
190
+
191
def _check_feature_requirements(self) -> None:
  """Verifies each sandbox-based feature applies to at least one image.

  The check is skipped when dynamic image loading is supported.

  Raises:
    ValueError: If a sandbox-based feature is applicable to none of the
      static `image_ids`.
  """
  if not self.supports_dynamic_image_loading:
    for name, feature in self.features.items():
      if not feature.is_sandbox_based:
        continue
      applicable = False
      for image_id in self.image_ids:
        if feature.is_applicable(image_id):
          applicable = True
          break
      if not applicable:
        raise ValueError(
            f'Feature {name!r} is not applicable to all available images: '
            f'{self.image_ids!r}. '
            f'Applicable images: {feature.applicable_images}.'
        )
205
+
206
+ #
207
+ # Subclasses must implement:
208
+ #
209
+
210
@abc.abstractmethod
def _create_sandbox(
    self,
    image_id: str,
    sandbox_id: str,
    reusable: bool,
    proactive_session_setup: bool,
    keepalive_interval: float | None,
) -> base_sandbox.BaseSandbox:
  """Builds a sandbox for the given image; subclasses must implement this.

  Args:
    image_id: ID of the image the sandbox is created from.
    sandbox_id: Identifier assigned to the new sandbox.
    reusable: True if the sandbox may serve more than one user session.
    proactive_session_setup: True if the sandbox should do its session setup
      work ahead of the user session being started.
    keepalive_interval: Seconds between keepalive pings to the sandbox, or
      None to disable pinging.

  Returns:
    The newly created sandbox.

  Raises:
    interface.EnvironmentError: If the environment fails to create the
      sandbox.
    interface.SandboxStateError: If the sandbox fails to start.
  """
237
+
238
+ def new_session_id(self, feature_hint: str | None = None) -> str:
239
+ """Generates a random session ID."""
240
+ suffix = uuid.UUID(
241
+ bytes=bytes(bytes(self._random.getrandbits(8) for _ in range(16))),
242
+ version=4
243
+ ).hex[:7]
244
+ return f'{feature_hint or "unknown"}-session-{suffix}'
245
+
246
@property
def housekeep_counter(self) -> int:
  """The value of the environment's housekeeping counter."""
  counter = self._housekeep_counter
  return counter
250
+
251
+ #
252
+ # Subclasses can override:
253
+ #
254
+
255
def stats(self) -> dict[str, Any]:
  """Returns the stats of the environment.

  Returns:
    A dict with a single key 'sandbox', mapping each image ID in the pool to
    a dict of sandbox status value -> number of pooled sandboxes in that
    status.
  """
  stats_by_image_id = {}
  for image_id, sandboxes in self._sandbox_pool.items():
    stats_dict = {
        status.value: 0
        for status in interface.Sandbox.Status
    }
    for sandbox in sandboxes:
      # `_start` pre-fills pool slots with None until the sandbox is brought
      # up; such placeholders have no status to count.
      if sandbox is None:
        continue
      stats_dict[sandbox.status.value] += 1
    stats_by_image_id[image_id] = stats_dict
  return {
      'sandbox': stats_by_image_id,
  }
269
+
270
def _start(self) -> None:
  """Sets up features, pre-warms the sandbox pools and starts housekeeping."""
  # Non-sandbox-based features are set up first. Each feature is recorded
  # before its `setup` call, so `_shutdown` tears it down even when `setup`
  # fails part-way.
  setup_called = self._non_sandbox_based_features_with_setup_called
  setup_called.clear()
  for feature in self.non_sandbox_based_features():
    setup_called.add(feature.name)
    feature.setup(sandbox=None)

  # Reserve `min_pool_size` slots for every pooled image; the slots hold None
  # until the corresponding sandbox has been brought up.
  pending_slots = []
  for image_id in self.image_ids:
    if self.enable_pooling(image_id):
      warm_count = self.min_pool_size(image_id)
      pending_slots.extend((image_id, slot) for slot in range(warm_count))
      self._sandbox_pool[image_id] = [None] * warm_count
    else:
      warm_count = 0
    self._next_sandbox_id[image_id] = warm_count

  def _fill_slot(slot_info) -> None:
    image_id, slot = slot_info
    sandbox = self._bring_up_sandbox_with_retry(
        image_id=image_id,
        sandbox_id=f'{slot}:0',
        shutdown_env_upon_outage=False
    )
    self._sandbox_pool[image_id][slot] = sandbox

  if pending_slots:
    # Drain the iterator so that every reserved slot is filled before usage.
    for _ in lf.concurrent_map(
        _fill_slot,
        pending_slots,
        silence_on_errors=None,
        max_workers=min(
            self.pool_operation_max_parallelism,
            len(pending_slots)
        ),
    ):
      pass

  self._housekeep_counter = 0
  self._housekeep_thread = threading.Thread(
      target=self._housekeep_loop, daemon=True
  )
  self._housekeep_thread.start()
317
+
318
def _shutdown(self) -> None:
  """Implementation of shutting down the environment.

  Joins the housekeeping thread (unless called from it), tears down the
  non-sandbox-based features whose setup was attempted, and shuts down all
  pooled sandboxes in parallel.
  """
  # A thread cannot join itself: skip when shutdown is triggered by the
  # housekeeping loop.
  if (self._housekeep_thread is not None
      and threading.current_thread() is not self._housekeep_thread):
    self._housekeep_thread.join()
    self._housekeep_thread = None

  # Teardown all non-sandbox-based features.
  for feature in self.non_sandbox_based_features():
    if feature.name not in self._non_sandbox_based_features_with_setup_called:
      continue
    try:
      feature.teardown()
    except BaseException as e:  # pylint: disable=broad-except
      # Teardown is best-effort: record the failure and continue with the
      # remaining features and sandboxes.
      pg.logging.error(
          '[%s/%s]: Feature teardown failed with error: %s.',
          self.id,
          feature.name,
          e,
      )

  # Shutdown sandbox pools.
  if self._sandbox_pool:
    # Slots may still hold None if startup did not complete.
    sandboxes = [
        sandbox
        for pool in self._sandbox_pool.values()
        for sandbox in pool
        if sandbox is not None
    ]
    self._sandbox_pool = {}

    if sandboxes:
      def _shutdown_sandbox(sandbox: base_sandbox.BaseSandbox) -> None:
        sandbox.shutdown()

      _ = list(
          lf.concurrent_map(
              _shutdown_sandbox,
              sandboxes,
              silence_on_errors=None,
              max_workers=min(
                  self.pool_operation_max_parallelism,
                  len(sandboxes)
              ),
          )
      )
356
+
357
+ #
358
+ # Environment basics.
359
+ #
360
+
361
@property
def sandbox_pool(self) -> dict[str, list[base_sandbox.BaseSandbox]]:
  """The pooled sandboxes, keyed by image ID."""
  pool = self._sandbox_pool
  return pool
365
+
366
+ @functools.cached_property
367
+ def working_dir(self) -> str | None:
368
+ """Returns the working directory for the environment."""
369
+ return self.id.working_dir(self.root_dir)
370
+
371
@property
def status(self) -> interface.Environment.Status:
  """The current status of the environment."""
  current = self._status
  return current
375
+
376
def _set_status(self, status: interface.Environment.Status) -> None:
  """Records `status` as the environment's current status."""
  self._status = status
379
+
380
def enable_pooling(self, image_id: str) -> bool:
  """Returns True if the image's maximum pool size is greater than zero."""
  upper_bound = self.max_pool_size(image_id)
  return upper_bound > 0
383
+
384
def min_pool_size(self, image_id: str) -> int:
  """Returns the lower bound of the sandbox pool size for `image_id`."""
  bounds = self._pool_size(image_id)
  return bounds[0]
387
+
388
def max_pool_size(self, image_id: str) -> int:
  """Returns the upper bound of the sandbox pool size for `image_id`."""
  bounds = self._pool_size(image_id)
  return bounds[1]
391
+
392
def _pool_size(self, image_id: str) -> tuple[int, int]:
  """Resolves the (min_size, max_size) of the sandbox pool for an image.

  For a dict-valued `pool_size`, an exact key match wins; otherwise the first
  key that matches `image_id` as a regular expression (via `re.match`) is
  used, falling back to (0, 256).

  Args:
    image_id: The image ID to resolve the pool size for.

  Returns:
    A (min_size, max_size) tuple. An integer setting is used for both.
  """
  configured = self.pool_size
  if not isinstance(configured, dict):
    resolved = configured
  elif image_id in configured:
    resolved = configured[image_id]
  else:
    # Default pool size is 0 and 256.
    resolved = next(
        (
            size for pattern, size in configured.items()
            if re.match(pattern, image_id)
        ),
        (0, 256),
    )

  if not isinstance(resolved, int):
    assert isinstance(resolved, tuple) and len(resolved) == 2
    return resolved
  return resolved, resolved
413
+
414
+ @property
415
+ def start_time(self) -> float | None:
416
+ """Returns the start time of the environment."""
417
+ return self._start_time
418
+
419
@property
def offline_duration(self) -> float:
  """Seconds elapsed since the offline timer started, or 0.0 if not set."""
  went_offline_at = self._offline_start_time
  return 0.0 if went_offline_at is None else time.time() - went_offline_at
425
+
426
+ #
427
+ # Environment lifecycle.
428
+ #
429
+
430
def start(self) -> None:
  """Starts the environment and marks it ONLINE.

  `on_starting` is fired first and `on_start` once startup finished. If
  anything fails in between, `on_start` receives the error, the environment
  is shut down, and the error is re-raised.

  Raises:
    interface.EnvironmentOutageError: If the environment is out of service.
  """
  assert self._status == self.Status.CREATED, (
      f'Environment {self.id} cannot be started because '
      f'it is in {self._status.value!r} status.'
  )

  self.on_starting()
  began_at = time.time()
  try:
    self._start()
    self._start_time = time.time()
    self._set_status(self.Status.ONLINE)
    self.on_start(duration=time.time() - began_at)
  except BaseException as startup_error:
    elapsed = time.time() - began_at
    self.on_start(duration=elapsed, error=startup_error)
    self.shutdown()
    raise startup_error
452
+
453
def shutdown(self) -> None:
  """Shuts down the environment.

  Does nothing if the environment is already shutting down or offline.
  Otherwise the status moves to SHUTTING_DOWN, `on_shutting_down` fires,
  `_shutdown` runs and `on_shutdown` reports the outcome. An error raised in
  the process is passed to `on_shutdown` and then re-raised.

  NOTE(review): this method does not move the status to OFFLINE itself;
  presumably that happens elsewhere - confirm.
  """
  already_stopping = self._status in (
      self.Status.SHUTTING_DOWN,
      self.Status.OFFLINE,
  )
  if already_stopping:
    return

  self._set_status(self.Status.SHUTTING_DOWN)
  self.on_shutting_down()

  began_at = time.time()
  try:
    self._shutdown()
    self.on_shutdown(duration=time.time() - began_at)
  except BaseException as shutdown_error:  # pylint: disable=broad-except
    elapsed = time.time() - began_at
    self.on_shutdown(duration=elapsed, error=shutdown_error)
    raise shutdown_error
474
+
475
+ #
476
+ # Environment operations.
477
+ #
478
+
479
def acquire(
    self,
    image_id: str | None = None
) -> base_sandbox.BaseSandbox:
  """Acquires a sandbox from the environment.

  Args:
    image_id: ID of the image to run. When None, the first entry of
      `image_ids` is used.

  Returns:
    The acquired sandbox.

  Raises:
    ValueError: If no image ID is given and the environment has no static
      image IDs, or if the image ID is not served and dynamic image loading
      is not supported.
    interface.EnvironmentOutageError: If the environment is offline and the
      grace period has passed.
    interface.EnvironmentOverloadError: If the max pool size is reached and
      the grace period has passed.
  """
  if not self.is_online:
    raise interface.EnvironmentOutageError(
        f'Environment {self.id} is not alive.',
        environment=self,
        offline_duration=self.offline_duration,
    )

  if image_id is None:
    if not self.image_ids:
      raise ValueError(
          f'Environment {self.id} does not have a default image ID. '
          'Please specify the image ID explicitly.'
      )
    return self._acquire(self.image_ids[0])

  if not (image_id in self.image_ids or self.supports_dynamic_image_loading):
    raise ValueError(
        f'Environment {self.id} does not serve image ID {image_id!r}. '
        f'Please use one of the following image IDs: {self.image_ids!r} or '
        f'set `{self.__class__.__name__}.supports_dynamic_image_loading` '
        'to True if dynamic image loading is supported.'
    )
  return self._acquire(image_id)
520
+
521
def _acquire(
    self,
    image_id: str | None = None
) -> base_sandbox.BaseSandbox:
  """Acquires a sandbox for an image ID that has already been validated.

  Without pooling, a fresh sandbox is brought up (with retry) for each call.
  With pooling, the load balancer picks a sandbox from the pool; if it cannot
  (signalled by `IndexError`), the pool is grown until it reaches its maximum
  size, after which the call keeps retrying every second.

  Args:
    image_id: The image ID to acquire a sandbox for.

  Returns:
    The acquired sandbox.

  Raises:
    interface.EnvironmentOverloadError: If the pool is at its maximum size
      and no sandbox could be acquired within `outage_grace_period`.
    interface.EnvironmentOutageError: If sandboxes cannot be brought up and
      the outage grace period has passed.
  """
  if not self.enable_pooling(image_id):
    return self._bring_up_sandbox_with_retry(
        image_id=image_id,
        sandbox_id=str(self._increment_sandbox_id(image_id)),
        set_acquired=True,
    )

  allocation_start_time = time.time()
  sandbox_pool = self._sandbox_pool[image_id]
  while True:
    try:
      # We only append or replace items in the sandbox pool, therefore
      # there is no need to lock the pool.
      return self.load_balancer.acquire(sandbox_pool)
    except IndexError:
      # `>=` rather than `==`: concurrent callers may each append a sandbox,
      # pushing the pool past the maximum. An equality check would then never
      # hold again and the pool would keep growing.
      if len(sandbox_pool) >= self.max_pool_size(image_id):
        if time.time() - allocation_start_time > self.outage_grace_period:
          raise interface.EnvironmentOverloadError(  # pylint: disable=raise-missing-from
              environment=self
          )
        time.sleep(1)
      else:
        try:
          sandbox = self._bring_up_sandbox(
              image_id=image_id,
              sandbox_id=f'{self._increment_sandbox_id(image_id)}:0',
              set_acquired=True,
          )
          # Append is atomic and does not require locking.
          sandbox_pool.append(sandbox)
          return sandbox
        except (
            interface.EnvironmentError, interface.SandboxStateError
        ) as ex:
          self._report_outage_or_wait(ex)
561
+
562
def _bring_up_sandbox(
    self,
    image_id: str,
    sandbox_id: str,
    set_acquired: bool = False,
) -> base_sandbox.BaseSandbox:
  """Creates and starts a new sandbox, tracking the offline timer.

  The offline timer starts when bring-up fails with an environment or sandbox
  state error and no timer is running yet. On any other outcome the timer is
  cleared.

  Args:
    image_id: The image ID to use for the sandbox.
    sandbox_id: The ID of the sandbox to bring up.
    set_acquired: If True, the sandbox will be marked as acquired.

  Returns:
    The started sandbox.

  Raises:
    interface.EnvironmentError: Propagated from sandbox creation or startup.
    interface.SandboxStateError: Propagated from sandbox creation or startup.
  """
  failed_with_env_error = False
  try:
    new_sandbox = self._create_sandbox(
        image_id=image_id,
        sandbox_id=sandbox_id,
        reusable=self.enable_pooling(image_id),
        proactive_session_setup=self.proactive_session_setup,
        keepalive_interval=self.sandbox_keepalive_interval,
    )
    new_sandbox.start()
    if set_acquired:
      new_sandbox.set_acquired()
    return new_sandbox
  except (interface.EnvironmentError, interface.SandboxStateError):
    failed_with_env_error = True
    raise
  finally:
    if not failed_with_env_error:
      self._offline_start_time = None
    elif self._offline_start_time is None:
      self._offline_start_time = time.time()
590
+
591
def _bring_up_sandbox_with_retry(
    self,
    image_id: str,
    sandbox_id: str,
    set_acquired: bool = False,
    shutdown_env_upon_outage: bool = True,
) -> base_sandbox.BaseSandbox:
  """Keeps trying to bring up a sandbox until it succeeds or outage is declared.

  Args:
    image_id: The image ID to use for the sandbox.
    sandbox_id: The ID of the sandbox to bring up.
    set_acquired: If True, the sandbox will be marked as acquired.
    shutdown_env_upon_outage: Whether the environment should be shut down
      once the outage grace period has passed.

  Returns:
    A new sandbox ready to use.

  Raises:
    interface.EnvironmentOutageError: If the environment is offline and the
      grace period has passed.
  """
  while True:
    try:
      sandbox = self._bring_up_sandbox(
          image_id=image_id, sandbox_id=sandbox_id, set_acquired=set_acquired
      )
    except (interface.EnvironmentError, interface.SandboxStateError) as e:
      # Either sleeps before the next attempt or raises the outage error.
      self._report_outage_or_wait(e, shutdown_env_upon_outage)
    else:
      return sandbox
621
+
622
def _increment_sandbox_id(self, image_id: str) -> int:
  """Returns the current sandbox ID for the image, then advances it by one."""
  counters = self._next_sandbox_id
  issued = counters[image_id]
  counters[image_id] += 1
  return issued
627
+
628
def _report_outage_or_wait(
    self,
    error: interface.SandboxStateError,
    shutdown_env_upon_outage: bool = True
):
  """Sleeps for the retry interval, or raises once the grace period is over.

  Args:
    error: The error that triggered this call; it becomes the cause of the
      raised outage error.
    shutdown_env_upon_outage: Whether to shut down the environment before
      raising the outage error.

  Raises:
    interface.EnvironmentOutageError: If the offline duration exceeds
      `outage_grace_period`.
  """
  if self.offline_duration <= self.outage_grace_period:
    time.sleep(self.outage_retry_interval)
    return

  if shutdown_env_upon_outage:
    self.shutdown()
  raise interface.EnvironmentOutageError(
      environment=self,
      offline_duration=self.offline_duration,
  ) from error
642
+
643
+ #
644
+ # Environment maintenance loop.
645
+ #
646
+
647
def _housekeep_loop(self) -> None:
  """Housekeeping loop for the environment.

  Runs until the environment is shutting down or offline. Each round:
    1. Calls `housekeep()` on every non-sandbox-based feature whose
       `housekeep_interval` has elapsed. A failure is reported through
       `on_housekeep`, shuts down the environment and ends the loop.
    2. Replaces pooled sandboxes whose status is OFFLINE.
    3. Reports the round through `on_housekeep`. If no dead sandbox could be
       replaced and the outage grace period has passed, the round is
       reported with an `EnvironmentOutageError` and the environment is shut
       down; otherwise the loop sleeps for `housekeep_interval`.
  """
  def _indices_by_image_id(
      entries: list[tuple[str, int, Any]]
  ) -> dict[str, list[int]]:
    indices_by_image_id = collections.defaultdict(list)
    for image_id, i, _ in entries:
      indices_by_image_id[image_id].append(i)
    return indices_by_image_id

  last_housekeep_time = {
      f.name: time.time() for f in self.non_sandbox_based_features()
  }

  while self._status not in (self.Status.SHUTTING_DOWN, self.Status.OFFLINE):
    housekeep_start_time = time.time()
    feature_housekeep_successes = []
    feature_housekeep_failures = []

    # Housekeeping non-sandbox-based features.
    for feature in self.non_sandbox_based_features():
      if feature.housekeep_interval is None:
        continue
      if (last_housekeep_time[feature.name]
          + feature.housekeep_interval < time.time()):
        try:
          feature.housekeep()
          last_housekeep_time[feature.name] = time.time()
          feature_housekeep_successes.append(feature.name)
        except BaseException as e:  # pylint: disable=broad-except
          pg.logging.error(
              '[%s/%s]: Feature housekeeping failed with error: %s. '
              'Shutting down environment...',
              self.id,
              feature.name,
              e,
          )
          feature_housekeep_failures.append(feature.name)
          self._housekeep_counter += 1
          self.on_housekeep(
              duration=time.time() - housekeep_start_time,
              error=e,
              feature_housekeep_successes=feature_housekeep_successes,
              feature_housekeep_failures=feature_housekeep_failures,
          )
          self.shutdown()
          return

    # Replace dead sandboxes.
    is_online = True
    dead_sandbox_entries = []
    # Iterate over a snapshot: `_acquire` indexes the pool (a defaultdict)
    # from user threads, which can insert a new image ID while this loop is
    # running and would otherwise fail the dict iteration.
    for image_id, sandboxes in list(self._sandbox_pool.items()):
      for i, sandbox in enumerate(sandboxes):
        if sandbox.status == interface.Sandbox.Status.OFFLINE:
          dead_sandbox_entries.append((image_id, i, sandbox))

    replaced_indices_by_image_id = {}

    if dead_sandbox_entries:
      replaced_indices_by_image_id = self._replace_dead_sandboxes(
          dead_sandbox_entries
      )
      # Only when nothing could be replaced is the outage timer consulted.
      if not replaced_indices_by_image_id:
        is_online = self.offline_duration < self.outage_grace_period

    self._housekeep_counter += 1
    duration = time.time() - housekeep_start_time

    kwargs = dict(
        feature_housekeep_successes=feature_housekeep_successes,
        feature_housekeep_failures=feature_housekeep_failures,
        dead_sandboxes=_indices_by_image_id(dead_sandbox_entries),
        replaced_sandboxes=replaced_indices_by_image_id,
        offline_duration=self.offline_duration,
    )
    if is_online:
      self.on_housekeep(duration, **kwargs)
      time.sleep(self.housekeep_interval)
    else:
      self.on_housekeep(
          duration,
          interface.EnvironmentOutageError(
              environment=self, offline_duration=self.offline_duration
          ),
          **kwargs
      )
      # Moves the status to SHUTTING_DOWN, which ends the loop.
      self.shutdown()
734
+
735
def _replace_dead_sandboxes(
    self,
    dead_sandbox_entries: list[tuple[str, int, base_sandbox.BaseSandbox]]
) -> dict[str, list[int]]:
  """Brings up replacements for dead sandboxes, in parallel.

  A replacement keeps the pool slot of the dead sandbox and gets the sandbox
  ID `<slot>:<generation + 1>`, where the generation is parsed from the text
  after the last ':' of the dead sandbox's ID.

  Args:
    dead_sandbox_entries: (image_id, index, sandbox) tuples describing the
      dead sandboxes to replace.

  Returns:
    A dict of image ID to the pool indices that were successfully replaced.
  """
  total = len(dead_sandbox_entries)
  pg.logging.warning(
      '[%s]: %s maintenance: '
      'Replacing %d dead sandbox(es) with new ones...',
      self.id,
      self.__class__.__name__,
      total,
  )

  def _replace(sandbox_entry: tuple[str, int, base_sandbox.BaseSandbox]):
    image_id, slot, dead_sandbox = sandbox_entry
    generation = int(dead_sandbox.id.sandbox_id.rsplit(':', 1)[-1])
    self._sandbox_pool[image_id][slot] = self._bring_up_sandbox(
        image_id=image_id,
        sandbox_id=f'{slot}:{generation + 1}'
    )

  # TODO(daiyip): Consider loosening the condition to allow some dead
  # sandboxes to be replaced successfully.
  replaced_indices_by_image_id = collections.defaultdict(list)
  outcomes = lf.concurrent_map(
      _replace, dead_sandbox_entries,
      max_workers=min(self.pool_operation_max_parallelism, total),
  )
  for (image_id, index, _), _, error in outcomes:
    if error is not None:
      continue
    replaced_indices_by_image_id[image_id].append(index)
  num_replaced = sum(
      len(indices) for indices in replaced_indices_by_image_id.values()
  )

  pg.logging.warning(
      '[%s]: %s maintenance: '
      '%d/%d dead sandbox(es) have been replaced with new ones. (slots=%s)',
      self.id,
      self.__class__.__name__,
      num_replaced,
      total,
      replaced_indices_by_image_id,
  )
  return replaced_indices_by_image_id
790
+
791
+ #
792
+ # Event handlers subclasses can override.
793
+ #
794
+
795
def on_starting(self) -> None:
  """Notifies the event handler that the environment is about to start."""
  handler = self.event_handler
  handler.on_environment_starting(self)
798
+
799
+ def on_start(
800
+ self,
801
+ duration: float, error: BaseException | None = None
802
+ ) -> None:
803
+ """Called when the environment is started."""
804
+ self.event_handler.on_environment_start(self, duration, error)
805
+
806
+ def on_housekeep(
807
+ self,
808
+ duration: float,
809
+ error: BaseException | None = None,
810
+ **kwargs
811
+ ) -> None:
812
+ """Called when the environment finishes a round of housekeeping."""
813
+ self.event_handler.on_environment_housekeep(
814
+ self, self.housekeep_counter, duration, error, **kwargs
815
+ )
816
+
817
def on_shutting_down(self) -> None:
  """Notifies the event handler that shutdown began, with offline duration."""
  handler = self.event_handler
  handler.on_environment_shutting_down(self, self.offline_duration)
820
+
821
+ def on_shutdown(
822
+ self,
823
+ duration: float,
824
+ error: BaseException | None = None) -> None:
825
+ """Called when the environment is shutdown."""
826
+ lifetime = (time.time() - self.start_time) if self.start_time else 0.0
827
+ self.event_handler.on_environment_shutdown(self, duration, lifetime, error)