dstack 0.19.25__py3-none-any.whl → 0.19.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of dstack might be problematic.

Files changed (128)
  1. dstack/_internal/cli/commands/__init__.py +2 -2
  2. dstack/_internal/cli/commands/apply.py +3 -61
  3. dstack/_internal/cli/commands/attach.py +1 -1
  4. dstack/_internal/cli/commands/completion.py +1 -1
  5. dstack/_internal/cli/commands/delete.py +2 -2
  6. dstack/_internal/cli/commands/fleet.py +1 -1
  7. dstack/_internal/cli/commands/gateway.py +2 -2
  8. dstack/_internal/cli/commands/init.py +56 -24
  9. dstack/_internal/cli/commands/logs.py +1 -1
  10. dstack/_internal/cli/commands/metrics.py +1 -1
  11. dstack/_internal/cli/commands/offer.py +45 -7
  12. dstack/_internal/cli/commands/project.py +2 -2
  13. dstack/_internal/cli/commands/secrets.py +2 -2
  14. dstack/_internal/cli/commands/server.py +1 -1
  15. dstack/_internal/cli/commands/stop.py +1 -1
  16. dstack/_internal/cli/commands/volume.py +1 -1
  17. dstack/_internal/cli/main.py +2 -2
  18. dstack/_internal/cli/services/completion.py +2 -2
  19. dstack/_internal/cli/services/configurators/__init__.py +6 -2
  20. dstack/_internal/cli/services/configurators/base.py +6 -7
  21. dstack/_internal/cli/services/configurators/fleet.py +1 -3
  22. dstack/_internal/cli/services/configurators/gateway.py +2 -4
  23. dstack/_internal/cli/services/configurators/run.py +195 -55
  24. dstack/_internal/cli/services/configurators/volume.py +2 -4
  25. dstack/_internal/cli/services/profile.py +1 -1
  26. dstack/_internal/cli/services/repos.py +51 -47
  27. dstack/_internal/core/backends/aws/configurator.py +11 -7
  28. dstack/_internal/core/backends/azure/configurator.py +11 -7
  29. dstack/_internal/core/backends/base/configurator.py +25 -13
  30. dstack/_internal/core/backends/cloudrift/configurator.py +13 -7
  31. dstack/_internal/core/backends/cudo/configurator.py +11 -7
  32. dstack/_internal/core/backends/datacrunch/compute.py +5 -1
  33. dstack/_internal/core/backends/datacrunch/configurator.py +13 -7
  34. dstack/_internal/core/backends/gcp/configurator.py +11 -7
  35. dstack/_internal/core/backends/hotaisle/configurator.py +13 -7
  36. dstack/_internal/core/backends/kubernetes/configurator.py +13 -7
  37. dstack/_internal/core/backends/lambdalabs/configurator.py +11 -7
  38. dstack/_internal/core/backends/nebius/compute.py +1 -1
  39. dstack/_internal/core/backends/nebius/configurator.py +11 -7
  40. dstack/_internal/core/backends/nebius/resources.py +21 -11
  41. dstack/_internal/core/backends/oci/configurator.py +11 -7
  42. dstack/_internal/core/backends/runpod/configurator.py +11 -7
  43. dstack/_internal/core/backends/template/configurator.py.jinja +11 -7
  44. dstack/_internal/core/backends/tensordock/configurator.py +13 -7
  45. dstack/_internal/core/backends/vastai/configurator.py +11 -7
  46. dstack/_internal/core/backends/vultr/configurator.py +11 -4
  47. dstack/_internal/core/compatibility/gpus.py +13 -0
  48. dstack/_internal/core/compatibility/runs.py +1 -0
  49. dstack/_internal/core/models/common.py +3 -3
  50. dstack/_internal/core/models/configurations.py +172 -27
  51. dstack/_internal/core/models/files.py +1 -1
  52. dstack/_internal/core/models/fleets.py +5 -1
  53. dstack/_internal/core/models/profiles.py +41 -11
  54. dstack/_internal/core/models/resources.py +46 -42
  55. dstack/_internal/core/models/runs.py +4 -0
  56. dstack/_internal/core/services/configs/__init__.py +2 -2
  57. dstack/_internal/core/services/profiles.py +2 -2
  58. dstack/_internal/core/services/repos.py +5 -3
  59. dstack/_internal/core/services/ssh/ports.py +1 -1
  60. dstack/_internal/proxy/lib/deps.py +6 -2
  61. dstack/_internal/server/app.py +22 -17
  62. dstack/_internal/server/background/tasks/process_gateways.py +4 -1
  63. dstack/_internal/server/background/tasks/process_instances.py +10 -2
  64. dstack/_internal/server/background/tasks/process_probes.py +1 -1
  65. dstack/_internal/server/background/tasks/process_running_jobs.py +10 -4
  66. dstack/_internal/server/background/tasks/process_runs.py +1 -1
  67. dstack/_internal/server/background/tasks/process_submitted_jobs.py +54 -43
  68. dstack/_internal/server/background/tasks/process_terminating_jobs.py +2 -2
  69. dstack/_internal/server/background/tasks/process_volumes.py +1 -1
  70. dstack/_internal/server/db.py +8 -4
  71. dstack/_internal/server/models.py +1 -0
  72. dstack/_internal/server/routers/gpus.py +1 -6
  73. dstack/_internal/server/schemas/runner.py +10 -0
  74. dstack/_internal/server/services/backends/__init__.py +14 -8
  75. dstack/_internal/server/services/backends/handlers.py +6 -1
  76. dstack/_internal/server/services/docker.py +5 -5
  77. dstack/_internal/server/services/fleets.py +14 -13
  78. dstack/_internal/server/services/gateways/__init__.py +2 -0
  79. dstack/_internal/server/services/gateways/client.py +5 -2
  80. dstack/_internal/server/services/gateways/connection.py +1 -1
  81. dstack/_internal/server/services/gpus.py +50 -49
  82. dstack/_internal/server/services/instances.py +41 -1
  83. dstack/_internal/server/services/jobs/__init__.py +15 -4
  84. dstack/_internal/server/services/jobs/configurators/base.py +7 -11
  85. dstack/_internal/server/services/jobs/configurators/dev.py +5 -0
  86. dstack/_internal/server/services/jobs/configurators/extensions/cursor.py +3 -3
  87. dstack/_internal/server/services/jobs/configurators/extensions/vscode.py +3 -3
  88. dstack/_internal/server/services/jobs/configurators/service.py +1 -0
  89. dstack/_internal/server/services/jobs/configurators/task.py +3 -0
  90. dstack/_internal/server/services/locking.py +5 -5
  91. dstack/_internal/server/services/logging.py +10 -2
  92. dstack/_internal/server/services/logs/__init__.py +8 -6
  93. dstack/_internal/server/services/logs/aws.py +330 -327
  94. dstack/_internal/server/services/logs/filelog.py +7 -6
  95. dstack/_internal/server/services/logs/gcp.py +141 -139
  96. dstack/_internal/server/services/plugins.py +1 -1
  97. dstack/_internal/server/services/projects.py +2 -5
  98. dstack/_internal/server/services/proxy/repo.py +5 -1
  99. dstack/_internal/server/services/requirements/__init__.py +0 -0
  100. dstack/_internal/server/services/requirements/combine.py +259 -0
  101. dstack/_internal/server/services/runner/client.py +7 -0
  102. dstack/_internal/server/services/runs.py +1 -1
  103. dstack/_internal/server/services/services/__init__.py +8 -2
  104. dstack/_internal/server/services/services/autoscalers.py +2 -0
  105. dstack/_internal/server/services/ssh.py +2 -1
  106. dstack/_internal/server/services/storage/__init__.py +5 -6
  107. dstack/_internal/server/services/storage/gcs.py +49 -49
  108. dstack/_internal/server/services/storage/s3.py +52 -52
  109. dstack/_internal/server/statics/index.html +1 -1
  110. dstack/_internal/server/testing/common.py +1 -1
  111. dstack/_internal/server/utils/logging.py +3 -3
  112. dstack/_internal/server/utils/provisioning.py +3 -3
  113. dstack/_internal/utils/json_schema.py +3 -1
  114. dstack/_internal/utils/typing.py +14 -0
  115. dstack/api/_public/repos.py +21 -2
  116. dstack/api/_public/runs.py +5 -7
  117. dstack/api/server/__init__.py +17 -19
  118. dstack/api/server/_gpus.py +2 -1
  119. dstack/api/server/_group.py +4 -3
  120. dstack/api/server/_repos.py +20 -3
  121. dstack/plugins/builtin/rest_plugin/_plugin.py +1 -0
  122. dstack/version.py +1 -1
  123. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/METADATA +1 -1
  124. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/RECORD +127 -124
  125. dstack/api/huggingface/__init__.py +0 -73
  126. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/WHEEL +0 -0
  127. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/entry_points.txt +0 -0
  128. {dstack-0.19.25.dist-info → dstack-0.19.26.dist-info}/licenses/LICENSE.md +0 -0
dstack/_internal/server/services/logs/aws.py

@@ -24,347 +24,350 @@ from dstack._internal.server.services.logs.base import (
 )
 from dstack._internal.utils.logging import get_logger
 
+logger = get_logger(__name__)
+
+
 BOTO_AVAILABLE = True
 try:
     import boto3
     import botocore.exceptions
 except ImportError:
     BOTO_AVAILABLE = False
-
-logger = get_logger(__name__)
-
-
-class _CloudWatchLogEvent(TypedDict):
-    timestamp: int  # unix time in milliseconds
-    message: str
-
-
-class CloudWatchLogStorage(LogStorage):
-    # "The maximum number of log events in a batch is 10,000".
-    EVENT_MAX_COUNT_IN_BATCH = 10000
-    # "The maximum batch size is 1,048,576 bytes" — exactly 1 MiB. "This size is calculated
-    # as the sum of all event messages in UTF-8, plus 26 bytes for each log event".
-    BATCH_MAX_SIZE = 1048576
-    # "Each log event can be no larger than 256 KB" — KB means KiB; includes MESSAGE_OVERHEAD_SIZE.
-    MESSAGE_MAX_SIZE = 262144
-    # Message size in bytes = len(message.encode("utf-8")) + MESSAGE_OVERHEAD_SIZE.
-    MESSAGE_OVERHEAD_SIZE = 26
-    # "A batch of log events in a single request cannot span more than 24 hours".
-    BATCH_MAX_SPAN = int(timedelta(hours=24).total_seconds()) * 1000
-    # Decrease allowed deltas by possible clock drift between dstack and CloudWatch.
-    CLOCK_DRIFT = int(timedelta(minutes=10).total_seconds()) * 1000
-    # "None of the log events in the batch can be more than 14 days in the past."
-    PAST_EVENT_MAX_DELTA = int((timedelta(days=14)).total_seconds()) * 1000 - CLOCK_DRIFT
-    # "None of the log events in the batch can be more than 2 hours in the future."
-    FUTURE_EVENT_MAX_DELTA = int((timedelta(hours=2)).total_seconds()) * 1000 - CLOCK_DRIFT
-    # Maximum number of retries when polling for log events to skip empty pages.
-    MAX_RETRIES = 10
-
-    def __init__(self, *, group: str, region: Optional[str] = None) -> None:
-        with self._wrap_boto_errors():
-            session = boto3.Session(region_name=region)
-            self._client = session.client("logs")
-            self._check_group_exists(group)
-        self._group = group
-        self._region = self._client.meta.region_name
-        # Stores names of already created streams.
-        # XXX: This set acts as an unbound cache. If this becomes a problem (in case of _very_ long
-        # running server and/or lots of jobs, consider replacing it with an LRU cache, e.g.,
-        # a simple OrderedDict-based implementation should be OK.
-        self._streams: Set[str] = set()
-
-    def close(self) -> None:
-        self._client.close()
-
-    def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
-        log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
-        stream = self._get_stream_name(
-            project.name, request.run_name, request.job_submission_id, log_producer
-        )
-        cw_events: List[_CloudWatchLogEvent]
-        next_token: Optional[str] = None
-        with self._wrap_boto_errors():
-            try:
-                cw_events, next_token = self._get_log_events_with_retry(stream, request)
-            except botocore.exceptions.ClientError as e:
-                if not self._is_resource_not_found_exception(e):
-                    raise
-                # Check if the group exists to distinguish between group not found vs stream not found
-                try:
-                    self._check_group_exists(self._group)
-                    # Group exists, so the error must be due to missing stream
-                    logger.debug("Stream %s not found, returning dummy response", stream)
-                    cw_events = []
-                except LogStorageError:
-                    # Group doesn't exist, re-raise the LogStorageError
-                    raise
-        logs = [
-            LogEvent(
-                timestamp=unix_time_ms_to_datetime(cw_event["timestamp"]),
-                log_source=LogEventSource.STDOUT,
-                message=cw_event["message"],
+else:
+
+    class _CloudWatchLogEvent(TypedDict):
+        timestamp: int  # unix time in milliseconds
+        message: str
+
+    class CloudWatchLogStorage(LogStorage):
+        # "The maximum number of log events in a batch is 10,000".
+        EVENT_MAX_COUNT_IN_BATCH = 10000
+        # "The maximum batch size is 1,048,576 bytes" — exactly 1 MiB. "This size is calculated
+        # as the sum of all event messages in UTF-8, plus 26 bytes for each log event".
+        BATCH_MAX_SIZE = 1048576
+        # "Each log event can be no larger than 256 KB" — KB means KiB; includes MESSAGE_OVERHEAD_SIZE.
+        MESSAGE_MAX_SIZE = 262144
+        # Message size in bytes = len(message.encode("utf-8")) + MESSAGE_OVERHEAD_SIZE.
+        MESSAGE_OVERHEAD_SIZE = 26
+        # "A batch of log events in a single request cannot span more than 24 hours".
+        BATCH_MAX_SPAN = int(timedelta(hours=24).total_seconds()) * 1000
+        # Decrease allowed deltas by possible clock drift between dstack and CloudWatch.
+        CLOCK_DRIFT = int(timedelta(minutes=10).total_seconds()) * 1000
+        # "None of the log events in the batch can be more than 14 days in the past."
+        PAST_EVENT_MAX_DELTA = int((timedelta(days=14)).total_seconds()) * 1000 - CLOCK_DRIFT
+        # "None of the log events in the batch can be more than 2 hours in the future."
+        FUTURE_EVENT_MAX_DELTA = int((timedelta(hours=2)).total_seconds()) * 1000 - CLOCK_DRIFT
+        # Maximum number of retries when polling for log events to skip empty pages.
+        MAX_RETRIES = 10
+
+        def __init__(self, *, group: str, region: Optional[str] = None) -> None:
+            with self._wrap_boto_errors():
+                session = boto3.Session(region_name=region)
+                self._client = session.client("logs")
+                self._check_group_exists(group)
+            self._group = group
+            self._region = self._client.meta.region_name
+            # Stores names of already created streams.
+            # XXX: This set acts as an unbound cache. If this becomes a problem (in case of _very_ long
+            # running server and/or lots of jobs, consider replacing it with an LRU cache, e.g.,
+            # a simple OrderedDict-based implementation should be OK.
+            self._streams: Set[str] = set()
+
+        def close(self) -> None:
+            self._client.close()
+
+        def poll_logs(self, project: ProjectModel, request: PollLogsRequest) -> JobSubmissionLogs:
+            log_producer = LogProducer.RUNNER if request.diagnose else LogProducer.JOB
+            stream = self._get_stream_name(
+                project.name, request.run_name, request.job_submission_id, log_producer
             )
-            for cw_event in cw_events
-        ]
-        return JobSubmissionLogs(
-            logs=logs,
-            external_url=self._get_stream_external_url(stream),
-            next_token=next_token,
-        )
-
-    def _get_log_events_with_retry(
-        self, stream: str, request: PollLogsRequest
-    ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
-        current_request = request
-        previous_next_token = request.next_token
-
-        for attempt in range(self.MAX_RETRIES):
-            cw_events, next_token = self._get_log_events(stream, current_request)
-
-            if cw_events:
-                return cw_events, next_token
-
-            if not next_token or next_token == previous_next_token:
-                return [], None
-
-            previous_next_token = next_token
-            current_request = PollLogsRequest(
-                run_name=request.run_name,
-                job_submission_id=request.job_submission_id,
-                start_time=request.start_time,
-                end_time=request.end_time,
-                descending=request.descending,
+            cw_events: List[_CloudWatchLogEvent]
+            next_token: Optional[str] = None
+            with self._wrap_boto_errors():
+                try:
+                    cw_events, next_token = self._get_log_events_with_retry(stream, request)
+                except botocore.exceptions.ClientError as e:
+                    if not self._is_resource_not_found_exception(e):
+                        raise
+                    # Check if the group exists to distinguish between group not found vs stream not found
+                    try:
+                        self._check_group_exists(self._group)
+                        # Group exists, so the error must be due to missing stream
+                        logger.debug("Stream %s not found, returning dummy response", stream)
+                        cw_events = []
+                    except LogStorageError:
+                        # Group doesn't exist, re-raise the LogStorageError
+                        raise
+            logs = [
+                LogEvent(
+                    timestamp=unix_time_ms_to_datetime(cw_event["timestamp"]),
+                    log_source=LogEventSource.STDOUT,
+                    message=cw_event["message"],
+                )
+                for cw_event in cw_events
+            ]
+            return JobSubmissionLogs(
+                logs=logs,
+                external_url=self._get_stream_external_url(stream),
                 next_token=next_token,
-                limit=request.limit,
-                diagnose=request.diagnose,
             )
 
-        if not request.descending:
-            logger.debug(
-                "Stream %s: exhausted %d retries without finding logs, returning empty response",
-                stream,
-                self.MAX_RETRIES,
-            )
-        # Only return the next token after exhausting retries if going descending—
-        # AWS CloudWatch guarantees more logs in that case. In ascending mode,
-        # next token is always returned, even if no logs remain.
-        # So descending works reliably; ascending has limits if gaps are too large.
-        # In the future, UI/CLI should handle retries, and we can return next token for ascending too.
-        return [], next_token if request.descending else None
-
-    def _get_log_events(
-        self, stream: str, request: PollLogsRequest
-    ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
-        start_from_head = not request.descending
-        parameters = {
-            "logGroupName": self._group,
-            "logStreamName": stream,
-            "limit": request.limit,
-            "startFromHead": start_from_head,
-        }
-
-        if request.start_time:
-            parameters["startTime"] = datetime_to_unix_time_ms(request.start_time)
-
-        if request.end_time:
-            parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
-        elif start_from_head:
-            # When startFromHead=true and no endTime is provided, set endTime to "now"
-            # to prevent infinite pagination as new logs arrive faster than we can read them
-            parameters["endTime"] = datetime_to_unix_time_ms(datetime.now(timezone.utc))
-
-        if request.next_token:
-            parameters["nextToken"] = request.next_token
-
-        response = self._client.get_log_events(**parameters)
-
-        events = response.get("events", [])
-        next_token_key = "nextForwardToken" if start_from_head else "nextBackwardToken"
-        next_token = response.get(next_token_key)
-
-        # TODO: The code below is not going to be used until we migrate from base64-encoded logs to plain text logs.
-        if request.descending:
-            events = list(reversed(events))
-
-        return events, next_token
-
-    def _get_stream_external_url(self, stream: str) -> str:
-        quoted_group = urllib.parse.quote(self._group, safe="")
-        quoted_stream = urllib.parse.quote(stream, safe="")
-        return f"https://console.aws.amazon.com/cloudwatch/home?region={self._region}#logsV2:log-groups/log-group/{quoted_group}/log-events/{quoted_stream}"
-
-    def write_logs(
-        self,
-        project: ProjectModel,
-        run_name: str,
-        job_submission_id: UUID,
-        runner_logs: List[RunnerLogEvent],
-        job_logs: List[RunnerLogEvent],
-    ):
-        if len(runner_logs) > 0:
-            runner_stream = self._get_stream_name(
-                project.name, run_name, job_submission_id, LogProducer.RUNNER
-            )
-            self._write_logs(
-                stream=runner_stream,
-                log_events=runner_logs,
-            )
-        if len(job_logs) > 0:
-            jog_stream = self._get_stream_name(
-                project.name, run_name, job_submission_id, LogProducer.JOB
-            )
-            self._write_logs(
-                stream=jog_stream,
-                log_events=job_logs,
-            )
+        def _get_log_events_with_retry(
+            self, stream: str, request: PollLogsRequest
+        ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+            current_request = request
+            previous_next_token = request.next_token
+            next_token = None
+
+            for _ in range(self.MAX_RETRIES):
+                cw_events, next_token = self._get_log_events(stream, current_request)
+
+                if cw_events:
+                    return cw_events, next_token
+
+                if not next_token or next_token == previous_next_token:
+                    return [], None
+
+                previous_next_token = next_token
+                current_request = PollLogsRequest(
+                    run_name=request.run_name,
+                    job_submission_id=request.job_submission_id,
+                    start_time=request.start_time,
+                    end_time=request.end_time,
+                    descending=request.descending,
+                    next_token=next_token,
+                    limit=request.limit,
+                    diagnose=request.diagnose,
+                )
 
-    def _write_logs(self, stream: str, log_events: List[RunnerLogEvent]) -> None:
-        with self._wrap_boto_errors():
-            self._ensure_stream_exists(stream)
-            try:
+            if not request.descending:
+                logger.debug(
+                    "Stream %s: exhausted %d retries without finding logs, returning empty response",
+                    stream,
+                    self.MAX_RETRIES,
+                )
+            # Only return the next token after exhausting retries if going descending—
+            # AWS CloudWatch guarantees more logs in that case. In ascending mode,
+            # next token is always returned, even if no logs remain.
+            # So descending works reliably; ascending has limits if gaps are too large.
+            # In the future, UI/CLI should handle retries, and we can return next token for ascending too.
+            return [], next_token if request.descending else None
+
+        def _get_log_events(
+            self, stream: str, request: PollLogsRequest
+        ) -> Tuple[List[_CloudWatchLogEvent], Optional[str]]:
+            start_from_head = not request.descending
+            parameters = {
+                "logGroupName": self._group,
+                "logStreamName": stream,
+                "limit": request.limit,
+                "startFromHead": start_from_head,
+            }
+
+            if request.start_time:
+                parameters["startTime"] = datetime_to_unix_time_ms(request.start_time)
+
+            if request.end_time:
+                parameters["endTime"] = datetime_to_unix_time_ms(request.end_time)
+            elif start_from_head:
+                # When startFromHead=true and no endTime is provided, set endTime to "now"
+                # to prevent infinite pagination as new logs arrive faster than we can read them
+                parameters["endTime"] = datetime_to_unix_time_ms(datetime.now(timezone.utc))
+
+            if request.next_token:
+                parameters["nextToken"] = request.next_token
+
+            response = self._client.get_log_events(**parameters)
+
+            events = response.get("events", [])
+            next_token_key = "nextForwardToken" if start_from_head else "nextBackwardToken"
+            next_token = response.get(next_token_key)
+
+            # TODO: The code below is not going to be used until we migrate from base64-encoded logs to plain text logs.
+            if request.descending:
+                events = list(reversed(events))
+
+            return events, next_token
+
+        def _get_stream_external_url(self, stream: str) -> str:
+            quoted_group = urllib.parse.quote(self._group, safe="")
+            quoted_stream = urllib.parse.quote(stream, safe="")
+            return f"https://console.aws.amazon.com/cloudwatch/home?region={self._region}#logsV2:log-groups/log-group/{quoted_group}/log-events/{quoted_stream}"
+
+        def write_logs(
+            self,
+            project: ProjectModel,
+            run_name: str,
+            job_submission_id: UUID,
+            runner_logs: List[RunnerLogEvent],
+            job_logs: List[RunnerLogEvent],
+        ):
+            if len(runner_logs) > 0:
+                runner_stream = self._get_stream_name(
+                    project.name, run_name, job_submission_id, LogProducer.RUNNER
+                )
+                self._write_logs(
+                    stream=runner_stream,
+                    log_events=runner_logs,
+                )
+            if len(job_logs) > 0:
+                jog_stream = self._get_stream_name(
+                    project.name, run_name, job_submission_id, LogProducer.JOB
+                )
+                self._write_logs(
+                    stream=jog_stream,
+                    log_events=job_logs,
+                )
+
+        def _write_logs(self, stream: str, log_events: List[RunnerLogEvent]) -> None:
+            with self._wrap_boto_errors():
+                self._ensure_stream_exists(stream)
+                try:
+                    self._put_log_events(stream, log_events)
+                    return
+                except botocore.exceptions.ClientError as e:
+                    if not self._is_resource_not_found_exception(e):
+                        raise
+                    logger.debug("Stream %s not found, recreating", stream)
+                    # The stream is probably deleted due to retention policy, our cache is stale.
+                    self._ensure_stream_exists(stream, force=True)
                 self._put_log_events(stream, log_events)
-                return
-            except botocore.exceptions.ClientError as e:
-                if not self._is_resource_not_found_exception(e):
-                    raise
-                logger.debug("Stream %s not found, recreating", stream)
-                # The stream is probably deleted due to retention policy, our cache is stale.
-                self._ensure_stream_exists(stream, force=True)
-                self._put_log_events(stream, log_events)
-
-    def _put_log_events(self, stream: str, log_events: List[RunnerLogEvent]) -> None:
-        # Python docs: "The built-in sorted() function is guaranteed to be stable."
-        sorted_log_events = sorted(log_events, key=operator.attrgetter("timestamp"))
-        if tuple(map(id, log_events)) != tuple(map(id, sorted_log_events)):
-            logger.error(
-                "Stream %s: events are not in chronological order, something wrong with runner",
-                stream,
-            )
-        for batch in self._get_batch_iter(stream, sorted_log_events):
-            self._client.put_log_events(
-                logGroupName=self._group,
-                logStreamName=stream,
-                logEvents=batch,
-            )
 
-    def _get_batch_iter(
-        self, stream: str, log_events: List[RunnerLogEvent]
-    ) -> Iterator[List[_CloudWatchLogEvent]]:
-        shared_event_iter = iter(log_events)
-        event_iter = shared_event_iter
-        while True:
-            batch, excessive_event = self._get_next_batch(stream, event_iter)
-            if not batch:
-                return
-            yield batch
-            if excessive_event is not None:
-                event_iter = itertools.chain([excessive_event], shared_event_iter)
-            else:
-                event_iter = shared_event_iter
-
-    def _get_next_batch(
-        self, stream: str, event_iter: Iterator[RunnerLogEvent]
-    ) -> Tuple[List[_CloudWatchLogEvent], Optional[RunnerLogEvent]]:
-        now_timestamp = int(datetime.now(timezone.utc).timestamp() * 1000)
-        batch: List[_CloudWatchLogEvent] = []
-        total_size = 0
-        event_count = 0
-        first_timestamp: Optional[int] = None
-        skipped_past_events = 0
-        skipped_future_events = 0
-        # event that doesn't fit in the current batch
-        excessive_event: Optional[RunnerLogEvent] = None
-        for event in event_iter:
-            # Normally there should not be empty messages.
-            if not event.message:
-                continue
-            timestamp = event.timestamp
-            if first_timestamp is None:
-                first_timestamp = timestamp
-            elif timestamp - first_timestamp > self.BATCH_MAX_SPAN:
-                excessive_event = event
-                break
-            if now_timestamp - timestamp > self.PAST_EVENT_MAX_DELTA:
-                skipped_past_events += 1
-                continue
-            if timestamp - now_timestamp > self.FUTURE_EVENT_MAX_DELTA:
-                skipped_future_events += 1
-                continue
-            cw_event = self._runner_log_event_to_cloudwatch_event(event)
-            message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE
-            if message_size > self.MESSAGE_MAX_SIZE:
-                # we should never hit this limit, as we use `io.Copy` to copy from pty to logs,
-                # which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go,
-                # `execJob` -> `io.Copy(logger, ptmx)`
+        def _put_log_events(self, stream: str, log_events: List[RunnerLogEvent]) -> None:
+            # Python docs: "The built-in sorted() function is guaranteed to be stable."
+            sorted_log_events = sorted(log_events, key=operator.attrgetter("timestamp"))
+            if tuple(map(id, log_events)) != tuple(map(id, sorted_log_events)):
                 logger.error(
-                    "Stream %s: skipping event %d, message exceeds max size: %d > %d",
+                    "Stream %s: events are not in chronological order, something wrong with runner",
                     stream,
-                    timestamp,
-                    message_size,
-                    self.MESSAGE_MAX_SIZE,
                 )
-                continue
-            if total_size + message_size > self.BATCH_MAX_SIZE:
-                excessive_event = event
-                break
-            batch.append(cw_event)
-            total_size += message_size
-            event_count += 1
-            if event_count >= self.EVENT_MAX_COUNT_IN_BATCH:
-                break
-        if skipped_past_events > 0:
-            logger.error("Stream %s: skipping %d past event(s)", stream, skipped_past_events)
-        if skipped_future_events > 0:
-            logger.error("Stream %s: skipping %d future event(s)", stream, skipped_future_events)
-        return batch, excessive_event
-
-    def _runner_log_event_to_cloudwatch_event(
-        self, runner_log_event: RunnerLogEvent
-    ) -> _CloudWatchLogEvent:
-        return {
-            "timestamp": runner_log_event.timestamp,
-            "message": runner_log_event.message.decode(errors="replace"),
-        }
-
-    @contextmanager
-    def _wrap_boto_errors(self) -> Iterator[None]:
-        try:
-            yield
-        except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError) as e:
-            raise LogStorageError(f"CloudWatch Logs error: {type(e).__name__}: {e}") from e
-
-    def _is_resource_not_found_exception(self, exc: "botocore.exceptions.ClientError") -> bool:
-        try:
-            return exc.response["Error"]["Code"] == "ResourceNotFoundException"
-        except KeyError:
-            return False
-
-    def _check_group_exists(self, name: str) -> None:
-        try:
-            self._client.describe_log_streams(logGroupName=name, limit=1)
-        except botocore.exceptions.ClientError as e:
-            if self._is_resource_not_found_exception(e):
-                raise LogStorageError(f"LogGroup '{name}' does not exist")
-            raise
-
-    def _ensure_stream_exists(self, name: str, *, force: bool = False) -> None:
-        if not force and name in self._streams:
-            return
-        response = self._client.describe_log_streams(
-            logGroupName=self._group, logStreamNamePrefix=name
-        )
-        for stream in response["logStreams"]:
-            if stream["logStreamName"] == name:
-                self._streams.add(name)
+            for batch in self._get_batch_iter(stream, sorted_log_events):
+                self._client.put_log_events(
+                    logGroupName=self._group,
+                    logStreamName=stream,
+                    logEvents=batch,
+                )
+
+        def _get_batch_iter(
+            self, stream: str, log_events: List[RunnerLogEvent]
+        ) -> Iterator[List[_CloudWatchLogEvent]]:
+            shared_event_iter = iter(log_events)
+            event_iter = shared_event_iter
+            while True:
+                batch, excessive_event = self._get_next_batch(stream, event_iter)
+                if not batch:
+                    return
+                yield batch
+                if excessive_event is not None:
+                    event_iter = itertools.chain([excessive_event], shared_event_iter)
+                else:
+                    event_iter = shared_event_iter
+
+        def _get_next_batch(
+            self, stream: str, event_iter: Iterator[RunnerLogEvent]
+        ) -> Tuple[List[_CloudWatchLogEvent], Optional[RunnerLogEvent]]:
+            now_timestamp = int(datetime.now(timezone.utc).timestamp() * 1000)
+            batch: List[_CloudWatchLogEvent] = []
+            total_size = 0
+            event_count = 0
+            first_timestamp: Optional[int] = None
+            skipped_past_events = 0
+            skipped_future_events = 0
+            # event that doesn't fit in the current batch
+            excessive_event: Optional[RunnerLogEvent] = None
+            for event in event_iter:
+                # Normally there should not be empty messages.
+                if not event.message:
+                    continue
+                timestamp = event.timestamp
+                if first_timestamp is None:
+                    first_timestamp = timestamp
+                elif timestamp - first_timestamp > self.BATCH_MAX_SPAN:
+                    excessive_event = event
+                    break
+                if now_timestamp - timestamp > self.PAST_EVENT_MAX_DELTA:
+                    skipped_past_events += 1
+                    continue
+                if timestamp - now_timestamp > self.FUTURE_EVENT_MAX_DELTA:
+                    skipped_future_events += 1
+                    continue
+                cw_event = self._runner_log_event_to_cloudwatch_event(event)
+                message_size = len(event.message) + self.MESSAGE_OVERHEAD_SIZE
+                if message_size > self.MESSAGE_MAX_SIZE:
+                    # we should never hit this limit, as we use `io.Copy` to copy from pty to logs,
+                    # which under the hood uses 32KiB buffer, see runner/internal/executor/executor.go,
+                    # `execJob` -> `io.Copy(logger, ptmx)`
+                    logger.error(
+                        "Stream %s: skipping event %d, message exceeds max size: %d > %d",
+                        stream,
+                        timestamp,
+                        message_size,
+                        self.MESSAGE_MAX_SIZE,
+                    )
+                    continue
+                if total_size + message_size > self.BATCH_MAX_SIZE:
+                    excessive_event = event
+                    break
+                batch.append(cw_event)
+                total_size += message_size
+                event_count += 1
+                if event_count >= self.EVENT_MAX_COUNT_IN_BATCH:
+                    break
+            if skipped_past_events > 0:
+                logger.error("Stream %s: skipping %d past event(s)", stream, skipped_past_events)
+            if skipped_future_events > 0:
+                logger.error(
+                    "Stream %s: skipping %d future event(s)", stream, skipped_future_events
+                )
+            return batch, excessive_event
+
+        def _runner_log_event_to_cloudwatch_event(
+            self, runner_log_event: RunnerLogEvent
+        ) -> _CloudWatchLogEvent:
+            return {
+                "timestamp": runner_log_event.timestamp,
+                "message": runner_log_event.message.decode(errors="replace"),
+            }
+
+        @contextmanager
+        def _wrap_boto_errors(self) -> Iterator[None]:
+            try:
+                yield
+            except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError) as e:
+                raise LogStorageError(f"CloudWatch Logs error: {type(e).__name__}: {e}") from e
+
+        def _is_resource_not_found_exception(self, exc: "botocore.exceptions.ClientError") -> bool:
+            try:
+                return exc.response["Error"]["Code"] == "ResourceNotFoundException"
+            except KeyError:
+                return False
+
+        def _check_group_exists(self, name: str) -> None:
+            try:
+                self._client.describe_log_streams(logGroupName=name, limit=1)
+            except botocore.exceptions.ClientError as e:
+                if self._is_resource_not_found_exception(e):
+                    raise LogStorageError(f"LogGroup '{name}' does not exist")
+                raise
+
+        def _ensure_stream_exists(self, name: str, *, force: bool = False) -> None:
+            if not force and name in self._streams:
                 return
-        self._client.create_log_stream(logGroupName=self._group, logStreamName=name)
-        self._streams.add(name)
-
-    def _get_stream_name(
-        self,
-        project_name: str,
-        run_name: str,
-        job_submission_id: UUID,
-        producer: LogProducer,
-    ) -> str:
-        return f"{project_name}/{run_name}/{job_submission_id}/{producer.value}"
+            response = self._client.describe_log_streams(
+                logGroupName=self._group, logStreamNamePrefix=name
+            )
+            for stream in response["logStreams"]:
+                if stream["logStreamName"] == name:
+                    self._streams.add(name)
+                    return
+            self._client.create_log_stream(logGroupName=self._group, logStreamName=name)
+            self._streams.add(name)
+
+        def _get_stream_name(
+            self,
+            project_name: str,
+            run_name: str,
+            job_submission_id: UUID,
+            producer: LogProducer,
+        ) -> str:
+            return f"{project_name}/{run_name}/{job_submission_id}/{producer.value}"