sagemaker-ops-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sagemaker_ops/__init__.py +4 -0
- sagemaker_ops/aws.py +502 -0
- sagemaker_ops/cli.py +262 -0
- sagemaker_ops/tui.py +458 -0
- sagemaker_ops_cli-0.1.0.dist-info/METADATA +241 -0
- sagemaker_ops_cli-0.1.0.dist-info/RECORD +9 -0
- sagemaker_ops_cli-0.1.0.dist-info/WHEEL +5 -0
- sagemaker_ops_cli-0.1.0.dist-info/entry_points.txt +2 -0
- sagemaker_ops_cli-0.1.0.dist-info/top_level.txt +1 -0
sagemaker_ops/tui.py
ADDED
|
@@ -0,0 +1,458 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any
|
|
4
|
+
|
|
5
|
+
from textual import on
|
|
6
|
+
from textual.binding import Binding
|
|
7
|
+
from textual.app import App, ComposeResult
|
|
8
|
+
from textual.containers import Horizontal, Vertical
|
|
9
|
+
from textual.widgets import DataTable, Footer, Header, RichLog, Static
|
|
10
|
+
|
|
11
|
+
from sagemaker_ops.aws import (
|
|
12
|
+
AwsCliError,
|
|
13
|
+
AwsContext,
|
|
14
|
+
ProcessingJobView,
|
|
15
|
+
PipelineExecutionView,
|
|
16
|
+
build_contexts,
|
|
17
|
+
format_dt,
|
|
18
|
+
format_duration,
|
|
19
|
+
infer_log_source,
|
|
20
|
+
list_active_pipeline_executions,
|
|
21
|
+
list_pipeline_steps,
|
|
22
|
+
list_processing_jobs,
|
|
23
|
+
tail_step_logs,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class BaseSageMakerApp(App[None]):
    """Common scaffolding shared by the SageMaker monitoring TUIs.

    Stores the AWS profile/region selection, carries the stylesheet used by
    the concrete apps, and wires up the periodic refresh.  ``on_mount`` calls
    ``self.action_refresh`` and ``show_error`` looks up a ``Static`` widget
    with the id ``status``; neither is provided here, so subclasses must
    supply both.
    """

    CSS = """
    Screen {
        layout: vertical;
    }

    #content {
        height: 1fr;
    }

    DataTable {
        height: 1fr;
        border: solid $surface;
    }

    #detail, #logs {
        height: 1fr;
        border: solid $surface;
        padding: 1;
    }

    #detail {
        width: 40%;
    }

    #pipeline-content {
        height: 1fr;
    }

    #executions-pane {
        height: 42%;
    }

    #bottom-pane {
        height: 58%;
    }

    #steps-pane {
        width: 48%;
    }

    #logs-pane {
        width: 52%;
    }
    """

    # Kept as a plain list so subclasses can extend it with ``+ [...]``.
    BINDINGS = [
        ("r", "refresh", "Refresh"),
        ("q", "quit", "Quit"),
    ]

    def __init__(
        self,
        profiles: tuple[str, ...],
        region: str | None,
        all_profiles: bool,
        refresh_seconds: int,
    ) -> None:
        """Remember the AWS selection and the refresh period.

        Args:
            profiles: AWS profile names handed to ``build_contexts``.
            region: Region handed to ``build_contexts``; may be ``None``.
            all_profiles: Flag handed to ``build_contexts``.
            refresh_seconds: Interval passed to ``set_interval`` on mount.
        """
        super().__init__()
        self.profiles = profiles
        self.region = region
        self.all_profiles = all_profiles
        self.refresh_seconds = refresh_seconds
        # Filled lazily by ``load_contexts``.
        self.contexts: list[AwsContext] = []

    def load_contexts(self) -> list[AwsContext]:
        """Return the AWS contexts, building them on first use.

        An empty result is not cached: while ``self.contexts`` is empty every
        call invokes ``build_contexts`` again.
        """
        cached = self.contexts
        if cached:
            return cached
        self.contexts = build_contexts(self.profiles, self.region, self.all_profiles)
        return self.contexts

    def on_mount(self) -> None:
        """Schedule the periodic refresh and run one refresh immediately."""
        refresh = self.action_refresh
        self.set_interval(self.refresh_seconds, refresh)
        refresh()

    def show_error(self, exc: Exception) -> None:
        """Display *exc* in red in the ``#status`` widget."""
        status = self.query_one("#status", Static)
        # NOTE(review): the exception text is interpolated unescaped into a
        # markup string; messages containing square brackets may presumably
        # be misinterpreted as tags -- confirm.
        message = f"[red]{exc}[/red]"
        status.update(message)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class ProcessingJobsApp(BaseSageMakerApp):
    """TUI with a table of processing jobs and a detail pane for the selection.

    The table (``#jobs``) is rebuilt on every refresh from
    ``list_processing_jobs``; the detail pane (``#detail``) follows the
    highlighted row.  The selected job is remembered by ARN so that a
    refresh does not move the cursor back to the first row.
    """

    TITLE = "SageMaker Processing Jobs"
    BINDINGS = BaseSageMakerApp.BINDINGS + [
        Binding("left", "previous_job", "Previous", priority=True),
        Binding("up", "previous_job", "Previous", priority=True),
        Binding("right", "next_job", "Next", priority=True),
        Binding("down", "next_job", "Next", priority=True),
    ]

    def __init__(
        self,
        profiles: tuple[str, ...],
        region: str | None,
        all_profiles: bool,
        refresh_seconds: int,
    ) -> None:
        """Initialise the app; arguments are forwarded to the base class."""
        super().__init__(profiles, region, all_profiles, refresh_seconds)
        self.jobs: list[ProcessingJobView] = []
        # ARN of the job currently shown in the detail pane.  Used by
        # ``render_jobs`` to restore the selection after the table is rebuilt.
        self._selected_job_arn: str | None = None

    def compose(self) -> ComposeResult:
        """Build the layout: header, status line, table + detail, footer."""
        yield Header()
        yield Static("Loading...", id="status")
        with Horizontal(id="content"):
            table = DataTable(id="jobs")
            table.cursor_type = "row"
            yield table
            yield Static("", id="detail")
        yield Footer()

    def on_mount(self) -> None:
        """Create the table columns, then start the refresh cycle."""
        table = self.query_one("#jobs", DataTable)
        table.add_columns("Profile", "Region", "Job", "Status", "Runtime", "Instance", "Created")
        super().on_mount()

    def action_refresh(self) -> None:
        """Reload jobs from every context and redraw; show AWS errors in the status line."""
        try:
            self.jobs = [job for ctx in self.load_contexts() for job in list_processing_jobs(ctx)]
            self.render_jobs()
            self.query_one("#status", Static).update(
                f"{len(self.jobs)} running processing job(s). Refresh every {self.refresh_seconds}s. "
                "Use arrows to move, r to refresh, q to quit."
            )
        except AwsCliError as exc:
            self.show_error(exc)

    def action_previous_job(self) -> None:
        """Move the cursor one row up, stopping at the first row."""
        table = self.query_one("#jobs", DataTable)
        if table.row_count:
            table.move_cursor(row=max(0, table.cursor_row - 1))

    def action_next_job(self) -> None:
        """Move the cursor one row down, stopping at the last row."""
        table = self.query_one("#jobs", DataTable)
        if table.row_count:
            table.move_cursor(row=min(table.row_count - 1, table.cursor_row + 1))

    @staticmethod
    def _instance_label(job: ProcessingJobView) -> str:
        """Return ``"<count>x <type>"``, or just the type when no count is set."""
        if job.instance_count:
            return f"{job.instance_count}x {job.instance_type}"
        return job.instance_type

    def render_jobs(self) -> None:
        """Rebuild the table from ``self.jobs`` and restore the previous selection.

        Rows are keyed by their index in ``self.jobs`` (as a string) so that
        the highlight handler can map an event back to a job.
        """
        table = self.query_one("#jobs", DataTable)
        # Capture before rebuilding: ``update_processing_detail`` overwrites it.
        preferred_arn = self._selected_job_arn
        selected_index = 0 if self.jobs else None
        table.clear()
        for index, job in enumerate(self.jobs):
            if preferred_arn and job.arn == preferred_arn:
                selected_index = index
            table.add_row(
                job.profile,
                job.region,
                job.name,
                job.status,
                format_duration(job.started_time or job.creation_time),
                self._instance_label(job),
                format_dt(job.creation_time),
                key=str(index),
            )
        # Index 0 needs no explicit move; only reposition for a later row.
        if selected_index:
            table.move_cursor(row=selected_index)
        self.update_processing_detail(selected_index)

    @on(DataTable.RowHighlighted, "#jobs")
    def on_job_highlighted(self, event: DataTable.RowHighlighted) -> None:
        """Show the details of the row that has just been highlighted."""
        try:
            self.update_processing_detail(int(str(event.row_key.value)))
        except (TypeError, ValueError):
            # Row keys are written by ``render_jobs`` as ``str(index)``; a key
            # that does not parse as an int is not ours, so it is ignored.
            pass

    def update_processing_detail(self, index: int | None) -> None:
        """Render the detail pane for ``self.jobs[index]``.

        Args:
            index: Position in ``self.jobs``.  ``None`` or an out-of-range
                value shows the "no jobs" placeholder.
        """
        detail = self.query_one("#detail", Static)
        if index is None or not 0 <= index < len(self.jobs):
            detail.update("No running processing jobs.")
            return
        job = self.jobs[index]
        self._selected_job_arn = job.arn
        lines = [
            f"[bold]{job.name}[/bold]",
            f"Profile: {job.profile}",
            f"Region: {job.region}",
            f"Status: {job.status}",
            f"Created: {format_dt(job.creation_time)}",
            f"Started: {format_dt(job.started_time)}",
            f"Runtime: {format_duration(job.started_time or job.creation_time)}",
            f"Instance: {self._instance_label(job)}".strip(),
            f"Role: {job.role_arn}",
            f"Arn: {job.arn}",
        ]
        # Only add the failure line when there is one, so the text does not
        # end with an empty line.
        if job.failure_reason:
            lines.append(f"Failure: {job.failure_reason}")
        detail.update("\n".join(lines))
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
class PipelineExecutionsApp(BaseSageMakerApp):
    """TUI with pipeline executions, the steps of the selected one, and a log pane.

    Three widgets cooperate: the ``#executions`` table, the ``#steps`` table
    for the selected execution, and the ``#logs`` pane, which shows either a
    hint/failure summary for the selected step or the log tail loaded with
    the ``l`` key.  ``loaded_log_step_key`` records which step's tail is
    currently displayed so that a refresh does not discard it.
    """

    TITLE = "SageMaker Pipeline Executions"
    BINDINGS = BaseSageMakerApp.BINDINGS + [
        ("left", "focus_executions", "Executions"),
        ("right", "focus_steps", "Steps"),
        ("l", "load_logs", "Logs"),
    ]

    def __init__(
        self,
        profiles: tuple[str, ...],
        region: str | None,
        all_profiles: bool,
        refresh_seconds: int,
        pipeline_name: str | None,
        recent_hours: int = 3,
    ) -> None:
        """Initialise the app.

        Args:
            profiles: Forwarded to the base class.
            region: Forwarded to the base class.
            all_profiles: Forwarded to the base class.
            refresh_seconds: Forwarded to the base class.
            pipeline_name: Passed to ``list_active_pipeline_executions``;
                may be ``None``.
            recent_hours: Passed to ``list_active_pipeline_executions`` and
                shown in the status line.
        """
        super().__init__(profiles, region, all_profiles, refresh_seconds)
        self.pipeline_name = pipeline_name
        self.recent_hours = recent_hours
        self.executions: list[tuple[AwsContext, PipelineExecutionView]] = []
        self.steps: list[dict[str, Any]] = []
        self.selected_context: AwsContext | None = None
        self.selected_execution_arn: str | None = None
        # (execution ARN, step name) whose log tail is shown, or None.
        self.loaded_log_step_key: tuple[str, str] | None = None
        # Flags used to ignore highlight events caused by our own redraws.
        self._rendering_executions = False
        self._updating_steps = False
        self._suppress_next_execution_highlight = False
        self._suppress_next_step_highlight = False

    def compose(self) -> ComposeResult:
        """Build the layout: executions on top, steps and logs side by side below."""
        yield Header()
        yield Static("Loading...", id="status")
        with Vertical(id="pipeline-content"):
            with Vertical(id="executions-pane"):
                executions = DataTable(id="executions")
                executions.cursor_type = "row"
                yield executions
            with Horizontal(id="bottom-pane"):
                with Vertical(id="steps-pane"):
                    steps = DataTable(id="steps")
                    steps.cursor_type = "row"
                    yield steps
                with Vertical(id="logs-pane"):
                    yield RichLog(id="logs", wrap=True, highlight=True, auto_scroll=False)
        yield Footer()

    def on_mount(self) -> None:
        """Create the columns of both tables, then start the refresh cycle."""
        executions = self.query_one("#executions", DataTable)
        executions.add_columns("Profile", "Region", "Pipeline", "Execution", "Status", "Runtime")
        steps = self.query_one("#steps", DataTable)
        steps.add_columns("Step", "Type", "Status", "Runtime", "Failure")
        super().on_mount()

    def action_refresh(self) -> None:
        """Reload executions from every context, keeping the current selection and logs."""
        # Capture the selection first: rendering resets both values.
        previous_execution_arn = self.selected_execution_arn
        previous_step_name = self.selected_step_name()
        try:
            pairs: list[tuple[AwsContext, PipelineExecutionView]] = []
            for ctx in self.load_contexts():
                for execution in list_active_pipeline_executions(
                    ctx, pipeline_name=self.pipeline_name, recent_hours=self.recent_hours
                ):
                    pairs.append((ctx, execution))
            self.executions = pairs
            self.render_executions(previous_execution_arn, previous_step_name, preserve_logs=True)
            self.query_one("#status", Static).update(
                f"{len(self.executions)} active/recent pipeline execution(s), window={self.recent_hours}h. "
                "Use left/right to switch panes, arrows to move, l to load failed-step logs."
            )
        except AwsCliError as exc:
            self.show_error(exc)

    def action_focus_executions(self) -> None:
        """Give keyboard focus to the executions table."""
        self.query_one("#executions", DataTable).focus()

    def action_focus_steps(self) -> None:
        """Give keyboard focus to the steps table."""
        self.query_one("#steps", DataTable).focus()

    def action_load_logs(self) -> None:
        """Key-binding entry point for loading the selected step's log tail."""
        self.load_selected_step_logs()

    def render_executions(
        self,
        preferred_execution_arn: str | None = None,
        preferred_step_name: str | None = None,
        preserve_logs: bool = False,
    ) -> None:
        """Rebuild the executions table and then the steps of the selected row.

        Args:
            preferred_execution_arn: Execution to select if it is still
                present; otherwise the first row is selected.
            preferred_step_name: Forwarded to ``update_steps``.
            preserve_logs: Forwarded to ``update_steps``.
        """
        table = self.query_one("#executions", DataTable)
        selected_index = 0 if self.executions else None
        self._rendering_executions = True
        try:
            table.clear()
            for index, (_, execution) in enumerate(self.executions):
                if preferred_execution_arn and execution.execution_arn == preferred_execution_arn:
                    selected_index = index
                table.add_row(
                    execution.profile,
                    execution.region,
                    execution.pipeline_name,
                    # Fall back to the last ARN segment when there is no display name.
                    execution.display_name or execution.execution_arn.rsplit("/", 1)[-1],
                    execution.status,
                    format_duration(execution.start_time),
                    key=str(index),
                )
            if selected_index is not None and table.row_count:
                # The steps are loaded explicitly below, so the highlight
                # event triggered by this cursor move must not load them again.
                self._suppress_next_execution_highlight = True
                table.move_cursor(row=selected_index, scroll=False)
        finally:
            self._rendering_executions = False
        self.update_steps(selected_index, preferred_step_name=preferred_step_name, preserve_logs=preserve_logs)

    @on(DataTable.RowHighlighted, "#executions")
    def on_execution_highlighted(self, event: DataTable.RowHighlighted) -> None:
        """Load the steps of the highlighted execution, unless the event is ours."""
        if self._rendering_executions or self._suppress_next_execution_highlight:
            self._suppress_next_execution_highlight = False
            return
        try:
            self.update_steps(int(str(event.row_key.value)))
        except (TypeError, ValueError):
            # Row keys are written by ``render_executions`` as ``str(index)``;
            # anything else is ignored.
            pass

    @on(DataTable.RowHighlighted, "#steps")
    def on_step_highlighted(self, _: DataTable.RowHighlighted) -> None:
        """Replace the log pane with the hint for the newly highlighted step."""
        if self._updating_steps or self._suppress_next_step_highlight:
            self._suppress_next_step_highlight = False
            return
        if self._logs_match_selection():
            return
        self.loaded_log_step_key = None
        self.render_step_failure_or_hint()

    def _logs_match_selection(self) -> bool:
        """Return True when the loaded log tail belongs to the selected step.

        With no identifiable step selected the answer is always False; two
        ``None`` keys must not be treated as a match.
        """
        current_key = self.selected_step_key()
        return current_key is not None and self.loaded_log_step_key == current_key

    def update_steps(
        self,
        index: int | None,
        preferred_step_name: str | None = None,
        preserve_logs: bool = False,
    ) -> None:
        """Load and display the steps of ``self.executions[index]``.

        Args:
            index: Position in ``self.executions``.  ``None`` or an
                out-of-range value clears the steps table and shows a
                placeholder in the log pane.
            preferred_step_name: Step to select if present; otherwise the
                first step is selected.
            preserve_logs: When true, a log tail already loaded for the
                step that ends up selected is left in place.
        """
        self.steps = []
        self.selected_context = None
        self.selected_execution_arn = None
        logs = self.query_one("#logs", RichLog)
        if index is None or not 0 <= index < len(self.executions):
            self.query_one("#steps", DataTable).clear()
            self.loaded_log_step_key = None
            logs.clear()
            logs.write(f"No active or recent pipeline executions in the last {self.recent_hours}h.")
            return

        ctx, execution = self.executions[index]
        self.selected_context = ctx
        self.selected_execution_arn = execution.execution_arn
        table = self.query_one("#steps", DataTable)
        try:
            self.steps = list_pipeline_steps(ctx, execution.execution_arn)
        except AwsCliError as exc:
            # Drop the rows of the previously shown execution so the table
            # agrees with the now-empty ``self.steps``.
            table.clear()
            self.loaded_log_step_key = None
            logs.clear()
            logs.write(str(exc))
            return
        selected_step_index = 0 if self.steps else None
        self._updating_steps = True
        try:
            table.clear()
            for step_index, step in enumerate(self.steps):
                if preferred_step_name and step.get("StepName", "") == preferred_step_name:
                    selected_step_index = step_index
                table.add_row(
                    step.get("StepName", ""),
                    step.get("StepType", ""),
                    step.get("StepStatus", ""),
                    format_duration(step.get("StartTime"), step.get("EndTime")),
                    _shorten(step.get("FailureReason", ""), 80),
                    key=str(step_index),
                )
            if selected_step_index is not None and table.row_count:
                # The log pane is handled explicitly below; ignore the
                # highlight event caused by this cursor move.
                self._suppress_next_step_highlight = True
                table.move_cursor(row=selected_step_index, scroll=False)
        finally:
            self._updating_steps = False
        if preserve_logs and self._logs_match_selection():
            return
        self.loaded_log_step_key = None
        self.render_step_failure_or_hint()

    def render_step_failure_or_hint(self) -> None:
        """Fill the log pane with a summary of the selected step.

        Shows a placeholder when no step is selected, the failure reason plus
        a hint about log availability for a failed step, and a one-line
        status otherwise.  Does nothing when the pane already holds the
        loaded log tail of the selected step.
        """
        if self._logs_match_selection():
            return
        logs = self.query_one("#logs", RichLog)
        logs.clear()
        step = self.selected_step()
        if step is None:
            logs.write("Select a step to view details.")
            return
        failure = step.get("FailureReason")
        if failure:
            # Plain text: the log widget is created without markup support,
            # so markup tags would be displayed literally.
            logs.write(f"Failure: {failure}")
            source = infer_log_source(step)
            if source:
                logs.write("Press l to load CloudWatch log tail.")
            else:
                logs.write("This step type has no supported CloudWatch log source yet.")
            return
        logs.write(
            f"{step.get('StepName', '')} {step.get('StepStatus', '')} "
            f"{format_duration(step.get('StartTime'), step.get('EndTime'))}"
        )

    def load_selected_step_logs(self) -> None:
        """Replace the log pane with the log tail of the selected step.

        An ``AwsCliError`` raised while fetching the tail is written to the
        pane instead of propagating.
        """
        ctx = self.selected_context
        step = self.selected_step()
        logs = self.query_one("#logs", RichLog)
        logs.clear()
        if ctx is None or step is None:
            self.loaded_log_step_key = None
            logs.write("Select a pipeline step first.")
            return
        # Recorded before fetching so the pane is treated as belonging to
        # this step even if the fetch fails and only the error is shown.
        self.loaded_log_step_key = self.selected_step_key()
        failure = step.get("FailureReason")
        if failure:
            # Plain text: see ``render_step_failure_or_hint``.
            logs.write(f"Failure: {failure}")
        try:
            for line in tail_step_logs(ctx, step):
                logs.write(line)
        except AwsCliError as exc:
            logs.write(str(exc))

    def selected_step(self) -> dict[str, Any] | None:
        """Return the step under the steps-table cursor, or ``None``."""
        table = self.query_one("#steps", DataTable)
        if not self.steps or table.cursor_row >= len(self.steps):
            return None
        return self.steps[table.cursor_row]

    def selected_step_name(self) -> str | None:
        """Return the ``StepName`` of the selected step (``""`` if absent), or ``None``."""
        step = self.selected_step()
        if step is None:
            return None
        return step.get("StepName", "")

    def selected_step_key(self) -> tuple[str, str] | None:
        """Return ``(execution ARN, step name)`` for the selection, or ``None``.

        ``None`` is returned when no execution is selected or the selected
        step has no (non-empty) name.
        """
        step_name = self.selected_step_name()
        if not self.selected_execution_arn or not step_name:
            return None
        return self.selected_execution_arn, step_name
|
|
452
|
+
|
|
453
|
+
|
|
454
|
+
def _shorten(value: str, max_len: int) -> str:
|
|
455
|
+
if len(value) <= max_len:
|
|
456
|
+
return value
|
|
457
|
+
return value[: max_len - 1] + "…"
|
|
458
|
+
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sagemaker-ops-cli
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: CLI and TUI for submitting and monitoring Amazon SageMaker Processing Jobs and Pipelines.
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Description-Content-Type: text/markdown
|
|
7
|
+
Requires-Dist: boto3>=1.34.0
|
|
8
|
+
Requires-Dist: botocore>=1.34.0
|
|
9
|
+
Requires-Dist: rich>=13.7.0
|
|
10
|
+
Requires-Dist: typer>=0.12.0
|
|
11
|
+
Requires-Dist: textual>=0.58.0
|
|
12
|
+
Provides-Extra: yaml
|
|
13
|
+
Requires-Dist: PyYAML>=6.0.0; extra == "yaml"
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: build>=1.2.0; extra == "dev"
|
|
16
|
+
Requires-Dist: moto[logs,sagemaker]>=5.0.0; extra == "dev"
|
|
17
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
18
|
+
Requires-Dist: ruff>=0.5.0; extra == "dev"
|
|
19
|
+
Requires-Dist: twine>=5.0.0; extra == "dev"
|
|
20
|
+
|
|
21
|
+
# SageMaker Ops CLI
|
|
22
|
+
|
|
23
|
+
`smops` 是一个面向 SageMaker Processing Job 和 SageMaker Pipeline 的命令行工具:
|
|
24
|
+
|
|
25
|
+
- 提交 SageMaker Processing Job
|
|
26
|
+
- 启动 SageMaker Pipeline execution
|
|
27
|
+
- 用 TUI 查看正在运行的 Processing Jobs
|
|
28
|
+
- 用 TUI 查看正在运行和最近结束的 Pipeline executions、steps 状态和失败 step 的 CloudWatch 日志尾部
|
|
29
|
+
- 支持单个、多个或所有 AWS profiles
|
|
30
|
+
|
|
31
|
+
## 安装
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
python -m venv .venv
|
|
35
|
+
source .venv/bin/activate
|
|
36
|
+
pip install -e .
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
从 GitHub 直接安装:
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install git+https://github.com/southpolemonkey/smops.git
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
从本地 wheel 安装:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install dist/sagemaker_ops_cli-0.1.0-py3-none-any.whl
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
如果要读取 YAML 配置:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install -e '.[yaml]'
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## 构建 Python 包
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
pip install -e '.[dev]'
|
|
61
|
+
python -m build
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
构建产物会输出到 `dist/`:
|
|
65
|
+
|
|
66
|
+
- `sagemaker_ops_cli-0.1.0-py3-none-any.whl`
|
|
67
|
+
- `sagemaker_ops_cli-0.1.0.tar.gz`
|
|
68
|
+
|
|
69
|
+
## 提交 Processing Job
|
|
70
|
+
|
|
71
|
+
配置文件直接使用 boto3 `create_processing_job` 的参数结构。
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
smops processing submit \
|
|
75
|
+
--profile dev \
|
|
76
|
+
--region us-east-1 \
|
|
77
|
+
--config examples/processing-job.json
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
只检查请求内容,不提交:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
smops processing submit --config examples/processing-job.json --dry-run
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
## 启动 Pipeline
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
smops pipeline start \
|
|
90
|
+
--profile dev \
|
|
91
|
+
--region us-east-1 \
|
|
92
|
+
--name my-pipeline \
|
|
93
|
+
--display-name manual-run-001 \
|
|
94
|
+
--parameter InputDate=2026-06-30 \
|
|
95
|
+
--parameter Mode=prod
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## TUI 查看 Processing Jobs
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
smops tui processing --profile dev --region us-east-1
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
多个 profile:
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
smops tui processing --profile dev --profile prod --region us-east-1
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
所有 profile:
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
smops tui processing --all-profiles
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
快捷键:
|
|
117
|
+
|
|
118
|
+
- `↑/↓` 或 `←/→` 切换 job
|
|
119
|
+
- `r` 刷新
|
|
120
|
+
- `q` 退出
|
|
121
|
+
|
|
122
|
+
## TUI 查看 Pipelines
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
smops tui pipelines --profile dev --region us-east-1
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
只看某个 pipeline:
|
|
129
|
+
|
|
130
|
+
```bash
|
|
131
|
+
smops tui pipelines --profile dev --region us-east-1 --name my-pipeline
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
默认会显示正在运行的 executions,以及最近 3 小时内结束的 executions,方便查看成功/失败结果。可以用 `--hours` 调整窗口:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
smops tui pipelines --profile dev --region us-east-1 --name my-pipeline --hours 6
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
快捷键:
|
|
141
|
+
|
|
142
|
+
- `←/→` 在 executions 和 steps 面板之间切换
|
|
143
|
+
- `↑/↓` 移动当前面板选中行
|
|
144
|
+
- `l` 加载选中失败 step 的 CloudWatch 日志尾部
|
|
145
|
+
- `r` 刷新
|
|
146
|
+
- `q` 退出
|
|
147
|
+
|
|
148
|
+
目前自动支持这些 step 的日志定位:
|
|
149
|
+
|
|
150
|
+
- ProcessingJob: `/aws/sagemaker/ProcessingJobs`
|
|
151
|
+
- TrainingJob: `/aws/sagemaker/TrainingJobs`
|
|
152
|
+
- TransformJob: `/aws/sagemaker/TransformJobs`
|
|
153
|
+
|
|
154
|
+
## 非交互式查看
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
smops processing list --profile dev --region us-east-1
|
|
158
|
+
smops pipeline list --profile dev --region us-east-1
|
|
159
|
+
smops pipeline list --profile dev --region us-east-1 --name my-pipeline --hours 6
|
|
160
|
+
smops pipeline steps --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:...
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
`processing list` 默认每页读取 20 个 running jobs。输出 `Next token` 时,用它继续翻页:
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
smops processing list --profile dev --region us-east-1 --max-results 20
|
|
167
|
+
smops processing list --profile dev --region us-east-1 --max-results 20 --next-token '<token>'
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
`pipeline list` 不传 `--name` 时默认每页只扫描 10 个 pipelines,避免真实账号里 pipelines 很多时卡住。输出 `Next token` 时,用它继续翻页:
|
|
171
|
+
|
|
172
|
+
```bash
|
|
173
|
+
smops pipeline list --profile dev --region us-east-1 --pipeline-page-size 10
|
|
174
|
+
smops pipeline list --profile dev --region us-east-1 --pipeline-page-size 10 --next-token '<token>'
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## AWS 权限
|
|
178
|
+
|
|
179
|
+
运行 `smops` 所用的 AWS 身份(IAM 用户或角色)至少需要具备以下权限:
|
|
180
|
+
|
|
181
|
+
- `sagemaker:CreateProcessingJob`
|
|
182
|
+
- `sagemaker:StartPipelineExecution`
|
|
183
|
+
- `sagemaker:ListProcessingJobs`
|
|
184
|
+
- `sagemaker:DescribeProcessingJob`
|
|
185
|
+
- `sagemaker:ListPipelines`
|
|
186
|
+
- `sagemaker:ListPipelineExecutions`
|
|
187
|
+
- `sagemaker:DescribePipelineExecution`
|
|
188
|
+
- `sagemaker:ListPipelineExecutionSteps`
|
|
189
|
+
- `logs:DescribeLogStreams`
|
|
190
|
+
- `logs:GetLogEvents`
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
## Mock AWS Profile
|
|
194
|
+
|
|
195
|
+
仓库里提供了一套 mock AWS 配置,方便本地演示 profile 切换和 CLI 参数解析,不会写入真实 `~/.aws`:
|
|
196
|
+
|
|
197
|
+
```bash
|
|
198
|
+
export AWS_CONFIG_FILE=examples/aws/config
|
|
199
|
+
export AWS_SHARED_CREDENTIALS_FILE=examples/aws/credentials
|
|
200
|
+
export AWS_PROFILE=mock-dev
|
|
201
|
+
export AWS_DEFAULT_REGION=us-east-1
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
也可以直接加载样例环境变量:
|
|
205
|
+
|
|
206
|
+
```bash
|
|
207
|
+
set -a
|
|
208
|
+
source examples/aws/mock.env
|
|
209
|
+
set +a
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
然后运行:
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
smops processing submit --config examples/processing-job.json --dry-run
|
|
216
|
+
smops processing list --profile mock-dev
|
|
217
|
+
smops tui processing --profile mock-dev
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+
注意:这套 credentials 是 dummy 值,只适合 dry-run、mock、本地端点或配合 botocore Stubber/moto 使用;直接访问真实 AWS 会认证失败。
|
|
221
|
+
|
|
222
|
+
## E2E 测试
|
|
223
|
+
|
|
224
|
+
测试使用 `moto` 模拟 AWS SageMaker 和 CloudWatch Logs,不会访问真实 AWS:
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
pip install -e '.[dev]'
|
|
228
|
+
pytest
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
覆盖范围包括:
|
|
232
|
+
|
|
233
|
+
- Processing Job 提交和 running job 分页列表
|
|
234
|
+
- Pipeline execution 启动和 active/recent execution 列表
|
|
235
|
+
- Pipeline steps 状态展示
|
|
236
|
+
- 失败 step 的 CloudWatch Logs tail
|
|
237
|
+
- Processing Job TUI 的上下左右键导航
|
|
238
|
+
- Pipeline TUI 的 executions、steps 和失败日志加载
|
|
239
|
+
- 多 AWS profile 解析
|
|
240
|
+
|
|
241
|
+
Moto 目前还没有实现 `list_pipeline_execution_steps`,测试里对这一个 paginator 做了内存 fake,其余 SageMaker/Logs 调用都在 moto 环境中执行。
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
sagemaker_ops/__init__.py,sha256=cNBPNnVf26SYJQHBnRMAVEN8lsbBqzz-9mubqkdJcXQ,56
|
|
2
|
+
sagemaker_ops/aws.py,sha256=qTW0AVB3JoGqSWxjX0UK-zxyo3DUx-3v7chdVms7zSs,17718
|
|
3
|
+
sagemaker_ops/cli.py,sha256=kYnf7q_eD2qRPgUB2vc1CHpyUbpC-Kg4LECbV9mUfz4,12173
|
|
4
|
+
sagemaker_ops/tui.py,sha256=5aDBxQnF3U3-XmjDUnVhxbLlTy4yXlSdu1z6Rfvng2M,16452
|
|
5
|
+
sagemaker_ops_cli-0.1.0.dist-info/METADATA,sha256=LDQzO25kmo02iCXOOwZGBupfuTJzlpZIPvAw0d6A0iQ,6214
|
|
6
|
+
sagemaker_ops_cli-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
7
|
+
sagemaker_ops_cli-0.1.0.dist-info/entry_points.txt,sha256=GwMPaX4XAFwRWaAbot6pSvXtXYc9m5zDj1uWTRFszCA,48
|
|
8
|
+
sagemaker_ops_cli-0.1.0.dist-info/top_level.txt,sha256=2ajnfMn6IWi-PBRpeuatLRUhcLomdOd_s9cmVrX2V9o,14
|
|
9
|
+
sagemaker_ops_cli-0.1.0.dist-info/RECORD,,
|