sagemaker-ops-cli 0.1.1__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/PKG-INFO +69 -4
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/README.md +68 -3
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/pyproject.toml +1 -1
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/sagemaker_ops/__init__.py +1 -1
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/sagemaker_ops/aws.py +180 -13
- sagemaker_ops_cli-0.2.0/sagemaker_ops/cli.py +586 -0
- sagemaker_ops_cli-0.2.0/sagemaker_ops/config.py +57 -0
- sagemaker_ops_cli-0.2.0/sagemaker_ops/tui.py +882 -0
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/sagemaker_ops_cli.egg-info/PKG-INFO +69 -4
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/sagemaker_ops_cli.egg-info/SOURCES.txt +1 -0
- sagemaker_ops_cli-0.2.0/tests/test_e2e_moto.py +711 -0
- sagemaker_ops_cli-0.1.1/sagemaker_ops/cli.py +0 -262
- sagemaker_ops_cli-0.1.1/sagemaker_ops/tui.py +0 -458
- sagemaker_ops_cli-0.1.1/tests/test_e2e_moto.py +0 -298
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/sagemaker_ops_cli.egg-info/dependency_links.txt +0 -0
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/sagemaker_ops_cli.egg-info/entry_points.txt +0 -0
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/sagemaker_ops_cli.egg-info/requires.txt +0 -0
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/sagemaker_ops_cli.egg-info/top_level.txt +0 -0
- {sagemaker_ops_cli-0.1.1 → sagemaker_ops_cli-0.2.0}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: sagemaker-ops-cli
|
|
3
|
-
Version: 0.1.1
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: CLI and TUI for submitting and monitoring Amazon SageMaker Processing Jobs and Pipelines.
|
|
5
5
|
Requires-Python: >=3.10
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -54,7 +54,7 @@ pip install git+https://github.com/southpolemonkey/smops.git
|
|
|
54
54
|
Install from a local wheel:
|
|
55
55
|
|
|
56
56
|
```bash
|
|
57
|
-
pip install dist/sagemaker_ops_cli-0.1.1-py3-none-any.whl
|
|
57
|
+
pip install dist/sagemaker_ops_cli-0.2.0-py3-none-any.whl
|
|
58
58
|
```
|
|
59
59
|
|
|
60
60
|
Install with Homebrew:
|
|
@@ -85,6 +85,29 @@ To enable YAML config files:
|
|
|
85
85
|
pip install -e '.[yaml]'
|
|
86
86
|
```
|
|
87
87
|
|
|
88
|
+
## Defaults
|
|
89
|
+
|
|
90
|
+
Set a default AWS region once so you do not need to pass `--region` on every command:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
smops config set-region ap-southeast-2
|
|
94
|
+
smops config get-region
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
The config file is stored at `~/.config/smops/config.json` by default. You can inspect it with:
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
smops config show
|
|
101
|
+
smops config path
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Region resolution order is:
|
|
105
|
+
|
|
106
|
+
1. `--region`
|
|
107
|
+
2. `SMOPS_DEFAULT_REGION`
|
|
108
|
+
3. `smops config set-region ...`
|
|
109
|
+
4. Region configured on the selected AWS profile
|
|
110
|
+
|
|
88
111
|
## Build The Python Package
|
|
89
112
|
|
|
90
113
|
```bash
|
|
@@ -94,8 +117,8 @@ python -m build
|
|
|
94
117
|
|
|
95
118
|
Build artifacts are written to `dist/`:
|
|
96
119
|
|
|
97
|
-
- `sagemaker_ops_cli-0.1.1-py3-none-any.whl`
|
|
98
|
-
- `sagemaker_ops_cli-0.1.1.tar.gz`
|
|
120
|
+
- `sagemaker_ops_cli-0.2.0-py3-none-any.whl`
|
|
121
|
+
- `sagemaker_ops_cli-0.2.0.tar.gz`
|
|
99
122
|
|
|
100
123
|
## Submit A Processing Job
|
|
101
124
|
|
|
@@ -126,6 +149,23 @@ smops pipeline start \
|
|
|
126
149
|
--parameter Mode=prod
|
|
127
150
|
```
|
|
128
151
|
|
|
152
|
+
## Interactive TUI
|
|
153
|
+
|
|
154
|
+
Open the TUI selector and choose between Pipelines and Processing Jobs:
|
|
155
|
+
|
|
156
|
+
```bash
|
|
157
|
+
smops tui --profile dev
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Inside the TUI:
|
|
161
|
+
|
|
162
|
+
- `p` / `P`: switch to the next AWS profile from your local AWS config
|
|
163
|
+
- `s`: start a pipeline or submit a processing job from the current TUI
|
|
164
|
+
- `r`: refresh
|
|
165
|
+
- `q`: quit
|
|
166
|
+
|
|
167
|
+
For pipeline starts, enter the pipeline name, optional display name, and optional comma-separated parameters such as `InputDate=2026-07-01,Mode=test`. For processing job submits, enter the path to a JSON/YAML config file using the same structure as boto3 `create_processing_job`.
|
|
168
|
+
|
|
129
169
|
## Processing Jobs TUI
|
|
130
170
|
|
|
131
171
|
```bash
|
|
@@ -147,6 +187,8 @@ smops tui processing --all-profiles
|
|
|
147
187
|
Keyboard shortcuts:
|
|
148
188
|
|
|
149
189
|
- `Up` / `Down` or `Left` / `Right`: switch jobs
|
|
190
|
+
- `p` / `P`: switch to the next AWS profile
|
|
191
|
+
- `s`: submit a Processing Job from a JSON/YAML config file
|
|
150
192
|
- `r`: refresh
|
|
151
193
|
- `q`: quit
|
|
152
194
|
|
|
@@ -172,6 +214,8 @@ Keyboard shortcuts:
|
|
|
172
214
|
|
|
173
215
|
- `Left` / `Right`: switch focus between the executions and steps panels
|
|
174
216
|
- `Up` / `Down`: move within the focused panel
|
|
217
|
+
- `p` / `P`: switch to the next AWS profile
|
|
218
|
+
- `s`: start a Pipeline execution
|
|
175
219
|
- `l`: load the CloudWatch log tail for the selected failed step
|
|
176
220
|
- `r`: refresh
|
|
177
221
|
- `q`: quit
|
|
@@ -186,11 +230,32 @@ Log discovery is currently supported for these step job types:
|
|
|
186
230
|
|
|
187
231
|
```bash
|
|
188
232
|
smops processing list --profile dev --region us-east-1
|
|
233
|
+
smops processing wait --profile dev --region us-east-1 --name my-processing-job
|
|
189
234
|
smops pipeline list --profile dev --region us-east-1
|
|
190
235
|
smops pipeline list --profile dev --region us-east-1 --name my-pipeline --hours 6
|
|
191
236
|
smops pipeline steps --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:...
|
|
237
|
+
smops pipeline wait --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:...
|
|
238
|
+
smops pipeline inspect --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:...
|
|
239
|
+
smops pipeline diagnose --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:...
|
|
192
240
|
```
|
|
193
241
|
|
|
242
|
+
Most non-interactive commands support `--json` for agents and automation:
|
|
243
|
+
|
|
244
|
+
```bash
|
|
245
|
+
smops processing list --profile dev --region us-east-1 --json
|
|
246
|
+
smops processing wait --profile dev --region us-east-1 --name my-processing-job --json
|
|
247
|
+
smops pipeline start --profile dev --region us-east-1 --name my-pipeline --json
|
|
248
|
+
smops pipeline list --profile dev --region us-east-1 --json
|
|
249
|
+
smops pipeline steps --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:... --json
|
|
250
|
+
smops pipeline wait --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:... --json
|
|
251
|
+
smops pipeline inspect --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:... --json
|
|
252
|
+
smops pipeline diagnose --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:... --json
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
JSON responses use a stable envelope. Successful commands return `status: "ok"`; errors return `status: "error"` and a user-facing `error` message. List commands return `items`, `count`, and `next_token`.
|
|
256
|
+
|
|
257
|
+
`pipeline inspect` returns execution details, all steps, and failed steps. `pipeline diagnose` extends that with the first failed step, inferred SageMaker job type/name, CloudWatch log group and stream prefix, log tail, and suggested next actions.
|
|
258
|
+
|
|
194
259
|
`processing list` reads 20 running jobs per page by default. If the output includes `Next token`, pass it to fetch the next page:
|
|
195
260
|
|
|
196
261
|
```bash
|
|
@@ -34,7 +34,7 @@ pip install git+https://github.com/southpolemonkey/smops.git
|
|
|
34
34
|
Install from a local wheel:
|
|
35
35
|
|
|
36
36
|
```bash
|
|
37
|
-
pip install dist/sagemaker_ops_cli-0.1.1-py3-none-any.whl
|
|
37
|
+
pip install dist/sagemaker_ops_cli-0.2.0-py3-none-any.whl
|
|
38
38
|
```
|
|
39
39
|
|
|
40
40
|
Install with Homebrew:
|
|
@@ -65,6 +65,29 @@ To enable YAML config files:
|
|
|
65
65
|
pip install -e '.[yaml]'
|
|
66
66
|
```
|
|
67
67
|
|
|
68
|
+
## Defaults
|
|
69
|
+
|
|
70
|
+
Set a default AWS region once so you do not need to pass `--region` on every command:
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
smops config set-region ap-southeast-2
|
|
74
|
+
smops config get-region
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
The config file is stored at `~/.config/smops/config.json` by default. You can inspect it with:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
smops config show
|
|
81
|
+
smops config path
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Region resolution order is:
|
|
85
|
+
|
|
86
|
+
1. `--region`
|
|
87
|
+
2. `SMOPS_DEFAULT_REGION`
|
|
88
|
+
3. `smops config set-region ...`
|
|
89
|
+
4. Region configured on the selected AWS profile
|
|
90
|
+
|
|
68
91
|
## Build The Python Package
|
|
69
92
|
|
|
70
93
|
```bash
|
|
@@ -74,8 +97,8 @@ python -m build
|
|
|
74
97
|
|
|
75
98
|
Build artifacts are written to `dist/`:
|
|
76
99
|
|
|
77
|
-
- `sagemaker_ops_cli-0.1.1-py3-none-any.whl`
|
|
78
|
-
- `sagemaker_ops_cli-0.1.1.tar.gz`
|
|
100
|
+
- `sagemaker_ops_cli-0.2.0-py3-none-any.whl`
|
|
101
|
+
- `sagemaker_ops_cli-0.2.0.tar.gz`
|
|
79
102
|
|
|
80
103
|
## Submit A Processing Job
|
|
81
104
|
|
|
@@ -106,6 +129,23 @@ smops pipeline start \
|
|
|
106
129
|
--parameter Mode=prod
|
|
107
130
|
```
|
|
108
131
|
|
|
132
|
+
## Interactive TUI
|
|
133
|
+
|
|
134
|
+
Open the TUI selector and choose between Pipelines and Processing Jobs:
|
|
135
|
+
|
|
136
|
+
```bash
|
|
137
|
+
smops tui --profile dev
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Inside the TUI:
|
|
141
|
+
|
|
142
|
+
- `p` / `P`: switch to the next AWS profile from your local AWS config
|
|
143
|
+
- `s`: start a pipeline or submit a processing job from the current TUI
|
|
144
|
+
- `r`: refresh
|
|
145
|
+
- `q`: quit
|
|
146
|
+
|
|
147
|
+
For pipeline starts, enter the pipeline name, optional display name, and optional comma-separated parameters such as `InputDate=2026-07-01,Mode=test`. For processing job submits, enter the path to a JSON/YAML config file using the same structure as boto3 `create_processing_job`.
|
|
148
|
+
|
|
109
149
|
## Processing Jobs TUI
|
|
110
150
|
|
|
111
151
|
```bash
|
|
@@ -127,6 +167,8 @@ smops tui processing --all-profiles
|
|
|
127
167
|
Keyboard shortcuts:
|
|
128
168
|
|
|
129
169
|
- `Up` / `Down` or `Left` / `Right`: switch jobs
|
|
170
|
+
- `p` / `P`: switch to the next AWS profile
|
|
171
|
+
- `s`: submit a Processing Job from a JSON/YAML config file
|
|
130
172
|
- `r`: refresh
|
|
131
173
|
- `q`: quit
|
|
132
174
|
|
|
@@ -152,6 +194,8 @@ Keyboard shortcuts:
|
|
|
152
194
|
|
|
153
195
|
- `Left` / `Right`: switch focus between the executions and steps panels
|
|
154
196
|
- `Up` / `Down`: move within the focused panel
|
|
197
|
+
- `p` / `P`: switch to the next AWS profile
|
|
198
|
+
- `s`: start a Pipeline execution
|
|
155
199
|
- `l`: load the CloudWatch log tail for the selected failed step
|
|
156
200
|
- `r`: refresh
|
|
157
201
|
- `q`: quit
|
|
@@ -166,11 +210,32 @@ Log discovery is currently supported for these step job types:
|
|
|
166
210
|
|
|
167
211
|
```bash
|
|
168
212
|
smops processing list --profile dev --region us-east-1
|
|
213
|
+
smops processing wait --profile dev --region us-east-1 --name my-processing-job
|
|
169
214
|
smops pipeline list --profile dev --region us-east-1
|
|
170
215
|
smops pipeline list --profile dev --region us-east-1 --name my-pipeline --hours 6
|
|
171
216
|
smops pipeline steps --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:...
|
|
217
|
+
smops pipeline wait --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:...
|
|
218
|
+
smops pipeline inspect --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:...
|
|
219
|
+
smops pipeline diagnose --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:...
|
|
172
220
|
```
|
|
173
221
|
|
|
222
|
+
Most non-interactive commands support `--json` for agents and automation:
|
|
223
|
+
|
|
224
|
+
```bash
|
|
225
|
+
smops processing list --profile dev --region us-east-1 --json
|
|
226
|
+
smops processing wait --profile dev --region us-east-1 --name my-processing-job --json
|
|
227
|
+
smops pipeline start --profile dev --region us-east-1 --name my-pipeline --json
|
|
228
|
+
smops pipeline list --profile dev --region us-east-1 --json
|
|
229
|
+
smops pipeline steps --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:... --json
|
|
230
|
+
smops pipeline wait --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:... --json
|
|
231
|
+
smops pipeline inspect --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:... --json
|
|
232
|
+
smops pipeline diagnose --profile dev --region us-east-1 --execution-arn arn:aws:sagemaker:... --json
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
JSON responses use a stable envelope. Successful commands return `status: "ok"`; errors return `status: "error"` and a user-facing `error` message. List commands return `items`, `count`, and `next_token`.
|
|
236
|
+
|
|
237
|
+
`pipeline inspect` returns execution details, all steps, and failed steps. `pipeline diagnose` extends that with the first failed step, inferred SageMaker job type/name, CloudWatch log group and stream prefix, log tail, and suggested next actions.
|
|
238
|
+
|
|
174
239
|
`processing list` reads 20 running jobs per page by default. If the output includes `Next token`, pass it to fetch the next page:
|
|
175
240
|
|
|
176
241
|
```bash
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "sagemaker-ops-cli"
|
|
7
|
-
version = "0.1.1"
|
|
7
|
+
version = "0.2.0"
|
|
8
8
|
description = "CLI and TUI for submitting and monitoring Amazon SageMaker Processing Jobs and Pipelines."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.10"
|
|
@@ -3,6 +3,8 @@ from __future__ import annotations
|
|
|
3
3
|
import base64
|
|
4
4
|
import binascii
|
|
5
5
|
import json
|
|
6
|
+
import time
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
6
8
|
from dataclasses import dataclass
|
|
7
9
|
from datetime import datetime, timedelta, timezone
|
|
8
10
|
from pathlib import Path
|
|
@@ -14,6 +16,8 @@ from botocore.exceptions import BotoCoreError, ClientError
|
|
|
14
16
|
|
|
15
17
|
ACTIVE_PROCESSING_STATUSES = ("InProgress", "Stopping")
|
|
16
18
|
ACTIVE_PIPELINE_STATUSES = ("Executing", "Stopping")
|
|
19
|
+
TERMINAL_PROCESSING_STATUSES = ("Completed", "Failed", "Stopped")
|
|
20
|
+
TERMINAL_PIPELINE_STATUSES = ("Succeeded", "Failed", "Stopped")
|
|
17
21
|
|
|
18
22
|
|
|
19
23
|
class AwsCliError(RuntimeError):
|
|
@@ -102,6 +106,13 @@ def parse_parameters(items: Iterable[str]) -> list[dict[str, str]]:
|
|
|
102
106
|
return parameters
|
|
103
107
|
|
|
104
108
|
|
|
109
|
+
def available_profiles() -> list[str]:
|
|
110
|
+
try:
|
|
111
|
+
return list(boto3.Session().available_profiles)
|
|
112
|
+
except BotoCoreError as exc:
|
|
113
|
+
raise AwsCliError(f"读取 AWS profiles 失败: {exc}") from exc
|
|
114
|
+
|
|
115
|
+
|
|
105
116
|
def build_contexts(
|
|
106
117
|
profiles: tuple[str, ...],
|
|
107
118
|
region: str | None,
|
|
@@ -143,6 +154,32 @@ def submit_processing_job(ctx: AwsContext, spec: dict[str, Any]) -> dict[str, An
|
|
|
143
154
|
raise AwsCliError(f"提交 processing job 失败: {exc}") from exc
|
|
144
155
|
|
|
145
156
|
|
|
157
|
+
def describe_processing_job(ctx: AwsContext, job_name: str) -> ProcessingJobView:
|
|
158
|
+
try:
|
|
159
|
+
detail = ctx.sagemaker.describe_processing_job(ProcessingJobName=job_name)
|
|
160
|
+
except (BotoCoreError, ClientError) as exc:
|
|
161
|
+
raise AwsCliError(f"读取 processing job 失败 job={job_name}: {exc}") from exc
|
|
162
|
+
return _processing_job_view_from_detail(ctx, detail)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def wait_processing_job(
|
|
166
|
+
ctx: AwsContext,
|
|
167
|
+
job_name: str,
|
|
168
|
+
timeout_seconds: int = 3600,
|
|
169
|
+
poll_seconds: int = 30,
|
|
170
|
+
) -> ProcessingJobView:
|
|
171
|
+
deadline = time.monotonic() + max(0, timeout_seconds)
|
|
172
|
+
poll_seconds = max(1, poll_seconds)
|
|
173
|
+
|
|
174
|
+
while True:
|
|
175
|
+
job = describe_processing_job(ctx, job_name)
|
|
176
|
+
if job.status in TERMINAL_PROCESSING_STATUSES:
|
|
177
|
+
return job
|
|
178
|
+
if time.monotonic() >= deadline:
|
|
179
|
+
raise AwsCliError(f"等待 processing job 超时 job={job_name} status={job.status}")
|
|
180
|
+
time.sleep(min(poll_seconds, max(0.0, deadline - time.monotonic())))
|
|
181
|
+
|
|
182
|
+
|
|
146
183
|
def start_pipeline_execution(
|
|
147
184
|
ctx: AwsContext,
|
|
148
185
|
pipeline_name: str,
|
|
@@ -214,13 +251,22 @@ def _processing_job_view_from_summary(ctx: AwsContext, summary: dict[str, Any])
|
|
|
214
251
|
detail = ctx.sagemaker.describe_processing_job(ProcessingJobName=name)
|
|
215
252
|
except (BotoCoreError, ClientError):
|
|
216
253
|
detail = summary
|
|
254
|
+
return _processing_job_view_from_detail(ctx, detail, fallback=summary)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _processing_job_view_from_detail(
|
|
258
|
+
ctx: AwsContext,
|
|
259
|
+
detail: dict[str, Any],
|
|
260
|
+
fallback: dict[str, Any] | None = None,
|
|
261
|
+
) -> ProcessingJobView:
|
|
262
|
+
fallback = fallback or {}
|
|
217
263
|
cluster = detail.get("ProcessingResources", {}).get("ClusterConfig", {})
|
|
218
264
|
return ProcessingJobView(
|
|
219
265
|
profile=ctx.profile,
|
|
220
266
|
region=ctx.region,
|
|
221
|
-
name=
|
|
222
|
-
status=detail.get("ProcessingJobStatus",
|
|
223
|
-
creation_time=detail.get("CreationTime",
|
|
267
|
+
name=detail.get("ProcessingJobName", fallback.get("ProcessingJobName", "")),
|
|
268
|
+
status=detail.get("ProcessingJobStatus", fallback.get("ProcessingJobStatus", "")),
|
|
269
|
+
creation_time=detail.get("CreationTime", fallback.get("CreationTime")),
|
|
224
270
|
last_modified_time=detail.get("LastModifiedTime"),
|
|
225
271
|
started_time=detail.get("ProcessingStartTime"),
|
|
226
272
|
ended_time=detail.get("ProcessingEndTime"),
|
|
@@ -228,7 +274,7 @@ def _processing_job_view_from_summary(ctx: AwsContext, summary: dict[str, Any])
|
|
|
228
274
|
instance_count=cluster.get("InstanceCount"),
|
|
229
275
|
role_arn=detail.get("RoleArn", ""),
|
|
230
276
|
failure_reason=detail.get("FailureReason", ""),
|
|
231
|
-
arn=detail.get("ProcessingJobArn",
|
|
277
|
+
arn=detail.get("ProcessingJobArn", fallback.get("ProcessingJobArn", "")),
|
|
232
278
|
)
|
|
233
279
|
|
|
234
280
|
|
|
@@ -281,10 +327,7 @@ def list_pipeline_executions_page(
|
|
|
281
327
|
next_token=next_token,
|
|
282
328
|
)
|
|
283
329
|
cutoff = datetime.now(timezone.utc) - timedelta(hours=recent_hours)
|
|
284
|
-
executions
|
|
285
|
-
|
|
286
|
-
for name in names:
|
|
287
|
-
executions.extend(_list_recent_pipeline_executions_for_name(ctx, name, per_pipeline, cutoff))
|
|
330
|
+
executions = _list_recent_pipeline_executions(ctx, names, per_pipeline, cutoff)
|
|
288
331
|
|
|
289
332
|
return PipelineExecutionsPage(
|
|
290
333
|
executions=sorted(
|
|
@@ -296,6 +339,29 @@ def list_pipeline_executions_page(
|
|
|
296
339
|
)
|
|
297
340
|
|
|
298
341
|
|
|
342
|
+
def _list_recent_pipeline_executions(
|
|
343
|
+
ctx: AwsContext,
|
|
344
|
+
pipeline_names: list[str],
|
|
345
|
+
per_pipeline: int,
|
|
346
|
+
cutoff: datetime,
|
|
347
|
+
) -> list[PipelineExecutionView]:
|
|
348
|
+
if not pipeline_names:
|
|
349
|
+
return []
|
|
350
|
+
if len(pipeline_names) == 1:
|
|
351
|
+
return _list_recent_pipeline_executions_for_name(ctx, pipeline_names[0], per_pipeline, cutoff)
|
|
352
|
+
|
|
353
|
+
executions: list[PipelineExecutionView] = []
|
|
354
|
+
workers = min(8, len(pipeline_names))
|
|
355
|
+
with ThreadPoolExecutor(max_workers=workers) as executor:
|
|
356
|
+
futures = [
|
|
357
|
+
executor.submit(_list_recent_pipeline_executions_for_name, ctx, name, per_pipeline, cutoff)
|
|
358
|
+
for name in pipeline_names
|
|
359
|
+
]
|
|
360
|
+
for future in as_completed(futures):
|
|
361
|
+
executions.extend(future.result())
|
|
362
|
+
return executions
|
|
363
|
+
|
|
364
|
+
|
|
299
365
|
def _list_recent_pipeline_executions_for_name(
|
|
300
366
|
ctx: AwsContext,
|
|
301
367
|
pipeline_name: str,
|
|
@@ -317,18 +383,17 @@ def _list_recent_pipeline_executions_for_name(
|
|
|
317
383
|
for summary in response.get("PipelineExecutionSummaries", []):
|
|
318
384
|
status = summary.get("PipelineExecutionStatus", "")
|
|
319
385
|
execution_arn = summary.get("PipelineExecutionArn", "")
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
last_modified_time = detail.get("LastModifiedTime", summary.get("LastModifiedTime"))
|
|
386
|
+
start_time = summary.get("StartTime")
|
|
387
|
+
last_modified_time = summary.get("LastModifiedTime")
|
|
323
388
|
if not _should_show_pipeline_execution(status, start_time, last_modified_time, cutoff):
|
|
324
389
|
continue
|
|
325
390
|
executions.append(
|
|
326
391
|
PipelineExecutionView(
|
|
327
392
|
profile=ctx.profile,
|
|
328
393
|
region=ctx.region,
|
|
329
|
-
pipeline_name=
|
|
394
|
+
pipeline_name=pipeline_name,
|
|
330
395
|
execution_arn=execution_arn,
|
|
331
|
-
display_name=summary.get("PipelineExecutionDisplayName",
|
|
396
|
+
display_name=summary.get("PipelineExecutionDisplayName", ""),
|
|
332
397
|
status=status,
|
|
333
398
|
start_time=start_time,
|
|
334
399
|
last_modified_time=last_modified_time,
|
|
@@ -405,6 +470,71 @@ def describe_pipeline_execution(ctx: AwsContext, execution_arn: str) -> dict[str
|
|
|
405
470
|
raise AwsCliError(f"读取 pipeline execution 失败: {exc}") from exc
|
|
406
471
|
|
|
407
472
|
|
|
473
|
+
def wait_pipeline_execution(
|
|
474
|
+
ctx: AwsContext,
|
|
475
|
+
execution_arn: str,
|
|
476
|
+
timeout_seconds: int = 3600,
|
|
477
|
+
poll_seconds: int = 30,
|
|
478
|
+
) -> dict[str, Any]:
|
|
479
|
+
deadline = time.monotonic() + max(0, timeout_seconds)
|
|
480
|
+
poll_seconds = max(1, poll_seconds)
|
|
481
|
+
|
|
482
|
+
while True:
|
|
483
|
+
detail = describe_pipeline_execution(ctx, execution_arn)
|
|
484
|
+
status = detail.get("PipelineExecutionStatus", "")
|
|
485
|
+
if status in TERMINAL_PIPELINE_STATUSES:
|
|
486
|
+
return detail
|
|
487
|
+
if time.monotonic() >= deadline:
|
|
488
|
+
raise AwsCliError(f"等待 pipeline execution 超时 execution={execution_arn} status={status}")
|
|
489
|
+
time.sleep(min(poll_seconds, max(0.0, deadline - time.monotonic())))
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def inspect_pipeline_execution(ctx: AwsContext, execution_arn: str) -> dict[str, Any]:
|
|
493
|
+
detail = describe_pipeline_execution(ctx, execution_arn)
|
|
494
|
+
detail = {
|
|
495
|
+
**detail,
|
|
496
|
+
"PipelineExecutionArn": detail.get("PipelineExecutionArn") or execution_arn,
|
|
497
|
+
"PipelineName": detail.get("PipelineName") or _pipeline_name_from_execution_arn(execution_arn),
|
|
498
|
+
}
|
|
499
|
+
steps = list_pipeline_steps(ctx, execution_arn)
|
|
500
|
+
failed_steps = [step for step in steps if step.get("StepStatus") == "Failed"]
|
|
501
|
+
return {
|
|
502
|
+
"profile": ctx.profile,
|
|
503
|
+
"region": ctx.region,
|
|
504
|
+
"execution": detail,
|
|
505
|
+
"steps": steps,
|
|
506
|
+
"failed_steps": failed_steps,
|
|
507
|
+
}
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
def diagnose_pipeline_execution(
|
|
511
|
+
ctx: AwsContext,
|
|
512
|
+
execution_arn: str,
|
|
513
|
+
log_limit: int = 80,
|
|
514
|
+
) -> dict[str, Any]:
|
|
515
|
+
inspection = inspect_pipeline_execution(ctx, execution_arn)
|
|
516
|
+
failed_steps = inspection["failed_steps"]
|
|
517
|
+
failed_step = failed_steps[0] if failed_steps else None
|
|
518
|
+
log_source = infer_log_source(failed_step) if failed_step else None
|
|
519
|
+
log_tail = tail_step_logs(ctx, failed_step, limit=log_limit) if failed_step else []
|
|
520
|
+
job_type = None
|
|
521
|
+
job_name = None
|
|
522
|
+
if log_source and failed_step:
|
|
523
|
+
job_type = _step_job_type(failed_step)
|
|
524
|
+
job_name = log_source[1]
|
|
525
|
+
|
|
526
|
+
return {
|
|
527
|
+
**inspection,
|
|
528
|
+
"failed_step": failed_step,
|
|
529
|
+
"job_type": job_type,
|
|
530
|
+
"job_name": job_name,
|
|
531
|
+
"log_group": log_source[0] if log_source else None,
|
|
532
|
+
"log_stream_prefix": log_source[1] if log_source else None,
|
|
533
|
+
"log_tail": log_tail,
|
|
534
|
+
"next_actions": _diagnostic_next_actions(execution_arn, failed_step),
|
|
535
|
+
}
|
|
536
|
+
|
|
537
|
+
|
|
408
538
|
def tail_step_logs(ctx: AwsContext, step: dict[str, Any], limit: int = 80) -> list[str]:
|
|
409
539
|
source = infer_log_source(step)
|
|
410
540
|
if source is None:
|
|
@@ -413,6 +543,10 @@ def tail_step_logs(ctx: AwsContext, step: dict[str, Any], limit: int = 80) -> li
|
|
|
413
543
|
return tail_cloudwatch_logs(ctx, log_group, stream_prefix, limit=limit)
|
|
414
544
|
|
|
415
545
|
|
|
546
|
+
def tail_processing_job_logs(ctx: AwsContext, job_name: str, limit: int = 80) -> list[str]:
|
|
547
|
+
return tail_cloudwatch_logs(ctx, "/aws/sagemaker/ProcessingJobs", job_name, limit=limit)
|
|
548
|
+
|
|
549
|
+
|
|
416
550
|
def tail_cloudwatch_logs(
|
|
417
551
|
ctx: AwsContext,
|
|
418
552
|
log_group: str,
|
|
@@ -467,6 +601,39 @@ def infer_log_source(step: dict[str, Any]) -> tuple[str, str] | None:
|
|
|
467
601
|
return None
|
|
468
602
|
|
|
469
603
|
|
|
604
|
+
def _step_job_type(step: dict[str, Any]) -> str | None:
|
|
605
|
+
metadata = step.get("Metadata") or {}
|
|
606
|
+
for key in ("ProcessingJob", "TrainingJob", "TransformJob"):
|
|
607
|
+
if isinstance(metadata.get(key), dict):
|
|
608
|
+
return key
|
|
609
|
+
return None
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
def _pipeline_name_from_execution_arn(execution_arn: str) -> str:
|
|
613
|
+
marker = ":pipeline/"
|
|
614
|
+
if marker not in execution_arn:
|
|
615
|
+
return ""
|
|
616
|
+
tail = execution_arn.split(marker, 1)[1]
|
|
617
|
+
return tail.split("/", 1)[0]
|
|
618
|
+
|
|
619
|
+
|
|
620
|
+
def _diagnostic_next_actions(execution_arn: str, failed_step: dict[str, Any] | None) -> list[dict[str, str]]:
|
|
621
|
+
actions = [
|
|
622
|
+
{
|
|
623
|
+
"type": "inspect",
|
|
624
|
+
"command": f"smops pipeline inspect --execution-arn {execution_arn} --json",
|
|
625
|
+
}
|
|
626
|
+
]
|
|
627
|
+
if failed_step:
|
|
628
|
+
actions.append(
|
|
629
|
+
{
|
|
630
|
+
"type": "diagnose",
|
|
631
|
+
"command": f"smops pipeline diagnose --execution-arn {execution_arn} --json",
|
|
632
|
+
}
|
|
633
|
+
)
|
|
634
|
+
return actions
|
|
635
|
+
|
|
636
|
+
|
|
470
637
|
def format_dt(value: datetime | None) -> str:
|
|
471
638
|
if value is None:
|
|
472
639
|
return ""
|