@groupby/ai-dev 0.5.5 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/package.json +1 -1
  2. package/teams/OOF/skills/jira-ticket-creator/README.md +22 -0
  3. package/teams/OOF/skills/jira-ticket-creator/SKILL.md +266 -0
  4. package/teams/fhr-ai-team/github/PULL_REQUEST_TEMPLATE/full.md +31 -0
  5. package/teams/fhr-ai-team/github/PULL_REQUEST_TEMPLATE/light.md +7 -0
  6. package/teams/fhr-ai-team/github/copilot-instructions.md +24 -0
  7. package/teams/fhr-ai-team/github/instructions/python.instructions.md +23 -0
  8. package/teams/fhr-ai-team/github/pull_request_template.md +21 -0
  9. package/teams/fhr-ai-team/prompts/brainstorm.md +7 -0
  10. package/teams/fhr-ai-team/prompts/plan-algo-tests.md +7 -0
  11. package/teams/fhr-ai-team/prompts/plan.md +7 -0
  12. package/teams/fhr-ai-team/prompts/pr-description.md +7 -0
  13. package/teams/fhr-ai-team/prompts/test.md +7 -0
  14. package/teams/fhr-ai-team/resources/AGENTS.md +55 -0
  15. package/teams/fhr-ai-team/resources/CLAUDE.md +52 -0
  16. package/teams/fhr-ai-team/resources/README.md +51 -0
  17. package/teams/fhr-ai-team/resources/claude-code-setup.md +60 -0
  18. package/teams/fhr-ai-team/resources/copilot-setup.md +64 -0
  19. package/teams/fhr-ai-team/resources/onboarding.md +179 -0
  20. package/teams/fhr-ai-team/resources/opencode-install.md +29 -0
  21. package/teams/fhr-ai-team/resources/opencode-setup.md +43 -0
  22. package/teams/fhr-ai-team/skills/algo-test-planning/SKILL.md +192 -0
  23. package/teams/fhr-ai-team/skills/algo-test-planning/references/pipeline-registry.md +280 -0
  24. package/teams/fhr-ai-team/skills/brainstorming/SKILL.md +111 -0
  25. package/teams/fhr-ai-team/skills/e2e-testing/SKILL.md +163 -0
  26. package/teams/fhr-ai-team/skills/grill-me/SKILL.md +10 -0
  27. package/teams/fhr-ai-team/skills/ml-tooling-dev/SKILL.md +313 -0
  28. package/teams/fhr-ai-team/skills/ml-tooling-dev/references/kubectl-debug.md +165 -0
  29. package/teams/fhr-ai-team/skills/ml-tooling-dev/references/mongodb-config.md +218 -0
  30. package/teams/fhr-ai-team/skills/ml-tooling-dev/references/pipeline-configs.md +190 -0
  31. package/teams/fhr-ai-team/skills/ml-tooling-dev/references/pipeline-steps.md +182 -0
  32. package/teams/fhr-ai-team/skills/ml-tooling-dev/scripts/kf_logs.py +203 -0
  33. package/teams/fhr-ai-team/skills/ml-tooling-dev/scripts/kf_query.py +233 -0
  34. package/teams/fhr-ai-team/skills/ml-tooling-dev/scripts/kf_wait.py +195 -0
  35. package/teams/fhr-ai-team/skills/ml-tooling-dev/scripts/mlflow_query.py +252 -0
  36. package/teams/fhr-ai-team/skills/ml-tooling-dev/scripts/mongo_predictor.py +352 -0
  37. package/teams/fhr-ai-team/skills/naming-conventions-reviewer/SKILL.md +230 -0
  38. package/teams/fhr-ai-team/skills/naming-conventions-reviewer/references/dataset-naming.md +190 -0
  39. package/teams/fhr-ai-team/skills/naming-conventions-reviewer/references/domain-vocabulary.md +447 -0
  40. package/teams/fhr-ai-team/skills/naming-conventions-reviewer/references/repo-dependency-graph.md +264 -0
  41. package/teams/fhr-ai-team/skills/planning/SKILL.md +138 -0
  42. package/teams/fhr-ai-team/skills/pr-description/SKILL.md +94 -0
  43. package/teams/snpd/skills/code-review-github/SKILL.md +475 -0
@@ -0,0 +1,203 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Fetch pod logs for a Kubeflow run step in one shot.
4
+
5
+ Resolves: run_id -> task_details -> child_tasks.pod_name -> kubectl logs
6
+ Driver/DAG steps (display_name ends in '-driver') and KFP-internal plumbing steps
7
+ are skipped by default; pass --all to include them. Pods are deduped by pod_name
8
+ across duplicate task_details that the KFP v2 API can return for templated steps.
9
+
10
+ Usage:
11
+ kf_logs.py <run_id> # Logs from user-code pods (skip drivers + internals)
12
+ kf_logs.py <run_id> --step <name> # Filter to a step (partial match)
13
+ kf_logs.py <run_id> --step <name> --previous # Logs from the previous (crashed) container
14
+ kf_logs.py <run_id> --step <name> -f # Stream live (requires exactly one matching pod)
15
+ kf_logs.py <run_id> --list # List matching pods without fetching logs
16
+ kf_logs.py <run_id> --all # Include driver and KFP-internal step pods
17
+ kf_logs.py <run_id> --step <name> --tail 500 # Tail N lines (default 200)
18
+ """
19
+
20
+ import argparse
21
+ import json
22
+ import shutil
23
+ import subprocess
24
+ import sys
25
+ from urllib.error import URLError
26
+ from urllib.parse import urlencode
27
+ from urllib.request import urlopen
28
+
29
+ KF_HOST = "http://10.11.96.10"  # base URL from which every API request in this script is built
30
+ API_V2 = f"{KF_HOST}/pipeline/apis/v2beta1"  # root of the pipeline v2beta1 REST endpoints
31
+ NAMESPACE = "kubeflow"  # namespace handed to `kubectl logs -n` by fetch_logs()
32
+
33
+ # Same list as kf_query.py so behavior is consistent
34
+ INTERNAL_PREFIXES = (  # lowercase prefixes; is_internal() compares them against the lowercased step name
35
+ "generate-", "get-resource-request", "create-config-map",
36
+ "compute-", "generate-env-variables", "spark-history",
37
+ "dataproc-create", "generate-dataproc", "exit-handler",
38
+ "send-workflow-notification", "root", "condition-",
39
+ "retrieve-item-data",
40
+ )
41
+
42
+
43
def get(url: str, params: dict = None) -> dict:
    """GET ``url`` and return its body decoded as JSON.

    Args:
        url: Endpoint to request.
        params: Optional query parameters; when non-empty they are URL-encoded
            and appended to ``url`` after a ``?``.

    Returns:
        The parsed JSON payload.

    Never raises for request or decoding failures: it prints an ``[ERROR]``
    line to stderr and exits the process with status 1 instead.
    """
    if params:
        url = f"{url}?{urlencode(params)}"
    try:
        with urlopen(url, timeout=15) as r:
            body = r.read()
    except OSError as e:
        # URLError and HTTPError derive from OSError, and so do the raw socket
        # timeouts that can surface while the body is being read.
        print(f"[ERROR] Cannot reach {url}: {e}", file=sys.stderr)
        sys.exit(1)
    try:
        return json.loads(body)
    except ValueError as e:
        # Covers json.JSONDecodeError and UnicodeDecodeError, e.g. when a proxy
        # answers with an HTML page instead of the API's JSON.
        print(f"[ERROR] Invalid JSON response from {url}: {e}", file=sys.stderr)
        sys.exit(1)
52
+
53
+
54
def is_internal(name: str) -> bool:
    """Tell whether ``name`` starts, ignoring case, with any entry of INTERNAL_PREFIXES."""
    # str.startswith accepts a tuple of candidates and checks them in one call.
    return name.lower().startswith(tuple(INTERNAL_PREFIXES))
57
+
58
+
59
def is_driver(name: str) -> bool:
    """Report whether ``name`` is a KFP v2 driver / DAG step, i.e. ends in '-driver'.

    Such steps are filtered out by default because their pods carry
    orchestration logs rather than user-code logs.
    """
    driver_suffix = "-driver"
    return name.endswith(driver_suffix)
62
+
63
+
64
def require_kubectl() -> None:
    """Exit with status 1 and an error on stderr unless ``kubectl`` is found on PATH."""
    kubectl_path = shutil.which("kubectl")
    if kubectl_path is not None:
        return
    print("[ERROR] kubectl not found on PATH", file=sys.stderr)
    sys.exit(1)
68
+
69
+
70
def resolve_pods(run_id: str, step_filter: str = None, include_internal: bool = False) -> list:
    """Look up a run and collect the pods behind its steps.

    Args:
        run_id: Kubeflow run ID to fetch.
        step_filter: Optional case-insensitive substring a step's display_name
            must contain.
        include_internal: When False (the default), steps matching
            INTERNAL_PREFIXES and steps whose name ends in '-driver' are dropped.

    Returns:
        A list of (step_name, state, pod_name) tuples, one per distinct
        pod_name, ordered by step name and then pod name.

    Exits with status 1 when the run carries no task_details.
    """
    run = get(f"{API_V2}/runs/{run_id}")
    task_details = run.get("run_details", {}).get("task_details", [])
    if not task_details:
        print(
            f"[ERROR] No task_details on run {run_id} (may still be initializing)",
            file=sys.stderr,
        )
        sys.exit(1)

    needle = step_filter.lower() if step_filter else None
    # Keyed by pod name so the first task that mentions a pod wins; the API can
    # return several task_details entries pointing at the same pod.
    pods_by_name = {}
    for task in task_details:
        step_name = task.get("display_name", "")
        if not step_name:
            continue
        if needle is not None and needle not in step_name.lower():
            continue
        if not include_internal:
            if is_internal(step_name) or is_driver(step_name):
                continue
        task_state = task.get("state", "UNKNOWN")
        for child in task.get("child_tasks", []):
            pod_name = child.get("pod_name", "")
            if pod_name:
                pods_by_name.setdefault(pod_name, (step_name, task_state, pod_name))

    # Stable ordering keeps repeated invocations deterministic.
    return sorted(pods_by_name.values(), key=lambda entry: (entry[0], entry[2]))
106
+
107
+
108
def print_pod_list(pods: list) -> None:
    """Print a State / Step / Pod table with one row per (step, state, pod) tuple."""
    row_template = "{:<12} {:<55} {}"
    print(row_template.format("State", "Step", "Pod"))
    print("-" * 120)
    for entry in pods:
        step_name, step_state, pod_name = entry
        # Tuples are stored step-first, but the table leads with the state column.
        print(row_template.format(step_state, step_name, pod_name))
113
+
114
+
115
def fetch_logs(pod: str, tail: int, previous: bool, follow: bool) -> int:
    """Run ``kubectl logs`` for ``pod`` in NAMESPACE, streaming to this process's stdout/stderr.

    Args:
        pod: Pod name to read logs from.
        tail: Number of trailing lines to request; ``None`` omits ``--tail``.
        previous: Add ``--previous`` to read the previous container instance.
        follow: Add ``-f`` to keep streaming.

    Returns:
        kubectl's exit status, or 130 when interrupted with Ctrl-C.
    """
    cmd = ["kubectl", "logs", "-n", NAMESPACE, pod]
    if tail is not None:
        cmd.append(f"--tail={tail}")
    if previous:
        cmd.append("--previous")
    if follow:
        cmd.append("-f")
    # The child writes straight to the inherited file descriptors, bypassing
    # Python's buffers. Flush first so anything already printed (e.g. the
    # per-pod header) appears before the log lines even when output is piped.
    sys.stdout.flush()
    sys.stderr.flush()
    try:
        result = subprocess.run(cmd)
    except KeyboardInterrupt:
        return 130
    return result.returncode
129
+
130
+
131
def main():
    """CLI entry point: resolve a run's pods, then either list them or print their logs.

    Exits with status 1 when no pod matches or when --follow matches more than
    one pod; otherwise exits with the first non-zero status returned by
    fetch_logs (0 if every fetch succeeded). With --list it returns after
    printing the table.
    """
    cli = argparse.ArgumentParser(
        description=f"Fetch pod logs for a Kubeflow run step ({KF_HOST})",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument("run_id", help="Kubeflow run ID")
    cli.add_argument("--step", help="Filter steps by name (partial match)")
    cli.add_argument("--tail", type=int, default=200, help="Tail N lines (default: 200)")
    cli.add_argument(
        "--previous",
        action="store_true",
        help="Logs from the previous (crashed) container, if any",
    )
    cli.add_argument(
        "-f", "--follow",
        action="store_true",
        help="Stream live logs. Requires exactly one matching pod.",
    )
    cli.add_argument(
        "--all",
        action="store_true",
        dest="show_all",
        help="Include driver and KFP internal-plumbing step pods",
    )
    cli.add_argument(
        "--list",
        action="store_true",
        dest="list_only",
        help="List matching pods without fetching logs",
    )
    opts = cli.parse_args()

    matches = resolve_pods(opts.run_id, step_filter=opts.step, include_internal=opts.show_all)
    if not matches:
        fragments = [f"[ERROR] No matching pods found for run {opts.run_id}"]
        if opts.step:
            fragments.append(f" matching step {opts.step!r}")
        if not opts.show_all:
            fragments.append(" (pass --all to include driver / KFP-internal step pods)")
        print("".join(fragments), file=sys.stderr)
        sys.exit(1)

    if opts.list_only:
        print_pod_list(matches)
        return

    several = len(matches) > 1
    if opts.follow and several:
        print(
            f"[ERROR] --follow requires exactly one matching pod. Found {len(matches)}. "
            f"Narrow with --step.",
            file=sys.stderr,
        )
        print_pod_list(matches)
        sys.exit(1)

    # kubectl is only needed from here on; --list works without it.
    require_kubectl()
    banner = "=" * 100
    first_failure = 0
    for step, state, pod in matches:
        if several:
            # Separate each pod's log with a header when more than one is printed.
            print()
            print(banner)
            print(f"Step: {step}")
            print(f"State: {state}")
            print(f"Pod: {pod}")
            print(banner)
        status = fetch_logs(pod, opts.tail, opts.previous, opts.follow)
        if first_failure == 0 and status != 0:
            first_failure = status
    sys.exit(first_failure)
200
+
201
+
202
+ if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
203
+ main()
@@ -0,0 +1,233 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Query the dev Kubeflow API for run details, step statuses, and pod names.
4
+
5
+ Usage:
6
+ python3 kf_query.py <run_id> # Show all step statuses + pod names
7
+ python3 kf_query.py <run_id> --failed # Show only failed/running steps
8
+ python3 kf_query.py <run_id> --step <name> # Filter steps by name
9
+ python3 kf_query.py --list # List recent runs
10
+ python3 kf_query.py --list --experiment <name> # List runs in experiment
11
+ python3 kf_query.py --experiments # List all experiments
12
+ python3 kf_query.py --pipelines # List all pipelines + latest version_name
13
+ python3 kf_query.py --pipeline-versions <name> # List all versions of a pipeline
14
+ """
15
+
16
+ import argparse
17
+ import json
18
+ import sys
19
+ from urllib.request import urlopen
20
+ from urllib.parse import urlencode
21
+ from urllib.error import URLError
22
+
23
+ KF_HOST = "http://10.11.96.10"  # base URL from which every API request in this script is built
24
+ API_V2 = f"{KF_HOST}/pipeline/apis/v2beta1"  # root of the pipeline v2beta1 REST endpoints
25
+
26
+ # Steps that are internal KFP plumbing; hidden by default
27
+ INTERNAL_PREFIXES = (  # lowercase prefixes; show_run() still prints a matching step when it is FAILED or --all is set
28
+ "generate-", "get-resource-request", "create-config-map",
29
+ "compute-", "generate-env-variables", "spark-history",
30
+ "dataproc-create", "generate-dataproc", "exit-handler",
31
+ "send-workflow-notification", "root", "condition-",
32
+ "retrieve-item-data",
33
+ )
34
+
35
+
36
def get(url: str, params: dict = None) -> dict:
    """GET ``url`` and return its body decoded as JSON.

    Args:
        url: Endpoint to request.
        params: Optional query parameters; when non-empty they are URL-encoded
            and appended to ``url`` after a ``?``.

    Returns:
        The parsed JSON payload.

    Never raises for request or decoding failures: it prints an ``[ERROR]``
    line to stderr and raises ``SystemExit(1)`` via ``sys.exit``.
    """
    if params:
        url = f"{url}?{urlencode(params)}"
    try:
        with urlopen(url, timeout=15) as r:
            body = r.read()
    except OSError as e:
        # URLError and HTTPError derive from OSError, and so do the raw socket
        # timeouts that can surface while the body is being read.
        print(f"[ERROR] Cannot reach {url}: {e}", file=sys.stderr)
        sys.exit(1)
    try:
        return json.loads(body)
    except ValueError as e:
        # Covers json.JSONDecodeError and UnicodeDecodeError, e.g. when a proxy
        # answers with an HTML page instead of the API's JSON.
        print(f"[ERROR] Invalid JSON response from {url}: {e}", file=sys.stderr)
        sys.exit(1)
45
+
46
+
47
def is_internal(name: str) -> bool:
    """Tell whether ``name`` starts, ignoring case, with any entry of INTERNAL_PREFIXES."""
    # str.startswith accepts a tuple of candidates and checks them in one call.
    return name.lower().startswith(tuple(INTERNAL_PREFIXES))
50
+
51
+
52
def list_experiments() -> None:
    """Print an ID / name table of up to 50 experiments, requested newest-first."""
    query = {"page_size": 50, "sort_by": "created_at desc"}
    found = get(f"{API_V2}/experiments", query).get("experiments", [])
    print(f"{'Experiment ID':<38} {'Name'}")
    print("-" * 90)
    for experiment in found:
        exp_id = experiment["experiment_id"]
        title = experiment["display_name"]
        print(f"{exp_id:<38} {title}")
59
+
60
+
61
def list_runs(experiment_name: str = None) -> None:
    """Print a table of up to 20 runs, requested newest-first.

    Args:
        experiment_name: Optional case-insensitive substring. When given, runs
            are restricted to the first experiment (among up to 200 fetched)
            whose display name contains it, and that experiment's name is
            printed before the table.

    Exits with status 1, after an ``[ERROR]`` line on stderr, when
    ``experiment_name`` matches no experiment.
    """
    params = {"page_size": 20, "sort_by": "created_at desc"}
    if experiment_name:
        exps = get(f"{API_V2}/experiments", {"page_size": 200})
        exp_id = None
        for e in exps.get("experiments", []):
            if experiment_name.lower() in e["display_name"].lower():
                exp_id = e["experiment_id"]
                print(f"Experiment: {e['display_name']}")
                break
        if not exp_id:
            # Errors go to stderr, like every other failure path in this script,
            # so piped stdout only ever carries table output.
            print(
                f"[ERROR] Experiment containing '{experiment_name}' not found.",
                file=sys.stderr,
            )
            sys.exit(1)
        params["experiment_id"] = exp_id

    data = get(f"{API_V2}/runs", params)
    runs = data.get("runs", [])
    print(f"\n{'Run ID':<38} {'State':<12} {'Created':<22} Name")
    print("-" * 110)
    for run in runs:
        run_id = run.get("run_id", "")
        state = run.get("state", "")
        # Trim the ISO timestamp to seconds and swap the 'T' separator for a space.
        created = run.get("created_at", "")[:19].replace("T", " ")
        name = run.get("display_name", "")
        print(f"{run_id:<38} {state:<12} {created:<22} {name}")
86
+
87
+
88
def show_run(run_id: str, step_filter: str = None, failed_only: bool = False, show_all: bool = False) -> None:
    """Print a run's summary, its job-config parameters, and a per-step status table.

    Args:
        run_id: Kubeflow run ID to fetch.
        step_filter: Optional case-insensitive substring a step name must contain.
        failed_only: Keep only steps whose state is FAILED or RUNNING.
        show_all: Also show steps matching INTERNAL_PREFIXES (those are
            otherwise hidden unless FAILED).

    The job-config section and the closing --all hint are printed only when no
    filter is active. At most two pod names are shown per step.
    """
    run = get(f"{API_V2}/runs/{run_id}")

    title = run.get("display_name", "")
    run_state = run.get("state", "Unknown")
    created_at = run.get("created_at", "")[:19].replace("T", " ")
    raw_finish = run.get("finished_at", "")
    # A finish time starting with 1970 is treated as "not finished yet".
    if raw_finish and not raw_finish.startswith("1970"):
        finished_at = raw_finish[:19].replace("T", " ")
    else:
        finished_at = "N/A"
    failure = run.get("error", {}).get("message", "")

    for summary_line in (
        f"Run: {title}",
        f"ID: {run_id}",
        f"State: {run_state}",
        f"Created: {created_at}",
        f"Finished: {finished_at}",
    ):
        print(summary_line)
    if failure:
        print(f"Error: {failure}")
    print()

    # Runtime parameters (the job config) are only shown for an unfiltered view.
    job_params = run.get("runtime_config", {}).get("parameters", {})
    if job_params and not (step_filter or failed_only):
        print("JOB CONFIG PARAMS:")
        for key, value in job_params.items():
            rendered = json.dumps(value) if isinstance(value, dict) else value
            print(f" {key}: {rendered}")
        print()

    task_details = run.get("run_details", {}).get("task_details", [])
    if not task_details:
        print("No task details available (run may still be initializing).")
        return

    # Lower rank = more severe. States missing from the table rank last (99).
    severity = {"FAILED": 0, "RUNNING": 1, "SUCCEEDED": 2, "SKIPPED": 3, "UNKNOWN": 4}

    def rank(task):
        return severity.get(task.get("state", "UNKNOWN"), 99)

    # One entry per display_name, keeping the most severe task among duplicates.
    by_step = {}
    for task in task_details:
        step_name = task.get("display_name", "")
        if not step_name:
            continue
        kept = by_step.get(step_name)
        if kept is None or rank(task) < rank(kept):
            by_step[step_name] = task

    print(f"{'State':<12} {'Step Name':<55} Pod(s)")
    print("-" * 120)

    needle = step_filter.lower() if step_filter else None
    ordered = sorted(by_step.items(), key=lambda item: item[1].get("start_time", ""))
    for step_name, task in ordered:
        step_state = task.get("state", "UNKNOWN")

        if failed_only and step_state not in ("FAILED", "RUNNING"):
            continue
        if needle is not None and needle not in step_name.lower():
            continue
        # Internal plumbing stays hidden unless it failed or --all was requested.
        if not show_all and step_state != "FAILED" and is_internal(step_name):
            continue

        pod_names = [
            child["pod_name"]
            for child in task.get("child_tasks", [])
            if child.get("pod_name")
        ]
        shown_pods = ", ".join(pod_names[:2])

        print(f"{step_state:<12} {step_name:<55} {shown_pods}")

    if not (show_all or step_filter or failed_only):
        print("\n(Use --all to show internal KFP plumbing steps)")
161
+
162
+
163
def list_pipelines() -> None:
    """Print each pipeline (up to 100) beside the display name of its newest version.

    The version column shows "N/A" when a pipeline has no versions and "?"
    when the per-pipeline version lookup fails.
    """
    listing = get(f"{API_V2}/pipelines", {"page_size": 100, "sort_by": "created_at desc"})
    pipelines = listing.get("pipelines", [])
    print(f"{'Pipeline Name':<55} {'Latest version_name'}")
    print("-" * 85)
    newest_only = {"page_size": 1, "sort_by": "created_at desc"}
    for pipeline in pipelines:
        label = pipeline.get("display_name", "")
        pipeline_id = pipeline.get("pipeline_id", "")
        try:
            reply = get(f"{API_V2}/pipelines/{pipeline_id}/versions", newest_only)
        except SystemExit:
            # get() exits on request failure; swallow that so one bad pipeline
            # does not abort the whole listing.
            latest = "?"
        else:
            found = reply.get("pipeline_versions", [])
            latest = found[0].get("display_name", "N/A") if found else "N/A"
        print(f"{label:<55} {latest}")
180
+
181
+
182
def list_pipeline_versions(pipeline_name: str) -> None:
    """List versions of every pipeline whose name contains ``pipeline_name``.

    Args:
        pipeline_name: Case-insensitive substring matched against each
            pipeline's display name (up to 100 pipelines are fetched).

    For each match, prints up to 20 versions, requested newest-first, with
    their name, ID and creation time.

    Exits with status 1, after an ``[ERROR]`` line on stderr, when no pipeline
    matches.
    """
    data = get(f"{API_V2}/pipelines", {"page_size": 100})
    pipelines = data.get("pipelines", [])
    wanted = pipeline_name.lower()
    matched = [p for p in pipelines if wanted in p.get("display_name", "").lower()]
    if not matched:
        # Errors go to stderr, like every other failure path in this script,
        # so piped stdout only ever carries table output.
        print(f"[ERROR] No pipeline found matching '{pipeline_name}'", file=sys.stderr)
        sys.exit(1)
    for p in matched:
        pid = p.get("pipeline_id", "")
        pname = p.get("display_name", "")
        print(f"\nPipeline: {pname} (ID: {pid})")
        vdata = get(f"{API_V2}/pipelines/{pid}/versions", {"page_size": 20, "sort_by": "created_at desc"})
        versions = vdata.get("pipeline_versions", [])
        print(f" {'version_name':<20} {'Version ID':<38} Created")
        print(" " + "-" * 80)
        for v in versions:
            vname = v.get("display_name", "")
            vid = v.get("pipeline_version_id", "")
            # Trim the ISO timestamp to seconds and swap the 'T' separator for a space.
            created = v.get("created_at", "")[:19].replace("T", " ")
            print(f" {vname:<20} {vid:<38} {created}")
203
+
204
+
205
def main():
    """CLI entry point: dispatch to one listing or run-inspection command.

    When several selectors are combined the first of these wins: --pipelines,
    --pipeline-versions, --experiments, --list (or --experiment without a run
    ID), then a positional run ID. With none of them the help text is printed.
    """
    # Take the host from KF_HOST so the help text cannot drift from the
    # endpoint actually queried.
    parser = argparse.ArgumentParser(description=f"Query dev Kubeflow API ({KF_HOST})")
    parser.add_argument("run_id", nargs="?", help="Kubeflow run ID")
    parser.add_argument("--step", help="Filter steps by name (partial match)")
    parser.add_argument("--failed", action="store_true", help="Show only failed/running steps")
    parser.add_argument("--all", action="store_true", dest="show_all", help="Show all steps incl. KFP internals")
    parser.add_argument("--list", action="store_true", help="List recent runs")
    parser.add_argument("--experiment", help="Experiment name filter")
    parser.add_argument("--experiments", action="store_true", help="List all experiments")
    parser.add_argument("--pipelines", action="store_true", help="List all pipelines + latest version_name")
    parser.add_argument("--pipeline-versions", metavar="NAME", help="List all versions of a pipeline (partial name match)")
    args = parser.parse_args()

    if args.pipelines:
        list_pipelines()
    elif args.pipeline_versions:
        list_pipeline_versions(args.pipeline_versions)
    elif args.experiments:
        list_experiments()
    elif args.list or (not args.run_id and args.experiment):
        # --experiment on its own implies a run listing for that experiment.
        list_runs(args.experiment)
    elif args.run_id:
        show_run(args.run_id, step_filter=args.step, failed_only=args.failed, show_all=args.show_all)
    else:
        parser.print_help()
230
+
231
+
232
+ if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
233
+ main()
@@ -0,0 +1,195 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Poll a Kubeflow run step until it reaches a target state.
4
+
5
+ Usage:
6
+ kf_wait.py <run_id> --step <name> --state RUNNING [--timeout 600] [--interval 10]
7
+
8
+ States: PENDING, RUNNING, SUCCEEDED, FAILED, SKIPPED, CANCELLED
9
+
10
+ Exit codes:
11
+ 0 Step reached the target state
12
+ 1 Run not found, or step terminated in a different state than the target
13
+ 2 API unreachable for an extended period (consecutive failures past the timeout)
14
+ 124 Timeout: target state never observed within --timeout
15
+
16
+ If multiple steps match --step, the most-severe state wins:
17
+ FAILED > RUNNING > SUCCEEDED > SKIPPED > CANCELLED > PENDING
18
+ A single FAILED match aborts immediately. Terminal states (SKIPPED, CANCELLED) out-rank
19
+ PENDING, so a SKIPPED-ended branch is reported as terminal and triggers fail-fast when
20
+ the target is non-terminal, instead of being masked by leftover PENDING entries.
21
+ """
22
+
23
+ import argparse
24
+ import json
25
+ import sys
26
+ import time
27
+ from urllib.error import HTTPError, URLError
28
+ from urllib.parse import urlencode
29
+ from urllib.request import urlopen
30
+
31
+ KF_HOST = "http://10.11.96.10"  # base URL from which every API request in this script is built
32
+ API_V2 = f"{KF_HOST}/pipeline/apis/v2beta1"  # root of the pipeline v2beta1 REST endpoints
33
+
34
+ VALID_STATES = ["PENDING", "RUNNING", "SUCCEEDED", "FAILED", "SKIPPED", "CANCELLED"]  # accepted values for --state
35
+ TERMINAL_STATES = {"SUCCEEDED", "FAILED", "SKIPPED", "CANCELLED"}  # main() stops polling (exit 1) when a step is in one of these and it is not the target
36
+
37
+ # Severity priority for dedup when multiple task_details share a display_name
38
+ # (KFP v2 returns duplicates for templated steps).
39
+ # Order matters: FAILED first so we abort fast; terminal states (SKIPPED, CANCELLED)
40
+ # out-rank PENDING so a SKIPPED-ended branch is reported as terminal, not "still pending".
41
+ STATE_PRIORITY = {  # lower number wins when find_step_state() picks among several matching tasks
42
+ "FAILED": 0,
43
+ "RUNNING": 1,
44
+ "SUCCEEDED": 2,
45
+ "SKIPPED": 3,
46
+ "CANCELLED": 4,
47
+ "PENDING": 5,
48
+ "UNKNOWN": 99,  # same rank find_step_state() falls back to for states not listed here
49
+ }
50
+
51
+
52
def get(url: str, params: dict = None):
    """GET ``url`` and classify the outcome for the polling loop.

    Args:
        url: Endpoint to request.
        params: Optional query parameters; when non-empty they are URL-encoded
            and appended to ``url`` after a ``?``.

    Returns:
        The parsed JSON payload on success, the sentinel ``'__NOT_FOUND__'``
        on HTTP 404, or ``None`` for anything treated as transient: other HTTP
        status codes, network errors and timeouts, and bodies that cannot be
        decoded as JSON.
    """
    if params:
        url = f"{url}?{urlencode(params)}"
    try:
        with urlopen(url, timeout=15) as r:
            body = r.read()
    except HTTPError as e:
        # Must precede the OSError clause: HTTPError is a subclass of it.
        if e.code == 404:
            return "__NOT_FOUND__"
        return None  # other HTTP codes treated as transient (e.g. 502, 503)
    except OSError:
        return None  # URLError, socket timeout, connection reset, ...
    try:
        return json.loads(body)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError (a body that is not valid
        # UTF-8) are both ValueError subclasses; either way there is no payload.
        return None
70
+
71
+
72
def find_step_state(run_id: str, step_filter: str):
    """Fetch the run and report the most severe state among steps matching ``step_filter``.

    Matching is a case-insensitive substring test on each task's display_name;
    ties between matches are broken by STATE_PRIORITY (lowest value wins).

    Returns:
        (display_name, state) for the winning task;
        (None, None) when the run was fetched but no step matches yet;
        (None, "__NOT_FOUND__") when the run does not exist;
        (None, "__UNREACHABLE__") when the API call failed transiently.
    """
    run = get(f"{API_V2}/runs/{run_id}")
    if run == "__NOT_FOUND__":
        return None, "__NOT_FOUND__"
    if run is None:
        return None, "__UNREACHABLE__"
    # Some KFP versions answer HTTP 200 with an error envelope instead of a 404.
    if isinstance(run, dict) and "error" in run and "run_id" not in run:
        return None, "__NOT_FOUND__"

    candidates = [
        task
        for task in run.get("run_details", {}).get("task_details", [])
        if task.get("display_name") and step_filter.lower() in task["display_name"].lower()
    ]
    if not candidates:
        return None, None

    def severity(task):
        return STATE_PRIORITY.get(task.get("state", "UNKNOWN"), 99)

    # min() returns the first of equally ranked tasks, like a stable sort would.
    most_severe = min(candidates, key=severity)
    return most_severe.get("display_name"), most_severe.get("state", "UNKNOWN")
99
+
100
+
101
def main():
    """CLI entry point: poll one run step until it reaches ``--state``.

    Exit status:
        0    the step reached the target state
        1    run not found, or the step ended in a terminal state other than the target
        2    the API was still unreachable when the timeout expired
        124  the timeout expired before the target state was observed

    Progress lines are printed only when the observed step or state changes
    (and again after an unreachable spell); --quiet suppresses them.
    """
    parser = argparse.ArgumentParser(
        description=f"Wait for a Kubeflow run step to reach a target state ({KF_HOST})",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("run_id", help="Kubeflow run ID")
    parser.add_argument("--step", required=True, help="Step name (partial match)")
    parser.add_argument(
        "--state",
        required=True,
        choices=VALID_STATES,
        type=str.upper,
        help="Target state",
    )
    parser.add_argument(
        "--timeout", type=int, default=600,
        help="Maximum wait time in seconds (default: 600)",
    )
    parser.add_argument(
        "--interval", type=int, default=10,
        help="Poll interval in seconds (default: 10)",
    )
    parser.add_argument("--quiet", action="store_true", help="Suppress progress output")
    args = parser.parse_args()
    if args.interval < 0:
        # time.sleep() rejects negative values; report it as a usage error instead.
        parser.error("--interval must not be negative")

    target = args.state
    started = time.monotonic()
    deadline = started + args.timeout
    last_status = None

    def pause():
        """Sleep one poll interval, but never past the deadline."""
        remaining = deadline - time.monotonic()
        time.sleep(max(0.0, min(args.interval, remaining)))

    while True:
        name, state = find_step_state(args.run_id, args.step)
        elapsed = int(time.monotonic() - started)

        if state == "__NOT_FOUND__":
            print(f"[ERROR] Run not found: {args.run_id}", file=sys.stderr)
            sys.exit(1)

        if state == "__UNREACHABLE__":
            if not args.quiet:
                print(f"[{elapsed}s] API unreachable, retrying...")
            # Bail only if we've used up the budget
            if time.monotonic() >= deadline:
                print(
                    f"[ERROR] Kubeflow API unreachable for the full {args.timeout}s window",
                    file=sys.stderr,
                )
                sys.exit(2)
            # Forget the last reported status so recovery is visibly reported.
            last_status = None
            pause()
            continue

        if state == target:
            if not args.quiet:
                print(f"[OK] Step {name!r} reached state {target} after {elapsed}s")
            sys.exit(0)

        # Terminal but wrong: fail fast, do not keep polling
        if state in TERMINAL_STATES:
            if target in TERMINAL_STATES:
                detail = f"target was {target}"
            else:
                detail = f"will never reach {target}"
            print(
                f"[ERROR] Step {name!r} terminated as {state}, {detail}",
                file=sys.stderr,
            )
            sys.exit(1)

        if time.monotonic() >= deadline:
            current = state if name else "(no matching step yet)"
            print(
                f"[TIMEOUT] Step {args.step!r} did not reach {target} within "
                f"{args.timeout}s. Current: {current}",
                file=sys.stderr,
            )
            sys.exit(124)

        if not args.quiet:
            shown = name or args.step
            current = state if name else "not yet present"
            # Compare without the elapsed-time prefix; otherwise every poll
            # looks "new" and the de-duplication never suppresses anything.
            status = (shown, current)
            if status != last_status:
                print(f"[{elapsed}s] {shown}: {current} -> {target}")
                last_status = status

        pause()
192
+
193
+
194
+ if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
195
+ main()