npm - @groupby/ai-dev - Versions diffs - 0.5.5 → 0.5.8 - Mend

@groupby/ai-dev 0.5.5 → 0.5.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

package/teams/fhr-ai-team/skills/ml-tooling-dev/scripts/kf_logs.py ADDED Viewed

@@ -0,0 +1,203 @@
+#!/usr/bin/env python3
+"""
+Fetch pod logs for a Kubeflow run step in one shot.
+Resolves: run_id -> task_details -> child_tasks.pod_name -> kubectl logs
+Driver/DAG steps (display_name ends in '-driver') and KFP-internal plumbing steps
+are skipped by default; pass --all to include them. Pods are deduped by pod_name
+across duplicate task_details that the KFP v2 API can return for templated steps.
+Usage:
+    kf_logs.py <run_id>                          # Logs from user-code pods (skip drivers + internals)
+    kf_logs.py <run_id> --step <name>            # Filter to a step (partial match)
+    kf_logs.py <run_id> --step <name> --previous # Logs from the previous (crashed) container
+    kf_logs.py <run_id> --step <name> -f         # Stream live (requires exactly one matching pod)
+    kf_logs.py <run_id> --list                   # List matching pods without fetching logs
+    kf_logs.py <run_id> --all                    # Include driver and KFP-internal step pods
+    kf_logs.py <run_id> --step <name> --tail 500 # Tail N lines (default 200)
+"""
+import argparse
+import json
+import shutil
+import subprocess
+import sys
+from urllib.error import URLError
+from urllib.parse import urlencode
+from urllib.request import urlopen
+# Address of the Kubeflow deployment this script queries.
+KF_HOST = "http://10.11.96.10"
+# Root of the pipeline v2beta1 REST API on that host.
+API_V2 = KF_HOST + "/pipeline/apis/v2beta1"
+# Namespace passed to `kubectl logs -n`.
+NAMESPACE = "kubeflow"
+# Step-name prefixes treated as KFP-internal plumbing; is_internal() compares
+# them against the lowercased display name.
+# Same list as kf_query.py so behavior is consistent.
+INTERNAL_PREFIXES = (
+    "generate-",
+    "get-resource-request",
+    "create-config-map",
+    "compute-",
+    "generate-env-variables",
+    "spark-history",
+    "dataproc-create",
+    "generate-dataproc",
+    "exit-handler",
+    "send-workflow-notification",
+    "root",
+    "condition-",
+    "retrieve-item-data",
+)
+def get(url: str, params: dict = None) -> dict:
+    """GET *url* and return its body decoded as JSON.
+    *params*, when non-empty, is URL-encoded and appended as the query string.
+    Every failure is fatal: a message is printed to stderr and the process
+    exits with status 1 (callers may catch SystemExit to continue).
+    """
+    if params:
+        url = f"{url}?{urlencode(params)}"
+    try:
+        with urlopen(url, timeout=15) as r:
+            body = r.read()
+    except OSError as e:
+        # URLError and HTTPError are OSError subclasses; so is a socket timeout
+        # raised while the body is being read.
+        print(f"[ERROR] Cannot reach {url}: {e}", file=sys.stderr)
+        sys.exit(1)
+    try:
+        return json.loads(body)
+    except ValueError as e:
+        # json.JSONDecodeError is a ValueError; without this an HTML error page
+        # or an empty body would surface as a raw traceback.
+        print(f"[ERROR] Invalid JSON from {url}: {e}", file=sys.stderr)
+        sys.exit(1)
+def is_internal(name: str) -> bool:
+    """Return True when *name*, lowercased, starts with any entry of INTERNAL_PREFIXES."""
+    # str.startswith accepts a tuple, so one call tests every prefix
+    return name.lower().startswith(INTERNAL_PREFIXES)
+def is_driver(name: str) -> bool:
+    """Return True when *name* ends in '-driver' (case-sensitive).
+    KFP v2 driver / DAG steps carry that suffix; their pods produce
+    orchestration logs only.
+    """
+    driver_suffix = "-driver"
+    return name.endswith(driver_suffix)
+def require_kubectl() -> None:
+    """Exit with status 1, after a message on stderr, unless `kubectl` is found on PATH."""
+    if shutil.which("kubectl"):
+        return
+    print("[ERROR] kubectl not found on PATH", file=sys.stderr)
+    sys.exit(1)
+def resolve_pods(run_id: str, step_filter: str = None, include_internal: bool = False) -> list:
+    """Look up a run and return its pods as [(step_name, state, pod_name)].
+    Each pod_name appears once; the first task that lists it supplies the step
+    name and state. The result is ordered by step name, then pod name.
+    A task is kept only if it has a display_name, that name contains
+    *step_filter* (case-insensitive, when given), and, unless
+    include_internal=True, it is neither a KFP-internal plumbing step
+    (INTERNAL_PREFIXES) nor a driver / DAG step ('-driver' suffix).
+    Exits with status 1 when the run has no task_details.
+    """
+    run = get(f"{API_V2}/runs/{run_id}")
+    task_details = run.get("run_details", {}).get("task_details", [])
+    if not task_details:
+        print(
+            f"[ERROR] No task_details on run {run_id} (may still be initializing)",
+            file=sys.stderr,
+        )
+        sys.exit(1)
+    wanted = step_filter.lower() if step_filter else None
+    # Keyed by pod name; setdefault keeps the first row seen for each pod
+    rows_by_pod = {}
+    for task in task_details:
+        step_name = task.get("display_name", "")
+        if not step_name:
+            continue
+        if wanted and wanted not in step_name.lower():
+            continue
+        if not include_internal and (is_internal(step_name) or is_driver(step_name)):
+            continue
+        task_state = task.get("state", "UNKNOWN")
+        for child in task.get("child_tasks", []):
+            pod_name = child.get("pod_name", "")
+            if pod_name:
+                rows_by_pod.setdefault(pod_name, (step_name, task_state, pod_name))
+    # Deterministic order across invocations: step name first, pod name second
+    return sorted(rows_by_pod.values(), key=lambda row: (row[0], row[2]))
+def print_pod_list(pods: list) -> None:
+    """Print a State / Step / Pod table for *pods*, an iterable of (step, state, pod) tuples."""
+    table = [f"{'State':<12}  {'Step':<55}  Pod", "-" * 120]
+    table.extend(f"{state:<12}  {step:<55}  {pod}" for step, state, pod in pods)
+    print("\n".join(table))
+def fetch_logs(pod: str, tail: int, previous: bool, follow: bool) -> int:
+    """Run `kubectl logs` for *pod* in NAMESPACE and return kubectl's exit status.
+    tail:     number of trailing lines to request; None omits --tail.
+    previous: add --previous (logs of the previous container instance).
+    follow:   add -f to stream until interrupted.
+    Returns 130 when the user interrupts the command with Ctrl-C.
+    """
+    cmd = ["kubectl", "logs", "-n", NAMESPACE, pod]
+    if tail is not None:
+        cmd.append(f"--tail={tail}")
+    if previous:
+        cmd.append("--previous")
+    if follow:
+        cmd.append("-f")
+    # kubectl inherits our stdout/stderr descriptors and writes to them directly,
+    # bypassing Python's buffers. Flush first so text already print()ed (such as
+    # a per-pod banner) is emitted ahead of the log lines when output is piped.
+    sys.stdout.flush()
+    sys.stderr.flush()
+    try:
+        result = subprocess.run(cmd)
+    except KeyboardInterrupt:
+        return 130
+    return result.returncode
+def main():
+    """Command-line entry point: resolve the run's pods, then list them or fetch their logs.
+    Exits 1 when nothing matches or when --follow matches more than one pod;
+    otherwise exits with the first non-zero status returned by fetch_logs
+    (0 if every fetch succeeded).
+    """
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        description=f"Fetch pod logs for a Kubeflow run step ({KF_HOST})",
+    )
+    parser.add_argument("run_id", help="Kubeflow run ID")
+    parser.add_argument("--step", help="Filter steps by name (partial match)")
+    parser.add_argument("--tail", type=int, default=200, help="Tail N lines (default: 200)")
+    parser.add_argument(
+        "--previous",
+        help="Logs from the previous (crashed) container, if any",
+        action="store_true",
+    )
+    parser.add_argument(
+        "-f", "--follow",
+        help="Stream live logs. Requires exactly one matching pod.",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--all",
+        dest="show_all",
+        help="Include driver and KFP internal-plumbing step pods",
+        action="store_true",
+    )
+    parser.add_argument(
+        "--list",
+        dest="list_only",
+        help="List matching pods without fetching logs",
+        action="store_true",
+    )
+    opts = parser.parse_args()
+    matches = resolve_pods(opts.run_id, step_filter=opts.step, include_internal=opts.show_all)
+    if not matches:
+        pieces = [f"[ERROR] No matching pods found for run {opts.run_id}"]
+        if opts.step:
+            pieces.append(f" matching step {opts.step!r}")
+        if not opts.show_all:
+            pieces.append(" (pass --all to include driver / KFP-internal step pods)")
+        print("".join(pieces), file=sys.stderr)
+        sys.exit(1)
+    if opts.list_only:
+        print_pod_list(matches)
+        return
+    several = len(matches) > 1
+    if opts.follow and several:
+        print(
+            f"[ERROR] --follow requires exactly one matching pod. Found {len(matches)}. "
+            f"Narrow with --step.",
+            file=sys.stderr,
+        )
+        print_pod_list(matches)
+        sys.exit(1)
+    # Checked only now, so --list works on machines without kubectl
+    require_kubectl()
+    rule = "=" * 100
+    first_failure = 0
+    for step, state, pod in matches:
+        if several:
+            # Banner separating one pod's logs from the next
+            print()
+            print(rule)
+            print(f"Step:  {step}")
+            print(f"State: {state}")
+            print(f"Pod:   {pod}")
+            print(rule)
+        status = fetch_logs(pod, opts.tail, opts.previous, opts.follow)
+        # Remember only the first non-zero status; later ones do not replace it
+        first_failure = first_failure or status
+    sys.exit(first_failure)
+if __name__ == "__main__":
+    main()

package/teams/fhr-ai-team/skills/ml-tooling-dev/scripts/kf_query.py ADDED Viewed

@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""
+Query the dev Kubeflow API for run details, step statuses, and pod names.
+Usage:
+    python3 kf_query.py <run_id>                          # Show all step statuses + pod names
+    python3 kf_query.py <run_id> --failed                 # Show only failed/running steps
+    python3 kf_query.py <run_id> --step <name>            # Filter steps by name
+    python3 kf_query.py --list                            # List recent runs
+    python3 kf_query.py --list --experiment <name>        # List runs in experiment
+    python3 kf_query.py --experiments                     # List all experiments
+    python3 kf_query.py --pipelines                       # List all pipelines + latest version_name
+    python3 kf_query.py --pipeline-versions <name>        # List all versions of a pipeline
+"""
+import argparse
+import json
+import sys
+from urllib.request import urlopen
+from urllib.parse import urlencode
+from urllib.error import URLError
+# Address of the dev Kubeflow deployment this script queries.
+KF_HOST = "http://10.11.96.10"
+# Root of the pipeline v2beta1 REST API on that host.
+API_V2 = KF_HOST + "/pipeline/apis/v2beta1"
+# Steps that are internal KFP plumbing; hidden by default.
+# is_internal() compares these against the lowercased display name.
+INTERNAL_PREFIXES = (
+    "generate-",
+    "get-resource-request",
+    "create-config-map",
+    "compute-",
+    "generate-env-variables",
+    "spark-history",
+    "dataproc-create",
+    "generate-dataproc",
+    "exit-handler",
+    "send-workflow-notification",
+    "root",
+    "condition-",
+    "retrieve-item-data",
+)
+def get(url: str, params: dict = None) -> dict:
+    """GET *url* and return its body decoded as JSON.
+    *params*, when non-empty, is URL-encoded and appended as the query string.
+    Every failure is fatal: a message is printed to stderr and the process
+    exits with status 1 (callers may catch SystemExit to continue).
+    """
+    if params:
+        url = f"{url}?{urlencode(params)}"
+    try:
+        with urlopen(url, timeout=15) as r:
+            body = r.read()
+    except OSError as e:
+        # URLError and HTTPError are OSError subclasses; so is a socket timeout
+        # raised while the body is being read.
+        print(f"[ERROR] Cannot reach {url}: {e}", file=sys.stderr)
+        sys.exit(1)
+    try:
+        return json.loads(body)
+    except ValueError as e:
+        # json.JSONDecodeError is a ValueError; without this an HTML error page
+        # or an empty body would surface as a raw traceback.
+        print(f"[ERROR] Invalid JSON from {url}: {e}", file=sys.stderr)
+        sys.exit(1)
+def is_internal(name: str) -> bool:
+    """Return True when *name*, lowercased, starts with any entry of INTERNAL_PREFIXES."""
+    # str.startswith accepts a tuple, so one call tests every prefix
+    return name.lower().startswith(INTERNAL_PREFIXES)
+def list_experiments() -> None:
+    """Print the ID and display name of up to 50 experiments, requested newest first."""
+    query = {"page_size": 50, "sort_by": "created_at desc"}
+    reply = get(f"{API_V2}/experiments", query)
+    print(f"{'Experiment ID':<38}  {'Name'}")
+    print("-" * 90)
+    for entry in reply.get("experiments", []):
+        print(f"{entry['experiment_id']:<38}  {entry['display_name']}")
+def list_runs(experiment_name: str = None) -> None:
+    """Print up to 20 runs, requested newest first, optionally limited to one experiment.
+    experiment_name: case-insensitive substring of an experiment's display name.
+        The first match among up to 200 experiments is used. When nothing
+        matches, an error is written to stderr and the process exits with
+        status 1.
+    """
+    params = {"page_size": 20, "sort_by": "created_at desc"}
+    if experiment_name:
+        exps = get(f"{API_V2}/experiments", {"page_size": 200})
+        needle = experiment_name.lower()
+        exp_id = None
+        for e in exps.get("experiments", []):
+            # .get: a record without a display_name must not abort the search
+            display = e.get("display_name", "")
+            if needle in display.lower():
+                exp_id = e.get("experiment_id")
+                print(f"Experiment: {display}")
+                break
+        if not exp_id:
+            # Errors go to stderr, like every other [ERROR] in this script
+            print(f"[ERROR] Experiment containing '{experiment_name}' not found.", file=sys.stderr)
+            sys.exit(1)
+        params["experiment_id"] = exp_id
+    data = get(f"{API_V2}/runs", params)
+    runs = data.get("runs", [])
+    print(f"\n{'Run ID':<38}  {'State':<12}  {'Created':<22}  Name")
+    print("-" * 110)
+    for run in runs:
+        run_id = run.get("run_id", "")
+        state = run.get("state", "")
+        # Keep "YYYY-MM-DDTHH:MM:SS" and replace the 'T' with a space
+        created = run.get("created_at", "")[:19].replace("T", " ")
+        name = run.get("display_name", "")
+        print(f"{run_id:<38}  {state:<12}  {created:<22}  {name}")
+def show_run(run_id: str, step_filter: str = None, failed_only: bool = False, show_all: bool = False) -> None:
+    """Print one run's summary, its runtime parameters, and a table of its steps.
+    step_filter: keep only steps whose display name contains this text (case-insensitive).
+    failed_only: keep only steps in state FAILED or RUNNING.
+    show_all:    also show KFP-internal steps; without it an internal step is
+                 shown only when it FAILED.
+    Runtime parameters are printed only when neither step_filter nor
+    failed_only is set. Tasks sharing a display name collapse to the one with
+    the most severe state, and rows are ordered by start_time.
+    """
+    def short_ts(raw: str) -> str:
+        # Keep "YYYY-MM-DDTHH:MM:SS" and replace the 'T' with a space
+        return raw[:19].replace("T", " ")
+    run = get(f"{API_V2}/runs/{run_id}")
+    name = run.get("display_name", "")
+    state = run.get("state", "Unknown")
+    created = short_ts(run.get("created_at", ""))
+    finished_raw = run.get("finished_at", "")
+    # A missing value or a 1970 timestamp is shown as N/A (presumably an
+    # epoch-zero placeholder for "not finished" - confirm against the API)
+    if finished_raw and not finished_raw.startswith("1970"):
+        finished = short_ts(finished_raw)
+    else:
+        finished = "N/A"
+    error = run.get("error", {}).get("message", "")
+    print(f"Run:      {name}")
+    print(f"ID:       {run_id}")
+    print(f"State:    {state}")
+    print(f"Created:  {created}")
+    print(f"Finished: {finished}")
+    if error:
+        print(f"Error:    {error}")
+    print()
+    unfiltered = not step_filter and not failed_only
+    # Show runtime params (the job config) only for an unfiltered view
+    runtime_params = run.get("runtime_config", {}).get("parameters", {})
+    if runtime_params and unfiltered:
+        print("JOB CONFIG PARAMS:")
+        for key, value in runtime_params.items():
+            shown = json.dumps(value) if isinstance(value, dict) else value
+            print(f"  {key}: {shown}")
+        print()
+    tasks = run.get("run_details", {}).get("task_details", [])
+    if not tasks:
+        print("No task details available (run may still be initializing).")
+        return
+    # Lower number = more severe; states missing from the table rank 99
+    severity = {"FAILED": 0, "RUNNING": 1, "SUCCEEDED": 2, "SKIPPED": 3, "UNKNOWN": 4}
+    def rank(task: dict) -> int:
+        return severity.get(task.get("state", "UNKNOWN"), 99)
+    # One task per display name: the first seen, unless a later one is more severe
+    chosen: dict = {}
+    for task in tasks:
+        label = task.get("display_name", "")
+        if not label:
+            continue
+        kept = chosen.get(label)
+        if kept is None or rank(task) < rank(kept):
+            chosen[label] = task
+    print(f"{'State':<12}  {'Step Name':<55}  Pod(s)")
+    print("-" * 120)
+    wanted = step_filter.lower() if step_filter else None
+    for label, task in sorted(chosen.items(), key=lambda item: item[1].get("start_time", "")):
+        task_state = task.get("state", "UNKNOWN")
+        if failed_only and task_state not in ("FAILED", "RUNNING"):
+            continue
+        if wanted and wanted not in label.lower():
+            continue
+        # Internal steps stay hidden unless they failed or --all was given
+        if is_internal(label) and task_state != "FAILED" and not show_all:
+            continue
+        pod_names = [child["pod_name"] for child in task.get("child_tasks", []) if child.get("pod_name")]
+        # At most the first two pod names are shown per step
+        print(f"{task_state:<12}  {label:<55}  {', '.join(pod_names[:2])}")
+    if unfiltered and not show_all:
+        print("\n(Use --all to show internal KFP plumbing steps)")
+def list_pipelines() -> None:
+    """Print up to 100 pipelines with the display name of each one's newest version.
+    One extra request is made per pipeline. When that request fails, get() has
+    already reported the error and the version is shown as "?"; a pipeline
+    with no versions shows "N/A". The version name is what config files need.
+    """
+    listing = get(f"{API_V2}/pipelines", {"page_size": 100, "sort_by": "created_at desc"})
+    print(f"{'Pipeline Name':<55}  {'Latest version_name'}")
+    print("-" * 85)
+    newest_only = {"page_size": 1, "sort_by": "created_at desc"}
+    for pipeline in listing.get("pipelines", []):
+        title = pipeline.get("display_name", "")
+        pipeline_id = pipeline.get("pipeline_id", "")
+        try:
+            reply = get(f"{API_V2}/pipelines/{pipeline_id}/versions", newest_only)
+        except SystemExit:
+            # get() exits on failure; swallow that so one bad pipeline does not end the listing
+            latest = "?"
+        else:
+            found = reply.get("pipeline_versions", [])
+            latest = found[0].get("display_name", "N/A") if found else "N/A"
+        print(f"{title:<55}  {latest}")
+def list_pipeline_versions(pipeline_name: str) -> None:
+    """List up to 20 versions, requested newest first, of every pipeline matching *pipeline_name*.
+    Matching is a case-insensitive substring test on the display name over up
+    to 100 pipelines. When nothing matches, an error is written to stderr and
+    the process exits with status 1.
+    """
+    data = get(f"{API_V2}/pipelines", {"page_size": 100})
+    pipelines = data.get("pipelines", [])
+    needle = pipeline_name.lower()
+    matched = [p for p in pipelines if needle in p.get("display_name", "").lower()]
+    if not matched:
+        # Errors go to stderr, like every other [ERROR] in this script
+        print(f"[ERROR] No pipeline found matching '{pipeline_name}'", file=sys.stderr)
+        sys.exit(1)
+    for p in matched:
+        pid = p.get("pipeline_id", "")
+        pname = p.get("display_name", "")
+        print(f"\nPipeline: {pname}  (ID: {pid})")
+        vdata = get(f"{API_V2}/pipelines/{pid}/versions", {"page_size": 20, "sort_by": "created_at desc"})
+        versions = vdata.get("pipeline_versions", [])
+        print(f"  {'version_name':<20}  {'Version ID':<38}  Created")
+        print("  " + "-" * 80)
+        for v in versions:
+            vname = v.get("display_name", "")
+            vid = v.get("pipeline_version_id", "")
+            # Keep "YYYY-MM-DDTHH:MM:SS" and replace the 'T' with a space
+            created = v.get("created_at", "")[:19].replace("T", " ")
+            print(f"  {vname:<20}  {vid:<38}  {created}")
+def main():
+    """Parse the command line and run exactly one query mode.
+    Precedence when several modes are requested: --pipelines,
+    --pipeline-versions, --experiments, --list (or --experiment without a run
+    ID), then a run ID. With no mode at all the help text is printed.
+    """
+    # Interpolate KF_HOST so the help text cannot drift from the host actually queried
+    parser = argparse.ArgumentParser(description=f"Query dev Kubeflow API ({KF_HOST})")
+    parser.add_argument("run_id", nargs="?", help="Kubeflow run ID")
+    parser.add_argument("--step", help="Filter steps by name (partial match)")
+    parser.add_argument("--failed", action="store_true", help="Show only failed/running steps")
+    parser.add_argument("--all", action="store_true", dest="show_all", help="Show all steps incl. KFP internals")
+    parser.add_argument("--list", action="store_true", help="List recent runs")
+    parser.add_argument("--experiment", help="Experiment name filter")
+    parser.add_argument("--experiments", action="store_true", help="List all experiments")
+    parser.add_argument("--pipelines", action="store_true", help="List all pipelines + latest version_name")
+    parser.add_argument("--pipeline-versions", metavar="NAME", help="List all versions of a pipeline (partial name match)")
+    args = parser.parse_args()
+    if args.pipelines:
+        list_pipelines()
+    elif args.pipeline_versions:
+        list_pipeline_versions(args.pipeline_versions)
+    elif args.experiments:
+        list_experiments()
+    elif args.list or (not args.run_id and args.experiment):
+        # --experiment on its own implies --list
+        list_runs(args.experiment)
+    elif args.run_id:
+        show_run(args.run_id, step_filter=args.step, failed_only=args.failed, show_all=args.show_all)
+    else:
+        parser.print_help()
+if __name__ == "__main__":
+    main()

package/teams/fhr-ai-team/skills/ml-tooling-dev/scripts/kf_wait.py ADDED Viewed

@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+"""
+Poll a Kubeflow run step until it reaches a target state.
+Usage:
+    kf_wait.py <run_id> --step <name> --state RUNNING [--timeout 600] [--interval 10]
+States: PENDING, RUNNING, SUCCEEDED, FAILED, SKIPPED, CANCELLED
+Exit codes:
+    0   Step reached the target state
+    1   Run not found, or step terminated in a different state than the target
+    2   API unreachable at a poll made once --timeout had elapsed
+    124 Timeout: target state never observed within --timeout
+If multiple steps match --step, the most-severe state wins:
+    FAILED > RUNNING > SUCCEEDED > SKIPPED > CANCELLED > PENDING
+A single FAILED match aborts immediately. Terminal states (SKIPPED, CANCELLED) out-rank
+PENDING, so a SKIPPED-ended branch is reported as terminal and triggers fail-fast when
+the target is non-terminal, instead of being masked by leftover PENDING entries.
+"""
+import argparse
+import json
+import sys
+import time
+from urllib.error import HTTPError, URLError
+from urllib.parse import urlencode
+from urllib.request import urlopen
+# Address of the Kubeflow deployment this script polls.
+KF_HOST = "http://10.11.96.10"
+# Root of the pipeline v2beta1 REST API on that host.
+API_V2 = KF_HOST + "/pipeline/apis/v2beta1"
+# Values accepted by --state.
+VALID_STATES = ["PENDING", "RUNNING", "SUCCEEDED", "FAILED", "SKIPPED", "CANCELLED"]
+# Every valid state other than PENDING and RUNNING is treated as terminal.
+TERMINAL_STATES = {state for state in VALID_STATES if state not in ("PENDING", "RUNNING")}
+# Severity ranking (lower wins) used when several task_details match the same
+# step; KFP v2 returns duplicates for templated steps.
+# FAILED ranks first so a failure aborts fast. The terminal states SKIPPED and
+# CANCELLED out-rank PENDING so a SKIPPED-ended branch is reported as terminal,
+# not "still pending".
+STATE_PRIORITY = {
+    state: rank
+    for rank, state in enumerate(
+        ("FAILED", "RUNNING", "SUCCEEDED", "SKIPPED", "CANCELLED", "PENDING")
+    )
+}
+# Anything unrecognised sorts after every known state.
+STATE_PRIORITY["UNKNOWN"] = 99
+def get(url: str, params: dict = None):
+    """GET *url* (plus optional query *params*) without ever raising on I/O errors.
+    Returns the parsed JSON body on success, the sentinel '__NOT_FOUND__' on
+    HTTP 404, and None for everything treated as transient: other HTTP status
+    codes, network failures, and bodies that are not valid JSON.
+    """
+    if params:
+        url = "?".join((url, urlencode(params)))
+    try:
+        with urlopen(url, timeout=15) as response:
+            payload = response.read()
+            return json.loads(payload)
+    except HTTPError as err:
+        # Only 404 is definitive; e.g. 502/503 are worth retrying
+        return "__NOT_FOUND__" if err.code == 404 else None
+    except (URLError, json.JSONDecodeError, OSError):
+        # Network unreachable, socket-level error, or an unparseable body
+        return None
+def find_step_state(run_id: str, step_filter: str):
+    """Fetch the run and report the state of the step(s) matching *step_filter*.
+    Matching is a case-insensitive substring test on display_name. When
+    several tasks match, the one ranked lowest by STATE_PRIORITY wins (the
+    first such task on ties).
+    Returns (display_name, state) for the winning task, or:
+      (None, None)              run reachable, but no step matches yet
+      (None, "__NOT_FOUND__")   HTTP 404, or an error envelope without run_id
+      (None, "__UNREACHABLE__") transient API failure
+    """
+    payload = get(f"{API_V2}/runs/{run_id}")
+    if payload is None:
+        return None, "__UNREACHABLE__"
+    if payload == "__NOT_FOUND__":
+        return None, "__NOT_FOUND__"
+    # Some KFP versions return an error envelope (HTTP 200) instead of 404
+    is_error_envelope = (
+        isinstance(payload, dict) and "run_id" not in payload and "error" in payload
+    )
+    if is_error_envelope:
+        return None, "__NOT_FOUND__"
+    candidates = []
+    for task in payload.get("run_details", {}).get("task_details", []):
+        label = task.get("display_name")
+        if label and step_filter.lower() in label.lower():
+            candidates.append(task)
+    if not candidates:
+        return None, None
+    # min() returns the first task with the lowest rank, i.e. the most severe state
+    winner = min(candidates, key=lambda task: STATE_PRIORITY.get(task.get("state", "UNKNOWN"), 99))
+    return winner.get("display_name"), winner.get("state", "UNKNOWN")
+def main():
+    """Poll one step of a run until it reaches --state, then exit with a status code.
+    Exit codes:
+      0    the step reached the target state
+      1    run not found, or the step ended in a terminal state other than the target
+      2    the API was unreachable at a poll made once --timeout had elapsed
+           (argparse also uses 2 for invalid arguments)
+      124  the target state was not observed within --timeout
+    The wait between polls is capped at the time left, so the final poll
+    happens at the deadline rather than up to one --interval after it.
+    """
+    parser = argparse.ArgumentParser(
+        description=f"Wait for a Kubeflow run step to reach a target state ({KF_HOST})",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("run_id", help="Kubeflow run ID")
+    parser.add_argument("--step", required=True, help="Step name (partial match)")
+    parser.add_argument(
+        "--state",
+        required=True,
+        choices=VALID_STATES,
+        type=str.upper,
+        help="Target state",
+    )
+    parser.add_argument(
+        "--timeout", type=int, default=600,
+        help="Maximum wait time in seconds (default: 600)",
+    )
+    parser.add_argument(
+        "--interval", type=int, default=10,
+        help="Poll interval in seconds (default: 10)",
+    )
+    parser.add_argument("--quiet", action="store_true", help="Suppress progress output")
+    args = parser.parse_args()
+    # time.sleep() raises ValueError on a negative value; reject it as a usage error instead
+    if args.interval < 0:
+        parser.error("--interval must not be negative")
+    if args.timeout < 0:
+        parser.error("--timeout must not be negative")
+    target = args.state
+    started = time.monotonic()
+    deadline = started + args.timeout
+    last_status = ""
+    def pause() -> None:
+        # Never sleep past the deadline
+        remaining = deadline - time.monotonic()
+        time.sleep(max(0.0, min(args.interval, remaining)))
+    while True:
+        name, state = find_step_state(args.run_id, args.step)
+        now = time.monotonic()
+        elapsed = int(now - started)
+        timed_out = now >= deadline
+        if state == "__NOT_FOUND__":
+            print(f"[ERROR] Run not found: {args.run_id}", file=sys.stderr)
+            sys.exit(1)
+        if state == "__UNREACHABLE__":
+            if not args.quiet:
+                print(f"[{elapsed}s] API unreachable, retrying...")
+            # Bail only if we've used up the budget
+            if timed_out:
+                print(
+                    f"[ERROR] Kubeflow API unreachable for the full {args.timeout}s window",
+                    file=sys.stderr,
+                )
+                sys.exit(2)
+            pause()
+            continue
+        if state == target:
+            if not args.quiet:
+                print(f"[OK] Step {name!r} reached state {target} after {elapsed}s")
+            sys.exit(0)
+        # Terminal but wrong: fail fast, do not keep polling
+        if state in TERMINAL_STATES:
+            if target in TERMINAL_STATES:
+                detail = f"target was {target}"
+            else:
+                detail = f"will never reach {target}"
+            print(
+                f"[ERROR] Step {name!r} terminated as {state}, {detail}",
+                file=sys.stderr,
+            )
+            sys.exit(1)
+        if timed_out:
+            current = state if name else "(no matching step yet)"
+            print(
+                f"[TIMEOUT] Step {args.step!r} did not reach {target} within "
+                f"{args.timeout}s. Current: {current}",
+                file=sys.stderr,
+            )
+            sys.exit(124)
+        if not args.quiet:
+            shown = name or args.step
+            current = state if name else "not yet present"
+            status = f"[{elapsed}s] {shown}: {current} -> {target}"
+            # NOTE: elapsed is part of the string, so this only suppresses
+            # repeats that fall within the same second
+            if status != last_status:
+                print(status)
+                last_status = status
+        pause()
+if __name__ == "__main__":
+    main()