PyPI - ellf-cli - Versions diffs - 5.0.8__tar.gz → 5.0.14__tar.gz - Mend

ellf-cli 5.0.8 (tar.gz) → 5.0.14 (tar.gz)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (186) hide show

{ellf_cli-5.0.8/ellf_cli.egg-info → ellf_cli-5.0.14}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: ellf-cli
-Version: 5.0.8
+Version: 5.0.14
 Summary: Ellf Command Line Interface
 Home-page: https://prodi.gy
 Author: ExplosionAI GmbH

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/about.json RENAMED Viewed

@@ -1,7 +1,7 @@
 {
   "title": "Ellf CLI",
   "name": "ellf-cli",
-  "version": "5.0.8",
+  "version": "5.0.14",
   "summary": "Ellf Command Line Interface",
   "uri": "https://prodi.gy",
   "prog": "ellf",

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/auth.py RENAMED Viewed

@@ -145,6 +145,9 @@ class AuthState(Protocol):
     @property
     def org_id(self) -> UUID: ...
+    @property
+    def user_id(self) -> UUID: ...
     @property
     def pam_host(self) -> str: ...
@@ -269,6 +272,21 @@ class AuthStateImpl:
             self._org_id = orgs[0].id
         return self._org_id
+    @property
+    def user_id(self) -> UUID:
+        """Extract the current user's UUID from the PAM api token.
+        Decoded from the JWT's ``uid`` claim (set by pam.tokens.encode_pam_token).
+        Reading from the token avoids a /v1/user/read round-trip for the
+        common case where a caller just wants to associate state with the
+        current user — e.g. persisting last_cluster_id after `clusters use`.
+        """
+        token = self.get_api_token()
+        uid = token.header.get("uid")
+        if uid is None:
+            raise EllfError(message="PAM token missing uid claim")
+        return UUID(uid)
     @property
     def pam_host(self) -> str:
         return self.pam_url.netloc

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/commands/clusters.py RENAMED Viewed

@@ -33,6 +33,7 @@ from ..ui import print_info_table, print_mutation_result, print_table_with_selec
 from ..util import URL
 from ._cluster_select import select_cluster
 from ._state import get_auth_state, get_root_cfg, get_saved_settings
+from .general import persist_last_active_context
 def uuid7() -> uuid.UUID:
@@ -156,6 +157,9 @@ def use(
     settings.update("cluster_id", chosen.id)
     settings.save(get_root_cfg().saved_settings_path)
     auth.set_active_cluster(chosen.id, broker_url)
+    # Mirror to the user's PAM record so the web app — and other CLI
+    # sessions — see the same active cluster after an org switch.
+    persist_last_active_context(auth, last_cluster_id=chosen.id)
     print_mutation_result(
         {
             "status": "ok",

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/commands/config.py RENAMED Viewed

@@ -16,7 +16,8 @@ from ..query import (
 )
 from ..ui import print_mutation_result
 from ..util import URL
-from ._state import get_root_cfg, get_saved_settings
+from ._state import get_auth_state, get_root_cfg, get_saved_settings
+from .general import persist_last_active_context
 @cli.subcommand(
@@ -53,6 +54,9 @@ def project(name_or_id: Union[str, UUID], as_json: bool = False) -> UUID:
     settings = get_saved_settings()
     settings.update("project", project_id)
     settings.save(root_cfg.saved_settings_path)
+    # Mirror to the user's PAM record so the web app — and other CLI
+    # sessions — see the same active project after an org switch.
+    persist_last_active_context(get_auth_state(), last_project_id=project_id)
     print_mutation_result(
         {"status": "ok", "project_id": str(project_id)},
         Messages.T019.format(noun="project", name=project_id),

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/commands/general.py RENAMED Viewed

@@ -114,9 +114,36 @@ def _select_and_persist_cluster(
     settings.update("cluster_id", chosen.id)
     settings.save(get_root_cfg().saved_settings_path)
     auth.set_active_cluster(chosen.id, broker_url)
+    persist_last_active_context(auth, last_cluster_id=chosen.id)
     return chosen.id
+def persist_last_active_context(
+    auth: AuthState,
+    *,
+    last_cluster_id: Optional[UUID] = None,
+    last_project_id: Optional[UUID] = None,
+) -> None:
+    """Mirror local cluster/project state into the user's PAM record.
+    The web app reads ``User.last_cluster_id`` / ``last_project_id`` to
+    restore context after org switches, and the CLI persists here so a
+    web session that follows a ``clusters use`` lands on the same
+    cluster.
+    """
+    from ellf_pam_sdk.models import UserUpdating
+    if last_cluster_id is None and last_project_id is None:
+        return
+    auth.client.user.update(
+        UserUpdating(
+            id=auth.user_id,
+            last_cluster_id=last_cluster_id,
+            last_project_id=last_project_id,
+        )
+    )
 @cli.command("info", field=Arg(help=Messages.select_field))
 def info(field: Optional[Literal["config-dir", "code", "defaults"]] = None) -> Any:
     """Print information about the CLI"""

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/ellf.json RENAMED Viewed

@@ -1,7 +1,7 @@
 {
   "prog": "ellf",
   "help": "Ellf Command Line Interface.",
-  "version": "5.0.7",
+  "version": "5.0.13",
   "extra_key": "_extra",
   "commands": {
     "actions": {

ellf_cli-5.0.14/ellf_cli/ellf_skills/skills/ellf-annotate.assistant/SKILL.md ADDED Viewed

@@ -0,0 +1,170 @@
+---
+name: ellf-annotate
+description: "Prepares annotation for launch in the Ellf cluster: audits readiness, selects the right recipe, and resolves arguments. Delegates to `/ellf-ops` to actually create the task (and optionally start it) once the audit passes. Use when the user is setting up annotation from scratch, choosing the right built-in recipe, verifying readiness, or planning an annotation agent — not for `run X` / `start Y` requests on a known recipe, which go directly to `/ellf-ops`. Use `/ellf-handoff` when a new custom recipe is needed (routes the implementation to the coding agent), and `/ellf-project` when broader project planning is required."
+argument-hint: "[describe what you want to annotate]"
+---
+# Prepare Annotation In Ellf
+Help the user get from an annotation-ready plan to a running annotation task.
+$ARGUMENTS
+## Your role
+You are responsible for:
+- checking that annotation is ready to launch (the audit)
+- choosing the right built-in Ellf annotation and/or agent recipe
+- deciding whether an existing custom recipe is sufficient
+- deciding when a new custom recipe is required
+- resolving the natural-scalar arguments the user can answer in chat
+- delegating to `/ellf-ops` to actually create (and optionally start) the task
+You do not call `*_create` tools yourself — that is `/ellf-ops`'s job. After
+the audit and arg resolution, delegate. You also do not implement new recipes.
+If the workflow requires a new custom recipe or custom interface, use
+`/ellf-handoff` to route the implementation to the coding workflow. If broader
+methodology or schema work is needed, use `/ellf-project`.
+**Never narrate routing or argument inference to the user.** The user does
+not need to know that you're delegating to `/ellf-ops`, what `auto_start`
+value you inferred from their verb, or which cli-name you chose for which
+field. Say only what the user can act on: the outcome, the next step, the
+link. The audit summary is useful when something failed and the user needs
+to fix it; otherwise skip it and let the create tool's artifact (card,
+form link, or prerequisite link) do the talking.
+## Required readiness audit
+Before choosing a recipe or building a launch spec, read:
+- `${CLAUDE_SKILL_DIR}/references/annotation_audit.md`
+- `${CLAUDE_SKILL_DIR}/references/builtin_ellf_annotation_recipes.md`
+The audit exists because launching with a poorly designed schema or a
+mismatched recipe wastes annotation effort and produces training data the
+model can't learn from. Use it to catch problems before you touch the
+platform.
+Do not launch until you have confirmed:
+- the annotation objective is clear
+- the schema or review target is stable enough
+- the recipe choice actually matches the task
+- the input data is ready
+- the target dataset is clear
+If the audit surfaces a problem:
+- methodological issues (schema design, task decomposition) → route to `/ellf-project`
+- recipe implementation needs (custom UI, routing logic) → route to `/ellf-handoff`
+## Recipe selection
+### Built-in task recipe first
+Prefer a built-in Ellf task recipe whenever it fits cleanly. The built-in
+recipes are documented in
+`${CLAUDE_SKILL_DIR}/references/builtin_ellf_annotation_recipes.md`.
+Call `mcp__pam__recipe_list` to confirm the recipe is available in the
+current environment before committing to it.
+### Existing custom recipe
+If the user names an existing cluster recipe and it matches the workflow,
+use it.
+### New custom recipe required
+If the audit shows the user needs a custom interface, routing logic, or
+annotation flow that built-ins cannot express cleanly:
+- do not force a bad built-in fit
+- use `/ellf-handoff` to assign custom recipe implementation to the coding agent
+- describe what the custom recipe must do
+## Annotation agents
+If the user wants automated annotation:
+- first ensure the base task is methodologically sound
+- prepare the task spec first
+- then identify an annotation-capable `agent_recipe` and prepare its args
+- both task and agent get created via `/ellf-ops` (separate `*_create` calls,
+  then `mcp__pam__task_assign_bot` to attach the agent to the task)
+Do not treat the agent as a replacement for task setup. The task is the
+base annotation workflow.
+## Recipe arguments
+Once the recipe is selected:
+- call `mcp__pam__recipe_list` to find the recipe ID
+- call `mcp__pam__recipe_schema` with the recipe ID
+- treat the returned field spec as the authoritative source for: exact arg keys, types, required vs optional fields, union variants, and cli-name remappings
+- fill args from context and the project plan where possible
+- ask only for natural-scalar values the user can answer in a sentence (a name, a language, a label list); object-typed args belong to the form
+- the create tool runs validation internally and decides what the user sees — do not narrate the schema response back to the user
+The annotation interface itself (what annotators see) cannot be previewed
+from the assistant. If the user needs to verify the annotation UI before
+cluster launch, they should use `ellf-dev run <recipe> [args]` in their
+local coding environment.
+## Execute via /ellf-ops
+After audit + recipe selection + arg resolution, delegate to `/ellf-ops` to
+create the task (and optionally start it):
+```text
+mcp__pam__task_create(
+  recipe_id=<from recipe_list>,
+  args=<resolved scalar args>,
+  name=<optional; PAM auto-names from recipe + timestamp when omitted>,
+  auto_start=<true if the user said run/start/launch, false for create/save/set up>,
+)
+```
+If an annotation agent was also planned, call `mcp__pam__agent_create(...)`
+for the agent. After the agent is created and started, attach it to the
+task with `mcp__pam__task_assign_bot(task_id=..., agent_id=...)`.
+The create tool's internal validation routes every case to a useful
+user-facing artifact:
+- confirmation card (three buttons: `Create and start` / `Create only` / `Cancel`) on clean validation
+- form-handoff link on missing scalars or complex JSON args
+- missing-prerequisite link on a referenced asset / dataset / secret that doesn't exist on the cluster
+Do not stop early to flag schema issues, preview validation problems, or
+direct the user elsewhere. The create tool produces a more useful artifact
+than your commentary. See `/ellf-ops` for the full create workflow and the
+forbidden patterns ("this won't validate, please fix", "the schema requires
+X, Y, Z", any pre-emptive validation reasoning).
+## After creation
+Once the user confirms the create card (or the form-handoff link is followed
+through to completion), help verify and run any follow-ups:
+- if the task wasn't auto-started (user clicked `Create only`): use `/ellf-ops`
+  start workflow when the user is ready
+- check cluster status with `mcp__cluster__job_status(id="<id>")` to confirm the
+  task is running or healthy
+- provide the task link and tell the user where to open it in the app
+- if an agent was added, confirm it is assigned with `mcp__pam__task_bots_read`
+If startup or assignment fails:
+- inspect the cluster error or logs with `mcp__cluster__job_logs(id="<id>")` /
+  `mcp__cluster__job_errors(id="<id>")`
+- if this is an operational problem, use `/ellf-monitor` (or consult `/ellf-ops`)
+- if this is a recipe-capability problem, use `/ellf-handoff`
+When you finish:
+- state whether the setup passed the readiness audit
+- state which recipe path you selected
+- summarize the launch outcome (created / created and started / handed off to form / blocked on missing prerequisite)
+- if not launched, explain exactly what is missing and where you are routing the user next
+## Reference files
+| File | What it covers | When to read |
+|------|---------------|--------------|
+| `${CLAUDE_SKILL_DIR}/references/annotation_audit.md` | Readiness checklist: objective, schema, recipe fit, data, dataset | Before every launch |
+| `${CLAUDE_SKILL_DIR}/references/builtin_ellf_annotation_recipes.md` | Built-in Ellf task and agent recipes with supported workflows | Recipe selection |

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/ellf_skills/skills/ellf-annotate.coding/SKILL.md RENAMED Viewed

@@ -183,3 +183,11 @@ When you finish:
 - state which runtime target will be used
 - summarize the launch spec clearly
 - if not launching, explain exactly what is missing and where you are routing the user next
+## Reference files
+| File | What it covers | When to read |
+|------|---------------|--------------|
+| `${CLAUDE_SKILL_DIR}/references/annotation_audit.md` | Readiness checklist: objective, schema, recipe fit, data, dataset | Before every launch |
+| `${CLAUDE_SKILL_DIR}/references/builtin_ellf_annotation_recipes.md` | Built-in Ellf task and agent recipes with supported workflows | Recipe selection for cluster |
+| `${CLAUDE_SKILL_DIR}/references/builtin_prodigy_recipes.md` | Built-in standalone Prodigy recipes | Recipe selection for local |

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/ellf_skills/skills/ellf-handoff/SKILL.md RENAMED Viewed

@@ -21,6 +21,7 @@ the **AskUserQuestion** tool to ask — do NOT ask in plain text.
 |---|---|
 | **description** | Specific and actionable. Not "implement NER" but "Create a custom ner.correct recipe with Ctrl+Enter keybinding for accept_best." |
 | **context_summary** | 2-3 paragraphs condensing the conversation: what was discussed, decisions made, constraints, label schemes, data formats. |
+| **plan_docs** | List of plan names the coding agent should read for project context. Use `project_plan_list` to see what exists, then pass only the plans relevant to this handoff (e.g. the overview plan plus the component plan the work targets). Leave empty if no plans exist yet. |
 ## Step 2: Create the request
@@ -29,7 +30,8 @@ Call the `todo_create` PAM tool — do NOT use Bash or ellf:
 ```
 todo_create(
   description="<description>",
-  context_summary="<context_summary>"
+  context_summary="<context_summary>",
+  plan_docs=["project_plan", ...]
 )
 ```

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/ellf_skills/skills/ellf-monitor.assistant/SKILL.md RENAMED Viewed

@@ -1,6 +1,6 @@
 ---
 name: ellf-monitor
-description: "Has alert classification (overfitting, plateau, NaN loss, spikes), annotation metrics references, and diagnostic routing for training and annotation jobs. Load this skill to monitor cluster jobs, annotation activity, training progress, or cluster health — it produces structured summaries instead of raw log dumps. Use proactively after launching any job, not just when the user asks. Also trigger on status checks, log inspection, 'how's the task doing', 'what failed', or training metric questions."
+description: "Monitors cluster jobs, annotation activity, training progress, and cluster health — produces structured summaries instead of raw log dumps. Includes alert classification (overfitting, plateau, NaN loss, spikes), annotation metrics, and diagnostic routing. Use proactively after launching any job, not just when the user asks. Also trigger on status checks, log inspection, 'how's the task doing', 'what failed', or training metric questions."
 argument-hint: "[job name, job type, or 'cluster']"
 ---
@@ -29,8 +29,8 @@ If an action is needed:
 ## Tool surface
 Use:
-- `mcp__broker__broker_request` for runtime status, logs, errors, cluster health, and dataset counts
-- PAM read/list tools when you need persisted object details such as task, action, or agent identity
+- The named cluster tools — `job_status`, `job_logs`, `job_errors`, `cluster_status`, `nodes_list`, `worker_types_list`, `dataset_example_count`, `dataset_session_counts` — for runtime state. No free-form proxy; each tool has a typed schema.
+- PAM read/list tools when you need persisted object details such as task, action, or agent identity.
 Do not guess cluster state. Always check.
@@ -44,7 +44,7 @@ Before monitoring, determine what to monitor.
 Use:
 ```text
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/status")
+mcp__cluster__job_status()
 ```
 If there are multiple plausible jobs, use `AskUserQuestion` to let the user choose.
@@ -56,11 +56,11 @@ If there are multiple plausible jobs, use `AskUserQuestion` to let the user choo
 Read:
 - `${CLAUDE_SKILL_DIR}/references/training_monitoring.md`
 - `${CLAUDE_SKILL_DIR}/../ellf-train/references/training_troubleshooting.md`
-Use broker calls such as:
+Use cluster calls such as:
 ```text
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/status")
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/logs", query={"tail_lines": 100})
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/errors")
+mcp__cluster__job_status(id="<id>")
+mcp__cluster__job_logs(id="<id>", tail_lines=100)
+mcp__cluster__job_errors(id="<id>")
 ```
 Your job is to:
@@ -75,19 +75,29 @@ If follow-up action is needed, route to `/ellf-ops` or `/ellf-handoff`.
 ### Annotation tasks
-Use broker calls such as:
+Use cluster calls for job status and per-dataset counts; use PAM for dataset
+discovery:
 ```text
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/status")
-mcp__broker__broker_request(method="GET", path="/api/v1/datasets/example-count", query={"dataset": "<name>"})
+mcp__cluster__job_status(id="<id>")
+mcp__pam__dataset_list(cluster_id="<id>")               # discover datasets (org-wide, registered with PAM)
+mcp__cluster__dataset_example_count(name="<name>")      # total examples in the dataset (per-cluster, via broker)
+mcp__cluster__dataset_session_counts(name="<name>")     # per-annotator breakdown (keys are session_ids; null = bulk import)
 ```
+`dataset_list` is a PAM read (org-wide registry), while the count tools talk to
+the broker because example counts live in the per-cluster Prodigy database.
+All three are read-only and do not require user confirmation. They do not
+read annotation contents — only counts and metadata.
 Read when needed:
 - `${CLAUDE_SKILL_DIR}/references/annotation_metrics.md`
 Report:
 - task state
 - whether the task appears reachable and healthy
-- annotation count
+- annotation count (from `dataset_example_count`)
+- per-annotator activity (from `dataset_session_counts`) when the user asks
+  who annotated what or about active annotators
 - dataset growth
 - whether agent assignment appears to be producing data if applicable
@@ -95,11 +105,11 @@ If useful, combine task status with task detail from PAM reads to confirm the da
 ### Agents
-Use broker calls such as:
+Use cluster calls such as:
 ```text
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/status")
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/logs", query={"tail_lines": 50})
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/errors")
+mcp__cluster__job_status(id="<id>")
+mcp__cluster__job_logs(id="<id>", tail_lines=50)
+mcp__cluster__job_errors(id="<id>")
 ```
 Report:
@@ -114,9 +124,9 @@ If the agent is failing repeatedly, recommend `/ellf-ops` for stop/restart or `/
 For non-training actions:
 ```text
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/status")
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/logs", query={"tail_lines": 50})
-mcp__broker__broker_request(method="GET", path="/api/v1/jobs/{id}/errors")
+mcp__cluster__job_status(id="<id>")
+mcp__cluster__job_logs(id="<id>", tail_lines=50)
+mcp__cluster__job_errors(id="<id>")
 ```
 Report:
@@ -129,9 +139,9 @@ Report:
 When the user asks about the cluster itself, use:
 ```text
-mcp__broker__broker_request(method="GET", path="/api/v1/status")
-mcp__broker__broker_request(method="GET", path="/api/v1/nodes")
-mcp__broker__broker_request(method="GET", path="/api/v1/worker-types")
+mcp__cluster__cluster_status()
+mcp__cluster__nodes_list()
+mcp__cluster__worker_types_list()
 ```
 Report:
@@ -144,7 +154,7 @@ Report:
 Use compact summaries and tables.
-Never dump raw JSON or raw broker responses.
+Never dump raw JSON or raw cluster responses.
 When presenting logs:
 - summarize the important lines
@@ -173,3 +183,11 @@ When you finish, state:
 - the most important evidence
 - whether intervention is needed
 - the next action, if any
+## Reference files
+| File | What it covers | When to read |
+|------|---------------|--------------|
+| `${CLAUDE_SKILL_DIR}/references/training_monitoring.md` | Training log interpretation, alert classification, metric extraction | Training actions |
+| `${CLAUDE_SKILL_DIR}/references/annotation_metrics.md` | Annotation progress signals, dataset growth, annotator activity | Annotation tasks |
+| `${CLAUDE_SKILL_DIR}/../ellf-train.assistant/references/diagnostics.md` | Six problem classes with detection signals and fix guidance | Diagnosing training issues |

{ellf_cli-5.0.8/ellf_cli/ellf_skills/skills/ellf-monitor.coding → ellf_cli-5.0.14/ellf_cli/ellf_skills/skills/ellf-monitor.assistant}/references/annotation_metrics.md RENAMED Viewed

@@ -4,10 +4,13 @@ Use this reference to interpret annotation activity and quality signals.
 ## What to watch
-- Total annotation count
-- Dataset growth over time
-- Number of active annotators or agents
-- Whether a running task is producing new examples
+- Total annotation count — from `mcp__cluster__dataset_example_count`
+- Dataset growth over time — compare counts across checks
+- Number of active annotators or agents — from `mcp__cluster__dataset_session_counts`
+  (each non-null key is a session_id; `null` means examples written directly via
+  `db-in` rather than through an annotation session)
+- Whether a running task is producing new examples — combine job status with
+  example count over time
 ## Warning signals

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/ellf_skills/skills/ellf-monitor.assistant/references/training_monitoring.md RENAMED Viewed

@@ -4,7 +4,7 @@ How to monitor training jobs from the web assistant and interpret their signals.
 ## What the web assistant can do
-Use broker-backed status, logs, and errors to determine:
+Use cluster-backed status, logs, and errors to determine:
 - whether the training action is running
 - whether it completed or failed
 - whether logs show loss, score, or alert-like signals

{ellf_cli-5.0.8 → ellf_cli-5.0.14}/ellf_cli/ellf_skills/skills/ellf-monitor.coding/SKILL.md RENAMED Viewed

@@ -1,6 +1,6 @@
 ---
 name: ellf-monitor
-description: "Has a structured training check script, alert classification (overfitting, plateau, NaN loss, spikes), annotation metrics references, and diagnostic routing. Load this skill to monitor Ellf jobs, local Prodigy servers, training progress, or cluster health — it keeps raw logs out of context and produces structured summaries. Use proactively after launching any job, not just when the user asks. Also trigger on status checks, log inspection, 'how's the task doing', 'what failed', or training metric questions."
+description: "Monitors Ellf jobs, local Prodigy servers, training progress, and cluster health — keeps raw logs out of context and produces structured summaries. Includes a structured training check script, alert classification (overfitting, plateau, NaN loss, spikes), annotation metrics, and diagnostic routing. Use proactively after launching any job, not just when the user asks. Also trigger on status checks, log inspection, 'how's the task doing', 'what failed', or training metric questions."
 argument-hint: "[job name, job type, output dir, or 'cluster']"
 ---
@@ -10,6 +10,21 @@ You are the monitoring and diagnosis skill for the coding environment.
 $ARGUMENTS
+## Contents
+- Scope — what this skill does and doesn't do
+- Monitoring surfaces — cluster jobs, local runs, Prodigy servers, cluster health
+- Job discovery — finding the right job to monitor
+- Training monitoring — structured check script, alerts, log interpretation
+- Annotation tasks — task state, dataset growth
+- Agents — state, errors, assignment health
+- Generic actions — state, duration, failures
+- Standalone local Prodigy — process and URL checks
+- Cluster health — connectivity, nodes, worker classes
+- Presenting results — summaries, not raw dumps
+- Continuous monitoring — `/loop` pattern
+- When to escalate — routing to ops, train, project
+- Reference files
 ## Scope
 You monitor and diagnose.
@@ -152,7 +167,7 @@ Use the captured output or process info from the launch context when available.
 # Connectivity and service health
 ellf clusters check
-# Deeper broker-side checks (K8s, NFS, database)
+# Deeper cluster-side checks (K8s, NFS, database)
 ellf clusters check --deep
 # Node capacity and utilization (cpu, memory, gpu, pod count)
@@ -206,3 +221,12 @@ When you finish, state:
 - the key evidence
 - whether intervention is needed
 - the next operational or implementation step
+## Reference files
+| File | What it covers | When to read |
+|------|---------------|--------------|
+| `${CLAUDE_SKILL_DIR}/references/training_monitoring.md` | Training log interpretation, alert classification, metric extraction | Training actions |
+| `${CLAUDE_SKILL_DIR}/references/annotation_metrics.md` | Annotation progress signals, dataset growth, annotator activity | Annotation tasks |
+| `${CLAUDE_SKILL_DIR}/../ellf-train.coding/references/diagnostics.md` | Six problem classes with detection signals and fix guidance | Diagnosing training issues |
+| `${CLAUDE_SKILL_DIR}/../ellf-train.coding/references/training_troubleshooting.md` | Error taxonomy with concrete fixes | Concrete setup or execution errors |

ellf-cli 5.0.8__tar.gz → 5.0.14__tar.gz

ellf-cli 5.0.8 (tar.gz) → 5.0.14 (tar.gz)