nemo-evaluator-launcher 0.1.0rc6__py3-none-any.whl → 0.1.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. nemo_evaluator_launcher/__init__.py +15 -1
  2. nemo_evaluator_launcher/api/functional.py +188 -27
  3. nemo_evaluator_launcher/api/types.py +9 -0
  4. nemo_evaluator_launcher/cli/export.py +131 -12
  5. nemo_evaluator_launcher/cli/info.py +477 -82
  6. nemo_evaluator_launcher/cli/kill.py +5 -3
  7. nemo_evaluator_launcher/cli/logs.py +102 -0
  8. nemo_evaluator_launcher/cli/ls_runs.py +31 -10
  9. nemo_evaluator_launcher/cli/ls_tasks.py +105 -3
  10. nemo_evaluator_launcher/cli/main.py +101 -5
  11. nemo_evaluator_launcher/cli/run.py +153 -30
  12. nemo_evaluator_launcher/cli/status.py +49 -5
  13. nemo_evaluator_launcher/cli/version.py +26 -23
  14. nemo_evaluator_launcher/common/execdb.py +121 -27
  15. nemo_evaluator_launcher/common/helpers.py +213 -33
  16. nemo_evaluator_launcher/common/logging_utils.py +16 -5
  17. nemo_evaluator_launcher/common/printing_utils.py +100 -0
  18. nemo_evaluator_launcher/configs/deployment/generic.yaml +33 -0
  19. nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
  20. nemo_evaluator_launcher/configs/deployment/trtllm.yaml +23 -0
  21. nemo_evaluator_launcher/configs/deployment/vllm.yaml +2 -2
  22. nemo_evaluator_launcher/configs/execution/local.yaml +2 -0
  23. nemo_evaluator_launcher/configs/execution/slurm/default.yaml +19 -4
  24. nemo_evaluator_launcher/executors/base.py +54 -1
  25. nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +60 -5
  26. nemo_evaluator_launcher/executors/lepton/executor.py +240 -101
  27. nemo_evaluator_launcher/executors/lepton/job_helpers.py +15 -11
  28. nemo_evaluator_launcher/executors/local/executor.py +492 -56
  29. nemo_evaluator_launcher/executors/local/run.template.sh +76 -9
  30. nemo_evaluator_launcher/executors/slurm/executor.py +571 -98
  31. nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
  32. nemo_evaluator_launcher/exporters/base.py +9 -0
  33. nemo_evaluator_launcher/exporters/gsheets.py +27 -9
  34. nemo_evaluator_launcher/exporters/local.py +30 -16
  35. nemo_evaluator_launcher/exporters/mlflow.py +245 -74
  36. nemo_evaluator_launcher/exporters/utils.py +139 -184
  37. nemo_evaluator_launcher/exporters/wandb.py +157 -43
  38. nemo_evaluator_launcher/package_info.py +6 -3
  39. nemo_evaluator_launcher/resources/mapping.toml +56 -15
  40. nemo_evaluator_launcher-0.1.41.dist-info/METADATA +494 -0
  41. nemo_evaluator_launcher-0.1.41.dist-info/RECORD +62 -0
  42. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/entry_points.txt +1 -0
  43. nemo_evaluator_launcher-0.1.0rc6.dist-info/METADATA +0 -35
  44. nemo_evaluator_launcher-0.1.0rc6.dist-info/RECORD +0 -57
  45. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/WHEEL +0 -0
  46. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/licenses/LICENSE +0 -0
  47. {nemo_evaluator_launcher-0.1.0rc6.dist-info → nemo_evaluator_launcher-0.1.41.dist-info}/top_level.txt +0 -0
@@ -23,9 +23,11 @@ import os
 import pathlib
 import platform
 import shlex
+import shutil
 import subprocess
 import time
-from typing import List, Optional
+import warnings
+from typing import Iterator, List, Optional, Tuple, Union

 import jinja2
 import yaml
@@ -38,14 +40,19 @@ from nemo_evaluator_launcher.common.execdb import (
     generate_job_id,
 )
 from nemo_evaluator_launcher.common.helpers import (
+    get_api_key_name,
+    get_endpoint_url,
     get_eval_factory_command,
     get_eval_factory_dataset_size_from_run_config,
+    get_health_url,
     get_timestamp_string,
 )
+from nemo_evaluator_launcher.common.logging_utils import logger
 from nemo_evaluator_launcher.common.mapping import (
     get_task_from_mapping,
     load_tasks_mapping,
 )
+from nemo_evaluator_launcher.common.printing_utils import bold, cyan, grey, red
 from nemo_evaluator_launcher.executors.base import (
     BaseExecutor,
     ExecutionState,
@@ -68,12 +75,13 @@ class LocalExecutor(BaseExecutor):
             str: The invocation ID for the evaluation run.

         Raises:
-            NotImplementedError: If deployment is not 'none'.
             RuntimeError: If the run script fails.
         """
-        if cfg.deployment.type != "none":
-            raise NotImplementedError(
-                f"type {cfg.deployment.type} is not implemented -- add deployment support"
+        # Check if docker is available (skip in dry_run mode)
+        if not dry_run and shutil.which("docker") is None:
+            raise RuntimeError(
+                "Docker is not installed or not in PATH. "
+                "Please install Docker to run local evaluations."
             )

         # Generate invocation ID for this evaluation run
@@ -88,12 +96,16 @@ class LocalExecutor(BaseExecutor):
         evaluation_tasks = []
         job_ids = []

-        eval_template = jinja2.Template(
+        run_template = jinja2.Template(
             open(pathlib.Path(__file__).parent / "run.template.sh", "r").read()
         )

         execution_mode = cfg.execution.get("mode", "parallel")
         if execution_mode == "parallel":
+            if cfg.deployment.type != "none":
+                raise ValueError(
+                    f"Execution mode 'parallel' is not supported with deployment type: {cfg.deployment.type}. Use 'sequential' instead."
+                )
             is_execution_mode_sequential = False
         elif execution_mode == "sequential":
             is_execution_mode_sequential = True
@@ -104,20 +116,76 @@ class LocalExecutor(BaseExecutor):
                 )
             )

+        # Will accumulate if any task contains unsafe commands.
+        is_potentially_unsafe = False
+
+        deployment = None
+
         for idx, task in enumerate(cfg.evaluation.tasks):
+            timestamp = get_timestamp_string()
             task_definition = get_task_from_mapping(task.name, tasks_mapping)

+            if cfg.deployment.type != "none":
+                # container name
+                server_container_name = f"server-{task.name}-{timestamp}"
+
+                # health_url
+                health_url = get_health_url(
+                    cfg, get_endpoint_url(cfg, task, task_definition["endpoint_type"])
+                )
+
+                # mounts
+                deployment_mounts_list = []
+                if checkpoint_path := cfg.deployment.get("checkpoint_path"):
+                    deployment_mounts_list.append(f"{checkpoint_path}:/checkpoint:ro")
+                if cache_path := cfg.deployment.get("cache_path"):
+                    deployment_mounts_list.append(f"{cache_path}:/cache")
+                for source_mnt, target_mnt in (
+                    cfg.execution.get("mounts", {}).get("deployment", {}).items()
+                ):
+                    deployment_mounts_list.append(f"{source_mnt}:{target_mnt}")
+
+                # env vars
+                deployment_env_vars = cfg.execution.get("env_vars", {}).get(
+                    "deployment", {}
+                )
+
+                if cfg.deployment.get("env_vars"):
+                    warnings.warn(
+                        "cfg.deployment.env_vars will be deprecated in future versions. "
+                        "Use cfg.execution.env_vars.deployment instead.",
+                        category=DeprecationWarning,
+                        stacklevel=2,
+                    )
+                    deployment_env_vars.update(cfg.deployment["env_vars"])
+
+                command = cfg.deployment.command
+                deployment_extra_docker_args = cfg.execution.get(
+                    "extra_docker_args", ""
+                )
+
+                deployment = {
+                    "container_name": server_container_name,
+                    "image": cfg.deployment.image,
+                    "command": command,
+                    "mounts": deployment_mounts_list,
+                    "env_vars": [f"{k}={v}" for k, v in deployment_env_vars.items()],
+                    "health_url": health_url,
+                    "port": cfg.deployment.port,
+                    "extra_docker_args": deployment_extra_docker_args,
+                }
+
             # Create job ID as <invocation_id>.<n>
             job_id = generate_job_id(invocation_id, idx)
             job_ids.append(job_id)
-            container_name = f"{task.name}-{get_timestamp_string()}"
+            client_container_name = f"client-{task.name}-{timestamp}"

             # collect all env vars
             env_vars = copy.deepcopy(dict(cfg.evaluation.get("env_vars", {})))
             env_vars.update(task.get("env_vars", {}))
-            if cfg.target.api_endpoint.api_key_name:
+            if api_key_name := get_api_key_name(cfg):
                 assert "API_KEY" not in env_vars
-                env_vars["API_KEY"] = cfg.target.api_endpoint.api_key_name
+                env_vars["API_KEY"] = api_key_name

             # check if the environment variables are set
             for env_var in env_vars.values():
@@ -126,8 +194,11 @@ class LocalExecutor(BaseExecutor):
                         f"Trying to pass an unset environment variable {env_var}."
                     )

-            # check if required env vars are defined:
+            # check if required env vars are defined (excluding NEMO_EVALUATOR_DATASET_DIR which is handled separately):
             for required_env_var in task_definition.get("required_env_vars", []):
+                # Skip NEMO_EVALUATOR_DATASET_DIR as it's handled by dataset mounting logic below
+                if required_env_var == "NEMO_EVALUATOR_DATASET_DIR":
+                    continue
                 if required_env_var not in env_vars.keys():
                     raise ValueError(
                         f"{task.name} task requires environment variable {required_env_var}."
@@ -135,28 +206,70 @@ class LocalExecutor(BaseExecutor):
                         f" pair {required_env_var}: YOUR_ENV_VAR_NAME"
                     )

+            # Handle dataset directory mounting if NEMO_EVALUATOR_DATASET_DIR is required
+            dataset_mount_host = None
+            dataset_mount_container = None
+            dataset_env_var_value = None
+            if "NEMO_EVALUATOR_DATASET_DIR" in task_definition.get(
+                "required_env_vars", []
+            ):
+                # Get dataset directory from task config
+                if "dataset_dir" in task:
+                    dataset_mount_host = task["dataset_dir"]
+                else:
+                    raise ValueError(
+                        f"{task.name} task requires a dataset_dir to be specified. "
+                        f"Add 'dataset_dir: /path/to/your/dataset' under the task configuration."
+                    )
+                # Get container mount path (default to /datasets if not specified)
+                dataset_mount_container = task.get("dataset_mount_path", "/datasets")
+                # Set NEMO_EVALUATOR_DATASET_DIR to the container mount path
+                dataset_env_var_value = dataset_mount_container
+
             # format env_vars for a template
-            env_vars = [
+            env_vars_list = [
                 f"{env_var_dst}=${env_var_src}"
                 for env_var_dst, env_var_src in env_vars.items()
             ]

+            # Add dataset env var if needed (directly with value, not from host env)
+            if dataset_env_var_value:
+                env_vars_list.append(
+                    f"NEMO_EVALUATOR_DATASET_DIR={dataset_env_var_value}"
+                )
+
             eval_image = task_definition["container"]
             if "container" in task:
                 eval_image = task["container"]

             task_output_dir = output_dir / task.name
             task_output_dir.mkdir(parents=True, exist_ok=True)
+            eval_factory_command_struct = get_eval_factory_command(
+                cfg, task, task_definition
+            )
+            eval_factory_command = eval_factory_command_struct.cmd
+            # The debug comment for placing into the script and easy debug. Reason
+            # (see `CmdAndReadableComment`) is the current way of passing the command
+            # is base64-encoded config `echo`-ed into file.
+            # TODO(agronskiy): cleaner way is to encode everything with base64, not
+            # some parts (like ef_config.yaml) and just output as logs somewhere.
+            eval_factory_command_debug_comment = eval_factory_command_struct.debug
+            is_potentially_unsafe = (
+                is_potentially_unsafe
+                or eval_factory_command_struct.is_potentially_unsafe
+            )
             evaluation_task = {
+                "deployment": deployment,
                 "name": task.name,
                 "job_id": job_id,
                 "eval_image": eval_image,
-                "container_name": container_name,
-                "env_vars": env_vars,
+                "client_container_name": client_container_name,
+                "env_vars": env_vars_list,
                 "output_dir": task_output_dir,
-                "eval_factory_command": get_eval_factory_command(
-                    cfg, task, task_definition
-                ),
+                "eval_factory_command": eval_factory_command,
+                "eval_factory_command_debug_comment": eval_factory_command_debug_comment,
+                "dataset_mount_host": dataset_mount_host,
+                "dataset_mount_container": dataset_mount_container,
             }
             evaluation_tasks.append(evaluation_task)

@@ -164,10 +277,13 @@ class LocalExecutor(BaseExecutor):
             auto_export_config = cfg.execution.get("auto_export", {})
             auto_export_destinations = auto_export_config.get("destinations", [])

+            extra_docker_args = cfg.execution.get("extra_docker_args", "")
+
             run_sh_content = (
-                eval_template.render(
+                run_template.render(
                     evaluation_tasks=[evaluation_task],
                     auto_export_destinations=auto_export_destinations,
+                    extra_docker_args=extra_docker_args,
                 ).rstrip("\n")
                 + "\n"
             )
@@ -175,9 +291,10 @@ class LocalExecutor(BaseExecutor):
             (task_output_dir / "run.sh").write_text(run_sh_content)

         run_all_sequentially_sh_content = (
-            eval_template.render(
+            run_template.render(
                 evaluation_tasks=evaluation_tasks,
                 auto_export_destinations=auto_export_destinations,
+                extra_docker_args=extra_docker_args,
             ).rstrip("\n")
             + "\n"
         )
@@ -185,6 +302,57 @@ class LocalExecutor(BaseExecutor):
             run_all_sequentially_sh_content
         )

+        if dry_run:
+            print(bold("\n\n=============================================\n\n"))
+            print(bold(cyan(f"DRY RUN: Scripts prepared and saved to {output_dir}")))
+            if is_execution_mode_sequential:
+                print(
+                    cyan(
+                        "\n\n=========== Main script | run_all.sequential.sh =====================\n\n"
+                    )
+                )
+
+                with open(output_dir / "run_all.sequential.sh", "r") as f:
+                    print(grey(f.read()))
+            else:
+                for idx, task in enumerate(cfg.evaluation.tasks):
+                    task_output_dir = output_dir / task.name
+                    print(
+                        cyan(
+                            f"\n\n=========== Task script | {task.name}/run.sh =====================\n\n"
+                        )
+                    )
+                    with open(task_output_dir / "run.sh", "r") as f:
+                        print(grey(f.read()))
+            print(bold("\nTo execute, run without --dry-run"))
+
+            if is_potentially_unsafe:
+                print(
+                    red(
+                        "\nFound `pre_cmd` which carries security risk. When running without --dry-run "
+                        "make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1"
+                    )
+                )
+            return invocation_id
+
+        if is_potentially_unsafe:
+            if os.environ.get("NEMO_EVALUATOR_TRUST_PRE_CMD", "") == "1":
+                logger.warning(
+                    "Found non-empty task commands (e.g. `pre_cmd`) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is set, proceeding with caution."
+                )
+
+            else:
+                logger.error(
+                    "Found non-empty task commands (e.g. `pre_cmd`) and NEMO_EVALUATOR_TRUST_PRE_CMD "
+                    "is not set. This might carry security risk and unstable environments. "
+                    "To continue, make sure you trust the command and set NEMO_EVALUATOR_TRUST_PRE_CMD=1.",
+                )
+                raise AttributeError(
+                    "Untrusted command found in config, make sure you trust and "
+                    "set NEMO_EVALUATOR_TRUST_PRE_CMD=1."
+                )
+
         # Save launched jobs metadata
         db = ExecutionDB()
         for job_id, task, evaluation_task in zip(
@@ -198,74 +366,67 @@ class LocalExecutor(BaseExecutor):
                 executor="local",
                 data={
                     "output_dir": str(evaluation_task["output_dir"]),
-                    "container": evaluation_task["container_name"],
+                    "container": evaluation_task["client_container_name"],
                     "eval_image": evaluation_task["eval_image"],
                 },
                 config=OmegaConf.to_object(cfg),
             )
         )

-        if dry_run:
-            print("\n\n=============================================\n\n")
-            print(f"DRY RUN: Scripts prepared and saved to {output_dir}")
-            if is_execution_mode_sequential:
-                print(
-                    "\n\n =========== Main script | run_all.sequential.sh ===================== \n\n"
-                )
-                with open(output_dir / "run_all.sequential.sh", "r") as f:
-                    print(f.read())
-            else:
-                for idx, task in enumerate(cfg.evaluation.tasks):
-                    task_output_dir = output_dir / task.name
-                    print(
-                        f"\n\n =========== Task script | {task.name}/run.sh ===================== \n\n"
-                    )
-                    with open(task_output_dir / "run.sh", "r") as f:
-                        print(f.read())
-            print("\nTo execute, run without --dry-run")
-            return invocation_id
-
         # Launch bash scripts with Popen for non-blocking execution.
         # To ensure subprocess continues after python exits:
         # - on Unix-like systems, to fully detach the subprocess
         # so it does not die when Python exits, pass start_new_session=True;
-        # - on Widnows use creationflags=subprocess.CREATE_NEW_PROCESS_GROUP flag.
+        # - on Windows use creationflags=subprocess.CREATE_NEW_PROCESS_GROUP flag.
         os_name = platform.system()
+        processes = []
+
         if is_execution_mode_sequential:
             if os_name == "Windows":
-                subprocess.Popen(
+                proc = subprocess.Popen(
                     shlex.split("bash run_all.sequential.sh"),
                     cwd=output_dir,
                     creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
                 )
             else:
-                subprocess.Popen(
+                proc = subprocess.Popen(
                     shlex.split("bash run_all.sequential.sh"),
                     cwd=output_dir,
                     start_new_session=True,
                 )
+            processes.append(("run_all.sequential.sh", proc, output_dir))
         else:
             for task in cfg.evaluation.tasks:
                 if os_name == "Windows":
-                    subprocess.Popen(
+                    proc = subprocess.Popen(
                         shlex.split("bash run.sh"),
                         cwd=output_dir / task.name,
                         creationflags=subprocess.CREATE_NEW_PROCESS_GROUP,
                     )
                 else:
-                    subprocess.Popen(
+                    proc = subprocess.Popen(
                         shlex.split("bash run.sh"),
                         cwd=output_dir / task.name,
                         start_new_session=True,
                     )
+                processes.append((task.name, proc, output_dir / task.name))
+
+        # Wait briefly and check if bash scripts exited immediately (which means error)
+        time.sleep(0.3)

-        print("\nCommands for real-time monitoring:")
+        for name, proc, work_dir in processes:
+            exit_code = proc.poll()
+            if exit_code is not None and exit_code != 0:
+                error_msg = f"Script for {name} exited with code {exit_code}"
+                raise RuntimeError(f"Job startup failed | {error_msg}")
+
+        print(bold(cyan("\nCommands for real-time monitoring:")))
         for job_id, evaluation_task in zip(job_ids, evaluation_tasks):
-            log_file = evaluation_task["output_dir"] / "logs" / "stdout.log"
-            print(f"  tail -f {log_file}")
+            print(f"\n  Job {job_id} ({evaluation_task['name']}):")
+            print(f"    nemo-evaluator-launcher logs {job_id}")

-        print("\nFollow all logs for this invocation:")
-        print(f"  tail -f {output_dir}/*/logs/stdout.log")
+        print(bold(cyan("\nFollow all logs for this invocation:")))
+        print(f"  nemo-evaluator-launcher logs {invocation_id}")

         return invocation_id

@@ -281,8 +442,8 @@ class LocalExecutor(BaseExecutor):
         """
         db = ExecutionDB()

-        # If id looks like an invocation_id (8 hex digits, no dot), get all jobs for it
-        if len(id) == 8 and "." not in id:
+        # If id looks like an invocation_id (no dot), get all jobs for it
+        if "." not in id:
             jobs = db.get_jobs(id)
             statuses: List[ExecutionStatus] = []
             for job_id, _ in jobs.items():
@@ -390,10 +551,10 @@ class LocalExecutor(BaseExecutor):

     @staticmethod
     def kill_job(job_id: str) -> None:
-        """Kill a local job by stopping its Docker container and related processes.
+        """Kill a local job.

         Args:
-            job_id: The job ID to kill.
+            job_id: The job ID (e.g., abc123.0) to kill.

         Raises:
             ValueError: If job is not found or invalid.
@@ -438,15 +599,290 @@ class LocalExecutor(BaseExecutor):
         if result.returncode == 0:
             killed_something = True

-        # Mark job as killed in database if we killed something
+        # If we successfully killed something, mark as killed
         if killed_something:
             job_data.data["killed"] = True
             db.write_job(job_data)
+            LocalExecutor._add_to_killed_jobs(job_data.invocation_id, job_id)
+            return
+
+        # If nothing was killed, check if this is a pending job
+        status_list = LocalExecutor.get_status(job_id)
+        if status_list and status_list[0].state == ExecutionState.PENDING:
+            # For pending jobs, mark as killed even though there's nothing to kill yet
+            job_data.data["killed"] = True
+            db.write_job(job_data)
+            LocalExecutor._add_to_killed_jobs(job_data.invocation_id, job_id)
+            return
+
+        # Use common helper to get informative error message based on job status
+        current_status = status_list[0].state if status_list else None
+        error_msg = LocalExecutor.get_kill_failure_message(
+            job_id, f"container: {container_name}", current_status
+        )
+        raise RuntimeError(error_msg)
+
+    @staticmethod
+    def stream_logs(
+        id: Union[str, List[str]], executor_name: Optional[str] = None
+    ) -> Iterator[Tuple[str, str, str]]:
+        """Stream logs from a job or invocation group.
+
+        Args:
+            id: Unique job identifier, invocation identifier, or list of job IDs to stream simultaneously.
+
+        Yields:
+            Tuple[str, str, str]: Tuples of (job_id, task_name, log_line) for each log line.
+                Empty lines are yielded as empty strings.
+        """
+        db = ExecutionDB()
+
+        # Handle list of job IDs for simultaneous streaming
+        if isinstance(id, list):
+            # Collect all jobs from the list of job IDs
+            jobs = {}
+            for job_id in id:
+                job_data = db.get_job(job_id)
+                if job_data is None or job_data.executor != "local":
+                    continue
+                jobs[job_id] = job_data
+            if not jobs:
+                return
+        # If id looks like an invocation_id (no dot), get all jobs for it
+        elif "." not in id:
+            jobs = db.get_jobs(id)
+            if not jobs:
+                return
         else:
-            raise RuntimeError(
-                f"Could not find or kill job {job_id} (container: {container_name})"
+            # Otherwise, treat as job_id
+            job_data = db.get_job(id)
+            if job_data is None or job_data.executor != "local":
+                return
+            jobs = {id: job_data}
+
+        # Collect log file paths and metadata
+        log_files = []
+
+        for job_id, job_data in jobs.items():
+            output_dir = pathlib.Path(job_data.data.get("output_dir", ""))
+            if not output_dir:
+                continue
+
+            # Get task name from config
+            task_name = LocalExecutor._extract_task_name(job_data, job_id)
+
+            log_file_path = output_dir / "logs" / "client_stdout.log"
+
+            log_files.append(
+                {
+                    "job_id": job_id,
+                    "task_name": task_name,
+                    "path": log_file_path,
+                    "file_handle": None,
+                    "position": 0,
+                }
             )

+        if not log_files:
+            return
+
+        # Track which files we've seen before (for tail behavior)
+        file_seen_before = {}
+
+        # Open files that exist, keep track of which ones we're waiting for
+        # First, yield the last 15 lines from existing files
+        for log_info in log_files:
+            if log_info["path"].exists():
+                file_seen_before[log_info["path"]] = True
+                # Read and yield last 15 lines
+                last_lines = LocalExecutor._read_last_n_lines(log_info["path"], 15)
+                for line in last_lines:
+                    yield (
+                        log_info["job_id"],
+                        log_info["task_name"],
+                        line,
+                    )
+                try:
+                    log_info["file_handle"] = open(
+                        log_info["path"], "r", encoding="utf-8", errors="replace"
+                    )
+                    # Seek to end if file already exists (tail behavior)
+                    log_info["file_handle"].seek(0, 2)
+                    log_info["position"] = log_info["file_handle"].tell()
+                except Exception as e:
+                    logger.error(f"Could not open {log_info['path']}: {e}")
+            else:
+                file_seen_before[log_info["path"]] = False
+
+        try:
+            while True:
+                any_activity = False
+
+                for log_info in log_files:
+                    # Try to open file if it doesn't exist yet
+                    if log_info["file_handle"] is None:
+                        if log_info["path"].exists():
+                            try:
+                                # If file was just created, read last 15 lines first
+                                if not file_seen_before.get(log_info["path"], False):
+                                    last_lines = LocalExecutor._read_last_n_lines(
+                                        log_info["path"], 15
+                                    )
+                                    for line in last_lines:
+                                        yield (
+                                            log_info["job_id"],
+                                            log_info["task_name"],
+                                            line,
+                                        )
+                                    file_seen_before[log_info["path"]] = True
+
+                                log_info["file_handle"] = open(
+                                    log_info["path"],
+                                    "r",
+                                    encoding="utf-8",
+                                    errors="replace",
+                                )
+                                # Seek to end for tail behavior
+                                log_info["file_handle"].seek(0, 2)
+                                log_info["position"] = log_info["file_handle"].tell()
+                            except Exception as e:
+                                logger.error(f"Could not open {log_info['path']}: {e}")
+                                continue
+
+                    # Read new lines from file
+                    if log_info["file_handle"] is not None:
+                        try:
+                            # Check if file has grown
+                            current_size = log_info["path"].stat().st_size
+                            if current_size > log_info["position"]:
+                                log_info["file_handle"].seek(log_info["position"])
+                                new_lines = log_info["file_handle"].readlines()
+                                log_info["position"] = log_info["file_handle"].tell()
+
+                                # Yield new lines
+                                for line in new_lines:
+                                    line_stripped = line.rstrip("\n\r")
+                                    yield (
+                                        log_info["job_id"],
+                                        log_info["task_name"],
+                                        line_stripped,
+                                    )
+                                any_activity = True
+                        except (OSError, IOError) as e:
+                            # File might have been deleted or moved
+                            # Don't log error for every check, only on first error
+                            if log_info.get("error_printed", False) is False:
+                                logger.error(f"Error reading {log_info['path']}: {e}")
+                                log_info["error_printed"] = True
+                            log_info["file_handle"] = None
+                        except Exception:
+                            # Reset error flag if we successfully read again
+                            log_info["error_printed"] = False
+
+                # If no activity, sleep briefly to avoid busy waiting
+                if not any_activity:
+                    time.sleep(0.1)
+
+        except KeyboardInterrupt:
+            # Clean exit on Ctrl+C
+            pass
+        finally:
+            # Close all file handles
+            for log_info in log_files:
+                if log_info["file_handle"] is not None:
+                    try:
+                        log_info["file_handle"].close()
+                    except Exception:
+                        pass
+
+    @staticmethod
+    def _read_last_n_lines(file_path: pathlib.Path, n: int) -> List[str]:
+        """Read the last N lines from a file efficiently.
+
+        Args:
+            file_path: Path to the file to read from.
+            n: Number of lines to read from the end.
+
+        Returns:
+            List of the last N lines (or fewer if file has fewer lines).
+        """
+        try:
+            with open(file_path, "r", encoding="utf-8", errors="replace") as f:
+                # Read all lines
+                all_lines = f.readlines()
+                # Return last n lines, stripping newlines
+                return [line.rstrip("\n\r") for line in all_lines[-n:]]
+        except Exception as e:
+            logger.warning(f"Could not read last {n} lines from {file_path}: {e}")
+            return []
+
+    @staticmethod
+    def _extract_task_name(job_data: JobData, job_id: str) -> str:
+        """Extract task name from job data config.
+
+        Args:
+            job_data: JobData object containing config.
+            job_id: Job ID for error reporting.
+
+        Returns:
+            Task name string.
+        """
+        config = job_data.config or {}
+        evaluation = config.get("evaluation", {})
+        tasks = evaluation.get("tasks", [])
+
+        # Find the task that matches this job
+        # For job_id like "15b9f667.0", index is 0
+        try:
+            if "." in job_id:
+                index = int(job_id.split(".")[1])
+                if len(tasks) > 0 and index >= len(tasks):
+                    raise AttributeError(
+                        f"Job task index {job_id} is larger than number of tasks {len(tasks)} in invocation"
+                    )
+                # If index is valid and tasks exist, return the task name
+                if len(tasks) > 0 and index < len(tasks):
+                    return tasks[index].get("name", "unknown")
+        except (ValueError, IndexError):
+            pass
+
+        # Fallback: try to get task name from output_dir
+        # output_dir typically ends with task name
+        output_dir = job_data.data.get("output_dir", "")
+        if output_dir:
+            parts = pathlib.Path(output_dir).parts
+            if parts:
+                return parts[-1]
+
+        return "unknown"
+
+    @staticmethod
+    def _add_to_killed_jobs(invocation_id: str, job_id: str) -> None:
+        """Add a job ID to the killed jobs file for this invocation.
+
+        Args:
+            invocation_id: The invocation ID.
+            job_id: The job ID to mark as killed.
+        """
+        db = ExecutionDB()
+        jobs = db.get_jobs(invocation_id)
+        if not jobs:
+            return
+
+        # Get invocation output directory from any job's output_dir
+        first_job_data = next(iter(jobs.values()))
+        job_output_dir = pathlib.Path(first_job_data.data.get("output_dir", ""))
+        if not job_output_dir.exists():
+            return
+
+        # Invocation dir is parent of job output dir
+        invocation_dir = job_output_dir.parent
+        killed_jobs_file = invocation_dir / "killed_jobs.txt"
+
+        # Append job_id to file
+        with open(killed_jobs_file, "a") as f:
+            f.write(f"{job_id}\n")
+

 def _get_progress(artifacts_dir: pathlib.Path) -> Optional[float]:
     """Get the progress of a local job.