nemo-evaluator-launcher 0.1.21__tar.gz → 0.1.44__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/PKG-INFO +3 -3
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/pyproject.toml +4 -2
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/functional.py +106 -2
- nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/cli/logs.py +102 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/main.py +12 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/run.py +73 -15
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/version.py +26 -23
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/helpers.py +76 -14
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/logging_utils.py +4 -1
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/printing_utils.py +7 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/sglang.yaml +4 -2
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/trtllm.yaml +2 -3
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/vllm.yaml +0 -1
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/execution/slurm/default.yaml +14 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/base.py +31 -1
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/deployment_helpers.py +36 -1
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/executor.py +81 -1
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/local/executor.py +377 -22
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/local/run.template.sh +54 -2
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/slurm/executor.py +422 -68
- nemo_evaluator_launcher-0.1.44/src/nemo_evaluator_launcher/executors/slurm/proxy.cfg.template +26 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/utils.py +32 -46
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/package_info.py +1 -1
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/resources/mapping.toml +36 -31
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/PKG-INFO +3 -3
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/SOURCES.txt +2 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/entry_points.txt +1 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/requires.txt +1 -1
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/LICENSE +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/README.md +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/setup.cfg +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/types.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/api/utils.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/export.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/info.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/kill.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/ls_runs.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/ls_tasks.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/cli/status.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/execdb.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/common/mapping.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/generic.yaml +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/nim.yaml +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/deployment/none.yaml +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/execution/lepton/default.yaml +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/configs/execution/local.yaml +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/lepton/job_helpers.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/local/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/executors/slurm/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/__init__.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/base.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/gsheets.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/local.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/mlflow.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/registry.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher/exporters/wandb.py +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/dependency_links.txt +0 -0
- {nemo_evaluator_launcher-0.1.21 → nemo_evaluator_launcher-0.1.44}/src/nemo_evaluator_launcher.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: nemo-evaluator-launcher
|
|
3
|
-
Version: 0.1.21
|
|
3
|
+
Version: 0.1.44
|
|
4
4
|
Summary: Launcher for the evaluations provided by NeMo Evaluator containers with different runtime backends
|
|
5
5
|
Author: NVIDIA
|
|
6
6
|
Author-email: nemo-toolkit@nvidia.com
|
|
@@ -458,7 +458,7 @@ License:
|
|
|
458
458
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
459
459
|
SOFTWARE.
|
|
460
460
|
|
|
461
|
-
Project-URL: homepage, https://github.com/NVIDIA-NeMo/
|
|
461
|
+
Project-URL: homepage, https://github.com/NVIDIA-NeMo/Evaluator
|
|
462
462
|
Project-URL: repository, https://github.com/NVIDIA-NeMo/Evaluator/packages/nemo-evaluator-launcher
|
|
463
463
|
Keywords: deep learning,evaluations,machine learning,gpu,NLP,pytorch,torch
|
|
464
464
|
Requires-Python: <3.14,>=3.10
|
|
@@ -478,7 +478,7 @@ Requires-Dist: mlflow>=2.8.0; extra == "mlflow"
|
|
|
478
478
|
Provides-Extra: wandb
|
|
479
479
|
Requires-Dist: wandb>=0.15.0; extra == "wandb"
|
|
480
480
|
Provides-Extra: gsheets
|
|
481
|
-
Requires-Dist:
|
|
481
|
+
Requires-Dist: gspread>=5.0.0; extra == "gsheets"
|
|
482
482
|
Provides-Extra: exporters
|
|
483
483
|
Requires-Dist: mlflow; extra == "exporters"
|
|
484
484
|
Requires-Dist: wandb; extra == "exporters"
|
|
@@ -33,20 +33,21 @@ keywords = [
|
|
|
33
33
|
|
|
34
34
|
[project.urls]
|
|
35
35
|
# BEGIN(if-changed): check package_info.py
|
|
36
|
-
homepage = "https://github.com/NVIDIA-NeMo/
|
|
36
|
+
homepage = "https://github.com/NVIDIA-NeMo/Evaluator"
|
|
37
37
|
repository = "https://github.com/NVIDIA-NeMo/Evaluator/packages/nemo-evaluator-launcher"
|
|
38
38
|
# END(if-changed)
|
|
39
39
|
|
|
40
40
|
[project.optional-dependencies]
|
|
41
41
|
mlflow = ["mlflow>=2.8.0"]
|
|
42
42
|
wandb = ["wandb>=0.15.0"]
|
|
43
|
-
gsheets = ["
|
|
43
|
+
gsheets = ["gspread>=5.0.0"]
|
|
44
44
|
exporters = ["mlflow", "wandb", "gsheets"]
|
|
45
45
|
all = ["mlflow", "wandb", "gsheets"]
|
|
46
46
|
|
|
47
47
|
[project.scripts]
|
|
48
48
|
nemo-evaluator-launcher = "nemo_evaluator_launcher.cli.main:main"
|
|
49
49
|
nv-eval = "nemo_evaluator_launcher.cli.main:main"
|
|
50
|
+
nel = "nemo_evaluator_launcher.cli.main:main"
|
|
50
51
|
|
|
51
52
|
[dependency-groups]
|
|
52
53
|
dev = [
|
|
@@ -75,6 +76,7 @@ where = ["src"]
|
|
|
75
76
|
"resources/**/*",
|
|
76
77
|
"configs/**/*",
|
|
77
78
|
"executors/**/*.sh",
|
|
79
|
+
"executors/**/*.template",
|
|
78
80
|
]
|
|
79
81
|
|
|
80
82
|
[tool.setuptools.dynamic]
|
|
@@ -19,7 +19,7 @@ This module provides the main functional entry points for running evaluations, q
|
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
21
|
from pathlib import Path
|
|
22
|
-
from typing import Any, List, Optional, Union
|
|
22
|
+
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
|
23
23
|
|
|
24
24
|
import yaml
|
|
25
25
|
from omegaconf import DictConfig, OmegaConf
|
|
@@ -116,6 +116,7 @@ def get_status(ids_or_prefixes: list[str]) -> list[dict[str, Any]]:
|
|
|
116
116
|
db = ExecutionDB()
|
|
117
117
|
results: List[dict[str, Any]] = []
|
|
118
118
|
|
|
119
|
+
# TODO(agronskiy): refactor the `.`-checking job in all the functions.
|
|
119
120
|
for id_or_prefix in ids_or_prefixes:
|
|
120
121
|
# If id looks like an invocation_id (no dot), get all jobs for it
|
|
121
122
|
if "." not in id_or_prefix:
|
|
@@ -259,6 +260,108 @@ def get_status(ids_or_prefixes: list[str]) -> list[dict[str, Any]]:
|
|
|
259
260
|
return results
|
|
260
261
|
|
|
261
262
|
|
|
263
|
+
def stream_logs(
    ids_or_prefixes: Union[str, list[str]],
) -> Iterator[Tuple[str, str, str]]:
    """Stream logs from jobs or invocations by their IDs or invocation IDs.

    IDs containing a ``.`` are treated as job IDs and looked up with
    ``db.get_job``; all other IDs are treated as invocation IDs and looked up
    with ``db.get_jobs``. IDs that are not found in the execution DB are
    skipped silently, and so are executors that ``get_executor`` rejects with
    ``ValueError``. Logs for explicitly requested job IDs are streamed first,
    followed by logs for invocation IDs.

    Args:
        ids_or_prefixes: Single ID/prefix or list of job IDs or invocation IDs to stream logs from.
            Short prefixes are allowed, we would try to match the full ones from
            prefixes if no collisions are present.

    Yields:
        Tuple[str, str, str]: Tuples of (job_id, task_name, log_line) for each log line.
            Empty lines are yielded as empty strings.

    Raises:
        ValueError: If the executor doesn't support log streaming.
    """
    db = ExecutionDB()

    # Normalize to list for consistent processing
    if isinstance(ids_or_prefixes, str):
        ids_or_prefixes = [ids_or_prefixes]

    # Collect all jobs from all IDs, grouped by executor
    executor_to_jobs: Dict[str, Dict[str, JobData]] = {}
    executor_to_invocations: Dict[str, List[str]] = {}

    # TODO(agronskiy): refactor the `.`-checking job in all the functions.
    for id_or_prefix in ids_or_prefixes:
        if "." in id_or_prefix:
            # This is a job ID
            job_data = db.get_job(id_or_prefix)
            if job_data is None:
                continue
            executor_to_jobs.setdefault(job_data.executor, {})[id_or_prefix] = (
                job_data
            )
        else:
            # This is an invocation ID
            jobs = db.get_jobs(id_or_prefix)
            if not jobs:
                continue
            # The executor of the first job is taken as the executor of the
            # whole invocation.
            first_job_data = next(iter(jobs.values()))
            executor_to_invocations.setdefault(first_job_data.executor, []).append(
                id_or_prefix
            )

    # Stream logs for explicitly requested job IDs, one executor at a time.
    for executor, jobs_dict in executor_to_jobs.items():
        try:
            executor_cls = get_executor(executor)
        except ValueError:
            # Executor name not known to the registry: nothing to stream.
            continue

        if executor == "local" and len(jobs_dict) > 1:
            # For the local executor with multiple jobs, pass all job IDs as
            # a list so they are streamed simultaneously.
            yield from _stream_logs_or_raise(executor_cls, list(jobs_dict), executor)
        else:
            # Single job or non-local executor: pass individual job IDs.
            for job_id in jobs_dict:
                yield from _stream_logs_or_raise(executor_cls, job_id, executor)

    # Stream logs from invocation IDs
    for executor, invocation_ids in executor_to_invocations.items():
        try:
            executor_cls = get_executor(executor)
        except ValueError:
            continue

        # Stream each invocation (each invocation already handles multiple jobs internally)
        for invocation_id in invocation_ids:
            yield from _stream_logs_or_raise(executor_cls, invocation_id, executor)


def _stream_logs_or_raise(
    executor_cls: Any,
    target: Union[str, List[str]],
    executor: str,
) -> Iterator[Tuple[str, str, str]]:
    """Yield from ``executor_cls.stream_logs``, mapping NotImplementedError to ValueError.

    Args:
        executor_cls: Executor class returned by ``get_executor``.
        target: A job ID, an invocation ID, or a list of job IDs to stream.
        executor: Executor name, forwarded as ``executor_name`` and used in
            the error message.

    Raises:
        ValueError: If the executor raises ``NotImplementedError`` while
            streaming; the original exception is attached as the cause.
    """
    try:
        yield from executor_cls.stream_logs(target, executor_name=executor)
    except NotImplementedError as exc:
        raise ValueError(
            f"Log streaming is not yet implemented for executor '{executor}'"
        ) from exc
|
|
363
|
+
|
|
364
|
+
|
|
262
365
|
def list_all_invocations_summary() -> list[dict[str, Any]]:
|
|
263
366
|
"""Return a concise per-invocation summary from the exec DB.
|
|
264
367
|
|
|
@@ -378,6 +481,7 @@ def kill_job_or_invocation(id: str) -> list[dict[str, Any]]:
|
|
|
378
481
|
"data": {"error": f"Unexpected error: {str(e)}"},
|
|
379
482
|
}
|
|
380
483
|
|
|
484
|
+
# TODO(agronskiy): refactor the `.`-checking job in all the functions.
|
|
381
485
|
# Determine if this is a job ID or invocation ID
|
|
382
486
|
if "." in id:
|
|
383
487
|
# This is a job ID - kill single job
|
|
@@ -442,7 +546,7 @@ def export_results(
|
|
|
442
546
|
if "." in single_id: # job_id
|
|
443
547
|
# Try reading config from artifacts working dir (auto-export on remote node)
|
|
444
548
|
cfg_file = None
|
|
445
|
-
for name in ("
|
|
549
|
+
for name in ("config.yml", "run_config.yml"):
|
|
446
550
|
p = Path(name)
|
|
447
551
|
if p.exists():
|
|
448
552
|
cfg_file = p
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
#
|
|
16
|
+
"""Logs command for streaming logs from evaluation jobs."""
|
|
17
|
+
|
|
18
|
+
import sys
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from typing import Callable, Dict
|
|
21
|
+
|
|
22
|
+
from simple_parsing import field
|
|
23
|
+
|
|
24
|
+
import nemo_evaluator_launcher.common.printing_utils as pu
|
|
25
|
+
from nemo_evaluator_launcher.api.functional import stream_logs
|
|
26
|
+
from nemo_evaluator_launcher.common.execdb import ExecutionDB
|
|
27
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class Cmd:
    """Configuration and entry point for the ``logs`` CLI command."""

    # Positional list of invocation IDs and/or job IDs to stream logs for.
    ids: list[str] = field(
        default_factory=list,
        positional=True,
        help="Invocation IDs or job IDs (e.g., '15b9f667' or '15b9f667.0'). Multiple IDs can be provided.",
    )

    def execute(self) -> None:
        """Validate the requested IDs, then print their logs with coloured prefixes.

        Exits with status 1 if no ID is given, if any job or invocation is
        not found in the execution DB, or if streaming fails with an
        unexpected exception. ``ValueError`` and ``KeyboardInterrupt`` raised
        while streaming end the command quietly.
        """
        if not self.ids:
            logger.error("At least one ID is required")
            sys.exit(1)

        db = ExecutionDB()

        # Check every ID up front and gather the job IDs used for colouring.
        all_job_ids = []
        for id_or_prefix in self.ids:
            if "." not in id_or_prefix:
                # No dot: an invocation ID, which expands to all of its jobs.
                jobs = db.get_jobs(id_or_prefix)
                if not jobs:
                    logger.error(f"Invocation {id_or_prefix} not found")
                    sys.exit(1)
                all_job_ids.extend(jobs.keys())
                continue

            # Dot present: a single job ID.
            if db.get_job(id_or_prefix) is None:
                logger.error(f"Job {id_or_prefix} not found")
                sys.exit(1)
            all_job_ids.append(id_or_prefix)

        # Assign colours round-robin in the order the jobs were collected.
        palette = [pu.red, pu.green, pu.yellow, pu.magenta, pu.cyan]
        job_colors: Dict[str, Callable[[str], str]] = {
            job_id: palette[position % len(palette)]
            for position, job_id in enumerate(all_job_ids)
        }

        try:
            for job_id, task_name, log_line in stream_logs(self.ids):
                if not log_line:
                    # Empty lines are printed without any prefix.
                    print()
                    continue

                # Short prefix: first 6 chars of the invocation ID plus the
                # job number when the ID has one.
                inv_id, dot, job_num = job_id.partition(".")
                short_prefix = f"{inv_id[:6]}.{job_num}" if dot else job_id[:6]
                paint = job_colors.get(job_id, pu.grey)
                print(f"{paint(f'{short_prefix}:')} {log_line}")
        except (ValueError, KeyboardInterrupt):
            # ValueError: executor doesn't support streaming (per the original
            # note, the warning is already logged by BaseExecutor.stream_logs).
            # KeyboardInterrupt: clean exit on Ctrl+C.
            pass
        except Exception as e:
            logger.error(f"Error streaming logs: {e}")
            sys.exit(1)
|
|
@@ -22,6 +22,7 @@ from simple_parsing import ArgumentParser
|
|
|
22
22
|
import nemo_evaluator_launcher.cli.export as export
|
|
23
23
|
import nemo_evaluator_launcher.cli.info as info
|
|
24
24
|
import nemo_evaluator_launcher.cli.kill as kill
|
|
25
|
+
import nemo_evaluator_launcher.cli.logs as logs
|
|
25
26
|
import nemo_evaluator_launcher.cli.ls_runs as ls_runs
|
|
26
27
|
import nemo_evaluator_launcher.cli.ls_tasks as ls_tasks
|
|
27
28
|
import nemo_evaluator_launcher.cli.run as run
|
|
@@ -42,6 +43,7 @@ def is_verbose_enabled(args) -> bool:
|
|
|
42
43
|
subcommands = [
|
|
43
44
|
"run",
|
|
44
45
|
"status",
|
|
46
|
+
"logs",
|
|
45
47
|
"info",
|
|
46
48
|
"kill",
|
|
47
49
|
"tasks_alias",
|
|
@@ -106,6 +108,14 @@ def create_parser() -> ArgumentParser:
|
|
|
106
108
|
)
|
|
107
109
|
status_parser.add_arguments(status.Cmd, dest="status")
|
|
108
110
|
|
|
111
|
+
# Logs subcommand
|
|
112
|
+
logs_parser = subparsers.add_parser(
|
|
113
|
+
"logs",
|
|
114
|
+
help="Stream logs from evaluation jobs",
|
|
115
|
+
description="Stream logs from evaluation jobs by invocation ID or job ID",
|
|
116
|
+
)
|
|
117
|
+
logs_parser.add_arguments(logs.Cmd, dest="logs")
|
|
118
|
+
|
|
109
119
|
# Kill subcommand
|
|
110
120
|
kill_parser = subparsers.add_parser(
|
|
111
121
|
"kill",
|
|
@@ -204,6 +214,8 @@ def main() -> None:
|
|
|
204
214
|
args.run.execute()
|
|
205
215
|
elif args.command == "status":
|
|
206
216
|
args.status.execute()
|
|
217
|
+
elif args.command == "logs":
|
|
218
|
+
args.logs.execute()
|
|
207
219
|
elif args.command == "kill":
|
|
208
220
|
args.kill.execute()
|
|
209
221
|
elif args.command == "ls":
|
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
import pathlib
|
|
17
17
|
import time
|
|
18
18
|
from dataclasses import dataclass
|
|
19
|
+
from typing import Literal
|
|
19
20
|
|
|
20
21
|
from simple_parsing import field
|
|
21
22
|
|
|
@@ -26,6 +27,7 @@ from nemo_evaluator_launcher.common.printing_utils import (
|
|
|
26
27
|
green,
|
|
27
28
|
magenta,
|
|
28
29
|
red,
|
|
30
|
+
yellow,
|
|
29
31
|
)
|
|
30
32
|
|
|
31
33
|
|
|
@@ -33,6 +35,13 @@ from nemo_evaluator_launcher.common.printing_utils import (
|
|
|
33
35
|
class Cmd:
|
|
34
36
|
"""Run command parameters"""
|
|
35
37
|
|
|
38
|
+
config: str | None = field(
|
|
39
|
+
default=None,
|
|
40
|
+
alias=["--config"],
|
|
41
|
+
metadata={
|
|
42
|
+
"help": "Full path to config file. Uses Hydra by default (--config-mode=hydra). Use --config-mode=raw to load directly (bypasses Hydra)."
|
|
43
|
+
},
|
|
44
|
+
)
|
|
36
45
|
config_name: str = field(
|
|
37
46
|
default="default",
|
|
38
47
|
alias=["-c", "--config-name"],
|
|
@@ -47,11 +56,11 @@ class Cmd:
|
|
|
47
56
|
"help": "Path to user config directory. If provided, searches here first, then falls back to internal configs."
|
|
48
57
|
},
|
|
49
58
|
)
|
|
50
|
-
|
|
51
|
-
default=
|
|
52
|
-
alias=["
|
|
59
|
+
config_mode: Literal["hydra", "raw"] = field(
|
|
60
|
+
default="hydra",
|
|
61
|
+
alias=["--config-mode"],
|
|
53
62
|
metadata={
|
|
54
|
-
"help": "
|
|
63
|
+
"help": "Config loading mode: 'hydra' (default) uses Hydra config system, 'raw' loads config file directly bypassing Hydra."
|
|
55
64
|
},
|
|
56
65
|
)
|
|
57
66
|
override: list[str] = field(
|
|
@@ -83,28 +92,59 @@ class Cmd:
|
|
|
83
92
|
|
|
84
93
|
from nemo_evaluator_launcher.api.functional import RunConfig, run_eval
|
|
85
94
|
|
|
86
|
-
#
|
|
87
|
-
if self.
|
|
88
|
-
|
|
95
|
+
# Validate config_mode value
|
|
96
|
+
if self.config_mode not in ["hydra", "raw"]:
|
|
97
|
+
raise ValueError(
|
|
98
|
+
f"Invalid --config-mode value: {self.config_mode}. Must be 'hydra' or 'raw'."
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
# Validate that raw mode requires --config
|
|
102
|
+
if self.config_mode == "raw" and self.config is None:
|
|
103
|
+
raise ValueError(
|
|
104
|
+
"--config-mode=raw requires --config to be specified. Raw mode loads config files directly."
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
# Load configuration either from Hydra or directly from a config file
|
|
108
|
+
if self.config_mode == "raw" and self.config:
|
|
109
|
+
# Validate that raw config loading is not used with other config options
|
|
89
110
|
if self.config_name != "default":
|
|
90
|
-
raise ValueError(
|
|
111
|
+
raise ValueError(
|
|
112
|
+
"Cannot use --config-mode=raw with --config-name. Raw mode only works with --config."
|
|
113
|
+
)
|
|
91
114
|
if self.config_dir is not None:
|
|
92
|
-
raise ValueError(
|
|
115
|
+
raise ValueError(
|
|
116
|
+
"Cannot use --config-mode=raw with --config-dir. Raw mode only works with --config."
|
|
117
|
+
)
|
|
93
118
|
if self.override:
|
|
94
|
-
raise ValueError(
|
|
119
|
+
raise ValueError(
|
|
120
|
+
"Cannot use --config-mode=raw with --override. Raw mode only works with --config."
|
|
121
|
+
)
|
|
95
122
|
|
|
96
|
-
# Load from
|
|
97
|
-
with open(self.
|
|
123
|
+
# Load from config file directly (bypass Hydra)
|
|
124
|
+
with open(self.config, "r") as f:
|
|
98
125
|
config_dict = yaml.safe_load(f)
|
|
99
126
|
|
|
100
127
|
# Create RunConfig from the loaded data
|
|
101
128
|
config = OmegaConf.create(config_dict)
|
|
102
129
|
else:
|
|
130
|
+
# Handle --config parameter: split path into config_dir and config_name for Hydra
|
|
131
|
+
if self.config:
|
|
132
|
+
if self.config_name != "default":
|
|
133
|
+
raise ValueError("Cannot use --config with --config-name")
|
|
134
|
+
if self.config_dir is not None:
|
|
135
|
+
raise ValueError("Cannot use --config with --config-dir")
|
|
136
|
+
config_path = pathlib.Path(self.config)
|
|
137
|
+
config_dir = str(config_path.parent)
|
|
138
|
+
config_name = str(config_path.stem)
|
|
139
|
+
else:
|
|
140
|
+
config_dir = self.config_dir
|
|
141
|
+
config_name = self.config_name
|
|
142
|
+
|
|
103
143
|
# Load the complete Hydra configuration
|
|
104
144
|
config = RunConfig.from_hydra(
|
|
105
|
-
|
|
145
|
+
config_dir=config_dir,
|
|
146
|
+
config_name=config_name,
|
|
106
147
|
hydra_overrides=self.override,
|
|
107
|
-
config_dir=self.config_dir,
|
|
108
148
|
)
|
|
109
149
|
|
|
110
150
|
try:
|
|
@@ -150,7 +190,7 @@ class Cmd:
|
|
|
150
190
|
f.write("#\n")
|
|
151
191
|
f.write("# To rerun this exact configuration:\n")
|
|
152
192
|
f.write(
|
|
153
|
-
f"# nemo-evaluator-launcher run --
|
|
193
|
+
f"# nemo-evaluator-launcher run --config {config_path} --config-mode=raw\n"
|
|
154
194
|
)
|
|
155
195
|
f.write("#\n")
|
|
156
196
|
f.write(config_yaml)
|
|
@@ -164,6 +204,10 @@ class Cmd:
|
|
|
164
204
|
bold(cyan("To check status: "))
|
|
165
205
|
+ f"nemo-evaluator-launcher status {invocation_id}"
|
|
166
206
|
)
|
|
207
|
+
print(
|
|
208
|
+
bold(cyan("To view job info: "))
|
|
209
|
+
+ f"nemo-evaluator-launcher info {invocation_id}"
|
|
210
|
+
)
|
|
167
211
|
print(
|
|
168
212
|
bold(cyan("To kill all jobs: "))
|
|
169
213
|
+ f"nemo-evaluator-launcher kill {invocation_id}"
|
|
@@ -198,3 +242,17 @@ class Cmd:
|
|
|
198
242
|
)
|
|
199
243
|
)
|
|
200
244
|
)
|
|
245
|
+
|
|
246
|
+
# Warn if both config_dir and config_name are provided (and config_name is not default)
|
|
247
|
+
if (
|
|
248
|
+
self.config is None
|
|
249
|
+
and self.config_dir is not None
|
|
250
|
+
and self.config_name != "default"
|
|
251
|
+
):
|
|
252
|
+
joint_path = pathlib.Path(self.config_dir) / f"{self.config_name}.yaml"
|
|
253
|
+
print(
|
|
254
|
+
yellow(
|
|
255
|
+
f"Warning: Using --config-dir and --config-name together is deprecated. "
|
|
256
|
+
f"Please use --config {joint_path} instead."
|
|
257
|
+
)
|
|
258
|
+
)
|
|
@@ -19,6 +19,29 @@ import importlib
|
|
|
19
19
|
from dataclasses import dataclass
|
|
20
20
|
|
|
21
21
|
from nemo_evaluator_launcher import __package_name__, __version__
|
|
22
|
+
from nemo_evaluator_launcher.common.logging_utils import logger
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_versions() -> dict:
    """Return a mapping of package name to version string.

    The mapping always contains this package. If the optional
    ``nemo_evaluator_launcher_internal`` module can be imported, it is added
    too, using its ``__version__`` attribute or a placeholder string when
    that attribute is missing or falsy.

    Raises:
        Exception: Any non-ImportError failure while loading the internal
            module is logged and re-raised.
    """
    internal_module_name = "nemo_evaluator_launcher_internal"
    versions = {__package_name__: __version__}

    try:
        module = importlib.import_module(internal_module_name)
        # Fall back to a placeholder when no usable version is exposed.
        versions[internal_module_name] = (
            getattr(module, "__version__", None) or "available (version unknown)"
        )
    except ImportError:
        # Internal package not available - this is expected in many cases
        return versions
    except Exception as e:
        logger.error(f"nemo_evaluator_launcher_internal: error loading ({e})")
        raise

    return versions
|
|
22
45
|
|
|
23
46
|
|
|
24
47
|
@dataclass
|
|
@@ -27,26 +50,6 @@ class Cmd:
|
|
|
27
50
|
|
|
28
51
|
def execute(self) -> None:
|
|
29
52
|
"""Execute the version command."""
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
try:
|
|
34
|
-
internal_module = importlib.import_module(
|
|
35
|
-
"nemo_evaluator_launcher_internal"
|
|
36
|
-
)
|
|
37
|
-
# Try to get version from internal package
|
|
38
|
-
try:
|
|
39
|
-
internal_version = getattr(internal_module, "__version__", None)
|
|
40
|
-
if internal_version:
|
|
41
|
-
print(f"nemo-evaluator-launcher-internal: {internal_version}")
|
|
42
|
-
else:
|
|
43
|
-
print(
|
|
44
|
-
"nemo-evaluator-launcher-internal: available (version unknown)"
|
|
45
|
-
)
|
|
46
|
-
except Exception:
|
|
47
|
-
print("nemo-evaluator-launcher-internal: available (version unknown)")
|
|
48
|
-
except ImportError:
|
|
49
|
-
# Internal package not available - this is expected in many cases
|
|
50
|
-
pass
|
|
51
|
-
except Exception as e:
|
|
52
|
-
print(f"nemo-evaluator-launcher-internal: error loading ({e})")
|
|
53
|
+
res = get_versions()
|
|
54
|
+
for package, version in res.items():
|
|
55
|
+
print(f"{package}: {version}")
|