konduktor-nightly 0.1.0.dev20250722105323__tar.gz → 0.1.0.dev20250723105251__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of konduktor-nightly might be problematic. Click here for more details.

Files changed (98)
  1. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/PKG-INFO +1 -1
  2. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/__init__.py +2 -2
  3. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/backends/jobset.py +2 -2
  4. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/backends/jobset_utils.py +1 -1
  5. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/cli.py +5 -5
  6. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/log_utils.py +167 -1
  7. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/schemas.py +13 -0
  8. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/pyproject.toml +1 -1
  9. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/LICENSE +0 -0
  10. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/README.md +0 -0
  11. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/adaptors/__init__.py +0 -0
  12. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/adaptors/aws.py +0 -0
  13. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/adaptors/common.py +0 -0
  14. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/adaptors/gcp.py +0 -0
  15. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/authentication.py +0 -0
  16. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/backends/__init__.py +0 -0
  17. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/backends/backend.py +0 -0
  18. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/backends/constants.py +0 -0
  19. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/check.py +0 -0
  20. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/config.py +0 -0
  21. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/constants.py +0 -0
  22. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/controller/__init__.py +0 -0
  23. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/controller/constants.py +0 -0
  24. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/controller/launch.py +0 -0
  25. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/controller/node.py +0 -0
  26. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/controller/parse.py +0 -0
  27. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/README.md +0 -0
  28. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/backend/main.py +0 -0
  29. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/backend/sockets.py +0 -0
  30. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/.eslintrc.json +0 -0
  31. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/.gitignore +0 -0
  32. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/api/jobs/route.js +0 -0
  33. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/api/namespaces/route.js +0 -0
  34. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/Grafana.jsx +0 -0
  35. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/JobsData.jsx +0 -0
  36. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/LogsData.jsx +0 -0
  37. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/NavMenu.jsx +0 -0
  38. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/NavTabs.jsx +0 -0
  39. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/NavTabs2.jsx +0 -0
  40. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/SelectBtn.jsx +0 -0
  41. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/lib/utils.js +0 -0
  42. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/ui/chip-select.jsx +0 -0
  43. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/ui/input.jsx +0 -0
  44. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/ui/navigation-menu.jsx +0 -0
  45. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/components/ui/select.jsx +0 -0
  46. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/favicon.ico +0 -0
  47. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/globals.css +0 -0
  48. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/jobs/page.js +0 -0
  49. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/layout.js +0 -0
  50. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/logs/page.js +0 -0
  51. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/app/page.js +0 -0
  52. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/jsconfig.json +0 -0
  53. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/next.config.mjs +0 -0
  54. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/package-lock.json +0 -0
  55. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/package.json +0 -0
  56. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/postcss.config.mjs +0 -0
  57. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/server.js +0 -0
  58. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/dashboard/frontend/tailwind.config.js +0 -0
  59. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/__init__.py +0 -0
  60. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/aws/__init__.py +0 -0
  61. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/aws/s3.py +0 -0
  62. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/constants.py +0 -0
  63. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/data_utils.py +0 -0
  64. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/gcp/__init__.py +0 -0
  65. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/gcp/constants.py +0 -0
  66. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/gcp/gcs.py +0 -0
  67. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/gcp/utils.py +0 -0
  68. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/registry.py +0 -0
  69. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/storage.py +0 -0
  70. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/data/storage_utils.py +0 -0
  71. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/execution.py +0 -0
  72. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/kube_client.py +0 -0
  73. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/logging.py +0 -0
  74. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/manifests/controller_deployment.yaml +0 -0
  75. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/manifests/dashboard_deployment.yaml +0 -0
  76. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/manifests/dmesg_daemonset.yaml +0 -0
  77. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/manifests/pod_cleanup_controller.yaml +0 -0
  78. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/resource.py +0 -0
  79. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/task.py +0 -0
  80. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/templates/jobset.yaml.j2 +0 -0
  81. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/templates/pod.yaml.j2 +0 -0
  82. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/usage/__init__.py +0 -0
  83. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/usage/constants.py +0 -0
  84. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/__init__.py +0 -0
  85. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/accelerator_registry.py +0 -0
  86. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/annotations.py +0 -0
  87. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/base64_utils.py +0 -0
  88. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/common_utils.py +0 -0
  89. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/constants.py +0 -0
  90. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/env_options.py +0 -0
  91. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/exceptions.py +0 -0
  92. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/kubernetes_enums.py +0 -0
  93. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/kubernetes_utils.py +0 -0
  94. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/loki_utils.py +0 -0
  95. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/rich_utils.py +0 -0
  96. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/subprocess_utils.py +0 -0
  97. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/ux_utils.py +0 -0
  98. {konduktor_nightly-0.1.0.dev20250722105323 → konduktor_nightly-0.1.0.dev20250723105251}/konduktor/utils/validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: konduktor-nightly
3
- Version: 0.1.0.dev20250722105323
3
+ Version: 0.1.0.dev20250723105251
4
4
  Summary: GPU Cluster Health Management
5
5
  Author: Andrew Aikawa
6
6
  Author-email: asai@berkeley.edu
@@ -14,7 +14,7 @@ __all__ = [
14
14
  ]
15
15
 
16
16
  # Replaced with the current commit when building the wheels.
17
- _KONDUKTOR_COMMIT_SHA = '84818710a16e0a0515fbbd7878395fca37cf94f7'
17
+ _KONDUKTOR_COMMIT_SHA = '4572b2fb2c49fb54573c080a71ab9d982ced5bf2'
18
18
  os.makedirs(os.path.expanduser('~/.konduktor'), exist_ok=True)
19
19
 
20
20
 
@@ -48,5 +48,5 @@ def _get_git_commit():
48
48
 
49
49
 
50
50
  __commit__ = _get_git_commit()
51
- __version__ = '1.0.0.dev0.1.0.dev20250722105323'
51
+ __version__ = '1.0.0.dev0.1.0.dev20250723105251'
52
52
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
@@ -16,7 +16,7 @@ if typing.TYPE_CHECKING:
16
16
 
17
17
  from konduktor import config, logging
18
18
  from konduktor.backends import backend, jobset_utils
19
- from konduktor.utils import kubernetes_utils, loki_utils, rich_utils, ux_utils
19
+ from konduktor.utils import kubernetes_utils, log_utils, rich_utils, ux_utils
20
20
 
21
21
  Path = str
22
22
  logger = logging.get_logger(__file__)
@@ -193,7 +193,7 @@ class JobsetBackend(backend.Backend):
193
193
  _wait_for_jobset_start(namespace, task.name)
194
194
  try:
195
195
  log_thread = threading.Thread(
196
- target=loki_utils.tail_loki_logs_ws,
196
+ target=log_utils.tail_logs,
197
197
  args=(task.name,),
198
198
  daemon=True,
199
199
  )
@@ -595,7 +595,7 @@ def show_status_table(namespace: str, all_users: bool):
595
595
  rows.append(
596
596
  [
597
597
  job['metadata']['name'],
598
- _get_status_string_colorized(job['status']),
598
+ _get_status_string_colorized(job.get('status', {})),
599
599
  _get_resources(job),
600
600
  *_get_time_delta(job['metadata']['creationTimestamp']),
601
601
  ]
@@ -56,7 +56,6 @@ from konduktor.utils import (
56
56
  common_utils,
57
57
  kubernetes_utils,
58
58
  log_utils,
59
- loki_utils,
60
59
  ux_utils,
61
60
  )
62
61
 
@@ -554,7 +553,8 @@ def status(all_users: bool):
554
553
  ),
555
554
  )
556
555
  @click.option(
557
- '--tail',
556
+ '--num-lines',
557
+ '--num_lines' '-n',
558
558
  default=1000,
559
559
  type=int,
560
560
  help=(
@@ -568,10 +568,10 @@ def logs(
568
568
  status: bool,
569
569
  job_id: str,
570
570
  follow: bool,
571
- tail: int,
571
+ num_lines: int,
572
572
  ):
573
573
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
574
- """Tail the log of a job."""
574
+ """Retrieve/tail the log of a job."""
575
575
  if status:
576
576
  raise click.UsageError('`--status` is being deprecated)')
577
577
 
@@ -598,7 +598,7 @@ def logs(
598
598
  'Logs are tailed from 1 hour ago, ' 'to see more logs, check Grafana.',
599
599
  fg='yellow',
600
600
  )
601
- loki_utils.tail_loki_logs_ws(job_id, follow=follow, num_logs=tail)
601
+ log_utils.tail_logs(job_id, follow=follow, num_logs=num_lines)
602
602
 
603
603
 
604
604
  @cli.command(cls=_DocumentedCodeCommand)
@@ -10,19 +10,41 @@
10
10
  # See the License for the specific language governing permissions and
11
11
  # limitations under the License.
12
12
 
13
+ import asyncio
13
14
  import copy
15
+ import enum
14
16
  import io
17
+ import json
15
18
  import multiprocessing
16
19
  import os
17
20
  import subprocess
18
21
  import sys
19
22
  import types
20
- from typing import List, Optional, Tuple, Type, Union
23
+ import urllib.parse
24
+ from typing import Any, Dict, List, Optional, Tuple, Type, Union
21
25
 
26
+ import colorama
27
+ import kr8s
22
28
  import prettytable
29
+ import requests
30
+ import websockets
23
31
 
32
+ from konduktor import config, logging
24
33
  from konduktor.utils import subprocess_utils
25
34
 
35
+ logger = logging.get_logger(__name__)
36
+
37
+
38
+ LOKI_REMOTE_PORT = 3100
39
+ WEBSOCKET_TIMEOUT = 10
40
+ INFINITY = 999999
41
+ VICKY_REMOTE_PORT = 9428
42
+
43
+
44
class LogBackend(str, enum.Enum):
    """Log storage backends that job logs can be tailed from.

    The ``str`` mixin makes every member compare equal to its plain string
    value, so a backend name held as a string (e.g. ``'loki'``) can be
    compared directly against ``LogBackend.LOKI``.
    """

    VICTORIA = 'victoria'
    LOKI = 'loki'

    @classmethod
    def _missing_(cls, value):
        """Resolve backend names case-insensitively (e.g. ``'Loki'``).

        Returns ``None`` for unknown names so that ``LogBackend(value)``
        still raises ``ValueError`` as usual.
        """
        if isinstance(value, str):
            lowered = value.lower()
            for member in cls:
                if member.value == lowered:
                    return member
        return None
47
+
26
48
 
27
49
  class LineProcessor(object):
28
50
  """A processor for log lines."""
@@ -247,3 +269,147 @@ def run_with_log(
247
269
  # causing the stream handling stuck at `readline`.
248
270
  subprocess_utils.kill_children_processes()
249
271
  raise
272
+
273
+
274
async def _read_logs(url: str, timeout: int, job_name: str, worker_id: int, port: int):
    """Stream log messages from a tail websocket and print them to stdout.

    Args:
        url: Websocket URL to connect to.
        timeout: Maximum number of seconds to wait for each message; when it
            elapses the stream is treated as finished.
        job_name: Job name shown in the prefix of every printed line.
        worker_id: Worker index shown in the prefix of every printed line.
        port: Remote port reported in the startup banner.
    """
    ws = await asyncio.wait_for(websockets.connect(url), timeout=WEBSOCKET_TIMEOUT)
    logger.info(
        f'{colorama.Fore.YELLOW}Tailing logs. '
        f'Forwarding from remote port {port}. Press Ctrl+C to stop. '
        f'{colorama.Style.RESET_ALL}'
    )
    try:
        while True:
            message = await asyncio.wait_for(ws.recv(), timeout=timeout)
            try:
                payload = json.loads(message)
            except json.JSONDecodeError:
                logger.warning(f'Failed to decode log skipping: {message}')
                logger.debug(f'Dropped log: {message}')
                continue
            # A message may carry no streams at all, and each stream may
            # batch several [timestamp, line] entries; print every entry
            # rather than only the first one.
            for stream in payload.get('streams', []):
                for entry in stream.get('values', []):
                    log_line = entry[1]
                    if log_line is not None:
                        print(
                            f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
                            f"(job_name={job_name} worker_id={worker_id})"
                            f"{colorama.Style.RESET_ALL} {log_line}",
                            flush=True,
                        )
    except asyncio.exceptions.TimeoutError:
        logger.debug('Websocket timed-out, closing log stream!')
    except KeyboardInterrupt:
        logger.debug('Keyboard interrupt, closing log stream!')
    finally:
        # Always release the connection, however the loop ended.
        await ws.close()
302
+
303
+
304
def tail_loki_logs_ws(
    job_name: str, worker_id: int = 0, num_logs: int = 1000, follow: bool = True
):
    """Tail a job's logs from Loki through a port-forwarded websocket.

    Args:
        job_name: Name of the job whose logs are requested.
        worker_id: Job completion index used to filter the log stream.
        num_logs: Value sent as the ``limit`` query parameter.
        follow: When True, wait (practically) forever for new messages;
            otherwise stop after ``WEBSOCKET_TIMEOUT`` seconds of silence.

    Raises:
        ValueError: If ``num_logs`` is greater than 5000.
    """
    if num_logs > 5000:
        # TODO(asaiacai): we should not have a limit on the number of logs, but rather
        # let the user specify any number of lines, and we can print the last N lines.
        # this can be done in chunks. Potentially, we can query range
        # until we reach the end of the log and then invoke tail again.
        # Also include checks that the job is running/ever ran.
        raise ValueError('num_logs must be less than or equal to 5000')

    service = kr8s.objects.Service.get('loki', namespace='loki')
    forward = kr8s.portforward.PortForward(
        service, LOKI_REMOTE_PORT, local_port='auto'
    )
    with forward as port:
        base_url = f'ws://localhost:{port}/loki/api/v1/tail'
        logger.debug(f'Loki URL: {base_url}')

        # LogQL selector for the job's container, filtered by worker index.
        selector = (
            r'{' + f'k8s_job_name="{job_name}-workers-0",'
            r' k8s_container_name="konduktor-container"} '
            f' | batch_kubernetes_io_job_completion_index = `{worker_id}`'
        )
        # TODO(asaiacai): need to auto-generate the start and end times.
        pairs = [
            ('query', urllib.parse.quote(selector)),
            ('limit', num_logs),
            ('delay', 5),
        ]
        query_string = '&'.join(f'{name}={value}' for name, value in pairs)
        tail_url = f'{base_url}?{query_string}'

        if follow:
            timeout = INFINITY
        else:
            timeout = WEBSOCKET_TIMEOUT
        reader = _read_logs(tail_url, timeout, job_name, worker_id, LOKI_REMOTE_PORT)
        asyncio.run(reader)
337
+
338
+
339
def tail_vicky_logs(
    job_name: str, worker_id: int = 0, num_logs: int = 1000, follow: bool = True
):
    """Tail (or fetch once) logs from VictoriaLogs through a port-forward.

    Args:
        job_name: Job name shown in the prefix of every printed line.
        worker_id: Worker index shown in the prefix of every printed line.
        num_logs: Sent as ``limit`` when ``follow`` is False; ignored when
            following.
        follow: When True, stream from the ``tail`` endpoint; otherwise issue
            a single request to the ``query`` endpoint.

    Raises:
        ValueError: If ``num_logs`` is greater than 5000.
    """
    if num_logs > 5000:
        # TODO(asaiacai): we should not have a limit on the number of logs, but rather
        # let the user specify any number of lines, and we can print the last N lines.
        # this can be done in chunks. Potentially, we can query range
        # until we reach the end of the log and then invoke tail again.
        # Also include checks that the job is running/ever ran.
        raise ValueError('num_logs must be less than or equal to 5000')
    vicky_svc = kr8s.objects.Service.get(
        'vls-victoria-logs-single-server', namespace='victoria-logs'
    )
    with kr8s.portforward.PortForward(
        vicky_svc, VICKY_REMOTE_PORT, local_port='auto'
    ) as port:
        query: Dict[str, Any] = {}
        if follow:
            # Only the tail endpoint ignores num_logs, so only say so here.
            logger.info('ignoring num_logs argument for VictoriaLogs')
            timeout = INFINITY
            vicky_url = f'http://localhost:{port}/select/logsql/tail'
        else:
            vicky_url = f'http://localhost:{port}/select/logsql/query'
            query['limit'] = num_logs
            timeout = 1
        logger.debug(f'Vicky URL: {vicky_url}')

        # NOTE(review): this query selects everything in the "default"
        # namespace and is not scoped to job_name/worker_id, even though the
        # printed prefix names them -- confirm the intended filter fields.
        query['query'] = 'k8s.namespace.name: "default"'
        query['start_offset'] = '1h'

        try:
            logger.debug(f'Making request to {vicky_url} with query: {query}')
            with requests.post(
                vicky_url, data=query, stream=True, timeout=timeout
            ) as response:
                logger.debug(f'Response status: {response.status_code}')
                if response.status_code != 200:
                    logger.error(
                        f'VictoriaLogs API returned status {response.status_code}: '
                        f'{response.text}'
                    )
                    return

                for line in response.iter_lines(decode_unicode=True):
                    if not line:
                        continue
                    # Skip a bad record instead of letting it abort the whole
                    # stream through the catch-all handler below.
                    try:
                        payload = json.loads(line)
                    except json.JSONDecodeError:
                        logger.warning(f'Failed to decode log skipping: {line}')
                        continue
                    message = (
                        payload.get('_msg') if isinstance(payload, dict) else None
                    )
                    if message is None:
                        logger.debug(f'Dropped log: {line}')
                        continue
                    print(
                        f"{colorama.Fore.CYAN}{colorama.Style.BRIGHT} "
                        f"(job_name={job_name} worker_id={worker_id})"
                        f"{colorama.Style.RESET_ALL} {message}",
                        flush=True,
                    )

        except KeyboardInterrupt:
            logger.info('\nStopping log stream...')
        except requests.exceptions.Timeout:
            logger.error(f'Request to VictoriaLogs timed out after {timeout} seconds')
        except requests.exceptions.ConnectionError as e:
            logger.error(f'Failed to connect to VictoriaLogs at {vicky_url}: {e}')
        except requests.exceptions.RequestException as e:
            logger.error(f'Request to VictoriaLogs failed: {e}')
        except Exception as e:
            logger.error(f'Unexpected error while tailing VictoriaLogs: {e}')
403
+
404
+
405
def tail_logs(
    job_name: str, worker_id: int = 0, num_logs: int = 1000, follow: bool = True
):
    """Tail a job's logs from the backend selected by ``logs.backend``.

    Falls back to VictoriaLogs when the setting is absent or does not name a
    known backend.

    Args:
        job_name: Name of the job whose logs are requested.
        worker_id: Index of the worker whose logs are requested.
        num_logs: Number of log lines requested from the backend.
        follow: Whether to keep streaming new log lines.
    """
    configured = config.get_nested(('logs', 'backend'), None)

    # The config value is a plain string; convert it to a LogBackend member
    # before comparing, otherwise neither branch below can ever match.
    backend: Optional[LogBackend] = None
    if isinstance(configured, LogBackend):
        backend = configured
    elif isinstance(configured, str):
        try:
            backend = LogBackend(configured.lower())
        except ValueError:
            backend = None

    if backend is LogBackend.VICTORIA:
        tail_vicky_logs(job_name, worker_id, num_logs, follow)
    elif backend is LogBackend.LOKI:
        tail_loki_logs_ws(job_name, worker_id, num_logs, follow)
    else:
        logger.info('Defaulting to VictoriaLogs')
        tail_vicky_logs(job_name, worker_id, num_logs, follow)
@@ -490,6 +490,18 @@ def get_config_schema():
490
490
  },
491
491
  }
492
492
 
493
+ logs_configs = {
494
+ 'type': 'object',
495
+ 'required': [],
496
+ 'additionalProperties': False,
497
+ 'properties': {
498
+ 'backend': {
499
+ 'type': 'string',
500
+ 'case_insensitive_enum': ['loki', 'victoria'],
501
+ },
502
+ },
503
+ }
504
+
493
505
  gpu_configs = {
494
506
  'type': 'object',
495
507
  'required': [],
@@ -537,6 +549,7 @@ def get_config_schema():
537
549
  'admin_policy': admin_policy_schema,
538
550
  'nvidia_gpus': gpu_configs,
539
551
  'allowed_clouds': allowed_clouds,
552
+ 'logs': logs_configs,
540
553
  'tailscale': tailscale_configs,
541
554
  'ssh': ssh_configs,
542
555
  **cloud_configs,
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "konduktor-nightly"
3
- version = "0.1.0.dev20250722105323"
3
+ version = "0.1.0.dev20250723105251"
4
4
  description = "GPU Cluster Health Management"
5
5
  packages = [
6
6
  {include = "konduktor"}