PyPI - xpk - Versions diffs - 0.14.3__py3-none-any.whl → 0.15.0__py3-none-any.whl - Mend

xpk 0.14.3__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58)

integration/gcluster_a3mega_test.py +11 -0
integration/gcluster_a3ultra_test.py +11 -0
integration/gcluster_a4_test.py +11 -0
xpk/commands/cluster.py +57 -21
xpk/commands/cluster_gcluster.py +25 -5
xpk/commands/cluster_gcluster_test.py +11 -2
xpk/commands/cluster_test.py +233 -12
xpk/commands/config.py +3 -5
xpk/commands/kind.py +1 -1
xpk/commands/storage.py +8 -10
xpk/commands/workload.py +28 -11
xpk/commands/workload_test.py +3 -3
xpk/core/blueprint/blueprint_generator.py +70 -33
xpk/core/blueprint/blueprint_test.py +9 -0
xpk/core/capacity.py +46 -8
xpk/core/capacity_test.py +32 -1
xpk/core/cluster.py +37 -57
xpk/core/cluster_test.py +95 -0
xpk/core/commands.py +4 -10
xpk/core/config.py +9 -2
xpk/core/gcloud_context.py +18 -12
xpk/core/gcloud_context_test.py +111 -1
xpk/core/kjob.py +6 -9
xpk/core/kueue_manager.py +192 -32
xpk/core/kueue_manager_test.py +132 -4
xpk/core/nodepool.py +21 -29
xpk/core/nodepool_test.py +17 -15
xpk/core/scheduling.py +16 -1
xpk/core/scheduling_test.py +85 -6
xpk/core/system_characteristics.py +77 -19
xpk/core/system_characteristics_test.py +80 -5
xpk/core/telemetry.py +263 -0
xpk/core/telemetry_test.py +211 -0
xpk/main.py +31 -13
xpk/parser/cluster.py +48 -9
xpk/parser/cluster_test.py +42 -3
xpk/parser/workload.py +12 -0
xpk/parser/workload_test.py +4 -4
xpk/telemetry_uploader.py +29 -0
xpk/templates/kueue_gke_default_topology.yaml.j2 +1 -1
xpk/templates/kueue_sub_slicing_topology.yaml.j2 +3 -8
xpk/utils/console.py +41 -10
xpk/utils/console_test.py +106 -0
xpk/utils/feature_flags.py +7 -1
xpk/utils/file.py +4 -1
xpk/utils/topology.py +4 -0
xpk/utils/user_agent.py +35 -0
xpk/utils/user_agent_test.py +44 -0
xpk/utils/user_input.py +48 -0
xpk/utils/user_input_test.py +92 -0
xpk/utils/validation.py +0 -11
xpk/utils/versions.py +31 -0
{xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/METADATA +113 -92
{xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/RECORD +58 -48
{xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/WHEEL +0 -0
{xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/entry_points.txt +0 -0
{xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/licenses/LICENSE +0 -0
{xpk-0.14.3.dist-info → xpk-0.15.0.dist-info}/top_level.txt +0 -0

xpk/core/system_characteristics.py CHANGED Viewed

@@ -18,6 +18,8 @@ from dataclasses import dataclass
 from ..utils.topology import get_topology_product
 from enum import Enum
+# 2D topologies passed as supported_topologies to the v6e map that is built with supports_sub_slicing=True.
+SUB_SLICING_TOPOLOGIES = ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16']
 class AcceleratorType(Enum):
   TPU = 1
@@ -131,6 +133,33 @@ def get_system_characteristics_by_device_type(
     return None, 1
+def generate_tpu_topologies(
+    max_cubes: int, enforce_nondecreasing: bool = True
+) -> list[str]:
+  """Generates a list of unique TPU topologies formatted as strings "AxBxC".
+  The list will contain all triplets (A, B, C) such that:
+    - A, B and C are integers in range 4..256 (including 4 and 256)
+    - A, B and C are divisible by 4
+    - (A/4) * (B/4) * (C/4) <= max_cubes
+    - if enforce_nondecreasing: A <= B <= C
+  Additionally, the list will also contain the following triplets:
+    2x2x1, 2x2x2, 2x2x4, 2x4x4
+  Args:
+    max_cubes: maximum number of cubes supported by a TPU platform
+    enforce_nondecreasing: whether to enforce A <= B <= C or not
+  """
+  topologies = ['2x2x1', '2x2x2', '2x2x4', '2x4x4']
+  MAX = 256
+  for x in range(4, MAX + 1, 4):
+    for y in range(x if enforce_nondecreasing else 4, MAX + 1, 4):
+      for z in range(y if enforce_nondecreasing else 4, MAX + 1, 4):
+        if (x // 4) * (y // 4) * (z // 4) <= max_cubes:
+          topologies.append(f'{x}x{y}x{z}')
+  return topologies
 def get_tpu_system_characteristics_map(
     prefix: str,
     tensorcores_per_chip: int,
@@ -138,13 +167,17 @@ def get_tpu_system_characteristics_map(
     machine_type: str,
     supported_topologies: list[str],
     supports_sub_slicing: bool,
-    requires_workload_policy: bool = False,
+    tpu_type_requires_workload_policy: bool = False,
+    default_topologies: set[str] | None = None,
 ) -> dict[str, SystemCharacteristics]:
   system_characteristics_map = {}
+  if default_topologies is None:
+    default_topologies = set()
   for topology in supported_topologies:
     chips_per_vm = compute_chips_per_vm(topology)
     vms_per_slice = compute_vms_per_slice(topology)
     num_tensorcores = compute_num_tensorcores(tensorcores_per_chip, topology)
+    device_type = f'{prefix}-{num_tensorcores}'
     system = SystemCharacteristics(
         topology=topology,
         vms_per_slice=vms_per_slice,
@@ -152,12 +185,17 @@ def get_tpu_system_characteristics_map(
         gce_machine_type=machine_type,
         chips_per_vm=chips_per_vm,
         accelerator_type=AcceleratorType.TPU,
-        device_type=f'{prefix}-{num_tensorcores}',
-        requires_workload_policy=requires_workload_policy,
+        device_type=device_type,
+        requires_workload_policy=tpu_type_requires_workload_policy
+        and vms_per_slice > 1,
         supports_sub_slicing=supports_sub_slicing,
     )
     system_characteristics_map[f'{prefix}-{topology}'] = system
-    system_characteristics_map[f'{prefix}-{num_tensorcores}'] = system
+    if (
+        topology in default_topologies
+        or device_type not in system_characteristics_map
+    ):
+      system_characteristics_map[device_type] = system
   return system_characteristics_map
@@ -334,7 +372,7 @@ UserFacingNameToSystemCharacteristics = {
         gke_accelerator='tpu7x',
         machine_type='tpu7x-standard-1t',
         supported_topologies=['1x1x1'],
-        requires_workload_policy=True,
+        tpu_type_requires_workload_policy=True,
         supports_sub_slicing=False,
     ),
     **get_tpu_system_characteristics_map(
@@ -342,9 +380,10 @@ UserFacingNameToSystemCharacteristics = {
         tensorcores_per_chip=2,
         gke_accelerator='tpu7x',
         machine_type='tpu7x-standard-4t',
-        requires_workload_policy=True,
+        tpu_type_requires_workload_policy=True,
         supports_sub_slicing=False,
-        supported_topologies=[
+        supported_topologies=generate_tpu_topologies(max_cubes=144),
+        default_topologies=set([
             '12x12x12',
             '12x12x16',
             '12x12x20',
@@ -443,7 +482,7 @@ UserFacingNameToSystemCharacteristics = {
             '8x8x76',
             '8x8x8',
             '8x8x92',
-        ],
+        ]),
     ),
     **get_tpu_system_characteristics_map(
         prefix='v6e',
@@ -458,24 +497,27 @@ UserFacingNameToSystemCharacteristics = {
         tensorcores_per_chip=1,
         gke_accelerator='tpu-v6e-slice',
         machine_type='ct6e-standard-4t',
-        supports_sub_slicing=True,
+        supports_sub_slicing=False,
         supported_topologies=[
             '2x2',
-            '2x4',
-            '4x4',
-            '4x8',
-            '8x8',
-            '8x16',
-            '16x16',
         ],
     ),
+    **get_tpu_system_characteristics_map(
+        prefix='v6e',
+        tensorcores_per_chip=1,
+        gke_accelerator='tpu-v6e-slice',
+        machine_type='ct6e-standard-4t',
+        supports_sub_slicing=True,
+        supported_topologies=SUB_SLICING_TOPOLOGIES,
+    ),
     **get_tpu_system_characteristics_map(
         prefix='v5p',
         tensorcores_per_chip=2,
         gke_accelerator='tpu-v5p-slice',
         machine_type='ct5p-hightpu-4t',
         supports_sub_slicing=False,
-        supported_topologies=[
+        supported_topologies=generate_tpu_topologies(max_cubes=140),
+        default_topologies=set([
             '2x2x1',
             '2x2x2',
             '2x2x4',
@@ -572,7 +614,7 @@ UserFacingNameToSystemCharacteristics = {
             '16x16x24',
             '12x24x24',
             '16x20x28',
-        ],
+        ]),
     ),
     **get_tpu_system_characteristics_map(
         prefix='v5litepod',
@@ -588,7 +630,10 @@ UserFacingNameToSystemCharacteristics = {
         gke_accelerator='tpu-v4-podslice',
         machine_type='ct4p-hightpu-4t',
         supports_sub_slicing=False,
-        supported_topologies=[
+        supported_topologies=generate_tpu_topologies(
+            max_cubes=64, enforce_nondecreasing=False
+        ),
+        default_topologies=set([
             '2x2x1',
             '2x2x2',
             '2x2x4',
@@ -600,7 +645,7 @@ UserFacingNameToSystemCharacteristics = {
             '8x8x12',
             '8x8x16',
             '8x16x16',
-        ],
+        ]),
     ),
     # CPU system characteristics.
     # Note that chips_per_vm is actually the number of vCPUs in that CPU.
@@ -750,3 +795,16 @@ UserFacingNameToSystemCharacteristics = {
 }
 """ If you modify UserFacingNameToSystemCharacteristics you should also modify
 the corresponding Map in MaxText/accelerator_to_spec_map.py """
+def get_system_characteristics_keys_by_accelerator_type(
+    accelerators: list[AcceleratorType] | None = None,
+) -> list[str]:
+  """Returns UserFacingNameToSystemCharacteristics keys for given AcceleratorTypes."""
+  if accelerators is None:
+    accelerators = list(AcceleratorType)
+  return [
+      key
+      for key, value in UserFacingNameToSystemCharacteristics.items()
+      if value.accelerator_type in accelerators
+  ]

xpk/core/system_characteristics_test.py CHANGED Viewed

@@ -14,7 +14,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
-from .system_characteristics import get_tpu_system_characteristics_map, SystemCharacteristics, AcceleratorType
+from .system_characteristics import get_tpu_system_characteristics_map, generate_tpu_topologies, SystemCharacteristics, AcceleratorType
 def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topology():
@@ -25,7 +25,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
       machine_type="test",
       supported_topologies=["1x1"],
       supports_sub_slicing=False,
-      requires_workload_policy=True,
+      tpu_type_requires_workload_policy=False,
   )
   expected_system_characteristics = SystemCharacteristics(
@@ -37,7 +37,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_1x1_topol
       accelerator_type=AcceleratorType.TPU,
       device_type="test-1",
       supports_sub_slicing=False,
-      requires_workload_policy=True,
+      requires_workload_policy=False,
   )
   assert result == {
       "test-1": expected_system_characteristics,
@@ -53,7 +53,7 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
       machine_type="test",
       supported_topologies=["2x2"],
       supports_sub_slicing=False,
-      requires_workload_policy=True,
+      tpu_type_requires_workload_policy=True,
   )
   expected_system_characteristics = SystemCharacteristics(
@@ -65,9 +65,84 @@ def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2_topol
       accelerator_type=AcceleratorType.TPU,
       device_type="test-8",
       supports_sub_slicing=False,
-      requires_workload_policy=True,
+      requires_workload_policy=False,
   )
   assert result == {
       "test-8": expected_system_characteristics,
       "test-2x2": expected_system_characteristics,
   }
+def test_get_tpu_system_characteristics_map_returns_correct_values_for_2x2x2_topology():
+  result = get_tpu_system_characteristics_map(
+      prefix="test",
+      tensorcores_per_chip=2,
+      gke_accelerator="test",
+      machine_type="test",
+      supported_topologies=["2x2x2"],
+      supports_sub_slicing=False,
+      tpu_type_requires_workload_policy=True,
+  )
+  expected_system_characteristics = SystemCharacteristics(
+      topology="2x2x2",
+      vms_per_slice=2,
+      gke_accelerator="test",
+      gce_machine_type="test",
+      chips_per_vm=4,
+      accelerator_type=AcceleratorType.TPU,
+      device_type="test-16",
+      supports_sub_slicing=False,
+      requires_workload_policy=True,
+  )
+  assert result == {
+      "test-16": expected_system_characteristics,
+      "test-2x2x2": expected_system_characteristics,
+  }
+def test_get_tpu_system_characteristics_map_prefers_default_topologies():
+  result = get_tpu_system_characteristics_map(
+      prefix="test",
+      tensorcores_per_chip=2,
+      gke_accelerator="test",
+      machine_type="test",
+      supported_topologies=["4x4x4", "4x4x32", "4x8x16", "8x8x8"],
+      supports_sub_slicing=False,
+      default_topologies=set(["4x8x16"]),
+  )
+  assert result["test-128"].topology == "4x4x4"
+  assert result["test-1024"].topology == "4x8x16"
+def test_generate_tpu_topologies_returns_correct_number_of_values_for_TPU_platforms():
+  v4 = generate_tpu_topologies(max_cubes=64, enforce_nondecreasing=False)
+  v5p = generate_tpu_topologies(max_cubes=140)
+  tpu7x = generate_tpu_topologies(max_cubes=144)
+  assert len(v4) == 800
+  assert len(v5p) == 414
+  assert len(tpu7x) == 432
+def test_generate_tpu_topologies_respects_constraints():
+  ordered_6_cubes = generate_tpu_topologies(
+      max_cubes=6, enforce_nondecreasing=True
+  )
+  non_ordered_6_cubes = generate_tpu_topologies(
+      max_cubes=6, enforce_nondecreasing=False
+  )
+  assert "8x4x4" not in ordered_6_cubes
+  assert "8x4x4" in non_ordered_6_cubes
+  assert "4x8x12" in ordered_6_cubes  # exactly 6 cubes
+  assert "4x8x12" in non_ordered_6_cubes  # exactly 6 cubes
+  assert "4x8x16" not in ordered_6_cubes  # too many cubes (8)
+  assert "4x8x16" not in non_ordered_6_cubes  # too many cubes (8)
+def test_generate_tpu_topologies_contains_sub_cube_slices():
+  one_cube = generate_tpu_topologies(max_cubes=1)
+  assert one_cube == ["2x2x1", "2x2x2", "2x2x4", "2x4x4", "4x4x4"]

xpk/core/telemetry.py ADDED Viewed

@@ -0,0 +1,263 @@
+"""
+Copyright 2025 Google LLC
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+     https://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import platform
+import uuid
+import json
+import os
+import time
+import sys
+import importlib
+import subprocess
+import tempfile
+import requests
+from enum import Enum
+from typing import Any
+from dataclasses import dataclass
+from .config import xpk_config, CLIENT_ID_KEY, SEND_TELEMETRY_KEY, __version__ as xpk_version
+from ..utils.execution_context import is_dry_run
+from ..utils.user_agent import get_user_agent
+from ..utils.feature_flags import FeatureFlags
+def should_send_telemetry():
+  return (
+      FeatureFlags.TELEMETRY_ENABLED
+      and xpk_config.get(SEND_TELEMETRY_KEY) != "false"
+  )
+def send_clearcut_payload(data: str, wait_to_complete: bool = False) -> None:
+  """Sends payload to clearcut endpoint."""
+  try:
+    file_path = _store_payload_in_temp_file(data)
+    if not _schedule_clearcut_background_flush(file_path, wait_to_complete):
+      _clearcut_flush(file_path)
+  except Exception:  # pylint: disable=broad-exception-caught
+    pass
+def _store_payload_in_temp_file(data: str) -> str:
+  with tempfile.NamedTemporaryFile(
+      mode="w", delete=False, encoding="utf-8"
+  ) as file:
+    json.dump(
+        {
+            "data": data,
+            "url": "https://play.googleapis.com/log",
+            "params": {"format": "json_proto"},
+            "headers": {"User-Agent": get_user_agent()},
+            "method": "POST",
+        },
+        file,
+    )
+    return file.name
+def _schedule_clearcut_background_flush(
+    file_path: str, wait_to_complete: bool
+) -> bool:
+  """Schedules clearcut background flush.
+  Args:
+    file_path: path to the temporary file where the events are stored.
+    wait_to_complete: whenever to wait for the background script completion.
+  Returns:
+    True if successful and False otherwise
+  """
+  with importlib.resources.path("xpk", "telemetry_uploader.py") as path:
+    if not os.path.exists(path):
+      return False
+    kwargs: dict[str, Any] = {}
+    if sys.platform == "win32":
+      kwargs["creationflags"] = (
+          subprocess.DETACHED_PROCESS | subprocess.CREATE_NO_WINDOW
+      )
+    else:
+      kwargs["start_new_session"] = True
+    process = subprocess.Popen(
+        args=[
+            sys.executable,
+            str(path),
+            file_path,
+        ],
+        stdout=sys.stdout if wait_to_complete else subprocess.DEVNULL,
+        stderr=sys.stderr if wait_to_complete else subprocess.DEVNULL,
+        **kwargs,
+    )
+    if wait_to_complete:
+      process.wait()
+    return True
+def _clearcut_flush(file_path: str) -> None:
+  with open(file_path, mode="r", encoding="utf-8") as file:
+    kwargs = json.load(file)
+    requests.request(**kwargs)
+    os.remove(file_path)
+class MetricsEventMetadataKey(Enum):
+  """Keys that can be attached as metadata to a metrics event.
+  The value of each member is the string emitted as the "key" field when the
+  event metadata is serialized into the payload.
+  """
+  SESSION_ID = "XPK_SESSION_ID"  # part of the base metadata of every event
+  DRY_RUN = "XPK_DRY_RUN"  # part of the base metadata of every event
+  PYTHON_VERSION = "XPK_PYTHON_VERSION"  # part of the base metadata of every event
+  ZONE = "XPK_ZONE"
+  SYSTEM_CHARACTERISTICS = "XPK_SYSTEM_CHARACTERISTICS"
+  PROVISIONING_MODE = "XPK_PROVISIONING_MODE"
+  COMMAND = "XPK_COMMAND"  # set on the "start" event
+  EXIT_CODE = "XPK_EXIT_CODE"  # set on the "complete" event
+  RUNNING_AS_PIP = "XPK_RUNNING_AS_PIP"  # part of the base metadata of every event
+  RUNNING_FROM_SOURCE = "XPK_RUNNING_FROM_SOURCE"  # part of the base metadata of every event
+@dataclass
+class _MetricsEvent:
+  """A single recorded metrics event waiting to be serialized into a payload."""
+  # Timestamp as returned by time.time(); converted to milliseconds on serialization.
+  time: float
+  # Event category; the collector uses "commands" and "custom".
+  type: str
+  # Event name, e.g. "start" or "complete" for command events.
+  name: str
+  # Event-specific metadata; merged over the base metadata on serialization.
+  metadata: dict[MetricsEventMetadataKey, str]
+class _MetricsCollector:
+  """Metrics collector for collecting various metrics and events across application."""
+  _events: list[_MetricsEvent] = []
+  def log_start(self, command: str) -> None:
+    """Logs start event."""
+    self._events.append(
+        _MetricsEvent(
+            time=time.time(),
+            type="commands",
+            name="start",
+            metadata={MetricsEventMetadataKey.COMMAND: command},
+        )
+    )
+  def log_complete(self, exit_code: int) -> None:
+    """Logs complete event."""
+    self._events.append(
+        _MetricsEvent(
+            time=time.time(),
+            type="commands",
+            name="complete",
+            metadata={MetricsEventMetadataKey.EXIT_CODE: str(exit_code)},
+        )
+    )
+  def log_custom(
+      self,
+      name: str,
+      metadata: dict[MetricsEventMetadataKey, str] | None = None,
+  ) -> None:
+    """Logs custom event."""
+    self._events.append(
+        _MetricsEvent(
+            time=time.time(),
+            type="custom",
+            name=name,
+            metadata=metadata if metadata is not None else {},
+        )
+    )
+  def flush(self) -> str:
+    """Flushes collected events into concord payload."""
+    result = _generate_payload(self._events)
+    self._events.clear()
+    return result
+MetricsCollector = _MetricsCollector()
+def _generate_payload(events: list[_MetricsEvent]) -> str:
+  base_concord_event = _get_base_concord_event()
+  base_event_metadata = _get_base_event_metadata()
+  serialized_events = []
+  for event in events:
+    metadata = {
+        **base_event_metadata,
+        **event.metadata,
+    }
+    serialized_events.append({
+        "event_time_ms": int(event.time * 1000),
+        "source_extension_json": json.dumps({
+            **base_concord_event,
+            "event_type": event.type,
+            "event_name": event.name,
+            "event_metadata": [
+                {"key": key.value, "value": value}
+                for key, value in metadata.items()
+            ],
+        }),
+    })
+  return json.dumps({
+      "client_info": {"client_type": "XPK"},
+      "log_source_name": "CONCORD",
+      "request_time_ms": int(time.time() * 1000),
+      "log_event": serialized_events,
+  })
+def _get_base_event_metadata() -> dict[MetricsEventMetadataKey, str]:
+  return {
+      MetricsEventMetadataKey.SESSION_ID: _get_session_id(),
+      MetricsEventMetadataKey.DRY_RUN: str(is_dry_run()).lower(),
+      MetricsEventMetadataKey.PYTHON_VERSION: platform.python_version(),
+      MetricsEventMetadataKey.RUNNING_AS_PIP: str(_is_running_as_pip()).lower(),
+      MetricsEventMetadataKey.RUNNING_FROM_SOURCE: str(
+          _is_running_from_source()
+      ).lower(),
+  }
+def _get_base_concord_event() -> dict[str, str]:
+  return {
+      "release_version": xpk_version,
+      "console_type": "XPK",
+      "client_install_id": _ensure_client_id(),
+  }
+def _is_running_as_pip() -> bool:
+  return os.path.basename(sys.argv[0]) == "xpk"
+def _is_running_from_source() -> bool:
+  current_path = os.path.abspath(os.path.realpath(__file__))
+  return (
+      "site-packages" not in current_path
+      and "dist-packages" not in current_path
+  )
+def _get_session_id() -> str:
+  return str(uuid.uuid4())
+def _ensure_client_id() -> str:
+  """Generates Client ID and stores in configuration if not already present."""
+  current_client_id = xpk_config.get(CLIENT_ID_KEY)
+  if current_client_id is not None:
+    return current_client_id
+  new_client_id = str(uuid.uuid4())
+  xpk_config.set(CLIENT_ID_KEY, new_client_id)
+  return new_client_id

xpk 0.14.3__py3-none-any.whl → 0.15.0__py3-none-any.whl

xpk 0.14.3__py3-none-any.whl → 0.15.0__py3-none-any.whl