kalavai-client 0.7.12__tar.gz → 0.7.13__tar.gz
This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/PKG-INFO +1 -1
- kalavai_client-0.7.13/kalavai_client/__init__.py +2 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/apps.yaml +26 -6
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/default_pool_config.yaml +11 -9
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/docker-compose-template.yaml +16 -13
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/bridge_api.py +3 -3
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/cli.py +57 -25
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/core.py +24 -21
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/utils.py +2 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/pyproject.toml +1 -1
- kalavai_client-0.7.12/kalavai_client/__init__.py +0 -2
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/LICENSE +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/README.md +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/__main__.py +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/__init__.py +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/docker-compose-gui.yaml +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/model_deployment_values.yaml +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/nginx.conf +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/pool_config_template.yaml +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/user_workspace.yaml +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/user_workspace_values.yaml +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/auth.py +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/bridge_models.py +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/cluster.py +0 -0
- {kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/env.py +0 -0
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/apps.yaml
RENAMED
@@ -19,7 +19,7 @@ repositories:
     url: https://charts.longhorn.io
   - name: volcano-sh
     url: https://volcano-sh.github.io/helm-charts
-  - name: prometheus
+  - name: prometheus-community # prometheus
     url: https://prometheus-community.github.io/helm-charts
   - name: opencost-charts
     url: https://opencost.github.io/opencost-helm-chart
@@ -152,11 +152,27 @@ releases:
     namespace: opencost
     chart: opencost-charts/opencost
     installed: {{deploy_opencost|default("false", true)}}
+    set:
+      - name: service.type
+        value: NodePort
+      - name: opencost.nodeSelector.{{kalavai_role_label}}
+        value: server
+      # point at prometheus instance (theres an opencost.prometheus.external too)
+      - name: opencost.prometheus.internal.enabled
+        value: true
+      - name: opencost.prometheus.internal.serviceName
+        value: {{prometheus_service_name}}
+      - name: opencost.prometheus.internal.namespaceName
+        value: {{prometheus_namespace}}
+      - name: opencost.prometheus.internal.port
+        value: {{prometheus_port}}
   - name: prometheus
-    namespace:
-    chart: prometheus/prometheus
+    namespace: {{prometheus_namespace}}
+    chart: prometheus-community/kube-prometheus-stack #prometheus/prometheus
     installed: {{deploy_prometheus|default("false", true)}}
     set:
+      - name: server.nodeSelector.{{kalavai_role_label}}
+        value: server
       - name: prometheus-pushgateway.enabled
         value: false
       - name: alertmanager.enabled
@@ -209,7 +225,7 @@ releases:
     set:
       - name: namespace
        value: kalavai
-      - name: replicas
+      - name: deployment.replicas
        value: {{watcher_replicas}}
      - name: image_tag
        value: "{{watcher_image_tag}}" #"v2025.07.34"
@@ -232,7 +248,7 @@ releases:
      - name: deployment.kalavai_api_endpoint
        value: {{kalavai_api_endpoint}}
      - name: deployment.prometheus_endpoint
-       value: {{
+       value: "http://{{prometheus_service_name}}.{{prometheus_namespace}}.svc.cluster.local:{{prometheus_port}}"
      - name: deployment.opencost_endpoint
        value: {{opencost_endpoint}}
      - name: deployment.longhorn_manager_endpoint
@@ -243,7 +259,7 @@ releases:
        value: {{watcher_resources_memory}}
      - name: resources.limits.cpu
        value: {{watcher_resources_cpu}}
-     - name: deployment.nodeSelector.
+     - name: deployment.nodeSelector.{{kalavai_role_label}}
        value: "server"
   - name: hami-vgpu
     namespace: kalavai
@@ -253,6 +269,10 @@ releases:
     set:
      - name: resourceCores
        value: "nvidia.com/gpucores"
+     - name: resourceMem
+       value: "nvidia.com/gpumem"
+     - name: resourceMemPercentage
+       value: "nvidia.com/gpumem-percentage"
      - name: devicePlugin.runtimeClassName
        value: "nvidia"
      - name: scheduler.defaultSchedulerPolicy.nodeSchedulerPolicy
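With these changes the opencost release is wired to the in-cluster Prometheus through the `opencost.prometheus.internal.*` values, and the watcher receives the same instance as a fully qualified cluster DNS endpoint. A minimal sketch of what that endpoint expands to, assuming the new defaults shipped in `default_pool_config.yaml` below (illustrative Python, not part of the package):

```python
# Assumed defaults, taken from default_pool_config.yaml in this release
prometheus_service_name = "prometheus-kube-prometheus-prometheus"
prometheus_namespace = "prometheus-system"
prometheus_port = 9090

# Mirrors the templated deployment.prometheus_endpoint value above
endpoint = (
    f"http://{prometheus_service_name}.{prometheus_namespace}"
    f".svc.cluster.local:{prometheus_port}"
)
print(endpoint)
# http://prometheus-kube-prometheus-prometheus.prometheus-system.svc.cluster.local:9090
```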
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/default_pool_config.yaml
RENAMED
@@ -4,14 +4,14 @@ server:
   location: null
   name: "kalavai_cluster"
   mtu: ""
-  watcher_image_tag: "v2025.10.
+  watcher_image_tag: "v2025.10.24"
 
 core:
   # Deploy systems
   deploy:
   - volcano
-
-
+  #- cert_manager
+  #- rocm
   - watcher
   - hami
   #- lago
@@ -21,6 +21,7 @@ core:
   #- minio
   # "Kalavai API endpoint"
   kalavai_api_endpoint: "https://platform.kalavai.net/_/api"
+  kalavai_role_label: "kalavai/role"
   # "Opencost endpoint"
   opencost_endpoint: "http://opencost.opencost.svc.cluster.local:9003"
   # "Longhorn manager endpoint"
@@ -29,14 +30,15 @@ core:
   helios_harvest_interval: 120
   # "Watcher is shared pool"
   watcher_is_shared_pool: "True"
-  watcher_resources_memory:
-  watcher_resources_cpu:
-  watcher_replicas:
-  # "Prometheus
-
+  watcher_resources_memory: 0.5
+  watcher_resources_cpu: 0.5
+  watcher_replicas: 1
+  # "Prometheus and opencost"
+  prometheus_service_name: prometheus-kube-prometheus-prometheus
+  prometheus_namespace: prometheus-system
+  prometheus_port: 9090
   # "Prometheus server retention"
   prometheus_server_retention: "30d"
-  # "Prometheus disk size"
   prometheus_disk_size: "8Gi"
   # "Longhorn UI port"
   longhorn_ui_port: 30000
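The pool config now ships concrete defaults for the watcher resources and for the Prometheus/OpenCost wiring, plus the `kalavai_role_label` used by the node selectors in apps.yaml. A quick way to inspect those defaults (illustrative snippet assuming PyYAML and the asset path from the file list above; this is not the client's own config-loading code):

```python
import yaml  # assumes PyYAML is installed

# Path taken from this package's asset layout
with open("kalavai_client/assets/default_pool_config.yaml") as f:
    config = yaml.safe_load(f)

core = config["core"]
print(core["kalavai_role_label"])        # kalavai/role
print(core["prometheus_service_name"])   # prometheus-kube-prometheus-prometheus
print(core["prometheus_namespace"], core["prometheus_port"])    # prometheus-system 9090
print(core["watcher_replicas"], core["watcher_resources_cpu"])  # 1 0.5
```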
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/docker-compose-template.yaml
RENAMED
@@ -20,7 +20,7 @@ services:
   # run worker only if command is set
   {%if command %}
   {{service_name}}:
-    image: docker.io/bundenth/kalavai-runner
+    image: docker.io/bundenth/kalavai-runner-{{target_platform}}:latest
     pull_policy: always
     container_name: {{service_name}}
     platform: linux/{{target_platform}}
@@ -43,26 +43,29 @@ services:
       {% if random_suffix %}
       --random_suffix="{{random_suffix}}"
       {% endif %}
-
+      {% if command == "server" %}
       --port_range="30000-32767"
-      {%
+      {% if load_balancer_ip_address %}
+      --tls_san={{load_balancer_ip_address}}
+      {% endif %}
+      {% else %}
       --server_ip={{pool_ip}}
       --token={{pool_token}}
-
-
+      {% endif %}
+      {%if vpn %}
       --flannel_iface={{flannel_iface}}
-
-
+      {% endif %}
+      {% if num_gpus and num_gpus > 0 %}
       --gpu=on
-
+      {% else %}
       --gpu=off
-
-
+      {% endif %}
+      {% if node_labels %}
       --extra="{{node_labels}}"
-
-
+      {% endif %}
+      {% if mtu != "" %}
       --mtu={{mtu}}
-
+      {% endif %}
 
     # volumes:
     # - {{k3s_path}}:/var/lib/rancher/k3s # Persist data
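The runner template now selects a platform-specific image and, on seed/server nodes, adds a `--tls_san` flag whenever a load balancer address is supplied. A minimal rendering sketch (illustrative only; the fragment below is a trimmed stand-in for the real template and the values are made up):

```python
from jinja2 import Template

# Trimmed stand-in for the command section of docker-compose-template.yaml
fragment = Template("""
{% if command == "server" %}
--port_range="30000-32767"
{% if load_balancer_ip_address %}
--tls_san={{load_balancer_ip_address}}
{% endif %}
{% else %}
--server_ip={{pool_ip}}
--token={{pool_token}}
{% endif %}
""")

print(fragment.render(
    command="server",
    load_balancer_ip_address="192.168.68.10",  # hypothetical LB address
))
```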
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/bridge_api.py
RENAMED
@@ -614,14 +614,14 @@ def node_labels(request: NodeLabelsRequest, api_key: str = Depends(verify_api_ke
     description="Retrieves all labels associated with specified compute nodes in the pool. Labels provide metadata about nodes and can be used for filtering and scheduling decisions.",
     tags=["info"],
     response_description="Node labels")
-def node_labels_get(
+def node_labels_get(nodes: Optional[List[str]] = Query(None), api_key: str = Depends(verify_api_key)):
     """
     Get node labels with the following parameters:
 
-    - **
+    - **nodes**: List of node names to get labels for
     """
     result = get_node_labels(
-        node_names=
+        node_names=nodes
     )
     return result
 
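The GET variant of the node labels endpoint now takes an optional repeated `nodes` query parameter instead of a truncated signature. A small FastAPI sketch of how such a parameter is parsed; the route path and app below are illustrative assumptions, not the actual bridge_api wiring:

```python
from typing import List, Optional
from fastapi import FastAPI, Query

app = FastAPI()

@app.get("/v1/get_node_labels")  # hypothetical path, for illustration only
def node_labels_get(nodes: Optional[List[str]] = Query(None)):
    # GET /v1/get_node_labels?nodes=node-a&nodes=node-b  ->  nodes == ["node-a", "node-b"]
    # GET /v1/get_node_labels                            ->  nodes is None (all nodes)
    return {"requested_nodes": nodes}
```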
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/cli.py
RENAMED
@@ -406,6 +406,7 @@ def pool__start(
     watcher_image_tag: str=None,
     platform="amd64",
     ip_address: str=None,
+    lb_address: str=None,
     location: str=None,
     non_interactive: bool=False,
     node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
@@ -421,6 +422,10 @@ def pool__start(
     if CLUSTER.is_cluster_init():
         console.log(f"[white] You are already connected to {load_server_info(data_key=CLUSTER_NAME_KEY, file=USER_LOCAL_SERVER_FILE)}. Enter [yellow]kalavai pool stop[white] to exit and join another one.")
         return
+
+    if non_interactive and all([value is None for value in [location, lb_address, ip_address]]):
+        console.log("[red]In --non-interactive mode without --location, one of --lb-address or --ip-address must be set")
+        return
 
     if node_labels:
         console.log(f"[blue]Configuration received: {node_labels}")
@@ -451,6 +456,7 @@ def pool__start(
 
     result = create_pool(
         ip_address=ip_address,
+        lb_ip_address=lb_address,
         location=location,
         target_platform=platform,
         watcher_image_tag=watcher_image_tag,
@@ -523,7 +529,8 @@ def pool__join(
     platform="amd64",
     node_name=None,
     non_interactive=False,
-    node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {}
+    node_labels: Annotated[dict, arguably.arg.handler(parse_key_value_pairs)] = {},
+    seed: bool=False
 ):
     """
     Join Kalavai pool and start/resume sharing resources.
@@ -536,6 +543,7 @@ def pool__join(
         node_name: Name for this node
         non_interactive: Run in non-interactive mode
         node_labels: Node labels as key=value pairs (e.g., "key1=value1,key2=value2")
+        seed: if the node should join as an extra seed (for HA deployments)
     """
 
     # Process node labels if provided
@@ -590,7 +598,8 @@ def pool__join(
         num_gpus=num_gpus,
         ip_address=ip_address,
         mtu=mtu,
-        node_labels=node_labels
+        node_labels=node_labels,
+        is_seed=seed
     )
     if "error" in result:
         console.log(f"[red]Error when connecting: {result}")
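The new `seed` flag on `pool join` is forwarded to `join_pool(..., is_seed=seed)`, which (see the core.py hunks further down) switches both the node role label and the runner role. A small sketch of that mapping; the names below are assumptions based on the diffs in this release, with the label value assumed to match the `kalavai_role_label` added to the pool config:

```python
NODE_ROLE_LABEL = "kalavai/role"  # assumed to match kalavai_role_label

def join_role(is_seed: bool) -> dict:
    return {
        NODE_ROLE_LABEL: "server" if is_seed else "worker",  # node label applied on join
        "runner_role": "seed" if is_seed else "agent",       # role passed to the compose template
    }

print(join_role(False))  # {'kalavai/role': 'worker', 'runner_role': 'agent'}
print(join_role(True))   # {'kalavai/role': 'server', 'runner_role': 'seed'}
```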
@@ -1320,7 +1329,7 @@ def job__list(*others):
 
 
 @arguably.command
-def job__logs(name, *others, pod_name=None,
+def job__logs(name, *others, pod_name=None, tail=100, force_namespace: str=None):
     """
     Get logs for a specific job
     """
@@ -1333,34 +1342,57 @@ def job__logs(name, *others, pod_name=None, stream=False, tail=100, force_namesp
     if force_namespace is not None:
         console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
 
-
+    data = fetch_job_logs(
         job_name=name,
         pod_name=pod_name,
         force_namespace=force_namespace,
         tail=tail)
-    if "error" in
-        console.log(f"[red]{
+    if "error" in data:
+        console.log(f"[red]{data}")
         return
- … (19 removed lines, old 1344-1362, content not shown in the source diff)
+    for pod, info in data.items():
+        if pod_name is not None and pod_name != pod:
+            continue
+        if "pod" not in info or info["pod"] is None:
+            console.log(f"[white]Logs for {pod_name} not ready yet. Try [yellow]kalavai job describe {pod_name}")
+            continue
+        console.log(f"[yellow]Pod {pod} in {info['pod']['spec']['node_name']}")
+        console.log(f"[green]{info['logs']}")
+        console.log("---------------------------")
+        console.log("---------------------------")
+        console.log(f"[yellow]Status {pod} in {info['pod']['spec']['node_name']}")
+        console.log(f"[green]{info['status']}")
+
+@arguably.command
+def job__describe(name, *others, pod_name=None, force_namespace: str=None):
+    """
+    Get logs for a specific job
+    """
+    try:
+        CLUSTER.validate_cluster()
+    except Exception as e:
+        console.log(f"[red]Problems with your pool: {str(e)}")
+        return
+
+    if force_namespace is not None:
+        console.log("[WARNING][yellow]--force-namespace [white]requires an admin key. Request will fail if you are not an admin.")
 
+    data = fetch_job_logs(
+        job_name=name,
+        pod_name=pod_name,
+        force_namespace=force_namespace)
+    if "error" in data:
+        console.log(f"[red]{data}")
+        return
+    console.log(f"[yellow]Status for {name}:")
+    for pod, info in data.items():
+        if pod_name is not None and pod_name != pod:
+            continue
+        if "pod" not in info or info["pod"] is None:
+            console.log(f"[white]Logs for {pod_name} not ready yet. Try [yellow]kalavai job describe {pod_name}")
+            continue
+
+        console.log(json.dumps(info['status'], indent=2))
 
 @arguably.command
 def job__manifest(*others, name, force_namespace: str=None):
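`job logs` and the new `job describe` both iterate over the same per-pod payload returned by `fetch_job_logs`. A sketch of that shape, inferred from the loops above; the sample values are made up:

```python
# Hypothetical example of the {pod_name: {...}} mapping the CLI iterates over
data = {
    "my-job-worker-0": {
        "pod": {"spec": {"node_name": "gpu-node-1"}},
        "logs": "loading model weights...",
        "status": {"phase": "Running"},
    }
}

for pod, info in data.items():
    if info.get("pod") is None:
        continue  # not scheduled yet; the CLI suggests `kalavai job describe`
    print(f"{pod} on {info['pod']['spec']['node_name']}")
    print(info["logs"])    # what `kalavai job logs` prints
    print(info["status"])  # what `kalavai job describe` prints
```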
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/core.py
RENAMED
@@ -468,7 +468,7 @@ def fetch_pod_logs(label_key, label_value, force_namespace=None, pod_name=None,
     data = {
         "label": label_key,
         "value": label_value,
-        "
+        "tail_lines": tail
     }
     if force_namespace is not None:
         data["force_namespace"] = force_namespace
@@ -476,7 +476,7 @@ def fetch_pod_logs(label_key, label_value, force_namespace=None, pod_name=None,
     # send tail as parameter (fetch only last _tail_ lines)
     all_logs = request_to_server(
         method="post",
-        endpoint="/v1/
+        endpoint="/v1/get_job_details",
         data=data,
         server_creds=USER_LOCAL_SERVER_FILE,
         user_cookie=USER_COOKIE
@@ -673,7 +673,8 @@ def join_pool(
     ip_address=None,
     target_platform="amd64",
     mtu="",
-    node_labels={}
+    node_labels={},
+    is_seed=False
 ):
     compatibility = check_worker_compatibility()
     if len(compatibility["issues"]) > 0:
@@ -705,13 +706,13 @@ def join_pool(
     node_labels = {
         **node_labels,
         STORAGE_CLASS_LABEL: is_storage_compatible(),
-        NODE_ROLE_LABEL: "worker"
+        NODE_ROLE_LABEL: "worker" if not is_seed else "server"
     }
     # local agent join
     # Generate docker compose recipe
     generate_compose_config(
         target_platform=target_platform,
-        role="agent",
+        role="agent" if not is_seed else "seed",
         node_ip_address=ip_address,
         pool_ip=f"https://{kalavai_seed_ip}:6443",
         pool_token=kalavai_token,
@@ -758,20 +759,21 @@ def join_pool(
     return cluster_name
 
 def create_pool(
- … (14 removed lines, old 761-774, content not shown in the source diff)
+    cluster_name: str=None,
+    ip_address: str=None,
+    lb_ip_address: str=None,
+    location: str=None,
+    target_platform: str="amd64",
+    watcher_image_tag: str=None,
+    pool_config_file: str=None,
+    description: str="",
+    token_mode: TokenType=TokenType.USER,
+    num_gpus: int=-1,
+    node_name: str=None,
+    mtu: str="",
+    apps: list=[],
+    node_labels: dict={}
+):
 
     if not check_seed_compatibility():
         return {"error": "Requirements failed"}
@@ -822,6 +824,7 @@ def create_pool(
         role="server",
         vpn_token=location,
         node_ip_address=ip_address,
+        lb_ip_address=lb_ip_address,
         num_gpus=num_gpus,
         node_name=node_name,
         node_labels=node_labels,
@@ -848,7 +851,7 @@ def create_pool(
     watcher_service = f"{ip_address}:{DEFAULT_WATCHER_PORT}"
     values = {
         #CLUSTER_NAME_KEY: cluster_name,
-        CLUSTER_IP_KEY: ip_address,
+        CLUSTER_IP_KEY: ip_address if lb_ip_address is None else lb_ip_address,
         USER_ID_KEY: user_id if user_id is not None else "",
         AUTH_KEY: auth_key,
         READONLY_AUTH_KEY: readonly_auth_key,
@@ -861,7 +864,7 @@ def create_pool(
     }
 
     store_server_info(
-        server_ip=ip_address,
+        server_ip=ip_address if lb_ip_address is None else lb_ip_address,
         auth_key=auth_key,
         readonly_auth_key=readonly_auth_key,
         write_auth_key=write_auth_key,
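When a load balancer address is supplied to `create_pool`, it is what gets advertised to workers and stored as the server address; the seed node's own IP is only used as a fallback. A tiny sketch of that fallback, with an illustrative function name that does not exist in the package:

```python
from typing import Optional

def advertised_server_ip(ip_address: str, lb_ip_address: Optional[str]) -> str:
    # Mirrors the `ip_address if lb_ip_address is None else lb_ip_address` pattern above
    return ip_address if lb_ip_address is None else lb_ip_address

assert advertised_server_ip("10.0.0.5", None) == "10.0.0.5"
assert advertised_server_ip("10.0.0.5", "192.168.68.1") == "192.168.68.1"
```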
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/utils.py
RENAMED
@@ -163,6 +163,7 @@ def generate_compose_config(
     target_platform="amd64",
     write_to_file=True,
     node_ip_address="0.0.0.0",
+    lb_ip_address=None,
     num_gpus=0,
     node_labels=None,
     pool_ip=None,
@@ -180,6 +181,7 @@ def generate_compose_config(
         "vpn_name": DEFAULT_VPN_CONTAINER_NAME,
         "mtu": mtu,
         "node_ip_address": node_ip_address,
+        "load_balancer_ip_address": lb_ip_address if lb_ip_address is not None else "",
         "pool_ip": pool_ip,
         "pool_token": pool_token,
         "vpn_token": vpn_token,
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/LICENSE
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/README.md
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/__main__.py
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/__init__.py
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/docker-compose-gui.yaml
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/model_deployment_values.yaml
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/nginx.conf
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/pool_config_template.yaml
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/user_workspace.yaml
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/assets/user_workspace_values.yaml
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/auth.py
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/bridge_models.py
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/cluster.py
RENAMED
File without changes
{kalavai_client-0.7.12 → kalavai_client-0.7.13}/kalavai_client/env.py
RENAMED
File without changes