kubetorch 0.2.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kubetorch might be problematic. Click here for more details.

Files changed (93)
  1. kubetorch/__init__.py +60 -0
  2. kubetorch/cli.py +1985 -0
  3. kubetorch/cli_utils.py +1025 -0
  4. kubetorch/config.py +453 -0
  5. kubetorch/constants.py +18 -0
  6. kubetorch/docs/Makefile +18 -0
  7. kubetorch/docs/__init__.py +0 -0
  8. kubetorch/docs/_ext/json_globaltoc.py +42 -0
  9. kubetorch/docs/api/cli.rst +10 -0
  10. kubetorch/docs/api/python/app.rst +21 -0
  11. kubetorch/docs/api/python/cls.rst +19 -0
  12. kubetorch/docs/api/python/compute.rst +25 -0
  13. kubetorch/docs/api/python/config.rst +11 -0
  14. kubetorch/docs/api/python/fn.rst +19 -0
  15. kubetorch/docs/api/python/image.rst +14 -0
  16. kubetorch/docs/api/python/secret.rst +18 -0
  17. kubetorch/docs/api/python/volumes.rst +13 -0
  18. kubetorch/docs/api/python.rst +101 -0
  19. kubetorch/docs/conf.py +69 -0
  20. kubetorch/docs/index.rst +20 -0
  21. kubetorch/docs/requirements.txt +5 -0
  22. kubetorch/globals.py +285 -0
  23. kubetorch/logger.py +59 -0
  24. kubetorch/resources/__init__.py +0 -0
  25. kubetorch/resources/callables/__init__.py +0 -0
  26. kubetorch/resources/callables/cls/__init__.py +0 -0
  27. kubetorch/resources/callables/cls/cls.py +157 -0
  28. kubetorch/resources/callables/fn/__init__.py +0 -0
  29. kubetorch/resources/callables/fn/fn.py +133 -0
  30. kubetorch/resources/callables/module.py +1416 -0
  31. kubetorch/resources/callables/utils.py +174 -0
  32. kubetorch/resources/compute/__init__.py +0 -0
  33. kubetorch/resources/compute/app.py +261 -0
  34. kubetorch/resources/compute/compute.py +2596 -0
  35. kubetorch/resources/compute/decorators.py +139 -0
  36. kubetorch/resources/compute/rbac.py +74 -0
  37. kubetorch/resources/compute/utils.py +1114 -0
  38. kubetorch/resources/compute/websocket.py +137 -0
  39. kubetorch/resources/images/__init__.py +1 -0
  40. kubetorch/resources/images/image.py +414 -0
  41. kubetorch/resources/images/images.py +74 -0
  42. kubetorch/resources/secrets/__init__.py +2 -0
  43. kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
  44. kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
  45. kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
  46. kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
  47. kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
  48. kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
  49. kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
  50. kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
  51. kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
  52. kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
  53. kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
  54. kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
  55. kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
  56. kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
  57. kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
  58. kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
  59. kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
  60. kubetorch/resources/secrets/secret.py +238 -0
  61. kubetorch/resources/secrets/secret_factory.py +70 -0
  62. kubetorch/resources/secrets/utils.py +209 -0
  63. kubetorch/resources/volumes/__init__.py +0 -0
  64. kubetorch/resources/volumes/volume.py +365 -0
  65. kubetorch/servers/__init__.py +0 -0
  66. kubetorch/servers/http/__init__.py +0 -0
  67. kubetorch/servers/http/distributed_utils.py +3223 -0
  68. kubetorch/servers/http/http_client.py +730 -0
  69. kubetorch/servers/http/http_server.py +1788 -0
  70. kubetorch/servers/http/server_metrics.py +278 -0
  71. kubetorch/servers/http/utils.py +728 -0
  72. kubetorch/serving/__init__.py +0 -0
  73. kubetorch/serving/autoscaling.py +173 -0
  74. kubetorch/serving/base_service_manager.py +363 -0
  75. kubetorch/serving/constants.py +83 -0
  76. kubetorch/serving/deployment_service_manager.py +478 -0
  77. kubetorch/serving/knative_service_manager.py +519 -0
  78. kubetorch/serving/raycluster_service_manager.py +582 -0
  79. kubetorch/serving/service_manager.py +18 -0
  80. kubetorch/serving/templates/deployment_template.yaml +17 -0
  81. kubetorch/serving/templates/knative_service_template.yaml +19 -0
  82. kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
  83. kubetorch/serving/templates/pod_template.yaml +194 -0
  84. kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
  85. kubetorch/serving/templates/raycluster_template.yaml +35 -0
  86. kubetorch/serving/templates/service_template.yaml +21 -0
  87. kubetorch/serving/templates/workerset_template.yaml +36 -0
  88. kubetorch/serving/utils.py +377 -0
  89. kubetorch/utils.py +284 -0
  90. kubetorch-0.2.0.dist-info/METADATA +121 -0
  91. kubetorch-0.2.0.dist-info/RECORD +93 -0
  92. kubetorch-0.2.0.dist-info/WHEEL +4 -0
  93. kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
@@ -0,0 +1,81 @@
1
+ # Setup/launch script: pick a Python binary, optionally install kubetorch, then start the HTTP server. First, increase the file descriptor limit for large-scale distributed jobs
2
+ ulimit -n 65536
3
+
4
+ {% if python_path %}
5
+ export PATH="{{ python_path }}:$PATH"
6
+ if command -v "{{ python_path }}" &> /dev/null; then
7
+ python_bin="{{ python_path }}"
8
+ fi
9
+ {% endif %}
10
+ # If python_path is an exact executable, it is used directly above (prepending it to PATH then has little effect).
11
+ # If it is a directory, the command check above fails as desired and python_bin stays unset,
12
+ # so we fall back to python3 or python found in PATH (searching that directory first).
13
+ if [[ -z "$python_bin" ]]; then
14
+ if command -v python3 &> /dev/null; then
15
+ python_bin="python3"
16
+ elif command -v python &> /dev/null; then
17
+ python_bin="python"
18
+ else
19
+ echo "Error: Neither python3 nor python found in PATH. Please set python_path to a valid Python executable."
20
+ exit 1
21
+ fi
22
+ fi
23
+ echo "Using Python binary: $python_bin"
24
+
25
+ {% if not freeze %}
26
+ if ! command -v rsync &> /dev/null; then
27
+ apt-get update && apt-get install -y rsync # assumes an apt-based image
28
+ fi
29
+ if ! command -v nohup &> /dev/null; then
30
+ apt-get update && apt-get install -y coreutils # assumes an apt-based image
31
+ fi
32
+
33
+ {% if install_cmd %}
34
+ # Use the explicitly provided install command
35
+ uv_pip_cmd="{{ install_cmd }}"
36
+ {% else %}
37
+ # Detect a virtualenv (sys.prefix differs from sys.base_prefix); sys.exit is used because the site-provided exit() is not guaranteed to exist
38
+ if $python_bin -c "import sys; sys.exit(0 if sys.prefix != sys.base_prefix else 1)" 2>/dev/null; then
39
+ install_flags="" # inside a virtualenv: install into it
40
+ else
41
+ install_flags="--system --break-system-packages" # no virtualenv: install into the interpreter's own environment
42
+ fi
43
+
44
+ # Check if uv is available and set the appropriate command
45
+ if command -v uv &> /dev/null; then
46
+ # Use system-wide uv with the detected Python interpreter
47
+ uv_pip_cmd="uv pip install $install_flags --python=$python_bin"
48
+ elif $python_bin -m uv --version &> /dev/null; then
49
+ # Use Python module uv - it inherently uses the right Python
50
+ uv_pip_cmd="$python_bin -m uv pip install $install_flags"
51
+ else
52
+ # Install uv as a Python module and use it
53
+ echo "uv not found, installing it..."
54
+ $python_bin -m pip install uv
55
+ uv_pip_cmd="$python_bin -m uv pip install $install_flags"
56
+ fi
57
+ {% endif %}
58
+
59
+ # Export the install command as an environment variable for use in applications, and persist it under .kt/
60
+ echo "Setting KT_PIP_INSTALL_CMD env var to $uv_pip_cmd"
61
+ export KT_PIP_INSTALL_CMD="$uv_pip_cmd"
62
+ mkdir -p .kt && echo "$uv_pip_cmd" > .kt/kt_pip_install_cmd
63
+
64
+ {% if install_url %}
65
+ $uv_pip_cmd "kubetorch[server] @ {{ install_url }}"
66
+ {% if install_otel %}
67
+ $uv_pip_cmd "kubetorch[otel] @ {{ install_url }}"
68
+ {% endif %}
69
+ {% else %}
70
+ {{ rsync_kt_editable_cmd }}
71
+ $uv_pip_cmd -e "python_client[server]" # editable install from ./python_client (presumably synced by the command above; confirm)
72
+ {% if install_otel %}
73
+ $uv_pip_cmd -e "python_client[otel]"
74
+ {% endif %}
75
+ {% endif %}
76
+
77
+ {% endif %}
78
+
79
+ $python_bin -m uvicorn kubetorch.servers.http.http_server:app \
80
+ --host 0.0.0.0 \
81
+ --port {{ server_port }}
@@ -0,0 +1,194 @@
1
+ {% if service_account_name is not none %}
2
+ serviceAccountName: {{ service_account_name }}
3
+ {% endif %}
4
+ # Pod spec fragment: optional scheduling fields first, then the single kubetorch container and its volumes.
5
+ {% if priority_class_name is not none %}
6
+ priorityClassName: {{ priority_class_name }}
7
+ {% endif %}
8
+
9
+ {% if queue_name is not none %}
10
+ schedulerName: {{ scheduler_name }} # NOTE(review): gated on queue_name but renders scheduler_name; confirm both are always set together
11
+ {% endif %}
12
+ # Optional placement constraints: GPU anti-affinity, node selector, tolerations.
13
+ {% if gpu_anti_affinity is sameas true %}
14
+ affinity:
15
+ nodeAffinity:
16
+ requiredDuringSchedulingIgnoredDuringExecution:
17
+ nodeSelectorTerms:
18
+ - matchExpressions:
19
+ - key: nvidia.com/gpu
20
+ operator: DoesNotExist
21
+ - key: eks.amazonaws.com/instance-gpu-count
22
+ operator: DoesNotExist
23
+ - key: cloud.google.com/gke-accelerator
24
+ operator: DoesNotExist
25
+ {% endif %}
26
+
27
+ {% if node_selector is not none %}
28
+ nodeSelector: # values are quoted so they stay strings (e.g. "true", "123")
29
+ {% for key, value in node_selector.items() %}
30
+ {{ key }}: "{{ value }}"
31
+ {% endfor %}
32
+ {% endif %}
33
+
34
+ {% if tolerations is not none and tolerations|length > 0 %}
35
+ tolerations:
36
+ {% for tol in tolerations %}
37
+ - key: "{{ tol.key }}"
38
+ operator: "{{ tol.operator }}"
39
+ value: "{{ tol.value }}"
40
+ effect: "{{ tol.effect }}"
41
+ {% endfor %}
42
+ {% endif %}
43
+
44
+ timeoutSeconds: {{ launch_timeout }} # NOTE(review): not a core PodSpec field; presumably consumed by the Knative revision spec, confirm
45
+ containers:
46
+ - name: kubetorch
47
+ image: {{ server_image }}
48
+ {% if image_pull_policy is not none %}
49
+ imagePullPolicy: {{ image_pull_policy }}
50
+ {% endif %}
51
+ {% if working_dir is not none %}
52
+ workingDir: {{ working_dir }}
53
+ {% endif %}
54
+ ports:
55
+ - name: http1
56
+ containerPort: {{ server_port }}
57
+ command: ["/bin/bash", "-c"] # runs the rendered setup script passed in args
58
+ {% if not freeze %}
59
+ securityContext:
60
+ capabilities:
61
+ add:
62
+ - "SYS_PTRACE" # NOTE(review): only granted when not frozen; purpose is not stated here
63
+ {% endif %}
64
+ args:
65
+ - |
66
+ {{ setup_script | indent(8, true) }}
67
+
68
+ env:
69
+ # Pod metadata available via the Kubernetes Downward API
70
+ - name: POD_NAME
71
+ valueFrom:
72
+ fieldRef:
73
+ fieldPath: metadata.name
74
+ - name: POD_NAMESPACE
75
+ valueFrom:
76
+ fieldRef:
77
+ fieldPath: metadata.namespace
78
+ - name: POD_IP
79
+ valueFrom:
80
+ fieldRef:
81
+ fieldPath: status.podIP
82
+ - name: POD_UUID
83
+ valueFrom:
84
+ fieldRef:
85
+ fieldPath: metadata.uid
86
+ - name: MODULE_NAME
87
+ valueFrom:
88
+ fieldRef:
89
+ fieldPath: metadata.labels['kubetorch.com/module']
90
+ - name: KUBETORCH_VERSION
91
+ valueFrom:
92
+ fieldRef:
93
+ fieldPath: metadata.labels['kubetorch.com/version']
94
+ - name: UV_LINK_MODE
95
+ value: "copy" # Suppress the hardlink warning
96
+ - name: OTEL_EXPORTER_OTLP_ENDPOINT
97
+ value: "kubetorch-otel-opentelemetry-collector.kubetorch-monitoring.svc.cluster.local:4317"
98
+ - name: OTEL_EXPORTER_OTLP_PROTOCOL
99
+ value: "grpc"
100
+ - name: OTEL_TRACES_EXPORTER
101
+ value: "otlp"
102
+ - name: OTEL_PROPAGATORS
103
+ value: "tracecontext,baggage"
104
+ - name: KT_OTEL_ENABLED
105
+ value: "{{ otel_enabled }}"
106
+ - name: KT_SERVER_PORT
107
+ value: "{{ server_port }}"
108
+ - name: KT_FREEZE
109
+ value: "{{ freeze }}"
110
+ {% if inactivity_ttl is not none %}
111
+ - name: KT_INACTIVITY_TTL
112
+ value: "{{ inactivity_ttl }}"
113
+ {% endif %}
114
+ {% for key, value in config_env_vars.items() %}
115
+ - name: {{ key }}
116
+ value: "{{ value }}"
117
+ {% endfor %}
118
+ {% if env_vars is not none and env_vars|length > 0 %}
119
+ {% for key, value in env_vars.items() %}
120
+ - name: {{ key }}
121
+ value: "{{ value }}"
122
+ {% endfor %}
123
+ {% endif %}
124
+ {% for secret in secret_env_vars %}
125
+ {% for key in secret.env_vars %}
126
+ - name: {{ key }}
127
+ valueFrom:
128
+ secretKeyRef:
129
+ name: {{ secret.secret_name }}
130
+ key: {{ key }}
131
+ {% endfor %}
132
+ {% endfor %}
133
+ volumeMounts:
134
+ - mountPath: /dev/shm # backed by the memory-medium emptyDir "dshm" declared under volumes
135
+ name: dshm
136
+ {% for secret in secret_volumes %}
137
+ - name: {{ secret.name }}
138
+ mountPath: {{ secret.path }}
139
+ readOnly: true
140
+ {% endfor %}
141
+ {% if volume_mounts is not none and volume_mounts|length > 0 %}
142
+ {% for mount in volume_mounts %}
143
+ - name: {{ mount.name }}
144
+ mountPath: {{ mount.mountPath }}
145
+ {% endfor %}
146
+ {% endif %}
147
+ resources:
148
+ {{ resources | tojson }}
149
+ # TODO: do we want these health checks?
150
+ # Note: Knative won't consider the service ready to receive traffic until the probe succeeds at least once
151
+ # Startup probe: polls /health every 5s for roughly launch_timeout seconds (threshold floored at 1, the Kubernetes minimum)
152
+ startupProbe:
153
+ httpGet:
154
+ path: /health
155
+ port: {{ server_port }}
156
+ initialDelaySeconds: 0
157
+ periodSeconds: 5
158
+ timeoutSeconds: 2
159
+ failureThreshold: {{ [launch_timeout // 5, 1] | max }}
160
+ readinessProbe:
161
+ httpGet:
162
+ path: /health
163
+ port: {{ server_port }}
164
+ periodSeconds: 3
165
+ successThreshold: 1
166
+ failureThreshold: 5
167
+ # Ongoing health monitoring with less frequent checks
168
+ livenessProbe:
169
+ httpGet:
170
+ path: /health
171
+ port: {{ server_port }}
172
+ periodSeconds: 30
173
+ timeoutSeconds: 1
174
+ failureThreshold: 3
175
+
176
+ volumes:
177
+ - name: dshm
178
+ emptyDir:
179
+ medium: Memory
180
+ {% if shm_size_limit is not none %}
181
+ sizeLimit: {{ shm_size_limit }}
182
+ {% endif %}
183
+ {% for secret in secret_volumes %}
184
+ - name: {{ secret.name }}
185
+ secret:
186
+ secretName: {{ secret.secret_name }}
187
+ {% endfor %}
188
+ {% if volume_specs is not none and volume_specs|length > 0 %}
189
+ {% for spec in volume_specs %}
190
+ - name: {{ spec.name }}
191
+ persistentVolumeClaim:
192
+ claimName: {{ spec.persistentVolumeClaim.claimName }}
193
+ {% endfor %}
194
+ {% endif %}
@@ -0,0 +1,42 @@
1
+ apiVersion: v1 # Service in front of the Ray head pod: HTTP on port 80 plus the fixed Ray ports listed below
2
+ kind: Service
3
+ metadata:
4
+ name: {{ name }}
5
+ namespace: {{ namespace }}
6
+ annotations: {{ annotations | tojson }}
7
+ labels: {{ labels | tojson }}
8
+ spec:
9
+ {% if distributed %}
10
+ clusterIP: None # Headless service for Ray pod discovery
11
+ {% else %}
12
+ sessionAffinity: ClientIP # Ensure requests from same client go to same pod
13
+ {% endif %}
14
+ selector:
15
+ kubetorch.com/service: {{ deployment_name }}
16
+ kubetorch.com/module: {{ module_name }}
17
+ ray.io/node-type: head # Only select head node pods
18
+ ports: # Ray port numbers mirror the head group's rayStartParams in the RayCluster template
19
+ - name: http
20
+ port: 80
21
+ targetPort: {{ server_port }} # port the kubetorch HTTP server listens on
22
+ protocol: TCP
23
+ - name: ray-gcs
24
+ port: 6379
25
+ targetPort: 6379
26
+ protocol: TCP
27
+ - name: ray-object-mgr
28
+ port: 8076
29
+ targetPort: 8076
30
+ protocol: TCP
31
+ - name: ray-node-mgr
32
+ port: 8077
33
+ targetPort: 8077
34
+ protocol: TCP
35
+ - name: ray-dashboard
36
+ port: 8265
37
+ targetPort: 8265
38
+ protocol: TCP
39
+ - name: ray-metrics
40
+ port: 8080
41
+ targetPort: 8080
42
+ protocol: TCP
@@ -0,0 +1,35 @@
1
+ apiVersion: ray.io/v1 # RayCluster with one head pod and one worker group, both rendered from the same pod_template
2
+ kind: RayCluster
3
+ metadata:
4
+ name: {{ name }}
5
+ namespace: {{ namespace }}
6
+ annotations: {{ annotations | tojson }}
7
+ labels: {{ labels | tojson }}
8
+ spec:
9
+ rayVersion: "2.8.0" # NOTE(review): hard-coded; confirm it matches the Ray version installed in the server image
10
+ enableInTreeAutoscaling: false # worker count is taken from worker_replicas below
11
+ headGroupSpec:
12
+ rayStartParams: # port numbers match those exposed by the head Service template
13
+ dashboard-host: "0.0.0.0"
14
+ port: "6379"
15
+ object-manager-port: "8076"
16
+ node-manager-port: "8077"
17
+ dashboard-port: "8265"
18
+ metrics-export-port: "8080"
19
+ replicas: 1
20
+ template:
21
+ metadata:
22
+ annotations: {{ template_annotations | tojson }}
23
+ labels: {{ head_template_labels | tojson }}
24
+ spec: {{ pod_template | tojson }} # pod spec injected as inline JSON
25
+ workerGroupSpecs:
26
+ - groupName: worker-group
27
+ rayStartParams: {}
28
+ minReplicas: 0
29
+ maxReplicas: {{ worker_replicas }}
30
+ replicas: {{ worker_replicas }}
31
+ template:
32
+ metadata:
33
+ annotations: {{ template_annotations | tojson }}
34
+ labels: {{ worker_template_labels | tojson }}
35
+ spec: {{ pod_template | tojson }} # same pod spec as the head group
@@ -0,0 +1,21 @@
1
+ apiVersion: v1 # Service routing port 80 to the kubetorch HTTP server port on the selected pods
2
+ kind: Service
3
+ metadata:
4
+ name: {{ name }}
5
+ namespace: {{ namespace }}
6
+ annotations: {{ annotations | tojson }}
7
+ labels: {{ labels | tojson }}
8
+ spec:
9
+ {% if distributed %}
10
+ clusterIP: None # Headless service for distributed pod discovery
11
+ {% else %}
12
+ sessionAffinity: ClientIP # Ensure requests from same client go to same pod
13
+ {% endif %}
14
+ selector:
15
+ kubetorch.com/service: {{ deployment_name }}
16
+ kubetorch.com/module: {{ module_name }} # Only deployment pods have this set, so allows us to exclude the jump pod
17
+ ports:
18
+ - name: http
19
+ port: 80
20
+ targetPort: {{ server_port }} # port the kubetorch HTTP server listens on
21
+ protocol: TCP
@@ -0,0 +1,36 @@
1
+ # --- Headless Service selecting all pods labelled app=<workerset_name_app> ---
2
+ apiVersion: v1
3
+ kind: Service
4
+ metadata:
5
+ name: {{ workerset_name }}
6
+ namespace: {{ namespace }}
7
+ annotations: {{ annotations | tojson }}
8
+ labels: {{ labels | tojson }}
9
+ spec:
10
+ clusterIP: None # Make it headless
11
+ selector:
12
+ app: {{ workerset_name_app }} # Selects pods from any associated StatefulSets
13
+
14
+ ---
15
+ # --- StatefulSet running the worker pods built from pod_template ---
16
+ apiVersion: apps/v1
17
+ kind: StatefulSet
18
+ metadata:
19
+ name: {{ worker_group_name }}
20
+ namespace: {{ namespace }}
21
+ ownerReferences: # owner is the service pod; presumably so the StatefulSet is garbage-collected with it (confirm)
22
+ - apiVersion: v1
23
+ kind: Pod
24
+ name: {{ service_pod_name }}
25
+ uid: {{ service_pod_uid }}
26
+ spec:
27
+ serviceName: {{ workerset_name }} # Must match the headless service name
28
+ replicas: {{ replicas }}
29
+ selector:
30
+ matchLabels:
31
+ app: {{ workerset_name_app }} # Matches the service selector
32
+ template:
33
+ metadata:
34
+ labels:
35
+ app: {{ workerset_name_app }} # Label for the service
36
+ spec: {{ pod_template | tojson }} # pod spec injected as inline JSON