kubetorch 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kubetorch might be problematic. Click here for more details.
- kubetorch/__init__.py +60 -0
- kubetorch/cli.py +1985 -0
- kubetorch/cli_utils.py +1025 -0
- kubetorch/config.py +453 -0
- kubetorch/constants.py +18 -0
- kubetorch/docs/Makefile +18 -0
- kubetorch/docs/__init__.py +0 -0
- kubetorch/docs/_ext/json_globaltoc.py +42 -0
- kubetorch/docs/api/cli.rst +10 -0
- kubetorch/docs/api/python/app.rst +21 -0
- kubetorch/docs/api/python/cls.rst +19 -0
- kubetorch/docs/api/python/compute.rst +25 -0
- kubetorch/docs/api/python/config.rst +11 -0
- kubetorch/docs/api/python/fn.rst +19 -0
- kubetorch/docs/api/python/image.rst +14 -0
- kubetorch/docs/api/python/secret.rst +18 -0
- kubetorch/docs/api/python/volumes.rst +13 -0
- kubetorch/docs/api/python.rst +101 -0
- kubetorch/docs/conf.py +69 -0
- kubetorch/docs/index.rst +20 -0
- kubetorch/docs/requirements.txt +5 -0
- kubetorch/globals.py +285 -0
- kubetorch/logger.py +59 -0
- kubetorch/resources/__init__.py +0 -0
- kubetorch/resources/callables/__init__.py +0 -0
- kubetorch/resources/callables/cls/__init__.py +0 -0
- kubetorch/resources/callables/cls/cls.py +157 -0
- kubetorch/resources/callables/fn/__init__.py +0 -0
- kubetorch/resources/callables/fn/fn.py +133 -0
- kubetorch/resources/callables/module.py +1416 -0
- kubetorch/resources/callables/utils.py +174 -0
- kubetorch/resources/compute/__init__.py +0 -0
- kubetorch/resources/compute/app.py +261 -0
- kubetorch/resources/compute/compute.py +2596 -0
- kubetorch/resources/compute/decorators.py +139 -0
- kubetorch/resources/compute/rbac.py +74 -0
- kubetorch/resources/compute/utils.py +1114 -0
- kubetorch/resources/compute/websocket.py +137 -0
- kubetorch/resources/images/__init__.py +1 -0
- kubetorch/resources/images/image.py +414 -0
- kubetorch/resources/images/images.py +74 -0
- kubetorch/resources/secrets/__init__.py +2 -0
- kubetorch/resources/secrets/kubernetes_secrets_client.py +412 -0
- kubetorch/resources/secrets/provider_secrets/__init__.py +0 -0
- kubetorch/resources/secrets/provider_secrets/anthropic_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/aws_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/azure_secret.py +14 -0
- kubetorch/resources/secrets/provider_secrets/cohere_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/gcp_secret.py +16 -0
- kubetorch/resources/secrets/provider_secrets/github_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/huggingface_secret.py +20 -0
- kubetorch/resources/secrets/provider_secrets/kubeconfig_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/lambda_secret.py +13 -0
- kubetorch/resources/secrets/provider_secrets/langchain_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/openai_secret.py +11 -0
- kubetorch/resources/secrets/provider_secrets/pinecone_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/providers.py +93 -0
- kubetorch/resources/secrets/provider_secrets/ssh_secret.py +12 -0
- kubetorch/resources/secrets/provider_secrets/wandb_secret.py +11 -0
- kubetorch/resources/secrets/secret.py +238 -0
- kubetorch/resources/secrets/secret_factory.py +70 -0
- kubetorch/resources/secrets/utils.py +209 -0
- kubetorch/resources/volumes/__init__.py +0 -0
- kubetorch/resources/volumes/volume.py +365 -0
- kubetorch/servers/__init__.py +0 -0
- kubetorch/servers/http/__init__.py +0 -0
- kubetorch/servers/http/distributed_utils.py +3223 -0
- kubetorch/servers/http/http_client.py +730 -0
- kubetorch/servers/http/http_server.py +1788 -0
- kubetorch/servers/http/server_metrics.py +278 -0
- kubetorch/servers/http/utils.py +728 -0
- kubetorch/serving/__init__.py +0 -0
- kubetorch/serving/autoscaling.py +173 -0
- kubetorch/serving/base_service_manager.py +363 -0
- kubetorch/serving/constants.py +83 -0
- kubetorch/serving/deployment_service_manager.py +478 -0
- kubetorch/serving/knative_service_manager.py +519 -0
- kubetorch/serving/raycluster_service_manager.py +582 -0
- kubetorch/serving/service_manager.py +18 -0
- kubetorch/serving/templates/deployment_template.yaml +17 -0
- kubetorch/serving/templates/knative_service_template.yaml +19 -0
- kubetorch/serving/templates/kt_setup_template.sh.j2 +81 -0
- kubetorch/serving/templates/pod_template.yaml +194 -0
- kubetorch/serving/templates/raycluster_service_template.yaml +42 -0
- kubetorch/serving/templates/raycluster_template.yaml +35 -0
- kubetorch/serving/templates/service_template.yaml +21 -0
- kubetorch/serving/templates/workerset_template.yaml +36 -0
- kubetorch/serving/utils.py +377 -0
- kubetorch/utils.py +284 -0
- kubetorch-0.2.0.dist-info/METADATA +121 -0
- kubetorch-0.2.0.dist-info/RECORD +93 -0
- kubetorch-0.2.0.dist-info/WHEEL +4 -0
- kubetorch-0.2.0.dist-info/entry_points.txt +5 -0
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
# Kubetorch pod startup script (Jinja2 template rendered into a bash script).
# Steps: raise the file descriptor limit, resolve the Python interpreter, and (unless
# `freeze` is set) install tooling and the kubetorch package, then start the HTTP server.

# Increase file descriptor limit for large-scale distributed jobs
ulimit -n 65536

# Resolve the Python interpreter into $python_bin.
# If the user set python_path to an exact executable, it is used directly (adding it to PATH
# then has little effect). If they set it to a directory, the `command -v` check is expected
# to fail, and the fallback below looks for python3 or python in PATH, starting with their
# directory.
{% if python_path %}
export PATH="{{ python_path }}:$PATH"
if command -v "{{ python_path }}" &> /dev/null; then
    python_bin="{{ python_path }}"
fi
{% endif %}
if [[ -z "$python_bin" ]]; then
    if command -v python3 &> /dev/null; then
        python_bin="python3"
    elif command -v python &> /dev/null; then
        python_bin="python"
    else
        # Report the fatal error on stderr so it is not mixed into regular output.
        echo "Error: Neither python3 nor python found in PATH. Please set python_path to a valid Python executable." >&2
        exit 1
    fi
fi
echo "Using Python binary: $python_bin"

{% if not freeze %}
# Install tools that are missing from the image.
if ! command -v rsync &> /dev/null; then
    apt-get update && apt-get install -y rsync
fi
if ! command -v nohup &> /dev/null; then
    apt-get update && apt-get install -y coreutils
fi

{% if install_cmd %}
# Use the explicitly provided install command
uv_pip_cmd="{{ install_cmd }}"
{% else %}

# Inside a virtual environment (sys.prefix differs from sys.base_prefix) no extra flags are
# passed; otherwise the install targets the system interpreter.
if "$python_bin" -c "import sys; exit(0 if sys.prefix != sys.base_prefix else 1)" 2>/dev/null; then
    install_flags=""
else
    install_flags="--system --break-system-packages"
fi

# Check if uv is available and set the appropriate command
if command -v uv &> /dev/null; then
    # Use system-wide uv with the detected Python interpreter
    uv_pip_cmd="uv pip install $install_flags --python=$python_bin"
elif "$python_bin" -m uv --version &> /dev/null; then
    # Use Python module uv - it inherently uses the right Python
    uv_pip_cmd="$python_bin -m uv pip install $install_flags"
else
    # Install uv as a Python module and use it
    echo "uv not found, installing it..."
    "$python_bin" -m pip install uv
    uv_pip_cmd="$python_bin -m uv pip install $install_flags"
fi
{% endif %}

# Export the install command as an environment variable for use in applications
echo "Setting KT_PIP_INSTALL_CMD env var to $uv_pip_cmd"
export KT_PIP_INSTALL_CMD="$uv_pip_cmd"
# Make sure the target directory exists; the redirect below fails if .kt/ is missing.
mkdir -p .kt
echo "$uv_pip_cmd" > .kt/kt_pip_install_cmd

# $uv_pip_cmd is intentionally left unquoted below: it holds a command plus its arguments
# and relies on word splitting.
{% if install_url %}
$uv_pip_cmd "kubetorch[server] @ {{ install_url }}"
{% if install_otel %}
$uv_pip_cmd "kubetorch[otel] @ {{ install_url }}"
{% endif %}
{% else %}
{{ rsync_kt_editable_cmd }}
$uv_pip_cmd -e "python_client[server]"
{% if install_otel %}
$uv_pip_cmd -e "python_client[otel]"
{% endif %}
{% endif %}

{% endif %}

# Start the kubetorch HTTP server on all interfaces.
"$python_bin" -m uvicorn kubetorch.servers.http.http_server:app \
    --host 0.0.0.0 \
    --port {{ server_port }}
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
{% if service_account_name is not none %}
|
|
2
|
+
serviceAccountName: {{ service_account_name }}
|
|
3
|
+
{% endif %}
|
|
4
|
+
|
|
5
|
+
{% if priority_class_name is not none %}
|
|
6
|
+
priorityClassName: {{ priority_class_name }}
|
|
7
|
+
{% endif %}
|
|
8
|
+
|
|
9
|
+
{% if queue_name is not none %}
|
|
10
|
+
schedulerName: {{ scheduler_name }}
|
|
11
|
+
{% endif %}
|
|
12
|
+
|
|
13
|
+
{% if gpu_anti_affinity is sameas true %}
|
|
14
|
+
affinity:
|
|
15
|
+
nodeAffinity:
|
|
16
|
+
requiredDuringSchedulingIgnoredDuringExecution:
|
|
17
|
+
nodeSelectorTerms:
|
|
18
|
+
- matchExpressions:
|
|
19
|
+
- key: nvidia.com/gpu
|
|
20
|
+
operator: DoesNotExist
|
|
21
|
+
- key: eks.amazonaws.com/instance-gpu-count
|
|
22
|
+
operator: DoesNotExist
|
|
23
|
+
- key: cloud.google.com/gke-accelerator
|
|
24
|
+
operator: DoesNotExist
|
|
25
|
+
{% endif %}
|
|
26
|
+
|
|
27
|
+
{% if node_selector is not none %}
|
|
28
|
+
nodeSelector:
|
|
29
|
+
{% for key, value in node_selector.items() %}
|
|
30
|
+
{{ key }}: {{ value }}
|
|
31
|
+
{% endfor %}
|
|
32
|
+
{% endif %}
|
|
33
|
+
|
|
34
|
+
{% if tolerations is not none and tolerations|length > 0 %}
|
|
35
|
+
tolerations:
|
|
36
|
+
{% for tol in tolerations %}
|
|
37
|
+
- key: "{{ tol.key }}"
|
|
38
|
+
operator: "{{ tol.operator }}"
|
|
39
|
+
value: "{{ tol.value }}"
|
|
40
|
+
effect: "{{ tol.effect }}"
|
|
41
|
+
{% endfor %}
|
|
42
|
+
{% endif %}
|
|
43
|
+
|
|
44
|
+
timeoutSeconds: {{ launch_timeout }}
|
|
45
|
+
containers:
|
|
46
|
+
- name: kubetorch
|
|
47
|
+
image: {{ server_image }}
|
|
48
|
+
{% if image_pull_policy is not none %}
|
|
49
|
+
imagePullPolicy: {{ image_pull_policy }}
|
|
50
|
+
{% endif %}
|
|
51
|
+
{% if working_dir is not none %}
|
|
52
|
+
workingDir: {{ working_dir }}
|
|
53
|
+
{% endif %}
|
|
54
|
+
ports:
|
|
55
|
+
- name: http1
|
|
56
|
+
containerPort: {{ server_port }}
|
|
57
|
+
command: ["/bin/bash", "-c"]
|
|
58
|
+
{% if not freeze %}
|
|
59
|
+
securityContext:
|
|
60
|
+
capabilities:
|
|
61
|
+
add:
|
|
62
|
+
- "SYS_PTRACE"
|
|
63
|
+
{% endif %}
|
|
64
|
+
args:
|
|
65
|
+
- |
|
|
66
|
+
{{ setup_script | indent(8, true) }}
|
|
67
|
+
|
|
68
|
+
env:
|
|
69
|
+
# Pod metadata available via the Kubernetes Downward API
|
|
70
|
+
- name: POD_NAME
|
|
71
|
+
valueFrom:
|
|
72
|
+
fieldRef:
|
|
73
|
+
fieldPath: metadata.name
|
|
74
|
+
- name: POD_NAMESPACE
|
|
75
|
+
valueFrom:
|
|
76
|
+
fieldRef:
|
|
77
|
+
fieldPath: metadata.namespace
|
|
78
|
+
- name: POD_IP
|
|
79
|
+
valueFrom:
|
|
80
|
+
fieldRef:
|
|
81
|
+
fieldPath: status.podIP
|
|
82
|
+
- name: POD_UUID
|
|
83
|
+
valueFrom:
|
|
84
|
+
fieldRef:
|
|
85
|
+
fieldPath: metadata.uid
|
|
86
|
+
- name: MODULE_NAME
|
|
87
|
+
valueFrom:
|
|
88
|
+
fieldRef:
|
|
89
|
+
fieldPath: metadata.labels['kubetorch.com/module']
|
|
90
|
+
- name: KUBETORCH_VERSION
|
|
91
|
+
valueFrom:
|
|
92
|
+
fieldRef:
|
|
93
|
+
fieldPath: metadata.labels['kubetorch.com/version']
|
|
94
|
+
- name: UV_LINK_MODE
|
|
95
|
+
value: "copy" # Suppress the hardlink warning
|
|
96
|
+
- name: OTEL_EXPORTER_OTLP_ENDPOINT
|
|
97
|
+
value: "kubetorch-otel-opentelemetry-collector.kubetorch-monitoring.svc.cluster.local:4317"
|
|
98
|
+
- name: OTEL_EXPORTER_OTLP_PROTOCOL
|
|
99
|
+
value: "grpc"
|
|
100
|
+
- name: OTEL_TRACES_EXPORTER
|
|
101
|
+
value: "otlp"
|
|
102
|
+
- name: OTEL_PROPAGATORS
|
|
103
|
+
value: "tracecontext,baggage"
|
|
104
|
+
- name: KT_OTEL_ENABLED
|
|
105
|
+
value: "{{ otel_enabled }}"
|
|
106
|
+
- name: KT_SERVER_PORT
|
|
107
|
+
value: "{{ server_port }}"
|
|
108
|
+
- name: KT_FREEZE
|
|
109
|
+
value: "{{ freeze }}"
|
|
110
|
+
{% if inactivity_ttl is not none %}
|
|
111
|
+
- name: KT_INACTIVITY_TTL
|
|
112
|
+
value: "{{ inactivity_ttl }}"
|
|
113
|
+
{% endif %}
|
|
114
|
+
{% for key, value in config_env_vars.items() %}
|
|
115
|
+
- name: {{ key }}
|
|
116
|
+
value: "{{ value }}"
|
|
117
|
+
{% endfor %}
|
|
118
|
+
{% if env_vars is not none and env_vars|length > 0 %}
|
|
119
|
+
{% for key, value in env_vars.items() %}
|
|
120
|
+
- name: {{ key }}
|
|
121
|
+
value: "{{ value }}"
|
|
122
|
+
{% endfor %}
|
|
123
|
+
{% endif %}
|
|
124
|
+
{% for secret in secret_env_vars %}
|
|
125
|
+
{% for key in secret.env_vars %}
|
|
126
|
+
- name: {{ key }}
|
|
127
|
+
valueFrom:
|
|
128
|
+
secretKeyRef:
|
|
129
|
+
name: {{ secret.secret_name }}
|
|
130
|
+
key: {{ key }}
|
|
131
|
+
{% endfor %}
|
|
132
|
+
{% endfor %}
|
|
133
|
+
volumeMounts:
|
|
134
|
+
- mountPath: /dev/shm
|
|
135
|
+
name: dshm
|
|
136
|
+
{% for secret in secret_volumes %}
|
|
137
|
+
- name: {{ secret.name }}
|
|
138
|
+
mountPath: {{ secret.path }}
|
|
139
|
+
readOnly: true
|
|
140
|
+
{% endfor %}
|
|
141
|
+
{% if volume_mounts is not none and volume_mounts|length > 0 %}
|
|
142
|
+
{% for mount in volume_mounts %}
|
|
143
|
+
- name: {{ mount.name }}
|
|
144
|
+
mountPath: {{ mount.mountPath }}
|
|
145
|
+
{% endfor %}
|
|
146
|
+
{% endif %}
|
|
147
|
+
resources:
|
|
148
|
+
{{ resources | tojson }}
|
|
149
|
+
# TODO: do we want these health checks?
|
|
150
|
+
# Note: Knative won't consider the service ready to receive traffic until the probe succeeds at least once
|
|
151
|
+
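# Startup probe: initial check, polled every 5s for up to roughly launch_timeout seconds (failureThreshold = launch_timeout // 5)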
|
|
152
|
+
startupProbe:
|
|
153
|
+
httpGet:
|
|
154
|
+
path: /health
|
|
155
|
+
port: {{ server_port }}
|
|
156
|
+
initialDelaySeconds: 0
|
|
157
|
+
periodSeconds: 5
|
|
158
|
+
timeoutSeconds: 2
|
|
159
|
+
failureThreshold: {{ launch_timeout // 5 }}
|
|
160
|
+
readinessProbe:
|
|
161
|
+
httpGet:
|
|
162
|
+
path: /health
|
|
163
|
+
port: {{ server_port }}
|
|
164
|
+
periodSeconds: 3
|
|
165
|
+
successThreshold: 1
|
|
166
|
+
failureThreshold: 5
|
|
167
|
+
# Ongoing health monitoring with less frequent checks
|
|
168
|
+
livenessProbe:
|
|
169
|
+
httpGet:
|
|
170
|
+
path: /health
|
|
171
|
+
port: {{ server_port }}
|
|
172
|
+
periodSeconds: 30
|
|
173
|
+
timeoutSeconds: 1
|
|
174
|
+
failureThreshold: 3
|
|
175
|
+
|
|
176
|
+
volumes:
|
|
177
|
+
- name: dshm
|
|
178
|
+
emptyDir:
|
|
179
|
+
medium: Memory
|
|
180
|
+
{% if shm_size_limit is not none %}
|
|
181
|
+
sizeLimit: {{ shm_size_limit }}
|
|
182
|
+
{% endif %}
|
|
183
|
+
{% for secret in secret_volumes %}
|
|
184
|
+
- name: {{ secret.name }}
|
|
185
|
+
secret:
|
|
186
|
+
secretName: {{ secret.secret_name }}
|
|
187
|
+
{% endfor %}
|
|
188
|
+
{% if volume_specs is not none and volume_specs|length > 0 %}
|
|
189
|
+
{% for spec in volume_specs %}
|
|
190
|
+
- name: {{ spec.name }}
|
|
191
|
+
persistentVolumeClaim:
|
|
192
|
+
claimName: {{ spec.persistentVolumeClaim.claimName }}
|
|
193
|
+
{% endfor %}
|
|
194
|
+
{% endif %}
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
apiVersion: v1
|
|
2
|
+
kind: Service
|
|
3
|
+
metadata:
|
|
4
|
+
name: {{ name }}
|
|
5
|
+
namespace: {{ namespace }}
|
|
6
|
+
annotations: {{ annotations | tojson }}
|
|
7
|
+
labels: {{ labels | tojson }}
|
|
8
|
+
spec:
|
|
9
|
+
{% if distributed %}
|
|
10
|
+
clusterIP: None # Headless service for Ray pod discovery
|
|
11
|
+
{% else %}
|
|
12
|
+
sessionAffinity: ClientIP # Ensure requests from same client go to same pod
|
|
13
|
+
{% endif %}
|
|
14
|
+
selector:
|
|
15
|
+
kubetorch.com/service: {{ deployment_name }}
|
|
16
|
+
kubetorch.com/module: {{ module_name }}
|
|
17
|
+
ray.io/node-type: head # Only select head node pods
|
|
18
|
+
ports:
|
|
19
|
+
- name: http
|
|
20
|
+
port: 80
|
|
21
|
+
targetPort: {{ server_port }}
|
|
22
|
+
protocol: TCP
|
|
23
|
+
- name: ray-gcs
|
|
24
|
+
port: 6379
|
|
25
|
+
targetPort: 6379
|
|
26
|
+
protocol: TCP
|
|
27
|
+
- name: ray-object-mgr
|
|
28
|
+
port: 8076
|
|
29
|
+
targetPort: 8076
|
|
30
|
+
protocol: TCP
|
|
31
|
+
- name: ray-node-mgr
|
|
32
|
+
port: 8077
|
|
33
|
+
targetPort: 8077
|
|
34
|
+
protocol: TCP
|
|
35
|
+
- name: ray-dashboard
|
|
36
|
+
port: 8265
|
|
37
|
+
targetPort: 8265
|
|
38
|
+
protocol: TCP
|
|
39
|
+
- name: ray-metrics
|
|
40
|
+
port: 8080
|
|
41
|
+
targetPort: 8080
|
|
42
|
+
protocol: TCP
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
apiVersion: ray.io/v1
|
|
2
|
+
kind: RayCluster
|
|
3
|
+
metadata:
|
|
4
|
+
name: {{ name }}
|
|
5
|
+
namespace: {{ namespace }}
|
|
6
|
+
annotations: {{ annotations | tojson }}
|
|
7
|
+
labels: {{ labels | tojson }}
|
|
8
|
+
spec:
|
|
9
|
+
rayVersion: "2.8.0"
|
|
10
|
+
enableInTreeAutoscaling: false
|
|
11
|
+
headGroupSpec:
|
|
12
|
+
rayStartParams:
|
|
13
|
+
dashboard-host: "0.0.0.0"
|
|
14
|
+
port: "6379"
|
|
15
|
+
object-manager-port: "8076"
|
|
16
|
+
node-manager-port: "8077"
|
|
17
|
+
dashboard-port: "8265"
|
|
18
|
+
metrics-export-port: "8080"
|
|
19
|
+
replicas: 1
|
|
20
|
+
template:
|
|
21
|
+
metadata:
|
|
22
|
+
annotations: {{ template_annotations | tojson }}
|
|
23
|
+
labels: {{ head_template_labels | tojson }}
|
|
24
|
+
spec: {{ pod_template | tojson }}
|
|
25
|
+
workerGroupSpecs:
|
|
26
|
+
- groupName: worker-group
|
|
27
|
+
rayStartParams: {}
|
|
28
|
+
minReplicas: 0
|
|
29
|
+
maxReplicas: {{ worker_replicas }}
|
|
30
|
+
replicas: {{ worker_replicas }}
|
|
31
|
+
template:
|
|
32
|
+
metadata:
|
|
33
|
+
annotations: {{ template_annotations | tojson }}
|
|
34
|
+
labels: {{ worker_template_labels | tojson }}
|
|
35
|
+
spec: {{ pod_template | tojson }}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
apiVersion: v1
|
|
2
|
+
kind: Service
|
|
3
|
+
metadata:
|
|
4
|
+
name: {{ name }}
|
|
5
|
+
namespace: {{ namespace }}
|
|
6
|
+
annotations: {{ annotations | tojson }}
|
|
7
|
+
labels: {{ labels | tojson }}
|
|
8
|
+
spec:
|
|
9
|
+
{% if distributed %}
|
|
10
|
+
clusterIP: None # Headless service for distributed pod discovery
|
|
11
|
+
{% else %}
|
|
12
|
+
sessionAffinity: ClientIP # Ensure requests from same client go to same pod
|
|
13
|
+
{% endif %}
|
|
14
|
+
selector:
|
|
15
|
+
kubetorch.com/service: {{ deployment_name }}
|
|
16
|
+
kubetorch.com/module: {{ module_name }} # Only deployment pods have this label set, which lets us exclude the jump pod
|
|
17
|
+
ports:
|
|
18
|
+
- name: http
|
|
19
|
+
port: 80
|
|
20
|
+
targetPort: {{ server_port }}
|
|
21
|
+
protocol: TCP
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# --- Headless Service selecting all pods labeled 'app: <workerset_name_app>' ---
|
|
2
|
+
apiVersion: v1
|
|
3
|
+
kind: Service
|
|
4
|
+
metadata:
|
|
5
|
+
name: {{ workerset_name }}
|
|
6
|
+
namespace: {{ namespace }}
|
|
7
|
+
annotations: {{ annotations | tojson }}
|
|
8
|
+
labels: {{ labels | tojson }}
|
|
9
|
+
spec:
|
|
10
|
+
clusterIP: None # Make it headless
|
|
11
|
+
selector:
|
|
12
|
+
app: {{ workerset_name_app }} # Selects pods from any associated StatefulSets
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
# --- StatefulSet ---
|
|
16
|
+
apiVersion: apps/v1
|
|
17
|
+
kind: StatefulSet
|
|
18
|
+
metadata:
|
|
19
|
+
name: {{ worker_group_name }}
|
|
20
|
+
namespace: {{ namespace }}
|
|
21
|
+
ownerReferences:
|
|
22
|
+
- apiVersion: v1
|
|
23
|
+
kind: Pod
|
|
24
|
+
name: {{ service_pod_name }}
|
|
25
|
+
uid: {{ service_pod_uid }}
|
|
26
|
+
spec:
|
|
27
|
+
serviceName: {{ workerset_name }} # Must match the headless service name
|
|
28
|
+
replicas: {{ replicas }}
|
|
29
|
+
selector:
|
|
30
|
+
matchLabels:
|
|
31
|
+
app: {{ workerset_name_app }} # Matches the service selector
|
|
32
|
+
template:
|
|
33
|
+
metadata:
|
|
34
|
+
labels:
|
|
35
|
+
app: {{ workerset_name_app }} # Label for the service
|
|
36
|
+
spec: {{ pod_template | tojson }}
|