@nexusgpu/repterm-plugin-kubectl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101) hide show
  1. package/README.md +277 -0
  2. package/dist/index.d.ts +314 -0
  3. package/dist/index.d.ts.map +1 -0
  4. package/dist/index.js +544 -0
  5. package/dist/matchers.d.ts +113 -0
  6. package/dist/matchers.d.ts.map +1 -0
  7. package/dist/matchers.js +527 -0
  8. package/dist/plugin-kubectl/examples/00-simple-demo.d.ts +10 -0
  9. package/dist/plugin-kubectl/examples/00-simple-demo.d.ts.map +1 -0
  10. package/dist/plugin-kubectl/examples/00-simple-demo.js +51 -0
  11. package/dist/plugin-kubectl/examples/01-basic-kubectl.d.ts +13 -0
  12. package/dist/plugin-kubectl/examples/01-basic-kubectl.d.ts.map +1 -0
  13. package/dist/plugin-kubectl/examples/01-basic-kubectl.js +86 -0
  14. package/dist/plugin-kubectl/examples/02-debugging.d.ts +13 -0
  15. package/dist/plugin-kubectl/examples/02-debugging.d.ts.map +1 -0
  16. package/dist/plugin-kubectl/examples/02-debugging.js +80 -0
  17. package/dist/plugin-kubectl/examples/03-resource-management.d.ts +13 -0
  18. package/dist/plugin-kubectl/examples/03-resource-management.d.ts.map +1 -0
  19. package/dist/plugin-kubectl/examples/03-resource-management.js +134 -0
  20. package/dist/plugin-kubectl/examples/04-rollout.d.ts +13 -0
  21. package/dist/plugin-kubectl/examples/04-rollout.d.ts.map +1 -0
  22. package/dist/plugin-kubectl/examples/04-rollout.js +122 -0
  23. package/dist/plugin-kubectl/examples/05-matchers.d.ts +15 -0
  24. package/dist/plugin-kubectl/examples/05-matchers.d.ts.map +1 -0
  25. package/dist/plugin-kubectl/examples/05-matchers.js +138 -0
  26. package/dist/plugin-kubectl/examples/06-advanced.d.ts +14 -0
  27. package/dist/plugin-kubectl/examples/06-advanced.d.ts.map +1 -0
  28. package/dist/plugin-kubectl/examples/06-advanced.js +140 -0
  29. package/dist/plugin-kubectl/examples/tensor-fusion/00-prerequisites.d.ts +14 -0
  30. package/dist/plugin-kubectl/examples/tensor-fusion/00-prerequisites.d.ts.map +1 -0
  31. package/dist/plugin-kubectl/examples/tensor-fusion/00-prerequisites.js +66 -0
  32. package/dist/plugin-kubectl/examples/tensor-fusion/01-workload-allocation.d.ts +14 -0
  33. package/dist/plugin-kubectl/examples/tensor-fusion/01-workload-allocation.d.ts.map +1 -0
  34. package/dist/plugin-kubectl/examples/tensor-fusion/01-workload-allocation.js +145 -0
  35. package/dist/plugin-kubectl/examples/tensor-fusion/02-annotation-mode.d.ts +13 -0
  36. package/dist/plugin-kubectl/examples/tensor-fusion/02-annotation-mode.d.ts.map +1 -0
  37. package/dist/plugin-kubectl/examples/tensor-fusion/02-annotation-mode.js +123 -0
  38. package/dist/plugin-kubectl/examples/tensor-fusion/03-insufficient.d.ts +17 -0
  39. package/dist/plugin-kubectl/examples/tensor-fusion/03-insufficient.d.ts.map +1 -0
  40. package/dist/plugin-kubectl/examples/tensor-fusion/03-insufficient.js +96 -0
  41. package/dist/plugin-kubectl/examples/tensor-fusion/04-release.d.ts +13 -0
  42. package/dist/plugin-kubectl/examples/tensor-fusion/04-release.d.ts.map +1 -0
  43. package/dist/plugin-kubectl/examples/tensor-fusion/04-release.js +117 -0
  44. package/dist/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.d.ts +14 -0
  45. package/dist/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.d.ts.map +1 -0
  46. package/dist/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.js +145 -0
  47. package/dist/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.d.ts +14 -0
  48. package/dist/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.d.ts.map +1 -0
  49. package/dist/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.js +235 -0
  50. package/dist/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.d.ts +15 -0
  51. package/dist/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.d.ts.map +1 -0
  52. package/dist/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.js +146 -0
  53. package/dist/plugin-kubectl/examples/tensor-fusion/08-workload-replicas-scale.d.ts +13 -0
  54. package/dist/plugin-kubectl/examples/tensor-fusion/08-workload-replicas-scale.d.ts.map +1 -0
  55. package/dist/plugin-kubectl/examples/tensor-fusion/08-workload-replicas-scale.js +141 -0
  56. package/dist/plugin-kubectl/examples/tensor-fusion/09-gpu-remote-invocation.d.ts +15 -0
  57. package/dist/plugin-kubectl/examples/tensor-fusion/09-gpu-remote-invocation.d.ts.map +1 -0
  58. package/dist/plugin-kubectl/examples/tensor-fusion/09-gpu-remote-invocation.js +256 -0
  59. package/dist/plugin-kubectl/examples/tensor-fusion/_config.d.ts +71 -0
  60. package/dist/plugin-kubectl/examples/tensor-fusion/_config.d.ts.map +1 -0
  61. package/dist/plugin-kubectl/examples/tensor-fusion/_config.js +159 -0
  62. package/dist/plugin-kubectl/src/index.d.ts +314 -0
  63. package/dist/plugin-kubectl/src/index.d.ts.map +1 -0
  64. package/dist/plugin-kubectl/src/index.js +545 -0
  65. package/dist/plugin-kubectl/src/matchers.d.ts +113 -0
  66. package/dist/plugin-kubectl/src/matchers.d.ts.map +1 -0
  67. package/dist/plugin-kubectl/src/matchers.js +527 -0
  68. package/dist/plugin-kubectl/src/result.d.ts +80 -0
  69. package/dist/plugin-kubectl/src/result.d.ts.map +1 -0
  70. package/dist/plugin-kubectl/src/result.js +134 -0
  71. package/dist/repterm/src/api/describe.d.ts +18 -0
  72. package/dist/repterm/src/api/describe.d.ts.map +1 -0
  73. package/dist/repterm/src/api/describe.js +32 -0
  74. package/dist/repterm/src/api/expect.d.ts +43 -0
  75. package/dist/repterm/src/api/expect.d.ts.map +1 -0
  76. package/dist/repterm/src/api/expect.js +166 -0
  77. package/dist/repterm/src/api/hooks.d.ts +178 -0
  78. package/dist/repterm/src/api/hooks.d.ts.map +1 -0
  79. package/dist/repterm/src/api/hooks.js +230 -0
  80. package/dist/repterm/src/api/steps.d.ts +45 -0
  81. package/dist/repterm/src/api/steps.d.ts.map +1 -0
  82. package/dist/repterm/src/api/steps.js +105 -0
  83. package/dist/repterm/src/api/test.d.ts +101 -0
  84. package/dist/repterm/src/api/test.d.ts.map +1 -0
  85. package/dist/repterm/src/api/test.js +206 -0
  86. package/dist/repterm/src/index.d.ts +15 -0
  87. package/dist/repterm/src/index.d.ts.map +1 -0
  88. package/dist/repterm/src/index.js +23 -0
  89. package/dist/repterm/src/plugin/index.d.ts +47 -0
  90. package/dist/repterm/src/plugin/index.d.ts.map +1 -0
  91. package/dist/repterm/src/plugin/index.js +85 -0
  92. package/dist/repterm/src/plugin/withPlugins.d.ts +71 -0
  93. package/dist/repterm/src/plugin/withPlugins.d.ts.map +1 -0
  94. package/dist/repterm/src/plugin/withPlugins.js +100 -0
  95. package/dist/repterm/src/runner/models.d.ts +261 -0
  96. package/dist/repterm/src/runner/models.d.ts.map +1 -0
  97. package/dist/repterm/src/runner/models.js +4 -0
  98. package/dist/result.d.ts +80 -0
  99. package/dist/result.d.ts.map +1 -0
  100. package/dist/result.js +134 -0
  101. package/package.json +38 -0
@@ -0,0 +1,146 @@
1
+ /**
2
+ * 测试场景 7: Workload 创建和 Worker Pod 生成
3
+ *
4
+ * 基于 `Workload创建和WorkerPod生成测试.md`:
5
+ * - 创建 `replicas=2` 的 TensorFusionWorkload
6
+ * - 等待 `status.workerCount` 变为 2
7
+ * - 等待 worker Pod Ready
8
+ * - 验证 worker Pod 数量与 `spec.replicas` 一致
9
+ * - 验证 `status.workerCount` 正确更新
10
+ *
11
+ * 运行方式:
12
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.ts
13
+ */
14
+ import { sleep } from 'bun';
15
+ import { test, describe, expect, step, tensorfusionworkload, TEST_NAMESPACE, TEST_GPU_POOL, } from './_config.js';
16
+ const WORKLOAD_NAME = 'tf-workload-worker-count-test';
17
+ const EXPECTED_REPLICAS = 2;
18
+ const WAIT_TIMEOUT = 180000;
19
+ const POLL_INTERVAL = 1500;
20
/**
 * Render the TensorFusionWorkload manifest used by this scenario.
 *
 * The replica count is fixed to the module-level EXPECTED_REPLICAS and the
 * namespace to TEST_NAMESPACE; only the name and target pool vary per call.
 *
 * @param workloadName - metadata.name and the `app` label value
 * @param pool - GPUPool name written into spec.poolName
 * @returns YAML string suitable for `kubectl apply`
 */
function workloadWorkerCountYaml(workloadName, pool) {
    // NOTE(review): the embedded YAML's leading indentation was lost in the
    // published diff; reconstructed here with conventional two-space nesting.
    return `
apiVersion: tensor-fusion.ai/v1
kind: TensorFusionWorkload
metadata:
  name: ${workloadName}
  namespace: ${TEST_NAMESPACE}
  labels:
    app: ${workloadName}
    test-type: worker-count
spec:
  replicas: ${EXPECTED_REPLICAS}
  gpuCount: 1
  poolName: ${pool}
  qos: medium
  isolation: soft
  isLocalGPU: false
  resources:
    requests:
      tflops: "100m"
      vram: "1Gi"
    limits:
      tflops: "100m"
      vram: "1Gi"
`;
}
46
+ async function listWorkerPods(kubectl, workloadName) {
47
+ const strictSelector = `tensor-fusion.ai/workload=${workloadName},tensor-fusion.ai/component=worker`;
48
+ const fallbackSelector = `tensor-fusion.ai/workload=${workloadName}`;
49
+ const workerPods = await kubectl.get('pod', undefined, {
50
+ selector: strictSelector,
51
+ jqFilter: '[.items[] | {name: .metadata.name, phase: .status.phase}]',
52
+ });
53
+ if (Array.isArray(workerPods) && workerPods.length > 0) {
54
+ return workerPods;
55
+ }
56
+ const fallbackPods = await kubectl.get('pod', undefined, {
57
+ selector: fallbackSelector,
58
+ jqFilter: '[.items[] | {name: .metadata.name, phase: .status.phase}]',
59
+ });
60
+ return fallbackPods ?? [];
61
+ }
62
/**
 * Poll until at least `expectedCount` worker pods exist for a workload.
 *
 * Fix: the original `while (Date.now() < deadline)` pre-check meant that a
 * zero or negative `timeoutMs` (or a deadline crossed between iterations)
 * could throw without ever querying the cluster. The do/while form below
 * guarantees at least one poll, so callers with an exhausted budget still
 * get one final snapshot before the timeout error.
 *
 * @param kubectl - kubectl plugin methods (forwarded to listWorkerPods)
 * @param workloadName - workload label value to filter pods by
 * @param expectedCount - minimum number of pods to wait for
 * @param timeoutMs - overall polling budget in milliseconds
 * @returns the pod list once it reaches the expected size
 * @throws Error when the deadline passes without enough pods appearing
 */
async function waitForWorkerPods(kubectl, workloadName, expectedCount, timeoutMs) {
    const deadline = Date.now() + timeoutMs;
    do {
        const workerPods = await listWorkerPods(kubectl, workloadName);
        if (workerPods.length >= expectedCount) {
            return workerPods;
        }
        await sleep(POLL_INTERVAL);
    } while (Date.now() < deadline);
    throw new Error(`Timeout waiting worker pods for ${workloadName} reach ${expectedCount}`);
}
73
// Scenario 7: create a replicas=2 TensorFusionWorkload, then verify the
// controller spawns matching worker pods and reports status.workerCount.
describe('测试场景 7: Workload 创建和 Worker Pod 生成', { record: true }, () => {
    test('创建 replicas=2 的 Workload 后生成对应 worker Pods', async (ctx) => {
        const { kubectl } = ctx.plugins;
        try {
            // Apply the manifest and confirm spec.replicas landed as requested.
            await step('创建 TensorFusionWorkload(replicas=2)', {
                showStepTitle: false,
                typingSpeed: 90,
                pauseAfter: 1800,
            }, async () => {
                const manifest = workloadWorkerCountYaml(WORKLOAD_NAME, TEST_GPU_POOL);
                const applyResult = await kubectl.apply(manifest);
                await expect(applyResult).toBeSuccessful();
                const declaredReplicas = await kubectl.getJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.spec.replicas');
                expect(declaredReplicas).toBe(EXPECTED_REPLICAS);
            });
            // Block until the controller reports the desired worker count.
            await step('等待 Workload status.workerCount=2', {
                showStepTitle: false,
                pauseAfter: 1800,
            }, async () => {
                await kubectl.waitForJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.status.workerCount', String(EXPECTED_REPLICAS), WAIT_TIMEOUT);
                const workloadHandle = tensorfusionworkload(kubectl, WORKLOAD_NAME);
                await expect(workloadHandle).toHaveStatusField('workerCount', EXPECTED_REPLICAS);
            });
            // Wait for the pods themselves, then for each to become Ready.
            await step('等待 worker Pods 创建并 Ready', {
                showStepTitle: false,
                typingSpeed: 80,
                pauseAfter: 2000,
            }, async () => {
                const workerPods = await waitForWorkerPods(kubectl, WORKLOAD_NAME, EXPECTED_REPLICAS, WAIT_TIMEOUT);
                for (const { name } of workerPods.slice(0, EXPECTED_REPLICAS)) {
                    const readiness = await kubectl.wait('pod', name, 'Ready', {
                        timeout: WAIT_TIMEOUT,
                    });
                    await expect(readiness).toBeSuccessful();
                }
            });
            // Pod count observed via the selector must equal spec.replicas.
            await step('验证 worker Pod 数量与 replicas 一致', {
                typingSpeed: 80,
                pauseAfter: 1800,
            }, async () => {
                const declaredReplicas = (await kubectl.getJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.spec.replicas')) ?? 0;
                const observedPods = await listWorkerPods(kubectl, WORKLOAD_NAME);
                expect(observedPods.length).toBe(declaredReplicas);
            });
            // status.workerCount must track the requested replica count.
            await step('验证 status.workerCount 与 replicas 一致', {
                showStepTitle: false,
                pauseAfter: 1500,
            }, async () => {
                const reportedCount = (await kubectl.getJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.status.workerCount')) ?? 0;
                expect(reportedCount).toBe(EXPECTED_REPLICAS);
            });
        }
        finally {
            // Best-effort teardown: runs whether or not the assertions passed.
            await step('清理 TensorFusionWorkload', {
                showStepTitle: false,
                typingSpeed: 70,
                pauseAfter: 1200,
            }, async () => {
                try {
                    if (!(await kubectl.exists('tensorfusionworkload', WORKLOAD_NAME))) {
                        return;
                    }
                    const deletion = await kubectl.delete('tensorfusionworkload', WORKLOAD_NAME);
                    await expect(deletion).toBeSuccessful();
                }
                catch {
                    // Cleanup failures must never mask the primary assertion error.
                }
            });
        }
    });
});
@@ -0,0 +1,13 @@
1
+ /**
2
+ * 测试场景 8: Workload Replicas 扩缩容
3
+ *
4
+ * 基于 `WorkloadReplicas扩缩容测.md`:
5
+ * - 先创建一个 replicas=1 的 TensorFusionWorkload 并等待就绪
6
+ * - 将 replicas 从 1 扩容到 2,验证 worker pods 增加、status.workerCount=2
7
+ * - 将 replicas 从 2 缩容回 1,验证 worker pods 减少、status.workerCount=1
8
+ *
9
+ * 运行方式:
10
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/08-workload-replicas-scale.ts
11
+ */
12
+ export {};
13
+ //# sourceMappingURL=08-workload-replicas-scale.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"08-workload-replicas-scale.d.ts","sourceRoot":"","sources":["../../../../examples/tensor-fusion/08-workload-replicas-scale.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG"}
@@ -0,0 +1,141 @@
1
+ /**
2
+ * 测试场景 8: Workload Replicas 扩缩容
3
+ *
4
+ * 基于 `WorkloadReplicas扩缩容测.md`:
5
+ * - 先创建一个 replicas=1 的 TensorFusionWorkload 并等待就绪
6
+ * - 将 replicas 从 1 扩容到 2,验证 worker pods 增加、status.workerCount=2
7
+ * - 将 replicas 从 2 缩容回 1,验证 worker pods 减少、status.workerCount=1
8
+ *
9
+ * 运行方式:
10
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/08-workload-replicas-scale.ts
11
+ */
12
+ import { sleep } from 'bun';
13
+ import { test, describe, expect, step, tensorfusionworkload, workloadYaml, DEFAULT_TIMEOUT, } from './_config.js';
14
+ const WORKLOAD_NAME = 'test-workload-replicas';
15
+ /**
16
+ * 获取指定 workload 的 worker pod 数量
17
+ */
18
+ async function getWorkerPodCount(kubectl, workloadName) {
19
+ const pods = await kubectl.get('pod', undefined, {
20
+ selector: `tensor-fusion.ai/workload=${workloadName},tensor-fusion.ai/component=worker`,
21
+ jqFilter: '[.items[] | {name: .metadata.name, phase: .status.phase}]',
22
+ });
23
+ return pods?.length ?? 0;
24
+ }
25
// Scenario 8: scale a TensorFusionWorkload from 1 -> 2 -> 1 replicas and
// verify worker pods and status.workerCount stay in sync at each stage.
describe('测试场景 8: Workload Replicas 扩缩容', { record: true }, () => {
    test('TensorFusionWorkload 扩缩容后 worker pods 与 workerCount 同步更新', async (ctx) => {
        const { kubectl } = ctx.plugins;
        /**
         * Poll `.status` until workerCount equals `expected` or the
         * DEFAULT_TIMEOUT budget runs out, returning the last status seen.
         *
         * Fix: the scale-up and scale-down steps previously duplicated this
         * loop verbatim and, on success, re-fetched `.status` a second time
         * before asserting; the helper removes both the duplication and the
         * redundant re-read.
         */
        const waitForWorkerCount = async (expected) => {
            const deadline = Date.now() + DEFAULT_TIMEOUT;
            while (Date.now() < deadline) {
                const status = await kubectl.getJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.status');
                if (status?.workerCount === expected) {
                    return status;
                }
                await sleep(3000);
            }
            // Timed out — return one final snapshot for the caller to assert on.
            return kubectl.getJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.status');
        };
        // ===== Step 1: create a replicas=1 workload and wait for readiness =====
        await step('创建 Workload(replicas=1)', {
            showStepTitle: false,
            typingSpeed: 100,
            pauseAfter: 2000,
        }, async () => {
            const yaml = workloadYaml(WORKLOAD_NAME, {
                tflopsRequest: '1000m',
                tflopsLimit: '2000m',
                vramRequest: '1Gi',
                vramLimit: '2Gi',
                replicas: 1,
            });
            const result = await kubectl.apply(yaml);
            await expect(result).toBeSuccessful();
        });
        await step('等待 Workload Ready', {
            showStepTitle: false,
            pauseAfter: 1800,
        }, async () => {
            const waitResult = await kubectl.wait('tensorfusionworkload', WORKLOAD_NAME, 'Ready', { timeout: DEFAULT_TIMEOUT });
            await expect(waitResult).toBeSuccessful();
            const workload = tensorfusionworkload(kubectl, WORKLOAD_NAME);
            await expect(workload).toHaveStatusField('phase', 'Running');
        });
        // ===== Step 2: baseline — one worker pod and workerCount=1 =====
        await step('记录当前 worker pod 数量', {
            typingSpeed: 80,
            pauseAfter: 1500,
        }, async () => {
            const beforeCount = await getWorkerPodCount(kubectl, WORKLOAD_NAME);
            expect(beforeCount).toBe(1);
            const status = await kubectl.getJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.status');
            expect(status?.workerCount).toBe(1);
        });
        // ===== Step 3: scale up to replicas=2 =====
        await step('patch Workload replicas=2', {
            showStepTitle: false,
            typingSpeed: 100,
            pauseAfter: 1800,
        }, async () => {
            const patchResult = await kubectl.patch('tensorfusionworkload', WORKLOAD_NAME, { spec: { replicas: 2 } }, 'merge');
            await expect(patchResult).toBeSuccessful();
            // Confirm spec.replicas was actually updated by the merge patch.
            const replicas = await kubectl.getJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.spec.replicas');
            expect(replicas).toBe(2);
        });
        // ===== Step 4: verify the scale-up converged =====
        await step('等待 workerCount=2', {
            showStepTitle: false,
            pauseAfter: 2000,
        }, async () => {
            const status = await waitForWorkerCount(2);
            expect(status?.workerCount).toBe(2);
        });
        await step('验证扩容后 worker pod 数量为 2', {
            typingSpeed: 80,
            pauseAfter: 2500,
        }, async () => {
            const afterScaleUpCount = await getWorkerPodCount(kubectl, WORKLOAD_NAME);
            expect(afterScaleUpCount).toBe(2);
        });
        // ===== Step 5: scale back down to replicas=1 =====
        await step('patch Workload replicas=1', {
            showStepTitle: false,
            typingSpeed: 100,
            pauseAfter: 1800,
        }, async () => {
            const patchResult = await kubectl.patch('tensorfusionworkload', WORKLOAD_NAME, { spec: { replicas: 1 } }, 'merge');
            await expect(patchResult).toBeSuccessful();
        });
        await step('等待 workerCount=1', {
            showStepTitle: false,
            pauseAfter: 2000,
        }, async () => {
            const status = await waitForWorkerCount(1);
            expect(status?.workerCount).toBe(1);
        });
        await step('验证缩容后 worker pod 数量为 1', {
            typingSpeed: 80,
            pauseAfter: 2500,
        }, async () => {
            const afterScaleDownCount = await getWorkerPodCount(kubectl, WORKLOAD_NAME);
            expect(afterScaleDownCount).toBe(1);
        });
        // ===== Step 6: teardown =====
        await step('删除 TensorFusionWorkload', {
            showStepTitle: false,
            typingSpeed: 80,
            pauseAfter: 2000,
        }, async () => {
            const deleteResult = await kubectl.delete('tensorfusionworkload', WORKLOAD_NAME);
            await expect(deleteResult).toBeSuccessful();
            await sleep(5000);
        });
    });
});
@@ -0,0 +1,15 @@
1
+ /**
2
+ * 测试场景 9: GPU 远程调用
3
+ *
4
+ * 基于 `GPU远程调用测试.md`:
5
+ * - 创建一个远程模式(isLocalGPU=false)的 TensorFusionWorkload
6
+ * - 创建一个带 remote annotation 的 client pod
7
+ * - 验证 TensorFusionConnection 自动创建
8
+ * - 验证 connection 的 metadata(namespace、labels、ownerReferences)与 spec(workloadName、clientPod)
9
+ * - 在 client pod 内执行 nvidia-smi 和 PyTorch 验证,确认可通过远程方式使用 GPU
10
+ *
11
+ * 运行方式:
12
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/09-gpu-remote-invocation.ts
13
+ */
14
+ export {};
15
+ //# sourceMappingURL=09-gpu-remote-invocation.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"09-gpu-remote-invocation.d.ts","sourceRoot":"","sources":["../../../../examples/tensor-fusion/09-gpu-remote-invocation.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG"}
@@ -0,0 +1,256 @@
1
+ /**
2
+ * 测试场景 9: GPU 远程调用
3
+ *
4
+ * 基于 `GPU远程调用测试.md`:
5
+ * - 创建一个远程模式(isLocalGPU=false)的 TensorFusionWorkload
6
+ * - 创建一个带 remote annotation 的 client pod
7
+ * - 验证 TensorFusionConnection 自动创建
8
+ * - 验证 connection 的 metadata(namespace、labels、ownerReferences)与 spec(workloadName、clientPod)
9
+ * - 在 client pod 内执行 nvidia-smi 和 PyTorch 验证,确认可通过远程方式使用 GPU
10
+ *
11
+ * 运行方式:
12
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/09-gpu-remote-invocation.ts
13
+ */
14
+ import { sleep } from 'bun';
15
+ import { test, describe, expect, step, tensorfusionworkload, DEFAULT_TIMEOUT, TEST_GPU_POOL, TEST_NAMESPACE, } from './_config.js';
16
+ const WORKLOAD_NAME = 'test-remote-workload';
17
+ const CLIENT_POD_NAME = 'test-remote-client';
18
/**
 * Render the remote-mode (isLocalGPU=false) TensorFusionWorkload manifest.
 *
 * Pool, namespace and resource sizing come from module-level constants;
 * only the workload name varies per call. Auto-scaling is explicitly
 * disabled so the test observes a fixed single replica.
 *
 * @param name - metadata.name and the `app` label value
 * @returns YAML string suitable for `kubectl apply`
 */
function remoteWorkloadYaml(name) {
    // NOTE(review): the embedded YAML's leading indentation was lost in the
    // published diff; reconstructed here with conventional two-space nesting.
    return `
apiVersion: tensor-fusion.ai/v1
kind: TensorFusionWorkload
metadata:
  name: ${name}
  namespace: ${TEST_NAMESPACE}
  labels:
    app: ${name}
    test-type: gpu-remote
spec:
  replicas: 1
  gpuCount: 1
  poolName: ${TEST_GPU_POOL}
  qos: medium
  isolation: soft
  resources:
    requests:
      tflops: 100m
      vram: "1Gi"
    limits:
      tflops: 100m
      vram: "1Gi"
  isLocalGPU: false
  autoScalingConfig:
    autoSetResources:
      enable: false
`;
}
50
/**
 * Render the remote-mode client pod manifest.
 *
 * The tensor-fusion.ai annotations request remote GPU access for the `app`
 * container; the PyTorch image plus a long sleep keeps the pod alive so the
 * test can `kubectl exec` GPU checks into it.
 *
 * @param podName - metadata.name of the client pod
 * @param poolName - GPUPool written into the gpupool annotation
 * @returns YAML string suitable for `kubectl apply`
 */
function clientPodYaml(podName, poolName) {
    // NOTE(review): the embedded YAML's leading indentation was lost in the
    // published diff; reconstructed here with conventional two-space nesting.
    return `
apiVersion: v1
kind: Pod
metadata:
  name: ${podName}
  namespace: ${TEST_NAMESPACE}
  labels:
    tensor-fusion.ai/enabled: "true"
  annotations:
    tensor-fusion.ai/is-local-gpu: "false"
    tensor-fusion.ai/gpupool: "${poolName}"
    tensor-fusion.ai/tflops-request: "100m"
    tensor-fusion.ai/vram-request: "1Gi"
    tensor-fusion.ai/tflops-limit: "100m"
    tensor-fusion.ai/vram-limit: "1Gi"
    tensor-fusion.ai/inject-container: "app"
spec:
  restartPolicy: Never
  containers:
    - name: app
      image: pytorch/pytorch:2.4.1-cuda12.1-cudnn9-runtime
      command: ["sh", "-c", "sleep 3600"]
`;
}
78
+ /**
79
+ * 从 client pod 的 env 中读取 connection 信息
80
+ */
81
+ async function getConnectionInfoFromPod(kubectl, podName) {
82
+ const envData = await kubectl.get('pod', podName, {
83
+ jqFilter: '[.spec.containers[0].env[] | select(.name == "TENSOR_FUSION_CONNECTION_NAME" or .name == "TENSOR_FUSION_CONNECTION_NAMESPACE") | {name: .name, value: .value}]',
84
+ });
85
+ const envMap = new Map((envData ?? []).map((e) => [e.name, e.value]));
86
+ return {
87
+ connName: envMap.get('TENSOR_FUSION_CONNECTION_NAME') ?? '',
88
+ connNamespace: envMap.get('TENSOR_FUSION_CONNECTION_NAMESPACE') ?? '',
89
+ };
90
+ }
91
// Scenario 9: remote GPU invocation — a remote-mode workload plus an
// annotated client pod should yield an auto-created TensorFusionConnection,
// and the client pod should see a usable CUDA device via nvidia-smi/PyTorch.
describe('测试场景 9: GPU 远程调用', { record: true }, () => {
    test('远程模式下 client pod 自动创建 TensorFusionConnection 并验证关联', async (ctx) => {
        const { kubectl } = ctx.plugins;
        // Connection coordinates discovered later from the pod's injected env.
        let connName;
        let connNamespace;
        // --- Provision the remote-mode workload ---
        await step('创建远程模式 Workload(isLocalGPU=false)', {
            showStepTitle: false,
            typingSpeed: 100,
            pauseAfter: 2000,
        }, async () => {
            const applied = await kubectl.apply(remoteWorkloadYaml(WORKLOAD_NAME));
            await expect(applied).toBeSuccessful();
        });
        await step('等待 Workload Ready', {
            showStepTitle: false,
            pauseAfter: 1800,
        }, async () => {
            const readiness = await kubectl.wait('tensorfusionworkload', WORKLOAD_NAME, 'Ready', { timeout: DEFAULT_TIMEOUT });
            await expect(readiness).toBeSuccessful();
            const workloadHandle = tensorfusionworkload(kubectl, WORKLOAD_NAME);
            await expect(workloadHandle).toHaveStatusField('phase', 'Running');
        });
        await step('确认 Workload 为远程模式', {
            typingSpeed: 80,
            pauseAfter: 1500,
        }, async () => {
            const workloadSpec = await kubectl.getJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.spec');
            expect(workloadSpec?.isLocalGPU).toBe(false);
        });
        // --- Launch the annotated client pod ---
        await step('创建远程模式 client pod', {
            showStepTitle: false,
            typingSpeed: 100,
            pauseAfter: 2500,
        }, async () => {
            const applied = await kubectl.apply(clientPodYaml(CLIENT_POD_NAME, TEST_GPU_POOL));
            await expect(applied).toBeSuccessful();
        });
        await step('等待 client pod Ready', {
            showStepTitle: false,
            pauseAfter: 2000,
        }, async () => {
            // Image pull for the PyTorch base image can be slow — triple budget.
            await kubectl.waitForPod(CLIENT_POD_NAME, 'Running', DEFAULT_TIMEOUT * 3);
        });
        // --- The connection should have been created automatically ---
        await step('从 client pod env 读取 connection 信息', {
            showStepTitle: false,
            typingSpeed: 80,
            pauseAfter: 2000,
        }, async () => {
            ({ connName, connNamespace } = await getConnectionInfoFromPod(kubectl, CLIENT_POD_NAME));
            expect(connName.length).toBeGreaterThan(0);
            expect(connNamespace.length).toBeGreaterThan(0);
        });
        await step('验证 TensorFusionConnection 资源存在', {
            typingSpeed: 80,
            pauseAfter: 2000,
        }, async () => {
            const connectionExists = await kubectl.exists('tensorfusionconnection', connName);
            expect(connectionExists).toBe(true);
        });
        // --- metadata: namespace plus ownerReference back to the client pod ---
        await step('验证 connection metadata 字段', {
            showStepTitle: false,
            typingSpeed: 80,
            pauseAfter: 2500,
        }, async () => {
            const meta = await kubectl.get('tensorfusionconnection', connName, {
                jqFilter: `{namespace: .metadata.namespace, workloadLabel: .metadata.labels["tensor-fusion.ai/workload"], ownerKind: .metadata.ownerReferences[0].kind, ownerName: .metadata.ownerReferences[0].name}`,
            });
            expect(meta?.namespace).toBe(TEST_NAMESPACE);
            expect(meta?.ownerKind).toBe('Pod');
            expect(meta?.ownerName).toBe(CLIENT_POD_NAME);
        });
        // --- spec must point back at the client pod ---
        await step('验证 connection spec 字段', {
            typingSpeed: 80,
            pauseAfter: 2500,
        }, async () => {
            const connSpec = await kubectl.getJsonPath('tensorfusionconnection', connName, '.spec');
            expect(connSpec?.clientPod).toBe(CLIENT_POD_NAME);
        });
        // --- informational: the connection should report some phase ---
        await step('查看 connection 状态', {
            pauseAfter: 2000,
        }, async () => {
            const connStatus = await kubectl.getJsonPath('tensorfusionconnection', connName, '.status');
            expect(connStatus?.phase).toBeDefined();
        });
        // --- GPU checks executed inside the client pod ---
        await step('执行 nvidia-smi 验证 GPU 可见', {
            showStepTitle: false,
            typingSpeed: 80,
            pauseAfter: 2500,
        }, async () => {
            const smiOutput = await kubectl.exec(CLIENT_POD_NAME, ['nvidia-smi'], { container: 'app' });
            expect(smiOutput).toContain('NVIDIA');
            expect(smiOutput).toContain('GPU');
        });
        await step('执行 nvidia-smi -L 列出 GPU 设备', {
            typingSpeed: 80,
            pauseAfter: 2000,
        }, async () => {
            const deviceList = await kubectl.exec(CLIENT_POD_NAME, ['nvidia-smi', '-L'], { container: 'app' });
            expect(deviceList).toContain('GPU 0');
        });
        await step('PyTorch 检测 CUDA 可用', {
            showStepTitle: false,
            typingSpeed: 80,
            pauseAfter: 2500,
        }, async () => {
            const cudaFlag = await kubectl.exec(CLIENT_POD_NAME, ['python3', '-c', 'import torch; print(torch.cuda.is_available())'], { container: 'app' });
            expect(cudaFlag.trim()).toBe('True');
        });
        await step('PyTorch 获取 GPU 设备信息', {
            typingSpeed: 80,
            pauseAfter: 2500,
        }, async () => {
            const deviceInfo = await kubectl.exec(CLIENT_POD_NAME, ['python3', '-c', 'import torch; print(f"device_count={torch.cuda.device_count()}, name={torch.cuda.get_device_name(0)}")'], { container: 'app' });
            expect(deviceInfo).toContain('device_count=');
            expect(deviceInfo).not.toContain('device_count=0');
        });
        await step('PyTorch GPU 张量运算验证', {
            showStepTitle: false,
            typingSpeed: 80,
            pauseAfter: 3000,
        }, async () => {
            const mmOutput = await kubectl.exec(CLIENT_POD_NAME, [
                'python3', '-c',
                'import torch; a = torch.randn(2, 3, device="cuda"); b = torch.randn(3, 2, device="cuda"); c = torch.mm(a, b); print(f"shape={list(c.shape)}, device={c.device}")',
            ], { container: 'app' });
            expect(mmOutput).toContain('shape=[2, 2]');
            expect(mmOutput).toContain('device=cuda');
        });
        // --- Teardown ---
        await step('删除 client pod', {
            showStepTitle: false,
            typingSpeed: 80,
            pauseAfter: 1500,
        }, async () => {
            const deletion = await kubectl.delete('pod', CLIENT_POD_NAME);
            await expect(deletion).toBeSuccessful();
        });
        await step('删除 TensorFusionWorkload', {
            showStepTitle: false,
            typingSpeed: 80,
            pauseAfter: 2000,
        }, async () => {
            const deletion = await kubectl.delete('tensorfusionworkload', WORKLOAD_NAME);
            await expect(deletion).toBeSuccessful();
            await sleep(5000);
        });
    });
});
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Tensor Fusion GPU 资源分配测试 - 共享配置
3
+ *
4
+ * 运行方式:
5
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/
6
+ *
7
+ * 前置条件:
8
+ * - 已配置 kubectl 并连接到 Kubernetes 集群
9
+ * - Tensor Fusion Controller 已部署并运行
10
+ * - 至少存在一个 GPUPool 和可用 GPU
11
+ */
12
+ import { describe, expect, step } from 'repterm';
13
+ import { gpupool, gpu, tensorfusionworkload, tensorfusionconnection, pod, deployment, resource, type KubectlMethods } from '../../src/index.js';
14
+ /** 测试使用的 GPUPool 名称 */
15
+ export declare const TEST_GPU_POOL = "tensor-fusion-shared";
16
+ /** 测试命名空间 */
17
+ export declare const TEST_NAMESPACE = "default";
18
+ /** Tensor Fusion 系统命名空间 */
19
+ export declare const TF_SYSTEM_NAMESPACE = "tensor-fusion-sys";
20
+ /** Controller Deployment 名称 */
21
+ export declare const TF_CONTROLLER_DEPLOYMENT = "tensor-fusion-sys-controller";
22
+ /** 默认等待超时时间 (ms) */
23
+ export declare const DEFAULT_TIMEOUT = 60000;
24
+ export declare const config: import("node_modules/repterm/dist/plugin/index.js").PluginRuntime<readonly [import("repterm").PluginDefinition<"kubectl", import("@repterm/plugin-api").BasePluginContext, import("../../src/index.js").KubectlContext, KubectlMethods>]>;
25
+ export declare const test: (name: string, fn: import("repterm").PluginTestFunction<readonly [import("repterm").PluginDefinition<"kubectl", import("@repterm/plugin-api").BasePluginContext, import("../../src/index.js").KubectlContext, KubectlMethods>]>) => void;
26
+ export { describe, expect, step, gpupool, gpu, tensorfusionworkload, tensorfusionconnection, pod, deployment, resource, };
27
+ export type { KubectlMethods };
28
+ /**
29
+ * TensorFusionWorkload 测试模板
30
+ */
31
+ export declare const workloadYaml: (name: string, options?: {
32
+ tflopsRequest?: string;
33
+ tflopsLimit?: string;
34
+ vramRequest?: string;
35
+ vramLimit?: string;
36
+ replicas?: number;
37
+ poolName?: string;
38
+ }) => string;
39
+ /**
40
+ * 带 Tensor Fusion Annotation 的 Deployment 模板
41
+ */
42
+ export declare const annotatedDeploymentYaml: (name: string, options?: {
43
+ tflopsRequest?: string;
44
+ tflopsLimit?: string;
45
+ vramRequest?: string;
46
+ vramLimit?: string;
47
+ poolName?: string;
48
+ }) => string;
49
+ /**
50
+ * 获取 GPUPool 中第一个 GPU 名称
51
+ */
52
+ export declare function getFirstGpuName(kubectl: KubectlMethods): Promise<string>;
53
+ /**
54
+ * 获取 GPU 的可用资源
55
+ */
56
+ export declare function getGpuAvailable(kubectl: KubectlMethods, gpuName: string): Promise<{
57
+ tflops: string;
58
+ vram: string;
59
+ }>;
60
+ /**
61
+ * 解析 TFlops 值为数字(支持带 m 后缀和不带后缀的格式)
62
+ */
63
+ export declare function parseTflops(value: string | number): number;
64
+ /**
65
+ * 清理测试资源
66
+ */
67
+ export declare function cleanup(kubectl: KubectlMethods, resources: Array<{
68
+ kind: string;
69
+ name: string;
70
+ }>): Promise<void>;
71
+ //# sourceMappingURL=_config.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"_config.d.ts","sourceRoot":"","sources":["../../../../examples/tensor-fusion/_config.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACL,QAAQ,EAGR,MAAM,EAEN,IAAI,EACL,MAAM,SAAS,CAAC;AACjB,OAAO,EAEL,OAAO,EACP,GAAG,EACH,oBAAoB,EACpB,sBAAsB,EACtB,GAAG,EACH,UAAU,EACV,QAAQ,EACR,KAAK,cAAc,EACpB,MAAM,oBAAoB,CAAC;AAI5B,uBAAuB;AACvB,eAAO,MAAM,aAAa,yBAAyB,CAAC;AAEpD,aAAa;AACb,eAAO,MAAM,cAAc,YAAY,CAAC;AAExC,2BAA2B;AAC3B,eAAO,MAAM,mBAAmB,sBAAsB,CAAC;AAEvD,+BAA+B;AAC/B,eAAO,MAAM,wBAAwB,iCAAiC,CAAC;AAEvE,oBAAoB;AACpB,eAAO,MAAM,eAAe,QAAQ,CAAC;AAIrC,eAAO,MAAM,MAAM,2OAEjB,CAAC;AAEH,eAAO,MAAM,IAAI,0OAAgC,CAAC;AAIlD,OAAO,EACL,QAAQ,EACR,MAAM,EACN,IAAI,EACJ,OAAO,EACP,GAAG,EACH,oBAAoB,EACpB,sBAAsB,EACtB,GAAG,EACH,UAAU,EACV,QAAQ,GACT,CAAC;AAEF,YAAY,EAAE,cAAc,EAAE,CAAC;AAI/B;;GAEG;AACH,eAAO,MAAM,YAAY,GAAI,MAAM,MAAM,EAAE,UAAS;IAClD,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACd,WA4BL,CAAC;AAEF;;GAEG;AACH,eAAO,MAAM,uBAAuB,GAAI,MAAM,MAAM,EAAE,UAAS;IAC7D,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACd,WAwCL,CAAC;AAIF;;GAEG;AACH,wBAAsB,eAAe,CAAC,OAAO,EAAE,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC,CAc9E;AAED;;GAEG;AACH,wBAAsB,eAAe,CAAC,OAAO,EAAE,cAAc,EAAE,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC;IACvF,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,MAAM,CAAC;CACd,CAAC,CASD;AAED;;GAEG;AACH,wBAAgB,WAAW,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,MAAM,CAQ1D;AAED;;GAEG;AACH,wBAAsB,OAAO,CAAC,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,iBAQtG"}