@nexusgpu/repterm-plugin-kubectl 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. package/README.md +277 -0
  2. package/dist/index.d.ts +314 -0
  3. package/dist/index.d.ts.map +1 -0
  4. package/dist/index.js +544 -0
  5. package/dist/matchers.d.ts +113 -0
  6. package/dist/matchers.d.ts.map +1 -0
  7. package/dist/matchers.js +527 -0
  8. package/dist/plugin-kubectl/examples/00-simple-demo.d.ts +10 -0
  9. package/dist/plugin-kubectl/examples/00-simple-demo.d.ts.map +1 -0
  10. package/dist/plugin-kubectl/examples/00-simple-demo.js +51 -0
  11. package/dist/plugin-kubectl/examples/01-basic-kubectl.d.ts +13 -0
  12. package/dist/plugin-kubectl/examples/01-basic-kubectl.d.ts.map +1 -0
  13. package/dist/plugin-kubectl/examples/01-basic-kubectl.js +86 -0
  14. package/dist/plugin-kubectl/examples/02-debugging.d.ts +13 -0
  15. package/dist/plugin-kubectl/examples/02-debugging.d.ts.map +1 -0
  16. package/dist/plugin-kubectl/examples/02-debugging.js +80 -0
  17. package/dist/plugin-kubectl/examples/03-resource-management.d.ts +13 -0
  18. package/dist/plugin-kubectl/examples/03-resource-management.d.ts.map +1 -0
  19. package/dist/plugin-kubectl/examples/03-resource-management.js +134 -0
  20. package/dist/plugin-kubectl/examples/04-rollout.d.ts +13 -0
  21. package/dist/plugin-kubectl/examples/04-rollout.d.ts.map +1 -0
  22. package/dist/plugin-kubectl/examples/04-rollout.js +122 -0
  23. package/dist/plugin-kubectl/examples/05-matchers.d.ts +15 -0
  24. package/dist/plugin-kubectl/examples/05-matchers.d.ts.map +1 -0
  25. package/dist/plugin-kubectl/examples/05-matchers.js +138 -0
  26. package/dist/plugin-kubectl/examples/06-advanced.d.ts +14 -0
  27. package/dist/plugin-kubectl/examples/06-advanced.d.ts.map +1 -0
  28. package/dist/plugin-kubectl/examples/06-advanced.js +140 -0
  29. package/dist/plugin-kubectl/examples/tensor-fusion/00-prerequisites.d.ts +14 -0
  30. package/dist/plugin-kubectl/examples/tensor-fusion/00-prerequisites.d.ts.map +1 -0
  31. package/dist/plugin-kubectl/examples/tensor-fusion/00-prerequisites.js +66 -0
  32. package/dist/plugin-kubectl/examples/tensor-fusion/01-workload-allocation.d.ts +14 -0
  33. package/dist/plugin-kubectl/examples/tensor-fusion/01-workload-allocation.d.ts.map +1 -0
  34. package/dist/plugin-kubectl/examples/tensor-fusion/01-workload-allocation.js +145 -0
  35. package/dist/plugin-kubectl/examples/tensor-fusion/02-annotation-mode.d.ts +13 -0
  36. package/dist/plugin-kubectl/examples/tensor-fusion/02-annotation-mode.d.ts.map +1 -0
  37. package/dist/plugin-kubectl/examples/tensor-fusion/02-annotation-mode.js +123 -0
  38. package/dist/plugin-kubectl/examples/tensor-fusion/03-insufficient.d.ts +17 -0
  39. package/dist/plugin-kubectl/examples/tensor-fusion/03-insufficient.d.ts.map +1 -0
  40. package/dist/plugin-kubectl/examples/tensor-fusion/03-insufficient.js +96 -0
  41. package/dist/plugin-kubectl/examples/tensor-fusion/04-release.d.ts +13 -0
  42. package/dist/plugin-kubectl/examples/tensor-fusion/04-release.d.ts.map +1 -0
  43. package/dist/plugin-kubectl/examples/tensor-fusion/04-release.js +117 -0
  44. package/dist/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.d.ts +14 -0
  45. package/dist/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.d.ts.map +1 -0
  46. package/dist/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.js +145 -0
  47. package/dist/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.d.ts +14 -0
  48. package/dist/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.d.ts.map +1 -0
  49. package/dist/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.js +235 -0
  50. package/dist/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.d.ts +15 -0
  51. package/dist/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.d.ts.map +1 -0
  52. package/dist/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.js +146 -0
  53. package/dist/plugin-kubectl/examples/tensor-fusion/08-workload-replicas-scale.d.ts +13 -0
  54. package/dist/plugin-kubectl/examples/tensor-fusion/08-workload-replicas-scale.d.ts.map +1 -0
  55. package/dist/plugin-kubectl/examples/tensor-fusion/08-workload-replicas-scale.js +141 -0
  56. package/dist/plugin-kubectl/examples/tensor-fusion/09-gpu-remote-invocation.d.ts +15 -0
  57. package/dist/plugin-kubectl/examples/tensor-fusion/09-gpu-remote-invocation.d.ts.map +1 -0
  58. package/dist/plugin-kubectl/examples/tensor-fusion/09-gpu-remote-invocation.js +256 -0
  59. package/dist/plugin-kubectl/examples/tensor-fusion/_config.d.ts +71 -0
  60. package/dist/plugin-kubectl/examples/tensor-fusion/_config.d.ts.map +1 -0
  61. package/dist/plugin-kubectl/examples/tensor-fusion/_config.js +159 -0
  62. package/dist/plugin-kubectl/src/index.d.ts +314 -0
  63. package/dist/plugin-kubectl/src/index.d.ts.map +1 -0
  64. package/dist/plugin-kubectl/src/index.js +545 -0
  65. package/dist/plugin-kubectl/src/matchers.d.ts +113 -0
  66. package/dist/plugin-kubectl/src/matchers.d.ts.map +1 -0
  67. package/dist/plugin-kubectl/src/matchers.js +527 -0
  68. package/dist/plugin-kubectl/src/result.d.ts +80 -0
  69. package/dist/plugin-kubectl/src/result.d.ts.map +1 -0
  70. package/dist/plugin-kubectl/src/result.js +134 -0
  71. package/dist/repterm/src/api/describe.d.ts +18 -0
  72. package/dist/repterm/src/api/describe.d.ts.map +1 -0
  73. package/dist/repterm/src/api/describe.js +32 -0
  74. package/dist/repterm/src/api/expect.d.ts +43 -0
  75. package/dist/repterm/src/api/expect.d.ts.map +1 -0
  76. package/dist/repterm/src/api/expect.js +166 -0
  77. package/dist/repterm/src/api/hooks.d.ts +178 -0
  78. package/dist/repterm/src/api/hooks.d.ts.map +1 -0
  79. package/dist/repterm/src/api/hooks.js +230 -0
  80. package/dist/repterm/src/api/steps.d.ts +45 -0
  81. package/dist/repterm/src/api/steps.d.ts.map +1 -0
  82. package/dist/repterm/src/api/steps.js +105 -0
  83. package/dist/repterm/src/api/test.d.ts +101 -0
  84. package/dist/repterm/src/api/test.d.ts.map +1 -0
  85. package/dist/repterm/src/api/test.js +206 -0
  86. package/dist/repterm/src/index.d.ts +15 -0
  87. package/dist/repterm/src/index.d.ts.map +1 -0
  88. package/dist/repterm/src/index.js +23 -0
  89. package/dist/repterm/src/plugin/index.d.ts +47 -0
  90. package/dist/repterm/src/plugin/index.d.ts.map +1 -0
  91. package/dist/repterm/src/plugin/index.js +85 -0
  92. package/dist/repterm/src/plugin/withPlugins.d.ts +71 -0
  93. package/dist/repterm/src/plugin/withPlugins.d.ts.map +1 -0
  94. package/dist/repterm/src/plugin/withPlugins.js +100 -0
  95. package/dist/repterm/src/runner/models.d.ts +261 -0
  96. package/dist/repterm/src/runner/models.d.ts.map +1 -0
  97. package/dist/repterm/src/runner/models.js +4 -0
  98. package/dist/result.d.ts +80 -0
  99. package/dist/result.d.ts.map +1 -0
  100. package/dist/result.js +134 -0
  101. package/package.json +38 -0

package/dist/plugin-kubectl/examples/tensor-fusion/04-release.js
@@ -0,0 +1,117 @@
+ /**
+ * Test scenario 4: resource release verification
+ *
+ * Verifies that after the TensorFusionWorkload is deleted:
+ * - GPU resources are released correctly
+ * - available resources return to their initial values
+ * - the associated worker Pod is cleaned up
+ *
+ * Run with:
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/04-release.ts
+ */
+ import { test, describe, expect, step, tensorfusionworkload, workloadYaml, DEFAULT_TIMEOUT, getFirstGpuName, getGpuAvailable, parseTflops, } from './_config.js';
+ const WORKLOAD_NAME = 'test-workload-release';
+ describe('Test scenario 4: resource release verification', { record: true }, () => {
+ let gpuName;
+ let initialTflops;
+ let allocatedTflops;
+ // ===== Step 1: record the initial state =====
+ test('Step 1: record the initial GPU resource state', async (ctx) => {
+ const { kubectl } = ctx.plugins;
+ await step('Get the test GPU', async () => {
+ gpuName = await getFirstGpuName(kubectl);
+ });
+ await step('Record the initial available resources', async () => {
+ const available = await getGpuAvailable(kubectl, gpuName);
+ initialTflops = available.tflops;
+ });
+ });
+ // ===== Step 2: create the Workload and wait for it to become ready =====
+ test('Step 2: create the TensorFusionWorkload and wait until ready', async (ctx) => {
+ const { kubectl } = ctx.plugins;
+ await step('Create the Workload', async () => {
+ const yaml = workloadYaml(WORKLOAD_NAME, {
+ tflopsRequest: '1000m',
+ tflopsLimit: '2000m',
+ vramRequest: '1Gi',
+ vramLimit: '2Gi',
+ });
+ const result = await kubectl.apply(yaml);
+ await expect(result).toBeSuccessful();
+ });
+ await step('Wait for the Workload to become ready', async () => {
+ await kubectl.wait('tensorfusionworkload', WORKLOAD_NAME, 'Ready', { timeout: DEFAULT_TIMEOUT });
+ });
+ await step('Record the resources after allocation', async () => {
+ const available = await getGpuAvailable(kubectl, gpuName);
+ allocatedTflops = available.tflops;
+ const initialNum = parseTflops(initialTflops);
+ const allocatedNum = parseTflops(allocatedTflops);
+ // Verify that resources were actually allocated
+ expect(allocatedNum).toBeLessThan(initialNum);
+ });
+ });
+ // ===== Step 3: confirm the worker Pod exists =====
+ test('Step 3: confirm the worker Pod exists', async (ctx) => {
+ const { kubectl } = ctx.plugins;
+ await step('Find the worker Pod', async () => {
+ const pods = await kubectl.get('pod', undefined, { selector: `tensor-fusion.ai/workload=${WORKLOAD_NAME}` });
+ expect(pods.items?.length).toBeGreaterThan(0);
+ expect(pods.items[0].status.phase).toBe('Running');
+ });
+ });
+ // ===== Step 4: delete the Workload =====
+ test('Step 4: delete the TensorFusionWorkload', async (ctx) => {
+ const { kubectl } = ctx.plugins;
+ await step('Delete the Workload', async () => {
+ const result = await kubectl.delete('tensorfusionworkload', WORKLOAD_NAME);
+ await expect(result).toBeSuccessful();
+ });
+ await step('Wait for the Workload deletion to complete', async () => {
+ const startTime = Date.now();
+ const timeout = 30000;
+ while (Date.now() - startTime < timeout) {
+ const exists = await kubectl.exists('tensorfusionworkload', WORKLOAD_NAME);
+ if (!exists) {
+ return;
+ }
+ await new Promise(resolve => setTimeout(resolve, 2000));
+ }
+ // After the timeout, assert that the workload no longer exists
+ const exists = await kubectl.exists('tensorfusionworkload', WORKLOAD_NAME);
+ expect(exists).toBe(false);
+ });
+ });
+ // ===== Step 5: verify resource release =====
+ test('Step 5: verify the GPU resources have been released', async (ctx) => {
+ const { kubectl } = ctx.plugins;
+ await step('Wait for resources to be released', async () => {
+ await new Promise(resolve => setTimeout(resolve, 5000));
+ });
+ await step('Check the available GPU resources', async () => {
+ const releasedAvailable = await getGpuAvailable(kubectl, gpuName);
+ const initialNum = parseTflops(initialTflops);
+ const releasedNum = parseTflops(releasedAvailable.tflops);
+ // Verify resources have recovered (a small tolerance is allowed)
+ expect(Math.abs(releasedNum - initialNum)).toBeLessThan(100);
+ });
+ });
+ // ===== Step 6: verify the worker Pod was cleaned up =====
+ test('Step 6: verify the worker Pod was cleaned up', async (ctx) => {
+ const { kubectl } = ctx.plugins;
+ await step('Check for worker Pods', async () => {
+ const pods = await kubectl.get('pod', undefined, { selector: `tensor-fusion.ai/workload=${WORKLOAD_NAME}` });
+ // The worker Pod should have been deleted
+ expect(pods.items?.length ?? 0).toBe(0);
+ });
+ });
+ // ===== Step 7: verify the Workload no longer exists =====
+ test('Step 7: confirm the Workload is fully deleted', async (ctx) => {
+ const { kubectl } = ctx.plugins;
+ await step('Verify the Workload does not exist', async () => {
+ const workload = tensorfusionworkload(kubectl, WORKLOAD_NAME);
+ await expect(workload).toNotExistInCluster();
+ });
+ });
+ });
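
The deletion wait in Step 4 polls kubectl.exists in an inline loop, and scenario 6 below repeats the same poll-until-condition shape for worker annotations. A minimal reusable helper capturing that pattern might look like the following sketch; waitUntil and its option names are hypothetical and not part of this package — only the kubectl.exists call shown above is assumed.

// Editorial sketch (not in the package): the polling pattern used in Step 4,
// factored into a reusable helper. All names here are hypothetical.
async function waitUntil(
  predicate: () => Promise<boolean>,
  { timeoutMs = 30_000, intervalMs = 2_000 } = {},
): Promise<boolean> {
  const deadline = Date.now() + timeoutMs;
  while (Date.now() < deadline) {
    if (await predicate()) return true; // condition met before the deadline
    await new Promise(resolve => setTimeout(resolve, intervalMs));
  }
  return predicate(); // one final check after the deadline, mirroring Step 4
}

// Usage equivalent to Step 4, assuming the same kubectl plugin object:
// const gone = await waitUntil(async () =>
//   !(await kubectl.exists('tensorfusionworkload', WORKLOAD_NAME)));
// expect(gone).toBe(true);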

package/dist/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.d.ts
@@ -0,0 +1,14 @@
+ /**
+ * Test scenario 5: multiple Workloads sharing one GPU
+ *
+ * Verifies that when two TensorFusionWorkloads share the same GPU:
+ * - both workloads are scheduled normally and reach Running
+ * - the GPU.status.available resources are deducted accurately (twice the requested amount)
+ * - GPU.status.runningApps contains both workloads
+ * - resources recover correctly after deletion
+ *
+ * Run with:
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.ts
+ */
+ export {};
+ //# sourceMappingURL=05-multi-workload-shared-gpu.d.ts.map

package/dist/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"05-multi-workload-shared-gpu.d.ts","sourceRoot":"","sources":["../../../../examples/tensor-fusion/05-multi-workload-shared-gpu.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG"}

package/dist/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.js
@@ -0,0 +1,145 @@
+ /**
+ * Test scenario 5: multiple Workloads sharing one GPU
+ *
+ * Verifies that when two TensorFusionWorkloads share the same GPU:
+ * - both workloads are scheduled normally and reach Running
+ * - the GPU.status.available resources are deducted accurately (twice the requested amount)
+ * - GPU.status.runningApps contains both workloads
+ * - resources recover correctly after deletion
+ *
+ * Run with:
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/05-multi-workload-shared-gpu.ts
+ */
+ import { sleep } from 'bun';
+ import { test, describe, expect, step, tensorfusionworkload, workloadYaml, DEFAULT_TIMEOUT, getFirstGpuName, getGpuAvailable, parseTflops, TEST_NAMESPACE, } from './_config.js';
+ const WL_NAME_1 = 'tf-share-wl-1';
+ const WL_NAME_2 = 'tf-share-wl-2';
+ /** TFlops requested by each workload (1 TFlops = 1000m) */
+ const TFLOPS_REQUEST = '1000m';
+ const TFLOPS_LIMIT = '1000m';
+ const VRAM_REQUEST = '1Gi';
+ const VRAM_LIMIT = '1Gi';
+ describe('Test scenario 5: multiple Workloads sharing one GPU', { record: true }, () => {
+ test('Full flow: two TensorFusionWorkloads share the same GPU', async (ctx) => {
+ const { kubectl } = ctx.plugins;
+ let gpuName;
+ let initialTflops;
+ // ===== Step 1: record the initial state =====
+ await step('Get the target GPU and record its initial resources', {
+ showStepTitle: false,
+ typingSpeed: 60,
+ pauseAfter: 1500,
+ }, async () => {
+ gpuName = await getFirstGpuName(kubectl);
+ const available = await getGpuAvailable(kubectl, gpuName);
+ initialTflops = available.tflops;
+ // Confirm the GPU has enough resources for two workloads
+ const initialNum = parseTflops(initialTflops);
+ const requiredNum = parseTflops(TFLOPS_REQUEST) * 2;
+ expect(initialNum).toBeGreaterThanOrEqual(requiredNum);
+ });
+ // ===== Step 2: create the two workloads =====
+ await step('Create the first Workload: ' + WL_NAME_1, {
+ showStepTitle: false,
+ typingSpeed: 100,
+ pauseAfter: 2000,
+ }, async () => {
+ const yaml = workloadYaml(WL_NAME_1, {
+ tflopsRequest: TFLOPS_REQUEST,
+ tflopsLimit: TFLOPS_LIMIT,
+ vramRequest: VRAM_REQUEST,
+ vramLimit: VRAM_LIMIT,
+ });
+ const result = await kubectl.apply(yaml);
+ await expect(result).toBeSuccessful();
+ });
+ await step('Create the second Workload: ' + WL_NAME_2, {
+ showStepTitle: false,
+ typingSpeed: 100,
+ pauseAfter: 2000,
+ }, async () => {
+ const yaml = workloadYaml(WL_NAME_2, {
+ tflopsRequest: TFLOPS_REQUEST,
+ tflopsLimit: TFLOPS_LIMIT,
+ vramRequest: VRAM_REQUEST,
+ vramLimit: VRAM_LIMIT,
+ });
+ const result = await kubectl.apply(yaml);
+ await expect(result).toBeSuccessful();
+ });
+ // ===== Step 3: wait for both workloads to reach Running =====
+ await step('Wait for ' + WL_NAME_1 + ' to become Ready', {
+ showStepTitle: false,
+ pauseAfter: 1500,
+ }, async () => {
+ const result = await kubectl.wait('tensorfusionworkload', WL_NAME_1, 'Ready', { timeout: DEFAULT_TIMEOUT });
+ await expect(result).toBeSuccessful();
+ });
+ await step('Wait for ' + WL_NAME_2 + ' to become Ready', {
+ showStepTitle: false,
+ pauseAfter: 1500,
+ }, async () => {
+ const result = await kubectl.wait('tensorfusionworkload', WL_NAME_2, 'Ready', { timeout: DEFAULT_TIMEOUT });
+ await expect(result).toBeSuccessful();
+ });
+ await step('Verify both Workloads are in the Running phase', {
+ typingSpeed: 80,
+ pauseAfter: 2000,
+ }, async () => {
+ const wl1 = tensorfusionworkload(kubectl, WL_NAME_1);
+ await expect(wl1).toHaveStatusField('phase', 'Running');
+ const wl2 = tensorfusionworkload(kubectl, WL_NAME_2);
+ await expect(wl2).toHaveStatusField('phase', 'Running');
+ });
+ // ===== Step 4: verify the GPU available-resource deduction =====
+ await step('Check the change in available GPU resources', {
+ showStepTitle: false,
+ typingSpeed: 80,
+ pauseAfter: 2500,
+ }, async () => {
+ await sleep(1000);
+ const afterAvailable = await getGpuAvailable(kubectl, gpuName);
+ const initialNum = parseTflops(initialTflops);
+ const afterNum = parseTflops(afterAvailable.tflops);
+ const expectedDeduction = parseTflops(TFLOPS_REQUEST) * 2;
+ // TFlops should drop by roughly the combined request of the two workloads
+ expect(afterNum).toBeLessThan(initialNum);
+ expect(initialNum - afterNum).toBeGreaterThanOrEqual(expectedDeduction - 100);
+ expect(initialNum - afterNum).toBeLessThanOrEqual(expectedDeduction + 100);
+ });
+ // ===== Step 5: verify runningApps =====
+ await step('Check that GPU runningApps contains both workloads', {
+ showStepTitle: false,
+ typingSpeed: 80,
+ pauseAfter: 2500,
+ }, async () => {
+ const runningApps = await kubectl.getJsonPath('gpu', gpuName, '.status.runningApps');
+ expect(runningApps).toBeDefined();
+ expect(Array.isArray(runningApps)).toBe(true);
+ const appNames = (runningApps ?? []).map(app => `${app.namespace}/${app.name}`);
+ expect(appNames).toContain(`${TEST_NAMESPACE}/${WL_NAME_1}`);
+ expect(appNames).toContain(`${TEST_NAMESPACE}/${WL_NAME_2}`);
+ });
+ // ===== Step 6: clean up =====
+ await step('Delete both TensorFusionWorkloads', {
+ showStepTitle: false,
+ typingSpeed: 80,
+ pauseAfter: 2000,
+ }, async () => {
+ const r1 = await kubectl.delete('tensorfusionworkload', WL_NAME_1);
+ await expect(r1).toBeSuccessful();
+ const r2 = await kubectl.delete('tensorfusionworkload', WL_NAME_2);
+ await expect(r2).toBeSuccessful();
+ });
+ await step('Wait for resource release and verify recovery', {
+ pauseAfter: 2000,
+ }, async () => {
+ await sleep(5000);
+ const afterRelease = await getGpuAvailable(kubectl, gpuName);
+ const releasedNum = parseTflops(afterRelease.tflops);
+ const initialNum = parseTflops(initialTflops);
+ // Allow a small tolerance, but the value should be close to the initial one
+ expect(releasedNum).toBeGreaterThanOrEqual(initialNum - 100);
+ });
+ });
+ });
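
The doc comment above pins the unit convention (1 TFlops = 1000m), and the ±100 tolerances in Steps 4 and 6 only make sense if parseTflops returns milli-TFlops. The actual implementation lives in _config.ts, which is not included in this diff; a parser consistent with that convention might look like this sketch:

// Hypothetical sketch: the real parseTflops in _config.ts is not shown in
// this diff. Under the documented convention (1 TFlops = 1000m), a parser
// returning milli-TFlops would read:
function parseTflops(value: string | number): number {
  const text = String(value).trim();
  if (text.endsWith('m')) {
    return Number(text.slice(0, -1)); // '1000m' -> 1000 milli-TFlops (1 TFlops)
  }
  return Number(text) * 1000; // bare quantity in whole TFlops, e.g. '2' -> 2000
}

console.log(parseTflops('1000m')); // 1000
console.log(parseTflops('2'));     // 2000

On this reading, the tolerance of 100 used by the expectations above corresponds to 0.1 TFlops.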

package/dist/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.d.ts
@@ -0,0 +1,14 @@
+ /**
+ * Test scenario 6: GPU resource resizing (scale-up)
+ *
+ * Based on `GPU资源调整测试.md`:
+ * - first create a TensorFusionWorkload with GPU resources already allocated
+ * - record the GPU's available resources and the worker Pod's annotations
+ * - manually patch the workload to scale up its resources
+ * - verify the GPU's available resources drop further and the worker annotations are updated
+ *
+ * Run with:
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.ts
+ */
+ export {};
+ //# sourceMappingURL=06-workload-resource-resize.d.ts.map

package/dist/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"06-workload-resource-resize.d.ts","sourceRoot":"","sources":["../../../../examples/tensor-fusion/06-workload-resource-resize.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG"}

package/dist/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.js
@@ -0,0 +1,235 @@
+ /**
+ * Test scenario 6: GPU resource resizing (scale-up)
+ *
+ * Based on `GPU资源调整测试.md`:
+ * - first create a TensorFusionWorkload with GPU resources already allocated
+ * - record the GPU's available resources and the worker Pod's annotations
+ * - manually patch the workload to scale up its resources
+ * - verify the GPU's available resources drop further and the worker annotations are updated
+ *
+ * Run with:
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/06-workload-resource-resize.ts
+ */
+ import { sleep } from 'bun';
+ import { test, describe, expect, step, tensorfusionworkload, workloadYaml, DEFAULT_TIMEOUT, getGpuAvailable, parseTflops, } from './_config.js';
+ const WORKLOAD_NAME = 'test-workload-resize';
+ const INITIAL_TFLOPS = '100m';
+ const INITIAL_VRAM = '8Gi';
+ const TARGET_TFLOPS = '200m';
+ const TARGET_VRAM = '16Gi';
+ function parseResourceBytes(value) {
+ const text = String(value).trim();
+ if (!text) {
+ return 0;
+ }
+ const match = text.match(/^([0-9]+(?:\.[0-9]+)?)([a-zA-Z]+)?$/);
+ if (!match) {
+ return Number(text) || 0;
+ }
+ const num = Number(match[1]);
+ const unit = match[2] ?? '';
+ const binaryUnits = {
+ Ki: 1024,
+ Mi: 1024 ** 2,
+ Gi: 1024 ** 3,
+ Ti: 1024 ** 4,
+ Pi: 1024 ** 5,
+ Ei: 1024 ** 6,
+ };
+ const decimalUnits = {
+ K: 1000,
+ M: 1000 ** 2,
+ G: 1000 ** 3,
+ T: 1000 ** 4,
+ P: 1000 ** 5,
+ E: 1000 ** 6,
+ m: 0.001,
+ };
+ if (binaryUnits[unit] !== undefined) {
+ return num * binaryUnits[unit];
+ }
+ if (decimalUnits[unit] !== undefined) {
+ return num * decimalUnits[unit];
+ }
+ return num;
+ }
+ async function getWorkerPodName(kubectl, workloadName) {
+ let workerPods = await kubectl.get('pod', undefined, {
+ selector: `tensor-fusion.ai/workload=${workloadName},tensor-fusion.ai/component=worker`,
+ jqFilter: '[.items[] | {name: .metadata.name, phase: .status.phase}]',
+ });
+ if (!workerPods || workerPods.length === 0) {
+ workerPods = await kubectl.get('pod', undefined, {
+ selector: `tensor-fusion.ai/workload=${workloadName}`,
+ jqFilter: '[.items[] | {name: .metadata.name, phase: .status.phase}]',
+ });
+ }
+ if (!workerPods || workerPods.length === 0) {
+ throw new Error(`No worker pod found for workload ${workloadName}`);
+ }
+ const runningPod = workerPods.find((podInfo) => podInfo.phase === 'Running');
+ return runningPod?.name ?? workerPods[0].name;
+ }
+ async function getWorkerResources(kubectl, workerPodName) {
+ const data = await kubectl.get('pod', workerPodName, {
+ jqFilter: '{tflops: .metadata.annotations["tensor-fusion.ai/tflops-request"], vram: .metadata.annotations["tensor-fusion.ai/vram-request"], gpuIds: .metadata.annotations["tensor-fusion.ai/gpu-ids"]}',
+ });
+ return {
+ tflops: String(data?.tflops ?? ''),
+ vram: String(data?.vram ?? ''),
+ gpuIds: String(data?.gpuIds ?? ''),
+ };
+ }
+ async function resolveGpuNameFromId(kubectl, gpuId) {
+ try {
+ const direct = await kubectl.getJsonPath('gpu', gpuId, '.metadata.name');
+ if (direct) {
+ return direct;
+ }
+ }
+ catch {
+ // If gpuId is not the name of a GPU CR, fall back to a reverse lookup by UUID
+ }
+ const mapped = await kubectl.get('gpu', undefined, {
+ jqFilter: `[.items[] | select(.status.uuid == "${gpuId}")][0].metadata.name`,
+ });
+ if (!mapped || mapped === 'null') {
+ throw new Error(`Cannot resolve GPU resource name from id: ${gpuId}`);
+ }
+ return String(mapped);
+ }
+ describe('Test scenario 6: GPU resource resizing (scale-up)', { record: true }, () => {
+ test('GPU and worker annotations update in sync after the TensorFusionWorkload is scaled up', async (ctx) => {
+ const { kubectl } = ctx.plugins;
+ let workerPodName;
+ let gpuName;
+ let beforeGpuTflops;
+ let beforeGpuVram;
+ let beforeWorkerTflops;
+ let beforeWorkerVram;
+ let afterGpuTflops;
+ let afterGpuVram;
+ let afterWorkerTflops;
+ let afterWorkerVram;
+ let beforeGpuId;
+ let afterGpuId;
+ // ===== Step 1: create the initial Workload and wait until ready =====
+ await step('Create the initial Workload (100m/8Gi)', {
+ showStepTitle: false,
+ typingSpeed: 100,
+ pauseAfter: 1800,
+ }, async () => {
+ const yaml = workloadYaml(WORKLOAD_NAME, {
+ tflopsRequest: INITIAL_TFLOPS,
+ tflopsLimit: INITIAL_TFLOPS,
+ vramRequest: INITIAL_VRAM,
+ vramLimit: INITIAL_VRAM,
+ });
+ const result = await kubectl.apply(yaml);
+ await expect(result).toBeSuccessful();
+ });
+ await step('Wait for the Workload to be Ready and confirm Running', {
+ pauseAfter: 1800,
+ }, async () => {
+ const waitResult = await kubectl.wait('tensorfusionworkload', WORKLOAD_NAME, 'Ready', {
+ timeout: DEFAULT_TIMEOUT,
+ });
+ await expect(waitResult).toBeSuccessful();
+ const workload = tensorfusionworkload(kubectl, WORKLOAD_NAME);
+ await expect(workload).toHaveStatusField('phase', 'Running');
+ });
+ // ===== Step 2: record the pre-resize baseline =====
+ await step('Record the pre-resize GPU available resources and worker annotations', {
+ showStepTitle: false,
+ typingSpeed: 80,
+ pauseAfter: 2000,
+ }, async () => {
+ workerPodName = await getWorkerPodName(kubectl, WORKLOAD_NAME);
+ const beforeWorker = await getWorkerResources(kubectl, workerPodName);
+ beforeWorkerTflops = beforeWorker.tflops;
+ beforeWorkerVram = beforeWorker.vram;
+ expect(beforeWorkerTflops).toBe(INITIAL_TFLOPS);
+ expect(beforeWorkerVram).toBe(INITIAL_VRAM);
+ const gpuId = beforeWorker.gpuIds.split(',')[0]?.trim();
+ expect(gpuId).toBeDefined();
+ expect(gpuId?.length ?? 0).toBeGreaterThan(0);
+ beforeGpuId = gpuId;
+ gpuName = await resolveGpuNameFromId(kubectl, beforeGpuId);
+ const beforeGpu = await getGpuAvailable(kubectl, gpuName);
+ beforeGpuTflops = beforeGpu.tflops;
+ beforeGpuVram = beforeGpu.vram;
+ });
+ // ===== Step 3: scale up the workload resources =====
+ await step('Patch the Workload resources to 200m/16Gi', {
+ showStepTitle: false,
+ typingSpeed: 100,
+ pauseAfter: 1800,
+ }, async () => {
+ const patchResult = await kubectl.patch('tensorfusionworkload', WORKLOAD_NAME, {
+ spec: {
+ resources: {
+ requests: {
+ tflops: TARGET_TFLOPS,
+ vram: TARGET_VRAM,
+ },
+ limits: {
+ tflops: TARGET_TFLOPS,
+ vram: TARGET_VRAM,
+ },
+ },
+ },
+ }, 'merge');
+ await expect(patchResult).toBeSuccessful();
+ const spec = await kubectl.getJsonPath('tensorfusionworkload', WORKLOAD_NAME, '.spec.resources');
+ expect(spec?.requests?.tflops).toBe(TARGET_TFLOPS);
+ expect(spec?.requests?.vram).toBe(TARGET_VRAM);
+ });
+ // ===== Step 4: wait for the worker annotations to update =====
+ await step('Wait for the worker annotations to reach the target values', {
+ showStepTitle: false,
+ typingSpeed: 80,
+ pauseAfter: 2000,
+ }, async () => {
+ const deadline = Date.now() + DEFAULT_TIMEOUT;
+ while (Date.now() < deadline) {
+ workerPodName = await getWorkerPodName(kubectl, WORKLOAD_NAME);
+ const currentWorker = await getWorkerResources(kubectl, workerPodName);
+ if (currentWorker.tflops === TARGET_TFLOPS && currentWorker.vram === TARGET_VRAM) {
+ afterWorkerTflops = currentWorker.tflops;
+ afterWorkerVram = currentWorker.vram;
+ afterGpuId = currentWorker.gpuIds.split(',')[0]?.trim() ?? '';
+ break;
+ }
+ await sleep(3000);
+ }
+ expect(afterWorkerTflops).toBe(TARGET_TFLOPS);
+ expect(afterWorkerVram).toBe(TARGET_VRAM);
+ expect(afterGpuId).toBe(beforeGpuId);
+ });
+ // ===== Step 5: verify the GPU available resources drop further =====
+ await step('Verify the available GPU resources decreased further', {
+ typingSpeed: 80,
+ pauseAfter: 2200,
+ }, async () => {
+ const afterGpu = await getGpuAvailable(kubectl, gpuName);
+ afterGpuTflops = afterGpu.tflops;
+ afterGpuVram = afterGpu.vram;
+ const beforeTflopsNum = parseTflops(beforeGpuTflops);
+ const afterTflopsNum = parseTflops(afterGpuTflops);
+ const beforeVramBytes = parseResourceBytes(beforeGpuVram);
+ const afterVramBytes = parseResourceBytes(afterGpuVram);
+ expect(afterTflopsNum).toBeLessThan(beforeTflopsNum);
+ expect(afterVramBytes).toBeLessThan(beforeVramBytes);
+ });
+ // ===== Step 6: clean up =====
+ await step('Delete the Workload and wait for resources to be released', {
+ showStepTitle: false,
+ typingSpeed: 80,
+ pauseAfter: 1800,
+ }, async () => {
+ const deleteResult = await kubectl.delete('tensorfusionworkload', WORKLOAD_NAME);
+ await expect(deleteResult).toBeSuccessful();
+ await sleep(5000);
+ });
+ });
+ });
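
The parseResourceBytes helper above follows Kubernetes quantity conventions: binary suffixes (Ki..Ei) multiply by powers of 1024, decimal suffixes (K..E) by powers of 1000, and m scales by 0.001. A few illustrative values, assuming the function exactly as defined in this file:

// Illustrative calls (assumes parseResourceBytes from the file above is in scope).
console.log(parseResourceBytes('8Gi'));  // 8 * 1024 ** 3  = 8589934592
console.log(parseResourceBytes('16Gi')); // 16 * 1024 ** 3 = 17179869184
console.log(parseResourceBytes('500M')); // 500 * 1000 ** 2 = 500000000
console.log(parseResourceBytes('100m')); // 100 * 0.001 ≈ 0.1
console.log(parseResourceBytes('42'));   // no suffix -> 42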

package/dist/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.d.ts
@@ -0,0 +1,15 @@
+ /**
+ * Test scenario 7: Workload creation and worker Pod generation
+ *
+ * Based on `Workload创建和WorkerPod生成测试.md`:
+ * - create a TensorFusionWorkload with `replicas=2`
+ * - wait for `status.workerCount` to become 2
+ * - wait for the worker Pods to become Ready
+ * - verify the worker Pod count matches `spec.replicas`
+ * - verify `status.workerCount` is updated correctly
+ *
+ * Run with:
+ * bun run repterm packages/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.ts
+ */
+ export {};
+ //# sourceMappingURL=07-workload-worker-pod-generation.d.ts.map

package/dist/plugin-kubectl/examples/tensor-fusion/07-workload-worker-pod-generation.d.ts.map
@@ -0,0 +1 @@
+ {"version":3,"file":"07-workload-worker-pod-generation.d.ts","sourceRoot":"","sources":["../../../../examples/tensor-fusion/07-workload-worker-pod-generation.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;GAYG"}