aws-cdk-neuronx-patterns 0.1.1 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.jsii +384 -82
- package/AGENT.md +11 -0
- package/API.md +237 -1
- package/lib/base/aws-batch/neuronx-batch-compute-environment.js +1 -1
- package/lib/base/aws-batch/neuronx-batch-ecs-job-definition.js +1 -1
- package/lib/base/aws-batch/neuronx-batch.js +1 -1
- package/lib/base/aws-ecs-patterns/application-load-balanced-neuronx-service.js +2 -2
- package/lib/base/neuronx/calculator.test.js +61 -1
- package/lib/base/neuronx/deep-learning-containers.d.ts +21 -3
- package/lib/base/neuronx/deep-learning-containers.js +25 -7
- package/lib/base/neuronx/deep-learning-containers.test.js +80 -3
- package/lib/base/neuronx/model.js +2 -2
- package/lib/base/neuronx/neuron-optimized-machine-image.js +1 -1
- package/lib/base/neuronx/neuronx-instance-type.d.ts +18 -0
- package/lib/base/neuronx/neuronx-instance-type.js +60 -7
- package/lib/base/neuronx/neuronx-instance-type.test.js +80 -1
- package/lib/base/neuronx-compiler/neuronx-compiler.js +1 -1
- package/lib/base/server-engine/vllm-engine/vllm-engine-argments.js +1 -1
- package/lib/vllm-nxd-inference/vllm-nxd-inference-compiler.js +5 -3
- package/lib/vllm-nxd-inference/vllm-nxd-inference-ecs-patterns.js +4 -4
- package/package.json +6 -6
|
@@ -14,6 +14,33 @@ describe("Inferentia2Chips", () => {
|
|
|
14
14
|
expect(inferentia2Chips.acceleratorMemory).toStrictEqual(aws_cdk_lib_1.Size.gibibytes(384));
|
|
15
15
|
});
|
|
16
16
|
});
|
|
17
|
+
// https://aws.amazon.com/ec2/instance-types/trn2/
|
|
18
|
+
describe("Trainium2Chips", () => {
|
|
19
|
+
describe("single chip", () => {
|
|
20
|
+
const trainium2Chips = new neuronx_instance_type_1.Trainium2Chips(1);
|
|
21
|
+
it("chips as is number", () => {
|
|
22
|
+
expect(trainium2Chips.chips).toBe(1);
|
|
23
|
+
});
|
|
24
|
+
it("NeuronxCores is 4 per chip (LNC=2 default)", () => {
|
|
25
|
+
expect(trainium2Chips.neuronxCores).toBe(4);
|
|
26
|
+
});
|
|
27
|
+
it("AcceleratorMemory is 24 GiB per logical core", () => {
|
|
28
|
+
expect(trainium2Chips.acceleratorMemory).toStrictEqual(aws_cdk_lib_1.Size.gibibytes(96));
|
|
29
|
+
});
|
|
30
|
+
});
|
|
31
|
+
describe("16 chips (trn2.48xlarge)", () => {
|
|
32
|
+
const trainium2Chips = new neuronx_instance_type_1.Trainium2Chips(16);
|
|
33
|
+
it("chips as is number", () => {
|
|
34
|
+
expect(trainium2Chips.chips).toBe(16);
|
|
35
|
+
});
|
|
36
|
+
it("NeuronxCores is 64 for 16 chips (LNC=2 default)", () => {
|
|
37
|
+
expect(trainium2Chips.neuronxCores).toBe(64);
|
|
38
|
+
});
|
|
39
|
+
it("AcceleratorMemory is 1536 GiB for 16 chips", () => {
|
|
40
|
+
expect(trainium2Chips.acceleratorMemory).toStrictEqual(aws_cdk_lib_1.Size.gibibytes(1536));
|
|
41
|
+
});
|
|
42
|
+
});
|
|
43
|
+
});
|
|
17
44
|
// https://aws.amazon.com/ec2/instance-types/inf2/?nc1=h_ls
|
|
18
45
|
describe("NeuronxInstanceType", () => {
|
|
19
46
|
const instanceTypes = [
|
|
@@ -79,4 +106,56 @@ describe("NeuronxInstanceType", () => {
|
|
|
79
106
|
expect(instanceType.acceleratorChips.acceleratorMemory).toStrictEqual(expected.acceleratorChips.acceleratorMemory);
|
|
80
107
|
});
|
|
81
108
|
});
|
|
82
|
-
|
|
109
|
+
// https://aws.amazon.com/ec2/instance-types/trn2/
|
|
110
|
+
describe("NeuronxInstanceType Trn2", () => {
|
|
111
|
+
const instanceTypes = [
|
|
112
|
+
[
|
|
113
|
+
neuronx_instance_type_1.NeuronxInstanceType.TRN2_3XLARGE,
|
|
114
|
+
{
|
|
115
|
+
instanceType: "trn2.3xlarge",
|
|
116
|
+
vCpu: 12,
|
|
117
|
+
memory: aws_cdk_lib_1.Size.gibibytes(128),
|
|
118
|
+
acceleratorChips: {
|
|
119
|
+
chips: 1,
|
|
120
|
+
neuronxCores: 4,
|
|
121
|
+
acceleratorMemory: aws_cdk_lib_1.Size.gibibytes(96),
|
|
122
|
+
},
|
|
123
|
+
},
|
|
124
|
+
],
|
|
125
|
+
[
|
|
126
|
+
neuronx_instance_type_1.NeuronxInstanceType.TRN2_48XLARGE,
|
|
127
|
+
{
|
|
128
|
+
instanceType: "trn2.48xlarge",
|
|
129
|
+
vCpu: 192,
|
|
130
|
+
memory: aws_cdk_lib_1.Size.gibibytes(2048),
|
|
131
|
+
acceleratorChips: {
|
|
132
|
+
chips: 16,
|
|
133
|
+
neuronxCores: 64,
|
|
134
|
+
acceleratorMemory: aws_cdk_lib_1.Size.gibibytes(1536),
|
|
135
|
+
},
|
|
136
|
+
},
|
|
137
|
+
],
|
|
138
|
+
[
|
|
139
|
+
neuronx_instance_type_1.NeuronxInstanceType.TRN2U_48XLARGE,
|
|
140
|
+
{
|
|
141
|
+
instanceType: "trn2u.48xlarge",
|
|
142
|
+
vCpu: 192,
|
|
143
|
+
memory: aws_cdk_lib_1.Size.gibibytes(2048),
|
|
144
|
+
acceleratorChips: {
|
|
145
|
+
chips: 16,
|
|
146
|
+
neuronxCores: 64,
|
|
147
|
+
acceleratorMemory: aws_cdk_lib_1.Size.gibibytes(1536),
|
|
148
|
+
},
|
|
149
|
+
},
|
|
150
|
+
],
|
|
151
|
+
];
|
|
152
|
+
it.each(instanceTypes)("InstanceType of %s", (instanceType, expected) => {
|
|
153
|
+
expect(instanceType.toString()).toBe(expected.instanceType);
|
|
154
|
+
expect(instanceType.vCpu).toBe(expected.vCpu);
|
|
155
|
+
expect(instanceType.memory).toStrictEqual(expected.memory);
|
|
156
|
+
expect(instanceType.acceleratorChips.chips).toBe(expected.acceleratorChips.chips);
|
|
157
|
+
expect(instanceType.acceleratorChips.neuronxCores).toBe(expected.acceleratorChips.neuronxCores);
|
|
158
|
+
expect(instanceType.acceleratorChips.acceleratorMemory).toStrictEqual(expected.acceleratorChips.acceleratorMemory);
|
|
159
|
+
});
|
|
160
|
+
});
|
|
161
|
+
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"neuronx-instance-type.test.js","sourceRoot":"","sources":["../../../src/base/neuronx/neuronx-instance-type.test.ts"],"names":[],"mappings":";;AAAA,6CAAmC;AACnC,mEAKiC;AAEjC,QAAQ,CAAC,kBAAkB,EAAE,GAAG,EAAE;IAChC,MAAM,gBAAgB,GAAG,IAAI,wCAAgB,CAAC,EAAE,CAAC,CAAC;IAClD,EAAE,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAC5B,MAAM,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC1C,CAAC,CAAC,CAAC;IACH,EAAE,CAAC,gCAAgC,EAAE,GAAG,EAAE;QACxC,MAAM,CAAC,gBAAgB,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IACjD,CAAC,CAAC,CAAC;IACH,EAAE,CAAC,qDAAqD,EAAE,GAAG,EAAE;QAC7D,MAAM,CAAC,gBAAgB,CAAC,iBAAiB,CAAC,CAAC,aAAa,CACtD,kBAAI,CAAC,SAAS,CAAC,GAAG,CAAC,CACpB,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,kDAAkD;AAClD,QAAQ,CAAC,gBAAgB,EAAE,GAAG,EAAE;IAC9B,QAAQ,CAAC,aAAa,EAAE,GAAG,EAAE;QAC3B,MAAM,cAAc,GAAG,IAAI,sCAAc,CAAC,CAAC,CAAC,CAAC;QAC7C,EAAE,CAAC,oBAAoB,EAAE,GAAG,EAAE;YAC5B,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACvC,CAAC,CAAC,CAAC;QACH,EAAE,CAAC,4CAA4C,EAAE,GAAG,EAAE;YACpD,MAAM,CAAC,cAAc,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC9C,CAAC,CAAC,CAAC;QACH,EAAE,CAAC,8CAA8C,EAAE,GAAG,EAAE;YACtD,MAAM,CAAC,cAAc,CAAC,iBAAiB,CAAC,CAAC,aAAa,CACpD,kBAAI,CAAC,SAAS,CAAC,EAAE,CAAC,CACnB,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IACH,QAAQ,CAAC,0BAA0B,EAAE,GAAG,EAAE;QACxC,MAAM,cAAc,GAAG,IAAI,sCAAc,CAAC,EAAE,CAAC,CAAC;QAC9C,EAAE,CAAC,oBAAoB,EAAE,GAAG,EAAE;YAC5B,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACxC,CAAC,CAAC,CAAC;QACH,EAAE,CAAC,iDAAiD,EAAE,GAAG,EAAE;YACzD,MAAM,CAAC,cAAc,CAAC,YAAY,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;QACH,EAAE,CAAC,4CAA4C,EAAE,GAAG,EAAE;YACpD,MAAM,CAAC,cAAc,CAAC,iBAAiB,CAAC,CAAC,aAAa,CACpD,kBAAI,CAAC,SAAS,CAAC,IAAI,CAAC,CACrB,CAAC;QACJ,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,2DAA2D;AAC3D,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;IACnC,MAAM,aAAa,GAAG;QACpB;YACE,2CAAmB,CAAC,WAAW;YAC/B;gBACE,YAAY,EAAE,aAAa;gBAC3B,IAAI,EAAE,CAAC;gBACP,MAAM,EAAE,kBAAI,CAAC,SAAS,CAAC,EAAE,CAAC;gBAC1B,gBAAgB,EAAE;oBAChB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,CAAC;oBACf,iBAAiB,EAAE,kBAAI,CAAC,SAAS,CAAC,EAAE,CAAC;iBACV;aAC9B;SACO;QACV;YACE,2CAAmB,CAAC,YAAY;YAChC;gBACE,YAAY,EAAE,cAAc;gBAC5B,IAAI,EAAE,EAAE;gBACR,MAAM,EAAE,kBAAI,CAAC,SAAS,CAAC,GAAG,CAAC;gBAC3B,gBAAgB,EAAE;oBAChB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,CAAC;oBACf,iBAAiB,EAAE,kBAAI,CAAC,SAAS,CAAC,EAAE,CAAC;iBACV;aAC9B;SACO;QACV;YACE,2CAAmB,CAAC,aAAa;YACjC;gBACE,YAAY,EAAE,eAAe;gBAC7B,IAAI,EAAE,EAAE;gBACR,MAAM,EAAE,kBAAI,CAAC,SAAS,CAAC,GAAG,CAAC;gBAC3B,gBAAgB,EAAE;oBAChB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,EAAE;oBAChB,iBAAiB,EAAE,kBAAI,CAAC,SAAS,CAAC,GAAG,CAAC;iBACX;aAC9B;SACO;QACV;YACE,2CAAmB,CAAC,aAAa;YACjC;gBACE,YAAY,EAAE,eAAe;gBAC7B,IAAI,EAAE,GAAG;gBACT,MAAM,EAAE,kBAAI,CAAC,SAAS,CAAC,GAAG,CAAC;gBAC3B,gBAAgB,EAAE;oBAChB,KAAK,EAAE,EAAE;oBACT,YAAY,EAAE,EAAE;oBAChB,iBAAiB,EAAE,kBAAI,CAAC,SAAS,CAAC,GAAG,CAAC;iBACX;aAC9B;SACO;KACX,CAAC;IACF,EAAE,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,oBAAoB,EAAE,CAAC,YAAY,EAAE,QAAQ,EAAE,EAAE;QACtE,MAAM,CAAC,YAAY,CAAC,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;QAC5D,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAC9C,MAAM,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QAC3D,MAAM,CAAC,YAAY,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,IAAI,CAC9C,QAAQ,CAAC,gBAAgB,CAAC,KAAK,CAChC,CAAC;QACF,MAAM,CAAC,YAAY,CAAC,gBAAgB,CAAC,YAAY,CAAC,CAAC,IAAI,CACrD,QAAQ,CAAC,gBAAgB,CAAC,YAAY,CACvC,CAAC;QACF,MAAM,CAAC,YAAY,CAAC,gBAAgB,CAAC,iBAAiB,CAAC,CAAC,aAAa,CACnE,QAAQ,CAAC,gBAAgB,CAAC,iBAAiB,CAC5C,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC;AAEH,kDAAkD;AAClD,QAAQ,CAAC,0BAA0B,EAAE,GAAG,EAAE;IACxC,MAAM,aAAa,GAAG;QACpB;YACE,2CAAmB,CAAC,YAAY;YAChC;gBACE,YAAY,EAAE,cAAc;gBAC5B,IAAI,EAAE,EAAE;gBACR,MAAM,EAAE,kBAAI,CAAC,SAAS,CAAC,GAAG,CAAC;gBAC3B,gBAAgB,EAAE;oBAChB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,CAAC;oBACf,iBAAiB,EAAE,kBAAI,CAAC,SAAS,CAAC,EAAE,CAAC;iBACV;aAC9B;SACO;QACV;YACE,2CAAmB,CAAC,aAAa;YACjC;gBACE,YAAY,EAAE,eAAe;gBAC7B,IAAI,EAAE,GAAG;gBACT,MAAM,EAAE,kBAAI,CAAC,SAAS,CAAC,IAAI,CAAC;gBAC5B,gBAAgB,EAAE;oBAChB,KAAK,EAAE,EAAE;oBACT,YAAY,EAAE,EAAE;oBAChB,iBAAiB,EAAE,kBAAI,CAAC,SAAS,CAAC,IAAI,CAAC;iBACZ;aAC9B;SACO;QACV;YACE,2CAAmB,CAAC,cAAc;YAClC;gBACE,YAAY,EAAE,gBAAgB;gBAC9B,IAAI,EAAE,GAAG;gBACT,MAAM,EAAE,kBAAI,CAAC,SAAS,CAAC,IAAI,CAAC;gBAC5B,gBAAgB,EAAE;oBAChB,KAAK,EAAE,EAAE;oBACT,YAAY,EAAE,EAAE;oBAChB,iBAAiB,EAAE,kBAAI,CAAC,SAAS,CAAC,IAAI,CAAC;iBACZ;aAC9B;SACO;KACX,CAAC;IACF,EAAE,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC,oBAAoB,EAAE,CAAC,YAAY,EAAE,QAAQ,EAAE,EAAE;QACtE,MAAM,CAAC,YAAY,CAAC,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,YAAY,CAAC,CAAC;QAC5D,MAAM,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC;QAC9C,MAAM,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC;QAC3D,MAAM,CAAC,YAAY,CAAC,gBAAgB,CAAC,KAAK,CAAC,CAAC,IAAI,CAC9C,QAAQ,CAAC,gBAAgB,CAAC,KAAK,CAChC,CAAC;QACF,MAAM,CAAC,YAAY,CAAC,gBAAgB,CAAC,YAAY,CAAC,CAAC,IAAI,CACrD,QAAQ,CAAC,gBAAgB,CAAC,YAAY,CACvC,CAAC;QACF,MAAM,CAAC,YAAY,CAAC,gBAAgB,CAAC,iBAAiB,CAAC,CAAC,aAAa,CACnE,QAAQ,CAAC,gBAAgB,CAAC,iBAAiB,CAC5C,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC","sourcesContent":["import { Size } from \"aws-cdk-lib\";\nimport {\n  IAcceleratorChips,\n  Inferentia2Chips,\n  Trainium2Chips,\n  NeuronxInstanceType,\n} from \"./neuronx-instance-type\";\n\ndescribe(\"Inferentia2Chips\", () => {\n  const inferentia2Chips = new Inferentia2Chips(12);\n  it(\"chips as is number\", () => {\n    expect(inferentia2Chips.chips).toBe(12);\n  });\n  it(\"NeuronxCores is twice of chips\", () => {\n    expect(inferentia2Chips.neuronxCores).toBe(24);\n  });\n  it(\"AcceleratorMemory is 16 times more than Neuron core\", () => {\n    expect(inferentia2Chips.acceleratorMemory).toStrictEqual(\n      Size.gibibytes(384),\n    );\n  });\n});\n\n// https://aws.amazon.com/ec2/instance-types/trn2/\ndescribe(\"Trainium2Chips\", () => {\n  describe(\"single chip\", () => {\n    const trainium2Chips = new Trainium2Chips(1);\n    it(\"chips as is number\", () => {\n      expect(trainium2Chips.chips).toBe(1);\n    });\n    it(\"NeuronxCores is 4 per chip (LNC=2 default)\", () => {\n      expect(trainium2Chips.neuronxCores).toBe(4);\n    });\n    it(\"AcceleratorMemory is 24 GiB per logical core\", () => {\n      expect(trainium2Chips.acceleratorMemory).toStrictEqual(\n        Size.gibibytes(96),\n      );\n    });\n  });\n  describe(\"16 chips (trn2.48xlarge)\", () => {\n    const trainium2Chips = new Trainium2Chips(16);\n    it(\"chips as is number\", () => {\n      expect(trainium2Chips.chips).toBe(16);\n    });\n    it(\"NeuronxCores is 64 for 16 chips (LNC=2 default)\", () => {\n      expect(trainium2Chips.neuronxCores).toBe(64);\n    });\n    it(\"AcceleratorMemory is 1536 GiB for 16 chips\", () => {\n      expect(trainium2Chips.acceleratorMemory).toStrictEqual(\n        Size.gibibytes(1536),\n      );\n    });\n  });\n});\n\n// https://aws.amazon.com/ec2/instance-types/inf2/?nc1=h_ls\ndescribe(\"NeuronxInstanceType\", () => {\n  const instanceTypes = [\n    [\n      NeuronxInstanceType.INF2_XLARGE,\n      {\n        instanceType: \"inf2.xlarge\",\n        vCpu: 4,\n        memory: Size.gibibytes(16),\n        acceleratorChips: {\n          chips: 1,\n          neuronxCores: 2,\n          acceleratorMemory: Size.gibibytes(32),\n        } satisfies IAcceleratorChips,\n      },\n    ] as const,\n    [\n      NeuronxInstanceType.INF2_8XLARGE,\n      {\n        instanceType: \"inf2.8xlarge\",\n        vCpu: 32,\n        memory: Size.gibibytes(128),\n        acceleratorChips: {\n          chips: 1,\n          neuronxCores: 2,\n          acceleratorMemory: Size.gibibytes(32),\n        } satisfies IAcceleratorChips,\n      },\n    ] as const,\n    [\n      NeuronxInstanceType.INF2_24XLARGE,\n      {\n        instanceType: \"inf2.24xlarge\",\n        vCpu: 96,\n        memory: Size.gibibytes(384),\n        acceleratorChips: {\n          chips: 6,\n          neuronxCores: 12,\n          acceleratorMemory: Size.gibibytes(192),\n        } satisfies IAcceleratorChips,\n      },\n    ] as const,\n    [\n      NeuronxInstanceType.INF2_48XLARGE,\n      {\n        instanceType: \"inf2.48xlarge\",\n        vCpu: 192,\n        memory: Size.gibibytes(768),\n        acceleratorChips: {\n          chips: 12,\n          neuronxCores: 24,\n          acceleratorMemory: Size.gibibytes(384),\n        } satisfies IAcceleratorChips,\n      },\n    ] as const,\n  ];\n  it.each(instanceTypes)(\"InstanceType of %s\", (instanceType, expected) => {\n    expect(instanceType.toString()).toBe(expected.instanceType);\n    expect(instanceType.vCpu).toBe(expected.vCpu);\n    expect(instanceType.memory).toStrictEqual(expected.memory);\n    expect(instanceType.acceleratorChips.chips).toBe(\n      expected.acceleratorChips.chips,\n    );\n    expect(instanceType.acceleratorChips.neuronxCores).toBe(\n      expected.acceleratorChips.neuronxCores,\n    );\n    expect(instanceType.acceleratorChips.acceleratorMemory).toStrictEqual(\n      expected.acceleratorChips.acceleratorMemory,\n    );\n  });\n});\n\n// https://aws.amazon.com/ec2/instance-types/trn2/\ndescribe(\"NeuronxInstanceType Trn2\", () => {\n  const instanceTypes = [\n    [\n      NeuronxInstanceType.TRN2_3XLARGE,\n      {\n        instanceType: \"trn2.3xlarge\",\n        vCpu: 12,\n        memory: Size.gibibytes(128),\n        acceleratorChips: {\n          chips: 1,\n          neuronxCores: 4,\n          acceleratorMemory: Size.gibibytes(96),\n        } satisfies IAcceleratorChips,\n      },\n    ] as const,\n    [\n      NeuronxInstanceType.TRN2_48XLARGE,\n      {\n        instanceType: \"trn2.48xlarge\",\n        vCpu: 192,\n        memory: Size.gibibytes(2048),\n        acceleratorChips: {\n          chips: 16,\n          neuronxCores: 64,\n          acceleratorMemory: Size.gibibytes(1536),\n        } satisfies IAcceleratorChips,\n      },\n    ] as const,\n    [\n      NeuronxInstanceType.TRN2U_48XLARGE,\n      {\n        instanceType: \"trn2u.48xlarge\",\n        vCpu: 192,\n        memory: Size.gibibytes(2048),\n        acceleratorChips: {\n          chips: 16,\n          neuronxCores: 64,\n          acceleratorMemory: Size.gibibytes(1536),\n        } satisfies IAcceleratorChips,\n      },\n    ] as const,\n  ];\n  it.each(instanceTypes)(\"InstanceType of %s\", (instanceType, expected) => {\n    expect(instanceType.toString()).toBe(expected.instanceType);\n    expect(instanceType.vCpu).toBe(expected.vCpu);\n    expect(instanceType.memory).toStrictEqual(expected.memory);\n    expect(instanceType.acceleratorChips.chips).toBe(\n      expected.acceleratorChips.chips,\n    );\n    expect(instanceType.acceleratorChips.neuronxCores).toBe(\n      expected.acceleratorChips.neuronxCores,\n    );\n    expect(instanceType.acceleratorChips.acceleratorMemory).toStrictEqual(\n      expected.acceleratorChips.acceleratorMemory,\n    );\n  });\n});\n"]}
|
|
@@ -162,5 +162,5 @@ class NeuronxCompiler extends constructs_1.Construct {
|
|
|
162
162
|
}
|
|
163
163
|
exports.NeuronxCompiler = NeuronxCompiler;
|
|
164
164
|
_a = JSII_RTTI_SYMBOL_1;
|
|
165
|
-
NeuronxCompiler[_a] = { fqn: "aws-cdk-neuronx-patterns.NeuronxCompiler", version: "0.
|
|
165
|
+
NeuronxCompiler[_a] = { fqn: "aws-cdk-neuronx-patterns.NeuronxCompiler", version: "0.2.1" };
|
|
166
166
|
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"neuronx-compiler.js","sourceRoot":"","sources":["../../../src/base/neuronx-compiler/neuronx-compiler.ts"],"names":[],"mappings":";;;;;AAAA,6CAQqB;AACrB,+CAA+C;AAC/C,2CAA2C;AAE3C,iDAA4C;AAC5C,uDAKgC;AAEhC,mEAAwD;AACxD,2CAAuC;AACvC,+BAA4B;AAC5B,4CAGsB;AACtB,wCAMoB;AAmGpB;;;GAGG;AACH,MAAa,eAAgB,SAAQ,sBAAS;IAU5C,YAAY,KAAgB,EAAE,EAAU,EAAE,KAA2B;QACnE,KAAK,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACjB,MAAM,UAAU,GAAG,kBAAI,CAAC,SAAS,CAC/B,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,SAAS,EAAE,GAAG,GAAG,CACjD,CAAC;QACF,MAAM,UAAU,GACd,KAAK,CAAC,UAAU,EAAE,WAAW,EAAE;YAC/B,IAAI,CAAC,IAAI,CACP,UAAU,CAAC,WAAW,EAAE;gBACtB,qCAA2B,CAAC,IAAI,CAAC,WAAW,EAAE;gBAC9C,qCAA2B,CAAC,IAAI,CAAC,WAAW,EAAE,CACjD,CAAC;QACJ,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,gBAAgB,EAAE;YACpE,YAAY,EAAE;gBACZ;oBACE,UAAU,EAAE,WAAW;oBACvB,MAAM,EAAE,GAAG,CAAC,iBAAiB,CAAC,GAAG,CAAC,UAAU,EAAE;wBAC5C,UAAU,EAAE,GAAG,CAAC,mBAAmB,CAAC,GAAG;wBACvC,SAAS,EAAE,IAAI;qBAChB,CAAC;iBACH;aACF;SACF,CAAC,CAAC;QAEH,MAAM,mBAAmB,GACvB,KAAK,CAAC,mBAAmB,IAAI,6BAAmB,CAAC,aAAa,CAAC;QACjE,MAAM,kBAAkB,GAAG,IAAI,0CAA8B,CAC3D,IAAI,EACJ,oBAAoB,EACpB;YACE,GAAG,EAAE,KAAK,CAAC,GAAG;YACd,UAAU,EAAE,KAAK,CAAC,UAAU;YAC5B,aAAa,EAAE,CAAC,mBAAmB,CAAC,YAAY,CAAC;YACjD,yBAAyB,EAAE,KAAK;YAChC,cAAc;YACd,IAAI,EAAE,KAAK,CAAC,IAAI;SACjB,CACF,CAAC;QAEF,kBAAI,CAAC,EAAE,CAAC,kBAAkB,CAAC,CAAC,GAAG,CAAC,MAAM,EAAE,wBAAwB,CAAC,CAAC;QAClE,IAAI,CAAC,QAAQ,GAAG,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,EAAE,UAAU,EAAE;YACnD,mBAAmB,EAAE;gBACnB;oBACE,kBAAkB;oBAClB,KAAK,EAAE,CAAC;iBACT;aACF;YACD,wBAAwB,EAAE;gBACxB;oBACE,KAAK,EAAE,KAAK,CAAC,6BAA6B,CAAC,QAAQ;oBACnD,MAAM,EAAE,KAAK,CAAC,8BAA8B,CAAC,wBAAwB;oBACrE,OAAO,EAAE,sBAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;oBAC7B,MAAM,EAAE,KAAK,CAAC,8BAA8B,CAAC,MAAM;iBACpD;aACF;SACF,CAAC,CAAC;QACH,KAAK,CAAC,KAAK,CAAC,MAAM,EAAE,SAAS,CAAC,kBAAkB,CAAC,YAAY,CAAC,CAAC;QAC/D,KAAK,CAAC,MAAM,CAAC,cAAc,CAAC,kBAAkB,CAAC,YAAY,CAAC,CAAC;QAC7D,IAAI,CAAC,aAAa,GAAG,IAAI,wCAA4B,CACnD,IAAI,EACJ,eAAe,EACf;YACE,mBAAmB;YACnB,KAAK,EAAE,KAAK,CAAC,KAAK,CAAC,KAAK;YACxB,oDAAoD;YACpD,kBAAkB;YAClB,2DAA2D;YAC3D,0EAA0E;YAC1E,4EAA4E;YAC5E,MAAM,EAAE,kBAAI,CAAC,SAAS,CACpB,IAAI,CAAC,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,WAAW,EAAE,GAAG,IAAI,CAAC,CAC3D;YACD,GAAG,EAAE,mBAAmB,CAAC,IAAI;YAC7B,WAAW,EAAE;gBACX,wBAAwB,EAAE,GAAG,KAAK,CAAC,MAAM,CAAC,cAAc,CAAC,sBAAsB,CAAC,EAAE;gBAClF,GAAG,KAAK,CAAC,WAAW;aACrB;YACD,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,OAAO,EAAE,KAAK,CAAC,OAAO;SACvB,CACF,CAAC;QAEF,MAAM,iBAAiB,GAAG,IAAI,8BAAiB,CAAC,IAAI,EAAE,mBAAmB,EAAE;YACzE,IAAI,EAAE,iBAAI,CAAC,SAAS,CAAC,IAAA,WAAI,EAAC,SAAS,EAAE,2BAA2B,CAAC,CAAC;YAClE,OAAO,EAAE,eAAe;YACxB,OAAO,EAAE,oBAAO,CAAC,aAAa;YAC9B,IAAI,EAAE,sCAAsC;YAC5C,WAAW,EAAE;gBACX,kBAAkB,EAAE,IAAI,CAAC,aAAa,CAAC,gBAAgB;gBACvD,aAAa,EAAE,IAAI,CAAC,QAAQ,CAAC,WAAW;aACzC;SACF,CAAC,CAAC;QACH,IAAI,CAAC,aAAa,CAAC,cAAc,CAAC,iBAAiB,EAAE,IAAI,CAAC,QAAQ,CAAC,CAAC;QACpE,MAAM,qBAAqB,GAAG,IAAI,8BAAiB,CACjD,IAAI,EACJ,uBAAuB,EACvB;YACE,IAAI,EAAE,iBAAI,CAAC,SAAS,CAAC,IAAA,WAAI,EAAC,SAAS,EAAE,2BAA2B,CAAC,CAAC;YAClE,OAAO,EAAE,kBAAkB;YAC3B,OAAO,EAAE,oBAAO,CAAC,aAAa;YAC9B,IAAI,EAAE,sCAAsC;YAC5C,WAAW,EAAE;gBACX,kBAAkB,EAAE,KAAK,CAAC,gBAAgB;aAC3C;SACF,CACF,CAAC;QACF,eAAK,CAAC,cAAc,CAAC;YACnB,YAAY,EAAE,CAAC,GAAG,CAAC;YACnB,OAAO,EAAE,qBAAqB;YAC9B,OAAO,EAAE,CAAC,oBAAoB,CAAC;SAChC,CAAC,CAAC;QACH,MAAM,QAAQ,GAAG,IAAI,2BAAQ,CAAC,IAAI,EAAE,oBAAoB,EAAE;YACxD,cAAc,EAAE,iBAAiB;YACjC,iBAAiB,EAAE,qBAAqB;YACxC,aAAa,EAAE,sBAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;YAClC,YAAY,EAAE,sBAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;SACjC,CAAC,CAAC;QACH,IAAI,CAAC,UAAU,GAAG,IAAI,8BAAiB,CAAC,IAAI,EAAE,uBAAuB,EAAE;YACrE,IAAI,EAAE,iBAAI,CAAC,SAAS,CAAC,IAAA,WAAI,EAAC,SAAS,EAAE,2BAA2B,CAAC,CAAC;YAClE,OAAO,EAAE,kBAAkB;YAC3B,WAAW,EAAE;gBACX,YAAY,EAAE,QAAQ,CAAC,YAAY;aACpC;YACD,OAAO,EAAE,sBAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;YAC7B,OAAO,EAAE,oBAAO,CAAC,aAAa;YAC9B,IAAI,EAAE,sCAAsC;SAC7C,CAAC,CAAC;QACH,qBAAQ,CAAC,eAAe,CACtB,IAAI,EACJ,kBAAkB,EAClB,QAAQ,CAAC,YAAY,CACtB,CAAC,WAAW,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;QAE/B,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;QACzB,IAAI,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,CAAC;QAC3B,IAAI,CAAC,gBAAgB,GAAG,KAAK,CAAC,gBAAgB,CAAC;QAC/C,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAC7B,IAAI,CAAC,mBAAmB,GAAG,mBAAmB,CAAC;IACjD,CAAC;IACD,OAAO;QACL,6BAA6B;QAC7B,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;YACvB,OAAO,IAAI,CAAC,aAAa,CAAC;QAC5B,CAAC;QACD,MAAM,mBAAmB,GAAG,IAAI,oCAAsB,CACpD,IAAI,EACJ,sBAAsB,IAAI,CAAC,gBAAgB,EAAE,CAC9C,CAAC;QACF,MAAM,UAAU,GAAG,IAAI,4BAAc,CAAC,IAAI,EAAE,gBAAgB,EAAE;YAC5D,YAAY,EAAE,IAAI,CAAC,UAAU,CAAC,WAAW;YACzC,YAAY,EAAE,wBAAwB;YACtC,UAAU,EAAE;gBACV,wBAAwB,EAAE,mBAAmB,CAAC,GAAG;aAClD;SACF,CAAC,CAAC;QACH,MAAM,IAAI,GAAG,IAAI,8BAAgB,CAC/B,IAAI,EACJ,gBAAgB,IAAI,CAAC,gBAAgB,EAAE,EACvC;YACE,KAAK,EAAE,CAAC;YACR,OAAO,EAAE,sBAAQ,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,SAAS,EAAE,CAAC,QAAQ,EAAE;YAClD,MAAM,EAAE,mBAAmB,CAAC,GAAG;SAChC,CACF,CAAC;QACF,IAAI,CAAC,IAAI,CAAC,aAAa,CAAC,UAAU,CAAC,CAAC;QACpC,MAAM,QAAQ,GAAG,gBAAE,CAAC,MAAM,CAAC,CAAC,EAAE,gBAAE,CAAC,KAAK,CAAC,GAAG,EAAE,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC;QAEvE,IAAI,CAAC,aAAa,GAAG;YACnB,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,SAAS;YAC/B,uBAAuB,EAAE,IAAI,CAAC,mBAAmB;YACjD,MAAM,EAAE,IAAI,CAAC,MAAM;YACnB,QAAQ;YACR,KAAK,EAAE,IAAI,CAAC,MAAM,CAAC,cAAc,CAAC,QAAQ,CAAC;YAC3C,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC;QACF,OAAO,IAAI,CAAC,aAAa,CAAC;IAC5B,CAAC;;AA1LH,0CA2LC","sourcesContent":["import {\n  CfnWaitCondition,\n  CfnWaitConditionHandle,\n  CustomResource,\n  Duration,\n  Fn,\n  Size,\n  Tags,\n} from \"aws-cdk-lib\";\nimport * as batch from \"aws-cdk-lib/aws-batch\";\nimport * as ec2 from \"aws-cdk-lib/aws-ec2\";\nimport { ContainerImage } from \"aws-cdk-lib/aws-ecs\";\nimport { Grant } from \"aws-cdk-lib/aws-iam\";\nimport {\n  Code,\n  Function,\n  Runtime,\n  SingletonFunction,\n} from \"aws-cdk-lib/aws-lambda\";\nimport { IBucket } from \"aws-cdk-lib/aws-s3\";\nimport { Provider } from \"aws-cdk-lib/custom-resources\";\nimport { Construct } from \"constructs\";\nimport { join } from \"path\";\nimport {\n  NeuronxBatchComputeEnvironment,\n  NeuronxBatchEcsJobDefinition,\n} from \"../aws-batch\";\nimport {\n  INeuronxInstanceType,\n  Model,\n  NeuronOptimizedMachineImage,\n  NeuronxInstanceType,\n  PytorchTrainingNeuronxImage,\n} from \"../neuronx\";\n\n/**\n * Compile runtime.\n */\nexport interface INeuronxContainerImage {\n  /**\n   * An image of the container where the compile job is executed.\n   */\n  readonly image: ContainerImage;\n  /**\n   * Neuronx version included in container image.\n   */\n  readonly neuronSdkVersion: string;\n}\n\n/**\n * Props of NeuronxCompiler.\n */\nexport interface NeuronxCompilerProps {\n  /**\n   * VPC in which this will launch compile worker instance.\n   */\n  readonly vpc: ec2.IVpc;\n  /**\n   * The bucket to upload compiled artifacts.\n   */\n  readonly bucket: IBucket;\n  /**\n   * Secrets to pass to the container.\n   */\n  readonly secrets?: { [key: string]: batch.Secret };\n  /**\n   * S3 Prefix that compiled artifact uploaded.\n   * This property is not depends on compile job finish.\n   */\n  readonly artifactS3Prefix: string;\n  /**\n   * The instance type of compile worker instance.\n   */\n  readonly neuronxInstanceType: INeuronxInstanceType;\n  /**\n   * The model to be compiled.\n   */\n  readonly model: Model;\n  /**\n   * An image of the container where the compile job is executed.\n   */\n  readonly image: INeuronxContainerImage;\n  readonly command?: string[];\n  /**\n   * The root volume of worker instance.\n   * @default - N bilion parameters * 5GiB EBS\n   */\n  readonly volumeSize?: Size;\n  /**\n   * Whether or not to use spot instances. Spot instances are less expensive EC2 instances that can be reclaimed by EC2 at any time; your job will be given two minutes of notice before reclamation.\n   *\n   * @default false\n   */\n  readonly spot?: boolean;\n  /**\n   * The VPC Subnets this Compute Environment will launch instances in.\n   *\n   * @default - new subnets will be created\n   */\n  readonly vpcSubnets?: ec2.SubnetSelection;\n  /**\n   * The environment variables to pass to the container.\n   * This is only applicable when using container runtime.\n   *\n   * @default - No environment variables.\n   */\n  readonly environment?: {\n    [key: string]: string;\n  };\n}\n\nexport interface NeuronxCompiledModel {\n  readonly compileTimeInstanceType: INeuronxInstanceType;\n  /**\n   * The bucket to upload compiled artifacts.\n   */\n  readonly bucket: IBucket;\n  /**\n   * S3 URL that compiled artifact uploaded.\n   */\n  readonly s3Uri: string;\n  /**\n   * S3 prefix that compiled artifact uploaded.\n   */\n  readonly s3Prefix: string;\n  /**\n   * The model name.\n   */\n  readonly modelName: string;\n  readonly weightSize: Size;\n}\n\n/**\n * Neuronx compiler construct.\n * Compile the model to work with Inferentia2 and Trainium1 and upload it to an S3 bucket.\n */\nexport class NeuronxCompiler extends Construct {\n  private compiledModel?: NeuronxCompiledModel;\n  private readonly entrypoint: SingletonFunction;\n  private readonly jobDefinition: NeuronxBatchEcsJobDefinition;\n  private readonly jobQueue: batch.JobQueue;\n  private readonly artifactS3Prefix: string;\n  private readonly weightSize: Size;\n  private readonly neuronxInstanceType: INeuronxInstanceType;\n  private readonly model: Model;\n  private readonly bucket: IBucket;\n  constructor(scope: Construct, id: string, props: NeuronxCompilerProps) {\n    super(scope, id);\n    const weightSize = Size.gibibytes(\n      props.model.options.parameters.toBillion() * 2.5,\n    );\n    const volumeSize =\n      props.volumeSize?.toGibibytes() ??\n      Math.ceil(\n        weightSize.toGibibytes() +\n          PytorchTrainingNeuronxImage.size.toGibibytes() +\n          NeuronOptimizedMachineImage.size.toGibibytes(),\n      );\n    const launchTemplate = new ec2.LaunchTemplate(this, \"LaunchTemplate\", {\n      blockDevices: [\n        {\n          deviceName: \"/dev/xvda\",\n          volume: ec2.BlockDeviceVolume.ebs(volumeSize, {\n            volumeType: ec2.EbsDeviceVolumeType.GP3,\n            encrypted: true,\n          }),\n        },\n      ],\n    });\n\n    const neuronxInstanceType =\n      props.neuronxInstanceType ?? NeuronxInstanceType.INF2_48XLARGE;\n    const computeEnvironment = new NeuronxBatchComputeEnvironment(\n      this,\n      \"ComputeEnvironment\",\n      {\n        vpc: props.vpc,\n        vpcSubnets: props.vpcSubnets,\n        instanceTypes: [neuronxInstanceType.instanceType],\n        useOptimalInstanceClasses: false,\n        launchTemplate,\n        spot: props.spot,\n      },\n    );\n\n    Tags.of(computeEnvironment).add(\"Name\", \"neuronx-compile-worker\");\n    this.jobQueue = new batch.JobQueue(this, \"JobQueue\", {\n      computeEnvironments: [\n        {\n          computeEnvironment,\n          order: 1,\n        },\n      ],\n      jobStateTimeLimitActions: [\n        {\n          state: batch.JobStateTimeLimitActionsState.RUNNABLE,\n          reason: batch.JobStateTimeLimitActionsReason.JOB_RESOURCE_REQUIREMENT,\n          maxTime: Duration.minutes(10),\n          action: batch.JobStateTimeLimitActionsAction.CANCEL,\n        },\n      ],\n    });\n    props.model.bucket?.grantRead(computeEnvironment.instanceRole);\n    props.bucket.grantReadWrite(computeEnvironment.instanceRole);\n    this.jobDefinition = new NeuronxBatchEcsJobDefinition(\n      this,\n      \"JobDefinition\",\n      {\n        neuronxInstanceType,\n        image: props.image.image,\n        // The fllowing command was executed on inf2.8xlarge\n        // sh-5.2$ free -b\n        // \t\t\ttotal\t\t\t\t\tused\t\t\tfree\t\t\t\t\tshared\tbuff/cache\tavailable\n        // Mem:\t132265766912\t866320384\t130341785600\t667648\t1057660928\t130529148928\n        // https://docs.aws.amazon.com/batch/latest/userguide/memory-management.html\n        memory: Size.mebibytes(\n          Math.ceil(neuronxInstanceType.memory.toMebibytes() * 0.95),\n        ),\n        cpu: neuronxInstanceType.vCpu,\n        environment: {\n          NEURON_COMPILE_CACHE_URL: `${props.bucket.s3UrlForObject(\"neuron-compile-cache\")}`,\n          ...props.environment,\n        },\n        command: props.command,\n        secrets: props.secrets,\n      },\n    );\n\n    const jobSubmitFunction = new SingletonFunction(this, \"JobSubmitFunction\", {\n      code: Code.fromAsset(join(__dirname, \"private/await-compile-job\")),\n      handler: \"index.onEvent\",\n      runtime: Runtime.NODEJS_LATEST,\n      uuid: \"1361f469-5c92-4c46-9e11-5d1dbf925bac\",\n      environment: {\n        JOB_DEFINITION_ARN: this.jobDefinition.jobDefinitionArn,\n        JOB_QUEUE_ARN: this.jobQueue.jobQueueArn,\n      },\n    });\n    this.jobDefinition.grantSubmitJob(jobSubmitFunction, this.jobQueue);\n    const jobMonitoringFunction = new SingletonFunction(\n      this,\n      \"JobMonitoringFunction\",\n      {\n        code: Code.fromAsset(join(__dirname, \"private/await-compile-job\")),\n        handler: \"index.isComplete\",\n        runtime: Runtime.NODEJS_LATEST,\n        uuid: \"df16dba8-5f77-480c-a6ad-cfdf74c3de62\",\n        environment: {\n          ARTIFACT_S3_PREFIX: props.artifactS3Prefix,\n        },\n      },\n    );\n    Grant.addToPrincipal({\n      resourceArns: [\"*\"],\n      grantee: jobMonitoringFunction,\n      actions: [\"batch:DescribeJobs\"],\n    });\n    const provider = new Provider(this, \"CompileJobProvider\", {\n      onEventHandler: jobSubmitFunction,\n      isCompleteHandler: jobMonitoringFunction,\n      queryInterval: Duration.minutes(1),\n      totalTimeout: Duration.hours(12),\n    });\n    this.entrypoint = new SingletonFunction(this, \"JobEntrypointFunction\", {\n      code: Code.fromAsset(join(__dirname, \"private/await-compile-job\")),\n      handler: \"index.entrypoint\",\n      environment: {\n        PROVIDER_ARN: provider.serviceToken,\n      },\n      timeout: Duration.minutes(15),\n      runtime: Runtime.NODEJS_LATEST,\n      uuid: \"f6e66997-5042-4df1-8781-bd68b3ac5313\",\n    });\n    Function.fromFunctionArn(\n      this,\n      \"ProviderFunction\",\n      provider.serviceToken,\n    ).grantInvoke(this.entrypoint);\n\n    this.model = props.model;\n    this.bucket = props.bucket;\n    this.artifactS3Prefix = props.artifactS3Prefix;\n    this.weightSize = weightSize;\n    this.neuronxInstanceType = neuronxInstanceType;\n  }\n  compile() {\n    // when invoke multiple times\n    if (this.compiledModel) {\n      return this.compiledModel;\n    }\n    const waitConditionHandle = new CfnWaitConditionHandle(\n      this,\n      `WaitConditionHandle${this.artifactS3Prefix}`,\n    );\n    const compileJob = new CustomResource(this, \"NeuronxCompile\", {\n      serviceToken: this.entrypoint.functionArn,\n      resourceType: \"Custom::NeuronxCompile\",\n      properties: {\n        waitConditionCallbackURL: waitConditionHandle.ref,\n      },\n    });\n    const wait = new CfnWaitCondition(\n      this,\n      `WaitCondition${this.artifactS3Prefix}`,\n      {\n        count: 1,\n        timeout: Duration.hours(12).toSeconds().toString(),\n        handle: waitConditionHandle.ref,\n      },\n    );\n    wait.node.addDependency(compileJob);\n    const s3Prefix = Fn.select(3, Fn.split('\"', wait.attrData.toString()));\n\n    this.compiledModel = {\n      modelName: this.model.modelName,\n      compileTimeInstanceType: this.neuronxInstanceType,\n      bucket: this.bucket,\n      s3Prefix,\n      s3Uri: this.bucket.s3UrlForObject(s3Prefix),\n      weightSize: this.weightSize,\n    };\n    return this.compiledModel;\n  }\n}\n"]}
|
|
@@ -349,5 +349,5 @@ class VllmEngineArgumentsParser {
|
|
|
349
349
|
}
|
|
350
350
|
exports.VllmEngineArgumentsParser = VllmEngineArgumentsParser;
|
|
351
351
|
_a = JSII_RTTI_SYMBOL_1;
|
|
352
|
-
VllmEngineArgumentsParser[_a] = { fqn: "aws-cdk-neuronx-patterns.VllmEngineArgumentsParser", version: "0.
|
|
352
|
+
VllmEngineArgumentsParser[_a] = { fqn: "aws-cdk-neuronx-patterns.VllmEngineArgumentsParser", version: "0.2.1" };
|
|
353
353
|
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"vllm-engine-argments.js","sourceRoot":"","sources":["../../../../src/base/server-engine/vllm-engine/vllm-engine-argments.ts"],"names":[],"mappings":";;;;;AAEA;;GAEG;AACH,IAAY,eAOX;AAPD,WAAY,eAAe;IACzB,kCAAe,CAAA;IACf,gCAAa,CAAA;IACb,sCAAmB,CAAA;IACnB,kCAAe,CAAA;IACf,wCAAqB,CAAA;IACrB,kCAAe,CAAA;AACjB,CAAC,EAPW,eAAe,+BAAf,eAAe,QAO1B;AAED;;GAEG;AACH,IAAY,UAaX;AAbD,WAAY,UAAU;IACpB,2BAAa,CAAA;IACb,uBAAS,CAAA;IACT,yCAA2B,CAAA;IAC3B,iCAAmB,CAAA;IACnB,6BAAe,CAAA;IACf,uCAAyB,CAAA;IACzB,6CAA+B,CAAA;IAC/B,2BAAa,CAAA;IACb,2CAA6B,CAAA;IAC7B,iCAAmB,CAAA;IACnB,+CAAiC,CAAA;IACjC,iDAAmC,CAAA;AACrC,CAAC,EAbW,UAAU,0BAAV,UAAU,QAarB;AAED;;GAEG;AACH,IAAY,QAyBX;AAzBD,WAAY,QAAQ;IAClB;;OAEG;IACH,yBAAa,CAAA;IACb;;OAEG;IACH,yBAAa,CAAA;IACb;;OAEG;IACH,+BAAmB,CAAA;IACnB;;OAEG;IACH,iCAAqB,CAAA;IACrB;;OAEG;IACH,2BAAe,CAAA;IACf;;OAEG;IACH,+BAAmB,CAAA;AACrB,CAAC,EAzBW,QAAQ,wBAAR,QAAQ,QAyBnB;AAED;;GAEG;AACH,IAAY,gBAGX;AAHD,WAAY,gBAAgB;IAC1B,iCAAa,CAAA;IACb,iCAAa,CAAA;AACf,CAAC,EAHW,gBAAgB,gCAAhB,gBAAgB,QAG3B;AAED;;GAEG;AACH,IAAY,qBAKX;AALD,WAAY,qBAAqB;IAC/B,sCAAa,CAAA;IACb,8CAAqB,CAAA;IACrB,kEAAyC,CAAA;IACzC,8CAAqB,CAAA;AACvB,CAAC,EALW,qBAAqB,qCAArB,qBAAqB,QAKhC;AAED;;GAEG;AACH,IAAY,eAGX;AAHD,WAAY,eAAe;IACzB,8CAA2B,CAAA;IAC3B,sCAAmB,CAAA;AACrB,CAAC,EAHW,eAAe,+BAAf,eAAe,QAG1B;AAED;;GAEG;AACH,IAAY,SAcX;AAdD,WAAY,SAAS;IACnB;;;OAGG;IACH,0BAAa,CAAA;IACb;;OAEG;IACH,0BAAa,CAAA;IACb;;OAEG;IACH,0CAA6B,CAAA;AAC/B,CAAC,EAdW,SAAS,yBAAT,SAAS,QAcpB;AAED;;GAEG;AACH,IAAY,0BAKX;AALD,WAAY,0BAA0B;IACpC,yCAAW,CAAA;IACX,uCAAS,CAAA;IACT,yCAAW,CAAA;IACX,qEAAuC,CAAA;AACzC,CAAC,EALW,0BAA0B,0CAA1B,0BAA0B,QAKrC;AAED;;GAEG;AACH,IAAY,SAMX;AAND,WAAY,SAAS;IACnB,6CAAU,CAAA;IACV,gDAAY,CAAA;IACZ,gDAAY,CAAA;IACZ,gDAAY,CAAA;IACZ,mDAAc,CAAA;AAChB,CAAC,EANW,SAAS,yBAAT,SAAS,QAMpB;AAED;;GAEG;AACH,IAAY,YAKX;AALD,WAAY,YAAY;IACtB,6BAAa,CAAA;IACb,2BAAW,CAAA;IACX,qCAAqB,CAAA;IACrB,qCAAqB,CAAA;AACvB,CAAC,EALW,YAAY,4BAAZ,YAAY,QAKvB;AAED;;GAEG;AACH,IAAY,qBASX;AATD,WAAY,qBAAqB;IAC/B;;OAEG;IACH,4CAAmB,CAAA;IACnB;;OAEG;IACH,0CAAiB,CAAA;AACnB,CAAC,EATW,qBAAqB,qCAArB,qBAAqB,QAShC;AAED;;GAEG;AACH,IAAY,YA2BX;AA3BD,WAAY,YAAY;IACtB,6BAAa,CAAA;IACb,2BAAW,CAAA;IACX,2CAA2B,CAAA;IAC3B,qCAAqB,CAAA;IACrB,2BAAW,CAAA;IACX,qCAAqB,CAAA;IACrB,yCAAyB,CAAA;IACzB,qCAAqB,CAAA;IACrB,+BAAe,CAAA;IACf,iCAAiB,CAAA;IACjB,6BAAa,CAAA;IACb,iDAAiC,CAAA;IACjC,2CAA2B,CAAA;IAC3B,yCAAyB,CAAA;IACzB,6BAAa,CAAA;IACb,yDAAyC,CAAA;IACzC,6CAA6B,CAAA;IAC7B,2BAAW,CAAA;IACX,2BAAW,CAAA;IACX,6CAA6B,CAAA;IAC7B,6CAA6B,CAAA;IAC7B,6BAAa,CAAA;IACb,+BAAe,CAAA;IACf,uCAAuB,CAAA;IACvB,mCAAmB,CAAA;IACnB,6BAAa,CAAA;AACf,CAAC,EA3BW,YAAY,4BAAZ,YAAY,QA2BvB;AAED;;GAEG;AACH,IAAY,aAiBX;AAjBD,WAAY,aAAa;IACvB;;OAEG;IACH,8BAAa,CAAA;IACb;;OAEG;IACH,8BAAa,CAAA;IACb;;OAEG;IACH,oCAAmB,CAAA;IACnB;;OAEG;IACH,kCAAiB,CAAA;AACnB,CAAC,EAjBW,aAAa,6BAAb,aAAa,QAiBxB;AAED;;GAEG;AACH,IAAY,YAOX;AAPD,WAAY,YAAY;IACtB;;OAEG;IACH,6BAAa,CAAA;IACb,yBAAS,CAAA;IACT,mCAAmB,CAAA;AACrB,CAAC,EAPW,YAAY,4BAAZ,YAAY,QAOvB;AAED;;GAEG;AACH,IAAY,iBAEX;AAFD,WAAY,iBAAiB;IAC3B,gCAAW,CAAA;AACb,CAAC,EAFW,iBAAiB,iCAAjB,iBAAiB,QAE5B;AAED;;GAEG;AACH,IAAY,MAQX;AARD,WAAY,MAAM;IAChB,uBAAa,CAAA;IACb,uBAAa,CAAA;IACb,2BAAiB,CAAA;IACjB,qBAAW,CAAA;IACX,qBAAW,CAAA;IACX,qBAAW,CAAA;IACX,qBAAW,CAAA;AACb,CAAC,EARW,MAAM,sBAAN,MAAM,QAQjB;AAED;;GAEG;AACH,IAAY,SAIX;AAJD,WAAY,SAAS;IACnB,0BAAa,CAAA;IACb,gCAAmB,CAAA;IACnB,kCAAqB,CAAA;AACvB,CAAC,EAJW,SAAS,yBAAT,SAAS,QAIpB;AAED;;GAEG;AACH,IAAY,yBAIX;AAJD,WAAY,yBAAyB;IACnC,0CAAa,CAAA;IACb,8CAAiB,CAAA;IACjB,8CAAiB,CAAA;AACnB,CAAC,EAJW,yBAAyB,yCAAzB,yBAAyB,QAIpC;AAED;;GAEG;AACH,IAAY,cAUX;AAVD,WAAY,cAAc;IACxB,mDAAiC,CAAA;IACjC,qCAAmB,CAAA;IACnB,mCAAiB,CAAA;IACjB,uCAAqB,CAAA;IACrB,iCAAe,CAAA;IACf,6CAA2B,CAAA;IAC3B,qCAAmB,CAAA;IACnB,mDAAiC,CAAA;IACjC,uCAAqB,CAAA;AACvB,CAAC,EAVW,cAAc,8BAAd,cAAc,QAUzB;AAED;;GAEG;AACH,IAAY,QASX;AATD,WAAY,QAAQ;IAClB,yBAAa,CAAA;IACb,iCAAqB,CAAA;IACrB,mCAAuB,CAAA;IACvB,2BAAe,CAAA;IACf,iCAAqB,CAAA;IACrB,2BAAe,CAAA;IACf,6BAAiB,CAAA;IACjB,2CAA+B,CAAA;AACjC,CAAC,EATW,QAAQ,wBAAR,QAAQ,QASnB;AAED;;GAEG;AACH,IAAY,cAGX;AAHD,WAAY,cAAc;IACxB,yCAAuB,CAAA;IACvB,+BAAa,CAAA;AACf,CAAC,EAHW,cAAc,8BAAd,cAAc,QAGzB;AAED;;GAEG;AACH,IAAY,gBAGX;AAHD,WAAY,gBAAgB;IAC1B,iCAAa,CAAA;IACb,yCAAqB,CAAA;AACvB,CAAC,EAHW,gBAAgB,gCAAhB,gBAAgB,QAG3B;AAm7BD,MAAM,mBAAmB,GAAkC;IACzD,gBAAgB;IAChB,gBAAgB;IAChB,gBAAgB;CACjB,CAAC;AACF,MAAM,UAAU,GAAkC,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;AAEvE,MAAsB,yBAAyB;IAC7C;;;;;OAKG;IACH,MAAM,CAAC,MAAM,CAAC,IAAyB;QACrC,OAAO,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC;aACxB,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,GAAgC,CAAC,CAAC;aACzE,MAAM,CAA2B,CAAC,IAAI,EAAE,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE;YACvD,MAAM,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;YACtE,IACE,mBAAmB,CAAC,QAAQ,CAAC,GAAgC,CAAC;gBAC9D,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,OAAO,KAAK,KAAK,QAAQ,CAAC,EACpD,CAAC;gBACD,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;YAChC,CAAC;YACD,IAAI,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC;YAChB,OAAO,IAAI,CAAC;QACd,CAAC,EAAE,EAAE,CAAC,CAAC;IACX,CAAC;IACD,MAAM,CAAC,GAAG,CAAC,IAAyB;QAClC,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC;aACrC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,GAAgC,CAAC,CAAC;aACzE,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;YACtE,IAAI,OAAO,KAAK,KAAK,SAAS,EAAE,CAAC;gBAC/B,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YACnC,CAAC;YACD,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;gBACzB,OAAO,CAAC,KAAK,GAAG,EAAE,EAAE,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;YACnD,CAAC;YACD,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;gBAC9B,OAAO,CAAC,KAAK,GAAG,EAAE,EAAE,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC;YAC7C,CAAC;YACD,OAAO,CAAC,KAAK,GAAG,EAAE,EAAE,GAAG,KAAK,EAAE,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QACL,OAAO,CAAC,IAAI,CAAC,KAAM,EAAE,GAAG,WAAW,CAAC,CAAC;IACvC,CAAC;;AAvCH,8DAwCC","sourcesContent":["import { Secret } from \"aws-cdk-lib/aws-batch\";\n\n/**\n * Log level options for Uvicorn\n */\nexport enum UvicornLogLevel {\n  DEBUG = \"debug\",\n  INFO = \"info\",\n  WARNING = \"warning\",\n  ERROR = \"error\",\n  CRITICAL = \"critical\",\n  TRACE = \"trace\",\n}\n\n/**\n * Available model weight loading formats\n */\nexport enum LoadFormat {\n  AUTO = \"auto\",\n  PT = \"pt\",\n  SAFETENSORS = \"safetensors\",\n  NPCACHE = \"npcache\",\n  DUMMY = \"dummy\",\n  TENSORIZER = \"tensorizer\",\n  SHARDED_STATE = \"sharded_state\",\n  GGUF = \"gguf\",\n  BITSANDBYTES = \"bitsandbytes\",\n  MISTRAL = \"mistral\",\n  RUNAI_STREAMER = \"runai_streamer\",\n  FASTSAFETENSORS = \"fastsafetensors\",\n}\n\n/**\n * Data types for model weights and activations\n */\nexport enum DataType {\n  /**\n   * “auto” will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.\n   */\n  AUTO = \"auto\",\n  /**\n   * “half” for FP16. Recommended for AWQ quantization.\n   */\n  HALF = \"half\",\n  /**\n   * “float16” is the same as “half”.\n   */\n  FLOAT16 = \"float16\",\n  /**\n   * “bfloat16” for a balance between precision and range.\n   */\n  BFLOAT16 = \"bfloat16\",\n  /**\n   * “float” is shorthand for FP32 precision.\n   */\n  FLOAT = \"float\",\n  /**\n   * “float32” for FP32 precision.\n   */\n  FLOAT32 = \"float32\",\n}\n\n/**\n * The folder path to the generation config.\n */\nexport enum GenerationConfig {\n  AUTO = \"auto\",\n  VLLM = \"vllm\",\n}\n\n/**\n * Available guided decoding backends\n */\nexport enum GuidedDecodingBackend {\n  AUTO = \"auto\",\n  OUTLINES = \"outlines\",\n  LM_FORMAT_ENFORCER = \"lm-format-enforcer\",\n  XGRAMMAR = \"xgrammar\",\n}\n\n/**\n * Available reasoning parsers\n */\nexport enum ReasoningParser {\n  DEEPSEEK_R1 = \"deepseek_r1\",\n  GRANITE = \"granite\",\n}\n\n/**\n * Model implementation options\n */\nexport enum ModelImpl {\n  /**\n   * “auto” will try to use the vLLM implementation if it exists and fall back to the Transformers\n   * implementation if no vLLM implementation is available.\n   */\n  AUTO = \"auto\",\n  /**\n   * “vllm” will use the vLLM model implementation.\n   */\n  VLLM = \"vllm\",\n  /**\n   * “transformers” will use the Transformers model implementation.\n   */\n  TRANSFORMERS = \"transformers\",\n}\n\n/**\n * Distributed execution backend options\n */\nexport enum DistributedExecutorBackend {\n  RAY = \"ray\",\n  MP = \"mp\",\n  UNI = \"uni\",\n  EXTERNAL_LAUNCHER = \"external_launcher\",\n}\n\n/**\n * Cache block size options in number of tokens\n */\nexport enum BlockSize {\n  SIZE_8 = 8,\n  SIZE_16 = 16,\n  SIZE_32 = 32,\n  SIZE_64 = 64,\n  SIZE_128 = 128,\n}\n\n/**\n * KV cache data type options\n */\nexport enum KvCacheDtype {\n  AUTO = \"auto\",\n  FP8 = \"fp8\",\n  FP8_E4M3 = \"fp8_e4m3\",\n  FP8_E5M2 = \"fp8_e5m2\",\n}\n\n/**\n * Hash algorithm options for prefix caching\n */\nexport enum PrefixCachingHashAlgo {\n  /**\n   * “builtin” is Python’s built-in hash.\n   */\n  BUILTIN = \"builtin\",\n  /**\n   * “sha256” is collision resistant but with certain overheads.\n   */\n  SHA256 = \"sha256\",\n}\n\n/**\n * Quantization methods\n */\nexport enum Quantization {\n  AQLM = \"aqlm\",\n  AWQ = \"awq\",\n  DEEPSPEEDFP = \"deepspeedfp\",\n  TPU_INT8 = \"tpu_int8\",\n  FP8 = \"fp8\",\n  PTPC_FP8 = \"ptpc_fp8\",\n  FBGEMM_FP8 = \"fbgemm_fp8\",\n  MODELOPT = \"modelopt\",\n  NVFP4 = \"nvfp4\",\n  MARLIN = \"marlin\",\n  GGUF = \"gguf\",\n  GPTQ_MARLIN_24 = \"gptq_marlin_24\",\n  GPTQ_MARLIN = \"gptq_marlin\",\n  AWQ_MARLIN = \"awq_marlin\",\n  GPTQ = \"gptq\",\n  COMPRESSED_TENSORS = \"compressed-tensors\",\n  BITSANDBYTES = \"bitsandbytes\",\n  QQQ = \"qqq\",\n  HQQ = \"hqq\",\n  EXPERTS_INT8 = \"experts_int8\",\n  NEURON_QUANT = \"neuron_quant\",\n  IPEX = \"ipex\",\n  QUARK = \"quark\",\n  MOE_WNA16 = \"moe_wna16\",\n  TORCHAO = \"torchao\",\n  NONE = \"None\",\n}\n\n/**\n * Tokenizer mode options\n */\nexport enum TokenizerMode {\n  /**\n   * “auto” will use the fast tokenizer if available.\n   */\n  AUTO = \"auto\",\n  /**\n   * “slow” will always use the slow tokenizer.\n   */\n  SLOW = \"slow\",\n  /**\n   * “mistral” will always use the mistral_common tokenizer.\n   */\n  MISTRAL = \"mistral\",\n  /**\n   * “custom” will use –tokenizer to select the preregistered tokenizer.\n   */\n  CUSTOM = \"custom\",\n}\n\n/**\n * Model config format options\n */\nexport enum ConfigFormat {\n  /**\n   * “auto” will try to load the config in hf format if available else it will try to load in mistral format\n   */\n  AUTO = \"auto\",\n  HF = \"hf\",\n  MISTRAL = \"mistral\",\n}\n\n/**\n * Tokenizer pool type options\n */\nexport enum TokenizerPoolType {\n  RAY = \"ray\",\n}\n\n/**\n * Device type options for vLLM execution\n */\nexport enum Device {\n  AUTO = \"auto\",\n  CUDA = \"cuda\",\n  NEURON = \"neuron\",\n  CPU = \"cpu\",\n  TPU = \"tpu\",\n  XPU = \"xpu\",\n  HPU = \"hpu\",\n}\n\n/**\n * LoRA data type options\n */\nexport enum LoraDtype {\n  AUTO = \"auto\",\n  FLOAT16 = \"float16\",\n  BFLOAT16 = \"bfloat16\",\n}\n\n/**\n * Format options for rendering message content within a chat template\n */\nexport enum ChatTemplateContentFormat {\n  AUTO = \"auto\",\n  STRING = \"string\",\n  OPENAI = \"openai\",\n}\n\n/**\n * Tool call parser options\n */\nexport enum ToolCallParser {\n  GRANITE_20B_FC = \"granite-20b-fc\",\n  GRANITE = \"granite\",\n  HERMES = \"hermes\",\n  INTERNLM = \"internlm\",\n  JAMBA = \"jamba\",\n  LLAMA3_JSON = \"llama3_json\",\n  MISTRAL = \"mistral\",\n  PHI4_MINI_JSON = \"phi4_mini_json\",\n  PYTHONIC = \"pythonic\",\n}\n\n/**\n * Task options for model usage\n */\nexport enum VllmTask {\n  AUTO = \"auto\",\n  GENERATE = \"generate\",\n  EMBEDDING = \"embedding\",\n  EMBED = \"embed\",\n  CLASSIFY = \"classify\",\n  SCORE = \"score\",\n  REWARD = \"reward\",\n  TRANSCRIPTION = \"transcription\",\n}\n\n/**\n * Preemption mode.\n */\nexport enum PreemptionMode {\n  RECOMPUTE = \"recompute\",\n  SWAP = \"swap\",\n}\n\n/**\n * Scheduling policy options\n */\nexport enum SchedulingPolicy {\n  FCFS = \"fcfs\",\n  PRIORITY = \"priority\",\n}\n\n/**\n * VllmNamedArguments\n */\nexport interface VllmNamedArguments {\n  /**\n   * Host name.\n   */\n  readonly host?: string;\n\n  /**\n   * Port number.\n   * @default 8000\n   */\n  readonly port?: number;\n\n  /**\n   * Log level for uvicorn.\n   * @default UvicornLogLevel.INFO\n   */\n  readonly uvicornLogLevel?: UvicornLogLevel;\n\n  /**\n   * Disable uvicorn access log.\n   * @default false\n   */\n  readonly disableUvicornAccessLog?: boolean;\n\n  /**\n   * Allow credentials.\n   * @default false\n   */\n  readonly allowCredentials?: boolean;\n\n  /**\n   * Allowed origins.\n   * @default ['*']\n   */\n  readonly allowedOrigins?: string[];\n\n  /**\n   * Allowed methods.\n   * @default ['*']\n   */\n  readonly allowedMethods?: string[];\n\n  /**\n   * Allowed headers.\n   * @default ['*']\n   */\n  readonly allowedHeaders?: string[];\n\n  /**\n   * If provided, the server will require this key to be presented in the header.\n   */\n  readonly apiKey?: string;\n\n  /**\n   * LoRA module configurations.\n   * @example {\"name\": \"name\", \"path\": \"lora_path\", \"base_model_name\": \"id\"}\n   */\n  readonly loraModules?: { [key: string]: any };\n\n  /**\n   * Prompt adapter configurations in the format name=path. Multiple adapters can be specified.\n   */\n  readonly promptAdapters?: string[];\n\n  /**\n   * The file path to the chat template, or the template in single-line form for the specified model.\n   */\n  readonly chatTemplate?: string;\n\n  /**\n   * The format to render message content within a chat template.\n   * - “string” will render the content as a string.\n   *   - Example: `\"Hello World\"`\n   * - “openai” will render the content as a list of dictionaries, similar to OpenAI schema.\n   *   - Example: `[{\"type\": \"text\", \"text\": \"Hello world!\"}]`\n   * @default ChatTemplateContentFormat.AUTO\n   */\n  readonly chatTemplateContentFormat?: ChatTemplateContentFormat;\n\n  /**\n   * The role name to return if `request.add_generation_prompt=true`.\n   * @default \"assistant\"\n   */\n  readonly responseRole?: string;\n\n  /**\n   * The file path to the SSL key file.\n   */\n  readonly sslKeyfile?: string;\n\n  /**\n   * The file path to the SSL cert file.\n   */\n  readonly sslCertfile?: string;\n\n  /**\n   * The CA certificates file.\n   */\n  readonly sslCaCerts?: string;\n\n  /**\n   * Refresh SSL Context when SSL certificate files change.\n   * @default false\n   */\n  readonly enableSslRefresh?: boolean;\n\n  /**\n   * Whether client certificate is required (see stdlib ssl module's).\n   * @default 0\n   */\n  readonly sslCertReqs?: number;\n\n  /**\n   * FastAPI root_path when app is behind a path based routing proxy.\n   */\n  readonly rootPath?: string;\n\n  /**\n   * Additional ASGI middleware to apply to the app.\n   * We accept multiple –middleware arguments. The value should be an import path.\n   * If a function is provided, vLLM will add it to the server using `@app.middleware('http')`.\n   * If a class is provided, vLLM will add it to the server using `app.add_middleware()`.\n   * @default []\n   */\n  readonly middleware?: string[];\n\n  /**\n   * When `--max-logprobs` is specified,\n   * represents single tokens as strings of the form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be identified..\n   * @default false\n   */\n  readonly returnTokensAsTokenIds?: boolean;\n\n  /**\n   * If specified, will run the OpenAI frontend server in the same process as the model serving engine.\n   * @default false\n   */\n  readonly disableFrontendMultiprocessing?: boolean;\n\n  /**\n   * If specified, API server will add X-Request-Id header to responses.\n   *\n   * Caution: this hurts performance at high QPS.\n   * @default false\n   */\n  readonly enableRequestIdHeaders?: boolean;\n\n  /**\n   * Enable auto tool choice for supported models.\n   * Use `--tool-call-parser` to specify which parser to use.\n   * @default false\n   */\n  readonly enableAutoToolChoice?: boolean;\n\n  /**\n   * Select the tool call parser depending on the model that you’re using.\n   * This is used to parse the model-generated tool call into OpenAI API format.\n   *\n   * Required for `--enable-auto-tool-choice`.\n   */\n  readonly toolCallParser?: ToolCallParser;\n\n  /**\n   * Specify the tool parser plugin.\n   * @default \"\"\n   */\n  readonly toolParserPlugin?: string;\n\n  /**\n   * Name or path of the huggingface model to use.\n   * @default \"facebook/opt-125m\"\n   */\n  readonly model?: string;\n\n  /**\n   * The task to use the model for.\n   * Each vLLM instance only supports one task, even if the same model can be used for multiple tasks.\n   * When the model only supports one task, \"auto\" can be used to select it; otherwise,\n   * you must specify explicitly which task to use.\n   * @default VllmTask.AUTO\n   */\n  readonly task?: VllmTask;\n\n  /**\n   * Name or path of the huggingface tokenizer to use.\n   * If unspecified, model name or path will be used.\n   */\n  readonly tokenizer?: string;\n\n  /**\n   * Name or path of the huggingface config to use.\n   * If unspecified, model name or path will be used.\n   */\n  readonly hfConfigPath?: string;\n\n  /**\n   * Skip initialization of tokenizer and detokenizer.\n   * Expects valid prompt_token_ids and None for prompt from the input.\n   * The generated output will contain token ids.\n   * @default false\n   */\n  readonly skipTokenizerInit?: boolean;\n\n  /**\n   * The specific model version to use. It can be a branch name, a tag name, or a commit id.\n   * If unspecified, will use the default version.\n   */\n  readonly revision?: string;\n\n  /**\n   * The specific revision to use for the model code on Hugging Face Hub.\n   * It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.\n   */\n  readonly codeRevision?: string;\n\n  /**\n   * Revision of the huggingface tokenizer to use.\n   * It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.\n   */\n  readonly tokenizerRevision?: string;\n\n  /**\n   * The tokenizer mode.\n   * @default TokenizerMode.AUTO\n   */\n  readonly tokenizerMode?: TokenizerMode;\n\n  /**\n   * Trust remote code from huggingface.\n   * @default false\n   */\n  readonly trustRemoteCode?: boolean;\n\n  /**\n   * Allowing API requests to read local images or videos from directories specified by the server file system.\n   * This is a security risk. Should only be enabled in trusted environments.\n   */\n  readonly allowedLocalMediaPath?: string;\n\n  /**\n   * The format of the model config to load.\n   * @default ConfigFormat.AUTO\n   */\n  readonly configFormat?: ConfigFormat;\n\n  /**\n   * Data type for model weights and activations.\n   * @default DataType.AUTO\n   */\n  readonly dtype?: DataType;\n\n  /**\n   * Model context length.\n   */\n  readonly maxModelLen?: number;\n\n  /**\n   * Optional regex pattern specifying valid logits processor qualified names that can be passed\n   * with the logits_processors extra completion argument. Defaults to None, which allows no processors.\n   */\n  readonly logitsProcessorPattern?: string;\n\n  /**\n   * Which implementation of the model to use.\n   * @default ModelImpl.AUTO\n   */\n  readonly modelImpl?: ModelImpl;\n\n  /**\n   * Disables sliding window, capping to sliding window size.\n   * @default false\n   */\n  readonly disableSlidingWindow?: boolean;\n\n  /**\n   * Block manager v1 has been removed and SelfAttnBlockSpaceManager (i.e. block manager v2) is now the default.\n   * @default true\n   * @deprecated Setting this flag to True or False has no effect on vLLM behavior.\n   */\n  readonly useV2BlockManager?: boolean;\n\n  /**\n   * Random seed for operations.\n   */\n  readonly seed?: number;\n\n  /**\n   * Max number of log probs to return logprobs is specified in SamplingParams.\n   * @default 20\n   */\n  readonly maxLogprobs?: number;\n\n  /**\n   * Disable logging statistics.\n   * @default false\n   */\n  readonly disableLogStats?: boolean;\n\n  /**\n   * Method used to quantize the weights.\n   * If None, we first check the quantization_config attribute in the model config file.\n   * If that is None, we assume the model weights are not quantized and use dtype to determine the data type of the weights.\n   */\n  readonly quantization?: Quantization;\n\n  /**\n   * RoPE scaling configuration in JSON format.\n   * @example {\"rope_type\":\"dynamic\",\"factor\":2.0}\n   */\n  readonly ropeScaling?: { [key: string]: any };\n\n  /**\n   * RoPE theta. Use with rope_scaling.\n   * In some cases, changing the RoPE theta improves the performance of the scaled model.\n   */\n  readonly ropeTheta?: number;\n\n  /**\n   * The token to use as HTTP bearer authorization for remote files.\n   * If provided, the Secret will be passed as HF_TOKEN secret to compile environment.\n   */\n  readonly hfToken?: Secret;\n\n  /**\n   * Extra arguments for the HuggingFace config.\n   * This should be a object that will be parsed into a dictionary.\n   */\n  readonly hfOverrides?: { [key: string]: any };\n\n  /**\n   * Always use eager-mode PyTorch.\n   * If False, will use eager mode and CUDA graph in hybrid for maximal performance and flexibility.\n   * @default false\n   */\n  readonly enforceEager?: boolean;\n\n  /**\n   * Maximum sequence length covered by CUDA graphs.\n   * When a sequence has context length larger than this, we fall back to eager mode.\n   * Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this,\n   * we fall back to the eager mode.\n   * @default 8192\n   */\n  readonly maxSeqLenToCapture?: number;\n\n  /**\n   * Overrides for the multimodal input mapping/processing, e.g., image processor.\n   * @example {\"num_crops\": 4}\n   */\n  readonly mmProcessorKwargs?: { [key: string]: any };\n\n  /**\n   * If true, then disables caching of the multi-modal preprocessor/mapper. (not recommended)\n   * @default false\n   */\n  readonly disableMmPreprocessorCache?: boolean;\n\n  /**\n   * The pattern(s) to ignore when loading the model.Default to original/**\\/* to avoid\n   * repeated loading of llama’s checkpoints.\n   * @default []\n   */\n  readonly ignorePatterns?: string[];\n\n  /**\n   * The model name(s) used in the API.\n   * If multiple names are provided, the server will respond to any of the provided names.\n   * The model name in the model field of a response will be the first name in this list.\n   * If not specified, the model name will be the same as the `--model` argument.\n   * Noted that this name(s) will also be used in model_name tag content of prometheus metrics,\n   * if multiple names provided, metrics tag will take the first one.\n   */\n  readonly servedModelName?: string[];\n\n  /**\n   * Name or path of the QLoRA adapter.\n   */\n  readonly qloraAdapterNameOrPath?: string;\n\n  /**\n   * Enable deprecated Prometheus metrics that have been hidden since the specified version.\n   * For example, if a previously deprecated metric has been hidden since the v0.7.0 release,\n   * you use –show-hidden-metrics-for-version=0.7 as a temporary escape hatch while you migrate to new metrics.\n   * The metric is likely to be removed completely in an upcoming release.\n   */\n  readonly showHiddenMetricsForVersion?: string;\n\n  /**\n   * Target URL to which OpenTelemetry traces will be sent.\n   */\n  readonly otlpTracesEndpoint?: string;\n\n  /**\n   * Valid choices are model,worker,all.\n   * It makes sense to set this only if --otlp-traces-endpoint is set.\n   * If set, it will collect detailed traces for the specified modules.\n   * This involves use of possibly costly and or blocking operations and hence might have a performance impact.\n   */\n  readonly collectDetailedTraces?: string;\n\n  /**\n   * Disable async output processing. This may result in lower performance.\n   * @default false\n   */\n  readonly disableAsyncOutputProc?: boolean;\n\n  /**\n   * The scheduler class to use.\n   * @default \"vllm.core.scheduler.Scheduler\"\n   */\n  readonly schedulerCls?: string;\n\n  /**\n   * Override or set neuron device configuration.\n   * @example {\"cast_logits_dtype\": \"bloat16\"}\n   */\n  readonly overrideNeuronConfig?: { [key: string]: any };\n\n  /**\n   * Override or set the pooling method for pooling models.\n   * @example {\"pooling_type\": \"mean\", \"normalize\": false}\n   */\n  readonly overridePoolerConfig?: { [key: string]: any };\n\n  /**\n   * torch.compile configuration for the model.\n   * When it is a number (0, 1, 2, 3), it will be interpreted as the optimization level.\n   *\n   * NOTE: level 0 is the default level without any optimization.\n   * level 1 and 2 are for internal testing only. level 3 is the recommended level for production.\n   * To specify the full compilation config, use a JSON string,\n   * e.g. `{\"level\": 3, \"cudagraph_capture_sizes\": [1, 2, 4, 8]}` Following the convention of traditional compilers,\n   * using -O without space is also supported. -O3 is equivalent to -O 3.\n   */\n  readonly compilationConfig?: { [key: string]: any };\n\n  /**\n   * Configurations for distributed KV cache transfer in object.\n   */\n  readonly kvTransferConfig?: { [key: string]: any };\n\n  /**\n   * The worker class to use for distributed execution.\n   * @default \"auto\"\n   */\n  readonly workerCls?: string;\n\n  /**\n   * The worker extension class.\n   * @default \"\"\n   */\n  readonly workerExtensionCls?: string;\n\n  /**\n   * The folder path to the generation config. Defaults to ‘auto’,\n   * the generation config will be loaded from model path. If set to ‘vllm’,\n   * no generation config is loaded, vLLM defaults will be used.\n   * If set to a folder path, the generation config will be loaded from the specified folder path.\n   * If max_new_tokens is specified in generation config,\n   * then it sets a server-wide limit on the number of output tokens for all requests.\n   * @default \"auto\"\n   */\n  readonly generationConfig?: string;\n\n  /**\n   * Overrides or sets generation config.\n   * If used with –generation-config=auto, the override parameters will be merged with the default config from the model.\n   * If generation-config is None, only the override parameters are used.\n   * @example {\"temperature\": 0.5}\n   */\n  readonly overrideGenerationConfig?: { [key: string]: any };\n\n  /**\n   * Enable sleep mode for the engine. (only cuda platform is supported)\n   * @default false\n   */\n  readonly enableSleepMode?: boolean;\n\n  /**\n   * Additional config for specified platform.\n   * Different platforms may support different configs.\n   * Make sure the configs are valid for the platform you are using.\n   * The input format is like ‘{“config_key”:”config_value”}’\n   */\n  readonly additionalConfig?: { [key: string]: any };\n\n  /**\n   * Enable reasoning_content for the model.\n   * @default false\n   */\n  readonly enableReasoning?: boolean;\n\n  /**\n   * Disable cascade attention for V1.\n   * @default false\n   */\n  readonly disableCascadeAttn?: boolean;\n\n  /**\n   * Disable logging requests.\n   * @default false\n   */\n  readonly disableLogRequests?: boolean;\n\n  /**\n   * Max number of prompt characters or prompt ID numbers in log.\n   */\n  readonly maxLogLen?: number;\n\n  /**\n   * Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint.\n   * @default false\n   */\n  readonly disableFastApiDocs?: boolean;\n\n  /**\n   * Enable prompt_tokens_details in usage.\n   * @default false\n   */\n  readonly enablePromptTokensDetails?: boolean;\n\n  /**\n   * Enable tracking server_load_metrics in the app state.\n   * @default false\n   */\n  readonly enableServerLoadTracking?: boolean;\n}\n\n/**\n * Configuration for loading the model weights.\n */\nexport interface VllmLoadConfig {\n  /**\n   * The format of the model weights to load:\n   * - “auto” will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.\n   * - “pt” will load the weights in the pytorch bin format.\n   * - “safetensors” will load the weights in the safetensors format.\n   * - “npcache” will load the weights in pytorch format and store a numpy cache to speed up the loading.\n   * - “dummy” will initialize the weights with random values, which is mainly for profiling.\n   * - “tensorizer” will use CoreWeave’s tensorizer library for fast weight loading. See the Tensorize vLLM Model script in the Examples section for more information.\n   * - “runai_streamer” will load the Safetensors weights using Run:ai Model Streamer.\n   * - “bitsandbytes” will load the weights using bitsandbytes quantization.\n   * - “sharded_state” will load weights from pre-sharded checkpoint files, supporting efficient loading of tensor-parallel models.\n   * - “gguf” will load weights from GGUF format files (details specified in ggml-org/ggml).\n   * - “mistral” will load weights from consolidated safetensors files used by Mistral models.\n   * @default LoadFormat.AUTO\n   */\n  readonly loadFormat?: LoadFormat;\n\n  /**\n   * Directory to download and load the weights, default to the default cache directory of Hugging Face.\n   */\n  readonly downloadDir?: string;\n\n  /**\n   * Extra config for model loader. This will be passed to the model loader corresponding to the chosen load_format.\n   * This should be a object that will be parsed into a dictionary.\n   * @default {}\n   */\n  readonly modelLoaderExtraConfig?: { [key: string]: any };\n\n  /**\n   * Whether to enable tqdm for showing progress bar when loading model weights.\n   * @default true\n   */\n  readonly useTqdmOnLoad?: boolean;\n}\n\n/**\n * Dataclass which contains the decoding strategy of the engine.\n */\nexport interface VllmDecodingConfig {\n  /**\n   * Which engine will be used for guided decoding (JSON schema / regex etc) by default.\n   * With “auto”, we will make opinionated choices based on request contents and what the backend libraries currently support,\n   * so the behavior is subject to change in each release.\n   * @default GuidedDecodingBackend.AUTO\n   */\n  readonly guidedDecodingBackend?: GuidedDecodingBackend;\n\n  /**\n   * Select the reasoning parser depending on the model that you’re using.\n   * This is used to parse the reasoning content into OpenAI API format. Required for –enable-reasoning.\n   */\n  readonly reasoningParser?: ReasoningParser;\n}\n\n/**\n * Configuration for the distributed execution.\n */\nexport interface VllmParallelConfig {\n  /**\n   * Backend to use for distributed model workers, either “ray” or “mp” (multiprocessing).\n   * If the product of pipeline_parallel_size and tensor_parallel_size is less than or equal to the number of GPUs available,\n   * “mp” will be used to keep processing on a single host. Otherwise, this will default to “ray” if Ray is installed and fail otherwise.\n   * Note that tpu and hpu only support Ray for distributed inference.\n   */\n  readonly distributedExecutorBackend?: DistributedExecutorBackend;\n\n  /**\n   * Number of pipeline parallel groups.\n   * @default 1\n   */\n  readonly pipelineParallelSize?: number;\n\n  /**\n   * Number of tensor parallel groups.\n   * @default 1\n   */\n  readonly tensorParallelSize?: number;\n\n  /**\n   * Number of data parallel groups.\n   * MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.\n   * @default 1\n   */\n  readonly dataParallelSize?: number;\n\n  /**\n   * Use expert parallelism instead of tensor parallelism for MoE layers.\n   * @default false\n   */\n  readonly enableExpertParallel?: boolean;\n\n  /**\n   * Maximum number of parallal loading workers when loading model sequentially in multiple batches.\n   * To avoid RAM OOM when using tensor parallel and large models.\n   */\n  readonly maxParallelLoadingWorkers?: number;\n\n  /**\n   * Whether to profile Ray workers with nsight.\n   * @see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler\n   * @default false\n   */\n  readonly rayWorkersUseNsight?: boolean;\n\n  /**\n   * Disable the custom all-reduce kernel and fall back to NCCL.\n   * @default false\n   */\n  readonly disableCustomAllReduce?: boolean;\n}\n\n/**\n * Configuration for the KV cache.\n */\nexport interface VllmCacheConfig {\n  /**\n   * Size of a contiguous cache block in number of tokens.\n   * This is ignored on neuron devices and set to –max-model-len. On CUDA devices, only block sizes up to 32 are supported.\n   * On HPU devices, block size defaults to 128.\n   */\n  readonly blockSize?: BlockSize;\n\n  /**\n   * The fraction of GPU memory to be used for the model executor, which can range from 0 to 1.\n   * For example, a value of 0.5 would imply 50% GPU memory utilization.\n   * If unspecified, will use the default value of 0.9. This is a per-instance limit,\n   * and only applies to the current vLLM instance.\n   * It does not matter if you have another vLLM instance running on the same GPU. For example,\n   * if you have two vLLM instances running on the same GPU, you can set the GPU memory utilization to 0.5 for each instance.\n   * @default 0.9\n   */\n  readonly gpuMemoryUtilization?: number;\n\n  /**\n   * Size of the CPU swap space per GPU (in GiB).\n   * @default 4\n   */\n  readonly swapSpace?: number;\n\n  /**\n   * Data type for kv cache storage. If “auto”, will use model data type.\n   * CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports fp8 (=fp8_e4m3).\n   * @default KvCacheDtype.AUTO\n   */\n  readonly kvCacheDtype?: KvCacheDtype;\n\n  /**\n   * Number of GPU blocks to use. This overrides the profiled num_gpu_blocks if specified.\n   * Does nothing if None. Used for testing preemption.\n   */\n  readonly numGpuBlocksOverride?: number;\n\n  /**\n   * Whether to enable prefix caching. Disabled by default for V0. Enabled by default for V1.\n   */\n  readonly enablePrefixCaching?: boolean;\n\n  /**\n   * Set the hash algorithm for prefix caching.\n   * @default PrefixCachingHashAlgo.BUILTIN\n   */\n  readonly prefixCachingHashAlgo?: PrefixCachingHashAlgo;\n\n  /**\n   * The space in GiB to offload to CPU, per GPU.\n   * Default is 0, which means no offloading. Intuitively,\n   * this argument can be seen as a virtual way to increase the GPU memory size.\n   * For example, if you have one 24 GB GPU and set this to 10, virtually you can think of it as a 34 GB GPU.\n   * Then you can load a 13B model with BF16 weight, which requires at least 26GB GPU memory.\n   *\n   * Note that this requires fast CPU-GPU interconnect,\n   * as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass.\n   * @default 0\n   */\n  readonly cpuOffloadGb?: number;\n\n  /**\n   * This enables dynamic calculation of k_scale and v_scale when kv_cache_dtype is fp8.\n   * If False, the scales will be loaded from the model checkpoint if available. Otherwise, the scales will default to 1.0.\n   * @default false\n   */\n  readonly calculateKvScales?: boolean;\n}\n\n/**\n * Controls the behavior of multimodal models.\n */\nexport interface VllmMultiModalConfig {\n  /**\n   * The maximum number of input items allowed per prompt for each modality.\n   * This should be a object that will be parsed into a dictionary. Defaults to 1 (V0) or 999 (V1) for each modality.\n   * @default {}\n   */\n  readonly limitMmPerPrompt?: { [key: string]: any };\n}\n\n/**\n * Configuration for LoRA.\n */\nexport interface VllmLoraConfig {\n  /**\n   * If True, enable handling of LoRA adapters.\n   * @default false\n   */\n  readonly enableLora?: boolean;\n\n  /**\n   * If True, enable bias for LoRA adapters.\n   * @default false\n   */\n  readonly enableLoraBias?: boolean;\n\n  /**\n   * Max number of LoRAs in a single batch.\n   * @default 1\n   */\n  readonly maxLoras?: number;\n\n  /**\n   * Max LoRA rank.\n   * @default 16\n   */\n  readonly maxLoraRank?: number;\n\n  /**\n   * Maximum size of extra vocabulary that can be present in a LoRA adapter (added to the base model vocabulary).\n   * @default 256\n   */\n  readonly loraExtraVocabSize?: number;\n\n  /**\n   * Data type for LoRA. If auto, will default to base model dtype.\n   * @default LoraDtype.AUTO\n   */\n  readonly loraDtype?: LoraDtype;\n\n  /**\n   * Specify multiple scaling factors (which can be different from base model scaling factorsee eg. Long LoRA)\n   * to allow for multiple LoRA adapters trained with those scaling factors to be used at the same time.\n   * If not specified, only adapters trained with the base model scaling factor are allowed.\n   */\n  readonly longLoraScalingFactors?: number;\n\n  /**\n   * Maximum number of LoRAs to store in CPU memory. Must be >= than max_loras.\n   */\n  readonly maxCpuLoras?: number;\n\n  /**\n   * By default, only half of the LoRA computation is sharded with tensor parallelism.\n   * Enabling this will use the fully sharded layers.\n   * At high sequence length, max rank or tensor parallel size, this is likely faster.\n   * @default false\n   */\n  readonly fullyShardedLoras?: boolean;\n}\n\n/**\n * Configuration for PromptAdapters.\n */\nexport interface VllmPromptAdapterConfig {\n  /**\n   * If True, enable handling of PromptAdapters.\n   * @default false\n   */\n  readonly enablePromptAdapter?: boolean;\n\n  /**\n   * Max number of PromptAdapters in a batch.\n   * @default 1\n   */\n  readonly maxPromptAdapters?: number;\n\n  /**\n   * Max number of PromptAdapters tokens.\n   * @default 0\n   */\n  readonly maxPromptAdapterToken?: number;\n}\n\nexport interface VllmDeviceConfig {\n  /**\n   * Device type for vLLM execution.\n   * @default Device.AUTO\n   */\n  readonly device?: Device;\n}\n\n/**\n * Configuration for speculative decoding.\n */\nexport interface VllmSpeculativeConfig {\n  /**\n   * The configurations for speculative decoding. Should be a object.\n   */\n  readonly speculativeConfig?: { [key: string]: any };\n}\n\nexport interface VllmSchedulerConfig {\n  /**\n   * Maximum number of tokens to be processed in a single iteration.\n   *\n   * This config has no static default. If left unspecified by the user, it will be set in EngineArgs.create_engine_config based on the usage context.\n   */\n  readonly maxNumBatchedTokens?: number;\n\n  /**\n   * Maximum number of sequences to be processed in a single iteration.\n   *\n   * This config has no static default. If left unspecified by the user, it will be set in EngineArgs.create_engine_config based on the usage context.\n   */\n  readonly maxNumSeqs?: number;\n\n  /**\n   * For chunked prefill, the maximum number of sequences that can be partially prefilled concurrently.\n   * @default 1\n   */\n  readonly maxNumPartialPrefills?: number;\n\n  /**\n   * For chunked prefill, the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently.\n   * Setting this less than max_num_partial_prefills will allow shorter prompts to jump the queue in front of longer prompts in some cases, improving latency.\n   * @default 1\n   */\n  readonly maxLongPartialPrefills?: number;\n\n  /**\n   * For chunked prefill, a request is considered long if the prompt is longer than this number of tokens.\n   * @default 0\n   */\n  readonly longPrefillTokenThreshold?: number;\n\n  /**\n   * The number of slots to allocate per sequence per step,\n   * beyond the known token ids. This is used in speculative decoding to store KV activations of tokens\n   * which may or may not be accepted.\n   *\n   * NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.\n   * @default 0\n   */\n  readonly numLookaheadSlots?: number;\n\n  /**\n   * Apply a delay (of delay factor multiplied by previous prompt latency) before scheduling next prompt.\n   * @default 0.0\n   */\n  readonly schedulerDelayFactor?: number;\n\n  /**\n   * Whether to perform preemption by swapping or recomputation.\n   * If not specified, we determine the mode as follows:\n   * We use recomputation by default since it incurs lower overhead than swapping.\n   * However, when the sequence group has multiple sequences (e.g., beam search),\n   * recomputation is not currently supported. In such a case, we use swapping instead.\n   */\n  readonly preemptionMode?: PreemptionMode;\n\n  /**\n   * Maximum number of forward steps per scheduler call.\n   * @default 1\n   */\n  readonly numSchedulerSteps?: number;\n\n  /**\n   * If False, then multi-step will stream outputs at the end of all steps\n   * @default true\n   */\n  readonly multiStepStreamOutputs?: boolean;\n\n  /**\n   * The scheduling policy to use:\n   * - “fcfs” means first come first served, i.e. requests are handled in order of arrival.\n   * - “priority” means requests are handled based on given priority (lower value means earlier handling) and time of arrival deciding any ties).\n   * @default SchedulingPolicy.FCFS\n   */\n  readonly schedulingPolicy?: SchedulingPolicy;\n\n  /**\n   * If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.\n   */\n  readonly enableChunkedPrefill?: boolean;\n\n  /**\n   * If set to true and chunked prefill is enabled, we do not want to partially schedule a multimodal item.\n   * Only used in V1 This ensures that if a request has a mixed prompt (like text tokens TTTT followed by image tokens IIIIIIIIII)\n   * where only some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),\n   * it will be scheduled as TTTT in one step and IIIIIIIIII in the next.\n   * @default false\n   */\n  readonly disableChunkedMmInput?: boolean;\n}\n\n/**\n * Interface for vLLM server command line arguments\n */\nexport interface VllmEngineArguments\n  extends\n    VllmNamedArguments,\n    VllmLoadConfig,\n    VllmDecodingConfig,\n    VllmParallelConfig,\n    VllmCacheConfig,\n    VllmMultiModalConfig,\n    VllmLoraConfig,\n    VllmPromptAdapterConfig,\n    VllmDeviceConfig,\n    VllmSpeculativeConfig,\n    VllmSchedulerConfig {}\n\nconst jsonValueProperties: (keyof VllmEngineArguments)[] = [\n  \"allowedOrigins\",\n  \"allowedHeaders\",\n  \"allowedMethods\",\n];\nconst ignoreKeys: (keyof VllmEngineArguments)[] = [\"model\", \"hfToken\"];\n\nexport abstract class VllmEngineArgumentsParser {\n  /**\n   * Convert vLLM engine arguments (camel case) to config (kebab case)\n   * @param args vLLM engine arguments\n   * @returns vLLM engine config\n   * @see https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#configuration-file\n   */\n  static config(args: VllmEngineArguments) {\n    return Object.entries(args)\n      .filter(([key]) => !ignoreKeys.includes(key as keyof VllmEngineArguments))\n      .reduce<{ [key in string]: any }>((prev, [key, value]) => {\n        const k = key.replace(/[A-Z]/g, (match) => `-${match.toLowerCase()}`);\n        if (\n          jsonValueProperties.includes(key as keyof VllmEngineArguments) ||\n          (!Array.isArray(value) && typeof value === \"object\")\n        ) {\n          value = JSON.stringify(value);\n        }\n        prev[k] = value;\n        return prev;\n      }, {});\n  }\n  static cli(args: VllmEngineArguments) {\n    const generalArgs = Object.entries(args)\n      .filter(([key]) => !ignoreKeys.includes(key as keyof VllmEngineArguments))\n      .flatMap(([k, value]) => {\n        const key = k.replace(/[A-Z]/g, (match) => `-${match.toLowerCase()}`);\n        if (typeof value === \"boolean\") {\n          return value ? [`--${key}`] : [];\n        }\n        if (Array.isArray(value)) {\n          return [`--${key}`, ...value.map((v) => `${v}`)];\n        }\n        if (typeof value === \"object\") {\n          return [`--${key}`, JSON.stringify(value)];\n        }\n        return [`--${key}`, `${value}`];\n      });\n    return [args.model!, ...generalArgs];\n  }\n}\n"]}
|
|
@@ -31,7 +31,7 @@ class VllmNxdInferenceCompileImage extends vllm_nxd_inference_ecs_patterns_1.Vll
|
|
|
31
31
|
}
|
|
32
32
|
exports.VllmNxdInferenceCompileImage = VllmNxdInferenceCompileImage;
|
|
33
33
|
_a = JSII_RTTI_SYMBOL_1;
|
|
34
|
-
VllmNxdInferenceCompileImage[_a] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceCompileImage", version: "0.
|
|
34
|
+
VllmNxdInferenceCompileImage[_a] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceCompileImage", version: "0.2.1" };
|
|
35
35
|
/**
|
|
36
36
|
* Neuronx compiler construct for vLLM on NxD Inference.
|
|
37
37
|
* Compile the model to work with Neuronx instance and upload it to an S3 bucket.
|
|
@@ -63,6 +63,7 @@ class VllmNxdInferenceCompiler extends constructs_1.Construct {
|
|
|
63
63
|
}
|
|
64
64
|
const tensorParallelSize = availableInstancePatterns[0].tp;
|
|
65
65
|
const image = props.image ?? new VllmNxdInferenceCompileImage(this, "CompileImage");
|
|
66
|
+
const blockSize = props.vllmArgs?.blockSize ?? vllm_engine_1.BlockSize.SIZE_32;
|
|
66
67
|
const vllmArgs = {
|
|
67
68
|
...props.vllmArgs,
|
|
68
69
|
model: props.model.modelId,
|
|
@@ -70,6 +71,7 @@ class VllmNxdInferenceCompiler extends constructs_1.Construct {
|
|
|
70
71
|
maxModelLen,
|
|
71
72
|
maxNumSeqs,
|
|
72
73
|
tensorParallelSize,
|
|
74
|
+
blockSize,
|
|
73
75
|
};
|
|
74
76
|
// change dirname every engine args patterns
|
|
75
77
|
const hash = (str) => (0, crypto_1.createHash)("sha256").update(str).digest("hex");
|
|
@@ -124,5 +126,5 @@ class VllmNxdInferenceCompiler extends constructs_1.Construct {
|
|
|
124
126
|
}
|
|
125
127
|
exports.VllmNxdInferenceCompiler = VllmNxdInferenceCompiler;
|
|
126
128
|
_b = JSII_RTTI_SYMBOL_1;
|
|
127
|
-
VllmNxdInferenceCompiler[_b] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceCompiler", version: "0.
|
|
128
|
-
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"vllm-nxd-inference-compiler.js","sourceRoot":"","sources":["../../src/vllm-nxd-inference/vllm-nxd-inference-compiler.ts"],"names":[],"mappings":";;;;;AAAA,kEAAiE;AACjE,6CAAmC;AAKnC,2CAAuC;AACvC,mCAAoC;AACpC,+BAA4B;AAC5B,6CAYyB;AACzB,+DAIkC;AAClC,mEAG2C;AAC3C,uFAAiF;AAEjF;;GAEG;AACH,MAAa,4BAA6B,SAAQ,8DAA4B;IAK5E,YACE,KAAgB,EAChB,EAAU,EACV,yBAAsD;QAEtD,yBAAyB,KAAzB,yBAAyB,GAAK,mCAAyB,CAAC,MAAM,EAAC;QAC/D,KAAK,CAAC,yBAAyB,CAAC,CAAC;QACjC,MAAM,KAAK,GAAG,IAAI,uCAAmB,CAAC,KAAK,EAAE,EAAE,EAAE;YAC/C,SAAS,EAAE,IAAA,WAAI,EAAC,SAAS,EAAE,0CAA0C,CAAC;YACtE,SAAS,EAAE;gBACT,UAAU,EAAE,yBAAyB,CAAC,SAAS;gBAC/C,SAAS,EAAE,yBAAyB,CAAC,QAAQ;aAC9C;SACF,CAAC,CAAC;QACH,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,oBAAoB,EAAE,CAAC;IAC5C,CAAC;;AApBH,oEAqBC;;;AAsED;;;GAGG;AACH,MAAa,wBAAyB,SAAQ,sBAAS;IAGrD,YACE,KAAgB,EAChB,EAAU,EACV,KAAmC;QAEnC,KAAK,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACjB,MAAM,WAAW,GAAG,KAAK,CAAC,QAAQ,EAAE,WAAW,IAAI,GAAG,CAAC;QACvD,MAAM,UAAU,GAAG,KAAK,CAAC,QAAQ,EAAE,UAAU,IAAI,CAAC,CAAC;QACnD,MAAM,eAAe,GAAG,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM;YAChD,CAAC,CAAC,IAAA,6BAAmB,EACjB,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,kBAAkB,EAC7C,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,EACjC,sBAAY,CAAC,YAAY,EACzB,WAAW,EACX,UAAU,CACX;YACH,CAAC,CAAC,IAAA,4CAAkC,EAChC,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,EAC9B,WAAW,EACX,UAAU,CACX,CAAC;QACN,MAAM,oBAAoB,GAAG,KAAK,CAAC,mBAAmB;YACpD,CAAC,CAAC,CAAC,KAAK,CAAC,mBAAmB,CAAC;YAC7B,CAAC,CAAC;gBACE,6BAAmB,CAAC,YAAY;gBAChC,6BAAmB,CAAC,aAAa;gBACjC,6BAAmB,CAAC,aAAa;aAClC,CAAC;QACN,MAAM,yBAAyB,GAAG,oBAAoB;aACnD,OAAO,CAAC,CAAC,mBAAmB,EAAE,EAAE,CAC/B,IAAA,4BAAkB,EAChB,mBAAmB,EACnB,eAAe,EACf,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,cAAc,CAC3C,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACZ,mBAAmB;YACnB,GAAG,CAAC;SACL,CAAC,CAAC,CACJ;aACA,MAAM,CACL,CAAC,YAAY,EAAE,EAAE,CACf,CAAC,KAAK,CAAC,QAAQ,EAAE,kBAAkB;YACnC,YAAY,CAAC,EAAE,KAAK,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CACxD,CAAC;QACJ,IAAI,yBAAyB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3C,MAAM,IAAI,KAAK,CACb,kEAAkE,yBAAyB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAC3H,CAAC;QACJ,CAAC;QACD,MAAM,kBAAkB,GAAG,yBAAyB,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3D,MAAM,KAAK,GACT,KAAK,CAAC,KAAK,IAAI,IAAI,4BAA4B,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;QACxE,MAAM,QAAQ,GAAG;YACf,GAAG,KAAK,CAAC,QAAQ;YACjB,KAAK,EAAE,KAAK,CAAC,KAAK,CAAC,OAAO;YAC1B,eAAe,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,SAAS,CAAC;YACxC,WAAW;YACX,UAAU;YACV,kBAAkB;SACW,CAAC;QAEhC,4CAA4C;QAC5C,MAAM,IAAI,GAAG,CAAC,GAAW,EAAE,EAAE,CAC3B,IAAA,mBAAU,EAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACjD,MAAM,gBAAgB,GAAG,OAAO,KAAK,CAAC,gBAAgB,IAAI,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;QAC3F,MAAM,WAAW,GAAG,uCAAyB,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAC5D,MAAM,WAAW,GAA2B;YAC1C,GAAG,KAAK,CAAC,WAAW;YACpB,qBAAqB,EAAE,+BAA+B;YACtD,yBAAyB,EAAE,2BAA2B;YACtD,mBAAmB,EAAE,kBAAkB,CAAC,QAAQ,EAAE;YAClD,yBAAyB,EAAE,GAAG;YAC9B,QAAQ,EAAE,KAAK,CAAC,KAAK,CAAC,OAAO;YAC7B,UAAU,EAAE,KAAK,CAAC,KAAK,CAAC,SAAS;YACjC,yBAAyB,EAAE,KAAK,CAAC,MAAM,CAAC,cAAc,CAAC,gBAAgB,CAAC;YACxE,4BAA4B,EAAE,gBAAgB;SAC/C,CAAC;QACF,MAAM,OAAO,GAAoC,EAAE,CAAC;QACpD,IAAI,KAAK,CAAC,QAAQ,EAAE,OAAO,EAAE,CAAC;YAC5B,OAAO,CAAC,QAAQ,GAAG,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC;QAC5C,CAAC;QACD,MAAM,UAAU,GAAG,kBAAI,CAAC,SAAS,CAC/B,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,SAAS,EAAE,GAAG,GAAG,CACjD,CAAC;QACF,MAAM,kBAAkB,GAAG,KAAK,CAAC,QAAQ,EAAE,oBAAoB,EAAE,SAAS;YACxE,CAAC,CAAC,UAAU;YACZ,CAAC,CAAC,kBAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAClB,MAAM,UAAU,GAAG,kBAAI,CAAC,SAAS,CAC/B,IAAI,CAAC,IAAI,CACP,UAAU,CAAC,WAAW,EAAE;YACtB,kBAAkB,CAAC,WAAW,EAAE;YAChC,qCAA2B,CAAC,IAAI,CAAC,WAAW,EAAE;YAC9C,qCAA2B,CAAC,IAAI,CAAC,WAAW,EAAE,CACjD,CACF,CAAC;QAEF,MAAM,QAAQ,GAAG,IAAI,kCAAe,CAAC,IAAI,EAAE,UAAU,EAAE;YACrD,GAAG,KAAK;YACR,mBAAmB,EAAE,yBAAyB,CAAC,CAAC,CAAC,CAAC,mBAAmB;YACrE,gBAAgB;YAChB,KAAK,EAAE,KAAK;YACZ,OAAO,EAAE,WAAW;YACpB,WAAW;YACX,OAAO;YACP,UAAU;SACX,CAAC,CAAC;QACH,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IACD;;;OAGG;IACH,OAAO;QACL,OAAO;YACL,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE;YAC1B,QAAQ,EAAE,IAAI,CAAC,QAAQ;SACgB,CAAC;IAC5C,CAAC;;AAzHH,4DA0HC","sourcesContent":["import { ContainerImageBuild } from \"@cdklabs/deploy-time-build\";\nimport { Size } from \"aws-cdk-lib\";\nimport * as batch from \"aws-cdk-lib/aws-batch\";\nimport { IVpc, SubnetSelection } from \"aws-cdk-lib/aws-ec2\";\nimport { ContainerImage } from \"aws-cdk-lib/aws-ecs\";\nimport { IBucket } from \"aws-cdk-lib/aws-s3\";\nimport { Construct } from \"constructs\";\nimport { createHash } from \"crypto\";\nimport { join } from \"path\";\nimport {\n  calcMemoryFootprint,\n  calcTensorParallel,\n  DataTypeBits,\n  INeuronxInstanceType,\n  inferMemoryFootprintFromParameters,\n  IVllmInferenceNeuronxImage,\n  Model,\n  NeuronOptimizedMachineImage,\n  NeuronxInstanceType,\n  PytorchTrainingNeuronxImage,\n  VllmInferenceNeuronxImage,\n} from \"../base/neuronx\";\nimport {\n  INeuronxContainerImage,\n  NeuronxCompiledModel,\n  NeuronxCompiler,\n} from \"../base/neuronx-compiler\";\nimport {\n  VllmEngineArguments,\n  VllmEngineArgumentsParser,\n} from \"../base/server-engine/vllm-engine\";\nimport { VllmNxdInferenceEcsImageBase } from \"./vllm-nxd-inference-ecs-patterns\";\n\n/**\n * Compile runtime container image for vLLM NxD Inference\n */\nexport class VllmNxdInferenceCompileImage extends VllmNxdInferenceEcsImageBase {\n  /**\n   * The container image.\n   */\n  readonly image: ContainerImage;\n  constructor(\n    scope: Construct,\n    id: string,\n    vllmInferenceNeuronxImage?: IVllmInferenceNeuronxImage,\n  ) {\n    vllmInferenceNeuronxImage ??= VllmInferenceNeuronxImage.LATEST;\n    super(vllmInferenceNeuronxImage);\n    const build = new ContainerImageBuild(scope, id, {\n      directory: join(__dirname, \"../../scripts/compile/vllm-nxd-inference\"),\n      buildArgs: {\n        IMAGE_NAME: vllmInferenceNeuronxImage.imageName,\n        IMAGE_TAG: vllmInferenceNeuronxImage.imageTag,\n      },\n    });\n    this.image = build.toEcsDockerImageCode();\n  }\n}\n\n/**\n * Props of VllmNxdInferenceCompiler.\n */\nexport interface VllmNxdInferenceCompileProps {\n  /**\n   * VPC in which this will launch compile worker instance.\n   */\n  readonly vpc: IVpc;\n  /**\n   * The bucket to upload compiled artifacts.\n   */\n  readonly bucket: IBucket;\n  /**\n   * The instance type of compile worker instance.\n   */\n  readonly neuronxInstanceType?: INeuronxInstanceType;\n  /**\n   * The model to be compiled.\n   */\n  readonly model: Model;\n  /**\n   * The root volume of worker instance.\n   * @default - N bilion parameters * 5GiB EBS\n   */\n  readonly volumeSize?: Size;\n  /**\n   * Whether or not to use spot instances. Spot instances are less expensive EC2 instances that can be reclaimed by EC2 at any time; your job will be given two minutes of notice before reclamation.\n   *\n   * @default false\n   */\n  readonly spot?: boolean;\n  /**\n   * The VPC Subnets this Compute Environment will launch instances in.\n   *\n   * @default - new subnets will be created\n   */\n  readonly vpcSubnets?: SubnetSelection;\n  /**\n   * The environment variables to pass to the container.\n   * This is only applicable when using container runtime.\n   *\n   * @default - No environment variables.\n   */\n  readonly environment?: {\n    [key: string]: string;\n  };\n  /**\n   * The arguments to pass to the vllm engine.\n   * @default - no specific values. use default values.\n   */\n  readonly vllmArgs?: VllmEngineArguments;\n  /**\n   * An image of the container where the compile job is executed.\n   * @default - latest image\n   */\n  readonly image?: INeuronxContainerImage;\n}\n\n/**\n * The model compiled by Neuronx compiler.\n */\nexport interface VllmNxdInferenceCompiledModel extends NeuronxCompiledModel {\n  /**\n   * Passed to the vllm engine at compile time.\n   */\n  readonly vllmArgs: VllmEngineArguments;\n}\n\n/**\n * Neuronx compiler construct for vLLM on NxD Inference.\n * Compile the model to work with Neuronx instance and upload it to an S3 bucket.\n */\nexport class VllmNxdInferenceCompiler extends Construct {\n  private readonly vllmArgs: VllmEngineArguments;\n  private readonly compiler: NeuronxCompiler;\n  constructor(\n    scope: Construct,\n    id: string,\n    props: VllmNxdInferenceCompileProps,\n  ) {\n    super(scope, id);\n    const maxModelLen = props.vllmArgs?.maxModelLen ?? 128;\n    const maxNumSeqs = props.vllmArgs?.maxNumSeqs ?? 1;\n    const memoryFootprint = props.model.options.config\n      ? calcMemoryFootprint(\n          props.model.options.config.embeddingDimension,\n          props.model.options.config.layers,\n          DataTypeBits.BF16_OR_FP16,\n          maxModelLen,\n          maxNumSeqs,\n        )\n      : inferMemoryFootprintFromParameters(\n          props.model.options.parameters,\n          maxModelLen,\n          maxNumSeqs,\n        );\n    const neuronxInstanceTypes = props.neuronxInstanceType\n      ? [props.neuronxInstanceType]\n      : [\n          NeuronxInstanceType.INF2_8XLARGE,\n          NeuronxInstanceType.INF2_24XLARGE,\n          NeuronxInstanceType.INF2_48XLARGE,\n        ];\n    const availableInstancePatterns = neuronxInstanceTypes\n      .flatMap((neuronxInstanceType) =>\n        calcTensorParallel(\n          neuronxInstanceType,\n          memoryFootprint,\n          props.model.options.config?.attentionHeads,\n        ).map((v) => ({\n          neuronxInstanceType,\n          ...v,\n        })),\n      )\n      .filter(\n        (instanceType) =>\n          !props.vllmArgs?.tensorParallelSize ||\n          instanceType.tp === props.vllmArgs.tensorParallelSize,\n      );\n    if (availableInstancePatterns.length === 0) {\n      throw new Error(\n        `No available instance type. You can use tensorParallelSize are ${availableInstancePatterns.map((p) => p.tp).join(\", \")}.`,\n      );\n    }\n    const tensorParallelSize = availableInstancePatterns[0].tp;\n    const image =\n      props.image ?? new VllmNxdInferenceCompileImage(this, \"CompileImage\");\n    const vllmArgs = {\n      ...props.vllmArgs,\n      model: props.model.modelId,\n      servedModelName: [props.model.modelName],\n      maxModelLen,\n      maxNumSeqs,\n      tensorParallelSize,\n    } satisfies VllmEngineArguments;\n\n    // change dirname every engine args patterns\n    const hash = (str: string) =>\n      createHash(\"sha256\").update(str).digest(\"hex\");\n    const artifactS3Prefix = `sdk-${image.neuronSdkVersion}/${hash(JSON.stringify(vllmArgs))}`;\n    const vllmCliArgs = VllmEngineArgumentsParser.cli(vllmArgs);\n    const environment: Record<string, string> = {\n      ...props.environment,\n      VLLM_NEURON_FRAMEWORK: \"neuronx-distributed-inference\",\n      NEURON_COMPILED_ARTIFACTS: \"neuron-compiled-artifacts\",\n      NEURON_RT_NUM_CORES: tensorParallelSize.toString(),\n      XLA_HANDLE_SPECIAL_SCALAR: \"1\",\n      MODEL_ID: props.model.modelId,\n      MODEL_NAME: props.model.modelName,\n      COMPILED_ARTIFACTS_S3_URI: props.bucket.s3UrlForObject(artifactS3Prefix),\n      COMPILED_ARTIFACTS_S3_PREFIX: artifactS3Prefix,\n    };\n    const secrets: { [key: string]: batch.Secret } = {};\n    if (props.vllmArgs?.hfToken) {\n      secrets.HF_TOKEN = props.vllmArgs.hfToken;\n    }\n    const weightSize = Size.gibibytes(\n      props.model.options.parameters.toBillion() * 2.5,\n    );\n    const quantizedWightSize = props.vllmArgs?.overrideNeuronConfig?.quantized\n      ? weightSize\n      : Size.bytes(0);\n    const volumeSize = Size.gibibytes(\n      Math.ceil(\n        weightSize.toGibibytes() +\n          quantizedWightSize.toGibibytes() +\n          PytorchTrainingNeuronxImage.size.toGibibytes() +\n          NeuronOptimizedMachineImage.size.toGibibytes(),\n      ),\n    );\n\n    const compiler = new NeuronxCompiler(this, \"Resource\", {\n      ...props,\n      neuronxInstanceType: availableInstancePatterns[0].neuronxInstanceType,\n      artifactS3Prefix,\n      image: image,\n      command: vllmCliArgs,\n      environment,\n      secrets,\n      volumeSize,\n    });\n    this.vllmArgs = vllmArgs;\n    this.compiler = compiler;\n  }\n  /**\n   * Compile the model and return the compiled model.\n   * @returns The compiled model.\n   */\n  compile(): VllmNxdInferenceCompiledModel {\n    return {\n      ...this.compiler.compile(),\n      vllmArgs: this.vllmArgs,\n    } satisfies VllmNxdInferenceCompiledModel;\n  }\n}\n"]}
|
|
129
|
+
VllmNxdInferenceCompiler[_b] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceCompiler", version: "0.2.1" };
|
|
130
|
+
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"vllm-nxd-inference-compiler.js","sourceRoot":"","sources":["../../src/vllm-nxd-inference/vllm-nxd-inference-compiler.ts"],"names":[],"mappings":";;;;;AAAA,kEAAiE;AACjE,6CAAmC;AAKnC,2CAAuC;AACvC,mCAAoC;AACpC,+BAA4B;AAC5B,6CAYyB;AACzB,+DAIkC;AAClC,mEAI2C;AAC3C,uFAAiF;AAEjF;;GAEG;AACH,MAAa,4BAA6B,SAAQ,8DAA4B;IAK5E,YACE,KAAgB,EAChB,EAAU,EACV,yBAAsD;QAEtD,yBAAyB,KAAzB,yBAAyB,GAAK,mCAAyB,CAAC,MAAM,EAAC;QAC/D,KAAK,CAAC,yBAAyB,CAAC,CAAC;QACjC,MAAM,KAAK,GAAG,IAAI,uCAAmB,CAAC,KAAK,EAAE,EAAE,EAAE;YAC/C,SAAS,EAAE,IAAA,WAAI,EAAC,SAAS,EAAE,0CAA0C,CAAC;YACtE,SAAS,EAAE;gBACT,UAAU,EAAE,yBAAyB,CAAC,SAAS;gBAC/C,SAAS,EAAE,yBAAyB,CAAC,QAAQ;aAC9C;SACF,CAAC,CAAC;QACH,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,oBAAoB,EAAE,CAAC;IAC5C,CAAC;;AApBH,oEAqBC;;;AAsED;;;GAGG;AACH,MAAa,wBAAyB,SAAQ,sBAAS;IAGrD,YACE,KAAgB,EAChB,EAAU,EACV,KAAmC;QAEnC,KAAK,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACjB,MAAM,WAAW,GAAG,KAAK,CAAC,QAAQ,EAAE,WAAW,IAAI,GAAG,CAAC;QACvD,MAAM,UAAU,GAAG,KAAK,CAAC,QAAQ,EAAE,UAAU,IAAI,CAAC,CAAC;QACnD,MAAM,eAAe,GAAG,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM;YAChD,CAAC,CAAC,IAAA,6BAAmB,EACjB,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,kBAAkB,EAC7C,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,EACjC,sBAAY,CAAC,YAAY,EACzB,WAAW,EACX,UAAU,CACX;YACH,CAAC,CAAC,IAAA,4CAAkC,EAChC,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,EAC9B,WAAW,EACX,UAAU,CACX,CAAC;QACN,MAAM,oBAAoB,GAAG,KAAK,CAAC,mBAAmB;YACpD,CAAC,CAAC,CAAC,KAAK,CAAC,mBAAmB,CAAC;YAC7B,CAAC,CAAC;gBACE,6BAAmB,CAAC,YAAY;gBAChC,6BAAmB,CAAC,aAAa;gBACjC,6BAAmB,CAAC,aAAa;aAClC,CAAC;QACN,MAAM,yBAAyB,GAAG,oBAAoB;aACnD,OAAO,CAAC,CAAC,mBAAmB,EAAE,EAAE,CAC/B,IAAA,4BAAkB,EAChB,mBAAmB,EACnB,eAAe,EACf,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,cAAc,CAC3C,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACZ,mBAAmB;YACnB,GAAG,CAAC;SACL,CAAC,CAAC,CACJ;aACA,MAAM,CACL,CAAC,YAAY,EAAE,EAAE,CACf,CAAC,KAAK,CAAC,QAAQ,EAAE,kBAAkB;YACnC,YAAY,CAAC,EAAE,KAAK,KAAK,CAAC,QAAQ,CAAC,kBAAkB,CACxD,CAAC;QACJ,IAAI,yBAAyB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3C,MAAM,IAAI,KAAK,CACb,kEAAkE,yBAAyB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAC3H,CAAC;QACJ,CAAC;QACD,MAAM,kBAAkB,GAAG,yBAAyB,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QAC3D,MAAM,KAAK,GACT,KAAK,CAAC,KAAK,IAAI,IAAI,4BAA4B,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;QACxE,MAAM,SAAS,GAAG,KAAK,CAAC,QAAQ,EAAE,SAAS,IAAI,uBAAS,CAAC,OAAO,CAAC;QACjE,MAAM,QAAQ,GAAG;YACf,GAAG,KAAK,CAAC,QAAQ;YACjB,KAAK,EAAE,KAAK,CAAC,KAAK,CAAC,OAAO;YAC1B,eAAe,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,SAAS,CAAC;YACxC,WAAW;YACX,UAAU;YACV,kBAAkB;YAClB,SAAS;SACoB,CAAC;QAEhC,4CAA4C;QAC5C,MAAM,IAAI,GAAG,CAAC,GAAW,EAAE,EAAE,CAC3B,IAAA,mBAAU,EAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACjD,MAAM,gBAAgB,GAAG,OAAO,KAAK,CAAC,gBAAgB,IAAI,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC,EAAE,CAAC;QAC3F,MAAM,WAAW,GAAG,uCAAyB,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;QAC5D,MAAM,WAAW,GAA2B;YAC1C,GAAG,KAAK,CAAC,WAAW;YACpB,qBAAqB,EAAE,+BAA+B;YACtD,yBAAyB,EAAE,2BAA2B;YACtD,mBAAmB,EAAE,kBAAkB,CAAC,QAAQ,EAAE;YAClD,yBAAyB,EAAE,GAAG;YAC9B,QAAQ,EAAE,KAAK,CAAC,KAAK,CAAC,OAAO;YAC7B,UAAU,EAAE,KAAK,CAAC,KAAK,CAAC,SAAS;YACjC,yBAAyB,EAAE,KAAK,CAAC,MAAM,CAAC,cAAc,CAAC,gBAAgB,CAAC;YACxE,4BAA4B,EAAE,gBAAgB;SAC/C,CAAC;QACF,MAAM,OAAO,GAAoC,EAAE,CAAC;QACpD,IAAI,KAAK,CAAC,QAAQ,EAAE,OAAO,EAAE,CAAC;YAC5B,OAAO,CAAC,QAAQ,GAAG,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC;QAC5C,CAAC;QACD,MAAM,UAAU,GAAG,kBAAI,CAAC,SAAS,CAC/B,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,SAAS,EAAE,GAAG,GAAG,CACjD,CAAC;QACF,MAAM,kBAAkB,GAAG,KAAK,CAAC,QAAQ,EAAE,oBAAoB,EAAE,SAAS;YACxE,CAAC,CAAC,UAAU;YACZ,CAAC,CAAC,kBAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;QAClB,MAAM,UAAU,GAAG,kBAAI,CAAC,SAAS,CAC/B,IAAI,CAAC,IAAI,CACP,UAAU,CAAC,WAAW,EAAE;YACtB,kBAAkB,CAAC,WAAW,EAAE;YAChC,qCAA2B,CAAC,IAAI,CAAC,WAAW,EAAE;YAC9C,qCAA2B,CAAC,IAAI,CAAC,WAAW,EAAE,CACjD,CACF,CAAC;QAEF,MAAM,QAAQ,GAAG,IAAI,kCAAe,CAAC,IAAI,EAAE,UAAU,EAAE;YACrD,GAAG,KAAK;YACR,mBAAmB,EAAE,yBAAyB,CAAC,CAAC,CAAC,CAAC,mBAAmB;YACrE,gBAAgB;YAChB,KAAK,EAAE,KAAK;YACZ,OAAO,EAAE,WAAW;YACpB,WAAW;YACX,OAAO;YACP,UAAU;SACX,CAAC,CAAC;QACH,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IACD;;;OAGG;IACH,OAAO;QACL,OAAO;YACL,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE;YAC1B,QAAQ,EAAE,IAAI,CAAC,QAAQ;SACgB,CAAC;IAC5C,CAAC;;AA3HH,4DA4HC","sourcesContent":["import { ContainerImageBuild } from \"@cdklabs/deploy-time-build\";\nimport { Size } from \"aws-cdk-lib\";\nimport * as batch from \"aws-cdk-lib/aws-batch\";\nimport { IVpc, SubnetSelection } from \"aws-cdk-lib/aws-ec2\";\nimport { ContainerImage } from \"aws-cdk-lib/aws-ecs\";\nimport { IBucket } from \"aws-cdk-lib/aws-s3\";\nimport { Construct } from \"constructs\";\nimport { createHash } from \"crypto\";\nimport { join } from \"path\";\nimport {\n  calcMemoryFootprint,\n  calcTensorParallel,\n  DataTypeBits,\n  INeuronxInstanceType,\n  inferMemoryFootprintFromParameters,\n  IVllmInferenceNeuronxImage,\n  Model,\n  NeuronOptimizedMachineImage,\n  NeuronxInstanceType,\n  PytorchTrainingNeuronxImage,\n  VllmInferenceNeuronxImage,\n} from \"../base/neuronx\";\nimport {\n  INeuronxContainerImage,\n  NeuronxCompiledModel,\n  NeuronxCompiler,\n} from \"../base/neuronx-compiler\";\nimport {\n  BlockSize,\n  VllmEngineArguments,\n  VllmEngineArgumentsParser,\n} from \"../base/server-engine/vllm-engine\";\nimport { VllmNxdInferenceEcsImageBase } from \"./vllm-nxd-inference-ecs-patterns\";\n\n/**\n * Compile runtime container image for vLLM NxD Inference\n */\nexport class VllmNxdInferenceCompileImage extends VllmNxdInferenceEcsImageBase {\n  /**\n   * The container image.\n   */\n  readonly image: ContainerImage;\n  constructor(\n    scope: Construct,\n    id: string,\n    vllmInferenceNeuronxImage?: IVllmInferenceNeuronxImage,\n  ) {\n    vllmInferenceNeuronxImage ??= VllmInferenceNeuronxImage.LATEST;\n    super(vllmInferenceNeuronxImage);\n    const build = new ContainerImageBuild(scope, id, {\n      directory: join(__dirname, \"../../scripts/compile/vllm-nxd-inference\"),\n      buildArgs: {\n        IMAGE_NAME: vllmInferenceNeuronxImage.imageName,\n        IMAGE_TAG: vllmInferenceNeuronxImage.imageTag,\n      },\n    });\n    this.image = build.toEcsDockerImageCode();\n  }\n}\n\n/**\n * Props of VllmNxdInferenceCompiler.\n */\nexport interface VllmNxdInferenceCompileProps {\n  /**\n   * VPC in which this will launch compile worker instance.\n   */\n  readonly vpc: IVpc;\n  /**\n   * The bucket to upload compiled artifacts.\n   */\n  readonly bucket: IBucket;\n  /**\n   * The instance type of compile worker instance.\n   */\n  readonly neuronxInstanceType?: INeuronxInstanceType;\n  /**\n   * The model to be compiled.\n   */\n  readonly model: Model;\n  /**\n   * The root volume of worker instance.\n   * @default - N bilion parameters * 5GiB EBS\n   */\n  readonly volumeSize?: Size;\n  /**\n   * Whether or not to use spot instances. Spot instances are less expensive EC2 instances that can be reclaimed by EC2 at any time; your job will be given two minutes of notice before reclamation.\n   *\n   * @default false\n   */\n  readonly spot?: boolean;\n  /**\n   * The VPC Subnets this Compute Environment will launch instances in.\n   *\n   * @default - new subnets will be created\n   */\n  readonly vpcSubnets?: SubnetSelection;\n  /**\n   * The environment variables to pass to the container.\n   * This is only applicable when using container runtime.\n   *\n   * @default - No environment variables.\n   */\n  readonly environment?: {\n    [key: string]: string;\n  };\n  /**\n   * The arguments to pass to the vllm engine.\n   * @default - no specific values. use default values.\n   */\n  readonly vllmArgs?: VllmEngineArguments;\n  /**\n   * An image of the container where the compile job is executed.\n   * @default - latest image\n   */\n  readonly image?: INeuronxContainerImage;\n}\n\n/**\n * The model compiled by Neuronx compiler.\n */\nexport interface VllmNxdInferenceCompiledModel extends NeuronxCompiledModel {\n  /**\n   * Passed to the vllm engine at compile time.\n   */\n  readonly vllmArgs: VllmEngineArguments;\n}\n\n/**\n * Neuronx compiler construct for vLLM on NxD Inference.\n * Compile the model to work with Neuronx instance and upload it to an S3 bucket.\n */\nexport class VllmNxdInferenceCompiler extends Construct {\n  private readonly vllmArgs: VllmEngineArguments;\n  private readonly compiler: NeuronxCompiler;\n  constructor(\n    scope: Construct,\n    id: string,\n    props: VllmNxdInferenceCompileProps,\n  ) {\n    super(scope, id);\n    const maxModelLen = props.vllmArgs?.maxModelLen ?? 128;\n    const maxNumSeqs = props.vllmArgs?.maxNumSeqs ?? 1;\n    const memoryFootprint = props.model.options.config\n      ? calcMemoryFootprint(\n          props.model.options.config.embeddingDimension,\n          props.model.options.config.layers,\n          DataTypeBits.BF16_OR_FP16,\n          maxModelLen,\n          maxNumSeqs,\n        )\n      : inferMemoryFootprintFromParameters(\n          props.model.options.parameters,\n          maxModelLen,\n          maxNumSeqs,\n        );\n    const neuronxInstanceTypes = props.neuronxInstanceType\n      ? [props.neuronxInstanceType]\n      : [\n          NeuronxInstanceType.INF2_8XLARGE,\n          NeuronxInstanceType.INF2_24XLARGE,\n          NeuronxInstanceType.INF2_48XLARGE,\n        ];\n    const availableInstancePatterns = neuronxInstanceTypes\n      .flatMap((neuronxInstanceType) =>\n        calcTensorParallel(\n          neuronxInstanceType,\n          memoryFootprint,\n          props.model.options.config?.attentionHeads,\n        ).map((v) => ({\n          neuronxInstanceType,\n          ...v,\n        })),\n      )\n      .filter(\n        (instanceType) =>\n          !props.vllmArgs?.tensorParallelSize ||\n          instanceType.tp === props.vllmArgs.tensorParallelSize,\n      );\n    if (availableInstancePatterns.length === 0) {\n      throw new Error(\n        `No available instance type. You can use tensorParallelSize are ${availableInstancePatterns.map((p) => p.tp).join(\", \")}.`,\n      );\n    }\n    const tensorParallelSize = availableInstancePatterns[0].tp;\n    const image =\n      props.image ?? new VllmNxdInferenceCompileImage(this, \"CompileImage\");\n    const blockSize = props.vllmArgs?.blockSize ?? BlockSize.SIZE_32;\n    const vllmArgs = {\n      ...props.vllmArgs,\n      model: props.model.modelId,\n      servedModelName: [props.model.modelName],\n      maxModelLen,\n      maxNumSeqs,\n      tensorParallelSize,\n      blockSize,\n    } satisfies VllmEngineArguments;\n\n    // change dirname every engine args patterns\n    const hash = (str: string) =>\n      createHash(\"sha256\").update(str).digest(\"hex\");\n    const artifactS3Prefix = `sdk-${image.neuronSdkVersion}/${hash(JSON.stringify(vllmArgs))}`;\n    const vllmCliArgs = VllmEngineArgumentsParser.cli(vllmArgs);\n    const environment: Record<string, string> = {\n      ...props.environment,\n      VLLM_NEURON_FRAMEWORK: \"neuronx-distributed-inference\",\n      NEURON_COMPILED_ARTIFACTS: \"neuron-compiled-artifacts\",\n      NEURON_RT_NUM_CORES: tensorParallelSize.toString(),\n      XLA_HANDLE_SPECIAL_SCALAR: \"1\",\n      MODEL_ID: props.model.modelId,\n      MODEL_NAME: props.model.modelName,\n      COMPILED_ARTIFACTS_S3_URI: props.bucket.s3UrlForObject(artifactS3Prefix),\n      COMPILED_ARTIFACTS_S3_PREFIX: artifactS3Prefix,\n    };\n    const secrets: { [key: string]: batch.Secret } = {};\n    if (props.vllmArgs?.hfToken) {\n      secrets.HF_TOKEN = props.vllmArgs.hfToken;\n    }\n    const weightSize = Size.gibibytes(\n      props.model.options.parameters.toBillion() * 2.5,\n    );\n    const quantizedWightSize = props.vllmArgs?.overrideNeuronConfig?.quantized\n      ? weightSize\n      : Size.bytes(0);\n    const volumeSize = Size.gibibytes(\n      Math.ceil(\n        weightSize.toGibibytes() +\n          quantizedWightSize.toGibibytes() +\n          PytorchTrainingNeuronxImage.size.toGibibytes() +\n          NeuronOptimizedMachineImage.size.toGibibytes(),\n      ),\n    );\n\n    const compiler = new NeuronxCompiler(this, \"Resource\", {\n      ...props,\n      neuronxInstanceType: availableInstancePatterns[0].neuronxInstanceType,\n      artifactS3Prefix,\n      image: image,\n      command: vllmCliArgs,\n      environment,\n      secrets,\n      volumeSize,\n    });\n    this.vllmArgs = vllmArgs;\n    this.compiler = compiler;\n  }\n  /**\n   * Compile the model and return the compiled model.\n   * @returns The compiled model.\n   */\n  compile(): VllmNxdInferenceCompiledModel {\n    return {\n      ...this.compiler.compile(),\n      vllmArgs: this.vllmArgs,\n    } satisfies VllmNxdInferenceCompiledModel;\n  }\n}\n"]}
|
|
@@ -19,7 +19,7 @@ class VllmNxdInferenceEcsImageBase {
|
|
|
19
19
|
}
|
|
20
20
|
exports.VllmNxdInferenceEcsImageBase = VllmNxdInferenceEcsImageBase;
|
|
21
21
|
_a = JSII_RTTI_SYMBOL_1;
|
|
22
|
-
VllmNxdInferenceEcsImageBase[_a] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceEcsImageBase", version: "0.
|
|
22
|
+
VllmNxdInferenceEcsImageBase[_a] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceEcsImageBase", version: "0.2.1" };
|
|
23
23
|
/**
|
|
24
24
|
* Inference ECS container image for vLLM on NxD Inference.
|
|
25
25
|
* This image uses the official AWS Neuron Deep Learning Containers which come with vLLM pre-installed.
|
|
@@ -42,7 +42,7 @@ class VllmNxdInferenceEcsImage extends VllmNxdInferenceEcsImageBase {
|
|
|
42
42
|
}
|
|
43
43
|
exports.VllmNxdInferenceEcsImage = VllmNxdInferenceEcsImage;
|
|
44
44
|
_b = JSII_RTTI_SYMBOL_1;
|
|
45
|
-
VllmNxdInferenceEcsImage[_b] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceEcsImage", version: "0.
|
|
45
|
+
VllmNxdInferenceEcsImage[_b] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceEcsImage", version: "0.2.1" };
|
|
46
46
|
/**
|
|
47
47
|
* Task definition for VllmNxdInference.
|
|
48
48
|
*/
|
|
@@ -90,7 +90,7 @@ class VllmNxdInferenceTaskDefinition extends aws_ecs_patterns_1.NeuronxTaskDefin
|
|
|
90
90
|
}
|
|
91
91
|
exports.VllmNxdInferenceTaskDefinition = VllmNxdInferenceTaskDefinition;
|
|
92
92
|
_c = JSII_RTTI_SYMBOL_1;
|
|
93
|
-
VllmNxdInferenceTaskDefinition[_c] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceTaskDefinition", version: "0.
|
|
93
|
+
VllmNxdInferenceTaskDefinition[_c] = { fqn: "aws-cdk-neuronx-patterns.VllmNxdInferenceTaskDefinition", version: "0.2.1" };
|
|
94
94
|
/**
|
|
95
95
|
* ApplicationLoadBalancedVllmNxDInferenceService is a wrapper of ApplicationLoadBalancedNeuronxServiceBase.
|
|
96
96
|
* It provides a simple way to deploy vLLM on NxD Inference.
|
|
@@ -129,5 +129,5 @@ class ApplicationLoadBalancedVllmNxDInferenceService extends constructs_1.Constr
|
|
|
129
129
|
}
|
|
130
130
|
exports.ApplicationLoadBalancedVllmNxDInferenceService = ApplicationLoadBalancedVllmNxDInferenceService;
|
|
131
131
|
_d = JSII_RTTI_SYMBOL_1;
|
|
132
|
-
ApplicationLoadBalancedVllmNxDInferenceService[_d] = { fqn: "aws-cdk-neuronx-patterns.ApplicationLoadBalancedVllmNxDInferenceService", version: "0.
|
|
132
|
+
ApplicationLoadBalancedVllmNxDInferenceService[_d] = { fqn: "aws-cdk-neuronx-patterns.ApplicationLoadBalancedVllmNxDInferenceService", version: "0.2.1" };
|
|
133
133
|
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"vllm-nxd-inference-ecs-patterns.js","sourceRoot":"","sources":["../../src/vllm-nxd-inference/vllm-nxd-inference-ecs-patterns.ts"],"names":[],"mappings":";;;;;AAAA,6CAAuC;AACvC,2CAA2C;AAM3C,2CAAuC;AACvC,+DAKkC;AAClC,6CAGyB;AAEzB,mEAA8E;AAO9E;;GAEG;AACH,MAAsB,4BAA4B;IAShD,YAAY,YAAwC;QAClD,IAAI,CAAC,gBAAgB,GAAG,YAAY,CAAC,gBAAgB,CAAC;IACxD,CAAC;;AAXH,oEAYC;;;AAED;;;;;GAKG;AACH,MAAa,wBAAyB,SAAQ,4BAA4B;IAGxE;;;;;;OAMG;IACH,YAAY,yBAAsD;QAChE,yBAAyB,KAAzB,yBAAyB,GAAK,mCAAyB,CAAC,MAAM,EAAC;QAC/D,KAAK,CAAC,yBAAyB,CAAC,CAAC;QACjC,IAAI,CAAC,KAAK,GAAG,GAAG,CAAC,cAAc,CAAC,YAAY,CAC1C,GAAG,yBAAyB,CAAC,SAAS,IAAI,yBAAyB,CAAC,QAAQ,EAAE,CAC/E,CAAC;IACJ,CAAC;;AAhBH,4DAiBC;;;AA0BD;;GAEG;AACH,MAAa,8BAA+B,SAAQ,wCAAqB;IACvE,YACE,KAAgB,EAChB,EAAU,EACV,KAA0C;QAE1C,MAAM,mBAAmB,GACvB,KAAK,CAAC,mBAAmB,IAAI,KAAK,CAAC,aAAa,CAAC,uBAAuB,CAAC;QAC3E,MAAM,kBAAkB,GACtB,KAAK,CAAC,aAAa,CAAC,QAAQ,CAAC,kBAAkB,IAAI,CAAC,CAAC;QACvD,KAAK,CAAC,KAAK,EAAE,EAAE,EAAE;YACf,GAAG,KAAK;YACR,mBAAmB;YACnB,kBAAkB;SACnB,CAAC,CAAC;QACH,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,IAAI,IAAI,wBAAwB,EAAE,CAAC;QAC5D,MAAM,IAAI,GAAG,KAAK,CAAC,aAAa,CAAC,QAAQ,CAAC,IAAI,IAAI,IAAI,CAAC;QACvD,MAAM,WAAW,GAAG,uCAAyB,CAAC,GAAG,CAC/C,KAAK,CAAC,aAAa,CAAC,QAAQ,CAC7B,CAAC;QACF,gCAAgC;QAChC,MAAM,WAAW,GAA2B;YAC1C,GAAG,KAAK,CAAC,WAAW;YACpB,qBAAqB,EAAE,+BAA+B;YACtD,yBAAyB,EAAE,2BAA2B;YACtD,mBAAmB,EAAE,kBAAkB,CAAC,QAAQ,EAAE;YAClD,yBAAyB,EAAE,GAAG;YAC9B,UAAU,EAAE,KAAK,CAAC,aAAa,CAAC,SAAS;SAC1C,CAAC;QAEF,IAAI,CAAC,uBAAuB,CAAC,MAAM,EAAE;YACnC,KAAK,EAAE,KAAK,CAAC,KAAK;YAClB,YAAY,EAAE;gBACZ;oBACE,aAAa,EAAE,IAAI;iBACpB;aACF;YACD,WAAW,EAAE;gBACX,OAAO,EAAE;oBACP,WAAW;oBACX,4BAA4B,IAAI,mBAAmB;iBACpD;gBACD,WAAW,EAAE,sBAAQ,CAAC,OAAO,CAAC,CAAC,CAAC;aACjC;YACD,gBAAgB,EAAE,iBAAiB,KAAK,CAAC,aAAa,CAAC,QAAQ,EAAE;YACjE,UAAU,EAAE,CAAC,MAAM,EAAE,OAAO,CAAC;YAC7B,OAAO,EAAE,WAAW;YACpB,WAAW;SACZ,CAAC,CAAC;IACL,CAAC;;AAjDH,wEAkDC;;;AAOD;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,MAAa,8CAA+C,SAAQ,sBAAS;IAM3E,YACE,KAAgB,EAChB,EAAU,EACV,KAA0D;QAE1D,KAAK,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACjB,MAAM,QAAQ,GAAG,IAAI,wDAAqC,CACxD,IAAI,EACJ,UAAU,EACV,KAAK,CACN,CAAC;QACF,QAAQ,CAAC,WAAW,CAAC,oBAAoB,CAAC;YACxC,IAAI,EAAE,SAAS;SAChB,CAAC,CAAC;QACH,IAAI,CAAC,YAAY,GAAG,QAAQ,CAAC,YAAY,CAAC;QAC1C,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC,QAAQ,CAAC;QAClC,IAAI,CAAC,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACxC,IAAI,CAAC,OAAO,GAAG,QAAQ,CAAC,OAAO,CAAC;QAChC,IAAI,CAAC,cAAc,GAAG,QAAQ,CAAC,cAAc,CAAC;IAChD,CAAC;;AAzBH,wGA0BC","sourcesContent":["import { Duration } from \"aws-cdk-lib\";\nimport * as ecs from \"aws-cdk-lib/aws-ecs\";\nimport {\n  ApplicationListener,\n  ApplicationLoadBalancer,\n  ApplicationTargetGroup,\n} from \"aws-cdk-lib/aws-elasticloadbalancingv2\";\nimport { Construct } from \"constructs\";\nimport {\n  ApplicationLoadBalancedNeuronxService,\n  ApplicationLoadBalancedNeuronxServiceProps,\n  NeuronxTaskDefinition,\n  NeuronxTaskDefinitionPropsBase,\n} from \"../base/aws-ecs-patterns\";\nimport {\n  IVllmInferenceNeuronxImage,\n  VllmInferenceNeuronxImage,\n} from \"../base/neuronx\";\nimport { INeuronxContainerImage } from \"../base/neuronx-compiler\";\nimport { VllmEngineArgumentsParser } from \"../base/server-engine/vllm-engine\";\nimport { VllmNxdInferenceCompiledModel } from \"./vllm-nxd-inference-compiler\";\n\nexport interface VllmNxdInferenceImageOptions {\n  readonly vllmGitBranch?: string;\n  readonly vllmGitCommitHash?: string;\n}\n/**\n * Base class for VllmNxdInferenceImage.\n */\nexport abstract class VllmNxdInferenceEcsImageBase implements INeuronxContainerImage {\n  /**\n   * The container image.\n   */\n  abstract readonly image: ecs.ContainerImage;\n  /**\n   * The neuronx SDK version.\n   */\n  readonly neuronSdkVersion: string;\n  constructor(neuronxImage: IVllmInferenceNeuronxImage) {\n    this.neuronSdkVersion = neuronxImage.neuronSdkVersion;\n  }\n}\n\n/**\n * Inference ECS container image for vLLM on NxD Inference.\n * This image uses the official AWS Neuron Deep Learning Containers which come with vLLM pre-installed.\n *\n * @example new VllmNxdInferenceEcsImage(VllmInferenceNeuronxImage.LATEST)\n */\nexport class VllmNxdInferenceEcsImage extends VllmNxdInferenceEcsImageBase {\n  readonly image: ecs.ContainerImage;\n\n  /**\n   * Create a VllmNxdInferenceImage from a custom neuronx image.\n   * This will build a container image using a Dockerfile that installs vLLM from source.\n   *\n   * @example\n   * new VllmNxdInferenceEcsImage(VllmInferenceNeuronxImage.LATEST)\n   */\n  constructor(vllmInferenceNeuronxImage?: IVllmInferenceNeuronxImage) {\n    vllmInferenceNeuronxImage ??= VllmInferenceNeuronxImage.LATEST;\n    super(vllmInferenceNeuronxImage);\n    this.image = ecs.ContainerImage.fromRegistry(\n      `${vllmInferenceNeuronxImage.imageName}:${vllmInferenceNeuronxImage.imageTag}`,\n    );\n  }\n}\n\n/**\n * Task definition for VllmNxdInference.\n */\nexport interface VllmNxdInferenceTaskDefinitionProps extends NeuronxTaskDefinitionPropsBase {\n  /**\n   * The model to be compiled.\n   */\n  readonly compiledModel: VllmNxdInferenceCompiledModel;\n  /**\n   * The image to be used for the container.\n   * @default - latest VllmNxdInferenceImage\n   */\n  readonly image?: VllmNxdInferenceEcsImageBase;\n  /**\n   * The environment variables to pass to the container.\n   * This is only applicable when using container runtime.\n   *\n   * @default - No environment variables.\n   */\n  readonly environment?: {\n    [key: string]: string;\n  };\n}\n\n/**\n * Task definition for VllmNxdInference.\n */\nexport class VllmNxdInferenceTaskDefinition extends NeuronxTaskDefinition {\n  constructor(\n    scope: Construct,\n    id: string,\n    props: VllmNxdInferenceTaskDefinitionProps,\n  ) {\n    const neuronxInstanceType =\n      props.neuronxInstanceType ?? props.compiledModel.compileTimeInstanceType;\n    const tensorParallelSize =\n      props.compiledModel.vllmArgs.tensorParallelSize ?? 1;\n    super(scope, id, {\n      ...props,\n      neuronxInstanceType,\n      tensorParallelSize,\n    });\n    const image = props.image ?? new VllmNxdInferenceEcsImage();\n    const port = props.compiledModel.vllmArgs.port ?? 8000;\n    const vllmCliArgs = VllmEngineArgumentsParser.cli(\n      props.compiledModel.vllmArgs,\n    );\n    // Prepare environment variables\n    const environment: Record<string, string> = {\n      ...props.environment,\n      VLLM_NEURON_FRAMEWORK: \"neuronx-distributed-inference\",\n      NEURON_COMPILED_ARTIFACTS: \"neuron-compiled-artifacts\",\n      NEURON_RT_NUM_CORES: tensorParallelSize.toString(),\n      XLA_HANDLE_SPECIAL_SCALAR: \"1\",\n      MODEL_NAME: props.compiledModel.modelName,\n    };\n\n    this.addContainerWithDefault(\"vLLM\", {\n      image: image.image,\n      portMappings: [\n        {\n          containerPort: port,\n        },\n      ],\n      healthCheck: {\n        command: [\n          \"CMD-SHELL\",\n          `curl -f http://localhost:${port}/health || exit 1`,\n        ],\n        startPeriod: Duration.minutes(5),\n      },\n      workingDirectory: `/opt/ml/model/${props.compiledModel.s3Prefix}`,\n      entryPoint: [\"vllm\", \"serve\"],\n      command: vllmCliArgs,\n      environment,\n    });\n  }\n}\n\n/**\n * Props for ApplicationLoadBalancedVllmNxDInferenceService.\n */\nexport interface ApplicationLoadBalancedVllmNxDInferenceServiceProps extends ApplicationLoadBalancedNeuronxServiceProps {}\n\n/**\n * ApplicationLoadBalancedVllmNxDInferenceService is a wrapper of ApplicationLoadBalancedNeuronxServiceBase.\n * It provides a simple way to deploy vLLM on NxD Inference.\n * @example\n * const compiler = new VllmNxdInferenceCompiler(this, \"Compiler\", {\n *   vpc,\n *   bucket,\n *   model: Model.fromHuggingFace(\"example/example-7b-chat\"),\n * });\n * const compiledModel = compiler.compile();\n * const taskDefinition = new VllmNxdInferenceTaskDefinition(\n *   this,\n *   \"TaskDefinition\",\n *   {\n *     vpc,\n *     compiledModel,\n *   },\n * );\n * new ApplicationLoadBalancedVllmNxDInferenceService(this, \"Service\", {\n *   taskDefinition,\n * });\n */\nexport class ApplicationLoadBalancedVllmNxDInferenceService extends Construct {\n  readonly loadBalancer: ApplicationLoadBalancer;\n  readonly listener: ApplicationListener;\n  readonly targetGroup: ApplicationTargetGroup;\n  readonly service: ecs.Ec2Service;\n  readonly taskDefinition: ecs.Ec2TaskDefinition;\n  constructor(\n    scope: Construct,\n    id: string,\n    props: ApplicationLoadBalancedVllmNxDInferenceServiceProps,\n  ) {\n    super(scope, id);\n    const resource = new ApplicationLoadBalancedNeuronxService(\n      this,\n      \"Resource\",\n      props,\n    );\n    resource.targetGroup.configureHealthCheck({\n      path: \"/health\",\n    });\n    this.loadBalancer = resource.loadBalancer;\n    this.listener = resource.listener;\n    this.targetGroup = resource.targetGroup;\n    this.service = resource.service;\n    this.taskDefinition = resource.taskDefinition;\n  }\n}\n"]}
|
package/package.json
CHANGED
|
@@ -39,9 +39,9 @@
|
|
|
39
39
|
"@aws-cdk/aws-sagemaker-alpha": "2.240.0-alpha.0",
|
|
40
40
|
"@aws-cdk/integ-runner": "latest",
|
|
41
41
|
"@aws-cdk/integ-tests-alpha": "latest",
|
|
42
|
-
"@aws-sdk/client-batch": "^3.
|
|
43
|
-
"@aws-sdk/client-lambda": "^3.
|
|
44
|
-
"@types/aws-lambda": "^8.10.
|
|
42
|
+
"@aws-sdk/client-batch": "^3.1003.0",
|
|
43
|
+
"@aws-sdk/client-lambda": "^3.1003.0",
|
|
44
|
+
"@types/aws-lambda": "^8.10.161",
|
|
45
45
|
"@types/cfn-response": "^1.0.8",
|
|
46
46
|
"@types/jest": "^29.5.14",
|
|
47
47
|
"@types/node": "^18",
|
|
@@ -64,7 +64,7 @@
|
|
|
64
64
|
"jsii-pacmak": "^1.127.0",
|
|
65
65
|
"jsii-rosetta": "~5.9.0",
|
|
66
66
|
"prettier": "^3.8.1",
|
|
67
|
-
"projen": "^0.99.
|
|
67
|
+
"projen": "^0.99.18",
|
|
68
68
|
"ts-jest": "^29.4.6",
|
|
69
69
|
"ts-node": "^10.9.2",
|
|
70
70
|
"typescript": "^5.9.3"
|
|
@@ -76,7 +76,7 @@
|
|
|
76
76
|
},
|
|
77
77
|
"dependencies": {
|
|
78
78
|
"@aws-cdk/aws-sagemaker-alpha": "2.240.0-alpha.0",
|
|
79
|
-
"@cdklabs/deploy-time-build": "^0.0
|
|
79
|
+
"@cdklabs/deploy-time-build": "^0.1.0"
|
|
80
80
|
},
|
|
81
81
|
"keywords": [
|
|
82
82
|
"cdk",
|
|
@@ -95,7 +95,7 @@
|
|
|
95
95
|
]
|
|
96
96
|
}
|
|
97
97
|
},
|
|
98
|
-
"version": "0.
|
|
98
|
+
"version": "0.2.1",
|
|
99
99
|
"jest": {
|
|
100
100
|
"coverageProvider": "v8",
|
|
101
101
|
"testMatch": [
|