aws-cdk-neuronx-patterns 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.jsii +754 -117
- package/API.md +1044 -158
- package/README.ja.md +18 -6
- package/README.md +16 -5
- package/lib/base/aws-batch/neuronx-batch-compute-environment.js +1 -1
- package/lib/base/aws-batch/neuronx-batch-ecs-job-definition.js +1 -1
- package/lib/base/aws-batch/neuronx-batch.js +1 -1
- package/lib/base/aws-ecs-patterns/application-load-balanced-neuronx-service.js +4 -4
- package/lib/base/neuronx/calculator.test.js +61 -1
- package/lib/base/neuronx/deep-learning-containers.js +3 -3
- package/lib/base/neuronx/model.js +2 -2
- package/lib/base/neuronx/neuron-optimized-machine-image.js +1 -1
- package/lib/base/neuronx/neuronx-instance-type.d.ts +18 -0
- package/lib/base/neuronx/neuronx-instance-type.js +60 -7
- package/lib/base/neuronx/neuronx-instance-type.test.js +80 -1
- package/lib/base/neuronx-compiler/index.d.ts +3 -1
- package/lib/base/neuronx-compiler/index.js +4 -2
- package/lib/base/neuronx-compiler/{neuronx-compiler.d.ts → neuronx-compiler-base.d.ts} +74 -32
- package/lib/base/neuronx-compiler/neuronx-compiler-base.js +129 -0
- package/lib/base/neuronx-compiler/neuronx-cross-compiler.d.ts +30 -0
- package/lib/base/neuronx-compiler/neuronx-cross-compiler.js +83 -0
- package/lib/base/neuronx-compiler/neuronx-native-compiler.d.ts +18 -0
- package/lib/base/neuronx-compiler/neuronx-native-compiler.js +69 -0
- package/lib/base/server-engine/vllm-engine/vllm-engine-argments.js +1 -1
- package/lib/sagemaker-inference-toolkit-tnx/sagemaker-inference-toolkit-tnx-compiler.js +2 -2
- package/lib/sagemaker-inference-toolkit-tnx/sagemaker-inference-toolkit-tnx-sagemaker.d.ts +1 -1
- package/lib/sagemaker-inference-toolkit-tnx/sagemaker-inference-toolkit-tnx-sagemaker.js +2 -2
- package/lib/vllm-nxd-inference/vllm-nxd-inference-compiler.d.ts +8 -0
- package/lib/vllm-nxd-inference/vllm-nxd-inference-compiler.js +32 -4
- package/lib/vllm-nxd-inference/vllm-nxd-inference-ecs-patterns.js +6 -6
- package/package.json +7 -7
- package/scripts/compile/vllm-nxd-inference/Dockerfile +5 -0
- package/scripts/compile/vllm-nxd-inference/entrypoint.sh +39 -14
- package/lib/base/neuronx-compiler/neuronx-compiler.js +0 -166
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import * as batch from "aws-cdk-lib/aws-batch";
|
|
2
|
+
import * as ec2 from "aws-cdk-lib/aws-ec2";
|
|
3
|
+
import { Construct } from "constructs";
|
|
4
|
+
import { NeuronxCompilerBase, NeuronxCompilerBaseProps, ComputeEnvironmentResult } from "./neuronx-compiler-base";
|
|
5
|
+
/**
|
|
6
|
+
* Props of NeuronxCrossCompiler: every NeuronxCompilerBaseProps member plus an optional compile-host instance type.
|
|
7
|
+
*/
|
|
8
|
+
export interface NeuronxCrossCompilerProps extends NeuronxCompilerBaseProps {
|
|
9
|
+
/**
|
|
10
|
+
* The EC2 instance type to use for cross-compilation.
|
|
11
|
+
* This should be a non-Neuron instance type with sufficient memory and CPU
|
|
12
|
+
* for model compilation.
|
|
13
|
+
*
* NOTE(review): NeuronxCrossCompiler.createJobDefinition sizes the container from
* `neuronxInstanceType` (its vCpu, and 95% of its memory), not from this instance type.
* Presumably this type must be large enough to hold that reservation - confirm.
*
|
|
14
|
+
* @default ec2.InstanceType.of(ec2.InstanceClass.C7I, ec2.InstanceSize.XLARGE4)
|
|
15
|
+
*/
|
|
16
|
+
readonly compileInstanceType?: ec2.InstanceType;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Neuronx cross-compiler construct.
|
|
20
|
+
* Compile the model on a non-Neuron instance and upload the artifacts to an S3 bucket.
|
|
21
|
+
* This avoids the need for expensive Neuron instances during the compilation phase.
|
|
22
|
+
*
|
|
23
|
+
* The compilation uses `vllm serve` which performs model tracing and neuronx-cc compilation
|
|
24
|
+
* entirely on CPU. The resulting artifacts are compatible with Neuron instances for inference.
*
* The job container receives `NEURON_PLATFORM_TARGET_OVERRIDE`, set to the part of the
* `neuronxInstanceType` instance type string that precedes the first "."
* (presumably how the Neuron toolchain is told which platform to target - confirm).
|
|
25
|
+
*/
|
|
26
|
+
export declare class NeuronxCrossCompiler extends NeuronxCompilerBase {
|
|
27
|
+
/** Passes scope, id and props straight through to NeuronxCompilerBase. */
constructor(scope: Construct, id: string, props: NeuronxCrossCompilerProps);
|
|
28
|
+
/**
 * Creates an encrypted gp3 launch template and a managed EC2 ECS Batch compute environment
 * restricted to a single instance type, then returns the environment with its instance role.
 * The parameter is typed as the base props, but the implementation also reads
 * `compileInstanceType` from it (defaulting to C7I / XLARGE4 when absent).
 */
protected createComputeEnvironment(props: NeuronxCompilerBaseProps): ComputeEnvironmentResult;
|
|
29
|
+
/**
 * Creates the ECS job definition whose EC2 container runs `props.command` with
 * `props.image.image`; cpu and memory are derived from `props.neuronxInstanceType`.
 */
protected createJobDefinition(props: NeuronxCompilerBaseProps): batch.IJobDefinition;
|
|
30
|
+
}
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var _a;
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
exports.NeuronxCrossCompiler = void 0;
|
|
5
|
+
const JSII_RTTI_SYMBOL_1 = Symbol.for("jsii.rtti");
|
|
6
|
+
const aws_cdk_lib_1 = require("aws-cdk-lib");
|
|
7
|
+
const batch = require("aws-cdk-lib/aws-batch");
|
|
8
|
+
const ec2 = require("aws-cdk-lib/aws-ec2");
|
|
9
|
+
const neuronx_1 = require("../neuronx");
|
|
10
|
+
const neuronx_compiler_base_1 = require("./neuronx-compiler-base");
|
|
11
|
+
/**
 * Neuronx cross-compiler construct.
 *
 * The compile job is scheduled on an AWS Batch compute environment that
 * launches `compileInstanceType` (C7I / XLARGE4 unless overridden) rather than
 * the Neuron instance type, and the container is told which platform to build
 * for through the `NEURON_PLATFORM_TARGET_OVERRIDE` environment variable.
 */
class NeuronxCrossCompiler extends neuronx_compiler_base_1.NeuronxCompilerBase {
    constructor(scope, id, props) {
        super(scope, id, props);
    }
    /**
     * Create the launch template and the Batch compute environment for the job.
     *
     * @param props construct props; `volumeSize`, `compileInstanceType`, `vpc`,
     *   `vpcSubnets` and `spot` are read here.
     * @returns the compute environment and the instance role it exposes.
     */
    createComputeEnvironment(props) {
        // Root volume in GiB: the caller's explicit size wins; otherwise add up
        // model weights, container image and machine image, rounded up.
        let rootVolumeGiB = props.volumeSize?.toGibibytes();
        if (rootVolumeGiB == null) {
            const weightsGiB = this.weightSize.toGibibytes();
            const containerImageGiB = neuronx_1.PytorchTrainingNeuronxImage.size.toGibibytes();
            const machineImageGiB = neuronx_1.NeuronOptimizedMachineImage.size.toGibibytes();
            rootVolumeGiB = Math.ceil(weightsGiB + containerImageGiB + machineImageGiB);
        }
        const rootVolume = ec2.BlockDeviceVolume.ebs(rootVolumeGiB, {
            volumeType: ec2.EbsDeviceVolumeType.GP3,
            encrypted: true,
        });
        const template = new ec2.LaunchTemplate(this, "LaunchTemplate", {
            blockDevices: [{ deviceName: "/dev/xvda", volume: rootVolume }],
        });
        const hostInstanceType = props.compileInstanceType ??
            ec2.InstanceType.of(ec2.InstanceClass.C7I, ec2.InstanceSize.XLARGE4);
        const batchEnvironment = new batch.ManagedEc2EcsComputeEnvironment(this, "ComputeEnvironment", {
            vpc: props.vpc,
            vpcSubnets: props.vpcSubnets,
            instanceTypes: [hostInstanceType],
            useOptimalInstanceClasses: false,
            launchTemplate: template,
            spot: props.spot,
        });
        // The template is constructed just above, so this guard always holds here.
        if (template instanceof ec2.LaunchTemplate) {
            // Pin the compute environment to the template's latest version number.
            const cfnEnvironment = batchEnvironment.node.defaultChild;
            cfnEnvironment.addPropertyOverride("ComputeResources.LaunchTemplate.Version", template.latestVersionNumber);
        }
        aws_cdk_lib_1.Tags.of(batchEnvironment).add("Name", "neuronx-cross-compile-worker");
        return {
            computeEnvironment: batchEnvironment,
            instanceRole: batchEnvironment.instanceRole,
        };
    }
    /**
     * Create the Batch ECS job definition that runs the compile command.
     *
     * NOTE(review): cpu and memory are taken from the target
     * `neuronxInstanceType`, not from the instance type the compute environment
     * launches - confirm the compile host can satisfy this reservation.
     *
     * @param props construct props; `neuronxInstanceType`, `image`, `bucket`,
     *   `environment`, `command` and `secrets` are read here.
     * @returns the new job definition.
     */
    createJobDefinition(props) {
        const target = props.neuronxInstanceType;
        // Text before the first "." of the instance type string.
        const [targetPlatform] = target.instanceType.toString().split(".");
        // Reserve 95% of the target's memory, rounded up to a whole MiB.
        const memoryMiB = Math.ceil(target.memory.toMebibytes() * 0.95);
        const container = new batch.EcsEc2ContainerDefinition(this, "ContainerDefinition", {
            image: props.image.image,
            memory: aws_cdk_lib_1.Size.mebibytes(memoryMiB),
            cpu: target.vCpu,
            environment: {
                NEURON_COMPILE_CACHE_URL: `${props.bucket.s3UrlForObject("neuron-compile-cache")}`,
                NEURON_PLATFORM_TARGET_OVERRIDE: targetPlatform,
                // Spread last so caller-supplied variables override the two above.
                ...props.environment,
            },
            command: props.command,
            secrets: props.secrets,
        });
        return new batch.EcsJobDefinition(this, "JobDefinition", { container });
    }
}
exports.NeuronxCrossCompiler = NeuronxCrossCompiler;
// Attach the jsii type record (fully-qualified name and package version) to the class.
_a = JSII_RTTI_SYMBOL_1;
NeuronxCrossCompiler[_a] = { fqn: "aws-cdk-neuronx-patterns.NeuronxCrossCompiler", version: "0.3.0" };
|
|
83
|
+
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"neuronx-cross-compiler.js","sourceRoot":"","sources":["../../../src/base/neuronx-compiler/neuronx-cross-compiler.ts"],"names":[],"mappings":";;;;;AAAA,6CAAyC;AACzC,+CAA+C;AAC/C,2CAA2C;AAE3C,wCAGoB;AACpB,mEAIiC;AAgBjC;;;;;;;GAOG;AACH,MAAa,oBAAqB,SAAQ,2CAAmB;IAC3D,YAAY,KAAgB,EAAE,EAAU,EAAE,KAAgC;QACxE,KAAK,CAAC,KAAK,EAAE,EAAE,EAAE,KAAK,CAAC,CAAC;IAC1B,CAAC;IAES,wBAAwB,CAChC,KAA+B;QAE/B,MAAM,UAAU,GACd,KAAK,CAAC,UAAU,EAAE,WAAW,EAAE;YAC/B,IAAI,CAAC,IAAI,CACP,IAAI,CAAC,UAAU,CAAC,WAAW,EAAE;gBAC3B,qCAA2B,CAAC,IAAI,CAAC,WAAW,EAAE;gBAC9C,qCAA2B,CAAC,IAAI,CAAC,WAAW,EAAE,CACjD,CAAC;QACJ,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,cAAc,CAAC,IAAI,EAAE,gBAAgB,EAAE;YACpE,YAAY,EAAE;gBACZ;oBACE,UAAU,EAAE,WAAW;oBACvB,MAAM,EAAE,GAAG,CAAC,iBAAiB,CAAC,GAAG,CAAC,UAAU,EAAE;wBAC5C,UAAU,EAAE,GAAG,CAAC,mBAAmB,CAAC,GAAG;wBACvC,SAAS,EAAE,IAAI;qBAChB,CAAC;iBACH;aACF;SACF,CAAC,CAAC;QAEH,MAAM,mBAAmB,GACtB,KAAmC,CAAC,mBAAmB;YACxD,GAAG,CAAC,YAAY,CAAC,EAAE,CAAC,GAAG,CAAC,aAAa,CAAC,GAAG,EAAE,GAAG,CAAC,YAAY,CAAC,OAAO,CAAC,CAAC;QAEvE,MAAM,kBAAkB,GAAG,IAAI,KAAK,CAAC,+BAA+B,CAClE,IAAI,EACJ,oBAAoB,EACpB;YACE,GAAG,EAAE,KAAK,CAAC,GAAG;YACd,UAAU,EAAE,KAAK,CAAC,UAAU;YAC5B,aAAa,EAAE,CAAC,mBAAmB,CAAC;YACpC,yBAAyB,EAAE,KAAK;YAChC,cAAc;YACd,IAAI,EAAE,KAAK,CAAC,IAAI;SACjB,CACF,CAAC;QACF,IAAI,cAAc,YAAY,GAAG,CAAC,cAAc,EAAE,CAAC;YAE/C,kBAAkB,CAAC,IAAI,CAAC,YACzB,CAAC,mBAAmB,CACnB,yCAAyC,EACzC,cAAc,CAAC,mBAAmB,CACnC,CAAC;QACJ,CAAC;QAED,kBAAI,CAAC,EAAE,CAAC,kBAAkB,CAAC,CAAC,GAAG,CAAC,MAAM,EAAE,8BAA8B,CAAC,CAAC;QAExE,OAAO;YACL,kBAAkB;YAClB,YAAY,EAAE,kBAAkB,CAAC,YAAa;SAC/C,CAAC;IACJ,CAAC;IAES,mBAAmB,CAC3B,KAA+B;QAE/B,MAAM,mBAAmB,GAAG,KAAK,CAAC,mBAAmB,CAAC;QACtD,MAAM,cAAc,GAAG,mBAAmB,CAAC,YAAY;aACpD,QAAQ,EAAE;aACV,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAEjB,MAAM,aAAa,GAAG,IAAI,KAAK,CAAC,gBAAgB,CAAC,IAAI,EAAE,eAAe,EAAE;YACtE,SAAS,EAAE,IAAI,KAAK,CAAC,yBAAyB,CAC5C,IAAI,EACJ,qBAAqB,EACrB;gBACE,KAAK,EAAE,KAAK,CAAC,KAAK,CAAC,KAAK;gBACxB,MAAM,EAAE,kBAAI,CAAC,SAAS,CACpB,IA
AI,CAAC,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,WAAW,EAAE,GAAG,IAAI,CAAC,CAC3D;gBACD,GAAG,EAAE,mBAAmB,CAAC,IAAI;gBAC7B,WAAW,EAAE;oBACX,wBAAwB,EAAE,GAAG,KAAK,CAAC,MAAM,CAAC,cAAc,CAAC,sBAAsB,CAAC,EAAE;oBAClF,+BAA+B,EAAE,cAAc;oBAC/C,GAAG,KAAK,CAAC,WAAW;iBACrB;gBACD,OAAO,EAAE,KAAK,CAAC,OAAO;gBACtB,OAAO,EAAE,KAAK,CAAC,OAAO;aACvB,CACF;SACF,CAAC,CAAC;QAEH,OAAO,aAAa,CAAC;IACvB,CAAC;;AA1FH,oDA2FC","sourcesContent":["import { Size, Tags } from \"aws-cdk-lib\";\nimport * as batch from \"aws-cdk-lib/aws-batch\";\nimport * as ec2 from \"aws-cdk-lib/aws-ec2\";\nimport { Construct } from \"constructs\";\nimport {\n  NeuronOptimizedMachineImage,\n  PytorchTrainingNeuronxImage,\n} from \"../neuronx\";\nimport {\n  NeuronxCompilerBase,\n  NeuronxCompilerBaseProps,\n  ComputeEnvironmentResult,\n} from \"./neuronx-compiler-base\";\n\n/**\n * Props of NeuronxCrossCompiler.\n */\nexport interface NeuronxCrossCompilerProps extends NeuronxCompilerBaseProps {\n  /**\n   * The EC2 instance type to use for cross-compilation.\n   * This should be a non-Neuron instance type with sufficient memory and CPU\n   * for model compilation.\n   *\n   * @default ec2.InstanceType.of(ec2.InstanceClass.C7I, ec2.InstanceSize.XLARGE4)\n   */\n  readonly compileInstanceType?: ec2.InstanceType;\n}\n\n/**\n * Neuronx cross-compiler construct.\n * Compile the model on a non-Neuron instance and upload the artifacts to an S3 bucket.\n * This avoids the need for expensive Neuron instances during the compilation phase.\n *\n * The compilation uses `vllm serve` which performs model tracing and neuronx-cc compilation\n * entirely on CPU. 
The resulting artifacts are compatible with Neuron instances for inference.\n */\nexport class NeuronxCrossCompiler extends NeuronxCompilerBase {\n  constructor(scope: Construct, id: string, props: NeuronxCrossCompilerProps) {\n    super(scope, id, props);\n  }\n\n  protected createComputeEnvironment(\n    props: NeuronxCompilerBaseProps,\n  ): ComputeEnvironmentResult {\n    const volumeSize =\n      props.volumeSize?.toGibibytes() ??\n      Math.ceil(\n        this.weightSize.toGibibytes() +\n          PytorchTrainingNeuronxImage.size.toGibibytes() +\n          NeuronOptimizedMachineImage.size.toGibibytes(),\n      );\n    const launchTemplate = new ec2.LaunchTemplate(this, \"LaunchTemplate\", {\n      blockDevices: [\n        {\n          deviceName: \"/dev/xvda\",\n          volume: ec2.BlockDeviceVolume.ebs(volumeSize, {\n            volumeType: ec2.EbsDeviceVolumeType.GP3,\n            encrypted: true,\n          }),\n        },\n      ],\n    });\n\n    const compileInstanceType =\n      (props as NeuronxCrossCompilerProps).compileInstanceType ??\n      ec2.InstanceType.of(ec2.InstanceClass.C7I, ec2.InstanceSize.XLARGE4);\n\n    const computeEnvironment = new batch.ManagedEc2EcsComputeEnvironment(\n      this,\n      \"ComputeEnvironment\",\n      {\n        vpc: props.vpc,\n        vpcSubnets: props.vpcSubnets,\n        instanceTypes: [compileInstanceType],\n        useOptimalInstanceClasses: false,\n        launchTemplate,\n        spot: props.spot,\n      },\n    );\n    if (launchTemplate instanceof ec2.LaunchTemplate) {\n      (\n        computeEnvironment.node.defaultChild as batch.CfnComputeEnvironment\n      ).addPropertyOverride(\n        \"ComputeResources.LaunchTemplate.Version\",\n        launchTemplate.latestVersionNumber,\n      );\n    }\n\n    Tags.of(computeEnvironment).add(\"Name\", \"neuronx-cross-compile-worker\");\n\n    return {\n      computeEnvironment,\n      instanceRole: computeEnvironment.instanceRole!,\n    };\n  }\n\n  protected 
createJobDefinition(\n    props: NeuronxCompilerBaseProps,\n  ): batch.IJobDefinition {\n    const neuronxInstanceType = props.neuronxInstanceType;\n    const targetPlatform = neuronxInstanceType.instanceType\n      .toString()\n      .split(\".\")[0];\n\n    const jobDefinition = new batch.EcsJobDefinition(this, \"JobDefinition\", {\n      container: new batch.EcsEc2ContainerDefinition(\n        this,\n        \"ContainerDefinition\",\n        {\n          image: props.image.image,\n          memory: Size.mebibytes(\n            Math.ceil(neuronxInstanceType.memory.toMebibytes() * 0.95),\n          ),\n          cpu: neuronxInstanceType.vCpu,\n          environment: {\n            NEURON_COMPILE_CACHE_URL: `${props.bucket.s3UrlForObject(\"neuron-compile-cache\")}`,\n            NEURON_PLATFORM_TARGET_OVERRIDE: targetPlatform,\n            ...props.environment,\n          },\n          command: props.command,\n          secrets: props.secrets,\n        },\n      ),\n    });\n\n    return jobDefinition;\n  }\n}\n"]}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import * as batch from "aws-cdk-lib/aws-batch";
|
|
2
|
+
import { Construct } from "constructs";
|
|
3
|
+
import { INeuronxContainerImage, NeuronxCompiledModel, NeuronxCompilerBase, NeuronxCompilerBaseProps, ComputeEnvironmentResult } from "./neuronx-compiler-base";
|
|
4
|
+
/**
|
|
5
|
+
* Props of NeuronxNativeCompiler. Declares no members of its own; the shape is exactly NeuronxCompilerBaseProps.
|
|
6
|
+
*/
|
|
7
|
+
export interface NeuronxNativeCompilerProps extends NeuronxCompilerBaseProps {
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* Neuronx native compiler construct.
|
|
11
|
+
* Compile the model to work with Inferentia2 and Trainium1 and upload it to an S3 bucket.
* The compute environment launches `neuronxInstanceType` itself
* (`NeuronxInstanceType.INF2_48XLARGE` when the prop is not set).
|
|
12
|
+
*/
|
|
13
|
+
export declare class NeuronxNativeCompiler extends NeuronxCompilerBase {
|
|
14
|
+
/** Passes scope, id and props straight through to NeuronxCompilerBase. */
constructor(scope: Construct, id: string, props: NeuronxNativeCompilerProps);
|
|
15
|
+
/**
 * Creates an encrypted gp3 launch template and a NeuronxBatchComputeEnvironment limited to
 * the Neuron instance type, then returns the environment with its instance role.
 */
protected createComputeEnvironment(props: NeuronxCompilerBaseProps): ComputeEnvironmentResult;
|
|
16
|
+
/**
 * Creates a NeuronxBatchEcsJobDefinition that runs `props.command` with `props.image.image`,
 * using the instance type's vCpu and 95% of its memory.
 */
protected createJobDefinition(props: NeuronxCompilerBaseProps): batch.IJobDefinition;
|
|
17
|
+
}
|
|
18
|
+
export { INeuronxContainerImage, NeuronxCompiledModel };
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var _a;
|
|
3
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
4
|
+
exports.NeuronxNativeCompiler = void 0;
|
|
5
|
+
const JSII_RTTI_SYMBOL_1 = Symbol.for("jsii.rtti");
|
|
6
|
+
const aws_cdk_lib_1 = require("aws-cdk-lib");
|
|
7
|
+
const ec2 = require("aws-cdk-lib/aws-ec2");
|
|
8
|
+
const aws_batch_1 = require("../aws-batch");
|
|
9
|
+
const neuronx_1 = require("../neuronx");
|
|
10
|
+
const neuronx_compiler_base_1 = require("./neuronx-compiler-base");
|
|
11
|
+
/**
 * Neuronx native compiler construct.
 *
 * Both the Batch compute environment and the job definition are built for the
 * Neuron instance type given in `neuronxInstanceType`, falling back to
 * `NeuronxInstanceType.INF2_48XLARGE` when the prop is not set.
 */
class NeuronxNativeCompiler extends neuronx_compiler_base_1.NeuronxCompilerBase {
    constructor(scope, id, props) {
        super(scope, id, props);
    }
    /**
     * Create the launch template and the Neuronx Batch compute environment.
     *
     * @param props construct props; `volumeSize`, `neuronxInstanceType`, `vpc`,
     *   `vpcSubnets` and `spot` are read here.
     * @returns the compute environment and the instance role it exposes.
     */
    createComputeEnvironment(props) {
        // Root volume in GiB: explicit size if given, else weights + container
        // image + machine image, rounded up.
        let rootVolumeGiB = props.volumeSize?.toGibibytes();
        if (rootVolumeGiB == null) {
            const weightsGiB = this.weightSize.toGibibytes();
            const containerImageGiB = neuronx_1.PytorchTrainingNeuronxImage.size.toGibibytes();
            const machineImageGiB = neuronx_1.NeuronOptimizedMachineImage.size.toGibibytes();
            rootVolumeGiB = Math.ceil(weightsGiB + containerImageGiB + machineImageGiB);
        }
        const rootVolume = ec2.BlockDeviceVolume.ebs(rootVolumeGiB, {
            volumeType: ec2.EbsDeviceVolumeType.GP3,
            encrypted: true,
        });
        const template = new ec2.LaunchTemplate(this, "LaunchTemplate", {
            blockDevices: [{ deviceName: "/dev/xvda", volume: rootVolume }],
        });
        const target = props.neuronxInstanceType ?? neuronx_1.NeuronxInstanceType.INF2_48XLARGE;
        const batchEnvironment = new aws_batch_1.NeuronxBatchComputeEnvironment(this, "ComputeEnvironment", {
            vpc: props.vpc,
            vpcSubnets: props.vpcSubnets,
            instanceTypes: [target.instanceType],
            useOptimalInstanceClasses: false,
            launchTemplate: template,
            spot: props.spot,
        });
        aws_cdk_lib_1.Tags.of(batchEnvironment).add("Name", "neuronx-compile-worker");
        return {
            computeEnvironment: batchEnvironment,
            instanceRole: batchEnvironment.instanceRole,
        };
    }
    /**
     * Create the Neuronx Batch ECS job definition that runs the compile command.
     *
     * @param props construct props; `neuronxInstanceType`, `image`, `bucket`,
     *   `environment`, `command` and `secrets` are read here.
     * @returns the new job definition.
     */
    createJobDefinition(props) {
        const target = props.neuronxInstanceType ?? neuronx_1.NeuronxInstanceType.INF2_48XLARGE;
        // Reserve 95% of the instance memory, rounded up to a whole MiB.
        const memoryMiB = Math.ceil(target.memory.toMebibytes() * 0.95);
        const jobEnvironment = {
            NEURON_COMPILE_CACHE_URL: `${props.bucket.s3UrlForObject("neuron-compile-cache")}`,
            // Spread last so caller-supplied variables override the one above.
            ...props.environment,
        };
        return new aws_batch_1.NeuronxBatchEcsJobDefinition(this, "JobDefinition", {
            neuronxInstanceType: target,
            image: props.image.image,
            memory: aws_cdk_lib_1.Size.mebibytes(memoryMiB),
            cpu: target.vCpu,
            environment: jobEnvironment,
            command: props.command,
            secrets: props.secrets,
        });
    }
}
exports.NeuronxNativeCompiler = NeuronxNativeCompiler;
// Attach the jsii type record (fully-qualified name and package version) to the class.
_a = JSII_RTTI_SYMBOL_1;
NeuronxNativeCompiler[_a] = { fqn: "aws-cdk-neuronx-patterns.NeuronxNativeCompiler", version: "0.3.0" };
|
|
69
|
+
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoibmV1cm9ueC1uYXRpdmUtY29tcGlsZXIuanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi8uLi8uLi9zcmMvYmFzZS9uZXVyb254LWNvbXBpbGVyL25ldXJvbngtbmF0aXZlLWNvbXBpbGVyLnRzIl0sIm5hbWVzIjpbXSwibWFwcGluZ3MiOiI7Ozs7O0FBQUEsNkNBQXlDO0FBRXpDLDJDQUEyQztBQUUzQyw0Q0FHc0I7QUFDdEIsd0NBSW9CO0FBQ3BCLG1FQU1pQztBQU9qQzs7O0dBR0c7QUFDSCxNQUFhLHFCQUFzQixTQUFRLDJDQUFtQjtJQUM1RCxZQUFZLEtBQWdCLEVBQUUsRUFBVSxFQUFFLEtBQWlDO1FBQ3pFLEtBQUssQ0FBQyxLQUFLLEVBQUUsRUFBRSxFQUFFLEtBQUssQ0FBQyxDQUFDO0lBQzFCLENBQUM7SUFFUyx3QkFBd0IsQ0FDaEMsS0FBK0I7UUFFL0IsTUFBTSxVQUFVLEdBQ2QsS0FBSyxDQUFDLFVBQVUsRUFBRSxXQUFXLEVBQUU7WUFDL0IsSUFBSSxDQUFDLElBQUksQ0FDUCxJQUFJLENBQUMsVUFBVSxDQUFDLFdBQVcsRUFBRTtnQkFDM0IscUNBQTJCLENBQUMsSUFBSSxDQUFDLFdBQVcsRUFBRTtnQkFDOUMscUNBQTJCLENBQUMsSUFBSSxDQUFDLFdBQVcsRUFBRSxDQUNqRCxDQUFDO1FBQ0osTUFBTSxjQUFjLEdBQUcsSUFBSSxHQUFHLENBQUMsY0FBYyxDQUFDLElBQUksRUFBRSxnQkFBZ0IsRUFBRTtZQUNwRSxZQUFZLEVBQUU7Z0JBQ1o7b0JBQ0UsVUFBVSxFQUFFLFdBQVc7b0JBQ3ZCLE1BQU0sRUFBRSxHQUFHLENBQUMsaUJBQWlCLENBQUMsR0FBRyxDQUFDLFVBQVUsRUFBRTt3QkFDNUMsVUFBVSxFQUFFLEdBQUcsQ0FBQyxtQkFBbUIsQ0FBQyxHQUFHO3dCQUN2QyxTQUFTLEVBQUUsSUFBSTtxQkFDaEIsQ0FBQztpQkFDSDthQUNGO1NBQ0YsQ0FBQyxDQUFDO1FBRUgsTUFBTSxtQkFBbUIsR0FDdkIsS0FBSyxDQUFDLG1CQUFtQixJQUFJLDZCQUFtQixDQUFDLGFBQWEsQ0FBQztRQUNqRSxNQUFNLGtCQUFrQixHQUFHLElBQUksMENBQThCLENBQzNELElBQUksRUFDSixvQkFBb0IsRUFDcEI7WUFDRSxHQUFHLEVBQUUsS0FBSyxDQUFDLEdBQUc7WUFDZCxVQUFVLEVBQUUsS0FBSyxDQUFDLFVBQVU7WUFDNUIsYUFBYSxFQUFFLENBQUMsbUJBQW1CLENBQUMsWUFBWSxDQUFDO1lBQ2pELHlCQUF5QixFQUFFLEtBQUs7WUFDaEMsY0FBYztZQUNkLElBQUksRUFBRSxLQUFLLENBQUMsSUFBSTtTQUNqQixDQUNGLENBQUM7UUFDRixrQkFBSSxDQUFDLEVBQUUsQ0FBQyxrQkFBa0IsQ0FBQyxDQUFDLEdBQUcsQ0FBQyxNQUFNLEVBQUUsd0JBQXdCLENBQUMsQ0FBQztRQUVsRSxPQUFPO1lBQ0wsa0JBQWtCO1lBQ2xCLFlBQVksRUFBRSxrQkFBa0IsQ0FBQyxZQUFZO1NBQzlDLENBQUM7SUFDSixDQUFDO0lBRVMsbUJBQW1CLENBQzNCLEtBQStCO1FBRS9CLE1BQU0sbUJBQW1CLEdBQ3ZCLEtBQUssQ0FBQyxtQkFBbUIsSUFBSSw2QkFBbUIsQ0FBQyxhQUFhLENBQUM7UUFDakUsT0FBTyxJQUFJLHdDQUE0QixDQUFDLElBQU
ksRUFBRSxlQUFlLEVBQUU7WUFDN0QsbUJBQW1CO1lBQ25CLEtBQUssRUFBRSxLQUFLLENBQUMsS0FBSyxDQUFDLEtBQUs7WUFDeEIsTUFBTSxFQUFFLGtCQUFJLENBQUMsU0FBUyxDQUNwQixJQUFJLENBQUMsSUFBSSxDQUFDLG1CQUFtQixDQUFDLE1BQU0sQ0FBQyxXQUFXLEVBQUUsR0FBRyxJQUFJLENBQUMsQ0FDM0Q7WUFDRCxHQUFHLEVBQUUsbUJBQW1CLENBQUMsSUFBSTtZQUM3QixXQUFXLEVBQUU7Z0JBQ1gsd0JBQXdCLEVBQUUsR0FBRyxLQUFLLENBQUMsTUFBTSxDQUFDLGNBQWMsQ0FBQyxzQkFBc0IsQ0FBQyxFQUFFO2dCQUNsRixHQUFHLEtBQUssQ0FBQyxXQUFXO2FBQ3JCO1lBQ0QsT0FBTyxFQUFFLEtBQUssQ0FBQyxPQUFPO1lBQ3RCLE9BQU8sRUFBRSxLQUFLLENBQUMsT0FBTztTQUN2QixDQUFDLENBQUM7SUFDTCxDQUFDOztBQXBFSCxzREFxRUMiLCJzb3VyY2VzQ29udGVudCI6WyJpbXBvcnQgeyBTaXplLCBUYWdzIH0gZnJvbSBcImF3cy1jZGstbGliXCI7XG5pbXBvcnQgKiBhcyBiYXRjaCBmcm9tIFwiYXdzLWNkay1saWIvYXdzLWJhdGNoXCI7XG5pbXBvcnQgKiBhcyBlYzIgZnJvbSBcImF3cy1jZGstbGliL2F3cy1lYzJcIjtcbmltcG9ydCB7IENvbnN0cnVjdCB9IGZyb20gXCJjb25zdHJ1Y3RzXCI7XG5pbXBvcnQge1xuICBOZXVyb254QmF0Y2hDb21wdXRlRW52aXJvbm1lbnQsXG4gIE5ldXJvbnhCYXRjaEVjc0pvYkRlZmluaXRpb24sXG59IGZyb20gXCIuLi9hd3MtYmF0Y2hcIjtcbmltcG9ydCB7XG4gIE5ldXJvbk9wdGltaXplZE1hY2hpbmVJbWFnZSxcbiAgTmV1cm9ueEluc3RhbmNlVHlwZSxcbiAgUHl0b3JjaFRyYWluaW5nTmV1cm9ueEltYWdlLFxufSBmcm9tIFwiLi4vbmV1cm9ueFwiO1xuaW1wb3J0IHtcbiAgSU5ldXJvbnhDb250YWluZXJJbWFnZSxcbiAgTmV1cm9ueENvbXBpbGVkTW9kZWwsXG4gIE5ldXJvbnhDb21waWxlckJhc2UsXG4gIE5ldXJvbnhDb21waWxlckJhc2VQcm9wcyxcbiAgQ29tcHV0ZUVudmlyb25tZW50UmVzdWx0LFxufSBmcm9tIFwiLi9uZXVyb254LWNvbXBpbGVyLWJhc2VcIjtcblxuLyoqXG4gKiBQcm9wcyBvZiBOZXVyb254TmF0aXZlQ29tcGlsZXIuXG4gKi9cbmV4cG9ydCBpbnRlcmZhY2UgTmV1cm9ueE5hdGl2ZUNvbXBpbGVyUHJvcHMgZXh0ZW5kcyBOZXVyb254Q29tcGlsZXJCYXNlUHJvcHMge31cblxuLyoqXG4gKiBOZXVyb254IGNvbXBpbGVyIGNvbnN0cnVjdC5cbiAqIENvbXBpbGUgdGhlIG1vZGVsIHRvIHdvcmsgd2l0aCBJbmZlcmVudGlhMiBhbmQgVHJhaW5pdW0xIGFuZCB1cGxvYWQgaXQgdG8gYW4gUzMgYnVja2V0LlxuICovXG5leHBvcnQgY2xhc3MgTmV1cm9ueE5hdGl2ZUNvbXBpbGVyIGV4dGVuZHMgTmV1cm9ueENvbXBpbGVyQmFzZSB7XG4gIGNvbnN0cnVjdG9yKHNjb3BlOiBDb25zdHJ1Y3QsIGlkOiBzdHJpbmcsIHByb3BzOiBOZXVyb254TmF0aXZlQ29tcGlsZXJQcm9wcykge1xuICAgIHN1cGVyKHNjb3BlLCBpZCwgcHJvcHMpO1xuICB9XG5cbiAgcH
JvdGVjdGVkIGNyZWF0ZUNvbXB1dGVFbnZpcm9ubWVudChcbiAgICBwcm9wczogTmV1cm9ueENvbXBpbGVyQmFzZVByb3BzLFxuICApOiBDb21wdXRlRW52aXJvbm1lbnRSZXN1bHQge1xuICAgIGNvbnN0IHZvbHVtZVNpemUgPVxuICAgICAgcHJvcHMudm9sdW1lU2l6ZT8udG9HaWJpYnl0ZXMoKSA/P1xuICAgICAgTWF0aC5jZWlsKFxuICAgICAgICB0aGlzLndlaWdodFNpemUudG9HaWJpYnl0ZXMoKSArXG4gICAgICAgICAgUHl0b3JjaFRyYWluaW5nTmV1cm9ueEltYWdlLnNpemUudG9HaWJpYnl0ZXMoKSArXG4gICAgICAgICAgTmV1cm9uT3B0aW1pemVkTWFjaGluZUltYWdlLnNpemUudG9HaWJpYnl0ZXMoKSxcbiAgICAgICk7XG4gICAgY29uc3QgbGF1bmNoVGVtcGxhdGUgPSBuZXcgZWMyLkxhdW5jaFRlbXBsYXRlKHRoaXMsIFwiTGF1bmNoVGVtcGxhdGVcIiwge1xuICAgICAgYmxvY2tEZXZpY2VzOiBbXG4gICAgICAgIHtcbiAgICAgICAgICBkZXZpY2VOYW1lOiBcIi9kZXYveHZkYVwiLFxuICAgICAgICAgIHZvbHVtZTogZWMyLkJsb2NrRGV2aWNlVm9sdW1lLmVicyh2b2x1bWVTaXplLCB7XG4gICAgICAgICAgICB2b2x1bWVUeXBlOiBlYzIuRWJzRGV2aWNlVm9sdW1lVHlwZS5HUDMsXG4gICAgICAgICAgICBlbmNyeXB0ZWQ6IHRydWUsXG4gICAgICAgICAgfSksXG4gICAgICAgIH0sXG4gICAgICBdLFxuICAgIH0pO1xuXG4gICAgY29uc3QgbmV1cm9ueEluc3RhbmNlVHlwZSA9XG4gICAgICBwcm9wcy5uZXVyb254SW5zdGFuY2VUeXBlID8/IE5ldXJvbnhJbnN0YW5jZVR5cGUuSU5GMl80OFhMQVJHRTtcbiAgICBjb25zdCBjb21wdXRlRW52aXJvbm1lbnQgPSBuZXcgTmV1cm9ueEJhdGNoQ29tcHV0ZUVudmlyb25tZW50KFxuICAgICAgdGhpcyxcbiAgICAgIFwiQ29tcHV0ZUVudmlyb25tZW50XCIsXG4gICAgICB7XG4gICAgICAgIHZwYzogcHJvcHMudnBjLFxuICAgICAgICB2cGNTdWJuZXRzOiBwcm9wcy52cGNTdWJuZXRzLFxuICAgICAgICBpbnN0YW5jZVR5cGVzOiBbbmV1cm9ueEluc3RhbmNlVHlwZS5pbnN0YW5jZVR5cGVdLFxuICAgICAgICB1c2VPcHRpbWFsSW5zdGFuY2VDbGFzc2VzOiBmYWxzZSxcbiAgICAgICAgbGF1bmNoVGVtcGxhdGUsXG4gICAgICAgIHNwb3Q6IHByb3BzLnNwb3QsXG4gICAgICB9LFxuICAgICk7XG4gICAgVGFncy5vZihjb21wdXRlRW52aXJvbm1lbnQpLmFkZChcIk5hbWVcIiwgXCJuZXVyb254LWNvbXBpbGUtd29ya2VyXCIpO1xuXG4gICAgcmV0dXJuIHtcbiAgICAgIGNvbXB1dGVFbnZpcm9ubWVudCxcbiAgICAgIGluc3RhbmNlUm9sZTogY29tcHV0ZUVudmlyb25tZW50Lmluc3RhbmNlUm9sZSxcbiAgICB9O1xuICB9XG5cbiAgcHJvdGVjdGVkIGNyZWF0ZUpvYkRlZmluaXRpb24oXG4gICAgcHJvcHM6IE5ldXJvbnhDb21waWxlckJhc2VQcm9wcyxcbiAgKTogYmF0Y2guSUpvYkRlZmluaXRpb24ge1xuICAgIGNvbnN0IG5ldXJvbnhJbnN0YW5jZVR5cGUgPVxuICAgICAgcHJvcHMubm
V1cm9ueEluc3RhbmNlVHlwZSA/PyBOZXVyb254SW5zdGFuY2VUeXBlLklORjJfNDhYTEFSR0U7XG4gICAgcmV0dXJuIG5ldyBOZXVyb254QmF0Y2hFY3NKb2JEZWZpbml0aW9uKHRoaXMsIFwiSm9iRGVmaW5pdGlvblwiLCB7XG4gICAgICBuZXVyb254SW5zdGFuY2VUeXBlLFxuICAgICAgaW1hZ2U6IHByb3BzLmltYWdlLmltYWdlLFxuICAgICAgbWVtb3J5OiBTaXplLm1lYmlieXRlcyhcbiAgICAgICAgTWF0aC5jZWlsKG5ldXJvbnhJbnN0YW5jZVR5cGUubWVtb3J5LnRvTWViaWJ5dGVzKCkgKiAwLjk1KSxcbiAgICAgICksXG4gICAgICBjcHU6IG5ldXJvbnhJbnN0YW5jZVR5cGUudkNwdSxcbiAgICAgIGVudmlyb25tZW50OiB7XG4gICAgICAgIE5FVVJPTl9DT01QSUxFX0NBQ0hFX1VSTDogYCR7cHJvcHMuYnVja2V0LnMzVXJsRm9yT2JqZWN0KFwibmV1cm9uLWNvbXBpbGUtY2FjaGVcIil9YCxcbiAgICAgICAgLi4ucHJvcHMuZW52aXJvbm1lbnQsXG4gICAgICB9LFxuICAgICAgY29tbWFuZDogcHJvcHMuY29tbWFuZCxcbiAgICAgIHNlY3JldHM6IHByb3BzLnNlY3JldHMsXG4gICAgfSk7XG4gIH1cbn1cblxuZXhwb3J0IHsgSU5ldXJvbnhDb250YWluZXJJbWFnZSwgTmV1cm9ueENvbXBpbGVkTW9kZWwgfTtcbiJdfQ==
|
|
@@ -349,5 +349,5 @@ class VllmEngineArgumentsParser {
|
|
|
349
349
|
}
|
|
350
350
|
exports.VllmEngineArgumentsParser = VllmEngineArgumentsParser;
|
|
351
351
|
_a = JSII_RTTI_SYMBOL_1;
|
|
352
|
-
VllmEngineArgumentsParser[_a] = { fqn: "aws-cdk-neuronx-patterns.VllmEngineArgumentsParser", version: "0.
|
|
352
|
+
VllmEngineArgumentsParser[_a] = { fqn: "aws-cdk-neuronx-patterns.VllmEngineArgumentsParser", version: "0.3.0" };
|
|
353
353
|
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"vllm-engine-argments.js","sourceRoot":"","sources":["../../../../src/base/server-engine/vllm-engine/vllm-engine-argments.ts"],"names":[],"mappings":";;;;;AAEA;;GAEG;AACH,IAAY,eAOX;AAPD,WAAY,eAAe;IACzB,kCAAe,CAAA;IACf,gCAAa,CAAA;IACb,sCAAmB,CAAA;IACnB,kCAAe,CAAA;IACf,wCAAqB,CAAA;IACrB,kCAAe,CAAA;AACjB,CAAC,EAPW,eAAe,+BAAf,eAAe,QAO1B;AAED;;GAEG;AACH,IAAY,UAaX;AAbD,WAAY,UAAU;IACpB,2BAAa,CAAA;IACb,uBAAS,CAAA;IACT,yCAA2B,CAAA;IAC3B,iCAAmB,CAAA;IACnB,6BAAe,CAAA;IACf,uCAAyB,CAAA;IACzB,6CAA+B,CAAA;IAC/B,2BAAa,CAAA;IACb,2CAA6B,CAAA;IAC7B,iCAAmB,CAAA;IACnB,+CAAiC,CAAA;IACjC,iDAAmC,CAAA;AACrC,CAAC,EAbW,UAAU,0BAAV,UAAU,QAarB;AAED;;GAEG;AACH,IAAY,QAyBX;AAzBD,WAAY,QAAQ;IAClB;;OAEG;IACH,yBAAa,CAAA;IACb;;OAEG;IACH,yBAAa,CAAA;IACb;;OAEG;IACH,+BAAmB,CAAA;IACnB;;OAEG;IACH,iCAAqB,CAAA;IACrB;;OAEG;IACH,2BAAe,CAAA;IACf;;OAEG;IACH,+BAAmB,CAAA;AACrB,CAAC,EAzBW,QAAQ,wBAAR,QAAQ,QAyBnB;AAED;;GAEG;AACH,IAAY,gBAGX;AAHD,WAAY,gBAAgB;IAC1B,iCAAa,CAAA;IACb,iCAAa,CAAA;AACf,CAAC,EAHW,gBAAgB,gCAAhB,gBAAgB,QAG3B;AAED;;GAEG;AACH,IAAY,qBAKX;AALD,WAAY,qBAAqB;IAC/B,sCAAa,CAAA;IACb,8CAAqB,CAAA;IACrB,kEAAyC,CAAA;IACzC,8CAAqB,CAAA;AACvB,CAAC,EALW,qBAAqB,qCAArB,qBAAqB,QAKhC;AAED;;GAEG;AACH,IAAY,eAGX;AAHD,WAAY,eAAe;IACzB,8CAA2B,CAAA;IAC3B,sCAAmB,CAAA;AACrB,CAAC,EAHW,eAAe,+BAAf,eAAe,QAG1B;AAED;;GAEG;AACH,IAAY,SAcX;AAdD,WAAY,SAAS;IACnB;;;OAGG;IACH,0BAAa,CAAA;IACb;;OAEG;IACH,0BAAa,CAAA;IACb;;OAEG;IACH,0CAA6B,CAAA;AAC/B,CAAC,EAdW,SAAS,yBAAT,SAAS,QAcpB;AAED;;GAEG;AACH,IAAY,0BAKX;AALD,WAAY,0BAA0B;IACpC,yCAAW,CAAA;IACX,uCAAS,CAAA;IACT,yCAAW,CAAA;IACX,qEAAuC,CAAA;AACzC,CAAC,EALW,0BAA0B,0CAA1B,0BAA0B,QAKrC;AAED;;GAEG;AACH,IAAY,SAMX;AAND,WAAY,SAAS;IACnB,6CAAU,CAAA;IACV,gDAAY,CAAA;IACZ,gDAAY,CAAA;IACZ,gDAAY,CAAA;IACZ,mDAAc,CAAA;AAChB,CAAC,EANW,SAAS,yBAAT,SAAS,QAMpB;AAED;;GAEG;AACH,IAAY,YAKX;AALD,WAAY,YAAY;IACtB,6BAAa,CAAA;IACb,2BAAW,CAAA;IACX,qCAAqB,CAAA;IACrB,qCAAqB,CAAA;AACvB,CAAC,EALW,YAAY,4BAAZ,YAAY,QAKvB;AAED;;GAEG;AACH,IAAY,qBASX;AATD,WAAY,qBAAqB
;IAC/B;;OAEG;IACH,4CAAmB,CAAA;IACnB;;OAEG;IACH,0CAAiB,CAAA;AACnB,CAAC,EATW,qBAAqB,qCAArB,qBAAqB,QAShC;AAED;;GAEG;AACH,IAAY,YA2BX;AA3BD,WAAY,YAAY;IACtB,6BAAa,CAAA;IACb,2BAAW,CAAA;IACX,2CAA2B,CAAA;IAC3B,qCAAqB,CAAA;IACrB,2BAAW,CAAA;IACX,qCAAqB,CAAA;IACrB,yCAAyB,CAAA;IACzB,qCAAqB,CAAA;IACrB,+BAAe,CAAA;IACf,iCAAiB,CAAA;IACjB,6BAAa,CAAA;IACb,iDAAiC,CAAA;IACjC,2CAA2B,CAAA;IAC3B,yCAAyB,CAAA;IACzB,6BAAa,CAAA;IACb,yDAAyC,CAAA;IACzC,6CAA6B,CAAA;IAC7B,2BAAW,CAAA;IACX,2BAAW,CAAA;IACX,6CAA6B,CAAA;IAC7B,6CAA6B,CAAA;IAC7B,6BAAa,CAAA;IACb,+BAAe,CAAA;IACf,uCAAuB,CAAA;IACvB,mCAAmB,CAAA;IACnB,6BAAa,CAAA;AACf,CAAC,EA3BW,YAAY,4BAAZ,YAAY,QA2BvB;AAED;;GAEG;AACH,IAAY,aAiBX;AAjBD,WAAY,aAAa;IACvB;;OAEG;IACH,8BAAa,CAAA;IACb;;OAEG;IACH,8BAAa,CAAA;IACb;;OAEG;IACH,oCAAmB,CAAA;IACnB;;OAEG;IACH,kCAAiB,CAAA;AACnB,CAAC,EAjBW,aAAa,6BAAb,aAAa,QAiBxB;AAED;;GAEG;AACH,IAAY,YAOX;AAPD,WAAY,YAAY;IACtB;;OAEG;IACH,6BAAa,CAAA;IACb,yBAAS,CAAA;IACT,mCAAmB,CAAA;AACrB,CAAC,EAPW,YAAY,4BAAZ,YAAY,QAOvB;AAED;;GAEG;AACH,IAAY,iBAEX;AAFD,WAAY,iBAAiB;IAC3B,gCAAW,CAAA;AACb,CAAC,EAFW,iBAAiB,iCAAjB,iBAAiB,QAE5B;AAED;;GAEG;AACH,IAAY,MAQX;AARD,WAAY,MAAM;IAChB,uBAAa,CAAA;IACb,uBAAa,CAAA;IACb,2BAAiB,CAAA;IACjB,qBAAW,CAAA;IACX,qBAAW,CAAA;IACX,qBAAW,CAAA;IACX,qBAAW,CAAA;AACb,CAAC,EARW,MAAM,sBAAN,MAAM,QAQjB;AAED;;GAEG;AACH,IAAY,SAIX;AAJD,WAAY,SAAS;IACnB,0BAAa,CAAA;IACb,gCAAmB,CAAA;IACnB,kCAAqB,CAAA;AACvB,CAAC,EAJW,SAAS,yBAAT,SAAS,QAIpB;AAED;;GAEG;AACH,IAAY,yBAIX;AAJD,WAAY,yBAAyB;IACnC,0CAAa,CAAA;IACb,8CAAiB,CAAA;IACjB,8CAAiB,CAAA;AACnB,CAAC,EAJW,yBAAyB,yCAAzB,yBAAyB,QAIpC;AAED;;GAEG;AACH,IAAY,cAUX;AAVD,WAAY,cAAc;IACxB,mDAAiC,CAAA;IACjC,qCAAmB,CAAA;IACnB,mCAAiB,CAAA;IACjB,uCAAqB,CAAA;IACrB,iCAAe,CAAA;IACf,6CAA2B,CAAA;IAC3B,qCAAmB,CAAA;IACnB,mDAAiC,CAAA;IACjC,uCAAqB,CAAA;AACvB,CAAC,EAVW,cAAc,8BAAd,cAAc,QAUzB;AAED;;GAEG;AACH,IAAY,QASX;AATD,WAAY,QAAQ;IAClB,yBAAa,CAAA;IACb,iCAAqB,CAAA;IACrB,mCAAuB,CAAA;IACvB,2BAAe,CAAA;IACf,iCAAqB,CAAA;IACrB,2BAAe,CAAA;IACf,6BAAiB,CAAA;IACjB,2CAA+B,CAAA;AACjC,CAAC,EATW,QAAQ,wBAAR,QAAQ,QASnB;AAED;;GAEG;A
ACH,IAAY,cAGX;AAHD,WAAY,cAAc;IACxB,yCAAuB,CAAA;IACvB,+BAAa,CAAA;AACf,CAAC,EAHW,cAAc,8BAAd,cAAc,QAGzB;AAED;;GAEG;AACH,IAAY,gBAGX;AAHD,WAAY,gBAAgB;IAC1B,iCAAa,CAAA;IACb,yCAAqB,CAAA;AACvB,CAAC,EAHW,gBAAgB,gCAAhB,gBAAgB,QAG3B;AAm7BD,MAAM,mBAAmB,GAAkC;IACzD,gBAAgB;IAChB,gBAAgB;IAChB,gBAAgB;CACjB,CAAC;AACF,MAAM,UAAU,GAAkC,CAAC,OAAO,EAAE,SAAS,CAAC,CAAC;AAEvE,MAAsB,yBAAyB;IAC7C;;;;;OAKG;IACH,MAAM,CAAC,MAAM,CAAC,IAAyB;QACrC,OAAO,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC;aACxB,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,GAAgC,CAAC,CAAC;aACzE,MAAM,CAA2B,CAAC,IAAI,EAAE,CAAC,GAAG,EAAE,KAAK,CAAC,EAAE,EAAE;YACvD,MAAM,CAAC,GAAG,GAAG,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;YACtE,IACE,mBAAmB,CAAC,QAAQ,CAAC,GAAgC,CAAC;gBAC9D,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,IAAI,OAAO,KAAK,KAAK,QAAQ,CAAC,EACpD,CAAC;gBACD,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;YAChC,CAAC;YACD,IAAI,CAAC,CAAC,CAAC,GAAG,KAAK,CAAC;YAChB,OAAO,IAAI,CAAC;QACd,CAAC,EAAE,EAAE,CAAC,CAAC;IACX,CAAC;IACD,MAAM,CAAC,GAAG,CAAC,IAAyB;QAClC,MAAM,WAAW,GAAG,MAAM,CAAC,OAAO,CAAC,IAAI,CAAC;aACrC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,EAAE,EAAE,CAAC,CAAC,UAAU,CAAC,QAAQ,CAAC,GAAgC,CAAC,CAAC;aACzE,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,CAAC,EAAE,EAAE;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,IAAI,KAAK,CAAC,WAAW,EAAE,EAAE,CAAC,CAAC;YACtE,IAAI,OAAO,KAAK,KAAK,SAAS,EAAE,CAAC;gBAC/B,OAAO,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;YACnC,CAAC;YACD,IAAI,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,EAAE,CAAC;gBACzB,OAAO,CAAC,KAAK,GAAG,EAAE,EAAE,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAC;YACnD,CAAC;YACD,IAAI,OAAO,KAAK,KAAK,QAAQ,EAAE,CAAC;gBAC9B,OAAO,CAAC,KAAK,GAAG,EAAE,EAAE,IAAI,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,CAAC;YAC7C,CAAC;YACD,OAAO,CAAC,KAAK,GAAG,EAAE,EAAE,GAAG,KAAK,EAAE,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;QACL,OAAO,CAAC,IAAI,CAAC,KAAM,EAAE,GAAG,WAAW,CAAC,CAAC;IACvC,CAAC;;AAvCH,8DAwCC","sourcesContent":["import { 
Secret } from \"aws-cdk-lib/aws-batch\";\n\n/**\n * Log level options for Uvicorn\n */\nexport enum UvicornLogLevel {\n  DEBUG = \"debug\",\n  INFO = \"info\",\n  WARNING = \"warning\",\n  ERROR = \"error\",\n  CRITICAL = \"critical\",\n  TRACE = \"trace\",\n}\n\n/**\n * Available model weight loading formats\n */\nexport enum LoadFormat {\n  AUTO = \"auto\",\n  PT = \"pt\",\n  SAFETENSORS = \"safetensors\",\n  NPCACHE = \"npcache\",\n  DUMMY = \"dummy\",\n  TENSORIZER = \"tensorizer\",\n  SHARDED_STATE = \"sharded_state\",\n  GGUF = \"gguf\",\n  BITSANDBYTES = \"bitsandbytes\",\n  MISTRAL = \"mistral\",\n  RUNAI_STREAMER = \"runai_streamer\",\n  FASTSAFETENSORS = \"fastsafetensors\",\n}\n\n/**\n * Data types for model weights and activations\n */\nexport enum DataType {\n  /**\n   * “auto” will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.\n   */\n  AUTO = \"auto\",\n  /**\n   * “half” for FP16. Recommended for AWQ quantization.\n   */\n  HALF = \"half\",\n  /**\n   * “float16” is the same as “half”.\n   */\n  FLOAT16 = \"float16\",\n  /**\n   * “bfloat16” for a balance between precision and range.\n   */\n  BFLOAT16 = \"bfloat16\",\n  /**\n   * “float” is shorthand for FP32 precision.\n   */\n  FLOAT = \"float\",\n  /**\n   * “float32” for FP32 precision.\n   */\n  FLOAT32 = \"float32\",\n}\n\n/**\n * The folder path to the generation config.\n */\nexport enum GenerationConfig {\n  AUTO = \"auto\",\n  VLLM = \"vllm\",\n}\n\n/**\n * Available guided decoding backends\n */\nexport enum GuidedDecodingBackend {\n  AUTO = \"auto\",\n  OUTLINES = \"outlines\",\n  LM_FORMAT_ENFORCER = \"lm-format-enforcer\",\n  XGRAMMAR = \"xgrammar\",\n}\n\n/**\n * Available reasoning parsers\n */\nexport enum ReasoningParser {\n  DEEPSEEK_R1 = \"deepseek_r1\",\n  GRANITE = \"granite\",\n}\n\n/**\n * Model implementation options\n */\nexport enum ModelImpl {\n  /**\n   * “auto” will try to use the vLLM implementation if it exists and fall back to the 
Transformers\n   * implementation if no vLLM implementation is available.\n   */\n  AUTO = \"auto\",\n  /**\n   * “vllm” will use the vLLM model implementation.\n   */\n  VLLM = \"vllm\",\n  /**\n   * “transformers” will use the Transformers model implementation.\n   */\n  TRANSFORMERS = \"transformers\",\n}\n\n/**\n * Distributed execution backend options\n */\nexport enum DistributedExecutorBackend {\n  RAY = \"ray\",\n  MP = \"mp\",\n  UNI = \"uni\",\n  EXTERNAL_LAUNCHER = \"external_launcher\",\n}\n\n/**\n * Cache block size options in number of tokens\n */\nexport enum BlockSize {\n  SIZE_8 = 8,\n  SIZE_16 = 16,\n  SIZE_32 = 32,\n  SIZE_64 = 64,\n  SIZE_128 = 128,\n}\n\n/**\n * KV cache data type options\n */\nexport enum KvCacheDtype {\n  AUTO = \"auto\",\n  FP8 = \"fp8\",\n  FP8_E4M3 = \"fp8_e4m3\",\n  FP8_E5M2 = \"fp8_e5m2\",\n}\n\n/**\n * Hash algorithm options for prefix caching\n */\nexport enum PrefixCachingHashAlgo {\n  /**\n   * “builtin” is Python’s built-in hash.\n   */\n  BUILTIN = \"builtin\",\n  /**\n   * “sha256” is collision resistant but with certain overheads.\n   */\n  SHA256 = \"sha256\",\n}\n\n/**\n * Quantization methods\n */\nexport enum Quantization {\n  AQLM = \"aqlm\",\n  AWQ = \"awq\",\n  DEEPSPEEDFP = \"deepspeedfp\",\n  TPU_INT8 = \"tpu_int8\",\n  FP8 = \"fp8\",\n  PTPC_FP8 = \"ptpc_fp8\",\n  FBGEMM_FP8 = \"fbgemm_fp8\",\n  MODELOPT = \"modelopt\",\n  NVFP4 = \"nvfp4\",\n  MARLIN = \"marlin\",\n  GGUF = \"gguf\",\n  GPTQ_MARLIN_24 = \"gptq_marlin_24\",\n  GPTQ_MARLIN = \"gptq_marlin\",\n  AWQ_MARLIN = \"awq_marlin\",\n  GPTQ = \"gptq\",\n  COMPRESSED_TENSORS = \"compressed-tensors\",\n  BITSANDBYTES = \"bitsandbytes\",\n  QQQ = \"qqq\",\n  HQQ = \"hqq\",\n  EXPERTS_INT8 = \"experts_int8\",\n  NEURON_QUANT = \"neuron_quant\",\n  IPEX = \"ipex\",\n  QUARK = \"quark\",\n  MOE_WNA16 = \"moe_wna16\",\n  TORCHAO = \"torchao\",\n  NONE = \"None\",\n}\n\n/**\n * Tokenizer mode options\n */\nexport enum TokenizerMode {\n  /**\n   * “auto” 
will use the fast tokenizer if available.\n   */\n  AUTO = \"auto\",\n  /**\n   * “slow” will always use the slow tokenizer.\n   */\n  SLOW = \"slow\",\n  /**\n   * “mistral” will always use the mistral_common tokenizer.\n   */\n  MISTRAL = \"mistral\",\n  /**\n   * “custom” will use –tokenizer to select the preregistered tokenizer.\n   */\n  CUSTOM = \"custom\",\n}\n\n/**\n * Model config format options\n */\nexport enum ConfigFormat {\n  /**\n   * “auto” will try to load the config in hf format if available else it will try to load in mistral format\n   */\n  AUTO = \"auto\",\n  HF = \"hf\",\n  MISTRAL = \"mistral\",\n}\n\n/**\n * Tokenizer pool type options\n */\nexport enum TokenizerPoolType {\n  RAY = \"ray\",\n}\n\n/**\n * Device type options for vLLM execution\n */\nexport enum Device {\n  AUTO = \"auto\",\n  CUDA = \"cuda\",\n  NEURON = \"neuron\",\n  CPU = \"cpu\",\n  TPU = \"tpu\",\n  XPU = \"xpu\",\n  HPU = \"hpu\",\n}\n\n/**\n * LoRA data type options\n */\nexport enum LoraDtype {\n  AUTO = \"auto\",\n  FLOAT16 = \"float16\",\n  BFLOAT16 = \"bfloat16\",\n}\n\n/**\n * Format options for rendering message content within a chat template\n */\nexport enum ChatTemplateContentFormat {\n  AUTO = \"auto\",\n  STRING = \"string\",\n  OPENAI = \"openai\",\n}\n\n/**\n * Tool call parser options\n */\nexport enum ToolCallParser {\n  GRANITE_20B_FC = \"granite-20b-fc\",\n  GRANITE = \"granite\",\n  HERMES = \"hermes\",\n  INTERNLM = \"internlm\",\n  JAMBA = \"jamba\",\n  LLAMA3_JSON = \"llama3_json\",\n  MISTRAL = \"mistral\",\n  PHI4_MINI_JSON = \"phi4_mini_json\",\n  PYTHONIC = \"pythonic\",\n}\n\n/**\n * Task options for model usage\n */\nexport enum VllmTask {\n  AUTO = \"auto\",\n  GENERATE = \"generate\",\n  EMBEDDING = \"embedding\",\n  EMBED = \"embed\",\n  CLASSIFY = \"classify\",\n  SCORE = \"score\",\n  REWARD = \"reward\",\n  TRANSCRIPTION = \"transcription\",\n}\n\n/**\n * Preemption mode.\n */\nexport enum PreemptionMode {\n  RECOMPUTE = 
\"recompute\",\n  SWAP = \"swap\",\n}\n\n/**\n * Scheduling policy options\n */\nexport enum SchedulingPolicy {\n  FCFS = \"fcfs\",\n  PRIORITY = \"priority\",\n}\n\n/**\n * VllmNamedArguments\n */\nexport interface VllmNamedArguments {\n  /**\n   * Host name.\n   */\n  readonly host?: string;\n\n  /**\n   * Port number.\n   * @default 8000\n   */\n  readonly port?: number;\n\n  /**\n   * Log level for uvicorn.\n   * @default UvicornLogLevel.INFO\n   */\n  readonly uvicornLogLevel?: UvicornLogLevel;\n\n  /**\n   * Disable uvicorn access log.\n   * @default false\n   */\n  readonly disableUvicornAccessLog?: boolean;\n\n  /**\n   * Allow credentials.\n   * @default false\n   */\n  readonly allowCredentials?: boolean;\n\n  /**\n   * Allowed origins.\n   * @default ['*']\n   */\n  readonly allowedOrigins?: string[];\n\n  /**\n   * Allowed methods.\n   * @default ['*']\n   */\n  readonly allowedMethods?: string[];\n\n  /**\n   * Allowed headers.\n   * @default ['*']\n   */\n  readonly allowedHeaders?: string[];\n\n  /**\n   * If provided, the server will require this key to be presented in the header.\n   */\n  readonly apiKey?: string;\n\n  /**\n   * LoRA module configurations.\n   * @example {\"name\": \"name\", \"path\": \"lora_path\", \"base_model_name\": \"id\"}\n   */\n  readonly loraModules?: { [key: string]: any };\n\n  /**\n   * Prompt adapter configurations in the format name=path. 
Multiple adapters can be specified.\n   */\n  readonly promptAdapters?: string[];\n\n  /**\n   * The file path to the chat template, or the template in single-line form for the specified model.\n   */\n  readonly chatTemplate?: string;\n\n  /**\n   * The format to render message content within a chat template.\n   * - “string” will render the content as a string.\n   *   - Example: `\"Hello World\"`\n   * - “openai” will render the content as a list of dictionaries, similar to OpenAI schema.\n   *   - Example: `[{\"type\": \"text\", \"text\": \"Hello world!\"}]`\n   * @default ChatTemplateContentFormat.AUTO\n   */\n  readonly chatTemplateContentFormat?: ChatTemplateContentFormat;\n\n  /**\n   * The role name to return if `request.add_generation_prompt=true`.\n   * @default \"assistant\"\n   */\n  readonly responseRole?: string;\n\n  /**\n   * The file path to the SSL key file.\n   */\n  readonly sslKeyfile?: string;\n\n  /**\n   * The file path to the SSL cert file.\n   */\n  readonly sslCertfile?: string;\n\n  /**\n   * The CA certificates file.\n   */\n  readonly sslCaCerts?: string;\n\n  /**\n   * Refresh SSL Context when SSL certificate files change.\n   * @default false\n   */\n  readonly enableSslRefresh?: boolean;\n\n  /**\n   * Whether client certificate is required (see stdlib ssl module's).\n   * @default 0\n   */\n  readonly sslCertReqs?: number;\n\n  /**\n   * FastAPI root_path when app is behind a path based routing proxy.\n   */\n  readonly rootPath?: string;\n\n  /**\n   * Additional ASGI middleware to apply to the app.\n   * We accept multiple –middleware arguments. 
The value should be an import path.\n   * If a function is provided, vLLM will add it to the server using `@app.middleware('http')`.\n   * If a class is provided, vLLM will add it to the server using `app.add_middleware()`.\n   * @default []\n   */\n  readonly middleware?: string[];\n\n  /**\n   * When `--max-logprobs` is specified,\n   * represents single tokens as strings of the form 'token_id:{token_id}' so that tokens that are not JSON-encodable can be identified..\n   * @default false\n   */\n  readonly returnTokensAsTokenIds?: boolean;\n\n  /**\n   * If specified, will run the OpenAI frontend server in the same process as the model serving engine.\n   * @default false\n   */\n  readonly disableFrontendMultiprocessing?: boolean;\n\n  /**\n   * If specified, API server will add X-Request-Id header to responses.\n   *\n   * Caution: this hurts performance at high QPS.\n   * @default false\n   */\n  readonly enableRequestIdHeaders?: boolean;\n\n  /**\n   * Enable auto tool choice for supported models.\n   * Use `--tool-call-parser` to specify which parser to use.\n   * @default false\n   */\n  readonly enableAutoToolChoice?: boolean;\n\n  /**\n   * Select the tool call parser depending on the model that you’re using.\n   * This is used to parse the model-generated tool call into OpenAI API format.\n   *\n   * Required for `--enable-auto-tool-choice`.\n   */\n  readonly toolCallParser?: ToolCallParser;\n\n  /**\n   * Specify the tool parser plugin.\n   * @default \"\"\n   */\n  readonly toolParserPlugin?: string;\n\n  /**\n   * Name or path of the huggingface model to use.\n   * @default \"facebook/opt-125m\"\n   */\n  readonly model?: string;\n\n  /**\n   * The task to use the model for.\n   * Each vLLM instance only supports one task, even if the same model can be used for multiple tasks.\n   * When the model only supports one task, \"auto\" can be used to select it; otherwise,\n   * you must specify explicitly which task to use.\n   * @default VllmTask.AUTO\n   
*/\n  readonly task?: VllmTask;\n\n  /**\n   * Name or path of the huggingface tokenizer to use.\n   * If unspecified, model name or path will be used.\n   */\n  readonly tokenizer?: string;\n\n  /**\n   * Name or path of the huggingface config to use.\n   * If unspecified, model name or path will be used.\n   */\n  readonly hfConfigPath?: string;\n\n  /**\n   * Skip initialization of tokenizer and detokenizer.\n   * Expects valid prompt_token_ids and None for prompt from the input.\n   * The generated output will contain token ids.\n   * @default false\n   */\n  readonly skipTokenizerInit?: boolean;\n\n  /**\n   * The specific model version to use. It can be a branch name, a tag name, or a commit id.\n   * If unspecified, will use the default version.\n   */\n  readonly revision?: string;\n\n  /**\n   * The specific revision to use for the model code on Hugging Face Hub.\n   * It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.\n   */\n  readonly codeRevision?: string;\n\n  /**\n   * Revision of the huggingface tokenizer to use.\n   * It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.\n   */\n  readonly tokenizerRevision?: string;\n\n  /**\n   * The tokenizer mode.\n   * @default TokenizerMode.AUTO\n   */\n  readonly tokenizerMode?: TokenizerMode;\n\n  /**\n   * Trust remote code from huggingface.\n   * @default false\n   */\n  readonly trustRemoteCode?: boolean;\n\n  /**\n   * Allowing API requests to read local images or videos from directories specified by the server file system.\n   * This is a security risk. 
Should only be enabled in trusted environments.\n   */\n  readonly allowedLocalMediaPath?: string;\n\n  /**\n   * The format of the model config to load.\n   * @default ConfigFormat.AUTO\n   */\n  readonly configFormat?: ConfigFormat;\n\n  /**\n   * Data type for model weights and activations.\n   * @default DataType.AUTO\n   */\n  readonly dtype?: DataType;\n\n  /**\n   * Model context length.\n   */\n  readonly maxModelLen?: number;\n\n  /**\n   * Optional regex pattern specifying valid logits processor qualified names that can be passed\n   * with the logits_processors extra completion argument. Defaults to None, which allows no processors.\n   */\n  readonly logitsProcessorPattern?: string;\n\n  /**\n   * Which implementation of the model to use.\n   * @default ModelImpl.AUTO\n   */\n  readonly modelImpl?: ModelImpl;\n\n  /**\n   * Disables sliding window, capping to sliding window size.\n   * @default false\n   */\n  readonly disableSlidingWindow?: boolean;\n\n  /**\n   * Block manager v1 has been removed and SelfAttnBlockSpaceManager (i.e. 
block manager v2) is now the default.\n   * @default true\n   * @deprecated Setting this flag to True or False has no effect on vLLM behavior.\n   */\n  readonly useV2BlockManager?: boolean;\n\n  /**\n   * Random seed for operations.\n   */\n  readonly seed?: number;\n\n  /**\n   * Max number of log probs to return logprobs is specified in SamplingParams.\n   * @default 20\n   */\n  readonly maxLogprobs?: number;\n\n  /**\n   * Disable logging statistics.\n   * @default false\n   */\n  readonly disableLogStats?: boolean;\n\n  /**\n   * Method used to quantize the weights.\n   * If None, we first check the quantization_config attribute in the model config file.\n   * If that is None, we assume the model weights are not quantized and use dtype to determine the data type of the weights.\n   */\n  readonly quantization?: Quantization;\n\n  /**\n   * RoPE scaling configuration in JSON format.\n   * @example {\"rope_type\":\"dynamic\",\"factor\":2.0}\n   */\n  readonly ropeScaling?: { [key: string]: any };\n\n  /**\n   * RoPE theta. 
Use with rope_scaling.\n   * In some cases, changing the RoPE theta improves the performance of the scaled model.\n   */\n  readonly ropeTheta?: number;\n\n  /**\n   * The token to use as HTTP bearer authorization for remote files.\n   * If provided, the Secret will be passed as HF_TOKEN secret to compile environment.\n   */\n  readonly hfToken?: Secret;\n\n  /**\n   * Extra arguments for the HuggingFace config.\n   * This should be a object that will be parsed into a dictionary.\n   */\n  readonly hfOverrides?: { [key: string]: any };\n\n  /**\n   * Always use eager-mode PyTorch.\n   * If False, will use eager mode and CUDA graph in hybrid for maximal performance and flexibility.\n   * @default false\n   */\n  readonly enforceEager?: boolean;\n\n  /**\n   * Maximum sequence length covered by CUDA graphs.\n   * When a sequence has context length larger than this, we fall back to eager mode.\n   * Additionally for encoder-decoder models, if the sequence length of the encoder input is larger than this,\n   * we fall back to the eager mode.\n   * @default 8192\n   */\n  readonly maxSeqLenToCapture?: number;\n\n  /**\n   * Overrides for the multimodal input mapping/processing, e.g., image processor.\n   * @example {\"num_crops\": 4}\n   */\n  readonly mmProcessorKwargs?: { [key: string]: any };\n\n  /**\n   * If true, then disables caching of the multi-modal preprocessor/mapper. 
(not recommended)\n   * @default false\n   */\n  readonly disableMmPreprocessorCache?: boolean;\n\n  /**\n   * The pattern(s) to ignore when loading the model.Default to original/**\\/* to avoid\n   * repeated loading of llama’s checkpoints.\n   * @default []\n   */\n  readonly ignorePatterns?: string[];\n\n  /**\n   * The model name(s) used in the API.\n   * If multiple names are provided, the server will respond to any of the provided names.\n   * The model name in the model field of a response will be the first name in this list.\n   * If not specified, the model name will be the same as the `--model` argument.\n   * Noted that this name(s) will also be used in model_name tag content of prometheus metrics,\n   * if multiple names provided, metrics tag will take the first one.\n   */\n  readonly servedModelName?: string[];\n\n  /**\n   * Name or path of the QLoRA adapter.\n   */\n  readonly qloraAdapterNameOrPath?: string;\n\n  /**\n   * Enable deprecated Prometheus metrics that have been hidden since the specified version.\n   * For example, if a previously deprecated metric has been hidden since the v0.7.0 release,\n   * you use –show-hidden-metrics-for-version=0.7 as a temporary escape hatch while you migrate to new metrics.\n   * The metric is likely to be removed completely in an upcoming release.\n   */\n  readonly showHiddenMetricsForVersion?: string;\n\n  /**\n   * Target URL to which OpenTelemetry traces will be sent.\n   */\n  readonly otlpTracesEndpoint?: string;\n\n  /**\n   * Valid choices are model,worker,all.\n   * It makes sense to set this only if --otlp-traces-endpoint is set.\n   * If set, it will collect detailed traces for the specified modules.\n   * This involves use of possibly costly and or blocking operations and hence might have a performance impact.\n   */\n  readonly collectDetailedTraces?: string;\n\n  /**\n   * Disable async output processing. 
This may result in lower performance.\n   * @default false\n   */\n  readonly disableAsyncOutputProc?: boolean;\n\n  /**\n   * The scheduler class to use.\n   * @default \"vllm.core.scheduler.Scheduler\"\n   */\n  readonly schedulerCls?: string;\n\n  /**\n   * Override or set neuron device configuration.\n   * @example {\"cast_logits_dtype\": \"bloat16\"}\n   */\n  readonly overrideNeuronConfig?: { [key: string]: any };\n\n  /**\n   * Override or set the pooling method for pooling models.\n   * @example {\"pooling_type\": \"mean\", \"normalize\": false}\n   */\n  readonly overridePoolerConfig?: { [key: string]: any };\n\n  /**\n   * torch.compile configuration for the model.\n   * When it is a number (0, 1, 2, 3), it will be interpreted as the optimization level.\n   *\n   * NOTE: level 0 is the default level without any optimization.\n   * level 1 and 2 are for internal testing only. level 3 is the recommended level for production.\n   * To specify the full compilation config, use a JSON string,\n   * e.g. `{\"level\": 3, \"cudagraph_capture_sizes\": [1, 2, 4, 8]}` Following the convention of traditional compilers,\n   * using -O without space is also supported. -O3 is equivalent to -O 3.\n   */\n  readonly compilationConfig?: { [key: string]: any };\n\n  /**\n   * Configurations for distributed KV cache transfer in object.\n   */\n  readonly kvTransferConfig?: { [key: string]: any };\n\n  /**\n   * The worker class to use for distributed execution.\n   * @default \"auto\"\n   */\n  readonly workerCls?: string;\n\n  /**\n   * The worker extension class.\n   * @default \"\"\n   */\n  readonly workerExtensionCls?: string;\n\n  /**\n   * The folder path to the generation config. Defaults to ‘auto’,\n   * the generation config will be loaded from model path. 
If set to ‘vllm’,\n   * no generation config is loaded, vLLM defaults will be used.\n   * If set to a folder path, the generation config will be loaded from the specified folder path.\n   * If max_new_tokens is specified in generation config,\n   * then it sets a server-wide limit on the number of output tokens for all requests.\n   * @default \"auto\"\n   */\n  readonly generationConfig?: string;\n\n  /**\n   * Overrides or sets generation config.\n   * If used with –generation-config=auto, the override parameters will be merged with the default config from the model.\n   * If generation-config is None, only the override parameters are used.\n   * @example {\"temperature\": 0.5}\n   */\n  readonly overrideGenerationConfig?: { [key: string]: any };\n\n  /**\n   * Enable sleep mode for the engine. (only cuda platform is supported)\n   * @default false\n   */\n  readonly enableSleepMode?: boolean;\n\n  /**\n   * Additional config for specified platform.\n   * Different platforms may support different configs.\n   * Make sure the configs are valid for the platform you are using.\n   * The input format is like ‘{“config_key”:”config_value”}’\n   */\n  readonly additionalConfig?: { [key: string]: any };\n\n  /**\n   * Enable reasoning_content for the model.\n   * @default false\n   */\n  readonly enableReasoning?: boolean;\n\n  /**\n   * Disable cascade attention for V1.\n   * @default false\n   */\n  readonly disableCascadeAttn?: boolean;\n\n  /**\n   * Disable logging requests.\n   * @default false\n   */\n  readonly disableLogRequests?: boolean;\n\n  /**\n   * Max number of prompt characters or prompt ID numbers in log.\n   */\n  readonly maxLogLen?: number;\n\n  /**\n   * Disable FastAPI's OpenAPI schema, Swagger UI, and ReDoc endpoint.\n   * @default false\n   */\n  readonly disableFastApiDocs?: boolean;\n\n  /**\n   * Enable prompt_tokens_details in usage.\n   * @default false\n   */\n  readonly enablePromptTokensDetails?: boolean;\n\n  /**\n   * Enable tracking 
server_load_metrics in the app state.\n   * @default false\n   */\n  readonly enableServerLoadTracking?: boolean;\n}\n\n/**\n * Configuration for loading the model weights.\n */\nexport interface VllmLoadConfig {\n  /**\n   * The format of the model weights to load:\n   * - “auto” will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.\n   * - “pt” will load the weights in the pytorch bin format.\n   * - “safetensors” will load the weights in the safetensors format.\n   * - “npcache” will load the weights in pytorch format and store a numpy cache to speed up the loading.\n   * - “dummy” will initialize the weights with random values, which is mainly for profiling.\n   * - “tensorizer” will use CoreWeave’s tensorizer library for fast weight loading. See the Tensorize vLLM Model script in the Examples section for more information.\n   * - “runai_streamer” will load the Safetensors weights using Run:ai Model Streamer.\n   * - “bitsandbytes” will load the weights using bitsandbytes quantization.\n   * - “sharded_state” will load weights from pre-sharded checkpoint files, supporting efficient loading of tensor-parallel models.\n   * - “gguf” will load weights from GGUF format files (details specified in ggml-org/ggml).\n   * - “mistral” will load weights from consolidated safetensors files used by Mistral models.\n   * @default LoadFormat.AUTO\n   */\n  readonly loadFormat?: LoadFormat;\n\n  /**\n   * Directory to download and load the weights, default to the default cache directory of Hugging Face.\n   */\n  readonly downloadDir?: string;\n\n  /**\n   * Extra config for model loader. 
This will be passed to the model loader corresponding to the chosen load_format.\n   * This should be a object that will be parsed into a dictionary.\n   * @default {}\n   */\n  readonly modelLoaderExtraConfig?: { [key: string]: any };\n\n  /**\n   * Whether to enable tqdm for showing progress bar when loading model weights.\n   * @default true\n   */\n  readonly useTqdmOnLoad?: boolean;\n}\n\n/**\n * Dataclass which contains the decoding strategy of the engine.\n */\nexport interface VllmDecodingConfig {\n  /**\n   * Which engine will be used for guided decoding (JSON schema / regex etc) by default.\n   * With “auto”, we will make opinionated choices based on request contents and what the backend libraries currently support,\n   * so the behavior is subject to change in each release.\n   * @default GuidedDecodingBackend.AUTO\n   */\n  readonly guidedDecodingBackend?: GuidedDecodingBackend;\n\n  /**\n   * Select the reasoning parser depending on the model that you’re using.\n   * This is used to parse the reasoning content into OpenAI API format. Required for –enable-reasoning.\n   */\n  readonly reasoningParser?: ReasoningParser;\n}\n\n/**\n * Configuration for the distributed execution.\n */\nexport interface VllmParallelConfig {\n  /**\n   * Backend to use for distributed model workers, either “ray” or “mp” (multiprocessing).\n   * If the product of pipeline_parallel_size and tensor_parallel_size is less than or equal to the number of GPUs available,\n   * “mp” will be used to keep processing on a single host. 
Otherwise, this will default to “ray” if Ray is installed and fail otherwise.\n   * Note that tpu and hpu only support Ray for distributed inference.\n   */\n  readonly distributedExecutorBackend?: DistributedExecutorBackend;\n\n  /**\n   * Number of pipeline parallel groups.\n   * @default 1\n   */\n  readonly pipelineParallelSize?: number;\n\n  /**\n   * Number of tensor parallel groups.\n   * @default 1\n   */\n  readonly tensorParallelSize?: number;\n\n  /**\n   * Number of data parallel groups.\n   * MoE layers will be sharded according to the product of the tensor parallel size and data parallel size.\n   * @default 1\n   */\n  readonly dataParallelSize?: number;\n\n  /**\n   * Use expert parallelism instead of tensor parallelism for MoE layers.\n   * @default false\n   */\n  readonly enableExpertParallel?: boolean;\n\n  /**\n   * Maximum number of parallal loading workers when loading model sequentially in multiple batches.\n   * To avoid RAM OOM when using tensor parallel and large models.\n   */\n  readonly maxParallelLoadingWorkers?: number;\n\n  /**\n   * Whether to profile Ray workers with nsight.\n   * @see https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html#profiling-nsight-profiler\n   * @default false\n   */\n  readonly rayWorkersUseNsight?: boolean;\n\n  /**\n   * Disable the custom all-reduce kernel and fall back to NCCL.\n   * @default false\n   */\n  readonly disableCustomAllReduce?: boolean;\n}\n\n/**\n * Configuration for the KV cache.\n */\nexport interface VllmCacheConfig {\n  /**\n   * Size of a contiguous cache block in number of tokens.\n   * This is ignored on neuron devices and set to –max-model-len. 
On CUDA devices, only block sizes up to 32 are supported.\n   * On HPU devices, block size defaults to 128.\n   */\n  readonly blockSize?: BlockSize;\n\n  /**\n   * The fraction of GPU memory to be used for the model executor, which can range from 0 to 1.\n   * For example, a value of 0.5 would imply 50% GPU memory utilization.\n   * If unspecified, will use the default value of 0.9. This is a per-instance limit,\n   * and only applies to the current vLLM instance.\n   * It does not matter if you have another vLLM instance running on the same GPU. For example,\n   * if you have two vLLM instances running on the same GPU, you can set the GPU memory utilization to 0.5 for each instance.\n   * @default 0.9\n   */\n  readonly gpuMemoryUtilization?: number;\n\n  /**\n   * Size of the CPU swap space per GPU (in GiB).\n   * @default 4\n   */\n  readonly swapSpace?: number;\n\n  /**\n   * Data type for kv cache storage. If “auto”, will use model data type.\n   * CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. ROCm (AMD GPU) supports fp8 (=fp8_e4m3).\n   * @default KvCacheDtype.AUTO\n   */\n  readonly kvCacheDtype?: KvCacheDtype;\n\n  /**\n   * Number of GPU blocks to use. This overrides the profiled num_gpu_blocks if specified.\n   * Does nothing if None. Used for testing preemption.\n   */\n  readonly numGpuBlocksOverride?: number;\n\n  /**\n   * Whether to enable prefix caching. Disabled by default for V0. Enabled by default for V1.\n   */\n  readonly enablePrefixCaching?: boolean;\n\n  /**\n   * Set the hash algorithm for prefix caching.\n   * @default PrefixCachingHashAlgo.BUILTIN\n   */\n  readonly prefixCachingHashAlgo?: PrefixCachingHashAlgo;\n\n  /**\n   * The space in GiB to offload to CPU, per GPU.\n   * Default is 0, which means no offloading. 
Intuitively,\n   * this argument can be seen as a virtual way to increase the GPU memory size.\n   * For example, if you have one 24 GB GPU and set this to 10, virtually you can think of it as a 34 GB GPU.\n   * Then you can load a 13B model with BF16 weight, which requires at least 26GB GPU memory.\n   *\n   * Note that this requires fast CPU-GPU interconnect,\n   * as part of the model is loaded from CPU memory to GPU memory on the fly in each model forward pass.\n   * @default 0\n   */\n  readonly cpuOffloadGb?: number;\n\n  /**\n   * This enables dynamic calculation of k_scale and v_scale when kv_cache_dtype is fp8.\n   * If False, the scales will be loaded from the model checkpoint if available. Otherwise, the scales will default to 1.0.\n   * @default false\n   */\n  readonly calculateKvScales?: boolean;\n}\n\n/**\n * Controls the behavior of multimodal models.\n */\nexport interface VllmMultiModalConfig {\n  /**\n   * The maximum number of input items allowed per prompt for each modality.\n   * This should be a object that will be parsed into a dictionary. Defaults to 1 (V0) or 999 (V1) for each modality.\n   * @default {}\n   */\n  readonly limitMmPerPrompt?: { [key: string]: any };\n}\n\n/**\n * Configuration for LoRA.\n */\nexport interface VllmLoraConfig {\n  /**\n   * If True, enable handling of LoRA adapters.\n   * @default false\n   */\n  readonly enableLora?: boolean;\n\n  /**\n   * If True, enable bias for LoRA adapters.\n   * @default false\n   */\n  readonly enableLoraBias?: boolean;\n\n  /**\n   * Max number of LoRAs in a single batch.\n   * @default 1\n   */\n  readonly maxLoras?: number;\n\n  /**\n   * Max LoRA rank.\n   * @default 16\n   */\n  readonly maxLoraRank?: number;\n\n  /**\n   * Maximum size of extra vocabulary that can be present in a LoRA adapter (added to the base model vocabulary).\n   * @default 256\n   */\n  readonly loraExtraVocabSize?: number;\n\n  /**\n   * Data type for LoRA. 
If auto, will default to base model dtype.\n   * @default LoraDtype.AUTO\n   */\n  readonly loraDtype?: LoraDtype;\n\n  /**\n   * Specify multiple scaling factors (which can be different from base model scaling factorsee eg. Long LoRA)\n   * to allow for multiple LoRA adapters trained with those scaling factors to be used at the same time.\n   * If not specified, only adapters trained with the base model scaling factor are allowed.\n   */\n  readonly longLoraScalingFactors?: number;\n\n  /**\n   * Maximum number of LoRAs to store in CPU memory. Must be >= than max_loras.\n   */\n  readonly maxCpuLoras?: number;\n\n  /**\n   * By default, only half of the LoRA computation is sharded with tensor parallelism.\n   * Enabling this will use the fully sharded layers.\n   * At high sequence length, max rank or tensor parallel size, this is likely faster.\n   * @default false\n   */\n  readonly fullyShardedLoras?: boolean;\n}\n\n/**\n * Configuration for PromptAdapters.\n */\nexport interface VllmPromptAdapterConfig {\n  /**\n   * If True, enable handling of PromptAdapters.\n   * @default false\n   */\n  readonly enablePromptAdapter?: boolean;\n\n  /**\n   * Max number of PromptAdapters in a batch.\n   * @default 1\n   */\n  readonly maxPromptAdapters?: number;\n\n  /**\n   * Max number of PromptAdapters tokens.\n   * @default 0\n   */\n  readonly maxPromptAdapterToken?: number;\n}\n\nexport interface VllmDeviceConfig {\n  /**\n   * Device type for vLLM execution.\n   * @default Device.AUTO\n   */\n  readonly device?: Device;\n}\n\n/**\n * Configuration for speculative decoding.\n */\nexport interface VllmSpeculativeConfig {\n  /**\n   * The configurations for speculative decoding. Should be a object.\n   */\n  readonly speculativeConfig?: { [key: string]: any };\n}\n\nexport interface VllmSchedulerConfig {\n  /**\n   * Maximum number of tokens to be processed in a single iteration.\n   *\n   * This config has no static default. 
If left unspecified by the user, it will be set in EngineArgs.create_engine_config based on the usage context.\n   */\n  readonly maxNumBatchedTokens?: number;\n\n  /**\n   * Maximum number of sequences to be processed in a single iteration.\n   *\n   * This config has no static default. If left unspecified by the user, it will be set in EngineArgs.create_engine_config based on the usage context.\n   */\n  readonly maxNumSeqs?: number;\n\n  /**\n   * For chunked prefill, the maximum number of sequences that can be partially prefilled concurrently.\n   * @default 1\n   */\n  readonly maxNumPartialPrefills?: number;\n\n  /**\n   * For chunked prefill, the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently.\n   * Setting this less than max_num_partial_prefills will allow shorter prompts to jump the queue in front of longer prompts in some cases, improving latency.\n   * @default 1\n   */\n  readonly maxLongPartialPrefills?: number;\n\n  /**\n   * For chunked prefill, a request is considered long if the prompt is longer than this number of tokens.\n   * @default 0\n   */\n  readonly longPrefillTokenThreshold?: number;\n\n  /**\n   * The number of slots to allocate per sequence per step,\n   * beyond the known token ids. 
This is used in speculative decoding to store KV activations of tokens\n   * which may or may not be accepted.\n   *\n   * NOTE: This will be replaced by speculative config in the future; it is present to enable correctness tests until then.\n   * @default 0\n   */\n  readonly numLookaheadSlots?: number;\n\n  /**\n   * Apply a delay (of delay factor multiplied by previous prompt latency) before scheduling next prompt.\n   * @default 0.0\n   */\n  readonly schedulerDelayFactor?: number;\n\n  /**\n   * Whether to perform preemption by swapping or recomputation.\n   * If not specified, we determine the mode as follows:\n   * We use recomputation by default since it incurs lower overhead than swapping.\n   * However, when the sequence group has multiple sequences (e.g., beam search),\n   * recomputation is not currently supported. In such a case, we use swapping instead.\n   */\n  readonly preemptionMode?: PreemptionMode;\n\n  /**\n   * Maximum number of forward steps per scheduler call.\n   * @default 1\n   */\n  readonly numSchedulerSteps?: number;\n\n  /**\n   * If False, then multi-step will stream outputs at the end of all steps\n   * @default true\n   */\n  readonly multiStepStreamOutputs?: boolean;\n\n  /**\n   * The scheduling policy to use:\n   * - “fcfs” means first come first served, i.e. 
requests are handled in order of arrival.\n   * - “priority” means requests are handled based on given priority (lower value means earlier handling) and time of arrival deciding any ties).\n   * @default SchedulingPolicy.FCFS\n   */\n  readonly schedulingPolicy?: SchedulingPolicy;\n\n  /**\n   * If True, prefill requests can be chunked based on the remaining max_num_batched_tokens.\n   */\n  readonly enableChunkedPrefill?: boolean;\n\n  /**\n   * If set to true and chunked prefill is enabled, we do not want to partially schedule a multimodal item.\n   * Only used in V1 This ensures that if a request has a mixed prompt (like text tokens TTTT followed by image tokens IIIIIIIIII)\n   * where only some image tokens can be scheduled (like TTTTIIIII, leaving IIIII),\n   * it will be scheduled as TTTT in one step and IIIIIIIIII in the next.\n   * @default false\n   */\n  readonly disableChunkedMmInput?: boolean;\n}\n\n/**\n * Interface for vLLM server command line arguments\n */\nexport interface VllmEngineArguments\n  extends\n    VllmNamedArguments,\n    VllmLoadConfig,\n    VllmDecodingConfig,\n    VllmParallelConfig,\n    VllmCacheConfig,\n    VllmMultiModalConfig,\n    VllmLoraConfig,\n    VllmPromptAdapterConfig,\n    VllmDeviceConfig,\n    VllmSpeculativeConfig,\n    VllmSchedulerConfig {}\n\nconst jsonValueProperties: (keyof VllmEngineArguments)[] = [\n  \"allowedOrigins\",\n  \"allowedHeaders\",\n  \"allowedMethods\",\n];\nconst ignoreKeys: (keyof VllmEngineArguments)[] = [\"model\", \"hfToken\"];\n\nexport abstract class VllmEngineArgumentsParser {\n  /**\n   * Convert vLLM engine arguments (camel case) to config (kebab case)\n   * @param args vLLM engine arguments\n   * @returns vLLM engine config\n   * @see https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#configuration-file\n   */\n  static config(args: VllmEngineArguments) {\n    return Object.entries(args)\n      .filter(([key]) => !ignoreKeys.includes(key as keyof VllmEngineArguments))\n 
     .reduce<{ [key in string]: any }>((prev, [key, value]) => {\n        const k = key.replace(/[A-Z]/g, (match) => `-${match.toLowerCase()}`);\n        if (\n          jsonValueProperties.includes(key as keyof VllmEngineArguments) ||\n          (!Array.isArray(value) && typeof value === \"object\")\n        ) {\n          value = JSON.stringify(value);\n        }\n        prev[k] = value;\n        return prev;\n      }, {});\n  }\n  static cli(args: VllmEngineArguments) {\n    const generalArgs = Object.entries(args)\n      .filter(([key]) => !ignoreKeys.includes(key as keyof VllmEngineArguments))\n      .flatMap(([k, value]) => {\n        const key = k.replace(/[A-Z]/g, (match) => `-${match.toLowerCase()}`);\n        if (typeof value === \"boolean\") {\n          return value ? [`--${key}`] : [];\n        }\n        if (Array.isArray(value)) {\n          return [`--${key}`, ...value.map((v) => `${v}`)];\n        }\n        if (typeof value === \"object\") {\n          return [`--${key}`, JSON.stringify(value)];\n        }\n        return [`--${key}`, `${value}`];\n      });\n    return [args.model!, ...generalArgs];\n  }\n}\n"]}
|
|
@@ -62,7 +62,7 @@ class SageMakerInferenceToolkitTnxCompiler extends constructs_1.Construct {
|
|
|
62
62
|
if (quantDtype) {
|
|
63
63
|
compiledArtifactPathPrefix = `${compiledArtifactPathPrefix}-quant${quantDtype}`;
|
|
64
64
|
}
|
|
65
|
-
const compiler = new neuronx_compiler_1.
|
|
65
|
+
const compiler = new neuronx_compiler_1.NeuronxNativeCompiler(this, "Resource", {
|
|
66
66
|
...props,
|
|
67
67
|
neuronxInstanceType: availableInstancePatterns[0].neuronxInstanceType,
|
|
68
68
|
artifactS3Prefix: "test",
|
|
@@ -94,4 +94,4 @@ class SageMakerInferenceToolkitTnxCompiler extends constructs_1.Construct {
|
|
|
94
94
|
}
|
|
95
95
|
}
|
|
96
96
|
exports.SageMakerInferenceToolkitTnxCompiler = SageMakerInferenceToolkitTnxCompiler;
|
|
97
|
-
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"sagemaker-inference-toolkit-tnx-compiler.js","sourceRoot":"","sources":["../../src/sagemaker-inference-toolkit-tnx/sagemaker-inference-toolkit-tnx-compiler.ts"],"names":[],"mappings":";;;AAAA,kEAAiE;AAKjE,2CAAuC;AACvC,+BAA4B;AAC5B,6CAYyB;AACzB,+DAIkC;AAwClC;;;;GAIG;AACH,MAAa,wCAAwC;IAGnD,YAAY,KAAgB,EAAE,EAAU,EAAE,YAA2B;QACnE,MAAM,KAAK,GAAG,IAAI,uCAAmB,CAAC,KAAK,EAAE,EAAE,EAAE;YAC/C,SAAS,EAAE,IAAA,WAAI,EACb,SAAS,EACT,uDAAuD,CACxD;YACD,SAAS,EAAE;gBACT,UAAU,EAAE,YAAY,CAAC,SAAS;gBAClC,SAAS,EAAE,YAAY,CAAC,QAAQ;aACjC;SACF,CAAC,CAAC;QACH,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,oBAAoB,EAAE,CAAC;QAC1C,IAAI,CAAC,gBAAgB,GAAG,YAAY,CAAC,gBAAgB,CAAC;IACxD,CAAC;CACF;AAjBD,4FAiBC;AAmDD;;GAEG;AACH,MAAa,oCAAqC,SAAQ,sBAAS;IAOjE,YACE,KAAgB,EAChB,EAAU,EACV,KAAgD;QAEhD,KAAK,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACjB,MAAM,QAAQ,GAAG,KAAK,CAAC,cAAc,EAAE,QAAQ,IAAI,kBAAQ,CAAC,YAAY,CAAC;QACzE,MAAM,SAAS,GAAG,KAAK,CAAC,cAAc,EAAE,SAAS,IAAI,CAAC,CAAC;QACvD,MAAM,UAAU,GAAG,KAAK,CAAC,cAAc,EAAE,UAAU,IAAI,IAAI,CAAC;QAC5D,MAAM,UAAU,GAAG,KAAK,CAAC,cAAc,EAAE,UAAU,CAAC;QACpD,MAAM,eAAe,GAAG,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM;YAChD,CAAC,CAAC,IAAA,6BAAmB,EACjB,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,kBAAkB,EAC7C,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,EACjC,sBAAY,CAAC,YAAY,EACzB,UAAU,EACV,SAAS,CACV;YACH,CAAC,CAAC,IAAA,4CAAkC,EAChC,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,EAC9B,UAAU,EACV,SAAS,CACV,CAAC;QACN,MAAM,oBAAoB,GAAG,KAAK,CAAC,mBAAmB;YACpD,CAAC,CAAC,CAAC,KAAK,CAAC,mBAAmB,CAAC;YAC7B,CAAC,CAAC;gBACE,6BAAmB,CAAC,YAAY;gBAChC,6BAAmB,CAAC,aAAa;gBACjC,6BAAmB,CAAC,aAAa;aAClC,CAAC;QACN,MAAM,yBAAyB,GAAG,oBAAoB;aACnD,OAAO,CAAC,CAAC,mBAAmB,EAAE,EAAE,CAC/B,IAAA,4BAAkB,EAChB,mBAAmB,EACnB,eAAe,EACf,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,cAAc,CAC3C,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACZ,mBAAmB;YACnB,GAAG,CAAC;SACL,CAAC,CAAC,CACJ;aACA,MAAM,CACL,CAAC,OAAO,EAAE,EAAE,CACV,CAAC,KAAK,CAAC,cAAc,EAAE,QAAQ;YAC/B,OAAO,CAAC,EAAE,KAAK,KAAK,CAAC,cAAc,CAAC,QAAQ,CAC/C,C
AAC;QACJ,IAAI,yBAAyB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3C,MAAM,IAAI,KAAK,CACb,wDAAwD,yBAAyB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CACjH,CAAC;QACJ,CAAC;QACD,MAAM,QAAQ,GAAG,yBAAyB,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACjD,MAAM,KAAK,GACT,KAAK,CAAC,KAAK;YACX,IAAI,wCAAwC,CAC1C,IAAI,EACJ,cAAc,EACd,qCAA2B,CAAC,MAAM,CACnC,CAAC;QACJ,IAAI,0BAA0B,GAAG,GAAG,KAAK,CAAC,KAAK,CAAC,OAAO,YAAY,KAAK,CAAC,gBAAgB,MAAM,QAAQ,MAAM,UAAU,OAAO,QAAQ,EAAE,CAAC;QACzI,IAAI,UAAY,EAAE,CAAC;YACjB,0BAA0B,GAAG,GAAG,0BAA0B,SAAS,UAAU,EAAE,CAAC;QAClF,CAAC;QACD,MAAM,QAAQ,GAAG,IAAI,kCAAe,CAAC,IAAI,EAAE,UAAU,EAAE;YACrD,GAAG,KAAK;YACR,mBAAmB,EAAE,yBAAyB,CAAC,CAAC,CAAC,CAAC,mBAAmB;YACrE,gBAAgB,EAAE,MAAM;YACxB,KAAK,EAAE,KAAK;YACZ,WAAW,EAAE;gBACX,QAAQ,EAAE,KAAK,CAAC,KAAK,CAAC,OAAO;gBAC7B,SAAS,EAAE,QAAQ,CAAC,QAAQ,EAAE;gBAC9B,WAAW,EAAE,UAAU,CAAC,QAAQ,EAAE;gBAClC,SAAS,EAAE,QAAQ,CAAC,QAAQ,EAAE;gBAC9B,WAAW,EAAE,UAAU,EAAE,QAAQ,EAAE,IAAI,EAAE;gBACzC,eAAe,EAAE,KAAK,CAAC,MAAM,CAAC,cAAc,CAC1C,0BAA0B,CAC3B;aACF;SACF,CAAC,CAAC;QACH,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAC7B,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAC7B,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;QACzB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IACD,OAAO;QACL,OAAO;YACL,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE;YAC1B,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC;IACJ,CAAC;CACF;AAtGD,oFAsGC","sourcesContent":["import { ContainerImageBuild } from \"@cdklabs/deploy-time-build\";\nimport { Size } from \"aws-cdk-lib\";\nimport * as ec2 from \"aws-cdk-lib/aws-ec2\";\nimport { ContainerImage } from \"aws-cdk-lib/aws-ecs\";\nimport { IBucket } from \"aws-cdk-lib/aws-s3\";\nimport { Construct } from \"constructs\";\nimport { join } from \"path\";\nimport {\n  calcMemoryFootprint,\n  calcTensorParallel,\n  DataTypeBits,\n  INeuronxImage,\n  INeuronxInstanceType,\n  
inferMemoryFootprintFromParameters,\n  Model,\n  NeuronxInstanceType,\n  OptLevel,\n  PytorchTrainingNeuronxImage,\n  QuantDtype,\n} from \"../base/neuronx\";\nimport {\n  INeuronxContainerImage,\n  NeuronxCompiledModel,\n  NeuronxCompiler,\n} from \"../base/neuronx-compiler\";\n\n/**\n * Compile options.\n */\nexport interface SageMakerInferenceToolkitTnxCompileOptions {\n  /**\n   * Number of tensor parallel groups.\n   * @default - calc from parameters and quantDtype\n   */\n  readonly tpDegree?: number;\n  /**\n   * Maximum number of sequences per iteration.\n   * @default 1\n   */\n  readonly batchSize?: number;\n  /**\n   * @default - No quant\n   */\n  readonly quantDtype?: QuantDtype;\n  /**\n   * @default 4096\n   */\n  readonly nPositions?: number;\n  /**\n   * @default OptLevel.BEST_BALANCE\n   */\n  readonly optLevel?: OptLevel;\n}\n\n/**\n * The model compiled by Neuronx compiler.\n */\nexport interface SageMakerInferenceToolkitTnxCompiledModel extends NeuronxCompiledModel {\n  readonly quantDtype?: QuantDtype;\n  readonly optLevel?: OptLevel;\n  readonly tpDegree: number;\n  readonly nPositions: number;\n}\n\n/**\n * Compile image for SageMakerInferenceToolkitTnxCompile.\n * @example\n * new SageMakerInferenceToolkitTnxCompileImage(scope, 'CompileImage', PytorchTrainingNeuronxImage.LATEST);\n */\nexport class SageMakerInferenceToolkitTnxCompileImage implements INeuronxContainerImage {\n  readonly image: ContainerImage;\n  readonly neuronSdkVersion: string;\n  constructor(scope: Construct, id: string, neruonxImage: INeuronxImage) {\n    const build = new ContainerImageBuild(scope, id, {\n      directory: join(\n        __dirname,\n        \"../../scripts/compile/sagemaker-inference-toolkit-tnx\",\n      ),\n      buildArgs: {\n        IMAGE_NAME: neruonxImage.imageName,\n        IMAGE_TAG: neruonxImage.imageTag,\n      },\n    });\n    this.image = build.toEcsDockerImageCode();\n    this.neuronSdkVersion = neruonxImage.neuronSdkVersion;\n  
}\n}\n\n/**\n * Props of NeuronxCompile.\n */\nexport interface SageMakerInferenceToolkitTnxCompilerProps {\n  /**\n   * VPC in which this will launch compile worker instance.\n   */\n  readonly vpc: ec2.IVpc;\n  /**\n   * The bucket to upload compiled artifacts.\n   */\n  readonly bucket: IBucket;\n  /**\n   * The model to be compiled.\n   */\n  readonly model: Model;\n  /**\n   * The instance type of compile worker instance.\n   */\n  readonly neuronxInstanceType?: INeuronxInstanceType;\n  /**\n   * The root volume of worker instance.\n   * @default - N bilion parameters * 5GiB EBS\n   */\n  readonly volumeSize?: Size;\n  /**\n   * Compile runtime image.\n   * @default - latest image\n   */\n  readonly image?: SageMakerInferenceToolkitTnxCompileImage;\n  /**\n   * Neuronx compile options.\n   * @default - Each properties are set default.\n   */\n  readonly compileOptions?: SageMakerInferenceToolkitTnxCompileOptions;\n  /**\n   * Whether or not to use spot instances. Spot instances are less expensive EC2 instances that can be reclaimed by EC2 at any time; your job will be given two minutes of notice before reclamation.\n   *\n   * @default false\n   */\n  readonly spot?: boolean;\n  /**\n   * The VPC Subnets this Compute Environment will launch instances in.\n   *\n   * @default - new subnets will be created\n   */\n  readonly vpcSubnets?: ec2.SubnetSelection;\n}\n\n/**\n * Neuronx compile construct. Compile the model to work with Inferentia2 and Trainium1 and upload it to an S3 bucket.\n */\nexport class SageMakerInferenceToolkitTnxCompiler extends Construct {\n  readonly quantDtype?: QuantDtype;\n  readonly optLevel?: OptLevel;\n  readonly tpDegree: number;\n  readonly nPositions: number;\n  model: Model;\n  private readonly compiler: NeuronxCompiler;\n  constructor(\n    scope: Construct,\n    id: string,\n    props: SageMakerInferenceToolkitTnxCompilerProps,\n  ) {\n    super(scope, id);\n    const optLevel = props.compileOptions?.optLevel ?? 
OptLevel.BEST_BALANCE;\n    const batchSize = props.compileOptions?.batchSize ?? 1;\n    const nPositions = props.compileOptions?.nPositions ?? 4096;\n    const quantDtype = props.compileOptions?.quantDtype;\n    const memoryFootprint = props.model.options.config\n      ? calcMemoryFootprint(\n          props.model.options.config.embeddingDimension,\n          props.model.options.config.layers,\n          DataTypeBits.BF16_OR_FP16,\n          nPositions,\n          batchSize,\n        )\n      : inferMemoryFootprintFromParameters(\n          props.model.options.parameters,\n          nPositions,\n          batchSize,\n        );\n    const neuronxInstanceTypes = props.neuronxInstanceType\n      ? [props.neuronxInstanceType]\n      : [\n          NeuronxInstanceType.INF2_8XLARGE,\n          NeuronxInstanceType.INF2_24XLARGE,\n          NeuronxInstanceType.INF2_48XLARGE,\n        ];\n    const availableInstancePatterns = neuronxInstanceTypes\n      .flatMap((neuronxInstanceType) =>\n        calcTensorParallel(\n          neuronxInstanceType,\n          memoryFootprint,\n          props.model.options.config?.attentionHeads,\n        ).map((v) => ({\n          neuronxInstanceType,\n          ...v,\n        })),\n      )\n      .filter(\n        (pattern) =>\n          !props.compileOptions?.tpDegree ||\n          pattern.tp === props.compileOptions.tpDegree,\n      );\n    if (availableInstancePatterns.length === 0) {\n      throw new Error(\n        `No available instance type. 
You can use tpDegree are ${availableInstancePatterns.map((p) => p.tp).join(\", \")}.`,\n      );\n    }\n    const tpDegree = availableInstancePatterns[0].tp;\n    const image =\n      props.image ??\n      new SageMakerInferenceToolkitTnxCompileImage(\n        this,\n        \"CompileImage\",\n        PytorchTrainingNeuronxImage.LATEST,\n      );\n    let compiledArtifactPathPrefix = `${props.model.modelId}/neuronx-${image.neuronSdkVersion}/tp${tpDegree}-np${nPositions}-opt${optLevel}`;\n    if (quantDtype!!) {\n      compiledArtifactPathPrefix = `${compiledArtifactPathPrefix}-quant${quantDtype}`;\n    }\n    const compiler = new NeuronxCompiler(this, \"Resource\", {\n      ...props,\n      neuronxInstanceType: availableInstancePatterns[0].neuronxInstanceType,\n      artifactS3Prefix: \"test\",\n      image: image,\n      environment: {\n        MODEL_ID: props.model.modelId,\n        TP_DEGREE: tpDegree.toString(),\n        N_POSITIONS: nPositions.toString(),\n        OPT_LEVEL: optLevel.toString(),\n        QUANT_DTYPE: quantDtype?.toString() ?? \"\",\n        ARTIFACT_S3_URL: props.bucket.s3UrlForObject(\n          compiledArtifactPathPrefix,\n        ),\n      },\n    });\n    this.tpDegree = tpDegree;\n    this.nPositions = nPositions;\n    this.quantDtype = quantDtype;\n    this.optLevel = optLevel;\n    this.model = props.model;\n    this.compiler = compiler;\n  }\n  compile(): SageMakerInferenceToolkitTnxCompiledModel {\n    return {\n      ...this.compiler.compile(),\n      tpDegree: this.tpDegree,\n      optLevel: this.optLevel,\n      nPositions: this.nPositions,\n      quantDtype: this.quantDtype,\n    };\n  }\n}\n"]}
|
|
97
|
+
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"sagemaker-inference-toolkit-tnx-compiler.js","sourceRoot":"","sources":["../../src/sagemaker-inference-toolkit-tnx/sagemaker-inference-toolkit-tnx-compiler.ts"],"names":[],"mappings":";;;AAAA,kEAAiE;AAKjE,2CAAuC;AACvC,+BAA4B;AAC5B,6CAYyB;AACzB,+DAIkC;AAwClC;;;;GAIG;AACH,MAAa,wCAAwC;IAGnD,YAAY,KAAgB,EAAE,EAAU,EAAE,YAA2B;QACnE,MAAM,KAAK,GAAG,IAAI,uCAAmB,CAAC,KAAK,EAAE,EAAE,EAAE;YAC/C,SAAS,EAAE,IAAA,WAAI,EACb,SAAS,EACT,uDAAuD,CACxD;YACD,SAAS,EAAE;gBACT,UAAU,EAAE,YAAY,CAAC,SAAS;gBAClC,SAAS,EAAE,YAAY,CAAC,QAAQ;aACjC;SACF,CAAC,CAAC;QACH,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,oBAAoB,EAAE,CAAC;QAC1C,IAAI,CAAC,gBAAgB,GAAG,YAAY,CAAC,gBAAgB,CAAC;IACxD,CAAC;CACF;AAjBD,4FAiBC;AAmDD;;GAEG;AACH,MAAa,oCAAqC,SAAQ,sBAAS;IAOjE,YACE,KAAgB,EAChB,EAAU,EACV,KAAgD;QAEhD,KAAK,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACjB,MAAM,QAAQ,GAAG,KAAK,CAAC,cAAc,EAAE,QAAQ,IAAI,kBAAQ,CAAC,YAAY,CAAC;QACzE,MAAM,SAAS,GAAG,KAAK,CAAC,cAAc,EAAE,SAAS,IAAI,CAAC,CAAC;QACvD,MAAM,UAAU,GAAG,KAAK,CAAC,cAAc,EAAE,UAAU,IAAI,IAAI,CAAC;QAC5D,MAAM,UAAU,GAAG,KAAK,CAAC,cAAc,EAAE,UAAU,CAAC;QACpD,MAAM,eAAe,GAAG,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM;YAChD,CAAC,CAAC,IAAA,6BAAmB,EACjB,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,kBAAkB,EAC7C,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,MAAM,EACjC,sBAAY,CAAC,YAAY,EACzB,UAAU,EACV,SAAS,CACV;YACH,CAAC,CAAC,IAAA,4CAAkC,EAChC,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU,EAC9B,UAAU,EACV,SAAS,CACV,CAAC;QACN,MAAM,oBAAoB,GAAG,KAAK,CAAC,mBAAmB;YACpD,CAAC,CAAC,CAAC,KAAK,CAAC,mBAAmB,CAAC;YAC7B,CAAC,CAAC;gBACE,6BAAmB,CAAC,YAAY;gBAChC,6BAAmB,CAAC,aAAa;gBACjC,6BAAmB,CAAC,aAAa;aAClC,CAAC;QACN,MAAM,yBAAyB,GAAG,oBAAoB;aACnD,OAAO,CAAC,CAAC,mBAAmB,EAAE,EAAE,CAC/B,IAAA,4BAAkB,EAChB,mBAAmB,EACnB,eAAe,EACf,KAAK,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,EAAE,cAAc,CAC3C,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACZ,mBAAmB;YACnB,GAAG,CAAC;SACL,CAAC,CAAC,CACJ;aACA,MAAM,CACL,CAAC,OAAO,EAAE,EAAE,CACV,CAAC,KAAK,CAAC,cAAc,EAAE,QAAQ;YAC/B,OAAO,CAAC,EAAE,KAAK,KAAK,CAAC,cAAc,CAAC,QAAQ,CAC/C,C
AAC;QACJ,IAAI,yBAAyB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3C,MAAM,IAAI,KAAK,CACb,wDAAwD,yBAAyB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CACjH,CAAC;QACJ,CAAC;QACD,MAAM,QAAQ,GAAG,yBAAyB,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACjD,MAAM,KAAK,GACT,KAAK,CAAC,KAAK;YACX,IAAI,wCAAwC,CAC1C,IAAI,EACJ,cAAc,EACd,qCAA2B,CAAC,MAAM,CACnC,CAAC;QACJ,IAAI,0BAA0B,GAAG,GAAG,KAAK,CAAC,KAAK,CAAC,OAAO,YAAY,KAAK,CAAC,gBAAgB,MAAM,QAAQ,MAAM,UAAU,OAAO,QAAQ,EAAE,CAAC;QACzI,IAAI,UAAY,EAAE,CAAC;YACjB,0BAA0B,GAAG,GAAG,0BAA0B,SAAS,UAAU,EAAE,CAAC;QAClF,CAAC;QACD,MAAM,QAAQ,GAAG,IAAI,wCAAqB,CAAC,IAAI,EAAE,UAAU,EAAE;YAC3D,GAAG,KAAK;YACR,mBAAmB,EAAE,yBAAyB,CAAC,CAAC,CAAC,CAAC,mBAAmB;YACrE,gBAAgB,EAAE,MAAM;YACxB,KAAK,EAAE,KAAK;YACZ,WAAW,EAAE;gBACX,QAAQ,EAAE,KAAK,CAAC,KAAK,CAAC,OAAO;gBAC7B,SAAS,EAAE,QAAQ,CAAC,QAAQ,EAAE;gBAC9B,WAAW,EAAE,UAAU,CAAC,QAAQ,EAAE;gBAClC,SAAS,EAAE,QAAQ,CAAC,QAAQ,EAAE;gBAC9B,WAAW,EAAE,UAAU,EAAE,QAAQ,EAAE,IAAI,EAAE;gBACzC,eAAe,EAAE,KAAK,CAAC,MAAM,CAAC,cAAc,CAC1C,0BAA0B,CAC3B;aACF;SACF,CAAC,CAAC;QACH,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAC7B,IAAI,CAAC,UAAU,GAAG,UAAU,CAAC;QAC7B,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;QACzB,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;QACzB,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IACD,OAAO;QACL,OAAO;YACL,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,EAAE;YAC1B,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC;IACJ,CAAC;CACF;AAtGD,oFAsGC","sourcesContent":["import { ContainerImageBuild } from \"@cdklabs/deploy-time-build\";\nimport { Size } from \"aws-cdk-lib\";\nimport * as ec2 from \"aws-cdk-lib/aws-ec2\";\nimport { ContainerImage } from \"aws-cdk-lib/aws-ecs\";\nimport { IBucket } from \"aws-cdk-lib/aws-s3\";\nimport { Construct } from \"constructs\";\nimport { join } from \"path\";\nimport {\n  calcMemoryFootprint,\n  calcTensorParallel,\n  DataTypeBits,\n  INeuronxImage,\n  INeuronxInstanceType,\n  
inferMemoryFootprintFromParameters,\n  Model,\n  NeuronxInstanceType,\n  OptLevel,\n  PytorchTrainingNeuronxImage,\n  QuantDtype,\n} from \"../base/neuronx\";\nimport {\n  INeuronxContainerImage,\n  NeuronxCompiledModel,\n  NeuronxNativeCompiler,\n} from \"../base/neuronx-compiler\";\n\n/**\n * Compile options.\n */\nexport interface SageMakerInferenceToolkitTnxCompileOptions {\n  /**\n   * Number of tensor parallel groups.\n   * @default - calc from parameters and quantDtype\n   */\n  readonly tpDegree?: number;\n  /**\n   * Maximum number of sequences per iteration.\n   * @default 1\n   */\n  readonly batchSize?: number;\n  /**\n   * @default - No quant\n   */\n  readonly quantDtype?: QuantDtype;\n  /**\n   * @default 4096\n   */\n  readonly nPositions?: number;\n  /**\n   * @default OptLevel.BEST_BALANCE\n   */\n  readonly optLevel?: OptLevel;\n}\n\n/**\n * The model compiled by Neuronx compiler.\n */\nexport interface SageMakerInferenceToolkitTnxCompiledModel extends NeuronxCompiledModel {\n  readonly quantDtype?: QuantDtype;\n  readonly optLevel?: OptLevel;\n  readonly tpDegree: number;\n  readonly nPositions: number;\n}\n\n/**\n * Compile image for SageMakerInferenceToolkitTnxCompile.\n * @example\n * new SageMakerInferenceToolkitTnxCompileImage(scope, 'CompileImage', PytorchTrainingNeuronxImage.LATEST);\n */\nexport class SageMakerInferenceToolkitTnxCompileImage implements INeuronxContainerImage {\n  readonly image: ContainerImage;\n  readonly neuronSdkVersion: string;\n  constructor(scope: Construct, id: string, neruonxImage: INeuronxImage) {\n    const build = new ContainerImageBuild(scope, id, {\n      directory: join(\n        __dirname,\n        \"../../scripts/compile/sagemaker-inference-toolkit-tnx\",\n      ),\n      buildArgs: {\n        IMAGE_NAME: neruonxImage.imageName,\n        IMAGE_TAG: neruonxImage.imageTag,\n      },\n    });\n    this.image = build.toEcsDockerImageCode();\n    this.neuronSdkVersion = neruonxImage.neuronSdkVersion;\n  
}\n}\n\n/**\n * Props of NeuronxCompile.\n */\nexport interface SageMakerInferenceToolkitTnxCompilerProps {\n  /**\n   * VPC in which this will launch compile worker instance.\n   */\n  readonly vpc: ec2.IVpc;\n  /**\n   * The bucket to upload compiled artifacts.\n   */\n  readonly bucket: IBucket;\n  /**\n   * The model to be compiled.\n   */\n  readonly model: Model;\n  /**\n   * The instance type of compile worker instance.\n   */\n  readonly neuronxInstanceType?: INeuronxInstanceType;\n  /**\n   * The root volume of worker instance.\n   * @default - N bilion parameters * 5GiB EBS\n   */\n  readonly volumeSize?: Size;\n  /**\n   * Compile runtime image.\n   * @default - latest image\n   */\n  readonly image?: SageMakerInferenceToolkitTnxCompileImage;\n  /**\n   * Neuronx compile options.\n   * @default - Each properties are set default.\n   */\n  readonly compileOptions?: SageMakerInferenceToolkitTnxCompileOptions;\n  /**\n   * Whether or not to use spot instances. Spot instances are less expensive EC2 instances that can be reclaimed by EC2 at any time; your job will be given two minutes of notice before reclamation.\n   *\n   * @default false\n   */\n  readonly spot?: boolean;\n  /**\n   * The VPC Subnets this Compute Environment will launch instances in.\n   *\n   * @default - new subnets will be created\n   */\n  readonly vpcSubnets?: ec2.SubnetSelection;\n}\n\n/**\n * Neuronx compile construct. Compile the model to work with Inferentia2 and Trainium1 and upload it to an S3 bucket.\n */\nexport class SageMakerInferenceToolkitTnxCompiler extends Construct {\n  readonly quantDtype?: QuantDtype;\n  readonly optLevel?: OptLevel;\n  readonly tpDegree: number;\n  readonly nPositions: number;\n  model: Model;\n  private readonly compiler: NeuronxNativeCompiler;\n  constructor(\n    scope: Construct,\n    id: string,\n    props: SageMakerInferenceToolkitTnxCompilerProps,\n  ) {\n    super(scope, id);\n    const optLevel = props.compileOptions?.optLevel ?? 
OptLevel.BEST_BALANCE;\n    const batchSize = props.compileOptions?.batchSize ?? 1;\n    const nPositions = props.compileOptions?.nPositions ?? 4096;\n    const quantDtype = props.compileOptions?.quantDtype;\n    const memoryFootprint = props.model.options.config\n      ? calcMemoryFootprint(\n          props.model.options.config.embeddingDimension,\n          props.model.options.config.layers,\n          DataTypeBits.BF16_OR_FP16,\n          nPositions,\n          batchSize,\n        )\n      : inferMemoryFootprintFromParameters(\n          props.model.options.parameters,\n          nPositions,\n          batchSize,\n        );\n    const neuronxInstanceTypes = props.neuronxInstanceType\n      ? [props.neuronxInstanceType]\n      : [\n          NeuronxInstanceType.INF2_8XLARGE,\n          NeuronxInstanceType.INF2_24XLARGE,\n          NeuronxInstanceType.INF2_48XLARGE,\n        ];\n    const availableInstancePatterns = neuronxInstanceTypes\n      .flatMap((neuronxInstanceType) =>\n        calcTensorParallel(\n          neuronxInstanceType,\n          memoryFootprint,\n          props.model.options.config?.attentionHeads,\n        ).map((v) => ({\n          neuronxInstanceType,\n          ...v,\n        })),\n      )\n      .filter(\n        (pattern) =>\n          !props.compileOptions?.tpDegree ||\n          pattern.tp === props.compileOptions.tpDegree,\n      );\n    if (availableInstancePatterns.length === 0) {\n      throw new Error(\n        `No available instance type. 
You can use tpDegree are ${availableInstancePatterns.map((p) => p.tp).join(\", \")}.`,\n      );\n    }\n    const tpDegree = availableInstancePatterns[0].tp;\n    const image =\n      props.image ??\n      new SageMakerInferenceToolkitTnxCompileImage(\n        this,\n        \"CompileImage\",\n        PytorchTrainingNeuronxImage.LATEST,\n      );\n    let compiledArtifactPathPrefix = `${props.model.modelId}/neuronx-${image.neuronSdkVersion}/tp${tpDegree}-np${nPositions}-opt${optLevel}`;\n    if (quantDtype!!) {\n      compiledArtifactPathPrefix = `${compiledArtifactPathPrefix}-quant${quantDtype}`;\n    }\n    const compiler = new NeuronxNativeCompiler(this, \"Resource\", {\n      ...props,\n      neuronxInstanceType: availableInstancePatterns[0].neuronxInstanceType,\n      artifactS3Prefix: \"test\",\n      image: image,\n      environment: {\n        MODEL_ID: props.model.modelId,\n        TP_DEGREE: tpDegree.toString(),\n        N_POSITIONS: nPositions.toString(),\n        OPT_LEVEL: optLevel.toString(),\n        QUANT_DTYPE: quantDtype?.toString() ?? \"\",\n        ARTIFACT_S3_URL: props.bucket.s3UrlForObject(\n          compiledArtifactPathPrefix,\n        ),\n      },\n    });\n    this.tpDegree = tpDegree;\n    this.nPositions = nPositions;\n    this.quantDtype = quantDtype;\n    this.optLevel = optLevel;\n    this.model = props.model;\n    this.compiler = compiler;\n  }\n  compile(): SageMakerInferenceToolkitTnxCompiledModel {\n    return {\n      ...this.compiler.compile(),\n      tpDegree: this.tpDegree,\n      optLevel: this.optLevel,\n      nPositions: this.nPositions,\n      quantDtype: this.quantDtype,\n    };\n  }\n}\n"]}
|
|
@@ -43,7 +43,7 @@ export interface BucketCompiledModelOptions extends CompiledModelOptions {
|
|
|
43
43
|
*/
|
|
44
44
|
export declare class SageMakerInferenceToolkitTnxSageMakerInferenceModelData {
|
|
45
45
|
static fromBucket(bucket: IBucket, prefix: string, options: BucketCompiledModelOptions): SageMakerInferenceToolkitTnxSageMakerInferenceModelData;
|
|
46
|
-
static
|
|
46
|
+
static fromNeuronxNativeCompiler(compiler: SageMakerInferenceToolkitTnxCompiler, code?: ISource): SageMakerInferenceToolkitTnxSageMakerInferenceModelData;
|
|
47
47
|
readonly bucket: IBucket;
|
|
48
48
|
readonly compiledArtifactS3Prefix: string;
|
|
49
49
|
readonly code: ISource;
|
|
@@ -48,7 +48,7 @@ class SageMakerInferenceToolkitTnxSageMakerInferenceModelData {
|
|
|
48
48
|
parameters: options.parameters,
|
|
49
49
|
});
|
|
50
50
|
}
|
|
51
|
-
static
|
|
51
|
+
static fromNeuronxNativeCompiler(compiler, code) {
|
|
52
52
|
const compiledModel = compiler.compile();
|
|
53
53
|
return new SageMakerInferenceToolkitTnxSageMakerInferenceModelData({
|
|
54
54
|
...compiledModel,
|
|
@@ -183,4 +183,4 @@ function inferenceModelDataDownloadTime(volumeSize) {
|
|
|
183
183
|
const seconds = volumeSize.toGibibytes() * 15;
|
|
184
184
|
return aws_cdk_lib_1.Duration.seconds(seconds < 3600 ? seconds : 3600);
|
|
185
185
|
}
|
|
186
|
-
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"sagemaker-inference-toolkit-tnx-sagemaker.js","sourceRoot":"","sources":["../../src/sagemaker-inference-toolkit-tnx/sagemaker-inference-toolkit-tnx-sagemaker.ts"],"names":[],"mappings":";;;AAAA,kEAAiE;AACjE,0DAA0D;AAC1D,6CAAoD;AAGpD,qEAIuC;AAEvC,2CAAoD;AACpD,+BAA4B;AAC5B,6CAWyB;AAwCzB;;GAEG;AACH,MAAa,uDAAuD;IAClE,MAAM,CAAC,UAAU,CACf,MAAe,EACf,MAAc,EACd,OAAmC;QAEnC,MAAM,UAAU,GAAG,OAAO,CAAC,cAAc,EAAE,UAAU,IAAI,IAAI,CAAC;QAC9D,MAAM,UAAU,GAAG,OAAO,CAAC,cAAc,EAAE,UAAU,CAAC;QACtD,MAAM,QAAQ,GAAG,kBAAQ,CAAC,YAAY,CAAC;QACvC,MAAM,SAAS,GAAG,OAAO,CAAC,cAAc,EAAE,SAAS,IAAI,CAAC,CAAC;QACzD,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM;YACpC,CAAC,CAAC,IAAA,6BAAmB,EACjB,OAAO,CAAC,MAAM,CAAC,kBAAkB,EACjC,OAAO,CAAC,MAAM,CAAC,MAAM,EACrB,sBAAY,CAAC,YAAY,EACzB,UAAU,EACV,SAAS,CACV;YACH,CAAC,CAAC,IAAA,4CAAkC,EAChC,OAAO,CAAC,UAAU,EAClB,UAAU,EACV,SAAS,CACV,CAAC;QACN,MAAM,oBAAoB,GAAG;YAC3B,6BAAmB,CAAC,YAAY;YAChC,6BAAmB,CAAC,aAAa;YACjC,6BAAmB,CAAC,aAAa;SAClC,CAAC;QACF,MAAM,yBAAyB,GAAG,oBAAoB;aACnD,OAAO,CAAC,CAAC,mBAAmB,EAAE,EAAE,CAC/B,IAAA,4BAAkB,EAChB,mBAAmB,EACnB,eAAe,EACf,OAAO,CAAC,MAAM,EAAE,cAAc,CAC/B,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACZ,mBAAmB;YACnB,GAAG,CAAC;SACL,CAAC,CAAC,CACJ;aACA,MAAM,CACL,CAAC,OAAO,EAAE,EAAE,CACV,CAAC,OAAO,CAAC,cAAc,EAAE,QAAQ;YACjC,OAAO,CAAC,EAAE,KAAK,OAAO,CAAC,cAAc,CAAC,QAAQ,CACjD,CAAC;QACJ,IAAI,yBAAyB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3C,MAAM,IAAI,KAAK,CACb,wDAAwD,yBAAyB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CACjH,CAAC;QACJ,CAAC;QACD,MAAM,QAAQ,GACZ,OAAO,CAAC,cAAc,EAAE,QAAQ,IAAI,yBAAyB,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACtE,OAAO,IAAI,uDAAuD,CAAC;YACjE,MAAM;YACN,wBAAwB,EAAE,MAAM;YAChC,UAAU;YACV,UAAU;YACV,QAAQ;YACR,QAAQ;YACR,IAAI,EAAE,OAAO,CAAC,IAAI;YAClB,aAAa,EAAE,OAAO,CAAC,aAAa;YACpC,UAAU,EAAE,OAAO,CAAC,UAAU;SAC/B,CAAC,CAAC;IACL,CAAC;IACD,MAAM,CAAC,mBAAmB,CACxB,QAA8C,EAC9C,IAAc;QAEd,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC;QACzC,OAAO,IAAI,uDAAuD,CAAC;YAC
jE,GAAG,aAAa;YAChB,QAAQ,EAAE,aAAa,CAAC,QAAQ,IAAI,kBAAQ,CAAC,YAAY;YACzD,wBAAwB,EAAE,aAAa,CAAC,QAAQ;YAChD,UAAU,EAAE,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU;YAC7C,IAAI;YACJ,WAAW,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,YAAa,CAAC;SAC3C,CAAC,CAAC;IACL,CAAC;IAaD,YAAoB,OAYnB;QACC,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,IAAI,CAAC,wBAAwB,GAAG,OAAO,CAAC,wBAAwB,CAAC;QACjE,IAAI,CAAC,IAAI;YACP,OAAO,CAAC,IAAI;gBACZ,0BAAM,CAAC,KAAK,CACV,IAAA,WAAI,EACF,SAAS,EACT,8DAA8D,CAC/D,CACF,CAAC;QACJ,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;QACjC,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACrC,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACrC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;QACjC,IAAI,CAAC,aAAa,GAAG,OAAO,CAAC,aAAa,CAAC;QAC3C,IAAI,CAAC,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,CAAC;QACzD,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACrC,IAAI,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC;IAC/C,CAAC;IAED,IAAI,CAAC,KAAgB,EAAE,KAAuB;QAC5C,MAAM,MAAM,GAAG,IAAI,oCAAgB,CAAC,KAAK,EAAE,gBAAgB,EAAE;YAC3D,iBAAiB,EAAE,IAAI,CAAC,MAAM;YAC9B,OAAO,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC;YACpB,oBAAoB,EAAE,IAAA,WAAI,EAAC,IAAI,CAAC,wBAAwB,EAAE,MAAM,CAAC;SAClE,CAAC,CAAC;QACH,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC;QAC/C,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;CACF;AAnID,0HAmIC;AA2CD;;GAEG;AACH,MAAa,8DAA+D,SAAQ,sBAAS;IAY3F,YACE,KAAgB,EAChB,EAAU,EACV,KAA0E;QAE1E,KAAK,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACjB,IAAI,KAA+B,CAAC;QACpC,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;QACtB,CAAC;aAAM,CAAC;YACN,MAAM,KAAK,GAAG,IAAI,uCAAmB,CAAC,IAAI,EAAE,qBAAqB,EAAE;gBACjE,SAAS,EAAE,IAAA,WAAI,EACb,SAAS,EACT,yDAAyD,CAC1D;aACF,CAAC,CAAC;YACH,KAAK,GAAG,SAAS,CAAC,cAAc,CAAC,iBAAiB,CAChD,KAAK,CAAC,UAAU,EAChB,KAAK,CAAC,QAAQ,CACf,CAAC;QACJ,CAAC;QACD,MAAM,YAAY,GAChB,KAAK,CAAC,YAAY;YAClB,IAAI,CAAC,4BAA4B,CAAC,KAAK,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QAC9D,MAAM,KAAK,GAAG,IAAI,SAAS,CAAC,KAAK,CAAC,IAAI,EAAE,OAAO,EAAE;YAC/C,UAAU,EAAE;gBACV;oBACE,KAAK;oBACL,WAAW,EAAE;wBACX,mBAAmB,EAAE,KAAK,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,EAAE;wBACxD,2BAA2
B,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,QAAQ,EAAE;wBACjD,4BAA4B,EAAE,IAAI,CAAC,KAAK,CACtC,YAAY,CAAC,gBAAgB,CAAC,YAAY;4BACxC,KAAK,CAAC,SAAS,CAAC,QAAQ,CAC3B,CAAC,QAAQ,EAAE;wBACZ,KAAK,EAAE,KAAK,CAAC,SAAS,CAAC,aAAa,IAAI,SAAS;wBACjD,iBAAiB,EACf,KAAK,CAAC,SAAS,CAAC,oBAAoB,IAAI,YAAY;wBACtD,SAAS,EAAE,KAAK,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,EAAE;wBAC9C,WAAW,EAAE,KAAK,CAAC,SAAS,CAAC,UAAU,CAAC,QAAQ,EAAE;wBAClD,SAAS,EAAE,KAAK,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,EAAE;wBAC9C,WAAW,EAAE,KAAK,CAAC,SAAS,CAAC,UAAU,EAAE,QAAQ,EAAE,IAAI,EAAE;wBACzD,GAAG,KAAK,CAAC,WAAW;qBACrB;iBACF;aACF;SACF,CAAC,CAAC;QACH,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAa,CAAC;QAC3D,QAAQ,CAAC,mBAAmB,CAC1B,qDAAqD,EACrD,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,cAAc,CACnC,mBAAK,CAAC,YAAY,CAAC,KAAK,CAAC,SAAS,CAAC,wBAAwB,CAAC;YAC1D,KAAK,CAAC,SAAS,CAAC,wBAAwB,CAAC,QAAQ,CAAC,GAAG,CAAC;YACtD,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,wBAAwB;YAC1C,CAAC,CAAC,GAAG,KAAK,CAAC,SAAS,CAAC,wBAAwB,GAAG,CACnD,CACF,CAAC;QACF,QAAQ,CAAC,mBAAmB,CAC1B,8DAA8D,EAC9D,UAAU,CACX,CAAC;QACF,QAAQ,CAAC,mBAAmB,CAC1B,+DAA+D,EAC/D,MAAM,CACP,CAAC;QACF,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QACxC,KAAK,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;QAClC,MAAM,cAAc,GAAG,IAAI,SAAS,CAAC,cAAc,CACjD,IAAI,EACJ,gBAAgB,EAChB;YACE,0BAA0B,EAAE;gBAC1B;oBACE,KAAK;oBACL,WAAW,EAAE,gBAAgB;oBAC7B,YAAY,EAAE,SAAS,CAAC,YAAY,CAAC,EAAE,CACrC,MAAM,YAAY,CAAC,QAAQ,EAAE,EAAE,CAChC;iBACF;aACF;SACF,CACF,CAAC;QACF,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,SAAS,CACrD,gBAAgB,CACI,CAAC;QACvB,MAAM,UAAU,GACd,KAAK,CAAC,UAAU;YAChB,kBAAI,CAAC,SAAS,CACZ,IAAI,CAAC,IAAI,CACP,KAAK,CAAC,SAAS,CAAC,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC,GAAG,GAAG;gBAC9C,CAAC,CAAC,GAAG;gBACL,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC,CAC/C,CACF,CAAC;QACJ,iBAAiB,CAAC,mBAAmB,CACnC,qCAAqC,EACrC,YAAY,CAAC,QAAQ,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAC3C,CAAC,CAAC,SAAS;YACX,CAAC,CAAC,UAAU,CAAC,WAAW,EAAE,CAC7B,CAAC;QACF,MAAM,wBAAwB,GAC5B,KAAK,CAAC,wBAAwB;YAC9B,8BAA8B,CAAC,UAAU,CAAC,CAAC;QAC7C,iBAAiB,CAAC,mBAAmB,CACnC,wDAAwD,EAC
xD,wBAAwB,CAAC,SAAS,EAAE,CACrC,CAAC;QACF,iBAAiB,CAAC,mBAAmB,CACnC,kEAAkE,EAClE,CACE,KAAK,CAAC,kCAAkC,IAAI,wBAAwB,CACrE,CAAC,SAAS,EAAE,CACd,CAAC;QACF,MAAM,QAAQ,GAAG,IAAI,SAAS,CAAC,QAAQ,CAAC,IAAI,EAAE,UAAU,EAAE;YACxD,cAAc;SACf,CAAC,CAAC;QACH,IAAI,CAAC,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACxC,IAAI,CAAC,YAAY,GAAG,QAAQ,CAAC,YAAY,CAAC;QAC1C,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IACD,WAAW,CAAC,OAAmB;QAC7B,OAAO,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAC5C,CAAC;IAEO,4BAA4B,CAAC,QAAgB;QACnD,MAAM,aAAa,GAAG;YACpB,6BAAmB,CAAC,YAAY;YAChC,6BAAmB,CAAC,aAAa;YACjC,6BAAmB,CAAC,aAAa;YACjC,6BAAmB,CAAC,aAAa;SAClC,CAAC;QACF,KAAK,MAAM,YAAY,IAAI,aAAa,EAAE,CAAC;YACzC,IACE,YAAY,CAAC,0BAA0B,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,IAAI,QAAQ,CAAC,EACpE,CAAC;gBACD,OAAO,YAAY,CAAC;YACtB,CAAC;QACH,CAAC;QACD,MAAM,IAAI,KAAK,CACb,wEAAwE,CACzE,CAAC;IACJ,CAAC;CACF;AA1JD,wIA0JC;AAED,SAAS,8BAA8B,CAAC,UAAgB;IACtD,MAAM,OAAO,GAAG,UAAU,CAAC,WAAW,EAAE,GAAG,EAAE,CAAC;IAC9C,OAAO,sBAAQ,CAAC,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;AAC3D,CAAC","sourcesContent":["import { ContainerImageBuild } from \"@cdklabs/deploy-time-build\";\nimport * as sagemaker from \"@aws-cdk/aws-sagemaker-alpha\";\nimport { Duration, Size, Token } from \"aws-cdk-lib\";\nimport { Grant, IGrantable } from \"aws-cdk-lib/aws-iam\";\nimport { IBucket } from \"aws-cdk-lib/aws-s3\";\nimport {\n  BucketDeployment,\n  ISource,\n  Source,\n} from \"aws-cdk-lib/aws-s3-deployment\";\nimport { CfnEndpointConfig, CfnModel } from \"aws-cdk-lib/aws-sagemaker\";\nimport { Construct, IDependable } from \"constructs\";\nimport { join } from \"path\";\nimport {\n  calcMemoryFootprint,\n  calcTensorParallel,\n  DataTypeBits,\n  INeuronxInstanceType,\n  inferMemoryFootprintFromParameters,\n  ModelConfig,\n  NeuronxInstanceType,\n  OptLevel,\n  Parameters,\n  QuantDtype,\n} from \"../base/neuronx\";\nimport {\n  SageMakerInferenceToolkitTnxCompileOptions,\n  SageMakerInferenceToolkitTnxCompiler,\n} from 
\"./sagemaker-inference-toolkit-tnx-compiler\";\n\n/**\n * Precompiled model options.\n */\nexport interface CompiledModelOptions {\n  /**\n   * Neuronx compile options.\n   * @default - Each properties are set default.\n   */\n  readonly compileOptions?: SageMakerInferenceToolkitTnxCompileOptions;\n  /**\n   * Code used for inference\n   * @default - using the predefined code\n   */\n  readonly code?: ISource;\n  /**\n   * Model ID or saved path\n   * @default \"./model\"\n   */\n  readonly modelIdOrPath?: string;\n  /**\n   * The path where compiled artifacts (i.e. xxx.neff) are stored\n   * @default \"./compiled\"\n   */\n  readonly compiledArtifactPath?: string;\n}\n\nexport interface BucketCompiledModelOptions extends CompiledModelOptions {\n  /**\n   * The number of parameters of model.\n   */\n  readonly parameters: Parameters;\n  readonly config?: ModelConfig;\n}\n\n/**\n * @deprecated\n */\nexport class SageMakerInferenceToolkitTnxSageMakerInferenceModelData {\n  static fromBucket(\n    bucket: IBucket,\n    prefix: string,\n    options: BucketCompiledModelOptions,\n  ) {\n    const nPositions = options.compileOptions?.nPositions ?? 4096;\n    const quantDtype = options.compileOptions?.quantDtype;\n    const optLevel = OptLevel.BEST_BALANCE;\n    const batchSize = options.compileOptions?.batchSize ?? 1;\n    const memoryFootprint = options.config\n      ? 
calcMemoryFootprint(\n          options.config.embeddingDimension,\n          options.config.layers,\n          DataTypeBits.BF16_OR_FP16,\n          nPositions,\n          batchSize,\n        )\n      : inferMemoryFootprintFromParameters(\n          options.parameters,\n          nPositions,\n          batchSize,\n        );\n    const neuronxInstanceTypes = [\n      NeuronxInstanceType.INF2_8XLARGE,\n      NeuronxInstanceType.INF2_24XLARGE,\n      NeuronxInstanceType.INF2_48XLARGE,\n    ];\n    const availableInstancePatterns = neuronxInstanceTypes\n      .flatMap((neuronxInstanceType) =>\n        calcTensorParallel(\n          neuronxInstanceType,\n          memoryFootprint,\n          options.config?.attentionHeads,\n        ).map((v) => ({\n          neuronxInstanceType,\n          ...v,\n        })),\n      )\n      .filter(\n        (pattern) =>\n          !options.compileOptions?.tpDegree ||\n          pattern.tp === options.compileOptions.tpDegree,\n      );\n    if (availableInstancePatterns.length === 0) {\n      throw new Error(\n        `No available instance type. You can use tpDegree are ${availableInstancePatterns.map((p) => p.tp).join(\", \")}.`,\n      );\n    }\n    const tpDegree =\n      options.compileOptions?.tpDegree ?? availableInstancePatterns[0].tp;\n    return new SageMakerInferenceToolkitTnxSageMakerInferenceModelData({\n      bucket,\n      compiledArtifactS3Prefix: prefix,\n      nPositions,\n      quantDtype,\n      optLevel,\n      tpDegree,\n      code: options.code,\n      modelIdOrPath: options.modelIdOrPath,\n      parameters: options.parameters,\n    });\n  }\n  static fromNeuronxCompiler(\n    compiler: SageMakerInferenceToolkitTnxCompiler,\n    code?: ISource,\n  ) {\n    const compiledModel = compiler.compile();\n    return new SageMakerInferenceToolkitTnxSageMakerInferenceModelData({\n      ...compiledModel,\n      optLevel: compiledModel.optLevel ?? 
OptLevel.BEST_BALANCE,\n      compiledArtifactS3Prefix: compiledModel.s3Prefix,\n      parameters: compiler.model.options.parameters,\n      code,\n      dependables: [compiler.node.defaultChild!],\n    });\n  }\n  readonly bucket: IBucket;\n  readonly compiledArtifactS3Prefix: string;\n  readonly code: ISource;\n  readonly tpDegree: number;\n  readonly quantDtype?: QuantDtype;\n  readonly nPositions: number;\n  readonly optLevel: OptLevel;\n  readonly modelIdOrPath?: string;\n  readonly compiledArtifactPath?: string;\n  readonly parameters: Parameters;\n  private readonly dependables: IDependable[];\n\n  private constructor(options: {\n    readonly bucket: IBucket;\n    readonly compiledArtifactS3Prefix: string;\n    readonly tpDegree: number;\n    readonly quantDtype?: QuantDtype;\n    readonly nPositions: number;\n    readonly optLevel: OptLevel;\n    readonly code?: ISource;\n    readonly modelIdOrPath?: string;\n    readonly compiledArtifactPath?: string;\n    readonly parameters: Parameters;\n    readonly dependables?: IDependable[];\n  }) {\n    this.bucket = options.bucket;\n    this.compiledArtifactS3Prefix = options.compiledArtifactS3Prefix;\n    this.code =\n      options.code ??\n      Source.asset(\n        join(\n          __dirname,\n          \"../../scripts/inference/sagemaker-inference-toolkit-tnx/code\",\n        ),\n      );\n    this.tpDegree = options.tpDegree;\n    this.quantDtype = options.quantDtype;\n    this.nPositions = options.nPositions;\n    this.optLevel = options.optLevel;\n    this.modelIdOrPath = options.modelIdOrPath;\n    this.compiledArtifactPath = options.compiledArtifactPath;\n    this.parameters = options.parameters;\n    this.dependables = options.dependables ?? 
[];\n  }\n\n  bind(scope: Construct, model: sagemaker.IModel) {\n    const deploy = new BucketDeployment(scope, \"CodeDeployment\", {\n      destinationBucket: this.bucket,\n      sources: [this.code],\n      destinationKeyPrefix: join(this.compiledArtifactS3Prefix, \"code\"),\n    });\n    deploy.node.addDependency(...this.dependables);\n    model.node.addDependency(deploy);\n  }\n}\n\nexport interface SageMakerInferenceToolkitTnxSageMakerRealtimeInferenceEndpointProps {\n  /**\n   * Model data for SageMaker inference.\n   * The model data requires at least compiled artifacts.\n   */\n  readonly modelData: SageMakerInferenceToolkitTnxSageMakerInferenceModelData;\n  /**\n   * An image of the container where the inference job is executed.\n   */\n  readonly image?: sagemaker.ContainerImage;\n  /**\n   * A map of environment variables to pass into the container.\n   * @default - Only the predefined environment variables required to use Neuronx have been set.\n   */\n  readonly environment?: { [key: string]: string };\n  /**\n   * The instance type of compile worker instance.\n   * @default - It is determined automatically according to the number of model parameters and compilation options.\n   */\n  readonly instanceType?: INeuronxInstanceType;\n  /**\n   * The size, of the ML storage volume attached to individual inference instance associated with the production variant.\n   * Currently only Amazon EBS gp2 storage volumes are supported.\n   * @see https://aws.amazon.com/jp/releasenotes/host-instance-storage-volumes-table\n   * @default - 3 GB per billion parameter (Max 512 GB)\n   */\n  readonly volumeSize?: Size;\n  /**\n   * The timeout value, to download and extract the model that you want to host from Amazon S3\n   * to the individual inference instance associated with this production variant.\n   * @default - 60 seconds, when `volumeSize` larger than 30GB then 1GB x 15 seconds (max 60 minutes)\n   */\n  readonly modelDataDownloadTimeout?: Duration;\n  /**\n   * 
The timeout value, for your inference container to pass health check by SageMaker Hosting.\n   * @see https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests\n   * @default - 60 seconds, when set the `modelDataDownloadTimeout` then use same value (max 60 minutes)\n   */\n  readonly containerStartupHealthCheckTimeout?: Duration;\n}\n\n/**\n * @deprecated\n */\nexport class SageMakerInferenceToolkitTnxSageMakerRealtimeInferenceEndpoint extends Construct {\n  /**\n   * The ARN of the endpoint.\n   * @attribute\n   */\n  readonly endpointArn: string;\n  /**\n   * The name of the endpoint.\n   * @attribute\n   */\n  readonly endpointName: string;\n  private readonly endpoint: sagemaker.Endpoint;\n  constructor(\n    scope: Construct,\n    id: string,\n    props: SageMakerInferenceToolkitTnxSageMakerRealtimeInferenceEndpointProps,\n  ) {\n    super(scope, id);\n    let image: sagemaker.ContainerImage;\n    if (props.image) {\n      image = props.image;\n    } else {\n      const build = new ContainerImageBuild(this, \"InferenceImageBuild\", {\n        directory: join(\n          __dirname,\n          \"../../scripts/inference/sagemaker-inference-toolkit-tnx\",\n        ),\n      });\n      image = sagemaker.ContainerImage.fromEcrRepository(\n        build.repository,\n        build.imageTag,\n      );\n    }\n    const instanceType =\n      props.instanceType ??\n      this.selectInstanceTypeByTpDegree(props.modelData.tpDegree);\n    const model = new sagemaker.Model(this, \"Model\", {\n      containers: [\n        {\n          image,\n          environment: {\n            NEURON_RT_NUM_CORES: props.modelData.tpDegree.toString(),\n            TS_DEFAULT_RESPONSE_TIMEOUT: (60 * 60).toString(),\n            TS_DEFAULT_WORKERS_PER_MODEL: Math.floor(\n              instanceType.acceleratorChips.neuronxCores /\n                props.modelData.tpDegree,\n            ).toString(),\n            MODEL: 
props.modelData.modelIdOrPath ?? \"./model\",\n            COMPILED_ARTIFACT:\n              props.modelData.compiledArtifactPath ?? \"./compiled\",\n            TP_DEGREE: props.modelData.tpDegree.toString(),\n            N_POSITIONS: props.modelData.nPositions.toString(),\n            OPT_LEVEL: props.modelData.optLevel.toString(),\n            QUANT_DTYPE: props.modelData.quantDtype?.toString() ?? \"\",\n            ...props.environment,\n          },\n        },\n      ],\n    });\n    const cfnModel = model.node.findChild(\"Model\") as CfnModel;\n    cfnModel.addPropertyOverride(\n      \"PrimaryContainer.ModelDataSource.S3DataSource.S3Uri\",\n      props.modelData.bucket.s3UrlForObject(\n        Token.isUnresolved(props.modelData.compiledArtifactS3Prefix) ||\n          props.modelData.compiledArtifactS3Prefix.endsWith(\"/\")\n          ? props.modelData.compiledArtifactS3Prefix\n          : `${props.modelData.compiledArtifactS3Prefix}/`,\n      ),\n    );\n    cfnModel.addPropertyOverride(\n      \"PrimaryContainer.ModelDataSource.S3DataSource.S3DataTypeBits\",\n      \"S3Prefix\",\n    );\n    cfnModel.addPropertyOverride(\n      \"PrimaryContainer.ModelDataSource.S3DataSource.CompressionType\",\n      \"None\",\n    );\n    props.modelData.bucket.grantRead(model);\n    props.modelData.bind(this, model);\n    const endpointConfig = new sagemaker.EndpointConfig(\n      this,\n      \"EndpointConfig\",\n      {\n        instanceProductionVariants: [\n          {\n            model,\n            variantName: \"PrimaryVariant\",\n            instanceType: sagemaker.InstanceType.of(\n              `ml.${instanceType.toString()}`,\n            ),\n          },\n        ],\n      },\n    );\n    const cfnEndpointConfig = endpointConfig.node.findChild(\n      \"EndpointConfig\",\n    ) as CfnEndpointConfig;\n    const volumeSize =\n      props.volumeSize ??\n      Size.gibibytes(\n        Math.ceil(\n          props.modelData.parameters.toBillion() * 3 > 512\n       
     ? 512\n            : props.modelData.parameters.toBillion() * 3,\n        ),\n      );\n    cfnEndpointConfig.addPropertyOverride(\n      \"ProductionVariants.0.VolumeSizeInGB\",\n      instanceType.toString().startsWith(\"ml.trn1\")\n        ? undefined\n        : volumeSize.toGibibytes(),\n    );\n    const modelDataDownloadTimeout =\n      props.modelDataDownloadTimeout ??\n      inferenceModelDataDownloadTime(volumeSize);\n    cfnEndpointConfig.addPropertyOverride(\n      \"ProductionVariants.0.ModelDataDownloadTimeoutInSeconds\",\n      modelDataDownloadTimeout.toSeconds(),\n    );\n    cfnEndpointConfig.addPropertyOverride(\n      \"ProductionVariants.0.ContainerStartupHealthCheckTimeoutInSeconds\",\n      (\n        props.containerStartupHealthCheckTimeout ?? modelDataDownloadTimeout\n      ).toSeconds(),\n    );\n    const endpoint = new sagemaker.Endpoint(this, \"Endpoint\", {\n      endpointConfig,\n    });\n    this.endpointArn = endpoint.endpointArn;\n    this.endpointName = endpoint.endpointName;\n    this.endpoint = endpoint;\n  }\n  grantInvoke(grantee: IGrantable): Grant {\n    return this.endpoint.grantInvoke(grantee);\n  }\n\n  private selectInstanceTypeByTpDegree(tpDegree: number) {\n    const instanceTypes = [\n      NeuronxInstanceType.INF2_8XLARGE,\n      NeuronxInstanceType.INF2_24XLARGE,\n      NeuronxInstanceType.INF2_48XLARGE,\n      NeuronxInstanceType.TRN1_32XLARGE,\n    ];\n    for (const instanceType of instanceTypes) {\n      if (\n        instanceType.supportedTensorParallelism.find((tp) => tp >= tpDegree)\n      ) {\n        return instanceType;\n      }\n    }\n    throw new Error(\n      \"This model is too large, I can not support this model current version.\",\n    );\n  }\n}\n\nfunction inferenceModelDataDownloadTime(volumeSize: Size) {\n  const seconds = volumeSize.toGibibytes() * 15;\n  return Duration.seconds(seconds < 3600 ? seconds : 3600);\n}\n"]}
|
|
186
|
+
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"sagemaker-inference-toolkit-tnx-sagemaker.js","sourceRoot":"","sources":["../../src/sagemaker-inference-toolkit-tnx/sagemaker-inference-toolkit-tnx-sagemaker.ts"],"names":[],"mappings":";;;AAAA,kEAAiE;AACjE,0DAA0D;AAC1D,6CAAoD;AAGpD,qEAIuC;AAEvC,2CAAoD;AACpD,+BAA4B;AAC5B,6CAWyB;AAwCzB;;GAEG;AACH,MAAa,uDAAuD;IAClE,MAAM,CAAC,UAAU,CACf,MAAe,EACf,MAAc,EACd,OAAmC;QAEnC,MAAM,UAAU,GAAG,OAAO,CAAC,cAAc,EAAE,UAAU,IAAI,IAAI,CAAC;QAC9D,MAAM,UAAU,GAAG,OAAO,CAAC,cAAc,EAAE,UAAU,CAAC;QACtD,MAAM,QAAQ,GAAG,kBAAQ,CAAC,YAAY,CAAC;QACvC,MAAM,SAAS,GAAG,OAAO,CAAC,cAAc,EAAE,SAAS,IAAI,CAAC,CAAC;QACzD,MAAM,eAAe,GAAG,OAAO,CAAC,MAAM;YACpC,CAAC,CAAC,IAAA,6BAAmB,EACjB,OAAO,CAAC,MAAM,CAAC,kBAAkB,EACjC,OAAO,CAAC,MAAM,CAAC,MAAM,EACrB,sBAAY,CAAC,YAAY,EACzB,UAAU,EACV,SAAS,CACV;YACH,CAAC,CAAC,IAAA,4CAAkC,EAChC,OAAO,CAAC,UAAU,EAClB,UAAU,EACV,SAAS,CACV,CAAC;QACN,MAAM,oBAAoB,GAAG;YAC3B,6BAAmB,CAAC,YAAY;YAChC,6BAAmB,CAAC,aAAa;YACjC,6BAAmB,CAAC,aAAa;SAClC,CAAC;QACF,MAAM,yBAAyB,GAAG,oBAAoB;aACnD,OAAO,CAAC,CAAC,mBAAmB,EAAE,EAAE,CAC/B,IAAA,4BAAkB,EAChB,mBAAmB,EACnB,eAAe,EACf,OAAO,CAAC,MAAM,EAAE,cAAc,CAC/B,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACZ,mBAAmB;YACnB,GAAG,CAAC;SACL,CAAC,CAAC,CACJ;aACA,MAAM,CACL,CAAC,OAAO,EAAE,EAAE,CACV,CAAC,OAAO,CAAC,cAAc,EAAE,QAAQ;YACjC,OAAO,CAAC,EAAE,KAAK,OAAO,CAAC,cAAc,CAAC,QAAQ,CACjD,CAAC;QACJ,IAAI,yBAAyB,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YAC3C,MAAM,IAAI,KAAK,CACb,wDAAwD,yBAAyB,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CACjH,CAAC;QACJ,CAAC;QACD,MAAM,QAAQ,GACZ,OAAO,CAAC,cAAc,EAAE,QAAQ,IAAI,yBAAyB,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;QACtE,OAAO,IAAI,uDAAuD,CAAC;YACjE,MAAM;YACN,wBAAwB,EAAE,MAAM;YAChC,UAAU;YACV,UAAU;YACV,QAAQ;YACR,QAAQ;YACR,IAAI,EAAE,OAAO,CAAC,IAAI;YAClB,aAAa,EAAE,OAAO,CAAC,aAAa;YACpC,UAAU,EAAE,OAAO,CAAC,UAAU;SAC/B,CAAC,CAAC;IACL,CAAC;IACD,MAAM,CAAC,yBAAyB,CAC9B,QAA8C,EAC9C,IAAc;QAEd,MAAM,aAAa,GAAG,QAAQ,CAAC,OAAO,EAAE,CAAC;QACzC,OAAO,IAAI,uDAAuD,CAAC;YAC
jE,GAAG,aAAa;YAChB,QAAQ,EAAE,aAAa,CAAC,QAAQ,IAAI,kBAAQ,CAAC,YAAY;YACzD,wBAAwB,EAAE,aAAa,CAAC,QAAQ;YAChD,UAAU,EAAE,QAAQ,CAAC,KAAK,CAAC,OAAO,CAAC,UAAU;YAC7C,IAAI;YACJ,WAAW,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,YAAa,CAAC;SAC3C,CAAC,CAAC;IACL,CAAC;IAaD,YAAoB,OAYnB;QACC,IAAI,CAAC,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC;QAC7B,IAAI,CAAC,wBAAwB,GAAG,OAAO,CAAC,wBAAwB,CAAC;QACjE,IAAI,CAAC,IAAI;YACP,OAAO,CAAC,IAAI;gBACZ,0BAAM,CAAC,KAAK,CACV,IAAA,WAAI,EACF,SAAS,EACT,8DAA8D,CAC/D,CACF,CAAC;QACJ,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;QACjC,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACrC,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACrC,IAAI,CAAC,QAAQ,GAAG,OAAO,CAAC,QAAQ,CAAC;QACjC,IAAI,CAAC,aAAa,GAAG,OAAO,CAAC,aAAa,CAAC;QAC3C,IAAI,CAAC,oBAAoB,GAAG,OAAO,CAAC,oBAAoB,CAAC;QACzD,IAAI,CAAC,UAAU,GAAG,OAAO,CAAC,UAAU,CAAC;QACrC,IAAI,CAAC,WAAW,GAAG,OAAO,CAAC,WAAW,IAAI,EAAE,CAAC;IAC/C,CAAC;IAED,IAAI,CAAC,KAAgB,EAAE,KAAuB;QAC5C,MAAM,MAAM,GAAG,IAAI,oCAAgB,CAAC,KAAK,EAAE,gBAAgB,EAAE;YAC3D,iBAAiB,EAAE,IAAI,CAAC,MAAM;YAC9B,OAAO,EAAE,CAAC,IAAI,CAAC,IAAI,CAAC;YACpB,oBAAoB,EAAE,IAAA,WAAI,EAAC,IAAI,CAAC,wBAAwB,EAAE,MAAM,CAAC;SAClE,CAAC,CAAC;QACH,MAAM,CAAC,IAAI,CAAC,aAAa,CAAC,GAAG,IAAI,CAAC,WAAW,CAAC,CAAC;QAC/C,KAAK,CAAC,IAAI,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC;IACnC,CAAC;CACF;AAnID,0HAmIC;AA2CD;;GAEG;AACH,MAAa,8DAA+D,SAAQ,sBAAS;IAY3F,YACE,KAAgB,EAChB,EAAU,EACV,KAA0E;QAE1E,KAAK,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACjB,IAAI,KAA+B,CAAC;QACpC,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,KAAK,GAAG,KAAK,CAAC,KAAK,CAAC;QACtB,CAAC;aAAM,CAAC;YACN,MAAM,KAAK,GAAG,IAAI,uCAAmB,CAAC,IAAI,EAAE,qBAAqB,EAAE;gBACjE,SAAS,EAAE,IAAA,WAAI,EACb,SAAS,EACT,yDAAyD,CAC1D;aACF,CAAC,CAAC;YACH,KAAK,GAAG,SAAS,CAAC,cAAc,CAAC,iBAAiB,CAChD,KAAK,CAAC,UAAU,EAChB,KAAK,CAAC,QAAQ,CACf,CAAC;QACJ,CAAC;QACD,MAAM,YAAY,GAChB,KAAK,CAAC,YAAY;YAClB,IAAI,CAAC,4BAA4B,CAAC,KAAK,CAAC,SAAS,CAAC,QAAQ,CAAC,CAAC;QAC9D,MAAM,KAAK,GAAG,IAAI,SAAS,CAAC,KAAK,CAAC,IAAI,EAAE,OAAO,EAAE;YAC/C,UAAU,EAAE;gBACV;oBACE,KAAK;oBACL,WAAW,EAAE;wBACX,mBAAmB,EAAE,KAAK,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,EAAE;wBACxD,2BAA2
B,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,CAAC,QAAQ,EAAE;wBACjD,4BAA4B,EAAE,IAAI,CAAC,KAAK,CACtC,YAAY,CAAC,gBAAgB,CAAC,YAAY;4BACxC,KAAK,CAAC,SAAS,CAAC,QAAQ,CAC3B,CAAC,QAAQ,EAAE;wBACZ,KAAK,EAAE,KAAK,CAAC,SAAS,CAAC,aAAa,IAAI,SAAS;wBACjD,iBAAiB,EACf,KAAK,CAAC,SAAS,CAAC,oBAAoB,IAAI,YAAY;wBACtD,SAAS,EAAE,KAAK,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,EAAE;wBAC9C,WAAW,EAAE,KAAK,CAAC,SAAS,CAAC,UAAU,CAAC,QAAQ,EAAE;wBAClD,SAAS,EAAE,KAAK,CAAC,SAAS,CAAC,QAAQ,CAAC,QAAQ,EAAE;wBAC9C,WAAW,EAAE,KAAK,CAAC,SAAS,CAAC,UAAU,EAAE,QAAQ,EAAE,IAAI,EAAE;wBACzD,GAAG,KAAK,CAAC,WAAW;qBACrB;iBACF;aACF;SACF,CAAC,CAAC;QACH,MAAM,QAAQ,GAAG,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAa,CAAC;QAC3D,QAAQ,CAAC,mBAAmB,CAC1B,qDAAqD,EACrD,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,cAAc,CACnC,mBAAK,CAAC,YAAY,CAAC,KAAK,CAAC,SAAS,CAAC,wBAAwB,CAAC;YAC1D,KAAK,CAAC,SAAS,CAAC,wBAAwB,CAAC,QAAQ,CAAC,GAAG,CAAC;YACtD,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,wBAAwB;YAC1C,CAAC,CAAC,GAAG,KAAK,CAAC,SAAS,CAAC,wBAAwB,GAAG,CACnD,CACF,CAAC;QACF,QAAQ,CAAC,mBAAmB,CAC1B,8DAA8D,EAC9D,UAAU,CACX,CAAC;QACF,QAAQ,CAAC,mBAAmB,CAC1B,+DAA+D,EAC/D,MAAM,CACP,CAAC;QACF,KAAK,CAAC,SAAS,CAAC,MAAM,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC;QACxC,KAAK,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,CAAC,CAAC;QAClC,MAAM,cAAc,GAAG,IAAI,SAAS,CAAC,cAAc,CACjD,IAAI,EACJ,gBAAgB,EAChB;YACE,0BAA0B,EAAE;gBAC1B;oBACE,KAAK;oBACL,WAAW,EAAE,gBAAgB;oBAC7B,YAAY,EAAE,SAAS,CAAC,YAAY,CAAC,EAAE,CACrC,MAAM,YAAY,CAAC,QAAQ,EAAE,EAAE,CAChC;iBACF;aACF;SACF,CACF,CAAC;QACF,MAAM,iBAAiB,GAAG,cAAc,CAAC,IAAI,CAAC,SAAS,CACrD,gBAAgB,CACI,CAAC;QACvB,MAAM,UAAU,GACd,KAAK,CAAC,UAAU;YAChB,kBAAI,CAAC,SAAS,CACZ,IAAI,CAAC,IAAI,CACP,KAAK,CAAC,SAAS,CAAC,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC,GAAG,GAAG;gBAC9C,CAAC,CAAC,GAAG;gBACL,CAAC,CAAC,KAAK,CAAC,SAAS,CAAC,UAAU,CAAC,SAAS,EAAE,GAAG,CAAC,CAC/C,CACF,CAAC;QACJ,iBAAiB,CAAC,mBAAmB,CACnC,qCAAqC,EACrC,YAAY,CAAC,QAAQ,EAAE,CAAC,UAAU,CAAC,SAAS,CAAC;YAC3C,CAAC,CAAC,SAAS;YACX,CAAC,CAAC,UAAU,CAAC,WAAW,EAAE,CAC7B,CAAC;QACF,MAAM,wBAAwB,GAC5B,KAAK,CAAC,wBAAwB;YAC9B,8BAA8B,CAAC,UAAU,CAAC,CAAC;QAC7C,iBAAiB,CAAC,mBAAmB,CACnC,wDAAwD,EAC
xD,wBAAwB,CAAC,SAAS,EAAE,CACrC,CAAC;QACF,iBAAiB,CAAC,mBAAmB,CACnC,kEAAkE,EAClE,CACE,KAAK,CAAC,kCAAkC,IAAI,wBAAwB,CACrE,CAAC,SAAS,EAAE,CACd,CAAC;QACF,MAAM,QAAQ,GAAG,IAAI,SAAS,CAAC,QAAQ,CAAC,IAAI,EAAE,UAAU,EAAE;YACxD,cAAc;SACf,CAAC,CAAC;QACH,IAAI,CAAC,WAAW,GAAG,QAAQ,CAAC,WAAW,CAAC;QACxC,IAAI,CAAC,YAAY,GAAG,QAAQ,CAAC,YAAY,CAAC;QAC1C,IAAI,CAAC,QAAQ,GAAG,QAAQ,CAAC;IAC3B,CAAC;IACD,WAAW,CAAC,OAAmB;QAC7B,OAAO,IAAI,CAAC,QAAQ,CAAC,WAAW,CAAC,OAAO,CAAC,CAAC;IAC5C,CAAC;IAEO,4BAA4B,CAAC,QAAgB;QACnD,MAAM,aAAa,GAAG;YACpB,6BAAmB,CAAC,YAAY;YAChC,6BAAmB,CAAC,aAAa;YACjC,6BAAmB,CAAC,aAAa;YACjC,6BAAmB,CAAC,aAAa;SAClC,CAAC;QACF,KAAK,MAAM,YAAY,IAAI,aAAa,EAAE,CAAC;YACzC,IACE,YAAY,CAAC,0BAA0B,CAAC,IAAI,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,IAAI,QAAQ,CAAC,EACpE,CAAC;gBACD,OAAO,YAAY,CAAC;YACtB,CAAC;QACH,CAAC;QACD,MAAM,IAAI,KAAK,CACb,wEAAwE,CACzE,CAAC;IACJ,CAAC;CACF;AA1JD,wIA0JC;AAED,SAAS,8BAA8B,CAAC,UAAgB;IACtD,MAAM,OAAO,GAAG,UAAU,CAAC,WAAW,EAAE,GAAG,EAAE,CAAC;IAC9C,OAAO,sBAAQ,CAAC,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;AAC3D,CAAC","sourcesContent":["import { ContainerImageBuild } from \"@cdklabs/deploy-time-build\";\nimport * as sagemaker from \"@aws-cdk/aws-sagemaker-alpha\";\nimport { Duration, Size, Token } from \"aws-cdk-lib\";\nimport { Grant, IGrantable } from \"aws-cdk-lib/aws-iam\";\nimport { IBucket } from \"aws-cdk-lib/aws-s3\";\nimport {\n  BucketDeployment,\n  ISource,\n  Source,\n} from \"aws-cdk-lib/aws-s3-deployment\";\nimport { CfnEndpointConfig, CfnModel } from \"aws-cdk-lib/aws-sagemaker\";\nimport { Construct, IDependable } from \"constructs\";\nimport { join } from \"path\";\nimport {\n  calcMemoryFootprint,\n  calcTensorParallel,\n  DataTypeBits,\n  INeuronxInstanceType,\n  inferMemoryFootprintFromParameters,\n  ModelConfig,\n  NeuronxInstanceType,\n  OptLevel,\n  Parameters,\n  QuantDtype,\n} from \"../base/neuronx\";\nimport {\n  SageMakerInferenceToolkitTnxCompileOptions,\n  SageMakerInferenceToolkitTnxCompiler,\n} from 
\"./sagemaker-inference-toolkit-tnx-compiler\";\n\n/**\n * Precompiled model options.\n */\nexport interface CompiledModelOptions {\n  /**\n   * Neuronx compile options.\n   * @default - Each properties are set default.\n   */\n  readonly compileOptions?: SageMakerInferenceToolkitTnxCompileOptions;\n  /**\n   * Code used for inference\n   * @default - using the predefined code\n   */\n  readonly code?: ISource;\n  /**\n   * Model ID or saved path\n   * @default \"./model\"\n   */\n  readonly modelIdOrPath?: string;\n  /**\n   * The path where compiled artifacts (i.e. xxx.neff) are stored\n   * @default \"./compiled\"\n   */\n  readonly compiledArtifactPath?: string;\n}\n\nexport interface BucketCompiledModelOptions extends CompiledModelOptions {\n  /**\n   * The number of parameters of model.\n   */\n  readonly parameters: Parameters;\n  readonly config?: ModelConfig;\n}\n\n/**\n * @deprecated\n */\nexport class SageMakerInferenceToolkitTnxSageMakerInferenceModelData {\n  static fromBucket(\n    bucket: IBucket,\n    prefix: string,\n    options: BucketCompiledModelOptions,\n  ) {\n    const nPositions = options.compileOptions?.nPositions ?? 4096;\n    const quantDtype = options.compileOptions?.quantDtype;\n    const optLevel = OptLevel.BEST_BALANCE;\n    const batchSize = options.compileOptions?.batchSize ?? 1;\n    const memoryFootprint = options.config\n      ? 
calcMemoryFootprint(\n          options.config.embeddingDimension,\n          options.config.layers,\n          DataTypeBits.BF16_OR_FP16,\n          nPositions,\n          batchSize,\n        )\n      : inferMemoryFootprintFromParameters(\n          options.parameters,\n          nPositions,\n          batchSize,\n        );\n    const neuronxInstanceTypes = [\n      NeuronxInstanceType.INF2_8XLARGE,\n      NeuronxInstanceType.INF2_24XLARGE,\n      NeuronxInstanceType.INF2_48XLARGE,\n    ];\n    const availableInstancePatterns = neuronxInstanceTypes\n      .flatMap((neuronxInstanceType) =>\n        calcTensorParallel(\n          neuronxInstanceType,\n          memoryFootprint,\n          options.config?.attentionHeads,\n        ).map((v) => ({\n          neuronxInstanceType,\n          ...v,\n        })),\n      )\n      .filter(\n        (pattern) =>\n          !options.compileOptions?.tpDegree ||\n          pattern.tp === options.compileOptions.tpDegree,\n      );\n    if (availableInstancePatterns.length === 0) {\n      throw new Error(\n        `No available instance type. You can use tpDegree are ${availableInstancePatterns.map((p) => p.tp).join(\", \")}.`,\n      );\n    }\n    const tpDegree =\n      options.compileOptions?.tpDegree ?? availableInstancePatterns[0].tp;\n    return new SageMakerInferenceToolkitTnxSageMakerInferenceModelData({\n      bucket,\n      compiledArtifactS3Prefix: prefix,\n      nPositions,\n      quantDtype,\n      optLevel,\n      tpDegree,\n      code: options.code,\n      modelIdOrPath: options.modelIdOrPath,\n      parameters: options.parameters,\n    });\n  }\n  static fromNeuronxNativeCompiler(\n    compiler: SageMakerInferenceToolkitTnxCompiler,\n    code?: ISource,\n  ) {\n    const compiledModel = compiler.compile();\n    return new SageMakerInferenceToolkitTnxSageMakerInferenceModelData({\n      ...compiledModel,\n      optLevel: compiledModel.optLevel ?? 
OptLevel.BEST_BALANCE,\n      compiledArtifactS3Prefix: compiledModel.s3Prefix,\n      parameters: compiler.model.options.parameters,\n      code,\n      dependables: [compiler.node.defaultChild!],\n    });\n  }\n  readonly bucket: IBucket;\n  readonly compiledArtifactS3Prefix: string;\n  readonly code: ISource;\n  readonly tpDegree: number;\n  readonly quantDtype?: QuantDtype;\n  readonly nPositions: number;\n  readonly optLevel: OptLevel;\n  readonly modelIdOrPath?: string;\n  readonly compiledArtifactPath?: string;\n  readonly parameters: Parameters;\n  private readonly dependables: IDependable[];\n\n  private constructor(options: {\n    readonly bucket: IBucket;\n    readonly compiledArtifactS3Prefix: string;\n    readonly tpDegree: number;\n    readonly quantDtype?: QuantDtype;\n    readonly nPositions: number;\n    readonly optLevel: OptLevel;\n    readonly code?: ISource;\n    readonly modelIdOrPath?: string;\n    readonly compiledArtifactPath?: string;\n    readonly parameters: Parameters;\n    readonly dependables?: IDependable[];\n  }) {\n    this.bucket = options.bucket;\n    this.compiledArtifactS3Prefix = options.compiledArtifactS3Prefix;\n    this.code =\n      options.code ??\n      Source.asset(\n        join(\n          __dirname,\n          \"../../scripts/inference/sagemaker-inference-toolkit-tnx/code\",\n        ),\n      );\n    this.tpDegree = options.tpDegree;\n    this.quantDtype = options.quantDtype;\n    this.nPositions = options.nPositions;\n    this.optLevel = options.optLevel;\n    this.modelIdOrPath = options.modelIdOrPath;\n    this.compiledArtifactPath = options.compiledArtifactPath;\n    this.parameters = options.parameters;\n    this.dependables = options.dependables ?? 
[];\n  }\n\n  bind(scope: Construct, model: sagemaker.IModel) {\n    const deploy = new BucketDeployment(scope, \"CodeDeployment\", {\n      destinationBucket: this.bucket,\n      sources: [this.code],\n      destinationKeyPrefix: join(this.compiledArtifactS3Prefix, \"code\"),\n    });\n    deploy.node.addDependency(...this.dependables);\n    model.node.addDependency(deploy);\n  }\n}\n\nexport interface SageMakerInferenceToolkitTnxSageMakerRealtimeInferenceEndpointProps {\n  /**\n   * Model data for SageMaker inference.\n   * The model data requires at least compiled artifacts.\n   */\n  readonly modelData: SageMakerInferenceToolkitTnxSageMakerInferenceModelData;\n  /**\n   * An image of the container where the inference job is executed.\n   */\n  readonly image?: sagemaker.ContainerImage;\n  /**\n   * A map of environment variables to pass into the container.\n   * @default - Only the predefined environment variables required to use Neuronx have been set.\n   */\n  readonly environment?: { [key: string]: string };\n  /**\n   * The instance type of compile worker instance.\n   * @default - It is determined automatically according to the number of model parameters and compilation options.\n   */\n  readonly instanceType?: INeuronxInstanceType;\n  /**\n   * The size, of the ML storage volume attached to individual inference instance associated with the production variant.\n   * Currently only Amazon EBS gp2 storage volumes are supported.\n   * @see https://aws.amazon.com/jp/releasenotes/host-instance-storage-volumes-table\n   * @default - 3 GB per billion parameter (Max 512 GB)\n   */\n  readonly volumeSize?: Size;\n  /**\n   * The timeout value, to download and extract the model that you want to host from Amazon S3\n   * to the individual inference instance associated with this production variant.\n   * @default - 60 seconds, when `volumeSize` larger than 30GB then 1GB x 15 seconds (max 60 minutes)\n   */\n  readonly modelDataDownloadTimeout?: Duration;\n  /**\n   * 
The timeout value, for your inference container to pass health check by SageMaker Hosting.\n   * @see https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests\n   * @default - 60 seconds, when set the `modelDataDownloadTimeout` then use same value (max 60 minutes)\n   */\n  readonly containerStartupHealthCheckTimeout?: Duration;\n}\n\n/**\n * @deprecated\n */\nexport class SageMakerInferenceToolkitTnxSageMakerRealtimeInferenceEndpoint extends Construct {\n  /**\n   * The ARN of the endpoint.\n   * @attribute\n   */\n  readonly endpointArn: string;\n  /**\n   * The name of the endpoint.\n   * @attribute\n   */\n  readonly endpointName: string;\n  private readonly endpoint: sagemaker.Endpoint;\n  constructor(\n    scope: Construct,\n    id: string,\n    props: SageMakerInferenceToolkitTnxSageMakerRealtimeInferenceEndpointProps,\n  ) {\n    super(scope, id);\n    let image: sagemaker.ContainerImage;\n    if (props.image) {\n      image = props.image;\n    } else {\n      const build = new ContainerImageBuild(this, \"InferenceImageBuild\", {\n        directory: join(\n          __dirname,\n          \"../../scripts/inference/sagemaker-inference-toolkit-tnx\",\n        ),\n      });\n      image = sagemaker.ContainerImage.fromEcrRepository(\n        build.repository,\n        build.imageTag,\n      );\n    }\n    const instanceType =\n      props.instanceType ??\n      this.selectInstanceTypeByTpDegree(props.modelData.tpDegree);\n    const model = new sagemaker.Model(this, \"Model\", {\n      containers: [\n        {\n          image,\n          environment: {\n            NEURON_RT_NUM_CORES: props.modelData.tpDegree.toString(),\n            TS_DEFAULT_RESPONSE_TIMEOUT: (60 * 60).toString(),\n            TS_DEFAULT_WORKERS_PER_MODEL: Math.floor(\n              instanceType.acceleratorChips.neuronxCores /\n                props.modelData.tpDegree,\n            ).toString(),\n            MODEL: 
props.modelData.modelIdOrPath ?? \"./model\",\n            COMPILED_ARTIFACT:\n              props.modelData.compiledArtifactPath ?? \"./compiled\",\n            TP_DEGREE: props.modelData.tpDegree.toString(),\n            N_POSITIONS: props.modelData.nPositions.toString(),\n            OPT_LEVEL: props.modelData.optLevel.toString(),\n            QUANT_DTYPE: props.modelData.quantDtype?.toString() ?? \"\",\n            ...props.environment,\n          },\n        },\n      ],\n    });\n    const cfnModel = model.node.findChild(\"Model\") as CfnModel;\n    cfnModel.addPropertyOverride(\n      \"PrimaryContainer.ModelDataSource.S3DataSource.S3Uri\",\n      props.modelData.bucket.s3UrlForObject(\n        Token.isUnresolved(props.modelData.compiledArtifactS3Prefix) ||\n          props.modelData.compiledArtifactS3Prefix.endsWith(\"/\")\n          ? props.modelData.compiledArtifactS3Prefix\n          : `${props.modelData.compiledArtifactS3Prefix}/`,\n      ),\n    );\n    cfnModel.addPropertyOverride(\n      \"PrimaryContainer.ModelDataSource.S3DataSource.S3DataTypeBits\",\n      \"S3Prefix\",\n    );\n    cfnModel.addPropertyOverride(\n      \"PrimaryContainer.ModelDataSource.S3DataSource.CompressionType\",\n      \"None\",\n    );\n    props.modelData.bucket.grantRead(model);\n    props.modelData.bind(this, model);\n    const endpointConfig = new sagemaker.EndpointConfig(\n      this,\n      \"EndpointConfig\",\n      {\n        instanceProductionVariants: [\n          {\n            model,\n            variantName: \"PrimaryVariant\",\n            instanceType: sagemaker.InstanceType.of(\n              `ml.${instanceType.toString()}`,\n            ),\n          },\n        ],\n      },\n    );\n    const cfnEndpointConfig = endpointConfig.node.findChild(\n      \"EndpointConfig\",\n    ) as CfnEndpointConfig;\n    const volumeSize =\n      props.volumeSize ??\n      Size.gibibytes(\n        Math.ceil(\n          props.modelData.parameters.toBillion() * 3 > 512\n       
     ? 512\n            : props.modelData.parameters.toBillion() * 3,\n        ),\n      );\n    cfnEndpointConfig.addPropertyOverride(\n      \"ProductionVariants.0.VolumeSizeInGB\",\n      instanceType.toString().startsWith(\"ml.trn1\")\n        ? undefined\n        : volumeSize.toGibibytes(),\n    );\n    const modelDataDownloadTimeout =\n      props.modelDataDownloadTimeout ??\n      inferenceModelDataDownloadTime(volumeSize);\n    cfnEndpointConfig.addPropertyOverride(\n      \"ProductionVariants.0.ModelDataDownloadTimeoutInSeconds\",\n      modelDataDownloadTimeout.toSeconds(),\n    );\n    cfnEndpointConfig.addPropertyOverride(\n      \"ProductionVariants.0.ContainerStartupHealthCheckTimeoutInSeconds\",\n      (\n        props.containerStartupHealthCheckTimeout ?? modelDataDownloadTimeout\n      ).toSeconds(),\n    );\n    const endpoint = new sagemaker.Endpoint(this, \"Endpoint\", {\n      endpointConfig,\n    });\n    this.endpointArn = endpoint.endpointArn;\n    this.endpointName = endpoint.endpointName;\n    this.endpoint = endpoint;\n  }\n  grantInvoke(grantee: IGrantable): Grant {\n    return this.endpoint.grantInvoke(grantee);\n  }\n\n  private selectInstanceTypeByTpDegree(tpDegree: number) {\n    const instanceTypes = [\n      NeuronxInstanceType.INF2_8XLARGE,\n      NeuronxInstanceType.INF2_24XLARGE,\n      NeuronxInstanceType.INF2_48XLARGE,\n      NeuronxInstanceType.TRN1_32XLARGE,\n    ];\n    for (const instanceType of instanceTypes) {\n      if (\n        instanceType.supportedTensorParallelism.find((tp) => tp >= tpDegree)\n      ) {\n        return instanceType;\n      }\n    }\n    throw new Error(\n      \"This model is too large, I can not support this model current version.\",\n    );\n  }\n}\n\nfunction inferenceModelDataDownloadTime(volumeSize: Size) {\n  const seconds = volumeSize.toGibibytes() * 15;\n  return Duration.seconds(seconds < 3600 ? seconds : 3600);\n}\n"]}
|