aws-cdk-neuronx-patterns 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.jsii +925 -106
- package/API.md +733 -1
- package/README.md +121 -27
- package/docs/neuronx-compile-architecture.png +0 -0
- package/lib/.types-compat/ts3.9/index.d.ts +2 -0
- package/lib/.types-compat/ts3.9/model.d.ts +97 -0
- package/lib/.types-compat/ts3.9/neuronx-compile.d.ts +15 -92
- package/lib/.types-compat/ts3.9/private/util.d.ts +2 -0
- package/lib/.types-compat/ts3.9/transformers-neuronx-sagemaker-realtime-inference.d.ts +113 -0
- package/lib/index.d.ts +2 -0
- package/lib/index.js +3 -1
- package/lib/model.d.ts +97 -0
- package/lib/model.js +93 -0
- package/lib/neuronx-compile.d.ts +15 -92
- package/lib/neuronx-compile.js +43 -156
- package/lib/neuronx-instance-type.js +2 -2
- package/lib/private/await-compile-job/index.js +2 -2
- package/lib/private/util.d.ts +2 -0
- package/lib/private/util.js +31 -0
- package/lib/transformers-neuronx-sagemaker-realtime-inference.d.ts +113 -0
- package/lib/transformers-neuronx-sagemaker-realtime-inference.js +150 -0
- package/package.json +9 -5
- package/scripts/compile/Dockerfile +10 -0
- package/scripts/compile/entrypoint.sh +9 -0
- package/scripts/inference/transformers-neuronx/Dockerfile +1 -0
- package/scripts/inference/transformers-neuronx/code/inference.py +63 -0
- package/scripts/inference/transformers-neuronx/code/requirements.txt +1 -0
- /package/scripts/{compile.py → compile/compile.py} +0 -0
package/README.md
CHANGED
|
@@ -1,29 +1,121 @@
|
|
|
1
1
|
# Neuronx patterns Construct Library
|
|
2
2
|
|
|
3
|
+
> [!WARNING]
|
|
4
|
+
> This library is an experimental module.
|
|
5
|
+
|
|
3
6
|
This library provides high-level architectural patterns using neuronx (e.g. Inferentia2 and Trainium1). It contains:
|
|
4
7
|
|
|
8
|
+
- Transformers Neuronx SageMaker Real-time Inference Endpoint
|
|
5
9
|
- Neuronx Compile
|
|
6
10
|
|
|
7
|
-
## Neuronx
|
|
11
|
+
## Transformers Neuronx SageMaker Real-time Inference Endpoint
|
|
8
12
|
|
|
9
|
-
|
|
10
|
-
This construct uses an Inferentia2 instance on
|
|
11
|
-
:::
|
|
13
|
+
> [!WARNING]
|
|
14
|
+
> This construct uses an Inferentia2 instance on SageMaker. You may need to increase your request limit for your AWS account.
|
|
12
15
|
|
|
13
|
-
|
|
16
|
+
By using the `NeuronxCompile` construct included in this construct library, models published on HuggingFace can be easily deployed to SageMaker Real-time inference. To define an endpoint using the `NeuronxCompile` construct:
|
|
14
17
|
|
|
15
|
-
|
|
16
|
-
|
|
18
|
+
```ts
|
|
19
|
+
import * as ec2 from "aws-cdk-lib/aws-ec2";
|
|
20
|
+
import * as s3 from "aws-cdk-lib/aws-s3";
|
|
21
|
+
|
|
22
|
+
declare const vpc: ec2.Vpc;
|
|
23
|
+
declare const bucket: s3.Bucket;
|
|
24
|
+
const compile = new NeuronxCompile(this, "NeuronxCompile", {
|
|
25
|
+
vpc,
|
|
26
|
+
bucket,
|
|
27
|
+
model: Model.fromHuggingFace("example/example-7b-chat", {
|
|
28
|
+
parameters: Parameters.billion(7),
|
|
29
|
+
}),
|
|
30
|
+
});
|
|
31
|
+
new TransformersNeuronxSageMakerRealtimeInferenceEndpoint(
|
|
32
|
+
this,
|
|
33
|
+
"RealtimeInference",
|
|
34
|
+
{
|
|
35
|
+
modelData:
|
|
36
|
+
TransformersNeuronxSageMakerInferenceModelData.fromNeuronxCompile(
|
|
37
|
+
compile,
|
|
38
|
+
),
|
|
39
|
+
},
|
|
40
|
+
);
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### Default inference code
|
|
44
|
+
|
|
45
|
+
By default, default inference code is deployed to implement the chat interface. The default inference code takes an object array like [transformers' conversations](https://huggingface.co/docs/transformers/main/en/conversations) and responds with the generated text. The following code is an example using the AWS SDK for JavaScript v3.
|
|
17
46
|
|
|
18
|
-
|
|
47
|
+
```ts
|
|
48
|
+
import {
|
|
49
|
+
InvokeEndpointCommand,
|
|
50
|
+
SageMakerRuntimeClient,
|
|
51
|
+
} from "@aws-sdk/client-sagemaker-runtime";
|
|
52
|
+
|
|
53
|
+
const client = new SageMakerRuntimeClient({
|
|
54
|
+
region: "us-east-1",
|
|
55
|
+
});
|
|
56
|
+
client
|
|
57
|
+
.send(
|
|
58
|
+
new InvokeEndpointCommand({
|
|
59
|
+
EndpointName: "my-endpoint-id",
|
|
60
|
+
Body: JSON.stringify({
|
|
61
|
+
// Optional. You can change the answer role.
|
|
62
|
+
role: "ai",
|
|
63
|
+
// Required. The messages, in conversation form.
|
|
64
|
+
messages: [
|
|
65
|
+
{
|
|
66
|
+
role: "system",
|
|
67
|
+
content: `You are helpfull assistant.`,
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
role: "user",
|
|
71
|
+
content:
|
|
72
|
+
"please answer '1+1=?'. You must answer only answer numeric.",
|
|
73
|
+
},
|
|
74
|
+
],
|
|
75
|
+
}),
|
|
76
|
+
ContentType: "application/json",
|
|
77
|
+
Accept: "application/json",
|
|
78
|
+
}),
|
|
79
|
+
)
|
|
80
|
+
.then((res) => {
|
|
81
|
+
// { generated_text: "2" }
|
|
82
|
+
console.log(JSON.parse(res.Body.transformToString()));
|
|
83
|
+
});
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
To use your own inference code, you can pass the code source.
|
|
87
|
+
|
|
88
|
+
```ts
|
|
89
|
+
import * as s3Deplyment from "aws-cdk-lib/aws-s3-deployment";
|
|
90
|
+
|
|
91
|
+
declare const compile: NeuronxCompile;
|
|
92
|
+
new TransformersNeuronxSageMakerRealtimeInferenceEndpoint(
|
|
93
|
+
this,
|
|
94
|
+
"RealtimeInference",
|
|
95
|
+
{
|
|
96
|
+
modelData:
|
|
97
|
+
TransformersNeuronxSageMakerInferenceModelData.fromNeuronxCompile(
|
|
98
|
+
compile,
|
|
99
|
+
s3Deplyment.Source.asset("path/to/my/code/directory"),
|
|
100
|
+
),
|
|
101
|
+
},
|
|
102
|
+
);
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
## Neuronx Compile
|
|
106
|
+
|
|
107
|
+
> [!WARNING]
|
|
108
|
+
> This construct uses an Inferentia2 instance on EC2. You may need to increase your request limit for your AWS account.
|
|
109
|
+
|
|
110
|
+
This construct compiles models supported by Neuronx and uploads them to the specified S3 bucket. To define
|
|
19
111
|
|
|
20
112
|
```ts
|
|
21
|
-
import
|
|
22
|
-
import
|
|
113
|
+
import * as ec2 from "aws-cdk-lib/aws-ec2";
|
|
114
|
+
import * as s3 from "aws-cdk-lib/aws-s3";
|
|
23
115
|
|
|
24
|
-
declare const vpc: Vpc;
|
|
25
|
-
declare const bucket: Bucket;
|
|
26
|
-
const compile = new NeuronxCompile(
|
|
116
|
+
declare const vpc: ec2.Vpc;
|
|
117
|
+
declare const bucket: s3.Bucket;
|
|
118
|
+
const compile = new NeuronxCompile(this, "NeuronxCompile", {
|
|
27
119
|
vpc,
|
|
28
120
|
bucket,
|
|
29
121
|
model: Model.fromHuggingFace("example/example-7b-chat", {
|
|
@@ -32,7 +124,7 @@ const compile = new NeuronxCompile(stack, "NeuronxCompile", {
|
|
|
32
124
|
});
|
|
33
125
|
|
|
34
126
|
// Get the compiled artifacts from this S3 URL
|
|
35
|
-
new CfnOutput(
|
|
127
|
+
new CfnOutput(this, "CompiledArtifact", {
|
|
36
128
|
value: compile.compiledArtifactS3Url,
|
|
37
129
|
});
|
|
38
130
|
```
|
|
@@ -54,21 +146,23 @@ After compiled, you can see like the this file tree in the S3 bucket.
|
|
|
54
146
|
└── xxx.neff
|
|
55
147
|
```
|
|
56
148
|
|
|
149
|
+
This is the NeuronxCompile architecture.
|
|
150
|
+

|
|
151
|
+
|
|
57
152
|
### Spot Instance
|
|
58
153
|
|
|
59
|
-
|
|
60
|
-
If you use Spot Instances, check if the request limit for Spot has been increased.
|
|
61
|
-
:::
|
|
154
|
+
> [!WARNING]
|
|
155
|
+
> If you use Spot Instances, check if the request limit for Spot has been increased.
|
|
62
156
|
|
|
63
157
|
You can also use Spot Instances.
|
|
64
158
|
|
|
65
159
|
```ts
|
|
66
|
-
import
|
|
67
|
-
import
|
|
160
|
+
import * as ec2 from "aws-cdk-lib/aws-ec2";
|
|
161
|
+
import * as s3 from "aws-cdk-lib/aws-s3";
|
|
68
162
|
|
|
69
|
-
declare const vpc: Vpc;
|
|
70
|
-
declare const bucket: Bucket;
|
|
71
|
-
new NeuronxCompile(
|
|
163
|
+
declare const vpc: ec2.Vpc;
|
|
164
|
+
declare const bucket: s3.Bucket;
|
|
165
|
+
new NeuronxCompile(this, "NeuronxCompile", {
|
|
72
166
|
vpc,
|
|
73
167
|
bucket,
|
|
74
168
|
model: Model.fromHuggingFace("example/example-7b-chat", {
|
|
@@ -83,12 +177,12 @@ new NeuronxCompile(stack, "NeuronxCompile", {
|
|
|
83
177
|
If you are familiar with Neuronx, you can also specify compilation options to better meet your requirements.
|
|
84
178
|
|
|
85
179
|
```ts
|
|
86
|
-
import
|
|
87
|
-
import
|
|
180
|
+
import * as ec2 from "aws-cdk-lib/aws-ec2";
|
|
181
|
+
import * as s3 from "aws-cdk-lib/aws-s3";
|
|
88
182
|
|
|
89
|
-
declare const vpc: Vpc;
|
|
90
|
-
declare const bucket: Bucket;
|
|
91
|
-
new NeuronxCompile(
|
|
183
|
+
declare const vpc: ec2.Vpc;
|
|
184
|
+
declare const bucket: s3.Bucket;
|
|
185
|
+
new NeuronxCompile(this, "NeuronxCompile", {
|
|
92
186
|
vpc,
|
|
93
187
|
bucket,
|
|
94
188
|
model: Model.fromHuggingFace("example/example-22b-chat", {
|
|
Binary file
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { IBucket } from "aws-cdk-lib/aws-s3";
|
|
2
|
+
/**
|
|
3
|
+
* Quant data type.
|
|
4
|
+
*/
|
|
5
|
+
export declare enum QuantDtype {
|
|
6
|
+
/**
|
|
7
|
+
* int8 weight storage.
|
|
8
|
+
*/
|
|
9
|
+
S8 = "s8"
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Optimization level.
|
|
13
|
+
*/
|
|
14
|
+
export declare enum OptLevel {
|
|
15
|
+
/**
|
|
16
|
+
* enables the core performance optimizations in the compiler, while also minimizing compile time.
|
|
17
|
+
*/
|
|
18
|
+
MINIMIZING_COMPILE_TIME = 1,
|
|
19
|
+
/**
|
|
20
|
+
* provides the best balance between model performance and compile time.
|
|
21
|
+
*/
|
|
22
|
+
BEST_BALANCE = 2,
|
|
23
|
+
/**
|
|
24
|
+
* may provide additional model execution performance but may incur longer compile times and higher host memory usage during model compilation.
|
|
25
|
+
*/
|
|
26
|
+
MODEL_EXECUTION_PERFORMANCE = 3
|
|
27
|
+
}
|
|
28
|
+
/**
|
|
29
|
+
* Compile options.
|
|
30
|
+
*/
|
|
31
|
+
export interface CompileOptions {
|
|
32
|
+
/**
|
|
33
|
+
* @default - calc from parameters and quantDtype
|
|
34
|
+
*/
|
|
35
|
+
readonly tpDegree?: number;
|
|
36
|
+
/**
|
|
37
|
+
* @default - No quant
|
|
38
|
+
*/
|
|
39
|
+
readonly quantDtype?: QuantDtype;
|
|
40
|
+
/**
|
|
41
|
+
* @default 4096
|
|
42
|
+
*/
|
|
43
|
+
readonly nPositions?: number;
|
|
44
|
+
/**
|
|
45
|
+
* @default OptLevel.BEST_BALANCE
|
|
46
|
+
*/
|
|
47
|
+
readonly optLevel?: OptLevel;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Represents the amount of parameters.
|
|
51
|
+
*/
|
|
52
|
+
export declare class Parameters {
|
|
53
|
+
private readonly billion;
|
|
54
|
+
/**
|
|
55
|
+
* Create a Parameters representing an amount bilion.
|
|
56
|
+
* @param parameters number of parameters bilionX
|
|
57
|
+
* @returns parameters
|
|
58
|
+
*/
|
|
59
|
+
static billion(parameters: number): Parameters;
|
|
60
|
+
private constructor();
|
|
61
|
+
/**
|
|
62
|
+
* Return this number of parameters as bilion.
|
|
63
|
+
* @returns This number of parameters as bilion.
|
|
64
|
+
*/
|
|
65
|
+
toBilion(): number;
|
|
66
|
+
}
|
|
67
|
+
/**
|
|
68
|
+
* Compile target model basic infromation
|
|
69
|
+
*/
|
|
70
|
+
export interface ModelOptions {
|
|
71
|
+
readonly parameters: Parameters;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Compile target model.
|
|
75
|
+
*/
|
|
76
|
+
export declare class Model {
|
|
77
|
+
readonly modelId: string;
|
|
78
|
+
readonly options: ModelOptions;
|
|
79
|
+
readonly bucket?: IBucket | undefined;
|
|
80
|
+
readonly prefix?: string | undefined;
|
|
81
|
+
/**
|
|
82
|
+
* model informations at HuggingFace
|
|
83
|
+
* @param modelId model id on the HuggingFace
|
|
84
|
+
* @param options model basic infromation
|
|
85
|
+
* @returns model instance
|
|
86
|
+
*/
|
|
87
|
+
static fromHuggingFace(modelId: string, options: ModelOptions): Model;
|
|
88
|
+
/**
|
|
89
|
+
* model informations at S3 Bucket
|
|
90
|
+
* @param bucket Model stored S3 Bucket
|
|
91
|
+
* @param prefix Model stored objects prefix
|
|
92
|
+
* @param options model basic infromation
|
|
93
|
+
* @returns model instance
|
|
94
|
+
*/
|
|
95
|
+
static fromBucket(bucket: IBucket, prefix: string, options: ModelOptions): Model;
|
|
96
|
+
private constructor();
|
|
97
|
+
}
|
|
@@ -3,6 +3,7 @@ import * as ec2 from "aws-cdk-lib/aws-ec2";
|
|
|
3
3
|
import { ContainerImage } from "aws-cdk-lib/aws-ecs";
|
|
4
4
|
import { IBucket } from "aws-cdk-lib/aws-s3";
|
|
5
5
|
import { Construct } from "constructs";
|
|
6
|
+
import { CompileOptions, Model, OptLevel, Parameters, QuantDtype } from "./model";
|
|
6
7
|
import { NeuronxInstanceType } from "./neuronx-instance-type";
|
|
7
8
|
/**
|
|
8
9
|
* Compile runtime.
|
|
@@ -17,92 +18,6 @@ export interface CompileRuntime {
|
|
|
17
18
|
*/
|
|
18
19
|
readonly neuronxVersion: string;
|
|
19
20
|
}
|
|
20
|
-
/**
|
|
21
|
-
* Quant data type.
|
|
22
|
-
*/
|
|
23
|
-
export declare enum QuantDtype {
|
|
24
|
-
/**
|
|
25
|
-
* int8 weight storage.
|
|
26
|
-
*/
|
|
27
|
-
S8 = "s8"
|
|
28
|
-
}
|
|
29
|
-
/**
|
|
30
|
-
* Optimization level.
|
|
31
|
-
*/
|
|
32
|
-
export declare enum OptLevel {
|
|
33
|
-
/**
|
|
34
|
-
* enables the core performance optimizations in the compiler, while also minimizing compile time.
|
|
35
|
-
*/
|
|
36
|
-
MINIMIZING_COMPILE_TIME = 1,
|
|
37
|
-
/**
|
|
38
|
-
* provides the best balance between model performance and compile time.
|
|
39
|
-
*/
|
|
40
|
-
BEST_BALANCE = 2,
|
|
41
|
-
/**
|
|
42
|
-
* may provide additional model execution performance but may incur longer compile times and higher host memory usage during model compilation.
|
|
43
|
-
*/
|
|
44
|
-
MODEL_EXECUTION_PERFORMANCE = 3
|
|
45
|
-
}
|
|
46
|
-
/**
|
|
47
|
-
* Compile options.
|
|
48
|
-
*/
|
|
49
|
-
export interface CompileOptions {
|
|
50
|
-
/**
|
|
51
|
-
* @default - calc from parameters and quantDtype
|
|
52
|
-
*/
|
|
53
|
-
readonly tpDegree?: number;
|
|
54
|
-
/**
|
|
55
|
-
* @default - No quant
|
|
56
|
-
*/
|
|
57
|
-
readonly quantDtype?: QuantDtype;
|
|
58
|
-
/**
|
|
59
|
-
* @default 4092
|
|
60
|
-
*/
|
|
61
|
-
readonly nPositions?: number;
|
|
62
|
-
/**
|
|
63
|
-
* @default OptLevel.BEST_BALANCE
|
|
64
|
-
*/
|
|
65
|
-
readonly optLevel?: OptLevel;
|
|
66
|
-
}
|
|
67
|
-
/**
|
|
68
|
-
* Represents the amount of parameters.
|
|
69
|
-
*/
|
|
70
|
-
export declare class Parameters {
|
|
71
|
-
private readonly billion;
|
|
72
|
-
/**
|
|
73
|
-
* Create a Parameters representing an amount bilion.
|
|
74
|
-
* @param parameters number of parameters bilionX
|
|
75
|
-
* @returns parameters
|
|
76
|
-
*/
|
|
77
|
-
static billion(parameters: number): Parameters;
|
|
78
|
-
private constructor();
|
|
79
|
-
/**
|
|
80
|
-
* Return this number of parameters as bilion.
|
|
81
|
-
* @returns This number of parameters as bilion.
|
|
82
|
-
*/
|
|
83
|
-
toBilion(): number;
|
|
84
|
-
}
|
|
85
|
-
/**
|
|
86
|
-
* Compile target model basic infromation
|
|
87
|
-
*/
|
|
88
|
-
export interface ModelOptions {
|
|
89
|
-
readonly parameters: Parameters;
|
|
90
|
-
}
|
|
91
|
-
/**
|
|
92
|
-
* Compile target model.
|
|
93
|
-
*/
|
|
94
|
-
export declare class Model {
|
|
95
|
-
readonly modelId: string;
|
|
96
|
-
readonly options: ModelOptions;
|
|
97
|
-
/**
|
|
98
|
-
* model informations at HuggingFace
|
|
99
|
-
* @param modelId model id on the HuggingFace
|
|
100
|
-
* @param options model basic infromation
|
|
101
|
-
* @returns model instance
|
|
102
|
-
*/
|
|
103
|
-
static fromHuggingFace(modelId: string, options: ModelOptions): Model;
|
|
104
|
-
private constructor();
|
|
105
|
-
}
|
|
106
21
|
/**
|
|
107
22
|
* Props of NeuronxCompile.
|
|
108
23
|
*/
|
|
@@ -111,10 +26,6 @@ export interface NeuronxCompileProps {
|
|
|
111
26
|
* VPC in which this will launch compile worker instance.
|
|
112
27
|
*/
|
|
113
28
|
readonly vpc: ec2.IVpc;
|
|
114
|
-
/**
|
|
115
|
-
* The instance type of compile worker instance.
|
|
116
|
-
*/
|
|
117
|
-
readonly instanceType?: NeuronxInstanceType;
|
|
118
29
|
/**
|
|
119
30
|
* The bucket to upload compiled artifacts.
|
|
120
31
|
*/
|
|
@@ -123,6 +34,10 @@ export interface NeuronxCompileProps {
|
|
|
123
34
|
* The model to be compiled.
|
|
124
35
|
*/
|
|
125
36
|
readonly model: Model;
|
|
37
|
+
/**
|
|
38
|
+
* The instance type of compile worker instance.
|
|
39
|
+
*/
|
|
40
|
+
readonly instanceType?: NeuronxInstanceType;
|
|
126
41
|
/**
|
|
127
42
|
* The root volume of worker instance.
|
|
128
43
|
* @default - N bilion parameters * 5GiB EBS
|
|
@@ -155,12 +70,20 @@ export interface NeuronxCompileProps {
|
|
|
155
70
|
* Neuronx compile construct. Compile the model to work with Inferentia2 and Trainium1 and upload it to an S3 bucket.
|
|
156
71
|
*/
|
|
157
72
|
export declare class NeuronxCompile extends Construct {
|
|
73
|
+
readonly compiledArtifactS3Bucket: IBucket;
|
|
158
74
|
/**
|
|
159
75
|
* S3 URL that compiled artifact uploaded.
|
|
160
76
|
*/
|
|
161
77
|
readonly compiledArtifactS3Url: string;
|
|
78
|
+
/**
|
|
79
|
+
* S3 Prefix that compiled artifact uploaded.
|
|
80
|
+
*/
|
|
81
|
+
readonly compiledArtifactS3Prefix: string;
|
|
82
|
+
readonly tpDegree: number;
|
|
83
|
+
readonly quantDtype?: QuantDtype;
|
|
84
|
+
readonly nPositions: number;
|
|
85
|
+
readonly optLevel: OptLevel;
|
|
86
|
+
readonly parameters: Parameters;
|
|
162
87
|
constructor(scope: Construct, id: string, props: NeuronxCompileProps);
|
|
163
|
-
private connectAcceleratorChips;
|
|
164
|
-
private calcTpDegree;
|
|
165
88
|
private selectInstanceTypeByTpDegree;
|
|
166
89
|
}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
import * as sagemaker from "@aws-cdk/aws-sagemaker-alpha";
|
|
2
|
+
import { Duration, Size } from "aws-cdk-lib";
|
|
3
|
+
import { Grant, IGrantable } from "aws-cdk-lib/aws-iam";
|
|
4
|
+
import { IBucket } from "aws-cdk-lib/aws-s3";
|
|
5
|
+
import { ISource } from "aws-cdk-lib/aws-s3-deployment";
|
|
6
|
+
import { Construct } from "constructs";
|
|
7
|
+
import { CompileOptions, OptLevel, Parameters, QuantDtype } from "./model";
|
|
8
|
+
import { NeuronxCompile } from "./neuronx-compile";
|
|
9
|
+
import { NeuronxInstanceType } from "./neuronx-instance-type";
|
|
10
|
+
/**
|
|
11
|
+
* Precompiled model options.
|
|
12
|
+
*/
|
|
13
|
+
export interface CompiledModelOptions {
|
|
14
|
+
/**
|
|
15
|
+
* Neuronx compile options.
|
|
16
|
+
* @default - Each properties are set default.
|
|
17
|
+
*/
|
|
18
|
+
readonly compileOptions?: CompileOptions;
|
|
19
|
+
/**
|
|
20
|
+
* Code used for inference
|
|
21
|
+
* @default - using the predefined code
|
|
22
|
+
*/
|
|
23
|
+
readonly code?: ISource;
|
|
24
|
+
/**
|
|
25
|
+
* Model ID or saved path
|
|
26
|
+
* @default "./model"
|
|
27
|
+
*/
|
|
28
|
+
readonly modelIdOrPath?: string;
|
|
29
|
+
/**
|
|
30
|
+
* The path where compiled artifacts (i.e. xxx.neff) are stored
|
|
31
|
+
* @default "./compiled"
|
|
32
|
+
*/
|
|
33
|
+
readonly compiledArtifactPath?: string;
|
|
34
|
+
}
|
|
35
|
+
export interface BucketCompiledModelOptions extends CompiledModelOptions {
|
|
36
|
+
/**
|
|
37
|
+
* The number of parameters of model.
|
|
38
|
+
*/
|
|
39
|
+
readonly parameters: Parameters;
|
|
40
|
+
}
|
|
41
|
+
export declare class TransformersNeuronxSageMakerInferenceModelData {
|
|
42
|
+
static fromBucket(bucket: IBucket, prefix: string, options: BucketCompiledModelOptions): TransformersNeuronxSageMakerInferenceModelData;
|
|
43
|
+
static fromNeuronxCompile(compile: NeuronxCompile, code?: ISource): TransformersNeuronxSageMakerInferenceModelData;
|
|
44
|
+
readonly bucket: IBucket;
|
|
45
|
+
readonly compiledArtifactS3Prefix: string;
|
|
46
|
+
readonly code: ISource;
|
|
47
|
+
readonly tpDegree: number;
|
|
48
|
+
readonly quantDtype?: QuantDtype;
|
|
49
|
+
readonly nPositions: number;
|
|
50
|
+
readonly optLevel: OptLevel;
|
|
51
|
+
readonly modelIdOrPath?: string;
|
|
52
|
+
readonly compiledArtifactPath?: string;
|
|
53
|
+
readonly parameters: Parameters;
|
|
54
|
+
private constructor();
|
|
55
|
+
}
|
|
56
|
+
export interface TransformersNeuronxSageMakerRealtimeInferenceEndpointProps {
|
|
57
|
+
/**
|
|
58
|
+
* Model data for SageMaker inference.
|
|
59
|
+
* The model data requires at least compiled artifacts.
|
|
60
|
+
*/
|
|
61
|
+
readonly modelData: TransformersNeuronxSageMakerInferenceModelData;
|
|
62
|
+
/**
|
|
63
|
+
* An image of the container where the inference job is executed.
|
|
64
|
+
*/
|
|
65
|
+
readonly image?: sagemaker.ContainerImage;
|
|
66
|
+
/**
|
|
67
|
+
* A map of environment variables to pass into the container.
|
|
68
|
+
* @default - Only the predefined environment variables required to use Neuronx have been set.
|
|
69
|
+
*/
|
|
70
|
+
readonly environment?: {
|
|
71
|
+
[key: string]: string;
|
|
72
|
+
};
|
|
73
|
+
/**
|
|
74
|
+
* The instance type of compile worker instance.
|
|
75
|
+
* @default - It is determined automatically according to the number of model parameters and compilation options.
|
|
76
|
+
*/
|
|
77
|
+
readonly instanceType?: NeuronxInstanceType;
|
|
78
|
+
/**
|
|
79
|
+
* The size, of the ML storage volume attached to individual inference instance associated with the production variant.
|
|
80
|
+
* Currently only Amazon EBS gp2 storage volumes are supported.
|
|
81
|
+
* @see https://aws.amazon.com/jp/releasenotes/host-instance-storage-volumes-table
|
|
82
|
+
* @default - 2.5 GB per billion parameter (Max 512 GB)
|
|
83
|
+
*/
|
|
84
|
+
readonly volumeSize?: Size;
|
|
85
|
+
/**
|
|
86
|
+
* The timeout value, to download and extract the model that you want to host from Amazon S3
|
|
87
|
+
* to the individual inference instance associated with this production variant.
|
|
88
|
+
* @default - 60 seconds, when `volumeSize` larger than 30GB then 1GB x 15 seconds (max 60 minutes)
|
|
89
|
+
*/
|
|
90
|
+
readonly modelDataDownloadTimeout?: Duration;
|
|
91
|
+
/**
|
|
92
|
+
* The timeout value, for your inference container to pass health check by SageMaker Hosting.
|
|
93
|
+
* @see https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
|
|
94
|
+
* @default - 60 seconds, when set the `modelDataDownloadTimeout` then use same value (max 60 minutes)
|
|
95
|
+
*/
|
|
96
|
+
readonly containerStartupHealthCheckTimeout?: Duration;
|
|
97
|
+
}
|
|
98
|
+
export declare class TransformersNeuronxSageMakerRealtimeInferenceEndpoint extends Construct {
|
|
99
|
+
/**
|
|
100
|
+
* The ARN of the endpoint.
|
|
101
|
+
* @attribute
|
|
102
|
+
*/
|
|
103
|
+
readonly endpointArn: string;
|
|
104
|
+
/**
|
|
105
|
+
* The name of the endpoint.
|
|
106
|
+
* @attribute
|
|
107
|
+
*/
|
|
108
|
+
readonly endpointName: string;
|
|
109
|
+
private readonly endpoint;
|
|
110
|
+
constructor(scope: Construct, id: string, props: TransformersNeuronxSageMakerRealtimeInferenceEndpointProps);
|
|
111
|
+
grantInvoke(grantee: IGrantable): Grant;
|
|
112
|
+
private selectInstanceTypeByTpDegree;
|
|
113
|
+
}
|
package/lib/index.d.ts
CHANGED
package/lib/index.js
CHANGED
|
@@ -14,6 +14,8 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
|
|
|
14
14
|
for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
|
|
15
15
|
};
|
|
16
16
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
17
|
+
__exportStar(require("./model"), exports);
|
|
17
18
|
__exportStar(require("./neuronx-compile"), exports);
|
|
18
19
|
__exportStar(require("./neuronx-instance-type"), exports);
|
|
19
|
-
|
|
20
|
+
__exportStar(require("./transformers-neuronx-sagemaker-realtime-inference"), exports);
|
|
21
|
+
//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6Ijs7Ozs7Ozs7Ozs7Ozs7OztBQUFBLDBDQUF3QjtBQUN4QixvREFBa0M7QUFDbEMsMERBQXdDO0FBQ3hDLHNGQUFvRSIsInNvdXJjZXNDb250ZW50IjpbImV4cG9ydCAqIGZyb20gXCIuL21vZGVsXCI7XG5leHBvcnQgKiBmcm9tIFwiLi9uZXVyb254LWNvbXBpbGVcIjtcbmV4cG9ydCAqIGZyb20gXCIuL25ldXJvbngtaW5zdGFuY2UtdHlwZVwiO1xuZXhwb3J0ICogZnJvbSBcIi4vdHJhbnNmb3JtZXJzLW5ldXJvbngtc2FnZW1ha2VyLXJlYWx0aW1lLWluZmVyZW5jZVwiO1xuIl19
|
package/lib/model.d.ts
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import { IBucket } from "aws-cdk-lib/aws-s3";
|
|
2
|
+
/**
|
|
3
|
+
* Quant data type.
|
|
4
|
+
*/
|
|
5
|
+
export declare enum QuantDtype {
|
|
6
|
+
/**
|
|
7
|
+
* int8 weight storage.
|
|
8
|
+
*/
|
|
9
|
+
S8 = "s8"
|
|
10
|
+
}
|
|
11
|
+
/**
|
|
12
|
+
* Optimization level.
|
|
13
|
+
*/
|
|
14
|
+
export declare enum OptLevel {
|
|
15
|
+
/**
|
|
16
|
+
* enables the core performance optimizations in the compiler, while also minimizing compile time.
|
|
17
|
+
*/
|
|
18
|
+
MINIMIZING_COMPILE_TIME = 1,
|
|
19
|
+
/**
|
|
20
|
+
* provides the best balance between model performance and compile time.
|
|
21
|
+
*/
|
|
22
|
+
BEST_BALANCE = 2,
|
|
23
|
+
/**
|
|
24
|
+
* may provide additional model execution performance but may incur longer compile times and higher host memory usage during model compilation.
|
|
25
|
+
*/
|
|
26
|
+
MODEL_EXECUTION_PERFORMANCE = 3
|
|
27
|
+
}
|
|
28
|
+
/**
|
|
29
|
+
* Compile options.
|
|
30
|
+
*/
|
|
31
|
+
export interface CompileOptions {
|
|
32
|
+
/**
|
|
33
|
+
* @default - calc from parameters and quantDtype
|
|
34
|
+
*/
|
|
35
|
+
readonly tpDegree?: number;
|
|
36
|
+
/**
|
|
37
|
+
* @default - No quant
|
|
38
|
+
*/
|
|
39
|
+
readonly quantDtype?: QuantDtype;
|
|
40
|
+
/**
|
|
41
|
+
* @default 4096
|
|
42
|
+
*/
|
|
43
|
+
readonly nPositions?: number;
|
|
44
|
+
/**
|
|
45
|
+
* @default OptLevel.BEST_BALANCE
|
|
46
|
+
*/
|
|
47
|
+
readonly optLevel?: OptLevel;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Represents the amount of parameters.
|
|
51
|
+
*/
|
|
52
|
+
export declare class Parameters {
|
|
53
|
+
private readonly billion;
|
|
54
|
+
/**
|
|
55
|
+
* Create a Parameters representing an amount bilion.
|
|
56
|
+
* @param parameters number of parameters bilionX
|
|
57
|
+
* @returns parameters
|
|
58
|
+
*/
|
|
59
|
+
static billion(parameters: number): Parameters;
|
|
60
|
+
private constructor();
|
|
61
|
+
/**
|
|
62
|
+
* Return this number of parameters as bilion.
|
|
63
|
+
* @returns This number of parameters as bilion.
|
|
64
|
+
*/
|
|
65
|
+
toBilion(): number;
|
|
66
|
+
}
|
|
67
|
+
/**
|
|
68
|
+
* Compile target model basic infromation
|
|
69
|
+
*/
|
|
70
|
+
export interface ModelOptions {
|
|
71
|
+
readonly parameters: Parameters;
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Compile target model.
|
|
75
|
+
*/
|
|
76
|
+
export declare class Model {
|
|
77
|
+
readonly modelId: string;
|
|
78
|
+
readonly options: ModelOptions;
|
|
79
|
+
readonly bucket?: IBucket | undefined;
|
|
80
|
+
readonly prefix?: string | undefined;
|
|
81
|
+
/**
|
|
82
|
+
* model informations at HuggingFace
|
|
83
|
+
* @param modelId model id on the HuggingFace
|
|
84
|
+
* @param options model basic infromation
|
|
85
|
+
* @returns model instance
|
|
86
|
+
*/
|
|
87
|
+
static fromHuggingFace(modelId: string, options: ModelOptions): Model;
|
|
88
|
+
/**
|
|
89
|
+
* model informations at S3 Bucket
|
|
90
|
+
* @param bucket Model stored S3 Bucket
|
|
91
|
+
* @param prefix Model stored objects prefix
|
|
92
|
+
* @param options model basic infromation
|
|
93
|
+
* @returns model instance
|
|
94
|
+
*/
|
|
95
|
+
static fromBucket(bucket: IBucket, prefix: string, options: ModelOptions): Model;
|
|
96
|
+
private constructor();
|
|
97
|
+
}
|