aws-cdk-neuronx-patterns 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,29 +1,121 @@
1
1
  # Neuronx patterns Construct Library
2
2
 
3
+ > [!WARNING]
4
+ > This library is an experimental module.
5
+
3
6
  This library provides high-level architectural patterns using neuronx (e.g. Inferentia2 and Trainium1). It contains:
4
7
 
8
+ - Transformers Neuronx SageMaker Real-time Inference Endpoint
5
9
  - Neuronx Compile
6
10
 
7
- ## Neuronx Compile
11
+ ## Transformers Neuronx SageMaker Real-time Inference Endpoint
8
12
 
9
- :::note warn
10
- This construct uses an Inferentia2 instance on EC2. You may need to increase your request limit for your AWS account.
11
- :::
13
+ > [!WARNING]
14
+ > This construct uses an Inferentia2 instance on SageMaker. You may need to increase your request limit for your AWS account.
12
15
 
13
- This construct compiles models supported by Neuronx and uploads them to the specified S3 bucket.
16
+ By using the `NeuronxCompile` construct included in this construct library, models published on HuggingFace can be easily deployed to SageMaker Real-time inference. To define an endpoint using the `NeuronxCompile` construct:
14
17
 
15
- This is NeuronxCompile architecture.
16
- ![NeuronxCompile architecture](./docs/neuronx-compile-architecture.png)
18
+ ```ts
19
+ import * as ec2 from "aws-cdk-lib/aws-ec2";
20
+ import * as s3 from "aws-cdk-lib/aws-s3";
21
+
22
+ declare const vpc: ec2.Vpc;
23
+ declare const bucket: s3.Bucket;
24
+ const compile = new NeuronxCompile(this, "NeuronxCompile", {
25
+ vpc,
26
+ bucket,
27
+ model: Model.fromHuggingFace("example/example-7b-chat", {
28
+ parameters: Parameters.billion(7),
29
+ }),
30
+ });
31
+ new TransformersNeuronxSageMakerRealtimeInferenceEndpoint(
32
+ this,
33
+ "RealtimeInference",
34
+ {
35
+ modelData:
36
+ TransformersNeuronxSageMakerInferenceModelData.fromNeuronxCompile(
37
+ compile,
38
+ ),
39
+ },
40
+ );
41
+ ```
42
+
43
+ ### Default inference code
44
+
45
+ By default, predefined inference code that implements a chat interface is deployed. The default inference code takes an array of message objects like [transformers' conversations](https://huggingface.co/docs/transformers/main/en/conversations) and responds with the generated text. The following code is an example using the AWS SDK for JavaScript v3.
17
46
 
18
- To define
47
+ ```ts
48
+ import {
49
+ InvokeEndpointCommand,
50
+ SageMakerRuntimeClient,
51
+ } from "@aws-sdk/client-sagemaker-runtime";
52
+
53
+ const client = new SageMakerRuntimeClient({
54
+ region: "us-east-1",
55
+ });
56
+ client
57
+ .send(
58
+ new InvokeEndpointCommand({
59
+ EndpointName: "my-endpoint-id",
60
+ Body: JSON.stringify({
61
+ // Optional. You can change the answer role.
62
+ role: "ai",
63
+ // Required. The messages that make up the conversation.
64
+ messages: [
65
+ {
66
+ role: "system",
67
+ content: `You are helpfull assistant.`,
68
+ },
69
+ {
70
+ role: "user",
71
+ content:
72
+ "please answer '1+1=?'. You must answer only answer numeric.",
73
+ },
74
+ ],
75
+ }),
76
+ ContentType: "application/json",
77
+ Accept: "application/json",
78
+ }),
79
+ )
80
+ .then((res) => {
81
+ // { generated_text: "2" }
82
+ console.log(JSON.parse(res.Body.transformToString()));
83
+ });
84
+ ```
85
+
86
+ To use your own inference code, you can pass the code source.
87
+
88
+ ```ts
89
+ import * as s3Deplyment from "aws-cdk-lib/aws-s3-deployment";
90
+
91
+ declare const compile: NeuronxCompile;
92
+ new TransformersNeuronxSageMakerRealtimeInferenceEndpoint(
93
+ this,
94
+ "RealtimeInference",
95
+ {
96
+ modelData:
97
+ TransformersNeuronxSageMakerInferenceModelData.fromNeuronxCompile(
98
+ compile,
99
+ s3Deplyment.Source.asset("path/to/my/code/directory"),
100
+ ),
101
+ },
102
+ );
103
+ ```
104
+
105
+ ## Neuronx Compile
106
+
107
+ > [!WARNING]
108
+ > This construct uses an Inferentia2 instance on EC2. You may need to increase your request limit for your AWS account.
109
+
110
+ This construct compiles models supported by Neuronx and uploads them to the specified S3 bucket. To define a `NeuronxCompile` construct:
19
111
 
20
112
  ```ts
21
- import { Vpc } from "aws-cdk-lib/aws-ec2";
22
- import { Bucket } from "aws-cdk-lib/aws-s3";
113
+ import * as ec2 from "aws-cdk-lib/aws-ec2";
114
+ import * as s3 from "aws-cdk-lib/aws-s3";
23
115
 
24
- declare const vpc: Vpc;
25
- declare const bucket: Bucket;
26
- const compile = new NeuronxCompile(stack, "NeuronxCompile", {
116
+ declare const vpc: ec2.Vpc;
117
+ declare const bucket: s3.Bucket;
118
+ const compile = new NeuronxCompile(this, "NeuronxCompile", {
27
119
  vpc,
28
120
  bucket,
29
121
  model: Model.fromHuggingFace("example/example-7b-chat", {
@@ -32,7 +124,7 @@ const compile = new NeuronxCompile(stack, "NeuronxCompile", {
32
124
  });
33
125
 
34
126
  // Get the compiled artifacts from this S3 URL
35
- new CfnOutput(stack, "CompiledArtifact", {
127
+ new CfnOutput(this, "CompiledArtifact", {
36
128
  value: compile.compiledArtifactS3Url,
37
129
  });
38
130
  ```
@@ -54,21 +146,23 @@ After compiled, you can see like the this file tree in the S3 bucket.
54
146
  └── xxx.neff
55
147
  ```
56
148
 
149
+ This is the NeuronxCompile architecture.
150
+ ![NeuronxCompile architecture](./docs/neuronx-compile-architecture.png)
151
+
57
152
  ### Spot Instance
58
153
 
59
- :::note warn
60
- If you use Spot Instances, check if the request limit for Spot has been increased.
61
- :::
154
+ > [!WARNING]
155
+ > If you use Spot Instances, check if the request limit for Spot has been increased.
62
156
 
63
157
  You can also use Spot Instances.
64
158
 
65
159
  ```ts
66
- import { Vpc } from "aws-cdk-lib/aws-ec2";
67
- import { Bucket } from "aws-cdk-lib/aws-s3";
160
+ import * as ec2 from "aws-cdk-lib/aws-ec2";
161
+ import * as s3 from "aws-cdk-lib/aws-s3";
68
162
 
69
- declare const vpc: Vpc;
70
- declare const bucket: Bucket;
71
- new NeuronxCompile(stack, "NeuronxCompile", {
163
+ declare const vpc: ec2.Vpc;
164
+ declare const bucket: s3.Bucket;
165
+ new NeuronxCompile(this, "NeuronxCompile", {
72
166
  vpc,
73
167
  bucket,
74
168
  model: Model.fromHuggingFace("example/example-7b-chat", {
@@ -83,12 +177,12 @@ new NeuronxCompile(stack, "NeuronxCompile", {
83
177
  If you are familiar with Neuronx, you can also specify compilation options to better meet your requirements.
84
178
 
85
179
  ```ts
86
- import { Vpc } from "aws-cdk-lib/aws-ec2";
87
- import { Bucket } from "aws-cdk-lib/aws-s3";
180
+ import * as ec2 from "aws-cdk-lib/aws-ec2";
181
+ import * as s3 from "aws-cdk-lib/aws-s3";
88
182
 
89
- declare const vpc: Vpc;
90
- declare const bucket: Bucket;
91
- new NeuronxCompile(stack, "NeuronxCompile", {
183
+ declare const vpc: ec2.Vpc;
184
+ declare const bucket: s3.Bucket;
185
+ new NeuronxCompile(this, "NeuronxCompile", {
92
186
  vpc,
93
187
  bucket,
94
188
  model: Model.fromHuggingFace("example/example-22b-chat", {
@@ -1,2 +1,4 @@
1
+ export * from "./model";
1
2
  export * from "./neuronx-compile";
2
3
  export * from "./neuronx-instance-type";
4
+ export * from "./transformers-neuronx-sagemaker-realtime-inference";
@@ -0,0 +1,97 @@
1
+ import { IBucket } from "aws-cdk-lib/aws-s3";
2
+ /**
3
+ * Quant data type.
4
+ */
5
+ export declare enum QuantDtype {
6
+ /**
7
+ * int8 weight storage.
8
+ */
9
+ S8 = "s8"
10
+ }
11
+ /**
12
+ * Optimization level.
13
+ */
14
+ export declare enum OptLevel {
15
+ /**
16
+ * enables the core performance optimizations in the compiler, while also minimizing compile time.
17
+ */
18
+ MINIMIZING_COMPILE_TIME = 1,
19
+ /**
20
+ * provides the best balance between model performance and compile time.
21
+ */
22
+ BEST_BALANCE = 2,
23
+ /**
24
+ * may provide additional model execution performance but may incur longer compile times and higher host memory usage during model compilation.
25
+ */
26
+ MODEL_EXECUTION_PERFORMANCE = 3
27
+ }
28
+ /**
29
+ * Compile options.
30
+ */
31
+ export interface CompileOptions {
32
+ /**
33
+ * @default - calc from parameters and quantDtype
34
+ */
35
+ readonly tpDegree?: number;
36
+ /**
37
+ * @default - No quant
38
+ */
39
+ readonly quantDtype?: QuantDtype;
40
+ /**
41
+ * @default 4096
42
+ */
43
+ readonly nPositions?: number;
44
+ /**
45
+ * @default OptLevel.BEST_BALANCE
46
+ */
47
+ readonly optLevel?: OptLevel;
48
+ }
49
+ /**
50
+ * Represents the amount of parameters.
51
+ */
52
+ export declare class Parameters {
53
+ private readonly billion;
54
+ /**
55
+ * Create a Parameters representing an amount in billions.
56
+ * @param parameters number of parameters, in billions
57
+ * @returns parameters
58
+ */
59
+ static billion(parameters: number): Parameters;
60
+ private constructor();
61
+ /**
62
+ * Return this number of parameters in billions.
63
+ * @returns This number of parameters in billions.
64
+ */
65
+ toBilion(): number;
66
+ }
67
+ /**
68
+ * Basic information about the compile target model.
69
+ */
70
+ export interface ModelOptions {
71
+ readonly parameters: Parameters;
72
+ }
73
+ /**
74
+ * Compile target model.
75
+ */
76
+ export declare class Model {
77
+ readonly modelId: string;
78
+ readonly options: ModelOptions;
79
+ readonly bucket?: IBucket | undefined;
80
+ readonly prefix?: string | undefined;
81
+ /**
82
+ * Model information on HuggingFace.
83
+ * @param modelId model ID on HuggingFace
84
+ * @param options basic model information
85
+ * @returns model instance
86
+ */
87
+ static fromHuggingFace(modelId: string, options: ModelOptions): Model;
88
+ /**
89
+ * Model information in an S3 bucket.
90
+ * @param bucket S3 bucket where the model is stored
91
+ * @param prefix Prefix of the objects where the model is stored
92
+ * @param options basic model information
93
+ * @returns model instance
94
+ */
95
+ static fromBucket(bucket: IBucket, prefix: string, options: ModelOptions): Model;
96
+ private constructor();
97
+ }
@@ -3,6 +3,7 @@ import * as ec2 from "aws-cdk-lib/aws-ec2";
3
3
  import { ContainerImage } from "aws-cdk-lib/aws-ecs";
4
4
  import { IBucket } from "aws-cdk-lib/aws-s3";
5
5
  import { Construct } from "constructs";
6
+ import { CompileOptions, Model, OptLevel, Parameters, QuantDtype } from "./model";
6
7
  import { NeuronxInstanceType } from "./neuronx-instance-type";
7
8
  /**
8
9
  * Compile runtime.
@@ -17,92 +18,6 @@ export interface CompileRuntime {
17
18
  */
18
19
  readonly neuronxVersion: string;
19
20
  }
20
- /**
21
- * Quant data type.
22
- */
23
- export declare enum QuantDtype {
24
- /**
25
- * int8 weight storage.
26
- */
27
- S8 = "s8"
28
- }
29
- /**
30
- * Optimization level.
31
- */
32
- export declare enum OptLevel {
33
- /**
34
- * enables the core performance optimizations in the compiler, while also minimizing compile time.
35
- */
36
- MINIMIZING_COMPILE_TIME = 1,
37
- /**
38
- * provides the best balance between model performance and compile time.
39
- */
40
- BEST_BALANCE = 2,
41
- /**
42
- * may provide additional model execution performance but may incur longer compile times and higher host memory usage during model compilation.
43
- */
44
- MODEL_EXECUTION_PERFORMANCE = 3
45
- }
46
- /**
47
- * Compile options.
48
- */
49
- export interface CompileOptions {
50
- /**
51
- * @default - calc from parameters and quantDtype
52
- */
53
- readonly tpDegree?: number;
54
- /**
55
- * @default - No quant
56
- */
57
- readonly quantDtype?: QuantDtype;
58
- /**
59
- * @default 4092
60
- */
61
- readonly nPositions?: number;
62
- /**
63
- * @default OptLevel.BEST_BALANCE
64
- */
65
- readonly optLevel?: OptLevel;
66
- }
67
- /**
68
- * Represents the amount of parameters.
69
- */
70
- export declare class Parameters {
71
- private readonly billion;
72
- /**
73
- * Create a Parameters representing an amount bilion.
74
- * @param parameters number of parameters bilionX
75
- * @returns parameters
76
- */
77
- static billion(parameters: number): Parameters;
78
- private constructor();
79
- /**
80
- * Return this number of parameters as bilion.
81
- * @returns This number of parameters as bilion.
82
- */
83
- toBilion(): number;
84
- }
85
- /**
86
- * Compile target model basic infromation
87
- */
88
- export interface ModelOptions {
89
- readonly parameters: Parameters;
90
- }
91
- /**
92
- * Compile target model.
93
- */
94
- export declare class Model {
95
- readonly modelId: string;
96
- readonly options: ModelOptions;
97
- /**
98
- * model informations at HuggingFace
99
- * @param modelId model id on the HuggingFace
100
- * @param options model basic infromation
101
- * @returns model instance
102
- */
103
- static fromHuggingFace(modelId: string, options: ModelOptions): Model;
104
- private constructor();
105
- }
106
21
  /**
107
22
  * Props of NeuronxCompile.
108
23
  */
@@ -111,10 +26,6 @@ export interface NeuronxCompileProps {
111
26
  * VPC in which this will launch compile worker instance.
112
27
  */
113
28
  readonly vpc: ec2.IVpc;
114
- /**
115
- * The instance type of compile worker instance.
116
- */
117
- readonly instanceType?: NeuronxInstanceType;
118
29
  /**
119
30
  * The bucket to upload compiled artifacts.
120
31
  */
@@ -123,6 +34,10 @@ export interface NeuronxCompileProps {
123
34
  * The model to be compiled.
124
35
  */
125
36
  readonly model: Model;
37
+ /**
38
+ * The instance type of compile worker instance.
39
+ */
40
+ readonly instanceType?: NeuronxInstanceType;
126
41
  /**
127
42
  * The root volume of worker instance.
128
43
  * @default - N billion parameters * 5GiB EBS
@@ -155,12 +70,20 @@ export interface NeuronxCompileProps {
155
70
  * Neuronx compile construct. Compile the model to work with Inferentia2 and Trainium1 and upload it to an S3 bucket.
156
71
  */
157
72
  export declare class NeuronxCompile extends Construct {
73
+ readonly compiledArtifactS3Bucket: IBucket;
158
74
  /**
159
75
  * S3 URL where the compiled artifact is uploaded.
160
76
  */
161
77
  readonly compiledArtifactS3Url: string;
78
+ /**
79
+ * S3 prefix where the compiled artifact is uploaded.
80
+ */
81
+ readonly compiledArtifactS3Prefix: string;
82
+ readonly tpDegree: number;
83
+ readonly quantDtype?: QuantDtype;
84
+ readonly nPositions: number;
85
+ readonly optLevel: OptLevel;
86
+ readonly parameters: Parameters;
162
87
  constructor(scope: Construct, id: string, props: NeuronxCompileProps);
163
- private connectAcceleratorChips;
164
- private calcTpDegree;
165
88
  private selectInstanceTypeByTpDegree;
166
89
  }
@@ -0,0 +1,2 @@
1
+ import { CompileOptions, Parameters } from "../model";
2
+ export declare function calcTpDegree(parameters: Parameters, compileOptions: CompileOptions): number;
@@ -0,0 +1,113 @@
1
+ import * as sagemaker from "@aws-cdk/aws-sagemaker-alpha";
2
+ import { Duration, Size } from "aws-cdk-lib";
3
+ import { Grant, IGrantable } from "aws-cdk-lib/aws-iam";
4
+ import { IBucket } from "aws-cdk-lib/aws-s3";
5
+ import { ISource } from "aws-cdk-lib/aws-s3-deployment";
6
+ import { Construct } from "constructs";
7
+ import { CompileOptions, OptLevel, Parameters, QuantDtype } from "./model";
8
+ import { NeuronxCompile } from "./neuronx-compile";
9
+ import { NeuronxInstanceType } from "./neuronx-instance-type";
10
+ /**
11
+ * Precompiled model options.
12
+ */
13
+ export interface CompiledModelOptions {
14
+ /**
15
+ * Neuronx compile options.
16
+ * @default - Each properties are set default.
17
+ */
18
+ readonly compileOptions?: CompileOptions;
19
+ /**
20
+ * Code used for inference
21
+ * @default - using the predefined code
22
+ */
23
+ readonly code?: ISource;
24
+ /**
25
+ * Model ID or saved path
26
+ * @default "./model"
27
+ */
28
+ readonly modelIdOrPath?: string;
29
+ /**
30
+ * The path where compiled artifacts (i.e. xxx.neff) are stored
31
+ * @default "./compiled"
32
+ */
33
+ readonly compiledArtifactPath?: string;
34
+ }
35
+ export interface BucketCompiledModelOptions extends CompiledModelOptions {
36
+ /**
37
+ * The number of parameters of model.
38
+ */
39
+ readonly parameters: Parameters;
40
+ }
41
+ export declare class TransformersNeuronxSageMakerInferenceModelData {
42
+ static fromBucket(bucket: IBucket, prefix: string, options: BucketCompiledModelOptions): TransformersNeuronxSageMakerInferenceModelData;
43
+ static fromNeuronxCompile(compile: NeuronxCompile, code?: ISource): TransformersNeuronxSageMakerInferenceModelData;
44
+ readonly bucket: IBucket;
45
+ readonly compiledArtifactS3Prefix: string;
46
+ readonly code: ISource;
47
+ readonly tpDegree: number;
48
+ readonly quantDtype?: QuantDtype;
49
+ readonly nPositions: number;
50
+ readonly optLevel: OptLevel;
51
+ readonly modelIdOrPath?: string;
52
+ readonly compiledArtifactPath?: string;
53
+ readonly parameters: Parameters;
54
+ private constructor();
55
+ }
56
+ export interface TransformersNeuronxSageMakerRealtimeInferenceEndpointProps {
57
+ /**
58
+ * Model data for SageMaker inference.
59
+ * The model data requires at least compiled artifacts.
60
+ */
61
+ readonly modelData: TransformersNeuronxSageMakerInferenceModelData;
62
+ /**
63
+ * An image of the container where the inference job is executed.
64
+ */
65
+ readonly image?: sagemaker.ContainerImage;
66
+ /**
67
+ * A map of environment variables to pass into the container.
68
+ * @default - Only the predefined environment variables required to use Neuronx have been set.
69
+ */
70
+ readonly environment?: {
71
+ [key: string]: string;
72
+ };
73
+ /**
74
+ * The instance type of the inference instance.
75
+ * @default - It is determined automatically according to the number of model parameters and compilation options.
76
+ */
77
+ readonly instanceType?: NeuronxInstanceType;
78
+ /**
79
+ * The size of the ML storage volume attached to each individual inference instance associated with the production variant.
80
+ * Currently only Amazon EBS gp2 storage volumes are supported.
81
+ * @see https://aws.amazon.com/jp/releasenotes/host-instance-storage-volumes-table
82
+ * @default - 2.5 GB per billion parameters (max 512 GB)
83
+ */
84
+ readonly volumeSize?: Size;
85
+ /**
86
+ * The timeout value, to download and extract the model that you want to host from Amazon S3
87
+ * to the individual inference instance associated with this production variant.
88
+ * @default - 60 seconds; when `volumeSize` is larger than 30 GB, 15 seconds per GB (max 60 minutes)
89
+ */
90
+ readonly modelDataDownloadTimeout?: Duration;
91
+ /**
92
+ * The timeout value, for your inference container to pass health check by SageMaker Hosting.
93
+ * @see https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
94
+ * @default - 60 seconds; when `modelDataDownloadTimeout` is set, the same value is used (max 60 minutes)
95
+ */
96
+ readonly containerStartupHealthCheckTimeout?: Duration;
97
+ }
98
+ export declare class TransformersNeuronxSageMakerRealtimeInferenceEndpoint extends Construct {
99
+ /**
100
+ * The ARN of the endpoint.
101
+ * @attribute
102
+ */
103
+ readonly endpointArn: string;
104
+ /**
105
+ * The name of the endpoint.
106
+ * @attribute
107
+ */
108
+ readonly endpointName: string;
109
+ private readonly endpoint;
110
+ constructor(scope: Construct, id: string, props: TransformersNeuronxSageMakerRealtimeInferenceEndpointProps);
111
+ grantInvoke(grantee: IGrantable): Grant;
112
+ private selectInstanceTypeByTpDegree;
113
+ }
package/lib/index.d.ts CHANGED
@@ -1,2 +1,4 @@
1
+ export * from "./model";
1
2
  export * from "./neuronx-compile";
2
3
  export * from "./neuronx-instance-type";
4
+ export * from "./transformers-neuronx-sagemaker-realtime-inference";
package/lib/index.js CHANGED
@@ -14,6 +14,8 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
14
14
  for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
15
15
  };
16
16
  Object.defineProperty(exports, "__esModule", { value: true });
17
+ __exportStar(require("./model"), exports);
17
18
  __exportStar(require("./neuronx-compile"), exports);
18
19
  __exportStar(require("./neuronx-instance-type"), exports);
19
- //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6Ijs7Ozs7Ozs7Ozs7Ozs7OztBQUFBLG9EQUFrQztBQUNsQywwREFBd0MiLCJzb3VyY2VzQ29udGVudCI6WyJleHBvcnQgKiBmcm9tIFwiLi9uZXVyb254LWNvbXBpbGVcIjtcbmV4cG9ydCAqIGZyb20gXCIuL25ldXJvbngtaW5zdGFuY2UtdHlwZVwiO1xuIl19
20
+ __exportStar(require("./transformers-neuronx-sagemaker-realtime-inference"), exports);
21
+ //# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6Ijs7Ozs7Ozs7Ozs7Ozs7OztBQUFBLDBDQUF3QjtBQUN4QixvREFBa0M7QUFDbEMsMERBQXdDO0FBQ3hDLHNGQUFvRSIsInNvdXJjZXNDb250ZW50IjpbImV4cG9ydCAqIGZyb20gXCIuL21vZGVsXCI7XG5leHBvcnQgKiBmcm9tIFwiLi9uZXVyb254LWNvbXBpbGVcIjtcbmV4cG9ydCAqIGZyb20gXCIuL25ldXJvbngtaW5zdGFuY2UtdHlwZVwiO1xuZXhwb3J0ICogZnJvbSBcIi4vdHJhbnNmb3JtZXJzLW5ldXJvbngtc2FnZW1ha2VyLXJlYWx0aW1lLWluZmVyZW5jZVwiO1xuIl19
package/lib/model.d.ts ADDED
@@ -0,0 +1,97 @@
1
+ import { IBucket } from "aws-cdk-lib/aws-s3";
2
+ /**
3
+ * Quant data type.
4
+ */
5
+ export declare enum QuantDtype {
6
+ /**
7
+ * int8 weight storage.
8
+ */
9
+ S8 = "s8"
10
+ }
11
+ /**
12
+ * Optimization level.
13
+ */
14
+ export declare enum OptLevel {
15
+ /**
16
+ * enables the core performance optimizations in the compiler, while also minimizing compile time.
17
+ */
18
+ MINIMIZING_COMPILE_TIME = 1,
19
+ /**
20
+ * provides the best balance between model performance and compile time.
21
+ */
22
+ BEST_BALANCE = 2,
23
+ /**
24
+ * may provide additional model execution performance but may incur longer compile times and higher host memory usage during model compilation.
25
+ */
26
+ MODEL_EXECUTION_PERFORMANCE = 3
27
+ }
28
+ /**
29
+ * Compile options.
30
+ */
31
+ export interface CompileOptions {
32
+ /**
33
+ * @default - calc from parameters and quantDtype
34
+ */
35
+ readonly tpDegree?: number;
36
+ /**
37
+ * @default - No quant
38
+ */
39
+ readonly quantDtype?: QuantDtype;
40
+ /**
41
+ * @default 4096
42
+ */
43
+ readonly nPositions?: number;
44
+ /**
45
+ * @default OptLevel.BEST_BALANCE
46
+ */
47
+ readonly optLevel?: OptLevel;
48
+ }
49
+ /**
50
+ * Represents the amount of parameters.
51
+ */
52
+ export declare class Parameters {
53
+ private readonly billion;
54
+ /**
55
+ * Create a Parameters representing an amount in billions.
56
+ * @param parameters number of parameters, in billions
57
+ * @returns parameters
58
+ */
59
+ static billion(parameters: number): Parameters;
60
+ private constructor();
61
+ /**
62
+ * Return this number of parameters in billions.
63
+ * @returns This number of parameters in billions.
64
+ */
65
+ toBilion(): number;
66
+ }
67
+ /**
68
+ * Basic information about the compile target model.
69
+ */
70
+ export interface ModelOptions {
71
+ readonly parameters: Parameters;
72
+ }
73
+ /**
74
+ * Compile target model.
75
+ */
76
+ export declare class Model {
77
+ readonly modelId: string;
78
+ readonly options: ModelOptions;
79
+ readonly bucket?: IBucket | undefined;
80
+ readonly prefix?: string | undefined;
81
+ /**
82
+ * Model information on HuggingFace.
83
+ * @param modelId model ID on HuggingFace
84
+ * @param options basic model information
85
+ * @returns model instance
86
+ */
87
+ static fromHuggingFace(modelId: string, options: ModelOptions): Model;
88
+ /**
89
+ * Model information in an S3 bucket.
90
+ * @param bucket S3 bucket where the model is stored
91
+ * @param prefix Prefix of the objects where the model is stored
92
+ * @param options basic model information
93
+ * @returns model instance
94
+ */
95
+ static fromBucket(bucket: IBucket, prefix: string, options: ModelOptions): Model;
96
+ private constructor();
97
+ }