npm - aws-cdk-neuronx-patterns - Versions diffs - 0.0.2 → 0.0.4 - Mend

aws-cdk-neuronx-patterns 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (28) hide show

package/.jsii +925 -106
package/API.md +733 -1
package/README.md +121 -27
package/docs/neuronx-compile-architecture.png +0 -0
package/lib/.types-compat/ts3.9/index.d.ts +2 -0
package/lib/.types-compat/ts3.9/model.d.ts +97 -0
package/lib/.types-compat/ts3.9/neuronx-compile.d.ts +15 -92
package/lib/.types-compat/ts3.9/private/util.d.ts +2 -0
package/lib/.types-compat/ts3.9/transformers-neuronx-sagemaker-realtime-inference.d.ts +113 -0
package/lib/index.d.ts +2 -0
package/lib/index.js +3 -1
package/lib/model.d.ts +97 -0
package/lib/model.js +93 -0
package/lib/neuronx-compile.d.ts +15 -92
package/lib/neuronx-compile.js +43 -156
package/lib/neuronx-instance-type.js +2 -2
package/lib/private/await-compile-job/index.js +2 -2
package/lib/private/util.d.ts +2 -0
package/lib/private/util.js +31 -0
package/lib/transformers-neuronx-sagemaker-realtime-inference.d.ts +113 -0
package/lib/transformers-neuronx-sagemaker-realtime-inference.js +150 -0
package/package.json +9 -5
package/scripts/compile/Dockerfile +10 -0
package/scripts/compile/entrypoint.sh +9 -0
package/scripts/inference/transformers-neuronx/Dockerfile +1 -0
package/scripts/inference/transformers-neuronx/code/inference.py +63 -0
package/scripts/inference/transformers-neuronx/code/requirements.txt +1 -0
/package/scripts/{compile.py → compile/compile.py} +0 -0

package/README.md CHANGED Viewed

@@ -1,29 +1,121 @@
 # Neuronx patterns Construct Library
+> [!WARNING]
+> This library is an experimental module.
 This library provides high-level architectural patterns using neuronx (e.g. Inferentia2 and Trainium1). It contains:
+- Transformers Neuronx SageMaker Real-time Inference Endpoint
 - Neuronx Compile
-## Neuronx Compile
+## Transformers Neuronx SageMaker Real-time Inference Endpoint
-:::note warn
-This construct uses an Inferentia2 instance on EC2. You may need to increase your request limit for your AWS account.
-:::
+> [!WARNING]
+> This construct uses an Inferentia2 instance on SageMaker. You may need to increase your request limit for your AWS account.
-This construct compiles models supported by Neuronx and uploads them to the specified S3 bucket.
+By using the `NeuronxCompile` construct included in this construct library, models published on HuggingFace can be easily deployed to SageMaker Real-time inference. To define an endpoint using the `NeuronxCompile` construct:
-This is NeuronxCompile architecture.
-![NeuronxCompile architecture](./docs/neuronx-compile-architecture.png)
+```ts
+import * as ec2 from "aws-cdk-lib/aws-ec2";
+import * as s3 from "aws-cdk-lib/aws-s3";
+declare const vpc: ec2.Vpc;
+declare const bucket: s3.Bucket;
+const compile = new NeuronxCompile(this, "NeuronxCompile", {
+  vpc,
+  bucket,
+  model: Model.fromHuggingFace("example/example-7b-chat", {
+    parameters: Parameters.billion(7),
+  }),
+});
+new TransformersNeuronxSageMakerRealtimeInferenceEndpoint(
+  this,
+  "RealtimeInference",
+  {
+    modelData:
+      TransformersNeuronxSageMakerInferenceModelData.fromNeuronxCompile(
+        compile,
+      ),
+  },
+);
+```
+### Default inference code
+By default, predefined inference code that implements a chat interface is deployed. The default inference code takes an array of objects like [transformers' conversations](https://huggingface.co/docs/transformers/main/en/conversations) and responds with the generated text. The following code is an example using the AWS SDK for JavaScript v3.
-To define
+```ts
+import {
+  InvokeEndpointCommand,
+  SageMakerRuntimeClient,
+} from "@aws-sdk/client-sagemaker-runtime";
+const client = new SageMakerRuntimeClient({
+  region: "us-east-1",
+});
+client
+  .send(
+    new InvokeEndpointCommand({
+      EndpointName: "my-endpoint-id",
+      Body: JSON.stringify({
+        // Optional. You can change the role of the answer.
+        role: "ai",
+        // Required. The messages, in conversation format.
+        messages: [
+          {
+            role: "system",
+            content: `You are helpfull assistant.`,
+          },
+          {
+            role: "user",
+            content:
+              "please answer '1+1=?'. You must answer only answer numeric.",
+          },
+        ],
+      }),
+      ContentType: "application/json",
+      Accept: "application/json",
+    }),
+  )
+  .then((res) => {
+    // { generated_text: "2" }
+    console.log(JSON.parse(res.Body.transformToString()));
+  });
+```
+To use your own inference code, you can pass the code source.
+```ts
+import * as s3Deplyment from "aws-cdk-lib/aws-s3-deployment";
+declare const compile: NeuronxCompile;
+new TransformersNeuronxSageMakerRealtimeInferenceEndpoint(
+  this,
+  "RealtimeInference",
+  {
+    modelData:
+      TransformersNeuronxSageMakerInferenceModelData.fromNeuronxCompile(
+        compile,
+        s3Deplyment.Source.asset("path/to/my/code/directory"),
+      ),
+  },
+);
+```
+## Neuronx Compile
+> [!WARNING]
+> This construct uses an Inferentia2 instance on EC2. You may need to increase your request limit for your AWS account.
+This construct compiles models supported by Neuronx and uploads them to the specified S3 bucket. To define it:
 ```ts
-import { Vpc } from "aws-cdk-lib/aws-ec2";
-import { Bucket } from "aws-cdk-lib/aws-s3";
+import * as ec2 from "aws-cdk-lib/aws-ec2";
+import * as s3 from "aws-cdk-lib/aws-s3";
-declare const vpc: Vpc;
-declare const bucket: Bucket;
-const compile = new NeuronxCompile(stack, "NeuronxCompile", {
+declare const vpc: ec2.Vpc;
+declare const bucket: s3.Bucket;
+const compile = new NeuronxCompile(this, "NeuronxCompile", {
   vpc,
   bucket,
   model: Model.fromHuggingFace("example/example-7b-chat", {
@@ -32,7 +124,7 @@ const compile = new NeuronxCompile(stack, "NeuronxCompile", {
 });
 // Get the compiled artifacts from this S3 URL
-new CfnOutput(stack, "CompiledArtifact", {
+new CfnOutput(this, "CompiledArtifact", {
   value: compile.compiledArtifactS3Url,
 });
 ```
@@ -54,21 +146,23 @@ After compiled, you can see like the this file tree in the S3 bucket.
     └── xxx.neff
 ```
+This is the NeuronxCompile architecture.
+![NeuronxCompile architecture](./docs/neuronx-compile-architecture.png)
 ### Spot Instance
-:::note warn
-If you use Spot Instances, check if the request limit for Spot has been increased.
-:::
+> [!WARNING]
+> If you use Spot Instances, check if the request limit for Spot has been increased.
 You can also use Spot Instances.
 ```ts
-import { Vpc } from "aws-cdk-lib/aws-ec2";
-import { Bucket } from "aws-cdk-lib/aws-s3";
+import * as ec2 from "aws-cdk-lib/aws-ec2";
+import * as s3 from "aws-cdk-lib/aws-s3";
-declare const vpc: Vpc;
-declare const bucket: Bucket;
-new NeuronxCompile(stack, "NeuronxCompile", {
+declare const vpc: ec2.Vpc;
+declare const bucket: s3.Bucket;
+new NeuronxCompile(this, "NeuronxCompile", {
   vpc,
   bucket,
   model: Model.fromHuggingFace("example/example-7b-chat", {
@@ -83,12 +177,12 @@ new NeuronxCompile(stack, "NeuronxCompile", {
 If you are familiar with Neuronx, you can also specify compilation options to better meet your requirements.
 ```ts
-import { Vpc } from "aws-cdk-lib/aws-ec2";
-import { Bucket } from "aws-cdk-lib/aws-s3";
+import * as ec2 from "aws-cdk-lib/aws-ec2";
+import * as s3 from "aws-cdk-lib/aws-s3";
-declare const vpc: Vpc;
-declare const bucket: Bucket;
-new NeuronxCompile(stack, "NeuronxCompile", {
+declare const vpc: ec2.Vpc;
+declare const bucket: s3.Bucket;
+new NeuronxCompile(this, "NeuronxCompile", {
   vpc,
   bucket,
   model: Model.fromHuggingFace("example/example-22b-chat", {

package/docs/neuronx-compile-architecture.png CHANGED Viewed

Binary file

package/lib/.types-compat/ts3.9/index.d.ts CHANGED Viewed

@@ -1,2 +1,4 @@
+export * from "./model";
 export * from "./neuronx-compile";
 export * from "./neuronx-instance-type";
+export * from "./transformers-neuronx-sagemaker-realtime-inference";

package/lib/.types-compat/ts3.9/model.d.ts ADDED Viewed

@@ -0,0 +1,97 @@
+import { IBucket } from "aws-cdk-lib/aws-s3";
+/**
+ * Quant data type.
+ */
+export declare enum QuantDtype {
+    /**
+     * int8 weight storage.
+     */
+    S8 = "s8"
+}
+/**
+ * Optimization level.
+ */
+export declare enum OptLevel {
+    /**
+     * enables the core performance optimizations in the compiler, while also minimizing compile time.
+     */
+    MINIMIZING_COMPILE_TIME = 1,
+    /**
+     * provides the best balance between model performance and compile time.
+     */
+    BEST_BALANCE = 2,
+    /**
+     * may provide additional model execution performance but may incur longer compile times and higher host memory usage during model compilation.
+     */
+    MODEL_EXECUTION_PERFORMANCE = 3
+}
+/**
+ * Compile options.
+ */
+export interface CompileOptions {
+    /**
+     * @default - calc from parameters and quantDtype
+     */
+    readonly tpDegree?: number;
+    /**
+     * @default - No quant
+     */
+    readonly quantDtype?: QuantDtype;
+    /**
+     * @default 4096
+     */
+    readonly nPositions?: number;
+    /**
+     * @default OptLevel.BEST_BALANCE
+     */
+    readonly optLevel?: OptLevel;
+}
+/**
+ * Represents the amount of parameters.
+ */
+export declare class Parameters {
+    private readonly billion;
+    /**
+     * Create a Parameters representing an amount in billions.
+     * @param parameters number of parameters, in billions
+     * @returns parameters
+     */
+    static billion(parameters: number): Parameters;
+    private constructor();
+    /**
+     * Return this number of parameters in billions.
+     * @returns This number of parameters in billions.
+     */
+    toBilion(): number;
+}
+/**
+ * Basic information about the compile target model.
+ */
+export interface ModelOptions {
+    readonly parameters: Parameters;
+}
+/**
+ * Compile target model.
+ */
+export declare class Model {
+    readonly modelId: string;
+    readonly options: ModelOptions;
+    readonly bucket?: IBucket | undefined;
+    readonly prefix?: string | undefined;
+    /**
+     * Model information from HuggingFace.
+     * @param modelId model ID on HuggingFace
+     * @param options basic model information
+     * @returns model instance
+     */
+    static fromHuggingFace(modelId: string, options: ModelOptions): Model;
+    /**
+     * Model information from an S3 bucket.
+     * @param bucket S3 bucket in which the model is stored
+     * @param prefix prefix of the stored model objects
+     * @param options basic model information
+     * @returns model instance
+     */
+    static fromBucket(bucket: IBucket, prefix: string, options: ModelOptions): Model;
+    private constructor();
+}

package/lib/.types-compat/ts3.9/neuronx-compile.d.ts CHANGED Viewed

@@ -3,6 +3,7 @@ import * as ec2 from "aws-cdk-lib/aws-ec2";
 import { ContainerImage } from "aws-cdk-lib/aws-ecs";
 import { IBucket } from "aws-cdk-lib/aws-s3";
 import { Construct } from "constructs";
+import { CompileOptions, Model, OptLevel, Parameters, QuantDtype } from "./model";
 import { NeuronxInstanceType } from "./neuronx-instance-type";
 /**
  * Compile runtime.
@@ -17,92 +18,6 @@ export interface CompileRuntime {
      */
     readonly neuronxVersion: string;
 }
-/**
- * Quant data type.
- */
-export declare enum QuantDtype {
-    /**
-     * int8 weight storage.
-     */
-    S8 = "s8"
-}
-/**
- * Optimization level.
- */
-export declare enum OptLevel {
-    /**
-     * enables the core performance optimizations in the compiler, while also minimizing compile time.
-     */
-    MINIMIZING_COMPILE_TIME = 1,
-    /**
-     * provides the best balance between model performance and compile time.
-     */
-    BEST_BALANCE = 2,
-    /**
-     * may provide additional model execution performance but may incur longer compile times and higher host memory usage during model compilation.
-     */
-    MODEL_EXECUTION_PERFORMANCE = 3
-}
-/**
- * Compile options.
- */
-export interface CompileOptions {
-    /**
-     * @default - calc from parameters and quantDtype
-     */
-    readonly tpDegree?: number;
-    /**
-     * @default - No quant
-     */
-    readonly quantDtype?: QuantDtype;
-    /**
-     * @default 4092
-     */
-    readonly nPositions?: number;
-    /**
-     * @default OptLevel.BEST_BALANCE
-     */
-    readonly optLevel?: OptLevel;
-}
-/**
- * Represents the amount of parameters.
- */
-export declare class Parameters {
-    private readonly billion;
-    /**
-     * Create a Parameters representing an amount bilion.
-     * @param parameters number of parameters bilionX
-     * @returns parameters
-     */
-    static billion(parameters: number): Parameters;
-    private constructor();
-    /**
-     * Return this number of parameters as bilion.
-     * @returns This number of parameters as bilion.
-     */
-    toBilion(): number;
-}
-/**
- * Compile target model basic infromation
- */
-export interface ModelOptions {
-    readonly parameters: Parameters;
-}
-/**
- * Compile target model.
- */
-export declare class Model {
-    readonly modelId: string;
-    readonly options: ModelOptions;
-    /**
-     * model informations at HuggingFace
-     * @param modelId model id on the HuggingFace
-     * @param options model basic infromation
-     * @returns model instance
-     */
-    static fromHuggingFace(modelId: string, options: ModelOptions): Model;
-    private constructor();
-}
 /**
  * Props of NeuronxCompile.
  */
@@ -111,10 +26,6 @@ export interface NeuronxCompileProps {
      * VPC in which this will launch compile worker instance.
      */
     readonly vpc: ec2.IVpc;
-    /**
-     * The instance type of compile worker instance.
-     */
-    readonly instanceType?: NeuronxInstanceType;
     /**
      * The bucket to upload compiled artifacts.
      */
@@ -123,6 +34,10 @@ export interface NeuronxCompileProps {
      * The model to be compiled.
      */
     readonly model: Model;
+    /**
+     * The instance type of compile worker instance.
+     */
+    readonly instanceType?: NeuronxInstanceType;
     /**
      * The root volume of worker instance.
      * @default - N bilion parameters * 5GiB EBS
@@ -155,12 +70,20 @@ export interface NeuronxCompileProps {
  * Neuronx compile construct. Compile the model to work with Inferentia2 and Trainium1 and upload it to an S3 bucket.
  */
 export declare class NeuronxCompile extends Construct {
+    readonly compiledArtifactS3Bucket: IBucket;
     /**
      * S3 URL that compiled artifact uploaded.
      */
     readonly compiledArtifactS3Url: string;
+    /**
+     * S3 Prefix that compiled artifact uploaded.
+     */
+    readonly compiledArtifactS3Prefix: string;
+    readonly tpDegree: number;
+    readonly quantDtype?: QuantDtype;
+    readonly nPositions: number;
+    readonly optLevel: OptLevel;
+    readonly parameters: Parameters;
     constructor(scope: Construct, id: string, props: NeuronxCompileProps);
-    private connectAcceleratorChips;
-    private calcTpDegree;
     private selectInstanceTypeByTpDegree;
 }

package/lib/.types-compat/ts3.9/private/util.d.ts ADDED Viewed

@@ -0,0 +1,2 @@
+import { CompileOptions, Parameters } from "../model";
+export declare function calcTpDegree(parameters: Parameters, compileOptions: CompileOptions): number;

package/lib/.types-compat/ts3.9/transformers-neuronx-sagemaker-realtime-inference.d.ts ADDED Viewed

@@ -0,0 +1,113 @@
+import * as sagemaker from "@aws-cdk/aws-sagemaker-alpha";
+import { Duration, Size } from "aws-cdk-lib";
+import { Grant, IGrantable } from "aws-cdk-lib/aws-iam";
+import { IBucket } from "aws-cdk-lib/aws-s3";
+import { ISource } from "aws-cdk-lib/aws-s3-deployment";
+import { Construct } from "constructs";
+import { CompileOptions, OptLevel, Parameters, QuantDtype } from "./model";
+import { NeuronxCompile } from "./neuronx-compile";
+import { NeuronxInstanceType } from "./neuronx-instance-type";
+/**
+ * Precompiled model options.
+ */
+export interface CompiledModelOptions {
+    /**
+     * Neuronx compile options.
+     * @default - Each property is set to its default.
+     */
+    readonly compileOptions?: CompileOptions;
+    /**
+     * Code used for inference
+     * @default - using the predefined code
+     */
+    readonly code?: ISource;
+    /**
+     * Model ID or saved path
+     * @default "./model"
+     */
+    readonly modelIdOrPath?: string;
+    /**
+     * The path where compiled artifacts (i.e. xxx.neff) are stored
+     * @default "./compiled"
+     */
+    readonly compiledArtifactPath?: string;
+}
+export interface BucketCompiledModelOptions extends CompiledModelOptions {
+    /**
+     * The number of parameters of model.
+     */
+    readonly parameters: Parameters;
+}
+export declare class TransformersNeuronxSageMakerInferenceModelData {
+    static fromBucket(bucket: IBucket, prefix: string, options: BucketCompiledModelOptions): TransformersNeuronxSageMakerInferenceModelData;
+    static fromNeuronxCompile(compile: NeuronxCompile, code?: ISource): TransformersNeuronxSageMakerInferenceModelData;
+    readonly bucket: IBucket;
+    readonly compiledArtifactS3Prefix: string;
+    readonly code: ISource;
+    readonly tpDegree: number;
+    readonly quantDtype?: QuantDtype;
+    readonly nPositions: number;
+    readonly optLevel: OptLevel;
+    readonly modelIdOrPath?: string;
+    readonly compiledArtifactPath?: string;
+    readonly parameters: Parameters;
+    private constructor();
+}
+export interface TransformersNeuronxSageMakerRealtimeInferenceEndpointProps {
+    /**
+     * Model data for SageMaker inference.
+     * The model data requires at least compiled artifacts.
+     */
+    readonly modelData: TransformersNeuronxSageMakerInferenceModelData;
+    /**
+     * An image of the container where the inference job is executed.
+     */
+    readonly image?: sagemaker.ContainerImage;
+    /**
+     * A map of environment variables to pass into the container.
+     * @default - Only the predefined environment variables required to use Neuronx have been set.
+     */
+    readonly environment?: {
+        [key: string]: string;
+    };
+    /**
+     * The instance type of compile worker instance.
+     * @default - It is determined automatically according to the number of model parameters and compilation options.
+     */
+    readonly instanceType?: NeuronxInstanceType;
+    /**
+     * The size, of the ML storage volume attached to individual inference instance associated with the production variant.
+     * Currently only Amazon EBS gp2 storage volumes are supported.
+     * @see https://aws.amazon.com/jp/releasenotes/host-instance-storage-volumes-table
+     * @default - 2.5 GB per billion parameter (Max 512 GB)
+     */
+    readonly volumeSize?: Size;
+    /**
+     * The timeout value, to download and extract the model that you want to host from Amazon S3
+     * to the individual inference instance associated with this production variant.
+     * @default - 60 seconds, when `volumeSize` larger than 30GB then 1GB x 15 seconds (max 60 minutes)
+     */
+    readonly modelDataDownloadTimeout?: Duration;
+    /**
+     * The timeout value, for your inference container to pass health check by SageMaker Hosting.
+     * @see https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-inference-code.html#your-algorithms-inference-algo-ping-requests
+     * @default - 60 seconds, when set the `modelDataDownloadTimeout` then use same value (max 60 minutes)
+     */
+    readonly containerStartupHealthCheckTimeout?: Duration;
+}
+export declare class TransformersNeuronxSageMakerRealtimeInferenceEndpoint extends Construct {
+    /**
+     * The ARN of the endpoint.
+     * @attribute
+     */
+    readonly endpointArn: string;
+    /**
+     * The name of the endpoint.
+     * @attribute
+     */
+    readonly endpointName: string;
+    private readonly endpoint;
+    constructor(scope: Construct, id: string, props: TransformersNeuronxSageMakerRealtimeInferenceEndpointProps);
+    grantInvoke(grantee: IGrantable): Grant;
+    private selectInstanceTypeByTpDegree;
+}

package/lib/index.d.ts CHANGED Viewed

@@ -1,2 +1,4 @@
+export * from "./model";
 export * from "./neuronx-compile";
 export * from "./neuronx-instance-type";
+export * from "./transformers-neuronx-sagemaker-realtime-inference";

package/lib/index.js CHANGED Viewed

@@ -14,6 +14,8 @@ var __exportStar = (this && this.__exportStar) || function(m, exports) {
     for (var p in m) if (p !== "default" && !Object.prototype.hasOwnProperty.call(exports, p)) __createBinding(exports, m, p);
 };
 Object.defineProperty(exports, "__esModule", { value: true });
+__exportStar(require("./model"), exports);
 __exportStar(require("./neuronx-compile"), exports);
 __exportStar(require("./neuronx-instance-type"), exports);
-//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6Ijs7Ozs7Ozs7Ozs7Ozs7OztBQUFBLG9EQUFrQztBQUNsQywwREFBd0MiLCJzb3VyY2VzQ29udGVudCI6WyJleHBvcnQgKiBmcm9tIFwiLi9uZXVyb254LWNvbXBpbGVcIjtcbmV4cG9ydCAqIGZyb20gXCIuL25ldXJvbngtaW5zdGFuY2UtdHlwZVwiO1xuIl19
+__exportStar(require("./transformers-neuronx-sagemaker-realtime-inference"), exports);
+//# sourceMappingURL=data:application/json;base64,eyJ2ZXJzaW9uIjozLCJmaWxlIjoiaW5kZXguanMiLCJzb3VyY2VSb290IjoiIiwic291cmNlcyI6WyIuLi9zcmMvaW5kZXgudHMiXSwibmFtZXMiOltdLCJtYXBwaW5ncyI6Ijs7Ozs7Ozs7Ozs7Ozs7OztBQUFBLDBDQUF3QjtBQUN4QixvREFBa0M7QUFDbEMsMERBQXdDO0FBQ3hDLHNGQUFvRSIsInNvdXJjZXNDb250ZW50IjpbImV4cG9ydCAqIGZyb20gXCIuL21vZGVsXCI7XG5leHBvcnQgKiBmcm9tIFwiLi9uZXVyb254LWNvbXBpbGVcIjtcbmV4cG9ydCAqIGZyb20gXCIuL25ldXJvbngtaW5zdGFuY2UtdHlwZVwiO1xuZXhwb3J0ICogZnJvbSBcIi4vdHJhbnNmb3JtZXJzLW5ldXJvbngtc2FnZW1ha2VyLXJlYWx0aW1lLWluZmVyZW5jZVwiO1xuIl19

package/lib/model.d.ts ADDED Viewed

@@ -0,0 +1,97 @@
+import { IBucket } from "aws-cdk-lib/aws-s3";
+/**
+ * Quant data type.
+ */
+export declare enum QuantDtype {
+    /**
+     * int8 weight storage.
+     */
+    S8 = "s8"
+}
+/**
+ * Optimization level.
+ */
+export declare enum OptLevel {
+    /**
+     * enables the core performance optimizations in the compiler, while also minimizing compile time.
+     */
+    MINIMIZING_COMPILE_TIME = 1,
+    /**
+     * provides the best balance between model performance and compile time.
+     */
+    BEST_BALANCE = 2,
+    /**
+     * may provide additional model execution performance but may incur longer compile times and higher host memory usage during model compilation.
+     */
+    MODEL_EXECUTION_PERFORMANCE = 3
+}
+/**
+ * Compile options.
+ */
+export interface CompileOptions {
+    /**
+     * @default - calc from parameters and quantDtype
+     */
+    readonly tpDegree?: number;
+    /**
+     * @default - No quant
+     */
+    readonly quantDtype?: QuantDtype;
+    /**
+     * @default 4096
+     */
+    readonly nPositions?: number;
+    /**
+     * @default OptLevel.BEST_BALANCE
+     */
+    readonly optLevel?: OptLevel;
+}
+/**
+ * Represents the amount of parameters.
+ */
+export declare class Parameters {
+    private readonly billion;
+    /**
+     * Create a Parameters representing an amount in billions.
+     * @param parameters number of parameters, in billions
+     * @returns parameters
+     */
+    static billion(parameters: number): Parameters;
+    private constructor();
+    /**
+     * Return this number of parameters in billions.
+     * @returns This number of parameters in billions.
+     */
+    toBilion(): number;
+}
+/**
+ * Basic information about the compile target model.
+ */
+export interface ModelOptions {
+    readonly parameters: Parameters;
+}
+/**
+ * Compile target model.
+ */
+export declare class Model {
+    readonly modelId: string;
+    readonly options: ModelOptions;
+    readonly bucket?: IBucket | undefined;
+    readonly prefix?: string | undefined;
+    /**
+     * Model information from HuggingFace.
+     * @param modelId model ID on HuggingFace
+     * @param options basic model information
+     * @returns model instance
+     */
+    static fromHuggingFace(modelId: string, options: ModelOptions): Model;
+    /**
+     * Model information from an S3 bucket.
+     * @param bucket S3 bucket in which the model is stored
+     * @param prefix prefix of the stored model objects
+     * @param options basic model information
+     * @returns model instance
+     */
+    static fromBucket(bucket: IBucket, prefix: string, options: ModelOptions): Model;
+    private constructor();
+}