npm - @stellarapp/tfjs-stellar - Versions diffs - 1.0.0 - Mend

@stellarapp/tfjs-stellar 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/jest.config.ts +203 -0
package/package.json +24 -0
package/src/index.ts +93 -0
package/src/kv_cache.ts +205 -0
package/src/layers/cached_rope_multihead_attention.test.ts +59 -0
package/src/layers/cached_rope_multihead_attention.ts +113 -0
package/src/layers/gpt_decoder_block.ts +77 -0
package/src/layers/multihead_attention.test.ts +212 -0
package/src/layers/multihead_attention.ts +371 -0
package/src/layers/positional_encoding.test.ts +113 -0
package/src/layers/positional_encoding.ts +158 -0
package/src/layers/rotary_position_embedding.test.ts +107 -0
package/src/layers/rotary_position_embedding.ts +163 -0
package/src/layers/token_and_positional_embedding.test.ts +81 -0
package/src/layers/token_and_positional_embedding.ts +149 -0
package/src/layers/transformer_decoder.test.ts +100 -0
package/src/layers/transformer_decoder.ts +236 -0
package/src/layers/transformer_encoder.test.ts +85 -0
package/src/layers/transformer_encoder.ts +224 -0
package/src/losses/dice.ts +156 -0
package/src/losses/index.ts +1 -0
package/src/metrics.ts +32 -0
package/src/models/gpt_model.ts +232 -0
package/src/models/index.ts +2 -0
package/src/models/llm_model.ts +355 -0
package/src/models/u_net.ts +240 -0
package/src/packing_mask.ts +28 -0
package/src/testing.ts +1 -0
package/src/tfjs_types.ts +15 -0
package/src/utils.test.ts +101 -0
package/src/utils.ts +86 -0
package/tsconfig.json +49 -0

package/src/layers/rotary_position_embedding.test.ts ADDED Viewed

@@ -0,0 +1,107 @@
+import { RotaryPositionEmbedding } from "@/layers/rotary_position_embedding";
+import * as tf from "@tensorflow/tfjs";
+// disables warning for using the faster node backend,
+// https://github.com/tensorflow/tfjs/issues/5349#issuecomment-885170504
+tf.env().set('IS_NODE', false);
+describe("RotaryPositionEmbedding tests", () => {
+    /**
+     * Largest element-wise absolute deviation between two tensors.
+     * A signed sum of the differences is not a usable error measure: positive and
+     * negative errors cancel out and any negative total passes a "less than" check.
+     */
+    const maxAbsDiff = async (actual: tf.Tensor, expected: tf.Tensor): Promise<number> =>
+        await actual.sub(expected).abs().max().array() as number;
+    test("create cache", async () => {
+        const rope = new RotaryPositionEmbedding({ dim: 8, maxSequenceLength: 15 });
+        rope.build([]);
+        // one row per position (0..14), each angle repeated for both coordinates of a pair
+        const expected_cosine_cache = tf.tensor([[[
+            [1, 1, 1, 1, 1, 1, 1, 1],
+            [0.5403022766113281, 0.5403022766113281, 0.9950041770935059, 0.9950041770935059, 0.9999499917030334, 0.9999499917030334, 0.9999995231628418, 0.9999995231628418],
+            [-0.416146844625473, -0.416146844625473, 0.9800665974617004, 0.9800665974617004, 0.9998000264167786, 0.9998000264167786, 0.9999979734420776, 0.9999979734420776],
+            [-0.9899924993515015, -0.9899924993515015, 0.9553365111351013, 0.9553365111351013, 0.9995500445365906, 0.9995500445365906, 0.9999955296516418, 0.9999955296516418],
+            [-0.6536436080932617, -0.6536436080932617, 0.9210609793663025, 0.9210609793663025, 0.9992001056671143, 0.9992001056671143, 0.9999920129776001, 0.9999920129776001],
+            [0.28366219997406006, 0.28366219997406006, 0.8775825500488281, 0.8775825500488281, 0.9987502694129944, 0.9987502694129944, 0.9999874830245972, 0.9999874830245972],
+            [0.9601702690124512, 0.9601702690124512, 0.8253356218338013, 0.8253356218338013, 0.998200535774231, 0.998200535774231, 0.9999819993972778, 0.9999819993972778],
+            [0.7539022564888, 0.7539022564888, 0.7648422122001648, 0.7648422122001648, 0.9975510239601135, 0.9975510239601135, 0.9999755024909973, 0.9999755024909973],
+            [-0.1455000340938568, -0.1455000340938568, 0.6967067122459412, 0.6967067122459412, 0.9968017339706421, 0.9968017339706421, 0.9999679923057556, 0.9999679923057556],
+            [-0.9111302495002747, -0.9111302495002747, 0.6216099262237549, 0.6216099262237549, 0.9959527254104614, 0.9959527254104614, 0.9999595284461975, 0.9999595284461975],
+            [-0.83907151222229, -0.83907151222229, 0.5403022766113281, 0.5403022766113281, 0.9950041770935059, 0.9950041770935059, 0.9999499917030334, 0.9999499917030334],
+            [0.004425697959959507, 0.004425697959959507, 0.4535960853099823, 0.4535960853099823, 0.9939560890197754, 0.9939560890197754, 0.999939501285553, 0.999939501285553],
+            [0.8438539505004883, 0.8438539505004883, 0.3623577058315277, 0.3623577058315277, 0.9928086400032043, 0.9928086400032043, 0.9999279975891113, 0.9999279975891113],
+            [0.9074468016624451, 0.9074468016624451, 0.26749876141548157, 0.26749876141548157, 0.9915618896484375, 0.9915618896484375, 0.9999154806137085, 0.9999154806137085],
+            [0.13673721253871918, 0.13673721253871918, 0.1699671596288681, 0.1699671596288681, 0.9902160167694092, 0.9902160167694092, 0.9999020099639893, 0.9999020099639893]
+        ]]]);
+        const expected_sine_cache = tf.tensor([[[
+            [0, 0, 0, 0, 0, 0, 0, 0],
+            [0.8414709568023682, 0.8414709568023682, 0.0998334214091301, 0.0998334214091301, 0.009999833069741726, 0.009999833069741726, 0.0009999999310821295, 0.0009999999310821295],
+            [0.9092974066734314, 0.9092974066734314, 0.19866932928562164, 0.19866932928562164, 0.019998665899038315, 0.019998665899038315, 0.0019999986980110407, 0.0019999986980110407],
+            [0.14112000167369843, 0.14112000167369843, 0.29552021622657776, 0.29552021622657776, 0.029995499178767204, 0.029995499178767204, 0.0029999956022948027, 0.0029999956022948027],
+            [-0.756802499294281, -0.756802499294281, 0.3894183337688446, 0.3894183337688446, 0.03998933359980583, 0.03998933359980583, 0.003999989479780197, 0.003999989479780197],
+            [-0.9589242935180664, -0.9589242935180664, 0.4794255495071411, 0.4794255495071411, 0.04997916519641876, 0.04997916519641876, 0.0049999793991446495, 0.0049999793991446495],
+            [-0.279415488243103, -0.279415488243103, 0.5646424889564514, 0.5646424889564514, 0.059964004904031754, 0.059964004904031754, 0.0059999641962349415, 0.0059999641962349415],
+            [0.6569865942001343, 0.6569865942001343, 0.6442176699638367, 0.6442176699638367, 0.06994284689426422, 0.06994284689426422, 0.0069999429397284985, 0.0069999429397284985],
+            [0.9893582463264465, 0.9893582463264465, 0.7173560857772827, 0.7173560857772827, 0.07991468906402588, 0.07991468906402588, 0.007999914698302746, 0.007999914698302746],
+            [0.41211849451065063, 0.41211849451065063, 0.7833269238471985, 0.7833269238471985, 0.08987854421138763, 0.08987854421138763, 0.008999879471957684, 0.008999879471957684],
+            [-0.5440211296081543, -0.5440211296081543, 0.8414709568023682, 0.8414709568023682, 0.0998334139585495, 0.0998334139585495, 0.0099998340010643, 0.0099998340010643],
+            [-0.9999902248382568, -0.9999902248382568, 0.8912073969841003, 0.8912073969841003, 0.10977829992771149, 0.10977829992771149, 0.010999779216945171, 0.010999779216945171],
+            [-0.5365729331970215, -0.5365729331970215, 0.9320390820503235, 0.9320390820503235, 0.11971220374107361, 0.11971220374107361, 0.011999712325632572, 0.011999712325632572],
+            [0.4201670289039612, 0.4201670289039612, 0.9635581970214844, 0.9635581970214844, 0.12963414192199707, 0.12963414192199707, 0.012999634258449078, 0.012999634258449078],
+            [0.9906073808670044, 0.9906073808670044, 0.9854497313499451, 0.9854497313499451, 0.13954311609268188, 0.13954311609268188, 0.013999543152749538, 0.013999543152749538]
+        ]]]);
+        const [cosine_cache, sine_cache] = rope.getWeights();
+        expect(await maxAbsDiff(cosine_cache!, expected_cosine_cache)).toBeLessThanOrEqual(1e-6);
+        expect(await maxAbsDiff(sine_cache!, expected_sine_cache)).toBeLessThanOrEqual(1e-6);
+    })
+    test("rotate inputs", async () => {
+        const rope = new RotaryPositionEmbedding({ dim: 8, maxSequenceLength: 15 });
+        const x = tf.tensor([[[
+            [0.0766048, 0.5706575, 0.6705932, 0.5273118, 0.4794086, 0.9378104, 0.9888024, 0.6926053],
+            [0.9064133, 0.5875182, 0.1681865, 0.3833345, 0.9901192, 0.4677338, 0.3353315, 0.02699],
+            [0.3033573, 0.4139377, 0.4062586, 0.9705839, 0.3582608, 0.328775, 0.1340587, 0.2193414],
+            [0.5565202, 0.4334963, 0.9912352, 0.3388563, 0.7991487, 0.1911893, 0.1140554, 0.6949552]]]
+        ]); // batch=1, heads=1, seq=4, headDim=8 (the sequence length is read from axis 2)
+        // row 0 is position 0 (cos = 1, sin = 0), so it must come back unchanged
+        const expected_output = tf.tensor([[[
+            [0.07660479843616486, 0.57065749168396, 0.6705932021141052, 0.5273118019104004, 0.4794085919857025, 0.9378104209899902, 0.9888023734092712, 0.6926053166389465],
+            [-0.004642367362976074, 1.08015775680542, 0.12907665967941284, 0.39820998907089233, 0.9853923320770264, 0.47761136293411255, 0.33530429005622864, 0.027325313538312912],
+            [-0.5026336908340454, 0.10358311235904694, 0.20533521473407745, 1.0319478511810303, 0.3516140580177307, 0.33587393164634705, 0.1336197406053543, 0.21960905194282532],
+            [-0.6121258735656738, -0.3506217896938324, 0.8468242287635803, 0.6166517734527588, 0.7930541634559631, 0.2150741070508957, 0.11197001487016678, 0.695294201374054]
+        ]]]);
+        const output = rope.apply(x) as tf.Tensor;
+        expect(await maxAbsDiff(output, expected_output)).toBeLessThan(1e-6);
+        expect(rope.computeOutputShape(x.shape)).toEqual(x.shape);
+        expect(rope.computeOutputShape([x.shape])).toEqual(x.shape);
+    })
+    test("expand cache when input sequences are larger than rope's max sequence length", async () => {
+        const dim = 8;
+        const rope = new RotaryPositionEmbedding({ dim, maxSequenceLength: 15, theta: 1_000_000 });
+        const larger_sequence = 20;
+        const even_larger_sequence = 50;
+        // 20 positions do not fit in 15 -> the cache grows to the next power of two (32)
+        rope.apply(tf.randomUniform([1, 1, larger_sequence, dim]));
+        rope.getWeights().forEach(weight => {
+            expect(weight.shape).toEqual([1, 1, 32, dim]);
+        });
+        // 50 positions do not fit in 32 -> the cache grows again (64)
+        rope.apply([tf.randomUniform([1, 1, even_larger_sequence, dim])]);
+        rope.getWeights().forEach(weight => {
+            expect(weight.shape).toEqual([1, 1, 64, dim]);
+        });
+    })
+    test("create layer", async () => {
+        // dim must be even
+        expect(() => new RotaryPositionEmbedding({ dim: 7, maxSequenceLength: 15 })).toThrow();
+        expect(() => new RotaryPositionEmbedding({ dim: 8, maxSequenceLength: 25 })).not.toThrow();
+    })
+});

package/src/layers/rotary_position_embedding.ts ADDED Viewed

@@ -0,0 +1,163 @@
+import * as tf from "@tensorflow/tfjs";
+import { type LayerArgs } from "@tensorflow/tfjs-layers/dist/engine/topology";
+/**
+ * Rotates `x` with the pre-computed rotary position tables.
+ *
+ * The sequence length is taken from axis 2 of `x`; both caches are cut down to
+ * `[1, 1, sequence, dim]` before being multiplied element-wise with the input.
+ *
+ * @param x the tensor to rotate
+ * @param dim size of the last axis covered by the caches
+ * @param cosine_cache cosine table, sliced from its origin
+ * @param sine_cache sine table, sliced from its origin
+ * @returns `x * cos + rotateHalf(x) * sin`
+ */
+export function applyRope(x: tf.Tensor, dim: number, cosine_cache: tf.Tensor, sine_cache: tf.Tensor) {
+    return tf.tidy(() => {
+        const positions = x.shape[2]!;
+        // only the first `positions` rows of the tables are needed
+        const begin = [0, 0, 0, 0];
+        const size = [1, 1, positions, dim];
+        const cos = cosine_cache.slice(begin, size);
+        const sin = sine_cache.slice(begin, size);
+        // rotateHalf turns every coordinate pair (a, b) into (-b, a)
+        const scaled = tf.mul(x, cos);
+        const turned = tf.mul(rotateHalf(x, dim), sin);
+        return tf.add(scaled, turned);
+    });
+}
+/**
+ * Maps every adjacent coordinate pair `(a, b)` of the last axis to `(-b, a)`.
+ *
+ * @param x the tensor to rotate, its element count must be a multiple of `dim`
+ * @param dim the number of coordinates that are grouped into `dim / 2` pairs
+ * @returns a tensor with the same shape as `x`
+ */
+export function rotateHalf(x: tf.Tensor, dim: number): tf.Tensor {
+    return tf.tidy(() => {
+        // view the data as [rows, pairs, 2]; all leading axes are collapsed into
+        // `rows` because TFJS has issues during backpropagation with 5D slicing
+        const pairs = x.reshape([-1, dim / 2, 2]);
+        // first and second member of every pair, each of shape [rows, pairs, 1]
+        const first = tf.slice(pairs, [0, 0, 0], [-1, -1, 1]);
+        const second = tf.slice(pairs, [0, 0, 1], [-1, -1, 1]);
+        // (a, b) -> (-b, a), then restore the caller's layout
+        const swapped = tf.concat([second.neg(), first], -1);
+        return swapped.reshape(x.shape);
+    });
+}
+/**
+ * Pre-computes the cosine and sine tables used by `applyRope`.
+ *
+ * @param dim number of coordinates per position (expected to be even)
+ * @param max_sequence_length number of positions to tabulate, starting at 0
+ * @param theta base of the geometric progression of the frequencies
+ * @returns `[cosine, sine]`, both of shape `[1, 1, max_sequence_length, dim]`;
+ * both tensors are marked with `tf.keep`
+ */
+export function createRoPECache(dim: number, max_sequence_length: number, theta: number = 10_000) {
+    return tf.tidy(() => {
+        // exponents 0/dim, 2/dim, ... give one inverse frequency per coordinate pair,
+        // shape [floor(dim / 2)]
+        const exponents = tf.range(0, Math.floor(dim / 2) * 2, 2, "float32").div(dim);
+        const inv_frequencies = tf.div<tf.Tensor1D>(1, tf.pow(theta, exponents));
+        // positions 0 .. max_sequence_length - 1, shape [max_sequence_length]
+        const positions = tf.range(0, max_sequence_length);
+        // rotation angle of every (position, pair), shape [max_sequence_length, dim / 2]
+        const angles = tf.outerProduct(positions, inv_frequencies);
+        // repeat each angle for both coordinates of its pair,
+        // shape [max_sequence_length, dim]
+        const paired_angles = tf.stack([angles, angles], -1)
+            .reshape([max_sequence_length, dim]);
+        // two leading singleton axes are added to both tables
+        const cosine = tf.keep(tf.cos(paired_angles).expandDims(0).expandDims(0));
+        const sine = tf.keep(tf.sin(paired_angles).expandDims(0).expandDims(0));
+        return [cosine, sine];
+    });
+}
+/**
+ * Constructor arguments of `RotaryPositionEmbedding`, on top of the generic layer arguments.
+ */
+export interface RotaryPositionEmbeddingArgs extends LayerArgs {
+    /**
+     * The dimension of each head (rounded down), e.g. `Math.floor(embedDim / numHeads)`.
+     * Must be even, the layer's constructor throws otherwise.
+     */
+    dim: number,
+    /**
+     * The RoPE cache will be pre-calculated up to the max sequence length, and recalculated as needed. Defaults to `4096`.
+     */
+    maxSequenceLength?: number,
+    /**
+     * The base for the geometric progression used to compute the rotation angles. Defaults to `10_000`.
+     */
+    theta?: number,
+}
+/**
+ * Implements RoPE from the RoFormer: Enhanced Transformer with Rotary Position Embedding paper
+ * Inspired by: https://meta-pytorch.org/torchtune/stable/_modules/torchtune/modules/position_embeddings.html#RotaryPositionalEmbeddings
+ *
+ * The layer keeps a cosine and a sine table as non-trainable weights. The tables
+ * are (re)computed by `build()` and grow automatically when an input is longer
+ * than the current maximum sequence length. The sequence length is read from
+ * axis 2 of the input.
+ */
+export class RotaryPositionEmbedding extends tf.layers.Layer {
+    static className = "RotaryPositionEmbedding";
+    protected dim: number;
+    protected max_sequence_length: number;
+    protected theta: number;
+    // cached sine and cosine frequencies, untrainable weights
+    protected cosine_cache: tf.LayerVariable;
+    protected sine_cache: tf.LayerVariable;
+    /**
+     * @throws if `dim` is not even
+     */
+    constructor({ dim, maxSequenceLength = 4096, theta = 10_000, ...args }: RotaryPositionEmbeddingArgs) {
+        super(args);
+        if (dim % 2 !== 0) {
+            throw Error(`${this.getClassName()}::constructor ${this.name} expected dim to be even, got ${dim}`);
+        }
+        this.dim = dim;
+        this.max_sequence_length = maxSequenceLength;
+        this.theta = theta;
+        // zero-filled placeholders, build() replaces them with the real tables;
+        // each weight is registered under the name of the field that holds it
+        this.cosine_cache = this.addWeight("cosine_cache",
+            [1, 1, maxSequenceLength, Math.floor(this.dim)],
+            "float32", tf.initializers.zeros(), undefined, false);
+        this.sine_cache = this.addWeight("sine_cache",
+            [1, 1, maxSequenceLength, Math.floor(this.dim)],
+            "float32", tf.initializers.zeros(), undefined, false);
+    }
+    /**
+     * Forward propagation: rotates the (first) input tensor. Only the first
+     * tensor is used when an array is given.
+     */
+    override call(inputs: tf.Tensor | tf.Tensor[], kwargs: any): tf.Tensor | tf.Tensor[] {
+        const x = Array.isArray(inputs) ? inputs[0] : inputs;
+        const seq_length = x.shape[2];
+        if (seq_length > this.max_sequence_length) {
+            // expand cache to the nearest power of 2
+            this.max_sequence_length = Math.pow(2, Math.ceil(Math.log2(seq_length)));
+            this.build([]);
+        }
+        return applyRope(
+            x,
+            this.dim,
+            this.cosine_cache.read(),
+            this.sine_cache.read())
+    }
+    /**
+     * Computes the tables for the current maximum sequence length and installs
+     * them as the layer's only non-trainable weights. `input_shape` is not used.
+     */
+    override build(input_shape: tf.Shape | tf.Shape[]) {
+        const [cosine, sine] = createRoPECache(
+            this.dim, this.max_sequence_length, this.theta);
+        // one variable per table, shared by the fields and the weight list so that
+        // call() and getWeights() can never observe different values
+        const cosine_cache = new tf.LayerVariable(cosine, "float32", "cosine_cache", false);
+        const sine_cache = new tf.LayerVariable(sine, "float32", "sine_cache", false);
+        // the variables hold their own reference to the data, release the temporaries
+        cosine.dispose();
+        sine.dispose();
+        // release the tables that are being replaced
+        this.cosine_cache.dispose();
+        this.sine_cache.dispose();
+        this.cosine_cache = cosine_cache;
+        this.sine_cache = sine_cache;
+        this.nonTrainableWeights = [cosine_cache, sine_cache];
+    }
+    /**
+     * Output shape: [batch, head, sequence, head_dim]
+     */
+    public computeOutputShape(input_shape: tf.Shape | tf.Shape[]) {
+        return Array.isArray(input_shape[0])
+            ? input_shape[0] as tf.Shape
+            : input_shape as tf.Shape;
+    }
+    /**
+     * Serializes the constructor arguments so the layer can be re-created from
+     * its config. `maxSequenceLength` is the current value, i.e. it includes any
+     * growth of the cache.
+     */
+    override getConfig(): tf.serialization.ConfigDict {
+        return {
+            ...super.getConfig(),
+            dim: this.dim,
+            maxSequenceLength: this.max_sequence_length,
+            theta: this.theta,
+        };
+    }
+}
+tf.serialization.registerClass(RotaryPositionEmbedding);

package/src/layers/token_and_positional_embedding.test.ts ADDED Viewed

@@ -0,0 +1,81 @@
+import * as tf from '@tensorflow/tfjs';
+import { TokenAndPositionalEmbedding } from '@/layers/token_and_positional_embedding';
+// disables warning for using the faster node backend,
+// https://github.com/tensorflow/tfjs/issues/5349#issuecomment-885170504
+tf.env().set('IS_NODE', false);
+describe("TokenAndPositionalEmbedding tests", () => {
+    test("layer initialization", () => {
+        // every size argument must be positive
+        expect(() => new TokenAndPositionalEmbedding({ maxSequenceLength: 0, embedDim: 10, vocabularySize: 10_000 })).toThrow();
+        expect(() => new TokenAndPositionalEmbedding({ embedDim: 0, vocabularySize: 10_000 })).toThrow();
+        expect(() => new TokenAndPositionalEmbedding({ embedDim: 10, vocabularySize: 0 })).toThrow();
+        expect(() => new TokenAndPositionalEmbedding({ embedDim: 10, vocabularySize: 10_000 })).not.toThrow();
+    })
+    test("successful forward calls", () => {
+        const embed_dims = 32;
+        const sequences = 4;
+        const vocab_size = 10_000;
+        const input = tf.randomUniform([2, sequences]);
+        const embedding = new TokenAndPositionalEmbedding({ embedDim: embed_dims, dropout: 0.1, vocabularySize: vocab_size });
+        expect(() => embedding.apply(input)).not.toThrow();
+        expect(() => embedding.apply([input])).not.toThrow();
+    })
+    test("layer build", () => {
+        const input_ok = tf.randomUniform([2, 4]);
+        const input_too_many_words = tf.randomUniform([2, 700]);
+        const input_is_image = tf.randomUniform([1, 32, 32, 3]);
+        let embedding = new TokenAndPositionalEmbedding({ embedDim: 32, maxSequenceLength: 500, vocabularySize: 1_000 });
+        expect(() => embedding.build(input_ok.shape)).not.toThrow();
+        embedding = new TokenAndPositionalEmbedding({ embedDim: 32, maxSequenceLength: 500, vocabularySize: 1_000 });
+        expect(() => embedding.build([input_ok.shape, input_ok.shape])).not.toThrow();
+        // the rejected shapes are checked against a fresh, not yet built layer
+        embedding = new TokenAndPositionalEmbedding({ embedDim: 32, maxSequenceLength: 500, vocabularySize: 1_000 });
+        expect(() => embedding.build(input_too_many_words.shape)).toThrow();
+        expect(() => embedding.build(input_is_image.shape)).toThrow();
+    })
+    it("should throw when more than one input provided, input sequences are too large, or incorrect input rank", () => {
+        const sequences_too_long = tf.randomUniform([10, 1000]);
+        const multiple_correct_inputs = [tf.randomUniform([2, 3]), tf.randomUniform([2, 3])];
+        const wrong_rank = tf.randomUniform([10, 32, 32]);
+        const positional = new TokenAndPositionalEmbedding({ maxSequenceLength: 10, embedDim: 32, vocabularySize: 10_000 });
+        positional.build([2, 3]); // get past the initial build call to test forward prop
+        expect(() => positional.apply(sequences_too_long)).toThrow();
+        expect(() => positional.apply(multiple_correct_inputs)).toThrow();
+        expect(() => positional.apply(wrong_rank)).toThrow();
+    })
+    it("should return a non-empty config dict", () => {
+        const embedding = new TokenAndPositionalEmbedding({ embedDim: 32, vocabularySize: 10_000 });
+        // compare the number of keys; an array is never `toBe(0)`, so comparing
+        // the array itself can not fail
+        expect(Object.keys(embedding.getConfig()).length).not.toBe(0);
+    })
+    it("should return an output shape of [batch, sequences, embed dims]", () => {
+        const words = 100;
+        const batch = 2;
+        const embed_dims = 64;
+        const input = tf.randomUniform([batch, words]);
+        const embedding = new TokenAndPositionalEmbedding({ embedDim: embed_dims, vocabularySize: 10_000 });
+        expect(embedding.computeOutputShape(input.shape)).toEqual([batch, words, embed_dims]);
+    })
+});

package/src/layers/token_and_positional_embedding.ts ADDED Viewed

@@ -0,0 +1,149 @@
+import * as tf from '@tensorflow/tfjs';
+import { type LayerArgs } from '@tensorflow/tfjs-layers/dist/engine/topology';
+import { type Kwargs } from '@tensorflow/tfjs-layers/dist/types';
+import { PositionalEncoding, type PositionalEncodingArgs } from '@/layers/positional_encoding';
+/**
+ * Constructor arguments of `TokenAndPositionalEmbedding`: the generic layer
+ * arguments and the positional encoding arguments, plus the fields below.
+ */
+export interface TokenAndPositionalEmbeddingArgs extends LayerArgs, PositionalEncodingArgs {
+    /** the number of tokens to embed */
+    vocabularySize: number;
+    /**
+     * dropout rate applied to the positionally encoded embeddings, defaults to `0.1`;
+     * the constructor rejects values of 1 or more
+     */
+    dropout?: number
+}
+/**
+ * This class combines sinusoidal positional encoding from the
+ * 2017 paper "Attention Is All You Need" with a normal embedding layer to
+ * form a simplified single embedding layer.
+ *
+ * This layer accepts tokenized inputs of the shape `[ batch, tokens ]` and runs
+ * it through an embedding layer before adding sinusoidal positional encoding.
+ *
+ * @param embedDim the size of each token/word's embedding
+ * @param vocabularySize the number of tokens to embed
+ * @param maxSequenceLength the max number of tokens (words) per input (sentence), default `5120`
+ * @param dropout applies dropout to the positionally encoded embeddings, default `0.1`
+ */
+export class TokenAndPositionalEmbedding extends tf.layers.Layer {
+    static className = "TokenAndPositionalEmbedding";
+    private readonly embedDim: number;
+    private readonly vocabularySize: number;
+    private embedding: tf.layers.Layer;
+    private positional: tf.layers.Layer
+    private readonly maxSequenceLength: number;
+    private readonly dropout: number;
+    private dropoutLayer: tf.layers.Layer;
+    /**
+     * @throws if `dropout` is outside of [0, 1)
+     */
+    constructor({ embedDim, vocabularySize, maxSequenceLength, dropout, ...args }: TokenAndPositionalEmbeddingArgs) {
+        super(args);
+        this.embedDim = embedDim;
+        this.vocabularySize = vocabularySize;
+        this.maxSequenceLength = maxSequenceLength ?? 5120;
+        this.dropout = dropout ?? 0.1;
+        // written as a negated range check so that negative rates and NaN are rejected too
+        if (!(this.dropout >= 0 && this.dropout < 1)) {
+            throw Error(`${this.getClassName()}::constructor dropout must be within [0, 1)`);
+        }
+        this.embedding = tf.layers.embedding({
+            inputDim: this.vocabularySize,
+            outputDim: this.embedDim,
+        });
+        this.positional = new PositionalEncoding({
+            maxSequenceLength: this.maxSequenceLength,
+            embedDim: this.embedDim,
+        });
+        this.dropoutLayer = tf.layers.dropout({ rate: this.dropout });
+    }
+    /**
+     * Forward propagation: embedding -> positional encoding -> dropout.
+     *
+     * @throws if an array with anything but exactly one tensor is given
+     */
+    override call(inputs: tf.Tensor | tf.Tensor[], kwargs: Kwargs) {
+        if (Array.isArray(inputs) && inputs.length !== 1) {
+            throw Error(`${this.getClassName()}::call ${this.name} expects exactly` +
+                ` 1 tensor input, received ${inputs.length}`);
+        }
+        return tf.tidy(() => {
+            const embedded = this.embedding.apply(inputs);
+            const encoded = this.positional.apply(embedded) as tf.Tensor;
+            // kwargs are handed on because the dropout layer decides from the
+            // `training` entry whether units are dropped; without it the layer
+            // would pass its input through unchanged, even during training
+            return this.dropoutLayer.apply(encoded, kwargs) as tf.Tensor;
+        })
+    }
+    /**
+     * Build the sublayers and enable serialization
+     *
+     * @throws if no shape is given, the (first) shape is not of rank 2, or it
+     * holds more tokens than `maxSequenceLength`
+     */
+    override build(inputShape: tf.Shape | tf.Shape[]): void {
+        let input_shapes: tf.Shape[] = [];
+        // only consider the first shape if multiple provided
+        if (Array.isArray(inputShape) && Array.isArray(inputShape[0])) {
+            // input is an array of shapes
+            input_shapes = inputShape as tf.Shape[];
+        } else if (inputShape.length !== 0) {
+            // input is a single shape
+            input_shapes = [inputShape as tf.Shape];
+        }
+        // undefined when an empty shape (or empty list of shapes) was given
+        const shape = input_shapes[0];
+        if (shape === undefined || shape.length !== 2 || shape[1]! > this.maxSequenceLength) {
+            throw Error(`${this.getClassName()}::build ${this.name} expected an input of` +
+                ` shape [batch, tokens] where tokens <= ${this.maxSequenceLength},` +
+                ` received ${JSON.stringify(shape)}`);
+        }
+        // initialize the sublayers' weights
+        this.embedding.build(shape);
+        this.positional.build(this.embedding.computeOutputShape(shape));
+        // no need to rename weights, haven't found a case where their names collide
+        this.trainableWeights = [
+            ...this.embedding.trainableWeights,
+            ...this.positional.trainableWeights
+        ];
+        super.build(shape);
+    }
+    /**
+     * The output shape, for an input shape of [batch, sequences], is
+     * [batch, sequences, embedDim]
+     */
+    override computeOutputShape(inputShape: tf.Shape | tf.Shape[]): tf.Shape | tf.Shape[] {
+        const embedding_shape = this.embedding.computeOutputShape(inputShape);
+        const positional_shape = this.positional.computeOutputShape(embedding_shape);
+        return positional_shape;
+    }
+    /**
+     * The constructor arguments merged with the base layer's config; entries of
+     * the base config win if a key exists in both.
+     */
+    override getConfig(): tf.serialization.ConfigDict {
+        const base_config = super.getConfig();
+        const config = {
+            embedDim: this.embedDim,
+            vocabularySize: this.vocabularySize,
+            maxSequenceLength: this.maxSequenceLength,
+            dropout: this.dropout,
+        }
+        Object.assign(config, base_config);
+        return config;
+    }
+}
+tf.serialization.registerClass(TokenAndPositionalEmbedding);

package/src/layers/transformer_decoder.test.ts ADDED Viewed

@@ -0,0 +1,100 @@
+import * as tf from '@tensorflow/tfjs';
+import { TransformerDecoder } from '@/layers/transformer_decoder';
+// disables warning for using the faster node backend,
+// https://github.com/tensorflow/tfjs/issues/5349#issuecomment-885170504
+tf.env().set('IS_NODE', false);
+describe("TransformerDecoder tests", () => {
+    it("should return an output with the same shape as the input", () => {
+        const input = tf.randomUniform([2, 3, 12]);
+        const decoder = new TransformerDecoder({
+            numHeads: 2, embedDim: input.shape.at(-1)!,
+            dropout: 0.5, activation: "gelu", dimsFeedForward: 321, useBias: false
+        });
+        const output = decoder.apply(input) as tf.Tensor;
+        // compare every dimension, an equal rank alone does not make the shapes equal
+        expect(output.shape).toEqual(input.shape);
+    })
+    test("forward calls", () => {
+        const input = tf.randomUniform([2, 3, 12]);
+        const decoder = new TransformerDecoder({ numHeads: 2, embedDim: input.shape.at(-1)! });
+        expect(() => decoder.apply(input)).not.toThrow();
+        expect(() => decoder.apply([input])).not.toThrow();
+        // causal masking
+        const causal = new TransformerDecoder({ numHeads: 2, embedDim: input.shape.at(-1)!, causal: true });
+        expect(() => causal.apply(input)).not.toThrow();
+        expect(() => causal.apply([input])).not.toThrow();
+    })
+    it("should fail to instantiate a layer if the embedding dimension is not divisible by the heads count", () => {
+        const input = tf.randomUniform([2, 3, 12]);
+        // 12 is divisible by 3, but not by 5
+        expect(() => new TransformerDecoder({ numHeads: 3, embedDim: input.shape.at(-1)! })).not.toThrow();
+        expect(() => new TransformerDecoder({ numHeads: 5, embedDim: input.shape.at(-1)! })).toThrow();
+    })
+    it("should not accept non-rank 3 tensor inputs", () => {
+        const embed_dim = 12;
+        const BAD_RANK4 = tf.randomUniform([2, 3, 12, embed_dim]);
+        const BAD_RANK2 = tf.randomUniform([2, embed_dim]);
+        const GOOD = tf.randomUniform([2, 3, embed_dim]);
+        const mask = tf.randomUniform([GOOD.shape[0]!, GOOD.shape[1]!], -1, 2, "bool");
+        let decoder = new TransformerDecoder({ numHeads: 2, embedDim: embed_dim });
+        // BAD
+        expect(() => decoder.apply(BAD_RANK4)).toThrow();
+        expect(() => decoder.apply(BAD_RANK2)).toThrow();
+        // OK
+        decoder = new TransformerDecoder({ numHeads: 2, embedDim: embed_dim });
+        expect(() => decoder.apply(GOOD)).not.toThrow();
+        expect(() => decoder.apply([GOOD])).not.toThrow();
+        expect(() => decoder.apply([GOOD, mask])).not.toThrow();
+    })
+    it("should not accept inputs that are less or more than 1 and 2 tensors", () => {
+        const input = tf.randomUniform([2, 3, 12]);
+        let decoder = new TransformerDecoder({ numHeads: 1, embedDim: input.shape.at(-1)! });
+        // OK
+        expect(() => decoder.apply(input)).not.toThrow();
+        expect(() => decoder.apply([input])).not.toThrow();
+        // BAD
+        decoder = new TransformerDecoder({ numHeads: 1, embedDim: input.shape.at(-1)! });
+        expect(() => decoder.apply([])).toThrow(); // stops at build()
+        decoder.apply(input); // get past the initial build
+        expect(() => decoder.apply([input, input, input])).toThrow();
+        expect(() => decoder.apply([input, input, input, input])).toThrow();
+        // BAD (tests build())
+        decoder = new TransformerDecoder({ numHeads: 1, embedDim: input.shape.at(-1)! });
+        expect(() => decoder.apply([input, input, input])).toThrow();
+        expect(() => decoder.apply([input, input, input, input])).toThrow();
+    })
+    it("should return a non-empty config dict", () => {
+        const input = tf.randomUniform([2, 3, 12]);
+        const decoder = new TransformerDecoder({ numHeads: 1, embedDim: input.shape.at(-1)! });
+        // compare the number of keys; an array is never `toBe(0)`, so comparing
+        // the array itself can not fail
+        expect(Object.keys(decoder.getConfig()).length).not.toBe(0);
+    })
+})