npm - @stellarapp/tfjs-stellar - Versions diffs - 1.0.0 - Mend

@stellarapp/tfjs-stellar 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

package/jest.config.ts +203 -0
package/package.json +24 -0
package/src/index.ts +93 -0
package/src/kv_cache.ts +205 -0
package/src/layers/cached_rope_multihead_attention.test.ts +59 -0
package/src/layers/cached_rope_multihead_attention.ts +113 -0
package/src/layers/gpt_decoder_block.ts +77 -0
package/src/layers/multihead_attention.test.ts +212 -0
package/src/layers/multihead_attention.ts +371 -0
package/src/layers/positional_encoding.test.ts +113 -0
package/src/layers/positional_encoding.ts +158 -0
package/src/layers/rotary_position_embedding.test.ts +107 -0
package/src/layers/rotary_position_embedding.ts +163 -0
package/src/layers/token_and_positional_embedding.test.ts +81 -0
package/src/layers/token_and_positional_embedding.ts +149 -0
package/src/layers/transformer_decoder.test.ts +100 -0
package/src/layers/transformer_decoder.ts +236 -0
package/src/layers/transformer_encoder.test.ts +85 -0
package/src/layers/transformer_encoder.ts +224 -0
package/src/losses/dice.ts +156 -0
package/src/losses/index.ts +1 -0
package/src/metrics.ts +32 -0
package/src/models/gpt_model.ts +232 -0
package/src/models/index.ts +2 -0
package/src/models/llm_model.ts +355 -0
package/src/models/u_net.ts +240 -0
package/src/packing_mask.ts +28 -0
package/src/testing.ts +1 -0
package/src/tfjs_types.ts +15 -0
package/src/utils.test.ts +101 -0
package/src/utils.ts +86 -0
package/tsconfig.json +49 -0

package/src/layers/cached_rope_multihead_attention.ts ADDED Viewed

@@ -0,0 +1,113 @@
+import * as tf from '@tensorflow/tfjs';
+import { KvCacheContainer } from "@/kv_cache";
+import { MultiHeadAttention, type MultiHeadAttentionArgs } from '@/layers/multihead_attention';
+import { RotaryPositionEmbedding } from '@/layers/rotary_position_embedding';
+import { type Kwargs } from '@tensorflow/tfjs-layers/dist/types';
+/**
+ * MultiHeadAttention with RoPE and KV caching. If using KV caching, this layer
+ * should be used in a custom training loop because it requires the cache to be
+ * passed through the `kwargs.kvCache` argument during the `layer.apply()`
+ * forward propagation.
+ *
+ * If a KV cache is not provided, then this layer operates as MultiHeadAttention with RoPE.
+ */
+export class CachedRoPEMultiHeadAttention extends MultiHeadAttention {
+    static className = "CachedRoPEMultiHeadAttention";
+    /** Rotary position embedding applied to the query and key right after they are split into heads. */
+    protected rope: tf.layers.Layer;
+    /**
+     * @param args forwarded unchanged to the MultiHeadAttention constructor
+     */
+    constructor(args: MultiHeadAttentionArgs) {
+        super(args);
+        // RoPE works on a single head, so it is sized to embedDim / numHeads
+        this.rope = new RotaryPositionEmbedding({ dim: Math.floor(this.embedDim / this.numHeads) });
+    }
+    /**
+     * Projects the inputs, splits them into heads (RoPE is added by `splitHeads`),
+     * optionally replaces the key/value with the cached history, then runs
+     * scaled dot product attention followed by the output projection.
+     *
+     * @param query_input query tensor; its first two dimensions plus `embedDim` define the output shape
+     * @param key_input key tensor
+     * @param value_input value tensor
+     * @param packing_mask forwarded to `scaledDotProductionAttention`, may be null
+     * @param causal_mask forwarded to `scaledDotProductionAttention`, may be null
+     * @param kwargs `kvCache` enables caching unless `training` is true; `attentionMask` is forwarded to the attention call
+     * @returns the output projection of the concatenated heads
+     */
+    protected override forward(
+        query_input: tf.Tensor,
+        key_input: tf.Tensor,
+        value_input: tf.Tensor,
+        packing_mask: tf.Tensor | null,
+        causal_mask: tf.Tensor | null,
+        kwargs: Kwargs): tf.Tensor {
+        return tf.tidy(() => {
+            const { query, key, value } = this.applyInputProjections(query_input, key_input, value_input);
+            // swap the seq and heads dimensions: [batch, seq, heads, head_dim] -> [batch, heads, seq, head_dim]
+            const move_head_dim_forward = [0, 2, 1, 3];
+            const split = this.splitHeads(query, key, value, move_head_dim_forward);
+            const query_split = split.query_split;
+            let key_split = split.key_split;
+            let value_split = split.value_split;
+            if (kwargs.training !== true && kwargs.kvCache) {
+                // runs on inference, updates the KV cache and gets the historical key and value
+                // NOTE(review): the cached tensors are obtained inside this tf.tidy scope; presumably
+                // KvCacheContainer keeps them alive across calls (e.g. tf.keep) - confirm.
+                const cached_kv = this.getCachedKV(
+                    kwargs.kvCache as KvCacheContainer, key_split, value_split);
+                key_split = cached_kv.keyCache;
+                value_split = cached_kv.valueCache;
+            }
+            // apply scaled dot product attention; the result is transposed back with the same
+            // permutation below, so it is expected in [batch, heads, seq, head_dim] layout
+            const spda = MultiHeadAttention.scaledDotProductionAttention(
+                query_split, key_split, value_split,
+                kwargs.attentionMask ?? null, packing_mask, causal_mask,
+                this.dropout, this.causal, kwargs);
+            // concat heads ([batch, seq, heads, head_dim] -> [batch, seq, embedDim]) and apply the output projection
+            const output = this.outputProjection.apply(
+                spda.transpose(move_head_dim_forward).reshape(
+                    [query_input.shape[0], query_input.shape[1]!, this.embedDim]));
+            return output as tf.Tensor;
+        });
+    }
+    /**
+     * Appends the new key/value to this layer's entry in the cache container and
+     * returns the entry, creating it first when the container has none for `this.name`.
+     *
+     * @param kv_container cache container shared across layers, keyed by layer name
+     * @param key_split per-head key tensor for the current call
+     * @param value_split per-head value tensor for the current call
+     * @returns the cache entry returned by `kv_container.update`
+     * @throws Error prefixed with the class name and layer name when the container fails
+     *         or still has no entry after `create`
+     */
+    protected getCachedKV(kv_container: KvCacheContainer, key_split: tf.Tensor4D, value_split: tf.Tensor4D) {
+        try {
+            const existing = kv_container.update(this.name, key_split, value_split);
+            if (existing) {
+                return existing;
+            }
+            // first call for this layer: allocate the entry, then retry the update
+            kv_container.create(this.name, {
+                batchSize: key_split.shape[0],
+                numHeads: this.numHeads,
+                headDim: this.embedDim / this.numHeads,
+            });
+            const created = kv_container.update(this.name, key_split, value_split);
+            if (!created) {
+                // fail here instead of returning an asserted-but-missing entry to the caller
+                throw new Error("cache entry is still missing after create()");
+            }
+            return created;
+        } catch (error: unknown) {
+            // String() also copes with thrown values that have no toString (null/undefined)
+            throw new Error(`${this.getClassName()}::getCachedKV ${this.name} ${String(error)}`);
+        }
+    }
+    /**
+     * Adds RoPE position encoding right after splitting heads.
+     *
+     * The query and key go through `this.rope`; the value is only reshaped and transposed.
+     * NOTE(review): `rope.apply` receives no position offset here, so with a KV cache the
+     * new tokens are presumably rotated as if they started at position 0 - confirm that
+     * RotaryPositionEmbedding or the cache accounts for the already cached length.
+     *
+     * @param query projected query, reshaped to [batch, -1, numHeads, embedDim / numHeads]
+     * @param key projected key, reshaped the same way
+     * @param value projected value, reshaped the same way
+     * @param shuffle permutation applied to each reshaped tensor
+     * @returns `query_split`, `key_split` and `value_split` as rank 4 tensors
+     */
+    protected override splitHeads(query: tf.Tensor, key: tf.Tensor, value: tf.Tensor, shuffle: number[]) {
+        const batch_size = query.shape[0];
+        const split_heads = [batch_size, -1, this.numHeads, this.embedDim / this.numHeads];
+        return tf.tidy(() => {
+            return {
+                query_split: (this.rope.apply(query.reshape(split_heads)) as tf.Tensor)
+                    .transpose(shuffle) as tf.Tensor4D,
+                key_split: (this.rope.apply(key.reshape(split_heads)) as tf.Tensor)
+                    .transpose(shuffle) as tf.Tensor4D,
+                value_split: value.reshape(split_heads).transpose(shuffle) as tf.Tensor4D
+            };
+        });
+    }
+}
+tf.serialization.registerClass(CachedRoPEMultiHeadAttention);

package/src/layers/gpt_decoder_block.ts ADDED Viewed

@@ -0,0 +1,77 @@
+import * as tf from "@tensorflow/tfjs";
+import { type Kwargs } from "@tensorflow/tfjs-layers/dist/types";
+import { type MultiHeadAttentionArgs } from "@/layers/multihead_attention";
+import { TransformerDecoder, type TransformerDecoderArgs } from "@/layers/transformer_decoder";
+export interface GPTDecoderBlockArgs extends Omit<MultiHeadAttentionArgs, "causal"> { // attention args without `causal`; NOTE(review): not referenced by GPT2DecoderBlock's constructor in this file, which takes TransformerDecoderArgs
+    dimsFeedForward?: number; // optional size of the intermediate feed forward layer
+}
+/**
+ * GPT-2 style transformer block built on top of TransformerDecoder. Both
+ * sub-blocks apply layer normalization first (pre-layer-normalization) and add
+ * the untouched input back as a residual at the end.
+ *
+ * NOTE(review): the GELU activation that distinguishes GPT-2 from the original
+ * ReLU feed forward is not selected anywhere in this class - presumably it is
+ * configured by TransformerDecoder or its args; confirm.
+ *
+ * @param numHeads number of attention heads to use
+ * @param embedDim the embedding size of the input (input embeddings, typically the last dimension)
+ * @param causal use causal masking on inputs (masks future inputs to prevent looking ahead), default `true`
+ * @param dropout use dropout during the attention calculations, default `0.1`
+ * @param dimsFeedForward the size of the intermediate feed forward layer, default `2048`
+ * @param useBias use bias for the dense sublayers and multiHead attention's dense sublayers, default `true`
+ */
+export class GPT2DecoderBlock extends TransformerDecoder {
+    static className = "GPT2DecoderBlock";
+    constructor(args: TransformerDecoderArgs) {
+        super(args);
+    }
+    /**
+     * Attention sub-block: layer normalization -> causal self attention ->
+     * dropout, then the residual connection back to the input `x`.
+     */
+    protected override causalSelfAttentionBlock(x: tf.Tensor, kwargs: Kwargs): tf.Tensor {
+        return tf.tidy(() => {
+            const normalized = this.causalSelfAttentionNorm.apply(x, kwargs) as tf.Tensor;
+            const attended = this.causalSelfAttention.apply(normalized, kwargs) as tf.Tensor;
+            const regularized = this.causalSelfAttentionDropout.apply(attended, kwargs) as tf.Tensor;
+            // residual connection uses the input as it was before normalization
+            return tf.add(regularized, x);
+        });
+    }
+    /**
+     * Feed forward sub-block: layer normalization -> first dense layer ->
+     * second dense layer -> dropout, then the residual connection back to `x`.
+     */
+    protected override feedForwardBlock(x: tf.Tensor, kwargs: Kwargs): tf.Tensor {
+        return tf.tidy(() => {
+            const normalized = this.feedFowardNorm.apply(x, kwargs);
+            const expanded = this.feedforward1.apply(normalized, kwargs);
+            const projected = this.feedforward2.apply(expanded, kwargs);
+            const regularized = this.feedForwardDropout.apply(projected, kwargs) as tf.Tensor;
+            // residual connection uses the input as it was before normalization
+            return tf.add(regularized, x);
+        });
+    }
+    // build() is inherited as is: layer normalization returns the shape it is
+    // given, so moving it to the front of each sub-block leaves every other
+    // sub-layer's weight and output shapes untouched
+}
+tf.serialization.registerClass(GPT2DecoderBlock);

package/src/layers/multihead_attention.test.ts ADDED Viewed

@@ -0,0 +1,212 @@
+import * as tf from '@tensorflow/tfjs';
+import { CachedRoPEMultiHeadAttention } from '@/layers/cached_rope_multihead_attention';
+import { generateCausalAttentionMask } from '@/utils';
+import { MultiHeadAttention } from '@/layers/multihead_attention';
+// sets the IS_NODE flag to false to disable the warning about using the faster node backend, see
+// https://github.com/tensorflow/tfjs/issues/5349#issuecomment-885170504
+tf.env().set('IS_NODE', false);
+describe("MultiHeadAttention tests", () => {
+    it("should fail to instantiate a layer if the input's embedding dimension is not divisible by the heads count", () => {
+        // 10 is not a multiple of 3, so construction is expected to throw
+        expect(() => new CachedRoPEMultiHeadAttention({ numHeads: 3, embedDim: 10 })).toThrow();
+        // 60 is a multiple of 15, so construction is expected to succeed
+        expect(() => new CachedRoPEMultiHeadAttention({ numHeads: 15, embedDim: 60 })).not.toThrow();
+    });
+    test("successful forward calls", () => {
+        const input = tf.randomUniform([2, 3, 12]);
+        // default (causal not set): a bare tensor and a one-element list are both accepted
+        const attention = new CachedRoPEMultiHeadAttention({ numHeads: 2, embedDim: input.shape.at(-1)! });
+        expect(() => attention.apply(input)).not.toThrow();
+        expect(() => attention.apply([input])).not.toThrow();
+        // same calls with causal masking switched on
+        const causal = new CachedRoPEMultiHeadAttention({ numHeads: 2, embedDim: input.shape.at(-1)!, causal: true });
+        expect(() => causal.apply(input)).not.toThrow();
+        expect(() => causal.apply([input])).not.toThrow();
+    });
+    test("query and value must have the same shape for scaled dot product attention to succeed", () => {
+        const matching_shape = [2, 3, 12];
+        const query = tf.randomUniform(matching_shape);
+        const key = tf.randomUniform(matching_shape);
+        const value = tf.randomUniform(matching_shape);
+        // NOTE(review): this value differs from both the query and the key only in its middle dimension (100 vs 3)
+        const oversized_value = tf.randomUniform([2, 100, 12]);
+        const layer = new CachedRoPEMultiHeadAttention({ numHeads: 2, embedDim: query.shape.at(-1)! });
+        // builds the deferred call so only the value tensor varies between the two expectations
+        const attend_with = (candidate: tf.Tensor) => () => layer.apply([query, key, candidate]);
+        expect(attend_with(value)).not.toThrow();
+        expect(attend_with(oversized_value)).toThrow();
+    })
+    it("should only accept rank 3 tensors", () => {
+        const embed_dims = 12;
+        const rank2 = tf.randomUniform([2, embed_dims]);
+        const rank3 = tf.randomUniform([2, 3, embed_dims]);
+        const rank4 = tf.randomUniform([2, 3, 10, embed_dims]);
+        const layer = new CachedRoPEMultiHeadAttention({ numHeads: 2, embedDim: embed_dims });
+        // the three accepted call forms for one tensor: bare, one-element list, three-element list
+        const call_forms = (t: tf.Tensor): Array<tf.Tensor | tf.Tensor[]> => [t, [t], [t, t, t]];
+        // rank 2 is rejected in every call form
+        for (const inputs of call_forms(rank2)) {
+            expect(() => layer.apply(inputs)).toThrow();
+        }
+        // rank 3 is accepted in every call form
+        for (const inputs of call_forms(rank3)) {
+            expect(() => layer.apply(inputs)).not.toThrow();
+        }
+        // rank 4 is rejected in every call form
+        for (const inputs of call_forms(rank4)) {
+            expect(() => layer.apply(inputs)).toThrow();
+        }
+        // any triple that contains a tensor of the wrong rank is rejected
+        const mixed_ranks: tf.Tensor[][] = [
+            [rank3, rank2, rank4],
+            [rank2, rank3, rank4],
+            [rank2, rank4, rank3],
+            [rank2, rank3, rank3],
+            [rank3, rank3, rank4],
+        ];
+        for (const inputs of mixed_ranks) {
+            expect(() => layer.apply(inputs)).toThrow();
+        }
+    })
+    it("should only accept 1 or 3 inputs total", () => {
+        const input = tf.randomUniform([2, 3, 12]);
+        let attention = new CachedRoPEMultiHeadAttention({ numHeads: 2, embedDim: input.shape.at(-1)! });
+        // OK: a single input, passed bare (with an explicitly undefined packing mask) or as a one-element list
+        expect(() => attention.apply(input, { packingMask: undefined })).not.toThrow();
+        expect(() => attention.apply([input])).not.toThrow();
+        // reinitialize to rerun build()
+        attention = new CachedRoPEMultiHeadAttention({ numHeads: 2, embedDim: input.shape.at(-1)! });
+        // OK: three inputs
+        expect(() => attention.apply([input, input, input])).not.toThrow();
+        // BAD: zero or two inputs
+        expect(() => attention.apply([])).toThrow();
+        expect(() => attention.apply([input, input])).toThrow();
+        // reinitialize to rerun build()
+        attention = new CachedRoPEMultiHeadAttention({ numHeads: 2, embedDim: input.shape.at(-1)! });
+        // BAD: four inputs
+        expect(() => attention.apply([input, input, input, input])).toThrow();
+    });
+    test("attention masking", () => {
+        const query = tf.randomUniform([2, 3, 12]);
+        const key = tf.randomUniform([2, 3, 12]);
+        const value = tf.randomUniform([2, 3, 12]);
+        const attention = new CachedRoPEMultiHeadAttention({ numHeads: 2, embedDim: query.shape.at(-1)!, causal: true });
+        expect(() => attention.call(query, {})).not.toThrow(); // single tensor passed straight to call() with empty kwargs
+        // cross attention: separate query, key and value tensors
+        expect(() => attention.call([query, key, value], {})).not.toThrow();
+        const query5 = tf.randomUniform([2, 5, 10]);
+        const key4 = tf.randomUniform([2, 4, 10]);
+        const value5 = tf.randomUniform([2, 4, 10]); // NOTE(review): despite the "5" in the name, the middle dimension is 4, the same as key4
+        const expected_mask = tf.tensor([[ // 5 rows by 4 columns, wrapped in one extra leading dimension
+            // vertical represents query, false means that token cannot attend to the keys
+            // horizontal represents key, false means that token cannot attend to the queries
+            [false, false, false, false],
+            [true, true, true, false,],
+            [true, true, true, false,],
+            [false, false, false, false],
+            [true, true, true, false,],
+        ]]);
+        const packing_mask = tf.tensor([ // 5x5 additive mask: 0 inside the two diagonal blocks (3x3 and 2x2), -1e7 elsewhere
+            [0, 0, 0, -1e7, -1e7],
+            [0, 0, 0, -1e7, -1e7],
+            [0, 0, 0, -1e7, -1e7],
+            [-1e7, -1e7, -1e7, 0, 0],
+            [-1e7, -1e7, -1e7, 0, 0]
+        ])
+        // for causal attention, the attention mask must be boolean, so a float32 mask is expected to throw
+        expect(() => MultiHeadAttention.scaledDotProductionAttention(query5, key4, value5, expected_mask.asType("float32"), null, null, 0.1, true, { scaling_factor: 10 })).toThrow();
+        // for causal attention, using pre-calculated causal mask; the float32 attention mask is still expected to throw
+        expect(() => MultiHeadAttention.scaledDotProductionAttention(query5, key4, value5, expected_mask.asType("float32"), null, generateCausalAttentionMask(query5.shape[1]!, key4.shape[1]!), 0.2, true, { scaling_factor: 10 })).toThrow();
+        // when not using causal attention, the attention mask can be a float32 tensor
+        expect(() => MultiHeadAttention.scaledDotProductionAttention(query5, key4, value5, expected_mask.asType("float32"), null, null, 0, false)).not.toThrow();
+        // packing mask for self attention (query5 is used as query, key and value)
+        expect(() => MultiHeadAttention.scaledDotProductionAttention(query5, query5, query5, null, packing_mask, null, 0.9, true)).not.toThrow();
+    })
+    it("should return a non-empty config dict", () => {
+        const input = tf.randomUniform([2, 3, 10]);
+        const attention = new CachedRoPEMultiHeadAttention({ numHeads: 1, embedDim: input.shape.at(-1)! });
+        // compare the number of keys: an array is never `toBe(0)`, so asserting on
+        // the array itself would pass even for an empty config
+        expect(Object.keys(attention.getConfig()).length).toBeGreaterThan(0);
+    });
+    test("causal attention hard coded values", () => {
+        // input and output shapes: [2, 3, 10]; NOTE(review): the `expected` values below are identical to the ones in the "non-causal attention hard coded values" test - confirm they were generated with causal masking
+        const input = tf.tensor([
+            [[0.2109915, 0.6158954, 0.6012088, 0.9867562, 0.8728716, 0.7496274, 0.8173883, 0.2958342, 0.9650571, 0.2075207],
+            [0.2946285, 0.9779906, 0.3203818, 0.4037617, 0.3762881, 0.9863171, 0.6655593, 0.7707329, 0.3216831, 0.7984023],
+            [0.9080769, 0.0026282, 0.379492, 0.0162054, 0.1939302, 0.2201049, 0.8190675, 0.0203963, 0.0114392, 0.5015539]],
+            [[0.6241482, 0.7631097, 0.6687831, 0.7259795, 0.0457698, 0.6889264, 0.0853676, 0.8697655, 0.3637198, 0.2105307],
+            [0.5221761, 0.4476321, 0.1244729, 0.8863543, 0.7319002, 0.2954829, 0.3200496, 0.0905503, 0.607977, 0.1309131],
+            [0.4693873, 0.4609751, 0.9170766, 0.7065565, 0.4795104, 0.3225758, 0.1353116, 0.7083887, 0.1928891, 0.967386]]
+        ]);
+        const expected = tf.tensor([
+            [[0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344],
+            [0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376],
+            [0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539]],
+            [[0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718],
+            [0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268],
+            [0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877]]
+        ]);
+        const attention = new CachedRoPEMultiHeadAttention({ numHeads: 1, embedDim: input.shape.at(-1)!, causal: true });
+        attention.build(input.shape); // create the weights up front so they can be overwritten below
+        attention.setWeights(attention.getWeights().map(weight => tf.onesLike(weight).mul(0.05))); // every weight becomes a constant 0.05 tensor
+        expect(expected.sub(attention.apply(input) as tf.Tensor).sum().dataSync()[0]).toBeLessThan(1e-6); // NOTE(review): this is a signed sum with no abs(), so it also passes whenever the output exceeds `expected` or errors cancel out; needs an absolute difference plus verified fixtures
+    })
+    test("non-causal attention hard coded values", () => {
+        // input and output shapes: [2, 3, 10]; NOTE(review): presumably `expected` was captured from a reference run - confirm it was produced with RoPE applied
+        const input = tf.tensor([
+            [[0.2109915, 0.6158954, 0.6012088, 0.9867562, 0.8728716, 0.7496274, 0.8173883, 0.2958342, 0.9650571, 0.2075207],
+            [0.2946285, 0.9779906, 0.3203818, 0.4037617, 0.3762881, 0.9863171, 0.6655593, 0.7707329, 0.3216831, 0.7984023],
+            [0.9080769, 0.0026282, 0.379492, 0.0162054, 0.1939302, 0.2201049, 0.8190675, 0.0203963, 0.0114392, 0.5015539]],
+            [[0.6241482, 0.7631097, 0.6687831, 0.7259795, 0.0457698, 0.6889264, 0.0853676, 0.8697655, 0.3637198, 0.2105307],
+            [0.5221761, 0.4476321, 0.1244729, 0.8863543, 0.7319002, 0.2954829, 0.3200496, 0.0905503, 0.607977, 0.1309131],
+            [0.4693873, 0.4609751, 0.9170766, 0.7065565, 0.4795104, 0.3225758, 0.1353116, 0.7083887, 0.1928891, 0.967386]]
+        ]);
+        const expected = tf.tensor([
+            [[0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344, 0.2055344],
+            [0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376, 0.205376],
+            [0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539, 0.2042539]],
+            [[0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718, 0.1966718],
+            [0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268, 0.1966268],
+            [0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877, 0.1966877]]
+        ]);
+        const attention = new CachedRoPEMultiHeadAttention({ numHeads: 1, embedDim: input.shape.at(-1)!, causal: false });
+        attention.build(input.shape); // create the weights up front so they can be overwritten below
+        attention.setWeights(attention.getWeights().map(weight => tf.onesLike(weight).mul(0.05))); // every weight becomes a constant 0.05 tensor
+        expect(expected.sub(attention.apply(input) as tf.Tensor).sum().dataSync()[0]).toBeLessThan(1e-6); // NOTE(review): this is a signed sum with no abs(), so it also passes whenever the output exceeds `expected` or errors cancel out; needs an absolute difference plus verified fixtures
+    });
+});