@plurnk/plurnk-mimetypes-embeddings 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.model-pin ADDED
@@ -0,0 +1 @@
1
+ 751bff37182d3f1213fa05d7196b954e230abad9
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 PossumTech Laboratories, LLC
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,45 @@
1
+ # @plurnk/plurnk-mimetypes-embeddings
2
+
3
+ Opt-in embedder for [`@plurnk/plurnk-mimetypes`](https://github.com/plurnk/plurnk-mimetypes)' `embedding` channel (issue #24). Install it and the framework's loader finds it; nothing else to configure.
4
+
5
+ ## Model
6
+
7
+ - **Xenova/all-MiniLM-L6-v2**, q8 quantized onnx (`onnx/model_quantized.onnx`), **384 dimensions**.
8
+ - Pinned revision: `751bff37182d3f1213fa05d7196b954e230abad9` (`.model-pin`).
9
+ - Model files are **bundled in the package** — no runtime network, ever. `env.allowRemoteModels = false` is set process-wide at import, and the loader is locked to the bundled `model/` directory. Integrity manifest in `model/model.sha256` (`npm run verify:model`).
10
+ - Inference runs on `@huggingface/transformers` (onnxruntime ships inside it).
11
+
12
+ ## Install
13
+
14
+ ```sh
15
+ npm install @plurnk/plurnk-mimetypes-embeddings
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ The framework resolves this package lazily when the `embedding` channel is requested:
21
+
22
+ ```js
23
+ const result = await mimetypes.process(
24
+ { content: "hello", hint: "text/plain" },
25
+ { channels: ["embedding"] },
26
+ );
27
+ // result.embedding: Uint8Array, 1536 bytes — native-endian raw Float32 × 384,
28
+ // mean-pooled, L2-normalized. Store verbatim as a BLOB; cosine-rank over a
29
+ // Float32Array view. The same embed() serves entry bodies and query text.
30
+ ```
31
+
32
+ Direct surface, if you want it without the framework:
33
+
34
+ ```js
35
+ import { embed, dimension } from "@plurnk/plurnk-mimetypes-embeddings";
36
+ const bytes = await embed("database connection error"); // Uint8Array(4 × dimension)
37
+ ```
38
+
39
+ Input beyond the model's 512-token window is truncated by the tokenizer.
40
+
41
+ ## Scripts
42
+
43
+ - `npm run build:model` — re-download the pinned revision into `model/` and regenerate `model/model.sha256`.
44
+ - `npm run verify:model` — check the committed model bytes against the manifest.
45
+ - `npm test` — unit (duck surface, determinism, normalization, cosine sanity) + integration (real framework loader path).
package/index.js ADDED
@@ -0,0 +1,42 @@
1
+ // Opt-in embedder for @plurnk/plurnk-mimetypes' "embedding" channel
2
+ // (plurnk-mimetypes#24). The framework duck-checks exactly this surface:
3
+ // embed(text) → Promise<Uint8Array> of native-endian raw Float32 bytes
4
+ // (4 × dimension), plus the dimension constant.
5
+ //
6
+ // Model: Xenova/all-MiniLM-L6-v2, q8 quantized onnx, bundled in model/ at
7
+ // the revision in .model-pin. Hermetic: transformers.js is locked to the
8
+ // bundled directory and remote fetches are disabled process-wide — this is
9
+ // a global env mutation, deliberate enforcement, not a default.
10
+ import path from "node:path";
11
+ import { fileURLToPath } from "node:url";
12
+ import { env, pipeline } from "@huggingface/transformers";
13
+
14
// Directory that holds this module; the loader is pointed here so that the
// model id used by embed() resolves inside the package.
const modulePath = fileURLToPath(import.meta.url);
const packageDir = path.dirname(modulePath);

// Process-wide transformers.js settings (deliberate global mutation): load
// models only from the package directory and never fetch remotely.
Object.assign(env, {
  localModelPath: packageDir,
  allowRemoteModels: false,
});

// Number of Float32 components in every vector embed() returns.
export const dimension = 384;

// Model identity, surfaced by the framework as ProcessResult.embeddingModel.
// Consumers store it alongside each vector BLOB — vectors from different
// models are silently incomparable, and this is the staleness detector.
export const model = "Xenova/all-MiniLM-L6-v2@751bff37";

// Lazily created feature-extraction pipeline, shared by all embed() calls.
let pipelinePromise = null;
26
+
27
// text → 1536 bytes (Float32 × 384), mean-pooled, L2-normalized. Input
// beyond the model's 512-token window is truncated by the tokenizer
// (pipeline default). The returned Uint8Array owns its buffer exactly —
// safe to store verbatim as a BLOB.
//
// Rejects with whatever the pipeline load or inference rejects with, or
// with an Error when the output is not a Float32Array of `dimension`
// elements.
export async function embed(text) {
  // "model" resolves to <package>/model/ under env.localModelPath; dtype
  // q8 selects onnx/model_quantized.onnx. The pipeline is created once and
  // shared by every caller. A failed load is NOT cached: the slot is cleared
  // before the rejection propagates, so the next call retries instead of
  // replaying the same rejection forever.
  pipelinePromise ??= pipeline("feature-extraction", "model", { dtype: "q8" }).catch((err) => {
    pipelinePromise = null;
    throw err;
  });
  const extractor = await pipelinePromise;
  const output = await extractor(text, { pooling: "mean", normalize: true });
  const data = output.data;
  if (!(data instanceof Float32Array) || data.length !== dimension) {
    throw new Error(`embed: expected Float32Array[${dimension}], got ${data?.constructor?.name}[${data?.length}]`);
  }
  // Copy exactly the view's bytes — `data` may be a window onto a larger
  // buffer, and the caller must get a buffer of precisely 4 × dimension bytes.
  return new Uint8Array(data.buffer.slice(data.byteOffset, data.byteOffset + data.byteLength));
}
@@ -0,0 +1,25 @@
1
+ {
2
+ "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
3
+ "architectures": [
4
+ "BertModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 384,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 1536,
14
+ "layer_norm_eps": 1e-12,
15
+ "max_position_embeddings": 512,
16
+ "model_type": "bert",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 6,
19
+ "pad_token_id": 0,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.29.2",
22
+ "type_vocab_size": 2,
23
+ "use_cache": true,
24
+ "vocab_size": 30522
25
+ }
@@ -0,0 +1,4 @@
1
+ 7135149f7cffa1a573466c6e4d8423ed73b62fd2332c575bf738a0d033f70df7 config.json
2
+ da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0 tokenizer.json
3
+ 9261e7d79b44c8195c1cada2b453e55b00aeb81e907a6664974b4d7776172ab3 tokenizer_config.json
4
+ afdb6f1a0e45b715d0bb9b11772f032c399babd23bfc31fed1c170afc848bdb1 onnx/model_quantized.onnx
Binary file