npm - @plurnk/plurnk-mimetypes-embeddings - Versions diffs - 0.1.0 - Mend

@plurnk/plurnk-mimetypes-embeddings 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

package/.model-pin +1 -0
package/LICENSE +21 -0
package/README.md +45 -0
package/index.js +42 -0
package/model/config.json +25 -0
package/model/model.sha256 +4 -0
package/model/onnx/model_quantized.onnx +0 -0
package/model/tokenizer.json +30686 -0
package/model/tokenizer_config.json +15 -0
package/package.json +38 -0

package/.model-pin ADDED Viewed

	@@ -0,0 +1 @@
1	+ 751bff37182d3f1213fa05d7196b954e230abad9

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 PossumTech Laboratories, LLC
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md ADDED Viewed

@@ -0,0 +1,45 @@
+# @plurnk/plurnk-mimetypes-embeddings
+Opt-in embedder for [`@plurnk/plurnk-mimetypes`](https://github.com/plurnk/plurnk-mimetypes)' `embedding` channel (issue #24). Install it and the framework's loader finds it; nothing else to configure.
+## Model
+- **Xenova/all-MiniLM-L6-v2**, q8 quantized onnx (`onnx/model_quantized.onnx`), **384 dimensions**.
+- Pinned revision: `751bff37182d3f1213fa05d7196b954e230abad9` (`.model-pin`).
+- Model files are **bundled in the package** — no runtime network, ever. `env.allowRemoteModels = false` is set process-wide at import, and the loader is locked to the bundled `model/` directory. Integrity manifest in `model/model.sha256` (`npm run verify:model`).
+- Inference runs on `@huggingface/transformers` (onnxruntime ships inside it).
+## Install
+```sh
+npm install @plurnk/plurnk-mimetypes-embeddings
+```
+## Usage
+The framework resolves this package lazily when the `embedding` channel is requested:
+```js
+const result = await mimetypes.process(
+    { content: "hello", hint: "text/plain" },
+    { channels: ["embedding"] },
+);
+// result.embedding: Uint8Array, 1536 bytes — native-endian raw Float32 × 384,
+// mean-pooled, L2-normalized. Store verbatim as a BLOB; cosine-rank over a
+// Float32Array view. The same embed() serves entry bodies and query text.
+```
+Direct surface, if you want it without the framework:
+```js
+import { embed, dimension } from "@plurnk/plurnk-mimetypes-embeddings";
+const bytes = await embed("database connection error"); // Uint8Array(4 × dimension)
+```
+Input beyond the model's 512-token window is truncated by the tokenizer.
+## Scripts
+- `npm run build:model` — re-download the pinned revision into `model/` and regenerate `model/model.sha256`.
+- `npm run verify:model` — check the committed model bytes against the manifest.
+- `npm test` — unit (duck surface, determinism, normalization, cosine sanity) + integration (real framework loader path).

package/index.js ADDED Viewed

@@ -0,0 +1,42 @@
+// Opt-in embedder for @plurnk/plurnk-mimetypes' "embedding" channel
+// (plurnk-mimetypes#24). The framework duck-checks exactly this surface:
+// embed(text) → Promise<Uint8Array> of native-endian raw Float32 bytes
+// (4 × dimension), plus the dimension constant.
+//
+// Model: Xenova/all-MiniLM-L6-v2, q8 quantized onnx, bundled in model/ at
+// the revision in .model-pin. Hermetic: transformers.js is locked to the
+// bundled directory and remote fetches are disabled process-wide — this is
+// a global env mutation, deliberate enforcement, not a default.
+import path from "node:path";
+import { fileURLToPath } from "node:url";
+import { env, pipeline } from "@huggingface/transformers";
+// Directory that contains this module.
+const here = path.dirname(fileURLToPath(import.meta.url));
+// Local model ids are resolved under this directory, and remote fetches are
+// switched off on the shared transformers.js env object.
+env.localModelPath = here;
+env.allowRemoteModels = false;
+// Length of the embedding vector in Float32 elements; embed() rejects any
+// pipeline output of a different length.
+export const dimension = 384;
+// Model identity, surfaced by the framework as ProcessResult.embeddingModel.
+// Consumers store it alongside each vector BLOB — vectors from different
+// models are silently incomparable, and this is the staleness detector.
+export const model = "Xenova/all-MiniLM-L6-v2@751bff37";
+// Created lazily by the first embed() call and reused by later calls.
+let pipelinePromise = null;
+// text → 1536 bytes (Float32 × 384), mean-pooled, L2-normalized. Input
+// beyond the model's 512-token window is truncated by the tokenizer
+// (pipeline default). The returned Uint8Array owns its buffer exactly —
+// safe to store verbatim as a BLOB.
+export async function embed(text) {
+    // "model" resolves to <package>/model/ under env.localModelPath; dtype
+    // q8 selects onnx/model_quantized.onnx.
+    pipelinePromise ??= pipeline("feature-extraction", "model", { dtype: "q8" });
+    const extractor = await pipelinePromise;
+    const output = await extractor(text, { pooling: "mean", normalize: true });
+    const data = output.data;
+    if (!(data instanceof Float32Array) || data.length !== dimension) {
+        throw new Error(`embed: expected Float32Array[${dimension}], got ${data?.constructor?.name}[${data?.length}]`);
+    }
+    return new Uint8Array(data.buffer.slice(data.byteOffset, data.byteOffset + data.byteLength));
+}

package/model/config.json ADDED Viewed

@@ -0,0 +1,25 @@
+{
+  "_name_or_path": "sentence-transformers/all-MiniLM-L6-v2",
+  "architectures": [
+    "BertModel"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "classifier_dropout": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 384,
+  "initializer_range": 0.02,
+  "intermediate_size": 1536,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.29.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "vocab_size": 30522
+}

package/model/model.sha256 ADDED Viewed

@@ -0,0 +1,4 @@
+7135149f7cffa1a573466c6e4d8423ed73b62fd2332c575bf738a0d033f70df7  config.json
+da0e79933b9ed51798a3ae27893d3c5fa4a201126cef75586296df9b4d2c62a0  tokenizer.json
+9261e7d79b44c8195c1cada2b453e55b00aeb81e907a6664974b4d7776172ab3  tokenizer_config.json
+afdb6f1a0e45b715d0bb9b11772f032c399babd23bfc31fed1c170afc848bdb1  onnx/model_quantized.onnx

package/model/onnx/model_quantized.onnx ADDED Viewed

Binary file