docsgov 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +242 -0
- package/dist/apispec/apispec.js +401 -0
- package/dist/apispec/apispec.test.js +444 -0
- package/dist/apispec/errors.js +17 -0
- package/dist/apispec/index.js +2 -0
- package/dist/check/doclinks.js +167 -0
- package/dist/check/index.js +8 -0
- package/dist/check/run.js +391 -0
- package/dist/check/run.test.js +513 -0
- package/dist/check/suggest.js +134 -0
- package/dist/check/suggest.test.js +92 -0
- package/dist/check/tokens.js +125 -0
- package/dist/cmd/main.js +330 -0
- package/dist/cmd/main.test.js +422 -0
- package/dist/codeq/cache.js +71 -0
- package/dist/codeq/cache.test.js +67 -0
- package/dist/codeq/errors.js +52 -0
- package/dist/codeq/grammars/tree-sitter-go.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-java.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-javascript.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-tsx.wasm +0 -0
- package/dist/codeq/grammars/tree-sitter-typescript.wasm +0 -0
- package/dist/codeq/index.js +11 -0
- package/dist/codeq/resolve.test.js +109 -0
- package/dist/codeq/resolver.js +128 -0
- package/dist/codeq/resolver.test.js +124 -0
- package/dist/codeq/resolvers/go.js +242 -0
- package/dist/codeq/resolvers/go.test.js +143 -0
- package/dist/codeq/resolvers/java.js +349 -0
- package/dist/codeq/resolvers/java.test.js +138 -0
- package/dist/codeq/resolvers/java_queries.js +63 -0
- package/dist/codeq/resolvers/javascript.js +412 -0
- package/dist/codeq/resolvers/javascript.test.js +125 -0
- package/dist/codeq/resolvers/javascript_queries.js +46 -0
- package/dist/codeq/resolvers/typescript.js +366 -0
- package/dist/codeq/resolvers/typescript.test.js +180 -0
- package/dist/codeq/resolvers/typescript_queries.js +78 -0
- package/dist/codeq/signature.js +50 -0
- package/dist/codeq/signature.test.js +50 -0
- package/dist/codeq/suggest.js +96 -0
- package/dist/codeq/treesitter.js +122 -0
- package/dist/codeq/treesitter.test.js +118 -0
- package/dist/config/config.js +74 -0
- package/dist/config/config.test.js +98 -0
- package/dist/config/fs.js +116 -0
- package/dist/config/glob.js +82 -0
- package/dist/config/glob.test.js +61 -0
- package/dist/config/index.js +4 -0
- package/dist/dedup/analyzer/analyzer.js +533 -0
- package/dist/dedup/analyzer/analyzer.test.js +530 -0
- package/dist/dedup/analyzer/canonical.js +74 -0
- package/dist/dedup/analyzer/canonical.test.js +70 -0
- package/dist/dedup/analyzer/cosine_clusters.js +169 -0
- package/dist/dedup/analyzer/cosine_clusters.test.js +131 -0
- package/dist/dedup/analyzer/distinctive.js +85 -0
- package/dist/dedup/analyzer/distinctive.test.js +49 -0
- package/dist/dedup/analyzer/exact_clusters.js +63 -0
- package/dist/dedup/analyzer/exact_clusters.test.js +81 -0
- package/dist/dedup/analyzer/index.js +14 -0
- package/dist/dedup/analyzer/multiplicity.js +110 -0
- package/dist/dedup/analyzer/multiplicity.test.js +123 -0
- package/dist/dedup/analyzer/order.js +22 -0
- package/dist/dedup/analyzer/partial_overlaps.js +65 -0
- package/dist/dedup/analyzer/partial_overlaps.test.js +161 -0
- package/dist/dedup/analyzer/preview.js +84 -0
- package/dist/dedup/analyzer/preview.test.js +46 -0
- package/dist/dedup/analyzer/safety.js +27 -0
- package/dist/dedup/analyzer/safety.test.js +39 -0
- package/dist/dedup/config.js +18 -0
- package/dist/dedup/configload.js +299 -0
- package/dist/dedup/configload.test.js +410 -0
- package/dist/dedup/dedup.index.test.js +203 -0
- package/dist/dedup/dedup.js +143 -0
- package/dist/dedup/dedup.test.js +212 -0
- package/dist/dedup/dedupcfg/config.js +112 -0
- package/dist/dedup/dedupcfg/config.test.js +70 -0
- package/dist/dedup/dedupcfg/index.js +1 -0
- package/dist/dedup/deduptypes/index.js +1 -0
- package/dist/dedup/deduptypes/types.js +9 -0
- package/dist/dedup/deduptypes/types.test.js +34 -0
- package/dist/dedup/embedder/cache.js +23 -0
- package/dist/dedup/embedder/cache.test.js +50 -0
- package/dist/dedup/embedder/constants.js +10 -0
- package/dist/dedup/embedder/embedder.js +76 -0
- package/dist/dedup/embedder/embedder.mock.test.js +128 -0
- package/dist/dedup/embedder/embedder.test.js +96 -0
- package/dist/dedup/embedder/errors.js +20 -0
- package/dist/dedup/embedder/errors.test.js +35 -0
- package/dist/dedup/embedder/index.js +4 -0
- package/dist/dedup/embedder/session.js +78 -0
- package/dist/dedup/embedder/session.test.js +172 -0
- package/dist/dedup/gitignore.js +97 -0
- package/dist/dedup/gitignore.test.js +98 -0
- package/dist/dedup/index.js +11 -0
- package/dist/dedup/indexdb/errors.js +48 -0
- package/dist/dedup/indexdb/index.js +6 -0
- package/dist/dedup/indexdb/indexdb.js +302 -0
- package/dist/dedup/indexdb/indexdb.test.js +739 -0
- package/dist/dedup/indexdb/load.js +110 -0
- package/dist/dedup/indexdb/migrations.js +58 -0
- package/dist/dedup/indexdb/schema.js +83 -0
- package/dist/dedup/indexer/index.js +9 -0
- package/dist/dedup/indexer/indexer.js +501 -0
- package/dist/dedup/indexer/indexer.test.js +510 -0
- package/dist/dedup/indexer/links.js +89 -0
- package/dist/dedup/mdsection/anchor.js +60 -0
- package/dist/dedup/mdsection/anchor.test.js +39 -0
- package/dist/dedup/mdsection/blocks.js +409 -0
- package/dist/dedup/mdsection/blocks.test.js +359 -0
- package/dist/dedup/mdsection/index.js +4 -0
- package/dist/dedup/mdsection/parse.js +21 -0
- package/dist/dedup/mdsection/section.js +234 -0
- package/dist/dedup/mdsection/section.test.js +221 -0
- package/dist/dedup/report/floatfmt.js +71 -0
- package/dist/dedup/report/floatfmt.test.js +42 -0
- package/dist/dedup/report/index.js +8 -0
- package/dist/dedup/report/quote.js +77 -0
- package/dist/dedup/report/quote.test.js +67 -0
- package/dist/dedup/report/text.js +251 -0
- package/dist/dedup/report/text.test.js +420 -0
- package/dist/dedup/report_types.js +8 -0
- package/dist/dedup/sectionid/index.js +1 -0
- package/dist/dedup/sectionid/sectionid.js +16 -0
- package/dist/dedup/sectionid/sectionid.test.js +49 -0
- package/dist/guard/api/errors.js +12 -0
- package/dist/guard/api/index.js +2 -0
- package/dist/guard/api/parser.js +81 -0
- package/dist/guard/api/parser.test.js +58 -0
- package/dist/guard/api/types.js +1 -0
- package/dist/guard/code/errors.js +16 -0
- package/dist/guard/code/index.js +2 -0
- package/dist/guard/code/parser.js +54 -0
- package/dist/guard/code/parser.test.js +111 -0
- package/dist/guard/code/types.js +6 -0
- package/dist/index.js +1 -0
- package/dist/index.test.js +5 -0
- package/dist/repo/boundary.js +92 -0
- package/dist/repo/boundary.test.js +65 -0
- package/dist/repo/errors.js +56 -0
- package/dist/repo/errors.test.js +85 -0
- package/dist/repo/exists.test.js +72 -0
- package/dist/repo/filename.js +46 -0
- package/dist/repo/filename.test.js +39 -0
- package/dist/repo/fs.js +53 -0
- package/dist/repo/index.js +7 -0
- package/dist/repo/overlay.js +36 -0
- package/dist/repo/overlay.test.js +80 -0
- package/dist/repo/repo.js +353 -0
- package/dist/repo/repo.test.js +255 -0
- package/dist/repo/testutil.js +27 -0
- package/dist/repo/write.test.js +125 -0
- package/dist/report/color.js +73 -0
- package/dist/report/index.js +1 -0
- package/dist/report/report.js +112 -0
- package/dist/report/report.test.js +368 -0
- package/dist/violation/index.js +1 -0
- package/dist/violation/types.js +22 -0
- package/dist/violation/types.test.js +70 -0
- package/package.json +48 -0
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Model is the Hugging Face model identifier the embedder loads.
 *
 * The `sentence-transformers/` prefix is load-bearing for parity against the
 * Python POC (fastembed wraps this same ONNX model). Changing this constant
 * invalidates the stored index — indexdb detects the mismatch and purges the
 * sections table automatically.
 */
const Model = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2";

/** Dimension is the length of every embedding vector produced by {@link Model}. */
const Dimension = 768;

export { Dimension, Model };
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
// Package embedder wraps a transformers.js feature-extraction pipeline to
|
|
2
|
+
// provide mean-pooled, L2-normalized embeddings from the
|
|
3
|
+
// sentence-transformers/paraphrase-multilingual-mpnet-base-v2 model.
|
|
4
|
+
//
|
|
5
|
+
// The embedder model is auto-downloaded on first use to a configurable cache
|
|
6
|
+
// directory (see NewOptions.cacheDir, the DOCGOV_MODEL_CACHE env var, or the
|
|
7
|
+
// default ~/.cache/docgov/models/<sanitized-model-name>/).
|
|
8
|
+
import { cacheDir } from "./cache.js";
|
|
9
|
+
import { Dimension, Model } from "./constants.js";
|
|
10
|
+
import { InferenceError } from "./errors.js";
|
|
11
|
+
import { Session } from "./session.js";
|
|
12
|
+
/**
 * Embedder is the public facade over a {@link Session}, which in turn wraps a
 * transformers.js feature-extraction pipeline.
 *
 * Instances are NOT safe for concurrent use; callers should serialize embed
 * calls (the indexer does this by design).
 */
export class Embedder {
    /** The underlying session; set to null once {@link close} has run. */
    sess;

    /** Wraps an already-initialised session. Prefer {@link newEmbedder}. */
    constructor(sess) {
        this.sess = sess;
    }

    /**
     * newEmbedder builds a ready-to-use Embedder: the pipeline and tokenizer
     * are fully initialised before this resolves (no lazy initialization).
     * Callers must call {@link close} when done.
     *
     * If the model is not cached, this downloads it from Hugging Face Hub,
     * which can take 30+ seconds on first run.
     *
     * @param opts optional settings; `opts.cacheDir` is passed to `cacheDir()`
     *   (an empty string is passed when it is null or undefined).
     * @returns a promise for the new Embedder.
     */
    static async newEmbedder(opts = {}) {
        const modelDir = cacheDir(opts.cacheDir ?? "");
        const session = await Session.newSession(modelDir);
        return new Embedder(session);
    }

    /**
     * embed encodes texts into mean-pooled, L2-normalized float vectors of
     * length {@link Dimension}.
     *
     * @param texts the batch of strings to encode.
     * @returns one vector per input; an empty array for an empty batch.
     * @throws Error when the embedder has been closed (non-empty batches only).
     * @throws InferenceError wrapping, as `cause`, whatever the session threw.
     */
    async embed(texts) {
        // The empty-batch shortcut runs before the closed check, so an empty
        // batch yields [] even on a closed embedder.
        if (texts.length === 0) {
            return [];
        }
        const session = this.sess;
        if (session === null) {
            throw new Error("embedder.embed: embedder is closed");
        }
        let vectors;
        try {
            vectors = await session.embed(texts);
        }
        catch (cause) {
            throw new InferenceError(undefined, { cause });
        }
        return vectors;
    }

    /** name returns the model identifier ({@link Model}). */
    name() {
        return Model;
    }

    /** dimension returns the output embedding dimension ({@link Dimension}). */
    dimension() {
        return Dimension;
    }

    /**
     * close releases the session held by the embedder. Idempotent: a second
     * close is a no-op. After close, embed throws for any non-empty batch.
     */
    async close() {
        const session = this.sess;
        if (session === null) {
            return;
        }
        // Drop the reference before awaiting so a re-entrant close is a no-op.
        this.sess = null;
        await session.destroy();
    }
}
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
// Behavior tests for Embedder, the public facade over Session, with the model
|
|
2
|
+
// mocked so no ~1GB download happens. The real-model parity/golden gate stays
|
|
3
|
+
// in embedder.test.ts (DOCGOV_EMBED_E2E) and is intentionally untouched.
|
|
4
|
+
//
|
|
5
|
+
// vi.mock is hoisted above the imports and intercepts the dynamic
|
|
6
|
+
// `await import("@huggingface/transformers")` inside session.ts, so
|
|
7
|
+
// Embedder.newEmbedder runs against the fake pipeline below.
|
|
8
|
+
import { mkdtempSync, rmSync } from "node:fs";
|
|
9
|
+
import { tmpdir } from "node:os";
|
|
10
|
+
import { join } from "node:path";
|
|
11
|
+
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
|
12
|
+
import { Dimension, Model } from "./constants.js";
|
|
13
|
+
import { Embedder } from "./embedder.js";
|
|
14
|
+
import { InferenceError } from "./errors.js";
|
|
15
|
+
// mockState is hoisted, mutable state shared between the tests and the mocked
// transformers.js module below.
const mockState = vi.hoisted(() => ({
    // inferThrows: when set, the pipe rejects at inference time, which
    // Embedder.embed must wrap as InferenceError.
    inferThrows: null,
    // env stands in for the transformers.js `env` object exported by the mock.
    env: { cacheDir: "" },
}));
|
|
23
|
+
/**
 * l2row returns a length-`dim` vector of zeros with a single 1 at index
 * `i % dim`, so it is unit-norm and varies with `i`.
 */
function l2row(i, dim) {
    const hot = i % dim;
    const vec = Array(dim).fill(0);
    vec[hot] = 1;
    return vec;
}
|
|
28
|
+
// Fake "@huggingface/transformers": `pipeline()` resolves to a pipe that either
// rejects with mockState.inferThrows or returns one l2row per input text.
vi.mock("@huggingface/transformers", () => {
    const env = mockState.env;
    const pipeline = async (_task, _model) => async (texts, _opts) => {
        if (mockState.inferThrows) {
            throw mockState.inferThrows;
        }
        const rows = texts.map((_text, idx) => l2row(idx, Dimension));
        const data = new Float32Array(rows.flat());
        return {
            data,
            dims: [texts.length, Dimension],
            tolist: () => rows,
        };
    };
    return { pipeline, env };
});
|
|
44
|
+
// Temp dirs handed out during the current test; emptied and removed in afterEach.
const tmpDirs = [];

/**
 * newCacheDir creates a fresh "embedder-cache-*" directory under the OS temp
 * dir, records it in tmpDirs for cleanup, and returns its path.
 */
function newCacheDir() {
    const prefix = join(tmpdir(), "embedder-cache-");
    const created = mkdtempSync(prefix);
    tmpDirs.push(created);
    return created;
}
|
|
50
|
+
// Start every test from a clean mock: no forced inference failure, no cache dir.
beforeEach(() => {
    mockState.env.cacheDir = "";
    mockState.inferThrows = null;
});

// Remove every temp dir handed out by newCacheDir during the test. The list is
// emptied first, then each directory is deleted.
afterEach(() => {
    const doomed = tmpDirs.splice(0);
    doomed.forEach((dir) => {
        rmSync(dir, { recursive: true, force: true });
    });
});
|
|
58
|
+
describe("Embedder", () => {
    // withEmbedder builds an embedder on a fresh cache dir, hands it to `body`,
    // and closes it afterwards even when an assertion inside `body` fails.
    const withEmbedder = async (body) => {
        const emb = await Embedder.newEmbedder({ cacheDir: newCacheDir() });
        try {
            await body(emb);
        }
        finally {
            await emb.close();
        }
    };

    // WHY: newEmbedder must fully initialise (no lazy init) and expose a usable
    // embedder; the dedup Index path depends on this returning a ready object.
    it("newEmbedder returns a ready embedder reporting the pinned name/dimension", async () => {
        await withEmbedder((emb) => {
            // WHY: indexdb keys the stored index on name()+dimension(); they must
            // echo the pinned constants exactly or a stored index is silently
            // invalidated.
            expect(emb.name()).toBe(Model);
            expect(emb.dimension()).toBe(Dimension);
        });
    });

    // WHY: an empty batch must short-circuit to [] without touching the session —
    // the indexer relies on this to skip work for empty inputs.
    it("embed returns [] for an empty batch", async () => {
        await withEmbedder(async (emb) => {
            expect(await emb.embed([])).toEqual([]);
        });
    });

    // WHY: the success path must yield one Dimension-vector per input, in order —
    // the shape the indexer persists and the analyzer compares.
    it("embed returns one Dimension-vector per input", async () => {
        await withEmbedder(async (emb) => {
            const vecs = await emb.embed(["a", "b"]);
            expect(vecs).toHaveLength(2);
            vecs.forEach((vec) => {
                expect(vec).toHaveLength(Dimension);
            });
        });
    });

    // WHY: inference failures must be wrapped as the matchable InferenceError
    // sentinel (with the original cause attached), not leak a raw transformers
    // error — the CLI distinguishes inference failures from other errors by type.
    it("embed wraps an inference failure as InferenceError", async () => {
        await withEmbedder(async (emb) => {
            const cause = new Error("onnx runtime exploded");
            // Stays set for both calls below; beforeEach resets it for the next test.
            mockState.inferThrows = cause;
            await expect(emb.embed(["a"])).rejects.toBeInstanceOf(InferenceError);
            await expect(emb.embed(["a"])).rejects.toMatchObject({ cause });
        });
    });

    // WHY: after close, embed must throw loudly — reusing a closed embedder is a
    // lifecycle bug; silently returning nothing would corrupt the index.
    it("embed throws after close", async () => {
        const emb = await Embedder.newEmbedder({ cacheDir: newCacheDir() });
        await emb.close();
        await expect(emb.embed(["a"])).rejects.toThrow(/embedder is closed/);
    });

    // WHY: the indexer's defer/cleanup may close twice; a second close must be a
    // no-op so cleanup never throws.
    it("close is idempotent", async () => {
        const emb = await Embedder.newEmbedder({ cacheDir: newCacheDir() });
        await emb.close();
        await expect(emb.close()).resolves.toBeUndefined();
    });
});
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
import { readFile, rm } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { fileURLToPath } from "node:url";
import { afterAll, afterEach, beforeEach, describe, expect, it, test } from "vitest";
import { Dimension, Embedder, Model } from "./index.js";
|
|
7
|
+
// WHY (Go embedder_test.go: TestConstants): cross-POC parity depends on both the
// sentence-transformers/ prefix and the 768 dimension. If either drifts, every
// stored index is silently invalidated.
test("Model and Dimension constants are pinned for parity", () => {
    const hasPrefix = Model.startsWith("sentence-transformers/");
    expect(Model).not.toBe("");
    expect(hasPrefix).toBe(true);
    expect(Dimension).toBe(768);
});
|
|
15
|
+
/** l2norm returns the Euclidean (L2) norm of the vector `v`. */
function l2norm(v) {
    let squares = 0;
    for (let k = 0; k < v.length; k += 1) {
        squares += v[k] * v[k];
    }
    return Math.sqrt(squares);
}
|
|
21
|
+
/**
 * cosine returns the cosine similarity of two vectors: their dot product
 * divided by the product of their L2 norms.
 *
 * Dividing by the norms makes the result a true cosine even when an input is
 * not exactly unit-length; for unit vectors it equals the plain dot product up
 * to floating-point rounding.
 *
 * @param a first vector.
 * @param b second vector.
 * @returns the similarity in [-1, 1]; 0 when the lengths differ or when either
 *   vector has zero norm.
 */
function cosine(a, b) {
    if (a.length !== b.length) {
        return 0;
    }
    let dot = 0;
    let sqA = 0;
    let sqB = 0;
    for (let i = 0; i < a.length; i++) {
        // Missing entries count as 0.
        const x = a[i] ?? 0;
        const y = b[i] ?? 0;
        dot += x * y;
        sqA += x * x;
        sqB += y * y;
    }
    const denom = Math.sqrt(sqA) * Math.sqrt(sqB);
    // A zero vector has no direction; report 0 rather than NaN.
    return denom === 0 ? 0 : dot / denom;
}
|
|
29
|
+
// E2E inference is gated by DOCGOV_EMBED_E2E because it downloads the model
// (~1GB). Bit-parity vs Go is already proven (maxAbsDiff 3.6e-8 in the spike),
// so default CI runs only the fast unit suites above and below.
const e2e = process.env.DOCGOV_EMBED_E2E ? describe : describe.skip;
e2e("real inference (DOCGOV_EMBED_E2E)", () => {
    // One cache dir for the whole suite, so the model is fetched at most once
    // per run instead of once per test. It is removed again in afterAll.
    const dir = join(tmpdir(), `docgov-embed-${process.pid}-${Date.now()}`);
    let emb;

    // Each test gets its own embedder (the idempotent-close test closes it early).
    beforeEach(async () => {
        emb = await Embedder.newEmbedder({ cacheDir: dir });
    });
    afterEach(async () => {
        if (emb) {
            await emb.close();
        }
    });
    // Do not leave the downloaded model behind in the OS temp dir.
    afterAll(async () => {
        await rm(dir, { recursive: true, force: true });
    });

    // WHY (TestNew_NameAndDimension): callers (indexdb) key the stored index on
    // name()+dimension(); they must echo the pinned constants exactly.
    it("reports the pinned name and dimension", () => {
        expect(emb.name()).toBe(Model);
        expect(emb.dimension()).toBe(Dimension);
    });

    // WHY (TestEmbed_OutputShape): downstream cosine math assumes one 768-vec per
    // input, in order. A batch reshape bug would scramble section identity.
    it("returns one 768-vector per input, in order", async () => {
        const texts = ["hello world", "test sentence"];
        const vecs = await emb.embed(texts);
        expect(vecs).toHaveLength(texts.length);
        for (const v of vecs) {
            expect(v).toHaveLength(Dimension);
        }
    });

    // WHY (TestEmbed_L2Normalized): the index compares vectors by dot product as
    // a stand-in for cosine; that identity only holds if vectors are unit-norm.
    // Includes a CJK string to lock multilingual normalization.
    it("produces L2-normalized vectors (incl. CJK input)", async () => {
        const vecs = await emb.embed([
            "The quick brown fox jumps over the lazy dog",
            "補貨單流程",
        ]);
        for (const v of vecs) {
            expect(Math.abs(l2norm(v) - 1)).toBeLessThan(1e-4);
        }
    });

    // WHY (TestEmbed_EmptyBatch): the indexer may pass an empty batch; it must
    // get an empty result, not an error or a stray vector.
    it("returns an empty result for an empty batch", async () => {
        expect(await emb.embed([])).toEqual([]);
    });

    // WHY (TestEmbed_SentinelStability): asserts the embedder reproduces the
    // committed golden vectors (cosine >= 0.9999). This is the parity anchor —
    // the goldens come from the Go embedder, so this is the cross-port check.
    it("matches the committed golden vectors (cosine >= 0.9999)", async () => {
        const goldenPath = join(fileURLToPath(new URL(".", import.meta.url)), "..", "..", "..", "..", "internal", "dedup", "embedder", "testdata", "sentinel_vectors.json");
        const entries = JSON.parse(await readFile(goldenPath, "utf8"));
        expect(entries.length).toBeGreaterThan(0);
        const vecs = await emb.embed(entries.map((e) => e.text));
        entries.forEach((e, i) => {
            const got = vecs[i];
            expect(got).toBeDefined();
            expect(cosine(got, e.vector)).toBeGreaterThanOrEqual(0.9999);
        });
    });

    // WHY (TestClose_Idempotent): the indexer's defer/cleanup may close twice; a
    // second close must not throw.
    it("close is idempotent", async () => {
        await emb.close();
        await expect(emb.close()).resolves.toBeUndefined();
    });
});
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
// Sentinel errors for the embedder package.
|
|
2
|
+
//
|
|
3
|
+
// In Go these are `var ErrModelDownload = errors.New(...)` sentinels matched
|
|
4
|
+
// with errors.Is. In TS they are Error subclasses so callers can match with
|
|
5
|
+
// `instanceof`. The underlying cause is attached via the standard Error
|
|
6
|
+
// `cause` option (mirroring Go's %v inner message in the formatted string).
|
|
7
|
+
/**
 * ModelDownloadError (Go: ErrModelDownload) signals that the embedding model
 * could not be downloaded. Pass `{ cause }` in `options` to keep the underlying
 * error on the standard `cause` slot.
 */
export class ModelDownloadError extends Error {
    // Own `name` so logs show the subclass instead of "Error".
    name = "ModelDownloadError";

    constructor(message = "embedder: model download failed", options) {
        super(message, options);
    }
}
|
|
14
|
+
/**
 * InferenceError (Go: ErrInference) signals that running the model failed.
 * Pass `{ cause }` in `options` to keep the underlying error on the standard
 * `cause` slot.
 */
export class InferenceError extends Error {
    // Own `name` so logs show the subclass instead of "Error".
    name = "InferenceError";

    constructor(message = "embedder: inference failed", options) {
        super(message, options);
    }
}
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import { expect, test } from "vitest";
|
|
2
|
+
import { InferenceError, ModelDownloadError } from "./errors.js";
|
|
3
|
+
// WHY (Go sentinel_test.go: TestSentinels_NonNil): a nil/undefined sentinel
// would silently break every error match. The TS analogue is that each error
// class can be constructed and matched with instanceof.
test("sentinel error classes are constructible and instanceof-matchable", () => {
    for (const Ctor of [ModelDownloadError, InferenceError]) {
        const err = new Ctor();
        expect(err).toBeInstanceOf(Ctor);
        expect(err).toBeInstanceOf(Error);
    }
});

// WHY (Go embedder.go default messages): the default text mirrors Go's
// errors.New strings so log output reads the same across the two ports.
test("sentinel errors carry their Go-equivalent default messages", () => {
    const download = new ModelDownloadError();
    const inference = new InferenceError();
    expect(download.message).toBe("embedder: model download failed");
    expect(inference.message).toBe("embedder: inference failed");
});

// WHY (Go sentinel_test.go: TestEmbed_ErrInferenceIsReachable): the Go code
// wraps with "%w: %v", so errors.Is(err, ErrInference) is true while the inner
// cause is NOT separately errors.Is-comparable (it lives only in the message).
// The faithful TS contract: the thrown error is `instanceof InferenceError`
// (the sentinel stays matchable) AND the underlying cause is preserved on
// `.cause` for diagnostics — distinct from the sentinel identity.
test("InferenceError stays matchable while preserving the wrapped cause", () => {
    const cause = new Error("ort runtime: session failed");
    const wrapped = new InferenceError(undefined, { cause });
    // Sentinel remains matchable (Go: errors.Is(err, ErrInference) == true).
    expect(wrapped).toBeInstanceOf(InferenceError);
    // The wrapper is not the cause itself (Go: errors.Is(err, cause) == false),
    // but the cause is retained for diagnostics on the standard .cause slot.
    expect(wrapped.cause).toBe(cause);
    expect(wrapped).not.toBe(cause);
});
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
import { mkdir } from "node:fs/promises";
|
|
2
|
+
import { Dimension, Model } from "./constants.js";
|
|
3
|
+
import { ModelDownloadError } from "./errors.js";
|
|
4
|
+
/**
 * Session wraps a transformers.js feature-extraction pipeline.
 *
 * It is the TS analogue of Go's hugotSession: it owns the loaded model and
 * exposes embed/destroy. Isolated here so the public Embedder type stays thin.
 */
export class Session {
    /** The loaded pipeline callable; set to null once {@link destroy} has run. */
    pipe;

    /** Wraps an already-loaded pipeline. Prefer {@link newSession}. */
    constructor(pipe) {
        this.pipe = pipe;
    }

    /**
     * newSession downloads (if needed) and initialises the pipeline.
     *
     * The model is auto-downloaded on first use into modelCacheDir. This can
     * take 30+ seconds on first run. Mirrors Go's newHugotSession including the
     * cache-dir creation and a dry-run inference to verify initialisation.
     *
     * @param modelCacheDir directory the model files are cached in; created
     *   (recursively) when missing.
     * @returns a promise for a Session whose pipeline has already run once.
     * @throws ModelDownloadError when creating the pipeline fails (the original
     *   error is kept as `cause`).
     * @throws whatever the dry-run inference throws; the pipeline is disposed
     *   first so a half-initialised model is not left loaded.
     */
    static async newSession(modelCacheDir) {
        // Ensure cache dir exists (Go: os.MkdirAll, 0o755).
        await mkdir(modelCacheDir, { recursive: true });
        // Point transformers.js at our cache dir before loading. Setting it on
        // the shared `env` is how transformers.js controls where HF models land.
        const { pipeline, env } = await import("@huggingface/transformers");
        env.cacheDir = modelCacheDir;
        let pipe;
        try {
            pipe = await pipeline("feature-extraction", Model);
        }
        catch (cause) {
            throw new ModelDownloadError(undefined, { cause });
        }
        // Dry-run inference to verify the pipeline is fully initialized.
        try {
            await pipe(["init"], { pooling: "mean", normalize: true });
        }
        catch (err) {
            // No Session will ever own this pipeline, so release it here.
            try {
                await pipe?.dispose?.();
            }
            catch {
                // Best-effort cleanup: the dry-run failure is the error to report.
            }
            throw err;
        }
        return new Session(pipe);
    }

    /**
     * embed runs inference and returns mean-pooled, L2-normalized vectors.
     *
     * @param texts the batch of strings to encode.
     * @returns one vector per input; an empty array for an empty batch.
     * @throws Error when the session has been destroyed (non-empty batches only).
     */
    async embed(texts) {
        if (texts.length === 0) {
            return [];
        }
        if (this.pipe === null) {
            throw new Error("embedder.Session.embed: session is closed");
        }
        const out = await this.pipe(texts, { pooling: "mean", normalize: true });
        return reshape(out, texts.length);
    }

    /**
     * destroy drops the pipeline reference and, when the pipeline exposes
     * dispose(), awaits it. Safe to call more than once.
     */
    async destroy() {
        const pipe = this.pipe;
        this.pipe = null;
        if (pipe?.dispose) {
            await pipe.dispose();
        }
    }
}
|
|
60
|
+
/**
 * reshape turns the feature-extraction output into a number[][] by calling its
 * tolist(), then checks the result so a model/pooling mismatch fails loudly
 * rather than silently producing the wrong shape.
 *
 * @param out pipeline output exposing tolist().
 * @param want expected number of vectors (the batch size).
 * @returns the nested array returned by tolist().
 * @throws Error when the vector count differs from `want`, or when any vector
 *   is missing or not of length Dimension.
 */
function reshape(out, want) {
    const vectors = out.tolist();
    const got = vectors.length;
    if (got !== want) {
        throw new Error(`embedder: expected ${want} vectors, got ${got}`);
    }
    // First row that is missing or has the wrong width, if any.
    const bad = vectors.findIndex((vec) => vec === undefined || vec.length !== Dimension);
    if (bad !== -1) {
        throw new Error(`embedder: vector ${bad} has length ${vectors[bad]?.length ?? 0}, want ${Dimension}`);
    }
    return vectors;
}
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
// Behavior tests for Session, the transformers.js pipeline wrapper, with the
|
|
2
|
+
// model mocked so no ~1GB download happens. The real-model parity gate lives in
|
|
3
|
+
// embedder.test.ts (DOCGOV_EMBED_E2E) and is intentionally untouched.
|
|
4
|
+
//
|
|
5
|
+
// vi.mock is hoisted above the imports and intercepts BOTH the static and the
|
|
6
|
+
// dynamic `await import("@huggingface/transformers")` that session.ts performs,
|
|
7
|
+
// so Session.newSession runs entirely against the fake pipeline below.
|
|
8
|
+
import { existsSync } from "node:fs";
|
|
9
|
+
import { mkdtempSync, rmSync } from "node:fs";
|
|
10
|
+
import { tmpdir } from "node:os";
|
|
11
|
+
import { join } from "node:path";
|
|
12
|
+
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
|
13
|
+
import { Dimension } from "./constants.js";
|
|
14
|
+
import { ModelDownloadError } from "./errors.js";
|
|
15
|
+
import { Session } from "./session.js";
|
|
16
|
+
// --- mock state ------------------------------------------------------------
// mockState is hoisted, mutable state shared with the mocked module so each
// test can steer the fake pipeline (rows it returns, whether it throws, whether
// it has dispose()).
const mockState = vi.hoisted(() => ({
    // pipelineThrows: when set, the mocked `pipeline()` factory rejects, which
    // is how session.ts surfaces a ModelDownloadError.
    pipelineThrows: null,
    // rowCount/rowDim override the shape the pipe returns, to drive reshape's
    // batch-count and per-row dimension guards. null = match the request.
    rowCount: null,
    rowDim: null,
    // dispose is attached to the returned pipe when set, to exercise destroy().
    dispose: null,
    // capturedCacheDir records env.cacheDir at pipeline() time so we can assert
    // newSession pointed transformers.js at our temp dir before loading.
    capturedCacheDir: "",
    // env stands in for the transformers.js `env` object exported by the mock.
    env: { cacheDir: "" },
}));
|
|
36
|
+
// l2row builds a deterministic, L2-normalized vector of length `dim`: all zeros
// except a single 1 at index `i % dim`. One non-zero entry keeps it unit-norm,
// and the position varies with `i` so distinct inputs get distinct vectors
// (matters for the dedup Index drive-through).
function l2row(i, dim) {
    const hot = i % dim;
    const vec = Array(dim).fill(0);
    vec[hot] = 1;
    return vec;
}
|
|
44
|
+
// Fake "@huggingface/transformers": `pipeline()` either rejects with
// mockState.pipelineThrows or resolves to a pipe whose output shape can be
// overridden through mockState.rowCount / mockState.rowDim.
vi.mock("@huggingface/transformers", () => {
    const env = mockState.env;
    const pipeline = async (_task, _model) => {
        if (mockState.pipelineThrows) {
            throw mockState.pipelineThrows;
        }
        // Record the cache dir as configured by newSession (env.cacheDir is set
        // between import and the pipeline() call).
        mockState.capturedCacheDir = env.cacheDir;
        const pipe = async (texts, _opts) => {
            const count = mockState.rowCount ?? texts.length;
            const width = mockState.rowDim ?? Dimension;
            const rows = Array.from({ length: count }, (_unused, idx) => l2row(idx, width));
            return {
                data: new Float32Array(rows.flat()),
                dims: [count, width],
                tolist: () => rows,
            };
        };
        // Only expose dispose() when a test asked for it.
        if (mockState.dispose) {
            pipe.dispose = mockState.dispose;
        }
        return pipe;
    };
    return { pipeline, env };
});
|
|
74
|
+
// --- temp dirs -------------------------------------------------------------
// Paths handed out by newCacheDir(); drained and removed in afterEach.
const tmpDirs = [];

/**
 * Creates a fresh directory with the "session-cache-" prefix under tmpdir(),
 * records it in tmpDirs for cleanup, and returns its path.
 *
 * @returns {string} Path of the newly created directory.
 */
function newCacheDir() {
    const created = mkdtempSync(join(tmpdir(), "session-cache-"));
    tmpDirs.push(created);
    return created;
}
|
|
81
|
+
// Restore every mock knob to its default so no test inherits another's setup.
beforeEach(() => {
    Object.assign(mockState, {
        pipelineThrows: null,
        rowCount: null,
        rowDim: null,
        dispose: null,
        capturedCacheDir: "",
    });
    // Mutate env in place rather than replacing it: the mocked module holds a
    // reference to this exact object.
    mockState.env.cacheDir = "";
});
|
|
89
|
+
// Delete every temp dir created during the test and empty the registry.
afterEach(() => {
    // splice(0) empties tmpDirs and hands back what it held.
    const doomed = tmpDirs.splice(0);
    doomed.forEach((dir) => {
        rmSync(dir, { recursive: true, force: true });
    });
});
|
|
93
|
+
describe("Session.newSession", () => {
    // WHY: newSession has two jobs before it loads a model: create the cache
    // dir, and point transformers.js at it, so models land in the caller's
    // chosen dir rather than the host cache. Skipping the env.cacheDir step
    // would race CI on a shared cache, so both halves are asserted: the dir
    // exists, and the pipeline factory saw that exact path.
    it("creates the cache dir and sets env.cacheDir before loading", async () => {
        const cacheRoot = newCacheDir();
        const modelsDir = join(cacheRoot, "nested", "models");

        const session = await Session.newSession(modelsDir);

        expect(existsSync(modelsDir)).toBe(true);
        expect(mockState.capturedCacheDir).toBe(modelsDir);
        // The returned session is usable (the dry-run inference already proved init).
        const vectors = await session.embed(["x"]);
        expect(vectors).toHaveLength(1);
    });

    // WHY: a download/init failure has to reach callers as the matchable
    // ModelDownloadError sentinel rather than a raw transformers error — the
    // CLI keys its "could not download the model" guidance on this type.
    it("throws ModelDownloadError when the pipeline factory fails", async () => {
        mockState.pipelineThrows = new Error("network down");
        const pending = Session.newSession(newCacheDir());
        await expect(pending).rejects.toBeInstanceOf(ModelDownloadError);
    });
});
|
|
115
|
+
describe("Session.embed", () => {
    // WHY: the indexer may hand an empty batch; it must get [] back with no call
    // into the pipeline — a stray vector or an error would corrupt the index.
    it("returns [] for an empty batch", async () => {
        const sess = await Session.newSession(newCacheDir());
        // Arm the fake pipe (after init, so the load itself is unaffected) to
        // emit one row no matter what it is given. Without this the mock
        // returns zero rows for zero inputs and the test could not tell a
        // short-circuit from a real pipe call. With it, a pipe call shows up
        // as a stray vector or as the batch-count error exercised below.
        mockState.rowCount = 1;
        expect(await sess.embed([])).toEqual([]);
    });

    // WHY: the core contract — one Dimension-length vector per input, in order.
    // Downstream cosine math assumes this exact shape and ordering.
    it("returns one Dimension-vector per input", async () => {
        const sess = await Session.newSession(newCacheDir());
        const vecs = await sess.embed(["a", "b", "c"]);
        expect(vecs).toHaveLength(3);
        for (const v of vecs) {
            expect(v).toHaveLength(Dimension);
        }
    });

    // WHY: embedding after destroy must fail loudly rather than silently
    // returning nothing — a closed session reused by mistake is a bug to surface.
    it("throws when the session is closed", async () => {
        const sess = await Session.newSession(newCacheDir());
        await sess.destroy();
        await expect(sess.embed(["a"])).rejects.toThrow(/session is closed/);
    });

    // WHY: a pooling/model mismatch that yields the wrong number of rows must
    // fail loudly — silently returning the wrong count would scramble section
    // identity in the index.
    it("throws when the pipeline returns the wrong batch count", async () => {
        const sess = await Session.newSession(newCacheDir());
        mockState.rowCount = 1; // request 2, get 1
        await expect(sess.embed(["a", "b"])).rejects.toThrow(/expected 2 vectors, got 1/);
    });

    // WHY: a per-row dimension mismatch (wrong model / no mean-pooling) must also
    // fail loudly; a row even one element off Dimension would silently break
    // the index schema.
    it("throws when a row has the wrong dimension", async () => {
        const sess = await Session.newSession(newCacheDir());
        mockState.rowDim = Dimension - 1;
        await expect(sess.embed(["a"])).rejects.toThrow(new RegExp(`length ${Dimension - 1}, want ${Dimension}`));
    });
});
|
|
154
|
+
describe("Session.destroy", () => {
    // WHY: when the backend exposes dispose() (releasing the ONNX session),
    // destroy has to call it, otherwise long-lived processes leak native memory.
    it("invokes dispose() when the pipeline exposes it", async () => {
        const disposeSpy = vi.fn(async () => { });
        mockState.dispose = disposeSpy;

        const session = await Session.newSession(newCacheDir());
        await session.destroy();

        expect(disposeSpy).toHaveBeenCalledTimes(1);
    });

    // WHY: not every backend exposes dispose(); treating it as mandatory would
    // crash cleanup, so destroy has to tolerate its absence.
    it("is a no-op-safe when the pipeline has no dispose()", async () => {
        const session = await Session.newSession(newCacheDir());

        const firstDestroy = session.destroy();
        await expect(firstDestroy).resolves.toBeUndefined();

        // Destroying again, once the reference is cleared, must not throw either.
        const secondDestroy = session.destroy();
        await expect(secondDestroy).resolves.toBeUndefined();
    });
});
|