@mailwoman/neural 2.0.6 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/browser.d.ts +16 -0
- package/out/browser.d.ts.map +1 -0
- package/out/browser.js +15 -0
- package/out/browser.js.map +1 -0
- package/out/classifier.d.ts +84 -10
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +122 -18
- package/out/classifier.js.map +1 -1
- package/out/index.d.ts +4 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +2 -0
- package/out/index.js.map +1 -1
- package/out/labels.d.ts +27 -6
- package/out/labels.d.ts.map +1 -1
- package/out/labels.js +30 -6
- package/out/labels.js.map +1 -1
- package/out/proposal-classifier.d.ts +5 -1
- package/out/proposal-classifier.d.ts.map +1 -1
- package/out/proposal-classifier.js +2 -2
- package/out/proposal-classifier.js.map +1 -1
- package/out/query-shape-prior.d.ts +62 -0
- package/out/query-shape-prior.d.ts.map +1 -0
- package/out/query-shape-prior.js +93 -0
- package/out/query-shape-prior.js.map +1 -0
- package/out/tokenizer.d.ts +6 -1
- package/out/tokenizer.d.ts.map +1 -1
- package/out/tokenizer.js +8 -3
- package/out/tokenizer.js.map +1 -1
- package/out/viterbi.d.ts +76 -0
- package/out/viterbi.d.ts.map +1 -0
- package/out/viterbi.js +163 -0
- package/out/viterbi.js.map +1 -0
- package/out/weights.d.ts +18 -0
- package/out/weights.d.ts.map +1 -1
- package/out/weights.js +47 -3
- package/out/weights.js.map +1 -1
- package/package.json +6 -3
package/out/browser.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Browser-safe re-export surface. Excludes `./onnx-runner.js` + `./weights.js` (Node-only — they
|
|
7
|
+
* statically reference `onnxruntime-node` + `node:fs`), the dynamic `loadFromWeights` /
|
|
8
|
+
* `loadFromFile` paths from those modules guard the corresponding imports with `webpackIgnore` so
|
|
9
|
+
* Node callers still get them via the main `@mailwoman/neural` entry without bundling them into a
|
|
10
|
+
* browser graph.
|
|
11
|
+
*/
|
|
12
|
+
export * from "./classifier.js";
|
|
13
|
+
export * from "./labels.js";
|
|
14
|
+
export * from "./tokenizer.js";
|
|
15
|
+
export type { InferResult } from "./onnx-runner.js";
|
|
16
|
+
//# sourceMappingURL=browser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser.d.ts","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA;AAG9B,YAAY,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA"}
|
package/out/browser.js
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Browser-safe re-export surface. Excludes `./onnx-runner.js` + `./weights.js` (Node-only — they
|
|
7
|
+
* statically reference `onnxruntime-node` + `node:fs`), the dynamic `loadFromWeights` /
|
|
8
|
+
* `loadFromFile` paths from those modules guard the corresponding imports with `webpackIgnore` so
|
|
9
|
+
* Node callers still get them via the main `@mailwoman/neural` entry without bundling them into a
|
|
10
|
+
* browser graph.
|
|
11
|
+
*/
|
|
12
|
+
export * from "./classifier.js";
|
|
13
|
+
export * from "./labels.js";
|
|
14
|
+
export * from "./tokenizer.js";
|
|
15
|
+
//# sourceMappingURL=browser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser.js","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA"}
|
package/out/classifier.d.ts
CHANGED
|
@@ -9,19 +9,55 @@
|
|
|
9
9
|
*
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
|
-
import { type AddressTree, type ComponentTag
|
|
13
|
-
import {
|
|
12
|
+
import { decodeAsXml, type AddressTree, type ComponentTag } from "@mailwoman/core/decoder";
|
|
13
|
+
import type { InferResult } from "./onnx-runner.js";
|
|
14
|
+
import { type QueryShapeLike } from "./query-shape-prior.js";
|
|
14
15
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
15
|
-
import {
|
|
16
|
+
import type { ResolveWeightsOpts } from "./weights.js";
|
|
17
|
+
/**
|
|
18
|
+
* Structural type the classifier needs from a runner. Lets callers swap the Node-side `OnnxRunner`
|
|
19
|
+
* for a browser-side runner (e.g. `@mailwoman/neural-web`'s `WebOnnxRunner`) without inheritance —
|
|
20
|
+
* the classifier only ever calls `infer(ids)`.
|
|
21
|
+
*/
|
|
22
|
+
export interface NeuralRunner {
|
|
23
|
+
infer(tokenIds: number[]): Promise<InferResult>;
|
|
24
|
+
}
|
|
16
25
|
export interface NeuralAddressClassifierConfig {
|
|
17
26
|
tokenizer: MailwomanTokenizer;
|
|
18
|
-
runner:
|
|
19
|
-
/**
|
|
27
|
+
runner: NeuralRunner;
|
|
28
|
+
/**
|
|
29
|
+
* Label vocabulary in the order the model emits them. Defaults to Stage 2 (v0.3.0). Stage 2
|
|
30
|
+
* strictly extends Stage 1 at the same indices, so a v0.2.0 Stage 1 model loaded with this
|
|
31
|
+
* default still decodes correctly — its emissions only span the first 15 entries.
|
|
32
|
+
*/
|
|
20
33
|
labels?: readonly string[];
|
|
34
|
+
/**
|
|
35
|
+
* Decoding strategy:
|
|
36
|
+
*
|
|
37
|
+
* - `"viterbi"` (default) — linear-chain CRF Viterbi with the BIO structural mask. Prevents
|
|
38
|
+
* orphan-`I-*` sequences. If `transitions` is provided, uses learned scores on top.
|
|
39
|
+
* - `"argmax"` — per-token argmax. Faster but produces structurally invalid sequences. Use only for
|
|
40
|
+
* debugging / comparison.
|
|
41
|
+
*/
|
|
42
|
+
decode?: "viterbi" | "argmax";
|
|
43
|
+
/**
|
|
44
|
+
* Optional learned CRF transition scores. Square matrix of size `labels.length × labels.length`.
|
|
45
|
+
* Added on top of the structural BIO mask. Future weights releases ship this; today's v3.0.0
|
|
46
|
+
* weights don't, so the structural mask alone is used.
|
|
47
|
+
*/
|
|
48
|
+
transitions?: number[][];
|
|
49
|
+
/** Optional learned start-of-sequence transition scores per label. */
|
|
50
|
+
startTransitions?: number[];
|
|
51
|
+
/** Optional learned end-of-sequence transition scores per label. */
|
|
52
|
+
endTransitions?: number[];
|
|
21
53
|
}
|
|
22
54
|
export declare class NeuralAddressClassifier {
|
|
23
55
|
private readonly cfg;
|
|
24
56
|
private readonly labels;
|
|
57
|
+
private readonly decodeMode;
|
|
58
|
+
private readonly transitions;
|
|
59
|
+
private readonly startTransitions;
|
|
60
|
+
private readonly endTransitions;
|
|
25
61
|
constructor(cfg: NeuralAddressClassifierConfig);
|
|
26
62
|
/**
|
|
27
63
|
* One-call factory that resolves the weights package (or explicit paths), loads the tokenizer and
|
|
@@ -29,12 +65,50 @@ export declare class NeuralAddressClassifier {
|
|
|
29
65
|
*
|
|
30
66
|
* Resolution order: explicit paths in `opts` → `@mailwoman/neural-weights-<locale>` package →
|
|
31
67
|
* throws a single actionable error.
|
|
68
|
+
*
|
|
69
|
+
* **Node-only.** The dynamic imports keep `OnnxRunner` (onnxruntime-node) + `resolveWeights`
|
|
70
|
+
* (uses Node fs) out of the static dependency graph, so this file can be bundled for the browser
|
|
71
|
+
* by `@mailwoman/neural-web`. Calling this method in a browser will throw at runtime — use
|
|
72
|
+
* `loadNeuralClassifierFromUrls` from `@mailwoman/neural-web` instead.
|
|
32
73
|
*/
|
|
33
74
|
static loadFromWeights(opts?: ResolveWeightsOpts): Promise<NeuralAddressClassifier>;
|
|
34
|
-
/** Tokenize → infer → argmax
|
|
35
|
-
parse(text: string): Promise<AddressTree>;
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
75
|
+
/** Tokenize → infer → Viterbi (or argmax) → decoder tree. */
|
|
76
|
+
parse(text: string, opts?: ParseOpts): Promise<AddressTree>;
|
|
77
|
+
/**
|
|
78
|
+
* Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
|
|
79
|
+
* logit aggregation (Option C joint-reconcile integration).
|
|
80
|
+
*/
|
|
81
|
+
parseWithLogits(text: string, opts?: ParseOpts): Promise<ParseWithLogitsResult>;
|
|
82
|
+
parseJson(text: string, opts?: ParseOpts): Promise<Partial<Record<ComponentTag, string>>>;
|
|
83
|
+
parseTuples(text: string, opts?: ParseOpts): Promise<Array<[ComponentTag, string]>>;
|
|
84
|
+
parseXml(text: string, opts?: ParseOpts & {
|
|
85
|
+
xml?: Parameters<typeof decodeAsXml>[1];
|
|
86
|
+
}): Promise<string>;
|
|
87
|
+
}
|
|
88
|
+
/** Result of `parseWithLogits` — tree + raw material for per-span logit aggregation. */
|
|
89
|
+
export interface ParseWithLogitsResult {
|
|
90
|
+
tree: AddressTree;
|
|
91
|
+
logits: number[][];
|
|
92
|
+
pieces: Array<{
|
|
93
|
+
start: number;
|
|
94
|
+
end: number;
|
|
95
|
+
}>;
|
|
96
|
+
}
|
|
97
|
+
/**
|
|
98
|
+
* Per-call opts for `parse()`. Threading a precomputed `QueryShape` here turns on the soft-prior
|
|
99
|
+
* bias path in the Viterbi decoder (Stage 2.4 boundary → Stage 3 encoder integration).
|
|
100
|
+
*/
|
|
101
|
+
export interface ParseOpts {
|
|
102
|
+
/**
|
|
103
|
+
* Precomputed `QueryShape` for this input (from `@mailwoman/query-shape`'s `computeQueryShape`).
|
|
104
|
+
* Known-format hits in the shape produce additive emission biases toward the matching BIO label.
|
|
105
|
+
* Typed structurally — no runtime dependency on `@mailwoman/query-shape`.
|
|
106
|
+
*/
|
|
107
|
+
queryShape?: QueryShapeLike;
|
|
108
|
+
/**
|
|
109
|
+
* Maximum bias magnitude in log-odds units. Default 1.0 — adds up to ~e^1 ≈ 2.7× odds to the
|
|
110
|
+
* favored label. Confidence-scaled, so a 0.6-confidence format hit gets +0.6 max bias.
|
|
111
|
+
*/
|
|
112
|
+
queryShapeBiasScale?: number;
|
|
39
113
|
}
|
|
40
114
|
//# sourceMappingURL=classifier.d.ts.map
|
package/out/classifier.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,
|
|
1
|
+
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAEhC,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAA0C,KAAK,cAAc,EAAE,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AAEnD,OAAO,KAAK,EAAE,kBAAkB,EAAmB,MAAM,cAAc,CAAA;AAEvE;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC5B,KAAK,CAAC,QAAQ,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,WAAW,CAAC,CAAA;CAC/C;AAED,MAAM,WAAW,6BAA6B;IAC7C,SAAS,EAAE,kBAAkB,CAAA;IAC7B,MAAM,EAAE,YAAY,CAAA;IACpB;;;;OAIG;IACH,MAAM,CAAC,EAAE,SAAS,MAAM,EAAE,CAAA;IAC1B;;;;;;;OAOG;IACH,MAAM,CAAC,EAAE,SAAS,GAAG,QAAQ,CAAA;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAA;IACxB,sEAAsE;IACtE,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAA;IAC3B,oEAAoE;IACpE,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;CACzB;AAED,qBAAa,uBAAuB;IAOvB,OAAO,CAAC,QAAQ,CAAC,GAAG;IANhC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAsB;IACjD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAY;IACxC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAU;IAC3C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAU;gBAEZ,GAAG,EAAE,6BAA6B;IAa/D;;;;;;;;;;;OAWG;WACU,eAAe,CAAC,IAAI,GAAE,kBAAuB,GAAG,OAAO,CAAC,uBAAuB,CAAC;IAuB7F,6DAA6D;IACvD,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;IA8CjE;;;OAGG;IACG,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,qBAAqB,CAAC;IA6C/E,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAIzF,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAInF,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG;QAAE,GAAG,CAAC,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,CAAA;KAAE,GAAG,OAAO,CAAC,MAAM,CAAC;CAG7G;AAED,wFAAwF;AACxF,MAAM,WAAW,qBAAqB;IACrC,IAAI,EAAE,WAAW,CAAA;IACjB,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,MAAM,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAC7C;AAED;;;GAGG;AACH,MAAM,WAAW,SAAS;IA
CzB;;;;OAIG;IACH,UAAU,CAAC,EAAE,cAAc,CAAA;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAA;CAC5B"}
|
package/out/classifier.js
CHANGED
|
@@ -10,16 +10,30 @@
|
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
12
|
import { buildAddressTree, decodeAsJson, decodeAsTuples, decodeAsXml, } from "@mailwoman/core/decoder";
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
13
|
+
import { STAGE2_BIO_LABELS } from "./labels.js";
|
|
14
|
+
import { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
|
|
15
15
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
16
|
-
import {
|
|
16
|
+
import { buildBioEndMask, buildBioStartMask, buildBioTransitionMask, softmax, viterbi } from "./viterbi.js";
|
|
17
17
|
export class NeuralAddressClassifier {
|
|
18
18
|
cfg;
|
|
19
19
|
labels;
|
|
20
|
+
decodeMode;
|
|
21
|
+
transitions;
|
|
22
|
+
startTransitions;
|
|
23
|
+
endTransitions;
|
|
20
24
|
constructor(cfg) {
|
|
21
25
|
this.cfg = cfg;
|
|
22
|
-
this.labels = cfg.labels ??
|
|
26
|
+
this.labels = cfg.labels ?? STAGE2_BIO_LABELS;
|
|
27
|
+
this.decodeMode = cfg.decode ?? "viterbi";
|
|
28
|
+
const structural = buildBioTransitionMask(this.labels);
|
|
29
|
+
if (cfg.transitions) {
|
|
30
|
+
this.transitions = addMatrices(structural, cfg.transitions);
|
|
31
|
+
}
|
|
32
|
+
else {
|
|
33
|
+
this.transitions = structural;
|
|
34
|
+
}
|
|
35
|
+
this.startTransitions = cfg.startTransitions ?? buildBioStartMask(this.labels);
|
|
36
|
+
this.endTransitions = cfg.endTransitions ?? buildBioEndMask(this.labels);
|
|
23
37
|
}
|
|
24
38
|
/**
|
|
25
39
|
* One-call factory that resolves the weights package (or explicit paths), loads the tokenizer and
|
|
@@ -27,42 +41,120 @@ export class NeuralAddressClassifier {
|
|
|
27
41
|
*
|
|
28
42
|
* Resolution order: explicit paths in `opts` → `@mailwoman/neural-weights-<locale>` package →
|
|
29
43
|
* throws a single actionable error.
|
|
44
|
+
*
|
|
45
|
+
* **Node-only.** The dynamic imports keep `OnnxRunner` (onnxruntime-node) + `resolveWeights`
|
|
46
|
+
* (uses Node fs) out of the static dependency graph, so this file can be bundled for the browser
|
|
47
|
+
* by `@mailwoman/neural-web`. Calling this method in a browser will throw at runtime — use
|
|
48
|
+
* `loadNeuralClassifierFromUrls` from `@mailwoman/neural-web` instead.
|
|
30
49
|
*/
|
|
31
50
|
static async loadFromWeights(opts = {}) {
|
|
32
|
-
|
|
51
|
+
// /* webpackIgnore: true */ tells webpack to leave the dynamic import statement intact —
|
|
52
|
+
// it becomes a runtime native ESM import that resolves in Node (which has onnxruntime-node
|
|
53
|
+
// + node:fs) and throws cleanly in a browser if called. Without the directive, webpack
|
|
54
|
+
// pulls onnx-runner / weights into the browser chunk graph + then chokes on the Node-only
|
|
55
|
+
// builtins they reference.
|
|
56
|
+
const [{ OnnxRunner }, { resolveWeights, readLabelsFromModelCard }] = await Promise.all([
|
|
57
|
+
import(/* webpackIgnore: true */ "./onnx-runner.js"),
|
|
58
|
+
import(/* webpackIgnore: true */ "./weights.js"),
|
|
59
|
+
]);
|
|
60
|
+
const resolved = resolveWeights(opts);
|
|
61
|
+
// Read the trained label vocabulary from the bundled model-card.json when present. Falls
|
|
62
|
+
// through to the constructor default (STAGE2_BIO_LABELS) for legacy bundles that predate
|
|
63
|
+
// the `labels` field — those are always Stage 2 cards by construction, so the default is
|
|
64
|
+
// the correct fallback. A future Stage 3 ship will require the card to carry the field.
|
|
65
|
+
const labels = readLabelsFromModelCard(resolved.modelCardPath);
|
|
33
66
|
const [tokenizer, runner] = await Promise.all([
|
|
34
|
-
MailwomanTokenizer.loadFromFile(tokenizerPath),
|
|
35
|
-
OnnxRunner.create(modelPath),
|
|
67
|
+
MailwomanTokenizer.loadFromFile(resolved.tokenizerPath),
|
|
68
|
+
OnnxRunner.create(resolved.modelPath),
|
|
36
69
|
]);
|
|
37
|
-
return new NeuralAddressClassifier({ tokenizer, runner });
|
|
70
|
+
return new NeuralAddressClassifier({ tokenizer, runner, labels });
|
|
38
71
|
}
|
|
39
|
-
/** Tokenize → infer → argmax
|
|
40
|
-
async parse(text) {
|
|
72
|
+
/** Tokenize → infer → Viterbi (or argmax) → decoder tree. */
|
|
73
|
+
async parse(text, opts) {
|
|
41
74
|
if (text.length === 0)
|
|
42
75
|
return { raw: text, roots: [] };
|
|
43
76
|
const { pieces, ids } = this.cfg.tokenizer.encode(text);
|
|
44
77
|
const { logits } = await this.cfg.runner.infer(ids);
|
|
78
|
+
// QueryShape soft prior: when the caller supplies a QueryShape (typically from
|
|
79
|
+
// `@mailwoman/query-shape`'s `computeQueryShape`), nudge per-token emissions toward the
|
|
80
|
+
// labels implied by known-format hits. Bounded magnitude — confident encoder predictions
|
|
81
|
+
// still win.
|
|
82
|
+
const emissions = opts?.queryShape
|
|
83
|
+
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
84
|
+
biasScale: opts.queryShapeBiasScale ?? 1.0,
|
|
85
|
+
}))
|
|
86
|
+
: logits;
|
|
87
|
+
const labelIndices = this.decodeMode === "viterbi"
|
|
88
|
+
? viterbi({
|
|
89
|
+
emissions,
|
|
90
|
+
transitions: this.transitions,
|
|
91
|
+
startTransitions: this.startTransitions,
|
|
92
|
+
endTransitions: this.endTransitions,
|
|
93
|
+
}).path
|
|
94
|
+
: emissions.map((row) => argmaxSoftmax(row).idx);
|
|
45
95
|
const tokens = pieces.map((p, i) => {
|
|
46
|
-
const
|
|
47
|
-
|
|
96
|
+
const idx = labelIndices[i];
|
|
97
|
+
// Confidence reports the encoder's *raw* probability (no prior baked in) so callers see
|
|
98
|
+
// the model's own conviction, not the prior-augmented score.
|
|
99
|
+
const probs = softmax(logits[i]);
|
|
48
100
|
return {
|
|
49
101
|
piece: p.piece,
|
|
50
102
|
start: p.start,
|
|
51
103
|
end: p.end,
|
|
52
104
|
label: (this.labels[idx] ?? "O"),
|
|
53
|
-
confidence:
|
|
105
|
+
confidence: probs[idx],
|
|
54
106
|
};
|
|
55
107
|
});
|
|
56
108
|
return buildAddressTree(text, tokens);
|
|
57
109
|
}
|
|
58
|
-
|
|
59
|
-
|
|
110
|
+
/**
|
|
111
|
+
* Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
|
|
112
|
+
* logit aggregation (Option C joint-reconcile integration).
|
|
113
|
+
*/
|
|
114
|
+
async parseWithLogits(text, opts) {
|
|
115
|
+
if (text.length === 0) {
|
|
116
|
+
return { tree: { raw: text, roots: [] }, logits: [], pieces: [] };
|
|
117
|
+
}
|
|
118
|
+
const { pieces, ids } = this.cfg.tokenizer.encode(text);
|
|
119
|
+
const { logits } = await this.cfg.runner.infer(ids);
|
|
120
|
+
const emissions = opts?.queryShape
|
|
121
|
+
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
122
|
+
biasScale: opts.queryShapeBiasScale ?? 1.0,
|
|
123
|
+
}))
|
|
124
|
+
: logits;
|
|
125
|
+
const labelIndices = this.decodeMode === "viterbi"
|
|
126
|
+
? viterbi({
|
|
127
|
+
emissions,
|
|
128
|
+
transitions: this.transitions,
|
|
129
|
+
startTransitions: this.startTransitions,
|
|
130
|
+
endTransitions: this.endTransitions,
|
|
131
|
+
}).path
|
|
132
|
+
: emissions.map((row) => argmaxSoftmax(row).idx);
|
|
133
|
+
const tokens = pieces.map((p, i) => {
|
|
134
|
+
const idx = labelIndices[i];
|
|
135
|
+
const probs = softmax(logits[i]);
|
|
136
|
+
return {
|
|
137
|
+
piece: p.piece,
|
|
138
|
+
start: p.start,
|
|
139
|
+
end: p.end,
|
|
140
|
+
label: (this.labels[idx] ?? "O"),
|
|
141
|
+
confidence: probs[idx],
|
|
142
|
+
};
|
|
143
|
+
});
|
|
144
|
+
return {
|
|
145
|
+
tree: buildAddressTree(text, tokens),
|
|
146
|
+
logits,
|
|
147
|
+
pieces: pieces.map((p) => ({ start: p.start, end: p.end })),
|
|
148
|
+
};
|
|
149
|
+
}
|
|
150
|
+
async parseJson(text, opts) {
|
|
151
|
+
return decodeAsJson(await this.parse(text, opts));
|
|
60
152
|
}
|
|
61
|
-
async parseTuples(text) {
|
|
62
|
-
return decodeAsTuples(await this.parse(text));
|
|
153
|
+
async parseTuples(text, opts) {
|
|
154
|
+
return decodeAsTuples(await this.parse(text, opts));
|
|
63
155
|
}
|
|
64
156
|
async parseXml(text, opts) {
|
|
65
|
-
return decodeAsXml(await this.parse(text), opts);
|
|
157
|
+
return decodeAsXml(await this.parse(text, opts), opts?.xml);
|
|
66
158
|
}
|
|
67
159
|
}
|
|
68
160
|
function argmaxSoftmax(row) {
|
|
@@ -80,4 +172,16 @@ function argmaxSoftmax(row) {
|
|
|
80
172
|
const conf = 1 / sumExp;
|
|
81
173
|
return { idx: maxIdx, conf };
|
|
82
174
|
}
|
|
175
|
+
/** Element-wise add two square matrices. Used to compose the structural mask + learned transitions. */
|
|
176
|
+
function addMatrices(a, b) {
|
|
177
|
+
const n = a.length;
|
|
178
|
+
const out = [];
|
|
179
|
+
for (let i = 0; i < n; i++) {
|
|
180
|
+
const row = new Array(n);
|
|
181
|
+
for (let j = 0; j < n; j++)
|
|
182
|
+
row[j] = a[i][j] + b[i][j];
|
|
183
|
+
out.push(row);
|
|
184
|
+
}
|
|
185
|
+
return out;
|
|
186
|
+
}
|
|
83
187
|
//# sourceMappingURL=classifier.js.map
|
package/out/classifier.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,
|
|
1
|
+
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,WAAW,GAIX,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAuB,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AACnD,OAAO,EAAE,eAAe,EAAE,iBAAiB,EAAE,sBAAsB,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAA;AA0C3G,MAAM,OAAO,uBAAuB;IAON;IANZ,MAAM,CAAmB;IACzB,UAAU,CAAsB;IAChC,WAAW,CAAY;IACvB,gBAAgB,CAAU;IAC1B,cAAc,CAAU;IAEzC,YAA6B,GAAkC;QAAlC,QAAG,GAAH,GAAG,CAA+B;QAC9D,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,IAAI,iBAAiB,CAAA;QAC7C,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC,MAAM,IAAI,SAAS,CAAA;QACzC,MAAM,UAAU,GAAG,sBAAsB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACtD,IAAI,GAAG,CAAC,WAAW,EAAE,CAAC;YACrB,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC,UAAU,EAAE,GAAG,CAAC,WAAW,CAAC,CAAA;QAC5D,CAAC;aAAM,CAAC;YACP,IAAI,CAAC,WAAW,GAAG,UAAU,CAAA;QAC9B,CAAC;QACD,IAAI,CAAC,gBAAgB,GAAG,GAAG,CAAC,gBAAgB,IAAI,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC9E,IAAI,CAAC,cAAc,GAAG,GAAG,CAAC,cAAc,IAAI,eAAe,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACzE,CAAC;IAED;;;;;;;;;;;OAWG;IACH,MAAM,CAAC,KAAK,CAAC,eAAe,CAAC,OAA2B,EAAE;QACzD,yFAAyF;QACzF,2FAA2F;QAC3F,uFAAuF;QACvF,0FAA0F;QAC1F,2BAA2B;QAC3B,MAAM,CAAC,EAAE,UAAU,EAAE,EAAE,EAAE,cAAc,EAAE,uBAAuB,EAAE,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YACvF,MAAM,CAAC,yBAAyB,CAAC,kBAAkB,CAAC;YACpD,MAAM,CAAC,yBAAyB,CAAC,cAAc,CAAC;SAChD,CAAC,CAAA;QACF,MAAM,QAAQ,GAAoB,cAAc,CAAC,IAAI,CAAC,CAAA;QACtD,yFAAyF;QACzF,yFAAyF;QACzF,yFAAyF;QACzF,wFAAwF;QACxF,MAAM,MAAM,GAAG,uBAAuB,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAA;QAC9D,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC7C,kBAAkB,CAAC,YAAY,CAAC,QAAQ,CAAC,aAAa,CAAC;YACvD,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC;SACrC,CAAC,CAAA;QACF,OAAO,IAAI,uBAAuB,CAAC,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,EAAE,CAAC,CAAA;IAClE,CAAC;IAED,6DAA6D;IAC7D,KAAK,CAAC,KAAK,CAAC,IAAY,EAAE,IAAgB;QACzC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;
QAEtD,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QACvD,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QAEnD,+EAA+E;QAC/E,wFAAwF;QACxF,yFAAyF;QACzF,aAAa;QACb,MAAM,SAAS,GAAG,IAAI,EAAE,UAAU;YACjC,CAAC,CAAC,iBAAiB,CACjB,MAAM,EACN,mBAAmB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACzD,SAAS,EAAE,IAAI,CAAC,mBAAmB,IAAI,GAAG;aAC1C,CAAC,CACF;YACF,CAAC,CAAC,MAAM,CAAA;QAET,MAAM,YAAY,GACjB,IAAI,CAAC,UAAU,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC;gBACR,SAAS;gBACT,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;gBACvC,cAAc,EAAE,IAAI,CAAC,cAAc;aACnC,CAAC,CAAC,IAAI;YACR,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QAElD,MAAM,MAAM,GAAmB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAClD,MAAM,GAAG,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;YAC5B,wFAAwF;YACxF,6DAA6D;YAC7D,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;YACjC,OAAO;gBACN,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,GAAG,CAA0B;gBACzD,UAAU,EAAE,KAAK,CAAC,GAAG,CAAE;aACvB,CAAA;QACF,CAAC,CAAC,CAAA;QAEF,OAAO,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAA;IACtC,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,eAAe,CAAC,IAAY,EAAE,IAAgB;QACnD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,IAAI,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAA;QAClE,CAAC;QACD,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QACvD,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;QAEnD,MAAM,SAAS,GAAG,IAAI,EAAE,UAAU;YACjC,CAAC,CAAC,iBAAiB,CACjB,MAAM,EACN,mBAAmB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACzD,SAAS,EAAE,IAAI,CAAC,mBAAmB,IAAI,GAAG;aAC1C,CAAC,CACF;YACF,CAAC,CAAC,MAAM,CAAA;QAET,MAAM,YAAY,GACjB,IAAI,CAAC,UAAU,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC;gBACR,SAAS;gBACT,WAAW,EAAE
,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;gBACvC,cAAc,EAAE,IAAI,CAAC,cAAc;aACnC,CAAC,CAAC,IAAI;YACR,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QAElD,MAAM,MAAM,GAAmB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAClD,MAAM,GAAG,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;YAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;YACjC,OAAO;gBACN,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,GAAG,CAA0B;gBACzD,UAAU,EAAE,KAAK,CAAC,GAAG,CAAE;aACvB,CAAA;QACF,CAAC,CAAC,CAAA;QAEF,OAAO;YACN,IAAI,EAAE,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC;YACpC,MAAM;YACN,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC;SAC3D,CAAA;IACF,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,IAAY,EAAE,IAAgB;QAC7C,OAAO,YAAY,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IAClD,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,IAAY,EAAE,IAAgB;QAC/C,OAAO,cAAc,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IACpD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,IAA8D;QAC1F,OAAO,WAAW,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,CAAC,CAAA;IAC5D,CAAC;CACD;AA2BD,SAAS,aAAa,CAAC,GAAa;IACnC,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;IACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,IAAI,GAAG,CAAC,CAAC,CAAE,GAAG,MAAM,EAAE,CAAC;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;YAChB,MAAM,GAAG,CAAC,CAAA;QACX,CAAC;IACF,CAAC;IACD,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,KAAK,MAAM,CAAC,IAAI,GAAG;QAAE,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAA;IACnD,MAAM,IAAI,GAAG,CAAC,GAAG,MAAM,CAAA;IACvB,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,CAAA;AAC7B,CAAC;AAED,uGAAuG;AACvG,SAAS,WAAW,CAAC,CAAa,EAAE,CAAa;IAChD,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAA;IAClB,MAAM,GAAG,GAAe,EAAE,CAAA;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE
,CAAC;QAC5B,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAA;QAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;QAC1D,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACd,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
package/out/index.d.ts
CHANGED
|
@@ -7,6 +7,10 @@ export * from "./classifier.js";
|
|
|
7
7
|
export * from "./labels.js";
|
|
8
8
|
export * from "./onnx-runner.js";
|
|
9
9
|
export * from "./proposal-classifier.js";
|
|
10
|
+
export { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
|
|
11
|
+
export type { BuildPriorsOpts, KnownFormatHitLike, QueryShapeLike, TokenLike } from "./query-shape-prior.js";
|
|
10
12
|
export * from "./tokenizer.js";
|
|
13
|
+
export { buildBioEndMask, buildBioStartMask, buildBioTransitionMask, perTokenArgmax, softmax, viterbi, } from "./viterbi.js";
|
|
14
|
+
export type { ViterbiInput, ViterbiResult } from "./viterbi.js";
|
|
11
15
|
export * from "./weights.js";
|
|
12
16
|
//# sourceMappingURL=index.d.ts.map
|
package/out/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,0BAA0B,CAAA;AACxC,cAAc,gBAAgB,CAAA;AAC9B,cAAc,cAAc,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAC/E,YAAY,EAAE,eAAe,EAAE,kBAAkB,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAC5G,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AACrB,YAAY,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAA;AAC/D,cAAc,cAAc,CAAA"}
|
package/out/index.js
CHANGED
|
@@ -7,6 +7,8 @@ export * from "./classifier.js";
|
|
|
7
7
|
export * from "./labels.js";
|
|
8
8
|
export * from "./onnx-runner.js";
|
|
9
9
|
export * from "./proposal-classifier.js";
|
|
10
|
+
export { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
|
|
10
11
|
export * from "./tokenizer.js";
|
|
12
|
+
export { buildBioEndMask, buildBioStartMask, buildBioTransitionMask, perTokenArgmax, softmax, viterbi, } from "./viterbi.js";
|
|
11
13
|
export * from "./weights.js";
|
|
12
14
|
//# sourceMappingURL=index.js.map
|
package/out/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,0BAA0B,CAAA;AACxC,cAAc,gBAAgB,CAAA;AAC9B,cAAc,cAAc,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAE/E,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AAErB,cAAc,cAAc,CAAA"}
|
package/out/labels.d.ts
CHANGED
|
@@ -3,18 +3,39 @@
|
|
|
3
3
|
* @license AGPL-3.0
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*
|
|
6
|
-
* Mirror of `packages/corpus-python/src/mailwoman_train/labels.py
|
|
6
|
+
* Mirror of `packages/corpus-python/src/mailwoman_train/labels.py`.
|
|
7
7
|
*
|
|
8
|
-
*
|
|
9
|
-
* silently corrupts
|
|
8
|
+
* Index ↔ label parity is load-bearing: the model emits logits in one canonical order on both sides
|
|
9
|
+
* and any drift here silently corrupts BIO decoding. STAGE2 strictly extends STAGE1 — the first
|
|
10
|
+
* 15 indices are identical, so reading a v0.2.0 (Stage 1) model with the Stage 2 label vocabulary
|
|
11
|
+
* stays correct; the extra entries are unused.
|
|
10
12
|
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
13
|
+
* Runtime loading: as of v0.4.0 the trained label vocabulary is carried in `model-card.json`'s
|
|
14
|
+
* `labels` field and read by `loadFromWeights` (see `weights.readLabelsFromModelCard`). These
|
|
15
|
+
* constants remain the compile-time fallback for legacy bundles whose cards predate the field —
|
|
16
|
+
* safe because such bundles are by construction Stage 1 or Stage 2, and Stage 2 prefix-extends
|
|
17
|
+
* Stage 1. A future Stage 3 ship will not be safe under the fallback; the loader treats a missing
|
|
18
|
+
* `labels` field as "you are loading a pre-v0.4.0 bundle" rather than "unknown stage".
|
|
14
19
|
*/
|
|
15
20
|
import type { BioLabel } from "@mailwoman/core/decoder";
|
|
16
21
|
/** Coarse component tags trained in Phase 2 Stage 1 (v0.1.0 / v0.2.0). */
|
|
17
22
|
export declare const STAGE1_COARSE_TAGS: readonly ["country", "region", "locality", "dependent_locality", "postcode", "subregion", "cedex"];
|
|
18
23
|
/** BIO label vocabulary for Stage 1 — O + (B-/I- per coarse tag). 1 + 14 = 15 labels. */
|
|
19
24
|
export declare const STAGE1_BIO_LABELS: readonly BioLabel[];
|
|
25
|
+
/**
|
|
26
|
+
* Fine-grained tags added in Phase 2 Stage 2 (v0.3.0). venue covers organization/POI/landmark
|
|
27
|
+
* names; street + house_number break out the street-address components that Stage 1 collapsed to
|
|
28
|
+
* `O`.
|
|
29
|
+
*/
|
|
30
|
+
export declare const STAGE2_FINE_TAGS: readonly ["venue", "street", "house_number"];
|
|
31
|
+
/** Stage 2 ships the full coarse + fine set in the order STAGE2_BIO_LABELS is interleaved. */
|
|
32
|
+
export declare const STAGE2_TAGS: readonly ["country", "region", "locality", "dependent_locality", "postcode", "subregion", "cedex", "venue", "street", "house_number"];
|
|
33
|
+
/**
|
|
34
|
+
* BIO label vocabulary for Stage 2 (v0.3.0) — O + (B-/I- per Stage 2 tag). 1 + 20 = 21 labels.
|
|
35
|
+
*
|
|
36
|
+
* Index parity vs Stage 1: STAGE2_BIO_LABELS[i] === STAGE1_BIO_LABELS[i] for i ∈ [0, 15). Anyone
|
|
37
|
+
* loading a Stage 1 model with this vocabulary still decodes correctly; the tail (15..20) just
|
|
38
|
+
* never gets argmax'd because Stage 1 only emits 15 logits.
|
|
39
|
+
*/
|
|
40
|
+
export declare const STAGE2_BIO_LABELS: readonly BioLabel[];
|
|
20
41
|
//# sourceMappingURL=labels.d.ts.map
|
package/out/labels.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"labels.d.ts","sourceRoot":"","sources":["../labels.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"labels.d.ts","sourceRoot":"","sources":["../labels.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAA;AAEvD,0EAA0E;AAC1E,eAAO,MAAM,kBAAkB,oGAQrB,CAAA;AAEV,yFAAyF;AACzF,eAAO,MAAM,iBAAiB,EAAE,SAAS,QAAQ,EAG/C,CAAA;AAEF;;;;GAIG;AACH,eAAO,MAAM,gBAAgB,8CAA+C,CAAA;AAE5E,8FAA8F;AAC9F,eAAO,MAAM,WAAW,uIAAwD,CAAA;AAEhF;;;;;;GAMG;AACH,eAAO,MAAM,iBAAiB,EAAE,SAAS,QAAQ,EAG/C,CAAA"}
|
package/out/labels.js
CHANGED
|
@@ -3,14 +3,19 @@
|
|
|
3
3
|
* @license AGPL-3.0
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*
|
|
6
|
-
* Mirror of `packages/corpus-python/src/mailwoman_train/labels.py
|
|
6
|
+
* Mirror of `packages/corpus-python/src/mailwoman_train/labels.py`.
|
|
7
7
|
*
|
|
8
|
-
*
|
|
9
|
-
* silently corrupts
|
|
8
|
+
* Index ↔ label parity is load-bearing: the model emits logits in one canonical order on both sides
|
|
9
|
+
* and any drift here silently corrupts BIO decoding. STAGE2 strictly extends STAGE1 — the first
|
|
10
|
+
* 15 indices are identical, so reading a v0.2.0 (Stage 1) model with the Stage 2 label vocabulary
|
|
11
|
+
* stays correct; the extra entries are unused.
|
|
10
12
|
*
|
|
11
|
-
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
13
|
+
* Runtime loading: as of v0.4.0 the trained label vocabulary is carried in `model-card.json`'s
|
|
14
|
+
* `labels` field and read by `loadFromWeights` (see `weights.readLabelsFromModelCard`). These
|
|
15
|
+
* constants remain the compile-time fallback for legacy bundles whose cards predate the field —
|
|
16
|
+
* safe because such bundles are by construction Stage 1 or Stage 2, and Stage 2 prefix-extends
|
|
17
|
+
* Stage 1. A future Stage 3 ship will not be safe under the fallback; the loader treats a missing
|
|
18
|
+
* `labels` field as "you are loading a pre-v0.4.0 bundle" rather than "unknown stage".
|
|
14
19
|
*/
|
|
15
20
|
/** Coarse component tags trained in Phase 2 Stage 1 (v0.1.0 / v0.2.0). */
|
|
16
21
|
export const STAGE1_COARSE_TAGS = [
|
|
@@ -27,4 +32,23 @@ export const STAGE1_BIO_LABELS = Object.freeze([
|
|
|
27
32
|
"O",
|
|
28
33
|
...STAGE1_COARSE_TAGS.flatMap((tag) => [`B-${tag}`, `I-${tag}`]),
|
|
29
34
|
]);
|
|
35
|
+
/**
|
|
36
|
+
* Fine-grained tags added in Phase 2 Stage 2 (v0.3.0). venue covers organization/POI/landmark
|
|
37
|
+
* names; street + house_number break out the street-address components that Stage 1 collapsed to
|
|
38
|
+
* `O`.
|
|
39
|
+
*/
|
|
40
|
+
export const STAGE2_FINE_TAGS = ["venue", "street", "house_number"];
|
|
41
|
+
/** Stage 2 ships the full coarse + fine set in the order STAGE2_BIO_LABELS is interleaved. */
|
|
42
|
+
export const STAGE2_TAGS = [...STAGE1_COARSE_TAGS, ...STAGE2_FINE_TAGS];
|
|
43
|
+
/**
 * Stage 2 (v0.3.0) BIO vocabulary: `O` followed by a `B-`/`I-` pair for every Stage 2 tag, in
 * `STAGE2_TAGS` order. 1 + 20 = 21 labels.
 *
 * Because `STAGE2_TAGS` starts with the Stage 1 coarse tags and the labels are generated the same
 * way, STAGE2_BIO_LABELS[i] === STAGE1_BIO_LABELS[i] for i ∈ [0, 15). A Stage 1 model therefore
 * still decodes correctly against this vocabulary; the tail (15..20) is simply never selected
 * because Stage 1 only emits 15 logits.
 */
export const STAGE2_BIO_LABELS = Object.freeze(
    ["O"].concat(STAGE2_TAGS.flatMap((tag) => [`B-${tag}`, `I-${tag}`])),
);
|
|
30
54
|
//# sourceMappingURL=labels.js.map
|
package/out/labels.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"labels.js","sourceRoot":"","sources":["../labels.ts"],"names":[],"mappings":"AAAA
|
|
1
|
+
{"version":3,"file":"labels.js","sourceRoot":"","sources":["../labels.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,0EAA0E;AAC1E,MAAM,CAAC,MAAM,kBAAkB,GAAG;IACjC,SAAS;IACT,QAAQ;IACR,UAAU;IACV,oBAAoB;IACpB,UAAU;IACV,WAAW;IACX,OAAO;CACE,CAAA;AAEV,yFAAyF;AACzF,MAAM,CAAC,MAAM,iBAAiB,GAAwB,MAAM,CAAC,MAAM,CAAC;IACnE,GAAe;IACf,GAAG,kBAAkB,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,GAAG,EAAc,EAAE,KAAK,GAAG,EAAc,CAAC,CAAC;CACxF,CAAC,CAAA;AAEF;;;;GAIG;AACH,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,OAAO,EAAE,QAAQ,EAAE,cAAc,CAAU,CAAA;AAE5E,8FAA8F;AAC9F,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,GAAG,kBAAkB,EAAE,GAAG,gBAAgB,CAAU,CAAA;AAEhF;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAwB,MAAM,CAAC,MAAM,CAAC;IACnE,GAAe;IACf,GAAG,WAAW,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,GAAG,EAAc,EAAE,KAAK,GAAG,EAAc,CAAC,CAAC;CACjF,CAAC,CAAA"}
|
|
@@ -24,7 +24,11 @@ export interface NeuralProposalClassifierConfig {
|
|
|
24
24
|
id: string;
|
|
25
25
|
/** The underlying neural classifier instance. */
|
|
26
26
|
classifier: NeuralAddressClassifier;
|
|
27
|
-
/**
|
|
27
|
+
/**
|
|
28
|
+
* Component tags this classifier may emit. Defaults to the Stage 2 tag set (coarse +
|
|
29
|
+
* venue/street/house_number). v0.2.0 Stage 1 models never decode to the fine tags anyway, so the
|
|
30
|
+
* broader default is forwards-compat without back-compat risk.
|
|
31
|
+
*/
|
|
28
32
|
emits?: readonly ComponentTag[];
|
|
29
33
|
/** Locales this classifier is active for. `["*"]` (locale-agnostic) by default. */
|
|
30
34
|
locales?: readonly (string | "*")[];
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proposal-classifier.d.ts","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,OAAO,KAAK,EAGX,YAAY,EACZ,kBAAkB,EAElB,MAAM,uBAAuB,CAAA;AAC9B,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAA;AAG9D,MAAM,WAAW,8BAA8B;IAC9C,wFAAwF;IACxF,EAAE,EAAE,MAAM,CAAA;IACV,iDAAiD;IACjD,UAAU,EAAE,uBAAuB,CAAA;IACnC
|
|
1
|
+
{"version":3,"file":"proposal-classifier.d.ts","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,OAAO,KAAK,EAGX,YAAY,EACZ,kBAAkB,EAElB,MAAM,uBAAuB,CAAA;AAC9B,OAAO,KAAK,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAA;AAG9D,MAAM,WAAW,8BAA8B;IAC9C,wFAAwF;IACxF,EAAE,EAAE,MAAM,CAAA;IACV,iDAAiD;IACjD,UAAU,EAAE,uBAAuB,CAAA;IACnC;;;;OAIG;IACH,KAAK,CAAC,EAAE,SAAS,YAAY,EAAE,CAAA;IAC/B,mFAAmF;IACnF,OAAO,CAAC,EAAE,SAAS,CAAC,MAAM,GAAG,GAAG,CAAC,EAAE,CAAA;IACnC,+DAA+D;IAC/D,OAAO,CAAC,EAAE,MAAM,CAAA;CAChB;AAED,0EAA0E;AAC1E,wBAAgB,8BAA8B,CAAC,GAAG,EAAE,8BAA8B,GAAG,kBAAkB,CA6CtG"}
|
|
@@ -17,10 +17,10 @@
|
|
|
17
17
|
* inference is a future optimization once the policy layer has a way to invoke a classifier "once
|
|
18
18
|
* per parse" instead of per section.
|
|
19
19
|
*/
|
|
20
|
-
import {
|
|
20
|
+
import { STAGE2_TAGS } from "./labels.js";
|
|
21
21
|
/** Build a `ProposalClassifier` backed by a `NeuralAddressClassifier`. */
|
|
22
22
|
export function createNeuralProposalClassifier(cfg) {
|
|
23
|
-
const emits = cfg.emits ??
|
|
23
|
+
const emits = cfg.emits ?? STAGE2_TAGS;
|
|
24
24
|
const emitsSet = new Set(emits);
|
|
25
25
|
const penalty = cfg.penalty ?? 0;
|
|
26
26
|
async function classify(section, _ctx) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"proposal-classifier.js","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAYH,OAAO,EAAE,
|
|
1
|
+
{"version":3,"file":"proposal-classifier.js","sourceRoot":"","sources":["../proposal-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAYH,OAAO,EAAE,WAAW,EAAE,MAAM,aAAa,CAAA;AAmBzC,0EAA0E;AAC1E,MAAM,UAAU,8BAA8B,CAAC,GAAmC;IACjF,MAAM,KAAK,GAAG,GAAG,CAAC,KAAK,IAAI,WAAW,CAAA;IACtC,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAe,KAAgC,CAAC,CAAA;IACxE,MAAM,OAAO,GAAG,GAAG,CAAC,OAAO,IAAI,CAAC,CAAA;IAEhC,KAAK,UAAU,QAAQ,CAAC,OAAgB,EAAE,IAAuB;QAChE,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,UAAU,CAAC,KAAK,CAAC,OAAO,CAAC,IAAI,CAAC,CAAA;QACrD,MAAM,SAAS,GAA6B,EAAE,CAAA;QAC9C,MAAM,aAAa,GAAG,OAAO,CAAC,KAAK,CAAA;QAEnC,MAAM,KAAK,GAAG,CAAC,IAAiB,EAAQ,EAAE;YACzC,IAAI,QAAQ,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC;gBAC5B,wFAAwF;gBACxF,oFAAoF;gBACpF,uFAAuF;gBACvF,iFAAiF;gBACjF,mFAAmF;gBACnF,2EAA2E;gBAC3E,MAAM,IAAI,GAAG;oBACZ,KAAK,EAAE,aAAa,GAAG,IAAI,CAAC,KAAK;oBACjC,GAAG,EAAE,aAAa,GAAG,IAAI,CAAC,GAAG;oBAC7B,IAAI,EAAE,IAAI,CAAC,KAAK;iBACG,CAAA;gBACpB,SAAS,CAAC,IAAI,CAAC;oBACd,IAAI;oBACJ,SAAS,EAAE,IAAI,CAAC,GAAG;oBACnB,UAAU,EAAE,IAAI,CAAC,UAAU;oBAC3B,MAAM,EAAE,QAAQ;oBAChB,SAAS,EAAE,GAAG,CAAC,EAAE;oBACjB,OAAO;iBACP,CAAC,CAAA;YACH,CAAC;YACD,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ;gBAAE,KAAK,CAAC,KAAK,CAAC,CAAA;QAChD,CAAC,CAAA;QAED,KAAK,MAAM,IAAI,IAAI,IAAI,CAAC,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,CAAA;QAC1C,OAAO,SAAS,CAAA;IACjB,CAAC;IAED,OAAO;QACN,EAAE,EAAE,GAAG,CAAC,EAAE;QACV,KAAK;QACL,OAAO,EAAE,GAAG,CAAC,OAAO,IAAI,CAAC,GAAG,CAAC;QAC7B,QAAQ;KACR,CAAA;AACF,CAAC"}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Soft-prior emission biases derived from `QueryShape`.
|
|
7
|
+
*
|
|
8
|
+
* When the QueryShape sub-system has identified a known-format span (US ZIP, UK postcode, PO box,
|
|
9
|
+
* etc.), this module produces an additive bias matrix that nudges the encoder's per-token
|
|
10
|
+
* emissions toward the matching BIO label. The biases compose with the structural BIO mask in the
|
|
11
|
+
* Viterbi decoder — confident encoder predictions still win, but uncertain ones get pulled toward
|
|
12
|
+
* the format-implied label.
|
|
13
|
+
*
|
|
14
|
+
* Bitter-lesson-safe boundary: we don't override the encoder, just bias it. The encoder remains the
|
|
15
|
+
* authority on context-dependent calls (the "Buffalo Wild Wings, Buffalo, NY" disambiguation);
|
|
16
|
+
* the QueryShape prior helps on the easy cases (a 5-digit token is _probably_ a postcode).
|
|
17
|
+
*
|
|
18
|
+
* Uses structural typing for the QueryShape input so this module has zero dependencies on
|
|
19
|
+
* `@mailwoman/query-shape` — consumers compute the shape with that package, pass it in here.
|
|
20
|
+
*/
|
|
21
|
+
/**
|
|
22
|
+
* Minimal subset of `QueryShape` this module consumes. Compatible with `@mailwoman/query-shape`'s
|
|
23
|
+
* exported `QueryShape` type by shape — no import required.
|
|
24
|
+
*/
|
|
25
|
+
export interface QueryShapeLike {
|
|
26
|
+
knownFormats: ReadonlyArray<KnownFormatHitLike>;
|
|
27
|
+
}
|
|
28
|
+
export interface KnownFormatHitLike {
|
|
29
|
+
format: string;
|
|
30
|
+
span: {
|
|
31
|
+
start: number;
|
|
32
|
+
end: number;
|
|
33
|
+
};
|
|
34
|
+
/** 0..1; ambiguous patterns (e.g. 5-digit US/FR/DE overlap) score lower. */
|
|
35
|
+
confidence: number;
|
|
36
|
+
}
|
|
37
|
+
/** Minimal subset of `TokenizedPiece` this module consumes. */
|
|
38
|
+
export interface TokenLike {
|
|
39
|
+
start: number;
|
|
40
|
+
end: number;
|
|
41
|
+
}
|
|
42
|
+
export interface BuildPriorsOpts {
|
|
43
|
+
/**
|
|
44
|
+
* Maximum bias magnitude (in log-odds units). Default 1.0 — adds up to ~e^1 ≈ 2.7× odds to the
|
|
45
|
+
* favored label. Confidence-scaled, so a 0.6-confidence format hit gets +0.6 max bias.
|
|
46
|
+
*/
|
|
47
|
+
biasScale?: number;
|
|
48
|
+
}
|
|
49
|
+
/**
|
|
50
|
+
* Build a `[seqLen][numLabels]` matrix of additive log-bias to be added to encoder emissions before
|
|
51
|
+
* Viterbi decoding.
|
|
52
|
+
*
|
|
53
|
+
* For each (token, format-hit) pair where the token's character span overlaps the hit's span, the
|
|
54
|
+
* matrix entry for the format's mapped label receives `hit.confidence × biasScale`. Tokens that
|
|
55
|
+
* don't overlap any hit, or for which no label mapping exists, get 0.
|
|
56
|
+
*
|
|
57
|
+
* Returns the all-zeros matrix if `shape.knownFormats` is empty — composes harmlessly.
|
|
58
|
+
*/
|
|
59
|
+
export declare function buildEmissionPriors(shape: QueryShapeLike, tokens: ReadonlyArray<TokenLike>, labels: ReadonlyArray<string>, opts?: BuildPriorsOpts): number[][];
|
|
60
|
+
/** Element-wise add two matrices of equal shape. Returns a new matrix. */
|
|
61
|
+
export declare function addEmissionMatrix(emissions: number[][], priors: number[][]): number[][];
|
|
62
|
+
//# sourceMappingURL=query-shape-prior.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"query-shape-prior.d.ts","sourceRoot":"","sources":["../query-shape-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH;;;GAGG;AACH,MAAM,WAAW,cAAc;IAC9B,YAAY,EAAE,aAAa,CAAC,kBAAkB,CAAC,CAAA;CAC/C;AAED,MAAM,WAAW,kBAAkB;IAClC,MAAM,EAAE,MAAM,CAAA;IACd,IAAI,EAAE;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAA;IACpC,4EAA4E;IAC5E,UAAU,EAAE,MAAM,CAAA;CAClB;AAED,+DAA+D;AAC/D,MAAM,WAAW,SAAS;IACzB,KAAK,EAAE,MAAM,CAAA;IACb,GAAG,EAAE,MAAM,CAAA;CACX;AAiBD,MAAM,WAAW,eAAe;IAC/B;;;OAGG;IACH,SAAS,CAAC,EAAE,MAAM,CAAA;CAClB;AAED;;;;;;;;;GASG;AACH,wBAAgB,mBAAmB,CAClC,KAAK,EAAE,cAAc,EACrB,MAAM,EAAE,aAAa,CAAC,SAAS,CAAC,EAChC,MAAM,EAAE,aAAa,CAAC,MAAM,CAAC,EAC7B,IAAI,GAAE,eAAoB,GACxB,MAAM,EAAE,EAAE,CA4BZ;AAMD,0EAA0E;AAC1E,wBAAgB,iBAAiB,CAAC,SAAS,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,EAAE,CAWvF"}
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Soft-prior emission biases derived from `QueryShape`.
|
|
7
|
+
*
|
|
8
|
+
* When the QueryShape sub-system has identified a known-format span (US ZIP, UK postcode, PO box,
|
|
9
|
+
* etc.), this module produces an additive bias matrix that nudges the encoder's per-token
|
|
10
|
+
* emissions toward the matching BIO label. The biases compose with the structural BIO mask in the
|
|
11
|
+
* Viterbi decoder — confident encoder predictions still win, but uncertain ones get pulled toward
|
|
12
|
+
* the format-implied label.
|
|
13
|
+
*
|
|
14
|
+
* Bitter-lesson-safe boundary: we don't override the encoder, just bias it. The encoder remains the
|
|
15
|
+
* authority on context-dependent calls (the "Buffalo Wild Wings, Buffalo, NY" disambiguation);
|
|
16
|
+
* the QueryShape prior helps on the easy cases (a 5-digit token is _probably_ a postcode).
|
|
17
|
+
*
|
|
18
|
+
* Uses structural typing for the QueryShape input so this module has zero dependencies on
|
|
19
|
+
* `@mailwoman/query-shape` — consumers compute the shape with that package, pass it in here.
|
|
20
|
+
*/
|
|
21
|
+
/**
 * Mapping from `KnownFormat` strings to the span-opening (`B-`) BIO label that should be boosted.
 * Multiple formats may map to the same label (all postcode flavors → `B-postcode`).
 *
 * A format whose label is absent from the caller's vocabulary is skipped by the column lookup in
 * `buildEmissionPriors`, so entries here may safely run ahead of the shipped label set.
 */
const FORMAT_TO_LABEL = new Map([
    ["us_zip", "B-postcode"],
    ["us_zip4", "B-postcode"],
    ["fr_postcode", "B-postcode"],
    ["de_postcode", "B-postcode"],
    ["uk_postcode", "B-postcode"],
    ["ca_postcode", "B-postcode"],
    ["jp_postcode", "B-postcode"],
    ["po_box", "B-po_box"],
]);
/**
 * Build a `[seqLen][numLabels]` matrix of additive log-bias to be added to encoder emissions before
 * Viterbi decoding.
 *
 * For each format hit, every token whose character span overlaps the hit's span receives
 * `hit.confidence × biasScale` on one column:
 *
 * - the first overlapping token is biased toward the format's `B-X` label;
 * - every later overlapping token is biased toward the matching `I-X` label, so a hit that covers
 *   several tokens is nudged toward ONE span instead of a run of `B-X B-X …` fragments. If the
 *   vocabulary has no `I-X` label, the `B-X` column is used for those tokens as well.
 *
 * When several hits touch the same cell, the largest bias wins (biases are not summed). Tokens
 * that don't overlap any hit, or for which no label mapping exists, get 0. Hits whose bias is not
 * a positive number (zero, negative or NaN confidence/scale) contribute nothing.
 *
 * Returns the all-zeros matrix if `shape.knownFormats` is empty — composes harmlessly.
 *
 * @param shape Query shape carrying the detected known-format hits.
 * @param tokens Token spans, one matrix row each. "First overlapping token" is decided by array
 *   order, so tokens are assumed to be in text order — TODO confirm against the tokenizer output.
 * @param labels BIO label vocabulary; its order defines the matrix columns.
 * @param opts Optional tuning; `biasScale` defaults to 1.0.
 * @returns A freshly allocated `tokens.length × labels.length` matrix.
 */
export function buildEmissionPriors(shape, tokens, labels, opts = {}) {
    const T = tokens.length;
    const L = labels.length;
    const biasScale = opts.biasScale ?? 1.0;
    const matrix = Array.from({ length: T }, () => new Array(L).fill(0));
    if (shape.knownFormats.length === 0)
        return matrix;
    // Index label → column for fast lookup.
    const labelToCol = new Map();
    for (let k = 0; k < L; k++)
        labelToCol.set(labels[k], k);
    for (const hit of shape.knownFormats) {
        const beginLabel = FORMAT_TO_LABEL.get(hit.format);
        if (!beginLabel)
            continue;
        const beginCol = labelToCol.get(beginLabel);
        if (beginCol === undefined)
            continue;
        // Continuation tokens belong to `I-X`; fall back to `B-X` if the vocabulary lacks it.
        const insideCol = (beginLabel.startsWith("B-") ? labelToCol.get(`I-${beginLabel.slice(2)}`) : undefined) ?? beginCol;
        const bias = hit.confidence * biasScale;
        // `!(bias > 0)` also rejects NaN, which would otherwise leak through Math.max below.
        if (!(bias > 0))
            continue;
        let spanOpened = false;
        for (let t = 0; t < T; t++) {
            if (!overlaps(tokens[t], hit.span))
                continue;
            const col = spanOpened ? insideCol : beginCol;
            spanOpened = true;
            matrix[t][col] = Math.max(matrix[t][col], bias);
        }
    }
    return matrix;
}
/** True when the half-open ranges `[a.start, a.end)` and `[b.start, b.end)` share at least one position. */
function overlaps(a, b) {
    return a.start < b.end && b.start < a.end;
}
|
|
78
|
+
/**
 * Sum an emission matrix with a prior matrix cell by cell and return the result as a new matrix;
 * neither input is mutated.
 *
 * The output always has the shape of `emissions`. A prior row or cell that is missing counts as 0,
 * and an empty `priors` yields a plain row-by-row copy of `emissions`.
 */
export function addEmissionMatrix(emissions, priors) {
    // No priors at all: hand back independent copies of the rows.
    if (priors.length === 0)
        return emissions.map((row) => row.slice());
    const summed = [];
    for (const [t, logits] of emissions.entries()) {
        const biasRow = priors[t];
        summed.push(Array.from({ length: logits.length }, (_, k) => logits[k] + (biasRow?.[k] ?? 0)));
    }
    return summed;
}
|
|
93
|
+
//# sourceMappingURL=query-shape-prior.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"query-shape-prior.js","sourceRoot":"","sources":["../query-shape-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAuBH;;;GAGG;AACH,MAAM,eAAe,GAAgC,IAAI,GAAG,CAAC;IAC5D,CAAC,QAAQ,EAAE,YAAY,CAAC;IACxB,CAAC,SAAS,EAAE,YAAY,CAAC;IACzB,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,aAAa,EAAE,YAAY,CAAC;IAC7B,CAAC,QAAQ,EAAE,UAAU,CAAC;CACtB,CAAC,CAAA;AAUF;;;;;;;;;GASG;AACH,MAAM,UAAU,mBAAmB,CAClC,KAAqB,EACrB,MAAgC,EAChC,MAA6B,EAC7B,OAAwB,EAAE;IAE1B,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,GAAG,CAAA;IACvC,MAAM,MAAM,GAAe,EAAE,CAAA;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;QAAE,MAAM,CAAC,IAAI,CAAC,IAAI,KAAK,CAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAErE,IAAI,KAAK,CAAC,YAAY,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,MAAM,CAAA;IAElD,wCAAwC;IACxC,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAA;IAC5C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,UAAU,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAA;IAErE,KAAK,MAAM,GAAG,IAAI,KAAK,CAAC,YAAY,EAAE,CAAC;QACtC,MAAM,WAAW,GAAG,eAAe,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAA;QACnD,IAAI,CAAC,WAAW;YAAE,SAAQ;QAC1B,MAAM,GAAG,GAAG,UAAU,CAAC,GAAG,CAAC,WAAW,CAAC,CAAA;QACvC,IAAI,GAAG,KAAK,SAAS;YAAE,SAAQ;QAC/B,MAAM,IAAI,GAAG,GAAG,CAAC,UAAU,GAAG,SAAS,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;YAC5B,MAAM,GAAG,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACtB,IAAI,QAAQ,CAAC,GAAG,EAAE,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC;gBAC7B,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,GAAG,CAAE,EAAE,IAAI,CAAC,CAAA;YACnD,CAAC;QACF,CAAC;IACF,CAAC;IAED,OAAO,MAAM,CAAA;AACd,CAAC;AAED,SAAS,QAAQ,CAAC,CAAiC,EAAE,CAAiC;IACrF,OAAO,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,GAAG,CAAA;AAC1C,CAAC;AAED,0EAA0E;AAC1E,MAAM,UAAU,iBAAiB,CAAC,SAAqB,EAAE,MAAkB;IAC1E,IAAI,MAAM
,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,GAAG,CAAC,KAAK,EAAE,CAAC,CAAA;IACnE,MAAM,GAAG,GAAe,EAAE,CAAA;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAC3C,MAAM,CAAC,GAAG,SAAS,CAAC,CAAC,CAAE,CAAA;QACvB,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,IAAI,KAAK,CAAS,CAAC,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC1D,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,MAAM,CAAC,CAAA;QACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAA;QAC/D,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACd,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
package/out/tokenizer.d.ts
CHANGED
|
@@ -51,7 +51,12 @@ export declare class MailwomanTokenizer {
|
|
|
51
51
|
private constructor();
|
|
52
52
|
/** Load from a base64-encoded `tokenizer.model`. Use for in-memory / test setups. */
|
|
53
53
|
static loadFromBase64(b64: string): Promise<MailwomanTokenizer>;
|
|
54
|
-
/**
|
|
54
|
+
/**
|
|
55
|
+
* Load from a path to a `tokenizer.model` file on disk. **Node-only** — the dynamic `node:fs`
|
|
56
|
+
* import keeps this method out of the static dependency graph so the rest of the tokenizer
|
|
57
|
+
* bundles cleanly for the browser. Calling it in a browser throws at runtime; use
|
|
58
|
+
* `loadFromBase64` (or the URL-fetching loaders in `@mailwoman/neural-web`) instead.
|
|
59
|
+
*/
|
|
55
60
|
static loadFromFile(modelPath: string): Promise<MailwomanTokenizer>;
|
|
56
61
|
/**
|
|
57
62
|
* Tokenize `text` to pieces + ids + realigned char offsets.
|
package/out/tokenizer.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;
|
|
1
|
+
{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAIH,4EAA4E;AAC5E,eAAO,MAAM,cAAc,WAAM,CAAA;AAEjC,0EAA0E;AAC1E,MAAM,WAAW,cAAc;IAC9B,wFAAwF;IACxF,KAAK,EAAE,MAAM,CAAA;IACb,mCAAmC;IACnC,EAAE,EAAE,MAAM,CAAA;IACV,yDAAyD;IACzD,KAAK,EAAE,MAAM,CAAA;IACb,uDAAuD;IACvD,GAAG,EAAE,MAAM,CAAA;CACX;AAED,MAAM,WAAW,YAAY;IAC5B,MAAM,EAAE,cAAc,EAAE,CAAA;IACxB,GAAG,EAAE,MAAM,EAAE,CAAA;CACb;AAED,qBAAa,kBAAkB;IACV,OAAO,CAAC,QAAQ,CAAC,SAAS;IAA9C,OAAO;IAEP,qFAAqF;WACxE,cAAc,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC;IAMrE;;;;;OAKG;WACU,YAAY,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,kBAAkB,CAAC;IAMzE;;;;;;OAMG;IACH,MAAM,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY;IA2BlC,oFAAoF;IACpF,MAAM,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,UAAU,GAAG,MAAM;CAI1C"}
|
package/out/tokenizer.js
CHANGED
|
@@ -30,7 +30,6 @@
|
|
|
30
30
|
* - `loadFromFile(path)` — convenience helper that does the read + b64 + load.
|
|
31
31
|
*/
|
|
32
32
|
import { SentencePieceProcessor } from "@sctg/sentencepiece-js";
|
|
33
|
-
import { promises as fs } from "node:fs";
|
|
34
33
|
/** SentencePiece's word-boundary marker (U+2581 LOWER ONE EIGHTH BLOCK). */
|
|
35
34
|
export const SPACE_SENTINEL = "▁";
|
|
36
35
|
export class MailwomanTokenizer {
|
|
@@ -44,9 +43,15 @@ export class MailwomanTokenizer {
|
|
|
44
43
|
await processor.loadFromB64StringModel(b64);
|
|
45
44
|
return new MailwomanTokenizer(processor);
|
|
46
45
|
}
|
|
47
|
-
/**
|
|
46
|
+
/**
|
|
47
|
+
* Load from a path to a `tokenizer.model` file on disk. **Node-only** — the dynamic `node:fs`
|
|
48
|
+
* import keeps this method out of the static dependency graph so the rest of the tokenizer
|
|
49
|
+
* bundles cleanly for the browser. Calling it in a browser throws at runtime; use
|
|
50
|
+
* `loadFromBase64` (or the URL-fetching loaders in `@mailwoman/neural-web`) instead.
|
|
51
|
+
*/
|
|
48
52
|
static async loadFromFile(modelPath) {
|
|
49
|
-
const
|
|
53
|
+
const { readFile } = await import(/* webpackIgnore: true */ "node:fs/promises");
|
|
54
|
+
const buf = await readFile(modelPath);
|
|
50
55
|
return MailwomanTokenizer.loadFromBase64(buf.toString("base64"));
|
|
51
56
|
}
|
|
52
57
|
/**
|
package/out/tokenizer.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAEH,OAAO,EAAE,sBAAsB,EAAE,MAAM,wBAAwB,CAAA;
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../tokenizer.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AAEH,OAAO,EAAE,sBAAsB,EAAE,MAAM,wBAAwB,CAAA;AAE/D,4EAA4E;AAC5E,MAAM,CAAC,MAAM,cAAc,GAAG,GAAG,CAAA;AAmBjC,MAAM,OAAO,kBAAkB;IACO;IAArC,YAAqC,SAAiC;QAAjC,cAAS,GAAT,SAAS,CAAwB;IAAG,CAAC;IAE1E,qFAAqF;IACrF,MAAM,CAAC,KAAK,CAAC,cAAc,CAAC,GAAW;QACtC,MAAM,SAAS,GAAG,IAAI,sBAAsB,EAAE,CAAA;QAC9C,MAAM,SAAS,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;QAC3C,OAAO,IAAI,kBAAkB,CAAC,SAAS,CAAC,CAAA;IACzC,CAAC;IAED;;;;;OAKG;IACH,MAAM,CAAC,KAAK,CAAC,YAAY,CAAC,SAAiB;QAC1C,MAAM,EAAE,QAAQ,EAAE,GAAG,MAAM,MAAM,CAAC,yBAAyB,CAAC,kBAAkB,CAAC,CAAA;QAC/E,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,SAAS,CAAC,CAAA;QACrC,OAAO,kBAAkB,CAAC,cAAc,CAAC,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC,CAAA;IACjE,CAAC;IAED;;;;;;OAMG;IACH,MAAM,CAAC,IAAY;QAClB,MAAM,MAAM,GAAG,IAAI,CAAC,SAAS,CAAC,YAAY,CAAC,IAAI,CAAC,CAAA;QAChD,MAAM,GAAG,GAAG,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,IAAI,CAAC,CAAA;QAE1C,MAAM,SAAS,GAAqB,EAAE,CAAA;QACtC,IAAI,MAAM,GAAG,CAAC,CAAA;QAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACxB,MAAM,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAA;YACvB,MAAM,WAAW,GAAG,KAAK,CAAC,UAAU,CAAC,cAAc,CAAC,CAAA;YACpD,MAAM,OAAO,GAAG,WAAW,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,KAAK,CAAA;YAExE,IAAI,WAAW,EAAE,CAAC;gBACjB,OAAO,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,CAAE,CAAC;oBAAE,MAAM,EAAE,CAAA;YAClE,CAAC;YAED,MAAM,KAAK,GAAG,MAAM,CAAA;YACpB,MAAM,IAAI,OAAO,CAAC,MAAM,CAAA;YACxB,MAAM,GAAG,GAAG,MAAM,CAAA;YAElB,SAAS,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,EAAE,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,CAAA;QAC1C,CAAC;QAED,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,GAAG,EAAE,CAAA;IAClC,CAAC;IAED,oFAAoF;IACpF,MAAM,CAAC,GAA0B;QAChC,MAAM,GAAG,GAAG,GAAG,YAAY,UAAU,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QAClE,OAAO,IAAI,CAAC,SAAS,CAAC,SAAS,CAAC,GAAG,CAAW,CAAA;IAC/C,CAAC;CACD"}
|
package/out/viterbi.d.ts
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Linear-chain CRF Viterbi decoder in TypeScript.
|
|
7
|
+
*
|
|
8
|
+
* Replaces per-token argmax in the classifier when transition scores are available. Mirrors the
|
|
9
|
+
* Python training-time / eval-time path so JS runtime decode agrees with the model card's
|
|
10
|
+
* metrics.
|
|
11
|
+
*
|
|
12
|
+
* Two transition matrix modes:
|
|
13
|
+
*
|
|
14
|
+
* 1. **Structural-only** (no weights changes required) — build from the BIO label vocabulary using
|
|
15
|
+
* `buildBioTransitionMask()`. Forbids `O → I-X`, `B-X → I-Y` and `I-X → I-Y` (X ≠ Y), and sequence-start →
|
|
16
|
+
* `I-X`. Permits everything else. This alone prevents orphan-I decoding ("Saint Petersburg →
|
|
17
|
+
* Petersburg" bug) at runtime — a strict improvement over argmax.
|
|
18
|
+
* 2. **Learned** (requires a future weights release that ships `crf-transitions.json`) — load the
|
|
19
|
+
* trained transition matrix from the model card. Adds learned soft priors on top of the
|
|
20
|
+
* structural mask. Currently not exported from the training-side ONNX bundle.
|
|
21
|
+
*/
|
|
22
|
+
/**
|
|
23
|
+
* Build the BIO structural transition mask given the label vocabulary in order.
|
|
24
|
+
*
|
|
25
|
+
* Rules:
|
|
26
|
+
*
|
|
27
|
+
* - `X → O` always permitted (0)
|
|
28
|
+
* - `X → B-Y` always permitted (0)
|
|
29
|
+
* - `X → I-Y` permitted only if `X` is `B-Y` or `I-Y` (0); otherwise -inf
|
|
30
|
+
*
|
|
31
|
+
* Returns a `numLabels × numLabels` matrix where `mask[from][to]` is the additive log-score (0 for
|
|
32
|
+
* permitted, NEG_INF for forbidden).
|
|
33
|
+
*/
|
|
34
|
+
export declare function buildBioTransitionMask(labels: readonly string[]): number[][];
|
|
35
|
+
/** Returns the per-label vector of valid start-of-sequence transitions (0 or -inf). */
|
|
36
|
+
export declare function buildBioStartMask(labels: readonly string[]): number[];
|
|
37
|
+
/**
|
|
38
|
+
* End-of-sequence transitions. By default all labels are valid endings (returns zeros). Override if
|
|
39
|
+
* the trained model has learned end transitions.
|
|
40
|
+
*/
|
|
41
|
+
export declare function buildBioEndMask(labels: readonly string[]): number[];
|
|
42
|
+
export interface ViterbiInput {
|
|
43
|
+
/** `emissions[t][k]` — log-emission for label k at timestep t. Pass raw logits or log-softmaxes. */
|
|
44
|
+
emissions: number[][];
|
|
45
|
+
/** `transitions[from][to]` — additive log-score. Use `buildBioTransitionMask` if unsure. */
|
|
46
|
+
transitions: number[][];
|
|
47
|
+
/** Per-label log-score for being the FIRST label. */
|
|
48
|
+
startTransitions?: number[];
|
|
49
|
+
/** Per-label log-score for being the LAST label. */
|
|
50
|
+
endTransitions?: number[];
|
|
51
|
+
}
|
|
52
|
+
export interface ViterbiResult {
|
|
53
|
+
/** Best label index per timestep. */
|
|
54
|
+
path: number[];
|
|
55
|
+
/** Total path score (log-prob). */
|
|
56
|
+
score: number;
|
|
57
|
+
}
|
|
58
|
+
/**
|
|
59
|
+
* Viterbi decode: find the highest-scoring label sequence under the CRF.
|
|
60
|
+
*
|
|
61
|
+
* Time: O(seq_len × num_labels²). Space: O(seq_len × num_labels) for the backpointer table.
|
|
62
|
+
*/
|
|
63
|
+
export declare function viterbi(input: ViterbiInput): ViterbiResult;
|
|
64
|
+
/**
|
|
65
|
+
* Convenience: argmax over per-token softmax (existing behavior). Provided so callers can opt in to
|
|
66
|
+
* Viterbi only when transitions are available, falling back to this cleanly.
|
|
67
|
+
*/
|
|
68
|
+
export declare function perTokenArgmax(emissions: readonly number[][]): number[];
|
|
69
|
+
/**
|
|
70
|
+
* Softmax of a logit row (returns probabilities summing to 1).
|
|
71
|
+
*
|
|
72
|
+
* Used to compute per-token confidence after Viterbi picks the label sequence — the confidence is
|
|
73
|
+
* the softmax probability of the Viterbi-chosen label at that timestep.
|
|
74
|
+
*/
|
|
75
|
+
export declare function softmax(row: readonly number[]): number[];
|
|
76
|
+
//# sourceMappingURL=viterbi.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"viterbi.d.ts","sourceRoot":"","sources":["../viterbi.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAIH;;;;;;;;;;;GAWG;AACH,wBAAgB,sBAAsB,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,GAAG,MAAM,EAAE,EAAE,CAa5E;AAED,uFAAuF;AACvF,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,GAAG,MAAM,EAAE,CAErE;AAED;;;GAGG;AACH,wBAAgB,eAAe,CAAC,MAAM,EAAE,SAAS,MAAM,EAAE,GAAG,MAAM,EAAE,CAEnE;AAYD,MAAM,WAAW,YAAY;IAC5B,oGAAoG;IACpG,SAAS,EAAE,MAAM,EAAE,EAAE,CAAA;IACrB,4FAA4F;IAC5F,WAAW,EAAE,MAAM,EAAE,EAAE,CAAA;IACvB,qDAAqD;IACrD,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAA;IAC3B,oDAAoD;IACpD,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;CACzB;AAED,MAAM,WAAW,aAAa;IAC7B,qCAAqC;IACrC,IAAI,EAAE,MAAM,EAAE,CAAA;IACd,mCAAmC;IACnC,KAAK,EAAE,MAAM,CAAA;CACb;AAED;;;;GAIG;AACH,wBAAgB,OAAO,CAAC,KAAK,EAAE,YAAY,GAAG,aAAa,CA4D1D;AAED;;;GAGG;AACH,wBAAgB,cAAc,CAAC,SAAS,EAAE,SAAS,MAAM,EAAE,EAAE,GAAG,MAAM,EAAE,CAYvE;AAED;;;;;GAKG;AACH,wBAAgB,OAAO,CAAC,GAAG,EAAE,SAAS,MAAM,EAAE,GAAG,MAAM,EAAE,CAMxD"}
|
package/out/viterbi.js
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Linear-chain CRF Viterbi decoder in TypeScript.
|
|
7
|
+
*
|
|
8
|
+
* Replaces per-token argmax in the classifier when transition scores are available. Mirrors the
|
|
9
|
+
* Python training-time / eval-time path so JS runtime decode agrees with the model card's
|
|
10
|
+
* metrics.
|
|
11
|
+
*
|
|
12
|
+
* Two transition matrix modes:
|
|
13
|
+
*
|
|
14
|
+
* 1. **Structural-only** (no weights changes required) — build from the BIO label vocabulary using
|
|
15
|
+
* `buildBioTransitionMask()`. Forbids `O → I-X`, `B-X → I-Y` and `I-X → I-Y` (X ≠ Y), and sequence-start →
|
|
16
|
+
* `I-X`. Permits everything else. This alone prevents orphan-I decoding ("Saint Petersburg →
|
|
17
|
+
* Petersburg" bug) at runtime — a strict improvement over argmax.
|
|
18
|
+
* 2. **Learned** (requires a future weights release that ships `crf-transitions.json`) — load the
|
|
19
|
+
* trained transition matrix from the model card. Adds learned soft priors on top of the
|
|
20
|
+
* structural mask. Currently not exported from the training-side ONNX bundle.
|
|
21
|
+
*/
|
|
22
|
+
/**
 * Additive log-score assigned to forbidden transitions and forbidden start labels by the mask
 * builders in this module. NOTE(review): presumably a large finite value is used instead of
 * `-Infinity` so that sums of scores never produce `NaN` — confirm against the training-side decoder.
 */
const NEG_INF = -1e9;
|
|
23
|
+
/**
 * Construct the structural BIO transition matrix for a label vocabulary.
 *
 * Each entry `[from][to]` is an additive log-score: `0` when `isValidTransition` accepts moving
 * from `labels[from]` to `labels[to]`, and `NEG_INF` when it rejects the move. In practice:
 *
 * - Any label may be followed by `O` or by any `B-Y`.
 * - `I-Y` may only follow `B-Y` or `I-Y` (same tag).
 *
 * @param labels Label vocabulary, in the order the model emits them.
 * @returns A `labels.length × labels.length` array of rows indexed `[from][to]`.
 */
export function buildBioTransitionMask(labels) {
    const size = labels.length;
    return Array.from({ length: size }, (_, src) => {
        const srcLabel = labels[src];
        const scores = new Array(size);
        for (let dst = 0; dst < size; dst++) {
            scores[dst] = isValidTransition(srcLabel, labels[dst]) ? 0 : NEG_INF;
        }
        return scores;
    });
}
|
|
49
|
+
/**
 * Start-of-sequence scores, one per label.
 *
 * A label beginning with `"I-"` gets `NEG_INF` (a sequence may not open inside a span); every other
 * label gets `0`.
 *
 * @param labels Label vocabulary, in model output order.
 * @returns An array parallel to `labels` holding `0` or `NEG_INF`.
 */
export function buildBioStartMask(labels) {
    return labels.map((label) => {
        if (label.startsWith("I-")) {
            return NEG_INF;
        }
        return 0;
    });
}
|
|
53
|
+
/**
 * End-of-sequence scores, one per label.
 *
 * Structurally every label is an acceptable final label, so the result is all zeros. Callers with
 * learned end transitions should supply their own vector instead of using this one.
 *
 * @param labels Label vocabulary; only its length matters here.
 * @returns An array of `labels.length` zeros.
 */
export function buildBioEndMask(labels) {
    return Array.from(labels, () => 0);
}
|
|
60
|
+
/**
 * Decide whether the BIO scheme allows label `to` to directly follow label `from`.
 *
 * Only continuation labels (`I-<tag>`) are constrained: they must follow `B-<tag>` or `I-<tag>`
 * with the same tag. `O`, any `B-*` label, and any label outside the BIO scheme are always
 * accepted as the destination.
 *
 * @param from Label at the previous position.
 * @param to Label at the current position.
 * @returns `true` when the transition is structurally permitted.
 */
function isValidTransition(from, to) {
    const continuesSpan = to.startsWith("I-");
    if (!continuesSpan) {
        return true;
    }
    const tag = to.slice(2);
    const opensSameSpan = from === `B-${tag}`;
    const insideSameSpan = from === `I-${tag}`;
    return opensSameSpan || insideSameSpan;
}
|
|
71
|
+
/**
 * Viterbi decode: find the highest-scoring label sequence under a linear-chain CRF.
 *
 * The score of a path is `start[y0] + Σ emissions[t][yt] + Σ transitions[y(t-1)][yt] + end[yT-1]`.
 *
 * Time: O(seq_len × num_labels²). Space: O(seq_len × num_labels) for the backpointer table (only
 * the previous row of scores is retained).
 *
 * @param input Decoder input:
 *   - `emissions`: one row of per-label scores per timestep.
 *   - `transitions`: matrix indexed `[from][to]`.
 *   - `startTransitions` / `endTransitions`: optional per-label vectors, defaulting to all zeros.
 * @returns `path` (one label index per timestep) and `score` (the total score of that path). An
 *   empty `emissions` array yields `{ path: [], score: 0 }`.
 */
export function viterbi(input) {
    const { emissions, transitions } = input;
    const T = emissions.length;
    if (T === 0)
        return { path: [], score: 0 };
    const numLabels = emissions[0].length;
    const startTrans = input.startTransitions ?? new Array(numLabels).fill(0);
    const endTrans = input.endTransitions ?? new Array(numLabels).fill(0);
    // prev[k] = best score of any path ending at (previous timestep, label k).
    let prev = new Array(numLabels);
    for (let k = 0; k < numLabels; k++) {
        prev[k] = startTrans[k] + emissions[0][k];
    }
    // back[t][k] = label at t-1 on the best path that ends at (t, k); row 0 is a -1 placeholder.
    const back = [new Array(numLabels).fill(-1)];
    for (let t = 1; t < T; t++) {
        const emit = emissions[t];
        const cur = new Array(numLabels);
        const ptr = new Array(numLabels);
        for (let k = 0; k < numLabels; k++) {
            // Seed with -Infinity rather than a finite sentinel: scores at or below the sentinel
            // (e.g. two stacked forbidden transitions, or -Infinity masks) must still be compared
            // on their real values, otherwise the score is clamped and the backpointer defaults to 0.
            let bestScore = -Infinity;
            let bestPrev = 0;
            for (let j = 0; j < numLabels; j++) {
                const s = prev[j] + transitions[j][k];
                if (s > bestScore) {
                    bestScore = s;
                    bestPrev = j;
                }
            }
            cur[k] = bestScore + emit[k];
            ptr[k] = bestPrev;
        }
        back.push(ptr);
        prev = cur;
    }
    // Pick the best ending state (ties resolve to the lowest label index).
    let bestEndScore = -Infinity;
    let bestEnd = 0;
    for (let k = 0; k < numLabels; k++) {
        const s = prev[k] + endTrans[k];
        if (s > bestEndScore) {
            bestEndScore = s;
            bestEnd = k;
        }
    }
    // Trace back through the backpointers from the final label to the first.
    const path = new Array(T);
    path[T - 1] = bestEnd;
    for (let t = T - 1; t > 0; t--) {
        path[t - 1] = back[t][path[t]];
    }
    return { path, score: bestEndScore };
}
|
|
131
|
+
/**
 * Independent per-token decode: for each emission row, return the index of its largest score.
 *
 * This is the fallback for callers that have no transition scores to feed to Viterbi. Ties go to
 * the lowest index, and an empty row yields index `0`.
 *
 * @param emissions One row of per-label scores per timestep.
 * @returns One label index per timestep.
 */
export function perTokenArgmax(emissions) {
    const winners = [];
    for (const scores of emissions) {
        let top = 0;
        for (let idx = 1; idx < scores.length; idx++) {
            if (scores[idx] > scores[top]) {
                top = idx;
            }
        }
        winners.push(top);
    }
    return winners;
}
|
|
148
|
+
/**
 * Numerically stable softmax over one row of logits.
 *
 * The row maximum is subtracted before exponentiating so large logits do not overflow. The result
 * is used as per-token confidence: the probability of whichever label the decoder chose at that
 * timestep.
 *
 * @param row Logits for a single timestep.
 * @returns Probabilities in the same order as `row`, summing to 1.
 */
export function softmax(row) {
    let peak = row[0];
    for (let i = 1; i < row.length; i++) {
        const candidate = row[i];
        if (candidate > peak) {
            peak = candidate;
        }
    }
    const weights = row.map((logit) => Math.exp(logit - peak));
    let total = 0;
    for (let i = 0; i < weights.length; i++) {
        total += weights[i];
    }
    return weights.map((w) => w / total);
}
|
|
163
|
+
//# sourceMappingURL=viterbi.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"viterbi.js","sourceRoot":"","sources":["../viterbi.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,MAAM,OAAO,GAAG,CAAC,GAAG,CAAA;AAEpB;;;;;;;;;;;GAWG;AACH,MAAM,UAAU,sBAAsB,CAAC,MAAyB;IAC/D,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,IAAI,GAAe,EAAE,CAAA;IAC3B,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,GAAG,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC;QACrC,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAA;QAChC,MAAM,SAAS,GAAG,MAAM,CAAC,IAAI,CAAE,CAAA;QAC/B,KAAK,IAAI,EAAE,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,EAAE,EAAE,EAAE,EAAE,CAAC;YAC/B,MAAM,OAAO,GAAG,MAAM,CAAC,EAAE,CAAE,CAAA;YAC3B,GAAG,CAAC,EAAE,CAAC,GAAG,iBAAiB,CAAC,SAAS,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAA;QAC9D,CAAC;QACD,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACf,CAAC;IACD,OAAO,IAAI,CAAA;AACZ,CAAC;AAED,uFAAuF;AACvF,MAAM,UAAU,iBAAiB,CAAC,MAAyB;IAC1D,OAAO,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;AAC7D,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,eAAe,CAAC,MAAyB;IACxD,OAAO,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;AAC3B,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY,EAAE,EAAU;IAClD,IAAI,EAAE,KAAK,GAAG;QAAE,OAAO,IAAI,CAAA;IAC3B,IAAI,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC;QAAE,OAAO,IAAI,CAAA;IACpC,IAAI,EAAE,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,CAAC;QACzB,MAAM,GAAG,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,CAAC,CAAA;QACvB,OAAO,IAAI,KAAK,KAAK,GAAG,EAAE,IAAI,IAAI,KAAK,KAAK,GAAG,EAAE,CAAA;IAClD,CAAC;IACD,OAAO,IAAI,CAAA;AACZ,CAAC;AAoBD;;;;GAIG;AACH,MAAM,UAAU,OAAO,CAAC,KAAmB;IAC1C,MAAM,EAAE,SAAS,EAAE,WAAW,EAAE,GAAG,KAAK,CAAA;IACxC,MAAM,CAAC,GAAG,SAAS,CAAC,MAAM,CAAA;IAC1B,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,EAAE,IAAI,EAAE,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,CAAA;IAE1C,MAAM,SAAS,GAAG,SAAS,CAAC,CAAC,CAAE,CAAC,MAAM,CAAA;IACtC,MAAM,UAAU,GAAG,KAAK,CAAC,gBAAgB,IAAI,IAAI,KAAK,CAAS,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACjF,MAAM,QAAQ,GAAG,KAAK,CAAC,cAAc,IAAI,IAAI,KAAK,CAAS,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IAE7E,4DAA4D;IAC5D,MAAM,EAAE,GAAe,EAAE,CAAA;IACzB,MAAM,IAAI,G
AAe,EAAE,CAAA;IAE3B,QAAQ;IACR,MAAM,KAAK,GAAG,IAAI,KAAK,CAAS,SAAS,CAAC,CAAA;IAC1C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,KAAK,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC,CAAE,GAAG,SAAS,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;IAC9C,CAAC;IACD,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACd,IAAI,CAAC,IAAI,CAAC,IAAI,KAAK,CAAS,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAEhD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,SAAS,CAAC,CAAA;QACxC,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,SAAS,CAAC,CAAA;QACxC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;YACpC,IAAI,SAAS,GAAG,OAAO,CAAA;YACvB,IAAI,QAAQ,GAAG,CAAC,CAAA;YAChB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;gBACpC,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,WAAW,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;gBAC9C,IAAI,CAAC,GAAG,SAAS,EAAE,CAAC;oBACnB,SAAS,GAAG,CAAC,CAAA;oBACb,QAAQ,GAAG,CAAC,CAAA;gBACb,CAAC;YACF,CAAC;YACD,GAAG,CAAC,CAAC,CAAC,GAAG,SAAS,GAAG,SAAS,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;YACtC,GAAG,CAAC,CAAC,CAAC,GAAG,QAAQ,CAAA;QAClB,CAAC;QACD,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QACZ,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACf,CAAC;IAED,8BAA8B;IAC9B,IAAI,YAAY,GAAG,OAAO,CAAA;IAC1B,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,MAAM,CAAC,GAAG,EAAE,CAAC,CAAC,GAAG,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,QAAQ,CAAC,CAAC,CAAE,CAAA;QACvC,IAAI,CAAC,GAAG,YAAY,EAAE,CAAC;YACtB,YAAY,GAAG,CAAC,CAAA;YAChB,OAAO,GAAG,CAAC,CAAA;QACZ,CAAC;IACF,CAAC;IAED,cAAc;IACd,MAAM,IAAI,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAA;IACjC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,OAAO,CAAA;IACrB,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAChC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,CAAE,CAAC,IAAI,CAAC,CAAC,CAAE,CAAE,CAAA;IAClC,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,KAAK,EAAE,YAAY,EAAE,CAAA;AACrC,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,cAAc,CAAC,SAA8B;IAC5D,OAAO,SAA
S,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE;QAC5B,IAAI,OAAO,GAAG,CAAC,CAAA;QACf,IAAI,OAAO,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;QACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACrC,IAAI,GAAG,CAAC,CAAC,CAAE,GAAG,OAAO,EAAE,CAAC;gBACvB,OAAO,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;gBACjB,OAAO,GAAG,CAAC,CAAA;YACZ,CAAC;QACF,CAAC;QACD,OAAO,OAAO,CAAA;IACf,CAAC,CAAC,CAAA;AACH,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,OAAO,CAAC,GAAsB;IAC7C,IAAI,GAAG,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;IACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,IAAI,GAAG,CAAC,CAAC,CAAE,GAAG,GAAG;YAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;IACrE,MAAM,IAAI,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAA;IAC9C,MAAM,GAAG,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,CAAA;IAC3C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,GAAG,CAAC,CAAA;AAChC,CAAC"}
|
package/out/weights.d.ts
CHANGED
|
@@ -32,8 +32,26 @@ export interface ResolveWeightsOpts {
|
|
|
32
32
|
export interface ResolvedWeights {
|
|
33
33
|
modelPath: string;
|
|
34
34
|
tokenizerPath: string;
|
|
35
|
+
/**
|
|
36
|
+
* Path to `model-card.json` alongside the resolved model. `undefined` when the caller passed
|
|
37
|
+
* explicit paths or when the package directory has no card on disk. Read by `loadFromWeights` to
|
|
38
|
+
* thread the trained label vocabulary into the classifier — see {@link readLabelsFromModelCard}.
|
|
39
|
+
*/
|
|
40
|
+
modelCardPath?: string;
|
|
35
41
|
/** "explicit" if both paths came from opts; "package:<name>" if resolved via require.resolve. */
|
|
36
42
|
source: string;
|
|
37
43
|
}
|
|
38
44
|
export declare function resolveWeights(opts: ResolveWeightsOpts): ResolvedWeights;
|
|
45
|
+
/**
|
|
46
|
+
* Read the `labels` array from a `model-card.json` file. Returns `undefined` when the file is
|
|
47
|
+
* missing, unreadable, malformed, or has no `labels` field — callers should fall back to their
|
|
48
|
+
* compile-time default in that case (the loader contract: the JS-side default tracks the most
|
|
49
|
+
* recent shipped stage, so a card without `labels` is always a pre-v0.4.0 card whose label vocab
|
|
50
|
+
* matches that default by construction).
|
|
51
|
+
*
|
|
52
|
+
* Validates shape: must be a non-empty array of strings. Throws on a present-but-malformed `labels`
|
|
53
|
+
* field — a card that emits e.g. `labels: 21` rather than `labels: [...]` is a corrupted artifact
|
|
54
|
+
* and should be loud, not silently re-defaulted.
|
|
55
|
+
*/
|
|
56
|
+
export declare function readLabelsFromModelCard(modelCardPath: string | undefined): readonly string[] | undefined;
|
|
39
57
|
//# sourceMappingURL=weights.d.ts.map
|
package/out/weights.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"weights.d.ts","sourceRoot":"","sources":["../weights.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAQH,MAAM,WAAW,kBAAkB;IAClC,wFAAwF;IACxF,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,4EAA4E;IAC5E,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,iFAAiF;IACjF,aAAa,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,MAAM,WAAW,eAAe;IAC/B,SAAS,EAAE,MAAM,CAAA;IACjB,aAAa,EAAE,MAAM,CAAA;IACrB,iGAAiG;IACjG,MAAM,EAAE,MAAM,CAAA;CACd;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,kBAAkB,GAAG,eAAe,
|
|
1
|
+
{"version":3,"file":"weights.d.ts","sourceRoot":"","sources":["../weights.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAQH,MAAM,WAAW,kBAAkB;IAClC,wFAAwF;IACxF,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,4EAA4E;IAC5E,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,iFAAiF;IACjF,aAAa,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,MAAM,WAAW,eAAe;IAC/B,SAAS,EAAE,MAAM,CAAA;IACjB,aAAa,EAAE,MAAM,CAAA;IACrB;;;;OAIG;IACH,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,iGAAiG;IACjG,MAAM,EAAE,MAAM,CAAA;CACd;AAED,wBAAgB,cAAc,CAAC,IAAI,EAAE,kBAAkB,GAAG,eAAe,CA0CxE;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,uBAAuB,CAAC,aAAa,EAAE,MAAM,GAAG,SAAS,GAAG,SAAS,MAAM,EAAE,GAAG,SAAS,CAwBxG"}
|
package/out/weights.js
CHANGED
|
@@ -21,7 +21,7 @@
|
|
|
21
21
|
* The resolver checks for both files and throws a single actionable error when neither is findable,
|
|
22
22
|
* naming all the paths it tried.
|
|
23
23
|
*/
|
|
24
|
-
import { existsSync } from "node:fs";
|
|
24
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
25
25
|
import { createRequire } from "node:module";
|
|
26
26
|
import { dirname, resolve } from "node:path";
|
|
27
27
|
const req = createRequire(import.meta.url);
|
|
@@ -34,7 +34,10 @@ export function resolveWeights(opts) {
|
|
|
34
34
|
throw new Error(`Explicit tokenizerPath does not exist: ${opts.tokenizerPath}`);
|
|
35
35
|
return { modelPath: opts.modelPath, tokenizerPath: opts.tokenizerPath, source: "explicit" };
|
|
36
36
|
}
|
|
37
|
-
|
|
37
|
+
// Package names follow the all-lowercase BCP-47 convention (`neural-weights-en-us`,
|
|
38
|
+
// `neural-weights-fr-fr`). The CLI's locale validation accepts canonical `en-US` / `fr-FR`
|
|
39
|
+
// casing, so we normalize here rather than at the callsite.
|
|
40
|
+
const locale = (opts.locale ?? "en-us").toLowerCase();
|
|
38
41
|
const packageName = `@mailwoman/neural-weights-${locale}`;
|
|
39
42
|
let packageDir;
|
|
40
43
|
try {
|
|
@@ -54,6 +57,47 @@ export function resolveWeights(opts) {
|
|
|
54
57
|
`Run \`scripts/link-dev-weights.sh\` inside the package to symlink dev weights, ` +
|
|
55
58
|
`or pass --model + --tokenizer with explicit paths.`);
|
|
56
59
|
}
|
|
57
|
-
|
|
60
|
+
const modelCardCandidate = resolve(packageDir, "model-card.json");
|
|
61
|
+
const modelCardPath = existsSync(modelCardCandidate) ? modelCardCandidate : undefined;
|
|
62
|
+
return { modelPath, tokenizerPath, modelCardPath, source: `package:${packageName}` };
|
|
63
|
+
}
|
|
64
|
+
/**
 * Load the trained label vocabulary from a `model-card.json` file.
 *
 * Returns `undefined` when no path is given, the file does not exist, it cannot be read, it is not
 * valid JSON, the parsed value is not an object, or the object has no `labels` field. Callers are
 * expected to fall back to their compile-time default vocabulary in those cases.
 *
 * A `labels` field that is present but not a non-empty array of strings is treated as a corrupted
 * artifact and reported loudly rather than silently replaced by the default.
 *
 * @param modelCardPath Path to the model card, or `undefined` when none was resolved.
 * @returns A frozen copy of the `labels` array, or `undefined` as described above.
 * @throws {Error} When `labels` is present but is not a non-empty array of strings.
 */
export function readLabelsFromModelCard(modelCardPath) {
    if (!(modelCardPath && existsSync(modelCardPath))) {
        return undefined;
    }
    let card;
    try {
        // Read and parse together: either failure means "no usable card".
        card = JSON.parse(readFileSync(modelCardPath, "utf8"));
    }
    catch {
        return undefined;
    }
    if (card === null || typeof card !== "object") {
        return undefined;
    }
    const { labels } = card;
    if (labels === undefined) {
        return undefined;
    }
    const wellFormed = Array.isArray(labels) &&
        labels.length > 0 &&
        labels.every((entry) => typeof entry === "string");
    if (!wellFormed) {
        throw new Error(`model-card.json at ${modelCardPath} has a malformed \`labels\` field — ` +
            `expected a non-empty array of strings, got ${JSON.stringify(labels)}.`);
    }
    // Hand back a frozen copy so callers cannot mutate the vocabulary.
    return Object.freeze([...labels]);
}
|
|
59
103
|
//# sourceMappingURL=weights.js.map
|
package/out/weights.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"weights.js","sourceRoot":"","sources":["../weights.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,EAAE,UAAU,EAAE,MAAM,SAAS,CAAA;
|
|
1
|
+
{"version":3,"file":"weights.js","sourceRoot":"","sources":["../weights.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,EAAE,UAAU,EAAE,YAAY,EAAE,MAAM,SAAS,CAAA;AAClD,OAAO,EAAE,aAAa,EAAE,MAAM,aAAa,CAAA;AAC3C,OAAO,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,WAAW,CAAA;AAE5C,MAAM,GAAG,GAAG,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;AAwB1C,MAAM,UAAU,cAAc,CAAC,IAAwB;IACtD,MAAM,KAAK,GAAa,EAAE,CAAA;IAE1B,IAAI,IAAI,CAAC,SAAS,IAAI,IAAI,CAAC,aAAa,EAAE,CAAC;QAC1C,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,SAAS,CAAC;YAAE,MAAM,IAAI,KAAK,CAAC,sCAAsC,IAAI,CAAC,SAAS,EAAE,CAAC,CAAA;QACxG,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,aAAa,CAAC;YAAE,MAAM,IAAI,KAAK,CAAC,0CAA0C,IAAI,CAAC,aAAa,EAAE,CAAC,CAAA;QACpH,OAAO,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,aAAa,EAAE,IAAI,CAAC,aAAa,EAAE,MAAM,EAAE,UAAU,EAAE,CAAA;IAC5F,CAAC;IAED,oFAAoF;IACpF,2FAA2F;IAC3F,4DAA4D;IAC5D,MAAM,MAAM,GAAG,CAAC,IAAI,CAAC,MAAM,IAAI,OAAO,CAAC,CAAC,WAAW,EAAE,CAAA;IACrD,MAAM,WAAW,GAAG,6BAA6B,MAAM,EAAE,CAAA;IACzD,IAAI,UAAkB,CAAA;IACtB,IAAI,CAAC;QACJ,MAAM,WAAW,GAAG,GAAG,CAAC,OAAO,CAAC,GAAG,WAAW,eAAe,CAAC,CAAA;QAC9D,UAAU,GAAG,OAAO,CAAC,WAAW,CAAC,CAAA;IAClC,CAAC;IAAC,MAAM,CAAC;QACR,MAAM,IAAI,KAAK,CACd,qBAAqB,WAAW,iCAAiC,WAAW,IAAI;YAC/E,oDAAoD,CACrD,CAAA;IACF,CAAC;IAED,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,OAAO,CAAC,UAAU,EAAE,YAAY,CAAC,CAAA;IACrE,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,OAAO,CAAC,UAAU,EAAE,iBAAiB,CAAC,CAAA;IAClF,KAAK,CAAC,IAAI,CAAC,SAAS,EAAE,aAAa,CAAC,CAAA;IAEpC,IAAI,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC,EAAE,CAAC;QAC1D,MAAM,IAAI,KAAK,CACd,mBAAmB,WAAW,gBAAgB,UAAU,gCAAgC;YACvF,aAAa,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,IAAI;YACnC,iFAAiF;YACjF,oDAAoD,CACrD,CAAA;IACF,CAAC;IAED,MAAM,kBAAkB,GAAG,OAAO,CAAC,UAAU,EAAE,iBAAiB,CAAC,CAAA;IACjE,MAAM,aAAa,GAAG,UAAU,CAAC,kBAAkB,CAAC,CAAC,CAAC,CAAC,kBAAkB,CAAC,CAAC,CAAC,SAAS,CAAA;IAErF,OAAO,EAAE,SAAS,EAAE,aAAa,EAAE,aAAa,EAAE,MAAM,EAAE,WAAW,WAAW,EAAE,EAAE,CAAA;AACrF,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,uBAAuB,CAAC,aAAiC;IACxE,IAAI,CAAC,aAAa,IAAI,CAAC,UAAU,CAAC,aAAa,CAAC;QAAE,OAAO,SAAS,CAAA;IAC
lE,IAAI,GAAW,CAAA;IACf,IAAI,CAAC;QACJ,GAAG,GAAG,YAAY,CAAC,aAAa,EAAE,MAAM,CAAC,CAAA;IAC1C,CAAC;IAAC,MAAM,CAAC;QACR,OAAO,SAAS,CAAA;IACjB,CAAC;IACD,IAAI,MAAe,CAAA;IACnB,IAAI,CAAC;QACJ,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAA;IACzB,CAAC;IAAC,MAAM,CAAC;QACR,OAAO,SAAS,CAAA;IACjB,CAAC;IACD,IAAI,OAAO,MAAM,KAAK,QAAQ,IAAI,MAAM,KAAK,IAAI;QAAE,OAAO,SAAS,CAAA;IACnE,MAAM,MAAM,GAAI,MAA+B,CAAC,MAAM,CAAA;IACtD,IAAI,MAAM,KAAK,SAAS;QAAE,OAAO,SAAS,CAAA;IAC1C,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC,EAAE,CAAC;QAClG,MAAM,IAAI,KAAK,CACd,sBAAsB,aAAa,sCAAsC;YACxE,8CAA8C,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,GAAG,CACxE,CAAA;IACF,CAAC;IACD,OAAO,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,KAAK,EAAE,CAAsB,CAAA;AAC1D,CAAC"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mailwoman/neural",
|
|
3
|
-
"version": "2.0
|
|
3
|
+
"version": "2.2.0",
|
|
4
4
|
"description": "Mailwoman neural classifier runtime: SentencePiece tokenizer + ONNX inference + decoder wiring.",
|
|
5
5
|
"license": "AGPL-3.0-only",
|
|
6
6
|
"repository": {
|
|
@@ -12,10 +12,12 @@
|
|
|
12
12
|
"exports": {
|
|
13
13
|
"./package.json": "./package.json",
|
|
14
14
|
".": "./out/index.js",
|
|
15
|
-
"./tokenizer": "./out/tokenizer.js"
|
|
15
|
+
"./tokenizer": "./out/tokenizer.js",
|
|
16
|
+
"./weights": "./out/weights.js",
|
|
17
|
+
"./browser": "./out/browser.js"
|
|
16
18
|
},
|
|
17
19
|
"dependencies": {
|
|
18
|
-
"@mailwoman/core": "2.0
|
|
20
|
+
"@mailwoman/core": "2.2.0",
|
|
19
21
|
"@sctg/sentencepiece-js": "^1.3.3",
|
|
20
22
|
"onnxruntime-node": "^1.26.0"
|
|
21
23
|
},
|
|
@@ -25,6 +27,7 @@
|
|
|
25
27
|
"out/**/*.d.ts",
|
|
26
28
|
"out/**/*.d.ts.map"
|
|
27
29
|
],
|
|
30
|
+
"sideEffects": false,
|
|
28
31
|
"publishConfig": {
|
|
29
32
|
"access": "public"
|
|
30
33
|
}
|