@mailwoman/neural 2.1.0 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +57 -0
- package/out/anchor-inference.d.ts.map +1 -0
- package/out/anchor-inference.js +94 -0
- package/out/anchor-inference.js.map +1 -0
- package/out/browser.d.ts +18 -0
- package/out/browser.d.ts.map +1 -0
- package/out/browser.js +19 -0
- package/out/browser.js.map +1 -0
- package/out/classifier.d.ts +145 -11
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +185 -20
- package/out/classifier.js.map +1 -1
- package/out/fst-prior.d.ts +71 -0
- package/out/fst-prior.d.ts.map +1 -0
- package/out/fst-prior.js +173 -0
- package/out/fst-prior.js.map +1 -0
- package/out/index.d.ts +7 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +5 -0
- package/out/index.js.map +1 -1
- package/out/labels.d.ts +30 -6
- package/out/labels.d.ts.map +1 -1
- package/out/labels.js +43 -6
- package/out/labels.js.map +1 -1
- package/out/onnx-runner.d.ts +8 -1
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +31 -1
- package/out/onnx-runner.js.map +1 -1
- package/out/postcode-anchor.d.ts +117 -0
- package/out/postcode-anchor.d.ts.map +1 -0
- package/out/postcode-anchor.js +269 -0
- package/out/postcode-anchor.js.map +1 -0
- package/out/postcode-binary-resolver.d.ts +60 -0
- package/out/postcode-binary-resolver.d.ts.map +1 -0
- package/out/postcode-binary-resolver.js +208 -0
- package/out/postcode-binary-resolver.js.map +1 -0
- package/out/postcode-repair.d.ts +65 -0
- package/out/postcode-repair.d.ts.map +1 -0
- package/out/postcode-repair.js +171 -0
- package/out/postcode-repair.js.map +1 -0
- package/out/proposal-classifier.d.ts +5 -1
- package/out/proposal-classifier.d.ts.map +1 -1
- package/out/proposal-classifier.js +5 -3
- package/out/proposal-classifier.js.map +1 -1
- package/out/query-shape-prior.d.ts +74 -0
- package/out/query-shape-prior.d.ts.map +1 -0
- package/out/query-shape-prior.js +223 -0
- package/out/query-shape-prior.js.map +1 -0
- package/out/street-morphology-prior.d.ts +56 -0
- package/out/street-morphology-prior.d.ts.map +1 -0
- package/out/street-morphology-prior.js +159 -0
- package/out/street-morphology-prior.js.map +1 -0
- package/out/tokenizer.d.ts +6 -1
- package/out/tokenizer.d.ts.map +1 -1
- package/out/tokenizer.js +8 -3
- package/out/tokenizer.js.map +1 -1
- package/out/unit-repair.d.ts +46 -0
- package/out/unit-repair.d.ts.map +1 -0
- package/out/unit-repair.js +147 -0
- package/out/unit-repair.js.map +1 -0
- package/out/viterbi.d.ts +76 -0
- package/out/viterbi.d.ts.map +1 -0
- package/out/viterbi.js +163 -0
- package/out/viterbi.js.map +1 -0
- package/out/vitest.config.d.ts.map +1 -1
- package/out/vitest.config.js +3 -0
- package/out/vitest.config.js.map +1 -1
- package/out/weights.d.ts +42 -0
- package/out/weights.d.ts.map +1 -1
- package/out/weights.js +92 -4
- package/out/weights.js.map +1 -1
- package/package.json +10 -3
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
|
|
7
|
+
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At inference
|
|
8
|
+
* the model conditions on per-piece anchor features fed alongside `input_ids`; this builds them from
|
|
9
|
+
* a raw address + its SentencePiece pieces, using the SAME postcode→anchor lookup the model trained
|
|
10
|
+
* against (`scripts/build-pilot-anchor-lookup.py`), so the feature layout matches byte-for-byte.
|
|
11
|
+
*
|
|
12
|
+
* The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
|
|
13
|
+
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values emitted
|
|
14
|
+
* by the Python `anchor_feature_vector` — any drift fails the test.
|
|
15
|
+
*/
|
|
16
|
+
import type { TokenizedPiece } from "./tokenizer.js";
|
|
17
|
+
/**
|
|
18
|
+
* The locale class order — MUST match Python `mailwoman_train/labels.py::LOCALE_COUNTRIES`. The
|
|
19
|
+
* posterior occupies indices `[0, LOCALE_ORDER.length)`; the normalized centroid the last two.
|
|
20
|
+
* (Pinned by the test; do not reorder.)
|
|
21
|
+
*/
|
|
22
|
+
export declare const LOCALE_ORDER: readonly ["US", "FR", "DE", "CA", "GB", "JP", "ES", "IT", "NL"];
|
|
23
|
+
/** Anchor feature width = posterior over the locale set + a 2-d centroid. */
|
|
24
|
+
export declare const ANCHOR_FEATURE_DIM: number;
|
|
25
|
+
/** One postcode's anchor record (from the pilot lookup): country posterior + a single centroid. */
|
|
26
|
+
export interface AnchorEntry {
|
|
27
|
+
posterior: Record<string, number>;
|
|
28
|
+
lat: number;
|
|
29
|
+
lon: number;
|
|
30
|
+
}
|
|
31
|
+
export type AnchorLookup = Map<string, AnchorEntry>;
|
|
32
|
+
/**
|
|
33
|
+
* Build the fixed-width anchor feature vector — the exact mirror of Python `anchor_feature_vector`:
|
|
34
|
+
* the country posterior laid out in {@linkcode LOCALE_ORDER} order (renormalized over the in-set mass) + a
|
|
35
|
+
* normalized centroid (`lat/90`, `lon/180` ∈ [-1, 1]).
|
|
36
|
+
*/
|
|
37
|
+
export declare function anchorFeatureVector(posterior: Record<string, number>, lat: number, lon: number): number[];
|
|
38
|
+
/**
|
|
39
|
+
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map. Pure
|
|
40
|
+
* (takes the parsed object, not a path) so this module stays browser-safe — the file read lives in
|
|
41
|
+
* the Node-side caller (the eval).
|
|
42
|
+
*/
|
|
43
|
+
export declare function parseAnchorLookup(raw: Record<string, [Record<string, number>, number, number]>): AnchorLookup;
|
|
44
|
+
/**
|
|
45
|
+
* Per-piece anchor features + confidence for `text`, projected onto its SP `pieces` by the SAME
|
|
46
|
+
* char→piece rule the labels use (a piece takes the anchor of the postcode span its first
|
|
47
|
+
* non-whitespace char falls inside) — so the anchor lands on exactly the postcode's sub-tokens.
|
|
48
|
+
*
|
|
49
|
+
* Postcode spans are the alphanumeric runs in `text` that the lookup recognizes (gold-equivalent on
|
|
50
|
+
* clean rendered addresses); a recognized span yields a confidence-1.0 anchor, like training's
|
|
51
|
+
* gold-span. Returns `(pieces × ANCHOR_FEATURE_DIM)` features + `(pieces,)` confidence.
|
|
52
|
+
*/
|
|
53
|
+
export declare function buildAnchorFeatures(text: string, pieces: ReadonlyArray<TokenizedPiece>, lookup: AnchorLookup): {
|
|
54
|
+
features: number[][];
|
|
55
|
+
confidence: number[];
|
|
56
|
+
};
|
|
57
|
+
//# sourceMappingURL=anchor-inference.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"anchor-inference.d.ts","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAEpD;;;;GAIG;AACH,eAAO,MAAM,YAAY,iEAAkE,CAAA;AAE3F,6EAA6E;AAC7E,eAAO,MAAM,kBAAkB,QAA0B,CAAA;AAEzD,mGAAmG;AACnG,MAAM,WAAW,WAAW;IAC3B,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACjC,GAAG,EAAE,MAAM,CAAA;IACX,GAAG,EAAE,MAAM,CAAA;CACX;AAED,MAAM,MAAM,YAAY,GAAG,GAAG,CAAC,MAAM,EAAE,WAAW,CAAC,CAAA;AAEnD;;;;GAIG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE,CAgBzG;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,GAAG,YAAY,CAI7G;AAED;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CAClC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,aAAa,CAAC,cAAc,CAAC,EACrC,MAAM,EAAE,YAAY,GAClB;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,CA0BhD"}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
|
|
7
|
+
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At inference
|
|
8
|
+
* the model conditions on per-piece anchor features fed alongside `input_ids`; this builds them from
|
|
9
|
+
* a raw address + its SentencePiece pieces, using the SAME postcode→anchor lookup the model trained
|
|
10
|
+
* against (`scripts/build-pilot-anchor-lookup.py`), so the feature layout matches byte-for-byte.
|
|
11
|
+
*
|
|
12
|
+
* The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
|
|
13
|
+
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values emitted
|
|
14
|
+
* by the Python `anchor_feature_vector` — any drift fails the test.
|
|
15
|
+
*/
|
|
16
|
+
/**
|
|
17
|
+
* The locale class order — MUST match Python `mailwoman_train/labels.py::LOCALE_COUNTRIES`. The
|
|
18
|
+
* posterior occupies indices `[0, LOCALE_ORDER.length)`; the normalized centroid the last two.
|
|
19
|
+
* (Pinned by the test; do not reorder.)
|
|
20
|
+
*/
|
|
21
|
+
// Runtime note: the `readonly` tuple typing exists only in the .d.ts; in this compiled module the
// value is a plain, mutable array. anchorFeatureVector() uses each code's index in this array as
// its feature column, so it must never be mutated or reordered.
export const LOCALE_ORDER = ["US", "FR", "DE", "CA", "GB", "JP", "ES", "IT", "NL"];
|
|
22
|
+
/** Anchor feature width = posterior over the locale set + a 2-d centroid. */
|
|
23
|
+
// With the nine locale codes in LOCALE_ORDER this evaluates to 11: one posterior slot per locale,
// followed by the normalized-latitude slot and the normalized-longitude slot.
export const ANCHOR_FEATURE_DIM = LOCALE_ORDER.length + 2;
|
|
24
|
+
/**
 * Assemble the fixed-width anchor feature row for a single postcode (documented as the exact
 * mirror of Python `anchor_feature_vector`).
 *
 * Layout of the returned row:
 * - slots `[0, LOCALE_ORDER.length)`: the country posterior in {@linkcode LOCALE_ORDER} order,
 *   divided by the summed in-set weight when that sum is positive (left as zeros otherwise);
 * - the last two slots: `lat / 90` and `lon / 180`, each clamped to [-1, 1].
 *
 * @param posterior Country code → weight. Codes are upper-cased before matching; codes that are
 *   not in `LOCALE_ORDER` are ignored and do not count toward the normalizing sum.
 * @param lat Centroid latitude in degrees.
 * @param lon Centroid longitude in degrees.
 * @returns A freshly allocated array of length `ANCHOR_FEATURE_DIM`.
 */
export function anchorFeatureVector(posterior, lat, lon) {
    const localeCount = LOCALE_ORDER.length;
    const row = new Array(ANCHOR_FEATURE_DIM).fill(0);
    let inSetMass = 0;
    Object.entries(posterior).forEach(([code, mass]) => {
        const slot = LOCALE_ORDER.indexOf(code.toUpperCase());
        if (slot < 0)
            return; // country outside the locale set: contributes nothing
        row[slot] = mass;
        inSetMass += mass;
    });
    // Renormalize over the mass that landed inside the locale set only.
    if (inSetMass > 0) {
        for (let slot = 0; slot < localeCount; slot++)
            row[slot] = row[slot] / inSetMass;
    }
    const clampToUnit = (value) => Math.max(-1, Math.min(1, value));
    row[localeCount] = clampToUnit(lat / 90);
    row[localeCount + 1] = clampToUnit(lon / 180);
    return row;
}
|
|
47
|
+
/**
 * Turn the already-parsed pilot postcode→anchor JSON (`{postcode: [posterior, lat, lon]}`) into a
 * `Map` from postcode to `{ posterior, lat, lon }`.
 *
 * The function takes the parsed object rather than a file path, so it performs no I/O and the
 * module stays browser-safe; reading the file is the caller's job.
 *
 * Keys are stored exactly as given. Note that `buildAnchorFeatures` looks tokens up after
 * upper-casing an alphanumeric run, so only keys of that shape can ever be matched by it.
 *
 * @param raw Parsed lookup object; each value is a `[posterior, lat, lon]` triple.
 * @returns A new `Map` with one entry per own enumerable key of `raw`, in the same order.
 */
export function parseAnchorLookup(raw) {
    const lookup = new Map();
    Object.keys(raw).forEach((postcode) => {
        const [posterior, lat, lon] = raw[postcode];
        lookup.set(postcode, { posterior, lat, lon });
    });
    return lookup;
}
|
|
58
|
+
/**
 * Per-piece anchor features + confidence for `text`, projected onto its SP `pieces`.
 *
 * Projection rule (commented in the original as mirroring `realign_anchor_to_pieces`): a piece is
 * decided by its first non-whitespace character; if that character lies inside a recognized
 * postcode span, the piece receives that postcode's anchor vector and confidence 1.0. Every other
 * piece keeps an all-zero vector and confidence 0.
 *
 * Postcode spans are the `[A-Za-z0-9]+` runs of `text` whose upper-cased form is a key of
 * `lookup`. Runs are disjoint, so a piece can belong to at most one span.
 *
 * Each anchored piece gets its OWN copy of the vector: rows of `features` never alias one
 * another, so a caller may mutate one row without affecting the others.
 *
 * @param text The raw address string the pieces were produced from.
 * @param pieces Tokenized pieces; only their `start` / `end` character offsets are read.
 * @param lookup Postcode → anchor entry map (see `parseAnchorLookup`).
 * @returns `features` (`pieces.length × ANCHOR_FEATURE_DIM`) and `confidence` (`pieces.length`).
 */
export function buildAnchorFeatures(text, pieces, lookup) {
    const features = pieces.map(() => new Array(ANCHOR_FEATURE_DIM).fill(0));
    const confidence = pieces.map(() => 0);
    // The deciding character of a piece depends only on `text` and the piece offsets, not on which
    // postcode span is being projected, so compute it once instead of once per recognized token.
    const whitespaceRe = /\s/;
    const decidingChar = pieces.map((piece) => {
        for (let c = piece.start; c < piece.end; c++) {
            if (c < text.length && !whitespaceRe.test(text[c]))
                return c;
        }
        return -1; // no usable character: can never fall inside a span (span offsets are >= 0)
    });
    const tokenRe = /[A-Za-z0-9]+/g;
    let m;
    while ((m = tokenRe.exec(text)) !== null) {
        const entry = lookup.get(m[0].toUpperCase());
        if (!entry)
            continue;
        const spanBegin = m.index;
        const spanEnd = spanBegin + m[0].length;
        const vec = anchorFeatureVector(entry.posterior, entry.lat, entry.lon);
        for (let i = 0; i < pieces.length; i++) {
            const c = decidingChar[i];
            if (c >= spanBegin && c < spanEnd) {
                features[i] = vec.slice(); // copy, so rows do not share one mutable array
                confidence[i] = 1.0;
            }
        }
    }
    return { features, confidence };
}
|
|
94
|
+
//# sourceMappingURL=anchor-inference.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"anchor-inference.js","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAIH;;;;GAIG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAU,CAAA;AAE3F,6EAA6E;AAC7E,MAAM,CAAC,MAAM,kBAAkB,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,CAAA;AAWzD;;;;GAIG;AACH,MAAM,UAAU,mBAAmB,CAAC,SAAiC,EAAE,GAAW,EAAE,GAAW;IAC9F,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACzD,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;QAC3D,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC,WAAW,EAAmC,CAAC,CAAA;QACxF,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;YACd,GAAG,CAAC,GAAG,CAAC,GAAG,MAAM,CAAA;YACjB,KAAK,IAAI,MAAM,CAAA;QAChB,CAAC;IACF,CAAC;IACD,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;QACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAE,IAAI,KAAK,CAAA;IAC/D,CAAC;IACD,GAAG,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,EAAE,CAAC,CAAC,CAAA;IAC9D,GAAG,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACnE,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,GAA6D;IAC9F,MAAM,GAAG,GAAiB,IAAI,GAAG,EAAE,CAAA;IACnC,KAAK,MAAM,CAAC,EAAE,EAAE,CAAC,SAAS,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,CAAA;IACnG,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,mBAAmB,CAClC,IAAY,EACZ,MAAqC,EACrC,MAAoB;IAEpB,MAAM,QAAQ,GAAe,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAC5F,MAAM,UAAU,GAAa,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;IAEhD,MAAM,OAAO,GAAG,eAAe,CAAA;IAC/B,IAAI,CAAyB,CAAA;IAC7B,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC1C,MAAM
,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAA;QAC5C,IAAI,CAAC,KAAK;YAAE,SAAQ;QACpB,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAA;QACzB,MAAM,OAAO,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAA;QACrC,MAAM,GAAG,GAAG,mBAAmB,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,CAAC,CAAA;QACtE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACpB,KAAK,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,EAAE,CAAC;oBAC7C,IAAI,CAAC,IAAI,SAAS,IAAI,CAAC,GAAG,OAAO,EAAE,CAAC;wBACnC,QAAQ,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;wBACjB,UAAU,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;oBACpB,CAAC;oBACD,MAAK,CAAC,oFAAoF;gBAC3F,CAAC;YACF,CAAC;QACF,CAAC;IACF,CAAC;IACD,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAA;AAChC,CAAC"}
|
package/out/browser.d.ts
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Browser-safe re-export surface. Excludes `./onnx-runner.js` + `./weights.js` (Node-only — they
|
|
7
|
+
* statically reference `onnxruntime-node` + `node:fs`). The dynamic `loadFromWeights` /
|
|
8
|
+
* `loadFromFile` paths from those modules guard the corresponding imports with `webpackIgnore` so
|
|
9
|
+
* Node callers still get them via the main `@mailwoman/neural` entry without bundling them into a
|
|
10
|
+
* browser graph.
|
|
11
|
+
*/
|
|
12
|
+
export * from "./classifier.js";
|
|
13
|
+
export * from "./labels.js";
|
|
14
|
+
export * from "./tokenizer.js";
|
|
15
|
+
export * from "./anchor-inference.js";
|
|
16
|
+
export * from "./postcode-binary-resolver.js";
|
|
17
|
+
export type { InferResult } from "./onnx-runner.js";
|
|
18
|
+
//# sourceMappingURL=browser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser.d.ts","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA;AAG9B,cAAc,uBAAuB,CAAA;AACrC,cAAc,+BAA+B,CAAA;AAG7C,YAAY,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA"}
|
package/out/browser.js
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Browser-safe re-export surface. Excludes `./onnx-runner.js` + `./weights.js` (Node-only — they
|
|
7
|
+
* statically reference `onnxruntime-node` + `node:fs`). The dynamic `loadFromWeights` /
|
|
8
|
+
* `loadFromFile` paths from those modules guard the corresponding imports with `webpackIgnore` so
|
|
9
|
+
* Node callers still get them via the main `@mailwoman/neural` entry without bundling them into a
|
|
10
|
+
* browser graph.
|
|
11
|
+
*/
|
|
12
|
+
export * from "./classifier.js";
|
|
13
|
+
export * from "./labels.js";
|
|
14
|
+
export * from "./tokenizer.js";
|
|
15
|
+
// Browser-safe anchor channel (#239/#240): the pure-JS feature builder + the postcode binary resolver
|
|
16
|
+
// (zero-dep) the demo wires together to feed the anchor at inference.
|
|
17
|
+
export * from "./anchor-inference.js";
|
|
18
|
+
export * from "./postcode-binary-resolver.js";
|
|
19
|
+
//# sourceMappingURL=browser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"browser.js","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA;AAC9B,sGAAsG;AACtG,sEAAsE;AACtE,cAAc,uBAAuB,CAAA;AACrC,cAAc,+BAA+B,CAAA"}
|
package/out/classifier.d.ts
CHANGED
|
@@ -9,19 +9,68 @@
|
|
|
9
9
|
*
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
|
-
import { type AddressTree, type ComponentTag
|
|
13
|
-
import {
|
|
12
|
+
import { decodeAsXml, type AddressTree, type ComponentTag } from "@mailwoman/core/decoder";
|
|
13
|
+
import { type FstMatcherLike } from "./fst-prior.js";
|
|
14
|
+
import type { InferResult } from "./onnx-runner.js";
|
|
15
|
+
import { type QueryShapeLike } from "./query-shape-prior.js";
|
|
16
|
+
import { type StreetMorphologyPriorOpts } from "./street-morphology-prior.js";
|
|
14
17
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
15
|
-
import { type
|
|
18
|
+
import { type AnchorLookup } from "./anchor-inference.js";
|
|
19
|
+
import type { ResolveWeightsOpts } from "./weights.js";
|
|
20
|
+
/**
|
|
21
|
+
* Structural type the classifier needs from a runner. Lets callers swap the Node-side `OnnxRunner`
|
|
22
|
+
* for a browser-side runner (e.g. `@mailwoman/neural-web`'s `WebOnnxRunner`) without inheritance —
|
|
23
|
+
* the classifier only ever calls `infer(tokenIds, anchor?)`.
|
|
24
|
+
*/
|
|
25
|
+
export interface NeuralRunner {
|
|
26
|
+
infer(tokenIds: number[], anchor?: {
|
|
27
|
+
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
28
|
+
confidence: ReadonlyArray<number>;
|
|
29
|
+
}): Promise<InferResult>;
|
|
30
|
+
}
|
|
16
31
|
export interface NeuralAddressClassifierConfig {
|
|
17
32
|
tokenizer: MailwomanTokenizer;
|
|
18
|
-
runner:
|
|
19
|
-
/**
|
|
33
|
+
runner: NeuralRunner;
|
|
34
|
+
/**
|
|
35
|
+
* Label vocabulary in the order the model emits them. Defaults to Stage 2 (v0.3.0). Stage 2
|
|
36
|
+
* strictly extends Stage 1 at the same indices, so a v0.2.0 Stage 1 model loaded with this
|
|
37
|
+
* default still decodes correctly — its emissions only span the first 15 entries.
|
|
38
|
+
*/
|
|
20
39
|
labels?: readonly string[];
|
|
40
|
+
/**
|
|
41
|
+
* Decoding strategy:
|
|
42
|
+
*
|
|
43
|
+
* - `"viterbi"` (default) — linear-chain CRF Viterbi with the BIO structural mask. Prevents
|
|
44
|
+
* orphan-`I-*` sequences. If `transitions` is provided, uses learned scores on top.
|
|
45
|
+
* - `"argmax"` — per-token argmax. Faster but produces structurally invalid sequences. Use only for
|
|
46
|
+
* debugging / comparison.
|
|
47
|
+
*/
|
|
48
|
+
decode?: "viterbi" | "argmax";
|
|
49
|
+
/**
|
|
50
|
+
* Optional learned CRF transition scores. Square matrix of size `labels.length × labels.length`.
|
|
51
|
+
* Added on top of the structural BIO mask. Future weights releases ship this; today's v3.0.0
|
|
52
|
+
* weights don't, so the structural mask alone is used.
|
|
53
|
+
*/
|
|
54
|
+
transitions?: number[][];
|
|
55
|
+
/** Optional learned start-of-sequence transition scores per label. */
|
|
56
|
+
startTransitions?: number[];
|
|
57
|
+
/** Optional learned end-of-sequence transition scores per label. */
|
|
58
|
+
endTransitions?: number[];
|
|
59
|
+
/**
|
|
60
|
+
* Optional postcode-anchor lookup (#239/#240). When set, `parse` builds per-piece anchor features
|
|
61
|
+
* from the text + this lookup and feeds them to the runner — for models trained with the anchor
|
|
62
|
+
* channel (exported with the `anchor_features`/`anchor_confidence` ONNX inputs). Omit for plain
|
|
63
|
+
* models. Build it with `parseAnchorLookup` from `./anchor-inference.js`.
|
|
64
|
+
*/
|
|
65
|
+
postcodeAnchorLookup?: AnchorLookup;
|
|
21
66
|
}
|
|
22
67
|
export declare class NeuralAddressClassifier {
|
|
23
68
|
private readonly cfg;
|
|
24
69
|
private readonly labels;
|
|
70
|
+
private readonly decodeMode;
|
|
71
|
+
private readonly transitions;
|
|
72
|
+
private readonly startTransitions;
|
|
73
|
+
private readonly endTransitions;
|
|
25
74
|
constructor(cfg: NeuralAddressClassifierConfig);
|
|
26
75
|
/**
|
|
27
76
|
* One-call factory that resolves the weights package (or explicit paths), loads the tokenizer and
|
|
@@ -29,12 +78,97 @@ export declare class NeuralAddressClassifier {
|
|
|
29
78
|
*
|
|
30
79
|
* Resolution order: explicit paths in `opts` → `@mailwoman/neural-weights-<locale>` package →
|
|
31
80
|
* throws a single actionable error.
|
|
81
|
+
*
|
|
82
|
+
* **Node-only.** The dynamic imports keep `OnnxRunner` (onnxruntime-node) + `resolveWeights`
|
|
83
|
+
* (uses Node fs) out of the static dependency graph, so this file can be bundled for the browser
|
|
84
|
+
* by `@mailwoman/neural-web`. Calling this method in a browser will throw at runtime — use
|
|
85
|
+
* `loadNeuralClassifierFromUrls` from `@mailwoman/neural-web` instead.
|
|
86
|
+
*/
|
|
87
|
+
static loadFromWeights(opts?: ResolveWeightsOpts & {
|
|
88
|
+
postcodeAnchorLookup?: AnchorLookup;
|
|
89
|
+
}): Promise<NeuralAddressClassifier>;
|
|
90
|
+
/** Tokenize → infer → Viterbi (or argmax) → decoder tree. */
|
|
91
|
+
parse(text: string, opts?: ParseOpts): Promise<AddressTree>;
|
|
92
|
+
/**
|
|
93
|
+
* Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
|
|
94
|
+
* logit aggregation (Option C joint-reconcile integration).
|
|
95
|
+
*/
|
|
96
|
+
parseWithLogits(text: string, opts?: ParseOpts): Promise<ParseWithLogitsResult>;
|
|
97
|
+
parseJson(text: string, opts?: ParseOpts): Promise<Partial<Record<ComponentTag, string>>>;
|
|
98
|
+
parseTuples(text: string, opts?: ParseOpts): Promise<Array<[ComponentTag, string]>>;
|
|
99
|
+
parseXml(text: string, opts?: ParseOpts & {
|
|
100
|
+
xml?: Parameters<typeof decodeAsXml>[1];
|
|
101
|
+
}): Promise<string>;
|
|
102
|
+
/**
|
|
103
|
+
* Guard against a silent label/emission shape overrun. When the model emits MORE logits per token
|
|
104
|
+
* than the configured label vocabulary (e.g. a Stage 3 bundle loaded with the default Stage 2
|
|
105
|
+
* labels), viterbi indexes past the transition matrix and dies with an opaque `Cannot read
|
|
106
|
+
* properties of undefined (reading '0')`. Fail fast here with a message that names the contract
|
|
107
|
+
* the caller violated.
|
|
108
|
+
*
|
|
109
|
+
* The opposite shape (model narrower than labels) is intentionally permitted — STAGE2_BIO_LABELS
|
|
110
|
+
* prefix-extends STAGE1_BIO_LABELS so a Stage 1 model loaded with Stage 2 labels decodes
|
|
111
|
+
* correctly via the first 15 logits. See labels.ts for the contract.
|
|
112
|
+
*/
|
|
113
|
+
private assertEmissionWidth;
|
|
114
|
+
}
|
|
115
|
+
/** Result of `parseWithLogits` — tree + raw material for per-span logit aggregation. */
|
|
116
|
+
export interface ParseWithLogitsResult {
|
|
117
|
+
tree: AddressTree;
|
|
118
|
+
logits: number[][];
|
|
119
|
+
pieces: Array<{
|
|
120
|
+
start: number;
|
|
121
|
+
end: number;
|
|
122
|
+
}>;
|
|
123
|
+
}
|
|
124
|
+
/**
|
|
125
|
+
* Per-call opts for `parse()`. Threading a precomputed `QueryShape` here turns on the soft-prior
|
|
126
|
+
* bias path in the Viterbi decoder (Stage 2.4 boundary → Stage 3 encoder integration).
|
|
127
|
+
*/
|
|
128
|
+
export interface ParseOpts {
|
|
129
|
+
/**
|
|
130
|
+
* Precomputed `QueryShape` for this input (from `@mailwoman/query-shape`'s `computeQueryShape`).
|
|
131
|
+
* Known-format hits in the shape produce additive emission biases toward the matching BIO label.
|
|
132
|
+
* Typed structurally — no runtime dependency on `@mailwoman/query-shape`.
|
|
133
|
+
*/
|
|
134
|
+
queryShape?: QueryShapeLike;
|
|
135
|
+
/**
|
|
136
|
+
* Maximum bias magnitude in log-odds units. Default 1.0 — adds up to ~e^1 ≈ 2.7× odds to the
|
|
137
|
+
* favored label. Confidence-scaled, so a 0.6-confidence format hit gets +0.6 max bias.
|
|
138
|
+
*/
|
|
139
|
+
queryShapeBiasScale?: number;
|
|
140
|
+
/**
|
|
141
|
+
* Pre-built FST gazetteer matcher. When provided, gazetteer matches produce additive emission
|
|
142
|
+
* biases.
|
|
143
|
+
*/
|
|
144
|
+
fst?: FstMatcherLike;
|
|
145
|
+
/** Bias magnitude for FST gazetteer matches. Default 1.0. */
|
|
146
|
+
fstBiasScale?: number;
|
|
147
|
+
/**
|
|
148
|
+
* Pre-built street-morphology FST matcher. When provided, street-type affixes (Avenue, rue,
|
|
149
|
+
* Calle, Straße, …) produce additive emission biases toward `street_prefix`/`street_suffix` on
|
|
150
|
+
* the matched tokens AND toward `street` / away from `dependent_locality` on the adjacent name
|
|
151
|
+
* tokens. Closes the v0.6.1 dependent_locality vacuum; see
|
|
152
|
+
* `docs/articles/concepts/street-supplement-architecture.md` for the layered design.
|
|
153
|
+
*/
|
|
154
|
+
fstStreetMorphology?: FstMatcherLike;
|
|
155
|
+
/** Override bias magnitudes for the morphology prior. */
|
|
156
|
+
fstStreetMorphologyOpts?: StreetMorphologyPriorOpts;
|
|
157
|
+
/**
|
|
158
|
+
* When true, run the deterministic postcode regex repair pass (v0.7 #35) on the decoded label
|
|
159
|
+
* sequence before tree-building. Detects postcode-shaped substrings (GB/CA/NL/US/FR/… patterns)
|
|
160
|
+
* and snaps/adds the postcode span to the matched shape, fixing the SentencePiece-fragmentation
|
|
161
|
+
* failures catalogued in the 2026-05-29 postcode diagnostic. Off by default — opt-in until the
|
|
162
|
+
* v0.7 gate confirms it. See `./postcode-repair.ts`.
|
|
163
|
+
*/
|
|
164
|
+
postcodeRepair?: boolean;
|
|
165
|
+
/**
|
|
166
|
+
* When true, run the deterministic secondary-unit regex repair pass on the decoded label
|
|
167
|
+
* sequence before tree-building. Detects designator-shaped substrings ("Apt 4B", "Ste 12",
|
|
168
|
+
* "Unit 9400", bare "#104", …) and snaps/adds the unit span, fixing the unit-drop weakness the
|
|
169
|
+
* three-arena capability eval surfaced (postal secondary-unit 0% neural). Off by default —
|
|
170
|
+
* opt-in until the v0.7.2 arena re-run quantifies its delta. See `./unit-repair.ts`.
|
|
32
171
|
*/
|
|
33
|
-
|
|
34
|
-
/** Tokenize → infer → argmax/softmax → decoder tree. */
|
|
35
|
-
parse(text: string): Promise<AddressTree>;
|
|
36
|
-
parseJson(text: string): Promise<Partial<Record<ComponentTag, string>>>;
|
|
37
|
-
parseTuples(text: string): Promise<Array<[ComponentTag, string]>>;
|
|
38
|
-
parseXml(text: string, opts?: Parameters<typeof decodeAsXml>[1]): Promise<string>;
|
|
172
|
+
unitRepair?: boolean;
|
|
39
173
|
}
|
|
40
174
|
//# sourceMappingURL=classifier.d.ts.map
|
package/out/classifier.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,
|
|
1
|
+
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAA0B,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAE5E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;AAGnD,OAAO,EAA0C,KAAK,cAAc,EAAE,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAuC,KAAK,yBAAyB,EAAE,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AACnD,OAAO,EAAuB,KAAK,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAE9E,OAAO,KAAK,EAAE,kBAAkB,EAAmB,MAAM,cAAc,CAAA;AAEvE;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC5B,KAAK,CACJ,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,GAC5F,OAAO,CAAC,WAAW,CAAC,CAAA;CACvB;AAED,MAAM,WAAW,6BAA6B;IAC7C,SAAS,EAAE,kBAAkB,CAAA;IAC7B,MAAM,EAAE,YAAY,CAAA;IACpB;;;;OAIG;IACH,MAAM,CAAC,EAAE,SAAS,MAAM,EAAE,CAAA;IAC1B;;;;;;;OAOG;IACH,MAAM,CAAC,EAAE,SAAS,GAAG,QAAQ,CAAA;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAA;IACxB,sEAAsE;IACtE,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAA;IAC3B,oEAAoE;IACpE,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;IACzB;;;;;OAKG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAA;CACnC;AAED,qBAAa,uBAAuB;IAOvB,OAAO,CAAC,QAAQ,CAAC,GAAG;IANhC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAsB;IACjD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAY;IACxC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAU;IAC3C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAU;gBAEZ,GAAG,EAAE,6BAA6B;IAa/D;;;;;;;;;;;OAWG;WACU,eAAe,CAC3B,IAAI,GAAE,kBAAkB,GAAG;QAAE,oBAAoB,CAAC,EAAE,YAAY,CAAA;KAAO,GACrE,OAAO,CAAC,uBAAuB,CAAC;IA4BnC,6DAA6D;IACvD,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;IA4EjE;;;OAGG;IACG,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,qBAAqB,CAAC;IA0E/E,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAIzF,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAInF,QAAQ,CAAC,IAAI,EAAE,MA
AM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG;QAAE,GAAG,CAAC,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,CAAA;KAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7G;;;;;;;;;;OAUG;IACH,OAAO,CAAC,mBAAmB;CAW3B;AAED,wFAAwF;AACxF,MAAM,WAAW,qBAAqB;IACrC,IAAI,EAAE,WAAW,CAAA;IACjB,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,MAAM,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAC7C;AAED;;;GAGG;AACH,MAAM,WAAW,SAAS;IACzB;;;;OAIG;IACH,UAAU,CAAC,EAAE,cAAc,CAAA;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAA;IAC5B;;;OAGG;IACH,GAAG,CAAC,EAAE,cAAc,CAAA;IACpB,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB;;;;;;OAMG;IACH,mBAAmB,CAAC,EAAE,cAAc,CAAA;IACpC,yDAAyD;IACzD,uBAAuB,CAAC,EAAE,yBAAyB,CAAA;IACnD;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,CAAA;IACxB;;;;;;OAMG;IACH,UAAU,CAAC,EAAE,OAAO,CAAA;CACpB"}
|
package/out/classifier.js
CHANGED
|
@@ -10,16 +10,35 @@
|
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
12
|
import { buildAddressTree, decodeAsJson, decodeAsTuples, decodeAsXml, } from "@mailwoman/core/decoder";
|
|
13
|
-
import {
|
|
14
|
-
import {
|
|
13
|
+
import { buildFstEmissionPriors } from "./fst-prior.js";
|
|
14
|
+
import { STAGE2_BIO_LABELS } from "./labels.js";
|
|
15
|
+
import { repairPostcodeLabels } from "./postcode-repair.js";
|
|
16
|
+
import { repairUnitLabels } from "./unit-repair.js";
|
|
17
|
+
import { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
|
|
18
|
+
import { buildStreetMorphologyEmissionPriors } from "./street-morphology-prior.js";
|
|
15
19
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
16
|
-
import {
|
|
20
|
+
import { buildAnchorFeatures } from "./anchor-inference.js";
|
|
21
|
+
import { buildBioEndMask, buildBioStartMask, buildBioTransitionMask, softmax, viterbi } from "./viterbi.js";
|
|
17
22
|
export class NeuralAddressClassifier {
|
|
18
23
|
cfg;
|
|
19
24
|
labels;
|
|
25
|
+
decodeMode;
|
|
26
|
+
transitions;
|
|
27
|
+
startTransitions;
|
|
28
|
+
endTransitions;
|
|
20
29
|
constructor(cfg) {
|
|
21
30
|
this.cfg = cfg;
|
|
22
|
-
this.labels = cfg.labels ??
|
|
31
|
+
this.labels = cfg.labels ?? STAGE2_BIO_LABELS;
|
|
32
|
+
this.decodeMode = cfg.decode ?? "viterbi";
|
|
33
|
+
const structural = buildBioTransitionMask(this.labels);
|
|
34
|
+
if (cfg.transitions) {
|
|
35
|
+
this.transitions = addMatrices(structural, cfg.transitions);
|
|
36
|
+
}
|
|
37
|
+
else {
|
|
38
|
+
this.transitions = structural;
|
|
39
|
+
}
|
|
40
|
+
this.startTransitions = cfg.startTransitions ?? buildBioStartMask(this.labels);
|
|
41
|
+
this.endTransitions = cfg.endTransitions ?? buildBioEndMask(this.labels);
|
|
23
42
|
}
|
|
24
43
|
/**
|
|
25
44
|
* One-call factory that resolves the weights package (or explicit paths), loads the tokenizer and
|
|
@@ -27,42 +46,176 @@ export class NeuralAddressClassifier {
|
|
|
27
46
|
*
|
|
28
47
|
* Resolution order: explicit paths in `opts` → `@mailwoman/neural-weights-<locale>` package →
|
|
29
48
|
* throws a single actionable error.
|
|
49
|
+
*
|
|
50
|
+
* **Node-only.** The dynamic imports keep `OnnxRunner` (onnxruntime-node) + `resolveWeights`
|
|
51
|
+
* (uses Node fs) out of the static dependency graph, so this file can be bundled for the browser
|
|
52
|
+
* by `@mailwoman/neural-web`. Calling this method in a browser will throw at runtime — use
|
|
53
|
+
* `loadNeuralClassifierFromUrls` from `@mailwoman/neural-web` instead.
|
|
30
54
|
*/
|
|
31
55
|
static async loadFromWeights(opts = {}) {
|
|
32
|
-
|
|
56
|
+
// /* webpackIgnore: true */ tells webpack to leave the dynamic import statement intact —
|
|
57
|
+
// it becomes a runtime native ESM import that resolves in Node (which has onnxruntime-node
|
|
58
|
+
// + node:fs) and throws cleanly in a browser if called. Without the directive, webpack
|
|
59
|
+
// pulls onnx-runner / weights into the browser chunk graph + then chokes on the Node-only
|
|
60
|
+
// builtins they reference.
|
|
61
|
+
const [{ OnnxRunner }, { resolveWeights, readLabelsFromModelCard, readCrfTransitions }] = await Promise.all([
|
|
62
|
+
import(/* webpackIgnore: true */ "./onnx-runner.js"),
|
|
63
|
+
import(/* webpackIgnore: true */ "./weights.js"),
|
|
64
|
+
]);
|
|
65
|
+
const resolved = resolveWeights(opts);
|
|
66
|
+
const labels = readLabelsFromModelCard(resolved.modelCardPath);
|
|
67
|
+
const crf = readCrfTransitions(resolved.crfTransitionsPath);
|
|
33
68
|
const [tokenizer, runner] = await Promise.all([
|
|
34
|
-
MailwomanTokenizer.loadFromFile(tokenizerPath),
|
|
35
|
-
OnnxRunner.create(modelPath),
|
|
69
|
+
MailwomanTokenizer.loadFromFile(resolved.tokenizerPath),
|
|
70
|
+
OnnxRunner.create(resolved.modelPath),
|
|
36
71
|
]);
|
|
37
|
-
return new NeuralAddressClassifier({
|
|
72
|
+
return new NeuralAddressClassifier({
|
|
73
|
+
tokenizer,
|
|
74
|
+
runner,
|
|
75
|
+
labels,
|
|
76
|
+
transitions: crf?.transitions,
|
|
77
|
+
startTransitions: crf?.startTransitions,
|
|
78
|
+
endTransitions: crf?.endTransitions,
|
|
79
|
+
...(opts.postcodeAnchorLookup ? { postcodeAnchorLookup: opts.postcodeAnchorLookup } : {}),
|
|
80
|
+
});
|
|
38
81
|
}
|
|
39
|
-
/** Tokenize → infer → argmax
|
|
40
|
-
async parse(text) {
|
|
82
|
+
/** Tokenize → infer → Viterbi (or argmax) → decoder tree. */
|
|
83
|
+
async parse(text, opts) {
|
|
41
84
|
if (text.length === 0)
|
|
42
85
|
return { raw: text, roots: [] };
|
|
43
86
|
const { pieces, ids } = this.cfg.tokenizer.encode(text);
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
87
|
+
// Postcode-anchor channel (#239/#240): build per-piece anchor features from the same lookup the
|
|
88
|
+
// model trained on, fed alongside the ids. No-op when no lookup is configured.
|
|
89
|
+
const anchor = this.cfg.postcodeAnchorLookup
|
|
90
|
+
? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
|
|
91
|
+
: undefined;
|
|
92
|
+
const { logits } = await this.cfg.runner.infer(ids, anchor);
|
|
93
|
+
this.assertEmissionWidth(logits);
|
|
94
|
+
let emissions = opts?.queryShape
|
|
95
|
+
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
96
|
+
biasScale: opts.queryShapeBiasScale ?? 1.0,
|
|
97
|
+
inputText: text,
|
|
98
|
+
}))
|
|
99
|
+
: logits;
|
|
100
|
+
if (opts?.fst) {
|
|
101
|
+
emissions = addEmissionMatrix(emissions, buildFstEmissionPriors(opts.fst, pieces, this.labels, {
|
|
102
|
+
biasScale: opts.fstBiasScale ?? 1.0,
|
|
103
|
+
}));
|
|
104
|
+
}
|
|
105
|
+
if (opts?.fstStreetMorphology) {
|
|
106
|
+
emissions = addEmissionMatrix(emissions, buildStreetMorphologyEmissionPriors(opts.fstStreetMorphology, pieces, this.labels, opts.fstStreetMorphologyOpts ?? {}));
|
|
107
|
+
}
|
|
108
|
+
const labelIndices = this.decodeMode === "viterbi"
|
|
109
|
+
? viterbi({
|
|
110
|
+
emissions,
|
|
111
|
+
transitions: this.transitions,
|
|
112
|
+
startTransitions: this.startTransitions,
|
|
113
|
+
endTransitions: this.endTransitions,
|
|
114
|
+
}).path
|
|
115
|
+
: emissions.map((row) => argmaxSoftmax(row).idx);
|
|
116
|
+
let tokens = pieces.map((p, i) => {
|
|
117
|
+
const idx = labelIndices[i];
|
|
118
|
+
const probs = softmax(logits[i]);
|
|
48
119
|
return {
|
|
49
120
|
piece: p.piece,
|
|
50
121
|
start: p.start,
|
|
51
122
|
end: p.end,
|
|
52
123
|
label: (this.labels[idx] ?? "O"),
|
|
53
|
-
confidence:
|
|
124
|
+
confidence: probs[idx],
|
|
54
125
|
};
|
|
55
126
|
});
|
|
127
|
+
if (opts?.postcodeRepair) {
|
|
128
|
+
tokens = repairPostcodeLabels(text, tokens).tokens;
|
|
129
|
+
}
|
|
130
|
+
if (opts?.unitRepair) {
|
|
131
|
+
tokens = repairUnitLabels(text, tokens).tokens;
|
|
132
|
+
}
|
|
56
133
|
return buildAddressTree(text, tokens);
|
|
57
134
|
}
|
|
58
|
-
|
|
59
|
-
|
|
135
|
+
/**
|
|
136
|
+
* Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
|
|
137
|
+
* logit aggregation (Option C joint-reconcile integration).
|
|
138
|
+
*/
|
|
139
|
+
async parseWithLogits(text, opts) {
|
|
140
|
+
if (text.length === 0) {
|
|
141
|
+
return { tree: { raw: text, roots: [] }, logits: [], pieces: [] };
|
|
142
|
+
}
|
|
143
|
+
const { pieces, ids } = this.cfg.tokenizer.encode(text);
|
|
144
|
+
// Postcode-anchor channel (#239/#240): build per-piece anchor features from the same lookup the
|
|
145
|
+
// model trained on, fed alongside the ids. No-op when no lookup is configured.
|
|
146
|
+
const anchor = this.cfg.postcodeAnchorLookup
|
|
147
|
+
? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
|
|
148
|
+
: undefined;
|
|
149
|
+
const { logits } = await this.cfg.runner.infer(ids, anchor);
|
|
150
|
+
this.assertEmissionWidth(logits);
|
|
151
|
+
let emissions = opts?.queryShape
|
|
152
|
+
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
153
|
+
biasScale: opts.queryShapeBiasScale ?? 1.0,
|
|
154
|
+
inputText: text,
|
|
155
|
+
}))
|
|
156
|
+
: logits;
|
|
157
|
+
if (opts?.fst) {
|
|
158
|
+
emissions = addEmissionMatrix(emissions, buildFstEmissionPriors(opts.fst, pieces, this.labels, {
|
|
159
|
+
biasScale: opts.fstBiasScale ?? 1.0,
|
|
160
|
+
}));
|
|
161
|
+
}
|
|
162
|
+
if (opts?.fstStreetMorphology) {
|
|
163
|
+
emissions = addEmissionMatrix(emissions, buildStreetMorphologyEmissionPriors(opts.fstStreetMorphology, pieces, this.labels, opts.fstStreetMorphologyOpts ?? {}));
|
|
164
|
+
}
|
|
165
|
+
const labelIndices = this.decodeMode === "viterbi"
|
|
166
|
+
? viterbi({
|
|
167
|
+
emissions,
|
|
168
|
+
transitions: this.transitions,
|
|
169
|
+
startTransitions: this.startTransitions,
|
|
170
|
+
endTransitions: this.endTransitions,
|
|
171
|
+
}).path
|
|
172
|
+
: emissions.map((row) => argmaxSoftmax(row).idx);
|
|
173
|
+
const tokens = pieces.map((p, i) => {
|
|
174
|
+
const idx = labelIndices[i];
|
|
175
|
+
const probs = softmax(logits[i]);
|
|
176
|
+
return {
|
|
177
|
+
piece: p.piece,
|
|
178
|
+
start: p.start,
|
|
179
|
+
end: p.end,
|
|
180
|
+
label: (this.labels[idx] ?? "O"),
|
|
181
|
+
confidence: probs[idx],
|
|
182
|
+
};
|
|
183
|
+
});
|
|
184
|
+
return {
|
|
185
|
+
tree: buildAddressTree(text, tokens),
|
|
186
|
+
logits,
|
|
187
|
+
pieces: pieces.map((p) => ({ start: p.start, end: p.end })),
|
|
188
|
+
};
|
|
189
|
+
}
|
|
190
|
+
async parseJson(text, opts) {
|
|
191
|
+
return decodeAsJson(await this.parse(text, opts));
|
|
60
192
|
}
|
|
61
|
-
async parseTuples(text) {
|
|
62
|
-
return decodeAsTuples(await this.parse(text));
|
|
193
|
+
async parseTuples(text, opts) {
|
|
194
|
+
return decodeAsTuples(await this.parse(text, opts));
|
|
63
195
|
}
|
|
64
196
|
async parseXml(text, opts) {
|
|
65
|
-
return decodeAsXml(await this.parse(text), opts);
|
|
197
|
+
return decodeAsXml(await this.parse(text, opts), opts?.xml);
|
|
198
|
+
}
|
|
199
|
+
/**
|
|
200
|
+
* Guard against a silent label/emission shape overrun. When the model emits MORE logits per token
|
|
201
|
+
* than the configured label vocabulary (e.g. a Stage 3 bundle loaded with the default Stage 2
|
|
202
|
+
* labels), viterbi indexes past the transition matrix and dies with an opaque `Cannot read
|
|
203
|
+
* properties of undefined (reading '0')`. Fail fast here with a message that names the contract
|
|
204
|
+
* the caller violated.
|
|
205
|
+
*
|
|
206
|
+
* The opposite shape (model narrower than labels) is intentionally permitted — STAGE2_BIO_LABELS
|
|
207
|
+
* prefix-extends STAGE1_BIO_LABELS so a Stage 1 model loaded with Stage 2 labels decodes
|
|
208
|
+
* correctly via the first 15 logits. See labels.ts for the contract.
|
|
209
|
+
*/
|
|
210
|
+
assertEmissionWidth(logits) {
|
|
211
|
+
if (logits.length === 0)
|
|
212
|
+
return;
|
|
213
|
+
const width = logits[0].length;
|
|
214
|
+
if (width > this.labels.length) {
|
|
215
|
+
throw new Error(`Label/emission mismatch: model emits ${width} logits per token but the classifier was ` +
|
|
216
|
+
`configured with only ${this.labels.length} labels. Did you load a Stage 3 bundle without ` +
|
|
217
|
+
`passing its model-card labels? See loadFromWeights / loadNeuralClassifierFromUrls.`);
|
|
218
|
+
}
|
|
66
219
|
}
|
|
67
220
|
}
|
|
68
221
|
function argmaxSoftmax(row) {
|
|
@@ -80,4 +233,16 @@ function argmaxSoftmax(row) {
|
|
|
80
233
|
const conf = 1 / sumExp;
|
|
81
234
|
return { idx: maxIdx, conf };
|
|
82
235
|
}
|
|
236
|
+
/** Element-wise add two square matrices. Used to compose the structural mask + learned transitions. */
|
|
237
|
+
function addMatrices(a, b) {
|
|
238
|
+
const n = a.length;
|
|
239
|
+
const out = [];
|
|
240
|
+
for (let i = 0; i < n; i++) {
|
|
241
|
+
const row = new Array(n);
|
|
242
|
+
for (let j = 0; j < n; j++)
|
|
243
|
+
row[j] = a[i][j] + b[i][j];
|
|
244
|
+
out.push(row);
|
|
245
|
+
}
|
|
246
|
+
return out;
|
|
247
|
+
}
|
|
83
248
|
//# sourceMappingURL=classifier.js.map
|