@mailwoman/neural 2.2.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +58 -0
- package/out/anchor-inference.d.ts.map +1 -0
- package/out/anchor-inference.js +95 -0
- package/out/anchor-inference.js.map +1 -0
- package/out/browser.d.ts +2 -0
- package/out/browser.d.ts.map +1 -1
- package/out/browser.js +4 -0
- package/out/browser.js.map +1 -1
- package/out/classifier.d.ts +70 -3
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +80 -19
- package/out/classifier.js.map +1 -1
- package/out/fst-prior.d.ts +71 -0
- package/out/fst-prior.d.ts.map +1 -0
- package/out/fst-prior.js +173 -0
- package/out/fst-prior.js.map +1 -0
- package/out/index.d.ts +3 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +3 -0
- package/out/index.js.map +1 -1
- package/out/labels.d.ts +3 -0
- package/out/labels.d.ts.map +1 -1
- package/out/labels.js +13 -0
- package/out/labels.js.map +1 -1
- package/out/onnx-runner.d.ts +8 -1
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +31 -1
- package/out/onnx-runner.js.map +1 -1
- package/out/postcode-anchor.d.ts +117 -0
- package/out/postcode-anchor.d.ts.map +1 -0
- package/out/postcode-anchor.js +269 -0
- package/out/postcode-anchor.js.map +1 -0
- package/out/postcode-binary-resolver.d.ts +60 -0
- package/out/postcode-binary-resolver.d.ts.map +1 -0
- package/out/postcode-binary-resolver.js +208 -0
- package/out/postcode-binary-resolver.js.map +1 -0
- package/out/postcode-repair.d.ts +65 -0
- package/out/postcode-repair.d.ts.map +1 -0
- package/out/postcode-repair.js +171 -0
- package/out/postcode-repair.js.map +1 -0
- package/out/proposal-classifier.d.ts.map +1 -1
- package/out/proposal-classifier.js +3 -1
- package/out/proposal-classifier.js.map +1 -1
- package/out/query-shape-prior.d.ts +12 -0
- package/out/query-shape-prior.d.ts.map +1 -1
- package/out/query-shape-prior.js +132 -2
- package/out/query-shape-prior.js.map +1 -1
- package/out/street-morphology-prior.d.ts +56 -0
- package/out/street-morphology-prior.d.ts.map +1 -0
- package/out/street-morphology-prior.js +159 -0
- package/out/street-morphology-prior.js.map +1 -0
- package/out/unit-repair.d.ts +42 -0
- package/out/unit-repair.d.ts.map +1 -0
- package/out/unit-repair.js +142 -0
- package/out/unit-repair.js.map +1 -0
- package/out/vitest.config.d.ts.map +1 -1
- package/out/vitest.config.js +3 -0
- package/out/vitest.config.js.map +1 -1
- package/out/weights.d.ts +27 -3
- package/out/weights.d.ts.map +1 -1
- package/out/weights.js +46 -2
- package/out/weights.js.map +1 -1
- package/package.json +6 -2
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
|
|
7
|
+
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At
|
|
8
|
+
* inference the model conditions on per-piece anchor features fed alongside `input_ids`; this
|
|
9
|
+
* builds them from a raw address + its SentencePiece pieces, using the SAME postcode→anchor
|
|
10
|
+
* lookup the model trained against (`scripts/build-pilot-anchor-lookup.py`), so the feature
|
|
11
|
+
* layout matches byte-for-byte.
|
|
12
|
+
*
|
|
13
|
+
* The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
|
|
14
|
+
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values
|
|
15
|
+
* emitted by the Python `anchor_feature_vector` — any drift fails the test.
|
|
16
|
+
*/
|
|
17
|
+
import type { TokenizedPiece } from "./tokenizer.js";
|
|
18
|
+
/**
|
|
19
|
+
* The locale class order — MUST match Python `mailwoman_train/labels.py::LOCALE_COUNTRIES`. The
|
|
20
|
+
* posterior occupies indices `[0, LOCALE_ORDER.length)`; the normalized centroid the last two.
|
|
21
|
+
* (Pinned by the test; do not reorder.)
|
|
22
|
+
*/
|
|
23
|
+
export declare const LOCALE_ORDER: readonly ["US", "FR", "DE", "CA", "GB", "JP", "ES", "IT", "NL"];
|
|
24
|
+
/** Anchor feature width = posterior over the locale set + a 2-d centroid. */
|
|
25
|
+
export declare const ANCHOR_FEATURE_DIM: number;
|
|
26
|
+
/** One postcode's anchor record (from the pilot lookup): country posterior + a single centroid. */
|
|
27
|
+
export interface AnchorEntry {
|
|
28
|
+
/**
 * Weight per country code. `anchorFeatureVector` matches the keys case-insensitively against
 * `LOCALE_ORDER` and ignores codes outside that set.
 */
posterior: Record<string, number>;
|
|
29
|
+
/** Centroid latitude; `anchorFeatureVector` scales it by 1/90 and clamps to [-1, 1] (presumably degrees — confirm against the lookup builder). */
lat: number;
|
|
30
|
+
/** Centroid longitude; `anchorFeatureVector` scales it by 1/180 and clamps to [-1, 1] (presumably degrees — confirm against the lookup builder). */
lon: number;
|
|
31
|
+
}
|
|
32
|
+
export type AnchorLookup = Map<string, AnchorEntry>;
|
|
33
|
+
/**
|
|
34
|
+
* Build the fixed-width anchor feature vector — the exact mirror of Python `anchor_feature_vector`:
|
|
35
|
+
* the country posterior projected onto {@linkcode LOCALE_ORDER} (renormalized over the in-set mass) + a
|
|
36
|
+
* normalized centroid (`lat/90`, `lon/180` ∈ [-1, 1]).
|
|
37
|
+
*/
|
|
38
|
+
export declare function anchorFeatureVector(posterior: Record<string, number>, lat: number, lon: number): number[];
|
|
39
|
+
/**
|
|
40
|
+
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map.
|
|
41
|
+
* Pure (takes the parsed object, not a path) so this module stays browser-safe — the file read
|
|
42
|
+
* lives in the Node-side caller (the eval).
|
|
43
|
+
*/
|
|
44
|
+
export declare function parseAnchorLookup(raw: Record<string, [Record<string, number>, number, number]>): AnchorLookup;
|
|
45
|
+
/**
|
|
46
|
+
* Per-piece anchor features + confidence for `text`, projected onto its SP `pieces` by the SAME
|
|
47
|
+
* char→piece rule the labels use (a piece takes the anchor of the postcode span its first
|
|
48
|
+
* non-whitespace char falls inside) — so the anchor lands on exactly the postcode's sub-tokens.
|
|
49
|
+
*
|
|
50
|
+
* Postcode spans are the alphanumeric runs in `text` that the lookup recognizes (gold-equivalent on
|
|
51
|
+
* clean rendered addresses); a recognized span yields a confidence-1.0 anchor, like training's
|
|
52
|
+
* gold-span. Returns `(pieces × ANCHOR_FEATURE_DIM)` features + `(pieces,)` confidence.
|
|
53
|
+
*/
|
|
54
|
+
export declare function buildAnchorFeatures(text: string, pieces: ReadonlyArray<TokenizedPiece>, lookup: AnchorLookup): {
|
|
55
|
+
features: number[][];
|
|
56
|
+
confidence: number[];
|
|
57
|
+
};
|
|
58
|
+
//# sourceMappingURL=anchor-inference.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"anchor-inference.d.ts","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAEpD;;;;GAIG;AACH,eAAO,MAAM,YAAY,iEAAkE,CAAA;AAE3F,6EAA6E;AAC7E,eAAO,MAAM,kBAAkB,QAA0B,CAAA;AAEzD,mGAAmG;AACnG,MAAM,WAAW,WAAW;IAC3B,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACjC,GAAG,EAAE,MAAM,CAAA;IACX,GAAG,EAAE,MAAM,CAAA;CACX;AAED,MAAM,MAAM,YAAY,GAAG,GAAG,CAAC,MAAM,EAAE,WAAW,CAAC,CAAA;AAEnD;;;;GAIG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE,CAgBzG;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,GAAG,YAAY,CAI7G;AAED;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CAClC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,aAAa,CAAC,cAAc,CAAC,EACrC,MAAM,EAAE,YAAY,GAClB;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,CA0BhD"}
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
|
|
7
|
+
* (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At
|
|
8
|
+
* inference the model conditions on per-piece anchor features fed alongside `input_ids`; this
|
|
9
|
+
* builds them from a raw address + its SentencePiece pieces, using the SAME postcode→anchor
|
|
10
|
+
* lookup the model trained against (`scripts/build-pilot-anchor-lookup.py`), so the feature
|
|
11
|
+
* layout matches byte-for-byte.
|
|
12
|
+
*
|
|
13
|
+
* The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
|
|
14
|
+
* model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values
|
|
15
|
+
* emitted by the Python `anchor_feature_vector` — any drift fails the test.
|
|
16
|
+
*/
|
|
17
|
+
/**
|
|
18
|
+
* The locale class order — MUST match Python `mailwoman_train/labels.py::LOCALE_COUNTRIES`. The
|
|
19
|
+
* posterior occupies indices `[0, LOCALE_ORDER.length)`; the normalized centroid the last two.
|
|
20
|
+
* (Pinned by the test; do not reorder.)
|
|
21
|
+
*/
|
|
22
|
+
export const LOCALE_ORDER = ["US", "FR", "DE", "CA", "GB", "JP", "ES", "IT", "NL"];
|
|
23
|
+
/** Anchor feature width = posterior over the locale set + a 2-d centroid. */
|
|
24
|
+
export const ANCHOR_FEATURE_DIM = LOCALE_ORDER.length + 2;
|
|
25
|
+
/**
|
|
26
|
+
* Build the fixed-width anchor feature vector — the exact mirror of Python `anchor_feature_vector`:
|
|
27
|
+
* a uniform country posterior over {@linkcode LOCALE_ORDER} (renormalized over the in-set mass) + a
|
|
28
|
+
* normalized centroid (`lat/90`, `lon/180` ∈ [-1, 1]).
|
|
29
|
+
*/
|
|
30
|
+
export function anchorFeatureVector(posterior, lat, lon) {
|
|
31
|
+
const vec = new Array(ANCHOR_FEATURE_DIM).fill(0);
|
|
32
|
+
let total = 0;
|
|
33
|
+
for (const [country, weight] of Object.entries(posterior)) {
|
|
34
|
+
const idx = LOCALE_ORDER.indexOf(country.toUpperCase());
|
|
35
|
+
if (idx >= 0) {
|
|
36
|
+
vec[idx] = weight;
|
|
37
|
+
total += weight;
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
if (total > 0) {
|
|
41
|
+
for (let i = 0; i < LOCALE_ORDER.length; i++)
|
|
42
|
+
vec[i] /= total;
|
|
43
|
+
}
|
|
44
|
+
vec[LOCALE_ORDER.length] = Math.max(-1, Math.min(1, lat / 90));
|
|
45
|
+
vec[LOCALE_ORDER.length + 1] = Math.max(-1, Math.min(1, lon / 180));
|
|
46
|
+
return vec;
|
|
47
|
+
}
|
|
48
|
+
/**
|
|
49
|
+
* Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map.
|
|
50
|
+
* Pure (takes the parsed object, not a path) so this module stays browser-safe — the file read
|
|
51
|
+
* lives in the Node-side caller (the eval).
|
|
52
|
+
*/
|
|
53
|
+
export function parseAnchorLookup(raw) {
|
|
54
|
+
const out = new Map();
|
|
55
|
+
for (const [pc, [posterior, lat, lon]] of Object.entries(raw))
|
|
56
|
+
out.set(pc, { posterior, lat, lon });
|
|
57
|
+
return out;
|
|
58
|
+
}
|
|
59
|
+
/**
|
|
60
|
+
* Per-piece anchor features + confidence for `text`, projected onto its SP `pieces` by the SAME
|
|
61
|
+
* char→piece rule the labels use (a piece takes the anchor of the postcode span its first
|
|
62
|
+
* non-whitespace char falls inside) — so the anchor lands on exactly the postcode's sub-tokens.
|
|
63
|
+
*
|
|
64
|
+
* Postcode spans are the alphanumeric runs in `text` that the lookup recognizes (gold-equivalent on
|
|
65
|
+
* clean rendered addresses); a recognized span yields a confidence-1.0 anchor, like training's
|
|
66
|
+
* gold-span. Returns `(pieces × ANCHOR_FEATURE_DIM)` features + `(pieces,)` confidence.
|
|
67
|
+
*/
|
|
68
|
+
export function buildAnchorFeatures(text, pieces, lookup) {
|
|
69
|
+
const features = pieces.map(() => new Array(ANCHOR_FEATURE_DIM).fill(0));
|
|
70
|
+
const confidence = pieces.map(() => 0);
|
|
71
|
+
const tokenRe = /[A-Za-z0-9]+/g;
|
|
72
|
+
let m;
|
|
73
|
+
while ((m = tokenRe.exec(text)) !== null) {
|
|
74
|
+
const entry = lookup.get(m[0].toUpperCase());
|
|
75
|
+
if (!entry)
|
|
76
|
+
continue;
|
|
77
|
+
const spanBegin = m.index;
|
|
78
|
+
const spanEnd = m.index + m[0].length;
|
|
79
|
+
const vec = anchorFeatureVector(entry.posterior, entry.lat, entry.lon);
|
|
80
|
+
for (let i = 0; i < pieces.length; i++) {
|
|
81
|
+
const p = pieces[i];
|
|
82
|
+
for (let c = p.start; c < p.end; c++) {
|
|
83
|
+
if (c < text.length && !/\s/.test(text[c])) {
|
|
84
|
+
if (c >= spanBegin && c < spanEnd) {
|
|
85
|
+
features[i] = vec;
|
|
86
|
+
confidence[i] = 1.0;
|
|
87
|
+
}
|
|
88
|
+
break; // first non-whitespace char of the piece decides (mirrors realign_anchor_to_pieces)
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
return { features, confidence };
|
|
94
|
+
}
|
|
95
|
+
//# sourceMappingURL=anchor-inference.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"anchor-inference.js","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;GAeG;AAIH;;;;GAIG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAU,CAAA;AAE3F,6EAA6E;AAC7E,MAAM,CAAC,MAAM,kBAAkB,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,CAAA;AAWzD;;;;GAIG;AACH,MAAM,UAAU,mBAAmB,CAAC,SAAiC,EAAE,GAAW,EAAE,GAAW;IAC9F,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACzD,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;QAC3D,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC,WAAW,EAAmC,CAAC,CAAA;QACxF,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;YACd,GAAG,CAAC,GAAG,CAAC,GAAG,MAAM,CAAA;YACjB,KAAK,IAAI,MAAM,CAAA;QAChB,CAAC;IACF,CAAC;IACD,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;QACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAE,IAAI,KAAK,CAAA;IAC/D,CAAC;IACD,GAAG,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,EAAE,CAAC,CAAC,CAAA;IAC9D,GAAG,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACnE,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,GAA6D;IAC9F,MAAM,GAAG,GAAiB,IAAI,GAAG,EAAE,CAAA;IACnC,KAAK,MAAM,CAAC,EAAE,EAAE,CAAC,SAAS,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,CAAA;IACnG,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,mBAAmB,CAClC,IAAY,EACZ,MAAqC,EACrC,MAAoB;IAEpB,MAAM,QAAQ,GAAe,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAC5F,MAAM,UAAU,GAAa,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;IAEhD,MAAM,OAAO,GAAG,eAAe,CAAA;IAC/B,IAAI,CAAyB,CAAA;IAC7B,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC1C,MAA
M,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAA;QAC5C,IAAI,CAAC,KAAK;YAAE,SAAQ;QACpB,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAA;QACzB,MAAM,OAAO,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAA;QACrC,MAAM,GAAG,GAAG,mBAAmB,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,CAAC,CAAA;QACtE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACpB,KAAK,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,EAAE,CAAC;oBAC7C,IAAI,CAAC,IAAI,SAAS,IAAI,CAAC,GAAG,OAAO,EAAE,CAAC;wBACnC,QAAQ,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;wBACjB,UAAU,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;oBACpB,CAAC;oBACD,MAAK,CAAC,oFAAoF;gBAC3F,CAAC;YACF,CAAC;QACF,CAAC;IACF,CAAC;IACD,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAA;AAChC,CAAC"}
|
package/out/browser.d.ts
CHANGED
|
@@ -12,5 +12,7 @@
|
|
|
12
12
|
export * from "./classifier.js";
|
|
13
13
|
export * from "./labels.js";
|
|
14
14
|
export * from "./tokenizer.js";
|
|
15
|
+
export * from "./anchor-inference.js";
|
|
16
|
+
export * from "./postcode-binary-resolver.js";
|
|
15
17
|
export type { InferResult } from "./onnx-runner.js";
|
|
16
18
|
//# sourceMappingURL=browser.d.ts.map
|
package/out/browser.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"browser.d.ts","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA;AAG9B,YAAY,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA"}
|
|
1
|
+
{"version":3,"file":"browser.d.ts","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA;AAG9B,cAAc,uBAAuB,CAAA;AACrC,cAAc,+BAA+B,CAAA;AAG7C,YAAY,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA"}
|
package/out/browser.js
CHANGED
|
@@ -12,4 +12,8 @@
|
|
|
12
12
|
export * from "./classifier.js";
|
|
13
13
|
export * from "./labels.js";
|
|
14
14
|
export * from "./tokenizer.js";
|
|
15
|
+
// Browser-safe anchor channel (#239/#240): the pure-JS feature builder + the postcode binary resolver
|
|
16
|
+
// (zero-dep) the demo wires together to feed the anchor at inference.
|
|
17
|
+
export * from "./anchor-inference.js";
|
|
18
|
+
export * from "./postcode-binary-resolver.js";
|
|
15
19
|
//# sourceMappingURL=browser.js.map
|
package/out/browser.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"browser.js","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA"}
|
|
1
|
+
{"version":3,"file":"browser.js","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA;AAC9B,sGAAsG;AACtG,sEAAsE;AACtE,cAAc,uBAAuB,CAAA;AACrC,cAAc,+BAA+B,CAAA"}
|
package/out/classifier.d.ts
CHANGED
|
@@ -9,9 +9,12 @@
|
|
|
9
9
|
*
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
|
-
import { decodeAsXml, type AddressTree, type ComponentTag } from "@mailwoman/core/decoder";
|
|
12
|
+
import { decodeAsXml, type AddressTree, type Calibrator, type ComponentTag } from "@mailwoman/core/decoder";
|
|
13
|
+
import { type AnchorLookup } from "./anchor-inference.js";
|
|
14
|
+
import { type FstMatcherLike } from "./fst-prior.js";
|
|
13
15
|
import type { InferResult } from "./onnx-runner.js";
|
|
14
16
|
import { type QueryShapeLike } from "./query-shape-prior.js";
|
|
17
|
+
import { type StreetMorphologyPriorOpts } from "./street-morphology-prior.js";
|
|
15
18
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
16
19
|
import type { ResolveWeightsOpts } from "./weights.js";
|
|
17
20
|
/**
|
|
@@ -20,7 +23,10 @@ import type { ResolveWeightsOpts } from "./weights.js";
|
|
|
20
23
|
* the classifier only ever calls `infer(ids)`.
|
|
21
24
|
*/
|
|
22
25
|
export interface NeuralRunner {
|
|
23
|
-
infer(tokenIds: number[]
|
|
26
|
+
infer(tokenIds: number[], anchor?: {
|
|
27
|
+
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
28
|
+
confidence: ReadonlyArray<number>;
|
|
29
|
+
}): Promise<InferResult>;
|
|
24
30
|
}
|
|
25
31
|
export interface NeuralAddressClassifierConfig {
|
|
26
32
|
tokenizer: MailwomanTokenizer;
|
|
@@ -50,6 +56,13 @@ export interface NeuralAddressClassifierConfig {
|
|
|
50
56
|
startTransitions?: number[];
|
|
51
57
|
/** Optional learned end-of-sequence transition scores per label. */
|
|
52
58
|
endTransitions?: number[];
|
|
59
|
+
/**
|
|
60
|
+
* Optional postcode-anchor lookup (#239/#240). When set, `parse` builds per-piece anchor features
|
|
61
|
+
* from the text + this lookup and feeds them to the runner — for models trained with the anchor
|
|
62
|
+
* channel (exported with the `anchor_features`/`anchor_confidence` ONNX inputs). Omit for plain
|
|
63
|
+
* models. Build via `parseAnchorLookup` from `./anchor-inference.js`.
|
|
64
|
+
*/
|
|
65
|
+
postcodeAnchorLookup?: AnchorLookup;
|
|
53
66
|
}
|
|
54
67
|
export declare class NeuralAddressClassifier {
|
|
55
68
|
private readonly cfg;
|
|
@@ -71,7 +84,9 @@ export declare class NeuralAddressClassifier {
|
|
|
71
84
|
* by `@mailwoman/neural-web`. Calling this method in a browser will throw at runtime — use
|
|
72
85
|
* `loadNeuralClassifierFromUrls` from `@mailwoman/neural-web` instead.
|
|
73
86
|
*/
|
|
74
|
-
static loadFromWeights(opts?: ResolveWeightsOpts
|
|
87
|
+
static loadFromWeights(opts?: ResolveWeightsOpts & {
|
|
88
|
+
postcodeAnchorLookup?: AnchorLookup;
|
|
89
|
+
}): Promise<NeuralAddressClassifier>;
|
|
75
90
|
/** Tokenize → infer → Viterbi (or argmax) → decoder tree. */
|
|
76
91
|
parse(text: string, opts?: ParseOpts): Promise<AddressTree>;
|
|
77
92
|
/**
|
|
@@ -84,6 +99,18 @@ export declare class NeuralAddressClassifier {
|
|
|
84
99
|
parseXml(text: string, opts?: ParseOpts & {
|
|
85
100
|
xml?: Parameters<typeof decodeAsXml>[1];
|
|
86
101
|
}): Promise<string>;
|
|
102
|
+
/**
|
|
103
|
+
* Guard against a silent label/emission shape overrun. When the model emits MORE logits per token
|
|
104
|
+
* than the configured label vocabulary (e.g. a Stage 3 bundle loaded with the default Stage 2
|
|
105
|
+
* labels), viterbi indexes past the transition matrix and dies with an opaque `Cannot read
|
|
106
|
+
* properties of undefined (reading '0')`. Fail fast here with a message that names the contract
|
|
107
|
+
* the caller violated.
|
|
108
|
+
*
|
|
109
|
+
* The opposite shape (model narrower than labels) is intentionally permitted — STAGE2_BIO_LABELS
|
|
110
|
+
* prefix-extends STAGE1_BIO_LABELS so a Stage 1 model loaded with Stage 2 labels decodes
|
|
111
|
+
* correctly via the first 15 logits. See labels.ts for the contract.
|
|
112
|
+
*/
|
|
113
|
+
private assertEmissionWidth;
|
|
87
114
|
}
|
|
88
115
|
/** Result of `parseWithLogits` — tree + raw material for per-span logit aggregation. */
|
|
89
116
|
export interface ParseWithLogitsResult {
|
|
@@ -110,5 +137,45 @@ export interface ParseOpts {
|
|
|
110
137
|
* favored label. Confidence-scaled, so a 0.6-confidence format hit gets +0.6 max bias.
|
|
111
138
|
*/
|
|
112
139
|
queryShapeBiasScale?: number;
|
|
140
|
+
/**
|
|
141
|
+
* Pre-built FST gazetteer matcher. When provided, gazetteer matches produce additive emission
|
|
142
|
+
* biases.
|
|
143
|
+
*/
|
|
144
|
+
fst?: FstMatcherLike;
|
|
145
|
+
/** Bias magnitude for FST gazetteer matches. Default 1.0. */
|
|
146
|
+
fstBiasScale?: number;
|
|
147
|
+
/**
|
|
148
|
+
* Pre-built street-morphology FST matcher. When provided, street-type affixes (Avenue, rue,
|
|
149
|
+
* Calle, Straße, …) produce additive emission biases toward `street_prefix`/`street_suffix` on
|
|
150
|
+
* the matched tokens AND toward `street` / away from `dependent_locality` on the adjacent name
|
|
151
|
+
* tokens. Closes the v0.6.1 dependent_locality vacuum; see
|
|
152
|
+
* `docs/articles/concepts/street-supplement-architecture.md` for the layered design.
|
|
153
|
+
*/
|
|
154
|
+
fstStreetMorphology?: FstMatcherLike;
|
|
155
|
+
/** Override bias magnitudes for the morphology prior. */
|
|
156
|
+
fstStreetMorphologyOpts?: StreetMorphologyPriorOpts;
|
|
157
|
+
/**
|
|
158
|
+
* When true, run the deterministic postcode regex repair pass (v0.7 #35) on the decoded label
|
|
159
|
+
* sequence before tree-building. Detects postcode-shaped substrings (GB/CA/NL/US/FR/… patterns)
|
|
160
|
+
* and snaps/adds the postcode span to the matched shape, fixing the SentencePiece-fragmentation
|
|
161
|
+
* failures catalogued in the 2026-05-29 postcode diagnostic. Off by default — opt-in until the
|
|
162
|
+
* v0.7 gate confirms it. See `./postcode-repair.ts`.
|
|
163
|
+
*/
|
|
164
|
+
postcodeRepair?: boolean;
|
|
165
|
+
/**
|
|
166
|
+
* When true, run the deterministic secondary-unit regex repair pass on the decoded label sequence
|
|
167
|
+
* before tree-building. Detects designator-shaped substrings ("Apt 4B", "Ste 12", "Unit 9400",
|
|
168
|
+
* bare "#104", …) and snaps/adds the unit span, fixing the unit-drop weakness the three-arena
|
|
169
|
+
* capability eval surfaced (postal secondary-unit 0% neural). Off by default — opt-in until the
|
|
170
|
+
* v0.7.2 arena re-run quantifies its delta. See `./unit-repair.ts`.
|
|
171
|
+
*/
|
|
172
|
+
unitRepair?: boolean;
|
|
173
|
+
/**
|
|
174
|
+
* Optional span-confidence calibrator (task #59). When provided, each decoded span's `conf=` is
|
|
175
|
+
* mapped through it (isotonic lookup table → calibrated probability of correctness). OPT-IN —
|
|
176
|
+
* omit for the byte-stable default softmax confidence. Build one via `createCalibrator`
|
|
177
|
+
* (`@mailwoman/core/decoder`) from `data/eval/calibration/isotonic-<locale>-<version>.json`.
|
|
178
|
+
*/
|
|
179
|
+
calibrate?: Calibrator;
|
|
113
180
|
}
|
|
114
181
|
//# sourceMappingURL=classifier.d.ts.map
|
package/out/classifier.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;
|
|
1
|
+
{"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,UAAU,EACf,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAuB,KAAK,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAA0B,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAE5E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;AAEnD,OAAO,EAA0C,KAAK,cAAc,EAAE,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAuC,KAAK,yBAAyB,EAAE,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AAGnD,OAAO,KAAK,EAAE,kBAAkB,EAAmB,MAAM,cAAc,CAAA;AAEvE;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC5B,KAAK,CACJ,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,GAC5F,OAAO,CAAC,WAAW,CAAC,CAAA;CACvB;AAED,MAAM,WAAW,6BAA6B;IAC7C,SAAS,EAAE,kBAAkB,CAAA;IAC7B,MAAM,EAAE,YAAY,CAAA;IACpB;;;;OAIG;IACH,MAAM,CAAC,EAAE,SAAS,MAAM,EAAE,CAAA;IAC1B;;;;;;;OAOG;IACH,MAAM,CAAC,EAAE,SAAS,GAAG,QAAQ,CAAA;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAA;IACxB,sEAAsE;IACtE,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAA;IAC3B,oEAAoE;IACpE,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;IACzB;;;;;OAKG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAA;CACnC;AAED,qBAAa,uBAAuB;IAOvB,OAAO,CAAC,QAAQ,CAAC,GAAG;IANhC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAsB;IACjD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAY;IACxC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAU;IAC3C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAU;gBAEZ,GAAG,EAAE,6BAA6B;IAa/D;;;;;;;;;;;OAWG;WACU,eAAe,CAC3B,IAAI,GAAE,kBAAkB,GAAG;QAAE,oBAAoB,CAAC,EAAE,YAAY,CAAA;KAAO,GACrE,OAAO,CAAC,uBAAuB,CAAC;IA4BnC,6DAA6D;IACvD,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;IA4EjE;;;OAGG;IACG,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,qBAAqB,CAAC;IA0E/E,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAIzF,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAInF,QAAQ,CA
AC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG;QAAE,GAAG,CAAC,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,CAAA;KAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7G;;;;;;;;;;OAUG;IACH,OAAO,CAAC,mBAAmB;CAW3B;AAED,wFAAwF;AACxF,MAAM,WAAW,qBAAqB;IACrC,IAAI,EAAE,WAAW,CAAA;IACjB,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,MAAM,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAC7C;AAED;;;GAGG;AACH,MAAM,WAAW,SAAS;IACzB;;;;OAIG;IACH,UAAU,CAAC,EAAE,cAAc,CAAA;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAA;IAC5B;;;OAGG;IACH,GAAG,CAAC,EAAE,cAAc,CAAA;IACpB,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB;;;;;;OAMG;IACH,mBAAmB,CAAC,EAAE,cAAc,CAAA;IACpC,yDAAyD;IACzD,uBAAuB,CAAC,EAAE,yBAAyB,CAAA;IACnD;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,CAAA;IACxB;;;;;;OAMG;IACH,UAAU,CAAC,EAAE,OAAO,CAAA;IACpB;;;;;OAKG;IACH,SAAS,CAAC,EAAE,UAAU,CAAA;CACtB"}
|
package/out/classifier.js
CHANGED
|
@@ -10,9 +10,14 @@
|
|
|
10
10
|
* Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
|
|
11
11
|
*/
|
|
12
12
|
import { buildAddressTree, decodeAsJson, decodeAsTuples, decodeAsXml, } from "@mailwoman/core/decoder";
|
|
13
|
+
import { buildAnchorFeatures } from "./anchor-inference.js";
|
|
14
|
+
import { buildFstEmissionPriors } from "./fst-prior.js";
|
|
13
15
|
import { STAGE2_BIO_LABELS } from "./labels.js";
|
|
16
|
+
import { repairPostcodeLabels } from "./postcode-repair.js";
|
|
14
17
|
import { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
|
|
18
|
+
import { buildStreetMorphologyEmissionPriors } from "./street-morphology-prior.js";
|
|
15
19
|
import { MailwomanTokenizer } from "./tokenizer.js";
|
|
20
|
+
import { repairUnitLabels } from "./unit-repair.js";
|
|
16
21
|
import { buildBioEndMask, buildBioStartMask, buildBioTransitionMask, softmax, viterbi } from "./viterbi.js";
|
|
17
22
|
export class NeuralAddressClassifier {
|
|
18
23
|
cfg;
|
|
@@ -53,37 +58,53 @@ export class NeuralAddressClassifier {
|
|
|
53
58
|
// + node:fs) and throws cleanly in a browser if called. Without the directive, webpack
|
|
54
59
|
// pulls onnx-runner / weights into the browser chunk graph + then chokes on the Node-only
|
|
55
60
|
// builtins they reference.
|
|
56
|
-
const [{ OnnxRunner }, { resolveWeights, readLabelsFromModelCard }] = await Promise.all([
|
|
61
|
+
const [{ OnnxRunner }, { resolveWeights, readLabelsFromModelCard, readCrfTransitions }] = await Promise.all([
|
|
57
62
|
import(/* webpackIgnore: true */ "./onnx-runner.js"),
|
|
58
63
|
import(/* webpackIgnore: true */ "./weights.js"),
|
|
59
64
|
]);
|
|
60
65
|
const resolved = resolveWeights(opts);
|
|
61
|
-
// Read the trained label vocabulary from the bundled model-card.json when present. Falls
|
|
62
|
-
// through to the constructor default (STAGE2_BIO_LABELS) for legacy bundles that predate
|
|
63
|
-
// the `labels` field — those are always Stage 2 cards by construction, so the default is
|
|
64
|
-
// the correct fallback. A future Stage 3 ship will require the card to carry the field.
|
|
65
66
|
const labels = readLabelsFromModelCard(resolved.modelCardPath);
|
|
67
|
+
const crf = readCrfTransitions(resolved.crfTransitionsPath);
|
|
66
68
|
const [tokenizer, runner] = await Promise.all([
|
|
67
69
|
MailwomanTokenizer.loadFromFile(resolved.tokenizerPath),
|
|
68
70
|
OnnxRunner.create(resolved.modelPath),
|
|
69
71
|
]);
|
|
70
|
-
return new NeuralAddressClassifier({
|
|
72
|
+
return new NeuralAddressClassifier({
|
|
73
|
+
tokenizer,
|
|
74
|
+
runner,
|
|
75
|
+
labels,
|
|
76
|
+
transitions: crf?.transitions,
|
|
77
|
+
startTransitions: crf?.startTransitions,
|
|
78
|
+
endTransitions: crf?.endTransitions,
|
|
79
|
+
...(opts.postcodeAnchorLookup ? { postcodeAnchorLookup: opts.postcodeAnchorLookup } : {}),
|
|
80
|
+
});
|
|
71
81
|
}
|
|
72
82
|
/** Tokenize → infer → Viterbi (or argmax) → decoder tree. */
|
|
73
83
|
async parse(text, opts) {
|
|
74
84
|
if (text.length === 0)
|
|
75
85
|
return { raw: text, roots: [] };
|
|
76
86
|
const { pieces, ids } = this.cfg.tokenizer.encode(text);
|
|
77
|
-
|
|
78
|
-
//
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
const
|
|
87
|
+
// Postcode-anchor channel (#239/#240): build per-piece anchor features from the same lookup the
|
|
88
|
+
// model trained on, fed alongside the ids. No-op when no lookup is configured.
|
|
89
|
+
const anchor = this.cfg.postcodeAnchorLookup
|
|
90
|
+
? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
|
|
91
|
+
: undefined;
|
|
92
|
+
const { logits } = await this.cfg.runner.infer(ids, anchor);
|
|
93
|
+
this.assertEmissionWidth(logits);
|
|
94
|
+
let emissions = opts?.queryShape
|
|
83
95
|
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
84
96
|
biasScale: opts.queryShapeBiasScale ?? 1.0,
|
|
97
|
+
inputText: text,
|
|
85
98
|
}))
|
|
86
99
|
: logits;
|
|
100
|
+
if (opts?.fst) {
|
|
101
|
+
emissions = addEmissionMatrix(emissions, buildFstEmissionPriors(opts.fst, pieces, this.labels, {
|
|
102
|
+
biasScale: opts.fstBiasScale ?? 1.0,
|
|
103
|
+
}));
|
|
104
|
+
}
|
|
105
|
+
if (opts?.fstStreetMorphology) {
|
|
106
|
+
emissions = addEmissionMatrix(emissions, buildStreetMorphologyEmissionPriors(opts.fstStreetMorphology, pieces, this.labels, opts.fstStreetMorphologyOpts ?? {}));
|
|
107
|
+
}
|
|
87
108
|
const labelIndices = this.decodeMode === "viterbi"
|
|
88
109
|
? viterbi({
|
|
89
110
|
emissions,
|
|
@@ -92,10 +113,8 @@ export class NeuralAddressClassifier {
|
|
|
92
113
|
endTransitions: this.endTransitions,
|
|
93
114
|
}).path
|
|
94
115
|
: emissions.map((row) => argmaxSoftmax(row).idx);
|
|
95
|
-
|
|
116
|
+
let tokens = pieces.map((p, i) => {
|
|
96
117
|
const idx = labelIndices[i];
|
|
97
|
-
// Confidence reports the encoder's *raw* probability (no prior baked in) so callers see
|
|
98
|
-
// the model's own conviction, not the prior-augmented score.
|
|
99
118
|
const probs = softmax(logits[i]);
|
|
100
119
|
return {
|
|
101
120
|
piece: p.piece,
|
|
@@ -105,7 +124,13 @@ export class NeuralAddressClassifier {
|
|
|
105
124
|
confidence: probs[idx],
|
|
106
125
|
};
|
|
107
126
|
});
|
|
108
|
-
|
|
127
|
+
if (opts?.postcodeRepair) {
|
|
128
|
+
tokens = repairPostcodeLabels(text, tokens).tokens;
|
|
129
|
+
}
|
|
130
|
+
if (opts?.unitRepair) {
|
|
131
|
+
tokens = repairUnitLabels(text, tokens).tokens;
|
|
132
|
+
}
|
|
133
|
+
return buildAddressTree(text, tokens, opts?.calibrate ? { calibrate: opts.calibrate } : undefined);
|
|
109
134
|
}
|
|
110
135
|
/**
|
|
111
136
|
* Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
|
|
@@ -116,12 +141,27 @@ export class NeuralAddressClassifier {
|
|
|
116
141
|
return { tree: { raw: text, roots: [] }, logits: [], pieces: [] };
|
|
117
142
|
}
|
|
118
143
|
const { pieces, ids } = this.cfg.tokenizer.encode(text);
|
|
119
|
-
|
|
120
|
-
|
|
144
|
+
// Postcode-anchor channel (#239/#240): build per-piece anchor features from the same lookup the
|
|
145
|
+
// model trained on, fed alongside the ids. No-op when no lookup is configured.
|
|
146
|
+
const anchor = this.cfg.postcodeAnchorLookup
|
|
147
|
+
? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
|
|
148
|
+
: undefined;
|
|
149
|
+
const { logits } = await this.cfg.runner.infer(ids, anchor);
|
|
150
|
+
this.assertEmissionWidth(logits);
|
|
151
|
+
let emissions = opts?.queryShape
|
|
121
152
|
? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
|
|
122
153
|
biasScale: opts.queryShapeBiasScale ?? 1.0,
|
|
154
|
+
inputText: text,
|
|
123
155
|
}))
|
|
124
156
|
: logits;
|
|
157
|
+
if (opts?.fst) {
|
|
158
|
+
emissions = addEmissionMatrix(emissions, buildFstEmissionPriors(opts.fst, pieces, this.labels, {
|
|
159
|
+
biasScale: opts.fstBiasScale ?? 1.0,
|
|
160
|
+
}));
|
|
161
|
+
}
|
|
162
|
+
if (opts?.fstStreetMorphology) {
|
|
163
|
+
emissions = addEmissionMatrix(emissions, buildStreetMorphologyEmissionPriors(opts.fstStreetMorphology, pieces, this.labels, opts.fstStreetMorphologyOpts ?? {}));
|
|
164
|
+
}
|
|
125
165
|
const labelIndices = this.decodeMode === "viterbi"
|
|
126
166
|
? viterbi({
|
|
127
167
|
emissions,
|
|
@@ -142,7 +182,7 @@ export class NeuralAddressClassifier {
|
|
|
142
182
|
};
|
|
143
183
|
});
|
|
144
184
|
return {
|
|
145
|
-
tree: buildAddressTree(text, tokens),
|
|
185
|
+
tree: buildAddressTree(text, tokens, opts?.calibrate ? { calibrate: opts.calibrate } : undefined),
|
|
146
186
|
logits,
|
|
147
187
|
pieces: pieces.map((p) => ({ start: p.start, end: p.end })),
|
|
148
188
|
};
|
|
@@ -156,6 +196,27 @@ export class NeuralAddressClassifier {
|
|
|
156
196
|
async parseXml(text, opts) {
|
|
157
197
|
return decodeAsXml(await this.parse(text, opts), opts?.xml);
|
|
158
198
|
}
|
|
199
|
+
/**
|
|
200
|
+
* Guard against a silent label/emission shape overrun. When the model emits MORE logits per token
|
|
201
|
+
* than the configured label vocabulary (e.g. a Stage 3 bundle loaded with the default Stage 2
|
|
202
|
+
* labels), viterbi indexes past the transition matrix and dies with an opaque `Cannot read
|
|
203
|
+
* properties of undefined (reading '0')`. Fail fast here with a message that names the contract
|
|
204
|
+
* the caller violated.
|
|
205
|
+
*
|
|
206
|
+
* The opposite shape (model narrower than labels) is intentionally permitted — STAGE2_BIO_LABELS
|
|
207
|
+
* prefix-extends STAGE1_BIO_LABELS so a Stage 1 model loaded with Stage 2 labels decodes
|
|
208
|
+
* correctly via the first 15 logits. See labels.ts for the contract.
|
|
209
|
+
*/
|
|
210
|
+
assertEmissionWidth(logits) {
|
|
211
|
+
if (logits.length === 0)
|
|
212
|
+
return;
|
|
213
|
+
const width = logits[0].length;
|
|
214
|
+
if (width > this.labels.length) {
|
|
215
|
+
throw new Error(`Label/emission mismatch: model emits ${width} logits per token but the classifier was ` +
|
|
216
|
+
`configured with only ${this.labels.length} labels. Did you load a Stage 3 bundle without ` +
|
|
217
|
+
`passing its model-card labels? See loadFromWeights / loadNeuralClassifierFromUrls.`);
|
|
218
|
+
}
|
|
219
|
+
}
|
|
159
220
|
}
|
|
160
221
|
function argmaxSoftmax(row) {
|
|
161
222
|
let maxIdx = 0;
|
package/out/classifier.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,WAAW,
|
|
1
|
+
{"version":3,"file":"classifier.js","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,gBAAgB,EAChB,YAAY,EACZ,cAAc,EACd,WAAW,GAKX,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAAE,mBAAmB,EAAqB,MAAM,uBAAuB,CAAA;AAC9E,OAAO,EAAE,sBAAsB,EAAuB,MAAM,gBAAgB,CAAA;AAC5E,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAA;AAE/C,OAAO,EAAE,oBAAoB,EAAE,MAAM,sBAAsB,CAAA;AAC3D,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAuB,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAE,mCAAmC,EAAkC,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AACnD,OAAO,EAAE,gBAAgB,EAAE,MAAM,kBAAkB,CAAA;AACnD,OAAO,EAAE,eAAe,EAAE,iBAAiB,EAAE,sBAAsB,EAAE,OAAO,EAAE,OAAO,EAAE,MAAM,cAAc,CAAA;AAoD3G,MAAM,OAAO,uBAAuB;IAON;IANZ,MAAM,CAAmB;IACzB,UAAU,CAAsB;IAChC,WAAW,CAAY;IACvB,gBAAgB,CAAU;IAC1B,cAAc,CAAU;IAEzC,YAA6B,GAAkC;QAAlC,QAAG,GAAH,GAAG,CAA+B;QAC9D,IAAI,CAAC,MAAM,GAAG,GAAG,CAAC,MAAM,IAAI,iBAAiB,CAAA;QAC7C,IAAI,CAAC,UAAU,GAAG,GAAG,CAAC,MAAM,IAAI,SAAS,CAAA;QACzC,MAAM,UAAU,GAAG,sBAAsB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QACtD,IAAI,GAAG,CAAC,WAAW,EAAE,CAAC;YACrB,IAAI,CAAC,WAAW,GAAG,WAAW,CAAC,UAAU,EAAE,GAAG,CAAC,WAAW,CAAC,CAAA;QAC5D,CAAC;aAAM,CAAC;YACP,IAAI,CAAC,WAAW,GAAG,UAAU,CAAA;QAC9B,CAAC;QACD,IAAI,CAAC,gBAAgB,GAAG,GAAG,CAAC,gBAAgB,IAAI,iBAAiB,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC9E,IAAI,CAAC,cAAc,GAAG,GAAG,CAAC,cAAc,IAAI,eAAe,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACzE,CAAC;IAED;;;;;;;;;;;OAWG;IACH,MAAM,CAAC,KAAK,CAAC,eAAe,CAC3B,OAAqE,EAAE;QAEvE,yFAAyF;QACzF,2FAA2F;QAC3F,uFAAuF;QACvF,0FAA0F;QAC1F,2BAA2B;QAC3B,MAAM,CAAC,EAAE,UAAU,EAAE,EAAE,EAAE,cAAc,EAAE,uBAAuB,EAAE,kBAAkB,EAAE,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC3G,MAAM,CAAC,yBAAyB,CAAC,kBAAkB,CAAC;YACpD,MAAM,CAAC,yBAAyB,CAAC,cAAc,CAAC;SAChD,CAAC,CAAA;QACF,MAAM,QAAQ,GAAoB,cAAc,CAAC,IAAI,CAAC,CAAA;QACtD,MAAM,MAAM,GAAG,uBAAuB,CAAC,QAAQ,CAAC,aAAa,CAAC,CAAA;QAC9D,MAAM,GAAG,GAAG,kBAAkB,CAAC,QAAQ,CAAC,kBAAkB,CAAC,CAAA;QAC3D,MAAM,CAAC,SAAS,EAAE,MAAM,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;YAC7C,kBAAkB,CAAC,YAAY,CAAC,QAAQ,CAAC,aAAa,CAAC;YACvD,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC,SAAS,CAAC;
SACrC,CAAC,CAAA;QACF,OAAO,IAAI,uBAAuB,CAAC;YAClC,SAAS;YACT,MAAM;YACN,MAAM;YACN,WAAW,EAAE,GAAG,EAAE,WAAW;YAC7B,gBAAgB,EAAE,GAAG,EAAE,gBAAgB;YACvC,cAAc,EAAE,GAAG,EAAE,cAAc;YACnC,GAAG,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC,CAAC,EAAE,oBAAoB,EAAE,IAAI,CAAC,oBAAoB,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACzF,CAAC,CAAA;IACH,CAAC;IAED,6DAA6D;IAC7D,KAAK,CAAC,KAAK,CAAC,IAAY,EAAE,IAAgB;QACzC,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;YAAE,OAAO,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,CAAA;QAEtD,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QACvD,gGAAgG;QAChG,+EAA+E;QAC/E,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB;YAC3C,CAAC,CAAC,mBAAmB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,oBAAoB,CAAC;YAClE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,MAAM,CAAC,CAAA;QAE3D,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAA;QAEhC,IAAI,SAAS,GAAG,IAAI,EAAE,UAAU;YAC/B,CAAC,CAAC,iBAAiB,CACjB,MAAM,EACN,mBAAmB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACzD,SAAS,EAAE,IAAI,CAAC,mBAAmB,IAAI,GAAG;gBAC1C,SAAS,EAAE,IAAI;aACf,CAAC,CACF;YACF,CAAC,CAAC,MAAM,CAAA;QAET,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;YACf,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,sBAAsB,CAAC,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACrD,SAAS,EAAE,IAAI,CAAC,YAAY,IAAI,GAAG;aACnC,CAAC,CACF,CAAA;QACF,CAAC;QAED,IAAI,IAAI,EAAE,mBAAmB,EAAE,CAAC;YAC/B,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,mCAAmC,CAClC,IAAI,CAAC,mBAAmB,EACxB,MAAM,EACN,IAAI,CAAC,MAAM,EACX,IAAI,CAAC,uBAAuB,IAAI,EAAE,CAClC,CACD,CAAA;QACF,CAAC;QAED,MAAM,YAAY,GACjB,IAAI,CAAC,UAAU,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC;gBACR,SAAS;gBACT,WAAW,EAAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;gBACvC,cAAc,EAAE,IAAI,CAAC,cAAc;aACnC,CAAC,CAAC,IAAI;YACR,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QAElD,IAAI,MAAM,GAAmB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAChD,MAAM,GAAG,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;YAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,
CAAA;YACjC,OAAO;gBACN,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,GAAG,CAA0B;gBACzD,UAAU,EAAE,KAAK,CAAC,GAAG,CAAE;aACvB,CAAA;QACF,CAAC,CAAC,CAAA;QAEF,IAAI,IAAI,EAAE,cAAc,EAAE,CAAC;YAC1B,MAAM,GAAG,oBAAoB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,CAAA;QACnD,CAAC;QACD,IAAI,IAAI,EAAE,UAAU,EAAE,CAAC;YACtB,MAAM,GAAG,gBAAgB,CAAC,IAAI,EAAE,MAAM,CAAC,CAAC,MAAM,CAAA;QAC/C,CAAC;QAED,OAAO,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CAAA;IACnG,CAAC;IAED;;;OAGG;IACH,KAAK,CAAC,eAAe,CAAC,IAAY,EAAE,IAAgB;QACnD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,EAAE,IAAI,EAAE,EAAE,GAAG,EAAE,IAAI,EAAE,KAAK,EAAE,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,CAAA;QAClE,CAAC;QACD,MAAM,EAAE,MAAM,EAAE,GAAG,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QACvD,gGAAgG;QAChG,+EAA+E;QAC/E,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,oBAAoB;YAC3C,CAAC,CAAC,mBAAmB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,GAAG,CAAC,oBAAoB,CAAC;YAClE,CAAC,CAAC,SAAS,CAAA;QACZ,MAAM,EAAE,MAAM,EAAE,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,KAAK,CAAC,GAAG,EAAE,MAAM,CAAC,CAAA;QAE3D,IAAI,CAAC,mBAAmB,CAAC,MAAM,CAAC,CAAA;QAEhC,IAAI,SAAS,GAAG,IAAI,EAAE,UAAU;YAC/B,CAAC,CAAC,iBAAiB,CACjB,MAAM,EACN,mBAAmB,CAAC,IAAI,CAAC,UAAU,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACzD,SAAS,EAAE,IAAI,CAAC,mBAAmB,IAAI,GAAG;gBAC1C,SAAS,EAAE,IAAI;aACf,CAAC,CACF;YACF,CAAC,CAAC,MAAM,CAAA;QAET,IAAI,IAAI,EAAE,GAAG,EAAE,CAAC;YACf,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,sBAAsB,CAAC,IAAI,CAAC,GAAG,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE;gBACrD,SAAS,EAAE,IAAI,CAAC,YAAY,IAAI,GAAG;aACnC,CAAC,CACF,CAAA;QACF,CAAC;QAED,IAAI,IAAI,EAAE,mBAAmB,EAAE,CAAC;YAC/B,SAAS,GAAG,iBAAiB,CAC5B,SAAS,EACT,mCAAmC,CAClC,IAAI,CAAC,mBAAmB,EACxB,MAAM,EACN,IAAI,CAAC,MAAM,EACX,IAAI,CAAC,uBAAuB,IAAI,EAAE,CAClC,CACD,CAAA;QACF,CAAC;QAED,MAAM,YAAY,GACjB,IAAI,CAAC,UAAU,KAAK,SAAS;YAC5B,CAAC,CAAC,OAAO,CAAC;gBACR,SAAS;gBACT,WAAW,E
AAE,IAAI,CAAC,WAAW;gBAC7B,gBAAgB,EAAE,IAAI,CAAC,gBAAgB;gBACvC,cAAc,EAAE,IAAI,CAAC,cAAc;aACnC,CAAC,CAAC,IAAI;YACR,CAAC,CAAC,SAAS,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAA;QAElD,MAAM,MAAM,GAAmB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YAClD,MAAM,GAAG,GAAG,YAAY,CAAC,CAAC,CAAE,CAAA;YAC5B,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;YACjC,OAAO;gBACN,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,KAAK,EAAE,CAAC,CAAC,KAAK;gBACd,GAAG,EAAE,CAAC,CAAC,GAAG;gBACV,KAAK,EAAE,CAAC,IAAI,CAAC,MAAM,CAAC,GAAG,CAAC,IAAI,GAAG,CAA0B;gBACzD,UAAU,EAAE,KAAK,CAAC,GAAG,CAAE;aACvB,CAAA;QACF,CAAC,CAAC,CAAA;QAEF,OAAO;YACN,IAAI,EAAE,gBAAgB,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,SAAS,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;YACjG,MAAM;YACN,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC;SAC3D,CAAA;IACF,CAAC;IAED,KAAK,CAAC,SAAS,CAAC,IAAY,EAAE,IAAgB;QAC7C,OAAO,YAAY,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IAClD,CAAC;IAED,KAAK,CAAC,WAAW,CAAC,IAAY,EAAE,IAAgB;QAC/C,OAAO,cAAc,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAA;IACpD,CAAC;IAED,KAAK,CAAC,QAAQ,CAAC,IAAY,EAAE,IAA8D;QAC1F,OAAO,WAAW,CAAC,MAAM,IAAI,CAAC,KAAK,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,IAAI,EAAE,GAAG,CAAC,CAAA;IAC5D,CAAC;IAED;;;;;;;;;;OAUG;IACK,mBAAmB,CAAC,MAA2B;QACtD,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC;YAAE,OAAM;QAC/B,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAE,CAAC,MAAM,CAAA;QAC/B,IAAI,KAAK,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC;YAChC,MAAM,IAAI,KAAK,CACd,wCAAwC,KAAK,2CAA2C;gBACvF,wBAAwB,IAAI,CAAC,MAAM,CAAC,MAAM,iDAAiD;gBAC3F,oFAAoF,CACrF,CAAA;QACF,CAAC;IACF,CAAC;CACD;AAmED,SAAS,aAAa,CAAC,GAAa;IACnC,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,IAAI,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;IACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,IAAI,GAAG,CAAC,CAAC,CAAE,GAAG,MAAM,EAAE,CAAC;YACtB,MAAM,GAAG,GAAG,CAAC,CAAC,CAAE,CAAA;YAChB,MAAM,GAAG,CAAC,CAAA;QACX
,CAAC;IACF,CAAC;IACD,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,KAAK,MAAM,CAAC,IAAI,GAAG;QAAE,MAAM,IAAI,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAA;IACnD,MAAM,IAAI,GAAG,CAAC,GAAG,MAAM,CAAA;IACvB,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,IAAI,EAAE,CAAA;AAC7B,CAAC;AAED,uGAAuG;AACvG,SAAS,WAAW,CAAC,CAAa,EAAE,CAAa;IAChD,MAAM,CAAC,GAAG,CAAC,CAAC,MAAM,CAAA;IAClB,MAAM,GAAG,GAAe,EAAE,CAAA;IAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,CAAC,CAAC,CAAA;QAChC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;QAC1D,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACd,CAAC;IACD,OAAO,GAAG,CAAA;AACX,CAAC"}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Soft-prior emission biases derived from the FST gazetteer. When the FST finds that a token
|
|
7
|
+
* sequence matches a known place name (e.g., "New York" → locality + region), this module
|
|
8
|
+
* produces additive biases that nudge the Viterbi decoder toward the matching BIO labels.
|
|
9
|
+
*
|
|
10
|
+
* Composes with the QueryShape prior via addEmissionMatrix — same integration point, same additive
|
|
11
|
+
* semantics.
|
|
12
|
+
*
|
|
13
|
+
* SentencePiece ↔ FST bridge: SentencePiece pieces are grouped into whitespace words (by the ▁
|
|
14
|
+
* sentinel), normalized through the same pipeline as FST edges (NFKC, lowercase, strip
|
|
15
|
+
* non-alnum), and walked through the FST as contiguous subpaths.
|
|
16
|
+
*
|
|
17
|
+
* Uses structural typing for the FST input so this module has zero dependencies on
|
|
18
|
+
* `@mailwoman/resolver-wof-sqlite` — consumers pass an FstMatcher instance, but this file only
|
|
19
|
+
* consumes the shape.
|
|
20
|
+
*/
|
|
21
|
+
import type { TokenLike } from "./query-shape-prior.js";
|
|
22
|
+
export interface FstMatchLike {
|
|
23
|
+
stateId: number;
|
|
24
|
+
accepted: boolean;
|
|
25
|
+
depth: number;
|
|
26
|
+
}
|
|
27
|
+
export interface FstPlaceEntryLike {
|
|
28
|
+
wofID: number;
|
|
29
|
+
placetype: string;
|
|
30
|
+
importance: number;
|
|
31
|
+
}
|
|
32
|
+
export interface FstMatcherLike {
|
|
33
|
+
walk(tokens: string[]): FstMatchLike | null;
|
|
34
|
+
walkFrom(prev: FstMatchLike, token: string): FstMatchLike | null;
|
|
35
|
+
accepting(stateId: number): FstPlaceEntryLike[];
|
|
36
|
+
}
|
|
37
|
+
export interface WordGroup {
|
|
38
|
+
fstToken: string;
|
|
39
|
+
pieceIndices: number[];
|
|
40
|
+
}
|
|
41
|
+
export interface FstPriorOpts {
|
|
42
|
+
biasScale?: number;
|
|
43
|
+
/**
|
|
44
|
+
* Maximum bias magnitude (logits). Prevents large-population places from overriding the model.
|
|
45
|
+
* Default 3.0.
|
|
46
|
+
*/
|
|
47
|
+
maxBias?: number;
|
|
48
|
+
suppressionScale?: number;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Build a `[seqLen][numLabels]` bias matrix from FST gazetteer matches.
|
|
52
|
+
*
|
|
53
|
+
* Walks all contiguous subpaths of the reconstructed whitespace-token sequence through the FST. For
|
|
54
|
+
* each accepting state, biases the corresponding BIO labels on the matched pieces.
|
|
55
|
+
*/
|
|
56
|
+
export declare function buildFstEmissionPriors(fst: FstMatcherLike, pieces: ReadonlyArray<TokenLike & {
|
|
57
|
+
piece: string;
|
|
58
|
+
}>, labels: ReadonlyArray<string>, opts?: FstPriorOpts): number[][];
|
|
59
|
+
/**
|
|
60
|
+
* Group SentencePiece pieces into whitespace-delimited words. Each word's literal text is
|
|
61
|
+
* reconstructed by concatenating pieces (minus leading ▁), then normalized through the same
|
|
62
|
+
* pipeline the FST builder uses.
|
|
63
|
+
*
|
|
64
|
+
* Exported (alongside {@linkcode normalizeFstToken} and the {@linkcode WordGroup} type) so the
|
|
65
|
+
* street-morphology prior can reuse the same piece-grouping/normalization pipeline without
|
|
66
|
+
* duplication. Internal helper signature; not part of the public neural API.
|
|
67
|
+
*/
|
|
68
|
+
export declare function groupPiecesIntoWords(pieces: ReadonlyArray<{
|
|
69
|
+
piece: string;
|
|
70
|
+
}>): WordGroup[];
|
|
71
|
+
//# sourceMappingURL=fst-prior.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fst-prior.d.ts","sourceRoot":"","sources":["../fst-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAQvD,MAAM,WAAW,YAAY;IAC5B,OAAO,EAAE,MAAM,CAAA;IACf,QAAQ,EAAE,OAAO,CAAA;IACjB,KAAK,EAAE,MAAM,CAAA;CACb;AAED,MAAM,WAAW,iBAAiB;IACjC,KAAK,EAAE,MAAM,CAAA;IACb,SAAS,EAAE,MAAM,CAAA;IACjB,UAAU,EAAE,MAAM,CAAA;CAClB;AAED,MAAM,WAAW,cAAc;IAC9B,IAAI,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,YAAY,GAAG,IAAI,CAAA;IAC3C,QAAQ,CAAC,IAAI,EAAE,YAAY,EAAE,KAAK,EAAE,MAAM,GAAG,YAAY,GAAG,IAAI,CAAA;IAChE,SAAS,CAAC,OAAO,EAAE,MAAM,GAAG,iBAAiB,EAAE,CAAA;CAC/C;AAiBD,MAAM,WAAW,SAAS;IACzB,QAAQ,EAAE,MAAM,CAAA;IAChB,YAAY,EAAE,MAAM,EAAE,CAAA;CACtB;AAID,MAAM,WAAW,YAAY;IAC5B,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB;;;OAGG;IACH,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,gBAAgB,CAAC,EAAE,MAAM,CAAA;CACzB;AAED;;;;;GAKG;AACH,wBAAgB,sBAAsB,CACrC,GAAG,EAAE,cAAc,EACnB,MAAM,EAAE,aAAa,CAAC,SAAS,GAAG;IAAE,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,EACpD,MAAM,EAAE,aAAa,CAAC,MAAM,CAAC,EAC7B,IAAI,GAAE,YAAiB,GACrB,MAAM,EAAE,EAAE,CA+DZ;AAED;;;;;;;;GAQG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,aAAa,CAAC;IAAE,KAAK,EAAE,MAAM,CAAA;CAAE,CAAC,GAAG,SAAS,EAAE,CAiC1F"}
|