@mailwoman/neural 2.1.0 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/out/anchor-inference.d.ts +57 -0
  2. package/out/anchor-inference.d.ts.map +1 -0
  3. package/out/anchor-inference.js +94 -0
  4. package/out/anchor-inference.js.map +1 -0
  5. package/out/browser.d.ts +18 -0
  6. package/out/browser.d.ts.map +1 -0
  7. package/out/browser.js +19 -0
  8. package/out/browser.js.map +1 -0
  9. package/out/classifier.d.ts +145 -11
  10. package/out/classifier.d.ts.map +1 -1
  11. package/out/classifier.js +185 -20
  12. package/out/classifier.js.map +1 -1
  13. package/out/fst-prior.d.ts +71 -0
  14. package/out/fst-prior.d.ts.map +1 -0
  15. package/out/fst-prior.js +173 -0
  16. package/out/fst-prior.js.map +1 -0
  17. package/out/index.d.ts +7 -0
  18. package/out/index.d.ts.map +1 -1
  19. package/out/index.js +5 -0
  20. package/out/index.js.map +1 -1
  21. package/out/labels.d.ts +30 -6
  22. package/out/labels.d.ts.map +1 -1
  23. package/out/labels.js +43 -6
  24. package/out/labels.js.map +1 -1
  25. package/out/onnx-runner.d.ts +8 -1
  26. package/out/onnx-runner.d.ts.map +1 -1
  27. package/out/onnx-runner.js +31 -1
  28. package/out/onnx-runner.js.map +1 -1
  29. package/out/postcode-anchor.d.ts +117 -0
  30. package/out/postcode-anchor.d.ts.map +1 -0
  31. package/out/postcode-anchor.js +269 -0
  32. package/out/postcode-anchor.js.map +1 -0
  33. package/out/postcode-binary-resolver.d.ts +60 -0
  34. package/out/postcode-binary-resolver.d.ts.map +1 -0
  35. package/out/postcode-binary-resolver.js +208 -0
  36. package/out/postcode-binary-resolver.js.map +1 -0
  37. package/out/postcode-repair.d.ts +65 -0
  38. package/out/postcode-repair.d.ts.map +1 -0
  39. package/out/postcode-repair.js +171 -0
  40. package/out/postcode-repair.js.map +1 -0
  41. package/out/proposal-classifier.d.ts +5 -1
  42. package/out/proposal-classifier.d.ts.map +1 -1
  43. package/out/proposal-classifier.js +5 -3
  44. package/out/proposal-classifier.js.map +1 -1
  45. package/out/query-shape-prior.d.ts +74 -0
  46. package/out/query-shape-prior.d.ts.map +1 -0
  47. package/out/query-shape-prior.js +223 -0
  48. package/out/query-shape-prior.js.map +1 -0
  49. package/out/street-morphology-prior.d.ts +56 -0
  50. package/out/street-morphology-prior.d.ts.map +1 -0
  51. package/out/street-morphology-prior.js +159 -0
  52. package/out/street-morphology-prior.js.map +1 -0
  53. package/out/tokenizer.d.ts +6 -1
  54. package/out/tokenizer.d.ts.map +1 -1
  55. package/out/tokenizer.js +8 -3
  56. package/out/tokenizer.js.map +1 -1
  57. package/out/unit-repair.d.ts +46 -0
  58. package/out/unit-repair.d.ts.map +1 -0
  59. package/out/unit-repair.js +147 -0
  60. package/out/unit-repair.js.map +1 -0
  61. package/out/viterbi.d.ts +76 -0
  62. package/out/viterbi.d.ts.map +1 -0
  63. package/out/viterbi.js +163 -0
  64. package/out/viterbi.js.map +1 -0
  65. package/out/vitest.config.d.ts.map +1 -1
  66. package/out/vitest.config.js +3 -0
  67. package/out/vitest.config.js.map +1 -1
  68. package/out/weights.d.ts +42 -0
  69. package/out/weights.d.ts.map +1 -1
  70. package/out/weights.js +92 -4
  71. package/out/weights.js.map +1 -1
  72. package/package.json +10 -3
@@ -0,0 +1,57 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
7
+ * (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At inference
8
+ * the model conditions on per-piece anchor features fed alongside `input_ids`; this builds them from
9
+ * a raw address + its SentencePiece pieces, using the SAME postcode→anchor lookup the model trained
10
+ * against (`scripts/build-pilot-anchor-lookup.py`), so the feature layout matches byte-for-byte.
11
+ *
12
+ * The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
13
+ * model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values emitted
14
+ * by the Python `anchor_feature_vector` — any drift fails the test.
15
+ */
16
+ import type { TokenizedPiece } from "./tokenizer.js";
17
+ /**
18
+ * The locale class order — MUST match Python `mailwoman_train/labels.py::LOCALE_COUNTRIES`. The
19
+ * posterior occupies indices `[0, LOCALE_ORDER.length)`; the normalized centroid the last two.
20
+ * (Pinned by the test; do not reorder.)
21
+ */
22
+ export declare const LOCALE_ORDER: readonly ["US", "FR", "DE", "CA", "GB", "JP", "ES", "IT", "NL"];
23
+ /** Anchor feature width = posterior over the locale set + a 2-d centroid. */
24
+ export declare const ANCHOR_FEATURE_DIM: number;
25
+ /** One postcode's anchor record (from the pilot lookup): country posterior + a single centroid. */
26
+ export interface AnchorEntry {
27
+ posterior: Record<string, number>;
28
+ lat: number;
29
+ lon: number;
30
+ }
31
+ export type AnchorLookup = Map<string, AnchorEntry>;
32
+ /**
33
+ * Build the fixed-width anchor feature vector — the exact mirror of Python `anchor_feature_vector`:
34
+ * the country posterior laid out in {@linkcode LOCALE_ORDER} order (renormalized over the in-set mass) + a
35
+ * normalized centroid (`lat/90`, `lon/180` ∈ [-1, 1]).
36
+ */
37
+ export declare function anchorFeatureVector(posterior: Record<string, number>, lat: number, lon: number): number[];
38
+ /**
39
+ * Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map. Pure
40
+ * (takes the parsed object, not a path) so this module stays browser-safe — the file read lives in
41
+ * the Node-side caller (the eval).
42
+ */
43
+ export declare function parseAnchorLookup(raw: Record<string, [Record<string, number>, number, number]>): AnchorLookup;
44
+ /**
45
+ * Per-piece anchor features + confidence for `text`, projected onto its SP `pieces` by the SAME
46
+ * char→piece rule the labels use (a piece takes the anchor of the postcode span its first
47
+ * non-whitespace char falls inside) — so the anchor lands on exactly the postcode's sub-tokens.
48
+ *
49
+ * Postcode spans are the alphanumeric runs in `text` that the lookup recognizes (gold-equivalent on
50
+ * clean rendered addresses); a recognized span yields a confidence-1.0 anchor, like training's
51
+ * gold-span. Returns `(pieces × ANCHOR_FEATURE_DIM)` features + `(pieces,)` confidence.
52
+ */
53
+ export declare function buildAnchorFeatures(text: string, pieces: ReadonlyArray<TokenizedPiece>, lookup: AnchorLookup): {
54
+ features: number[][];
55
+ confidence: number[];
56
+ };
57
+ //# sourceMappingURL=anchor-inference.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"anchor-inference.d.ts","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAEpD;;;;GAIG;AACH,eAAO,MAAM,YAAY,iEAAkE,CAAA;AAE3F,6EAA6E;AAC7E,eAAO,MAAM,kBAAkB,QAA0B,CAAA;AAEzD,mGAAmG;AACnG,MAAM,WAAW,WAAW;IAC3B,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACjC,GAAG,EAAE,MAAM,CAAA;IACX,GAAG,EAAE,MAAM,CAAA;CACX;AAED,MAAM,MAAM,YAAY,GAAG,GAAG,CAAC,MAAM,EAAE,WAAW,CAAC,CAAA;AAEnD;;;;GAIG;AACH,wBAAgB,mBAAmB,CAAC,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE,CAgBzG;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC,GAAG,YAAY,CAI7G;AAED;;;;;;;;GAQG;AACH,wBAAgB,mBAAmB,CAClC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,aAAa,CAAC,cAAc,CAAC,EACrC,MAAM,EAAE,YAAY,GAClB;IAAE,QAAQ,EAAE,MAAM,EAAE,EAAE,CAAC;IAAC,UAAU,EAAE,MAAM,EAAE,CAAA;CAAE,CA0BhD"}
@@ -0,0 +1,94 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Inference-side postcode-anchor features (#239/#240) — the mirror of the Python training pipeline
7
+ * (`mailwoman_train/tokenizer.py::anchor_feature_vector` + `realign_anchor_to_pieces`). At inference
8
+ * the model conditions on per-piece anchor features fed alongside `input_ids`; this builds them from
9
+ * a raw address + its SentencePiece pieces, using the SAME postcode→anchor lookup the model trained
10
+ * against (`scripts/build-pilot-anchor-lookup.py`), so the feature layout matches byte-for-byte.
11
+ *
12
+ * The layout is LOAD-BEARING and cross-language: a wrong locale order or centroid scale feeds the
13
+ * model garbage. `anchor-inference.test.ts` pins both `LOCALE_ORDER` and the vector to values emitted
14
+ * by the Python `anchor_feature_vector` — any drift fails the test.
15
+ */
16
/**
 * The locale class order — MUST match Python `mailwoman_train/labels.py::LOCALE_COUNTRIES`. The
 * posterior occupies indices `[0, LOCALE_ORDER.length)`; the normalized centroid occupies the last
 * two slots. (Pinned by the test; do not reorder.)
 */
export const LOCALE_ORDER = ["US", "FR", "DE", "CA", "GB", "JP", "ES", "IT", "NL"];
/** Anchor feature width = posterior over the locale set + a 2-d centroid. */
export const ANCHOR_FEATURE_DIM = LOCALE_ORDER.length + 2;
/**
 * Build the fixed-width anchor feature vector — the mirror of Python `anchor_feature_vector`.
 *
 * Layout: slots `[0, LOCALE_ORDER.length)` hold the country posterior in {@linkcode LOCALE_ORDER}
 * order, renormalized over the in-set mass; the last two slots hold the centroid scaled to
 * `lat/90` and `lon/180`, each clamped to [-1, 1].
 *
 * @param posterior Country code → weight. Codes are matched case-insensitively; codes outside
 *   {@linkcode LOCALE_ORDER} are ignored.
 * @param lat Centroid latitude in degrees.
 * @param lon Centroid longitude in degrees.
 * @returns A new array of length {@linkcode ANCHOR_FEATURE_DIM}.
 */
export function anchorFeatureVector(posterior, lat, lon) {
    const vec = new Array(ANCHOR_FEATURE_DIM).fill(0);
    let total = 0;
    for (const [country, weight] of Object.entries(posterior)) {
        const idx = LOCALE_ORDER.indexOf(country.toUpperCase());
        if (idx >= 0) {
            // Accumulate rather than overwrite: keys are matched case-insensitively, so "us" and
            // "US" land in the same slot. Overwriting while still adding both weights to `total`
            // would leave the renormalized posterior summing to less than 1.
            vec[idx] += weight;
            total += weight;
        }
    }
    // Renormalize over the in-set mass only; when that mass is not positive the posterior slots
    // are left untouched (all zeros when no country matched).
    if (total > 0) {
        for (let i = 0; i < LOCALE_ORDER.length; i++) {
            vec[i] /= total;
        }
    }
    vec[LOCALE_ORDER.length] = Math.max(-1, Math.min(1, lat / 90));
    vec[LOCALE_ORDER.length + 1] = Math.max(-1, Math.min(1, lon / 180));
    return vec;
}
47
/**
 * Parse the pilot postcode→anchor lookup JSON (`{postcode: [posterior, lat, lon]}`) into a Map. Pure
 * (takes the parsed object, not a path) so this module stays browser-safe — the file read lives in
 * the caller.
 *
 * The input is typically the result of `JSON.parse`, so each entry's shape is checked here rather
 * than letting a malformed record surface later as NaN features.
 *
 * @param raw Parsed lookup object keyed by postcode; each value is `[posterior, lat, lon]`.
 * @returns Map from postcode (key kept exactly as given) to `{ posterior, lat, lon }`.
 * @throws {TypeError} If `raw` is not an object, or an entry is not a
 *   `[object, finite number, finite number]` tuple. The message names the offending postcode.
 */
export function parseAnchorLookup(raw) {
    if (raw === null || typeof raw !== "object") {
        throw new TypeError("parseAnchorLookup: expected an object of {postcode: [posterior, lat, lon]}");
    }
    const out = new Map();
    for (const [pc, entry] of Object.entries(raw)) {
        if (!Array.isArray(entry) || entry.length < 3) {
            throw new TypeError(`parseAnchorLookup: entry for postcode "${pc}" is not a [posterior, lat, lon] tuple`);
        }
        const [posterior, lat, lon] = entry;
        if (posterior === null || typeof posterior !== "object") {
            throw new TypeError(`parseAnchorLookup: posterior for postcode "${pc}" is not an object`);
        }
        if (!Number.isFinite(lat) || !Number.isFinite(lon)) {
            throw new TypeError(`parseAnchorLookup: centroid for postcode "${pc}" is not a pair of finite numbers`);
        }
        out.set(pc, { posterior, lat, lon });
    }
    return out;
}
58
/**
 * Per-piece anchor features + confidence for `text`, projected onto its SP `pieces` by the SAME
 * char→piece rule the labels use (a piece takes the anchor of the postcode span its first
 * non-whitespace char falls inside) — so the anchor lands on exactly the postcode's sub-tokens.
 *
 * Postcode spans are the alphanumeric runs in `text` whose upper-cased form is a key in `lookup`;
 * a recognized span yields a confidence-1.0 anchor on every piece it claims.
 *
 * @param text The raw address string the pieces were produced from.
 * @param pieces Tokenized pieces; only each piece's `start` / `end` char offsets into `text` are read.
 * @param lookup Postcode → anchor entry map (see `parseAnchorLookup`).
 * @returns `features` — one `ANCHOR_FEATURE_DIM`-wide row per piece (all zeros when the piece is
 *   not inside a recognized postcode) — and `confidence`, one value per piece (0 or 1.0). Each
 *   row is its own array, so callers may mutate one row without affecting another.
 */
export function buildAnchorFeatures(text, pieces, lookup) {
    const features = pieces.map(() => new Array(ANCHOR_FEATURE_DIM).fill(0));
    const confidence = pieces.map(() => 0);
    // The deciding character of a piece (its first non-whitespace char that lies inside `text`)
    // does not depend on which postcode is being projected, so find it once per piece instead of
    // rescanning every piece for every lookup hit. -1 means "no deciding character".
    const whitespace = /\s/;
    const decidingChar = pieces.map((piece) => {
        for (let c = piece.start; c < piece.end; c++) {
            if (c < text.length && !whitespace.test(text[c])) {
                return c; // first non-whitespace char of the piece decides (mirrors realign_anchor_to_pieces)
            }
        }
        return -1;
    });
    const tokenRe = /[A-Za-z0-9]+/g;
    let m;
    while ((m = tokenRe.exec(text)) !== null) {
        const entry = lookup.get(m[0].toUpperCase());
        if (!entry) {
            continue;
        }
        const spanBegin = m.index;
        const spanEnd = spanBegin + m[0].length;
        const vec = anchorFeatureVector(entry.posterior, entry.lat, entry.lon);
        for (let i = 0; i < pieces.length; i++) {
            const c = decidingChar[i];
            if (c >= spanBegin && c < spanEnd) {
                // Copy per piece: sharing one array across all sub-tokens of a postcode would let
                // an in-place edit of one row silently change the others.
                features[i] = vec.slice();
                confidence[i] = 1.0;
            }
        }
    }
    return { features, confidence };
}
94
+ //# sourceMappingURL=anchor-inference.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"anchor-inference.js","sourceRoot":"","sources":["../anchor-inference.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;GAcG;AAIH;;;;GAIG;AACH,MAAM,CAAC,MAAM,YAAY,GAAG,CAAC,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,CAAU,CAAA;AAE3F,6EAA6E;AAC7E,MAAM,CAAC,MAAM,kBAAkB,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC,CAAA;AAWzD;;;;GAIG;AACH,MAAM,UAAU,mBAAmB,CAAC,SAAiC,EAAE,GAAW,EAAE,GAAW;IAC9F,MAAM,GAAG,GAAG,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IACzD,IAAI,KAAK,GAAG,CAAC,CAAA;IACb,KAAK,MAAM,CAAC,OAAO,EAAE,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;QAC3D,MAAM,GAAG,GAAG,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC,WAAW,EAAmC,CAAC,CAAA;QACxF,IAAI,GAAG,IAAI,CAAC,EAAE,CAAC;YACd,GAAG,CAAC,GAAG,CAAC,GAAG,MAAM,CAAA;YACjB,KAAK,IAAI,MAAM,CAAA;QAChB,CAAC;IACF,CAAC;IACD,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;QACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,CAAC,MAAM,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,CAAC,CAAE,IAAI,KAAK,CAAA;IAC/D,CAAC;IACD,GAAG,CAAC,YAAY,CAAC,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,EAAE,CAAC,CAAC,CAAA;IAC9D,GAAG,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,GAAG,GAAG,CAAC,CAAC,CAAA;IACnE,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,iBAAiB,CAAC,GAA6D;IAC9F,MAAM,GAAG,GAAiB,IAAI,GAAG,EAAE,CAAA;IACnC,KAAK,MAAM,CAAC,EAAE,EAAE,CAAC,SAAS,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,EAAE,SAAS,EAAE,GAAG,EAAE,GAAG,EAAE,CAAC,CAAA;IACnG,OAAO,GAAG,CAAA;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,mBAAmB,CAClC,IAAY,EACZ,MAAqC,EACrC,MAAoB;IAEpB,MAAM,QAAQ,GAAe,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,IAAI,KAAK,CAAS,kBAAkB,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAC5F,MAAM,UAAU,GAAa,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;IAEhD,MAAM,OAAO,GAAG,eAAe,CAAA;IAC/B,IAAI,CAAyB,CAAA;IAC7B,OAAO,CAAC,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC1C,MA
AM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAA;QAC5C,IAAI,CAAC,KAAK;YAAE,SAAQ;QACpB,MAAM,SAAS,GAAG,CAAC,CAAC,KAAK,CAAA;QACzB,MAAM,OAAO,GAAG,CAAC,CAAC,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAA;QACrC,MAAM,GAAG,GAAG,mBAAmB,CAAC,KAAK,CAAC,SAAS,EAAE,KAAK,CAAC,GAAG,EAAE,KAAK,CAAC,GAAG,CAAC,CAAA;QACtE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACpB,KAAK,IAAI,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;gBACtC,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAE,CAAC,EAAE,CAAC;oBAC7C,IAAI,CAAC,IAAI,SAAS,IAAI,CAAC,GAAG,OAAO,EAAE,CAAC;wBACnC,QAAQ,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;wBACjB,UAAU,CAAC,CAAC,CAAC,GAAG,GAAG,CAAA;oBACpB,CAAC;oBACD,MAAK,CAAC,oFAAoF;gBAC3F,CAAC;YACF,CAAC;QACF,CAAC;IACF,CAAC;IACD,OAAO,EAAE,QAAQ,EAAE,UAAU,EAAE,CAAA;AAChC,CAAC"}
@@ -0,0 +1,18 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Browser-safe re-export surface. Excludes `./onnx-runner.js` + `./weights.js` (Node-only — they
7
+ * statically reference `onnxruntime-node` + `node:fs`). The dynamic `loadFromWeights` /
8
+ * `loadFromFile` paths from those modules guard the corresponding imports with `webpackIgnore` so
9
+ * Node callers still get them via the main `@mailwoman/neural` entry without bundling them into a
10
+ * browser graph.
11
+ */
12
+ export * from "./classifier.js";
13
+ export * from "./labels.js";
14
+ export * from "./tokenizer.js";
15
+ export * from "./anchor-inference.js";
16
+ export * from "./postcode-binary-resolver.js";
17
+ export type { InferResult } from "./onnx-runner.js";
18
+ //# sourceMappingURL=browser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"browser.d.ts","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA;AAG9B,cAAc,uBAAuB,CAAA;AACrC,cAAc,+BAA+B,CAAA;AAG7C,YAAY,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA"}
package/out/browser.js ADDED
@@ -0,0 +1,19 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Browser-safe re-export surface. Excludes `./onnx-runner.js` + `./weights.js` (Node-only — they
7
+ * statically reference `onnxruntime-node` + `node:fs`). The dynamic `loadFromWeights` /
8
+ * `loadFromFile` paths from those modules guard the corresponding imports with `webpackIgnore` so
9
+ * Node callers still get them via the main `@mailwoman/neural` entry without bundling them into a
10
+ * browser graph.
11
+ */
12
+ export * from "./classifier.js";
13
+ export * from "./labels.js";
14
+ export * from "./tokenizer.js";
15
+ // Browser-safe anchor channel (#239/#240): the pure-JS feature builder + the postcode binary resolver
16
+ // (zero-dep) the demo wires together to feed the anchor at inference.
17
+ export * from "./anchor-inference.js";
18
+ export * from "./postcode-binary-resolver.js";
19
+ //# sourceMappingURL=browser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"browser.js","sourceRoot":"","sources":["../browser.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,gBAAgB,CAAA;AAC9B,sGAAsG;AACtG,sEAAsE;AACtE,cAAc,uBAAuB,CAAA;AACrC,cAAc,+BAA+B,CAAA"}
@@ -9,19 +9,68 @@
9
9
  *
10
10
  * Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
11
11
  */
12
- import { type AddressTree, type ComponentTag, decodeAsXml } from "@mailwoman/core/decoder";
13
- import { OnnxRunner } from "./onnx-runner.js";
12
+ import { decodeAsXml, type AddressTree, type ComponentTag } from "@mailwoman/core/decoder";
13
+ import { type FstMatcherLike } from "./fst-prior.js";
14
+ import type { InferResult } from "./onnx-runner.js";
15
+ import { type QueryShapeLike } from "./query-shape-prior.js";
16
+ import { type StreetMorphologyPriorOpts } from "./street-morphology-prior.js";
14
17
  import { MailwomanTokenizer } from "./tokenizer.js";
15
- import { type ResolveWeightsOpts } from "./weights.js";
18
+ import { type AnchorLookup } from "./anchor-inference.js";
19
+ import type { ResolveWeightsOpts } from "./weights.js";
20
+ /**
21
+ * Structural type the classifier needs from a runner. Lets callers swap the Node-side `OnnxRunner`
22
+ * for a browser-side runner (e.g. `@mailwoman/neural-web`'s `WebOnnxRunner`) without inheritance —
23
+ * the classifier only ever calls `infer(tokenIds, anchor?)`.
24
+ */
25
+ export interface NeuralRunner {
26
+ infer(tokenIds: number[], anchor?: {
27
+ features: ReadonlyArray<ReadonlyArray<number>>;
28
+ confidence: ReadonlyArray<number>;
29
+ }): Promise<InferResult>;
30
+ }
16
31
  export interface NeuralAddressClassifierConfig {
17
32
  tokenizer: MailwomanTokenizer;
18
- runner: OnnxRunner;
19
- /** Label vocabulary in the order the model emits them. Defaults to Stage 1 (v0.1.0/v0.2.0). */
33
+ runner: NeuralRunner;
34
+ /**
35
+ * Label vocabulary in the order the model emits them. Defaults to Stage 2 (v0.3.0). Stage 2
36
+ * strictly extends Stage 1 at the same indices, so a v0.2.0 Stage 1 model loaded with this
37
+ * default still decodes correctly — its emissions only span the first 15 entries.
38
+ */
20
39
  labels?: readonly string[];
40
+ /**
41
+ * Decoding strategy:
42
+ *
43
+ * - `"viterbi"` (default) — linear-chain CRF Viterbi with the BIO structural mask. Prevents
44
+ * orphan-`I-*` sequences. If `transitions` is provided, uses learned scores on top.
45
+ * - `"argmax"` — per-token argmax. Faster but produces structurally invalid sequences. Use only for
46
+ * debugging / comparison.
47
+ */
48
+ decode?: "viterbi" | "argmax";
49
+ /**
50
+ * Optional learned CRF transition scores. Square matrix of size `labels.length × labels.length`.
51
+ * Added on top of the structural BIO mask. Future weights releases ship this; today's v3.0.0
52
+ * weights don't, so the structural mask alone is used.
53
+ */
54
+ transitions?: number[][];
55
+ /** Optional learned start-of-sequence transition scores per label. */
56
+ startTransitions?: number[];
57
+ /** Optional learned end-of-sequence transition scores per label. */
58
+ endTransitions?: number[];
59
+ /**
60
+ * Optional postcode-anchor lookup (#239/#240). When set, `parse` builds per-piece anchor features
61
+ * from the text + this lookup and feeds them to the runner — for models trained with the anchor
62
+ * channel (exported with the `anchor_features`/`anchor_confidence` ONNX inputs). Omit for plain
63
+ * models. Build via `parseAnchorLookup` from `./anchor-inference.js`.
64
+ */
65
+ postcodeAnchorLookup?: AnchorLookup;
21
66
  }
22
67
  export declare class NeuralAddressClassifier {
23
68
  private readonly cfg;
24
69
  private readonly labels;
70
+ private readonly decodeMode;
71
+ private readonly transitions;
72
+ private readonly startTransitions;
73
+ private readonly endTransitions;
25
74
  constructor(cfg: NeuralAddressClassifierConfig);
26
75
  /**
27
76
  * One-call factory that resolves the weights package (or explicit paths), loads the tokenizer and
@@ -29,12 +78,97 @@ export declare class NeuralAddressClassifier {
29
78
  *
30
79
  * Resolution order: explicit paths in `opts` → `@mailwoman/neural-weights-<locale>` package →
31
80
  * throws a single actionable error.
81
+ *
82
+ * **Node-only.** The dynamic imports keep `OnnxRunner` (onnxruntime-node) + `resolveWeights`
83
+ * (uses Node fs) out of the static dependency graph, so this file can be bundled for the browser
84
+ * by `@mailwoman/neural-web`. Calling this method in a browser will throw at runtime — use
85
+ * `loadNeuralClassifierFromUrls` from `@mailwoman/neural-web` instead.
86
+ */
87
+ static loadFromWeights(opts?: ResolveWeightsOpts & {
88
+ postcodeAnchorLookup?: AnchorLookup;
89
+ }): Promise<NeuralAddressClassifier>;
90
+ /** Tokenize → infer → Viterbi (or argmax) → decoder tree. */
91
+ parse(text: string, opts?: ParseOpts): Promise<AddressTree>;
92
+ /**
93
+ * Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
94
+ * logit aggregation (Option C joint-reconcile integration).
95
+ */
96
+ parseWithLogits(text: string, opts?: ParseOpts): Promise<ParseWithLogitsResult>;
97
+ parseJson(text: string, opts?: ParseOpts): Promise<Partial<Record<ComponentTag, string>>>;
98
+ parseTuples(text: string, opts?: ParseOpts): Promise<Array<[ComponentTag, string]>>;
99
+ parseXml(text: string, opts?: ParseOpts & {
100
+ xml?: Parameters<typeof decodeAsXml>[1];
101
+ }): Promise<string>;
102
+ /**
103
+ * Guard against a silent label/emission shape overrun. When the model emits MORE logits per token
104
+ * than the configured label vocabulary (e.g. a Stage 3 bundle loaded with the default Stage 2
105
+ * labels), viterbi indexes past the transition matrix and dies with an opaque `Cannot read
106
+ * properties of undefined (reading '0')`. Fail fast here with a message that names the contract
107
+ * the caller violated.
108
+ *
109
+ * The opposite shape (model narrower than labels) is intentionally permitted — STAGE2_BIO_LABELS
110
+ * prefix-extends STAGE1_BIO_LABELS so a Stage 1 model loaded with Stage 2 labels decodes
111
+ * correctly via the first 15 logits. See labels.ts for the contract.
112
+ */
113
+ private assertEmissionWidth;
114
+ }
115
+ /** Result of `parseWithLogits` — tree + raw material for per-span logit aggregation. */
116
+ export interface ParseWithLogitsResult {
117
+ tree: AddressTree;
118
+ logits: number[][];
119
+ pieces: Array<{
120
+ start: number;
121
+ end: number;
122
+ }>;
123
+ }
124
+ /**
125
+ * Per-call opts for `parse()`. Threading a precomputed `QueryShape` here turns on the soft-prior
126
+ * bias path in the Viterbi decoder (Stage 2.4 boundary → Stage 3 encoder integration).
127
+ */
128
+ export interface ParseOpts {
129
+ /**
130
+ * Precomputed `QueryShape` for this input (from `@mailwoman/query-shape`'s `computeQueryShape`).
131
+ * Known-format hits in the shape produce additive emission biases toward the matching BIO label.
132
+ * Typed structurally — no runtime dependency on `@mailwoman/query-shape`.
133
+ */
134
+ queryShape?: QueryShapeLike;
135
+ /**
136
+ * Maximum bias magnitude in log-odds units. Default 1.0 — adds up to ~e^1 ≈ 2.7× odds to the
137
+ * favored label. Confidence-scaled, so a 0.6-confidence format hit gets +0.6 max bias.
138
+ */
139
+ queryShapeBiasScale?: number;
140
+ /**
141
+ * Pre-built FST gazetteer matcher. When provided, gazetteer matches produce additive emission
142
+ * biases.
143
+ */
144
+ fst?: FstMatcherLike;
145
+ /** Bias magnitude for FST gazetteer matches. Default 1.0. */
146
+ fstBiasScale?: number;
147
+ /**
148
+ * Pre-built street-morphology FST matcher. When provided, street-type affixes (Avenue, rue,
149
+ * Calle, Straße, …) produce additive emission biases toward `street_prefix`/`street_suffix` on
150
+ * the matched tokens AND toward `street` / away from `dependent_locality` on the adjacent name
151
+ * tokens. Closes the v0.6.1 dependent_locality vacuum; see
152
+ * `docs/articles/concepts/street-supplement-architecture.md` for the layered design.
153
+ */
154
+ fstStreetMorphology?: FstMatcherLike;
155
+ /** Override bias magnitudes for the morphology prior. */
156
+ fstStreetMorphologyOpts?: StreetMorphologyPriorOpts;
157
+ /**
158
+ * When true, run the deterministic postcode regex repair pass (v0.7 #35) on the decoded label
159
+ * sequence before tree-building. Detects postcode-shaped substrings (GB/CA/NL/US/FR/… patterns)
160
+ * and snaps/adds the postcode span to the matched shape, fixing the SentencePiece-fragmentation
161
+ * failures catalogued in the 2026-05-29 postcode diagnostic. Off by default — opt-in until the
162
+ * v0.7 gate confirms it. See `./postcode-repair.ts`.
163
+ */
164
+ postcodeRepair?: boolean;
165
+ /**
166
+ * When true, run the deterministic secondary-unit regex repair pass on the decoded label
167
+ * sequence before tree-building. Detects designator-shaped substrings ("Apt 4B", "Ste 12",
168
+ * "Unit 9400", bare "#104", …) and snaps/adds the unit span, fixing the unit-drop weakness the
169
+ * three-arena capability eval surfaced (postal secondary-unit 0% neural). Off by default —
170
+ * opt-in until the v0.7.2 arena re-run quantifies its delta. See `./unit-repair.ts`.
32
171
  */
33
- static loadFromWeights(opts?: ResolveWeightsOpts): Promise<NeuralAddressClassifier>;
34
- /** Tokenize → infer → argmax/softmax → decoder tree. */
35
- parse(text: string): Promise<AddressTree>;
36
- parseJson(text: string): Promise<Partial<Record<ComponentTag, string>>>;
37
- parseTuples(text: string): Promise<Array<[ComponentTag, string]>>;
38
- parseXml(text: string, opts?: Parameters<typeof decodeAsXml>[1]): Promise<string>;
172
+ unitRepair?: boolean;
39
173
  }
40
174
  //# sourceMappingURL=classifier.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EACN,KAAK,WAAW,EAChB,KAAK,YAAY,EAKjB,WAAW,EACX,MAAM,yBAAyB,CAAA;AAEhC,OAAO,EAAE,UAAU,EAAE,MAAM,kBAAkB,CAAA;AAC7C,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AACnD,OAAO,EAAE,KAAK,kBAAkB,EAAkB,MAAM,cAAc,CAAA;AAEtE,MAAM,WAAW,6BAA6B;IAC7C,SAAS,EAAE,kBAAkB,CAAA;IAC7B,MAAM,EAAE,UAAU,CAAA;IAClB,+FAA+F;IAC/F,MAAM,CAAC,EAAE,SAAS,MAAM,EAAE,CAAA;CAC1B;AAED,qBAAa,uBAAuB;IAGvB,OAAO,CAAC,QAAQ,CAAC,GAAG;IAFhC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;gBAEb,GAAG,EAAE,6BAA6B;IAI/D;;;;;;OAMG;WACU,eAAe,CAAC,IAAI,GAAE,kBAAuB,GAAG,OAAO,CAAC,uBAAuB,CAAC;IAS7F,wDAAwD;IAClD,KAAK,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,WAAW,CAAC;IAqBzC,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAIvE,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAIjE,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,GAAG,OAAO,CAAC,MAAM,CAAC;CAGvF"}
1
+ {"version":3,"file":"classifier.d.ts","sourceRoot":"","sources":["../classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,EAIN,WAAW,EACX,KAAK,WAAW,EAChB,KAAK,YAAY,EAEjB,MAAM,yBAAyB,CAAA;AAChC,OAAO,EAA0B,KAAK,cAAc,EAAE,MAAM,gBAAgB,CAAA;AAE5E,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAA;AAGnD,OAAO,EAA0C,KAAK,cAAc,EAAE,MAAM,wBAAwB,CAAA;AACpG,OAAO,EAAuC,KAAK,yBAAyB,EAAE,MAAM,8BAA8B,CAAA;AAClH,OAAO,EAAE,kBAAkB,EAAE,MAAM,gBAAgB,CAAA;AACnD,OAAO,EAAuB,KAAK,YAAY,EAAE,MAAM,uBAAuB,CAAA;AAE9E,OAAO,KAAK,EAAE,kBAAkB,EAAmB,MAAM,cAAc,CAAA;AAEvE;;;;GAIG;AACH,MAAM,WAAW,YAAY;IAC5B,KAAK,CACJ,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,GAC5F,OAAO,CAAC,WAAW,CAAC,CAAA;CACvB;AAED,MAAM,WAAW,6BAA6B;IAC7C,SAAS,EAAE,kBAAkB,CAAA;IAC7B,MAAM,EAAE,YAAY,CAAA;IACpB;;;;OAIG;IACH,MAAM,CAAC,EAAE,SAAS,MAAM,EAAE,CAAA;IAC1B;;;;;;;OAOG;IACH,MAAM,CAAC,EAAE,SAAS,GAAG,QAAQ,CAAA;IAC7B;;;;OAIG;IACH,WAAW,CAAC,EAAE,MAAM,EAAE,EAAE,CAAA;IACxB,sEAAsE;IACtE,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAA;IAC3B,oEAAoE;IACpE,cAAc,CAAC,EAAE,MAAM,EAAE,CAAA;IACzB;;;;;OAKG;IACH,oBAAoB,CAAC,EAAE,YAAY,CAAA;CACnC;AAED,qBAAa,uBAAuB;IAOvB,OAAO,CAAC,QAAQ,CAAC,GAAG;IANhC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAsB;IACjD,OAAO,CAAC,QAAQ,CAAC,WAAW,CAAY;IACxC,OAAO,CAAC,QAAQ,CAAC,gBAAgB,CAAU;IAC3C,OAAO,CAAC,QAAQ,CAAC,cAAc,CAAU;gBAEZ,GAAG,EAAE,6BAA6B;IAa/D;;;;;;;;;;;OAWG;WACU,eAAe,CAC3B,IAAI,GAAE,kBAAkB,GAAG;QAAE,oBAAoB,CAAC,EAAE,YAAY,CAAA;KAAO,GACrE,OAAO,CAAC,uBAAuB,CAAC;IA4BnC,6DAA6D;IACvD,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;IA4EjE;;;OAGG;IACG,eAAe,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,qBAAqB,CAAC;IA0E/E,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAIzF,WAAW,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,YAAY,EAAE,MAAM,CAAC,CAAC,CAAC;IAInF,QAAQ,CAAC,IAAI,EAAE,
MAAM,EAAE,IAAI,CAAC,EAAE,SAAS,GAAG;QAAE,GAAG,CAAC,EAAE,UAAU,CAAC,OAAO,WAAW,CAAC,CAAC,CAAC,CAAC,CAAA;KAAE,GAAG,OAAO,CAAC,MAAM,CAAC;IAI7G;;;;;;;;;;OAUG;IACH,OAAO,CAAC,mBAAmB;CAW3B;AAED,wFAAwF;AACxF,MAAM,WAAW,qBAAqB;IACrC,IAAI,EAAE,WAAW,CAAA;IACjB,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,MAAM,EAAE,KAAK,CAAC;QAAE,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAC7C;AAED;;;GAGG;AACH,MAAM,WAAW,SAAS;IACzB;;;;OAIG;IACH,UAAU,CAAC,EAAE,cAAc,CAAA;IAC3B;;;OAGG;IACH,mBAAmB,CAAC,EAAE,MAAM,CAAA;IAC5B;;;OAGG;IACH,GAAG,CAAC,EAAE,cAAc,CAAA;IACpB,6DAA6D;IAC7D,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB;;;;;;OAMG;IACH,mBAAmB,CAAC,EAAE,cAAc,CAAA;IACpC,yDAAyD;IACzD,uBAAuB,CAAC,EAAE,yBAAyB,CAAA;IACnD;;;;;;OAMG;IACH,cAAc,CAAC,EAAE,OAAO,CAAA;IACxB;;;;;;OAMG;IACH,UAAU,CAAC,EAAE,OAAO,CAAA;CACpB"}
package/out/classifier.js CHANGED
@@ -10,16 +10,35 @@
10
10
  * Convenience wrappers `parseJson` / `parseTuples` / `parseXml` project the tree on the way out.
11
11
  */
12
12
  import { buildAddressTree, decodeAsJson, decodeAsTuples, decodeAsXml, } from "@mailwoman/core/decoder";
13
- import { STAGE1_BIO_LABELS } from "./labels.js";
14
- import { OnnxRunner } from "./onnx-runner.js";
13
+ import { buildFstEmissionPriors } from "./fst-prior.js";
14
+ import { STAGE2_BIO_LABELS } from "./labels.js";
15
+ import { repairPostcodeLabels } from "./postcode-repair.js";
16
+ import { repairUnitLabels } from "./unit-repair.js";
17
+ import { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
18
+ import { buildStreetMorphologyEmissionPriors } from "./street-morphology-prior.js";
15
19
  import { MailwomanTokenizer } from "./tokenizer.js";
16
- import { resolveWeights } from "./weights.js";
20
+ import { buildAnchorFeatures } from "./anchor-inference.js";
21
+ import { buildBioEndMask, buildBioStartMask, buildBioTransitionMask, softmax, viterbi } from "./viterbi.js";
17
22
  export class NeuralAddressClassifier {
18
23
  cfg;
19
24
  labels;
25
+ decodeMode;
26
+ transitions;
27
+ startTransitions;
28
+ endTransitions;
20
29
  constructor(cfg) {
21
30
  this.cfg = cfg;
22
- this.labels = cfg.labels ?? STAGE1_BIO_LABELS;
31
+ this.labels = cfg.labels ?? STAGE2_BIO_LABELS;
32
+ this.decodeMode = cfg.decode ?? "viterbi";
33
+ const structural = buildBioTransitionMask(this.labels);
34
+ if (cfg.transitions) {
35
+ this.transitions = addMatrices(structural, cfg.transitions);
36
+ }
37
+ else {
38
+ this.transitions = structural;
39
+ }
40
+ this.startTransitions = cfg.startTransitions ?? buildBioStartMask(this.labels);
41
+ this.endTransitions = cfg.endTransitions ?? buildBioEndMask(this.labels);
23
42
  }
24
43
  /**
25
44
  * One-call factory that resolves the weights package (or explicit paths), loads the tokenizer and
@@ -27,42 +46,176 @@ export class NeuralAddressClassifier {
27
46
  *
28
47
  * Resolution order: explicit paths in `opts` → `@mailwoman/neural-weights-<locale>` package →
29
48
  * throws a single actionable error.
49
+ *
50
+ * **Node-only.** The dynamic imports keep `OnnxRunner` (onnxruntime-node) + `resolveWeights`
51
+ * (uses Node fs) out of the static dependency graph, so this file can be bundled for the browser
52
+ * by `@mailwoman/neural-web`. Calling this method in a browser will throw at runtime — use
53
+ * `loadNeuralClassifierFromUrls` from `@mailwoman/neural-web` instead.
30
54
  */
31
55
  static async loadFromWeights(opts = {}) {
32
- const { modelPath, tokenizerPath } = resolveWeights(opts);
56
+ // /* webpackIgnore: true */ tells webpack to leave the dynamic import statement intact —
57
+ // it becomes a runtime native ESM import that resolves in Node (which has onnxruntime-node
58
+ // + node:fs) and throws cleanly in a browser if called. Without the directive, webpack
59
+ // pulls onnx-runner / weights into the browser chunk graph + then chokes on the Node-only
60
+ // builtins they reference.
61
+ const [{ OnnxRunner }, { resolveWeights, readLabelsFromModelCard, readCrfTransitions }] = await Promise.all([
62
+ import(/* webpackIgnore: true */ "./onnx-runner.js"),
63
+ import(/* webpackIgnore: true */ "./weights.js"),
64
+ ]);
65
+ const resolved = resolveWeights(opts);
66
+ const labels = readLabelsFromModelCard(resolved.modelCardPath);
67
+ const crf = readCrfTransitions(resolved.crfTransitionsPath);
33
68
  const [tokenizer, runner] = await Promise.all([
34
- MailwomanTokenizer.loadFromFile(tokenizerPath),
35
- OnnxRunner.create(modelPath),
69
+ MailwomanTokenizer.loadFromFile(resolved.tokenizerPath),
70
+ OnnxRunner.create(resolved.modelPath),
36
71
  ]);
37
- return new NeuralAddressClassifier({ tokenizer, runner });
72
+ return new NeuralAddressClassifier({
73
+ tokenizer,
74
+ runner,
75
+ labels,
76
+ transitions: crf?.transitions,
77
+ startTransitions: crf?.startTransitions,
78
+ endTransitions: crf?.endTransitions,
79
+ ...(opts.postcodeAnchorLookup ? { postcodeAnchorLookup: opts.postcodeAnchorLookup } : {}),
80
+ });
38
81
  }
39
- /** Tokenize → infer → argmax/softmax → decoder tree. */
40
- async parse(text) {
82
+ /** Tokenize → infer → Viterbi (or argmax) → decoder tree. */
83
+ async parse(text, opts) {
41
84
  if (text.length === 0)
42
85
  return { raw: text, roots: [] };
43
86
  const { pieces, ids } = this.cfg.tokenizer.encode(text);
44
- const { logits } = await this.cfg.runner.infer(ids);
45
- const tokens = pieces.map((p, i) => {
46
- const row = logits[i];
47
- const { idx, conf } = argmaxSoftmax(row);
87
+ // Postcode-anchor channel (#239/#240): build per-piece anchor features from the same lookup the
88
+ // model trained on, fed alongside the ids. No-op when no lookup is configured.
89
+ const anchor = this.cfg.postcodeAnchorLookup
90
+ ? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
91
+ : undefined;
92
+ const { logits } = await this.cfg.runner.infer(ids, anchor);
93
+ this.assertEmissionWidth(logits);
94
+ let emissions = opts?.queryShape
95
+ ? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
96
+ biasScale: opts.queryShapeBiasScale ?? 1.0,
97
+ inputText: text,
98
+ }))
99
+ : logits;
100
+ if (opts?.fst) {
101
+ emissions = addEmissionMatrix(emissions, buildFstEmissionPriors(opts.fst, pieces, this.labels, {
102
+ biasScale: opts.fstBiasScale ?? 1.0,
103
+ }));
104
+ }
105
+ if (opts?.fstStreetMorphology) {
106
+ emissions = addEmissionMatrix(emissions, buildStreetMorphologyEmissionPriors(opts.fstStreetMorphology, pieces, this.labels, opts.fstStreetMorphologyOpts ?? {}));
107
+ }
108
+ const labelIndices = this.decodeMode === "viterbi"
109
+ ? viterbi({
110
+ emissions,
111
+ transitions: this.transitions,
112
+ startTransitions: this.startTransitions,
113
+ endTransitions: this.endTransitions,
114
+ }).path
115
+ : emissions.map((row) => argmaxSoftmax(row).idx);
116
+ let tokens = pieces.map((p, i) => {
117
+ const idx = labelIndices[i];
118
+ const probs = softmax(logits[i]);
48
119
  return {
49
120
  piece: p.piece,
50
121
  start: p.start,
51
122
  end: p.end,
52
123
  label: (this.labels[idx] ?? "O"),
53
- confidence: conf,
124
+ confidence: probs[idx],
54
125
  };
55
126
  });
127
+ if (opts?.postcodeRepair) {
128
+ tokens = repairPostcodeLabels(text, tokens).tokens;
129
+ }
130
+ if (opts?.unitRepair) {
131
+ tokens = repairUnitLabels(text, tokens).tokens;
132
+ }
56
133
  return buildAddressTree(text, tokens);
57
134
  }
58
- async parseJson(text) {
59
- return decodeAsJson(await this.parse(text));
135
+ /**
136
+ * Like `parse`, but also returns the raw per-token logits and piece offsets needed for per-span
137
+ * logit aggregation (Option C joint-reconcile integration).
138
+ */
139
+ async parseWithLogits(text, opts) {
140
+ if (text.length === 0) {
141
+ return { tree: { raw: text, roots: [] }, logits: [], pieces: [] };
142
+ }
143
+ const { pieces, ids } = this.cfg.tokenizer.encode(text);
144
+ // Postcode-anchor channel (#239/#240): build per-piece anchor features from the same lookup the
145
+ // model trained on, fed alongside the ids. No-op when no lookup is configured.
146
+ const anchor = this.cfg.postcodeAnchorLookup
147
+ ? buildAnchorFeatures(text, pieces, this.cfg.postcodeAnchorLookup)
148
+ : undefined;
149
+ const { logits } = await this.cfg.runner.infer(ids, anchor);
150
+ this.assertEmissionWidth(logits);
151
+ let emissions = opts?.queryShape
152
+ ? addEmissionMatrix(logits, buildEmissionPriors(opts.queryShape, pieces, this.labels, {
153
+ biasScale: opts.queryShapeBiasScale ?? 1.0,
154
+ inputText: text,
155
+ }))
156
+ : logits;
157
+ if (opts?.fst) {
158
+ emissions = addEmissionMatrix(emissions, buildFstEmissionPriors(opts.fst, pieces, this.labels, {
159
+ biasScale: opts.fstBiasScale ?? 1.0,
160
+ }));
161
+ }
162
+ if (opts?.fstStreetMorphology) {
163
+ emissions = addEmissionMatrix(emissions, buildStreetMorphologyEmissionPriors(opts.fstStreetMorphology, pieces, this.labels, opts.fstStreetMorphologyOpts ?? {}));
164
+ }
165
+ const labelIndices = this.decodeMode === "viterbi"
166
+ ? viterbi({
167
+ emissions,
168
+ transitions: this.transitions,
169
+ startTransitions: this.startTransitions,
170
+ endTransitions: this.endTransitions,
171
+ }).path
172
+ : emissions.map((row) => argmaxSoftmax(row).idx);
173
+ const tokens = pieces.map((p, i) => {
174
+ const idx = labelIndices[i];
175
+ const probs = softmax(logits[i]);
176
+ return {
177
+ piece: p.piece,
178
+ start: p.start,
179
+ end: p.end,
180
+ label: (this.labels[idx] ?? "O"),
181
+ confidence: probs[idx],
182
+ };
183
+ });
184
+ return {
185
+ tree: buildAddressTree(text, tokens),
186
+ logits,
187
+ pieces: pieces.map((p) => ({ start: p.start, end: p.end })),
188
+ };
189
+ }
190
+ async parseJson(text, opts) {
191
+ return decodeAsJson(await this.parse(text, opts));
60
192
  }
61
- async parseTuples(text) {
62
- return decodeAsTuples(await this.parse(text));
193
+ async parseTuples(text, opts) {
194
+ return decodeAsTuples(await this.parse(text, opts));
63
195
  }
64
196
  async parseXml(text, opts) {
65
- return decodeAsXml(await this.parse(text), opts);
197
+ return decodeAsXml(await this.parse(text, opts), opts?.xml);
198
+ }
199
+ /**
200
+ * Guard against a silent label/emission shape overrun. When the model emits MORE logits per token
201
+ * than the configured label vocabulary (e.g. a Stage 3 bundle loaded with the default Stage 2
202
+ * labels), viterbi indexes past the transition matrix and dies with an opaque `Cannot read
203
+ * properties of undefined (reading '0')`. Fail fast here with a message that names the contract
204
+ * the caller violated.
205
+ *
206
+ * The opposite shape (model narrower than labels) is intentionally permitted — STAGE2_BIO_LABELS
207
+ * prefix-extends STAGE1_BIO_LABELS so a Stage 1 model loaded with Stage 2 labels decodes
208
+ * correctly via the first 15 logits. See labels.ts for the contract.
209
+ */
210
+ assertEmissionWidth(logits) {
211
+ if (logits.length === 0)
212
+ return;
213
+ const width = logits[0].length;
214
+ if (width > this.labels.length) {
215
+ throw new Error(`Label/emission mismatch: model emits ${width} logits per token but the classifier was ` +
216
+ `configured with only ${this.labels.length} labels. Did you load a Stage 3 bundle without ` +
217
+ `passing its model-card labels? See loadFromWeights / loadNeuralClassifierFromUrls.`);
218
+ }
66
219
  }
67
220
  }
68
221
  function argmaxSoftmax(row) {
@@ -80,4 +233,16 @@ function argmaxSoftmax(row) {
80
233
  const conf = 1 / sumExp;
81
234
  return { idx: maxIdx, conf };
82
235
  }
236
+ /** Element-wise add two square matrices. Used to compose the structural mask + learned transitions. */
237
+ function addMatrices(a, b) {
238
+ const n = a.length;
239
+ const out = [];
240
+ for (let i = 0; i < n; i++) {
241
+ const row = new Array(n);
242
+ for (let j = 0; j < n; j++)
243
+ row[j] = a[i][j] + b[i][j];
244
+ out.push(row);
245
+ }
246
+ return out;
247
+ }
83
248
  //# sourceMappingURL=classifier.js.map