@mailwoman/neural 2.2.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/anchor-inference.d.ts +58 -0
- package/out/anchor-inference.d.ts.map +1 -0
- package/out/anchor-inference.js +95 -0
- package/out/anchor-inference.js.map +1 -0
- package/out/browser.d.ts +2 -0
- package/out/browser.d.ts.map +1 -1
- package/out/browser.js +4 -0
- package/out/browser.js.map +1 -1
- package/out/classifier.d.ts +70 -3
- package/out/classifier.d.ts.map +1 -1
- package/out/classifier.js +80 -19
- package/out/classifier.js.map +1 -1
- package/out/fst-prior.d.ts +71 -0
- package/out/fst-prior.d.ts.map +1 -0
- package/out/fst-prior.js +173 -0
- package/out/fst-prior.js.map +1 -0
- package/out/index.d.ts +3 -0
- package/out/index.d.ts.map +1 -1
- package/out/index.js +3 -0
- package/out/index.js.map +1 -1
- package/out/labels.d.ts +3 -0
- package/out/labels.d.ts.map +1 -1
- package/out/labels.js +13 -0
- package/out/labels.js.map +1 -1
- package/out/onnx-runner.d.ts +8 -1
- package/out/onnx-runner.d.ts.map +1 -1
- package/out/onnx-runner.js +31 -1
- package/out/onnx-runner.js.map +1 -1
- package/out/postcode-anchor.d.ts +117 -0
- package/out/postcode-anchor.d.ts.map +1 -0
- package/out/postcode-anchor.js +269 -0
- package/out/postcode-anchor.js.map +1 -0
- package/out/postcode-binary-resolver.d.ts +60 -0
- package/out/postcode-binary-resolver.d.ts.map +1 -0
- package/out/postcode-binary-resolver.js +208 -0
- package/out/postcode-binary-resolver.js.map +1 -0
- package/out/postcode-repair.d.ts +65 -0
- package/out/postcode-repair.d.ts.map +1 -0
- package/out/postcode-repair.js +171 -0
- package/out/postcode-repair.js.map +1 -0
- package/out/proposal-classifier.d.ts.map +1 -1
- package/out/proposal-classifier.js +3 -1
- package/out/proposal-classifier.js.map +1 -1
- package/out/query-shape-prior.d.ts +12 -0
- package/out/query-shape-prior.d.ts.map +1 -1
- package/out/query-shape-prior.js +132 -2
- package/out/query-shape-prior.js.map +1 -1
- package/out/street-morphology-prior.d.ts +56 -0
- package/out/street-morphology-prior.d.ts.map +1 -0
- package/out/street-morphology-prior.js +159 -0
- package/out/street-morphology-prior.js.map +1 -0
- package/out/unit-repair.d.ts +42 -0
- package/out/unit-repair.d.ts.map +1 -0
- package/out/unit-repair.js +142 -0
- package/out/unit-repair.js.map +1 -0
- package/out/vitest.config.d.ts.map +1 -1
- package/out/vitest.config.js +3 -0
- package/out/vitest.config.js.map +1 -1
- package/out/weights.d.ts +27 -3
- package/out/weights.d.ts.map +1 -1
- package/out/weights.js +46 -2
- package/out/weights.js.map +1 -1
- package/package.json +6 -2
package/out/fst-prior.js
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Soft-prior emission biases derived from the FST gazetteer. When the FST finds that a token
|
|
7
|
+
* sequence matches a known place name (e.g., "New York" → locality + region), this module
|
|
8
|
+
* produces additive biases that nudge the Viterbi decoder toward the matching BIO labels.
|
|
9
|
+
*
|
|
10
|
+
* Composes with the QueryShape prior via addEmissionMatrix — same integration point, same additive
|
|
11
|
+
* semantics.
|
|
12
|
+
*
|
|
13
|
+
* SentencePiece ↔ FST bridge: SentencePiece pieces are grouped into whitespace words (by the ▁
|
|
14
|
+
* sentinel), normalized through the same pipeline as FST edges (NFKC, lowercase, strip
|
|
15
|
+
 * punctuation and symbols), and walked through the FST as contiguous subpaths.
|
|
16
|
+
*
|
|
17
|
+
* Uses structural typing for the FST input so this module has zero dependencies on
|
|
18
|
+
* `@mailwoman/resolver-wof-sqlite` — consumers pass an FstMatcher instance, but this file only
|
|
19
|
+
* consumes the shape.
|
|
20
|
+
*/
|
|
21
|
+
/** Marker character that a piece starts with when it opens a new whitespace-delimited word. */
const SPACE_SENTINEL = "▁";

// ---------------------------------------------------------------------------
// Placetype → BIO label mapping
// ---------------------------------------------------------------------------

/**
 * Gazetteer placetype → BIO tag stem (the part after `B-` / `I-`). Placetypes that are not keys
 * of this map produce no bias.
 */
const PLACETYPE_TO_BIO = new Map(
    Object.entries({
        country: "country",
        region: "region",
        locality: "locality",
        postalcode: "postcode",
    })
);

/**
 * Labels whose emission score is pushed down on every piece covered by a gazetteer place match.
 *
 * NOTE(review): `B-venue` is listed but `I-venue` is not — confirm the asymmetry is intended.
 */
const SUPPRESS_WHEN_PLACE = [
    "B-street",
    "I-street",
    "B-house_number",
    "I-house_number",
    "B-venue",
];
|
|
32
|
+
/**
 * Produce a `[seqLen][numLabels]` additive bias matrix from FST gazetteer hits.
 *
 * The pieces are first regrouped into whitespace words. Every word with a non-empty FST token is
 * tried as a span start: the FST is walked from that word and then extended one word at a time
 * until the FST reports no transition. Each accepting state reached along the way is handed to
 * `applyBias` together with the word groups the span covers. Punctuation-only groups (empty
 * `fstToken`) never start a span, are stepped over while a span is being extended, and are left
 * out of the groups that receive bias.
 *
 * One set of seen WOF ids is shared by the whole call, so an entry's id contributes at most once
 * per invocation.
 *
 * @param fst Structural FST matcher; this function calls `walk(tokens)`, `walkFrom(state, token)`
 *   and `accepting(stateId)` on it.
 * @param pieces SentencePiece pieces (objects carrying a `piece` string); one matrix row each.
 * @param labels Label names; their order is the column order of the returned matrix.
 * @param opts Optional tuning knobs: `biasScale` (default 1.0), `maxBias` (default 3.0) and
 *   `suppressionScale` (default 1.5).
 * @returns A freshly allocated matrix with `pieces.length` rows of `labels.length` numbers,
 *   all zero where no gazetteer match applied.
 */
export function buildFstEmissionPriors(fst, pieces, labels, opts = {}) {
    const biasScale = opts.biasScale ?? 1.0;
    const maxBias = opts.maxBias ?? 3.0;
    const suppressionScale = opts.suppressionScale ?? 1.5;
    const seenWOFIDs = new Set();

    // Zero-initialised rows, one per piece.
    const matrix = Array.from({ length: pieces.length }, () => new Array(labels.length).fill(0));

    // Label name → column index.
    const labelToCol = new Map();
    for (const [col, label] of labels.entries()) {
        labelToCol.set(label, col);
    }

    const words = groupPiecesIntoWords(pieces);

    // Shared call site: look up the accepting entries for a state and bias the covered groups.
    const biasSpan = (stateId, spanGroups) =>
        applyBias(matrix, labelToCol, fst.accepting(stateId), spanGroups, biasScale, maxBias, suppressionScale, seenWOFIDs);

    words.forEach((head, start) => {
        if (head.fstToken === "") {
            return;
        }
        let state = fst.walk([head.fstToken]);
        if (!state) {
            return;
        }
        if (state.accepted) {
            biasSpan(state.stateId, [head]);
        }
        // Grow the span to the right for as long as the FST keeps accepting transitions.
        for (let end = start + 1; end < words.length; end++) {
            const token = words[end].fstToken;
            if (token === "") {
                continue;
            }
            const advanced = fst.walkFrom(state, token);
            if (!advanced) {
                break;
            }
            if (advanced.accepted) {
                const covered = words.slice(start, end + 1).filter((w) => w.fstToken !== "");
                biasSpan(advanced.stateId, covered);
            }
            state = advanced;
        }
    });

    return matrix;
}
|
|
81
|
+
/**
 * Group SentencePiece pieces into whitespace-delimited words.
 *
 * Rules, applied piece by piece:
 *
 * - A piece with no letter or digit (`\p{L}` / `\p{N}`) closes the open word and is emitted as its
 *   own group with an empty `fstToken`, so callers can recognise and skip it.
 * - A piece that begins with the ▁ sentinel opens a new word; the sentinel is not part of the
 *   word's text.
 * - Any other piece is appended to the open word. If no word is open (the very first piece, or a
 *   piece that directly follows a punctuation-only piece, e.g. `Denis` in `▁Saint` `-` `Denis`),
 *   it opens a new word instead of being discarded, so every piece index ends up in exactly one
 *   group.
 *
 * Once grouping is done, every non-empty `fstToken` is passed through `normalizeFstToken`.
 *
 * Exported so other priors can reuse the same grouping/normalization pipeline. Internal helper
 * signature; not part of the public neural API.
 *
 * @param pieces Pieces in sequence order; each must expose a `piece` string.
 * @returns Groups in sequence order, each `{ fstToken, pieceIndices }` where `pieceIndices` are
 *   indices into `pieces`.
 */
export function groupPiecesIntoWords(pieces) {
    const alnum = /[\p{L}\p{N}]/u;
    const groups = [];
    let current = null;
    for (let i = 0; i < pieces.length; i++) {
        const text = pieces[i].piece;
        if (!alnum.test(text)) {
            // Punctuation-only piece: close the open word and record a placeholder group.
            if (current)
                groups.push(current);
            groups.push({ fstToken: "", pieceIndices: [i] });
            current = null;
            continue;
        }
        const startsWord = text.startsWith(SPACE_SENTINEL);
        if (startsWord || current === null) {
            // `current === null` also covers a continuation piece right after punctuation, which
            // would otherwise have no word to attach to and would be lost.
            if (current)
                groups.push(current);
            const literal = startsWord ? text.slice(SPACE_SENTINEL.length) : text;
            current = { fstToken: literal, pieceIndices: [i] };
        }
        else {
            current.pieceIndices.push(i);
            current.fstToken += text;
        }
    }
    if (current)
        groups.push(current);
    for (const g of groups) {
        if (g.fstToken !== "") {
            g.fstToken = normalizeFstToken(g.fstToken);
        }
    }
    return groups;
}
|
|
123
|
+
/**
 * Normalize a word for FST lookup: apply Unicode NFKC, lowercase, then remove every Unicode
 * punctuation (`\p{P}`) and symbol (`\p{S}`) character. Whitespace and all other characters are
 * kept as-is.
 *
 * @param s Raw word text.
 * @returns The normalized token; the empty string when nothing survives the stripping.
 */
function normalizeFstToken(s) {
    return s
        .normalize("NFKC")
        .toLowerCase()
        .replace(/[\p{P}\p{S}]/gu, "");
}
|
|
130
|
+
/**
 * Write the bias for one gazetteer match into `matrix`.
 *
 * Steps:
 *
 * 1. Walk `entries`, skipping any whose `wofID` is already in `seenWOFIDs` and recording the rest
 *    there. For entries whose placetype maps to a BIO tag, keep the largest
 *    `importance * biasScale * maxBias` per tag (only values above 0 are kept).
 * 2. If no tag survived, leave the matrix untouched.
 * 3. For each surviving tag, raise the `B-<tag>` column on the first covered piece and the
 *    `I-<tag>` column (falling back to `B-<tag>` when the I label is unknown) on the remaining
 *    pieces to at least the tag's bias. Tags without a `B-` column are skipped.
 * 4. When `suppressionScale` is positive, lower every `SUPPRESS_WHEN_PLACE` column present in
 *    `labelToCol` to at most `-suppressionScale` on all covered pieces.
 *
 * @param matrix Bias matrix, mutated in place; indexed `[pieceIndex][labelColumn]`.
 * @param labelToCol Label name → column index.
 * @param entries Accepting entries for the match; `wofID`, `placetype` and `importance` are read.
 * @param groups Word groups covered by the match; their `pieceIndices` select the rows.
 * @param biasScale Multiplier applied to each entry's importance.
 * @param maxBias Second multiplier applied to each entry's importance.
 * @param suppressionScale Magnitude of the negative bias for suppressed labels; `<= 0` disables it.
 * @param seenWOFIDs Set of WOF ids already consumed; mutated in place.
 */
function applyBias(matrix, labelToCol, entries, groups, biasScale, maxBias, suppressionScale, seenWOFIDs) {
    // Strongest candidate bias per BIO tag among the entries not consumed before.
    const strongest = new Map();
    for (const entry of entries) {
        if (seenWOFIDs.has(entry.wofID)) {
            continue;
        }
        seenWOFIDs.add(entry.wofID);
        const tag = PLACETYPE_TO_BIO.get(entry.placetype);
        if (!tag) {
            continue;
        }
        const candidate = entry.importance * biasScale * maxBias;
        if (candidate > (strongest.get(tag) ?? 0)) {
            strongest.set(tag, candidate);
        }
    }
    if (strongest.size === 0) {
        return;
    }

    // Matrix rows touched by this match, in sequence order.
    const rows = groups.flatMap((g) => g.pieceIndices);

    strongest.forEach((bias, tag) => {
        const beginCol = labelToCol.get(`B-${tag}`);
        if (beginCol === undefined) {
            return;
        }
        const insideCol = labelToCol.get(`I-${tag}`) ?? beginCol;
        rows.forEach((row, k) => {
            const col = k === 0 ? beginCol : insideCol;
            matrix[row][col] = Math.max(matrix[row][col], bias);
        });
    });

    if (!(suppressionScale > 0)) {
        return;
    }
    // Resolve the suppressed labels to columns once; labels missing from the label set are ignored.
    const suppressedCols = SUPPRESS_WHEN_PLACE
        .map((label) => labelToCol.get(label))
        .filter((col) => col !== undefined);
    for (const row of rows) {
        for (const col of suppressedCols) {
            matrix[row][col] = Math.min(matrix[row][col], -suppressionScale);
        }
    }
}
|
|
173
|
+
//# sourceMappingURL=fst-prior.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fst-prior.js","sourceRoot":"","sources":["../fst-prior.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAIH,MAAM,cAAc,GAAG,GAAG,CAAA;AAwB1B,8EAA8E;AAC9E,gCAAgC;AAChC,8EAA8E;AAE9E,MAAM,gBAAgB,GAAgC,IAAI,GAAG,CAAC;IAC7D,CAAC,SAAS,EAAE,SAAS,CAAC;IACtB,CAAC,QAAQ,EAAE,QAAQ,CAAC;IACpB,CAAC,UAAU,EAAE,UAAU,CAAC;IACxB,CAAC,YAAY,EAAE,UAAU,CAAC;CAC1B,CAAC,CAAA;AAWF,MAAM,mBAAmB,GAAsB,CAAC,UAAU,EAAE,UAAU,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,SAAS,CAAC,CAAA;AAYtH;;;;;GAKG;AACH,MAAM,UAAU,sBAAsB,CACrC,GAAmB,EACnB,MAAoD,EACpD,MAA6B,EAC7B,OAAqB,EAAE;IAEvB,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,CAAC,GAAG,MAAM,CAAC,MAAM,CAAA;IACvB,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,GAAG,CAAA;IACvC,MAAM,UAAU,GAAG,IAAI,GAAG,EAAU,CAAA;IACpC,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,GAAG,CAAA;IACnC,MAAM,gBAAgB,GAAG,IAAI,CAAC,gBAAgB,IAAI,GAAG,CAAA;IACrD,MAAM,MAAM,GAAe,EAAE,CAAA;IAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,CAAC,EAAE;QAAE,MAAM,CAAC,IAAI,CAAC,IAAI,KAAK,CAAS,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;IAErE,MAAM,UAAU,GAAG,IAAI,GAAG,EAAkB,CAAA;IAC5C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE;QAAE,UAAU,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC,CAAA;IAErE,MAAM,UAAU,GAAG,oBAAoB,CAAC,MAAM,CAAC,CAAA;IAC/C,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,MAAM,CAAA;IAE1C,KAAK,IAAI,KAAK,GAAG,CAAC,EAAE,KAAK,GAAG,UAAU,CAAC,MAAM,EAAE,KAAK,EAAE,EAAE,CAAC;QACxD,MAAM,KAAK,GAAG,UAAU,CAAC,KAAK,CAAE,CAAA;QAChC,IAAI,KAAK,CAAC,QAAQ,KAAK,EAAE;YAAE,SAAQ;QAEnC,MAAM,KAAK,GAAG,GAAG,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAA;QACxC,IAAI,CAAC,KAAK;YAAE,SAAQ;QAEpB,IAAI,KAAK,CAAC,QAAQ,EAAE,CAAC;YACpB,SAAS,CACR,MAAM,EACN,UAAU,EACV,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,OAAO,CAAC,EAC5B,CAAC,KAAK,CAAC,EACP,SAAS,EACT,OAAO,EACP,gBAAgB,EAChB,UAAU,CACV,CAAA;QACF,CAAC;QAED,IAAI,OAAO,GAAG,KAAK,CAAA;QACnB,KAAK,IAAI,GAAG,GAAG,KAAK,GAAG,CAAC,EAAE,GAAG,GAAG,UAAU,CAAC,MAAM,EAAE,GAAG,EAAE,EAAE,CAAC;YAC1D,MAAM,SAAS,GAAG,UAAU,CAAC,GAAG,CAAE,CAAA;YAClC,IAAI,SAAS,CAAC,QAAQ,KAAK,EAAE;gBA
AE,SAAQ;YAEvC,MAAM,IAAI,GAAG,GAAG,CAAC,QAAQ,CAAC,OAAO,EAAE,SAAS,CAAC,QAAQ,CAAC,CAAA;YACtD,IAAI,CAAC,IAAI;gBAAE,MAAK;YAEhB,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACnB,MAAM,aAAa,GAAG,UAAU,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,KAAK,EAAE,CAAC,CAAA;gBACvF,SAAS,CACR,MAAM,EACN,UAAU,EACV,GAAG,CAAC,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC,EAC3B,aAAa,EACb,SAAS,EACT,OAAO,EACP,gBAAgB,EAChB,UAAU,CACV,CAAA;YACF,CAAC;YAED,OAAO,GAAG,IAAI,CAAA;QACf,CAAC;IACF,CAAC;IAED,OAAO,MAAM,CAAA;AACd,CAAC;AAED;;;;;;;;GAQG;AACH,MAAM,UAAU,oBAAoB,CAAC,MAAwC;IAC5E,MAAM,MAAM,GAAgB,EAAE,CAAA;IAC9B,IAAI,OAAO,GAAqB,IAAI,CAAA;IAEpC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,MAAM,CAAC,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;QACpB,MAAM,QAAQ,GAAG,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAA;QAE9C,IAAI,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,QAAQ,EAAE,CAAC;YAChE,IAAI,OAAO;gBAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;YACjC,IAAI,CAAC,QAAQ,EAAE,CAAC;gBACf,MAAM,CAAC,IAAI,CAAC,EAAE,QAAQ,EAAE,EAAE,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,CAAA;gBAChD,OAAO,GAAG,IAAI,CAAA;gBACd,SAAQ;YACT,CAAC;YACD,MAAM,OAAO,GAAG,CAAC,CAAC,KAAK,CAAC,UAAU,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,KAAK,CAAA;YACnG,OAAO,GAAG,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAA;QACnD,CAAC;aAAM,CAAC;YACP,IAAI,OAAO,EAAE,CAAC;gBACb,OAAO,CAAC,YAAY,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;gBAC5B,OAAO,CAAC,QAAQ,IAAI,CAAC,CAAC,KAAK,CAAA;YAC5B,CAAC;QACF,CAAC;IACF,CAAC;IACD,IAAI,OAAO;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;IAEjC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;QACxB,IAAI,CAAC,CAAC,QAAQ,KAAK,EAAE,EAAE,CAAC;YACvB,CAAC,CAAC,QAAQ,GAAG,iBAAiB,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAA;QAC3C,CAAC;IACF,CAAC;IAED,OAAO,MAAM,CAAA;AACd,CAAC;AAED,SAAS,iBAAiB,CAAC,CAAS;IACnC,MAAM,OAAO,GAAG,CAAC;SACf,SAAS,CAAC,MAAM,CAAC;SACjB,WAAW,EAAE;SACb,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC,CAAA;IAC/B,OAAO,OAAO,CAAC,MAAM,GAAG,CAAC,C
AAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAA;AACzC,CAAC;AAED,SAAS,SAAS,CACjB,MAAkB,EAClB,UAA+B,EAC/B,OAAyC,EACzC,MAAmB,EACnB,SAAiB,EACjB,OAAe,EACf,gBAAwB,EACxB,UAAuB;IAEvB,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAkB,CAAA;IAE1C,KAAK,MAAM,KAAK,IAAI,OAAO,EAAE,CAAC;QAC7B,IAAI,UAAU,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC;YAAE,SAAQ;QACzC,UAAU,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;QAC3B,MAAM,MAAM,GAAG,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC,CAAA;QACpD,IAAI,CAAC,MAAM;YAAE,SAAQ;QACrB,MAAM,OAAO,GAAG,KAAK,CAAC,UAAU,GAAG,SAAS,GAAG,OAAO,CAAA;QACtD,MAAM,QAAQ,GAAG,QAAQ,CAAC,GAAG,CAAC,MAAM,CAAC,IAAI,CAAC,CAAA;QAC1C,IAAI,OAAO,GAAG,QAAQ;YAAE,QAAQ,CAAC,GAAG,CAAC,MAAM,EAAE,OAAO,CAAC,CAAA;IACtD,CAAC;IAED,IAAI,QAAQ,CAAC,IAAI,KAAK,CAAC;QAAE,OAAM;IAE/B,MAAM,eAAe,GAAa,EAAE,CAAA;IACpC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC5B,KAAK,MAAM,EAAE,IAAI,KAAK,CAAC,YAAY;YAAE,eAAe,CAAC,IAAI,CAAC,EAAE,CAAC,CAAA;IAC9D,CAAC;IAED,KAAK,MAAM,CAAC,MAAM,EAAE,IAAI,CAAC,IAAI,QAAQ,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,MAAM,EAAE,CAAC,CAAA;QAC1C,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,MAAM,EAAE,CAAC,CAAA;QAC1C,IAAI,IAAI,KAAK,SAAS;YAAE,SAAQ;QAEhC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,eAAe,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjD,MAAM,EAAE,GAAG,eAAe,CAAC,CAAC,CAAE,CAAA;YAC9B,MAAM,GAAG,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,IAAI,IAAI,CAAC,CAAA;YAC3C,MAAM,CAAC,EAAE,CAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAE,CAAC,GAAG,CAAE,EAAE,IAAI,CAAC,CAAA;QACrD,CAAC;IACF,CAAC;IAED,IAAI,gBAAgB,GAAG,CAAC,EAAE,CAAC;QAC1B,KAAK,MAAM,EAAE,IAAI,eAAe,EAAE,CAAC;YAClC,KAAK,MAAM,KAAK,IAAI,mBAAmB,EAAE,CAAC;gBACzC,MAAM,GAAG,GAAG,UAAU,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;gBACjC,IAAI,GAAG,KAAK,SAAS,EAAE,CAAC;oBACvB,MAAM,CAAC,EAAE,CAAE,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAE,CAAC,GAAG,CAAE,EAAE,CAAC,gBAAgB,CAAC,CAAA;gBAClE,CAAC;YACF,CAAC;QACF,CAAC;IACF,CAAC;AACF,CAAC"}
|
package/out/index.d.ts
CHANGED
|
@@ -3,9 +3,12 @@
|
|
|
3
3
|
* @license AGPL-3.0
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*/
|
|
6
|
+
export * from "./anchor-inference.js";
|
|
6
7
|
export * from "./classifier.js";
|
|
7
8
|
export * from "./labels.js";
|
|
8
9
|
export * from "./onnx-runner.js";
|
|
10
|
+
export * from "./postcode-anchor.js";
|
|
11
|
+
export * from "./postcode-binary-resolver.js";
|
|
9
12
|
export * from "./proposal-classifier.js";
|
|
10
13
|
export { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
|
|
11
14
|
export type { BuildPriorsOpts, KnownFormatHitLike, QueryShapeLike, TokenLike } from "./query-shape-prior.js";
|
package/out/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAC/E,YAAY,EAAE,eAAe,EAAE,kBAAkB,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAC5G,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AACrB,YAAY,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAA;AAC/D,cAAc,cAAc,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAC/E,YAAY,EAAE,eAAe,EAAE,kBAAkB,EAAE,cAAc,EAAE,SAAS,EAAE,MAAM,wBAAwB,CAAA;AAC5G,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AACrB,YAAY,EAAE,YAAY,EAAE,aAAa,EAAE,MAAM,cAAc,CAAA;AAC/D,cAAc,cAAc,CAAA"}
|
package/out/index.js
CHANGED
|
@@ -3,9 +3,12 @@
|
|
|
3
3
|
* @license AGPL-3.0
|
|
4
4
|
* @author Teffen Ellis, et al.
|
|
5
5
|
*/
|
|
6
|
+
export * from "./anchor-inference.js";
|
|
6
7
|
export * from "./classifier.js";
|
|
7
8
|
export * from "./labels.js";
|
|
8
9
|
export * from "./onnx-runner.js";
|
|
10
|
+
export * from "./postcode-anchor.js";
|
|
11
|
+
export * from "./postcode-binary-resolver.js";
|
|
9
12
|
export * from "./proposal-classifier.js";
|
|
10
13
|
export { addEmissionMatrix, buildEmissionPriors } from "./query-shape-prior.js";
|
|
11
14
|
export * from "./tokenizer.js";
|
package/out/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAE/E,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AAErB,cAAc,cAAc,CAAA"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,uBAAuB,CAAA;AACrC,cAAc,iBAAiB,CAAA;AAC/B,cAAc,aAAa,CAAA;AAC3B,cAAc,kBAAkB,CAAA;AAChC,cAAc,sBAAsB,CAAA;AACpC,cAAc,+BAA+B,CAAA;AAC7C,cAAc,0BAA0B,CAAA;AACxC,OAAO,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAA;AAE/E,cAAc,gBAAgB,CAAA;AAC9B,OAAO,EACN,eAAe,EACf,iBAAiB,EACjB,sBAAsB,EACtB,cAAc,EACd,OAAO,EACP,OAAO,GACP,MAAM,cAAc,CAAA;AAErB,cAAc,cAAc,CAAA"}
|
package/out/labels.d.ts
CHANGED
|
@@ -38,4 +38,7 @@ export declare const STAGE2_TAGS: readonly ["country", "region", "locality", "de
|
|
|
38
38
|
* never gets argmax'd because Stage 1 only emits 15 logits.
|
|
39
39
|
*/
|
|
40
40
|
export declare const STAGE2_BIO_LABELS: readonly BioLabel[];
|
|
41
|
+
export declare const STAGE3_FINE_TAGS: readonly ["street_prefix", "street_suffix", "unit", "po_box", "intersection_a", "intersection_b"];
|
|
42
|
+
export declare const STAGE3_TAGS: readonly ["country", "region", "locality", "dependent_locality", "postcode", "subregion", "cedex", "venue", "street", "house_number", "street_prefix", "street_suffix", "unit", "po_box", "intersection_a", "intersection_b"];
|
|
43
|
+
export declare const STAGE3_BIO_LABELS: readonly BioLabel[];
|
|
41
44
|
//# sourceMappingURL=labels.d.ts.map
|
package/out/labels.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"labels.d.ts","sourceRoot":"","sources":["../labels.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAA;AAEvD,0EAA0E;AAC1E,eAAO,MAAM,kBAAkB,oGAQrB,CAAA;AAEV,yFAAyF;AACzF,eAAO,MAAM,iBAAiB,EAAE,SAAS,QAAQ,EAG/C,CAAA;AAEF;;;;GAIG;AACH,eAAO,MAAM,gBAAgB,8CAA+C,CAAA;AAE5E,8FAA8F;AAC9F,eAAO,MAAM,WAAW,uIAAwD,CAAA;AAEhF;;;;;;GAMG;AACH,eAAO,MAAM,iBAAiB,EAAE,SAAS,QAAQ,EAG/C,CAAA"}
|
|
1
|
+
{"version":3,"file":"labels.d.ts","sourceRoot":"","sources":["../labels.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,yBAAyB,CAAA;AAEvD,0EAA0E;AAC1E,eAAO,MAAM,kBAAkB,oGAQrB,CAAA;AAEV,yFAAyF;AACzF,eAAO,MAAM,iBAAiB,EAAE,SAAS,QAAQ,EAG/C,CAAA;AAEF;;;;GAIG;AACH,eAAO,MAAM,gBAAgB,8CAA+C,CAAA;AAE5E,8FAA8F;AAC9F,eAAO,MAAM,WAAW,uIAAwD,CAAA;AAEhF;;;;;;GAMG;AACH,eAAO,MAAM,iBAAiB,EAAE,SAAS,QAAQ,EAG/C,CAAA;AAEF,eAAO,MAAM,gBAAgB,mGAOnB,CAAA;AAEV,eAAO,MAAM,WAAW,+NAAiD,CAAA;AAEzE,eAAO,MAAM,iBAAiB,EAAE,SAAS,QAAQ,EAG/C,CAAA"}
|
package/out/labels.js
CHANGED
|
@@ -51,4 +51,17 @@ export const STAGE2_BIO_LABELS = Object.freeze([
|
|
|
51
51
|
"O",
|
|
52
52
|
...STAGE2_TAGS.flatMap((tag) => [`B-${tag}`, `I-${tag}`]),
|
|
53
53
|
]);
|
|
54
|
+
/**
 * Tag names added on top of {@link STAGE2_TAGS}: street affixes, unit, PO box and the two halves
 * of an intersection.
 */
export const STAGE3_FINE_TAGS = ["street_prefix", "street_suffix", "unit", "po_box", "intersection_a", "intersection_b"];

/** Every Stage 2 tag followed by every Stage 3 fine tag, in that order. */
export const STAGE3_TAGS = [...STAGE2_TAGS, ...STAGE3_FINE_TAGS];

/** Frozen label list: `"O"` first, then a `B-<tag>` / `I-<tag>` pair for each entry of STAGE3_TAGS. */
export const STAGE3_BIO_LABELS = Object.freeze(
    STAGE3_TAGS.reduce(
        (acc, tag) => {
            acc.push(`B-${tag}`, `I-${tag}`);
            return acc;
        },
        ["O"]
    )
);
|
|
54
67
|
//# sourceMappingURL=labels.js.map
|
package/out/labels.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"labels.js","sourceRoot":"","sources":["../labels.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,0EAA0E;AAC1E,MAAM,CAAC,MAAM,kBAAkB,GAAG;IACjC,SAAS;IACT,QAAQ;IACR,UAAU;IACV,oBAAoB;IACpB,UAAU;IACV,WAAW;IACX,OAAO;CACE,CAAA;AAEV,yFAAyF;AACzF,MAAM,CAAC,MAAM,iBAAiB,GAAwB,MAAM,CAAC,MAAM,CAAC;IACnE,GAAe;IACf,GAAG,kBAAkB,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,GAAG,EAAc,EAAE,KAAK,GAAG,EAAc,CAAC,CAAC;CACxF,CAAC,CAAA;AAEF;;;;GAIG;AACH,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,OAAO,EAAE,QAAQ,EAAE,cAAc,CAAU,CAAA;AAE5E,8FAA8F;AAC9F,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,GAAG,kBAAkB,EAAE,GAAG,gBAAgB,CAAU,CAAA;AAEhF;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAwB,MAAM,CAAC,MAAM,CAAC;IACnE,GAAe;IACf,GAAG,WAAW,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,GAAG,EAAc,EAAE,KAAK,GAAG,EAAc,CAAC,CAAC;CACjF,CAAC,CAAA"}
|
|
1
|
+
{"version":3,"file":"labels.js","sourceRoot":"","sources":["../labels.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAIH,0EAA0E;AAC1E,MAAM,CAAC,MAAM,kBAAkB,GAAG;IACjC,SAAS;IACT,QAAQ;IACR,UAAU;IACV,oBAAoB;IACpB,UAAU;IACV,WAAW;IACX,OAAO;CACE,CAAA;AAEV,yFAAyF;AACzF,MAAM,CAAC,MAAM,iBAAiB,GAAwB,MAAM,CAAC,MAAM,CAAC;IACnE,GAAe;IACf,GAAG,kBAAkB,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,GAAG,EAAc,EAAE,KAAK,GAAG,EAAc,CAAC,CAAC;CACxF,CAAC,CAAA;AAEF;;;;GAIG;AACH,MAAM,CAAC,MAAM,gBAAgB,GAAG,CAAC,OAAO,EAAE,QAAQ,EAAE,cAAc,CAAU,CAAA;AAE5E,8FAA8F;AAC9F,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,GAAG,kBAAkB,EAAE,GAAG,gBAAgB,CAAU,CAAA;AAEhF;;;;;;GAMG;AACH,MAAM,CAAC,MAAM,iBAAiB,GAAwB,MAAM,CAAC,MAAM,CAAC;IACnE,GAAe;IACf,GAAG,WAAW,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,GAAG,EAAc,EAAE,KAAK,GAAG,EAAc,CAAC,CAAC;CACjF,CAAC,CAAA;AAEF,MAAM,CAAC,MAAM,gBAAgB,GAAG;IAC/B,eAAe;IACf,eAAe;IACf,MAAM;IACN,QAAQ;IACR,gBAAgB;IAChB,gBAAgB;CACP,CAAA;AAEV,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,GAAG,WAAW,EAAE,GAAG,gBAAgB,CAAU,CAAA;AAEzE,MAAM,CAAC,MAAM,iBAAiB,GAAwB,MAAM,CAAC,MAAM,CAAC;IACnE,GAAe;IACf,GAAG,WAAW,CAAC,OAAO,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,GAAG,EAAc,EAAE,KAAK,GAAG,EAAc,CAAC,CAAC;CACjF,CAAC,CAAA"}
|
package/out/onnx-runner.d.ts
CHANGED
|
@@ -50,7 +50,14 @@ export declare class OnnxRunner {
|
|
|
50
50
|
* back to the actual input length.
|
|
51
51
|
*
|
|
52
52
|
* @param tokenIds The id sequence produced by the tokenizer (no special tokens added).
|
|
53
|
+
* @param anchor Optional postcode-anchor channel (#239/#240). When supplied (only for anchor
|
|
54
|
+
* models — exported with the `anchor_features`/`anchor_confidence` inputs), per-piece features
|
|
55
|
+
* `(seqLen × dim)` + confidence `(seqLen,)` are fed, zero-padded to `fixedSeqLen`. Omit for
|
|
56
|
+
* plain models, whose ONNX has no anchor inputs.
|
|
53
57
|
*/
|
|
54
|
-
infer(tokenIds: number[]
|
|
58
|
+
infer(tokenIds: number[], anchor?: {
|
|
59
|
+
features: ReadonlyArray<ReadonlyArray<number>>;
|
|
60
|
+
confidence: ReadonlyArray<number>;
|
|
61
|
+
}): Promise<InferResult>;
|
|
55
62
|
}
|
|
56
63
|
//# sourceMappingURL=onnx-runner.d.ts.map
|
package/out/onnx-runner.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"onnx-runner.d.ts","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;
|
|
1
|
+
{"version":3,"file":"onnx-runner.d.ts","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAOH,MAAM,WAAW,cAAc;IAC9B,wEAAwE;IACxE,MAAM,CAAC,EAAE,OAAO,CAAA;IAChB;;;;;OAKG;IACH,WAAW,CAAC,EAAE,MAAM,CAAA;CACpB;AAED,8FAA8F;AAC9F,eAAO,MAAM,qBAAqB,MAAM,CAAA;AAExC,MAAM,WAAW,WAAW;IAC3B,2EAA2E;IAC3E,MAAM,EAAE,MAAM,EAAE,EAAE,CAAA;IAClB,oEAAoE;IACpE,SAAS,EAAE,MAAM,CAAA;CACjB;AAED,qBAAa,UAAU;IAMrB,OAAO,CAAC,QAAQ,CAAC,SAAS;IAC1B,OAAO,CAAC,QAAQ,CAAC,UAAU;IAN5B,OAAO,CAAC,OAAO,CAAoC;IACnD,OAAO,CAAC,WAAW,CAA6C;IAChE,SAAgB,WAAW,EAAE,MAAM,CAAA;IAEnC,OAAO;IAQP,oEAAoE;WACvD,MAAM,CAAC,SAAS,EAAE,MAAM,EAAE,IAAI,GAAE,cAAmB,GAAG,OAAO,CAAC,UAAU,CAAC;IAMtF,6CAA6C;WAChC,SAAS,CAAC,UAAU,EAAE,UAAU,EAAE,IAAI,GAAE,cAAmB,GAAG,OAAO,CAAC,UAAU,CAAC;YAMhF,aAAa;IAgB3B;;;;;;;;;;;OAWG;IACG,KAAK,CACV,QAAQ,EAAE,MAAM,EAAE,EAClB,MAAM,CAAC,EAAE;QAAE,QAAQ,EAAE,aAAa,CAAC,aAAa,CAAC,MAAM,CAAC,CAAC,CAAC;QAAC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,CAAA;KAAE,GAC5F,OAAO,CAAC,WAAW,CAAC;CAqDvB"}
|
package/out/onnx-runner.js
CHANGED
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
*/
|
|
15
15
|
import { promises as fs } from "node:fs";
|
|
16
16
|
import ort from "onnxruntime-node";
|
|
17
|
+
import { ANCHOR_FEATURE_DIM } from "./anchor-inference.js";
|
|
17
18
|
/** Default sequence length for v0.1.0 / v0.2.0 (BertConfig max_position_embeddings = 128). */
|
|
18
19
|
export const DEFAULT_FIXED_SEQ_LEN = 128;
|
|
19
20
|
export class OnnxRunner {
|
|
@@ -64,8 +65,12 @@ export class OnnxRunner {
|
|
|
64
65
|
* back to the actual input length.
|
|
65
66
|
*
|
|
66
67
|
* @param tokenIds The id sequence produced by the tokenizer (no special tokens added).
|
|
68
|
+
* @param anchor Optional postcode-anchor channel (#239/#240). When supplied (only for anchor
|
|
69
|
+
* models — exported with the `anchor_features`/`anchor_confidence` inputs), per-piece features
|
|
70
|
+
* `(seqLen × dim)` + confidence `(seqLen,)` are fed, zero-padded to `fixedSeqLen`. Omit for
|
|
71
|
+
* plain models, whose ONNX has no anchor inputs.
|
|
67
72
|
*/
|
|
68
|
-
async infer(tokenIds) {
|
|
73
|
+
async infer(tokenIds, anchor) {
|
|
69
74
|
const session = await this.ensureSession();
|
|
70
75
|
const seqLen = Math.min(tokenIds.length, this.fixedSeqLen);
|
|
71
76
|
const padded = new BigInt64Array(this.fixedSeqLen);
|
|
@@ -78,6 +83,31 @@ export class OnnxRunner {
|
|
|
78
83
|
input_ids: new ort.Tensor("int64", padded, [1, this.fixedSeqLen]),
|
|
79
84
|
attention_mask: new ort.Tensor("int64", mask, [1, this.fixedSeqLen]),
|
|
80
85
|
};
|
|
86
|
+
if (anchor) {
|
|
87
|
+
const dim = anchor.features[0]?.length ?? 0;
|
|
88
|
+
const af = new Float32Array(this.fixedSeqLen * dim);
|
|
89
|
+
const ac = new Float32Array(this.fixedSeqLen);
|
|
90
|
+
for (let i = 0; i < seqLen; i++) {
|
|
91
|
+
ac[i] = anchor.confidence[i] ?? 0;
|
|
92
|
+
const row = anchor.features[i];
|
|
93
|
+
if (row)
|
|
94
|
+
for (let d = 0; d < dim; d++)
|
|
95
|
+
af[i * dim + d] = row[d] ?? 0;
|
|
96
|
+
}
|
|
97
|
+
feeds.anchor_features = new ort.Tensor("float32", af, [1, this.fixedSeqLen, dim]);
|
|
98
|
+
feeds.anchor_confidence = new ort.Tensor("float32", ac, [1, this.fixedSeqLen]);
|
|
99
|
+
}
|
|
100
|
+
else if (session.inputNames.includes("anchor_features")) {
|
|
101
|
+
// Anchor-trained model (its ONNX declares the anchor inputs as mandatory) but no anchor data
|
|
102
|
+
// was supplied: feed zeros. That's the `confidence = 0` identity — the model's anchor-off
|
|
103
|
+
// behavior. Without it the session throws on the missing required inputs.
|
|
104
|
+
feeds.anchor_features = new ort.Tensor("float32", new Float32Array(this.fixedSeqLen * ANCHOR_FEATURE_DIM), [
|
|
105
|
+
1,
|
|
106
|
+
this.fixedSeqLen,
|
|
107
|
+
ANCHOR_FEATURE_DIM,
|
|
108
|
+
]);
|
|
109
|
+
feeds.anchor_confidence = new ort.Tensor("float32", new Float32Array(this.fixedSeqLen), [1, this.fixedSeqLen]);
|
|
110
|
+
}
|
|
81
111
|
const output = await session.run(feeds);
|
|
82
112
|
const logitsTensor = output.logits;
|
|
83
113
|
if (!logitsTensor)
|
package/out/onnx-runner.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"onnx-runner.js","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAA;AACxC,OAAO,GAAG,MAAM,kBAAkB,CAAA;
|
|
1
|
+
{"version":3,"file":"onnx-runner.js","sourceRoot":"","sources":["../onnx-runner.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;GAaG;AAEH,OAAO,EAAE,QAAQ,IAAI,EAAE,EAAE,MAAM,SAAS,CAAA;AACxC,OAAO,GAAG,MAAM,kBAAkB,CAAA;AAElC,OAAO,EAAE,kBAAkB,EAAE,MAAM,uBAAuB,CAAA;AAc1D,8FAA8F;AAC9F,MAAM,CAAC,MAAM,qBAAqB,GAAG,GAAG,CAAA;AASxC,MAAM,OAAO,UAAU;IAMJ;IACA;IANV,OAAO,GAAgC,IAAI,CAAA;IAC3C,WAAW,GAAyC,IAAI,CAAA;IAChD,WAAW,CAAQ;IAEnC,YACkB,SAAiB,EACjB,UAA6B,EAC9C,IAAoB;QAFH,cAAS,GAAT,SAAS,CAAQ;QACjB,eAAU,GAAV,UAAU,CAAmB;QAG9C,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,qBAAqB,CAAA;IAC7D,CAAC;IAED,oEAAoE;IACpE,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,SAAiB,EAAE,OAAuB,EAAE;QAC/D,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,SAAS,EAAE,IAAI,EAAE,IAAI,CAAC,CAAA;QACpD,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM,MAAM,CAAC,aAAa,EAAE,CAAA;QAC7C,OAAO,MAAM,CAAA;IACd,CAAC;IAED,6CAA6C;IAC7C,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,UAAsB,EAAE,OAAuB,EAAE;QACvE,MAAM,MAAM,GAAG,IAAI,UAAU,CAAC,SAAS,EAAE,UAAU,EAAE,IAAI,CAAC,CAAA;QAC1D,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM,MAAM,CAAC,aAAa,EAAE,CAAA;QAC7C,OAAO,MAAM,CAAA;IACd,CAAC;IAEO,KAAK,CAAC,aAAa;QAC1B,IAAI,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC,OAAO,CAAA;QACrC,IAAI,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC;YACvB,IAAI,CAAC,WAAW,GAAG,CAAC,KAAK,IAAI,EAAE;gBAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,UAAU,IAAI,IAAI,UAAU,CAAC,MAAM,EAAE,CAAC,QAAQ,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC,CAAA;gBAClF,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,gBAAgB,CAAC,MAAM,CAAC,KAAK,EAAE;oBACxD,kBAAkB,EAAE,CAAC,KAAK,CAAC;oBAC3B,sBAAsB,EAAE,KAAK;iBAC7B,CAAC,CAAA;gBACF,IAAI,CAAC,OAAO,GAAG,OAAO,CAAA;gBACtB,OAAO,OAAO,CAAA;YACf,CAAC,CAAC,EAAE,CAAA;QACL,CAAC;QACD,OAAO,IAAI,CAAC,WAAW,CAAA;IACxB,CAAC;IAED;;;;;;;;;;;OAWG;IACH,KAAK,CAAC,KAAK,CACV,QAAkB,EAClB,MAA8F;QAE9F,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,aAAa,EAAE,CAAA;QAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,CAAC,MAAM,EAAE,IAAI,CAAC,WAAW,CAAC,CAAA;QAC1D,MAAM,MAAM,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;QAClD,MAAM,IAAI,GAAG,IAAI,aAAa,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;QAChD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjC,
MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAE,CAAC,CAAA;YAChC,IAAI,CAAC,CAAC,CAAC,GAAG,EAAE,CAAA;QACb,CAAC;QAED,MAAM,KAAK,GAA+B;YACzC,SAAS,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,MAAM,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;YACjE,cAAc,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,OAAO,EAAE,IAAI,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;SACpE,CAAA;QAED,IAAI,MAAM,EAAE,CAAC;YACZ,MAAM,GAAG,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,MAAM,IAAI,CAAC,CAAA;YAC3C,MAAM,EAAE,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,GAAG,CAAC,CAAA;YACnD,MAAM,EAAE,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,CAAA;YAC7C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACjC,EAAE,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;gBACjC,MAAM,GAAG,GAAG,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAA;gBAC9B,IAAI,GAAG;oBAAE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE;wBAAE,EAAE,CAAC,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAA;YACrE,CAAC;YACD,KAAK,CAAC,eAAe,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,EAAE,GAAG,CAAC,CAAC,CAAA;YACjF,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAA;QAC/E,CAAC;aAAM,IAAI,OAAO,CAAC,UAAU,CAAC,QAAQ,CAAC,iBAAiB,CAAC,EAAE,CAAC;YAC3D,6FAA6F;YAC7F,0FAA0F;YAC1F,0EAA0E;YAC1E,KAAK,CAAC,eAAe,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,GAAG,kBAAkB,CAAC,EAAE;gBAC1G,CAAC;gBACD,IAAI,CAAC,WAAW;gBAChB,kBAAkB;aAClB,CAAC,CAAA;YACF,KAAK,CAAC,iBAAiB,GAAG,IAAI,GAAG,CAAC,MAAM,CAAC,SAAS,EAAE,IAAI,YAAY,CAAC,IAAI,CAAC,WAAW,CAAC,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC,CAAA;QAC/G,CAAC;QAED,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAA;QACvC,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,CAAA;QAClC,IAAI,CAAC,YAAY;YAAE,MAAM,IAAI,KAAK,CAAC,6CAA6C,CAAC,CAAA;QACjF,MAAM,IAAI,GAAG,YAAY,CAAC,IAAoB,CAAA;QAC9C,MAAM,CAAC,EAAE,AAAD,EAAG,SAAS,CAAC,GAAG,YAAY,CAAC,IAAyC,CAAA;QAE9E,MAAM,MAAM,GAAe,EAAE,CAAA;QAC7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE
,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACjC,MAAM,GAAG,GAAa,IAAI,KAAK,CAAC,SAAS,CAAC,CAAA;YAC1C,MAAM,IAAI,GAAG,CAAC,GAAG,SAAS,CAAA;YAC1B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE;gBAAE,GAAG,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,GAAG,CAAC,CAAE,CAAA;YAC5D,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QACjB,CAAC;QACD,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,CAAA;IAC7B,CAAC;CACD"}
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Postcode anchor — the first member of the "anchor-based parsing" family (Direction D, #240). See
|
|
7
|
+
* `docs/articles/plan/2026-06-03-anchor-based-parsing.md`.
|
|
8
|
+
*
|
|
9
|
+
* A postcode is the most information-dense token in an address: a hierarchical geo-encoding that
|
|
10
|
+
* places a query on Earth far more cheaply than the rest of the parse. This module lifts the
|
|
11
|
+
* postcode out of the BIO sequence-labelling problem and treats it as a structured anchor. It
|
|
12
|
+
* runs the same per-country shape regexes the decoder repair pass uses ({@link collectMatches}),
|
|
13
|
+
* resolves each shaped span against a postcode gazetteer, and returns a SOFT signal: a country
|
|
14
|
+
* posterior plus a calibrated confidence. It never decides a postcode's identity on its own — it
|
|
15
|
+
* reports "this string is (or is not) a real postcode, in these countries, near here", and leaves
|
|
16
|
+
* the parser to weigh that against the surrounding tokens.
|
|
17
|
+
*
|
|
18
|
+
* Two design rules carried from the DeepSeek consult
|
|
19
|
+
* (`.agents/skills/deepseek-consult/ds-pc-turn{1,2}-postcode-anchor.txt`):
|
|
20
|
+
*
|
|
21
|
+
* - The country posterior is UNIFORM over the countries a string actually exists in. We never weight
|
|
22
|
+
* by per-country postcode volume, because that skews "75001" toward whichever country owns
|
|
23
|
+
* more 5-digit codes — the exact bias the anchor exists to avoid. Disambiguation is the
|
|
24
|
+
* parser's job, using script, city tokens, and user locale.
|
|
25
|
+
* - Confidence combines gazetteer MEMBERSHIP with country AMBIGUITY. A string that matches a postcode
|
|
26
|
+
* regex but exists in no gazetteer (a bare `27`, or a 5-digit house number that is not a real
|
|
27
|
+
* code) gets confidence 0, so the parser treats it as a house number. A real-but-ambiguous
|
|
28
|
+
* code (`75001` in FR and US) gets moderate confidence. A real, single-country code gets
|
|
29
|
+
* 1.0.
|
|
30
|
+
*/
|
|
31
|
+
/** A gazetteer hit for a postcode string. `lat`/`lon` of 0 means "known postcode, no centroid yet". */
|
|
32
|
+
export interface PostcodePlace {
|
|
33
|
+
country: string;
|
|
34
|
+
lat: number;
|
|
35
|
+
lon: number;
|
|
36
|
+
}
|
|
37
|
+
/**
|
|
38
|
+
* The minimal surface the anchor needs from a gazetteer. Implementations: an in-memory fake (tests)
|
|
39
|
+
* or a SQLite-backed lookup over the `postalcode-*.db` shards (`@mailwoman/resolver-wof-sqlite`).
|
|
40
|
+
* Keeping the seam this narrow lets a future FST/WASM resolver drop in without touching the anchor
|
|
41
|
+
* logic.
|
|
42
|
+
*/
|
|
43
|
+
export interface PostcodeResolver {
|
|
44
|
+
/** Exact-match lookup of a normalized postcode string across every country shard. */
|
|
45
|
+
lookup(postcode: string): PostcodePlace[];
|
|
46
|
+
}
|
|
47
|
+
export interface PostcodeAnchor {
|
|
48
|
+
/** The shaped substring as it appeared in the raw text, with char offsets. */
|
|
49
|
+
span: {
|
|
50
|
+
text: string;
|
|
51
|
+
start: number;
|
|
52
|
+
end: number;
|
|
53
|
+
};
|
|
54
|
+
/** The normalized form actually queried (uppercased, `D-` prefix stripped, whitespace collapsed). */
|
|
55
|
+
normalized: string;
|
|
56
|
+
/** Coordinate-bearing gazetteer hits — best-effort centroid(s), one representative per country. */
|
|
57
|
+
candidates: PostcodePlace[];
|
|
58
|
+
/**
|
|
59
|
+
* Uniform distribution over the countries the postcode exists in (membership,
|
|
60
|
+
* coordinate-independent).
|
|
61
|
+
*/
|
|
62
|
+
posterior: Record<string, number>;
|
|
63
|
+
/** `1 - normalizedEntropy(posterior)` when the postcode exists, with the `fuzzy` penalty and `positionFactor` folded in; `0` when it is in no gazetteer. */
|
|
64
|
+
confidence: number;
|
|
65
|
+
/**
|
|
66
|
+
* `exact` — the string is a real postcode; `outward` — a GB unit (`SO4 3RX`) resolved to its
|
|
67
|
+
* outward district (`SO4`), the granularity the GB gazetteer is aggregated at (no penalty — it is
|
|
68
|
+
* a real, confident GB match); `fuzzy` — only an edit-distance-1 variant exists (a likely typo /
|
|
69
|
+
* OCR slip), so the confidence carries a penalty; `none` — in no gazetteer.
|
|
70
|
+
*/
|
|
71
|
+
matchType: "exact" | "outward" | "fuzzy" | "none";
|
|
72
|
+
/**
|
|
73
|
+
* Structural house-number prior in [0, 1]: `1` for a code that cannot be a house number, and
|
|
74
|
+
* below `1` for a digit-only code sharing its comma-delimited segment with a street word (so it
|
|
75
|
+
* reads as a house number rather than a postcode). Already folded into {@link confidence}; exposed
|
|
76
|
+
* so a consumer can rank competing spans, or see why one was down-weighted, without re-deriving
|
|
77
|
+
* it.
|
|
78
|
+
*/
|
|
79
|
+
positionFactor: number;
|
|
80
|
+
}
|
|
81
|
+
export interface ExtractPostcodeAnchorsOpts {
|
|
82
|
+
/**
|
|
83
|
+
* When an exact lookup finds nothing, retry Damerau–Levenshtein ≤1 variants to absorb typos and
|
|
84
|
+
* OCR slips (`75OO8` → `75008`). Off by default so existing callers keep exact-match behaviour.
|
|
85
|
+
*/
|
|
86
|
+
fuzzy?: boolean;
|
|
87
|
+
}
|
|
88
|
+
/**
|
|
89
|
+
* Class-aware edit-distance-1 variants of a postcode string: deletions, same-class substitutions
|
|
90
|
+
* (digit↔digit, letter↔letter), same-class insertions, and adjacent transpositions. Restricting
|
|
91
|
+
* substitutions/insertions to the character's class mirrors how humans mistype or OCR a postcode (a
|
|
92
|
+
* digit becomes another digit, not a letter) and keeps the candidate set small.
|
|
93
|
+
*/
|
|
94
|
+
export declare function editDistance1Variants(s: string): string[];
|
|
95
|
+
/**
|
|
96
|
+
* Normalize a shaped span to the canonical gazetteer key: uppercase, collapse internal whitespace
|
|
97
|
+
* to a single space, and strip the German `D-` courtesy prefix (the shards store `68161`, not
|
|
98
|
+
* `D-68161`).
|
|
99
|
+
*/
|
|
100
|
+
export declare function normalizePostcode(raw: string): string;
|
|
101
|
+
/**
|
|
102
|
+
* The GB outward code of a normalized unit postcode — the part before the space when the inward
|
|
103
|
+
* half is `\d[A-Z]{2}` (`SO4 3RX` → `SO4`). The GB gazetteer is aggregated to outward codes (2.7M
|
|
104
|
+
* units is too large and too fine-grained for an anchor), so the extractor retries the outward code when a
|
|
105
|
+
* full GB unit misses. Returns `null` for any string that isn't a GB unit postcode (so it never
|
|
106
|
+
* fires elsewhere).
|
|
107
|
+
*/
|
|
108
|
+
export declare function gbOutwardCode(normalized: string): string | null;
|
|
109
|
+
/**
|
|
110
|
+
* Extract postcode anchors from raw text. For each postcode-shaped span, resolve it against the
|
|
111
|
+
* gazetteer and emit a soft anchor (country posterior + confidence). Spans that match a shape but
|
|
112
|
+
* exist in no gazetteer are still returned, with an empty posterior and confidence 0 — an explicit
|
|
113
|
+
* "looks like a postcode, but isn't one" so the caller can see the extractor fired and chose not to
|
|
114
|
+
* anchor.
|
|
115
|
+
*/
|
|
116
|
+
export declare function extractPostcodeAnchors(text: string, resolver: PostcodeResolver, opts?: ExtractPostcodeAnchorsOpts): PostcodeAnchor[];
|
|
117
|
+
//# sourceMappingURL=postcode-anchor.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"postcode-anchor.d.ts","sourceRoot":"","sources":["../postcode-anchor.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA6BG;AAQH,uGAAuG;AACvG,MAAM,WAAW,aAAa;IAC7B,OAAO,EAAE,MAAM,CAAA;IACf,GAAG,EAAE,MAAM,CAAA;IACX,GAAG,EAAE,MAAM,CAAA;CACX;AAED;;;;;GAKG;AACH,MAAM,WAAW,gBAAgB;IAChC,qFAAqF;IACrF,MAAM,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa,EAAE,CAAA;CACzC;AAED,MAAM,WAAW,cAAc;IAC9B,8EAA8E;IAC9E,IAAI,EAAE;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,GAAG,EAAE,MAAM,CAAA;KAAE,CAAA;IAClD,qGAAqG;IACrG,UAAU,EAAE,MAAM,CAAA;IAClB,mGAAmG;IACnG,UAAU,EAAE,aAAa,EAAE,CAAA;IAC3B;;;OAGG;IACH,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;IACjC,mGAAmG;IACnG,UAAU,EAAE,MAAM,CAAA;IAClB;;;;;OAKG;IACH,SAAS,EAAE,OAAO,GAAG,SAAS,GAAG,OAAO,GAAG,MAAM,CAAA;IACjD;;;;;;OAMG;IACH,cAAc,EAAE,MAAM,CAAA;CACtB;AAED,MAAM,WAAW,0BAA0B;IAC1C;;;OAGG;IACH,KAAK,CAAC,EAAE,OAAO,CAAA;CACf;AAWD;;;;;GAKG;AACH,wBAAgB,qBAAqB,CAAC,CAAC,EAAE,MAAM,GAAG,MAAM,EAAE,CAczD;AAED;;;;GAIG;AACH,wBAAgB,iBAAiB,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,CAKrD;AAED;;;;;;GAMG;AACH,wBAAgB,aAAa,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAI/D;AAuGD;;;;;;GAMG;AACH,wBAAgB,sBAAsB,CACrC,IAAI,EAAE,MAAM,EACZ,QAAQ,EAAE,gBAAgB,EAC1B,IAAI,GAAE,0BAA+B,GACnC,cAAc,EAAE,CAmElB"}
|